Tutorial (Japanese Named Entity Recognition)

Train a Japanese NER model for KWDLC

This tutorial provides an example of training a Japanese NER model on the Kyoto University Web Document Leads Corpus (KWDLC).

Download the dataset

Please download KWDLC manually from http://nlp.ist.i.kyoto-u.ac.jp/EN/index.php?KWDLC and copy the corpus to the working directory.
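
Assuming the downloaded archive is named KWDLC-1.0.tar.gz (the exact file name may differ depending on the distributed version), extract it in the working directory:

tar zxvf KWDLC-1.0.tar.gz

The preprocessing script below expects the corpus at ./KWDLC-1.0/dat/rel.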

Install Python libraries

Before we get started, please run the following commands to install the libraries used in this tutorial: nagisa for training and tagging, seqeval for NER evaluation, and beautifulsoup4 for parsing the <ne> annotations in the corpus.

pip install nagisa
pip install seqeval
pip install beautifulsoup4

Preprocess the dataset

First, convert the downloaded data into the input format for nagisa. The train/dev/test files are in TSV format: each line contains a word and its tag separated by a tab (word \t tag), and an EOS line separates sentences.
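
For example, a sentence is written as follows (the words and tags here are illustrative, not taken from the corpus):

昨日	B-DATE
の	O
京都	B-LOCATION
は	O
晴れ	O
EOS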

This preprocessing step is a little involved, so please copy the code below and run it. After running the code, kwdlc.txt is written to the working directory.

python tutorial_preprocess_kwdlc.py
tutorial_preprocess_kwdlc.py
import glob

import bs4


def load_kwdlc(dir_path):
    # Collect all annotation files under the corpus directory.
    files = glob.glob(dir_path + "/*/*", recursive=True)

    data = []

    words = []
    position2ne = {}

    for fn in files:
        with open(fn, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # skip blank lines, if any
                    continue
                first_char = line[0]

                if first_char == "+":
                    # Basic-phrase lines may carry <ne target="..." type="..."/>
                    # annotations; record them keyed by the current word position.
                    soup = bs4.BeautifulSoup(line, "html.parser")
                    for content in soup.contents:
                        if isinstance(content, bs4.element.Tag):
                            if content.name == "ne":
                                target = content.attrs["target"]
                                netype = content.attrs["type"]
                                position2ne[len(words)] = [target, netype]

                elif first_char == "#" or first_char == "*":
                    # Skip comment and phrase-boundary lines.
                    pass

                elif line == "EOS":
                    # End of sentence: store the words and their NE annotations.
                    data.append([words, position2ne])

                    # reset
                    words = []
                    position2ne = {}

                else:
                    # Morpheme line: the first token is the surface form.
                    tokens = line.split()
                    surface = tokens[0]
                    words.append(surface)

    return data


def write_kwdlc_as_single_file(filename, data):
    with open(filename, "w", encoding="utf-8") as f:
        for words, position2ne in data:
            # Sort the NEs by word position and pop them front-to-back.
            nes = [v[0] for k, v in sorted(position2ne.items(), key=lambda x: x[0])]
            nes = list(reversed(nes))

            tags = [v[1] for k, v in sorted(position2ne.items(), key=lambda x: x[0])]
            tags = list(reversed(tags))

            if len(nes) == 0:
                # Sentences without named entities are skipped.
                pass
            else:
                ne_tags = []

                ne = nes.pop()
                tag = tags.pop()
                ne_target_char = ne[0]

                partial = []
                for word in words:
                    first_char = word[0]
                    if first_char == ne_target_char:
                        if word in ne:
                            partial.append(word)

                            if "".join(partial) == ne:
                                # The accumulated words cover the NE string:
                                # emit B-/M-/E- tags for the span.
                                for i, word in enumerate(partial):
                                    if i == 0:
                                        ne_tags.append("B-" + tag)
                                    elif i == len(partial) - 1:
                                        ne_tags.append("E-" + tag)
                                    else:
                                        ne_tags.append("M-" + tag)

                                if len(nes) > 0:
                                    ne = nes.pop()
                                    tag = tags.pop()
                                    ne_target_char = ne[0]

                                partial = []

                            else:
                                # Advance to the next expected character of the NE.
                                ne_target_char = ne[len("".join(partial))]

                        else:
                            partial = []
                            ne_tags.append("O")

                    else:
                        partial = []
                        ne_tags.append("O")

                for word, ne_tag in zip(words, ne_tags):
                    f.write("\t".join([word, ne_tag]) + "\n")
                f.write("EOS\n")


def main():
    dir_path = "./KWDLC-1.0/dat/rel"
    data = load_kwdlc(dir_path)

    fn_out = "kwdlc.txt"
    write_kwdlc_as_single_file(fn_out, data)


if __name__ == "__main__":
    main()
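
If you want a quick sanity check of the output (an optional step, not part of the original tutorial), counting the sentences and collecting the tag set takes a few lines:

# Quick sanity check of kwdlc.txt: count sentences and list the tag set.
tags = set()
num_sentences = 0
with open("kwdlc.txt", encoding="utf-8") as f:
    for line in f:
        line = line.rstrip("\n")
        if line == "EOS":
            num_sentences += 1
        elif line:
            tags.add(line.split("\t")[1])
print("sentences:", num_sentences)
print("tags:", sorted(tags))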

Train a model

Next, train a NER model with the nagisa.fit() function. When training finishes, the three model files (kwdlc_ner_model.vocabs, kwdlc_ner_model.params, kwdlc_ner_model.hp) are saved in the current directory.

python tutorial_train_kwdlc.py
tutorial_train_kwdlc.py
import random

import nagisa


def write_file(filename, X, Y):
    # Write sentences in the word \t tag format, with EOS between sentences.
    with open(filename, "w") as f:
        for x, y in zip(X, Y):
            for word, tag in zip(x, y):
                f.write("\t".join([word, tag]) + "\n")
            f.write("EOS\n")


def main():
    random.seed(1234)

    # preprocess: shuffle the sentences and split them 80/10/10
    # into train/dev/test sets
    fn_in = "kwdlc.txt"
    X, Y = nagisa.utils.load_file(fn_in)
    indices = list(range(len(X)))
    random.shuffle(indices)

    num_train = int(0.8 * len(indices))
    num_dev = int(0.1 * len(indices))
    num_test = int(0.1 * len(indices))

    train_X = [X[i] for i in indices[:num_train]]
    train_Y = [Y[i] for i in indices[:num_train]]
    dev_X = [X[i] for i in indices[num_train:num_train+num_dev]]
    dev_Y = [Y[i] for i in indices[num_train:num_train+num_dev]]
    test_X = [X[i] for i in indices[num_train+num_dev:num_train+num_dev+num_test]]
    test_Y = [Y[i] for i in indices[num_train+num_dev:num_train+num_dev+num_test]]

    fn_out_train = "kwdlc.train"
    fn_out_dev = "kwdlc.dev"
    fn_out_test = "kwdlc.test"
    write_file(fn_out_train, train_X, train_Y)
    write_file(fn_out_dev, dev_X, dev_Y)
    write_file(fn_out_test, test_X, test_Y)

    # start training
    fn_out_model = "kwdlc_ner_model"
    nagisa.fit(
        train_file=fn_out_train,
        dev_file=fn_out_dev,
        test_file=fn_out_test,
        model_name=fn_out_model
    )


if __name__ == "__main__":
    main()

This is a log of the training process. The WS columns report word-segmentation F1 and the POS columns report tag-level F1 on the dev and test sets (in this setup the "POS" tags are the NER labels).

[dynet] random seed: 1234
[dynet] allocating memory: 32MB
[dynet] memory allocation done.
[nagisa] LAYERS: 1
[nagisa] THRESHOLD: 3
[nagisa] DECAY: 1
[nagisa] EPOCH: 10
[nagisa] WINDOW_SIZE: 3
[nagisa] DIM_UNI: 32
[nagisa] DIM_BI: 16
[nagisa] DIM_WORD: 16
[nagisa] DIM_CTYPE: 8
[nagisa] DIM_TAGEMB: 16
[nagisa] DIM_HIDDEN: 100
[nagisa] LEARNING_RATE: 0.1
[nagisa] DROPOUT_RATE: 0.3
[nagisa] SEED: 1234
[nagisa] TRAINSET: kwdlc.train
[nagisa] TESTSET: kwdlc.test
[nagisa] DEVSET: kwdlc.dev
[nagisa] DICTIONARY: None
[nagisa] EMBEDDING: None
[nagisa] HYPERPARAMS: kwdlc_ner_model.hp
[nagisa] MODEL: kwdlc_ner_model.params
[nagisa] VOCAB: kwdlc_ner_model.vocabs
[nagisa] EPOCH_MODEL: kwdlc_ner_model_epoch.params
[nagisa] NUM_TRAIN: 3816
[nagisa] NUM_TEST: 477
[nagisa] NUM_DEV: 477
[nagisa] VOCAB_SIZE_UNI: 1838
[nagisa] VOCAB_SIZE_BI: 12774
[nagisa] VOCAB_SIZE_WORD: 4809
[nagisa] VOCAB_SIZE_POSTAG: 29
Epoch       LR      Loss    Time_m  DevWS_f1        DevPOS_f1       TestWS_f1       TestPOS_f1
1           0.100   15.09   0.632   92.41           83.14           91.70           82.63
2           0.100   8.818   0.637   93.59           85.59           93.21           85.28
3           0.100   6.850   0.637   93.98           85.60           93.75           86.01
4           0.100   5.751   0.634   94.44           87.29           94.01           86.99
5           0.050   5.028   0.614   94.35           87.02           94.01           86.99
6           0.050   3.727   0.647   94.84           87.52           94.79           87.91
7           0.025   3.268   0.613   94.52           87.45           94.79           87.91
8           0.012   2.761   0.610   94.75           87.58           94.79           87.91
9           0.012   2.447   0.634   94.95           87.79           95.00           88.28
10          0.006   2.333   0.624   94.73           87.41           95.00           88.28
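
The hyperparameters listed in the header of this log are nagisa's defaults. If you want to override them, nagisa.fit() appears to accept them as keyword arguments; the exact names below (epoch, learning_rate) are inferred from nagisa's training code and should be checked against the documentation of your installed version:

# A sketch of overriding training hyperparameters. The keyword names
# `epoch` and `learning_rate` are assumptions based on nagisa's source
# and may differ across versions -- verify before relying on them.
nagisa.fit(
    train_file="kwdlc.train",
    dev_file="kwdlc.dev",
    test_file="kwdlc.test",
    model_name="kwdlc_ner_model",
    epoch=20,
    learning_rate=0.05,
)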

Predict

You can build the tagger simply by passing the three trained model files (kwdlc_ner_model.vocabs, kwdlc_ner_model.params, kwdlc_ner_model.hp) as arguments to nagisa.Tagger().

python tutorial_predict_kwdlc.py
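
The prediction script is not reproduced here, but a minimal sketch looks like the following. The decode() call is the same one used in the error-analysis script below; the tagging() method and its .words/.postags attributes are assumed to behave like nagisa's default tagger, and the input text is an illustrative example.

import nagisa

# Build the tagger from the three trained model files.
ner_tagger = nagisa.Tagger(vocabs='kwdlc_ner_model.vocabs',
                           params='kwdlc_ner_model.params',
                           hp='kwdlc_ner_model.hp')

# Tag a raw sentence. `tagging()` and its .words/.postags attributes
# are assumed to work as in the default nagisa tagger; here the
# "postags" slot holds the NER labels.
text = "3月に京都へ行った"
result = ner_tagger.tagging(text)
print(result.words)
print(result.postags)

# Alternatively, tag an already tokenized word list with decode(),
# as done in the error-analysis script below.
print(ner_tagger.decode(result.words))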

Error analysis

By checking the tag-level accuracy, entity-level macro-F1, and classification report from seqeval, you can see where the model goes wrong.

python tutorial_error_analysis_kwdlc.py
tutorial_error_analysis_kwdlc.py
import nagisa

from seqeval.metrics import f1_score
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report


def main():
    # load the testset
    test_X, test_Y = nagisa.utils.load_file("kwdlc.test")

    # build the tagger for kwdlc
    ner_tagger = nagisa.Tagger(vocabs='kwdlc_ner_model.vocabs',
                               params='kwdlc_ner_model.params',
                               hp='kwdlc_ner_model.hp')

    # predict
    true_Y = []
    pred_Y = []
    for words, true_y in zip(test_X, test_Y):
        pred_y = ner_tagger.decode(words)

        # collect the gold and predicted tag sequences; zipping with
        # words keeps the sequences aligned word by word
        _pred_y = []
        _true_y = []
        for word, pred, true in zip(words, pred_y, true_y):
            _pred_y.append(pred)
            _true_y.append(true)
        true_Y.append(_true_y)
        pred_Y.append(_pred_y)

    # evaluate
    accuracy = accuracy_score(true_Y, pred_Y)
    print("accuracy: {}".format(accuracy))
    f1 = f1_score(true_Y, pred_Y)
    print("macro-f1: {}".format(f1))
    report = classification_report(true_Y, pred_Y)
    print(report)


if __name__ == "__main__":
    main()
accuracy: 0.9166868198307134
macro-f1: 0.5900383141762452
                  precision    recall  f1-score   support

    ARTIFACT       0.33      0.35      0.34        86
    OPTIONAL       0.32      0.19      0.24        31
ORGANIZATION       0.40      0.33      0.36       109
        DATE       0.84      0.87      0.86       154
    LOCATION       0.64      0.68      0.66       262
       MONEY       0.88      0.88      0.88        16
      PERSON       0.44      0.62      0.51        94
        TIME       0.40      0.44      0.42         9
     PERCENT       0.75      0.50      0.60         6

 avg / total       0.58      0.60      0.59       767
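
To go beyond the aggregate scores, you can dump the sentences the model gets wrong for manual inspection. The following is a small sketch (not part of the original tutorial) that reuses the same test file and tagger as the script above:

import nagisa

# Show the first few test sentences whose predicted tag sequence
# differs from the gold annotation (a sketch for manual error analysis).
test_X, test_Y = nagisa.utils.load_file("kwdlc.test")
ner_tagger = nagisa.Tagger(vocabs='kwdlc_ner_model.vocabs',
                           params='kwdlc_ner_model.params',
                           hp='kwdlc_ner_model.hp')

num_shown = 0
for words, true_y in zip(test_X, test_Y):
    if num_shown >= 5:
        break
    pred_y = ner_tagger.decode(words)
    if list(pred_y) != list(true_y):
        for word, true, pred in zip(words, true_y, pred_y):
            marker = "" if true == pred else "\t<-- mismatch"
            print("\t".join([word, true, pred]) + marker)
        print("EOS")
        num_shown += 1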