Tutorial (Japanese Named Entity Recognition)
Train a Japanese NER model for KWDLC
This tutorial provides an example of training a Japanese NER model using the Kyoto University Web Document Leads Corpus (KWDLC).
Download the dataset
Please download KWDLC from http://nlp.ist.i.kyoto-u.ac.jp/EN/index.php?KWDLC manually. Copy the corpus to the working directory.
Install python libraries
Before we get started, please run the following command to install the libraries used in this tutorial.
pip install nagisa
pip install seqeval
pip install beautifulsoup4
Preprocess the dataset
First, convert the downloaded data to the input data format for nagisa. The train/dev/test files use a TSV format: each line contains a word and its tag separated by a tab (word \t tag). Note that you must put EOS between sentences.
This preprocessing is a little complicated, so please copy the code below and run it. After running the code, kwdlc.txt is written to the working directory.
python tutorial_preprocess_kwdlc.py
1import bs4
2import glob
3
4
def load_kwdlc(dir_path):
    """Load KWDLC annotation files and collect the words and NE tags per sentence.

    Every path matched by ``dir_path/*/*`` is read line by line:

    * a line starting with ``+`` is parsed as markup, and each top-level
      ``ne`` element on it is recorded under the index the next word will
      get (``len(words)`` at that moment);
    * a line starting with ``#`` or ``*`` is ignored;
    * ``EOS`` closes the current sentence;
    * any other non-blank line contributes its first whitespace-separated
      token as a word.

    Args:
        dir_path: Directory whose second-level entries are the files to read.

    Returns:
        A ``(data, position2ne)`` tuple. ``data`` is a list of
        ``[words, position2ne]`` pairs, one per sentence, where
        ``position2ne`` maps a word index to ``[target, netype]``. The second
        tuple element is whatever NE state is left after the last line (an
        empty dict when the input ends with ``EOS`` or no file matched).
    """
    # Sorted so the sentence order does not depend on the order in which
    # glob happens to list the files.
    files = sorted(glob.glob(dir_path + "/*/*", recursive=True))

    data = []
    words = []
    position2ne = {}

    for fn in files:
        # NOTE(review): the corpus is assumed to be UTF-8 encoded - confirm.
        with open(fn, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Blank lines carry no annotation; indexing line[0] on
                    # them would raise IndexError.
                    continue

                if line.startswith("+"):
                    soup = bs4.BeautifulSoup(line, "html.parser")
                    for node in soup.contents:
                        if isinstance(node, bs4.element.Tag) and node.name == "ne":
                            position2ne[len(words)] = [
                                node.attrs["target"],
                                node.attrs["type"],
                            ]

                elif line.startswith(("#", "*")):
                    continue

                elif line == "EOS":
                    data.append([words, position2ne])

                    # Start a fresh sentence.
                    words = []
                    position2ne = {}

                else:
                    # The surface form is the first column of the line.
                    words.append(line.split()[0])

    return data, position2ne
53
54
def write_kwdlc_as_single_file(filename, data, position2ne):
    """Write NE-annotated sentences to ``filename`` as ``word<TAB>tag`` lines.

    Each sentence is followed by an ``EOS`` line. The words of an entity are
    tagged ``B-<type>`` (first word), ``M-<type>`` (inner words) and
    ``E-<type>`` (last word of a multi-word entity); every other word is
    tagged ``O``. Sentences without any entity are not written.

    Entities are located by searching, in order of their position key, for a
    contiguous run of words whose concatenation equals the entity target.
    Every word always receives exactly one tag, so words and tags cannot
    drift out of alignment when a partial match fails.

    Args:
        filename: Path of the output file.
        data: List of ``[words, position2ne]`` pairs, where ``position2ne``
            maps a word index to ``[target, netype]``.
        position2ne: Unused; kept so existing callers keep working.
    """
    # NOTE(review): the platform default encoding is kept, as in the original;
    # confirm what the downstream reader expects before pinning one.
    with open(filename, "w") as f:
        for words, sentence_nes in data:
            if not sentence_nes:
                # Sentences without entities are skipped.
                continue

            entities = [sentence_nes[key] for key in sorted(sentence_nes)]
            ne_tags = _tag_words(words, entities)

            for word, ne_tag in zip(words, ne_tags):
                f.write("\t".join([word, ne_tag]) + "\n")
            f.write("EOS\n")


def _tag_words(words, entities):
    """Return one tag per word, marking each entity span found in order."""
    tags = ["O"] * len(words)
    cursor = 0
    for target, netype in entities:
        span = _find_span(words, target, cursor)
        if span is None:
            # The target cannot be rebuilt from whole words; leave it untagged
            # and keep looking for the following entities.
            continue
        begin, end = span
        last = end - begin - 1
        for offset in range(end - begin):
            if offset == 0:
                prefix = "B-"
            elif offset == last:
                prefix = "E-"
            else:
                prefix = "M-"
            tags[begin + offset] = prefix + netype
        cursor = end
    return tags


def _find_span(words, target, start):
    """Return the first ``(begin, end)`` at or after ``start`` whose words join to ``target``."""
    for begin in range(start, len(words)):
        joined = ""
        for end in range(begin, len(words)):
            joined += words[end]
            if joined == target:
                return begin, end + 1
            if not target.startswith(joined):
                break
    return None
117
118
def main():
    """Convert the KWDLC corpus directory into the single file ``kwdlc.txt``."""
    sentences, leftover_nes = load_kwdlc("./KWDLC-1.0/dat/rel")
    write_kwdlc_as_single_file("kwdlc.txt", sentences, leftover_nes)


if __name__ == "__main__":
    main()
Train a model
Next, train an NER model by using the nagisa.fit() function.
After training finishes, the three model files (kwdlc_ner_model.vocabs, kwdlc_ner_model.params, kwdlc_ner_model.hp) are saved in the current directory.
python tutorial_train_kwdlc.py
1import random
2
3import nagisa
4
5
def write_file(filename, X, Y):
    """Write parallel word/tag sequences to ``filename`` in TSV form.

    Each sentence is emitted as one ``word<TAB>tag`` line per token, followed
    by a line containing ``EOS``.

    Args:
        filename: Path of the file to (over)write.
        X: Sequence of sentences, each a sequence of words.
        Y: Sequence of tag sequences aligned with ``X``.
    """
    with open(filename, "w") as out:
        for sentence, labels in zip(X, Y):
            rows = [token + "\t" + label + "\n"
                    for token, label in zip(sentence, labels)]
            out.writelines(rows)
            out.write("EOS\n")
12
13
def main():
    """Split ``kwdlc.txt`` into train/dev/test files and train the NER model.

    The sentences are shuffled with a fixed seed, then split 80% / 10% / rest.
    The test split takes everything after the dev split, so no sentence is
    dropped when the corpus size is not a multiple of ten.
    """
    random.seed(1234)

    # preprocess
    fn_in = "kwdlc.txt"
    X, Y = nagisa.utils.load_file(fn_in)
    indice = list(range(len(X)))
    random.shuffle(indice)

    num_train = int(0.8 * len(indice))
    num_dev = int(0.1 * len(indice))
    dev_end = num_train + num_dev

    train_ids = indice[:num_train]
    dev_ids = indice[num_train:dev_end]
    # Take the remainder rather than a third floored count, which could
    # leave the last few shuffled sentences in no split at all.
    test_ids = indice[dev_end:]

    train_X = [X[i] for i in train_ids]
    train_Y = [Y[i] for i in train_ids]
    dev_X = [X[i] for i in dev_ids]
    dev_Y = [Y[i] for i in dev_ids]
    test_X = [X[i] for i in test_ids]
    test_Y = [Y[i] for i in test_ids]

    fn_out_train = "kwdlc.train"
    fn_out_dev = "kwdlc.dev"
    fn_out_test = "kwdlc.test"
    write_file(fn_out_train, train_X, train_Y)
    write_file(fn_out_dev, dev_X, dev_Y)
    write_file(fn_out_test, test_X, test_Y)

    # start training
    fn_out_model = "kwdlc_ner_model"
    nagisa.fit(
        train_file=fn_out_train,
        dev_file=fn_out_dev,
        test_file=fn_out_test,
        model_name=fn_out_model,
    )


if __name__ == "__main__":
    main()
53
54
This is a log of the training process.
[dynet] random seed: 1234
[dynet] allocating memory: 32MB
[dynet] memory allocation done.
[nagisa] LAYERS: 1
[nagisa] THRESHOLD: 3
[nagisa] DECAY: 1
[nagisa] EPOCH: 10
[nagisa] WINDOW_SIZE: 3
[nagisa] DIM_UNI: 32
[nagisa] DIM_BI: 16
[nagisa] DIM_WORD: 16
[nagisa] DIM_CTYPE: 8
[nagisa] DIM_TAGEMB: 16
[nagisa] DIM_HIDDEN: 100
[nagisa] LEARNING_RATE: 0.1
[nagisa] DROPOUT_RATE: 0.3
[nagisa] SEED: 1234
[nagisa] TRAINSET: kwdlc.train
[nagisa] TESTSET: kwdlc.test
[nagisa] DEVSET: kwdlc.dev
[nagisa] DICTIONARY: None
[nagisa] EMBEDDING: None
[nagisa] HYPERPARAMS: kwdlc_ner_model.hp
[nagisa] MODEL: kwdlc_ner_model.params
[nagisa] VOCAB: kwdlc_ner_model.vocabs
[nagisa] EPOCH_MODEL: kwdlc_ner_model_epoch.params
[nagisa] NUM_TRAIN: 3816
[nagisa] NUM_TEST: 477
[nagisa] NUM_DEV: 477
[nagisa] VOCAB_SIZE_UNI: 1838
[nagisa] VOCAB_SIZE_BI: 12774
[nagisa] VOCAB_SIZE_WORD: 4809
[nagisa] VOCAB_SIZE_POSTAG: 29
Epoch LR Loss Time_m DevWS_f1 DevPOS_f1 TestWS_f1 TestPOS_f1
1 0.100 15.09 0.632 92.41 83.14 91.70 82.63
2 0.100 8.818 0.637 93.59 85.59 93.21 85.28
3 0.100 6.850 0.637 93.98 85.60 93.75 86.01
4 0.100 5.751 0.634 94.44 87.29 94.01 86.99
5 0.050 5.028 0.614 94.35 87.02 94.01 86.99
6 0.050 3.727 0.647 94.84 87.52 94.79 87.91
7 0.025 3.268 0.613 94.52 87.45 94.79 87.91
8 0.012 2.761 0.610 94.75 87.58 94.79 87.91
9 0.012 2.447 0.634 94.95 87.79 95.00 88.28
10 0.006 2.333 0.624 94.73 87.41 95.00 88.28
Predict
You can build the tagger simply by passing the three trained model files (kwdlc_ner_model.vocabs, kwdlc_ner_model.params, kwdlc_ner_model.hp) as arguments to nagisa.Tagger().
python tutorial_predict_kwdlc.py
Error analysis
By checking the tag-level accuracy, the entity-level macro-f1, and the classification_report, you can see where the model makes mistakes.
python tutorial_error_analysis_kwdlc.py
1import nagisa
2
3from seqeval.metrics import f1_score
4from seqeval.metrics import accuracy_score
5from seqeval.metrics import classification_report
6
7
def main():
    """Evaluate the trained KWDLC tagger on ``kwdlc.test`` and print the scores.

    Prints the accuracy, the F1 score and the classification report computed
    by seqeval over the gold and predicted tag sequences.
    """
    # Gold-standard words and tags of the held-out test split.
    test_X, test_Y = nagisa.utils.load_file("kwdlc.test")

    # Tagger restored from the three trained model files.
    ner_tagger = nagisa.Tagger(
        vocabs='kwdlc_ner_model.vocabs',
        params='kwdlc_ner_model.params',
        hp='kwdlc_ner_model.hp',
    )

    true_Y = []
    pred_Y = []
    for words, gold_tags in zip(test_X, test_Y):
        predicted_tags = ner_tagger.decode(words)

        # Zipping all three sequences truncates to the shortest of them,
        # so both collected lists always have the same length.
        aligned = list(zip(words, predicted_tags, gold_tags))
        true_Y.append([gold for _, _, gold in aligned])
        pred_Y.append([pred for _, pred, _ in aligned])

    # evaluate
    print("accuracy: {}".format(accuracy_score(true_Y, pred_Y)))
    # NOTE(review): f1_score is called with its default averaging; confirm
    # against the seqeval docs that the "macro-f1" label matches it.
    print("macro-f1: {}".format(f1_score(true_Y, pred_Y)))
    print(classification_report(true_Y, pred_Y))


if __name__ == "__main__":
    main()
42
accuracy: 0.9166868198307134
macro-f1: 0.5900383141762452
precision recall f1-score support
ARTIFACT 0.33 0.35 0.34 86
OPTIONAL 0.32 0.19 0.24 31
ORGANIZATION 0.40 0.33 0.36 109
DATE 0.84 0.87 0.86 154
LOCATION 0.64 0.68 0.66 262
MONEY 0.88 0.88 0.88 16
PERSON 0.44 0.62 0.51 94
TIME 0.40 0.44 0.42 9
PERCENT 0.75 0.50 0.60 6
avg / total 0.58 0.60 0.59 767