Tutorial (Japanese Named Entity Recognition)

Train a Japanese NER model for KWDLC

This tutorial provides an example of training a Japanese NER model by using the Kyoto University Web Document Leads Corpus (KWDLC).

Download the dataset

Please download KWDLC from http://nlp.ist.i.kyoto-u.ac.jp/EN/index.php?KWDLC manually. Copy the corpus to the working directory.

Install python libraries

Before we get started, please run the following command to install the libraries used in this tutorial.

pip install nagisa
pip install seqeval
pip install beautifulsoup4

Preprocess the dataset

First, convert the downloaded data to the input data format for nagisa. The input format of the train/dev/test files is TSV: each line contains a word and its tag separated by a tab (word \t tag). Note that you put EOS between sentences.

This preprocessing step is a little complicated, so please copy the code below and run it. After running the code, kwdlc.txt is written to the working directory.

python tutorial_preprocess_kwdlc.py
tutorial_preprocess_kwdlc.py
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import bs4
import glob


def load_kwdlc(dir_path):
    """Load KWDLC annotation files and collect words and named-entity spans.

    Args:
        dir_path: Root directory of the corpus; files are gathered with the
            glob pattern ``dir_path/*/*``.

    Returns:
        A tuple ``(data, position2ne)``.  ``data`` is a list of
        ``[words, position2ne]`` pairs, one per sentence terminated by
        ``EOS``: ``words`` is the list of surface tokens and
        ``position2ne`` maps the word index at which an ``<ne>`` annotation
        appeared to ``[target, netype]``.  The second tuple element is the
        leftover accumulator of a final sentence not terminated by ``EOS``
        (normally empty).
    """
    files = glob.glob(dir_path + "/*/*", recursive=True)

    data = []
    words = []
    position2ne = {}

    for fn in files:
        with open(fn, "r") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Guard: indexing line[0] below would raise IndexError
                    # on blank lines.
                    continue
                first_char = line[0]

                if first_char == "+":
                    # NE annotations appear on "+" lines as
                    # <ne target=... type=...> tags; parse them with bs4.
                    soup = bs4.BeautifulSoup(line, "html.parser")
                    for content in soup.contents:
                        # isinstance replaces the fragile string comparison
                        # against "<class 'bs4.element.Tag'>".
                        if isinstance(content, bs4.element.Tag) and content.name == "ne":
                            target = content.attrs["target"]
                            netype = content.attrs["type"]
                            # Key is the index of the next word to be read.
                            position2ne[len(words)] = [target, netype]

                elif first_char == "#" or first_char == "*":
                    # Comment / phrase-boundary lines carry no token info.
                    pass

                elif line == "EOS":
                    # End of sentence: store it and reset the accumulators.
                    data.append([words, position2ne])
                    words = []
                    position2ne = {}

                else:
                    # Morpheme line: the first whitespace-separated token
                    # is the surface form.
                    surface = line.split()[0]
                    words.append(surface)

    return data, position2ne


def write_kwdlc_as_single_file(filename, data, position2ne):
    """Write NE-tagged sentences to *filename* as ``word<TAB>tag`` lines.

    Sentences are separated by an ``EOS`` line.  Tags follow a B/M/E/O
    scheme: "B-" on the first word of an entity, "E-" on the last, "M-"
    in between, "O" outside.  Sentences containing no named entities are
    skipped entirely (nothing is written for them).

    Args:
        filename: Output path; the file is overwritten.
        data: List of ``[words, position2ne]`` pairs from ``load_kwdlc``.
        position2ne: Effectively unused — immediately shadowed by the
            per-sentence dict unpacked from each ``data`` entry.
    """

    with open(filename, "w") as f:
        for line in data:
            # Per-sentence word list and its NE annotations.
            words, position2ne = line

            # Entity surface strings ordered by word position, then
            # reversed so that pop() yields them in sentence order.
            nes = [v[0] for k, v in sorted(position2ne.items(), key=lambda x:x[0])]
            nes = list(reversed(nes))

            # The matching NE type labels, in the same order as `nes`.
            tags = [v[1] for k, v in sorted(position2ne.items(), key=lambda x:x[0])]
            tags = list(reversed(tags))

            if len(nes) == 0:
                # No entities: this sentence is dropped from the output.
                None

            else:
                ne_tags = []

                # Current entity to match and the character of it that the
                # next word is expected to start with.
                ne = nes.pop()
                tag = tags.pop()
                ne_target_char = ne[0]

                # Words matched so far against the current entity string.
                partical = []
                for word in words:
                    first_char = word[0]
                    if first_char == ne_target_char:

                        if word in ne:
                            partical.append(word)

                            if "".join(partical) == ne:
                                # Full entity matched: tag its words B/M/E.
                                # A single-word entity gets only "B-".
                                for i, word in enumerate(partical):
                                    if i == 0:
                                        ne_tags.append("B-"+tag)
                                    elif i == len(partical) - 1:
                                        ne_tags.append("E-"+tag)
                                    else:
                                        ne_tags.append("M-"+tag)

                                # Advance to the next entity, if any remain.
                                if len(nes) > 0:
                                    ne = nes.pop()
                                    tag = tags.pop()
                                    ne_target_char = ne[0]

                                partical = []

                            else:
                                # Partial match: the next word must start at
                                # this character offset within the entity.
                                ne_target_char = ne[len("".join(partical))]

                        else:
                            # False start: discard the partial match.
                            # NOTE(review): words already absorbed into the
                            # abandoned partial match received no tag, so
                            # ne_tags can end up shorter than words and the
                            # zip() below silently truncates — verify on
                            # real data.
                            partical = []
                            ne_tags.append("O")

                    else:
                        partical = []
                        ne_tags.append("O")


                for word, ne_tag in zip(words, ne_tags):
                    f.write("\t".join([word, ne_tag])+"\n")
                f.write("EOS\n")


def main():
    """Convert the KWDLC corpus into a single TSV file, kwdlc.txt."""
    corpus_dir = "./KWDLC-1.0/dat/rel"
    sentences, leftover = load_kwdlc(corpus_dir)

    write_kwdlc_as_single_file("kwdlc.txt", sentences, leftover)


if __name__ == "__main__":
    main()

Train a model

Next, train a NER model by using the nagisa.fit() function. After training finishes, the three model files (kwdlc_ner_model.vocabs, kwdlc_ner_model.params, kwdlc_ner_model.hp) are saved in the current directory.

python tutorial_train_kwdlc.py
tutorial_train_kwdlc.py
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import random

import nagisa


def write_file(filename, X, Y):
    """Dump tagged sentences to *filename* as word<TAB>tag lines, EOS-separated."""
    with open(filename, "w") as f:
        for sent_words, sent_tags in zip(X, Y):
            rows = ["\t".join(pair) for pair in zip(sent_words, sent_tags)]
            f.write("\n".join(rows + ["EOS"]) + "\n")


def main():
    """Split kwdlc.txt 80/10/10 into train/dev/test and train a NER model."""
    random.seed(1234)

    # Load the sentences and shuffle their indices reproducibly.
    X, Y = nagisa.utils.load_file("kwdlc.txt")
    indices = list(range(len(X)))
    random.shuffle(indices)

    total = len(indices)
    num_train = int(0.8 * total)
    num_dev = int(0.1 * total)
    num_test = int(0.1 * total)

    # Contiguous slices of the shuffled index list form the three splits.
    splits = {
        "kwdlc.train": indices[:num_train],
        "kwdlc.dev": indices[num_train:num_train + num_dev],
        "kwdlc.test": indices[num_train + num_dev:num_train + num_dev + num_test],
    }
    for fn, idx in splits.items():
        write_file(fn, [X[i] for i in idx], [Y[i] for i in idx])

    # Train; the model files kwdlc_ner_model.{vocabs,params,hp} are saved
    # in the current directory.
    nagisa.fit(
        train_file="kwdlc.train",
        dev_file="kwdlc.dev",
        test_file="kwdlc.test",
        model_name="kwdlc_ner_model"
    )


if __name__ == "__main__":
    main()


This is a log of the training process.

[dynet] random seed: 1234
[dynet] allocating memory: 32MB
[dynet] memory allocation done.
[nagisa] LAYERS: 1
[nagisa] THRESHOLD: 3
[nagisa] DECAY: 1
[nagisa] EPOCH: 10
[nagisa] WINDOW_SIZE: 3
[nagisa] DIM_UNI: 32
[nagisa] DIM_BI: 16
[nagisa] DIM_WORD: 16
[nagisa] DIM_CTYPE: 8
[nagisa] DIM_TAGEMB: 16
[nagisa] DIM_HIDDEN: 100
[nagisa] LEARNING_RATE: 0.1
[nagisa] DROPOUT_RATE: 0.3
[nagisa] SEED: 1234
[nagisa] TRAINSET: kwdlc.train
[nagisa] TESTSET: kwdlc.test
[nagisa] DEVSET: kwdlc.dev
[nagisa] DICTIONARY: None
[nagisa] EMBEDDING: None
[nagisa] HYPERPARAMS: kwdlc_ner_model.hp
[nagisa] MODEL: kwdlc_ner_model.params
[nagisa] VOCAB: kwdlc_ner_model.vocabs
[nagisa] EPOCH_MODEL: kwdlc_ner_model_epoch.params
[nagisa] NUM_TRAIN: 3816
[nagisa] NUM_TEST: 477
[nagisa] NUM_DEV: 477
[nagisa] VOCAB_SIZE_UNI: 1838
[nagisa] VOCAB_SIZE_BI: 12774
[nagisa] VOCAB_SIZE_WORD: 4809
[nagisa] VOCAB_SIZE_POSTAG: 29
Epoch       LR      Loss    Time_m  DevWS_f1        DevPOS_f1       TestWS_f1       TestPOS_f1
1           0.100   15.09   0.632   92.41           83.14           91.70           82.63
2           0.100   8.818   0.637   93.59           85.59           93.21           85.28
3           0.100   6.850   0.637   93.98           85.60           93.75           86.01
4           0.100   5.751   0.634   94.44           87.29           94.01           86.99
5           0.050   5.028   0.614   94.35           87.02           94.01           86.99
6           0.050   3.727   0.647   94.84           87.52           94.79           87.91
7           0.025   3.268   0.613   94.52           87.45           94.79           87.91
8           0.012   2.761   0.610   94.75           87.58           94.79           87.91
9           0.012   2.447   0.634   94.95           87.79           95.00           88.28
10          0.006   2.333   0.624   94.73           87.41           95.00           88.28

Predict

You can build the tagger by passing the three trained model files (kwdlc_ner_model.vocabs, kwdlc_ner_model.params, kwdlc_ner_model.hp) as arguments to nagisa.Tagger().

python tutorial_predict_kwdlc.py

Error analysis

By checking the tag-level accuracy, the entity-level macro-F1, and the classification report, you can see where the model makes mistakes.

python tutorial_error_analysis_kwdlc.py
tutorial_error_analysis_kwdlc.py
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import nagisa

from seqeval.metrics import f1_score
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report


def main():
    """Evaluate the trained KWDLC NER model: accuracy, macro-F1, report."""
    # Load the held-out test split.
    test_X, test_Y = nagisa.utils.load_file("kwdlc.test")

    # Rebuild the tagger from the three saved model files.
    ner_tagger = nagisa.Tagger(vocabs='kwdlc_ner_model.vocabs',
                               params='kwdlc_ner_model.params',
                               hp='kwdlc_ner_model.hp')

    # Decode each sentence; pairing through zip truncates both tag
    # sequences to the common length with the word list.
    true_Y = []
    pred_Y = []
    for words, gold_tags in zip(test_X, test_Y):
        decoded = ner_tagger.decode(words)
        paired = list(zip(words, decoded, gold_tags))
        pred_Y.append([pred for _, pred, _ in paired])
        true_Y.append([gold for _, _, gold in paired])

    # Report tag-level accuracy, entity-level F1, and per-class scores.
    print("accuracy: {}".format(accuracy_score(true_Y, pred_Y)))
    print("macro-f1: {}".format(f1_score(true_Y, pred_Y)))
    print(classification_report(true_Y, pred_Y))


if __name__ == "__main__":
    main()

accuracy: 0.9166868198307134
macro-f1: 0.5900383141762452
                  precision    recall  f1-score   support

    ARTIFACT       0.33      0.35      0.34        86
    OPTIONAL       0.32      0.19      0.24        31
ORGANIZATION       0.40      0.33      0.36       109
        DATE       0.84      0.87      0.86       154
    LOCATION       0.64      0.68      0.66       262
       MONEY       0.88      0.88      0.88        16
      PERSON       0.44      0.62      0.51        94
        TIME       0.40      0.44      0.42         9
     PERCENT       0.75      0.50      0.60         6

 avg / total       0.58      0.60      0.59       767