Tutorial (Japanese Named Entity Recognition)

Train a Japanese NER model for KWDLC

This tutorial provides an example of training a Japanese NER model by using the Kyoto University Web Document Leads Corpus (KWDLC).

Download the dataset

Please download KWDLC from http://nlp.ist.i.kyoto-u.ac.jp/EN/index.php?KWDLC manually. Copy the corpus to the working directory.

Install python libraries

Before we get started, please run the following command to install the libraries used in this tutorial.

pip install nagisa
pip install seqeval
pip install beautifulsoup4

Preprocess the dataset

First, convert the downloaded data to the input data format for nagisa. The input format of the train/dev/test files is TSV: each line contains a word and its tag separated by a tab (word \t tag). Note that you put EOS between sentences.

This preprocessing step is a little complicated, so please copy the code below and run it. After running the code, kwdlc.txt is written to the working directory.

python tutorial_preprocess_kwdlc.py
tutorial_preprocess_kwdlc.py
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import bs4
import glob


def load_kwdlc(dir_path):
    """Load KWDLC annotation files and collect words and named-entity spans.

    Args:
        dir_path: Root directory of the corpus; files are gathered with the
            glob pattern ``dir_path/*/*``.

    Returns:
        A tuple ``(data, position2ne)``.  ``data`` is a list of
        ``[words, position2ne]`` pairs, one per sentence terminated by
        ``EOS``: ``words`` is the list of surface tokens and
        ``position2ne`` maps the word index at which an ``<ne>`` annotation
        appeared to ``[target, netype]``.  The second tuple element is the
        leftover accumulator of a final sentence not terminated by ``EOS``
        (normally empty).
    """
    files = glob.glob(dir_path + "/*/*", recursive=True)

    data = []
    words = []
    position2ne = {}

    for fn in files:
        with open(fn, "r") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Guard: indexing line[0] below would raise IndexError
                    # on blank lines.
                    continue
                first_char = line[0]

                if first_char == "+":
                    # NE annotations appear on "+" lines as
                    # <ne target=... type=...> tags; parse them with bs4.
                    soup = bs4.BeautifulSoup(line, "html.parser")
                    for content in soup.contents:
                        # isinstance replaces the fragile string comparison
                        # against "<class 'bs4.element.Tag'>".
                        if isinstance(content, bs4.element.Tag) and content.name == "ne":
                            target = content.attrs["target"]
                            netype = content.attrs["type"]
                            # Key is the index of the next word to be read.
                            position2ne[len(words)] = [target, netype]

                elif first_char == "#" or first_char == "*":
                    # Comment / phrase-boundary lines carry no token info.
                    pass

                elif line == "EOS":
                    # End of sentence: store it and reset the accumulators.
                    data.append([words, position2ne])
                    words = []
                    position2ne = {}

                else:
                    # Morpheme line: the first whitespace-separated token
                    # is the surface form.
                    surface = line.split()[0]
                    words.append(surface)

    return data, position2ne


def write_kwdlc_as_single_file(filename, data, position2ne):
    """Write NE-tagged sentences to *filename* as ``word<TAB>tag`` lines.

    Sentences are separated by an ``EOS`` line.  Tags follow a B/M/E/O
    scheme: "B-" on the first word of an entity, "E-" on the last, "M-"
    in between, "O" outside.  Sentences containing no named entities are
    skipped entirely (nothing is written for them).

    Args:
        filename: Output path; the file is overwritten.
        data: List of ``[words, position2ne]`` pairs from ``load_kwdlc``.
        position2ne: Effectively unused — immediately shadowed by the
            per-sentence dict unpacked from each ``data`` entry.
    """

    with open(filename, "w") as f:
        for line in data:
            # Per-sentence word list and its NE annotations.
            words, position2ne = line

            # Entity surface strings ordered by word position, then
            # reversed so that pop() yields them in sentence order.
            nes = [v[0] for k, v in sorted(position2ne.items(), key=lambda x:x[0])]
            nes = list(reversed(nes))

            # The matching NE type labels, in the same order as `nes`.
            tags = [v[1] for k, v in sorted(position2ne.items(), key=lambda x:x[0])]
            tags = list(reversed(tags))

            if len(nes) == 0:
                # No entities: this sentence is dropped from the output.
                None

            else:
                ne_tags = []

                # Current entity to match and the character of it that the
                # next word is expected to start with.
                ne = nes.pop()
                tag = tags.pop()
                ne_target_char = ne[0]

                # Words matched so far against the current entity string.
                partical = []
                for word in words:
                    first_char = word[0]
                    if first_char == ne_target_char:

                        if word in ne:
                            partical.append(word)

                            if "".join(partical) == ne:
                                # Full entity matched: tag its words B/M/E.
                                # A single-word entity gets only "B-".
                                for i, word in enumerate(partical):
                                    if i == 0:
                                        ne_tags.append("B-"+tag)
                                    elif i == len(partical) - 1:
                                        ne_tags.append("E-"+tag)
                                    else:
                                        ne_tags.append("M-"+tag)

                                # Advance to the next entity, if any remain.
                                if len(nes) > 0:
                                    ne = nes.pop()
                                    tag = tags.pop()
                                    ne_target_char = ne[0]

                                partical = []

                            else:
                                # Partial match: the next word must start at
                                # this character offset within the entity.
                                ne_target_char = ne[len("".join(partical))]

                        else:
                            # False start: discard the partial match.
                            # NOTE(review): words already absorbed into the
                            # abandoned partial match received no tag, so
                            # ne_tags can end up shorter than words and the
                            # zip() below silently truncates — verify on
                            # real data.
                            partical = []
                            ne_tags.append("O")

                    else:
                        partical = []
                        ne_tags.append("O")


                for word, ne_tag in zip(words, ne_tags):
                    f.write("\t".join([word, ne_tag])+"\n")
                f.write("EOS\n")


def main():
    """Convert the KWDLC corpus into a single TSV file, kwdlc.txt."""
    corpus_dir = "./KWDLC-1.0/dat/rel"
    sentences, leftover = load_kwdlc(corpus_dir)

    write_kwdlc_as_single_file("kwdlc.txt", sentences, leftover)


if __name__ == "__main__":
    main()

Train a model

Next, train a NER model by using the nagisa.fit() function. After training finishes, the three model files (kwdlc_ner_model.vocabs, kwdlc_ner_model.params, kwdlc_ner_model.hp) are saved in the current directory.

python tutorial_train_kwdlc.py
tutorial_train_kwdlc.py
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import random

import nagisa


def write_file(filename, X, Y):
    """Dump tagged sentences to *filename* as word<TAB>tag lines, EOS-separated."""
    with open(filename, "w") as f:
        for sent_words, sent_tags in zip(X, Y):
            rows = ["\t".join(pair) for pair in zip(sent_words, sent_tags)]
            f.write("\n".join(rows + ["EOS"]) + "\n")


def main():
    """Split kwdlc.txt 80/10/10 into train/dev/test and train a NER model."""
    random.seed(1234)

    # Load the sentences and shuffle their indices reproducibly.
    X, Y = nagisa.utils.load_file("kwdlc.txt")
    indices = list(range(len(X)))
    random.shuffle(indices)

    total = len(indices)
    num_train = int(0.8 * total)
    num_dev = int(0.1 * total)
    num_test = int(0.1 * total)

    # Contiguous slices of the shuffled index list form the three splits.
    splits = {
        "kwdlc.train": indices[:num_train],
        "kwdlc.dev": indices[num_train:num_train + num_dev],
        "kwdlc.test": indices[num_train + num_dev:num_train + num_dev + num_test],
    }
    for fn, idx in splits.items():
        write_file(fn, [X[i] for i in idx], [Y[i] for i in idx])

    # Train; the model files kwdlc_ner_model.{vocabs,params,hp} are saved
    # in the current directory.
    nagisa.fit(
        train_file="kwdlc.train",
        dev_file="kwdlc.dev",
        test_file="kwdlc.test",
        model_name="kwdlc_ner_model"
    )


if __name__ == "__main__":
    main()


This is a log of the training process.

[dynet] random seed: 1234
[dynet] allocating memory: 32MB
[dynet] memory allocation done.
[nagisa] LAYERS: 1
[nagisa] THRESHOLD: 3
[nagisa] DECAY: 1
[nagisa] EPOCH: 10
[nagisa] WINDOW_SIZE: 3
[nagisa] DIM_UNI: 32
[nagisa] DIM_BI: 16
[nagisa] DIM_WORD: 16
[nagisa] DIM_CTYPE: 8
[nagisa] DIM_TAGEMB: 16
[nagisa] DIM_HIDDEN: 100
[nagisa] LEARNING_RATE: 0.1
[nagisa] DROPOUT_RATE: 0.3
[nagisa] SEED: 1234
[nagisa] TRAINSET: kwdlc.train
[nagisa] TESTSET: kwdlc.test
[nagisa] DEVSET: kwdlc.dev
[nagisa] DICTIONARY: None
[nagisa] EMBEDDING: None
[nagisa] HYPERPARAMS: kwdlc_ner_model.hp
[nagisa] MODEL: kwdlc_ner_model.params
[nagisa] VOCAB: kwdlc_ner_model.vocabs
[nagisa] EPOCH_MODEL: kwdlc_ner_model_epoch.params
[nagisa] NUM_TRAIN: 3816
[nagisa] NUM_TEST: 477
[nagisa] NUM_DEV: 477
[nagisa] VOCAB_SIZE_UNI: 1838
[nagisa] VOCAB_SIZE_BI: 12774
[nagisa] VOCAB_SIZE_WORD: 4809
[nagisa] VOCAB_SIZE_POSTAG: 29
Epoch       LR      Loss    Time_m  DevWS_f1        DevPOS_f1       TestWS_f1       TestPOS_f1
1           0.100   15.09   0.632   92.41           83.14           91.70           82.63
2           0.100   8.818   0.637   93.59           85.59           93.21           85.28
3           0.100   6.850   0.637   93.98           85.60           93.75           86.01
4           0.100   5.751   0.634   94.44           87.29           94.01           86.99
5           0.050   5.028   0.614   94.35           87.02           94.01           86.99
6           0.050   3.727   0.647   94.84           87.52           94.79           87.91
7           0.025   3.268   0.613   94.52           87.45           94.79           87.91
8           0.012   2.761   0.610   94.75           87.58           94.79           87.91
9           0.012   2.447   0.634   94.95           87.79           95.00           88.28
10          0.006   2.333   0.624   94.73           87.41           95.00           88.28

Predict

You can build the tagger by passing the three trained model files (kwdlc_ner_model.vocabs, kwdlc_ner_model.params, kwdlc_ner_model.hp) as arguments to nagisa.Tagger().

python tutorial_predict_kwdlc.py

Error analysis

By checking the tag-level accuracy, the entity-level macro-F1, and the classification report, you can see where the model makes mistakes.

python tutorial_error_analysis_kwdlc.py
tutorial_error_analysis_kwdlc.py
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import nagisa

from seqeval.metrics import f1_score
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report


def main():
    """Evaluate the trained KWDLC NER model: accuracy, macro-F1, report."""
    # Load the held-out test split.
    test_X, test_Y = nagisa.utils.load_file("kwdlc.test")

    # Rebuild the tagger from the three saved model files.
    ner_tagger = nagisa.Tagger(vocabs='kwdlc_ner_model.vocabs',
                               params='kwdlc_ner_model.params',
                               hp='kwdlc_ner_model.hp')

    # Decode each sentence; pairing through zip truncates both tag
    # sequences to the common length with the word list.
    true_Y = []
    pred_Y = []
    for words, gold_tags in zip(test_X, test_Y):
        decoded = ner_tagger.decode(words)
        paired = list(zip(words, decoded, gold_tags))
        pred_Y.append([pred for _, pred, _ in paired])
        true_Y.append([gold for _, _, gold in paired])

    # Report tag-level accuracy, entity-level F1, and per-class scores.
    print("accuracy: {}".format(accuracy_score(true_Y, pred_Y)))
    print("macro-f1: {}".format(f1_score(true_Y, pred_Y)))
    print(classification_report(true_Y, pred_Y))


if __name__ == "__main__":
    main()

accuracy: 0.9166868198307134
macro-f1: 0.5900383141762452
                  precision    recall  f1-score   support

    ARTIFACT       0.33      0.35      0.34        86
    OPTIONAL       0.32      0.19      0.24        31
ORGANIZATION       0.40      0.33      0.36       109
        DATE       0.84      0.87      0.86       154
    LOCATION       0.64      0.68      0.66       262
       MONEY       0.88      0.88      0.88        16
      PERSON       0.44      0.62      0.51        94
        TIME       0.40      0.44      0.42         9
     PERCENT       0.75      0.50      0.60         6

 avg / total       0.58      0.60      0.59       767