Source code for nagisa.tagger

# -*- coding:utf-8 -*-

from __future__ import division, print_function, absolute_import

import os
import re
import sys

import nagisa_utils as utils
import nagisa.np_model as model

base = os.path.dirname(os.path.abspath(__file__))
sys.path.append(base)


[docs]class Tagger(object):
    """
    This class has a word segmentation function and a POS-tagging function for Japanese.
    """

    def __init__(self, vocabs=None, params=None, hp=None, single_word_list=None):
        if vocabs is None:
            vocabs = base + '/data/nagisa_v001.dict'
        if params is None:
            params = base + '/data/nagisa_v001.model'
        if hp is None:
            hp = base + '/data/nagisa_v001.hp'

        # Load vocaburary files
        vocabs = utils.load_data(vocabs)
        self._uni2id, self._bi2id, self._word2id, self._pos2id, self._word2postags = vocabs
        self._id2pos = {v:k for k, v in self._pos2id.items()}
        self.id2pos  = self._id2pos
        self.postags = [postag for postag in self._pos2id.keys()]
        # Load a hyper-parameter file
        self._hp = utils.load_data(hp)
        # Construct a word segmentation model and a pos tagging model
        self._model = model.Model(self._hp, params)

        # If a word is included in the single_word_list,
        # it is recognized as a single word forcibly.
        self.pattern = None
        if single_word_list:
            single_word_list = [utils.preprocess(w) for w in single_word_list if len(w) > 1]
            single_word_list = [w.replace('(', r'\(').replace(')', r'\)')
                                for w in single_word_list]
            single_word_list = sorted(single_word_list, key=lambda x:-len(x))
            if len(single_word_list) > 0:
                self.pattern = re.compile('|'.join(single_word_list))

        # If use_noun_heuristic is True, nouns are more lilely to appear.
        if u'名詞' in self._pos2id:
            self.use_noun_heuristic = True
        else:
            self.use_noun_heuristic = False


    def wakati(self, text, lower=False):
        """Word segmentation function. Return the segmented words.

        args:
            - text (str): An input sentence.
            - lower (bool): If lower is True, all uppercase characters in a list \
                            of the words are converted into lowercase characters.

        return:
            - words (list): A list of the words.
        """
        text = utils.preprocess(text)
        lower_text = text.lower()
        feats = utils.feature_extraction(text=lower_text,
                                         uni2id=self._uni2id,
                                         bi2id=self._bi2id,
                                         dictionary=self._word2id,
                                         window_size=self._hp['WINDOW_SIZE'])
        obs  = self._model.encode_ws(feats)
        tags = utils.np_viterbi(self._model.trans_array, obs)

        # A word can be recognized as a single word forcibly.
        if self.pattern:
            for match in self.pattern.finditer(text):
                span = match.span()
                span_s = span[0]
                span_e = span[1]

                if (span_e - span_s) == 1:
                    tags[span_s:span_e] = [3]
                else:
                    tags[span_s:span_e] = [0]+[1]*((span_e-span_s)-2)+[2]

                if span_s != 0:
                    previous_tag = tags[span_s-1]
                    if previous_tag == 0:   # 0 is BEGIN tag
                        tags[span_s-1] = 3  # 3 is SINGLE tag
                    elif previous_tag == 1: # 1 is MIDDEL tag
                        tags[span_s-1] = 2  # 2 is END tag

                if span_e != len(text):
                    next_tag = tags[span_e]
                    if next_tag == 1:    # 1 is MIDDEL tag
                        tags[span_e] = 0 # 0 is BEGIN tag
                    elif next_tag == 2:  # 2 is END tag
                        tags[span_e] = 3 # 3 is SINGLE tag

        if lower is True:
            words = utils.segmenter_for_bmes(lower_text, tags)
        else:
            words = utils.segmenter_for_bmes(text, tags)
        return words


    def _postagging(self, words, lower=False):
        if lower is True:
            words = [w.lower() for w in words]

        wids = utils.conv_tokens_to_ids(words, self._word2id)
        cids = [utils.conv_tokens_to_ids([c for c in w], self._uni2id) for w in words]

        # Improve the bottleneck in part-of-speech tagging.
        # No changes made to output results by this change.
        tids = []
        for w in words:
            w2p = set(self._word2postags.get(w, [0]))
            if self.use_noun_heuristic and w.isalnum():
                if 0 in w2p:
                    w2p.remove(0)
                w2p.add(self._pos2id[u'名詞'])
            tids.append(list(w2p))

        X = [cids, wids, tids]
        postags = [self._id2pos[pid] for pid in self._model.POStagging(X)]
        return postags


    def postagging(self, words, lower=False):
        """ Return the words with POS-tags of the given words.

        args:
            - words (list): Input words.
            - lower (bool): If lower is True, all uppercase characters in a list \
                            of the words are converted into lowercase characters.
        return:
            - object : The object of the words with POS-tags.
        """
        return self.decode(words, lower)


    def decode(self, words, lower=False):
        """ Return the words with tags of the given words.

        args:
            - words (list): Input words.
            - lower (bool, optional): If lower is True, all uppercase characters in a list \
                            of the words are converted into lowercase characters.
        return:
            - object : The object of the words with tags.
        """
        if not type(words) == list:
            raise AssertionError("Please input a list of words.")
        words = [utils.preprocess_without_rstrip(w) if w == " " or w == "　"
                 else utils.preprocess(w) for w in words]
        postags = self._postagging(words, lower)
        return postags


    def tagging(self, text, lower=False):
        """ Return the words with POS-tags of the given sentence.

        args:
            - text (str): An input sentence.
            - lower (bool): If lower is True, all uppercase characters in a list \
                            of the words are converted into lowercase characters.
        return:
            - object : The object of the words with POS-tags.
        """
        return self._Token(text, lower, self.wakati, self._postagging)


    def filter(self, text, lower=False, filter_postags=None):
        """Return the filtered words with POS-tags of the given sentence.

        args:
            - text (str): An input sentence.
            - lower (bool): If lower is True, all uppercase characters in a list \
                            of the words are converted into lowercase characters.
            - filter_postags (list): Filtering the word with the POS-tag in \
                                     filter_postags from a text.

        return:
            - object : The object of the words with POS-tags.
        """
        if filter_postags is None:
            filter_postags = []

        words   = []
        postags = []
        tokens  = self.tagging(text, lower)
        for word, postag in zip(tokens.words, tokens.postags):
            if not postag in filter_postags:
                words.append(word)
                postags.append(postag)
        return self._Token(text, lower, self.wakati, self._postagging,
                           _words=words, _postags=postags)


    def extract(self, text, lower=False, extract_postags=None):
        """Return the extracted words with POS-tags of the given sentence.

        args:
            - text (str): An input sentence.
            - lower (bool): If lower is True, all uppercase characters in a list \
                            of the words are converted into lowercase characters.
            - filter_postags (list): Extracting the word with the POS-tag in \
                                     filter_postags from a text.

        return:
            - object : The object of the words with POS-tags.

        """
        if extract_postags is None:
            extract_postags = []

        words   = []
        postags = []
        tokens  = self.tagging(text, lower)
        for word, postag in zip(tokens.words, tokens.postags):
            if postag in extract_postags:
                words.append(word)
                postags.append(postag)
        return self._Token(text, lower, self.wakati, self._postagging,
                           _words=words, _postags=postags)


    class _Token(object):
        def __init__(self, text, lower, wakati, postagging, _words=None, _postags=None):
            self.text = text
            self.__lower = lower
            self.__words = _words
            self.__postags = _postags
            self.__wakati = wakati
            self.__postagging = postagging

        @property
        def words(self):
            if self.__words is None:
                self.__words = self.__wakati(self.text, self.__lower)
            return self.__words

        @property
        def postags(self):
            if self.__postags is None:
                self.__postags = self.__postagging(self.words, self.__lower)
            return self.__postags

        def __str__(self):
            return ' '.join([w+'/'+p for w, p in zip(self.words, self.postags)])