com.worksap.nlp.sudachi.dictionary.DoubleArrayLexicon Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of sudachi Show documentation
Japanese morphological analyzer
There is a newer version: 0.7.5
/*
 * Copyright (c) 2021 Works Applications Co., Ltd.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.worksap.nlp.sudachi.dictionary;

import java.nio.Buffer;
import java.nio.ByteBuffer;
import java.nio.IntBuffer;
import java.util.Iterator;

import com.worksap.nlp.dartsclone.DoubleArray;
import com.worksap.nlp.sudachi.MorphemeList;
import com.worksap.nlp.sudachi.Tokenizer;

public class DoubleArrayLexicon implements Lexicon {

    static final int USER_DICT_COST_PAR_MORPH = -20;

    private WordIdTable wordIdTable;
    private WordParameterList wordParams;
    private WordInfoList wordInfos;
    private DoubleArray trie;

    public DoubleArrayLexicon(ByteBuffer bytes, int offset, boolean hasSynonymGid) {
        trie = new DoubleArray();
        int size = bytes.getInt(offset);
        offset += 4;
        ((Buffer) bytes).position(offset); // a kludge for Java 9
        IntBuffer array = bytes.asIntBuffer();
        trie.setArray(array, size);
        offset += trie.totalSize();

        wordIdTable = new WordIdTable(bytes, offset);
        offset += wordIdTable.storageSize();

        wordParams = new WordParameterList(bytes, offset);
        offset += wordParams.storageSize();

        wordInfos = new WordInfoList(bytes, offset, wordParams.size(), hasSynonymGid);
    }

    /**
     * Returns the word IDs obtained by common prefix search.
     *
     * 
     * The search begin with the position at the {@code offset} of the {@code text}.
     *
     * 
     * The return value is consist of the word ID and the length of the matched
     * part.
     * 
     * @param text
     *            the key
     * @param offset
     *            the offset of the key
     * @return the iterator of results
     */
    @Override
    public Iterator lookup(byte[] text, int offset) {
        Iterator iterator = trie.commonPrefixSearch(text, offset);
        if (!iterator.hasNext()) {
            return iterator;
        }
        return new Itr(iterator);
    }

    private class Itr implements Iterator {
        private final Iterator iterator;
        private Integer[] wordIds;
        private int length;
        private int index;

        Itr(Iterator iterator) {
            this.iterator = iterator;
            index = -1;
        }

        @Override
        public boolean hasNext() {
            if (index < 0) {
                return iterator.hasNext();
            } else {
                return (index < wordIds.length) || iterator.hasNext();
            }
        }

        @Override
        public int[] next() {
            if (index < 0 || index >= wordIds.length) {
                int[] p = iterator.next();
                wordIds = wordIdTable.get(p[0]);
                length = p[1];
                index = 0;
            }
            return new int[] { wordIds[index++], length };
        }
    }

    @Override
    public int getWordId(String headword, short posId, String readingForm) {
        for (int wid = 0; wid < wordInfos.size(); wid++) {
            WordInfo info = wordInfos.getWordInfo(wid);
            if (info.getSurface().equals(headword) && info.getPOSId() == posId
                    && info.getReadingForm().equals(readingForm)) {
                return wid;
            }
        }
        return -1;
    }

    @Override
    public short getLeftId(int wordId) {
        return wordParams.getLeftId(wordId);
    }

    @Override
    public short getRightId(int wordId) {
        return wordParams.getRightId(wordId);
    }

    @Override
    public short getCost(int wordId) {
        return wordParams.getCost(wordId);
    }

    @Override
    public WordInfo getWordInfo(int wordId) {
        return wordInfos.getWordInfo(wordId);
    }

    @Override
    public int getDictionaryId(int wordId) {
        return 0;
    }

    @Override
    public int size() {
        return wordParams.size();
    }

    public void calculateCost(Tokenizer tokenizer) {
        for (int wordId = 0; wordId < wordParams.size(); wordId++) {
            if (getCost(wordId) != Short.MIN_VALUE) {
                continue;
            }
            String surface = getWordInfo(wordId).getSurface();
            MorphemeList ms = (MorphemeList) tokenizer.tokenize(surface);
            int cost = ms.getInternalCost() + USER_DICT_COST_PAR_MORPH * ms.size();
            if (cost > Short.MAX_VALUE) {
                cost = Short.MAX_VALUE;
            } else if (cost < Short.MIN_VALUE) {
                cost = Short.MIN_VALUE;
            }
            wordParams.setCost(wordId, (short) cost);
        }
    }
}