com.atilika.kuromoji.dict.TokenInfoDictionary Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of deeplearning4j-nlp-japanese Show documentation
The newest version!
/*-*
 * Copyright © 2010-2015 Atilika Inc. and contributors (see CONTRIBUTORS.md)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.  A copy of the
 * License is distributed with this work in the LICENSE.md file.  You may
 * also obtain a copy of the License from
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.atilika.kuromoji.dict;

import com.atilika.kuromoji.buffer.BufferEntry;
import com.atilika.kuromoji.buffer.StringValueMapBuffer;
import com.atilika.kuromoji.buffer.TokenInfoBuffer;
import com.atilika.kuromoji.buffer.WordIdMap;
import com.atilika.kuromoji.util.DictionaryEntryLineParser;
import com.atilika.kuromoji.util.KuromojiBinFilesFetcher;
import com.atilika.kuromoji.util.ResourceResolver;
import com.atilika.kuromoji.util.StringUtils;

import java.io.File;
import java.io.IOException;

public class TokenInfoDictionary implements Dictionary {

    public static final String TOKEN_INFO_DICTIONARY_FILENAME = new File(KuromojiBinFilesFetcher.getKuromojiRoot(), "tokenInfoDictionary.bin").getAbsolutePath();
    public static final String FEATURE_MAP_FILENAME = new File(KuromojiBinFilesFetcher.getKuromojiRoot(),"tokenInfoFeaturesMap.bin").getAbsolutePath();
    public static final String POS_MAP_FILENAME = new File(KuromojiBinFilesFetcher.getKuromojiRoot(),"tokenInfoPartOfSpeechMap.bin").getAbsolutePath();
    public static final String TARGETMAP_FILENAME = new File(KuromojiBinFilesFetcher.getKuromojiRoot(),"tokenInfoTargetMap.bin").getAbsolutePath();

    private static final int LEFT_ID = 0;
    private static final int RIGHT_ID = 1;
    private static final int WORD_COST = 2;
    private static final int TOKEN_INFO_OFFSET = 3;

    private static final String FEATURE_SEPARATOR = ",";

    protected TokenInfoBuffer tokenInfoBuffer;
    protected StringValueMapBuffer posValues;
    protected StringValueMapBuffer stringValues;
    protected WordIdMap wordIdMap;

    public int[] lookupWordIds(int sourceId) {
        return wordIdMap.lookUp(sourceId);
    }

    @Override
    public int getLeftId(int wordId) {
        return tokenInfoBuffer.lookupTokenInfo(wordId, LEFT_ID);
    }

    @Override
    public int getRightId(int wordId) {
        return tokenInfoBuffer.lookupTokenInfo(wordId, RIGHT_ID);
    }

    @Override
    public int getWordCost(int wordId) {
        return tokenInfoBuffer.lookupTokenInfo(wordId, WORD_COST);
    }

    @Override
    public String[] getAllFeaturesArray(int wordId) {
        BufferEntry bufferEntry = tokenInfoBuffer.lookupEntry(wordId);

        int posLength = bufferEntry.posInfos.length;
        int featureLength = bufferEntry.featureInfos.length;

        boolean partOfSpeechAsShorts = false;

        if (posLength == 0) {
            posLength = bufferEntry.tokenInfos.length - TOKEN_INFO_OFFSET;
            partOfSpeechAsShorts = true;
        }

        String[] result = new String[posLength + featureLength];

        if (partOfSpeechAsShorts) {
            for (int i = 0; i < posLength; i++) {
                int feature = bufferEntry.tokenInfos[i + TOKEN_INFO_OFFSET];
                result[i] = posValues.get(feature);
            }
        } else {
            for (int i = 0; i < posLength; i++) {
                int feature = bufferEntry.posInfos[i] & 0xff;
                result[i] = posValues.get(feature);
            }
        }

        for (int i = 0; i < featureLength; i++) {
            int feature = bufferEntry.featureInfos[i];
            String s = stringValues.get(feature);
            result[i + posLength] = s;
        }

        return result;
    }

    @Override
    public String getAllFeatures(int wordId) {
        String[] features = getAllFeaturesArray(wordId);

        for (int i = 0; i < features.length; i++) {
            String feature = features[i];
            features[i] = DictionaryEntryLineParser.escape(feature);
        }

        return StringUtils.join(features, FEATURE_SEPARATOR);
    }

    @Override
    public String getFeature(int wordId, int... fields) {
        if (fields.length == 1) {
            return extractSingleFeature(wordId, fields[0]);
        }

        return extractMultipleFeatures(wordId, fields);
    }

    private String extractSingleFeature(int wordId, int field) {
        int featureId;

        if (tokenInfoBuffer.isPartOfSpeechFeature(field)) {
            featureId = tokenInfoBuffer.lookupPartOfSpeechFeature(wordId, field);
            return posValues.get(featureId);
        }

        featureId = tokenInfoBuffer.lookupFeature(wordId, field);
        return stringValues.get(featureId);
    }

    private String extractMultipleFeatures(int wordId, int[] fields) {
        if (fields.length == 0) {
            return getAllFeatures(wordId);
        }

        if (fields.length == 1) {
            return extractSingleFeature(wordId, fields[0]);
        }

        String[] allFeatures = getAllFeaturesArray(wordId);
        String[] features = new String[fields.length];

        for (int i = 0; i < fields.length; i++) {
            int featureNumber = fields[i];
            features[i] = DictionaryEntryLineParser.escape(allFeatures[featureNumber]);
        }
        return StringUtils.join(features, FEATURE_SEPARATOR);
    }

    public static TokenInfoDictionary newInstance(ResourceResolver resolver) throws IOException {
        TokenInfoDictionary dictionary = new TokenInfoDictionary();
        dictionary.setup(resolver);
        return dictionary;
    }

    private void setup(ResourceResolver resolver) throws IOException {
        tokenInfoBuffer = new TokenInfoBuffer(resolver.resolve(TOKEN_INFO_DICTIONARY_FILENAME));
        stringValues = new StringValueMapBuffer(resolver.resolve(FEATURE_MAP_FILENAME));
        posValues = new StringValueMapBuffer(resolver.resolve(POS_MAP_FILENAME));
        wordIdMap = new WordIdMap(resolver.resolve(TARGETMAP_FILENAME));
    }
}