All downloads are free. Search and download functionality uses the official Maven repository.

com.atilika.kuromoji.buffer.TokenInfoBuffer Maven / Gradle / Ivy

/**
 * Copyright © 2010-2015 Atilika Inc. and contributors (see CONTRIBUTORS.md)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.  A copy of the
 * License is distributed with this work in the LICENSE.md file.  You may
 * also obtain a copy of the License from
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.atilika.kuromoji.buffer;

import com.atilika.kuromoji.io.ByteBufferIO;

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;

/**
 * Read-only view over a binary buffer of fixed-size token entries.
 *
 * <p>The buffer starts with a header of five 32-bit integers. The integers at
 * index 2, 3 and 4 hold, respectively, the number of token info values
 * (shorts), part-of-speech values (bytes) and feature values (ints) stored per
 * entry. The first two header integers are not read by this class.
 *
 * <p>Entries follow the header back to back. Each entry is laid out as
 * {@code tokenInfoCount} shorts, then {@code posInfoCount} bytes, then
 * {@code featureCount} ints.
 */
public class TokenInfoBuffer {

    private static final int INTEGER_BYTES = Integer.SIZE / Byte.SIZE;
    private static final int SHORT_BYTES = Short.SIZE / Byte.SIZE;

    // Byte offsets of the header fields this class reads.
    private static final int TOKEN_INFO_COUNT_OFFSET = INTEGER_BYTES * 2;
    private static final int POS_INFO_COUNT_OFFSET = INTEGER_BYTES * 3;
    private static final int FEATURE_COUNT_OFFSET = INTEGER_BYTES * 4;

    // Total header size; the first entry starts right after it.
    private static final int HEADER_BYTES = INTEGER_BYTES * 5;

    private final ByteBuffer buffer;

    private final int tokenInfoCount;
    private final int posInfoCount;
    private final int featureCount;

    private final int entrySize;

    /**
     * Loads the buffer from the given stream and caches the per-entry counts
     * found in its header.
     *
     * @param is stream to read the buffer from (read via {@code ByteBufferIO.read})
     * @throws IOException if reading the stream fails
     */
    public TokenInfoBuffer(InputStream is) throws IOException {
        buffer = ByteBufferIO.read(is);
        tokenInfoCount = buffer.getInt(TOKEN_INFO_COUNT_OFFSET);
        posInfoCount = buffer.getInt(POS_INFO_COUNT_OFFSET);
        featureCount = buffer.getInt(FEATURE_COUNT_OFFSET);
        entrySize = tokenInfoCount * SHORT_BYTES + posInfoCount + featureCount * INTEGER_BYTES;
    }

    /**
     * Reads the complete entry at the given entry index.
     *
     * @param offset zero-based index of the entry (not a byte offset)
     * @return a new {@code BufferEntry} populated with the entry's token infos,
     *         part-of-speech values (as raw signed bytes) and feature values
     * @throws IndexOutOfBoundsException if the entry lies outside the buffer
     */
    public BufferEntry lookupEntry(int offset) {
        BufferEntry entry = new BufferEntry();

        entry.tokenInfos = new short[tokenInfoCount];
        entry.posInfos = new byte[posInfoCount];
        entry.featureInfos = new int[featureCount];

        // Start of each of the three sections within the entry.
        int tokenInfoStart = getPosition(offset);
        int posInfoStart = tokenInfoStart + tokenInfoCount * SHORT_BYTES;
        int featureStart = posInfoStart + posInfoCount;

        // Get left id, right id and word cost
        for (int i = 0; i < tokenInfoCount; i++) {
            entry.tokenInfos[i] = buffer.getShort(tokenInfoStart + i * SHORT_BYTES);
        }

        // Get part of speech tags values (not strings yet)
        for (int i = 0; i < posInfoCount; i++) {
            entry.posInfos[i] = buffer.get(posInfoStart + i);
        }

        // Get field value references (string references)
        for (int i = 0; i < featureCount; i++) {
            entry.featureInfos[i] = buffer.getInt(featureStart + i * INTEGER_BYTES);
        }

        return entry;
    }

    /**
     * Reads a single token info value of an entry.
     *
     * @param offset zero-based index of the entry
     * @param i      index of the token info value within the entry
     * @return the stored short value, widened to {@code int}
     * @throws IndexOutOfBoundsException if the computed position lies outside the buffer
     */
    public int lookupTokenInfo(int offset, int i) {
        return buffer.getShort(getPosition(offset) + i * SHORT_BYTES);
    }

    /**
     * Reads a single part-of-speech value of an entry.
     *
     * @param offset zero-based index of the entry
     * @param i      index of the part-of-speech value within the entry
     * @return the stored byte interpreted as an unsigned value (0-255)
     * @throws IndexOutOfBoundsException if the computed position lies outside the buffer
     */
    public int lookupPartOfSpeechFeature(int offset, int i) {
        int posInfoStart = getPosition(offset) + tokenInfoCount * SHORT_BYTES;

        // Mask to undo sign extension so the byte reads as 0-255.
        return 0xff & buffer.get(posInfoStart + i);
    }

    /**
     * Reads a single (non part-of-speech) feature value of an entry.
     *
     * <p>The index {@code i} is a global feature index in which the
     * part-of-speech values come first; {@code posInfoCount} is subtracted to
     * obtain the index into the entry's int-valued feature section. Use
     * {@link #isPartOfSpeechFeature(int)} to decide which lookup applies.
     *
     * @param offset zero-based index of the entry
     * @param i      global feature index, expected to be {@code >= posInfoCount}
     * @return the stored int value
     * @throws IndexOutOfBoundsException if the computed position lies outside the buffer
     */
    public int lookupFeature(int offset, int i) {
        int featureStart = getPosition(offset) + tokenInfoCount * SHORT_BYTES + posInfoCount;

        return buffer.getInt(featureStart + (i - posInfoCount) * INTEGER_BYTES);
    }

    /**
     * Tells whether a global feature index refers to a part-of-speech value.
     *
     * @param i global feature index
     * @return {@code true} if {@code i} is smaller than the number of
     *         part-of-speech values per entry
     */
    public boolean isPartOfSpeechFeature(int i) {
        return i < posInfoCount;
    }

    /**
     * Computes the byte position in the buffer at which the given entry starts.
     *
     * @param offset zero-based index of the entry
     * @return byte position of the entry's first token info value
     */
    private int getPosition(int offset) {
        return offset * entrySize + HEADER_BYTES;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy