// com.atilika.kuromoji.buffer.TokenInfoBuffer Maven / Gradle / Ivy
/*-*
* Copyright © 2010-2015 Atilika Inc. and contributors (see CONTRIBUTORS.md)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. A copy of the
* License is distributed with this work in the LICENSE.md file. You may
* also obtain a copy of the License from
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.atilika.kuromoji.buffer;
import com.atilika.kuromoji.io.ByteBufferIO;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
/**
 * Read-only accessor for a table of fixed-size token entries stored in a {@link ByteBuffer}.
 *
 * <p>Layout as read by this class: the buffer starts with a header of five 32-bit integers.
 * The third, fourth and fifth header integers hold, respectively, the number of token info
 * values (16-bit each), part-of-speech values (8-bit each) and feature references (32-bit
 * each) stored per entry. Entries follow the header back to back; inside an entry the token
 * infos come first, then the part-of-speech bytes, then the feature references.
 *
 * <p>All lookups use absolute buffer reads, so an entry index or field index that falls
 * outside the buffer surfaces as the {@link IndexOutOfBoundsException} thrown by
 * {@link ByteBuffer}.
 */
public class TokenInfoBuffer {

    private static final int INTEGER_BYTES = Integer.SIZE / Byte.SIZE;
    private static final int SHORT_BYTES = Short.SIZE / Byte.SIZE;

    // Byte offsets of the header integers this class reads, and the total header length.
    private static final int TOKEN_INFO_COUNT_OFFSET = INTEGER_BYTES * 2;
    private static final int POS_INFO_COUNT_OFFSET = INTEGER_BYTES * 3;
    private static final int FEATURE_COUNT_OFFSET = INTEGER_BYTES * 4;
    private static final int HEADER_BYTES = INTEGER_BYTES * 5;

    private final ByteBuffer buffer;
    private final int tokenInfoCount;
    private final int posInfoCount;
    private final int featureCount;
    private final int entrySize;

    /**
     * Loads the buffer from the given stream and caches the per-entry counts found in its header.
     *
     * @param is stream to read the serialized buffer from (via {@code ByteBufferIO.read})
     * @throws IOException if reading the stream fails
     */
    public TokenInfoBuffer(InputStream is) throws IOException {
        buffer = ByteBufferIO.read(is);
        tokenInfoCount = buffer.getInt(TOKEN_INFO_COUNT_OFFSET);
        posInfoCount = buffer.getInt(POS_INFO_COUNT_OFFSET);
        featureCount = buffer.getInt(FEATURE_COUNT_OFFSET);
        entrySize = tokenInfoCount * SHORT_BYTES + posInfoCount + featureCount * INTEGER_BYTES;
    }

    /**
     * Reads every field of one entry into a freshly allocated {@code BufferEntry}.
     *
     * <p>Part-of-speech values are copied as raw (signed) bytes here, unlike
     * {@link #lookupPartOfSpeechFeature(int, int)}, which returns them unsigned.
     *
     * @param offset zero-based index of the entry (not a byte offset)
     * @return a new entry holding the token infos, part-of-speech values and feature references
     */
    public BufferEntry lookupEntry(int offset) {
        BufferEntry entry = new BufferEntry();
        entry.tokenInfos = new short[tokenInfoCount];
        entry.posInfos = new byte[posInfoCount];
        entry.featureInfos = new int[featureCount];

        int tokenInfoStart = getPosition(offset);
        int posInfoStart = posInfoStart(tokenInfoStart);
        int featureStart = featureStart(tokenInfoStart);

        // Get left id, right id and word cost
        for (int i = 0; i < tokenInfoCount; i++) {
            entry.tokenInfos[i] = buffer.getShort(tokenInfoStart + i * SHORT_BYTES);
        }

        // Get part of speech tags values (not strings yet)
        for (int i = 0; i < posInfoCount; i++) {
            entry.posInfos[i] = buffer.get(posInfoStart + i);
        }

        // Get field value references (string references)
        for (int i = 0; i < featureCount; i++) {
            entry.featureInfos[i] = buffer.getInt(featureStart + i * INTEGER_BYTES);
        }

        return entry;
    }

    /**
     * Reads a single token info value of an entry.
     *
     * @param offset zero-based index of the entry
     * @param i zero-based index of the token info value within the entry
     * @return the stored 16-bit value, sign-extended to {@code int}
     */
    public int lookupTokenInfo(int offset, int i) {
        return buffer.getShort(getPosition(offset) + i * SHORT_BYTES);
    }

    /**
     * Reads a single part-of-speech value of an entry.
     *
     * @param offset zero-based index of the entry
     * @param i zero-based index of the part-of-speech value within the entry
     * @return the stored byte as an unsigned value in the range 0..255
     */
    public int lookupPartOfSpeechFeature(int offset, int i) {
        return 0xff & buffer.get(posInfoStart(getPosition(offset)) + i);
    }

    /**
     * Reads a single feature reference of an entry.
     *
     * <p>The index {@code i} is counted over the combined feature list in which the
     * part-of-speech values occupy the first {@code posInfoCount} indices; that count is
     * subtracted before addressing the feature reference section. Use
     * {@link #isPartOfSpeechFeature(int)} to decide which lookup applies to an index.
     *
     * @param offset zero-based index of the entry
     * @param i feature index, including the leading part-of-speech indices
     * @return the stored 32-bit feature reference
     */
    public int lookupFeature(int offset, int i) {
        return buffer.getInt(
            featureStart(getPosition(offset)) + (i - posInfoCount) * INTEGER_BYTES);
    }

    /**
     * Tells whether a combined feature index addresses a part-of-speech value.
     *
     * @param i feature index
     * @return {@code true} if {@code i} is below the per-entry part-of-speech count
     */
    public boolean isPartOfSpeechFeature(int i) {
        // Use the count cached by the constructor instead of re-reading the header.
        return i < posInfoCount;
    }

    /** Returns the byte position at which the entry with the given index starts. */
    private int getPosition(int offset) {
        return offset * entrySize + HEADER_BYTES;
    }

    /** Returns the byte position of the part-of-speech section of an entry starting at {@code position}. */
    private int posInfoStart(int position) {
        return position + tokenInfoCount * SHORT_BYTES;
    }

    /** Returns the byte position of the feature reference section of an entry starting at {@code position}. */
    private int featureStart(int position) {
        return position + tokenInfoCount * SHORT_BYTES + posInfoCount;
    }
}