com.atilika.kuromoji.compile.TokenInfoDictionaryCompilerBase Maven / Gradle / Ivy
/**
* Copyright © 2010-2015 Atilika Inc. and contributors (see CONTRIBUTORS.md)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. A copy of the
* License is distributed with this work in the LICENSE.md file. You may
* also obtain a copy of the License from
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.atilika.kuromoji.compile;
import com.atilika.kuromoji.buffer.BufferEntry;
import com.atilika.kuromoji.buffer.FeatureInfoMap;
import com.atilika.kuromoji.buffer.StringValueMapBuffer;
import com.atilika.kuromoji.buffer.WordIdMap;
import com.atilika.kuromoji.dict.DictionaryEntryBase;
import com.atilika.kuromoji.dict.GenericDictionaryEntry;
import com.atilika.kuromoji.dict.TokenInfoDictionary;
import java.io.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.TreeMap;
public abstract class TokenInfoDictionaryCompilerBase implements Compiler {
protected List bufferEntries = new ArrayList<>();
protected FeatureInfoMap posInfo = new FeatureInfoMap();
protected FeatureInfoMap otherInfo = new FeatureInfoMap();
protected WordIdMapCompiler wordIdsCompiler = new WordIdMapCompiler();
// optional list to collect the generic dictionary entries
protected List dictionaryEntries = null;
private String encoding;
private List surfaces = new ArrayList<>();
public TokenInfoDictionaryCompilerBase(String encoding) {
this.encoding = encoding;
}
public void analyzeTokenInfo(InputStream input) throws IOException {
BufferedReader reader = new BufferedReader(new InputStreamReader(input, encoding));
String line;
while ((line = reader.readLine()) != null) {
T entry = parse(line);
GenericDictionaryEntry dictionaryEntry = generateGenericDictionaryEntry(entry);
posInfo.mapFeatures(dictionaryEntry.getPosFeatures());
}
}
public void readTokenInfo(InputStream input) throws IOException {
BufferedReader reader = new BufferedReader(new InputStreamReader(input, encoding));
String line;
int entryCount = posInfo.getEntryCount();
while ((line = reader.readLine()) != null) {
T entry = parse(line);
GenericDictionaryEntry dictionaryEntry = generateGenericDictionaryEntry(entry);
short leftId = dictionaryEntry.getLeftId();
short rightId = dictionaryEntry.getRightId();
short wordCost = dictionaryEntry.getWordCost();
List allPosFeatures = dictionaryEntry.getPosFeatures();
List posFeatureIds = posInfo.mapFeatures(allPosFeatures);
List featureList = dictionaryEntry.getFeatures();
List otherFeatureIds = otherInfo.mapFeatures(featureList);
BufferEntry bufferEntry = new BufferEntry();
bufferEntry.tokenInfo.add(leftId);
bufferEntry.tokenInfo.add(rightId);
bufferEntry.tokenInfo.add(wordCost);
if (entriesFitInAByte(entryCount)) {
List posFeatureIdBytes = createPosFeatureIds(posFeatureIds);
bufferEntry.posInfo.addAll(posFeatureIdBytes);
} else {
for (Integer posFeatureId : posFeatureIds) {
bufferEntry.tokenInfo.add(posFeatureId.shortValue());
}
}
bufferEntry.features.addAll(otherFeatureIds);
bufferEntries.add(bufferEntry);
surfaces.add(dictionaryEntry.getSurface());
if (dictionaryEntries != null) {
dictionaryEntries.add(dictionaryEntry);
}
}
}
protected abstract GenericDictionaryEntry generateGenericDictionaryEntry(T entry);
protected abstract T parse(String line);
@Override
public void compile() throws IOException {
// TODO: Should call this method instead of write()
}
private boolean entriesFitInAByte(int entryCount) {
return entryCount <= 0xff;
}
private List createPosFeatureIds(List posFeatureIds) {
List posFeatureIdBytes = new ArrayList<>();
for (Integer posFeatureId : posFeatureIds) {
posFeatureIdBytes.add(posFeatureId.byteValue());
}
return posFeatureIdBytes;
}
public InputStream combinedSequentialFileInputStream(File dir) throws FileNotFoundException {
List fileInputStreams = new ArrayList<>();
List files = getCsvFiles(dir);
for (File file : files) {
fileInputStreams.add(new FileInputStream(file));
}
return new SequenceInputStream(Collections.enumeration(fileInputStreams));
}
public List getCsvFiles(File dir) {
FilenameFilter filter = new FilenameFilter() {
@Override
public boolean accept(File dir, String name) {
return name.endsWith(".csv");
}
};
ArrayList files = new ArrayList<>();
Collections.addAll(files, dir.listFiles(filter));
Collections.sort(files);
return files;
}
public void addMapping(int sourceId, int wordId) {
wordIdsCompiler.addMapping(sourceId, wordId);
}
public List getSurfaces() {
return surfaces;
}
public void write(String directoryName) throws IOException {
writeDictionary(directoryName + File.separator + TokenInfoDictionary.TOKEN_INFO_DICTIONARY_FILENAME);
writeMap(directoryName + File.separator + TokenInfoDictionary.POS_MAP_FILENAME, posInfo);
writeMap(directoryName + File.separator + TokenInfoDictionary.FEATURE_MAP_FILENAME, otherInfo);
writeWordIds(directoryName + File.separator + TokenInfoDictionary.TARGETMAP_FILENAME);
}
protected void writeMap(String filename, FeatureInfoMap map) throws IOException {
TreeMap features = map.invert();
StringValueMapBuffer mapBuffer = new StringValueMapBuffer(features);
FileOutputStream fos = new FileOutputStream(filename);
mapBuffer.write(fos);
}
protected void writeDictionary(String filename) throws IOException {
TokenInfoBufferCompiler tokenInfoBufferCompiler = new TokenInfoBufferCompiler(
new FileOutputStream(filename),
bufferEntries
);
tokenInfoBufferCompiler.compile();
}
protected void writeWordIds(String filename) throws IOException {
wordIdsCompiler.write(new FileOutputStream(filename));
}
@Deprecated
public WordIdMap getWordIdMap() throws IOException {
File file = File.createTempFile("kuromoji-wordid-", ".bin");
file.deleteOnExit();
OutputStream output = new BufferedOutputStream(new FileOutputStream(file));
wordIdsCompiler.write(output);
output.close();
InputStream input = new BufferedInputStream(new FileInputStream(file));
WordIdMap wordIds = new WordIdMap(input);
input.close();
return wordIds;
}
@Deprecated
public List getBufferEntries() {
return bufferEntries;
}
@Deprecated
public List getDictionaryEntries() {
return dictionaryEntries;
}
@Deprecated
public void setDictionaryEntries(List dictionaryEntries) {
this.dictionaryEntries = dictionaryEntries;
}
}