// Source: com.carrotsearch.labs.langid.Model (Maven / Gradle / Ivy)
package com.carrotsearch.labs.langid;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInput;
import java.io.ObjectInputStream;
import java.io.ObjectOutput;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.Set;
/**
 * Data model for {@link LangIdV3}.
 *
 * <p>A model bundles the language labels, two probability tables and the byte n-gram
 * state machine with its per-state outputs. The arrays passed to the constructor are
 * stored by reference, not copied.
 *
 * @see #defaultModel()
 */
public final class Model {
  /** The default model, initialized lazily (once) by {@link #defaultModel()}. */
  private static Model defaultModel;

  /**
   * Language classes.
   */
  String[] langClasses;

  /**
   * Flattened matrix of per-language feature probabilities.
   * <pre>
   * [langIndex][featureIndex]
   * where
   * index = {@link #numFeatures} * langIndex + featureIndex
   * </pre>
   * This is the layout {@link #detectOnly(Set)} relies on when it copies the rows of
   * the retained languages.
   */
  float[] nb_ptc;

  /**
   * Per-language values, one entry for each element of {@link #langClasses}
   * (conditional init per-language probabilities (?)).
   */
  float[] nb_pc;

  /**
   * State machine for walking byte n-grams.
   */
  short[] dsa;

  /**
   * An output (may be null) associated with each state.
   */
  int[][] dsaOutput;

  /** Number of classes (languages). */
  int numClasses;

  /** Number of features (total). */
  int numFeatures;

  /**
   * Create a new model. The arrays are stored as given (no defensive copies).
   *
   * @param langClasses language labels; determines {@link #numClasses}
   * @param ptc flattened per-language feature probabilities; {@link #numFeatures} is
   *        derived as {@code ptc.length / langClasses.length}
   * @param pc per-language values, expected to have one entry per language
   * @param dsa state machine for walking byte n-grams
   * @param dsaOutput output (may be null) associated with each state
   */
  Model(String[] langClasses, float[] ptc, float[] pc, short[] dsa, int[][] dsaOutput) {
    this.langClasses = langClasses;
    this.nb_ptc = ptc;
    this.nb_pc = pc;
    this.dsa = dsa;
    this.dsaOutput = dsaOutput;

    assert nb_pc.length == langClasses.length;

    this.numClasses = langClasses.length;
    this.numFeatures = nb_ptc.length / numClasses;
  }

  /**
   * Read a model from an external data stream.
   *
   * <p>The stream must contain, in this order, the objects written by
   * {@link #writeExternal(ObjectOutput)}: language classes, {@code nb_ptc},
   * {@code nb_pc}, {@code dsa} and {@code dsaOutput}.
   *
   * <p>NOTE(review): this deserializes arbitrary objects via
   * {@link ObjectInput#readObject()}; only feed it data from a trusted source.
   *
   * @param in the stream to read from
   * @return the model reconstructed from the stream
   * @throws IOException if reading from the stream fails
   * @throws ClassNotFoundException if a serialized class cannot be resolved
   */
  public static Model readExternal(ObjectInput in) throws IOException, ClassNotFoundException {
    String[] langClasses = (String[]) in.readObject();
    float[] nb_ptc = (float[]) in.readObject();
    float[] nb_pc = (float[]) in.readObject();
    short[] dsa = (short[]) in.readObject();
    int[][] dsaOutput = (int[][]) in.readObject();

    return new Model(langClasses, nb_ptc, nb_pc, dsa, dsaOutput);
  }

  /**
   * Write this model to an external data stream in the order expected by
   * {@link #readExternal(ObjectInput)}.
   *
   * @param out the stream to write to
   * @throws IOException if writing to the stream fails
   */
  void writeExternal(ObjectOutput out) throws IOException {
    out.writeObject(langClasses);
    out.writeObject(nb_ptc);
    out.writeObject(nb_pc);
    out.writeObject(dsa);
    out.writeObject(dsaOutput);
  }

  /**
   * Return a copy of the {@linkplain #defaultModel() default model} trimmed to detect
   * only a subset of languages.
   *
   * <p>Codes not present in the default model are ignored. The retained languages keep
   * the order they have in the default model. The state machine and its outputs are
   * shared with the default model (not copied); only the probability tables are rebuilt.
   *
   * @param langCodes language codes to retain
   * @return a new model restricted to the requested languages
   * @throws IllegalArgumentException if fewer than two of the requested codes are
   *         known to the default model
   */
  public static Model detectOnly(Set<String> langCodes) {
    final Model source = defaultModel();

    Set<String> newClasses = new LinkedHashSet<String>(Arrays.asList(source.langClasses));
    newClasses.retainAll(langCodes);
    if (newClasses.size() < 2) {
      throw new IllegalArgumentException("A model must contain at least two languages.");
    }

    // Limit the set of supported languages (fewer languages = tighter loops and faster execution).
    final int numFeatures = source.numFeatures;
    String[] trimmed_nb_classes = newClasses.toArray(new String[newClasses.size()]);
    float[] trimmed_nb_pc = new float[newClasses.size()];
    float[] trimmed_nb_ptc = new float[newClasses.size() * numFeatures];

    for (int i = 0, j = 0; i < source.numClasses; i++) {
      if (newClasses.contains(source.langClasses[i])) {
        trimmed_nb_pc[j] = source.nb_pc[i];
        // Each language owns a contiguous run of numFeatures values; copy the whole row.
        System.arraycopy(source.nb_ptc, numFeatures * i, trimmed_nb_ptc, numFeatures * j, numFeatures);
        j++;
      }
    }

    return new Model(
        trimmed_nb_classes,
        trimmed_nb_ptc,
        trimmed_nb_pc,
        source.dsa,
        source.dsaOutput);
  }

  /**
   * Return a set of detected languages.
   *
   * @return an unmodifiable set of this model's language classes, in model order
   */
  public Set<String> getDetectedLanguages() {
    return Collections.unmodifiableSet(new LinkedHashSet<String>(Arrays.asList(langClasses)));
  }

  /**
   * Return the default model with a full set of detected languages.
   *
   * <p>The model is loaded from the {@code langid.lzma} class path resource on first
   * use and cached; later calls return the same instance. The method is synchronized,
   * so concurrent first calls load the model only once.
   *
   * @return the shared default model
   * @throws RuntimeException if the resource is missing or cannot be decoded
   */
  public static synchronized Model defaultModel() {
    if (defaultModel != null) {
      return defaultModel;
    }

    DataInputStream is = null;
    try {
      InputStream resource = Model.class.getResourceAsStream("langid.lzma");
      if (resource == null) {
        throw new IOException("Model resource not found: langid.lzma");
      }

      ByteArrayOutputStream os = new ByteArrayOutputStream();
      is = new DataInputStream(new BufferedInputStream(resource));

      // The stream starts with 5 bytes of decoder properties...
      byte[] streamProperties = new byte[5];
      is.readFully(streamProperties);

      LzmaDecoder decoder = new LzmaDecoder();
      if (!decoder.SetDecoderProperties(streamProperties)) {
        throw new IOException("Incorrect stream properties.");
      }

      // ...followed by the 8-byte stream size, least significant byte first.
      byte[] streamSize = new byte[8];
      is.readFully(streamSize);
      long streamSizeLong = 0;
      for (int i = 8; --i >= 0;) {
        streamSizeLong <<= 8;
        streamSizeLong |= streamSize[i] & 0xFF;
      }

      if (!decoder.Code(is, os, streamSizeLong)) {
        throw new IOException("Error in data stream");
      }

      // Cache the loaded model so that subsequent calls do not decode it again.
      defaultModel = Model.readExternal(
          new ObjectInputStream(
              new ByteArrayInputStream(
                  os.toByteArray())));
      return defaultModel;
    } catch (Exception e) {
      throw new RuntimeException("Default model not available.", e);
    } finally {
      if (is != null) {
        try {
          is.close();
        } catch (IOException e) {
          // Ignore, nothing to do.
        }
      }
    }
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy