All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.carrotsearch.labs.langid.Model Maven / Gradle / Ivy

package com.carrotsearch.labs.langid;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectInputStream;
import java.io.ObjectOutput;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.Set;



/**
 * Data model for {@link LangIdV3}.
 * 
 * @see #defaultModel()
 */
public final class Model {
  /** The default model, initialized lazily (once). */
  private static Model defaultModel;

  /**
   * Language classes.
   */
  String[] langClasses;

  /**
   * Flattened matrix of per-language feature probabilities.
   * 
   * [featureIndex][langIndex]
   * where 
   * index = {@link #numClasses} * langIndex + featureIndex
   * 
*/ float[] nb_ptc; /** * Conditional init per-language probabilities (?). */ float[] nb_pc; /** * State machine for walking byte n-grams. */ short[] dsa; /** * An output (may be null) associated with each state. */ int[][] dsaOutput; /** Number of classes (languages). */ int numClasses; /** Number of features (total). */ int numFeatures; /** * Create a new model. */ Model(String [] langClasses, float [] ptc, float [] pc, short [] dsa, int[][] dsaOutput) { this.langClasses = langClasses; this.nb_ptc = ptc; this.nb_pc = pc; this.dsa = dsa; this.dsaOutput = dsaOutput; assert nb_pc.length == langClasses.length; this.numClasses = langClasses.length; this.numFeatures = nb_ptc.length / numClasses; } /** * Read a model from an external data stream. */ public static Model readExternal(ObjectInput in) throws IOException, ClassNotFoundException { String [] langClasses = (String[]) in.readObject(); float[] nb_ptc = (float[]) in.readObject(); float[] nb_pc = (float[]) in.readObject(); short[] dsa = (short[]) in.readObject(); int[][] dsaOutput = (int[][]) in.readObject(); return new Model(langClasses, nb_ptc, nb_pc, dsa, dsaOutput); } void writeExternal(ObjectOutput out) throws IOException { out.writeObject(langClasses); out.writeObject(nb_ptc); out.writeObject(nb_pc); out.writeObject(dsa); out.writeObject(dsaOutput); } /** * Return a copy of this model trimmed to detect only a subset of languages. */ public static Model detectOnly(Set langCodes) { final Model source = defaultModel(); Set newClasses = new LinkedHashSet(Arrays.asList(source.langClasses)); newClasses.retainAll(langCodes); if (newClasses.size() < 2) { throw new IllegalArgumentException("A model must contain at least two languages."); } // Limit the set of supported languages (fewer languages = tighter loops and faster execution). String [] trimmed_nb_classes = newClasses.toArray(new String[newClasses.size()]); float[] trimmed_nb_pc = new float [newClasses.size()]; float[] trimmed_nb_ptc = new float [newClasses.size() * source.numFeatures]; for (int i = 0, j = 0; i < source.numClasses; i++) { if (newClasses.contains(source.langClasses[i])) { trimmed_nb_pc[j] = source.nb_pc[i]; for (int f = 0; f < source.numFeatures; f++) { int iFrom = source.numFeatures * i + f; int iTo = source.numFeatures * j + f; trimmed_nb_ptc[iTo] = source.nb_ptc[iFrom]; } j++; } } return new Model( trimmed_nb_classes, trimmed_nb_ptc, trimmed_nb_pc, source.dsa, source.dsaOutput); } /** * Return a set of detected languages. */ public Set getDetectedLanguages() { return Collections.unmodifiableSet(new LinkedHashSet(Arrays.asList(langClasses))); } /** * Return the default model with a full set of detected languages. */ public static synchronized Model defaultModel() { if (defaultModel != null) { return defaultModel; } DataInputStream is = null; try { ByteArrayOutputStream os = new ByteArrayOutputStream(); is = new DataInputStream( new BufferedInputStream( Model.class.getResourceAsStream("langid.lzma"))); byte[] streamProperties = new byte[5]; is.readFully(streamProperties); LzmaDecoder decoder = new LzmaDecoder(); if (!decoder.SetDecoderProperties(streamProperties)) throw new IOException("Incorrect stream properties."); byte [] streamSize = new byte [8]; is.readFully(streamSize); long streamSizeLong = 0; for (int i = 8; --i >= 0;) { streamSizeLong <<= 8; streamSizeLong |= streamSize[i] & 0xFF; } if (!decoder.Code(is, os, streamSizeLong)) { throw new IOException("Error in data stream"); } os.flush(); return Model.readExternal( new ObjectInputStream( new ByteArrayInputStream( os.toByteArray()))); } catch (Exception e) { throw new RuntimeException("Default model not available.", e); } finally { if (is != null) { try { is.close(); } catch (IOException e) { // Ignore, nothing to do. } } } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy