All downloads are free. Search and download functionality uses the official Maven repository.

edu.stanford.nlp.process.DistSimClassifier Maven / Gradle / Ivy

Go to download

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

There is a newer version: 3.9.2
Show newest version
package edu.stanford.nlp.process;

import java.io.Serializable;
import java.util.Map;

import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Timing;


/**
 * Maps a String to its distributional similarity class.
 *
 * <p>The word-to-class lexicon is read once, in the constructor, from a text
 * file. Two line formats are understood:
 * <ul>
 *   <li>{@code "terryKoo"}: {@code class<TAB>word}; the class string may
 *       optionally be truncated to a maximum number of characters.</li>
 *   <li>anything else (conventionally {@code "alexClark"}):
 *       {@code word<whitespace>class}.</li>
 * </ul>
 *
 * <p>Words are normalized in the same way when the lexicon is loaded and when
 * it is queried: they are lowercased unless the classifier is {@code cased},
 * and, if {@code numberEquivalence} is set, passed through
 * {@link WordShapeClassifier#wordShape(String, int)} with
 * {@link WordShapeClassifier#WORDSHAPEDIGITS}.
 *
 * @author Christopher Manning
 */
public class DistSimClassifier implements Serializable {

  private static final long serialVersionUID = 3L;

  /** Normalized word -> distributional similarity class. */
  private final Map<String, String> lexicon;
  /** If false, words are lowercased before being stored or looked up. */
  private final boolean cased;
  /** If true, words are passed through the WORDSHAPEDIGITS word shape before being stored or looked up. */
  private final boolean numberEquivalence;
  /** Class returned by {@link #distSimClass(String)} for words not in the lexicon. */
  private final String unknownWordClass;


  /**
   * Loads an "alexClark" format, utf-8 encoded lexicon, using {@code "NULL"}
   * as the class for unknown words.
   *
   * @param filename Location of the lexicon, as understood by {@code ObjectBank.getLineIterator}
   * @param cased Whether words are matched case-sensitively
   * @param numberEquivalence Whether words are normalized with the WORDSHAPEDIGITS word shape
   */
  public DistSimClassifier(String filename, boolean cased, boolean numberEquivalence) {
    this(filename, "alexClark", "utf-8", -1, cased, numberEquivalence, "NULL");
  }

  /**
   * Loads an "alexClark" format, utf-8 encoded lexicon.
   *
   * @param filename Location of the lexicon, as understood by {@code ObjectBank.getLineIterator}
   * @param cased Whether words are matched case-sensitively
   * @param numberEquivalence Whether words are normalized with the WORDSHAPEDIGITS word shape
   * @param unknownWordClass Class to return for words not in the lexicon
   */
  public DistSimClassifier(String filename, boolean cased,
                           boolean numberEquivalence, String unknownWordClass) {
    this(filename, "alexClark", "utf-8", -1, cased, numberEquivalence, unknownWordClass);
  }

  /**
   * Loads a distributional similarity lexicon from a file.
   *
   * <p>If the same (normalized) word occurs on several lines, the class from
   * the last such line is the one that is kept.
   *
   * @param filename Location of the lexicon, as understood by {@code ObjectBank.getLineIterator}
   * @param format {@code "terryKoo"} for {@code class<TAB>word} lines; any other
   *               value is treated as "alexClark", i.e. {@code word<whitespace>class} lines
   * @param encoding Character encoding of the file
   * @param distSimMaxBits For the "terryKoo" format only: if positive, class strings
   *                       longer than this are truncated to this many characters
   * @param cased Whether words are matched case-sensitively
   * @param numberEquivalence Whether words are normalized with the WORDSHAPEDIGITS word shape
   * @param unknownWordClass Class to return for words not in the lexicon
   * @throws ArrayIndexOutOfBoundsException if a line has fewer than two fields
   *         (this includes empty lines)
   */
  public DistSimClassifier(String filename, String format, String encoding,
                           int distSimMaxBits,
                           boolean cased, boolean numberEquivalence,
                           String unknownWordClass) {
    // These flags must be set before normalize() is called in the loop below.
    this.cased = cased;
    this.numberEquivalence = numberEquivalence;
    this.unknownWordClass = unknownWordClass;
    Timing.startDoing("Loading distsim lexicon from " + filename);
    lexicon = Generics.newHashMap(1 << 15);  // make a reasonable starting size
    boolean terryKoo = "terryKoo".equals(format);
    for (String line : ObjectBank.getLineIterator(filename, encoding)) {
      String word;
      String wordClass;
      if (terryKoo) {
        // class<TAB>word
        String[] bits = line.split("\\t");
        word = bits[1];
        wordClass = bits[0];
        if (distSimMaxBits > 0 && wordClass.length() > distSimMaxBits) {
          wordClass = wordClass.substring(0, distSimMaxBits);
        }
      } else {
        // "alexClark": word<whitespace>class
        String[] bits = line.split("\\s+");
        word = bits[0];
        wordClass = bits[1];
      }
      lexicon.put(normalize(word), wordClass);
    }
    Timing.endDoing();
  }


  /**
   * Returns the distributional similarity class of a word.
   *
   * @param word The word to look up; it is normalized in the same way as the
   *             lexicon entries were when loaded
   * @return The class stored for the normalized word, or the unknown word
   *         class given at construction time if the word is not in the lexicon
   */
  public String distSimClass(String word) {
    String distSim = lexicon.get(normalize(word));
    if (distSim == null) {
      distSim = unknownWordClass;
    }
    return distSim;
  }


  /**
   * Applies the configured case folding and number equivalence to a word.
   * Shared by loading and lookup so that stored keys and query keys always agree.
   */
  private String normalize(String word) {
    if ( ! cased) {
      word = word.toLowerCase();
    }
    if (numberEquivalence) {
      word = WordShapeClassifier.wordShape(word, WordShapeClassifier.WORDSHAPEDIGITS);
    }
    return word;
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy