All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.stanford.nlp.tagger.maxent.TaggerConfig Maven / Gradle / Ivy

Go to download

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.tagger.maxent; 
import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.util.StringUtils;

import java.io.*;
import java.util.Map;
import java.util.Properties;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.util.Generics;

/**
 * Reads and stores configuration information for a POS tagger.
 *
 * Implementation note: To add a new parameter: (1) define a default
 * String value, (2) add it to defaultValues map, (3) add line to constructor,
 * (4) add getter method, (5) add to dump() method, (6) add to printGenProps()
 * method, (7) add to class javadoc of MaxentTagger.
 *
 *  @author William Morgan
 *  @author Anna Rafferty
 *  @author Michel Galley
 */
public class TaggerConfig extends Properties /* Inherits implementation of Serializable! */  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(TaggerConfig.class);

  private static final long serialVersionUID = -4136407850147157497L;

  public enum Mode {
    TRAIN, TEST, TAG, DUMP
  }

  /* defaults. sentenceDelimiter might be null; the others all have non-null values. */
  public static final String
  SEARCH = "qn",
  TAG_SEPARATOR = "/",
  TOKENIZE = "true",
  DEBUG = "false",
  ITERATIONS = "100",
  ARCH = "",
  WORD_FUNCTION = "",
  RARE_WORD_THRESH = "5",
  MIN_FEATURE_THRESH = "5",
  CUR_WORD_MIN_FEATURE_THRESH = "2",
  RARE_WORD_MIN_FEATURE_THRESH = "10",
  VERY_COMMON_WORD_THRESH = "250",
  OCCURRING_TAGS_ONLY = "false",
  POSSIBLE_TAGS_ONLY = "false",
  SIGMA_SQUARED = String.valueOf(0.5),
  ENCODING = "UTF-8",
  LEARN_CLOSED_CLASS = "false",
  CLOSED_CLASS_THRESHOLD = "40",
  VERBOSE = "false",
  VERBOSE_RESULTS = "true",
  SGML = "false",
  LANG = "",
  TOKENIZER_FACTORY = "",
  XML_INPUT = "",
  TAG_INSIDE = "",
  APPROXIMATE = "-1.0",
  TOKENIZER_OPTIONS = "",
  DEFAULT_REG_L1 = "1.0",
  OUTPUT_FILE = "",
  OUTPUT_FORMAT = "slashTags",
  OUTPUT_FORMAT_OPTIONS = "",
  NTHREADS = "1";

  public static final String ENCODING_PROPERTY = "encoding",
  TAG_SEPARATOR_PROPERTY = "tagSeparator";


  private static final Map defaultValues = Generics.newHashMap();
  static {
    defaultValues.put("arch", ARCH);
    defaultValues.put("wordFunction", WORD_FUNCTION);
    defaultValues.put("closedClassTags", "");
    defaultValues.put("closedClassTagThreshold", CLOSED_CLASS_THRESHOLD);
    defaultValues.put("search", SEARCH);
    defaultValues.put(TAG_SEPARATOR_PROPERTY, TAG_SEPARATOR);
    defaultValues.put("tokenize", TOKENIZE);
    defaultValues.put("debug", DEBUG);
    defaultValues.put("iterations", ITERATIONS);
    defaultValues.put("rareWordThresh", RARE_WORD_THRESH);
    defaultValues.put("minFeatureThresh", MIN_FEATURE_THRESH);
    defaultValues.put("curWordMinFeatureThresh", CUR_WORD_MIN_FEATURE_THRESH);
    defaultValues.put("rareWordMinFeatureThresh", RARE_WORD_MIN_FEATURE_THRESH);
    defaultValues.put("veryCommonWordThresh", VERY_COMMON_WORD_THRESH);
    defaultValues.put("occurringTagsOnly", OCCURRING_TAGS_ONLY);
    defaultValues.put("possibleTagsOnly", POSSIBLE_TAGS_ONLY);
    defaultValues.put("sigmaSquared", SIGMA_SQUARED);
    defaultValues.put(ENCODING_PROPERTY, ENCODING);
    defaultValues.put("learnClosedClassTags", LEARN_CLOSED_CLASS);
    defaultValues.put("verbose", VERBOSE);
    defaultValues.put("verboseResults", VERBOSE_RESULTS);
    defaultValues.put("openClassTags", "");
    defaultValues.put("lang", LANG);
    defaultValues.put("tokenizerFactory", TOKENIZER_FACTORY);
    defaultValues.put("xmlInput", XML_INPUT);
    defaultValues.put("tagInside", TAG_INSIDE);
    defaultValues.put("sgml", SGML);
    defaultValues.put("approximate", APPROXIMATE);
    defaultValues.put("tokenizerOptions", TOKENIZER_OPTIONS);
    defaultValues.put("regL1", DEFAULT_REG_L1);
    defaultValues.put("outputFile", OUTPUT_FILE);
    defaultValues.put("outputFormat", OUTPUT_FORMAT);
    defaultValues.put("outputFormatOptions", OUTPUT_FORMAT_OPTIONS);
    defaultValues.put("nthreads", NTHREADS);
  }

  /**
   * This constructor is just for creating an instance with default values.
   * Used internally.
   */
  private TaggerConfig() {
    super();
    this.putAll(defaultValues);
  }

  /**
   * We force you to pass in a TaggerConfig rather than any other
   * superclass so that we know the arg error checking has already occurred
   */
  public TaggerConfig(TaggerConfig old) {
    super(old);
  }

  public TaggerConfig(String... args) {
    this(StringUtils.argsToProperties(args));
  }

  public TaggerConfig(Properties props) {
    // load up the default properties
    this();

    /* Try and use the default properties from the model */
    // Properties modelProps = new Properties();
    // TaggerConfig oldConfig = new TaggerConfig(); // loads default values in oldConfig
    if (! props.containsKey("trainFile")) {
      String name = props.getProperty("model");
      if (name == null) {
        name = props.getProperty("dump");
      }
      if (name != null) {
        try {
          log.info("Loading default properties from tagger " + name);
          DataInputStream in = new DataInputStream(IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(name));
          this.putAll(TaggerConfig.readConfig(in)); // overwrites defaults with any serialized values.
          in.close();
        } catch (Exception e) {
          throw new RuntimeIOException("No such trained tagger config file found: " + name);
        }
      }
    }

    setProperties(props);
  }

  public void setProperties(Properties props) {
    if (props.getProperty("") != null) {
      throw new RuntimeException("unknown argument(s): \"" + props.getProperty("") + '\"');
    }

    if (props.getProperty("genprops") != null) {
      printGenProps(System.out);
      System.exit(0);
    }

    if (props.containsKey("mode") && props.containsKey("file")) {
      this.setProperty("mode", props.getProperty("mode"));
      this.setProperty("file", props.getProperty("file"));
    } else if (props.containsKey("trainFile")) {
      //Training mode
      this.setProperty("mode", Mode.TRAIN.toString());
      this.setProperty("file", props.getProperty("trainFile", "").trim());
    } else if (props.containsKey("testFile")) {
      //Testing mode
      this.setProperty("mode", Mode.TEST.toString());
      this.setProperty("file", props.getProperty("testFile", "").trim());
    } else if (props.containsKey("textFile")) {
      //Tagging mode
      this.setProperty("mode", Mode.TAG.toString());
      this.setProperty("file", props.getProperty("textFile", "").trim());
    } else if (props.containsKey("dump")) {
      this.setProperty("mode", Mode.DUMP.toString());
      // this.setProperty("file", props.getProperty("dump").trim());
      props.setProperty("model", props.getProperty("dump").trim());
    } else {
      this.setProperty("mode", Mode.TAG.toString());
      this.setProperty("file", "stdin");
    }
    //for any mode other than train, we load a classifier, which means we load a config - model always needs to be specified
    //on command line/in props file
    //Get the path to the model (or the path where you'd like to save the model); this is necessary for training, testing, and tagging
    this.setProperty("model", props.getProperty("model", this.getProperty("model", "")).trim());
    if ( ! (this.getMode() == Mode.DUMP) && this.getProperty("model").equals("")) {
      throw new RuntimeException("'model' parameter must be specified");
    }

    this.setProperty("search", props.getProperty("search", this.getProperty("search")).trim().toLowerCase());
    String srch = this.getProperty("search");
    if ( ! (srch.equals("cg") || srch.equals("iis") || srch.equals("owlqn") || srch.equals("qn") || srch.equals("owlqn2"))) {
      throw new RuntimeException("'search' must be one of 'iis', 'cg', 'qn' or 'owlqn' or 'owlqn2': " + srch);
    }

    this.setProperty("sigmaSquared", props.getProperty("sigmaSquared", this.getProperty("sigmaSquared")));

    this.setProperty(TAG_SEPARATOR_PROPERTY, props.getProperty(TAG_SEPARATOR_PROPERTY, this.getProperty(TAG_SEPARATOR_PROPERTY)));

    this.setProperty("iterations", props.getProperty("iterations", this.getProperty("iterations")));
    this.setProperty("rareWordThresh", props.getProperty("rareWordThresh", this.getProperty("rareWordThresh")));
    this.setProperty("minFeatureThresh", props.getProperty("minFeatureThresh", this.getProperty("minFeatureThresh")));
    this.setProperty("curWordMinFeatureThresh", props.getProperty("curWordMinFeatureThresh", this.getProperty("curWordMinFeatureThresh")));
    this.setProperty("rareWordMinFeatureThresh", props.getProperty("rareWordMinFeatureThresh", this.getProperty("rareWordMinFeatureThresh")));
    this.setProperty("veryCommonWordThresh", props.getProperty("veryCommonWordThresh", this.getProperty("veryCommonWordThresh")));
    this.setProperty("occurringTagsOnly", props.getProperty("occurringTagsOnly", this.getProperty("occurringTagsOnly", OCCURRING_TAGS_ONLY)));
    this.setProperty("possibleTagsOnly", props.getProperty("possibleTagsOnly", this.getProperty("possibleTagsOnly")));

    this.setProperty("lang", props.getProperty("lang", this.getProperty("lang")));

    this.setProperty("openClassTags", props.getProperty("openClassTags", this.getProperty("openClassTags")).trim());
    this.setProperty("closedClassTags", props.getProperty("closedClassTags", this.getProperty("closedClassTags")).trim());

    this.setProperty("learnClosedClassTags", props.getProperty("learnClosedClassTags", this.getProperty("learnClosedClassTags")));

    this.setProperty("closedClassTagThreshold", props.getProperty("closedClassTagThreshold", this.getProperty("closedClassTagThreshold")));

    this.setProperty("arch", props.getProperty("arch", this.getProperty("arch")));
    if (this.getMode() == Mode.TRAIN && this.getProperty("arch").equals("")) {
      throw new IllegalArgumentException("No architecture specified; " +
                                         "set the -arch flag with " +
                                         "the features to be used");
    }

    this.setProperty("wordFunction", props.getProperty("wordFunction", this.getProperty("wordFunction", WORD_FUNCTION)));

    this.setProperty("tokenize", props.getProperty("tokenize", this.getProperty("tokenize")));
    this.setProperty("tokenizerFactory", props.getProperty("tokenizerFactory", this.getProperty("tokenizerFactory")));

    this.setProperty("debugPrefix", props.getProperty("debugPrefix", this.getProperty("debugPrefix", "")));
    this.setProperty("debug", props.getProperty("debug", DEBUG));

    this.setProperty(ENCODING_PROPERTY, props.getProperty(ENCODING_PROPERTY, this.getProperty(ENCODING_PROPERTY)));
    this.setProperty("sgml", props.getProperty("sgml", this.getProperty("sgml")));
    this.setProperty("verbose", props.getProperty("verbose", this.getProperty("verbose")));
    this.setProperty("verboseResults", props.getProperty("verboseResults", this.getProperty("verboseResults")));

    this.setProperty("regL1", props.getProperty("regL1", this.getProperty("regL1")));

    //this is a property that is stored (not like the general properties)
    this.setProperty("xmlInput", props.getProperty("xmlInput", this.getProperty("xmlInput")).trim());

    this.setProperty("tagInside", props.getProperty("tagInside", this.getProperty("tagInside"))); //this isn't something we save from time to time
    this.setProperty("approximate", props.getProperty("approximate", this.getProperty("approximate"))); //this isn't something we save from time to time
    this.setProperty("tokenizerOptions", props.getProperty("tokenizerOptions", this.getProperty("tokenizerOptions"))); //this isn't something we save from time to time
    this.setProperty("outputFile", props.getProperty("outputFile", this.getProperty("outputFile")).trim()); //this isn't something we save from time to time
    this.setProperty("outputFormat", props.getProperty("outputFormat", this.getProperty("outputFormat")).trim()); //this isn't something we save from time to time
    this.setProperty("outputFormatOptions", props.getProperty("outputFormatOptions", this.getProperty("outputFormatOptions")).trim()); //this isn't something we save from time to time
    this.setProperty("nthreads", props.getProperty("nthreads", this.getProperty("nthreads", NTHREADS)).trim());
    String sentenceDelimiter = props.getProperty("sentenceDelimiter", this.getProperty("sentenceDelimiter"));
    if (sentenceDelimiter != null) {
      // this isn't something we save from time to time.
      // It is only relevant when tagging text files.
      // In fact, we let this one be null, as it really is useful to
      // let the null value represent no sentence delimiter.
      this.setProperty("sentenceDelimiter", sentenceDelimiter);
    }
  }


  public String getModel() { return getProperty("model"); }

  public String getFile() { return getProperty("file"); }

  public String getOutputFile() { return getProperty("outputFile"); }

  public String getOutputFormat() { return getProperty("outputFormat"); }

  public String[] getOutputOptions() { return getProperty("outputFormatOptions").split("\\s*,\\s*"); }

  public boolean getOutputVerbosity() {
    return getOutputOptionsContains("verbose");
  }

  public boolean getOutputLemmas() {
    return getOutputOptionsContains("lemmatize");
  }

  public boolean keepEmptySentences() {
    return getOutputOptionsContains("keepEmptySentences");    
  }

  public boolean getOutputOptionsContains(String sought) {
    String[] options = getOutputOptions();
    for (String option : options) {
      if (option.equals(sought)) {
        return true;
      }
    }
    return false;
  }

  public String getSearch() { return getProperty("search"); }

  public double getSigmaSquared() { return Double.parseDouble(getProperty("sigmaSquared")); }

  public int getIterations() { return Integer.parseInt(getProperty("iterations")); }

  public int getRareWordThresh() { return Integer.parseInt(getProperty("rareWordThresh")); }

  public int getMinFeatureThresh() { return Integer.parseInt(getProperty("minFeatureThresh")); }

  public int getCurWordMinFeatureThresh() { return Integer.parseInt(getProperty("curWordMinFeatureThresh")); }

  public int getRareWordMinFeatureThresh() { return Integer.parseInt(getProperty("rareWordMinFeatureThresh")); }

  public int getVeryCommonWordThresh() { return Integer.parseInt(getProperty("veryCommonWordThresh")); }

  public boolean occurringTagsOnly() { return Boolean.parseBoolean(getProperty("occurringTagsOnly")); }

  public boolean possibleTagsOnly() { return Boolean.parseBoolean(getProperty("possibleTagsOnly")); }

  public String getLang() { return getProperty("lang"); }

  public String[] getOpenClassTags() {
    return wsvStringToStringArray(getProperty("openClassTags"));
  }

  public String[] getClosedClassTags() {
    return wsvStringToStringArray(getProperty("closedClassTags"));
  }

  private static String[] wsvStringToStringArray(String str) {
    if (str == null || str.equals("")) {
      return StringUtils.EMPTY_STRING_ARRAY;
    } else {
      return str.split("\\s+");
    }
  }

  public boolean getLearnClosedClassTags() { return Boolean.parseBoolean(getProperty("learnClosedClassTags")); }

  public int getClosedTagThreshold() { return Integer.parseInt(getProperty("closedClassTagThreshold")); }

  public String getArch() { return getProperty("arch"); }

  public String getWordFunction() { return getProperty("wordFunction"); }

  public boolean getDebug() { return Boolean.parseBoolean(getProperty("debug")); }

  public String getDebugPrefix() { return getProperty("debugPrefix"); }

  public String getTokenizerFactory() { return getProperty("tokenizerFactory"); }

  public static String getDefaultTagSeparator() { return TAG_SEPARATOR; }

  public final String getTagSeparator() { return getProperty(TAG_SEPARATOR_PROPERTY); }

  public boolean getTokenize() { return Boolean.parseBoolean(getProperty("tokenize")); }

  public String getEncoding() { return getProperty(ENCODING_PROPERTY); }

  public double getRegL1() { return Double.parseDouble(getProperty("regL1")); }

  public String[] getXMLInput() {
    return wsvStringToStringArray(getProperty("xmlInput"));
  }

  public boolean getVerbose() { return Boolean.parseBoolean(getProperty("verbose")); }

  public boolean getVerboseResults() { return Boolean.parseBoolean(getProperty("verboseResults")); }

  public boolean getSGML() { return Boolean.parseBoolean(getProperty("sgml")); }

  public int getNThreads() { return Integer.parseInt(getProperty("nthreads")); }


  /** Return a regex of XML elements to tag inside of.  This may return an
   *  empty String, but never null.
   *
   * @return A regex of XML elements to tag inside of
   */
  public String getTagInside() {
    String str = getProperty("tagInside");
    if (str == null) {
      return "";
    }
    return str;
  }

  public String getTokenizerOptions() { return getProperty("tokenizerOptions"); }

  public boolean getTokenizerInvertible() {
    String tokenizerOptions = getTokenizerOptions();
    if (tokenizerOptions != null &&
        tokenizerOptions.matches("(^|.*,)invertible=true"))
      return true;
    return getOutputVerbosity() || getOutputLemmas();
  }

  /**
   * Returns a default score to be used for each tag that is incompatible with
   * the current word (e.g., the tag CC for the word "apple"). Using a default
   * score may slightly decrease performance for some languages (e.g., Chinese and
   * German), but allows the tagger to run considerably faster (since the computation
   * of the normalization term Z requires much less feature extraction). This approximation
   * does not decrease performance in English (on the WSJ). If this function returns
   * 0.0, the tagger will compute exact scores.
   *
   * @return default score
   */
  public double getDefaultScore() {
    String approx = getProperty("approximate");
    if ("false".equalsIgnoreCase(approx)) {
      return -1.0;
    } else if ("true".equalsIgnoreCase(approx)) {
      return 1.0;
    } else {
      return Double.parseDouble(approx);
    }
  }


  public void dump() { dump(new PrintWriter(System.err)); }

  public void dump(PrintStream stream) {
    PrintWriter pw = new PrintWriter(stream);
    dump(pw);
  }

  private void dump(PrintWriter pw) {
    pw.println("                   model = " + getProperty("model"));
    pw.println("                    arch = " + getProperty("arch"));
    pw.println("            wordFunction = " + getProperty("wordFunction"));
    if (this.getMode() == Mode.TRAIN || this.getMode() == Mode.DUMP) {
      pw.println("               trainFile = " + getProperty("file"));
    } else if (this.getMode() == Mode.TAG) {
      pw.println("                textFile = " + getProperty("file"));
    } else if (this.getMode() == Mode.TEST) {
      pw.println("                testFile = " + getProperty("file"));
    }

    pw.println("         closedClassTags = " + getProperty("closedClassTags"));
    pw.println(" closedClassTagThreshold = " + getProperty("closedClassTagThreshold"));
    pw.println(" curWordMinFeatureThresh = " + getProperty("curWordMinFeatureThresh"));
    pw.println("                   debug = " + getProperty("debug"));
    pw.println("             debugPrefix = " + getProperty("debugPrefix"));
    pw.println("            " + TAG_SEPARATOR_PROPERTY + " = " +
               getProperty(TAG_SEPARATOR_PROPERTY));
    pw.println("                " + ENCODING_PROPERTY + " = " +
               getProperty(ENCODING_PROPERTY));
    pw.println("              iterations = " + getProperty("iterations"));
    pw.println("                    lang = " + getProperty("lang"));
    pw.println("    learnClosedClassTags = " + getProperty("learnClosedClassTags"));
    pw.println("        minFeatureThresh = " + getProperty("minFeatureThresh"));
    pw.println("           openClassTags = " + getProperty("openClassTags"));
    pw.println("rareWordMinFeatureThresh = " + getProperty("rareWordMinFeatureThresh"));
    pw.println("          rareWordThresh = " + getProperty("rareWordThresh"));
    pw.println("                  search = " + getProperty("search"));
    pw.println("                    sgml = " + getProperty("sgml"));
    pw.println("            sigmaSquared = " + getProperty("sigmaSquared"));
    pw.println("                   regL1 = " + getProperty("regL1"));
    pw.println("               tagInside = " + getProperty("tagInside"));
    pw.println("                tokenize = " + getProperty("tokenize"));
    pw.println("        tokenizerFactory = " + getProperty("tokenizerFactory"));
    pw.println("        tokenizerOptions = " + getProperty("tokenizerOptions"));
    pw.println("                 verbose = " + getProperty("verbose"));
    pw.println("          verboseResults = " + getProperty("verboseResults"));
    pw.println("    veryCommonWordThresh = " + getProperty("veryCommonWordThresh"));
    pw.println("                xmlInput = " + getProperty("xmlInput"));
    pw.println("              outputFile = " + getProperty("outputFile"));
    pw.println("            outputFormat = " + getProperty("outputFormat"));
    pw.println("     outputFormatOptions = " + getProperty("outputFormatOptions"));
    pw.println("                nthreads = " + getProperty("nthreads"));
    pw.flush();
  }

  @Override
  public String toString() {
    StringWriter sw = new StringWriter(200);
    PrintWriter pw = new PrintWriter(sw);
    dump(pw);
    return sw.toString();
  }

  /**
   * This returns the sentence delimiter used when tokenizing text
   * using the tokenizer requested in this config.  In general, it is
   * assumed the tokenizer doesn't need a sentence delimiter.... If you
   * use the whitespace tokenizer, though, a newline breaks sentences.
   *
   * @return A null String unless tokenize is false and then the String
   */
  public String getSentenceDelimiter() {
    String delimiter = getProperty("sentenceDelimiter");
    if (delimiter == null && !getTokenize()) {
      delimiter = "\n";
    }
    return delimiter;
  }

  /**
   * Returns whether or not we should use stdin for reading when
   * tagging data.  For now, this returns true iff the filename given
   * was "stdin".
   * (TODO: kind of ugly)
   */
  public boolean useStdin() {
    return getFile().trim().equalsIgnoreCase("stdin");
  }

  /**
   * Prints out the automatically generated props file - in its own
   * method to make code above easier to read
   */
  private static void printGenProps(PrintStream out) {
    out.println("## Sample properties file for maxent tagger. This file is used for three main");
    out.println("## operations: training, testing, and tagging. It may also be used to dump");
    out.println("## the contents of a model.");
    out.println("## To train or test a model, or to tag something, run:");
    out.println("##   java edu.stanford.nlp.tagger.maxent.MaxentTagger -prop ");
    out.println("## Arguments can be overridden on the commandline, e.g.:");
    out.println("##   java ....MaxentTagger -prop  -testFile /other/file ");
    out.println();

    out.println("# Model file name (created at train time; used at tag and test time)");
    out.println("# (you can leave this blank and specify it on the commandline with -model)");
    out.println("# model = ");
    out.println();

    out.println("# Path to file to be operated on (trained from, tested against, or tagged)");
    out.println("# Specify -textFile  to tag text in the given file, -trainFile  to");
    out.println("# to train a model using data in the given file, or -testFile  to test your");
    out.println("# model using data in the given file.  Alternatively, you may specify");
    out.println("# -dump  to dump the parameters stored in a model or ");
    out.println("# -convertToSingleFile  to save an old, multi-file model (specified as -model)");
    out.println("# to the new single file format.  The new model will be saved in the file filename.");
    out.println("# If you choose to convert an old file, you must specify ");
    out.println("# the correct 'arch' parameter used to create the original model.");
    out.println("# trainFile = ");
    out.println();

    out.println("# Path to outputFile to write tagged output to.");
    out.println("# If empty, stdout is used.");
    out.println("# outputFile = " + OUTPUT_FILE);
    out.println();

    out.println("# Output format. One of: slashTags (default), xml, or tsv");
    out.println("# outputFormat = " + OUTPUT_FORMAT);
    out.println();

    out.println("# Output format options. Comma separated list.");
    out.println("# currently \"lemmatize\" and \"keepEmptySentences\" are supported.");
    out.println("# outputFormatOptions = " + OUTPUT_FORMAT_OPTIONS);
    out.println();

    out.println("# Tag separator character that separates word and pos tags");
    out.println("# (for both training and test data) and used for");
    out.println("# separating words and tags in slashTags format output.");
    out.println("# tagSeparator = " + TAG_SEPARATOR);
    out.println();

    out.println("# Encoding format in which files are stored.  If left blank, UTF-8 is assumed.");
    out.println("# encoding = " + ENCODING);
    out.println();

    out.println("# A couple flags for controlling the amount of output:");
    out.println("# - print extra debugging information:");
    out.println("# verbose = " + VERBOSE);
    out.println("# - print intermediate results:");
    out.println("# verboseResults = " + VERBOSE_RESULTS);

    out.println("######### parameters for tag and test operations #########");
    out.println();

    out.println("# Class to use for tokenization. Default blank value means Penn Treebank");
    out.println("# tokenization.  If you'd like to just assume that tokenization has been done,");
    out.println("# and the input is whitespace-tokenized, use");
    out.println("# edu.stanford.nlp.process.WhitespaceTokenizer or set tokenize to false.");
    out.println("# tokenizerFactory = ");
    out.println();

    out.println("# Options to the tokenizer.  A comma separated list.");
    out.println("# This depends on what the tokenizer supports.");
    out.println("# For PTBTokenizer, you might try options like americanize=false");
    out.println("# or asciiQuotes (for German!).");
    out.println("# tokenizerOptions = ");
    out.println();
    out.println("# Whether to tokenize text for tag and test operations. Default is true.");
    out.println("# If false, your text must already be whitespace tokenized.");
    out.println("# tokenize = " + TOKENIZE);
    out.println();

    out.println("# Write debugging information (words, top words, unknown words). Useful for");
    out.println("# error analysis. Default is false.");
    out.println("# debug = "+ DEBUG);
    out.println();

    out.println("# Prefix for debugging output (if debug == true). Default is to use the");
    out.println("# filename from 'file'");
    out.println("# debugPrefix = ");
    out.println();

    out.println("######### parameters for training  #########");
    out.println();

    out.println("# model architecture: This is one or more comma separated strings, which");
    out.println("# specify which extractors to use. Some of them take one or more integer");
    out.println("# or string ");
    out.println("# (file path) arguments in parentheses, written as m, n, and s below:");
    out.println("# 'left3words', 'left5words', 'bidirectional', 'bidirectional5words',");
    out.println("# 'generic', 'sighan2005', 'german', 'words(m,n)', 'wordshapes(m,n)',");
    out.println("# 'biwords(m,n)', 'lowercasewords(m,n)', 'vbn(n)', distsimconjunction(s,m,n)',");
    out.println("# 'naacl2003unknowns', 'naacl2003conjunctions', 'distsim(s,m,n)',");
    out.println("# 'suffix(n)', 'prefix(n)', 'prefixsuffix(n)', 'capitalizationsuffix(n)',");
    out.println("# 'wordshapes(m,n)', 'unicodeshapes(m,n)', 'unicodeshapeconjunction(m,n)',");
    out.println("# 'lctagfeatures', 'order(k)', 'chinesedictionaryfeatures(s)'.");
    out.println("# These keywords determines the features extracted.  'generic' is language independent.");
    out.println("# distsim: Distributional similarity classes can be an added source of information");
    out.println("# about your words. An English distsim file is included, or you can use your own.");
    out.println("# arch = ");
    out.println();
    out.println("# 'wordFunction'.  A function applied to the text before training or tagging.");
    out.println("# For example, edu.stanford.nlp.util.LowercaseFunction");
    out.println("# This function turns all the words into lowercase");
    out.println("# The function must implement java.util.function.Function");
    out.println("# Blank means no preprocessing function");
    out.println("# wordFunction = ");
    out.println();


    out.println("# 'language'.  This is really the tag set which is used for the");
    out.println("# list of open-class tags, and perhaps deterministic  tag");
    out.println("# expansion). Currently we have 'english', 'arabic', 'german', 'chinese'");
    out.println("# or 'polish' predefined. For your own language, you can specify ");
    out.println("# the same information via openClassTags or closedClassTags below");
    out.println("# (only ONE of these three options may be specified). ");
    out.println("# 'english' means UPenn English treebank tags. 'german' is STTS");
    out.println("# 'chinese' is CTB, and Arabic is an expanded Bies mapping from the ATB");
    out.println("# 'polish' means some tags that some guy on the internet once used. ");
    out.println("# See the TTags class for more information.");
    out.println("# lang = ");
    out.println();

    out.println("# a space-delimited list of open-class parts of speech");
    out.println("# alternatively, you can specify language above to use a pre-defined list or specify the closed class tags (below)");
    out.println("# openClassTags = ");
    out.println();

    out.println("# a space-delimited list of closed-class parts of speech");
    out.println("# alternatively, you can specify language above to use a pre-defined list or specify the open class tags (above)");
    out.println("# closedClassTags = ");
    out.println();

    out.println("# A boolean indicating whether you would like the trained model to set POS tags as closed");
    out.println("# based on their frequency in training; default is false.  The frequency threshold can be set below. ");
    out.println("# This option is ignored if any of {openClassTags, closedClassTags, lang} are specified.");
    out.println("# learnClosedClassTags = ");
    out.println();

    out.println("# Used only if learnClosedClassTags=true.  Tags that have fewer tokens than this threshold are");
    out.println("# considered closed in the trained model.");
    out.println("# closedClassTagThreshold = ");
    out.println();

    out.println("# search method for optimization. Normally use the default 'qn'. choices: 'qn' (quasi-Newton),");
    out.println("# 'cg' (conjugate gradient, 'owlqn' (L1 regularization) or 'iis' (improved iterative scaling)");
    out.println("# search = " + SEARCH);
    out.println();

    out.println("# for conjugate gradient or quasi-Newton search, sigma-squared smoothing/regularization");
    out.println("# parameter. if left blank, the default is 0.5, which is usually okay");
    out.println("# sigmaSquared = " + SIGMA_SQUARED);
    out.println();

    out.println("# for OWLQN search, regularization");
    out.println("# parameter. if left blank, the default is 1.0, which is usually okay");
    out.println("# regL1 = " + DEFAULT_REG_L1);
    out.println();

    out.println("# For improved iterative scaling, the number of iterations, otherwise ignored");
    out.println("# iterations = " + ITERATIONS);
    out.println();

    out.println("# rare word threshold. words that occur less than this number of");
    out.println("# times are considered rare words.");
    out.println("# rareWordThresh = " + RARE_WORD_THRESH);
    out.println();

    out.println("# minimum feature threshold. features whose history appears less");
    out.println("# than this number of times are ignored.");
    out.println("# minFeatureThresh = " + MIN_FEATURE_THRESH);
    out.println();

    out.println("# current word feature threshold. words that occur more than this");
    out.println("# number of times will generate features with all of their occurring");
    out.println("# tags.");
    out.println("# curWordMinFeatureThresh = " + CUR_WORD_MIN_FEATURE_THRESH);
    out.println();

    out.println("# rare word minimum feature threshold. features of rare words whose histories");
    out.println("# appear less than this times will be ignored.");
    out.println("# rareWordMinFeatureThresh = " + RARE_WORD_MIN_FEATURE_THRESH);
    out.println();

    out.println("# very common word threshold. words that occur more than this number of");
    out.println("# times will form an equivalence class by themselves. ignored unless");
    out.println("# you are using equivalence classes.");
    out.println("# veryCommonWordThresh = " + VERY_COMMON_WORD_THRESH);
    out.println();

    out.println("# sgml = ");
    out.println("# tagInside = ");
    out.println();

    out.println("# testFile and textFile can use multiple threads to process text.");
    out.println("# nthreads = " + NTHREADS);
  }

  public Mode getMode() {
    if (!containsKey("mode")) {
      return null;
    }
    return Mode.valueOf(getProperty("mode"));
  }


  /** Serialize the TaggerConfig.
   *
   * @param os Where to write this TaggerConfig
   * @throws IOException If any IO problems
   */
  public void saveConfig(OutputStream os) throws IOException {
    ObjectOutputStream out = new ObjectOutputStream(os);
    out.writeObject(this);
  }


  /** Read in a TaggerConfig.
   *
   * @param stream Where to read from
   * @return The TaggerConfig
   * @throws IOException Misc IOError
   * @throws ClassNotFoundException Class error
   */
  public static TaggerConfig readConfig(DataInputStream stream)
    throws IOException, ClassNotFoundException
  {
    ObjectInputStream in = new ObjectInputStream(stream);
    return (TaggerConfig) in.readObject();
  }


}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy