edu.stanford.nlp.tagger.maxent.TestThreadedTagger Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
There is a newer version: 4.5.7
Show newest version
// TestThreadedTagger -- StanfordMaxEnt, A Maximum Entropy Toolkit
// Copyright (c) 2002-2011 Leland Stanford Junior University
//
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//
// For more information, bug reports, fixes, contact:
//    Christopher Manning
//    Dept of Computer Science, Gates 1A
//    Stanford CA 94305-9010
//    USA
//    Support/Questions: java-nlp-user@lists.stanford.edu
//    Licensing: java-nlp-support@lists.stanford.edu
//    http://www-nlp.stanford.edu/software/tagger.shtml
package edu.stanford.nlp.tagger.maxent;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.Timing;

/**
 * Re-entrancy check for {@link MaxentTagger}.
 *
 * First, this runs a tagger once to see what results it comes up with.
 * Then it runs the same tagger in several threads at the same time to
 * make sure the results are the same.  The results of every run are
 * printed to stdout, and each concurrent run is also compared against
 * its baseline: a mismatch (or a run that fails outright) aborts the
 * test with a RuntimeException.
 *
 * Normally you would run MaxentTagger with command line arguments such as:
 *
 * -model ../data/tagger/my-left3words-distsim-wsj-0-18.tagger
 * -testFile ../data/tagger/test-wsj-19-21 -verboseResults false
 *
 * If you provide the same arguments to this program, it will first
 * run the given tagger on the given test file once to establish the
 * "baseline" results.  It will then run the same tagger in more than
 * one thread at the same time; the output of every thread should be
 * the same if the MaxentTagger is re-entrant.  The number of threads
 * to be run can be specified with -numThreads; the default is
 * DEFAULT_NUM_THREADS.
 *
 * You can also provide multiple models (-model1, -model2, ...; the
 * numbering must be contiguous starting at 1).  After performing the
 * test above on model1, it will then run the same test file on model2,
 * model3, etc to establish baseline results for those taggers.  After
 * that, it runs all of the taggers at the same time.  The taggers
 * should be completely separate structures.  In other words, a later
 * tagger should not have clobbered any static state initialized by an
 * earlier tagger.  Thus, the results of the simultaneous taggers should
 * be the same as the taggers' respective baselines.
 *
 * Example arguments for the more complicated test:
 *
 * -model1 ../data/pos-tagger/newmodels/left3words-distsim-wsj-0-18.tagger
 * -model2 ../data/pos-tagger/newmodels/left3words-wsj-0-18.tagger
 * -testFile ../data/pos-tagger/training/english/test-wsj-19-21
 * -verboseResults false
 *
 * @author John Bauer
 */
class TestThreadedTagger {
  /**
   * Default number of threads to launch in the first test.
   * Can be specified with -numThreads.
   */
  static final int DEFAULT_NUM_THREADS = 2;

  /** Name of the property that overrides {@link #DEFAULT_NUM_THREADS}. */
  static final String THREAD_FLAG = "numThreads";


  private TestThreadedTagger() {} // static methods


  /**
   * This internal class takes a tagger and a thread name.
   * The "run" method builds a TestClassifier around the tagger, times
   * that step, and stores the classifier's results string so the
   * launching thread can read it once this thread has finished.
   */
  private static class TaggerThread extends Thread {

    private final MaxentTagger tagger;
    private final String threadName;

    // Both fields below are written only by this thread and read by the
    // launching thread after join(), which makes the writes visible.
    private String resultsString = "";
    private Throwable failure; // stays null unless run() threw

    /**
     * @return the results produced by {@link #run()}, or the empty
     *         string if the run has not completed successfully
     */
    public String getResultsString() { return resultsString; }

    TaggerThread(MaxentTagger tagger, String name) {
      this.tagger = tagger;
      this.threadName = name;
    }

    @Override
    public void run() {
      try {
        Timing t = new Timing();
        TestClassifier testClassifier = new TestClassifier(tagger);
        long millis = t.stop();
        resultsString = testClassifier.resultsString(tagger);
        System.out.println("Thread " + threadName + " took " + millis +
                           " milliseconds to tag " + testClassifier.getNumWords() +
                           " words.\n" + resultsString);
      } catch (IOException e) {
        // remember the failure for awaitResults(), then fail the thread as before
        failure = e;
        throw new RuntimeException(e);
      } catch (RuntimeException e) {
        failure = e;
        throw e;
      }
    }

    /**
     * Waits for this thread to finish and hands back its results.
     * Without the failure check, a run that died would look like a run
     * that produced "" and could compare equal to an equally broken
     * baseline.
     *
     * @return the results string produced by the run
     * @throws InterruptedException if the waiting thread is interrupted
     * @throws RuntimeException if the run threw; the original exception
     *         is attached as the cause
     */
    String awaitResults() throws InterruptedException {
      join();
      if (failure != null) {
        throw new RuntimeException("Tagger thread " + threadName + " failed", failure);
      }
      return resultsString;
    }
  } // end class TaggerThread

  /**
   * Checks one run's results against the expected baseline.
   *
   * @param results the results string produced by the run under test
   * @param baseline the results string the run is expected to reproduce
   * @throws RuntimeException if the two strings are not equal
   */
  public static void compareResults(String results, String baseline) {
    if (!results.equals(baseline)) {
      throw new RuntimeException("Results different from expected baseline");
    }
  }

  /**
   * Command line entry point: converts the arguments to Properties and
   * hands them to {@link #runThreadedTest(Properties)}.
   */
  public static void main(final String[] args)
    throws ClassNotFoundException, IOException, InterruptedException
  {
    Properties props = StringUtils.argsToProperties(args);
    runThreadedTest(props);
  }

  /**
   * Runs the whole test: loads the tagger(s) named by "model" or by
   * "model1", "model2", ..., establishes a single-threaded baseline for
   * the first tagger, runs that tagger in several threads at once, and,
   * if more than one model was given, runs all of the taggers at the
   * same time.  Every concurrent run must reproduce its baseline.
   *
   * @param props test configuration; must contain "model" or "model1",
   *        and may contain "numThreads"
   * @throws IllegalArgumentException if no model is specified
   * @throws NumberFormatException if "numThreads" is not an integer
   * @throws RuntimeException if a run fails or differs from its baseline
   */
  public static void runThreadedTest(Properties props)
    throws ClassNotFoundException, IOException, InterruptedException
  {
    // let the user specify how many threads to run in the first test case
    int numThreads = DEFAULT_NUM_THREADS;
    String requestedThreads = props.getProperty(THREAD_FLAG);
    if (requestedThreads != null) {
      numThreads = Integer.parseInt(requestedThreads);
    }

    // read in each of the taggers specified on the command line
    printBanner("Loading taggers...");

    List<MaxentTagger> taggers = new ArrayList<>();
    if (props.getProperty("model") != null) {
      taggers.add(new MaxentTagger(props.getProperty("model"), props));
    } else {
      // model1, model2, ... : stop at the first number that is not present
      for (int taggerNum = 1; props.getProperty("model" + taggerNum) != null; ++taggerNum) {
        // each tagger gets its own copy of the config with "model" filled in
        Properties newProps = new Properties();
        newProps.putAll(props);
        newProps.setProperty("model", props.getProperty("model" + taggerNum));
        taggers.add(new MaxentTagger(newProps.getProperty("model"), newProps));
      }
    }

    // no models at all => bad
    if (taggers.isEmpty()) {
      throw new IllegalArgumentException("Please specify at least one of " +
                                         "-model or -model1");
    }

    // run baseline results for the first tagger model
    List<String> baselineResults = new ArrayList<>();
    baselineResults.add(runBaseline(taggers.get(0), 1));

    printBanner("Running " + numThreads + " threads of tagger 1");

    // run the first tagger in X separate threads at the same time
    // at the end of this test, those X threads should produce the same results
    List<TaggerThread> threads = new ArrayList<>();
    for (int i = 0; i < numThreads; ++i) {
      threads.add(new TaggerThread(taggers.get(0),
                                   "Simultaneous-" + (i + 1)));
    }
    for (TaggerThread thread : threads) {
      thread.start();
    }
    for (TaggerThread thread : threads) {
      compareResults(thread.awaitResults(), baselineResults.get(0));
    }

    // if we have more than one model...
    if (taggers.size() > 1) {
      // first, produce baseline results for the other models
      // do this one thread at a time so we know there are no
      // thread-related screwups
      for (int i = 1; i < taggers.size(); ++i) {
        baselineResults.add(runBaseline(taggers.get(i), i + 1));
      }

      printBanner("Running " + taggers.size() +
                  " threads of different taggers");

      // now, run the X models at the same time.  there used to be a
      // whole bunch of static state in the tagger, which used to mean
      // such a thing was not possible to do.  now that should not
      // be a problem any more
      threads.clear();
      for (int i = 0; i < taggers.size(); ++i) {
        threads.add(new TaggerThread(taggers.get(i),
                                     "DifferentTaggers-" + (i + 1)));
      }
      for (TaggerThread thread : threads) {
        thread.start();
      }
      for (int i = 0; i < taggers.size(); ++i) {
        compareResults(threads.get(i).awaitResults(), baselineResults.get(i));
      }
    }

    System.out.println("Done!");
  }

  /** Prints a progress message to stdout with a blank line before and after it. */
  private static void printBanner(String message) {
    System.out.println();
    System.out.println(message);
    System.out.println();
  }

  /**
   * Runs one tagger by itself, with no other tagger thread active, and
   * returns its results for use as the baseline.
   *
   * @param tagger the tagger to run
   * @param taggerNumber 1-based number of the tagger, used in the output
   * @return the results string of the run
   */
  private static String runBaseline(MaxentTagger tagger, int taggerNumber)
    throws InterruptedException
  {
    printBanner("Running the baseline results for tagger " + taggerNumber);

    TaggerThread baselineThread =
      new TaggerThread(tagger, "BaseResults-" + taggerNumber);
    baselineThread.start();
    return baselineThread.awaitResults();
  }
}