edu.nyu.jet.hmm.ActiveLearner Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of jet Show documentation

Information extraction is the process of identifying specified classes of entities, relations, and events in natural language text – creating structured data from unstructured input. JET, the Java Extraction Toolkit, developed at New York University over the past fifteen years, provides a rich set of tools for research and education in information extraction from English text. These include standard language processing tools such as a tokenizer, sentence segmenter, part-of-speech tagger, name tagger, regular-expression pattern matcher, and dependency parser. Also provided are relation and event extractors based on the specifications of the U.S. Government's ACE [Automatic Content Extraction] program. The program is provided under an Apache 2.0 license.

The newest version!

// -*- tab-width: 4 -*-
package edu.nyu.jet.hmm;

import java.util.*;
import java.io.*;
import edu.nyu.jet.tipster.*;
import edu.nyu.jet.lisp.FeatureSet;
import edu.nyu.jet.lex.Tokenizer;
import edu.nyu.jet.zoner.SentenceSplitter;
import edu.nyu.jet.scorer.*;

/*
 *  ActiveLearner:
 *  We divide a collection into three parts:
 *    initialTrainingSet  [fully annotated]
 *    activeLearningSet   [annotated incrementally]
 *    testSet
 *  Sentences included in training are marked with the feature "training"
 *  with value "true" on the SENTENCE annotation.
 */

public class ActiveLearner {

	// nt:  the name tagger being trained; initialize() creates it as an
	//      HMMNameTagger over WordFeatureHMMemitter and builds its HMM
	//      from "data/ACEnameTags.txt"
	static HMMNameTagger nt;

	// tagsToRead:  names of annotation tags.
	//     NOTE(review): presumably the tags read from the collection's
	//     documents -- the code that uses this is not visible here; confirm.
	static String[] tagsToRead = {"ENAMEX", "TIMEX", "NUMEX"};
	// sizes of the initial training set and the test set (see class comment).
	//     NOTE(review): presumably counted in documents -- confirm against
	//     initialize().
	static final int initialTrainingSetSize = 50;
	static final int testSetSize = 50;
	// activeTraining:
	//    if true, select sentences with smallest margin
	//    if false, select sentences at random
	//    (when true, initialize() also calls nt.nameHMM.recordMargin())
	static final boolean activeTraining = true;
	// simulatedTraining:  main() constructs an AnnotationColor only when
	//     this is false.
	//     NOTE(review): presumably true means annotations come from the
	//     existing key rather than from a human annotator -- confirm.
	static final boolean simulatedTraining = false;
	// multithread:
	//     NOTE(review): presumably selects whether interactive annotation
	//     runs in its own thread (see annotationThread) -- usage not visible
	//     here; confirm.
	static final boolean multithread = true;
	// sentencesPerSweep:  main() advances its sweep counter by this amount
	//     for each call to learn()
	static final int sentencesPerSweep = 5;
	// NOTE(review): the next three lists are raw ArrayLists whose element
	//     types are not visible here; judging by their names they hold the
	//     lowest-margin candidate sentences, the sentences chosen for
	//     annotation, and the documents containing them -- confirm.
	static ArrayList sentencesWithSmallestMargin;
	static ArrayList sentencesToAnnotate;
	static ArrayList documentsBeingAnnotated = new ArrayList();
	// annotationThread:  initially null.
	//     NOTE(review): presumably the running interactive annotator, if
	//     any -- confirm.
	static InteractiveAnnotator annotationThread = null;
	// poolSentences:
	//     if activeTraining = false, collects candidates for annotation
	//     (sentences not yet annotated)
	static ArrayList poolSentences;
	// keepLearning:  set to false by user through annotationTool to quit learning
	//     (main() checks it after every call to learn() and stops when false)
	static public volatile boolean keepLearning = true;
	// number of (unannotated) sentences in active training pool
	static int sentencesInPool = 0;
	// col:  the document collection; assigned in main() and opened in
	//     initialize()
	static DocumentCollection col;
	// logFile:  log writer; main() opens it on "active.log" in the home
	//     directory and closes it before returning
	static PrintWriter logFile = null;

	/**
	 *  Runs the active-learning experiment:  opens the log file, loads the
	 *  document collection, calls initialize(), and then calls learn()
	 *  repeatedly until the sweep counter passes 500 or keepLearning has
	 *  been set to false.
	 *
	 *  @param args  optional;  args[0], if present and non-empty, is the home
	 *               directory that holds "active.log" and the "HMM"
	 *               subdirectory.  With no arguments the original hard-coded
	 *               directory is used.
	 *  @throws IOException  if the log file cannot be created
	 */
	public static void main (String[] args) throws IOException {
		String home = "C:/Documents and Settings/Ralph Grishman/My Documents/";
		if (args != null && args.length > 0 && args[0].length() > 0) {
			home = args[0];
			// file names below are formed by plain concatenation, so the
			// directory must end in a separator
			if (!home.endsWith("/") && !home.endsWith("\\")) {
				home = home + "/";
			}
		}
		String logFileName = home + "active.log";
		logFile = new PrintWriter(new BufferedWriter(new FileWriter(logFileName)));
		// close (and thereby flush) the log even if loading or learning fails
		try {
			if (!simulatedTraining) {
				new AnnotationColor(home + "HMM");
			}
			// alternative collections:
			// col = new DocumentCollection(home + "HMM/NE/ACE sep02 nwire Collection.txt");
			// col = new DocumentCollection(home + "HMM/NE/ACE aug03 written Collection.txt");
			col = new DocumentCollection(home + "HMM/NE/ACE training Collection.txt");

			initialize();
			for (int rep=0; rep<=500; rep+=sentencesPerSweep) {
				learn();
				// the user can end learning early through the annotation tool
				if (!keepLearning) break;
			}
		} finally {
			logFile.close();
		}
	}

	static void initialize () {

		// load collection with NE annotation

		// split documents into sentences and tokenize them
		col.open();
		for (int i=0; i= initialTrainingSetSize) doc.removeAnnotation(enamex);
	        }
	    }

		//    train HMM on initial training set

		nt = new HMMNameTagger(WordFeatureHMMemitter.class);
		nt.buildNameHMM("data/ACEnameTags.txt");
		if (activeTraining) nt.nameHMM.recordMargin();

		for (int i=0; i s2.margin)
			return 1;
		else
			return 0;
	}
}