All downloads are FREE. Search and download functionality uses the official Maven repository.

edu.nyu.jet.hmm.ActiveLearner Maven / Gradle / Ivy

Go to download

Information extraction is the process of identifying specified classes of entities, relations, and events in natural language text – creating structured data from unstructured input. JET, the Java Extraction Toolkit, developed at New York University over the past fifteen years, provides a rich set of tools for research and education in information extraction from English text. These include standard language processing tools such as a tokenizer, sentence segmenter, part-of-speech tagger, name tagger, regular-expression pattern matcher, and dependency parser. Also provided are relation and event extractors based on the specifications of the U.S. Government's ACE [Automatic Content Extraction] program. The program is provided under an Apache 2.0 license.

The newest version!
// -*- tab-width: 4 -*-
package edu.nyu.jet.hmm;

import java.util.*;
import java.io.*;
import edu.nyu.jet.tipster.*;
import edu.nyu.jet.lisp.FeatureSet;
import edu.nyu.jet.lex.Tokenizer;
import edu.nyu.jet.zoner.SentenceSplitter;
import edu.nyu.jet.scorer.*;

/*
 *  ActiveLearner:
 *  We divide a collection into three parts:
 *    initialTrainingSet  [fully annotated]
 *    activeLearningSet   [annotated incrementally]
 *    testSet
 *  Sentences included in training are marked with the feature "training"
 *  with value "true" on the SENTENCE annotation.
 */

public class ActiveLearner {

	// ---- shared state for the active-learning run (all static) ----

	// nt:  the HMM name tagger being trained; created in initialize()
	static HMMNameTagger nt;

	// tagsToRead:  names of the annotation tags of interest
	//     NOTE(review): the code that consumes this array is not visible
	//     here -- presumably the tags loaded from the collection; confirm.
	static String[] tagsToRead = {"ENAMEX", "TIMEX", "NUMEX"};
	// sizes of the initial training set and the test set described in the
	// class header comment
	//     NOTE(review): unit (documents vs. sentences) is not established by
	//     the visible code -- confirm against initialize().
	static final int initialTrainingSetSize = 50;
	static final int testSetSize = 50;
	// activeTraining:
	//    if true, select sentences with smallest margin
	//    if false, select sentences at random
	static final boolean activeTraining = true;
	// simulatedTraining:
	//    if false, main() creates an AnnotationColor from the "HMM" directory
	//    before learning starts
	//    NOTE(review): presumably 'true' means annotations come from the
	//    existing keys rather than from a user -- confirm.
	static final boolean simulatedTraining = false;
	// multithread:
	//    NOTE(review): usage not visible here; presumably runs the
	//    interactive annotator (annotationThread) concurrently with
	//    learning -- confirm.
	static final boolean multithread = true;
	// sentencesPerSweep:  step by which main() advances its sweep counter
	//    for each call to learn()
	static final int sentencesPerSweep = 5;
	// raw (untyped) lists; element types are fixed only by the code that
	// fills them, which is not visible here
	static ArrayList sentencesWithSmallestMargin;
	static ArrayList sentencesToAnnotate;
	static ArrayList documentsBeingAnnotated = new ArrayList();
	// annotationThread:  null until an annotator is created
	static InteractiveAnnotator annotationThread = null;
	// poolSentences:
	//     if activeTraining = false, collects candidates for annotation
	//     (sentences not yet annotated)
	static ArrayList poolSentences;
	// keepLearning:  set to false by user through annotationTool to quit learning
	//     (volatile; main() tests it after every call to learn())
	static public volatile boolean keepLearning = true;
	// number of (unannotated) sentences in active training pool
	static int sentencesInPool = 0;
	// col:  the document collection; assigned in main(), opened in initialize()
	static DocumentCollection col;
	// logFile:  log writer; opened at the start of main() and closed at its end
	static PrintWriter logFile = null;

	/**
	 *  Runs one active-learning session.
	 *  <p>
	 *  Opens the log file <code>active.log</code> in the home directory,
	 *  creates the <code>AnnotationColor</code> (unless
	 *  <code>simulatedTraining</code> is set), loads the document collection,
	 *  calls <code>initialize()</code>, and then calls <code>learn()</code>
	 *  repeatedly.  The sweep counter runs from 0 to 500 in steps of
	 *  <code>sentencesPerSweep</code>; the loop also stops as soon as
	 *  <code>keepLearning</code> is found to be false after a sweep.
	 *  <p>
	 *  The log file is closed even if initialization or learning throws, so
	 *  buffered log output is not lost on failure.
	 *
	 *  @param  args  optional.  If <code>args[0]</code> is present it replaces
	 *                the built-in home directory.  It is concatenated
	 *                directly with file names, so it must end with a path
	 *                separator.
	 *  @throws IOException  if the log file cannot be opened
	 */
	public static void main (String[] args) throws IOException {
		// the default is the original author's directory; allow an override
		// so the program can be run on other machines
		String home = (args != null && args.length > 0)
			? args[0]
			: "C:/Documents and Settings/Ralph Grishman/My Documents/";
		String logFileName = home + "active.log";
		logFile = new PrintWriter(new BufferedWriter(new FileWriter(logFileName)));
		try {
			if (!simulatedTraining) {
				// constructed for its side effect; the instance is not retained
				new AnnotationColor(home + "HMM");
			}
			// alternative collections used previously:
			// col = new DocumentCollection(home + "HMM/NE/ACE sep02 nwire Collection.txt");
			// col = new DocumentCollection(home + "HMM/NE/ACE aug03 written Collection.txt");
			col = new DocumentCollection(home + "HMM/NE/ACE training Collection.txt");

			initialize();
			for (int rep=0; rep<=500; rep+=sentencesPerSweep) {
				learn();
				// keepLearning is cleared by the user to end the session
				if (!keepLearning) break;
			}
		} finally {
			// close() also flushes the BufferedWriter underneath
			logFile.close();
		}
	}

	static void initialize () {

		// load collection with NE annotation

		// split documents into sentences and tokenize them
		col.open();
		for (int i=0; i= initialTrainingSetSize) doc.removeAnnotation(enamex);
	        }
	    }

		//    train HMM on initial training set

		nt = new HMMNameTagger(WordFeatureHMMemitter.class);
		nt.buildNameHMM("data/ACEnameTags.txt");
		if (activeTraining) nt.nameHMM.recordMargin();

		for (int i=0; i s2.margin)
			return 1;
		else
			return 0;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy