
cc.mallet.share.mccallum.ner.TUI Maven / Gradle / Ivy


MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.
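
TUI is a command-line driver for training and evaluating the CRF named-entity tagger defined below. A minimal invocation sketch (the jar path is an assumption; the flags are the options declared in this class, and the CoNLL-2003 files eng.train / eng.testa / eng.testb are expected under $HOME/research/data/ie/ner2003/):

java -cp mallet.jar cc.mallet.share.mccallum.ner.TUI --label-gram 2 --gaussian-variance 10.0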

/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */




/** 
   @author Andrew McCallum [email protected]
 */

package cc.mallet.share.mccallum.ner;

import java.util.Random;
import java.util.regex.*;
import java.io.*;

import cc.mallet.fst.*;
import cc.mallet.pipe.*;
import cc.mallet.pipe.iterator.*;
import cc.mallet.pipe.tsf.*;
import cc.mallet.types.*;
import cc.mallet.util.*;

public class TUI
{
	static CommandOption.Double gaussianVarianceOption = new CommandOption.Double
	(TUI.class, "gaussian-variance", "DECIMAL", true, 10.0,
	 "The gaussian prior variance used for training.", null);

	static CommandOption.Double hyperbolicSlopeOption = new CommandOption.Double
	(TUI.class, "hyperbolic-slope", "DECIMAL", true, 0.2,
	 "The hyperbolic prior slope used for training.", null);

	static CommandOption.Double hyperbolicSharpnessOption = new CommandOption.Double
	(TUI.class, "hyperbolic-sharpness", "DECIMAL", true, 10.0,
	 "The hyperbolic prior sharpness used for training.", null);

	static CommandOption.File crfInputFileOption = new CommandOption.File
	(TUI.class, "crf-input-file", "FILENAME", true, null,
	 "The name of the file to write the CRF after training.", null);

	static CommandOption.Integer randomSeedOption = new CommandOption.Integer
	(TUI.class, "random-seed", "INTEGER", true, 0,
	 "The random seed for randomly selecting a proportion of the instance list for training", null);

	static CommandOption.Integer labelGramOption = new CommandOption.Integer
	(TUI.class, "label-gram", "INTEGER", true, 1,
	 "Markov order of labels: 1, 2, 3", null);

	static CommandOption.Integer wordWindowFeatureOption = new CommandOption.Integer
	(TUI.class, "word-window-size", "INTEGER", true, 0,
	 "Size of window of words as features: 0=none, 10, 20...", null);

	static CommandOption.Boolean useTestbOption = new CommandOption.Boolean
	(TUI.class, "use-testb", "true|false", true, false,
	 "Use testb, final test set", null);
	
	static CommandOption.Boolean useHyperbolicPriorOption = new CommandOption.Boolean
	(TUI.class, "use-hyperbolic-prior", "true|false", true, false,
	 "Use hyperbolic prior", null);

	static CommandOption.Boolean useFeatureInductionOption = new CommandOption.Boolean
	(TUI.class, "use-feature-induction", "true|false", true, false,
	 "Not use or use feature induction", null);

	static CommandOption.Boolean clusterFeatureInductionOption = new CommandOption.Boolean
	(TUI.class, "cluster-feature-induction", "true|false", true, false,
	 "Cluster in feature induction", null);

	static CommandOption.Boolean useFirstMentionFeatureOption = new CommandOption.Boolean
	(TUI.class, "use-firstmention-feature", "true|false", true, false,
	 "Don't use first-mention feature", null);

	static CommandOption.Boolean useDocHeaderFeatureOption = new CommandOption.Boolean
	(TUI.class, "use-docheader-feature", "true|false", true, false,
	 "", null);

	static CommandOption.Boolean includeConllLexiconsOption = new CommandOption.Boolean
	(TUI.class, "include-conll-lexicons", "true|false", true, false,
	 "", null);

	static CommandOption.Boolean charNGramsOption = new CommandOption.Boolean
	(TUI.class, "char-ngrams", "true|false", true, false,
	 "", null);
	
	static CommandOption.String offsetsOption = new CommandOption.String
	(TUI.class, "offsets", "e.g. [[0,0],[1]]", true, "[[-2],[-1],[1],[2]]", 
	 "Offset conjunctions", null);

	static CommandOption.String capOffsetsOption = new CommandOption.String
	(TUI.class, "cap-offsets", "e.g. [[0,0],[0,1]]", true, "", 
	 "Offset conjunctions applied to features that are [A-Z]*", null);

	static CommandOption.String viterbiFilePrefixOption = new CommandOption.String
	(TUI.class, "viterbi-file", "FILE", true, "TUI", 
	 "Filename in which to store most recent Viterbi output", null);
	

	static final CommandOption.List commandOptions =
	new CommandOption.List (
		"Training, testing and running a Chinese word segmenter.",
		new CommandOption[] {
			gaussianVarianceOption,
			hyperbolicSlopeOption,
			hyperbolicSharpnessOption,
			randomSeedOption,
			labelGramOption,
			wordWindowFeatureOption,
			useHyperbolicPriorOption,
			useFeatureInductionOption,
			clusterFeatureInductionOption,
			useFirstMentionFeatureOption,
			useDocHeaderFeatureOption,
			includeConllLexiconsOption,
			offsetsOption,
			capOffsetsOption,
			viterbiFilePrefixOption,
			useTestbOption,
			charNGramsOption,
			crfInputFileOption,
		});

	
	int numEvaluations = 0;
	static int iterationsBetweenEvals = 16;
	static boolean doingFeatureInduction = true;
	static boolean doingClusteredFeatureInduction = false;

	private static String CAPS = "[\\p{Lu}]";
	private static String LOW = "[\\p{Ll}]";
	private static String CAPSNUM = "[\\p{Lu}\\p{Nd}]";
	private static String ALPHA = "[\\p{Lu}\\p{Ll}]";
	private static String ALPHANUM = "[\\p{Lu}\\p{Ll}\\p{Nd}]";
	private static String PUNT = "[,\\.;:?!()]";
	private static String QUOTE = "[\"`']";
	
	public static void main (String[] args) throws Exception
	{
		commandOptions.process (args);
		// "HOME" is an environment variable, not a Java system property; use
		// the standard user.home property for the home directory.
		String homedir = System.getProperty ("user.home");
		String lexdir = homedir+"/research/data/resources/";

		String offsetsString = offsetsOption.value.replace('[','{').replace(']','}');
		int[][] offsets = (int[][]) CommandOption.getInterpreter().eval ("new int[][] "+offsetsString);
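		// These bracketed option strings are parsed by MALLET's embedded
		// BeanShell interpreter into int[][] arrays; the default
		// "[[-2],[-1],[1],[2]]" becomes {{-2},{-1},{1},{2}}, i.e.
		// OffsetConjunctions of single features at positions -2, -1, +1, +2
		// relative to the current token.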

		String capOffsetsString = capOffsetsOption.value.replace('[','{').replace(']','}');
		int[][] capOffsets = null;
		if (capOffsetsString.length() > 0)
			capOffsets = (int[][]) CommandOption.getInterpreter().eval ("new int[][] "+capOffsetsString);

		Pipe conllLexiconsPipe = null;
		if (includeConllLexiconsOption.value)
			conllLexiconsPipe = new SerialPipes (new Pipe[] {
				new TrieLexiconMembership (new File(lexdir + "conll/CONLLTWOPER")),
				new TrieLexiconMembership (new File(lexdir + "conll/CONLLTWOLOC")),
				new TrieLexiconMembership (new File(lexdir + "conll/CONLLTWOORG")),
				new TrieLexiconMembership (new File(lexdir + "conll/CONLLTWOMISC")),
			});
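
		// The feature pipeline: ConllNer2003Sentence2TokenSequence turns the
		// raw CoNLL lines of an instance into a TokenSequence with target
		// labels; the RegexMatches pipes attach orthographic features
		// (capitalization, digits, punctuation) to each token; optional
		// lexicon, window and character-n-gram features follow; and
		// TokenSequence2FeatureVectorSequence produces the CRF's input.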
		
		Pipe p = new SerialPipes (new Pipe[] {
			new ConllNer2003Sentence2TokenSequence (),
			new RegexMatches ("INITCAP", Pattern.compile (CAPS+".*")),
			new RegexMatches ("CAPITALIZED", Pattern.compile (CAPS+LOW+"*")),
			new RegexMatches ("ALLCAPS", Pattern.compile (CAPS+"+")),
			new RegexMatches ("MIXEDCAPS", Pattern.compile ("[A-Z][a-z]+[A-Z][A-Za-z]*")),
			new RegexMatches ("CONTAINSDIGITS", Pattern.compile (".*[0-9].*")),
			new RegexMatches ("ALLDIGITS", Pattern.compile ("[0-9]+")),
			new RegexMatches ("NUMERICAL", Pattern.compile ("[-0-9]+[\\.,]+[0-9\\.,]+")),
			//new RegexMatches ("ALPHNUMERIC", Pattern.compile ("[A-Za-z0-9]+")),
			//new RegexMatches ("ROMAN", Pattern.compile ("[ivxdlcm]+|[IVXDLCM]+")),
			new RegexMatches ("MULTIDOTS", Pattern.compile ("\\.\\.+")),
			new RegexMatches ("ENDSINDOT", Pattern.compile ("[^\\.]+.*\\.")),
			new RegexMatches ("CONTAINSDASH", Pattern.compile (ALPHANUM+"+-"+ALPHANUM+"*")),
			new RegexMatches ("ACRO", Pattern.compile ("[A-Z][A-Z\\.]*\\.[A-Z\\.]*")),
			new RegexMatches ("LONELYINITIAL", Pattern.compile (CAPS+"\\.")),
			new RegexMatches ("SINGLECHAR", Pattern.compile (ALPHA)),
			new RegexMatches ("CAPLETTER", Pattern.compile ("[A-Z]")),
			new RegexMatches ("PUNC", Pattern.compile (PUNT)),
			new RegexMatches ("QUOTE", Pattern.compile (QUOTE)),
			//new RegexMatches ("LOWER", Pattern.compile (LOW+"+")),
			//new RegexMatches ("MIXEDCAPS", Pattern.compile ("[A-Z]+[a-z]+[A-Z]+[a-z]*")),

			(includeConllLexiconsOption.value ? conllLexiconsPipe : new Noop ()),

			// Note that the word has not been lowercased, so INITCAP etc. are redundant.
			//new TokenSequenceLowercase (),
			new TokenText ("W="),
			//new TokenSequenceFirstSentenceAllCaps (),

			new OffsetConjunctions (offsets),
			(capOffsets != null ? (Pipe) new OffsetConjunctions (capOffsets) : (Pipe) new Noop ()),

			//// Don't lowercase the W= if you want to use this.
			(!useFirstMentionFeatureOption.value
			 ? (Pipe) new Noop ()
			 : (Pipe) new FeaturesOfFirstMention ("FIRSTMENTION=", Pattern.compile (CAPS+".*"),
					// Exclude singleton W=foo features b/c redundant
					Pattern.compile ("W=[^@&]+"), false)),
			(!useDocHeaderFeatureOption.value ? (Pipe) new Noop () : (Pipe) new TokenSequenceDocHeader ()),
			
			(wordWindowFeatureOption.value > 0
			 // Window features copy the W= token-text features (added by
			 // TokenText above) from neighboring tokens into this one.
			 ? (Pipe) new FeaturesInWindow ("WINDOW=", -wordWindowFeatureOption.value,
					wordWindowFeatureOption.value, Pattern.compile ("W=.*"), true)
			 : (Pipe) new Noop()),
			(charNGramsOption.value
			 ? (Pipe) new TokenTextCharNGrams ("CHARNGRAM=", new int[] {2,3,4})
			 : (Pipe) new Noop()),

			new PrintTokenSequenceFeatures(),
			new TokenSequence2FeatureVectorSequence (true, true)
		});


		// Set up training and testing data
		//args = new String[] {homedir+"/research/data/ie/ner2003/eng.testa"};
		if (useTestbOption.value)
			args = new String[] {homedir+"/research/data/ie/ner2003/eng.train",
				homedir+"/research/data/ie/ner2003/eng.testb"};
		else
			args = new String[] {homedir+"/research/data/ie/ner2003/eng.train",
				homedir+"/research/data/ie/ner2003/eng.testa"};

		InstanceList trainingData = new InstanceList (p);
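		// Each instance is one document: LineGroupIterator starts a new group
		// at lines matching the CoNLL "-DOCSTART- -X- -X- O" sentinel (the
		// regex below uses '.' as a wildcard for the hyphens).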
		trainingData.addThruPipe (new LineGroupIterator (new FileReader (new File (args[0])),
			Pattern.compile("^.DOCSTART. .X. .X. .$"), true));
		System.out.println ("Read "+trainingData.size()+" training instances");
		
		InstanceList testingData = null;
		if (args.length > 1) {
			testingData = new InstanceList (p);
			testingData.addThruPipe (new LineGroupIterator (new FileReader (new File (args[1])),
				Pattern.compile("^.DOCSTART. .X. .X. .$"), true));
		}

		if (testingData == null) {
			// No test file was given: for now, train on a small fraction of
			// the data and test on another held-out fraction.
			Random r = new Random (randomSeedOption.value);
			// Proportions below are {training, testing, ignored}
			InstanceList[] trainingLists = trainingData.split (r, new double[] {.2, .1, .7});
			trainingData = trainingLists[0];
			testingData = trainingLists[1];
		}

		// Print out all the target names
		Alphabet targets = p.getTargetAlphabet();
		System.out.print ("State labels:");
		for (int i = 0; i < targets.size(); i++)
			System.out.print (" " + targets.lookupObject(i));
		System.out.println ("");

		// Print out some feature information
		System.out.println ("Number of features = "+p.getDataAlphabet().size());

		CRF crf = new CRF (p, null);
		if (labelGramOption.value == 1)
			crf.addStatesForLabelsConnectedAsIn (trainingData);
		else if (labelGramOption.value == 2)
			crf.addStatesForBiLabelsConnectedAsIn (trainingData);
		//else if (labelGramOption.value == 3)
		//crf.addStatesForTriLabelsConnectedAsIn (trainingData);
		else
			throw new IllegalStateException ("label-gram must be 1 or 2 (3 is not yet supported), not "+ labelGramOption.value);
		CRFTrainerByLabelLikelihood crft = new CRFTrainerByLabelLikelihood (crf);		
		
		if (useHyperbolicPriorOption.value) {
			crft.setUseHyperbolicPrior (true);
			crft.setHyperbolicPriorSlope (hyperbolicSlopeOption.value);
			crft.setHyperbolicPriorSharpness (hyperbolicSharpnessOption.value);
		} else {
			crft.setGaussianPriorVariance (gaussianVarianceOption.value);
		}
		for (int i = 0; i < crf.numStates(); i++) {
			Transducer.State s = crf.getState (i);
			// A segment cannot begin with an inside (I-) label, so make these
			// states impossible as start states; weights are log-probabilities,
			// so "impossible" is negative infinity.
			if (s.getName().charAt(0) == 'I')
				s.setInitialWeight (Double.NEGATIVE_INFINITY);
		}

		System.out.println("Training on "+trainingData.size()+" training instances, "+
											 testingData.size()+" testing instances...");
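		// Segment-level evaluation: B-* tags begin an entity segment and I-*
		// tags continue it; the evaluator prints per-type precision, recall
		// and F1 on both instance lists each time it is invoked.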
		MultiSegmentationEvaluator eval =
			new MultiSegmentationEvaluator (new InstanceList[] {trainingData, testingData},
					new String[] {"Training", "Testing"},
					new String[] {"B-PER", "B-LOC", "B-ORG", "B-MISC"},
					new String[] {"I-PER", "I-LOC", "I-ORG", "I-MISC"});
		ViterbiWriter vw = new ViterbiWriter (viterbiFilePrefixOption.value,
				new InstanceList[] {trainingData, testingData}, new String[] {"Training", "Testing"});
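
		// Feature induction (McCallum, 2003) alternates CRF training with the
		// addition of new conjunction features, selected by their estimated
		// likelihood gain under the current model.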
			
		if (useFeatureInductionOption.value) {
			if (clusterFeatureInductionOption.value)
				crft.trainWithFeatureInduction (trainingData, null, testingData,
					eval, 99999, 10, 99, 200, 0.5, true,
					new double[] {.1, .2, .5, .7});
			else
				crft.trainWithFeatureInduction (trainingData, null, testingData,
					eval, 99999, 10, 99, 1000, 0.5, false,
					new double[] {.1, .2, .5, .7});
		}
		else {
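			// Staged training: a few iterations on growing fractions of the
			// training data (to speed early convergence), then train on the
			// full set until the label likelihood converges.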
			double[] trainingProportions = new double[] {.1, .2, .5, .7};
			for (int i = 0; i < trainingProportions.length; i++) {
				crft.train(trainingData, 3, new double[] {trainingProportions[i]});
				eval.evaluate(crft);
				vw.evaluate(crft);
			}
			while (crft.train(trainingData, 3)) {
				eval.evaluate(crft);
				vw.evaluate(crft);
			}
			eval.evaluate(crft);
			vw.evaluate(crft);
		}
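
		// A minimal sketch of model persistence: write the trained CRF when
		// --crf-input-file is given (assumes standard Java serialization,
		// which MALLET's CRF supports).
		if (crfInputFileOption.value != null) {
			ObjectOutputStream oos = new ObjectOutputStream (new FileOutputStream (crfInputFileOption.value));
			oos.writeObject (crf);
			oos.close ();
		}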
	}

	
}



