/*******************************************************************************
 * Copyright 2013,2014 Institute of Mathematics and Computer Science, University of Latvia
 * Author: Artūrs Znotiņš
 * 
 *     This program is free software: you can redistribute it and/or modify
 *     it under the terms of the GNU General Public License as published by
 *     the Free Software Foundation, either version 3 of the License, or
 *     (at your option) any later version.
 * 
 *     This program is distributed in the hope that it will be useful,
 *     but WITHOUT ANY WARRANTY; without even the implied warranty of
 *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *     GNU General Public License for more details.
 * 
 *     You should have received a copy of the GNU General Public License
 *     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *******************************************************************************/
package lv.lumii.ner;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.ListNERSequenceClassifier;
import edu.stanford.nlp.ie.NERClassifierCombiner;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ie.regexp.RegexNERSequenceClassifier;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.sequences.DocumentReaderAndWriter;
import edu.stanford.nlp.sequences.LVCoNLLDocumentReaderAndWriter;
import edu.stanford.nlp.util.StringUtils;

public class NerPipe {
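	// Supported formats: input is raw sentences or CoNLL columns; output is
	// CoNLL-X columns, a simplified comparison format, or NER tags folded into
	// the morphology feature string.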
	private enum inputTypes {SENTENCE, CONLL}
	private enum outputTypes {CONLL_X, SIMPLE, INFEATURES}
	
	public Properties props;
	public NERClassifierCombiner classifier;
	public DocumentReaderAndWriter<CoreLabel> defaultReaderWriter;
					
	private static inputTypes inputType = inputTypes.SENTENCE;
	private static outputTypes outputType = outputTypes.CONLL_X;
	
	private static String defaultCrfClassifier; // default ("lv-ner-model.ser.gz") removed for testing; supply the model via the loadClassifier property
	
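	/**
	 * Builds the combined classifier from the given properties: whitelist
	 * classifiers (cased lemmas, uncased words, cased words) come first,
	 * followed by the serialized CRF model and, last, an optional regex list.
	 */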
	@SuppressWarnings("unchecked")
	public NerPipe(Properties props) throws ClassCastException, ClassNotFoundException, IOException {
		this.props = props;
		initializeFromProperties();
		
		List<AbstractSequenceClassifier<CoreLabel>> classifiers = new ArrayList<>();
		
		if (props.containsKey("whiteListCasedLemmas"))
			classifiers.add(new ListNERSequenceClassifier(props.getProperty("whiteListCasedLemmas"), false, true, true));
		if (props.containsKey("whiteListUncasedWords"))
			classifiers.add(new ListNERSequenceClassifier(props.getProperty("whiteListUncasedWords"), true, false, true));
		if (props.containsKey("whiteListCasedWords"))
			classifiers.add(new ListNERSequenceClassifier(props.getProperty("whiteListCasedWords"), false, false, true));
		if (defaultCrfClassifier != null)
			classifiers.add(CRFClassifier.getClassifier(defaultCrfClassifier, props));
		if (props.containsKey("regexList"))
			classifiers.add(new RegexNERSequenceClassifier(props.getProperty("regexList"), true, true));

		classifier = new NERClassifierCombiner(classifiers);
		defaultReaderWriter = new LVCoNLLDocumentReaderAndWriter();
		defaultReaderWriter.init(classifier.flags);
	}
	
	public void setDefaultClassifier(NERClassifierCombiner classifier) {		
		this.classifier = classifier;
	}
	
	public NerPipe(Properties props, AbstractSequenceClassifier<CoreLabel> classifier) throws FileNotFoundException {
		this(props, new NERClassifierCombiner(classifier));
	}
	
	public NerPipe(Properties props, NERClassifierCombiner nerClassifier) {
		this.props = props;
		classifier = nerClassifier;
		defaultReaderWriter = new LVCoNLLDocumentReaderAndWriter();
	}

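	// Maps command-line properties (conll-in, conll-x, simple, toFeatures,
	// saveExtraColumns, loadClassifier) onto the reader/writer configuration.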
	void initializeFromProperties() {	
		if (props.getProperty("conll-in") != null) LVCoNLLDocumentReaderAndWriter.inputType = LVCoNLLDocumentReaderAndWriter.inputTypes.CONLL;
		if (props.getProperty("conll-x") != null) LVCoNLLDocumentReaderAndWriter.outputType = LVCoNLLDocumentReaderAndWriter.outputTypes.CONLL;
		if (props.getProperty("simple") != null) LVCoNLLDocumentReaderAndWriter.outputType = LVCoNLLDocumentReaderAndWriter.outputTypes.SIMPLE;
		if (props.getProperty("toFeatures") != null) LVCoNLLDocumentReaderAndWriter.outputType = LVCoNLLDocumentReaderAndWriter.outputTypes.INFEATURES;
		if (props.getProperty("saveExtraColumns") != null) LVCoNLLDocumentReaderAndWriter.saveExtraColumns = true;
		if (props.getProperty("loadClassifier") != null) defaultCrfClassifier = props.getProperty("loadClassifier");
		
		switch (outputType) {
		case SIMPLE:
			LVCoNLLDocumentReaderAndWriter.outputType = LVCoNLLDocumentReaderAndWriter.outputTypes.SIMPLE;
			break;
		case INFEATURES:
			LVCoNLLDocumentReaderAndWriter.outputType = LVCoNLLDocumentReaderAndWriter.outputTypes.INFEATURES;
			break;
		default:
			break;
		}
	}
	
	public void setReaderWriter(DocumentReaderAndWriter<CoreLabel> readerWriter) {
		this.defaultReaderWriter = readerWriter;
	}
	
	public DocumentReaderAndWriter<CoreLabel> getReaderWriter() {
		return this.defaultReaderWriter;
	}
	
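	// Reads documents from stdin, classifies them, and prints the answers.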
	public void classifyDocumentStdin(DocumentReaderAndWriter<CoreLabel> readerWriter)
			throws IOException 
	{
		classifier.classifyDocumentStdin(readerWriter);
	}	
		
	public void classifyDocumentStdin() throws IOException {
		classifyDocumentStdin(defaultReaderWriter);
	}
	
	public List<CoreLabel> classify(List<CoreLabel> document) {
		classifier.classify(document);
		return document;
	}
	
	public ObjectBank<List<CoreLabel>> classify(ObjectBank<List<CoreLabel>> documents) {
		for (List<CoreLabel> doc : documents) {
			classifier.classify(doc);
		}
		return documents;
	}

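	// Classifies every document in the given file and returns the tokens of
	// all documents concatenated, with their answer annotations set.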
	public List<CoreLabel> classify(String filename) {
		ObjectBank<List<CoreLabel>> ob = classifier.makeObjectBankFromFile(filename, defaultReaderWriter);
		List<CoreLabel> res = new ArrayList<>();
		for (List<CoreLabel> doc : ob) {
			classifier.classify(doc);
			res.addAll(doc);
		}
		return res;
	}
	
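	// Renders a classified document to a string instead of printing it to stdout.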
	public String getAnswerString(List<CoreLabel> doc) {
		return getAnswerString(doc, defaultReaderWriter);
	}
	
	public String getAnswerString(List<CoreLabel> doc, DocumentReaderAndWriter<CoreLabel> rw) {
		StringWriter bos = new StringWriter();
		PrintWriter printer = new PrintWriter(bos);
		rw.printAnswers(doc, printer);
		return bos.toString();
	}
	
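	// The writeAnswers variants print classified documents to stdout as UTF-8.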
	public void writeAnswers(List<CoreLabel> doc) {
		try {
			classifier.writeAnswers(doc,
			        IOUtils.encodedOutputStreamPrintWriter(System.out, "utf-8", true), defaultReaderWriter);
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
	
	public void writeAnswers(List<CoreLabel> doc, DocumentReaderAndWriter<CoreLabel> writer) {
		try {
			classifier.writeAnswers(doc,
			        IOUtils.encodedOutputStreamPrintWriter(System.out, "utf-8", true), writer);
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
	
	public void writeAnswers(ObjectBank<List<CoreLabel>> documents, DocumentReaderAndWriter<CoreLabel> writer) {
		try {
			for (List<CoreLabel> doc : documents) {
				classifier.writeAnswers(doc,
			        IOUtils.encodedOutputStreamPrintWriter(System.out, "utf-8", true), writer);
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
	
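	// Example invocation (the model path is illustrative; any serialized CRF
	// model for this pipeline can be supplied via -loadClassifier):
	//   java lv.lumii.ner.NerPipe -loadClassifier lv-ner-model.ser.gz -conll-in < in.conll > out.conll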
	public static void main(String[] args) throws IOException, ClassCastException, ClassNotFoundException {
		Properties props = StringUtils.argsToProperties(args);
		if (props.containsKey("h") || props.containsKey("help") || props.containsKey("?")) {
			System.out.println("LV Named Entity Recogniser");
			System.out.println("\nInput formats");
			System.out.println("\tDefault : conll-in");
			System.out.println("\t-conll-in : CONLL shared task data format - one line per token, with tab-delimited columns, sentences separated by blank lines.");
			System.out.println("\nOutput formats");
			System.out.println("\tDefault : conll-x");
			System.out.println("\t-conll-x : CONLL-X shared task data format - one line per token, with tab-delimited columns, sentences separated by blank lines.");;
			System.out.println("\t-simple : Simple compare format used for ner analysis");
			System.out.println("\t-toFeatures : add ner key and value to morphoFeature string");
			System.out.println("\nOther options:");
			System.out.println("\t-saveExtraColumns : save extra columns after typical conll input (6 columns)");
			System.out.println("\t-whiteList : files containing white list named entities (separated by comma)");
			System.out.flush();
			System.exit(0);
		}
		NerPipe ner = new NerPipe(props);
		ner.classifyDocumentStdin();
		
	}

	
//	private static void outputSentence(CRFClassifier<CoreLabel> crf,
//			PrintStream out, List<CoreLabel> sentence) {
//		sentence = crf.classify(sentence);
//		switch (outputType) {
//		default: 
//			out.println( output_CONLL(sentence) );    
//		}
//		out.flush();
//	}	
	
//	private static String output_CONLL(List<CoreLabel> tokens) {
//		StringBuilder s = new StringBuilder();
//		int counter = 1;
//		String eol = System.getProperty("line.separator");
//		for (CoreLabel word : tokens) {
//			String token = word.getString(TextAnnotation.class);
//			if (token.contains("")) continue;
//			token = token.replace(' ', '_');
//			
//			s.append(Integer.toString(counter));
//			s.append('\t');
//			s.append(token);
//			s.append('\t');
//			s.append(word.getString(LemmaAnnotation.class));
//			s.append('\t');
//			s.append(word.tag());
//			s.append('\t');
//			s.append(word.getString(FullTagAnnotation.class));
//			s.append('\t');
//			s.append(word.getString(MorphologyFeatureStringAnnotation.class));
//			s.append('\t');
//			String syntax = word.getString(ConllSyntaxAnnotation.class);
//			if (syntax != null) {
//				s.append(syntax);
//			}
//			else s.append("_\t_\t_\t_\t");
//			s.append(word.getString(AnswerAnnotation.class));
//			s.append('\t');
//			s.append(eol);
//			counter++;
//		}
//		
//		return s.toString();
//	}
	
//	public static List<List<CoreLabel>> readCONLL(BufferedReader in, CRFClassifier<CoreLabel> nerClassifier) throws IOException {
//		String s;
//	    List<CoreLabel> sentence = new LinkedList<CoreLabel>();
//	    List<List<CoreLabel>> result = new LinkedList<List<CoreLabel>>();
//		
//		boolean empty = true; // no words in the input
//	    
//	    while ((s = in.readLine()) != null) {
//	    	if (s.trim().length() > 0) {
//	    		String[] fields = s.split("\t");
//	    		String token = fields[1];
//	    		String lemma = fields[2];
//	    		String tag = fields[3];
//	    		String fullTag = fields[4];
//	    		String morphoFeatures = fields[5];
//	    		
//	    		//if (token.contains("")) continue;
//
//	    		CoreLabel word = new CoreLabel(); 
//	    		if (!token.equalsIgnoreCase("_")) token = token.replace('_', ' ');
//	    		
//				word.set(TextAnnotation.class, token);
//				word.setLemma(lemma);
//				word.setTag(fullTag.substring(0,1));
//				word.set(FullTagAnnotation.class, fullTag);
//				word.set(MorphologyFeatureStringAnnotation.class, morphoFeatures);
//				
//				word.set(ShapeAnnotation.class, WordShapeClassifier.wordShape(token, nerClassifier.flags.wordShape)); // does not add known lowercase (LC) words
//	
//	    		if (fields.length > 8) {
//	    			String syntax = fields[6] + "\t" + fields[7] + "\t" + fields[8] + "\t" + fields[9];
//	    			word.set(ConllSyntaxAnnotation.class, syntax);
//	    		}	    		
//	    		sentence.add(word);
//	    		empty = false;
//	    	} else {
//	    		result.add(sentence); 		
//	    		sentence = new LinkedList<CoreLabel>();
//	    		break; // stop reading sentence at first blank line
//	    	}
//	    }
//	    if (sentence.size() > 0) {
//	    	result.add(sentence);
//	    }
//	    if (empty) return null;
//		return result;
//	}

}