All Downloads are FREE. Search and download functionalities are using the official Maven repository.

cc.mallet.share.casutton.ner.ConllNer2003Sentence2TokenSequence Maven / Gradle / Ivy

Go to download

MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.

The newest version!
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */


/** 
   @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 */

/*
	An error?  CoNLLTrue MalletTrue MalletPred
	O O O
	I-MISC B-MISC B-MISC
	B-MISC B-MISC I-MISC
	I-MISC B-MISC I-MISC
	O O O
	O O O
	O O O
*/

package cc.mallet.share.casutton.ner; // Generated package name


import java.util.regex.*;

import cc.mallet.extract.StringSpan;
import cc.mallet.extract.StringTokenization;
import cc.mallet.pipe.*;
import cc.mallet.types.*;

/**
 * Reads one sentence of a data file in CoNLL 2003 format, and makes some
 *  simple transformations.
 *
 * The instance data is expected to be a String holding one token per line,
 *  with space-separated fields in the order: word, tag, phrase, target.
 *
 * Unlike the version in mccallum.ner, does not expect fields in
 *  the data file for tags and phrases if those features are off.  Does
 *  not look for target field if isTargetProcessing() is false.
 */
public class ConllNer2003Sentence2TokenSequence extends Pipe
{
	static final String[] endings = new String[]
	{"ing", "ed", "ogy", "s", "ly", "ion", "tion", "ity", "ies"};
	static Pattern[] endingPatterns = new Pattern[endings.length];
	// Indexed by {forward,backward} {0,1,2 offset} {ending char ngram index}
	static final String[][][] endingNames = new String[2][3][endings.length];

	// Static (not instance) initializer: the tables above are static, so they
	// must be filled exactly once at class-load time, independently of whether
	// or how an instance is constructed (e.g. via deserialization).
	static {
		for (int i = 0; i < endings.length; i++) {
			endingPatterns[i] = Pattern.compile (".*"+endings[i]+"$");
			for (int j = 0; j < 3; j++) {
				for (int k = 0; k < 2; k++)
					// The ending is part of the name so that each ending yields its own feature.
					endingNames[k][j][i] = "W"+(k==1?"-":"")+j+"=<END"+endings[i]+">";
			}
		}
	}

	/** Placeholder word/tag/phrase used for a blank line inside the instance data. */
	private static final String BOUNDARY = "-<S>-";

	/** Matches labels of the form "I-XXX". */
	private static final Pattern INSIDE_LABEL = Pattern.compile ("I-.*");

	/**
	 * Whole-word patterns tried in order by {@link #collapseDigits}; the first
	 * match wins.  Entry i is replaced by COLLAPSE_REPLACEMENTS[i].
	 * (A former entry "19\d\d-\d\d-\d--d" was a typo'd duplicate of the
	 * date pattern below and has been dropped.)
	 */
	private static final Pattern[] COLLAPSE_PATTERNS = new Pattern[] {
		Pattern.compile ("19\\d\\d"),
		Pattern.compile ("19\\d\\ds"),
		Pattern.compile ("19\\d\\d-\\d+"),
		Pattern.compile ("\\d+\\\\/\\d"),
		Pattern.compile ("\\d[\\d,\\.]*"),
		Pattern.compile ("19\\d\\d-\\d\\d-\\d\\d"),
		Pattern.compile (".*-led"),
		Pattern.compile (".*-sponsored"),
	};

	/** Replacement words, parallel to COLLAPSE_PATTERNS. */
	private static final String[] COLLAPSE_REPLACEMENTS = new String[] {
		"<YEAR>",
		"<YEARDECADE>",
		"<YEARSPAN>",
		"<FRACTION>",
		"<DIGITS>",
		"<DATELINEDATE>",
		"<LED>",
		"<LED>",
	};

	// If true, the reconstructed sentence text is stored as the instance source.
	boolean saveSource = true;
	boolean doConjunctions = false;
	// If true, a tag field is read from each line and added as a "T=" feature.
	boolean doTags = true;
	// If true, a phrase field is read from each line and added as a "P=" feature.
	boolean doPhrases = true;
	// If true, word-ending features are added to each token.
	boolean doSpelling = false;
	// If true, years, numbers, dates etc. are replaced by placeholder words.
	boolean doDigitCollapses = true;
	// If true, words are lower-cased (after digit collapsing).
	boolean doDowncasing = false;

	/** Creates a pipe that reads both the tag and the phrase field. */
	public ConllNer2003Sentence2TokenSequence ()
	{
		super (null, new LabelAlphabet());
	}

	/**
	 * Creates a pipe that reads only the selected optional fields.
	 *
	 * @param useTags    whether each line carries a tag field
	 * @param usePhrases whether each line carries a phrase field
	 */
	public ConllNer2003Sentence2TokenSequence (boolean useTags, boolean usePhrases)
	{
		super (null, new LabelAlphabet());
		this.doTags = useTags;
		this.doPhrases = usePhrases;
	}

	/* Lines look like this:
		 -DOCSTART- -X- -X- O

		 EU NNP I-NP I-ORG
		 rejects VBZ I-VP O
		 German JJ I-NP I-MISC
		 call NN I-NP O
		 to TO I-VP O
		 boycott VB I-VP O
		 British JJ I-NP I-MISC
		 lamb NN I-NP O
		 . . O O

		 Peter NNP I-NP I-PER
		 Blackburn NNP I-NP I-PER

		 BRUSSELS NNP I-NP I-LOC
		 1996-08-22 CD I-NP O

		 The DT I-NP O
		 European NNP I-NP I-ORG
		 Commission NNP I-NP I-ORG
		 said VBD I-VP O
		 on IN I-PP O
		 ...
	*/

	/**
	 * Converts the newline-separated token lines held in the carrier's data
	 * into a token sequence (set as the new data) and, when target processing
	 * is on, a label sequence (set as the target).
	 *
	 * @param carrier instance whose data is a String of token lines
	 * @return the same instance, with data (and optionally target and source) replaced
	 * @throws IllegalArgumentException if a non-empty line has fewer fields than expected
	 */
	public Instance pipe (Instance carrier)
	{
		String sentenceLines = (String) carrier.getData();
		String[] tokens = sentenceLines.split ("\n");
		boolean targetProcessing = isTargetProcessing ();
		LabelSequence target = new LabelSequence ((LabelAlphabet)getTargetAlphabet(), tokens.length);
		// Always build the text buffer: the token spans index into it even when
		// the caller does not want it saved as the instance source.
		StringBuffer source = new StringBuffer();
		TokenSequence data = new StringTokenization (source);

		String prevLabel = "NOLABEL";

		for (int i = 0; i < tokens.length; i++) {
			String word, tag = null, phrase = null, label = null;
			boolean boundary = tokens[i].length() == 0;

			if (boundary) {
				word = BOUNDARY;
				tag = BOUNDARY;
				phrase = BOUNDARY;
				label = "O";
			} else {
				String[] features = tokens[i].split (" ");
				int expectedFields = 1 + (doTags ? 1 : 0) + (doPhrases ? 1 : 0) + (targetProcessing ? 1 : 0);
				if (features.length < expectedFields) {
					throw new IllegalArgumentException ("Invalid line "+tokens[i]+" : expected word "
						+ (doTags ? ", tag" : "")
						+ (doPhrases ? ", phrase" : "")
						+ (targetProcessing ? ", target" : "")
						+ ".");
				}
				int fieldIdx = 0;
				word = features[fieldIdx++];
				if (doTags) tag = features[fieldIdx++];
				if (doPhrases) phrase = features[fieldIdx++];
				if (targetProcessing) label = features[fieldIdx++];
			}

			// Transformations
			if (doDigitCollapses)
				word = collapseDigits (word);
			if (doDowncasing)
				word = word.toLowerCase();

			// NOTE(review): as in the original, the span starts before the
			// blank-line separator, so a boundary token's text includes it.
			int start = source.length ();
			if (boundary) source.append ("\n\n");
			source.append (word);
			source.append (" ");
			// The span ends at the index of the trailing separator space.
			Token token = new StringSpan (source, start, source.length () - 1);

			// Word-ending features at the current position
			if (doSpelling) {
				for (int j = 0; j < endings.length; j++) {
					if (endingPatterns[j].matcher(word).matches())
						token.setFeatureValue (endingNames[0][0][j], 1);
				}
			}

			if (doTags) {
				token.setFeatureValue ("T="+tag, 1);
			}

			if (doPhrases) {
				token.setFeatureValue ("P="+phrase, 1);
			}

			data.add (token);

			if (targetProcessing) {
				target.add (toBeginLabel (label, prevLabel));
				// The comparison for the next token uses the label as read from the file.
				prevLabel = label;
			}
		}

		carrier.setData(data);
		if (targetProcessing) carrier.setTarget(target);
		if (saveSource) carrier.setSource(source);

		return carrier;
	}

	/** Returns the placeholder for the first collapse pattern matching the whole word, else the word itself. */
	private static String collapseDigits (String word)
	{
		for (int i = 0; i < COLLAPSE_PATTERNS.length; i++) {
			if (COLLAPSE_PATTERNS[i].matcher(word).matches())
				return COLLAPSE_REPLACEMENTS[i];
		}
		return word;
	}

	/**
	 * Rewrites "I-XXX" to "B-XXX" when the previous raw label does not carry the
	 * same "XXX" suffix, so that each segment always begins with a "B-" label.
	 */
	private static String toBeginLabel (String label, String prevLabel)
	{
		if (INSIDE_LABEL.matcher(label).matches ()
				&& (prevLabel.length() < 3		// prevLabel is "O"
						|| !prevLabel.substring(2).equals (label.substring(2)))) {
			return "B" + label.substring(1);
		}
		return label;
	}

	// serialization garbage

	private static final long serialVersionUID = -7326674871670572522L;
}