// cc.mallet.pipe.TokenSequenceParseFeatureString, from the jcore-mallet-2.0.9 artifact (Maven / Gradle / Ivy).
//
// MALLET is a Java-based package for statistical natural language processing, document
// classification, clustering, topic modeling, information extraction, and other machine
// learning applications to text.
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */




package cc.mallet.pipe;

import java.io.*;
import java.util.regex.Pattern;

import cc.mallet.types.Instance;
import cc.mallet.types.Token;
import cc.mallet.types.TokenSequence;

/**
 * Parses the text of every {@link Token} in a {@link TokenSequence} into
 * features that are set on that same token.
 *
 * <p>The token text is split on whitespace into data items, and each item is
 * interpreted in one of three ways:
 *
 * <ul>
 *  <li>If your data consists of feature/value pairs
 *   (e.g. <code>height=10.7 width=3.6 length=1.7</code>), use
 *   <code>new TokenSequenceParseFeatureString(true, true, "=")</code>. This
 *   format is typically used for sparse data, in which most features are
 *   equal to 0 in any given instance. An item that does not split into
 *   exactly one name and one value is added as the binary feature
 *   <code>"Token=" + item</code> with value 1.0.</li>
 *  <li>If your data consists only of values, and the position determines
 *   which feature the value is for (e.g. <code>10.7 3.6 1.7</code>), use
 *   <code>new TokenSequenceParseFeatureString(true)</code>. The K-th item
 *   (starting at 0) becomes the value of the feature <code>"Feature#K"</code>.
 *   This format is typically used for data that has a small number of
 *   features that all have non-zero values most of the time.</li>
 *  <li>If your data is in the form of named binary indicator variables
 *   (e.g. <code>yellow quacks has_webbed_feet</code>), use
 *   <code>new TokenSequenceParseFeatureString(false)</code>. Each item is
 *   interpreted as the name of a feature whose value is 1.0.</li>
 * </ul>
 *
 * <p>The name/value separator (<code>"="</code> by default) can be specified
 * and is matched literally, not as a regular expression.
 *
 * @author Aron Culotta
 */

public class TokenSequenceParseFeatureString extends Pipe implements Serializable
{
	/** Splits token text into data items; compiled once rather than once per token. */
	private static final Pattern WHITESPACE = Pattern.compile ("\\s+");

	/** Separator used when none is given, and for streams that predate the field. */
	private static final String DEFAULT_NAME_VALUE_SEPARATOR = "=";

	boolean realValued; // are these real-valued features?
	boolean specifyFeatureNames; // are the feature names given as well?
	String nameValueSeparator; // what separates the name from the value? (CAN'T BE WHITESPACE!)
	
	/**
	 *  @param _realValued interpret each data item as a double, and associate it with a 
	 *     feature called "Feature#K" where K is the order of the item, starting with 0. 
	 *     Note that this option is currently ignored if _specifyFeatureNames is true.
	 *  @param _specifyFeatureNames interpret each data item as a feature name/value pair,
	 *     separated by _nameValueSeparator.
	 *  @param _nameValueSeparator the literal string that separates a name from its value.
	 *     Colon (":") is a common alternative to "=". The string is matched verbatim, so
	 *     regular-expression metacharacters such as "|" or "." are safe to use. It must not
	 *     be blank; a separator containing whitespace can never match, because the text has
	 *     already been split on whitespace.
	 *  @throws NullPointerException if _nameValueSeparator is null
	 *  @throws IllegalArgumentException if _nameValueSeparator is empty or only whitespace
	 */
	public TokenSequenceParseFeatureString (boolean _realValued, boolean _specifyFeatureNames, String _nameValueSeparator) {
		if (_nameValueSeparator == null) {
			// Same exception type as the implicit failure of trim() on null, but with a message.
			throw new NullPointerException ("nameValueSeparator must not be null");
		}
		if (_nameValueSeparator.trim().length() == 0) {
			throw new IllegalArgumentException ("nameValueSeparator can't be whitespace");
		}
		this.realValued = _realValued;
		this.specifyFeatureNames = _specifyFeatureNames;
		this.nameValueSeparator = _nameValueSeparator;
	}

	/**
	 *  Same as the three-argument constructor, with "=" as the name/value separator.
	 */
	public TokenSequenceParseFeatureString (boolean _realValued, boolean _specifyFeatureNames) {
		this (_realValued, _specifyFeatureNames, DEFAULT_NAME_VALUE_SEPARATOR);
	}
	
	/**
	 *  Creates a pipe that does not expect feature names: items are positional
	 *  real values if _realValued is true, binary indicator names otherwise.
	 */
	public TokenSequenceParseFeatureString (boolean _realValued) {
		this (_realValued, false, DEFAULT_NAME_VALUE_SEPARATOR);
	}

	
	/**
	 *  Sets features on every token of the carrier's TokenSequence, derived from
	 *  the whitespace-separated items of the token's text.
	 *
	 *  Empty items (produced by empty text or by leading whitespace) are skipped
	 *  and do not consume a position in real-valued mode.
	 *
	 *  @param carrier instance whose data is cast to a TokenSequence
	 *  @return the same carrier, with its data set to the processed sequence
	 *  @throws NumberFormatException if a value that must be numeric cannot be
	 *     parsed by Double.parseDouble
	 */
	public Instance pipe (Instance carrier) {
		TokenSequence ts = (TokenSequence) carrier.getData ();
		// LITERAL: the separator is a plain string, so "|" or "." must not be read as regex syntax.
		Pattern separator = specifyFeatureNames
			? Pattern.compile (nameValueSeparator, Pattern.LITERAL)
			: null;
		for (int i = 0; i < ts.size(); i++) {
			Token t = ts.get (i);
			String[] values = WHITESPACE.split (t.getText());
			int position = 0; // index among the non-empty items; names "Feature#K"
			for (int j = 0; j < values.length; j++) {
				String value = values[j];
				if (value.length() == 0) {
					// Artifact of splitting empty text or text with leading whitespace.
					continue;
				}
				if (specifyFeatureNames) {
					String[] nameAndValue = separator.split (value);
					if (nameAndValue.length != 2) { // no feature name. use token as feature.
						t.setFeatureValue ("Token=" + value, 1.0);
					}
					else {
						t.setFeatureValue (nameAndValue[0], Double.parseDouble (nameAndValue[1]));
					}
				}
				else if (realValued) {
					t.setFeatureValue ("Feature#" + position, Double.parseDouble (value));
				}
				else {
					t.setFeatureValue (value, 1.0);
				}
				position++;
			}
		}
		carrier.setData (ts);
		return carrier;
	}

	// Serialization 
	
	private static final long serialVersionUID = 1;
	private static final int CURRENT_SERIAL_VERSION = 1;
	
	/**
	 *  Writes the format version followed by the three configuration fields.
	 */
	private void writeObject (ObjectOutputStream out) throws IOException {
		out.writeInt (CURRENT_SERIAL_VERSION);
		out.writeBoolean (realValued);
		out.writeBoolean (specifyFeatureNames);
		out.writeObject (nameValueSeparator);
	}
	
	/**
	 *  Reads the fields written by writeObject. Streams with a version below 1
	 *  carry only realValued; the remaining fields then get their defaults.
	 */
	private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
		int version = in.readInt ();
		realValued = in.readBoolean ();
		// Compare against the version that introduced these fields, not against
		// CURRENT_SERIAL_VERSION, so that a future version bump keeps reading version-1 streams.
		if (version >= 1) {
			specifyFeatureNames = in.readBoolean();
			nameValueSeparator = (String)in.readObject();
		}
		else {
			specifyFeatureNames = false;
			nameValueSeparator = DEFAULT_NAME_VALUE_SEPARATOR;
		}
	}
}