All Downloads are FREE. Search and download functionalities are using the official Maven repository.

cc.mallet.pipe.tsf.OffsetFeatureConjunction Maven / Gradle / Ivy

Go to download

MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.

The newest version!
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */




/**
	 Create new feature from the conjunction of features from given
    offsets that match given regular expressions.  This can be seen
    as hand-coding in a few of the conjunctions that you'd get
    from {@link OffsetConjunctions}.
 

For example, creating a pipe with new OffsetFeatureConjunction ("TIME", new String[] { "number", "W=:" "number" }, new int[] { 0, 1, 2 }) will create a feature that is true whenever all of (a) a feature at the current time matches "number" (b) a feature at the next time step matches "W=:" (b) a feature 2 timesteps from now match "number", so that you have a simple time detector.

If the conjunction passes, then either the first timestep (that is, the one all the offsets were computed from), or all matching timesteps, get the feature "TIME" --- depending on the value of the field tagAllTimesteps. @author Charles Sutton [email protected] @author Andrew McCallum [email protected] */ package cc.mallet.pipe.tsf; import java.io.IOException; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.io.Serializable; import java.util.regex.Pattern; import cc.mallet.pipe.Pipe; import cc.mallet.types.Instance; import cc.mallet.types.Token; import cc.mallet.types.TokenSequence; import cc.mallet.util.PropertyList; public class OffsetFeatureConjunction extends Pipe implements Serializable { private String thisFeatureName; private Pattern[] featurePatterns; private int[] offsets; private boolean[] isNonNegated; private boolean tagAllTimesteps; /** * Create a Pipe for adding conjunctions of specified features. * @param thisFeatureName Name of this conjunction feature. * @param featureNames String giving name for each subfeature i. * @param offsets For each subfeature i, which offset from the current timestep * must i appear at. * @param isNonNegated If element i is false, then the negation of the * feature is added to the conjuction. */ public OffsetFeatureConjunction (String thisFeatureName, String[] featureNames, int[] offsets, boolean[] isNonNegated, boolean tagAllTimesteps) { this.thisFeatureName = thisFeatureName; this.featurePatterns = patternify (featureNames); this.offsets = offsets; this.isNonNegated = isNonNegated; this.tagAllTimesteps = tagAllTimesteps; } private static boolean[] trueArray (int length) { boolean[] ret = new boolean[length]; for (int i = 0; i < length; i++) ret[i] = true; return ret; } private Pattern[] patternify (String[] regex) { Pattern[] retval = new Pattern [regex.length]; for (int i = 0; i < regex.length; i++) { retval [i] = Pattern.compile (regex[i]); } return retval; } public OffsetFeatureConjunction (String thisFeatureName, String[] featureNames, int[] offsets, boolean tagAllTimesteps) { this (thisFeatureName, featureNames, offsets, trueArray(featureNames.length), tagAllTimesteps); } public OffsetFeatureConjunction (String thisFeatureName, String[] featureNames, int[] offsets) { this (thisFeatureName, featureNames, offsets, trueArray(featureNames.length), false); } public boolean isTagAllTimesteps () { return tagAllTimesteps; } public String getFeatureName () { return thisFeatureName; } public Pattern[] getFeaturePatterns () { return featurePatterns; } public int[] getOffsets () { return offsets; } public boolean[] getNonNegated () { return isNonNegated; } public Instance pipe (Instance carrier) { TokenSequence ts = (TokenSequence) carrier.getData(); int tsSize = ts.size(); for (int t = 0; t < tsSize; t++) { // Check whether the conjunction is true at time step t boolean passes = true; for (int fnum = 0; fnum < featurePatterns.length; fnum++) { int pos = t + offsets[fnum]; if (!(pos >= 0 && pos < tsSize)) { passes = false; break; } boolean featurePresent = hasMatchingFeature (ts.get(pos), featurePatterns [fnum]); if (featurePresent != isNonNegated [fnum]) { passes = false; break; } } if (passes) { if (tagAllTimesteps) { for (int fnum = 0; fnum < featurePatterns.length; fnum++) { int pos = t + offsets[fnum]; ts.get(pos).setFeatureValue (thisFeatureName, 1.0); } } else { ts.get(t).setFeatureValue (thisFeatureName, 1.0); } } } return carrier; } private boolean hasMatchingFeature (Token token, Pattern pattern) { PropertyList.Iterator iter = token.getFeatures ().iterator (); while (iter.hasNext()) { iter.next(); if (pattern.matcher (iter.getKey()). matches ()) { if (iter.getNumericValue() == 1.0) { return true; } } } return false; } // Serialization private static final long serialVersionUID = 1; private static final int CURRENT_SERIAL_VERSION = 1; private static final int NULL_INTEGER = -1; private void writeObject (ObjectOutputStream out) throws IOException { out.writeInt (CURRENT_SERIAL_VERSION); out.writeObject (thisFeatureName); out.writeBoolean (tagAllTimesteps); int size; size = (featurePatterns == null) ? NULL_INTEGER : featurePatterns.length; out.writeInt(size); if (size != NULL_INTEGER) { for (int i = 0; i = 1) tagAllTimesteps = in.readBoolean (); size = in.readInt(); if (size == NULL_INTEGER) { featurePatterns = null; offsets = null; isNonNegated = null; } else { featurePatterns = new Pattern[size]; offsets = new int[size]; isNonNegated = new boolean[size]; for (int i = 0; i < size; i++) { featurePatterns[i] = (Pattern) in.readObject(); offsets[i] = in.readInt(); isNonNegated[i] = in.readBoolean(); } } } }