
cc.mallet.pipe.tsf.FeaturesInWindow Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mallet Show documentation
Show all versions of mallet Show documentation
MALLET is a Java-based package for statistical natural language processing,
document classification, clustering, topic modeling, information extraction,
and other machine learning applications to text.
The newest version!
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/**
Create new features from features (matching a regex within a window +/- the current position).
For example,
FeaturesInWindow p = new FeaturesInWindow("PREV-", -1, 1, Pattern.compile("POS-.*"), true)
will create a pipe that adds a feature to the current position for each
feature in the previous starting with "POS-". So if the previous position
has "POS-NN" we add "PREV-POS-NN". The last argument to the constructor is
currently ignored. The alternative constructor matches all patterns, so:
FeaturesInWindow p = new FeaturesInWindow(s, l, r);
is equivalent to
FeaturesInWindow p = new FeaturesInWindow("PREV-", -1, 1, Pattern.compile(".*"), true);
but more efficient, since we don't actually check using the Pattern.
@author Andrew McCallum [email protected]
*/
package cc.mallet.pipe.tsf;
import java.io.*;
import java.util.regex.*;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Instance;
import cc.mallet.types.Token;
import cc.mallet.types.TokenSequence;
import cc.mallet.util.PropertyList;
public class FeaturesInWindow extends Pipe implements Serializable
{
String namePrefix, namePrefixLeft;
int leftBoundary;
int rightBoundary;
Pattern featureRegex;
boolean includeBeginEndBoundaries;
boolean includeCurrentToken = false;
private static final int maxWindowSize = 20;
private static final PropertyList[] startfs = new PropertyList[maxWindowSize];
private static final PropertyList[] endfs = new PropertyList[maxWindowSize];
static {
initStartEndFs ();
}
private static void initStartEndFs ()
{
for (int i = 0; i < maxWindowSize; i++) {
startfs[i] = PropertyList.add ("", 1.0, null);
endfs[i] = PropertyList.add ("", 1.0, null);
}
}
/** @param namePrefix what to prepend to feature names
* @param leftBoundaryOffset left boundary of the window (e.g. -1 means
* include the previous word
* @param rightBoundaryOffset right boundary for this window (e.g. 1 means
* include the current position, but not the next
* @param featureRegex add only for features matching this (null = always match
* @param includeBeginEndBoundaries ignored
*/
public FeaturesInWindow (String namePrefix, int leftBoundaryOffset, int rightBoundaryOffset,
Pattern featureRegex, boolean includeBeginEndBoundaries)
{
this.namePrefix = namePrefix;
this.leftBoundary = leftBoundaryOffset;
this.rightBoundary = rightBoundaryOffset;
this.featureRegex = featureRegex;
this.includeBeginEndBoundaries = includeBeginEndBoundaries;
}
/**
equivalent to
FeaturesInWindow((namePrefix, leftBoundaryOffset, rightBoundaryOffset, null, true);
*/
public FeaturesInWindow (String namePrefix, int leftBoundaryOffset, int rightBoundaryOffset)
{
this (namePrefix, leftBoundaryOffset, rightBoundaryOffset, null, true);
}
public Instance pipe (Instance carrier)
{
TokenSequence ts = (TokenSequence) carrier.getData();
int tsSize = ts.size();
PropertyList[] newFeatures = new PropertyList[tsSize];
for (int i = 0; i < tsSize; i++) {
Token t = ts.get (i);
PropertyList pl = t.getFeatures();
newFeatures[i] = pl;
for (int position = i + leftBoundary; position < i + rightBoundary; position++) {
if (position == i && !includeCurrentToken)
continue;
PropertyList pl2;
if (position < 0)
pl2 = startfs[-position];
else if (position >= tsSize)
pl2 = endfs[position-tsSize];
else
pl2 = ts.get(position).getFeatures ();
PropertyList.Iterator pl2i = pl2.iterator();
while (pl2i.hasNext()) {
pl2i.next();
String key = pl2i.getKey();
if (featureRegex == null || featureRegex.matcher(key).matches()) {
newFeatures[i] = PropertyList.add ((namePrefixLeft == null || position-i>0 ? namePrefix : namePrefixLeft)+key,
pl2i.getNumericValue(), newFeatures[i]);
}
}
}
}
for (int i = 0; i < tsSize; i++) {
// Put the new PropertyLists in place
ts.get (i).setFeatures (newFeatures[i]);
}
return carrier;
}
// Serialization
private static final long serialVersionUID = 1;
private static final int CURRENT_SERIAL_VERSION = 0;
private void writeObject (ObjectOutputStream out) throws IOException {
out.writeInt (CURRENT_SERIAL_VERSION);
out.writeObject (namePrefix);
out.writeInt (leftBoundary);
out.writeInt (rightBoundary);
out.writeObject (featureRegex);
out.writeBoolean (includeBeginEndBoundaries);
}
private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
int version = in.readInt ();
namePrefix = (String) in.readObject();
leftBoundary = in.readInt ();
rightBoundary = in.readInt ();
featureRegex = (Pattern) in.readObject();
includeBeginEndBoundaries = in.readBoolean();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy