
cc.mallet.pipe.FeatureValueString2FeatureVector Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mallet Show documentation
Show all versions of mallet Show documentation
MALLET is a Java-based package for statistical natural language processing,
document classification, clustering, topic modeling, information extraction,
and other machine learning applications to text.
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/**
Convert a String containing whitespace-separated feature-name=floating-point-value pairs
into a FeatureVector. For example:
<pre>
length=12 width=1.75 blue temperature=-17.2
</pre>
Features without a corresponding value (i.e. those not including the character "=",
such as the feature <code>blue</code> here) will be set to 1.0.
<p>
If a feature occurs more than once in the input string, the values of each
occurrence will be added.
@author David Mimno and Andrew McCallum
*/
package cc.mallet.pipe;
import java.io.*;
import cc.mallet.types.Alphabet;
import cc.mallet.types.Instance;
import cc.mallet.types.FeatureVector;
/**
 * Pipe that replaces an instance's data with a {@link FeatureVector} parsed
 * from the data's string form.
 *
 * <p>The string is split on runs of whitespace. Each resulting token is either
 * a bare feature name, which receives the value 1.0, or a
 * <code>name=value</code> pair, where everything after the first "=" must be
 * parseable by {@link Double#parseDouble(String)}. Empty input, and input that
 * starts with whitespace, contributes no empty-named feature.
 *
 * <p>Repeated feature names are passed to the {@link FeatureVector}
 * constructor unchanged; how duplicates are combined is decided there.
 *
 * <p>Maintenance note: this class relies on the default serialized form (no
 * explicit serialVersionUID is declared), so only private static helpers are
 * added here to avoid disturbing it.
 */
public class FeatureValueString2FeatureVector extends Pipe implements Serializable {

    /**
     * Creates a pipe that records feature names in the supplied alphabet.
     *
     * @param dataDict alphabet used as this pipe's data alphabet
     */
    public FeatureValueString2FeatureVector (Alphabet dataDict) {
        super (dataDict, null);
    }

    /** Creates a pipe backed by a fresh, empty data alphabet. */
    public FeatureValueString2FeatureVector () {
        super(new Alphabet(), null);
    }

    /**
     * Parses the carrier's data (via <code>toString()</code>) into a
     * {@link FeatureVector} and stores it back as the carrier's data.
     *
     * @param carrier instance whose data holds the feature string
     * @return the same instance, with its data replaced by the feature vector
     * @throws NumberFormatException if a token contains "=" but the text after
     *         the first "=" is not a valid floating-point number (for example
     *         <code>width=</code> or <code>a=1=2</code>)
     */
    public Instance pipe (Instance carrier) {
        String[] tokens = carrier.getData().toString().split("\\s+");

        // split() yields one leading empty token when the input is empty or
        // starts with whitespace; skip it so no feature named "" is created.
        int first = (tokens.length > 0 && tokens[0].isEmpty()) ? 1 : 0;
        int numFeatures = tokens.length - first;

        Object[] featureNames = new Object[numFeatures];
        double[] featureValues = new double[numFeatures];

        for (int i = 0; i < numFeatures; i++) {
            String token = tokens[first + i];
            int separator = token.indexOf('=');
            if (separator < 0) {
                // Bare feature name: presence is encoded as 1.0.
                featureNames[i] = token;
                featureValues[i] = 1.0;
            }
            else {
                featureNames[i] = token.substring(0, separator);
                featureValues[i] = parseValue(token, separator);
            }
        }

        carrier.setData(new FeatureVector(getDataAlphabet(), featureNames, featureValues));
        return carrier;
    }

    /**
     * Parses the numeric part of a <code>name=value</code> token.
     *
     * @param token the complete token, kept for the error message
     * @param separator index of the first "=" in <code>token</code>
     * @return the parsed value
     * @throws NumberFormatException naming the offending token if the value
     *         text is empty or not a valid double
     */
    private static double parseValue (String token, int separator) {
        try {
            return Double.parseDouble(token.substring(separator + 1));
        }
        catch (NumberFormatException e) {
            // NumberFormatException has no cause-taking constructor, so the
            // original failure is attached with initCause.
            NumberFormatException detailed = new NumberFormatException(
                "Cannot parse numeric value in feature token \"" + token + "\"");
            detailed.initCause(e);
            throw detailed;
        }
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy