
cc.mallet.pipe.Csv2FeatureVector Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jcore-mallet-2.0.9 Show documentation
MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.
The newest version!
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
package cc.mallet.pipe;
import java.util.logging.*;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.Instance;
import cc.mallet.util.MalletLogger;
/**
* Converts a string of the form
* feature_1:val_1 feature_2:val_2 ... feature_k:val_k
* into a (sparse) FeatureVector.
*
* Features with no ":" character are assumed to have value 1.0.
*
* @author Gary Huang
*/
public class Csv2FeatureVector extends Pipe {

    private static final Logger logger = MalletLogger.getLogger(Csv2FeatureVector.class.getName());

    /**
     * Creates a pipe backed by a fresh data alphabet.
     *
     * @param capacity value handed to the {@link Alphabet} constructor
     *                 (presumably its initial capacity)
     */
    public Csv2FeatureVector(int capacity) {
        this.dataAlphabet = new Alphabet(capacity);
    }

    /** Creates a pipe whose data alphabet is constructed with a capacity of 1000. */
    public Csv2FeatureVector() {
        this(1000);
    }

    /**
     * Convert the data in the given Instance from a CharSequence
     * of sparse feature-value pairs to a FeatureVector.
     *
     * <p>The data is trimmed and split on runs of whitespace. For each token:
     * <ul>
     *   <li>if it contains no ":", or its last ":" is the first or the last
     *       character of the token, the whole token is used as the feature
     *       name and the value is 1.0;</li>
     *   <li>otherwise the text before the last ":" is the feature name and
     *       the text after it is parsed as a double.</li>
     * </ul>
     * Feature names are looked up in the data alphabet with the
     * add-if-absent flag set to {@code true}.
     *
     * <p>Empty or whitespace-only data yields a FeatureVector built from
     * zero-length key/value arrays; no feature is registered for it.
     *
     * @param carrier instance whose data is a CharSequence of feature tokens
     * @return the same instance, with its data replaced by the FeatureVector
     * @throws ClassCastException    if the instance data is not a CharSequence
     * @throws NullPointerException  if the instance data is null
     * @throws NumberFormatException if the text after a token's last ":"
     *                               cannot be parsed by {@link Double#parseDouble(String)}
     */
    public Instance pipe(Instance carrier) {
        CharSequence c = (CharSequence) carrier.getData();
        String text = c.toString().trim();

        // "".split("\\s+") returns [""], which would register a bogus
        // empty-named feature; treat blank input as having no tokens at all.
        String[] pairs = text.isEmpty() ? new String[0] : text.split("\\s+");

        int[] keys = new int[pairs.length];
        double[] values = new double[pairs.length];

        for (int i = 0; i < pairs.length; i++) {
            String pair = pairs[i];
            // Split on the LAST ':' so feature names may themselves contain ':'.
            int delimIndex = pair.lastIndexOf(":");

            if (delimIndex <= 0 || delimIndex == (pair.length() - 1)) {
                // No usable name:value split -> the whole token is the feature name.
                keys[i] = dataAlphabet.lookupIndex(pair, true);
                values[i] = 1.0;
            }
            else {
                keys[i] = dataAlphabet.lookupIndex(pair.substring(0, delimIndex), true);
                values[i] = Double.parseDouble(pair.substring(delimIndex + 1));
            }
        }

        // [removed code that sorted indices but NOT values -DM]
        FeatureVector fv = new FeatureVector(dataAlphabet, keys, values);
        carrier.setData( fv );
        return carrier;
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy