
cc.mallet.pipe.Csv2FeatureVector Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mallet Show documentation
Show all versions of mallet Show documentation
MALLET is a Java-based package for statistical natural language processing,
document classification, clustering, topic modeling, information extraction,
and other machine learning applications to text.
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
package cc.mallet.pipe;
import java.util.logging.*;
import java.util.*;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.Instance;
import cc.mallet.types.Labeling;
import cc.mallet.util.MalletLogger;
/**
 * Converts a string of the form
 * <pre>feature_1:val_1 feature_2:val_2 ... feature_k:val_k</pre>
 * into a (sparse) FeatureVector.
 *
 * <p>Despite the class name, tokens are separated by whitespace, not commas.
 * Each token is split at its <em>last</em> ":" character, so feature names may
 * themselves contain ":".
 *
 * <p>Features with no ":" character are assumed to have value 1.0. The same
 * applies to a token whose only ":" is its first or its last character: the
 * whole token (including the ":") is used as the feature name.
 *
 * @author Gary Huang
 */
public class Csv2FeatureVector extends Pipe {

    private static final Logger logger = MalletLogger.getLogger(Csv2FeatureVector.class.getName());

    /** Token separator (one or more whitespace characters), compiled once instead of per call. */
    private static final java.util.regex.Pattern WHITESPACE = java.util.regex.Pattern.compile("\\s+");

    /**
     * Creates a pipe with a fresh data alphabet.
     *
     * @param capacity value passed to the {@code Alphabet} constructor
     *                 (presumably its initial capacity; confirm against Alphabet)
     */
    public Csv2FeatureVector(int capacity) {
        this.dataAlphabet = new Alphabet(capacity);
    }

    /** Creates a pipe whose data alphabet is constructed with a capacity argument of 1000. */
    public Csv2FeatureVector() {
        this(1000);
    }

    /**
     * Convert the data in the given Instance from a CharSequence
     * of sparse feature-value pairs to a FeatureVector.
     *
     * <p>Blank data (empty or whitespace only) yields a FeatureVector with no
     * entries and leaves the data alphabet untouched.
     *
     * @param carrier instance whose data is a CharSequence of whitespace-separated
     *                {@code feature:value} tokens; its data is replaced in place
     * @return the same {@code carrier}, now holding the FeatureVector as its data
     * @throws NumberFormatException if the text after the last ":" of a token
     *                               is not parseable by {@link Double#parseDouble(String)}
     */
    public Instance pipe(Instance carrier) {
        CharSequence c = (CharSequence) carrier.getData();
        String text = c.toString().trim();

        // Splitting an empty string yields one empty token (not zero tokens), which
        // would otherwise be registered as a feature named "" with value 1.0.
        String[] pairs = text.isEmpty() ? new String[0] : WHITESPACE.split(text);

        int[] keys = new int[pairs.length];
        double[] values = new double[pairs.length];
        for (int i = 0; i < pairs.length; i++) {
            String pair = pairs[i];
            int delimIndex = pair.lastIndexOf(':');
            if (delimIndex <= 0 || delimIndex == pair.length() - 1) {
                // No usable "name:value" split (no ":", or ":" is the first or the
                // last character): the whole token is the feature name.
                keys[i] = dataAlphabet.lookupIndex(pair, true);
                values[i] = 1.0;
            } else {
                keys[i] = dataAlphabet.lookupIndex(pair.substring(0, delimIndex), true);
                values[i] = Double.parseDouble(pair.substring(delimIndex + 1));
            }
        }

        // [removed code that sorted indices but NOT values -DM]
        // keys[i] and values[i] are handed over still paired by position.
        FeatureVector fv = new FeatureVector(dataAlphabet, keys, values);
        carrier.setData(fv);
        return carrier;
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy