
// cc.mallet.pipe.Csv2Array Maven / Gradle / Ivy
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of mallet Show documentation
// MALLET is a Java-based package for statistical natural language processing,
// document classification, clustering, topic modeling, information extraction,
// and other machine learning applications to text.
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
package cc.mallet.pipe;
import java.util.logging.*;
import java.lang.reflect.Array;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.Instance;
import cc.mallet.types.Labeling;
import cc.mallet.util.CharSequenceLexer;
import cc.mallet.util.MalletLogger;
/**
Converts a string of comma separated values to an array. To be used
prior to {@link Array2FeatureVector}. Note that this class assumes
that each location of the line corresponds to a feature index
(i.e. "dense" representation) eg:
instance 1: 1,0,0,1,0,0,1 << feature alphabet size = 7
instance 2: 0,0,1,0,0,0,1 << feature alphabet size = 7
@author Aron Culotta
*/
public class Csv2Array extends Pipe {
CharSequenceLexer lexer;
int numberFeatures = -1;
private static Logger logger = MalletLogger.getLogger(Csv2Array.class.getName());
public Csv2Array () {
this.lexer = new CharSequenceLexer ("([^,]+)");
}
public Csv2Array (String regex) {
this.lexer = new CharSequenceLexer (regex);
}
public Csv2Array (CharSequenceLexer l) {
this.lexer = l;
}
/** Convert the data in an Instance
from a CharSequence
* of comma-separated-values to an array, where each index is the
* feature name.
*/
public Instance pipe( Instance carrier ) {
CharSequence c = (CharSequence)carrier.getData();
int nf = countNumberFeatures (c);
if (numberFeatures == -1) // first instance seen
numberFeatures = nf;
else if (numberFeatures != nf)
throw new IllegalArgumentException ("Instances must have same-length feature vectors. length_i: " + numberFeatures + " length_j: " + nf);
double[] feats = new double[numberFeatures];
lexer.setCharSequence (c);
int i=0;
while (lexer.hasNext())
feats[i++] = Double.parseDouble ((String)lexer.next());
carrier.setData (feats);
return carrier;
}
private int countNumberFeatures (CharSequence c) {
String s = c.toString();
int ret = 0;
int pos = 0;
while ((pos = s.indexOf (",", pos) + 1) != 0)
ret++;
return ret+1;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy