All Downloads are FREE. Search and download functionalities are using the official Maven repository.

cc.mallet.pipe.Csv2Array Maven / Gradle / Ivy

Go to download

MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.

There is a newer version: 2.0.12
Show newest version
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */





package cc.mallet.pipe;


import java.util.logging.*;
import java.lang.reflect.Array;

import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.Instance;
import cc.mallet.types.Labeling;
import cc.mallet.util.CharSequenceLexer;
import cc.mallet.util.MalletLogger;

/**

   Converts a string of comma separated values to an array. To be used
	 prior to {@link Array2FeatureVector}. Note that this class assumes
	 that each location of the line corresponds to a feature index
	 (i.e. "dense" representation) eg:

	 instance 1: 1,0,0,1,0,0,1  << feature alphabet size = 7
	 instance 2: 0,0,1,0,0,0,1  << feature alphabet size = 7

 	 @author Aron Culotta
 */
public class Csv2Array extends Pipe {

	CharSequenceLexer lexer;
	int numberFeatures = -1;
	private static Logger logger = MalletLogger.getLogger(Csv2Array.class.getName());

	public Csv2Array () {
		this.lexer = new CharSequenceLexer ("([^,]+)");
	}

	public Csv2Array (String regex) {
		this.lexer = new CharSequenceLexer (regex);
	}

	public Csv2Array (CharSequenceLexer l) {
		this.lexer = l;
	}

	/** Convert the data in an Instance from a CharSequence
	 * of comma-separated-values to an array, where each index is the
	 * feature name.
	 */
	public Instance pipe(  Instance carrier ) {
		
		CharSequence c = (CharSequence)carrier.getData();
		int nf = countNumberFeatures (c);
		if (numberFeatures == -1) // first instance seen
			numberFeatures = nf;
		else if (numberFeatures != nf)
			throw new IllegalArgumentException ("Instances must have same-length feature vectors. length_i: " + numberFeatures + " length_j: " + nf);
		double[] feats = new double[numberFeatures];
		lexer.setCharSequence (c);
		int i=0;
		while (lexer.hasNext()) 
			feats[i++] = Double.parseDouble ((String)lexer.next());
		carrier.setData (feats);
		return carrier;
		
	}
				 
	private int countNumberFeatures (CharSequence c) {
		String s = c.toString();
		int ret = 0;
		int pos = 0;
		while ((pos = s.indexOf (",", pos) + 1) != 0)
			ret++;
		return ret+1;
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy