All downloads are free. The search and download functionality uses the official Maven repository.

cc.mallet.pipe.iterator.CsvIterator Maven / Gradle / Ivy

Go to download

MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.

There is a newer version: 2.0.12
Show newest version
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */




/** 
   @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 */

package cc.mallet.pipe.iterator;

import cc.mallet.types.Instance;
import java.io.*;
import java.util.Iterator;
import java.util.regex.*;
import java.net.URI;
import java.net.URISyntaxException;

import cc.mallet.pipe.Pipe;
import cc.mallet.types.*;

/**
 *  This iterator, perhaps more properly called a Line Pattern Iterator, 
 *   reads through a file and returns one instance per line,
 *   based on a regular expression.

* * If you have data of the form *

[name]  [label]  [data]
* and a {@link Pipe} instancePipe, you could read instances using this code:
    InstanceList instances = new InstanceList(instancePipe);

    instances.addThruPipe(new CsvIterator(new FileReader(dataFile),
                                          "(\\w+)\\s+(\\w+)\\s+(.*)",
                                          3, 2, 1)  // (data, target, name) field indices                    
                         );
* */ public class CsvIterator implements Iterator { LineNumberReader reader; Pattern lineRegex; int uriGroup, targetGroup, dataGroup; String currentLine; public CsvIterator (Reader input, Pattern lineRegex, int dataGroup, int targetGroup, int uriGroup) { this.reader = new LineNumberReader (input); this.lineRegex = lineRegex; this.targetGroup = targetGroup; this.dataGroup = dataGroup; this.uriGroup = uriGroup; if (dataGroup <= 0) throw new IllegalStateException ("You must extract a data field."); try { this.currentLine = reader.readLine(); } catch (IOException e) { throw new IllegalStateException (); } } public CsvIterator (Reader input, String lineRegex, int dataGroup, int targetGroup, int uriGroup) { this (input, Pattern.compile (lineRegex), dataGroup, targetGroup, uriGroup); } public CsvIterator (String filename, String lineRegex, int dataGroup, int targetGroup, int uriGroup) throws java.io.FileNotFoundException { this (new FileReader (new File(filename)), Pattern.compile (lineRegex), dataGroup, targetGroup, uriGroup); } // The PipeInputIterator interface public Instance next () { String uriStr = null; String data = null; String target = null; Matcher matcher = lineRegex.matcher(currentLine); if (matcher.find()) { if (uriGroup > 0) uriStr = matcher.group(uriGroup); if (targetGroup > 0) target = matcher.group(targetGroup); if (dataGroup > 0) data = matcher.group(dataGroup); } else { throw new IllegalStateException ("Line #"+reader.getLineNumber()+" does not match regex:\n" + currentLine); } String uri; if (uriStr == null) { uri = "csvline:"+reader.getLineNumber(); } else { uri = uriStr; } assert (data != null); Instance carrier = new Instance (data, target, uri, null); try { this.currentLine = reader.readLine(); } catch (IOException e) { throw new IllegalStateException (); } return carrier; } public boolean hasNext () { return currentLine != null; } public void remove () { throw new IllegalStateException ("This Iterator does not support remove()."); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy