// cmu.arktweetnlp.io.CoNLLReader (Maven / Gradle / Ivy)
// The newest version!
package cmu.arktweetnlp.io;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import cmu.arktweetnlp.impl.Sentence;
import cmu.arktweetnlp.util.BasicFileIO;
import edu.stanford.nlp.util.Pair;
/**
 * Reads a simplified version of the CoNLL format: two tab-separated columns
 * per line,
 *
 * <pre>
 * Word \t POSTag
 * </pre>
 *
 * with a blank (empty or whitespace-only) line separating sentences.
 */
public class CoNLLReader {

    /** Matches an empty or whitespace-only line; compiled once instead of once per input line. */
    private static final Pattern BLANK_LINE = Pattern.compile("^\\s*$");

    /**
     * Reads every sentence contained in the given file.
     *
     * <p>Consecutive non-blank lines form one sentence. Runs of blank lines
     * produce no empty sentences, and a final sentence that is not followed by
     * a blank line is still returned.
     *
     * @param filename file name passed to {@code BasicFileIO.openFileToReadUTF8}
     * @return the sentences in the order they appear in the file; empty when
     *         the file contains no non-blank lines
     * @throws IOException if reading from or closing the reader fails
     * @throws IllegalArgumentException if a non-blank line has fewer than two
     *         tab-separated columns
     */
    public static ArrayList<Sentence> readFile(String filename) throws IOException {
        BufferedReader reader = BasicFileIO.openFileToReadUTF8(filename);
        try {
            ArrayList<Sentence> sentences = new ArrayList<Sentence>();
            ArrayList<String> curLines = new ArrayList<String>();
            String line;
            while ((line = reader.readLine()) != null) {
                if (BLANK_LINE.matcher(line).matches()) {
                    // A blank line ends the current sentence, if one was started.
                    if (!curLines.isEmpty()) {
                        sentences.add(sentenceFromLines(curLines));
                        curLines.clear();
                    }
                } else {
                    curLines.add(line);
                }
            }
            // Flush the last sentence when the file does not end with a blank line.
            if (!curLines.isEmpty()) {
                sentences.add(sentenceFromLines(curLines));
            }
            return sentences;
        } finally {
            // Always release the reader; the null guard keeps a failure raised
            // inside the try block from being replaced by a second one here.
            if (reader != null) {
                reader.close();
            }
        }
    }

    /**
     * Builds one sentence from its raw lines.
     *
     * <p>Each line is split on tab characters. The first column is added to
     * the sentence's tokens and the second to its labels, both trimmed. When
     * assertions are disabled, columns after the second are ignored.
     *
     * @param lines the non-blank lines that make up a single sentence
     * @return a new sentence holding one token and one label per line
     * @throws IllegalArgumentException if a line has fewer than two columns
     */
    private static Sentence sentenceFromLines(List<String> lines) {
        Sentence s = new Sentence();
        for (String line : lines) {
            String[] parts = line.split("\t");
            assert parts.length == 2
                    : "Expected 2 tab-separated columns but found " + parts.length + " in line: " + line;
            // Fail with a readable message instead of an index error when
            // assertions are off and the label column is missing.
            if (parts.length < 2) {
                throw new IllegalArgumentException(
                        "Expected 2 tab-separated columns but found " + parts.length + " in line: " + line);
            }
            s.tokens.add(parts[0].trim());
            s.labels.add(parts[1].trim());
        }
        return s;
    }
}