package edu.jhu.hlt.concrete.ingesters.conll;
import java.io.BufferedInputStream;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.GZIPOutputStream;
import edu.jhu.hlt.concrete.Communication;
import edu.jhu.hlt.concrete.Section;
import edu.jhu.hlt.concrete.Sentence;
import edu.jhu.hlt.concrete.Token;
import edu.jhu.hlt.concrete.Tokenization;
import edu.jhu.hlt.concrete.serialization.CommunicationSerializer;
import edu.jhu.hlt.concrete.serialization.CompactCommunicationSerializer;
import edu.jhu.hlt.concrete.serialization.iterators.TarGzArchiveEntryCommunicationIterator;
import edu.jhu.hlt.utilt.AutoCloseableIterator;
/**
* Accepts {@link Communication}s and dumps to CoNLL-X files. Accepts either a
* TGZ (many documents) or a single .comm file. If the input is a TGZ, the
* output must be a directory, into which [commId].conll files will be put.
*
 * Optionally outputs an aux file which specifies where each CoNLL sentence
 * belongs in the original {@link Communication}; this is necessary for zipping
 * CoNLL annotations back up into a {@link Communication}.
 * @see CoNLLX which accepts this aux file
 *
 * NOTE: This description is taken from the Google SyntaxNet (aka Parsey McParseface)
* documentation: bazel-syntaxnet/syntaxnet/text_formats.cc
*
* CoNLL document format reader for dependency annotated corpora.
* The expected format is described e.g. at http://ilk.uvt.nl/conll/#dataformat
*
* Data should adhere to the following rules:
* - Data files contain sentences separated by a blank line.
 * - A sentence consists of one or more tokens, each one starting on a new line.
* - A token consists of ten fields described in the table below.
* - Fields are separated by a single tab character.
 * - All data files will contain these ten fields, although only the ID
 *   column is required to contain non-dummy (i.e. non-underscore) values.
 * - Data files should be UTF-8 encoded (Unicode).
*
* Fields:
* 1 ID: Token counter, starting at 1 for each new sentence and increasing
* by 1 for every new token.
* 2 FORM: Word form or punctuation symbol.
* 3 LEMMA: Lemma or stem.
* 4 CPOSTAG: Coarse-grained part-of-speech tag or category.
* 5 POSTAG: Fine-grained part-of-speech tag. Note that the same POS tag
* cannot appear with multiple coarse-grained POS tags.
* 6 FEATS: Unordered set of syntactic and/or morphological features.
* 7 HEAD: Head of the current token, which is either a value of ID or '0'.
* 8 DEPREL: Dependency relation to the HEAD.
* 9 PHEAD: Projective head of current token.
* 10 PDEPREL: Dependency relation to the PHEAD.
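 *
 * For example, a two-token sentence with only ID and FORM filled in, which is
 * exactly what this class emits (token text is illustrative; fields are
 * tab-separated, dummies are underscores):
 * <pre>
 * 1	Hello	_	_	_	_	_	_	_	_
 * 2	world	_	_	_	_	_	_	_	_
 * </pre>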
*
* This CoNLL reader is compatible with the CoNLL-U format described at
* http://universaldependencies.org/format.html
* Note that this reader skips CoNLL-U multiword tokens and ignores the last two
* fields of every line, which are PHEAD and PDEPREL in CoNLL format, but are
* replaced by DEPS and MISC in CoNLL-U.
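 *
 * Example invocation, mirroring the arguments checked in {@link #main(String[])}
 * (file names here are illustrative):
 * <pre>
 * java edu.jhu.hlt.concrete.ingesters.conll.ConcreteToCoNLLX comms.tar.gz conll-out/ meta-out/
 * </pre>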
*
* @author travis
*/
public class ConcreteToCoNLLX {
private static final CommunicationSerializer ser = new CompactCommunicationSerializer();
private String outputPosToolname = null; // null means just output words
public boolean verbose = false;
private static void dirArg(File f) {
if (f == null)
return;
if (f.isFile())
throw new IllegalArgumentException("must be a directory: " + f.getPath());
if (!f.isDirectory())
f.mkdirs();
}
/**
* Only dumps the words.
*
* @param communicationIn can be a .tgz, .tar.gz, or .comm file
* @param conllOut can be a directory (in which case commId.conll will be used) or a file
* @param sectionMetaInfoOut may be null, otherwise same rules as conllOut
*/
public void dump(File communicationIn, File conllOut, File sectionMetaInfoOut) throws Exception {
if (verbose)
System.out.println("reading communication from " + communicationIn.getPath());
String s = communicationIn.getName().toLowerCase();
if (s.endsWith(".tgz") || s.endsWith(".tar.gz")) {
// Read multiple communications
dirArg(conllOut);
dirArg(sectionMetaInfoOut);
try (InputStream is = Files.newInputStream(communicationIn.toPath());
BufferedInputStream bis = new BufferedInputStream(is, 1024 * 8 * 24);
        AutoCloseableIterator<Communication> iter = new TarGzArchiveEntryCommunicationIterator(bis)) {
while (iter.hasNext()) {
Communication n = iter.next();
dump(n, conllOut, sectionMetaInfoOut);
}
}
} else {
// Read one communication
Communication c = ser.fromPath(communicationIn.toPath());
dump(c, conllOut, sectionMetaInfoOut);
}
}
/**
* Only dumps the words.
*
 * @param commIn the {@link Communication} to dump
 * @param conllOut can be a directory (in which case commId.conll will be used) or a file
 * @param sectionMetaInfoOut may be null, otherwise same rules as conllOut
 */
public void dump(Communication commIn, File conllOut, File sectionMetaInfoOut) throws IOException {
if (verbose)
System.out.println("writing conll to " + conllOut.getPath());
    final String TAB_NIL = "\t_";
if (conllOut.isDirectory())
conllOut = new File(conllOut, commIn.getId() + ".conll");
if (sectionMetaInfoOut != null && sectionMetaInfoOut.isDirectory())
sectionMetaInfoOut = new File(sectionMetaInfoOut, commIn.getId() + ".section-info.aux");
OutputStream os = new FileOutputStream(conllOut);
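    // Transparently gzip the CoNLL output when the target file name ends in .gz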
if (conllOut.getName().toLowerCase().endsWith(".gz"))
os = new GZIPOutputStream(os);
    List<String> metaBySent = new ArrayList<>();
try (BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os))) {
for (int sectionIdx = 0; sectionIdx < commIn.getSectionListSize(); sectionIdx++) {
if (verbose)
System.out.println("section " + sectionIdx);
Section section = commIn.getSectionList().get(sectionIdx);
if (!section.isSetSentenceList() || section.getSentenceListSize() == 0)
continue;
String meta = null;
if (sectionMetaInfoOut != null) {
          // Produce a section-unique metadata string which each sentence will receive
String nl = "";
if (section.isSetNumberList()) {
for (int i : section.getNumberList()) {
if (!nl.isEmpty()) nl += " ";
nl += i;
}
} else {
            // If not set, assume sections are numbered sequentially ([0], [1], ..., [n])
nl = String.valueOf(sectionIdx);
}
String k = section.isSetKind() ? section.getKind() : "";
String l = section.isSetLabel() ? section.getLabel() : "";
assert k.indexOf('\t') < 0;
assert l.indexOf('\t') < 0;
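          // One tab-separated record per sentence: kind, label, section numbers,
          // e.g. "passage\tPassage 1\t0" (example values are illustrative)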
meta = k + "\t" + l + "\t" + nl;
}
int sentIdx = 0;
for (Sentence sentence : section.getSentenceList()) {
Tokenization toks = sentence.getTokenization();
          List<Token> tokens = toks.getTokenList().getTokenList();
if (verbose)
System.out.println("sentence " + sentIdx++);
if (outputPosToolname != null)
throw new RuntimeException("implement me");
for (int i = 0; i < tokens.size(); i++) {
Token t = tokens.get(i);
w.write((i+1) // ID
+ "\t" + t.getText() // FORM
+ TAB_NIL // LEMMA
+ TAB_NIL // CPOSTAG
+ TAB_NIL // POSTAG
+ TAB_NIL // FEATS
+ TAB_NIL // HEAD
+ TAB_NIL // DEPREL
+ TAB_NIL // PHEAD
+ TAB_NIL); // PDEPREL
w.newLine();
}
w.newLine(); // empty line after a sentence
if (sectionMetaInfoOut != null)
metaBySent.add(meta);
}
}
}
if (sectionMetaInfoOut != null) {
if (verbose)
System.err.println("writing sentence-section meta information to " + sectionMetaInfoOut.getPath());
try (BufferedWriter w = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(sectionMetaInfoOut)))) {
for (String line : metaBySent) {
w.write(line);
w.newLine();
}
}
}
}
public static void main(String[] args) throws Exception {
if (args.length != 2 && args.length != 3) {
System.err.println("please provide:");
System.err.println("1) an input Communication file");
System.err.println("2) an output CoNLL-X file or directory (copy name from input, changes suffix to .conll)");
System.err.println("3) [optional] an output sentence-section meta info file, one line per sentence, each matching ");
System.exit(1);
}
ConcreteToCoNLLX c2c = new ConcreteToCoNLLX();
File in = new File(args[0]);
File out = new File(args[1]);
File meta = null;
if (args.length == 3)
meta = new File(args[2]);
c2c.dump(in, out, meta);
}
}