/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.formats.conllu;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import java.util.stream.Collectors;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.ParagraphStream;
import opennlp.tools.util.PlainTextByLineStream;
/**
 * The CoNLL-U format is specified
 * <a href="https://universaldependencies.org/format.html">here</a>.
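 *
 * <p>
 * A minimal usage sketch (the corpus path below is only an illustrative placeholder):
 * <pre>{@code
 * InputStreamFactory in =
 *     new MarkableFileInputStreamFactory(new File("corpus.conllu"));
 * ObjectStream<ConlluSentence> sentences = new ConlluStream(in);
 * ConlluSentence sentence;
 * while ((sentence = sentences.read()) != null) {
 *   // process the parsed sentence here
 * }
 * sentences.close();
 * }</pre>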
*/
public class ConlluStream implements ObjectStream<ConlluSentence> {
  private final ObjectStream<String> sentenceStream;
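  // Matches language-suffixed text comment keys such as "text_en"; group 1 captures the two- or three-letter language code.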
private static final Pattern regex = Pattern.compile("text_([a-z]{2,3})");
/**
* Initializes a {@link ConlluStream}.
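 * Sentences are read as blank-line separated blocks, as required by the CoNLL-U format.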
*
* @param in The {@link InputStreamFactory} to use. Characters will be interpreted in UTF-8.
*
* @throws IOException Thrown if IO errors occurred during initialization.
*/
public ConlluStream(InputStreamFactory in) throws IOException {
this.sentenceStream = new ParagraphStream(new PlainTextByLineStream(in, StandardCharsets.UTF_8));
}
@Override
public ConlluSentence read() throws IOException {
String sentence = sentenceStream.read();
if (sentence != null) {
      List<ConlluWordLine> wordLines = new ArrayList<>();
BufferedReader reader = new BufferedReader(new StringReader(sentence));
boolean newDocument = false;
boolean newParagraph = false;
String documentId = null;
String paragraphId = null;
String sentenceId = null;
String text = null;
      Map<Locale, String> textLang = null;
String translit = null;
String line;
while ((line = reader.readLine()) != null) {
// # indicates a comment line and contains additional data
if (line.trim().startsWith("#")) {
String commentLine = line.trim().substring(1);
int separator = commentLine.indexOf('=');
if (separator != -1) {
String firstPart = commentLine.substring(0, separator).trim();
            String secondPart = commentLine.substring(separator + 1).trim();
if (!secondPart.isEmpty()) {
switch (firstPart) {
case "newdoc id":
newDocument = true;
documentId = secondPart;
break;
case "newpar id":
newParagraph = true;
paragraphId = secondPart;
break;
case "sent_id":
sentenceId = secondPart;
break;
case "text":
text = secondPart;
break;
case "translit":
translit = secondPart;
break;
}
}
if (firstPart.startsWith("text_")) {
if (textLang == null) {
textLang = new HashMap<>();
}
addTextLang(firstPart, secondPart, textLang);
}
}
else {
switch (commentLine.trim()) {
case "newdoc":
newDocument = true;
break;
case "newpar":
newParagraph = true;
break;
}
}
}
else {
wordLines.add(new ConlluWordLine(line));
}
}
wordLines = postProcessContractions(wordLines);
return new ConlluSentence(wordLines, sentenceId, text, newDocument, documentId, newParagraph,
paragraphId, textLang, translit);
}
return null;
}
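  /**
   * Merges multiword token (contraction) ranges with their expanded word lines.
   * In CoNLL-U a contraction is introduced by an ID range, e.g. (columns abbreviated):
   * <pre>
   * 1-2    vámonos   _
   * 1      vamos     ir
   * 2      nos       nosotros
   * </pre>
   * The range line keeps the surface form, the annotation of the expanded lines is
   * merged into it, and the expanded lines are then removed.
   */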
  private List<ConlluWordLine> postProcessContractions(List<ConlluWordLine> lines) {
    // 1. Find contractions
    Map<String, Integer> index = new HashMap<>();
    Map<String, List<String>> contractions = new HashMap<>();
    List<String> linesToDelete = new ArrayList<>();
for (int i = 0; i < lines.size(); i++) {
ConlluWordLine line = lines.get(i);
index.put(line.getId(), i);
if (line.getId().contains("-")) {
        List<String> expandedContractions = new ArrayList<>();
String[] ids = line.getId().split("-");
int start = Integer.parseInt(ids[0]);
int end = Integer.parseInt(ids[1]);
for (int j = start; j <= end; j++) {
String js = Integer.toString(j);
expandedContractions.add(js);
linesToDelete.add(js);
}
contractions.put(line.getId(), expandedContractions);
}
}
// 2. Merge annotation
    for (Entry<String, List<String>> entry : contractions.entrySet()) {
final String contractionId = entry.getKey();
      final List<String> expandedContractions = entry.getValue();
int contractionIndex = index.get(contractionId);
ConlluWordLine contraction = lines.get(contractionIndex);
      List<ConlluWordLine> expandedParts = new ArrayList<>();
for (String id : expandedContractions) {
expandedParts.add(lines.get(index.get(id)));
}
ConlluWordLine merged = mergeAnnotation(contraction, expandedParts);
lines.set(contractionIndex, merged);
}
    // 3. Delete the expanded parts, iterating in reverse so that the remaining indices stay valid
for (int i = linesToDelete.size() - 1; i >= 0; i--) {
lines.remove(index.get(linesToDelete.get(i)).intValue());
}
return lines;
}
/**
* Merges token level annotations.
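 * Lemma, POS tags and morphological features of the expanded parts are joined with
 * {@code "+"}; head, dependency relation, deps and misc are taken from the contraction line.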
*
 * @param contraction The line that receives the merged annotation.
 * @param expandedParts The expanded lines whose annotation is merged in.
*
* @return The {@link ConlluWordLine merged line}.
*/
  private ConlluWordLine mergeAnnotation(ConlluWordLine contraction,
                                         List<ConlluWordLine> expandedParts) {
String id = contraction.getId();
String form = contraction.getForm();
String lemma = expandedParts.stream()
.filter(p -> !"_".equals(p.getLemma()))
.map(ConlluWordLine::getLemma)
.collect(Collectors.joining("+"));
String uPosTag = expandedParts.stream()
.filter(p -> !"_".equals(p.getPosTag(ConlluTagset.U)))
.map(p -> p.getPosTag(ConlluTagset.U))
.collect(Collectors.joining("+"));
String xPosTag = expandedParts.stream()
.filter(p -> !"_".equals(p.getPosTag(ConlluTagset.X)))
.map(p -> p.getPosTag(ConlluTagset.X))
.collect(Collectors.joining("+"));
String feats = expandedParts.stream()
.filter(p -> !"_".equals(p.getFeats()))
.map(ConlluWordLine::getFeats)
.collect(Collectors.joining("+"));
String head = contraction.getHead();
String deprel = contraction.getDeprel();
String deps = contraction.getDeps();
String misc = contraction.getMisc();
    return new ConlluWordLine(id, form, lemma, uPosTag, xPosTag, feats, head, deprel, deps, misc);
}
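  /**
   * Parses a {@code # text_xx = ...} comment, i.e. a translation of the sentence text,
   * and stores it in the given map keyed by the {@link Locale} of the extracted language code.
   *
   * @throws InvalidFormatException Thrown if no language code can be extracted from the key.
   */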
  private Map<Locale, String> addTextLang(String firstPart, String secondPart,
      Map<Locale, String> textLang) throws InvalidFormatException {
String lang = "";
try {
Matcher regexMatcher = regex.matcher(firstPart);
if (regexMatcher.find()) {
lang = regexMatcher.group(1);
}
} catch (PatternSyntaxException e) {
throw new InvalidFormatException(e);
}
if (!lang.isEmpty()) {
textLang.put(new Locale(lang), secondPart);
}
else {
      throw new InvalidFormatException(
          String.format("No valid language code found in comment key: %s", firstPart));
}
return textLang;
}
@Override
public void close() throws IOException {
sentenceStream.close();
}
@Override
public void reset() throws IOException, UnsupportedOperationException {
sentenceStream.reset();
}
}