// opennlp.tools.formats.brat.BratAnnotationStream Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.formats.brat;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Span;
/**
* Reads the annotations from the brat .ann annotation file.
*/
public class BratAnnotationStream implements ObjectStream {
/**
 * Base class of the parsers that turn one tokenized annotation line into a
 * {@link BratAnnotation}. It defines the token positions shared by all
 * annotation kinds and a number parsing helper.
 */
abstract static class BratAnnotationParser {

  /** Index of the token holding the annotation id. */
  static final int ID_OFFSET = 0;

  /** Index of the token holding the annotation type. */
  static final int TYPE_OFFSET = 1;

  /**
   * Parses one annotation line. This base implementation produces nothing;
   * the concrete parsers override it.
   *
   * @param tokens the token spans of {@code line}
   * @param line the annotation line the spans refer to
   * @return always {@code null} in this base implementation
   * @throws IOException declared so that overriding parsers can report malformed input
   */
  BratAnnotation parse(Span[] tokens, CharSequence line) throws IOException {
    return null;
  }

  /**
   * Converts a decimal string into an {@code int}.
   *
   * @param intString the text to convert
   * @return the parsed number
   * @throws InvalidFormatException if {@code intString} is not a valid integer;
   *     the original {@link NumberFormatException} is attached as the cause
   */
  protected int parseInt(String intString) throws InvalidFormatException {
    int parsed;
    try {
      parsed = Integer.parseInt(intString);
    }
    catch (NumberFormatException notANumber) {
      throw new InvalidFormatException(notANumber);
    }
    return parsed;
  }
}
/**
 * Parses span annotation lines of the form
 * {@code id type begin end text...}.
 * <p>
 * Tokens after the begin offset that contain a {@code ';'} are skipped, so for
 * a line with several offset pairs the resulting span runs from the first
 * begin offset to the first offset token without a {@code ';'}.
 */
static class SpanAnnotationParser extends BratAnnotationParser {

  private static final int BEGIN_OFFSET = 2;
  private static final int END_OFFSET = 3;

  /**
   * Builds a {@link SpanAnnotation} from a tokenized line.
   *
   * @param values the token spans of {@code line}
   * @param line the annotation line
   * @return the parsed span annotation
   * @throws IOException an {@link InvalidFormatException} if the line has fewer
   *     than 5 fields, an offset is not an integer, no end offset can be found,
   *     no covered text follows the end offset, or the offsets do not form a
   *     valid span
   */
  @Override
  BratAnnotation parse(Span[] values, CharSequence line) throws IOException {

    if (values.length <= 4) {
      throw new InvalidFormatException("Line must have at least 5 fields");
    }

    String type = values[BratAnnotationParser.TYPE_OFFSET].getCoveredText(line).toString();

    int endOffset = -1;
    int firstTextTokenIndex = -1;

    // The end offset is the first token (from END_OFFSET on) without a ';'.
    for (int i = END_OFFSET; i < values.length; i++) {
      String token = values[i].getCoveredText(line).toString();
      if (!token.contains(";")) {
        endOffset = parseInt(token);
        firstTextTokenIndex = i + 1;
        break;
      }
    }

    // Without these checks a malformed line would surface as an unchecked
    // ArrayIndexOutOfBoundsException instead of a format error.
    if (firstTextTokenIndex < 0) {
      throw new InvalidFormatException("Line does not contain an end offset");
    }
    if (firstTextTokenIndex >= values.length) {
      throw new InvalidFormatException("Line does not contain the covered text");
    }

    String id = values[BratAnnotationParser.ID_OFFSET].getCoveredText(line).toString();

    // The covered text is everything from the first text token to the end of the last token.
    String coveredText = line.subSequence(values[firstTextTokenIndex].getStart(),
        values[values.length - 1].getEnd()).toString();

    try {
      int beginOffset = parseInt(values[BEGIN_OFFSET].getCoveredText(line).toString());
      return new SpanAnnotation(id, type, new Span(beginOffset, endOffset, type), coveredText);
    }
    catch (IllegalArgumentException e) {
      // Thrown when the offsets cannot form a span.
      throw new InvalidFormatException(e);
    }
  }
}
/**
 * Parses relation annotation lines of the form
 * {@code id type arg1 arg2}.
 */
static class RelationAnnotationParser extends BratAnnotationParser {

  private static final int ARG1_OFFSET = 2;
  private static final int ARG2_OFFSET = 3;

  /**
   * Extracts the value of a relation argument by dropping its first five
   * characters (presumably an {@code "Arg1:"} / {@code "Arg2:"} prefix; verify
   * against the brat format) and trimming the rest.
   *
   * @param arg the raw argument token
   * @return the argument without its five character prefix
   * @throws InvalidFormatException if {@code arg} has fewer than five characters
   */
  private String parseArg(String arg) throws InvalidFormatException {
    if (arg.length() > 4) {
      return arg.substring(5).trim();
    }
    else {
      throw new InvalidFormatException("Failed to parse argument: " + arg);
    }
  }

  /**
   * Builds a {@link RelationAnnotation} from a tokenized line.
   *
   * @param tokens the token spans of {@code line}
   * @param line the annotation line
   * @return the parsed relation annotation
   * @throws IOException an {@link InvalidFormatException} if the line has fewer
   *     than 4 fields or one of the arguments is too short
   */
  @Override
  BratAnnotation parse(Span[] tokens, CharSequence line) throws IOException {

    // Both argument tokens are read below; fail with a format error rather
    // than an ArrayIndexOutOfBoundsException when one of them is missing.
    if (tokens.length <= ARG2_OFFSET) {
      throw new InvalidFormatException("Line must have at least 4 fields");
    }

    String id = tokens[BratAnnotationParser.ID_OFFSET].getCoveredText(line).toString();
    String type = tokens[BratAnnotationParser.TYPE_OFFSET].getCoveredText(line).toString();
    String arg1 = parseArg(tokens[ARG1_OFFSET].getCoveredText(line).toString());
    String arg2 = parseArg(tokens[ARG2_OFFSET].getCoveredText(line).toString());

    return new RelationAnnotation(id, type, arg1, arg2);
  }
}
/**
 * Parses event annotation lines of the form
 * {@code id type:trigger argument:value ...}.
 */
static class EventAnnotationParser extends BratAnnotationParser {

  /**
   * Builds an {@link EventAnnotation} from a tokenized line.
   *
   * @param tokens the token spans of {@code line}
   * @param line the annotation line
   * @return the parsed event annotation
   * @throws IOException an {@link InvalidFormatException} if the type token is
   *     missing or not of the form {@code type:trigger}, or if an argument
   *     token is not of the form {@code argument:value}
   */
  @Override
  BratAnnotation parse(Span[] tokens, CharSequence line) throws IOException {

    // The type token is read unconditionally below.
    if (tokens.length <= TYPE_OFFSET) {
      throw new InvalidFormatException(String.format(
          "Failed to parse [%s], type part must be in the format type:trigger", line));
    }

    String[] typeParts = tokens[TYPE_OFFSET].getCoveredText(line).toString().split(":");

    if (typeParts.length != 2) {
      throw new InvalidFormatException(String.format(
          "Failed to parse [%s], type part must be in the format type:trigger", line));
    }

    String type = typeParts[0];
    String eventTrigger = typeParts[1];

    // Typed map instead of a raw one, so the put calls are checked by the compiler.
    Map<String, String> arguments = new HashMap<>();

    // Every token after the type token is one argument:value pair.
    for (int i = TYPE_OFFSET + 1; i < tokens.length; i++) {
      String[] parts = tokens[i].getCoveredText(line).toString().split(":");

      if (parts.length != 2) {
        throw new InvalidFormatException(String.format(
            "Failed to parse [%s], argument parts must be in form argument:value", line));
      }

      arguments.put(parts[0], parts[1]);
    }

    return new EventAnnotation(tokens[ID_OFFSET].getCoveredText(line).toString(), type,
        eventTrigger, arguments);
  }
}
/**
 * Parses attribute annotation lines of the form
 * {@code id type attachedTo [value]}; the value field is optional.
 */
static class AttributeAnnotationParser extends BratAnnotationParser {

  private static final int ATTACHED_TO_OFFSET = 2;
  private static final int VALUE_OFFSET = 3;

  /**
   * Builds an {@link AttributeAnnotation} from a tokenized line.
   *
   * @param values the token spans of {@code line}
   * @param line the annotation line
   * @return the parsed attribute annotation; its value is {@code null} when
   *     the line has only 3 fields
   * @throws IOException an {@link InvalidFormatException} if the line does not
   *     have exactly 3 or 4 fields
   */
  @Override
  BratAnnotation parse(Span[] values, CharSequence line) throws IOException {

    final int fieldCount = values.length;

    if (fieldCount != 3 && fieldCount != 4) {
      throw new InvalidFormatException("Line must have 3 or 4 fields");
    }

    // Only a 4 field line carries a value.
    String attributeValue = fieldCount == 4
        ? values[VALUE_OFFSET].getCoveredText(line).toString()
        : null;

    String attributeId = values[ID_OFFSET].getCoveredText(line).toString();
    String attributeType = values[TYPE_OFFSET].getCoveredText(line).toString();
    String attachedTo = values[ATTACHED_TO_OFFSET].getCoveredText(line).toString();

    return new AttributeAnnotation(attributeId, attributeType, attachedTo, attributeValue);
  }
}
private final AnnotationConfiguration config;
private final BufferedReader reader;
private final String id;
/**
 * Creates a stream that reads annotations from the given input.
 *
 * @param config the annotation configuration, stored on the instance
 * @param id the document id, used in the message of parse failures
 * @param in the annotation data; it is decoded as UTF-8
 */
public BratAnnotationStream(AnnotationConfiguration config, String id, InputStream in) {
  InputStreamReader utf8Input = new InputStreamReader(in, StandardCharsets.UTF_8);

  this.reader = new BufferedReader(utf8Input);
  this.id = id;
  this.config = config;
}
/**
 * Reads the next supported annotation.
 * <p>
 * Lines with fewer than three whitespace separated tokens (for example blank
 * lines) and lines whose id does not start with {@code T}, {@code R},
 * {@code A} or {@code E} are skipped. {@code null} is returned only once the
 * underlying reader is exhausted, so a skipped line never ends the stream
 * early.
 *
 * @return the next annotation, or {@code null} at the end of the input
 * @throws IOException if reading fails or a line cannot be parsed; parse
 *     failures are wrapped in an exception that names the document id
 */
public BratAnnotation read() throws IOException {

  String line;

  // Iterate instead of recursing so that long runs of skipped lines
  // cannot exhaust the call stack.
  while ((line = reader.readLine()) != null) {

    Span[] tokens = WhitespaceTokenizer.INSTANCE.tokenizePos(line);

    if (tokens.length <= 2) {
      // Too short to be an annotation the parsers can handle: skip the line
      // rather than signalling end of stream.
      continue;
    }

    String annId = tokens[BratAnnotationParser.ID_OFFSET].getCoveredText(line).toString();

    if (annId.length() == 0) {
      throw new InvalidFormatException("annotation id is empty");
    }

    // The first letter of the annotation id marks the annotation type
    final BratAnnotationParser parser;

    switch (annId.charAt(0)) {
      case 'T':
        parser = new SpanAnnotationParser();
        break;
      case 'R':
        parser = new RelationAnnotationParser();
        break;
      case 'A':
        parser = new AttributeAnnotationParser();
        break;
      case 'E':
        parser = new EventAnnotationParser();
        break;
      default:
        // Skip it, do that for everything unsupported (e.g. "*" id)
        continue;
    }

    try {
      return parser.parse(tokens, line);
    }
    catch (IOException e) {
      throw new IOException(String.format("Failed to parse ann document with id [%s.ann]", id), e);
    }
  }

  return null;
}
/**
 * Delegates to {@link BufferedReader#reset()} of the underlying reader.
 * <p>
 * NOTE(review): no {@code mark} call is visible in this class, and
 * {@code BufferedReader.reset()} fails with an {@link IOException} when the
 * reader was never marked - confirm whether resetting is meant to work.
 *
 * @throws IOException if the underlying reader cannot be reset
 * @throws UnsupportedOperationException declared by the signature; not thrown
 *     by this implementation itself
 */
public void reset() throws IOException, UnsupportedOperationException {
  this.reader.reset();
}
/**
 * Closes the underlying reader.
 *
 * @throws IOException if closing the reader fails
 */
public void close() throws IOException {
  this.reader.close();
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy