eu.danieldk.nlp.conllx.reader.CONLLReader Maven / Gradle / Ivy
The newest version!
// Copyright 2008, 2013 Daniel de Kok
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package eu.danieldk.nlp.conllx.reader;
import com.google.common.base.Optional;
import eu.danieldk.nlp.conllx.CONLLToken;
import eu.danieldk.nlp.conllx.Sentence;
import eu.danieldk.nlp.conllx.SimpleSentence;
import eu.danieldk.nlp.conllx.Token;
import org.apache.commons.lang3.StringUtils;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Corpus reader for tab-separated CoNLL-style input.
 *
 * <p>Each non-blank line describes one token; columns are separated by tabs and a column
 * containing only {@code _} is treated as absent. Blank lines separate sentences. Malformed
 * input is reported as an {@link IOException}.
 */
public class CONLLReader implements CorpusReader {
    private final BufferedReader reader;

    private final boolean strict;

    /**
     * Construct a CoNLL corpus reader in non-strict mode. The caller should
     * ensure that the provided reader reads UTF-8.
     *
     * @param reader A buffered reader.
     */
    public CONLLReader(BufferedReader reader) {
        this(reader, false);
    }

    /**
     * Construct a CoNLL corpus reader. The caller should ensure that the provided
     * reader reads UTF-8. If strictness is enabled, the reader will check that the
     * following holds:
     *
     * <ul>
     * <li>Tokens are numbered consecutively, starting at 1.</li>
     * <li>If tokens have a head, it refers to a token or 0.</li>
     * </ul>
     *
     * @param reader A buffered reader.
     * @param strict Whether the strictness flag is passed on when sentences are constructed.
     */
    public CONLLReader(BufferedReader reader, boolean strict) {
        this.reader = reader;
        this.strict = strict;
    }

    /**
     * Close the underlying reader.
     *
     * @throws IOException If closing the underlying reader fails.
     */
    @Override
    public void close() throws IOException {
        reader.close();
    }

    /**
     * Read the next sentence from the underlying reader.
     *
     * <p>Blank lines before the first token of a sentence are skipped. A blank line after at
     * least one token, or the end of the input, terminates the sentence.
     *
     * @return The next sentence, or {@code null} when the input is exhausted.
     * @throws IOException If reading fails, a token line has fewer than two columns, a numeric
     *                     column cannot be parsed, or the sentence cannot be constructed.
     */
    @Override
    public Sentence readSentence() throws IOException {
        List<Token> tokens = new ArrayList<>();

        String line;
        while ((line = reader.readLine()) != null) {
            String[] parts = StringUtils.split(line.trim(), '\t');

            // A blank line ends the current sentence; leading blank lines are skipped.
            if (parts.length == 0) {
                if (tokens.isEmpty()) {
                    continue;
                }

                return constructSentence(tokens);
            }

            if (parts.length < 2) {
                throw new IOException(String.format("Line has fewer than two columns: %s", line));
            }

            int tokenId = parseInt(parts[0], line);
            Optional<String> form = valueForColumn(parts, 1);
            Optional<String> lemma = valueForColumn(parts, 2);
            Optional<String> coarseTag = valueForColumn(parts, 3);
            Optional<String> tag = valueForColumn(parts, 4);
            Optional<String> features = valueForColumn(parts, 5);
            Optional<Integer> head = intValueForColumn(parts, 6, line);
            Optional<String> headRel = valueForColumn(parts, 7);
            Optional<Integer> pHead = intValueForColumn(parts, 8, line);
            Optional<String> pHeadRel = valueForColumn(parts, 9);

            Token token = new CONLLToken(tokenId, form, lemma, coarseTag, tag, features, head, headRel,
                    pHead, pHeadRel);
            tokens.add(token);
        }

        // If the file does not end with a blank line, we have left-overs.
        if (!tokens.isEmpty()) {
            return constructSentence(tokens);
        }

        return null;
    }

    /**
     * Construct a sentence. If sentence construction rejects the tokens with an
     * {@link IllegalArgumentException}, convert it to an {@link IOException} that keeps
     * the original exception as its cause.
     *
     * @param tokens The tokens of the sentence.
     * @return The constructed sentence.
     * @throws IOException If the sentence cannot be constructed from the tokens.
     */
    private Sentence constructSentence(List<Token> tokens) throws IOException {
        try {
            return new SimpleSentence(tokens, strict);
        } catch (IllegalArgumentException e) {
            // Keep the cause so that the original stack trace is not lost.
            throw new IOException(e.getMessage(), e);
        }
    }

    /**
     * Get the value of a column.
     *
     * @param columns The columns of a line.
     * @param column  The index of the requested column.
     * @return The column value, or absent if the line has no such column or it holds {@code _}.
     */
    private static Optional<String> valueForColumn(String[] columns, int column) {
        if (column >= columns.length) {
            return Optional.absent();
        }

        if (columns[column].equals("_")) {
            return Optional.absent();
        }

        return Optional.of(columns[column]);
    }

    /**
     * Get the value of a column as an integer.
     *
     * @param columns The columns of a line.
     * @param column  The index of the requested column.
     * @param line    The complete line, used in the error message.
     * @return The parsed value, or absent if the line has no such column or it holds {@code _}.
     * @throws IOException If the column is present but is not a valid integer.
     */
    private static Optional<Integer> intValueForColumn(String[] columns, int column, String line)
            throws IOException {
        if (column >= columns.length) {
            return Optional.absent();
        }

        if (columns[column].equals("_")) {
            return Optional.absent();
        }

        return Optional.of(parseInt(columns[column], line));
    }

    /**
     * Parse an integer column, reporting malformed values as an {@link IOException} rather
     * than letting the unchecked {@link NumberFormatException} escape.
     *
     * @param value The column value to parse.
     * @param line  The complete line, used in the error message.
     * @return The parsed integer.
     * @throws IOException If the value is not a valid integer.
     */
    private static int parseInt(String value, String line) throws IOException {
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException e) {
            throw new IOException(String.format("Invalid integer '%s' in line: %s", value, line), e);
        }
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy