// Downloading a project consumes considerable server resources, and we have to cover our server costs. Thank you in advance for your understanding. Project price: only $1.
// Once you have bought this project, you can download and modify it as often as you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.formats.masc;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import javax.xml.parsers.SAXParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
import opennlp.tools.util.Span;
import opennlp.tools.util.XmlUtil;
public class MascDocument {
private static final Logger logger = LoggerFactory.getLogger(MascDocument.class);
private final List sentences;
private final String pathToFile;
private Iterator sentenceIterator;
private boolean hasPennTags = false;
private boolean hasNamedEntities = false;
public MascDocument(String path, List sentences) {
this.pathToFile = path;
this.sentences = sentences;
this.sentenceIterator = sentences.iterator();
}
/**
* Initializes a {@link MascDocument} with all the stand-off annotations translated into the
* internal structure.
*
* @param path The path where the document header is.
* @param f_primary The {@link InputStream file} with the raw corpus text.
* @param f_seg The {@link InputStream file} with segmentation into quarks.
* @param f_ne The {@link InputStream file} with named entities.
* @param f_penn The {@link InputStream file} with tokenization and Penn POS tags produced
* by GATE-5.0 ANNIE application.
* @param f_s The {@link InputStream file} with sentence boundaries.
* @return A document containing the text and its annotations. Immutability is not guaranteed yet.
* @throws IOException if the raw data cannot be read or the alignment of the raw data
* with annotations fails
*/
public static MascDocument parseDocument(String path, InputStream f_primary, InputStream f_seg,
InputStream f_penn, InputStream f_s, InputStream f_ne)
throws IOException {
String text = readText(f_primary);
List words = parseWords(f_seg);
List sentenceSpans = parseSentences(f_s);
List sentences = combineAnnotations(text, sentenceSpans, words);
final MascDocument doc = new MascDocument(path, sentences);
// if the file has Penn POS tags, add them
if (f_penn != null) {
doc.addPennTags(parsePennTags(f_penn));
}
if (f_ne != null) {
doc.addNamedEntityTags(parseNamedEntity(f_ne));
}
//TODO: make the annotations immutable
//TODO: should we cleanup the document (e.g. remove sentences without tokens?)
return doc;
}
/**
* Reads in the corpus file text.
*
* @param stream A valid, open {@link InputStream stream} for a corpus file.
*
* @return The text of the file.
* @throws IOException Thrown if IO errors occurred.
*/
private static String readText(InputStream stream) throws IOException {
try (Reader reader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))) {
StringBuilder contents = new StringBuilder();
char[] buffer = new char[8192];
int read;
while ((read = reader.read(buffer, 0, buffer.length)) > 0) {
contents.append(buffer, 0, read);
}
return contents.toString();
}
}
/**
* Parses the word segmentation stand-off annotation
*
* @param f_seg A valid, open {@link InputStream stream} for a file with segmentation.
* @return A list of individual quarks, expressed as MascWord-s
* @throws IOException Thrown if IO errors occurred.
*/
private static List parseWords(InputStream f_seg) throws IOException {
try (BufferedInputStream bStream = new BufferedInputStream(f_seg)) {
SAXParser saxParser = XmlUtil.createSaxParser();
MascWordParser handler = new MascWordParser();
try {
saxParser.parse(bStream, handler);
} catch (SAXException e) {
throw new IOException("Could not parse the region annotation file");
}
return Collections.unmodifiableList(handler.getAnchors());
}
}
/**
* Parses the sentence annotation file, align it with the raw text
*
* @param f_s A valid, open {@link InputStream stream} for a sentence annotation file.
* @return The {@link List} delimiting each sentence.
* @throws IOException if the sentence file cannot be parsed or closed
*/
private static List parseSentences(InputStream f_s) throws IOException {
try (BufferedInputStream bStream = new BufferedInputStream(f_s)) {
SAXParser saxParser = XmlUtil.createSaxParser();
MascSentenceParser handler = new MascSentenceParser();
try {
saxParser.parse(bStream, handler);
} catch (SAXException e) {
throw new IOException("Could not parse the sentence annotation file");
}
List anchors = handler.getAnchors();
/*
* Filter out sentence overlaps.
* Keep only those sentences where sentence.end < nextSentence.beginning
* avoid deleting in the middle and repeatedly shifting the list by copying into a new list
*/
//TODO: can we know a priori, if we need this filtering?
List filteredAnchors = new ArrayList<>();
for (int i = 0; i < anchors.size() - 1; i++) {
if (anchors.get(i).getEnd() < anchors.get(i + 1).getStart()) {
filteredAnchors.add(anchors.get(i));
}
}
filteredAnchors.add(anchors.get(anchors.size() - 1));
return Collections.unmodifiableList(filteredAnchors);
}
}
/**
* Parses the Penn-POS (GATE5-ANNIE) stand-off annotation.
*
* @param f_penn A valid, open {@link InputStream stream} for a file with Penn POS tags.
*
* @return A map of three sub-maps: tokenToTag, from Penn token ID (int) to Penn POS-tag,
* tokenToBase, from Penn token ID (int) to the base and tokenToQuarks, from Penn token ID
* (int) to a List of quark IDs contained in that token.
* @throws IOException Thrown if IO errors occurred.
*/
private static Map> parsePennTags(InputStream f_penn) throws IOException {
Map> tagsAndBases = new HashMap<>();
try (BufferedInputStream bStream = new BufferedInputStream(f_penn)) {
SAXParser saxParser = XmlUtil.createSaxParser();
MascPennTagParser handler = new MascPennTagParser();
try {
saxParser.parse(bStream, handler);
} catch (SAXException e) {
throw new IOException("Could not parse the Penn tag annotation file");
}
tagsAndBases.put("tokenToTag", handler.getTags());
tagsAndBases.put("tokenToBase", handler.getBases());
tagsAndBases.put("tokenToQuarks", handler.getTokenToQuarks());
return tagsAndBases;
}
}
/**
* Parses the named entity stand-off annotation.
*
* @param f_ne A valid, open {@link InputStream stream} for a file with named entity annotations.
* @return A map with two sub-maps, entityIDtoEntityType, mapping entity ID integers
* to entity type Strings, and entityIDsToTokens, mapping entity ID integers to Penn
* token ID integers.
* @throws IOException Thrown if IO errors occurred.
*/
private static Map> parseNamedEntity(InputStream f_ne) throws IOException {
try (BufferedInputStream bStream = new BufferedInputStream(f_ne)) {
SAXParser saxParser = XmlUtil.createSaxParser();
MascNamedEntityParser handler = new MascNamedEntityParser();
try {
saxParser.parse(bStream, handler);
} catch (SAXException e) {
throw new IOException("Could not parse the named entity annotation file", e);
}
Map entityIDtoEntityType = handler.getEntityIDtoEntityType();
Map> entityIDsToTokens = handler.getEntityIDsToTokens();
Map> results = new HashMap<>();
results.put("entityIDtoEntityType", entityIDtoEntityType);
results.put("entityIDsToTokens", entityIDsToTokens);
return results;
}
}
/**
* Combines the raw text with annotations that every file should have.
*
* @param text The raw text.
* @param sentenceSpans The spans defining individual sentences. Overlaps are not permitted.
* @param words The quarks of the raw text.
* @return A list of sentences, each of which is a list of quarks. Some quarks may belong to
* more than one sentence. Quarks which do not belong to a single sentence are silently dropped.
* @throws IOException If sentences and quarks cannot be aligned.
*/
private static List combineAnnotations(String text, List sentenceSpans,
List words) throws IOException {
int wordIndex = 0;
int wordCount = words.size();
List sentences = new ArrayList<>();
for (Span s : sentenceSpans) {
if (s.getEnd() - s.getStart() > 0) {
List quarks = new ArrayList<>();
int sentenceStart = s.getStart();
int sentenceEnd = s.getEnd();
// TODO: is it okay that quarks can cross sentence boundary? What are the implications?
/*
* Allow quarks to cross sentence boundary.
* The decisive factor determining if a quark belongs to a sentence is if they overlap.
* I.e. sent.getEnd() > quark.getStart() && sent.getStart() < quark.getEnd()
*/
MascWord nextWord = words.get(wordIndex);
// Find sentence beginning, should not be needed unless overlaps occur
while (sentenceStart < nextWord.getEnd() && wordIndex > 0) {
wordIndex--;
nextWord = words.get(wordIndex);
}
// TODO: can this be translated into Span's methods .crosses()/.contains()?
// Find all quarks contained or crossing the span of that sentence
boolean sentenceOver = false;
while ((!sentenceOver) && wordIndex < wordCount) {
nextWord = words.get(wordIndex);
int nextWordStart = nextWord.getStart();
int nextWordEnd = nextWord.getEnd();
// word either ends or starts or ends & starts in the middle of sentence
if (sentenceEnd > nextWordStart && sentenceStart < nextWordEnd) {
quarks.add(nextWord);
if (sentenceEnd == nextWordEnd) {
sentenceOver = true;
}
wordIndex++;
} else if (sentenceEnd <= nextWordStart) {
sentenceOver = true;
} else {
wordIndex++;
}
}
// If we are at the end of words, but not in the last sentence, throw an error
if (!sentenceOver && sentences.size() != sentenceSpans.size() - 1) {
throw new IOException("Sentence ends and word ends do not match." +
"First sentence not completed ends at character: " + sentenceEnd);
}
MascSentence sentence = new MascSentence(sentenceStart, sentenceEnd, text, quarks,
words);
sentences.add(sentence);
}
}
return Collections.unmodifiableList(sentences);
}
/**
* Attaches the named entity labels to individual tokens.
*
* @param namedEntities A map with two sub-maps, entityIDtoEntityType, mapping entity ID integers
* to entity type Strings, and entityIDsToTokens, mapping entity ID integers to Penn
* token ID integers
*/
@SuppressWarnings("unchecked")
private void addNamedEntityTags(Map> namedEntities) {
try {
Map entityIDtoEntityType =
(Map) namedEntities.get("entityIDtoEntityType");
Map> entityIDsToTokens =
(Map>) namedEntities.get("entityIDsToTokens");
for (MascSentence s : sentences) {
boolean success = s.addNamedEntities(entityIDtoEntityType, entityIDsToTokens);
if (!success) {
logger.warn("Issues occurred in the file: {}", pathToFile);
}
}
hasNamedEntities = true;
} catch (IOException e) {
logger.error("Failed connecting tokens and named entities. " +
"The error occurred in the file: {}", pathToFile, e);
}
}
/**
* Attach tags and bases to MascWords in each of the sentences.
*
* @param tagMaps A map of three sub-maps: tokenToTag, from Penn token ID (int) to Penn POS-tag,
* * tokenToBase, from Penn token ID (int) to the base and tokenToQuarks, from Penn token ID
* * (int) to a List of quark IDs contained in that token.
*/
@SuppressWarnings("unchecked")
private void addPennTags(Map> tagMaps) throws IOException {
try {
// Extract individual mappings
Map tokenToTag = (Map) tagMaps.get("tokenToTag");
Map tokenToBase = (Map) tagMaps.get("tokenToBase");
Map tokenToQuarks = (Map) tagMaps.get("tokenToQuarks");
//Check that all tokens have at least one quark.
for (Map.Entry token : tokenToQuarks.entrySet()) {
if (token.getValue().length == 0) {
logger.warn("Token without quarks: {}", token.getKey());
}
}
Map quarkToTokens = new HashMap<>();
for (Map.Entry tokenAndQuarks : tokenToQuarks.entrySet()) {
int token = tokenAndQuarks.getKey();
int[] quarks = tokenAndQuarks.getValue();
for (int quark : quarks) {
//very rarely, one quark may belong to several token
//this is probably a mistake in the corpus annotation
if (quarkToTokens.containsKey(quark)) {
int[] tokens = quarkToTokens.get(quark);
int[] newTokens = new int[tokens.length + 1];
newTokens[0] = token;
System.arraycopy(tokens, 0, newTokens, 1, tokens.length);
logger.warn("One quark belongs to several tokens. f-seg ID: {}.", quark);
logger.warn("The error occurred in file: {}", pathToFile);
quarkToTokens.put(quark, newTokens);
} else {
quarkToTokens.put(quark, new int[] {token});
}
}
}
for (MascSentence s : sentences) {
boolean success = s.tokenizePenn(tokenToQuarks, quarkToTokens, tokenToBase, tokenToTag);
if (!success) {
logger.warn("Issues occurred in the file: {}", pathToFile);
}
}
hasPennTags = true;
} catch (Exception e) {
throw new IOException("Could not attach POS tags to words. " +
e.getMessage() + Arrays.toString(e.getStackTrace()));
}
}
/**
* Checks whether there is Penn tagging produced by GATE-5.0 ANNIE.
*
* @return {@code true} if this file has aligned tags/tokens, {@code false} otherwise.
*/
public boolean hasPennTags() {
return hasPennTags;
}
/**
* Checks whether there is NER by GATE-5.0 ANNIE.
*
* @return {@code true} if this file has named entities, {@code false} otherwise.
*/
public boolean hasNamedEntities() {
return hasNamedEntities;
}
/**
* @return Retrieves the next sentence or {@code null} if end of document reached.
*/
public MascSentence read() {
MascSentence next = null;
if (sentenceIterator.hasNext()) {
next = sentenceIterator.next();
}
return next;
}
/**
* Resets the reading of sentences to the beginning of the document.
*/
public void reset() {
this.sentenceIterator = this.sentences.iterator();
}
}