// de.datexis.model.Document (source via Maven / Gradle / Ivy)
package de.datexis.model;
import com.fasterxml.jackson.annotation.*;
import de.datexis.common.WordHelpers;
import de.datexis.model.tag.Tag;
import de.datexis.preprocess.DocumentFactory;
import de.datexis.preprocess.DocumentFactory.Newlines;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static de.datexis.model.Dataset.random;
/**
* A Document is a piece of text that may contain Sentences, Tokens and Annotations.
* @author sarnold, fgrimme
*/
@JsonPropertyOrder({ "class", "id", "uid", "refUid", "title", "language", "type", "begin", "length", "text", "annotations" })
@JsonTypeInfo(use = JsonTypeInfo.Id.NAME, include = JsonTypeInfo.As.PROPERTY, property = "class", defaultImpl=Document.class)
@JsonIdentityInfo(generator = ObjectIdGenerators.PropertyGenerator.class, property = "id")
@JsonIgnoreProperties(ignoreUnknown = true)
public class Document extends Span {
/**
* All Sentences the Document is referencing
*/
protected List sentences;
/**
* List of Annotations that were assigned to this Span from Gold, Prediction or User sources.
* Only initialized when used.
*/
protected List annotations;
/**
* The ID of this document (e.g. URL)
*/
private String id = null;
/**
* The language of this document
*/
private String language = null;
/**
* The type of this document
*/
private String type = null;
/**
* The title of this document
*/
private String title = null;
/**
* An arbitrary document source that can be used for provenance.
*/
private Object source = null;
/**
* List of Tags that were assigned to this Dataset Gold, Prediction or User sources.
* Only initialized when used.
*/
private EnumMap> assignedTags = null;
/**
 * Creates a Document by running the preprocessing pipeline on plain text.
 * Newline characters in the text are kept.
 * @param text the plain text to split into Sentences and Tokens
 * @return the fully initialized Document
 */
public static Document create(String text) {
  Document result = DocumentFactory.fromText(text, Newlines.KEEP);
  return result;
}
/**
 * Creates an empty Document with no Sentences and no Annotations.
 */
public Document() {
  this.sentences = new ArrayList<>();
  this.annotations = new ArrayList<>();
}
/**
 * Replaces all Sentences of this Document and adjusts the Document's span
 * boundaries to cover them.
 * @param s new value for sentences of this Document instance
 * @deprecated prefer addSentence() so that offsets stay consistent
 */
@Deprecated
public void setSentences(List<Sentence> s) {
  if(s.isEmpty()) {
    sentences = new ArrayList<>();
    begin = 0;
    end = 0;
  } else {
    sentences = s;
    // span covers the first sentence's start through the last sentence's end
    begin = sentences.get(0).getBegin();
    end = sentences.get(sentences.size() - 1).getEnd();
  }
}
/**
 * @return all Sentences in this Document, in text order
 */
@JsonIgnore
public List<Sentence> getSentences() {
  return sentences;
}
/**
 * @return a Stream over all Sentences in this Document
 */
@JsonIgnore
public Stream<Sentence> streamSentences() {
  return sentences.stream();
}
/**
* Returns a single Sentence from this Document.
* @param index position (number of sentences) starting at 0
* @return the Sentence at the given index; throws IndexOutOfBoundsException if out of range
*/
public Sentence getSentence(int index) {
return sentences.get(index);
}
/**
 * Returns the complete Sentence at a given Span position.
 * @param begin a text offset inside the requested Sentence
 * @return the Sentence whose span contains the offset, or empty if no sentence was found at that position
 */
public Optional<Sentence> getSentenceAtPosition(int begin) {
  return getSentences().stream()
      .filter(s -> s.getBegin() <= begin && s.getEnd() > begin)
      .findFirst();
}
/**
 * Returns the Sentence index at a given Span position.
 * @param begin a text offset inside the requested Sentence
 * @return index of the last Sentence starting at or before {@code begin};
 *         -1 if the Document is empty or every Sentence starts after {@code begin}
 */
public int getSentenceIndexAtPosition(int begin) {
  // plain loop instead of a Stream with a peek() side effect (peek is for debugging only)
  int index = -1;
  for(Sentence s : getSentences()) {
    index++;
    // first sentence that starts later: the previous one contains the position
    if(s.getBegin() > begin) return index - 1;
  }
  return index; // position lies in (or after) the last sentence; -1 when empty
}
/**
 * @return all Sentences that are in a given range.
 * @param begin range start offset
 * @param end range end offset
 * @param enclosed - TRUE to return only completely enclosed sentences, FALSE to expand sentences at the boundaries
 */
public Stream<Sentence> streamSentencesInRange(int begin, int end, boolean enclosed) {
  if(enclosed) return getSentences().stream()
      .filter(t -> t.getBegin() >= begin && t.getEnd() <= end);
  else return getSentences().stream()
      .filter(t -> (t.getBegin() <= begin && t.getEnd() > begin) ||              // crosses the range start
                   (t.getBegin() >= begin && t.getEnd() <= end && begin != end) || // fully inside (non-empty range)
                   (t.getBegin() < end && t.getEnd() >= end));                   // crosses the range end
}
/**
 * @return all Tokens in a given range.
 * @param begin range start offset
 * @param end range end offset
 * @param enclosed - TRUE to return only completely enclosed tokens, FALSE to expand tokens at the boundaries
 */
public Stream<Token> streamTokensInRange(int begin, int end, boolean enclosed) {
  if(enclosed) return streamTokens().filter(t -> t.getBegin() >= begin && t.getEnd() <= end);
  else return streamTokens().filter(t -> (t.getBegin() <= begin && t.getEnd() > begin) ||              // crosses the range start
                                         (t.getBegin() >= begin && t.getEnd() <= end && begin != end) || // fully inside (non-empty range)
                                         (t.getBegin() < end && t.getEnd() >= end));                   // crosses the range end
}
/**
 * Returns the Token at the given position across all Sentences of this Document.
 * @param index Token position starting at 0
 * @return the Token, or empty if the index is out of range
 */
public Optional<Token> getToken(int index) {
  // limit(1) before findFirst() was redundant and has been removed
  return streamTokens().skip(index).findFirst();
}
/**
* Returns a random Sentence from this Document
* @return a uniformly sampled Sentence
*/
@JsonIgnore
Sentence getRandomSentence() {
// NOTE(review): throws IllegalArgumentException on an empty Document (nextInt(0)) — confirm callers guarantee sentences exist
int index = random.nextInt(sentences.size());
return getSentence(index);
}
/**
* Appends a Sentence to the end of the document. Span offsets are adjusted accordingly.
* @param s The Sentence to add.
*/
public void addSentence(Sentence s) {
addSentence(s, true);
}
/**
* Appends a Sentence to the end of the document and sets its document reference.
* @param s The Sentence to add.
* @param adjustOffsets TRUE to move the Sentence directly behind the current text end
*        (leaving room for one separator character between sentences); FALSE to keep
*        the Sentence's own offsets and only extend this Document's span.
*/
public void addSentence(Sentence s, boolean adjustOffsets) {
if(adjustOffsets) {
if(sentences.isEmpty()) begin = 0;
int cursor = getEnd();
if (!sentences.isEmpty()) cursor ++; // leave room for one separator character
int length = s.getLength();
// FIXME: setBegin should adjust Token's positions. Or use relative positions throughout.
s.setBegin(cursor);
s.setLength(length);
end = s.getEnd();
} else {
if(sentences.isEmpty()) begin = s.getBegin();
end = s.getEnd();
}
s.setDocumentRef(this);
sentences.add(s);
}
/*
we simply use .endsWith(), hoping that is not too slow
public void append(Document doc) {
if(isEmpty()) append(doc, false);
else if(streamTokens() // check if last token is newline
.reduce((a, b) -> b)
.filter(t -> t.getText().equals("\n"))
.isPresent()
) append(doc, false);
else append(doc, true);
}*/
/**
* Appends another Document's Sentences to this Document.
* The appended Sentences are shifted behind this Document's text end; one
* separator character is assumed unless this text already ends in whitespace.
* Note that the passed Document is mutated as well: its span is shifted and
* its Sentences are re-referenced to this Document.
* @param doc the Document whose Sentences are moved into this one
*/
public void append(Document doc) {
int offset;
// no separator needed if this Document is empty or already ends with a newline or space
if(isEmpty() || getText().endsWith("\n") || getText().endsWith(" ")) offset = getEnd();
else offset = getEnd() + 1;
int length = doc.getLength();
doc.setBegin(doc.getBegin() + offset);
doc.setLength(length);
for(Sentence s : doc.getSentences()) {
s.addOffset(offset);
s.setDocumentRef(this);
sentences.add(s);
setEnd(s.getEnd()); // keep both Documents' spans in sync while appending
doc.setEnd(s.getEnd());
}
}
/**
* Sets the ID of this document (e.g. URL).
* @param id the new identifier, may be NULL
*/
public void setId(String id) {
this.id = id;
}
/**
* @return the ID of this document (e.g. URL), may be NULL
*/
public String getId() {
return this.id;
}
/**
 * Marks whether Tags of the given class have been assigned to this Document
 * for the given source. Tag classes are tracked by canonical class name.
 * @param source Gold, Prediction or User
 * @param tag the Tag class to mark
 * @param exists TRUE to set the flag, FALSE to unset it
 */
public void setTagAvailable(Annotation.Source source, Class<? extends Tag> tag, boolean exists) {
  if(exists) { // set
    if(assignedTags == null) assignedTags = new EnumMap<>(Annotation.Source.class);
    // computeIfAbsent replaces the containsKey/put dance
    assignedTags.computeIfAbsent(source, src -> new TreeSet<>()).add(tag.getCanonicalName());
  } else { // unset
    if(assignedTags != null && assignedTags.containsKey(source)) {
      assignedTags.get(source).remove(tag.getCanonicalName());
    }
  }
}
/**
 * Checks whether Tags of the given class have been assigned to this Document
 * for the given source. (The method name keeps its historic misspelling —
 * "Avaliable" — to stay backward-compatible with existing callers.)
 * @param source Gold, Prediction or User
 * @param tag the Tag class to check
 * @return TRUE if the flag was set via setTagAvailable()
 */
public boolean isTagAvaliable(Annotation.Source source, Class<? extends Tag> tag) {
  if(assignedTags == null) return false;
  Set<String> tags = assignedTags.get(source);
  return tags != null && tags.contains(tag.getCanonicalName());
}
/**
 * Adds a single Annotation to this document and attaches the document reference.
 * @param <A> type of the Annotation
 * @param ann the Annotation to add
 */
public <A extends Annotation> void addAnnotation(A ann) {
  // lazily initialize, presized relative to the number of sentences
  if(annotations == null) annotations = new ArrayList<>(countSentences() * 4);
  ann.setDocumentRef(this);
  annotations.add(ann);
}
/**
* Adds a List of Annotations to this document.
* @param anns The Annotations to add. Duplicates will not be replaced.
*/
public void addAnnotations(List extends Annotation> anns) {
if(annotations == null) annotations = new ArrayList<>(Math.max(countSentences() * 4, anns.size() * 2));
anns.stream().forEach(ann -> ann.setDocumentRef(this));
annotations.addAll(anns);
}
/**
* @return All Annotations attached to this Document.
*/
@JsonIgnore
protected Stream extends Annotation> streamAnnotations() {
if(annotations == null) return Stream.empty();
else return annotations.stream().map(ann -> ann.getClass().cast(ann));
}
/**
* @return All Annotations of Source