All Downloads are FREE. Search and download functionalities are using the official Maven repository.

ixa.kaflib.AnnotationContainer Maven / Gradle / Ivy

package ixa.kaflib;

import java.io.IOException;
import java.io.Serializable;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.SortedSet;
import java.util.TreeSet;
import org.jdom2.Element;

import org.jdom2.Element;
import org.jdom2.JDOMException;

/** A container to keep all annotations of a document (word forms, terms, dependencies, chunks, entities and coreferences). There are different hash maps to index annotations by different properties as ID, sentence... It enables to retrieve annotations by different properties in an effective way. Performance is very important. */
class AnnotationContainer implements Serializable {

    private String rawText;

    /** List to keep all word forms */
    private List text;

    /** Next offset: sum of all words' length plus one char per word */
    private int nextOffset;

    /** List to keep all terms */
    private List terms;

    private Map> marks;

    /** List to keep all dependencies */
    private List deps;

    /** List to keep all chunks */
    private List chunks;

    /** List to keep all named entities */
    private List entities;

    /** List to keep all properties */
    private List properties;

    /** List to keep all categories */
    private List categories;

    /** List to keep all coreferences */
    private List coreferences;

    /** List to keep all timeExpressions */
    private List timeExpressions;

    /** List to keep all tLinks */
    private List tLinks;

    /** List to keep all tLinks */
    private List cLinks;

	/** List to keep all factualities */
	private List factualities;

	/** List to keep all linked entities */
	private List linkedEntities;

	/** List to keep all opinions */
    private List opinions;

    /** List to keep all relations */
    private List relations;

    /** List to keep all predicates */
    private List predicates;

    /** List to keep all trees */
    private List trees;

    /** UNKNOWN annotation layers in plain DOM format */
    private List unknownLayers;

    /** Hash map for mapping word forms to terms. */
    private HashMap> termsIndexedByWF;
    private HashMap>> marksIndexedByWf;
    private HashMap> depsIndexedByTerm;
    private HashMap> chunksIndexedByTerm;
    private HashMap> entitiesIndexedByTerm;
    private HashMap> corefsIndexedByTerm;
    private HashMap> timeExsIndexedByWF;
    private HashMap> factsIndexedByWF;
    private HashMap> linkedEntitiesIndexedByWF;
    private HashMap> propertiesIndexedByTerm;
    private HashMap> categoriesIndexedByTerm;
    private HashMap> opinionsIndexedByTerm;
    private HashMap> relationsIndexedByRelational;
    private HashMap> predicatesIndexedByTerm;

    HashMap> textIndexedBySent;
    HashMap> termsIndexedBySent;
    HashMap>> marksIndexedBySent;
    HashMap> entitiesIndexedBySent;
    HashMap> depsIndexedBySent;
    HashMap> chunksIndexedBySent;
    HashMap> corefsIndexedBySent;
    HashMap> timeExsIndexedBySent;
    HashMap> factsIndexedBySent;
    HashMap> linkedEntitiesIndexedBySent;
    HashMap> propertiesIndexedBySent;
    HashMap> categoriesIndexedBySent;
    HashMap> opinionsIndexedBySent;
    HashMap> relationsIndexedBySent;
    HashMap> predicatesIndexedBySent;
    HashMap> constituentsIndexedBySent;

    HashMap> sentsIndexedByParagraphs;


    /** This creates a new AnnotationContainer object */
    AnnotationContainer() {
	rawText = new String();
	text = new ArrayList();
	nextOffset = 0;
	terms = new ArrayList();
	marks = new HashMap();
	deps = new ArrayList();
	chunks = new ArrayList();
	entities = new ArrayList();
	properties = new ArrayList();
	categories = new ArrayList();
	coreferences = new ArrayList();
	timeExpressions = new ArrayList();
	tLinks = new ArrayList();
	cLinks = new ArrayList();
	factualities = new ArrayList();
	linkedEntities = new ArrayList();
	opinions = new ArrayList();
	relations = new ArrayList();
	predicates = new ArrayList();
	trees = new ArrayList();
	unknownLayers = new ArrayList();

	termsIndexedByWF = new HashMap>();
	marksIndexedByWf = new HashMap>>();
	depsIndexedByTerm = new HashMap>();
	chunksIndexedByTerm =  new HashMap>();
	entitiesIndexedByTerm =  new HashMap>();
	corefsIndexedByTerm =  new HashMap>();
	timeExsIndexedByWF =  new HashMap>();
	linkedEntitiesIndexedByWF =  new HashMap>();
	factsIndexedByWF = new HashMap>();
	propertiesIndexedByTerm =  new HashMap>();
	categoriesIndexedByTerm =  new HashMap>();
	opinionsIndexedByTerm =  new HashMap>();
	relationsIndexedByRelational =  new HashMap>();
	predicatesIndexedByTerm = new HashMap>();

	textIndexedBySent = new HashMap>();
	termsIndexedBySent = new HashMap>();
	marksIndexedBySent = new HashMap>>();
	entitiesIndexedBySent = new HashMap>();
	depsIndexedBySent = new HashMap>();
	chunksIndexedBySent = new HashMap>();
	corefsIndexedBySent = new HashMap>();
	timeExsIndexedBySent = new HashMap>();
	linkedEntitiesIndexedBySent = new HashMap>();
	factsIndexedBySent =new HashMap>();
	propertiesIndexedBySent = new HashMap>();
	categoriesIndexedBySent = new HashMap>();
	opinionsIndexedBySent = new HashMap>();
	relationsIndexedBySent = new HashMap>();
	predicatesIndexedBySent = new HashMap>();
	constituentsIndexedBySent = new HashMap>();

	sentsIndexedByParagraphs = new HashMap>();
    }

    private  void indexBySent(T annotation, Integer sent, HashMap> index) {
	if (sent > 0) {
	    if (index.get(sent) == null) {
		index.put(sent, new ArrayList());
	    }
	    index.get(sent).add(annotation);
	}
    }

    private void indexMarkBySent(Mark mark, String source, Integer sent) {
	if (sent > 0) {
	    if (marksIndexedBySent.get(sent) == null) {
		marksIndexedBySent.put(sent, new HashMap>());
	    }
	    if (marksIndexedBySent.get(sent).get(source) == null) {
		marksIndexedBySent.get(sent).put(source, new ArrayList());
	    }
	    marksIndexedBySent.get(sent).get(source).add(mark);
	}
    } 

    void indexSentByPara(Integer sent, Integer para) {
	if ((sent > 0) && (para > 0)) {
	    if (this.sentsIndexedByParagraphs.get(para) == null) {
		this.sentsIndexedByParagraphs.put(para, new LinkedHashSet());
	    }
	    this.sentsIndexedByParagraphs.get(para).add(sent);
	}
    }

    public List getSentsByParagraph(Integer para) {
	return new ArrayList(this.sentsIndexedByParagraphs.get(para));
    }

     List getLayerByPara(Integer para, HashMap> index) {
	List layer = new ArrayList();
	for (Integer sent : this.getSentsByParagraph(para)) {
	    layer.addAll(index.get(sent));
	}
	return layer;
    }

    String getRawText() {
	return rawText;
    }

    /** Returns all word forms. */
    List getText() {
	return text;
    }

    /** Returns all terms */
    List getTerms() {
	return terms;
    }

    List getMarkSources() {
	return new ArrayList(marks.keySet());
    }

    List getMarks(String source) {
	return (marks.get(source) == null) ? new ArrayList() : marks.get(source);
    }

    /** Returns all dependencies */
    List getDeps() {
	return deps;
    }

    /** Returns all chunks */
    List getChunks() {
	return chunks;
    }

    /** Returns all named entities */
    List getEntities() {
	return entities;
    }

    /** Returns all properties */
    List getProperties() {
	return properties;
    }

    /** Returns all categories */
    List getCategories() {
	return categories;
    }

    /** Returns all coreferences */
    List getCorefs() {
	return coreferences;
    }

    /** Returns all timeExpressions */
    List getTimeExs() {
	return timeExpressions;
    }

    /** Returns all tlinks */
    List getTLinks() {
	return this.tLinks;
    }

    /** Returns all clinks */
    List getCLinks() {
	return this.cLinks;
    }

	List getFactualities() {
		return factualities;
	}

	List getLinkedEntities() {
		return linkedEntities;
	}

	/** Returns all opinions */
    List getOpinions() {
	return opinions;
    }

    /** Returns all relations */
    List getRelations() {
	return relations;
    }

    /** Returns all predicates */
    List getPredicates() {
	return predicates;
    }

    /** Returns all trees */
    List getConstituents() {
	return trees;
    }

    /** Returns all unknown layers as a DOM Element list */
    List getUnknownLayers() {
	return unknownLayers;
    }

    void setRawText(String str) {
	rawText = str;
    }

    /** Adds a word form to the container */
    void add(WF wf) {
	text.add(wf);
	//nextOffset += wf.getLength() + 1;
	this.indexBySent(wf, wf.getSent(), this.textIndexedBySent);
    }

    private  void indexAnnotation(T annotation, String hashId, HashMap> index) {
	if (index.get(hashId) == null) {
	    index.put(hashId, new ArrayList());
	}
	index.get(hashId).add(annotation);
    }

    private void indexMarkByWf(Mark mark, String source, String tid) {
	if (marksIndexedByWf.get(tid) == null) {
	    marksIndexedByWf.put(tid, new HashMap>());
	}
	if (marksIndexedByWf.get(tid).get(source) == null) {
	    marksIndexedByWf.get(tid).put(source, new ArrayList());
	}
	marksIndexedByWf.get(tid).get(source).add(mark);
    }

    /** Adds a term to the container */
    void add(Term term) {
	this.add(term, this.terms.size());
    }

    void add(Term term, int index) {
	terms.add(index, term);
	for (WF wf : term.getWFs()) {
	    indexAnnotation(term, wf.getId(), termsIndexedByWF);
	}
	if (!term.isComponent()) {
	    this.indexBySent(term, term.getSent(), this.termsIndexedBySent);
	}
    }

    void remove(Term term) {
	this.terms.remove(term);
    }

    void add(Mark mark, String source) {
	List sourceMarks = marks.get(source);
	if (sourceMarks == null) {
	    sourceMarks = new ArrayList();
	}
	sourceMarks.add(mark);
	marks.put(source, sourceMarks);
	for (WF wf : mark.getSpan().getTargets()) {
	    indexMarkByWf(mark, source, wf.getId());
	}
        this.indexMarkBySent(mark, source, mark.getSpan().getTargets().get(0).getSent());
    }

    /** Adds a dependency to the container */
    void add(Dep dep) {
	deps.add(dep);
	/* Index by 'from' and 'to' terms */
	if (dep.getFrom() != null) {
	    String tId = dep.getFrom().getId();
	    indexAnnotation(dep, tId, depsIndexedByTerm);
	}
	if (dep.getTo() != null) {
	    String tId = dep.getTo().getId();
	    indexAnnotation(dep, tId, depsIndexedByTerm);
	}
	this.indexBySent(dep, dep.getFrom().getSent(), this.depsIndexedBySent);
    }

    /** Adds a chunk to the container */
    void add(Chunk chunk) {
	chunks.add(chunk);
	/* Index by terms */
	for (Term term : chunk.getTerms()) {
	    indexAnnotation(chunk, term.getId(), chunksIndexedByTerm);
	}
	this.indexBySent(chunk, chunk.getSpan().getTargets().get(0).getSent(), this.chunksIndexedBySent);
    }

    /** Adds a named entity to the container */
    void add(Entity entity) {
	entities.add(entity);
	/* Index by terms */
	for (Term term : entity.getTerms()) {
	    indexAnnotation(entity, term.getId(), entitiesIndexedByTerm);
	}
	this.indexBySent(entity, entity.getSpans().get(0).getTargets().get(0).getSent(), this.entitiesIndexedBySent);
    }

    /** Adds a feature to the container. It checks if it is a property or a category. */
    void add(Feature feature) {
	if (feature.isAProperty()) {
	    properties.add(feature);
	    /* Index by terms */
	    for (Term term : feature.getTerms()) {
		indexAnnotation(feature, term.getId(), propertiesIndexedByTerm);
	    }
	    //this.indexBySent(feature, feature.getSpans().get(0).getTargets().get(0).getSent(), this.propertiesIndexedBySent);
	}
	else {
	    categories.add(feature);
	    /* Index by terms */
	    for (Term term : feature.getTerms()) {
		indexAnnotation(feature, term.getId(), categoriesIndexedByTerm);
	    }
	    //this.indexBySent(feature, feature.getSpans().get(0).getTargets().get(0).getSent(), this.categoriesIndexedBySent);
	}
    }

    /** Adds a coreference to the container */
    void add(Coref coref) {
	coreferences.add(coref);
	/* Index by terms */
	for (Term term : coref.getTerms()) {
	    indexAnnotation(coref, term.getId(), corefsIndexedByTerm);
	}
	//this.indexBySent(coref, coref.getSpans().get(0).getTargets().get(0).getSent(), this.corefsIndexedBySent);
    }

    /** Adds a timeExpression to the container */
    void add(Timex3 timex3) {
	timeExpressions.add(timex3);
	/* Index by terms */
	if(timex3.hasSpan()){
	    for (WF wf : timex3.getSpan().getTargets()) {
		indexAnnotation(timex3, wf.getId(), timeExsIndexedByWF);
	    }
	}
    }

    /** Adds a tlink to the container */
    void add(TLink tLink) {
	tLinks.add(tLink);
	/* Index by from/to (???) */
    }

    /** Adds a clink to the container */
    void add(CLink cLink) {
	cLinks.add(cLink);
	/* Index by from/to (???) */
    }

	/** Adds a factuality to the container */
	void add(Factuality factuality) {
	    factualities.add(factuality);
	    /* Index by terms */
	    indexAnnotation(factuality, factuality.getWF().getId(), factsIndexedByWF);
	}

	/** Adds a linked entity to the container */
	void add(LinkedEntity linkedEntity) {
		linkedEntities.add(linkedEntity);
	/* Index by terms */
		if(linkedEntity.getWFs() != null){
			for (WF wf : linkedEntity.getWFs().getTargets()) {
				indexAnnotation(linkedEntity, wf.getId(), linkedEntitiesIndexedByWF);
			}
		}
	}

	/** Adds an opinion to the container */
    void add(Opinion opinion) {
	opinions.add(opinion);
	/* Index by terms */
	/* Ezin hemen indexatu, terminoak oraindik ez baitira gehitu!!!
	LinkedHashSet terms = new LinkedHashSet();
	terms.addAll(opinion.getOpinionHolder().getTerms());
	terms.addAll(opinion.getOpinionTarget().getTerms());
	terms.addAll(opinion.getOpinionExpression().getTerms());	
	for (Term term : terms) {
	    indexAnnotation(opinion, term.getId(), opinionsIndexedByTerm);
	}
	*/

    }

    /** Adds a relation to the container */
    void add(Relation relation) {
	relations.add(relation);
	/* Index by 'from' and 'to' terms */
	if (relation.getFrom() != null) {
	    String rId = relation.getFrom().getId();
	    indexAnnotation(relation, rId, relationsIndexedByRelational);
	}
	if (relation.getTo() != null) {
	    String rId = relation.getTo().getId();
	    indexAnnotation(relation, rId, relationsIndexedByRelational);
	}
    }

    /** Adds a predicate to the container */
    void add(Predicate predicate) {
	predicates.add(predicate);
	/* Index by terms */
	for (Term term : predicate.getTerms()) {
	    indexAnnotation(predicate, term.getId(), predicatesIndexedByTerm);
	}
	this.indexBySent(predicate, predicate.getSpan().getTargets().get(0).getSent(), this.predicatesIndexedBySent);
    }

    /** Adds a tree to the container */
    void add(Tree tree) {
	trees.add(tree);
	TreeNode currentNode = tree.getRoot();
	while (!currentNode.isTerminal()) {
	    currentNode = ((NonTerminal) currentNode).getChildren().get(0);
	}
	Integer sent = ((Terminal) currentNode).getSpan().getTargets().get(0).getSent(); 
	this.indexBySent(tree, sent, this.constituentsIndexedBySent);
    }

    /** Adds an unknown layer to the container in DOM format */
    void add(Element layer) {
	unknownLayers.add(layer);
    }

    /** Index a Term by its sentence number */
    void indexTermBySent(Term term, Integer sent) {
	if (sent == -1) {
	    throw new IllegalStateException("You can't call indexTermBySent not having defined the sentence for its WFs");
	}
	List sentTerms = termsIndexedBySent.get(sent);
	if (sentTerms == null) {
	    sentTerms = new ArrayList();
	    termsIndexedBySent.put(sent, sentTerms);
	}
	sentTerms.add(term);
    }

    /** Returns all tokens classified by sentences */
    List> getSentences() {
	List> sentences = new ArrayList>();
	Set sentNumsSet = this.textIndexedBySent.keySet();
        List sentNumsList = new ArrayList(sentNumsSet);
	Collections.sort(sentNumsList);
	for (int i : sentNumsList) {
	    List wfs = this.textIndexedBySent.get(i);
	    sentences.add(wfs);
	}
	return sentences;
    }

    Integer termPosition(Term term) {
	return this.terms.indexOf(term);
    }

    /** Returns WFs from a sentence */
    List getSentenceWFs(int sent) {
        return this.textIndexedBySent.get(sent);
    }

    /** Returns terms from a sentence */
    List getSentenceTerms(int sent) {
        return this.termsIndexedBySent.get(sent);
    }

    Term getTermByWF(WF wf) {
	List terms = this.termsIndexedByWF.get(wf.getId());
	if (terms == null) {
	    return null;
	}
	return terms.get(0);
    }

    List getTermsByWF(WF wf) {
	List terms = this.termsIndexedByWF.get(wf.getId());
	return (terms == null) ? new ArrayList() : terms;
    }

    /** Returns a list of terms containing the word forms given on argument.
     * @param wfIds a list of word form IDs whose terms will be found.
     * @return a list of terms containing the given word forms.
     */
    List getTermsByWFs(List wfs) {
	LinkedHashSet terms = new LinkedHashSet();
	for (WF wf : wfs) {
	    terms.addAll(getTermsByWF(wf));
	}
	return new ArrayList(terms);
    }

    List getMarksByWf(WF wf, String source) {
	Map> marks = this.marksIndexedByWf.get(wf.getId());
	if (marks == null) {
	    return new ArrayList();
	}
	List sourceMarks = marks.get(source);
	return (sourceMarks == null) ? new ArrayList() : sourceMarks;
    }

    List getDepsByTerm(Term term) {
	List deps = this.depsIndexedByTerm.get(term.getId());
	return (deps == null) ? new ArrayList() : deps;
    }

    List getChunksByTerm(Term term) {
	List chunks = this.chunksIndexedByTerm.get(term.getId());
	return (chunks == null) ? new ArrayList() : chunks;
    }

    List getEntitiesByTerm(Term term) {
	List entities = this.entitiesIndexedByTerm.get(term.getId());
	return (entities == null) ? new ArrayList() : entities;
    }

    List getCorefsByTerm(Term term) {
	List corefs = this.corefsIndexedByTerm.get(term.getId());
	return (corefs == null) ? new ArrayList() : corefs;
    }

    List getTimeExsByWF(WF wf) {
	List timeExs = this.timeExsIndexedByWF.get(wf.getId());
	return (timeExs == null) ? new ArrayList() : timeExs;
    }

    List getPropertiesByTerm(Term term) {
	List properties = this.propertiesIndexedByTerm.get(term.getId());
	return (properties == null) ? new ArrayList() : properties;
    }

    List getCategoriesByTerm(Term term) {
	List categories = this.categoriesIndexedByTerm.get(term.getId());
	return (categories == null) ? new ArrayList() : categories;
    }

    List getOpinionsByTerm(Term term) {
	List opinions = this.opinionsIndexedByTerm.get(term.getId());
	return (opinions == null) ? new ArrayList() : opinions;
    }

    List getRelationsByRelational(Relational relational) {
	List relations = this.relationsIndexedByRelational.get(relational.getId());
	return (relations == null) ? new ArrayList() : relations;
    }

    List getPredicatesByTerm(Term term) {
	List predicates = this.predicatesIndexedByTerm.get(term.getId());
	return (predicates == null) ? new ArrayList() : predicates;
    }

    List getDepsByTerms(List terms) {
	LinkedHashSet deps = new LinkedHashSet();
	for (Term term : terms) {
	    deps.addAll(getDepsByTerm(term));
	}
	return new ArrayList(deps);
    }

    List getChunksByTerms(List terms) {
	LinkedHashSet chunks = new LinkedHashSet();
	for (Term term : terms) {
	    chunks.addAll(getChunksByTerm(term));
	}
	return new ArrayList(chunks);
    }

    List getEntitiesByTerms(List terms) {
	LinkedHashSet entities = new LinkedHashSet();
	for (Term term : terms) {
	    entities.addAll(getEntitiesByTerm(term));
	}
	return new ArrayList(entities);
    }

    List getCorefsByTerms(List terms) {
	LinkedHashSet corefs = new LinkedHashSet();
	for (Term term : terms) {
	    corefs.addAll(getCorefsByTerm(term));
	}
	return new ArrayList(corefs);
    }

    List getTimeExsByWFs(List wfs) {
	LinkedHashSet timeExs = new LinkedHashSet();
	for (WF wf : wfs) {
	    timeExs.addAll(getTimeExsByWF(wf));
	}
	return new ArrayList(timeExs);
    }

    List getPropertiesByTerms(List terms) {
	LinkedHashSet properties = new LinkedHashSet();
	for (Term term : terms) {
	    properties.addAll(getPropertiesByTerm(term));
	}
	return new ArrayList(properties);
    }

    List getCategoriesByTerms(List terms) {
	LinkedHashSet categories = new LinkedHashSet();
	for (Term term : terms) {
	    categories.addAll(getCategoriesByTerm(term));
	}
	return new ArrayList(categories);
    }

    List getOpinionsByTerms(List terms) {
	LinkedHashSet opinions = new LinkedHashSet();
	for (Term term : terms) {
	    opinions.addAll(getOpinionsByTerm(term));
	}
	return new ArrayList(opinions);
    }

    List getRelationsByRelationals(List relationals) {
	LinkedHashSet relations = new LinkedHashSet();
	for (Relational relational : relationals) {
	    relations.addAll(getRelationsByRelational(relational));
	}
	return new ArrayList(relations);
    }

    List getPredicatesByTerms(List terms) {
	LinkedHashSet predicates = new LinkedHashSet();
	for (Term term : terms) {
	    predicates.addAll(getPredicatesByTerm(term));
	}
	return new ArrayList(predicates);
    }

    /** Returns next WF's offset. */
    int getNextOffset() {
	return nextOffset;
    }


    /** Deprecated. Returns a list of terms containing the word forms given on argument.
     * @param wfIds a list of word form IDs whose terms will be found.
     * @return a list of terms containing the given word forms.
     */
    List getTermsByWFIds(List wfIds) {
	LinkedHashSet terms = new LinkedHashSet();
	for (String wfId : wfIds) {
	    terms.addAll(this.termsIndexedByWF.get(wfId));
	}
	return new ArrayList(terms);
    }

    void removeLayer(KAFDocument.Layer layer) {
	switch (layer) {
	case text:
	    this.text.clear();
	    break;
	case terms:
	    this.terms.clear();
	    break;
	case deps:
	    this.deps.clear();
	    break;
	case chunks:
	    this.chunks.clear();
	    break;
	case entities:
	    this.entities.clear();
	    break;
	case properties:
	    this.properties.clear();
	    break;
	case categories:
	    this.categories.clear();
	    break;
	case coreferences:
	    this.coreferences.clear();
	    break;
	case opinions:
	    this.opinions.clear();
	    break;
	case relations:
	    this.relations.clear();
	    break;
	case srl:
	    this.predicates.clear();
	    break;
	case constituency:
	    this.trees.clear();
	    break;
	default:
	    throw new IllegalArgumentException("Wrong layer");
	}
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy