All downloads are free. The search and download functionality uses the official Maven repository.

de.julielab.geneexpbase.genemodel.GeneMention Maven / Gradle / Ivy

package de.julielab.geneexpbase.genemodel;

import cc.mallet.types.FeatureVector;
import cc.mallet.types.InstanceList;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import de.julielab.geneexpbase.TermNormalizer;
import de.julielab.geneexpbase.candidateretrieval.SynHit;
import de.julielab.java.utilities.spanutils.Span;
import org.apache.commons.lang3.Range;
import org.apache.lucene.search.Query;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.*;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.function.Supplier;
import java.util.stream.Collector;
import java.util.stream.Collectors;
import java.util.stream.Stream;


/**
 * A basic "gene mention" that most of all contains the text of the mention.
 * However, we might also need other information, i.e. offsets.
 *
 * @author faessler
 */
public class GeneMention implements Span {
    /**
     * Constant meaning that no ID is given for a GeneMention.
     */
    public static final String NOID = "NoId";
    private final static Logger log = LoggerFactory.getLogger(GeneMention.class);
    private Object originalMappedObject;
    private String docId;
    private GeneName geneName;
    /**
     * @deprecated a single mention can have multiple (taxonomy) IDs (human and murine pro-alpha3(V)...)
     */
    private String id = NOID;
    private TermNormalizer normalizer;
    // Begin/end offsets; built with Range.between(int, int) and read back as int.
    private Range<Integer> offsets;
    private String text;
    /**
     * @deprecated refer to {@link #overlappingGoldMentions}
     */
    private String goldTaxonomyId;
    private List<GeneMention> overlappingGoldMentions;
    /**
     * @deprecated a single mention can have multiple (taxonomy) IDs (human and murine pro-alpha3(V)...)
     */
    private String taxonomyId;
    // Starts as the immutable empty list; addId() swaps in a mutable list before the first insertion.
    private List<String> ids = Collections.emptyList();
    /**
     * Taxonomy ID candidates for this gene mention. Used during species assignment.
     */
    private Set<String> taxonomyCandidates;
    /**
     * Indicates whether all {@link #taxonomyCandidates} should be assigned or only one of them.
     */
    private boolean isTaxonomyCandidatesConjunctive;
    /**
     * Final taxonomy IDs assigned to this gene mention.
     */
    private List<String> taxonomyIds = Collections.emptyList();
    // Maps a taxonomy ID to the occurrence types recorded for it.
    private Multimap<String, GeneSpeciesOccurrence> taxonomyOcurrences = HashMultimap.create();
    private Map<String, Double> taxonomyScores;
    /**
     * These will mostly be the same as {@link #taxonomyScores} but can differ in some cases, e.g. when
     * the fallback to the default species has happened since all other candidates seemed too unlikely.
     */
    private Map<String, Double> processedTaxonomyScores;
    // Lazily built set view of the taxonomy IDs; reset to null whenever the taxonomy ID list changes.
    private Set<String> taxonomyIdsSet;
    private boolean specificTypeFrozen;
    private GeneSpeciesOccurrence taxonomyReliability;
    private String documentContext;
    private Query contextQuery;
    private GeneTagger tagger = GeneTagger.UNKNOWN;
    private SpecificType specificType = SpecificType.UNKNOWN;
    private double specificTypeConfidence;
    private MentionMappingResult mentionMappingResult;
    private GeneDocument geneDocument;
    private List<String> taggingModifiers;
    // The gene set to which this gene belongs
    private GeneSets geneSets;
    /**
     * A parent GeneMention is a GeneMention that has been split into sub-mentions,
     * most commonly due to conjunctions or enumerations within a GeneMention. Thus,
     * when parent is not null, this GeneMention resulted from a split of another
     * GeneMention.
     */
    private GeneMention parent;
    /**
     * A GeneMention that is actually a composite to be expanded to multiple different genes, like enumerations
     * or numerical ranges, can have derived GeneMentions corresponding to the individual genes denoted by the
     * composite expression. Those are the children.
     */
    private List<GeneMention> children;
    private boolean isCompositeMention;
    // NOTE(review): the element type of this list is not recoverable from this copy of the source, so it is
    // left raw - confirm against the original and add the type argument.
    private List posTags;
    private FeatureVector featureVector;
    private String reducedNameForExactMatch;
    private InstanceList instances;
    private List<SynHit> familyNames;
    private String bestCandidateSynonym;
    private String compositeResolver;
    // Lazily computed by getNameTokenSet().
    private Set<String> nameTokenSet;
    private Map<String, Double> familyFeatures = new HashMap<>();

    /**
     * Makes a copy of the given GeneMention.
     * <p>
     * The gene name, the ID list, the taxonomy ID list, the children list and the family features are duplicated
     * so that later additions to the copy do not leak into {@code gm}. The MentionMappingResult, if present, is
     * passed through its own copy constructor. All other copied fields are shared by reference with {@code gm};
     * fields that are not assigned below keep their default values.
     *
     * @param gm The gene mention to copy.
     */
    public GeneMention(GeneMention gm) {
        if (gm.geneName != null)
            this.geneName = new GeneName(gm.geneName);
        this.contextQuery = gm.contextQuery;
        this.docId = gm.docId;
        this.documentContext = gm.documentContext;
        this.geneDocument = gm.geneDocument;
        this.id = gm.id;
        if (gm.ids != null)
            this.ids = new ArrayList<>(gm.ids);
        this.normalizer = gm.normalizer;
        this.offsets = gm.offsets;
        this.tagger = gm.tagger;
        this.taxonomyOcurrences = gm.taxonomyOcurrences;
        this.taxonomyId = gm.taxonomyId;
        // getTaxonomyIds() may hand out the source's own list or an immutable singleton; copy it like the IDs
        // so that addTaxonomyId() on the copy neither fails nor modifies the source mention.
        this.taxonomyIds = new ArrayList<>(gm.getTaxonomyIds());
        this.taxonomyScores = gm.taxonomyScores;
        this.processedTaxonomyScores = gm.processedTaxonomyScores;
        this.text = gm.text;
        this.originalMappedObject = gm.originalMappedObject;
        this.parent = gm.parent;
        // The source may have been created through the no-arg constructor and then has no children list.
        this.children = gm.children == null ? Collections.emptyList() : new ArrayList<>(gm.children);
        this.overlappingGoldMentions = gm.overlappingGoldMentions;
        this.specificType = gm.getSpecificType();
        this.specificTypeConfidence = gm.getSpecificTypeConfidence();
        if (gm.getMentionMappingResult() != null)
            this.mentionMappingResult = new MentionMappingResult(gm.getMentionMappingResult());
        this.familyFeatures = new HashMap<>(gm.familyFeatures);
    }

    /**
     * Creates a gene mention for the given text with an empty list of children.
     *
     * @param text The text of the mention.
     */
    public GeneMention(String text) {
        this();
        this.children = Collections.emptyList();
        this.text = text;
    }


    /**
     * Creates an empty gene mention without text or offsets. The children list starts out as the immutable
     * empty list so that {@link #addChild(GeneMention)} and {@link #isCompositeMention()}, which both call
     * {@code children.isEmpty()}, do not fail on instances created through this constructor.
     */
    public GeneMention() {
        this.children = Collections.emptyList();
    }

    /**
     * Creates a gene mention for the given text and registers the normalizer via
     * {@link #setNormalizer(TermNormalizer)}.
     *
     * @param text       The text of the mention.
     * @param normalizer The term normalizer to set on this mention.
     */
    public GeneMention(String text, TermNormalizer normalizer) {
        this(text);
        setNormalizer(normalizer);
    }

    /**
     * Creates a gene mention for the given text and stores its offsets as a range between {@code begin} and
     * {@code end}.
     *
     * @param text  The text of the mention.
     * @param begin The begin offset.
     * @param end   The end offset.
     */
    public GeneMention(String text, int begin, int end) {
        this(text);
        offsets = Range.between(begin, end);
    }

    /**
     * Creates a gene mention with text and offsets and registers the normalizer via
     * {@link #setNormalizer(TermNormalizer)}.
     *
     * @param text       The text of the mention.
     * @param begin      The begin offset.
     * @param end        The end offset.
     * @param normalizer The term normalizer to set on this mention.
     */
    public GeneMention(String text, int begin, int end, TermNormalizer normalizer) {
        this(text, begin, end);
        setNormalizer(normalizer);
    }

    /**
     * Stores a family feature value under the given name.
     *
     * @param featureName The name of the feature.
     * @param value       The feature value.
     * @return The value previously stored under {@code featureName}, or {@code null} if there was none.
     */
    public Double addFamilyFeature(String featureName, double value) {
        final Double previousValue = this.familyFeatures.put(featureName, value);
        return previousValue;
    }

    /**
     * @return The internal, mutable map from feature name to feature value as filled by
     * {@link #addFamilyFeature(String, double)}.
     */
    public Map<String, Double> getFamilyFeatures() {
        return familyFeatures;
    }

    /**
     * @return {@code true} if family names have been set on this mention and the list is not empty.
     */
    public boolean matchesFamilyName() {
        if (familyNames == null) {
            return false;
        }
        return !familyNames.isEmpty();
    }

    /**
     * Returns the one gene set this mention belongs to.
     *
     * @return The single gene set associated with this mention.
     * @throws IllegalArgumentException If the number of associated gene sets is not exactly one. This now also
     *                                  covers the case that no gene set was ever added, which used to surface
     *                                  as a NullPointerException.
     */
    public GeneSet getSingleGeneSet() {
        int numGeneSets = geneSets == null ? 0 : geneSets.size();
        if (numGeneSets != 1)
            throw new IllegalArgumentException("There is not a single geneset associated with this gene mention but there are " + numGeneSets + " for gene mention " + this + ": " + geneSets);
        return geneSets.stream().findAny().orElseThrow();
    }

    /**
     * Associates this mention with the given gene set. The gene set collection is created on first use.
     *
     * @param geneSet The gene set to add; must be neither null nor empty.
     * @throws IllegalArgumentException If {@code geneSet} is null or empty.
     */
    public void addGeneSet(GeneSet geneSet) {
        final boolean isNull = geneSet == null;
        if (isNull || geneSet.isEmpty()) {
            throw new IllegalArgumentException("The passed geneset is " + (isNull ? "null" : "empty") + ".");
        }
        if (this.geneSets == null) {
            this.geneSets = new GeneSets();
        }
        this.geneSets.add(geneSet);
    }

    /**
     * @return Whether all taxonomy candidates should be assigned ({@code true}) or only one of them.
     */
    public boolean isTaxonomyCandidatesConjunctive() {
        return this.isTaxonomyCandidatesConjunctive;
    }

    /**
     * @param taxonomyCandidatesConjunctive Whether all taxonomy candidates should be assigned or only one of them.
     */
    public void setTaxonomyCandidatesConjunctive(boolean taxonomyCandidatesConjunctive) {
        this.isTaxonomyCandidatesConjunctive = taxonomyCandidatesConjunctive;
    }

    /**
     * @return The taxonomy ID candidates of this mention, or an immutable empty set if none have been set.
     */
    public Set<String> getTaxonomyCandidates() {
        return taxonomyCandidates != null ? taxonomyCandidates : Collections.emptySet();
    }

    /**
     * @param taxonomyCandidates The taxonomy ID candidates for this mention; stored as-is, not copied.
     */
    public void setTaxonomyCandidates(Set<String> taxonomyCandidates) {
        this.taxonomyCandidates = taxonomyCandidates;
    }

    /**
     * @return The family name hits set for this mention; {@code null} if none have been set.
     */
    public List<SynHit> getFamilyNames() {
        return familyNames;
    }

    /**
     * @param matchedFamilyNames The family name hits for this mention; stored as-is, not copied.
     */
    public void setFamilyNames(List<SynHit> matchedFamilyNames) {
        familyNames = matchedFamilyNames;
    }

    /**
     * Returns the taxonomy scores that have undergone threshold filtering. It is possible that this is not
     * the outcome of the ML-based approach but just the default species of a document due to threshold filtering.
     *
     * @return The map from taxonomy ID to processed score; {@code null} if no processed score has been set.
     */
    public Map<String, Double> getProcessedTaxonomyScores() {
        return processedTaxonomyScores;
    }

    /**
     * @param processedTaxonomyScores The map from taxonomy ID to processed score; stored as-is, not copied.
     */
    public void setProcessedTaxonomyScores(Map<String, Double> processedTaxonomyScores) {
        this.processedTaxonomyScores = processedTaxonomyScores;
    }

    /**
     * @return The first overlapping gold mention, or {@code null} if this mention has no gold mentions.
     */
    public GeneMention getFirstGoldMention() {
        return hasGoldMentions() ? overlappingGoldMentions.get(0) : null;
    }

    /**
     * 

The original UIMA annotation that is mapped. Most likely a subclass of EntityMention.

* * @return The original object to be mapped. */ public Object getOriginalMappedObject() { return originalMappedObject; } public void setOriginalMappedObject(Object originalMappedObject) { this.originalMappedObject = originalMappedObject; } public List getIds() { return ids; } public void setIds(List ids) { assert !ids.stream().anyMatch(Objects::isNull) : "There is a null item in the IDs to be set."; assert ids.indexOf("null") == -1 : "The string 'null' is among the IDs to be set."; this.ids = ids; } public Stream getMappedSynHits() { if (mentionMappingResult == null) throw new IllegalArgumentException("This gene mention was not yet mapped, there are no final ranked candidates."); return mentionMappingResult.getResultCandidates(); } public Stream getMappedIds() { assert getMappedSynHits().filter(Predicate.not(SynHit::isRejectionCandidate)).map(SynHit::getId).noneMatch(Objects::isNull) : "A null ID is returned for " + this; return getMappedSynHits().filter(Predicate.not(SynHit::isRejectionCandidate)).map(SynHit::getId); } public Set getMappedIdSet() { return getMappedIds().collect(Collectors.toSet()); } public void addId(String id) { if (ids.isEmpty()) ids = new ArrayList<>(); ids.add(id); } public void addTaxonomyId(String id) { taxonomyIdsSet = null; if (taxonomyIds.isEmpty()) taxonomyIds = new ArrayList<>(); taxonomyIds.add(id); } public List getTaxonomyIds() { if (taxonomyIds != null) { return taxonomyIds; } if (taxonomyId != null) return Collections.singletonList(taxonomyId); return Collections.emptyList(); } public void setTaxonomyIds(List taxonomyIds) { this.taxonomyIds = taxonomyIds; taxonomyIdsSet = null; } public List getNonRejectedTaxonomyIds() { if (mentionMappingResult == null) return getTaxonomyIds(); // we check the lexically reranked candidates for the rejection because it is not set to the original candidates since those should remain the original list, even if empty return getTaxonomyIds().stream().filter(taxId -> 
!mentionMappingResult.tax2lexicallyRerankedCandidates.get(taxId).get(0).isRejectionCandidate()).collect(Collectors.toList()); } public Set getTaxonomyIdsSet() { if (taxonomyIdsSet == null) taxonomyIdsSet = new HashSet<>(getTaxonomyIds()); return taxonomyIdsSet; } public void addChild(GeneMention child) { if (children.isEmpty()) children = new ArrayList<>(); children.add(child); } public boolean isCompositeMention() { return isCompositeMention || !children.isEmpty(); } public List getOverlappingGoldMentions() { return overlappingGoldMentions; } public void setOverlappingGoldMentions(List overlappingGoldMentions) { this.overlappingGoldMentions = overlappingGoldMentions; } public String getAnyGoldTaxonomyId() { if (!hasGoldMentions()) return null; return overlappingGoldMentions.get(0).getTaxonomyIds().get(0); } public List getAnyGoldTaxonomyIds() { if (!hasGoldMentions()) return Collections.emptyList(); return overlappingGoldMentions.get(0).getTaxonomyIds(); } public List getAllGoldTaxonomyIdsAsList() { return getAllGoldTaxonomyIds(Collectors.toList(), Collections::emptyList); } public Set getAllGoldTaxonomyIdsAsSet() { return getAllGoldTaxonomyIds(Collectors.toSet(), Collections::emptySet); } public R getAllGoldTaxonomyIds(Collector collector, Supplier emptyResultSupplier) { if (!hasGoldMentions()) return emptyResultSupplier.get(); return overlappingGoldMentions.stream().map(GeneMention::getTaxonomyIds).flatMap(Collection::stream).map(id -> (T) id).collect(collector); } public Stream getAllGoldTaxonomyIds() { return overlappingGoldMentions.stream().map(GeneMention::getTaxonomyIds).flatMap(Collection::stream); } public String getAnyGoldId() { if (!hasGoldMentions()) return null; return overlappingGoldMentions.get(0).getGoldMentionId(); } public List getAnyGoldIds() { if (!hasGoldMentions()) return Collections.emptyList(); return overlappingGoldMentions.get(0).getIds(); } public List getAllGoldIdsAsList() { return getAllGoldIds(Collectors.toList(), 
Collections::emptyList); } public Set getAllGoldIdAsSet() { return getAllGoldIds(Collectors.toSet(), Collections::emptySet); } private R getAllGoldIds(Collector collector, Supplier emptyResultSupplier) { if (!hasGoldMentions()) return emptyResultSupplier.get(); return getAllGoldIds().map(id -> (T) id).collect(collector); } public Stream getAllGoldIds() { if (!hasGoldMentions()) return Stream.empty(); return overlappingGoldMentions.stream().map(GeneMention::getIds).flatMap(Collection::stream); } public boolean hasGoldMentions() { return overlappingGoldMentions != null && !overlappingGoldMentions.isEmpty(); } /** * @return * @deprecated Use {@link #getAllGoldTaxonomyIdsAsList()}, {@link #getAnyGoldTaxonomyId()} or {@link #getAllGoldTaxonomyIdsAsList()} instead. */ public String getGoldTaxonomyId() { return goldTaxonomyId; } public void setGoldTaxonomyId(String goldTaxonomyId) { this.goldTaxonomyId = goldTaxonomyId; } public void setTaxonomyScore(String tax, double score) { if (taxonomyScores == null) taxonomyScores = new HashMap<>(); taxonomyScores.put(tax, score); } public void setProcessedTaxonomyScore(String tax, double score) { if (processedTaxonomyScores == null) processedTaxonomyScores = new HashMap<>(); processedTaxonomyScores.put(tax, score); } public double getTaxonomyScore(String taxonomyId) { return taxonomyScores == null ? 0 : taxonomyScores.getOrDefault(taxonomyId, 0d); } public double getProcessedTaxonomyScore(String taxonomyId) { return processedTaxonomyScores == null ? 0 : processedTaxonomyScores.getOrDefault(taxonomyId, 0d); } public Map getTaxonomyScores() { return taxonomyScores; } /** * Returns the original taxonomy scores returned by the ML-approach and the rule that species occurring previous * to a gene in the same NP are surely assigned to this gene. 
* * @param taxonomyScores */ public void setTaxonomyScores(Map taxonomyScores) { this.taxonomyScores = taxonomyScores; } public GeneSpeciesOccurrence getTaxonomyReliability() { return taxonomyReliability; } public void setTaxonomyReliability(GeneSpeciesOccurrence taxonomyReliability) { this.taxonomyReliability = taxonomyReliability; } public List getTaggingModifiers() { return taggingModifiers; } public String getTaxonomyId() { if (taxonomyId != null) return taxonomyId; if (taxonomyIds == null || taxonomyIds.isEmpty()) return null; return taxonomyIds.get(0); } public void setTaxonomyId(String taxonomyId) { this.taxonomyId = taxonomyId; this.taxonomyIds = new ArrayList<>(); if (taxonomyId != null) this.taxonomyIds.add(taxonomyId); } public Multimap getTaxonomyOccurrences() { return taxonomyOcurrences; } public void setTaxonomyOcurrences(Multimap taxonomyOcurrences) { this.taxonomyOcurrences = taxonomyOcurrences; } public String getDocumentContext() { return documentContext; } public void setDocumentContext(String documentContext) { this.documentContext = documentContext; } public Stream getDocumentContext(int numTokens, Set excludedTokens, boolean excludeGeneMentions) { return geneDocument.getDocumentContext(offsets, excludedTokens, excludeGeneMentions, numTokens); } public Stream getDocumentContext(int numTokens) { return geneDocument.getDocumentContext(offsets, numTokens); } public Query getContextQuery() { return contextQuery; } public void setContextQuery(Query contextQuery) { this.contextQuery = contextQuery; } @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + ((docId == null) ? 0 : docId.hashCode()); result = prime * result + ((id == null) ? 0 : id.hashCode()); result = prime * result + ((offsets == null) ? 0 : offsets.hashCode()); result = prime * result + ((tagger == null) ? 0 : tagger.hashCode()); result = prime * result + ((taxonomyId == null) ? 
0 : taxonomyId.hashCode()); result = prime * result + ((text == null) ? 0 : text.hashCode()); return result; } @Override public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (getClass() != obj.getClass()) return false; GeneMention other = (GeneMention) obj; if (docId == null) { if (other.docId != null) return false; } else if (!docId.equals(other.docId)) return false; if (id == null) { if (other.id != null) return false; } else if (!id.equals(other.id)) return false; if (offsets == null) { if (other.offsets != null) return false; } else if (!offsets.equals(other.offsets)) return false; if (tagger != other.tagger) return false; if (taxonomyId == null) { if (other.taxonomyId != null) return false; } else if (!taxonomyId.equals(other.taxonomyId)) return false; if (text == null) { return other.text == null; } else return text.equals(other.text); } public int getBegin() { return offsets.getMinimum(); } /** * Whether or not this gene mention has been rejected for being a gene mention at all. * * @return True if this gene mention was not successfully mapped to gene ID. */ public boolean isRejected() { return mentionMappingResult != null && mentionMappingResult.isRejected(); } public String getDocId() { return docId; } public void setDocId(String docId) { this.docId = docId; } public int getEnd() { return offsets.getMaximum(); } public GeneName getGeneName() { if (geneName == null && normalizer == null) throw new IllegalStateException( "This GeneMention has not set a TermNormalizer and thus cannot create a GeneName instance."); if (geneName == null) geneName = new GeneName(text, normalizer); return geneName; } public void setGeneName(GeneName geneName) { this.geneName = geneName; } /** * This field is only used for gold mentions. * * @return The gene ID of this mention, if set. 
* @deprecated Use {@link #overlappingGoldMentions} to represent gold annotations */ @Deprecated public String getGoldMentionId() { return id; } public void setId(String id) { this.id = id; } public TermNormalizer getNormalizer() { return normalizer; } public void setNormalizer(TermNormalizer normalizer) { this.normalizer = normalizer; if (this.geneName != null) this.geneName.setNormalizer(normalizer); } public Range getOffsets() { return offsets; } public void setOffsets(Range offsets) { this.offsets = offsets; } public String getText() { return text; } public void setText(String text) { this.text = text; if (geneName != null) geneName.setText(text); } /** * Returns the text of this gene extended to the end of its overlapping NP-chunk. If there is no such chunk, the original text is returned. * * @return */ public String getRightExtendedText() { Set, String>> chunkNP = geneDocument.getOverlappingChunks(getOffsets(), "ChunkNP"); if (!chunkNP.isEmpty()) { Integer chunkend = chunkNP.iterator().next().getKey().getMaximum(); if (chunkend > getEnd()) { return geneDocument.getCoveredText(getBegin(), chunkend); } } return text; } public Range getRightExtendedOffsets() { Set, String>> chunkNP = geneDocument.getOverlappingChunks(getOffsets(), "ChunkNP"); if (!chunkNP.isEmpty()) { Integer chunkend = chunkNP.iterator().next().getKey().getMaximum(); if (chunkend > getEnd()) { return Range.between(getBegin(), chunkend); } } return offsets; } public Range getPhraseExtendesOffsets() { Set, String>> chunkNP = geneDocument.getOverlappingChunks(getOffsets(), "ChunkNP"); if (!chunkNP.isEmpty()) { return chunkNP.iterator().next().getKey(); } return offsets; } public String getPhraseExtendedText() { return geneDocument.getCoveredText(getPhraseExtendesOffsets()); } @Override public String toString() { String id = mentionMappingResult != null && mentionMappingResult.tax2finalRankedCandidates != null ? 
mentionMappingResult.getResultCandidates().map(SynHit::getId).collect(Collectors.joining(", ")) : NOID; return "GeneMention [text=" + text + ", offsets=" + offsets + ", docId=" + docId + ", id=" + id + ", taxonomyIds=" + taxonomyIds + ", goldIds=" + getAllGoldIdsAsList() + ", goldTaxIds=" + getAllGoldTaxonomyIdsAsList() + ", tagger=" + tagger + "]"; } public String getNormalizedText() { return getGeneName().getNormalizedText(); } public List getNormalizedTextVariant() { return getGeneName().getNormalizedTextVariant(); } public GeneTagger getTagger() { return tagger; } public void setTagger(GeneTagger tagger) { this.tagger = tagger; } /** * @return The object representing the result of the mapping process for this * particular gene mention. */ public MentionMappingResult getMentionMappingResult() { return mentionMappingResult; } public void setMentionMappingResult(MentionMappingResult mentionMappingResult) { // assert mentionMappingResult != null : "Setting a null MentionMapping result to " + this; this.mentionMappingResult = mentionMappingResult; } public SynHit getResultCandidate(String taxonomyId) { assert mentionMappingResult != null : "The mention mapping result is null"; return mentionMappingResult.getResultCandidate(taxonomyId); } public Stream getResultCandidates() { assert mentionMappingResult != null : "The mention mapping result is null"; return mentionMappingResult.getResultCandidates(); } public GeneDocument getGeneDocument() { return geneDocument; } public void setGeneDocument(GeneDocument geneDocument) { this.geneDocument = geneDocument; } /** * A parent GeneMention is a GeneMention that has been split into sub-mentions, * most commonly due to conjunctions or enumerations within a GeneMention. Thus, * when parent is not null, this GeneMention resulted from a split of another * GeneMention. * * @return The GeneMention that has been split to produce this - and possibly * other - GeneMention(s). 
* @deprecated Such cases are handled by GeneCompositeNameResolver */ public GeneMention getParent() { return parent; } public void setParent(GeneMention parent) { this.parent = parent; } public void addTaggingModifier(String modifier) { if (taggingModifiers == null) taggingModifiers = new ArrayList<>(); taggingModifiers.add(modifier); } public List getPosTags() { return posTags; } public void setPosTags(List posTags) { this.posTags = posTags; } public SpecificType getSpecificType() { return specificType; } public void setSpecificType(SpecificType specificType) { if (!specificTypeFrozen) this.specificType = specificType; else log.warn("Specific type not set: It is frozen"); } public FeatureVector getFeatureVector() { return featureVector; } public void setFeatureVector(FeatureVector featureVector) { this.featureVector = featureVector; } public boolean isAbbreviationLongForm() { return !geneDocument.getOverlappingAcronymLongforms(this.offsets).isEmpty(); } public boolean isAbbreviation() { return !geneDocument.getOverlappingAcronyms(this.offsets).isEmpty(); } public double getSpecificTypeConfidence() { return specificTypeConfidence; } public void setSpecificTypeConfidence(double specificTypeConfidence) { this.specificTypeConfidence = specificTypeConfidence; } public String getReducedNameForExactMatch() { return reducedNameForExactMatch; } public void setReducedNameForExactMatch(String reducedNameForExactMatch) { this.reducedNameForExactMatch = reducedNameForExactMatch; } public void freezeSpecificType() { specificTypeFrozen = true; } public boolean hasExactCandidateMatch() { if (mentionMappingResult != null) { // TODO delegate to mmr.hasExactCandidateMatch() Map> tax2originalCandidates = mentionMappingResult.tax2originalCandidates; if (tax2originalCandidates != null) return tax2originalCandidates.keySet().stream().map(tax2originalCandidates::get).flatMap(Collection::stream).anyMatch(SynHit::isExactMatch); } return false; } public boolean 
hasApproximateCandidateMatch() { if (mentionMappingResult != null) { Map> tax2originalCandidates = mentionMappingResult.tax2originalCandidates; return tax2originalCandidates.keySet().stream().map(tax2originalCandidates::get).filter(Predicate.not(List::isEmpty)).map(list -> list.get(0)).anyMatch(Predicate.not(SynHit::isExactMatch)); } return false; } public boolean hasOnlyApproximateCandidateMatches() { if (mentionMappingResult != null) { Map> tax2originalCandidates = mentionMappingResult.tax2originalCandidates; return mentionMappingResult.tax2originalCandidates.values().stream().flatMap(Collection::stream).findAny().isPresent() && tax2originalCandidates.keySet().stream().map(tax2originalCandidates::get).filter(Predicate.not(List::isEmpty)).map(list -> list.get(0)).allMatch(Predicate.not(SynHit::isExactMatch)); } return false; } public String getBestCandidateSynonym() { if (mentionMappingResult != null && bestCandidateSynonym == null) { Map> tax2originalCandidates = mentionMappingResult.tax2originalCandidates; bestCandidateSynonym = tax2originalCandidates.keySet().stream().map(tax2originalCandidates::get).filter(Predicate.not(Collection::isEmpty)).map(list -> list.get(0)).map(SynHit::getSynonym).findFirst().get(); } return bestCandidateSynonym; } public Set getAllBestCandidateSynonyms() { return getAllBestCandidateSynonyms(null); } public Set getAllBestCandidateSynonyms(Set filterTax) { if (mentionMappingResult != null) { Map> tax2originalCandidates = mentionMappingResult.tax2originalCandidates; for (String taxId : tax2originalCandidates.keySet()) { if (filterTax != null && !filterTax.isEmpty() && !filterTax.contains(taxId)) continue; List candidates = tax2originalCandidates.get(taxId); if (!candidates.isEmpty()) { if (candidates.get(0).isExactMatch() || candidates.size() == 1 || candidates.get(0).getLexicalScore() > candidates.get(1).getLexicalScore()) return Set.of(candidates.get(0).getSynonym()); Set bestSynonyms = new HashSet<>(); double bestScore = 
candidates.get(0).getLexicalScore(); for (int i = 0; i < candidates.size() && candidates.get(i).getLexicalScore() - bestScore < 0.0001; i++) { bestSynonyms.add(candidates.get(i).getSynonym()); } return bestSynonyms; } } } return Collections.emptySet(); } public Optional getTaxonomyCandidateWithOccurrence(GeneSpeciesOccurrence occurrenceType) { return taxonomyOcurrences != null ? taxonomyOcurrences.keySet().stream().filter(taxId -> taxonomyOcurrences.get(taxId).contains(occurrenceType)).findAny() : Optional.empty(); } public boolean hasCorrectTaxonomyId() { boolean goldHasOffsets = geneDocument.isGoldHasOffsets(); if (goldHasOffsets) { return !Sets.intersection(getAllGoldTaxonomyIdsAsSet(), getTaxonomyIdsSet()).isEmpty(); } return !Sets.intersection(geneDocument.getGoldTaxonomyIds(), getTaxonomyIdsSet()).isEmpty(); } public GeneDocument.MentionCorrectness getGenesetCorrectnessLevel(String goldId) { if (!hasGoldMentions()) return GeneDocument.MentionCorrectness.CANT_FIND; Set> seenOffsets = new HashSet<>(); int goldGenesetSize = (int) geneDocument.getGenes().filter(GeneMention::hasGoldMentions).map(GeneMention::getOverlappingGoldMentions).flatMap(Collection::stream).filter(goldGm -> goldGm.getIds().contains(goldId)).filter(goldGm -> seenOffsets.add(goldGm.getOffsets())).map(GeneMention::getIds).flatMap(Collection::stream).count(); Optional any = geneSets.stream().filter(gs -> gs.stream().findAny().get() .getAllGoldIdAsSet().contains(goldId)).findAny(); int genesetSize = any.isPresent() ? any.get().size() : 0; if (goldGenesetSize == genesetSize) return GeneDocument.MentionCorrectness.CORRECT_ID; return GeneDocument.MentionCorrectness.WRONG_ID; } /** * Checks whether this gene mention has a strong candidate for the given taxonomy ID. * * @param taxId The taxonomy ID to check. * @return Whether or not there is a SynHit that is marked as anchor for the passed taxonomy ID. 
*/ public boolean isAnchor(String taxId) { if (mentionMappingResult != null) { SynHit resultEntry = mentionMappingResult.getResultCandidate(taxId); if (resultEntry != null) return resultEntry.isAnchor(); } return false; } /** * Checks for all candidate lists for all species assigned to this gene mention whether there is a family SynHit * in the first n candidates, inclusive. * * @param n The maximum rank to search for family hits, starting at 1. * @return True if a family hit was found within the first n ranks, false otherwise. */ public boolean hasFamilyCandidateWithinRank(int n, String taxId) { if (mentionMappingResult != null) { List synHits = mentionMappingResult.tax2originalCandidates.get(taxId); for (int i = 0; i < Math.min(synHits.size(), n); i++) { SynHit synHit = synHits.get(i); if (synHit.isFamilyName()) return true; } } return false; } public boolean hasExactMatchInTax(String taxId) { if (mentionMappingResult != null) { List synHits = mentionMappingResult.tax2originalCandidates.get(taxId); try { return !synHits.isEmpty() && synHits.get(0).isExactMatch(); } catch (Exception e) { e.printStackTrace(); } } return false; } public InstanceList getInstances() { return instances; } public void setInstances(InstanceList instances) { this.instances = instances; } public boolean isExactFamilyNameMatch() { return familyNames != null && familyNames.stream().anyMatch(SynHit::isExactMatch); } public double getFamilyNameMatchScore() { return familyNames != null && !familyNames.isEmpty() ? 
familyNames.get(0).getLexicalScore() : 0d; } public boolean isAmbiguous() { return !getAmbiguityTypes().isEmpty(); } public Set getAmbiguityTypes() { Set ambiguityTypes = new HashSet<>(); if (mentionMappingResult != null) { boolean exactInOneSpecies = false; Map> tax2originalCandidates = mentionMappingResult.tax2originalCandidates; for (String taxId : tax2originalCandidates.keySet()) { List candidates4tax = tax2originalCandidates.get(taxId); // Ambiguous 1: Multiple exact matches if (candidates4tax.size() > 1 && candidates4tax.get(0).isExactMatch() && candidates4tax.get(1).isExactMatch()) ambiguityTypes.add(AmbiguityType.LEXICAL); if (!candidates4tax.isEmpty() && candidates4tax.get(0).isExactMatch()) { // If we have already found an exact match for another species, this is an intra species ambiguity if (exactInOneSpecies) ambiguityTypes.add(AmbiguityType.INTRASPECIES); exactInOneSpecies = true; } } } return ambiguityTypes; } public Set getNameTokenSet() { if (nameTokenSet == null) { Function> gnTokensFunc = gn -> Arrays.stream(normalizer.normalize(gn.getText()).split("\\s+")); Stream nameTokens = gnTokensFunc.apply(geneName); for (GeneName alt : geneName.getAlternatives()) { nameTokens = Stream.concat(nameTokens, gnTokensFunc.apply(alt)); } nameTokenSet = nameTokens.collect(Collectors.toSet()); } return nameTokenSet; } public String getEcNumber() { return geneName.getEcNumber(); } public String getCompositeResolver() { return compositeResolver; } public void setCompositeResolver(String compositeResolver) { this.compositeResolver = compositeResolver; } public Stream getContextGeneNames() { if (geneDocument == null) return Stream.empty(); return geneDocument.getGenes().filter(g -> g != this).map(GeneMention::getGeneName); } public GeneSets getGeneSets() { return geneSets; } public void removeGeneSet(GeneSet geneSet) { this.geneSets.remove(geneSet); } public void clearGeneSets() { if (geneSets != null) geneSets.clear(); } public void 
reject(MentionMappingResult.RejectReason reason) { // In some cases of overlap and peeking into gold tax IDs it can happen // that getTaxonomyIds() does not return values that are set to tax2originalCandidates // or tax2lexicallyRerankedCandidates Set taxIds = new HashSet<>(); taxIds.addAll(getTaxonomyIds()); if (mentionMappingResult != null && mentionMappingResult.tax2originalCandidates != null) taxIds.addAll(mentionMappingResult.tax2originalCandidates.keySet()); if (mentionMappingResult != null && mentionMappingResult.tax2lexicallyRerankedCandidates != null) taxIds.addAll(mentionMappingResult.tax2lexicallyRerankedCandidates.keySet()); for (String tax : taxIds) { reject(tax, reason); } } public void reject(String tax, MentionMappingResult.RejectReason reason) { if (mentionMappingResult == null) { mentionMappingResult = new MentionMappingResult(this); } if (mentionMappingResult.tax2lexicallyRerankedCandidates == null) mentionMappingResult.tax2lexicallyRerankedCandidates = new HashMap<>(); if (mentionMappingResult.tax2finalRankedCandidates == null) mentionMappingResult.tax2finalRankedCandidates = new HashMap<>(); mentionMappingResult.tax2lexicallyRerankedCandidates.put(tax, List.of(MentionMappingResult.REJECTION)); mentionMappingResult.tax2finalRankedCandidates.put(tax, List.of(MentionMappingResult.REJECTION)); mentionMappingResult.setRejectReason(tax, reason); } public enum GeneTagger { JNET, BANNER, FLAIR, FLAIR_JPG_COLLAPSED_VAR, FLAIR_JPG_COLLAPSED_VARCOMPENUM, FLAIR_BC2TRAINTEST, FLAIR_GNORMPLUSNLMIAT, GOLD, FLAIR_JPG_NOBC2TEST_NOTEST, FLAIR_JPG_NOBC2TEST_NOTEST_COLLAPSED_VAR, FLAIR_JPG_GNP_ENTITIES, CONSISTENCY_TAGGER, EXPANSION_TAGGER, GNORM_PLUS, UNKNOWN, GAZETTEER } public enum SpecificType { GENE, FAMILYNAME, DOMAINMOTIF, GENE_ENUM, NO_GENE, GROUP, COMPLEX, UNKNOWN } public enum AmbiguityType {INTRASPECIES, LEXICAL} }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy