
// Source listing: de.julielab.geneexpbase.genemodel.GeneMention (Maven / Gradle / Ivy)
package de.julielab.geneexpbase.genemodel;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.InstanceList;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import de.julielab.geneexpbase.TermNormalizer;
import de.julielab.geneexpbase.candidateretrieval.SynHit;
import de.julielab.java.utilities.spanutils.Span;
import org.apache.commons.lang3.Range;
import org.apache.lucene.search.Query;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.function.Supplier;
import java.util.stream.Collector;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
* A basic "gene mention" that most of all contains the text of the mention.
* However, we might also need other information, i.e. offsets.
*
* @author faessler
*/
public class GeneMention implements Span {
/**
* Constant meaning that no ID is given for a GeneMention.
*/
public static final String NOID = "NoId";
private final static Logger log = LoggerFactory.getLogger(GeneMention.class);
private Object originalMappedObject;
private String docId;
private GeneName geneName;
/**
* @deprecated a single mention can have multiple (taxonomy) IDs (human and murine pro-alpha3(V)...)
*/
private String id = NOID;
private TermNormalizer normalizer;
private Range offsets;
private String text;
/**
* @deprecated refer to {@link #overlappingGoldMentions}
*/
private String goldTaxonomyId;
private List overlappingGoldMentions;
/**
* @deprecated a single mention can have multiple (taxonomy) IDs (human and murine pro-alpha3(V)...)
*/
private String taxonomyId;
private List ids = Collections.emptyList();
/**
* Taxonomy ID candidates for this gene mention. Used during species assignment.
*/
private Set taxonomyCandidates;
/**
* Indicates whether all {@link #taxonomyCandidates} should be assigned or only one of them.
*/
private boolean isTaxonomyCandidatesConjunctive;
/**
* Final taxonomy IDs assigned to this gene mention.
*/
private List taxonomyIds = Collections.emptyList();
private Multimap taxonomyOcurrences = HashMultimap.create();
private Map taxonomyScores;
/**
* These will mostly be the same as {@link #taxonomyScores} but can differ is some cases, e.g. when
* the fallback to the default species has happened since all other candidates seemed too unlikely.
*/
private Map processedTaxonomyScores;
private Set taxonomyIdsSet;
private boolean specificTypeFrozen;
private GeneSpeciesOccurrence taxonomyReliability;
private String documentContext;
private Query contextQuery;
private GeneTagger tagger = GeneTagger.UNKNOWN;
private SpecificType specificType = SpecificType.UNKNOWN;
private double specificTypeConfidence;
private MentionMappingResult mentionMappingResult;
private GeneDocument geneDocument;
private List taggingModifiers;
// The gene set to which this gene belongs
private GeneSets geneSets;
/**
* A parent GeneMention is a GeneMention that has been split into sub-mentions,
* most commonly due to conjunctions or enumerations within a GeneMention. Thus,
* when parent is not null, this GeneMention resulted from a split of another
* GeneMention.
*/
private GeneMention parent;
/**
* A GeneMention that is actually a composite to be expanded to multiple different genes, like enumerations
* or numerical ranges, can have derived GeneMentions corresponding to the individual genes denotes by the
* composite expression. Those are the children.
*/
private List children;
private boolean isCompositeMention;
private List posTags;
private FeatureVector featureVector;
private String reducedNameForExactMatch;
private InstanceList instances;
private List familyNames;
private String bestCandidateSynonym;
private String compositeResolver;
private Set nameTokenSet;
private Map familyFeatures = new HashMap<>();
/**
* Makes a copy of the given GeneMention but NOT from its MentionMappingResult.
*
* @param gm The gene mention to copy.
*/
public GeneMention(GeneMention gm) {
if (gm.geneName != null)
this.geneName = new GeneName(gm.geneName);
this.contextQuery = gm.contextQuery;
this.docId = gm.docId;
this.documentContext = gm.documentContext;
this.geneDocument = gm.geneDocument;
this.id = gm.id;
if (gm.ids != null)
this.ids = new ArrayList<>(gm.ids);
this.normalizer = gm.normalizer;
this.offsets = gm.offsets;
this.tagger = gm.tagger;
this.taxonomyOcurrences = gm.taxonomyOcurrences;
this.taxonomyId = gm.taxonomyId;
this.taxonomyIds = gm.getTaxonomyIds();
this.taxonomyScores = gm.taxonomyScores;
this.processedTaxonomyScores = gm.processedTaxonomyScores;
this.text = gm.text;
this.originalMappedObject = gm.originalMappedObject;
this.parent = gm.parent;
this.children = gm.children;
this.overlappingGoldMentions = gm.overlappingGoldMentions;
this.specificType = gm.getSpecificType();
this.specificTypeConfidence = gm.getSpecificTypeConfidence();
this.overlappingGoldMentions = gm.overlappingGoldMentions;
if (gm.getMentionMappingResult() != null)
this.mentionMappingResult = new MentionMappingResult(gm.getMentionMappingResult());
this.familyFeatures = new HashMap<>(gm.familyFeatures);
}
public GeneMention(String text) {
this();
this.text = text;
this.children = Collections.emptyList();
}
public GeneMention() {
}
public GeneMention(String text, TermNormalizer normalizer) {
this(text);
this.setNormalizer(normalizer);
}
public GeneMention(String text, int begin, int end) {
this(text);
this.offsets = Range.between(begin, end);
}
public GeneMention(String text, int begin, int end, TermNormalizer normalizer) {
this(text, begin, end);
this.setNormalizer(normalizer);
}
public Double addFamilyFeature(String featureName, double value) {
return familyFeatures.put(featureName, value);
}
public Map getFamilyFeatures() {
return familyFeatures;
}
public boolean matchesFamilyName() {
return familyNames != null && !familyNames.isEmpty();
}
public GeneSet getSingleGeneSet() {
if (geneSets.size() != 1)
throw new IllegalArgumentException("There is not a single geneset associated with this gene mention but there are " + geneSets.size() + " for gene mention " + this + ": " + geneSets);
return geneSets.stream().findAny().get();
}
public void addGeneSet(GeneSet geneSet) {
if (geneSet == null || geneSet.isEmpty())
throw new IllegalArgumentException("The passed geneset is " + (geneSet == null ? "null" : "empty") + ".");
if (geneSets == null)
this.geneSets = new GeneSets();
this.geneSets.add(geneSet);
}
public boolean isTaxonomyCandidatesConjunctive() {
return isTaxonomyCandidatesConjunctive;
}
public void setTaxonomyCandidatesConjunctive(boolean taxonomyCandidatesConjunctive) {
isTaxonomyCandidatesConjunctive = taxonomyCandidatesConjunctive;
}
public Set getTaxonomyCandidates() {
return taxonomyCandidates != null ? taxonomyCandidates : Collections.emptySet();
}
public void setTaxonomyCandidates(Set taxonomyCandidates) {
this.taxonomyCandidates = taxonomyCandidates;
}
public List getFamilyNames() {
return familyNames;
}
public void setFamilyNames(List matchedFamilyNames) {
familyNames = matchedFamilyNames;
}
/**
* Returns the taxonomy scores that have undergone threshold filtering. It is possible that this is not
* the outcome of the ML-based approach but just the default species of a document due to threshold filtering.
*
* @return
*/
public Map getProcessedTaxonomyScores() {
return processedTaxonomyScores;
}
public void setProcessedTaxonomyScores(Map processedTaxonomyScores) {
this.processedTaxonomyScores = processedTaxonomyScores;
}
public GeneMention getFirstGoldMention() {
if (!hasGoldMentions()) return null;
return overlappingGoldMentions.get(0);
}
/**
* The original UIMA annotation that is mapped. Most likely a subclass of EntityMention.
*
* @return The original object to be mapped.
*/
public Object getOriginalMappedObject() {
return originalMappedObject;
}
public void setOriginalMappedObject(Object originalMappedObject) {
this.originalMappedObject = originalMappedObject;
}
public List getIds() {
return ids;
}
public void setIds(List ids) {
assert !ids.stream().anyMatch(Objects::isNull) : "There is a null item in the IDs to be set.";
assert ids.indexOf("null") == -1 : "The string 'null' is among the IDs to be set.";
this.ids = ids;
}
public Stream getMappedSynHits() {
if (mentionMappingResult == null)
throw new IllegalArgumentException("This gene mention was not yet mapped, there are no final ranked candidates.");
return mentionMappingResult.getResultCandidates();
}
public Stream getMappedIds() {
assert getMappedSynHits().filter(Predicate.not(SynHit::isRejectionCandidate)).map(SynHit::getId).noneMatch(Objects::isNull) : "A null ID is returned for " + this;
return getMappedSynHits().filter(Predicate.not(SynHit::isRejectionCandidate)).map(SynHit::getId);
}
public Set getMappedIdSet() {
return getMappedIds().collect(Collectors.toSet());
}
public void addId(String id) {
if (ids.isEmpty())
ids = new ArrayList<>();
ids.add(id);
}
public void addTaxonomyId(String id) {
taxonomyIdsSet = null;
if (taxonomyIds.isEmpty())
taxonomyIds = new ArrayList<>();
taxonomyIds.add(id);
}
public List getTaxonomyIds() {
if (taxonomyIds != null) {
return taxonomyIds;
}
if (taxonomyId != null)
return Collections.singletonList(taxonomyId);
return Collections.emptyList();
}
public void setTaxonomyIds(List taxonomyIds) {
this.taxonomyIds = taxonomyIds;
taxonomyIdsSet = null;
}
public List getNonRejectedTaxonomyIds() {
if (mentionMappingResult == null)
return getTaxonomyIds();
// we check the lexically reranked candidates for the rejection because it is not set to the original candidates since those should remain the original list, even if empty
return getTaxonomyIds().stream().filter(taxId -> !mentionMappingResult.tax2lexicallyRerankedCandidates.get(taxId).get(0).isRejectionCandidate()).collect(Collectors.toList());
}
public Set getTaxonomyIdsSet() {
if (taxonomyIdsSet == null)
taxonomyIdsSet = new HashSet<>(getTaxonomyIds());
return taxonomyIdsSet;
}
public void addChild(GeneMention child) {
if (children.isEmpty())
children = new ArrayList<>();
children.add(child);
}
public boolean isCompositeMention() {
return isCompositeMention || !children.isEmpty();
}
public List getOverlappingGoldMentions() {
return overlappingGoldMentions;
}
public void setOverlappingGoldMentions(List overlappingGoldMentions) {
this.overlappingGoldMentions = overlappingGoldMentions;
}
public String getAnyGoldTaxonomyId() {
if (!hasGoldMentions()) return null;
return overlappingGoldMentions.get(0).getTaxonomyIds().get(0);
}
public List getAnyGoldTaxonomyIds() {
if (!hasGoldMentions()) return Collections.emptyList();
return overlappingGoldMentions.get(0).getTaxonomyIds();
}
public List getAllGoldTaxonomyIdsAsList() {
return getAllGoldTaxonomyIds(Collectors.toList(), Collections::emptyList);
}
public Set getAllGoldTaxonomyIdsAsSet() {
return getAllGoldTaxonomyIds(Collectors.toSet(), Collections::emptySet);
}
public R getAllGoldTaxonomyIds(Collector super T, A, R> collector, Supplier emptyResultSupplier) {
if (!hasGoldMentions()) return emptyResultSupplier.get();
return overlappingGoldMentions.stream().map(GeneMention::getTaxonomyIds).flatMap(Collection::stream).map(id -> (T) id).collect(collector);
}
public Stream getAllGoldTaxonomyIds() {
return overlappingGoldMentions.stream().map(GeneMention::getTaxonomyIds).flatMap(Collection::stream);
}
public String getAnyGoldId() {
if (!hasGoldMentions()) return null;
return overlappingGoldMentions.get(0).getGoldMentionId();
}
public List getAnyGoldIds() {
if (!hasGoldMentions()) return Collections.emptyList();
return overlappingGoldMentions.get(0).getIds();
}
public List getAllGoldIdsAsList() {
return getAllGoldIds(Collectors.toList(), Collections::emptyList);
}
public Set getAllGoldIdAsSet() {
return getAllGoldIds(Collectors.toSet(), Collections::emptySet);
}
private R getAllGoldIds(Collector super T, A, R> collector, Supplier emptyResultSupplier) {
if (!hasGoldMentions()) return emptyResultSupplier.get();
return getAllGoldIds().map(id -> (T) id).collect(collector);
}
public Stream getAllGoldIds() {
if (!hasGoldMentions()) return Stream.empty();
return overlappingGoldMentions.stream().map(GeneMention::getIds).flatMap(Collection::stream);
}
public boolean hasGoldMentions() {
return overlappingGoldMentions != null && !overlappingGoldMentions.isEmpty();
}
/**
* @return
* @deprecated Use {@link #getAllGoldTaxonomyIdsAsList()}, {@link #getAnyGoldTaxonomyId()} or {@link #getAllGoldTaxonomyIdsAsList()} instead.
*/
public String getGoldTaxonomyId() {
return goldTaxonomyId;
}
public void setGoldTaxonomyId(String goldTaxonomyId) {
this.goldTaxonomyId = goldTaxonomyId;
}
public void setTaxonomyScore(String tax, double score) {
if (taxonomyScores == null)
taxonomyScores = new HashMap<>();
taxonomyScores.put(tax, score);
}
public void setProcessedTaxonomyScore(String tax, double score) {
if (processedTaxonomyScores == null)
processedTaxonomyScores = new HashMap<>();
processedTaxonomyScores.put(tax, score);
}
public double getTaxonomyScore(String taxonomyId) {
return taxonomyScores == null ? 0 : taxonomyScores.getOrDefault(taxonomyId, 0d);
}
public double getProcessedTaxonomyScore(String taxonomyId) {
return processedTaxonomyScores == null ? 0 : processedTaxonomyScores.getOrDefault(taxonomyId, 0d);
}
public Map getTaxonomyScores() {
return taxonomyScores;
}
/**
* Returns the original taxonomy scores returned by the ML-approach and the rule that species occurring previous
* to a gene in the same NP are surely assigned to this gene.
*
* @param taxonomyScores
*/
public void setTaxonomyScores(Map taxonomyScores) {
this.taxonomyScores = taxonomyScores;
}
public GeneSpeciesOccurrence getTaxonomyReliability() {
return taxonomyReliability;
}
public void setTaxonomyReliability(GeneSpeciesOccurrence taxonomyReliability) {
this.taxonomyReliability = taxonomyReliability;
}
public List getTaggingModifiers() {
return taggingModifiers;
}
public String getTaxonomyId() {
if (taxonomyId != null) return taxonomyId;
if (taxonomyIds == null || taxonomyIds.isEmpty()) return null;
return taxonomyIds.get(0);
}
public void setTaxonomyId(String taxonomyId) {
this.taxonomyId = taxonomyId;
this.taxonomyIds = new ArrayList<>();
if (taxonomyId != null)
this.taxonomyIds.add(taxonomyId);
}
public Multimap getTaxonomyOccurrences() {
return taxonomyOcurrences;
}
public void setTaxonomyOcurrences(Multimap taxonomyOcurrences) {
this.taxonomyOcurrences = taxonomyOcurrences;
}
public String getDocumentContext() {
return documentContext;
}
public void setDocumentContext(String documentContext) {
this.documentContext = documentContext;
}
public Stream getDocumentContext(int numTokens, Set excludedTokens, boolean excludeGeneMentions) {
return geneDocument.getDocumentContext(offsets, excludedTokens, excludeGeneMentions, numTokens);
}
public Stream getDocumentContext(int numTokens) {
return geneDocument.getDocumentContext(offsets, numTokens);
}
public Query getContextQuery() {
return contextQuery;
}
public void setContextQuery(Query contextQuery) {
this.contextQuery = contextQuery;
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + ((docId == null) ? 0 : docId.hashCode());
result = prime * result + ((id == null) ? 0 : id.hashCode());
result = prime * result + ((offsets == null) ? 0 : offsets.hashCode());
result = prime * result + ((tagger == null) ? 0 : tagger.hashCode());
result = prime * result + ((taxonomyId == null) ? 0 : taxonomyId.hashCode());
result = prime * result + ((text == null) ? 0 : text.hashCode());
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;
GeneMention other = (GeneMention) obj;
if (docId == null) {
if (other.docId != null)
return false;
} else if (!docId.equals(other.docId))
return false;
if (id == null) {
if (other.id != null)
return false;
} else if (!id.equals(other.id))
return false;
if (offsets == null) {
if (other.offsets != null)
return false;
} else if (!offsets.equals(other.offsets))
return false;
if (tagger != other.tagger)
return false;
if (taxonomyId == null) {
if (other.taxonomyId != null)
return false;
} else if (!taxonomyId.equals(other.taxonomyId))
return false;
if (text == null) {
return other.text == null;
} else return text.equals(other.text);
}
public int getBegin() {
return offsets.getMinimum();
}
/**
* Whether or not this gene mention has been rejected for being a gene mention at all.
*
* @return True if this gene mention was not successfully mapped to gene ID.
*/
public boolean isRejected() {
return mentionMappingResult != null && mentionMappingResult.isRejected();
}
public String getDocId() {
return docId;
}
public void setDocId(String docId) {
this.docId = docId;
}
public int getEnd() {
return offsets.getMaximum();
}
public GeneName getGeneName() {
if (geneName == null && normalizer == null)
throw new IllegalStateException(
"This GeneMention has not set a TermNormalizer and thus cannot create a GeneName instance.");
if (geneName == null)
geneName = new GeneName(text, normalizer);
return geneName;
}
public void setGeneName(GeneName geneName) {
this.geneName = geneName;
}
/**
* This field is only used for gold mentions.
*
* @return The gene ID of this mention, if set.
* @deprecated Use {@link #overlappingGoldMentions} to represent gold annotations
*/
@Deprecated
public String getGoldMentionId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public TermNormalizer getNormalizer() {
return normalizer;
}
public void setNormalizer(TermNormalizer normalizer) {
this.normalizer = normalizer;
if (this.geneName != null)
this.geneName.setNormalizer(normalizer);
}
public Range getOffsets() {
return offsets;
}
public void setOffsets(Range offsets) {
this.offsets = offsets;
}
public String getText() {
return text;
}
public void setText(String text) {
this.text = text;
if (geneName != null)
geneName.setText(text);
}
/**
* Returns the text of this gene extended to the end of its overlapping NP-chunk. If there is no such chunk, the original text is returned.
*
* @return
*/
public String getRightExtendedText() {
Set, String>> chunkNP = geneDocument.getOverlappingChunks(getOffsets(), "ChunkNP");
if (!chunkNP.isEmpty()) {
Integer chunkend = chunkNP.iterator().next().getKey().getMaximum();
if (chunkend > getEnd()) {
return geneDocument.getCoveredText(getBegin(), chunkend);
}
}
return text;
}
public Range getRightExtendedOffsets() {
Set, String>> chunkNP = geneDocument.getOverlappingChunks(getOffsets(), "ChunkNP");
if (!chunkNP.isEmpty()) {
Integer chunkend = chunkNP.iterator().next().getKey().getMaximum();
if (chunkend > getEnd()) {
return Range.between(getBegin(), chunkend);
}
}
return offsets;
}
public Range getPhraseExtendesOffsets() {
Set, String>> chunkNP = geneDocument.getOverlappingChunks(getOffsets(), "ChunkNP");
if (!chunkNP.isEmpty()) {
return chunkNP.iterator().next().getKey();
}
return offsets;
}
public String getPhraseExtendedText() {
return geneDocument.getCoveredText(getPhraseExtendesOffsets());
}
@Override
public String toString() {
String id = mentionMappingResult != null && mentionMappingResult.tax2finalRankedCandidates != null ? mentionMappingResult.getResultCandidates().map(SynHit::getId).collect(Collectors.joining(", ")) : NOID;
return "GeneMention [text=" + text + ", offsets=" + offsets + ", docId=" + docId + ", id=" + id + ", taxonomyIds=" + taxonomyIds
+ ", goldIds=" + getAllGoldIdsAsList() + ", goldTaxIds=" + getAllGoldTaxonomyIdsAsList() + ", tagger=" + tagger + "]";
}
public String getNormalizedText() {
return getGeneName().getNormalizedText();
}
public List getNormalizedTextVariant() {
return getGeneName().getNormalizedTextVariant();
}
public GeneTagger getTagger() {
return tagger;
}
public void setTagger(GeneTagger tagger) {
this.tagger = tagger;
}
/**
* @return The object representing the result of the mapping process for this
* particular gene mention.
*/
public MentionMappingResult getMentionMappingResult() {
return mentionMappingResult;
}
public void setMentionMappingResult(MentionMappingResult mentionMappingResult) {
// assert mentionMappingResult != null : "Setting a null MentionMapping result to " + this;
this.mentionMappingResult = mentionMappingResult;
}
public SynHit getResultCandidate(String taxonomyId) {
assert mentionMappingResult != null : "The mention mapping result is null";
return mentionMappingResult.getResultCandidate(taxonomyId);
}
public Stream getResultCandidates() {
assert mentionMappingResult != null : "The mention mapping result is null";
return mentionMappingResult.getResultCandidates();
}
public GeneDocument getGeneDocument() {
return geneDocument;
}
public void setGeneDocument(GeneDocument geneDocument) {
this.geneDocument = geneDocument;
}
/**
* A parent GeneMention is a GeneMention that has been split into sub-mentions,
* most commonly due to conjunctions or enumerations within a GeneMention. Thus,
* when parent is not null, this GeneMention resulted from a split of another
* GeneMention.
*
* @return The GeneMention that has been split to produce this - and possibly
* other - GeneMention(s).
* @deprecated Such cases are handled by GeneCompositeNameResolver
*/
public GeneMention getParent() {
return parent;
}
public void setParent(GeneMention parent) {
this.parent = parent;
}
public void addTaggingModifier(String modifier) {
if (taggingModifiers == null)
taggingModifiers = new ArrayList<>();
taggingModifiers.add(modifier);
}
public List getPosTags() {
return posTags;
}
public void setPosTags(List posTags) {
this.posTags = posTags;
}
public SpecificType getSpecificType() {
return specificType;
}
public void setSpecificType(SpecificType specificType) {
if (!specificTypeFrozen)
this.specificType = specificType;
else
log.warn("Specific type not set: It is frozen");
}
public FeatureVector getFeatureVector() {
return featureVector;
}
public void setFeatureVector(FeatureVector featureVector) {
this.featureVector = featureVector;
}
public boolean isAbbreviationLongForm() {
return !geneDocument.getOverlappingAcronymLongforms(this.offsets).isEmpty();
}
public boolean isAbbreviation() {
return !geneDocument.getOverlappingAcronyms(this.offsets).isEmpty();
}
public double getSpecificTypeConfidence() {
return specificTypeConfidence;
}
public void setSpecificTypeConfidence(double specificTypeConfidence) {
this.specificTypeConfidence = specificTypeConfidence;
}
public String getReducedNameForExactMatch() {
return reducedNameForExactMatch;
}
public void setReducedNameForExactMatch(String reducedNameForExactMatch) {
this.reducedNameForExactMatch = reducedNameForExactMatch;
}
public void freezeSpecificType() {
specificTypeFrozen = true;
}
public boolean hasExactCandidateMatch() {
if (mentionMappingResult != null) {
// TODO delegate to mmr.hasExactCandidateMatch()
Map> tax2originalCandidates = mentionMappingResult.tax2originalCandidates;
if (tax2originalCandidates != null)
return tax2originalCandidates.keySet().stream().map(tax2originalCandidates::get).flatMap(Collection::stream).anyMatch(SynHit::isExactMatch);
}
return false;
}
public boolean hasApproximateCandidateMatch() {
if (mentionMappingResult != null) {
Map> tax2originalCandidates = mentionMappingResult.tax2originalCandidates;
return tax2originalCandidates.keySet().stream().map(tax2originalCandidates::get).filter(Predicate.not(List::isEmpty)).map(list -> list.get(0)).anyMatch(Predicate.not(SynHit::isExactMatch));
}
return false;
}
public boolean hasOnlyApproximateCandidateMatches() {
if (mentionMappingResult != null) {
Map> tax2originalCandidates = mentionMappingResult.tax2originalCandidates;
return mentionMappingResult.tax2originalCandidates.values().stream().flatMap(Collection::stream).findAny().isPresent() && tax2originalCandidates.keySet().stream().map(tax2originalCandidates::get).filter(Predicate.not(List::isEmpty)).map(list -> list.get(0)).allMatch(Predicate.not(SynHit::isExactMatch));
}
return false;
}
public String getBestCandidateSynonym() {
if (mentionMappingResult != null && bestCandidateSynonym == null) {
Map> tax2originalCandidates = mentionMappingResult.tax2originalCandidates;
bestCandidateSynonym = tax2originalCandidates.keySet().stream().map(tax2originalCandidates::get).filter(Predicate.not(Collection::isEmpty)).map(list -> list.get(0)).map(SynHit::getSynonym).findFirst().get();
}
return bestCandidateSynonym;
}
public Set getAllBestCandidateSynonyms() {
return getAllBestCandidateSynonyms(null);
}
public Set getAllBestCandidateSynonyms(Set filterTax) {
if (mentionMappingResult != null) {
Map> tax2originalCandidates = mentionMappingResult.tax2originalCandidates;
for (String taxId : tax2originalCandidates.keySet()) {
if (filterTax != null && !filterTax.isEmpty() && !filterTax.contains(taxId))
continue;
List candidates = tax2originalCandidates.get(taxId);
if (!candidates.isEmpty()) {
if (candidates.get(0).isExactMatch() || candidates.size() == 1 || candidates.get(0).getLexicalScore() > candidates.get(1).getLexicalScore())
return Set.of(candidates.get(0).getSynonym());
Set bestSynonyms = new HashSet<>();
double bestScore = candidates.get(0).getLexicalScore();
for (int i = 0; i < candidates.size() && candidates.get(i).getLexicalScore() - bestScore < 0.0001; i++) {
bestSynonyms.add(candidates.get(i).getSynonym());
}
return bestSynonyms;
}
}
}
return Collections.emptySet();
}
public Optional getTaxonomyCandidateWithOccurrence(GeneSpeciesOccurrence occurrenceType) {
return taxonomyOcurrences != null ? taxonomyOcurrences.keySet().stream().filter(taxId -> taxonomyOcurrences.get(taxId).contains(occurrenceType)).findAny() : Optional.empty();
}
public boolean hasCorrectTaxonomyId() {
boolean goldHasOffsets = geneDocument.isGoldHasOffsets();
if (goldHasOffsets) {
return !Sets.intersection(getAllGoldTaxonomyIdsAsSet(), getTaxonomyIdsSet()).isEmpty();
}
return !Sets.intersection(geneDocument.getGoldTaxonomyIds(), getTaxonomyIdsSet()).isEmpty();
}
public GeneDocument.MentionCorrectness getGenesetCorrectnessLevel(String goldId) {
if (!hasGoldMentions())
return GeneDocument.MentionCorrectness.CANT_FIND;
Set> seenOffsets = new HashSet<>();
int goldGenesetSize = (int) geneDocument.getGenes().filter(GeneMention::hasGoldMentions).map(GeneMention::getOverlappingGoldMentions).flatMap(Collection::stream).filter(goldGm -> goldGm.getIds().contains(goldId)).filter(goldGm -> seenOffsets.add(goldGm.getOffsets())).map(GeneMention::getIds).flatMap(Collection::stream).count();
Optional any = geneSets.stream().filter(gs -> gs.stream().findAny().get()
.getAllGoldIdAsSet().contains(goldId)).findAny();
int genesetSize = any.isPresent() ? any.get().size() : 0;
if (goldGenesetSize == genesetSize)
return GeneDocument.MentionCorrectness.CORRECT_ID;
return GeneDocument.MentionCorrectness.WRONG_ID;
}
/**
* Checks whether this gene mention has a strong candidate for the given taxonomy ID.
*
* @param taxId The taxonomy ID to check.
* @return Whether or not there is a SynHit that is marked as anchor for the passed taxonomy ID.
*/
public boolean isAnchor(String taxId) {
if (mentionMappingResult != null) {
SynHit resultEntry = mentionMappingResult.getResultCandidate(taxId);
if (resultEntry != null)
return resultEntry.isAnchor();
}
return false;
}
/**
* Checks for all candidate lists for all species assigned to this gene mention whether there is a family SynHit
* in the first n candidates, inclusive.
*
* @param n The maximum rank to search for family hits, starting at 1.
* @return True if a family hit was found within the first n ranks, false otherwise.
*/
public boolean hasFamilyCandidateWithinRank(int n, String taxId) {
if (mentionMappingResult != null) {
List synHits = mentionMappingResult.tax2originalCandidates.get(taxId);
for (int i = 0; i < Math.min(synHits.size(), n); i++) {
SynHit synHit = synHits.get(i);
if (synHit.isFamilyName())
return true;
}
}
return false;
}
public boolean hasExactMatchInTax(String taxId) {
if (mentionMappingResult != null) {
List synHits = mentionMappingResult.tax2originalCandidates.get(taxId);
try {
return !synHits.isEmpty() && synHits.get(0).isExactMatch();
} catch (Exception e) {
e.printStackTrace();
}
}
return false;
}
public InstanceList getInstances() {
return instances;
}
public void setInstances(InstanceList instances) {
this.instances = instances;
}
public boolean isExactFamilyNameMatch() {
return familyNames != null && familyNames.stream().anyMatch(SynHit::isExactMatch);
}
public double getFamilyNameMatchScore() {
return familyNames != null && !familyNames.isEmpty() ? familyNames.get(0).getLexicalScore() : 0d;
}
public boolean isAmbiguous() {
return !getAmbiguityTypes().isEmpty();
}
public Set getAmbiguityTypes() {
Set ambiguityTypes = new HashSet<>();
if (mentionMappingResult != null) {
boolean exactInOneSpecies = false;
Map> tax2originalCandidates = mentionMappingResult.tax2originalCandidates;
for (String taxId : tax2originalCandidates.keySet()) {
List candidates4tax = tax2originalCandidates.get(taxId);
// Ambiguous 1: Multiple exact matches
if (candidates4tax.size() > 1 && candidates4tax.get(0).isExactMatch() && candidates4tax.get(1).isExactMatch())
ambiguityTypes.add(AmbiguityType.LEXICAL);
if (!candidates4tax.isEmpty() && candidates4tax.get(0).isExactMatch()) {
// If we have already found an exact match for another species, this is an intra species ambiguity
if (exactInOneSpecies)
ambiguityTypes.add(AmbiguityType.INTRASPECIES);
exactInOneSpecies = true;
}
}
}
return ambiguityTypes;
}
public Set getNameTokenSet() {
if (nameTokenSet == null) {
Function> gnTokensFunc = gn -> Arrays.stream(normalizer.normalize(gn.getText()).split("\\s+"));
Stream nameTokens = gnTokensFunc.apply(geneName);
for (GeneName alt : geneName.getAlternatives()) {
nameTokens = Stream.concat(nameTokens, gnTokensFunc.apply(alt));
}
nameTokenSet = nameTokens.collect(Collectors.toSet());
}
return nameTokenSet;
}
public String getEcNumber() {
return geneName.getEcNumber();
}
public String getCompositeResolver() {
return compositeResolver;
}
public void setCompositeResolver(String compositeResolver) {
this.compositeResolver = compositeResolver;
}
public Stream getContextGeneNames() {
if (geneDocument == null)
return Stream.empty();
return geneDocument.getGenes().filter(g -> g != this).map(GeneMention::getGeneName);
}
public GeneSets getGeneSets() {
return geneSets;
}
public void removeGeneSet(GeneSet geneSet) {
this.geneSets.remove(geneSet);
}
public void clearGeneSets() {
if (geneSets != null)
geneSets.clear();
}
public void reject(MentionMappingResult.RejectReason reason) {
// In some cases of overlap and peeking into gold tax IDs it can happen
// that getTaxonomyIds() does not return values that are set to tax2originalCandidates
// or tax2lexicallyRerankedCandidates
Set taxIds = new HashSet<>();
taxIds.addAll(getTaxonomyIds());
if (mentionMappingResult != null && mentionMappingResult.tax2originalCandidates != null)
taxIds.addAll(mentionMappingResult.tax2originalCandidates.keySet());
if (mentionMappingResult != null && mentionMappingResult.tax2lexicallyRerankedCandidates != null)
taxIds.addAll(mentionMappingResult.tax2lexicallyRerankedCandidates.keySet());
for (String tax : taxIds) {
reject(tax, reason);
}
}
public void reject(String tax, MentionMappingResult.RejectReason reason) {
if (mentionMappingResult == null) {
mentionMappingResult = new MentionMappingResult(this);
}
if (mentionMappingResult.tax2lexicallyRerankedCandidates == null)
mentionMappingResult.tax2lexicallyRerankedCandidates = new HashMap<>();
if (mentionMappingResult.tax2finalRankedCandidates == null)
mentionMappingResult.tax2finalRankedCandidates = new HashMap<>();
mentionMappingResult.tax2lexicallyRerankedCandidates.put(tax, List.of(MentionMappingResult.REJECTION));
mentionMappingResult.tax2finalRankedCandidates.put(tax, List.of(MentionMappingResult.REJECTION));
mentionMappingResult.setRejectReason(tax, reason);
}
public enum GeneTagger {
JNET, BANNER, FLAIR, FLAIR_JPG_COLLAPSED_VAR, FLAIR_JPG_COLLAPSED_VARCOMPENUM, FLAIR_BC2TRAINTEST, FLAIR_GNORMPLUSNLMIAT, GOLD, FLAIR_JPG_NOBC2TEST_NOTEST, FLAIR_JPG_NOBC2TEST_NOTEST_COLLAPSED_VAR, FLAIR_JPG_GNP_ENTITIES, CONSISTENCY_TAGGER, EXPANSION_TAGGER, GNORM_PLUS, UNKNOWN, GAZETTEER
}
public enum SpecificType {
GENE, FAMILYNAME, DOMAINMOTIF, GENE_ENUM, NO_GENE, GROUP, COMPLEX, UNKNOWN
}
public enum AmbiguityType {INTRASPECIES, LEXICAL}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy