// de.julielab.geneexpbase.genemodel.GeneMention Maven / Gradle / Ivy
//
// Go to download
package de.julielab.geneexpbase.genemodel;

import cc.mallet.types.FeatureVector;
import cc.mallet.types.InstanceList;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import de.julielab.geneexpbase.TermNormalizer;
import de.julielab.geneexpbase.candidateretrieval.SynHit;
import de.julielab.java.utilities.spanutils.Span;
import org.apache.commons.lang3.Range;
import org.apache.lucene.search.Query;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.*;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.function.Supplier;
import java.util.stream.Collector;
import java.util.stream.Collectors;
import java.util.stream.Stream;


/**
 * A basic "gene mention" that most of all contains the text of the mention.
 * However, we might also need other information, i.e. offsets.
 *
 * @author faessler
 */
public class GeneMention implements Span {
    /**
     * Constant meaning that no ID is given for a GeneMention.
     */
    public static final String NOID = "NoId";
    /** Class-wide logger. */
    private final static Logger log = LoggerFactory.getLogger(GeneMention.class);
    /** The original object (e.g. a UIMA annotation) this mention was created from, see {@link #getOriginalMappedObject()}. */
    private Object originalMappedObject;
    /** Document ID; part of {@link #equals(Object)} and {@link #hashCode()}. */
    private String docId;
    /** Created lazily from {@link #text} and {@link #normalizer} in {@link #getGeneName()} unless set explicitly. */
    private GeneName geneName;
    /**
     * @deprecated a single mention can have multiple (taxonomy) IDs (human and murine pro-alpha3(V)...)
     */
    private String id = NOID;
    /** Normalizer handed to the {@link GeneName} of this mention. */
    private TermNormalizer normalizer;
    /** Offsets of this mention; minimum is the begin, maximum is the end. */
    private Range offsets;
    /** The mention text. */
    private String text;
    /**
     * @deprecated refer to {@link #overlappingGoldMentions}
     */
    private String goldTaxonomyId;
    /** Gold mentions associated with this mention; may be null, see {@link #hasGoldMentions()}. */
    private List overlappingGoldMentions;
    /**
     * @deprecated a single mention can have multiple (taxonomy) IDs (human and murine pro-alpha3(V)...)
     */
    private String taxonomyId;
    /** IDs of this mention. Starts as the immutable empty list and is replaced by a mutable list on the first {@link #addId(String)}. */
    private List ids = Collections.emptyList();
    /**
     * Taxonomy ID candidates for this gene mention. Used during species assignment.
     */
    private Set taxonomyCandidates;
    /**
     * Indicates whether all {@link #taxonomyCandidates} should be assigned or only one of them.
     */
    private boolean isTaxonomyCandidatesConjunctive;
    /**
     * Final taxonomy IDs assigned to this gene mention.
     */
    private List taxonomyIds = Collections.emptyList();
    /** Maps taxonomy IDs to the occurrence types recorded for them, see {@link #getTaxonomyCandidateWithOccurrence(GeneSpeciesOccurrence)}. */
    private Multimap taxonomyOcurrences = HashMultimap.create();
    /** Scores keyed by taxonomy ID; created lazily by {@link #setTaxonomyScore(String, double)}. */
    private Map taxonomyScores;
    /**
     * These will mostly be the same as {@link #taxonomyScores} but can differ in some cases, e.g. when
     * the fallback to the default species has happened since all other candidates seemed too unlikely.
     */
    private Map processedTaxonomyScores;
    /** Cached set view of the taxonomy IDs; built in {@link #getTaxonomyIdsSet()}, reset to null when the IDs are changed through {@link #addTaxonomyId(String)} or {@link #setTaxonomyIds(List)}. */
    private Set taxonomyIdsSet;
    /** When true, {@link #setSpecificType(SpecificType)} no longer changes {@link #specificType}. */
    private boolean specificTypeFrozen;
    private GeneSpeciesOccurrence taxonomyReliability;
    private String documentContext;
    private Query contextQuery;
    private GeneTagger tagger = GeneTagger.UNKNOWN;
    private SpecificType specificType = SpecificType.UNKNOWN;
    private double specificTypeConfidence;
    /** Result of the mapping process for this mention; null until set. */
    private MentionMappingResult mentionMappingResult;
    /** The document used to look up chunks, acronyms, covered text and context for this mention. */
    private GeneDocument geneDocument;
    /** Created lazily by {@link #addTaggingModifier(String)}. */
    private List taggingModifiers;
    // The gene set to which this gene belongs
    private GeneSets geneSets;
    /**
     * A parent GeneMention is a GeneMention that has been split into sub-mentions,
     * most commonly due to conjunctions or enumerations within a GeneMention. Thus,
     * when parent is not null, this GeneMention resulted from a split of another
     * GeneMention.
     */
    private GeneMention parent;
    /**
     * A GeneMention that is actually a composite to be expanded to multiple different genes, like enumerations
     * or numerical ranges, can have derived GeneMentions corresponding to the individual genes denoted by the
     * composite expression. Those are the children.
     */
    private List children;
    private boolean isCompositeMention;
    private List posTags;
    private FeatureVector featureVector;
    private String reducedNameForExactMatch;
    private InstanceList instances;
    /** Family name hits; elements are used as {@link SynHit} in {@link #isExactFamilyNameMatch()}. */
    private List familyNames;
    /** Cache for {@link #getBestCandidateSynonym()}. */
    private String bestCandidateSynonym;
    private String compositeResolver;
    private Set nameTokenSet;
    /** Feature values keyed by feature name, see {@link #addFamilyFeature(String, double)}. */
    private Map familyFeatures = new HashMap<>();

    /**
     * Copy constructor. Copies gene name, context query, document ID and context, gene document, IDs, normalizer,
     * offsets, tagger, taxonomy information, text, original mapped object, parent, children, gold mentions,
     * specific type (and its confidence), mapping result and family features from {@code gm}.
     * <p>
     * The gene name, both ID lists, the family feature map and the {@link MentionMappingResult} are duplicated
     * (the latter via its own copy constructor), so the copy does not alias them. All other references, e.g. the
     * taxonomy occurrences, the taxonomy score maps, the children and the gold mentions, are shared with
     * {@code gm}. Fields not assigned here keep their default values.
     *
     * @param gm The gene mention to copy.
     */
    public GeneMention(GeneMention gm) {
        if (gm.geneName != null)
            this.geneName = new GeneName(gm.geneName);
        this.contextQuery = gm.contextQuery;
        this.docId = gm.docId;
        this.documentContext = gm.documentContext;
        this.geneDocument = gm.geneDocument;
        this.id = gm.id;
        if (gm.ids != null)
            this.ids = new ArrayList<>(gm.ids);
        this.normalizer = gm.normalizer;
        this.offsets = gm.offsets;
        this.tagger = gm.tagger;
        this.taxonomyOcurrences = gm.taxonomyOcurrences;
        this.taxonomyId = gm.taxonomyId;
        // Copy instead of aliasing: addTaxonomyId() on the copy must not modify gm, and the list returned by
        // getTaxonomyIds() may be an immutable singleton or empty list.
        this.taxonomyIds = new ArrayList<>(gm.getTaxonomyIds());
        this.taxonomyScores = gm.taxonomyScores;
        this.processedTaxonomyScores = gm.processedTaxonomyScores;
        this.text = gm.text;
        this.originalMappedObject = gm.originalMappedObject;
        this.parent = gm.parent;
        this.children = gm.children;
        this.overlappingGoldMentions = gm.overlappingGoldMentions;
        this.specificType = gm.getSpecificType();
        this.specificTypeConfidence = gm.getSpecificTypeConfidence();
        if (gm.getMentionMappingResult() != null)
            this.mentionMappingResult = new MentionMappingResult(gm.getMentionMappingResult());
        this.familyFeatures = new HashMap<>(gm.familyFeatures);
    }

    /**
     * Creates a gene mention with the given text and an empty list of children.
     *
     * @param text The mention text.
     */
    public GeneMention(String text) {
        this();
        this.children = Collections.emptyList();
        this.text = text;
    }


    /**
     * Creates a gene mention without text or offsets.
     */
    public GeneMention() {
        // Start with an empty child list so that addChild() and isCompositeMention(), which dereference the
        // list, work on instances created through this constructor as well.
        this.children = Collections.emptyList();
    }

    /**
     * Creates a gene mention with the given text and passes the normalizer to {@link #setNormalizer(TermNormalizer)}.
     *
     * @param text       The mention text.
     * @param normalizer The normalizer to use for this mention.
     */
    public GeneMention(String text, TermNormalizer normalizer) {
        this(text);
        setNormalizer(normalizer);
    }

    /**
     * Creates a gene mention with the given text and an offset range built from {@code begin} and {@code end}.
     *
     * @param text  The mention text.
     * @param begin The begin offset.
     * @param end   The end offset.
     */
    public GeneMention(String text, int begin, int end) {
        this(text);
        offsets = Range.between(begin, end);
    }

    /**
     * Creates a gene mention with text, offsets and normalizer.
     *
     * @param text       The mention text.
     * @param begin      The begin offset.
     * @param end        The end offset.
     * @param normalizer The normalizer, passed to {@link #setNormalizer(TermNormalizer)}.
     */
    public GeneMention(String text, int begin, int end, TermNormalizer normalizer) {
        this(text, begin, end);
        setNormalizer(normalizer);
    }

    /**
     * Stores a family feature value under the given name.
     *
     * @param featureName The feature name used as map key.
     * @param value       The feature value.
     * @return The value previously stored under {@code featureName}, or null if there was none.
     */
    public Double addFamilyFeature(String featureName, double value) {
        return this.familyFeatures.put(featureName, value);
    }

    /**
     * @return The internal family feature map (not a copy).
     */
    public Map getFamilyFeatures() {
        return this.familyFeatures;
    }

    /**
     * @return True if at least one family name has been set for this mention.
     */
    public boolean matchesFamilyName() {
        return !(familyNames == null || familyNames.isEmpty());
    }

    /**
     * Returns the one gene set this mention belongs to.
     *
     * @return The only gene set of this mention.
     * @throws IllegalArgumentException If the mention is not associated with exactly one gene set. This includes
     *                                  the case that no gene set was ever added.
     */
    public GeneSet getSingleGeneSet() {
        // geneSets is created lazily by addGeneSet(); treat "never added" as zero gene sets instead of failing
        // with a NullPointerException
        int numGeneSets = geneSets == null ? 0 : geneSets.size();
        if (numGeneSets != 1)
            throw new IllegalArgumentException("There is not a single geneset associated with this gene mention but there are " + numGeneSets + " for gene mention " + this + ": " + geneSets);
        return geneSets.stream().findAny().get();
    }

    /**
     * Adds a gene set to this mention, creating the internal collection on first use.
     *
     * @param geneSet The gene set to add; must be neither null nor empty.
     * @throws IllegalArgumentException If {@code geneSet} is null or empty.
     */
    public void addGeneSet(GeneSet geneSet) {
        if (geneSet == null || geneSet.isEmpty()) {
            throw new IllegalArgumentException("The passed geneset is " + (geneSet == null ? "null" : "empty") + ".");
        }
        if (this.geneSets == null) {
            this.geneSets = new GeneSets();
        }
        this.geneSets.add(geneSet);
    }

    /**
     * @return Whether all taxonomy candidates should be assigned (true) or only one of them (false).
     */
    public boolean isTaxonomyCandidatesConjunctive() {
        return this.isTaxonomyCandidatesConjunctive;
    }

    /**
     * @param taxonomyCandidatesConjunctive Whether all taxonomy candidates should be assigned or only one of them.
     */
    public void setTaxonomyCandidatesConjunctive(boolean taxonomyCandidatesConjunctive) {
        this.isTaxonomyCandidatesConjunctive = taxonomyCandidatesConjunctive;
    }

    /**
     * @return The taxonomy ID candidates, or an empty set if none have been set. Never null.
     */
    public Set getTaxonomyCandidates() {
        if (taxonomyCandidates == null) {
            return Collections.emptySet();
        }
        return taxonomyCandidates;
    }

    /**
     * @param taxonomyCandidates The taxonomy ID candidates; stored as is, without copying.
     */
    public void setTaxonomyCandidates(Set taxonomyCandidates) {
        this.taxonomyCandidates = taxonomyCandidates;
    }

    /**
     * @return The family names set for this mention; null if none were set.
     */
    public List getFamilyNames() {
        return this.familyNames;
    }

    /**
     * @param matchedFamilyNames The family names matched for this mention; stored as is.
     */
    public void setFamilyNames(List matchedFamilyNames) {
        this.familyNames = matchedFamilyNames;
    }

    /**
     * Returns the taxonomy scores that have undergone threshold filtering. It is possible that this is not
     * the outcome of the ML-based approach but just the default species of a document due to threshold filtering.
     *
     * @return The processed taxonomy scores; null if none were set.
     */
    public Map getProcessedTaxonomyScores() {
        return this.processedTaxonomyScores;
    }

    /**
     * @param processedTaxonomyScores The processed taxonomy scores; the map is stored as is, replacing any
     *                                previously set map.
     */
    public void setProcessedTaxonomyScores(Map processedTaxonomyScores) {
        this.processedTaxonomyScores = processedTaxonomyScores;
    }

    /**
     * @return The first overlapping gold mention, or null if there are no gold mentions.
     */
    public GeneMention getFirstGoldMention() {
        if (hasGoldMentions()) {
            return overlappingGoldMentions.get(0);
        }
        return null;
    }

    /**
     * The original UIMA annotation that is mapped. Most likely a subclass of EntityMention.
     *
     * @return The original object to be mapped; null if none was set.
     */
    public Object getOriginalMappedObject() {
        return this.originalMappedObject;
    }

    /**
     * @param originalMappedObject The original object this mention was derived from.
     */
    public void setOriginalMappedObject(Object originalMappedObject) {
        this.originalMappedObject = originalMappedObject;
    }

    /**
     * @return The internal list of IDs of this mention (not a copy).
     */
    public List getIds() {
        return this.ids;
    }

    /**
     * Replaces the IDs of this mention. With assertions enabled, the list is checked to contain neither a null
     * element nor the string {@code "null"}.
     *
     * @param ids The new IDs; the list is stored as is.
     */
    public void setIds(List ids) {
        assert ids.stream().noneMatch(Objects::isNull) : "There is a null item in the IDs to be set.";
        assert !ids.contains("null") : "The string 'null' is among the IDs to be set.";
        this.ids = ids;
    }

    /**
     * @return The result candidates of the mention mapping result.
     * @throws IllegalArgumentException If no mention mapping result has been set yet.
     */
    public Stream getMappedSynHits() {
        if (mentionMappingResult != null) {
            return mentionMappingResult.getResultCandidates();
        }
        throw new IllegalArgumentException("This gene mention was not yet mapped, there are no final ranked candidates.");
    }

    /**
     * Returns the IDs of all mapped candidates that are not rejection candidates. With assertions enabled, it is
     * first checked that none of these IDs is null.
     *
     * @return The IDs of the non-rejected mapped candidates.
     * @throws IllegalArgumentException If no mention mapping result has been set yet.
     */
    public Stream getMappedIds() {
        assert getMappedSynHits()
                .filter(Predicate.not(SynHit::isRejectionCandidate))
                .map(SynHit::getId)
                .noneMatch(Objects::isNull) : "A null ID is returned for " + this;
        return getMappedSynHits()
                .filter(Predicate.not(SynHit::isRejectionCandidate))
                .map(SynHit::getId);
    }

    /**
     * @return The elements of {@link #getMappedIds()} collected into a set.
     */
    public Set getMappedIdSet() {
        return this.getMappedIds().collect(Collectors.toSet());
    }

    /**
     * Adds an ID to this mention.
     *
     * @param id The ID to add.
     */
    public void addId(String id) {
        // The default value is the immutable empty list and the list may have been set to null; in both cases
        // a fresh mutable list is required before adding.
        if (ids == null || ids.isEmpty())
            ids = new ArrayList<>();
        ids.add(id);
    }

    /**
     * Adds a taxonomy ID to this mention and invalidates the cached taxonomy ID set.
     *
     * @param id The taxonomy ID to add.
     */
    public void addTaxonomyId(String id) {
        taxonomyIdsSet = null;
        if (taxonomyIds == null)
            // No list yet: seed with what getTaxonomyIds() currently reports so those IDs stay visible.
            taxonomyIds = new ArrayList<>(getTaxonomyIds());
        else if (taxonomyIds.isEmpty())
            // The default value is the immutable empty list; replace it before adding.
            taxonomyIds = new ArrayList<>();
        taxonomyIds.add(id);
    }

    /**
     * @return The taxonomy ID list if it is not null; otherwise a singleton list with the single taxonomy ID if
     * that is set; otherwise an empty list. Never null.
     */
    public List getTaxonomyIds() {
        if (taxonomyIds != null) {
            return taxonomyIds;
        }
        return taxonomyId == null ? Collections.emptyList() : Collections.singletonList(taxonomyId);
    }

    /**
     * Replaces the taxonomy IDs and invalidates the cached taxonomy ID set.
     *
     * @param taxonomyIds The new taxonomy IDs; the list is stored as is.
     */
    public void setTaxonomyIds(List taxonomyIds) {
        this.taxonomyIdsSet = null;
        this.taxonomyIds = taxonomyIds;
    }

    /**
     * Returns the taxonomy IDs whose first lexically reranked candidate is not a rejection candidate. Without a
     * mention mapping result, all taxonomy IDs are returned.
     * <p>
     * The first reranked candidate is looked up without checks, so a taxonomy ID without a non-empty reranked
     * candidate list makes this method fail.
     *
     * @return The taxonomy IDs that have not been rejected.
     */
    public List getNonRejectedTaxonomyIds() {
        if (mentionMappingResult != null) {
            // we check the lexically reranked candidates for the rejection because it is not set to the original candidates since those should remain the original list, even if empty
            return getTaxonomyIds().stream()
                    .filter(taxId -> !mentionMappingResult.tax2lexicallyRerankedCandidates.get(taxId).get(0).isRejectionCandidate())
                    .collect(Collectors.toList());
        }
        return getTaxonomyIds();
    }

    /**
     * @return The taxonomy IDs as a set. The set is built from {@link #getTaxonomyIds()} on first access and
     * cached afterwards.
     */
    public Set getTaxonomyIdsSet() {
        if (this.taxonomyIdsSet == null) {
            this.taxonomyIdsSet = new HashSet<>(getTaxonomyIds());
        }
        return this.taxonomyIdsSet;
    }

    /**
     * Adds a child mention, i.e. a mention derived from this composite mention.
     *
     * @param child The child mention to add.
     */
    public void addChild(GeneMention child) {
        // children is null for instances created without text and is the immutable empty list otherwise;
        // both need to be replaced by a mutable list before adding
        if (children == null || children.isEmpty())
            children = new ArrayList<>();
        children.add(child);
    }

    /**
     * @return True if this mention is flagged as composite or has at least one child.
     */
    public boolean isCompositeMention() {
        // children may be null for instances that never had a child list assigned
        return isCompositeMention || (children != null && !children.isEmpty());
    }

    /**
     * @return The overlapping gold mentions; null if none were set.
     */
    public List getOverlappingGoldMentions() {
        return this.overlappingGoldMentions;
    }

    /**
     * @param overlappingGoldMentions The gold mentions overlapping this mention; stored as is.
     */
    public void setOverlappingGoldMentions(List overlappingGoldMentions) {
        this.overlappingGoldMentions = overlappingGoldMentions;
    }

    /**
     * @return The first taxonomy ID of the first gold mention, or null if there are no gold mentions or the
     * first gold mention has no taxonomy ID.
     */
    public String getAnyGoldTaxonomyId() {
        if (!hasGoldMentions()) return null;
        List<String> goldTaxIds = overlappingGoldMentions.get(0).getTaxonomyIds();
        // a gold mention without taxonomy IDs yields "no ID" instead of an IndexOutOfBoundsException
        return goldTaxIds.isEmpty() ? null : goldTaxIds.get(0);
    }

    /**
     * @return The taxonomy IDs of the first gold mention, or an empty list if there are no gold mentions.
     */
    public List getAnyGoldTaxonomyIds() {
        if (hasGoldMentions()) {
            return overlappingGoldMentions.get(0).getTaxonomyIds();
        }
        return Collections.emptyList();
    }

    /**
     * @return The taxonomy IDs of all gold mentions as a list; an empty list if there are no gold mentions.
     */
    public List getAllGoldTaxonomyIdsAsList() {
        return this.getAllGoldTaxonomyIds(Collectors.toList(), Collections::emptyList);
    }

    /**
     * @return The taxonomy IDs of all gold mentions as a set; an empty set if there are no gold mentions.
     */
    public Set getAllGoldTaxonomyIdsAsSet() {
        return this.getAllGoldTaxonomyIds(Collectors.toSet(), Collections::emptySet);
    }

    /**
     * Collects the taxonomy IDs of all gold mentions with the given collector.
     *
     * @param collector           The collector that builds the result from the taxonomy IDs.
     * @param emptyResultSupplier Supplies the result to return when there are no gold mentions.
     * @param <T>                 The element type accepted by the collector.
     * @param <R>                 The result type.
     * @return The collected taxonomy IDs or the supplied empty result.
     */
    @SuppressWarnings("unchecked") // the IDs are cast to the collector's element type
    public <T, R> R getAllGoldTaxonomyIds(Collector<T, ?, R> collector, Supplier<R> emptyResultSupplier) {
        if (!hasGoldMentions()) return emptyResultSupplier.get();
        return overlappingGoldMentions.stream().map(GeneMention::getTaxonomyIds).flatMap(Collection::stream).map(id -> (T) id).collect(collector);
    }

    /**
     * @return A stream of the taxonomy IDs of all gold mentions; an empty stream if there are no gold mentions.
     */
    public Stream getAllGoldTaxonomyIds() {
        // same guard as in getAllGoldIds(): the gold mention list may be null
        if (!hasGoldMentions()) return Stream.empty();
        return overlappingGoldMentions.stream().map(GeneMention::getTaxonomyIds).flatMap(Collection::stream);
    }

    /**
     * @return The result of {@link #getGoldMentionId()} of the first gold mention, or null if there are no gold
     * mentions.
     */
    public String getAnyGoldId() {
        if (hasGoldMentions()) {
            return overlappingGoldMentions.get(0).getGoldMentionId();
        }
        return null;
    }

    /**
     * @return The IDs of the first gold mention, or an empty list if there are no gold mentions.
     */
    public List getAnyGoldIds() {
        if (hasGoldMentions()) {
            return overlappingGoldMentions.get(0).getIds();
        }
        return Collections.emptyList();
    }

    /**
     * @return The IDs of all gold mentions as a list; an empty list if there are no gold mentions.
     */
    public List getAllGoldIdsAsList() {
        return this.getAllGoldIds(Collectors.toList(), Collections::emptyList);
    }

    /**
     * @return The IDs of all gold mentions as a set; an empty set if there are no gold mentions.
     */
    public Set getAllGoldIdAsSet() {
        return this.getAllGoldIds(Collectors.toSet(), Collections::emptySet);
    }

    /**
     * Collects the IDs of all gold mentions with the given collector.
     *
     * @param collector           The collector that builds the result from the IDs.
     * @param emptyResultSupplier Supplies the result to return when there are no gold mentions.
     * @param <T>                 The element type accepted by the collector.
     * @param <R>                 The result type.
     * @return The collected IDs or the supplied empty result.
     */
    @SuppressWarnings("unchecked") // the IDs are cast to the collector's element type
    private <T, R> R getAllGoldIds(Collector<T, ?, R> collector, Supplier<R> emptyResultSupplier) {
        if (!hasGoldMentions()) return emptyResultSupplier.get();
        return getAllGoldIds().map(id -> (T) id).collect(collector);
    }

    /**
     * @return A stream of the IDs of all gold mentions; an empty stream if there are no gold mentions.
     */
    public Stream getAllGoldIds() {
        if (hasGoldMentions()) {
            return overlappingGoldMentions.stream().map(GeneMention::getIds).flatMap(Collection::stream);
        }
        return Stream.empty();
    }

    /**
     * @return True if the list of overlapping gold mentions is neither null nor empty.
     */
    public boolean hasGoldMentions() {
        return !(overlappingGoldMentions == null || overlappingGoldMentions.isEmpty());
    }

    /**
     * @return The gold taxonomy ID set via {@link #setGoldTaxonomyId(String)}; null if none was set.
     * @deprecated Use {@link #getAllGoldTaxonomyIdsAsList()}, {@link #getAllGoldTaxonomyIdsAsSet()} or {@link #getAnyGoldTaxonomyId()} instead.
     */
    public String getGoldTaxonomyId() {
        return this.goldTaxonomyId;
    }

    /**
     * @param goldTaxonomyId The gold taxonomy ID to store.
     */
    public void setGoldTaxonomyId(String goldTaxonomyId) {
        this.goldTaxonomyId = goldTaxonomyId;
    }

    /**
     * Stores the score for a taxonomy ID, creating the score map on first use.
     *
     * @param tax   The taxonomy ID.
     * @param score The score for {@code tax}.
     */
    public void setTaxonomyScore(String tax, double score) {
        if (this.taxonomyScores == null) {
            this.taxonomyScores = new HashMap<>();
        }
        this.taxonomyScores.put(tax, score);
    }

    /**
     * Stores the processed score for a taxonomy ID, creating the processed score map on first use.
     *
     * @param tax   The taxonomy ID.
     * @param score The processed score for {@code tax}.
     */
    public void setProcessedTaxonomyScore(String tax, double score) {
        if (this.processedTaxonomyScores == null) {
            this.processedTaxonomyScores = new HashMap<>();
        }
        this.processedTaxonomyScores.put(tax, score);
    }

    /**
     * @param taxonomyId The taxonomy ID to get the score for.
     * @return The score stored for {@code taxonomyId}, or 0 if there is no score map or no entry for the ID.
     */
    public double getTaxonomyScore(String taxonomyId) {
        if (taxonomyScores == null) {
            return 0;
        }
        return taxonomyScores.getOrDefault(taxonomyId, 0d);
    }

    /**
     * @param taxonomyId The taxonomy ID to get the processed score for.
     * @return The processed score stored for {@code taxonomyId}, or 0 if there is no processed score map or no
     * entry for the ID.
     */
    public double getProcessedTaxonomyScore(String taxonomyId) {
        if (processedTaxonomyScores == null) {
            return 0;
        }
        return processedTaxonomyScores.getOrDefault(taxonomyId, 0d);
    }

    /**
     * @return The taxonomy score map (not a copy); null if no scores were set.
     */
    public Map getTaxonomyScores() {
        return this.taxonomyScores;
    }

    /**
     * Sets the original taxonomy scores returned by the ML-approach and the rule that species occurring previous
     * to a gene in the same NP are surely assigned to this gene.
     *
     * @param taxonomyScores The taxonomy scores; the map is stored as is, replacing any previously set map.
     */
    public void setTaxonomyScores(Map taxonomyScores) {
        this.taxonomyScores = taxonomyScores;
    }

    /**
     * @return The taxonomy reliability value set for this mention; null if none was set.
     */
    public GeneSpeciesOccurrence getTaxonomyReliability() {
        return this.taxonomyReliability;
    }

    /**
     * @param taxonomyReliability The taxonomy reliability value to store.
     */
    public void setTaxonomyReliability(GeneSpeciesOccurrence taxonomyReliability) {
        this.taxonomyReliability = taxonomyReliability;
    }

    /**
     * @return The tagging modifiers added so far; null if none were added.
     */
    public List getTaggingModifiers() {
        return this.taggingModifiers;
    }

    /**
     * @return The single taxonomy ID if set; otherwise the first element of the taxonomy ID list; null if that
     * list is null or empty.
     */
    public String getTaxonomyId() {
        if (taxonomyId != null) {
            return taxonomyId;
        }
        boolean noTaxonomyIds = taxonomyIds == null || taxonomyIds.isEmpty();
        return noTaxonomyIds ? null : taxonomyIds.get(0);
    }

    /**
     * Sets the single taxonomy ID and replaces the taxonomy ID list by a new list that contains only this ID
     * (or nothing if the ID is null). The cached taxonomy ID set is invalidated.
     *
     * @param taxonomyId The taxonomy ID; may be null.
     */
    public void setTaxonomyId(String taxonomyId) {
        this.taxonomyId = taxonomyId;
        this.taxonomyIds = new ArrayList<>();
        // the list was replaced, so a set cached by getTaxonomyIdsSet() would be stale
        this.taxonomyIdsSet = null;
        if (taxonomyId != null)
            this.taxonomyIds.add(taxonomyId);
    }

    /**
     * @return The multimap of taxonomy occurrences (not a copy).
     */
    public Multimap getTaxonomyOccurrences() {
        return this.taxonomyOcurrences;
    }

    /**
     * @param taxonomyOcurrences The multimap of taxonomy occurrences; stored as is.
     */
    public void setTaxonomyOcurrences(Multimap taxonomyOcurrences) {
        this.taxonomyOcurrences = taxonomyOcurrences;
    }

    /**
     * @return The document context string set for this mention; null if none was set.
     */
    public String getDocumentContext() {
        return this.documentContext;
    }

    /**
     * @param documentContext The document context string to store.
     */
    public void setDocumentContext(String documentContext) {
        this.documentContext = documentContext;
    }

    /**
     * Delegates to the gene document to obtain the context of this mention's offsets. Requires the gene document
     * to be set.
     *
     * @param numTokens           Passed through to the gene document.
     * @param excludedTokens      Passed through to the gene document.
     * @param excludeGeneMentions Passed through to the gene document.
     * @return The context stream returned by the gene document.
     */
    public Stream getDocumentContext(int numTokens, Set excludedTokens, boolean excludeGeneMentions) {
        return this.geneDocument.getDocumentContext(this.offsets, excludedTokens, excludeGeneMentions, numTokens);
    }

    /**
     * Delegates to the gene document to obtain the context of this mention's offsets. Requires the gene document
     * to be set.
     *
     * @param numTokens Passed through to the gene document.
     * @return The context stream returned by the gene document.
     */
    public Stream getDocumentContext(int numTokens) {
        return this.geneDocument.getDocumentContext(this.offsets, numTokens);
    }

    /**
     * @return The context query set for this mention; null if none was set.
     */
    public Query getContextQuery() {
        return this.contextQuery;
    }

    /**
     * @param contextQuery The context query to store.
     */
    public void setContextQuery(Query contextQuery) {
        this.contextQuery = contextQuery;
    }

    /**
     * Hash code over document ID, ID, offsets, tagger, single taxonomy ID and text, i.e. the same fields that
     * {@link #equals(Object)} compares.
     */
    @Override
    public int hashCode() {
        // Objects.hash applies the same "31 * result + hash, starting at 1, null counts as 0" scheme
        return Objects.hash(docId, id, offsets, tagger, taxonomyId, text);
    }

    /**
     * Two gene mentions are equal if they are of the same class and agree in document ID, ID, offsets, tagger,
     * single taxonomy ID and text.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        GeneMention that = (GeneMention) obj;
        return Objects.equals(docId, that.docId)
                && Objects.equals(id, that.id)
                && Objects.equals(offsets, that.offsets)
                && tagger == that.tagger
                && Objects.equals(taxonomyId, that.taxonomyId)
                && Objects.equals(text, that.text);
    }

    /**
     * @return The minimum of the offset range. Requires the offsets to be set.
     */
    public int getBegin() {
        return this.offsets.getMinimum();
    }

    /**
     * Whether or not this gene mention has been rejected for being a gene mention at all.
     *
     * @return True if a mention mapping result is present and reports rejection; false otherwise, in particular
     * when no mapping result has been set.
     */
    public boolean isRejected() {
        if (mentionMappingResult == null) {
            return false;
        }
        return mentionMappingResult.isRejected();
    }

    /**
     * @return The document ID of this mention; null if none was set.
     */
    public String getDocId() {
        return this.docId;
    }

    /**
     * @param docId The document ID to store.
     */
    public void setDocId(String docId) {
        this.docId = docId;
    }

    /**
     * @return The maximum of the offset range. Requires the offsets to be set.
     */
    public int getEnd() {
        return this.offsets.getMaximum();
    }

    /**
     * Returns the gene name of this mention. If none exists yet, it is created from the mention text and the
     * normalizer and kept for later calls.
     *
     * @return The gene name.
     * @throws IllegalStateException If there is no gene name yet and no normalizer has been set.
     */
    public GeneName getGeneName() {
        if (geneName == null) {
            if (normalizer == null) {
                throw new IllegalStateException(
                        "This GeneMention has not set a TermNormalizer and thus cannot create a GeneName instance.");
            }
            geneName = new GeneName(text, normalizer);
        }
        return geneName;
    }

    /**
     * @param geneName The gene name to use for this mention.
     */
    public void setGeneName(GeneName geneName) {
        this.geneName = geneName;
    }

    /**
     * This field is only used for gold mentions.
     *
     * @return The gene ID of this mention; {@link #NOID} unless another value was set via {@link #setId(String)}.
     * @deprecated Use {@link #overlappingGoldMentions} to represent gold annotations
     */
    @Deprecated
    public String getGoldMentionId() {
        return this.id;
    }

    /**
     * @param id The ID returned by {@link #getGoldMentionId()}.
     */
    public void setId(String id) {
        this.id = id;
    }

    /**
     * @return The term normalizer of this mention; null if none was set.
     */
    public TermNormalizer getNormalizer() {
        return this.normalizer;
    }

    /**
     * Sets the normalizer and, if a gene name already exists, passes the normalizer on to it.
     *
     * @param normalizer The term normalizer.
     */
    public void setNormalizer(TermNormalizer normalizer) {
        this.normalizer = normalizer;
        if (geneName != null) {
            geneName.setNormalizer(normalizer);
        }
    }

    /**
     * @return The offset range of this mention; null if none was set.
     */
    public Range getOffsets() {
        return this.offsets;
    }

    /**
     * @param offsets The offset range of this mention.
     */
    public void setOffsets(Range offsets) {
        this.offsets = offsets;
    }

    /**
     * @return The mention text.
     */
    public String getText() {
        return this.text;
    }

    /**
     * Sets the mention text and, if a gene name already exists, updates its text as well.
     *
     * @param text The mention text.
     */
    public void setText(String text) {
        this.text = text;
        if (this.geneName != null) {
            this.geneName.setText(text);
        }
    }

    /**
     * Returns the text of this gene extended to the end of its overlapping NP-chunk. If there is no such chunk, the original text is returned.
     *
     * @return
     */
    public String getRightExtendedText() {
        Set, String>> chunkNP = geneDocument.getOverlappingChunks(getOffsets(), "ChunkNP");
        if (!chunkNP.isEmpty()) {

            Integer chunkend = chunkNP.iterator().next().getKey().getMaximum();
            if (chunkend > getEnd()) {
                return geneDocument.getCoveredText(getBegin(), chunkend);
            }
        }
        return text;
    }

    public Range getRightExtendedOffsets() {
        Set, String>> chunkNP = geneDocument.getOverlappingChunks(getOffsets(), "ChunkNP");
        if (!chunkNP.isEmpty()) {

            Integer chunkend = chunkNP.iterator().next().getKey().getMaximum();
            if (chunkend > getEnd()) {
                return Range.between(getBegin(), chunkend);
            }
        }
        return offsets;
    }

    public Range getPhraseExtendesOffsets() {
        Set, String>> chunkNP = geneDocument.getOverlappingChunks(getOffsets(), "ChunkNP");
        if (!chunkNP.isEmpty()) {
            return chunkNP.iterator().next().getKey();
        }
        return offsets;
    }

    /**
     * @return The document text covered by {@link #getPhraseExtendesOffsets()}. Requires the gene document to
     * be set.
     */
    public String getPhraseExtendedText() {
        return this.geneDocument.getCoveredText(this.getPhraseExtendesOffsets());
    }

    /**
     * String form with text, offsets, document ID, the IDs of the mapping result candidates ({@link #NOID} if
     * there is no mapping result with final ranked candidates), taxonomy IDs, gold IDs, gold taxonomy IDs and
     * tagger.
     */
    @Override
    public String toString() {
        final String mappedIds;
        if (mentionMappingResult != null && mentionMappingResult.tax2finalRankedCandidates != null) {
            mappedIds = mentionMappingResult.getResultCandidates().map(SynHit::getId).collect(Collectors.joining(", "));
        } else {
            mappedIds = NOID;
        }
        return "GeneMention [text=" + text + ", offsets=" + offsets + ", docId=" + docId + ", id=" + mappedIds
                + ",  taxonomyIds=" + taxonomyIds + ", goldIds=" + getAllGoldIdsAsList()
                + ", goldTaxIds=" + getAllGoldTaxonomyIdsAsList() + ", tagger=" + tagger + "]";
    }

    /**
     * @return The normalized text of the gene name, see {@link #getGeneName()}.
     * @throws IllegalStateException If the gene name cannot be created because no normalizer is set.
     */
    public String getNormalizedText() {
        return this.getGeneName().getNormalizedText();
    }

    /**
     * @return The normalized text variant of the gene name, see {@link #getGeneName()}.
     * @throws IllegalStateException If the gene name cannot be created because no normalizer is set.
     */
    public List getNormalizedTextVariant() {
        return this.getGeneName().getNormalizedTextVariant();
    }

    /**
     * @return The tagger value of this mention; {@link GeneTagger#UNKNOWN} unless changed.
     */
    public GeneTagger getTagger() {
        return this.tagger;
    }

    /**
     * @param tagger The tagger value to store for this mention.
     */
    public void setTagger(GeneTagger tagger) {
        this.tagger = tagger;
    }

    /**
     * @return The object representing the result of the mapping process for this
     * particular gene mention; null if no result has been set.
     */
    public MentionMappingResult getMentionMappingResult() {
        return this.mentionMappingResult;
    }

    /**
     * @param mentionMappingResult The mapping result for this mention. The value is stored without any check,
     *                             so null is accepted.
     */
    public void setMentionMappingResult(MentionMappingResult mentionMappingResult) {
        this.mentionMappingResult = mentionMappingResult;
    }

    /**
     * Delegates to the mention mapping result, which must have been set.
     *
     * @param taxonomyId The taxonomy ID to get the result candidate for.
     * @return The result candidate the mapping result returns for {@code taxonomyId}.
     */
    public SynHit getResultCandidate(String taxonomyId) {
        assert this.mentionMappingResult != null : "The mention mapping result is null";
        return this.mentionMappingResult.getResultCandidate(taxonomyId);
    }

    /**
     * Delegates to the mention mapping result, which must have been set.
     *
     * @return The result candidates of the mapping result.
     */
    public Stream getResultCandidates() {
        assert this.mentionMappingResult != null : "The mention mapping result is null";
        return this.mentionMappingResult.getResultCandidates();
    }

    /**
     * @return The gene document of this mention; null if none was set.
     */
    public GeneDocument getGeneDocument() {
        return this.geneDocument;
    }

    /**
     * @param geneDocument The gene document this mention refers to for document-level lookups.
     */
    public void setGeneDocument(GeneDocument geneDocument) {
        this.geneDocument = geneDocument;
    }

    /**
     * A parent GeneMention is a GeneMention that has been split into sub-mentions,
     * most commonly due to conjunctions or enumerations within a GeneMention. Thus,
     * when the parent is not null, this GeneMention resulted from a split of another
     * GeneMention.
     *
     * @return The GeneMention that has been split to produce this - and possibly
     * other - GeneMention(s); null if there is no parent.
     * @deprecated Such cases are handled by GeneCompositeNameResolver
     */
    public GeneMention getParent() {
        return this.parent;
    }

    /**
     * @param parent The mention this mention was split from.
     */
    public void setParent(GeneMention parent) {
        this.parent = parent;
    }

    /**
     * Adds a tagging modifier, creating the modifier list on first use.
     *
     * @param modifier The modifier to add.
     */
    public void addTaggingModifier(String modifier) {
        if (this.taggingModifiers == null) {
            this.taggingModifiers = new ArrayList<>();
        }
        this.taggingModifiers.add(modifier);
    }

    /**
     * @return The POS tags set for this mention; null if none were set.
     */
    public List getPosTags() {
        return this.posTags;
    }

    /**
     * @param posTags The POS tags for this mention; the list is stored as is.
     */
    public void setPosTags(List posTags) {
        this.posTags = posTags;
    }

    /**
     * @return The specific type of this mention; {@link SpecificType#UNKNOWN} unless changed.
     */
    public SpecificType getSpecificType() {
        return this.specificType;
    }

    /**
     * Sets the specific type unless it has been frozen via {@link #freezeSpecificType()}. In the frozen case
     * the call only logs a warning.
     *
     * @param specificType The specific type to set.
     */
    public void setSpecificType(SpecificType specificType) {
        if (specificTypeFrozen) {
            log.warn("Specific type not set: It is frozen");
            return;
        }
        this.specificType = specificType;
    }

    /**
     * @return The feature vector set for this mention; null if none was set.
     */
    public FeatureVector getFeatureVector() {
        return this.featureVector;
    }

    /**
     * @param featureVector The feature vector to store.
     */
    public void setFeatureVector(FeatureVector featureVector) {
        this.featureVector = featureVector;
    }

    /**
     * @return True if the gene document reports at least one acronym long form overlapping the offsets of this
     * mention. Requires the gene document to be set.
     */
    public boolean isAbbreviationLongForm() {
        boolean noOverlap = geneDocument.getOverlappingAcronymLongforms(this.offsets).isEmpty();
        return !noOverlap;
    }

    /**
     * @return True if the gene document reports at least one acronym overlapping the offsets of this mention.
     * Requires the gene document to be set.
     */
    public boolean isAbbreviation() {
        boolean noOverlap = geneDocument.getOverlappingAcronyms(this.offsets).isEmpty();
        return !noOverlap;
    }

    /**
     * @return The confidence value stored for the specific type.
     */
    public double getSpecificTypeConfidence() {
        return this.specificTypeConfidence;
    }

    /**
     * @param specificTypeConfidence The confidence value for the specific type.
     */
    public void setSpecificTypeConfidence(double specificTypeConfidence) {
        this.specificTypeConfidence = specificTypeConfidence;
    }

    /**
     * @return The reduced name for exact matching; null if none was set.
     */
    public String getReducedNameForExactMatch() {
        return this.reducedNameForExactMatch;
    }

    /**
     * @param reducedNameForExactMatch The reduced name for exact matching.
     */
    public void setReducedNameForExactMatch(String reducedNameForExactMatch) {
        this.reducedNameForExactMatch = reducedNameForExactMatch;
    }

    /**
     * Freezes the specific type: subsequent calls to {@link #setSpecificType(SpecificType)} no longer change it.
     */
    public void freezeSpecificType() {
        this.specificTypeFrozen = true;
    }

    /**
     * @return True if any original candidate of any taxonomy ID is an exact match; false if there is none or if
     * no mapping result or no original candidates are available.
     */
    public boolean hasExactCandidateMatch() {
        // TODO delegate to mmr.hasExactCandidateMatch()
        if (mentionMappingResult == null || mentionMappingResult.tax2originalCandidates == null)
            return false;
        return mentionMappingResult.tax2originalCandidates.values().stream()
                .flatMap(Collection::stream)
                .anyMatch(SynHit::isExactMatch);
    }

    /**
     * @return True if, for at least one taxonomy ID, the first original candidate is not an exact match; false
     * otherwise, including when no mapping result or no original candidates are available.
     */
    public boolean hasApproximateCandidateMatch() {
        // the candidate map may be null, as hasExactCandidateMatch() also assumes
        if (mentionMappingResult == null || mentionMappingResult.tax2originalCandidates == null)
            return false;
        return mentionMappingResult.tax2originalCandidates.values().stream()
                .filter(Predicate.not(List::isEmpty))
                .map(list -> list.get(0))
                .anyMatch(Predicate.not(SynHit::isExactMatch));
    }

    /**
     * @return True if there is at least one original candidate and no taxonomy ID has an exact match as its
     * first original candidate; false otherwise, including when no mapping result or no original candidates are
     * available.
     */
    public boolean hasOnlyApproximateCandidateMatches() {
        // the candidate map may be null, as hasExactCandidateMatch() also assumes
        if (mentionMappingResult == null || mentionMappingResult.tax2originalCandidates == null)
            return false;
        // first candidate of every non-empty candidate list; empty exactly if there is no candidate at all
        List<SynHit> topCandidates = mentionMappingResult.tax2originalCandidates.values().stream()
                .filter(Predicate.not(List::isEmpty))
                .map(list -> list.get(0))
                .collect(Collectors.toList());
        return !topCandidates.isEmpty() && topCandidates.stream().noneMatch(SynHit::isExactMatch);
    }

    /**
     * Returns the synonym of the first original candidate of the first taxonomy ID (in map iteration order) that
     * has candidates. A non-null result is cached for later calls.
     *
     * @return The best candidate synonym; null if there is no mapping result or no original candidate.
     */
    public String getBestCandidateSynonym() {
        if (bestCandidateSynonym == null && mentionMappingResult != null && mentionMappingResult.tax2originalCandidates != null) {
            bestCandidateSynonym = mentionMappingResult.tax2originalCandidates.values().stream()
                    .filter(Predicate.not(Collection::isEmpty))
                    .map(list -> list.get(0))
                    .map(SynHit::getSynonym)
                    .findFirst()
                    // no candidates at all: report "no synonym" instead of throwing NoSuchElementException
                    .orElse(null);
        }
        return bestCandidateSynonym;
    }

    /**
     * @return The result of {@link #getAllBestCandidateSynonyms(Set)} without a taxonomy filter.
     */
    public Set getAllBestCandidateSynonyms() {
        return this.getAllBestCandidateSynonyms(null);
    }

    /**
     * Returns the synonyms of the top-scored original candidates of the first taxonomy ID (in map iteration
     * order) that passes the filter and has candidates. If the first candidate is an exact match, the only
     * candidate or scored strictly higher than the second one, only its synonym is returned. Otherwise the
     * synonyms of the leading candidates whose lexical score is within 0.0001 of the first candidate's score are
     * returned.
     *
     * @param filterTax Taxonomy IDs to restrict the search to; null or empty means no restriction.
     * @return The best candidate synonyms; an empty set if there is no mapping result or no matching candidates.
     */
    public Set getAllBestCandidateSynonyms(Set filterTax) {
        if (mentionMappingResult == null || mentionMappingResult.tax2originalCandidates == null)
            return Collections.emptySet();
        Map<String, List<SynHit>> tax2originalCandidates = mentionMappingResult.tax2originalCandidates;
        boolean filterActive = filterTax != null && !filterTax.isEmpty();
        for (String taxId : tax2originalCandidates.keySet()) {
            if (filterActive && !filterTax.contains(taxId))
                continue;
            List<SynHit> candidates = tax2originalCandidates.get(taxId);
            if (candidates == null || candidates.isEmpty())
                continue;
            SynHit best = candidates.get(0);
            if (best.isExactMatch() || candidates.size() == 1 || best.getLexicalScore() > candidates.get(1).getLexicalScore())
                return Set.of(best.getSynonym());
            Set<String> bestSynonyms = new HashSet<>();
            double bestScore = best.getLexicalScore();
            for (SynHit candidate : candidates) {
                // Compare the absolute difference: with the signed difference every lower-scored candidate
                // would count as tied with the best one.
                if (Math.abs(candidate.getLexicalScore() - bestScore) >= 0.0001)
                    break;
                bestSynonyms.add(candidate.getSynonym());
            }
            return bestSynonyms;
        }
        return Collections.emptySet();
    }

    /**
     * @param occurrenceType The occurrence type to look for.
     * @return Any taxonomy ID for which the given occurrence type has been recorded; empty if there is none or
     * no occurrences are available.
     */
    public Optional getTaxonomyCandidateWithOccurrence(GeneSpeciesOccurrence occurrenceType) {
        if (taxonomyOcurrences == null) {
            return Optional.empty();
        }
        return taxonomyOcurrences.keySet().stream().filter(taxId -> taxonomyOcurrences.get(taxId).contains(occurrenceType)).findAny();
    }

    /**
     * Checks whether the taxonomy IDs of this mention share at least one element with the gold taxonomy IDs.
     * If the gene document reports that the gold data has offsets, the gold taxonomy IDs of the overlapping
     * gold mentions are used; otherwise the gold taxonomy IDs of the gene document. Requires the gene document
     * to be set.
     *
     * @return True if at least one taxonomy ID of this mention is a gold taxonomy ID.
     */
    public boolean hasCorrectTaxonomyId() {
        if (!geneDocument.isGoldHasOffsets()) {
            return !Sets.intersection(geneDocument.getGoldTaxonomyIds(), getTaxonomyIdsSet()).isEmpty();
        }
        return !Sets.intersection(getAllGoldTaxonomyIdsAsSet(), getTaxonomyIdsSet()).isEmpty();
    }

    /**
     * Compares the size of the gold gene set for {@code goldId} with the size of the gene set of this mention
     * that is associated with {@code goldId}.
     * <p>
     * The gold size is the number of IDs of all gold mentions in the document that contain {@code goldId},
     * counting each distinct gold offset range once. The gene set size is the size of any gene set of this
     * mention for which an arbitrary member has {@code goldId} among its gold IDs, or 0 if there is none.
     *
     * @param goldId The gold gene ID.
     * @return {@code CANT_FIND} if this mention has no gold mentions, {@code CORRECT_ID} if both sizes are
     * equal, {@code WRONG_ID} otherwise.
     */
    public GeneDocument.MentionCorrectness getGenesetCorrectnessLevel(String goldId) {
        if (!hasGoldMentions())
            return GeneDocument.MentionCorrectness.CANT_FIND;
        // used to count each gold offset range only once
        Set<Range<Integer>> seenOffsets = new HashSet<>();
        int goldGenesetSize = (int) geneDocument.getGenes()
                .filter(GeneMention::hasGoldMentions)
                .map(GeneMention::getOverlappingGoldMentions)
                .flatMap(Collection::stream)
                .filter(goldGm -> goldGm.getIds().contains(goldId))
                .filter(goldGm -> seenOffsets.add(goldGm.getOffsets()))
                .map(GeneMention::getIds)
                .flatMap(Collection::stream)
                .count();
        // geneSets is created lazily by addGeneSet(); without gene sets the size is 0
        int genesetSize = geneSets == null ? 0 : geneSets.stream()
                .filter(gs -> gs.stream().findAny().map(gm -> gm.getAllGoldIdAsSet().contains(goldId)).orElse(false))
                .findAny()
                .map(GeneSet::size)
                .orElse(0);
        if (goldGenesetSize == genesetSize)
            return GeneDocument.MentionCorrectness.CORRECT_ID;
        return GeneDocument.MentionCorrectness.WRONG_ID;
    }


    /**
     * Checks whether this gene mention has a strong candidate for the given taxonomy ID.
     *
     * @param taxId The taxonomy ID to check.
     * @return Whether or not there is a SynHit that is marked as anchor for the passed taxonomy ID.
     */
    public boolean isAnchor(String taxId) {
        // Without a mapping result there is no candidate that could be an anchor.
        if (mentionMappingResult == null) {
            return false;
        }
        final SynHit resultCandidate = mentionMappingResult.getResultCandidate(taxId);
        return resultCandidate != null && resultCandidate.isAnchor();
    }

    /**
     * Checks whether the original candidate list for the given taxonomy ID contains a family SynHit
     * within the first n candidates, inclusive.
     *
     * @param n     The maximum rank to search for family hits, starting at 1.
     * @param taxId The taxonomy ID whose original candidate list is searched.
     * @return True if a family hit was found within the first n ranks, false otherwise.
     */
    public boolean hasFamilyCandidateWithinRank(int n, String taxId) {
        if (mentionMappingResult == null || mentionMappingResult.tax2originalCandidates == null) {
            return false;
        }
        List<SynHit> synHits = mentionMappingResult.tax2originalCandidates.get(taxId);
        // The map has no entry for taxonomy IDs without candidates; there cannot be a family hit then.
        if (synHits == null) {
            return false;
        }
        // Never look past the end of the list; a non-positive n results in no iteration at all.
        int maxRank = Math.min(synHits.size(), n);
        for (int rank = 0; rank < maxRank; rank++) {
            if (synHits.get(rank).isFamilyName()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Checks whether the top-ranked original candidate for the given taxonomy ID is an exact match.
     *
     * @param taxId The taxonomy ID whose original candidate list is inspected.
     * @return True if the first original candidate for {@code taxId} is an exact match; false if there is no
     * mapping result, no candidate list for {@code taxId} or if that list is empty.
     */
    public boolean hasExactMatchInTax(String taxId) {
        if (mentionMappingResult == null || mentionMappingResult.tax2originalCandidates == null) {
            return false;
        }
        List<SynHit> synHits = mentionMappingResult.tax2originalCandidates.get(taxId);
        // A missing candidate list is checked for explicitly instead of catching the resulting
        // NullPointerException and printing its stack trace.
        return synHits != null && !synHits.isEmpty() && synHits.get(0).isExactMatch();
    }

    /**
     * @return The instance list currently stored for this gene mention, possibly {@code null} if none was set.
     */
    public InstanceList getInstances() {
        return this.instances;
    }

    /**
     * Stores the given instance list for this gene mention, replacing any previously set list.
     *
     * @param instances The instance list to store.
     */
    public void setInstances(InstanceList instances) {
        this.instances = instances;
    }

    /**
     * @return True if {@code familyNames} is set and at least one of its hits is an exact match.
     */
    public boolean isExactFamilyNameMatch() {
        if (familyNames == null) {
            return false;
        }
        for (SynHit familyNameHit : familyNames) {
            if (familyNameHit.isExactMatch()) {
                return true;
            }
        }
        return false;
    }

    /**
     * @return The lexical score of the first hit in {@code familyNames}, or 0 if there are no family name hits.
     */
    public double getFamilyNameMatchScore() {
        if (familyNames == null || familyNames.isEmpty()) {
            return 0d;
        }
        return familyNames.get(0).getLexicalScore();
    }

    /**
     * @return True if {@link #getAmbiguityTypes()} reports at least one ambiguity type for this mention.
     */
    public boolean isAmbiguous() {
        final boolean unambiguous = getAmbiguityTypes().isEmpty();
        return !unambiguous;
    }

    /**
     * Determines the kinds of ambiguity of this mention from the original candidate lists of the mapping result.
     * <ul>
     * <li>{@link AmbiguityType#LEXICAL}: for some taxonomy ID, the first two candidates are both exact matches.</li>
     * <li>{@link AmbiguityType#INTRASPECIES}: the top candidate is an exact match for more than one taxonomy ID.</li>
     * </ul>
     *
     * @return A new, modifiable set of the detected ambiguity types. It is empty if there is no mapping result,
     * no original candidates or no ambiguity.
     */
    public Set<AmbiguityType> getAmbiguityTypes() {
        Set<AmbiguityType> ambiguityTypes = new HashSet<>();
        if (mentionMappingResult == null || mentionMappingResult.tax2originalCandidates == null) {
            return ambiguityTypes;
        }
        boolean exactInOneSpecies = false;
        for (List<SynHit> candidates4tax : mentionMappingResult.tax2originalCandidates.values()) {
            // Both ambiguity types require the top candidate to be an exact match.
            if (candidates4tax == null || candidates4tax.isEmpty() || !candidates4tax.get(0).isExactMatch()) {
                continue;
            }
            // Ambiguous 1: Multiple exact matches
            if (candidates4tax.size() > 1 && candidates4tax.get(1).isExactMatch()) {
                ambiguityTypes.add(AmbiguityType.LEXICAL);
            }
            // Ambiguous 2: An exact top match was already found for another taxonomy ID.
            // NOTE(review): the constant is named INTRASPECIES although the condition spans multiple
            // taxonomy IDs - confirm the intended naming.
            if (exactInOneSpecies) {
                ambiguityTypes.add(AmbiguityType.INTRASPECIES);
            }
            exactInOneSpecies = true;
        }
        return ambiguityTypes;
    }

    /**
     * Returns the whitespace-separated tokens of the normalized gene name text and of the normalized texts
     * of all alternatives of the gene name.
     * <p>
     * The set is computed on first access and cached in {@code nameTokenSet} afterwards.
     *
     * @return The set of normalized name tokens.
     */
    public Set<String> getNameTokenSet() {
        if (nameTokenSet == null) {
            Function<GeneName, String[]> tokenize = gn -> normalizer.normalize(gn.getText()).split("\\s+");
            // Tokens are collected directly into one set rather than chaining Stream.concat() per alternative.
            Set<String> tokens = new HashSet<>();
            Collections.addAll(tokens, tokenize.apply(geneName));
            for (GeneName alt : geneName.getAlternatives()) {
                Collections.addAll(tokens, tokenize.apply(alt));
            }
            // Assigned only after all tokens were gathered so that a failure does not leave a partial cache.
            nameTokenSet = tokens;
        }
        return nameTokenSet;
    }

    /**
     * @return The EC number as reported by the gene name of this mention.
     */
    public String getEcNumber() {
        final String ecNumber = geneName.getEcNumber();
        return ecNumber;
    }

    /**
     * @return The composite resolver value stored for this mention, possibly {@code null} if none was set.
     */
    public String getCompositeResolver() {
        return this.compositeResolver;
    }

    /**
     * Stores the composite resolver value for this mention, replacing any previous value.
     *
     * @param compositeResolver The value to store.
     */
    public void setCompositeResolver(String compositeResolver) {
        this.compositeResolver = compositeResolver;
    }

    /**
     * Streams the gene names of all other genes of the gene document this mention belongs to.
     * <p>
     * This mention itself is excluded by identity comparison.
     *
     * @return The gene names of the other genes, or an empty stream if no gene document is set.
     */
    public Stream getContextGeneNames() {
        if (geneDocument != null) {
            return geneDocument.getGenes()
                    .filter(other -> other != this)
                    .map(GeneMention::getGeneName);
        }
        return Stream.empty();
    }

    /**
     * @return The gene sets stored for this mention, possibly {@code null} if none were set.
     */
    public GeneSets getGeneSets() {
        return this.geneSets;
    }

    /**
     * Removes the given gene set from the gene sets of this mention.
     * <p>
     * Does nothing if no gene sets are stored, in line with {@link #clearGeneSets()}.
     *
     * @param geneSet The gene set to remove.
     */
    public void removeGeneSet(GeneSet geneSet) {
        if (geneSets != null) {
            geneSets.remove(geneSet);
        }
    }

    /**
     * Removes all gene sets of this mention. Does nothing if no gene sets are stored.
     */
    public void clearGeneSets() {
        if (geneSets == null) {
            return;
        }
        geneSets.clear();
    }

    /**
     * Rejects this mention for every taxonomy ID known for it.
     * <p>
     * The taxonomy IDs are the union of {@code getTaxonomyIds()} and the keys of the original and the
     * lexically re-ranked candidate maps of the mapping result, as far as those exist.
     *
     * @param reason The reason to record for each rejected taxonomy ID.
     */
    public void reject(MentionMappingResult.RejectReason reason) {
        // In some cases of overlap and peeking into gold tax IDs it can happen
        // that getTaxonomyIds() does not return values that are set to tax2originalCandidates
        // or tax2lexicallyRerankedCandidates
        // The IDs are copied into a set of their own because reject(String, RejectReason) writes to the
        // candidate maps whose keys are read here.
        Set<String> taxIds = new HashSet<>(getTaxonomyIds());
        if (mentionMappingResult != null) {
            if (mentionMappingResult.tax2originalCandidates != null) {
                taxIds.addAll(mentionMappingResult.tax2originalCandidates.keySet());
            }
            if (mentionMappingResult.tax2lexicallyRerankedCandidates != null) {
                taxIds.addAll(mentionMappingResult.tax2lexicallyRerankedCandidates.keySet());
            }
        }
        for (String tax : taxIds) {
            reject(tax, reason);
        }
    }

    /**
     * Rejects this mention for a single taxonomy ID.
     * <p>
     * A mapping result is created if none exists yet. The lexically re-ranked and the final ranked candidates
     * for {@code tax} are each replaced by a list that only holds {@link MentionMappingResult#REJECTION}, and
     * the reject reason is set on the mapping result.
     *
     * @param tax    The taxonomy ID to reject this mention for.
     * @param reason The reason for the rejection.
     */
    public void reject(String tax, MentionMappingResult.RejectReason reason) {
        if (mentionMappingResult == null) {
            mentionMappingResult = new MentionMappingResult(this);
        }
        final MentionMappingResult result = mentionMappingResult;

        // Make sure both candidate maps exist before writing the rejection marker to them.
        if (result.tax2lexicallyRerankedCandidates == null) {
            result.tax2lexicallyRerankedCandidates = new HashMap<>();
        }
        if (result.tax2finalRankedCandidates == null) {
            result.tax2finalRankedCandidates = new HashMap<>();
        }

        result.tax2lexicallyRerankedCandidates.put(tax, List.of(MentionMappingResult.REJECTION));
        result.tax2finalRankedCandidates.put(tax, List.of(MentionMappingResult.REJECTION));
        result.setRejectReason(tax, reason);
    }

    /**
     * Identifiers of gene taggers.
     * <p>
     * NOTE(review): presumably names the component that produced a gene mention - confirm against usages.
     * The declaration order is kept unchanged because the ordinals of the constants depend on it.
     */
    public enum GeneTagger {
        JNET,
        BANNER,
        FLAIR,
        FLAIR_JPG_COLLAPSED_VAR,
        FLAIR_JPG_COLLAPSED_VARCOMPENUM,
        FLAIR_BC2TRAINTEST,
        FLAIR_GNORMPLUSNLMIAT,
        GOLD,
        FLAIR_JPG_NOBC2TEST_NOTEST,
        FLAIR_JPG_NOBC2TEST_NOTEST_COLLAPSED_VAR,
        FLAIR_JPG_GNP_ENTITIES,
        CONSISTENCY_TAGGER,
        EXPANSION_TAGGER,
        GNORM_PLUS,
        UNKNOWN,
        GAZETTEER
    }

    /**
     * Specific types a mention can be classified as.
     * <p>
     * NOTE(review): the exact meaning of the individual constants is not visible here - confirm against usages.
     * The declaration order is kept unchanged because the ordinals of the constants depend on it.
     */
    public enum SpecificType {
        GENE,
        FAMILYNAME,
        DOMAINMOTIF,
        GENE_ENUM,
        NO_GENE,
        GROUP,
        COMPLEX,
        UNKNOWN
    }

    /**
     * The kinds of ambiguity reported by {@code getAmbiguityTypes()}.
     */
    public enum AmbiguityType {
        INTRASPECIES,
        LEXICAL
    }
}