// de.julielab.geneexpbase.candidateretrieval.SynHit (Maven / Gradle / Ivy)
//
// Go to download
package de.julielab.geneexpbase.candidateretrieval;

import de.julielab.geneexpbase.GeneExpRuntimeException;
import de.julielab.geneexpbase.genemodel.GeneName;

import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;

/**
 * A candidate hit for a synonym: the synonym string, the gene IDs and taxonomy IDs known for it, and several
 * scores.
 * <p>
 * {@link #ids}, {@link #taxIds} and {@link #synonymPriorities} are treated as parallel lists by the methods of this
 * class: the elements at the same index describe the same entry.
 * </p>
 * <p>
 * {@link #compareTo(SynHit)}, {@link #equals(Object)} and {@link #hashCode()} all depend on the currently set
 * {@link CompareType}; the default is {@link CompareType#ID}.
 * </p>
 */
public class SynHit implements Comparable<SynHit>, Cloneable, Serializable {
    public static final String TYPE_GEPRO = "Gene/Protein";
    public static final String TYPE_GROUP = "Group";
    /*
     * this is a random id used for sorting (see CompareType.RANDOM); it is not assigned anywhere in this class
     */
    int random;
    private String synonym;

    private float luceneScore;
    private double lexicalScore;
    private double contextualScore;
    private double overallScore;
    private float relevanceScore;
    private boolean isExactMatch;
    /**
     * Species mention scores keyed by taxonomy ID.
     */
    private Map<String, Double> speciesMentionScores = new HashMap<>();
    /**
     * All known gene IDs for this synonym.
     */
    private List<String> ids;
    /**
     * Lazily built set view of {@link #ids}; reset to null whenever {@link #setIds(List)} is called.
     */
    private Set<String> idsSet;
    /**
     * Gene IDs associated with the assigned taxonomy ID. Note that this field is not assigned anywhere in this
     * class.
     *
     * @deprecated we use gene records instead where the ID is unique per record
     */
    @Deprecated
    private String[] taxonomySpecificIds;
    private String id;
    private String source;
    private String mappedMention; // the mention found in text and searched for
    // the compare type selects the field used by compareTo(...), equals(...) and hashCode()
    private CompareType compareType = CompareType.ID;
    private String entityType;
    /**
     * All known tax IDs for this synonym, parallel to {@link #ids}.
     */
    private List<String> taxIds;
    /**
     * Lazily built set view of {@link #taxIds}; reset to null whenever {@link #setTaxIds(List)} is called.
     */
    private Set<String> taxIdsSet;
    private String taxId;
    private GeneName mappedGeneName;
    /**
     * Synonym priorities, parallel to {@link #ids}.
     */
    private List<Number> synonymPriorities;
    private boolean anchor;
    private String comment;

    /**
     * Creates a hit for a single gene ID. Taxonomy IDs and synonym priorities are left unset.
     *
     * @param syn    The synonym.
     * @param score  The lexical score.
     * @param id     The single gene ID of this hit; it also becomes the only element of the ID list.
     * @param source The source of the hit.
     */
    public SynHit(String syn, double score, String id, String source) {
        this.synonym = syn;
        this.lexicalScore = score;
        this.ids = Arrays.asList(id);
        this.id = id;
        this.source = source;
    }

    /**
     * Creates a hit with a list of candidate gene IDs. The single {@link #getId() id} and {@link #getTaxId() taxId}
     * are not set by this constructor.
     *
     * @param synonym    The synonym.
     * @param score      The lexical score.
     * @param ids        The candidate gene IDs.
     * @param source     The source of the hit.
     * @param entityType The entity type, e.g. {@link #TYPE_GEPRO} or {@link #TYPE_GROUP}.
     * @param taxIds     The taxonomy IDs, parallel to <code>ids</code>.
     */
    public SynHit(String synonym, double score, List<String> ids, String source, String entityType, List<String> taxIds) {
        this.synonym = synonym;
        this.lexicalScore = score;
        this.ids = ids;
        this.source = source;
        this.entityType = entityType;
        this.taxIds = taxIds;
    }

    /**
     * Returns a comparator that sorts SynHits first according to their equality to the gene name and then by score. This helps to overcome the issue that Lucene's float scores sometimes
     * fail to put an exact equal match to the top of the results.
     *
     * @param geneName A gene name to compare to.
     * @return A comparator that can be used to sort lists of SynHits: exact matches first, then descending Lucene score.
     */
    public static Comparator<SynHit> getNormalizedExactMatchThenLuceneScoreComparator(final String geneName) {
        // Objects.equals tolerates hits without a synonym
        Comparator<SynHit> comparator = Comparator.comparing((SynHit sh) -> Objects.equals(sh.getSynonym(), geneName));
        // reversed() applies to the whole chain: 'true' before 'false', higher scores before lower ones
        comparator = comparator.thenComparingDouble(sh -> sh.getLuceneScore()).reversed();
        return comparator;
    }

    public float getRelevanceScore() {
        return relevanceScore;
    }

    public void setRelevanceScore(float relevanceScore) {
        this.relevanceScore = relevanceScore;
    }

    public String getEntityType() {
        return entityType;
    }

    /**
     * @return The internal, mutable map from taxonomy ID to species mention score.
     */
    public Map<String, Double> getSpeciesMentionScores() {
        return speciesMentionScores;
    }

    /**
     * @param taxId A taxonomy ID.
     * @return The species mention score stored for <code>taxId</code> or null if there is none.
     */
    public Double getSpeciesMentionScore(String taxId) {
        return speciesMentionScores.get(taxId);
    }

    public void setSpeciesMentionScore(String taxId, double speciesMentionScore) {
        speciesMentionScores.put(taxId, speciesMentionScore);
    }

    /**
     * Selects the gene ID whose taxonomy ID equals the given one as the single ID of this hit. If several entries
     * carry the taxonomy ID, the last one wins. The ID lists themselves are not changed.
     * <p>
     * The previously set ID and taxonomy ID are cleared before the search, so they stay cleared when the
     * exception is thrown.
     * </p>
     *
     * @param taxId The taxonomy ID to restrict this hit to.
     * @throws IllegalArgumentException If no gene ID is associated with <code>taxId</code>.
     */
    public void restrictToTaxId(String taxId) {
        this.id = null;
        this.taxId = null;
        // hits created without taxonomy IDs cannot match anything
        if (taxIds != null && taxId != null) {
            for (int i = 0; i < ids.size(); i++) {
                if (taxId.equals(taxIds.get(i))) {
                    this.id = ids.get(i);
                    this.taxId = taxId;
                }
            }
        }
        if (this.id == null) {
            throw new IllegalArgumentException("This SynHit does not contain taxonomy ID " + taxId + ": " + this);
        }
    }

    /**
     * @return The lexical score.
     */
    public double getLexicalScore() {
        return lexicalScore;
    }

    public void setLexicalScore(double score) {
        this.lexicalScore = score;
    }

    public double getContextualScore() {
        return this.contextualScore;
    }

    public void setContextualScore(double score) {
        this.contextualScore = score;
    }

    public String getSynonym() {
        return synonym;
    }

    public void setSynonym(String syn) {
        this.synonym = syn;
    }

    @Override
    public String toString() {
        String result = "syn=" + synonym + "\tid=" + getId() + "\tscore=" + lexicalScore + "\tsemScore="
                + contextualScore + "\tid=" + id + "\ttaxId="
                + taxId;
        return result;
    }

    /**
     * Compares two SynHits by the field selected through {@link #setCompareType(CompareType)}. The order is
     * descending, i.e. the hit with the larger value comes first. Hits with the same value compare as equal.
     *
     * @param o The hit to compare to.
     * @return A negative number, zero or a positive number if this hit sorts before, equal to or after <code>o</code>.
     * @throws IllegalStateException If the two hits use different compare types or if the compare type is
     *                               {@link CompareType#ID} and one of the IDs is null.
     */
    @Override
    public int compareTo(SynHit o) {
        int c = 0;
        if (this.compareType != o.compareType) {
            throw new IllegalStateException(
                    "Two SynHits are compared that don't use the same comparison type: " + this + ", " + o);
        }
        switch (this.compareType) {
            case RANDOM:
                c = Integer.compare(o.random, this.random);
                break;
            case SCORE:
                c = Double.compare(o.lexicalScore, this.lexicalScore);
                break;
            case SEMSCORE:
                c = Double.compare(o.contextualScore, this.contextualScore);
                break;
            case ID:
                if (o.id == null || id == null) {
                    throw new IllegalStateException("Trying to compare SynHits by ID where the id is null.");
                }
                c = o.id.compareTo(id);
                break;
        }
        return c;
    }

    /**
     * Two SynHits are equal if they use the same {@link CompareType} and agree on the field selected by it.
     *
     * @throws IllegalStateException If the compare type is {@link CompareType#ID} and one of the IDs is null.
     */
    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        if (other == null || getClass() != other.getClass()) {
            return false;
        }
        SynHit o = (SynHit) other;
        // without this check a.equals(b) and b.equals(a) could disagree
        if (this.compareType != o.compareType) {
            return false;
        }
        switch (this.compareType) {
            case RANDOM:
                return o.random == this.random;
            case SCORE:
                // Double.compare agrees with Double.hashCode and compareTo for 0.0/-0.0 and NaN
                return Double.compare(o.lexicalScore, this.lexicalScore) == 0;
            case SEMSCORE:
                return Double.compare(o.contextualScore, this.contextualScore) == 0;
            case ID:
                if (o.id == null || id == null) {
                    throw new IllegalStateException("Trying to compare SynHits by ID where the id is null.");
                }
                return o.id.equals(id);
            default:
                throw new IllegalStateException("Unexpected value: " + this.compareType);
        }
    }

    /**
     * Hashes the field selected by the current {@link CompareType}, matching {@link #equals(Object)}.
     */
    @Override
    public int hashCode() {
        switch (this.compareType) {
            case RANDOM:
                return Integer.hashCode(this.random);
            case SCORE:
                return Double.hashCode(this.lexicalScore);
            case SEMSCORE:
                return Double.hashCode(this.contextualScore);
            case ID:
                return Objects.hashCode(this.id);
            default:
                throw new IllegalStateException("Unexpected value: " + this.compareType);
        }
    }

    public String getSource() {
        return source;
    }

    public void setSource(String source) {
        this.source = source;
    }

    public CompareType getCompareType() {
        return compareType;
    }

    public void setCompareType(CompareType type) {
        this.compareType = type;
    }

    /**
     * The potentially normalized and/or transformed original entity text
     * mention for which this candidate has been retrieved.
     *
     * @return The string-normalized entity name that this candidate was matched
     * to.
     */
    public String getMappedMention() {
        return mappedMention;
    }

    public void setMappedMention(String mappedSynonym) {
        this.mappedMention = mappedSynonym;
    }

    public boolean isExactMatch() {
        return isExactMatch;
    }

    public void setExactMatch(boolean exactMatch) {
        isExactMatch = exactMatch;
    }

    /**
     * Creates a copy of this hit. The species mention score map, the ID, taxonomy ID and priority lists and the
     * deprecated taxonomy specific ID array are copied; all other fields are copied by reference or value.
     *
     * @return The copy.
     */
    @Override
    public SynHit clone() {
        try {
            SynHit h = (SynHit) super.clone();
            h.speciesMentionScores = new HashMap<>(speciesMentionScores);
            if (ids != null) {
                h.ids = new ArrayList<>(ids);
            }
            if (taxIds != null) {
                h.taxIds = new ArrayList<>(taxIds);
            }
            if (synonymPriorities != null) {
                h.synonymPriorities = new ArrayList<>(synonymPriorities);
            }
            if (taxonomySpecificIds != null) {
                h.taxonomySpecificIds = taxonomySpecificIds.clone();
            }
            // the caches are rebuilt on demand; do not share the mutable sets with the original
            h.idsSet = null;
            h.taxIdsSet = null;
            return h;
        } catch (CloneNotSupportedException e) {
            throw new GeneExpRuntimeException(e);
        }
    }

    public List<String> getIds() {
        return ids;
    }

    /**
     * Replaces the list of candidate gene IDs. A single-element list also sets the single ID of this hit, a list
     * with more than one element clears it and an empty list leaves it untouched.
     *
     * @param ids The new gene IDs; must not be null.
     */
    public void setIds(List<String> ids) {
        this.ids = ids;
        if (this.ids.size() == 1) {
            this.id = ids.get(0);
        } else if (this.ids.size() > 1) {
            this.id = null;
        }
        idsSet = null;
    }

    /**
     * @return A set of the candidate gene IDs, built on first access after the last call to {@link #setIds(List)}.
     */
    public Set<String> getIdsSet() {
        if (idsSet == null) {
            idsSet = new HashSet<>(ids);
        }
        return idsSet;
    }

    /**
     * Returns true if a single gene or protein ID of this synonym has been determined.
     *
     * @return true if the single ID has been set or if there is at most one candidate ID, false otherwise.
     */
    public boolean isDisambiguated() {
        return id != null || ids.size() <= 1;
    }

    /**
     * Returns true if there is more than one gene ID associated with this synonym.
     *
     * @return Whether there are multiple gene IDs for this synonym.
     */
    public boolean isAmbiguousInGeneral() {
        return ids.size() > 1;
    }

    /**
     * Returns true if at least one taxonomy ID associated with this synonym appears multiple times.
     *
     * @return Whether this synonym exists for multiple different genes of the same species; false if there are no
     * taxonomy IDs.
     */
    public boolean isIntraSpeciesAmbiguousInGeneral() {
        if (taxIds == null) {
            return false;
        }
        Set<String> seenTaxIds = new HashSet<>();
        for (String currentTaxId : taxIds) {
            // add() returns false for an element that is already contained
            if (!seenTaxIds.add(currentTaxId)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns true if there are at least two distinct taxonomy IDs associated with this synonym.
     *
     * @return Whether there are multiple different species that have a gene with this synonym.
     */
    public boolean isInterSpeciesAmbiguousInGeneral() {
        if (taxIds == null || taxIds.size() <= 1) {
            return false;
        }
        String firstTaxId = taxIds.get(0);
        for (int i = 1; i < taxIds.size(); i++) {
            // any element differing from the first one proves a second distinct taxonomy ID
            if (!Objects.equals(firstTaxId, taxIds.get(i))) {
                return true;
            }
        }
        return false;
    }

    public String getId() {
        return id;
    }

    /**
     * Sets the single gene ID of this hit. If the ID, taxonomy ID and priority lists are all present, have the
     * same size and contain the given ID, they are reduced to the one entry belonging to it. Otherwise only the ID
     * itself is set.
     *
     * @param id The gene ID to set.
     */
    public void setId(String id) {
        this.id = id;
        boolean listsAreParallel = ids != null && taxIds != null && synonymPriorities != null
                && ids.size() == taxIds.size() && ids.size() == synonymPriorities.size();
        if (id == null || !listsAreParallel) {
            return;
        }
        int idIndex = ids.indexOf(id);
        if (idIndex < 0) {
            // the ID is not among the candidates, there is no entry to narrow the lists down to
            return;
        }
        // fetch all three values before the setters replace the lists
        String selectedId = ids.get(idIndex);
        String selectedTaxId = taxIds.get(idIndex);
        Number selectedPriority = synonymPriorities.get(idIndex);
        setIds(List.of(selectedId));
        setTaxIds(List.of(selectedTaxId));
        setSynonymPriorities(List.of(selectedPriority));
    }

    /**
     * Returns the taxonomy ID stored at the list position of the given gene ID.
     *
     * @param id A gene ID of this hit.
     * @return The taxonomy ID belonging to the gene ID.
     * @throws IllegalArgumentException If <code>id</code> is not one of the gene IDs of this hit.
     */
    public String getTaxIdForGeneId(String id) {
        int i = this.ids.indexOf(id);
        if (i < 0) {
            throw new IllegalArgumentException("This SynHit does not contain gene ID " + id + ": " + this);
        }
        return taxIds.get(i);
    }

    public List<String> getTaxIds() {
        return taxIds;
    }

    /**
     * Replaces the list of taxonomy IDs. A single-element list also sets the single taxonomy ID of this hit, a
     * list with more than one element clears it.
     *
     * @param taxIds The new taxonomy IDs; must not be null or empty.
     */
    public void setTaxIds(List<String> taxIds) {
        assert taxIds != null && !taxIds.isEmpty() : "Trying to set an empty taxonomy ID list to a SynHit.";
        this.taxIds = taxIds;
        if (this.taxIds.size() == 1) {
            this.taxId = this.taxIds.get(0);
        } else if (this.taxIds.size() > 1) {
            this.taxId = null;
        }
        // invalidate the cache, analogous to setIds()
        taxIdsSet = null;
    }

    public double getOverallScore() {
        return overallScore;
    }

    public void setOverallScore(double overallScore) {
        this.overallScore = overallScore;
    }

    public GeneName getMappedGeneName() {
        return mappedGeneName;
    }

    public void setMappedGeneName(GeneName mappedGeneName) {
        this.mappedGeneName = mappedGeneName;
    }

    /**
     * Returns the single accepted taxonomy ID for this synonym or null if not set.
     * The taxonomy ID is set by {@link #setTaxId(String)}, {@link #setTaxIds(List)} with a single element or
     * {@link #restrictToTaxId(String)}.
     *
     * @return The taxonomy ID associated with this synonym or null if it wasn't set.
     * @see #setTaxId(String)
     * @see #getTaxIds()
     */
    public String getTaxId() {
        return taxId;
    }

    /**
     * Accepts the passed taxonomy ID as assigned to this synonym. Only the single taxonomy ID is stored; the
     * value is not validated against {@link #getTaxIds()} and neither the gene IDs nor
     * {@link #getTaxonomySpecificIds()} are changed. Use {@link #restrictToTaxId(String)} to also select the
     * matching gene ID.
     *
     * @param taxId The taxonomy ID to assign this synonym.
     * @see #restrictToTaxId(String)
     * @see #getId()
     * @see #isDisambiguated()
     */
    public void setTaxId(String taxId) {
        this.taxId = taxId;
    }

    /**
     * @return Synonym centric index: The gene IDs associated with the taxonomy ID fixed for this SynHit
     * @deprecated We will be using the gene record index so this method won't be used in the future.
     */
    @Deprecated
    public String[] getTaxonomySpecificIds() {
        return taxonomySpecificIds;
    }

    /**
     * @param idArray Gene IDs.
     * @return The synonym priorities of those entries whose gene ID is contained in <code>idArray</code>.
     */
    public List<Number> getPrioritiesOfIds(String[] idArray) {
        return getPrioritiesOfIds(Stream.of(idArray));
    }

    /**
     * @param idStream Gene IDs.
     * @return The synonym priorities of those entries whose gene ID is contained in <code>idStream</code>, in list order.
     */
    public List<Number> getPrioritiesOfIds(Stream<String> idStream) {
        final Set<String> idSet = idStream.collect(Collectors.toSet());
        return IntStream.range(0, ids.size())
                .filter(i -> idSet.contains(ids.get(i)))
                .mapToObj(synonymPriorities::get)
                .collect(Collectors.toList());
    }

    /**
     * @param taxId A taxonomy ID.
     * @return Whether the taxonomy ID is among the taxonomy IDs of this hit; false if there are none.
     */
    public boolean hasTaxId(String taxId) {
        return taxIds != null && taxIds.contains(taxId);
    }

    public List<Number> getSynonymPriorities() {
        return synonymPriorities;
    }

    public void setSynonymPriorities(List<Number> synonymPriorities) {
        this.synonymPriorities = synonymPriorities;
    }

    /**
     * @return The priority of the first entry. The priority list must be set and non-empty.
     */
    public int getSynonymPriority() {
        return synonymPriorities.get(0).intValue();
    }

    /**
     * @param taxId A taxonomy ID.
     * @return The gene IDs of all entries with the given taxonomy ID; an empty stream if there are no taxonomy IDs.
     */
    public Stream<String> getGeneIdsOfTaxId(String taxId) {
        if (taxIds == null) {
            return Stream.empty();
        }
        return IntStream.range(0, taxIds.size()).filter(i -> taxIds.get(i).equals(taxId)).mapToObj(ids::get);
    }

    /**
     * @return A set of the taxonomy IDs, built on first access after the last call to {@link #setTaxIds(List)};
     * empty if there are no taxonomy IDs.
     */
    public Set<String> getTaxIdsSet() {
        if (taxIdsSet == null) {
            taxIdsSet = taxIds != null ? new HashSet<>(taxIds) : new HashSet<>();
        }
        return taxIdsSet;
    }

    public float getLuceneScore() {
        return luceneScore;
    }

    public void setLuceneScore(float luceneScore) {
        this.luceneScore = luceneScore;
    }

    /**
     * Anchors are hits with a very high probability of being correct. They are used to homogenize inconsistently mapped
     * gene sets.
     *
     * @return True if this SynHit is an anchor, false otherwise.
     */
    public boolean isAnchor() {
        return anchor;
    }

    /**
     * Mark this hit as very likely being correct. This information is used to resolve inconsistencies within
     * gene sets.
     *
     * @param isAnchor Whether this hit serves as an anchor.
     */
    public void setAnchor(boolean isAnchor) {
        this.anchor = isAnchor;
    }

    public boolean isFamilyName() {
        // family names have an ID prefixed with GENO:
        return id != null && id.startsWith("GENO:");
    }

    /**
     * @return Always false in this class.
     */
    public boolean isRejectionCandidate() {
        return false;
    }

    public String getComment() {
        return comment;
    }

    public void setComment(String comment) {
        this.comment = comment;
    }

    /**
     * Reduces the set of possible ID entries for this SynHit by all entries that have one of the given priorities for this synonym unless the result would be empty.
     *
     * @param priorityToRemove The priorities whose entries should be removed.
     */
    public void removeEntriesWithPriority(Set<? extends Number> priorityToRemove) {
        int[] indicesToKeep = IntStream.range(0, synonymPriorities.size())
                .filter(i -> !priorityToRemove.contains(synonymPriorities.get(i)))
                .toArray();
        keepEntriesAt(indicesToKeep);
    }

    /**
     * Removes all entries except those with the given priority unless the result would be empty.
     *
     * @param priorityToKeep The priority whose entries should be kept.
     */
    public void removeAllEntriesButWithPriority(int priorityToKeep) {
        final Integer wantedPriority = priorityToKeep;
        int[] indicesToKeep = IntStream.range(0, synonymPriorities.size())
                .filter(i -> wantedPriority.equals(synonymPriorities.get(i)))
                .toArray();
        keepEntriesAt(indicesToKeep);
    }

    /**
     * Reduces the ID, taxonomy ID and priority lists to the entries at the given positions. Does nothing if no
     * position is given so that the lists never become empty.
     */
    private void keepEntriesAt(int[] indicesToKeep) {
        if (indicesToKeep.length == 0) {
            return;
        }
        List<String> keptIds = new ArrayList<>(indicesToKeep.length);
        List<String> keptTaxIds = new ArrayList<>(indicesToKeep.length);
        List<Number> keptPriorities = new ArrayList<>(indicesToKeep.length);
        for (int index : indicesToKeep) {
            keptIds.add(ids.get(index));
            keptTaxIds.add(taxIds.get(index));
            keptPriorities.add(synonymPriorities.get(index));
        }
        setIds(keptIds);
        setTaxIds(keptTaxIds);
        setSynonymPriorities(keptPriorities);
    }

    /**
     * The field used by {@link SynHit#compareTo(SynHit)}, {@link SynHit#equals(Object)} and
     * {@link SynHit#hashCode()}: the random number, the lexical score, the contextual score or the single ID.
     */
    public enum CompareType {
        RANDOM, SCORE, SEMSCORE, ID
    }
}