All downloads are free. The search and download functionality uses the official Maven repository.

de.julielab.geneexpbase.candidateretrieval.SynHit Maven / Gradle / Ivy

package de.julielab.geneexpbase.candidateretrieval;

import de.julielab.geneexpbase.GeneExpRuntimeException;
import de.julielab.geneexpbase.genemodel.GeneName;

import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;

public class SynHit implements Comparable, Cloneable, Serializable {
    public static final String TYPE_GEPRO = "Gene/Protein";
    public static final String TYPE_GROUP = "Group";
    /*
     * this is a random id used for sorting
     */
    int random;
    private String synonym;

    private float luceneScore;
    private double lexicalScore;
    private double contextualScore;
    private double overallScore;
    private float relevanceScore;
    private boolean isExactMatch;
    private Map speciesMentionScores = new HashMap<>();
    /**
     * All known Entrez Gene IDs for this synonym.
     */
    private List ids;
    /**
     * All known Entrez Gene IDs for this synonym.
     */
    private Set idsSet;
    /**
     * Set by {@link #setTaxId(String)}. Contains all the gene IDs associated with the assigned taxonomy ID in {@link #setTaxId(String)}. This will
     * be unique in most cases but sometimes it isn't.
     *
     * @deprecated we use gene records instead where the ID is unique per record
     */
    @Deprecated
    private String[] taxonomySpecificIds;
    private String id;
    private String source;
    private String mappedMention; // the mention found in text and searched for
    // compare type is used during scoring if two synsets have same score
    // (see in compareTo(...) method)
    private CompareType compareType = CompareType.ID;
    private String entityType;
    /**
     * All known tax IDs for this synonym.
     */
    private List taxIds;
    /**
     * All known tax IDs for this synonym.
     */
    private Set taxIdsSet;
    private String taxId;
    private GeneName mappedGeneName;
    private List synonymPriorities;
    private boolean anchor;
    private String comment;

    /**
     * @param syn
     * @param score
     * @param id
     * @param source
     */
    public SynHit(String syn, double score, String id, String source) {
        this.synonym = syn;
        this.lexicalScore = score;
        this.ids = Arrays.asList(id);
        this.id = id;
        this.source = source;
    }

    public SynHit(String synonym, double score, List ids, String source, String entityType, List taxIds) {
        this.synonym = synonym;
        this.lexicalScore = score;
        this.ids = ids;
        this.source = source;
        this.entityType = entityType;
        this.taxIds = taxIds;
    }

    /**
     * Returns a comparator that sorts SynHits first according to their equality to the gene name and then by score. This helps to overcome the issue that Lucene's float scores sometimes
     * fail to put an exact equal match to the top of the results.
     *
     * @param geneName A gene name to compare to.
     * @return A comparator that can be used to sort lists of SynHits.
     */
    public static Comparator getNormalizedExactMatchThenLuceneScoreComparator(final String geneName) {
        Comparator comparator = Comparator.comparing(sh -> sh.getSynonym().equals(geneName));
        comparator = comparator.thenComparingDouble(sh -> sh.getLuceneScore()).reversed();
        return comparator;
    }

    public float getRelevanceScore() {
        return relevanceScore;
    }

    public void setRelevanceScore(float relevanceScore) {
        this.relevanceScore = relevanceScore;
    }

    public String getEntityType() {
        return entityType;
    }

    public Map getSpeciesMentionScores() {
        return speciesMentionScores;
    }

    public Double getSpeciesMentionScore(String taxId) {
        return speciesMentionScores.get(taxId);
    }

    public void setSpeciesMentionScore(String taxId, double speciesMentionScore) {
        speciesMentionScores.put(taxId, speciesMentionScore);
    }

    public void restrictToTaxId(String taxId) {
        this.id = null;
        this.taxId = null;
        for (int i = 0; i < ids.size(); i++) {
            if (taxIds.get(i).equals(taxId)) {
                this.id = ids.get(i);
                this.taxId = taxId;
            }
        }
        if (this.id == null)
            throw new IllegalArgumentException("This SynHit does not contain taxonomy ID " + taxId + ": " + this);
    }

    /**
     * @return
     */
    public double getLexicalScore() {
        return lexicalScore;
    }

    public void setLexicalScore(double score) {
        this.lexicalScore = score;
    }

    public double getContextualScore() {
        return this.contextualScore;
    }

    public void setContextualScore(double score) {
        this.contextualScore = score;
    }

    public String getSynonym() {
        return synonym;
    }

    public void setSynonym(String syn) {
        this.synonym = syn;
    }

    public String toString() {
        String result = "syn=" + synonym + "\tid=" + getId() + "\tscore=" + lexicalScore + "\tsemScore="
                + contextualScore + "\tid=" + id + "\ttaxId="
                + taxId;
        return result;
    }

    /**
     * the comparator for two SynHits: order by score as set by setCompareType
     * method TODO: find rule how to order if several SynHits have same score
     * currently, random number is chosen
     *
     * @param o
     * @return int
     */
    public int compareTo(SynHit o) {
        int c = 0;
        if (this.compareType != o.compareType)
            throw new IllegalStateException(
                    "Two SynHits are compared that don't use the same comparison type: " + this + ", " + o);
        switch (this.compareType) {
            case RANDOM:
                c = Integer.compare(o.random, this.random);
                break;
            case SCORE:
                c = Double.compare(o.lexicalScore, this.lexicalScore);
                break;
            case SEMSCORE:
                c = Double.compare(o.contextualScore, this.contextualScore);
                break;
            case ID:
                if (o.id == null || id == null)
                    throw new IllegalStateException("Trying to compare SynHits by ID where the id is null.");
                c = o.id.compareTo(id);
                break;
        }
        return c;
    }

    @Override
    public boolean equals(Object other) {
        if (this == other) return true;
        if (other == null || getClass() != other.getClass()) return false;
        SynHit o = (SynHit) other;
        boolean c;
        switch (this.compareType) {
            case RANDOM:
                c = o.random == this.random;
                break;
            case SCORE:
                c = o.lexicalScore == this.lexicalScore;
                break;
            case SEMSCORE:
                c = o.contextualScore == this.contextualScore;
                break;
            case ID:
                if (o.id == null || id == null)
                    throw new IllegalStateException("Trying to compare SynHits by ID where the id is null.");
                c = o.id.equals(id);
                break;
            default:
                throw new IllegalStateException("Unexpected value: " + this.compareType);
        }
        return c;
    }

    @Override
    public int hashCode() {
        int c;
        switch (this.compareType) {
            case RANDOM:
                c = Objects.hashCode(this.random);
                break;
            case SCORE:
                c = Objects.hashCode(this.lexicalScore);
                break;
            case SEMSCORE:
                c = Objects.hashCode(this.contextualScore);
                break;
            case ID:
                c = Objects.hashCode(this.id);
                break;
            default:
                throw new IllegalStateException("Unexpected value: " + this.compareType);
        }
        return c;
    }

    public String getSource() {
        return source;
    }

    public void setSource(String source) {
        this.source = source;
    }

    public CompareType getCompareType() {
        return compareType;
    }

    public void setCompareType(CompareType type) {
        this.compareType = type;
    }

    /**
     * The potentially normalized and/or transformed original entity text
     * mention for which this candidate has been retrieved.
     *
     * @return The string-normalized entity name that this candidate was matched
     * to.
     */
    public String getMappedMention() {
        return mappedMention;
    }

    public void setMappedMention(String mappedSynonym) {
        this.mappedMention = mappedSynonym;
    }

    public boolean isExactMatch() {
        return isExactMatch;
    }

    public void setExactMatch(boolean exactMatch) {
        isExactMatch = exactMatch;
    }

    public SynHit clone() {
        try {
            SynHit h = (SynHit) super.clone();
            h.speciesMentionScores = new HashMap<>(speciesMentionScores);
            h.ids = new ArrayList<>(ids);
            if (taxIds != null)
                h.taxIds = new ArrayList<>(taxIds);
            if (synonymPriorities != null)
                h.synonymPriorities = new ArrayList<>(synonymPriorities);
            return h;
        } catch (CloneNotSupportedException e) {
            throw new GeneExpRuntimeException(e);
        }
    }

    public List getIds() {
        return ids;
    }

    public void setIds(List ids) {
        this.ids = ids;
        if (this.ids.size() == 1)
            this.id = ids.get(0);
        else if (this.ids.size() > 1)
            this.id = null;
        idsSet = null;
    }

    public Set getIdsSet() {
        if (idsSet == null)
            idsSet = new HashSet<>(ids);
        return idsSet;
    }

    /**
     * 

Returns true if a single gene or protein ID of this synonym has been determined.

* * @return true if the final gene/protein ID of this synonym has been set, false otherwise. */ public boolean isDisambiguated() { return id != null || ids.size() <= 1; } /** *

Returns true if there is more than one gene ID associated with this synonym.

* * @return Whether there are multiple gene IDs for this synonym. */ public boolean isAmbiguousInGeneral() { return ids.size() > 1; } /** *

Returns true if at least one taxonomy ID associated with this synonym appears multiple times.

* * @return Whether this synonym exists for multiple different genes of the same species. */ public boolean isIntraSpeciesAmbiguousInGeneral() { Set seenTaxIds = new HashSet<>(); boolean currentTaxIdWasNotYetSeen = false; for (int i = 0; i < taxIds.size() && (currentTaxIdWasNotYetSeen = seenTaxIds.add(taxIds.get(i))); i++) ; return !currentTaxIdWasNotYetSeen; } /** *

Returns true if there are at least two distinct taxonomy IDs associated with this synonym.

* * @return Whether there are multiple different species that have a gene with this synonym. */ public boolean isInterSpeciesAmbiguousInGeneral() { if (taxIds.size() <= 1) return false; Set seenTaxIds = new HashSet<>(); seenTaxIds.add(taxIds.get(0)); boolean currentTaxIdWasNotYetSeen = false; for (int i = 1; i < taxIds.size() && !(currentTaxIdWasNotYetSeen = seenTaxIds.add(taxIds.get(i))); i++) ; return currentTaxIdWasNotYetSeen; } public String getId() { return id; } public void setId(String id) { this.id = id; int idIndex = ids.indexOf(id); if (ids != null && taxIds != null && synonymPriorities != null && ids.size() == taxIds.size() && ids.size() == synonymPriorities.size()) { setIds(List.of(ids.get(idIndex))); setTaxIds(List.of(taxIds.get(idIndex))); setSynonymPriorities(List.of(synonymPriorities.get(idIndex))); } } public String getTaxIdForGeneId(String id) { int i = this.ids.indexOf(id); assert i >= 0 : "Trying to set an ID to a SynHit that does not have this ID as a possibility."; return taxIds.get(i); } public List getTaxIds() { return taxIds; } /** * @param taxIds */ public void setTaxIds(List taxIds) { assert taxIds != null && !taxIds.isEmpty() : "Trying to set an empty taxonomy ID list to a SynHit."; this.taxIds = taxIds; if (this.taxIds.size() == 1) this.taxId = this.taxIds.get(0); else if (this.taxIds.size() > 1) this.taxId = null; } public double getOverallScore() { return overallScore; } public void setOverallScore(double overallScore) { this.overallScore = overallScore; } public GeneName getMappedGeneName() { return mappedGeneName; } public void setMappedGeneName(GeneName mappedGeneName) { this.mappedGeneName = mappedGeneName; } /** *

Returns this single accepted taxonomy ID for this synonym (depends on the document context and may differ * for different textual occurrences of this synonym) or null if not set.

*

The taxonomy ID is set by {@link #setTaxId(String)}.

* * @return The taxonomy ID associated with this synonym or null if it wasn't successfully set. * @see #setTaxId(String) * @see #getTaxIds() */ public String getTaxId() { return taxId; } /** *

Accepts the passed taxonomy ID as assigned to this synonym. This causes the {@link #taxonomySpecificIds} field * to be set which can be retrieved using {@link #getTaxonomySpecificIds()}. In case that the taxonomy ID * assignment unique identifies a single gene/protein ID, this ID will be set to the {@link #id} field, marking * this synonym as being disambiguated.

* * @param taxId The taxonomy ID to assign this synonym. * @throws IllegalArgumentException If the given taxonomy ID cannot be set to this synonym because it does not exist for the given tax ID. * @see #getTaxonomySpecificIds() * @see #getId() * @see #isDisambiguated() */ public void setTaxId(String taxId) { this.taxId = taxId; } /** * @return Synonym centric index: The gene IDs associated with the taxonomy ID fixed for this SynHit * @deprecated We will be using the gene record index so this method won't be used in the future. */ @Deprecated public String[] getTaxonomySpecificIds() { return taxonomySpecificIds; } public List getPrioritiesOfIds(String[] idArray) { return getPrioritiesOfIds(Stream.of(idArray)); } public List getPrioritiesOfIds(Stream idStream) { final Set idSet = idStream.collect(Collectors.toSet()); return IntStream.range(0, ids.size()).filter(i -> idSet.contains(ids.get(i))).mapToObj(synonymPriorities::get).collect(Collectors.toList()); } public boolean hasTaxId(String taxId) { return taxIds.indexOf(taxId) != -1; } public List getSynonymPriorities() { return synonymPriorities; } public void setSynonymPriorities(List synonymPriorities) { this.synonymPriorities = synonymPriorities; } public int getSynonymPriority() { return synonymPriorities.get(0).intValue(); } public Stream getGeneIdsOfTaxId(String taxId) { if (taxIds != null) return IntStream.range(0, taxIds.size()).filter(i -> taxIds.get(i).equals(taxId)).mapToObj(ids::get); return Stream.empty(); } public Set getTaxIdsSet() { if (taxIdsSet == null) taxIdsSet = new HashSet<>(taxIds); return taxIdsSet; } public float getLuceneScore() { return luceneScore; } public void setLuceneScore(float luceneScore) { this.luceneScore = luceneScore; } /** * Anchors are hits with a very high probability of being correct. They are used to homogenize inconsistently mapped * gene sets. * * @return True if this SynHit is an anchor, false otherwise. 
*/ public boolean isAnchor() { return anchor; } /** * Mark this hit as very likely being correct. This information is used to resolve inconsistencies withing * gene sets. * * @param isAnchor Whether this hit serves as an anchor. */ public void setAnchor(boolean isAnchor) { this.anchor = isAnchor; } public boolean isFamilyName() { // family names have an ID prefixed with GENO: return id != null && id.startsWith("GENO:"); } public boolean isRejectionCandidate() { return false; } public String getComment() { return comment; } public void setComment(String comment) { this.comment = comment; } /** * Reduces the set of possible ID entries for this SynHit by all entries that have the given priority for this synonym unless the result would be empty. * * @param priorityToRemove */ public void removeEntriesWithPriority(Set priorityToRemove) { int[] indicesToKeep = IntStream.range(0, synonymPriorities.size()).filter(i -> !priorityToRemove.contains(synonymPriorities.get(i))).toArray(); // don't reduce the IDs if it would lead to an empty set if (indicesToKeep.length > 0) { setIds(IntStream.of(indicesToKeep).mapToObj(i -> ids.get(i)).collect(Collectors.toList())); setTaxIds(IntStream.of(indicesToKeep).mapToObj(i -> taxIds.get(i)).collect(Collectors.toList())); setSynonymPriorities(IntStream.of(indicesToKeep).mapToObj(i -> synonymPriorities.get(i)).collect(Collectors.toList())); } } /** * Removes all entries except those with the given priority unless the result would be empty. 
* * @param priorityToKeep */ public void removeAllEntriesButWithPriority(int priorityToKeep) { int[] indicesToKeep = IntStream.range(0, synonymPriorities.size()).filter(i -> synonymPriorities.get(i).equals(priorityToKeep)).toArray(); // don't reduce the IDs if it would lead to an empty set if (indicesToKeep.length > 0) { setIds(IntStream.of(indicesToKeep).mapToObj(i -> ids.get(i)).collect(Collectors.toList())); setTaxIds(IntStream.of(indicesToKeep).mapToObj(i -> taxIds.get(i)).collect(Collectors.toList())); setSynonymPriorities(IntStream.of(indicesToKeep).mapToObj(i -> synonymPriorities.get(i)).collect(Collectors.toList())); } } public enum CompareType { RANDOM, SCORE, SEMSCORE, ID } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy