
// de.julielab.geneexpbase.candidateretrieval.SynHit (Maven / Gradle / Ivy)
package de.julielab.geneexpbase.candidateretrieval;
import de.julielab.geneexpbase.GeneExpRuntimeException;
import de.julielab.geneexpbase.genemodel.GeneName;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
/**
 * A synonym that was retrieved as a candidate for a textual mention, together with the gene IDs,
 * taxonomy IDs and synonym priorities known for it and a number of scores.
 * <p>
 * {@link #getIds()}, {@link #getTaxIds()} and {@link #getSynonymPriorities()} are used as parallel
 * lists by this class: the entries at the same index are read together (see
 * {@link #restrictToTaxId(String)}, {@link #setId(String)} and {@link #getGeneIdsOfTaxId(String)}).
 * <p>
 * {@link #equals(Object)}, {@link #hashCode()} and {@link #compareTo(SynHit)} all depend on the
 * mutable {@link #getCompareType() compare type}; changing it while an instance is stored in a hash
 * based or sorted collection changes the instance's identity for that collection.
 */
public class SynHit implements Comparable<SynHit>, Cloneable, Serializable {
    public static final String TYPE_GEPRO = "Gene/Protein";
    public static final String TYPE_GROUP = "Group";
    /*
     * this is a random id used for sorting; it is never assigned within this class
     */
    int random;
    private String synonym;
    private float luceneScore;
    private double lexicalScore;
    private double contextualScore;
    private double overallScore;
    private float relevanceScore;
    private boolean isExactMatch;
    /**
     * Species mention scores keyed by taxonomy ID.
     */
    private Map<String, Double> speciesMentionScores = new HashMap<>();
    /**
     * All known Entrez Gene IDs for this synonym.
     */
    private List<String> ids;
    /**
     * Lazily built set of the entries of {@link #ids}; reset whenever {@link #setIds(List)} is called.
     */
    private Set<String> idsSet;
    /**
     * The gene IDs associated with the assigned taxonomy ID. No method of this class assigns this
     * field, so {@link #getTaxonomySpecificIds()} returns whatever value the field was left with.
     *
     * @deprecated we use gene records instead where the ID is unique per record
     */
    @Deprecated
    private String[] taxonomySpecificIds;
    private String id;
    private String source;
    private String mappedMention; // the mention found in text and searched for
    // compare type is used during scoring if two synsets have same score
    // (see in compareTo(...) method)
    private CompareType compareType = CompareType.ID;
    private String entityType;
    /**
     * All known tax IDs for this synonym.
     */
    private List<String> taxIds;
    /**
     * Lazily built set of the entries of {@link #taxIds}; reset whenever {@link #setTaxIds(List)} is called.
     */
    private Set<String> taxIdsSet;
    private String taxId;
    private GeneName mappedGeneName;
    private List<Integer> synonymPriorities;
    private boolean anchor;
    private String comment;

    /**
     * Creates a hit with exactly one gene ID. The taxonomy ID list and the synonym priorities stay unset.
     *
     * @param syn    The synonym.
     * @param score  The lexical score of the synonym.
     * @param id     The single gene ID; also becomes the only element of {@link #getIds()}.
     * @param source The source of the synonym.
     */
    public SynHit(String syn, double score, String id, String source) {
        this.synonym = syn;
        this.lexicalScore = score;
        this.ids = Arrays.asList(id);
        this.id = id;
        this.source = source;
    }

    /**
     * Creates a hit with a list of gene IDs and a list of taxonomy IDs. Neither the single
     * {@link #getId() id} nor the single {@link #getTaxId() taxId} is derived from the lists here.
     *
     * @param synonym    The synonym.
     * @param score      The lexical score of the synonym.
     * @param ids        The gene IDs known for the synonym.
     * @param source     The source of the synonym.
     * @param entityType The entity type, e.g. {@link #TYPE_GEPRO} or {@link #TYPE_GROUP}.
     * @param taxIds     The taxonomy IDs known for the synonym.
     */
    public SynHit(String synonym, double score, List<String> ids, String source, String entityType, List<String> taxIds) {
        this.synonym = synonym;
        this.lexicalScore = score;
        this.ids = ids;
        this.source = source;
        this.entityType = entityType;
        this.taxIds = taxIds;
    }

    /**
     * Returns a comparator that sorts SynHits first according to their equality to the gene name and then by
     * score. This helps to overcome the issue that Lucene's float scores sometimes fail to put an exact equal
     * match to the top of the results.
     * <p>
     * The whole comparator is reversed at the end, so hits whose synonym equals {@code geneName} come first
     * and, within each group, higher Lucene scores come first.
     *
     * @param geneName A gene name to compare to.
     * @return A comparator that can be used to sort lists of SynHits.
     */
    public static Comparator<SynHit> getNormalizedExactMatchThenLuceneScoreComparator(final String geneName) {
        Comparator<SynHit> exactMatchFirst = Comparator.comparing(sh -> sh.getSynonym().equals(geneName));
        return exactMatchFirst.thenComparingDouble(sh -> sh.getLuceneScore()).reversed();
    }

    public float getRelevanceScore() {
        return relevanceScore;
    }

    public void setRelevanceScore(float relevanceScore) {
        this.relevanceScore = relevanceScore;
    }

    public String getEntityType() {
        return entityType;
    }

    /**
     * @return The live map of species mention scores keyed by taxonomy ID.
     */
    public Map<String, Double> getSpeciesMentionScores() {
        return speciesMentionScores;
    }

    /**
     * @param taxId A taxonomy ID.
     * @return The score stored for the taxonomy ID or null if none was stored.
     */
    public Double getSpeciesMentionScore(String taxId) {
        return speciesMentionScores.get(taxId);
    }

    public void setSpeciesMentionScore(String taxId, double speciesMentionScore) {
        speciesMentionScores.put(taxId, speciesMentionScore);
    }

    /**
     * Sets the single gene ID and taxonomy ID of this hit to the entry whose taxonomy ID equals the given
     * one. If several entries match, the last one wins. The ID lists themselves are not reduced.
     *
     * @param taxId The taxonomy ID to restrict this hit to.
     * @throws IllegalArgumentException If no entry with the given taxonomy ID exists. The single gene ID and
     *                                  taxonomy ID have already been reset to null in that case.
     */
    public void restrictToTaxId(String taxId) {
        this.id = null;
        this.taxId = null;
        for (int i = 0; i < ids.size(); i++) {
            if (taxIds.get(i).equals(taxId)) {
                this.id = ids.get(i);
                this.taxId = taxId;
            }
        }
        if (this.id == null) {
            throw new IllegalArgumentException("This SynHit does not contain taxonomy ID " + taxId + ": " + this);
        }
    }

    /**
     * @return The lexical score.
     */
    public double getLexicalScore() {
        return lexicalScore;
    }

    public void setLexicalScore(double score) {
        this.lexicalScore = score;
    }

    public double getContextualScore() {
        return this.contextualScore;
    }

    public void setContextualScore(double score) {
        this.contextualScore = score;
    }

    public String getSynonym() {
        return synonym;
    }

    public void setSynonym(String syn) {
        this.synonym = syn;
    }

    @Override
    public String toString() {
        return "syn=" + synonym + "\tid=" + getId() + "\tscore=" + lexicalScore + "\tsemScore="
                + contextualScore + "\tid=" + id + "\ttaxId="
                + taxId;
    }

    /**
     * Orders two SynHits by the criterion selected with {@link #setCompareType(CompareType)}. All criteria
     * sort in descending order (the other hit's value is compared against this hit's value).
     *
     * @param o The hit to compare with.
     * @return The comparison result.
     * @throws IllegalStateException If both hits use different compare types, or if the compare type is
     *                               {@link CompareType#ID} and one of the IDs is null.
     */
    @Override
    public int compareTo(SynHit o) {
        if (this.compareType != o.compareType) {
            throw new IllegalStateException(
                    "Two SynHits are compared that don't use the same comparison type: " + this + ", " + o);
        }
        switch (this.compareType) {
            case RANDOM:
                return Integer.compare(o.random, this.random);
            case SCORE:
                return Double.compare(o.lexicalScore, this.lexicalScore);
            case SEMSCORE:
                return Double.compare(o.contextualScore, this.contextualScore);
            case ID:
                if (o.id == null || id == null) {
                    throw new IllegalStateException("Trying to compare SynHits by ID where the id is null.");
                }
                return o.id.compareTo(id);
            default:
                throw new IllegalStateException("Unexpected value: " + this.compareType);
        }
    }

    /**
     * Two SynHits are equal if the value selected by this hit's compare type is equal in both.
     * Scores are compared with {@link Double#compare(double, double)} so that the result agrees with
     * {@link #hashCode()} and {@link #compareTo(SynHit)} for NaN and signed zeros.
     *
     * @throws IllegalStateException If the compare type is {@link CompareType#ID} and one of the IDs is null.
     */
    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        if (other == null || getClass() != other.getClass()) {
            return false;
        }
        SynHit o = (SynHit) other;
        switch (this.compareType) {
            case RANDOM:
                return o.random == this.random;
            case SCORE:
                return Double.compare(o.lexicalScore, this.lexicalScore) == 0;
            case SEMSCORE:
                return Double.compare(o.contextualScore, this.contextualScore) == 0;
            case ID:
                if (o.id == null || id == null) {
                    throw new IllegalStateException("Trying to compare SynHits by ID where the id is null.");
                }
                return o.id.equals(id);
            default:
                throw new IllegalStateException("Unexpected value: " + this.compareType);
        }
    }

    /**
     * Hashes the value selected by the current compare type, mirroring {@link #equals(Object)}.
     */
    @Override
    public int hashCode() {
        switch (this.compareType) {
            case RANDOM:
                return Integer.hashCode(this.random);
            case SCORE:
                return Double.hashCode(this.lexicalScore);
            case SEMSCORE:
                return Double.hashCode(this.contextualScore);
            case ID:
                return Objects.hashCode(this.id);
            default:
                throw new IllegalStateException("Unexpected value: " + this.compareType);
        }
    }

    public String getSource() {
        return source;
    }

    public void setSource(String source) {
        this.source = source;
    }

    public CompareType getCompareType() {
        return compareType;
    }

    public void setCompareType(CompareType type) {
        this.compareType = type;
    }

    /**
     * The potentially normalized and/or transformed original entity text
     * mention for which this candidate has been retrieved.
     *
     * @return The string-normalized entity name that this candidate was matched
     * to.
     */
    public String getMappedMention() {
        return mappedMention;
    }

    public void setMappedMention(String mappedSynonym) {
        this.mappedMention = mappedSynonym;
    }

    public boolean isExactMatch() {
        return isExactMatch;
    }

    public void setExactMatch(boolean exactMatch) {
        isExactMatch = exactMatch;
    }

    /**
     * Creates a copy with its own species score map and its own ID, taxonomy ID and priority lists.
     * The lazily built ID sets are not carried over; the copy rebuilds them on demand from its own lists.
     *
     * @return The copy.
     */
    @Override
    public SynHit clone() {
        try {
            SynHit copy = (SynHit) super.clone();
            copy.speciesMentionScores = new HashMap<>(speciesMentionScores);
            if (ids != null) {
                copy.ids = new ArrayList<>(ids);
            }
            if (taxIds != null) {
                copy.taxIds = new ArrayList<>(taxIds);
            }
            if (synonymPriorities != null) {
                copy.synonymPriorities = new ArrayList<>(synonymPriorities);
            }
            // super.clone() copied the references to the cached sets; drop them so that the copy
            // does not share mutable state with this instance.
            copy.idsSet = null;
            copy.taxIdsSet = null;
            return copy;
        } catch (CloneNotSupportedException e) {
            throw new GeneExpRuntimeException(e);
        }
    }

    public List<String> getIds() {
        return ids;
    }

    /**
     * Replaces the gene ID list. A single-element list also sets the single gene ID, a list with more
     * than one element resets it to null, an empty list leaves it unchanged.
     *
     * @param ids The new gene IDs.
     */
    public void setIds(List<String> ids) {
        this.ids = ids;
        if (this.ids.size() == 1) {
            this.id = ids.get(0);
        } else if (this.ids.size() > 1) {
            this.id = null;
        }
        idsSet = null;
    }

    /**
     * @return A set of the current gene IDs, built on first access after the last {@link #setIds(List)} call.
     */
    public Set<String> getIdsSet() {
        if (idsSet == null) {
            idsSet = new HashSet<>(ids);
        }
        return idsSet;
    }

    /**
     * Returns true if a single gene or protein ID of this synonym has been determined. This is also
     * the case when the ID list has at most one element.
     *
     * @return true if the final gene/protein ID of this synonym has been set, false otherwise.
     */
    public boolean isDisambiguated() {
        return id != null || ids.size() <= 1;
    }

    /**
     * Returns true if there is more than one gene ID associated with this synonym.
     *
     * @return Whether there are multiple gene IDs for this synonym.
     */
    public boolean isAmbiguousInGeneral() {
        return ids.size() > 1;
    }

    /**
     * Returns true if at least one taxonomy ID associated with this synonym appears multiple times.
     * An empty taxonomy ID list is not ambiguous.
     *
     * @return Whether this synonym exists for multiple different genes of the same species.
     */
    public boolean isIntraSpeciesAmbiguousInGeneral() {
        Set<String> seenTaxIds = new HashSet<>();
        for (String currentTaxId : taxIds) {
            if (!seenTaxIds.add(currentTaxId)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns true if there are at least two distinct taxonomy IDs associated with this synonym.
     *
     * @return Whether there are multiple different species that have a gene with this synonym.
     */
    public boolean isInterSpeciesAmbiguousInGeneral() {
        if (taxIds.size() <= 1) {
            return false;
        }
        // Two distinct values exist exactly if some element differs from the first one.
        String firstTaxId = taxIds.get(0);
        for (String currentTaxId : taxIds) {
            if (!Objects.equals(firstTaxId, currentTaxId)) {
                return true;
            }
        }
        return false;
    }

    public String getId() {
        return id;
    }

    /**
     * Sets the single gene ID of this hit. If the gene ID, taxonomy ID and synonym priority lists are
     * all present and of equal size, they are additionally reduced to the one entry belonging to the ID.
     * Otherwise only the single gene ID is assigned.
     *
     * @param id The gene ID to accept for this hit.
     * @throws IllegalArgumentException If the lists are reduced and the ID is not among the gene IDs of
     *                                  this hit. The hit is left unchanged in that case.
     */
    public void setId(String id) {
        boolean parallelLists = ids != null && taxIds != null && synonymPriorities != null
                && ids.size() == taxIds.size() && ids.size() == synonymPriorities.size();
        if (!parallelLists) {
            this.id = id;
            return;
        }
        // Immutable lists reject indexOf(null), so null is handled without asking the list.
        int idIndex = id == null ? -1 : ids.indexOf(id);
        if (idIndex < 0) {
            throw new IllegalArgumentException("This SynHit does not contain gene ID " + id + ": " + this);
        }
        this.id = id;
        setIds(List.of(ids.get(idIndex)));
        setTaxIds(List.of(taxIds.get(idIndex)));
        setSynonymPriorities(List.of(synonymPriorities.get(idIndex)));
    }

    /**
     * @param id A gene ID of this hit.
     * @return The taxonomy ID stored at the position of the first occurrence of the gene ID.
     */
    public String getTaxIdForGeneId(String id) {
        int i = this.ids.indexOf(id);
        assert i >= 0 : "Trying to get the taxonomy ID of a gene ID that this SynHit does not have as a possibility.";
        return taxIds.get(i);
    }

    public List<String> getTaxIds() {
        return taxIds;
    }

    /**
     * Replaces the taxonomy ID list. A single-element list also sets the single taxonomy ID, a list with
     * more than one element resets it to null. The cached set returned by {@link #getTaxIdsSet()} is reset.
     *
     * @param taxIds The new taxonomy IDs; must not be null or empty.
     */
    public void setTaxIds(List<String> taxIds) {
        assert taxIds != null && !taxIds.isEmpty() : "Trying to set an empty taxonomy ID list to a SynHit.";
        this.taxIds = taxIds;
        if (this.taxIds.size() == 1) {
            this.taxId = this.taxIds.get(0);
        } else if (this.taxIds.size() > 1) {
            this.taxId = null;
        }
        // keep the cache in sync with the list, as setIds does for idsSet
        taxIdsSet = null;
    }

    public double getOverallScore() {
        return overallScore;
    }

    public void setOverallScore(double overallScore) {
        this.overallScore = overallScore;
    }

    public GeneName getMappedGeneName() {
        return mappedGeneName;
    }

    public void setMappedGeneName(GeneName mappedGeneName) {
        this.mappedGeneName = mappedGeneName;
    }

    /**
     * Returns this single accepted taxonomy ID for this synonym (depends on the document context and may differ
     * for different textual occurrences of this synonym) or null if not set.
     * The taxonomy ID is set by {@link #setTaxId(String)}, {@link #setTaxIds(List)} or
     * {@link #restrictToTaxId(String)}.
     *
     * @return The taxonomy ID associated with this synonym or null if it wasn't successfully set.
     * @see #setTaxId(String)
     * @see #getTaxIds()
     */
    public String getTaxId() {
        return taxId;
    }

    /**
     * Accepts the passed taxonomy ID as assigned to this synonym. Only the single taxonomy ID is written:
     * the value is not checked against {@link #getTaxIds()}, and neither the gene ID nor the ID lists are
     * changed. Use {@link #restrictToTaxId(String)} to also determine the gene ID.
     *
     * @param taxId The taxonomy ID to assign this synonym.
     * @see #getTaxId()
     * @see #restrictToTaxId(String)
     */
    public void setTaxId(String taxId) {
        this.taxId = taxId;
    }

    /**
     * @return Synonym centric index: The gene IDs associated with the taxonomy ID fixed for this SynHit
     * @deprecated We will be using the gene record index so this method won't be used in the future.
     */
    @Deprecated
    public String[] getTaxonomySpecificIds() {
        return taxonomySpecificIds;
    }

    /**
     * @param idArray Gene IDs.
     * @return The synonym priorities of those entries of this hit whose gene ID is in the array.
     */
    public List<Integer> getPrioritiesOfIds(String[] idArray) {
        return getPrioritiesOfIds(Stream.of(idArray));
    }

    /**
     * @param idStream Gene IDs; the stream is consumed.
     * @return The synonym priorities of those entries of this hit whose gene ID is in the stream, in the
     * order of the entries of this hit.
     */
    public List<Integer> getPrioritiesOfIds(Stream<String> idStream) {
        final Set<String> idSet = idStream.collect(Collectors.toSet());
        return IntStream.range(0, ids.size())
                .filter(i -> idSet.contains(ids.get(i)))
                .mapToObj(synonymPriorities::get)
                .collect(Collectors.toList());
    }

    public boolean hasTaxId(String taxId) {
        return taxIds.contains(taxId);
    }

    public List<Integer> getSynonymPriorities() {
        return synonymPriorities;
    }

    public void setSynonymPriorities(List<Integer> synonymPriorities) {
        this.synonymPriorities = synonymPriorities;
    }

    /**
     * @return The first synonym priority.
     */
    public int getSynonymPriority() {
        return synonymPriorities.get(0);
    }

    /**
     * @param taxId A taxonomy ID.
     * @return The gene IDs of all entries with the given taxonomy ID; empty if no taxonomy IDs are set.
     */
    public Stream<String> getGeneIdsOfTaxId(String taxId) {
        if (taxIds == null) {
            return Stream.empty();
        }
        return IntStream.range(0, taxIds.size())
                .filter(i -> taxIds.get(i).equals(taxId))
                .mapToObj(ids::get);
    }

    /**
     * @return A set of the current taxonomy IDs, built on first access after the last
     * {@link #setTaxIds(List)} call.
     */
    public Set<String> getTaxIdsSet() {
        if (taxIdsSet == null) {
            taxIdsSet = new HashSet<>(taxIds);
        }
        return taxIdsSet;
    }

    public float getLuceneScore() {
        return luceneScore;
    }

    public void setLuceneScore(float luceneScore) {
        this.luceneScore = luceneScore;
    }

    /**
     * Anchors are hits with a very high probability of being correct. They are used to homogenize inconsistently mapped
     * gene sets.
     *
     * @return True if this SynHit is an anchor, false otherwise.
     */
    public boolean isAnchor() {
        return anchor;
    }

    /**
     * Mark this hit as very likely being correct. This information is used to resolve inconsistencies within
     * gene sets.
     *
     * @param isAnchor Whether this hit serves as an anchor.
     */
    public void setAnchor(boolean isAnchor) {
        this.anchor = isAnchor;
    }

    public boolean isFamilyName() {
        // family names have an ID prefixed with GENO:
        return id != null && id.startsWith("GENO:");
    }

    /**
     * @return Always false in this class.
     */
    public boolean isRejectionCandidate() {
        return false;
    }

    public String getComment() {
        return comment;
    }

    public void setComment(String comment) {
        this.comment = comment;
    }

    /**
     * Reduces the set of possible ID entries for this SynHit by all entries that have one of the given
     * priorities for this synonym unless the result would be empty.
     *
     * @param priorityToRemove The priorities whose entries are removed.
     */
    public void removeEntriesWithPriority(Set<Integer> priorityToRemove) {
        int[] indicesToKeep = IntStream.range(0, synonymPriorities.size())
                .filter(i -> !priorityToRemove.contains(synonymPriorities.get(i)))
                .toArray();
        retainEntries(indicesToKeep);
    }

    /**
     * Removes all entries except those with the given priority unless the result would be empty.
     *
     * @param priorityToKeep The priority whose entries are kept.
     */
    public void removeAllEntriesButWithPriority(int priorityToKeep) {
        int[] indicesToKeep = IntStream.range(0, synonymPriorities.size())
                .filter(i -> synonymPriorities.get(i).equals(priorityToKeep))
                .toArray();
        retainEntries(indicesToKeep);
    }

    /**
     * Reduces the gene ID, taxonomy ID and priority lists to the given positions. Does nothing for an
     * empty index array so that the lists never become empty through this method.
     */
    private void retainEntries(int[] indicesToKeep) {
        // don't reduce the IDs if it would lead to an empty set
        if (indicesToKeep.length == 0) {
            return;
        }
        // Build all three lists before assigning any of them so a failure leaves the hit untouched.
        List<String> keptIds = IntStream.of(indicesToKeep).mapToObj(ids::get).collect(Collectors.toList());
        List<String> keptTaxIds = IntStream.of(indicesToKeep).mapToObj(taxIds::get).collect(Collectors.toList());
        List<Integer> keptPriorities = IntStream.of(indicesToKeep).mapToObj(synonymPriorities::get).collect(Collectors.toList());
        setIds(keptIds);
        setTaxIds(keptTaxIds);
        setSynonymPriorities(keptPriorities);
    }

    /**
     * The value that {@link #compareTo(SynHit)}, {@link #equals(Object)} and {@link #hashCode()} operate on.
     */
    public enum CompareType {
        RANDOM, SCORE, SEMSCORE, ID
    }
}