
// Listing of de.julielab.geneexpbase.genemodel.GeneSet (Maven / Gradle / Ivy)
package de.julielab.geneexpbase.genemodel;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.Instance;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import de.julielab.geneexpbase.candidateretrieval.SynHit;
import java.util.*;
import java.util.function.Predicate;
import java.util.function.Supplier;
import java.util.stream.Collector;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class GeneSet extends TreeSet {
/**
*
*/
private static final long serialVersionUID = -4038206150665551536L;
private List setId;
private FeatureVector featureVector;
private GeneMention.SpecificType specificType;
private boolean isPlural;
private Instance instance;
private double specificTypeConfidence;
private String docId;
private Map> id2contextScores = Collections.emptyMap();
private GeneSet geneSet;
private MentionMappingResult.RejectReason rejectReason;
private List familyNames;
private MentionMappingResult mentionMappingResult;
private int number;
private String taxId;
public GeneSet(Set genes, List setId) {
this();
addAll(genes);
this.setId = setId;
}
public GeneSet() {
super(Comparator.comparingInt(GeneMention::getBegin).thenComparing(System::identityHashCode));
}
@Override
public boolean addAll(Collection extends GeneMention> c) {
boolean b = super.addAll(c);
c.forEach(gm -> gm.addGeneSet(this));
return b;
}
public boolean addAll(Collection extends GeneMention> c, boolean addGenesToSet) {
if (addGenesToSet)
return super.addAll(c);
else {
boolean changed = false;
for (GeneMention gm : c)
if (add(gm, addGenesToSet))
changed = true;
return changed;
}
}
private boolean add(GeneMention gm, boolean addGenesToSet) {
boolean add = super.add(gm);
if (addGenesToSet) {
gm.addGeneSet(this);
}
return add;
}
@Override
public boolean add(GeneMention geneMention) {
return add(geneMention, true);
}
public String getTaxId() {
return taxId;
}
public void setTaxId(String taxId) {
this.taxId = taxId;
}
public int getSmallestGeneBegin() {
Optional first = stream().findFirst();
if (first.isEmpty())
return 0;
return first.get().getBegin();
}
public List getAllGoldIdsAsList() {
return getAllGoldIds(Collectors.toList(), Collections::emptyList);
}
public Set getAllGoldIdsAsSet() {
return getAllGoldIds(Collectors.toSet(), Collections::emptySet);
}
private R getAllGoldIds(Collector super T, A, R> collector, Supplier emptyResultSupplier) {
if (!hasGoldMentions()) return emptyResultSupplier.get();
return getAllGoldIds().map(id -> (T) id).collect(collector);
}
private boolean hasGoldMentions() {
return stream().anyMatch(GeneMention::hasGoldMentions);
}
public Stream getAllGoldIds() {
if (!hasGoldMentions()) return Stream.empty();
return stream().map(GeneMention::getOverlappingGoldMentions).flatMap(Collection::stream).map(GeneMention::getIds).flatMap(Collection::stream);
}
public void addContextScore(String id, String contextFieldName, Double score) {
if (id2contextScores.isEmpty())
id2contextScores = new HashMap<>();
Map field2score = id2contextScores.compute(id, (k, v) -> v != null ? v : new HashMap<>());
field2score.put(contextFieldName, score);
}
public double getContextScore(String id, String contextFieldName) {
if (id2contextScores == null || !id2contextScores.containsKey(id) || !id2contextScores.get(id).containsKey(contextFieldName))
return 0;
return id2contextScores.get(id).get(contextFieldName);
}
public Instance getInstance() {
return instance;
}
public void setInstance(Instance instance) {
this.instance = instance;
}
/**
* The set ID represent the gene ID that all elements in the set belong to
*
* @return The ID of the elements in this set.
*/
public List getSetId() {
return setId;
}
public void setSetId(List setId) {
this.setId = setId;
}
@Override
public int hashCode() {
return System.identityHashCode(this);
}
@Override
public boolean equals(Object obj) {
return Objects.equals(this, obj);
}
/**
* Returns the text of any gene mention in this set or null, if the set is
* empty.
*
* @return Any gene mention text of this set.
*/
public String getRepresentationText() {
Optional any = stream().findAny();
if (any.isPresent())
return any.get().getText();
return null;
}
public FeatureVector getFeatureVector() {
return featureVector;
}
public void setFeatureVector(FeatureVector featureVector) {
this.featureVector = featureVector;
stream().forEach(gm -> gm.setFeatureVector(featureVector));
}
public GeneMention.SpecificType getSpecificType() {
return specificType;
}
public void setSpecificType(GeneMention.SpecificType specificType) {
this.specificType = specificType;
stream().forEach(gm -> gm.setSpecificType(specificType));
}
public boolean isPlural() {
return isPlural;
}
public void setPlural(boolean isPlural) {
this.isPlural = isPlural;
}
public double getSpecificTypeConfidence() {
return specificTypeConfidence;
}
public void setSpecificTypeConfidence(double specificTypeConfidence) {
this.specificTypeConfidence = specificTypeConfidence;
stream().forEach(gm -> gm.setSpecificTypeConfidence(specificTypeConfidence));
}
public String getDocId() {
return docId;
}
public void setDocId(String docId) {
this.docId = docId;
}
public boolean hasContradictingGeneIdMappings() {
Set knownIds = new HashSet<>();
for (GeneMention gm : this) {
Iterator idIt = gm.getResultCandidates().map(SynHit::getId).iterator();
while (idIt.hasNext()) {
String id = idIt.next();
if (!knownIds.add(id))
return true;
}
}
return false;
}
public boolean hasContradictingTaxonomyIds() {
Set knownIds = new HashSet<>();
for (GeneMention gm : this) {
Set gmIds = gm.getTaxonomyIds().stream().collect(Collectors.toSet());
if (knownIds.isEmpty() || !Sets.intersection(knownIds, gmIds).isEmpty())
knownIds.addAll(gmIds);
else
return true;
}
return false;
}
public Stream getDocumentContext(int numTokensPerGene, boolean distinct) {
return getDocumentContext(numTokensPerGene, Collections.emptySet(), false, distinct);
}
public Stream getDocumentContext(int numTokensPerGene, Set excludedTokens, boolean excludeGeneMentions, boolean distinct) {
if (numTokensPerGene == 0)
return Stream.empty();
Stream contextStream = stream().flatMap(gm -> gm.getDocumentContext(numTokensPerGene, excludedTokens, excludeGeneMentions));
if (distinct)
contextStream = contextStream.distinct();
return contextStream;
}
public GeneSet getGeneSet() {
return geneSet;
}
public void setGeneSet(GeneSet geneSet) {
this.geneSet = geneSet;
}
public Map> getId2contextScores() {
return id2contextScores;
}
/**
* Counts for each taxonomy ID how often it appears with the given species occurrence type.
*
* @param hintType
* @return
*/
public Map getSpeciesOccurrenceCounts(GeneSpeciesOccurrence hintType) {
Map counts = new HashMap<>();
for (GeneMention gm : this) {
Multimap taxonomyCandidates = gm.getTaxonomyOccurrences();
for (String taxId : taxonomyCandidates.keySet()) {
Collection geneSpeciesOccurrences = taxonomyCandidates.get(taxId);
if (geneSpeciesOccurrences.contains(hintType))
counts.merge(taxId, 1, Integer::sum);
}
}
return counts;
}
/**
* Returns those taxonomy IDs that occur most frequently with the specified occurrence type. All tax IDs with the
* highest count are returned.
*
* @param hintType
* @return
*/
public Set getMostOccurringSpecies(GeneSpeciesOccurrence hintType) {
Map speciesOccurrenceCounts = getSpeciesOccurrenceCounts(hintType);
Optional max = speciesOccurrenceCounts.values().stream().max(Integer::compareTo);
return max.isPresent() ? speciesOccurrenceCounts.keySet().stream().filter(tax -> speciesOccurrenceCounts.get(tax) == max.get()).collect(Collectors.toSet()) : Collections.emptySet();
}
/**
* Returns any potentially existing acronym long form in this gene set.
*
* @return The found acronym long form or an empty optional.
*/
public Optional getAbbreviationLongform() {
return stream().filter(GeneMention::isAbbreviationLongForm).findAny();
}
public Stream getGeneMentionsWithSpeciesOccurrence(GeneSpeciesOccurrence occurrenceType) {
return stream().filter(gm -> gm.getTaxonomyOccurrences().values().contains(occurrenceType));
}
public Stream getCandidateGeneIds() {
return stream().map(GeneMention::getMentionMappingResult).filter(Objects::nonNull).flatMap(mmr -> mmr.tax2originalCandidates.keySet().stream().flatMap(tax -> mmr.tax2originalCandidates.get(tax).stream()).map(SynHit::getId));
}
public Stream getResultSynHits() {
return stream().flatMap(GeneMention::getResultCandidates);
}
public boolean hasRejectedAndNonRejectedMentions() {
Boolean rejected = null;
for (GeneMention gm : this) {
boolean thisGmIsRejected = gm.isRejected();
if (rejected == null) {
rejected = thisGmIsRejected;
} else {
if (rejected ^ thisGmIsRejected)
return true;
}
}
return false;
}
public MentionMappingResult.RejectReason getRejectReason() {
return rejectReason;
}
public void setRejectReason(MentionMappingResult.RejectReason rejectReason) {
this.rejectReason = rejectReason;
}
public boolean isRejected() {
return setId != null && !setId.isEmpty() && setId.stream().allMatch(Predicate.not(sh -> sh != MentionMappingResult.REJECTION));
}
public List getFamilyNames() {
return familyNames;
}
public void setFamilyNames(List familyNames) {
this.familyNames = familyNames;
}
public MentionMappingResult getMentionMappingResult() {
return mentionMappingResult;
}
public void setMentionMappingResult(MentionMappingResult mentionMappingResult) {
this.mentionMappingResult = mentionMappingResult;
}
public int getNumber() {
return number;
}
public void setNumber(int number) {
this.number = number;
}
public Set getContradictingGoldGeneIds() {
Set nonSharedGoldIds = new HashSet<>();
for (GeneMention gm1 : this) {
for (GeneMention gm2 : this) {
Set gold1 = gm1.getAllGoldIdAsSet();
Set gold2 = gm2.getAllGoldIdAsSet();
if (gold1.stream().noneMatch(gold2::contains))
nonSharedGoldIds.addAll(Sets.symmetricDifference(gold1, gold2));
}
}
return nonSharedGoldIds;
}
public SynHit getBestCandidate() {
SynHit ret = null;
if (mentionMappingResult != null && mentionMappingResult.tax2finalRankedCandidates != null) {
List synHits = mentionMappingResult.tax2finalRankedCandidates.get(taxId);
if (synHits != null && !synHits.isEmpty()) {
ret = synHits.get(0);
}
}
return ret;
}
public Stream getNonRejectedGenes() {
return stream().filter(Predicate.not(GeneMention::isRejected));
}
public Iterable getNonRejectedGenesIterable() {
return () -> getNonRejectedGenes().iterator();
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy