// All downloads are free. Search and download functionality uses the official Maven repository.

// de.julielab.geneexpbase.genemodel.GeneSet (Maven / Gradle / Ivy)

package de.julielab.geneexpbase.genemodel;

import cc.mallet.types.FeatureVector;
import cc.mallet.types.Instance;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import de.julielab.geneexpbase.candidateretrieval.SynHit;

import java.util.*;
import java.util.function.Predicate;
import java.util.function.Supplier;
import java.util.stream.Collector;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class GeneSet extends TreeSet {

    /**
     *
     */
    private static final long serialVersionUID = -4038206150665551536L;
    private List setId;
    private FeatureVector featureVector;
    private GeneMention.SpecificType specificType;
    private boolean isPlural;
    private Instance instance;
    private double specificTypeConfidence;
    private String docId;
    private Map> id2contextScores = Collections.emptyMap();
    private GeneSet geneSet;
    private MentionMappingResult.RejectReason rejectReason;
    private List familyNames;
    private MentionMappingResult mentionMappingResult;
    private int number;
    private String taxId;

    public GeneSet(Set genes, List setId) {
        this();
        addAll(genes);
        this.setId = setId;
    }

    public GeneSet() {
        super(Comparator.comparingInt(GeneMention::getBegin).thenComparing(System::identityHashCode));
    }

    @Override
    public boolean addAll(Collection c) {
        boolean b = super.addAll(c);
        c.forEach(gm -> gm.addGeneSet(this));
        return b;
    }

    public boolean addAll(Collection c, boolean addGenesToSet) {
        if (addGenesToSet)
            return super.addAll(c);
        else {
            boolean changed = false;
            for (GeneMention gm : c)
                if (add(gm, addGenesToSet))
                    changed = true;
            return changed;
        }
    }

    private boolean add(GeneMention gm, boolean addGenesToSet) {
        boolean add = super.add(gm);
        if (addGenesToSet) {
            gm.addGeneSet(this);
        }
        return add;
    }

    @Override
    public boolean add(GeneMention geneMention) {
        return add(geneMention, true);
    }

    public String getTaxId() {
        return taxId;
    }

    public void setTaxId(String taxId) {
        this.taxId = taxId;
    }

    public int getSmallestGeneBegin() {
        Optional first = stream().findFirst();
        if (first.isEmpty())
            return 0;
        return first.get().getBegin();
    }

    public List getAllGoldIdsAsList() {
        return getAllGoldIds(Collectors.toList(), Collections::emptyList);
    }

    public Set getAllGoldIdsAsSet() {
        return getAllGoldIds(Collectors.toSet(), Collections::emptySet);
    }

    private  R getAllGoldIds(Collector collector, Supplier emptyResultSupplier) {
        if (!hasGoldMentions()) return emptyResultSupplier.get();
        return getAllGoldIds().map(id -> (T) id).collect(collector);
    }

    private boolean hasGoldMentions() {
        return stream().anyMatch(GeneMention::hasGoldMentions);
    }

    public Stream getAllGoldIds() {
        if (!hasGoldMentions()) return Stream.empty();
        return stream().map(GeneMention::getOverlappingGoldMentions).flatMap(Collection::stream).map(GeneMention::getIds).flatMap(Collection::stream);
    }

    public void addContextScore(String id, String contextFieldName, Double score) {
        if (id2contextScores.isEmpty())
            id2contextScores = new HashMap<>();
        Map field2score = id2contextScores.compute(id, (k, v) -> v != null ? v : new HashMap<>());
        field2score.put(contextFieldName, score);
    }

    public double getContextScore(String id, String contextFieldName) {
        if (id2contextScores == null || !id2contextScores.containsKey(id) || !id2contextScores.get(id).containsKey(contextFieldName))
            return 0;
        return id2contextScores.get(id).get(contextFieldName);
    }

    public Instance getInstance() {
        return instance;
    }

    public void setInstance(Instance instance) {
        this.instance = instance;

    }

    /**
     * The set ID represent the gene ID that all elements in the set belong to
     *
     * @return The ID of the elements in this set.
     */
    public List getSetId() {
        return setId;
    }

    public void setSetId(List setId) {
        this.setId = setId;
    }

    @Override
    public int hashCode() {
        return System.identityHashCode(this);
    }

    @Override
    public boolean equals(Object obj) {
        return Objects.equals(this, obj);
    }

    /**
     * Returns the text of any gene mention in this set or null, if the set is
     * empty.
     *
     * @return Any gene mention text of this set.
     */
    public String getRepresentationText() {
        Optional any = stream().findAny();
        if (any.isPresent())
            return any.get().getText();
        return null;
    }

    public FeatureVector getFeatureVector() {
        return featureVector;
    }

    public void setFeatureVector(FeatureVector featureVector) {
        this.featureVector = featureVector;
        stream().forEach(gm -> gm.setFeatureVector(featureVector));
    }

    public GeneMention.SpecificType getSpecificType() {
        return specificType;
    }

    public void setSpecificType(GeneMention.SpecificType specificType) {
        this.specificType = specificType;
        stream().forEach(gm -> gm.setSpecificType(specificType));
    }

    public boolean isPlural() {
        return isPlural;
    }

    public void setPlural(boolean isPlural) {
        this.isPlural = isPlural;
    }

    public double getSpecificTypeConfidence() {
        return specificTypeConfidence;
    }

    public void setSpecificTypeConfidence(double specificTypeConfidence) {
        this.specificTypeConfidence = specificTypeConfidence;
        stream().forEach(gm -> gm.setSpecificTypeConfidence(specificTypeConfidence));
    }

    public String getDocId() {
        return docId;
    }

    public void setDocId(String docId) {
        this.docId = docId;
    }

    public boolean hasContradictingGeneIdMappings() {
        Set knownIds = new HashSet<>();
        for (GeneMention gm : this) {
            Iterator idIt = gm.getResultCandidates().map(SynHit::getId).iterator();
            while (idIt.hasNext()) {
                String id = idIt.next();
                if (!knownIds.add(id))
                    return true;
            }
        }
        return false;
    }

    public boolean hasContradictingTaxonomyIds() {
        Set knownIds = new HashSet<>();
        for (GeneMention gm : this) {
            Set gmIds = gm.getTaxonomyIds().stream().collect(Collectors.toSet());
            if (knownIds.isEmpty() || !Sets.intersection(knownIds, gmIds).isEmpty())
                knownIds.addAll(gmIds);
            else
                return true;
        }
        return false;
    }

    public Stream getDocumentContext(int numTokensPerGene, boolean distinct) {
        return getDocumentContext(numTokensPerGene, Collections.emptySet(), false, distinct);
    }

    public Stream getDocumentContext(int numTokensPerGene, Set excludedTokens, boolean excludeGeneMentions, boolean distinct) {
        if (numTokensPerGene == 0)
            return Stream.empty();
        Stream contextStream = stream().flatMap(gm -> gm.getDocumentContext(numTokensPerGene, excludedTokens, excludeGeneMentions));
        if (distinct)
            contextStream = contextStream.distinct();
        return contextStream;
    }

    public GeneSet getGeneSet() {
        return geneSet;
    }

    public void setGeneSet(GeneSet geneSet) {
        this.geneSet = geneSet;
    }

    public Map> getId2contextScores() {
        return id2contextScores;
    }

    /**
     * Counts for each taxonomy ID how often it appears with the given species occurrence type.
     *
     * @param hintType
     * @return
     */
    public Map getSpeciesOccurrenceCounts(GeneSpeciesOccurrence hintType) {
        Map counts = new HashMap<>();
        for (GeneMention gm : this) {
            Multimap taxonomyCandidates = gm.getTaxonomyOccurrences();
            for (String taxId : taxonomyCandidates.keySet()) {
                Collection geneSpeciesOccurrences = taxonomyCandidates.get(taxId);
                if (geneSpeciesOccurrences.contains(hintType))
                    counts.merge(taxId, 1, Integer::sum);
            }
        }
        return counts;
    }


    /**
     * Returns those taxonomy IDs that occur most frequently with the specified occurrence type. All tax IDs with the
     * highest count are returned.
     *
     * @param hintType
     * @return
     */
    public Set getMostOccurringSpecies(GeneSpeciesOccurrence hintType) {
        Map speciesOccurrenceCounts = getSpeciesOccurrenceCounts(hintType);
        Optional max = speciesOccurrenceCounts.values().stream().max(Integer::compareTo);
        return max.isPresent() ? speciesOccurrenceCounts.keySet().stream().filter(tax -> speciesOccurrenceCounts.get(tax) == max.get()).collect(Collectors.toSet()) : Collections.emptySet();
    }

    /**
     * Returns any potentially existing acronym long form in this gene set.
     *
     * @return The found acronym long form or an empty optional.
     */
    public Optional getAbbreviationLongform() {
        return stream().filter(GeneMention::isAbbreviationLongForm).findAny();
    }

    public Stream getGeneMentionsWithSpeciesOccurrence(GeneSpeciesOccurrence occurrenceType) {
        return stream().filter(gm -> gm.getTaxonomyOccurrences().values().contains(occurrenceType));
    }

    public Stream getCandidateGeneIds() {
        return stream().map(GeneMention::getMentionMappingResult).filter(Objects::nonNull).flatMap(mmr -> mmr.tax2originalCandidates.keySet().stream().flatMap(tax -> mmr.tax2originalCandidates.get(tax).stream()).map(SynHit::getId));
    }

    public Stream getResultSynHits() {
        return stream().flatMap(GeneMention::getResultCandidates);
    }

    public boolean hasRejectedAndNonRejectedMentions() {
        Boolean rejected = null;
        for (GeneMention gm : this) {
            boolean thisGmIsRejected = gm.isRejected();
            if (rejected == null) {
                rejected = thisGmIsRejected;
            } else {
                if (rejected ^ thisGmIsRejected)
                    return true;
            }
        }
        return false;
    }

    public MentionMappingResult.RejectReason getRejectReason() {
        return rejectReason;
    }

    public void setRejectReason(MentionMappingResult.RejectReason rejectReason) {
        this.rejectReason = rejectReason;
    }

    public boolean isRejected() {
        return setId != null && !setId.isEmpty() && setId.stream().allMatch(Predicate.not(sh -> sh != MentionMappingResult.REJECTION));
    }

    public List getFamilyNames() {
        return familyNames;
    }

    public void setFamilyNames(List familyNames) {
        this.familyNames = familyNames;
    }

    public MentionMappingResult getMentionMappingResult() {
        return mentionMappingResult;
    }

    public void setMentionMappingResult(MentionMappingResult mentionMappingResult) {
        this.mentionMappingResult = mentionMappingResult;
    }

    public int getNumber() {
        return number;
    }

    public void setNumber(int number) {
        this.number = number;
    }

    public Set getContradictingGoldGeneIds() {
        Set nonSharedGoldIds = new HashSet<>();
        for (GeneMention gm1 : this) {
            for (GeneMention gm2 : this) {
                Set gold1 = gm1.getAllGoldIdAsSet();
                Set gold2 = gm2.getAllGoldIdAsSet();
                if (gold1.stream().noneMatch(gold2::contains))
                    nonSharedGoldIds.addAll(Sets.symmetricDifference(gold1, gold2));
            }
        }
        return nonSharedGoldIds;
    }

    public SynHit getBestCandidate() {
        SynHit ret = null;
        if (mentionMappingResult != null && mentionMappingResult.tax2finalRankedCandidates != null) {
            List synHits = mentionMappingResult.tax2finalRankedCandidates.get(taxId);
            if (synHits != null && !synHits.isEmpty()) {
                ret = synHits.get(0);
            }
        }
        return ret;
    }

    public Stream getNonRejectedGenes() {
        return stream().filter(Predicate.not(GeneMention::isRejected));
    }

    public Iterable getNonRejectedGenesIterable() {
        return () -> getNonRejectedGenes().iterator();
    }
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy