All downloads are free. The search and download functionality uses the official Maven repository.

de.julielab.geneexpbase.genemodel.GeneDocument Maven / Gradle / Ivy

package de.julielab.geneexpbase.genemodel;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import com.lahodiuk.ahocorasick.AhoCorasickOptimized;
import de.julielab.geneexpbase.AhoCorasickLongestMatchCallback;
import de.julielab.geneexpbase.TermNormalizer;
import de.julielab.java.utilities.spanutils.OffsetMap;
import de.julielab.java.utilities.spanutils.OffsetSet;
import de.julielab.java.utilities.spanutils.OffsetSpanComparator;
import de.julielab.java.utilities.spanutils.Span;
import org.apache.commons.lang3.Range;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.*;
import java.util.Map.Entry;
import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;

import static de.julielab.geneexpbase.genemodel.GeneMention.GeneTagger;
import static de.julielab.geneexpbase.genemodel.GeneMention.GeneTagger.GOLD;
import static java.util.stream.Collectors.toList;
import static java.util.stream.Collectors.toSet;


public class GeneDocument {
    public static final Pattern lociRegExp = Pattern.compile("[0-9]+[Xqp][0-9.-]+");
    private static final Logger log = LoggerFactory.getLogger(GeneDocument.class);
    /**
     * Sometimes an obvious plural tag is missed, e.g. for words like "LERKs". This
     * matches this exact pattern: Upper case characters followed by a lower case
     * 's'.
     */
    private final Matcher pluralMatcher = Pattern.compile("[A-Z]+s").matcher("");
    private OffsetMap acronyms;
    private OffsetMap acronymLongforms;
    private OffsetMap chunks;
    private OffsetMap posTags;
    private OffsetMap ontologyClassMentions;
    private String documentText;
    private String documentTitle;
    /**
     * This is the original set of genes that has been set via
     * {@link #setGenes(Stream)}. From this set, a subset is selected and stored in
     * {@link #genes} which is then the set of genes used for all processing and
     * mapping. Non-selected genes are non-existent to processing algorithms, except
     * they explicitly work on the allGenes set.
     */
    private NavigableSet allGenes;
    private OffsetMap> genes;
    /**
     * Used for evaluation and tagger training purposes.
     */
    private OffsetMap> goldGenes = OffsetMap.emptyOffsetMap();
    private Set goldIds = Collections.emptySet();
    private boolean goldHasOffsets;
    private boolean goldOffsetsInferred;
    private GeneSets geneSets;
    private String id;
    private OffsetSet sentences;
    private SpeciesCandidates species;
    private AhoCorasickOptimized geneNameDictionary;
    private TermNormalizer termNormalizer;
    private Collection meshHeadings;
    private Set state;
    private Set chromosomeLocations;
    private Set geneMentionTexts;
    private String abstractText;
    private Range titleOffsets;
    private Range abstractOffsets;
    private Set goldTaxonomyIds = Collections.emptySet();
    private boolean completelyAnnotated;
    private Collection coreferenceSets;
    private OffsetMap coreferenceExpressions;
    private OffsetMap appositions;
    private OffsetSet nonGenePhrases;

    /**
     * Creates an empty document with an empty, insertion-ordered processing state and a fresh
     * {@link TermNormalizer}.
     */
    public GeneDocument() {
        this.state = new LinkedHashSet<>();
        this.termNormalizer = new TermNormalizer();
    }

    /**
     * Copies the template document. This is mostly a shallow copy, except the
     * genes. Those are deeply copied and put into the respective structures (the
     * "genes" and "geneSets" fields). Gold genes, MeSH headings and species candidates are
     * copied as well; the state and ID sets are copied into new sets.
     * <p>
     * If the template has no gene set or no selected genes yet, the corresponding field of the copy
     * stays {@code null} as well.
     *
     * @param template The document to copy.
     */
    public GeneDocument(GeneDocument template) {
        acronyms = template.acronyms;
        acronymLongforms = template.acronymLongforms;
        coreferenceSets = template.coreferenceSets;
        coreferenceExpressions = template.coreferenceExpressions;
        appositions = template.appositions;
        chunks = template.chunks;
        ontologyClassMentions = template.ontologyClassMentions;
        posTags = template.posTags;
        documentText = template.documentText;
        documentTitle = template.documentTitle;
        // The offsets drive getDocumentTitle()/getAbstractText(), so the copy needs them, too.
        titleOffsets = template.titleOffsets;
        abstractOffsets = template.abstractOffsets;
        abstractText = template.abstractText;
        completelyAnnotated = template.completelyAnnotated;
        // Map each original mention to its copy by object identity. An IdentityHashMap is used because
        // identity hash codes are not guaranteed to be unique and must not serve as a sort key.
        Map<GeneMention, GeneMention> orgToNew = new IdentityHashMap<>();
        if (template.allGenes != null) {
            template.allGenes.forEach(g -> {
                GeneMention newGm = new GeneMention(g);
                newGm.setGeneDocument(this);
                orgToNew.put(g, newGm);
            });
            allGenes = template.allGenes.stream()
                    .map(old -> Objects.requireNonNull(orgToNew.get(old)))
                    .collect(Collectors.toCollection(this::createAllGenesSet));
        }
        if (template.genes != null) {
            genes = new OffsetMap<>();
            for (Entry<Range<Integer>, List<GeneMention>> original : template.genes.entrySet()) {
                List<GeneMention> copies = original.getValue().stream()
                        .map(k -> Objects.requireNonNull(orgToNew.get(k), "Selected gene mention is missing from the set of all genes"))
                        .collect(toList());
                genes.put(original.getKey(), copies);
            }
        }
        if (template.goldGenes != null)
            template.goldGenes.values().stream().flatMap(Collection::stream).map(GeneMention::new).forEach(this::putGoldGene);
        if (template.geneSets != null) {
            geneSets = new GeneSets();
            for (GeneSet gs : template.geneSets) {
                GeneSet newSet = new GeneSet();
                newSet.setFeatureVector(gs.getFeatureVector());
                newSet.setInstance(gs.getInstance());
                newSet.setSetId(gs.getSetId());
                newSet.setSpecificType(gs.getSpecificType());
                gs.forEach(g -> newSet.add(orgToNew.get(g)));
                geneSets.add(newSet);
                // Let the copied mentions know the copied set they belong to.
                newSet.forEach(g -> g.addGeneSet(newSet));
            }
        }
        if (template.meshHeadings != null)
            meshHeadings = template.meshHeadings.stream().map(MeshHeading::clone).collect(toSet());
        id = template.id;
        sentences = template.sentences;
        nonGenePhrases = template.nonGenePhrases;
        if (template.species != null)
            species = template.species.clone();
        geneNameDictionary = template.geneNameDictionary;
        termNormalizer = template.termNormalizer;
        // Keep insertion order like the default constructor does.
        state = new LinkedHashSet<>(template.state);
        goldHasOffsets = template.goldHasOffsets;
        goldOffsetsInferred = template.goldOffsetsInferred;
        if (template.goldIds != null)
            goldIds = new HashSet<>(template.goldIds);
        if (template.goldTaxonomyIds != null)
            goldTaxonomyIds = new HashSet<>(template.goldTaxonomyIds);
        if (template.chromosomeLocations != null)
            chromosomeLocations = new HashSet<>(template.chromosomeLocations);
        if (template.geneMentionTexts != null)
            geneMentionTexts = new HashSet<>(template.geneMentionTexts);
    }

    /**
     * Creates an empty document, as the no-argument constructor does, and assigns the given ID.
     *
     * @param id The document ID.
     */
    public GeneDocument(String id) {
        this();
        this.id = id;
    }


    /**
     * Creates the empty sorted set used for {@link #allGenes}. Mentions are ordered by begin offset, then end
     * offset, then tagger; mentions without a tagger sort first instead of causing a
     * {@link NullPointerException} on insertion.
     * <p>
     * Two distinct mentions that agree on all three keys count as the same element. An earlier
     * identity-hash tie-breaker was removed deliberately so that the set can be queried with probe
     * mentions (see {@link #getAllGeneMentions(GeneMention, GeneTagger)}); code relying on the old
     * tie-breaker may behave differently.
     *
     * @return A new, empty set with the gene mention ordering described above.
     */
    private TreeSet createAllGenesSet() {
        return new TreeSet<>(Comparator.comparingInt(GeneMention::getBegin)
                .thenComparingInt(GeneMention::getEnd)
                .thenComparing(GeneMention::getTagger, Comparator.nullsFirst(Comparator.<GeneTagger>naturalOrder())));
    }

    public OffsetMap> getGoldGenes() {
        return goldGenes;
    }

    /**
     * Returns the offsets of the title within the document text.
     *
     * @return The title offsets or the range 0-0 if none have been set.
     */
    public Range getTitleOffsets() {
        if (titleOffsets == null)
            return Range.between(0, 0);
        return titleOffsets;
    }

    /**
     * Sets the offsets of the title within the document text.
     *
     * @param titleOffsets The title offsets.
     */
    public void setTitleOffsets(Range titleOffsets) {
        this.titleOffsets = titleOffsets;
    }

    /**
     * Returns the offsets of the abstract within the document text.
     *
     * @return The abstract offsets or the range 0-0 if none have been set.
     */
    public Range getAbstractOffsets() {
        if (abstractOffsets == null)
            return Range.between(0, 0);
        return abstractOffsets;
    }

    /**
     * Sets the offsets of the abstract within the document text.
     *
     * @param abstractOffsets The abstract offsets.
     */
    public void setAbstractOffsets(Range abstractOffsets) {
        this.abstractOffsets = abstractOffsets;
    }

    /**
     * Returns the gold IDs of this document.
     *
     * @return The stored gold ID set itself, not a copy.
     */
    public Set getGoldIds() {
        return this.goldIds;
    }

    /**
     * Replaces the gold IDs of this document. The given set is stored as is.
     *
     * @param goldIds The new gold IDs.
     */
    public void setGoldIds(Set goldIds) {
        this.goldIds = goldIds;
    }

    /**
     * Records that the given processing state applies to this document.
     *
     * @param state The state to add.
     */
    public void addState(State state) {
        this.state.add(state);
    }

    /**
     * Tells whether the given processing state has been recorded for this document.
     *
     * @param state The state to check; {@code null} always yields {@code false}.
     * @return {@code true} if the state has been added before.
     */
    public boolean hasState(State state) {
        return state != null && this.state.contains(state);
    }

    /**
     * Returns the long form of the given acronym. If the long form has no text yet, the text is taken from
     * the document text at the long form's offsets and stored on the long form.
     *
     * @param acronym The acronym whose long form is requested.
     * @return The long form with its text set.
     */
    public AcronymLongform getAcronymLongformAndOffsets(Acronym acronym) {
        final AcronymLongform fullForm = acronym.getLongform();
        if (fullForm.getText() != null)
            return fullForm;
        final var offsets = fullForm.getOffsets();
        final String coveredText = this.getDocumentText().substring(offsets.getMinimum(), offsets.getMaximum());
        fullForm.setText(coveredText);
        return fullForm;
    }

    public void setNominalPhrasesOfGenesToNames() {
        for (GeneMention gm : getAllGenes()) {
            Optional, String>> chunkNP = getOverlappingChunks(gm.getOffsets(), "ChunkNP").stream().findFirst();
            if (chunkNP.isPresent()) {
                gm.getGeneName().setNominalPhraseContext(getCoveredText(chunkNP.get().getKey()));
            }
        }
    }

    /**
     * Uses acronym resolution to generate gene name variants. Variant generation includes the replacement of acronyms
     * by their long forms and vice versa.
     * <p>
     * Does nothing if no acronym long forms have been set on this document.
     */
    public void setAcronymsAsGeneNameAlternatives() {
        OffsetMap<AcronymLongform> longforms = getAcronymLongforms();
        if (longforms == null)
            return;
        List<String> longformTexts = new ArrayList<>();
        Multimap<String, String> long2short = HashMultimap.create();
        // Iterate over all long forms and set long name variants to all
        // genes overlapping their acronyms.
        // Also, build a dictionary of long forms and a map that connects long forms
        // to their respective short forms. This is used below to set short
        // variants to gene mentions containing the long form of an acronym definition.
        for (AcronymLongform longForm : longforms.values()) {
            String longformText = getCoveredText(longForm);
            longformTexts.add(longformText);
            for (Acronym acronym : longForm.getAcronyms()) {
                String acronymText = getCoveredText(acronym);
                long2short.put(longformText, acronymText);
                // TODO check that not restricting the genes to those with exactly the acronym's offsets
                // doesn't hurt performance
                Iterator<GeneMention> acroIt = getOverlappingGenes(acronym.getOffsets()).iterator();
                while (acroIt.hasNext()) {
                    GeneMention acroGene = acroIt.next();
                    String longText = acroGene.getText().replace(acronymText, longformText);
                    // In case that the acronym definition is also a gene name, the acronym also overlaps with the
                    // long form gene (since the gene mention covers both, if not resolved via the composite
                    // resolution)
                    if (!longText.equals(acroGene.getText())) {
                        GeneName longName = new GeneName(longText, getTermNormalizer());
                        acroGene.getGeneName().addAlternative(longName);
                    }
                }
            }
        }
        // Now add short form variants to all genes containing a long form of an acronym.
        AhoCorasickOptimized longformAc = new AhoCorasickOptimized(longformTexts);
        AhoCorasickLongestMatchCallback callback = new AhoCorasickLongestMatchCallback();
        for (GeneMention gm : getGenesIterable()) {
            callback.clear();
            longformAc.match(gm.getText(), callback);
            Optional<String> match = callback.getLongestMatches().values().stream().findAny();
            if (match.isPresent()) {
                String longform = match.get();
                Collection<String> shortforms = long2short.get(longform);
                // There will most likely be just a single short form.
                for (String shortform : shortforms) {
                    String shortText = gm.getText().replace(longform, shortform);
                    GeneName shortName = new GeneName(shortText, getTermNormalizer());
                    gm.getGeneName().addAlternative(shortName);
                }
            }
        }
    }

    /**
     * Returns the acronyms of this document.
     *
     * @return The internal acronym offset map; may be {@code null} if no acronyms were set.
     */
    public OffsetMap getAcronyms() {
        return this.acronyms;
    }

    /**
     * Stores the given acronym map and marks the document with {@code State.ACRONYMS_SET}. The long form
     * map is not touched by this overload.
     *
     * @param acronyms The acronym offset map to use.
     */
    public void setAcronyms(OffsetMap acronyms) {
        addState(State.ACRONYMS_SET);
        this.acronyms = acronyms;
    }

    /**
     * Rebuilds the acronym map and the long form map from the given acronyms and marks the document with
     * {@code State.ACRONYMS_SET}. Each acronym and its long form are stored under their own offsets.
     *
     * @param acronyms The acronyms of this document.
     */
    public void setAcronyms(Stream acronyms) {
        this.acronyms = new OffsetMap<>();
        this.acronymLongforms = new OffsetMap<>();
        acronyms.forEach(acronym -> {
            this.acronyms.put(acronym.getOffsets(), acronym);
            final var fullForm = acronym.getLongform();
            this.acronymLongforms.put(fullForm.getOffsets(), acronym.getLongform());
        });
        addState(State.ACRONYMS_SET);
    }

    /**
     * Convenience overload that delegates to the stream-based acronym setter.
     *
     * @param acronyms The acronyms of this document.
     */
    public void setAcronyms(Acronym... acronyms) {
        setAcronyms(Arrays.stream(acronyms));
    }

    /**
     * Convenience overload that delegates to the stream-based acronym setter.
     *
     * @param acronyms The acronyms of this document.
     */
    public void setAcronyms(Collection acronyms) {
        setAcronyms(acronyms.stream());
    }

    /**
     * Returns the long forms of the document's acronyms.
     *
     * @return The internal long form offset map; may be {@code null} if it has not been built.
     */
    public OffsetMap getAcronymLongforms() {
        return this.acronymLongforms;
    }

    /**
     * Returns the ontology class mentions of this document.
     *
     * @return The internal offset map; may be {@code null} if no mentions were set.
     */
    public OffsetMap getOntologyClassMentions() {
        return this.ontologyClassMentions;
    }

    /**
     * Stores the given ontology class mentions and marks the document with
     * {@code State.ONTOLOGY_CLASS_MENTONS_SET}.
     *
     * @param ontologyClassMentions The ontology class mentions by offsets.
     */
    public void setOntologyClassMentions(OffsetMap ontologyClassMentions) {
        addState(State.ONTOLOGY_CLASS_MENTONS_SET);
        this.ontologyClassMentions = ontologyClassMentions;
    }

    /**
     * Returns the chunks of this document.
     *
     * @return The internal chunk offset map; may be {@code null} if no chunks were set.
     */
    public OffsetMap getChunks() {
        return this.chunks;
    }

    /**
     * Stores the given chunks and marks the document with {@code State.CHUNKS_SET}.
     *
     * @param chunks The chunks by offsets.
     */
    public void setChunks(OffsetMap chunks) {
        addState(State.CHUNKS_SET);
        this.chunks = chunks;
    }

    /**
     * Returns the complete text of this document.
     *
     * @return The document text; may be {@code null} if none was set.
     */
    public String getDocumentText() {
        return this.documentText;
    }

    /**
     * Sets the complete text of this document.
     *
     * @param documentText The document text.
     */
    public void setDocumentText(String documentText) {
        this.documentText = documentText;
    }

    /**
     * Returns the title as the part of the document text covered by the title offsets. The value passed to
     * the deprecated title setter is not consulted here.
     *
     * @return The title text; the empty string if no title offsets have been set.
     */
    public String getDocumentTitle() {
        final int begin = getTitleOffsets().getMinimum();
        final int end = getTitleOffsets().getMaximum();
        return documentText.substring(begin, end);
    }

    /**
     * Stores a separate title string.
     *
     * @param documentTitle The title text.
     * @deprecated use offsets on the complete text
     */
    @Deprecated
    public void setDocumentTitle(String documentTitle) {
        this.documentTitle = documentTitle;
    }

    /**
     * Returns the abstract as the part of the document text covered by the abstract offsets.
     *
     * @return The abstract text; the empty string if no abstract offsets have been set.
     */
    public String getAbstractText() {
        // Use the null-safe getter, as getDocumentTitle() does, so that missing offsets yield an empty
        // string instead of a NullPointerException.
        return documentText.substring(getAbstractOffsets().getMinimum(), getAbstractOffsets().getMaximum());
    }

    public OffsetMap> getGeneMap() {
        if (genes == null)
            throw new IllegalStateException(
                    "The internal genes map has to be built first by calling an appropriate method after setting the original set of genes.");
        return genes;
    }

    /**
     * Returns the selected gene mentions whose offsets are overlapped by the given range.
     *
     * @param offsets An offset range.
     * @return A stream of the matching selected gene mentions.
     */
    public Stream getGeneMentionsAtOffsets(final Range offsets) {
        final Predicate<GeneMention> overlapsRange = g -> g.getOffsets().isOverlappedBy(offsets);
        return getGenes().filter(overlapsRange);
    }

    /**
     * Looks up, in the set of all genes, the mentions that compare equal to a probe mention built from the
     * offsets of {@code gm} and the given tagger.
     *
     * @param gm     The mention whose offsets are used for the lookup.
     * @param tagger The tagger the returned mentions must have.
     * @return A view on the set of all genes restricted to the probe key.
     */
    public NavigableSet getAllGeneMentions(final GeneMention gm, final GeneTagger tagger) {
        final GeneMention probe = new GeneMention();
        probe.setOffsets(gm.getOffsets());
        probe.setTagger(tagger);
        // Lower and upper bound are the same key, both inclusive.
        return getAllGenes().subSet(probe, true, probe, true);
    }

    /**
     * Streams the gene mentions that were selected from the original set of all genes. A selection method
     * must have been called before this method can be used.
     *
     * @return The currently selected genes.
     * @throws IllegalStateException If no gene selection has been built yet.
     * @see #selectGeneMentionsByTagger(GeneTagger...)
     * @see #unifyGeneMentionsAtEqualOffsets(GeneTagger...)
     */
    public Stream getGenes() {
        if (null == genes) {
            final String message = "The internal genes map has to be built first by calling an appropriate method after setting the original set of genes.";
            throw new IllegalStateException(message);
        }
        final var mentionLists = genes.values();
        return mentionLists.stream().flatMap(Collection::stream);
    }

    /**
     * Replaces the set of all genes with the given mentions. A fresh backing set is created before the
     * call is delegated to the stream-based overload.
     *
     * @param genes The gene mentions of this document.
     */
    public void setGenes(GeneMention... genes) {
        this.allGenes = createAllGenesSet();
        setGenes(Arrays.stream(genes));
    }

    /**
     * Replaces the contents of the set of all genes with the given mentions. Every mention that ends up in
     * the set is linked to this document and, if a term normalizer is present, receives that normalizer.
     * The selected genes map is not changed by this method.
     *
     * @param genes The gene mentions of this document.
     */
    public void setGenes(Stream genes) {
        if (this.allGenes == null)
            this.allGenes = createAllGenesSet();
        else
            this.allGenes.clear();
        genes.forEach(this.allGenes::add);
        this.allGenes.forEach(g -> g.setGeneDocument(this));
        if (termNormalizer == null)
            return;
        this.allGenes.forEach(g -> g.setNormalizer(termNormalizer));
    }

    /**
     * Replaces the contents of the set of all genes with the given mentions by delegating to the
     * stream-based overload.
     *
     * @param genes The gene mentions of this document. May be the document's own set of all genes.
     */
    public void setGenes(Collection genes) {
        // Snapshot first: the stream-based overload clears the internal set before consuming the stream,
        // which would silently drop every mention if the caller passed that very set.
        List<GeneMention> snapshot = new ArrayList<>(genes);
        setGenes(snapshot.stream());
    }

    /**
     * Adds a single mention to the set of all genes, links it to this document and, if a term normalizer is
     * present, hands that normalizer to the mention. Mentions tagged as {@code GOLD} are additionally put
     * into the selected genes via {@code putGene}.
     *
     * @param gene The gene mention to add.
     */
    public void addGene(GeneMention gene) {
        if (null == this.allGenes)
            this.allGenes = createAllGenesSet();
        this.allGenes.add(gene);
        gene.setGeneDocument(this);
        if (null != termNormalizer)
            gene.setNormalizer(termNormalizer);
        // NOTE(review): gold mentions go into the selected genes here, not into the gold genes map - confirm.
        final boolean isGold = gene.getTagger() == GOLD;
        if (isGold)
            putGene(gene);
    }

    /**
     * Returns the selected genes in a form usable in for-each loops. Each iteration obtains a fresh
     * stream from {@link #getGenes()}.
     *
     * @return An iterable over the currently selected genes.
     */
    public Iterable getGenesIterable() {
        return () -> this.getGenes().iterator();
    }

    /**
     * Returns the selected, non-rejected genes in a form usable in for-each loops. Each iteration obtains
     * a fresh stream from {@link #getNonRejectedGenes()}.
     *
     * @return An iterable over the selected genes that are not rejected.
     */
    public Iterable getNonRejectedGenesIterable() {
        return () -> this.getNonRejectedGenes().iterator();
    }

    /**
     * Streams the selected genes that are not marked as rejected.
     *
     * @return The selected genes for which {@code isRejected()} is {@code false}.
     */
    public Stream getNonRejectedGenes() {
        final Predicate<GeneMention> rejected = GeneMention::isRejected;
        return getGenes().filter(rejected.negate());
    }

    /**
     * On first call, creates a trivial GeneSets object where each gene is in its
     * own set. From here, one can begin to agglomerate sets e.g. due to the same
     * name, an acronym connection or other measures. Subsequent calls will return
     * the same set instance.
     * <p>
     * A gene with several taxonomy IDs gets one set per taxonomy ID; a gene without taxonomy IDs gets a
     * single set with {@code GeneMention.NOID} as taxonomy ID.
     *
     * @return A GeneSets object where each gene has its own set.
     */
    public GeneSets getGeneSets() {
        if (this.geneSets != null) {
            return this.geneSets;
        }
        GeneSets geneSets = new GeneSets();
        getGenes().forEach(gm -> {
            List<String> taxonomyIds = gm.getTaxonomyIds();
            if (taxonomyIds == null || taxonomyIds.isEmpty())
                taxonomyIds = List.of(GeneMention.NOID);
            for (String taxId : taxonomyIds) {
                GeneSet geneSet = new GeneSet();
                geneSet.setTaxId(taxId);
                geneSet.add(gm);
                gm.addGeneSet(geneSet);
                geneSet.setDocId(this.id);
                geneSet.setSpecificType(gm.getSpecificType());
                // The plural flag is only derived from the POS tags for mentions without a composite resolver.
                if (gm.getCompositeResolver() == null)
                    getLastPosTag(gm.getOffsets(), PosTag.stopTags)
                            .ifPresent(tag -> geneSet.setPlural(tag.getTag().equals("NNS")));
                geneSets.add(geneSet);
            }
        });
        this.geneSets = geneSets;
        return geneSets;
    }

    /**
     * Discards the current gene sets: clears the gene set collection of every mention contained in a set,
     * drops the GeneSets object and removes the two agglomeration states from the document state.
     */
    public void resetGeneSets() {
        if (geneSets != null) {
            for (GeneSet gs : geneSets) {
                for (GeneMention gm : gs)
                    gm.getGeneSets().clear();
            }
        }
        this.geneSets = null;
        if (state == null)
            return;
        state.remove(State.AGGLOMERATION_BY_NAME);
        state.remove(State.AGGLOMERATION_BY_ACRONYMS);
    }

    /**
     * Creates a new gene set from the given mentions, registers it with each of the mentions and adds it to
     * the gene sets of this document. The specific type and the plural flag of the new set are derived from
     * an arbitrary one of the given mentions.
     *
     * @param newGs The gene mentions that should form a new set.
     * @return The newly created gene set.
     */
    public GeneSet addGeneSet(Collection newGs) {
        // Obtain (and, if necessary, lazily initialize) the gene sets before any mention is modified so that
        // an uninitialized document cannot fail halfway through with a NullPointerException.
        GeneSets documentSets = getGeneSets();
        GeneSet geneSet = new GeneSet();
        geneSet.addAll(newGs);
        geneSet.setDocId(this.id);
        newGs.forEach(gm -> gm.addGeneSet(geneSet));
        newGs.stream().findAny().ifPresent(gm -> {
                    geneSet.setSpecificType(gm.getSpecificType());
                    getLastPosTag(gm.getOffsets(), PosTag.stopTags)
                            .ifPresent(tag -> geneSet.setPlural(tag.getTag().equals("NNS")));
                }
        );
        documentSets.add(geneSet);
        return geneSet;
    }

    /**
     * Returns the ID of this document.
     *
     * @return The document ID.
     */
    public String getId() {
        return this.id;
    }

    /**
     * Sets the ID of this document.
     *
     * @param id The document ID.
     */
    public void setId(String id) {
        this.id = id;
    }

    /**
     * Looks up the acronyms - the short forms, not their full forms - that overlap the given range.
     *
     * @param range An offset range.
     * @return Acronyms overlapping the given range.
     */
    public Collection getOverlappingAcronyms(Range range) {
        final var overlapping = acronyms.getOverlapping(range);
        return overlapping.values();
    }

    /**
     * Looks up the acronym long forms that overlap the given range.
     *
     * @param range An offset range.
     * @return Acronym long forms overlapping the given range.
     */
    public Collection getOverlappingAcronymLongforms(Range range) {
        final var overlapping = acronymLongforms.getOverlapping(range);
        return overlapping.values();
    }

    /**
     * Locates the sentence for the offsets of the given span.
     *
     * @param span A span within the document.
     * @return The located sentence range or the range 0-0 if none is found.
     */
    public Range getOverlappingSentence(Span span) {
        final var spanOffsets = span.getOffsets();
        return getOverlappingSentence(spanOffsets);
    }

    /**
     * Locates the sentence for the given offset range.
     *
     * @param range An offset range.
     * @return The located sentence range or the range 0-0 if none is found.
     */
    public Range getOverlappingSentence(Range range) {
        final Range located = sentences.locate(range);
        if (located == null)
            return Range.between(0, 0);
        return located;
    }

    /**
     * Returns ontology class mentions overlapping with the given range.
     *
     * @param range An offset range.
     * @return Ontology class mentions overlapping the given range.
     */
    public Set, String>> getOverlappingOntologyClassMentions(Range range) {
        return ontologyClassMentions != null ? ontologyClassMentions.getOverlapping(range).entrySet() : Collections.emptySet();
    }

    /**
     * Returns ontology class mentions of the given type overlapping with the given range.
     *
     * @param range        An offset range.
     * @param specificType The ontology class type - e.g. GeneOrGeneProduct - to return.
     * @return Ontology class mentions with the given type overlapping the given range.
     */
    public Set, String>> getOverlappingOntologyClassMentions(Range range, final String specificType) {
        return getOverlappingOntologyClassMentions(range).stream().filter(e -> e.getValue().equals(specificType))
                .collect(toSet());
    }

    /**
     * Returns chunks overlapping with the given range.
     *
     * @param range An offset range.
     * @return Chunks overlapping the given range.
     */
    public Set, String>> getOverlappingChunks(Range range) {
        return chunks.getOverlapping(range).entrySet();
    }

    /**
     * Looks up the appositions that overlap the given range.
     *
     * @param range An offset range.
     * @return Appositions overlapping the given range.
     */
    public Collection getOverlappingAppositions(Range range) {
        final var overlapping = appositions.getOverlapping(range);
        return overlapping.values();
    }

    /**
     * Returns chunks of the given type overlapping with the given range.
     *
     * @param range     An offset range.
     * @param chunkType The chunk type - e.g. ChunkNP - to return.
     * @return Chunks with the given type overlapping the given range.
     */
    public Set, String>> getOverlappingChunks(Range range, final String chunkType) {
        return getOverlappingChunks(range).stream().filter(e -> e.getValue().equals(chunkType))
                .collect(toSet());
    }

    /**
     * Looks up the part-of-speech tags that overlap the given range.
     *
     * @param range An offset range.
     * @return POS tags overlapping the given range; an empty list if no POS tags have been set.
     */
    public Collection getOverlappingPosTags(Range range) {
        return posTags != null ? posTags.getOverlapping(range).values() : Collections.emptyList();
    }

    /**
     * Returns the last part-of-speech tag overlapping the given range whose tag is not excluded.
     *
     * @param range        An offset range.
     * @param excludedTags Tag names to skip; may be {@code null} or empty to accept every tag.
     * @return The last accepted POS tag, or an empty Optional if there is none.
     */
    public Optional getLastPosTag(Range range, Set excludedTags) {
        List<PosTag> posList = new ArrayList<>(getOverlappingPosTags(range));
        boolean acceptAll = excludedTags == null || excludedTags.isEmpty();
        // Walk backwards so that the first accepted tag is the last one in the range.
        for (int i = posList.size() - 1; i >= 0; i--) {
            PosTag posTag = posList.get(i);
            if (acceptAll || !excludedTags.contains(posTag.getTag()))
                return Optional.of(posTag);
        }
        return Optional.empty();
    }

    /**
     * Returns the part-of-speech tags of this document.
     *
     * @return The internal POS tag offset map; may be {@code null} if no POS tags were set.
     */
    public OffsetMap getPosTags() {
        return this.posTags;
    }

    /**
     * Rebuilds the POS tag map from the given tags and marks the document with {@code State.POS_SET}.
     * <p>
     * Tags of type {@code NN} whose covered text consists of upper case characters followed by a lower case
     * 's' (e.g. "LERKs") are changed to {@code NNS}. This correction needs the document text and is skipped
     * for tags that reach beyond its end.
     *
     * @param posTags The part-of-speech tags of this document.
     */
    public void setPosTags(Stream posTags) {
        this.posTags = new OffsetMap<>();
        posTags.forEach(pos -> {
            // The end offset is exclusive, so a tag may end exactly at the end of the text.
            if (pos.getTag().equals("NN") && documentText != null && pos.getEnd() <= documentText.length()) {
                synchronized (pluralMatcher) {
                    pluralMatcher.reset(getCoveredText(pos));

                    if (pluralMatcher.matches())
                        pos.setTag("NNS");
                }
            }
            this.posTags.put(pos);
        });
        addState(State.POS_SET);
    }

    /**
     * Convenience overload that delegates to the stream-based POS tag setter.
     *
     * @param posTags The part-of-speech tags of this document.
     */
    public void setPosTags(Collection posTags) {
        setPosTags(posTags.stream());
    }

    /**
     * Returns the selected genes overlapping with the given range.
     *
     * @param range An offset range.
     * @return Genes overlapping the given range.
     * @throws IllegalStateException If no gene selection has been built yet.
     */
    public Stream getOverlappingGenes(Range range) {
        // Go through getGeneMap() so that a missing selection is reported with the same descriptive
        // exception as in getGenes() instead of a NullPointerException.
        return getGeneMap().getOverlapping(range).values().stream().flatMap(Collection::stream);
    }

    /**
     * Returns the gold genes overlapping with the given range.
     *
     * @param range An offset range.
     * @return Gold genes overlapping the given range; an empty stream if there is no gold gene map.
     */
    public Stream getOverlappingGoldGenes(Range range) {
        if (goldGenes != null) {
            final var overlapping = goldGenes.getOverlapping(range);
            return overlapping.values().stream().flatMap(mentions -> mentions.stream());
        }
        return Stream.empty();
    }

    public NavigableSet> getSentences() {
        return sentences;
    }

    /**
     * Stores the given sentence offsets and marks the document with {@code State.SENTENCES_SET}.
     *
     * @param sentences The sentence offsets.
     */
    public void setSentences(OffsetSet sentences) {
        addState(State.SENTENCES_SET);
        this.sentences = sentences;
    }

    /**
     * Returns the species candidates of this document.
     *
     * @return The species candidates; may be {@code null} if none were set.
     */
    public SpeciesCandidates getSpecies() {
        return this.species;
    }

    /**
     * Stores the given species candidates and marks the document with {@code State.SPECIES_MENTIONS_SET}.
     *
     * @param species The species candidates.
     */
    public void setSpecies(SpeciesCandidates species) {
        addState(State.SPECIES_MENTIONS_SET);
        this.species = species;
    }

    /**
     * Rebuilds the internal gene offset map from all available genes, overlapping or not, and marks the
     * document with {@code State.GENES_SELECTED}. According to the original note, a mention added at offsets
     * that are already present overrides the item stored there before.
     */
    public void selectAllGenes() {
        if (allGenes == null)
            allGenes = createAllGenesSet();
        this.genes = new OffsetMap<>();
        this.allGenes.forEach(mention -> putGene(mention));
        addState(State.GENES_SELECTED);
    }

    /**
     * Builds the internal gene offset map and only keeps gene mentions found by the
     * given taggers. An existing gene map is extended, not reset: a mention is only added if nothing in the
     * map overlaps its offsets. Mentions without a tagger are reported in the log and skipped. Finally, the
     * document is marked with {@code State.GENES_SELECTED}.
     *
     * @param tagger The taggers for which gene mentions should be kept.
     */
    public void selectGeneMentionsByTagger(final GeneTagger... tagger) {
        if (genes == null)
            genes = new OffsetMap<>();
        // Like selectAllGenes(), tolerate a document for which no genes have been set at all.
        if (allGenes != null) {
            Set<GeneTagger> includedTaggers = new HashSet<>(Arrays.asList(tagger));
            for (GeneMention g : allGenes) {
                if (g.getTagger() == null) {
                    log.error("Gene {} in document {} does not have a tagger set", g.getText(), g.getDocId());
                    continue;
                }
                // Only add genes where there is not already one
                if (includedTaggers.contains(g.getTagger()) && genes.getOverlapping(g.getOffsets()).isEmpty()) {
                    putGene(g, false);
                }
            }
        }
        addState(State.GENES_SELECTED);
    }

    /**
     * Verifies that this document has reached all of the given processing states.
     *
     * @param expectedStates The states that must have been set on this document.
     * @throws IllegalStateException If at least one of the expected states is not set. The message names the
     *                               first missing state and lists the states that are currently set.
     */
    public void expectState(EnumSet<State> expectedStates) {
        for (State expected : expectedStates) {
            if (state.contains(expected))
                continue;
            throw new IllegalStateException("Expected state " + expected + " which is not set to this document. The current document processing state is " + state);
        }
    }

    /**
     * Adds gene mentions to the selected set of gene mentions based on a tagger
     * (optional) and regular expressions that must match the complete mention string.
     *
     * @param tagger  Optional, may be null
     * @param regExes A list of regular expressions. Each gene mention matching one of
     *                the expressions (and, if given, the tagger) will be added to the
     *                selected list of genes.
     */
    public void allowGeneMentionsByRegularExpression(final GeneTagger tagger, final Pattern... regExes) {
        // One reusable matcher per expression so that no matcher is allocated per mention.
        final Matcher[] matchers = new Matcher[regExes.length];
        for (int i = 0; i < regExes.length; ++i)
            matchers[i] = regExes[i].matcher("");
        // getAllGenes() yields an empty set when no gene has been added yet.
        for (GeneMention gm : getAllGenes()) {
            // check the tagger
            if (tagger != null && gm.getTagger() != tagger)
                continue;
            // the tagger was correct (or not given): select the mention as soon as one
            // expression matches its text
            final String text = gm.getText();
            for (Matcher matcher : matchers) {
                if (matcher.reset(text).matches()) {
                    // The mention stems from allGenes, so it only needs to go into the
                    // selection map; this avoids adding to allGenes while iterating over it.
                    putGene(gm, false);
                    break;
                }
            }
        }
    }

    /**
     * Creates the internal gene map without allowing exact duplicate ranges where
     * begin and end are equal but still allows overlapping.
     *
     * @param taggerPriorities The order in which should be decided which gene mention to keep at
     *                         a given position with multiple candidates at the exact same
     *                         location. A lower position means higher priority. Non-mentioned
     *                         taggers have minimum priority, e.g. are most easily discarded.
     */
    public void unifyGeneMentionsAtEqualOffsets(final GeneTagger... taggerPriorities) {
        genes = new OffsetMap<>();
        // tagger -> position in the priority list; a smaller number is a higher priority
        Map<GeneTagger, Integer> priorities = new HashMap<>();
        for (int i = 0; i < taggerPriorities.length; i++)
            priorities.put(taggerPriorities[i], i);
        // getAllGenes() yields an empty set when no gene has been added yet.
        for (GeneMention gm : getAllGenes()) {
            List<GeneMention> genesAtOffset = genes.get(gm.getOffsets());
            if (genesAtOffset == null) {
                // first mention at these offsets; it already is a member of allGenes
                putGene(gm, false);
                continue;
            }
            int gmPriority = priorities.getOrDefault(gm.getTagger(), Integer.MAX_VALUE);
            for (GeneMention gmInMap : genesAtOffset) {
                int priorityInMap = priorities.getOrDefault(gmInMap.getTagger(), Integer.MAX_VALUE);
                // The candidate only wins if its tagger comes earlier in the priority list.
                // Taggers that are not listed have Integer.MAX_VALUE and thus never win
                // against a listed tagger.
                if (gmPriority < priorityInMap)
                    replaceGene(gmInMap, gm);
            }
        }
    }

    /**
     * Unifies the acronyms of this document with the longer-span-first strategy and replaces
     * the acronym map by one that contains the remaining acronyms only.
     */
    public void unifyAcronymsLongerFirst() {
        TreeSet<? extends Span> remaining = unifySpanLongerFirst(acronyms.values());
        acronyms = new OffsetMap<>();
        for (Span span : remaining)
            acronyms.put(span.getOffsets(), (Acronym) span);
    }

    /**
     * Replaces the current gene selection by the result of applying the
     * longer-span-first strategy to all gene mentions of this document.
     */
    public void unifyAllGenesLongerFirst() {
        TreeSet<? extends Span> remaining = unifySpanLongerFirst(allGenes);
        genes = new OffsetMap<>();
        for (Span span : remaining)
            putGene((GeneMention) span);
    }

    /**
     * Selects the gene mentions of the given taggers via
     * {@link #selectGeneMentionsByTagger(GeneTagger...)} and then replaces the selection by
     * the result of applying the longer-span-first strategy to it.
     *
     * @param taggers The taggers whose gene mentions should be considered.
     */
    public void unifyAllGenesLongerFirst(GeneTagger... taggers) {
        selectGeneMentionsByTagger(taggers);
        // Flatten the per-offset lists of the current selection into one list.
        List<GeneMention> selected = new ArrayList<>();
        for (List<GeneMention> mentionsAtOffset : genes.values())
            selected.addAll(mentionsAtOffset);
        TreeSet<? extends Span> remaining = unifySpanLongerFirst(selected);
        genes = new OffsetMap<>();
        for (Span span : remaining)
            putGene((GeneMention) span);
    }

    /**
     * Collects the given spans into a set ordered by {@link OffsetSpanComparator}. A span that
     * overlaps its neighbor in the set only replaces that neighbor if it is strictly longer;
     * otherwise the span is dropped. Spans for which the set already contains an equal element
     * are skipped.
     *
     * @param spans The spans to unify; may be <code>null</code>, which is treated as empty.
     * @return The sorted set of the spans that were kept.
     */
    private TreeSet<? extends Span> unifySpanLongerFirst(Collection<? extends Span> spans) {
        TreeSet<Span> sortedGenes = new TreeSet<>(new OffsetSpanComparator());
        if (spans == null)
            return sortedGenes;
        for (Span gm : spans) {
            if (sortedGenes.contains(gm))
                continue;
            // Compare against the closest element before the span or, only if there is none,
            // against the closest element after it.
            // NOTE(review): when a preceding element exists but does not overlap, the following
            // element is never checked, so the result may still contain overlapping spans - confirm
            // whether that is intended.
            Span neighbor = sortedGenes.floor(gm);
            if (neighbor == null)
                neighbor = sortedGenes.ceiling(gm);
            if (neighbor == null || !neighbor.getOffsets().isOverlappedBy(gm.getOffsets())) {
                sortedGenes.add(gm);
                continue;
            }
            int gmLength = gm.getOffsets().getMaximum() - gm.getOffsets().getMinimum();
            int otherLength = neighbor.getOffsets().getMaximum() - neighbor.getOffsets().getMinimum();
            // the longer span wins; on equal length the element already in the set is kept
            if (gmLength > otherLength && sortedGenes.remove(neighbor))
                sortedGenes.add(gm);
        }
        return sortedGenes;
    }

    /**
     * Merges all gene mentions of this document into the given sorted set and rebuilds the
     * gene selection from the result. A mention that collides with an element of the set
     * (an equal element, or an overlapping neighbor) only replaces that element if it was
     * produced by the given tagger; otherwise the element in the set is kept. Mentions without
     * a collision are added. The passed set is modified.
     *
     * @param sortedGenes The set to merge the gene mentions into.
     * @param tagger      The tagger whose mentions win collisions.
     */
    public void unifyGenesPrioritizeTagger(NavigableSet<GeneMention> sortedGenes, GeneTagger tagger) {
        // getAllGenes() yields an empty set when no gene has been added yet.
        for (GeneMention gm : getAllGenes()) {
            boolean fromPreferredTagger = gm.getTagger() == tagger;
            if (sortedGenes.contains(gm)) {
                // Membership follows the ordering of the set, so the element found here is not
                // necessarily the very same object; swap it for the preferred tagger's mention.
                if (fromPreferredTagger && sortedGenes.remove(gm))
                    sortedGenes.add(gm);
                continue;
            }
            // Compare against the closest element before the mention or, only if there is
            // none, against the closest element after it.
            GeneMention neighbor = sortedGenes.floor(gm);
            if (neighbor == null)
                neighbor = sortedGenes.ceiling(gm);
            if (neighbor == null || !neighbor.getOffsets().isOverlappedBy(gm.getOffsets())) {
                sortedGenes.add(gm);
            } else if (fromPreferredTagger && sortedGenes.remove(neighbor)) {
                sortedGenes.add(gm);
            }
        }
        genes = new OffsetMap<>();
        for (GeneMention g : sortedGenes)
            putGene(g);
    }

    /**
     * Returns the raw gene mentions in this document, without any filtering,
     * unification, aggregation or whatsoever and possibly from multiple taggers.
     *
     * @return All gene mentions in this document. If no gene has been added yet, an immutable
     * empty set is returned.
     */
    public NavigableSet<GeneMention> getAllGenes() {
        if (allGenes == null)
            return Collections.emptyNavigableSet();
        return allGenes;
    }

    /**
     * Adds the given gene mention into the {@link #genes} map by its offset and also registers
     * it in the set of all genes. This action resets the gene sets of this document.
     *
     * @param gm The gene mention to add.
     */
    private void putGene(GeneMention gm) {
        final boolean addToAllGenes = true;
        putGene(gm, addToAllGenes);
    }

    /**
     * Adds the given gene mention into the {@link #genes} map by its offset. This action resets the gene sets of this document.
     *
     * @param gm            The gene mention to add.
     * @param addToAllGenes Whether the mention should also be added to the set of all genes.
     * @throws IllegalArgumentException If the gene mention has no offsets.
     */
    private void putGene(GeneMention gm, boolean addToAllGenes) {
        if (gm.getOffsets() == null)
            throw new IllegalArgumentException("The passed gene mention does not specify text offsets: " + gm);
        if (genes == null)
            genes = new OffsetMap<>();
        putGene(gm, genes);
        if (!addToAllGenes)
            return;
        if (allGenes == null)
            allGenes = createAllGenesSet();
        try {
            allGenes.add(gm);
        } catch (RuntimeException e) {
            // Report which mention could not be added through the class logger, then let the caller see the failure.
            log.error("Could not add gene mention {} with tagger {} to the set of all genes", gm, gm.getTagger(), e);
            throw e;
        }
    }

    /**
     * Adds the given gene mention to the gold gene map by its offsets. The map is created on
     * first use.
     *
     * @param gm The gold gene mention to add.
     * @throws IllegalArgumentException If the gene mention has no offsets.
     */
    public void putGoldGene(GeneMention gm) {
        if (gm.getOffsets() == null)
            throw new IllegalArgumentException("The passed gene mention does not specify text offsets: " + gm);
        // The map may not exist yet (getOverlappingGoldGenes() checks for null as well), so it
        // must be tested for null, not for emptiness.
        if (goldGenes == null)
            goldGenes = new OffsetMap<>();
        putGene(gm, goldGenes);
    }

    private void putGene(GeneMention gm, OffsetMap> geneMap) {
        assert geneMap != null;
        if (gm.getOffsets() == null)
            throw new IllegalArgumentException("The passed gene mention does not specify text offsets: " + gm);
        List gmList = geneMap.get(gm.getOffsets());
        if (gmList == null) {
            gmList = new ArrayList<>();
            geneMap.put(gm.getOffsets(), gmList);
        }
        if (!gmList.contains(gm))
            gmList.add(gm);
        gm.setGeneDocument(this);
        resetGeneSets();
    }

    /**
     * Replaces a selected gene mention by another one, keeping its position in the list stored
     * under the offsets of the replaced mention.
     *
     * @param gene        The currently selected mention; it must be present in the gene map.
     * @param replacement The mention to put in its place.
     */
    private void replaceGene(GeneMention gene, GeneMention replacement) {
        List<GeneMention> mentionsAtOffset = genes.get(gene.getOffsets());
        mentionsAtOffset.set(mentionsAtOffset.indexOf(gene), replacement);
    }

    /**
     * @param span A span of this document.
     * @return The part of the document text covered by the offsets of the span.
     */
    public String getCoveredText(Span span) {
        final var offsets = span.getOffsets();
        return getCoveredText(offsets);
    }

    /**
     * @param range An offset range; its minimum is used as begin and its maximum as end offset.
     * @return The part of the document text between the minimum and the maximum of the range.
     */
    public String getCoveredText(Range<Integer> range) {
        return getCoveredText(range.getMinimum(), range.getMaximum());
    }

    /**
     * @param begin The begin offset, inclusive.
     * @param end   The end offset, exclusive.
     * @return The substring of the document text between the two offsets.
     */
    public String getCoveredText(int begin, int end) {
        return this.documentText.substring(begin, end);
    }

    /**
     * Adds the given GeneMention to the set of currently selected genes. The mention is
     * also added to the set of all genes of this document, and the gene sets are reset.
     *
     * @param gm The gene mention to add.
     */
    public void selectGene(GeneMention gm) {
        putGene(gm, true);
    }

    /**
     * @return The term normalizer currently used by this document.
     */
    public TermNormalizer getTermNormalizer() {
        return this.termNormalizer;
    }

    /**
     * @param termNormalizer The term normalizer this document should use from now on.
     */
    public void setTermNormalizer(TermNormalizer termNormalizer) {
        this.termNormalizer = termNormalizer;
    }


    /**
     * Removes the given gene from this GeneDocument, i.e. from the selected genes and from the
     * set of all genes. If the gene was removed from the selected genes, the gene sets are reset.
     *
     * @param gm The gene mention to remove.
     * @return <code>true</code> if the selected genes changed because of this call.
     */
    public boolean removeGene(GeneMention gm) {
        boolean removed = false;
        List<GeneMention> genesAtOffset = getGeneMap().get(gm.getOffsets());
        if (genesAtOffset != null) {
            removed = genesAtOffset.remove(gm);
            // do not keep empty lists in the map
            if (genesAtOffset.isEmpty())
                removed |= getGeneMap().remove(gm.getOffsets()) != null;
            // The gene sets may still refer to the removed mention, also when other mentions
            // remain at the same offsets.
            if (removed)
                resetGeneSets();
        }
        if (allGenes != null)
            allGenes.remove(gm);
        return removed;
    }

    /**
     * Returns the {@link AhoCorasickOptimized} instance built from the lower-cased texts of
     * the currently selected genes. It is built on the first call and stored internally;
     * later calls return the stored instance.
     *
     * @return A trie dictionary compiled from the names (text occurrence) of all
     * selected genes.
     */
    public AhoCorasickOptimized getGeneNameDictionary() {
        if (geneNameDictionary != null)
            return geneNameDictionary;
        List<String> lowerCasedNames = getGenes().map(gm -> gm.getText().toLowerCase()).collect(toList());
        geneNameDictionary = new AhoCorasickOptimized(lowerCasedNames);
        return geneNameDictionary;
    }

    /**
     * Merges those gene sets that are connected via acronym resolution: for each acronym, the
     * gene sets of the first gene overlapping the acronym and of the first gene overlapping its
     * long form are merged if they agree in taxonomy ID and plural status. Does nothing if
     * this step has already been performed or if the document has no acronyms.
     */
    public void agglomerateByAcronyms() {
        if (hasState(State.AGGLOMERATION_BY_ACRONYMS))
            return;
        Collection<Acronym> docAcronyms = getAcronyms().values();
        if (docAcronyms.isEmpty()) {
            return;
        }

        // make sure the gene sets exist before they are merged
        if (geneSets == null)
            getGeneSets();

        for (Acronym acronym : docAcronyms) {
            List<GeneMention> gms = getOverlappingGenes(acronym.getOffsets()).collect(toList());
            if (gms.isEmpty())
                continue;

            String acronymText = getCoveredText(acronym);

            GeneMention gm = gms.get(0);
            AcronymLongform longform = acronym.getLongform();

            List<GeneMention> longGms = getOverlappingGenes(longform.getOffsets()).collect(toList());
            if (longGms.isEmpty())
                continue;

            GeneMention longGm = longGms.get(0);

            if (gm.equals(longGm))
                continue;

            // This should avoid a too loose matching between genes and acronyms. For
            // example, the acronym HLH should not be taken to be the same as HLH462. But we
            // allow minor discrepancies for species prefixes.
            if (gm.getText().length() > acronymText.length() + 2 || !gm.getText().endsWith(acronymText)
                    || (gm.getText().length() != acronymText.length()
                    && !Character.isLowerCase(gm.getText().charAt(0))))
                continue;

            // Note: it happens that an abbreviation's long form overlaps a gene but only in a
            // rather small part (e.g. TNF vs. type-1 tumor-necrosis-factor
            // (TNF)-receptor-associated protein (TRAP)-2). A check that the long form
            // actually is the gene is currently not performed.
            for (GeneSet gmSet : gm.getGeneSets()) {
                for (GeneSet longGmSet : longGm.getGeneSets()) {

                    // Nothing to merge if both mentions already share this set. This may
                    // happen if we have overlapping / embedded acronyms (e.g. human
                    // follicle stimulating hormone receptor (hFSH-R) has the acronyms
                    // hFSH-R and FSH-R)
                    if (longGmSet == gmSet)
                        continue;

                    // Don't merge different taxonomy IDs
                    if (!gmSet.getTaxId().equals(longGmSet.getTaxId()))
                        continue;

                    // We don't want to merge plural and non-plural sets since this is an important
                    // part of family recognition
                    if (gmSet.isPlural() ^ longGmSet.isPlural())
                        continue;

                    // now merge the smaller set into the larger one
                    GeneSet from;
                    GeneSet to;
                    if (gmSet.size() > longGmSet.size()) {
                        from = longGmSet;
                        to = gmSet;
                    } else {
                        from = gmSet;
                        to = longGmSet;
                    }

                    to.addAll(from, false);
                    from.clear();
                }
            }
        }
        // drop the sets emptied by merging and let the mentions point to their new sets
        cleanAndEnumerateGeneSets();
        getGenes().forEach(GeneMention::clearGeneSets);
        geneSets.forEach(gs -> gs.forEach(gm -> gm.addGeneSet(gs)));
        addState(State.AGGLOMERATION_BY_ACRONYMS);
    }

    /**
     * Merges those gene sets that are connected via coreference resolution. Within one
     * coreference set, the first non-empty gene set met for a taxonomy ID collects the members
     * of all other gene sets met for the same taxonomy ID. Does nothing if this step has
     * already been performed or if there are no coreference sets.
     */
    public void agglomerateByCoreference() {
        if (hasState(State.AGGLOMERATION_BY_COREFERENCES))
            return;
        if (coreferenceSets == null || coreferenceSets.isEmpty())
            return;

        if (geneSets == null)
            getGeneSets();

        for (CoreferenceSet corefSet : coreferenceSets) {
            // taxonomy ID -> the gene set that collects the genes of this coreference set
            Map<String, GeneSet> tax2geneset = new HashMap<>();
            for (CoreferenceExpression corefExp : corefSet) {
                for (Iterator<GeneMention> geneIt = getOverlappingGenes(corefExp.getOffsets()).iterator(); geneIt.hasNext(); ) {
                    GeneMention gm = geneIt.next();
                    for (String taxId : gm.getTaxonomyIds()) {
                        GeneSet ownSet = gm.getGeneSets().getGeneSet(taxId);
                        // keep the set registered for this taxonomy ID unless there is none or it is empty
                        GeneSet targetSet = tax2geneset.compute(taxId, (tax, known) -> known == null || known.isEmpty() ? ownSet : known);
                        if (targetSet == ownSet)
                            continue;
                        // move the members of the mention's own set over to the registered set
                        Set<GeneMention> members = new HashSet<>(ownSet);
                        ownSet.clear();
                        for (GeneMention member : members)
                            member.getGeneSets().remove(ownSet);
                        targetSet.addAll(members);
                    }
                }
            }
        }


        cleanAndEnumerateGeneSets();
        getGenes().forEach(GeneMention::clearGeneSets);
        geneSets.forEach(gs -> gs.forEach(gm -> gm.addGeneSet(gs)));
        addState(State.AGGLOMERATION_BY_COREFERENCES);
    }

    /**
     * Removes all empty gene sets of this document and numbers the remaining ones
     * consecutively, starting at 0, in iteration order.
     */
    private void cleanAndEnumerateGeneSets() {
        int nextNumber = 0;
        for (Iterator<GeneSet> it = getGeneSets().iterator(); it.hasNext(); ) {
            GeneSet gs = it.next();
            if (!gs.isEmpty()) {
                gs.setNumber(nextNumber);
                nextNumber++;
            } else {
                it.remove();
            }
        }
    }

    /**
     * Merges gene sets of the same taxonomy ID and plural status that share a name. Names are
     * the normalized gene name and its alternatives, each compared as the set of its
     * whitespace-separated tokens. Does nothing if this step has already been performed.
     *
     * @param dontMergeDifferentTaxonomyIds Don't merge gene sets with different taxonomy IDs. Only works if the gene sets already are uniform in their taxonomy IDs before calling this method.
     */
    public void agglomerateByNames(boolean dontMergeDifferentTaxonomyIds) {
        if (hasState(State.AGGLOMERATION_BY_NAME))
            return;
        if (geneSets == null)
            getGeneSets();

        // Maps a gene mention to the token sets of its normalized name and alternative names.
        // This does not depend on the loop variables below, so it is created only once.
        Function<GeneMention, Stream<Set<String>>> gm2gnFunc = gm -> Stream.concat(Stream.of(gm.getGeneName()), gm.getGeneName().getAlternatives().stream())
                .map(gn -> termNormalizer.normalize(gn.getText()))
                .map(s -> Arrays.stream(s.split("\\s+"))
                        .collect(toSet()));

        for (GeneMention gm1 : getGenesIterable()) {
            for (GeneMention gm2 : getGenesIterable()) {
                for (String tax1 : gm1.getTaxonomyIds()) {
                    for (String tax2 : gm2.getTaxonomyIds()) {
                        GeneSet iSet = gm1.getGeneSets().getGeneSet(tax1);
                        GeneSet jSet = gm2.getGeneSets().getGeneSet(tax2);

                        if (iSet == jSet)
                            continue;
                        if (iSet.isEmpty() || jSet.isEmpty())
                            continue;
                        if (!iSet.getTaxId().equals(jSet.getTaxId()))
                            continue;

                        // We don't want to merge plural and non-plural sets since this is an important
                        // part of family recognition
                        if (iSet.isPlural() ^ jSet.isPlural())
                            continue;
                        // Do not merge sets without common taxonomy IDs. Since a GeneMention may have multiple taxonomy IDs,
                        // it is well possible for a GeneMention to end up in multiple gene sets.
                        // NOTE(review): the unconditional taxonomy ID check above already skips these
                        // pairs, so the parameter currently has no effect - confirm which of the two
                        // checks is intended.
                        if (dontMergeDifferentTaxonomyIds) {
                            if (!iSet.getTaxId().equals(jSet.getTaxId()))
                                continue;
                        }

                        // Check if there are common names in both sets
                        Set<Set<String>> iNameSet = iSet.stream().flatMap(gm2gnFunc).collect(toSet());
                        Set<Set<String>> jNameSet = jSet.stream().flatMap(gm2gnFunc).collect(toSet());
                        if (!Sets.intersection(iNameSet, jNameSet).isEmpty()) {
                            iSet.addAll(jSet, false);
                            jSet.clear();
                        }
                    }
                }
            }
        }


        // drop the sets emptied by merging and let the mentions point to their new sets
        cleanAndEnumerateGeneSets();
        getGenes().forEach(GeneMention::clearGeneSets);
        geneSets.forEach(gs -> gs.forEach(gm -> gm.addGeneSet(gs)));
        addState(State.AGGLOMERATION_BY_NAME);
    }

    /**
     * 

Adds alternative names to genes based on acronym resolution.

*

Sometimes, gene names include abbreviations that are introduced in a more general context. For example, * document. For example, the name elements occurring in the same document *

    *
  1. monokine induced by interferon gamma
  2. *
  3. interferon gamma (IFN-gamma)
  4. *
  5. monokine induced by IFN-gamma
  6. *
* would allow to infer that 1. and 3. are actually the same name. This method prepares for that inference step * by adding name variants where the abbreviation is expanded so that 3. would have 1. as variant. Then, * the {@link #agglomerateByNames(boolean)} method will agglomerate 1. and 3. into the same gene set. *

*/ public void generateGeneNameVariants() { if (hasState(State.GENE_VARIANTS_GENERATED)) return; Map acro2long = acronyms.values().stream().collect(Collectors.toMap(acronym -> getCoveredText(acronym.getOffsets()), acronym -> getCoveredText(acronym.getLongform().getOffsets()), (x, y) -> x)); Map acro2longvariants = new HashMap<>(); for (String acro : acro2long.keySet()) { String longform = acro2long.get(acro); // plural normalization if (acro.endsWith("s") && longform.endsWith("s")) acro2longvariants.put(acro.substring(0, acro.length() - 1), longform.substring(0, longform.length() - 1)); } acro2long.putAll(acro2longvariants); AhoCorasickOptimized acroAc = new AhoCorasickOptimized(acro2long.keySet()); Function> gm2gnFunc = gm -> Stream.concat(Stream.of(gm.getGeneName()), gm.getGeneName().getAlternatives().stream()).flatMap(gn -> Stream.of(gn.getText(), termNormalizer.normalize(gn.getText()))) .flatMap(s -> { Stream.Builder variantBuilder = Stream.builder(); variantBuilder.accept(s); acroAc.match(s, (start, end, match) -> { variantBuilder.accept(new StringBuilder(s).replace(start, end + 1, acro2long.get(match)).toString()); }); Stream build = variantBuilder.build(); List collect = build.collect(toList()); return collect.stream(); }) .filter(Objects::nonNull) // .map(String::toLowerCase) // .map(GeneMapper::removeNondescriptives) .map(s -> new GeneName(s, termNormalizer)); for (GeneMention gm : getGenesIterable()) { Set alreadyKnownAlternatives = Stream.concat(Stream.of(termNormalizer.normalize(gm.getText())), gm.getGeneName().getAlternatives().stream().map(gn -> termNormalizer.normalize(gn.getText()))).collect(toSet()); List variantsWithNonDesc = gm2gnFunc.apply(gm).filter(gn -> alreadyKnownAlternatives.add(termNormalizer.normalize(gn.getText()))).collect(toList()); variantsWithNonDesc.forEach(gm.getGeneName()::addAlternative); // TODO this is too complicated due to the fact that we cannot just normalize once and go with the normalized form. 
Fix as soon as we can List variantsWithoutNonDesc = gm2gnFunc.apply(gm).map(GeneName::getText).filter(s -> alreadyKnownAlternatives.add(termNormalizer.normalize(TermNormalizer.removeNondescriptives(s)))).map(s -> new GeneName(TermNormalizer.removeNondescriptives(s), termNormalizer)).collect(toList()); for (var gn : variantsWithoutNonDesc) gm.getGeneName().addAlternative(gn); } addState(State.GENE_VARIANTS_GENERATED); } @Override public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; GeneDocument that = (GeneDocument) o; return Objects.equals(id, that.id); } @Override public int hashCode() { return Objects.hash(id); } public Collection getMeshHeadings() { return meshHeadings != null ? meshHeadings : Collections.emptyList(); } public void setMeshHeadings(Collection meshHeadings) { this.meshHeadings = meshHeadings; } public Stream getGenesWithText(String text) { return getGenes().filter(gm -> gm.getText().equals(text)); } public Entry, SpeciesMention> getNearestPreviousSpeciesMention(Range range, String taxId) { final OffsetMap speciesCandidates = species.getAllMentionCandidates(); Entry, SpeciesMention> lower = speciesCandidates.lowerEntry(range); while (lower != null && ((!lower.getValue().getTaxId().equals(taxId) && taxId != null) || lower.getKey().isOverlappedBy(range))) { lower = speciesCandidates.lowerEntry(lower.getKey()); } if (lower != null && !lower.getValue().getTaxId().equals(taxId) && taxId != null) lower = null; return lower; } public Entry, SpeciesMention> getNearestPreviousSpeciesMention(Range range) { return getNearestPreviousSpeciesMention(range, null); } public Entry, SpeciesMention> getNearestNextSpeciesMention(Range range, String taxId) { final OffsetMap speciesCandidates = species.getAllMentionCandidates(); Entry, SpeciesMention> higher = speciesCandidates.higherEntry(range); while (higher != null && ((!higher.getValue().getTaxId().equals(taxId) && taxId != null) || 
higher.getKey().isOverlappedBy(range))) { higher = speciesCandidates.higherEntry(higher.getKey()); } if (higher != null && !higher.getValue().getTaxId().equals(taxId) && taxId != null) higher = null; return higher; } public Set findChromosomeLocations() { if (chromosomeLocations == null) { chromosomeLocations = new HashSet<>(); Matcher m = GeneLocation.MAP_LOC_PATTERN.matcher(getDocumentText()); while (m.find()) { chromosomeLocations.add(new GeneLocation(m)); } } return chromosomeLocations; } public Stream getDocumentContext(Range inputOffsets, int numTokens) { return getDocumentContext(inputOffsets, Collections.emptySet(), false, numTokens); } public Stream getDocumentContext(Range inputOffsets, Set excludedTokens, boolean excludeGeneMentions, int numTokens) { if (numTokens == 0) return Stream.empty(); Set allstopwords = !excludedTokens.isEmpty() || excludeGeneMentions ? new HashSet<>() : Collections.emptySet(); if (excludeGeneMentions) { if (geneMentionTexts == null) { for (GeneMention gm : getGenesIterable()) geneMentionTexts = Stream.of(gm.getText().split("\\s+")).collect(toSet()); } allstopwords.addAll(geneMentionTexts); } String[] contextTokens = new String[numTokens]; Range focusOffsets = inputOffsets; for (int i = (int) (numTokens / 2d); i >= 0; i--) { Range tokenOffset = posTags.lowerKey(focusOffsets); if (tokenOffset == null) break; String coveredText = getCoveredText(tokenOffset); if (allstopwords.isEmpty() || !allstopwords.contains(coveredText)) { contextTokens[i] = coveredText; } focusOffsets = tokenOffset; } focusOffsets = inputOffsets; for (int i = (int) (numTokens / 2d) + 1; i < numTokens; i++) { Range tokenOffset = posTags.higherKey(focusOffsets); if (tokenOffset == null) break; String coveredText = getCoveredText(tokenOffset); if (allstopwords.isEmpty() || !allstopwords.contains(coveredText)) { contextTokens[i] = coveredText; } focusOffsets = tokenOffset; } return Arrays.stream(contextTokens).filter(Objects::nonNull); } public void reset() { 
resetGeneSets(); getGenes().forEach(gm -> { gm.setMentionMappingResult(null); gm.setTaxonomyOcurrences(HashMultimap.create()); }); state = new LinkedHashSet<>(); } public boolean isGoldHasOffsets() { return goldHasOffsets; } public void setGoldMentionsWithOffsets(boolean goldHasOffsets) { this.goldHasOffsets = goldHasOffsets; } /** * Converts this document and its entities into the PubTator format. * * @return A string containing the PubTator format conversion. */ public String getPubTatorString() { String ls = System.getProperty("line.separator"); StringBuilder sb = new StringBuilder(); if (documentTitle != null && !documentTitle.isBlank()) sb.append(id).append("|t|").append(documentTitle).append(ls); if (documentText != null && !documentText.isBlank()) sb.append(id).append("|a|").append(abstractText).append(ls); // 10064899 100 118 Lysophospholipases FamilyName // 10064899 360 403 lysophospholipid-specific lysophospholipase Gene 10434 for (GeneMention gm : getGenesIterable()) { MentionMappingResult mmr = gm.getMentionMappingResult(); if (!gm.isRejected()) { sb.append(id).append("\t").append(gm.getBegin()).append("\t").append(gm.getEnd()).append("\t").append(gm.getText()).append("\t").append("Gene"); // sb.append("\t"); // if (gm.getTaxonomyId().equals("9606")) // sb.append(gm.getIds().get(0)); // else // sb.append(gm.getIds().get(0)).append("(Tax:").append(gm.getTaxonomyId()).append(")"); // sb.append(gm.getOverlappingGoldMentions().stream().map(gold -> "9606".equals(gold.getTaxonomyId()) ? 
gold.getAnyGoldId() : gold.getAnyGoldId() + "(Tax:" + gold.getTaxonomyId() + ")").collect(Collectors.joining(",")));
                sb.append(ls);
            }
        }
        // 10064899 341 346 human Species 9606
        // for (SpeciesMention sm : (Iterable) () -> Stream.concat(species.getTitleCandidates().values().stream(), species.getTextCandidates().values().stream()).iterator()) {
        // sb.append(id).append("\t").append(sm.getBegin()).append("\t").append(sm.getEnd()).append("\t").append(sm.getText()).append("\tSpecies\t").append(sm.getTaxId()).append(ls);
        // }
        // sb.append(ls);
        return sb.toString();
    }

    /**
     * Stores the given text as the abstract of this document.
     *
     * @param abstractText the abstract text to store
     * @deprecated use offsets on the complete text
     */
    @Deprecated
    public void setDocumentAbstract(String abstractText) {
        this.abstractText = abstractText;
    }

    /**
     * Builds a copy of the document text in which the span of every gene mention returned by
     * {@code getGenes()} is replaced by a rendered string. Mentions are processed in order of their
     * begin offset. Each mention is classified by {@code correctnessFunction}; the function registered
     * for that class in {@code renderFunctions} produces the replacement string.
     *
     * @param correctnessFunction classifies a gene mention
     * @param renderFunctions     maps each correctness class to the function that renders a mention of that class
     * @return the document text with all gene mentions replaced by their rendered form
     * @throws NullPointerException if {@code renderFunctions} has no entry for a class returned by
     *                              {@code correctnessFunction}
     */
    public String getInspectionText(Function<GeneMention, MentionCorrectness> correctnessFunction, Map<MentionCorrectness, Function<GeneMention, String>> renderFunctions) {
        StringBuilder sb = new StringBuilder();
        int pos = 0;
        for (GeneMention gm : (Iterable<GeneMention>) () -> getGenes().sorted(Comparator.comparingInt(GeneMention::getBegin)).iterator()) {
            int begin = gm.getBegin();
            // For overlapping mentions begin may lie before pos; Math.min then yields an empty gap.
            sb.append(documentText, Math.min(pos, begin), begin);
            // Every mention is rendered, regardless of whether it has an exact candidate match.
            MentionCorrectness correctness = correctnessFunction.apply(gm);
            Function<GeneMention, String> geneMentionStringFunction = renderFunctions.get(correctness);
            sb.append(geneMentionStringFunction.apply(gm));
            // Never move the cursor backwards: a mention nested in the previous one ends earlier,
            // and resetting pos to its end would emit the already covered text a second time.
            pos = Math.max(pos, gm.getEnd());
        }
        sb.append(documentText, pos, documentText.length());
        return sb.toString();
    }

    /**
     * Variant of {@link #getInspectionText(Function, Map)} that additionally passes a gold ID to the
     * correctness and render functions. For a mention with gold mentions, the gold ID is picked from
     * {@code getAllGoldIdsAsList()} by the position of the mention among the genes overlapping its
     * offsets; otherwise {@code GeneMention.NOID} is used.
     *
     * @param correctnessFunction classifies a gene mention with respect to a gold ID
     * @param renderFunctions     maps each correctness class to the function that renders a mention and its gold ID
     * @return the document text with all gene mentions replaced by their rendered form
     * @throws NullPointerException if {@code renderFunctions} has no entry for a class returned by
     *                              {@code correctnessFunction}
     */
    // NOTE(review): the type arguments of both parameters were lost in this copy of the file; they are
    // reconstructed by analogy with getInspectionText and from the calls below - confirm against callers.
    public String getGenesetInspectionText(BiFunction<GeneMention, String, MentionCorrectness> correctnessFunction, Map<MentionCorrectness, BiFunction<GeneMention, String, String>> renderFunctions) {
        StringBuilder sb = new StringBuilder();
        int pos = 0;
        for (GeneMention gm : (Iterable<GeneMention>) () -> getGenes().sorted(Comparator.comparingInt(GeneMention::getBegin)).iterator()) {
            // When there is no gold we still want to show the FP so we add the dummy NOID instead
            List<String> goldIdList = gm.hasGoldMentions() ? gm.getAllGoldIdsAsList() : List.of(GeneMention.NOID);
            List<GeneMention> overlappingGenes = gm.getGeneDocument().getOverlappingGenes(gm.getOffsets()).collect(toList());
            int index = overlappingGenes.indexOf(gm);
            // indexOf returns -1 when gm is not among the overlapping genes and the gold list could be
            // empty; clamp the index into the valid range and fall back to NOID instead of throwing.
            int goldIndex = Math.max(0, Math.min(index, goldIdList.size() - 1));
            String goldId = goldIdList.isEmpty() ? GeneMention.NOID : goldIdList.get(goldIndex);
            int begin = gm.getBegin();
            // Mentions sharing the same offsets produce an empty gap here, so each is rendered in turn.
            sb.append(documentText, Math.min(pos, begin), begin);
            BiFunction<GeneMention, String, String> geneMentionStringFunction = renderFunctions.get(correctnessFunction.apply(gm, goldId));
            sb.append(geneMentionStringFunction.apply(gm, goldId));
            // Never move the cursor backwards, see getInspectionText.
            pos = Math.max(pos, gm.getEnd());
        }
        sb.append(documentText, pos, documentText.length());
        return sb.toString();
    }

    // NOTE(review): the element type of this set was lost in this copy of the file; restore the type
    // argument from the declaration of the goldTaxonomyIds field.
    public Set getGoldTaxonomyIds() {
        return goldTaxonomyIds;
    }

    public void setGoldTaxonomyIds(Set goldTaxonomyIds) {
        this.goldTaxonomyIds = goldTaxonomyIds;
    }

    /**
     * Some gene corpora have annotated all gene occurrences while others focus on the most important genes
     * with regard to a specific task.
     *
     * @return Whether all genes in this document have been annotated or only a subset.
     */
    public boolean isCompletelyAnnotated() {
        return completelyAnnotated;
    }

    public void setCompletelyAnnotated(boolean completelyAnnotated) {
        this.completelyAnnotated = completelyAnnotated;
    }

    public boolean isGoldOffsetsInferred() {
        return goldOffsetsInferred;
    }

    public void setGoldOffsetsInferred(boolean goldOffsetsInferred) {
        this.goldOffsetsInferred = goldOffsetsInferred;
    }

    /**
     * Resets the {@code genes} field to {@code null}.
     */
    public void clearSelectedGenes() {
        genes = null;
    }

    // NOTE(review): the element type of this collection was lost in this copy of the file; restore the
    // type argument from the declaration of the coreferenceSets field.
    public Collection getCoreferenceSets() {
        return coreferenceSets;
    }

    /**
     * Stores the given coreference sets and rebuilds the offset map of coreference expressions from
     * the elements of all sets.
     *
     * @param coreferenceSets the coreference sets of this document
     */
    // NOTE(review): the type argument of the parameter was lost in this copy of the file and must be
    // restored for the stream pipeline below to compile.
    public void setCoreferenceRelations(Collection coreferenceSets) {
        this.coreferenceSets = coreferenceSets;
        coreferenceExpressions = new OffsetMap<>();
        coreferenceSets.stream().flatMap(Collection::stream).forEach(coreferenceExpressions::put);
    }

    /**
     * Stores the given appositions in a new offset map.
     *
     * @param appositions the appositions of this document
     */
    public void setAppositions(Collection<Apposition> appositions) {
        this.appositions = new OffsetMap<>(appositions);
    }

    /**
     * For each gene mention that intersects an apposition, adds the covered text of the other
     * apposition element as apposition context to the gene name of the mention. Does nothing when no
     * appositions have been set.
     */
    public void setAppositionContextToGeneNames() {
        if (appositions == null) {
            return;
        }
        for (GeneMention gm : getGenesIterable()) {
            Apposition overlappingApposition = appositions.getFirstLargestIntersectionValue(gm.getOffsets());
            if (overlappingApposition != null) {
                // We actually do not make sure here that the "other" apposition element is actually the more general part
                // of this apposition. It will mostly be.
                Apposition inApposition = overlappingApposition.getOther();
                gm.getGeneName().addAppositionContext(getCoveredText(inApposition));
            }
        }
    }

    /**
     * Looks up the non-gene phrase located for the given offsets.
     *
     * @param offsets the offsets to look up
     * @return the located non-gene phrase if it is overlapped by {@code offsets}; otherwise the
     * placeholder range {@code [0, 0]}, also when no non-gene phrases have been set
     */
    public Range<Integer> getOverlappingNonGenePhrases(Range<Integer> offsets) {
        Range<Integer> nonGenePhrase = nonGenePhrases == null || nonGenePhrases.isEmpty() ? null : nonGenePhrases.locate(offsets);
        return nonGenePhrase != null && nonGenePhrase.isOverlappedBy(offsets) ? nonGenePhrase : Range.between(0, 0);
    }

    public OffsetSet getNonGenePhrases() {
        return nonGenePhrases;
    }

    public void setNonGenePhrases(OffsetSet nonGenePhrases) {
        this.nonGenePhrases = nonGenePhrases;
    }

    /**
     * Rejects every gene mention for which {@link #getOverlappingNonGenePhrases(Range)} returns a
     * range with a maximum greater than 0, i.e. anything but the {@code [0, 0]} placeholder.
     */
    public void rejectGenesOverlappingNonGenePhrases() {
        for (GeneMention gm : getGenesIterable()) {
            if (getOverlappingNonGenePhrases(gm.getOffsets()).getMaximum() > 0) {
                gm.reject(MentionMappingResult.RejectReason.IS_NON_GENE_WORD);
            }
        }
    }

    public enum MentionCorrectness {CORRECT_ID, WRONG_ID, CANT_FIND}

    public enum State {
        GENES_SELECTED,
        SENTENCES_SET,
        SPECIES_MENTIONS_SET,
        ACRONYMS_SET,
        CHUNKS_SET,
        POS_SET,
        /**
         * Species hints/{@link GeneSpeciesOccurrence} markers have been set.
         */
        SPECIES_CANDIDATES_ASSIGNED,
        SYNONYM_CANDIDATES_ASSIGNED,
        /**
         * Filtered out tax IDs that do not exist in NCBI Gene
         */
        SPECIES_CANDIDATES_FILTERED,
        /**
         * Taxonomy IDs have been assigned to {@link MeshHeading} instances that represent a species.
         */
        MESH_TAX_IDS_ASSIGNED,
        REFERENCE_SPECIES_ADDED,
        SPECIES_SCORES_ASSIGNED,
        AGGLOMERATION_BY_ACRONYMS,
        AGGLOMERATION_BY_NAME,
        AGGLOMERATION_BY_COREFERENCES,
        SPECIES_ASSIGNED_TO_GENES,
        // The misspelling ("MENTONS") is kept as is: the constant is public and renaming it would break callers.
        ONTOLOGY_CLASS_MENTONS_SET,
        GENE_VARIANTS_GENERATED
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy