
de.julielab.geneexpbase.genemodel.GeneDocument Maven / Gradle / Ivy
package de.julielab.geneexpbase.genemodel;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import com.lahodiuk.ahocorasick.AhoCorasickOptimized;
import de.julielab.geneexpbase.AhoCorasickLongestMatchCallback;
import de.julielab.geneexpbase.TermNormalizer;
import de.julielab.java.utilities.spanutils.OffsetMap;
import de.julielab.java.utilities.spanutils.OffsetSet;
import de.julielab.java.utilities.spanutils.OffsetSpanComparator;
import de.julielab.java.utilities.spanutils.Span;
import org.apache.commons.lang3.Range;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
import java.util.Map.Entry;
import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import static de.julielab.geneexpbase.genemodel.GeneMention.GeneTagger;
import static de.julielab.geneexpbase.genemodel.GeneMention.GeneTagger.GOLD;
import static java.util.stream.Collectors.toList;
import static java.util.stream.Collectors.toSet;
public class GeneDocument {
// Matches chromosome-locus-like strings, e.g. "14q32.1": digits, an X/q/p arm
// marker, then a dotted/dashed position. NOTE(review): presumably used to filter
// locus expressions from gene-name candidates — confirm at call sites.
public static final Pattern lociRegExp = Pattern.compile("[0-9]+[Xqp][0-9.-]+");
private static final Logger log = LoggerFactory.getLogger(GeneDocument.class);
/**
 * Sometimes an obvious plural tag is missed, e.g. for words like "LERKs". This
 * matches this exact pattern: Upper case characters followed by a lower case
 * 's'.
 */
// Shared, mutable Matcher; all access must synchronize on it (see setPosTags).
private final Matcher pluralMatcher = Pattern.compile("[A-Z]+s").matcher("");
// Annotation layers keyed by character offsets into documentText.
private OffsetMap acronyms;
private OffsetMap acronymLongforms;
private OffsetMap chunks;
private OffsetMap posTags;
private OffsetMap ontologyClassMentions;
private String documentText;
// Kept only for the deprecated setDocumentTitle(); getDocumentTitle() derives
// the title from titleOffsets instead.
private String documentTitle;
/**
 * This is the original set of genes that has been set via
 * {@link #setGenes(Stream)}. From this set, a subset is selected and stored in
 * {@link #genes} which is then the set of genes used for all processing and
 * mapping. Non-selected genes are non-existent to processing algorithms, except
 * they explicitly work on the allGenes set.
 */
private NavigableSet allGenes;
// The currently selected genes, keyed by offsets; built by the select*/unify* methods.
private OffsetMap> genes;
/**
 * Used for evaluation and tagger training purposes.
 */
private OffsetMap> goldGenes = OffsetMap.emptyOffsetMap();
private Set goldIds = Collections.emptySet();
private boolean goldHasOffsets;
private boolean goldOffsetsInferred;
// Lazily built and cached by getGeneSets(); invalidated by resetGeneSets().
private GeneSets geneSets;
private String id;
private OffsetSet sentences;
private SpeciesCandidates species;
// Lazily built trie over the selected genes' mention texts; see getGeneNameDictionary().
private AhoCorasickOptimized geneNameDictionary;
private TermNormalizer termNormalizer;
private Collection meshHeadings;
// Processing-state flags; see addState/hasState/expectState.
private Set state;
private Set chromosomeLocations;
private Set geneMentionTexts;
private String abstractText;
private Range titleOffsets;
private Range abstractOffsets;
private Set goldTaxonomyIds = Collections.emptySet();
private boolean completelyAnnotated;
private Collection coreferenceSets;
private OffsetMap coreferenceExpressions;
private OffsetMap appositions;
private OffsetSet nonGenePhrases;
/**
 * Creates an empty document with a fresh {@link TermNormalizer} and no
 * processing states set.
 */
public GeneDocument() {
    this.termNormalizer = new TermNormalizer();
    // LinkedHashSet keeps the states in the order they were reached.
    this.state = new LinkedHashSet<>();
}
/**
 * Copies the template document. This is mostly a shallow copy, except the
 * genes. Those are deeply copied and put into the respective structures (the
 * "genes" and "geneSets" fields).
 *
 * @param template The document to copy.
 */
public GeneDocument(GeneDocument template) {
    // Annotation layers are shared (shallow copy) with the template.
    acronyms = template.acronyms;
    acronymLongforms = template.acronymLongforms;
    coreferenceSets = template.coreferenceSets;
    coreferenceExpressions = template.coreferenceExpressions;
    appositions = template.appositions;
    chunks = template.chunks;
    ontologyClassMentions = template.ontologyClassMentions;
    posTags = template.posTags;
    documentText = template.documentText;
    documentTitle = template.documentTitle;
    // Copy the genes by their Java system ID
    // NOTE(review): identityHashCode is not collision-free; two distinct gene
    // objects with colliding hashes would map to one entry — confirm acceptable.
    TreeMap orgToNew = new TreeMap<>(
            Comparator.comparingInt(System::identityHashCode));
    template.allGenes.forEach(g -> {
        GeneMention newGm = new GeneMention(g);
        newGm.setGeneDocument(this);
        orgToNew.put(g, newGm);
    });
    // Rebuild allGenes and the selected-genes map from the deep copies.
    allGenes = template.allGenes.stream().map(old -> Objects.requireNonNull(orgToNew.get(old))).collect(Collectors.toCollection(this::createAllGenesSet));
    genes = new OffsetMap<>();
    for (Entry, List> original : template.genes.entrySet())
        genes.put(original.getKey(), original.getValue().stream().map(k -> Objects.requireNonNull(orgToNew.get(k))).collect(toList()));
    // Gold genes are deep-copied independently of orgToNew.
    template.goldGenes.values().stream().flatMap(Collection::stream).map(GeneMention::new).forEach(this::putGoldGene);
    if (template.geneSets != null) {
        geneSets = new GeneSets();
        for (GeneSet gs : template.geneSets) {
            GeneSet newSet = new GeneSet();
            newSet.setFeatureVector(gs.getFeatureVector());
            newSet.setInstance(gs.getInstance());
            newSet.setSetId(gs.getSetId());
            newSet.setSpecificType(gs.getSpecificType());
            // Fill the new set with the copied gene instances.
            gs.forEach(g -> newSet.add(orgToNew.get(g)));
            geneSets.add(newSet);
            // Establish the back reference from each copied gene to its set.
            newSet.forEach(g -> g.addGeneSet(newSet));
        }
    }
    if (template.meshHeadings != null)
        meshHeadings = template.meshHeadings.stream().map(MeshHeading::clone).collect(toSet());
    id = template.id;
    sentences = template.sentences;
    nonGenePhrases = template.nonGenePhrases;
    if (template.species != null)
        species = template.species.clone();
    geneNameDictionary = template.geneNameDictionary;
    termNormalizer = template.termNormalizer;
    state = new HashSet<>(template.state);
    goldHasOffsets = template.goldHasOffsets;
    goldOffsetsInferred = template.goldOffsetsInferred;
    // Defensive copies of the simple value collections.
    if (template.goldIds != null)
        goldIds = new HashSet<>(template.goldIds);
    if (template.goldTaxonomyIds != null)
        goldTaxonomyIds = new HashSet<>(template.goldTaxonomyIds);
    if (template.chromosomeLocations != null)
        chromosomeLocations = new HashSet<>(template.chromosomeLocations);
    if (template.geneMentionTexts != null)
        geneMentionTexts = new HashSet<>(template.geneMentionTexts);
}
/**
 * Creates an empty document carrying the given document ID.
 *
 * @param id The document identifier.
 */
public GeneDocument(String id) {
    this();
    setId(id);
}
// Creates the sorted set backing allGenes: ordered by begin, then end, then tagger.
// Note: two mentions with equal offsets AND equal tagger are considered equal by
// this comparator and thus collapse into one element.
private TreeSet createAllGenesSet() {
    // return new TreeSet<>(Comparator.comparingInt(GeneMention::getBegin).thenComparingInt(GeneMention::getEnd).thenComparing(GeneMention::getTagger).thenComparingInt(g -> System.identityHashCode(g)));
    // removed the hash code comparison for more general access; I don't know why exactly I added the hash code so this might break some code. But that code is probably unused, so I dare ;-)
    return new TreeSet<>(Comparator.comparingInt(GeneMention::getBegin).thenComparingInt(GeneMention::getEnd).thenComparing(GeneMention::getTagger));
}
/** Returns the gold standard gene annotations, keyed by their offsets. */
public OffsetMap> getGoldGenes() {
    return goldGenes;
}
/** Returns the title offsets, or the empty range [0,0] when no title offsets were set. */
public Range getTitleOffsets() {
    if (titleOffsets == null)
        return Range.between(0, 0);
    return titleOffsets;
}
/** Sets the character offsets of the document title within the document text. */
public void setTitleOffsets(Range titleOffsets) {
    this.titleOffsets = titleOffsets;
}
/** Returns the abstract offsets, or the empty range [0,0] when no abstract offsets were set. */
public Range getAbstractOffsets() {
    if (abstractOffsets == null)
        return Range.between(0, 0);
    return abstractOffsets;
}
/** Sets the character offsets of the abstract within the document text. */
public void setAbstractOffsets(Range abstractOffsets) {
    this.abstractOffsets = abstractOffsets;
}
/** Returns the gold standard gene IDs for this document (empty set by default). */
public Set getGoldIds() {
    return goldIds;
}
/** Sets the gold standard gene IDs for this document. */
public void setGoldIds(Set goldIds) {
    this.goldIds = goldIds;
}
/** Marks the given processing state as reached for this document. */
public void addState(State state) {
    this.state.add(state);
}
/**
 * Returns whether the given processing state has been reached.
 * A null argument is never contained.
 */
public boolean hasState(State state) {
    return state != null && this.state.contains(state);
}
/**
 * Returns the long form of the given acronym, lazily materializing its text
 * from the document text if it has not been set yet.
 *
 * @param acronym The acronym whose long form is requested.
 * @return The long form, with its text guaranteed to be set.
 */
public AcronymLongform getAcronymLongformAndOffsets(Acronym acronym) {
    final AcronymLongform longform = acronym.getLongform();
    if (longform.getText() == null) {
        final Range offsets = longform.getOffsets();
        longform.setText(getDocumentText().substring(offsets.getMinimum(), offsets.getMaximum()));
    }
    return longform;
}
/**
 * For each gene in {@link #allGenes}, stores the text of the first overlapping
 * NP chunk as the nominal phrase context of the gene's name.
 */
public void setNominalPhrasesOfGenesToNames() {
    for (GeneMention gm : getAllGenes()) {
        // Take an arbitrary first NP chunk overlapping the mention, if any.
        Optional, String>> chunkNP = getOverlappingChunks(gm.getOffsets(), "ChunkNP").stream().findFirst();
        if (chunkNP.isPresent()) {
            gm.getGeneName().setNominalPhraseContext(getCoveredText(chunkNP.get().getKey()));
        }
    }
}
/**
 * Uses acronym resolution to generate gene name variants. Variant generation includes the replacement of acronyms
 * by their long forms and vice versa.
 */
public void setAcronymsAsGeneNameAlternatives() {
    List longformTexts = new ArrayList<>();
    Multimap long2short = HashMultimap.create();
    Iterator longIt = getAcronymLongforms().values().iterator();
    // Iterate over all long forms and set long name variants to all
    // genes overlapping their acronyms.
    // Also, build a dictionary of long forms and a map that connects long forms
    // to their respective short forms. This is used below to to set short
    // variants to gene mentions containing the long form of an acronym definition.
    while (longIt.hasNext()) {
        AcronymLongform longForm = longIt.next();
        String longformText = getCoveredText(longForm);
        longformTexts.add(longformText);
        String acronymText = null;
        for (Acronym acronym : longForm.getAcronyms()) {
            acronymText = getCoveredText(acronym);
            long2short.put(longformText, acronymText);
            // TODO check that the missing filter doesn't hurt performance
            // Iterator acroIt = getOverlappingGenes(acronym.getOffsets()).filter(gm -> gm.getOffsets().equals(acronym.getOffsets())).iterator();
            Iterator acroIt = getOverlappingGenes(acronym.getOffsets()).iterator();
            while (acroIt.hasNext()) {
                GeneMention acroGene = acroIt.next();
                // Expand the acronym inside the gene's text to its long form.
                String longText = acroGene.getText().replace(acronymText, longformText);
                // In case that the acronym definition is also a gene name, the acronym also overlaps with the
                // long form gene (since the gene mention covers both, if not resolved via the composite
                // resolution)
                if (!longText.equals(acroGene.getText())) {
                    GeneName longName = new GeneName(longText, getTermNormalizer());
                    acroGene.getGeneName().addAlternative(longName);
                }
            }
        }
        // Iterator longFormGenes = getOverlappingGenes(longForm.getOffsets()).filter(gm -> gm.getOffsets().equals(longForm.getOffsets())).iterator();
        // while (longFormGenes.hasNext()) {
        // GeneMention longGm = longFormGenes.next();
        // GeneName acroName = new GeneName(acronymText, getTermNormalizer());
        // longGm.getGeneName().addAlternative(acroName);
        // }
    }
    // Now add short form variants to all genes containing a long form of an acronym.
    AhoCorasickOptimized longformAc = new AhoCorasickOptimized(longformTexts);
    AhoCorasickLongestMatchCallback callback = new AhoCorasickLongestMatchCallback();
    for (GeneMention gm : getGenesIterable()) {
        callback.clear();
        longformAc.match(gm.getText(), callback);
        // Only one (arbitrary) longest match is considered per gene mention.
        Optional match = callback.getLongestMatches().values().stream().findAny();
        if (match.isPresent()) {
            String longform = match.get();
            Collection shortforms = long2short.get(longform);
            // There will most likely be just a single short form.
            for (String shortform : shortforms) {
                String shortText = gm.getText().replace(longform, shortform);
                GeneName shortName = new GeneName(shortText, getTermNormalizer());
                gm.getGeneName().addAlternative(shortName);
            }
        }
    }
}
/** Returns the acronym annotations keyed by offsets; null if acronyms were never set. */
public OffsetMap getAcronyms() {
    return acronyms;
}
/** Sets the acronym map directly and marks the ACRONYMS_SET state. Note: does not populate acronymLongforms. */
public void setAcronyms(OffsetMap acronyms) {
    this.acronyms = acronyms;
    addState(State.ACRONYMS_SET);
}
/**
 * Indexes the given acronyms and their long forms by offsets and marks the
 * ACRONYMS_SET state. Replaces any previously set acronym maps.
 */
public void setAcronyms(Stream acronyms) {
    this.acronyms = new OffsetMap<>();
    this.acronymLongforms = new OffsetMap<>();
    acronyms.forEach(acronym -> {
        this.acronyms.put(acronym.getOffsets(), acronym);
        AcronymLongform longform = acronym.getLongform();
        this.acronymLongforms.put(longform.getOffsets(), longform);
    });
    addState(State.ACRONYMS_SET);
}
/** Varargs convenience overload of {@link #setAcronyms(Stream)}. */
public void setAcronyms(Acronym... acronyms) {
    setAcronyms(Arrays.stream(acronyms));
}
/** Collection convenience overload of {@link #setAcronyms(Stream)}. */
public void setAcronyms(Collection acronyms) {
    setAcronyms(acronyms.stream());
}
/** Returns the acronym long forms keyed by offsets; null unless set via {@link #setAcronyms(Stream)}. */
public OffsetMap getAcronymLongforms() {
    return acronymLongforms;
}
/** Returns the ontology class mentions keyed by offsets; null if never set. */
public OffsetMap getOntologyClassMentions() {
    return ontologyClassMentions;
}
/** Sets the ontology class mentions and marks the ONTOLOGY_CLASS_MENTONS_SET state. */
public void setOntologyClassMentions(OffsetMap ontologyClassMentions) {
    this.ontologyClassMentions = ontologyClassMentions;
    addState(State.ONTOLOGY_CLASS_MENTONS_SET);
}
/** Returns the chunk annotations keyed by offsets; null if never set. */
public OffsetMap getChunks() {
    return chunks;
}
/** Sets the chunk annotations and marks the CHUNKS_SET state. */
public void setChunks(OffsetMap chunks) {
    this.chunks = chunks;
    addState(State.CHUNKS_SET);
}
/** Returns the full document text (title plus abstract/body, as set). */
public String getDocumentText() {
    return documentText;
}
/** Sets the full document text all offset-based annotations refer to. */
public void setDocumentText(String documentText) {
    this.documentText = documentText;
}
/**
 * Returns the document title as the substring of the document text covered by
 * {@link #getTitleOffsets()}. Note: the deprecated {@code documentTitle} field
 * is intentionally not consulted here.
 */
public String getDocumentTitle() {
    return documentText.substring(getTitleOffsets().getMinimum(), getTitleOffsets().getMaximum());
}
/**
 * Stores the title string directly. The stored value is not used by
 * {@link #getDocumentTitle()}, which derives the title from offsets instead.
 *
 * @param documentTitle The title text.
 * @deprecated use offsets on the complete text
 */
@Deprecated
public void setDocumentTitle(String documentTitle) {
    this.documentTitle = documentTitle;
}
/**
 * Returns the abstract section as the substring of the document text covered
 * by the abstract offsets. If no abstract offsets were set, returns the empty
 * string instead of throwing a NullPointerException, mirroring the null-safe
 * behavior of {@link #getDocumentTitle()}.
 */
public String getAbstractText() {
    // Use the null-safe accessor: it falls back to Range.between(0, 0).
    Range offsets = getAbstractOffsets();
    return documentText.substring(offsets.getMinimum(), offsets.getMaximum());
}
/**
 * Returns the selected-genes map keyed by offsets.
 *
 * @throws IllegalStateException if no selection method has been called yet.
 */
public OffsetMap> getGeneMap() {
    if (genes != null)
        return genes;
    throw new IllegalStateException(
            "The internal genes map has to be built first by calling an appropriate method after setting the original set of genes.");
}
/** Returns the selected genes whose offsets overlap the given range. */
public Stream getGeneMentionsAtOffsets(final Range offsets) {
    return getGenes().filter(g -> g.getOffsets().isOverlappedBy(offsets));
}
/**
 * Returns all mentions in {@link #allGenes} that compare equal to the given
 * mention's offsets combined with the given tagger (the backing set is ordered
 * by begin, end, tagger).
 */
public NavigableSet getAllGeneMentions(final GeneMention gm, final GeneTagger tagger) {
    // Probe element carrying only the comparator-relevant properties.
    GeneMention gmKey = new GeneMention();
    gmKey.setOffsets(gm.getOffsets());
    gmKey.setTagger(tagger);
    return getAllGenes().subSet(gmKey, true, gmKey, true);
}
/**
 * Returns those genes that have been selected from the original set of all
 * genes. Thus, before this method works, a selection method has to be called
 * first.
 *
 * @return The currently selected genes.
 * @throws IllegalStateException if no selection method has been called yet.
 * @see #selectGeneMentionsByTagger(GeneTagger...)
 * @see #unifyGeneMentionsAtEqualOffsets(GeneTagger...)
 */
public Stream getGenes() {
    if (genes == null)
        throw new IllegalStateException(
                "The internal genes map has to be built first by calling an appropriate method after setting the original set of genes.");
    return genes.values().stream().flatMap(Collection::stream);
}
/** Varargs convenience overload of {@link #setGenes(Stream)}; replaces allGenes. */
public void setGenes(GeneMention... genes) {
    // NOTE(review): this fresh set is immediately cleared again by
    // setGenes(Stream); the allocation looks redundant — confirm.
    this.allGenes = createAllGenesSet();
    setGenes(Stream.of(genes));
}
/**
 * Replaces the contents of {@link #allGenes} with the given mentions, wires
 * each mention back to this document and, when available, sets the shared
 * {@link TermNormalizer} on each mention.
 */
public void setGenes(Stream genes) {
    if (this.allGenes != null)
        this.allGenes.clear();
    else
        this.allGenes = createAllGenesSet();
    genes.forEach(this.allGenes::add);
    this.allGenes.forEach(g -> g.setGeneDocument(this));
    if (termNormalizer != null)
        this.allGenes.forEach(g -> g.setNormalizer(termNormalizer));
}
/** Collection convenience overload of {@link #setGenes(Stream)}. */
public void setGenes(Collection genes) {
    if (this.allGenes == null)
        this.allGenes = createAllGenesSet();
    setGenes(genes.stream());
}
/**
 * Adds a single mention to {@link #allGenes}, wiring document and normalizer.
 */
public void addGene(GeneMention gene) {
    if (this.allGenes == null)
        this.allGenes = createAllGenesSet();
    allGenes.add(gene);
    gene.setGeneDocument(this);
    if (termNormalizer != null)
        gene.setNormalizer(termNormalizer);
    // NOTE(review): gold-tagged mentions are put into the *selected* genes map
    // here (putGene), not into goldGenes via putGoldGene — confirm intended.
    if (gene.getTagger() == GOLD)
        putGene(gene);
}
/**
 * Returns an Iterable over the selected genes. A fresh stream is created per
 * iteration so the Iterable can be traversed multiple times.
 */
public Iterable getGenesIterable() {
    return new Iterable() {
        @Override
        public Iterator iterator() {
            return getGenes().iterator();
        }
    };
}
/**
 * Returns an Iterable over the selected, non-rejected genes. A fresh stream is
 * created per iteration so the Iterable can be traversed multiple times.
 */
public Iterable getNonRejectedGenesIterable() {
    return new Iterable() {
        @Override
        public Iterator iterator() {
            return getNonRejectedGenes().iterator();
        }
    };
}
/** Returns the selected genes that have not been rejected. */
public Stream getNonRejectedGenes() {
    return getGenes().filter(gm -> !gm.isRejected());
}
/**
 * On first call, creates a trivial GeneSets object where each gene is in its
 * own set. From here, one can begin to agglomerate sets e.g. due to the same
 * name, an acronym connection or other measures. Subsequent calls will return
 * the same set instance.
 *
 * @return A GeneSets object where each gene has its own set.
 */
public GeneSets getGeneSets() {
    // Return the cached instance if present; resetGeneSets() invalidates it.
    if (this.geneSets != null) {
        return this.geneSets;
    }
    GeneSets geneSets = new GeneSets();
    getGenes().forEach(gm -> {
        // A mention without taxonomy IDs still gets one set, under NOID.
        List taxonomyIds = gm.getTaxonomyIds() != null && !gm.getTaxonomyIds().isEmpty() ? gm.getTaxonomyIds() : List.of(GeneMention.NOID);
        for (String taxId : taxonomyIds) {
            GeneSet geneSet = new GeneSet();
            geneSet.setTaxId(taxId);
            geneSet.add(gm);
            gm.addGeneSet(geneSet);
            geneSet.setDocId(this.id);
            geneSet.setSpecificType(gm.getSpecificType());
            // Derive plurality from the last non-stop POS tag over the mention.
            if (gm.getCompositeResolver() == null)
                getLastPosTag(gm.getOffsets(), PosTag.stopTags)
                        .ifPresent(tag -> geneSet.setPlural(tag.getTag().equals("NNS")));
            geneSets.add(geneSet);
        }
    });
    this.geneSets = geneSets;
    return geneSets;
}
/**
 * Discards the cached gene sets, clears the back references from genes to
 * their sets and removes the agglomeration states so sets can be rebuilt.
 */
public void resetGeneSets() {
    if (geneSets != null)
        geneSets.stream().flatMap(Collection::stream).forEach(gm -> gm.getGeneSets().clear());
    this.geneSets = null;
    if (state != null)
        state.removeAll(EnumSet.of(State.AGGLOMERATION_BY_NAME, State.AGGLOMERATION_BY_ACRONYMS));
}
/**
 * Creates a new gene set from the given mentions, registers it with this
 * document's gene sets and wires the back references.
 *
 * @param newGs The mentions forming the new set.
 * @return The newly created set.
 */
public GeneSet addGeneSet(Collection newGs) {
    GeneSet geneSet = new GeneSet();
    geneSet.addAll(newGs);
    geneSet.setDocId(this.id);
    newGs.forEach(gm -> gm.addGeneSet(geneSet));
    // Specific type and plurality are derived from one arbitrary member.
    newGs.stream().findAny().ifPresent(gm -> {
                geneSet.setSpecificType(gm.getSpecificType());
                getLastPosTag(gm.getOffsets(), PosTag.stopTags)
                        .ifPresent(tag -> geneSet.setPlural(tag.getTag().equals("NNS")));
            }
    );
    geneSets.add(geneSet);
    return geneSet;
}
/** Returns the document identifier. */
public String getId() {
    return id;
}
/** Sets the document identifier. */
public void setId(String id) {
    this.id = id;
}
/**
 * Returns acronyms (not full forms!) overlapping with the given range.
 *
 * @param range An offset range.
 * @return Acronyms overlapping the given range.
 */
public Collection getOverlappingAcronyms(Range range) {
    return acronyms.getOverlapping(range).values();
}
/** Returns acronym long forms overlapping the given range. */
public Collection getOverlappingAcronymLongforms(Range range) {
    return acronymLongforms.getOverlapping(range).values();
}
/** Convenience overload of {@link #getOverlappingSentence(Range)} for spans. */
public Range getOverlappingSentence(Span span) {
    final Range offsets = span.getOffsets();
    return getOverlappingSentence(offsets);
}
/**
 * Returns the sentence range containing the given range, or the empty range
 * [0,0] if no sentence could be located.
 */
public Range getOverlappingSentence(Range range) {
    Range located = sentences.locate(range);
    if (located == null)
        return Range.between(0, 0);
    return located;
}
/**
 * Returns ontology class mentions overlapping with the given range.
 *
 * @param range An offset range.
 * @return Ontology class mentions overlapping the given range; empty if none were set.
 */
public Set, String>> getOverlappingOntologyClassMentions(Range range) {
    if (ontologyClassMentions == null)
        return Collections.emptySet();
    return ontologyClassMentions.getOverlapping(range).entrySet();
}
/**
 * Returns ontology class mentions of the given type overlapping with the given range.
 *
 * @param range        An offset range.
 * @param specificType The ontology class type - e.g. GeneOrGeneProduct - to return.
 * @return Ontology class mentions with the given type overlapping the given range.
 */
public Set, String>> getOverlappingOntologyClassMentions(Range range, final String specificType) {
    return getOverlappingOntologyClassMentions(range)
            .stream()
            .filter(entry -> specificType.equals(entry.getValue()))
            .collect(toSet());
}
/**
 * Returns chunks overlapping with the given range.
 *
 * @param range An offset range.
 * @return Chunks overlapping the given range.
 */
// NOTE(review): unlike getOverlappingPosTags, this throws NPE when chunks were
// never set — confirm callers guarantee State.CHUNKS_SET.
public Set, String>> getOverlappingChunks(Range range) {
    return chunks.getOverlapping(range).entrySet();
}
/** Returns apposition annotations overlapping the given range. */
public Collection getOverlappingAppositions(Range range) {
    return appositions.getOverlapping(range).values();
}
/**
 * Returns chunks of the given type overlapping with the given range.
 *
 * @param range     An offset range.
 * @param chunkType The chunk type - e.g. ChunkNP - to return.
 * @return Chunks with the given type overlapping the given range.
 */
public Set, String>> getOverlappingChunks(Range range, final String chunkType) {
    return getOverlappingChunks(range)
            .stream()
            .filter(entry -> chunkType.equals(entry.getValue()))
            .collect(toSet());
}
/** Returns POS tags overlapping the given range; empty if POS tags were never set. */
public Collection getOverlappingPosTags(Range range) {
    return posTags == null ? Collections.emptyList() : posTags.getOverlapping(range).values();
}
/**
 * Returns the last POS tag overlapping the given range whose tag is not in the
 * excluded set. With a null or empty exclusion set, the very last tag is
 * returned. Empty if no overlapping tag qualifies.
 */
public Optional getLastPosTag(Range range, Set excludedTags) {
    List overlapping = new ArrayList<>(getOverlappingPosTags(range));
    boolean noExclusions = excludedTags == null || excludedTags.isEmpty();
    // Walk backwards so the rightmost acceptable tag wins.
    for (int i = overlapping.size() - 1; i >= 0; --i) {
        PosTag candidate = overlapping.get(i);
        if (noExclusions || !excludedTags.contains(candidate.getTag()))
            return Optional.of(candidate);
    }
    return Optional.empty();
}
/** Returns the POS tag annotations keyed by offsets; null if never set. */
public OffsetMap getPosTags() {
    return posTags;
}
/**
 * Indexes the given POS tags by offsets and marks the POS_SET state. As a
 * heuristic, NN tags whose covered text looks like an upper-case plural
 * (e.g. "LERKs") are rewritten to NNS; see {@link #pluralMatcher}.
 */
public void setPosTags(Stream posTags) {
    this.posTags = new OffsetMap<>();
    posTags.map(pos -> {
        if (pos.getTag().equals("NN") && documentText != null && pos.getEnd() < documentText.length()) {
            // pluralMatcher is a shared instance field, so guard against
            // concurrent reset/match calls.
            synchronized (pluralMatcher) {
                pluralMatcher.reset(getCoveredText(pos));
                if (pluralMatcher.matches())
                    pos.setTag("NNS");
            }
        }
        return pos;
    }).forEach(this.posTags::put);
    addState(State.POS_SET);
}
/** Collection convenience overload of {@link #setPosTags(Stream)}. */
public void setPosTags(Collection posTags) {
    setPosTags(posTags.stream());
}
/**
 * Returns genes overlapping with the given range.
 *
 * @param range An offset range.
 * @return Genes overlapping the given range.
 */
public Stream getOverlappingGenes(Range range) {
    Collection geneLists = genes.getOverlapping(range).values();
    return geneLists.stream().flatMap(Collection::stream);
}
/** Returns gold genes overlapping the given range; empty if no gold genes exist. */
public Stream getOverlappingGoldGenes(Range range) {
    if (goldGenes == null)
        return Stream.empty();
    return goldGenes.getOverlapping(range).values().stream().flatMap(Collection::stream);
}
/** Returns the sentence ranges of this document; null if never set. */
public NavigableSet> getSentences() {
    return sentences;
}
/** Sets the sentence ranges and marks the SENTENCES_SET state. */
public void setSentences(OffsetSet sentences) {
    this.sentences = sentences;
    addState(State.SENTENCES_SET);
}
/** Returns the species candidates of this document; null if never set. */
public SpeciesCandidates getSpecies() {
    return species;
}
/** Sets the species candidates and marks the SPECIES_MENTIONS_SET state. */
public void setSpecies(SpeciesCandidates species) {
    this.species = species;
    addState(State.SPECIES_MENTIONS_SET);
}
/**
 * Builds the internal gene offset map with all available genes, overlapping or
 * not. Offset duplicates will override items that have been in the offset
 * map before their addition.
 */
public void selectAllGenes() {
    this.genes = new OffsetMap<>();
    if (allGenes == null)
        allGenes = createAllGenesSet();
    this.allGenes.forEach(g -> putGene(g));
    addState(State.GENES_SELECTED);
}
/**
 * Builds the internal gene offset map and only keeps gene mentions found by the
 * given taggers.
 *
 * @param tagger The taggers for which gene mentions should be kept.
 */
public void selectGeneMentionsByTagger(final GeneTagger... tagger) {
    if (genes == null)
        genes = new OffsetMap<>();
    Set includedTaggers = new HashSet<>(Arrays.asList(tagger));
    for (Iterator it = allGenes.iterator(); it.hasNext(); ) {
        GeneMention g = it.next();
        if (g.getTagger() == null) {
            // Mentions without a tagger cannot be matched; log and skip them.
            log.error("Gene {} in document {} does not have a tagger set", g.getText(), g.getDocId());
            // it.remove();
        } else {
            // Only add genes where there is not already one
            if (includedTaggers.contains(g.getTagger()) && genes.getOverlapping(g.getOffsets()).isEmpty()) {
                // false: g is already a member of allGenes.
                putGene(g, false);
            }
        }
    }
    addState(State.GENES_SELECTED);
}
/**
 * Asserts that all given processing states have been reached.
 *
 * @throws IllegalStateException if any expected state is missing.
 */
public void expectState(EnumSet expectedStates) {
    for (State expected : expectedStates) {
        if (state.contains(expected))
            continue;
        throw new IllegalStateException("Expected state " + expected + " which is not set to this document. The current document processing state is " + state);
    }
}
/**
 * Adds gene mentions to the selected set of gene mentions based on a tagger
 * (optional) and regular expressions matched on the mention string.
 *
 * @param tagger  Optional, may be null
 * @param regExes A list of regular expressions. Each gene mention matching one of
 *                the expressions (and, if given, the tagger) will be added to the
 *                selected list of genes.
 */
public void allowGeneMentionsByRegularExpression(final GeneTagger tagger, final Pattern... regExes) {
    // Pre-create one reusable Matcher per pattern.
    Matcher[] ms = new Matcher[regExes.length];
    for (int i = 0; i < regExes.length; ++i)
        ms[i] = regExes[i].matcher("");
    for (GeneMention gm : allGenes) {
        // check the tagger
        if (tagger != null && gm.getTagger() != tagger)
            continue;
        // if the tagger was correct (or not given), check all regular
        // expressions for this mention
        boolean allowed = false;
        for (int i = 0; i < regExes.length && !allowed; ++i) {
            ms[i].reset(gm.getText());
            if (ms[i].matches())
                allowed = true;
        }
        // if at least one mention matched a regular expression, add it to
        // the set of selected genes
        if (allowed)
            putGene(gm);
    }
}
/**
 * Creates the internal gene map without allowing exact duplicate ranges where
 * begin and end are equal but still allows overlapping.
 *
 * @param taggerPriorities The order in which should be decided which gene mention to keep at
 *                         a given position with multiple candidates at the exact same
 *                         location. A lower position means higher priority. Non-mentioned
 *                         taggers have minimum priority, e.g. are most easily discarded.
 */
public void unifyGeneMentionsAtEqualOffsets(final GeneTagger... taggerPriorities) {
    genes = new OffsetMap<>();
    // Rank of each tagger; a LOWER rank means HIGHER priority. Taggers not
    // listed get Integer.MAX_VALUE, i.e. the lowest possible priority.
    Map priorities = new HashMap<>();
    IntStream.range(0, taggerPriorities.length).forEach(i -> priorities.put(taggerPriorities[i], i));
    for (GeneMention gm : allGenes) {
        List genesAtOffset = genes.get(gm.getOffsets());
        if (genesAtOffset == null) {
            putGene(gm);
        } else {
            for (GeneMention gmInMap : genesAtOffset) {
                int priorityInMap = priorities.getOrDefault(gmInMap.getTagger(), Integer.MAX_VALUE);
                int gmPriority = priorities.getOrDefault(gm.getTagger(), Integer.MAX_VALUE);
                // BUG FIX: keep the mention with the better (numerically lower)
                // rank. The original compared with '>' and thus replaced a
                // higher-priority mention with a lower-priority one, which
                // contradicts the documented contract above.
                if (gmPriority < priorityInMap)
                    replaceGene(gmInMap, gm);
            }
        }
    }
}
/**
 * Replaces the acronym map with a version where overlapping acronyms are
 * resolved in favor of the longer span; see {@link #unifySpanLongerFirst}.
 */
public void unifyAcronymsLongerFirst() {
    TreeSet unifiedSet = unifySpanLongerFirst(acronyms.values());
    acronyms = new OffsetMap<>();
    unifiedSet.forEach(g -> acronyms.put(g.getOffsets(), (Acronym) g));
}
/**
 * Unifies all genes with the longer-span-first strategy.
 */
public void unifyAllGenesLongerFirst() {
    TreeSet unifiedSet = unifySpanLongerFirst(allGenes);
    // Rebuild the selected-genes map from the surviving spans.
    genes = new OffsetMap<>();
    unifiedSet.forEach(g -> putGene((GeneMention) g));
}
/**
 * Restricts the selection to the given taggers, then unifies the selected
 * genes with the longer-span-first strategy.
 */
public void unifyAllGenesLongerFirst(GeneTagger... taggers) {
    selectGeneMentionsByTagger(taggers);
    TreeSet unifiedSet = unifySpanLongerFirst(
            genes.values().stream().flatMap(list -> list.stream()).collect(toList()));
    genes = new OffsetMap<>();
    unifiedSet.forEach(g -> putGene((GeneMention) g));
}
/**
 * Resolves overlapping spans by keeping the longer one: each incoming span is
 * compared against its nearest neighbor in the result set (floor, else
 * ceiling); on overlap, the longer span survives.
 */
// NOTE(review): the ceiling neighbor is only examined when no floor exists; a
// span with a non-overlapping floor but an overlapping ceiling is added without
// any length comparison — confirm this is intended.
private TreeSet unifySpanLongerFirst(Collection extends Span> spans) {
    Span otherGene = null;
    TreeSet sortedGenes = new TreeSet<>(new OffsetSpanComparator());
    for (Span gm : spans) {
        if (sortedGenes.contains(gm)) {
            // An equal-offset span is already present; drop the newcomer.
            continue;
        } else if (null != (otherGene = sortedGenes.floor(gm))) {
            if (otherGene.getOffsets().isOverlappedBy(gm.getOffsets())) {
                int gmLength = gm.getOffsets().getMaximum() - gm.getOffsets().getMinimum();
                int otherLength = otherGene.getOffsets().getMaximum() - otherGene.getOffsets().getMinimum();
                // Strictly longer spans replace the existing shorter span;
                // shorter or equal-length newcomers are discarded.
                if (gmLength > otherLength) {
                    if (sortedGenes.remove(otherGene)) {
                        sortedGenes.add(gm);
                    }
                }
            } else {
                sortedGenes.add(gm);
            }
        } else if (null != (otherGene = sortedGenes.ceiling(gm))) {
            if (otherGene.getOffsets().isOverlappedBy(gm.getOffsets())) {
                int gmLength = gm.getOffsets().getMaximum() - gm.getOffsets().getMinimum();
                int otherLength = otherGene.getOffsets().getMaximum() - otherGene.getOffsets().getMinimum();
                if (gmLength > otherLength) {
                    if (sortedGenes.remove(otherGene)) {
                        sortedGenes.add(gm);
                    }
                }
            } else {
                sortedGenes.add(gm);
            }
        } else {
            sortedGenes.add(gm);
        }
    }
    return sortedGenes;
}
/**
 * Merges all genes into the given pre-sorted set, preferring mentions of the
 * given tagger on range conflicts, and rebuilds the selected-genes map from
 * the result.
 */
public void unifyGenesPrioritizeTagger(NavigableSet sortedGenes, GeneTagger tagger) {
    allGenes.forEach(gm -> {
        GeneMention otherGene = null;
        if (sortedGenes.contains(gm)) {
            // As comparison is done via ranges, two genes are equal,
            // if they cover the same range, even if their respective other
            // values are different
            GeneTagger candidateTagger = gm.getTagger();
            if (candidateTagger == tagger) {
                // Swap the range-equal element for gm (remove-then-add).
                if (sortedGenes.remove(gm)) {
                    sortedGenes.add(gm);
                }
            }
        } else if (null != (otherGene = sortedGenes.floor(gm))) {
            if (otherGene.getOffsets().isOverlappedBy(gm.getOffsets())) {
                GeneTagger candidateTagger = gm.getTagger();
                if (candidateTagger == tagger) {
                    if (sortedGenes.remove(otherGene)) {
                        sortedGenes.add(gm);
                    }
                }
            } else {
                sortedGenes.add(gm);
            }
        } else if (null != (otherGene = sortedGenes.ceiling(gm))) {
            if (otherGene.getOffsets().isOverlappedBy(gm.getOffsets())) {
                GeneTagger candidateTagger = gm.getTagger();
                if (candidateTagger == tagger) {
                    if (sortedGenes.remove(otherGene)) {
                        sortedGenes.add(gm);
                    }
                }
            } else {
                sortedGenes.add(gm);
            }
        } else {
            sortedGenes.add(gm);
        }
    });
    genes = new OffsetMap<>();
    sortedGenes.forEach(g -> putGene(g));
}
/**
 * Returns the raw gene mentions in this document, without any filtering,
 * unification, aggregation or whatsoever and possibly from multiple taggers.
 *
 * @return All gene mentions in this document.
 */
public NavigableSet getAllGenes() {
    if (allGenes != null)
        return allGenes;
    return Collections.emptyNavigableSet();
}
/**
 * Adds the given gene mention into the {@link #genes} map by its offset. This action resets the gene sets of this document.
 * The mention is also added to {@link #allGenes}.
 *
 * @param gm The gene mention to add.
 */
private void putGene(GeneMention gm) {
    putGene(gm, true);
}
/**
 * Adds the given gene mention into the {@link #genes} map by its offset. This action resets the gene sets of this document.
 *
 * @param gm            The gene mention to add.
 * @param addToAllGenes Whether the mention should additionally be added to {@link #allGenes}.
 * @throws IllegalArgumentException if the mention has no offsets.
 */
private void putGene(GeneMention gm, boolean addToAllGenes) {
    if (gm.getOffsets() == null)
        throw new IllegalArgumentException("The passed gene mention does not specify text offsets: " + gm);
    if (genes == null)
        genes = new OffsetMap<>();
    putGene(gm, genes);
    if (addToAllGenes) {
        if (allGenes == null)
            allGenes = createAllGenesSet();
        try {
            allGenes.add(gm);
        } catch (Exception e) {
            // Report through the class logger (with context) instead of
            // printStackTrace()/System.err, then rethrow.
            log.error("Could not add gene mention {} (tagger: {}) to the allGenes set", gm, gm.getTagger(), e);
            throw e;
        }
    }
}
/**
 * Adds the given mention to the gold standard gene map.
 * The field starts out as a shared immutable empty map, so a mutable map is
 * substituted on the first addition.
 *
 * @throws IllegalArgumentException if the mention has no offsets.
 */
public void putGoldGene(GeneMention gm) {
    if (gm.getOffsets() == null)
        throw new IllegalArgumentException("The passed gene mention does not specify text offsets: " + gm);
    if (goldGenes.isEmpty())
        goldGenes = new OffsetMap<>();
    putGene(gm, goldGenes);
}
/**
 * Inserts the mention into the per-offset list of the given map (no
 * duplicates), wires the document back reference and invalidates the cached
 * gene sets.
 *
 * @throws IllegalArgumentException if the mention has no offsets.
 */
private void putGene(GeneMention gm, OffsetMap> geneMap) {
    assert geneMap != null;
    if (gm.getOffsets() == null)
        throw new IllegalArgumentException("The passed gene mention does not specify text offsets: " + gm);
    List gmList = geneMap.get(gm.getOffsets());
    if (gmList == null) {
        gmList = new ArrayList<>();
        geneMap.put(gm.getOffsets(), gmList);
    }
    if (!gmList.contains(gm))
        gmList.add(gm);
    gm.setGeneDocument(this);
    // Any structural change to the genes invalidates the cached gene sets.
    resetGeneSets();
}
// Replaces 'gene' with 'replacement' in the selected-genes list at the gene's
// offsets. NOTE(review): assumes 'gene' is present at its offsets; otherwise
// indexOf returns -1 and set() throws — confirm callers guarantee membership.
private void replaceGene(GeneMention gene, GeneMention replacement) {
    List gmList = genes.get(gene.getOffsets());
    int index = gmList.indexOf(gene);
    gmList.set(index, replacement);
}
/** Returns the document text covered by the given span. */
public String getCoveredText(Span span) {
    final Range offsets = span.getOffsets();
    return getCoveredText(offsets);
}
/** Returns the document text covered by the given offset range. */
public String getCoveredText(Range range) {
    final int begin = range.getMinimum();
    final int end = range.getMaximum();
    return getCoveredText(begin, end);
}
/** Returns the document text between the given begin (inclusive) and end (exclusive) offsets. */
public String getCoveredText(int begin, int end) {
    return documentText.substring(begin, end);
}
/**
 * Adds the given GeneMention to the set of currently selected genes but not to
 * the allGenes set.
 *
 * @param gm The gene mention to add.
 */
public void selectGene(GeneMention gm) {
    // NOTE(review): putGene(gm) delegates to putGene(gm, true), which DOES add
    // to allGenes — this contradicts the javadoc above; confirm intended.
    putGene(gm);
}
/** Returns the term normalizer shared with this document's gene mentions. */
public TermNormalizer getTermNormalizer() {
    return termNormalizer;
}
/** Sets the term normalizer. Note: already-added genes keep their previous normalizer. */
public void setTermNormalizer(TermNormalizer termNormalizer) {
    this.termNormalizer = termNormalizer;
}
/**
 * Removes the given gene from this GeneDocument. If the removal was successful, the gene sets are reset.
 *
 * @param gm The gene mention to remove.
 * @return true if the mention was removed from the selected genes map.
 */
public boolean removeGene(GeneMention gm) {
    boolean success = false;
    List genesAtOffset = getGeneMap().get(gm.getOffsets());
    if (genesAtOffset != null) {
        // BUG FIX: 'success' now reflects the actual removal of gm. Previously
        // it was only true when the whole offset list emptied, so removing one
        // of several co-located mentions silently skipped resetGeneSets() and
        // reported failure.
        success = genesAtOffset.remove(gm);
        if (genesAtOffset.isEmpty())
            getGeneMap().remove(gm.getOffsets());
        if (success)
            resetGeneSets();
    }
    // Guard against a never-initialized allGenes (a state getAllGenes() allows).
    if (allGenes != null)
        allGenes.remove(gm);
    return success;
}
/**
 * Builds an instance of {@link AhoCorasickOptimized} from the currently
 * selected genes. The instance is stored internally.
 *
 * @return A trie dictionary compiled from the names (text occurrence) of all
 * selected genes.
 */
public AhoCorasickOptimized getGeneNameDictionary() {
    // Lazily built once; note the dictionary entries are lower-cased.
    if (geneNameDictionary == null) {
        geneNameDictionary = new AhoCorasickOptimized(
                getGenes().map(GeneMention::getText).map(String::toLowerCase).collect(toList()));
    }
    return geneNameDictionary;
}
/**
 * Merges those gene sets that are connected via acronym resolution.
 * <p>
 * For each acronym whose short form and long form each overlap a gene mention,
 * the gene sets of the two mentions are merged (smaller into larger), unless
 * they differ in taxonomy ID or plurality. Afterwards the gene-to-set back
 * references are rebuilt and the AGGLOMERATION_BY_ACRONYMS state is set.
 */
public void agglomerateByAcronyms() {
    if (hasState(State.AGGLOMERATION_BY_ACRONYMS))
        return;
    Collection docAcronyms = getAcronyms().values();
    if (docAcronyms.isEmpty()) {
        return;
    }
    // Ensure the trivial one-gene-per-set structure exists before merging.
    if (geneSets == null)
        getGeneSets();
    for (Acronym acronym : getAcronyms().values()) {
        Collection gms = getOverlappingGenes(acronym.getOffsets()).collect(toList());
        if (gms.isEmpty())
            continue;
        String acronymText = getCoveredText(acronym);
        // Only the first overlapping mention on each side is considered.
        GeneMention gm = gms.stream().findFirst().get();
        AcronymLongform longform = acronym.getLongform();
        Collection longGms = getOverlappingGenes(longform.getOffsets()).collect(toList());
        if (longGms.isEmpty())
            continue;
        GeneMention longGm = longGms.stream().findFirst().get();
        if (gm.equals(longGm))
            continue;
        // This should avoid a too loose matching between genes and acronyms. For
        // example, the acronym HLH should not be taken to be the same as HLH462. But we
        // allow minor discrepancies for species prefixes.
        if (gm.getText().length() > acronymText.length() + 2 || !gm.getText().endsWith(acronymText)
                || (gm.getText().length() != acronymText.length()
                && !Character.isLowerCase(gm.getText().charAt(0))))
            continue;
        for (GeneSet gmSet : gm.getGeneSets()) {
            for (GeneSet longGmSet : longGm.getGeneSets()) {
                // BUG FIX: the original compared the mention collection
                // 'longGms' with 'gmSet' (never equal, different types); the
                // intended check is whether both loops reached the same set.
                if (longGmSet == gmSet)
                    continue;
                // Don't merge different taxonomy IDs
                if (!gmSet.getTaxId().equals(longGmSet.getTaxId()))
                    continue;
                // We don't want to merge plural and non-plural sets since this
                // is an important part of family recognition
                if (gmSet.isPlural() ^ longGmSet.isPlural())
                    continue;
                // now merge the smaller set into the larger one
                GeneSet from;
                GeneSet to;
                if (gmSet.size() > longGmSet.size()) {
                    from = longGmSet;
                    to = gmSet;
                } else {
                    from = gmSet;
                    to = longGmSet;
                }
                // may happen if we have overlapping / embedded acronyms (e.g. human
                // follicle stimulating hormone receptor (hFSH-R) has the acronyms
                // hFSH-R and FSH-R)
                if (from == to)
                    continue;
                to.addAll(from, false);
                from.clear();
            }
        }
    }
    // Drop emptied sets, renumber the rest and rebuild gene->set references.
    cleanAndEnumerateGeneSets();
    getGenes().forEach(GeneMention::clearGeneSets);
    geneSets.forEach(gs -> gs.forEach(gm -> gm.addGeneSet(gs)));
    addState(State.AGGLOMERATION_BY_ACRONYMS);
}
/**
 * Merges those gene sets that are connected via coreference resolution: all gene mentions that
 * participate in the same coreference chain are moved into a single gene set per taxonomy ID.
 */
public void agglomerateByCoreference() {
if (hasState(State.AGGLOMERATION_BY_COREFERENCES))
return;
if (coreferenceSets == null || coreferenceSets.isEmpty())
return;
// make sure the gene sets are initialized (getGeneSets() builds them lazily)
if (geneSets == null)
getGeneSets();
for (CoreferenceSet corefSet : coreferenceSets) {
// per chain: the first non-empty gene set encountered for a taxonomy ID becomes the target set
Map tax2geneset = new HashMap<>();
for (CoreferenceExpression corefExp : corefSet) {
Iterator geneIt = getOverlappingGenes(corefExp.getOffsets()).iterator();
while (geneIt.hasNext()) {
GeneMention gm = geneIt.next();
for (String taxId : gm.getTaxonomyIds()) {
GeneSet oldGs = gm.getGeneSets().getGeneSet(taxId);
// keep an already-chosen non-empty target set; otherwise register this mention's set
GeneSet gs = tax2geneset.compute(taxId, (k, v) -> v != null && !v.isEmpty() ? v : oldGs);
if (gs != oldGs) {
// move all members of the old set into the chain's target set; the copy is
// required because oldGs is cleared before its members are re-added
Set tmp = new HashSet<>(oldGs);
oldGs.clear();
tmp.forEach(g -> g.getGeneSets().remove(oldGs));
gs.addAll(tmp);
}
}
}
}
}
cleanAndEnumerateGeneSets();
// rebuild the mention-to-set back references from scratch
getGenes().forEach(GeneMention::clearGeneSets);
geneSets.forEach(gs -> gs.forEach(gm -> gm.addGeneSet(gs)));
addState(State.AGGLOMERATION_BY_COREFERENCES);
}
/**
 * Removes all empty gene sets and assigns consecutive numbers (starting at 0)
 * to the remaining ones.
 */
private void cleanAndEnumerateGeneSets() {
    int nextNumber = 0;
    for (Iterator it = getGeneSets().iterator(); it.hasNext(); ) {
        GeneSet candidate = (GeneSet) it.next();
        if (candidate.isEmpty())
            it.remove();
        else
            candidate.setNumber(nextNumber++);
    }
}
/**
 * Merges gene sets whose members share at least one normalized, tokenized name. Sets are only
 * merged when their plurality status agrees and — if requested — their taxonomy IDs match.
 *
 * @param dontMergeDifferentTaxonomyIds Don't merge gene sets with different taxonomy IDs. Only
 *                                      works if the gene sets already are uniform in their
 *                                      taxonomy IDs before calling this method.
 */
public void agglomerateByNames(boolean dontMergeDifferentTaxonomyIds) {
    if (hasState(State.AGGLOMERATION_BY_NAME))
        return;
    if (geneSets == null)
        getGeneSets();
    // Maps a gene mention to the normalized token sets of its name and all alternative names.
    // Loop-invariant (depends only on termNormalizer), thus hoisted out of the nested loops.
    Function<GeneMention, Stream<Set<String>>> gm2gnFunc = gm -> Stream
            .concat(Stream.of(gm.getGeneName()), gm.getGeneName().getAlternatives().stream())
            .map(gn -> termNormalizer.normalize(gn.getText()))
            .map(s -> Arrays.stream(s.split("\\s+"))
                    .collect(toSet()));
    for (GeneMention gm1 : getGenesIterable()) {
        for (GeneMention gm2 : getGenesIterable()) {
            for (String tax1 : gm1.getTaxonomyIds()) {
                for (String tax2 : gm2.getTaxonomyIds()) {
                    GeneSet iSet = gm1.getGeneSets().getGeneSet(tax1);
                    GeneSet jSet = gm2.getGeneSets().getGeneSet(tax2);
                    if (iSet == jSet)
                        continue;
                    if (iSet.isEmpty() || jSet.isEmpty())
                        continue;
                    // We don't want to merge plural and non-plural sets since this is an important
                    // part of family recognition
                    if (iSet.isPlural() ^ jSet.isPlural())
                        continue;
                    // Do not merge sets with different taxonomy IDs when requested. Since a
                    // GeneMention may have multiple taxonomy IDs, it is well possible for a
                    // GeneMention to end up in multiple gene sets.
                    // BUGFIX: this check was formerly ALSO performed unconditionally, which made
                    // the dontMergeDifferentTaxonomyIds parameter ineffective.
                    if (dontMergeDifferentTaxonomyIds && !iSet.getTaxId().equals(jSet.getTaxId()))
                        continue;
                    // Check if there are common names in both sets
                    Set<Set<String>> iNameSet = iSet.stream().flatMap(gm2gnFunc).collect(toSet());
                    Set<Set<String>> jNameSet = jSet.stream().flatMap(gm2gnFunc).collect(toSet());
                    if (!Sets.intersection(iNameSet, jNameSet).isEmpty()) {
                        iSet.addAll(jSet, false);
                        jSet.clear();
                    }
                }
            }
        }
    }
    cleanAndEnumerateGeneSets();
    // rebuild the mention-to-set back references from scratch
    getGenes().forEach(GeneMention::clearGeneSets);
    geneSets.forEach(gs -> gs.forEach(gm -> gm.addGeneSet(gs)));
    addState(State.AGGLOMERATION_BY_NAME);
}
/**
 * Adds alternative names to genes based on acronym resolution.
 * Sometimes, gene names include abbreviations that are introduced in a more general context of the
 * document. For example, the name elements occurring in the same document
 *
 * - monokine induced by interferon gamma
 * - interferon gamma (IFN-gamma)
 * - monokine induced by IFN-gamma
 *
 * would allow to infer that 1. and 3. are actually the same name. This method prepares for that
 * inference step by adding name variants where the abbreviation is expanded so that 3. would have
 * 1. as variant. Then, the {@link #agglomerateByNames(boolean)} method will agglomerate 1. and 3.
 * into the same gene set.
 */
public void generateGeneNameVariants() {
    if (hasState(State.GENE_VARIANTS_GENERATED))
        return;
    // Map acronym text -> long form text; on duplicate acronym texts the first mapping wins.
    // CHANGED: uses getAcronyms() like the sibling agglomerateByAcronyms() instead of the raw
    // field, which avoids an NPE when the acronyms have not been set directly.
    Map<String, String> acro2long = getAcronyms().values().stream()
            .collect(Collectors.toMap(acronym -> getCoveredText(acronym.getOffsets()),
                    acronym -> getCoveredText(acronym.getLongform().getOffsets()), (x, y) -> x));
    Map<String, String> acro2longvariants = new HashMap<>();
    for (String acro : acro2long.keySet()) {
        String longform = acro2long.get(acro);
        // plural normalization: additionally map the singular acronym to the singular long form
        if (acro.endsWith("s") && longform.endsWith("s"))
            acro2longvariants.put(acro.substring(0, acro.length() - 1), longform.substring(0, longform.length() - 1));
    }
    acro2long.putAll(acro2longvariants);
    AhoCorasickOptimized acroAc = new AhoCorasickOptimized(acro2long.keySet());
    // Streams a mention's name, all its alternatives, their normalized forms and, for each,
    // variants in which contained acronyms are replaced by their long forms.
    Function<GeneMention, Stream<GeneName>> gm2gnFunc = gm -> Stream
            .concat(Stream.of(gm.getGeneName()), gm.getGeneName().getAlternatives().stream())
            .flatMap(gn -> Stream.of(gn.getText(), termNormalizer.normalize(gn.getText())))
            .flatMap(s -> {
                Stream.Builder<String> variantBuilder = Stream.builder();
                variantBuilder.accept(s);
                acroAc.match(s, (start, end, match) ->
                        variantBuilder.accept(new StringBuilder(s).replace(start, end + 1, acro2long.get(match)).toString()));
                // simplified: the former build-collect-stream round trip was redundant
                return variantBuilder.build();
            })
            .filter(Objects::nonNull)
            .map(s -> new GeneName(s, termNormalizer));
    for (GeneMention gm : getGenesIterable()) {
        Set<String> alreadyKnownAlternatives = Stream.concat(
                Stream.of(termNormalizer.normalize(gm.getText())),
                gm.getGeneName().getAlternatives().stream().map(gn -> termNormalizer.normalize(gn.getText())))
                .collect(toSet());
        List<GeneName> variantsWithNonDesc = gm2gnFunc.apply(gm)
                .filter(gn -> alreadyKnownAlternatives.add(termNormalizer.normalize(gn.getText())))
                .collect(toList());
        variantsWithNonDesc.forEach(gm.getGeneName()::addAlternative);
        // TODO this is too complicated due to the fact that we cannot just normalize once and go
        // with the normalized form. Fix as soon as we can.
        List<GeneName> variantsWithoutNonDesc = gm2gnFunc.apply(gm)
                .map(GeneName::getText)
                .filter(s -> alreadyKnownAlternatives.add(termNormalizer.normalize(TermNormalizer.removeNondescriptives(s))))
                .map(s -> new GeneName(TermNormalizer.removeNondescriptives(s), termNormalizer))
                .collect(toList());
        for (var gn : variantsWithoutNonDesc)
            gm.getGeneName().addAlternative(gn);
    }
    addState(State.GENE_VARIANTS_GENERATED);
}
/**
 * Two GeneDocuments are equal iff they are of the exact same class and have equal IDs.
 */
@Override
public boolean equals(Object o) {
    if (o == this)
        return true;
    if (o == null)
        return false;
    if (getClass() != o.getClass())
        return false;
    GeneDocument other = (GeneDocument) o;
    return Objects.equals(id, other.id);
}
/**
 * Hash code consistent with {@link #equals(Object)}: based solely on the document ID.
 */
@Override
public int hashCode() {
    return Objects.hash(id);
}
/**
 * @return The MeSH headings of this document; an empty list if none have been set, never null.
 */
public Collection getMeshHeadings() {
    if (meshHeadings == null)
        return Collections.emptyList();
    return meshHeadings;
}
/**
 * @param meshHeadings The MeSH headings of this document.
 */
public void setMeshHeadings(Collection meshHeadings) {
    this.meshHeadings = meshHeadings;
}
/**
 * Streams all currently selected genes whose covered text equals {@code text} exactly.
 *
 * @param text The exact gene mention text to search for.
 * @return A stream of the matching gene mentions.
 */
public Stream getGenesWithText(String text) {
    return getGenes().filter(candidate -> candidate.getText().equals(text));
}
public Entry, SpeciesMention> getNearestPreviousSpeciesMention(Range range, String taxId) {
final OffsetMap speciesCandidates = species.getAllMentionCandidates();
Entry, SpeciesMention> lower = speciesCandidates.lowerEntry(range);
while (lower != null && ((!lower.getValue().getTaxId().equals(taxId) && taxId != null) || lower.getKey().isOverlappedBy(range))) {
lower = speciesCandidates.lowerEntry(lower.getKey());
}
if (lower != null && !lower.getValue().getTaxId().equals(taxId) && taxId != null)
lower = null;
return lower;
}
public Entry, SpeciesMention> getNearestPreviousSpeciesMention(Range range) {
return getNearestPreviousSpeciesMention(range, null);
}
public Entry, SpeciesMention> getNearestNextSpeciesMention(Range range, String taxId) {
final OffsetMap speciesCandidates = species.getAllMentionCandidates();
Entry, SpeciesMention> higher = speciesCandidates.higherEntry(range);
while (higher != null && ((!higher.getValue().getTaxId().equals(taxId) && taxId != null) || higher.getKey().isOverlappedBy(range))) {
higher = speciesCandidates.higherEntry(higher.getKey());
}
if (higher != null && !higher.getValue().getTaxId().equals(taxId) && taxId != null)
higher = null;
return higher;
}
/**
 * Lazily scans the document text for chromosome map location expressions (via
 * {@code GeneLocation.MAP_LOC_PATTERN}) and caches the result.
 *
 * @return All gene locations found in the document text.
 */
public Set findChromosomeLocations() {
    if (chromosomeLocations != null)
        return chromosomeLocations;
    chromosomeLocations = new HashSet<>();
    Matcher locationMatcher = GeneLocation.MAP_LOC_PATTERN.matcher(getDocumentText());
    while (locationMatcher.find())
        chromosomeLocations.add(new GeneLocation(locationMatcher));
    return chromosomeLocations;
}
/**
 * Convenience overload of {@link #getDocumentContext(Range, Set, boolean, int)} with no excluded
 * tokens and without excluding gene mention tokens.
 *
 * @param inputOffsets The offsets around which context tokens are collected.
 * @param numTokens The maximum number of context tokens to return.
 * @return A stream of context token texts.
 */
public Stream getDocumentContext(Range inputOffsets, int numTokens) {
return getDocumentContext(inputOffsets, Collections.emptySet(), false, numTokens);
}
/**
 * Collects up to {@code numTokens} context tokens around {@code inputOffsets}: roughly half before
 * and half after the given span, walking the document's PoS token offsets. Stop words (explicitly
 * excluded tokens and, optionally, all gene mention tokens) are omitted from the result.
 *
 * @param inputOffsets        The offsets around which context tokens are collected.
 * @param excludedTokens      Token texts that must not appear in the context.
 * @param excludeGeneMentions Whether tokens occurring in gene mention texts should be excluded.
 * @param numTokens           The maximum number of context tokens to return.
 * @return A stream of context token texts; excluded or unavailable positions are omitted.
 */
public Stream getDocumentContext(Range inputOffsets, Set excludedTokens, boolean excludeGeneMentions, int numTokens) {
    if (numTokens == 0)
        return Stream.empty();
    Set allstopwords = !excludedTokens.isEmpty() || excludeGeneMentions ? new HashSet<>() : Collections.emptySet();
    if (excludeGeneMentions) {
        if (geneMentionTexts == null) {
            // BUGFIX: accumulate the tokens of ALL gene mentions; formerly the set was
            // reassigned in each iteration so only the last mention's tokens survived.
            Set allGeneTokens = new HashSet<>();
            for (GeneMention gm : getGenesIterable())
                allGeneTokens.addAll(Arrays.asList(gm.getText().split("\\s+")));
            geneMentionTexts = allGeneTokens;
        }
        allstopwords.addAll(geneMentionTexts);
    }
    // BUGFIX: the explicitly excluded tokens were formerly never added to the stop word set,
    // silently ignoring the excludedTokens parameter.
    if (!excludedTokens.isEmpty())
        allstopwords.addAll(excludedTokens);
    String[] contextTokens = new String[numTokens];
    // walk left from the input span, filling the first half of the context array
    Range focusOffsets = inputOffsets;
    for (int i = (int) (numTokens / 2d); i >= 0; i--) {
        Range tokenOffset = posTags.lowerKey(focusOffsets);
        if (tokenOffset == null)
            break;
        String coveredText = getCoveredText(tokenOffset);
        if (allstopwords.isEmpty() || !allstopwords.contains(coveredText)) {
            contextTokens[i] = coveredText;
        }
        focusOffsets = tokenOffset;
    }
    // walk right from the input span, filling the second half of the context array
    focusOffsets = inputOffsets;
    for (int i = (int) (numTokens / 2d) + 1; i < numTokens; i++) {
        Range tokenOffset = posTags.higherKey(focusOffsets);
        if (tokenOffset == null)
            break;
        String coveredText = getCoveredText(tokenOffset);
        if (allstopwords.isEmpty() || !allstopwords.contains(coveredText)) {
            contextTokens[i] = coveredText;
        }
        focusOffsets = tokenOffset;
    }
    return Arrays.stream(contextTokens).filter(Objects::nonNull);
}
/**
 * Resets per-run state: clears the derived gene sets, removes the mention mapping result and
 * taxonomy occurrences from all selected genes and clears the processing-state markers.
 */
public void reset() {
resetGeneSets();
getGenes().forEach(gm -> {
gm.setMentionMappingResult(null);
gm.setTaxonomyOcurrences(HashMultimap.create());
});
state = new LinkedHashSet<>();
}
/**
 * @return Whether the gold gene annotations of this document carry character offsets.
 */
public boolean isGoldHasOffsets() {
return goldHasOffsets;
}
/**
 * @param goldHasOffsets Whether the gold gene annotations of this document carry character offsets.
 */
public void setGoldMentionsWithOffsets(boolean goldHasOffsets) {
this.goldHasOffsets = goldHasOffsets;
}
/**
 * Converts this document and its entities into the PubTator format. Rejected gene mentions are
 * omitted. Example lines:
 * <pre>
 * 10064899|t|Title text
 * 10064899|a|Abstract text
 * 10064899  100  118  Lysophospholipases  FamilyName
 * 10064899  360  403  lysophospholipid-specific lysophospholipase  Gene  10434
 * </pre>
 *
 * @return A string containing the PubTator format conversion.
 */
public String getPubTatorString() {
    String ls = System.lineSeparator();
    StringBuilder sb = new StringBuilder();
    if (documentTitle != null && !documentTitle.isBlank())
        sb.append(id).append("|t|").append(documentTitle).append(ls);
    // BUGFIX: the abstract line formerly tested documentText for presence but appended
    // abstractText, which could emit "null" or drop an existing abstract.
    if (abstractText != null && !abstractText.isBlank())
        sb.append(id).append("|a|").append(abstractText).append(ls);
    for (GeneMention gm : getGenesIterable()) {
        if (!gm.isRejected()) {
            sb.append(id).append("\t").append(gm.getBegin()).append("\t").append(gm.getEnd())
                    .append("\t").append(gm.getText()).append("\t").append("Gene").append(ls);
        }
    }
    return sb.toString();
}
/**
 * Sets the abstract text of this document. Used by {@link #getPubTatorString()}.
 *
 * @param abstractText The document's abstract.
 * @deprecated use offsets on the complete text
 */
@Deprecated
public void setDocumentAbstract(String abstractText) {
this.abstractText = abstractText;
}
public String getInspectionText(Function correctnessFunction, Map> renderFunctions) {
StringBuilder sb = new StringBuilder();
int pos = 0;
for (GeneMention gm : (Iterable) () -> getGenes().sorted(Comparator.comparingInt(GeneMention::getBegin)).iterator()) {
int begin = gm.getBegin();
sb.append(documentText, Math.min(pos, begin), begin);
// if (gm.hasExactCandidateMatch()) {
MentionCorrectness correctness = correctnessFunction.apply(gm);
Function geneMentionStringFunction = renderFunctions.get(correctness);
String apply = geneMentionStringFunction.apply(gm);
sb.append(apply);
// } else {
// sb.append(gm.getText());
// }
pos = gm.getEnd();
}
sb.append(documentText, pos, documentText.length());
return sb.toString();
}
/**
 * Renders the document text with every gene mention replaced by the output of the render function
 * associated with the correctness of its (positionally paired) gold ID.
 *
 * @param correctnessFunction Determines the correctness category for a gene mention and a gold ID.
 * @param renderFunctions     One render function per correctness category.
 * @return The document text with rendered gene mentions.
 */
public String getGenesetInspectionText(BiFunction correctnessFunction, Map> renderFunctions) {
StringBuilder sb = new StringBuilder();
int pos = 0;
for (GeneMention gm : (Iterable) () -> getGenes().sorted(Comparator.comparingInt(GeneMention::getBegin)).iterator()) {
// When there is no gold we still want to show the FP so we add the dummy NOID instead
List goldIdList = gm.hasGoldMentions() ? gm.getAllGoldIdsAsList() : List.of(GeneMention.NOID);
// pair the i-th overlapping gene with the i-th gold ID; excess genes reuse the last gold ID
List overlappingGenes = gm.getGeneDocument().getOverlappingGenes(gm.getOffsets()).collect(toList());
int index = overlappingGenes.indexOf(gm);
String goldId = goldIdList.get(Math.min(index, goldIdList.size() - 1));
int begin = gm.getBegin();
// Math.min guards against overlapping mentions where pos may already exceed begin
sb.append(documentText, Math.min(pos, begin), begin);
BiFunction geneMentionStringFunction = renderFunctions.get(correctnessFunction.apply(gm, goldId));
String apply = geneMentionStringFunction.apply(gm, goldId);
sb.append(apply);
pos = gm.getEnd();
}
sb.append(documentText, pos, documentText.length());
return sb.toString();
}
/**
 * @return The taxonomy IDs given by the gold annotations of this document.
 */
public Set getGoldTaxonomyIds() {
return goldTaxonomyIds;
}
/**
 * @param goldTaxonomyIds The taxonomy IDs given by the gold annotations of this document.
 */
public void setGoldTaxonomyIds(Set goldTaxonomyIds) {
this.goldTaxonomyIds = goldTaxonomyIds;
}
/**
 * Some gene corpora have annotated all gene occurrences while other focus on the most important genes
 * with regards to a specific task.
 *
 * @return Whether all genes in this document have been annotated or only a subset.
 */
public boolean isCompletelyAnnotated() {
return completelyAnnotated;
}
/**
 * @param completelyAnnotated Whether all genes in this document have been annotated (vs. a subset).
 */
public void setCompletelyAnnotated(boolean completelyAnnotated) {
this.completelyAnnotated = completelyAnnotated;
}
/**
 * @return Whether the offsets of the gold annotations were inferred rather than given by the corpus.
 */
public boolean isGoldOffsetsInferred() {
return goldOffsetsInferred;
}
/**
 * @param goldOffsetsInferred Whether the offsets of the gold annotations were inferred.
 */
public void setGoldOffsetsInferred(boolean goldOffsetsInferred) {
this.goldOffsetsInferred = goldOffsetsInferred;
}
/**
 * Discards the current gene selection so a new selection can be made.
 */
public void clearSelectedGenes() {
genes = null;
}
/**
 * @return The coreference chains of this document.
 */
public Collection getCoreferenceSets() {
return coreferenceSets;
}
/**
 * Sets the coreference chains of this document and indexes all contained coreference
 * expressions by their offsets.
 *
 * @param coreferenceSets The coreference chains.
 */
public void setCoreferenceRelations(Collection coreferenceSets) {
this.coreferenceSets = coreferenceSets;
coreferenceExpressions = new OffsetMap<>();
coreferenceSets.stream().flatMap(Collection::stream).forEach(coreferenceExpressions::put);
}
/**
 * Sets the appositions of this document, indexed by their offsets.
 *
 * @param appositions The appositions.
 */
public void setAppositions(Collection appositions) {
this.appositions = new OffsetMap(appositions);
}
/**
 * For each selected gene intersecting an apposition, adds the covered text of the other
 * apposition element as apposition context to the gene's name.
 */
public void setAppositionContextToGeneNames() {
for (GeneMention gm : getGenesIterable()) {
Apposition overlappingApposition = appositions.getFirstLargestIntersectionValue(gm.getOffsets());
if (overlappingApposition != null) {
// We actually do not make sure here that the "other" apposition element is actually the more general part
// of this apposition. It will mostly be.
Apposition inApposition = overlappingApposition.getOther();
gm.getGeneName().addAppositionContext(getCoveredText(inApposition));
}
}
}
/**
 * Determines the non-gene phrase overlapping the given offsets, if any.
 *
 * @param offsets The offsets to check for an overlapping non-gene phrase.
 * @return The overlapping non-gene phrase range, or the empty range [0, 0] if there is none.
 */
public Range getOverlappingNonGenePhrases(Range offsets) {
    if (nonGenePhrases.isEmpty())
        return Range.between(0, 0);
    Range located = nonGenePhrases.locate(offsets);
    if (located != null && located.isOverlappedBy(offsets))
        return located;
    return Range.between(0, 0);
}
/**
 * @return The offsets of phrases known not to denote genes.
 */
public OffsetSet getNonGenePhrases() {
return nonGenePhrases;
}
/**
 * @param nonGenePhrases The offsets of phrases known not to denote genes.
 */
public void setNonGenePhrases(OffsetSet nonGenePhrases) {
this.nonGenePhrases = nonGenePhrases;
}
/**
 * Marks every selected gene whose offsets overlap a known non-gene phrase as rejected with
 * reason {@code IS_NON_GENE_WORD}.
 */
public void rejectGenesOverlappingNonGenePhrases() {
    for (GeneMention candidate : getGenesIterable()) {
        Range overlappingPhrase = getOverlappingNonGenePhrases(candidate.getOffsets());
        // the empty range [0, 0] signals "no overlapping non-gene phrase"
        if (overlappingPhrase.getMaximum() > 0)
            candidate.reject(MentionMappingResult.RejectReason.IS_NON_GENE_WORD);
    }
}
/** Correctness categories used when rendering gene mentions in inspection output. */
public enum MentionCorrectness {CORRECT_ID, WRONG_ID, CANT_FIND}
/**
 * Processing-state markers recording which steps have already been applied to this document so
 * that idempotent methods (e.g. the agglomerate* methods) are not executed twice.
 * NOTE(review): ONTOLOGY_CLASS_MENTONS_SET is misspelled ("MENTONS") but kept for compatibility.
 */
public enum State {
GENES_SELECTED, SENTENCES_SET, SPECIES_MENTIONS_SET, ACRONYMS_SET, CHUNKS_SET, POS_SET,
/**
* Species hints/{@link GeneSpeciesOccurrence} markers have been set.
*/
SPECIES_CANDIDATES_ASSIGNED, SYNONYM_CANDIDATES_ASSIGNED,
/**
* Filtered out tax IDs that do not exist in NCBI Gene
*/
SPECIES_CANDIDATES_FILTERED,
/**
* Taxonomy IDs have been assigned to {@link MeshHeading} instances that represent a species.
*/
MESH_TAX_IDS_ASSIGNED, REFERENCE_SPECIES_ADDED, SPECIES_SCORES_ASSIGNED, AGGLOMERATION_BY_ACRONYMS, AGGLOMERATION_BY_NAME, AGGLOMERATION_BY_COREFERENCES, SPECIES_ASSIGNED_TO_GENES, ONTOLOGY_CLASS_MENTONS_SET, GENE_VARIANTS_GENERATED
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy