package de.julielab.geneexpbase.data;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import de.julielab.geneexpbase.TermNormalizer;
import de.julielab.geneexpbase.candidateretrieval.CandidateRetrieval;
import de.julielab.geneexpbase.candidateretrieval.QueryGenerator;
import de.julielab.geneexpbase.candidateretrieval.SynHit;
import de.julielab.geneexpbase.configuration.Configuration;
import de.julielab.geneexpbase.genemodel.*;
import de.julielab.geneexpbase.ioc.BaseModule;
import de.julielab.java.utilities.spanutils.OffsetMap;
import de.julielab.java.utilities.spanutils.OffsetSet;
import org.apache.commons.lang3.Range;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.inject.Inject;
import javax.inject.Named;
import java.io.IOException;
import java.util.*;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;
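/**
 * Loads corpus documents and their annotations into {@link GeneDocument} instances.
 * <p>
 * A minimal usage sketch, not part of the original documentation: the loader is obtained
 * through dependency injection (see the {@code @Inject} constructor below). The way the
 * injector and the {@code DocumentSourceFiles} instance are set up here is an assumption
 * for illustration only.
 * <pre>{@code
 * Injector injector = Guice.createInjector(new BaseModule()); // assumed module setup
 * DocumentLoader loader = injector.getInstance(DocumentLoader.class);
 * DocumentSourceFiles files = ...; // paths to gold list, predicted genes, sentences etc.
 * try (Stream<GeneDocument> documents = loader.getDocuments(files)) {
 *     documents.forEach(d -> System.out.println("Loaded document " + d.getId()));
 * }
 * }</pre>
 */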
public class DocumentLoader {
    private final static Logger log = LoggerFactory.getLogger(DocumentLoader.class);
    private final CandidateRetrieval candidateRetrieval;
    private final QueryGenerator queryGeneratorForMentionInference;
    private final GeneOrthologs geneOrthologs;
    private final Configuration configuration;
    private final TermNormalizer normalizer;

    @Inject
    public DocumentLoader(CandidateRetrieval candidateRetrieval, @Named(BaseModule.ID_INFERENCE) QueryGenerator queryGeneratorForMentionInference, GeneOrthologs geneOrthologs, TermNormalizer normalizer, Configuration configuration) {
        this.candidateRetrieval = candidateRetrieval;
        this.queryGeneratorForMentionInference = queryGeneratorForMentionInference;
        this.geneOrthologs = geneOrthologs;
        this.normalizer = normalizer;
        this.configuration = configuration;
    }
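
    /**
     * Attaches gold mentions to the predicted mentions they overlap with. The gold mentions are
     * first grouped by document ID and indexed by their offsets in an {@link OffsetMap}; every
     * predicted mention of a document in {@code finalDocIds} then receives all gold mentions
     * whose offsets overlap its own.
     */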
    private static void setGoldMentionsToPredictions(Collection<? extends Collection<GeneMention>> goldData, Multimap<String, GeneMention> predictedGeneMentions, Set<String> finalDocIds) {
        // Group the gold mentions by document ID and, within each document, index them by their offsets.
        Map<String, OffsetMap<List<GeneMention>>> goldGeneMap = goldData.stream().flatMap(Collection::stream).collect(Collectors.groupingBy(GeneMention::getDocId, HashMap::new, Collectors.toMap(GeneMention::getOffsets, gm -> {
            List<GeneMention> l = new ArrayList<>();
            l.add(gm);
            return l;
        }, (l1, l2) -> {
            l1.addAll(l2);
            return l1;
        }, OffsetMap::new)));
        for (GeneMention gm : predictedGeneMentions.values()) {
            if (finalDocIds.contains(gm.getDocId())) {
                if (goldGeneMap.get(gm.getDocId()) != null)
                    gm.setOverlappingGoldMentions(new ArrayList<>(goldGeneMap.get(gm.getDocId()).getOverlapping(gm).values().stream().flatMap(Collection::stream).collect(Collectors.toList())));
            }
        }
    }
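
    /**
     * Reads the corpus files referenced by {@code files} - gold annotations, predicted gene mentions,
     * acronyms, coreferences, appositions, document texts, sentences, species, chunks, POS tags,
     * ontology class mentions and MeSH headings - and assembles one {@link GeneDocument} per document ID.
     * If the gold list has no offsets, document-level labels are optionally inferred to mentions.
     * The returned stream is lazy; each document is built on consumption.
     */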
    public Stream<GeneDocument> getDocuments(DocumentSourceFiles files) throws DocumentLoadingException {
        try {
            boolean goldHasOffsets = false;
            boolean inferDocLevelAnnotationsToMentions = false;
            Multimap<String, GeneMention> goldData;
            try {
                goldData = CorpusReader.readMentionsWithOffsets(files.getGoldGeneList());
                goldHasOffsets = true;
            } catch (Exception e) {
                // This error occurs when the gold data does not have offsets.
                goldData = CorpusReader.readGoldIds(files.getGoldGeneList());
                inferDocLevelAnnotationsToMentions = files.getInferDocumentLevelLabelsToMentions();
            }
            // Filter out gold mentions without an ID.
            // The GNormPlus BC2 test data has multiple entries without an ID, for example.
            Iterator<GeneMention> goldIt = goldData.values().iterator();
            while (goldIt.hasNext()) {
                GeneMention goldGm = goldIt.next();
                if (goldGm.getIds().isEmpty() || goldGm.getIds().get(0).equals(GeneMention.NOID))
                    goldIt.remove();
            }
            goldData.values().forEach(gm -> gm.setTagger(GeneMention.GeneTagger.GOLD));
            goldData.values().forEach(gm -> gm.setSpecificType(gm.getSpecificType() != null ? gm.getSpecificType() : files.getDefaultEntityType()));
            final boolean withOffsets = goldHasOffsets;
            // Default entity types; note that the actual filter below uses files.getAllowedGeneTypes().
            List<String> allowedTypes = Arrays.asList("Gene", "protein", "protein_complex", "protein_enum", "protein_familiy_or_group");
            Multimap<String, GeneMention> predictedGeneMentions = CorpusReader
                    .readMixedFileForGenesWithOffsets(files.getPredictedGenesPath(), files.getAllowedGeneTypes(), files.getTaggersToUse());
//            Multimap<String, GeneMention> gazetteerMatchesWithIds = CorpusReader
//                    .readMixedFileForGenesWithOffsets(files.getGazetteerMatchesWithIds(), files.getAllowedGeneTypes(), files.getTaggersToUse());
            Multimap<String, Acronym> acronyms = CorpusReader.readAcronymAnnotations(files.getAcronymsPath());
            Multimap<String, CoreferenceRelation> coreferences = CorpusReader.readCoreferenceAnnotations(files.getCorefPath());
            Multimap<String, Apposition> appositions = CorpusReader.readAppositionAnnotations(files.getAppositionsPath());
            Map<String, String> documentContexts = CorpusReader.readGeneContexts(files.getDocTextPath());
            Multimap<String, Range<Integer>> sentences = CorpusReader.readMixedFileForSentenceOffsets(files.getSentencesPath());
            Multimap<String, Range<Integer>> nonGenePhrases = CorpusReader.readMixedFileForNonGenePhraseOffsets(files.getSentencesPath());
            Map<String, OffsetMap<SpeciesMention>> species = CorpusReader.readMixedFileForTextSpecies(files.getSpeciesPath());
            Map<String, OffsetMap<Chunk>> chunks = CorpusReader.readMixedFileForChunkOffsets(files.getChunksPath());
            Map<String, OffsetMap<OntologyClassMention>> ontologyClassMentions = CorpusReader.readMixedFileForOntologyClassMentions(files.getOntologyMentionsPath());
            Multimap<String, PosTag> posTags = CorpusReader.readMixedFileForPosTags(files.getPosPath());
            Multimap<String, MeshHeading> meshHeadings = HashMultimap.create();
            if (files.hasMesh())
                meshHeadings = CorpusReader.readMeshHeadings(files.getMeshPath());
            if (files.hasSubstances())
                meshHeadings.putAll(CorpusReader.readMeshHeadings(files.getSubstancesPath()));
            if (predictedGeneMentions.isEmpty())
                throw new IllegalArgumentException("Could not find any entity of types '" + allowedTypes + "' of tagger '" + files.getTaggersToUse() + "' in " + files.getPredictedGenesPath() + ".");
            Multimap<String, MeshHeading> finalMeshHeadings = meshHeadings;
            boolean isSpeciesCorpus = files.isSpeciesCorpus();
            Set<String> docIds = documentContexts.keySet();
            Set<String> finalDocIds = docIds;
//            Set<String> finalDocIds = Set.of("10215850");
//            Set<String> finalDocIds = Files.lines(Path.of("bctest_top7_last4_first1.pmids")).collect(Collectors.toSet());
            if (files.isHasGeneIds()) {
                // Set the taxonomy ID for the gold data
                goldData.values().stream().filter(gm -> finalDocIds.contains(gm.getDocId())).forEach(gm -> {
                    // Map replaced IDs to their new ID
                    gm.setIds(gm.getIds().stream().map(id -> GeneInformation.REPLACED.getOrDefault(id, id)).collect(Collectors.toList()));
                    gm.setTaxonomyId(candidateRetrieval.mapGeneIdToTaxId(gm.getGoldMentionId()));
                    if (gm.getTaxonomyId().isBlank())
                        log.warn("Could not retrieve the taxonomy of the gold gene ID {}", gm.getIds());
//                    gm.setTaxonomyIds(Collections.singletonList(gm.getTaxonomyId()));
                });
            } else if (files.isSpeciesCorpus()) {
                // For species corpora, the annotated IDs are taxonomy IDs.
                goldData.values().forEach(gm -> gm.setTaxonomyIds(gm.getIds()));
            }
            Multimap<String, GeneMention> finalGoldData = goldData;
            boolean finalGoldHasOffsets = goldHasOffsets;
            boolean finalInferDocToMention = inferDocLevelAnnotationsToMentions;
            return finalDocIds.stream()
                    .map(docId -> getGeneDocument(normalizer, candidateRetrieval, finalGoldData, predictedGeneMentions, acronyms, coreferences, appositions, documentContexts, sentences, nonGenePhrases, species, chunks, posTags, ontologyClassMentions, finalMeshHeadings, isSpeciesCorpus, geneOrthologs, withOffsets, finalInferDocToMention, docId))
                    .peek(d -> {
                        if (finalGoldHasOffsets || finalInferDocToMention)
                            setGoldMentionsToPredictions(d.getGoldGenes().values(), predictedGeneMentions, finalDocIds);
                        d.setCompletelyAnnotated(files.isCompletelyAnnotated());
                    });
        } catch (IOException e) {
            throw new DocumentLoadingException(e);
        }
    }
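
    /**
     * Assembles the {@link GeneDocument} for {@code docId} from the given per-document annotation
     * maps: sets the document text, title and abstract offsets, annotations and predicted genes,
     * and attaches the gold data either as offset-based mentions or as document-level ID sets.
     */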
    public GeneDocument getGeneDocument(TermNormalizer normalizer, CandidateRetrieval candidateRetrieval, Multimap<String, GeneMention> finalGoldData, Multimap<String, GeneMention> predictedGeneMentions, Multimap<String, Acronym> acronyms, Multimap<String, CoreferenceRelation> coreferences, Multimap<String, Apposition> appositions, Map<String, String> documentContexts, Multimap<String, Range<Integer>> sentences, Multimap<String, Range<Integer>> nonGenePhrases, Map<String, OffsetMap<SpeciesMention>> species, Map<String, OffsetMap<Chunk>> chunks, Multimap<String, PosTag> posTags, Map<String, OffsetMap<OntologyClassMention>> ontologyClassMentions, Multimap<String, MeshHeading> meshHeadings, boolean isSpeciesCorpus, GeneOrthologs geneOrthologs, boolean goldHasOffsets, boolean inferDocLevelLabelsToMentions, String docId) {
        GeneDocument document = new GeneDocument(docId);
        document.setTermNormalizer(normalizer);
        document.setAcronyms(new HashSet<>(acronyms.get(docId)));
        document.setCoreferenceRelations(coreferences.get(docId));
        document.setAppositions(appositions.get(docId));
        // The BioCreative II GN documents always have two lines: the first is the title,
        // the second is the abstract.
        String[] textSplit = Stream.of(documentContexts.get(docId).split("\\n")).filter(Predicate.not(String::isBlank)).toArray(String[]::new);
        String title = null;
        String abstractText = null;
        if (textSplit.length > 1) {
            title = textSplit[0];
            abstractText = textSplit[1];
        } else if (textSplit.length == 1) {
            // The longest title in the BioCreative II GN training documents has 284 characters. Rounded
            // up to 300, this serves as an indicator of whether the single line is a title or the abstract.
            if (textSplit[0].length() > 300) {
                abstractText = textSplit[0];
            } else {
                title = textSplit[0];
            }
        }
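        // Example for the heuristic above (hypothetical input): a single line like
        // "BRCA1 mutations in breast cancer." is well below 300 characters and becomes the title,
        // while a single 1500-character line would be taken to be the abstract.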
        if (title != null)
            document.setTitleOffsets(Range.between(0, title.length()));
        // Guard against documents without a title: getTitleOffsets() would be null in that case,
        // and the abstract then starts at the beginning of the text.
        int abstractStart = title != null ? document.getTitleOffsets().getMaximum() + 1 : 0;
        document.setAbstractOffsets(Range.between(abstractStart, documentContexts.get(docId).length()));
        document.setDocumentTitle(title);
        document.setDocumentAbstract(abstractText);
        document.setDocumentText(documentContexts.get(docId));
        document.setChunks(chunks.get(docId));
        document.setOntologyClassMentions(ontologyClassMentions.get(docId));
        document.setPosTags(posTags.get(docId));
        // Use the title range for the species candidates; fall back to 0 when there is no title.
        document.setSpecies(new SpeciesCandidates(title != null ? document.getTitleOffsets().getMinimum() : 0,
                title != null ? document.getTitleOffsets().getMaximum() : 0,
                Collections.emptySet(), species.get(docId)));
        document.setSentences(new OffsetSet(sentences.get(docId)));
        document.setNonGenePhrases(new OffsetSet(nonGenePhrases.get(docId)));
        document.setMeshHeadings(meshHeadings.get(docId));
        document.setGenes(new HashSet<>(predictedGeneMentions.get(docId)));
        // Reset the IDs of the predicted mentions; they are assigned during disambiguation.
        document.getAllGenes().forEach(gm -> {
            gm.setDocumentContext(document.getDocumentText());
            gm.setId(GeneMention.NOID);
            gm.setTaxonomyId(null);
        });
        document.selectAllGenes();
//        finalGoldData.values().forEach(goldGm -> goldGm.setGeneDocument(document));
        if (goldHasOffsets) {
            finalGoldData.get(docId).forEach(document::putGoldGene);
            document.setGoldIds(finalGoldData.get(docId).stream().map(GeneMention::getIds).flatMap(Collection::stream).collect(Collectors.toSet()));
        } else {
            Set<String> goldIds = finalGoldData.get(docId).stream().map(GeneMention::getIds).flatMap(Collection::stream).collect(Collectors.toSet());
            if (inferDocLevelLabelsToMentions) {
                inferDocumentLevelLabelsToMentions(document, goldIds, candidateRetrieval, geneOrthologs, isSpeciesCorpus);
            }
            document.setGoldIds(goldIds);
            document.setGoldTaxonomyIds(goldIds.stream().map(candidateRetrieval::mapGeneIdToTaxId).collect(Collectors.toSet()));
        }
        // EF 29/03/2021: This was commented out, but I don't know why. We need this for the species optimization.
        if (isSpeciesCorpus) {
            document.getGoldGenes().values().stream().flatMap(Collection::stream).forEach(goldgm -> {
                goldgm.setTaxonomyIds(goldgm.getIds());
                goldgm.setId(null);
            });
            document.setGoldTaxonomyIds(new HashSet<>(document.getGoldIds()));
        }
        // We mark this step as done even when we do not perform it: we did not forget it,
        // we just did not want to do it here.
        document.addState(GeneDocument.State.REFERENCE_SPECIES_ADDED);
        document.setGoldMentionsWithOffsets(goldHasOffsets || inferDocLevelLabelsToMentions);
        return document;
    }
    /**
     * Makes a best effort to find the correct gene mention given the document's mentions and the set of valid IDs known for the document.
     *
     * @param document           The document to create offset-based gold annotations for.
     * @param goldIds            The known positive IDs for this document.
     * @param candidateRetrieval A candidate retrieval instance.
     * @param geneOrthologs      The gene ortholog mappings, used to check whether the best candidate is ortholog-ambiguous with another gold ID.
     * @param isSpeciesCorpus    Whether this is a species corpus. If so, the input IDs will be used as taxonomy IDs. Otherwise, gene IDs are assumed.
     */
    public void inferDocumentLevelLabelsToMentions(GeneDocument document, Set<String> goldIds, CandidateRetrieval candidateRetrieval, GeneOrthologs geneOrthologs, boolean isSpeciesCorpus) {
        final boolean REMOVE_AMBIGUOUS_GENES = false;
        // It actually happened that there were more than 1024 gold IDs (in gene2pubmed data),
        // which broke the Lucene boolean clauses limit. Thus, the gold IDs are queried in batches.
        int batchSize = 1024;
        if (goldIds.size() > batchSize)
            log.debug("Document {} has {} goldIds", document.getId(), goldIds.size());
        List<GeneMention> unclearGms = new ArrayList<>();
        for (GeneMention gm : document.getGenesIterable()) {
            List<SynHit> candidates = new ArrayList<>();
            Iterator<String> goldIdIt = goldIds.iterator();
            while (goldIdIt.hasNext()) {
                List<String> batch = new ArrayList<>(batchSize);
                for (int i = 0; i < batchSize && goldIdIt.hasNext(); i++)
                    batch.add(goldIdIt.next());
                try {
                    candidates.addAll(candidateRetrieval.getCandidates(gm, batch, Collections.emptySet(), queryGeneratorForMentionInference));
                    if (goldIds.size() > batchSize)
                        log.debug("Retrieved {} candidates for gene {} in document {}", candidates.size(), gm.getText(), document.getId());
                } catch (Exception e) {
                    log.error("Could not retrieve candidates for gene {} and goldIds {}.", gm, goldIds);
                    throw e;
                }
            }
            if (!candidates.isEmpty()) {
                SynHit bestHit = candidates.get(0);
                Set<String> orthologIds = geneOrthologs.getOrthologs(bestHit.getId()).stream().map(record -> record.getGeneIdNotEqualTo(bestHit.getId())).collect(Collectors.toSet());
                orthologIds.remove(bestHit.getId());
                Set<String> orthologsInGold = Sets.intersection(orthologIds, goldIds);
                if (!REMOVE_AMBIGUOUS_GENES || orthologsInGold.isEmpty()) {
                    GeneMention gold = new GeneMention(gm);
                    gold.setDocId(document.getId());
                    gold.setIds(List.of(bestHit.getId()));
                    gold.setTaxonomyIds(gold.getIds().stream().map(candidateRetrieval::mapGeneIdToTaxId).collect(Collectors.toList()));
                    document.putGoldGene(gold);
                    gm.setOverlappingGoldMentions(Collections.singletonList(gold));
                } else {
                    unclearGms.add(gm);
                }
            } else {
                unclearGms.add(gm);
            }
        }
        unclearGms.forEach(document::removeGene);
        document.setGoldMentionsWithOffsets(true);
        document.setGoldOffsetsInferred(true);
    }
}