package de.julielab.geneexpbase.data;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import de.julielab.geneexpbase.TermNormalizer;
import de.julielab.geneexpbase.candidateretrieval.CandidateRetrieval;
import de.julielab.geneexpbase.candidateretrieval.QueryGenerator;
import de.julielab.geneexpbase.candidateretrieval.SynHit;
import de.julielab.geneexpbase.configuration.Configuration;
import de.julielab.geneexpbase.genemodel.*;
import de.julielab.geneexpbase.ioc.BaseModule;
import de.julielab.java.utilities.spanutils.OffsetMap;
import de.julielab.java.utilities.spanutils.OffsetSet;
import org.apache.commons.lang3.Range;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.inject.Inject;
import javax.inject.Named;
import java.io.IOException;
import java.util.*;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;
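/**
 * Loads corpus documents and their annotations into {@link GeneDocument} instances.
 * <p>
 * A minimal usage sketch, not part of the original documentation: the loader is obtained
 * through dependency injection (see the {@code @Inject} constructor below). The way the
 * injector and the {@code DocumentSourceFiles} instance are set up here is an assumption
 * for illustration only.
 * <pre>{@code
 * Injector injector = Guice.createInjector(new BaseModule()); // assumed module setup
 * DocumentLoader loader = injector.getInstance(DocumentLoader.class);
 * DocumentSourceFiles files = ...; // paths to gold list, predicted genes, sentences etc.
 * try (Stream<GeneDocument> documents = loader.getDocuments(files)) {
 *     documents.forEach(d -> System.out.println("Loaded document " + d.getId()));
 * }
 * }</pre>
 */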
public class DocumentLoader {
    private final static Logger log = LoggerFactory.getLogger(DocumentLoader.class);
    private final CandidateRetrieval candidateRetrieval;
    private final QueryGenerator queryGeneratorForMentionInference;
    private final GeneOrthologs geneOrthologs;
    private final Configuration configuration;
    private final TermNormalizer normalizer;

    @Inject
    public DocumentLoader(CandidateRetrieval candidateRetrieval, @Named(BaseModule.ID_INFERENCE) QueryGenerator queryGeneratorForMentionInference, GeneOrthologs geneOrthologs, TermNormalizer normalizer, Configuration configuration) {
        this.candidateRetrieval = candidateRetrieval;
        this.queryGeneratorForMentionInference = queryGeneratorForMentionInference;
        this.geneOrthologs = geneOrthologs;
        this.normalizer = normalizer;
        this.configuration = configuration;
    }
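
    /**
     * Attaches gold mentions to the predicted mentions they overlap with. The gold mentions are
     * first grouped by document ID and indexed by their offsets in an {@link OffsetMap}; every
     * predicted mention of a document in {@code finalDocIds} then receives all gold mentions
     * whose offsets overlap its own.
     */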
    private static void setGoldMentionsToPredictions(Collection<? extends Collection<GeneMention>> goldData, Multimap<String, GeneMention> predictedGeneMentions, Set<String> finalDocIds) {
        // Group the gold mentions by document ID and, within each document, index them by their offsets.
        Map<String, OffsetMap<List<GeneMention>>> goldGeneMap = goldData.stream().flatMap(Collection::stream).collect(Collectors.groupingBy(GeneMention::getDocId, HashMap::new, Collectors.toMap(GeneMention::getOffsets, gm -> {
            List<GeneMention> l = new ArrayList<>();
            l.add(gm);
            return l;
        }, (l1, l2) -> {
            l1.addAll(l2);
            return l1;
        }, OffsetMap::new)));
        for (GeneMention gm : predictedGeneMentions.values()) {
            if (finalDocIds.contains(gm.getDocId())) {
                if (goldGeneMap.get(gm.getDocId()) != null)
                    gm.setOverlappingGoldMentions(new ArrayList<>(goldGeneMap.get(gm.getDocId()).getOverlapping(gm).values().stream().flatMap(Collection::stream).collect(Collectors.toList())));
            }
        }
    }
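
    /**
     * Reads the corpus files referenced by {@code files} - gold annotations, predicted gene mentions,
     * acronyms, coreferences, appositions, document texts, sentences, species, chunks, POS tags,
     * ontology class mentions and MeSH headings - and assembles one {@link GeneDocument} per document ID.
     * If the gold list has no offsets, document-level labels are optionally inferred to mentions.
     * The returned stream is lazy; each document is built on consumption.
     */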
    public Stream<GeneDocument> getDocuments(DocumentSourceFiles files) throws DocumentLoadingException {
        try {
            boolean goldHasOffsets = false;
            boolean inferDocLevelAnnotationsToMentions = false;
            Multimap<String, GeneMention> goldData;
            try {
                goldData = CorpusReader.readMentionsWithOffsets(files.getGoldGeneList());
                goldHasOffsets = true;
            } catch (Exception e) {
                // This error occurs when the gold data does not have offsets.
                goldData = CorpusReader.readGoldIds(files.getGoldGeneList());
                inferDocLevelAnnotationsToMentions = files.getInferDocumentLevelLabelsToMentions();
            }
            // Filter out gold mentions without an ID.
            // The GNormPlus BC2 test data has multiple entries without an ID, for example.
            Iterator<GeneMention> goldIt = goldData.values().iterator();
            while (goldIt.hasNext()) {
                GeneMention goldGm = goldIt.next();
                if (goldGm.getIds().isEmpty() || goldGm.getIds().get(0).equals(GeneMention.NOID))
                    goldIt.remove();
            }
            goldData.values().forEach(gm -> gm.setTagger(GeneMention.GeneTagger.GOLD));
            goldData.values().forEach(gm -> gm.setSpecificType(gm.getSpecificType() != null ? gm.getSpecificType() : files.getDefaultEntityType()));
            final boolean withOffsets = goldHasOffsets;
            // Default entity types; note that the actual filter below uses files.getAllowedGeneTypes().
            List<String> allowedTypes = Arrays.asList("Gene", "protein", "protein_complex", "protein_enum", "protein_familiy_or_group");
            Multimap<String, GeneMention> predictedGeneMentions = CorpusReader
                    .readMixedFileForGenesWithOffsets(files.getPredictedGenesPath(), files.getAllowedGeneTypes(), files.getTaggersToUse());
//            Multimap<String, GeneMention> gazetteerMatchesWithIds = CorpusReader
//                    .readMixedFileForGenesWithOffsets(files.getGazetteerMatchesWithIds(), files.getAllowedGeneTypes(), files.getTaggersToUse());
            Multimap<String, Acronym> acronyms = CorpusReader.readAcronymAnnotations(files.getAcronymsPath());
            Multimap<String, CoreferenceRelation> coreferences = CorpusReader.readCoreferenceAnnotations(files.getCorefPath());
            Multimap<String, Apposition> appositions = CorpusReader.readAppositionAnnotations(files.getAppositionsPath());
            Map<String, String> documentContexts = CorpusReader.readGeneContexts(files.getDocTextPath());
            Multimap<String, Range<Integer>> sentences = CorpusReader.readMixedFileForSentenceOffsets(files.getSentencesPath());
            Multimap<String, Range<Integer>> nonGenePhrases = CorpusReader.readMixedFileForNonGenePhraseOffsets(files.getSentencesPath());
            Map<String, OffsetMap<SpeciesMention>> species = CorpusReader.readMixedFileForTextSpecies(files.getSpeciesPath());
            Map<String, OffsetMap<Chunk>> chunks = CorpusReader.readMixedFileForChunkOffsets(files.getChunksPath());
            Map<String, OffsetMap<OntologyClassMention>> ontologyClassMentions = CorpusReader.readMixedFileForOntologyClassMentions(files.getOntologyMentionsPath());
            Multimap<String, PosTag> posTags = CorpusReader.readMixedFileForPosTags(files.getPosPath());
            Multimap<String, MeshHeading> meshHeadings = HashMultimap.create();
            if (files.hasMesh())
                meshHeadings = CorpusReader.readMeshHeadings(files.getMeshPath());
            if (files.hasSubstances())
                meshHeadings.putAll(CorpusReader.readMeshHeadings(files.getSubstancesPath()));
            if (predictedGeneMentions.isEmpty())
                throw new IllegalArgumentException("Could not find any entity of types '" + allowedTypes + "' of tagger '" + files.getTaggersToUse() + "' in " + files.getPredictedGenesPath() + ".");
            Multimap<String, MeshHeading> finalMeshHeadings = meshHeadings;
            boolean isSpeciesCorpus = files.isSpeciesCorpus();
            Set<String> docIds = documentContexts.keySet();
            Set<String> finalDocIds = docIds;
//            Set<String> finalDocIds = Set.of("10215850");
//            Set<String> finalDocIds = Files.lines(Path.of("bctest_top7_last4_first1.pmids")).collect(Collectors.toSet());
            if (files.isHasGeneIds()) {
                // Set the taxonomy ID for the gold data
                goldData.values().stream().filter(gm -> finalDocIds.contains(gm.getDocId())).forEach(gm -> {
                    // Map replaced IDs to their new ID
                    gm.setIds(gm.getIds().stream().map(id -> GeneInformation.REPLACED.getOrDefault(id, id)).collect(Collectors.toList()));
                    gm.setTaxonomyId(candidateRetrieval.mapGeneIdToTaxId(gm.getGoldMentionId()));
                    if (gm.getTaxonomyId().isBlank())
                        log.warn("Could not retrieve the taxonomy of the gold gene ID {}", gm.getIds());
//                    gm.setTaxonomyIds(Collections.singletonList(gm.getTaxonomyId()));
                });
            } else if (files.isSpeciesCorpus()) {
                // For species corpora, the annotated IDs are taxonomy IDs.
                goldData.values().forEach(gm -> gm.setTaxonomyIds(gm.getIds()));
            }
            Multimap<String, GeneMention> finalGoldData = goldData;
            boolean finalGoldHasOffsets = goldHasOffsets;
            boolean finalInferDocToMention = inferDocLevelAnnotationsToMentions;
            return finalDocIds.stream()
                    .map(docId -> getGeneDocument(normalizer, candidateRetrieval, finalGoldData, predictedGeneMentions, acronyms, coreferences, appositions, documentContexts, sentences, nonGenePhrases, species, chunks, posTags, ontologyClassMentions, finalMeshHeadings, isSpeciesCorpus, geneOrthologs, withOffsets, finalInferDocToMention, docId))
                    .peek(d -> {
                        if (finalGoldHasOffsets || finalInferDocToMention)
                            setGoldMentionsToPredictions(d.getGoldGenes().values(), predictedGeneMentions, finalDocIds);
                        d.setCompletelyAnnotated(files.isCompletelyAnnotated());
                    });
        } catch (IOException e) {
            throw new DocumentLoadingException(e);
        }
    }
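
    /**
     * Assembles the {@link GeneDocument} for {@code docId} from the given per-document annotation
     * maps: sets the document text, title and abstract offsets, annotations and predicted genes,
     * and attaches the gold data either as offset-based mentions or as document-level ID sets.
     */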
    public GeneDocument getGeneDocument(TermNormalizer normalizer, CandidateRetrieval candidateRetrieval, Multimap<String, GeneMention> finalGoldData, Multimap<String, GeneMention> predictedGeneMentions, Multimap<String, Acronym> acronyms, Multimap<String, CoreferenceRelation> coreferences, Multimap<String, Apposition> appositions, Map<String, String> documentContexts, Multimap<String, Range<Integer>> sentences, Multimap<String, Range<Integer>> nonGenePhrases, Map<String, OffsetMap<SpeciesMention>> species, Map<String, OffsetMap<Chunk>> chunks, Multimap<String, PosTag> posTags, Map<String, OffsetMap<OntologyClassMention>> ontologyClassMentions, Multimap<String, MeshHeading> meshHeadings, boolean isSpeciesCorpus, GeneOrthologs geneOrthologs, boolean goldHasOffsets, boolean inferDocLevelLabelsToMentions, String docId) {
        GeneDocument document = new GeneDocument(docId);
        document.setTermNormalizer(normalizer);
        document.setAcronyms(new HashSet<>(acronyms.get(docId)));
        document.setCoreferenceRelations(coreferences.get(docId));
        document.setAppositions(appositions.get(docId));
        // The BioCreative II GN documents always have two lines: the first is the title,
        // the second is the abstract.
        String[] textSplit = Stream.of(documentContexts.get(docId).split("\\n")).filter(Predicate.not(String::isBlank)).toArray(String[]::new);
        String title = null;
        String abstractText = null;
        if (textSplit.length > 1) {
            title = textSplit[0];
            abstractText = textSplit[1];
        } else if (textSplit.length == 1) {
            // The longest title in the BioCreative II GN training documents has 284 characters. Rounded
            // up to 300, this serves as an indicator of whether the single line is a title or the abstract.
            if (textSplit[0].length() > 300) {
                abstractText = textSplit[0];
            } else {
                title = textSplit[0];
            }
        }
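        // Example for the heuristic above (hypothetical input): a single line like
        // "BRCA1 mutations in breast cancer." is well below 300 characters and becomes the title,
        // while a single 1500-character line would be taken to be the abstract.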
        if (title != null)
            document.setTitleOffsets(Range.between(0, title.length()));
        // Guard against documents without a title: getTitleOffsets() would be null in that case,
        // and the abstract then starts at the beginning of the text.
        int abstractStart = title != null ? document.getTitleOffsets().getMaximum() + 1 : 0;
        document.setAbstractOffsets(Range.between(abstractStart, documentContexts.get(docId).length()));
        document.setDocumentTitle(title);
        document.setDocumentAbstract(abstractText);
        document.setDocumentText(documentContexts.get(docId));
        document.setChunks(chunks.get(docId));
        document.setOntologyClassMentions(ontologyClassMentions.get(docId));
        document.setPosTags(posTags.get(docId));
        // Use the title range for the species candidates; fall back to 0 when there is no title.
        document.setSpecies(new SpeciesCandidates(title != null ? document.getTitleOffsets().getMinimum() : 0,
                title != null ? document.getTitleOffsets().getMaximum() : 0,
                Collections.emptySet(), species.get(docId)));
        document.setSentences(new OffsetSet(sentences.get(docId)));
        document.setNonGenePhrases(new OffsetSet(nonGenePhrases.get(docId)));
        document.setMeshHeadings(meshHeadings.get(docId));
        document.setGenes(new HashSet<>(predictedGeneMentions.get(docId)));
        // Reset the IDs of the predicted mentions; they are assigned during disambiguation.
        document.getAllGenes().forEach(gm -> {
            gm.setDocumentContext(document.getDocumentText());
            gm.setId(GeneMention.NOID);
            gm.setTaxonomyId(null);
        });
        document.selectAllGenes();
//        finalGoldData.values().forEach(goldGm -> goldGm.setGeneDocument(document));
        if (goldHasOffsets) {
            finalGoldData.get(docId).forEach(document::putGoldGene);
            document.setGoldIds(finalGoldData.get(docId).stream().map(GeneMention::getIds).flatMap(Collection::stream).collect(Collectors.toSet()));
        } else {
            Set<String> goldIds = finalGoldData.get(docId).stream().map(GeneMention::getIds).flatMap(Collection::stream).collect(Collectors.toSet());
            if (inferDocLevelLabelsToMentions) {
                inferDocumentLevelLabelsToMentions(document, goldIds, candidateRetrieval, geneOrthologs, isSpeciesCorpus);
            }
            document.setGoldIds(goldIds);
            document.setGoldTaxonomyIds(goldIds.stream().map(candidateRetrieval::mapGeneIdToTaxId).collect(Collectors.toSet()));
        }
        // EF 29/03/2021: This was commented out, but I don't know why. We need this for the species optimization.
        if (isSpeciesCorpus) {
            document.getGoldGenes().values().stream().flatMap(Collection::stream).forEach(goldgm -> {
                goldgm.setTaxonomyIds(goldgm.getIds());
                goldgm.setId(null);
            });
            document.setGoldTaxonomyIds(new HashSet<>(document.getGoldIds()));
        }
        // We mark this step as done even when we do not perform it: we did not forget it,
        // we just did not want to do it here.
        document.addState(GeneDocument.State.REFERENCE_SPECIES_ADDED);
        document.setGoldMentionsWithOffsets(goldHasOffsets || inferDocLevelLabelsToMentions);
        return document;
    }
    /**
     * Makes a best effort to find the correct gene mention given the document's mentions and the set of valid IDs known for the document.
     *
     * @param document           The document to create offset-based gold annotations for.
     * @param goldIds            The known positive IDs for this document.
     * @param candidateRetrieval A candidate retrieval instance.
     * @param geneOrthologs      The gene ortholog mappings, used to check whether the best candidate is ortholog-ambiguous with another gold ID.
     * @param isSpeciesCorpus    Whether this is a species corpus. If so, the input IDs will be used as taxonomy IDs. Otherwise, gene IDs are assumed.
     */
    public void inferDocumentLevelLabelsToMentions(GeneDocument document, Set<String> goldIds, CandidateRetrieval candidateRetrieval, GeneOrthologs geneOrthologs, boolean isSpeciesCorpus) {
        final boolean REMOVE_AMBIGUOUS_GENES = false;
        // It actually happened that there were more than 1024 gold IDs (in gene2pubmed data),
        // which broke the Lucene boolean clauses limit. Thus, the gold IDs are queried in batches.
        int batchSize = 1024;
        if (goldIds.size() > batchSize)
            log.debug("Document {} has {} goldIds", document.getId(), goldIds.size());
        List<GeneMention> unclearGms = new ArrayList<>();
        for (GeneMention gm : document.getGenesIterable()) {
            List<SynHit> candidates = new ArrayList<>();
            Iterator<String> goldIdIt = goldIds.iterator();
            while (goldIdIt.hasNext()) {
                List<String> batch = new ArrayList<>(batchSize);
                for (int i = 0; i < batchSize && goldIdIt.hasNext(); i++)
                    batch.add(goldIdIt.next());
                try {
                    candidates.addAll(candidateRetrieval.getCandidates(gm, batch, Collections.emptySet(), queryGeneratorForMentionInference));
                    if (goldIds.size() > batchSize)
                        log.debug("Retrieved {} candidates for gene {} in document {}", candidates.size(), gm.getText(), document.getId());
                } catch (Exception e) {
                    log.error("Could not retrieve candidates for gene {} and goldIds {}.", gm, goldIds);
                    throw e;
                }
            }
            if (!candidates.isEmpty()) {
                SynHit bestHit = candidates.get(0);
                Set<String> orthologIds = geneOrthologs.getOrthologs(bestHit.getId()).stream().map(record -> record.getGeneIdNotEqualTo(bestHit.getId())).collect(Collectors.toSet());
                orthologIds.remove(bestHit.getId());
                Set<String> orthologsInGold = Sets.intersection(orthologIds, goldIds);
                if (!REMOVE_AMBIGUOUS_GENES || orthologsInGold.isEmpty()) {
                    GeneMention gold = new GeneMention(gm);
                    gold.setDocId(document.getId());
                    gold.setIds(List.of(bestHit.getId()));
                    gold.setTaxonomyIds(gold.getIds().stream().map(candidateRetrieval::mapGeneIdToTaxId).collect(Collectors.toList()));
                    document.putGoldGene(gold);
                    gm.setOverlappingGoldMentions(Collections.singletonList(gold));
                } else {
                    unclearGms.add(gm);
                }
            } else {
                unclearGms.add(gm);
            }
        }
        unclearGms.forEach(document::removeGene);
        document.setGoldMentionsWithOffsets(true);
        document.setGoldOffsetsInferred(true);
    }
}