// This project assembles code and files required to build the dictionaries and
// indexes used by the JCoRe Gene Mapper.
package de.julielab.genemapper.resources;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import com.google.inject.Guice;
import com.google.inject.Injector;
import de.julielab.geneexpbase.configuration.Parameters;
import de.julielab.geneexpbase.data.DocumentLoader;
import de.julielab.geneexpbase.genemodel.GeneDocument;
import de.julielab.geneexpbase.genemodel.GeneOrthologs;
import de.julielab.geneexpbase.ioc.ServicesShutdownHub;
import de.julielab.genemapper.Configuration;
import de.julielab.genemapper.GeneMapper;
import de.julielab.genemapper.classification.TransformerDisambiguationDataUtils;
import de.julielab.genemapper.genemodel.GeneDocumentFactory;
import de.julielab.genemapper.ioc.GeneMappingModule;
import de.julielab.java.utilities.FileUtilities;
import de.julielab.java.utilities.ProgressBar;
import de.julielab.jcore.ae.checkpoint.DBCheckpointAE;
import de.julielab.jcore.reader.xmi.XmiDBReader;
import de.julielab.jcore.types.EntityMention;
import de.julielab.jcore.types.Gene;
import de.julielab.jcore.utility.JCoReTools;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.lucene.search.BooleanQuery;
import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.analysis_engine.CasIterator;
import org.apache.uima.cas.CAS;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.factory.CollectionReaderFactory;
import org.apache.uima.fit.factory.TypeSystemDescriptionFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.impl.ResourceManager_impl;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.apache.uima.resource.metadata.impl.ProcessingResourceMetaData_impl;
import org.apache.uima.util.CasPool;
import org.apache.uima.util.InvalidXMLException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.*;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static java.nio.charset.StandardCharsets.UTF_8;
public class TransformerDisambiguationGene2PubmedDataWriter {
private final static Logger log = LoggerFactory.getLogger(TransformerDisambiguationGene2PubmedDataWriter.class);
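// Bounded hand-over buffer between the processing threads (producers) and the WritingThreads (consumers).
// It also serves as the monitor object for the wait/notify handshake between the two sides.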
private static final BlockingDeque<GeneDocument> documentBuffer = new LinkedBlockingDeque<>(512);
private final List<WritingThread> writingThreads = new ArrayList<>();
private static Injector injector;
private static Configuration configuration;
private boolean errorOccurred;
public static void main(String[] args) throws Exception {
configuration = new Configuration(new File("configurations/genemapper_gene2pubmed.properties"));
log.info("Detected {} CPUs. Using this number minus 2.", Runtime.getRuntime().availableProcessors());
int numThreads = Math.max(1, Runtime.getRuntime().availableProcessors() - 2);
// int numThreads = 1;
String costosysConfigurationFile = "../jcore-gene-mapper-resources/src/main/resources/costosys.xml";
String gene2pubmedDocumentTableName = "geno.gene2pubmed";
// String gene2pubmedDocumentTableName = "geno.errordoc";
injector = Guice.createInjector(new GeneMappingModule(configuration));
String goldTaxMode = TransformerDisambiguationDataUtils.USE_GOLD_TAX_FOR_CANDIDATE_RETRIEVAL ? "goldTax" : "noGoldTax";
File gene2pubmed = new File("../jcore-gene-mapper-resources/gene2pubmed.gz");
String outputFile = "transformerDisambiguationData-gene2pubmed-v" + TransformerDisambiguationDataUtils.VERSION + "-" + goldTaxMode + "-%s.tsv.gz";
TransformerDisambiguationGene2PubmedDataWriter dataWriter = new TransformerDisambiguationGene2PubmedDataWriter();
dataWriter.createDisambiguationData(gene2pubmed, costosysConfigurationFile, gene2pubmedDocumentTableName, injector.getInstance(GeneMapper.class), outputFile, numThreads);
log.info("Shutting down gene mapper services.");
log.info("Application finished.");
injector.getInstance(ServicesShutdownHub.class).shutdown();
}
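/**
* Reads pre-annotated documents from the CoStoSys document database, runs the gene pre-processing
* pipeline (offset expansion, extended-protein merging, consistency tagging) on them in parallel and
* hands the resulting {@link GeneDocument} instances to the writing threads via the document buffer.
*/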
public void createDisambiguationData(File gene2pubmed, String costosysConfigurationFile, String tableName, GeneMapper geneMapper, String outputFile, int numThreads) throws IOException, UIMAException, InterruptedException {
ExecutorService executorService = Executors.newFixedThreadPool(numThreads);
try {
Multimap<String, String> pubmed2gene = readGene2pubmed(gene2pubmed);
int retrievalBatchSize = 50;
CollectionReader dbReader = CollectionReaderFactory.createReader("de.julielab.jcore.reader.xmi.desc.jcore-xmi-db-multiplier-reader",
XmiDBReader.PARAM_COSTOSYS_CONFIG_NAME, costosysConfigurationFile,
XmiDBReader.PARAM_ANNOTATIONS_TO_LOAD, new String[]{"de.julielab.jcore.types.Sentence", "de.julielab.jcore.types.Token", "de.julielab.jcore.types.PennBioIEPOSTag", "de.julielab.jcore.types.Organism", "de.julielab.jcore.types.Abbreviation", "flair:de.julielab.jcore.types.Gene"},
XmiDBReader.PARAM_READS_BASE_DOCUMENT, true,
XmiDBReader.PARAM_TABLE, tableName,
XmiDBReader.PARAM_BATCH_SIZE, retrievalBatchSize,
XmiDBReader.PARAM_RESET_TABLE, true);
TypeSystemDescription tsd = TypeSystemDescriptionFactory.createTypeSystemDescription(
"de.julielab.jcore.types.jcore-morpho-syntax-types",
"de.julielab.jcore.types.jcore-document-structure-pubmed-types",
"de.julielab.jcore.types.jcore-document-meta-pubmed-types",
"de.julielab.jcore.ae.genemapper.desc.ProteinOffsetExpansionTypeSystem",
"de.julielab.jcore.types.extensions.jcore-document-meta-extension-types",
"de.julielab.jcore.types.casmultiplier.jcore-dbtable-multiplier-types"
);
List<AnalysisEngine> multipliers = new ArrayList<>(numThreads);
List<AnalysisEngine> offsetExpanders = new ArrayList<>(numThreads);
List<AnalysisEngine> proteinMergers = new ArrayList<>(numThreads);
List<AnalysisEngine> consistencyAes = new ArrayList<>(numThreads);
List<AnalysisEngine> checkpointConsumers = new ArrayList<>(numThreads);
for (int i = 0; i < numThreads; i++) {
AnalysisEngine multiplier = createEngineWithTs("de.julielab.jcore.reader.xmi.desc.jcore-xmi-db-multiplier", tsd);
// This is a RUTA component creating extended gene/protein annotation offset annotations according to a set of rules
AnalysisEngine proteinOffsetExpansion = createEngineWithTs("de.julielab.jcore.ae.genemapper.desc.ProteinOffsetExpansionEngine", tsd);
// Applies the extended offset annotations to the actual gene annotations in the CAS (de.julielab.jcore.ae.genemapper.uima.ExtendedProteinsMerger)
AnalysisEngine extendedProteinsMerger = createEngineWithTs("de.julielab.jcore.ae.genemapper.desc.jcore-extended-proteins-merger", tsd);
// Derives gene annotations from already annotated genes in a CAS (see de.julielab.jcore.ae.genemapper.uima.ProteinConsistencyTagger)
AnalysisEngine consistencyAe = createEngineWithTs("de.julielab.jcore.ae.genemapper.desc.jcore-protein-consistency-tagger", tsd);
AnalysisEngine checkpointAe = AnalysisEngineFactory.createEngine(
DBCheckpointAE.class,
tsd,
DBCheckpointAE.PARAM_CHECKPOINT_NAME, "end",
DBCheckpointAE.PARAM_COSTOSYS_CONFIG, costosysConfigurationFile,
DBCheckpointAE.PARAM_INDICATE_FINISHED, true);
multipliers.add(multiplier);
offsetExpanders.add(proteinOffsetExpansion);
proteinMergers.add(extendedProteinsMerger);
consistencyAes.add(consistencyAe);
checkpointConsumers.add(checkpointAe);
}
/*
* CAS objects are expensive to create, especially in terms of memory, but also in CPU time. Reusing them
* is much more efficient, which is why we use a CasPool.
*/
final ProcessingResourceMetaData_impl metaData = new ProcessingResourceMetaData_impl();
metaData.setTypeSystem(tsd);
// Create a CasPool.
CasPool casPool = new CasPool(numThreads + 4, metaData, new ResourceManager_impl());
Map<Thread, Integer> threadIds = new ConcurrentHashMap<>(numThreads);
createWritingThreads(geneMapper, outputFile, numThreads);
ProgressBar progressBar = new ProgressBar(dbReader.getProgress()[0].getTotal() / retrievalBatchSize, 80, true);
while (dbReader.hasNext() && !errorOccurred) {
log.debug("Getting CAS. Available CASes: {}", casPool.getNumAvailable());
CAS cas = casPool.getCas(1200000);
if (cas == null)
continue;
dbReader.getNext(cas);
executorService.submit(() -> {
String lastDocId = null;
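// Assign each executor thread a stable index so that it always works with the same AE instances;
// the AnalysisEngine objects created above are not shared between threads.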
Integer id = threadIds.compute(Thread.currentThread(), (k, v) -> v != null ? v : threadIds.size());
try {
CasIterator casIterator = multipliers.get(id).processAndOutputNewCASes(cas);
while (casIterator.hasNext()) {
CAS innerCas = casIterator.next();
// XmiCasSerializer.serialize(innerCas, new FileOutputStream("mycas.xmi"));
lastDocId = JCoReTools.getDocId(innerCas.getJCas());
offsetExpanders.get(id).process(innerCas);
proteinMergers.get(id).process(innerCas);
consistencyAes.get(id).process(innerCas);
writeTrainingData(pubmed2gene, innerCas.getJCas(), geneMapper, TransformerDisambiguationDataUtils.MAX_DOC_CONTEXT_SIZE);
checkpointConsumers.get(id).process(innerCas);
innerCas.release();
}
casPool.releaseCas(cas);
callBatchProcessingComplete(offsetExpanders);
callBatchProcessingComplete(proteinMergers);
callBatchProcessingComplete(consistencyAes);
} catch (ClassCastException | AnalysisEngineProcessException e) {
log.warn("Got {} exception for document '{}': {}. Assuming that this is a JeDIS (de-)serialization issue, skipping the document.", e.getClass().getCanonicalName(), lastDocId, e.getMessage());
} catch (Throwable t) {
log.error("Could not process batch of CASes with Thread ID {} (name: {}) due to exception. The last seen document ID was '{}'.", id, Thread.currentThread().getName(), lastDocId, t);
errorOccurred = true;
}
});
progressBar.incrementDone();
progressBar.printProgressBar();
}
for (AnalysisEngine checkpointConsumer : checkpointConsumers) {
checkpointConsumer.collectionProcessComplete();
}
// Wait for the processing threads to finish
log.info("Shutting down ExecutorService.");
executorService.shutdown();
log.info("Waiting 15 minutes for all processing threads to finish.");
executorService.awaitTermination(15, TimeUnit.MINUTES);
synchronized (documentBuffer) {
// Signal the to-disc-writing threads to quit.
log.info("Processing threads have finished, signaling the to-disc-writing threads to finish.");
writingThreads.forEach(WritingThread::finish);
// Notify on the document buffer monitor so that the writing threads wake up for a last writing iteration.
log.info("Notifying the to-disc-writing threads to run a last writing iteration.");
documentBuffer.notifyAll();
}
log.info("Waiting for last data to be written to disc...");
for (WritingThread writingThread : writingThreads) {
writingThread.join();
}
log.info("WritingThreads have terminated.");
log.info("Merging written files into {}", String.format(outputFile, "all"));
mergeFiles(outputFile);
log.info("Merging done, application finished.");
} finally {
// Just to be sure.
executorService.shutdown();
}
if (errorOccurred)
log.error("Early termination due to error. Check the log messages above.");
}
private void callBatchProcessingComplete(List<AnalysisEngine> aes) throws AnalysisEngineProcessException {
for (AnalysisEngine ae : aes)
ae.collectionProcessComplete();
}
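/**
* Concatenates the per-thread output files into a single file (the file name placeholder is filled
* with "all") and deletes the partial files afterwards.
*/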
private void mergeFiles(String outputFile) throws IOException {
try (BufferedOutputStream bos = FileUtilities.getOutputStreamToFile(new File(String.format(outputFile, "all")))) {
for (WritingThread wt : writingThreads) {
// Close each partial-file reader after copying to avoid leaking file handles.
try (BufferedReader reader = FileUtilities.getReaderFromFile(wt.getOutputFile())) {
IOUtils.copy(reader, bos, UTF_8);
}
wt.getOutputFile().delete();
}
}
}
/**
* Creates {@code numThreads} threads that wait on the {@link #documentBuffer} monitor. When notified, each
* flushes the current buffer contents to its own output file.
*
* @param geneMapper the gene mapper used to derive the training data written for each document
* @param outputFile a format string for the per-thread output files; the placeholder is filled with the thread index
* @param numThreads the number of writing threads to create
*/
public void createWritingThreads(GeneMapper geneMapper, String outputFile, int numThreads) throws IOException {
for (int i = 0; i < numThreads; i++) {
WritingThread writingThread = new WritingThread(geneMapper, new File(String.format(outputFile, i)));
writingThread.setName("WritingThread-" + i);
writingThread.start();
writingThreads.add(writingThread);
}
}
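/**
* Reads the NCBI gene2pubmed file (tab-separated columns: tax_id, GeneID, PubMed_ID) and returns a
* multimap from PubMed ID to the gene IDs annotated for that document.
*/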
private Multimap<String, String> readGene2pubmed(File gene2pubmed) throws IOException {
Multimap<String, String> pubmed2gene = HashMultimap.create();
try (BufferedReader br = FileUtilities.getReaderFromFile(gene2pubmed)) {
br.lines().filter(Predicate.not(l -> l.startsWith("#"))).map(l -> l.split("\t")).forEach(s -> pubmed2gene.put(s[2], s[1]));
}
return pubmed2gene;
}
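/**
* Converts the annotated CAS into a {@link GeneDocument}, attaches the document-level gold gene IDs
* from gene2pubmed, infers mention-level labels and offers the document to the writing threads' buffer.
*/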
private void writeTrainingData(Multimap<String, String> pubmed2gene, JCas jCas, GeneMapper geneMapper, int contextWindowSize) throws AnalysisEngineProcessException, InterruptedException {
Matcher specificTypeMatcher = Pattern.compile("Gene|protein|protein_complex|protein_enum|protein_familiy_or_group").matcher("");
Map<String, Matcher> entityMappingTypes = Map.of(Gene.class.getCanonicalName(), specificTypeMatcher);
// Function<EntityMention, Pair<String, BooleanQuery>> contextFun = em -> {
// try {
// String entityContext = ContextUtils.makeContext(jCas,
// contextWindowSize, em);
// if (entityContext != null)
// entityContext = entityContext.trim();
// BooleanQuery contextQuery = ContextUtils.makeContextQuery(entityContext);
// return new ImmutablePair<>(entityContext, contextQuery);
// } catch (IOException e) {
// throw new GeneMapperRuntimeException(e);
// }
// };
Function<EntityMention, Pair<String, BooleanQuery>> contextFun = em -> ImmutablePair.nullPair();
GeneDocument document = injector.getInstance(GeneDocumentFactory.class).createGeneDocument(jCas, entityMappingTypes, contextFun, new Parameters(configuration));
TransformerDisambiguationDataUtils.addDocumentLevelGeneAnnotations(document, pubmed2gene);
injector.getInstance(DocumentLoader.class).inferDocumentLevelLabelsToMentions(document, document.getGoldIds(), geneMapper.getMappingCore().getCandidateRetrieval(), injector.getInstance(GeneOrthologs.class), false);
// In pubmed2gene we only have annotations for the genes the database record is about; we have no
// information about other genes in the document.
document.setCompletelyAnnotated(false);
// We need a while loop and a timeout because it is quite possible that all processing threads send
// their notify while the writing thread is still working and thus not yet waiting, so the notification is lost.
// Without the timeout and the loop, all processing threads would then be stuck waiting for documentBuffer
// to gain free capacity, which never happens because the writing thread would endlessly wait to be notified
// after its previous writing iteration.
do {
synchronized (documentBuffer) {
if (documentBuffer.remainingCapacity() == 0) {
log.trace("Notifying writing thread that buffer is full.");
documentBuffer.notify();
}
}
} while (!documentBuffer.offer(document, 1, TimeUnit.MINUTES));
}
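/**
* Creates an AnalysisEngine from the given descriptor but overrides its type system with the shared,
* merged type system so that all engines operate on compatible CASes.
*/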
private AnalysisEngine createEngineWithTs(String descriptorPath, TypeSystemDescription tsd) throws IOException, InvalidXMLException, ResourceInitializationException {
AnalysisEngineDescription desc = AnalysisEngineFactory.createEngineDescription(descriptorPath);
desc.getAnalysisEngineMetaData().setTypeSystem(tsd);
return AnalysisEngineFactory.createEngine(desc);
}
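/**
* Consumer thread that drains the shared document buffer and writes the transformer disambiguation
* training data for each {@link GeneDocument} into its own output file.
*/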
private class WritingThread extends Thread {
private final GeneMapper geneMapper;
private final File outputFile;
private boolean finish = false;
private final BufferedWriter bw;
public WritingThread(GeneMapper geneMapper, File outputFile) throws IOException {
this.geneMapper = geneMapper;
this.outputFile = outputFile;
if (outputFile.exists())
outputFile.delete();
bw = FileUtilities.getWriterToFile(outputFile);
}
public File getOutputFile() {
return outputFile;
}
public void finish() {
this.finish = true;
}
@Override
public void run() {
try {
while (!finish || !documentBuffer.isEmpty()) {
synchronized (documentBuffer) {
// Block until notified; only wait while the buffer is still mostly empty (capacity is 512) and no shutdown was requested.
if (documentBuffer.remainingCapacity() > 500 && !finish) {
log.trace("Waiting for notification.");
documentBuffer.wait();
}
}
log.debug("Draining document buffer of size {} to outbound list.", documentBuffer.size());
List<GeneDocument> documents = new ArrayList<>(documentBuffer.size());
documentBuffer.drainTo(documents);
log.debug("Writing document buffer to file {}", outputFile);
for (GeneDocument document : documents)
TransformerDisambiguationDataUtils.writeData(bw, geneMapper, document);
log.debug("Writing finished.");
}
} catch (Throwable e) {
log.error("Error in the data writing thread.", e);
throw new RuntimeException(e);
} finally {
if (bw != null) {
try {
bw.close();
} catch (IOException e) {
log.error("Could not close writer to {}", outputFile);
}
}
}
}
}
}