// This project assembles code and files required to build the dictionaries and
// indexes used by the JCoRe Gene Mapper.
package de.julielab.genemapper.resources;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import com.google.inject.Guice;
import com.google.inject.Injector;
import de.julielab.geneexpbase.configuration.Parameters;
import de.julielab.geneexpbase.data.DocumentLoader;
import de.julielab.geneexpbase.genemodel.GeneDocument;
import de.julielab.geneexpbase.genemodel.GeneOrthologs;
import de.julielab.geneexpbase.ioc.ServicesShutdownHub;
import de.julielab.genemapper.Configuration;
import de.julielab.genemapper.GeneMapper;
import de.julielab.genemapper.classification.TransformerDisambiguationDataUtils;
import de.julielab.genemapper.genemodel.GeneDocumentFactory;
import de.julielab.genemapper.ioc.GeneMappingModule;
import de.julielab.java.utilities.FileUtilities;
import de.julielab.java.utilities.ProgressBar;
import de.julielab.jcore.ae.checkpoint.DBCheckpointAE;
import de.julielab.jcore.reader.xmi.XmiDBReader;
import de.julielab.jcore.types.EntityMention;
import de.julielab.jcore.types.Gene;
import de.julielab.jcore.utility.JCoReTools;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.lucene.search.BooleanQuery;
import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.analysis_engine.CasIterator;
import org.apache.uima.cas.CAS;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.factory.CollectionReaderFactory;
import org.apache.uima.fit.factory.TypeSystemDescriptionFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.impl.ResourceManager_impl;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.apache.uima.resource.metadata.impl.ProcessingResourceMetaData_impl;
import org.apache.uima.util.CasPool;
import org.apache.uima.util.InvalidXMLException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.*;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static java.nio.charset.StandardCharsets.UTF_8;
public class TransformerDisambiguationGene2PubmedDataWriter {
private final static Logger log = LoggerFactory.getLogger(TransformerDisambiguationGene2PubmedDataWriter.class);
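// Bounded hand-over buffer between the processing threads (producers) and the WritingThreads (consumers).
// It also serves as the monitor object for the wait/notify handshake between the two sides.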
private static final BlockingDeque<GeneDocument> documentBuffer = new LinkedBlockingDeque<>(512);
private final List<WritingThread> writingThreads = new ArrayList<>();
private static Injector injector;
private static Configuration configuration;
private boolean errorOccurred;
public static void main(String[] args) throws Exception {
configuration = new Configuration(new File("configurations/genemapper_gene2pubmed.properties"));
log.info("Detected {} CPUs. Using this number minus 2.", Runtime.getRuntime().availableProcessors());
int numThreads = Math.max(1, Runtime.getRuntime().availableProcessors() - 2);
// int numThreads = 1;
String costosysConfigurationFile = "../jcore-gene-mapper-resources/src/main/resources/costosys.xml";
String gene2pubmedDocumentTableName = "geno.gene2pubmed";
// String gene2pubmedDocumentTableName = "geno.errordoc";
injector = Guice.createInjector(new GeneMappingModule(configuration));
String goldTaxMode = TransformerDisambiguationDataUtils.USE_GOLD_TAX_FOR_CANDIDATE_RETRIEVAL ? "goldTax" : "noGoldTax";
File gene2pubmed = new File("../jcore-gene-mapper-resources/gene2pubmed.gz");
String outputFile = "transformerDisambiguationData-gene2pubmed-v" + TransformerDisambiguationDataUtils.VERSION + "-" + goldTaxMode + "-%s.tsv.gz";
TransformerDisambiguationGene2PubmedDataWriter dataWriter = new TransformerDisambiguationGene2PubmedDataWriter();
dataWriter.createDisambiguationData(gene2pubmed, costosysConfigurationFile, gene2pubmedDocumentTableName, injector.getInstance(GeneMapper.class), outputFile, numThreads);
log.info("Shutting down gene mapper services.");
log.info("Application finished.");
injector.getInstance(ServicesShutdownHub.class).shutdown();
}
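/**
* Reads pre-annotated documents from the CoStoSys document database, runs the gene pre-processing
* pipeline (offset expansion, extended-protein merging, consistency tagging) on them in parallel and
* hands the resulting {@link GeneDocument} instances to the writing threads via the document buffer.
*/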
public void createDisambiguationData(File gene2pubmed, String costosysConfigurationFile, String tableName, GeneMapper geneMapper, String outputFile, int numThreads) throws IOException, UIMAException, InterruptedException {
ExecutorService executorService = Executors.newFixedThreadPool(numThreads);
try {
Multimap<String, String> pubmed2gene = readGene2pubmed(gene2pubmed);
int retrievalBatchSize = 50;
CollectionReader dbReader = CollectionReaderFactory.createReader("de.julielab.jcore.reader.xmi.desc.jcore-xmi-db-multiplier-reader",
XmiDBReader.PARAM_COSTOSYS_CONFIG_NAME, costosysConfigurationFile,
XmiDBReader.PARAM_ANNOTATIONS_TO_LOAD, new String[]{"de.julielab.jcore.types.Sentence", "de.julielab.jcore.types.Token", "de.julielab.jcore.types.PennBioIEPOSTag", "de.julielab.jcore.types.Organism", "de.julielab.jcore.types.Abbreviation", "flair:de.julielab.jcore.types.Gene"},
XmiDBReader.PARAM_READS_BASE_DOCUMENT, true,
XmiDBReader.PARAM_TABLE, tableName,
XmiDBReader.PARAM_BATCH_SIZE, retrievalBatchSize,
XmiDBReader.PARAM_RESET_TABLE, true);
TypeSystemDescription tsd = TypeSystemDescriptionFactory.createTypeSystemDescription(
"de.julielab.jcore.types.jcore-morpho-syntax-types",
"de.julielab.jcore.types.jcore-document-structure-pubmed-types",
"de.julielab.jcore.types.jcore-document-meta-pubmed-types",
"de.julielab.jcore.ae.genemapper.desc.ProteinOffsetExpansionTypeSystem",
"de.julielab.jcore.types.extensions.jcore-document-meta-extension-types",
"de.julielab.jcore.types.casmultiplier.jcore-dbtable-multiplier-types"
);
List<AnalysisEngine> multipliers = new ArrayList<>(numThreads);
List<AnalysisEngine> offsetExpanders = new ArrayList<>(numThreads);
List<AnalysisEngine> proteinMergers = new ArrayList<>(numThreads);
List<AnalysisEngine> consistencyAes = new ArrayList<>(numThreads);
List<AnalysisEngine> checkpointConsumers = new ArrayList<>(numThreads);
for (int i = 0; i < numThreads; i++) {
AnalysisEngine multiplier = createEngineWithTs("de.julielab.jcore.reader.xmi.desc.jcore-xmi-db-multiplier", tsd);
// This is a RUTA component creating extended gene/protein annotation offset annotations according to a set of rules
AnalysisEngine proteinOffsetExpansion = createEngineWithTs("de.julielab.jcore.ae.genemapper.desc.ProteinOffsetExpansionEngine", tsd);
// Applies the extended offset annotations to the actual gene annotations in the CAS (de.julielab.jcore.ae.genemapper.uima.ExtendedProteinsMerger)
AnalysisEngine extendedProteinsMerger = createEngineWithTs("de.julielab.jcore.ae.genemapper.desc.jcore-extended-proteins-merger", tsd);
// Derives gene annotations from already annotated genes in a CAS (see de.julielab.jcore.ae.genemapper.uima.ProteinConsistencyTagger)
AnalysisEngine consistencyAe = createEngineWithTs("de.julielab.jcore.ae.genemapper.desc.jcore-protein-consistency-tagger", tsd);
AnalysisEngine checkpointAe = AnalysisEngineFactory.createEngine(
DBCheckpointAE.class,
tsd,
DBCheckpointAE.PARAM_CHECKPOINT_NAME, "end",
DBCheckpointAE.PARAM_COSTOSYS_CONFIG, costosysConfigurationFile,
DBCheckpointAE.PARAM_INDICATE_FINISHED, true);
multipliers.add(multiplier);
offsetExpanders.add(proteinOffsetExpansion);
proteinMergers.add(extendedProteinsMerger);
consistencyAes.add(consistencyAe);
checkpointConsumers.add(checkpointAe);
}
/*
* CAS objects are expensive to create, especially in terms of memory, but also in CPU time. Reusing them
* is much more efficient, which is why we use a CasPool.
*/
final ProcessingResourceMetaData_impl metaData = new ProcessingResourceMetaData_impl();
metaData.setTypeSystem(tsd);
// Create a CasPool.
CasPool casPool = new CasPool(numThreads + 4, metaData, new ResourceManager_impl());
Map<Thread, Integer> threadIds = new ConcurrentHashMap<>(numThreads);
createWritingThreads(geneMapper, outputFile, numThreads);
ProgressBar progressBar = new ProgressBar(dbReader.getProgress()[0].getTotal() / retrievalBatchSize, 80, true);
while (dbReader.hasNext() && !errorOccurred) {
log.debug("Getting CAS. Available CASes: {}", casPool.getNumAvailable());
CAS cas = casPool.getCas(1200000);
if (cas == null)
continue;
dbReader.getNext(cas);
executorService.submit(() -> {
String lastDocId = null;
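// Assign each executor thread a stable index so that it always works with the same AE instances;
// the AnalysisEngine objects created above are not shared between threads.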
Integer id = threadIds.compute(Thread.currentThread(), (k, v) -> v != null ? v : threadIds.size());
try {
CasIterator casIterator = multipliers.get(id).processAndOutputNewCASes(cas);
while (casIterator.hasNext()) {
CAS innerCas = casIterator.next();
// XmiCasSerializer.serialize(innerCas, new FileOutputStream("mycas.xmi"));
lastDocId = JCoReTools.getDocId(innerCas.getJCas());
offsetExpanders.get(id).process(innerCas);
proteinMergers.get(id).process(innerCas);
consistencyAes.get(id).process(innerCas);
writeTrainingData(pubmed2gene, innerCas.getJCas(), geneMapper, TransformerDisambiguationDataUtils.MAX_DOC_CONTEXT_SIZE);
checkpointConsumers.get(id).process(innerCas);
innerCas.release();
}
casPool.releaseCas(cas);
callBatchProcessingComplete(offsetExpanders);
callBatchProcessingComplete(proteinMergers);
callBatchProcessingComplete(consistencyAes);
} catch (ClassCastException | AnalysisEngineProcessException e) {
log.warn("Got {} exception for document '{}': {}. Assuming that this is a JeDIS (de-)serialization issue, skipping the document.", e.getClass().getCanonicalName(), lastDocId, e.getMessage());
} catch (Throwable t) {
log.error("Could not process batch of CASes with Thread ID {} (name: {}) due to exception. The last seen document ID was '{}'.", id, Thread.currentThread().getName(), lastDocId, t);
errorOccurred = true;
}
});
progressBar.incrementDone();
progressBar.printProgressBar();
}
for (AnalysisEngine checkpointConsumer : checkpointConsumers) {
checkpointConsumer.collectionProcessComplete();
}
// Wait for the processing threads to finish
log.info("Shutting down ExecutorService.");
executorService.shutdown();
log.info("Waiting 15 minutes for all processing threads to finish.");
executorService.awaitTermination(15, TimeUnit.MINUTES);
synchronized (documentBuffer) {
// Signal the to-disc-writing threads to quit.
log.info("Processing threads have finished, signaling the to-disc-writing threads to finish.");
writingThreads.forEach(WritingThread::finish);
// Notify on the document buffer monitor so that the writing threads wake up for a last writing iteration.
log.info("Notifying the to-disc-writing threads to run a last writing iteration.");
documentBuffer.notifyAll();
}
log.info("Waiting for last data to be written to disc...");
for (WritingThread writingThread : writingThreads) {
writingThread.join();
}
log.info("WritingThreads have terminated.");
log.info("Merging written files into {}", String.format(outputFile, "all"));
mergeFiles(outputFile);
log.info("Merging done, application finished.");
} finally {
// Just to be sure.
executorService.shutdown();
}
if (errorOccurred)
log.error("Early termination due to error. Check the log messages above.");
}
private void callBatchProcessingComplete(List<AnalysisEngine> aes) throws AnalysisEngineProcessException {
for (AnalysisEngine ae : aes)
ae.collectionProcessComplete();
}
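/**
* Concatenates the per-thread output files into a single file (the file name placeholder is filled
* with "all") and deletes the partial files afterwards.
*/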
private void mergeFiles(String outputFile) throws IOException {
try (BufferedOutputStream bos = FileUtilities.getOutputStreamToFile(new File(String.format(outputFile, "all")))) {
for (WritingThread wt : writingThreads) {
// Close each partial-file reader after copying to avoid leaking file handles.
try (BufferedReader reader = FileUtilities.getReaderFromFile(wt.getOutputFile())) {
IOUtils.copy(reader, bos, UTF_8);
}
wt.getOutputFile().delete();
}
}
}
/**
* Creates {@code numThreads} threads that wait on the {@link #documentBuffer} monitor. When notified, each
* flushes the current buffer contents to its own output file.
*
* @param geneMapper the gene mapper used to derive the training data written for each document
* @param outputFile a format string for the per-thread output files; the placeholder is filled with the thread index
* @param numThreads the number of writing threads to create
*/
public void createWritingThreads(GeneMapper geneMapper, String outputFile, int numThreads) throws IOException {
for (int i = 0; i < numThreads; i++) {
WritingThread writingThread = new WritingThread(geneMapper, new File(String.format(outputFile, i)));
writingThread.setName("WritingThread-" + i);
writingThread.start();
writingThreads.add(writingThread);
}
}
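/**
* Reads the NCBI gene2pubmed file (tab-separated columns: tax_id, GeneID, PubMed_ID) and returns a
* multimap from PubMed ID to the gene IDs annotated for that document.
*/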
private Multimap<String, String> readGene2pubmed(File gene2pubmed) throws IOException {
Multimap<String, String> pubmed2gene = HashMultimap.create();
try (BufferedReader br = FileUtilities.getReaderFromFile(gene2pubmed)) {
br.lines().filter(Predicate.not(l -> l.startsWith("#"))).map(l -> l.split("\t")).forEach(s -> pubmed2gene.put(s[2], s[1]));
}
return pubmed2gene;
}
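/**
* Converts the annotated CAS into a {@link GeneDocument}, attaches the document-level gold gene IDs
* from gene2pubmed, infers mention-level labels and offers the document to the writing threads' buffer.
*/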
private void writeTrainingData(Multimap<String, String> pubmed2gene, JCas jCas, GeneMapper geneMapper, int contextWindowSize) throws AnalysisEngineProcessException, InterruptedException {
Matcher specificTypeMatcher = Pattern.compile("Gene|protein|protein_complex|protein_enum|protein_familiy_or_group").matcher("");
Map<String, Matcher> entityMappingTypes = Map.of(Gene.class.getCanonicalName(), specificTypeMatcher);
// Function<EntityMention, Pair<String, BooleanQuery>> contextFun = em -> {
// try {
// String entityContext = ContextUtils.makeContext(jCas,
// contextWindowSize, em);
// if (entityContext != null)
// entityContext = entityContext.trim();
// BooleanQuery contextQuery = ContextUtils.makeContextQuery(entityContext);
// return new ImmutablePair<>(entityContext, contextQuery);
// } catch (IOException e) {
// throw new GeneMapperRuntimeException(e);
// }
// };
Function<EntityMention, Pair<String, BooleanQuery>> contextFun = em -> ImmutablePair.nullPair();
GeneDocument document = injector.getInstance(GeneDocumentFactory.class).createGeneDocument(jCas, entityMappingTypes, contextFun, new Parameters(configuration));
TransformerDisambiguationDataUtils.addDocumentLevelGeneAnnotations(document, pubmed2gene);
injector.getInstance(DocumentLoader.class).inferDocumentLevelLabelsToMentions(document, document.getGoldIds(), geneMapper.getMappingCore().getCandidateRetrieval(), injector.getInstance(GeneOrthologs.class), false);
// In pubmed2gene we only have annotations for the genes the database record is about; we have no
// information about other genes in the document.
document.setCompletelyAnnotated(false);
// We need a while loop and a timeout because it is quite possible that all processing threads send
// their notify while the writing thread is still working and thus not yet waiting, so the notification is lost.
// Without the timeout and the loop, all processing threads would then be stuck waiting for documentBuffer
// to gain free capacity, which never happens because the writing thread would endlessly wait to be notified
// after its previous writing iteration.
do {
synchronized (documentBuffer) {
if (documentBuffer.remainingCapacity() == 0) {
log.trace("Notifying writing thread that buffer is full.");
documentBuffer.notify();
}
}
} while (!documentBuffer.offer(document, 1, TimeUnit.MINUTES));
}
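/**
* Creates an AnalysisEngine from the given descriptor but overrides its type system with the shared,
* merged type system so that all engines operate on compatible CASes.
*/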
private AnalysisEngine createEngineWithTs(String descriptorPath, TypeSystemDescription tsd) throws IOException, InvalidXMLException, ResourceInitializationException {
AnalysisEngineDescription desc = AnalysisEngineFactory.createEngineDescription(descriptorPath);
desc.getAnalysisEngineMetaData().setTypeSystem(tsd);
return AnalysisEngineFactory.createEngine(desc);
}
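/**
* Consumer thread that drains the shared document buffer and writes the transformer disambiguation
* training data for each {@link GeneDocument} into its own output file.
*/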
private class WritingThread extends Thread {
private final GeneMapper geneMapper;
private final File outputFile;
private boolean finish = false;
private final BufferedWriter bw;
public WritingThread(GeneMapper geneMapper, File outputFile) throws IOException {
this.geneMapper = geneMapper;
this.outputFile = outputFile;
if (outputFile.exists())
outputFile.delete();
bw = FileUtilities.getWriterToFile(outputFile);
}
public File getOutputFile() {
return outputFile;
}
public void finish() {
this.finish = true;
}
@Override
public void run() {
try {
while (!finish || !documentBuffer.isEmpty()) {
synchronized (documentBuffer) {
// Block until notified; only wait while the buffer is still mostly empty (capacity is 512) and no shutdown was requested.
if (documentBuffer.remainingCapacity() > 500 && !finish) {
log.trace("Waiting for notification.");
documentBuffer.wait();
}
}
log.debug("Draining document buffer of size {} to outbound list.", documentBuffer.size());
List<GeneDocument> documents = new ArrayList<>(documentBuffer.size());
documentBuffer.drainTo(documents);
log.debug("Writing document buffer to file {}", outputFile);
for (GeneDocument document : documents)
TransformerDisambiguationDataUtils.writeData(bw, geneMapper, document);
log.debug("Writing finished.");
}
} catch (Throwable e) {
log.error("Error in the data writing thread.", e);
throw new RuntimeException(e);
} finally {
if (bw != null) {
try {
bw.close();
} catch (IOException e) {
log.error("Could not close writer to {}", outputFile);
}
}
}
}
}
}