Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
You can buy this project and download/modify it how often you want.
package de.julielab.genemapper.resources.uima;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import de.julielab.costosys.configuration.FieldConfig;
import de.julielab.costosys.dbconnection.CoStoSysConnection;
import de.julielab.costosys.dbconnection.DataBaseConnector;
import de.julielab.costosys.dbconnection.util.CoStoSysSQLRuntimeException;
import de.julielab.geneexpbase.TermNormalizer;
import de.julielab.java.utilities.FileUtilities;
import de.julielab.java.utilities.IOStreamUtilities;
import de.julielab.jcore.types.*;
import de.julielab.jcore.utility.JCoReTools;
import de.julielab.jcore.utility.index.JCoReOverlapAnnotationIndex;
import de.julielab.xml.JulieXMLConstants;
import de.julielab.xml.JulieXMLTools;
import org.apache.commons.lang3.tuple.ImmutableTriple;
import org.apache.commons.lang3.tuple.Triple;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ResourceMetaData;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.Date;
import java.util.List;
import java.util.*;
import java.util.stream.Stream;
@ResourceMetaData(name = "JCoRe Synonym Species Occurrences DB Writer", description = "Counts common occurrences of gene synonyms and species mentions and stores the counts in a Postgres database. The idea is to create an a priori distribution of species for each synonym to use for disambiguation. For this purpose, two sources of gene mentions are used: A gazetteer based on the synonym dictionary created by the _makeDictionary.sh script and a higher-precision machine learning-based approach for false positive filtering. The gazetteer gene mentions must have the specific type 'GazetteerGene' to be counted. Apart from the ML-based gene mentions for FP filtering there should not any other gene mentions in the CAS to avoid counting mistakes.")
@TypeCapability(inputs = {"de.julielab.jcore.types.Organism", "de.julielab.jcore.types.Gene"})
public class SynonymSpeciesOccurrencesConsumer extends JCasAnnotator_ImplBase {
/**
* Name of the UIMA configuration parameter that holds the path to the CoStoSys configuration file.
*/
public static final String PARAM_COSTOSYS_FILE = "CostosysConfiguration";
/**
* Name of the optional UIMA configuration parameter that holds the name of the database table to write to.
*/
public static final String PARAM_TABLE_NAME = "TableName";
private final static Logger log = LoggerFactory.getLogger(SynonymSpeciesOccurrencesConsumer.class);
/**
* A map that assigns appropriate taxonomy IDs to some MeSH descriptors that denote organisms.
* Static, hence shared by all instances of this class.
* NOTE(review): presumably filled by readDesc2TaxMap(), which is called from initialize() - confirm.
*/
private static final Multimap desc2TaxId = HashMultimap.create();
/**
* A map that assigns some taxonomy IDs additional taxonomy IDs of frequently used reference or model
* organisms for the original taxonomy ID.
* Static, hence shared by all instances of this class. It is filled by readReferenceMap() from the
* classpath resource /reference_species.txt: the second tab-separated column of a line is the key,
* the first column is the value. readReferenceMap() synchronizes on this map while loading.
*/
private static final Map referenceSpeciesTaxIds = new HashMap<>();
@ConfigurationParameter(name = PARAM_COSTOSYS_FILE, description = "Path to the CoStoSys configuration file the specifies the database to write to.")
private String costosysConfiguration;
// When the parameter is not set, initialize() falls back to the table name 'occurrences'.
@ConfigurationParameter(name = PARAM_TABLE_NAME, mandatory = false, description = "Optional. The name of the database table in which the species-genesynonym occurrence counts will be stored. Defaults to 'occurrences'.")
private String tableName;
// Database access object; created in initialize() from the CoStoSys configuration file.
private DataBaseConnector dbc;
/**
* The triples have the elements synonym, taxId, scope.
* Each triple represents a single occurrence. Thus, there can and should be duplicates.
* NOTE(review): the generic type arguments of the declaration below look garbled (probably a list of
* Triple elements) - restore them from the original source.
*/
private List> occurrences;
private final TermNormalizer normalizer = new TermNormalizer();
// Reset to 0 in initialize(); presumably counts the processed documents - confirm where it is incremented.
private long processed;
/**
 * Initializes this component.
 * <ol>
 * <li>Reads the configuration parameters {@link #PARAM_COSTOSYS_FILE} and {@link #PARAM_TABLE_NAME};
 * the table name falls back to <code>occurrences</code> when the parameter is not set.</li>
 * <li>Creates the {@link DataBaseConnector}, registers the occurrences table schema and tries to
 * create the target table. An "already exists" error for the table is tolerated.</li>
 * <li>Resets the occurrence buffer and the processed counter and loads the descriptor-to-taxonomy-ID
 * map and the reference species map.</li>
 * </ol>
 *
 * @param aContext the UIMA context that provides the configuration parameter values
 * @throws ResourceInitializationException if the CoStoSys configuration file cannot be found, the table
 *                                         cannot be created for a reason other than that it already
 *                                         exists, or the reference species map cannot be read
 */
@Override
public void initialize(UimaContext aContext) throws ResourceInitializationException {
    // The base class stores the context; without this call getContext() would return null.
    super.initialize(aContext);
    costosysConfiguration = (String) aContext.getConfigParameterValue(PARAM_COSTOSYS_FILE);
    tableName = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_TABLE_NAME)).orElse("occurrences");
    try {
        dbc = new DataBaseConnector(costosysConfiguration);
        addOccurrencesTableFieldConfiguration();
        dbc.createTable(tableName, "occurrences", "Created by " + SynonymSpeciesOccurrencesConsumer.class.getCanonicalName() + " on " + new Date() + ". This table collects counts of common occurrences of gene synonyms and species taxonomy IDs. Its aim is to serve as a source of disambiguation for the jcore-gene-mapper-ae when it is not clear to which species a gene might belong and there are multiple (or even no) candidates to chose from.");
    } catch (FileNotFoundException e) {
        log.error("Could not instantiate DatabaseConnector", e);
        throw new ResourceInitializationException(e);
    } catch (CoStoSysSQLRuntimeException e) {
        // Only the "table already exists" error is harmless. The message may be null, so guard against
        // that instead of failing with a NullPointerException that would hide the real cause.
        final String message = e.getMessage();
        final boolean tableAlreadyExists = message != null && message.contains("already exists");
        if (!tableAlreadyExists) {
            log.error("Could not create the table '{}'", tableName, e);
            throw new ResourceInitializationException(e);
        }
        // Reassure the reader of the log only when the exception really is the expected one.
        log.info("This exception is expected and is no cause for alarm.");
    }
    occurrences = new ArrayList<>();
    readDesc2TaxMap();
    try {
        readReferenceMap();
    } catch (IOException e) {
        log.error("Could not read the reference species taxonomy ID map", e);
        throw new ResourceInitializationException(e);
    }
    processed = 0;
}
/**
 * Loads the reference species mapping from the classpath resource <code>/reference_species.txt</code>
 * into the shared map <code>referenceSpeciesTaxIds</code>. Each non-blank line must have at least two
 * tab-separated columns; the second column becomes the key and the first column the value.
 * <p>
 * The map is shared by all instances, so loading is synchronized on the map and skipped when it has
 * already been filled. The entries are collected in a local map first and only copied over when the
 * whole file was read successfully, so a failed load never leaves a partially filled shared map behind
 * that later calls would mistake for a complete one.
 *
 * @throws IOException if the resource cannot be found or read, or if it contains a non-blank line with
 *                     fewer than two tab-separated columns
 */
private void readReferenceMap() throws IOException {
    synchronized (referenceSpeciesTaxIds) {
        if (!referenceSpeciesTaxIds.isEmpty())
            return;
        final Map<String, String> loaded = new HashMap<>();
        try (final InputStream referenceSpecies = FileUtilities.findResource("/reference_species.txt")) {
            if (referenceSpecies == null)
                throw new FileNotFoundException("Could not find the reference species mapping file expected on the classpath as /reference_species.txt.");
            try (final BufferedReader br = IOStreamUtilities.getReaderFromInputStream(referenceSpecies)) {
                int lineNumber = 0;
                String line;
                while ((line = br.readLine()) != null) {
                    ++lineNumber;
                    // Blank lines carry no mapping; skipping them avoids an index error below.
                    if (line.trim().isEmpty())
                        continue;
                    final String[] split = line.split("\t");
                    if (split.length < 2)
                        throw new IOException("Malformed line " + lineNumber + " in /reference_species.txt: expected at least two tab-separated columns but got '" + line + "'.");
                    loaded.put(split[1], split[0]);
                }
            }
        }
        referenceSpeciesTaxIds.putAll(loaded);
    }
}
/**
* Creates the table schema that is used to store the occurrence counts. The schema is named 'occurrences'.
*/
private void addOccurrencesTableFieldConfiguration() {
List