de.julielab.genemapper.resources.uima.SynonymSpeciesOccurrencesConsumer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of gene-mapper-resources Show documentation
This project assembles code and files required to build the dictionaries and indexes used by the JCoRe Gene Mapper.
The newest version!
package de.julielab.genemapper.resources.uima;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import de.julielab.costosys.configuration.FieldConfig;
import de.julielab.costosys.dbconnection.CoStoSysConnection;
import de.julielab.costosys.dbconnection.DataBaseConnector;
import de.julielab.costosys.dbconnection.util.CoStoSysSQLRuntimeException;
import de.julielab.geneexpbase.TermNormalizer;
import de.julielab.java.utilities.FileUtilities;
import de.julielab.java.utilities.IOStreamUtilities;
import de.julielab.jcore.types.*;
import de.julielab.jcore.utility.JCoReTools;
import de.julielab.jcore.utility.index.JCoReOverlapAnnotationIndex;
import de.julielab.xml.JulieXMLConstants;
import de.julielab.xml.JulieXMLTools;
import org.apache.commons.lang3.tuple.ImmutableTriple;
import org.apache.commons.lang3.tuple.Triple;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ResourceMetaData;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.Date;
import java.util.List;
import java.util.*;
import java.util.stream.Stream;

@ResourceMetaData(name = "JCoRe Synonym Species Occurrences DB Writer", description = "Counts common occurrences of gene synonyms and species mentions and stores the counts in a Postgres database. The idea is to create an a priori distribution of species for each synonym to use for disambiguation. For this purpose, two sources of gene mentions are used: A gazetteer based on the synonym dictionary created by the _makeDictionary.sh script and a higher-precision machine learning-based approach for false positive filtering. The gazetteer gene mentions must have the specific type 'GazetteerGene' to be counted. Apart from the ML-based gene mentions for FP filtering there should not any other gene mentions in the CAS to avoid counting mistakes.")
@TypeCapability(inputs = {"de.julielab.jcore.types.Organism", "de.julielab.jcore.types.Gene"})
public class SynonymSpeciesOccurrencesConsumer extends JCasAnnotator_ImplBase {
    public static final String PARAM_COSTOSYS_FILE = "CostosysConfiguration";
    public static final String PARAM_TABLE_NAME = "TableName";
    private final static Logger log = LoggerFactory.getLogger(SynonymSpeciesOccurrencesConsumer.class);
    /**
     * A map that assigns some MeSH descriptors that denote organisms appropriate taxonomy Ids.
     */
    private static final Multimap desc2TaxId = HashMultimap.create();
    /**
     * A map that assigns some taxonomy Ids additional taxonomy Ids of frequently used reference or model organisms for the original taxonomy ID
     */
    private static final Map referenceSpeciesTaxIds = new HashMap<>();
    @ConfigurationParameter(name = PARAM_COSTOSYS_FILE, description = "Path to the CoStoSys configuration file the specifies the database to write to.")
    private String costosysConfiguration;
    @ConfigurationParameter(name = PARAM_TABLE_NAME, mandatory = false, description = "Optional. The name of the database table in which the species-genesynonym occurrence counts will be stored. Defaults to 'occurrences'.")
    private String tableName;
    private DataBaseConnector dbc;
    /**
     * The triples have the elements synonym, taxId, scope.
     * Each triple implicates a single occurrence. Thus, there can and should be duplicates.
     */
    private List> occurrences;

    private final TermNormalizer normalizer = new TermNormalizer();
    private long processed;


    @Override
    public void initialize(UimaContext aContext) throws ResourceInitializationException {
        costosysConfiguration = (String) aContext.getConfigParameterValue(PARAM_COSTOSYS_FILE);
        tableName = Optional.ofNullable((String) aContext.getConfigParameterValue(PARAM_TABLE_NAME)).orElse("occurrences");
        try {
            dbc = new DataBaseConnector(costosysConfiguration);
            addOccurrencesTableFieldConfiguration();
            dbc.createTable(tableName, "occurrences", "Created by " + SynonymSpeciesOccurrencesConsumer.class.getCanonicalName() + " on " + new Date() + ". This table collects counts of common occurrences of gene synonyms and species taxonomy IDs. Its aim is to serve as a source of disambiguation for the jcore-gene-mapper-ae when it is not clear to which species a gene might belong and there are multiple (or even no) candidates to chose from.");
        } catch (FileNotFoundException e) {
            log.error("Could not instantiate DatabaseConnector", e);
            throw new ResourceInitializationException(e);
        } catch (CoStoSysSQLRuntimeException e) {
            log.info("This exception is expected and is no cause for alarm.");
            // This might be the expected "the table already exists" exception
            if (!e.getMessage().contains("already exists"))
                throw new ResourceInitializationException(e);
        }
        occurrences = new ArrayList<>();
        readDesc2TaxMap();
        try {
            readReferenceMap();
        } catch (IOException e) {
            log.error("Could not read the reference species taxonomy ID map", e);
            throw new ResourceInitializationException(e);
        }
        processed = 0;
    }

    private void readReferenceMap() throws IOException {
        synchronized (referenceSpeciesTaxIds) {
            if (!referenceSpeciesTaxIds.isEmpty())
                return;
            try (final InputStream referenceSpecies = FileUtilities.findResource("/reference_species.txt")) {
                if (referenceSpecies == null)
                    throw new FileNotFoundException("Could not find the reference species mapping file expected on the classpath as /reference_species.txt.");
                try (final BufferedReader br = IOStreamUtilities.getReaderFromInputStream(referenceSpecies)) {
                    br.lines().map(line -> line.split("\t")).forEach(split -> referenceSpeciesTaxIds.put(split[1], split[0]));
                }
            }
        }
    }

    /**
     * Creates the table schema that is used to store the occurrence counts. The schema is names 'occurrences'.
     */
    private void addOccurrencesTableFieldConfiguration() {
        List> columnsDefinition = new ArrayList<>();
        columnsDefinition.add(JulieXMLTools.createField(JulieXMLConstants.NAME, "synonym", JulieXMLConstants.TYPE, "text", JulieXMLConstants.RETRIEVE, "true", JulieXMLConstants.PRIMARY_KEY, "true"));
        columnsDefinition.add(JulieXMLTools.createField(JulieXMLConstants.NAME, "tax_id", JulieXMLConstants.TYPE, "integer", JulieXMLConstants.RETRIEVE, "true", JulieXMLConstants.PRIMARY_KEY, "true"));
        columnsDefinition.add(JulieXMLTools.createField(JulieXMLConstants.NAME, "scope", JulieXMLConstants.TYPE, "text", JulieXMLConstants.RETRIEVE, "true", JulieXMLConstants.PRIMARY_KEY, "true"));
        columnsDefinition.add(JulieXMLTools.createField(JulieXMLConstants.NAME, "count", JulieXMLConstants.TYPE, "integer", JulieXMLConstants.RETRIEVE, "true"));
        final FieldConfig occurrencesFieldConfig = new FieldConfig(columnsDefinition, null, "occurrences");
        dbc.addFieldConfiguration(occurrencesFieldConfig);
    }

    /**
     * Expects the desc2tax file to reside on the classpath. It should be located in the jcore-gene-mapper-ae
     * project under src/main/resources.
     *
     * @throws ResourceInitializationException If the mapping file cannot be read.
     */
    private void readDesc2TaxMap() throws ResourceInitializationException {
        synchronized (desc2TaxId) {
            if (!desc2TaxId.isEmpty())
                return;
            try {
                InputStream mapping = FileUtilities.findResource("/desc2tax");
                if (mapping == null)
                    mapping = FileUtilities.findResource("/desc2tax.gz");
                if (mapping == null)
                    throw new ResourceInitializationException(new FileNotFoundException("Could not find the desc2tax file that maps MeSH descriptor names to taxonomy IDs."));
                try (final BufferedReader br = IOStreamUtilities.getReaderFromInputStream(mapping)) {
                    br.lines().map(line -> line.split("\t")).forEach(split -> desc2TaxId.put(split[0].trim(), split[1].trim().intern()));
                }
            } catch (IOException e) {
                log.error("IOException while searching for the descriptor to taxonomy ID mapping file", e);
                throw new ResourceInitializationException(e);
            }
        }
    }

    @Override
    public void process(JCas jCas) throws AnalysisEngineProcessException {
        final FSIterator gazGeneIt = jCas.getAnnotationIndex(Gene.type).iterator();
        if (gazGeneIt.hasNext()) {
            Set docTaxIds = getAllTaxIdsInDocument(jCas);
            Set meshTaxIds = getMeshTaxIds(jCas);
            Multimap synTaxPairs = HashMultimap.create();
            Set synonyms = new HashSet<>();
            final JCoReOverlapAnnotationIndex sentenceIndex = new JCoReOverlapAnnotationIndex<>(jCas, Sentence.type);
            final JCoReOverlapAnnotationIndex organismIndex = new JCoReOverlapAnnotationIndex<>(jCas, Organism.type);
            final JCoReOverlapAnnotationIndex npIndex = new JCoReOverlapAnnotationIndex<>(jCas, ChunkNP.type);
            final JCoReOverlapAnnotationIndex flairGeneIndex = new JCoReOverlapAnnotationIndex<>(jCas, Gene.type);
            while (gazGeneIt.hasNext()) {
                EntityMention gazSynonym = (EntityMention) gazGeneIt.next();

                // We want to iterate over the gazetteer synonym finding only. We use the ML-based gene mentions
                // as a filter below
                if (gazSynonym.getSpecificType() == null || !gazSynonym.getSpecificType().equals("GazetteerGene"))
                    continue;

                // Filter the gazetteer synonym for overlapping ML-based gene mentions to avoid FPs
                if (flairGeneIndex.search(gazSynonym).stream().anyMatch(gm -> gm.getSpecificType() == null || !gm.getSpecificType().equals("GazetteerGene"))) {

                    String synonym = getSynonym(gazSynonym);
                    synonyms.add(synonym);

                    int originalSize = occurrences.size();
                    final Optional chunkOpt = npIndex.search(gazSynonym).stream().findAny();
                    if (chunkOpt.isPresent()) {
                        final ChunkNP np = chunkOpt.get();
                        final Stream organismsInNp = organismIndex.search(np).stream();
                        organismsInNp.map(org -> org.getResourceEntryList(0).getEntryId()).flatMap(org -> Stream.of(org, referenceSpeciesTaxIds.get(org))).filter(Objects::nonNull).peek(taxId -> synTaxPairs.put(synonym, taxId)).forEach(taxId -> occurrences.add(new ImmutableTriple<>(synonym, taxId, OccurrenceScope.NP)));
                    }

                    if (occurrences.size() == originalSize) {
                        Optional sentenceOpt = sentenceIndex.search(gazSynonym).stream().findAny();
                        if (sentenceOpt.isPresent()) {
                            final Sentence sentence = sentenceOpt.get();
                            final Stream organismsInSentence = organismIndex.search(sentence).stream();
                            organismsInSentence.map(org -> org.getResourceEntryList(0).getEntryId()).flatMap(org -> Stream.of(org, referenceSpeciesTaxIds.get(org))).filter(Objects::nonNull).peek(taxId -> synTaxPairs.put(synonym, taxId)).forEach(taxId -> occurrences.add(new ImmutableTriple<>(synonym, taxId, OccurrenceScope.SENTENCE)));
                        }
                    }

                    for (String taxId : meshTaxIds)
                        occurrences.add(new ImmutableTriple<>(synonym, taxId, OccurrenceScope.MESH));

                }
            }
            for (String synonym : synonyms) {
                for (String taxId : docTaxIds)
                    if (!synTaxPairs.get(synonym).contains(taxId))
                        occurrences.add(new ImmutableTriple<>(synonym, taxId, OccurrenceScope.DOCUMENT));
            }
        }
        ++processed;
        if (processed % 500 == 0) {
            try {
                log.debug("Triggering batchProcessComplete after 500 processed documents.");
                batchProcessComplete();
            } catch (AnalysisEngineProcessException e) {
                log.error("Exception while executing batchProcessComplete", e);
                throw e;
            }
        }
    }

    private String getSynonym(EntityMention gene) {
        try {
            return normalizer.normalize(gene.getCoveredText());
        } catch (StringIndexOutOfBoundsException e) {
            try {
                log.error("Gene mention in document {} has invalid offsets: {}-{} (document text length: {})", JCoReTools.getDocId(gene.getCAS().getJCas()), gene.getBegin(), gene.getEnd(), gene.getCAS().getDocumentText().length());
            } catch (CASException casException) {
                log.error("Gene mention has invalid offsets. Cannot output more details due to CASException when trying to obtain more information.");
            }
            throw e;
        }
    }

    /**
     * Iterates through the MeSH headings that the document was indexed with and checks if they occur in our
     * desc2tax mapping. If so, the found taxId is added to the returned set.
     *
     * @param jCas The CAS.
     * @return A set containing the taxonomy IDs mapped from MeSH descriptor names.
     */
    private Set getMeshTaxIds(JCas jCas) {
        Set meshTaxIds = new HashSet<>();
        for (Annotation annotation : jCas.getAnnotationIndex(MeshHeading.type)) {
            MeshHeading heading = (MeshHeading) annotation;
            final String desc = heading.getDescriptorName();
            final Collection taxIds = desc2TaxId.get(desc);
            if (taxIds != null) {
                for (String taxId : taxIds) {
                    meshTaxIds.add(taxId);
                    String referenceId = referenceSpeciesTaxIds.get(taxId);
                    if (referenceId != null)
                        meshTaxIds.add(referenceId);
                }
            }
        }
        return meshTaxIds;
    }

    /**
     * Text and MeSH occurrences. Uses {@link #getMeshTaxIds(JCas)} to find all the MeSH-mentioned taxonomy IDs
     * and adds the taxonomy IDs from all the {@link Organism} mentions found in this document.
     *
     * @param jCas The CAS.
     * @return The set of all taxonomy IDs from MeSH descriptors and Organism mentions.
     */
    private Set getAllTaxIdsInDocument(JCas jCas) {
        Set taxIdMentions = new HashSet<>();
        for (Annotation annotation : jCas.getAnnotationIndex(Organism.type)) {
            Organism organism = (Organism) annotation;
            taxIdMentions.add(organism.getResourceEntryList(0).getEntryId());
            String referenceId = referenceSpeciesTaxIds.get(organism.getResourceEntryList(0).getEntryId());
            if (referenceId != null)
                taxIdMentions.add(referenceId);
        }
        return taxIdMentions;
    }

    private void sendOccurrencesToDatabase() throws SQLException {
        log.debug("Sending {} gene synonym species occurrences to the database", occurrences.size());
        try (final CoStoSysConnection coStoSysConnection = dbc.obtainOrReserveConnection()) {
            boolean wasAutoComit = coStoSysConnection.getAutoCommit();
            coStoSysConnection.setAutoCommit(false);
            final PreparedStatement ps = coStoSysConnection.prepareStatement("INSERT INTO " + tableName + "(synonym,tax_id,scope,count) values(?,?,?,1) ON CONFLICT ON CONSTRAINT " + tableName + "_pkey DO UPDATE SET count = " + tableName + ".count + 1");
            for (Triple occurrence : occurrences) {
                // Set the synonym
                ps.setString(1, occurrence.getLeft());
                // Set the taxonomy ID.
                ps.setInt(2, Integer.parseInt(occurrence.getMiddle()));
                // Set the occurrence scope
                ps.setString(3, occurrence.getRight().name());
                ps.addBatch();
            }
            // And execute the whole thing.
            ps.executeBatch();
            coStoSysConnection.commit();
            coStoSysConnection.setAutoCommit(wasAutoComit);
            occurrences.clear();
        }
        log.debug("Finished sending gene synonym species occurrences to the database.");
    }

    @Override
    public void batchProcessComplete() throws AnalysisEngineProcessException {
        log.debug("Batch processing complete.");
        try {
            sendOccurrencesToDatabase();
        } catch (SQLException e) {
            log.error("Could not send the collected gene synonym species occurrences to the database", e);
            throw new AnalysisEngineProcessException(e);
        }
    }

    @Override
    public void collectionProcessComplete() throws AnalysisEngineProcessException {
        log.debug("Collection processing complete.");
        try {
            sendOccurrencesToDatabase();
        } catch (SQLException e) {
            log.error("Could not send the collected gene synonym species occurrences to the database", e);
            throw new AnalysisEngineProcessException(e);
        }
    }

    public enum OccurrenceScope {SENTENCE, NP, MESH, DOCUMENT}
}