
de.julielab.genemapper.resources.NameCentricSynonymIndexGenerator


This project assembles code and files required to build the dictionaries and indexes used by the JCoRe Gene Mapper.

package de.julielab.genemapper.resources;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multiset;
import de.julielab.gene.candidateretrieval.LuceneCandidateRetrieval;
import de.julielab.gene.candidateretrieval.SynonymIndexFieldNames;
import de.julielab.geneexpbase.CandidateFilter;
import de.julielab.geneexpbase.candidateretrieval.SynHit;
import de.julielab.java.utilities.FileUtilities;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.ngram.NGramFilterFactory;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.document.*;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;

/**
 * Synonym- or gene-name-centric indexer, new as of March 11, 2019. The idea is to save storage and obtain more focused
 * gene mention search results by not indexing each synonym of each gene separately but by grouping the gene IDs by all
 * possible synonyms. Thus, each synonym is stored only once and references the list of genes it may refer to,
 * immediately exposing the ambiguity of the synonym.
 */
public class NameCentricSynonymIndexGenerator {

    private static final Logger log = LoggerFactory.getLogger(NameCentricSynonymIndexGenerator.class);

    private static final Boolean OMIT_FILTERED = true;
    private final File filteredDictFile;
    private final File ambiguousSynsFile;
    private final NGramFilterFactory nGramFilterFactory;

    /**
     * Maps gene or protein IDs to their NCBI Taxonomy ID.
     */
    Map<String, String> id2tax;

    Directory indexDirectory;
    /**
     * A file containing gene or protein names / synonyms and their respective NCBI
     * Gene or UniProt ID.
     */
    private final File dictFile;
    private final File familyRecordsFile;
    private final String idSource;
    private final String entityType;

    /**
     * @param dictFile          A file containing gene or protein names / synonyms and their
     *                          respective NCBI Gene or UniProt ID. No term normalization is
     *                          expected for this dictionary; the expected line format is sketched
     *                          in a comment below the constructor.
     * @param familyRecordsFile A file containing gene family names and their record IDs.
     * @param idSource          The identifier source stored with each index entry, e.g. "NCBI Gene".
     * @param entityType        The entity type stored with each index entry, e.g. {@link SynHit#TYPE_GEPRO}.
     * @param indexFile         The directory the name / synonym index is written to.
     */
    public NameCentricSynonymIndexGenerator(File dictFile, File familyRecordsFile, String idSource, String entityType, File indexFile) {
        this.familyRecordsFile = familyRecordsFile;
        this.idSource = idSource;
        this.entityType = entityType;
        System.out.println("Building synonym index from dictionary " + dictFile.getAbsolutePath());
        System.out.println("Adding family synonyms from " + familyRecordsFile.getAbsolutePath());
        this.dictFile = dictFile;
        this.filteredDictFile = new File(dictFile.getParent(), dictFile.getName() + ".indexGeneratorFiltered");
        this.ambiguousSynsFile = new File(dictFile.getParent(), "intra_tax_ambiguous.eg");
        indexDirectory = createIndexDirectory(indexFile);

        Map<String, String> ngramFilterSettings = new HashMap<>();
        ngramFilterSettings.put("minGramSize", "2");
        ngramFilterSettings.put("maxGramSize", "3");
        nGramFilterFactory = new NGramFilterFactory(ngramFilterSettings);

    }
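
    // Expected dictionary line format (the values below are made up for illustration, not actual dictionary content):
    // three tab-separated columns per line - synonym, gene/protein ID and an integer priority, where the official
    // gene symbol has priority -1 and higher numbers mean a lower priority, e.g.
    //
    //   il-6 <TAB> 3569 <TAB> -1
    //   interleukin 6 <TAB> 3569 <TAB> 2
    //
    // The file must be sorted by the synonym column so that all entries of one synonym are adjacent
    // (see the comment in createIndex()).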

    /**
     * To execute the NameCentricSynonymIndexGenerator, start it with the following command-line
     * arguments:
     * arg0: path to the resources directory<br>
     * arg1: name of the gene info file, relative to the resources directory<br>
     * arg2: path to the synonym indices directory
     *
     * @param args
     */
    public static void main(String[] args) {
        long s1 = System.currentTimeMillis();
        if (args.length != 3) {
            System.err.println(
                    "Usage: SynonymIndexGenerator <resources directory> <gene info file> <synonym indices directory>");
            System.exit(1);
        }
        String resPath = args[0];
        File resDir = new File(resPath);
        if (!resDir.isDirectory()) {
            System.err.println("Could not find resources directory");
            System.exit(1);
        }
        if (!resPath.endsWith(File.separator)) {
            resPath = resPath + File.separator;
        }
        File geneInfo = new File(resPath + args[1]);
        if (!geneInfo.exists()) {
            System.err.println("Gene info file could not be found at " + geneInfo.getAbsolutePath());
            System.exit(1);
        }
        String indexPath = args[2];
        if (!indexPath.endsWith("/")) {
            indexPath = indexPath + "/";
        }
        File geneIndexDir = new File(indexPath + "geneSynonymIndex");
        File proteinIndexDir = new File(indexPath + "proteinSynonymIndex");
        if (geneIndexDir.exists())
            FileUtils.deleteQuietly(geneIndexDir);
        if (proteinIndexDir.exists())
            FileUtils.deleteQuietly(proteinIndexDir);

        File upDictFile = new File(resPath + "gene.dict.variants.norm.up");
        //checkFile(upDictFile);
        File egDictFile = new File(resPath + "gene.dict.variants.norm.filtered.eg");
        checkFile(egDictFile);
        File upTaxMap = new File(resPath + "up2eg2tax.map");
        checkFile(upTaxMap);
        File familyRecordsFile = new File(resPath + "familyrecords.dict");
        File egTaxMap = geneInfo;

        NameCentricSynonymIndexGenerator indexGenerator;
        try {
            // indexGenerator = new NameCentricSynonymIndexGenerator(upDictFile, proteinIndexDir);
            // indexGenerator.readUpTaxMap(upTaxMap);
            // indexGenerator.createIndex();
            indexGenerator = new NameCentricSynonymIndexGenerator(egDictFile, familyRecordsFile, "NCBI Gene", SynHit.TYPE_GEPRO, geneIndexDir);
            indexGenerator.readEgTaxMap(egTaxMap);
            indexGenerator.createIndex();
        } catch (IOException e) {
            e.printStackTrace();
        }
        long s2 = System.currentTimeMillis();
        System.out.println("Index created successfully! (" + (s2 - s1) / 1000 + " sec)");
    }

    private static void checkFile(File file) {
        if (!file.isFile())
            throw new IllegalArgumentException("File \"" + file.getAbsolutePath() + "\" could not be found.");
    }

    /**
     * Creates the synonym index. Each unique synonym is indexed in a document of its own. Each such document
     * has a number of fields for each gene that has the current synonym and lists the gene ID, its tax ID (if the
     * tax ID mapping is given) and the "priority" that the synonym has for the gene. The priority aims to describe
     * the reliability of the source that provided the respective synonym. Higher numbers mean a lower priority.
     * The official gene symbol has priority -1.
     *
     * @throws IOException
     */
    public void createIndex() throws IOException {
        CandidateFilter cf = new CandidateFilter();
        WhitespaceAnalyzer wsAnalyzer = new WhitespaceAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(wsAnalyzer);
        iwc.setOpenMode(OpenMode.CREATE);
        log.info("Using up to 20 threads for index document creation");
        // VERY IMPORTANT: The dictionary file must be sorted by synonym. This is because we want to group the
        // dictionary entries by synonym but we don't want to read the whole dictionary and sort it in-memory
        // because this may well exhaust the memory for the full all-species dictionary.
        final ExecutorService executorService = Executors.newFixedThreadPool(20);
        try (IndexWriter iw = new IndexWriter(indexDirectory, iwc)) {
            try (BufferedWriter outdictBw = FileUtilities.getWriterToFile(filteredDictFile);
                 BufferedWriter ambiguousSynonymsBw = FileUtilities.getWriterToFile(ambiguousSynsFile)) {
                try {
                    indexDictionary(dictFile, idSource, entityType, cf, executorService, iw, outdictBw, ambiguousSynonymsBw);
                    // indexDictionary(familyRecordsFile, "GenoFamilies", SynHit.TYPE_GROUP, null, executorService, iw, null, null);
                } finally {
                    try {
                        // log.info("Shutting down executor.");
                        executorService.shutdown();
                        log.info("Waiting for running threads to terminate.");
                        // The 100 days basically mean "wait until we are finished"
                        executorService.awaitTermination(100, TimeUnit.DAYS);
                    } catch (InterruptedException e) {
                        log.warn("Waiting for running threads to finish has been interrupted. Shutting down the executor service now.");
                        executorService.shutdownNow();
                    }
                    log.info("ExecutorService has been shut down.");
                }
            }
            // try (final BufferedReader brFam = FileUtilities.getReaderFromFile(familyDict)) {
            //     // Index the family names
            //     String line;
            //     while ((line = brFam.readLine()) != null) {
            //         String[] split = line.split("\t");
            //         String familySynonym = split[0];
            //         String famplexId = split[1];
            //
            //         Document doc = new Document();
            //         Field lookupSynField = new TextField(SynonymIndexFieldNames.LOOKUP_SYN_FIELD, familySynonym,
            //                 Store.YES);
            //         Field idField = new StringField(SynonymIndexFieldNames.ID_FIELD, famplexId, Store.NO);
            //         // To fit into the prioritized synonym schema, we just assign the highest priority to the family synonyms
            //         Field idPriorityField = new StringField(SynonymIndexFieldNames.ID_FIELD, famplexId + LuceneCandidateRetrieval.NAME_PRIO_DELIMITER + "-1", Store.YES);
            //         Field idSourceField = new StringField(SynonymIndexFieldNames.SOURCE, "FamPlex", Store.YES);
            //         doc.add(lookupSynField);
            //         doc.add(idField);
            //         doc.add(idPriorityField);
            //         doc.add(idSourceField);
            //         iw.addDocument(doc);
            //     }
            // }
            log.info("Committing all index additions.");
            iw.commit();
        }
    }

    public void indexDictionary(File dictFile, String idSource, String entityType, CandidateFilter cf, ExecutorService executorService, IndexWriter iw, BufferedWriter outdictBw, BufferedWriter ambiguousSynonymsBw) throws IOException {
        try (final BufferedReader br = FileUtilities.getReaderFromFile(dictFile)) {
            AtomicInteger counter = new AtomicInteger();
            String line;
            String currentSynonym = null;
            List<String[]> entriesForCurrentSynonym = new ArrayList<>();
            while ((line = br.readLine()) != null) {
                currentSynonym = processLine(line, currentSynonym, entriesForCurrentSynonym, counter, idSource, entityType, iw, outdictBw, ambiguousSynonymsBw, cf, executorService);
            }
            if (currentSynonym != null) {
                // A sentinel line whose synonym differs from all real synonyms; it flushes the last group.
                line = "$$END\tOF\tFILE$$";
                processLine(line, currentSynonym, entriesForCurrentSynonym, counter, idSource, entityType, iw, outdictBw, ambiguousSynonymsBw, cf, executorService);
            }
        }
    }

    private String processLine(String line, String currentSynonym, List<String[]> entriesForCurrentSynonym, AtomicInteger counter, String idSource, String entityType, IndexWriter iw, BufferedWriter outdictBw, BufferedWriter ambiguousSynonymsBw, CandidateFilter cf, ExecutorService executorService) {
        final String[] split = line.split("\t");
        if (split.length != 3)
            System.err.println("ERR: normalized dictionary not in expected format. \ncritical line: " + line);
        String synonym = split[0];
        if (currentSynonym == null)
            currentSynonym = synonym;
        // Have we reached the next synonym? Then we must first create the index items for the current
        // synonym before we continue
        if (!synonym.equals(currentSynonym)) {
            final String synonymToWrite = currentSynonym;
            final List<String[]> entriesToWrite = new ArrayList<>(entriesForCurrentSynonym);
            executorService.submit(() -> {
                try {
                    indexCurrentSynonymEntries(cf, idSource, entityType, iw, outdictBw, ambiguousSynonymsBw, counter, synonymToWrite, entriesToWrite);
                } catch (IOException e) {
                    log.error("Could not create index document for synonym {}", synonymToWrite, e);
                }
            });
            entriesForCurrentSynonym.clear();
        }
        entriesForCurrentSynonym.add(split);
        currentSynonym = synonym;
        return currentSynonym;
    }

    /**
     * Takes the arrays with the gene IDs that have the passed synonym, together with the priorities that are also stored in the arrays given by entriesForCurrentSynonym.
     * Creates one Lucene document for the synonym and puts all the IDs with their priorities into one field (separated by {@link LuceneCandidateRetrieval#NAME_PRIO_DELIMITER}) and the respective taxonomy IDs into another field.
     * The priorities are assigned in the _makeGeneDictionary.sh script and should reflect the reliability of the source that
     * gave the corresponding synonym to the gene. The generated document is added to the IndexWriter within the method.
     *
     * @param cf                       The candidate filter for filtering out synonyms that look as if they wouldn't help at all.
     * @param idSource                 The identifier source stored with each gene entry, e.g. "NCBI Gene".
     * @param entityType               The entity type stored with each gene entry, e.g. {@link SynHit#TYPE_GEPRO}.
     * @param iw                       The Lucene index writer.
     * @param outdictBw                Writer for the filtered output dictionary.
     * @param ambiguousSynonymsBw      Writer for gene IDs whose synonym is ambiguous within a single taxonomy.
     * @param counter                  A parameter for counting the number of synonyms processed, for status output.
     * @param currentSynonym           The synonym for which all entries have been collected in entriesForCurrentSynonym.
     * @param entriesForCurrentSynonym All the IDs of genes that have the currentSynonym and the priority with which they have the synonym.
     * @throws IOException
     */
    private void indexCurrentSynonymEntries(CandidateFilter cf, String idSource, String entityType, IndexWriter iw, BufferedWriter outdictBw, BufferedWriter ambiguousSynonymsBw, AtomicInteger counter, String currentSynonym, List<String[]> entriesForCurrentSynonym) throws IOException {
        Document doc = new Document();
        Field lookupSynField = new TextField(SynonymIndexFieldNames.LOOKUP_SYN_FIELD, currentSynonym, Store.YES);
        doc.add(lookupSynField);
        SnowballFilter ts = new SnowballFilter(new WhitespaceAnalyzer().tokenStream(SynonymIndexFieldNames.LOOKUP_SYN_FIELD_STEMMED, currentSynonym), "English");
        doc.add(new TextField(SynonymIndexFieldNames.LOOKUP_SYN_FIELD_STEMMED, ts));
        TokenFilter ngrams = nGramFilterFactory.create(new WhitespaceAnalyzer().tokenStream(SynonymIndexFieldNames.LOOKUP_SYN_FIELD_NGRAMS, currentSynonym));
        doc.add(new TextField(SynonymIndexFieldNames.LOOKUP_SYN_FIELD_NGRAMS, ngrams));
        doc.add(new StringField(SynonymIndexFieldNames.LOOKUP_SYN_FIELD_EXACT, currentSynonym, Store.NO));
        List<Field> fields = new ArrayList<>();
        int minPriority = Integer.MAX_VALUE;
        Multiset<String> taxIdsForSynonym = HashMultiset.create();
        Multimap<String, String> tax2Id = HashMultimap.create();
        for (String[] geneEntry : entriesForCurrentSynonym) {
            String id = geneEntry[1];
            boolean isFamilyDict = id.startsWith("GENO:");
            Integer priority = Integer.parseInt(geneEntry[2]);
            if (priority < minPriority)
                minPriority = priority;
            boolean filtered = false;
            // If the synonym is the official gene symbol, we accept it, no matter what
            if (cf != null && !OMIT_FILTERED && priority != -1) {
                filtered = DictionaryFamilyDomainFilter.isFiltered(id, cf, currentSynonym);
            }
            if (log.isDebugEnabled()) {
                log.debug("ID: {}, synonym: {}, filtered out: {}", id, currentSynonym, filtered);
            }
            String tax = isFamilyDict ? "0" : "";
            if (id2tax.get(id) != null) {
                tax = id2tax.get(id);
                if (priority <= 3) {
                    taxIdsForSynonym.add(tax);
                    tax2Id.put(tax, id);
                }
            }
            Field idField = new StringField(SynonymIndexFieldNames.ID_FIELD, id, Store.NO);
            Field idPriorityField = new StringField(SynonymIndexFieldNames.ID_FIELD, id + LuceneCandidateRetrieval.NAME_PRIO_DELIMITER + priority, Store.YES);
            Field taxField = new StringField(SynonymIndexFieldNames.TAX_ID_FIELD, tax, Store.YES);
            IntPoint priorityField = new IntPoint(SynonymIndexFieldNames.PRIORITY, priority);
            if (!OMIT_FILTERED) {
                IntPoint filteredField = new IntPoint(SynonymIndexFieldNames.FILTERED, filtered ? 1 : 0);
                StoredField storedFilteredField = new StoredField(SynonymIndexFieldNames.FILTERED, filtered ? 1 : 0);
                fields.add(filteredField);
                fields.add(storedFilteredField);
            }
            Field idSourceField = new StringField(SynonymIndexFieldNames.SOURCE, isFamilyDict ? "GenoFamilies" : idSource, Store.YES);
            Field typeField = new StringField(SynonymIndexFieldNames.ENTITY_TYPE, entityType, Store.YES);
            fields.add(idField);
            fields.add(idPriorityField);
            fields.add(taxField);
            fields.add(priorityField);
            fields.add(idSourceField);
            fields.add(typeField);
        }
        if (!fields.isEmpty()) {
            for (Field f : fields)
                doc.add(f);
            iw.addDocument(doc);
            if (outdictBw != null && minPriority <= 3) {
                synchronized (outdictBw) {
                    outdictBw.write(currentSynonym + "\tGene");
                    outdictBw.newLine();
                }
            }
            if (ambiguousSynonymsBw != null) {
                for (String tax : taxIdsForSynonym) {
                    if (taxIdsForSynonym.count(tax) > 1) {
                        Collection<String> intraAmbiguousIds = tax2Id.get(tax);
                        synchronized (ambiguousSynonymsBw) {
                            for (String id : intraAmbiguousIds) {
                                ambiguousSynonymsBw.write(id);
                                ambiguousSynonymsBw.newLine();
                            }
                        }
                    }
                }
            }
        }
        int done = counter.incrementAndGet();
        if (done % 10000 == 0) {
            log.debug("# entries processed: " + done);
        }
    }

    /**
     * Creates the directory object where to put the Lucene index.
     */
    private FSDirectory createIndexDirectory(File indexFile) {
        FSDirectory fdir = null;
        try {
            fdir = FSDirectory.open(indexFile.toPath());
        } catch (IOException e) {
            e.printStackTrace();
        }
        return fdir;
    }

    private void readUpTaxMap(File taxMap) throws IOException {
        log.info("Reading up2eg2tax.map ...");
        id2tax = new HashMap<>();
        BufferedReader reader = new BufferedReader(new FileReader(taxMap));
        String line = "";
        while ((line = reader.readLine()) != null) {
            String[] entry = line.split("\t");
            if (entry.length != 3) {
                System.err.println("ERR: up2eg2tax.map not in expected format. \ncritical line: " + line);
                System.exit(-1);
            }
            String id = entry[0].trim();
            String taxId = entry[2].trim();
            id2tax.put(id, taxId);
        }
        reader.close();
    }

    private void readEgTaxMap(File geneInfo) throws IOException {
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new GZIPInputStream(new FileInputStream(geneInfo))))) {
            // gene_info format: column 0 is the taxonomy ID, column 1 is the gene ID
            id2tax = br.lines().collect(
                    Collectors.toMap(l -> l.split("\\t", 3)[1], l -> l.split("\\t", 3)[0]));
        }
    }
}



