de.julielab.genemapper.resources.NameCentricSynonymIndexGenerator Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of gene-mapper-resources Show documentation
This project assembles code and files required to build the dictionaries and indexes used by the JCoRe
Gene Mapper.
The newest version!
package de.julielab.genemapper.resources;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multiset;
import de.julielab.gene.candidateretrieval.LuceneCandidateRetrieval;
import de.julielab.gene.candidateretrieval.SynonymIndexFieldNames;
import de.julielab.geneexpbase.CandidateFilter;
import de.julielab.geneexpbase.candidateretrieval.SynHit;
import de.julielab.java.utilities.FileUtilities;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.ngram.NGramFilterFactory;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.document.*;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;
/**
* Synonym or gene name centric indexer, new as of March 11, 2019. The idea is to save storage and gain more focused
* gene mention search results by not indexing each synonym of each gene but group the gene ids by all possible
* synonyms. Thus, each synonym is only stored once and references the list of genes it may refer to, immediately
* showing the ambiguity of the synonym.
*/
public class NameCentricSynonymIndexGenerator {
    private static final Logger log = LoggerFactory.getLogger(NameCentricSynonymIndexGenerator.class);
    /**
     * When {@code true}, the {@link CandidateFilter} is never consulted and no FILTERED fields are
     * written to the index documents. Primitive {@code boolean}: the former {@code Boolean} wrapper
     * added autoboxing for no benefit.
     */
    private static final boolean OMIT_FILTERED = true;
    /** Receives a copy of the dictionary restricted to the synonyms that were actually indexed. */
    private final File filteredDictFile;
    /** Receives the IDs of genes whose synonym is ambiguous within a single taxonomy. */
    private final File ambiguousSynsFile;
    /** Produces 2-3 grams for the fuzzy lookup field of each synonym. */
    private final NGramFilterFactory nGramFilterFactory;
    /** Maps gene/protein IDs to their NCBI taxonomy ID; populated by one of the readXyTaxMap methods. */
    Map<String, String> id2tax;
    Directory indexDirectory;
    /**
     * A file containing gene or protein names / synonyms and their respective NCBI
     * Gene or UniProt ID.
     */
    private final File dictFile;
    private final File familyRecordsFile;
    private final String idSource;
    private final String entityType;

    /**
     * @param dictFile          A file containing gene or protein names / synonyms and their
     *                          respective NCBI Gene or UniProt ID. No term normalization is
     *                          expected for this dictionary. VERY IMPORTANT: the file must be
     *                          sorted by synonym; see {@link #createIndex()}.
     * @param familyRecordsFile A dictionary of gene family names (currently not indexed, see
     *                          {@link #createIndex()}).
     * @param idSource          The value for the index SOURCE field, e.g. "NCBI Gene".
     * @param entityType        The value for the index ENTITY_TYPE field.
     * @param indexFile         The directory where the name / synonym index will be written to.
     */
    public NameCentricSynonymIndexGenerator(File dictFile, File familyRecordsFile, String idSource, String entityType, File indexFile) {
        this.familyRecordsFile = familyRecordsFile;
        this.idSource = idSource;
        this.entityType = entityType;
        System.out.println("Building synonym index from dictionary " + dictFile.getAbsolutePath());
        System.out.println("Adding family synonyms from " + familyRecordsFile.getAbsolutePath());
        this.dictFile = dictFile;
        // Side-output files are placed next to the input dictionary.
        this.filteredDictFile = new File(dictFile.getParent(), dictFile.getName() + ".indexGeneratorFiltered");
        this.ambiguousSynsFile = new File(dictFile.getParent(), "intra_tax_ambiguous.eg");
        indexDirectory = createIndexDirectory(indexFile);
        Map<String, String> ngramFilterSettings = new HashMap<>();
        ngramFilterSettings.put("minGramSize", "2");
        ngramFilterSettings.put("maxGramSize", "3");
        nGramFilterFactory = new NGramFilterFactory(ngramFilterSettings);
    }

    /**
     * To execute the index generator start it with the following command-line arguments:
     * arg0: path to resources directory, arg1: gene_info file name relative to arg0,
     * arg2: path to synonym indices directory.
     *
     * @param args See above.
     */
    public static void main(String[] args) {
        long s1 = System.currentTimeMillis();
        if (args.length != 3) {
            // NOTE(review): the original message had lost its argument placeholders (probably
            // stripped as HTML); restored from the actual argument handling below.
            System.err.println(
                    "Usage: SynonymIndexGenerator <resourcesDirectory> <geneInfoFile> <indexOutputDirectory>");
            System.exit(1);
        }
        String resPath = args[0];
        File resDir = new File(resPath);
        if (!resDir.isDirectory()) {
            System.err.println("Could not find resources directory");
            System.exit(1);
        }
        if (!resPath.endsWith(File.separator)) {
            resPath = resPath + File.separator;
        }
        // The gene_info file is given relative to the resources directory.
        File geneInfo = new File(resPath + args[1]);
        if (!geneInfo.exists()) {
            System.err.println("Gene info file could not be found at " + geneInfo.getAbsolutePath());
            System.exit(1);
        }
        String indexPath = args[2];
        if (!indexPath.endsWith("/")) {
            indexPath = indexPath + "/";
        }
        File geneIndexDir = new File(indexPath + "geneSynonymIndex");
        File proteinIndexDir = new File(indexPath + "proteinSynonymIndex");
        // Always rebuild from scratch: remove pre-existing indexes.
        if (geneIndexDir.exists())
            FileUtils.deleteQuietly(geneIndexDir);
        if (proteinIndexDir.exists())
            FileUtils.deleteQuietly(proteinIndexDir);
        File upDictFile = new File(resPath + "gene.dict.variants.norm.up");
        //checkFile(upDictFile);
        File egDictFile = new File(resPath + "gene.dict.variants.norm.filtered.eg");
        checkFile(egDictFile);
        File upTaxMap = new File(resPath + "up2eg2tax.map");
        checkFile(upTaxMap);
        File familyRecordsFile = new File(resPath + "familyrecords.dict");
        File egTaxMap = geneInfo;
        NameCentricSynonymIndexGenerator indexGenerator;
        try {
            // The UniProt-based protein index is currently disabled:
            // indexGenerator = new NameCentricSynonymIndexGenerator(upDictFile, proteinIndexDir);
            // indexGenerator.readUpTaxMap(upTaxMap);
            // indexGenerator.createIndex();
            indexGenerator = new NameCentricSynonymIndexGenerator(egDictFile, familyRecordsFile, "NCBI Gene", SynHit.TYPE_GEPRO, geneIndexDir);
            indexGenerator.readEgTaxMap(egTaxMap);
            indexGenerator.createIndex();
        } catch (IOException e) {
            e.printStackTrace();
        }
        long s2 = System.currentTimeMillis();
        System.out.println("Index created successfully! (" + (s2 - s1) / 1000 + " sec)");
    }

    /**
     * Fails fast with an {@link IllegalArgumentException} if the given file does not exist
     * or is not a regular file.
     */
    private static void checkFile(File file) {
        if (!file.isFile())
            throw new IllegalArgumentException("File \"" + file.getAbsolutePath() + "\" could not be found.");
    }

    /**
     * Creates the synonym index. Each unique synonym is indexed in a document of its own. Each such document
     * has a number of fields for each gene that has the current synonym and lists the gene ID, its tax ID (if the
     * tax ID mapping is given) and the "priority" that the synonym has for the gene. The priority aims to describe
     * the reliability of the source given the respective synonym. Higher numbers mean a lower priority.
     * The official gene symbol has priority -1.
     *
     * @throws IOException If reading the dictionary or writing the index fails.
     */
    public void createIndex() throws IOException {
        CandidateFilter cf = new CandidateFilter();
        WhitespaceAnalyzer wsAnalyzer = new WhitespaceAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(wsAnalyzer);
        iwc.setOpenMode(OpenMode.CREATE);
        log.info("Using up to 20 threads for index document creation");
        // VERY IMPORTANT: The dictionary file must be sorted by synonym. This is because we want to group the
        // dictionary entries by synonym but we don't want to read the whole dictionary and sort it in-memory
        // because this may well exhaust the memory for the full all-species dictionary.
        final ExecutorService executorService = Executors.newFixedThreadPool(20);
        try (IndexWriter iw = new IndexWriter(indexDirectory, iwc)) {
            try (BufferedWriter outdictBw = FileUtilities.getWriterToFile(filteredDictFile);
                 BufferedWriter ambiguousSynonymsBw = FileUtilities.getWriterToFile(ambiguousSynsFile)) {
                try {
                    indexDictionary(dictFile, idSource, entityType, cf, executorService, iw, outdictBw, ambiguousSynonymsBw);
                    // Family record indexing is currently disabled:
                    // indexDictionary(familyRecordsFile, "GenoFamilies", SynHit.TYPE_GROUP, null, executorService, iw, null, null);
                } finally {
                    try {
                        executorService.shutdown();
                        log.info("Waiting for running threads to terminate.");
                        // The 100 days basically mean "wait until we are finished"
                        executorService.awaitTermination(100, TimeUnit.DAYS);
                    } catch (InterruptedException e) {
                        log.warn("Waiting for running threads to finish has been interrupted. Shutting down the executor service now.");
                        executorService.shutdownNow();
                        // Restore the interrupt flag for callers further up the stack.
                        Thread.currentThread().interrupt();
                    }
                    log.info("ExecutorService has been shut down.");
                }
            }
            log.info("Committing all index additions.");
            iw.commit();
        }
    }

    /**
     * Streams the (synonym-sorted) dictionary file line by line, delegating the grouping-by-synonym
     * logic to {@link #processLine}. A sentinel line is processed at the end so the last synonym
     * group is also flushed to the index.
     *
     * @throws IOException If the dictionary file cannot be read.
     */
    public void indexDictionary(File dictFile, String idSource, String entityType, CandidateFilter cf, ExecutorService executorService, IndexWriter iw, BufferedWriter outdictBw, BufferedWriter ambiguousSynonymsBw) throws IOException {
        try (final BufferedReader br = FileUtilities.getReaderFromFile(dictFile)) {
            AtomicInteger counter = new AtomicInteger();
            String line;
            String currentSynonym = null;
            List<String[]> entriesForCurrentSynonym = new ArrayList<>();
            while ((line = br.readLine()) != null) {
                currentSynonym = processLine(line, currentSynonym, entriesForCurrentSynonym, counter, idSource, entityType, iw, outdictBw, ambiguousSynonymsBw, cf, executorService);
            }
            if (currentSynonym != null) {
                // Sentinel that cannot equal any real synonym; forces the flush of the last group.
                line = "$$END\tOF\tFILE$$";
                processLine(line, currentSynonym, entriesForCurrentSynonym, counter, idSource, entityType, iw, outdictBw, ambiguousSynonymsBw, cf, executorService);
            }
        }
    }

    /**
     * Collects consecutive dictionary lines that share the same synonym. When the synonym changes,
     * the accumulated entries for the previous synonym are handed to the executor service for
     * asynchronous index document creation.
     *
     * @return The synonym of the given line, which becomes the new "current" synonym.
     */
    private String processLine(String line, String currentSynonym, List<String[]> entriesForCurrentSynonym, AtomicInteger counter, String idSource, String entityType, IndexWriter iw, BufferedWriter outdictBw, BufferedWriter ambiguousSynonymsBw, CandidateFilter cf, ExecutorService executorService) {
        final String[] split = line.split("\t");
        // Expected format: synonym <tab> id <tab> priority. The original check compared
        // "!= 3 && != 3" (a duplicated condition) and then processed the broken line anyway,
        // which would later crash on split[2] in a worker thread. Skip malformed lines instead.
        if (split.length != 3) {
            System.err.println("ERR: normalized dictionary not in expected format. \ncritical line: " + line);
            return currentSynonym;
        }
        String synonym = split[0];
        if (currentSynonym == null)
            currentSynonym = synonym;
        // Have we reached the next synonym? Then we must first create the index items for the current
        // synonym before we continue
        if (!synonym.equals(currentSynonym)) {
            final String synonymToWrite = currentSynonym;
            // Copy the accumulated entries so the worker thread is isolated from the
            // subsequent clear() and re-use of the accumulator list.
            final List<String[]> entriesToWrite = new ArrayList<>(entriesForCurrentSynonym);
            executorService.submit(() -> {
                try {
                    indexCurrentSynonymEntries(cf, idSource, entityType, iw, outdictBw, ambiguousSynonymsBw, counter, synonymToWrite, entriesToWrite);
                } catch (IOException e) {
                    log.error("Could not create index document for synonym {}", synonymToWrite, e);
                }
            });
            entriesForCurrentSynonym.clear();
        }
        entriesForCurrentSynonym.add(split);
        currentSynonym = synonym;
        return currentSynonym;
    }

    /**
     * Takes the arrays with the gene IDs that have the passed synonym with the priorities that are also stored in the arrays given by entriesForCurrentSynonym.
     * Creates one Lucene document for the synonym and sets all the IDs with their priorities into one field (separated by {@link LuceneCandidateRetrieval#NAME_PRIO_DELIMITER}) and the respective taxonomy IDs in another field.
     * The priorities are assigned in the _makeGeneDictionary.sh script and should reflect the reliability of the source that
     * gave the corresponding synonym to the gene.
     * <p>
     * This method is called concurrently from the worker threads; {@link IndexWriter} is thread-safe,
     * the two writers are guarded by synchronized blocks below.
     *
     * @param cf                       The candidate filter for filtering out synonyms that look as they wouldn't help at all.
     * @param idSource                 The value for the SOURCE index field.
     * @param iw                       The Lucene index writer.
     * @param outdictBw                Writer for the filtered dictionary side output; may be null.
     * @param ambiguousSynonymsBw      Writer for intra-taxonomy ambiguous gene IDs; may be null.
     * @param counter                  A parameter for counting the number of synonyms processed, for status output.
     * @param currentSynonym           The synonym for which all entries have been collected in entriesForCurrentSynonym.
     * @param entriesForCurrentSynonym All the IDs of genes that have the currentSynonym and the priority with which they have the synonym.
     * @throws IOException If writing to the index or the side-output files fails.
     */
    private void indexCurrentSynonymEntries(CandidateFilter cf, String idSource, String entityType, IndexWriter iw, BufferedWriter outdictBw, BufferedWriter ambiguousSynonymsBw, AtomicInteger counter, String currentSynonym, List<String[]> entriesForCurrentSynonym) throws IOException {
        Document doc = new Document();
        Field lookupSynField = new TextField(SynonymIndexFieldNames.LOOKUP_SYN_FIELD, currentSynonym,
                Store.YES);
        doc.add(lookupSynField);
        // Additional lookup variants: stemmed tokens and character 2-3-grams.
        SnowballFilter ts = new SnowballFilter(new WhitespaceAnalyzer().tokenStream(SynonymIndexFieldNames.LOOKUP_SYN_FIELD_STEMMED, currentSynonym), "English");
        doc.add(new TextField(SynonymIndexFieldNames.LOOKUP_SYN_FIELD_STEMMED, ts));
        TokenFilter ngrams = nGramFilterFactory.create(new WhitespaceAnalyzer().tokenStream(SynonymIndexFieldNames.LOOKUP_SYN_FIELD_NGRAMS, currentSynonym));
        doc.add(new TextField(SynonymIndexFieldNames.LOOKUP_SYN_FIELD_NGRAMS, ngrams));
        doc.add(new StringField(SynonymIndexFieldNames.LOOKUP_SYN_FIELD_EXACT, currentSynonym, Store.NO));
        List<Field> fields = new ArrayList<>();
        int minPriority = Integer.MAX_VALUE;
        // Counts how many high-priority genes per taxonomy share this synonym; a count > 1
        // means the synonym is ambiguous within that taxonomy.
        Multiset<String> taxIdsForSynonym = HashMultiset.create();
        Multimap<String, String> tax2Id = HashMultimap.create();
        for (String[] geneEntry : entriesForCurrentSynonym) {
            String id = geneEntry[1];
            boolean isFamilyDict = id.startsWith("GENO:");
            int priority = Integer.parseInt(geneEntry[2]);
            if (priority < minPriority)
                minPriority = priority;
            boolean filtered = false;
            // If the synonym is the official gene symbol (priority -1), we accept it, no matter what
            if (cf != null && !OMIT_FILTERED && priority != -1) {
                filtered = DictionaryFamilyDomainFilter.isFiltered(id, cf, currentSynonym);
            }
            if (log.isDebugEnabled()) {
                log.debug("ID: {}, synonym: {}, filtered out: {}", id, currentSynonym, filtered);
            }
            String tax = isFamilyDict ? "0" : "";
            // Look the taxonomy ID up once instead of twice.
            String mappedTax = id2tax.get(id);
            if (mappedTax != null) {
                tax = mappedTax;
                // Only reasonably reliable synonyms (priority <= 3) count towards ambiguity.
                if (priority <= 3) {
                    taxIdsForSynonym.add(tax);
                    tax2Id.put(tax, id);
                }
            }
            Field idField = new StringField(SynonymIndexFieldNames.ID_FIELD, id, Store.NO);
            Field idPriorityField = new StringField(SynonymIndexFieldNames.ID_FIELD, id + LuceneCandidateRetrieval.NAME_PRIO_DELIMITER + priority, Store.YES);
            Field taxField = new StringField(SynonymIndexFieldNames.TAX_ID_FIELD, tax, Store.YES);
            IntPoint priorityField = new IntPoint(SynonymIndexFieldNames.PRIORITY, priority);
            if (!OMIT_FILTERED) {
                IntPoint filteredField = new IntPoint(SynonymIndexFieldNames.FILTERED, filtered ? 1 : 0);
                StoredField storedFilteredField = new StoredField(SynonymIndexFieldNames.FILTERED,
                        filtered ? 1 : 0);
                fields.add(filteredField);
                fields.add(storedFilteredField);
            }
            Field idSourceField = new StringField(SynonymIndexFieldNames.SOURCE, isFamilyDict ? "GenoFamilies" : idSource, Store.YES);
            Field typeField = new StringField(SynonymIndexFieldNames.ENTITY_TYPE, entityType, Store.YES);
            fields.add(idField);
            fields.add(idPriorityField);
            fields.add(taxField);
            fields.add(priorityField);
            fields.add(idSourceField);
            fields.add(typeField);
        }
        if (!fields.isEmpty()) {
            for (Field f : fields)
                doc.add(f);
            iw.addDocument(doc);
            if (outdictBw != null && minPriority <= 3) {
                synchronized (outdictBw) {
                    outdictBw.write(currentSynonym + "\tGene");
                    outdictBw.newLine();
                }
            }
            if (ambiguousSynonymsBw != null) {
                // Iterate the distinct tax IDs; iterating the multiset itself would visit a tax ID
                // once per occurrence and write the same ambiguous-ID group multiple times.
                for (String tax : taxIdsForSynonym.elementSet()) {
                    if (taxIdsForSynonym.count(tax) > 1) {
                        Collection<String> intraAmbiguousIds = tax2Id.get(tax);
                        synchronized (ambiguousSynonymsBw) {
                            for (String id : intraAmbiguousIds) {
                                ambiguousSynonymsBw.write(id);
                                ambiguousSynonymsBw.newLine();
                            }
                        }
                    }
                }
            }
        }
        int done = counter.incrementAndGet();
        if (done % 10000 == 0) {
            log.debug("# entries processed: " + done);
        }
    }

    /**
     * create the directory object where to put the lucene index...
     */
    private FSDirectory createIndexDirectory(File indexFile) {
        FSDirectory fdir = null;
        try {
            fdir = FSDirectory.open(indexFile.toPath());
        } catch (IOException e) {
            e.printStackTrace();
        }
        return fdir;
    }

    /**
     * Reads the UniProt-to-taxonomy mapping (columns: UniProt ID, Entrez Gene ID, tax ID) into
     * {@link #id2tax}. Currently unused, see {@link #main}. Uses try-with-resources so the reader
     * is also closed when an exception occurs (the original leaked it in that case).
     */
    private void readUpTaxMap(File taxMap) throws IOException {
        log.info("Reading up2eg2tax.map ...");
        id2tax = new HashMap<>();
        try (BufferedReader reader = new BufferedReader(new FileReader(taxMap))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] entry = line.split("\t");
                if (entry.length != 3) {
                    System.err.println("ERR: up2eg2tax.map not in expected format. \ncritical line: " + line);
                    System.exit(-1);
                }
                String id = entry[0].trim();
                String taxId = entry[2].trim();
                id2tax.put(id, taxId);
            }
        }
    }

    /**
     * Reads the NCBI gene_info file (gzipped; column 1: tax ID, column 2: gene ID) into
     * {@link #id2tax}. Each line is split only once, and a merge function keeps the first
     * mapping should a gene ID unexpectedly occur twice (the original collector would have
     * thrown an IllegalStateException in that case).
     */
    private void readEgTaxMap(File geneInfo) throws IOException {
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new GZIPInputStream(new FileInputStream(geneInfo))))) {
            id2tax = br.lines()
                    .map(l -> l.split("\\t", 3))
                    .collect(Collectors.toMap(parts -> parts[1], parts -> parts[0], (first, second) -> first));
        }
    }
}