/**
* IndexGenerator.java
*
* Copyright (c) 2006, JULIE Lab.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Common Public License v1.0
*
* Author: tomanek
*
* Current version: 1.5.1
* Since version: 1.0
*
* Creation date: Nov 30, 2006
*
* This class generates the Lucene index from the modified biothesaurus as
* provided by EBI in BootSTREP.
*
* This version of the index generator expects a consolidated biothesaurus file
* which consists of exactly these tab-separated columns:
* - col1: synonym (normalized)
* - col2: gene or gene family ID
* - col3: synonym priority
*
* IMPORTANT NOTES:
* - no normalization is done here, so normalize the biothesaurus file yourself beforehand
* - for better performance, make the entries in the biothesaurus file unique!
**/
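// Illustrative dictionary lines matching the three-column format described above
// (tab-separated; the names, ID, and priorities are made up for demonstration only):
//   interleukin 2<TAB>3558<TAB>1
//   il 2<TAB>3558<TAB>2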
package de.julielab.genemapper.resources;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import de.julielab.gene.candidateretrieval.SynonymIndexFieldNames;
import de.julielab.geneexpbase.CandidateFilter;
import de.julielab.geneexpbase.TermNormalizer;
import de.julielab.geneexpbase.genemodel.GeneMention;
import de.julielab.java.utilities.FileUtilities;
import de.julielab.java.utilities.ProgressBar;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.lang.ref.SoftReference;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.zip.GZIPInputStream;
public class GeneRecordIndexGenerator {
private static final Logger log = LoggerFactory.getLogger(GeneRecordIndexGenerator.class);
/**
 * The synonym index is filtered for unspecifieds and others. This field
 * determines whether filtered items should be omitted completely from the index
 * or whether they should just be flagged as filtered but still be included in
 * the index. The latter leads to a larger index, of course. Used for
 * experiments, not required as of January 30, 2018.
 */
private static final boolean OMIT_FILTERED = true;
private final Directory indexDirectory;
/**
* A file containing gene or protein names / synonyms and their respective NCBI
* Gene or UniProt ID. No term normalization is expected for this dictionary.
*/
private final File dictFile;
private final Map<String, File> extendedInformationFields;
Map<String, String> id2tax;
Set<SoftReference<Document>> documents = new HashSet<>();
/**
* @param dictFile A file containing gene or protein names / synonyms and their
* respective NCBI Gene or UniProt ID. No term normalization is
* expected for this dictionary.
* @param extendedInformationFields Maps index field names to the files containing the respective extended gene information.
* @param indexFile The directory where the name / synonym index will be written to.
* @throws FileNotFoundException
* @throws IOException
*/
public GeneRecordIndexGenerator(File dictFile, Map<String, File> extendedInformationFields, File indexFile) throws FileNotFoundException, IOException {
this.extendedInformationFields = extendedInformationFields;
log.info("Building gene records index from dictionary {}", dictFile);
this.dictFile = dictFile;
indexDirectory = FSDirectory.open(indexFile.toPath());
}
/**
* To execute the GeneRecordIndexGenerator start it with the following command-line
* arguments:
* arg0: name of the dictionary file within the resources directory
* arg1: path to the resources directory
* arg2: path to the index output directory
*
* @param args
*/
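// Example invocation (hypothetical paths and dictionary file name):
//   java -cp gene-mapper-resources.jar de.julielab.genemapper.resources.GeneRecordIndexGenerator \
//       gene.dict.norm.eg /data/genemapper/resources /data/genemapper/indices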
public static void main(String[] args) {
long s1 = System.currentTimeMillis();
if (args.length != 3) {
System.err.println(
"Usage: GeneRecordIndexGenerator ");
System.exit(1);
}
String dictFile = args[0];
String resPath = args[1];
File resDir = new File(resPath);
if (!resDir.isDirectory()) {
System.err.println("Could not find resources directory");
System.exit(1);
}
if (!resPath.endsWith(File.separator)) {
resPath = resPath + File.separator;
}
String indexPath = args[2];
File geneIndexDir = dictFile.equals("gene.dict.uniqueprioritynames.sortedbyid.eg") ? new File(indexPath, "geneNamesRecordsIndexOriginalNames") : new File(indexPath, "geneRecordsIndex");
File proteinIndexDir = dictFile.equals("gene.dict.uniqueprioritynames.sortedbyid.eg") ? new File(indexPath, "ProteinRecordsIndexOriginalNames") : new File(indexPath, "proteinRecordsIndex");
if (geneIndexDir.exists())
FileUtils.deleteQuietly(geneIndexDir);
if (proteinIndexDir.exists())
FileUtils.deleteQuietly(proteinIndexDir);
File upDictFile = new File(resPath + "gene.dict.up");
checkFile(upDictFile);
File egDictFile = new File(resPath + dictFile);
checkFile(egDictFile);
File eg2chromosome = new File(resPath + "eg2chromosome");
File eg2description = new File(resPath + "eg2description");
File eg2generif = new File(resPath + "eg2generif");
File eg2go = new File(resPath + "eg2go");
File goDesc = new File(resPath + "go_all");
File eg2interaction = new File(resPath + "eg2interaction");
File eg2maplocation = new File(resPath + "eg2maplocation");
File eg2summary = new File(resPath + "eg2summary");
File eg2ecnumber = new File(resPath + "eg2ecnumber-genexmldownloader.gz");
Map<String, File> extendedInformationFields = new LinkedHashMap<>();
extendedInformationFields.put(SynonymIndexFieldNames.CHROMOSOME, eg2chromosome);
extendedInformationFields.put(SynonymIndexFieldNames.DESCRIPTION, eg2description);
extendedInformationFields.put(SynonymIndexFieldNames.GENERIF, eg2generif);
extendedInformationFields.put("go", eg2go);
extendedInformationFields.put(SynonymIndexFieldNames.GODESC, goDesc);
extendedInformationFields.put(SynonymIndexFieldNames.INTERACTION, eg2interaction);
extendedInformationFields.put(SynonymIndexFieldNames.MAPLOCATION, eg2maplocation);
extendedInformationFields.put(SynonymIndexFieldNames.SUMMARY, eg2summary);
extendedInformationFields.put(SynonymIndexFieldNames.ECNUMBER, eg2ecnumber);
File upTaxMap = new File(resPath + "up2eg2tax.map");
checkFile(upTaxMap);
File geneInfo = new File(resPath + "gene_info_organism_filtered.gz");
File egTaxMap = geneInfo;
GeneRecordIndexGenerator indexGenerator;
try {
// indexGenerator = new SynonymIndexGenerator(upDictFile, proteinIndexDir);
// indexGenerator.readUpTaxMap(upTaxMap);
// indexGenerator.createIndex();
indexGenerator = new GeneRecordIndexGenerator(egDictFile, extendedInformationFields, geneIndexDir);
indexGenerator.readEgTaxMap(egTaxMap);
indexGenerator.createIndex();
} catch (IOException e) {
e.printStackTrace();
}
long s2 = System.currentTimeMillis();
System.out.println("Index created successfully! (" + (s2 - s1) / 1000 + " sec)");
}
private static void checkFile(File file) {
if (!file.isFile())
throw new IllegalArgumentException("File \"" + file.getAbsolutePath() + "\" could not be found.");
}
/**
* Creates the index, i.e. reads the biothesaurus file (which is expected to
* contain normalized synonyms!) and writes its entries to the index.
*
* @throws IOException
*/
public void createIndex() throws IOException {
CandidateFilter cf = new CandidateFilter();
TermNormalizer normalizer = new TermNormalizer();
Map<String, Multimap<String, String>> id2infotype2info = readExtendedInformationFiles();
FieldType notStoredTextFieldType = new FieldType(TextField.TYPE_NOT_STORED);
notStoredTextFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
FieldType storedTextFieldType = new FieldType(TextField.TYPE_STORED);
storedTextFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
FieldType stringFieldTypeDocsAndFreqs = new FieldType(StringField.TYPE_STORED);
stringFieldTypeDocsAndFreqs.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
WhitespaceAnalyzer wsAnalyzer = new WhitespaceAnalyzer();
IndexWriterConfig iwc = new IndexWriterConfig(wsAnalyzer);
iwc.setOpenMode(OpenMode.CREATE);
// VERY IMPORTANT: The dictionary file must be sorted by id. This is because we want to group the
// dictionary entries by id, but we don't want to read the whole dictionary and sort it in-memory
// because this may well exhaust the memory for the full all-species dictionary.
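// Illustrative example with made-up names and IDs: the consecutive, ID-sorted lines
//   il 2<TAB>3558<TAB>2
//   interleukin 2<TAB>3558<TAB>1
//   il 21<TAB>50615<TAB>2
// produce one index document for ID 3558 with the synonym/priority pairs
// {il 2=2, interleukin 2=1}, followed by a new document for ID 50615.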
final ExecutorService executorService = Executors.newFixedThreadPool(20);
AtomicInteger numDocumentsIndexed = new AtomicInteger();
try (IndexWriter iw = new IndexWriter(indexDirectory, iwc)) {
log.info("Counting number of lines of the dictionary file {}", dictFile);
long numLines;
try (BufferedReader normDictReader = new BufferedReader(new FileReader(dictFile))) {
numLines = normDictReader.lines().count();
}
log.info("Generating index now for {} synonyms.", numLines);
// now loop through dictionary and add entries to the index
ProgressBar progressBar = new ProgressBar(numLines, 80);
try (BufferedReader dictReader = new BufferedReader(new FileReader(dictFile))) {
Map<String, Integer> syn2prio = new HashMap<>();
String lastId = null;
boolean filtered = false;
String line;
int lineNum = 1;
while ((line = dictReader.readLine()) != null) {
String[] values = line.split("\t");
// check whether format is OK
if (values.length != 3) {
System.err.println("ERR: normalized dictionary not in expected format. \ncritical line: " + line);
// System.exit(-1);
continue;
}
// now get the field values
String normalizedName = values[0];
String id = values[1];
Integer priority = Integer.parseInt(values[2]);
boolean isFamilyEntry = id.contains("GENO:");
if (cf != null && !OMIT_FILTERED && priority != -1 && !isFamilyEntry) {
filtered = DictionaryFamilyDomainFilter.isFiltered(id, cf, normalizedName);
}
if (filtered && OMIT_FILTERED)
continue;
if (lastId != null && !lastId.equals(id)) {
boolean finalFiltered = filtered;
log.trace("Indexing gene record with ID {} and synonym/priority pairs {}", lastId, syn2prio);
String finalLastId = lastId;
HashMap<String, Integer> syn2prio4id = new HashMap<>(syn2prio);
executorService.submit(() -> {
try {
indexGeneRecord(finalLastId, syn2prio4id, id2infotype2info, finalFiltered, normalizer, iw, notStoredTextFieldType, storedTextFieldType, stringFieldTypeDocsAndFreqs, numDocumentsIndexed);
} catch (IOException e) {
log.error("Could not create index document for gene id {}", finalLastId, e);
}
});
syn2prio.clear();
}
syn2prio.put(normalizedName, priority);
lastId = id;
//if (lineNum % 100000 == 0)
// log.info("Now processing line {}", lineNum);
// No need to try and show the progress bar when a lot of logging messages appear all the time
if (lineNum % 1000 == 0 && !log.isDebugEnabled()) {
progressBar.incrementDone(lineNum - progressBar.getDone(), true);
}
// if (lineNum % 1000000 == 0) {
// log.debug("Committing IndexWriter at line number {}.", lineNum);
// if (log.isDebugEnabled())
// synchronized (documents) {
// log.debug("There are {} documents reachable", documents.stream().map(SoftReference::get).filter(Objects::nonNull).count());
// }
// iw.commit();
// }
++lineNum;
}
// Index the last entry
boolean finalFiltered = filtered;
String finalLastId = lastId;
executorService.submit(() -> {
try {
indexGeneRecord(finalLastId, new HashMap<>(syn2prio), id2infotype2info, finalFiltered, normalizer, iw, notStoredTextFieldType, storedTextFieldType, stringFieldTypeDocsAndFreqs, numDocumentsIndexed);
} catch (IOException e) {
log.error("Could not create index document for gene id {}", finalLastId, e);
} catch (Throwable t) {
log.error("Error", t);
}
});
log.info("Dictionary file {} has been consumed, all indexing jobs have been sent.", dictFile);
} finally {
try {
log.info("Shutting down executor.");
log.info("Waiting for running threads to terminate.");
executorService.shutdown();
// The 100 days basically mean "wait until we are finished"
executorService.awaitTermination(100, TimeUnit.DAYS);
} catch (InterruptedException e) {
log.warn("Waiting for running threads to finish has been interrupted. Shutting down the executor service now.");
executorService.shutdownNow();
}
log.info("ExecutorService has been shut down.");
}
log.info("Committing {} documents to the index.", numDocumentsIndexed.get());
iw.commit();
iw.forceMerge(5);
} catch (IOException e) {
e.printStackTrace();
}
}
private Map<String, Multimap<String, String>> readExtendedInformationFiles() {
// chromosome - single field
// description - single field
// generif - single field
// go - value is GO ID; single field
// godesc - key is GO ID; multifield, pipe as separator
// interaction - single field
// maplocation - single field
// summary - single field
// The map must be ordered such that the "go" file comes before the "godesc" file so that goid2eg is already populated when the GO descriptions are resolved
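// Illustrative (assumed) line formats, derived from how the splits are used below:
//   eg2go:   <geneId>\t<goId>           e.g. 3558<TAB>GO:0005125 (made-up pairing)
//   go_all:  <goId>\t<desc1>|<desc2>    e.g. GO:0005125<TAB>cytokine activity
//   others:  <geneId>\t<value>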
Map<String, String> goid2eg = new HashMap<>();
Map<String, Multimap<String, String>> infoMap = new HashMap<>();
log.info("Loading extended information files");
for (String informationType : extendedInformationFields.keySet()) {
File f = extendedInformationFields.get(informationType);
log.info("Reading {}", f);
try (BufferedReader br = FileUtilities.getReaderFromFile(f)) {
Stream<String[]> linesSplits = br.lines().map(s -> s.split("\t"));
if (informationType.equals("godesc")) {
linesSplits = linesSplits.map(s -> {
s[0] = goid2eg.containsKey(s[0]) ? goid2eg.get(s[0].intern()).intern() : null;
return s;
});
}
if (informationType.equals("go")) {
linesSplits.forEach(s -> goid2eg.put(s[1].intern(), s[0].intern()));
} else {
linesSplits.filter(Objects::nonNull).filter(s -> s[0] != null && s[1] != null).filter(s -> !s[0].isBlank() && !s[1].isBlank()).forEach(s ->
infoMap.compute(s[0].intern(), (k, v) -> v != null ? v : HashMultimap.create()).put(informationType.intern(), s[1]));
}
} catch (IOException e) {
log.error("Could not read file {}. The respective extended information will not be added to the index", f, e);
}
}
return infoMap;
}
public void indexGeneRecord(String id, Map<String, Integer> syn2prio, Map<String, Multimap<String, String>> id2infotype2info, boolean filtered, TermNormalizer normalizer, IndexWriter iw, FieldType notStoredTextFieldType, FieldType storedTextFieldType, FieldType stringFieldTypeDocsAndFreqs, AtomicInteger numDocumentsIndexed) throws IOException {
try {
String tax = "";
if (id2tax.get(id) != null) {
tax = id2tax.get(id);
}
// make fields
List<Field> fields = new ArrayList<>();
fields.add(new StringField(SynonymIndexFieldNames.ENTITY_TYPE, id.contains("GENO:") ? GeneMention.SpecificType.FAMILYNAME.name() : GeneMention.SpecificType.GENE.name(), Store.YES));
Field idField = new StringField(SynonymIndexFieldNames.ID_FIELD, id, Store.YES);
for (String normalizedName : syn2prio.keySet()) {
Integer priority = syn2prio.get(normalizedName);
String fieldname;
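// Priority codes map to index fields as implemented below: -1=SYMBOL,
// 0=SYMBOL_FROM_NOMCENCLATURE, 1=FULL_NAMES, 2=SYNONYMS, 3=OTHER_DESIGNATIONS,
// 4=PROTEIN_NAMES, 5=UNIPROT_NAMES, 6=XREFS, 7=BIO_THESAURUS; anything else is rejected.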
if (priority == -1)
fieldname = SynonymIndexFieldNames.SYMBOL;
else if (priority == 0)
fieldname = SynonymIndexFieldNames.SYMBOL_FROM_NOMCENCLATURE;
else if (priority == 1)
fieldname = SynonymIndexFieldNames.FULL_NAMES;
else if (priority == 2)
fieldname = SynonymIndexFieldNames.SYNONYMS;
else if (priority == 3)
fieldname = SynonymIndexFieldNames.OTHER_DESIGNATIONS;
else if (priority == 4)
// from the XML extracted from the NCBI Gene ASN.1 file
fieldname = SynonymIndexFieldNames.PROTEIN_NAMES;
else if (priority == 5)
fieldname = SynonymIndexFieldNames.UNIPROT_NAMES;
else if (priority == 6)
fieldname = SynonymIndexFieldNames.XREFS;
else if (priority == 7)
fieldname = SynonymIndexFieldNames.BIO_THESAURUS;
else
throw new IllegalArgumentException("Unsupported synonym priority: " + priority);
log.trace("Now adding field {} for synonym {} for ID {}", fieldname, normalizedName, id);
fields.add(new Field(fieldname, normalizedName, storedTextFieldType));
// fields.add(new Field(SynonymIndexFieldNames.LOOKUP_SYN_FIELD, normalizedName, notStoredTextFieldType));
// For the experiments on disambiguation: This allows us to try and match the exact
// name.
fields.add(new Field(fieldname + "_exact", normalizedName, stringFieldTypeDocsAndFreqs));
// fields.add(new Field(SynonymIndexFieldNames.LOOKUP_SYN_FIELD_EXACT, normalizedName, notStoredTextFieldType));
}
Field taxField = new StringField(SynonymIndexFieldNames.TAX_ID_FIELD, tax, Store.YES);
if (!OMIT_FILTERED) {
IntPoint filteredField = new IntPoint(SynonymIndexFieldNames.FILTERED, filtered ? 1 : 0);
StoredField storedFilteredField = new StoredField(SynonymIndexFieldNames.FILTERED,
filtered ? 1 : 0);
fields.add(filteredField);
fields.add(storedFilteredField);
}
// chromosome - single field
// description - single field
// generif - single field
// godesc - multifield, pipe as separator
// interaction - single field
// maplocation - single field
// summary - single field
Multimap<String, String> infotype2info = id2infotype2info.get(id);
if (infotype2info != null) {
for (String infotype : infotype2info.keySet()) {
for (String value : infotype2info.get(infotype)) {
if (!infotype.equals(SynonymIndexFieldNames.GODESC)) {
fields.add(new Field(infotype, normalizer.normalize(value), storedTextFieldType));
} else {
String[] values = value.split("\\|");
for (String v : values)
fields.add(new Field(infotype, normalizer.normalize(v), storedTextFieldType));
}
}
}
}
// Each ID occurs only once in the dictionary, so we can remove its associated data now.
id2infotype2info.remove(id);
fields.add(idField);
fields.add(taxField);
// make document and add to index
Document d = new Document();
for (Field f : fields)
d.add(f);
if (!fields.isEmpty()) {
iw.addDocument(d);
numDocumentsIndexed.incrementAndGet();
synchronized (documents) {
documents.add(new SoftReference<>(d));
}
}
} catch (Throwable t) {
log.error("Error occurred", t);
throw t;
}
}
private void readUpTaxMap(File taxMap) throws IOException {
log.info("Reading up2eg2tax.map ...");
id2tax = new HashMap<>();
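// Expected format (three tab-separated columns): <upId>\t<egId>\t<taxId>.
// Only the first and third columns are used; the middle column being the
// Entrez Gene ID is an assumption based on the file name up2eg2tax.map.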
try (BufferedReader reader = new BufferedReader(new FileReader(taxMap))) {
String line;
while ((line = reader.readLine()) != null) {
String[] entry = line.split("\t");
if (entry.length != 3) {
System.err.println("ERR: up2eg2tax.map not in expected format. \ncritical line: " + line);
System.exit(-1);
}
String id = entry[0].trim().intern();
String taxId = entry[2].trim().intern();
id2tax.put(id, taxId);
}
}
}
private void readEgTaxMap(File geneInfo) throws IOException {
log.info("Reading file gene ID to taxonomy ID map from {}", geneInfo);
try (BufferedReader br = new BufferedReader(
new InputStreamReader(new GZIPInputStream(new FileInputStream(geneInfo))))) {
id2tax = br.lines().collect(
Collectors.toMap(l -> l.split("\\t", 3)[1].intern(), l -> l.split("\\t", 3)[0].intern()));
}
}
}