de.julielab.genemapper.resources.ContextItemIndexGenerator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of gene-mapper-resources Show documentation
This project assembles code and files required to build the dictionaries and indexes used by the JCoRe Gene Mapper.
The newest version!
package de.julielab.genemapper.resources;

import de.julielab.geneexpbase.TermNormalizer;
import de.julielab.genemapper.index.ContextIndexFieldNames;
import de.julielab.genemapper.resources.util.UncheckedGeneMapperResourcesException;
import de.julielab.java.utilities.FileUtilities;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Function;

/**
 * This class creates a full text index of information about each gene which serves for disambiguation.
 * The indexed items are the GeneRIF and interaction fields of the genes
 * in NCBI Gene. Those two fields may exist many times per gene ID which makes it necessary to find specific
 * instances to check for occurrences of a given gene name from context, for example.
 */
public class ContextItemIndexGenerator {
    private static final Logger log = LoggerFactory.getLogger(ContextItemIndexGenerator.class);
    /** Lucene directory the index is written to; opened in the constructor. */
    private final Directory indexDirectory;
    /** Whether context items are passed through a {@link TermNormalizer} before being indexed. */
    private final boolean normalizeContexts;
    /** Directory that contains the tab-separated context item files read by {@link #createIndex(String)}. */
    private final String resourcesDir;

    /**
     * Creates a generator that writes its index to the given directory on disk.
     *
     * @param indexFile         the directory the Lucene index is written to
     * @param resourcesDir      the directory containing the context item source files
     * @param normalizeContexts whether context items should be normalized before indexing
     */
    public ContextItemIndexGenerator(File indexFile, String resourcesDir, boolean normalizeContexts) {
        this.resourcesDir = resourcesDir;
        this.indexDirectory = createIndexDirectory(indexFile);
        this.normalizeContexts = normalizeContexts;
    }

    /**
     * To execute the ContextItemIndexGenerator start it with the following command-line arguments:
     * <ul>
     * <li>arg0: path to resources directory</li>
     * <li>arg1: path to context indices directory</li>
     * <li>arg2: <code>true</code> to normalize the context items before indexing, <code>false</code> to index them as they are</li>
     * </ul>
     * The process exits with a non-zero status if the arguments are invalid, a required input is missing
     * or the index creation fails.
     *
     * @param args the command-line arguments as described above
     */
    public static void main(String[] args) {

        long s1 = System.currentTimeMillis();

        if (args.length != 3) {
            System.err.println("Usage: ContextItemIndexGenerator <resources directory> <context indices directory> <normalize contexts: true|false>");
            System.exit(-1);
        }

        // Validate all inputs first so that existing indices are only deleted when a rebuild is actually possible.
        String resPath = args[0];
        File resDir = new File(resPath);
        if (!resDir.isDirectory()) {
            System.err.println("Could not find resources directory");
            System.exit(-1);
        }
        if (!resPath.endsWith(File.separator)) {
            resPath = resPath + File.separator;
        }

        File upFile = new File(resPath + "up.ids");
        if (!upFile.isFile()) {
            System.err.println("Could not find file up.ids");
            System.exit(-1);
        }
        File egFile = new File(resPath + "eg.ids");
        if (!egFile.isFile()) {
            System.err.println("Could not find file eg.ids");
            System.exit(-1);
        }

        String indexBaseDir = args[1];
        if (!indexBaseDir.endsWith(File.separator)) {
            indexBaseDir = indexBaseDir + File.separator;
        }
        boolean normalizeContexts = Boolean.parseBoolean(args[2]);
        String nameSuffix = normalizeContexts ? "Normalized" : "Original";
        File geneIndexDir = new File(indexBaseDir + "geneContextItemsIndex" + nameSuffix);
        File proteinIndexDir = new File(indexBaseDir + "proteinContextItemsIndex" + nameSuffix);

        if (geneIndexDir.exists()) {
            FileUtils.deleteQuietly(geneIndexDir);
        }
        if (proteinIndexDir.exists()) {
            FileUtils.deleteQuietly(proteinIndexDir);
        }

        try {
            new ContextItemIndexGenerator(proteinIndexDir, resPath, normalizeContexts).createIndex("protein");
            new ContextItemIndexGenerator(geneIndexDir, resPath, normalizeContexts).createIndex("gene");
        } catch (IOException e) {
            // Do not fall through to the success message when the index could not be written.
            log.error("Index creation failed", e);
            System.exit(-1);
        }

        long s2 = System.currentTimeMillis();
        System.out.println("Indices created successfully! (" + (s2 - s1) / 1000 + " sec)");
    }

    /**
     * Creates the index: reads the tab-separated context item files for the given ID type from the
     * resources directory and writes one document per line to the index directory. An existing index in
     * that directory is overwritten. The index writer is closed, and thereby committed, before this
     * method returns.
     *
     * @param idType <code>gene</code> to read the <code>eg</code>/<code>entrezgene_id</code> files; any other value
     *               selects the <code>up</code>/<code>uniprot_id</code> files
     * @throws IOException if a source file cannot be read or the index cannot be written
     */
    public void createIndex(String idType) throws IOException {

        boolean isGene = idType.equals("gene");
        String typePrefix = isGene ? "eg" : "up";
        String upSrcTypePrefix = isGene ? "entrezgene_id" : "uniprot_id";
        File geneRifFile = Path.of(resourcesDir, typePrefix + "2generif").toFile();
        File interactionFile = Path.of(resourcesDir, typePrefix + "2interaction").toFile();
        File summaryFile = Path.of(resourcesDir, typePrefix + "2summary").toFile();
        File designationsFile = Path.of(resourcesDir, typePrefix + "2designation").toFile();

        File activityRegulationFile = Path.of(resourcesDir, upSrcTypePrefix + "2activity regulation").toFile();
        File chromosomeFile = Path.of(resourcesDir, upSrcTypePrefix + "2chromosome").toFile();
        File developmentalStageFile = Path.of(resourcesDir, upSrcTypePrefix + "2developmental stage").toFile();
        File diseaseFile = Path.of(resourcesDir, upSrcTypePrefix + "2disease").toFile();
        File domainFile = Path.of(resourcesDir, upSrcTypePrefix + "2domain").toFile();
        File functionFile = Path.of(resourcesDir, upSrcTypePrefix + "2function").toFile();
        File inductionFile = Path.of(resourcesDir, upSrcTypePrefix + "2induction").toFile();
        File keywordFile = Path.of(resourcesDir, upSrcTypePrefix + "2keyword").toFile();
        File miscellaneousFile = Path.of(resourcesDir, upSrcTypePrefix + "2miscellaneous").toFile();
        File pathwayFile = Path.of(resourcesDir, upSrcTypePrefix + "2pathway").toFile();
        File similarityFile = Path.of(resourcesDir, upSrcTypePrefix + "2similarity").toFile();
        File subcellularLocationFile = Path.of(resourcesDir, upSrcTypePrefix + "2subcellular location").toFile();
        File subunitFile = Path.of(resourcesDir, upSrcTypePrefix + "2subunit").toFile();
        File tissueFile = Path.of(resourcesDir, upSrcTypePrefix + "2tissue").toFile();
        File tissueSpecificityFile = Path.of(resourcesDir, upSrcTypePrefix + "2tissue specificity").toFile();

        TermNormalizer normalizer = new TermNormalizer();
        Function<String, String> contextTransformation = normalizeContexts ? context -> normalizer.normalize(context) : Function.identity();
        IndexWriterConfig iwc = new IndexWriterConfig(new WhitespaceAnalyzer());
        iwc.setOpenMode(OpenMode.CREATE);
        AtomicInteger counter = new AtomicInteger();

        // try-with-resources: closing the writer commits the added documents and releases the write lock,
        // also when indexing one of the files fails.
        try (IndexWriter contextIndexWriter = new IndexWriter(indexDirectory, iwc)) {

            log.info("Writing index {}", indexDirectory);

            indexContextFile(activityRegulationFile, ContextIndexFieldNames.FIELD_ACTIVITY_REGULATION, contextTransformation, counter, contextIndexWriter);
            indexContextFile(chromosomeFile, ContextIndexFieldNames.FIELD_CHROMOSOME, contextTransformation, counter, contextIndexWriter);
            indexContextFile(developmentalStageFile, ContextIndexFieldNames.FIELD_DEVELOPMENTAL_STAGE, contextTransformation, counter, contextIndexWriter);
            indexContextFile(designationsFile, ContextIndexFieldNames.FIELD_DESIGNATION, contextTransformation, counter, contextIndexWriter);
            indexContextFile(diseaseFile, ContextIndexFieldNames.FIELD_DISEASE, contextTransformation, counter, contextIndexWriter);
            indexContextFile(domainFile, ContextIndexFieldNames.FIELD_DOMAIN, contextTransformation, counter, contextIndexWriter);
            indexContextFile(functionFile, ContextIndexFieldNames.FIELD_FUNCTION, contextTransformation, counter, contextIndexWriter);
            indexContextFile(geneRifFile, ContextIndexFieldNames.FIELD_GENERIF, contextTransformation, counter, contextIndexWriter);
            indexContextFile(inductionFile, ContextIndexFieldNames.FIELD_INDUCTION, contextTransformation, counter, contextIndexWriter);
            indexContextFile(interactionFile, ContextIndexFieldNames.FIELD_INTERACTION, contextTransformation, counter, contextIndexWriter);
            indexContextFile(keywordFile, ContextIndexFieldNames.FIELD_KEYWORD, contextTransformation, counter, contextIndexWriter);
            indexContextFile(miscellaneousFile, ContextIndexFieldNames.FIELD_MISCELLANEOUS, contextTransformation, counter, contextIndexWriter);
            indexContextFile(pathwayFile, ContextIndexFieldNames.FIELD_PATHWAY, contextTransformation, counter, contextIndexWriter);
            indexContextFile(similarityFile, ContextIndexFieldNames.FIELD_SIMILARITY, contextTransformation, counter, contextIndexWriter);
            indexContextFile(subcellularLocationFile, ContextIndexFieldNames.FIELD_SUBCELLULAR_LOCATION, contextTransformation, counter, contextIndexWriter);
            indexContextFile(subunitFile, ContextIndexFieldNames.FIELD_SUBUNIT, contextTransformation, counter, contextIndexWriter);
            indexContextFile(summaryFile, ContextIndexFieldNames.FIELD_SUMMARY, contextTransformation, counter, contextIndexWriter);
            indexContextFile(tissueFile, ContextIndexFieldNames.FIELD_TISSUE, contextTransformation, counter, contextIndexWriter);
            indexContextFile(tissueSpecificityFile, ContextIndexFieldNames.FIELD_TISSUE_SPECIFICITY, contextTransformation, counter, contextIndexWriter);

            contextIndexWriter.forceMerge(4);
        }

        log.info("Done writing context item index for {} ids.", counter);

    }

    /**
     * Indexes one context file. Each line is expected to consist of an ID and a context item separated by a
     * tab character; for every such line one document with the ID, the (possibly transformed) context item
     * and the context type is added to the index. The lines are processed with a parallel stream.
     * <p>
     * Lines with fewer than two columns are logged and skipped. Lines with more than two columns are logged
     * and indexed using their first two columns.
     *
     * @param contextFile           the tab-separated file to read
     * @param contextType           the value stored in the item type field of every document from this file
     * @param contextTransformation applied to each context item before it is indexed
     * @param counter               incremented once for every document added
     * @param contextIndexWriter    the writer the documents are added to
     * @throws IOException if the file cannot be opened or closed
     * @throws UncheckedGeneMapperResourcesException if a document cannot be added to the index
     */
    private void indexContextFile(File contextFile, String contextType, Function<String, String> contextTransformation, AtomicInteger counter, IndexWriter contextIndexWriter) throws IOException {
        try (BufferedReader br = new BufferedReader(FileUtilities.getReaderFromFile(contextFile))) {
            br.lines().parallel().forEach(line -> {
                String[] split = line.split("\t");
                if (split.length != 2) {
                    log.error("Wrong format in file {}. Expected two columns but got {}. Critical line: {}", contextFile, split.length, line);
                    if (split.length < 2) {
                        // There is no context item column to index; accessing it would fail.
                        return;
                    }
                }
                String id = split[0];
                String contextItem = split[1];

                Document d = new Document();
                d.add(new StringField(ContextIndexFieldNames.LOOKUP_ID_FIELD, id, Store.YES));
                d.add(new TextField(ContextIndexFieldNames.FIELD_CONTEXT_ITEM, contextTransformation.apply(contextItem), Store.YES));
                d.add(new StringField(ContextIndexFieldNames.FIELD_ITEM_TYPE, contextType, Store.NO));

                try {
                    contextIndexWriter.addDocument(d);
                } catch (IOException e) {
                    log.error("Could not index document {}", d, e);
                    // Checked exceptions cannot leave the stream lambda, so wrap it.
                    throw new UncheckedGeneMapperResourcesException(e);
                }
                counter.incrementAndGet();
            });
        }
    }


    /**
     * Opens the Lucene directory object for the location where the index is to be put.
     *
     * @param indexFile the directory on disk
     * @return the opened directory
     * @throws UncheckedGeneMapperResourcesException if the directory cannot be opened
     */
    private FSDirectory createIndexDirectory(File indexFile) {
        try {
            return FSDirectory.open(indexFile.toPath());
        } catch (IOException e) {
            log.error("Could not create index directory {}", indexFile, e);
            throw new UncheckedGeneMapperResourcesException(e);
        }
    }

}