// This project assembles code and files required to build the dictionaries and indexes used by the
// JCoRe Gene Mapper.
package de.julielab.genemapper.resources;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import de.julielab.geneexpbase.TermNormalizer;
import de.julielab.genemapper.index.ContextIndexFieldNames;
import de.julielab.java.utilities.FileUtilities;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Path;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Function;
import java.util.function.Predicate;
/**
 * This class creates a full text index of information about each gene which serves for disambiguation.
 * The indexed items are grouped by gene ID (in contrast to {@link ContextItemIndexGenerator}) and contain
 * the GeneRIF, interaction and summary fields of the genes in NCBI Gene. This information can be used,
 * for example, to check whether a specific synonym has actually been used in association with a
 * specific gene ID entry.
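 * <p>
 * A minimal sketch of how the resulting index might be queried; the gene ID "2475" and the index
 * path are hypothetical values used purely for illustration:
 * <pre>{@code
 * try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Path.of("geneContextItemsIndexNormalized")))) {
 *     IndexSearcher searcher = new IndexSearcher(reader);
 *     TopDocs hits = searcher.search(new TermQuery(new Term(ContextIndexFieldNames.LOOKUP_ID_FIELD, "2475")), 10);
 *     for (ScoreDoc sd : hits.scoreDocs) {
 *         Document doc = searcher.doc(sd.doc);
 *         System.out.println(doc.get(ContextIndexFieldNames.FIELD_SUMMARY));
 *     }
 * }
 * }</pre>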
 * @deprecated This index type is superseded by {@link ContextItemIndexGenerator}
*/
public class ContextItemRecordsIndexGenerator {
private final static Logger log = LoggerFactory.getLogger(ContextItemRecordsIndexGenerator.class);
private final Directory indexDirectory;
private final boolean normalizeContexts;
private final File idFile;
private final String resourcesDir;
/**
 * Constructor which creates the index in the specified directory on disk.
 */
public ContextItemRecordsIndexGenerator(File idFile,
File indexFile, String resourcesDir, boolean normalizeContexts) {
this.idFile = idFile;
this.resourcesDir = resourcesDir;
indexDirectory = createIndexDirectory(indexFile);
this.normalizeContexts = normalizeContexts;
}
/**
 * To execute the ContextItemRecordsIndexGenerator, start it with the following command-line arguments:
 * arg0: path to the resources directory
 * arg1: path to the context indices directory
 * arg2: whether to normalize the context items (true/false)
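 * <p>
 * An example invocation (the paths are hypothetical):
 * <pre>
 * java de.julielab.genemapper.resources.ContextItemRecordsIndexGenerator /data/gene-mapper/resources /data/gene-mapper/indices true
 * </pre>
 *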
 * @param args the command-line arguments as described above
*/
public static void main(String[] args) {
long s1 = System.currentTimeMillis();
if (args.length != 3) {
System.err.println("Usage: SynonymDisambiguationIndexGenerator ");
System.exit(-1);
}
String indexBaseDir = args[1];
if (!indexBaseDir.endsWith(File.separator)) {
indexBaseDir = indexBaseDir + File.separator;
}
boolean normalizeContexts = Boolean.parseBoolean(args[2]);
String nameSuffix = normalizeContexts ? "Normalized" : "Original";
File geneIndexDir = new File(indexBaseDir + "geneContextItemsIndex"+nameSuffix);
File proteinIndexDir = new File(indexBaseDir + "proteinContextItemsIndex"+nameSuffix);
if (geneIndexDir.exists())
FileUtils.deleteQuietly(geneIndexDir);
if (proteinIndexDir.exists())
FileUtils.deleteQuietly(proteinIndexDir);
String resPath = args[0];
File resDir = new File(resPath);
if (!resDir.isDirectory()) {
System.err.println("Could not find resources directory");
System.exit(-1);
}
if (!resPath.endsWith(File.separator)) {
resPath = resPath + File.separator;
}
File upFile = new File(resPath + "up.ids");
if (!upFile.isFile()) {
System.err.println("Could not find file uniprot.ids");
System.exit(-1);
}
File egFile = new File(resPath + "eg.ids");
if (!egFile.isFile()) {
System.err.println("Could not find file eg.ids");
System.exit(-1);
}
ContextItemRecordsIndexGenerator indexGenerator;
try {
indexGenerator = new ContextItemRecordsIndexGenerator(upFile, proteinIndexDir, resPath, normalizeContexts);
indexGenerator.createIndex("protein");
indexGenerator = new ContextItemRecordsIndexGenerator(egFile, geneIndexDir, resPath, normalizeContexts);
indexGenerator.createIndex("gene");
} catch (IOException e) {
e.printStackTrace();
}
long s2 = System.currentTimeMillis();
System.out.println("Indices created successfully! (" + (s2 - s1) / 1000 + " sec)");
}
/**
 * Creates the index: reads the context information files (GeneRIFs, interactions, summaries,
 * descriptions, chromosomes, map locations and synonyms) for the given ID type and writes one
 * document per ID in the ID file to the index.
 *
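 * <p>
 * The context files are expected to contain one tab-separated entry per line, with '#' starting
 * comment lines. The IDs and texts below are hypothetical and only illustrate the layout parsed
 * by {@link #readContextInformation(File, Map, String)}:
 * <pre>
 * # eg2summary: geneId &lt;TAB&gt; summary text
 * 2475    mTOR regulates cell growth and proliferation
 * # eg2synonyms: geneId &lt;TAB&gt; synonyms joined by §sep§
 * 2475    FRAP§sep§RAFT1§sep§FRAP1
 * </pre>
 *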
 * @throws IOException if the context files cannot be read or the index cannot be written
*/
public void createIndex(String idType) throws IOException {
String typePrefix = idType.equals("gene") ? "eg" : "up";
Map<String, Multimap<String, String>> contextMaps = new HashMap<>();
readContextInformation(Path.of(resourcesDir, typePrefix + "2generif").toFile(), contextMaps, ContextIndexFieldNames.FIELD_GENERIF);
readContextInformation(Path.of(resourcesDir, typePrefix + "2interaction").toFile(), contextMaps, ContextIndexFieldNames.FIELD_INTERACTION);
readContextInformation(Path.of(resourcesDir, typePrefix + "2summary").toFile(), contextMaps, ContextIndexFieldNames.FIELD_SUMMARY);
readContextInformation(Path.of(resourcesDir, typePrefix + "2description").toFile(), contextMaps, ContextIndexFieldNames.FIELD_DESCRIPTION);
readContextInformation(Path.of(resourcesDir, typePrefix + "2chromosome").toFile(), contextMaps, ContextIndexFieldNames.FIELD_CHROMOSOME);
readContextInformation(Path.of(resourcesDir, typePrefix + "2maplocation").toFile(), contextMaps, ContextIndexFieldNames.FIELD_MAP_LOCATION);
readContextInformation(Path.of(resourcesDir, typePrefix + "2synonyms").toFile(), contextMaps, ContextIndexFieldNames.FIELD_SYNONYMS_BOW);
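// Assumption (not stated in the original source): a WhitespaceAnalyzer suffices here because the
// context strings are either normalized by the TermNormalizer below or meant to be indexed
// verbatim, so no tokenization beyond whitespace splitting is applied.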
IndexWriterConfig iwc = new IndexWriterConfig(new WhitespaceAnalyzer());
iwc.setOpenMode(OpenMode.CREATE);
IndexWriter contextIndexWriter = new IndexWriter(indexDirectory, iwc);
TermNormalizer normalizer = new TermNormalizer();
AtomicInteger counter = new AtomicInteger();
Function<String, String> contextTransformation = normalizeContexts ? normalizer::normalize : Function.identity();
log.info("Writing index {}", indexDirectory);
try (BufferedReader idReader = new BufferedReader(new FileReader(idFile))) {
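// Note: Lucene's IndexWriter is thread-safe, so addDocument may be called concurrently
// from the parallel stream below.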
idReader.lines().parallel().forEach(id -> {
try {
List<Field> fields = new ArrayList<>();
for (String contextField : contextMaps.keySet()) {
final Collection<String> contextItems = contextMaps.get(contextField).get(id);
// Multimap.get() returns an empty collection for unknown keys, never null; checking
// isEmpty() also avoids adding the lookup ID field when there are no context items.
if (!contextItems.isEmpty()) {
Field lookupIdField = new StringField(ContextIndexFieldNames.LOOKUP_ID_FIELD, id, Store.YES);
fields.add(lookupIdField);
for (String contextItem : contextItems) {
String contextString = contextItem;
if (contextField.equals(ContextIndexFieldNames.FIELD_SYNONYMS_BOW)) {
String[] synonyms = contextString.split("§sep§");
// for the bag-of-words we index all the synonym parts just like any other context item
contextString = String.join(" ", synonyms);
// for the exact synonyms, we store the normalized synonyms as keyword strings to be matched exactly
for (String synonym : synonyms) {
if (synonym.isBlank())
continue;
Field exactSynonymsField = new StringField(ContextIndexFieldNames.FIELD_SYNONYMS_EXACT, contextTransformation.apply(synonym), Store.YES);
fields.add(exactSynonymsField);
}
}
Field lookupContextField = new TextField(contextField, contextTransformation.apply(contextString), Store.YES);
fields.add(lookupContextField);
}
}
}
Document d = new Document();
for (Field f : fields)
d.add(f);
contextIndexWriter.addDocument(d);
counter.incrementAndGet();
} catch (IOException e) {
e.printStackTrace();
}
});
contextIndexWriter.close();
} catch (IOException e) {
e.printStackTrace();
}
log.info("Done writing context item index for {} ids.", counter);
}
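/**
 * Reads a tab-separated context file into a multimap from ID to context items and stores it in
 * {@code contextMaps} under {@code fieldname}. Lines starting with '#' and lines with fewer than
 * two fields are skipped.
 *
 * @param contextFile the tab-separated file mapping IDs to context items
 * @param contextMaps the map collecting the per-field multimaps
 * @param fieldname   the index field name under which to store the multimap
 * @throws IOException if the context file could not be read
 */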
private void readContextInformation(File contextFile, Map<String, Multimap<String, String>> contextMaps, String fieldname) throws IOException {
Multimap<String, String> context = HashMultimap.create();
try (final BufferedReader br = FileUtilities.getReaderFromFile(contextFile)) {
br.lines()
        .filter(Predicate.not(line -> line.startsWith("#")))
        .map(line -> line.split("\t"))
        .filter(split -> {
            if (split.length < 2)
                log.warn("Skipping line split {} because there are not enough fields", Arrays.toString(split));
            return split.length > 1;
        })
        // intern the IDs since the same ID recurs across many lines and files
        .forEach(split -> context.put(split[0].intern(), split[1]));
} catch (Exception e) {
log.error("Could not load context file {}", contextFile, e);
throw e;
}
log.info("Reading context file {} with {} entries", contextFile, context.size());
contextMaps.put(fieldname, context);
}
/**
 * Creates the directory object for the Lucene index.
 */
private FSDirectory createIndexDirectory(File indexFile) {
FSDirectory fdir = null;
try {
fdir = FSDirectory.open(indexFile.toPath());
} catch (IOException e) {
e.printStackTrace();
}
return fdir;
}
}