All Downloads are FREE. Search and download functionalities are using the official Maven repository.

de.julielab.genemapper.resources.ContextItemIndexGenerator Maven / Gradle / Ivy

Go to download

This project assembles code and files required to build the dictionaries and indexes used by the JCoRe Gene Mapper.

The newest version!
package de.julielab.genemapper.resources;

import de.julielab.geneexpbase.TermNormalizer;
import de.julielab.genemapper.index.ContextIndexFieldNames;
import de.julielab.genemapper.resources.util.UncheckedGeneMapperResourcesException;
import de.julielab.java.utilities.FileUtilities;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Function;

/**
 * 

This class creates a full text index of information about each gene which serves for disambiguation.

*

The indexed items are the GeneRIF and interaction fields of the genes * in NCBI Gene. Those two fields may exist many times per gene ID which makes it necessary to find specific * instances to check for occurrences of a given gene name from context, for example.

*/ public class ContextItemIndexGenerator { private final static Logger log = LoggerFactory.getLogger(ContextItemIndexGenerator.class); private final Directory indexDirectory; private final boolean normalizeContexts; private final String resourcesDir; /** * constructor which creates index in the specified directory on the disk */ public ContextItemIndexGenerator(File indexFile, String resourcesDir, boolean normalizeContexts) { this.resourcesDir = resourcesDir; indexDirectory = createIndexDirectory(indexFile); this.normalizeContexts = normalizeContexts; } /** * To execute the ContextIndexGenerator start it with the following command-line arguments:
* arg0: path to resources directory * arg1: path to context indices directory * * @param args */ public static void main(String[] args) { long s1 = System.currentTimeMillis(); if (args.length != 3) { System.err.println("Usage: SynonymDisambiguationIndexGenerator "); System.exit(-1); } String indexBaseDir = args[1]; if (!indexBaseDir.endsWith(File.separator)) { indexBaseDir = indexBaseDir + File.separator; } boolean normalizeContexts = Boolean.parseBoolean(args[2]); String nameSuffix = normalizeContexts ? "Normalized" : "Original"; File geneIndexDir = new File(indexBaseDir + "geneContextItemsIndex" + nameSuffix); File proteinIndexDir = new File(indexBaseDir + "proteinContextItemsIndex" + nameSuffix); if (geneIndexDir.exists()) FileUtils.deleteQuietly(geneIndexDir); if (proteinIndexDir.exists()) FileUtils.deleteQuietly(proteinIndexDir); String resPath = args[0]; File resDir = new File(resPath); if (!resDir.isDirectory()) { System.err.println("Could not find resources directory"); System.exit(-1); } if (!resPath.endsWith(File.separator)) { resPath = resPath + File.separator; } File upFile = new File(resPath + "up.ids"); if (!upFile.isFile()) { System.err.println("Could not find file uniprot.ids"); System.exit(-1); } File egFile = new File(resPath + "eg.ids"); if (!egFile.isFile()) { System.err.println("Could not find file eg.ids"); System.exit(-1); } ContextItemIndexGenerator indexGenerator; try { indexGenerator = new ContextItemIndexGenerator(proteinIndexDir, resPath, normalizeContexts); indexGenerator.createIndex("protein"); indexGenerator = new ContextItemIndexGenerator(geneIndexDir, resPath, normalizeContexts); indexGenerator.createIndex("gene"); } catch (IOException e) { e.printStackTrace(); } long s2 = System.currentTimeMillis(); System.out.println("Indices created successfully! (" + (s2 - s1) / 1000 + " sec)"); } /** * create the index, i.e. read from the biothesaurus file (which * is expected to have normalized synonyms!) and then write it to the index. * * @throws IOException */ public void createIndex(String idType) throws IOException { String typePrefix = idType.equals("gene") ? "eg" : "up"; String upSrcTypePrefix = idType.equals("gene") ? "entrezgene_id" : "uniprot_id"; File geneRifFile = Path.of(resourcesDir, typePrefix + "2generif").toFile(); File interactionFile = Path.of(resourcesDir, typePrefix + "2interaction").toFile(); File summaryFile = Path.of(resourcesDir, typePrefix + "2summary").toFile(); File designationsFile = Path.of(resourcesDir, typePrefix + "2designation").toFile(); File activityRegulationFile = Path.of(resourcesDir, upSrcTypePrefix + "2activity regulation").toFile(); File chromosomeFile = Path.of(resourcesDir, upSrcTypePrefix + "2chromosome").toFile(); File developmentalStageFile = Path.of(resourcesDir, upSrcTypePrefix + "2developmental stage").toFile(); File diseaseFile = Path.of(resourcesDir, upSrcTypePrefix + "2disease").toFile(); File domainFile = Path.of(resourcesDir, upSrcTypePrefix + "2domain").toFile(); File functionFile = Path.of(resourcesDir, upSrcTypePrefix + "2function").toFile(); File inductionFile = Path.of(resourcesDir, upSrcTypePrefix + "2induction").toFile(); File keywordFile = Path.of(resourcesDir, upSrcTypePrefix + "2keyword").toFile(); File miscellaneousFile = Path.of(resourcesDir, upSrcTypePrefix + "2miscellaneous").toFile(); File pathwayFile = Path.of(resourcesDir, upSrcTypePrefix + "2pathway").toFile(); File similarityFile = Path.of(resourcesDir, upSrcTypePrefix + "2similarity").toFile(); File subcellularLocationFile = Path.of(resourcesDir, upSrcTypePrefix + "2subcellular location").toFile(); File subunitFile = Path.of(resourcesDir, upSrcTypePrefix + "2subunit").toFile(); File tissueFile = Path.of(resourcesDir, upSrcTypePrefix + "2tissue").toFile(); File tissueSpecificityFile = Path.of(resourcesDir, upSrcTypePrefix + "2tissue specificity").toFile(); TermNormalizer normalizer = new TermNormalizer(); Function contextTransformation = normalizeContexts ? context -> normalizer.normalize(context) : Function.identity(); IndexWriterConfig iwc = new IndexWriterConfig(new WhitespaceAnalyzer()); iwc.setOpenMode(OpenMode.CREATE); AtomicInteger counter = new AtomicInteger(); IndexWriter contextIndexWriter = new IndexWriter(indexDirectory, iwc); log.info("Writing index {}", indexDirectory); indexContextFile(activityRegulationFile, ContextIndexFieldNames.FIELD_ACTIVITY_REGULATION, contextTransformation, counter, contextIndexWriter); indexContextFile(chromosomeFile, ContextIndexFieldNames.FIELD_CHROMOSOME, contextTransformation, counter, contextIndexWriter); indexContextFile(developmentalStageFile, ContextIndexFieldNames.FIELD_DEVELOPMENTAL_STAGE, contextTransformation, counter, contextIndexWriter); indexContextFile(designationsFile, ContextIndexFieldNames.FIELD_DESIGNATION, contextTransformation, counter, contextIndexWriter); indexContextFile(diseaseFile, ContextIndexFieldNames.FIELD_DISEASE, contextTransformation, counter, contextIndexWriter); indexContextFile(domainFile, ContextIndexFieldNames.FIELD_DOMAIN, contextTransformation, counter, contextIndexWriter); indexContextFile(functionFile, ContextIndexFieldNames.FIELD_FUNCTION, contextTransformation, counter, contextIndexWriter); indexContextFile(geneRifFile, ContextIndexFieldNames.FIELD_GENERIF, contextTransformation, counter, contextIndexWriter); indexContextFile(inductionFile, ContextIndexFieldNames.FIELD_INDUCTION, contextTransformation, counter, contextIndexWriter); indexContextFile(interactionFile, ContextIndexFieldNames.FIELD_INTERACTION, contextTransformation, counter, contextIndexWriter); indexContextFile(keywordFile, ContextIndexFieldNames.FIELD_KEYWORD, contextTransformation, counter, contextIndexWriter); indexContextFile(miscellaneousFile, ContextIndexFieldNames.FIELD_MISCELLANEOUS, contextTransformation, counter, contextIndexWriter); indexContextFile(pathwayFile, ContextIndexFieldNames.FIELD_PATHWAY, contextTransformation, counter, contextIndexWriter); indexContextFile(similarityFile, ContextIndexFieldNames.FIELD_SIMILARITY, contextTransformation, counter, contextIndexWriter); indexContextFile(subcellularLocationFile, ContextIndexFieldNames.FIELD_SUBCELLULAR_LOCATION, contextTransformation, counter, contextIndexWriter); indexContextFile(subunitFile, ContextIndexFieldNames.FIELD_SUBUNIT, contextTransformation, counter, contextIndexWriter); indexContextFile(summaryFile, ContextIndexFieldNames.FIELD_SUMMARY, contextTransformation, counter, contextIndexWriter); indexContextFile(tissueFile, ContextIndexFieldNames.FIELD_TISSUE, contextTransformation, counter, contextIndexWriter); indexContextFile(tissueSpecificityFile, ContextIndexFieldNames.FIELD_TISSUE_SPECIFICITY, contextTransformation, counter, contextIndexWriter); contextIndexWriter.forceMerge(4); log.info("Done writing context item index for {} ids.", counter); } private void indexContextFile(File contextFile, String contextType, Function contextTransformation, AtomicInteger counter, IndexWriter contextIndexWriter) throws IOException { try (BufferedReader br = new BufferedReader(FileUtilities.getReaderFromFile(contextFile))) { br.lines().parallel().forEach(line -> { Document d = null; try { String[] split = line.split("\t"); if (split.length != 2) log.error("Wrong format in file {}. Expected two columns but got {}. Critical line: {}", contextFile, split.length, line); String id = split[0]; String contextItem = split[1]; Field lookupIdField = new StringField(ContextIndexFieldNames.LOOKUP_ID_FIELD, id, Store.YES); Field lookupContextField = new TextField(ContextIndexFieldNames.FIELD_CONTEXT_ITEM, contextTransformation.apply(contextItem), Store.YES); Field contextTypeField = new StringField(ContextIndexFieldNames.FIELD_ITEM_TYPE, contextType, Store.NO); d = new Document(); d.add(lookupIdField); d.add(lookupContextField); d.add(contextTypeField); contextIndexWriter.addDocument(d); counter.incrementAndGet(); } catch (IOException e) { log.error("Could not index document {}", d); throw new UncheckedGeneMapperResourcesException(e); } }); } } /** * create the directory object where to put the lucene index... */ private FSDirectory createIndexDirectory(File indexFile) { FSDirectory fdir; try { // NIOFSDirectory performs better than FSDirectory, but not on windows (see NIOFSDirectory JavaDoc) boolean isWindows = System.getProperty("os.name").toLowerCase().contains("win"); fdir = FSDirectory.open(indexFile.toPath()); } catch (IOException e) { log.error("Could not create index directory {}", indexFile, e); throw new UncheckedGeneMapperResourcesException(e); } return fdir; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy