
de.julielab.genemapper.resources.ContextItemRecordsIndexGenerator Maven / Gradle / Ivy


This project assembles code and files required to build the dictionaries and indexes used by the JCoRe Gene Mapper.

package de.julielab.genemapper.resources;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import de.julielab.geneexpbase.TermNormalizer;
import de.julielab.genemapper.index.ContextIndexFieldNames;
import de.julielab.java.utilities.FileUtilities;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Path;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Function;
import java.util.function.Predicate;

/**
 * This class creates a full text index of information about each gene which serves for disambiguation.
 * <p>
 * The indexed items are grouped by gene ID (in contrast to {@link ContextItemIndexGenerator}) and contain
 * the GeneRIF, interaction and summary fields of the genes in NCBI Gene. This information can be used to
 * check whether a specific synonym has actually been used in association with a specific gene ID entry,
 * for example.
 *
 * @deprecated This index type is superseded by {@link ContextItemIndexGenerator}
 */
public class ContextItemRecordsIndexGenerator {

    private final static Logger log = LoggerFactory.getLogger(ContextItemRecordsIndexGenerator.class);
    private final Directory indexDirectory;
    private final boolean normalizeContexts;
    private final File idFile;
    private final String resourcesDir;

    /**
     * Constructor which creates the index in the specified directory on disk.
     */
    public ContextItemRecordsIndexGenerator(File idFile, File indexFile, String resourcesDir, boolean normalizeContexts) {
        this.idFile = idFile;
        this.resourcesDir = resourcesDir;
        indexDirectory = createIndexDirectory(indexFile);
        this.normalizeContexts = normalizeContexts;
    }

    /**
     * To execute the ContextItemRecordsIndexGenerator, start it with the following command-line arguments:
     * arg0: path to the resources directory
     * arg1: path to the context indices directory
     * arg2: whether context items should be normalized (true/false)
     *
     * @param args the command-line arguments described above
     */
    public static void main(String[] args) {
        long s1 = System.currentTimeMillis();
        if (args.length != 3) {
            System.err.println("Usage: ContextItemRecordsIndexGenerator <resourcesDirectory> <contextIndicesDirectory> <normalizeContexts>");
            System.exit(-1);
        }
        String indexBaseDir = args[1];
        if (!indexBaseDir.endsWith(File.separator)) {
            indexBaseDir = indexBaseDir + File.separator;
        }
        boolean normalizeContexts = Boolean.parseBoolean(args[2]);
        String nameSuffix = normalizeContexts ? "Normalized" : "Original";
        File geneIndexDir = new File(indexBaseDir + "geneContextItemsIndex" + nameSuffix);
        File proteinIndexDir = new File(indexBaseDir + "proteinContextItemsIndex" + nameSuffix);
        if (geneIndexDir.exists())
            FileUtils.deleteQuietly(geneIndexDir);
        if (proteinIndexDir.exists())
            FileUtils.deleteQuietly(proteinIndexDir);
        String resPath = args[0];
        File resDir = new File(resPath);
        if (!resDir.isDirectory()) {
            System.err.println("Could not find resources directory");
            System.exit(-1);
        }
        if (!resPath.endsWith(File.separator)) {
            resPath = resPath + File.separator;
        }
        File upFile = new File(resPath + "up.ids");
        if (!upFile.isFile()) {
            System.err.println("Could not find file up.ids");
            System.exit(-1);
        }
        File egFile = new File(resPath + "eg.ids");
        if (!egFile.isFile()) {
            System.err.println("Could not find file eg.ids");
            System.exit(-1);
        }
        ContextItemRecordsIndexGenerator indexGenerator;
        try {
            indexGenerator = new ContextItemRecordsIndexGenerator(upFile, proteinIndexDir, resPath, normalizeContexts);
            indexGenerator.createIndex("protein");
            indexGenerator = new ContextItemRecordsIndexGenerator(egFile, geneIndexDir, resPath, normalizeContexts);
            indexGenerator.createIndex("gene");
        } catch (IOException e) {
            e.printStackTrace();
        }
        long s2 = System.currentTimeMillis();
        System.out.println("Indices created successfully! (" + (s2 - s1) / 1000 + " sec)");
    }

    /**
     * Creates the index: reads the context item files for the given ID type from the resources
     * directory and writes one Lucene document per ID to the index.
     *
     * @throws IOException if a context file or the index cannot be accessed
     */
    public void createIndex(String idType) throws IOException {
        String typePrefix = idType.equals("gene") ? "eg" : "up";
        // One multimap per index field, each mapping gene/protein IDs to their context items.
        Map<String, Multimap<String, String>> contextMaps = new HashMap<>();
        readContextInformation(Path.of(resourcesDir, typePrefix + "2generif").toFile(), contextMaps, ContextIndexFieldNames.FIELD_GENERIF);
        readContextInformation(Path.of(resourcesDir, typePrefix + "2interaction").toFile(), contextMaps, ContextIndexFieldNames.FIELD_INTERACTION);
        readContextInformation(Path.of(resourcesDir, typePrefix + "2summary").toFile(), contextMaps, ContextIndexFieldNames.FIELD_SUMMARY);
        readContextInformation(Path.of(resourcesDir, typePrefix + "2description").toFile(), contextMaps, ContextIndexFieldNames.FIELD_DESCRIPTION);
        readContextInformation(Path.of(resourcesDir, typePrefix + "2chromosome").toFile(), contextMaps, ContextIndexFieldNames.FIELD_CHROMOSOME);
        readContextInformation(Path.of(resourcesDir, typePrefix + "2maplocation").toFile(), contextMaps, ContextIndexFieldNames.FIELD_MAP_LOCATION);
        readContextInformation(Path.of(resourcesDir, typePrefix + "2synonyms").toFile(), contextMaps, ContextIndexFieldNames.FIELD_SYNONYMS_BOW);
        IndexWriterConfig iwc = new IndexWriterConfig(new WhitespaceAnalyzer());
        iwc.setOpenMode(OpenMode.CREATE);
        IndexWriter contextIndexWriter = new IndexWriter(indexDirectory, iwc);
        TermNormalizer normalizer = new TermNormalizer();
        AtomicInteger counter = new AtomicInteger();
        Function<String, String> contextTransformation = normalizeContexts
                ? context -> normalizer.normalize(context)
                : Function.identity();
        log.info("Writing index {}", indexDirectory);
        try (BufferedReader idReader = new BufferedReader(new FileReader(idFile))) {
            idReader.lines().parallel().forEach(id -> {
                try {
                    List<Field> fields = new ArrayList<>();
                    for (String contextField : contextMaps.keySet()) {
                        final Collection<String> contextItems = contextMaps.get(contextField).get(id);
                        if (contextItems != null) {
                            Field lookupIdField = new StringField(ContextIndexFieldNames.LOOKUP_ID_FIELD, id, Store.YES);
                            fields.add(lookupIdField);
                            for (String contextItem : contextItems) {
                                String contextString = contextItem;
                                if (contextField.equals(ContextIndexFieldNames.FIELD_SYNONYMS_BOW)) {
                                    String[] synonyms = contextString.split("§sep§");
                                    // for the bag-of-words we index all the synonym parts just like any other context item
                                    contextString = String.join(" ", synonyms);
                                    // for the exact synonyms, we store the normalized synonyms as keyword strings to be matched exactly
                                    for (String synonym : synonyms) {
                                        if (synonym.isBlank())
                                            continue;
                                        Field exactSynonymsField = new StringField(ContextIndexFieldNames.FIELD_SYNONYMS_EXACT,
                                                contextTransformation.apply(synonym), Store.YES);
                                        fields.add(exactSynonymsField);
                                    }
                                }
                                Field lookupContextField = new TextField(contextField,
                                        contextTransformation.apply(contextString), Store.YES);
                                fields.add(lookupContextField);
                            }
                        }
                    }
                    Document d = new Document();
                    for (Field f : fields)
                        d.add(f);
                    contextIndexWriter.addDocument(d);
                    counter.incrementAndGet();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            });
            contextIndexWriter.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
        log.info("Done writing context item index for {} ids.", counter);
    }

    private void readContextInformation(File contextFile, Map<String, Multimap<String, String>> contextMaps, String fieldname)
            throws IOException {
        Multimap<String, String> context = HashMultimap.create();
        try (final BufferedReader br = FileUtilities.getReaderFromFile(contextFile)) {
            br.lines()
                    .filter(Predicate.not(line -> line.startsWith("#")))
                    .map(line -> line.split("\t"))
                    .filter(split -> {
                        if (split.length < 2)
                            log.warn("Skipping split {} because there are not enough fields", Arrays.toString(split));
                        return split.length > 1;
                    })
                    .forEach(split -> context.put(split[0].intern(), split[1]));
        } catch (Exception e) {
            log.error("Could not load context file {}", contextFile, e);
            throw e;
        }
        log.info("Read context file {} with {} entries", contextFile, context.size());
        contextMaps.put(fieldname, context);
    }

    /**
     * Creates the directory object for the location of the Lucene index.
     */
    private FSDirectory createIndexDirectory(File indexFile) {
        FSDirectory fdir = null;
        try {
            fdir = FSDirectory.open(indexFile.toPath());
        } catch (IOException e) {
            e.printStackTrace();
        }
        return fdir;
    }
}
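For reference, the generator can also be driven programmatically instead of through main, e.g. from a resource build pipeline. A minimal sketch, assuming the resource files (eg.ids, eg2generif, eg2interaction, ...) live under the hypothetical directory /data/genemapper-resources:

// Hypothetical paths; the constructor and createIndex calls mirror what main() does for gene IDs.
File egIdFile = new File("/data/genemapper-resources/eg.ids");
File geneIndexDir = new File("/data/indexes/geneContextItemsIndexNormalized");
ContextItemRecordsIndexGenerator generator =
        new ContextItemRecordsIndexGenerator(egIdFile, geneIndexDir, "/data/genemapper-resources", true);
generator.createIndex("gene"); // reads eg2generif, eg2interaction, eg2summary, ... and writes the Lucene index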

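The context files consumed by readContextInformation are plain tab-separated text: each line holds a gene or protein ID, a tab, and one context item; lines starting with # are skipped, and lines with fewer than two fields are logged and dropped. In the synonyms file, the synonyms of an ID are additionally concatenated with the §sep§ marker and split again at indexing time. An illustrative excerpt of an eg2generif file (the values are invented for illustration):

# geneId<TAB>contextItem
2475	Observations on the role of this kinase in cell growth.
2475	Reported to interact with raptor in mammalian cells.
7157	Mutations were associated with tumor progression.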

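As a usage example for the finished index, the sketch below checks whether a given (already normalized) synonym has been recorded for a specific gene ID, the kind of disambiguation lookup described in the class Javadoc. This helper is not part of the project; it only uses standard Lucene classes plus the ContextIndexFieldNames constants referenced above, and the index path is an assumption:

import java.io.IOException;
import java.nio.file.Path;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.FSDirectory;

// Sketch: returns true if the normalized synonym occurs for the given gene ID.
// LOOKUP_ID_FIELD and FIELD_SYNONYMS_EXACT are StringFields, so TermQuery matches their values exactly.
static boolean isSynonymRecordedForId(String geneId, String normalizedSynonym) throws IOException {
    try (DirectoryReader reader = DirectoryReader.open(
            FSDirectory.open(Path.of("/data/indexes/geneContextItemsIndexNormalized")))) { // hypothetical path
        IndexSearcher searcher = new IndexSearcher(reader);
        BooleanQuery query = new BooleanQuery.Builder()
                .add(new TermQuery(new Term(ContextIndexFieldNames.LOOKUP_ID_FIELD, geneId)), Occur.MUST)
                .add(new TermQuery(new Term(ContextIndexFieldNames.FIELD_SYNONYMS_EXACT, normalizedSynonym)), Occur.MUST)
                .build();
        return searcher.count(query) > 0;
    }
}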

