/**
* IndexGenerator.java
*
* Copyright (c) 2006, JULIE Lab.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Common Public License v1.0
*
* Author: tomanek
*
* Current version: 1.5.1
* Since version: 1.0
*
* Creation date: Nov 30, 2006
*
* This class generates the Lucene index from the modified biothesaurus as
* provided by EBI in BootSTREP.
*
* This version of the index generator expects a consolidated biothesaurus file
* which consists of exactly these tab-separated columns:
* - col1: synonym (normalized)
* - col2: gene or gene family ID
* - col3: synonym priority
*
* IMPORTANT NOTES:
* - no normalization is done here, so normalize the biothesaurus file yourself beforehand
* - for better performance, make the entries in the biothesaurus file unique!
**/
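// Illustrative dictionary lines matching the three-column format described above
// (tab-separated; the names, ID, and priorities are made up for demonstration only):
//   interleukin 2<TAB>3558<TAB>1
//   il 2<TAB>3558<TAB>2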
package de.julielab.genemapper.resources;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import de.julielab.gene.candidateretrieval.SynonymIndexFieldNames;
import de.julielab.geneexpbase.CandidateFilter;
import de.julielab.geneexpbase.TermNormalizer;
import de.julielab.geneexpbase.genemodel.GeneMention;
import de.julielab.java.utilities.FileUtilities;
import de.julielab.java.utilities.ProgressBar;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.lang.ref.SoftReference;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.zip.GZIPInputStream;
public class GeneRecordIndexGenerator {
private static final Logger log = LoggerFactory.getLogger(GeneRecordIndexGenerator.class);
/**
 * The synonym index is filtered for unspecifieds and others. This field
 * determines whether filtered items should be omitted completely from the index
 * or whether they should just be flagged as filtered but still be included in
 * the index. The latter leads to a larger index, of course. Used for
 * experiments, not required as of January 30, 2018.
 */
private static final boolean OMIT_FILTERED = true;
private final Directory indexDirectory;
/**
* A file containing gene or protein names / synonyms and their respective NCBI
* Gene or UniProt ID. No term normalization is expected for this dictionary.
*/
private final File dictFile;
private final Map<String, File> extendedInformationFields;
Map<String, String> id2tax;
Set<SoftReference<Document>> documents = new HashSet<>();
/**
* @param dictFile A file containing gene or protein names / synonyms and their
* respective NCBI Gene or UniProt ID. No term normalization is
* expected for this dictionary.
* @param extendedInformationFields Maps index field names to the files containing the respective extended gene information.
* @param indexFile The directory where the name / synonym index will be written to.
* @throws FileNotFoundException
* @throws IOException
*/
public GeneRecordIndexGenerator(File dictFile, Map<String, File> extendedInformationFields, File indexFile) throws FileNotFoundException, IOException {
this.extendedInformationFields = extendedInformationFields;
log.info("Building gene records index from dictionary {}", dictFile);
this.dictFile = dictFile;
indexDirectory = FSDirectory.open(indexFile.toPath());
}
/**
* To execute the GeneRecordIndexGenerator start it with the following command-line
* arguments:
* arg0: name of the dictionary file within the resources directory
* arg1: path to the resources directory
* arg2: path to the index output directory
*
* @param args
*/
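// Example invocation (hypothetical paths and dictionary file name):
//   java -cp gene-mapper-resources.jar de.julielab.genemapper.resources.GeneRecordIndexGenerator \
//       gene.dict.norm.eg /data/genemapper/resources /data/genemapper/indices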
public static void main(String[] args) {
long s1 = System.currentTimeMillis();
if (args.length != 3) {
System.err.println(
"Usage: GeneRecordIndexGenerator ");
System.exit(1);
}
String dictFile = args[0];
String resPath = args[1];
File resDir = new File(resPath);
if (!resDir.isDirectory()) {
System.err.println("Could not find resources directory");
System.exit(1);
}
if (!resPath.endsWith(File.separator)) {
resPath = resPath + File.separator;
}
String indexPath = args[2];
File geneIndexDir = dictFile.equals("gene.dict.uniqueprioritynames.sortedbyid.eg") ? new File(indexPath, "geneNamesRecordsIndexOriginalNames") : new File(indexPath, "geneRecordsIndex");
File proteinIndexDir = dictFile.equals("gene.dict.uniqueprioritynames.sortedbyid.eg") ? new File(indexPath, "ProteinRecordsIndexOriginalNames") : new File(indexPath, "proteinRecordsIndex");
if (geneIndexDir.exists())
FileUtils.deleteQuietly(geneIndexDir);
if (proteinIndexDir.exists())
FileUtils.deleteQuietly(proteinIndexDir);
File upDictFile = new File(resPath + "gene.dict.up");
checkFile(upDictFile);
File egDictFile = new File(resPath + dictFile);
checkFile(egDictFile);
File eg2chromosome = new File(resPath + "eg2chromosome");
File eg2description = new File(resPath + "eg2description");
File eg2generif = new File(resPath + "eg2generif");
File eg2go = new File(resPath + "eg2go");
File goDesc = new File(resPath + "go_all");
File eg2interaction = new File(resPath + "eg2interaction");
File eg2maplocation = new File(resPath + "eg2maplocation");
File eg2summary = new File(resPath + "eg2summary");
File eg2ecnumber = new File(resPath + "eg2ecnumber-genexmldownloader.gz");
Map<String, File> extendedInformationFields = new LinkedHashMap<>();
extendedInformationFields.put(SynonymIndexFieldNames.CHROMOSOME, eg2chromosome);
extendedInformationFields.put(SynonymIndexFieldNames.DESCRIPTION, eg2description);
extendedInformationFields.put(SynonymIndexFieldNames.GENERIF, eg2generif);
extendedInformationFields.put("go", eg2go);
extendedInformationFields.put(SynonymIndexFieldNames.GODESC, goDesc);
extendedInformationFields.put(SynonymIndexFieldNames.INTERACTION, eg2interaction);
extendedInformationFields.put(SynonymIndexFieldNames.MAPLOCATION, eg2maplocation);
extendedInformationFields.put(SynonymIndexFieldNames.SUMMARY, eg2summary);
extendedInformationFields.put(SynonymIndexFieldNames.ECNUMBER, eg2ecnumber);
File upTaxMap = new File(resPath + "up2eg2tax.map");
checkFile(upTaxMap);
File geneInfo = new File(resPath + "gene_info_organism_filtered.gz");
File egTaxMap = geneInfo;
GeneRecordIndexGenerator indexGenerator;
try {
// indexGenerator = new SynonymIndexGenerator(upDictFile, proteinIndexDir);
// indexGenerator.readUpTaxMap(upTaxMap);
// indexGenerator.createIndex();
indexGenerator = new GeneRecordIndexGenerator(egDictFile, extendedInformationFields, geneIndexDir);
indexGenerator.readEgTaxMap(egTaxMap);
indexGenerator.createIndex();
} catch (IOException e) {
e.printStackTrace();
}
long s2 = System.currentTimeMillis();
System.out.println("Index created successfully! (" + (s2 - s1) / 1000 + " sec)");
}
private static void checkFile(File file) {
if (!file.isFile())
throw new IllegalArgumentException("File \"" + file.getAbsolutePath() + "\" could not be found.");
}
/**
* Creates the index, i.e. reads the biothesaurus file (which is expected to
* contain normalized synonyms!) and writes its entries to the index.
*
* @throws IOException
*/
public void createIndex() throws IOException {
CandidateFilter cf = new CandidateFilter();
TermNormalizer normalizer = new TermNormalizer();
Map<String, Multimap<String, String>> id2infotype2info = readExtendedInformationFiles();
FieldType notStoredTextFieldType = new FieldType(TextField.TYPE_NOT_STORED);
notStoredTextFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
FieldType storedTextFieldType = new FieldType(TextField.TYPE_STORED);
storedTextFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
FieldType stringFieldTypeDocsAndFreqs = new FieldType(StringField.TYPE_STORED);
stringFieldTypeDocsAndFreqs.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
WhitespaceAnalyzer wsAnalyzer = new WhitespaceAnalyzer();
IndexWriterConfig iwc = new IndexWriterConfig(wsAnalyzer);
iwc.setOpenMode(OpenMode.CREATE);
// VERY IMPORTANT: The dictionary file must be sorted by id. This is because we want to group the
// dictionary entries by id, but we don't want to read the whole dictionary and sort it in-memory
// because this may well exhaust the memory for the full all-species dictionary.
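// Illustrative example with made-up names and IDs: the consecutive, ID-sorted lines
//   il 2<TAB>3558<TAB>2
//   interleukin 2<TAB>3558<TAB>1
//   il 21<TAB>50615<TAB>2
// produce one index document for ID 3558 with the synonym/priority pairs
// {il 2=2, interleukin 2=1}, followed by a new document for ID 50615.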
final ExecutorService executorService = Executors.newFixedThreadPool(20);
AtomicInteger numDocumentsIndexed = new AtomicInteger();
try (IndexWriter iw = new IndexWriter(indexDirectory, iwc)) {
log.info("Counting number of lines of the dictionary file {}", dictFile);
long numLines;
try (BufferedReader normDictReader = new BufferedReader(new FileReader(dictFile))) {
numLines = normDictReader.lines().count();
}
log.info("Generating index now for {} synonyms.", numLines);
// now loop through dictionary and add entries to the index
ProgressBar progressBar = new ProgressBar(numLines, 80);
try (BufferedReader dictReader = new BufferedReader(new FileReader(dictFile))) {
Map<String, Integer> syn2prio = new HashMap<>();
String lastId = null;
boolean filtered = false;
String line;
int lineNum = 1;
while ((line = dictReader.readLine()) != null) {
String[] values = line.split("\t");
// check whether format is OK
if (values.length != 3) {
System.err.println("ERR: normalized dictionary not in expected format. \ncritical line: " + line);
// System.exit(-1);
continue;
}
// now get the field values
String normalizedName = values[0];
String id = values[1];
Integer priority = Integer.parseInt(values[2]);
boolean isFamilyEntry = id.contains("GENO:");
if (cf != null && !OMIT_FILTERED && priority != -1 && !isFamilyEntry) {
filtered = DictionaryFamilyDomainFilter.isFiltered(id, cf, normalizedName);
}
if (filtered && OMIT_FILTERED)
continue;
if (lastId != null && !lastId.equals(id)) {
boolean finalFiltered = filtered;
log.trace("Indexing gene record with ID {} and synonym/priority pairs {}", lastId, syn2prio);
String finalLastId = lastId;
HashMap<String, Integer> syn2prio4id = new HashMap<>(syn2prio);
executorService.submit(() -> {
try {
indexGeneRecord(finalLastId, syn2prio4id, id2infotype2info, finalFiltered, normalizer, iw, notStoredTextFieldType, storedTextFieldType, stringFieldTypeDocsAndFreqs, numDocumentsIndexed);
} catch (IOException e) {
log.error("Could not create index document for gene id {}", finalLastId, e);
}
});
syn2prio.clear();
}
syn2prio.put(normalizedName, priority);
lastId = id;
//if (lineNum % 100000 == 0)
// log.info("Now processing line {}", lineNum);
// No need to try and show the progress bar when a lot of logging messages appear all the time
if (lineNum % 1000 == 0 && !log.isDebugEnabled()) {
progressBar.incrementDone(lineNum - progressBar.getDone(), true);
}
// if (lineNum % 1000000 == 0) {
// log.debug("Committing IndexWriter at line number {}.", lineNum);
// if (log.isDebugEnabled())
// synchronized (documents) {
// log.debug("There are {} documents reachable", documents.stream().map(SoftReference::get).filter(Objects::nonNull).count());
// }
// iw.commit();
// }
++lineNum;
}
// Index the last entry
boolean finalFiltered = filtered;
String finalLastId = lastId;
executorService.submit(() -> {
try {
indexGeneRecord(finalLastId, new HashMap<>(syn2prio), id2infotype2info, finalFiltered, normalizer, iw, notStoredTextFieldType, storedTextFieldType, stringFieldTypeDocsAndFreqs, numDocumentsIndexed);
} catch (IOException e) {
log.error("Could not create index document for gene id {}", finalLastId, e);
} catch (Throwable t) {
log.error("Error", t);
}
});
log.info("Dictionary file {} has been consumed, all indexing jobs have been sent.", dictFile);
} finally {
try {
log.info("Shutting down executor.");
log.info("Waiting for running threads to terminate.");
executorService.shutdown();
// The 100 days basically mean "wait until we are finished"
executorService.awaitTermination(100, TimeUnit.DAYS);
} catch (InterruptedException e) {
log.warn("Waiting for running threads to finish has been interrupted. Shutting down the executor service now.");
executorService.shutdownNow();
}
log.info("ExecutorService has been shut down.");
}
log.info("Committing {} documents to the index.", numDocumentsIndexed.get());
iw.commit();
iw.forceMerge(5);
} catch (IOException e) {
e.printStackTrace();
}
}
private Map<String, Multimap<String, String>> readExtendedInformationFiles() {
// chromosome - single field
// description - single field
// generif - single field
// go - value is GO ID; single field
// godesc - key is GO ID; multifield, pipe as separator
// interaction - single field
// maplocation - single field
// summary - single field
// The map must be ordered such that the "go" file comes before the "godesc" file so that goid2eg is already populated when the GO descriptions are resolved
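// Illustrative (assumed) line formats, derived from how the splits are used below:
//   eg2go:   <geneId>\t<goId>           e.g. 3558<TAB>GO:0005125 (made-up pairing)
//   go_all:  <goId>\t<desc1>|<desc2>    e.g. GO:0005125<TAB>cytokine activity
//   others:  <geneId>\t<value>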
Map<String, String> goid2eg = new HashMap<>();
Map<String, Multimap<String, String>> infoMap = new HashMap<>();
log.info("Loading extended information files");
for (String informationType : extendedInformationFields.keySet()) {
File f = extendedInformationFields.get(informationType);
log.info("Reading {}", f);
try (BufferedReader br = FileUtilities.getReaderFromFile(f)) {
Stream<String[]> linesSplits = br.lines().map(s -> s.split("\t"));
if (informationType.equals("godesc")) {
linesSplits = linesSplits.map(s -> {
s[0] = goid2eg.containsKey(s[0]) ? goid2eg.get(s[0].intern()).intern() : null;
return s;
});
}
if (informationType.equals("go")) {
linesSplits.forEach(s -> goid2eg.put(s[1].intern(), s[0].intern()));
} else {
linesSplits.filter(Objects::nonNull).filter(s -> s[0] != null && s[1] != null).filter(s -> !s[0].isBlank() && !s[1].isBlank()).forEach(s ->
infoMap.compute(s[0].intern(), (k, v) -> v != null ? v : HashMultimap.create()).put(informationType.intern(), s[1]));
}
} catch (IOException e) {
log.error("Could not read file {}. The respective extended information will not be added to the index", f, e);
}
}
return infoMap;
}
public void indexGeneRecord(String id, Map<String, Integer> syn2prio, Map<String, Multimap<String, String>> id2infotype2info, boolean filtered, TermNormalizer normalizer, IndexWriter iw, FieldType notStoredTextFieldType, FieldType storedTextFieldType, FieldType stringFieldTypeDocsAndFreqs, AtomicInteger numDocumentsIndexed) throws IOException {
try {
String tax = "";
if (id2tax.get(id) != null) {
tax = id2tax.get(id);
}
// make fields
List<Field> fields = new ArrayList<>();
fields.add(new StringField(SynonymIndexFieldNames.ENTITY_TYPE, id.contains("GENO:") ? GeneMention.SpecificType.FAMILYNAME.name() : GeneMention.SpecificType.GENE.name(), Store.YES));
Field idField = new StringField(SynonymIndexFieldNames.ID_FIELD, id, Store.YES);
for (String normalizedName : syn2prio.keySet()) {
Integer priority = syn2prio.get(normalizedName);
String fieldname;
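// Priority codes map to index fields as implemented below: -1=SYMBOL,
// 0=SYMBOL_FROM_NOMCENCLATURE, 1=FULL_NAMES, 2=SYNONYMS, 3=OTHER_DESIGNATIONS,
// 4=PROTEIN_NAMES, 5=UNIPROT_NAMES, 6=XREFS, 7=BIO_THESAURUS; anything else is rejected.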
if (priority == -1)
fieldname = SynonymIndexFieldNames.SYMBOL;
else if (priority == 0)
fieldname = SynonymIndexFieldNames.SYMBOL_FROM_NOMCENCLATURE;
else if (priority == 1)
fieldname = SynonymIndexFieldNames.FULL_NAMES;
else if (priority == 2)
fieldname = SynonymIndexFieldNames.SYNONYMS;
else if (priority == 3)
fieldname = SynonymIndexFieldNames.OTHER_DESIGNATIONS;
else if (priority == 4)
// from the XML extracted from the NCBI Gene ASN.1 file
fieldname = SynonymIndexFieldNames.PROTEIN_NAMES;
else if (priority == 5)
fieldname = SynonymIndexFieldNames.UNIPROT_NAMES;
else if (priority == 6)
fieldname = SynonymIndexFieldNames.XREFS;
else if (priority == 7)
fieldname = SynonymIndexFieldNames.BIO_THESAURUS;
else
throw new IllegalArgumentException("Unsupported synonym priority: " + priority);
log.trace("Now adding field {} for synonym {} for ID {}", fieldname, normalizedName, id);
fields.add(new Field(fieldname, normalizedName, storedTextFieldType));
// fields.add(new Field(SynonymIndexFieldNames.LOOKUP_SYN_FIELD, normalizedName, notStoredTextFieldType));
// For the experiments on disambiguation: This allows us to try and match the exact
// name.
fields.add(new Field(fieldname + "_exact", normalizedName, stringFieldTypeDocsAndFreqs));
// fields.add(new Field(SynonymIndexFieldNames.LOOKUP_SYN_FIELD_EXACT, normalizedName, notStoredTextFieldType));
}
Field taxField = new StringField(SynonymIndexFieldNames.TAX_ID_FIELD, tax, Store.YES);
if (!OMIT_FILTERED) {
IntPoint filteredField = new IntPoint(SynonymIndexFieldNames.FILTERED, filtered ? 1 : 0);
StoredField storedFilteredField = new StoredField(SynonymIndexFieldNames.FILTERED,
filtered ? 1 : 0);
fields.add(filteredField);
fields.add(storedFilteredField);
}
// chromosome - single field
// description - single field
// generif - single field
// godesc - multifield, pipe as separator
// interaction - single field
// maplocation - single field
// summary - single field
Multimap<String, String> infotype2info = id2infotype2info.get(id);
if (infotype2info != null) {
for (String infotype : infotype2info.keySet()) {
for (String value : infotype2info.get(infotype)) {
if (!infotype.equals(SynonymIndexFieldNames.GODESC)) {
fields.add(new Field(infotype, normalizer.normalize(value), storedTextFieldType));
} else {
String[] values = value.split("\\|");
for (String v : values)
fields.add(new Field(infotype, normalizer.normalize(v), storedTextFieldType));
}
}
}
}
// Each ID occurs only once in the dictionary, so we can remove its associated data now.
id2infotype2info.remove(id);
fields.add(idField);
fields.add(taxField);
// make document and add to index
Document d = new Document();
for (Field f : fields)
d.add(f);
if (!fields.isEmpty()) {
iw.addDocument(d);
numDocumentsIndexed.incrementAndGet();
synchronized (documents) {
documents.add(new SoftReference<>(d));
}
}
} catch (Throwable t) {
log.error("Error occurred", t);
throw t;
}
}
private void readUpTaxMap(File taxMap) throws IOException {
log.info("Reading up2eg2tax.map ...");
id2tax = new HashMap<>();
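// Expected format (three tab-separated columns): <upId>\t<egId>\t<taxId>.
// Only the first and third columns are used; the middle column being the
// Entrez Gene ID is an assumption based on the file name up2eg2tax.map.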
try (BufferedReader reader = new BufferedReader(new FileReader(taxMap))) {
String line;
while ((line = reader.readLine()) != null) {
String[] entry = line.split("\t");
if (entry.length != 3) {
System.err.println("ERR: up2eg2tax.map not in expected format. \ncritical line: " + line);
System.exit(-1);
}
String id = entry[0].trim().intern();
String taxId = entry[2].trim().intern();
id2tax.put(id, taxId);
}
}
}
private void readEgTaxMap(File geneInfo) throws IOException {
log.info("Reading file gene ID to taxonomy ID map from {}", geneInfo);
try (BufferedReader br = new BufferedReader(
new InputStreamReader(new GZIPInputStream(new FileInputStream(geneInfo))))) {
id2tax = br.lines().collect(
Collectors.toMap(l -> l.split("\\t", 3)[1].intern(), l -> l.split("\\t", 3)[0].intern()));
}
}
}