de.julielab.genemapper.resources.NameCentricSynonymIndexGenerator Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of gene-mapper-resources Show documentation
This project assembles code and files required to build the dictionaries and indexes used by the JCoRe
Gene Mapper.
The newest version!
package de.julielab.genemapper.resources;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multiset;
import de.julielab.gene.candidateretrieval.LuceneCandidateRetrieval;
import de.julielab.gene.candidateretrieval.SynonymIndexFieldNames;
import de.julielab.geneexpbase.CandidateFilter;
import de.julielab.geneexpbase.candidateretrieval.SynHit;
import de.julielab.java.utilities.FileUtilities;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.ngram.NGramFilterFactory;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.document.*;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;
/**
* Synonym or gene name centric indexer, new as of March 11, 2019. The idea is to save storage and gain more focused
* gene mention search results by not indexing each synonym of each gene but group the gene ids by all possible
* synonyms. Thus, each synonym is only stored once and references the list of genes it may refer to, immediately
* showing the ambiguity of the synonym.
*/
public class NameCentricSynonymIndexGenerator {
    private static final Logger log = LoggerFactory.getLogger(NameCentricSynonymIndexGenerator.class);
    /**
     * When {@code true}, the {@link CandidateFilter} is never consulted and no FILTERED fields are
     * written to the index documents. Primitive {@code boolean}: the former {@code Boolean} wrapper
     * added autoboxing for no benefit.
     */
    private static final boolean OMIT_FILTERED = true;
    /** Receives a copy of the dictionary restricted to the synonyms that were actually indexed. */
    private final File filteredDictFile;
    /** Receives the IDs of genes whose synonym is ambiguous within a single taxonomy. */
    private final File ambiguousSynsFile;
    /** Produces 2-3 grams for the fuzzy lookup field of each synonym. */
    private final NGramFilterFactory nGramFilterFactory;
    /** Maps gene/protein IDs to their NCBI taxonomy ID; populated by one of the readXyTaxMap methods. */
    Map<String, String> id2tax;
    Directory indexDirectory;
    /**
     * A file containing gene or protein names / synonyms and their respective NCBI
     * Gene or UniProt ID.
     */
    private final File dictFile;
    private final File familyRecordsFile;
    private final String idSource;
    private final String entityType;

    /**
     * @param dictFile          A file containing gene or protein names / synonyms and their
     *                          respective NCBI Gene or UniProt ID. No term normalization is
     *                          expected for this dictionary. VERY IMPORTANT: the file must be
     *                          sorted by synonym; see {@link #createIndex()}.
     * @param familyRecordsFile A dictionary of gene family names (currently not indexed, see
     *                          {@link #createIndex()}).
     * @param idSource          The value for the index SOURCE field, e.g. "NCBI Gene".
     * @param entityType        The value for the index ENTITY_TYPE field.
     * @param indexFile         The directory where the name / synonym index will be written to.
     */
    public NameCentricSynonymIndexGenerator(File dictFile, File familyRecordsFile, String idSource, String entityType, File indexFile) {
        this.familyRecordsFile = familyRecordsFile;
        this.idSource = idSource;
        this.entityType = entityType;
        System.out.println("Building synonym index from dictionary " + dictFile.getAbsolutePath());
        System.out.println("Adding family synonyms from " + familyRecordsFile.getAbsolutePath());
        this.dictFile = dictFile;
        // Side-output files are placed next to the input dictionary.
        this.filteredDictFile = new File(dictFile.getParent(), dictFile.getName() + ".indexGeneratorFiltered");
        this.ambiguousSynsFile = new File(dictFile.getParent(), "intra_tax_ambiguous.eg");
        indexDirectory = createIndexDirectory(indexFile);
        Map<String, String> ngramFilterSettings = new HashMap<>();
        ngramFilterSettings.put("minGramSize", "2");
        ngramFilterSettings.put("maxGramSize", "3");
        nGramFilterFactory = new NGramFilterFactory(ngramFilterSettings);
    }

    /**
     * To execute the index generator start it with the following command-line arguments:
     * arg0: path to resources directory, arg1: gene_info file name relative to arg0,
     * arg2: path to synonym indices directory.
     *
     * @param args See above.
     */
    public static void main(String[] args) {
        long s1 = System.currentTimeMillis();
        if (args.length != 3) {
            // NOTE(review): the original message had lost its argument placeholders (probably
            // stripped as HTML); restored from the actual argument handling below.
            System.err.println(
                    "Usage: SynonymIndexGenerator <resourcesDirectory> <geneInfoFile> <indexOutputDirectory>");
            System.exit(1);
        }
        String resPath = args[0];
        File resDir = new File(resPath);
        if (!resDir.isDirectory()) {
            System.err.println("Could not find resources directory");
            System.exit(1);
        }
        if (!resPath.endsWith(File.separator)) {
            resPath = resPath + File.separator;
        }
        // The gene_info file is given relative to the resources directory.
        File geneInfo = new File(resPath + args[1]);
        if (!geneInfo.exists()) {
            System.err.println("Gene info file could not be found at " + geneInfo.getAbsolutePath());
            System.exit(1);
        }
        String indexPath = args[2];
        if (!indexPath.endsWith("/")) {
            indexPath = indexPath + "/";
        }
        File geneIndexDir = new File(indexPath + "geneSynonymIndex");
        File proteinIndexDir = new File(indexPath + "proteinSynonymIndex");
        // Always rebuild from scratch: remove pre-existing indexes.
        if (geneIndexDir.exists())
            FileUtils.deleteQuietly(geneIndexDir);
        if (proteinIndexDir.exists())
            FileUtils.deleteQuietly(proteinIndexDir);
        File upDictFile = new File(resPath + "gene.dict.variants.norm.up");
        //checkFile(upDictFile);
        File egDictFile = new File(resPath + "gene.dict.variants.norm.filtered.eg");
        checkFile(egDictFile);
        File upTaxMap = new File(resPath + "up2eg2tax.map");
        checkFile(upTaxMap);
        File familyRecordsFile = new File(resPath + "familyrecords.dict");
        File egTaxMap = geneInfo;
        NameCentricSynonymIndexGenerator indexGenerator;
        try {
            // The UniProt-based protein index is currently disabled:
            // indexGenerator = new NameCentricSynonymIndexGenerator(upDictFile, proteinIndexDir);
            // indexGenerator.readUpTaxMap(upTaxMap);
            // indexGenerator.createIndex();
            indexGenerator = new NameCentricSynonymIndexGenerator(egDictFile, familyRecordsFile, "NCBI Gene", SynHit.TYPE_GEPRO, geneIndexDir);
            indexGenerator.readEgTaxMap(egTaxMap);
            indexGenerator.createIndex();
        } catch (IOException e) {
            e.printStackTrace();
        }
        long s2 = System.currentTimeMillis();
        System.out.println("Index created successfully! (" + (s2 - s1) / 1000 + " sec)");
    }

    /**
     * Fails fast with an {@link IllegalArgumentException} if the given file does not exist
     * or is not a regular file.
     */
    private static void checkFile(File file) {
        if (!file.isFile())
            throw new IllegalArgumentException("File \"" + file.getAbsolutePath() + "\" could not be found.");
    }

    /**
     * Creates the synonym index. Each unique synonym is indexed in a document of its own. Each such document
     * has a number of fields for each gene that has the current synonym and lists the gene ID, its tax ID (if the
     * tax ID mapping is given) and the "priority" that the synonym has for the gene. The priority aims to describe
     * the reliability of the source given the respective synonym. Higher numbers mean a lower priority.
     * The official gene symbol has priority -1.
     *
     * @throws IOException If reading the dictionary or writing the index fails.
     */
    public void createIndex() throws IOException {
        CandidateFilter cf = new CandidateFilter();
        WhitespaceAnalyzer wsAnalyzer = new WhitespaceAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(wsAnalyzer);
        iwc.setOpenMode(OpenMode.CREATE);
        log.info("Using up to 20 threads for index document creation");
        // VERY IMPORTANT: The dictionary file must be sorted by synonym. This is because we want to group the
        // dictionary entries by synonym but we don't want to read the whole dictionary and sort it in-memory
        // because this may well exhaust the memory for the full all-species dictionary.
        final ExecutorService executorService = Executors.newFixedThreadPool(20);
        try (IndexWriter iw = new IndexWriter(indexDirectory, iwc)) {
            try (BufferedWriter outdictBw = FileUtilities.getWriterToFile(filteredDictFile);
                 BufferedWriter ambiguousSynonymsBw = FileUtilities.getWriterToFile(ambiguousSynsFile)) {
                try {
                    indexDictionary(dictFile, idSource, entityType, cf, executorService, iw, outdictBw, ambiguousSynonymsBw);
                    // Family record indexing is currently disabled:
                    // indexDictionary(familyRecordsFile, "GenoFamilies", SynHit.TYPE_GROUP, null, executorService, iw, null, null);
                } finally {
                    try {
                        executorService.shutdown();
                        log.info("Waiting for running threads to terminate.");
                        // The 100 days basically mean "wait until we are finished"
                        executorService.awaitTermination(100, TimeUnit.DAYS);
                    } catch (InterruptedException e) {
                        log.warn("Waiting for running threads to finish has been interrupted. Shutting down the executor service now.");
                        executorService.shutdownNow();
                        // Restore the interrupt flag for callers further up the stack.
                        Thread.currentThread().interrupt();
                    }
                    log.info("ExecutorService has been shut down.");
                }
            }
            log.info("Committing all index additions.");
            iw.commit();
        }
    }

    /**
     * Streams the (synonym-sorted) dictionary file line by line, delegating the grouping-by-synonym
     * logic to {@link #processLine}. A sentinel line is processed at the end so the last synonym
     * group is also flushed to the index.
     *
     * @throws IOException If the dictionary file cannot be read.
     */
    public void indexDictionary(File dictFile, String idSource, String entityType, CandidateFilter cf, ExecutorService executorService, IndexWriter iw, BufferedWriter outdictBw, BufferedWriter ambiguousSynonymsBw) throws IOException {
        try (final BufferedReader br = FileUtilities.getReaderFromFile(dictFile)) {
            AtomicInteger counter = new AtomicInteger();
            String line;
            String currentSynonym = null;
            List<String[]> entriesForCurrentSynonym = new ArrayList<>();
            while ((line = br.readLine()) != null) {
                currentSynonym = processLine(line, currentSynonym, entriesForCurrentSynonym, counter, idSource, entityType, iw, outdictBw, ambiguousSynonymsBw, cf, executorService);
            }
            if (currentSynonym != null) {
                // Sentinel that cannot equal any real synonym; forces the flush of the last group.
                line = "$$END\tOF\tFILE$$";
                processLine(line, currentSynonym, entriesForCurrentSynonym, counter, idSource, entityType, iw, outdictBw, ambiguousSynonymsBw, cf, executorService);
            }
        }
    }

    /**
     * Collects consecutive dictionary lines that share the same synonym. When the synonym changes,
     * the accumulated entries for the previous synonym are handed to the executor service for
     * asynchronous index document creation.
     *
     * @return The synonym of the given line, which becomes the new "current" synonym.
     */
    private String processLine(String line, String currentSynonym, List<String[]> entriesForCurrentSynonym, AtomicInteger counter, String idSource, String entityType, IndexWriter iw, BufferedWriter outdictBw, BufferedWriter ambiguousSynonymsBw, CandidateFilter cf, ExecutorService executorService) {
        final String[] split = line.split("\t");
        // Expected format: synonym <tab> id <tab> priority. The original check compared
        // "!= 3 && != 3" (a duplicated condition) and then processed the broken line anyway,
        // which would later crash on split[2] in a worker thread. Skip malformed lines instead.
        if (split.length != 3) {
            System.err.println("ERR: normalized dictionary not in expected format. \ncritical line: " + line);
            return currentSynonym;
        }
        String synonym = split[0];
        if (currentSynonym == null)
            currentSynonym = synonym;
        // Have we reached the next synonym? Then we must first create the index items for the current
        // synonym before we continue
        if (!synonym.equals(currentSynonym)) {
            final String synonymToWrite = currentSynonym;
            // Copy the accumulated entries so the worker thread is isolated from the
            // subsequent clear() and re-use of the accumulator list.
            final List<String[]> entriesToWrite = new ArrayList<>(entriesForCurrentSynonym);
            executorService.submit(() -> {
                try {
                    indexCurrentSynonymEntries(cf, idSource, entityType, iw, outdictBw, ambiguousSynonymsBw, counter, synonymToWrite, entriesToWrite);
                } catch (IOException e) {
                    log.error("Could not create index document for synonym {}", synonymToWrite, e);
                }
            });
            entriesForCurrentSynonym.clear();
        }
        entriesForCurrentSynonym.add(split);
        currentSynonym = synonym;
        return currentSynonym;
    }

    /**
     * Takes the arrays with the gene IDs that have the passed synonym with the priorities that are also stored in the arrays given by entriesForCurrentSynonym.
     * Creates one Lucene document for the synonym and sets all the IDs with their priorities into one field (separated by {@link LuceneCandidateRetrieval#NAME_PRIO_DELIMITER}) and the respective taxonomy IDs in another field.
     * The priorities are assigned in the _makeGeneDictionary.sh script and should reflect the reliability of the source that
     * gave the corresponding synonym to the gene.
     * <p>
     * This method is called concurrently from the worker threads; {@link IndexWriter} is thread-safe,
     * the two writers are guarded by synchronized blocks below.
     *
     * @param cf                       The candidate filter for filtering out synonyms that look as they wouldn't help at all.
     * @param idSource                 The value for the SOURCE index field.
     * @param iw                       The Lucene index writer.
     * @param outdictBw                Writer for the filtered dictionary side output; may be null.
     * @param ambiguousSynonymsBw      Writer for intra-taxonomy ambiguous gene IDs; may be null.
     * @param counter                  A parameter for counting the number of synonyms processed, for status output.
     * @param currentSynonym           The synonym for which all entries have been collected in entriesForCurrentSynonym.
     * @param entriesForCurrentSynonym All the IDs of genes that have the currentSynonym and the priority with which they have the synonym.
     * @throws IOException If writing to the index or the side-output files fails.
     */
    private void indexCurrentSynonymEntries(CandidateFilter cf, String idSource, String entityType, IndexWriter iw, BufferedWriter outdictBw, BufferedWriter ambiguousSynonymsBw, AtomicInteger counter, String currentSynonym, List<String[]> entriesForCurrentSynonym) throws IOException {
        Document doc = new Document();
        Field lookupSynField = new TextField(SynonymIndexFieldNames.LOOKUP_SYN_FIELD, currentSynonym,
                Store.YES);
        doc.add(lookupSynField);
        // Additional lookup variants: stemmed tokens and character 2-3-grams.
        SnowballFilter ts = new SnowballFilter(new WhitespaceAnalyzer().tokenStream(SynonymIndexFieldNames.LOOKUP_SYN_FIELD_STEMMED, currentSynonym), "English");
        doc.add(new TextField(SynonymIndexFieldNames.LOOKUP_SYN_FIELD_STEMMED, ts));
        TokenFilter ngrams = nGramFilterFactory.create(new WhitespaceAnalyzer().tokenStream(SynonymIndexFieldNames.LOOKUP_SYN_FIELD_NGRAMS, currentSynonym));
        doc.add(new TextField(SynonymIndexFieldNames.LOOKUP_SYN_FIELD_NGRAMS, ngrams));
        doc.add(new StringField(SynonymIndexFieldNames.LOOKUP_SYN_FIELD_EXACT, currentSynonym, Store.NO));
        List<Field> fields = new ArrayList<>();
        int minPriority = Integer.MAX_VALUE;
        // Counts how many high-priority genes per taxonomy share this synonym; a count > 1
        // means the synonym is ambiguous within that taxonomy.
        Multiset<String> taxIdsForSynonym = HashMultiset.create();
        Multimap<String, String> tax2Id = HashMultimap.create();
        for (String[] geneEntry : entriesForCurrentSynonym) {
            String id = geneEntry[1];
            boolean isFamilyDict = id.startsWith("GENO:");
            int priority = Integer.parseInt(geneEntry[2]);
            if (priority < minPriority)
                minPriority = priority;
            boolean filtered = false;
            // If the synonym is the official gene symbol (priority -1), we accept it, no matter what
            if (cf != null && !OMIT_FILTERED && priority != -1) {
                filtered = DictionaryFamilyDomainFilter.isFiltered(id, cf, currentSynonym);
            }
            if (log.isDebugEnabled()) {
                log.debug("ID: {}, synonym: {}, filtered out: {}", id, currentSynonym, filtered);
            }
            String tax = isFamilyDict ? "0" : "";
            // Look the taxonomy ID up once instead of twice.
            String mappedTax = id2tax.get(id);
            if (mappedTax != null) {
                tax = mappedTax;
                // Only reasonably reliable synonyms (priority <= 3) count towards ambiguity.
                if (priority <= 3) {
                    taxIdsForSynonym.add(tax);
                    tax2Id.put(tax, id);
                }
            }
            Field idField = new StringField(SynonymIndexFieldNames.ID_FIELD, id, Store.NO);
            Field idPriorityField = new StringField(SynonymIndexFieldNames.ID_FIELD, id + LuceneCandidateRetrieval.NAME_PRIO_DELIMITER + priority, Store.YES);
            Field taxField = new StringField(SynonymIndexFieldNames.TAX_ID_FIELD, tax, Store.YES);
            IntPoint priorityField = new IntPoint(SynonymIndexFieldNames.PRIORITY, priority);
            if (!OMIT_FILTERED) {
                IntPoint filteredField = new IntPoint(SynonymIndexFieldNames.FILTERED, filtered ? 1 : 0);
                StoredField storedFilteredField = new StoredField(SynonymIndexFieldNames.FILTERED,
                        filtered ? 1 : 0);
                fields.add(filteredField);
                fields.add(storedFilteredField);
            }
            Field idSourceField = new StringField(SynonymIndexFieldNames.SOURCE, isFamilyDict ? "GenoFamilies" : idSource, Store.YES);
            Field typeField = new StringField(SynonymIndexFieldNames.ENTITY_TYPE, entityType, Store.YES);
            fields.add(idField);
            fields.add(idPriorityField);
            fields.add(taxField);
            fields.add(priorityField);
            fields.add(idSourceField);
            fields.add(typeField);
        }
        if (!fields.isEmpty()) {
            for (Field f : fields)
                doc.add(f);
            iw.addDocument(doc);
            if (outdictBw != null && minPriority <= 3) {
                synchronized (outdictBw) {
                    outdictBw.write(currentSynonym + "\tGene");
                    outdictBw.newLine();
                }
            }
            if (ambiguousSynonymsBw != null) {
                // Iterate the distinct tax IDs; iterating the multiset itself would visit a tax ID
                // once per occurrence and write the same ambiguous-ID group multiple times.
                for (String tax : taxIdsForSynonym.elementSet()) {
                    if (taxIdsForSynonym.count(tax) > 1) {
                        Collection<String> intraAmbiguousIds = tax2Id.get(tax);
                        synchronized (ambiguousSynonymsBw) {
                            for (String id : intraAmbiguousIds) {
                                ambiguousSynonymsBw.write(id);
                                ambiguousSynonymsBw.newLine();
                            }
                        }
                    }
                }
            }
        }
        int done = counter.incrementAndGet();
        if (done % 10000 == 0) {
            log.debug("# entries processed: " + done);
        }
    }

    /**
     * create the directory object where to put the lucene index...
     */
    private FSDirectory createIndexDirectory(File indexFile) {
        FSDirectory fdir = null;
        try {
            fdir = FSDirectory.open(indexFile.toPath());
        } catch (IOException e) {
            e.printStackTrace();
        }
        return fdir;
    }

    /**
     * Reads the UniProt-to-taxonomy mapping (columns: UniProt ID, Entrez Gene ID, tax ID) into
     * {@link #id2tax}. Currently unused, see {@link #main}. Uses try-with-resources so the reader
     * is also closed when an exception occurs (the original leaked it in that case).
     */
    private void readUpTaxMap(File taxMap) throws IOException {
        log.info("Reading up2eg2tax.map ...");
        id2tax = new HashMap<>();
        try (BufferedReader reader = new BufferedReader(new FileReader(taxMap))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] entry = line.split("\t");
                if (entry.length != 3) {
                    System.err.println("ERR: up2eg2tax.map not in expected format. \ncritical line: " + line);
                    System.exit(-1);
                }
                String id = entry[0].trim();
                String taxId = entry[2].trim();
                id2tax.put(id, taxId);
            }
        }
    }

    /**
     * Reads the NCBI gene_info file (gzipped; column 1: tax ID, column 2: gene ID) into
     * {@link #id2tax}. Each line is split only once, and a merge function keeps the first
     * mapping should a gene ID unexpectedly occur twice (the original collector would have
     * thrown an IllegalStateException in that case).
     */
    private void readEgTaxMap(File geneInfo) throws IOException {
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new GZIPInputStream(new FileInputStream(geneInfo))))) {
            id2tax = br.lines()
                    .map(l -> l.split("\\t", 3))
                    .collect(Collectors.toMap(parts -> parts[1], parts -> parts[0], (first, second) -> first));
        }
    }
}