
de.julielab.genemapper.resources.GeneRecordIndexGenerator Maven / Gradle / Ivy


This project assembles code and files required to build the dictionaries and indexes used by the JCoRe Gene Mapper.

/**
 * IndexGenerator.java
 *
 * Copyright (c) 2006, JULIE Lab.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Common Public License v1.0
 *
 * Author: tomanek
 *
 * Current version: 1.5.1
 * Since version: 1.0
 *
 * Creation date: Nov 30, 2006
 *
 * This class generates the Lucene index from the modified biothesaurus as
 * provided by EBI in BootSTREP.
 *
 * This version of the index generator expects a consolidated biothesaurus file
 * which only consists of these columns:
 * - col1: synonym (normalized)
 * - col2: uniref_50
 *
 * IMPORTANT NOTES:
 * - no normalization is done here, so better do the normalization of the BT yourself
 * - for better performance: make entries in the BT file unique!
 **/
package de.julielab.genemapper.resources;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import de.julielab.gene.candidateretrieval.SynonymIndexFieldNames;
import de.julielab.geneexpbase.CandidateFilter;
import de.julielab.geneexpbase.TermNormalizer;
import de.julielab.geneexpbase.genemodel.GeneMention;
import de.julielab.java.utilities.FileUtilities;
import de.julielab.java.utilities.ProgressBar;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.lang.ref.SoftReference;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.zip.GZIPInputStream;

public class GeneRecordIndexGenerator {

    private static final Logger log = LoggerFactory.getLogger(GeneRecordIndexGenerator.class);

    /**
     * The synonym index is filtered for unspecifieds and others. This field
     * determines whether filtered items should be omitted completely from the index
     * or whether they should just be flagged as filtered but still be included in
     * the index. The latter leads to a larger index, of course. Used for
     * experiments; not required as of January 30, 2018.
     */
    private static final Boolean OMIT_FILTERED = true;

    private final Directory indexDirectory;

    /**
     * A file containing gene or protein names / synonyms and their respective NCBI
     * Gene or UniProt ID. No term normalization is expected for this dictionary.
     */
    private final File dictFile;

    private final Map<String, File> extendedInformationFields;

    Map<String, String> id2tax;

    Set<SoftReference<Document>> documents = new HashSet<>();

    /**
     * @param dictFile                  A file containing gene or protein names / synonyms and their
     *                                  respective NCBI Gene or UniProt ID. No term normalization is
     *                                  expected for this dictionary.
     * @param extendedInformationFields A map from index field name to the file providing that
     *                                  extended information.
     * @param indexFile                 The directory where the name / synonym index will be written to.
     * @throws FileNotFoundException
     * @throws IOException
     */
    public GeneRecordIndexGenerator(File dictFile, Map<String, File> extendedInformationFields, File indexFile)
            throws FileNotFoundException, IOException {
        this.extendedInformationFields = extendedInformationFields;
        log.info("Building gene records index from dictionary {}", dictFile);
        this.dictFile = dictFile;
        indexDirectory = FSDirectory.open(indexFile.toPath());
    }
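    // A hypothetical sketch of the dictionary input this class consumes: createIndex()
    // below expects three tab-separated columns per line (normalized name, gene/family ID,
    // synonym priority). The entries here are illustrative only, e.g.:
    //
    //   interleukin 2    3558    1
    //   il 2             3558    2
    //
    // Lines must be sorted by the ID column so that all synonyms of a gene arrive as one
    // consecutive block (see the grouping comment in createIndex()).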
    /**
     * To execute the GeneRecordIndexGenerator, start it with the following
     * command-line arguments:
     * arg0: name of the dictionary file within the resources directory
     * arg1: path to the resources directory
     * arg2: path to the synonym indices directory
     *
     * @param args
     */
    public static void main(String[] args) {
        long s1 = System.currentTimeMillis();
        if (args.length != 3) {
            System.err.println(
                    "Usage: GeneRecordIndexGenerator <dictFileName> <resourcesDirectory> <indexDirectory>");
            System.exit(1);
        }
        String dictFile = args[0];
        String resPath = args[1];
        File resDir = new File(resPath);
        if (!resDir.isDirectory()) {
            System.err.println("Could not find resources directory");
            System.exit(1);
        }
        if (!resPath.endsWith(File.separator)) {
            resPath = resPath + File.separator;
        }
        String indexPath = args[2];
        File geneIndexDir = dictFile.equals("gene.dict.uniqueprioritynames.sortedbyid.eg")
                ? new File(indexPath, "geneNamesRecordsIndexOriginalNames")
                : new File(indexPath, "geneRecordsIndex");
        File proteinIndexDir = dictFile.equals("gene.dict.uniqueprioritynames.sortedbyid.eg")
                ? new File(indexPath, "ProteinRecordsIndexOriginalNames")
                : new File(indexPath, "proteinRecordsIndex");
        if (geneIndexDir.exists())
            FileUtils.deleteQuietly(geneIndexDir);
        if (proteinIndexDir.exists())
            FileUtils.deleteQuietly(proteinIndexDir);
        File upDictFile = new File(resPath + "gene.dict.up");
        checkFile(upDictFile);
        File egDictFile = new File(resPath + dictFile);
        checkFile(egDictFile);
        File eg2chromosome = new File(resPath + "eg2chromosome");
        File eg2description = new File(resPath + "eg2description");
        File eg2generif = new File(resPath + "eg2generif");
        File eg2go = new File(resPath + "eg2go");
        File goDesc = new File(resPath + "go_all");
        File eg2interaction = new File(resPath + "eg2interaction");
        File eg2maplocation = new File(resPath + "eg2maplocation");
        File eg2summary = new File(resPath + "eg2summary");
        File eg2ecnumber = new File(resPath + "eg2ecnumber-genexmldownloader.gz");
        Map<String, File> extendedInformationFields = new LinkedHashMap<>();
        extendedInformationFields.put(SynonymIndexFieldNames.CHROMOSOME, eg2chromosome);
        extendedInformationFields.put(SynonymIndexFieldNames.DESCRIPTION, eg2description);
        extendedInformationFields.put(SynonymIndexFieldNames.GENERIF, eg2generif);
        extendedInformationFields.put("go", eg2go);
        extendedInformationFields.put(SynonymIndexFieldNames.GODESC, goDesc);
        extendedInformationFields.put(SynonymIndexFieldNames.INTERACTION, eg2interaction);
        extendedInformationFields.put(SynonymIndexFieldNames.MAPLOCATION, eg2maplocation);
        extendedInformationFields.put(SynonymIndexFieldNames.SUMMARY, eg2summary);
        extendedInformationFields.put(SynonymIndexFieldNames.ECNUMBER, eg2ecnumber);
        File upTaxMap = new File(resPath + "up2eg2tax.map");
        checkFile(upTaxMap);
        File geneInfo = new File(resPath + "gene_info_organism_filtered.gz");
        File egTaxMap = geneInfo;
        GeneRecordIndexGenerator indexGenerator;
        try {
            // indexGenerator = new SynonymIndexGenerator(upDictFile, proteinIndexDir);
            // indexGenerator.readUpTaxMap(upTaxMap);
            // indexGenerator.createIndex();
            indexGenerator = new GeneRecordIndexGenerator(egDictFile, extendedInformationFields, geneIndexDir);
            indexGenerator.readEgTaxMap(egTaxMap);
            indexGenerator.createIndex();
        } catch (IOException e) {
            e.printStackTrace();
        }
        long s2 = System.currentTimeMillis();
        System.out.println("Index created successfully! (" + (s2 - s1) / 1000 + " sec)");
    }
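    // A hypothetical invocation, matching the argument order documented above
    // (the dictionary file name and directory paths are illustrative only):
    //
    //   java de.julielab.genemapper.resources.GeneRecordIndexGenerator \
    //       gene.dict.eg /data/genemapper/resources /data/genemapper/indexes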
    private static void checkFile(File file) {
        if (!file.isFile())
            throw new IllegalArgumentException("File \"" + file.getAbsolutePath() + "\" could not be found.");
    }

    /**
     * Create the index, i.e. read from the biothesaurus file (which is expected to
     * have normalized synonyms!) and then write it to the index.
     *
     * @throws IOException
     */
    public void createIndex() throws IOException {
        CandidateFilter cf = new CandidateFilter();
        TermNormalizer normalizer = new TermNormalizer();
        Map<String, Multimap<String, String>> id2infotype2info = readExtendedInformationFiles();
        FieldType notStoredTextFieldType = new FieldType(TextField.TYPE_NOT_STORED);
        notStoredTextFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
        FieldType storedTextFieldType = new FieldType(TextField.TYPE_STORED);
        storedTextFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
        FieldType stringFieldTypeDocsAndFreqs = new FieldType(StringField.TYPE_STORED);
        stringFieldTypeDocsAndFreqs.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
        WhitespaceAnalyzer wsAnalyzer = new WhitespaceAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(wsAnalyzer);
        iwc.setOpenMode(OpenMode.CREATE);
        // VERY IMPORTANT: The dictionary file must be sorted by ID. This is because we want to group the
        // dictionary entries by ID, but we don't want to read the whole dictionary and sort it in-memory
        // because this may well exhaust the memory for the full all-species dictionary.
        final ExecutorService executorService = Executors.newFixedThreadPool(20);
        AtomicInteger numDocumentsIndexed = new AtomicInteger();
        try (IndexWriter iw = new IndexWriter(indexDirectory, iwc)) {
            log.info("Counting number of lines of the dictionary file {}", dictFile);
            long numLines;
            try (BufferedReader normDictReader = new BufferedReader(new FileReader(dictFile))) {
                numLines = normDictReader.lines().count();
            }
            log.info("Generating index now for {} synonyms.", numLines);
            // now loop through the dictionary and add entries to the index
            ProgressBar progressBar = new ProgressBar(numLines, 80);
            try (BufferedReader dictReader = new BufferedReader(new FileReader(dictFile))) {
                Map<String, Integer> syn2prio = new HashMap<>();
                String lastId = null;
                boolean filtered = false;
                String line;
                int lineNum = 1;
                while ((line = dictReader.readLine()) != null) {
                    String[] values = line.split("\t");
                    // check whether the format is OK
                    if (values.length != 3) {
                        System.err.println("ERR: normalized dictionary not in expected format.\ncritical line: " + line);
                        // System.exit(-1);
                        continue;
                    }
                    // now get the field values
                    String normalizedName = values[0];
                    String id = values[1];
                    Integer priority = Integer.parseInt(values[2]);
                    boolean isFamilyEntry = id.contains("GENO:");
                    if (cf != null && !OMIT_FILTERED && priority != -1 && !isFamilyEntry) {
                        filtered = DictionaryFamilyDomainFilter.isFiltered(id, cf, normalizedName);
                    }
                    if (filtered && OMIT_FILTERED)
                        continue;
                    // the ID changed: hand the completed record off to an indexing job
                    if (lastId != null && !lastId.equals(id)) {
                        boolean finalFiltered = filtered;
                        log.trace("Indexing gene record with ID {} and synonym/priority pairs {}", lastId, syn2prio);
                        String finalLastId = lastId;
                        HashMap<String, Integer> syn2prio4id = new HashMap<>(syn2prio);
                        executorService.submit(() -> {
                            try {
                                indexGeneRecord(finalLastId, syn2prio4id, id2infotype2info, finalFiltered, normalizer,
                                        iw, notStoredTextFieldType, storedTextFieldType, stringFieldTypeDocsAndFreqs,
                                        numDocumentsIndexed);
                            } catch (IOException e) {
                                log.error("Could not create index document for gene id {}", finalLastId, e);
                            }
                        });
                        syn2prio.clear();
                    }
                    syn2prio.put(normalizedName, priority);
                    lastId = id;
                    //if (lineNum % 100000 == 0)
                    //    log.info("Now processing line {}", lineNum);
                    // No need to try and show the progress bar when a lot of logging messages appear all the time
                    if (lineNum % 1000 == 0 && !log.isDebugEnabled()) {
                        progressBar.incrementDone(lineNum - progressBar.getDone(), true);
                    }
                    // if (lineNum % 1000000 == 0) {
                    //     log.debug("Committing IndexWriter at line number {}.", lineNum);
                    //     if (log.isDebugEnabled())
                    //         synchronized (documents) {
                    //             log.debug("There are {} documents reachable", documents.stream().map(SoftReference::get).filter(Objects::nonNull).count());
                    //         }
                    //     iw.commit();
                    // }
                    ++lineNum;
                }
                // Index the last entry
                boolean finalFiltered = filtered;
                String finalLastId = lastId;
                executorService.submit(() -> {
                    try {
                        indexGeneRecord(finalLastId, new HashMap<>(syn2prio), id2infotype2info, finalFiltered,
                                normalizer, iw, notStoredTextFieldType, storedTextFieldType,
                                stringFieldTypeDocsAndFreqs, numDocumentsIndexed);
                    } catch (IOException e) {
                        log.error("Could not create index document for gene id {}", finalLastId, e);
                    } catch (Throwable t) {
                        log.error("Error", t);
                    }
                });
                log.info("Dictionary file {} has been consumed, all indexing jobs have been sent.", dictFile);
            } finally {
                try {
                    log.info("Shutting down executor.");
                    log.info("Waiting for running threads to terminate.");
                    executorService.shutdown();
                    // The 100 days basically mean "wait until we are finished"
                    executorService.awaitTermination(100, TimeUnit.DAYS);
                } catch (InterruptedException e) {
                    log.warn("Waiting for running threads to finish has been interrupted. Shutting down the executor service now.");
                    executorService.shutdownNow();
                }
                log.info("ExecutorService has been shut down.");
            }
            log.info("Committing {} documents to the index.", numDocumentsIndexed.get());
            iw.commit();
            iw.forceMerge(5);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    private Map<String, Multimap<String, String>> readExtendedInformationFiles() {
        // chromosome  - single field
        // description - single field
        // generif     - single field
        // go          - value is GO ID; single field
        // godesc      - key is GO ID; multifield, pipe as separator
        // interaction - single field
        // maplocation - single field
        // summary     - single field
        // The map must be ordered such that the "go" file comes before the "godesc" file to create this map in time
        Map<String, String> goid2eg = new HashMap<>();
        Map<String, Multimap<String, String>> infoMap = new HashMap<>();
        log.info("Loading extended information files");
        for (String informationType : extendedInformationFields.keySet()) {
            File f = extendedInformationFields.get(informationType);
            log.info("Reading {}", f);
            try (BufferedReader br = FileUtilities.getReaderFromFile(f)) {
                Stream<String[]> linesSplits = br.lines().map(s -> s.split("\t"));
                if (informationType.equals("godesc")) {
                    // map the GO ID in the first column to the gene ID collected from the "go" file
                    linesSplits = linesSplits.map(s -> {
                        s[0] = goid2eg.containsKey(s[0]) ? goid2eg.get(s[0].intern()).intern() : null;
                        return s;
                    });
                }
                if (informationType.equals("go")) {
                    linesSplits.forEach(s -> goid2eg.put(s[1].intern(), s[0].intern()));
                } else {
                    linesSplits.filter(Objects::nonNull)
                            .filter(s -> s[0] != null && s[1] != null)
                            .filter(s -> !s[0].isBlank() && !s[1].isBlank())
                            .forEach(s -> infoMap
                                    .compute(s[0].intern(), (k, v) -> v != null ? v : HashMultimap.create())
                                    .put(informationType.intern(), s[1]));
                }
            } catch (IOException e) {
                log.error("Could not read file {}. The respective extended information will not be added to the index",
                        f, e);
            }
        }
        return infoMap;
    }

    public void indexGeneRecord(String id, Map<String, Integer> syn2prio,
                                Map<String, Multimap<String, String>> id2infotype2info, boolean filtered,
                                TermNormalizer normalizer, IndexWriter iw, FieldType notStoredTextFieldType,
                                FieldType storedTextFieldType, FieldType stringFieldTypeDocsAndFreqs,
                                AtomicInteger numDocumentsIndexed) throws IOException {
        try {
            String tax = "";
            if (id2tax.get(id) != null) {
                tax = id2tax.get(id);
            }
            // make fields
            List<Field> fields = new ArrayList<>();
            fields.add(new StringField(SynonymIndexFieldNames.ENTITY_TYPE,
                    id.contains("GENO:") ? GeneMention.SpecificType.FAMILYNAME.name()
                            : GeneMention.SpecificType.GENE.name(),
                    Store.YES));
            Field idField = new StringField(SynonymIndexFieldNames.ID_FIELD, id, Store.YES);
            for (String normalizedName : syn2prio.keySet()) {
                Integer priority = syn2prio.get(normalizedName);
                String fieldname;
                if (priority == -1)
                    fieldname = SynonymIndexFieldNames.SYMBOL;
                else if (priority == 0)
                    fieldname = SynonymIndexFieldNames.SYMBOL_FROM_NOMCENCLATURE;
                else if (priority == 1)
                    fieldname = SynonymIndexFieldNames.FULL_NAMES;
                else if (priority == 2)
                    fieldname = SynonymIndexFieldNames.SYNONYMS;
                else if (priority == 3)
                    fieldname = SynonymIndexFieldNames.OTHER_DESIGNATIONS;
                else if (priority == 4)
                    // from the XML extracted from the NCBI Gene ASN.1 file
                    fieldname = SynonymIndexFieldNames.PROTEIN_NAMES;
                else if (priority == 5)
                    fieldname = SynonymIndexFieldNames.UNIPROT_NAMES;
                else if (priority == 6)
                    fieldname = SynonymIndexFieldNames.XREFS;
                else if (priority == 7)
                    fieldname = SynonymIndexFieldNames.BIO_THESAURUS;
                else
                    throw new IllegalArgumentException("Unsupported synonym priority: " + priority);
                log.trace("Now adding field {} for synonym {} for ID {}", fieldname, normalizedName, id);
                fields.add(new Field(fieldname, normalizedName, storedTextFieldType));
                // fields.add(new Field(SynonymIndexFieldNames.LOOKUP_SYN_FIELD, normalizedName, notStoredTextFieldType));
                // For the experiments on disambiguation: This allows us to try and match the exact
                // name.
                fields.add(new Field(fieldname + "_exact", normalizedName, stringFieldTypeDocsAndFreqs));
                // fields.add(new Field(SynonymIndexFieldNames.LOOKUP_SYN_FIELD_EXACT, normalizedName, notStoredTextFieldType));
            }
            Field taxField = new StringField(SynonymIndexFieldNames.TAX_ID_FIELD, tax, Store.YES);
            if (!OMIT_FILTERED) {
                IntPoint filteredField = new IntPoint(SynonymIndexFieldNames.FILTERED, filtered ? 1 : 0);
                StoredField storedFilteredField = new StoredField(SynonymIndexFieldNames.FILTERED, filtered ? 1 : 0);
                fields.add(filteredField);
                fields.add(storedFilteredField);
            }
            // chromosome  - single field
            // description - single field
            // generif     - single field
            // godesc      - multifield, pipe as separator
            // interaction - single field
            // maplocation - single field
            // summary     - single field
            Multimap<String, String> infotype2info = id2infotype2info.get(id);
            if (infotype2info != null) {
                for (String infotype : infotype2info.keySet()) {
                    for (String value : infotype2info.get(infotype)) {
                        if (!infotype.equals(SynonymIndexFieldNames.GODESC)) {
                            fields.add(new Field(infotype, normalizer.normalize(value), storedTextFieldType));
                        } else {
                            String[] values = value.split("\\|");
                            for (String v : values)
                                fields.add(new Field(infotype, normalizer.normalize(v), storedTextFieldType));
                        }
                    }
                }
            }
            // The IDs only occur once, so we can remove the data associated with them now.
            id2infotype2info.remove(id);
            fields.add(idField);
            fields.add(taxField);
            // make the document and add it to the index
            Document d = new Document();
            for (Field f : fields)
                d.add(f);
            if (!fields.isEmpty()) {
                iw.addDocument(d);
                numDocumentsIndexed.incrementAndGet();
                synchronized (documents) {
                    documents.add(new SoftReference<>(d));
                }
            }
        } catch (Throwable t) {
            log.error("Error occurred", t);
            throw t;
        }
    }

    private void readUpTaxMap(File taxMap) throws IOException {
        log.info("Reading up2eg2tax.map ...");
        id2tax = new HashMap<>();
        BufferedReader reader = new BufferedReader(new FileReader(taxMap));
        String line = "";
        while ((line = reader.readLine()) != null) {
            String[] entry = line.split("\t");
            if (entry.length != 3) {
                System.err.println("ERR: up2eg2tax.map not in expected format.\ncritical line: " + line);
                System.exit(-1);
            }
            String id = entry[0].trim().intern();
            String taxId = entry[2].trim().intern();
            id2tax.put(id, taxId);
        }
        reader.close();
    }

    private void readEgTaxMap(File geneInfo) throws IOException {
        log.info("Reading the gene ID to taxonomy ID map from {}", geneInfo);
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new GZIPInputStream(new FileInputStream(geneInfo))))) {
            id2tax = br.lines().collect(
                    Collectors.toMap(l -> l.split("\\t", 3)[1].intern(), l -> l.split("\\t", 3)[0].intern()));
        }
    }
}
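For orientation, here is a minimal sketch of how the resulting Lucene index could be queried. It assumes the index was written to a directory named geneRecordsIndex (the default gene index directory in main() above) and relies on the field names created by indexGeneRecord(); the class name and the example synonym are hypothetical.

import de.julielab.gene.candidateretrieval.SynonymIndexFieldNames;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

import java.nio.file.Paths;

public class GeneRecordIndexLookupSketch {
    public static void main(String[] args) throws Exception {
        // Open the index written by GeneRecordIndexGenerator.
        try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("geneRecordsIndex")))) {
            IndexSearcher searcher = new IndexSearcher(reader);
            // indexGeneRecord() stores each synonym both in an analyzed field and in an
            // untokenized "<fieldname>_exact" variant; this queries the exact variant of
            // the full-name field. "interleukin 2" is an illustrative, normalized synonym.
            TermQuery query = new TermQuery(
                    new Term(SynonymIndexFieldNames.FULL_NAMES + "_exact", "interleukin 2"));
            TopDocs hits = searcher.search(query, 10);
            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                Document doc = searcher.doc(scoreDoc.doc);
                // The gene ID and taxonomy ID are stored fields (idField and taxField above).
                System.out.println(doc.get(SynonymIndexFieldNames.ID_FIELD)
                        + "\t" + doc.get(SynonymIndexFieldNames.TAX_ID_FIELD));
            }
        }
    }
}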




