de.julielab.genemapper.resources.SynonymIndexGenerator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of gene-mapper-resources Show documentation
This project assembles code and files required to build the dictionaries and indexes used by the JCoRe Gene Mapper.
The newest version!
/** 
 * IndexGenerator.java
 * 
 * Copyright (c) 2006, JULIE Lab. 
 * All rights reserved. This program and the accompanying materials 
 * are made available under the terms of the Common Public License v1.0 
 *
 * Author: tomanek
 * 
 * Current version: 1.5.1
 * Since version:   1.0
 *
 * Creation date: Nov 30, 2006 
 * 
 * This class generates the Lucene index from the modified biothesaurus as
 * provided by EBI in BootSTREP.
 * 
 * This version of the index generator expects a consolidated biothesaurus file
 * which only consists of these columns:
 * - col1: synonym (normalized)
 * - col2: uniref_50
 * 
 * IMPORTANT NOTES:
 * - no normalization is done here, so better do the normalization of the BT yourself
 * - for better performance: make entries in bt file unique!
 * 
 **/

package de.julielab.genemapper.resources;

import de.julielab.gene.candidateretrieval.SynonymIndexFieldNames;
import de.julielab.geneexpbase.CandidateFilter;
import de.julielab.geneexpbase.TermNormalizer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;


public class SynonymIndexGenerator {

	private static final Logger log = LoggerFactory.getLogger(SynonymIndexGenerator.class);

	/**
	 * The synonym index is filtered for unspecifieds and others. This field
	 * determines whether filtered items should be omitted completely from the index
	 * or if they should just be flagged to be filtered but included into the index.
	 * The latter will lead to a larger index, of course. Used for experiments, not
	 * required as of January 30, 2018.
	 */
	private static final Boolean OMIT_FILTERED = true;

	/*
	 * defines the maximum length of synonyms to be considered longer synonyms are
	 * omitted, i.e. not stored in the index
	 */
	private static final int MAX_SYNLENGTH = 8;
	private static final int MIN_SYNLENGTH = 2;

	/**
	 * A file containing gene or protein names / synonyms and their respective NCBI
	 * Gene or UniProt ID. No term normalization is expected for this dictionary.
	 */
	private final File dictFile;

	Map id2tax;

	Directory indexDirectory;

	private static final boolean debug = false;

	/**
	 * To execute the ContextIndexGenerator start it with the following command-line
	 * arguments:

	 * arg0: path to resources directory arg1: path to synonym indices directory
	 * 
	 * @param args
	 */
	public static void main(String[] args) {

		long s1 = System.currentTimeMillis();

		if (args.length != 3) {
			System.err.println(
					"Usage: SynonymIndexGenerator   ");
			System.exit(1);
		}

		String resPath = args[0];
		File resDir = new File(resPath);
		if (!resDir.isDirectory()) {
			System.err.println("Could not find resources directory");
			System.exit(1);
		}
		if (!resPath.endsWith(File.separator)) {
			resPath = resPath + File.separator;
		}

		File geneInfo = new File(resPath + args[1]);
		if (!geneInfo.exists()) {
			System.err.println("Gene info file could not be found at " + geneInfo.getAbsolutePath());
			System.exit(1);
		}

		String indexPath = args[2];
		if (!indexPath.endsWith("/")) {
			indexPath = indexPath + "/";
		}
		File geneIndexDir = new File(indexPath + "geneSynonymIndex");
		File proteinIndexDir = new File(indexPath + "proteinSynonymIndex");

		File upDictFile = new File(resPath + "gene.dict.up");
		checkFile(upDictFile);

		File egDictFile = new File(resPath + "gene.dict.eg");
		checkFile(egDictFile);

		File upTaxMap = new File(resPath + "up2eg2tax.map");
		checkFile(upTaxMap);

		File egTaxMap = geneInfo;

		SynonymIndexGenerator indexGenerator;
		try {
			// indexGenerator = new SynonymIndexGenerator(upDictFile, proteinIndexDir);
			// indexGenerator.readUpTaxMap(upTaxMap);
			// indexGenerator.createIndex();
			indexGenerator = new SynonymIndexGenerator(egDictFile, geneIndexDir);
			indexGenerator.readEgTaxMap(egTaxMap);
			indexGenerator.createIndex();
		} catch (IOException e) {
			e.printStackTrace();
		}

		long s2 = System.currentTimeMillis();
		System.out.println("Index created successfully! (" + (s2 - s1) / 1000 + " sec)");
	}

	private static void checkFile(File file) {
		if (!file.isFile())
			throw new IllegalArgumentException("File \"" + file.getAbsolutePath() + "\" could not be found.");
	}

	/**
	 * 
	 * @param dictFile
	 *            A file containing gene or protein names / synonyms and their
	 *            respective NCBI Gene or UniProt ID. No term normalization is
	 *            expected for this dictionary.
	 * @param indexFile
	 *            The directory where the name / synonym index will be written to.
	 * @throws FileNotFoundException
	 * @throws IOException
	 */
	public SynonymIndexGenerator(File dictFile, File indexFile) throws FileNotFoundException, IOException {
		System.out.println("Building synonym index from dictionary " + dictFile.getAbsolutePath());
		this.dictFile = dictFile;
		indexDirectory = createIndexDirectory(indexFile);

	}

	/**
	 * create the index, i.e. read from the biothesaurus file (which is expected to
	 * have normalized synonyms!) and then write it to the index.
	 * 
	 * @throws IOException
	 */
	public void createIndex() throws IOException {

		CandidateFilter cf = new CandidateFilter();

		WhitespaceAnalyzer wsAnalyzer = new WhitespaceAnalyzer();
		IndexWriterConfig iwc = new IndexWriterConfig(wsAnalyzer);
		iwc.setOpenMode(OpenMode.CREATE);
		IndexWriter iw = new IndexWriter(indexDirectory, iwc);

		TermNormalizer normalizer = new TermNormalizer();
		int counter = 0;
		BufferedReader normDictReader = new BufferedReader(new FileReader(dictFile));

		System.out.println(
				"Generating index now. This may take quite a while (up to several hours when input files are large) ...");
		// now loop through dictionary and add entries to the index
		try {

			String line = "";
			while ((line = normDictReader.readLine()) != null) {

				String[] values = line.split("\t");

				// check whether format is OK
				if (values.length != 3) {
					System.err.println("ERR: normalized dictionary not in expected format. \ncritical line: " + line);
					// System.exit(-1);
					continue;
				}

				// now get the field values
				String name = values[0];
				String normalizedName = normalizer.normalize(name);
				List normalizedNameVariant = normalizer.generateVariants(name).stream()
						.map(normalizer::normalize).collect(Collectors.toList());
				String id = values[1];
				Integer priority = Integer.parseInt(values[2]);

				boolean filtered = false;

				// ignore synonyms smaller than MIN_SYNLENGTH or longer than
				// MAX_SYNLENGTH
				int synTokenNum = normalizedName.split(" ").length;
				if (synTokenNum > MAX_SYNLENGTH
						|| (synTokenNum < MIN_SYNLENGTH && normalizedName.length() < MIN_SYNLENGTH)) {
					log.debug("Removed due to illegal length (too short or too long): {}", normalizedName);
					continue;
				}

				// ignore syns that look like domain or family names
				Pattern p = CandidateFilter.patternDomainFamilies;
				Matcher m = p.matcher(normalizedName);
				if (m.matches()) {
					log.debug("DOMAIN/FAMILY REMOVED: |{}|", normalizedName);
					filtered = true;
				}

				p = CandidateFilter.patternUnspecifieds;
				m = p.matcher(normalizedName);
				if (m.matches()) {
					log.debug("UNSPECIFIED REMOVED: |{}|", normalizedName);
					filtered = true;
				}

				if (filtered && OMIT_FILTERED)
					continue;

				showDebug(id + "\t" + normalizedName);

				String tax = "";
				if (id2tax.get(id) != null) {
					tax = id2tax.get(id);
				}

				// make fields
				List fields = new ArrayList<>();
				Field idField = new StringField(SynonymIndexFieldNames.ID_FIELD, id, Store.YES);
				Field originalNameField = new TextField(SynonymIndexFieldNames.ORIGINAL_NAME, name.toLowerCase(),
						Store.YES);
				Field lookupSynField = new TextField(SynonymIndexFieldNames.LOOKUP_SYN_FIELD, normalizedName,
						Store.YES);
				Field taxField = new StringField(SynonymIndexFieldNames.TAX_ID_FIELD, tax, Field.Store.YES);
				IntPoint priorityField = new IntPoint(SynonymIndexFieldNames.PRIORITY, priority);
				StoredField storedPriorityField = new StoredField(SynonymIndexFieldNames.PRIORITY, priority);
				if (!OMIT_FILTERED) {
					IntPoint filteredField = new IntPoint(SynonymIndexFieldNames.FILTERED, filtered ? 1 : 0);
					StoredField storedFilteredField = new StoredField(SynonymIndexFieldNames.FILTERED,
							filtered ? 1 : 0);
					fields.add(filteredField);
					fields.add(storedFilteredField);
				}
				fields.add(idField);
				fields.add(originalNameField);
				fields.add(lookupSynField);
				fields.add(taxField);
				fields.add(priorityField);
				fields.add(storedPriorityField);
				for (int i = 0; i < normalizedNameVariant.size(); ++i)
					fields.add(new TextField(SynonymIndexFieldNames.VARIANT_NAME, normalizedNameVariant.get(i),
							Store.YES));
				for (int i = 0; i < normalizedNameVariant.size(); ++i)
					fields.add(new TextField(SynonymIndexFieldNames.STEMMED_NORMALIZED_NAME,
							normalizedNameVariant.get(i), Store.YES));

				// make document and add to synonym index
				Document d = new Document();
				for (Field f : fields)
					d.add(f);
				iw.addDocument(d);

				++counter;
				if (counter % 10000 == 0) {
					System.err.println("# entries processed: " + counter);
				}
			}

			iw.close();

			normDictReader.close();

		} catch (IOException e) {
			e.printStackTrace();
		}

	}

	/**
	 * create the directory object where to put the lucene index...
	 */
	private FSDirectory createIndexDirectory(File indexFile) {
		FSDirectory fdir = null;
		try {
			fdir = FSDirectory.open(indexFile.toPath());
		} catch (IOException e) {
			e.printStackTrace();
		}
		return fdir;
	}

	private void showDebug(String s) {
		if (debug) {
			System.out.println(s);
		}
	}

	private void readUpTaxMap(File taxMap) throws IOException {
		System.out.println("Reading up2eg2tax.map ...");
		id2tax = new HashMap();

		BufferedReader reader = new BufferedReader(new FileReader(taxMap));
		String line = "";

		while ((line = reader.readLine()) != null) {
			String[] entry = line.split("\t");

			if (entry.length != 3) {
				System.err.println("ERR: up2eg2tax.map not in expected format. \ncritical line: " + line);
				System.exit(-1);
			}

			String id = entry[0].trim();
			String taxId = entry[2].trim();
			id2tax.put(id, taxId);
		}

		reader.close();
	}

	private void readEgTaxMap(File geneInfo) throws IOException {
		try (BufferedReader br = new BufferedReader(
				new InputStreamReader(new GZIPInputStream(new FileInputStream(geneInfo))))) {
			id2tax = br.lines().collect(
					Collectors.toMap(l -> l.split("\\t", 3)[1], l -> l.split("\\t", 3)[0]));
		}
	}

}