de.julielab.genemapper.resources.ContextIndexGenerator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of gene-mapper-resources Show documentation
This project assembles code and files required to build the dictionaries and indexes used by the JCoRe Gene Mapper.
The newest version!
/** 
 * ContextIndexGenerator.java
 * 
 * Copyright (c) 2006, JULIE Lab. 
 * All rights reserved. This program and the accompanying materials 
 * are made available under the terms of the Common Public License v1.0 
 *
 * Author: wermter
 * 
 * Current version: 1.5.1
 * Since version:   1.0
 *
 * Creation date: Nov 30, 2006 
 * 
 * This class puts the semantic context for a given gene dictionary into
 * a Lucene index.
 * 
 **/

package de.julielab.genemapper.resources;

import de.julielab.genemapper.index.ContextGenerator;
import de.julielab.genemapper.index.ContextIndexFieldNames;
import de.julielab.genemapper.utils.ContextUtils;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;


/**
 * Writes the semantic context of every identifier listed in an ID file into a Lucene index.
 * <p>
 * The ID file is read line by line; each line must hold exactly one identifier (no tab-separated
 * extra columns). For every identifier the context text is obtained from a
 * {@link ContextGenerator}, post-processed by {@link ContextUtils} and stored together with the
 * identifier as one Lucene document (fields {@link ContextIndexFieldNames#LOOKUP_CONTEXT_FIELD}
 * and {@link ContextIndexFieldNames#LOOKUP_ID_FIELD}).
 * <p>
 * Input and index locations are no longer hard-coded in this class; they are passed to the
 * constructor, or on the command line when started via {@link #main(String[])}.
 * <p>
 * Note: the {@link Directory} opened by the constructor is not closed by this class.
 */
public class ContextIndexGenerator {

	private final File biothesaurusFile;
	private final String resourcesDir;

	Directory indexDirectory;

	private static final boolean debug = false;
	private static final boolean useContextTypes = false;

	/**
	 * Command-line entry point. Builds the protein context index from {@code up.ids} and the gene
	 * context index from {@code eg.ids}; both files are expected directly inside the resources
	 * directory. The indices are written to the sub-directories {@code proteinContextIndex} and
	 * {@code geneContextIndex} of the context indices directory.
	 * <p>
	 * The JVM exits with status -1 when the arguments are wrong, a required input is missing or
	 * index creation fails.
	 *
	 * @param args
	 *            arg0: path to resources directory, arg1: path to context indices directory
	 */
	public static void main(String[] args) {

		long startMillis = System.currentTimeMillis();

		if (args.length != 2) {
			System.err.println("Usage: ContextIndexGenerator <resourcesDirectory> <contextIndicesDirectory>");
			System.exit(-1);
		}

		String resPath = args[0];
		if (!new File(resPath).isDirectory()) {
			System.err.println("Could not find resources directory: " + resPath);
			System.exit(-1);
		}
		// The path is handed to ContextGenerator as a string, so keep the trailing separator
		// the original code guaranteed.
		if (!resPath.endsWith("/")) {
			resPath = resPath + "/";
		}

		File upFile = new File(resPath + "up.ids");
		if (!upFile.isFile()) {
			System.err.println("Could not find file up.ids in " + resPath);
			System.exit(-1);
		}
		File egFile = new File(resPath + "eg.ids");
		if (!egFile.isFile()) {
			System.err.println("Could not find file eg.ids in " + resPath);
			System.exit(-1);
		}

		File proteinIndexDir = new File(args[1], "proteinContextIndex");
		File geneIndexDir = new File(args[1], "geneContextIndex");

		try {
			new ContextIndexGenerator(upFile, proteinIndexDir, resPath).createIndex("protein");
			new ContextIndexGenerator(egFile, geneIndexDir, resPath).createIndex("gene");
		} catch (IOException e) {
			// Do not fall through to the success message: report the failure and signal it
			// to the calling process.
			System.err.println("Index creation failed: " + e.getMessage());
			e.printStackTrace();
			System.exit(-1);
		}

		long endMillis = System.currentTimeMillis();
		System.out.println("Indices created successfully! (" + (endMillis - startMillis) / 1000 + " sec)");
	}

	/**
	 * Creates a generator that reads identifiers from {@code biothesaurusFile} and writes the
	 * index to {@code indexFile} on disk.
	 *
	 * @param biothesaurusFile
	 *            file with one identifier per line
	 * @param indexFile
	 *            directory in which the Lucene index is created
	 * @param resourcesDir
	 *            resources directory path, passed on unchanged to {@link ContextGenerator}
	 * @throws IOException
	 *             if the index directory cannot be opened
	 */
	public ContextIndexGenerator(File biothesaurusFile,
			File indexFile, String resourcesDir) throws IOException {
		this.biothesaurusFile = biothesaurusFile;
		this.resourcesDir = resourcesDir;
		indexDirectory = createIndexDirectory(indexFile);
	}

	/**
	 * Creates the index: reads every identifier from the ID file, looks up its context and adds
	 * one document per identifier. An existing index in the target directory is overwritten
	 * ({@link OpenMode#CREATE}).
	 * <p>
	 * The index writer and the file reader are closed in all cases, including failures.
	 *
	 * @param idType
	 *            identifier type handed to {@link ContextGenerator} ({@code main} uses
	 *            {@code "protein"} and {@code "gene"})
	 * @throws IOException
	 *             if reading the ID file or writing the index fails, or if a line of the ID file
	 *             does not consist of exactly one value
	 */
	public void createIndex(String idType) throws IOException {

		ContextGenerator cg = new ContextGenerator(resourcesDir, idType);
		IndexWriterConfig iwc = new IndexWriterConfig(new WhitespaceAnalyzer());
		iwc.setOpenMode(OpenMode.CREATE);

		// Same acquisition order as before (writer first, then reader); try-with-resources
		// releases both even if a later step throws.
		try (IndexWriter contextIndexWriter = new IndexWriter(indexDirectory, iwc);
				BufferedReader biothesaurusReader = new BufferedReader(new FileReader(
						biothesaurusFile))) {

			System.out
					.println("Generating index now. This may take quite a while (up to several hours when file is large) ...");

			int counter = 0;
			String line;
			// loop through the ID file and add one document per entry to the index
			while ((line = biothesaurusReader.readLine()) != null) {
				contextIndexWriter.addDocument(buildDocument(cg, line));

				++counter;
				if (counter % 10000 == 0) {
					System.err.println("# entries processed: " + counter);
				}
			}
		}
	}

	/**
	 * Builds the Lucene document for one line of the ID file.
	 *
	 * @param cg
	 *            source of the context text for an identifier
	 * @param line
	 *            raw line from the ID file
	 * @return document holding the stored context and the stored identifier
	 * @throws IOException
	 *             if the line does not consist of exactly one tab-free value
	 */
	private Document buildDocument(ContextGenerator cg, String line) throws IOException {
		String[] values = line.split("\t");

		// check whether format is OK
		if (values.length != 1) {
			throw new IOException("Input file not in expected format. \ncritical line: " + line);
		}

		String idOrg = values[0];
		showDebug(idOrg);

		String context = cg.getContext(idOrg);
		if (useContextTypes) {
			context = ContextUtils.makeContextTypes(context);
		} else {
			context = ContextUtils.makeContextTokenString(context);
		}

		Document d = new Document();
		// context is tokenized by the writer's analyzer, the id is indexed as a single term;
		// both values are stored so they can be read back from search hits
		d.add(new TextField(ContextIndexFieldNames.LOOKUP_CONTEXT_FIELD, context, Store.YES));
		d.add(new StringField(ContextIndexFieldNames.LOOKUP_ID_FIELD, idOrg, Field.Store.YES));
		return d;
	}

	/**
	 * Opens the file-system directory object in which the Lucene index is put.
	 *
	 * @param indexFile
	 *            location of the index on disk
	 * @return the opened directory, never {@code null}
	 * @throws IOException
	 *             if the directory cannot be opened
	 */
	private FSDirectory createIndexDirectory(File indexFile) throws IOException {
		return FSDirectory.open(indexFile.toPath());
	}

	/**
	 * Prints {@code s} to standard output when the compile-time {@code debug} flag is set.
	 */
	private void showDebug(String s) {
		if (debug) {
			System.out.println(s);
		}
	}

}