package org.biopax.paxtools.search;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.CachingWrapperFilter;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryWrapperFilter;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.SearcherFactory;
import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.Version;
import org.biopax.paxtools.controller.DataPropertyEditor;
import org.biopax.paxtools.controller.Fetcher;
import org.biopax.paxtools.controller.ModelUtils;
import org.biopax.paxtools.controller.SimpleEditorMap;
import org.biopax.paxtools.model.BioPAXElement;
import org.biopax.paxtools.model.Model;
import org.biopax.paxtools.model.level3.*;
import org.biopax.paxtools.model.level3.Process;
import org.biopax.paxtools.util.ClassFilterSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
 * A full-text searcher/indexer for BioPAX L3 models.
 *
 * TODO: upgrade to the latest Lucene API and copy some changes from cPath2 project.
 *
 * @author rodche
 */
public class SearchEngine implements Indexer, Searcher {
	private static final Logger LOG = LoggerFactory.getLogger(SearchEngine.class);
	
	// search fields
	public static final String FIELD_URI = "uri";
	public static final String FIELD_KEYWORD = "keyword"; //anything, e.g., names, terms, comments, including values from child elements
	public static final String FIELD_NAME = "name";
	public static final String FIELD_XREFID = "xrefid"; //xref.id
	public static final String FIELD_PATHWAY = "pathway"; //parent/owner pathways; to be inferred from the whole biopax model
	public static final String FIELD_N_PARTICIPANTS = "participants"; // num. of PEs or Genes in a process or Complex
	public static final String FIELD_N_PROCESSES = "processes"; // the same as the 'size' field used before cPath2 v7
	public static final String FIELD_N = "n"; // also num. of processes, but indexed to use in range queries, e.g., n:>3
	// Full-text search/filter fields (case sensitive) -
	//index organism names, cell/tissue type (term), taxonomy id, but only store BioSource URIs	
	public static final String FIELD_ORGANISM = "organism";
	//index data source names, but only URIs are stored in the index
	public static final String FIELD_DATASOURCE = "datasource";
	public static final String FIELD_TYPE = "type";
	
	//Default fields to use with the MultiFieldQueryParser;
	//one can still search in other fields directly, e.g.,
	//pathway:some_keywords datasource:"pid", etc.
	public final static String[] DEFAULT_FIELDS = 
	{
			FIELD_KEYWORD, //includes all data type properties (names, terms, comments), 
			// also from child elements up to given depth (3), also includes pathway names (inferred)
			FIELD_NAME, // standardName, displayName, other names
			FIELD_XREFID, //xref.id (also direct child's xref.id, i.e., one can find both the xref and its owners using an xrefid: query string)
	};
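	// Example queries (illustrative): a bare term like "glucose" searches all the
	// default fields above; "xrefid:P04637" or "pathway:apoptosis" target a single
	// index field explicitly (Lucene classic query syntax).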
	
	/**
	 * A key for a value in a
	 * BioPAX element's annotations map
	 * where additional information about
	 * the corresponding search hit is stored.
	 */
	public enum HitAnnotation 
	{
		HIT_EXCERPT,
		HIT_PROCESSES,
		HIT_PARTICIPANTS,
		HIT_ORGANISM,
		HIT_DATASOURCE,
		HIT_PATHWAY,
	}
		
	private final Model model;
	private int maxHitsPerPage;
	private final Analyzer analyzer;
	private final File indexFile;
	private SearcherManager searcherManager;

	public final static int DEFAULT_MAX_HITS_PER_PAGE = 100;
	
	/**
	 * Main Constructor.
	 *
	 * @param model BioPAX object model to be indexed or searched.
	 * @param indexLocation full path to the index directory
	 */
	public SearchEngine(Model model, String indexLocation) {
		this.model = model;
		this.indexFile = new File(indexLocation);
		initSearcherManager();
		this.maxHitsPerPage = DEFAULT_MAX_HITS_PER_PAGE;
		this.analyzer = new StandardAnalyzer();
	}
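
	// A usage sketch (illustrative; loading a model via SimpleIOHandler is an assumption here):
	//   Model model = new SimpleIOHandler().convertFromOWL(new FileInputStream("model.owl"));
	//   SearchEngine engine = new SearchEngine(model, "target/biopax-index");
	//   engine.index(); //build (or re-build) the full-text index
	//   SearchResult hits = engine.search("brca2", 0, ProteinReference.class, null, null);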

	private void initSearcherManager() {
		try {
			if(indexFile.exists()) 
				this.searcherManager = 
					new SearcherManager(MMapDirectory.open(indexFile), new SearcherFactory());
			else 
				LOG.info(indexFile.getPath() + " does not exist.");
		} catch (IOException e) {
			LOG.warn("Could not create a searcher: " + e);
		}
	}

	/**
	 * Sets the maximum number of hits per search results page (pagination).
	 *
	 * @param maxHitsPerPage a positive int value; otherwise, unlimited
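	 * For example, with maxHitsPerPage=100, page 2 of the search results
	 * corresponds to hits 200 through 299.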
	 */
	public void setMaxHitsPerPage(int maxHitsPerPage) {
		this.maxHitsPerPage = maxHitsPerPage;
	}

	/**
	 * Gets the maximum number of hits per search results page (pagination parameter).
	 * @return int value
	 */
	 public int getMaxHitsPerPage() {
		return maxHitsPerPage;
	}

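	/**
	 * Searches the full-text index and returns a page of hits.
	 *
	 * @param query a query string (Lucene classic syntax), or "*" to match all objects of a class
	 * @param page the hits page number (0-based)
	 * @param filterByType a BioPAX interface to filter by (sub-types match too); can be null
	 * @param datasources data source filter values (names or URIs); can be null
	 * @param organisms organism filter values (names, taxonomy IDs, or URIs); can be null
	 * @return the search results (hits, totals, page number)
	 */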
	public SearchResult search(String query, int page,
			Class<? extends BioPAXElement> filterByType, String[] datasources,
			String[] organisms)
	{
		SearchResult response = null;
		
		LOG.debug("search: " + query + ", page: " + page 
			+ ", filterBy: " + filterByType
			+ "; extra filters: ds in (" + Arrays.toString(datasources)
			+ "), org. in (" + Arrays.toString(organisms) + ")");
		
		IndexSearcher searcher = null;
	
		try {	
			QueryParser queryParser = new MultiFieldQueryParser(DEFAULT_FIELDS, analyzer);
			queryParser.setAllowLeadingWildcard(true);//TODO do we really want leading wildcards (e.g. *sulin)?
			
			searcher = searcherManager.acquire();	
			
			//find and transform top docs to search hits (beans), considering pagination...
			if(!query.trim().equals("*")) { //a "*" (match-all) query is not supported by the parser out-of-the-box
				//create the lucene query
				Query luceneQuery = queryParser.parse(query);
				//do NOT rewrite the query here (Lucene 4.1), or scoring/highlighting won't work for wildcard queries:
				//luceneQuery = searcher.rewrite(luceneQuery);
				LOG.debug("parsed lucene query is " + luceneQuery.getClass().getSimpleName());
				
				//create filter: type AND (d OR d...) AND (o OR o...)
				Filter filter = createFilter(filterByType, datasources, organisms);
				
				//get the first page of top hits
				TopDocs topDocs = searcher.search(luceneQuery, filter, maxHitsPerPage);
				//get the required hits page if page>0
				if(page>0) {
					TopScoreDocCollector collector = TopScoreDocCollector.create(maxHitsPerPage*(page+1), true);  
					searcher.search(luceneQuery, filter, collector);
					topDocs = collector.topDocs(page * maxHitsPerPage, maxHitsPerPage);
				}
				
				//transform docs to hits, use a highlighter to get excerpts
				response = transform(luceneQuery, searcher, true, topDocs);
	
			} else { //find ALL objects of a particular BioPAX class (+ filters by organism, datasource)
				if(filterByType==null) 
					filterByType = Level3Element.class;

				//replace q="*" with a search for the class or its sub-class name in the TYPE field
				BooleanQuery luceneQuery = new BooleanQuery();
				for(Class<? extends BioPAXElement> subType : SimpleEditorMap.L3.getKnownSubClassesOf(filterByType)) {
					luceneQuery.add(new TermQuery(new Term(FIELD_TYPE, subType.getSimpleName().toLowerCase())), Occur.SHOULD);
				}
				Filter filter = createFilter(null, datasources, organisms);
				
				//get the first page of top hits
				TopDocs topDocs = searcher.search(luceneQuery, filter, maxHitsPerPage);
				//get the required hits page if page>0
				if(page>0) {
					TopScoreDocCollector collector = TopScoreDocCollector.create(maxHitsPerPage*(page+1), true);  
					searcher.search(luceneQuery, filter, collector);
					topDocs = collector.topDocs(page * maxHitsPerPage, maxHitsPerPage);	
				}
				
				//convert
				response = transform(luceneQuery, searcher, false, topDocs);			
			}	
			
		} catch (ParseException e) {
			throw new RuntimeException("search: failed to parse the query string.", e);
		} catch (IOException e) {
			throw new RuntimeException("search: failed.", e);
		} finally {
			try {
				if(searcher!=null) {
					searcherManager.release(searcher);
					searcher = null;
				}
			} catch (IOException e) {
				LOG.error("Failed to release the index searcher.", e);
			}
		}
		
		response.setPage(page);
		
		return response;
	}

	
	/**
	 * Transforms the top Lucene documents into a SearchResult bean
	 * that contains the list of hits (BioPAX elements),
	 * pagination parameters, totals, etc.
	 */
	private SearchResult transform(Query query, IndexSearcher searcher, boolean highlight, TopDocs topDocs) 
			throws CorruptIndexException, IOException 
	{	
		final SearchResult response = new SearchResult();
		final List<BioPAXElement> hits = new ArrayList<BioPAXElement>();
		
		response.setMaxHitsPerPage(maxHitsPerPage);
		response.setHits(hits);
		
		for(ScoreDoc scoreDoc : topDocs.scoreDocs) {			
			Document doc = searcher.doc(scoreDoc.doc);
			String uri = doc.get(FIELD_URI);
			BioPAXElement bpe = model.getByID(uri);			
			LOG.debug("transform: doc:" + scoreDoc.doc + ", uri:" + uri);
			
			// use the highlighter to get matching fragments;
			// for this to work, the 'keyword' field values must have been stored in the index
			if (highlight && doc.get(FIELD_KEYWORD) != null) {
				// use a Highlighter (Store.YES must be enabled for the 'keyword' field)
				QueryScorer scorer = new QueryScorer(query, FIELD_KEYWORD); 
				//this fixes scoring/highlighting for all-field wildcard queries like q=insulin* 
				//but not for term/prefix queries, i.e., q=name:insulin*, q=pathway:brca2. TODO
				scorer.setExpandMultiTermQuery(true);	
				
				//TODO try PostingsHighlighter once it's stable...
				// (see http://lucene.apache.org/core/4_10_0/highlighter/org/apache/lucene/search/postingshighlight/PostingsHighlighter.html)
				SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<span class='hitHL'>", "</span>");
				Highlighter highlighter = new Highlighter(formatter, scorer);
				highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 80));
										
				final String text = StringUtils.join(doc.getValues(FIELD_KEYWORD), " ");
				try {
					TokenStream tokenStream = analyzer.tokenStream("", new StringReader(text));
					String res = highlighter.getBestFragments(tokenStream, text, 7, "...");

					if(res != null && !res.isEmpty()) {
						bpe.getAnnotations().put(HitAnnotation.HIT_EXCERPT.name(), res);
					}

				} catch (Exception e) {throw new RuntimeException(e);}

			} else if(highlight) {
				LOG.warn("Highlighter skipped, because KEYWORD field was null; hit: " 
						+ uri + ", " + bpe.getModelInterface().getSimpleName());
			}

			// extract organisms (URI only) if not done before
			if(doc.get(FIELD_ORGANISM) != null && !bpe.getAnnotations().containsKey(HitAnnotation.HIT_ORGANISM.name())) {
				Set<String> uniqueVals = new TreeSet<String>();
				for(String o : doc.getValues(FIELD_ORGANISM)) {
					//note: only URIs are stored in the index
					uniqueVals.add(o);
				}
				bpe.getAnnotations().put(HitAnnotation.HIT_ORGANISM.name(), uniqueVals);
			}
			
			// extract datasource URIs from the index if not previously done
			if(doc.get(FIELD_DATASOURCE) != null && !bpe.getAnnotations().containsKey(HitAnnotation.HIT_DATASOURCE.name())) {
				Set<String> uniqueVals = new TreeSet<String>();
				for(String d : doc.getValues(FIELD_DATASOURCE)) {
					//note: only URIs are stored in the index
					uniqueVals.add(d);
				}
				bpe.getAnnotations().put(HitAnnotation.HIT_DATASOURCE.name(), uniqueVals);
			}	
			
			// extract only pathway URIs if not previously done
			//(because names and IDs used to be stored in the index field as well)
			if(doc.get(FIELD_PATHWAY) != null && !bpe.getAnnotations().containsKey(HitAnnotation.HIT_PATHWAY.name())) {
				Set<String> uniqueVals = new TreeSet<String>();
				for(String d : doc.getValues(FIELD_PATHWAY)) {
					//only URIs were stored (though all names/ids were indexed/analyzed)
					if(!d.equals(uri)) //exclude itself
						uniqueVals.add(d);
				}
				bpe.getAnnotations().put(HitAnnotation.HIT_PATHWAY.name(), uniqueVals);
			}
			
			//store the numbers of participants and processes if not previously done
			if(doc.get(FIELD_N_PARTICIPANTS)!=null && !bpe.getAnnotations().containsKey(HitAnnotation.HIT_PARTICIPANTS.name()))
				bpe.getAnnotations().put(HitAnnotation.HIT_PARTICIPANTS.name(), Integer.valueOf(doc.get(FIELD_N_PARTICIPANTS)));
			if(doc.get(FIELD_N_PROCESSES)!=null && !bpe.getAnnotations().containsKey(HitAnnotation.HIT_PROCESSES.name()))
				bpe.getAnnotations().put(HitAnnotation.HIT_PROCESSES.name(), Integer.valueOf(doc.get(FIELD_N_PROCESSES)));

			//store Lucene's score and explanation
			String excerpt = (String) bpe.getAnnotations().get(HitAnnotation.HIT_EXCERPT.name());
			if(excerpt == null) excerpt = "";
			excerpt += " -SCORE- " + scoreDoc.score + " -EXPLANATION- " + searcher.explain(query, scoreDoc.doc);
			bpe.getAnnotations().put(HitAnnotation.HIT_EXCERPT.name(), excerpt);

			hits.add(bpe);
		}
						
		//set total no. hits	
		response.setTotalHits(topDocs.totalHits);
		
		return response;
	}


	public void index() {
		final int numObjects =  model.getObjects().size();
		LOG.info("index(), there are " + numObjects + " BioPAX objects to be (re-)indexed.");		
		IndexWriter iw;		
		try {
			//close the searcher manager if the old index exists
			if(searcherManager != null) {
				searcherManager.close();
				searcherManager = null;
			}
			IndexWriterConfig conf = new IndexWriterConfig(Version.LATEST, analyzer);
			iw = new IndexWriter(FSDirectory.open(indexFile), conf);
			//cleanup
			iw.deleteAll();
			iw.commit();
		} catch (IOException e) {
			throw new RuntimeException("Failed to create a new IndexWriter.", e);
		}		
		final IndexWriter indexWriter = iw;

		ExecutorService exec = Executors.newFixedThreadPool(30);
		
		final AtomicInteger numLeft = new AtomicInteger(numObjects);

		final org.biopax.paxtools.util.Filter<DataPropertyEditor> dataPropertiesToConsider
				= new org.biopax.paxtools.util.Filter<DataPropertyEditor>() {
			@Override
			public boolean filter(DataPropertyEditor editor) {
				final String prop = editor.getProperty();
				//to include in the index, as keywords, only the following properties
				// (basically, to exclude float type properties, embedded xml, db names, etc.):
				return (prop.equalsIgnoreCase("author") || prop.equalsIgnoreCase("availability")
						|| prop.equalsIgnoreCase("chemicalFormula") || prop.equalsIgnoreCase("comment")
						|| prop.equalsIgnoreCase("controlType") || prop.equalsIgnoreCase("conversionDirection")
						|| prop.equalsIgnoreCase("eCNumber") || prop.equalsIgnoreCase("id")
						|| prop.equalsIgnoreCase("name") || prop.equalsIgnoreCase("displayName")
						|| prop.equalsIgnoreCase("standardName") || prop.equalsIgnoreCase("sequence")
						|| prop.equalsIgnoreCase("source") || prop.equalsIgnoreCase("year")
						|| prop.equalsIgnoreCase("term") || prop.equalsIgnoreCase("stepDirection")
						|| prop.equalsIgnoreCase("structureData") || prop.equalsIgnoreCase("templateDirection")
						|| prop.equalsIgnoreCase("title") || prop.equalsIgnoreCase("url")
				);
			}
		};

		final Fetcher fetcher = new Fetcher(SimpleEditorMap.L3, Fetcher.nextStepFilter);
		fetcher.setSkipSubPathways(true);

		for(final BioPAXElement bpe : model.getObjects()) {	
			// prepare & index each element in a separate thread
			exec.execute(new Runnable() {
				public void run() {					
					// get or infer some important values if possible from this, child or parent objects:
					Set<String> keywords = ModelUtils.getKeywords(bpe, 3, dataPropertiesToConsider);
					// a hack to remove special (debugging) biopax comments
					for(String s : new HashSet<String>(keywords)) {
						//exclude additional comments generated by normalizer, merger, etc.
						if(s.startsWith("REPLACED ") || s.contains("ADDED"))
							keywords.remove(s);
					}
					
					bpe.getAnnotations().put(FIELD_KEYWORD, keywords);
					bpe.getAnnotations().put(FIELD_DATASOURCE, ModelUtils.getDatasources(bpe));
					bpe.getAnnotations().put(FIELD_ORGANISM, ModelUtils.getOrganisms(bpe));
					bpe.getAnnotations().put(FIELD_PATHWAY, ModelUtils.getParentPathways(bpe));
					//- includes itself if bpe is a pathway

					//set the no. of participants (PEs/Genes) and no. of processes (interactions/pathways) index fields:
					if(bpe instanceof org.biopax.paxtools.model.level3.Process) {
						int numProc = fetcher.fetch(bpe, Process.class).size(); //except itself
						int numPeAndG = fetcher.fetch(bpe, PhysicalEntity.class).size()
								+ fetcher.fetch(bpe, Gene.class).size();
						bpe.getAnnotations().put(FIELD_N_PARTICIPANTS, Integer.toString(numPeAndG));
						bpe.getAnnotations().put(FIELD_N_PROCESSES, Integer.toString(numProc));
					} else if(bpe instanceof Complex) {
						int numPEs = fetcher.fetch(bpe, PhysicalEntity.class).size();
						bpe.getAnnotations().put(FIELD_N_PARTICIPANTS, Integer.toString(numPEs));
					}

					//add IDs of unification and relationship xrefs of this element and its children
					if(bpe instanceof Entity || bpe instanceof EntityReference)
					{
						final Set<String> ids = new HashSet<String>();

						//fetch all children of (implicit) type XReferrable, which means - either
						//BioSource or ControlledVocabulary or Evidence or Provenance or Entity or EntityReference
						//(we actually want only the latter two types and their sub-types; will skip the rest later on):
						Set<XReferrable> children = fetcher.fetch(bpe, XReferrable.class);
						//include this element itself (the fetcher only collects child elements)
						children.add((XReferrable) bpe);
						for(XReferrable child : children) {
							//skip IDs of Evidence,CV,Provenance (included into 'keyword' field anyway)
							if(!(child instanceof Entity || child instanceof EntityReference))
								continue;
							// collect standard bio IDs (skip publication xrefs):
							for(Xref x : child.getXref()) {
								if (!(x instanceof PublicationXref) && x.getId()!=null && x.getDb()!=null) {
									ids.add(x.getId());
								}
							}
						}
						if(!ids.isEmpty()) {
							bpe.getAnnotations().put(SearchEngine.FIELD_XREFID, ids);
						}
					}

					index(bpe, indexWriter);
					
					//count, log a progress message
					int left = numLeft.decrementAndGet();
					if(left % 10000 == 0)
						LOG.info("index(), biopax objects left to index: " + left);
				}
			});
		}
		
		exec.shutdown(); //stop accepting new tasks	
		try { //wait
			exec.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS);
		} catch (InterruptedException e) {
			throw new RuntimeException("Interrupted!", e);
		}
		
		try {
			indexWriter.close(); //wait for pending op., auto-commit, close.
		} catch (IOException e) {
			throw new RuntimeException("Failed to close IndexWriter.", e);
		} 
		
		//finally, create a new searcher manager
		initSearcherManager();
	}
	
	
	// internal methods
	
	/**
	 * Creates a new Lucene Document that corresponds to a BioPAX object.
	 * It does not check whether the document exists (should not be there,
	 * because the {@link #index()} method cleans up the index)
	 * 
	 * Some fields also include biopax data type property values not only from 
	 * the biopax object but also from its child elements, up to some depth 
	 * (using key-value pairs in the pre-computed bpe.annotations map):
	 * 
	 *  'uri' - biopax object's absolute URI, analyze=no, store=yes;
	 * 
	 *  'name' - names, analyze=yes, store=yes; boosted;
	 * 
	 *  'keyword' - infer from this bpe and its child objects' data properties,
	 *            such as Score.value, structureData, structureFormat, chemicalFormula, 
	 *            availability, term, comment, patoData, author, source, title, url, published, 
	 *            up to given depth/level; and also all 'pathway' field values are included here; 
	 *            analyze=yes, store=yes;
	 *  
	 *  'datasource', 'organism' and 'pathway' - infer from this bpe and its child objects 
	 *  									  up to given depth/level, analyze=no, store=yes;
	 *  
	 *  'participants', 'processes' - numbers of child participants/processes,
	 *  integers as strings; analyze=no, store=yes ('n' also indexes the number
	 *  of processes for numeric range queries; analyze=no, store=no)
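	 * 
	 *  For example (an illustrative sketch), a Pathway object may yield a document
	 *  with a stored uri, type="pathway", analyzed name and keyword values, stored
	 *  organism/datasource/pathway URIs, and a numeric 'n' field for range queries.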
	 * 
	 * @param bpe BioPAX object
	 * @param indexWriter index writer
	*/
	void index(BioPAXElement bpe, IndexWriter indexWriter) {		
		// create a new document
		final Document doc = new Document();
		
		// store the URI (as a stored-only, not-indexed field)
		Field field = new StoredField(FIELD_URI, bpe.getUri());
		doc.add(field);
		
		// index and store but not analyze/tokenize the biopax class name:
		field = new StringField(FIELD_TYPE, bpe.getModelInterface().getSimpleName().toLowerCase(), Field.Store.YES);
		doc.add(field);
		
		// make index fields from the annotations map (of pre-calculated/inferred values)
		if(!bpe.getAnnotations().isEmpty()) {
			if(bpe.getAnnotations().containsKey(FIELD_PATHWAY)) {
				addPathways((Set<Pathway>)bpe.getAnnotations().get(FIELD_PATHWAY), doc);
			}
			if(bpe.getAnnotations().containsKey(FIELD_ORGANISM)) {
				addOrganisms((Set<BioSource>)bpe.getAnnotations().get(FIELD_ORGANISM), doc);
			}
			if(bpe.getAnnotations().containsKey(FIELD_DATASOURCE)) {
				addDatasources((Set<Provenance>)bpe.getAnnotations().get(FIELD_DATASOURCE), doc);
			}
			if(bpe.getAnnotations().containsKey(FIELD_KEYWORD)) {
				addKeywords((Set<String>)bpe.getAnnotations().get(FIELD_KEYWORD), doc);
			}
			if(bpe.getAnnotations().containsKey(FIELD_N_PARTICIPANTS)) {
				field = new StoredField(FIELD_N_PARTICIPANTS,
						Integer.parseInt((String)bpe.getAnnotations().get(FIELD_N_PARTICIPANTS)));
				doc.add(field);
			}
			if(bpe.getAnnotations().containsKey(FIELD_N_PROCESSES)) {
				field = new IntField(FIELD_N,
						Integer.parseInt((String)bpe.getAnnotations().get(FIELD_N_PROCESSES)), Field.Store.NO);
				doc.add(field);
				field = new StoredField(FIELD_N_PROCESSES,
						Integer.parseInt((String)bpe.getAnnotations().get(FIELD_N_PROCESSES)));
				doc.add(field);
			}
			if(bpe.getAnnotations().containsKey(FIELD_XREFID)) {
				//index biological IDs as keywords
				addKeywords((Set<String>)bpe.getAnnotations().get(FIELD_XREFID), doc);

				//index all IDs using "xrefid" fields
				for (String id : (Set<String>)bpe.getAnnotations().get(FIELD_XREFID)) {
					Field f = new StringField(FIELD_XREFID, id.toLowerCase(), Field.Store.NO);
					doc.add(f);
				}
			}
		}
		bpe.getAnnotations().remove(FIELD_KEYWORD);
		bpe.getAnnotations().remove(FIELD_DATASOURCE);
		bpe.getAnnotations().remove(FIELD_ORGANISM);
		bpe.getAnnotations().remove(FIELD_PATHWAY);
		bpe.getAnnotations().remove(FIELD_N_PARTICIPANTS);
		bpe.getAnnotations().remove(FIELD_N_PROCESSES);
		bpe.getAnnotations().remove(FIELD_XREFID);

		// name
		if(bpe instanceof Named) {
			Named named = (Named) bpe;
			if(named.getStandardName() != null) {
				field = new TextField(FIELD_NAME, named.getStandardName(), Field.Store.NO);
				field.setBoost(3.5f);
				doc.add(field);
			}
			if(named.getDisplayName() != null && !named.getDisplayName().equalsIgnoreCase(named.getStandardName())) {
				field = new TextField(FIELD_NAME, named.getDisplayName(), Field.Store.NO);
				field.setBoost(3.0f);
				doc.add(field);
			}
			for(String name : named.getName()) {
				if(name.equalsIgnoreCase(named.getDisplayName()) || name.equalsIgnoreCase(named.getStandardName()))
					continue;
				field = new TextField(FIELD_NAME, name.toLowerCase(), Field.Store.NO);
				field.setBoost(2.5f);
				doc.add(field);
			}
		}

		// write
		try {
			indexWriter.addDocument(doc);
		} catch (IOException e) {
			throw new RuntimeException("Failed to index; " + bpe.getUri(), e);
		}
	}

	private void addKeywords(Set<String> keywords, Document doc) {
		for (String keyword : keywords) {
			Field f = new TextField(FIELD_KEYWORD, keyword.toLowerCase(), Field.Store.YES);
			doc.add(f);
		}
	}

	private void addDatasources(Set<Provenance> set, Document doc) {
		for (Provenance p : set) {
			// index and store the URI (untokenized) - required to accurately count
			// entities or to filter by data source (different datasources may share the same names)
			doc.add(new StringField(FIELD_DATASOURCE, p.getUri(), Field.Store.YES));
			// index names as well
			for (String s : p.getName())
				doc.add(new TextField(FIELD_DATASOURCE, s.toLowerCase(), Field.Store.NO));
		}
	}

	private void addOrganisms(Set<BioSource> set, Document doc) {
		for(BioSource bs : set) {
			// store the URI as is (not indexed, untokenized)
			doc.add(new StoredField(FIELD_ORGANISM, bs.getUri()));
				
			// add organism names
			for(String s : bs.getName()) {
				doc.add(new TextField(FIELD_ORGANISM, s.toLowerCase(), Field.Store.NO));
			}
			// add taxonomy
			for(UnificationXref x : 
				new ClassFilterSet<Xref, UnificationXref>(bs.getXref(), UnificationXref.class)) {
				if(x.getId() != null)
					doc.add(new TextField(FIELD_ORGANISM, x.getId().toLowerCase(), Field.Store.NO));
			}
			// include tissue type terms
			if (bs.getTissue() != null) {
				for (String s : bs.getTissue().getTerm())
					doc.add(new TextField(FIELD_ORGANISM, s.toLowerCase(), Field.Store.NO));
			}
			// include cell type terms
			if (bs.getCellType() != null) {
				for (String s : bs.getCellType().getTerm()) {
					doc.add(new TextField(FIELD_ORGANISM, s.toLowerCase(), Field.Store.NO));
				}
			}
		}
	}

	private void addPathways(Set<Pathway> set, Document doc) {
		for(Pathway pw : set) {
			//add URI as is (do not lowercase; do not index; store=yes - required to report hits, e.g., as xml)
			doc.add(new StoredField(FIELD_PATHWAY, pw.getUri()));
			
			// add names to the 'pathway' (indexed, not stored) and 'keyword' (stored, not indexed) fields
			for (String s : pw.getName()) {
				doc.add(new TextField(FIELD_PATHWAY, s.toLowerCase(), Field.Store.NO));
				doc.add(new StoredField(FIELD_KEYWORD, s.toLowerCase()));//for highlighting only, not indexed
			}
			
			// add unification xref IDs too
			for (UnificationXref x : new ClassFilterSet<Xref, UnificationXref>(
					pw.getXref(), UnificationXref.class)) {
				if (x.getId() != null) {
					// index in both the 'pathway' (not stored) and 'keyword' (stored, not indexed) fields
					doc.add(new TextField(FIELD_PATHWAY, x.getId().toLowerCase(), Field.Store.NO));
					doc.add(new StoredField(FIELD_KEYWORD, x.getId().toLowerCase()));//for highlighting only, not indexed
				}
			}
		}
	}

	/** 
	 * Creates a search filter like 
	 * type AND (datasource OR datasource...) 
	 *      AND (organism OR organism OR...)
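	 *
	 * For example (illustrative values), type=Pathway with datasources={"reactome"}
	 * yields a filter equivalent to: +datasource:reactome +type:pathway
	 * (sub-class names of the given type are OR-ed into the type clause).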
	 * 
	 * Both names (partial or full) and URIs should work as filter values. 
	 * 
	 * @param type a BioPAX interface to filter by (its sub-classes match too); null means no type filter
	 * @param datasources data source names or URIs; null or empty means no datasource filter
	 * @param organisms organism names, taxonomy IDs, or URIs; null or empty means no organism filter
	 */	
	private Filter createFilter(Class<? extends BioPAXElement> type, 
			String[] datasources, String[] organisms) {
		
		BooleanQuery filterQuery = new BooleanQuery();
		
		//AND datasources	
		if (datasources != null && datasources.length > 0) {
			filterQuery.add(subQuery(datasources, FIELD_DATASOURCE), Occur.MUST);
		}
		//AND organisms
		if (organisms != null && organisms.length > 0) {
			filterQuery.add(subQuery(organisms, FIELD_ORGANISM), Occur.MUST);
		}		
		//AND type	
		if(type != null) { //add biopax class filter
			BooleanQuery query = new BooleanQuery();
			query.add(new TermQuery(new Term(FIELD_TYPE, type.getSimpleName().toLowerCase())), Occur.SHOULD);//OR
			//for each biopax subclass (interface), add the name to the filter query
			for(Class<? extends BioPAXElement> subType : SimpleEditorMap.L3.getKnownSubClassesOf(type)) {
				query.add(new TermQuery(new Term(FIELD_TYPE, subType.getSimpleName().toLowerCase())), Occur.SHOULD);//OR
			}		
			filterQuery.add(query, Occur.MUST);
		}
		
		if(!filterQuery.clauses().isEmpty()) {
			LOG.debug("filterQuery: " + filterQuery.toString());
			return new CachingWrapperFilter( new QueryWrapperFilter(filterQuery) ); //TODO why CachingWrapperFilter, QueryWrapperFilter?
		} else 
			return null;
	}

	/**
	 * Filter values are joined with the 'OR' operator, but a value that
	 * contains whitespace becomes a sub-query in which its tokens are joined
	 * with 'AND'. This allows filtering by a datasource/organism's full name,
	 * partial name, or URI, and by multiple datasources/organisms at once.
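	 *
	 * For example (illustrative), the organism filter values {"homo sapiens", "9606"}
	 * become the query: (+organism:homo +organism:sapiens) organism:9606
	 * where matching either clause is sufficient.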
	 * 
	 * @param filterValues filter values (full/partial names, URIs, etc.)
	 * @param filterField the index field to filter by (e.g., datasource, organism)
	 * @return the filter sub-query
	 */
	private Query subQuery(String[] filterValues, String filterField) {
		BooleanQuery query = new BooleanQuery();	
		final Pattern pattern = Pattern.compile("\\s");		
		for(String v : filterValues) {
			//if v has whitespace chars (several words), make a "word1 AND word2 AND..." subquery
			if(pattern.matcher(v).find()) {
				BooleanQuery bq = new BooleanQuery();
//was bug: text with spaces and 'of', 'for', 'and', etc., did not match anything (we have to use the same analyzer as during indexing!)
//				for(String w : v.split("\\s+")) {
//					bq.add(new TermQuery(new Term(filterField, w.toLowerCase())), Occur.MUST);
//					LOG.debug("subQuery, add part: " + w.toLowerCase());
//				}
				try {
					//use the same analyzer as when indexing
					TokenStream tokenStream = analyzer.tokenStream(filterField, new StringReader(v));
					CharTermAttribute chattr = tokenStream.addAttribute(CharTermAttribute.class);
					tokenStream.reset();
					while(tokenStream.incrementToken()) {
						//'of', 'and', 'for', etc. never occur as tokens (the standard analyzer drops stop words)
						String token = chattr.toString();
						bq.add(new TermQuery(new Term(filterField, token)), Occur.MUST);
					}
					tokenStream.end(); 
					tokenStream.close();
				} catch (IOException e) {
					//should never happen as we use StringReader
					throw new RuntimeException("Failed to open a token stream; "
							+ "field:" + filterField + ", value:" + v,e);
				}
				query.add(bq, Occur.SHOULD);
			} else {
				query.add(new TermQuery(new Term(filterField, v.toLowerCase())), Occur.SHOULD);
			}			
		}
		
		return query;
	}

}



