gate.creole.annic.lucene.LuceneSearcher Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of gate-core Show documentation
Show all versions of gate-core Show documentation
GATE - general architecture for text engineering - is open source
software capable of solving almost any text processing problem. This
artifact enables you to embed the core GATE Embedded with its essential
dependencies. You will be able to use the GATE Embedded API and load and
store GATE XML documents. This artifact is the perfect dependency for
CREOLE plugins or for applications that need to customize the GATE
dependencies due to conflicts with their own dependencies or for a lower
footprint.
The newest version!
/*
* LuceneSearcher.java
*
* Niraj Aswani, 19/March/07
*
* $Id: LuceneSearcher.html,v 1.0 2007/03/19 16:22:01 niraj Exp $
*/
package gate.creole.annic.lucene;
import gate.creole.annic.Constants;
import gate.creole.annic.Hit;
import gate.creole.annic.Pattern;
import gate.creole.annic.SearchException;
import gate.creole.annic.Searcher;
import gate.creole.annic.apache.lucene.document.Document;
import gate.creole.annic.apache.lucene.index.IndexReader;
import gate.creole.annic.apache.lucene.index.Term;
import gate.creole.annic.apache.lucene.index.TermEnum;
import gate.creole.annic.apache.lucene.search.BooleanQuery;
import gate.creole.annic.apache.lucene.search.Hits;
import gate.creole.annic.apache.lucene.search.IndexSearcher;
import gate.creole.annic.apache.lucene.search.TermQuery;
import gate.persist.LuceneDataStoreImpl;
import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* This class provides the Searching functionality for annic.
*
* @author niraj
*/
public class LuceneSearcher implements Searcher {
/**
* A List of index locations. It allows searching at multiple locations.
*/
private List indexLocations = null;
/**
* The submitted query.
*/
private String query = null;
/**
* The number of base token annotations to show in left and right context of
* the pattern. By default 5.
*/
private int contextWindow = 5;
/**
* Found patterns.
*/
private List annicPatterns = new ArrayList();
/**
* Found annotation types in the annic patterns. The maps keeps record of
* found annotation types and features for each of them.
*/
public Map> annotationTypesMap =
new HashMap>();
/**
* Search parameters.
*/
private Map parameters = null;
/**
* Corpus to search in.
*/
private String corpusToSearchIn = null;
/**
* Annotation set to search in.
*/
private String annotationSetToSearchIn = null;
/**
* Hits returned by the lucene.
*/
private Hits luceneHits = null;
/**
* Indicates if the query was to delete certain documents.
*/
private boolean wasDeleteQuery = false;
/**
* A query can result into multiple queries. For example: (A|B)C is converted
* into two queries: AC and AD. For each query a separate thread is started.
*/
private List luceneSearchThreads = null;
/**
* Indicates if the search was successful.
*/
private boolean success = false;
/**
* Tells which thread to use to retrieve results from.
*/
private int luceneSearchThreadIndex = 0;
/**
* Tells if we have reached at the end of of found results.
*/
private boolean fwdIterationEnded = false;
/**
* Used with freq method for statistics.
*/
private LuceneDataStoreImpl datastore;
/**
* Return the next numberOfHits -1 indicates all
*/
@Override
public Hit[] next(int numberOfHits) throws SearchException {
annicPatterns = new ArrayList();
if(!success) {
this.annicPatterns = new ArrayList();
return getHits();
}
if(fwdIterationEnded) {
this.annicPatterns = new ArrayList();
return getHits();
}
try {
if(wasDeleteQuery) {
List docIDs = new ArrayList();
List setNames = new ArrayList();
for(int i = 0; i < luceneHits.length(); i++) {
Document luceneDoc = luceneHits.doc(i);
String documentID = luceneDoc.get(Constants.DOCUMENT_ID);
String annotationSetID = luceneDoc.get(Constants.ANNOTATION_SET_ID);
int index = docIDs.indexOf(documentID);
if(index == -1) {
docIDs.add(documentID);
setNames.add(annotationSetID);
} else {
if(!setNames.get(index).equals(annotationSetID)) {
docIDs.add(documentID);
setNames.add(annotationSetID);
}
}
}
Hit[] toReturn = new Hit[docIDs.size()];
for(int i = 0; i < toReturn.length; i++) {
toReturn[i] = new Hit(docIDs.get(i), setNames.get(i), 0, 0, "");
}
return toReturn;
}
for(; luceneSearchThreadIndex < luceneSearchThreads.size(); luceneSearchThreadIndex++) {
LuceneSearchThread lst =
luceneSearchThreads.get(luceneSearchThreadIndex);
List results = lst.next(numberOfHits);
if(results != null) {
if(numberOfHits != -1) {
numberOfHits -= results.size();
}
this.annicPatterns.addAll(results);
if(numberOfHits == 0) { return getHits(); }
}
}
// if we are here, there wer no sufficient patterns available
// so what we do is make success to false so that this method
// return null on next call
fwdIterationEnded = true;
return getHits();
} catch(Exception e) {
throw new SearchException(e);
}
}
/**
* Method retunrs true/false indicating whether results were found or not.
*/
@SuppressWarnings("unchecked")
@Override
public boolean search(String query, Map parameters)
throws SearchException {
luceneHits = null;
annicPatterns = new ArrayList();
annotationTypesMap = new HashMap>();
luceneSearchThreads = new ArrayList();
luceneSearchThreadIndex = 0;
success = false;
fwdIterationEnded = false;
wasDeleteQuery = false;
if(parameters == null)
throw new SearchException("Parameters cannot be null");
this.parameters = parameters;
/*
* lets first check if the query is to search the document names This is
* used when we only wants to search for documents stored under the specific
* corpus
*/
if(parameters.size() == 2
&& parameters.get(Constants.INDEX_LOCATION_URL) != null) {
String corpusID = (String)parameters.get(Constants.CORPUS_ID);
String indexLocation = null;
try {
indexLocation =
new File(
((URL)parameters.get(Constants.INDEX_LOCATION_URL)).toURI())
.getAbsolutePath();
} catch(URISyntaxException use) {
indexLocation =
new File(
((URL)parameters.get(Constants.INDEX_LOCATION_URL)).getFile())
.getAbsolutePath();
}
if(corpusID != null && indexLocation != null) {
wasDeleteQuery = true;
Term term = new Term(Constants.CORPUS_ID, corpusID);
TermQuery tq = new TermQuery(term);
try {
gate.creole.annic.apache.lucene.search.Searcher searcher =
new IndexSearcher(indexLocation);
// and now execute the query
// result of which will be stored in hits
luceneHits = searcher.search(tq);
success = luceneHits.length() > 0 ? true : false;
return success;
} catch(IOException ioe) {
ioe.printStackTrace();
throw new SearchException(ioe);
}
}
}
// check for index locations
if(parameters.get(Constants.INDEX_LOCATIONS) == null) {
String indexLocation;
try {
indexLocation =
new File(((URL)datastore.getIndexer().getParameters()
.get(Constants.INDEX_LOCATION_URL)).toURI()).getAbsolutePath();
} catch(URISyntaxException use) {
indexLocation =
new File(((URL)datastore.getIndexer().getParameters()
.get(Constants.INDEX_LOCATION_URL)).getFile())
.getAbsolutePath();
}
ArrayList indexLocations = new ArrayList();
indexLocations.add(indexLocation);
parameters.put(Constants.INDEX_LOCATIONS, indexLocations);
}
indexLocations =
new ArrayList(
(List extends String>)parameters.get(Constants.INDEX_LOCATIONS));
if(indexLocations.size() == 0)
throw new SearchException("Corpus is not initialized");
// check for valid context window
if(parameters.get(Constants.CONTEXT_WINDOW) == null)
throw new SearchException("Parameter " + Constants.CONTEXT_WINDOW
+ " is not provided!");
contextWindow =
((Integer)parameters.get(Constants.CONTEXT_WINDOW)).intValue();
if(getContextWindow().intValue() <= 0)
throw new SearchException("Context Window must be atleast 1 or > 1");
if(query == null) throw new SearchException("Query is not initialized");
this.query = query;
this.corpusToSearchIn = (String)parameters.get(Constants.CORPUS_ID);
this.annotationSetToSearchIn =
(String)parameters.get(Constants.ANNOTATION_SET_ID);
annicPatterns = new ArrayList();
annotationTypesMap = new HashMap>();
luceneSearchThreads = new ArrayList();
// for different indexes, we create a different instance of
// indexSearcher
// TODO: is this really useful or used to have several indexLocations ?
for(int indexCounter = 0; indexCounter < indexLocations.size(); indexCounter++) {
String location = indexLocations.get(indexCounter);
// we create a separate Thread for each index
LuceneSearchThread lst = new LuceneSearchThread();
if(lst.search(query, contextWindow, location, corpusToSearchIn,
annotationSetToSearchIn, this)) {
luceneSearchThreads.add(lst);
}
}
success = luceneSearchThreads.size() > 0 ? true : false;
return success;
}
/**
* Gets the submitted query.
*/
@Override
public String getQuery() {
return this.query;
}
/**
* Gets the number of base token annotations to show in the context.
*/
public Integer getContextWindow() {
return this.contextWindow;
}
/**
* Gets the found hits (annic patterns).
*/
@Override
public Hit[] getHits() {
if(annicPatterns == null) annicPatterns = new ArrayList();
Hit[] hits = new Hit[annicPatterns.size()];
for(int i = 0; i < annicPatterns.size(); i++) {
hits[i] = annicPatterns.get(i);
}
return hits;
}
/**
* Gets the map of found annotation types and annotation features. This call
* must be invoked only after a call to the
* getIndexedAnnotationSetNames(String indexLocation) method. Otherwise this
* method doesn't guranttee the correct results. The results obtained has the
* following format. Key: CorpusName;AnnotationSetName;AnnotationType Value:
* respective features
*/
@Override
public Map> getAnnotationTypesMap() {
return annotationTypesMap;
}
/**
* This method returns a set of annotation set names that are indexed. Each
* entry has the following format:
*
* corpusName;annotationSetName
*
* where, the corpusName is the name of the corpus the annotationSetName
* belongs to.
*/
@Override
public String[] getIndexedAnnotationSetNames() throws SearchException {
String indexLocation;
try {
indexLocation =
new File(((URL)datastore.getIndexer().getParameters()
.get(Constants.INDEX_LOCATION_URL)).toURI()).getAbsolutePath();
} catch(URISyntaxException use) {
indexLocation =
new File(((URL)datastore.getIndexer().getParameters()
.get(Constants.INDEX_LOCATION_URL)).getFile()).getAbsolutePath();
}
annotationTypesMap = new HashMap>();
Set toReturn = new HashSet();
try {
IndexReader reader = IndexReader.open(indexLocation);
try {
// lets first obtain stored corpora
TermEnum terms =
reader.terms(new Term(Constants.ANNOTATION_SET_ID, ""));
if(terms == null) { return new String[0]; }
// iterating over terms and finding out names of annotation sets indexed
Set annotSets = new HashSet();
boolean foundAnnotSet = false;
do {
Term t = terms.term();
if(t == null) continue;
if(t.field().equals(Constants.ANNOTATION_SET_ID)) {
annotSets.add(t.text());
foundAnnotSet = true;
} else {
if(foundAnnotSet) break;
}
} while(terms.next());
// we query for each annotation set
// and go through docs with that annotation set in them
// to see which corpus they belong to.
// we could have done the other way round as well (i.e.
// first asking for corpus ids and then obtainin annotation set ids
// but not all documents belong to corpora
for(String annotSet : annotSets) {
Term term = new Term(Constants.ANNOTATION_SET_ID, annotSet);
TermQuery tq = new TermQuery(term);
try {
gate.creole.annic.apache.lucene.search.Searcher searcher =
new IndexSearcher(indexLocation);
try {
Hits annotSetHits = searcher.search(tq);
for(int i = 0; i < annotSetHits.length(); i++) {
Document luceneDoc = annotSetHits.doc(i);
String corpusID = luceneDoc.get(Constants.CORPUS_ID);
if(corpusID == null) corpusID = "";
toReturn.add(corpusID + ";" + annotSet);
// lets create a boolean query
Term annotSetTerm =
new Term(Constants.ANNOTATION_SET_ID, annotSet);
TermQuery atq = new TermQuery(annotSetTerm);
BooleanQuery bq = new BooleanQuery();
bq.add(tq, true, false);
bq.add(atq, true, false);
gate.creole.annic.apache.lucene.search.Searcher indexFeatureSearcher =
new IndexSearcher(indexLocation);
try {
Hits indexFeaturesHits = searcher.search(bq);
for(int j = 0; j < indexFeaturesHits.length(); j++) {
Document aDoc = indexFeaturesHits.doc(j);
String indexedFeatures =
aDoc.get(Constants.INDEXED_FEATURES);
if(indexedFeatures != null) {
String[] features = indexedFeatures.split(";");
for(String aFeature : features) {
// AnnotationType.FeatureName
int index = aFeature.indexOf(".");
if(index == -1) {
continue;
}
String type = aFeature.substring(0, index);
String featureName = aFeature.substring(index + 1);
String key = corpusID + ";" + annotSet + ";" + type;
List listOfFeatures =
annotationTypesMap.get(key);
if(listOfFeatures == null) {
listOfFeatures = new ArrayList();
annotationTypesMap.put(key, listOfFeatures);
}
if(!listOfFeatures.contains(featureName)) {
listOfFeatures.add(featureName);
}
}
}
}
} finally {
indexFeatureSearcher.close();
}
}
} finally {
searcher.close();
}
} catch(IOException ioe) {
ioe.printStackTrace();
throw new SearchException(ioe);
}
}
} finally {
reader.close();
}
} catch(IOException ioe) {
throw new SearchException(ioe);
}
return toReturn.toArray(new String[0]);
}
/**
* Gets the search parameters set by user.
*/
@Override
public Map getParameters() {
return parameters;
}
/**
* A Map used for caching query tokens created for a query.
*/
private Map> queryTokens =
new HashMap>();
/**
* Gets the query tokens for the given query.
*/
public synchronized List getQueryTokens(String query) {
return queryTokens.get(query);
}
/**
* Adds the query tokens for the given query.
*/
public synchronized void addQueryTokens(String query, List queryTokens) {
this.queryTokens.put(query, queryTokens);
}
/**
* This method allow exporting results in to the provided file. This method
* has not been implemented yet.
*/
@Override
public void exportResults(File outputFile) {
throw new RuntimeException("ExportResults method is not implemented yet!");
}
@Override
public int freq(String corpusToSearchIn, String annotationSetToSearchIn,
String annotationType, String featureName, String value)
throws SearchException {
String indexLocation;
try {
indexLocation =
new File(((URL)datastore.getIndexer().getParameters()
.get(Constants.INDEX_LOCATION_URL)).toURI()).getAbsolutePath();
} catch(URISyntaxException use) {
indexLocation =
new File(((URL)datastore.getIndexer().getParameters()
.get(Constants.INDEX_LOCATION_URL)).getFile()).getAbsolutePath();
}
IndexSearcher indexSearcher;
try { // open the IndexSearcher
indexSearcher = new IndexSearcher(indexLocation);
} catch(IOException e) {
e.printStackTrace();
return -1;
}
int result =
StatsCalculator.freq(indexSearcher, corpusToSearchIn,
annotationSetToSearchIn, annotationType, featureName, value);
try { // close the IndexSearcher
indexSearcher.close();
} catch(IOException ioe) {
ioe.printStackTrace();
return -1;
}
return result;
}
@Override
public int freq(String corpusToSearchIn, String annotationSetToSearchIn,
String annotationType) throws SearchException {
return this.freq(corpusToSearchIn, annotationSetToSearchIn, annotationType,
null, null);
}
@Override
public int freq(String corpusToSearchIn, String annotationSetToSearchIn,
String annotationType, String featureName) throws SearchException {
return this.freq(corpusToSearchIn, annotationSetToSearchIn, annotationType,
featureName, null);
}
@Override
public int freq(List patternsToSearchIn, String annotationType,
String feature, String value, boolean inMatchedSpan, boolean inContext)
throws SearchException {
return StatsCalculator.freq(patternsToSearchIn, annotationType, feature,
value, inMatchedSpan, inContext);
}
@Override
public int freq(List patternsToSearchIn, String annotationType,
boolean inMatchedSpan, boolean inContext) throws SearchException {
return StatsCalculator.freq(patternsToSearchIn, annotationType,
inMatchedSpan, inContext);
}
@Override
public Map freqForAllValues(List patternsToSearchIn,
String annotationType, String feature, boolean inMatchedSpan,
boolean inContext) throws SearchException {
return StatsCalculator.freqForAllValues(patternsToSearchIn, annotationType,
feature, inMatchedSpan, inContext);
}
public void setLuceneDatastore(gate.persist.LuceneDataStoreImpl datastore) {
this.datastore = datastore;
}
}