gate.creole.annic.lucene.LuceneIndexer Maven / Gradle / Ivy
Show all versions of gate-core Show documentation
/*
* LuceneIndexer.java
*
* Niraj Aswani, 19/March/07
*
* $Id: LuceneIndexer.html,v 1.0 2007/03/19 16:22:01 niraj Exp $
*/
package gate.creole.annic.lucene;
import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import gate.Corpus;
import gate.creole.annic.Constants;
import gate.creole.annic.IndexException;
import gate.creole.annic.Indexer;
import gate.creole.annic.apache.lucene.document.Document;
import gate.creole.annic.apache.lucene.index.IndexReader;
import gate.creole.annic.apache.lucene.index.IndexWriter;
import gate.creole.annic.apache.lucene.index.Term;
import gate.creole.annic.apache.lucene.search.Hits;
import gate.creole.annic.apache.lucene.search.IndexSearcher;
import gate.creole.annic.apache.lucene.search.TermQuery;
import gate.util.Files;
/**
* This class provides a Lucene based implementation for the Indexer
* interface. It asks users to provide various required parameters and
* creates the Lucene Index.
*
* @author niraj
*
*/
public class LuceneIndexer implements Indexer {
/**
 * Debug switch. When true, checkIndexParameters prints the base token and
 * index unit annotation types to standard output.
 */
protected boolean DEBUG = false;
/** The corpus whose documents createIndex adds to the index. */
protected Corpus corpus;
/**
 * Various parameters such as location of the Index etc. The map is stored
 * by checkIndexParameters and is looked up with keys from Constants (for
 * example Constants.INDEX_LOCATION_URL).
 */
protected Map parameters;
/**
 * Creates an indexer, optionally bound to an existing index location.
 *
 * @param indexLocationUrl location of the index; when non-null it is passed
 *          to readParametersFromDisk, when null nothing is read
 * @throws IOException propagated from readParametersFromDisk (presumably
 *           when the stored parameters cannot be read - confirm against
 *           that method)
 */
public LuceneIndexer(URL indexLocationUrl) throws IOException {
  if(indexLocationUrl == null) {
    // no location given: leave the parameters unset
    return;
  }
  readParametersFromDisk(indexLocationUrl);
}
/**
 * Validates the supplied index parameters and remembers them on this
 * indexer.
 *
 * <p>The map is stored in the {@code parameters} field before any check
 * runs, so it is retained even when validation fails. When the base token
 * annotation type is null or blank, the map is updated in place with
 * {@code Constants.ANNIC_TOKEN}.
 *
 * @param parameters the indexing parameters; must hold a {@code file:} URL
 *          under {@code Constants.INDEX_LOCATION_URL}
 * @throws IndexException if the map is null, the index location is missing
 *           or is not a {@code file:} URL, the location exists but is not
 *           a directory, or the base token annotation type contains one
 *           of '.', '=', ',' or ';'
 */
protected void checkIndexParameters(Map parameters) throws IndexException {
  this.parameters = parameters;
  if(parameters == null) {
    throw new IndexException("No parameters provided!");
  }

  URL indexLocation = (URL)parameters.get(Constants.INDEX_LOCATION_URL);
  if(indexLocation == null) {
    throw new IndexException("You must provide a URL for INDEX_LOCATION");
  }
  if(!indexLocation.getProtocol().equalsIgnoreCase("file")) {
    throw new IndexException(
            "Index Output Directory must be set to the empty directory on the file system");
  }

  File file = null;
  try {
    file = new File(indexLocation.toURI());
  } catch(URISyntaxException use) {
    // the URL is not a valid URI: fall back to the GATE helper
    file = Files.fileFromURL(indexLocation);
  }

  // a location that does not exist yet is accepted; only an existing
  // non-directory (e.g. a regular file) is rejected
  if(file.exists() && !file.isDirectory()) {
    throw new IndexException("Index location exists but is not a directory: "
            + file.getAbsolutePath());
  }

  String baseTokenAnnotationType = (String)parameters
          .get(Constants.BASE_TOKEN_ANNOTATION_TYPE);
  if(baseTokenAnnotationType == null
          || baseTokenAnnotationType.trim().length() == 0) {
    // nothing usable supplied: default to the ANNIC token type
    baseTokenAnnotationType = Constants.ANNIC_TOKEN;
    parameters.put(Constants.BASE_TOKEN_ANNOTATION_TYPE,
            Constants.ANNIC_TOKEN);
  } else if(baseTokenAnnotationType.indexOf(".") > -1
          || baseTokenAnnotationType.indexOf("=") > -1
          || baseTokenAnnotationType.indexOf(";") > -1
          || baseTokenAnnotationType.indexOf(",") > -1) {
    throw new IndexException(
            "Base token annotation type cannot have '.' , '=', ',' or ';' in it");
  }

  String indexUnitAnnotationType = (String)parameters
          .get(Constants.INDEX_UNIT_ANNOTATION_TYPE);
  if(DEBUG) {
    System.out.println("BTAT : " + baseTokenAnnotationType);
    System.out.println("IUAT : " + indexUnitAnnotationType);
  }
}
/**
 * Gives access to the parameter map currently held by this indexer.
 *
 * @return the stored parameters map itself (not a copy); null when no
 *         parameters have been set
 */
protected Map getIndexParameters() {
  return parameters;
}
/**
 * Creates the index directory and indexes every document of the corpus.
 *
 * <p>The parameters are validated with checkIndexParameters first. Documents
 * that have a persistence ID are released through
 * {@code gate.Factory.deleteResource} once they have been indexed. After
 * the writer is closed the parameters are written to disk.
 *
 * @param indexParameters a map containing the values required to create
 *          the index. For this implementation the keys are:
 *
 *          INDEX_LOCATION_URL - the URL where the index is created
 *
 *          BASE_TOKEN_ANNOTATION_TYPE
 *
 *          INDEX_UNIT_ANNOTATION_TYPE
 *
 *          FEATURES_TO_EXCLUDE
 *
 *          FEATURES_TO_INCLUDE
 * @throws IndexException if the parameters are invalid or an I/O error
 *           occurs while writing the index
 */
@Override
public void createIndex(Map indexParameters) throws IndexException {
  // validation also stores the map in this.parameters
  checkIndexParameters(indexParameters);
  URL indexUrl = (URL)parameters.get(Constants.INDEX_LOCATION_URL);
  try {
    File indexDir;
    try {
      indexDir = new File(indexUrl.toURI());
    } catch(URISyntaxException use) {
      indexDir = Files.fileFromURL(indexUrl);
    }

    // open a writer on the index directory
    IndexWriter indexWriter = new IndexWriter(indexDir.getAbsolutePath(),
            new LuceneAnalyzer(), true);
    try {
      if(corpus != null) {
        // load the documents and add them one by one
        for(int docIndex = 0; docIndex < corpus.size(); docIndex++) {
          gate.Document gateDoc = corpus.get(docIndex);

          // prefer the persistence ID, fall back to the plain name
          String docId;
          if(gateDoc.getLRPersistenceId() != null) {
            docId = gateDoc.getLRPersistenceId().toString();
          } else {
            docId = gateDoc.getName();
          }
          System.out.print("Indexing : " + docId + " ...");

          String corpusId;
          if(corpus.getLRPersistenceId() != null) {
            corpusId = corpus.getLRPersistenceId().toString();
          } else {
            corpusId = corpus.getName();
          }

          List luceneDocs = getLuceneDocuments(corpusId, gateDoc,
                  indexUrl.toString());
          if(luceneDocs != null) {
            for(Object candidate : luceneDocs) {
              // null entries are skipped rather than handed to the writer
              if(candidate != null) {
                indexWriter.addDocument((Document)candidate);
              }
            }
          }

          // only documents that have a persistence ID are released
          if(gateDoc.getLRPersistenceId() != null) {
            gate.Factory.deleteResource(gateDoc);
          }
          System.out.println("Done");
        }
      }
    } finally {
      // close the writer whether or not indexing succeeded
      indexWriter.close();
    }
    writeParametersToDisk();
  } catch(java.io.IOException ioe) {
    throw new IndexException(ioe);
  }
}
/**
 * Optimizes the existing index.
 *
 * <p>The index location URL is converted to a file system path in the same
 * way createIndex does it, so the writer is opened on the directory that
 * was actually written. Passing the URL's string form (for example
 * {@code file:/...}) would be interpreted as a path rather than a URL.
 *
 * @throws IndexException if no parameters or no index location are
 *           available, or an I/O error occurs while optimizing
 */
@Override
public void optimizeIndex() throws IndexException {
  if(parameters == null) {
    throw new IndexException("No parameters provided!");
  }
  URL indexLocation = (URL)parameters.get(Constants.INDEX_LOCATION_URL);
  if(indexLocation == null) {
    throw new IndexException("You must provide a URL for INDEX_LOCATION");
  }

  File indexDir;
  try {
    indexDir = new File(indexLocation.toURI());
  } catch(URISyntaxException use) {
    // same fallback as createIndex for URLs that are not valid URIs
    indexDir = Files.fileFromURL(indexLocation);
  }

  try {
    IndexWriter writer = new IndexWriter(indexDir.getAbsolutePath(),
            new LuceneAnalyzer(), false);
    try {
      writer.optimize();
    } finally {
      // make sure we close the writer, whatever happens
      writer.close();
    }
  } catch(java.io.IOException ioe) {
    throw new IndexException(ioe);
  }
}
/**
 * Deletes the index directory.
 *
 * <p>Does nothing when no parameters are set. The index location URL is
 * converted to a file the same way createIndex and checkIndexParameters do
 * it, so all methods resolve the same directory.
 *
 * @throws IndexException if the parameters hold no index location or the
 *           directory could not be deleted
 */
@Override
public void deleteIndex() throws IndexException {
  // without parameters there is no known index location to delete
  if(parameters == null) {
    return;
  }
  URL indexLocation = (URL)parameters.get(Constants.INDEX_LOCATION_URL);
  if(indexLocation == null) {
    throw new IndexException("You must provide a URL for INDEX_LOCATION");
  }

  File dir;
  try {
    dir = new File(indexLocation.toURI());
  } catch(URISyntaxException use) {
    // same fallback as createIndex for URLs that are not valid URIs
    dir = Files.fileFromURL(indexLocation);
  }

  if(!FileUtils.deleteQuietly(dir)) {
    throw new IndexException("Can't delete directory "
            + dir.getAbsolutePath());
  }
}
/**
 * Adds new documents to the existing index.
 *
 * <p>The writer is opened on the existing index even when {@code added} is
 * null. Null entries in the list returned by getLuceneDocuments are
 * skipped, matching what createIndex does.
 *
 * @param corpusPersistenceID identifier of the corpus the documents belong
 *          to; passed through to getLuceneDocuments
 * @param added the GATE documents to index; may be null, in which case
 *          nothing is added
 * @throws IndexException if no parameters or no index location are
 *           available, or an I/O error occurs while writing the index
 */
@Override
public void add(String corpusPersistenceID, List added)
        throws IndexException {
  if(parameters == null) {
    throw new IndexException("No parameters provided!");
  }
  URL indexLocation = (URL)parameters.get(Constants.INDEX_LOCATION_URL);
  if(indexLocation == null) {
    throw new IndexException("You must provide a URL for INDEX_LOCATION");
  }

  File indexDir;
  try {
    indexDir = new File(indexLocation.toURI());
  } catch(URISyntaxException use) {
    // same fallback as createIndex for URLs that are not valid URIs
    indexDir = Files.fileFromURL(indexLocation);
  }
  String location = indexDir.getAbsolutePath();

  try {
    IndexWriter writer = new IndexWriter(location, new LuceneAnalyzer(),
            false);
    try {
      if(added != null) {
        for(int i = 0; i < added.size(); i++) {
          // explicit cast: the list parameter carries no element type here
          gate.Document gateDoc = (gate.Document)added.get(i);
          String idToUse = gateDoc.getLRPersistenceId() == null
                  ? gateDoc.getName()
                  : gateDoc.getLRPersistenceId().toString();
          System.out.print("Indexing : " + idToUse + " ...");

          List docs = getLuceneDocuments(corpusPersistenceID, gateDoc,
                  location);
          if(docs != null) {
            for(Object doc : docs) {
              // skip null entries, as createIndex does
              if(doc != null) {
                writer.addDocument((Document)doc);
              }
            }
          }
          System.out.println("Done");
        }
      }
    } finally {
      // make sure we close the writer, whatever happens
      writer.close();
    }
  } catch(java.io.IOException ioe) {
    throw new IndexException(ioe);
  }
}
/**
 * Replaces each of the characters {@code / : * ? " < > |} in the given
 * name with an underscore.
 *
 * NOTE(review): the pattern begins with an escaped slash, so a backslash
 * in the name is NOT replaced. Left unchanged here because other code may
 * depend on the current mapping; confirm whether backslash was meant to be
 * covered.
 *
 * @param name the name to convert; must not be null
 * @return the name with every matched character replaced by '_'
 */
private String getCompatibleName(String name) {
  final String reservedChars = "[\\/:\\*\\?\"<>|]";
  final String substitute = "_";
  return name.replaceAll(reservedChars, substitute);
}
/**
* remove documents from the Index
*
* @param removedIDs - when documents are not
* persisted, Persistence IDs will not be available. In that
* case provide the document Names instead of their IDs
* @throws IndexException if an error occurs while removing documents
*/
@Override
public void remove(List