// All downloads are FREE. The search and download functionality uses the official Maven repository.
// Please wait. This can take a few minutes ...
// Many resources are needed to download a project. Please understand that we have to compensate for our server costs. Thank you in advance.
// Project price: only $1
// You can buy this project and download/modify it as often as you want.
// gate.persist.LuceneDataStoreImpl Maven / Gradle / Ivy
package gate.persist;
import gate.Corpus;
import gate.DataStore;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.LanguageResource;
import gate.Resource;
import gate.corpora.SerialCorpusImpl;
import gate.creole.ResourceInstantiationException;
import gate.creole.annic.Constants;
import gate.creole.annic.Hit;
import gate.creole.annic.IndexException;
import gate.creole.annic.Indexer;
import gate.creole.annic.SearchException;
import gate.creole.annic.SearchableDataStore;
import gate.creole.annic.Searcher;
import gate.creole.annic.lucene.LuceneIndexer;
import gate.creole.annic.lucene.LuceneSearcher;
import gate.event.CorpusEvent;
import gate.event.CorpusListener;
import gate.event.CreoleEvent;
import gate.event.CreoleListener;
import gate.util.Files;
import gate.util.GateRuntimeException;
import gate.util.Strings;
import gate.util.persistence.PersistenceManager;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.lang.ref.ReferenceQueue;
import java.lang.ref.SoftReference;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.commons.io.IOUtils;
public class LuceneDataStoreImpl extends SerialDataStore implements
SearchableDataStore,
CorpusListener,
CreoleListener {
/**
 * serial version UID
 */
private static final long serialVersionUID = 3618696392336421680L;

/**
 * Canonical lock objects for each LR ID. The map is keyed by LR ID and
 * holds the lock objects through soft references so that unused locks
 * can be reclaimed by the garbage collector. All access to the map is
 * done while synchronized on the map itself.
 */
protected Map<Object, LabelledSoftReference> lockObjects =
    new HashMap<Object, LabelledSoftReference>();

/**
 * Reference queue with which the soft references in the lockObjects
 * map will be registered.
 */
protected ReferenceQueue<Object> refQueue = new ReferenceQueue<Object>();

/**
 * Indicates if the datastore is being closed.
 */
protected boolean dataStoreClosing = false;

/**
 * Executor to run the indexing tasks. Created by {@link #open()}.
 */
protected ScheduledThreadPoolExecutor executor;

/**
 * Map keeping track of the most recent indexing task for each LR ID.
 */
protected ConcurrentMap<Object, IndexingTask> currentTasks =
    new ConcurrentHashMap<Object, IndexingTask>();

/**
 * Number of milliseconds we should wait after a sync before
 * attempting to re-index a document. If sync is called again for the
 * same document within this time then the timer for the re-indexing
 * task is reset. Thus if several changes to the same document are
 * made in quick succession it will only be re-indexed once. On the
 * other hand, if the delay is set too long the document may never be
 * indexed until the data store is closed. The default delay is 1000
 * (one second).
 */
protected long indexDelay = 1000L;

/**
 * Indexer to be used for indexing documents
 */
protected Indexer indexer;

/**
 * Index parameters, as passed to {@link #setIndexer(Indexer, Map)}.
 */
protected Map indexParameters;

/**
 * URL of the index
 */
protected URL indexURL;

/**
 * Searcher to be used for searching the indexed documents
 */
protected Searcher searcher;

/**
 * This is where we store the search parameters
 */
protected Map searchParameters;
/**
 * Close the data store.
 *
 * Stops listening to CREOLE events, shuts down the indexing executor
 * (waiting up to two minutes for it to terminate), runs any indexing
 * tasks that are still registered in the current thread and finally
 * delegates to the superclass.
 *
 * @throws PersistenceException if the superclass fails to close
 */
@Override
public void close() throws PersistenceException {
  // stop listening to Creole events
  Gate.getCreoleRegister().removeCreoleListener(this);

  // the executor only exists once open() has been called
  if(executor != null) {
    // shut down the executor. We submit the shutdown request
    // as a zero-delay task rather than calling shutdown directly,
    // in order to interrupt any timed wait currently in progress.
    // An executor that is already shut down would reject the task,
    // so a repeated close() skips the submission.
    if(!executor.isShutdown()) {
      executor.execute(new Runnable() {
        @Override
        public void run() {
          executor.shutdown();
        }
      });
    }
    try {
      // allow up to two minutes for indexing to finish
      executor.awaitTermination(120, TimeUnit.SECONDS);
    } catch(InterruptedException e) {
      // propagate the interruption
      Thread.currentThread().interrupt();
    }
  }

  // We now process any tasks that were queued but not run,
  // running them in the current thread.
  Collection<IndexingTask> queuedTasks = currentTasks.values();
  // copy the tasks into an array to avoid concurrent
  // modification issues, as IndexingTask.run modifies
  // the currentTasks map
  IndexingTask[] queuedTasksArray =
      queuedTasks.toArray(new IndexingTask[0]);
  for(IndexingTask task : queuedTasksArray) {
    task.run();
  }

  super.close();
} // close()
/**
 * Open a connection to the data store.
 *
 * After the superclass has opened the store, the version file is read:
 * its first line is taken as the protocol version and its second line,
 * when present, as the index directory relative to the storage
 * directory. If an index directory is given, the Lucene indexer and
 * searcher are created for it. Finally the single-threaded indexing
 * executor is created and this datastore starts listening to CREOLE
 * events.
 *
 * @throws PersistenceException if the version file cannot be read, the
 *           index directory does not exist or the protocol version is
 *           not valid
 */
@Override
public void open() throws PersistenceException {
  super.open();

  BufferedReader versionReader = null;
  try {
    versionReader = new BufferedReader(new FileReader(getVersionFile()));
    currentProtocolVersion = versionReader.readLine();
    String relativeIndexPath = versionReader.readLine();

    boolean indexConfigured =
        relativeIndexPath != null && relativeIndexPath.trim().length() > 1;
    if(indexConfigured) {
      URL storageDirURL = storageDir.toURI().toURL();
      URL resolvedIndexURL = new URL(storageDirURL, relativeIndexPath);

      // the index directory must already exist on disk
      if(!Files.fileFromURL(resolvedIndexURL).exists()) {
        throw new PersistenceException("Index directory "
            + relativeIndexPath
            + " could not be found for datastore at " + storageDirURL);
      }

      indexURL = resolvedIndexURL;
      indexer = new LuceneIndexer(indexURL);
      LuceneSearcher luceneSearcher = new LuceneSearcher();
      searcher = luceneSearcher;
      luceneSearcher.setLuceneDatastore(this);
    }
  } catch(IOException e) {
    throw new PersistenceException("Invalid storage directory: " + e);
  } finally {
    IOUtils.closeQuietly(versionReader);
  }

  if(!isValidProtocolVersion(currentProtocolVersion)) {
    throw new PersistenceException("Invalid protocol version number: "
        + currentProtocolVersion);
  }

  // a single background thread performs all the indexing work
  executor =
      new ScheduledThreadPoolExecutor(1, Executors.defaultThreadFactory());
  // delayed indexing tasks that are still waiting when the executor is
  // shut down must not be executed by it; close() runs them directly
  executor.setContinueExistingPeriodicTasksAfterShutdownPolicy(false);
  executor.setExecuteExistingDelayedTasksAfterShutdownPolicy(false);

  // start listening to Creole events
  Gate.getCreoleRegister().addCreoleListener(this);
}
/**
 * Obtain the lock object on which we must synchronize when loading or
 * saving the LR with the given ID.
 *
 * The lock is held in the lockObjects map through a soft reference, so
 * callers should keep the returned object in a local variable for as
 * long as they need it. If no live lock exists for the ID a new one is
 * created and registered.
 *
 * @param id the persistence ID of the LR
 * @return the canonical lock object for that ID, never null
 */
private Object lockObjectForID(Object id) {
  synchronized(lockObjects) {
    processRefQueue();
    // a single lookup; the cast keeps this correct whether or not the
    // map field is declared with type parameters
    LabelledSoftReference existing =
        (LabelledSoftReference)lockObjects.get(id);
    Object lock = (existing == null) ? null : existing.get();
    if(lock == null) {
      // either no entry yet or the referent has been collected:
      // create a fresh lock and overwrite whatever was in the map
      lock = new Object();
      LabelledSoftReference ref = new LabelledSoftReference(lock);
      ref.label = id;
      lockObjects.put(id, ref);
    }
    return lock;
  }
}
/**
 * Drains the reference queue and drops from the lockObjects map every
 * entry whose soft reference has been enqueued, unless the map already
 * holds a newer reference for the same label.
 */
private void processRefQueue() {
  for(Object polled = refQueue.poll(); polled != null; polled =
      refQueue.poll()) {
    LabelledSoftReference cleared = (LabelledSoftReference)polled;
    // only remove the mapping if it still points at this very reference
    boolean stillMapped = lockObjects.get(cleared.label) == cleared;
    if(stillMapped) {
      lockObjects.remove(cleared.label);
    }
  }
}
/**
 * Submits the given LR ID for indexing. The task is delayed by
 * {@link #indexDelay} milliseconds, so multiple updates to the same LR
 * in close succession do not un-necessarily trigger multiple
 * re-indexing passes: any task previously registered for the same ID
 * is disabled when the new one is registered.
 *
 * @param lrID the persistence ID of the LR to (re-)index
 */
protected void queueForIndexing(Object lrID) {
  IndexingTask newTask = new IndexingTask(lrID);
  // put() returns the task being replaced atomically, so even when two
  // threads queue the same ID at once every superseded task is disabled
  IndexingTask supersededTask = (IndexingTask)currentTasks.put(lrID, newTask);
  if(supersededTask != null) {
    supersededTask.disable();
  }
  // set the LR to be indexed after the configured delay
  executor.schedule(newTask, indexDelay, TimeUnit.MILLISECONDS);
}
/**
 * Delete a resource from the data store.
 *
 * Any pending indexing task for the resource is disabled and the
 * resource is deleted from the underlying store while holding the lock
 * for its ID. If the resource class is a {@link Corpus}, the documents
 * the index reports for that corpus are queued for re-indexing;
 * otherwise the resource ID is removed from the index.
 *
 * @param lrClassName the class name of the resource being deleted
 * @param lrPersistenceId the persistence ID of the resource
 * @throws PersistenceException if the deletion, the search for member
 *           documents or the index update fails
 */
@Override
public void delete(String lrClassName, Object lrPersistenceId)
    throws PersistenceException {
  // make sure a queued re-index does not run for the deleted resource
  IndexingTask task = (IndexingTask)currentTasks.get(lrPersistenceId);
  if(task != null) {
    task.disable();
  }

  // we obtain the lock on this ID in order to avoid clashing between
  // the object being loaded by the indexer thread and the thread that
  // deletes it
  Object lock = lockObjectForID(lrPersistenceId);
  synchronized(lock) {
    super.delete(lrClassName, lrPersistenceId);
  }

  /*
   * lets first find out if the deleted resource is a corpus. Deleting
   * a corpus does not require deleting all its member documents but
   * we need to remove the reference of corpus from all its underlying
   * documents in index
   */
  boolean isCorpus = false;
  try {
    isCorpus =
        Corpus.class.isAssignableFrom(Class.forName(lrClassName, true,
            Gate.getClassLoader()));
  } catch(ClassNotFoundException ignored) {
    // the class cannot be loaded, so it cannot be checked against
    // Corpus: fall through and remove the ID from the index
  }

  if(isCorpus) {
    /*
     * we issue a search query to obtain all documents which belong to
     * this corpus and queue each of them for re-indexing
     */
    Map parameters = new HashMap();
    parameters.put(Constants.INDEX_LOCATION_URL, indexURL);
    parameters.put(Constants.CORPUS_ID, lrPersistenceId.toString());
    try {
      if(!getSearcher().search("nothing", parameters)) {
        return;
      }
      Hit[] hits = getSearcher().next(-1);
      if(hits != null) {
        for(Hit hit : hits) {
          queueForIndexing(hit.getDocumentID());
        }
      }
    } catch(SearchException se) {
      throw new PersistenceException(se);
    }
    return;
  }

  // we want to delete this document from the Index as well
  ArrayList removed = new ArrayList();
  removed.add(lrPersistenceId);
  try {
    synchronized(indexer) {
      this.indexer.remove(removed);
    }
  } catch(IndexException ie) {
    throw new PersistenceException(ie);
  }
}
/**
 * Loads a language resource through the superclass and, when the
 * result is a {@link Corpus}, registers this datastore as a listener
 * on it.
 *
 * @param lrClassName the class name of the resource to load
 * @param lrPersistenceId the persistence ID of the resource
 * @return the resource returned by the superclass
 */
@Override
public LanguageResource getLr(String lrClassName, Object lrPersistenceId)
    throws PersistenceException, SecurityException {
  final LanguageResource loaded = super.getLr(lrClassName, lrPersistenceId);
  if(!(loaded instanceof Corpus)) {
    return loaded;
  }
  Corpus corpus = (Corpus)loaded;
  corpus.addCorpusListener(this);
  return loaded;
}
/**
 * Save: synchronise the in-memory image of the LR with the persistent
 * image.
 *
 * For an LR that already has a persistence ID the stored copy is loaded
 * and, when both are documents with equal content and annotation sets,
 * nothing is written or re-indexed. Otherwise the LR is synced through
 * the superclass and, if it is a document, queued for indexing.
 *
 * @param lr the language resource to save
 * @throws PersistenceException if loading the stored copy or saving
 *           the LR fails
 */
@Override
public void sync(LanguageResource lr) throws PersistenceException {
  if(lr.getLRPersistenceId() == null) {
    super.sync(lr);
  } else {
    // lock the LR ID so we don't write to the file while an
    // indexer task is reading it
    Object idLock = lockObjectForID(lr.getLRPersistenceId());
    synchronized(idLock) {
      LanguageResource storedCopy = null;
      try {
        storedCopy = getLr(lr.getClass().getName(), lr.getLRPersistenceId());
        // the comparison only applies when both sides are documents
        if(storedCopy instanceof Document && lr instanceof Document
            && sameAnnicContent((Document)storedCopy, (Document)lr)) {
          // nothing changed: neither save nor re-index
          return;
        }
      } catch(SecurityException e) {
        e.printStackTrace();
      } finally {
        // the stored copy was only needed for the comparison
        if(storedCopy != null) {
          Factory.deleteResource(storedCopy);
        }
      }
      super.sync(lr);
    }
  }

  if(lr instanceof Document) {
    queueForIndexing(lr.getLRPersistenceId());
  }
}

/**
 * Compares the parts of two documents that matter from the annic
 * perspective: the content, the default annotation set and every named
 * annotation set.
 */
private boolean sameAnnicContent(Document stored, Document current) {
  if(!stored.getContent().equals(current.getContent())) {
    return false;
  }
  if(!stored.getAnnotations().equals(current.getAnnotations())) {
    return false;
  }
  if(!stored.getNamedAnnotationSets().equals(
      current.getNamedAnnotationSets())) {
    return false;
  }
  for(String setName : stored.getNamedAnnotationSets().keySet()) {
    if(!stored.getAnnotations(setName).equals(
        current.getAnnotations(setName))) {
      return false;
    }
  }
  return true;
}
/**
 * Sets the Indexer to be used for indexing Datastore.
 *
 * Stores the indexer and its parameters, takes the index location from
 * the {@code Constants.INDEX_LOCATION_URL} parameter, creates the index
 * and rewrites the version file so that it holds the version number
 * followed by the index directory relative to the storage directory.
 *
 * @param indexer the indexer to use
 * @param indexParameters the parameters used to create the index
 * @throws IndexException if the index cannot be created or the version
 *           file cannot be written
 */
@Override
public void setIndexer(Indexer indexer, Map indexParameters)
    throws IndexException {
  this.indexer = indexer;
  this.indexParameters = indexParameters;
  this.indexURL = (URL)this.indexParameters.get(Constants.INDEX_LOCATION_URL);
  this.indexer.createIndex(this.indexParameters);

  // dump the version file
  OutputStreamWriter osw = null;
  try {
    File versionFile = getVersionFile();
    osw = new OutputStreamWriter(new FileOutputStream(versionFile));
    osw.write(versionNumber + Strings.getNl());
    String indexDirRelativePath =
        PersistenceManager.getRelativePath(storageDir.toURI().toURL(),
            indexURL);
    osw.write(indexDirRelativePath);
    // close explicitly so that a failure to flush is reported
    osw.close();
  } catch(IOException e) {
    throw new IndexException("couldn't write version file: " + e);
  } finally {
    // releases the file handle if anything above failed; closing an
    // already closed writer has no effect
    IOUtils.closeQuietly(osw);
  }
}
/**
 * Returns the indexer currently held by this datastore.
 */
@Override
public Indexer getIndexer() {
  return indexer;
}
/**
 * Sets the searcher to be used by this datastore. A
 * {@link LuceneSearcher} is additionally told that this is its
 * datastore.
 */
@Override
public void setSearcher(Searcher searcher) throws SearchException {
  this.searcher = searcher;
  if(searcher instanceof LuceneSearcher) {
    LuceneSearcher lucene = (LuceneSearcher)searcher;
    lucene.setLuceneDatastore(this);
  }
}
/**
 * Returns the searcher currently held by this datastore.
 */
@Override
public Searcher getSearcher() {
  return searcher;
}
/**
 * Sets how long, in milliseconds, the datastore waits after a sync
 * before it attempts to re-index the document. A further sync of the
 * same document within that period restarts the timer, so a burst of
 * changes to one document leads to a single re-indexing pass. With a
 * very long delay a document may not be indexed until the data store
 * is closed. The default is 1000ms (one second), which should be
 * appropriate for usage in the GATE GUI.
 *
 * @param indexDelay the delay in milliseconds
 */
public void setIndexDelay(long indexDelay) {
  long newDelay = indexDelay;
  this.indexDelay = newDelay;
}
/**
 * Returns the delay, in milliseconds, between a sync and the
 * re-indexing of the synced document.
 */
public long getIndexDelay() {
  return this.indexDelay;
}
/**
 * Search the datastore by handing the query and its parameters to the
 * current searcher.
 *
 * @param query the query to run
 * @param searchParameters the parameters passed on to the searcher
 * @return the value returned by the searcher
 */
@Override
public boolean search(String query, Map searchParameters)
    throws SearchException {
  boolean outcome = searcher.search(query, searchParameters);
  return outcome;
}
/**
 * Returns the next numberOfPatterns hits from the current searcher.
 *
 * @param numberOfPatterns the number of patterns to ask the searcher for
 * @return null if no patterns found
 */
@Override
public Hit[] next(int numberOfPatterns) throws SearchException {
  Hit[] hits = searcher.next(numberOfPatterns);
  return hits;
}
// Corpus Events
/**
 * This method is invoked whenever a document is removed from a corpus.
 * A document that has a persistence ID is removed from the index; an
 * event without a document ID is ignored.
 */
@Override
public void documentRemoved(CorpusEvent ce) {
  Object removedDocId = ce.getDocumentLRID();
  if(removedDocId == null) {
    return;
  }

  // the indexer takes the IDs to remove as a list
  ArrayList idsToRemove = new ArrayList();
  idsToRemove.add(removedDocId);
  try {
    synchronized(indexer) {
      indexer.remove(idsToRemove);
    }
  } catch(IndexException ie) {
    throw new GateRuntimeException(ie);
  }
}
/**
 * This method is invoked whenever a document is added to a particular
 * corpus. It intentionally does nothing.
 */
@Override
public void documentAdded(CorpusEvent ce) {
/*
 * we don't want to do anything here, because the sync is
 * automatically called when a document is added to a corpus which
 * is part of the datastore
 */
}
/*
 * (non-Javadoc)
 *
 * No-op: this class takes no action on this event.
 *
 * @see
 * gate.event.CreoleListener#datastoreClosed(gate.event.CreoleEvent)
 */
@Override
public void datastoreClosed(CreoleEvent e) {
// intentionally empty
}
/*
 * (non-Javadoc)
 *
 * No-op: this class takes no action on this event.
 *
 * @see
 * gate.event.CreoleListener#datastoreCreated(gate.event.CreoleEvent)
 */
@Override
public void datastoreCreated(CreoleEvent e) {
// intentionally empty
}
/*
 * (non-Javadoc)
 *
 * No-op: this class takes no action on this event.
 *
 * @see
 * gate.event.CreoleListener#datastoreOpened(gate.event.CreoleEvent)
 */
@Override
public void datastoreOpened(CreoleEvent e) {
// intentionally empty
}
/*
 * (non-Javadoc)
 *
 * No-op: this class takes no action on this event.
 *
 * @see
 * gate.event.CreoleListener#resourceLoaded(gate.event.CreoleEvent)
 */
@Override
public void resourceLoaded(CreoleEvent e) {
// intentionally empty
}
/*
 * (non-Javadoc)
 *
 * No-op: this class takes no action on this event.
 *
 * @see gate.event.CreoleListener#resourceRenamed(gate.Resource,
 * java.lang.String, java.lang.String)
 */
@Override
public void resourceRenamed(Resource resource, String oldName, String newName) {
// intentionally empty
}
/*
 * (non-Javadoc)
 *
 * When the unloaded resource is a corpus, this datastore stops
 * listening to it.
 *
 * @see
 * gate.event.CreoleListener#resourceUnloaded(gate.event.CreoleEvent)
 */
@Override
public void resourceUnloaded(CreoleEvent e) {
  Resource unloaded = e.getResource();
  if(!(unloaded instanceof Corpus)) {
    return;
  }
  // remove the corpus listener associated with the corpus being closed
  Corpus corpus = (Corpus)unloaded;
  corpus.removeCorpusListener(this);
}
/**
 * Task that re-indexes the document with a given LR ID: the document
 * is loaded from the datastore as a hidden resource, removed from the
 * index and added back together with the ID of the corpus that
 * contains it (if one is found). The temporary document and any
 * temporarily loaded corpus are always unloaded again, also when
 * indexing fails.
 *
 * A task runs at most once; a task that has been disabled does
 * nothing when run.
 */
protected class IndexingTask implements Runnable {
  /** Set once the task has been superseded, cancelled or started. */
  private final AtomicBoolean disabled = new AtomicBoolean(false);

  /** Persistence ID of the LR this task re-indexes. */
  private final Object lrID;

  public IndexingTask(Object lrID) {
    this.lrID = lrID;
  }

  /** Prevents this task from doing any work when it is run. */
  public void disable() {
    disabled.set(true);
  }

  @Override
  public void run() {
    // remove this task from the currentTasks map if it has not been
    // superseded by a later task
    currentTasks.remove(lrID, this);
    // only run the rest of the process if this task has not been
    // disabled (because a newer task for the same LR was scheduled).
    // We set the disabled flag at this point so the same task cannot
    // be run twice.
    if(!disabled.compareAndSet(false, true)) {
      return;
    }

    Document doc = loadDocument();
    if(doc == null) {
      return;
    }

    try {
      // re-indexing means removing the old entry and adding a new one
      removeFromIndex();
      try {
        String corpusPID = findLoadedCorpusContaining(doc);
        if(corpusPID == null) {
          /*
           * it is also possible that the document is loaded from
           * datastore without being loaded from the corpus (e.g.
           * using getLR(...) method of datastore) in this case the
           * relevant corpus won't exist in memory
           */
          corpusPID = findStoredCorpusContaining(doc);
        }
        ArrayList added = new ArrayList();
        added.add(doc);
        synchronized(indexer) {
          indexer.add(corpusPID, added);
        }
      } catch(Exception ie) {
        ie.printStackTrace();
      }
    } finally {
      // the document was loaded only for indexing: unload it whether
      // or not indexing succeeded
      try {
        Factory.deleteResource(doc);
      } catch(RuntimeException re) {
        re.printStackTrace();
      }
    }
  }

  /**
   * Loads the document with this task's LR ID from the datastore as a
   * hidden resource, or returns null if it cannot be instantiated.
   */
  private Document loadDocument() {
    FeatureMap features = Factory.newFeatureMap();
    features.put(DataStore.LR_ID_FEATURE_NAME, lrID);
    features.put(DataStore.DATASTORE_FEATURE_NAME, LuceneDataStoreImpl.this);
    FeatureMap hidefeatures = Factory.newFeatureMap();
    Gate.setHiddenAttribute(hidefeatures, true);
    try {
      // lock the LR ID so we don't try and read a file
      // which is in the process of being written
      Object lock = lockObjectForID(lrID);
      synchronized(lock) {
        return (Document)Factory.createResource(
            "gate.corpora.DocumentImpl", features, hidefeatures);
      }
    } catch(ResourceInstantiationException rie) {
      // this means the LR ID was null
      return null;
    }
  }

  /** Removes this task's LR ID from the index. */
  private void removeFromIndex() {
    ArrayList removed = new ArrayList();
    removed.add(lrID);
    try {
      synchronized(indexer) {
        indexer.remove(removed);
      }
    } catch(IndexException ie) {
      throw new GateRuntimeException(ie);
    }
  }

  /**
   * Looks through the serial corpora currently loaded in memory and
   * returns the persistence ID of the first one that contains the
   * document, or null if there is none.
   */
  private String findLoadedCorpusContaining(Document doc) {
    List loadedCorpora =
        Gate.getCreoleRegister().getLrInstances(
            SerialCorpusImpl.class.getName());
    if(loadedCorpora == null) {
      return null;
    }
    for(Object candidate : loadedCorpora) {
      SerialCorpusImpl sci = (SerialCorpusImpl)candidate;
      if(sci != null && sci.contains(doc)) {
        return sci.getLRPersistenceId().toString();
      }
    }
    return null;
  }

  /**
   * Loads each corpus stored in this datastore in turn (as a hidden
   * resource, under the lock for its ID) and returns the persistence
   * ID of the first one that contains the document, or null if there
   * is none. Every corpus loaded here is unloaded again.
   */
  private String findStoredCorpusContaining(Document doc)
      throws PersistenceException, ResourceInstantiationException {
    List corpusPIDs = getLrIds(SerialCorpusImpl.class.getName());
    if(corpusPIDs == null) {
      return null;
    }
    for(Object corpusID : corpusPIDs) {
      // we will have to load this corpus
      FeatureMap params = Factory.newFeatureMap();
      params.put(DataStore.DATASTORE_FEATURE_NAME, LuceneDataStoreImpl.this);
      params.put(DataStore.LR_ID_FEATURE_NAME, corpusID);
      FeatureMap hidefeatures = Factory.newFeatureMap();
      Gate.setHiddenAttribute(hidefeatures, true);

      SerialCorpusImpl corpusLR;
      Object lock = lockObjectForID(corpusID);
      synchronized(lock) {
        corpusLR =
            (SerialCorpusImpl)Factory.createResource(
                SerialCorpusImpl.class.getCanonicalName(), params,
                hidefeatures);
      }
      if(corpusLR == null) {
        continue;
      }
      try {
        if(corpusLR.contains(doc)) {
          return corpusLR.getLRPersistenceId().toString();
        }
      } finally {
        // unload the temporary corpus even if the check above throws
        Factory.deleteResource(corpusLR);
      }
    }
    return null;
  }
}
/**
* Soft reference with an associated label.
*/
private class LabelledSoftReference extends SoftReference {
Object label;
public LabelledSoftReference(Object referent) {
super(referent);
}
}
}