gate.corpora.SerialCorpusImpl
GATE - General Architecture for Text Engineering - is open source
software capable of solving almost any text processing problem. This
artifact enables you to embed the core of GATE Embedded with its essential
dependencies. You will be able to use the GATE Embedded API and to load and
store GATE XML documents. This artifact is the right dependency for
CREOLE plugins, or for applications that need to customize the GATE
dependencies because of conflicts with their own dependencies or to reduce
their footprint.
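As a quick orientation before the source listing, here is a minimal sketch of an embedding session that stores a document and a corpus in a serial datastore; once adopted by the datastore, the corpus is backed by a persistent implementation such as the SerialCorpusImpl listed below. The storage path, document URL and corpus name are placeholders, and the sketch assumes a GATE version whose DataStore.adopt() takes a single argument (as in the code below); check the calls against the GATE release you actually depend on.

import gate.Corpus;
import gate.DataStore;
import gate.Document;
import gate.Factory;
import gate.Gate;

import java.io.File;
import java.net.URL;

public class EmbeddedGateSketch {
  public static void main(String[] args) throws Exception {
    Gate.init(); // initialise GATE Embedded (assumes the GATE home can be resolved)

    // create a serial datastore in a local directory (placeholder path)
    String storageUrl = new File("/tmp/gate-ds").toURI().toURL().toString();
    DataStore ds = Factory.createDataStore("gate.persist.SerialDataStore", storageUrl);

    // load a document and adopt it into the datastore
    Document doc = Factory.newDocument(new URL("http://gate.ac.uk"));
    Document persistentDoc = (Document)ds.adopt(doc);
    ds.sync(persistentDoc);

    // a transient corpus becomes a persistent (serial) corpus once adopted
    Corpus corpus = Factory.newCorpus("myCorpus");
    corpus.add(persistentDoc);
    Corpus persistentCorpus = (Corpus)ds.adopt(corpus);
    ds.sync(persistentCorpus);

    ds.close();
  }
}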
/*
* SerialCorpusImpl.java
*
* Copyright (c) 1995-2012, The University of Sheffield. See the file
* COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
*
* This file is part of GATE (see http://gate.ac.uk/), and is free
* software, licenced under the GNU Library General Public License,
* Version 2, June 1991 (in the distribution as file licence.html,
* and also available at http://gate.ac.uk/gate/licence.html).
*
* Kalina Bontcheva, 19/Oct/2001
*
* $Id: SerialCorpusImpl.java 17841 2014-04-16 12:35:41Z markagreenwood $
*/
package gate.corpora;
import gate.Corpus;
import gate.DataStore;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.GateConstants;
import gate.Resource;
import gate.creole.AbstractLanguageResource;
import gate.creole.CustomDuplication;
import gate.creole.ResourceInstantiationException;
import gate.creole.ir.IREngine;
import gate.creole.ir.IndexDefinition;
import gate.creole.ir.IndexException;
import gate.creole.ir.IndexManager;
import gate.creole.ir.IndexStatistics;
import gate.creole.ir.IndexedCorpus;
import gate.creole.metadata.CreoleResource;
import gate.event.CorpusEvent;
import gate.event.CorpusListener;
import gate.event.CreoleEvent;
import gate.event.CreoleListener;
import gate.event.DatastoreEvent;
import gate.event.DatastoreListener;
import gate.persist.PersistenceException;
import gate.util.Err;
import gate.util.GateRuntimeException;
import gate.util.MethodNotImplementedException;
import gate.util.Out;
import java.io.FileFilter;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.Vector;
// The initial design was to implement this on the basis of a WeakValueHashMap.
// However this creates problems, because the user might e.g., add a transient
// document to the corpus and then if the Document variable goes out of scope
// before sync() is called, nothing will be saved of the new document. Bad!
// Instead, to cope with the unloading for memory saving use, I implemented
// a documentUnload() method, which sets the in-memory copy to null but can
// always restore the doc, because it has its persistence ID.
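// As an illustration of the unload/restore behaviour described above, a
// hypothetical usage sketch (the corpus variable and index are assumed,
// not part of this file):
//
//   SerialCorpusImpl corpus = ...;   // a corpus already adopted by a serial datastore
//   corpus.unloadDocument(0, true);  // sync the document, then drop the in-memory copy
//   Document doc = corpus.get(0);    // get() re-creates the document from its persistence ID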
@CreoleResource(name = "GATE Serial Corpus", isPrivate = true, comment = "GATE persistent corpus (serialisation)", icon = "corpus", helpURL = "http://gate.ac.uk/userguide/sec:developer:datastores")
public class SerialCorpusImpl extends AbstractLanguageResource
implements Corpus, CreoleListener, DatastoreListener, IndexedCorpus,
CustomDuplication {
/** Debug flag */
private static final boolean DEBUG = false;
static final long serialVersionUID = 3632609241787241616L;
protected transient Vector<CorpusListener> corpusListeners;
protected List<DocumentData> docDataList = null;
// here I keep document index as key (same as the index in docDataList
// which defines the document order) and Documents as value
protected transient List<Document> documents = null;
protected transient IndexManager indexManager = null;
protected transient List<Document> addedDocs = null;
protected transient List<String> removedDocIDs = null;
protected transient List<Document> changedDocs = null;
public SerialCorpusImpl() {
}
/**
* Constructor to create a SerialCorpus from a transient one. This is
* called by adopt() to store the transient corpus and re-route the
* methods calls to it, until the corpus is sync-ed on disk. After
* that, the transientCorpus will always be null, so the new
* functionality will be used instead.
*/
protected SerialCorpusImpl(Corpus tCorpus) {
// copy the corpus name and features from the one in memory
this.setName(tCorpus.getName());
this.setFeatures(tCorpus.getFeatures());
docDataList = new ArrayList();
// now cache the names of all docs for future use
List<String> docNames = tCorpus.getDocumentNames();
for(int i = 0; i < docNames.size(); i++) {
Document doc = tCorpus.get(i);
docDataList.add(new DocumentData(docNames.get(i), null, doc
.getClass().getName()));
}
// copy all the documents from the transient corpus
documents = new ArrayList();
documents.addAll(tCorpus);
// make sure we fire events when docs are added/removed/etc
Gate.getCreoleRegister().addCreoleListener(this);
}
/**
* Gets the names of the documents in this corpus.
*
* @return a {@link List} of Strings representing the names of the
* documents in this corpus.
*/
@Override
public List<String> getDocumentNames() {
List<String> docsNames = new ArrayList<String>();
if(docDataList == null) return docsNames;
for(Object aDocDataList : docDataList) {
DocumentData data = (DocumentData)aDocDataList;
docsNames.add(data.getDocumentName());
}
return docsNames;
}
/**
* Gets the persistent IDs of the documents in this corpus.
*
* @return a {@link List} of Objects representing the persistent IDs
* of the documents in this corpus.
*/
public List getDocumentPersistentIDs() {
List docsIDs = new ArrayList();
if(docDataList == null) return docsIDs;
Iterator<DocumentData> iter = docDataList.iterator();
while(iter.hasNext()) {
DocumentData data = iter.next();
docsIDs.add(data.getPersistentID());
}
return docsIDs;
}
/**
* Gets the class types of the documents in this corpus.
*
* @return a {@link List} of Strings representing the class names of
* the documents in this corpus.
*/
public List getDocumentClassTypes() {
List docsIDs = new ArrayList();
if(docDataList == null) return docsIDs;
Iterator<DocumentData> iter = docDataList.iterator();
while(iter.hasNext()) {
DocumentData data = iter.next();
docsIDs.add(data.getClassType());
}
return docsIDs;
}
/**
* This method should only be used by the Serial Datastore to set the
* persistent ID of a document in this corpus.
*/
public void setDocumentPersistentID(int index, Object persID) {
if(index >= docDataList.size()) return;
docDataList.get(index).setPersistentID(persID);
if(DEBUG) Out.prln("IDs are now: " + docDataList);
}
/**
* Gets the name of a document in this corpus.
*
* @param index the index of the document
* @return a String value representing the name of the document at
* index in this corpus.
*
*/
@Override
public String getDocumentName(int index) {
if(index >= docDataList.size()) return "No such document";
return docDataList.get(index).getDocumentName();
}
/**
* Gets the persistent ID of a document in this corpus.
*
* @param index the index of the document
* @return a value representing the persistent ID of the document at
* index in this corpus.
*
*/
public Object getDocumentPersistentID(int index) {
if(index >= docDataList.size()) return null;
return docDataList.get(index).getPersistentID();
}
public String getDocumentClassType(int index) {
if(index >= docDataList.size()) return null;
return docDataList.get(index).getClassType();
}
/**
* Unloads a document from memory.
*
* @param index the index of the document to be unloaded.
* @param sync should the document be sync'ed (i.e. saved) before
* unloading.
*/
public void unloadDocument(int index, boolean sync) {
// 1. check whether it's been loaded and is a persistent one
// if a persistent doc is not loaded, there's nothing we need to do
if((!isDocumentLoaded(index)) && isPersistentDocument(index)) return;
// 2. If requested, sync the document before releasing it from memory,
// because the creole register garbage collects all LRs which are not
// used any more
if(sync) {
Document doc = documents.get(index);
try {
// if the document is not already adopted, we need to do that
// first
if(doc.getLRPersistenceId() == null) {
doc = (Document)this.getDataStore().adopt(doc);
this.getDataStore().sync(doc);
this.setDocumentPersistentID(index, doc.getLRPersistenceId());
}
else // if it is adopted, just sync it
this.getDataStore().sync(doc);
}
catch(PersistenceException ex) {
throw new GateRuntimeException("Error unloading document from corpus "
+ "because document sync failed: " + ex.getMessage(), ex);
}
}
// 3. remove the document from the memory
// do this, only if the saving has succeeded
documents.set(index, null);
}
/**
* Unloads a document from memory
*
* @param doc the document to be unloaded
* @param sync should the document be sync'ed (i.e. saved) before
* unloading.
*/
public void unloadDocument(Document doc, boolean sync) {
if(DEBUG) Out.prln("Document to be unloaded :" + doc.getName());
// 1. determine the index of the document; if not there, do nothing
int index = findDocument(doc);
if(index == -1) return;
if(DEBUG) Out.prln("Index of doc: " + index);
if(DEBUG) Out.prln("Size of corpus: " + documents.size());
unloadDocument(index, sync);
// documents.remove(new Integer(index));
}
/**
* Unloads a document from memory, calling sync() first, to store the
* changes.
*
* @param doc the document to be unloaded.
*/
@Override
public void unloadDocument(Document doc) {
unloadDocument(doc, true);
}
/**
* Unloads the document from memory, calling sync() first, to store
* the changes.
*
* @param index the index of the document to be unloaded.
*/
public void unloadDocument(int index) {
unloadDocument(index, true);
}
/**
* This method returns true when the document is already loaded in
* memory
*/
@Override
public boolean isDocumentLoaded(int index) {
if(documents == null || documents.isEmpty()) return false;
return documents.get(index) != null;
}
/**
* This method returns true when the document is already stored on
* disk i.e., is not transient
*/
public boolean isPersistentDocument(int index) {
if(documents == null || documents.isEmpty()) return false;
return (docDataList.get(index).getPersistentID() != null);
}
/**
* Every LR that is a CreoleListener (and other Listeners too) must
* override this method and make sure it removes itself from the
* objects which it has been listening to. Otherwise, the object will
* not be released from memory (memory leak!).
*/
@Override
public void cleanup() {
if(DEBUG) Out.prln("serial corpus cleanup called");
if(corpusListeners != null) corpusListeners = null;
if(documents != null) documents.clear();
docDataList.clear();
Gate.getCreoleRegister().removeCreoleListener(this);
if(this.dataStore != null) {
this.dataStore.removeDatastoreListener(this);
}
}
/**
* Fills this corpus with documents created from files in a directory.
*
* @param filter the file filter used to select files from the target
* directory. If the filter is null all the files
* will be accepted.
* @param directory the directory from which the files will be picked.
* This parameter is an URL for uniformity. It needs to be a
* URL of type file otherwise an InvalidArgumentException
* will be thrown. An implementation for this method is
* provided as a static method at
* {@link gate.corpora.CorpusImpl#populate(Corpus, URL, FileFilter, String, boolean)}
* .
* @param encoding the encoding to be used for reading the documents
* @param recurseDirectories should the directory be parsed
* recursively?. If true all the files from the
* provided directory and all its children directories (on as
* many levels as necessary) will be picked if accepted by
* the filter otherwise the children directories will be
* ignored.
*/
@Override
public void populate(URL directory, FileFilter filter, String encoding,
boolean recurseDirectories) throws IOException,
ResourceInstantiationException {
CorpusImpl.populate(this, directory, filter, encoding, recurseDirectories);
}
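// A hypothetical usage sketch for the populate() overload above (the
// directory path and encoding are assumed, not part of this file):
//
//   URL dir = new File("/data/texts").toURI().toURL();
//   corpus.populate(dir, null, "UTF-8", true); // null filter accepts all files, recurse into sub-directories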
/**
* Fills this corpus with documents created from files in a directory.
*
* @param filter the file filter used to select files from the target
* directory. If the filter is null all the files
* will be accepted.
* @param directory the directory from which the files will be picked.
* This parameter is an URL for uniformity. It needs to be a
* URL of type file otherwise an InvalidArgumentException
* will be thrown. An implementation for this method is
* provided as a static method at
* {@link gate.corpora.CorpusImpl#populate(Corpus, URL, FileFilter, String, boolean)}
* .
* @param encoding the encoding to be used for reading the documents
* @param recurseDirectories should the directory be parsed
* recursively?. If true all the files from the
* provided directory and all its children directories (on as
* many levels as necessary) will be picked if accepted by
* the filter otherwise the children directories will be
* ignored.
*/
@Override
public void populate(URL directory, FileFilter filter, String encoding,
String mimeType, boolean recurseDirectories) throws IOException,
ResourceInstantiationException {
CorpusImpl.populate(this, directory, filter, encoding, mimeType,
recurseDirectories);
}
/**
* Fills the provided corpus with documents extracted from the
* provided single concatenated file.
*
* @param singleConcatenatedFile the single concatenated file.
* @param documentRootElement content between the start and end of
* this element is considered for documents.
* @param encoding the encoding of the trec file.
* @param numberOfFilesToExtract indicates the number of files to
* extract from the trecweb file.
* @param documentNamePrefix the prefix to use for document names when
* creating from
* @param mimeType the mime type which determines how the document is handled
* @return total length of populated documents in the corpus in number
* of bytes
*/
@Override
public long populate(URL singleConcatenatedFile, String documentRootElement,
String encoding, int numberOfFilesToExtract,
String documentNamePrefix, String mimeType, boolean includeRootElement) throws IOException,
ResourceInstantiationException {
return CorpusImpl.populate(this, singleConcatenatedFile,
documentRootElement, encoding, numberOfFilesToExtract,
documentNamePrefix, mimeType, includeRootElement);
}
@Override
public synchronized void removeCorpusListener(CorpusListener l) {
if(corpusListeners != null && corpusListeners.contains(l)) {
@SuppressWarnings("unchecked")
Vector<CorpusListener> v = (Vector<CorpusListener>)corpusListeners.clone();
v.removeElement(l);
corpusListeners = v;
}
}
@Override
public synchronized void addCorpusListener(CorpusListener l) {
@SuppressWarnings("unchecked")
Vector<CorpusListener> v = corpusListeners == null
? new Vector<CorpusListener>(2)
: (Vector<CorpusListener>)corpusListeners.clone();
if(!v.contains(l)) {
v.addElement(l);
corpusListeners = v;
}
}
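// A hypothetical listener sketch (the accessor getDocumentIndex() on
// gate.event.CorpusEvent is assumed, not part of this file):
//
//   corpus.addCorpusListener(new CorpusListener() {
//     @Override public void documentAdded(CorpusEvent e) { Out.prln("added: " + e.getDocumentIndex()); }
//     @Override public void documentRemoved(CorpusEvent e) { Out.prln("removed: " + e.getDocumentIndex()); }
//   });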
protected void fireDocumentAdded(CorpusEvent e) {
if(corpusListeners != null) {
Vector<CorpusListener> listeners = corpusListeners;
int count = listeners.size();
for(int i = 0; i < count; i++) {
listeners.elementAt(i).documentAdded(e);
}
}
}
protected void fireDocumentRemoved(CorpusEvent e) {
if(corpusListeners != null) {
Vector<CorpusListener> listeners = corpusListeners;
int count = listeners.size();
for(int i = 0; i < count; i++) {
listeners.elementAt(i).documentRemoved(e);
}
}
}
@Override
public void resourceLoaded(CreoleEvent e) {
}
@Override
public void resourceRenamed(Resource resource, String oldName, String newName) {
}
@Override
public void resourceUnloaded(CreoleEvent e) {
Resource res = e.getResource();
if(res instanceof Document) {
Document doc = (Document)res;
if(DEBUG) Out.prln("resource Unloaded called ");
// remove from the corpus too, if a transient one
if(doc.getDataStore() != this.getDataStore()) {
this.remove(doc);
}
else {
// unload all occurrences
int index = indexOf(res);
if(index < 0) return;
documents.set(index, null);
if(DEBUG)
Out.prln("corpus: document " + index + " unloaded and set to null");
} // if
}
}
@Override
public void datastoreOpened(CreoleEvent e) {
}
@Override
public void datastoreCreated(CreoleEvent e) {
}
@Override
public void datastoreClosed(CreoleEvent e) {
if(!e.getDatastore().equals(this.getDataStore())) return;
if(this.getDataStore() != null)
this.getDataStore().removeDatastoreListener(this);
// close this corpus, since it cannot stay open when the DS it comes
// from
// is closed
Factory.deleteResource(this);
}
/**
* Called by a datastore when a new resource has been adopted
*/
@Override
public void resourceAdopted(DatastoreEvent evt) {
}
/**
* Called by a datastore when a resource has been deleted
*/
@Override
public void resourceDeleted(DatastoreEvent evt) {
DataStore ds = (DataStore)evt.getSource();
// 1. check whether this datastore fired the event. If not, return.
if(!ds.equals(this.dataStore)) return;
Object docID = evt.getResourceID();
if(docID == null) return;
if(DEBUG) Out.prln("Resource deleted called for: " + docID);
// first check whether it is this corpus that has been deleted; if so,
// it must be unloaded immediately
if(docID.equals(this.getLRPersistenceId())) {
Factory.deleteResource(this);
return;
}// if
boolean isDirty = false;
// the problem here is that I only have the doc persistent ID
// and nothing else, so I need to determine the index of the doc
// first
for(int i = 0; i < docDataList.size(); i++) {
DocumentData docData = docDataList.get(i);
// we've found the correct document
// don't break the loop, because it might appear more than once
if(docID.equals(docData.getPersistentID())) {
if(evt.getResource() == null) {
// instead of calling remove() which tries to load the
// document
// remove it from the documents and docDataList
documentRemoved(docDataList.get(i).persistentID
.toString());
docDataList.remove(i);
documents.remove(i);
isDirty = true;
i--;
continue;
}
remove(i);
isDirty = true;
}// if
}// for loop through the doc data
if(isDirty) try {
this.dataStore.sync(this);
}
catch(PersistenceException ex) {
throw new GateRuntimeException("SerialCorpusImpl: " + ex.getMessage());
}
catch(SecurityException sex) {
throw new GateRuntimeException("SerialCorpusImpl: " + sex.getMessage());
}
}// resourceDeleted
/**
* Called by a datastore when a resource has been written to the
* datastore
*/
@Override
public void resourceWritten(DatastoreEvent evt) {
if(evt.getResourceID().equals(this.getLRPersistenceId())) {
thisResourceWritten();
}
}
// List methods
// java docs will be automatically copied from the List interface.
@Override
public int size() {
return docDataList.size();
}
@Override
public boolean isEmpty() {
return docDataList.isEmpty();
}
@Override
public boolean contains(Object o) {
// return true if:
// - the document data list contains a document with such a name
// and persistent id
if(!(o instanceof Document)) return false;
int index = findDocument((Document)o);
if(index < 0)
return false;
else return true;
}
@Override
public Iterator<Document> iterator() {
return new Iterator<Document>() {
Iterator<DocumentData> docDataIter = docDataList.iterator();
@Override
public boolean hasNext() {
return docDataIter.hasNext();
}
@Override
public Document next() {
// try finding a document with the same name and persistent ID
DocumentData docData = docDataIter.next();
int index = docDataList.indexOf(docData);
return SerialCorpusImpl.this.get(index);
}
@Override
public void remove() {
throw new UnsupportedOperationException("SerialCorpusImpl does not "
+ "support remove in the iterators");
}
}; // return
}// iterator
@Override
public String toString() {
return "document data " + docDataList.toString() + " documents "
+ documents;
}
@Override
public Object[] toArray() {
// there is a problem here, because some docs might not be
// instantiated
throw new MethodNotImplementedException(
"toArray() is not implemented for SerialCorpusImpl");
}
@Override
public <T> T[] toArray(T[] a) {
// there is a problem here, because some docs might not be
// instantiated
throw new MethodNotImplementedException(
"toArray(Object[] a) is not implemented for SerialCorpusImpl");
}
@Override
public boolean add(Document o) {
if(o == null) return false;
Document doc = o;
// make it accept only docs from its own datastore
if(doc.getDataStore() != null && !this.dataStore.equals(doc.getDataStore())) {
Err.prln("Error: Persistent corpus can only accept documents "
+ "from its own datastore!");
return false;
}// if
// add the document with its index in the docDataList
// in this case, since it's going to be added to the end
// the index will be the size of the docDataList before
// the addition
DocumentData docData = new DocumentData(doc.getName(), doc
.getLRPersistenceId(), doc.getClass().getName());
boolean result = docDataList.add(docData);
documents.add(doc);
documentAdded(doc);
fireDocumentAdded(new CorpusEvent(SerialCorpusImpl.this, doc, docDataList
.size() - 1, doc.getLRPersistenceId(), CorpusEvent.DOCUMENT_ADDED));
return result;
}
@Override
public boolean remove(Object o) {
if(DEBUG) Out.prln("SerialCorpus:Remove object called");
if(!(o instanceof Document)) return false;
Document doc = (Document)o;
// see if we can find it first. If not, then just return
int index = findDocument(doc);
if(index == -1) return false;
if(index < docDataList.size()) { // we found it, so remove it
// by Andrey Shafirin: this part of code can produce an exception
// if the document wasn't loaded
String docName = docDataList.get(index).getDocumentName();
Object docPersistentID = getDocumentPersistentID(index);
docDataList.remove(index);
// Document oldDoc = (Document) documents.remove(index);
documents.remove(index);
// if (DEBUG) Out.prln("documents after remove of " +
// oldDoc.getName()
// + " are " + documents);
if(DEBUG)
Out.prln("documents after remove of " + docName + " are " + documents);
// documentRemoved(oldDoc.getLRPersistenceId().toString());
if(docPersistentID != null) documentRemoved(docPersistentID.toString());
// fireDocumentRemoved(new CorpusEvent(SerialCorpusImpl.this,
// oldDoc,
// index,
// CorpusEvent.DOCUMENT_REMOVED));
fireDocumentRemoved(new CorpusEvent(SerialCorpusImpl.this, (Document)o,
index, docPersistentID, CorpusEvent.DOCUMENT_REMOVED));
}
return true;
}
public int findDocument(Document doc) {
boolean found = false;
DocumentData docData = null;
// first try finding the document in memory
int index = documents.indexOf(doc);
if(index > -1 && index < docDataList.size()) return index;
// else try finding a document with the same name and persistent ID
Iterator<DocumentData> iter = docDataList.iterator();
for(index = 0; iter.hasNext(); index++) {
docData = iter.next();
if(docData.getDocumentName().equals(doc.getName())
&& docData.getPersistentID().equals(doc.getLRPersistenceId())
&& docData.getClassType().equals(doc.getClass().getName())) {
found = true;
break;
}
}
if(found && index < docDataList.size())
return index;
else return -1;
}// findDocument
@Override
public boolean containsAll(Collection<?> c) {
Iterator<?> iter = c.iterator();
while(iter.hasNext()) {
if(!contains(iter.next())) return false;
}
return true;
}
@Override
public boolean addAll(Collection<? extends Document> c) {
boolean allAdded = true;
Iterator<? extends Document> iter = c.iterator();
while(iter.hasNext()) {
if(!add(iter.next())) allAdded = false;
}
return allAdded;
}
@Override
public boolean addAll(int index, Collection<? extends Document> c) {
throw new UnsupportedOperationException();
}
@Override
public boolean removeAll(Collection<?> c) {
boolean allRemoved = true;
Iterator<?> iter = c.iterator();
while(iter.hasNext()) {
if(!remove(iter.next())) allRemoved = false;
}
return allRemoved;
}
@Override
public boolean retainAll(Collection<?> c) {
throw new UnsupportedOperationException();
}
@Override
public void clear() {
documents.clear();
docDataList.clear();
}
@Override
public boolean equals(Object o) {
if(o == null) return false;
if(!(o instanceof SerialCorpusImpl)) return false;
SerialCorpusImpl oCorpus = (SerialCorpusImpl)o;
if(oCorpus == this) return true;
if((oCorpus.lrPersistentId == this.lrPersistentId || (this.lrPersistentId != null && this.lrPersistentId
.equals(oCorpus.lrPersistentId)))
&& oCorpus.name.equals(this.name)
&& (oCorpus.dataStore == this.dataStore || oCorpus.dataStore
.equals(this.dataStore))
&& oCorpus.docDataList.equals(docDataList)) return true;
return false;
}
@Override
public int hashCode() {
return docDataList.hashCode();
}
@Override
public Document get(int index) {
if(index >= docDataList.size()) return null;
Document res = documents.get(index);
if(DEBUG)
Out.prln("SerialCorpusImpl: get(): index " + index + "result: " + res);
// if the document is null, then I must get it from the DS
if(res == null) {
FeatureMap parameters = Factory.newFeatureMap();
parameters.put(DataStore.DATASTORE_FEATURE_NAME, this.dataStore);
try {
parameters.put(DataStore.LR_ID_FEATURE_NAME, docDataList
.get(index).getPersistentID());
Document lr = (Document) Factory.createResource(docDataList
.get(index).getClassType(), parameters);
if(DEBUG) Out.prln("Loaded document :" + lr.getName());
// change the result to the newly loaded doc
res = lr;
// finally replace the doc with the instantiated version
documents.set(index, lr);
}
catch(ResourceInstantiationException ex) {
Err.prln("Error reading document inside a serialised corpus.");
throw new GateRuntimeException(ex);
}
}
return res;
}
@Override
public Document set(int index, Document element) {
throw new gate.util.MethodNotImplementedException();
// fire the 2 events
/*
* fireDocumentRemoved(new CorpusEvent(SerialCorpusImpl.this,
* oldDoc, ((Integer) key).intValue(),
* CorpusEvent.DOCUMENT_REMOVED)); fireDocumentAdded(new
* CorpusEvent(SerialCorpusImpl.this, newDoc, ((Integer)
* key).intValue(), CorpusEvent.DOCUMENT_ADDED));
*/
}
@Override
public void add(int index, Document o) {
if(o == null) return;
Document doc = o;
DocumentData docData = new DocumentData(doc.getName(), doc
.getLRPersistenceId(), doc.getClass().getName());
docDataList.add(index, docData);
documents.add(index, doc);
documentAdded(doc);
fireDocumentAdded(new CorpusEvent(SerialCorpusImpl.this, doc, index, doc
.getLRPersistenceId(), CorpusEvent.DOCUMENT_ADDED));
}
@Override
public Document remove(int index) {
if(DEBUG) Out.prln("Remove index called");
// try to get the actual document if it was loaded
Document res = isDocumentLoaded(index) ? get(index) : null;
Object docLRID = docDataList.get(index).persistentID;
if(docLRID != null) documentRemoved(docLRID.toString());
docDataList.remove(index);
documents.remove(index);
fireDocumentRemoved(new CorpusEvent(SerialCorpusImpl.this, res, index,
docLRID, CorpusEvent.DOCUMENT_REMOVED));
return res;
}
@Override
public int indexOf(Object o) {
if(o instanceof Document) return findDocument((Document)o);
return -1;
}
@Override
public int lastIndexOf(Object o) {
throw new gate.util.MethodNotImplementedException();
}
@Override
public ListIterator listIterator() {
throw new gate.util.MethodNotImplementedException();
}
@Override
public ListIterator listIterator(int index) {
throw new gate.util.MethodNotImplementedException();
}
/**
* A persistent corpus does not support this method, as not all the
* documents might be in memory
*/
@Override
public List subList(int fromIndex, int toIndex) {
throw new gate.util.MethodNotImplementedException();
}
@Override
public void setDataStore(DataStore dataStore)
throws gate.persist.PersistenceException {
super.setDataStore(dataStore);
if(this.dataStore != null) this.dataStore.addDatastoreListener(this);
}
public void setTransientSource(Object source) {
if(!(source instanceof Corpus)) return;
// the following initialisation is only valid when we're constructing
// this object from a transient one. If it has already been stored in
// a datastore, then the initialisation is done in readObject(), since
// that method is the one called by serialisation when objects are
// restored.
if(this.dataStore != null && this.lrPersistentId != null) return;
Corpus tCorpus = (Corpus)source;
// copy the corpus name and features from the one in memory
this.setName(tCorpus.getName());
this.setFeatures(tCorpus.getFeatures());
docDataList = new ArrayList();
// now cache the names of all docs for future use
List<String> docNames = tCorpus.getDocumentNames();
for(int i = 0; i < docNames.size(); i++) {
Document aDoc = tCorpus.get(i);
docDataList.add(new DocumentData(docNames.get(i), null, aDoc
.getClass().getName()));
}
// copy all the documents from the transient corpus
documents = new ArrayList();
documents.addAll(tCorpus);
this.addedDocs = new Vector();
this.removedDocIDs = new Vector();
this.changedDocs = new Vector();
// make sure we fire events when docs are added/removed/etc
Gate.getCreoleRegister().addCreoleListener(this);
}
// we don't keep the transient source, so always return null
// Still this must be implemented, because of the GUI and the Factory
public Object getTransientSource() {
return null;
}
@Override
public Resource init() throws gate.creole.ResourceInstantiationException {
super.init();
return this;
}
/**
* readObject - calls the default readObject() and then initialises
* the transient data
*
* @serialData Read serializable fields. No optional data read.
*/
private void readObject(ObjectInputStream s) throws IOException,
ClassNotFoundException {
s.defaultReadObject();
documents = new ArrayList(docDataList.size());
for(int i = 0; i < docDataList.size(); i++)
documents.add(null);
corpusListeners = new Vector();
// finally set the creole listeners if the LR is like that
Gate.getCreoleRegister().addCreoleListener(this);
if(this.dataStore != null) this.dataStore.addDatastoreListener(this);
// if indexed construct the manager.
/*IndexDefinition definition = (IndexDefinition)this.getFeatures().get(
GateConstants.CORPUS_INDEX_DEFINITION_FEATURE_KEY);
if(definition != null) {
String className = definition.getIrEngineClassName();
try {
// Class aClass = Class.forName(className);
Class<?> aClass = Class.forName(className, true, Gate.getClassLoader());
IREngine engine = (IREngine)aClass.newInstance();
this.indexManager = engine.getIndexmanager();
this.indexManager.setIndexDefinition(definition);
this.indexManager.setCorpus(this);
}
catch(Exception e) {
e.printStackTrace(Err.getPrintWriter());
}
// switch (definition.getIndexType()) {
// case GateConstants.IR_LUCENE_INVFILE:
// this.indexManager = new LuceneIndexManager();
// this.indexManager.setIndexDefinition(definition);
// this.indexManager.setCorpus(this);
// break;
// }
this.addedDocs = new Vector();
this.removedDocIDs = new Vector();
this.changedDocs = new Vector();
}*/
}// readObject
@Override
public void setIndexDefinition(IndexDefinition definition) {
if(definition != null) {
this.getFeatures().put(GateConstants.CORPUS_INDEX_DEFINITION_FEATURE_KEY,
definition);
String className = definition.getIrEngineClassName();
try {
// Class aClass = Class.forName(className);
Class<?> aClass = Class.forName(className, true, Gate.getClassLoader());
IREngine engine = (IREngine)aClass.newInstance();
this.indexManager = engine.getIndexmanager();
this.indexManager.setIndexDefinition(definition);
this.indexManager.setCorpus(this);
}
catch(Exception e) {
e.printStackTrace(Err.getPrintWriter());
}
// switch (definition.getIndexType()) {
// case GateConstants.IR_LUCENE_INVFILE:
// this.indexManager = new LuceneIndexManager();
// this.indexManager.setIndexDefinition(definition);
// this.indexManager.setCorpus(this);
// break;
// }
this.addedDocs = new Vector();
this.removedDocIDs = new Vector();
this.changedDocs = new Vector();
}
}
@Override
public IndexDefinition getIndexDefinition() {
return (IndexDefinition)this.getFeatures().get(
GateConstants.CORPUS_INDEX_DEFINITION_FEATURE_KEY);
}
@Override
public IndexManager getIndexManager() {
return this.indexManager;
}
@Override
public IndexStatistics getIndexStatistics() {
return (IndexStatistics)this.getFeatures().get(
GateConstants.CORPUS_INDEX_STATISTICS_FEATURE_KEY);
}
private void documentAdded(Document doc) {
if(indexManager != null) {
addedDocs.add(doc);
}
}
private void documentRemoved(String lrID) {
if(indexManager != null) {
removedDocIDs.add(lrID);
}
}
private void thisResourceWritten() {
if(indexManager != null) {
try {
for(int i = 0; i < documents.size(); i++) {
if(documents.get(i) != null) {
Document doc = documents.get(i);
if(!addedDocs.contains(doc) && doc.isModified()) {
changedDocs.add(doc);
}
}
}
indexManager.sync(addedDocs, removedDocIDs, changedDocs);
}
catch(IndexException ie) {
ie.printStackTrace();
}
}
}
/**
* SerialCorpusImpl does not support duplication.
*/
@Override
public Resource duplicate(Factory.DuplicationContext ctx)
throws ResourceInstantiationException {
throw new ResourceInstantiationException("Duplication of "
+ this.getClass().getName() + " not permitted");
}
}