gate.compound.impl.AbstractCompoundDocument Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of alignment Show documentation
Show all versions of alignment Show documentation
A selection of tools for processing parallel texts.
package gate.compound.impl;
import gate.Annotation;
import gate.AnnotationSet;
import gate.DataStore;
import gate.Document;
import gate.DocumentContent;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.Resource;
import gate.compound.CompoundDocument;
import gate.corpora.DocumentContentImpl;
import gate.corpora.DocumentImpl;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.Optional;
import gate.event.CreoleEvent;
import gate.event.DatastoreEvent;
import gate.event.DocumentListener;
import gate.util.GateRuntimeException;
import gate.util.InvalidOffsetException;
import gate.util.Strings;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectInputValidation;
import java.io.StringWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Vector;
/**
* Abstract base implementation of CompoundDocument. This class overrides the
* methods of DocumentImpl and provides a generic implementation of some of
* the methods of CompoundDocument.
*
* @author niraj
*/
public abstract class AbstractCompoundDocument extends DocumentImpl implements
CompoundDocument {
/**
 * If you set this flag to true the repositioning information for the document
 * will be kept in the document feature.
 * Default value is false to avoid the unnecessary waste of time and memory.
 */
protected Boolean collectRepositioningInfo = Boolean.FALSE;
/**
 * If you set this flag to true the original content of the document will be
 * kept in the document feature.
 * Default value is false to avoid the unnecessary waste of memory.
 */
protected Boolean preserveOriginalContent = Boolean.FALSE;
/**
 * IDs of the member documents (the creole parameter describes them as
 * "languages").
 */
protected List<String> documentIDs;
/**
 * The member document that most calls on this compound are delegated to;
 * null when no member is currently selected.
 */
protected Document currentDocument;
/**
 * Available member documents, keyed by document ID.
 */
protected Map<String, Document> documents;
/** Freeze the serialization UID. */
static final long serialVersionUID = -8456893608311510260L;
/**
 * Document listeners registered while no current document was selected.
 * Replaced by a modified copy on every add/remove rather than mutated in
 * place.
 */
private transient Vector<DocumentListener> documentListeners;
/** Exception message used when the current document is not set. */
protected String CURR_DOC_NOT_SET_MSG = "Please use "
  + "setCurrentDocument(...) method first!";
/**
 * Clears the data members of the object: runs the superclass cleanup, resets
 * the current document and deletes every member document whose features
 * carry the hidden attribute.
 */
public void cleanup() {
  super.cleanup();
  setCurrentDocument(null);
  // nothing to release if the member map was never populated
  if(documents == null) { return; }
  // Work on a snapshot so that deleting a member cannot disturb the
  // iteration if the deletion ends up modifying the member map.
  List<Object> members = new ArrayList<Object>(documents.values());
  for(Object member : members) {
    Document doc = (Document)member;
    if(Gate.getHiddenAttribute(doc.getFeatures())) {
      // assume the document was created by this compound,
      // so it's up to us to delete it
      Factory.deleteResource(doc);
    }
  }
} // cleanup()
/**
 * Returns the features of the compound document itself, never those of a
 * member document; the feature map is created on first access. To reach the
 * features of a member use getDocument(documentId).getFeatures().
 *
 * @return the (possibly freshly created) feature map of this compound
 */
@Override
public FeatureMap getFeatures() {
  FeatureMap own = this.features;
  if(own == null) {
    own = Factory.newFeatureMap();
    this.features = own;
  }
  return own;
}
/**
 * Gets the name of the current document, or the compound's own name when no
 * current document is selected.
 */
public String getName() {
  return currentDocument != null ? currentDocument.getName() : super.getName();
}
/**
 * Sets the name of the current document, or of the compound itself when no
 * current document is selected.
 *
 * @param name the new name
 */
public void setName(String name) {
  if(currentDocument != null) {
    currentDocument.setName(name);
    return;
  }
  super.setName(name);
}
/**
 * Returns the source URL of the current document, or the compound's own
 * source URL when no current document is selected.
 */
public URL getSourceUrl() {
  return currentDocument == null ? sourceUrl : currentDocument.getSourceUrl();
}
/**
 * Sets the source URL on the current document, or on the compound itself
 * when no current document is selected.
 *
 * @param sourceUrl the new source URL
 */
public void setSourceUrl(URL sourceUrl) {
  if(currentDocument != null) {
    currentDocument.setSourceUrl(sourceUrl);
    return;
  }
  this.sourceUrl = sourceUrl;
} // setSourceUrl
/**
 * Documents may be packed within files; in this case an optional pair of
 * offsets refers to the location of the document.
 *
 * @return the offsets reported by the current document, or a two-element
 *         array holding the compound's own start and end offsets when no
 *         current document is selected
 */
public Long[] getSourceUrlOffsets() {
  if(currentDocument != null) {
    return currentDocument.getSourceUrlOffsets();
  }
  Long[] ownOffsets = new Long[2];
  ownOffsets[0] = sourceUrlStartOffset;
  ownOffsets[1] = sourceUrlEndOffset;
  return ownOffsets;
} // getSourceUrlOffsets
/**
 * Allows or disallows preserving of the original document content. The flag
 * is forwarded to the current document, or stored on the compound itself
 * when no current document is selected.
 *
 * @param b the new flag value
 */
public void setPreserveOriginalContent(Boolean b) {
  if(currentDocument != null) {
    currentDocument.setPreserveOriginalContent(b);
    return;
  }
  this.preserveOriginalContent = b;
} // setPreserveOriginalContent
/**
 * Gets the preserve-original-content flag.
 *
 * @return the flag of the current document, or the compound's own flag when
 *         no current document is selected
 */
public Boolean getPreserveOriginalContent() {
  if(currentDocument != null) {
    return currentDocument.getPreserveOriginalContent();
  }
  return preserveOriginalContent;
} // getPreserveOriginalContent
/**
 * Allows or disallows collecting of repositioning information. The flag is
 * forwarded to the current document, or stored on the compound itself when
 * no current document is selected.
 *
 * @param b the new flag value
 */
public void setCollectRepositioningInfo(Boolean b) {
  if(currentDocument != null) {
    currentDocument.setCollectRepositioningInfo(b);
    return;
  }
  collectRepositioningInfo = b;
} // setCollectRepositioningInfo
/**
 * Gets the collect-repositioning-information flag.
 *
 * @return the flag of the current document, or the compound's own flag when
 *         no current document is selected
 */
public Boolean getCollectRepositioningInfo() {
  if(currentDocument != null) {
    return currentDocument.getCollectRepositioningInfo();
  }
  return collectRepositioningInfo;
} // getCollectRepositioningInfo
/**
 * Documents may be packed within files; in this case an optional pair of
 * offsets refers to the location of the document. This method gets the start
 * offset.
 *
 * @return the start offset of the current document, or the compound's own
 *         start offset when no current document is selected
 */
public Long getSourceUrlStartOffset() {
  if(currentDocument != null) {
    return currentDocument.getSourceUrlStartOffset();
  }
  return sourceUrlStartOffset;
}
/**
 * Documents may be packed within files; in this case an optional pair of
 * offsets refers to the location of the document. This method sets the start
 * offset on the current document, or on the compound itself when no current
 * document is selected.
 *
 * @param sourceUrlStartOffset the new start offset
 */
public void setSourceUrlStartOffset(Long sourceUrlStartOffset) {
  if(currentDocument != null) {
    currentDocument.setSourceUrlStartOffset(sourceUrlStartOffset);
    return;
  }
  this.sourceUrlStartOffset = sourceUrlStartOffset;
} // setSourceUrlStartOffset
/**
 * Documents may be packed within files; in this case an optional pair of
 * offsets refers to the location of the document. This method gets the end
 * offset.
 *
 * @return the end offset of the current document, or the compound's own end
 *         offset when no current document is selected
 */
public Long getSourceUrlEndOffset() {
  if(currentDocument != null) {
    return currentDocument.getSourceUrlEndOffset();
  }
  return sourceUrlEndOffset;
}
/**
 * Documents may be packed within files; in this case an optional pair of
 * offsets refers to the location of the document. This method sets the end
 * offset on the current document, or on the compound itself when no current
 * document is selected.
 *
 * @param sourceUrlEndOffset the new end offset
 */
public void setSourceUrlEndOffset(Long sourceUrlEndOffset) {
  if(currentDocument != null) {
    currentDocument.setSourceUrlEndOffset(sourceUrlEndOffset);
    return;
  }
  this.sourceUrlEndOffset = sourceUrlEndOffset;
} // setSourceUrlEndOffset
/**
 * Returns the content of the current document. When no current document is
 * selected a new, empty DocumentContentImpl is returned instead.
 */
public DocumentContent getContent() {
  if(currentDocument != null) {
    return currentDocument.getContent();
  }
  return new DocumentContentImpl("");
}
/**
 * Sets the content of the current document. The call is silently ignored
 * when no current document is selected.
 *
 * @param content the new document content
 */
public void setContent(DocumentContent content) {
  if(currentDocument == null) { return; }
  currentDocument.setContent(content);
}
/**
 * Gets the encoding of the document content source. With a current document
 * selected the value is read from it (the document is cast to DocumentImpl);
 * otherwise the compound's own encoding is returned.
 */
public String getEncoding() {
  if(currentDocument != null) {
    DocumentImpl impl = (DocumentImpl)currentDocument;
    return impl.getEncoding();
  }
  return this.encoding;
}
/**
 * Sets the encoding of the document content source. With a current document
 * selected the value is written to it (the document is cast to
 * DocumentImpl); otherwise it is stored on the compound itself.
 *
 * @param encoding the encoding name
 */
@CreoleParameter(defaultValue="UTF-8", comment="Encoding")
public void setEncoding(String encoding) {
  if(currentDocument != null) {
    DocumentImpl impl = (DocumentImpl)currentDocument;
    impl.setEncoding(encoding);
    return;
  }
  this.encoding = encoding;
}
/**
 * Gets the default annotation set of the current document.
 *
 * @return the default annotation set as returned by the current document
 * @throws GateRuntimeException if no current document is selected
 */
public AnnotationSet getAnnotations() {
  if(currentDocument != null) {
    return currentDocument.getAnnotations();
  }
  throw new GateRuntimeException(CURR_DOC_NOT_SET_MSG);
} // getAnnotations()
/**
 * Gets a named annotation set of the current document.
 *
 * @param name the name of the annotation set, passed on unchanged to the
 *          current document
 * @return the annotation set as returned by the current document
 * @throws GateRuntimeException if no current document is selected
 */
public AnnotationSet getAnnotations(String name) {
  if(currentDocument != null) {
    return currentDocument.getAnnotations(name);
  }
  throw new GateRuntimeException(CURR_DOC_NOT_SET_MSG);
} // getAnnotations(name)
/**
 * Sets the markup awareness status. The value is forwarded to the current
 * document, or stored on the compound itself when no current document is
 * selected. The creole parameter default declared here is "true".
 *
 * @param newMarkupAware
 *          markup awareness status.
 */
@CreoleParameter(defaultValue="true", comment="Should the document read the original markup")
public void setMarkupAware(Boolean newMarkupAware) {
  if(currentDocument != null) {
    currentDocument.setMarkupAware(newMarkupAware);
    return;
  }
  this.markupAware = newMarkupAware;
}
/**
 * Gets the markup awareness status.
 *
 * @return the status of the current document, or the compound's own status
 *         when no current document is selected
 */
public Boolean getMarkupAware() {
  if(currentDocument != null) {
    return currentDocument.getMarkupAware();
  }
  return this.markupAware;
}
/**
 * Returns an XML document for the current document that aims to preserve the
 * original markup and to include (if possible) the annotations given in
 * aSourceAnnotationSet. The work is delegated to the current document.
 *
 * @param aSourceAnnotationSet the annotations to include
 * @return the XML produced by the current document
 * @throws GateRuntimeException if no current document is selected
 */
@Override
public String toXml(Set aSourceAnnotationSet) {
  if(currentDocument != null) {
    return currentDocument.toXml(aSourceAnnotationSet);
  }
  throw new GateRuntimeException(CURR_DOC_NOT_SET_MSG);
}
/**
 * Returns an XML document for the current document that aims to preserve the
 * original markup and to include (if possible) the annotations given in
 * aSourceAnnotationSet. The work is delegated to the current document.
 *
 * @param aSourceAnnotationSet
 *          the annotations to be combined with the original markup
 * @param includeFeatures
 *          whether the annotation features should be included in the output
 * @return the XML produced by the current document
 * @throws GateRuntimeException if no current document is selected
 */
public String toXml(Set aSourceAnnotationSet, boolean includeFeatures) {
  if(currentDocument != null) {
    return currentDocument.toXml(aSourceAnnotationSet, includeFeatures);
  }
  throw new GateRuntimeException(CURR_DOC_NOT_SET_MSG);
}// End toXml()
/**
 * Serializes to XML. With a current document selected its own toXml() result
 * is returned; otherwise the whole compound is serialized through
 * toXmlAsASingleDocument(CompoundDocument).
 *
 * @return a string holding the XML representation
 */
public String toXml() {
  if(currentDocument != null) {
    return currentDocument.toXml();
  }
  return toXmlAsASingleDocument(this);
}// toXml
/**
 * Gives a single XML representation for the entire compound document.
 * <p>
 * The result is an XStream serialization of a map with three entries:
 * "docXmls" (document ID to the member's toXml() output), "feats"
 * (compound-level settings) and "docFeats" (the compound's feature map).
 * The current document is temporarily cleared while the compound-level
 * values are read and is restored before the method returns, even if
 * reading them fails.
 *
 * @param aCompoundDoc the compound document to serialize
 * @return the XML text produced by XStream
 */
public static String toXmlAsASingleDocument(CompoundDocument aCompoundDoc) {
  Map<String, String> docXmls = new HashMap<String, String>();
  Map<String, Object> globalMap = new HashMap<String, Object>();
  Document currentDoc = aCompoundDoc.getCurrentDocument();
  // ID under which the current document is registered, if we can find it
  String currentDocId = null;
  for(Object idObj : aCompoundDoc.getDocumentIDs()) {
    String id = (String)idObj;
    Document member = aCompoundDoc.getDocument(id);
    if(currentDoc != null && member == currentDoc) {
      currentDocId = id;
    }
    docXmls.put(id, member.toXml());
  }
  // add document xmls
  globalMap.put("docXmls", docXmls);
  // the map is saved using XStream's XML serialization
  com.thoughtworks.xstream.XStream xstream =
    new com.thoughtworks.xstream.XStream();
  StringWriter stringToReturn = new StringWriter();
  // with no current document the getters below report compound-level values
  aCompoundDoc.setCurrentDocument(null);
  try {
    // other features
    Map<String, Object> features = new HashMap<String, Object>();
    features.put("encoding", aCompoundDoc.getEncoding());
    features.put("collectRepositioningInfo",
      aCompoundDoc.getCollectRepositioningInfo());
    features.put("preserveOriginalContent",
      aCompoundDoc.getPreserveOriginalContent());
    features.put("documentIDs", aCompoundDoc.getDocumentIDs());
    features.put("markupAware", Boolean.TRUE);
    features.put("name", aCompoundDoc.getName());
    globalMap.put("feats", features);
    globalMap.put("docFeats", aCompoundDoc.getFeatures());
  } finally {
    // Restore the original selection. setCurrentDocument expects a document
    // ID, so prefer the ID found above; fall back to the document name (the
    // previous behaviour) only when no ID could be matched.
    String restoreId = null;
    if(currentDoc != null) {
      restoreId = currentDocId != null ? currentDocId : currentDoc.getName();
    }
    aCompoundDoc.setCurrentDocument(restoreId);
  }
  xstream.toXML(globalMap, stringToReturn);
  return stringToReturn.toString();
}
/**
 * Returns the map of named annotation sets of the current document, exactly
 * as the current document reports it.
 *
 * @throws GateRuntimeException if no current document is selected
 */
public Map getNamedAnnotationSets() {
  if(currentDocument != null) {
    return currentDocument.getNamedAnnotationSets();
  }
  throw new GateRuntimeException(CURR_DOC_NOT_SET_MSG);
} // getNamedAnnotationSets
/**
 * Returns the annotation set names of the current document.
 *
 * @throws GateRuntimeException if no current document is selected
 */
public Set getAnnotationSetNames() {
  if(currentDocument != null) {
    return currentDocument.getAnnotationSetNames();
  }
  throw new GateRuntimeException(CURR_DOC_NOT_SET_MSG);
}
/**
 * Removes one of the named annotation sets from the current document. The
 * call is silently ignored when no current document is selected.
 *
 * @param name
 *          the name of the annotation set to be removed
 */
public void removeAnnotationSet(String name) {
  if(currentDocument == null) { return; }
  currentDocument.removeAnnotationSet(name);
}
/**
 * Propagates an edit to the current document. The call is silently ignored
 * when no current document is selected.
 *
 * @param start start offset, passed on unchanged
 * @param end end offset, passed on unchanged
 * @param replacement replacement content, passed on unchanged
 * @throws InvalidOffsetException if the current document rejects the offsets
 */
public void edit(Long start, Long end, DocumentContent replacement)
throws InvalidOffsetException {
  if(currentDocument == null) { return; }
  currentDocument.edit(start, end, replacement);
} // edit(start,end,replacement)
/**
 * Orders compound documents by comparing their toString() representations.
 *
 * @param o the object to compare with; must be a CompoundDocument
 * @return the result of comparing the two string representations
 * @throws ClassCastException if o is not a CompoundDocument
 */
public int compareTo(Object o) throws ClassCastException {
  CompoundDocument that = (CompoundDocument)o;
  String ours = toString();
  String theirs = that.toString();
  return ours.compareTo(theirs);
} // compareTo
/**
 * String representation. With a current document selected its own
 * toString() is returned; otherwise a multi-line summary of the compound's
 * encoding, features, markup awareness and source URL is built.
 */
public String toString() {
  if(currentDocument != null) {
    return currentDocument.toString();
  }
  String nl = Strings.getNl();
  StringBuilder text = new StringBuilder();
  text.append("CompoundDocumentImpl: ").append(nl);
  text.append(" encoding:").append(encoding).append(nl);
  text.append(" features:").append(features).append(nl);
  text.append(" markupAware:").append(markupAware).append(nl);
  text.append(" sourceUrl:").append(sourceUrl).append(nl);
  text.append(nl);
  return text.toString();
} // toString
/**
 * Deletes the member document registered under the given ID through the
 * Factory. Does nothing when no such member exists. Note that this method
 * itself does not remove the entry from the member map or the ID list.
 *
 * @param documentID the ID of the member document to delete
 */
public void removeDocument(String documentID) {
  if(documents == null) { return; }
  // explicit cast so the lookup compiles whether or not the map is generic
  Document doc = (Document)documents.get(documentID);
  if(doc == null) { return; }
  Factory.deleteResource(doc);
}
/**
 * Removes a document listener. With a current document selected the request
 * is forwarded to it; otherwise the listener is removed from the compound's
 * own listener list, which is replaced by a modified copy.
 *
 * @param l the listener to remove
 */
public synchronized void removeDocumentListener(DocumentListener l) {
  if(currentDocument != null) {
    currentDocument.removeDocumentListener(l);
    return;
  }
  if(documentListeners == null || !documentListeners.contains(l)) {
    return;
  }
  @SuppressWarnings("unchecked")
  Vector<DocumentListener> remaining =
    (Vector<DocumentListener>)documentListeners.clone();
  remaining.removeElement(l);
  documentListeners = remaining;
}
/**
 * Adds a document listener. With a current document selected the request is
 * forwarded to it; otherwise the listener is added to the compound's own
 * listener list (unless already present), which is replaced by a modified
 * copy.
 *
 * @param l the listener to add
 */
public synchronized void addDocumentListener(DocumentListener l) {
  if(currentDocument != null) {
    currentDocument.addDocumentListener(l);
    return;
  }
  if(documentListeners != null && documentListeners.contains(l)) {
    return;
  }
  @SuppressWarnings("unchecked")
  Vector<DocumentListener> updated =
    documentListeners == null
      ? new Vector<DocumentListener>(2)
      : (Vector<DocumentListener>)documentListeners.clone();
  updated.addElement(l);
  documentListeners = updated;
}
/** Creole event callback; this class takes no action when a resource is loaded. */
public void resourceLoaded(CreoleEvent e) {
}
/** Creole event callback; this class takes no action when a resource is unloaded. */
public void resourceUnloaded(CreoleEvent e) {
}
/** Creole event callback; this class takes no action when a datastore is opened. */
public void datastoreOpened(CreoleEvent e) {
}
/** Creole event callback; this class takes no action when a datastore is created. */
public void datastoreCreated(CreoleEvent e) {
}
/** Rename callback; this class takes no action when a resource is renamed. */
public void resourceRenamed(Resource resource, String oldName, String newName) {
}
/**
 * Deletes every member document through the Factory. Does nothing when the
 * member map has not been set.
 */
private void deleteAllDocs() {
  if(documents == null) { return; }
  // Work on a snapshot so that deleting a member cannot disturb the
  // iteration if the deletion ends up modifying the member map.
  List<Object> members = new ArrayList<Object>(documents.values());
  for(Object member : members) {
    Factory.deleteResource((Document)member);
  }
}
/**
 * Reacts to a datastore being closed. If the closed datastore is the one
 * this compound belongs to, all member documents and then the compound
 * itself are deleted, since the resource cannot stay open once its
 * datastore is closed.
 *
 * @param e the event describing the closed datastore
 */
public void datastoreClosed(CreoleEvent e) {
  boolean ownDatastoreClosed = e.getDatastore().equals(this.getDataStore());
  if(ownDatastoreClosed) {
    // the member documents go first, then this language resource
    deleteAllDocs();
    Factory.deleteResource(this);
  }
}
/**
 * Stores the persistence ID through the superclass and then registers this
 * document as a listener on the creole register.
 *
 * @param lrID the persistence ID, passed on to the superclass
 */
public void setLRPersistenceId(Object lrID) {
super.setLRPersistenceId(lrID);
// make persistent documents listen to the creole register
// for events about their DS
Gate.getCreoleRegister().addCreoleListener(this);
}
/** Datastore event callback; this class takes no action when a resource is adopted. */
public void resourceAdopted(DatastoreEvent evt) {
}
/**
 * Reacts to a resource being deleted from a datastore. If the event comes
 * from this compound's datastore and names this compound's persistence ID,
 * all member documents and then the compound itself are deleted, because an
 * open document that was deleted from its datastore is no longer valid.
 *
 * @param evt the datastore event
 */
public void resourceDeleted(DatastoreEvent evt) {
  boolean thisResourceDeleted =
    evt.getSource().equals(this.getDataStore())
      && evt.getResourceID().equals(this.getLRPersistenceId());
  if(thisResourceDeleted) {
    deleteAllDocs();
    Factory.deleteResource(this);
  }
}
/** Datastore event callback; this class takes no action when a resource is written. */
public void resourceWritten(DatastoreEvent evt) {
}
/**
 * Sets the datastore through the superclass and, if a datastore is then
 * present, registers this document as a listener on it.
 *
 * @param dataStore the datastore to attach
 * @throws gate.persist.PersistenceException if the superclass rejects it
 */
public void setDataStore(DataStore dataStore)
throws gate.persist.PersistenceException {
  super.setDataStore(dataStore);
  if(this.dataStore == null) { return; }
  this.dataStore.addDatastoreListener(this);
}
/**
 * @return the currently selected member document, or null if none is
 *         selected
 */
public Document getCurrentDocument() {
  return this.currentDocument;
}
/**
 * Looks up a member document by ID.
 *
 * @param documentID the ID of the member document
 * @return the member registered under that ID, or this compound document
 *         itself when no such member exists
 */
public Document getDocument(String documentID) {
  Object member = documents.get(documentID);
  return member != null ? (Document)member : this;
}
/**
 * Selects the member document that subsequent calls are delegated to.
 *
 * @param documentID the ID of the member to select; null, or an ID with no
 *          registered member, clears the selection
 */
public void setCurrentDocument(String documentID) {
  currentDocument =
    documentID == null ? null : (Document)documents.get(documentID);
}
/**
 * @return the internal map of member documents (not a copy)
 */
public Map getDocuments() {
  return this.documents;
}
/**
 * @return the internal list of member document IDs (not a copy); may be null
 */
public List getDocumentIDs() {
  return this.documentIDs;
}
/**
 * Sets the member document IDs. A non-null list is copied into a new
 * ArrayList; null clears the ID list.
 *
 * @param docIDs the IDs to store, or null
 */
@Optional
@CreoleParameter(comment="languages")
public void setDocumentIDs(List docIDs) {
  if(docIDs == null) {
    this.documentIDs = null;
    return;
  }
  this.documentIDs = new ArrayList(docIDs);
}
/**
 * Overridden to properly register component documents with the creole
 * register when this compound is deserialized.
 *
 * @param stream the stream the object is being read from
 * @throws IOException if reading from the stream fails
 * @throws ClassNotFoundException if a serialized class cannot be resolved
 */
private void readObject(ObjectInputStream stream) throws IOException,
ClassNotFoundException {
  stream.defaultReadObject();
  // register a validation callback to add our child documents
  // to the creole register and fire the relevant events. This
  // is what the Factory would do if the children were loaded in
  // the normal way.
  stream.registerValidation(new ObjectInputValidation() {
    public void validateObject() {
      // a compound serialized without any members has nothing to register
      if(documents == null) { return; }
      for(Object member : documents.values()) {
        Document d = (Document)member;
        Gate.getCreoleRegister().get(d.getClass().getName())
          .addInstantiation(d);
        Gate.getCreoleRegister().resourceLoaded(
          new CreoleEvent(d, CreoleEvent.RESOURCE_LOADED));
      }
    }
  }, 0);
}
}