/*
 * gate.alignment.gui.DefaultIteratingMethod -- from the "alignment" artifact
 * (Maven / Gradle / Ivy).
 * A selection of tools for processing parallel texts.
 */
package gate.alignment.gui;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import gate.Annotation;
import gate.AnnotationSet;
import gate.Document;
import gate.compound.CompoundDocument;
import gate.util.GateRuntimeException;
import gate.util.InvalidOffsetException;
import gate.util.OffsetComparator;
/**
* Default implementation of the IteratingMethod interface. Purpose of
* the IteratingMethod is to allow users to define their own iterating
* sequence for the alignment editor. For example, it could be a
* sentence alignment algorithm that decides which sentence in the
* source language should be paired with which sentence in the target
* language. Alignment editor takes one pair of annotations (one from
* the source document and one from the target document) and displays
* them.
*
* In this implementation, annotations from source and target documents
* are paired in order of their occurrence in the document. If the
* unitAnnotationType is set to Sentence for both the source and target
* documents, Sentence annotations in both the documents are sorted
* using the gate.util.OffsetComparator. Then first sentence from the
* sourceDocument is paired with the first sentence in the target
* document. The same is true for the second, third and for rest of the
* sentences in both documents.
*
* @author niraj
*/
@SuppressWarnings("serial")
public class DefaultIteratingMethod implements IteratingMethod {

  /**
   * Offset comparator used for sorting the unit annotations of each
   * document.
   */
  private final OffsetComparator comparator;

  /**
   * Unit of alignment in the source document (e.g. Token).
   */
  private String srcTokenAnnotationType;

  /**
   * Unit of alignment in the target document (e.g. Token).
   */
  private String tgtTokenAnnotationType;

  /**
   * ID of the source document.
   */
  private String srcDocumentID;

  /**
   * ID of the target document.
   */
  private String tgtDocumentID;

  /**
   * Internal instance of AASequence that is used for obtaining
   * annotations from the source document one by one.
   */
  private AASequence srcSequence;

  /**
   * Internal instance of AASequence that is used for obtaining
   * annotations from the target document one by one.
   */
  private AASequence tgtSequence;

  /**
   * The document that is being aligned in the Alignment Editor.
   */
  private CompoundDocument compoundDocument;

  /**
   * Cache for the pair most recently produced by {@link #next()} or
   * {@link #previous()}.
   */
  private Pair currentPair;

  /**
   * Constructor. Creates the comparator that is later used to sort the
   * unit annotations.
   */
  public DefaultIteratingMethod() {
    comparator = new OffsetComparator();
  }

  /**
   * This method, given necessary parameters, initialises the internal
   * resources: it remembers the document ids and token types, and builds
   * one sorted annotation sequence for the source document and one for
   * the target document.
   *
   * @param alignedDocument - compound document for which alignment is
   *          taking place
   * @param srcDocumentId - id of the source document
   * @param tgtDocumentId - id of the target document
   * @param srcInputAS - annotation set in the source document which to
   *          take annotations from; null or blank selects the
   *          document's default annotation set
   * @param tgtInputAS - annotation set in the target document which to
   *          take annotations from; null or blank selects the
   *          document's default annotation set
   * @param srcTokenAnnotationType - e.g. Token for word alignment
   * @param tgtTokenAnnotationType - e.g. Token for word alignment
   * @param srcUnitAnnotationType - e.g. Sentence for sentence alignment
   * @param tgtUnitAnnotationType - e.g. Sentence for sentence alignment
   * @throws IteratingMethodException - could throw exception if
   *           something is wrong with the parameters or with the source
   *           and target documents.
   */
  public void init(CompoundDocument alignedDocument, String srcDocumentId,
          String tgtDocumentId, String srcInputAS, String tgtInputAS,
          String srcTokenAnnotationType, String tgtTokenAnnotationType,
          String srcUnitAnnotationType, String tgtUnitAnnotationType)
          throws IteratingMethodException {
    this.compoundDocument = alignedDocument;
    this.srcDocumentID = srcDocumentId;
    this.tgtDocumentID = tgtDocumentId;
    this.srcTokenAnnotationType = srcTokenAnnotationType;
    this.tgtTokenAnnotationType = tgtTokenAnnotationType;

    // NOTE(review): getDocument(...) is not checked for null here, so an
    // unknown document id still surfaces as a NullPointerException.
    Document srcDoc = compoundDocument.getDocument(srcDocumentId);
    srcSequence = new AASequence(srcDoc,
            selectAnnotationSet(srcDoc, srcInputAS), srcUnitAnnotationType);

    Document tgtDoc = compoundDocument.getDocument(tgtDocumentId);
    tgtSequence = new AASequence(tgtDoc,
            selectAnnotationSet(tgtDoc, tgtInputAS), tgtUnitAnnotationType);
  }

  /**
   * Picks the annotation set to read from: the document's default set
   * when the name is null or blank, the named set otherwise.
   */
  private static AnnotationSet selectAnnotationSet(Document doc,
          String setName) {
    if(setName == null || setName.trim().length() == 0) {
      return doc.getAnnotations();
    }
    return doc.getAnnotations(setName);
  }

  /**
   * Retrieves the underlying text for the given annotation in the
   * document with given document id.
   *
   * @param annot - annotation whose start and end offsets delimit the
   *          text
   * @param documentId - id of the source or of the target document
   * @return the text covered by the annotation, or null if documentId
   *         is neither the source nor the target document id
   * @throws GateRuntimeException wrapping the InvalidOffsetException
   *           raised when the annotation offsets are not valid for the
   *           document content
   */
  public String getText(Annotation annot, String documentId) {
    try {
      if(documentId.equals(srcDocumentID)) {
        return srcSequence.getText(annot);
      }
      else if(documentId.equals(tgtDocumentID)) {
        return tgtSequence.getText(annot);
      }
    }
    catch(InvalidOffsetException ioe) {
      throw new GateRuntimeException(ioe);
    }
    return null;
  }

  /**
   * Similar to getContained method of the gate.AnnotationSet.
   *
   * @param annot - this method uses annot.getStartNode().getOffset()
   *          and annot.getEndNode().getOffset() to decide boundaries.
   * @param documentId - id of the document to be used in the compound
   *          document.
   * @param tokenAnnotationType - type of the annotations to be
   *          retrieved; when null, the token annotation type given to
   *          init for that document is used
   * @return gate.AnnotationSet with annotations of type
   *         tokenAnnotationType, or null if documentId is neither the
   *         source nor the target document id
   */
  public AnnotationSet getUnderlyingAnnotations(Annotation annot,
          String documentId, String tokenAnnotationType) {
    if(documentId.equals(srcDocumentID)) {
      return srcSequence.getUnderlyingAnnotations(annot,
              tokenAnnotationType == null
                      ? this.srcTokenAnnotationType
                      : tokenAnnotationType);
    }
    else if(documentId.equals(tgtDocumentID)) {
      return tgtSequence.getUnderlyingAnnotations(annot,
              tokenAnnotationType == null
                      ? this.tgtTokenAnnotationType
                      : tokenAnnotationType);
    }
    return null;
  }

  /**
   * Retrieves the next possible pair and caches it as the current pair.
   *
   * @throws IndexOutOfBoundsException if {@link #hasNext()} is false
   */
  public Pair next() {
    // Check both sequences before moving either of them, so that a
    // failure cannot advance the source sequence without the target one.
    if(!hasNext()) {
      throw new IndexOutOfBoundsException("No next pair available");
    }
    Pair pair = new Pair(srcDocumentID, srcSequence.next(), tgtDocumentID,
            tgtSequence.next());
    this.currentPair = pair;
    return pair;
  }

  /**
   * Retrieves the previous possible pair and caches it as the current
   * pair.
   *
   * @throws IndexOutOfBoundsException if {@link #hasPrevious()} is false
   */
  public Pair previous() {
    // Same reasoning as in next(): never move only one of the sequences.
    if(!hasPrevious()) {
      throw new IndexOutOfBoundsException("No previous pair available");
    }
    Pair pair = new Pair(srcDocumentID, srcSequence.previous(),
            tgtDocumentID, tgtSequence.previous());
    this.currentPair = pair;
    return pair;
  }

  /**
   * Returns the current pair, i.e. the one returned by the latest
   * successful call to next() or previous(); null if there was no such
   * call yet.
   */
  public Pair current() {
    return currentPair;
  }

  /**
   * Returns true if there is any next pair available to return, i.e.
   * both the source and the target sequence have a next annotation.
   */
  public boolean hasNext() {
    return srcSequence.hasNext() && tgtSequence.hasNext();
  }

  /**
   * Returns true if there is any previous pair available to return,
   * i.e. both the source and the target sequence have a previous
   * annotation.
   */
  public boolean hasPrevious() {
    return srcSequence.hasPrevious() && tgtSequence.hasPrevious();
  }

  /**
   * Internal class used for maintaining annotation sequences. It holds
   * the annotations of one type from one annotation set, sorted with
   * the enclosing instance's comparator, together with a cursor.
   */
  class AASequence {

    /** Document the annotations belong to; used to read the text. */
    Document document;

    /** Annotation set the sequence was built from. */
    AnnotationSet set;

    /** Annotations of the requested type, sorted by the comparator. */
    List<Annotation> annotations;

    /** Index of the current annotation; -1 means before the first. */
    int counter = -1;

    /**
     * @param doc - document the annotation set belongs to
     * @param set - annotation set to take the annotations from
     * @param parentType - type of the annotations to iterate over
     *          (e.g. Sentence)
     */
    public AASequence(Document doc, AnnotationSet set, String parentType) {
      this.document = doc;
      this.set = set;
      // collecting all sentences for example
      annotations = new ArrayList<Annotation>(set.get(parentType));
      Collections.sort(annotations, comparator);
    }

    /**
     * Returns true if there is an annotation after the current position.
     */
    public boolean hasNext() {
      return counter + 1 < annotations.size();
    }

    /**
     * Moves to the next annotation (e.g. next sentence) and returns it.
     *
     * @throws IndexOutOfBoundsException if there is no next annotation;
     *           the position is left unchanged in that case
     */
    public Annotation next() {
      // Validate before moving so a failed call cannot corrupt the cursor.
      if(!hasNext()) {
        throw new IndexOutOfBoundsException(
                "No annotation after position " + counter);
      }
      counter++;
      return annotations.get(counter);
    }

    /**
     * Moves to the previous annotation and returns it.
     *
     * @throws IndexOutOfBoundsException if there is no previous
     *           annotation; the position is left unchanged in that case
     */
    public Annotation previous() {
      // Validate before moving so a failed call cannot corrupt the cursor.
      if(!hasPrevious()) {
        throw new IndexOutOfBoundsException(
                "No annotation before position " + counter);
      }
      counter--;
      return annotations.get(counter);
    }

    /**
     * Returns true if there is an annotation before the current
     * position.
     */
    public boolean hasPrevious() {
      return counter > 0;
    }

    /**
     * Moves the cursor back to its initial position, before the first
     * annotation.
     */
    public void reset() {
      counter = -1;
    }

    /**
     * Returns the annotations of the given type that the annotation set
     * reports as contained between the start and end offsets of
     * parentAnnot.
     */
    public AnnotationSet getUnderlyingAnnotations(Annotation parentAnnot,
            String annotationType) {
      return set.getContained(parentAnnot.getStartNode().getOffset(),
              parentAnnot.getEndNode().getOffset()).get(annotationType);
    }

    /**
     * Returns the document content between the start and end offsets of
     * the given annotation as a string.
     *
     * @throws InvalidOffsetException if the document content rejects
     *           the offsets
     */
    public String getText(Annotation ann) throws InvalidOffsetException {
      return document.getContent().getContent(ann.getStartNode().getOffset(),
              ann.getEndNode().getOffset()).toString();
    }
  }
}