
gate.creole.gazetteer.FlexibleGazetteer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of tools Show documentation
Show all versions of tools Show documentation
A selection of processing resources commonly used to extend ANNIE
The newest version!
/*
* FlexibleGazetteer.java
*
* Copyright (c) 2004-2012, The University of Sheffield.
*
* This file is part of GATE (see http://gate.ac.uk/), and is free software,
* licenced under the GNU Library General Public License, Version 2, June 1991.
*
* A copy of this licence is included in the distribution in the file
* licence.html, and is also available at http://gate.ac.uk/gate/licence.html.
*
* Niraj Aswani 02/2002
* $Id: FlexibleGazetteer.java 19751 2016-11-18 09:04:17Z markagreenwood $
*/
package gate.creole.gazetteer;
import java.util.List;
import gate.Annotation;
import gate.AnnotationSet;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.Utils;
import gate.corpora.DocumentImpl;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.InvalidOffsetException;
// NOTE(review): the helpURL value on the @CreoleResource annotation below
// duplicates the comment text and is not a URL; it is left unchanged here
// because the intended help page cannot be determined from this file.
/**
 * Title: Flexible Gazetteer
 * <p>
 * The Flexible Gazetteer provides users with the flexibility to choose
 * their own customised input and an external Gazetteer. For example,
 * the user might want to replace words in the text with their base
 * forms (which is an output of the Morphological Analyser).
 * </p>
 * <p>
 * The Flexible Gazetteer performs lookup over a document based on the
 * values of an arbitrary feature of an arbitrary annotation type, by
 * using an externally provided gazetteer. It is important to use an
 * external gazetteer as this allows the use of any type of gazetteer
 * (e.g. an Ontological gazetteer).
 * </p>
 * <p>
 * For every configured <code>Type.feature</code> name a hidden temporary
 * document is built in which the text under each annotation of that type
 * is replaced by the value of the feature. The wrapped gazetteer is run
 * over the temporary document and the annotations it creates are copied
 * back to the original document at the corresponding original offsets.
 * </p>
 *
 * @author niraj aswani
 * @version 1.0
 */
@CreoleResource(name="Flexible Gazetteer", comment="A more flexible list lookup component.", helpURL="A more flexible list lookup component.")
public class FlexibleGazetteer extends AbstractLanguageAnalyser {

  private static final long serialVersionUID = -1023682327651886920L;

  /** Annotation set of the temporary document that the wrapped gazetteer writes to. */
  private static final String wrappedOutputASName = "Output";

  /** Annotation set of the temporary document that holds the input-span markers. */
  private static final String wrappedInputASName = "Input";

  /** Annotation type of the input-span markers stored in {@link #wrappedInputASName}. */
  private static final String wrappedInputType = "Input";

  // SET TO false BEFORE CHECKING IN
  private static final boolean DEBUG = false;

  /**
   * This method runs the gazetteer. It assumes that all the needed parameters
   * are set. If they are not, an exception will be fired.
   *
   * @throws ExecutionException if no gazetteer, document or input feature
   *           names are set, if the temporary document cannot be created or
   *           marked up, or if an annotation cannot be added to the original
   *           document.
   */
  @Override
  public void execute() throws ExecutionException {
    if(gazetteerInst == null) {
      throw new ExecutionException("No Gazetteer Provided!");
    }
    fireProgressChanged(0);
    fireStatusChanged("Checking Document...");
    if(document == null) {
      throw new ExecutionException("No document to process!");
    }
    // obtain the inputAS
    AnnotationSet inputAS = document.getAnnotations(inputASName);
    // anything in the inputFeatureNames?
    if(inputFeatureNames == null || inputFeatureNames.isEmpty()) {
      throw new ExecutionException("No input feature names provided!");
    }

    // for each input feature, create a temporary document and run the
    // gazetteer
    for(String aFeature : inputFeatureNames) {
      // keyVal[0] = annotation type, keyVal[1] = feature name; a null entry
      // is reported as invalid in the same way as a malformed one
      String[] keyVal = aFeature == null ? new String[0] : aFeature.split("\\.");
      if(keyVal.length != 2) {
        System.err.println("Invalid input feature name:" + aFeature);
        continue;
      }
      String annotationType = keyVal[0];
      String featureName = keyVal[1];

      // holds mapping for newly created annotations
      FlexGazMappingTable mappingTable = new FlexGazMappingTable();
      fireStatusChanged("Creating temporary Document for feature " + aFeature);

      List<Annotation> annotations =
              Utils.inDocumentOrder(inputAS.get(annotationType));
      // remove overlapping annotations
      // (this makes the reverse mapping much easier)
      removeOverlappingAnnotations(annotations);

      String tempContent =
              buildTempContent(annotations, featureName, mappingTable);
      // proceed only if there was any replacement
      if(mappingTable.isEmpty()) {
        continue;
      }

      fireStatusChanged("Processing document with Gazetteer...");
      Document tempDoc = null;
      // A single try/finally covers the whole life of the temporary document
      // so that it is deleted even when marking the input spans fails.
      try {
        tempDoc = createTempDocument(tempContent);
        markInputSpans(tempDoc, mappingTable);

        gazetteerInst.setDocument(tempDoc);
        gazetteerInst.setAnnotationSetName(wrappedOutputASName);
        fireStatusChanged("Executing Gazetteer...");
        gazetteerInst.execute();

        // now the tempDoc has been looked up, we need to shift the
        // annotations from this temp document to the original document
        fireStatusChanged("Transfering new annotations to the original one...");
        transferLookups(tempDoc, mappingTable);
      }
      catch(ResourceInstantiationException rie) {
        throw new ExecutionException("Temporary document cannot be created", rie);
      }
      catch(InvalidOffsetException e) {
        throw new ExecutionException("Error duplicating Input annotations", e);
      }
      finally {
        gazetteerInst.setDocument(null);
        if(tempDoc != null) {
          Factory.deleteResource(tempDoc);
        }
      }
    } // for
    fireProcessFinished();
  } // END execute METHOD

  /**
   * Builds the text of the temporary document: the original document content
   * with the span of every given annotation replaced by the string value of
   * the named feature. Every replacement is recorded in the mapping table.
   * Annotations that do not carry the feature, or carry it with a null value,
   * are left untouched.
   *
   * @param annotations non-overlapping annotations in document order
   * @param featureName name of the feature whose value replaces the text
   * @param mappingTable receives one original/temporary offset mapping per
   *          replacement
   * @return the content for the temporary document
   */
  private String buildTempContent(List<Annotation> annotations,
          String featureName, FlexGazMappingTable mappingTable) {
    StringBuilder newdocString =
            new StringBuilder(document.getContent().toString());
    // running difference between original and temporary offsets
    long totalDeductedSpaces = 0;
    for(Annotation currentAnnotation : annotations) {
      Object featureValue = currentAnnotation.getFeatures().get(featureName);
      // no such feature, or no value for it: nothing to replace
      if(featureValue == null) {
        continue;
      }
      String newTokenValue = featureValue.toString();

      long startOffset = Utils.start(currentAnnotation);
      long endOffset = Utils.end(currentAnnotation);
      long actualLength = endOffset - startOffset;
      long lengthDifference = actualLength - newTokenValue.length();

      // position of this token in the (already partially rewritten) text
      long newStartOffset = startOffset - totalDeductedSpaces;
      long newEndOffset = newStartOffset + newTokenValue.length();
      totalDeductedSpaces += lengthDifference;
      mappingTable.add(startOffset, endOffset, newStartOffset, newEndOffset);

      // and finally replace the actual string with the feature value
      newdocString.replace((int)newStartOffset,
              (int)newStartOffset + (int)actualLength, newTokenValue);
    }
    return newdocString.toString();
  }

  /**
   * Creates the hidden temporary document holding the rewritten text. When
   * the original document is a {@link DocumentImpl} its encoding and
   * markup-aware settings are copied to the new document.
   *
   * @param content the text of the temporary document
   * @return the newly created document
   * @throws ResourceInstantiationException if the document cannot be created
   */
  private Document createTempDocument(String content)
          throws ResourceInstantiationException {
    FeatureMap params = Factory.newFeatureMap();
    params.put("stringContent", content);
    // set the appropriate encoding
    if(document instanceof DocumentImpl) {
      params.put("encoding", ((DocumentImpl)document).getEncoding());
      params.put("markupAware", ((DocumentImpl)document).getMarkupAware());
    }
    FeatureMap features = Factory.newFeatureMap();
    Gate.setHiddenAttribute(features, true);
    return (Document)Factory.createResource("gate.corpora.DocumentImpl",
            params, features);
  }

  /**
   * Marks the temp document with the locations of the input annotations so
   * that Lookups that are out of scope can be eliminated later.
   *
   * @param tempDoc the temporary document
   * @param mappingTable the recorded replacements
   * @throws InvalidOffsetException if a marker cannot be added at its
   *           temporary offsets
   */
  private void markInputSpans(Document tempDoc, FlexGazMappingTable mappingTable)
          throws InvalidOffsetException {
    AnnotationSet tempInputAS = tempDoc.getAnnotations(wrappedInputASName);
    for(NodePosition mapping : mappingTable.getMappings()) {
      tempInputAS.add(mapping.getTempStartOffset(),
              mapping.getTempEndOffset(), wrappedInputType,
              Factory.newFeatureMap());
    }
  }

  /**
   * Copies the annotations created by the wrapped gazetteer from the
   * temporary document back to the output annotation set of the original
   * document, translating their offsets through the mapping table.
   * Annotations outside the input spans, annotations with no valid original
   * offsets and annotations already present in the output set are skipped.
   *
   * @param tempDoc the temporary document after the gazetteer has run
   * @param mappingTable the recorded replacements
   * @throws ExecutionException if an annotation cannot be added to the
   *           original document
   */
  private void transferLookups(Document tempDoc, FlexGazMappingTable mappingTable)
          throws ExecutionException {
    AnnotationSet originalDocOutput = document.getAnnotations(outputASName);
    AnnotationSet tempInputAS = tempDoc.getAnnotations(wrappedInputASName);
    if(DEBUG) {
      mappingTable.dump();
    }
    for(Annotation currentLookup : tempDoc.getAnnotations(wrappedOutputASName)) {
      long tempStartOffset = Utils.start(currentLookup);
      long tempEndOffset = Utils.end(currentLookup);
      /* Ignore annotations that fall outside the input annotations,
       * so that we don't get dodgy Lookups outside the area covered by
       * Tokens copied into a restricted working set by the AST PR
       * (for example) */
      if(!coveredByInput(tempStartOffset, tempEndOffset, tempInputAS)) {
        continue;
      }
      long destinationStart = mappingTable.getBestOriginalStart(tempStartOffset);
      long destinationEnd = mappingTable.getBestOriginalEnd(tempEndOffset);
      // negative offsets mean the mapping table found no original position
      if(destinationStart < 0 || destinationEnd < 0) {
        continue;
      }
      if(alreadyInOriginal(originalDocOutput, destinationStart,
              destinationEnd, currentLookup)) {
        continue;
      }
      addToOriginal(originalDocOutput, destinationStart, destinationEnd,
              tempStartOffset, tempEndOffset, currentLookup, tempDoc);
    }
  }

  /**
   * Checks whether the output set already holds an annotation like the given
   * lookup: one selected by the lookup's type and features, with exactly the
   * given offsets and the same number of features.
   *
   * @param original the output annotation set of the original document
   * @param start start offset in the original document
   * @param end end offset in the original document
   * @param lookup the annotation from the temporary document
   * @return true if such an annotation is already present
   */
  private boolean alreadyInOriginal(AnnotationSet original, long start,
          long end, Annotation lookup) {
    AnnotationSet candidates = original.getContained(start, end).get(
            lookup.getType(), lookup.getFeatures());
    for(Annotation candidate : candidates) {
      if(Utils.start(candidate) == start
              && Utils.end(candidate) == end
              && candidate.getFeatures().size() == lookup.getFeatures().size()) {
        return true;
      }
    }
    return false;
  }

  /**
   * Removes the overlapping annotations. Preserves the one that appears first
   * in the list. This assumes the list has been sorted already.
   *
   * @param annotations the sorted list to filter in place
   */
  private void removeOverlappingAnnotations(List<Annotation> annotations) {
    int i = 0;
    while(i < annotations.size() - 1) {
      Annotation kept = annotations.get(i);
      long nextStart = Utils.start(annotations.get(i + 1));
      if(nextStart >= Utils.start(kept) && nextStart < Utils.end(kept)) {
        // drop the follower by index and re-test the same position against
        // whatever annotation now follows it
        annotations.remove(i + 1);
      }
      else {
        i++;
      }
    }
  }

  /* We try hard not to cause InvalidOffsetExceptions, but let's have
   * some better debugging info in case they happen.
   */
  private void addToOriginal(AnnotationSet original, long originalStart, long originalEnd,
          long tempStart, long tempEnd, Annotation tempLookup, Document tempDoc) throws ExecutionException {
    try {
      original.add(originalStart, originalEnd, tempLookup.getType(), tempLookup.getFeatures());
    }
    catch(InvalidOffsetException ioe) {
      String errorDetails = String.format("temp %d, %d [%s]-> original %d, %d ", tempStart, tempEnd, Utils.stringFor(tempDoc, tempLookup),
              originalStart, originalEnd);
      throw new ExecutionException(errorDetails, ioe);
    }
  }

  /* Is this Lookup within the scope of the input annotations? It might not be, if Token annotations
   * have been copied by AST only over the significant sections of the document.
   * Both the start and the end offset must be covered by an input marker.
   */
  private boolean coveredByInput(long tempStart, long tempEnd, AnnotationSet tempInputAS) {
    if(tempInputAS.getCovering(wrappedInputType, tempStart, tempStart).isEmpty()) {
      return false;
    }
    return !tempInputAS.getCovering(wrappedInputType, tempEnd, tempEnd).isEmpty();
  }

  /**
   * Sets the name of annotation set that should be used for storing new
   * annotations
   *
   * @param outputASName the output annotation set name
   */
  @RunTime
  @Optional
  @CreoleParameter(comment="The annotation set to be used for the generated annotations")
  public void setOutputASName(String outputASName) {
    this.outputASName = outputASName;
  }

  /**
   * Returns the outputAnnotationSetName
   *
   * @return a {@link String} value.
   */
  public String getOutputASName() {
    return this.outputASName;
  }

  /**
   * sets the input AnnotationSet Name
   *
   * @param inputASName the input annotation set name
   */
  @RunTime
  @Optional
  @CreoleParameter(comment="The annotation set to be used for getting features from")
  public void setInputASName(String inputASName) {
    this.inputASName = inputASName;
  }

  /**
   * Returns the inputAnnotationSetName
   *
   * @return a {@link String} value.
   */
  public String getInputASName() {
    return this.inputASName;
  }

  /**
   * Feature names for example: Token.string, Token.root etc... Values of these
   * features should be used to replace the actual string of these features.
   * This method allows a user to set the name of such features
   *
   * @param inputs names of the form <code>AnnotationType.featureName</code>
   */
  @RunTime
  @CreoleParameter(comment="Annotation.feature names to be considered for the gazetteer")
  public void setInputFeatureNames(java.util.List<String> inputs) {
    this.inputFeatureNames = inputs;
  }

  /**
   * Returns the feature names that are provided by the user to use their values
   * to replace their actual strings in the document
   *
   * @return a {@link List} value.
   */
  public java.util.List<String> getInputFeatureNames() {
    return this.inputFeatureNames;
  }

  /**
   * Returns the wrapped gazetteer.
   *
   * @return the gazetteer instance that is run over the temporary documents
   */
  public Gazetteer getGazetteerInst() {
    return this.gazetteerInst;
  }

  /**
   * Sets the wrapped gazetteer.
   *
   * @param gazetteerInst the gazetteer instance to run over the temporary
   *          documents
   */
  @RunTime
  @CreoleParameter(comment="Gazetteer Instance to be used")
  public void setGazetteerInst(gate.creole.gazetteer.Gazetteer gazetteerInst) {
    this.gazetteerInst = gazetteerInst;
  }

  // Gazetteer Runtime parameters
  private java.lang.String outputASName;

  private java.lang.String inputASName;

  // Flexible Gazetteer parameter
  private Gazetteer gazetteerInst;

  private java.util.List<String> inputFeatureNames;
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy