gate.creole.orthomatcher.OrthoMatcherHelper Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of annie Show documentation
Show all versions of annie Show documentation
ANNIE is a general purpose information extraction system that
provides the building blocks of many other GATE applications.
package gate.creole.orthomatcher;
import gate.Annotation;
import gate.AnnotationSet;
import gate.creole.ExecutionException;
import gate.util.InvalidOffsetException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import org.apache.log4j.Logger;
public class OrthoMatcherHelper {
protected static final Logger log = Logger.getLogger(OrthoMatcherHelper.class);
/**
 * Tests two strings for equality, with or without regard to letter case.
 *
 * @param s1 the string the comparison is invoked on; a null value raises a NullPointerException
 * @param s2 the string compared against s1
 * @param matchCase true for a case-sensitive comparison, false to ignore case
 * @return true when the two strings are equal under the selected comparison
 */
public static boolean straightCompare(String s1, String s2, boolean matchCase) {
  final boolean matched = matchCase ? s1.equals(s2) : s1.equalsIgnoreCase(s2);
  // kalina: do not remove, nice for debug
  // if (matched && (s2.startsWith("Kenneth") || s1.startsWith("Kenneth")))
  // Out.prln("Rule1: Matched " + s1 + "and " + s2);
  return matched;
}
/**
 * Returns true if exactly one of s1 and s2 is a single character and both strings
 * start with that same character (e.g. "J" and "John").
 *
 * An empty string has no initial, so it never matches; without this guard an empty
 * string paired with a one-character string would fail in charAt(0).
 *
 * @param s1 first string; must not be null
 * @param s2 second string; must not be null
 * @return true when one string is an initial of the other, false otherwise
 */
public static boolean initialMatch(String s1, String s2) {
  final int len1 = s1.length();
  final int len2 = s2.length();
  // Guard the charAt(0) calls below: an empty string cannot match on an initial.
  if (len1 == 0 || len2 == 0) {
    return false;
  }
  // XOR: exactly one of the two must be a single character.
  final boolean exactlyOneIsInitial = (len1 == 1) ^ (len2 == 1);
  return exactlyOneIsInitial && s1.charAt(0) == s2.charAt(0);
}
/**
 * Returns, as a string, the part of a document's content lying between two offsets.
 *
 * @param start offset where the span begins
 * @param end offset where the span ends
 * @param d document whose content is read
 * @return the string form of the content between start and end
 * @throws ExecutionException wrapping the InvalidOffsetException raised when the
 *         document content rejects the offsets
 */
public static String getStringForSpan(Long start, Long end,gate.Document d) throws ExecutionException {
  final String spanText;
  try {
    spanText = d.getContent().getContent(start, end).toString();
  } catch (InvalidOffsetException badOffsets) {
    // Not logged here; the cause travels with the rethrown exception.
    throw new ExecutionException(badOffsets);
  }
  return spanText;
}
/**
 * Runs every listed rule on a pair of names and reports whether any of them fired.
 * All listed rules are evaluated (there is no early exit), and each outcome is
 * written into mr at the rule's own index.
 *
 * NOTE(review): the type arguments of allrules are not visible in this listing;
 * the call to value(...) relies on the map's declared value type - confirm against
 * the original declaration.
 *
 * @param allrules rules keyed by rule number
 * @param executeRules numbers of the rules to run, in order
 * @param longName first argument handed to each rule
 * @param shortName second argument handed to each rule
 * @param mr per-rule result table, updated in place at each executed rule's index
 * @return true if at least one of the executed rules returned true
 */
public static boolean executeDisjunction(Map allrules, int[] executeRules,String longName,String shortName, boolean mr[]) {
  boolean anyFired = false;
  for (int ruleId : executeRules) {
    final boolean fired = allrules.get(ruleId).value(longName, shortName);
    mr[ruleId] = fired;
    anyFired |= fired;
  }
  return anyFired;
}
/**
 * Rounds a value to two decimal places.
 *
 * @param input value to round; a null value raises a NullPointerException on unboxing
 * @return the input scaled by 100, rounded with Math.round, then divided by 100
 */
public static Double round2Places(Double input) {
  final double scaled = input * 100.0;
  final long nearestHundredth = Math.round(scaled);
  return nearestHundredth / 100.0;
}
/**
* Used for test purposes.
* This table shows which rules have fired over a corpus; it is allocated with 18 slots
* in the static initialiser below.
*/
private static final boolean[] rulesUsedTable;
static {
rulesUsedTable = new boolean[18];
// NOTE(review): the next line looks truncated - a loop header runs straight into an
// iterator declaration, so the text between them (the rest of this initialiser and the
// start of whatever follows it) appears to be missing from this listing.
// TODO: restore from the original file before relying on this section.
for(int i=0;i> iter = rulesUsedTable.entrySet().iterator();
if (rulesUsedTable!=null) {
log.debug("Saving table of used orthomatcher rules:");
StringBuilder table=new StringBuilder();
// NOTE(review): this line is truncated as well. What follows reads like the body of a
// separate routine that sorts "a:b" pairs; the declarations of input, pairs, temp and ceil
// are not visible here. TODO: confirm against the original file.
for(int i=0;i1) {
int i=0;
//encode in temp: each "a:b" pair is packed into the single int a*ceil + b
//(presumably ceil exceeds every b so the packing is reversible - verify where ceil is defined)
for(String pair: pairs){
String[] s = pair.split(":");
int x=Integer.parseInt(s[0].trim())* ceil + Integer.parseInt(s[1].trim());
temp[i]=x;
i++;
}
//sorting the packed ints orders the pairs numerically
Arrays.sort(temp);
//decode from temp: unpack every int back into its "a:b" text form
StringBuilder result=new StringBuilder();
for(int n: temp) {
int a = n / ceil;
int b = n % ceil;
//every pair, including the last one, is followed by ", "
result.append(a).append(":").append(b).append(", ");
}
return result.toString();
}
else return input;//we do not need to sort a single pair
}
/*
* The feature "matches" contains annotation IDs.
* This method adds a new feature called "matches_positions" that gives the exact position (start:end offsets) of each annotation listed in "matches".
* The IDs in "matches" generally differ between runs, so they cannot be used for comparison in tools like the Corpus Quality Assurance tool.
* "matches_positions" can be used instead to check whether the matches really agree, for example in the Corpus Quality Assurance tool.
*
* The value written has the form "[start:end, start:end, ...]".
* Nothing is done when no annotation in nameAllAnnots carries the matches feature.
*/
protected static void setMatchesPositions(AnnotationSet nameAllAnnots) {
//get all annotations that have a matches feature
Set fNames = new HashSet();
fNames.add(gate.creole.ANNIEConstants.ANNOTATION_COREF_FEATURE_NAME);
AnnotationSet allMatchesAnnots =
nameAllAnnots.get(null, fNames);
if (allMatchesAnnots == null || allMatchesAnnots.isEmpty())
return;
for (Annotation currentMatchAnnot : allMatchesAnnots) {
String matchValue=currentMatchAnnot.getFeatures().get(gate.creole.ANNIEConstants.ANNOTATION_COREF_FEATURE_NAME).toString();
//drop the first and the last character of the feature's string form
//(presumably the enclosing brackets of a list - verify), then split the rest on commas
matchValue = matchValue.substring(1);
matchValue = matchValue.substring(0,matchValue.length()-1);
String[] annotationsIDs = matchValue.split(",");
String matchPositionsValue="";//with the annotations positions
String sentinel = ", ";
int[] integerIDs = OrthoMatcherHelper.convertArrayToInteger(annotationsIDs);
// NOTE(review): the next line looks truncated - the loop header runs into an iterator
// declaration, and the declarations of id and ann used below are not visible.
// TODO: restore from the original file.
for (int i=0; i iter = nameAllAnnots.iterator();
//find the annotation with the current ID from the match list - in order to get its start and end point;
//the annotation being processed is checked first, otherwise the whole set is scanned
if (currentMatchAnnot.getId()==id)
ann=currentMatchAnnot; else {
while (iter.hasNext()) {
Annotation a=iter.next();
if (a.getId()==id)
{
ann = a;
break;
}
}
}
//do the actual job of retrieving the start and end points; IDs that were not found are skipped
if (ann!=null) {
matchPositionsValue = matchPositionsValue + ann.getStartNode().getOffset()+":"+ann.getEndNode().getOffset()+sentinel;
}
}//end going through the match ids
//sort so that "matches_positions" is generated the same way every time and can therefore be compared
matchPositionsValue = OrthoMatcherHelper.SortByStartPosition(matchPositionsValue);
//formatting: remove the trailing separator, then wrap the list in square brackets
if (matchPositionsValue.endsWith(sentinel)) {
matchPositionsValue = matchPositionsValue.substring(0,matchPositionsValue.length()-sentinel.length());
}
matchPositionsValue = "["+matchPositionsValue+"]";
//finally store the new feature on the annotation
currentMatchAnnot.getFeatures().put("matches_positions", matchPositionsValue);
//}
} //end for - going through all the annotations that have the matches feature and adding the new feature
}//setMatchesPositions
}