com.ontotext.kim.model.AliasTextTransformer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of gazetteer-lkb Show documentation
Show all versions of gazetteer-lkb Show documentation
A Large Knowledge Based (LKB) Gazetteer
The newest version!
package com.ontotext.kim.model;
import org.apache.commons.collections.Transformer;
/**
* This class implements specific text normalization logic used by the class
* {@code ParsingFrame}. It is intended to remove insignificant specifics
* of the stored/searched aliases before their comparison. Such specifics
* are:
* - Runs of consecutive white space (merged, but no more than 10 at a time).
* - The case of letters (only in case-insensitive mode).
*
* This class is left open for addition of other normalization procedures
* if such need arises.
*
* @author danko
*
*/
public class AliasTextTransformer implements Transformer {

    /**
     * Number of whitespace characters merged into one before the merge
     * counter is reset and a new run is started.
     */
    private static final int MAX_MERGED_WS = 10;

    /** When true, every character of the result is lower-cased. */
    private final boolean toLower;

    /**
     * Creates a transformer.
     *
     * @param toLower true to lower-case the text (case-insensitive mode),
     *                false to keep the original letter case
     */
    public AliasTextTransformer(boolean toLower) {
        super();
        this.toLower = toLower;
    }

    private boolean isWs(char c) {
        return Character.isWhitespace(c);
    }

    private char toLc(char c) {
        return Character.toLowerCase(c);
    }

    /**
     * Normalizes the string form of the given object.
     *
     * The normalization trims leading and trailing whitespace, collapses each
     * run of consecutive whitespace to its first character and, when the
     * transformer was created with toLower set, lower-cases every
     * character. Whitespace characters are not replaced by a blank; the first
     * character of a run is kept as it is.
     *
     * Once {@value #MAX_MERGED_WS} whitespace characters of a run have been
     * merged, the next whitespace character is emitted and the counter is
     * reset, so very long runs are not reduced to a single character.
     *
     * If the normalization changes nothing, the string returned by
     * input.toString() is handed back without copying.
     *
     * @param input the object to normalize; may be null
     * @return the normalized string; the empty string when the input is
     *         null, empty or consists of whitespace only
     */
    public Object transform(Object input) {
        if (input == null) {
            return "";
        }
        String in = input.toString();
        if (in == null || in.length() == 0) {
            return "";
        }

        // Trim: find the first and the last non-whitespace character.
        int first = 0;
        int last = in.length() - 1;
        while (first <= last && isWs(in.charAt(first))) {
            ++first;
        }
        if (first > last) {
            // Whitespace-only input normalizes to the empty string, exactly
            // like an empty input does.
            return "";
        }
        // Terminates at the latest on 'first', which is not whitespace.
        while (isWs(in.charAt(last))) {
            --last;
        }

        boolean changed = first != 0 || last != in.length() - 1;
        // The buffer is allocated lazily, only once a change is certain.
        StringBuilder strip = changed ? new StringBuilder(last - first + 1) : null;
        int mergedWs = 0;

        for (int k = first; k <= last; k++) {
            char c = in.charAt(k);
            boolean ws = isWs(c);

            // Detect the first change. As long as no change criterion is
            // fulfilled no string manipulation is done at all.
            if (!changed) {
                if (ws) {
                    if (mergedWs > 0) {
                        changed = true;      // second whitespace in a row
                    } else {
                        mergedWs = 1;
                    }
                } else if (toLower && toLc(c) != c) {
                    // Compare against the lower-case mapping rather than
                    // testing isUpperCase(): title-case characters are not
                    // upper case but are still altered by toLowerCase().
                    changed = true;
                } else {
                    mergedWs = 0;
                }
                if (!changed) {
                    continue;
                }
                // First change found: create the buffer and copy the
                // untouched prefix seen so far.
                strip = new StringBuilder(last - first + 1);
                strip.append(in, first, k);
            }

            if (ws && mergedWs < MAX_MERGED_WS) {
                // Keep only the first whitespace character of a run.
                if (mergedWs < 1) {
                    strip.append(c);
                }
                mergedWs++;
            } else {
                strip.append(toLower ? toLc(c) : c);
                mergedWs = 0;
            }
        }
        return changed ? strip.toString() : in;
    }
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy