com.ontotext.kim.model.ParsingFrame Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of gazetteer-lkb Show documentation
Show all versions of gazetteer-lkb Show documentation
A Large Knowledge Based (LKB) Gazetteer
The newest version!
package com.ontotext.kim.model;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.collections.Transformer;
import org.apache.commons.collections.TransformerUtils;
import com.ontotext.kim.model.AliasCacheImpl.Stats;
/**
 * This class is used as a text parsing tool by the classes
 * {@code AliasCacheImpl} and {@code KimLookupParser}, and produces
 * special output for the class {@code HashedAlias}.
 * <p>
 * The class provides means to parse a string into a sequence of plain numeric
 * and letter lexemes delimited by optional non-alpha-numeric character
 * sequences of arbitrary length. This is the base for the creation of a Hashed
 * Alias and also the base for searching for it in a text.
 * <p>
 * The class offers a frame view over the parsed string. The frame spans one or
 * more sequential alpha-numeric lexemes (ANL) and their non-alpha-numeric
 * context (surrounding punctuation and white space).
 * The frame contains three parts:
 * <ul>
 * <li>middle - the part of the initial string which starts at the
 * beginning of the first ANL of the frame and ends at the end of the last ANL
 * of the frame.</li>
 * <li>prefix - the non-alpha-numeric sequence before the first ANL of
 * the frame.</li>
 * <li>suffix - the non-alpha-numeric sequence after the last ANL of the
 * frame.</li>
 * </ul>
 * The class has a couple of methods for manipulating the frame. The first one
 * extends the frame by one lexeme. The second one moves
 * the frame start forward by one lexeme, which resets the frame length to one
 * lexeme.
 * <p>
 * The class has retrieval methods that return the current frame offset and
 * the lengths of the three parts. There are dedicated methods for calculating
 * the two hash codes specific to a Hashed Alias.
 *
 * @author danko
 *
 */
public class ParsingFrame {

    /**
     * Compile-time switch selecting the letter-lexeme sub-pattern: when
     * {@code true} a run of letters is additionally split where a lower-case
     * letter is followed by an upper-case one; when {@code false} any run of
     * upper/lower-case letters is a single lexeme.
     */
    public static final boolean SPLIT_AT_SMALL_TO_CAPITAL_CASE_CHANGE = false;

    //================================================
    // ParsingFrame: Lexeme parsing regular expression
    //================================================

    /**
     * The Reg-Ex pattern that is used to search for the ANLs.
     * Group 1 is the alpha-numeric lexeme, group 2 is the (possibly empty)
     * non-alpha-numeric sequence that follows it.
     */
    private static final Pattern LEXEME_MATCH_PAT = Pattern.compile(
            "(" +
            // Number only lexemes
            "(?:\\d)+|" +
            (SPLIT_AT_SMALL_TO_CAPITAL_CASE_CHANGE
                ? "(?:\\p{javaUpperCase}{0,}\\p{javaLowerCase}{1,})|(?:\\p{javaUpperCase}{1,})"
                : "(?:[\\p{javaUpperCase}\\p{javaLowerCase}]+)" ) +
            ")"
            // Non-Alpha-Numeric suffix
            + "([^\\d\\p{javaUpperCase}\\p{javaLowerCase}]*)"
    );

    /**
     * The delimiting character used for the normalized form of the parsed
     * string. The normalized form is used to calculate the H1 hash value of
     * the {@code HashedAlias}.
     */
    private static final String H1_DELIMITER = "^";

    /**
     * Externally replaceable text transformation applied before both hash
     * codes are calculated. Defaults to a no-op transformer.
     */
    public static Transformer frameTT = TransformerUtils.nopTransformer();

    //================================================
    // ParsingFrame: Parse result buffering
    //================================================

    /**
     * A single alpha-numeric lexeme (ANL) together with its
     * non-alpha-numeric context.
     */
    private static class ParsingBufferElement {
        /** Length of the non-alpha-numeric sequence before the lexeme. */
        public final int pref;
        /** Length of the lexeme itself. */
        public final int midd;
        /** Length of the non-alpha-numeric sequence after the lexeme. */
        public final int suff;
        /** Text of the lexeme. */
        public final String middTxt;
        /** Offset in the source string at which the prefix starts. */
        public final int offset;

        public ParsingBufferElement(int pref, int midd, int suff,
                String middTxt, int offset) {
            this.pref = pref;
            this.midd = midd;
            this.suff = suff;
            this.middTxt = middTxt;
            this.offset = offset;
        }
    }

    /** Placeholder used by {@link #parseOne()} when no element could be read. */
    private static final ParsingBufferElement EMPTY_ELEMENT =
            new ParsingBufferElement(0, 0, 0, "", 0);

    /**
     * The parsing buffer that stores the results from parsing the input
     * text. The result is stored as a sequence of ANL elements with the
     * sizes of their non-alpha-numeric prefix and suffix.
     */
    private final ArrayList<ParsingBufferElement> parsingBuffer =
            new ArrayList<ParsingBufferElement>();

    //================================================
    // ParsingFrame: The definition data of the frame
    //================================================

    /** Index of the last buffer element covered by the frame (-1: none yet). */
    private int parsingIx = -1;
    /** Index of the first buffer element covered by the frame. */
    private int frameIx = 0;
    /** Index at which the next {@link #parseOne()} restarts the frame; -1 when no restart is pending. */
    private int restartIx = 0;
    private boolean frameCanExpand = true;
    private int prefix = 0;
    private int middle = 0;
    private int suffix = 0;
    private final String source;
    private String normalizedAlias = "";
    private int lexemeCount = 0;

    //================================================
    // ParsingFrame: LookUp and Annotation data
    //================================================

    /** Cached hash codes; 0 means "not calculated yet". */
    private int aliasHash1 = 0;
    private int aliasHash2 = 0;
    /** Start (inclusive) and end (exclusive) offsets of the effective frame text in the source. */
    private int aliasOffset1 = 0;
    private int aliasOffset2 = 0;
    /** Last effective prefix/suffix lengths applied; -1 forces recalculation. */
    private int oldPref = -1;
    private int oldSuff = -1;

    //================================================
    // ParsingFrame: class constructor
    //================================================

    /**
     * Parses the source string with the regular expression based
     * {@link java.util.regex.Matcher} and stores the results in the parsing
     * buffer.
     *
     * @param source the String to be processed through the
     *        {@code ParsingFrame} class
     */
    public ParsingFrame (String source) {
        // NOTE(review): Stats.markIt(..) calls look like profiling
        // checkpoints; their order is kept exactly as in the original.
        Stats.markIt(-1);
        this.source = source;
        Matcher lexMatcher = LEXEME_MATCH_PAT.matcher(source);
        // Length of the previous element's suffix, which is the prefix of the
        // current one. For the first match it is everything before the lexeme.
        int lastSuff = -1;
        while (lexMatcher.find()) {
            if (lastSuff < 0) {
                lastSuff = lexMatcher.start();
            }
            int suffLen = lexMatcher.end(2) - lexMatcher.start(2);
            parsingBuffer.add(new ParsingBufferElement(
                    lastSuff,
                    lexMatcher.end(1) - lexMatcher.start(1),
                    suffLen,
                    lexMatcher.group(1),
                    lexMatcher.start() - lastSuff));
            lastSuff = suffLen;
        }
        Stats.markIt(7);
    }

    /**
     * Returns the source offset of the first element of the frame, or 0 when
     * the source contained no lexemes at all (empty parsing buffer).
     */
    private int frameOffset() {
        return parsingBuffer.isEmpty() ? 0 : parsingBuffer.get(frameIx).offset;
    }

    /** Drops the cached lookup data and re-applies the full prefix/suffix. */
    private void resetLAD() {
        aliasHash1 = 0;
        aliasHash2 = 0;
        oldPref = -1;
        oldSuff = -1;
        this.setNewPrefSufLen(prefix, suffix);
    }

    /**
     * Changes the effective lengths of the non-alpha-numeric prefix and
     * suffix. This change affects the results of the methods
     * {@link #getAliasHash2()}, {@link #getLength()},
     * {@link #getAliasStart()} and {@link #getAliasEnd()}.
     * Requested lengths are limited to the actual prefix and suffix lengths
     * of the frame. The last effective values are stored, so if called again
     * with values that lead to the same effective lengths no recalculation
     * is performed.
     *
     * @param prefLen the new effective prefix length.
     * @param suffLen the new effective suffix length.
     */
    public void setNewPrefSufLen(int prefLen, int suffLen) {
        // Clamp before comparing: the stored values are always the clamped
        // ones, so comparing raw arguments would trigger needless resets.
        final int effPref = Math.min(prefLen, prefix);
        final int effSuff = Math.min(suffLen, suffix);
        final boolean prefChanged = (oldPref != effPref);
        if (prefChanged) {
            aliasOffset1 = frameOffset() + prefix - effPref;
        }
        if (prefChanged || oldSuff != effSuff) {
            aliasOffset2 = aliasOffset1 + effPref + middle + effSuff;
            aliasHash2 = 0; // Recalculation will be done when and if required
        }
        oldPref = effPref;
        oldSuff = effSuff;
    }

    //================================================
    // ParsingFrame: Frame manipulation methods
    //================================================

    /**
     * Attempts to extend the frame to cover one more parsing-buffer element.
     * If a restart is pending (initially, or after a successful
     * {@link #moveOne()}), the frame is first reset to start at the restart
     * position. The frame state metrics are recalculated in either case.
     *
     * @return {@code true} if the content of the frame was extended with
     *         one more buffer element
     */
    public boolean parseOne() {
        if (!frameCanMove()) {
            return false;
        }

        Stats.markIt(-1);
        ParsingBufferElement pbe = EMPTY_ELEMENT;
        boolean oneParsed = false;
        if (restartIx >= 0) {
            frameIx = restartIx;
            lexemeCount = 0;
            oneParsed = (restartIx < parsingBuffer.size());
            if (oneParsed) {
                parsingIx = restartIx;
            }
        }
        else {
            oneParsed = ((parsingIx + 1) < parsingBuffer.size());
            if (oneParsed) {
                parsingIx++;
            }
        }
        if (oneParsed) {
            pbe = parsingBuffer.get(parsingIx);
            lexemeCount++;
        }
        Stats.markIt(8);

        Stats.markIt(-1);
        if (restartIx >= 0) {
            // Fresh frame: exactly one element.
            prefix = pbe.pref;
            middle = pbe.midd;
            suffix = pbe.suff;
            normalizedAlias = H1_DELIMITER + pbe.middTxt + H1_DELIMITER;
        }
        else {
            // Extension: the old suffix becomes part of the middle.
            middle += suffix + pbe.midd;
            suffix = pbe.suff;
            normalizedAlias += pbe.middTxt + H1_DELIMITER;
        }
        // Allows for preliminary detection of parse ending
        frameCanExpand = oneParsed && ((parsingIx + 1) < parsingBuffer.size());
        restartIx = -1;
        resetLAD();
        Stats.markIt(9);
        return oneParsed;
    }

    /**
     * A shortcut method that extends the frame to cover the whole input
     * string.
     */
    public void parseAll() {
        while (frameCanExpand) {
            parseOne();
        }
    }

    /**
     * Moves the start of the frame one buffer element forward and shrinks
     * the size of the frame to one element. The move takes effect on the
     * next call to {@link #parseOne()}. Does nothing if a restart is already
     * pending or if the frame already starts at the last buffer element.
     */
    public void moveOne() {
        if (restartIx >= 0) {
            return;
        }
        // Calculate the offset for new parsing start
        int nextStart = frameIx + 1;
        // Conditions under which the frame-movement attempt fails
        restartIx = (nextStart >= parsingBuffer.size()) ? -1 : nextStart;
    }

    //==================================================
    // ParsingFrame: Extraction of frame dynamic flags
    //==================================================

    /** @return {@code true} if there is at least one more buffer element after the frame */
    public boolean frameCanExpand() { return frameCanExpand; }

    /** @return {@code true} if {@link #parseOne()} can still do work (expand or restart) */
    public boolean frameCanMove() { return frameCanExpand || restartIx >= 0; }

    //==================================================
    // ParsingFrame: Extraction of frame state metrics
    //==================================================

    /**
     * Retrieves the {@code HashedAlias} related Hash-Code-1. It is
     * calculated over the normalized form of the underlying text. A text
     * transformation with externally provided transformation logic
     * ({@link #frameTT}) is performed prior to hash-code calculation.
     *
     * @return the value of the hash-code.
     * @throws IllegalStateException if the frame has no parsed content
     */
    public int getAliasHash1() {
        checkValid();
        return cachedAliasHash1();
    }

    /** Lazily calculates Hash-Code-1 without checking the frame validity. */
    private int cachedAliasHash1() {
        if (aliasHash1 == 0) {
            aliasHash1 = frameTT.transform(normalizedAlias).hashCode();
        }
        return aliasHash1;
    }

    /**
     * Retrieves the {@code HashedAlias} related Hash-Code-2. It is
     * calculated over the plain underlying text between the effective frame
     * start and end. A text transformation with externally provided
     * transformation logic ({@link #frameTT}) is performed prior to
     * hash-code calculation.
     *
     * @return the value of the hash-code.
     * @throws IllegalStateException if the frame has no parsed content
     */
    public int getAliasHash2() {
        checkValid();
        if (aliasHash2 == 0) {
            String frameText = source.substring(aliasOffset1, aliasOffset2);
            aliasHash2 = frameTT.transform(frameText).hashCode();
        }
        return aliasHash2;
    }

    /**
     * Retrieves the length of the non-alpha-numeric prefix of the frame
     * @return non-alpha-numeric prefix length
     */
    public int getPrefixLen() { checkValid(); return prefix; }

    /**
     * Retrieves the length of the middle part of the frame
     * @return middle part length
     */
    public int getMiddleLen() { checkValid(); return middle; }

    /**
     * Retrieves the length of the non-alpha-numeric suffix of the frame
     * @return non-alpha-numeric suffix length
     */
    public int getSuffixLen() { checkValid(); return suffix; }

    /**
     * Retrieves the offset in the original input text of the frame start
     * @return frame start offset
     */
    public int getAliasStart() { checkValid(); return aliasOffset1; }

    /**
     * Retrieves the offset in the original input text of the frame end
     * @return frame end offset
     */
    public int getAliasEnd() { checkValid(); return aliasOffset2; }

    /**
     * Retrieves the length of the text corresponding to the current frame
     * @return frame text length
     */
    public int getLength() { checkValid(); return aliasOffset2 - aliasOffset1; }

    /**
     * Retrieves the count of ANLs in the frame
     * @return ANL count
     */
    public int getLexemeCount() { checkValid(); return lexemeCount; }

    /**
     * Guards the metric getters: a frame with a pending restart (freshly
     * constructed, or moved but not yet re-parsed) has no usable content.
     */
    private void checkValid() {
        if (restartIx >= 0) {
            throw new IllegalStateException("No parsed content in the Frame!");
        }
    }

    //========================================
    // Methods overridden for Testing purposes
    //========================================

    /**
     * Two frames are equal when they start at the same source offset, span
     * the same number of lexemes and have the same Hash-Code-1. The
     * comparison uses the most recently parsed frame content and never
     * throws, even for frames without lexemes or with a pending restart.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof ParsingFrame)) {
            return false;
        }
        ParsingFrame pf = (ParsingFrame) o;
        return pf.frameOffset() == this.frameOffset()
                && pf.lexemeCount == this.lexemeCount
                && pf.cachedAliasHash1() == this.cachedAliasHash1();
    }

    /**
     * Constant hash code: trivially consistent with {@link #equals(Object)}
     * and stable while the (mutable) frame moves over the text.
     */
    @Override
    public int hashCode() {
        return 1;
    }

    @Override
    public String toString() {
        return this.normalizedAlias +
                "(" + frameOffset() + "/" +
                source.length() + ")" +
                prefix + "-" + middle + "-" + suffix +
                ((frameCanExpand) ? " CAN-expand" : " NO-expand") + " " +
                ((restartIx >= 0) ? ("RELOCATE=" + restartIx) : "");
    }
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy