// Source: edu.utah.bmi.nlp.fastcner.FastCRule (Maven / Gradle / Ivy)
/*
* Copyright 2017 Department of Biomedical Informatics, University of Utah
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.utah.bmi.nlp.fastcner;
import edu.utah.bmi.nlp.core.DeterminantValueSet.Determinants;
import edu.utah.bmi.nlp.core.*;
import edu.utah.bmi.nlp.fastner.FastRuleWG;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.logging.Level;
import static java.lang.Character.*;
/**
* This class is an extension of FastRules. Instead of handling string-element ruleStore, it handles char-element ruleStore
* Wildcard definition:
*
* ( Beginning of capturing a group
* ) End of capturing a group
* \p A punctuation
*
* \ plus following characters
* + An addition symbol (to distinguish the "+" after a wildcard)
* ( A left parentheses symbol
* ) A right parentheses symbol
* d A digit
* C A capital letter
* c A lowercase letter
* s A whitespace
* a A Non-whitespace character
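* u An unusual character: not a letter, not a number, not a punctuation, not a whitespace
* n A return (line feed or carriage return)
* w A whitespace or an unusual character
* b The beginning of the processed text
* e The end of the processed text
* \ A backslash symbol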
*
* The wildcard plus "+": 1 or more wildcard
*
* NOTE: FastCRule handles replicates slightly differently from its iteration-based counterpart
* (the original note named both classes "FastCRule"; the counterpart is presumably FastCRuleSB -- TODO confirm).
* FastCRule uses loops (usually much faster).
*
* Because it uses loops, it doesn't check any other possible rules that might match the replicates at the same time.
* A rule like "\\c+w" won't work as expected: the text will be consumed all the way through "w" while checking "\\c+".
*
* The counterpart uses iterations.
* Once the counterpart is fully tested and compared with FastCRule, FastCRule might be deprecated.
*
* @author Jianlin Shi
*/
//TODO need to support | (e.g. to detect MR #146-55-23-5)
public class FastCRule extends FastRuleWG {
// other fields are defined in abstract class
/** Maps a rule id to the score registered for it through setScore. */
protected HashMap<Integer, Double> scores = new HashMap<>();
/** Key under which a rule-trie node stores its determinant-name to rule-id map. */
protected final Determinants END = Determinants.END;
// max length of repeat char---to prevent overflow 25 works perfect, 10 is optimized for speed
protected int maxRepeatLength = 30;
protected boolean supportReplications = false, scSupport = false;
/** Overlap-resolution strategy read by compareSpan: "score", "scorewidth", "widthscore"; any other value compares widths only. */
protected String method = "width";
/** Added to the begin/end of every reported span; set by the processString overloads. */
protected int offset = 0;
// Because the match branches caused by wildcards, some right matches can be found before left matches
// A segment tree is maintained to check the overlapping among matches within a same type of concept
/** One interval tree per determinant name, mapping a stored span's interval to its index in that determinant's match list. */
protected HashMap<String, IntervalST> overlapCheckers = new HashMap<>();
/**
 * Creates an instance without loading any rule; only the field initializers run.
 */
protected FastCRule() {
}
/**
 * Creates an instance by handing the rule string to the superclass constructor.
 * NOTE(review): if the superclass constructor adds rules through the overridden
 * addRule, this class's field initializers (e.g. scores) have not run yet at that
 * point -- confirm against FastRuleWG.
 *
 * @param ruleStr rule source passed unchanged to the superclass
 */
public FastCRule(String ruleStr) {
// support read from OWL file, TSV file or OWL file directory
super(ruleStr);
}
/**
 * Creates an instance from an already-parsed rule store by passing it to the
 * inherited initiate method.
 *
 * @param ruleStore rule store forwarded to initiate (presumably rule id to Rule -- TODO confirm)
 */
public FastCRule(HashMap ruleStore) {
initiate(ruleStore);
}
/**
 * Override addRule method: inserts a rule into the character trie rooted at rulesMap.
 * Every character of the rule string is one trie level; the node reached after the last
 * character holds, under the END key, a map from determinant (rule name) to rule id.
 *
 * @param rule the rule to add; its rule string, ruleName, id and score are read
 * @return true: if the rule is added
 * false: if the rule is a duplicate (same rule string and same rule name)
 */
@SuppressWarnings("unchecked")
protected boolean addRule(Rule rule) {
    char[] crule = rule.rule.toCharArray();
    String determinant = rule.ruleName;
    int length = crule.length;
    // follow the part of the key chain that the trie already contains
    HashMap node = rulesMap;
    int i = 0;
    while (i < length && node != null && node.containsKey(crule[i])) {
        node = (HashMap) node.get(crule[i]);
        i++;
    }
    if (i == length) {
        // the whole rule string is already a path in the trie: only the determinant may be new
        HashMap determinants = (HashMap) node.get(END);
        if (determinants == null) {
            determinants = new HashMap();
            node.put(END, determinants);
        } else if (determinants.containsKey(determinant)) {
            // The value stored under END is a map, so it has to be queried by key;
            // comparing it to the determinant string by reference could never match.
            logger.info("This rule has been included");
            return false;
        }
        determinants.put(determinant, rule.id);
        setScore(rule.id, rule.score);
        return true;
    }
    // build the missing tail bottom-up, starting from the END node
    HashMap determinants = new HashMap();
    determinants.put(determinant, rule.id);
    HashMap tail = new HashMap();
    tail.put(END, determinants);
    for (int j = length - 1; j > i; j--) {
        HashMap parent = new HashMap();
        parent.put(crule[j], tail);
        tail = parent;
    }
    // map rule to score;
    setScore(rule.id, rule.score);
    // hinge the new tail onto the deepest existing node
    node.put(crule[i], tail);
    return true;
}
public HashMap> processTokens(ArrayList contextTokens) {
if (logger.isLoggable(Level.FINEST))
logger.finest("This method is not used in character-based ruleStore");
return null;
}
public HashMap> processSpans(ArrayList contextTokens) {
if (logger.isLoggable(Level.FINEST))
logger.finest("This method is not used in character-based ruleStore");
return null;
}
public HashMap> processString(String text) {
offset = 0;
return processRules(text);
}
public HashMap> processString(String text, int offset) {
this.offset = offset;
return processRules(text);
}
public HashMap> processSpan(Span span) {
return processString(span.text, span.begin);
}
public HashMap> processRules(String text) {
// use the first "startposition" to remember the original start matching
// position.
// use the 2nd one to remember the start position in which recursion.
HashMap> matches = new HashMap<>();
char[] textChars = text.toCharArray();
for (int i = 0; i < textChars.length; i++) {
char previousChar = i > 0 ? textChars[i - 1] : ' ';
processRules(text, textChars, rulesMap, i, -1, i, matches, previousChar, false, ' ');
}
if (removePseudo)
removePseudoMatches(matches);
return matches;
}
protected void processRules(String text, char[] textChars, HashMap rule, int matchBegin, int matchEnd, int currentPosition,
HashMap> matches,
char previousChar, boolean wildcard, char previousKey) {
// when reach the end of the tunedcontext, end the iteration
if (currentPosition < textChars.length) {
char thisChar = textChars[currentPosition];
if (rule.containsKey('\\')) {
processWildCards(text, textChars, (HashMap) rule.get('\\'), matchBegin, matchEnd, currentPosition, matches, previousChar, true, '\\');
}
if (rule.containsKey('(') && previousKey != '\\') {
processRules(text, textChars, (HashMap) rule.get('('), currentPosition, matchEnd, currentPosition, matches,
previousChar, false, '(');
}
if (rule.containsKey(')') && previousKey != '\\') {
processRules(text, textChars, (HashMap) rule.get(')'), matchBegin, currentPosition, currentPosition, matches,
previousChar, false, ')');
}
// if the end of a rule is met
if (rule.containsKey(END)) {
addDeterminants(text, rule, matches, matchBegin, matchEnd, currentPosition);
}
// if the current token match the element of a rule
if (rule.containsKey(thisChar) && (thisChar != ')' && thisChar != '(')) {
processRules(text, textChars, (HashMap) rule.get(thisChar), matchBegin, matchEnd, currentPosition + 1, matches,
thisChar, false, thisChar);
}
// if(currentRepeats>0)
// currentRepeats=currentRepeats;
// Replications of current char
if (supportReplications && rule.containsKey('+')) {
// processRules(textChars, (HashMap) rule.get('+'), matchBegin, matchEnd, currentPosition, matches,
// previousChar, false, ' ');
processRules(text, textChars, (HashMap) rule.get('+'), matchBegin, matchEnd, currentPosition, matches,
thisChar, false, '+');
processReplicants(text, textChars, (HashMap) rule.get('+'), matchBegin, matchEnd, currentPosition, matches,
thisChar, wildcard, previousKey);
}
} else if (currentPosition == textChars.length && rule.containsKey(END)) {
if (matchEnd == -1)
addDeterminants(text, rule, matches, matchBegin, currentPosition, currentPosition);
else
addDeterminants(text, rule, matches, matchBegin, matchEnd, currentPosition);
} else if (currentPosition == textChars.length && rule.containsKey('\\') && ((HashMap) rule.get('\\')).containsKey('e')) {
HashMap deterRule = ((HashMap) ((HashMap) rule.get('\\')).get('e'));
if (matchEnd == -1)
addDeterminants(text, deterRule, matches, matchBegin, currentPosition, currentPosition);
else
addDeterminants(text, deterRule, matches, matchBegin, matchEnd, currentPosition);
} else if (currentPosition == textChars.length && rule.containsKey(')')) {
HashMap deterRule = (HashMap) rule.get(')');
if (deterRule.containsKey(END)) {
addDeterminants(text, deterRule, matches, matchBegin, currentPosition, currentPosition);
} else if (deterRule.containsKey('\\') && ((HashMap) deterRule.get('\\')).containsKey('e'))
processRules(text, textChars, (HashMap) ((HashMap) deterRule.get('\\')).get('e'), matchBegin, matchEnd, currentPosition, matches, previousChar, false, ' ');
} else if (currentPosition == textChars.length && rule.containsKey('+')) {
HashMap deterRule = (HashMap) rule.get('+');
processRules(text, textChars, deterRule, matchBegin, matchEnd, currentPosition, matches, previousChar, wildcard, previousKey);
}
}
/**
 * Wildcard "\s": a blank, a tab or a non-breaking space (code point 160).
 *
 * @param thisChar character to test
 * @return true if the character is one of the three
 */
protected boolean iss(char thisChar) {
    switch (thisChar) {
        case ' ':
        case '\t':
        case 160:
            return true;
        default:
            return false;
    }
}
/**
 * Wildcard "\d": a digit as defined by Character.isDigit.
 *
 * @param thisChar character to test
 * @return true if the character is a digit
 */
protected boolean isd(char thisChar) {
    return Character.isDigit(thisChar);
}
/**
 * Wildcard "\C": an uppercase letter as defined by Character.isUpperCase.
 *
 * @param thisChar character to test
 * @return true if the character is uppercase
 */
protected boolean isC(char thisChar) {
    return Character.isUpperCase(thisChar);
}
/**
 * Wildcard "\c": a lowercase letter as defined by Character.isLowerCase.
 *
 * @param thisChar character to test
 * @return true if the character is lowercase
 */
protected boolean isc(char thisChar) {
    return Character.isLowerCase(thisChar);
}
/**
 * Wildcard "\p": delegates to WildCardChecker.isPunctuation.
 *
 * @param thisChar character to test
 * @return the checker's verdict
 */
protected boolean isp(char thisChar) {
    boolean punctuation = WildCardChecker.isPunctuation(thisChar);
    return punctuation;
}
protected boolean isu(char thisChar) {
return WildCardChecker.isSpecialChar(thisChar);
}
/**
 * Wildcard "\w": a Java whitespace, a non-breaking space (code point 160), or a
 * character accepted by WildCardChecker.isSpecialChar.
 *
 * @param thisChar character to test
 * @return true if any of the three conditions holds
 */
protected boolean isw(char thisChar) {
    if (Character.isWhitespace(thisChar)) {
        return true;
    }
    if (thisChar == 160) {
        return true;
    }
    return WildCardChecker.isSpecialChar(thisChar);
}
/**
 * Wildcard "\a": any character that is neither a Java whitespace nor a
 * non-breaking space (code point 160).
 *
 * @param thisChar character to test
 * @return true if the character is not whitespace
 */
protected boolean isa(char thisChar) {
    return !(Character.isWhitespace(thisChar) || thisChar == 160);
}
protected void processWildCards(String text, char[] textChars, HashMap rule, int matchBegin, int matchEnd, int currentPosition, HashMap> matches, char previousChar, boolean wildcard, char previousKey) {
char thisChar = textChars[currentPosition];
for (Object rulechar : rule.keySet()) {
char thisRuleChar = (Character) rulechar;
switch (thisRuleChar) {
case 's':
// if (thisChar == ' ' || thisChar == '\t' || (scSupport && !(isLetterOrDigit(thisChar) || isWhitespace(thisChar) || WildCardChecker.isPunctuation(thisChar)))) {
if (iss(thisChar)) {
processRules(text, textChars, (HashMap) rule.get('s'), matchBegin, matchEnd, currentPosition + 1, matches,
thisChar, true, 's');
}
break;
case 'n':
if (thisChar == '\n' || thisChar == '\r') {
processRules(text, textChars, (HashMap) rule.get('n'), matchBegin, matchEnd, currentPosition + 1, matches,
thisChar, true, 'n');
}
break;
case '(':
if (thisChar == '(')
processRules(text, textChars, (HashMap) rule.get('('), matchBegin, matchEnd, currentPosition + 1, matches,
thisChar, true, '(');
break;
case ')':
if (thisChar == ')')
processRules(text, textChars, (HashMap) rule.get(')'), matchBegin, matchEnd, currentPosition + 1, matches,
thisChar, true, ')');
break;
case 'd':
if (isd(thisChar)) {
processRules(text, textChars, (HashMap) rule.get('d'), matchBegin, matchEnd, currentPosition + 1, matches,
thisChar, true, 'd');
}
break;
case 'C':
if (isC(thisChar)) {
processRules(text, textChars, (HashMap) rule.get('C'), matchBegin, matchEnd, currentPosition + 1, matches,
thisChar, true, 'C');
}
break;
case 'c':
if (isc(thisChar)) {
processRules(text, textChars, (HashMap) rule.get('c'), matchBegin, matchEnd, currentPosition + 1, matches,
thisChar, true, 'c');
}
break;
case 'p':
if (isp(thisChar)) {
processRules(text, textChars, (HashMap) rule.get('p'), matchBegin, matchEnd, currentPosition + 1, matches,
thisChar, true, 'p');
}
break;
case '+':
if (thisChar == '+') {
processRules(text, textChars, (HashMap) rule.get('+'), matchBegin, matchEnd, currentPosition + 1, matches,
thisChar, true, '+');
}
break;
case '\\':
if (thisChar == '\\') {
processRules(text, textChars, (HashMap) rule.get('\\'), matchBegin, matchEnd, currentPosition + 1, matches,
thisChar, false, '\\');
}
break;
case 'b':
if (currentPosition == 0)
processRules(text, textChars, (HashMap) rule.get('b'), matchBegin, matchEnd, currentPosition, matches,
previousChar, false, 'b');
break;
case 'a':
if (isa(thisChar))
// if(thisChar!=' ' && thisChar!='\t' && thisChar!='\r' && thisChar!='\n')
processRules(text, textChars, (HashMap) rule.get('a'), matchBegin, matchEnd, currentPosition + 1, matches,
thisChar, true, 'a');
break;
case 'u':
if (isu(thisChar))
processRules(text, textChars, (HashMap) rule.get('u'), matchBegin, matchEnd, currentPosition + 1, matches,
thisChar, true, 'u');
break;
case 'w':
if (isw(thisChar)) {
processRules(text, textChars, (HashMap) rule.get('w'), matchBegin, matchEnd, currentPosition + 1, matches,
thisChar, true, 'w');
}
break;
// TODO negation rule
// case '^':
// break;
}
}
}
protected void processReplicants(String text, char[] textChars, HashMap rule, int matchBegin, int matchEnd, int currentPosition, HashMap> matches, char previousChar, boolean wildcard, char previousKey) {
char thisChar = textChars[currentPosition];
int currentRepeats = 0;
if (wildcard) {
switch (previousKey) {
case 's':
// if (thisChar == ' ' || thisChar == '\t' || (int)thisChar==160 || (scSupport && !(isLetterOrDigit(thisChar) || isWhitespace(thisChar) || WildCardChecker.isPunctuation(thisChar)))) {
if (iss(thisChar)) {
while (iss(thisChar) && currentRepeats < maxRepeatLength && currentPosition < textChars.length) {
currentPosition++;
currentRepeats++;
if (currentPosition == textChars.length)
break;
thisChar = textChars[currentPosition];
}
}
break;
case 'n':
if ((thisChar == '\n' || thisChar == '\r')) {
while ((thisChar == '\n' || thisChar == '\r') && currentRepeats < maxRepeatLength && currentPosition < textChars.length) {
currentPosition++;
currentRepeats++;
if (currentPosition == textChars.length)
break;
thisChar = textChars[currentPosition];
}
}
break;
case 'd':
if (isd(thisChar)) {
while (isd(thisChar) && currentRepeats < maxRepeatLength && currentPosition < textChars.length) {
currentPosition++;
currentRepeats++;
if (currentPosition == textChars.length)
break;
thisChar = textChars[currentPosition];
}
}
break;
case 'C':
if (isC(thisChar)) {
while (isC(thisChar) && currentRepeats < maxRepeatLength && currentPosition < textChars.length) {
currentPosition++;
currentRepeats++;
if (currentPosition == textChars.length)
break;
thisChar = textChars[currentPosition];
}
}
break;
case 'c':
if (isc(thisChar)) {
while (isc(thisChar) && currentRepeats < maxRepeatLength && currentPosition < textChars.length) {
currentPosition++;
currentRepeats++;
if (currentPosition == textChars.length)
break;
thisChar = textChars[currentPosition];
}//
}
break;
case 'p':
if (isp(thisChar)) {
while (isp(thisChar) && currentRepeats < maxRepeatLength && currentPosition < textChars.length) {
currentPosition++;
currentRepeats++;
if (currentPosition == textChars.length)
break;
thisChar = textChars[currentPosition];
}
}
break;
case 'a':
if (isa(thisChar)) {
while (isa(thisChar) && currentRepeats < maxRepeatLength && currentPosition < textChars.length) {
currentPosition++;
currentRepeats++;
if (currentPosition == textChars.length)
break;
thisChar = textChars[currentPosition];
}
}
break;
case 'u':
if (isu(thisChar)) {
while (isu(thisChar) && currentRepeats < maxRepeatLength && currentPosition < textChars.length) {
currentPosition++;
currentRepeats++;
if (currentPosition == textChars.length)
break;
thisChar = textChars[currentPosition];
}
}
break;
case 'w':
if (isw(thisChar)) {
while (isw(thisChar) && currentRepeats < maxRepeatLength && currentPosition < textChars.length) {
currentPosition++;
currentRepeats++;
if (currentPosition == textChars.length)
break;
thisChar = textChars[currentPosition];
}
}
break;
}
processRules(text, textChars, rule, matchBegin, matchEnd, currentPosition, matches,
previousChar, false, '+');
} else if (thisChar == previousKey) {
while ((thisChar == previousKey) && currentRepeats < maxRepeatLength && currentPosition < textChars.length) {
currentPosition++;
currentRepeats++;
if (currentPosition == textChars.length)
break;
thisChar = textChars[currentPosition];
}
processRules(text, textChars, rule, matchBegin, matchEnd, currentPosition, matches,
previousChar, false, '+');
}
}
protected void addDeterminants(String text, HashMap rule, HashMap> matches,
int matchBegin, int matchEnd, int currentPosition) {
HashMap deterRule = (HashMap) rule.get(END);
int end = matchEnd == -1 ? currentPosition : matchEnd;
if (matchBegin > end) {
StringBuilder sb = new StringBuilder();
for (Object key : deterRule.keySet()) {
int rulePos = deterRule.get(key);
sb.append(getRule(rulePos).toString());
sb.append("\n");
}
logger.warning("Rule definition error ----matched begin > matched end\n" +
"check the following rules: \n" + sb.toString());
int snippetBegin = matchBegin - 100;
snippetBegin = snippetBegin < 0 ? 0 : snippetBegin;
int snippetEnd = end + 100;
snippetEnd = snippetEnd > text.length() ? text.length() : snippetEnd;
logger.warning("try to match span: " + text.substring(snippetBegin, end) + "<*>"
+ text.substring(end, matchBegin) + "<*>" + text.substring(matchBegin, snippetEnd));
return;
}
Span currentSpan = new Span(matchBegin + offset, end + offset, text.substring(matchBegin, end));
if (logger.isLoggable(Level.FINEST))
logger.finest("Try to addDeterminants: " + currentSpan.begin + ", " + currentSpan.end + "\t" + currentSpan.text);
for (Object key : deterRule.keySet()) {
ArrayList currentSpanList = new ArrayList<>();
int rulePos = deterRule.get(key);
double score = getScore(rulePos);
currentSpan.ruleId = rulePos;
currentSpan.score = score;
if (logger.isLoggable(Level.FINEST))
logger.finest("\t\tRule Id: " + rulePos + "\t" + key + "\t" + getRule(rulePos).type + "\t" + getRuleString(rulePos));
// If needed, implement your own selection ruleStore and score updating logic below
if (matches.containsKey(key)) {
// because the ruleStore are all processed at the same time from the input left to the input right,
// it becomes more efficient to compare the overlaps
currentSpanList = matches.get(key);
IntervalST overlapChecker = overlapCheckers.get(key);
Object overlappedPos = overlapChecker.get(new Interval1D(currentSpan.begin, currentSpan.end - 1));
if (overlappedPos != null) {
int pos = (int) overlappedPos;
Span overlappedSpan = currentSpanList.get(pos);
if (logger.isLoggable(Level.FINEST))
logger.finest("\t\tOverlapped with: " + overlappedSpan.begin + ", " + overlappedSpan.end + "\t" +
text.substring(overlappedSpan.begin - offset, overlappedSpan.end - offset));
if (!compareSpan(currentSpan, overlappedSpan)) {
if (logger.isLoggable(Level.FINEST))
logger.finest("\t\tSkip this span ...");
continue;
}
currentSpanList.set(pos, currentSpan);
overlapChecker.remove(new Interval1D(overlappedSpan.begin, overlappedSpan.end - 1));
overlapChecker.put(new Interval1D(currentSpan.begin, currentSpan.end - 1), pos);
} else {
overlapChecker.put(new Interval1D(currentSpan.begin, currentSpan.end - 1), currentSpanList.size());
currentSpanList.add(currentSpan);
}
} else {
currentSpanList.add(currentSpan);
matches.put((String) key, currentSpanList);
IntervalST overlapChecker = new IntervalST();
overlapChecker.put(new Interval1D(currentSpan.begin, currentSpan.end - 1), 0);
overlapCheckers.put((String) key, overlapChecker);
}
}
}
/**
 * Looks up the score registered for the rule that produced the span.
 *
 * @param span a span whose ruleId is read
 * @return the registered score
 * @throws NullPointerException if no score was registered for that rule id
 */
public double getScore(Span span) {
    // the explicit cast keeps the lookup valid whether or not the map field is parameterized
    return (Double) scores.get(span.ruleId);
}
/**
 * Looks up the score registered for a rule id.
 *
 * @param ruleId id of the rule
 * @return the registered score
 * @throws NullPointerException if no score was registered for that rule id
 */
public double getScore(int ruleId) {
    // the explicit cast keeps the lookup valid whether or not the map field is parameterized
    return (Double) scores.get(ruleId);
}
/**
 * Registers (or replaces) the score of a rule.
 *
 * @param ruleId id of the rule
 * @param score  score to associate with the rule
 */
public void setScore(int ruleId, double score) {
    this.scores.put(ruleId, score);
}
/**
 * Switches the handling of "+" (one or more repetitions) on or off.
 * Supporting "+" might slow down the performance of FastCRule,
 * so avoid enabling it unless the rules need it.
 *
 * @param support support replications
 */
public void setReplicationSupport(boolean support) {
    supportReplications = support;
}
/**
 * Selects how compareSpan decides between two overlapping spans.
 *
 * @param method "score", "scorewidth" or "widthscore"; any other value compares widths only
 */
public void setCompareMethod(String method) {
    String chosen = method;
    this.method = chosen;
}
/**
 * Prefers the span with the higher score. A negative score on a makes a win outright;
 * otherwise a negative score on b makes a lose.
 *
 * @param a the new span
 * @param b the stored span it overlaps
 * @return true if a should replace b
 */
protected boolean compareScoreOnly(Span a, Span b) {
    double scoreA = getScore(a);
    if (scoreA < 0) {
        return true;
    }
    double scoreB = getScore(b);
    return !(scoreB < 0) && scoreA > scoreB;
}
/**
 * Prefers the strictly wider span.
 *
 * @param a the new span
 * @param b the stored span it overlaps
 * @return true if a is wider than b
 */
protected boolean compareWidthOnly(Span a, Span b) {
    return b.width < a.width;
}
/**
 * Compares by score first and by width on equal scores. A negative score on a makes a
 * win outright; otherwise a negative score on b makes a lose.
 *
 * @param a the new span
 * @param b the stored span it overlaps
 * @return true if a should replace b
 */
protected boolean compareScorePrior(Span a, Span b) {
    if (logger.isLoggable(Level.FINEST)) {
        logger.finest("\t\tcurrent " + a.ruleId + " score: " + getScore(a.ruleId) + "\t---\toverlapped " + b.ruleId + " score: " + getScore(b.ruleId));
    }
    double scoreA = getScore(a);
    if (scoreA < 0) {
        return true;
    }
    double scoreB = getScore(b);
    if (scoreB < 0) {
        return false;
    }
    // higher score wins; on a tie the wider span wins
    return scoreA > scoreB || (scoreA >= scoreB && a.width > b.width);
}
/**
 * Compares by width first and by score on equal widths.
 *
 * @param a the new span
 * @param b the stored span it overlaps
 * @return true if a should replace b
 */
protected boolean compareWidthPrior(Span a, Span b) {
    return a.width > b.width
            || (a.width == b.width && getScore(a) > getScore(b));
}
/**
 * Decides whether a new span should replace an overlapping stored span, using the
 * strategy named by the method field. Unknown names, and a null name, fall back to
 * comparing widths only.
 *
 * @param a the new span
 * @param b the stored span it overlaps
 * @return true if a should replace b
 */
protected boolean compareSpan(Span a, Span b) {
    // switching on a null string throws; map null onto a value that reaches the default branch
    switch (method == null ? "" : method) {
        case "score":
            return compareScoreOnly(a, b);
        case "scorewidth":
            return compareScorePrior(a, b);
        case "widthscore":
            return compareWidthPrior(a, b);
        default:
            return compareWidthOnly(a, b);
    }
}
/**
 * Stores the special-character support flag.
 *
 * @param scSupport the new flag value; null is treated as false instead of failing on unboxing
 */
public void setSpecialCharacterSupport(Boolean scSupport) {
    this.scSupport = Boolean.TRUE.equals(scSupport);
}
/**
 * Sets the upper bound on how many characters a single "+" repetition may consume.
 *
 * @param maxRepeatLength the new bound
 */
public void setMaxRepeatLength(int maxRepeatLength) {
    int limit = maxRepeatLength;
    this.maxRepeatLength = limit;
}
}