// edu.utah.bmi.nlp.fastner.FastRuleWOG (listing header: Maven / Gradle / Ivy)
/*
* Copyright 2017 Department of Biomedical Informatics, University of Utah
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.utah.bmi.nlp.fastner;
import edu.utah.bmi.nlp.core.NERRule;
import edu.utah.bmi.nlp.core.NERSpan;
import edu.utah.bmi.nlp.core.Rule;
import edu.utah.bmi.nlp.core.Span;
import edu.utah.bmi.nlp.fastcner.UnicodeChecker;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.function.BiFunction;
import static edu.utah.bmi.nlp.core.NERSpan.byRuleLength;
import static edu.utah.bmi.nlp.core.NERSpan.scorewidth;
/**
* A subclass of FastRule that applies the full rule as a match (group capturing is not considered).
*
* @author Jianlin Shi
*/
@SuppressWarnings("rawtypes")
public class FastRuleWOG extends FastRule {
// fields are defined in abstract class
protected HashMap ruleLengths = new HashMap();
protected String spanCompareMethod = scorewidth;
protected String widthCompareMethod = byRuleLength;
public FastRuleWOG() {
}
public FastRuleWOG(String ruleStr) {
initiate(ruleStr, false);
}
public FastRuleWOG(String ruleStr, boolean caseSensitive) {
initiate(ruleStr, caseSensitive);
}
public FastRuleWOG(HashMap ruleStore) {
initiate(ruleStore);
}
public void setCompareMethod(String method) {
this.spanCompareMethod = method;
}
public void setWidthCompareMethod(String widthCompareMethod) {
this.widthCompareMethod = widthCompareMethod;
}
protected boolean addRule(Rule rule) {
// use to store the HashMap sub-chain that have the key chain that meet
// the rule[]
ArrayList rules_tmp = new ArrayList();
HashMap rule1 = rulesMap;
HashMap rule2 = new HashMap();
HashMap rule_t;
String[] ruleContent = rule.rule.split("\\s+");
ruleLengths.put(rule.id, ruleContent.length);
int length = ruleContent.length;
int i = 0;
rules_tmp.add(rulesMap);
while (i < length && rule1 != null && rule1.containsKey(ruleContent[i])) {
rule1 = (HashMap) rule1.get(ruleContent[i]);
i++;
}
// if the rule has been included
if (i > length)
return false;
// start with the determinant, construct the last descendant HashMap
//
if (i == length) {
if (rule1.containsKey(END)) {
((HashMap) rule1.get(END)).put(rule.ruleName, rule.id);
} else {
rule2.put(rule.ruleName, rule.id);
rule1.put(END, rule2.clone());
}
return true;
} else {
rule2.put(rule.ruleName, rule.id);
rule2.put(END, rule2.clone());
rule2.remove(rule.ruleName);
// filling the HashMap chain which ruleStore doesn't have the key chain
for (int j = length - 1; j > i; j--) {
rule_t = (HashMap) rule2.clone();
rule2.clear();
rule2.put(ruleContent[j], rule_t);
}
}
rule1.put(ruleContent[i], rule2.clone());
return true;
}
public HashMap> processTokens(ArrayList contextTokens) {
// use the first "startposition" to remember the original start matching
// position.
// use the 2nd one to remember the start position in which recursion.
HashMap> matches = new HashMap>();
for (int i = 0; i < contextTokens.size(); i++) {
// System.out.println(contextTokens.get(i));
processTokens(contextTokens, rulesMap, i, -1, i, matches);
}
if (removePseudo)
removePseudoMatches(matches);
return matches;
}
protected void processTokens(ArrayList contextTokens, HashMap rule, int matchBegin, int matchEnd, int currentPosition,
HashMap> matches) {
process(contextTokens, getStringText, getBeginId, getEndId,
rule, matchBegin, matchEnd, currentPosition, matches);
}
public HashMap> processSpans(ArrayList contextTokens) {
// use the first "startposition" to remember the original start matching
// position.
// use the 2nd one to remember the start position in which recursion.
HashMap> matches = new HashMap>();
for (int i = 0; i < contextTokens.size(); i++) {
// System.out.println(contextTokens.get(i));
processSpans(contextTokens, rulesMap, i, -1, i, matches);
}
if (removePseudo)
removePseudoMatches(matches);
return matches;
}
protected void processSpans(ArrayList contextTokens,
HashMap rule, int matchBegin, int matchEnd, int currentPosition,
HashMap> matches) {
process(contextTokens, getSpanText, getSpanBegin, getSpanEnd,
rule, matchBegin, matchEnd, currentPosition, matches);
}
protected void process(ArrayList> contextTokens,
BiFunction getText,
BiFunction getBegin,
BiFunction getEnd,
HashMap rule, int matchBegin, int matchEnd, int currentPosition,
HashMap> matches) {
// when reach the end of the tunedcontext, end the iteration
if (currentPosition < contextTokens.size()) {
// start processing the tunedcontext tokens
String thisToken = getText.apply(contextTokens, currentPosition);
// System.out.println("thisToken-"+thisToken);
if (rule.containsKey("\\w+")) {
process(contextTokens, getText, getBegin, getEnd, (HashMap) rule.get("\\w+"), matchBegin, matchEnd, currentPosition + 1, matches);
}
// if the end of a rule is met
if (rule.containsKey(END)) {
// if no () is used in this definition, use the whole rule string
addDeterminants(rule, matches, getBegin.apply(contextTokens, matchBegin), getEnd.apply(contextTokens, (matchEnd == -1 ? currentPosition - 1 : matchEnd)));
}
// if the current token match the element of a rule
if (rule.containsKey(thisToken)) {
process(contextTokens, getText, getBegin, getEnd, (HashMap) rule.get(thisToken), matchBegin, matchEnd, currentPosition + 1, matches);
}
if (rule.containsKey("\\d+") && UnicodeChecker.isNumber(thisToken)) {
process(contextTokens, getText, getBegin, getEnd, (HashMap) rule.get("\\d+"), matchBegin, matchEnd, currentPosition + 1, matches);
}
} else if (currentPosition == contextTokens.size() && rule.containsKey(END)) {
// if no () is used in this definition, use the whole rule string
matchEnd = matchEnd == -1 ? currentPosition - 1 : matchEnd;
addDeterminants(rule, matches, getBegin.apply(contextTokens, matchBegin), getEnd.apply(contextTokens, matchEnd));
}
}
@SuppressWarnings("unchecked")
protected void addDeterminants(HashMap rule, HashMap> matches, int matchBegin, int matchEnd) {
HashMap deterRule = (HashMap) rule.get(END);
Span currentSpan;
ArrayList currentSpanList;
for (Object key : deterRule.keySet()) {
// claim as Span instance, to be compatible with old methods
int ruleId = deterRule.get(key);
boolean contain = ruleLengths.containsKey(ruleId);
contain = ruleStore.containsKey(ruleId);
currentSpan = new NERSpan(matchBegin, matchEnd, ruleId, ruleLengths.get(ruleId), ruleStore.get(ruleId).score, "");
((NERSpan) currentSpan).setCompareMethod(spanCompareMethod);
((NERSpan) currentSpan).setWidthCompareMethod(widthCompareMethod);
logger.finest(getRule(currentSpan.ruleId).toString());
if (matches.containsKey((String) key)) {
// because the ruleStore are all processed at the same time from the input left to the input right,
// it becomes more efficient to compare the overlaps
currentSpanList = matches.get((String) key);
Span lastSpan = currentSpanList.get(currentSpanList.size() - 1);
// Since there is no directional preference, assume the span is not exclusive within each determinant.
if (currentSpan.end < lastSpan.end) {
// if currentSpan is within lastSpan
continue;
} else if (lastSpan.end > currentSpan.begin) {
// if overlap and current span has priority than last span
if (((NERSpan) currentSpan).compareTo((NERSpan) lastSpan) > 0) {
currentSpanList.remove(currentSpanList.size() - 1);
} else {
continue;
}
}
currentSpanList.add(currentSpan);
} else {
currentSpanList = new ArrayList();
currentSpanList.add(currentSpan);
}
matches.put((String) key, currentSpanList);
}
}
}