edu.utah.bmi.nlp.fastcner.FastCRule Maven / Gradle / Ivy

Go to download
/*
 * Copyright  2017  Department of Biomedical Informatics, University of Utah
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 

 * http://www.apache.org/licenses/LICENSE-2.0
 * 

 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.utah.bmi.nlp.fastcner;

import edu.utah.bmi.nlp.core.DeterminantValueSet.Determinants;
import edu.utah.bmi.nlp.core.*;
import edu.utah.bmi.nlp.fastner.FastRuleWG;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.logging.Level;

import static java.lang.Character.*;

/**
 * This class is an extension of FastRules. Instead of handling string-element ruleStore, it handles char-element ruleStore
 * Wildcard definition:
 * 

 * (   Beginning of capturing a group
 * )   End of capturing a group
 * \p   A punctuation
 * 

 * \ plus following characters
 * +   An addition symbol (to distinguish the "+" after a wildcard)
 * (   A left parentheses symbol
 * )   A right parentheses symbol
 * d   A digit
 * C   A capital letter
 * c   A lowercase letter
 * s   A whitespace
 * a   A Non-whitespace character
 * u   A unusual character: not a letter, not a number, not a punctuation, not a whitespace
 * n   A return
 * 

 * The wildcard plus "+": 1 or more wildcard
 * 

 * NOTE: FastCRule is slightly different to FastCRule when handling replicates.
 * FastCRule uses loops (usually is much faster)
 * 

 * Because it use loops, it doesn't check any other possible ruleStore that might match the replicates at the same time.
 * The rule like "\\c+w" won't work as expected: the rule will be matched all the way to "w" when checking "\\c+".
 * 
 * FastCRule uses iterations.
 * Once FastCRule is fully tested and compared with FastCRule, FastCRule might be deprecated.
 *
 * @author Jianlin Shi
 */
//TODO need to support | (e.g. to detect MR #146-55-23-5)

public class FastCRule extends FastRuleWG {
    //  other  fields are defined in abstract class
    protected HashMap scores = new HashMap();
    protected final Determinants END = Determinants.END;
    //  max length of repeat char---to prevent overflow 25 works perfect, 10 is optimized for speed
    protected int maxRepeatLength = 30;
    protected boolean supportReplications = false, scSupport = false;
    protected String method = "width";
    protected int offset = 0;

    //    Because the match branches caused by wildcards, some right matches can be found before left matches
//    A segment tree is maintained to check the overlapping among matches within a same type of concept
    protected HashMap overlapCheckers = new HashMap<>();


    protected FastCRule() {
    }


    public FastCRule(String ruleStr) {
//        support read from OWl file, TSV file or OWL file directory
        super(ruleStr);
    }

    public FastCRule(HashMap ruleStore) {
        initiate(ruleStore);
    }


    /**
     * Override addRule method
     *
     * @return true: if the rule is added
     * false: if the rule is a duplicate
     */
    @SuppressWarnings("unchecked")
    protected boolean addRule(Rule rule) {
//      use to store the HashMap sub-chain that have the key chain that overlap with the current rule
//      rule1 to temporally store the hinges of existing HashMap chain that overlap with current rule
        char[] crule = rule.rule.toCharArray();
        String determinant = rule.ruleName;
        HashMap rule1 = rulesMap;
//      rule2 to construct the new HashMap sub-chain that doesn't overlap with existing chain
        HashMap rule2 = new HashMap();
        HashMap rulet = new HashMap();
        int length = crule.length;
        int i = 0;

        while (i < length && rule1 != null && rule1.containsKey(crule[i])) {
            rule1 = (HashMap) rule1.get(crule[i]);
            i++;
        }
        // if the rule has been included
        if (i == length && rule1.containsKey(END) && rule1.get(END) == determinant) {
            logger.info("This rule has been included");
            return false;
        }
        // start with the determinant, construct the last descendant HashMap
        // >
        if (i == length) {
            if (rule1.containsKey(END)) {
                ((HashMap) rule1.get(END)).put(determinant, rule.id);
            } else {
                rule2.put(determinant, rule.id);
                rule1.put(END, rule2.clone());
            }
            setScore(rule.id, rule.score);
            return true;
        } else {
            rule2.put(determinant, rule.id);
            rule2.put(END, rule2.clone());
            rule2.remove(determinant);

            // filling the HashMap chain which ruleStore doesn't have the key chain
            for (int j = length - 1; j > i; j--) {
                rulet = (HashMap) rule2.clone();
                rule2.clear();
                rule2.put(crule[j], rulet);
            }
        }
//      map rule to score;
        setScore(rule.id, rule.score);

        rule1.put(crule[i], rule2.clone());
        return true;
    }

    public HashMap> processTokens(ArrayList contextTokens) {
        if (logger.isLoggable(Level.FINEST))
            logger.finest("This method is not used in character-based ruleStore");
        return null;
    }

    public HashMap> processSpans(ArrayList contextTokens) {
        if (logger.isLoggable(Level.FINEST))
            logger.finest("This method is not used in character-based ruleStore");
        return null;
    }

    public HashMap> processString(String text) {
        offset = 0;
        return processRules(text);
    }

    public HashMap> processString(String text, int offset) {
        this.offset = offset;
        return processRules(text);
    }

    public HashMap> processSpan(Span span) {
        return processString(span.text, span.begin);
    }


    public HashMap> processRules(String text) {
        // use the first "startposition" to remember the original start matching
        // position.
        // use the 2nd one to remember the start position in which recursion.
        HashMap> matches = new HashMap<>();
        char[] textChars = text.toCharArray();
        for (int i = 0; i < textChars.length; i++) {
            char previousChar = i > 0 ? textChars[i - 1] : ' ';
            processRules(text, textChars, rulesMap, i, -1, i, matches, previousChar, false, ' ');
        }
        if (removePseudo)
            removePseudoMatches(matches);
        return matches;

    }


    protected void processRules(String text, char[] textChars, HashMap rule, int matchBegin, int matchEnd, int currentPosition,
                                HashMap> matches,
                                char previousChar, boolean wildcard, char previousKey) {
        // when reach the end of the tunedcontext, end the iteration
        if (currentPosition < textChars.length) {
            char thisChar = textChars[currentPosition];

            if (rule.containsKey('\\')) {
                processWildCards(text, textChars, (HashMap) rule.get('\\'), matchBegin, matchEnd, currentPosition, matches, previousChar, true, '\\');
            }
            if (rule.containsKey('(') && previousKey != '\\') {
                processRules(text, textChars, (HashMap) rule.get('('), currentPosition, matchEnd, currentPosition, matches,
                        previousChar, false, '(');
            }
            if (rule.containsKey(')') && previousKey != '\\') {
                processRules(text, textChars, (HashMap) rule.get(')'), matchBegin, currentPosition, currentPosition, matches,
                        previousChar, false, ')');

            }
            // if the end of a rule is met

            if (rule.containsKey(END)) {
                addDeterminants(text, rule, matches, matchBegin, matchEnd, currentPosition);
            }
            // if the current token match the element of a rule
            if (rule.containsKey(thisChar) && (thisChar != ')' && thisChar != '(')) {
                processRules(text, textChars, (HashMap) rule.get(thisChar), matchBegin, matchEnd, currentPosition + 1, matches,
                        thisChar, false, thisChar);
            }

//            if(currentRepeats>0)
//                currentRepeats=currentRepeats;
//          Replications of current char
            if (supportReplications && rule.containsKey('+')) {
//                processRules(textChars, (HashMap) rule.get('+'), matchBegin, matchEnd, currentPosition, matches,
//                        previousChar, false, ' ');
                processRules(text, textChars, (HashMap) rule.get('+'), matchBegin, matchEnd, currentPosition, matches,
                        thisChar, false, '+');
                processReplicants(text, textChars, (HashMap) rule.get('+'), matchBegin, matchEnd, currentPosition, matches,
                        thisChar, wildcard, previousKey);
            }


        } else if (currentPosition == textChars.length && rule.containsKey(END)) {
            if (matchEnd == -1)
                addDeterminants(text, rule, matches, matchBegin, currentPosition, currentPosition);
            else
                addDeterminants(text, rule, matches, matchBegin, matchEnd, currentPosition);
        } else if (currentPosition == textChars.length && rule.containsKey('\\') && ((HashMap) rule.get('\\')).containsKey('e')) {
            HashMap deterRule = ((HashMap) ((HashMap) rule.get('\\')).get('e'));
            if (matchEnd == -1)
                addDeterminants(text, deterRule, matches, matchBegin, currentPosition, currentPosition);
            else
                addDeterminants(text, deterRule, matches, matchBegin, matchEnd, currentPosition);
        } else if (currentPosition == textChars.length && rule.containsKey(')')) {
            HashMap deterRule = (HashMap) rule.get(')');
            if (deterRule.containsKey(END)) {
                addDeterminants(text, deterRule, matches, matchBegin, currentPosition, currentPosition);
            } else if (deterRule.containsKey('\\') && ((HashMap) deterRule.get('\\')).containsKey('e'))
                processRules(text, textChars, (HashMap) ((HashMap) deterRule.get('\\')).get('e'), matchBegin, matchEnd, currentPosition, matches, previousChar, false, ' ');
        } else if (currentPosition == textChars.length && rule.containsKey('+')) {
            HashMap deterRule = (HashMap) rule.get('+');
            processRules(text, textChars, deterRule, matchBegin, matchEnd, currentPosition, matches, previousChar, wildcard, previousKey);
        }
    }

    protected boolean iss(char thisChar) {
        return (thisChar == ' ' || thisChar == '\t' || (int) thisChar == 160);
    }

    protected boolean isd(char thisChar) {
        return isDigit(thisChar);
    }

    protected boolean isC(char thisChar) {
        return isUpperCase(thisChar);
    }

    protected boolean isc(char thisChar) {
        return isLowerCase(thisChar);
    }

    protected boolean isp(char thisChar) {
        return WildCardChecker.isPunctuation(thisChar);
    }

    protected boolean isu(char thisChar) {
        return WildCardChecker.isSpecialChar(thisChar);
    }

    protected boolean isw(char thisChar) {
        return isWhitespace(thisChar) || (int) thisChar == 160 || WildCardChecker.isSpecialChar(thisChar);
    }

    protected boolean isa(char thisChar) {
        return !isWhitespace(thisChar) && !((int) thisChar == 160);
    }


    protected void processWildCards(String text, char[] textChars, HashMap rule, int matchBegin, int matchEnd, int currentPosition, HashMap> matches, char previousChar, boolean wildcard, char previousKey) {
        char thisChar = textChars[currentPosition];
        for (Object rulechar : rule.keySet()) {
            char thisRuleChar = (Character) rulechar;
            switch (thisRuleChar) {
                case 's':
//                    if (thisChar == ' ' || thisChar == '\t' || (scSupport && !(isLetterOrDigit(thisChar) || isWhitespace(thisChar) || WildCardChecker.isPunctuation(thisChar)))) {
                    if (iss(thisChar)) {
                        processRules(text, textChars, (HashMap) rule.get('s'), matchBegin, matchEnd, currentPosition + 1, matches,
                                thisChar, true, 's');
                    }
                    break;
                case 'n':
                    if (thisChar == '\n' || thisChar == '\r') {
                        processRules(text, textChars, (HashMap) rule.get('n'), matchBegin, matchEnd, currentPosition + 1, matches,
                                thisChar, true, 'n');
                    }
                    break;
                case '(':
                    if (thisChar == '(')
                        processRules(text, textChars, (HashMap) rule.get('('), matchBegin, matchEnd, currentPosition + 1, matches,
                                thisChar, true, '(');
                    break;
                case ')':
                    if (thisChar == ')')
                        processRules(text, textChars, (HashMap) rule.get(')'), matchBegin, matchEnd, currentPosition + 1, matches,
                                thisChar, true, ')');
                    break;
                case 'd':
                    if (isd(thisChar)) {
                        processRules(text, textChars, (HashMap) rule.get('d'), matchBegin, matchEnd, currentPosition + 1, matches,
                                thisChar, true, 'd');
                    }
                    break;
                case 'C':
                    if (isC(thisChar)) {
                        processRules(text, textChars, (HashMap) rule.get('C'), matchBegin, matchEnd, currentPosition + 1, matches,
                                thisChar, true, 'C');
                    }
                    break;
                case 'c':
                    if (isc(thisChar)) {
                        processRules(text, textChars, (HashMap) rule.get('c'), matchBegin, matchEnd, currentPosition + 1, matches,
                                thisChar, true, 'c');
                    }
                    break;
                case 'p':
                    if (isp(thisChar)) {
                        processRules(text, textChars, (HashMap) rule.get('p'), matchBegin, matchEnd, currentPosition + 1, matches,
                                thisChar, true, 'p');
                    }
                    break;
                case '+':
                    if (thisChar == '+') {
                        processRules(text, textChars, (HashMap) rule.get('+'), matchBegin, matchEnd, currentPosition + 1, matches,
                                thisChar, true, '+');
                    }
                    break;
                case '\\':
                    if (thisChar == '\\') {
                        processRules(text, textChars, (HashMap) rule.get('\\'), matchBegin, matchEnd, currentPosition + 1, matches,
                                thisChar, false, '\\');
                    }
                    break;
                case 'b':
                    if (currentPosition == 0)
                        processRules(text, textChars, (HashMap) rule.get('b'), matchBegin, matchEnd, currentPosition, matches,
                                previousChar, false, 'b');
                    break;
                case 'a':
                    if (isa(thisChar))
//                    if(thisChar!=' ' && thisChar!='\t' && thisChar!='\r' && thisChar!='\n')
                        processRules(text, textChars, (HashMap) rule.get('a'), matchBegin, matchEnd, currentPosition + 1, matches,
                                thisChar, true, 'a');
                    break;
                case 'u':
                    if (isu(thisChar))
                        processRules(text, textChars, (HashMap) rule.get('u'), matchBegin, matchEnd, currentPosition + 1, matches,
                                thisChar, true, 'u');
                    break;

                case 'w':
                    if (isw(thisChar)) {
                        processRules(text, textChars, (HashMap) rule.get('w'), matchBegin, matchEnd, currentPosition + 1, matches,
                                thisChar, true, 'w');
                    }
                    break;
//                TODO negation rule
//                case '^':
//                    break;

            }
        }

    }

    protected void processReplicants(String text, char[] textChars, HashMap rule, int matchBegin, int matchEnd, int currentPosition, HashMap> matches, char previousChar, boolean wildcard, char previousKey) {
        char thisChar = textChars[currentPosition];
        int currentRepeats = 0;
        if (wildcard) {
            switch (previousKey) {
                case 's':
                    //                        if (thisChar == ' ' || thisChar == '\t' || (int)thisChar==160 || (scSupport && !(isLetterOrDigit(thisChar) || isWhitespace(thisChar) || WildCardChecker.isPunctuation(thisChar)))) {
                    if (iss(thisChar)) {
                        while (iss(thisChar) && currentRepeats < maxRepeatLength && currentPosition < textChars.length) {
                            currentPosition++;
                            currentRepeats++;
                            if (currentPosition == textChars.length)
                                break;
                            thisChar = textChars[currentPosition];
                        }
                    }
                    break;
                case 'n':
                    if ((thisChar == '\n' || thisChar == '\r')) {
                        while ((thisChar == '\n' || thisChar == '\r') && currentRepeats < maxRepeatLength && currentPosition < textChars.length) {
                            currentPosition++;
                            currentRepeats++;
                            if (currentPosition == textChars.length)
                                break;
                            thisChar = textChars[currentPosition];
                        }
                    }
                    break;
                case 'd':
                    if (isd(thisChar)) {
                        while (isd(thisChar) && currentRepeats < maxRepeatLength && currentPosition < textChars.length) {
                            currentPosition++;
                            currentRepeats++;
                            if (currentPosition == textChars.length)
                                break;
                            thisChar = textChars[currentPosition];
                        }
                    }
                    break;
                case 'C':
                    if (isC(thisChar)) {
                        while (isC(thisChar) && currentRepeats < maxRepeatLength && currentPosition < textChars.length) {
                            currentPosition++;
                            currentRepeats++;
                            if (currentPosition == textChars.length)
                                break;
                            thisChar = textChars[currentPosition];
                        }
                    }
                    break;
                case 'c':
                    if (isc(thisChar)) {
                        while (isc(thisChar) && currentRepeats < maxRepeatLength && currentPosition < textChars.length) {
                            currentPosition++;
                            currentRepeats++;
                            if (currentPosition == textChars.length)
                                break;
                            thisChar = textChars[currentPosition];
                        }//
                    }
                    break;
                case 'p':
                    if (isp(thisChar)) {
                        while (isp(thisChar) && currentRepeats < maxRepeatLength && currentPosition < textChars.length) {
                            currentPosition++;
                            currentRepeats++;
                            if (currentPosition == textChars.length)
                                break;
                            thisChar = textChars[currentPosition];
                        }
                    }
                    break;
                case 'a':
                    if (isa(thisChar)) {
                        while (isa(thisChar) && currentRepeats < maxRepeatLength && currentPosition < textChars.length) {
                            currentPosition++;
                            currentRepeats++;
                            if (currentPosition == textChars.length)
                                break;
                            thisChar = textChars[currentPosition];
                        }
                    }
                    break;
                case 'u':
                    if (isu(thisChar)) {
                        while (isu(thisChar) && currentRepeats < maxRepeatLength && currentPosition < textChars.length) {
                            currentPosition++;
                            currentRepeats++;
                            if (currentPosition == textChars.length)
                                break;
                            thisChar = textChars[currentPosition];
                        }
                    }
                    break;
                case 'w':
                    if (isw(thisChar)) {
                        while (isw(thisChar) && currentRepeats < maxRepeatLength && currentPosition < textChars.length) {
                            currentPosition++;
                            currentRepeats++;
                            if (currentPosition == textChars.length)
                                break;
                            thisChar = textChars[currentPosition];
                        }
                    }
                    break;
            }
            processRules(text, textChars, rule, matchBegin, matchEnd, currentPosition, matches,
                    previousChar, false, '+');
        } else if (thisChar == previousKey) {
            while ((thisChar == previousKey) && currentRepeats < maxRepeatLength && currentPosition < textChars.length) {
                currentPosition++;
                currentRepeats++;
                if (currentPosition == textChars.length)
                    break;
                thisChar = textChars[currentPosition];

            }
            processRules(text, textChars, rule, matchBegin, matchEnd, currentPosition, matches,
                    previousChar, false, '+');
        }
    }


    protected void addDeterminants(String text, HashMap rule, HashMap> matches,
                                   int matchBegin, int matchEnd, int currentPosition) {
        HashMap deterRule = (HashMap) rule.get(END);
        int end = matchEnd == -1 ? currentPosition : matchEnd;
        if (matchBegin > end) {
            StringBuilder sb = new StringBuilder();
            for (Object key : deterRule.keySet()) {
                int rulePos = deterRule.get(key);
                sb.append(getRule(rulePos).toString());
                sb.append("\n");
            }
            logger.warning("Rule definition error ----matched begin > matched end\n" +
                    "check the following rules: \n" + sb.toString());
            int snippetBegin = matchBegin - 100;
            snippetBegin = snippetBegin < 0 ? 0 : snippetBegin;
            int snippetEnd = end + 100;
            snippetEnd = snippetEnd > text.length() ? text.length() : snippetEnd;
            logger.warning("try to match span: " + text.substring(snippetBegin, end) + "<*>"
                    + text.substring(end, matchBegin) + "<*>" + text.substring(matchBegin, snippetEnd));
            return;

        }
        Span currentSpan = new Span(matchBegin + offset, end + offset, text.substring(matchBegin, end));
        if (logger.isLoggable(Level.FINEST))
            logger.finest("Try to addDeterminants: " + currentSpan.begin + ", " + currentSpan.end + "\t" + currentSpan.text);

        for (Object key : deterRule.keySet()) {
            ArrayList currentSpanList = new ArrayList<>();
            int rulePos = deterRule.get(key);
            double score = getScore(rulePos);
            currentSpan.ruleId = rulePos;
            currentSpan.score = score;
            if (logger.isLoggable(Level.FINEST))
                logger.finest("\t\tRule Id: " + rulePos + "\t" + key + "\t" + getRule(rulePos).type + "\t" + getRuleString(rulePos));
//          If needed, implement your own selection ruleStore and score updating logic below
            if (matches.containsKey(key)) {
//              because the ruleStore are all processed at the same time from the input left to the input right,
//                it becomes more efficient to compare the overlaps
                currentSpanList = matches.get(key);
                IntervalST overlapChecker = overlapCheckers.get(key);
                Object overlappedPos = overlapChecker.get(new Interval1D(currentSpan.begin, currentSpan.end - 1));
                if (overlappedPos != null) {
                    int pos = (int) overlappedPos;
                    Span overlappedSpan = currentSpanList.get(pos);
                    if (logger.isLoggable(Level.FINEST))
                        logger.finest("\t\tOverlapped with: " + overlappedSpan.begin + ", " + overlappedSpan.end + "\t" +
                                text.substring(overlappedSpan.begin - offset, overlappedSpan.end - offset));
                    if (!compareSpan(currentSpan, overlappedSpan)) {
                        if (logger.isLoggable(Level.FINEST))
                            logger.finest("\t\tSkip this span ...");
                        continue;
                    }
                    currentSpanList.set(pos, currentSpan);
                    overlapChecker.remove(new Interval1D(overlappedSpan.begin, overlappedSpan.end - 1));
                    overlapChecker.put(new Interval1D(currentSpan.begin, currentSpan.end - 1), pos);
                } else {
                    overlapChecker.put(new Interval1D(currentSpan.begin, currentSpan.end - 1), currentSpanList.size());
                    currentSpanList.add(currentSpan);
                }
            } else {
                currentSpanList.add(currentSpan);
                matches.put((String) key, currentSpanList);
                IntervalST overlapChecker = new IntervalST();
                overlapChecker.put(new Interval1D(currentSpan.begin, currentSpan.end - 1), 0);
                overlapCheckers.put((String) key, overlapChecker);
            }
        }
    }

    public double getScore(Span span) {
        return scores.get(span.ruleId);
    }

    public double getScore(int ruleId) {
        return scores.get(ruleId);
    }

    public void setScore(int ruleId, double score) {
        scores.put(ruleId, score);
    }

    /**
     * Using "+" to support replications might slow down the performance of FastCRule,
     * try to avoid using it as much as possible.
     *
     * @param support support replications
     */
    public void setReplicationSupport(boolean support) {
        this.supportReplications = support;
    }

    public void setCompareMethod(String method) {
        this.method = method;
    }

    protected boolean compareScoreOnly(Span a, Span b) {
        if (getScore(a) < 0)
            return true;
        if (getScore(b) < 0)
            return false;
        return getScore(a) > getScore(b);
    }

    protected boolean compareWidthOnly(Span a, Span b) {
        return a.width > b.width;
    }

    protected boolean compareScorePrior(Span a, Span b) {
        if (logger.isLoggable(Level.FINEST))
            logger.finest("\t\tcurrent " + a.ruleId + " score: " + getScore(a.ruleId) + "\t---\toverlapped " + b.ruleId + " score: " + getScore(b.ruleId));
        if (getScore(a) < 0)
            return true;
        if (getScore(b) < 0)
            return false;
        if (getScore(a) > getScore(b)) {
            return true;
        } else if (getScore(a) >= getScore(b) && a.width > b.width
//                && getRule(b.ruleId).type != Determinants.PSEUDO
                ) {
            return true;
        }
        return false;
    }

    protected boolean compareWidthPrior(Span a, Span b) {
        if (a.width > b.width) {
            return true;
        } else if (a.width == b.width && getScore(a) > getScore(b)) {
            return true;
        }
        return false;
    }

    protected boolean compareSpan(Span a, Span b) {
        switch (method) {
            case "score":
                return compareScoreOnly(a, b);
            case "scorewidth":
                return compareScorePrior(a, b);
            case "widthscore":
                return compareWidthPrior(a, b);
            default:
                return compareWidthOnly(a, b);
        }
    }

    public void setSpecialCharacterSupport(Boolean scSupport) {
        this.scSupport = scSupport;
    }

    public void setMaxRepeatLength(int maxRepeatLength) {
        this.maxRepeatLength = maxRepeatLength;
    }

}