com.aliasi.spell.SpellEvaluator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of aliasi-lingpipe Show documentation
This is the original Lingpipe: http://alias-i.com/lingpipe/web/download.html There were not made any changes to the source code.
There is a newer version: 4.1.2-JL1.0
Show newest version
/*
 * LingPipe v. 4.1.0
 * Copyright (C) 2003-2011 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 * 
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.spell;

import com.aliasi.classify.ConfusionMatrix;

import com.aliasi.lm.LanguageModel;

import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;

import com.aliasi.util.ObjectToCounterMap;
import com.aliasi.util.Strings;

import java.text.DecimalFormat;

import java.util.ArrayList;
import java.util.List;
import java.util.Set;


/**
 * The SpellEvaluator provides an evaluation harness for
 * spell checkers.  As with the other evaluator classes, it is
 * constructed with the spell checker that will be evaluated.  Test
 * cases are presented to the evaluator using the {@link
 * #addCase(String,String)} method.  The {@link #getLastCaseReport()}
 * method returns a string-based representation of the performance of
 * the most recently provided test case.  The method {@link
 * #toString()} provides a general report of results.
 *
 * The method {@link #normalize(String)} may be used to normalize
 * both input text and system outputs before comparing them.  This
 * may be used to do an evaluation that is case or space or
 * punctuation insensitive, for example.
 *
 * 
The basic output of the spell checker evaluation classifies
 * test cases into one of five categories:
 * 
 * 

 * 
 * 
 * 
 * 

 * 
 * 

 * 
 * 

 * 
 * 

 * 
 * 
 * User Input System Suggestion Status Method
Correct No Suggestion TN {@link #userCorrectSystemNoSuggestion()}
Correct Wrong Suggestion FP {@link #userCorrectSystemWrongSuggestion()}
Error Correct Suggestion TP {@link #userErrorSystemCorrect()}
Error No Suggestion FN {@link #userErrorSystemNoSuggestion()}
Error Wrong Suggestion FN,FP {@link #userErrorSystemWrongSuggestion()}
 *
 * The status indicates whether the case counts as a true positive
 * (TP), false positive (FP), true negative (TN), or false negative
 * (FN).  Note that if the user's input contains an error and the
 * system provides the wrong suggestion, the result counts as both a
 * false negative (failure to correct) and a false positive (erroneous
 * correction).  Because of the case of user
 * input error and wrong system correction, the confusion matrix count
 * is not quite one-to-one in size with the input size.  A confusion
 * matrix may be retrieved (populated with the above counts) through
 * the method {@link #confusionMatrix()}.
 *
 * 
The methods for extracting the cases are listed in the final column
 * for each of the five result types.
 *
 * @author  Breck Baldwin
 * @author Bob Carpenter
 * @version 3.8
 * @since   LingPipe2.4.1
 */
public class SpellEvaluator {

    private final SpellChecker mSpellChecker;

    private final List mTextList 
        = new ArrayList();
    private final List mCorrectTextList 
        = new ArrayList();
    private final List mSuggestionList
        = new ArrayList();

    private String mLastCaseReport = "No cases added yet.";

    private int mUserCorrectSystemWrongSuggestion = 0;
    private int mUserCorrectSystemNoSuggestion = 0;
    private int mUserErrorSystemNoSuggestion = 0;
    private int mUserErrorSystemCorrect = 0;
    private int mUserErrorSystemWrongSuggestion = 0;

    private final ObjectToCounterMap mTokenCounter;

    /** 
     * Construct a spelling evaluator for the specified spell checker.
     *
     * @param checker Spell checker to evaluate.
     */
    public SpellEvaluator(SpellChecker checker) {
        this(checker,null);
    }

    /** 
     * Construct a spelling evaluator for the specified spell checker
     * with the specified token counts.  The token counts will be
     * used to report counts of tokens in the corpus along with
     * per-line outputs.  If the token counter is null,
     * no token reports are provided.  In order for the token counts
     * to be used, the spell checker must be an instance of
     * {@link CompiledSpellChecker}.
     *
     * @param checker Spell checker to evaluate.
     * @param tokenCounter Counter for tokens in the speller.
     */
    public SpellEvaluator(SpellChecker checker,
                          ObjectToCounterMap tokenCounter) {
        mSpellChecker = checker;
        mTokenCounter = tokenCounter;
    }

    /**
     * Adds a training case to the spelling evaluator in the form
     * of input text and its corrected form.  
     *
     * @param text Text to spell check.
     * @param correctText Correct form of input text.
     */
    public void addCase(String text, String correctText) {
        String normalizedText = normalize(text);
        String normalizedCorrectText = normalize(correctText);
        String suggestion = mSpellChecker.didYouMean(text);
        String normalizedSuggestion = (suggestion == null)
            ? normalizedText
            : normalize(suggestion);

        mTextList.add(normalizedText);
        mCorrectTextList.add(normalizedCorrectText);
        mSuggestionList .add(normalizedSuggestion);

        String resultDescription = null;
        if (normalizedText.equals(normalizedCorrectText)) {
            resultDescription = "user correct, ";
            if (normalizedText.equals(normalizedSuggestion)) {
                resultDescription += "spell check wrong suggestion (FP)";
                ++mUserCorrectSystemWrongSuggestion;
            } else {
                resultDescription += "spell check no suggestion (TN)";
                ++mUserCorrectSystemNoSuggestion;
            }
        } else {
            resultDescription = "user incorrect, ";
            if (normalizedText.equals(normalizedSuggestion)) {
                resultDescription += "spell check no suggestion (FN)";
                ++mUserErrorSystemNoSuggestion;
            } else if (normalizedCorrectText.equals(normalizedSuggestion)) {
                resultDescription += "spell check correct (TP)";
                ++mUserErrorSystemCorrect;
            } else {
                resultDescription += "spell check wrong suggestion (FP,FN)";
                ++mUserErrorSystemWrongSuggestion;
            }
        }
        
        StringBuilder sb = new StringBuilder();
        report(sb,"input",normalizedText);
        sb.append("\n");
        report(sb,"correct",normalizedCorrectText);
        sb.append("\n");
        report(sb,"suggest",normalizedSuggestion);
        sb.append("\n");
        
        mLastCaseReport = sb.toString();
    }


    void report(StringBuilder sb, String msg, String text) {
        sb.append(msg + "=|" + text + "|");
        if (!(mSpellChecker instanceof CompiledSpellChecker))
            return;
        CompiledSpellChecker checker = (CompiledSpellChecker) mSpellChecker;
        LanguageModel lm = checker.languageModel();
        double estimate 
            = lm.log2Estimate(" " + text + " ")
            - lm.log2Estimate(" ");

        sb.append(" log2 p=" + lpFormat(estimate));

        TokenizerFactory tf = checker.tokenizerFactory();
        char[] cs = text.toCharArray();
        Tokenizer tokenizer = tf.tokenizer(cs,0,cs.length);
        String[] tokens = tokenizer.tokenize();
        Set tokenSet = checker.tokenSet();
        for (int i = 0; i < tokens.length; ++i) {
            sb.append(" ");
            sb.append(tokens[i]);
            sb.append("[");
            if (mTokenCounter == null)
                sb.append(tokenSet.contains(tokens[i]) ? "+" : "-");
            else
                sb.append(mTokenCounter.getCount(tokens[i]));
            sb.append("]");
        }
    }
    
    static final DecimalFormat LP_FORMAT = new DecimalFormat("#0.0");

    static String lpFormat(double x) {
        return LP_FORMAT.format(x);
    }
    

    /**
     * Return a string-based representation of the current status
     * of this evaluation.
     *
     * @return String-based representation of the evaluation.
     */
    @Override
    public String toString() {
        int userErrors = mUserErrorSystemWrongSuggestion
            + mUserErrorSystemCorrect
            + mUserErrorSystemNoSuggestion;
        int userCorrect = mUserCorrectSystemWrongSuggestion
            + mUserCorrectSystemNoSuggestion;
        int total = userErrors + userCorrect;

        StringBuilder sb = new StringBuilder();
        sb.append("EVALUATION\n");
        addReport(sb,"User Error", 
                  userErrors,total);
        addReport(sb,"     System Correct",
                  mUserErrorSystemCorrect,userErrors);
        addReport(sb,"     System Error",
                  mUserErrorSystemWrongSuggestion,userErrors);
        addReport(sb,"     System No Suggestion",
                  mUserErrorSystemNoSuggestion,userErrors);
        
        addReport(sb,"User Correct",
                  userCorrect,total);
        addReport(sb,"     System Error",
                  mUserCorrectSystemWrongSuggestion, userCorrect);
        addReport(sb,"     System No Suggestion",
                  mUserCorrectSystemNoSuggestion, userCorrect);

        sb.append("SPELL CHECKER toString()\n");
        sb.append(mSpellChecker);

        return sb.toString();
    }

    static void addReport(StringBuilder sb, String msg,
                          int correct, int total) {
        sb.append(msg);
        sb.append(": ");
        sb.append(correct);
        sb.append(" [");
        double percentage = (total > 0) ? (100.0 * correct)/total : 0;
        sb.append(PERCENT_FORMAT.format(percentage));
        sb.append("%]\n");
    }

    static final DecimalFormat PERCENT_FORMAT = new DecimalFormat("##0.0");

    /**
     * Returns an array of cases for which the user was correct
     * and the system made no suggestions.  The entries in the
     * array are of the form {text,correct,suggestion}.
     *
     * @return The user correct, system no suggestion cases.
     */
    public String[][] userCorrectSystemNoSuggestion() {
        return extract(true,true,true);
    }

    /**
     * Returns an array of cases for which the user was correct and
     * the system made an erroneous suggestion.  The entries in the
     * array are of the form {text,correct,suggestion}.
     *
     * @return The user correct, system wrong suggestion cases.
     */
    public String[][] userCorrectSystemWrongSuggestion() {
        return extract(true,false,false);
    }

    /**
     * Returns an array of cases for which the user made an error and
     * system returned the appropriate correction. The entries in the
     * array are of the form {text,correct,suggestion}.
     *
     * @return The user error, system correct cases.
     */
    public String[][] userErrorSystemCorrect() {
        return extract(false,true,false);
    }


    /**
     * Returns an array of cases for which the user made an error and
     * system returned the appropriate correction. The entries in the
     * array are of the form {text,correct,suggestion}.
     *
     * @return The user error, system correct cases.
     */
    public String[][] userErrorSystemWrongSuggestion() {
        return extract(false,false,false);
    }

    /**
     * Returns an array of cases for which the user made an
     * error and the systme made no suggestion. The entries in the
     * array are of the form {text,correct,suggestion}.
     *
     * @return The user error, system no suggestion cases.
     */
    public String[][] userErrorSystemNoSuggestion() {
        return extract(false,false,true);
    }

    
    String[][] extract(boolean textEqualsCorrect,
                       boolean correctEqualsSuggestion,
                       boolean textEqualsSuggestion) {
        List result = new ArrayList();
        for (int i = 0; i < mSuggestionList.size(); ++i) {
            String text = mTextList.get(i).toString();
            String correct = mCorrectTextList.get(i).toString();
            String suggestion = mSuggestionList.get(i).toString();
            if (text.equals(correct) == textEqualsCorrect
                && correct.equals(suggestion) == correctEqualsSuggestion
                && text.equals(suggestion) == textEqualsSuggestion) 

                result.add(new String[] { text, correct, suggestion });
        }
        return result.toArray(Strings.EMPTY_STRING_2D_ARRAY);
    }

            
    /**
     * Returns a string-based representation of the last test case.
     *
     * @return A string-based representation of the last test case.
     */
    public String getLastCaseReport() {
        return mLastCaseReport;
    }

    /**
     * Returns the confusion matrix for the current state of this
     * evaluation.  The class documentation (see above) describes the
     * calculation of true positives, false positives, false
     * negatives, and true negatives.  The categories used are
     * "correct" and
     * "misspelled".
     *
     * 
The confusion matrix does not track this evaluator, so once
     * a confusion matrix is constructed and returned, it will not
     * reflect additional cases added to this evaluator.
     *
     * @return The confusion matrix for the current state of this evaluation.
     */
    public ConfusionMatrix confusionMatrix() {
        int tn = mUserCorrectSystemNoSuggestion;
        int tp = mUserErrorSystemCorrect;
        int fn = mUserErrorSystemNoSuggestion + mUserErrorSystemWrongSuggestion;
        int fp = mUserCorrectSystemWrongSuggestion;
        return new ConfusionMatrix(new String[] { "correct", 
                                                  "misspelled" },
                                   new int[][] { { tp, fp},
                                                 { fn, tn } });
    }

    /**
     * Return the normalized form of a query or system output.  This
     * method will be applied to the input text before sending it to
     * the spell checker and will be applied to the system suggestion
     * before comparing it to the correct text.  All cases are saved
     * in their normalized forms.
     *
     * The default implementation in this class does nothing,
     * simply returning the input text.  Subclasses may override
     * this normalizer.
     *
     * @param text Text to normalize.
     * @return The normalized form of the text.
     */
    public String normalize(String text) {
        return text;
    }

}
User Input	System Suggestion	Status	Method
Correct	No Suggestion	TN	{@link #userCorrectSystemNoSuggestion()}
Correct	Wrong Suggestion	FP	{@link #userCorrectSystemWrongSuggestion()}
Error	Correct Suggestion	TP	{@link #userErrorSystemCorrect()}
Error	No Suggestion	FN	{@link #userErrorSystemNoSuggestion()}
Error	Wrong Suggestion	FN,FP	{@link #userErrorSystemWrongSuggestion()}