
/****************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one   *
 * or more contributor license agreements.  See the NOTICE file *
 * distributed with this work for additional information        *
 * regarding copyright ownership.  The ASF licenses this file   *
 * to you under the Apache License, Version 2.0 (the            *
 * "License"); you may not use this file except in compliance   *
 * with the License.  You may obtain a copy of the License at   *
 *                                                              *
 *   http://www.apache.org/licenses/LICENSE-2.0                 *
 *                                                              *
 * Unless required by applicable law or agreed to in writing,   *
 * software distributed under the License is distributed on an  *
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
 * KIND, either express or implied.  See the License for the    *
 * specific language governing permissions and limitations      *
 * under the License.                                           *
 ****************************************************************/

package org.apache.james.ai.classic;

import java.io.Reader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;

/**
 * <p>
 * Determines probability that text contains Spam.
 * </p>
 *
 * <p>
 * Based upon Paul Graham's "A Plan for Spam". Extended to Paul Graham's
 * "Better Bayesian Filtering".
 * </p>
 *
 * <p>
 * Sample method usage:
 * </p>
 *
 * <p>
 * Use {@code void addHam(Reader)} and {@code void addSpam(Reader)} to build up
 * the Maps of ham &amp; spam tokens/occurrences. Both addHam and addSpam assume
 * they're reading one message at a time; if you feed more than one message per
 * call, be sure to adjust the appropriate message counter: hamMessageCount or
 * spamMessageCount.
 * </p>
 *
 * <p>
 * Then use {@code void buildCorpus()} to build the final token/probability Map.
 * Use your own methods for persistent storage of the individual ham/spam token
 * and message counts, and/or the final corpus.
 * </p>
 *
 * <p>
 * Then you can use {@code double computeSpamProbability(Reader)} to determine
 * the probability that a particular text contains spam. A returned result of
 * 0.9 or above is an indicator that the text was spam.
 * </p>
 *
 * <p>
 * If you use persistent storage, use {@code void setCorpus(Map)} before calling
 * computeSpamProbability.
 * </p>
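 *
 * <p>
 * A minimal end-to-end sketch of that workflow (illustrative only, not from the
 * original sources; the sample messages and the use of StringReader are
 * assumptions):
 * </p>
 *
 * <pre>{@code
 * BayesianAnalyzer analyzer = new BayesianAnalyzer();
 *
 * // Train on labelled messages (normally many of each kind).
 * analyzer.addHam(new java.io.StringReader("Meeting notes attached for review"));
 * analyzer.addSpam(new java.io.StringReader("WIN a FREE prize, click now!!!"));
 *
 * // Derive the token/probability corpus from the accumulated counts.
 * analyzer.buildCorpus();
 *
 * // Score an unseen message; 0.9 or above indicates spam.
 * double probability = analyzer.computeSpamProbability(
 *         new java.io.StringReader("Claim your FREE prize now"));
 * }</pre>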
 *
 * @since 2.3.0
 */
public class BayesianAnalyzer {

    /**
     * Number of "interesting" tokens to use to compute overall spamminess
     * probability.
     */
    private final static int MAX_INTERESTING_TOKENS = 15;

    /**
     * Minimum probability distance from 0.5 to consider a token "interesting"
     * to use to compute overall spamminess probability.
     */
    private final static double INTERESTINGNESS_THRESHOLD = 0.46;

    /**
     * Default token probability to use when a token has not been encountered
     * before.
     */
    final static double DEFAULT_TOKEN_PROBABILITY = 0.4;

    /** Map of ham tokens and their occurrences. */
    private Map<String, Integer> hamTokenCounts = new HashMap<>();

    /** Map of spam tokens and their occurrences. */
    private Map<String, Integer> spamTokenCounts = new HashMap<>();

    /** Number of ham messages analyzed. */
    private int hamMessageCount = 0;

    /** Number of spam messages analyzed. */
    private int spamMessageCount = 0;

    /** Final token/probability corpus. */
    private Map<String, Double> corpus = new HashMap<>();

    /**
     * Basic class constructor.
     */
    public BayesianAnalyzer() {
    }

    /**
     * Public setter for the hamTokenCounts Map.
     * 
     * @param hamTokenCounts
     *            The new ham Token counts Map.
     */
    public void setHamTokenCounts(Map<String, Integer> hamTokenCounts) {
        this.hamTokenCounts = hamTokenCounts;
    }

    /**
     * Public getter for the hamTokenCounts Map.
     */
    public Map<String, Integer> getHamTokenCounts() {
        return this.hamTokenCounts;
    }

    /**
     * Public setter for the spamTokenCounts Map.
     * 
     * @param spamTokenCounts
     *            The new spam Token counts Map.
     */
    public void setSpamTokenCounts(Map<String, Integer> spamTokenCounts) {
        this.spamTokenCounts = spamTokenCounts;
    }

    /**
     * Public getter for the spamTokenCounts Map.
     */
    public Map<String, Integer> getSpamTokenCounts() {
        return this.spamTokenCounts;
    }

    /**
     * Public setter for spamMessageCount.
     * 
     * @param spamMessageCount
     *            The new spam message count.
     */
    public void setSpamMessageCount(int spamMessageCount) {
        this.spamMessageCount = spamMessageCount;
    }

    /**
     * Public getter for spamMessageCount.
     */
    public int getSpamMessageCount() {
        return this.spamMessageCount;
    }

    /**
     * Public setter for hamMessageCount.
     * 
     * @param hamMessageCount
     *            The new ham message count.
     */
    public void setHamMessageCount(int hamMessageCount) {
        this.hamMessageCount = hamMessageCount;
    }

    /**
     * Public getter for hamMessageCount.
     */
    public int getHamMessageCount() {
        return this.hamMessageCount;
    }

    /**
     * Clears all analysis repositories and counters.
     */
    public void clear() {
        corpus.clear();
        tokenCountsClear();
        hamMessageCount = 0;
        spamMessageCount = 0;
    }

    /**
     * Clears token counters.
     */
    public void tokenCountsClear() {
        hamTokenCounts.clear();
        spamTokenCounts.clear();
    }

    /**
     * Public setter for corpus.
     * 
     * @param corpus
     *            The new corpus.
     */
    public void setCorpus(Map<String, Double> corpus) {
        this.corpus = corpus;
    }

    /**
     * Public getter for corpus.
     */
    public Map<String, Double> getCorpus() {
        return this.corpus;
    }

    /**
     * Builds the corpus from the existing ham & spam counts.
     */
    public void buildCorpus() {
        // Combine the known ham & spam tokens.
        Set<String> set = new HashSet<>(hamTokenCounts.size() + spamTokenCounts.size());
        set.addAll(hamTokenCounts.keySet());
        set.addAll(spamTokenCounts.keySet());
        Map<String, Double> tempCorpus = new HashMap<>(set.size());

        // Iterate through all the tokens and compute their new
        // individual probabilities.
        for (String token : set) {
            tempCorpus.put(token, computeProbability(token));
        }
        setCorpus(tempCorpus);
    }

    /**
     * Adds a message to the ham list.
     * 
     * @param stream
     *            A reader stream on the ham message to analyze
     * @throws IOException
     *             If any error occurs
     */
    public void addHam(Reader stream) throws java.io.IOException {
        addTokenOccurrences(stream, hamTokenCounts);
        hamMessageCount++;
    }

    /**
     * Adds a message to the spam list.
     * 
     * @param stream
     *            A reader stream on the spam message to analyze
     * @throws IOException
     *             If any error occurs
     */
    public void addSpam(Reader stream) throws java.io.IOException {
        addTokenOccurrences(stream, spamTokenCounts);
        spamMessageCount++;
    }

    /**
     * Computes the probability that the stream contains SPAM.
     * 
     * @param stream
     *            The text to be analyzed for Spamminess.
     * @return A 0.0 - 1.0 probability
     * @throws IOException
     *             If any error occurs
     */
    public double computeSpamProbability(Reader stream) throws java.io.IOException {
        // Build a set of the tokens in the Stream.
        Set<String> tokens = parse(stream);

        // Get the corpus to use in this run
        // A new corpus may be being built in the meantime
        Map<String, Double> workCorpus = getCorpus();

        // Assign their probabilities from the Corpus (using an additional
        // calculation to determine spamminess).
        SortedSet<TokenProbabilityStrength> tokenProbabilityStrengths = getTokenProbabilityStrengths(tokens, workCorpus);

        // Compute and return the overall probability that the
        // stream is SPAM.
        return computeOverallProbability(tokenProbabilityStrengths, workCorpus);
    }

    /**
     * Parses a stream into tokens, and updates the target Map with the
     * token/counts.
     * 
     * @param stream
     * @param target
     */
    private void addTokenOccurrences(Reader stream, Map<String, Integer> target) throws java.io.IOException {
        new TokenCounter(target).count(stream);
    }

    /**
     * Parses a stream into tokens, and returns a Set of the unique tokens
     * encountered.
     * 
     * @param stream
     * @return Set
     */
    private Set<String> parse(Reader stream) throws java.io.IOException {
        Set<String> tokens = new HashSet<>();
        new TokenCollector(tokens).collect(stream);

        // Return the unique set of tokens encountered.
        return tokens;
    }

    /**
     * Compute the probability that "token" is SPAM.
     * 
     * @param token
     * @return The probability that the token occurs within spam.
     */
    private double computeProbability(String token) {
        double hamFactor = 0;
        double spamFactor = 0;

        boolean foundInHam = false;
        boolean foundInSpam = false;

        double minThreshold = 0.01;
        double maxThreshold = 0.99;

        if (hamTokenCounts.containsKey(token)) {
            foundInHam = true;
        }

        if (spamTokenCounts.containsKey(token)) {
            foundInSpam = true;
        }

        if (foundInHam) {
            hamFactor = 2 * hamTokenCounts.get(token).doubleValue();
            if (!foundInSpam) {
                minThreshold = (hamFactor > 20) ? 0.0001 : 0.0002;
            }
        }

        if (foundInSpam) {
            spamFactor = spamTokenCounts.get(token).doubleValue();
            if (!foundInHam) {
                maxThreshold = (spamFactor > 10) ? 0.9999 : 0.9998;
            }
        }

        if ((hamFactor + spamFactor) < 5) {
            // This token hasn't been seen enough.
            return 0.4;
        }

        double spamFreq = Math.min(1.0, spamFactor / spamMessageCount);
        double hamFreq = Math.min(1.0, hamFactor / hamMessageCount);

        return Math.max(minThreshold, Math.min(maxThreshold, (spamFreq / (hamFreq + spamFreq))));
    }

    /**
     * Returns a SortedSet of TokenProbabilityStrength built from the Corpus and
     * the tokens passed in the "tokens" Set. The ordering is from the highest
     * strength to the lowest strength.
     * 
     * @param tokens
     * @param workCorpus
     * @return SortedSet of TokenProbabilityStrength objects.
     */
    private SortedSet<TokenProbabilityStrength> getTokenProbabilityStrengths(Set<String> tokens, Map<String, Double> workCorpus) {
        // Convert to a SortedSet of token probability strengths.
        SortedSet<TokenProbabilityStrength> tokenProbabilityStrengths = new TreeSet<>();

        for (String token : tokens) {
            TokenProbabilityStrength tps = new TokenProbabilityStrength();
            tps.token = token;

            if (workCorpus.containsKey(tps.token)) {
                tps.strength = Math.abs(0.5 - workCorpus.get(tps.token));
            } else {
                // This token has never been seen before,
                // we'll give it initially the default probability.
                Double corpusProbability = DEFAULT_TOKEN_PROBABILITY;
                tps.strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);
                boolean isTokenDegeneratedFound = false;
                Collection<String> degeneratedTokens = buildDegenerated(tps.token);
                Iterator<String> iDegenerated = degeneratedTokens.iterator();
                String tokenDegenerated;
                double strengthDegenerated;
                while (iDegenerated.hasNext()) {
                    tokenDegenerated = iDegenerated.next();
                    if (workCorpus.containsKey(tokenDegenerated)) {
                        Double probabilityTemp = workCorpus.get(tokenDegenerated);
                        strengthDegenerated = Math.abs(0.5 - probabilityTemp);
                        if (strengthDegenerated > tps.strength) {
                            isTokenDegeneratedFound = true;
                            tps.strength = strengthDegenerated;
                            corpusProbability = probabilityTemp;
                        }
                    }
                }
                // to reduce memory usage, put in the corpus only if the
                // probability is different from (stronger than) the default
                if (isTokenDegeneratedFound) {
                    synchronized (workCorpus) {
                        workCorpus.put(tps.token, corpusProbability);
                    }
                }
            }

            tokenProbabilityStrengths.add(tps);
        }

        return tokenProbabilityStrengths;
    }

    private Collection<String> buildDegenerated(String fullToken) {
        ArrayList<String> tokens = new ArrayList<>();
        String header;
        String token;
        String tokenLower;

        // look for a header string termination
        int headerEnd = fullToken.indexOf(':');
        if (headerEnd >= 0) {
            header = fullToken.substring(0, headerEnd);
            token = fullToken.substring(headerEnd);
        } else {
            header = "";
            token = fullToken;
        }

        // prepare a version of the token containing all lower case (for
        // performance reasons)
        tokenLower = token.toLowerCase();

        int end = token.length();
        do {
            if (!token.substring(0, end).equals(tokenLower.substring(0, end))) {
                tokens.add(header + tokenLower.substring(0, end));
                if (header.length() > 0) {
                    tokens.add(tokenLower.substring(0, end));
                }
            }

            if (end > 1 && token.charAt(0) >= 'A' && token.charAt(0) <= 'Z') {
                tokens.add(header + token.charAt(0) + tokenLower.substring(1, end));
                if (header.length() > 0) {
                    tokens.add(token.charAt(0) + tokenLower.substring(1, end));
                }
            }

            if (token.charAt(end - 1) != '!') {
                break;
            }

            end--;

            tokens.add(header + token.substring(0, end));
            if (header.length() > 0) {
                tokens.add(token.substring(0, end));
            }
        } while (end > 0);

        return tokens;
    }

    /**
     * Compute the spamminess probability of the interesting tokens in the
     * tokenProbabilities SortedSet.
     * 
     * @param tokenProbabilityStrengths
     * @param workCorpus
     * @return Computed spamminess.
     */
    private double computeOverallProbability(SortedSet<TokenProbabilityStrength> tokenProbabilityStrengths, Map<String, Double> workCorpus) {
        double p = 1.0;
        double np = 1.0;
        double tempStrength = 0.5;
        int count = MAX_INTERESTING_TOKENS;
        Iterator<TokenProbabilityStrength> iterator = tokenProbabilityStrengths.iterator();
        while ((iterator.hasNext()) && (count-- > 0 || tempStrength >= INTERESTINGNESS_THRESHOLD)) {
            TokenProbabilityStrength tps = iterator.next();
            tempStrength = tps.strength;
            // System.out.println(tps);

            double theDoubleValue = DEFAULT_TOKEN_PROBABILITY; // initialize it to the default
            Double theDoubleObject = workCorpus.get(tps.token);
            // if either the original token or a degeneration was found use the
            // double value, otherwise use the default
            if (theDoubleObject != null) {
                theDoubleValue = theDoubleObject;
            }
            p *= theDoubleValue;
            np *= (1.0 - theDoubleValue);
            // System.out.println("Token " + tps + ", p=" + theDoubleValue +
            // ", overall p=" + p / (p + np));
        }

        return (p / (p + np));
    }
}
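
/*
 * Illustrative persistence sketch (not part of the Apache James sources). The class
 * Javadoc leaves persistent storage of the trained corpus to the caller; one possible
 * approach, assuming plain Java serialization and a caller-chosen corpusFile path:
 *
 *     // Save the corpus produced by buildCorpus().
 *     try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(corpusFile))) {
 *         out.writeObject(new HashMap<>(analyzer.getCorpus()));
 *     }
 *
 *     // Restore it later, before calling computeSpamProbability(Reader).
 *     try (ObjectInputStream in = new ObjectInputStream(new FileInputStream(corpusFile))) {
 *         analyzer.setCorpus((Map<String, Double>) in.readObject());
 *     }
 */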



