
/****************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one   *
 * or more contributor license agreements.  See the NOTICE file *
 * distributed with this work for additional information        *
 * regarding copyright ownership.  The ASF licenses this file   *
 * to you under the Apache License, Version 2.0 (the            *
 * "License"); you may not use this file except in compliance   *
 * with the License.  You may obtain a copy of the License at   *
 *                                                              *
 *   http://www.apache.org/licenses/LICENSE-2.0                 *
 *                                                              *
 * Unless required by applicable law or agreed to in writing,   *
 * software distributed under the License is distributed on an  *
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
 * KIND, either express or implied.  See the License for the    *
 * specific language governing permissions and limitations      *
 * under the License.                                           *
 ****************************************************************/

package org.apache.james.ai.classic;

import java.io.Reader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;

/**
 * <p>
 * Determines probability that text contains Spam.
 * </p>
 *
 * <p>
 * Based upon Paul Graham's "A Plan for Spam". Extended to Paul Graham's
 * "Better Bayesian Filtering".
 * </p>
 *
 * <p>
 * Sample method usage:
 * </p>
 *
 * <p>
 * Use {@code void addHam(Reader)} and {@code void addSpam(Reader)} to build up
 * the Maps of ham &amp; spam tokens/occurrences. Both addHam and addSpam assume
 * they're reading one message at a time; if you feed more than one message per
 * call, be sure to adjust the appropriate message counter: hamMessageCount or
 * spamMessageCount.
 * </p>
 *
 * <p>
 * Then use {@code void buildCorpus()} to build the final token/probability Map.
 * Use your own methods for persistent storage of the individual ham/spam token
 * and message counts, and/or the final corpus.
 * </p>
 *
 * <p>
 * Then you can use {@code double computeSpamProbability(Reader)} to determine
 * the probability that a particular text contains spam. A returned result of
 * 0.9 or above is an indicator that the text was spam.
 * </p>
 *
 * <p>
 * If you use persistent storage, use {@code void setCorpus(Map)} before calling
 * computeSpamProbability.
 * </p>
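 *
 * <p>
 * A minimal end-to-end sketch of that workflow (illustrative only, not from the
 * original sources; the sample messages and the use of StringReader are
 * assumptions):
 * </p>
 *
 * <pre>{@code
 * BayesianAnalyzer analyzer = new BayesianAnalyzer();
 *
 * // Train on labelled messages (normally many of each kind).
 * analyzer.addHam(new java.io.StringReader("Meeting notes attached for review"));
 * analyzer.addSpam(new java.io.StringReader("WIN a FREE prize, click now!!!"));
 *
 * // Derive the token/probability corpus from the accumulated counts.
 * analyzer.buildCorpus();
 *
 * // Score an unseen message; 0.9 or above indicates spam.
 * double probability = analyzer.computeSpamProbability(
 *         new java.io.StringReader("Claim your FREE prize now"));
 * }</pre>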
 *
 * @since 2.3.0
 */
public class BayesianAnalyzer {

    /**
     * Number of "interesting" tokens to use to compute overall spamminess
     * probability.
     */
    private final static int MAX_INTERESTING_TOKENS = 15;

    /**
     * Minimum probability distance from 0.5 to consider a token "interesting"
     * to use to compute overall spamminess probability.
     */
    private final static double INTERESTINGNESS_THRESHOLD = 0.46;

    /**
     * Default token probability to use when a token has not been encountered
     * before.
     */
    final static double DEFAULT_TOKEN_PROBABILITY = 0.4;

    /** Map of ham tokens and their occurrences. */
    private Map<String, Integer> hamTokenCounts = new HashMap<>();

    /** Map of spam tokens and their occurrences. */
    private Map<String, Integer> spamTokenCounts = new HashMap<>();

    /** Number of ham messages analyzed. */
    private int hamMessageCount = 0;

    /** Number of spam messages analyzed. */
    private int spamMessageCount = 0;

    /** Final token/probability corpus. */
    private Map<String, Double> corpus = new HashMap<>();

    /**
     * Basic class constructor.
     */
    public BayesianAnalyzer() {
    }

    /**
     * Public setter for the hamTokenCounts Map.
     * 
     * @param hamTokenCounts
     *            The new ham Token counts Map.
     */
    public void setHamTokenCounts(Map<String, Integer> hamTokenCounts) {
        this.hamTokenCounts = hamTokenCounts;
    }

    /**
     * Public getter for the hamTokenCounts Map.
     */
    public Map<String, Integer> getHamTokenCounts() {
        return this.hamTokenCounts;
    }

    /**
     * Public setter for the spamTokenCounts Map.
     * 
     * @param spamTokenCounts
     *            The new spam Token counts Map.
     */
    public void setSpamTokenCounts(Map<String, Integer> spamTokenCounts) {
        this.spamTokenCounts = spamTokenCounts;
    }

    /**
     * Public getter for the spamTokenCounts Map.
     */
    public Map<String, Integer> getSpamTokenCounts() {
        return this.spamTokenCounts;
    }

    /**
     * Public setter for spamMessageCount.
     * 
     * @param spamMessageCount
     *            The new spam message count.
     */
    public void setSpamMessageCount(int spamMessageCount) {
        this.spamMessageCount = spamMessageCount;
    }

    /**
     * Public getter for spamMessageCount.
     */
    public int getSpamMessageCount() {
        return this.spamMessageCount;
    }

    /**
     * Public setter for hamMessageCount.
     * 
     * @param hamMessageCount
     *            The new ham message count.
     */
    public void setHamMessageCount(int hamMessageCount) {
        this.hamMessageCount = hamMessageCount;
    }

    /**
     * Public getter for hamMessageCount.
     */
    public int getHamMessageCount() {
        return this.hamMessageCount;
    }

    /**
     * Clears all analysis repositories and counters.
     */
    public void clear() {
        corpus.clear();
        tokenCountsClear();
        hamMessageCount = 0;
        spamMessageCount = 0;
    }

    /**
     * Clears token counters.
     */
    public void tokenCountsClear() {
        hamTokenCounts.clear();
        spamTokenCounts.clear();
    }

    /**
     * Public setter for corpus.
     * 
     * @param corpus
     *            The new corpus.
     */
    public void setCorpus(Map<String, Double> corpus) {
        this.corpus = corpus;
    }

    /**
     * Public getter for corpus.
     */
    public Map<String, Double> getCorpus() {
        return this.corpus;
    }

    /**
     * Builds the corpus from the existing ham & spam counts.
     */
    public void buildCorpus() {
        // Combine the known ham & spam tokens.
        Set<String> set = new HashSet<>(hamTokenCounts.size() + spamTokenCounts.size());
        set.addAll(hamTokenCounts.keySet());
        set.addAll(spamTokenCounts.keySet());
        Map<String, Double> tempCorpus = new HashMap<>(set.size());

        // Iterate through all the tokens and compute their new
        // individual probabilities.
        for (String token : set) {
            tempCorpus.put(token, computeProbability(token));
        }
        setCorpus(tempCorpus);
    }

    /**
     * Adds a message to the ham list.
     * 
     * @param stream
     *            A reader stream on the ham message to analyze
     * @throws IOException
     *             If any error occurs
     */
    public void addHam(Reader stream) throws java.io.IOException {
        addTokenOccurrences(stream, hamTokenCounts);
        hamMessageCount++;
    }

    /**
     * Adds a message to the spam list.
     * 
     * @param stream
     *            A reader stream on the spam message to analyze
     * @throws IOException
     *             If any error occurs
     */
    public void addSpam(Reader stream) throws java.io.IOException {
        addTokenOccurrences(stream, spamTokenCounts);
        spamMessageCount++;
    }

    /**
     * Computes the probability that the stream contains SPAM.
     * 
     * @param stream
     *            The text to be analyzed for Spamminess.
     * @return A 0.0 - 1.0 probability
     * @throws IOException
     *             If any error occurs
     */
    public double computeSpamProbability(Reader stream) throws java.io.IOException {
        // Build a set of the tokens in the Stream.
        Set<String> tokens = parse(stream);

        // Get the corpus to use in this run
        // A new corpus may be being built in the meantime
        Map<String, Double> workCorpus = getCorpus();

        // Assign their probabilities from the Corpus (using an additional
        // calculation to determine spamminess).
        SortedSet<TokenProbabilityStrength> tokenProbabilityStrengths = getTokenProbabilityStrengths(tokens, workCorpus);

        // Compute and return the overall probability that the
        // stream is SPAM.
        return computeOverallProbability(tokenProbabilityStrengths, workCorpus);
    }

    /**
     * Parses a stream into tokens, and updates the target Map with the
     * token/counts.
     * 
     * @param stream
     * @param target
     */
    private void addTokenOccurrences(Reader stream, Map<String, Integer> target) throws java.io.IOException {
        new TokenCounter(target).count(stream);
    }

    /**
     * Parses a stream into tokens, and returns a Set of the unique tokens
     * encountered.
     * 
     * @param stream
     * @return Set
     */
    private Set<String> parse(Reader stream) throws java.io.IOException {
        Set<String> tokens = new HashSet<>();
        new TokenCollector(tokens).collect(stream);

        // Return the unique set of tokens encountered.
        return tokens;
    }

    /**
     * Compute the probability that "token" is SPAM.
     * 
     * @param token
     * @return The probability that the token occurs within spam.
     */
    private double computeProbability(String token) {
        double hamFactor = 0;
        double spamFactor = 0;

        boolean foundInHam = false;
        boolean foundInSpam = false;

        double minThreshold = 0.01;
        double maxThreshold = 0.99;

        if (hamTokenCounts.containsKey(token)) {
            foundInHam = true;
        }

        if (spamTokenCounts.containsKey(token)) {
            foundInSpam = true;
        }

        if (foundInHam) {
            hamFactor = 2 * hamTokenCounts.get(token).doubleValue();
            if (!foundInSpam) {
                minThreshold = (hamFactor > 20) ? 0.0001 : 0.0002;
            }
        }

        if (foundInSpam) {
            spamFactor = spamTokenCounts.get(token).doubleValue();
            if (!foundInHam) {
                maxThreshold = (spamFactor > 10) ? 0.9999 : 0.9998;
            }
        }

        if ((hamFactor + spamFactor) < 5) {
            // This token hasn't been seen enough.
            return 0.4;
        }

        double spamFreq = Math.min(1.0, spamFactor / spamMessageCount);
        double hamFreq = Math.min(1.0, hamFactor / hamMessageCount);

        return Math.max(minThreshold, Math.min(maxThreshold, (spamFreq / (hamFreq + spamFreq))));
    }

    /**
     * Returns a SortedSet of TokenProbabilityStrength built from the Corpus and
     * the tokens passed in the "tokens" Set. The ordering is from the highest
     * strength to the lowest strength.
     * 
     * @param tokens
     * @param workCorpus
     * @return SortedSet of TokenProbabilityStrength objects.
     */
    private SortedSet<TokenProbabilityStrength> getTokenProbabilityStrengths(Set<String> tokens, Map<String, Double> workCorpus) {
        // Convert to a SortedSet of token probability strengths.
        SortedSet<TokenProbabilityStrength> tokenProbabilityStrengths = new TreeSet<>();

        for (String token : tokens) {
            TokenProbabilityStrength tps = new TokenProbabilityStrength();
            tps.token = token;

            if (workCorpus.containsKey(tps.token)) {
                tps.strength = Math.abs(0.5 - workCorpus.get(tps.token));
            } else {
                // This token has never been seen before,
                // we'll give it initially the default probability.
                Double corpusProbability = DEFAULT_TOKEN_PROBABILITY;
                tps.strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);
                boolean isTokenDegeneratedFound = false;
                Collection<String> degeneratedTokens = buildDegenerated(tps.token);
                Iterator<String> iDegenerated = degeneratedTokens.iterator();
                String tokenDegenerated;
                double strengthDegenerated;
                while (iDegenerated.hasNext()) {
                    tokenDegenerated = iDegenerated.next();
                    if (workCorpus.containsKey(tokenDegenerated)) {
                        Double probabilityTemp = workCorpus.get(tokenDegenerated);
                        strengthDegenerated = Math.abs(0.5 - probabilityTemp);
                        if (strengthDegenerated > tps.strength) {
                            isTokenDegeneratedFound = true;
                            tps.strength = strengthDegenerated;
                            corpusProbability = probabilityTemp;
                        }
                    }
                }
                // to reduce memory usage, put in the corpus only if the
                // probability is different from (stronger than) the default
                if (isTokenDegeneratedFound) {
                    synchronized (workCorpus) {
                        workCorpus.put(tps.token, corpusProbability);
                    }
                }
            }

            tokenProbabilityStrengths.add(tps);
        }

        return tokenProbabilityStrengths;
    }

    private Collection<String> buildDegenerated(String fullToken) {
        ArrayList<String> tokens = new ArrayList<>();
        String header;
        String token;
        String tokenLower;

        // look for a header string termination
        int headerEnd = fullToken.indexOf(':');
        if (headerEnd >= 0) {
            header = fullToken.substring(0, headerEnd);
            token = fullToken.substring(headerEnd);
        } else {
            header = "";
            token = fullToken;
        }

        // prepare a version of the token containing all lower case (for
        // performance reasons)
        tokenLower = token.toLowerCase();

        int end = token.length();
        do {
            if (!token.substring(0, end).equals(tokenLower.substring(0, end))) {
                tokens.add(header + tokenLower.substring(0, end));
                if (header.length() > 0) {
                    tokens.add(tokenLower.substring(0, end));
                }
            }

            if (end > 1 && token.charAt(0) >= 'A' && token.charAt(0) <= 'Z') {
                tokens.add(header + token.charAt(0) + tokenLower.substring(1, end));
                if (header.length() > 0) {
                    tokens.add(token.charAt(0) + tokenLower.substring(1, end));
                }
            }

            if (token.charAt(end - 1) != '!') {
                break;
            }

            end--;

            tokens.add(header + token.substring(0, end));
            if (header.length() > 0) {
                tokens.add(token.substring(0, end));
            }
        } while (end > 0);

        return tokens;
    }

    /**
     * Compute the spamminess probability of the interesting tokens in the
     * tokenProbabilities SortedSet.
     * 
     * @param tokenProbabilityStrengths
     * @param workCorpus
     * @return Computed spamminess.
     */
    private double computeOverallProbability(SortedSet<TokenProbabilityStrength> tokenProbabilityStrengths, Map<String, Double> workCorpus) {
        double p = 1.0;
        double np = 1.0;
        double tempStrength = 0.5;
        int count = MAX_INTERESTING_TOKENS;
        Iterator<TokenProbabilityStrength> iterator = tokenProbabilityStrengths.iterator();
        while ((iterator.hasNext()) && (count-- > 0 || tempStrength >= INTERESTINGNESS_THRESHOLD)) {
            TokenProbabilityStrength tps = iterator.next();
            tempStrength = tps.strength;
            // System.out.println(tps);

            double theDoubleValue = DEFAULT_TOKEN_PROBABILITY; // initialize it to the default
            Double theDoubleObject = workCorpus.get(tps.token);
            // if either the original token or a degeneration was found use the
            // double value, otherwise use the default
            if (theDoubleObject != null) {
                theDoubleValue = theDoubleObject;
            }
            p *= theDoubleValue;
            np *= (1.0 - theDoubleValue);
            // System.out.println("Token " + tps + ", p=" + theDoubleValue +
            // ", overall p=" + p / (p + np));
        }

        return (p / (p + np));
    }
}
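
/*
 * Illustrative persistence sketch (not part of the Apache James sources). The class
 * Javadoc leaves persistent storage of the trained corpus to the caller; one possible
 * approach, assuming plain Java serialization and a caller-chosen corpusFile path:
 *
 *     // Save the corpus produced by buildCorpus().
 *     try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(corpusFile))) {
 *         out.writeObject(new HashMap<>(analyzer.getCorpus()));
 *     }
 *
 *     // Restore it later, before calling computeSpamProbability(Reader).
 *     try (ObjectInputStream in = new ObjectInputStream(new FileInputStream(corpusFile))) {
 *         analyzer.setCorpus((Map<String, Double>) in.readObject());
 *     }
 */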



