All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.james.ai.classic.BayesianAnalysis Maven / Gradle / Ivy

/****************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one   *
 * or more contributor license agreements.  See the NOTICE file *
 * distributed with this work for additional information        *
 * regarding copyright ownership.  The ASF licenses this file   *
 * to you under the Apache License, Version 2.0 (the            *
 * "License"); you may not use this file except in compliance   *
 * with the License.  You may obtain a copy of the License at   *
 *                                                              *
 *   http://www.apache.org/licenses/LICENSE-2.0                 *
 *                                                              *
 * Unless required by applicable law or agreed to in writing,   *
 * software distributed under the License is distributed on an  *
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
 * KIND, either express or implied.  See the License for the    *
 * specific language governing permissions and limitations      *
 * under the License.                                           *
 ****************************************************************/

package org.apache.james.ai.classic;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.StringReader;
import java.sql.Connection;
import java.text.DecimalFormat;
import java.util.Collection;
import java.util.Iterator;

import javax.annotation.Resource;
import javax.mail.MessagingException;
import javax.mail.internet.MimeMessage;
import javax.sql.DataSource;

import org.apache.mailet.Mail;
import org.apache.mailet.MailAddress;
import org.apache.mailet.base.GenericMailet;
import org.apache.mailet.base.RFC2822Headers;

/**
 * 

* Spam detection mailet using bayesian analysis techniques. *

* *

* Sets an email message header indicating the probability that an email message * is SPAM. *

* *

* Based upon the principals described in: A Plan For Spam by Paul * Graham. Extended to Paul Grahams' Better Bayesian Filtering. *

* *

* The analysis capabilities are based on token frequencies (the Corpus) * learned through a training process (see {@link BayesianAnalysisFeeder}) and * stored in a JDBC database. After a training session, the Corpus must be * rebuilt from the database in order to acquire the new frequencies. Every 10 * minutes a special thread in this mailet will check if any change was made to * the database by the feeder, and rebuild the corpus if necessary. *

* *

* A org.apache.james.spam.probability mail attribute will be * created containing the computed spam probability as a * {@link java.lang.Double}. The headerName message header string * will be created containing such probability in floating point representation. *

* *

* Sample configuration: *

* *
 * 
 * <mailet match="All" class="BayesianAnalysis">
 *   <repositoryPath>db://maildb</repositoryPath>
 *   <!--
 *     Set this to the header name to add with the spam probability
 *     (default is "X-MessageIsSpamProbability").
 *   -->
 *   <headerName>X-MessageIsSpamProbability</headerName>
 *   <!--
 *     Set this to true if you want to ignore messages coming from local senders
 *     (default is false).
 *     By local sender we mean a return-path with a local server part (server listed
 *     in <servernames> in config.xml).
 *   -->
 *   <ignoreLocalSender>true</ignoreLocalSender>
 *   <!--
 *     Set this to the maximum message size (in bytes) that a message may have
 *     to be considered spam (default is 100000).
 *   -->
 *   <maxSize>100000</maxSize>
 *   <!--
 *     Set this to false if you not want to tag the message if spam is detected (Default is true).
 *   -->
 *   <tagSubject>true</tagSubject>
 * </mailet>
 * 
 * 
* *

* The probability of being spam is pre-pended to the subject if it is > 0.1 * (10%). *

* *

* The required tables are automatically created if not already there (see * sqlResources.xml). The token field in both the ham and spam tables is case * sensitive. *

* * @see BayesianAnalysisFeeder * @see BayesianAnalyzer * @see JDBCBayesianAnalyzer * @since 2.3.0 */ public class BayesianAnalysis extends GenericMailet implements Log { /** * The JDBCUtil helper class */ private final JDBCUtil theJDBCUtil = new JDBCUtil(this); /** * The JDBCBayesianAnalyzer class that does all the work. */ private JDBCBayesianAnalyzer analyzer = new JDBCBayesianAnalyzer(this); DataSource datasource; private static final String MAIL_ATTRIBUTE_NAME = "org.apache.james.spam.probability"; private static final String HEADER_NAME = "X-MessageIsSpamProbability"; static final long CORPUS_RELOAD_INTERVAL = 600000; private String headerName; private boolean ignoreLocalSender = false; private boolean tagSubject = true; /** * Return a string describing this mailet. * * @return a string describing this mailet */ public String getMailetInfo() { return "BayesianAnalysis Mailet"; } /** * Holds value of property maxSize. */ private int maxSize = 100000; /** * Holds value of property lastCorpusLoadTime. */ private long lastCorpusLoadTime; private SystemContext fs; /** * Getter for property maxSize. * * @return Value of property maxSize. */ public int getMaxSize() { return this.maxSize; } /** * Setter for property maxSize. * * @param maxSize * New value of property maxSize. */ public void setMaxSize(int maxSize) { this.maxSize = maxSize; } /** * Getter for property lastCorpusLoadTime. * * @return Value of property lastCorpusLoadTime. */ public long getLastCorpusLoadTime() { return this.lastCorpusLoadTime; } @Resource(name = "datasource") public void setDataSource(DataSource datasource) { this.datasource = datasource; } @Resource(name = "filesystem") public void setFileSystem(SystemContext fs) { this.fs = fs; } /** * Sets lastCorpusLoadTime to System.currentTimeMillis(). */ private void touchLastCorpusLoadTime() { this.lastCorpusLoadTime = System.currentTimeMillis(); } /** * Mailet initialization routine. * * @throws MessagingException * if a problem arises */ public void init() throws MessagingException { String repositoryPath = getInitParameter("repositoryPath"); if (repositoryPath == null) { throw new MessagingException("repositoryPath is null"); } headerName = getInitParameter("headerName", HEADER_NAME); ignoreLocalSender = Boolean.valueOf(getInitParameter("ignoreLocalSender")); if (ignoreLocalSender) { log("Will ignore messages coming from local senders"); } else { log("Will analyze messages coming from local senders"); } String maxSizeParam = getInitParameter("maxSize"); if (maxSizeParam != null) { setMaxSize(Integer.parseInt(maxSizeParam)); } log("maxSize: " + getMaxSize()); String tag = getInitParameter("tagSubject"); if (tag != null && tag.equals("false")) { tagSubject = false; } initDb(); CorpusLoaderThread corpusLoader = new CorpusLoaderThread(this); corpusLoader.setDaemon(true); corpusLoader.start(); } private void initDb() throws MessagingException { try { analyzer.initSqlQueries(datasource.getConnection(), fs.readXml("sqlResources.xml")); } catch (Exception e) { throw new MessagingException("Exception initializing queries", e); } try { loadData(datasource.getConnection()); } catch (java.sql.SQLException se) { throw new MessagingException("SQLException loading data", se); } } /** * Scans the mail and determines the spam probability. * * @param mail * The Mail message to be scanned. * @throws MessagingException * if a problem arises */ public void service(Mail mail) throws MessagingException { try { MimeMessage message = mail.getMessage(); if (ignoreLocalSender && isSenderLocal(mail)) { // ignore the message if the sender is local return; } String[] headerArray = message.getHeader(headerName); // ignore the message if already analyzed if (headerArray != null && headerArray.length > 0) { return; } ByteArrayOutputStream baos = new ByteArrayOutputStream(); double probability; if (message.getSize() < getMaxSize()) { message.writeTo(baos); probability = analyzer.computeSpamProbability(new BufferedReader(new StringReader(baos.toString()))); } else { probability = 0.0; } mail.setAttribute(MAIL_ATTRIBUTE_NAME, probability); message.setHeader(headerName, Double.toString(probability)); DecimalFormat probabilityForm = (DecimalFormat) DecimalFormat.getInstance(); probabilityForm.applyPattern("##0.##%"); String probabilityString = probabilityForm.format(probability); String senderString; if (mail.getSender() == null) { senderString = "null"; } else { senderString = mail.getSender().toString(); } if (probability > 0.1) { @SuppressWarnings("unchecked") final Collection recipients = mail.getRecipients(); log(headerName + ": " + probabilityString + "; From: " + senderString + "; Recipient(s): " + getAddressesString(recipients)); // Check if we should tag the subject if (tagSubject) { appendToSubject(message, " [" + probabilityString + (probability > 0.9 ? " SPAM" : " spam") + "]"); } } saveChanges(message); } catch (Exception e) { log("Exception: " + e.getMessage(), e); throw new MessagingException("Exception thrown", e); } } private boolean isSenderLocal(Mail mail) { return mail.getSender() != null && getMailetContext().isLocalServer(mail.getSender().getDomain()); } void loadData(Connection conn) throws java.sql.SQLException { try { // this is synchronized to avoid concurrent update of the corpus synchronized (JDBCBayesianAnalyzer.DATABASE_LOCK) { analyzer.tokenCountsClear(); analyzer.loadHamNSpam(conn); analyzer.buildCorpus(); analyzer.tokenCountsClear(); } log("BayesianAnalysis Corpus loaded"); touchLastCorpusLoadTime(); } finally { if (conn != null) { theJDBCUtil.closeJDBCConnection(conn); } } } private String getAddressesString(Collection addresses) { if (addresses == null) { return "null"; } Iterator iter = addresses.iterator(); StringBuilder sb = new StringBuilder(); sb.append('['); for (int i = 0; iter.hasNext(); i++) { sb.append(iter.next()); if (i + 1 < addresses.size()) { sb.append(", "); } } sb.append(']'); return sb.toString(); } private void appendToSubject(MimeMessage message, String toAppend) { try { String subject = message.getSubject(); if (subject == null) { message.setSubject(toAppend, "iso-8859-1"); } else { message.setSubject(toAppend + " " + subject, "iso-8859-1"); } } catch (MessagingException ex) { log("Failure to append to subject phrase: '" + toAppend + "'", ex); } } /** * Saves changes resetting the original message id. */ private void saveChanges(MimeMessage message) throws MessagingException { String messageId = message.getMessageID(); message.saveChanges(); if (messageId != null) { message.setHeader(RFC2822Headers.MESSAGE_ID, messageId); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy