org.apache.tika.parser.mbox.MboxParser Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of tika-parsers Show documentation
There is a newer version: 3.0.0-BETA2
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.mbox;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Collections;
import java.util.Date;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * Mbox (mailbox) parser. This version returns the headers for the first email
 * via metadata, which means headers from subsequent emails will be lost.
 */
public class MboxParser implements Parser {

    private static final Set SUPPORTED_TYPES =
        Collections.singleton(MediaType.application("mbox"));

    public static final String MBOX_MIME_TYPE = "application/mbox";
    public static final String MBOX_RECORD_DIVIDER = "From ";
    private static final Pattern EMAIL_HEADER_PATTERN = Pattern.compile("([^ ]+):[ \t]*(.*)");
    private static final Pattern EMAIL_ADDRESS_PATTERN = Pattern.compile("<(.*@.*)>");

    private static final String EMAIL_HEADER_METADATA_PREFIX = "MboxParser-";
    private static final String EMAIL_FROMLINE_METADATA = EMAIL_HEADER_METADATA_PREFIX + "from";

    private enum ParseStates {
        START, IN_HEADER, IN_CONTENT
    }

    public Set getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, TikaException, SAXException {

        InputStreamReader isr;
        try {
            // Headers are going to be 7-bit ascii
            isr = new InputStreamReader(stream, "US-ASCII");
        } catch (UnsupportedEncodingException e) {
            throw new TikaException("US-ASCII is not supported!", e);
        }

        BufferedReader reader = new BufferedReader(isr);

        metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
        metadata.set(Metadata.CONTENT_ENCODING, "us-ascii");

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        ParseStates parseState = ParseStates.START;
        String multiLine = null;
        boolean inQuote = false;
        int numEmails = 0;

        // We're going to scan, line-by-line, for a line that starts with
        // "From "
        for (String curLine = reader.readLine(); curLine != null; curLine = reader.readLine()) {
            boolean newMessage = curLine.startsWith(MBOX_RECORD_DIVIDER);
            if (newMessage) {
                numEmails += 1;
            }

            switch (parseState) {
            case START:
                if (newMessage) {
                    parseState = ParseStates.IN_HEADER;
                    newMessage = false;
                    // Fall through to IN_HEADER
                } else {
                    break;
                }

            case IN_HEADER:
                if (newMessage) {
                    saveHeaderInMetadata(numEmails, metadata, multiLine);
                    multiLine = curLine;
                } else if (curLine.length() == 0) {
                    // Blank line is signal that we're transitioning to the content.
                    saveHeaderInMetadata(numEmails, metadata, multiLine);
                    parseState = ParseStates.IN_CONTENT;

                    // Mimic what PackageParser does between entries.
                    xhtml.startElement("div", "class", "email-entry");
                    xhtml.startElement("p");
                    inQuote = false;
                } else if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
                    multiLine += " " + curLine.trim();
                } else {
                    saveHeaderInMetadata(numEmails, metadata, multiLine);
                    multiLine = curLine;
                }

                break;

                // TODO - use real email parsing support so we can correctly handle
                // things like multipart messages and quoted-printable encoding.
                // We'd also want this for charset handling, where content isn't 7-bit
                // ascii.
            case IN_CONTENT:
                if (newMessage) {
                    endMessage(xhtml, inQuote);
                    parseState = ParseStates.IN_HEADER;
                    multiLine = curLine;
                } else {
                    boolean quoted = curLine.startsWith(">");
                    if (inQuote) {
                        if (!quoted) {
                            xhtml.endElement("q");
                            inQuote = false;
                        }
                    } else if (quoted) {
                        xhtml.startElement("q");
                        inQuote = true;
                    }

                    xhtml.characters(curLine);

                    // For plain text email, each line is a real break position.
                    xhtml.element("br", "");
                }
            }
        }

        if (parseState == ParseStates.IN_HEADER) {
            saveHeaderInMetadata(numEmails, metadata, multiLine);
        } else if (parseState == ParseStates.IN_CONTENT) {
            endMessage(xhtml, inQuote);
        }

        xhtml.endDocument();
    }

    private void endMessage(XHTMLContentHandler xhtml, boolean inQuote) throws SAXException {
        if (inQuote) {
            xhtml.endElement("q");
        }

        xhtml.endElement("p");
        xhtml.endElement("div");
    }

    private void saveHeaderInMetadata(int numEmails, Metadata metadata, String curLine) {
        if ((curLine == null) || (numEmails > 1)) {
            return;
        } else if (curLine.startsWith(MBOX_RECORD_DIVIDER)) {
            metadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length()));
            return;
        }

        Matcher headerMatcher = EMAIL_HEADER_PATTERN.matcher(curLine);
        if (!headerMatcher.matches()) {
            return; // ignore malformed header lines
        }

        String headerTag = headerMatcher.group(1).toLowerCase();
        String headerContent = headerMatcher.group(2);

        if (headerTag.equalsIgnoreCase("From")) {
            metadata.add(Metadata.AUTHOR, headerContent);
            metadata.add(Metadata.CREATOR, headerContent);
        } else if (headerTag.equalsIgnoreCase("To") ||
        	headerTag.equalsIgnoreCase("Cc") ||
        	headerTag.equalsIgnoreCase("Bcc")) {
            Matcher address = EMAIL_ADDRESS_PATTERN.matcher(headerContent);
            if(address.find()) {
        	metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, address.group(1));
            } else if(headerContent.indexOf('@') > -1) {
        	metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, headerContent);
            }
            
            String property = Metadata.MESSAGE_TO;
            if (headerTag.equalsIgnoreCase("Cc")) {
        	property = Metadata.MESSAGE_CC;
            } else if (headerTag.equalsIgnoreCase("Bcc")) {
        	property = Metadata.MESSAGE_BCC;
            }
            metadata.add(property, headerContent);
        } else if (headerTag.equalsIgnoreCase("Subject")) {
            metadata.add(Metadata.SUBJECT, headerContent);
            metadata.add(Metadata.TITLE, headerContent);
        } else if (headerTag.equalsIgnoreCase("Date")) {
            try {
                metadata.set(Metadata.DATE, parseDate(headerContent));
            } catch (ParseException e) {
                // ignoring date because format was not understood
            }
        } else if (headerTag.equalsIgnoreCase("Message-Id")) {
            metadata.add(Metadata.IDENTIFIER, headerContent);
        } else if (headerTag.equalsIgnoreCase("In-Reply-To")) {
            metadata.add(Metadata.RELATION, headerContent);
        } else if (headerTag.equalsIgnoreCase("Content-Type")) {
            // TODO - key off content-type in headers to
            // set mapping to use for content and convert if necessary.

            metadata.add(Metadata.CONTENT_TYPE, headerContent);
            metadata.add(Metadata.FORMAT, headerContent);
        } else {
            metadata.add(EMAIL_HEADER_METADATA_PREFIX + headerTag, headerContent);
        }
    }
    
    private Date parseDate(String headerContent) throws ParseException {
        SimpleDateFormat dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US);
        return dateFormat.parse(headerContent);
    }

    public void parse(
            InputStream stream, ContentHandler handler, Metadata metadata)
            throws IOException, SAXException, TikaException {
        parse(stream, handler, metadata, new ParseContext());
    }

}