org.apache.tika.parser.mail.MailContentHandler Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of tika-parsers Show documentation
There is a newer version: 3.0.0-BETA2
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.mail;

import org.apache.james.mime4j.MimeException;
import org.apache.james.mime4j.codec.DecodeMonitor;
import org.apache.james.mime4j.codec.DecoderUtil;
import org.apache.james.mime4j.dom.address.Address;
import org.apache.james.mime4j.dom.address.AddressList;
import org.apache.james.mime4j.dom.address.Mailbox;
import org.apache.james.mime4j.dom.address.MailboxList;
import org.apache.james.mime4j.dom.field.AddressListField;
import org.apache.james.mime4j.dom.field.DateTimeField;
import org.apache.james.mime4j.dom.field.MailboxListField;
import org.apache.james.mime4j.dom.field.ParsedField;
import org.apache.james.mime4j.dom.field.UnstructuredField;
import org.apache.james.mime4j.field.LenientFieldParser;
import org.apache.james.mime4j.message.MaximalBodyDescriptor;
import org.apache.james.mime4j.parser.ContentHandler;
import org.apache.james.mime4j.stream.BodyDescriptor;
import org.apache.james.mime4j.stream.Field;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.IOUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.parser.rtf.RTFParser;
import org.apache.tika.parser.txt.TXTParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.DateFormat;
import java.text.DateFormatSymbols;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Stack;
import java.util.TimeZone;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static org.apache.tika.utils.DateUtils.MIDDAY;
import static org.apache.tika.utils.DateUtils.UTC;

/**
 * Bridge between mime4j's content handler and the generic SAX content handler
 * used by Tika. See
 * http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/ContentHandler.html
 */
class MailContentHandler implements ContentHandler {

    // MIME type string for multipart/alternative containers.
    private static final String MULTIPART_ALTERNATIVE = "multipart/alternative";

    //TIKA-1970 Mac Mail's format
    //Matches a trailing "UTC" or "GMT" followed by a sign and a one- or
    //two-digit hour offset with no minutes, anchored at the end of input.
    //Group 1 is the sign, group 2 the hour digits.
    private static final Pattern GENERAL_TIME_ZONE_NO_MINUTES_PATTERN =
            Pattern.compile("(?:UTC|GMT)([+-])(\\d?\\d)\\Z");

    //find a time ending in am/pm without a space: 10:30am and
    //use this pattern to insert space: 10:30 am
    //(case-insensitive; group 1 is the last digit, group 2 is "am"/"pm")
    private static final Pattern AM_PM = Pattern.compile("(?i)(\\d)([ap]m)\\b");

    //Alternate date formats, all built with US date symbols via createDateFormat.
    //Presumably tried in array order when mime4j's own date parsing fails --
    //the consuming code is not visible in this part of the file; verify there.
    //NOTE(review): these are shared static SimpleDateFormat instances, and the
    //JDK documents SimpleDateFormat as not synchronized -- confirm that the
    //code using this array guards or clones them.
    private static final DateFormat[] ALTERNATE_DATE_FORMATS = new DateFormat[] {
            //note that the string is "cleaned" before processing:
            //1) condense multiple whitespace to single space
            //2) trim()
            //3) strip out commas
            //4) insert space before am/pm

            //May 16 2016 1:32am
            createDateFormat("MMM dd yy hh:mm a", null),

            //this is a standard pattern handled by mime4j;
            //but mime4j fails with leading whitespace
            createDateFormat("EEE d MMM yy HH:mm:ss Z", UTC),

            //same as above, but with a general (named) time zone instead of an RFC 822 offset
            createDateFormat("EEE d MMM yy HH:mm:ss z", UTC),

            createDateFormat("EEE d MMM yy HH:mm:ss", null),// no timezone

            createDateFormat("EEEEE MMM d yy hh:mm a", null),// Sunday, May 15 2016 1:32 PM

            //16 May 2016 at 09:30:32  GMT+1 (Mac Mail TIKA-1970)
            createDateFormat("d MMM yy 'at' HH:mm:ss z", UTC),   // UTC/Zulu

            createDateFormat("yy-MM-dd HH:mm:ss", null),

            //non-lenient: out-of-range field values are rejected rather than rolled over
            createDateFormat("MM/dd/yy hh:mm a", null, false),

            //now dates without times
            //(all non-lenient; MIDDAY comes from DateUtils -- presumably a zone chosen
            //so that a date-only value lands mid-day; confirm against DateUtils)
            createDateFormat("MMM d yy", MIDDAY, false),
            createDateFormat("EEE d MMM yy", MIDDAY, false),
            createDateFormat("d MMM yy", MIDDAY, false),
            createDateFormat("yy/MM/dd", MIDDAY, false),
            createDateFormat("MM/dd/yy", MIDDAY, false)
    };

    /**
     * Builds a lenient date format with US date symbols.
     *
     * @param format   a {@link SimpleDateFormat} pattern
     * @param timezone time zone to apply to the format, or {@code null} to
     *                 leave the format's default time zone untouched
     * @return the configured, lenient format
     */
    private static DateFormat createDateFormat(String format, TimeZone timezone) {
        final boolean lenient = true;
        return createDateFormat(format, timezone, lenient);
    }

    /**
     * Builds a date format with US date symbols and the requested leniency.
     *
     * @param format    a {@link SimpleDateFormat} pattern
     * @param timezone  time zone to apply to the format, or {@code null} to
     *                  leave the format's default time zone untouched
     * @param isLenient whether the resulting format parses leniently
     * @return the configured format
     */
    private static DateFormat createDateFormat(String format, TimeZone timezone, boolean isLenient) {
        // Month and day names must be the US ones regardless of the JVM default locale.
        DateFormatSymbols usSymbols = new DateFormatSymbols(Locale.US);
        SimpleDateFormat formatter = new SimpleDateFormat(format, usSymbols);
        formatter.setLenient(isLenient);
        if (timezone != null) {
            formatter.setTimeZone(timezone);
        }
        return formatter;
    }


    // XHTML sink that embedded parts are parsed into.
    private final XHTMLContentHandler handler;
    // Metadata object supplied by the caller for the message as a whole.
    private final Metadata metadata;
    // Parse context supplied by the caller; also used to resolve the extractor.
    private final ParseContext parseContext;
    // Set from the constructor argument; its use is outside this part of the file.
    private boolean strictParsing = false;
    // When false, body() buffers parts instead of handing every part
    // straight to the embedded-document extractor.
    private final boolean extractAllAlternatives;
    // Decides whether, and how, each part/attachment is parsed.
    private final EmbeddedDocumentExtractor extractor;
    // Used to sniff the media type of parts that declare no content type.
    private final Detector detector;
    //this is used to buffer a multipart body that
    //keeps track of multipart/alternative and its children
    //NOTE(review): declared as a raw Stack here, yet body() reads
    //peek().children -- the element type appears to have been lost; confirm.
    private Stack alternativePartBuffer = new Stack<>();

    //stack of enclosing multipart containers; body() reads the top entry's
    //sub type and boundary
    //NOTE(review): raw Stack here as well -- confirm the element type.
    private Stack parts = new Stack<>();

    /**
     * Creates a handler that forwards mime4j parse events to the given XHTML handler.
     *
     * @param xhtml                  XHTML sink for extracted content
     * @param detector               detector for parts that declare no content type
     * @param metadata               metadata of the message being parsed
     * @param context                parse context; also the source of the
     *                               {@link EmbeddedDocumentExtractor}
     * @param strictParsing          stored as the strict-parsing flag
     * @param extractAllAlternatives if {@code true}, every body part is handed to the
     *                               embedded-document extractor without buffering
     */
    MailContentHandler(XHTMLContentHandler xhtml, Detector detector, Metadata metadata,
                       ParseContext context, boolean strictParsing, boolean extractAllAlternatives) {
        // Collaborators handed in by the caller.
        this.handler = xhtml;
        this.detector = detector;
        this.metadata = metadata;
        this.parseContext = context;

        // Behaviour switches.
        this.strictParsing = strictParsing;
        this.extractAllAlternatives = extractAllAlternatives;

        // The extractor that processes parts/attachments is resolved
        // from the parse context by EmbeddedDocumentUtil.
        this.extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
    }

    /**
     * Handles the body of a single MIME entity.
     * <p>
     * A fresh {@link Metadata} object is populated with the part's content type,
     * charset, enclosing multipart information and content disposition. The part is
     * then routed one of three ways:
     * <ul>
     *   <li>buffered as a child of the current alternative-part buffer entry, when
     *       not extracting all alternatives and such a buffer is active;</li>
     *   <li>when not extracting all alternatives and fewer than two multipart
     *       levels are open: handled as an inline body part if it is text,
     *       otherwise as embedded content;</li>
     *   <li>in every other case, streamed to the embedded-document extractor.</li>
     * </ul>
     *
     * @param body descriptor of the part
     * @param is   content stream of the part
     * @throws MimeException if handling the embedded content fails
     * @throws IOException   if the part cannot be read
     */
    @Override
    public void body(BodyDescriptor body, InputStream is) throws MimeException,
            IOException {
        // use a different metadata object
        // in order to specify the mime type of the
        // sub part without damaging the main metadata

        Metadata submd = new Metadata();
        submd.set(Metadata.CONTENT_TYPE, body.getMimeType());
        submd.set(Metadata.CONTENT_ENCODING, body.getCharset());

        // TIKA-2455: flag the containing type.
        if (parts.size() > 0) {
            submd.set(Message.MULTIPART_SUBTYPE, parts.peek().getSubType());
            submd.set(Message.MULTIPART_BOUNDARY, parts.peek().getBoundary());
        }
        if (body instanceof MaximalBodyDescriptor) {
            MaximalBodyDescriptor maximalBody = (MaximalBodyDescriptor) body;
            String contentDispositionType = maximalBody.getContentDispositionType();
            if (contentDispositionType != null && !contentDispositionType.isEmpty()) {
                // Rebuild the header value as: type; key="value"; key="value" ...
                StringBuilder contentDisposition = new StringBuilder(contentDispositionType);
                Map<String, String> contentDispositionParameters =
                        maximalBody.getContentDispositionParameters();
                for (Entry<String, String> param : contentDispositionParameters.entrySet()) {
                    contentDisposition.append("; ")
                            .append(param.getKey()).append("=\"").append(param.getValue()).append('"');
                }

                String contentDispositionFileName = maximalBody.getContentDispositionFilename();
                if (contentDispositionFileName != null) {
                    submd.set(Metadata.RESOURCE_NAME_KEY, contentDispositionFileName);
                }

                submd.set(Metadata.CONTENT_DISPOSITION, contentDisposition.toString());
            }
        }
        //if we're in a multipart/alternative or any one of its children
        //add the bodypart to the latest that was added
        if (!extractAllAlternatives && alternativePartBuffer.size() > 0) {
            ByteArrayOutputStream bos = new ByteArrayOutputStream();
            IOUtils.copy(is, bos);
            alternativePartBuffer.peek().children.add(new BodyContents(submd, bos.toByteArray()));
        } else if (!extractAllAlternatives && parts.size() < 2) {
            //if you're at the first level of embedding
            //and you're not in an alternative part block
            //and you're text/*, put that in the body of the email
            //otherwise treat as a regular attachment
            ByteArrayOutputStream bos = new ByteArrayOutputStream();
            IOUtils.copy(is, bos);
            byte[] bytes = bos.toByteArray();
            if (detectTextOrHtml(submd, bytes)) {
                // reuse the bytes already copied out of the buffer
                // instead of copying the whole part a second time
                handleInlineBodyPart(new BodyContents(submd, bytes));
            } else {
                //else handle as you would any other embedded content
                try (TikaInputStream tis = TikaInputStream.get(bytes)) {
                    handleEmbedded(tis, submd);
                }
            }
        } else {
            //else handle as you would any other embedded content
            try (TikaInputStream tis = TikaInputStream.get(is)) {
                handleEmbedded(tis, submd);
            }
        }
    }

    /**
     * Decides whether a part should be treated as text.
     * <p>
     * If the part declares a content type, that alone decides the answer. Otherwise
     * the detector is run on the bytes; a non-null result is recorded in
     * {@code submd} as the content-type override so that detection happens only once.
     *
     * @param submd metadata of the part; may be updated with the detected type
     * @param bytes full content of the part
     * @return {@code true} if the declared or detected type starts with {@code "text"}
     */
    private boolean detectTextOrHtml(Metadata submd, byte[] bytes) {
        String declaredType = submd.get(Metadata.CONTENT_TYPE);
        if (declaredType != null) {
            return declaredType.startsWith("text");
        }

        try (TikaInputStream stream = TikaInputStream.get(bytes)) {
            MediaType detected = detector.detect(stream, submd);
            if (detected == null) {
                return false;
            }
            //detect only once
            String detectedName = detected.toString();
            submd.set(TikaCoreProperties.CONTENT_TYPE_OVERRIDE, detectedName);
            return detectedName.startsWith("text");
        } catch (IOException ignored) {
            // Best effort: when detection fails the part is simply not treated as text.
            return false;
        }
    }

    /**
     * Marks a part as an inline resource or an attachment and hands it to the
     * embedded-document extractor.
     * <p>
     * Only the disposition <em>type</em>, the token before the first {@code ';'}, is
     * examined. Searching the whole header value would misclassify an attachment
     * whose parameters merely contain the word, e.g.
     * {@code attachment; filename="inline.png"}.
     *
     * @param tis      content of the part
     * @param metadata metadata of the part; the embedded resource type is set on it
     * @throws MimeException if the extractor reports a SAX error
     * @throws IOException   if the part cannot be read
     */
    private void handleEmbedded(TikaInputStream tis, Metadata metadata) throws MimeException, IOException {

        String disposition = metadata.get(Metadata.CONTENT_DISPOSITION);
        boolean isInline = false;
        if (disposition != null) {
            int paramStart = disposition.indexOf(';');
            String dispositionType =
                    (paramStart < 0 ? disposition : disposition.substring(0, paramStart)).trim();
            isInline = "inline".equalsIgnoreCase(dispositionType);
        }

        TikaCoreProperties.EmbeddedResourceType resourceType = isInline
                ? TikaCoreProperties.EmbeddedResourceType.INLINE
                : TikaCoreProperties.EmbeddedResourceType.ATTACHMENT;
        metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, resourceType.toString());

        try {
            if (extractor.shouldParseEmbedded(metadata)) {
                // The stream is a TikaInputStream rather than the raw James-provided
                // one, which misses many features we might want, e.g. mark/reset
                extractor.parseEmbedded(tis, handler, metadata, false);
            }
        } catch (SAXException e) {
            throw new MimeException(e);
        }

    }

    @Override
    public void endBodyPart() throws MimeException {
        //if we're buffering for a multipart/alternative
        //don't write