// org.apache.tika.parser.mail.MailContentHandler (Maven / Gradle / Ivy artifact listing header)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.mail;
import org.apache.james.mime4j.MimeException;
import org.apache.james.mime4j.codec.DecodeMonitor;
import org.apache.james.mime4j.codec.DecoderUtil;
import org.apache.james.mime4j.dom.address.Address;
import org.apache.james.mime4j.dom.address.AddressList;
import org.apache.james.mime4j.dom.address.Mailbox;
import org.apache.james.mime4j.dom.address.MailboxList;
import org.apache.james.mime4j.dom.field.AddressListField;
import org.apache.james.mime4j.dom.field.DateTimeField;
import org.apache.james.mime4j.dom.field.MailboxListField;
import org.apache.james.mime4j.dom.field.ParsedField;
import org.apache.james.mime4j.dom.field.UnstructuredField;
import org.apache.james.mime4j.field.LenientFieldParser;
import org.apache.james.mime4j.message.MaximalBodyDescriptor;
import org.apache.james.mime4j.parser.ContentHandler;
import org.apache.james.mime4j.stream.BodyDescriptor;
import org.apache.james.mime4j.stream.Field;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.IOUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.csv.TextAndCSVParser;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.parser.rtf.RTFParser;
import org.apache.tika.parser.txt.TXTParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.DateFormat;
import java.text.DateFormatSymbols;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Stack;
import java.util.TimeZone;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static org.apache.tika.utils.DateUtils.MIDDAY;
import static org.apache.tika.utils.DateUtils.UTC;
/**
* Bridge between mime4j's content handler and the generic Sax content handler
* used by Tika. See
* http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/ContentHandler.html
*/
class MailContentHandler implements ContentHandler {
/** MIME type, compared case-insensitively, used to recognise multipart/alternative containers. */
private static final String MULTIPART_ALTERNATIVE = "multipart/alternative";
//TIKA-1970 Mac Mail's format: a "UTC" or "GMT" zone followed by a sign and a
//one- or two-digit hour offset (no minutes) at the very end of the input.
//Group 1 is the sign, group 2 the hour digits.
private static final Pattern GENERAL_TIME_ZONE_NO_MINUTES_PATTERN =
Pattern.compile("(?:UTC|GMT)([+-])(\\d?\\d)\\Z");
//find a time ending in am/pm without a space: 10:30am and
//use this pattern to insert space: 10:30 am
//(case-insensitive; group 1 is the last digit, group 2 the am/pm marker)
private static final Pattern AM_PM = Pattern.compile("(?i)(\\d)([ap]m)\\b");
//Fallback date formats, listed in the order in which they are declared here.
//NOTE(review): the code that consumes this table is outside this view; the
//entries are SimpleDateFormat instances, which the JDK documents as not
//thread-safe -- confirm that callers synchronize access to these shared statics.
private static final DateFormat[] ALTERNATE_DATE_FORMATS = new DateFormat[] {
//note that the string is "cleaned" before processing:
//1) condense multiple whitespace to single space
//2) trim()
//3) strip out commas
//4) insert space before am/pm
//May 16 2016 1:32am
createDateFormat("MMM dd yy hh:mm a", null),
//this is a standard pattern handled by mime4j;
//but mime4j fails with leading whitespace
createDateFormat("EEE d MMM yy HH:mm:ss Z", UTC),
createDateFormat("EEE d MMM yy HH:mm:ss z", UTC),
createDateFormat("EEE d MMM yy HH:mm:ss", null),// no timezone
createDateFormat("EEEEE MMM d yy hh:mm a", null),// Sunday, May 15 2016 1:32 PM
//16 May 2016 at 09:30:32 GMT+1 (Mac Mail TIKA-1970)
createDateFormat("d MMM yy 'at' HH:mm:ss z", UTC), // UTC/Zulu
createDateFormat("yy-MM-dd HH:mm:ss", null),
//non-lenient: out-of-range field values are rejected for this pattern
createDateFormat("MM/dd/yy hh:mm a", null, false),
//now dates without times; these are non-lenient and use the MIDDAY
//time zone constant imported from DateUtils
createDateFormat("MMM d yy", MIDDAY, false),
createDateFormat("EEE d MMM yy", MIDDAY, false),
createDateFormat("d MMM yy", MIDDAY, false),
createDateFormat("yy/MM/dd", MIDDAY, false),
createDateFormat("MM/dd/yy", MIDDAY, false)
};
/**
 * Builds a lenient date format for the given pattern.
 *
 * @param format   SimpleDateFormat pattern string
 * @param timezone time zone to apply, or {@code null} to keep the formatter's default
 * @return a new, lenient formatter using US date symbols
 */
private static DateFormat createDateFormat(String format, TimeZone timezone) {
    // Two-argument callers always get lenient parsing.
    final boolean lenient = true;
    return createDateFormat(format, timezone, lenient);
}
/**
 * Builds a date format for the given pattern using US date symbols.
 *
 * @param format    SimpleDateFormat pattern string
 * @param timezone  time zone to apply, or {@code null} to keep the formatter's default
 * @param isLenient whether the formatter should parse leniently
 * @return a new formatter configured as requested
 */
private static DateFormat createDateFormat(String format, TimeZone timezone, boolean isLenient) {
    // Month and day names are always the US ones, regardless of the default locale.
    DateFormatSymbols usSymbols = new DateFormatSymbols(Locale.US);
    SimpleDateFormat formatter = new SimpleDateFormat(format, usSymbols);
    formatter.setLenient(isLenient);
    if (timezone != null) {
        formatter.setTimeZone(timezone);
    }
    return formatter;
}
// SAX handler that receives the XHTML produced for the message
private final XHTMLContentHandler handler;
// metadata object supplied by the caller of the constructor
private final Metadata metadata;
// parse context handed to the inline parsers and used to look up the embedded extractor
private final ParseContext parseContext;
// strict-parsing flag as supplied to the constructor
private boolean strictParsing = false;
// when false, multipart/alternative children are buffered and only the
// best-scoring alternative is written; when true every body is handed on as it arrives
private final boolean extractAllAlternatives;
// extractor used for attachments and other embedded content
private final EmbeddedDocumentExtractor extractor;
// detector used when a body part declares no content type
private final Detector detector;
//this is used to buffer a multipart body that
//keeps track of multipart/alternative and its children
//(typed: elements are dereferenced as Part, e.g. peek().children)
private Stack<Part> alternativePartBuffer = new Stack<>();
// descriptors of the multiparts currently open, innermost on top
// (typed: elements are dereferenced as BodyDescriptor, e.g. peek().getSubType())
private Stack<BodyDescriptor> parts = new Stack<>();
MailContentHandler(XHTMLContentHandler xhtml, Detector detector, Metadata metadata,
ParseContext context, boolean strictParsing, boolean extractAllAlternatives) {
this.handler = xhtml;
this.metadata = metadata;
this.parseContext = context;
this.strictParsing = strictParsing;
this.extractAllAlternatives = extractAllAlternatives;
// Fetch / Build an EmbeddedDocumentExtractor with which
// to handle/process the parts/attachments
// Was an EmbeddedDocumentExtractor explicitly supplied?
this.extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
this.detector = detector;
}
/**
 * Handles the content of one body part.
 * <p>
 * A fresh {@link Metadata} object is populated with the part's content type,
 * charset, enclosing multipart information and content disposition. The part
 * is then either buffered (inside a multipart/alternative when alternatives
 * are not all extracted), written inline (text at the first level of
 * embedding) or passed on as embedded content.
 *
 * @param body descriptor of the body part
 * @param is   stream with the part's content
 * @throws MimeException if handling the part fails with a SAX error
 * @throws IOException   if the content cannot be read
 */
@Override
public void body(BodyDescriptor body, InputStream is) throws MimeException,
        IOException {
    // use a different metadata object
    // in order to specify the mime type of the
    // sub part without damaging the main metadata
    Metadata submd = new Metadata();
    submd.set(Metadata.CONTENT_TYPE, body.getMimeType());
    submd.set(Metadata.CONTENT_ENCODING, body.getCharset());
    // TIKA-2455: flag the containing type.
    if (parts.size() > 0) {
        submd.set(Message.MULTIPART_SUBTYPE, parts.peek().getSubType());
        submd.set(Message.MULTIPART_BOUNDARY, parts.peek().getBoundary());
    }
    if (body instanceof MaximalBodyDescriptor) {
        MaximalBodyDescriptor maximalBody = (MaximalBodyDescriptor) body;
        String contentDispositionType = maximalBody.getContentDispositionType();
        if (contentDispositionType != null && !contentDispositionType.isEmpty()) {
            // rebuild the header value as: type; key="value"; key="value"...
            StringBuilder contentDisposition = new StringBuilder(contentDispositionType);
            Map<String, String> contentDispositionParameters =
                    maximalBody.getContentDispositionParameters();
            for (Entry<String, String> param : contentDispositionParameters.entrySet()) {
                contentDisposition.append("; ")
                        .append(param.getKey()).append("=\"").append(param.getValue()).append('"');
            }
            String contentDispositionFileName = maximalBody.getContentDispositionFilename();
            if (contentDispositionFileName != null) {
                submd.set(Metadata.RESOURCE_NAME_KEY, contentDispositionFileName);
            }
            submd.set(Metadata.CONTENT_DISPOSITION, contentDisposition.toString());
        }
    }
    //if we're in a multipart/alternative or any one of its children
    //add the bodypart to the latest that was added
    if (!extractAllAlternatives && alternativePartBuffer.size() > 0) {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        IOUtils.copy(is, bos);
        alternativePartBuffer.peek().children.add(new BodyContents(submd, bos.toByteArray()));
    } else if (!extractAllAlternatives && parts.size() < 2) {
        //if you're at the first level of embedding
        //and you're not in an alternative part block
        //and you're text/html, put that in the body of the email
        //otherwise treat as a regular attachment
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        IOUtils.copy(is, bos);
        byte[] bytes = bos.toByteArray();
        if (detectTextOrHtml(submd, bytes)) {
            // reuse the array already materialised instead of copying the buffer again
            handleInlineBodyPart(new BodyContents(submd, bytes));
        } else {
            //else handle as you would any other embedded content
            try (TikaInputStream tis = TikaInputStream.get(bytes)) {
                handleEmbedded(tis, submd);
            }
        }
    } else {
        //else handle as you would any other embedded content
        try (TikaInputStream tis = TikaInputStream.get(is)) {
            handleEmbedded(tis, submd);
        }
    }
}
/**
 * Decides whether a body part should be treated as text.
 * <p>
 * A declared content type wins: the answer is whether it starts with
 * "text". Only when no type is declared is the detector run on the bytes;
 * its result is then recorded as the content-type override so detection
 * happens only once.
 *
 * @param submd metadata of the body part; may receive the content-type override
 * @param bytes content of the body part
 * @return {@code true} if the declared or detected type starts with "text"
 */
private boolean detectTextOrHtml(Metadata submd, byte[] bytes) {
    String declaredType = submd.get(Metadata.CONTENT_TYPE);
    if (declaredType != null) {
        return declaredType.startsWith("text");
    }
    try (TikaInputStream stream = TikaInputStream.get(bytes)) {
        MediaType detected = detector.detect(stream, submd);
        if (detected == null) {
            return false;
        }
        //detect only once
        String detectedName = detected.toString();
        submd.set(TikaCoreProperties.CONTENT_TYPE_OVERRIDE, detectedName);
        return detectedName.startsWith("text");
    } catch (IOException ignored) {
        // best effort: a failed detection means the part is not treated as text
        return false;
    }
}
/**
 * Marks a part as an inline or attachment resource and passes it to the
 * embedded document extractor.
 * <p>
 * Only the disposition type (the token before the first ';') decides
 * whether the part is inline. Searching the whole header value would
 * misclassify an attachment whose parameters merely contain the word
 * "inline", for example {@code attachment; filename="inline-logo.png"}.
 *
 * @param tis      stream with the content of the part
 * @param metadata metadata of the part; receives the embedded resource type
 * @throws MimeException if the extractor fails with a SAX error
 * @throws IOException   if the content cannot be read
 */
private void handleEmbedded(TikaInputStream tis, Metadata metadata) throws MimeException, IOException {
    String disposition = metadata.get(Metadata.CONTENT_DISPOSITION);
    boolean isInline = false;
    if (disposition != null) {
        int paramStart = disposition.indexOf(';');
        String dispositionType =
                (paramStart < 0 ? disposition : disposition.substring(0, paramStart)).trim();
        isInline = "inline".equalsIgnoreCase(dispositionType);
    }
    if (isInline) {
        metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
    } else {
        metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
    }
    try {
        if (extractor.shouldParseEmbedded(metadata)) {
            // the caller has already wrapped the content in a TikaInputStream
            extractor.parseEmbedded(tis, handler, metadata, false);
        }
    } catch (SAXException e) {
        throw new MimeException(e);
    }
}
/**
 * Called when a body part ends. Closes the paragraph and the enclosing
 * div of the entry, innermost first.
 * <p>
 * Nothing is written while a multipart/alternative is being buffered.
 * NOTE(review): the matching start callback is not visible in this part of
 * the file; it presumably opens {@code <div class="email-entry"><p>} --
 * confirm the two stay symmetrical.
 *
 * @throws MimeException if the SAX handler rejects the end tags
 */
@Override
public void endBodyPart() throws MimeException {
    //if we're buffering for a multipart/alternative
    //don't write the closing tags
    if (alternativePartBuffer.size() > 0) {
        return;
    }
    try {
        // an end callback must close elements, not open new ones
        handler.endElement("p");
        handler.endElement("div");
    } catch (SAXException e) {
        throw new MimeException(e);
    }
}
/**
 * Called when a header block starts. This implementation does nothing.
 */
@Override
public void startHeader() throws MimeException {
// intentionally empty: no output is produced at the start of a header block
}
/**
 * Called when a multipart starts. The descriptor is always pushed on the
 * stack of open multiparts. Unless all alternatives are extracted, a
 * multipart/alternative (or any multipart nested inside one that is already
 * being buffered) is additionally recorded in the alternative buffer and
 * linked to its parent.
 *
 * @param descr descriptor of the multipart that starts
 * @throws MimeException declared by the interface; not thrown here
 */
@Override
public void startMultipart(BodyDescriptor descr) throws MimeException {
    parts.push(descr);
    if (extractAllAlternatives) {
        return;
    }
    boolean alreadyBuffering = alternativePartBuffer.size() > 0;
    if (!alreadyBuffering && !MULTIPART_ALTERNATIVE.equalsIgnoreCase(descr.getMimeType())) {
        // neither inside an alternative nor starting one: nothing to buffer
        return;
    }
    // a nested multipart hangs off the part currently on top of the stack;
    // the outermost alternative has no parent
    Part parent = alreadyBuffering ? alternativePartBuffer.peek() : null;
    Part part = new Part(descr);
    alternativePartBuffer.push(part);
    if (parent != null) {
        parent.children.add(part);
    }
}
/**
 * Returns the raw text of a field with the leading field name and any
 * spaces that follow it removed.
 * <p>
 * The scan for spaces is bounded by the length of the raw text, so a field
 * that ends right after the prefix, or consists only of spaces after it,
 * yields an empty string instead of a StringIndexOutOfBoundsException.
 *
 * @param field     field whose raw representation is read
 * @param fieldname prefix to skip; only its length is used
 * @return the remainder of the raw field, or an empty string if nothing is left
 */
private String stripOutFieldPrefix(Field field, String fieldname) {
    String temp = field.getRaw().toString();
    int loc = fieldname.length();
    if (loc >= temp.length()) {
        // nothing follows the prefix (or the raw text is shorter than the prefix)
        return "";
    }
    while (loc < temp.length() && temp.charAt(loc) == ' ') {
        loc++;
    }
    return temp.substring(loc);
}
/**
 * Walks a buffered part tree and writes the preferred content.
 * <p>
 * Leaf contents are written inline. For a multipart/alternative only the
 * highest-scoring child is followed, the first one winning on ties; for
 * any other multipart every child is followed in order.
 *
 * @param part root of the (sub)tree to handle; {@code null} is ignored
 * @throws MimeException if writing a part fails
 * @throws IOException   if reading a part fails
 */
private void handleBestParts(Part part) throws MimeException, IOException {
    if (part == null) {
        return;
    }
    if (part instanceof BodyContents) {
        handleInlineBodyPart((BodyContents) part);
        return;
    }
    boolean isAlternative =
            MULTIPART_ALTERNATIVE.equalsIgnoreCase(part.bodyDescriptor.getMimeType());
    if (!isAlternative) {
        for (Part child : part.children) {
            handleBestParts(child);
        }
        return;
    }
    Part winner = null;
    int winningScore = -1;
    for (Part candidate : part.children) {
        int candidateScore = score(candidate);
        // strict comparison keeps the earliest candidate when scores are equal
        if (candidateScore > winningScore) {
            winningScore = candidateScore;
            winner = candidate;
        }
    }
    handleBestParts(winner);
}
/**
 * Writes a body part into the main document when a suitable parser is
 * available, and falls back to embedded handling otherwise.
 * <p>
 * HTML, RTF and plain text each look up their own leaf parser in the parse
 * context. For plain text the TXT parser is tried first, then the text/CSV
 * parser; in the second case the content type override is set to text/plain
 * on the metadata passed to the parser.
 *
 * @param part buffered content and metadata of the body part
 * @throws MimeException if parsing fails with a SAX or Tika error
 * @throws IOException   if the content cannot be read
 */
private void handleInlineBodyPart(BodyContents part) throws MimeException, IOException {
    final String contentType = part.metadata.get(Metadata.CONTENT_TYPE);
    Parser parser = null;
    boolean inlineText = false;
    if (MediaType.TEXT_HTML.toString().equalsIgnoreCase(contentType)) {
        parser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(HtmlParser.class, parseContext);
    } else if ("application/rtf".equalsIgnoreCase(contentType)) {
        parser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext);
    } else if (MediaType.TEXT_PLAIN.toString().equalsIgnoreCase(contentType)) {
        parser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(TXTParser.class, parseContext);
        if (parser == null) {
            parser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(
                    TextAndCSVParser.class, parseContext);
            inlineText = true;
        }
    }

    if (parser == null) {
        //back off and treat it as an embedded chunk
        try (TikaInputStream tis = TikaInputStream.get(part.bytes)) {
            handleEmbedded(tis, part.metadata);
        }
        return;
    }

    //parse inline, with a fresh metadata object for the parser
    Metadata inlineMetadata = new Metadata();
    if (inlineText) {
        inlineMetadata.set(TikaCoreProperties.CONTENT_TYPE_OVERRIDE,
                MediaType.TEXT_PLAIN.toString());
    }
    try {
        ByteArrayInputStream content = new ByteArrayInputStream(part.bytes);
        EmbeddedContentHandler target =
                new EmbeddedContentHandler(new BodyContentHandler(handler));
        parser.parse(content, target, inlineMetadata, parseContext);
    } catch (SAXException | TikaException e) {
        throw new MimeException(e);
    }
}
/**
 * Ranks a part; a higher score is preferred.
 * <p>
 * A {@code null} part or body contents without a content type score 0,
 * text/plain 1, application/rtf 2 and text/html 3. Everything else,
 * including nested multiparts and body contents of other types, scores 4.
 *
 * @param part part to rank; may be {@code null}
 * @return the score, from 0 to 4
 */
private int score(Part part) {
    if (part == null) {
        return 0;
    }
    if (!(part instanceof BodyContents)) {
        return 4;
    }
    String contentType = ((BodyContents) part).metadata.get(Metadata.CONTENT_TYPE);
    if (contentType == null) {
        return 0;
    }
    if (MediaType.TEXT_PLAIN.toString().equalsIgnoreCase(contentType)) {
        return 1;
    }
    //TODO -- is this the right definition in rfc822 for rich text?!
    if ("application/rtf".equalsIgnoreCase(contentType)) {
        return 2;
    }
    if (MediaType.TEXT_HTML.toString().equalsIgnoreCase(contentType)) {
        return 3;
    }
    return 4;
}
private static class Part {
private final BodyDescriptor bodyDescriptor;
private final List