org.apache.tika.parser.microsoft.OutlookExtractor Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft;
import static java.nio.charset.StandardCharsets.UTF_8;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.james.mime4j.codec.DecodeMonitor;
import org.apache.james.mime4j.codec.DecoderUtil;
import org.apache.poi.hmef.attribute.MAPIRtfAttribute;
import org.apache.poi.hsmf.MAPIMessage;
import org.apache.poi.hsmf.datatypes.AttachmentChunks;
import org.apache.poi.hsmf.datatypes.ByteChunk;
import org.apache.poi.hsmf.datatypes.Chunk;
import org.apache.poi.hsmf.datatypes.Chunks;
import org.apache.poi.hsmf.datatypes.MAPIProperty;
import org.apache.poi.hsmf.datatypes.PropertyValue;
import org.apache.poi.hsmf.datatypes.RecipientChunks;
import org.apache.poi.hsmf.datatypes.StringChunk;
import org.apache.poi.hsmf.datatypes.Types;
import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.util.CodePageUtil;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.html.HtmlEncodingDetector;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.parser.mbox.MboxParser;
import org.apache.tika.parser.rtf.RTFParser;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
/**
* Outlook Message Parser.
*/
public class OutlookExtractor extends AbstractPOIFSExtractor {
public enum RECIPIENT_TYPE {
TO(1),
CC(2),
BCC(3),
UNRECOGNIZED(-1),
UNSPECIFIED(-1);
private final int val;
RECIPIENT_TYPE(int val) {
this.val = val;
}
public static RECIPIENT_TYPE getTypeFromVal(int val) {
//mild hackery, clean up
if (val > 0 && val < 4) {
return RECIPIENT_TYPE.values()[val - 1];
}
return UNRECOGNIZED;
}
};
private enum ADDRESS_TYPE {
EX,
SMTP
}
private static Pattern HEADER_KEY_PAT =
Pattern.compile("\\A([\\x21-\\x39\\x3B-\\x7E]+):(.*?)\\Z");
//this according to the spec; in practice, it is probably more likely
//that a "split field" fails to start with a space character than
//that a real header contains anything but [-_A-Za-z0-9].
//e.g.
//header: this header goes onto the next line
// headers = normalizeHeaders(msg.getHeaders());
String from = msg.getDisplayFrom();
handleFromTo(headers, metadata);
metadata.set(TikaCoreProperties.TITLE, subject);
// TODO: Move to description in Tika 2.0
metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION,
msg.getConversationTopic());
try {
for (String recipientAddress : msg.getRecipientEmailAddressList()) {
if (recipientAddress != null)
metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
}
} catch (ChunkNotFoundException he) {
} // Will be fixed in POI 3.7 Final
for (Map.Entry e : headers.entrySet()) {
String headerKey = e.getKey();
for (String headerValue : e.getValue()) {
metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + headerKey, headerValue);
}
}
// Date - try two ways to find it
// First try via the proper chunk
if (msg.getMessageDate() != null) {
metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
} else {
if (headers != null && headers.size() > 0) {
for (Map.Entry header : headers.entrySet()) {
String headerKey = header.getKey();
if (headerKey.toLowerCase(Locale.ROOT).startsWith("date:")) {
String date = headerKey.substring(headerKey.indexOf(':') + 1).trim();
// See if we can parse it as a normal mail date
try {
Date d = MboxParser.parseDate(date);
metadata.set(TikaCoreProperties.CREATED, d);
metadata.set(TikaCoreProperties.MODIFIED, d);
} catch (ParseException e) {
// Store it as-is, and hope for the best...
metadata.set(TikaCoreProperties.CREATED, date);
metadata.set(TikaCoreProperties.MODIFIED, date);
}
break;
}
}
}
}
xhtml.element("h1", subject);
// Output the from and to details in text, as you
// often want them in text form for searching
xhtml.startElement("dl");
if (from != null) {
header(xhtml, "From", from);
}
header(xhtml, "To", msg.getDisplayTo());
header(xhtml, "Cc", msg.getDisplayCC());
header(xhtml, "Bcc", msg.getDisplayBCC());
try {
header(xhtml, "Recipients", msg.getRecipientEmailAddress());
} catch (ChunkNotFoundException e) {
}
xhtml.endElement("dl");
// Get the message body. Preference order is: html, rtf, text
Chunk htmlChunk = null;
Chunk rtfChunk = null;
Chunk textChunk = null;
for (Chunk chunk : msg.getMainChunks().getChunks()) {
if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
htmlChunk = chunk;
}
if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
rtfChunk = chunk;
}
if (chunk.getChunkId() == MAPIProperty.BODY.id) {
textChunk = chunk;
}
}
boolean doneBody = false;
xhtml.startElement("div", "class", "message-body");
if (htmlChunk != null) {
byte[] data = null;
if (htmlChunk instanceof ByteChunk) {
data = ((ByteChunk) htmlChunk).getValue();
} else if (htmlChunk instanceof StringChunk) {
data = ((StringChunk) htmlChunk).getRawValue();
}
if (data != null) {
Parser htmlParser =
EmbeddedDocumentUtil.tryToFindExistingLeafParser(HtmlParser.class, parseContext);
if (htmlParser == null) {
htmlParser = new HtmlParser();
}
htmlParser.parse(
new ByteArrayInputStream(data),
new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
new Metadata(), parseContext
);
doneBody = true;
}
}
if (rtfChunk != null && !doneBody) {
ByteChunk chunk = (ByteChunk) rtfChunk;
MAPIRtfAttribute rtf = new MAPIRtfAttribute(
MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue()
);
Parser rtfParser =
EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext);
if (rtfParser == null) {
rtfParser = new RTFParser();
}
rtfParser.parse(
new ByteArrayInputStream(rtf.getData()),
new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
new Metadata(), parseContext);
doneBody = true;
}
if (textChunk != null && !doneBody) {
xhtml.element("p", ((StringChunk) textChunk).getValue());
}
xhtml.endElement("div");
// Process the attachments
for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
xhtml.startElement("div", "class", "attachment-entry");
String filename = null;
if (attachment.getAttachLongFileName() != null) {
filename = attachment.getAttachLongFileName().getValue();
} else if (attachment.getAttachFileName() != null) {
filename = attachment.getAttachFileName().getValue();
}
if (filename != null && filename.length() > 0) {
xhtml.element("h1", filename);
}
if (attachment.getAttachData() != null) {
handleEmbeddedResource(
TikaInputStream.get(attachment.getAttachData().getValue()),
filename, null,
null, xhtml, true
);
}
if (attachment.getAttachmentDirectory() != null) {
handleEmbeddedOfficeDoc(
attachment.getAttachmentDirectory().getDirectory(),
xhtml
);
}
xhtml.endElement("div");
}
} catch (ChunkNotFoundException e) {
throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
} finally {
//You'd think you'd want to call msg.close().
//Don't do that. That closes down the file system.
//If an msg has multiple msg attachments, some of them
//can reside in the same file system. After the first
//child is read, the fs is closed, and the other children
//get a java.nio.channels.ClosedChannelException
}
}
private void handleFromTo(Map headers, Metadata metadata) throws ChunkNotFoundException {
String from = msg.getDisplayFrom();
metadata.set(TikaCoreProperties.CREATOR, from);
metadata.set(Metadata.MESSAGE_FROM, from);
metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo());
metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC());
metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC());
Chunks chunks = msg.getMainChunks();
StringChunk sentByServerType = chunks.getSentByServerType();
if (sentByServerType != null) {
metadata.set(Office.MAPI_SENT_BY_SERVER_TYPE,
sentByServerType.getValue());
}
Map> mainChunks = msg.getMainChunks().getAll();
List senderAddresType = mainChunks.get(MAPIProperty.SENDER_ADDRTYPE);
String senderAddressTypeString = "";
if (senderAddresType != null && senderAddresType.size() > 0) {
senderAddressTypeString = senderAddresType.get(0).toString();
}
//sometimes in SMTP .msg files there is an email in the sender name field.
setFirstChunk(
mainChunks.get(MAPIProperty.SENDER_NAME), Message.MESSAGE_FROM_NAME, metadata);
setFirstChunk(
mainChunks.get(MAPIProperty.SENT_REPRESENTING_NAME),
Office.MAPI_FROM_REPRESENTING_NAME, metadata);
setFirstChunk(mainChunks.get(MAPIProperty.SENDER_EMAIL_ADDRESS),
Message.MESSAGE_FROM_EMAIL, metadata);
setFirstChunk(mainChunks.get(MAPIProperty.SENT_REPRESENTING_EMAIL_ADDRESS),
Office.MAPI_FROM_REPRESENTING_EMAIL, metadata);
for (Recipient recipient : buildRecipients()) {
switch(recipient.recipientType) {
case TO:
addEvenIfNull(Message.MESSAGE_TO_NAME, recipient.name, metadata);
addEvenIfNull(Message.MESSAGE_TO_DISPLAY_NAME, recipient.displayName, metadata);
addEvenIfNull(Message.MESSAGE_TO_EMAIL, recipient.emailAddress, metadata);
break;
case CC:
addEvenIfNull(Message.MESSAGE_CC_NAME, recipient.name, metadata);
addEvenIfNull(Message.MESSAGE_CC_DISPLAY_NAME, recipient.displayName, metadata);
addEvenIfNull(Message.MESSAGE_CC_EMAIL, recipient.emailAddress, metadata);
break;
case BCC:
addEvenIfNull(Message.MESSAGE_BCC_NAME, recipient.name, metadata);
addEvenIfNull(Message.MESSAGE_BCC_DISPLAY_NAME, recipient.displayName, metadata);
addEvenIfNull(Message.MESSAGE_BCC_EMAIL, recipient.emailAddress, metadata);
break;
default:
//log unknown or undefined?
break;
}
}
}
//need to add empty string to ensure that parallel arrays are parallel
//even if one value is null.
public static void addEvenIfNull(Property property, String value, Metadata metadata) {
if (value == null) {
value = "";
}
metadata.add(property, value);
}
private static void setFirstChunk(List chunks, Property property,
Metadata metadata) {
if (chunks == null || chunks.size() < 1 || chunks.get(0) == null) {
return;
}
metadata.set(property, chunks.get(0).toString());
}
private static void addFirstChunk(List chunks, Property property,
Metadata metadata) {
if (chunks == null || chunks.size() < 1 || chunks.get(0) == null) {
return;
}
metadata.add(property, chunks.get(0).toString());
}
//TODO: replace this with getMessageClassEnum when we upgrade POI
public static String getMessageClass(String messageClass){
if (messageClass == null || messageClass.trim().length() == 0) {
return "UNSPECIFIED";
} else if (messageClass.equalsIgnoreCase("IPM.Note")) {
return "NOTE";
} else if (messageClass.equalsIgnoreCase("IPM.Contact")) {
return "CONTACT";
} else if (messageClass.equalsIgnoreCase("IPM.Appointment")) {
return "APPOINTMENT";
} else if (messageClass.equalsIgnoreCase("IPM.StickyNote")) {
return "STICKY_NOTE";
} else if (messageClass.equalsIgnoreCase("IPM.Task")) {
return "TASK";
} else if (messageClass.equalsIgnoreCase("IPM.Post")) {
return "POST";
} else {
return "UNKNOWN";
}
}
//As of 3.15, POI currently returns header[] by splitting on /\r?\n/
//this rebuilds headers that are broken up over several lines
//this also decodes encoded headers.
private Map normalizeHeaders(String[] rows) {
Map ret = new LinkedHashMap<>();
if (rows == null) {
return ret;
}
StringBuilder sb = new StringBuilder();
Map> headers = new LinkedHashMap();
Matcher headerKeyMatcher = HEADER_KEY_PAT.matcher("");
String lastKey = null;
int consec = 0;
for (String row : rows) {
headerKeyMatcher.reset(row);
if (headerKeyMatcher.find()) {
if (lastKey != null) {
List vals = headers.get(lastKey);
vals = (vals == null) ? new ArrayList() : vals;
vals.add(decodeHeader(sb.toString()));
headers.put(lastKey, vals);
}
//reset sb
sb.setLength(0);
lastKey = headerKeyMatcher.group(1).trim();
sb.append(headerKeyMatcher.group(2).trim());
consec = 0;
} else {
if (consec > 0) {
sb.append("\n");
}
sb.append(row);
}
consec++;
}
//make sure to add the last value
if (sb.length() > 0 && lastKey != null) {
List vals = headers.get(lastKey);
vals = (vals == null) ? new ArrayList() : vals;
vals.add(decodeHeader(sb.toString()));
headers.put(lastKey, vals);
}
//convert to array
for (Map.Entry> e : headers.entrySet()) {
ret.put(e.getKey(), e.getValue().toArray(new String[e.getValue().size()]));
}
return ret;
}
private String decodeHeader(String header) {
return DecoderUtil.decodeEncodedWords(header, DecodeMonitor.SILENT);
}
private void header(XHTMLContentHandler xhtml, String key, String value)
throws SAXException {
if (value != null && value.length() > 0) {
xhtml.element("dt", key);
xhtml.element("dd", value);
}
}
/**
* Tries to identify the correct encoding for 7-bit (non-unicode)
* strings in the file.
* Many messages store their strings as unicode, which is
* nice and easy. Some use one-byte encodings for their
* strings, but don't always store the encoding anywhere
* helpful in the file.
* This method checks for codepage properties, and failing that
* looks at the headers for the message, and uses these to
* guess the correct encoding for your file.
* Bug #49441 has more on why this is needed
* This is taken verbatim from POI (TIKA-1238)
* as a temporary workaround to prevent unsupported encoding exceptions
*/
private void guess7BitEncoding(MAPIMessage msg) {
Chunks mainChunks = msg.getMainChunks();
//sanity check
if (mainChunks == null) {
return;
}
Map> props = mainChunks.getProperties();
if (props != null) {
// First choice is a codepage property
for (MAPIProperty prop : new MAPIProperty[]{
MAPIProperty.MESSAGE_CODEPAGE,
MAPIProperty.INTERNET_CPID
}) {
List val = props.get(prop);
if (val != null && val.size() > 0) {
int codepage = ((PropertyValue.LongPropertyValue) val.get(0)).getValue();
String encoding = null;
try {
encoding = CodePageUtil.codepageToEncoding(codepage, true);
} catch (UnsupportedEncodingException e) {
//swallow
}
if (tryToSet7BitEncoding(msg, encoding)) {
return;
}
}
}
}
// Second choice is a charset on a content type header
try {
String[] headers = msg.getHeaders();
if(headers != null && headers.length > 0) {
// Look for a content type with a charset
Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE);
for(String header : headers) {
if(header.startsWith("Content-Type")) {
Matcher m = p.matcher(header);
if(m.matches()) {
// Found it! Tell all the string chunks
String charset = m.group(1);
if (tryToSet7BitEncoding(msg, charset)) {
return;
}
}
}
}
}
} catch(ChunkNotFoundException e) {}
// Nothing suitable in the headers, try HTML
// TODO: do we need to replicate this in Tika? If we wind up
// parsing the html version of the email, this is duplicative??
// Or do we need to reset the header strings based on the html
// meta header if there is no other information?
try {
String html = msg.getHtmlBody();
if(html != null && html.length() > 0) {
Charset charset = null;
try {
charset = detector.detect(new ByteArrayInputStream(
html.getBytes(UTF_8)), EMPTY_METADATA);
} catch (IOException e) {
//swallow
}
if (charset != null && tryToSet7BitEncoding(msg, charset.name())) {
return;
}
}
} catch(ChunkNotFoundException e) {}
//absolute last resort, try charset detector
StringChunk text = mainChunks.getTextBodyChunk();
if (text != null) {
CharsetDetector detector = new CharsetDetector();
detector.setText(text.getRawValue());
CharsetMatch match = detector.detect();
if (match != null && match.getConfidence() > 35 &&
tryToSet7BitEncoding(msg, match.getName())) {
return;
}
}
}
private boolean tryToSet7BitEncoding(MAPIMessage msg, String charsetName) {
if (charsetName == null) {
return false;
}
if (charsetName.equalsIgnoreCase("utf-8")) {
return false;
}
try {
if (Charset.isSupported(charsetName)) {
msg.set7BitEncoding(charsetName);
return true;
}
} catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
//swallow
}
return false;
}
private List buildRecipients() {
RecipientChunks[] recipientChunks = msg.getRecipientDetailsChunks();
if (recipientChunks == null) {
return Collections.EMPTY_LIST;
}
List recipients = new LinkedList<>();
for (RecipientChunks chunks : recipientChunks) {
Recipient r = new Recipient();
r.displayName = (chunks.recipientDisplayNameChunk != null) ? chunks.recipientDisplayNameChunk.toString() : null;
r.name = (chunks.recipientNameChunk != null) ? chunks.recipientNameChunk.toString() : null;
r.emailAddress = chunks.getRecipientEmailAddress();
List vals = chunks.getProperties().get(MAPIProperty.RECIPIENT_TYPE);
RECIPIENT_TYPE recipientType = RECIPIENT_TYPE.UNSPECIFIED;
if (vals != null && vals.size() > 0) {
Object val = vals.get(0).getValue();
if (val instanceof Integer) {
recipientType = RECIPIENT_TYPE.getTypeFromVal((int)val);
}
}
r.recipientType = recipientType;
vals = chunks.getProperties().get(MAPIProperty.ADDRTYPE);
if (vals != null && vals.size() > 0) {
String val = vals.get(0).toString();
if (val != null) {
val = val.toLowerCase(Locale.US);
//need to find example of this for testing
if (val.equals("ex")) {
r.addressType = ADDRESS_TYPE.EX;
} else if (val.equals("smtp")) {
r.addressType = ADDRESS_TYPE.SMTP;
}
}
}
recipients.add(r);
}
return recipients;
}
private static class Recipient {
String name;
String displayName;
RECIPIENT_TYPE recipientType;
String emailAddress;
ADDRESS_TYPE addressType;
}
}