// Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. The project price is only $1.
// You can buy this project and download/modify it as often as you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.mbox;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Collections;
import java.util.Date;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
 * Mbox (mailbox) parser. This version returns the headers for the first email
 * via metadata, which means headers from subsequent emails will be lost.
 * <p>
 * The input is scanned line by line. A line starting with
 * {@link #MBOX_RECORD_DIVIDER} begins a new message; the lines up to the first
 * blank line are treated as headers, and everything after it as the message
 * body, which is emitted as XHTML (one {@code <div class="email-entry">} per
 * message, with runs of lines starting with {@code >} wrapped in
 * {@code <q>}).
 */
public class MboxParser implements Parser {

    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.singleton(MediaType.application("mbox"));

    public static final String MBOX_MIME_TYPE = "application/mbox";

    /** Prefix of the line that separates two messages in an mbox file. */
    public static final String MBOX_RECORD_DIVIDER = "From ";

    /** Splits an (unfolded) header line into its name and its value. */
    private static final Pattern EMAIL_HEADER_PATTERN = Pattern.compile("([^ ]+):[ \t]*(.*)");

    /**
     * Matches a single angle-bracketed address. Angle brackets are excluded
     * from the captured text so that a header listing several recipients
     * yields one match per address instead of one match spanning all of them.
     */
    private static final Pattern EMAIL_ADDRESS_PATTERN = Pattern.compile("<([^<>]*@[^<>]*)>");

    private static final String EMAIL_HEADER_METADATA_PREFIX = "MboxParser-";
    private static final String EMAIL_FROMLINE_METADATA = EMAIL_HEADER_METADATA_PREFIX + "from";

    /** States of the line-oriented scanner used by {@code parse}. */
    private enum ParseStates {
        START, IN_HEADER, IN_CONTENT
    }

    /**
     * Returns the media types handled by this parser.
     *
     * @param context the parse context (not consulted)
     * @return a singleton set containing {@code application/mbox}
     */
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Parses an mbox stream, writing message bodies to the handler as XHTML
     * and the headers of the first message to the metadata.
     *
     * @param stream   the mbox data; it is read as US-ASCII and not closed here
     * @param handler  receives the generated XHTML SAX events
     * @param metadata receives the content type/encoding and the headers of
     *                 the first message
     * @param context  the parse context (not consulted)
     * @throws IOException   if the stream cannot be read
     * @throws TikaException if the US-ASCII charset is unavailable
     * @throws SAXException  if the handler rejects an event
     */
    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, TikaException, SAXException {

        InputStreamReader isr;
        try {
            // Headers are going to be 7-bit ascii
            isr = new InputStreamReader(stream, "US-ASCII");
        } catch (UnsupportedEncodingException e) {
            throw new TikaException("US-ASCII is not supported!", e);
        }
        BufferedReader reader = new BufferedReader(isr);

        metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
        metadata.set(Metadata.CONTENT_ENCODING, "us-ascii");

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        ParseStates parseState = ParseStates.START;
        // The header currently being assembled (folded continuation lines are
        // appended to it); it is flushed once the next header starts.
        String multiLine = null;
        boolean inQuote = false;
        int numEmails = 0;

        // We're going to scan, line-by-line, for a line that starts with
        // "From "
        for (String curLine = reader.readLine(); curLine != null; curLine = reader.readLine()) {
            boolean newMessage = curLine.startsWith(MBOX_RECORD_DIVIDER);
            if (newMessage) {
                numEmails += 1;
            }

            switch (parseState) {
            case START:
                // Anything before the first "From " line is ignored.
                if (newMessage) {
                    parseState = ParseStates.IN_HEADER;
                    multiLine = curLine;
                }
                break;

            case IN_HEADER:
                if (newMessage) {
                    // The previous message had no body. The pending header
                    // belongs to that previous message, while numEmails
                    // already counts the one starting on this line.
                    saveHeaderInMetadata(numEmails - 1, metadata, multiLine);
                    multiLine = curLine;
                } else if (curLine.length() == 0) {
                    // Blank line is signal that we're transitioning to the content.
                    saveHeaderInMetadata(numEmails, metadata, multiLine);
                    parseState = ParseStates.IN_CONTENT;

                    // Mimic what PackageParser does between entries.
                    xhtml.startElement("div", "class", "email-entry");
                    xhtml.startElement("p");
                    inQuote = false;
                } else if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
                    // Folded header: continuation of the pending header line.
                    multiLine += " " + curLine.trim();
                } else {
                    saveHeaderInMetadata(numEmails, metadata, multiLine);
                    multiLine = curLine;
                }
                break;

            // TODO - use real email parsing support so we can correctly handle
            // things like multipart messages and quoted-printable encoding.
            // We'd also want this for charset handling, where content isn't 7-bit
            // ascii.
            case IN_CONTENT:
                if (newMessage) {
                    endMessage(xhtml, inQuote);
                    parseState = ParseStates.IN_HEADER;
                    multiLine = curLine;
                } else {
                    boolean quoted = curLine.startsWith(">");
                    if (inQuote) {
                        if (!quoted) {
                            xhtml.endElement("q");
                            inQuote = false;
                        }
                    } else if (quoted) {
                        xhtml.startElement("q");
                        inQuote = true;
                    }

                    xhtml.characters(curLine);

                    // For plain text email, each line is a real break position.
                    xhtml.element("br", "");
                }
                break;
            }
        }

        // Flush whatever was still pending when the input ended.
        if (parseState == ParseStates.IN_HEADER) {
            saveHeaderInMetadata(numEmails, metadata, multiLine);
        } else if (parseState == ParseStates.IN_CONTENT) {
            endMessage(xhtml, inQuote);
        }

        xhtml.endDocument();
    }

    /**
     * Closes the XHTML elements opened for a message body.
     *
     * @param xhtml   the handler the message was written to
     * @param inQuote whether a {@code <q>} element is still open
     * @throws SAXException if the handler rejects an event
     */
    private void endMessage(XHTMLContentHandler xhtml, boolean inQuote) throws SAXException {
        if (inQuote) {
            xhtml.endElement("q");
        }

        xhtml.endElement("p");
        xhtml.endElement("div");
    }

    /**
     * Stores one header line in the metadata. Only headers of the first
     * message are kept; {@code null} and malformed lines are ignored.
     *
     * @param numEmails index (1-based) of the message the header belongs to
     * @param metadata  the metadata to add the header to
     * @param curLine   the complete (unfolded) header line, or the
     *                  {@code "From "} divider line, or {@code null}
     */
    private void saveHeaderInMetadata(int numEmails, Metadata metadata, String curLine) {
        if ((curLine == null) || (numEmails > 1)) {
            return;
        } else if (curLine.startsWith(MBOX_RECORD_DIVIDER)) {
            metadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length()));
            return;
        }

        Matcher headerMatcher = EMAIL_HEADER_PATTERN.matcher(curLine);
        if (!headerMatcher.matches()) {
            return; // ignore malformed header lines
        }

        // Lower-case with a fixed locale so that metadata keys derived from
        // the header name do not depend on the JVM's default locale.
        String headerTag = headerMatcher.group(1).toLowerCase(Locale.US);
        String headerContent = headerMatcher.group(2);

        if (headerTag.equals("from")) {
            metadata.add(Metadata.AUTHOR, headerContent);
            metadata.add(Metadata.CREATOR, headerContent);
        } else if (headerTag.equals("to")
                || headerTag.equals("cc")
                || headerTag.equals("bcc")) {
            // Record every bracketed address; if there is none, fall back to
            // the raw header value when it looks like a bare address.
            Matcher address = EMAIL_ADDRESS_PATTERN.matcher(headerContent);
            boolean foundAddress = false;
            while (address.find()) {
                metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, address.group(1));
                foundAddress = true;
            }
            if (!foundAddress && headerContent.indexOf('@') > -1) {
                metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, headerContent);
            }

            String property = Metadata.MESSAGE_TO;
            if (headerTag.equals("cc")) {
                property = Metadata.MESSAGE_CC;
            } else if (headerTag.equals("bcc")) {
                property = Metadata.MESSAGE_BCC;
            }
            metadata.add(property, headerContent);
        } else if (headerTag.equals("subject")) {
            metadata.add(Metadata.SUBJECT, headerContent);
            metadata.add(Metadata.TITLE, headerContent);
        } else if (headerTag.equals("date")) {
            try {
                metadata.set(Metadata.DATE, parseDate(headerContent));
            } catch (ParseException e) {
                // ignoring date because format was not understood
            }
        } else if (headerTag.equals("message-id")) {
            metadata.add(Metadata.IDENTIFIER, headerContent);
        } else if (headerTag.equals("in-reply-to")) {
            metadata.add(Metadata.RELATION, headerContent);
        } else if (headerTag.equals("content-type")) {
            // TODO - key off content-type in headers to
            // set mapping to use for content and convert if necessary.
            metadata.add(Metadata.CONTENT_TYPE, headerContent);
            metadata.add(Metadata.FORMAT, headerContent);
        } else {
            metadata.add(EMAIL_HEADER_METADATA_PREFIX + headerTag, headerContent);
        }
    }

    /**
     * Parses the value of a {@code Date} header.
     *
     * @param headerContent the header value, expected in the form
     *                      {@code EEE, d MMM yyyy HH:mm:ss Z}
     * @return the parsed date
     * @throws ParseException if the value does not match that form
     */
    private Date parseDate(String headerContent) throws ParseException {
        // SimpleDateFormat is not thread-safe, so a fresh instance is used per call.
        SimpleDateFormat dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US);
        return dateFormat.parse(headerContent);
    }

    /**
     * Convenience overload that parses with a new, empty {@link ParseContext}.
     *
     * @param stream   the mbox data
     * @param handler  receives the generated XHTML SAX events
     * @param metadata receives the extracted metadata
     * @throws IOException   if the stream cannot be read
     * @throws SAXException  if the handler rejects an event
     * @throws TikaException if the US-ASCII charset is unavailable
     */
    public void parse(
            InputStream stream, ContentHandler handler, Metadata metadata)
            throws IOException, SAXException, TikaException {
        parse(stream, handler, metadata, new ParseContext());
    }
}