eu.dicodeproject.analysis.examples.MailHandler Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of integration Show documentation
Show all versions of integration Show documentation
The examples module provides glue code implementation for extracting common phrases, key word distributions and more from tweets stored on HDFS/HBase. It builds on Mahout for more sophisticated analysis.
The newest version!
/**
* Copyright (C) 2010, 2011 Neofonie GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package eu.dicodeproject.analysis.examples;
import java.io.IOException;
import org.apache.mahout.text.ChunkedWriter;
import org.xml.sax.Attributes;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
/**
 * Accepts events resulting from parsing mbox archives. Each mail's content is
 * handed to the configured {@link ChunkedWriter} as the value part of a key
 * value pair; the key is the configured prefix (the original mbox archive
 * location), a dash and a running number.
 *
 * A mail is delimited by a {@code div} element: the start of the element
 * begins a fresh text buffer and bumps the running number, the end of the
 * element flushes the buffer to the writer.
 *
 * Class is not threadsafe.
 */
public final class MailHandler implements MailContentHandler {
  /** Sink receiving one key/value pair per parsed mail. */
  private final ChunkedWriter writer;
  /** Key prefix; combined with the running number to form the key. */
  private String prefix;
  /**
   * Text collected for the mail currently being parsed. Initialised eagerly so
   * that events arriving before {@link #startDocument()} cannot trigger a
   * NullPointerException.
   */
  private StringBuilder builder = new StringBuilder();
  /** Running number of the mails seen so far by this handler. */
  private int counter;

  /**
   * Creates a handler that emits parsed mails to the given writer.
   *
   * @param writer the writer each mail is written to
   */
  public MailHandler(ChunkedWriter writer) {
    this.writer = writer;
    this.counter = 0;
  }

  /**
   * @return the key prefix currently in use, {@code null} if none was set
   */
  public String getPrefix() {
    return prefix;
  }

  /**
   * Sets the prefix used for the keys of all mails written from now on.
   *
   * @param targetKeyPrefix the new key prefix
   */
  @Override
  public void setPrefix(String targetKeyPrefix) {
    this.prefix = targetKeyPrefix;
  }

  /** Starts a new document with an empty text buffer. */
  @Override
  public void startDocument() throws SAXException {
    this.builder = new StringBuilder();
  }

  /**
   * Appends a chunk of character data to the current mail, skipping chunks
   * that start with {@code Content-Type} or {@code --}.
   *
   * @param ch buffer holding the characters
   * @param start index of the first valid character in {@code ch}
   * @param length number of valid characters in {@code ch}
   */
  @Override
  public void characters(char[] ch, int start, int length) throws SAXException {
    // Only [start, start + length) belongs to this event; the rest of the
    // buffer is owned by the parser and may hold unrelated or stale data.
    String content = new String(ch, start, length);
    if (!content.startsWith("Content-Type") && !content.startsWith("--")) {
      this.builder.append(content);
    }
  }

  @Override
  public void endDocument() throws SAXException {
    // Nothing to do: mails are flushed when their div element ends.
  }

  /**
   * Begins a new mail when a {@code div} element opens: increments the running
   * number and discards any text collected so far.
   */
  @Override
  public void startElement(String uri, String localName, String qName, Attributes atts)
      throws SAXException {
    if ("div".equals(localName)) {
      this.counter++;
      this.builder = new StringBuilder();
    }
  }

  /**
   * Writes the collected text to the writer when a {@code div} element closes.
   * The key is {@code prefix + "-" + counter}.
   */
  @Override
  public void endElement(String uri, String localName, String qName) throws SAXException {
    if ("div".equals(localName)) {
      try {
        writer.write(prefix + "-" + counter, this.builder.toString());
      } catch (IOException ignored) {
        // Deliberately best-effort: a mail that cannot be written is dropped
        // so the remaining mails of the archive are still processed.
        // TODO accumulate and break if too many.
      }
    }
  }

  @Override
  public void endPrefixMapping(String prefixName) throws SAXException {
    // Nothing to do.
  }

  @Override
  public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
    // Nothing to do.
  }

  @Override
  public void processingInstruction(String target, String data) throws SAXException {
    // Nothing to do.
  }

  @Override
  public void setDocumentLocator(Locator locator) {
    // Nothing to do.
  }

  @Override
  public void skippedEntity(String name) throws SAXException {
    // Nothing to do.
  }

  @Override
  public void startPrefixMapping(String prefixName, String uri) throws SAXException {
    // Nothing to do.
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy