eu.dicodeproject.analysis.examples.UnquotedHandler Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of integration Show documentation
Show all versions of integration Show documentation
The examples module provides glue code implementation for extracting common phrases, key word distributions and more from tweets stored on HDFS/HBase. It builds on Mahout for more sophisticated analysis.
The newest version!
/**
* Copyright (C) 2010, 2011 Neofonie GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package eu.dicodeproject.analysis.examples;
import java.io.IOException;
import org.apache.mahout.text.ChunkedWriter;
import org.xml.sax.Attributes;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
/**
 * Stores only content to disk that is not quoted in the original mail.
 *
 * <p>The handler collects character data per {@code div} element and hands the
 * collected text to the {@link ChunkedWriter} when that {@code div} is closed.
 * Character data seen while inside a {@code q} element is dropped, as is any
 * text run that starts with {@code Content-Type} or {@code --}.
 *
 * <p>Careful - the implementation carries state; do not use one instance of
 * the UnquotedHandler to process multiple documents (as in mbox files) at
 * once.
 */
public class UnquotedHandler implements MailContentHandler {

  /** Sink that receives one entry per closed {@code div} element. */
  private final ChunkedWriter writer;

  /** True while the parser is inside a {@code q} element. */
  private boolean inQuote = false;

  /** First part of every key passed to the writer; {@code null} until set. */
  private String prefix;

  /**
   * Text collected for the current {@code div}. Initialised eagerly so that
   * character data delivered before any start event cannot trigger a
   * NullPointerException.
   */
  private StringBuilder builder = new StringBuilder();

  /** Number of {@code div} elements opened so far; used as key suffix. */
  private int counter;

  /** Number of writes that failed with an IOException and were skipped. */
  private int failedWrites;

  /**
   * Creates a handler that forwards unquoted content to the given writer.
   *
   * @param writer receives the text of every closed {@code div} element
   */
  public UnquotedHandler(ChunkedWriter writer) {
    this.writer = writer;
    this.counter = 0;
    this.failedWrites = 0;
  }

  /**
   * @return the prefix used when building keys for the writer
   */
  public String getPrefix() {
    return prefix;
  }

  /**
   * @param targetKeyPrefix the prefix to use when building keys for the writer
   */
  public void setPrefix(String targetKeyPrefix) {
    this.prefix = targetKeyPrefix;
  }

  /**
   * @return how many chunks could not be written because the writer threw an
   *         IOException
   */
  public int getFailedWrites() {
    return failedWrites;
  }

  /** Starts with an empty buffer and outside of any quote. */
  @Override
  public void startDocument() throws SAXException {
    this.builder = new StringBuilder();
    this.inQuote = false;
  }

  /**
   * Appends the reported text to the current buffer unless it is quoted or
   * starts with {@code Content-Type} or {@code --}.
   *
   * @param ch     buffer holding the characters
   * @param start  index of the first character that belongs to this event
   * @param length number of characters that belong to this event
   */
  @Override
  public void characters(char[] ch, int start, int length) throws SAXException {
    // Only the window [start, start + length) belongs to this event; the rest
    // of the array is parser-internal and must not be read.
    String content = new String(ch, start, length);
    // NOTE: the prefix checks apply to each callback on its own.
    if (!content.startsWith("Content-Type") && !content.startsWith("--") && !this.inQuote) {
      this.builder.append(content);
    }
  }

  @Override
  public void endDocument() throws SAXException {
    // Nothing to flush: every div is written when it is closed.
  }

  /**
   * A {@code div} opens a new chunk (fresh buffer, next counter value, quote
   * state cleared); a {@code q} marks the beginning of quoted text.
   */
  @Override
  public void startElement(String uri, String localName, String qName, Attributes atts)
      throws SAXException {
    if ("div".equals(localName)) {
      this.counter++;
      this.builder = new StringBuilder();
      this.inQuote = false;
    } else if ("q".equals(localName)) {
      this.inQuote = true;
    }
  }

  /**
   * Closing a {@code div} writes the collected text under the key
   * {@code prefix + "-" + counter}; closing a {@code q} ends the quoted text.
   */
  @Override
  public void endElement(String uri, String localName, String qName) throws SAXException {
    if ("div".equals(localName)) {
      try {
        writer.write(prefix + "-" + counter, this.builder.toString());
      } catch (IOException e) {
        // Best effort: a chunk that cannot be written is skipped so that the
        // remaining chunks are still processed. The failure is counted so
        // callers can detect it via getFailedWrites().
        // TODO accumulate and break if too many.
        this.failedWrites++;
      }
    } else if ("q".equals(localName)) {
      this.inQuote = false;
    }
  }

  @Override
  public void setDocumentLocator(Locator locator) {
    // Nothing to change.
  }

  @Override
  public void startPrefixMapping(String prefix, String uri) throws SAXException {
    // Nothing to change.
  }

  @Override
  public void endPrefixMapping(String prefix) throws SAXException {
    // Nothing to change.
  }

  @Override
  public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
    // Nothing to change.
  }

  @Override
  public void processingInstruction(String target, String data) throws SAXException {
    // Nothing to change.
  }

  @Override
  public void skippedEntity(String name) throws SAXException {
    // Nothing to change.
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy