// org.apache.tika.sax.WriteOutContentHandler Maven / Gradle / Ivy
// Show all versions of tika-core Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.sax;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Serializable;
import java.io.StringWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.UUID;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
 * SAX event handler that writes content up to an optional write
 * limit out to a character stream or other decorated handler.
 * <p>
 * Both {@link #characters(char[], int, int)} and
 * {@link #ignorableWhitespace(char[], int, int)} events count against the
 * same limit. When an event would exceed the limit, only the part that
 * still fits is forwarded to the decorated handler and a
 * {@link SAXException} is thrown; use
 * {@link #isWriteLimitReached(Throwable)} to recognise that exception.
 */
public class WriteOutContentHandler extends ContentHandlerDecorator {

    /**
     * The unique tag associated with exceptions from stream.
     */
    private final Serializable tag = UUID.randomUUID();

    /**
     * The maximum number of characters to write to the character stream.
     * Set to -1 for no limit. Values below -1 leave no room for any
     * character, i.e. they behave like a limit of zero.
     */
    private final int writeLimit;

    /**
     * Number of characters written so far. Only maintained while a write
     * limit is in effect.
     */
    private int writeCount = 0;

    /**
     * Creates a content handler that writes content up to the given
     * write limit to the given content handler.
     *
     * @since Apache Tika 0.10
     * @param handler content handler to be decorated
     * @param writeLimit write limit, or -1 to disable the write limit
     */
    public WriteOutContentHandler(ContentHandler handler, int writeLimit) {
        super(handler);
        this.writeLimit = writeLimit;
    }

    /**
     * Creates a content handler that writes content up to the given
     * write limit to the given character stream.
     *
     * @since Apache Tika 0.10
     * @param writer character stream
     * @param writeLimit write limit, or -1 to disable the write limit
     */
    public WriteOutContentHandler(Writer writer, int writeLimit) {
        this(new ToTextContentHandler(writer), writeLimit);
    }

    /**
     * Creates a content handler that writes character events to
     * the given writer. No write limit is applied.
     *
     * @param writer writer
     */
    public WriteOutContentHandler(Writer writer) {
        this(writer, -1);
    }

    /**
     * Creates a content handler that writes character events to
     * the given output stream using the default encoding.
     * No write limit is applied.
     *
     * @param stream output stream
     */
    public WriteOutContentHandler(OutputStream stream) {
        this(new OutputStreamWriter(stream, Charset.defaultCharset()));
    }

    /**
     * Creates a content handler that writes character events
     * to an internal string buffer. Use the {@link #toString()}
     * method to access the collected character content.
     * <p>
     * The internal string buffer is bounded at the given number of characters.
     * If this write limit is reached, then a {@link SAXException} is thrown.
     * The {@link #isWriteLimitReached(Throwable)} method can be used to
     * detect this case.
     *
     * @since Apache Tika 0.7
     * @param writeLimit maximum number of characters to include in the string,
     *                   or -1 to disable the write limit
     */
    public WriteOutContentHandler(int writeLimit) {
        this(new StringWriter(), writeLimit);
    }

    /**
     * Creates a content handler that writes character events
     * to an internal string buffer. Use the {@link #toString()}
     * method to access the collected character content.
     * <p>
     * The internal string buffer is bounded at 100k characters. If this
     * write limit is reached, then a {@link SAXException} is thrown. The
     * {@link #isWriteLimitReached(Throwable)} method can be used to detect
     * this case.
     */
    public WriteOutContentHandler() {
        this(100 * 1000);
    }

    /**
     * Forwards the given characters to the decorated handler, truncating
     * them at the write limit.
     *
     * @param ch character array
     * @param start index of the first character to write
     * @param length number of characters to write
     * @throws SAXException if the decorated handler fails, or if the event
     *         did not fit within the write limit (see
     *         {@link #isWriteLimitReached(Throwable)})
     */
    @Override
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        int allowed = allowedLength(length);
        super.characters(ch, start, allowed);
        // Counted only after the delegate succeeded, so a failing delegate
        // does not consume any of the limit.
        recordWrite(length, allowed);
    }

    /**
     * Forwards the given whitespace to the decorated handler, truncating
     * it at the write limit. Whitespace counts against the same limit as
     * {@link #characters(char[], int, int)}.
     *
     * @param ch character array
     * @param start index of the first character to write
     * @param length number of characters to write
     * @throws SAXException if the decorated handler fails, or if the event
     *         did not fit within the write limit (see
     *         {@link #isWriteLimitReached(Throwable)})
     */
    @Override
    public void ignorableWhitespace(char[] ch, int start, int length)
            throws SAXException {
        int allowed = allowedLength(length);
        super.ignorableWhitespace(ch, start, allowed);
        recordWrite(length, allowed);
    }

    /**
     * Checks whether the given exception (or any of its root causes) was
     * thrown by this handler as a signal of reaching the write limit.
     *
     * @since Apache Tika 0.7
     * @param t throwable, may be {@code null}
     * @return {@code true} if the write limit was reached,
     *         {@code false} otherwise (including when {@code t} is
     *         {@code null})
     */
    public boolean isWriteLimitReached(Throwable t) {
        for (Throwable cause = t; cause != null; cause = cause.getCause()) {
            if (cause instanceof WriteLimitReachedException) {
                // The first limit exception in the chain decides: it either
                // carries this handler's tag or belongs to another handler.
                return tag.equals(((WriteLimitReachedException) cause).tag);
            }
        }
        return false;
    }

    /**
     * Returns how many of the requested characters may still be written
     * without exceeding the write limit.
     *
     * @param length number of characters requested by the SAX event
     * @return {@code length} if it fits (or no limit is set), otherwise the
     *         non-negative number of characters left before the limit
     */
    private int allowedLength(int length) {
        if (writeLimit == -1) {
            return length;
        }
        // long arithmetic: "writeCount + length" can overflow int for limits
        // near Integer.MAX_VALUE and would then slip past the limit check.
        long remaining = Math.max(0L, (long) writeLimit - writeCount);
        return (int) Math.min((long) length, remaining);
    }

    /**
     * Accounts for characters that were forwarded to the decorated handler
     * and signals the write limit if the event had to be truncated.
     *
     * @param requested number of characters the SAX event asked to write
     * @param written number of characters actually forwarded
     * @throws SAXException tagged write-limit exception if fewer characters
     *         were forwarded than requested
     */
    private void recordWrite(int requested, int written) throws SAXException {
        if (writeLimit == -1) {
            return;
        }
        writeCount += written;
        if (written < requested) {
            throw new WriteLimitReachedException(
                    "Your document contained more than " + writeLimit
                    + " characters, and so your requested limit has been"
                    + " reached. To receive the full text of the document,"
                    + " increase your limit. (Text up to the limit is"
                    + " however available).", tag);
        }
    }

    /**
     * The exception used as a signal when the write limit has been reached.
     */
    private static class WriteLimitReachedException extends SAXException {

        /** Serial version UID */
        private static final long serialVersionUID = -1850581945459429943L;

        /** Serializable tag of the handler that caused this exception */
        private final Serializable tag;

        public WriteLimitReachedException(String message, Serializable tag) {
            super(message);
            this.tag = tag;
        }
    }
}