All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.tika.sax.SafeContentHandler Maven / Gradle / Ivy

Go to download

This is the core Apache Tika™ toolkit library from which all other modules inherit functionality. It also includes the core facades for the Tika API.

There is a newer version: 1.0.18
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.sax;

/*
import java.util.ArrayList;
import java.util.List;
*/

import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

/**
 * Content handler decorator that makes sure that the character events
 * ({@link #characters(char[], int, int)} or
 * {@link #ignorableWhitespace(char[], int, int)}) passed to the decorated
 * content handler contain only valid XML characters. All invalid characters
 * are replaced with spaces.
 * 

* The XML standard defines the following Unicode character ranges as * valid XML characters: *

 * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
 * 
*

* Note that currently this class only detects those invalid characters whose * UTF-16 representation fits a single char. Also, this class does not ensure * that the UTF-16 encoding of incoming characters is correct. */ public class SafeContentHandler extends ContentHandlerDecorator { /** * Replacement for invalid characters. */ private static final char[] REPLACEMENT = new char[] { '\ufffd' }; /** * Internal interface that allows both character and * ignorable whitespace content to be filtered the same way. */ protected interface Output { void write(char[] ch, int start, int length) throws SAXException; } private static class StringOutput implements Output { private final StringBuilder builder = new StringBuilder(); public void write(char[] ch, int start, int length) { builder.append(ch, start, length); } public String toString() { return builder.toString(); } } /** * Output through the {@link ContentHandler#characters(char[], int, int)} * method of the decorated content handler. */ private final Output charactersOutput = new Output() { public void write(char[] ch, int start, int length) throws SAXException { SafeContentHandler.super.characters(ch, start, length); } }; /** * Output through the * {@link ContentHandler#ignorableWhitespace(char[], int, int)} * method of the decorated content handler. */ private final Output ignorableWhitespaceOutput = new Output() { public void write(char[] ch, int start, int length) throws SAXException { SafeContentHandler.super.ignorableWhitespace(ch, start, length); } }; public SafeContentHandler(ContentHandler handler) { super(handler); } /** * Filters and outputs the contents of the given input buffer. Any * invalid characters in the input buffer area handled by sending a * replacement (a space character) to the given output. Any sequences * of valid characters are passed as-is to the given output. 
* * @param ch input buffer * @param start start offset within the buffer * @param length number of characters to read from the buffer * @param output output channel * @throws SAXException if the filtered characters could not be written out */ private void filter(char[] ch, int start, int length, Output output) throws SAXException { int end = start + length; int i = start; while (i < end) { int c = Character.codePointAt(ch, i, end); int j = i + Character.charCount(c); if (isInvalid(c)) { // Output any preceding valid characters if (i > start) { output.write(ch, start, i - start); } // Output the replacement for this invalid character writeReplacement(output); // Continue with the rest of the array start = j; } i = j; } // Output any remaining valid characters output.write(ch, start, end - start); } /** * Checks if the given string contains any invalid XML characters. * * @param value string to be checked * @return true if the string contains invalid XML characters, * false otherwise */ private boolean isInvalid(String value) { char[] ch = value.toCharArray(); int i = 0; while (i < ch.length) { int c = Character.codePointAt(ch, i); if (isInvalid(c)) { return true; } i = i + Character.charCount(c); } return false; } /** * Checks whether the given Unicode character is an invalid XML character * and should be replaced for output. Subclasses can override this method * to use an alternative definition of which characters should be replaced * in the XML output. The default definition from the XML specification is: *

     * Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
     * 
* * @param ch character * @return true if the character should be replaced, * false otherwise */ protected boolean isInvalid(int ch) { if (ch < 0x20) { return ch != 0x09 && ch != 0x0A && ch != 0x0D; } else if (ch < 0xE000) { return ch > 0xD7FF; } else if (ch < 0x10000) { return ch > 0xFFFD; } else { return ch > 0x10FFFF; } } /** * Outputs the replacement for an invalid character. Subclasses can * override this method to use a custom replacement. * * @param output where the replacement is written to * @throws SAXException if the replacement could not be written */ protected void writeReplacement(Output output) throws SAXException { output.write(REPLACEMENT, 0, REPLACEMENT.length); } /* private final List elements = new ArrayList(); // Called only from assert private boolean verifyStartElement(String name) { // TODO: we could strengthen this to do full // XTHML validation, eg you shouldn't start p inside // another p (but ODF parser, at least, seems to // violate this): //if (name.equals("p")) { //assert elements.size() == 0 || !elements.get(elements.size()-1).equals("p"); //} elements.add(name); return true; } // Called only from assert private boolean verifyEndElement(String name) { assert elements.size() > 0: "end tag=" + name + " with no startElement"; final String currentElement = elements.get(elements.size()-1); assert currentElement.equals(name): "mismatched elements open=" + currentElement + " close=" + name; elements.remove(elements.size()-1); return true; } // Called only from assert private boolean verifyEndDocument() { assert elements.size() == 0; return true; } */ //------------------------------------------------------< ContentHandler > @Override public void startElement( String uri, String localName, String name, Attributes atts) throws SAXException { // TODO: enable this, but some parsers currently // trip it //assert verifyStartElement(name); // Look for any invalid characters in attribute values. 
for (int i = 0; i < atts.getLength(); i++) { if (isInvalid(atts.getValue(i))) { // Found an invalid character, so need to filter the attributes AttributesImpl filtered = new AttributesImpl(); for (int j = 0; j < atts.getLength(); j++) { String value = atts.getValue(j); if (j >= i && isInvalid(value)) { // Filter the attribute value when needed Output buffer = new StringOutput(); filter(value.toCharArray(), 0, value.length(), buffer); value = buffer.toString(); } filtered.addAttribute( atts.getURI(j), atts.getLocalName(j), atts.getQName(j), atts.getType(j), value); } atts = filtered; break; } } super.startElement(uri, localName, name, atts); } @Override public void endElement(String uri, String localName, String name) throws SAXException { // TODO: enable this, but some parsers currently // trip it //assert verifyEndElement(name); super.endElement(uri, localName, name); } @Override public void endDocument() throws SAXException { // TODO: enable this, but some parsers currently // trip it //assert verifyEndDocument(); super.endDocument(); } @Override public void characters(char[] ch, int start, int length) throws SAXException { filter(ch, start, length, charactersOutput); } @Override public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { filter(ch, start, length, ignorableWhitespaceOutput); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy