All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.tika.sax.ToXMLContentHandler Maven / Gradle / Ivy

Go to download

This is the core Apache Tika™ toolkit library from which all other modules inherit functionality. It also includes the core facades for the Tika API.

There is a newer version: 1.0.18
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.sax;

import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;

/**
 * SAX event handler that serializes the XML document to a character stream.
 * The incoming SAX events are expected to be well-formed (properly nested,
 * etc.) and to explicitly include namespace declaration attributes and
 * corresponding namespace prefixes in element and attribute names.
 *
 * @since Apache Tika 0.10
 */
public class ToXMLContentHandler extends ToTextContentHandler {

    private static class ElementInfo {

        private final ElementInfo parent;

        private final Map namespaces;

        public ElementInfo(ElementInfo parent, Map namespaces) {
            this.parent = parent;
            if (namespaces.isEmpty()) {
                this.namespaces = Collections.emptyMap();
            } else {
                this.namespaces = new HashMap(namespaces);
            }
        }

        public String getPrefix(String uri) throws SAXException {
            String prefix = namespaces.get(uri);
            if (prefix != null) {
                return prefix;
            } else if (parent != null) {
                return parent.getPrefix(uri);
            } else if (uri == null || uri.length() == 0) {
                return "";
            } else {
                throw new SAXException("Namespace " + uri + " not declared");
            }
        }

        public String getQName(String uri, String localName)
                throws SAXException {
            String prefix = getPrefix(uri);
            if (prefix.length() > 0) {
                return prefix + ":" + localName;
            } else {
                return localName;
            }
        }

    }

    private final String encoding;

    protected boolean inStartElement = false;

    protected final Map namespaces =
        new HashMap();

    private ElementInfo currentElement;

    /**
     * Creates an XML serializer that writes to the given byte stream
     * using the given character encoding.
     *
     * @param stream output stream
     * @param encoding output encoding
     * @throws UnsupportedEncodingException if the encoding is unsupported
     */
    public ToXMLContentHandler(OutputStream stream, String encoding)
            throws UnsupportedEncodingException {
        super(stream, encoding);
        this.encoding = encoding;
    }

    public ToXMLContentHandler(String encoding) {
        super();
        this.encoding = encoding;
    }

    public ToXMLContentHandler() {
        super();
        this.encoding = null;
    }

    /**
     * Writes the XML prefix.
     */
    @Override
    public void startDocument() throws SAXException {
        if (encoding != null) {
            write("\n");
        }

        currentElement = null;
        namespaces.clear();
    }

    @Override
    public void startPrefixMapping(String prefix, String uri)
            throws SAXException {
        try {
            if (currentElement != null
                    && prefix.equals(currentElement.getPrefix(uri))) {
                return;
            }
        } catch (SAXException ignore) {
        }
        namespaces.put(uri, prefix);
    }

    @Override
    public void startElement(
            String uri, String localName, String qName, Attributes atts)
            throws SAXException {
        lazyCloseStartElement();

        currentElement = new ElementInfo(currentElement, namespaces);

        write('<');
        write(currentElement.getQName(uri, localName));

        for (int i = 0; i < atts.getLength(); i++) {
            write(' ');
            write(currentElement.getQName(atts.getURI(i), atts.getLocalName(i)));
            write('=');
            write('"');
            char[] ch = atts.getValue(i).toCharArray();
            writeEscaped(ch, 0, ch.length, true);
            write('"');
        }

        for (Map.Entry entry : namespaces.entrySet()) {
            write(' ');
            write("xmlns");
            String prefix = entry.getValue();
            if (prefix.length() > 0) {
                write(':');
                write(prefix);
            }
            write('=');
            write('"');
            char[] ch = entry.getKey().toCharArray();
            writeEscaped(ch, 0, ch.length, true);
            write('"');
        }
        namespaces.clear();

        inStartElement = true;
    }

    @Override
    public void endElement(String uri, String localName, String qName)
            throws SAXException {
        if (inStartElement) {
            write(" />");
            inStartElement = false;
        } else {
            write("');
        }

        namespaces.clear();

        // Reset the position in the tree, to avoid endless stack overflow
        // chains (see TIKA-1070)
        currentElement = currentElement.parent;
    }

    @Override
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        lazyCloseStartElement();
        writeEscaped(ch, start, start + length, false);
    }

    private void lazyCloseStartElement() throws SAXException {
        if (inStartElement) {
            write('>');
            inStartElement = false;
        }
    }

    /**
     * Writes the given character as-is.
     *
     * @param ch character to be written
     * @throws SAXException if the character could not be written
     */
    protected void write(char ch) throws SAXException {
        super.characters(new char[] { ch }, 0, 1);
    }

    /**
     * Writes the given string of character as-is.
     *
     * @param string string of character to be written
     * @throws SAXException if the character string could not be written
     */
    protected void write(String string) throws SAXException {
        super.characters(string.toCharArray(), 0, string.length());
    }

    /**
     * Writes the given characters as-is followed by the given entity.
     *
     * @param ch character array
     * @param from start position in the array
     * @param to end position in the array
     * @param entity entity code
     * @return next position in the array,
     *         after the characters plus one entity
     * @throws SAXException if the characters could not be written
     */
    private int writeCharsAndEntity(char[] ch, int from, int to, String entity)
            throws SAXException {
        super.characters(ch, from, to - from);
        write('&');
        write(entity);
        write(';');
        return to + 1;
    }

    /**
     * Writes the given characters with XML meta characters escaped.
     *
     * @param ch character array
     * @param from start position in the array
     * @param to end position in the array
     * @param attribute whether the characters should be escaped as
     *                  an attribute value or normal character content
     * @throws SAXException if the characters could not be written
     */
    private void writeEscaped(char[] ch, int from, int to, boolean attribute)
            throws SAXException {
        int pos = from;
        while (pos < to) {
            if (ch[pos] == '<') {
                from = pos = writeCharsAndEntity(ch, from, pos, "lt");
            } else if (ch[pos] == '>') {
                from = pos = writeCharsAndEntity(ch, from, pos, "gt");
            } else if (ch[pos] == '&') {
                from = pos = writeCharsAndEntity(ch, from, pos, "amp");
            } else if (attribute && ch[pos] == '"') {
                from = pos = writeCharsAndEntity(ch, from, pos, "quot");
            } else {
                pos++;
            }
        }
        super.characters(ch, from, to - from);
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy