All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.vysper.xml.sax.impl.XMLParser Maven / Gradle / Ivy

The newest version!
/*
 *  Licensed to the Apache Software Foundation (ASF) under one
 *  or more contributor license agreements.  See the NOTICE file
 *  distributed with this work for additional information
 *  regarding copyright ownership.  The ASF licenses this file
 *  to you under the Apache License, Version 2.0 (the
 *  "License"); you may not use this file except in compliance
 *  with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing,
 *  software distributed under the License is distributed on an
 *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 *  KIND, either express or implied.  See the License for the
 *  specific language governing permissions and limitations
 *  under the License.
 *
 */
package org.apache.vysper.xml.sax.impl;

import java.nio.charset.CharsetDecoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.mina.core.buffer.IoBuffer;
import org.apache.vysper.xml.sax.impl.XMLTokenizer.TokenListener;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.ErrorHandler;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

/**
 *
 * @author The Apache MINA Project ([email protected])
 */
public class XMLParser implements TokenListener {

    private Logger log = LoggerFactory.getLogger(XMLParser.class);

    private static final String nameStartChar = ":A-Z_a-z\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u02FF\\u0370-\\u037D\\u037F-\\u1FFF\\u200C-\\u200D\\u2070-\\u218F\\u2C00-\\u2FEF\\u3001-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFFD";

    private static final String nameChar = nameStartChar + "-\\.0-9\\u00B7\\u0300-\\u036F\\u203F-\\u2040";

    public static final Pattern NAME_PATTERN = Pattern.compile("^[" + nameStartChar + "][" + nameChar + "]*$");

    public static final Pattern NAME_PREFIX_PATTERN = Pattern.compile("^xml", Pattern.CASE_INSENSITIVE);

    public static final Pattern UNESCAPE_UNICODE_PATTERN = Pattern.compile("\\&\\#(x?)([0-9a-fA-F]++);");

    private ContentHandler contentHandler;

    private ErrorHandler errorHandler;

    private ParserNamespaceResolver nsResolver = new ParserNamespaceResolver();

    private static enum State {
        START, IN_TAG, IN_DECLARATION, IN_END_TAG, AFTER_START_NAME, AFTER_END_NAME, IN_EMPTY_TAG, AFTER_ATTRIBUTE_NAME, AFTER_ATTRIBUTE_EQUALS, AFTER_ATTRIBUTE_FIRST_QUOTE, AFTER_ATTRIBUTE_VALUE, AFTER_COMMENT_BANG, AFTER_COMMENT_DASH1, AFTER_COMMENT_DASH2, AFTER_COMMENT, AFTER_COMMENT_CLOSING_DASH1, AFTER_COMMENT_CLOSING_DASH2, AFTER_COMMENT_ENDING_DASH1, AFTER_COMMENT_ENDING_DASH2, CLOSED
    }

    private XMLTokenizer tokenizer;

    private State state = State.START;

    private String qname;

    // qname/value map
    private Map attributes;

    private String attributeName;

    // element names as {uri}qname
    private Stack elements = new Stack();

    private boolean sentStartDocument = false;

    // features
    private boolean reportNsAttributes = false;

    private boolean commentsAllowed = true;

    private boolean restartsAllowed = false;

    private String restartQname = null;

    public XMLParser(ContentHandler contentHandler, ErrorHandler errorHandler, Map features,
            Map properties) {
        this.contentHandler = contentHandler;
        this.errorHandler = errorHandler;

        commentsAllowed = feature(features, DefaultNonBlockingXMLReader.FEATURE_COMMENTS_ALLOWED, true);
        reportNsAttributes = feature(features, DefaultNonBlockingXMLReader.FEATURE_NAMESPACE_PREFIXES, false);
        reportNsAttributes = feature(features, DefaultNonBlockingXMLReader.FEATURE_NAMESPACE_PREFIXES, false);
        restartsAllowed = feature(features, DefaultNonBlockingXMLReader.FEATURE_RESTART_ALLOWED, false);
        restartQname = (String) properties.get(DefaultNonBlockingXMLReader.PROPERTY_RESTART_QNAME);

        this.tokenizer = new XMLTokenizer(this);
    }

    private boolean feature(Map features, String name, boolean defaultValue) {
        if (features.containsKey(name)) {
            return features.get(name);
        } else {
            return defaultValue;
        }
    }

    public void parse(IoBuffer byteBuffer, CharsetDecoder charsetDecoder) throws SAXException {
        if (state == State.CLOSED)
            throw new SAXException("Parser is closed");

        try {
            tokenizer.parse(byteBuffer, charsetDecoder);
        } catch (RuntimeException e) {
            e.printStackTrace();
            fatalError(e.getMessage());
        }
    }

    public void token(char c, String token) throws SAXException {
        if (log.isTraceEnabled()) {
            String s = (token == null) ? Character.toString(c) : token;
            log.trace("Parser got token {} in state {}", s, state);
        }

        switch (state) {
        case START:
            if (c == '<') {
                state = State.IN_TAG;
                attributes = new HashMap();
            } else {
                characters(token);
            }
            break;
        case IN_TAG:
            // token must be element name or / for a end tag
            if (c == '/') {
                state = State.IN_END_TAG;
            } else if (c == '?') {
                state = State.IN_DECLARATION;
                xmlDeclaration();
            } else if (c == '!') {
                if (commentsAllowed) {
                    state = State.AFTER_COMMENT_BANG;
                } else {
                    fatalError("Comments not allowed");
                    return;
                }
            } else {
                if (token != null && isValidName(token)) {
                    qname = token;
                    state = State.AFTER_START_NAME;
                } else {
                    if(token != null) {
                        fatalError("Invalid element name: " + qname);
                    } else {
                        fatalError("Not well-formed start tag");
                    }
                    return;
                }
            }
            break;
        case IN_END_TAG:
            // token must be element name
            qname = token;
            state = State.AFTER_END_NAME;
            break;
        case AFTER_START_NAME:
            // token must be attribute name or > or /
            if (c == '>') {
                // end of start or end tag
                if (state == State.AFTER_START_NAME) {
                    startElement();
                    state = State.START;
                    attributes = null;
                } else if (state == State.AFTER_END_NAME) {
                    state = State.START;
                    endElement();
                }
            } else if (c == '/') {
                state = State.IN_EMPTY_TAG;
            } else {
                // must be attribute name
                attributeName = token;
                state = State.AFTER_ATTRIBUTE_NAME;
            }
            break;
        case AFTER_ATTRIBUTE_NAME:
            // token must be =
            if (c == '=') {
                state = State.AFTER_ATTRIBUTE_EQUALS;
            } else {
                fatalError("Not wellformed");
            }
            break;
        case AFTER_ATTRIBUTE_EQUALS:
            // token must be " or '
            if (c == '"' || c == '\'') {
                state = State.AFTER_ATTRIBUTE_FIRST_QUOTE;
            }
            break;
        case AFTER_ATTRIBUTE_FIRST_QUOTE:
            // token must be attribute value
            attributes.put(attributeName, unescape(token));
            state = State.AFTER_ATTRIBUTE_VALUE;
            break;
        case AFTER_ATTRIBUTE_VALUE:
            // token must be " or '
            if (c == '"' || c == '\'') {
                state = State.AFTER_START_NAME;
            } else {
                fatalError("Not wellformed");
            }
            break;
        case AFTER_END_NAME:
            // token must be >
            if (c == '>') {
                state = State.START;
                endElement();
            }
            break;
        case IN_EMPTY_TAG:
            // token must be >
            if (c == '>') {
                startElement();
                attributes = null;

                if (state != State.CLOSED) {
                    state = State.START;
                    endElement();
                }
            }
            break;
        case AFTER_COMMENT_BANG:
            // token must be -
            if (c == '-') {
                state = State.AFTER_COMMENT_DASH1;
            } else {
                fatalError("Comment not wellformed");
                return;
            }
            break;
        case AFTER_COMMENT_DASH1:
            // token must be -
            if (c == '-') {
                state = State.AFTER_COMMENT_DASH2;
            } else {
                fatalError("Comment not wellformed");
                return;
            }
            break;
        case AFTER_COMMENT_DASH2:
            // we should now get the comment content, ignore
            if (c == '-') {
                state = State.AFTER_COMMENT_CLOSING_DASH1;
            } else {
                state = State.AFTER_COMMENT;
            }
            break;
        case AFTER_COMMENT:
            // token must be - or some text
            if (c == '-') {
                state = State.AFTER_COMMENT_CLOSING_DASH1;
            } else if (c == '>') {
                fatalError("Comment not wellformed");
                return;
            } else {
                // ignore
            }
            break;
        case AFTER_COMMENT_CLOSING_DASH1:
            // token must be -
            if (c == '-') {
                state = State.AFTER_COMMENT_CLOSING_DASH2;
            } else {
                fatalError("Comment not wellformed");
                return;
            }
            break;
        case AFTER_COMMENT_CLOSING_DASH2:
            // token must be >
            if (c == '>') {
                state = State.START;
            } else {
                fatalError("Comment not wellformed");
                return;
            }
            break;
        case IN_DECLARATION:
            // wait for >
            if (c == '>') {
                state = State.START;
            }
            break;
        }
    }

    private void characters(String s) throws SAXException {
        // text only allowed in element
        if (!elements.isEmpty()) {
            String unescaped = unescape(s);
            log.trace("Parser emitting characters \"{}\"", unescaped);
            contentHandler.characters(unescaped.toCharArray(), 0, unescaped.length());
        } else if (s.trim().length() > 0) {
            // must start document, even that document is not wellformed
            startDocument();
            fatalError("Text only allowed in element");
        } else {
            // ignorable whitespace
            startDocument();
        }
    }

    private boolean isValidName(String name) {
        // element names must only contain valid characters
        // element names must not begin with "xml" in any casing
        return NAME_PATTERN.matcher(name).find() && !NAME_PREFIX_PATTERN.matcher(name).find();
    }

    private boolean needsRestart() {
        return elements.size() > 0;
    }

    private void restart() {
        log.trace("Restarting XML stream");

        elements.clear();
        nsResolver = new ParserNamespaceResolver();
        sentStartDocument = false;
        tokenizer.restart();
    }

    private void xmlDeclaration() {
        // we got an XML declaration, should we restart stream?
        // TODO could also be a PI, if we want to support PIs, this code needs further attention
        if (needsRestart()) {
            if (restartsAllowed) {
                // ok, restart
                restart();
            } else {
                // restarts not allowed, fail 
            }
        }
    }

    private void startDocument() throws SAXException {
        if (!sentStartDocument) {
            contentHandler.startDocument();
            sentStartDocument = true;
        }
    }

    private void startElement() throws SAXException {
        log.trace("StartElement {}", qname);

        // check if this should restart stream
        if (restartsAllowed && needsRestart() && qname.equals(restartQname)) {
            restart();
        }

        if (elements.isEmpty()) {
            startDocument();
        }

        // find all namespace declarations so we can populate the NS resolver
        Map nsDeclarations = new HashMap();
        for (Entry attribute : attributes.entrySet()) {
            if (attribute.getKey().equals("xmlns")) {
                // is namespace attribute
                nsDeclarations.put("", attribute.getValue());
            } else if (attribute.getKey().startsWith("xmlns:")) {
                nsDeclarations.put(attribute.getKey().substring(6), attribute.getValue());
            }
        }
        nsResolver.push(nsDeclarations);

        // find all non-namespace attributes
        List nonNsAttributes = new ArrayList();
        for (Entry attribute : attributes.entrySet()) {
            String attQname = attribute.getKey();

            // only report NS declaration attributes if the feature is set to
            if (reportNsAttributes) {
                nonNsAttributes.add(new Attribute(attQname, null, attQname, attribute.getValue()));
            } else if (!attQname.equals("xmlns") && !attQname.startsWith("xmlns:")) {
                String attLocalName = extractLocalName(attQname);
                String attPrefix = extractNsPrefix(attQname);
                String attUri;
                if (attPrefix.length() > 0) {
                    attUri = nsResolver.resolveUri(attPrefix);
                    if (attUri == null) {
                        if (attPrefix.length() > 0) {
                            fatalError("Undeclared namespace prefix: " + attPrefix);
                            return;
                        } else {
                            attUri = "";
                        }
                    }
                } else {
                    // by default, attributes are in the empty namespace
                    attUri = "";
                }
                nonNsAttributes.add(new Attribute(attLocalName, attUri, attQname, attribute.getValue()));
            }
        }

        String prefix = extractNsPrefix(qname);
        String uri = nsResolver.resolveUri(prefix);
        if (uri == null) {
            if (prefix.length() > 0) {
                fatalError("Undeclared namespace prefix: " + prefix);
                return;
            } else {
                uri = "";
            }
        }

        String localName = extractLocalName(qname);

        elements.add(fullyQualifiedName(uri, qname));

        contentHandler.startElement(uri, localName, qname, new DefaultAttributes(nonNsAttributes));
    }

    private String extractLocalName(String qname) {
        int index = qname.indexOf(':');

        if (index > -1) {
            return qname.substring(index + 1);
        } else {
            return qname;
        }
    }

    private String extractNsPrefix(String qname) {
        int index = qname.indexOf(':');

        if (index > -1) {
            return qname.substring(0, index);
        } else {
            return "";
        }
    }

    private String fullyQualifiedName(String uri, String qname) {
        return "{" + uri + "}" + qname;
    }

    private void endElement() throws SAXException {
        log.trace("EndElement {}", qname);

        if (state == State.CLOSED)
            return;

        String prefix = extractNsPrefix(qname);
        String uri = nsResolver.resolveUri(prefix);
        if (uri == null) {
            if (prefix.length() > 0) {
                fatalError("Undeclared namespace prefix: " + prefix);
                return;
            } else {
                uri = "";
            }
        }

        nsResolver.pop();

        String localName = extractLocalName(qname);

        String fqn = elements.pop();
        if (fqn.equals(fullyQualifiedName(uri, qname))) {
            contentHandler.endElement(uri, localName, qname);

            if (elements.isEmpty()) {
                contentHandler.endDocument();
                state = State.CLOSED;
            }
        } else {
            fatalError("Invalid element name " + qname);
        }
    }

    private void fatalError(String message) throws SAXException {
        log.debug("Fatal error: {}", message);
        state = State.CLOSED;
        tokenizer.close();

        // make sure we send a start document event
        startDocument();

        errorHandler.fatalError(new SAXParseException(message, null));
    }

    private String unescape(String s) {
        s = s.replace("&", "&").replace(">", ">").replace("<", "<").replace("'", "'").replace(""",
                "\"");

        StringBuffer sb = new StringBuffer();

        Matcher matcher = UNESCAPE_UNICODE_PATTERN.matcher(s);
        int end = 0;
        while (matcher.find()) {
            boolean isHex = matcher.group(1).equals("x");
            String unicodeCode = matcher.group(2);

            int base = isHex ? 16 : 10;
            int i = Integer.valueOf(unicodeCode, base).intValue();
            char[] c = Character.toChars(i);
            sb.append(s.substring(end, matcher.start()));
            end = matcher.end();
            sb.append(c);
        }
        sb.append(s.substring(end, s.length()));

        return sb.toString();
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy