All downloads are free. Search and download functionality uses the official Maven repository.

net.cnri.simplexml.XParser Maven / Gradle / Ivy

Go to download

CNRI Handle.net Software (Permission granted for redistribution by Giridhar Manepalli at CNRI)

There is a newer version: 9.2.0.v20190814
Show newest version
/**********************************************************************\
 © COPYRIGHT 2019 Corporation for National Research Initiatives (CNRI);
                        All rights reserved.

        The HANDLE.NET software is made available subject to the
      Handle.Net Public License Agreement, which may be obtained at
         http://hdl.handle.net/20.1000/112 or hdl:20.1000/112
\**********************************************************************/

package net.cnri.simplexml;

import java.io.*;
import java.util.*;

/**
 * A small, lenient parser for XML/SGML-like tagged text that builds a tree of {@link XTag}s.
 *
 * <p>Parsing is done with an explicit stack rather than recursion: every start tag is pushed
 * as it is read, and when a closing tag ({@code </name>}) arrives, everything read since the
 * matching start tag is attached to that start tag as sub-tags.
 *
 * <p>Raw collection types are kept for the attribute map because it is handed to
 * {@code XTag.setAttributes}, whose declared parameter type is not visible in this file.
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
public class XParser {

    /** Leading characters of a tag name that mark the tag as a comment. */
    private static final String COMMENT_OPEN = "!--";

    /** When true, each tag read by the parse loop is printed to {@code System.err}. */
    private static boolean debug = false;
    /** When true, one trailing line terminator is removed from each tag value. */
    private final boolean stripNewLine = true;
    /** When true, a closing tag without a matching start tag is an error instead of being skipped. */
    private final boolean strictMode = false;

    public XParser() {
    }

    /** Reusable scratch record describing the tag most recently read by {@code readTagToken}. */
    private static final class ParseTag {
        final StringBuilder name = new StringBuilder();
        final HashMap attributes = new HashMap();
        boolean isComment;
        boolean isEndTag; // a '/' was seen after the name, e.g. <name/>
        boolean isAggregateEnd; // a '/' was seen before the name, e.g. </name>

        ParseTag() {
            resetValues();
        }

        /** Clears the name, the attributes and all flags so the instance can be reused. */
        void resetValues() {
            name.setLength(0);
            attributes.clear();
            isComment = false;
            isEndTag = false;
            isAggregateEnd = false;
        }

        @Override
        public String toString() {
            return name.toString();
        }
    }

    /**
     * Reads the attribute section of a tag, up to and including the closing {@code '>'},
     * storing each attribute in {@code tag.attributes}. An attribute that has no value is
     * stored with the literal string {@code "NULL"}. A {@code '/'} marks the tag as an end tag.
     *
     * @param reader the input, positioned just after the whitespace that followed the tag name
     * @param tag the tag whose attributes and end-tag flag are updated
     * @throws IOException if the input ends before {@code '>'} is read
     */
    private void parseTagAttributes(PushbackReader reader, ParseTag tag) throws IOException {
        HashMap attributes = tag.attributes;
        String attName = null;
        String attVal = null;
        while (true) {
            int ci = reader.read();
            if (ci < 0) throw new IOException("Unexpected end of input reading attributes");
            char ch = (char) ci;

            switch (ch) {
            case '>':
                storeAttribute(attributes, attName, attVal);
                return;
            case '=':
                // a value with no preceding name is stored under the empty name
                if (attName == null) attName = "";
                attVal = readToken(reader);
                break;
            case '/':
                storeAttribute(attributes, attName, attVal);
                attName = null;
                attVal = null;
                tag.isEndTag = true;
                break;
            case '\t':
            case '\r':
            case '\n':
            case ' ':
                break;
            case '"':
            case '\'':
                // flush the pending attribute, then read a quoted attribute name
                storeAttribute(attributes, attName, attVal);
                attVal = null;
                attName = readRestOfString(ch, reader);
                break;
            default:
                // flush the pending attribute, then read a bare attribute name
                storeAttribute(attributes, attName, attVal);
                attVal = null;
                reader.unread(ch);
                attName = readToken(reader);
            }
        }
    }

    /** Stores a pending attribute, if there is one, using {@code "NULL"} for a missing value. */
    private static void storeAttribute(HashMap attributes, String attName, String attVal) {
        if (attName != null) {
            attributes.put(attName, (attVal == null) ? "NULL" : attVal);
        }
    }

    /**
     * Reads the next token, skipping leading whitespace. A token is a quoted string, one of
     * the single characters {@code =}, {@code >} or {@code <}, or a run of other characters
     * ended by whitespace (which is consumed) or by a quote or one of those single characters
     * (which is pushed back for the next read).
     *
     * @return the quoted string or bare run passed through {@code XUtil.decodeString}, or the
     *         single delimiter character as a string
     * @throws IOException if the input ends before the token is complete
     */
    private String readToken(PushbackReader reader) throws IOException {
        StringBuilder sb = new StringBuilder();
        while (true) {
            int ci = reader.read();
            if (ci < 0) throw new IOException("Unexpected end of input reading token: " + sb);
            char ch = (char) ci;

            if (Character.isWhitespace(ch)) {
                if (sb.length() == 0) {
                    continue; // still skipping leading whitespace
                }
                return XUtil.decodeString(sb.toString());
            }

            boolean isQuote = ch == '"' || ch == '\'';
            boolean isDelimiter = ch == '=' || ch == '>' || ch == '<';
            if (!isQuote && !isDelimiter) {
                sb.append(ch);
                continue;
            }

            if (sb.length() > 0) {
                // the character ends the current token; leave it for the next read
                reader.unread(ch);
                return XUtil.decodeString(sb.toString());
            }
            // the character starts the token: a quoted string, or a one-character token
            return isQuote ? readRestOfString(ch, reader) : String.valueOf(ch);
        }
    }

    /**
     * Reads characters up to the next occurrence of {@code strBeginChar}, which is consumed
     * but not included in the result.
     *
     * @param strBeginChar the quote character that opened the string
     * @return the characters read, passed through {@code XUtil.decodeString}
     * @throws IOException if the input ends before the closing quote
     */
    private String readRestOfString(char strBeginChar, PushbackReader reader) throws IOException {
        StringBuilder sb = new StringBuilder();
        while (true) {
            int ci = reader.read();
            if (ci < 0) throw new IOException("Unexpected end of input reading string: " + sb);
            char ch = (char) ci;
            if (ch == strBeginChar) return XUtil.decodeString(sb.toString());
            sb.append(ch);
        }
    }

    /**
     * Skips input up to and including the next {@code '<'}, then reads that tag into
     * {@code tag}, which is reset first. A tag whose name begins with {@code !--} is treated
     * as a comment and consumed through its closing {@code -->}, whether or not whitespace
     * follows the opening marker.
     *
     * @return {@code false} if the input ended before a {@code '<'} was found, otherwise {@code true}
     * @throws IOException if the input ends inside a tag or comment
     */
    private boolean readTagToken(PushbackReader reader, ParseTag tag) throws IOException {
        tag.resetValues();

        int ci;
        // read up to the first start-tag character..
        while ((ci = reader.read()) != '<') {
            if (ci < 0) return false;
        }
        while ((ci = reader.read()) != '>') {
            if (ci < 0) throw new IOException("Unexpected end of input reading tag");
            char ch = (char) ci;
            if (ch == '/') {
                if (tag.name.length() == 0) tag.isAggregateEnd = true;
                else tag.isEndTag = true;
            } else if (Character.isWhitespace(ch)) {
                if (tag.name.length() == 0) {
                    continue; // whitespace before tag name
                }
                parseTagAttributes(reader, tag);
                return true;
            } else {
                tag.name.append(ch);
                if (COMMENT_OPEN.contentEquals(tag.name)) {
                    // comment tag... ignore the rest of it
                    tag.name.setLength(0);
                    tag.name.append("*********");
                    tag.isComment = true;
                    skipComment(reader);
                    return true;
                }
            }
        }
        return true;
    }

    /** Consumes input through the next {@code -->}, keeping only a count of trailing dashes. */
    private static void skipComment(PushbackReader reader) throws IOException {
        int trailingDashes = 0;
        while (true) {
            int ci = reader.read();
            if (ci < 0) throw new IOException("Unexpected end of input reading comment tag");
            if (ci == '-') {
                trailingDashes++;
            } else if (ci == '>' && trailingDashes >= 2) {
                return;
            } else {
                trailingDashes = 0;
            }
        }
    }

    /**
     * Reads the text between the current position and the next {@code '<'}, which is pushed
     * back so the following tag can be read.
     *
     * @return the text with one trailing {@code "\r\n"} or {@code "\n"} removed when
     *         {@code stripNewLine} is set, passed through {@code XUtil.decodeString}
     * @throws IOException if the input ends before a {@code '<'} is found
     */
    private String readTagValue(PushbackReader reader) throws IOException {
        StringBuilder sb = new StringBuilder();
        int ci;
        while ((ci = reader.read()) != '<') {
            if (ci < 0) throw new IOException("Unexpected end of input reading tag value");
            sb.append((char) ci);
        }
        reader.unread('<');
        String s = sb.toString();

        if (stripNewLine) {
            if (s.endsWith("\r\n")) {
                s = s.substring(0, s.length() - 2);
            } else if (s.endsWith("\n")) {
                s = s.substring(0, s.length() - 1);
            }
        }
        return XUtil.decodeString(s);
    }

    /**
     * Parses tags from {@code reader} using an explicit stack.
     *
     * @return the first-level tag as soon as its closing tag has been read; or, if the input
     *         ends first, the first tag read with every later tag still on the stack attached
     *         as a sub-tag; or {@code null} if the input contained no tags
     * @throws java.io.IOException if the input ends inside a tag, comment or tag value
     */
    private XTag parseNonRecursive(PushbackReader reader) throws java.io.IOException {
        Deque<XTag> parseStack = new ArrayDeque<XTag>(); // the stack of parsed tags
        Deque<XTag> aggregateStack = new ArrayDeque<XTag>(); // the temporary stack of sub-tags
        ParseTag tag = new ParseTag();
        while (true) {
            // read the next non-comment tag (readTagToken resets 'tag' on every call)
            do {
                if (!readTagToken(reader, tag)) {
                    // there are no more tags... return what we have
                    return collapseRemaining(parseStack);
                }
            } while (tag.isComment);

            String tagname = tag.name.toString().trim();

            if (debug) System.err.println("read tag: " + tag);

            if (tag.isAggregateEnd) {
                // this is an ending aggregate tag.  Gather up all of the tags
                // that have been read since the beginning of the aggregate, and
                // put them in the beginning tag
                aggregateStack.clear();
                XTag opener = null;
                while (!parseStack.isEmpty()) {
                    XTag candidate = parseStack.pop();
                    if (candidate.getName().equals(tagname) && candidate.getSubTagCount() <= 0) {
                        opener = candidate;
                        break;
                    }
                    aggregateStack.push(candidate);
                }

                if (opener == null) {
                    // if we are in strict mode, throw an exception
                    if (strictMode) throw new IOException("Got end tag " + tagname + " with no matching begin tag");

                    // otherwise, push everything back on the stack and ignore this end tag
                    while (!aggregateStack.isEmpty()) {
                        parseStack.push(aggregateStack.pop());
                    }
                    continue;
                }

                // Put the sub-tags under the beginning aggregate tag
                while (!aggregateStack.isEmpty()) {
                    opener.addSubTag(aggregateStack.pop());
                }

                // set the aggregate tags' values to null so that they are not
                // identified as non-aggregate tags.  This only happens if the number
                // of sub-tags are greater than zero in order to avoid rejecting the
                // value of all XML (instead of SGML) style tags.
                if (opener.getSubTagCount() > 0) opener.setValue(null);

                // if the beginning tag was a first-level tag, then we
                // are done parsing and can return that tag as the result.
                if (parseStack.isEmpty()) {
                    return opener;
                }

                // and push the beginning aggregate tag back onto the stack
                parseStack.push(opener);
            } else {
                // an end tag (<name/>) carries no value; anything else is a normal value tag
                // or a beginning aggregate tag (we won't know which until we read the
                // corresponding end tag), so its text value is read now
                XTag xtag = tag.isEndTag ? new XTag(tagname) : new XTag(tagname, readTagValue(reader));
                // 'tag' and its attribute map are reused and cleared for the next tag, so the
                // XTag gets its own copy. NOTE(review): XTag.setAttributes is not visible here;
                // the copy is defensive in case it keeps the reference it is given.
                xtag.setAttributes(new HashMap(tag.attributes));
                parseStack.push(xtag);
            }
        }

    }

    /**
     * Empties {@code parseStack} at end of input: the first tag read becomes the result and
     * every later tag is added to it as a sub-tag, in the order the tags were read.
     *
     * @return the first tag read, or {@code null} if the stack is empty
     */
    private static XTag collapseRemaining(Deque<XTag> parseStack) {
        if (parseStack.isEmpty()) return null;

        // tags are pushed at the head, so the tail holds the oldest tag
        XTag first = parseStack.removeLast();
        while (!parseStack.isEmpty()) {
            first.addSubTag(parseStack.removeLast());
        }
        return first;
    }

    /**
     * Parses tagged text from {@code reader} into a tree of tags.
     *
     * @param reader the character source to parse
     * @param strict This parameter is ignored.
     * @return the parsed top-level tag, or {@code null} if the input contained no tags
     */
    public XTag parse(Reader reader, boolean strict) throws Exception {
        return parseNonRecursive(new PushbackReader(reader));
    }

    /** Parses standard input with debug output enabled and prints the result to {@code System.err}. */
    public static void main(String argv[]) throws Exception {
        debug = true;
        XParser parser = new XParser();
        XTag xTag = parser.parse(new InputStreamReader(System.in, "UTF8"), false);
        System.err.println("parsed: " + xTag);
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy