com.tickaroo.tikxml.XmlReader Maven / Gradle / Ivy

Go to download
/*
 * Copyright (C) 2015 Hannes Dorfmann
 * Copyright (C) 2015 Tickaroo, Inc.
 * Copyright (C) 2015 Square, Inc.
 * Copyright (C) 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package com.tickaroo.tikxml;

import java.io.Closeable;
import java.io.EOFException;
import java.io.IOException;
import okio.Buffer;
import okio.BufferedSource;
import okio.ByteString;

/**
 * A class to read and parse an xml stream.
 *
 * @author Hannes Dorfmann
 * @since 1.0
 */
public class XmlReader implements Closeable {

  //private static final ByteString LINEFEED_OR_CARRIAGE_RETURN = ByteString.encodeUtf8("\n\r");

  /**
   * Bytes that terminate an unquoted value such as an element or attribute name. Besides the
   * structural characters {@code >}, {@code /} and {@code =}, every whitespace character that
   * {@link #nextNonWhitespace(boolean)} skips (space, line feed, tab, carriage return) ends a
   * name. Without tab and carriage return a tag like {@code <foo\r\n attr="1">} would yield the
   * name {@code "foo\r"}.
   */
  private static final ByteString UNQUOTED_STRING_TERMINALS
      = ByteString.encodeUtf8(" >/=\n\t\r");

  /** Byte sequence that terminates a CDATA section. */
  private static final ByteString CDATA_CLOSE = ByteString.encodeUtf8("]]>");

  /** Delimiter of a double quoted value. */
  private static final byte DOUBLE_QUOTE = '"';
  /** Delimiter of a single quoted value. */
  private static final byte SINGLE_QUOTE = '\'';
  /** The character that ends an element's text content because a new tag starts. */
  private static final byte OPENING_XML_ELEMENT = '<';

  //
  // Peek states: the possible values of the "peeked" field
  //
  /** Nothing peeked */
  private static final int PEEKED_NONE = 0;
  /** Peeked the begin of an xml element */
  private static final int PEEKED_ELEMENT_BEGIN = 1;
  /** Peeked the closing xml tag which indicates the end of an element */
  private static final int PEEKED_ELEMENT_END = 2;
  /** Peeked plain text content, i.e. we are inside the body of an xml element */
  private static final int PEEKED_ELEMENT_TEXT_CONTENT = 3;
  /** Peeked the end of the stream */
  private static final int PEEKED_EOF = 4;
  /** Peeked an unquoted value which is the name of an xml element */
  private static final int PEEKED_ELEMENT_NAME = 5;
  /** Peeked a double quoted value which is the value of an xml attribute */
  private static final int PEEKED_DOUBLE_QUOTED = 6;
  /** Peeked a single quoted value which is the value of an xml attribute */
  private static final int PEEKED_SINGLE_QUOTED = 7;
  /** Peeked an attribute name (of a xml element) */
  private static final int PEEKED_ATTRIBUTE_NAME = 8;

  /** Peeked a CDATA section (its opening marker has already been skipped) */
  private static final int PEEKED_CDATA = 9;

  /**
   * The peek state computed by the last peek that has not been consumed yet (one of the PEEKED_*
   * constants), or {@link #PEEKED_NONE} if the next token has to be determined first.
   */
  private int peeked = PEEKED_NONE;

  // Path bookkeeping, kept parallel to "stack" and handed to XmlScope.getPath():
  // pathNames holds the element name per nesting level, pathIndices a counter per level that is
  // incremented whenever a scope above that level is popped.
  private String[] pathNames = new String[32];
  private int[] pathIndices = new int[32];

  /*
   * The nesting stack. Using a manual array rather than an ArrayList saves 20%.
   */
  private int[] stack = new int[32];
  private int stackSize = 0;

  // Every reader starts in the "empty document" scope.
  {
    stack[stackSize++] = XmlScope.EMPTY_DOCUMENT;
  }

  /** The input XML. */
  private final BufferedSource source;
  /** The internal buffer of {@link #source}; tokens are inspected and consumed from here. */
  private final Buffer buffer;

  /**
   * Creates a reader that consumes the given source. Use {@link #of(BufferedSource)} to obtain an
   * instance.
   *
   * @param source the xml input, must not be {@code null}
   * @throws NullPointerException if {@code source} is {@code null}
   */
  private XmlReader(BufferedSource source) {
    if (null == source) {
      throw new NullPointerException("source == null");
    }
    // All token inspection works directly on the source's own buffer.
    this.buffer = source.buffer();
    this.source = source;
  }

  /**
   * Factory method: creates a reader that parses the XML-encoded stream provided by
   * {@code source}.
   *
   * @param source the xml input, must not be {@code null}
   * @return a new reader backed by {@code source}
   * @throws NullPointerException if {@code source} is {@code null}
   */
  public static XmlReader of(BufferedSource source) {
    final XmlReader reader = new XmlReader(source);
    return reader;
  }

  /**
   * Reports the type of the next token without consuming it. If nothing has been peeked yet the
   * input is inspected first; afterwards the internal peek state is translated into the public
   * {@link XmlToken}.
   *
   * @return the {@link XmlToken} describing what comes next
   * @throws IOException if inspecting the input fails
   */
  public XmlToken peek() throws IOException {
    final int state = (peeked == PEEKED_NONE) ? doPeek() : peeked;

    final XmlToken token;
    switch (state) {
      case PEEKED_ELEMENT_BEGIN:
        token = XmlToken.ELEMENT_BEGIN;
        break;

      case PEEKED_ELEMENT_NAME:
        token = XmlToken.ELEMENT_NAME;
        break;

      case PEEKED_ELEMENT_END:
        token = XmlToken.ELEMENT_END;
        break;

      case PEEKED_ATTRIBUTE_NAME:
        token = XmlToken.ATTRIBUTE_NAME;
        break;

      // Both quote styles are attribute values for the caller
      case PEEKED_DOUBLE_QUOTED:
      case PEEKED_SINGLE_QUOTED:
        token = XmlToken.ATTRIBUTE_VALUE;
        break;

      // CDATA is reported as ordinary text content
      case PEEKED_ELEMENT_TEXT_CONTENT:
      case PEEKED_CDATA:
        token = XmlToken.ELEMENT_TEXT_CONTENT;
        break;

      case PEEKED_EOF:
        token = XmlToken.END_OF_DOCUMENT;
        break;

      default:
        throw new AssertionError("Unknown XmlToken: Peeked = " + state);
    }
    return token;
  }

  /**
   * Actually performs the peek. This method returns the peeked token and updates the internal
   * variable {@link #peeked}.
   *
   * @return The peeked token
   * @throws IOException
   */
  private int doPeek() throws IOException {

    int peekStack = stack[stackSize - 1];

    if (peekStack == XmlScope.ELEMENT_OPENING) {
      int c = nextNonWhitespace(true);
      if (isLiteral((char) c)) {
        return peeked = PEEKED_ELEMENT_NAME;
      } else {
        throw syntaxError("Expected xml element name (literal expected)");
      }
    } else if (peekStack == XmlScope.ELEMENT_ATTRIBUTE) {
      int c = nextNonWhitespace(true);

      if (isLiteral(c)) {
        return peeked = PEEKED_ATTRIBUTE_NAME;
      }

      switch (c) {
        case '>':
          // remove XmlScope.ELEMENT_ATTRIBUTE from top of the stack
          popStack();

          // set previous stack from XmlScope.ELEMENT_OPENING to XmlScope.ELEMENT_CONTENT
          stack[stackSize - 1] = XmlScope.ELEMENT_CONTENT;
          buffer.readByte(); // consume '>'

          int nextChar = nextNonWhitespace(true);

          if (nextChar != '<') {
            return peeked = PEEKED_ELEMENT_TEXT_CONTENT;
          }

          if (isCDATA()) {
            buffer.skip(9); // skip opening cdata tag
            return peeked = PEEKED_CDATA;
          }
          break;

        case '/':
          // Self closing />

          if (fillBuffer(2) && buffer.getByte(1) == '>') {
            // remove XmlScope.ELEMENT_ATTRIBUTE from top of the stack
            popStack();

            // correct closing xml tag
            buffer.readByte(); // consuming '/'
            buffer.readByte(); // consuming '>'
            return peeked = PEEKED_ELEMENT_END;
          } else {
            throw syntaxError("Expected closing />");
          }

        case '=':
          buffer.readByte(); // consume '='

          // Read next char which should be a quote
          c = nextNonWhitespace(true);

          switch (c) {
            case '"':
              buffer.readByte(); // consume "
              return peeked = PEEKED_DOUBLE_QUOTED;
            case '\'':
              buffer.readByte(); // consume '
              return peeked = PEEKED_SINGLE_QUOTED;

            default:
              throw syntaxError(
                  "Expected double quote (\") or single quote (') while reading xml elements attribute");
          }

        default:
          throw syntaxError("Unexpected character '"
              + ((char) c)
              + "' while trying to read xml elements attribute");
      }
    } else if (peekStack == XmlScope.ELEMENT_CONTENT) {
      int c = nextNonWhitespace(true);

      if (c != '<') {
        return peeked = PEEKED_ELEMENT_TEXT_CONTENT;
      }

      if (isCDATA()) {
        buffer.skip(9); // skip opening cdata tag
        return peeked = PEEKED_CDATA;
      }
    } else if (peekStack == XmlScope.EMPTY_DOCUMENT) {
      stack[stackSize - 1] = XmlScope.NONEMPTY_DOCUMENT;
    } else if (peekStack == XmlScope.NONEMPTY_DOCUMENT) {
      int c = nextNonWhitespace(false);
      if (c == -1) {
        return peeked = PEEKED_EOF;
      }
    } else if (peekStack == XmlScope.CLOSED) {
      throw new IllegalStateException("XmlReader is closed");
    }

    int c = nextNonWhitespace(true);
    switch (c) {

      // Handling open < and closing ') {
              buffer.readByte(); // consume >
              return peeked = PEEKED_ELEMENT_END;
            } else {
              syntaxError("Missing closing '>' character in  but found ");
          }
        }
        // its just a < which means begin of the element
        return peeked = PEEKED_ELEMENT_BEGIN;

      case '"':
        buffer.readByte(); // consume '"'.
        return peeked = PEEKED_DOUBLE_QUOTED;

      case '\'':
        buffer.readByte(); // consume '
        return peeked = PEEKED_SINGLE_QUOTED;
    }

    return PEEKED_NONE;
  }

  /**
   * Get the next text content of an xml element. Text content is {@code <element>text
   * content</element>}. The content of a CDATA section is returned without the surrounding CDATA
   * markers.
   *
   * If the element is empty (no content) like {@code <element></element>} this method will return
   * the empty string "".
   *
   * {@code null} as return type is not supported yet, because there is no way in xml to distinguish
   * between empty string "" or null since both might be represented with {@code
   * <element></element>}. So if you want to represent a null element, simply don't write the
   * corresponding xml tag. Then the parser will not try set the mapped field and it will remain the
   * default value (which is null).
   *
   * @return The xml element's text content
   * @throws IOException if reading fails or the text content is not terminated by a tag
   */
  public String nextTextContent() throws IOException {
    int p = peeked;
    if (p == PEEKED_NONE) {
      p = doPeek();
    }

    if (p == PEEKED_ELEMENT_TEXT_CONTENT) {

      peeked = PEEKED_NONE;

      // Read text until '<' found
      long index = source.indexOf(OPENING_XML_ELEMENT);
      if (index == -1L) {
        // pathNames[stackSize - 1] is the name of the element whose content is being read
        throw syntaxError("Unterminated element text content. Expected </"
            + pathNames[stackSize - 1]
            + "> but haven't found");
      }

      return buffer.readUtf8(index);
    } else if (p == PEEKED_CDATA) {
      peeked = PEEKED_NONE;

      // Search index of closing CDATA tag ]]>
      long index = indexOfClosingCDATA();

      String result = buffer.readUtf8(index);
      buffer.skip(3); // consume ]]>
      return result;
    } else if (p == PEEKED_ELEMENT_END) {
      // this is an element without any text content. i.e. <element></element>.
      // In that case we return the default value of a string which is the empty string

      // Don't do peeked = PEEKED_NONE; because that would consume the end tag, which we haven't done yet.
      return "";
    } else {
      throw new XmlDataException("Expected xml element text content but was " + peek()
          + " at path " + getPath());
    }
  }

  /**
   * Reads the next text content of an xml element and converts it to an {@code int}. Text content
   * is for instance {@code <element>123</element>}.
   *
   * @return The xml element's text content as integer, or 0 if {@link #nextTextContent()} yields
   * the empty string (e.g. for an empty tag like {@code <element></element>})
   * @throws IOException if reading the text content fails
   * @throws NumberFormatException if the text content is not a parsable integer
   */
  public int nextTextContentAsInt() throws IOException {
    // TODO natively support

    final String text = nextTextContent();

    // An empty element maps to the default value of an int, which is 0
    return text.isEmpty() ? 0 : Integer.parseInt(text);
  }

  /**
   * Reads the next text content of an xml element and converts it to a {@code long}. Text content
   * is for instance {@code <element>123</element>}.
   *
   * @return The xml element's text content as long, or 0 if {@link #nextTextContent()} yields the
   * empty string (e.g. for an empty tag like {@code <element></element>})
   * @throws IOException if reading the text content fails
   * @throws NumberFormatException if the text content is not a parsable long
   */
  public long nextTextContentAsLong() throws IOException {
    // TODO natively support

    final String text = nextTextContent();

    // An empty element maps to the default value of a long, which is 0
    return text.isEmpty() ? 0 : Long.parseLong(text);
  }

  /**
   * Reads the next text content of an xml element and converts it to a {@code double}. Text
   * content is for instance {@code <element>1.23</element>}.
   *
   * @return The xml element's text content as double, or 0.0 if {@link #nextTextContent()} yields
   * the empty string (e.g. for an empty tag like {@code <element></element>})
   * @throws IOException if reading the text content fails
   * @throws NumberFormatException if the text content is not a parsable double
   */
  public double nextTextContentAsDouble() throws IOException {
    // TODO natively support

    final String text = nextTextContent();

    // An empty element maps to the default value of a double, which is 0.0
    return text.isEmpty() ? 0 : Double.parseDouble(text);
  }

  /**
   * Reads the next text content of an xml element and converts it to a {@code boolean} via
   * {@link Boolean#parseBoolean(String)}. Text content is for instance {@code
   * <element>true</element>}.
   *
   * @return The xml element's text content as boolean, or false if {@link #nextTextContent()}
   * yields the empty string (e.g. for an empty tag like {@code <element></element>})
   * @throws IOException if reading the text content fails
   */
  public boolean nextTextContentAsBoolean() throws IOException {
    // TODO natively support

    final String text = nextTextContent();

    // An empty element maps to the default value of a boolean, which is false
    return !text.isEmpty() && Boolean.parseBoolean(text);
  }

  /**
   * Returns the index at which the CDATA closing tag "{@code ]]>}" starts, which equals the number
   * of bytes of CDATA content in front of it. This method does not consume the closing CDATA tag.
   *
   * @return index of the first byte of the closing tag
   * @throws EOFException if the input ends before a closing tag has been found
   * @throws IOException if reading from the source fails
   */
  private long indexOfClosingCDATA() throws IOException {
    long index = source.indexOf(CDATA_CLOSE);
    if (index == -1) {
      // Say what is missing and where, instead of throwing with an empty message
      throw new EOFException(
          "Unterminated CDATA section: closing \"]]>\" not found at path " + getPath());
    }
    return index;
  }

  /**
   * Skip the text content. Text content is {@code <element>text content</element>}. A CDATA
   * section is skipped including its closing marker.
   *
   * @throws IOException if reading fails or the text content is not terminated by a tag
   * @throws XmlDataException if the next token is neither plain text content nor CDATA
   */
  public void skipTextContent() throws IOException {

    int p = peeked;
    if (p == PEEKED_NONE) {
      p = doPeek();
    }

    if (p == PEEKED_ELEMENT_TEXT_CONTENT) {
      peeked = PEEKED_NONE;

      // Skip text until '<' found
      long index = source.indexOf(OPENING_XML_ELEMENT);
      if (index == -1L) {
        // pathNames[stackSize - 1] is the name of the element whose content is being skipped
        throw syntaxError("Unterminated element text content. Expected </"
            + pathNames[stackSize - 1]
            + "> but haven't found");
      }

      buffer.skip(index);
    } else if (p == PEEKED_CDATA) {
      peeked = PEEKED_NONE;
      // Search index of closing CDATA tag ]]>
      long index = indexOfClosingCDATA();
      buffer.skip(index + 3); // +3 because of consuming closing tag
    } else {
      throw new XmlDataException("Expected xml element text content but was " + peek()
          + " at path " + getPath());
    }
  }

  /**
   * Puts a new scope on top of the scope stack. When the stack is full, the stack and the two path
   * arrays kept parallel to it are doubled in size first.
   *
   * @param newTop The scope that should be pushed on top of the stack
   */
  private void pushStack(int newTop) {
    final boolean full = stack.length == stackSize;
    if (full) {
      final int grownCapacity = stackSize * 2;

      final int[] grownStack = new int[grownCapacity];
      System.arraycopy(stack, 0, grownStack, 0, stackSize);

      final int[] grownIndices = new int[grownCapacity];
      System.arraycopy(pathIndices, 0, grownIndices, 0, stackSize);

      final String[] grownNames = new String[grownCapacity];
      System.arraycopy(pathNames, 0, grownNames, 0, stackSize);

      stack = grownStack;
      pathIndices = grownIndices;
      pathNames = grownNames;
    }

    stack[stackSize] = newTop;
    stackSize++;
  }

  /**
   * Removes the top scope from the stack, releases the path name stored for that level and
   * increments the path index of the level below.
   */
  private void popStack() {
    final int top = stackSize - 1;
    stack[top] = 0;
    stackSize = top;
    // Free the last path name so that it can be garbage collected!
    pathNames[top] = null;
    pathIndices[top - 1] += 1;
  }

  /**
   * Describes the current location in the XML document as a path, as computed by
   * {@code XmlScope.getPath} from the scope stack and the recorded element names and indices.
   *
   * @return the path to the current location
   */
  public String getPath() {
    final String currentPath = XmlScope.getPath(stackSize, stack, pathNames, pathIndices);
    return currentPath;
  }

  /**
   * Closes this reader and the underlying source and discards everything that is still buffered.
   * The scope stack is reset to the single scope {@code XmlScope.CLOSED}, so a later attempt to
   * peek a token fails with the "XmlReader is closed" {@link IllegalStateException} instead of
   * touching the already closed source.
   *
   * @throws IOException if closing the underlying source fails
   */
  @Override
  public void close() throws IOException {
    peeked = PEEKED_NONE;
    // Mark the reader as closed; the peek logic checks for this scope.
    stack[0] = XmlScope.CLOSED;
    stackSize = 1;
    buffer.clear();
    source.close();
  }

  /**
   * Discards bytes from the head of the buffer until the buffer starts with {@code toFind}. The
   * match itself is not consumed. Bytes are compared one by one against the characters of
   * {@code toFind}.
   *
   * @param toFind a string to search for
   * @return true if {@code toFind} is now at the head of the buffer, false if the input was
   * exhausted before it has been found
   * @throws IOException if reading from the source fails
   */
  private boolean skipTo(String toFind) throws IOException {
    final int length = toFind.length();

    while (fillBuffer(length)) {
      boolean matches = true;
      for (int i = 0; i < length; i++) {
        if (buffer.getByte(i) != toFind.charAt(i)) {
          matches = false;
          break;
        }
      }

      if (matches) {
        return true;
      }

      // No match at this position: drop one byte and try again
      buffer.readByte();
    }

    return false;
  }

  /**
   * Asks the source to make at least {@code minimum} bytes available in the buffer.
   *
   * @param minimum the number of bytes that are required
   * @return the result of {@code source.request(minimum)}: true if that many bytes are available,
   * false if the data is exhausted before
   * @throws IOException if reading from the source fails
   */
  private boolean fillBuffer(long minimum) throws IOException {
    final boolean available = source.request(minimum);
    return available;
  }

  /**
   * Returns the next character in the stream that is neither whitespace nor a part of a comment.
   * When this returns, the returned character is always at {@code buffer[pos-1]}; this means the
   * caller can always pushStack back the returned character by decrementing {@code pos}.
   */
  private int nextNonWhitespace(boolean throwOnEof) throws IOException {
    /*
     * This code uses ugly local variables 'p' and 'l' representing the 'pos'
     * and 'limit' fields respectively. Using locals rather than fields saves
     * a few field reads for each whitespace character in a pretty-printed
     * document, resulting in a 5% speedup. We need to flush 'p' to its field
     * before any (potentially indirect) call to fillBuffer() and reread both
     * 'p' and 'l' after any (potentially indirect) call to the same method.
     */
    int p = 0;
    while (fillBuffer(p + 1)) {
      int c = buffer.getByte(p++);
      if (c == '\n' || c == ' ' || c == '\r' || c == '\t') {
        continue;
      }

      buffer.skip(p - 1);
      if (c == '<' && !isCDATA()) {

        byte peek = buffer.getByte(1);
        if (peek == '!' && fillBuffer(4)) {
          // skip xml comments 
          // consume opening comment chars
          buffer.readByte(); // '<'
          buffer.readByte(); // '!'
          buffer.readByte(); // '-'
          buffer.readByte(); // '-'
          if (!skipTo("-->")) {
            throw syntaxError("Unterminated comment");
          }

          // Consume closing comment chars
          buffer.readByte(); // '-'
          buffer.readByte(); // '-'
          buffer.readByte(); // '>'
          p = 0;
          continue;
        } else if (peek == '?') {
          // Opening xml declaration processing instruction 
          buffer.readByte(); // consume <
          buffer.readByte(); // consume ?

          if (!skipTo("?>")) {
            throw syntaxError("Unterminated xml declaration or processing instruction \"
          p = 0;
          continue;
        }
      }

      return c;
    }

    if (throwOnEof) {
      throw new EOFException("Unexpected end of input at path " + getPath());
    } else {
      return -1;
    }
  }

  /**
   * Throws a new {@link IOException} consisting of the given message followed by the path of the
   * current location. This method never returns normally; the declared return type only allows
   * callers to write {@code throw syntaxError(...)}.
   *
   * @param message description of the syntax problem
   * @return never returns
   * @throws IOException always
   */
  private IOException syntaxError(String message) throws IOException {
    final String detailedMessage = message + " at path " + getPath();
    throw new IOException(detailedMessage);
  }

  /**
   * Consumes and returns the name of the xml element that has just been opened. The name is
   * recorded for {@link #getPath()} and the reader continues with the element's attributes.
   *
   * @return The name of the element
   * @throws IOException if the next token is not an element name or reading fails
   */
  public String nextElementName() throws IOException {
    final int state = (peeked != PEEKED_NONE) ? peeked : doPeek();
    if (state != PEEKED_ELEMENT_NAME) {
      throw syntaxError("Expected XML Tag Element name, but have " + peek());
    }

    final String elementName = nextUnquotedValue();
    peeked = PEEKED_NONE;

    // Remember the name on the current level before entering the attributes scope
    pathNames[stackSize - 1] = elementName;

    // Next we expect element attributes block
    pushStack(XmlScope.ELEMENT_ATTRIBUTE);
    return elementName;
  }

  /**
   * Consumes and returns an unquoted value: everything in front of the first terminal character
   * (see {@link #UNQUOTED_STRING_TERMINALS}), or the whole remaining buffer content if no terminal
   * is found. The terminal itself is not consumed.
   */
  private String nextUnquotedValue() throws IOException {
    final long terminalIndex = source.indexOfElement(UNQUOTED_STRING_TERMINALS);
    if (terminalIndex == -1) {
      return buffer.readUtf8();
    }
    return buffer.readUtf8(terminalIndex);
  }

  /**
   * Returns the string up to but not including {@code runTerminator}. The opening quote should
   * have already been read. This consumes the closing quote, but does not include it in the
   * returned string. The value is returned as it is, no escape sequences are processed.
   *
   * @param runTerminator the quote character that ends the value
   * @return the quoted value without the quotes
   * @throws IOException if the closing quote is missing or reading fails
   */
  private String nextQuotedValue(byte runTerminator) throws IOException {
    long index = source.indexOf(runTerminator);
    if (index == -1L) {
      throw syntaxError(
          "Unterminated string (" + (runTerminator == DOUBLE_QUOTE ? "double quote \""
              : "single quote '") + " is missing)");
    }

    // The byte at 'index' is always the terminator itself, hence the former check for a
    // backslash at that position could never match and has been removed.
    String result = buffer.readUtf8(index);
    buffer.readByte(); // Consume the quote character.
    return result;
  }

  /**
   * Checks whether the passed character is a literal, that is anything but one of the characters
   * {@code =}, {@code <}, {@code >}, {@code /} or a space.
   *
   * @param c the character to check
   * @return true if literal, otherwise false
   */
  private boolean isLiteral(int c) {
    final boolean reserved = c == '=' || c == '<' || c == '>' || c == '/' || c == ' ';
    return !reserved;
  }

  /**
   * Reads the character or characters that immediately follow a backslash and returns the
   * character they stand for. The backslash '\' should have already been read. Supported are
   * unicode escapes (a {@code u} followed by four hex digits, like "u000A") and the single
   * character escapes t, b, n, r and f. Every other character is returned unchanged.
   *
   * @return the unescaped character
   * @throws IOException if the input ends within the escape sequence or a unicode escape sequence
   * is malformed.
   */
  private char readEscapeCharacter() throws IOException {
    if (!fillBuffer(1)) {
      throw syntaxError("Unterminated escape sequence");
    }

    final byte escaped = buffer.readByte();

    if (escaped == 'u') {
      if (!fillBuffer(4)) {
        throw new EOFException("Unterminated escape sequence at path " + getPath());
      }

      // Parse the four hex digits without consuming them yet
      int codeUnit = 0;
      for (int offset = 0; offset < 4; offset++) {
        final byte digit = buffer.getByte(offset);
        final int value;
        if (digit >= '0' && digit <= '9') {
          value = digit - '0';
        } else if (digit >= 'a' && digit <= 'f') {
          value = digit - 'a' + 10;
        } else if (digit >= 'A' && digit <= 'F') {
          value = digit - 'A' + 10;
        } else {
          throw syntaxError("\\u" + buffer.readUtf8(4));
        }
        codeUnit = (codeUnit << 4) + value;
      }
      buffer.skip(4);
      return (char) codeUnit;
    }

    if (escaped == 't') {
      return '\t';
    }
    if (escaped == 'b') {
      return '\b';
    }
    if (escaped == 'n') {
      return '\n';
    }
    if (escaped == 'r') {
      return '\r';
    }
    if (escaped == 'f') {
      return '\f';
    }

    // Line feed, quotes, backslash and anything else stand for themselves
    return (char) escaped;
  }

  /**
   * Skip a quoted value: everything up to and including the next {@code runTerminator} is
   * consumed. The opening quote should have already been read.
   *
   * @param runTerminator The quote character that ends the value
   * @throws IOException if the closing quote is missing or reading fails
   */
  private void skipQuotedValue(byte runTerminator) throws IOException {
    long index = source.indexOf(runTerminator);
    if (index == -1L) throw syntaxError("Unterminated string");

    // The byte at 'index' is always the terminator itself, hence the former check for a
    // backslash at that position could never match and has been removed.
    buffer.skip(index + 1);
  }

  /**
   * Skips the rest of an xml element. This method is typically invoked once {@link
   * #beginElement()} and {@link #nextElementName()} have been consumed, but the xml element with
   * the given name should not be read. The whole remaining element (attributes, text content and
   * child elements) is skipped up to and including its end.
   *
   * @throws IOException if reading fails or the document ends before the element has been closed
   */
  public void skipRemainingElement() throws IOException {

    final int scope = stack[stackSize - 1];
    final boolean insideOpeningTag =
        scope == XmlScope.ELEMENT_OPENING || scope == XmlScope.ELEMENT_ATTRIBUTE;
    if (!insideOpeningTag) {
      throw new AssertionError(
          "This method can only be invoked after having consumed the opening element via beginElement()");
    }

    // Number of elements that have been opened but not closed yet, the one to skip is the first
    int openElements = 1;
    while (openElements != 0) {
      switch (peek()) {
        case ELEMENT_BEGIN:
          beginElement();
          openElements++;
          break;

        case ELEMENT_END:
          endElement();
          openElements--;
          break;

        case ELEMENT_NAME:
          nextElementName(); // TODO add a skip element name method
          break;

        case ATTRIBUTE_NAME:
          nextAttributeName(); // TODO add a skip attribute name method
          break;

        case ATTRIBUTE_VALUE:
          skipAttributeValue();
          break;

        case ELEMENT_TEXT_CONTENT:
          skipTextContent();
          break;

        case END_OF_DOCUMENT:
          if (openElements != 0) {
            throw syntaxError("Unexpected end of file! At least one xml element is not closed!");
          }
          break;

        default:
          throw new AssertionError(
              "Oops, there is something not implemented correctly internally. Please fill an issue on https://github.com/Tickaroo/tikxml/issues . Please include stacktrace and the model class you try to parse");
      }
      peeked = PEEKED_NONE;
    }
  }

  /**
   * Skip an unquoted value
   *
   * @throws IOException
   *
   * private void skipUnquotedValue() throws IOException { long i = source.indexOfElement(UNQUOTED_STRING_TERMINALS);
   * buffer.skip(i != -1L ? i : buffer.size()); }
   */

  /**
   * The token types reported by {@link #peek()}.
   */
  public enum XmlToken {
    /**
     * Indicates that an xml element begins.
     */
    ELEMENT_BEGIN,

    /**
     * Indicates that we are reading the name of an xml element
     */
    ELEMENT_NAME,

    /**
     * Indicates that an xml element ends
     */
    ELEMENT_END,

    /**
     * Indicates that we are reading an attribute name (of an xml element)
     */
    ATTRIBUTE_NAME,

    /**
     * Indicates that we are reading a xml elements attribute value
     */
    ATTRIBUTE_VALUE,

    /**
     * Indicates that we are reading the text content of an xml element like this {@code
     * <element>This is the text content</element>}. CDATA sections are reported as text content
     * as well.
     */
    ELEMENT_TEXT_CONTENT,

    /**
     * Indicates that we have reached the end of the document
     */
    END_OF_DOCUMENT
  }
}