All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.lowagie.text.xml.simpleparser.SimpleXMLParser Maven / Gradle / Ivy

There is a newer version: 2.0.3
Show newest version
/*
 * Copyright 2003 Paulo Soares
 *
 * The contents of this file are subject to the Mozilla Public License Version 1.1
 * (the "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the License.
 *
 * The Original Code is 'iText, a free JAVA-PDF library'.
 *
 * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
 * the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie.
 * All Rights Reserved.
 * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
 * are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved.
 *
 * Contributor(s): all the names of the contributors are added in the source code
 * where applicable.
 *
 * Alternatively, the contents of this file may be used under the terms of the
 * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the
 * provisions of LGPL are applicable instead of those above.  If you wish to
 * allow use of your version of this file only under the terms of the LGPL
 * License and not to allow others to use your version of this file under
 * the MPL, indicate your decision by deleting the provisions above and
 * replace them with the notice and other provisions required by the LGPL.
 * If you do not delete the provisions above, a recipient may use your version
 * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE.
 *
 * This library is free software; you can redistribute it and/or modify it
 * under the terms of the MPL as stated above or under the terms of the GNU
 * Library General Public License as published by the Free Software Foundation;
 * either version 2 of the License, or any later version.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more
 * details.
 *
 */
package com.lowagie.text.xml.simpleparser;

import com.lowagie.text.error_messages.MessageLocalization;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.Stack;

/**
 * A simple XML and HTML parser.  This parser is, like the SAX parser,
 * an event based parser, but with much less functionality.
 * 

* The parser can: *

*
    *
  • It recognizes the encoding used *
  • It recognizes all the elements' start tags and end tags *
  • It lists attributes, where attribute values can be enclosed in single or double quotes *
  • It recognizes the <[CDATA[ ... ]]> construct *
  • It recognizes the standard entities: &amp;, &lt;, &gt;, &quot;, and &apos;, as well as numeric entities *
  • It maps lines ending in \r\n and \r to \n on input, in accordance with the XML Specification, Section 2.11 *
*/ public final class SimpleXMLParser { /** possible states */ private final static int UNKNOWN = 0; private final static int TEXT = 1; private final static int TAG_ENCOUNTERED = 2; private final static int EXAMIN_TAG = 3; private final static int TAG_EXAMINED = 4; private final static int IN_CLOSETAG = 5; private final static int SINGLE_TAG = 6; private final static int CDATA = 7; private final static int COMMENT = 8; private final static int PI = 9; private final static int ENTITY = 10; private final static int QUOTE = 11; private final static int ATTRIBUTE_KEY = 12; private final static int ATTRIBUTE_EQUAL = 13; private final static int ATTRIBUTE_VALUE = 14; /** the state stack */ Stack stack; /** The current character. */ int character = 0; /** The previous character. */ int previousCharacter = -1; /** the line we are currently reading */ int lines = 1; /** the column where the current character occurs */ int columns = 0; /** was the last character equivalent to a newline? */ boolean eol = false; /** * A boolean indicating if the next character should be taken into account * if it's a space character. When nospace is false, the previous character * wasn't whitespace. * @since 2.1.5 */ boolean nowhite = false; /** the current state */ int state; /** Are we parsing HTML? */ boolean html; /** current text (whatever is encountered between tags) */ StringBuffer text = new StringBuffer(); /** current entity (whatever is encountered between & and ;) */ StringBuffer entity = new StringBuffer(); /** current tagname */ String tag = null; /** current attributes */ Map attributes = null; /** The handler to which we are going to forward document content */ SimpleXMLDocHandler doc; /** The handler to which we are going to forward comments. */ SimpleXMLDocHandlerComment comment; /** Keeps track of the number of tags that are open. */ int nested = 0; /** the quote character that was used to open the quote. */ int quoteCharacter = '"'; /** the attribute key. 
*/ String attributekey = null; /** the attribute value. */ String attributevalue = null; /** * Creates a Simple XML parser object. * Call go(BufferedReader) immediately after creation. */ private SimpleXMLParser(SimpleXMLDocHandler doc, SimpleXMLDocHandlerComment comment, boolean html) { this.doc = doc; this.comment = comment; this.html = html; stack = new Stack<>(); state = html ? TEXT : UNKNOWN; } /** * Does the actual parsing. Perform this immediately * after creating the parser object. */ private void go(Reader r) throws IOException { BufferedReader reader; if (r instanceof BufferedReader) reader = (BufferedReader)r; else reader = new BufferedReader(r); doc.startDocument(); while(true) { // read a new character if (previousCharacter == -1) { character = reader.read(); } // or re-examine the previous character else { character = previousCharacter; previousCharacter = -1; } // the end of the file was reached if (character == -1) { if (html) { if (html && state == TEXT) flush(); doc.endDocument(); } else { throwException(MessageLocalization.getComposedMessage("missing.end.tag")); } return; } // dealing with \n and \r if (character == '\n' && eol) { eol = false; continue; } else if (eol) { eol = false; } else if (character == '\n') { lines++; columns = 0; } else if (character == '\r') { eol = true; character = '\n'; lines++; columns = 0; } else { columns++; } switch(state) { // we are in an unknown state before there's actual content case UNKNOWN: if(character == '<') { saveState(TEXT); state = TAG_ENCOUNTERED; } break; // we can encounter any content case TEXT: if(character == '<') { flush(); saveState(state); state = TAG_ENCOUNTERED; } else if(character == '&') { saveState(state); entity.setLength(0); state = ENTITY; nowhite = true; } else if (Character.isWhitespace((char)character)) { if (nowhite) text.append((char)character); nowhite = false; } else { text.append((char)character); nowhite = true; } break; // we have just seen a < and are wondering what we are 
looking at // , , , etc. case TAG_ENCOUNTERED: initTag(); if(character == '/') { state = IN_CLOSETAG; } else if (character == '?') { restoreState(); state = PI; } else { text.append((char)character); state = EXAMIN_TAG; } break; // we are processing something like this . // It could still be a or something. case EXAMIN_TAG: if(character == '>') { doTag(); processTag(true); initTag(); state = restoreState(); } else if(character == '/') { state = SINGLE_TAG; } else if(character == '-' && text.toString().equals("!-")) { flush(); state = COMMENT; } else if(character == '[' && text.toString().equals("![CDATA")) { flush(); state = CDATA; } else if(character == 'E' && text.toString().equals("!DOCTYP")) { flush(); state = PI; } else if(Character.isWhitespace((char)character)) { doTag(); state = TAG_EXAMINED; } else { text.append((char)character); } break; // we know the name of the tag now. case TAG_EXAMINED: if(character == '>') { processTag(true); initTag(); state = restoreState(); } else if(character == '/') { state = SINGLE_TAG; } else if(Character.isWhitespace((char)character)) { // empty } else { text.append((char)character); state = ATTRIBUTE_KEY; } break; // we are processing a closing tag: e.g. case IN_CLOSETAG: if(character == '>') { doTag(); processTag(false); if(!html && nested==0) return; state = restoreState(); } else { if (!Character.isWhitespace((char)character)) text.append((char)character); } break; // we have just seen something like this: . case SINGLE_TAG: if(character != '>') throwException(MessageLocalization.getComposedMessage("expected.gt.for.tag.lt.1.gt", tag)); doTag(); processTag(true); processTag(false); initTag(); if(!html && nested==0) { doc.endDocument(); return; } state = restoreState(); break; // we are processing CDATA case CDATA: if(character == '>' && text.toString().endsWith("]]")) { text.setLength(text.length()-2); flush(); state = restoreState(); } else text.append((char)character); break; // we are processing a comment. 
We are inside // the looking for the -->. case COMMENT: if(character == '>' && text.toString().endsWith("--")) { text.setLength(text.length() - 2); flush(); state = restoreState(); } else text.append((char)character); break; // We are inside one of these or one of these case PI: if(character == '>') { state = restoreState(); if(state == TEXT) state = UNKNOWN; } break; // we are processing an entity, e.g. <, », etc. case ENTITY: if(character == ';') { state = restoreState(); String cent = entity.toString(); entity.setLength(0); char ce = EntitiesToUnicode.decodeEntity(cent); if (ce == '\0') text.append('&').append(cent).append(';'); else text.append(ce); } else if ((character != '#' && (character < '0' || character > '9') && (character < 'a' || character > 'z') && (character < 'A' || character > 'Z')) || entity.length() >= 7) { state = restoreState(); previousCharacter = character; text.append('&').append(entity.toString()); entity.setLength(0); } else { entity.append((char)character); } break; // We are processing the quoted right-hand side of an element's attribute. 
case QUOTE: if (html && quoteCharacter == ' ' && character == '>') { flush(); processTag(true); initTag(); state = restoreState(); } else if (html && quoteCharacter == ' ' && Character.isWhitespace((char)character)) { flush(); state = TAG_EXAMINED; } else if (html && quoteCharacter == ' ') { text.append((char) character); } else if (character == quoteCharacter) { flush(); state = TAG_EXAMINED; } else if (" \r\n\t".indexOf(character) >= 0) { text.append(' '); } else if (character == '&') { saveState(state); state = ENTITY; entity.setLength(0); } else { text.append((char) character); } break; case ATTRIBUTE_KEY: if(Character.isWhitespace((char)character)) { flush(); state = ATTRIBUTE_EQUAL; } else if(character == '=') { flush(); state = ATTRIBUTE_VALUE; } else if (html && character == '>') { text.setLength(0); processTag(true); initTag(); state = restoreState(); } else { text.append((char)character); } break; case ATTRIBUTE_EQUAL: if(character == '=') { state = ATTRIBUTE_VALUE; } else if(Character.isWhitespace((char)character)) { // empty } else if (html && character == '>') { text.setLength(0); processTag(true); initTag(); state = restoreState(); } else if (html && character == '/') { flush(); state = SINGLE_TAG; } else if (html) { flush(); text.append((char)character); state = ATTRIBUTE_KEY; } else { throwException(MessageLocalization.getComposedMessage("error.in.attribute.processing")); } break; case ATTRIBUTE_VALUE: if(character == '"' || character == '\'') { quoteCharacter = character; state = QUOTE; } else if(Character.isWhitespace((char)character)) { // empty } else if (html && character == '>') { flush(); processTag(true); initTag(); state = restoreState(); } else if (html) { text.append((char)character); quoteCharacter = ' '; state = QUOTE; } else { throwException(MessageLocalization.getComposedMessage("error.in.attribute.processing")); } break; } } } /** * Gets a state from the stack * @return the previous state */ private int restoreState() { 
if(!stack.empty()) return stack.pop(); else return UNKNOWN; } /** * Adds a state to the stack. * @param s a state to add to the stack */ private void saveState(int s) { stack.push(s); } /** * Flushes the text that is currently in the buffer. * The text can be ignored, added to the document * as content or as comment,... depending on the current state. */ private void flush() { switch(state){ case TEXT: case CDATA: if(text.length() > 0) { doc.text(text.toString()); } break; case COMMENT: if (comment != null) { comment.comment(text.toString()); } break; case ATTRIBUTE_KEY: attributekey = text.toString(); if (html) attributekey = attributekey.toLowerCase(); break; case QUOTE: case ATTRIBUTE_VALUE: attributevalue = text.toString(); attributes.put(attributekey, attributevalue); break; default: // do nothing } text.setLength(0); } /** * Initialized the tag name and attributes. */ private void initTag() { tag = null; attributes = new HashMap<>(); } /** Sets the name of the tag. */ private void doTag() { if(tag == null) tag = text.toString(); if (html) tag = tag.toLowerCase(); text.setLength(0); } /** * processes the tag. * @param start if true we are dealing with a tag that has just been opened; if false we are closing a tag. */ private void processTag(boolean start) { if (start) { nested++; doc.startElement(tag, attributes); } else { nested--; doc.endElement(tag); } } /** Throws an exception */ private void throwException(String s) throws IOException { throw new IOException(MessageLocalization.getComposedMessage("1.near.line.2.column.3", s, String.valueOf(lines), String.valueOf(columns))); } /** * Parses the XML document firing the events to the handler. * @param doc the document handler * @param comment {@link SimpleXMLParser#comment} * @param r the document. The encoding is already resolved. 
The reader is not closed * @param html {@link SimpleXMLParser#html} * @throws IOException on error */ public static void parse(SimpleXMLDocHandler doc, SimpleXMLDocHandlerComment comment, Reader r, boolean html) throws IOException { SimpleXMLParser parser = new SimpleXMLParser(doc, comment, html); parser.go(r); } /** Detect charset from BOM, as per Unicode FAQ. */ private static Optional detectCharsetFromBOM(byte[] bom) { // 00 00 FE FF UTF-32BE // EF BB BF .. UTF-8 // FE FF .. .. UTF-16BE // FF FE 00 00 UTF-32LE // FF FE .. .. UTF-16LE switch (bom[0]) { case (byte) 0x00: if (bom[1] == (byte) 0x00 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF) return Optional.of(Charset.forName("UTF-32BE")); break; case (byte) 0xEF: if (bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) return Optional.of(StandardCharsets.UTF_8); break; case (byte) 0xFE: if (bom[1] == (byte) 0xFF) return Optional.of(StandardCharsets.UTF_16BE); break; case (byte) 0xFF: if (bom[1] == (byte) 0xFE) { if (bom[2] == (byte) 0x00 && bom[3] == (byte) 0x00) return Optional.of(Charset.forName("UTF-32LE")); else return Optional.of(StandardCharsets.UTF_16LE); } break; } return Optional.empty(); } /** * Parses the XML document firing the events to the handler. * @param doc the document handler * @param in the document. The encoding is deduced from the stream. 
The stream is not closed * @throws IOException on error */ public static void parse(SimpleXMLDocHandler doc, InputStream in) throws IOException { byte[] b4 = new byte[4]; int count = in.read(b4); if (count != 4) throw new IOException(MessageLocalization.getComposedMessage("insufficient.length")); Charset encoding = detectCharsetFromBOM(b4).orElse(null); if (encoding == null) { StringBuilder sb = new StringBuilder(); int c; while ((c = in.read()) != -1) { if (c == '>') break; sb.append((char)c); } String decl = getDeclaredEncoding(sb.toString()); if (decl == null) encoding = StandardCharsets.UTF_8; else try { encoding = Charset.forName(decl); } catch (UnsupportedCharsetException e) { encoding = Charset.forName(IanaEncodings.getJavaEncoding(decl)); } } parse(doc, new InputStreamReader(in, encoding)); } private static String getDeclaredEncoding(String decl) { if (decl == null) return null; int idx = decl.indexOf("encoding"); if (idx < 0) return null; int idx1 = decl.indexOf('"', idx); int idx2 = decl.indexOf('\'', idx); if (idx1 == idx2) return null; if ((idx1 < 0 && idx2 > 0) || (idx2 > 0 && idx2 < idx1)) { int idx3 = decl.indexOf('\'', idx2 + 1); if (idx3 < 0) return null; return decl.substring(idx2 + 1, idx3); } if ((idx2 < 0 && idx1 > 0) || (idx1 > 0 && idx1 < idx2)) { int idx3 = decl.indexOf('"', idx1 + 1); if (idx3 < 0) return null; return decl.substring(idx1 + 1, idx3); } return null; } public static void parse(SimpleXMLDocHandler doc,Reader r) throws IOException { parse(doc, null, r, false); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy