com.itextpdf.tool.xml.parser.XMLParser Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of xmlworker Show documentation
Parses XML to PDF, with CSS support, using iText
There is a newer version: 5.5.13.4
/*
 *
 * This file is part of the iText (R) project.
    Copyright (c) 1998-2017 iText Group NV
 * Authors: Balder Van Camp, Emiel Ackermann, et al.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License version 3
 * as published by the Free Software Foundation with the addition of the
 * following permission added to Section 15 as permitted in Section 7(a):
 * FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
 * ITEXT GROUP. ITEXT GROUP DISCLAIMS THE WARRANTY OF NON INFRINGEMENT
 * OF THIRD PARTY RIGHTS.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
 * details. You should have received a copy of the GNU Affero General Public
 * License along with this program; if not, see http://www.gnu.org/licenses or
 * write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA, 02110-1301 USA, or download the license from the following URL:
 * http://itextpdf.com/terms-of-use/
 *
 * The interactive user interfaces in modified source and object code versions
 * of this program must display Appropriate Legal Notices, as required under
 * Section 5 of the GNU Affero General Public License.
 *
 * In accordance with Section 7(b) of the GNU Affero General Public License, a
 * covered work must retain the producer line in every PDF that is created or
 * manipulated using iText.
 *
 * You can be released from the requirements of the license by purchasing a
 * commercial license. Buying such a license is mandatory as soon as you develop
 * commercial activities involving the iText software without disclosing the
 * source code of your own applications. These activities include: offering paid
 * services to customers as an ASP, serving PDFs on the fly in a web
 * application, shipping iText with a closed source product.
 *
 * For more information, please contact iText Software Corp. at this address:
 * [email protected]
 */
package com.itextpdf.tool.xml.parser;

import com.itextpdf.text.xml.XMLUtil;
import com.itextpdf.text.xml.simpleparser.IanaEncodings;
import com.itextpdf.tool.xml.parser.io.EncodingUtil;
import com.itextpdf.tool.xml.parser.io.MonitorInputReader;
import com.itextpdf.tool.xml.parser.io.ParserMonitor;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CopyOnWriteArrayList;

/**
 * Reads an XML file. Attach a {@link XMLParserListener} for receiving events.
 *
 * @author redlab_b
 */
public class XMLParser {

    private State state;
    private final StateController controller;
    private final List listeners;
    private final XMLParserMemory memory;
    private ParserMonitor monitor;
    private String text = null;
    private TagState tagState;
    private Charset charset;
    private boolean decodeSpecialChars = true;

    /**
     * Constructs a default XMLParser ready for HTML/XHTML processing.
     */
    public XMLParser() {
        this(true, Charset.defaultCharset());
    }

    /**
     * Constructs a XMLParser.
     *
     * @param isHtml  false if this parser is not going to parse HTML and
     *                whitespace should be submitted as text too.
     * @param charset charset
     */
    public XMLParser(final boolean isHtml, final Charset charset) {
        this.charset = charset;
        this.controller = new StateController(this, isHtml);
        controller.unknown();
        memory = new XMLParserMemory(isHtml);
        listeners = new CopyOnWriteArrayList();
    }

    /**
     * Construct an XMLParser with the given XMLParserConfig ready for
     * HTML/XHTML processing..
     *
     * @param listener the listener
     * @param charset  the Charset
     */
    public XMLParser(final XMLParserListener listener, final Charset charset) {
        this(true, charset);
        listeners.add(listener);
    }

    /**
     * Construct a XMLParser with the given XMLParserConfig.
     *
     * @param isHtml   false if this parser is not going to parse HTML and
     *                 whitespace should be submitted as text too.
     * @param listener the listener
     * @param charset  the Charset to use
     */
    public XMLParser(final boolean isHtml, final XMLParserListener listener, final Charset charset) {
        this(isHtml, charset);
        listeners.add(listener);
    }

    /**
     * Constructs a new Parser with the default jvm charset.
     *
     * @param b        true if HTML is being parsed
     * @param listener the XMLParserListener
     */
    public XMLParser(final boolean b, final XMLParserListener listener) {
        this(b, Charset.defaultCharset());
        listeners.add(listener);
    }

    /**
     * Constructs a new Parser with HTML parsing set to true and the default jvm charset.
     *
     * @param listener the XMLParserListener
     */
    public XMLParser(final XMLParserListener listener) {
        this(true, Charset.defaultCharset());
        listeners.add(listener);
    }

    /**
     * If no ParserListener is added, parsing with the parser seems
     * useless no?
     *
     * @param pl the {@link XMLParserListener}
     * @return the parser
     */
    public XMLParser addListener(final XMLParserListener pl) {
        listeners.add(pl);
        return this;
    }

    /**
     * Removes a Listener from the list of listeners.
     *
     * @param pl the {@link XMLParserListener} to remove
     * @return the parser
     */
    public XMLParser removeListener(final XMLParserListener pl) {
        listeners.remove(pl);
        return this;
    }

    /**
     * Parse an InputStream with default encoding set
     *
     * @param in the InputStream to parse
     * @throws IOException if IO went wrong
     */
    public void parse(final InputStream in) throws IOException {
        parse(new InputStreamReader(in));
    }

    /**
     * Parse an InputStream that optionally detects encoding from the stream
     *
     * @param in             the InputStream to parse
     * @param detectEncoding true if encoding should be detected from the stream
     * @throws IOException if IO went wrong
     */
    public void parse(final InputStream in, final boolean detectEncoding) throws IOException {
        if (detectEncoding) {
            parse(detectEncoding(new BufferedInputStream(in)));
        } else {
            parse(in);
        }
    }

    /**
     * Parses an InputStream using the given encoding
     *
     * @param in      the stream to read
     * @param charSet to use for the constructed reader.
     * @throws IOException if reading fails
     */
    public void parse(final InputStream in, final Charset charSet) throws IOException {
        this.charset = charSet;
        InputStreamReader reader = new InputStreamReader(in, charSet);
        parse(reader);

    }

    /**
     * Parse an Reader
     *
     * @param reader the reader
     * @throws IOException if IO went wrong
     */
    public void parse(final Reader reader) throws IOException {
        parseWithReader(reader);
    }

    /**
     * The actual parse method
     *
     * @param reader
     * @throws IOException
     */
    private void parseWithReader(final Reader reader) throws IOException {
        for (XMLParserListener l : listeners) {
            l.init();
        }
        Reader r;
        if (monitor != null) {
            r = new MonitorInputReader(reader, monitor);
        } else {
            r = reader;
        }
        char read[] = new char[1];
        try {
            while (-1 != (r.read(read))) {
                state.process(read[0]);
            }
        } finally {
            for (XMLParserListener l : listeners) {
                l.close();
            }
            r.close();
        }
    }

    /**
     * Detects encoding from a stream.
     *
     * @param in the stream
     * @return a Reader with the deduced encoding.
     * @throws IOException                  if IO went wrong
     * @throws UnsupportedEncodingException if unsupported encoding was detected
     */
    public InputStreamReader detectEncoding(final InputStream in) throws IOException, UnsupportedEncodingException {
        // we expect a '>' in the first 100 characters
    	in.mark(1028);
    	byte b4[] = new byte[4];
        int count = in.read(b4);
        if (count != 4)
            throw new IOException("Insufficient length");
        String encoding = XMLUtil.getEncodingName(b4);
        String decl = null;
        if (encoding.equals("UTF-8")) {
            StringBuffer sb = new StringBuffer();
            int c;
            while ((c = in.read()) != -1) {
                if (c == '>')
                    break;
                sb.append((char) c);
            }
            decl = sb.toString();
        } else if (encoding.equals("CP037")) {
            ByteArrayOutputStream bi = new ByteArrayOutputStream();
            int c;
            while ((c = in.read()) != -1) {
                if (c == 0x6e) // that's '>' in ebcdic
                    break;
                bi.write(c);
            }
            decl = new String(bi.toByteArray(), "CP037");
        }
        if (decl != null) {
            decl = EncodingUtil.getDeclaredEncoding(decl);
            if (decl != null)
                encoding = decl;
        }
        in.reset();
        return new InputStreamReader(in, IanaEncodings.getJavaEncoding(encoding));
    }

    /**
     * Set the current state.
     *
     * @param state the current state
     */
    protected void setState(final State state) {
        this.state = state;
    }

    /**
     * @param character the character to append
     * @return the parser
     */
    public XMLParser append(final char character) {
        this.memory.current().append(character);
        return this;

    }

    // /**
    // * @param str the String to append
    // * @return the parser
    // */
    // public XMLParser append(final String str) {
    // this.memory.current().write(str.getBytes());
    // return this;
    //
    // }

    /**
     * The state controller of the parser
     *
     * @return {@link StateController}
     */
    public StateController selectState() {
        return this.controller;
    }

    /**
     * Triggered when the UnknownState encountered anything before encountering
     * a tag.
     */
    public void unknownData() {
        for (XMLParserListener l : listeners) {
            l.unknownText(this.memory.current().toString());
        }
    }

    /**
     * Flushes the currently stored data in the buffer.
     */
    public void flush() {
        this.memory.resetBuffer();
    }

    /**
     * Returns the current content of the text buffer.
     *
     * @return current buffer content
     */
    public String current() {
        return this.memory.current().toString();
    }

    /**
     * Returns the XMLParserMemory.
     *
     * @return the memory
     */
    public XMLParserMemory memory() {
        return memory;
    }

    /**
     * Triggered when an opening tag has been encountered.
     */
    public void startElement() {
        currentTagState(TagState.OPEN);
        String tagName = this.memory.getCurrentTag();
        Map attributes = this.memory.getAttributes();
        if (tagName.startsWith("?")) {
            memory().processingInstruction().setLength(0);
        }
        callText();
        for (XMLParserListener l : listeners) {
            l.startElement(tagName, attributes, this.memory.getNameSpace());
        }
        this.memory().flushNameSpace();
    }

    /**
     * Call this method to submit the text to listeners.
     */
    private void callText() {
        if (null != text && text.length() > 0) {
            // LOGGER .log(text);
            for (XMLParserListener l : listeners) {
                l.text(text);
            }
            text = null;
        }
    }

    /**
     * Triggered when a closing tag has been encountered.
     */
    public void endElement() {
        currentTagState(TagState.CLOSE);
        callText();
        for (XMLParserListener l : listeners) {
            l.endElement(this.memory.getCurrentTag(), this.memory.getNameSpace());
        }
    }

    /**
     * Triggered when content has been encountered.
     *
     * @param bs the content
     */
    public void text(final String bs) {
        text = bs;
    }

    /**
     * Triggered for comments.
     */
    public void comment() {
        callText();
        for (XMLParserListener l : listeners) {
            l.comment(this.memory.current().toString());
        }
    }

    /**
     * @return the current last character of the buffer or ' ' if none.
     */
    public char currentLastChar() {
        StringBuilder current2 = this.memory.current();
        int length = current2.length();
        CharSequence current = current2.subSequence(length - 2, length - 1);
        if (current.length() > 0) {
            return (char) (current.length() - 1);
        }
        return ' ';
    }

    /**
     * Get the current tag
     *
     * @return the current tag.
     */
    public String currentTag() {
        return this.memory.getCurrentTag();
    }

    /**
     * Get the state of the current tag
     *
     * @return the state of the current tag
     */
    public TagState currentTagState() {
        return this.tagState;
    }

    /**
     * Set the state of the current tag
     *
     * @param state the state of the current tag
     */
    private void currentTagState(final TagState state) {
        this.tagState = state;
    }

    /**
     * @param monitor the monitor to set
     */
    public void setMonitor(final ParserMonitor monitor) {
        this.monitor = monitor;
    }

    /**
     * Determines whether special chars like > will be decoded
     * @param decodeSpecialChars true to decode, false to not decode
     */
    public void setDecodeSpecialChars(boolean decodeSpecialChars) {
        this.decodeSpecialChars = decodeSpecialChars;
    }

    public boolean isDecodeSpecialChars() {
        return decodeSpecialChars;
    }

    /**
     * @return the current buffer as a String
     */
    public String bufferToString() {
        return this.memory.current().toString();
    }

    /**
     * @param bytes the byte array to append
     * @return this instance of the XMLParser
     */
    public XMLParser append(final char[] bytes) {
        this.memory.current().append(bytes);
        return this;
    }

    /**
     * @return the size of the buffer
     */
    public int bufferSize() {
        return (null != this.memory.current()) ? this.memory.current().length() : 0;
    }

    /**
     * Appends the given string to the buffer.
     *
     * @param string the String to append
     * @return this instance of the XMLParser
     */
    public XMLParser append(final String string) {
        this.memory.current().append(string);
        return this;

    }

    /**
     * Returns the current used character set.
     *
     * @return the charset
     */
    public Charset getCharset() {
        return charset;
    }

}