// com.itextpdf.tool.xml.parser.XMLParser
/*
 *
 * This file is part of the iText (R) project.
 * Copyright (c) 1998-2022 iText Group NV
 * Authors: Balder Van Camp, Emiel Ackermann, et al.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License version 3
 * as published by the Free Software Foundation with the addition of the
 * following permission added to Section 15 as permitted in Section 7(a):
 * FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
 * ITEXT GROUP. ITEXT GROUP DISCLAIMS THE WARRANTY OF NON INFRINGEMENT
 * OF THIRD PARTY RIGHTS.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
 * details. You should have received a copy of the GNU Affero General Public
 * License along with this program; if not, see http://www.gnu.org/licenses or
 * write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA, 02110-1301 USA, or download the license from the following URL:
 * http://itextpdf.com/terms-of-use/
 *
 * The interactive user interfaces in modified source and object code versions
 * of this program must display Appropriate Legal Notices, as required under
 * Section 5 of the GNU Affero General Public License.
 *
 * In accordance with Section 7(b) of the GNU Affero General Public License, a
 * covered work must retain the producer line in every PDF that is created or
 * manipulated using iText.
 *
 * You can be released from the requirements of the license by purchasing a
 * commercial license. Buying such a license is mandatory as soon as you develop
 * commercial activities involving the iText software without disclosing the
 * source code of your own applications. These activities include: offering paid
 * services to customers as an ASP, serving PDFs on the fly in a web
 * application, shipping iText with a closed source product.
 *
 * For more information, please contact iText Software Corp. at this address:
 * sales@itextpdf.com
 */
package com.itextpdf.tool.xml.parser;

import com.itextpdf.text.xml.XMLUtil;
import com.itextpdf.text.xml.simpleparser.IanaEncodings;
import com.itextpdf.tool.xml.parser.io.EncodingUtil;
import com.itextpdf.tool.xml.parser.io.MonitorInputReader;
import com.itextpdf.tool.xml.parser.io.ParserMonitor;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CopyOnWriteArrayList;

/**
 * Reads an XML file. Attach a {@link XMLParserListener} for receiving events.
 *
 * @author redlab_b
 */
public class XMLParser {

    /** Read-ahead limit, in bytes, passed to {@link InputStream#mark(int)} while sniffing the encoding. */
    private static final int ENCODING_MARK_LIMIT = 1028;

    /** Number of leading bytes handed to {@link XMLUtil#getEncodingName(byte[])}. */
    private static final int ENCODING_PROBE_LENGTH = 4;

    /** Byte value of '&gt;' in EBCDIC (CP037). */
    private static final int EBCDIC_GREATER_THAN = 0x6e;

    /** State that receives every character read; replaced through {@link #setState(State)}. */
    private State state;
    private final StateController controller;
    /** Listeners notified of parse events; copy-on-write so it can be modified while being iterated. */
    private final List<XMLParserListener> listeners;
    private final XMLParserMemory memory;
    /** Optional monitor; when set, the reader is wrapped in a {@link MonitorInputReader}. */
    private ParserMonitor monitor;
    /** Pending text content, submitted to the listeners on the next tag or comment event. */
    private String text = null;
    private TagState tagState;
    private Charset charset;
    private boolean decodeSpecialChars = true;

    /**
     * Constructs a default XMLParser ready for HTML/XHTML processing, using
     * the default JVM charset.
     */
    public XMLParser() {
        this(true, Charset.defaultCharset());
    }

    /**
     * Constructs a XMLParser without any listener.
     *
     * @param isHtml  false if this parser is not going to parse HTML and
     *                whitespace should be submitted as text too.
     * @param charset charset
     */
    public XMLParser(final boolean isHtml, final Charset charset) {
        this.charset = charset;
        this.controller = new StateController(this, isHtml);
        // Select the controller's "unknown" state before anything is parsed.
        controller.unknown();
        memory = new XMLParserMemory(isHtml);
        listeners = new CopyOnWriteArrayList<XMLParserListener>();
    }

    /**
     * Constructs an XMLParser ready for HTML/XHTML processing with the given
     * listener and charset.
     *
     * @param listener the listener
     * @param charset  the Charset
     */
    public XMLParser(final XMLParserListener listener, final Charset charset) {
        this(true, listener, charset);
    }

    /**
     * Constructs a XMLParser with the given listener and charset.
     *
     * @param isHtml   false if this parser is not going to parse HTML and
     *                 whitespace should be submitted as text too.
     * @param listener the listener
     * @param charset  the Charset to use
     */
    public XMLParser(final boolean isHtml, final XMLParserListener listener, final Charset charset) {
        this(isHtml, charset);
        listeners.add(listener);
    }

    /**
     * Constructs a new Parser with the default jvm charset.
     *
     * @param b        true if HTML is being parsed
     * @param listener the XMLParserListener
     */
    public XMLParser(final boolean b, final XMLParserListener listener) {
        this(b, listener, Charset.defaultCharset());
    }

    /**
     * Constructs a new Parser with HTML parsing set to true and the default jvm charset.
     *
     * @param listener the XMLParserListener
     */
    public XMLParser(final XMLParserListener listener) {
        this(true, listener, Charset.defaultCharset());
    }

    /**
     * Adds a listener. Without at least one listener nothing observes the
     * events produced while parsing.
     *
     * @param pl the {@link XMLParserListener}
     * @return the parser
     */
    public XMLParser addListener(final XMLParserListener pl) {
        listeners.add(pl);
        return this;
    }

    /**
     * Removes a Listener from the list of listeners.
     *
     * @param pl the {@link XMLParserListener} to remove
     * @return the parser
     */
    public XMLParser removeListener(final XMLParserListener pl) {
        listeners.remove(pl);
        return this;
    }

    /**
     * Parses an InputStream, decoding it with the platform default charset.
     * Note that the charset passed to the constructor is not used for
     * decoding here; use {@link #parse(InputStream, Charset)} to decode with
     * an explicit charset.
     *
     * @param in the InputStream to parse
     * @throws IOException if IO went wrong
     */
    public void parse(final InputStream in) throws IOException {
        parse(new InputStreamReader(in));
    }

    /**
     * Parses an InputStream, optionally detecting the encoding from the
     * stream itself.
     *
     * @param in             the InputStream to parse
     * @param detectEncoding true if encoding should be detected from the stream
     * @throws IOException if IO went wrong
     */
    public void parse(final InputStream in, final boolean detectEncoding) throws IOException {
        if (detectEncoding) {
            parse(detectEncoding(new BufferedInputStream(in)));
        } else {
            parse(in);
        }
    }

    /**
     * Parses an InputStream using the given encoding. The given charset also
     * becomes the value returned by {@link #getCharset()}.
     *
     * @param in      the stream to read
     * @param charSet to use for the constructed reader.
     * @throws IOException if reading fails
     */
    public void parse(final InputStream in, final Charset charSet) throws IOException {
        this.charset = charSet;
        parse(new InputStreamReader(in, charSet));
    }

    /**
     * Parses a Reader.
     *
     * @param reader the reader
     * @throws IOException if IO went wrong
     */
    public void parse(final Reader reader) throws IOException {
        parseWithReader(reader);
    }

    /**
     * The actual parse method: initialises the listeners, feeds every
     * character of the reader to the current state, then closes the
     * listeners and the reader.
     *
     * @param reader the reader to consume; it is closed when parsing ends
     * @throws IOException if reading or closing fails
     */
    private void parseWithReader(final Reader reader) throws IOException {
        for (XMLParserListener l : listeners) {
            l.init();
        }
        final Reader r = (monitor != null) ? new MonitorInputReader(reader, monitor) : reader;
        final char[] read = new char[1];
        try {
            while (-1 != r.read(read)) {
                state.process(read[0]);
            }
        } finally {
            try {
                for (XMLParserListener l : listeners) {
                    l.close();
                }
            } finally {
                // Close the reader even when a listener fails while closing.
                r.close();
            }
        }
    }

    /**
     * Detects encoding from a stream. The first four bytes are used to guess
     * a base encoding; for UTF-8 and CP037 the bytes up to the first '&gt;'
     * are then searched for a declared encoding, which takes precedence when
     * present. The stream is reset afterwards, so the returned reader starts
     * at the first byte again.
     * <p>
     * If the given stream does not support mark/reset it is wrapped in a
     * {@link BufferedInputStream}. The search for '&gt;' is capped so that no
     * more bytes are consumed than the mark's read-ahead limit allows.
     *
     * @param in the stream
     * @return a Reader with the deduced encoding.
     * @throws IOException                  if IO went wrong or the stream holds
     *                                      fewer than four bytes
     * @throws UnsupportedEncodingException if unsupported encoding was detected
     */
    public InputStreamReader detectEncoding(final InputStream in) throws IOException, UnsupportedEncodingException {
        final InputStream source = in.markSupported() ? in : new BufferedInputStream(in);
        source.mark(ENCODING_MARK_LIMIT);

        // A single read(byte[]) may legally return fewer bytes than requested.
        final byte[] b4 = new byte[ENCODING_PROBE_LENGTH];
        int count = 0;
        while (count < b4.length) {
            final int n = source.read(b4, count, b4.length - count);
            if (n == -1) {
                break;
            }
            count += n;
        }
        if (count != b4.length) {
            throw new IOException("Insufficient length");
        }

        String encoding = XMLUtil.getEncodingName(b4);
        // Bytes that may still be consumed without exceeding the mark limit.
        final int budget = ENCODING_MARK_LIMIT - b4.length;
        String decl = null;
        if ("UTF-8".equals(encoding)) {
            // ISO-8859-1 maps each byte to the char with the same value,
            // which is what a byte-by-byte (char) cast produces.
            decl = new String(readUntil(source, '>', budget), "ISO-8859-1");
        } else if ("CP037".equals(encoding)) {
            decl = new String(readUntil(source, EBCDIC_GREATER_THAN, budget), "CP037");
        }
        if (decl != null) {
            final String declared = EncodingUtil.getDeclaredEncoding(decl);
            if (declared != null) {
                encoding = declared;
            }
        }
        source.reset();
        return new InputStreamReader(source, IanaEncodings.getJavaEncoding(encoding));
    }

    /**
     * Reads bytes until the terminator, end of stream, or the byte budget is
     * reached, whichever comes first.
     *
     * @param in         the stream to read from
     * @param terminator the byte value that ends the scan; it is consumed but not returned
     * @param maxBytes   the maximum number of bytes to consume
     * @return the bytes read before the scan stopped
     * @throws IOException if reading fails
     */
    private static byte[] readUntil(final InputStream in, final int terminator, final int maxBytes)
            throws IOException {
        final ByteArrayOutputStream collected = new ByteArrayOutputStream();
        int consumed = 0;
        while (consumed < maxBytes) {
            final int c = in.read();
            if (c == -1 || c == terminator) {
                break;
            }
            collected.write(c);
            consumed++;
        }
        return collected.toByteArray();
    }

    /**
     * Set the current state.
     *
     * @param state the current state
     */
    protected void setState(final State state) {
        this.state = state;
    }

    /**
     * Appends a character to the current buffer.
     *
     * @param character the character to append
     * @return the parser
     */
    public XMLParser append(final char character) {
        this.memory.current().append(character);
        return this;
    }

    /**
     * The state controller of the parser
     *
     * @return {@link StateController}
     */
    public StateController selectState() {
        return this.controller;
    }

    /**
     * Triggered when the UnknownState encountered anything before encountering
     * a tag. Passes the current buffer content to every listener.
     */
    public void unknownData() {
        for (XMLParserListener l : listeners) {
            l.unknownText(this.memory.current().toString());
        }
    }

    /**
     * Flushes the currently stored data in the buffer.
     */
    public void flush() {
        this.memory.resetBuffer();
    }

    /**
     * Returns the current content of the text buffer.
     *
     * @return current buffer content
     */
    public String current() {
        return this.memory.current().toString();
    }

    /**
     * Returns the XMLParserMemory.
     *
     * @return the memory
     */
    public XMLParserMemory memory() {
        return memory;
    }

    /**
     * Triggered when an opening tag has been encountered. Marks the tag state
     * as open, submits any pending text, notifies the listeners and then
     * flushes the namespace held in memory.
     */
    public void startElement() {
        currentTagState(TagState.OPEN);
        String tagName = this.memory.getCurrentTag();
        Map<String, String> attributes = this.memory.getAttributes();
        if (tagName.startsWith("?")) {
            // A tag name starting with '?' clears the stored processing instruction.
            memory().processingInstruction().setLength(0);
        }
        callText();
        for (XMLParserListener l : listeners) {
            l.startElement(tagName, attributes, this.memory.getNameSpace());
        }
        this.memory().flushNameSpace();
    }

    /**
     * Submits the pending text, if any and non-empty, to the listeners and
     * clears it afterwards.
     */
    private void callText() {
        if (null != text && text.length() > 0) {
            for (XMLParserListener l : listeners) {
                l.text(text);
            }
            text = null;
        }
    }

    /**
     * Triggered when a closing tag has been encountered. Marks the tag state
     * as closed, submits any pending text and notifies the listeners.
     */
    public void endElement() {
        currentTagState(TagState.CLOSE);
        callText();
        for (XMLParserListener l : listeners) {
            l.endElement(this.memory.getCurrentTag(), this.memory.getNameSpace());
        }
    }

    /**
     * Triggered when content has been encountered. The content is only
     * stored here; it is handed to the listeners on the next tag or comment
     * event.
     *
     * @param bs the content
     */
    public void text(final String bs) {
        text = bs;
    }

    /**
     * Triggered for comments. Submits any pending text, then passes the
     * current buffer content to the listeners as the comment.
     */
    public void comment() {
        callText();
        for (XMLParserListener l : listeners) {
            l.comment(this.memory.current().toString());
        }
    }

    /**
     * @return the current last character of the buffer or ' ' if none.
     */
    public char currentLastChar() {
        final StringBuilder buffer = this.memory.current();
        final int length = buffer.length();
        return (length > 0) ? buffer.charAt(length - 1) : ' ';
    }

    /**
     * Get the current tag
     *
     * @return the current tag.
     */
    public String currentTag() {
        return this.memory.getCurrentTag();
    }

    /**
     * Get the state of the current tag
     *
     * @return the state of the current tag
     */
    public TagState currentTagState() {
        return this.tagState;
    }

    /**
     * Set the state of the current tag
     *
     * @param state the state of the current tag
     */
    private void currentTagState(final TagState state) {
        this.tagState = state;
    }

    /**
     * @param monitor the monitor to set
     */
    public void setMonitor(final ParserMonitor monitor) {
        this.monitor = monitor;
    }

    /**
     * Determines whether special chars like &gt; will be decoded. This class
     * only stores the flag; it is read back through
     * {@link #isDecodeSpecialChars()}.
     *
     * @param decodeSpecialChars true to decode, false to not decode
     */
    public void setDecodeSpecialChars(boolean decodeSpecialChars) {
        this.decodeSpecialChars = decodeSpecialChars;
    }

    /**
     * Tells whether special chars are to be decoded (true unless changed
     * through {@link #setDecodeSpecialChars(boolean)}).
     *
     * @return true if special chars should be decoded
     */
    public boolean isDecodeSpecialChars() {
        return decodeSpecialChars;
    }

    /**
     * @return the current buffer as a String
     */
    public String bufferToString() {
        return this.memory.current().toString();
    }

    /**
     * Appends the given characters to the buffer.
     *
     * @param bytes the char array to append
     * @return this instance of the XMLParser
     */
    public XMLParser append(final char[] bytes) {
        this.memory.current().append(bytes);
        return this;
    }

    /**
     * @return the size of the buffer, or 0 if there is no buffer
     */
    public int bufferSize() {
        return (null != this.memory.current()) ? this.memory.current().length() : 0;
    }

    /**
     * Appends the given string to the buffer.
     *
     * @param string the String to append
     * @return this instance of the XMLParser
     */
    public XMLParser append(final String string) {
        this.memory.current().append(string);
        return this;
    }

    /**
     * Returns the current used character set: the one given at construction,
     * or the one last passed to {@link #parse(InputStream, Charset)}.
     *
     * @return the charset
     */
    public Charset getCharset() {
        return charset;
    }

}