org.attoparser.MarkupParser Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of attoparser Show documentation
Powerful, fast and easy to use HTML and XML parser for Java
There is a newer version: 2.0.7.RELEASE
/*
 * =============================================================================
 * 
 *   Copyright (c) 2012-2014, The ATTOPARSER team (http://www.attoparser.org)
 * 
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 * 
 *       http://www.apache.org/licenses/LICENSE-2.0
 * 
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 * 
 * =============================================================================
 */
package org.attoparser;

import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;

import org.attoparser.config.ParseConfiguration;
import org.attoparser.select.ParseSelection;


/**
 * 
 *   Default implementation of the {@link IMarkupParser} interface.
 * 
 * 
 *   AttoParser markup parsers work as SAX-style parsers that need
 *   a markup handler object for handling parsing events. These handlers implement
 *   the {@link org.attoparser.IMarkupHandler} interface, and are normally developed by
 *   users in order to perform the operations they require for their applications.
 * 
 * 
 *   See the documentation of the {@link org.attoparser.IMarkupHandler} interface for more
 *   information on the event handler methods, and also on the handler implementations
 *   AttoParser provides out-of-the-box.
 * 
 * 
 *   Also, note there are two different specialized parsers that use
 *   {@link org.attoparser.MarkupParser} underneath, but which are oriented towards allowing
 *   an easy use of specific parsing features: {@link org.attoparser.dom.IDOMMarkupParser} for
 *   DOM-oriented parsing and {@link org.attoparser.simple.ISimpleMarkupParser} for using
 *   a simplified version of the handler interface ({@link org.attoparser.simple.ISimpleMarkupHandler}).
 * 
 * 
 * <p>
 *   Sample usage:
 * </p>
 * <pre><code>
 *   // Obtain a java.io.Reader on the document to be parsed
 *   final Reader documentReader = ...;
 *
 *   // Create the handler instance. Extending the no-op AbstractMarkupHandler is a good start
 *   final IMarkupHandler handler = new AbstractMarkupHandler() {
 *       ... // some events implemented
 *   };
 *
 *   // Create or obtain the parser instance (can be reused). Example uses the default configuration for HTML
 *   final IMarkupParser parser = new MarkupParser(ParseConfiguration.htmlConfiguration());
 *
 *   // Parse it!
 *   parser.parse(documentReader, handler);
 * </code></pre>
 * 
 * 
 *   This parser class is thread-safe. However, take into account that, normally,
 *   {@link IMarkupHandler} implementations are not. So, even if parsers can be reused, handler objects
 *   usually cannot.
 * 
 * 
 *   This parser class uses a (configurable) pool of char[] buffers, in order to reduce the amount of
 *   memory used for parsing (buffers are large structures). This pool works in a non-blocking mode,
 *   so if a new buffer is needed and all are currently allocated, a new (unpooled) char[] object
 *   is created and returned without waiting for a pooled buffer to be available.
 * 
 * 
 *   (Note that these pooled buffers will not be used when parsing documents specified as char[]
 *   objects. In such case, the char[] documents themselves will be used as buffers, avoiding the need
 *   to allocate pooled buffers or use any additional amount of memory.)
 * 
 *
 * @author Daniel Fernández
 * 
 * @since 2.0.0
 *
 */
public final class MarkupParser implements IMarkupParser {

    /**
     * <p>
     *   Default buffer size to be used (buffer size will grow at runtime if
     *   an artifact (structure or text) is bigger than the whole buffer).
     *   Value: 4096 chars (= 8192 bytes).
     * </p>
     * <p>
     *   This is the buffer size applied by the single-argument constructor.
     * </p>
     */
    public static final int DEFAULT_BUFFER_SIZE = 4096;

    /**
     * <p>
     *   Default pool size to be used. Buffers will be kept in a pool and
     *   reused in order to increase performance. Pool will be non-exclusive
     *   so that if pool size = 2 and a 3rd request arrives, it is assigned
     *   a new buffer object (not linked to the pool, and therefore GC-ed
     *   at the end). Value: 2.
     * </p>
     * <p>
     *   This is the pool size applied by the single-argument constructor.
     * </p>
     */
    public static final int DEFAULT_POOL_SIZE = 2;


    // Parsing configuration: consulted for the parsing mode and handed to the handler chain on every parse.
    private final ParseConfiguration configuration;
    // Source of the char[] buffers used when parsing from a Reader (char[] documents are parsed in place instead).
    private final BufferPool pool;




    /**
     * <p>
     *   Builds a parser for the given configuration, relying on the default pool size
     *   ({@link #DEFAULT_POOL_SIZE}) and the default size for pooled buffers
     *   ({@link #DEFAULT_BUFFER_SIZE}).
     * </p>
     *
     * @param configuration the parsing configuration to be used.
     */
    public MarkupParser(final ParseConfiguration configuration) {
        this(configuration, MarkupParser.DEFAULT_POOL_SIZE, MarkupParser.DEFAULT_BUFFER_SIZE);
    }


    /**
     * <p>
     *   Creates a new instance of this parser, specifying the pool and buffer size.
     * </p>
     * <p>
     *   Buffer size (in chars) will be the size of the <tt>char[]</tt> structures used as buffers for parsing,
     *   which might grow if a certain markup structure does not fit inside (e.g. a text). Default size is
     *   {@link MarkupParser#DEFAULT_BUFFER_SIZE}.
     * </p>
     * <p>
     *   Pool size is the size of the pool of <tt>char[]</tt> buffers that will be kept in memory in order to
     *   allow their reuse. This pool works in a non-exclusive mode, so that if pool size is 3 and a 4th request
     *   arrives, it is served a new non-pooled buffer without the need to block waiting for one of the pooled
     *   instances. Default size is {@link MarkupParser#DEFAULT_POOL_SIZE}.
     * </p>
     * <p>
     *   Note that these pooled buffers will not be used when parsing documents specified as <tt>char[]</tt>
     *   objects. In such case, the <tt>char[]</tt> documents themselves will be used as buffers, avoiding the need
     *   to allocate buffers or use any additional amount of memory.
     * </p>
     *
     * @param configuration the parsing configuration to be used. Cannot be null.
     * @param poolSize the size of the pool of buffers to be used.
     * @param bufferSize the default size of the buffers to be instanced for this parser.
     * @throws IllegalArgumentException if the specified configuration is null.
     */
    public MarkupParser(final ParseConfiguration configuration, final int poolSize, final int bufferSize) {
        super();
        // Fail fast: every parse operation dereferences the configuration, so a null value would
        // otherwise only show up later as a NullPointerException far from its origin.
        if (configuration == null) {
            throw new IllegalArgumentException("Configuration cannot be null");
        }
        this.configuration = configuration;
        this.pool = new BufferPool(poolSize, bufferSize);
    }






    /*
     * String-based entry point: the document is wrapped in a StringReader and handed over to the
     * Reader-based parse method, which is the one validating the handler.
     */
    public void parse(final String document, final IMarkupHandler handler)
            throws ParseException {
        if (document == null) {
            throw new IllegalArgumentException("Document cannot be null");
        }
        final Reader documentReader = new StringReader(document);
        this.parse(documentReader, handler);
    }


    /*
     * Whole-array entry point: parses the complete char[] by delegating to the ranged variant
     * with offset 0 and the full array length.
     */
    public void parse(final char[] document, final IMarkupHandler handler)
            throws ParseException {
        if (document == null) {
            throw new IllegalArgumentException("Document cannot be null");
        }
        final int documentLength = document.length;
        this.parse(document, 0, documentLength, handler);
    }


    /**
     * <p>
     *   Parses the <tt>len</tt> chars of <tt>document</tt> starting at <tt>offset</tt>, reporting events
     *   to the specified handler. The array itself is used as the parsing buffer, so no pooled buffer
     *   is allocated.
     * </p>
     * <p>
     *   The specified handler is not called directly: it is wrapped in a
     *   <tt>MarkupEventProcessorHandler</tt> (and, when the configured mode is HTML, first in an
     *   <tt>HtmlMarkupHandler</tt>), and the resulting chain receives the configuration plus a fresh
     *   <tt>ParseStatus</tt> and <tt>ParseSelection</tt> for this parsing operation.
     * </p>
     *
     * @param document the document to be parsed. Cannot be null.
     * @param offset index of the first char of the document inside the array. Cannot be negative.
     * @param len number of chars to be parsed. Cannot be negative.
     * @param handler the handler to be notified of parsing events. Cannot be null.
     * @throws IllegalArgumentException if document or handler are null, if offset or len are negative,
     *         or if the range defined by offset and len does not fit inside the document array.
     * @throws ParseException if the document cannot be parsed.
     */
    public void parse(
            final char[] document, final int offset, final int len, final IMarkupHandler handler)
            throws ParseException {

        if (document == null) {
            throw new IllegalArgumentException("Document cannot be null");
        }
        if (offset < 0 || len < 0) {
            throw new IllegalArgumentException(
                    "Neither document offset (" + offset + ") nor document length (" +
                            len + ") can be less than zero");
        }

        if (handler == null) {
            throw new IllegalArgumentException("Handler cannot be null");
        }

        // Reject ranges running past the end of the array before any event is fired. Written as a
        // subtraction so that (offset + len) cannot overflow; both values are known to be >= 0 here.
        if (len > document.length - offset) {
            throw new IllegalArgumentException(
                    "Document offset (" + offset + ") plus document length (" + len +
                            ") exceeds the size of the document array (" + document.length + ")");
        }

        IMarkupHandler markupHandler =
                (ParseConfiguration.ParsingMode.HTML.equals(this.configuration.getMode()) ?
                        new HtmlMarkupHandler(handler) : handler);

        // We will not report directly to the specified handler, but instead to an intermediate class that will be in
        // charge of applying the required markup logic and rules, according to the specified configuration
        markupHandler = new MarkupEventProcessorHandler(markupHandler);

        markupHandler.setParseConfiguration(this.configuration);

        // Status and selection objects are created per parsing operation and handed to the handler chain
        final ParseStatus status = new ParseStatus();
        markupHandler.setParseStatus(status);

        final ParseSelection selection = new ParseSelection();
        markupHandler.setParseSelection(selection);

        // We already have a suitable char[] buffer, so there is no need to use one from the pool.
        parseDocument(document, offset, len, markupHandler, status);

    }



    /*
     * Reader-based entry point. Builds the handler chain around the user-provided handler, equips it
     * with the configuration plus a fresh status and selection, and then parses the reader contents
     * using a buffer of the pool's buffer size.
     */
    public void parse(
            final Reader reader, final IMarkupHandler handler)
            throws ParseException {

        if (reader == null) {
            throw new IllegalArgumentException("Reader cannot be null");
        }

        if (handler == null) {
            throw new IllegalArgumentException("Handler cannot be null");
        }

        final boolean htmlMode =
                ParseConfiguration.ParsingMode.HTML.equals(this.configuration.getMode());

        // In HTML mode the user handler is first wrapped in the HTML-specific handler
        IMarkupHandler chain = handler;
        if (htmlMode) {
            chain = new HtmlMarkupHandler(chain);
        }

        // Events are never reported straight to the user handler: an intermediate event processor sits on top
        // of the chain and applies the markup logic and rules required by the configuration
        chain = new MarkupEventProcessorHandler(chain);

        chain.setParseConfiguration(this.configuration);

        final ParseStatus parseStatus = new ParseStatus();
        chain.setParseStatus(parseStatus);

        chain.setParseSelection(new ParseSelection());

        // No char[] is available up-front here, so parsing will work on a buffer obtained from the pool.
        parseDocument(reader, this.pool.poolBufferSize, chain, parseStatus);

    }





    /*
     * This method receiving the buffer size with package visibility allows
     * testing different buffer sizes.
     *
     * Parses the whole contents of the reader, chunk by chunk:
     *
     *   1. Notifies the document start and obtains a buffer of the suggested size from the pool.
     *   2. Repeatedly fills the buffer from the reader and passes it to parseBuffer. Chars not consumed by
     *      parseBuffer (those from status.offset on) are moved to the start of the buffer before the next read;
     *      if nothing at all could be consumed from a full buffer, the buffer is replaced by one twice as big.
     *   3. Once the reader is exhausted, any remaining chars are reported as a final text event (or as a
     *      ParseException if they belong to an unfinished structure), and the document end is notified.
     *
     * The buffer is always released back to the pool and the reader is always closed, whatever the outcome.
     * Any exception other than ParseException is wrapped into a ParseException.
     */
    void parseDocument(
            final Reader reader, final int suggestedBufferSize,
            final IMarkupHandler handler, final ParseStatus status)
            throws ParseException {


        final long parsingStartTimeNanos = System.nanoTime();

        char[] buffer = null;

        try {

            handler.handleDocumentStart(parsingStartTimeNanos, 1, 1);

            int bufferSize = suggestedBufferSize;
            buffer = this.pool.allocateBuffer(bufferSize);

            int bufferContentSize = reader.read(buffer);

            // -1 on the very first read means the document is empty: the loop below is skipped entirely
            boolean cont = (bufferContentSize != -1);

            status.offset = -1;
            status.line = 1;
            status.col = 1;
            status.inStructure = false;
            status.parsingDisabled = true;
            status.parsingDisabledLimitSequence = null;
            status.autoCloseRequired = null;
            status.autoCloseLimits = null;

            while (cont) {

                parseBuffer(buffer, 0, bufferContentSize, handler, status);

                // Default for the next read: everything was consumed, so the whole buffer can be overwritten
                int readOffset = 0;
                int readLen = bufferSize;

                if (status.offset == 0) {
                    // Nothing in the buffer could be consumed: more input is needed after the current contents

                    if (bufferContentSize == bufferSize) {
                        // Buffer is not big enough, double it!

                        char[] newBuffer = null;
                        try {

                            bufferSize *= 2;

                            newBuffer = this.pool.allocateBuffer(bufferSize);
                            System.arraycopy(buffer, 0, newBuffer, 0, bufferContentSize);

                            this.pool.releaseBuffer(buffer);

                            buffer = newBuffer;

                        } catch (final Exception e) {
                            // Growing the buffer failed. Parsing cannot go on (the pending structure does not
                            // fit in the current buffer, and bufferSize no longer matches it), so release the
                            // new buffer and report the real cause instead of silently continuing.
                            this.pool.releaseBuffer(newBuffer);
                            throw new ParseException(e);
                        }

                    }

                    // it's possible for two reads to occur in a row and 1) read less than the bufferSize and 2)
                    // still not find the next tag/end of structure
                    readOffset = bufferContentSize;
                    readLen = bufferSize - readOffset;

                } else if (status.offset < bufferContentSize) {
                    // Only part of the buffer was consumed: move the unconsumed tail to the start of the buffer
                    // and fill the rest of the buffer with new input

                    System.arraycopy(buffer, status.offset, buffer, 0, bufferContentSize - status.offset);

                    readOffset = bufferContentSize - status.offset;
                    readLen = bufferSize - readOffset;

                    status.offset = 0;
                    bufferContentSize = readOffset;

                }

                final int read = reader.read(buffer, readOffset, readLen);
                if (read != -1) {
                    bufferContentSize = readOffset + read;
                } else {
                    // End of input: bufferContentSize is left untouched so that pending chars can be flushed below
                    cont = false;
                }

            }

            // Iteration done, now it's time to clean up in case we still have some text to be notified

            int lastLine = status.line;
            int lastCol = status.col;

            final int lastStart = status.offset;
            final int lastLen = bufferContentSize - lastStart;

            if (lastLen > 0) {

                if (status.inStructure) {
                    throw new ParseException(
                            "Incomplete structure: \"" + new String(buffer, lastStart, lastLen) + "\"", status.line, status.col);
                }

                handler.handleText(buffer, lastStart, lastLen, status.line, status.col);

                // As we have produced an additional text event, we need to fast-forward the
                // lastLine and lastCol position to include the last text structure.
                for (int i = lastStart; i < (lastStart + lastLen); i++) {
                    final char c = buffer[i];
                    if (c == '\n') {
                        lastLine++;
                        lastCol = 1;
                    } else {
                        lastCol++;
                    }

                }

            }

            final long parsingEndTimeNanos = System.nanoTime();
            handler.handleDocumentEnd(parsingEndTimeNanos, (parsingEndTimeNanos - parsingStartTimeNanos), lastLine, lastCol);

        } catch (final ParseException e) {
            throw e;
        } catch (final Exception e) {
            throw new ParseException(e);
        } finally {
            this.pool.releaseBuffer(buffer);
            try {
                reader.close();
            } catch (final Throwable ignored) {
                // This exception can be safely ignored
            }
        }

    }









    /*
     * Counterpart of the Reader-based parseDocument for documents that are already available as a char[]:
     * the specified array section is parsed in place with a single parseBuffer call, so no buffer has to be
     * obtained from the pool. Chars left unconsumed by that call are reported as a final text event (or as a
     * ParseException if they belong to an unfinished structure) before the document end is notified.
     * Any exception other than ParseException is wrapped into a ParseException.
     */
    void parseDocument(
            final char[] buffer, final int offset, final int len,
            final IMarkupHandler handler, final ParseStatus status)
            throws ParseException {

        final long startNanos = System.nanoTime();

        try {

            handler.handleDocumentStart(startNanos, 1, 1);

            // Bring the status object to its initial values for this document
            status.offset = -1;
            status.line = 1;
            status.col = 1;
            status.inStructure = false;
            status.parsingDisabled = true;
            status.parsingDisabledLimitSequence = null;
            status.autoCloseRequired = null;
            status.autoCloseLimits = null;

            parseBuffer(buffer, offset, len, handler, status);

            // Position at which the document will be considered finished, unless trailing text moves it forward
            int endLine = status.line;
            int endCol = status.col;

            // Whatever lies between the status offset and the end of the document section is still unreported
            final int pendingStart = status.offset;
            final int pendingEnd = offset + len;
            final int pendingLen = pendingEnd - pendingStart;

            if (pendingLen > 0) {

                if (status.inStructure) {
                    final String incomplete = new String(buffer, pendingStart, pendingLen);
                    throw new ParseException(
                            "Incomplete structure: \"" + incomplete + "\"", endLine, endCol);
                }

                handler.handleText(buffer, pendingStart, pendingLen, endLine, endCol);

                // The trailing text has just been reported, so the end position has to be advanced over it:
                // every line feed starts a new line at column 1, any other char moves one column forward.
                int pos = pendingStart;
                while (pos < pendingEnd) {
                    if (buffer[pos] == '\n') {
                        endLine++;
                        endCol = 1;
                    } else {
                        endCol++;
                    }
                    pos++;
                }

            }

            final long endNanos = System.nanoTime();
            handler.handleDocumentEnd(endNanos, (endNanos - startNanos), endLine, endCol);

        } catch (final ParseException e) {
            throw e;
        } catch (final Exception e) {
            throw new ParseException(e);
        }

    }












    
    
    private void parseBuffer(
            final char[] buffer, final int offset, final int len,
            final IMarkupHandler handler,
            final ParseStatus status)
            throws ParseException {


        final int[] locator = new int[] {status.line, status.col};
        
        int currentLine;
        int currentCol;
        
        final int maxi = offset + len;
        int i = offset;
        int current = i;

        boolean inStructure;

        boolean inOpenElement = false;
        boolean inCloseElement = false;
        boolean inComment = false;
        boolean inCdata = false;
        boolean inDocType = false;
        boolean inXmlDeclaration = false;
        boolean inProcessingInstruction = false;

        int tagStart;
        int tagEnd;
        
        while (i < maxi) {

            currentLine = locator[0];
            currentCol = locator[1];

            if (status.parsingDisabledLimitSequence != null) {
                // We need to disable parsing until we find a specific character sequence.
                // This allows correct parsing of CDATA (not PCDATA) sections (e.g.