All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.attoparser.MarkupParser Maven / Gradle / Ivy

There is a newer version: 2.0.7.RELEASE
Show newest version
/*
 * =============================================================================
 * 
 *   Copyright (c) 2012-2014, The ATTOPARSER team (http://www.attoparser.org)
 * 
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 * 
 *       http://www.apache.org/licenses/LICENSE-2.0
 * 
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 * 
 * =============================================================================
 */
package org.attoparser;

import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;

import org.attoparser.config.ParseConfiguration;
import org.attoparser.select.ParseSelection;


/**
 * 

 * <p>
 *   Default implementation of the {@link IMarkupParser} interface.
 * </p>
 * <p>
 *   AttoParser markup parsers work as SAX-style parsers that need
 *   a markup handler object for handling parsing events. These handlers implement
 *   the {@link org.attoparser.IMarkupHandler} interface, and are normally developed by
 *   users in order to perform the operations they require for their applications.
 * </p>
 * <p>
 *   See the documentation of the {@link org.attoparser.IMarkupHandler} interface for more
 *   information on the event handler methods, and also on the handler implementations
 *   AttoParser provides out-of-the-box.
 * </p>
 * <p>
 *   Also, note there are two different specialized parsers that use
 *   {@link org.attoparser.MarkupParser} underneath, but which are oriented towards allowing
 *   an easy use of specific parsing features: {@link org.attoparser.dom.IDOMMarkupParser} for
 *   DOM-oriented parsing and {@link org.attoparser.simple.ISimpleMarkupParser} for using
 *   a simplified version of the handler interface ({@link org.attoparser.simple.ISimpleMarkupHandler}).
 * </p>

*

 * <p>
 *   Sample usage:
 * </p>
 * <pre><code>
 *   // Obtain a java.io.Reader on the document to be parsed
 *   final Reader documentReader = ...;
 *
 *   // Create the handler instance. Extending the no-op AbstractMarkupHandler is a good start
 *   final IMarkupHandler handler = new AbstractMarkupHandler() {
 *       ... // some events implemented
 *   };
 *
 *   // Create or obtain the parser instance (can be reused). Example uses the default configuration for HTML
 *   final IMarkupParser parser = new MarkupParser(ParseConfiguration.htmlConfiguration());
 *
 *   // Parse it!
 *   parser.parse(documentReader, handler);
 * </code></pre>

 * <p>
 *   This parser class is <strong>thread-safe</strong>. However, take into account that, normally,
 *   {@link IMarkupHandler} implementations are not. So, even if parsers can be reused, handler objects
 *   usually cannot.
 * </p>
 * <p>
 *   This parser class uses a (configurable) pool of {@code char[]} buffers, in order to reduce the amount of
 *   memory used for parsing (buffers are large structures). This pool works in a non-blocking mode,
 *   so if a new buffer is needed and all are currently allocated, a new (unpooled) {@code char[]} object
 *   is created and returned without waiting for a pooled buffer to be available.
 * </p>
 * <p>
 *   (Note that these pooled buffers will not be used when parsing documents specified as {@code char[]}
 *   objects. In such case, the {@code char[]} documents themselves will be used as buffers, avoiding the need
 *   to allocate pooled buffers or use any additional amount of memory.)
 * </p>

* * @author Daniel Fernández * * @since 2.0.0 * */ public final class MarkupParser implements IMarkupParser { /** *

* Default size, in chars, of the buffers used for parsing: 4096 chars (= 8192 bytes).
     * The buffer will grow at runtime if an artifact (structure or text) is bigger
     * than the whole buffer.
     */
    public static final int DEFAULT_BUFFER_SIZE = 4096;

    /**
     * Default number of buffers kept in the pool: 2. Buffers are kept in a pool and
     * reused in order to increase performance. The pool is non-exclusive: if the pool
     * size is 2 and a 3rd request arrives, that request is assigned a new buffer object
     * which is not linked to the pool and is therefore garbage-collected at the end.
     */
    public static final int DEFAULT_POOL_SIZE = 2;

    /** Parsing configuration this parser was created with. */
    private final ParseConfiguration configuration;

    /** Pool of buffers this parser was created with. */
    private final BufferPool pool;

    /**
     * Creates a new instance of this parser, using the specified configuration and the default
     * sizes for the pool ({@link #DEFAULT_POOL_SIZE}) and for the pooled buffers
     * ({@link #DEFAULT_BUFFER_SIZE}).
     *
     * @param configuration the parsing configuration to be used.
     */
    public MarkupParser(final ParseConfiguration configuration) {
        // Simply delegate to the full constructor, filling in the default sizes.
        this(configuration, DEFAULT_POOL_SIZE, DEFAULT_BUFFER_SIZE);
    }

    /**
     *

     * <p>
     *   Creates a new instance of this parser, specifying the pool and buffer size.
     * </p>
     * <p>
     *   Buffer size (in chars) will be the size of the {@code char[]} structures used as buffers for parsing,
     *   which might grow if a certain markup structure does not fit inside (e.g. a text). Default size is
     *   {@link MarkupParser#DEFAULT_BUFFER_SIZE}.
     * </p>
     * <p>
     *   Pool size is the size of the pool of {@code char[]} buffers that will be kept in memory in order to
     *   allow their reuse. This pool works in a non-exclusive mode, so that if pool size is 3 and a 4th request
     *   arrives, it is served a new non-pooled buffer without the need to block waiting for one of the pooled
     *   instances. Default size is {@link MarkupParser#DEFAULT_POOL_SIZE}.
     * </p>
     * <p>
     *   Note that these pooled buffers will not be used when parsing documents specified as {@code char[]}
     *   objects. In such case, the {@code char[]} documents themselves will be used as buffers, avoiding the need
     *   to allocate buffers or use any additional amount of memory.
     * </p>

* * @param configuration the parsing configuration to be used. * @param poolSize the size of the pool of buffers to be used. * @param bufferSize the default size of the buffers to be instanced for this parser. */ public MarkupParser(final ParseConfiguration configuration, final int poolSize, final int bufferSize) { super(); this.configuration = configuration; this.pool = new BufferPool(poolSize, bufferSize); } public void parse(final String document, final IMarkupHandler handler) throws ParseException { if (document == null) { throw new IllegalArgumentException("Document cannot be null"); } parse(new StringReader(document), handler); } public void parse(final char[] document, final IMarkupHandler handler) throws ParseException { if (document == null) { throw new IllegalArgumentException("Document cannot be null"); } parse(document, 0, document.length, handler); } public void parse( final char[] document, final int offset, final int len, final IMarkupHandler handler) throws ParseException { if (document == null) { throw new IllegalArgumentException("Document cannot be null"); } if (offset < 0 || len < 0) { throw new IllegalArgumentException( "Neither document offset (" + offset + ") nor document length (" + len + ") can be less than zero"); } if (handler == null) { throw new IllegalArgumentException("Handler cannot be null"); } IMarkupHandler markupHandler = (ParseConfiguration.ParsingMode.HTML.equals(this.configuration.getMode()) ? 
new HtmlMarkupHandler(handler) : handler); // We will not report directly to the specified handler, but instead to an intermediate class that will be in // charge of applying the required markup logic and rules, according to the specified configuration markupHandler = new MarkupEventProcessorHandler(markupHandler); markupHandler.setParseConfiguration(this.configuration); final ParseStatus status = new ParseStatus(); markupHandler.setParseStatus(status); final ParseSelection selection = new ParseSelection(); markupHandler.setParseSelection(selection); // We already have a suitable char[] buffer, so there is no need to use one from the pool. parseDocument(document, offset, len, markupHandler, status); } public void parse( final Reader reader, final IMarkupHandler handler) throws ParseException { if (reader == null) { throw new IllegalArgumentException("Reader cannot be null"); } if (handler == null) { throw new IllegalArgumentException("Handler cannot be null"); } IMarkupHandler markupHandler = (ParseConfiguration.ParsingMode.HTML.equals(this.configuration.getMode()) ? new HtmlMarkupHandler(handler) : handler); // We will not report directly to the specified handler, but instead to an intermediate class that will be in // charge of applying the required markup logic and rules, according to the specified configuration markupHandler = new MarkupEventProcessorHandler(markupHandler); markupHandler.setParseConfiguration(this.configuration); final ParseStatus status = new ParseStatus(); markupHandler.setParseStatus(status); final ParseSelection selection = new ParseSelection(); markupHandler.setParseSelection(selection); // We don't already have a suitable char[] buffer, so we expect the parser to use one of its pooled buffers. parseDocument(reader, this.pool.poolBufferSize, markupHandler, status); } /* * This method receiving the buffer size with package visibility allows * testing different buffer sizes. 
*/ void parseDocument( final Reader reader, final int suggestedBufferSize, final IMarkupHandler handler, final ParseStatus status) throws ParseException { final long parsingStartTimeNanos = System.nanoTime(); char[] buffer = null; try { handler.handleDocumentStart(parsingStartTimeNanos, 1, 1); int bufferSize = suggestedBufferSize; buffer = this.pool.allocateBuffer(bufferSize); int bufferContentSize = reader.read(buffer); boolean cont = (bufferContentSize != -1); status.offset = -1; status.line = 1; status.col = 1; status.inStructure = false; status.parsingDisabled = true; status.parsingDisabledLimitSequence = null; status.autoCloseRequired = null; status.autoCloseLimits = null; while (cont) { parseBuffer(buffer, 0, bufferContentSize, handler, status); int readOffset = 0; int readLen = bufferSize; if (status.offset == 0) { if (bufferContentSize == bufferSize) { // Buffer is not big enough, double it! char[] newBuffer = null; try { bufferSize *= 2; newBuffer = this.pool.allocateBuffer(bufferSize); System.arraycopy(buffer, 0, newBuffer, 0, bufferContentSize); this.pool.releaseBuffer(buffer); buffer = newBuffer; } catch (final Exception ignored) { this.pool.releaseBuffer(newBuffer); } } // it's possible for two reads to occur in a row and 1) read less than the bufferSize and 2) // still not find the next tag/end of structure readOffset = bufferContentSize; readLen = bufferSize - readOffset; } else if (status.offset < bufferContentSize) { System.arraycopy(buffer, status.offset, buffer, 0, bufferContentSize - status.offset); readOffset = bufferContentSize - status.offset; readLen = bufferSize - readOffset; status.offset = 0; bufferContentSize = readOffset; } final int read = reader.read(buffer, readOffset, readLen); if (read != -1) { bufferContentSize = readOffset + read; } else { cont = false; } } // Iteration done, now it's time to clean up in case we still have some text to be notified int lastLine = status.line; int lastCol = status.col; final int lastStart = 
status.offset; final int lastLen = bufferContentSize - lastStart; if (lastLen > 0) { if (status.inStructure) { throw new ParseException( "Incomplete structure: \"" + new String(buffer, lastStart, lastLen) + "\"", status.line, status.col); } handler.handleText(buffer, lastStart, lastLen, status.line, status.col); // As we have produced an additional text event, we need to fast-forward the // lastLine and lastCol position to include the last text structure. for (int i = lastStart; i < (lastStart + lastLen); i++) { final char c = buffer[i]; if (c == '\n') { lastLine++; lastCol = 1; } else { lastCol++; } } } final long parsingEndTimeNanos = System.nanoTime(); handler.handleDocumentEnd(parsingEndTimeNanos, (parsingEndTimeNanos - parsingStartTimeNanos), lastLine, lastCol); } catch (final ParseException e) { throw e; } catch (final Exception e) { throw new ParseException(e); } finally { this.pool.releaseBuffer(buffer); try { reader.close(); } catch (final Throwable ignored) { // This exception can be safely ignored } } } /* * This method is roughly equivalent to the one receiving a Reader, but oriented to parsing an already-existing * buffer without the need to allocate one from the pool. 
*/
    /*
     * Parses the (offset, len) fragment of 'buffer' in a single pass, reporting events to 'handler':
     * document start, whatever parseBuffer reports, any text left pending after it, and document end.
     *
     * Throws ParseException if the fragment ends with an incomplete structure; any other
     * exception raised while parsing is wrapped into a ParseException.
     */
    void parseDocument(
            final char[] buffer, final int offset, final int len,
            final IMarkupHandler handler, final ParseStatus status)
            throws ParseException {

        final long startNanos = System.nanoTime();

        try {

            handler.handleDocumentStart(startNanos, 1, 1);

            // Reset the status before handing the buffer over to parseBuffer.
            status.offset = -1;
            status.line = 1;
            status.col = 1;
            status.inStructure = false;
            status.parsingDisabled = true;
            status.parsingDisabledLimitSequence = null;
            status.autoCloseRequired = null;
            status.autoCloseLimits = null;

            parseBuffer(buffer, offset, len, handler, status);

            // Whatever lies between status.offset and the end of the fragment has not been
            // reported yet, and still has to be notified (as text).
            int endLine = status.line;
            int endCol = status.col;

            final int pendingStart = status.offset;
            final int pendingEnd = offset + len;
            final int pendingLen = pendingEnd - pendingStart;

            if (pendingLen > 0) {

                if (status.inStructure) {
                    // The document finished in the middle of a structure.
                    throw new ParseException(
                            "Incomplete structure: \"" + new String(buffer, pendingStart, pendingLen) + "\"",
                            status.line, status.col);
                }

                handler.handleText(buffer, pendingStart, pendingLen, status.line, status.col);

                // The text event just produced moves the end-of-document position forward:
                // every line feed starts a new line, any other char advances one column.
                for (int pos = pendingStart; pos < pendingEnd; pos++) {
                    if (buffer[pos] == '\n') {
                        endLine++;
                        endCol = 1;
                    } else {
                        endCol++;
                    }
                }

            }

            final long endNanos = System.nanoTime();
            handler.handleDocumentEnd(endNanos, (endNanos - startNanos), endLine, endCol);

        } catch (final ParseException e) {
            throw e;
        } catch (final Exception e) {
            throw new ParseException(e);
        }

    }


    private void parseBuffer(
            final char[] buffer, final int offset, final int len,
            final IMarkupHandler handler, final ParseStatus status)
            throws ParseException {

        final int[] locator = new int[] {status.line, status.col};

        int currentLine;
        int currentCol;

        final int maxi = offset + len;
        int i = offset;
        int current = i;

        boolean inStructure;

        boolean inOpenElement = false;
        boolean inCloseElement = false;
        boolean inComment = false;
        boolean inCdata = false;
        boolean inDocType = false;
        boolean inXmlDeclaration = false;
        boolean inProcessingInstruction = false;

        int tagStart;
        int tagEnd;

        while (i < maxi) {

            currentLine = locator[0];
            currentCol = locator[1];

            if (status.parsingDisabledLimitSequence != null) {
                // We need to disable parsing until we find a specific character sequence.
                // This allows correct parsing of CDATA (not PCDATA) sections (e.g.