// org.attoparser.MarkupParser Maven / Gradle / Ivy
// Show all versions of attoparser Show documentation
/*
* =============================================================================
*
* Copyright (c) 2012-2014, The ATTOPARSER team (http://www.attoparser.org)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* =============================================================================
*/
package org.attoparser;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import org.attoparser.config.ParseConfiguration;
import org.attoparser.select.ParseSelection;
/**
*
* Default implementation of the {@link IMarkupParser} interface.
*
*
* AttoParser markup parsers work as SAX-style parsers that need
* a markup handler object for handling parsing events. These handlers implement
* the {@link org.attoparser.IMarkupHandler} interface, and are normally developed by
* users in order to perform the operations they require for their applications.
*
*
* See the documentation of the {@link org.attoparser.IMarkupHandler} interface for more
* information on the event handler methods, and also on the handler implementations
* AttoParser provides out-of-the-box.
*
*
* Also, note there are two different specialized parsers that use
* {@link org.attoparser.MarkupParser} underneath, but which are oriented towards allowing
* an easy use of specific parsing features: {@link org.attoparser.dom.IDOMMarkupParser} for
* DOM-oriented parsing and {@link org.attoparser.simple.ISimpleMarkupParser} for using
* a simplified version of the handler interface ({@link org.attoparser.simple.ISimpleMarkupHandler}).
*
*
* <p>
*   Sample usage:
* </p>
* <pre><code>
*   // Obtain a java.io.Reader on the document to be parsed
*   final Reader documentReader = ...;
*
*   // Create the handler instance. Extending the no-op AbstractMarkupHandler is a good start
*   final IMarkupHandler handler = new AbstractMarkupHandler() {
*       ... // some events implemented
*   };
*
*   // Create or obtain the parser instance (can be reused). Example uses the default configuration for HTML
*   final IMarkupParser parser = new MarkupParser(ParseConfiguration.htmlConfiguration());
*
*   // Parse it!
*   parser.parse(documentReader, handler);
* </code></pre>
*
* This parser class is thread-safe. However, take into account that, normally,
* {@link IMarkupHandler} implementations are not. So, even if parsers can be reused, handler objects
* usually cannot.
*
*
* This parser class uses a (configurable) pool of char[] buffers, in order to reduce the amount of
* memory used for parsing (buffers are large structures). This pool works in a non-blocking mode,
* so if a new buffer is needed and all are currently allocated, a new (unpooled) char[] object
* is created and returned without waiting for a pooled buffer to be available.
*
*
* (Note that these pooled buffers will not be used when parsing documents specified as char[]
* objects. In such case, the char[] documents themselves will be used as buffers, avoiding the need
* to allocate pooled buffers or use any additional amount of memory.)
*
*
* @author Daniel Fernández
*
* @since 2.0.0
*
*/
public final class MarkupParser implements IMarkupParser {
/**
 * <p>
 *   Default buffer size to be used (buffer size will grow at runtime if
 *   an artifact (structure or text) is bigger than the whole buffer).
 *   Value: 4096 chars (= 8192 bytes).
 * </p>
 */
public static final int DEFAULT_BUFFER_SIZE = 4096;
/**
 * <p>
 *   Default pool size to be used. Buffers will be kept in a pool and
 *   reused in order to increase performance. Pool will be non-exclusive
 *   so that if pool size = 2 and a 3rd request arrives, it is assigned
 *   a new buffer object (not linked to the pool, and therefore GC-ed
 *   at the end). Value: 2.
 * </p>
 */
public static final int DEFAULT_POOL_SIZE = 2;
// Parsing configuration; it is handed to the handler chain at the start of every parse operation.
private final ParseConfiguration configuration;
// Source of the char[] buffers used when parsing from a Reader (char[] documents are parsed in place).
private final BufferPool pool;
/**
 * <p>
 *   Builds a parser for the given configuration, relying on the default pool size
 *   ({@link #DEFAULT_POOL_SIZE}) and the default size for pooled buffers
 *   ({@link #DEFAULT_BUFFER_SIZE}).
 * </p>
 *
 * @param configuration the parsing configuration to be used.
 */
public MarkupParser(final ParseConfiguration configuration) {
    this(configuration, MarkupParser.DEFAULT_POOL_SIZE, MarkupParser.DEFAULT_BUFFER_SIZE);
}
/**
 * <p>
 *   Creates a new instance of this parser, specifying the pool and buffer size.
 * </p>
 * <p>
 *   Buffer size (in chars) will be the size of the <tt>char[]</tt> structures used as buffers for parsing,
 *   which might grow if a certain markup structure does not fit inside (e.g. a text). Default size is
 *   {@link MarkupParser#DEFAULT_BUFFER_SIZE}.
 * </p>
 * <p>
 *   Pool size is the size of the pool of <tt>char[]</tt> buffers that will be kept in memory in order to
 *   allow their reuse. This pool works in a non-exclusive mode, so that if pool size is 3 and a 4th request
 *   arrives, it is served a new non-pooled buffer without the need to block waiting for one of the pooled
 *   instances. Default size is {@link MarkupParser#DEFAULT_POOL_SIZE}.
 * </p>
 * <p>
 *   Note that these pooled buffers will not be used when parsing documents specified as <tt>char[]</tt>
 *   objects. In such case, the <tt>char[]</tt> documents themselves will be used as buffers, avoiding the need
 *   to allocate buffers or use any additional amount of memory.
 * </p>
 *
 * @param configuration the parsing configuration to be used.
 * @param poolSize the size of the pool of buffers to be used.
 * @param bufferSize the default size of the buffers to be instanced for this parser.
 * @throws IllegalArgumentException if the configuration is null, the pool size is negative or the
 *         buffer size is not greater than zero.
 */
public MarkupParser(final ParseConfiguration configuration, final int poolSize, final int bufferSize) {
    super();
    // Fail at construction time: a null configuration would otherwise only surface as a
    // NullPointerException when the first document is parsed.
    if (configuration == null) {
        throw new IllegalArgumentException("Configuration cannot be null");
    }
    if (poolSize < 0) {
        throw new IllegalArgumentException("Pool size cannot be less than zero: " + poolSize);
    }
    // A buffer without capacity can never receive any chars from a Reader.
    if (bufferSize <= 0) {
        throw new IllegalArgumentException("Buffer size must be greater than zero: " + bufferSize);
    }
    this.configuration = configuration;
    this.pool = new BufferPool(poolSize, bufferSize);
}
/**
 * <p>
 *   Parses a document held in a <tt>String</tt>, by wrapping it in a {@link StringReader} and
 *   delegating to the {@link Reader}-based parse method.
 * </p>
 *
 * @param document the document to be parsed.
 * @param handler the handler that will receive the parsing events.
 * @throws ParseException if the document cannot be parsed.
 */
public void parse(final String document, final IMarkupHandler handler)
        throws ParseException {

    if (null == document) {
        throw new IllegalArgumentException("Document cannot be null");
    }

    final Reader documentReader = new StringReader(document);
    parse(documentReader, handler);

}
/**
 * <p>
 *   Parses a whole <tt>char[]</tt> document, i.e. the range starting at position 0 and
 *   spanning the entire array.
 * </p>
 *
 * @param document the document to be parsed.
 * @param handler the handler that will receive the parsing events.
 * @throws ParseException if the document cannot be parsed.
 */
public void parse(final char[] document, final IMarkupHandler handler)
        throws ParseException {

    if (null == document) {
        throw new IllegalArgumentException("Document cannot be null");
    }

    final int documentLen = document.length;
    parse(document, 0, documentLen, handler);

}
/**
 * <p>
 *   Parses a fragment of a <tt>char[]</tt> document. The array itself is used as the parsing
 *   buffer, so no buffer is taken from the pool.
 * </p>
 *
 * @param document the array containing the document to be parsed.
 * @param offset the position in the array at which the document starts.
 * @param len the length (in chars) of the document.
 * @param handler the handler that will receive the parsing events.
 * @throws IllegalArgumentException if the document or the handler are null, if offset or length are
 *         negative, or if the specified range does not fit inside the document array.
 * @throws ParseException if the document cannot be parsed.
 */
public void parse(
        final char[] document, final int offset, final int len, final IMarkupHandler handler)
        throws ParseException {

    if (document == null) {
        throw new IllegalArgumentException("Document cannot be null");
    }
    if (offset < 0 || len < 0) {
        throw new IllegalArgumentException(
                "Neither document offset (" + offset + ") nor document length (" +
                len + ") can be less than zero");
    }
    if (handler == null) {
        throw new IllegalArgumentException("Handler cannot be null");
    }
    // Reject ranges running past the end of the array before any event is fired. Written as a
    // subtraction so that (offset + len) cannot overflow.
    if (len > document.length - offset) {
        throw new IllegalArgumentException(
                "Document offset (" + offset + ") plus document length (" + len +
                ") cannot exceed the size of the document array (" + document.length + ")");
    }

    IMarkupHandler markupHandler =
            (ParseConfiguration.ParsingMode.HTML.equals(this.configuration.getMode()) ?
                    new HtmlMarkupHandler(handler) : handler);

    // We will not report directly to the specified handler, but instead to an intermediate class that will be in
    // charge of applying the required markup logic and rules, according to the specified configuration
    markupHandler = new MarkupEventProcessorHandler(markupHandler);

    markupHandler.setParseConfiguration(this.configuration);

    final ParseStatus status = new ParseStatus();
    markupHandler.setParseStatus(status);

    final ParseSelection selection = new ParseSelection();
    markupHandler.setParseSelection(selection);

    // We already have a suitable char[] buffer, so there is no need to use one from the pool.
    parseDocument(document, offset, len, markupHandler, status);

}
/**
 * <p>
 *   Parses the document served by a {@link Reader}. Parsing is performed on a buffer sized
 *   according to the pool's buffer size, and the reader is closed once parsing finishes.
 * </p>
 *
 * @param reader the reader on the document to be parsed.
 * @param handler the handler that will receive the parsing events.
 * @throws ParseException if the document cannot be parsed.
 */
public void parse(
        final Reader reader, final IMarkupHandler handler)
        throws ParseException {

    if (null == reader) {
        throw new IllegalArgumentException("Reader cannot be null");
    }
    if (null == handler) {
        throw new IllegalArgumentException("Handler cannot be null");
    }

    // In HTML mode the user handler is wrapped first by the HTML-specific handler.
    final IMarkupHandler targetHandler;
    if (ParseConfiguration.ParsingMode.HTML.equals(this.configuration.getMode())) {
        targetHandler = new HtmlMarkupHandler(handler);
    } else {
        targetHandler = handler;
    }

    // Events are not reported directly to the target handler: an intermediate processor applies the
    // markup logic and rules required by the configuration before passing them on.
    final IMarkupHandler eventProcessor = new MarkupEventProcessorHandler(targetHandler);
    eventProcessor.setParseConfiguration(this.configuration);

    final ParseStatus parseStatus = new ParseStatus();
    eventProcessor.setParseStatus(parseStatus);

    final ParseSelection parseSelection = new ParseSelection();
    eventProcessor.setParseSelection(parseSelection);

    // No char[] is available up front, so the parsing routine is expected to obtain a buffer from the pool.
    parseDocument(reader, this.pool.poolBufferSize, eventProcessor, parseStatus);

}
/*
 * Parses the document served by the specified Reader, reading it in chunks into a char[] buffer
 * obtained from the pool. The buffer is released and the reader is closed when parsing ends,
 * whether it succeeds or fails.
 *
 * After each call to parseBuffer, the chars from status.offset up to the end of the buffer contents
 * are carried over (moved to the start of the buffer) so that they are parsed again together with
 * the next chunk read. If the buffer is full and nothing at all could be left behind, the buffer
 * is replaced by one of twice its size.
 *
 * Any exception other than ParseException is wrapped in a ParseException.
 *
 * This method receiving the buffer size with package visibility allows
 * testing different buffer sizes.
 */
void parseDocument(
        final Reader reader, final int suggestedBufferSize,
        final IMarkupHandler handler, final ParseStatus status)
        throws ParseException {

    final long parsingStartTimeNanos = System.nanoTime();

    char[] buffer = null;

    try {

        handler.handleDocumentStart(parsingStartTimeNanos, 1, 1);

        int bufferSize = suggestedBufferSize;
        buffer = this.pool.allocateBuffer(bufferSize);

        int bufferContentSize = reader.read(buffer);

        boolean cont = (bufferContentSize != -1);

        status.offset = -1;
        status.line = 1;
        status.col = 1;
        status.inStructure = false;
        status.parsingDisabled = true;
        status.parsingDisabledLimitSequence = null;
        status.autoCloseRequired = null;
        status.autoCloseLimits = null;

        while (cont) {

            parseBuffer(buffer, 0, bufferContentSize, handler, status);

            int readOffset = 0;
            int readLen = bufferSize;

            if (status.offset == 0) {

                if (bufferContentSize == bufferSize) {
                    // Buffer is not big enough, double it!

                    // Doubling must not overflow: a negative size would leave bufferSize out of
                    // sync with the real buffer and make the next read fail with an unrelated error.
                    if (bufferSize > (Integer.MAX_VALUE / 2)) {
                        throw new ParseException(
                                "Cannot grow parsing buffer beyond " + bufferSize + " chars",
                                status.line, status.col);
                    }

                    final int grownBufferSize = bufferSize * 2;

                    // Allocation failures are not swallowed: they reach the outer handlers with their
                    // real cause, and bufferSize is only updated once the new buffer exists.
                    final char[] grownBuffer = this.pool.allocateBuffer(grownBufferSize);
                    final char[] previousBuffer = buffer;

                    // From this point on, the outer 'finally' takes care of releasing the new buffer.
                    buffer = grownBuffer;
                    bufferSize = grownBufferSize;

                    try {
                        System.arraycopy(previousBuffer, 0, grownBuffer, 0, bufferContentSize);
                    } finally {
                        this.pool.releaseBuffer(previousBuffer);
                    }

                }

                // it's possible for two reads to occur in a row and 1) read less than the bufferSize and 2)
                // still not find the next tag/end of structure
                readOffset = bufferContentSize;
                readLen = bufferSize - readOffset;

            } else if (status.offset < bufferContentSize) {

                // Move the pending chars to the start of the buffer, and read the next chunk after them.
                System.arraycopy(buffer, status.offset, buffer, 0, bufferContentSize - status.offset);

                readOffset = bufferContentSize - status.offset;
                readLen = bufferSize - readOffset;

                status.offset = 0;
                bufferContentSize = readOffset;

            }

            final int read = reader.read(buffer, readOffset, readLen);
            if (read != -1) {
                bufferContentSize = readOffset + read;
            } else {
                cont = false;
            }

        }

        // Iteration done, now it's time to clean up in case we still have some text to be notified

        int lastLine = status.line;
        int lastCol = status.col;

        final int lastStart = status.offset;
        final int lastLen = bufferContentSize - lastStart;

        if (lastLen > 0) {

            if (status.inStructure) {
                throw new ParseException(
                        "Incomplete structure: \"" + new String(buffer, lastStart, lastLen) + "\"", status.line, status.col);
            }

            handler.handleText(buffer, lastStart, lastLen, status.line, status.col);

            // As we have produced an additional text event, we need to fast-forward the
            // lastLine and lastCol position to include the last text structure.
            for (int i = lastStart; i < (lastStart + lastLen); i++) {
                final char c = buffer[i];
                if (c == '\n') {
                    lastLine++;
                    lastCol = 1;
                } else {
                    lastCol++;
                }
            }

        }

        final long parsingEndTimeNanos = System.nanoTime();
        handler.handleDocumentEnd(parsingEndTimeNanos, (parsingEndTimeNanos - parsingStartTimeNanos), lastLine, lastCol);

    } catch (final ParseException e) {
        throw e;
    } catch (final Exception e) {
        throw new ParseException(e);
    } finally {
        try {
            this.pool.releaseBuffer(buffer);
        } finally {
            // The reader must be closed even if releasing the buffer fails.
            try {
                reader.close();
            } catch (final Throwable ignored) {
                // This exception can be safely ignored
            }
        }
    }

}
/*
 * Counterpart of the Reader-based parsing routine for documents that already live in a char[]:
 * the array is parsed in place, in a single pass, so no buffer has to be obtained from the pool.
 *
 * Any exception other than ParseException is wrapped in a ParseException.
 */
void parseDocument(
        final char[] buffer, final int offset, final int len,
        final IMarkupHandler handler, final ParseStatus status)
        throws ParseException {

    final long startNanos = System.nanoTime();

    try {

        handler.handleDocumentStart(startNanos, 1, 1);

        // Put the status object in its initial state
        status.offset = -1;
        status.line = 1;
        status.col = 1;
        status.inStructure = false;
        status.parsingDisabled = true;
        status.parsingDisabledLimitSequence = null;
        status.autoCloseRequired = null;
        status.autoCloseLimits = null;

        parseBuffer(buffer, offset, len, handler, status);

        // The whole document has been through the parser: whatever lies between status.offset and the end
        // of the document has not been reported yet, so it is checked and notified here.

        int endLine = status.line;
        int endCol = status.col;

        final int pendingStart = status.offset;
        final int pendingEnd = offset + len;
        final int pendingLen = pendingEnd - pendingStart;

        if (pendingLen > 0) {

            if (status.inStructure) {
                throw new ParseException(
                        "Incomplete structure: \"" + new String(buffer, pendingStart, pendingLen) + "\"", status.line, status.col);
            }

            handler.handleText(buffer, pendingStart, pendingLen, status.line, status.col);

            // The extra text event moves the end-of-document position forward, so line and column
            // are advanced over the text just reported.
            for (int pos = pendingStart; pos < pendingEnd; pos++) {
                if (buffer[pos] == '\n') {
                    endLine++;
                    endCol = 1;
                } else {
                    endCol++;
                }
            }

        }

        final long endNanos = System.nanoTime();
        handler.handleDocumentEnd(endNanos, (endNanos - startNanos), endLine, endCol);

    } catch (final ParseException e) {
        throw e;
    } catch (final Exception e) {
        throw new ParseException(e);
    }

}
private void parseBuffer(
final char[] buffer, final int offset, final int len,
final IMarkupHandler handler,
final ParseStatus status)
throws ParseException {
final int[] locator = new int[] {status.line, status.col};
int currentLine;
int currentCol;
final int maxi = offset + len;
int i = offset;
int current = i;
boolean inStructure;
boolean inOpenElement = false;
boolean inCloseElement = false;
boolean inComment = false;
boolean inCdata = false;
boolean inDocType = false;
boolean inXmlDeclaration = false;
boolean inProcessingInstruction = false;
int tagStart;
int tagEnd;
while (i < maxi) {
currentLine = locator[0];
currentCol = locator[1];
if (status.parsingDisabledLimitSequence != null) {
// We need to disable parsing until we find a specific character sequence.
// This allows correct parsing of CDATA (not PCDATA) sections (e.g.