org.attoparser.markup.MarkupAttoParser Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of attoparser Show documentation
Powerful, fast and easy to use HTML and XML parser for Java
There is a newer version: 2.0.7.RELEASE
/*
 * =============================================================================
 * 
 *   Copyright (c) 2012, The ATTOPARSER team (http://www.attoparser.org)
 * 
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 * 
 *       http://www.apache.org/licenses/LICENSE-2.0
 * 
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 * 
 * =============================================================================
 */
package org.attoparser.markup;

import org.attoparser.AbstractBufferedAttoParser;
import org.attoparser.AttoParseException;
import org.attoparser.IAttoHandler;



/**
 * 
 *   Default implementation of the {@link org.attoparser.IAttoParser} interface, able of
 *   parsing XML and HTML markup.
 * 
 * 
 *   This parser reports as structures:
 * 
 * 
 *   Tags (a.k.a. elements): <body>, <img/>, 
 *       <div class="content">, etc.
 *   Comments: <!-- this is a comment -->
 *   CDATA sections: <![CDATA[ ... ]]>
 *   DOCTYPE clauses: <!DOCTYPE html>
 *   XML Declarations: <?xml version="1.0"?>
 *   Processing Instructions: <?xsl-stylesheet ...?>
 * 
 * 
 *   This parser class is thread-safe. But take into account that, usually, the 
 *   {@link IAttoHandler} implementations passed to parsers for event handling are not.
 * 
 * 
 * @author Daniel Fernández
 * 
 * @since 1.0
 *
 */
public final class MarkupAttoParser extends AbstractBufferedAttoParser {


    
    /**
     * 
     *   Creates a new instance of this parser.
     * 
     */
    public MarkupAttoParser() {
        super();
    }
    

    
    
    
    
    @Override
    protected final BufferParseResult parseBuffer(
            final char[] buffer, final int offset, final int len, 
            final IAttoHandler handler, final int line, final int col) 
            throws AttoParseException {


        final int[] locator = new int[] {line, col};
        
        int currentLine = locator[0];
        int currentCol = locator[1];
        
        final int maxi = offset + len;
        int i = offset;
        int current = i;
        
        boolean inStructure = false;
        
        boolean inOpenElement = false;
        boolean inCloseElement = false;
        boolean inComment = false;
        boolean inCdata = false;
        boolean inDocType = false;
        boolean inXmlDeclaration = false;
        boolean inProcessingInstruction = false;
        
        int tagStart = -1;
        int tagEnd = -1;
        
        while (i < maxi) {
            
            currentLine = locator[0];
            currentCol = locator[1];
            
            inStructure =
                    (inOpenElement || inCloseElement || inComment || inCdata || inDocType || inXmlDeclaration || inProcessingInstruction);
            
            if (!inStructure) {
                
                tagStart = MarkupParsingUtil.findNextStructureStart(buffer, i, maxi, locator);
                
                if (tagStart == -1) {
                    return new BufferParseResult(current, currentLine, currentCol, false);
                }

                inOpenElement = ElementMarkupParsingUtil.isOpenElementStart(buffer, tagStart, maxi);
                if (!inOpenElement) {
                    inCloseElement = ElementMarkupParsingUtil.isCloseElementStart(buffer, tagStart, maxi);
                    if (!inCloseElement) {
                        inComment = CommentMarkupParsingUtil.isCommentStart(buffer, tagStart, maxi);
                        if (!inComment) {
                            inCdata = CdataMarkupParsingUtil.isCdataStart(buffer, tagStart, maxi);
                            if (!inCdata) {
                                inDocType = DocTypeMarkupParsingUtil.isDocTypeStart(buffer, tagStart, maxi);
                                if (!inDocType) {
                                    inXmlDeclaration = XmlDeclarationMarkupParsingUtil.isXmlDeclarationStart(buffer, tagStart, maxi);
                                    if (!inXmlDeclaration) {
                                        inProcessingInstruction = ProcessingInstructionMarkupParsingUtil.isProcessingInstructionStart(buffer, tagStart, maxi);
                                    }
                                }
                            }
                        }
                    }
                }
                
                inStructure =
                        (inOpenElement || inCloseElement || inComment || inCdata || inDocType || inXmlDeclaration || inProcessingInstruction);
                
                
                while (!inStructure) {
                    // We found a '<', but it cannot be considered a tag because it is not
                    // the beginning of any known structure
                    
                    LocatorUtils.countChar(locator, buffer[tagStart]);
                    tagStart = MarkupParsingUtil.findNextStructureStart(buffer, tagStart + 1, maxi, locator);
                    
                    if (tagStart == -1) {
                        return new BufferParseResult(current, currentLine, currentCol, false);
                    }

                    inOpenElement = ElementMarkupParsingUtil.isOpenElementStart(buffer, tagStart, maxi);
                    if (!inOpenElement) {
                        inCloseElement = ElementMarkupParsingUtil.isCloseElementStart(buffer, tagStart, maxi);
                        if (!inCloseElement) {
                            inComment = CommentMarkupParsingUtil.isCommentStart(buffer, tagStart, maxi);
                            if (!inComment) {
                                inCdata = CdataMarkupParsingUtil.isCdataStart(buffer, tagStart, maxi);
                                if (!inCdata) {
                                    inDocType = DocTypeMarkupParsingUtil.isDocTypeStart(buffer, tagStart, maxi);
                                    if (!inDocType) {
                                        inXmlDeclaration = XmlDeclarationMarkupParsingUtil.isXmlDeclarationStart(buffer, tagStart, maxi);
                                        if (!inXmlDeclaration) {
                                            inProcessingInstruction = ProcessingInstructionMarkupParsingUtil.isProcessingInstructionStart(buffer, tagStart, maxi);
                                        }
                                    }
                                }
                            }
                        }
                    }
                    
                    inStructure =
                            (inOpenElement || inCloseElement || inComment || inCdata || inDocType || inXmlDeclaration || inProcessingInstruction);
                
                }
            
                
                if (tagStart > current) {
                    // We avoid empty-string text events
                    handler.handleText(
                            buffer, current, (tagStart - current), 
                            currentLine, currentCol);
                }
                
                current = tagStart;
                i = current;
                
            } else {

                // We do not include processing instructions here because their format
                // is undefined, and everything should be allowed except the "?>" sequence,
                // which will terminate the instruction.
                final boolean avoidQuotes =
                        (inOpenElement || inCloseElement || inDocType || inXmlDeclaration);

                
                tagEnd =
                        (inDocType?
                                DocTypeMarkupParsingUtil.findNextDocTypeStructureEnd(buffer, i, maxi, locator) :
                                (avoidQuotes?
                                        MarkupParsingUtil.findNextStructureEndAvoidQuotes(buffer, i, maxi, locator) :
                                        MarkupParsingUtil.findNextStructureEndDontAvoidQuotes(buffer, i, maxi, locator)));
                
                if (tagEnd < 0) {
                    // This is an unfinished structure
                    return new BufferParseResult(current, currentLine, currentCol, true);
                }

                
                if (inOpenElement) {
                    // This is a closing tag
                    
                    handler.handleStructure(buffer, current, (tagEnd - current) + 1, currentLine, currentCol);
                    inOpenElement = false;
                    
                } else if (inCloseElement) {
                    // This is a closing tag
                    
                    handler.handleStructure(buffer, current, (tagEnd - current) + 1, currentLine, currentCol);
                    inCloseElement = false;
                    
                } else if (inComment) {
                    // This is a comment! (obviously ;-))
                    
                    while (tagEnd - current < 7 || buffer[tagEnd - 1] != '-' || buffer[tagEnd - 2] != '-') {
                        // the '>' we chose is not the comment-closing one. Let's find again
                        
                        LocatorUtils.countChar(locator, buffer[tagEnd]);
                        tagEnd = MarkupParsingUtil.findNextStructureEndDontAvoidQuotes(buffer, tagEnd + 1, maxi, locator);
                        
                        if (tagEnd == -1) {
                            return new BufferParseResult(current, currentLine, currentCol, true);
                        }
                        
                    }
                    
                    handler.handleStructure(buffer, current, (tagEnd - current) + 1, currentLine, currentCol);
                    inComment = false;
                    
                } else if (inCdata) {
                    // This is a CDATA section
                    
                    while (tagEnd - current < 12 || buffer[tagEnd - 1] != ']' || buffer[tagEnd - 2] != ']') {
                        // the '>' we chose is not the comment-closing one. Let's find again
                        
                        LocatorUtils.countChar(locator, buffer[tagEnd]);
                        tagEnd = MarkupParsingUtil.findNextStructureEndDontAvoidQuotes(buffer, tagEnd + 1, maxi, locator);
                        
                        if (tagEnd == -1) {
                            return new BufferParseResult(current, currentLine, currentCol, true);
                        }
                        
                    }
                    
                    handler.handleStructure(buffer, current, (tagEnd - current) + 1, currentLine, currentCol);
                    inCdata = false;
                    
                } else if (inDocType) {
                    // This is a DOCTYPE clause
                    
                    handler.handleStructure(buffer, current, (tagEnd - current) + 1, currentLine, currentCol);
                    inDocType = false;
                    
                } else if (inXmlDeclaration) {
                    // This is an XML Declaration

                    handler.handleStructure(buffer, current, (tagEnd - current) + 1, currentLine, currentCol);
                    inXmlDeclaration = false;
                    
                } else if (inProcessingInstruction) {
                    // This is a processing instruction

                    while (tagEnd - current < 5 || buffer[tagEnd - 1] != '?') {
                        // the '>' we chose is not the PI-closing one. Let's find again

                        LocatorUtils.countChar(locator, buffer[tagEnd]);
                        tagEnd = MarkupParsingUtil.findNextStructureEndDontAvoidQuotes(buffer, tagEnd + 1, maxi, locator);
                        
                        if (tagEnd == -1) {
                            return new BufferParseResult(current, currentLine, currentCol, true);
                        }
                        
                    }
                    

                    handler.handleStructure(buffer, current, (tagEnd - current) + 1, currentLine, currentCol);
                    inProcessingInstruction = false;
                    
                } else {

                    throw new IllegalStateException(
                            "Illegal parsing state: structure is not of a recognized type");
                    
                }
                
                // The '>' char will be considered as processed too
                LocatorUtils.countChar(locator, buffer[tagEnd]);
                
                current = tagEnd + 1;
                i = current;
                
            }
            
        }
        
        return new BufferParseResult(current, locator[0], locator[1], false);
        
    }
    
    
}