// Source listing of com.day.cq.rewriter.htmlparser.HtmlParser from the aem-sdk-api artifact
// (The Adobe Experience Manager SDK), available via Maven / Gradle / Ivy.
// Note: a newer version of the artifact exists: 2024.11.18751.20241128T090041Z-241100
/*
 * Copyright 1997-2008 Day Management AG
 * Barfuesserplatz 6, 4001 Basel, Switzerland
 * All Rights Reserved.
 *
 * This software is the confidential and proprietary information of
 * Day Management AG, ("Confidential Information"). You shall not
 * disclose such Confidential Information and shall use it only in
 * accordance with the terms of the license agreement you entered into
 * with Day.
 */
package com.day.cq.rewriter.htmlparser;

import java.io.CharArrayWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Writer;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

import org.apache.felix.scr.annotations.Component;
import org.apache.sling.commons.osgi.OsgiUtil;
import org.xml.sax.ContentHandler;

import com.day.cq.rewriter.htmlparser.impl.HtmlParserTransformer;
import com.day.cq.rewriter.pipeline.Generator;
import com.day.cq.rewriter.processor.ProcessingComponentConfiguration;
import com.day.cq.rewriter.processor.ProcessingContext;

/**
 * HTML parser. Invokes a {@link DocumentHandler} whenever an event occurs.
 *
 * <p>The parser is itself a {@link Writer}: every character written to it is
 * fed through {@link #update(char[], int, int)}, which runs a small state
 * machine over the character stream. Tags whose upper-cased name is contained
 * in the tag inclusion set are tokenized and reported to the handler as
 * start/end elements; all other characters are handed to the handler as plain
 * character data.</p>
 *
 * <p>Input may arrive in arbitrary chunks. Characters that belong to a tag
 * which is not yet complete are kept in an internal buffer until the tag can
 * be classified.</p>
 */
@Component(factory = "com.day.cq.rewriter.pipeline.Generator/htmlparser")
@Deprecated
public class HtmlParser extends Writer implements Generator {

    /** Name of the configuration property holding the tags to include. */
    public static final String INCLUDE_TAGS_PROPERTY = "includeTags";

    /** Internal character buffer */
    private final CharArrayWriter buffer = new CharArrayWriter(256);

    /** Tag tokenizer */
    private final TagTokenizer tokenizer = new TagTokenizer();

    /** Tag name buffer */
    private final CharArrayWriter tagNameBuffer = new CharArrayWriter(30);

    /** Tag name (cached string form of {@link #tagNameBuffer}) */
    private String tagName;

    /**
     * Tag inclusion list. Kept as a raw {@code Set} because it is exposed
     * through the public raw-typed accessors below.
     */
    private Set tagInclusionSet;

    /** Registered document handler */
    private DocumentHandler documentHandler;

    /** Parse state constant */
    private final static int PS_OUTSIDE = 0;

    /** Parse state constant */
    private final static int PS_TAG = PS_OUTSIDE + 1;

    /** Parse state constant */
    private final static int PS_SCRIPT = PS_TAG + 1;

    /** Parse state constant */
    private final static int PS_COMMENT = PS_SCRIPT + 1;

    /** Parse state constant */
    private final static int PS_STRING = PS_COMMENT + 1;

    /** Tag type constant */
    private final static int TT_NONE = 0;

    /** Tag type constant */
    private final static int TT_MAYBE = 1;

    /** Tag type constant */
    private final static int TT_TAG = 2;

    /** Upper-case letters expected after {@code </} while inside a script. */
    private static final String SCRIPT_UPPER = "SCRIPT";

    /** Lower-case letters expected after {@code </} while inside a script. */
    private static final String SCRIPT_LOWER = "script";

    /** Parse state */
    private int parseState;

    /** Parse substate */
    private int parseSubState;

    /** Previous parse state */
    private int prevParseState;

    /** Current tag type */
    private int tagType;

    /** Quote character */
    private char quoteChar;

    /**
     * Most recent character seen while inside a comment. Held as instance
     * state (not as a local of {@code update}) so that a comment terminator
     * split across two {@code update} calls is still recognized.
     */
    private char previousChar1;

    /** Character seen before {@link #previousChar1} while inside a comment. */
    private char previousChar2;

    /** Did we already start parsing? */
    boolean started = false;

    /** indicates flag that forces a flush on the handler */
    boolean flushPending = false;

    /** flag that controls if flushing is allowed */
    boolean allowFlush = true;

    /** flag that disables toLowercasing of attributes */
    boolean preserveCamelCase = false;

    /** Only those tags are processed and passed to pipelines for processing */
    private static final Set<String> DEFAULT_INCLUSION_TAGS;
    static {
        DEFAULT_INCLUSION_TAGS = new HashSet<>();
        DEFAULT_INCLUSION_TAGS.add("A");
        DEFAULT_INCLUSION_TAGS.add("/A");
        DEFAULT_INCLUSION_TAGS.add("IMG");
        DEFAULT_INCLUSION_TAGS.add("AREA");
        DEFAULT_INCLUSION_TAGS.add("FORM");
        DEFAULT_INCLUSION_TAGS.add("BASE");
        DEFAULT_INCLUSION_TAGS.add("LINK");
        DEFAULT_INCLUSION_TAGS.add("SCRIPT");
        DEFAULT_INCLUSION_TAGS.add("BODY");
        DEFAULT_INCLUSION_TAGS.add("/BODY");
        DEFAULT_INCLUSION_TAGS.add("HEAD");
        DEFAULT_INCLUSION_TAGS.add("/HEAD");
    }

    /**
     * Default constructor. Starts with a private copy of the default tag
     * inclusion set.
     */
    public HtmlParser() {
        this.tagInclusionSet = new HashSet<>(DEFAULT_INCLUSION_TAGS);
    }

    /**
     * Creates a parser whose inclusion set is the default set plus the given
     * tags.
     *
     * @param includedTags additional tags to include; may be {@code null} or
     *                     empty
     */
    public HtmlParser(String[] includedTags){
        this();
        if (includedTags != null && includedTags.length > 0) {
            for (final String tag : includedTags) {
                this.tagInclusionSet.add(tag);
            }
        }
    }

    /**
     * Creates a parser whose inclusion set is the default set plus the given
     * tags, and sets the camel-case preservation flag.
     *
     * @param includedTags      additional tags to include; may be
     *                          {@code null} or empty
     * @param preserveCamelCase value for the flag that is forwarded to the
     *                          tokenizer when a tag is processed
     */
    public HtmlParser(String[] includedTags, boolean preserveCamelCase) {
        this(includedTags);
        this.preserveCamelCase = preserveCamelCase;
    }

    /**
     * @see com.day.cq.rewriter.pipeline.Generator#init(com.day.cq.rewriter.processor.ProcessingContext,
     *      com.day.cq.rewriter.processor.ProcessingComponentConfiguration)
     */
    public void init(final ProcessingContext pipelineContext,
                     final ProcessingComponentConfiguration config) {
        final String[] includedTags = OsgiUtil.toStringArray(config
                .getConfiguration().get(INCLUDE_TAGS_PROPERTY));
        if (includedTags != null && includedTags.length > 0) {
            // configured tags replace (not extend) the current inclusion set
            this.tagInclusionSet = new HashSet<>();
            for (final String tag : includedTags) {
                this.tagInclusionSet.add(tag);
            }
            // /body is required for the licence transformer
            this.tagInclusionSet.add("/BODY");

            // publish a snapshot taken before the additional tags below are added
            pipelineContext.getRequest().setAttribute(HtmlParserTransformer.REQ_ATTR_HTML_PARSER, new HashSet<>(this.tagInclusionSet));
            // the below tags are required for injecting RUM data
            // for compatibility reasons, we need to avoid sending those through the complete pipeline!
            final Set<String> additionalStartTags = new HashSet<>();
            final Set<String> additionalEndTags = new HashSet<>();
            for (final String tag : new String[] { "BODY", "HEAD", "SCRIPT", "LINK" }) {
                // add() returns true only if the tag was not configured already
                if (this.tagInclusionSet.add(tag)) {
                    additionalStartTags.add(tag);
                }
            }
            if (this.tagInclusionSet.add("/HEAD")) {
                additionalEndTags.add("HEAD");
            }
            // set additional tags as request attributes
            if (!additionalStartTags.isEmpty()) {
                pipelineContext.getRequest().setAttribute(HtmlParserTransformer.REQ_ATTR_START_TAGS, additionalStartTags);
            }
            if (!additionalEndTags.isEmpty()) {
                pipelineContext.getRequest().setAttribute(HtmlParserTransformer.REQ_ATTR_END_TAGS, additionalEndTags);
            }
        } else {
            pipelineContext.getRequest().setAttribute(HtmlParserTransformer.REQ_ATTR_HTML_PARSER, this.tagInclusionSet);
        }
    }

    /**
     * @see com.day.cq.rewriter.pipeline.Generator#getWriter()
     */
    public PrintWriter getWriter() {
        return new PrintWriter(this);
    }

    /**
     * Returns the tag inclusion set currently in use (the live instance, not
     * a copy).
     *
     * @return tag inclusion set, possibly {@code null}
     */
    public Set getTagInclusionSet() {
        return tagInclusionSet;
    }

    /**
     * Replaces the tag inclusion set. With a {@code null} set every tag is
     * treated as included (see {@link #tagIncluded(String)}).
     *
     * @param tagInclusionSet new inclusion set
     */
    public void setTagInclusionSet(Set tagInclusionSet) {
        this.tagInclusionSet = tagInclusionSet;
    }

    /**
     * @see com.day.cq.rewriter.pipeline.Generator#setContentHandler(org.xml.sax.ContentHandler)
     */
    public void setContentHandler(ContentHandler handler) {
        this.documentHandler = new DocumentHandlerToSAXAdapter(handler);
    }

    /**
     * Set document handler. Allows a component to get notified about the
     * events, before characters are decomposed into attributes.
     *
     * @param documentHandler document handler
     */
    public void setDocumentHandler(DocumentHandler documentHandler) {
        this.documentHandler = documentHandler;
    }

    /**
     * Feeds the given slice of {@code cbuf} to the parser.
     *
     * @param cbuf character array
     * @param off  offset of the first character to parse
     * @param len  number of characters to parse
     * @throws IOException {@link IOException}
     */
    @Override
    public void write(char[] cbuf, int off, int len) throws IOException {
        // honor the caller's offset; parsing from index 0 would read the wrong slice
        this.update(cbuf, off, len);
    }

    /**
     * Feeds a single character to the parser.
     *
     * @param b character to parse (narrowed to {@code char})
     * @throws IOException {@link IOException}
     */
    @Override
    public void write(int b) throws IOException {
        final char[] buf = new char[] { (char) b };
        this.update(buf, 0, buf.length);
    }

    @Override
    public void close() throws IOException {
        // nothing to do
    }

    /**
     * Marks a flush as pending and flushes the internal buffer if the current
     * parse state allows it; otherwise the pending flush is carried out by a
     * later call to {@link #flushBuffer()}.
     *
     * @throws IOException {@link IOException}
     */
    @Override
    public void flush() throws IOException {
        flushPending = true;
        flushBuffer();
    }

    /**
     * Feed characters to the parser.
     *
     * @param buf
     *            character buffer
     * @param off
     *            offset where characters start
     * @param len
     *            length of affected buffer
     * @throws IOException {@link IOException}
     */
    public void update(char[] buf, int off, int len) throws IOException {
        if (!this.started) {
            this.documentHandler.onStart();
            this.started = true;
        }
        // 'start' marks the first character of buf not yet handed to the handler
        int start = off;
        int end = off + len;

        for (int curr = start; curr < end; curr++) {
            char c = buf[curr];

            switch (parseState) {
            case PS_OUTSIDE:
                if (c == '<') {
                    // emit the text preceding the tag, then start collecting the tag
                    if (curr > start) {
                        documentHandler.characters(buf, start, curr - start);
                    }
                    start = curr;
                    parseState = PS_TAG;
                    parseSubState = 0;
                    tagType = TT_MAYBE;
                    allowFlush = false;
                    resetTagName();
                }
                break;
            case PS_TAG:
                switch (parseSubState) {
                case -1:
                    // inside a tag that is not processed: just look for its end
                    if (c == '"' || c == '\'') {
                        quoteChar = c;
                        prevParseState = parseState;
                        parseState = PS_STRING;
                        parseSubState = -1;
                        allowFlush = false;
                    } else if (c == '>') {
                        parseState = PS_OUTSIDE;
                        allowFlush = true;
                    }
                    break;
                case 0:
                    // first character after '<'
                    if (c == '!') {
                        parseState = PS_COMMENT;
                        parseSubState = 0;
                        tagType = TT_NONE;
                        // start comment tracking from a clean slate
                        previousChar1 = 0;
                        previousChar2 = 0;
                        allowFlush = true;
                        flushBuffer();
                    } else if (c == '"' || c == '\'') {
                        quoteChar = c;
                        prevParseState = parseState;
                        parseState = PS_STRING;
                        parseSubState = -1;
                        tagType = TT_NONE;
                        allowFlush = true;
                        flushBuffer();
                    } else if (c == '>') {
                        parseState = PS_OUTSIDE;
                        tagType = TT_NONE;
                        allowFlush = true;
                        flushBuffer();
                    } else if (!Character.isWhitespace(c)) {
                        tagNameBuffer.write(c);
                        parseSubState = 1;
                    } else {
                        parseSubState = -1;
                        tagType = TT_NONE;
                        allowFlush = true;
                        flushBuffer();
                    }
                    break;
                case 1:
                    // collecting the tag name
                    if (c == '"' || c == '\'') {
                        if (tagIncluded(getTagName())) {
                            tagType = TT_TAG;
                        } else {
                            tagType = TT_NONE;
                            allowFlush = true;
                            flushBuffer();
                        }
                        parseSubState = 2;
                        quoteChar = c;
                        prevParseState = parseState;
                        parseState = PS_STRING;
                    } else if (c == '>') {
                        if (tagIncluded(getTagName())) {
                            processTag(buf, start, curr - start + 1);
                            start = curr + 1;
                            tagType = TT_NONE;
                            parseState = getTagName()
                                    .equalsIgnoreCase("SCRIPT") ? PS_SCRIPT
                                    : PS_OUTSIDE;
                            parseSubState = 0;
                        } else {
                            tagType = TT_NONE;
                            parseState = PS_OUTSIDE;
                            allowFlush = true;
                            flushBuffer();
                        }
                    } else if (Character.isWhitespace(c)) {
                        if (tagIncluded(getTagName())) {
                            tagType = TT_TAG;
                        } else {
                            tagType = TT_NONE;
                            allowFlush = true;
                            flushBuffer();
                        }
                        parseSubState = 2;
                    } else {
                        tagNameBuffer.write(c);
                    }
                    break;
                case 2:
                    // tag name is complete: skip to the closing '>'
                    if (c == '"' || c == '\'') {
                        quoteChar = c;
                        prevParseState = parseState;
                        parseState = PS_STRING;
                    } else if (c == '>') {
                        if (tagType == TT_TAG) {
                            processTag(buf, start, curr - start + 1);
                            start = curr + 1;
                        } else {
                            allowFlush = true;
                            flushBuffer();
                        }
                        tagType = TT_NONE;
                        parseState = getTagName().equalsIgnoreCase("SCRIPT") ? PS_SCRIPT
                                : PS_OUTSIDE;
                        parseSubState = 0;
                    }
                    break;
                }
                break;
            case PS_COMMENT:
                switch (parseSubState) {
                case 0:
                case 1:
                    // expecting the two '-' that open a comment after "<!"
                    if (c == '-') {
                        parseSubState++;
                    } else if (c == '"' || c == '\'') {
                        quoteChar = c;
                        prevParseState = PS_TAG;
                        parseState = PS_STRING;
                        parseSubState = -1;
                        tagType = TT_NONE;
                        allowFlush = true;
                        flushBuffer();
                    } else if (c == '>') {
                        parseState = PS_OUTSIDE;
                        tagType = TT_NONE;
                        allowFlush = true;
                        flushBuffer();
                    } else {
                        // not a comment after all: treat as an unprocessed tag
                        parseState = PS_TAG;
                        parseSubState = -1;
                        tagType = TT_NONE;
                        allowFlush = true;
                        flushBuffer();
                    }
                    break;
                case 2:
                    // inside the comment body
                    if (c == '-') {
                        parseSubState++;
                    }
                    else if (c == '>' && previousChar1 == '-' && previousChar2 == '-') {
                        parseState = PS_OUTSIDE;
                    }
                    break;
                case 3:
                    // one '-' seen
                    if (c == '-') {
                        parseSubState++;
                    } else if (c == '>' && previousChar1 == '-' && previousChar2 == '-') {
                        parseState = PS_OUTSIDE;
                    } else {
                        parseSubState = 2;
                    }
                    break;
                case 4:
                    // "--" seen
                    if (c == '>') {
                        parseState = PS_OUTSIDE;
                    } else {
                        parseSubState = 2;
                    }
                    break;
                }
                // tracking the previous characters to make sure the comment ends in "-->"
                previousChar2 = previousChar1;
                previousChar1 = c;
                break;

            case PS_SCRIPT:
                switch (parseSubState) {
                case 0:
                    if (c == '<') {
                        if (curr > start) {
                            documentHandler
                                    .characters(buf, start, curr - start);
                        }
                        start = curr;
                        tagType = TT_MAYBE;
                        parseSubState++;
                        allowFlush = false;
                    }
                    break;
                case 1:
                    if (c == '/') {
                        parseSubState++;
                    } else {
                        abandonScriptEndTag();
                    }
                    break;
                case 2:
                case 3:
                case 4:
                case 5:
                case 6:
                case 7: {
                    // sub-states 2..7 match the letters of "script", either case
                    final int letter = parseSubState - 2;
                    if (c == SCRIPT_UPPER.charAt(letter)
                            || c == SCRIPT_LOWER.charAt(letter)) {
                        parseSubState++;
                    } else {
                        abandonScriptEndTag();
                    }
                    break;
                }
                case 8:
                    if (c == '>') {
                        if (tagIncluded("SCRIPT")) {
                            processTag(buf, start, curr - start + 1);
                            start = curr + 1;
                        } else {
                            allowFlush = true;
                            flushBuffer();
                        }
                        tagType = TT_NONE;
                        parseState = PS_OUTSIDE;
                    }
                    break;
                }
                break;

            case PS_STRING:
                // quoted strings end at the matching quote character
                if (c == quoteChar) {
                    parseState = prevParseState;
                }
                break;
            }
        }
        if (start < end) {
            if (tagType == TT_NONE) {
                // remaining characters are plain content
                documentHandler.characters(buf, start, end - start);
            } else {
                // remaining characters belong to an unfinished tag: keep them
                buffer.write(buf, start, end - start);
            }
        }
    }

    /**
     * Gives up matching {@code </script} inside a script block: the
     * characters collected so far are treated as script content again.
     *
     * @throws IOException {@link IOException}
     */
    private void abandonScriptEndTag() throws IOException {
        tagType = TT_NONE;
        parseSubState = 0;
        allowFlush = true;
        flushBuffer();
    }

    /**
     * Return a flag indicating whether the parser's internal buffer is empty,
     * i.e. whether there are no undigested characters left.
     *
     * @return true if the internal buffer holds no characters,
     *         false if characters are still buffered
     */
    public boolean isEmpty() {
        return buffer.size() == 0;
    }

    /**
     * Finish the parsing process. This forces the parser to flush the
     * characters still held in its internal buffer, regardless of the parsing
     * state.
     * @throws IOException {@link IOException}
     */
    public void finished() throws IOException {
        allowFlush = true;
        flushBuffer();
        this.documentHandler.onEnd();
    }

    /**
     * Clears the internal tagname buffer and cache
     */
    protected void resetTagName() {
        tagName = null;
        tagNameBuffer.reset();
    }

    /**
     * Returns the tagname scanned so far. The string is built from the
     * internal tagname buffer on first use and cached until
     * {@link #resetTagName()} is called.
     *
     * @return tagname
     */
    protected String getTagName() {
        if (tagName == null) {
            tagName = tagNameBuffer.toString();
        }
        return tagName;
    }

    /**
     * Flush internal buffer. This forces the parser to flush the characters
     * still held in its internal buffer, if the parsing state allows.
     * @throws IOException {@link IOException}
     */
    protected void flushBuffer() throws IOException {
        if (allowFlush) {
            if (buffer.size() > 0) {
                char[] ch = buffer.toCharArray();
                documentHandler.characters(ch, 0, ch.length);
                buffer.reset();
            }
            if (flushPending) {
                // special hack for flush request, see bug #20068
                // send 0-length characters that eventually let SAXWriter flush the
                // underlying writer
                documentHandler.characters(new char[0], 0, 0);
                flushPending = false;
            }
        }
    }

    /**
     * Returns a flag indicating whether the specified tag should be included in
     * the parsing process. The name is upper-cased before the lookup; a
     * {@code null} inclusion set includes every tag.
     *
     * @param tagName
     *            tag name
     * @return true if the tag should be processed, else
     *         false
     */
    protected boolean tagIncluded(String tagName) {
        // Locale.ROOT keeps the mapping locale-independent (e.g. "img" -> "IMG"
        // even when the default locale is Turkish)
        return tagInclusionSet == null
                || tagInclusionSet.contains(tagName.toUpperCase(Locale.ROOT));
    }

    /**
     * Decompose a tag and feed it to the document handler.
     *
     * @param ch
     *            character data
     * @param off
     *            offset where character data starts
     * @param len
     *            length of character data
     * @throws IOException {@link IOException}
     */
    protected void processTag(char[] ch, int off, int len) throws IOException {
        // append to whatever part of the tag was buffered by earlier updates
        buffer.write(ch, off, len);

        char[] snippet = buffer.toCharArray();
        if (preserveCamelCase) {
            tokenizer.setPreserveCamelCase();
        }
        tokenizer.tokenize(snippet, 0, snippet.length);
        if (!tokenizer.endTag()) {
            documentHandler.onStartElement(tokenizer.tagName(), tokenizer
                    .attributes(), snippet, 0, snippet.length, tokenizer
                    .endSlash());
        } else {
            documentHandler.onEndElement(tokenizer.tagName(), snippet, 0,
                    snippet.length);
        }

        buffer.reset();
        allowFlush = true;
    }

    @Override
    public String toString() {
        return "Adobe AEM HTML Parser Generator";
    }
}