![JAR search and dependency download from the Maven repository](/logo.png)
com.day.cq.rewriter.htmlparser.HtmlParser Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of aem-sdk-api Show documentation
Show all versions of aem-sdk-api Show documentation
The Adobe Experience Manager SDK
/*
* Copyright 1997-2008 Day Management AG
* Barfuesserplatz 6, 4001 Basel, Switzerland
* All Rights Reserved.
*
* This software is the confidential and proprietary information of
* Day Management AG, ("Confidential Information"). You shall not
* disclose such Confidential Information and shall use it only in
* accordance with the terms of the license agreement you entered into
* with Day.
*/
package com.day.cq.rewriter.htmlparser;
import java.io.CharArrayWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Writer;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import org.apache.felix.scr.annotations.Component;
import org.apache.sling.commons.osgi.OsgiUtil;
import org.xml.sax.ContentHandler;
import com.day.cq.rewriter.htmlparser.impl.HtmlParserTransformer;
import com.day.cq.rewriter.pipeline.Generator;
import com.day.cq.rewriter.processor.ProcessingComponentConfiguration;
import com.day.cq.rewriter.processor.ProcessingContext;
/**
 * Streaming HTML parser that is fed through the {@link Writer} API.
 * <p>
 * Incoming characters are scanned by a small state machine. Tags whose name is
 * contained in the tag inclusion set are tokenized and reported to the
 * registered {@link DocumentHandler} as start/end element events; all other
 * input is handed to the handler unchanged as character data. Input that may
 * still turn out to be an included tag is held back in an internal buffer
 * until the decision can be made.
 */
@Component(factory = "com.day.cq.rewriter.pipeline.Generator/htmlparser")
@Deprecated
public class HtmlParser extends Writer implements Generator {

    /** Name of the configuration property read by {@link #init} to obtain the tags to include. */
    public static final String INCLUDE_TAGS_PROPERTY = "includeTags";

    /** Internal character buffer holding input of a tag that has not been decided yet */
    private final CharArrayWriter buffer = new CharArrayWriter(256);

    /** Tag tokenizer */
    private final TagTokenizer tokenizer = new TagTokenizer();

    /** Tag name buffer */
    private final CharArrayWriter tagNameBuffer = new CharArrayWriter(30);

    /** Cached tag name, {@code null} until first requested via {@link #getTagName()} */
    private String tagName;

    /** Tag inclusion list; {@code null} means that every tag is included */
    private Set<String> tagInclusionSet;

    /** Registered document handler */
    private DocumentHandler documentHandler;

    /** Parse state: outside of any markup */
    private static final int PS_OUTSIDE = 0;

    /** Parse state: inside a tag (after {@code <}) */
    private static final int PS_TAG = 1;

    /** Parse state: inside the content of a script element */
    private static final int PS_SCRIPT = 2;

    /** Parse state: after {@code <!}, i.e. inside a comment or a comment prefix */
    private static final int PS_COMMENT = 3;

    /** Parse state: inside a quoted string */
    private static final int PS_STRING = 4;

    /** Tag type: current input is plain content and may be passed through */
    private static final int TT_NONE = 0;

    /** Tag type: current input may become an included tag, not decided yet */
    private static final int TT_MAYBE = 1;

    /** Tag type: current input is an included tag */
    private static final int TT_TAG = 2;

    /** Upper and lower case spelling matched after {@code </} while in script content */
    private static final String SCRIPT_UPPER = "SCRIPT";
    private static final String SCRIPT_LOWER = "script";

    /** Parse state */
    private int parseState;

    /** Parse substate, its meaning depends on the parse state */
    private int parseSubState;

    /** Parse state to return to when a quoted string ends */
    private int prevParseState;

    /** Current tag type */
    private int tagType;

    /** Quote character that terminates the current string */
    private char quoteChar;

    /**
     * The last and the second to last character seen while in comment state.
     * Kept as instance state so that detection of the comment terminator does
     * not depend on how the input is split across {@link #update} calls.
     */
    private char commentPrevChar1;
    private char commentPrevChar2;

    /** Did we already start parsing? */
    boolean started = false;

    /** indicates flag that forces a flush on the handler */
    boolean flushPending = false;

    /** flag that controls if flushing is allowed */
    boolean allowFlush = true;

    /** flag that disables toLowercasing of attributes */
    boolean preserveCamelCase = false;

    /** Only those tags are processed and passed to pipelines for processing */
    private static final Set<String> DEFAULT_INCLUSION_TAGS;
    static {
        DEFAULT_INCLUSION_TAGS = new HashSet<>();
        DEFAULT_INCLUSION_TAGS.add("A");
        DEFAULT_INCLUSION_TAGS.add("/A");
        DEFAULT_INCLUSION_TAGS.add("IMG");
        DEFAULT_INCLUSION_TAGS.add("AREA");
        DEFAULT_INCLUSION_TAGS.add("FORM");
        DEFAULT_INCLUSION_TAGS.add("BASE");
        DEFAULT_INCLUSION_TAGS.add("LINK");
        DEFAULT_INCLUSION_TAGS.add("SCRIPT");
        DEFAULT_INCLUSION_TAGS.add("BODY");
        DEFAULT_INCLUSION_TAGS.add("/BODY");
        DEFAULT_INCLUSION_TAGS.add("HEAD");
        DEFAULT_INCLUSION_TAGS.add("/HEAD");
    }

    /**
     * Default constructor. The parser starts with a private copy of the
     * default tag inclusion set.
     */
    public HtmlParser() {
        this.tagInclusionSet = new HashSet<>(DEFAULT_INCLUSION_TAGS);
    }

    /**
     * Creates a parser that includes the default tags plus the given ones.
     *
     * @param includedTags additional tags to include, may be {@code null} or
     *            empty. Lookups upper-case the scanned tag name, so entries
     *            only match if they are upper case.
     */
    public HtmlParser(String[] includedTags) {
        this();
        if (includedTags != null && includedTags.length > 0) {
            for (final String tag : includedTags) {
                this.tagInclusionSet.add(tag);
            }
        }
    }

    /**
     * Creates a parser that includes the default tags plus the given ones.
     *
     * @param includedTags additional tags to include, may be {@code null}
     * @param preserveCamelCase if {@code true} the tokenizer is switched to
     *            preserve camel case before a tag is tokenized
     */
    public HtmlParser(String[] includedTags, boolean preserveCamelCase) {
        this(includedTags);
        this.preserveCamelCase = preserveCamelCase;
    }

    /**
     * Configures the tag inclusion set from the component configuration and
     * publishes it as request attributes.
     * <p>
     * If the configuration lists tags, they replace the current inclusion set
     * (instead of extending it); otherwise the current set is kept and
     * published as is.
     *
     * @see com.day.cq.rewriter.pipeline.Generator#init(com.day.cq.rewriter.processor.ProcessingContext,
     *      com.day.cq.rewriter.processor.ProcessingComponentConfiguration)
     */
    public void init(final ProcessingContext pipelineContext,
            final ProcessingComponentConfiguration config) {
        final String[] includedTags = OsgiUtil.toStringArray(
                config.getConfiguration().get(INCLUDE_TAGS_PROPERTY));
        if (includedTags != null && includedTags.length > 0) {
            this.tagInclusionSet = new HashSet<>();
            for (final String tag : includedTags) {
                this.tagInclusionSet.add(tag);
            }
            // /body is required for the licence transformer
            this.tagInclusionSet.add("/BODY");
            // publish a snapshot: the tags added below must not show up in this attribute
            pipelineContext.getRequest().setAttribute(
                    HtmlParserTransformer.REQ_ATTR_HTML_PARSER,
                    new HashSet<>(this.tagInclusionSet));

            // the below tags are required for injecting RUM data
            // for compatibility reasons, we need to avoid sending those through the complete pipeline!
            final Set<String> additionalStartTags = new HashSet<>();
            final Set<String> additionalEndTags = new HashSet<>();
            // Set.add returns true only if the tag was not configured explicitly
            if (this.tagInclusionSet.add("BODY")) {
                additionalStartTags.add("BODY");
            }
            if (this.tagInclusionSet.add("HEAD")) {
                additionalStartTags.add("HEAD");
            }
            if (this.tagInclusionSet.add("SCRIPT")) {
                additionalStartTags.add("SCRIPT");
            }
            if (this.tagInclusionSet.add("LINK")) {
                additionalStartTags.add("LINK");
            }
            if (this.tagInclusionSet.add("/HEAD")) {
                additionalEndTags.add("HEAD");
            }
            // set additional tags as request attributes
            if (!additionalStartTags.isEmpty()) {
                pipelineContext.getRequest().setAttribute(
                        HtmlParserTransformer.REQ_ATTR_START_TAGS, additionalStartTags);
            }
            if (!additionalEndTags.isEmpty()) {
                pipelineContext.getRequest().setAttribute(
                        HtmlParserTransformer.REQ_ATTR_END_TAGS, additionalEndTags);
            }
        } else {
            pipelineContext.getRequest().setAttribute(
                    HtmlParserTransformer.REQ_ATTR_HTML_PARSER, this.tagInclusionSet);
        }
    }

    /**
     * Returns a new {@link PrintWriter} that feeds this parser.
     *
     * @see com.day.cq.rewriter.pipeline.Generator#getWriter()
     */
    public PrintWriter getWriter() {
        return new PrintWriter(this);
    }

    /**
     * Returns the live tag inclusion set (not a copy).
     * The raw return type is kept for source compatibility with existing callers.
     *
     * @return the tag inclusion set, may be {@code null}
     */
    @SuppressWarnings("rawtypes")
    public Set getTagInclusionSet() {
        return tagInclusionSet;
    }

    /**
     * Replaces the tag inclusion set. The given set is used directly, it is
     * not copied. The raw parameter type is kept for source compatibility with
     * existing callers.
     *
     * @param tagInclusionSet set of upper case tag names, or {@code null} to
     *            include every tag
     */
    @SuppressWarnings({ "rawtypes", "unchecked" })
    public void setTagInclusionSet(Set tagInclusionSet) {
        this.tagInclusionSet = tagInclusionSet;
    }

    /**
     * Registers a SAX content handler by wrapping it into a
     * {@link DocumentHandlerToSAXAdapter}.
     *
     * @see com.day.cq.rewriter.pipeline.Generator#setContentHandler(org.xml.sax.ContentHandler)
     */
    public void setContentHandler(ContentHandler handler) {
        this.documentHandler = new DocumentHandlerToSAXAdapter(handler);
    }

    /**
     * Set document handler. Allows a component to get notified about the
     * events, before characters are decomposed into attributes.
     *
     * @param documentHandler document handler
     */
    public void setDocumentHandler(DocumentHandler documentHandler) {
        this.documentHandler = documentHandler;
    }

    /**
     * Feeds {@code len} characters of {@code cbuf}, starting at {@code off},
     * to the parser.
     */
    @Override
    public void write(char[] cbuf, int off, int len) throws IOException {
        // honour the offset: the Writer contract allows callers to pass a sub-range
        this.update(cbuf, off, len);
    }

    /** Feeds a single character to the parser. */
    @Override
    public void write(int b) throws IOException {
        final char[] buf = new char[] { (char) b };
        this.update(buf, 0, buf.length);
    }

    @Override
    public void close() throws IOException {
        // nothing to do
    }

    /**
     * Requests a flush. The request is remembered and forwarded to the handler
     * as soon as the parse state allows flushing.
     */
    @Override
    public void flush() throws IOException {
        flushPending = true;
        flushBuffer();
    }

    /**
     * Feed characters to the parser.
     * <p>
     * On the first call the handler's {@code onStart()} is invoked. Characters
     * that cannot be decided at the end of the given range (an unfinished tag
     * that may be included) are kept in the internal buffer for the next call.
     *
     * @param buf
     *            character buffer
     * @param off
     *            offset where characters start
     * @param len
     *            length of affected buffer
     * @throws IOException {@link IOException}
     */
    public void update(char[] buf, int off, int len) throws IOException {
        if (!this.started) {
            this.documentHandler.onStart();
            this.started = true;
        }

        // [start, curr) is the part of buf that has been scanned but not yet handed on
        int start = off;
        final int end = off + len;

        for (int curr = off; curr < end; curr++) {
            final char c = buf[curr];

            switch (parseState) {
            case PS_OUTSIDE:
                if (c == '<') {
                    // emit the text preceding the tag, then start collecting the tag
                    if (curr > start) {
                        documentHandler.characters(buf, start, curr - start);
                    }
                    start = curr;
                    parseState = PS_TAG;
                    parseSubState = 0;
                    tagType = TT_MAYBE;
                    allowFlush = false;
                    resetTagName();
                }
                break;

            case PS_TAG:
                switch (parseSubState) {
                case -1:
                    // markup that is not a candidate tag: just look for its end
                    if (c == '"' || c == '\'') {
                        quoteChar = c;
                        prevParseState = parseState;
                        parseState = PS_STRING;
                        parseSubState = -1;
                        allowFlush = false;
                    } else if (c == '>') {
                        parseState = PS_OUTSIDE;
                        allowFlush = true;
                    }
                    break;
                case 0:
                    // first character after '<'
                    if (c == '!') {
                        parseState = PS_COMMENT;
                        parseSubState = 0;
                        passThrough();
                    } else if (c == '"' || c == '\'') {
                        quoteChar = c;
                        prevParseState = parseState;
                        parseState = PS_STRING;
                        parseSubState = -1;
                        passThrough();
                    } else if (c == '>') {
                        parseState = PS_OUTSIDE;
                        passThrough();
                    } else if (!Character.isWhitespace(c)) {
                        tagNameBuffer.write(c);
                        parseSubState = 1;
                    } else {
                        parseSubState = -1;
                        passThrough();
                    }
                    break;
                case 1:
                    // collecting the tag name
                    if (c == '"' || c == '\'') {
                        if (tagIncluded(getTagName())) {
                            tagType = TT_TAG;
                        } else {
                            passThrough();
                        }
                        parseSubState = 2;
                        quoteChar = c;
                        prevParseState = parseState;
                        parseState = PS_STRING;
                    } else if (c == '>') {
                        if (tagIncluded(getTagName())) {
                            processTag(buf, start, curr - start + 1);
                            start = curr + 1;
                            tagType = TT_NONE;
                            parseState = getTagName().equalsIgnoreCase("SCRIPT")
                                    ? PS_SCRIPT : PS_OUTSIDE;
                            parseSubState = 0;
                        } else {
                            // NOTE(review): unlike sub state 2, an excluded attribute-less
                            // <script> does not enter PS_SCRIPT here - confirm this is intended
                            parseState = PS_OUTSIDE;
                            passThrough();
                        }
                    } else if (Character.isWhitespace(c)) {
                        if (tagIncluded(getTagName())) {
                            tagType = TT_TAG;
                        } else {
                            passThrough();
                        }
                        parseSubState = 2;
                    } else {
                        tagNameBuffer.write(c);
                    }
                    break;
                case 2:
                    // behind the tag name: skip attributes until '>'
                    if (c == '"' || c == '\'') {
                        quoteChar = c;
                        prevParseState = parseState;
                        parseState = PS_STRING;
                    } else if (c == '>') {
                        if (tagType == TT_TAG) {
                            processTag(buf, start, curr - start + 1);
                            start = curr + 1;
                        } else {
                            allowFlush = true;
                            flushBuffer();
                        }
                        tagType = TT_NONE;
                        parseState = getTagName().equalsIgnoreCase("SCRIPT")
                                ? PS_SCRIPT : PS_OUTSIDE;
                        parseSubState = 0;
                    }
                    break;
                }
                break;

            case PS_COMMENT:
                switch (parseSubState) {
                case 0:
                case 1:
                    // expecting the two dashes of "<!--"
                    if (c == '-') {
                        parseSubState++;
                    } else if (c == '"' || c == '\'') {
                        quoteChar = c;
                        prevParseState = PS_TAG;
                        parseState = PS_STRING;
                        parseSubState = -1;
                        passThrough();
                    } else if (c == '>') {
                        parseState = PS_OUTSIDE;
                        passThrough();
                    } else {
                        // "<!" not followed by "--": treat as ordinary non-candidate markup
                        parseState = PS_TAG;
                        parseSubState = -1;
                        passThrough();
                    }
                    break;
                case 2:
                    // inside the comment body
                    if (c == '-') {
                        parseSubState++;
                    } else if (c == '>' && commentPrevChar1 == '-' && commentPrevChar2 == '-') {
                        parseState = PS_OUTSIDE;
                    }
                    break;
                case 3:
                    // one dash seen
                    if (c == '-') {
                        parseSubState++;
                    } else if (c == '>' && commentPrevChar1 == '-' && commentPrevChar2 == '-') {
                        parseState = PS_OUTSIDE;
                    } else {
                        parseSubState = 2;
                    }
                    break;
                case 4:
                    // two dashes seen
                    if (c == '>') {
                        parseState = PS_OUTSIDE;
                    } else {
                        parseSubState = 2;
                    }
                    break;
                }
                // tracking the previous characters to make sure the comment ends in "-->"
                commentPrevChar2 = commentPrevChar1;
                commentPrevChar1 = c;
                break;

            case PS_SCRIPT:
                switch (parseSubState) {
                case 0:
                    if (c == '<') {
                        if (curr > start) {
                            documentHandler.characters(buf, start, curr - start);
                        }
                        start = curr;
                        tagType = TT_MAYBE;
                        parseSubState++;
                        allowFlush = false;
                    }
                    break;
                case 1:
                    if (c == '/') {
                        parseSubState++;
                    } else {
                        parseSubState = 0;
                        passThrough();
                    }
                    break;
                case 2:
                case 3:
                case 4:
                case 5:
                case 6:
                case 7: {
                    // sub states 2..7 match the letters of "script", ignoring case
                    final int letter = parseSubState - 2;
                    if (c == SCRIPT_UPPER.charAt(letter) || c == SCRIPT_LOWER.charAt(letter)) {
                        parseSubState++;
                    } else {
                        parseSubState = 0;
                        passThrough();
                    }
                    break;
                }
                case 8:
                    // "</script" seen, wait for the closing '>'
                    if (c == '>') {
                        if (tagIncluded("SCRIPT")) {
                            processTag(buf, start, curr - start + 1);
                            start = curr + 1;
                        } else {
                            allowFlush = true;
                            flushBuffer();
                        }
                        tagType = TT_NONE;
                        parseState = PS_OUTSIDE;
                    }
                    break;
                }
                break;

            case PS_STRING:
                if (c == quoteChar) {
                    parseState = prevParseState;
                }
                break;
            }
        }

        // hand on or retain whatever is left of this chunk
        if (start < end) {
            if (tagType == TT_NONE) {
                documentHandler.characters(buf, start, end - start);
            } else {
                buffer.write(buf, start, end - start);
            }
        }
    }

    /**
     * Return a flag indicating whether the internal buffer is empty, i.e.
     * whether the parser holds no undigested characters.
     *
     * @return {@code true} if no characters are buffered, {@code false} if
     *         the parser still contains undigested characters
     */
    public boolean isEmpty() {
        return buffer.size() == 0;
    }

    /**
     * Finish the parsing process. This forces the parser to flush the
     * characters still held in its internal buffer, regardless of the parsing
     * state, and then signals the end of the document to the handler.
     *
     * @throws IOException {@link IOException}
     */
    public void finished() throws IOException {
        allowFlush = true;
        flushBuffer();
        this.documentHandler.onEnd();
    }

    /**
     * Clears the internal tagname buffer and cache
     */
    protected void resetTagName() {
        tagName = null;
        tagNameBuffer.reset();
    }

    /**
     * Returns the tagname scanned so far. The name is computed from the tag
     * name buffer on first access and cached until {@link #resetTagName()} is
     * called; the buffer itself is left untouched.
     *
     * @return tagname
     */
    protected String getTagName() {
        if (tagName == null) {
            tagName = tagNameBuffer.toString();
        }
        return tagName;
    }

    /**
     * Flush internal buffer. This forces the parser to flush the characters
     * still held in its internal buffer, if the parsing state allows.
     *
     * @throws IOException {@link IOException}
     */
    protected void flushBuffer() throws IOException {
        if (allowFlush) {
            if (buffer.size() > 0) {
                final char[] ch = buffer.toCharArray();
                documentHandler.characters(ch, 0, ch.length);
                buffer.reset();
            }
            if (flushPending) {
                // special hack for flush request, see bug #20068
                // send 0-length characters that eventually let SAXWriter flush the
                // underlying writer
                documentHandler.characters(new char[0], 0, 0);
                flushPending = false;
            }
        }
    }

    /**
     * Returns a flag indicating whether the specified tag should be included in
     * the parsing process. The name is upper-cased independently of the
     * default locale before it is looked up.
     *
     * @param tagName
     *            tag name
     * @return {@code true} if the tag should be processed (always the case if
     *         no inclusion set is defined), else {@code false}
     */
    protected boolean tagIncluded(String tagName) {
        return tagInclusionSet == null
                || tagInclusionSet.contains(tagName.toUpperCase(Locale.ROOT));
    }

    /**
     * Decompose a tag and feed it to the document handler. The given
     * characters are appended to whatever part of the tag is already buffered;
     * afterwards the buffer is emptied and flushing is allowed again.
     *
     * @param ch
     *            character data
     * @param off
     *            offset where character data starts
     * @param len
     *            length of character data
     * @throws IOException {@link IOException}
     */
    protected void processTag(char[] ch, int off, int len) throws IOException {
        buffer.write(ch, off, len);
        final char[] snippet = buffer.toCharArray();

        if (preserveCamelCase) {
            tokenizer.setPreserveCamelCase();
        }
        tokenizer.tokenize(snippet, 0, snippet.length);

        if (!tokenizer.endTag()) {
            documentHandler.onStartElement(tokenizer.tagName(), tokenizer.attributes(),
                    snippet, 0, snippet.length, tokenizer.endSlash());
        } else {
            documentHandler.onEndElement(tokenizer.tagName(), snippet, 0, snippet.length);
        }

        buffer.reset();
        allowFlush = true;
    }

    /**
     * Stops treating the current markup as a candidate tag: marks it as plain
     * content, re-enables flushing and releases buffered characters.
     *
     * @throws IOException if the handler fails while receiving buffered characters
     */
    private void passThrough() throws IOException {
        tagType = TT_NONE;
        allowFlush = true;
        flushBuffer();
    }

    @Override
    public String toString() {
        return "Adobe AEM HTML Parser Generator";
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy