org.apache.shindig.gadgets.parse.nekohtml.NekoSimplifiedHtmlParser Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of shindig-gadgets Show documentation
Show all versions of shindig-gadgets Show documentation
Renders gadgets, provides the gadget metadata service, and serves
all javascript required by the OpenSocial specification.
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.shindig.gadgets.parse.nekohtml;
import org.apache.commons.lang3.StringUtils;
import org.apache.shindig.common.xml.DomUtil;
import org.apache.shindig.gadgets.GadgetException;
import org.apache.shindig.gadgets.parse.GadgetHtmlParser;
import org.apache.shindig.gadgets.parse.SocialDataTags;
import org.apache.xerces.xni.Augmentations;
import org.apache.xerces.xni.NamespaceContext;
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.XMLDocumentHandler;
import org.apache.xerces.xni.XMLLocator;
import org.apache.xerces.xni.XMLResourceIdentifier;
import org.apache.xerces.xni.XMLString;
import org.apache.xerces.xni.XNIException;
import org.apache.xerces.xni.parser.XMLDocumentSource;
import org.apache.xerces.xni.parser.XMLInputSource;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import org.cyberneko.html.HTMLConfiguration;
import org.cyberneko.html.HTMLElements;
import org.cyberneko.html.HTMLEntities;
import org.cyberneko.html.HTMLScanner;
import org.cyberneko.html.HTMLTagBalancer;
import org.cyberneko.html.filters.NamespaceBinder;
import org.w3c.dom.DOMException;
import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import java.io.IOException;
import java.io.StringReader;
import java.util.Stack;
/**
* Supports parsing of social markup blocks inside gadget content.
* <script> elements with types of either "text/os-template"
* or "text/os-data" are parsed inline into contained DOM hierarchies
* for subsequent processing by the pipeline and template rewriters.
*/
@Singleton
public class NekoSimplifiedHtmlParser extends GadgetHtmlParser {
/** Neko element descriptor for the OSML template tag; its only allowed parent is BODY. */
private static final HTMLElements.Element OSML_TEMPLATE_ELEMENT;
/** Neko element descriptor for the OSML data tag; its only allowed parent is BODY. */
private static final HTMLElements.Element OSML_DATA_ELEMENT;
static {
  // Both descriptors copy code, flags and closes from Neko's UNKNOWN element and
  // differ only in the tag name they are registered under.
  HTMLElements.Element unknown = HTMLElements.getElement(HTMLElements.UNKNOWN);
  OSML_TEMPLATE_ELEMENT = new HTMLElements.Element(unknown.code,
      SocialDataTags.OSML_TEMPLATE_TAG, unknown.flags, HTMLElements.BODY, unknown.closes);
  // Passing parent in constructor is ignored.
  // Only allow template tags in BODY
  OSML_TEMPLATE_ELEMENT.parent =
      new HTMLElements.Element[]{HTMLElements.getElement(HTMLElements.BODY)};
  // data tags are allowed in BODY only, since Neko disallows HEAD elements from
  // having child elements of their own.
  // The data element must carry the data tag name, not the template tag name;
  // otherwise it is indistinguishable from OSML_TEMPLATE_ELEMENT.
  OSML_DATA_ELEMENT = new HTMLElements.Element(unknown.code,
      SocialDataTags.OSML_DATA_TAG, unknown.flags, HTMLElements.BODY, unknown.closes);
  OSML_DATA_ELEMENT.parent = new HTMLElements.Element[]{
      HTMLElements.getElement(HTMLElements.BODY)};
}
/**
 * Creates the parser; intended to be constructed through Guice injection.
 *
 * @param documentFactory DOM implementation handed unchanged to the
 *        {@link GadgetHtmlParser} superclass constructor
 */
@Inject
public NekoSimplifiedHtmlParser(DOMImplementation documentFactory) {
super(documentFactory);
}
/**
 * Parses {@code source} as a full HTML document.
 *
 * <p>The source is run through {@code parseHtmlImpl} with a fresh configuration and a
 * {@code NormalizingTagBalancer}. The "html" child of the resulting fragment is then
 * appended to the handler's document, and {@code fixNekoWeirdness} is applied to it.
 *
 * @param source the HTML text to parse
 * @return the parsed document, or {@code null} if an {@link IOException} occurred while parsing
 * @throws GadgetException declared by the overridden method
 */
@Override
protected Document parseDomImpl(String source) throws GadgetException {
  HTMLConfiguration nekoConfig = newConfiguration();

  DocumentHandler parsed;
  try {
    parsed = parseHtmlImpl(source, nekoConfig, new NormalizingTagBalancer());
  } catch (IOException e) {
    // I/O failures are reported to the caller as a null document.
    return null;
  }

  Document result = parsed.getDocument();
  // Move the "html" child of the handler's fragment under the document.
  Node htmlRoot = DomUtil.getFirstNamedChildNode(parsed.getFragment(), "html");
  result.appendChild(htmlRoot);
  fixNekoWeirdness(result);
  return result;
}
/**
 * Parses {@code source} as an HTML fragment rather than a full document.
 *
 * <p>Neko's document-fragment balancing feature is enabled, and the fragment context
 * stack is set to HTML followed by BODY before the source is parsed with a
 * {@code NekoPatchTagBalancer}.
 *
 * @param source the HTML text to parse
 * @return the parsed fragment, or {@code null} if an {@link IOException} occurred while parsing
 * @throws GadgetException declared by the overridden method
 */
@Override
protected DocumentFragment parseFragmentImpl(String source) throws GadgetException {
  HTMLConfiguration fragmentConfig = newConfiguration();

  // http://cyberneko.org/html/features/balance-tags/document-fragment
  // replaces the deprecated http://cyberneko.org/html/features/document-fragment
  fragmentConfig.setFeature(
      "http://cyberneko.org/html/features/balance-tags/document-fragment", true);

  QName[] contextStack = {
      new QName(null, "HTML", "HTML", null),
      new QName(null, "BODY", "BODY", null)
  };
  fragmentConfig.setProperty(
      "http://cyberneko.org/html/properties/balance-tags/fragment-context-stack",
      contextStack);

  try {
    return parseHtmlImpl(source, fragmentConfig, new NekoPatchTagBalancer()).getFragment();
  } catch (IOException e) {
    // I/O failures are reported to the caller as a null fragment.
    return null;
  }
}
/**
* Parse HTML source.
*
* Wires a new scanner, the supplied tag balancer, a namespace binder and a new
* document handler into a single pipeline, then scans {@code source} through it.
*
* @param source the HTML text to parse; also passed to {@code newDocumentHandler}
* @param config Neko configuration used to reset the binder, balancer and scanner
* @param tagBalancer balancer placed between the scanner and the namespace binder
* @return a document handler containing the parsed source
* @throws IOException if raised while scanning the source
*/
private DocumentHandler parseHtmlImpl(String source, HTMLConfiguration config,
NormalizingTagBalancer tagBalancer)
throws IOException {
HTMLScanner htmlScanner = new HTMLScanner();
tagBalancer.setScanner(htmlScanner);
DocumentHandler handler = newDocumentHandler(source);
// The namespace binder sits between the tag balancer and the final handler.
NamespaceBinder namespaceBinder = new NamespaceBinder();
namespaceBinder.setDocumentHandler(handler);
namespaceBinder.setDocumentSource(tagBalancer);
namespaceBinder.reset(config);
tagBalancer.setDocumentHandler(namespaceBinder);
// Resulting event flow: Scanner -> Tag Balancer -> Namespace Binder -> DocumentHandler
tagBalancer.setDocumentSource(htmlScanner);
htmlScanner.setDocumentHandler(tagBalancer);
tagBalancer.reset(config);
htmlScanner.reset(config);
// The input is supplied as a character stream over the in-memory string; no
// public id, system id or base system id is given.
XMLInputSource inputSource = new XMLInputSource(null, null, null);
inputSource.setEncoding("UTF-8");
inputSource.setCharacterStream(new StringReader(source));
htmlScanner.setInputSource(inputSource);
// NOTE(review): the boolean presumably requests a complete scan in one call --
// confirm against the HTMLScanner documentation.
htmlScanner.scanDocument(true);
return handler;
}
private void fixNekoWeirdness(Document document) {
// Neko as of versions > 1.9.13 stuffs all leading will break due to both
//