All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.shindig.gadgets.parse.nekohtml.NekoSimplifiedHtmlParser Maven / Gradle / Ivy

Go to download

Renders gadgets, provides the gadget metadata service, and serves all javascript required by the OpenSocial specification.

There is a newer version: 3.0.0-beta4
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.shindig.gadgets.parse.nekohtml;

import org.apache.commons.lang.StringUtils;
import org.apache.shindig.gadgets.GadgetException;
import org.apache.shindig.gadgets.parse.GadgetHtmlParser;
import org.apache.xerces.xni.Augmentations;
import org.apache.xerces.xni.NamespaceContext;
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.XMLDocumentHandler;
import org.apache.xerces.xni.XMLLocator;
import org.apache.xerces.xni.XMLResourceIdentifier;
import org.apache.xerces.xni.XMLString;
import org.apache.xerces.xni.XNIException;
import org.apache.xerces.xni.parser.XMLDocumentSource;
import org.apache.xerces.xni.parser.XMLInputSource;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.inject.internal.ImmutableMap;
import org.cyberneko.html.HTMLConfiguration;
import org.cyberneko.html.HTMLElements;
import org.cyberneko.html.HTMLEntities;
import org.cyberneko.html.HTMLScanner;
import org.cyberneko.html.HTMLTagBalancer;
import org.cyberneko.html.filters.NamespaceBinder;
import org.w3c.dom.DOMException;
import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.Node;

import java.io.IOException;
import java.io.StringReader;
import java.util.Map;
import java.util.Stack;

/**
 * Supports parsing of social markup blocks inside gadget content.
 * {@code <script>} elements with a type of either "text/os-template"
 * or "text/os-data" are parsed inline into contained DOM hierarchies
 * for subsequent processing by the pipeline and template rewriters.
 */
@Singleton
public class NekoSimplifiedHtmlParser extends GadgetHtmlParser {

  /** Neko element descriptor used for the tag named by {@code OSML_TEMPLATE_TAG}. */
  private static final HTMLElements.Element OSML_TEMPLATE_ELEMENT;

  /** Neko element descriptor used for the tag named by {@code OSML_DATA_TAG}. */
  private static final HTMLElements.Element OSML_DATA_ELEMENT;

  static {
    // Both descriptors borrow code, flags and closes from Neko's UNKNOWN element.
    HTMLElements.Element unknown = HTMLElements.getElement(HTMLElements.UNKNOWN);

    OSML_TEMPLATE_ELEMENT = new HTMLElements.Element(unknown.code, OSML_TEMPLATE_TAG,
        unknown.flags, HTMLElements.BODY, unknown.closes);
    // Passing parent in constructor is ignored.
    // Only allow template tags in BODY
    OSML_TEMPLATE_ELEMENT.parent =
        new HTMLElements.Element[]{HTMLElements.getElement(HTMLElements.BODY)};

    // data tags are allowed in BODY only, since Neko disallows HEAD elements from
    // having child elements of their own.
    // The descriptor is named after the data tag; it was previously created with the
    // template tag name by mistake.
    OSML_DATA_ELEMENT = new HTMLElements.Element(unknown.code, OSML_DATA_TAG,
        unknown.flags, HTMLElements.BODY, unknown.closes);
    OSML_DATA_ELEMENT.parent =
        new HTMLElements.Element[]{HTMLElements.getElement(HTMLElements.BODY)};
  }

  /**
   * Lookup from OSML tag name to the Neko element descriptor that describes it. The type
   * parameters are required: the tag balancer assigns the result of {@code get} directly to an
   * {@code HTMLElements.Element}.
   */
  private static final Map<String, HTMLElements.Element> OSML_ELEMENTS = ImmutableMap.of(
      OSML_TEMPLATE_TAG, OSML_TEMPLATE_ELEMENT, OSML_DATA_TAG, OSML_DATA_ELEMENT);

  /** DOM implementation used to create a fresh {@link Document} for every parse. */
  private final DOMImplementation documentFactory;

  /**
   * Creates a parser that builds its documents with the given DOM implementation.
   *
   * @param documentFactory DOM implementation used to create documents and document types
   */
  @Inject
  public NekoSimplifiedHtmlParser(DOMImplementation documentFactory) {
    this.documentFactory = documentFactory;
  }

  /**
   * Parses {@code source} as a complete HTML document.
   *
   * @param source the HTML text to parse
   * @return the parsed document after {@code normalizeFragment} has been applied to it, or
   *     {@code null} if reading the source raised an {@link IOException}
   */
  @Override
  protected Document parseDomImpl(String source) throws GadgetException {
    final DocumentHandler parsed;
    try {
      parsed = parseHtmlImpl(source, newConfiguration(), new NormalizingTagBalancer());
    } catch (IOException ioe) {
      // Existing contract: an I/O failure is reported to the caller as null.
      return null;
    }

    Document result = parsed.getDocument();
    normalizeFragment(result, parsed.getFragment());
    return result;
  }

  /**
   * Parses {@code source} as an HTML fragment, with Neko configured to treat it as if it were
   * already nested inside HTML and BODY elements.
   *
   * @param source the HTML text to parse
   * @return the parsed fragment, or {@code null} if reading the source raised an
   *     {@link IOException}
   */
  @Override
  protected DocumentFragment parseFragmentImpl(String source) throws GadgetException {
    // Context the balancer should assume is already open around the fragment.
    QName[] contextStack = {
        new QName(null, "HTML", "HTML", null),
        new QName(null, "BODY", "BODY", null)
    };

    HTMLConfiguration fragmentConfig = newConfiguration();
    // http://cyberneko.org/html/features/balance-tags/document-fragment
    // replaces the deprecated http://cyberneko.org/html/features/document-fragment
    fragmentConfig.setFeature(
        "http://cyberneko.org/html/features/balance-tags/document-fragment", true);
    fragmentConfig.setProperty(
        "http://cyberneko.org/html/properties/balance-tags/fragment-context-stack",
        contextStack);

    try {
      return parseHtmlImpl(source, fragmentConfig, new NekoPatchTagBalancer()).getFragment();
    } catch (IOException ioe) {
      // Existing contract: an I/O failure is reported to the caller as null.
      return null;
    }
  }

  /**
   * Parse HTML source by wiring up a Neko scanner, the supplied tag balancer, a namespace
   * binder and a fresh {@link DocumentHandler}, then scanning the whole input.
   *
   * @param source the HTML text to parse
   * @param config the Neko configuration every pipeline component is reset with
   * @param tagBalancer the balancer placed between the scanner and the namespace binder
   * @return a document handler containing the parsed source
   * @throws IOException if the scanner fails while reading the source
   */
  private DocumentHandler parseHtmlImpl(String source, HTMLConfiguration config,
      NormalizingTagBalancer tagBalancer)
      throws IOException {

    HTMLScanner htmlScanner = new HTMLScanner();
    // The balancer hands buffered OSML script content back to this scanner for parsing.
    tagBalancer.setScanner(htmlScanner);

    DocumentHandler handler = newDocumentHandler(source);

    // Downstream half of the pipeline: Tag Balancer -> Namespace Binder -> DocumentHandler
    NamespaceBinder namespaceBinder = new NamespaceBinder();
    namespaceBinder.setDocumentHandler(handler);
    namespaceBinder.setDocumentSource(tagBalancer);
    namespaceBinder.reset(config);
    tagBalancer.setDocumentHandler(namespaceBinder);

    // Upstream half of the pipeline: Scanner -> Tag Balancer
    tagBalancer.setDocumentSource(htmlScanner);
    htmlScanner.setDocumentHandler(tagBalancer);

    tagBalancer.reset(config);
    htmlScanner.reset(config);

    // The input is supplied as a character stream read from the source string.
    XMLInputSource inputSource = new XMLInputSource(null, null, null);
    inputSource.setEncoding("UTF-8");
    inputSource.setCharacterStream(new StringReader(source));
    htmlScanner.setInputSource(inputSource);
    // Scan the entire document in a single call.
    htmlScanner.scanDocument(true);
    return handler;
  }

  /**
   * Builds the Neko configuration shared by document and fragment parsing.
   *
   * @return a new configuration with name handling, reference notification and namespace
   *     settings applied
   */
  protected HTMLConfiguration newConfiguration() {
    HTMLConfiguration configuration = new HTMLConfiguration();

    // Name handling for elements and attributes: keep the original case.
    configuration.setProperty("http://cyberneko.org/html/properties/names/elems", "match");
    configuration.setProperty("http://cyberneko.org/html/properties/names/attrs", "no-change");

    // Ask the scanner to report character references and built-in entity references.
    configuration.setFeature("http://apache.org/xml/features/scanner/notify-char-refs", true);
    configuration.setFeature(
        "http://cyberneko.org/html/features/scanner/notify-builtin-refs", true);

    // Namespace processing is enabled.
    configuration.setFeature("http://xml.org/sax/features/namespaces", true);
    return configuration;
  }

  /**
   * Creates the handler that receives the parse events for {@code source}. Protected so that
   * subclasses can supply their own handler.
   *
   * @param source the HTML text about to be parsed
   * @return a new handler for this parse
   */
  protected DocumentHandler newDocumentHandler(String source) {
    return new DocumentHandler(source);
  }

  /** Handler for XNI events from Neko */
  protected class DocumentHandler implements XMLDocumentHandler {

    private final Stack elementStack = new Stack();

    private final StringBuilder builder;

    private boolean inEntity = false;


    private DocumentFragment documentFragment;

    private Document document;

    public DocumentHandler(String content) {
      builder = new StringBuilder(content.length() / 10);
    }

    public DocumentFragment getFragment() {
      return documentFragment;
    }

    public Document getDocument() {
      return document;
    }

    public void startDocument(XMLLocator xmlLocator, String encoding,
        NamespaceContext namespaceContext, Augmentations augs)
        throws XNIException {
      document = documentFactory.createDocument(null, null, null);
      elementStack.clear();
      documentFragment = document.createDocumentFragment();
      elementStack.push(documentFragment);
    }

    public void xmlDecl(String version, String encoding, String standalone, Augmentations augs)
        throws XNIException {
      // Dont really do anything with this
      builder.append("');
    }

    public void doctypeDecl(String rootElement, String publicId, String systemId,
        Augmentations augs) throws XNIException {
      document = documentFactory.createDocument(null, null,
          documentFactory.createDocumentType(rootElement, publicId, systemId));
      elementStack.clear();
      documentFragment = document.createDocumentFragment();
      elementStack.push(documentFragment);
    }

    public void comment(XMLString text, Augmentations augs) throws XNIException {
      flushTextBuffer();

      // Add comments as comment nodes - needed to support sanitization
      // of SocialMarkup-parsed content
      Node comment = getDocument().createComment(new String(text.ch, text.offset, text.length));
      appendChild(comment);
    }

    public void processingInstruction(String s, XMLString xmlString, Augmentations augs)
        throws XNIException {
      // No-op
    }

    public void startElement(QName qName, XMLAttributes xmlAttributes, Augmentations augs)
        throws XNIException {
      Element element = startElementImpl(qName, xmlAttributes);
      // Not an empty element, so push on the stack
      elementStack.push(element);
    }

    public void emptyElement(QName qName, XMLAttributes xmlAttributes, Augmentations augs)
        throws XNIException {
      startElementImpl(qName, xmlAttributes);
    }

    /** Flush any existing text content to the document.  Call this before appending any nodes. */
    protected void flushTextBuffer() {
      if (builder.length() > 0) {
        appendChild(document.createTextNode(builder.toString()));
        builder.setLength(0);
      }
    }

    /** Create an Element in the DOM */
    private Element startElementImpl(QName qName, XMLAttributes xmlAttributes) {
      flushTextBuffer();

      Element element;
      // Preserve XML namespace if present
      if (qName.uri != null) {
        element = document.createElementNS(qName.uri, qName.rawname);
      } else {
        element = document.createElement(qName.rawname);
      }

      for (int i = 0; i < xmlAttributes.getLength(); i++) {
        if (xmlAttributes.getURI(i) != null) {
          element.setAttributeNS(xmlAttributes.getURI(i), xmlAttributes.getQName(i),
              xmlAttributes.getValue(i));
        } else {
          try {
            element.setAttribute(xmlAttributes.getLocalName(i), xmlAttributes
                .getValue(i));
          } catch (DOMException e) {
            switch (e.code) {
              case DOMException.INVALID_CHARACTER_ERR:
                StringBuilder sb = new StringBuilder(e.getMessage());
                sb.append("Around ...<");
                if (qName.prefix != null) {
                  sb.append(qName.prefix);
                  sb.append(":");
                }
                sb.append(qName.localpart);
                for (int j = 0; j < xmlAttributes.getLength(); j++) {
                  if (StringUtils.isNotBlank(xmlAttributes.getLocalName(j))
                      && StringUtils.isNotBlank(xmlAttributes.getValue(j))) {
                    sb.append(' ');
                    sb.append(xmlAttributes.getLocalName(j));
                    sb.append("=\"");
                    sb.append(xmlAttributes.getValue(j)).append('\"');
                  }
                }
                sb.append("...");
                throw new DOMException(DOMException.INVALID_CHARACTER_ERR, sb.toString());
              default:
                throw e;
            }
          }
        }
      }
      appendChild(element);
      return element;
    }

    public void startGeneralEntity(String name, XMLResourceIdentifier id, String encoding,
        Augmentations augs) throws XNIException {
      if (name.startsWith("#")) {
        try {
          boolean hex = name.startsWith("#x");
          int offset = hex ? 2 : 1;
          int base = hex ? 16 : 10;
          int value = Integer.parseInt(name.substring(offset), base);
          String entity = HTMLEntities.get(value);
          if (entity != null) {
            name = entity;
          }
        }
        catch (NumberFormatException e) {
          // ignore
        }
      }
      printEntity(name);
      inEntity = true;
    }

    private void printEntity(String name) {
      builder.append('&');
      builder.append(name);
      builder.append(';');
    }

    public void textDecl(String s, String s1, Augmentations augs) throws XNIException {
      builder.append(s);
    }

    public void endGeneralEntity(String s, Augmentations augs) throws XNIException {
      inEntity = false;
    }

    public void characters(XMLString text, Augmentations augs) throws XNIException {
      if (inEntity) {
        return;
      }
      builder.append(text.ch, text.offset, text.length);
    }

    public void ignorableWhitespace(XMLString text, Augmentations augs) throws XNIException {
      builder.append(text.ch, text.offset, text.length);
    }

    public void endElement(QName qName, Augmentations augs) throws XNIException {
      flushTextBuffer();
      elementStack.pop();
    }

    public void startCDATA(Augmentations augs) throws XNIException {
      //No-op
    }

    public void endCDATA(Augmentations augs) throws XNIException {
      //No-op
    }

    public void endDocument(Augmentations augs) throws XNIException {
      flushTextBuffer();
      elementStack.pop();
    }

    public void setDocumentSource(XMLDocumentSource xmlDocumentSource) {
    }

    public XMLDocumentSource getDocumentSource() {
      return null;
    }

    private void appendChild(Node node) {
      elementStack.peek().appendChild(node);
    }
  }

  /**
   * Used when parsing document fragments to correct a bug in Neko 1.9.13. We use the
   * http://cyberneko.org/html/properties/balance-tags/fragment-context-stack
   * property of Neko to force the fragment to be parsed as if it were already container in a body
   * tag. This doesnt quite work together as without this fix it will still introduce head tags
   * if the first parsed tags are allowed in a head tag.
   * See https://sourceforge.net/tracker/?func=detail&atid=952178&aid=2870180&group_id=195122
   */
  private class NekoPatchTagBalancer extends NormalizingTagBalancer {

    /**
     * Override the document start to record whether HTML, HEAD or BODY have been seen
     */
    @Override
    public void startDocument(XMLLocator locator, String encoding,
        NamespaceContext nscontext, Augmentations augs)
        throws XNIException {

      super.startDocument(locator, encoding, nscontext, augs);
      for (int i = fElementStack.top - 1; i >= 0; i--) {
        fSeenAnything = true;
        if (fElementStack.data[i].element.code == HTMLElements.HTML) {
          fSeenRootElement = true;
        }
        if (fElementStack.data[i].element.code == HTMLElements.HEAD) {
          fSeenHeadElement = true;
        }
        if (fElementStack.data[i].element.code == HTMLElements.BODY) {
          fSeenBodyElement = true;
        }
      }
    }
  }

  /**
   * Subclass of Neko's tag balancer that
   * - Normalizes the case of forced html, head and body tags when they don't exist in the original
   * content.
   * - Converts script tags whose type maps to an OSML tag into that tag, records their text
   * content and hands it back to the scanner so that it is parsed as markup.
   */
  private class NormalizingTagBalancer extends HTMLTagBalancer {

    /** Text collected inside the OSML script element that is currently open. */
    private StringBuilder scriptContent;

    /** Scanner that re-parses collected script content; set through {@link #setScanner}. */
    private HTMLScanner scanner;

    /** Replacement tag for the open OSML script element, or null when none is open. */
    private QName currentOsmlTag;

    public NormalizingTagBalancer() {
    }

    public void setScanner(HTMLScanner scanner) {
      this.scanner = scanner;
    }

    /** Buffers text while an OSML script element is open; otherwise passes it through. */
    @Override
    public void characters(XMLString text, Augmentations augs) throws XNIException {
      if (currentOsmlTag == null) {
        super.characters(text, augs);
        return;
      }
      scriptContent.append(text.ch, text.offset, text.length);
    }

    /**
     * Lower-cases html/head/body names that have not been seen yet and replaces OSML-typed
     * script elements with their OSML tag before delegating to the balancer.
     *
     * @throws XNIException if an OSML script element starts while another one is still open
     */
    @Override
    public void startElement(QName elem, XMLAttributes attrs, Augmentations augs)
        throws XNIException {
      // Normalize the case of forced-elements to lowercase for backward compatability
      String raw = elem.rawname;
      String normalized = null;
      if (!fSeenRootElement && raw.equalsIgnoreCase("html")) {
        normalized = "html";
      } else if (!fSeenHeadElement && raw.equalsIgnoreCase("head")) {
        normalized = "head";
      } else if (!fSeenBodyElement && raw.equalsIgnoreCase("body")) {
        normalized = "body";
      }
      if (normalized != null) {
        elem.localpart = normalized;
        elem.rawname = normalized;
      }

      // Only script tags with a recognised type are converted.
      String osmlTagName = null;
      if ("script".equalsIgnoreCase(elem.rawname)) {
        osmlTagName = SCRIPT_TYPE_TO_OSML_TAG.get(attrs.getValue("type"));
      }
      if (osmlTagName == null) {
        super.startElement(elem, attrs, augs);
        return;
      }

      if (currentOsmlTag != null) {
        throw new XNIException("Nested OpenSocial script elements");
      }
      currentOsmlTag = new QName(null, osmlTagName, osmlTagName, null);
      if (scriptContent == null) {
        scriptContent = new StringBuilder();
      }
      // The type attribute is dropped from the converted element.
      attrs.removeAttributeAt(attrs.getIndex("type"));
      super.startElement(currentOsmlTag, attrs, augs);
    }


    /**
     * Closes an element. When the closing script tag ends an OSML element, the buffered
     * content is first fed to the scanner and the OSML tag is closed in place of the script
     * tag.
     */
    @Override
    public void endElement(QName element, Augmentations augs) throws XNIException {
      boolean closesOsml =
          currentOsmlTag != null && "script".equalsIgnoreCase(element.rawname);
      if (!closesOsml) {
        super.endElement(element, augs);
        return;
      }

      // Clear the marker first so events produced by the re-parse are handled normally.
      QName endingTag = currentOsmlTag;
      currentOsmlTag = null;

      String buffered = scriptContent.toString();
      scriptContent.setLength(0);
      XMLInputSource scriptSource = new XMLInputSource(null, null, null);
      scriptSource.setCharacterStream(new StringReader(buffered));

      // Evaluate the content of the script block immediately
      scanner.evaluateInputSource(scriptSource);

      super.endElement(endingTag, augs);
    }

    /** Resolves OSML tag names to their custom descriptors before falling back to Neko's. */
    @Override
    protected HTMLElements.Element getElement(QName elementName) {
      HTMLElements.Element osmlElement = OSML_ELEMENTS.get(elementName.localpart);
      return osmlElement != null ? osmlElement : super.getElement(elementName);
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy