// com.day.cq.wcm.designimporter.parser.DesignImporterHTMLParser Maven / Gradle / Ivy
//
// Go to download
/*************************************************************************
 *
 * ADOBE CONFIDENTIAL
 * ___________________
 *
 *  Copyright 2012 Adobe Systems Incorporated
 *  All Rights Reserved.
 *
 * NOTICE:  All information contained herein is, and remains
 * the property of Adobe Systems Incorporated and its suppliers,
 * if any.  The intellectual and technical concepts contained
 * herein are proprietary to Adobe Systems Incorporated and its
 * suppliers and are protected by trade secret or copyright law.
 * Dissemination of this information or reproduction of this material
 * is strictly forbidden unless prior written permission is obtained
 * from Adobe Systems Incorporated.
 **************************************************************************/
package com.day.cq.wcm.designimporter.parser;

import com.day.cq.dam.indd.PageBuilder;
import com.day.cq.dam.indd.PageComponent;
import com.day.cq.wcm.api.Page;
import com.day.cq.wcm.designimporter.DesignImportException;
import com.day.cq.wcm.designimporter.DesignImporterContext;
import com.day.cq.wcm.designimporter.MissingCanvasException;
import com.day.cq.wcm.designimporter.UnsupportedTagContentException;
import com.day.cq.wcm.designimporter.api.TagHandlerProvider;
import com.day.cq.wcm.designimporter.util.StreamUtil;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.CloseShieldInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import javax.jcr.RepositoryException;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

/**
 * Parses the HTML document stream in the design package and extracts the components, styles, scripts etc.
 *
 * <p>A parse run configures a fresh {@link HTMLContentHandler} with the tag handler provider, import
 * context and canvas resource type set on this object, feeds the HTML through a TagSoup parser and
 * collects the outcome in a {@link ParseResult}.</p>
 *
 * <p>The content handler and the most recent {@link ParseResult} are kept in instance fields without
 * any synchronization, so a single instance should not be used for concurrent parses.</p>
 */
public class DesignImporterHTMLParser {

    private static final Logger logger = LoggerFactory.getLogger(DesignImporterHTMLParser.class);

    /**
     * The import context associated with the import request.
     */
    protected DesignImporterContext designImporterContext;

    private String canvasresourceType;

    private HTMLContentHandler handler;

    private TagHandlerProvider tagHandlerProvider;

    /**
     * Result of the most recent parse; {@code null} until one of the public parse methods has
     * built a result.
     */
    private ParseResult parseResult;

    /**
     * Gets the {@link HTMLContent} associated with the body of the input HTML document.
     *
     * @return The {@link HTMLContent}
     *
     * @throws NullPointerException if no parse has produced a result on this instance yet
     *
     * @deprecated Use {@link com.day.cq.wcm.designimporter.parser.ParseResult#getBodyHtmlContent()} instead
     */
    @Deprecated
    public HTMLContent getBodyHtmlContent() {
        return parseResult.getBodyHtmlContent();
    }

    /**
     * Gets the list of {@link PageComponent} objects translated from the marked component
     * divs during the import process
     *
     * @return The {@link List} of {@link PageComponent} objects
     *
     * @throws NullPointerException if no parse has produced a result on this instance yet
     *
     * @deprecated Use {@link com.day.cq.wcm.designimporter.parser.ParseResult#getComponents()} instead
     */
    @Deprecated
    public List getComponents() {
        return parseResult.getComponents();
    }

    /**
     * Gets the {@link HTMLContent} associated with the head of the input HTML document.
     *
     * @return The {@link HTMLContent}
     *
     * @throws NullPointerException if no parse has produced a result on this instance yet
     *
     * @deprecated Use {@link com.day.cq.wcm.designimporter.parser.ParseResult#getHeadHtmlContent()} instead
     */
    @Deprecated
    public HTMLContent getHeadHtmlContent() {
        return parseResult.getHeadHtmlContent();
    }

    /**
     * Gets the language specified in the input HTML document.
     *
     * @return The locale represented by the lang attribute in HTML
     *
     * @throws NullPointerException if no parse has produced a result on this instance yet
     *
     * @deprecated Use {@link com.day.cq.wcm.designimporter.parser.ParseResult#getLanguage()} instead
     */
    @Deprecated
    public String getLanguage() {
        return parseResult.getLanguage();
    }

    /**
     * Internal parse method: runs a TagSoup parser over the stream and reports SAX events to the
     * given handler.
     *
     * @param stream The input HTML document stream; it is wrapped so that it is not closed here
     * @param handler The SAX content handler receiving the parse events
     * @throws IOException if reading the stream fails
     * @throws SAXException if the parser cannot be configured or the handler reports a failure
     */
    private void parse(InputStream stream, ContentHandler handler) throws IOException, SAXException {
        // Guarantee mark/reset support before the stream is handed to the encoding lookup below.
        // NOTE(review): presumably StreamUtil.getEncoding peeks at the content via mark/reset - confirm.
        if (!stream.markSupported()) {
            stream = new BufferedInputStream(stream);
        }

        // Shield the underlying stream so that a close() issued by the parser does not reach it.
        // (The original remark referred to CyberNeko; TagSoup is the parser used here.)
        // TODO: Is this still needed, given our use of TagSoup?
        stream = new CloseShieldInputStream(stream);

        // Prepare the input source using the encoding hint if available
        InputSource source = new InputSource(stream);
        source.setEncoding(StreamUtil.getEncoding(stream));

        // Parse the HTML document
        org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser();

        // TIKA-528 recommends reusing a shared schema to avoid heavy instantiation.
        // NOTE(review): a new HTML5Schema is nevertheless created for every parse here; consider
        // caching a single instance once its thread-safety has been confirmed.
        parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, new HTML5Schema());
        // TIKA-599: Shared schema is thread-safe only if bogons are ignored
        parser.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);

        parser.setContentHandler(handler);

        parser.parse(source);
    }

    /**
     * Extracts components from the HTML stream.
     *
     * <p>Delegates to {@link #parse(java.io.InputStream, com.day.cq.dam.indd.PageBuilder)} and
     * discards the returned {@link ParseResult}; the outcome remains reachable through the
     * deprecated getters of this class. The {@code page} argument is not used.</p>
     *
     * @param page The page for which the components are being extracted (ignored)
     * @param stream The input HTML document stream
     * @param pageBuilder The {@link PageBuilder} object for building the {@link PageComponent}s
     *
     * @throws DesignImportException if the content handler aborted the parse with a design import failure
     *
     * @deprecated Use {@link #parse(java.io.InputStream, com.day.cq.dam.indd.PageBuilder)} instead
     */
    @Deprecated
    public void parse(Page page, InputStream stream, PageBuilder pageBuilder) throws DesignImportException {
        parse(stream, pageBuilder);
    }

    /**
     * Parses the HTML stream and returns what the content handler extracted from it.
     *
     * <p>I/O errors and SAX errors that do not wrap a {@link DesignImportException} are logged and
     * do not abort the call: the result is then built from whatever the handler had gathered up to
     * the point of failure.</p>
     *
     * @param stream The input HTML document stream
     * @param pageBuilder The {@link PageBuilder} object handed to the content handler
     *
     * @return The {@link ParseResult} holding the generated components (an empty list if the handler
     *         produced none), the head and body HTML content and the document language
     *
     * @throws DesignImportException if the SAX failure raised during parsing wraps a
     *         {@link DesignImportException}; that wrapped exception is rethrown as is
     */
    public ParseResult parse(InputStream stream, PageBuilder pageBuilder) throws DesignImportException {
        handler = new HTMLContentHandler();
        handler.setTagHandlerProvider(tagHandlerProvider);
        handler.setDesignImporterContext(designImporterContext);
        handler.setCanvasResourceType(canvasresourceType);
        handler.setPageBuilder(pageBuilder);

        try {
            parse(stream, handler);
        } catch (IOException e) {
            logger.error("An IO error occured while parsing the input HTML stream", e);
        } catch (SAXException e) {
            // Import failures travel wrapped inside the SAXException; surface them unwrapped.
            if (e.getException() instanceof DesignImportException) {
                throw (DesignImportException) e.getException();
            }
            // Any other SAX failure used to vanish without a trace; keep the best-effort
            // behaviour but leave a record of what went wrong.
            logger.error("A SAX error occurred while parsing the input HTML stream", e);
        }

        List generatedComponents = handler.getGeneratedComponents();
        if (generatedComponents == null) {
            generatedComponents = new ArrayList();
        }

        this.parseResult = new ParseResult(generatedComponents,
                handler.getHeadHtmlContent(),
                handler.getBodyHtmlContent(),
                handler.getLanguage());
        return parseResult;
    }

    /**
     * Sets the unique name of the canvas component that would be generated by this page extractor
     *
     * @param canvasResourceType The resourceType of the canvas component
     */
    public void setCanvasResourceType(String canvasResourceType) {
        this.canvasresourceType = canvasResourceType;
    }

    /**
     * Sets the import context
     *
     * @param designImporterContext The {@link DesignImporterContext} object representing the current import context
     */
    public void setDesignImporterContext(DesignImporterContext designImporterContext) {
        this.designImporterContext = designImporterContext;
    }

    /**
     * Sets the {@link TagHandlerProvider}
     *
     * @param tagHandlerProvider The {@link TagHandlerProvider} object
     */
    public void setTagHandlerProvider(TagHandlerProvider tagHandlerProvider) {
        this.tagHandlerProvider = tagHandlerProvider;
    }

}