
com.day.cq.wcm.designimporter.parser.DesignImporterHTMLParser Maven / Gradle / Ivy
/*************************************************************************
*
* ADOBE CONFIDENTIAL
* ___________________
*
* Copyright 2012 Adobe Systems Incorporated
* All Rights Reserved.
*
* NOTICE: All information contained herein is, and remains
* the property of Adobe Systems Incorporated and its suppliers,
* if any. The intellectual and technical concepts contained
* herein are proprietary to Adobe Systems Incorporated and its
* suppliers and are protected by trade secret or copyright law.
* Dissemination of this information or reproduction of this material
* is strictly forbidden unless prior written permission is obtained
* from Adobe Systems Incorporated.
**************************************************************************/
package com.day.cq.wcm.designimporter.parser;
import com.day.cq.dam.indd.PageBuilder;
import com.day.cq.dam.indd.PageComponent;
import com.day.cq.wcm.api.Page;
import com.day.cq.wcm.designimporter.DesignImportException;
import com.day.cq.wcm.designimporter.DesignImporterContext;
import com.day.cq.wcm.designimporter.MissingCanvasException;
import com.day.cq.wcm.designimporter.UnsupportedTagContentException;
import com.day.cq.wcm.designimporter.api.TagHandlerProvider;
import com.day.cq.wcm.designimporter.util.StreamUtil;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.CloseShieldInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import javax.jcr.RepositoryException;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
/**
 * Parses the HTML document stream in the design package and extracts the components, styles, scripts etc.
 *
 * <p>Typical usage: configure the instance through {@link #setTagHandlerProvider(TagHandlerProvider)},
 * {@link #setDesignImporterContext(DesignImporterContext)} and {@link #setCanvasResourceType(String)},
 * then call {@link #parse(InputStream, PageBuilder)} and read the returned {@link ParseResult}.</p>
 *
 * <p>The result of the most recent parse is also kept on the instance so that the deprecated
 * getters keep working. Those getters dereference that stored result directly, so calling them
 * before any parse has completed fails with a {@link NullPointerException}.</p>
 */
public class DesignImporterHTMLParser {

    private static final Logger logger = LoggerFactory.getLogger(DesignImporterHTMLParser.class);

    /**
     * The import context associated with the import request.
     */
    protected DesignImporterContext designImporterContext;

    /** Resource type handed to the content handler for the canvas component. */
    private String canvasresourceType;

    /** Content handler used by the most recent parse. */
    private HTMLContentHandler handler;

    private TagHandlerProvider tagHandlerProvider;

    /** Result of the most recent parse; {@code null} until a parse has completed. */
    private ParseResult parseResult;

    /**
     * Gets the {@link HTMLContent} associated with the body of the input HTML document.
     *
     * @return The {@link HTMLContent}
     *
     * @deprecated Use {@link com.day.cq.wcm.designimporter.parser.ParseResult#getBodyHtmlContent()} instead
     */
    @Deprecated
    public HTMLContent getBodyHtmlContent() {
        return parseResult.getBodyHtmlContent();
    }

    /**
     * Gets the list of {@link PageComponent} objects translated from the marked component
     * divs during the import process
     *
     * @return The {@link List} of {@link PageComponent} objects
     *
     * @deprecated Use {@link com.day.cq.wcm.designimporter.parser.ParseResult#getComponents()} instead
     */
    @Deprecated
    public List getComponents() {
        return parseResult.getComponents();
    }

    /**
     * Gets the {@link HTMLContent} associated with the head of the input HTML document.
     *
     * @return The {@link HTMLContent}
     *
     * @deprecated Use {@link com.day.cq.wcm.designimporter.parser.ParseResult#getHeadHtmlContent()} instead
     */
    @Deprecated
    public HTMLContent getHeadHtmlContent() {
        return parseResult.getHeadHtmlContent();
    }

    /**
     * Gets the language specified in the input HTML document.
     *
     * @return The locale represented by the lang attribute in HTML
     * @deprecated Use {@link com.day.cq.wcm.designimporter.parser.ParseResult#getLanguage()} instead
     */
    @Deprecated
    public String getLanguage() {
        return parseResult.getLanguage();
    }

    /**
     * Internal parse method: feeds the HTML stream through a TagSoup parser and reports
     * the SAX events to the given handler.
     *
     * @param stream The input HTML document stream; wrapped in a {@link BufferedInputStream}
     *               when it does not support mark/reset
     * @param handler The SAX content handler receiving the parse events
     * @throws IOException if reading the stream fails
     * @throws SAXException if the parser or the handler reports an error
     */
    private void parse(InputStream stream, ContentHandler handler) throws IOException, SAXException {
        if (!stream.markSupported()) {
            stream = new BufferedInputStream(stream);
        }

        // Protect the stream from being closed by CyberNeko
        // TODO: Is this still needed, given our use of TagSoup?
        stream = new CloseShieldInputStream(stream);

        // Prepare the input source using the encoding hint if available
        InputSource source = new InputSource(stream);
        source.setEncoding(StreamUtil.getEncoding(stream));

        // Parse the HTML document
        org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser();

        // TIKA-528: Reuse share schema to avoid heavy instantiation
        // NOTE(review): a new HTML5Schema is created on every call here, so nothing is
        // actually shared - confirm whether a shared instance was intended.
        parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, new HTML5Schema());

        // TIKA-599: Shared schema is thread-safe only if bogons are ignored
        parser.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);

        parser.setContentHandler(handler);
        parser.parse(source);
    }

    /**
     * Extracts components from the HTML stream.
     *
     * <p>Delegates to {@link #parse(java.io.InputStream, com.day.cq.dam.indd.PageBuilder)} and
     * discards its return value; the {@code page} argument is not used.</p>
     *
     * @param page The page for which the components are being extracted (ignored)
     * @param stream The input HTML document stream
     * @param pageBuilder The {@link PageBuilder} object for building the {@link PageComponent}s
     *
     * @throws DesignImportException if the content handler aborts the parse with a
     *         {@link DesignImportException}
     *
     * @deprecated Use {@link #parse(java.io.InputStream, com.day.cq.dam.indd.PageBuilder)} instead
     */
    @Deprecated
    public void parse(Page page, InputStream stream, PageBuilder pageBuilder) throws DesignImportException {
        parse(stream, pageBuilder);
    }

    /**
     * Parses the HTML stream and collects the generated components, the head and body
     * HTML content and the document language into a {@link ParseResult}.
     *
     * <p>A fresh {@link HTMLContentHandler} is configured with the tag handler provider,
     * import context and canvas resource type currently set on this parser. I/O and SAX
     * failures that do not wrap a {@link DesignImportException} are logged, and whatever
     * the handler collected up to that point is still returned.</p>
     *
     * @param stream The input HTML document stream
     * @param pageBuilder The {@link PageBuilder} object for building the {@link PageComponent}s
     *
     * @return The {@link ParseResult}; its component list is never {@code null}
     *
     * @throws DesignImportException if the SAX parse was aborted by a wrapped
     *         {@link DesignImportException}
     */
    public ParseResult parse(InputStream stream, PageBuilder pageBuilder) throws DesignImportException {
        handler = new HTMLContentHandler();
        handler.setTagHandlerProvider(tagHandlerProvider);
        handler.setDesignImporterContext(designImporterContext);
        handler.setCanvasResourceType(canvasresourceType);
        handler.setPageBuilder(pageBuilder);

        try {
            parse(stream, handler);
        } catch (IOException e) {
            logger.error("An IO error occured while parsing the input HTML stream", e);
        } catch (SAXException e) {
            // Import failures raised inside the handler travel wrapped in a SAXException;
            // unwrap them so callers see the original exception.
            Exception cause = e.getException();
            if (cause instanceof DesignImportException) {
                throw (DesignImportException) cause;
            }
            // Any other SAX failure used to be dropped silently; record it so a partial
            // result can be diagnosed.
            logger.error("A SAX error occurred while parsing the input HTML stream", e);
        }

        List generatedComponents = handler.getGeneratedComponents();
        if (generatedComponents == null) {
            generatedComponents = new ArrayList();
        }

        this.parseResult = new ParseResult(generatedComponents,
                handler.getHeadHtmlContent(),
                handler.getBodyHtmlContent(),
                handler.getLanguage());
        return parseResult;
    }

    /**
     * Sets the unique name of the canvas component that would be generated by this page extractor
     *
     * @param canvasResourceType The resourceType of the canvas component
     */
    public void setCanvasResourceType(String canvasResourceType) {
        this.canvasresourceType = canvasResourceType;
    }

    /**
     * Sets the import context
     *
     * @param designImporterContext The {@link DesignImporterContext} object representing the current import context
     */
    public void setDesignImporterContext(DesignImporterContext designImporterContext) {
        this.designImporterContext = designImporterContext;
    }

    /**
     * Sets the {@link TagHandlerProvider}
     *
     * @param tagHandlerProvider The {@link TagHandlerProvider} object
     */
    public void setTagHandlerProvider(TagHandlerProvider tagHandlerProvider) {
        this.tagHandlerProvider = tagHandlerProvider;
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy