com.processpuzzle.fitnesse.print.html.FitNessePageContentExtractor Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of fit-print-plugin Show documentation
Show all versions of fit-print-plugin Show documentation
FitNesse Plugin to print wiki content into PDF.
The newest version!
package com.processpuzzle.fitnesse.print.html;
import static com.processpuzzle.fitnesse.print.html.XmlUtil.asList;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.TransformerFactoryConfigurationError;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import org.apache.commons.lang.StringUtils;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.PrettyXmlSerializer;
import org.htmlcleaner.TagNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ResourceLoader;
import org.springframework.stereotype.Component;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
@Component
public class FitNessePageContentExtractor {
// Resource location of the XSLT stylesheet; the "classpath:" prefix suggests it is
// resolved through resourceLoader -- NOTE(review): the usage is not visible here, confirm.
private static final String XSLT_FILE = "classpath:FitToPdf.xsl";
// Class-wide SLF4J logger.
private static final Logger logger = LoggerFactory.getLogger( FitNessePageContentExtractor.class );
// Per-instance working state for XML parsing.
// NOTE(review): where these are initialised is not visible in this part of the file.
private DocumentBuilderFactory builderFactory;
private Document documentDOM;
private String correctedHtml;
private DocumentBuilder domParser;
// Injected by Spring via field injection.
@Autowired ResourceLoader resourceLoader;
// HTML handed to the most recent extractRealContent() call.
private String sourceHtml;
// Value returned by extractRealContent().
private String strippedContent;
private XPath xPath;
// public accessors and mutators
/**
 * Runs the given HTML through {@link HtmlCleaner} and serializes the resulting
 * tag tree with a {@link PrettyXmlSerializer} configured from the cleaner's properties.
 *
 * @param inputHtml the HTML text to clean
 * @return the serialized output, decoded from the serializer's bytes as UTF-8
 * @throws IOException if writing the serialized output fails
 */
public String cleanUpHtml( String inputHtml ) throws IOException {
  HtmlCleaner htmlCleaner = new HtmlCleaner();
  CleanerProperties cleanerProperties = htmlCleaner.getProperties();
  TagNode rootTag = htmlCleaner.clean( inputHtml );

  // Serialize into an in-memory buffer, then decode the bytes as UTF-8 text.
  ByteArrayOutputStream buffer = new ByteArrayOutputStream();
  PrettyXmlSerializer serializer = new PrettyXmlSerializer( cleanerProperties );
  serializer.writeToStream( rootTag, buffer );
  return new String( buffer.toByteArray(), StandardCharsets.UTF_8 );
}
/**
 * Extracts the real page content from a FitNesse page's HTML.
 *
 * <p>Stores the input in {@code sourceHtml}, then runs the three processing steps in
 * order: {@code correctFailures()}, {@code parseSourceHtml()} and
 * {@code strippContentWithXslt()}. The result is read from the {@code strippedContent}
 * field.
 *
 * <p>Failures in any step are logged and not rethrown. The result field is cleared
 * before processing starts, so a failed run cannot hand back the output of an
 * earlier call on the same instance.
 *
 * <p>NOTE(review): the method works through instance fields, so concurrent calls on a
 * shared instance would interfere with each other -- confirm how this bean is used.
 *
 * @param sourceHtml the HTML of the FitNesse page
 * @return the value of {@code strippedContent} after processing; {@code null} when
 *         processing failed before a result was stored
 */
public String extractRealContent( String sourceHtml ) {
  logger.debug( "Extracting real content for FitNesse Page: {}", sourceHtml );
  this.sourceHtml = sourceHtml;
  // Drop any result left over from a previous invocation before starting.
  this.strippedContent = null;
  try{
    correctFailures();
    parseSourceHtml();
    strippContentWithXslt();
  }catch( IOException | TransformerException | ParserConfigurationException | SAXException e ){
    // Best effort: report the problem and fall through to return what we have.
    logger.error( "Extracting real page content failed.", e );
  }
  logger.debug( "Stripped content HTML: \n{}", strippedContent );
  return strippedContent;
}
// protected, private helper methods
private String correctFailures() {
String[] searchList = { "", " ", "