com.processpuzzle.fitnesse.print.html.FitNessePageContentExtractor Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of fit-print-plugin Show documentation
FitNesse Plugin to print wiki content into PDF.
The newest version!
package com.processpuzzle.fitnesse.print.html;

import static com.processpuzzle.fitnesse.print.html.XmlUtil.asList;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.TransformerFactoryConfigurationError;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;

import org.apache.commons.lang.StringUtils;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.PrettyXmlSerializer;
import org.htmlcleaner.TagNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ResourceLoader;
import org.springframework.stereotype.Component;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

@Component
public class FitNessePageContentExtractor {
   // Location of the XSLT stylesheet, written with a "classpath:" resource prefix.
   // NOTE(review): presumably resolved through resourceLoader - confirm in the XSLT step.
   private static final String XSLT_FILE = "classpath:FitToPdf.xsl";
   // Class-wide SLF4J logger.
   private static final Logger logger = LoggerFactory.getLogger( FitNessePageContentExtractor.class );
   // XML/DOM working state; where these are initialized and used is outside this section.
   private DocumentBuilderFactory builderFactory;
   private Document documentDOM;
   // NOTE(review): presumably the result of correctFailures() - verify against that method.
   private String correctedHtml;
   private DocumentBuilder domParser;
   // Injected by Spring; package-private (no access modifier).
   @Autowired ResourceLoader resourceLoader;
   // Raw page HTML, stored by extractRealContent() before the processing steps run.
   private String sourceHtml;
   // Value handed back to the caller by extractRealContent().
   private String strippedContent;
   private XPath xPath;

   // public accessors and mutators
   /**
    * Runs the given HTML through HtmlCleaner and returns the cleaned tree as pretty-printed XML text.
    *
    * @param inputHtml the HTML markup to clean
    * @return the serialized, cleaned markup, decoded from the serializer's byte output as UTF-8
    * @throws IOException if writing the serialized tree to the in-memory buffer fails
    */
   public String cleanUpHtml( String inputHtml ) throws IOException {
      final HtmlCleaner htmlCleaner = new HtmlCleaner();
      final CleanerProperties cleanerProperties = htmlCleaner.getProperties();
      final TagNode rootNode = htmlCleaner.clean( inputHtml );

      // Serialize into memory first; the bytes are turned into a String afterwards.
      final PrettyXmlSerializer serializer = new PrettyXmlSerializer( cleanerProperties );
      final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
      serializer.writeToStream( rootNode, buffer );

      return buffer.toString( StandardCharsets.UTF_8.name() );
   }

   /**
    * Extracts the real content of a FitNesse page from its raw HTML.
    * <p>
    * The given HTML is stored on this instance and three steps are run in order:
    * {@code correctFailures()}, {@code parseSourceHtml()} and {@code strippContentWithXslt()}.
    * An {@link IOException}, {@link TransformerException}, {@link ParserConfigurationException}
    * or {@link SAXException} raised by a step is logged and not propagated.
    *
    * @param sourceHtml the raw HTML of the FitNesse page
    * @return the value of {@code strippedContent} after the steps ran; {@code null} if a step
    *         failed before the field was assigned
    */
   public String extractRealContent( String sourceHtml ) {
      // Parameterized logging: the (potentially large) page is only rendered when debug is enabled.
      logger.debug( "Extracting real content for FitNesse Page: {}", sourceHtml );
      this.sourceHtml = sourceHtml;
      // Reset the result so that a failure below cannot hand back the content left over
      // from an earlier call on this same instance.
      // NOTE(review): assumes strippedContent is assigned by the steps below - confirm.
      this.strippedContent = null;

      try{
         correctFailures();
         parseSourceHtml();
         strippContentWithXslt();
      }catch( IOException | TransformerException | ParserConfigurationException | SAXException e ){
         // Deliberate best-effort: the failure is reported in the log, the caller gets the reset result.
         logger.error( "Extracting real page content failed.", e );
      }

      logger.debug( "Stripped content HTML: \n{}", strippedContent );

      return strippedContent;
   }

   // protected, private helper methods
   private String correctFailures() {
      String[] searchList = { "", "
", "", "", "
", "", "", "content=\"IE=edge\">", ".css\">",
            "", "