// All downloads are free. Search and download functionality uses the official Maven repository.
//
// com.msl.pdfier.commons.html.HTMLPrintableUtil Maven / Gradle / Ivy
//
// There is a newer version: 9.1.20
// Show newest version
package com.msl.pdfier.commons.html;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.List;

import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.msl.pdfier.commons.exception.PdfierException;
import com.msl.pdfier.commons.io.IOUtils;

import net.htmlparser.jericho.Attribute;
import net.htmlparser.jericho.Attributes;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.OutputDocument;
import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.SourceCompactor;
import net.htmlparser.jericho.StartTag;
import net.htmlparser.jericho.Util;

public class HTMLPrintableUtil {

	protected static final Logger logger = LoggerFactory.getLogger(HTMLPrintableUtil.class);

	public static String addMandatoryHtml(InputStream html) throws PdfierException {
		try {
			Source source = new Source(html);
			return addMandatoryHtml(source);
		} catch (IOException e) {
			logger.error("Error reading HTML to parse", e);
			throw new PdfierException("Error reading HTML to parse", e);
		}
	}

	public static String addMandatoryHtml(String html) throws PdfierException {
		Source source = new Source(html);
		return addMandatoryHtml(source);
	}
	
	public static String replaceNbsp(String html) throws PdfierException {
		return html.replace(" ", " ");
	}
	
	public static String removeBlanksBetweenTags(String html) throws PdfierException {
		return html.replace("> <", "><");
	}

	public static String moveStyleToHead(String inputHTML) throws PdfierException {
		try {
			Source source = new Source(inputHTML);
			Element bodyElement = source.getFirstElement(HTMLElementName.BODY);
			Element headElement = source.getFirstElement(HTMLElementName.HEAD);
			if (bodyElement != null) {
				List styleElements = bodyElement.getAllElements("style");
				if (styleElements != null && styleElements.size() > 0) {
					OutputDocument outputDocument = new OutputDocument(source);
					for (Element styleElement : styleElements) {
						if (styleElement != null) {
							outputDocument.remove(styleElement.getStartTag());
							outputDocument.remove(styleElement.getContent());
							if (!styleElement.getStartTag().isSyntacticalEmptyElementTag()) {
								outputDocument.remove(styleElement.getEndTag());
							}
							String styleString = "";
							outputDocument.replace(styleElement.getContent(), styleString);
							outputDocument.insert(headElement.getEndTag().getBegin(), styleElement);
						}
					}
					return outputDocument.toString();
				}
			}
			return source.toString();
		} catch (Exception e) {
			logger.error("Error generating HTML printable", e);
			throw new PdfierException("Error generating HTML printable", e);
		}
	}
	
	public static String addCDATAToHeadStyleTags(String inputHTML) throws PdfierException {
		try {
			Source source = new Source(inputHTML);
			Element headElement = source.getFirstElement(HTMLElementName.HEAD);
			if (headElement != null) {
				List styleElements = headElement.getAllElements("style");
				if (styleElements != null && styleElements.size() > 0) {
					OutputDocument outputDocument = new OutputDocument(source);
					for (Element styleElement : styleElements) {
						if (styleElement != null) {
							if(!styleElement.getContent().toString().startsWith("";
								sb.append("");
								outputDocument.replace(styleElement, sb.toString());
							}
						}
					}
					return outputDocument.toString();
				}
			}
			return source.toString();
		} catch (Exception e) {
			logger.error("Error generating HTML printable", e);
			throw new PdfierException("Error generating HTML printable", e);
		}
	}

	private static String addMandatoryHtml(Source source) throws PdfierException {
		try {
			Element htmlElement = source.getFirstElement(HTMLElementName.HTML);
			if (htmlElement != null) {
				logger.info("html element not null, parsing full HTML");
				return source.toString();
			} else {
				logger.info("main-content div received, parsing HTML fragment");
				OutputDocument outputDocument = new OutputDocument(source);
				StringBuilder sb = new StringBuilder();
				sb.append("");
				sb.append("");
				sb.append(source.toString());
				sb.append("");
				outputDocument.replace(outputDocument.getSegment(), sb.toString());
				return outputDocument.toString();
			}
		} catch (Exception e) {
			logger.error("Error generating HTML printable", e);
			throw new PdfierException("Error generating HTML printable", e);
		}
	}

	protected static String addLogoFragment(String inputHTML, String attr, String logoFragmentFile) throws PdfierException {
		try {
			Source source = new Source(inputHTML);
			OutputDocument outputDocument = new OutputDocument(source);
			StringBuilder sb = new StringBuilder();
			Element titleDIV = source.getFirstElement("class", attr, false);
			String logoFragment = "";
			try {
				logoFragment = Util.getString(new InputStreamReader(
						HTMLPrintableUtil.class.getClassLoader().getResourceAsStream(logoFragmentFile),
						Charset.forName("UTF-8")));
			} catch (Exception e) {
				logger.error("Error reading logo fragment html", e);
			}
			sb.setLength(0);
			sb.append("
\n").append(logoFragment).append(titleDIV.getContent()) .append("\n
"); outputDocument.replace(titleDIV, sb.toString()); return outputDocument.toString(); } catch (Exception e) { logger.error("Error adding logo fragment", e); throw new PdfierException("Error adding logo fragment", e); } } public static String removeElementByAttr(String inputHTML, String attr, String attrValue) throws PdfierException { try { Source source = new Source(inputHTML); OutputDocument outputDocument = new OutputDocument(source); List elements = source.getAllElements(attr, attrValue, false); for (Element element : elements) { outputDocument.remove(element.getStartTag().getBegin(), element.getEndTag().getEnd()); } return outputDocument.toString(); } catch (Exception e) { logger.error("Error removing element by attr value, attr" + attr + ",attrValue:" + attrValue, e); throw new PdfierException("Error removing element by attr value, attr" + attr + ",attrValue:" + attrValue, e); } } public static String addInlineStyleSheets(InputStream inputHTML, String cssFile) throws IOException { Source source = new Source(inputHTML); OutputDocument outputDocument = new OutputDocument(source); StringBuilder sb = new StringBuilder(); Element headElement = source.getFirstElement(HTMLElementName.HEAD); String styleSheetContent; try { styleSheetContent = Util.getString(new InputStreamReader( HTMLPrintableUtil.class.getClassLoader().getResourceAsStream(cssFile), Charset.forName("UTF-8"))); sb.append(headElement.getStartTag().toString()); sb.append(""); sb.append(headElement.getEndTag().toString()); outputDocument.replace(headElement, sb.append(headElement.getContent())); } catch (Exception e) { logger.error("Error reading stylesheet:" + cssFile, e); // don't convert if URL is invalid } return outputDocument.toString(); } public static String addExternalInlineStyleSheets(URL sourceUrl, String inputHTML) throws Exception { Source source = new Source(inputHTML); OutputDocument outputDocument = new OutputDocument(source); StringBuilder sb = new StringBuilder(); List linkStartTags = 
source.getAllStartTags(HTMLElementName.LINK); for (StartTag startTag : linkStartTags) { Attributes attributes = startTag.getAttributes(); String rel = attributes.getValue("rel"); if (!"stylesheet".equalsIgnoreCase(rel)) continue; String href = attributes.getValue("href"); if (href == null) continue; String styleSheetContent; try { styleSheetContent = Util.getString(new InputStreamReader(new URL(sourceUrl, href).openStream())); } catch (Exception ex) { logger.warn("Error adding external CSS to inline doc." + ex.getMessage()); continue; // don't convert if URL is invalid } sb.setLength(0); sb.append("\n").append(styleSheetContent).append("\n"); String finalStyleSheet = relativeToAbsoluteUrls(sourceUrl, href, sb.toString()); outputDocument.replace(startTag, finalStyleSheet); } logger.debug("Here is the document " + sourceUrl + " with all external stylesheets converted to inline stylesheets:\n"); return outputDocument.toString(); } protected static String relativeToAbsoluteUrls(URL sourceUrl, String href, String styleSheetContent) throws Exception { String content = styleSheetContent; if(styleSheetContent!= null && (styleSheetContent.contains("url(../") || styleSheetContent.contains("url(\"../"))){ if(href.startsWith("/")){ String rootPath = sourceUrl.getProtocol() + "://" + sourceUrl.getHost(); if(sourceUrl.getPort() > 0){ rootPath += ":" + sourceUrl.getPort(); } String[] paths = href.split("/"); String absoluteUrl = rootPath + "/" + paths[1] + "/"; content = styleSheetContent.replace("url(\"../", "url(\"" + absoluteUrl); content = content.replace("url(../", "url(" + absoluteUrl); // File file = new File("D:\\ECLIPSE_WORKSPACES\\PDFA\\pdf-project\\pdfa-gen\\replaced.css"); // IOUtils.stringToFile(content, file); } } return content; } protected static String parseImages(URL sourceUrl, String inputHTML) throws Exception { Source source = new Source(inputHTML); OutputDocument outputDocument = new OutputDocument(source); StringBuilder sb = new StringBuilder(); List 
imgStartTags = source.getAllStartTags(HTMLElementName.IMG); for (StartTag startTag : imgStartTags) { String fileName = "img" + System.currentTimeMillis() + ".png"; Attributes attributes = startTag.getAttributes(); String src = attributes.getValue("src"); if (src == null) continue; try { File file = new File("D:\\ECLIPSE_WORKSPACES\\PDFA\\pdf-project\\pdfa-gen\\src\\main\\resources\\pdf\\images\\" + fileName); URL imgUrl = new URL(sourceUrl, src); IOUtils.stringToFile(readUrlForPdf(imgUrl.toURI()), file); } catch (Exception ex) { logger.warn("Error adding external CSS to inline doc." + ex.getMessage()); continue; // don't convert if URL is invalid } sb.setLength(0); sb.append(""); outputDocument.replace(startTag, sb.toString()); } logger.debug("Here is the document " + sourceUrl + " with all external stylesheets converted to inline stylesheets:\n"); return outputDocument.toString(); } public static String readUrlForPdf(URI url) throws ClientProtocolException, IOException { CloseableHttpClient httpclient = HttpClients.createDefault(); HttpGet httpGet = new HttpGet(url); //httpGet.setHeader(HttpHeaders.CONTENT_TYPE, "application/json"); CloseableHttpResponse response = httpclient.execute(httpGet); String ret = ""; try { HttpEntity entity = response.getEntity(); ret = EntityUtils.toString(entity); } finally { response.close(); } return ret; } public static String compactSource(String inputHTML) throws PdfierException { try { Source source = new Source(inputHTML); SourceCompactor sourceCompactor = new SourceCompactor(source); return sourceCompactor.toString(); } catch (Exception e) { logger.error("Error compacting HTML", e); throw new PdfierException("Error compacting HTML", e); } } }




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy