org.docx4j.template.xhtml.handler.def.XHTMLDocumentHandler Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of docx4j-template-xhtml Show documentation
Show all versions of docx4j-template-xhtml Show documentation
Building doc documents based on xhtml templates using docx4j
The newest version!
/*
* Copyright (c) 2018, hiwepy (https://github.com/hiwepy).
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package org.docx4j.template.xhtml.handler.def;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import org.docx4j.Docx4jProperties;
import org.docx4j.template.Docx4jConstants;
import org.docx4j.template.xhtml.DataMap;
import org.docx4j.template.xhtml.handler.DocumentHandler;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Document.OutputSettings;
import org.jsoup.safety.Whitelist;
/**
*
* TODO
* @author hiwepy
*/
public class XHTMLDocumentHandler implements DocumentHandler {
private static final XHTMLDocumentHandler XHTML_DOCUMENT_HANDLER = new XHTMLDocumentHandler();
/**
* Generate a DocumentHandler.
* @return the DocumentHandler
*/
public static DocumentHandler getDocumentHandler() {
return XHTML_DOCUMENT_HANDLER;
}
protected XHTMLDocumentHandler() {
}
/**
* Jsoup.parse(File in, String charsetName) : 它使用文件的路径做为 baseUri。 这个方法适用于如果被解析文件位于网站的本地文件系统,且相关链接也指向该文件系统
* Jsoup.parse(File in, String charsetName, String baseUri) : 这个方法用来加载和解析一个HTML文件。如在加载文件的时候发生错误,将抛出IOException,应作适当处理。
* baseUri 参数用于解决文件中URLs是相对路径的问题。如果不需要可以传入一个空的字符串。
*/
@Override
public Document handle( File htmlFile) throws IOException {
//获取Jsoup参数
String charsetName = Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_JSOUP_PARSE_CHARSETNAME, Docx4jConstants.DEFAULT_CHARSETNAME );
String baseUri = Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_JSOUP_PARSE_BASEURI,"");
//使用Jsoup将html转换成Document对象
Document doc = Jsoup.parse(htmlFile, charsetName, baseUri );
//返回Document对象
return doc;
}
/**
* Jsoup.parse(html)
* Jsoup.parse(html, baseUri)
* Jsoup.parseBodyFragment(bodyHtml)
* Jsoup.parseBodyFragment(bodyHtml, baseUri)
*/
@Override
public Document handle( String html,boolean fragment) throws IOException{
//获取Jsoup参数
String baseUri = Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_JSOUP_PARSE_BASEURI,"");
//使用Jsoup将html转换成Document对象
Document doc = fragment ? Jsoup.parseBodyFragment( html, baseUri) : Jsoup.parse( html,baseUri);
//返回Document对象
return doc;
}
/**
* Jsoup.parse(String url, int timeoutMillis)
* Jsoup.connect(String url) 方法创建一个新的 Connection, 和 post() 取得和解析一个HTML文件。如果从该URL获取HTML时发生错误,便会抛出 IOException,应适当处理。
* 这两个方法只支持Web URLs (http和https 协议);
*/
@Override
public Document handle(URL url) throws IOException{
//获取Jsoup参数
String baseUri = Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_JSOUP_PARSE_BASEURI,"");
int timeout = Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_JSOUP_PARSE_TIMEOUTMILLIS, Docx4jConstants.DEFAULT_TIMEOUTMILLIS);
//fetch the specified URL and parse to a HTML DOM
Document doc = Jsoup.parse(url,timeout);
doc.setBaseUri(baseUri);
//返回Document对象
return doc;
}
/**
* Jsoup.parse(String url, int timeoutMillis)
* Jsoup.connect(String url) 方法创建一个新的 Connection, 和 post() 取得和解析一个HTML文件。如果从该URL获取HTML时发生错误,便会抛出 IOException,应适当处理。
* 这两个方法只支持Web URLs (http和https 协议);
*/
@Override
public Document handle(String url, DataMap dataMap) throws IOException{
//获取Jsoup参数
String baseUri = Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_JSOUP_PARSE_BASEURI,"");
String userAgent = "Mozilla/5.0 (jsoup)";
int timeout = Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_JSOUP_PARSE_TIMEOUTMILLIS, Docx4jConstants.DEFAULT_TIMEOUTMILLIS);
//fetch the specified URL and parse to a HTML DOM
Document doc = Jsoup.connect(url)
.data(dataMap.getData1())
.data(dataMap.getData2())
.userAgent(userAgent)
.cookies(dataMap.getCookies())
.timeout(timeout)
.post();
doc.setBaseUri(baseUri);
//返回Document对象
return doc;
}
/**
* Jsoup.parse(in, charsetName, baseUri)
*/
@Override
public Document handle( InputStream input) throws IOException{
//获取Jsoup参数
String charsetName = Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_JSOUP_PARSE_CHARSETNAME, Docx4jConstants.DEFAULT_CHARSETNAME );
String baseUri = Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_JSOUP_PARSE_BASEURI,"");
//使用Jsoup将html转换成Document对象
Document doc = Jsoup.parse(input, charsetName, baseUri);
OutputSettings outputSettings = new OutputSettings();
outputSettings.prettyPrint(false);
/*
outputSettings.syntax(syntax)
outputSettings.charset(charset)
outputSettings*/
doc.outputSettings(outputSettings);
//返回Document对象
return doc;
}
public static void main(String[] args) {
String baseUri = "http://www.baidu.com";
String html = "搜索设置";
String doc = Jsoup.clean(html, baseUri, Whitelist.none());
System.out.println(doc);
System.out.println("*******");
doc = Jsoup.clean(html, baseUri, Whitelist.simpleText());
System.out.println(doc);
System.out.println("*******");
doc = Jsoup.clean(html, baseUri, Whitelist.basic());
System.out.println(doc);
System.out.println("*******");
doc = Jsoup.clean(html, baseUri, Whitelist.basicWithImages());
System.out.println(doc);
System.out.println("*******");
doc = Jsoup.clean(html, baseUri, Whitelist.relaxed());
System.out.println(doc);
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy