com.janeluo.easypdf.DocReader Maven / Gradle / Ivy
The newest version!
/* Copyright (c) 2021 janeluo
* easy-pdf is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PSL v2 for more details.
*/
package com.janeluo.easypdf;
import com.alibaba.fastjson.JSONObject;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.*;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
/**
* 读取 .doc 文件,并转换为 TextPDF 可识别的模板格式
*/
public class DocReader {
private URL xslUrl = null;
private boolean autoTitle = false;
private boolean ignoreBlankPara = false;
private Map jsonObject;
private Map jsonData;
/**
* 如果指定,将在文件中增加 XSL 风格页的引用
*
* @param url XSL stylesheet URL
*/
public void setXSLUrl(URL url) {
this.xslUrl = url;
}
/**
* 自动识别标题行,默认关闭,可以通过这个函数开启
*
* @param autoTitle 是/否
*/
public void setAutoTitle(boolean autoTitle) {
this.autoTitle = autoTitle;
}
/**
* 是否忽略空白段落
*
* @param ignore
*/
public void ignoreBlankPara(boolean ignore) {
this.ignoreBlankPara = ignore;
}
private int getTitleIndex(Range range) {
int index = 0;
int maxFontSize = 0;
boolean center = false;
// 从头 3 段中找标题
int nParas = Math.min(3, range.numParagraphs());
for (int i = 0; i < nParas; i++) {
Paragraph para = range.getParagraph(i);
// 找到这一段中最大的字体
int fontSize = 0;
for (int j = 0; j < para.numCharacterRuns(); j++) {
CharacterRun run = para.getCharacterRun(j);
fontSize = Math.max(fontSize, run.getFontSize());
}
// 如果字体比之前的都大,则认为是标题
if (fontSize > maxFontSize) {
index = i;
maxFontSize = fontSize;
} else if (fontSize == maxFontSize) {
if (!center && para.getJustification() == 1) {
index = i;
center = true;
}
}
}
return index;
}
private void appendParaAttrs(StringBuilder builder, Paragraph para) {
switch (para.getJustification()) {
case 1:
builder.append(" align=\"center\"");
break;
case 2:
builder.append(" align=\"right\"");
break;
case 3: // left 对齐是默认的,不写入模板中
break;
default:
break;
}
}
private void appendRunAttrs(StringBuilder builder,
CharacterRun run, boolean isSpan) {
StringBuilder style = new StringBuilder();
if (isSpan) {
if (run.isBold()) {
style.append("bold");
}
if (run.isItalic()) {
if (style.length() > 0) {
style.append(",");
}
style.append("italic");
}
if (run.getUnderlineCode() == 1) {
if (style.length() > 0) {
style.append(",");
}
style.append("underline");
}
if (style.length() > 0) {
builder.append(" font-style=\"").append(style).append("\"");
}
}
builder.append(" font-size=\"");
builder.append(run.getFontSize() / 2);
builder.append("\"");
}
private String textEscape(String text) {
StringBuilder builder = new StringBuilder();
for (int i = 0; i < text.length(); i++) {
String escape = Util.escapeXmlChars(text.charAt(i));
if (escape != null) {
builder.append(escape);
} else {
builder.append(text.charAt(i));
}
}
return builder.toString();
}
private void readCharacterRuns(Paragraph para, int paraIndex,
StringBuilder builder, boolean isTitle) {
StringBuilder allText = null;
if (isTitle && jsonObject != null) {
allText = new StringBuilder();
}
for (int j = 0; j < para.numCharacterRuns(); j++) {
CharacterRun run = para.getCharacterRun(j);
String text = run.text().replaceAll("[\u0000-\u001f]", "");
System.out.println("run text: " + text + " >i=" + paraIndex);
// System.out.println("vanished: " + run.isVanished());
// System.out.println("special: " + run.isSpecialCharacter());
// 忽略特殊字符
if (run.isSpecialCharacter()) {
continue;
}
// 忽略级链接
if (text.matches(" HYPERLINK .+") ||
text.matches("HYPERLINK .+") ||
text.matches(" PAGEREF .+") ||
text.matches(" TOC .+")) {
continue;
}
// \u3000: IDEOGRAPHIC SPACE
if (text.matches("^[\\s\u3000]+$")) {
if (run.getUnderlineCode() == 1) {
String vid = "vid_" + paraIndex + "_" + j;
builder.append(" \n");
if (jsonData != null) {
jsonData.put(vid, "");
}
} else {
builder.append(" \n");
}
} else if (text.matches("^_+$")) {
String vid = "vid_" + paraIndex + "_" + j;
builder.append(" \n");
if (jsonData != null) {
jsonData.put(vid, "");
}
} else if (text.length() > 0) {
builder.append(" ");
builder.append(textEscape(text));
builder.append("\n");
if (allText != null) {
allText.append(text);
}
}
}
if (isTitle && jsonObject != null) {
jsonObject.put("title", allText != null ? allText.toString() : "");
}
}
/**
* 转换 .doc 文件
*
* @param docStream .doc 数据流
* @param xmlStream .xml 输出流,用于保存转换后结果
*/
public void read(InputStream docStream, OutputStream xmlStream,
OutputStream jsonStream)
throws IOException {
if (docStream == null || xmlStream == null) {
System.err.println("Invalid argument");
return;
}
if (jsonStream != null) {
jsonObject = new HashMap<>();
jsonData = new HashMap<>();
}
HWPFDocument document = new HWPFDocument(docStream);
Range range = document.getRange();
StringBuilder builder = new StringBuilder();
builder.append("\n");
if (xslUrl != null) {
builder.append("\n");
}
builder.append("\n\n");
builder.append("\n\n");
xmlStream.write(builder.toString().getBytes(StandardCharsets.UTF_8));
builder.setLength(0);
int titleIndex = 0;
if (autoTitle) {
titleIndex = getTitleIndex(range);
}
Table table = null;
for (int i = 0; i < range.numParagraphs(); i++) {
Paragraph para = range.getParagraph(i);
boolean isTitle = false;
if (para.pageBreakBefore()) { // 换页符
builder.append(" \n");
}
if (para.isInTable()) { // 表格
if (table == null) {
table = range.getTable(para);
int maxCells = 0;
for (int m = 0; m < table.numRows(); m++) {
TableRow row = table.getRow(m);
maxCells = Math.max(maxCells, row.numCells());
}
StringBuilder columns = new StringBuilder();
columns.append("1");
for (int n = 1; n <= maxCells; n++) {
if (n == maxCells) {
columns.append(",0");
} else {
columns.append(",1");
}
}
builder.append(" \n");
}
String text = para.text().replaceAll("[\u0000-\u001f]", "");
builder.append(" ");
builder.append(textEscape(text));
builder.append(" | \n");
continue;
} else {
if (table != null) {
builder.append("
\n");
table = null;
}
}
if (ignoreBlankPara && para.numCharacterRuns() == 0) {
continue;
}
if (autoTitle && i == titleIndex) {
builder.append(" \n");
readCharacterRuns(para, i, builder, isTitle);
if (autoTitle && i == titleIndex) {
builder.append(" \n");
} else {
builder.append(" \n");
}
xmlStream.write(builder.toString().getBytes(StandardCharsets.UTF_8));
builder.setLength(0);
}
xmlStream.write(" \n".getBytes());
// 输出 JSON 数据模板
if (jsonStream != null) {
jsonObject.put("data", jsonData);
String jsonString = JSONObject.toJSONString(jsonObject);
jsonStream.write(jsonString.getBytes(StandardCharsets.UTF_8));
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy