
// com.day.cq.wcm.offline.DocImporter Maven / Gradle / Ivy
/*************************************************************************
*
* ADOBE CONFIDENTIAL
* ___________________
*
* Copyright 2013 Adobe Systems Incorporated
* All Rights Reserved.
*
* NOTICE: All information contained herein is, and remains
* the property of Adobe Systems Incorporated and its suppliers,
* if any. The intellectual and technical concepts contained
* herein are proprietary to Adobe Systems Incorporated and its
* suppliers and are protected by trade secret or copyright law.
* Dissemination of this information or reproduction of this material
* is strictly forbidden unless prior written permission is obtained
* from Adobe Systems Incorporated.
**************************************************************************/
package com.day.cq.wcm.offline;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.StyleDescription;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
/**
 * An implementation of {@link TextDocumentImporter} for "classic" MS Word
 * documents ("doc").
 *
 * <p>The document is parsed once in the constructor; paragraphs are converted
 * to plain text and to a small HTML fragment on demand via
 * {@link #getParagraph(int)}.
 */
public class DocImporter implements TextDocumentImporter {

    /** Paragraph justification code that is rendered as {@code text-align: center;}. */
    private static final int JUSTIFICATION_CENTER = 1;

    /** Paragraph justification code that is rendered as {@code text-align: right;}. */
    private static final int JUSTIFICATION_RIGHT = 2;

    /** Sub/superscript index rendered as superscript. */
    private static final int SCRIPT_SUPER = 1;

    /** Sub/superscript index rendered as subscript. */
    private static final int SCRIPT_SUB = 2;

    private final HWPFDocument document;
    private final Range range;

    /**
     * Parses a Word "doc" document from the given stream.
     *
     * <p>The stream is not closed by this constructor.
     *
     * @param stream the binary Word document
     * @throws TextImportException if the stream contains an Office Open XML
     *         ("docx") file or cannot be read
     */
    public DocImporter(InputStream stream) throws TextImportException {
        try {
            this.document = new HWPFDocument(stream);
            this.range = document.getRange();
        } catch (OfficeXmlFileException ex) {
            throw new TextImportException("this is a docx file", ex);
        } catch (IOException ex) {
            throw new TextImportException(ex.getMessage(), ex);
        }
    }

    /**
     * @return trimmed contents of the first non-empty paragraph, or
     *         {@code null} if every paragraph is empty
     */
    public String getTitle() {
        final int count = this.range.numParagraphs();
        for (int i = 0; i < count; i++) {
            String t = this.range.getParagraph(i).text().trim();
            if (t.length() > 0) {
                return t;
            }
        }
        return null;
    }

    /**
     * @return the number of paragraphs in the document range
     */
    public int getNumberOfParagraphs() {
        return this.range.numParagraphs();
    }

    /**
     * Converts the paragraph at the given position.
     *
     * @param index zero-based paragraph index
     * @return a snapshot of the paragraph's text, HTML and pictures
     */
    public Paragraph getParagraph(int index) {
        return new DocParagraph(this.range.getParagraph(index));
    }

    /**
     * Lists the inline HTML element names that apply to a character run, in
     * the order in which they have to be opened (and, reversed, closed).
     *
     * <p>NOTE(review): the element names are the conventional HTML inline
     * elements; confirm that {@code s} (rather than {@code strike} or
     * {@code del}) is what consumers of the generated markup expect.
     *
     * @param run the character run to inspect
     * @return element names, outermost first; empty if the run is unformatted
     */
    private static List<String> inlineTagsFor(CharacterRun run) {
        List<String> tags = new ArrayList<String>();
        if (run.isBold()) {
            tags.add("b");
        }
        if (run.isItalic()) {
            tags.add("i");
        }
        if (run.isStrikeThrough()) {
            tags.add("s");
        }
        if (run.getUnderlineCode() != 0) {
            tags.add("u");
        }
        // NOTE(review): 1 = superscript, 2 = subscript per the POI
        // documentation of the "iss" property - confirm.
        int script = run.getSubSuperScriptIndex();
        if (script == SCRIPT_SUPER) {
            tags.add("sup");
        }
        if (script == SCRIPT_SUB) {
            tags.add("sub");
        }
        return tags;
    }

    /**
     * Paragraph view that eagerly computes plain text, HTML and the list of
     * embedded pictures from a POI paragraph.
     */
    private class DocParagraph implements Paragraph {

        private final org.apache.poi.hwpf.usermodel.Paragraph p;
        private final String text;
        private final String textHTML;
        private final List<Picture> pictures;

        /**
         * @param paragraph the POI paragraph to convert
         */
        public DocParagraph(org.apache.poi.hwpf.usermodel.Paragraph paragraph) {
            this.p = paragraph;

            // Derive the container element / CSS class from the paragraph style.
            String container = "p";
            String classname = null;
            StyleDescription sd = document.getStyleSheet().getStyleDescription(p.getStyleIndex());
            // Guard against a missing style description instead of failing
            // with a NullPointerException; such paragraphs become a plain <p>.
            if (sd != null) {
                classname = WordStyleSupport.makeClassName(sd.getName());
                String elemname = WordStyleSupport.toHtmlElement(classname);
                if (elemname != null) {
                    // The style maps to a dedicated element, so no class is needed.
                    container = elemname;
                    classname = null;
                }
            }

            StringBuilder sbtext = new StringBuilder();
            StringBuilder sbhtml = new StringBuilder();
            List<Picture> pics = new ArrayList<Picture>();

            for (int i = 0; i < this.p.numCharacterRuns(); i++) {
                CharacterRun characters = this.p.getCharacterRun(i);
                if (document.getPicturesTable().hasPicture(characters)) {
                    pics.add(new DocPicture(document.getPicturesTable().extractPicture(characters, true)));
                    continue;
                }

                String contents = characters.text();
                // Whitespace-only runs are emitted without formatting markup.
                List<String> tags = contents.trim().length() == 0
                        ? Collections.<String>emptyList()
                        : inlineTagsFor(characters);

                for (int t = 0; t < tags.size(); t++) {
                    sbhtml.append('<').append(tags.get(t)).append('>');
                }
                sbtext.append(contents);
                sbhtml.append(HtmlUtil.escapeHtmlText(contents));
                // Close in reverse order so that the elements nest properly.
                for (int t = tags.size() - 1; t >= 0; t--) {
                    sbhtml.append("</").append(tags.get(t)).append('>');
                }
            }

            String inner = sbhtml.toString().trim();
            this.text = sbtext.toString().trim();
            // An empty paragraph yields an empty HTML string, not an empty element.
            this.textHTML = inner.length() > 0 ? wrapInContainer(inner, container, classname) : inner;
            this.pictures = Collections.unmodifiableList(pics);
        }

        /**
         * Wraps already escaped inline HTML in the paragraph's container
         * element, adding alignment and class attributes where applicable.
         *
         * @param inner     escaped inline HTML, non-empty
         * @param container element name of the container
         * @param classname CSS class to set, or {@code null} for none
         * @return the complete element, including its closing tag
         */
        private String wrapInContainer(String inner, String container, String classname) {
            StringBuilder tmp = new StringBuilder();
            tmp.append("<").append(container);

            String style = null;
            int justification = this.p.getJustification();
            if (justification == JUSTIFICATION_RIGHT) {
                style = "text-align: right;";
            } else if (justification == JUSTIFICATION_CENTER) {
                style = "text-align: center;";
            }
            if (style != null) {
                tmp.append(" style='").append(style).append("'");
            }
            if (classname != null) {
                tmp.append(" class='").append(HtmlUtil.escapeHtmlAttr(classname)).append("'");
            }
            tmp.append(">");
            tmp.append(inner);
            tmp.append("</").append(container).append(">");
            return tmp.toString();
        }

        /**
         * @return the trimmed plain text of the paragraph
         */
        public String getText() {
            return this.text;
        }

        /**
         * @return the paragraph as an HTML element, or an empty string if the
         *         paragraph has no visible content
         */
        public String getHTML() {
            return this.textHTML;
        }

        /**
         * @return unmodifiable list of the pictures found in the paragraph
         */
        public List<Picture> getPictures() {
            return this.pictures;
        }
    }

    /**
     * Adapter exposing a POI picture through the importer's {@link Picture}
     * interface. Static because it needs no state of the enclosing importer.
     */
    private static class DocPicture implements Picture {

        private final org.apache.poi.hwpf.usermodel.Picture picture;

        /**
         * @param picture the POI picture to wrap
         */
        public DocPicture(org.apache.poi.hwpf.usermodel.Picture picture) {
            this.picture = picture;
        }

        /**
         * @return the MIME type reported by POI for the picture
         */
        public String getMediaType() {
            return this.picture.getMimeType();
        }

        /**
         * @return the picture content as returned by POI
         */
        public byte[] getBytes() {
            return this.picture.getContent();
        }

        @Override
        public String toString() {
            return getMediaType() + " (" + getBytes().length + " bytes)";
        }
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy