// com.day.cq.dam.word.extraction.WordExtractionHandler Maven / Gradle / Ivy
/*************************************************************************
*
* ADOBE CONFIDENTIAL
* ___________________
*
* Copyright 2012 Adobe Systems Incorporated
* All Rights Reserved.
*
* NOTICE: All information contained herein is, and remains
* the property of Adobe Systems Incorporated and its suppliers,
* if any. The intellectual and technical concepts contained
* herein are proprietary to Adobe Systems Incorporated and its
* suppliers and are protected by trade secret or copyright law.
* Dissemination of this information or reproduction of this material
* is strictly forbidden unless prior written permission is obtained
* from Adobe Systems Incorporated.
**************************************************************************/
package com.day.cq.dam.word.extraction;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Service;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.openxml4j.exceptions.OLE2NotOfficeXmlFileException;
import org.apache.poi.util.IOUtils;
import org.apache.poi.xwpf.usermodel.BodyElementType;
import org.apache.poi.xwpf.usermodel.VerticalAlign;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFPicture;
import org.apache.poi.xwpf.usermodel.XWPFPictureData;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.sling.api.resource.ResourceResolver;
import org.apache.sling.api.wrappers.ValueMapDecorator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.day.cq.dam.api.Asset;
import com.day.cq.dam.api.Rendition;
import com.day.cq.dam.api.RenditionPicker;
import com.day.cq.dam.indd.AbstractPageExtractionHandler;
import com.day.cq.dam.indd.PageBuilder;
import com.day.cq.dam.indd.PageComponent;
import com.day.cq.dam.indd.PageExtractionException;
import com.day.cq.dam.indd.PageExtractionHandler;
import com.day.cq.wcm.api.Page;
import com.day.cq.wcm.api.WCMException;
/**
* A {@link PageExtractionHandler} for Word's DOCX and DOC format.
*
* An extraction handler that extracts a page from Word .docx and .doc files.
*
* @see PageExtractionHandler
*/
@Component
@Service(value = PageExtractionHandler.class)
@Properties({
@Property(name = PageExtractionHandler.SERVICE_PROPERTY_LABEL, value = "Word Extraction Handler", propertyPrivate = true),
@Property(name = PageExtractionHandler.SERVICE_PROPERTY_DESCRIPTION, value = "Extraction Handler for a MS Word files.", propertyPrivate = true)
})
public class WordExtractionHandler extends AbstractPageExtractionHandler {
private static final Logger log = LoggerFactory.getLogger(WordExtractionHandler.class);
private static final String TEXT_COMPONENT = "foundation/components/text";
private static final String IMAGE_COMPONENT = "foundation/components/image";
private static final String TEXT_IMAGE_COMPONENT = "foundation/components/textimage";
private static final int JUSTIFICATION_RIGHT = 2;
private static final int JUSTIFICATION_CENTER = 1;
private static final short TEXT_TYPE = 0;
private static final short IMAGE_TYPE = 1;
private static final short TEXT_IMAGE_TYPE = 2;
/**
 * Get an MS Word targeted rendition picker.
 *
 * The returned picker yields the asset's "original" rendition when the asset's MIME type matches
 * {@code application.*msword} or equals the OOXML word-processing MIME type. It yields {@code null}
 * for a {@code null} asset, for an asset whose MIME type is {@code null}, and for any other MIME type.
 *
 * {@inheritDoc}
 */
public RenditionPicker getRenditionPicker() {
    return new RenditionPicker() {
        public Rendition getRendition(Asset asset) {
            if (asset == null) {
                return null;
            }
            final String assetMime = asset.getMimeType();
            // Guard against a missing MIME type: without it the matches() call below would throw an NPE.
            if (assetMime == null) {
                return null;
            }
            final boolean isDoc = assetMime.matches("application.*msword");
            final boolean isDocx = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
                    .equals(assetMime);
            if (isDoc || isDocx) {
                return asset.getRendition("original");
            }
            return null;
        }
    };
}
/**
 * Dispatches to the .docx or .doc extractor according to the MIME type of the rendition's asset.
 *
 * {@inheritDoc}
 *
 * @throws PageExtractionException if the MIME type is missing or matches neither supported Word format,
 *         or if the selected extractor fails
 */
public Page extractPage(Rendition rend, String pageRoot, String pageName, String pageTitle, String pageTemplate, String pageDesign)
        throws PageExtractionException {
    final String rendMime = rend.getAsset().getMimeType();
    // Constant-first equals and an explicit null check keep a missing MIME type from raising an NPE;
    // it falls through to the "no extractor" failure below instead.
    if ("application/vnd.openxmlformats-officedocument.wordprocessingml.document".equals(rendMime)) {
        return extractDocx(rend, pageRoot, pageName, pageTitle, pageTemplate, pageDesign);
    }
    if (rendMime != null && rendMime.matches("application.*msword")) {
        return extractDoc(rend, pageRoot, pageName, pageTitle, pageTemplate, pageDesign);
    }
    final String msg = "No appropriate extractor found for: " + rend.getAsset().getName();
    log.info(msg);
    throw new PageExtractionException(msg);
}
/**
 * Extract the text, image, and textimage components from a Word .doc file and create a CQ Page that contains
 * them plus a title.
 *
 * Each paragraph of the document is classified via {@code getParagraphType} and converted into one or more
 * components that are appended to a "par" component; the page itself is then created by {@code buildPage}.
 *
 * @param rend rendition whose asset's original binary is parsed as a .doc document
 * @param pageRoot passed through to {@code buildPage}
 * @param pageName passed through to {@code buildPage}
 * @param pageTitle passed through to {@code buildPage}
 * @param pageTemplate passed through to {@code buildPage}
 * @param pageDesign passed through to {@code buildPage}
 * @return the page returned by {@code buildPage}, or {@code null} when an
 *         {@link OLE2NotOfficeXmlFileException} was caught and logged
 * @throws PageExtractionException wrapping any other failure raised during extraction
 */
private Page extractDoc(Rendition rend, String pageRoot, String pageName, String pageTitle, String pageTemplate, String pageDesign)
        throws PageExtractionException {
    log.info("Beginning page extraction from: {}", rend.getAsset().getPath());
    int textIndex = 0;
    int textImageIndex = 0;
    int imageIndex = 0;
    final ResourceResolver resourceResolver = rend.getResourceResolver();
    final PageBuilder pageBuilder = getPageBuilder(resourceResolver);
    PageComponent par = null;
    final String subAssetPath = rend.getAsset().getPath() + "/subassets";
    log.debug("Images in the page will point to the appropriate subassets located at: {}", subAssetPath);
    InputStream is = null;
    try {
        is = rend.getAsset().getOriginal().getStream();
        final HWPFDocument doc = new HWPFDocument(is);
        // NOTE(review): when pageBuilder is null the paragraphs are skipped but buildPage is still
        // invoked with the null builder; confirm whether getPageBuilder can actually return null.
        if (pageBuilder != null) {
            par = createParComponent(pageBuilder);
            final Paragraph[] paragraphs = getParagraphs(doc.getRange());
            log.debug("Found {} paragraphs in the document.", paragraphs.length);
            for (int i = 0; i < paragraphs.length; i++) {
                final Paragraph paragraph = paragraphs[i];
                final short type = getParagraphType(paragraph, doc);
                final String nodeName;
                switch (type) {
                    default:
                        log.debug("Unknown paragraph type, will treat it as text only");
                        // falls through
                    case TEXT_TYPE: {
                        log.debug("Paragraph {} contains only text.", i);
                        nodeName = "text_" + textIndex++;
                        final PageComponent comp = createTextNode(paragraph, pageBuilder, nodeName);
                        if (comp != null) {
                            par.getChildComponents().add(comp);
                        }
                        break;
                    }
                    case IMAGE_TYPE: {
                        log.debug("Paragraph {} contains only images.", i);
                        nodeName = "image_" + imageIndex++;
                        final List<PageComponent> imageComps = createImageNode(paragraph, pageBuilder, nodeName,
                                subAssetPath, doc.getPicturesTable());
                        if (imageComps != null && !imageComps.isEmpty()) {
                            par.getChildComponents().addAll(imageComps);
                        }
                        break;
                    }
                    case TEXT_IMAGE_TYPE: {
                        log.debug("Paragraph {} contains both text and images.", i);
                        nodeName = "textImage_" + textImageIndex++;
                        final List<PageComponent> textImageComps = createTextImageNode(paragraph, pageBuilder,
                                nodeName, subAssetPath, doc.getPicturesTable());
                        if (textImageComps != null && !textImageComps.isEmpty()) {
                            par.getChildComponents().addAll(textImageComps);
                        }
                        break;
                    }
                }
            }
        }
        return buildPage(rend, pageRoot, pageName, pageTitle, pageTemplate,
                pageDesign, pageBuilder, par);
    } catch (OLE2NotOfficeXmlFileException oe) {
        // Logged rather than rethrown: the method then returns null.
        log.error("Error while page extraction from : " + rend.getAsset().getPath(), oe);
    } catch (Throwable e) {
        throw new PageExtractionException(e.getMessage(), e);
    } finally {
        IOUtils.closeQuietly(is);
    }
    return null;
}
/**
 * Extract the text, image, and textimage components from a Word .docx file and create a CQ Page that contains
 * them plus a title.
 *
 * Empty paragraphs and page-break paragraphs are skipped. Every other paragraph is classified via
 * {@code getParagraphType} and converted into one or more components that are appended to a "par" component;
 * the page itself is then created by {@code buildPage}.
 *
 * @param rend rendition whose asset's original binary is parsed as a .docx document
 * @param pageRoot passed through to {@code buildPage}
 * @param pageName passed through to {@code buildPage}
 * @param pageTitle passed through to {@code buildPage}
 * @param pageTemplate passed through to {@code buildPage}
 * @param pageDesign passed through to {@code buildPage}
 * @return the page returned by {@code buildPage}, or {@code null} when an
 *         {@link OLE2NotOfficeXmlFileException} was caught and logged
 * @throws PageExtractionException wrapping any other failure raised during extraction
 */
private Page extractDocx(Rendition rend, String pageRoot, String pageName, String pageTitle, String pageTemplate, String pageDesign)
        throws PageExtractionException {
    log.info("Beginning page extraction from: {}", rend.getAsset().getPath());
    int textIndex = 0;
    int textImageIndex = 0;
    int imageIndex = 0;
    final ResourceResolver resourceResolver = rend.getResourceResolver();
    final PageBuilder pageBuilder = getPageBuilder(resourceResolver);
    PageComponent par = null;
    final String subAssetPath = rend.getAsset().getPath() + "/subassets";
    log.debug("Images in the page will point to the appropriate subassets located at: {}", subAssetPath);
    InputStream is = null;
    try {
        is = rend.getAsset().getOriginal().getStream();
        final XWPFDocument docx = new XWPFDocument(is);
        // NOTE(review): when pageBuilder is null the paragraphs are skipped but buildPage is still
        // invoked with the null builder; confirm whether getPageBuilder can actually return null.
        if (pageBuilder != null) {
            par = createParComponent(pageBuilder);
            final Iterator<XWPFParagraph> paraIter = docx.getParagraphsIterator();
            while (paraIter.hasNext()) {
                final XWPFParagraph para = paraIter.next();
                if (para.isEmpty()) {
                    log.debug("Empty paragraph found, ignoring.");
                    continue;
                }
                // Bug in the Apache poi-ooxml library: isPageBreak() can throw an NPE. It was fixed via commit
                // https://svn.apache.org/viewvc?view=revision&revision=1795254 and is included in POI 3.17 beta 1.
                // Remove this handling when the POI library is updated.
                try {
                    if (para.isPageBreak()) {
                        log.debug("Page break paragraph found, ignoring.");
                        continue;
                    }
                } catch (NullPointerException ignored) {
                    // Do not consider it a page break
                }
                final short paraType = getParagraphType(para);
                final String nodeName;
                switch (paraType) {
                    case TEXT_TYPE: {
                        log.debug("Current paragraph contains only text.");
                        nodeName = "text_" + textIndex++;
                        final PageComponent comp = createTextNode(para, pageBuilder, nodeName);
                        if (comp != null) {
                            par.getChildComponents().add(comp);
                        }
                        break;
                    }
                    case IMAGE_TYPE: {
                        log.debug("Current paragraph contains only images.");
                        nodeName = "image_" + imageIndex++;
                        final List<PageComponent> imageComps =
                                createImageNode(para, pageBuilder, nodeName, subAssetPath);
                        if (imageComps != null && !imageComps.isEmpty()) {
                            par.getChildComponents().addAll(imageComps);
                        }
                        break;
                    }
                    case TEXT_IMAGE_TYPE: {
                        log.debug("Current paragraph contains both text and images.");
                        nodeName = "textImage_" + textImageIndex++;
                        final List<PageComponent> textImageComps =
                                createTextImageNode(para, pageBuilder, nodeName, subAssetPath);
                        if (textImageComps != null && !textImageComps.isEmpty()) {
                            par.getChildComponents().addAll(textImageComps);
                        }
                        break;
                    }
                    default:
                        // Any other type value produces no component.
                        break;
                }
            }
        }
        return buildPage(rend, pageRoot, pageName, pageTitle, pageTemplate,
                pageDesign, pageBuilder, par);
    } catch (OLE2NotOfficeXmlFileException oe) {
        // Logged rather than rethrown: the method then returns null.
        log.error("Error while page extraction from : " + rend.getAsset().getPath(), oe);
    } catch (Throwable e) {
        throw new PageExtractionException(e.getMessage(), e);
    } finally {
        IOUtils.closeQuietly(is);
    }
    return null;
}
/**
 * Creates the page from the collected "par" component plus a title component.
 *
 * The title is taken from the asset's {@code dc:title} metadata when that value is non-blank; otherwise the
 * supplied {@code pageTitle} is used with a creation timestamp appended.
 *
 * @param rend rendition whose asset supplies the {@code dc:title} metadata
 * @param pageRoot passed to {@code PageBuilder.recreatePage}
 * @param pageName passed to {@code PageBuilder.recreatePage}
 * @param pageTitle fallback title used when the asset has no usable {@code dc:title}
 * @param pageTemplate passed to {@code PageBuilder.recreatePage}
 * @param pageDesign passed to {@code PageBuilder.recreatePage}
 * @param pageBuilder builder used to create the title component and the page
 * @param par component holding the extracted paragraph components; added first to the page's component list
 * @return the page returned by {@code PageBuilder.recreatePage}
 * @throws WCMException declared for failures of the page (re)creation
 */
private Page buildPage(Rendition rend, String pageRoot, String pageName,
        String pageTitle, String pageTemplate, String pageDesign,
        PageBuilder pageBuilder, PageComponent par) throws WCMException {
    final List<PageComponent> pageComponents = new ArrayList<PageComponent>();
    pageComponents.add(par);
    final Asset asset = rend.getAsset();
    final String title = asset.getMetadataValue("dc:title");
    if (title != null && !title.trim().isEmpty()) {
        log.debug("Title found in the document metadata: {}", title);
        pageTitle = title;
    } else {
        pageTitle = pageTitle + " (created: " + new Date() + ")";
        log.debug("No title found in the document metadata. Using: {}", pageTitle);
    }
    pageComponents.add(createTitleComponent(pageBuilder, pageTitle));
    final Page page = pageBuilder.recreatePage(pageRoot, pageName, pageTitle, pageTemplate, pageDesign, pageComponents);
    log.info("Page extraction from: {} has completed successfully.", asset.getPath());
    return page;
}
/**
 * Creates a {@code foundation/components/title} component whose {@code jcr:title} is the given page title
 * and whose {@code type} is {@code extralarge}.
 *
 * @param pageBuilder builder used to create the component
 * @param pageTitle value stored as {@code jcr:title}
 * @return the component returned by the page builder
 */
@Override
protected PageComponent createTitleComponent(PageBuilder pageBuilder, String pageTitle) {
    final Map<String, Object> properties = new HashMap<String, Object>();
    properties.put("jcr:title", pageTitle);
    properties.put("type", "extralarge");
    return pageBuilder.createComponent("foundation/components/title", new ValueMapDecorator(properties));
}
/**
 * Given the paragraph from a Word .doc file, 1 textimage component is created. Any additional images in the
 * paragraph, beyond the first, are created as image components. If no picture is found in the paragraph a
 * plain text component is created instead.
 *
 * @param para paragraph whose character runs are scanned for pictures and whose text is rendered
 * @param pageBuilder builder used to create the components
 * @param name node name of the textimage (or text) component
 * @param subAssetPath path prefix under which the referenced picture sub-assets are expected
 * @param picTable pictures table used to detect and extract pictures from the character runs
 * @return A list of the components to be added to the Page.
 */
private List<PageComponent> createTextImageNode(Paragraph para, PageBuilder pageBuilder, String name, String subAssetPath,
        PicturesTable picTable) {
    /*
     * In theory there can be any number of text and image combinations in a paragraph. So just pair the text with
     * the first image found. Additional images will be images on their own. The author can always make manual
     * changes in the page later if this doesn't provide the desired results.
     */
    int imageCount = 1;
    final List<PageComponent> imageList = new ArrayList<PageComponent>();
    final int runCount = para.numCharacterRuns();
    for (int i = 0; i < runCount; i++) {
        final CharacterRun run = para.getCharacterRun(i);
        if (!picTable.hasPicture(run)) {
            continue;
        }
        final Picture pic = picTable.extractPicture(run, false);
        final Map<String, Object> imageProps = new HashMap<String, Object>();
        imageProps.put("fileReference", subAssetPath + "/" + pic.suggestFullFileName());
        // The first image must be named "image": that is the child name the textimage component expects.
        final String imageName = imageList.isEmpty() ? "image" : "image_" + imageCount++;
        imageList.add(pageBuilder.createComponent(IMAGE_COMPONENT, new ValueMapDecorator(imageProps), imageName));
    }
    log.debug("Found {} images.", imageList.size());
    final Map<String, Object> props = new HashMap<String, Object>();
    props.put("textIsRich", true);
    props.put("text", renderParagraph(para));
    final List<PageComponent> comps = new ArrayList<PageComponent>();
    if (imageList.isEmpty()) {
        comps.add(pageBuilder.createComponent(TEXT_COMPONENT, new ValueMapDecorator(props), name));
        return comps;
    }
    final PageComponent textImageComp = pageBuilder.createComponent(TEXT_IMAGE_COMPONENT, new ValueMapDecorator(props), name);
    textImageComp.getChildComponents().add(imageList.get(0));
    comps.add(textImageComp);
    // Remaining pictures become stand-alone image components after the textimage component.
    comps.addAll(imageList.subList(1, imageList.size()));
    return comps;
}
/**
 * Given the paragraph from a Word .docx file, 1 textimage component is created. Any additional images in the
 * paragraph, beyond the first, are created as image components.
 *
 * @param para paragraph whose runs are scanned for embedded pictures and whose text is rendered
 * @param pageBuilder builder used to create the components
 * @param name node name of the textimage component
 * @param subAssetPath path prefix under which the referenced picture sub-assets are expected
 * @return A list of the components to be added to the Page.
 */
private List<PageComponent> createTextImageNode(XWPFParagraph para, PageBuilder pageBuilder, String name, String subAssetPath) {
    /*
     * In theory there can be any number of text and image combinations in a paragraph. So just pair the text with
     * the first image found. Additional images will be images on their own. The author can always make manual
     * changes in the page later if this doesn't provide the desired results.
     */
    int imageCount = 1;
    final List<PageComponent> imageList = new ArrayList<PageComponent>();
    for (XWPFRun run : para.getRuns()) {
        for (XWPFPicture picture : run.getEmbeddedPictures()) {
            final XWPFPictureData pic = picture.getPictureData();
            // A fresh map per picture: sharing one map across the pictures of a run would make every
            // wrapping ValueMapDecorator see the fileReference of the last picture.
            final Map<String, Object> propsImage = new HashMap<String, Object>();
            propsImage.put("fileReference", subAssetPath + "/" + pic.getFileName());
            // The first image must be named "image" as that's what the textimage component expects.
            final String imageName = imageList.isEmpty() ? "image" : "image_" + imageCount++;
            imageList.add(pageBuilder.createComponent(IMAGE_COMPONENT, new ValueMapDecorator(propsImage), imageName));
        }
    }
    final String text = renderParagraph(para);
    log.debug("Found {} images.", imageList.size());
    final Map<String, Object> props = new HashMap<String, Object>();
    props.put("textIsRich", true);
    if (text != null) {
        final String trimmed = text.trim();
        if (!trimmed.isEmpty() && !trimmed.equalsIgnoreCase("null")) {
            props.put("text", text);
        }
    }
    final List<PageComponent> components = new ArrayList<PageComponent>();
    final PageComponent textComp = pageBuilder.createComponent(TEXT_IMAGE_COMPONENT, new ValueMapDecorator(props), name);
    if (!imageList.isEmpty()) {
        textComp.getChildComponents().add(imageList.get(0));
    }
    components.add(textComp);
    // Pictures beyond the first follow the textimage component as stand-alone image components.
    for (int i = 1; i < imageList.size(); i++) {
        components.add(imageList.get(i));
    }
    return components;
}
/**
 * Given the paragraph from a Word .doc, image components are created for each image found in the paragraph.
 *
 * @param para paragraph whose character runs are scanned for pictures
 * @param pageBuilder builder used to create the components
 * @param name node name prefix; each component is named {@code name_<index>} with the index starting at 0
 * @param subAssetPath path prefix under which the referenced picture sub-assets are expected
 * @param picTable pictures table used to detect and extract pictures from the character runs
 * @return A list of components to be added to the page.
 */
private List<PageComponent> createImageNode(Paragraph para, PageBuilder pageBuilder, String name, String subAssetPath,
        PicturesTable picTable) {
    int count = 0;
    final List<PageComponent> comps = new ArrayList<PageComponent>();
    final int runCount = para.numCharacterRuns();
    for (int i = 0; i < runCount; i++) {
        final CharacterRun chars = para.getCharacterRun(i);
        if (!picTable.hasPicture(chars)) {
            continue;
        }
        final Picture pic = picTable.extractPicture(chars, false);
        final Map<String, Object> props = new HashMap<String, Object>();
        props.put("fileReference", subAssetPath + "/" + pic.suggestFullFileName());
        comps.add(pageBuilder.createComponent(IMAGE_COMPONENT, new ValueMapDecorator(props), name + "_" + count++));
    }
    log.debug("Created {} image components.", comps.size());
    return comps;
}
/**
 * Given the paragraph from a Word .docx, image components are created for each image found in the paragraph.
 *
 * @param para paragraph whose runs are scanned for embedded pictures
 * @param pageBuilder builder used to create the components
 * @param name node name prefix; each component is named {@code name_<index>} with the index starting at 0
 * @param subAssetPath path prefix under which the referenced picture sub-assets are expected
 * @return A list of components to be added to the page.
 */
private List<PageComponent> createImageNode(XWPFParagraph para, PageBuilder pageBuilder, String name, String subAssetPath) {
    int count = 0;
    final List<PageComponent> comps = new ArrayList<PageComponent>();
    for (XWPFRun run : para.getRuns()) {
        for (XWPFPicture picture : run.getEmbeddedPictures()) {
            final XWPFPictureData pic = picture.getPictureData();
            final Map<String, Object> properties = new HashMap<String, Object>();
            properties.put("fileReference", subAssetPath + "/" + pic.getFileName());
            comps.add(pageBuilder.createComponent(IMAGE_COMPONENT, new ValueMapDecorator(properties),
                    name + "_" + count++));
        }
    }
    log.debug("Created {} image components.", comps.size());
    return comps;
}
/**
 * Given the paragraph from a Word .doc, create a text component to encompass the paragraph text.
 *
 * @param para paragraph whose rendered markup becomes the component's {@code text} property
 * @param pageBuilder builder used to create the component
 * @param name node name of the component
 * @return The text component to be added to the page.
 */
private PageComponent createTextNode(Paragraph para, PageBuilder pageBuilder, String name) {
    final Map<String, Object> props = new HashMap<String, Object>();
    props.put("textIsRich", true);
    props.put("text", renderParagraph(para));
    return pageBuilder.createComponent(TEXT_COMPONENT, new ValueMapDecorator(props), name);
}
/**
 * Given the paragraph from a Word .docx, create a text component to encompass the paragraph text.
 *
 * @param para paragraph whose rendered markup becomes the component's {@code text} property
 * @param pageBuilder builder used to create the component
 * @param name node name of the component
 * @return The text component to be added to the page, or {@code null} when the element is not of type
 *         PARAGRAPH or its rendered text is blank or the literal "null".
 */
private PageComponent createTextNode(XWPFParagraph para, PageBuilder pageBuilder, String name) {
    // Check the type: only paragraph elements are handled, not tables.
    if (para.getElementType() != BodyElementType.PARAGRAPH) {
        log.warn("Non-PARAGRAPH type paragraph was found. The type is: {}. Skipping.", para.getElementType());
        return null;
    }
    final String text = renderParagraph(para);
    if (text == null) {
        return null;
    }
    final String trimmed = text.trim();
    if (trimmed.isEmpty() || trimmed.equalsIgnoreCase("null")) {
        return null;
    }
    log.debug("Paragraph text is: {}", text);
    final Map<String, Object> properties = new HashMap<String, Object>();
    properties.put("textIsRich", true);
    properties.put("text", text);
    return pageBuilder.createComponent(TEXT_COMPONENT, new ValueMapDecorator(properties), name);
}
/**
 * Appends the opening inline HTML tags matching the run's bold, italic, strike-through, subscript and
 * superscript settings.
 *
 * NOTE(review): the tag literals in this file were empty strings (markup appears to have been stripped from
 * the listing); the standard HTML tags below are a reconstruction — confirm against the original source.
 *
 * @param run run whose formatting flags are inspected
 * @param builder target the tags are appended to
 */
private void addOpenStyleTags(XWPFRun run, StringBuilder builder) {
    if (run.isBold()) {
        builder.append("<b>");
    }
    if (run.isItalic()) {
        builder.append("<i>");
    }
    if (run.isStrike()) {
        builder.append("<s>");
    }
    if (run.getSubscript() == VerticalAlign.SUBSCRIPT) {
        builder.append("<sub>");
    }
    if (run.getSubscript() == VerticalAlign.SUPERSCRIPT) {
        builder.append("<sup>");
    }
}
/**
 * Appends the closing inline HTML tags for the run's formatting, in the reverse of the order used by
 * {@code addOpenStyleTags} so that the emitted elements nest properly.
 *
 * @param run run whose formatting flags are inspected
 * @param builder target the tags are appended to
 */
private void addCloseStyleTags(XWPFRun run, StringBuilder builder) {
    if (run.getSubscript() == VerticalAlign.SUPERSCRIPT) {
        builder.append("</sup>");
    }
    if (run.getSubscript() == VerticalAlign.SUBSCRIPT) {
        builder.append("</sub>");
    }
    if (run.isStrike()) {
        builder.append("</s>");
    }
    if (run.isItalic()) {
        builder.append("</i>");
    }
    if (run.isBold()) {
        builder.append("</b>");
    }
}
/**
 * Determines the heading level encoded in the paragraph's style name.
 *
 * Only style names starting with "heading" (compared case-insensitively) are considered; the level is the
 * style name's last character parsed as an integer.
 *
 * @param para paragraph whose style name is inspected
 * @return the parsed heading number, or -1 when the paragraph has no style, the style is not a heading
 *         style, or its last character is not a digit
 */
private int checkStyle(XWPFParagraph para) {
    final String style = para.getStyle();
    final String headingPrefix = "heading";
    // regionMatches(ignoreCase) compares character by character and does not depend on the default
    // locale, unlike toLowerCase() followed by startsWith().
    if (style == null || !style.regionMatches(true, 0, headingPrefix, 0, headingPrefix.length())) {
        // Looking for heading styles that translate to HTML; ignore others.
        return -1;
    }
    try {
        return Integer.parseInt(style.substring(style.length() - 1));
    } catch (NumberFormatException ignored) {
        // The heading style name does not end in a digit: treat it as "no heading level".
        return -1;
    }
}
/**
 * Copies every paragraph of the given range into an array, in index order.
 *
 * @param range range to read the paragraphs from
 * @return an array with one entry per paragraph of the range
 */
private Paragraph[] getParagraphs(Range range) {
    final Paragraph[] result = new Paragraph[range.numParagraphs()];
    int index = 0;
    while (index < result.length) {
        result[index] = range.getParagraph(index);
        index++;
    }
    return result;
}
/**
 * Classifies a .docx paragraph as text only, image only, or text with images.
 *
 * The paragraph has pictures when any run holds at least one embedded picture. It has text when its
 * paragraph text is non-null, not blank, and (after trimming) does not start with "null".
 *
 * @param para paragraph to classify
 * @return {@code TEXT_IMAGE_TYPE}, {@code IMAGE_TYPE} or {@code TEXT_TYPE}; a paragraph without pictures is
 *         always reported as {@code TEXT_TYPE}
 */
private short getParagraphType(XWPFParagraph para) {
    boolean hasPictures = false;
    for (XWPFRun run : para.getRuns()) {
        if (!run.getEmbeddedPictures().isEmpty()) {
            hasPictures = true;
            break;
        }
    }
    boolean hasText = false;
    final String paraText = para.getParagraphText();
    if (paraText != null) {
        final String trimmed = paraText.trim();
        hasText = !trimmed.equals("") && !trimmed.startsWith("null");
    }
    if (!hasPictures) {
        return TEXT_TYPE;
    }
    return hasText ? TEXT_IMAGE_TYPE : IMAGE_TYPE;
}
/**
 * Classifies a .doc paragraph as text only, image only, or text with images.
 *
 * A character run counts as a picture when the document's pictures table reports one for it; any other run
 * counts as text when its text is non-null and not blank.
 *
 * @param paragraph paragraph to classify
 * @param doc document whose pictures table is consulted
 * @return {@code IMAGE_TYPE}, {@code TEXT_IMAGE_TYPE} or {@code TEXT_TYPE}; a paragraph without pictures is
 *         always reported as {@code TEXT_TYPE}
 */
private short getParagraphType(Paragraph paragraph, HWPFDocument doc) {
    boolean pictureSeen = false;
    boolean textSeen = false;
    int runIndex = 0;
    while (runIndex < paragraph.numCharacterRuns()) {
        final CharacterRun run = paragraph.getCharacterRun(runIndex);
        if (doc.getPicturesTable().hasPicture(run)) {
            pictureSeen = true;
        } else {
            final String runText = paragraph.getCharacterRun(runIndex).text();
            final boolean nonBlank = runText != null
                    && paragraph.getCharacterRun(runIndex).text().trim().length() > 0;
            textSeen = textSeen | nonBlank;
        }
        runIndex++;
    }
    if (!pictureSeen) {
        return WordExtractionHandler.TEXT_TYPE;
    }
    if (textSeen) {
        return WordExtractionHandler.TEXT_IMAGE_TYPE;
    }
    return WordExtractionHandler.IMAGE_TYPE;
}
/**
* Takes the given paragraph from a Word .doc file and formats the text with proper HTML tags.
* @param paragraph
* @return The HTML formatted text.
*/
private String renderParagraph(Paragraph paragraph) {
StringBuilder builder = new StringBuilder();
if (paragraph.getJustification() == JUSTIFICATION_RIGHT) {
builder.append("");
} else if (paragraph.getJustification() == JUSTIFICATION_CENTER) {
builder.append("
");
} else {
builder.append("
");
}
for (int i = 0; i < paragraph.numCharacterRuns(); i++) {
CharacterRun characters = paragraph.getCharacterRun(i);
if (characters.isBold()) {
builder.append("");
}
if (characters.isItalic()) {
builder.append("");
}
for (char c : characters.text().toCharArray()) {
if (c == '<') {
builder.append("<");
} else if (c == '>') {
builder.append(">");
} else if (c == '&') {
builder.append("&");
} else {
builder.append(c);
}
}
if (characters.isItalic()) {
builder.append("");
}
if (characters.isBold()) {
builder.append("");
}
}
builder.append("
");
return builder.toString();
}
private String renderParagraph(XWPFParagraph para) {
StringBuilder builder = new StringBuilder();
if (log.isDebugEnabled()) {
if (para.getText() == null) {
log.debug("Para text is a null object");
} else {
log.debug("Para text is " + para.getText());
}
}
if (para.getText() != null && !para.getText().trim().equalsIgnoreCase("") && !para.getText().trim().equalsIgnoreCase("null")) {
builder.append("");
//check if a head style applies
int style = checkStyle(para);
if (style == -1 || style > 6) {
//no heading style, so loop through all runs and build out the formatted string
//also treating any heading > 6 as normal text since HTML only supports up to H6
Iterator runIter = para.getRuns().iterator();
while (runIter.hasNext()) {
XWPFRun run = runIter.next();
if (run.getText(0) != null && !run.getText(0).trim().equalsIgnoreCase("null")) {
addOpenStyleTags(run, builder);
builder.append(run.getText(0));
addCloseStyleTags(run, builder);
}
}
} else {
//style heading exists, so just put all the text under the heading element
builder.append("");
builder.append(para.getText());
builder.append(" ");
}
builder.append("
");
}
return builder.toString();
}
}