// com.day.cq.dam.word.extraction.WordExtractionHandler Maven / Gradle / Ivy
//
// Show more of this group Show more artifacts with this name
// Show all versions of uber-jar Show documentation
// There is a newer version: 6.5.21
/*************************************************************************
 *
 * ADOBE CONFIDENTIAL
 * ___________________
 *
 *  Copyright 2012 Adobe Systems Incorporated
 *  All Rights Reserved.
 *
 * NOTICE:  All information contained herein is, and remains
 * the property of Adobe Systems Incorporated and its suppliers,
 * if any.  The intellectual and technical concepts contained
 * herein are proprietary to Adobe Systems Incorporated and its
 * suppliers and are protected by trade secret or copyright law.
 * Dissemination of this information or reproduction of this material
 * is strictly forbidden unless prior written permission is obtained
 * from Adobe Systems Incorporated.
 **************************************************************************/
package com.day.cq.dam.word.extraction;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Service;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.openxml4j.exceptions.OLE2NotOfficeXmlFileException;
import org.apache.poi.util.IOUtils;
import org.apache.poi.xwpf.usermodel.BodyElementType;
import org.apache.poi.xwpf.usermodel.VerticalAlign;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFPicture;
import org.apache.poi.xwpf.usermodel.XWPFPictureData;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.sling.api.resource.ResourceResolver;
import org.apache.sling.api.wrappers.ValueMapDecorator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.day.cq.dam.api.Asset;
import com.day.cq.dam.api.Rendition;
import com.day.cq.dam.api.RenditionPicker;
import com.day.cq.dam.indd.AbstractPageExtractionHandler;
import com.day.cq.dam.indd.PageBuilder;
import com.day.cq.dam.indd.PageComponent;
import com.day.cq.dam.indd.PageExtractionException;
import com.day.cq.dam.indd.PageExtractionHandler;
import com.day.cq.wcm.api.Page;
import com.day.cq.wcm.api.WCMException;

/**
 * A {@link PageExtractionHandler} for Microsoft Word's DOCX and DOC formats.
 * <p>
 * Extracts a page from a Word <code>.docx</code> or <code>.doc</code> file.
 *
 * @see PageExtractionHandler
 */

@Component
@Service(value = PageExtractionHandler.class)
@Properties({
	@Property(name = PageExtractionHandler.SERVICE_PROPERTY_LABEL, value = "Word Extraction Handler", propertyPrivate = true),
	@Property(name = PageExtractionHandler.SERVICE_PROPERTY_DESCRIPTION, value = "Extraction Handler for a MS Word files.", propertyPrivate = true)
})
public class WordExtractionHandler extends AbstractPageExtractionHandler {
	
	// Logger shared by all extraction methods of this handler.
	private static final Logger log = LoggerFactory.getLogger(WordExtractionHandler.class);
	
	// Resource types passed to PageBuilder.createComponent(...) for the text, image and
	// combined text+image components generated from the document's paragraphs.
	private static final String TEXT_COMPONENT = "foundation/components/text";
	private static final String IMAGE_COMPONENT = "foundation/components/image";
	private static final String TEXT_IMAGE_COMPONENT = "foundation/components/textimage";
	
	// Values compared against Paragraph.getJustification() when rendering .doc paragraphs.
	// NOTE(review): presumably POI's HWPF justification codes for right/center alignment - confirm.
	private static final int JUSTIFICATION_RIGHT = 2;
	private static final int JUSTIFICATION_CENTER = 1;
	
	// Paragraph classifications returned by the getParagraphType(...) overloads and switched on
	// by extractDoc/extractDocx to decide which kind of component to create.
	private static final short TEXT_TYPE = 0;
	private static final short IMAGE_TYPE = 1;
	private static final short TEXT_IMAGE_TYPE = 2;
	
	/**
	 * Returns a rendition picker targeted at MS Word assets.
	 * <p>
	 * The picker yields the asset's <code>original</code> rendition when the asset's MIME type
	 * identifies a Word <code>.doc</code> or <code>.docx</code> document. It yields
	 * <code>null</code> for a <code>null</code> asset, for an asset without a MIME type and for
	 * any other MIME type.
	 * <p>
	 * {@inheritDoc}
	 */
	public RenditionPicker getRenditionPicker() {
		return new RenditionPicker() {
			public Rendition getRendition(Asset asset) {
				if (asset == null) {
					return null;
				}

				final String assetMime = asset.getMimeType();
				if (assetMime == null) {
					// Without a MIME type the asset cannot be identified as a Word document.
					return null;
				}

				final boolean isWordDocument = assetMime.matches("application.*msword")
						|| assetMime.equals("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
				return isWordDocument ? asset.getRendition("original") : null;
			}
		};
	}
	
	/**
	 * Extracts a page from the given rendition, choosing the extractor by the MIME type of the
	 * rendition's asset: the <code>.docx</code> extractor for the OOXML word-processing type, the
	 * <code>.doc</code> extractor for MIME types matching <code>application.*msword</code>.
	 * <p>
	 * {@inheritDoc}
	 *
	 * @throws PageExtractionException if the asset has no MIME type, the MIME type is not a
	 *         supported Word type, or the selected extractor fails
	 */
	public Page extractPage(Rendition rend, String pageRoot, String pageName, String pageTitle, String pageTemplate, String pageDesign) 
			throws PageExtractionException {
		final Asset asset = rend.getAsset();
		final String rendMime = asset.getMimeType();

		// A missing MIME type is treated like an unsupported one instead of failing with an NPE.
		if (rendMime != null) {
			if (rendMime.equals("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
				return extractDocx(rend, pageRoot, pageName, pageTitle, pageTemplate, pageDesign);
			}
			if (rendMime.matches("application.*msword")) {
				return extractDoc(rend, pageRoot, pageName, pageTitle, pageTemplate, pageDesign);
			}
		}

		final String msg = "No appropriate extractor found for: " + asset.getName();
		log.info(msg);
		throw new PageExtractionException(msg);
	}
	
	/**
	 * Extracts the text, image and textimage components from a Word .doc file and creates a CQ
	 * page that contains them plus a title.
	 * <p>
	 * Every paragraph of the document range is classified via
	 * {@link #getParagraphType(Paragraph, HWPFDocument)} and converted into one or more
	 * components that are added, in document order, to a single paragraph-system component.
	 *
	 * @param rend rendition whose asset's original stream is parsed
	 * @param pageRoot passed through to the page builder
	 * @param pageName passed through to the page builder
	 * @param pageTitle fallback page title, see {@code buildPage}
	 * @param pageTemplate passed through to the page builder
	 * @param pageDesign passed through to the page builder
	 * @return the created page, or <code>null</code> when POI reports an
	 *         {@link OLE2NotOfficeXmlFileException} (the error is logged)
	 * @throws PageExtractionException if no page builder is available or any other failure
	 *         occurs while reading the document or building the page
	 */
	private Page extractDoc(Rendition rend, String pageRoot, String pageName, String pageTitle, String pageTemplate, String pageDesign) 
			throws PageExtractionException {
		final String assetPath = rend.getAsset().getPath();
		log.info("Beginning page extraction from: {}", assetPath);

		int textIndex = 0;
		int textImageIndex = 0;
		int imageIndex = 0;

		final ResourceResolver resourceResolver = rend.getResourceResolver();
		final PageBuilder pageBuilder = getPageBuilder(resourceResolver);
		if (pageBuilder == null) {
			// Previously this case surfaced later as a wrapped NullPointerException without message.
			throw new PageExtractionException("No page builder available for extraction from: " + assetPath);
		}

		final String subAssetPath = assetPath + "/subassets";
		log.debug("Images in the page will point to the appropriate subassets located at: {}", subAssetPath);

		InputStream is = null;
		try {
			is = rend.getAsset().getOriginal().getStream();
			final HWPFDocument doc = new HWPFDocument(is);
			final PageComponent par = createParComponent(pageBuilder);
			final Paragraph[] paragraphs = getParagraphs(doc.getRange());

			log.debug("Found {} paragraphs in the document.", paragraphs.length);

			for (int i = 0; i < paragraphs.length; i++) {
				final Paragraph paragraph = paragraphs[i];
				final short type = getParagraphType(paragraph, doc);
				final String nodeName;
				switch (type) {
				default:
					log.debug("Unknown paragraph type, will treat it as text only");
					// intentionally falls through to the text handling below

				case TEXT_TYPE:
					log.debug("Paragraph {} contains only text.", i);
					nodeName = "text_" + textIndex++;
					final PageComponent textComp = createTextNode(paragraph, pageBuilder, nodeName);
					if (textComp != null) {
						par.getChildComponents().add(textComp);
					}
					break;

				case IMAGE_TYPE:
					log.debug("Paragraph {} contains only images.", i);
					nodeName = "image_" + imageIndex++;
					final List<PageComponent> imageComps = createImageNode(paragraph, pageBuilder, nodeName,
							subAssetPath, doc.getPicturesTable());
					if (imageComps != null && !imageComps.isEmpty()) {
						par.getChildComponents().addAll(imageComps);
					}
					break;

				case TEXT_IMAGE_TYPE:
					log.debug("Paragraph {} contains both text and images.", i);
					nodeName = "textImage_" + textImageIndex++;
					final List<PageComponent> textImageComps = createTextImageNode(paragraph, pageBuilder, nodeName,
							subAssetPath, doc.getPicturesTable());
					if (textImageComps != null && !textImageComps.isEmpty()) {
						par.getChildComponents().addAll(textImageComps);
					}
					break;
				}
			}

			return buildPage(rend, pageRoot, pageName, pageTitle, pageTemplate,
					pageDesign, pageBuilder, par);

		}
		catch (OLE2NotOfficeXmlFileException oe) {
			// Best effort: the failure is logged and reported to the caller as "no page".
			log.error("Error while page extraction from : " + assetPath, oe);
		}
		catch (Throwable e) {
			throw new PageExtractionException(e.getMessage(), e);
		} finally {
			IOUtils.closeQuietly(is);
		}
		return null;
	}
	
	/**
	 * Extracts the text, image and textimage components from a Word .docx file and creates a CQ
	 * page that contains them plus a title.
	 * <p>
	 * Empty paragraphs and page-break paragraphs are skipped. Every other paragraph is classified
	 * via {@link #getParagraphType(XWPFParagraph)} and converted into one or more components that
	 * are added, in document order, to a single paragraph-system component.
	 *
	 * @param rend rendition whose asset's original stream is parsed
	 * @param pageRoot passed through to the page builder
	 * @param pageName passed through to the page builder
	 * @param pageTitle fallback page title, see {@code buildPage}
	 * @param pageTemplate passed through to the page builder
	 * @param pageDesign passed through to the page builder
	 * @return the created page, or <code>null</code> when POI reports an
	 *         {@link OLE2NotOfficeXmlFileException} (the error is logged)
	 * @throws PageExtractionException if no page builder is available or any other failure
	 *         occurs while reading the document or building the page
	 */
	private Page extractDocx(Rendition rend, String pageRoot, String pageName, String pageTitle, String pageTemplate, String pageDesign)
			throws PageExtractionException {
		final String assetPath = rend.getAsset().getPath();
		log.info("Beginning page extraction from: {}", assetPath);

		int textIndex = 0;
		int textImageIndex = 0;
		int imageIndex = 0;

		final ResourceResolver resourceResolver = rend.getResourceResolver();
		final PageBuilder pageBuilder = getPageBuilder(resourceResolver);
		if (pageBuilder == null) {
			// Previously this case surfaced later as a wrapped NullPointerException without message.
			throw new PageExtractionException("No page builder available for extraction from: " + assetPath);
		}

		final String subAssetPath = assetPath + "/subassets";
		log.debug("Images in the page will point to the appropriate subassets located at: {}", subAssetPath);

		InputStream is = null;
		try {
			is = rend.getAsset().getOriginal().getStream();
			final XWPFDocument docx = new XWPFDocument(is);
			final PageComponent par = createParComponent(pageBuilder);

			final Iterator<XWPFParagraph> paraIter = docx.getParagraphsIterator();
			while (paraIter.hasNext()) {
				final XWPFParagraph para = paraIter.next();
				if (para.isEmpty()) {
					log.debug("Empty paragraph found, ignoring.");
					continue;
				}

				// Bug in apache poi-ooxml library in which isPageBreak() throws an NPE. It is fixed via commit
				// https://svn.apache.org/viewvc?view=revision&revision=1795254 and will be included in
				// POI 3.17 beta 1. Remove this handling when the poi library is updated.
				try {
					if (para.isPageBreak()) {
						log.debug("Page break paragraph found, ignoring.");
						continue;
					}
				} catch (NullPointerException npe) {
					// Do not consider it a page break
				}

				final short paraType = getParagraphType(para);
				final String nodeName;

				switch (paraType) {
					case TEXT_TYPE:
						log.debug("Current paragraph contains only text.");
						nodeName = "text_" + textIndex++;
						final PageComponent textComp = createTextNode(para, pageBuilder, nodeName);
						if (textComp != null) {
							par.getChildComponents().add(textComp);
						}
						break;
					case IMAGE_TYPE:
						log.debug("Current paragraph contains only images.");
						nodeName = "image_" + imageIndex++;
						final List<PageComponent> imageComps = createImageNode(para, pageBuilder, nodeName, subAssetPath);
						if (imageComps != null && !imageComps.isEmpty()) {
							par.getChildComponents().addAll(imageComps);
						}
						break;
					case TEXT_IMAGE_TYPE:
						log.debug("Current paragraph contains both text and images.");
						nodeName = "textImage_" + textImageIndex++;
						final List<PageComponent> textImageComps = createTextImageNode(para, pageBuilder, nodeName, subAssetPath);
						if (textImageComps != null && !textImageComps.isEmpty()) {
							par.getChildComponents().addAll(textImageComps);
						}
						break;
				}
			}

			return buildPage(rend, pageRoot, pageName, pageTitle, pageTemplate,
					pageDesign, pageBuilder, par);

		}
		catch (OLE2NotOfficeXmlFileException oe) {
			// Best effort: the failure is logged and reported to the caller as "no page".
			log.error("Error while page extraction from : " + assetPath, oe);
		}
		catch (Throwable e) {
			throw new PageExtractionException(e.getMessage(), e);
		} finally {
			IOUtils.closeQuietly(is);
		}
		return null;
	}

	/**
	 * Creates the page from the extracted paragraph-system component and a title component.
	 * <p>
	 * The title is the asset's <code>dc:title</code> metadata value when that is non-blank;
	 * otherwise the supplied <code>pageTitle</code> with a creation timestamp appended.
	 *
	 * @param rend rendition whose asset supplies the metadata title and the logged path
	 * @param pageRoot passed through to {@code PageBuilder.recreatePage}
	 * @param pageName passed through to {@code PageBuilder.recreatePage}
	 * @param pageTitle fallback title used when the asset has no <code>dc:title</code>
	 * @param pageTemplate passed through to {@code PageBuilder.recreatePage}
	 * @param pageDesign passed through to {@code PageBuilder.recreatePage}
	 * @param pageBuilder builder used to create the title component and the page
	 * @param par component holding the extracted content; added before the title component
	 * @return the page returned by the page builder
	 * @throws WCMException propagated from the page builder
	 */
	private Page buildPage(Rendition rend, String pageRoot, String pageName,
			String pageTitle, String pageTemplate, String pageDesign,
			PageBuilder pageBuilder, PageComponent par) throws WCMException {
		final List<PageComponent> components = new ArrayList<PageComponent>();
		components.add(par);

		final String metadataTitle = rend.getAsset().getMetadataValue("dc:title");
		final boolean hasMetadataTitle = metadataTitle != null && metadataTitle.trim().length() > 0;

		final String effectiveTitle;
		if (hasMetadataTitle) {
			log.debug("Title found in the document metadata: " + metadataTitle);
			effectiveTitle = metadataTitle;
		} else {
			effectiveTitle = pageTitle + " (created: " + new Date() + ")";
			log.debug("No title found in the document metadata.  Using: " + effectiveTitle);
		}

		components.add(createTitleComponent(pageBuilder, effectiveTitle));
		final Page created = pageBuilder.recreatePage(pageRoot, pageName, effectiveTitle, pageTemplate, pageDesign, components);

		log.info("Page extraction from: " + rend.getAsset().getPath() + " has completed successfully.");

		return created;
	}
	
	/**
	 * Creates the <code>foundation/components/title</code> component for the page, carrying the
	 * given title as <code>jcr:title</code> and the type <code>extralarge</code>.
	 *
	 * @param pageBuilder builder used to create the component
	 * @param pageTitle title text stored on the component
	 * @return the title component
	 */
	@Override
	protected PageComponent createTitleComponent(PageBuilder pageBuilder, String pageTitle) {
		final Map<String, Object> titleProps = new HashMap<String, Object>();
		titleProps.put("type", "extralarge");
		titleProps.put("jcr:title", pageTitle);

		final ValueMapDecorator titleValues = new ValueMapDecorator(titleProps);
		return pageBuilder.createComponent("foundation/components/title", titleValues);
	}
	
	/**
	 * Given a paragraph from a Word .doc file, creates one textimage component that pairs the
	 * paragraph text with the first image found. Any additional images in the paragraph are
	 * returned as stand-alone image components. If the paragraph turns out to hold no picture,
	 * a plain text component is returned instead.
	 * <p>
	 * In theory there can be any number of text and image combinations in a paragraph, so the
	 * text is simply paired with the first image; the author can adjust the page manually later.
	 *
	 * @param para paragraph to convert
	 * @param pageBuilder builder used to create the components
	 * @param name node name of the textimage (or text) component
	 * @param subAssetPath path prefix under which the extracted pictures are referenced
	 * @param picTable pictures table of the document the paragraph belongs to
	 * @return the components to be added to the page, never <code>null</code>
	 */
	private List<PageComponent> createTextImageNode(Paragraph para, PageBuilder pageBuilder, String name, String subAssetPath,
			PicturesTable picTable) {
		final List<PageComponent> images = new ArrayList<PageComponent>();

		for (int i = 0; i < para.numCharacterRuns(); i++) {
			final CharacterRun run = para.getCharacterRun(i);
			if (!picTable.hasPicture(run)) {
				continue;
			}
			final Picture pic = picTable.extractPicture(run, false);
			final Map<String, Object> imageProps = new HashMap<String, Object>();
			imageProps.put("fileReference", subAssetPath + "/" + pic.suggestFullFileName());

			// The first image must be named "image" because that is the child name the textimage
			// component expects; later ones are numbered image_1, image_2, ...
			final String imageName = images.isEmpty() ? "image" : "image_" + images.size();
			images.add(pageBuilder.createComponent(IMAGE_COMPONENT, new ValueMapDecorator(imageProps), imageName));
		}

		log.debug("Found {} images.", images.size());

		final Map<String, Object> textProps = new HashMap<String, Object>();
		textProps.put("textIsRich", true);
		textProps.put("text", renderParagraph(para));

		final List<PageComponent> comps = new ArrayList<PageComponent>();
		if (images.isEmpty()) {
			comps.add(pageBuilder.createComponent(TEXT_COMPONENT, new ValueMapDecorator(textProps), name));
			return comps;
		}

		final PageComponent textImageComp = pageBuilder.createComponent(TEXT_IMAGE_COMPONENT, new ValueMapDecorator(textProps), name);
		textImageComp.getChildComponents().add(images.get(0));
		comps.add(textImageComp);
		comps.addAll(images.subList(1, images.size()));
		return comps;
	}
	/**
	 * Given a paragraph from a Word .docx file, creates one textimage component that pairs the
	 * paragraph text with the first image found. Any additional images in the paragraph are
	 * returned as stand-alone image components.
	 * <p>
	 * In theory there can be any number of text and image combinations in a paragraph, so the
	 * text is simply paired with the first image; the author can adjust the page manually later.
	 * Unlike the .doc variant, a textimage component is returned even when no image was found.
	 *
	 * @param para paragraph to convert
	 * @param pageBuilder builder used to create the components
	 * @param name node name of the textimage component
	 * @param subAssetPath path prefix under which the embedded pictures are referenced
	 * @return the components to be added to the page, never <code>null</code>
	 */
	private List<PageComponent> createTextImageNode(XWPFParagraph para, PageBuilder pageBuilder, String name, String subAssetPath) {
		final List<PageComponent> images = new ArrayList<PageComponent>();

		for (XWPFRun run : para.getRuns()) {
			for (XWPFPicture picture : run.getEmbeddedPictures()) {
				final XWPFPictureData data = picture.getPictureData();
				if (data == null) {
					log.debug("Embedded picture without picture data found, skipping.");
					continue;
				}

				// One property map per picture: the decorator wraps the map it is given, so a map
				// shared between pictures could leave all of them pointing at the last file.
				final Map<String, Object> imageProps = new HashMap<String, Object>();
				imageProps.put("fileReference", subAssetPath + "/" + data.getFileName());

				// The first image must be named "image" as that is what the textimage component expects.
				final String imageName = images.isEmpty() ? "image" : "image_" + images.size();
				images.add(pageBuilder.createComponent(IMAGE_COMPONENT, new ValueMapDecorator(imageProps), imageName));
			}
		}

		final String text = renderParagraph(para);

		log.debug("Found {} images.", images.size());

		final Map<String, Object> textProps = new HashMap<String, Object>();
		textProps.put("textIsRich", true);
		if (text != null) {
			final String trimmed = text.trim();
			if (trimmed.length() > 0 && !trimmed.equalsIgnoreCase("null")) {
				textProps.put("text", text);
			}
		}

		final List<PageComponent> components = new ArrayList<PageComponent>();
		final PageComponent textImageComp = pageBuilder.createComponent(TEXT_IMAGE_COMPONENT, new ValueMapDecorator(textProps), name);
		components.add(textImageComp);

		if (!images.isEmpty()) {
			textImageComp.getChildComponents().add(images.get(0));
			components.addAll(images.subList(1, images.size()));
		}

		return components;
	}
	
	/**
	 * Given a paragraph from a Word .doc file, creates one image component for each picture
	 * found in the paragraph's character runs.
	 *
	 * @param para paragraph to scan for pictures
	 * @param pageBuilder builder used to create the components
	 * @param name node name prefix; components are named <code>name_0</code>, <code>name_1</code>, ...
	 * @param subAssetPath path prefix under which the extracted pictures are referenced
	 * @param picTable pictures table of the document the paragraph belongs to
	 * @return the components to be added to the page, never <code>null</code>
	 */
	private List<PageComponent> createImageNode(Paragraph para, PageBuilder pageBuilder, String name, String subAssetPath, 
			PicturesTable picTable) {
		final List<PageComponent> comps = new ArrayList<PageComponent>();

		for (int i = 0; i < para.numCharacterRuns(); i++) {
			final CharacterRun run = para.getCharacterRun(i);
			if (!picTable.hasPicture(run)) {
				continue;
			}
			final Picture pic = picTable.extractPicture(run, false);
			final Map<String, Object> props = new HashMap<String, Object>();
			props.put("fileReference", subAssetPath + "/" + pic.suggestFullFileName());
			// The running index equals the number of components created so far.
			comps.add(pageBuilder.createComponent(IMAGE_COMPONENT, new ValueMapDecorator(props), name + "_" + comps.size()));
		}

		log.debug("Created {} image components.", comps.size());

		return comps;
	}
	
	/**
	 * Given a paragraph from a Word .docx file, creates one image component for each picture
	 * embedded in the paragraph's runs. Pictures without picture data are skipped.
	 *
	 * @param para paragraph to scan for pictures
	 * @param pageBuilder builder used to create the components
	 * @param name node name prefix; components are named <code>name_0</code>, <code>name_1</code>, ...
	 * @param subAssetPath path prefix under which the embedded pictures are referenced
	 * @return the components to be added to the page, never <code>null</code>
	 */
	private List<PageComponent> createImageNode(XWPFParagraph para, PageBuilder pageBuilder, String name, String subAssetPath) {
		final List<PageComponent> comps = new ArrayList<PageComponent>();

		for (XWPFRun run : para.getRuns()) {
			for (XWPFPicture picture : run.getEmbeddedPictures()) {
				final XWPFPictureData data = picture.getPictureData();
				if (data == null) {
					log.debug("Embedded picture without picture data found, skipping.");
					continue;
				}

				final Map<String, Object> properties = new HashMap<String, Object>();
				properties.put("fileReference", subAssetPath + "/" + data.getFileName());
				// The running index equals the number of components created so far.
				comps.add(pageBuilder.createComponent(IMAGE_COMPONENT, new ValueMapDecorator(properties), name + "_" + comps.size()));
			}
		}

		log.debug("Created {} image components.", comps.size());

		return comps;
	}
	
	/**
	 * Given a paragraph from a Word .doc file, creates a rich-text text component that
	 * encompasses the rendered paragraph text.
	 *
	 * @param para paragraph to render
	 * @param pageBuilder builder used to create the component
	 * @param name node name of the component
	 * @return the text component to be added to the page
	 */
	private PageComponent createTextNode(Paragraph para, PageBuilder pageBuilder, String name) {
		final String renderedText = renderParagraph(para);

		final Map<String, Object> textProps = new HashMap<String, Object>();
		textProps.put("text", renderedText);
		textProps.put("textIsRich", true);

		final ValueMapDecorator textValues = new ValueMapDecorator(textProps);
		return pageBuilder.createComponent(TEXT_COMPONENT, textValues, name);
	}
	
	/**
	 * Given a paragraph from a Word .docx file, creates a rich-text text component that
	 * encompasses the rendered paragraph text.
	 *
	 * @param para paragraph to render
	 * @param pageBuilder builder used to create the component
	 * @param name node name of the component
	 * @return the text component to be added to the page, or <code>null</code> when the element
	 *         is not of type PARAGRAPH or its rendered text is null, blank or the word "null"
	 */
	private PageComponent createTextNode(XWPFParagraph para, PageBuilder pageBuilder, String name) {
		// Only paragraph elements are handled here, not tables.
		if (para.getElementType() != BodyElementType.PARAGRAPH) {
			log.warn("Non-PARAGRAPH type paragraph was found.  The type is: " + para.getElementType() + ". Skipping.");
			return null;
		}

		final String rendered = renderParagraph(para);
		if (rendered == null) {
			return null;
		}
		final String trimmed = rendered.trim();
		if (trimmed.equalsIgnoreCase("") || trimmed.equalsIgnoreCase("null")) {
			return null;
		}

		log.debug("Paragraph text is: " + rendered);
		final Map<String, Object> textProps = new HashMap<String, Object>();
		textProps.put("text", rendered);
		textProps.put("textIsRich", true);
		return pageBuilder.createComponent(TEXT_COMPONENT, new ValueMapDecorator(textProps), name);
	}
	
	/**
	 * Appends the opening HTML tags for the character formatting of the given run: bold,
	 * italic, strike-through, subscript and superscript, in that order.
	 *
	 * @param run run whose formatting is inspected
	 * @param builder target the tags are appended to
	 */
	private void addOpenStyleTags(XWPFRun run, StringBuilder builder) {
		if (run.isBold()) {
			builder.append("<b>");
		}
		if (run.isItalic()) {
			builder.append("<i>");
		}
		if (run.isStrike()) {
			builder.append("<s>");
		}
		if (run.getSubscript() == VerticalAlign.SUBSCRIPT) {
			builder.append("<sub>");
		}
		if (run.getSubscript() == VerticalAlign.SUPERSCRIPT) {
			builder.append("<sup>");
		}
	}
	
	/**
	 * Appends the closing HTML tags matching {@link #addOpenStyleTags(XWPFRun, StringBuilder)}.
	 * The tags are closed in the reverse of the opening order so the elements nest properly.
	 *
	 * @param run run whose formatting is inspected
	 * @param builder target the tags are appended to
	 */
	private void addCloseStyleTags(XWPFRun run, StringBuilder builder) {
		if (run.getSubscript() == VerticalAlign.SUPERSCRIPT) {
			builder.append("</sup>");
		}
		if (run.getSubscript() == VerticalAlign.SUBSCRIPT) {
			builder.append("</sub>");
		}
		if (run.isStrike()) {
			builder.append("</s>");
		}
		if (run.isItalic()) {
			builder.append("</i>");
		}
		if (run.isBold()) {
			builder.append("</b>");
		}
	}
	
	/**
	 * Determines the heading level expressed by the paragraph's style.
	 * <p>
	 * Only styles whose name starts with "heading" (case-insensitively) are considered; the
	 * whole remainder of the name must be a positive number, e.g. "Heading1" or "heading 2".
	 *
	 * @param para paragraph whose style is inspected
	 * @return the heading level (1 or greater), or -1 when the paragraph has no style, the style
	 *         is not a heading style, or no valid level follows the "heading" prefix
	 */
	private int checkStyle(XWPFParagraph para) {
		final String style = para.getStyle();
		final String prefix = "heading";

		// regionMatches(ignoreCase) avoids the default-locale dependence of toLowerCase().
		if (style == null || !style.regionMatches(true, 0, prefix, 0, prefix.length())) {
			return -1;
		}

		// Parse everything after the prefix so that multi-digit levels such as "Heading10" are not
		// mistaken for level 0 by looking at the last character only.
		final String levelText = style.substring(prefix.length()).trim();
		if (levelText.length() == 0) {
			return -1;
		}
		try {
			final int level = Integer.parseInt(levelText);
			return level >= 1 ? level : -1;
		} catch (NumberFormatException ignored) {
			// Heading-like style without a numeric level (e.g. a derived character style).
			return -1;
		}
	}
	
	/**
	 * Collects all paragraphs of the given range into an array, in range order.
	 *
	 * @param range range to read the paragraphs from
	 * @return an array holding <code>range.numParagraphs()</code> paragraphs
	 */
	private Paragraph[] getParagraphs(Range range) {
		final int total = range.numParagraphs();
		final Paragraph[] collected = new Paragraph[total];
		int index = 0;
		while (index < total) {
			collected[index] = range.getParagraph(index);
			index++;
		}
		return collected;
	}
	
	/**
	 * Classifies a .docx paragraph as text only, image only, or text plus image.
	 * <p>
	 * The paragraph counts as having pictures when any run embeds at least one picture, and as
	 * having text when its paragraph text is non-blank and does not start with "null".
	 *
	 * @param para paragraph to classify
	 * @return {@code TEXT_IMAGE_TYPE}, {@code IMAGE_TYPE} or {@code TEXT_TYPE}; paragraphs
	 *         without pictures are always {@code TEXT_TYPE}
	 */
	private short getParagraphType(XWPFParagraph para) {
		boolean hasPictures = false;
		for (XWPFRun run : para.getRuns()) {
			if (!run.getEmbeddedPictures().isEmpty()) {
				hasPictures = true;
				break;
			}
		}
		if (!hasPictures) {
			return TEXT_TYPE;
		}

		// NOTE(review): text starting with "null" is treated as "no text"; presumably a workaround
		// for the library rendering missing run text as "null" - confirm before changing.
		final String paraText = para.getParagraphText();
		boolean hasText = false;
		if (paraText != null) {
			final String trimmed = paraText.trim();
			hasText = trimmed.length() > 0 && !trimmed.startsWith("null");
		}

		return hasText ? TEXT_IMAGE_TYPE : IMAGE_TYPE;
	}
	
	/**
	 * Classifies a .doc paragraph as text only, image only, or text plus image.
	 * <p>
	 * A character run counts as a picture when the document's pictures table reports one for
	 * it; any other run counts as text when its text is non-null and non-blank.
	 *
	 * @param paragraph paragraph to classify
	 * @param doc document providing the pictures table
	 * @return {@code IMAGE_TYPE}, {@code TEXT_IMAGE_TYPE} or {@code TEXT_TYPE}; paragraphs
	 *         without pictures are always {@code TEXT_TYPE}
	 */
	private short getParagraphType(Paragraph paragraph, HWPFDocument doc) {
		boolean pictureSeen = false;
		boolean textSeen = false;

		final int runCount = paragraph.numCharacterRuns();
		for (int runIndex = 0; runIndex < runCount; runIndex++) {
			final CharacterRun run = paragraph.getCharacterRun(runIndex);
			if (doc.getPicturesTable().hasPicture(run)) {
				pictureSeen = true;
				continue;
			}
			final String runText = run.text();
			if (runText != null && runText.trim().length() > 0) {
				textSeen = true;
			}
		}

		if (!pictureSeen) {
			return WordExtractionHandler.TEXT_TYPE;
		}
		return textSeen ? WordExtractionHandler.TEXT_IMAGE_TYPE : WordExtractionHandler.IMAGE_TYPE;
	}
	
	/**
	 * Takes the given paragraph from a Word .doc file and formats the text with proper HTML tags.
	 * @param paragraph
	 * @return The HTML formatted text.
	 */
	private String renderParagraph(Paragraph paragraph) {
        StringBuilder builder = new StringBuilder();
        if (paragraph.getJustification() == JUSTIFICATION_RIGHT) {
            builder.append("
");
        } else if (paragraph.getJustification() == JUSTIFICATION_CENTER) {
            builder.append("
");
        } else {
            builder.append("
");
        }

        for (int i = 0; i < paragraph.numCharacterRuns(); i++) {
            CharacterRun characters = paragraph.getCharacterRun(i);
            if (characters.isBold()) {
                builder.append("");
            }
            if (characters.isItalic()) {
                builder.append("");
            }

            for (char c : characters.text().toCharArray()) {
                if (c == '<') {
                    builder.append("<");
                } else if (c == '>') {
                    builder.append(">");
                } else if (c == '&') {
                    builder.append("&");
                } else {
                    builder.append(c);
                }
            }

            if (characters.isItalic()) {
                builder.append("");
            }
            if (characters.isBold()) {
                builder.append("");
            }
        }

        builder.append("");
        return builder.toString();
    }
	
	/**
	 * Takes the given paragraph from a Word .docx file and formats its text as HTML.
	 * <p>
	 * A paragraph whose style maps to heading level 1-6 is rendered as a single
	 * <code>h1</code>..<code>h6</code> element containing the whole paragraph text. Any other
	 * paragraph is rendered as a <code>p</code> element in which each run is wrapped in the tags
	 * for its character formatting. Text is HTML-escaped in both cases.
	 *
	 * @param para paragraph to render
	 * @return the HTML formatted text, or an empty string when the paragraph text is null,
	 *         blank or the word "null"
	 */
	private String renderParagraph(XWPFParagraph para) {
		final String paraText = para.getText();

		if (log.isDebugEnabled()) {
			if (paraText == null) {
				log.debug("Para text is a null object");
			} else {
				log.debug("Para text is {}", paraText);
			}
		}

		if (paraText == null) {
			return "";
		}
		final String trimmed = paraText.trim();
		if (trimmed.length() == 0 || trimmed.equalsIgnoreCase("null")) {
			return "";
		}

		final StringBuilder builder = new StringBuilder();
		final int headingLevel = checkStyle(para);
		if (headingLevel >= 1 && headingLevel <= 6) {
			// Heading style exists, so put all the text under the heading element.
			builder.append("<h").append(headingLevel).append('>');
			builder.append(escapeDocxText(paraText));
			builder.append("</h").append(headingLevel).append('>');
		} else {
			// No usable heading style (HTML only supports h1-h6), so loop through all runs and
			// build out the formatted string.
			builder.append("<p>");
			for (XWPFRun run : para.getRuns()) {
				final String runText = run.getText(0);
				if (runText != null && !runText.trim().equalsIgnoreCase("null")) {
					addOpenStyleTags(run, builder);
					builder.append(escapeDocxText(runText));
					addCloseStyleTags(run, builder);
				}
			}
			builder.append("</p>");
		}

		return builder.toString();
	}

	/**
	 * Replaces the characters <code>&lt;</code>, <code>&gt;</code> and <code>&amp;</code> by
	 * their HTML entities so document text cannot be interpreted as markup.
	 */
	private String escapeDocxText(String text) {
		final StringBuilder escaped = new StringBuilder(text.length() + 16);
		for (int i = 0; i < text.length(); i++) {
			final char c = text.charAt(i);
			if (c == '<') {
				escaped.append("&lt;");
			} else if (c == '>') {
				escaped.append("&gt;");
			} else if (c == '&') {
				escaped.append("&amp;");
			} else {
				escaped.append(c);
			}
		}
		return escaped.toString();
	}
	
}