All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.day.cq.dam.word.extraction.WordExtractionHandler Maven / Gradle / Ivy

There is a newer version: 6.5.21
Show newest version
/*************************************************************************
 *
 * ADOBE CONFIDENTIAL
 * ___________________
 *
 *  Copyright 2012 Adobe Systems Incorporated
 *  All Rights Reserved.
 *
 * NOTICE:  All information contained herein is, and remains
 * the property of Adobe Systems Incorporated and its suppliers,
 * if any.  The intellectual and technical concepts contained
 * herein are proprietary to Adobe Systems Incorporated and its
 * suppliers and are protected by trade secret or copyright law.
 * Dissemination of this information or reproduction of this material
 * is strictly forbidden unless prior written permission is obtained
 * from Adobe Systems Incorporated.
 **************************************************************************/
package com.day.cq.dam.word.extraction;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Service;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.openxml4j.exceptions.OLE2NotOfficeXmlFileException;
import org.apache.poi.util.IOUtils;
import org.apache.poi.xwpf.usermodel.BodyElementType;
import org.apache.poi.xwpf.usermodel.VerticalAlign;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFPicture;
import org.apache.poi.xwpf.usermodel.XWPFPictureData;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.sling.api.resource.ResourceResolver;
import org.apache.sling.api.wrappers.ValueMapDecorator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.day.cq.dam.api.Asset;
import com.day.cq.dam.api.Rendition;
import com.day.cq.dam.api.RenditionPicker;
import com.day.cq.dam.indd.AbstractPageExtractionHandler;
import com.day.cq.dam.indd.PageBuilder;
import com.day.cq.dam.indd.PageComponent;
import com.day.cq.dam.indd.PageExtractionException;
import com.day.cq.dam.indd.PageExtractionHandler;
import com.day.cq.wcm.api.Page;
import com.day.cq.wcm.api.WCMException;

/**
 * A {@link PageExtractionHandler} for Word's DOCX and DOC format.
 * 

* An extraction handler to extract a page from a Word .docx and .doc files. * * @see PageExtractionHandler */ @Component @Service(value = PageExtractionHandler.class) @Properties({ @Property(name = PageExtractionHandler.SERVICE_PROPERTY_LABEL, value = "Word Extraction Handler", propertyPrivate = true), @Property(name = PageExtractionHandler.SERVICE_PROPERTY_DESCRIPTION, value = "Extraction Handler for a MS Word files.", propertyPrivate = true) }) public class WordExtractionHandler extends AbstractPageExtractionHandler { private static final Logger log = LoggerFactory.getLogger(WordExtractionHandler.class); private static final String TEXT_COMPONENT = "foundation/components/text"; private static final String IMAGE_COMPONENT = "foundation/components/image"; private static final String TEXT_IMAGE_COMPONENT = "foundation/components/textimage"; private static final int JUSTIFICATION_RIGHT = 2; private static final int JUSTIFICATION_CENTER = 1; private static final short TEXT_TYPE = 0; private static final short IMAGE_TYPE = 1; private static final short TEXT_IMAGE_TYPE = 2; /** * Get an MS Word targeted rendition picker. *

* {@inheritDoc} */ public RenditionPicker getRenditionPicker() { return new RenditionPicker() { public Rendition getRendition(Asset asset) { if (asset == null) { return null; } final String assetMime = asset.getMimeType(); if (assetMime.matches("application.*msword") || assetMime.equals( "application/vnd.openxmlformats-officedocument.wordprocessingml.document")) { return asset.getRendition("original"); } else { return null; } } }; } /** * {@inheritDoc} */ public Page extractPage(Rendition rend, String pageRoot, String pageName, String pageTitle, String pageTemplate, String pageDesign) throws PageExtractionException { final String rendMime = rend.getAsset().getMimeType(); if (rendMime.equals( "application/vnd.openxmlformats-officedocument.wordprocessingml.document")) { return extractDocx(rend, pageRoot, pageName, pageTitle, pageTemplate, pageDesign); } else if (rendMime.matches("application.*msword")) { return extractDoc(rend, pageRoot, pageName, pageTitle, pageTemplate, pageDesign); } else { final String msg = "No appropriate extractor found for: " + rend.getAsset().getName(); log.info(msg); throw new PageExtractionException(msg); } } /** * Extract the text, image, and textimage components from a Word .doc file and create a CQ Page that contains them plus a title. * @param rend * @param pageRoot * @param pageName * @param pageTitle * @param pageTemplate * @param pageDesign * @return * @throws PageExtractionException */ private Page extractDoc(Rendition rend, String pageRoot, String pageName, String pageTitle, String pageTemplate, String pageDesign) throws PageExtractionException { log.info("Beginning page extraction from: " + rend.getAsset().getPath()); int textIndex = 0; int textImageIndex = 0; int imageIndex = 0; ResourceResolver resourceResolver = rend.getResourceResolver(); PageBuilder pageBuilder = getPageBuilder(resourceResolver); PageComponent par = null; String subAssetPath = rend.getAsset().getPath() + "/subassets"; log.debug("Images in the page will point to the appropriate subassets located at: " + subAssetPath); InputStream is = null; try { is = rend.getAsset().getOriginal().getStream(); HWPFDocument doc = new HWPFDocument(is); if (pageBuilder != null) { par = createParComponent(pageBuilder); Paragraph[] paragraphs = getParagraphs(doc.getRange()); log.debug("Found " + paragraphs.length+ " paragraphs in the document."); for (int i = 0; i < paragraphs.length; i++) { short type = getParagraphType(paragraphs[i], doc); String nodeName = null; switch (type) { default: log.debug("Unknown paragraph type, will treat it as text only"); // falls through case TEXT_TYPE: log.debug("Paragraph " + i + " contains only text."); nodeName = "text_" + textIndex++; PageComponent comp = createTextNode(paragraphs[i], pageBuilder, nodeName); if (comp != null) { par.getChildComponents().add(comp); } break; case IMAGE_TYPE: log.debug("Paragraph " + i + " contains only images."); nodeName = "image_" + imageIndex++; List imageComps = createImageNode(paragraphs[i], pageBuilder, nodeName, subAssetPath, doc.getPicturesTable()); if (imageComps != null && !imageComps.isEmpty()) { par.getChildComponents().addAll(imageComps); } break; case TEXT_IMAGE_TYPE: log.debug("Paragraph " + i + " contains both text and images."); nodeName = "textImage_" + textImageIndex++; List textImageComps = createTextImageNode(paragraphs[i], pageBuilder, nodeName, subAssetPath, doc.getPicturesTable()); if (textImageComps != null && !textImageComps.isEmpty()) { par.getChildComponents().addAll(textImageComps); } } } } return buildPage(rend, pageRoot, pageName, pageTitle, pageTemplate, pageDesign, pageBuilder, par); } catch (OLE2NotOfficeXmlFileException oe) { log.error("Error while page extraction from : " + rend.getAsset().getPath(), oe); } catch (Throwable e) { throw new PageExtractionException(e.getMessage(), e); } finally { IOUtils.closeQuietly(is); } return null; } /** * Extract the text, image, and textimage components from a Word .docx file and create a CQ Page that contains them plus a title. * @param rend * @param pageRoot * @param pageName * @param pageTitle * @param pageTemplate * @param pageDesign * @return * @throws PageExtractionException */ private Page extractDocx(Rendition rend, String pageRoot, String pageName, String pageTitle, String pageTemplate, String pageDesign) throws PageExtractionException { log.info("Beginning page extraction from: " + rend.getAsset().getPath()); int textIndex = 0; int textImageIndex = 0; int imageIndex = 0; ResourceResolver resourceResolver = rend.getResourceResolver(); PageBuilder pageBuilder = getPageBuilder(resourceResolver); PageComponent par = null; String subAssetPath = rend.getAsset().getPath() + "/subassets"; log.debug("Images in the page will point to the appropriate subassets located at: " + subAssetPath); InputStream is = null; try { is = rend.getAsset().getOriginal().getStream(); XWPFDocument docx = new XWPFDocument(is); if (pageBuilder != null) { par = createParComponent(pageBuilder); Iterator paraIter = docx.getParagraphsIterator(); while (paraIter.hasNext()) { XWPFParagraph para = paraIter.next(); if (para.isEmpty()) { log.debug("Empty paragraph found, ignoring."); continue; } // Bug in apachae poi-ooxml library in which isPageBreak() get NPE. It is fixed via commit https://svn.apache.org/viewvc?view=revision&revision=1795254 // and will be included in version POI 3.17 beta 1 Remove this handling when we update poi library try { if (para.isPageBreak()) { log.debug("Empty paragraph found, ignoring."); continue; } } catch (NullPointerException npe) { // Do not consider it a page break } short paraType = getParagraphType(para); String nodeName = null; switch (paraType) { case TEXT_TYPE: log.debug("Current paragraph contains only text."); nodeName = "text_" + textIndex++; PageComponent comp = createTextNode(para, pageBuilder, nodeName); if (comp != null) { par.getChildComponents().add(comp); } break; case IMAGE_TYPE: log.debug("Current paragraph contains only images."); nodeName = "image_" + imageIndex++; List imageComps = createImageNode(para, pageBuilder, nodeName, subAssetPath); if (imageComps != null && !imageComps.isEmpty()) { par.getChildComponents().addAll(imageComps); } break; case TEXT_IMAGE_TYPE: log.debug("Current paragraph contains both text and images."); nodeName = "textImage_" + textImageIndex++; List textImageComps = createTextImageNode(para, pageBuilder, nodeName, subAssetPath); if (textImageComps != null && !textImageComps.isEmpty()) { par.getChildComponents().addAll(textImageComps); } } } } return buildPage(rend, pageRoot, pageName, pageTitle, pageTemplate, pageDesign, pageBuilder, par); } catch (OLE2NotOfficeXmlFileException oe) { log.error("Error while page extraction from : " + rend.getAsset().getPath(), oe); } catch (Throwable e) { throw new PageExtractionException(e.getMessage(), e); } finally { IOUtils.closeQuietly(is); } return null; } private Page buildPage(Rendition rend, String pageRoot, String pageName, String pageTitle, String pageTemplate, String pageDesign, PageBuilder pageBuilder, PageComponent par) throws WCMException { List pageComponents = new ArrayList(); pageComponents.add(par); Asset asset = rend.getAsset(); String title = asset.getMetadataValue("dc:title"); if (title != null && !title.trim().equalsIgnoreCase("")) { log.debug("Title found in the document metadata: " + title); pageTitle = title; } else { pageTitle = pageTitle + " (created: " + new Date() + ")"; log.debug("No title found in the document metadata. Using: " + pageTitle); } pageComponents.add(createTitleComponent(pageBuilder, pageTitle)); Page page = pageBuilder.recreatePage(pageRoot, pageName, pageTitle, pageTemplate, pageDesign, pageComponents); log.info("Page extraction from: " + rend.getAsset().getPath() + " has completed successfully."); return page; } @Override protected PageComponent createTitleComponent(PageBuilder pageBuilder, String pageTitle) { Map properties = new HashMap(); properties.put("jcr:title", pageTitle); properties.put("type", "extralarge"); return pageBuilder.createComponent("foundation/components/title", new ValueMapDecorator(properties)); } /** * Given the paragraph from a Word .doc file, 1 textimage component is created. Any additional images in the paragraph, beyond the first * are created as image components. * @param para * @param pageBuilder * @param name * @param subAssetPath * @param picTable * @return A list of the components to be added to the Page. */ private List createTextImageNode(Paragraph para, PageBuilder pageBuilder, String name, String subAssetPath, PicturesTable picTable) { /* * In theory there can be any number of text and image combinations in a paragraph. So just pair the text with the * first image found. Addition images will be images on their own. The author can always make manual changes in * the page later if this doesn't provide the desired results. */ int imageCount = 1; List imageList = new ArrayList(); for (int i = 0; i < para.numCharacterRuns(); i++) { CharacterRun run = para.getCharacterRun(i); if (picTable.hasPicture(run)) { Picture pic = picTable.extractPicture(run, false); String path = subAssetPath + "/" + pic.suggestFullFileName(); Map props = new HashMap(); props.put("fileReference", path); if (imageList.isEmpty()) { //the first image needs to be named image as that is what the textimage component expects its image's name to be. imageList.add(pageBuilder.createComponent(IMAGE_COMPONENT, new ValueMapDecorator(props), "image")); } else { imageList.add(pageBuilder.createComponent(IMAGE_COMPONENT, new ValueMapDecorator(props), "image_" + imageCount++)); } } } log.debug("Found " + imageList.size() + " images."); Map props = new HashMap(); props.put("textIsRich", true); props.put("text", renderParagraph(para)); List comps = new ArrayList(); if (!imageList.isEmpty()) { PageComponent textImageComp = pageBuilder.createComponent(TEXT_IMAGE_COMPONENT, new ValueMapDecorator(props), name); textImageComp.getChildComponents().add(imageList.get(0)); comps.add(textImageComp); for (int i = 1; i < imageList.size(); i++) { comps.add(imageList.get(i)); } } else { comps.add(pageBuilder.createComponent(TEXT_COMPONENT, new ValueMapDecorator(props), name)); } return comps; } /** * Given the paragraph from a Word .docx file, 1 textimage component is created. Any additional images in the paragraph, beyond the first * are created as image components. * @param para * @param pageBuilder * @param name * @param subAssetPath * @return A list of the components to be added to the Page. */ private List createTextImageNode(XWPFParagraph para, PageBuilder pageBuilder, String name, String subAssetPath) { /* * In theory there can be any number of text and image combinations in a paragraph. So just pair the text with the * first image found. Addition images will be images on their own. The author can always make manual changes in * the page later if this doesn't provide the desired results. */ int imageCount = 1; List imageList = new ArrayList(); Iterator runs = para.getRuns().iterator(); while (runs.hasNext()) { XWPFRun run = runs.next(); if (run.getEmbeddedPictures().size() > 0) { Map propsImage = new HashMap(); Iterator pics = run.getEmbeddedPictures().iterator(); while (pics.hasNext()) { XWPFPictureData pic = pics.next().getPictureData(); String path = subAssetPath + "/" + pic.getFileName(); propsImage.put("fileReference", path); if (imageList.size() == 0) { //the first image must be named image as that's what the textimage component expects imageList.add(pageBuilder.createComponent(IMAGE_COMPONENT, new ValueMapDecorator(propsImage), "image")); } else { imageList.add(pageBuilder.createComponent(IMAGE_COMPONENT, new ValueMapDecorator(propsImage), "image_" + imageCount++)); } } } } String text = renderParagraph(para); log.debug("Found " + imageList.size() + " images."); Map props = new HashMap(); props.put("textIsRich", true); if (text != null && !text.trim().equalsIgnoreCase("") && !text.trim().equalsIgnoreCase("null")) { props.put("text", text); } List components = new ArrayList(); PageComponent textComp = pageBuilder.createComponent(TEXT_IMAGE_COMPONENT, new ValueMapDecorator(props), name); if (imageList.size() > 0) { PageComponent img = imageList.get(0); textComp.getChildComponents().add(img); components.add(textComp); for (int i = 1; i < imageList.size(); i++) { components.add(imageList.get(i)); } } else { components.add(textComp); } return components; } /** * Given the paragraph from a Word .doc, image components are created for each image found in the paragraph. * @param para * @param pageBuilder * @param name * @param subAssetPath * @param picTable * @return A list of components to be added to the page. */ private List createImageNode(Paragraph para, PageBuilder pageBuilder, String name, String subAssetPath, PicturesTable picTable) { int count = 0; List comps = new ArrayList(); for (int i = 0; i < para.numCharacterRuns(); i++) { CharacterRun chars = para.getCharacterRun(i); if (picTable.hasPicture(chars)) { Picture pic = picTable.extractPicture(chars, false); String path = subAssetPath + "/" + pic.suggestFullFileName(); Map props = new HashMap(); props.put("fileReference", path); comps.add(pageBuilder.createComponent(IMAGE_COMPONENT, new ValueMapDecorator(props), name + "_" + count++)); } } log.debug("Created " + comps.size() + "image components."); return comps; } /** * Given the paragraph from a Word .docx, image components are created for each image found in the paragraph. * @param para * @param pageBuilder * @param name * @param subAssetPath * @return A list of components to be added to the page. */ private List createImageNode(XWPFParagraph para, PageBuilder pageBuilder, String name, String subAssetPath) { int count = 0; List comps = new ArrayList(); Iterator runs = para.getRuns().iterator(); while (runs.hasNext()) { XWPFRun run = runs.next(); if (run.getEmbeddedPictures().size() > 0) { Iterator pics = run.getEmbeddedPictures().iterator(); while (pics.hasNext()) { PageComponent comp = null; XWPFPictureData pic = pics.next().getPictureData(); String path = subAssetPath + "/" + pic.getFileName(); Map properties = new HashMap(); properties.put("fileReference", path); comp = pageBuilder.createComponent(IMAGE_COMPONENT, new ValueMapDecorator(properties), name + "_" + count++); comps.add(comp); } } } log.debug("Created " + comps.size() + "image components."); return comps; } /** * Given the paragraph from a Word .doc, create a text component to emcompass the paragraph text. * @param para * @param pageBuilder * @param name * @return The text component to be added to the page. */ private PageComponent createTextNode(Paragraph para, PageBuilder pageBuilder, String name) { Map props = new HashMap(); props.put("textIsRich", true); props.put("text", renderParagraph(para)); return pageBuilder.createComponent(TEXT_COMPONENT, new ValueMapDecorator(props), name); } /** * Given the paragraph from a Word .docx, create a text component to emcompass the paragraph text. * @param para * @param pageBuilder * @param name * @return The text component to be added to the page. */ private PageComponent createTextNode(XWPFParagraph para, PageBuilder pageBuilder, String name) { PageComponent comp = null; //check the type, we only handle paragraph types, not tables if (para.getElementType() == BodyElementType.PARAGRAPH) { String text = renderParagraph(para); if (text != null && !text.trim().equalsIgnoreCase("") && !text.trim().equalsIgnoreCase("null")) { log.debug("Paragraph text is: " + text); Map properties = new HashMap(); properties.put("textIsRich", true); properties.put("text", text); comp = pageBuilder.createComponent(TEXT_COMPONENT, new ValueMapDecorator(properties), name); } } else { log.warn("Non-PARAGRAPH type paragraph was found. The type is: " + para.getElementType() + ". Skipping."); } return comp; } private void addOpenStyleTags(XWPFRun run, StringBuilder builder) { if (run.isBold()) { builder.append(""); } if (run.isItalic()) { builder.append(""); } if (run.isStrike()) { builder.append(""); } if (run.getSubscript() == VerticalAlign.SUBSCRIPT) { builder.append(""); } if (run.getSubscript() == VerticalAlign.SUPERSCRIPT) { builder.append(""); } } private void addCloseStyleTags(XWPFRun run, StringBuilder builder) { if (run.isBold()) { builder.append(""); } if (run.isItalic()) { builder.append(""); } if (run.isStrike()) { builder.append(""); } if (run.getSubscript() == VerticalAlign.SUBSCRIPT) { builder.append(""); } if (run.getSubscript() == VerticalAlign.SUPERSCRIPT) { builder.append(""); } } private int checkStyle(XWPFParagraph para) { String style = para.getStyle(); if (style != null) { //looking for heading styles that translate to HTML, ignore others if (style.toLowerCase().startsWith("heading")) { int headingNum = -1; try { headingNum = Integer.parseInt(style.substring(style.length() - 1)); } catch (NumberFormatException e) { } return headingNum; } else { return -1; } } else { return -1; } } private Paragraph[] getParagraphs(Range range) { Paragraph[] paragraphs = new Paragraph[range.numParagraphs()]; for (int i = 0; i < paragraphs.length; i++) { paragraphs[i] = range.getParagraph(i); } return paragraphs; } private short getParagraphType(XWPFParagraph para) { boolean hasPictures = false; boolean hasText = false; Iterator runs = para.getRuns().iterator(); while (runs.hasNext()) { XWPFRun run = runs.next(); if (run.getEmbeddedPictures().size() > 0 ) { hasPictures = true; break; } } String paraText = para.getParagraphText(); if (paraText != null && !paraText.trim().equals("") && !paraText.trim().startsWith("null")) { hasText = true; } if (hasPictures && hasText) { return TEXT_IMAGE_TYPE; } else if (hasPictures) { return IMAGE_TYPE; } else { return TEXT_TYPE; } } private short getParagraphType(Paragraph paragraph, HWPFDocument doc) { // assume paragraph with image boolean hasPic = false; boolean hasText = false; for (int i = 0; i < paragraph.numCharacterRuns(); i++) { CharacterRun characters = paragraph.getCharacterRun(i); if (doc.getPicturesTable().hasPicture(characters)) { hasPic = true; } else { hasText |= paragraph.getCharacterRun(i).text() != null && paragraph.getCharacterRun(i).text().trim().length() > 0; } } if (hasPic && !hasText) { return WordExtractionHandler.IMAGE_TYPE; } else if (hasPic && hasText) { return WordExtractionHandler.TEXT_IMAGE_TYPE; } else { return WordExtractionHandler.TEXT_TYPE; } } /** * Takes the given paragraph from a Word .doc file and formats the text with proper HTML tags. * @param paragraph * @return The HTML formatted text. */ private String renderParagraph(Paragraph paragraph) { StringBuilder builder = new StringBuilder(); if (paragraph.getJustification() == JUSTIFICATION_RIGHT) { builder.append("

"); } else if (paragraph.getJustification() == JUSTIFICATION_CENTER) { builder.append("

"); } else { builder.append("

"); } for (int i = 0; i < paragraph.numCharacterRuns(); i++) { CharacterRun characters = paragraph.getCharacterRun(i); if (characters.isBold()) { builder.append(""); } if (characters.isItalic()) { builder.append(""); } for (char c : characters.text().toCharArray()) { if (c == '<') { builder.append("<"); } else if (c == '>') { builder.append(">"); } else if (c == '&') { builder.append("&"); } else { builder.append(c); } } if (characters.isItalic()) { builder.append(""); } if (characters.isBold()) { builder.append(""); } } builder.append("

"); return builder.toString(); } private String renderParagraph(XWPFParagraph para) { StringBuilder builder = new StringBuilder(); if (log.isDebugEnabled()) { if (para.getText() == null) { log.debug("Para text is a null object"); } else { log.debug("Para text is " + para.getText()); } } if (para.getText() != null && !para.getText().trim().equalsIgnoreCase("") && !para.getText().trim().equalsIgnoreCase("null")) { builder.append("

"); //check if a head style applies int style = checkStyle(para); if (style == -1 || style > 6) { //no heading style, so loop through all runs and build out the formatted string //also treating any heading > 6 as normal text since HTML only supports up to H6 Iterator runIter = para.getRuns().iterator(); while (runIter.hasNext()) { XWPFRun run = runIter.next(); if (run.getText(0) != null && !run.getText(0).trim().equalsIgnoreCase("null")) { addOpenStyleTags(run, builder); builder.append(run.getText(0)); addCloseStyleTags(run, builder); } } } else { //style heading exists, so just put all the text under the heading element builder.append(""); builder.append(para.getText()); builder.append(""); } builder.append("

"); } return builder.toString(); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy