// com.day.cq.dam.word.process.ExtractPlainProcess (Maven / Gradle / Ivy)

/*************************************************************************
 *
 * ADOBE CONFIDENTIAL
 * ___________________
 *
 *  Copyright 2012 Adobe Systems Incorporated
 *  All Rights Reserved.
 *
 * NOTICE:  All information contained herein is, and remains
 * the property of Adobe Systems Incorporated and its suppliers,
 * if any.  The intellectual and technical concepts contained
 * herein are proprietary to Adobe Systems Incorporated and its
 * suppliers and are protected by trade secret or copyright law.
 * Dissemination of this information or reproduction of this material
 * is strictly forbidden unless prior written permission is obtained
 * from Adobe Systems Incorporated.
 **************************************************************************/
package com.day.cq.dam.word.process;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Iterator;

import javax.jcr.Session;

import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.Service;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.util.IOUtils;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.sling.api.resource.LoginException;
import org.apache.sling.api.resource.Resource;
import org.apache.sling.api.resource.ResourceResolver;
import org.apache.sling.api.resource.ResourceResolverFactory;
import org.apache.sling.api.resource.ModifiableValueMap;
import org.osgi.framework.Constants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.adobe.granite.workflow.WorkflowException;
import com.adobe.granite.workflow.WorkflowSession;
import com.adobe.granite.workflow.exec.WorkItem;
import com.adobe.granite.workflow.exec.WorkflowProcess;
import com.adobe.granite.workflow.metadata.MetaDataMap;
import com.day.cq.dam.api.Asset;
import com.day.cq.dam.commons.util.DamUtil;
import com.day.cq.dam.api.Rendition;
import com.day.cq.commons.jcr.JcrConstants;



/**
 * A {@link com.adobe.granite.workflow.exec.WorkflowProcess} for extracting plain text from a Word document.
 *
 * A workflow process that extracts all text from a Word document (.docx and .doc format) and adds it as a
 * rendition named "plain" with mime type "text/plain". Failures are logged and do not abort the workflow
 * step.
 *
 * @see com.adobe.granite.workflow.exec.WorkflowProcess
 */
@Component
@Service(value = WorkflowProcess.class)
@Properties({
        @Property(name = Constants.SERVICE_DESCRIPTION, value = "Extracts plain text from a Word document and adds it as a rendition."),
        @Property(name = Constants.SERVICE_VENDOR, value = "Adobe"),
        @Property(name="process.label", value = "Extract Plain Text From Word")})
public class ExtractPlainProcess implements WorkflowProcess {

    private static final Logger log = LoggerFactory.getLogger(ExtractPlainProcess.class);

    /** Payload type of workflow items whose payload is a repository path. */
    private static final String JCR_PATH = "JCR_PATH";

    /** Mime type of the .docx format. */
    private static final String MIME_DOCX =
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    /** Separator appended after every extracted paragraph. */
    private static final String PARAGRAPH_SEPARATOR = "\n\n";

    @Reference
    ResourceResolverFactory resourceResolverFactory;

    /**
     * Given a Word document, parse out any text and add it as a "plain" rendition.
     *
     * Nothing is added when the payload does not resolve to an asset, when the asset has no
     * supported mime type, or when no text could be extracted. I/O and login failures are
     * logged rather than propagated.
     *
     * @param item the work item whose payload identifies the asset
     * @param session the workflow session; adapted to a JCR session to obtain a resource resolver
     * @param args process arguments (unused)
     * @throws WorkflowException declared by the interface; not thrown by this implementation
     */
    public void execute(WorkItem item, WorkflowSession session, MetaDataMap args) throws WorkflowException {
        ResourceResolver resourceResolver = null;
        try {
            Session jcrSession = session.adaptTo(Session.class);
            HashMap<String, Object> params = new HashMap<String, Object>();
            params.put("user.jcr.session", jcrSession);
            resourceResolver = resourceResolverFactory.getResourceResolver(params);

            Asset asset = getPayloadAsset(item, resourceResolver);
            if (asset == null) {
                // getPayloadAsset returns null for non-path payloads and unresolvable paths.
                log.error("Could not generate plain text rendition: payload does not resolve to an asset");
                return;
            }

            final String assetMime = asset.getMimeType();
            if (assetMime == null) {
                log.error("Could not generate plain text rendition: no mime type for " + asset.getPath());
                return;
            }

            String doc;
            if (assetMime.matches("application.*msword")) {
                doc = extractFromDoc(asset);
            } else if (assetMime.equals(MIME_DOCX)) {
                doc = extractFromDocx(asset);
            } else {
                final String msg = "No appropriate extractor found for: " + asset.getName();
                throw new IOException(msg);
            }
            if (doc == null || doc.isEmpty()) {
                return;
            }

            Rendition rendition = asset.addRendition("plain", new ByteArrayInputStream(doc.getBytes("UTF-8")), "text/plain");
            Resource content = rendition.getChild("jcr:content");
            ModifiableValueMap contentProps = (content == null) ? null : content.adaptTo(ModifiableValueMap.class);
            if (contentProps != null) {
                // Record the encoding used for the bytes written above.
                contentProps.put(JcrConstants.JCR_ENCODING, "UTF-8");
            } else {
                log.warn("Could not set encoding on plain text rendition of " + asset.getPath());
            }
            rendition.getResourceResolver().commit();
        } catch (IOException ex) {
            log.error("Could not generate plain text rendition", ex);
        } catch (LoginException ex) {
            log.error("Could not generate plain text rendition", ex);
        } finally {
            // The resolver was created here, so it must be released here.
            if (resourceResolver != null) {
                resourceResolver.close();
            }
        }
    }

    /**
     * Extract the plain text String from a .doc file
     *
     * Extraction errors are logged; whatever text was collected before the error is returned.
     *
     * @param asset to extract the data from
     * @return plain text body of the doc, each paragraph followed by a blank line; never null
     * @throws IOException declared for callers; extraction errors are caught and logged here
     */
    private String extractFromDoc(Asset asset) throws IOException {
        StringBuilder result = new StringBuilder();
        InputStream is = null;
        try {
            is = asset.getOriginal().getStream();
            HWPFDocument doc = new HWPFDocument(is);

            for (Paragraph paragraph : getParagraphs(doc.getRange())) {
                result.append(paragraph.text()).append(PARAGRAPH_SEPARATOR);
            }
        } catch (Exception e) {
            log.error("Error while extracting plain text from doc : " + asset.getPath(), e);
        } finally {
            IOUtils.closeQuietly(is);
        }
        return result.toString();
    }

    /**
     * Extract the plain text String from a .docx file
     *
     * Extraction errors are logged; whatever text was collected before the error is returned.
     *
     * @param asset to extract the data from
     * @return plain text body of the docx, each paragraph followed by a blank line; never null
     * @throws IOException declared for callers; extraction errors are caught and logged here
     */
    private String extractFromDocx(Asset asset) throws IOException {
        StringBuilder result = new StringBuilder();
        InputStream is = null;
        try {
            is = asset.getOriginal().getStream();
            XWPFDocument doc = new XWPFDocument(is);

            // Typed iterator: a raw Iterator yields Object, which has no getText().
            Iterator<XWPFParagraph> paraIter = doc.getParagraphsIterator();
            while (paraIter.hasNext()) {
                result.append(paraIter.next().getText()).append(PARAGRAPH_SEPARATOR);
            }
        } catch (Exception e) {
            log.error("Error while extracting plain text from docx : " + asset.getPath(), e);
        } finally {
            IOUtils.closeQuietly(is);
        }
        return result.toString();
    }

    /**
     * Extract the Asset from the WorkItem
     * @param item the WorkItem of the process
     * @param resourceResolver the resource resolver to resolve with
     * @return the Asset, or null when the payload is not a JCR path, the resolver is null,
     *         or the path does not resolve to a resource
     */
    private Asset getPayloadAsset(final WorkItem item, ResourceResolver resourceResolver) {
        if (resourceResolver == null) {
            return null;
        }
        // Constant-first equals also covers a null payload type.
        if (!JCR_PATH.equals(item.getWorkflowData().getPayloadType())) {
            return null;
        }
        final String path = item.getWorkflowData().getPayload().toString();
        final Resource resource = resourceResolver.getResource(path);
        if (resource == null) {
            return null;
        }
        return DamUtil.resolveToAsset(resource);
    }

    /**
     * Extract the paragraphs from a .doc file
     * @param range over the whole .doc
     * @return array of .doc Paragraphs, in range order
     */
    private Paragraph[] getParagraphs(Range range) {
        Paragraph[] paragraphs = new Paragraph[range.numParagraphs()];
        for (int i = 0; i < paragraphs.length; i++) {
            paragraphs[i] = range.getParagraph(i);
        }
        return paragraphs;
    }
}