All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.day.cq.dam.word.process.ExtractPlainProcess Maven / Gradle / Ivy

/*************************************************************************
 *
 * ADOBE CONFIDENTIAL
 * ___________________
 *
 *  Copyright 2012 Adobe Systems Incorporated
 *  All Rights Reserved.
 *
 * NOTICE:  All information contained herein is, and remains
 * the property of Adobe Systems Incorporated and its suppliers,
 * if any.  The intellectual and technical concepts contained
 * herein are proprietary to Adobe Systems Incorporated and its
 * suppliers and are protected by trade secret or copyright law.
 * Dissemination of this information or reproduction of this material
 * is strictly forbidden unless prior written permission is obtained
 * from Adobe Systems Incorporated.
 **************************************************************************/
package com.day.cq.dam.word.process;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Iterator;

import javax.jcr.Session;

import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.Service;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.util.IOUtils;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.sling.api.resource.LoginException;
import org.apache.sling.api.resource.Resource;
import org.apache.sling.api.resource.ResourceResolver;
import org.apache.sling.api.resource.ResourceResolverFactory;
import org.apache.sling.api.resource.ModifiableValueMap;
import org.osgi.framework.Constants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.adobe.granite.workflow.WorkflowException;
import com.adobe.granite.workflow.WorkflowSession;
import com.adobe.granite.workflow.exec.WorkItem;
import com.adobe.granite.workflow.exec.WorkflowProcess;
import com.adobe.granite.workflow.metadata.MetaDataMap;
import com.day.cq.dam.api.Asset;
import com.day.cq.dam.commons.util.DamUtil;
import com.day.cq.dam.api.Rendition;
import com.day.cq.commons.jcr.JcrConstants;



/**
 * A {@link com.adobe.granite.workflow.exec.WorkflowProcess} for extracting plain text from a Word document.
 * 

* A workflow process that extracts all text from a Word document (.docx and .doc format) and adds it as a rendition. * * @see com.adobe.granite.workflow.exec.WorkflowProcess */ @Component @Service(value = WorkflowProcess.class) @Properties({ @Property(name = Constants.SERVICE_DESCRIPTION, value = "Extracts plain text from a Word document and adds it as a rendition."), @Property(name = Constants.SERVICE_VENDOR, value = "Adobe"), @Property(name="process.label", value = "Extract Plain Text From Word")}) public class ExtractPlainProcess implements WorkflowProcess { private static final Logger log = LoggerFactory.getLogger(ExtractPlainProcess.class); private static final String JCR_PATH = "JCR_PATH"; @Reference ResourceResolverFactory resourceResolverFactory; /** * Given a Word document, parse out any text and add it as a "plain" rendition. */ public void execute(WorkItem item, WorkflowSession session, MetaDataMap args) throws WorkflowException { try { Session jcrSession = session.adaptTo(Session.class); HashMap params = new HashMap(); params.put("user.jcr.session", jcrSession); ResourceResolver resourceResolver = resourceResolverFactory.getResourceResolver(params); Asset asset = getPayloadAsset(item, resourceResolver); String doc; final String assetMime = asset.getMimeType(); if(assetMime.matches("application.*msword")){ doc = extractFromDoc(asset); } else if (assetMime.equals( "application/vnd.openxmlformats-officedocument.wordprocessingml.document")) { doc = extractFromDocx(asset); } else { final String msg = "No appropriate extractor found for: " + asset.getName(); throw new IOException(msg); } if (doc == null || doc.isEmpty()) { return; } Rendition rendition = asset.addRendition("plain",new ByteArrayInputStream(doc.getBytes("UTF-8")),"text/plain"); Resource resource = rendition.getChild("jcr:content"); ModifiableValueMap contentProps = resource.adaptTo(ModifiableValueMap.class); contentProps.put(JcrConstants.JCR_ENCODING, "UTF-8"); ResourceResolver resResolver = resource.getResourceResolver(); resResolver.commit(); } catch (IOException ex) { log.error("Could not generate plain text rendition: "+ex); } catch (LoginException ex) { log.error("Could not generate plain text rendition: "+ex); } } /** * Extract the plain text String from a .doc file * @param asset to extract the data from * @return plain text body of the doc * @throws IOException */ private String extractFromDoc(Asset asset) throws IOException{ String result = ""; InputStream is = null; try { is = asset.getOriginal().getStream(); HWPFDocument doc = new HWPFDocument(is); Paragraph[] paragraphs = getParagraphs(doc.getRange()); for (Paragraph paragraph : paragraphs) { result += paragraph.text() + "\n\n"; } } catch (Exception e) { log.error("Error while extracting plain text from doc : " + asset.getPath()); } finally { IOUtils.closeQuietly(is); } return result; } /** * Extract the plain text String from a .docx file * @param asset to extract the data from * @return plain text body of the docx * @throws IOException */ private String extractFromDocx(Asset asset) throws IOException{ String result = ""; InputStream is = null; try { is = asset.getOriginal().getStream(); XWPFDocument doc = new XWPFDocument(is); Iterator paraIter = doc.getParagraphsIterator(); while (paraIter.hasNext()) { result += paraIter.next().getText() + "\n\n"; } } catch (Exception e) { log.error("Error while extracting plain text from docx : " + asset.getPath()); } finally { IOUtils.closeQuietly(is); } return result; } /** * Extract the Asset from the WorkItem * @param item the WorkItem of the process * @param resourceResolver the resource resolver to resolve with * @return the Asset */ private Asset getPayloadAsset(final WorkItem item, ResourceResolver resourceResolver) { Asset asset = null; if (item.getWorkflowData().getPayloadType().equals(JCR_PATH)) { final String path = item.getWorkflowData().getPayload().toString(); if (resourceResolver != null) { final Resource resource = resourceResolver.getResource(path); if (null != resource) { asset = DamUtil.resolveToAsset(resource); } } } return asset; } /** * Extract the paragraphs from a .doc file * @param range over the whole .doc * @return array of .doc Paragraphs */ private Paragraph[] getParagraphs(Range range) { Paragraph[] paragraphs = new Paragraph[range.numParagraphs()]; for (int i = 0; i < paragraphs.length; i++) { paragraphs[i] = range.getParagraph(i); } return paragraphs; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy