// com.day.cq.dam.word.process.ExtractPlainProcess Maven / Gradle / Ivy
/*************************************************************************
*
* ADOBE CONFIDENTIAL
* ___________________
*
* Copyright 2012 Adobe Systems Incorporated
* All Rights Reserved.
*
* NOTICE: All information contained herein is, and remains
* the property of Adobe Systems Incorporated and its suppliers,
* if any. The intellectual and technical concepts contained
* herein are proprietary to Adobe Systems Incorporated and its
* suppliers and are protected by trade secret or copyright law.
* Dissemination of this information or reproduction of this material
* is strictly forbidden unless prior written permission is obtained
* from Adobe Systems Incorporated.
**************************************************************************/
package com.day.cq.dam.word.process;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Iterator;
import javax.jcr.Session;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.Service;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.util.IOUtils;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.sling.api.resource.LoginException;
import org.apache.sling.api.resource.Resource;
import org.apache.sling.api.resource.ResourceResolver;
import org.apache.sling.api.resource.ResourceResolverFactory;
import org.apache.sling.api.resource.ModifiableValueMap;
import org.osgi.framework.Constants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.adobe.granite.workflow.WorkflowException;
import com.adobe.granite.workflow.WorkflowSession;
import com.adobe.granite.workflow.exec.WorkItem;
import com.adobe.granite.workflow.exec.WorkflowProcess;
import com.adobe.granite.workflow.metadata.MetaDataMap;
import com.day.cq.dam.api.Asset;
import com.day.cq.dam.commons.util.DamUtil;
import com.day.cq.dam.api.Rendition;
import com.day.cq.commons.jcr.JcrConstants;
/**
* A {@link com.adobe.granite.workflow.exec.WorkflowProcess} for extracting plain text from a Word document.
*
* A workflow process that extracts all text from a Word document (.docx and .doc format) and adds it as a rendition.
*
* @see com.adobe.granite.workflow.exec.WorkflowProcess
*/
@Component
@Service(value = WorkflowProcess.class)
@Properties({
@Property(name = Constants.SERVICE_DESCRIPTION, value = "Extracts plain text from a Word document and adds it as a rendition."),
@Property(name = Constants.SERVICE_VENDOR, value = "Adobe"),
@Property(name="process.label", value = "Extract Plain Text From Word")})
public class ExtractPlainProcess implements WorkflowProcess {
private static final Logger log = LoggerFactory.getLogger(ExtractPlainProcess.class);
private static final String JCR_PATH = "JCR_PATH";
@Reference
ResourceResolverFactory resourceResolverFactory;
/**
* Given a Word document, parse out any text and add it as a "plain" rendition.
*/
public void execute(WorkItem item, WorkflowSession session, MetaDataMap args) throws WorkflowException {
try {
Session jcrSession = session.adaptTo(Session.class);
HashMap params = new HashMap();
params.put("user.jcr.session", jcrSession);
ResourceResolver resourceResolver = resourceResolverFactory.getResourceResolver(params);
Asset asset = getPayloadAsset(item, resourceResolver);
String doc;
final String assetMime = asset.getMimeType();
if(assetMime.matches("application.*msword")){
doc = extractFromDoc(asset);
} else if (assetMime.equals(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
doc = extractFromDocx(asset);
} else {
final String msg = "No appropriate extractor found for: " + asset.getName();
throw new IOException(msg);
}
if (doc == null || doc.isEmpty()) {
return;
}
Rendition rendition = asset.addRendition("plain",new ByteArrayInputStream(doc.getBytes("UTF-8")),"text/plain");
Resource resource = rendition.getChild("jcr:content");
ModifiableValueMap contentProps = resource.adaptTo(ModifiableValueMap.class);
contentProps.put(JcrConstants.JCR_ENCODING, "UTF-8");
ResourceResolver resResolver = resource.getResourceResolver();
resResolver.commit();
} catch (IOException ex) {
log.error("Could not generate plain text rendition: "+ex);
} catch (LoginException ex) {
log.error("Could not generate plain text rendition: "+ex);
}
}
/**
* Extract the plain text String from a .doc file
* @param asset to extract the data from
* @return plain text body of the doc
* @throws IOException
*/
private String extractFromDoc(Asset asset) throws IOException{
String result = "";
InputStream is = null;
try {
is = asset.getOriginal().getStream();
HWPFDocument doc = new HWPFDocument(is);
Paragraph[] paragraphs = getParagraphs(doc.getRange());
for (Paragraph paragraph : paragraphs) {
result += paragraph.text() + "\n\n";
}
}
catch (Exception e) {
log.error("Error while extracting plain text from doc : " + asset.getPath());
} finally {
IOUtils.closeQuietly(is);
}
return result;
}
/**
* Extract the plain text String from a .docx file
* @param asset to extract the data from
* @return plain text body of the docx
* @throws IOException
*/
private String extractFromDocx(Asset asset) throws IOException{
String result = "";
InputStream is = null;
try {
is = asset.getOriginal().getStream();
XWPFDocument doc = new XWPFDocument(is);
Iterator paraIter = doc.getParagraphsIterator();
while (paraIter.hasNext()) {
result += paraIter.next().getText() + "\n\n";
}
} catch (Exception e) {
log.error("Error while extracting plain text from docx : " + asset.getPath());
} finally {
IOUtils.closeQuietly(is);
}
return result;
}
/**
* Extract the Asset from the WorkItem
* @param item the WorkItem of the process
* @param resourceResolver the resource resolver to resolve with
* @return the Asset
*/
private Asset getPayloadAsset(final WorkItem item, ResourceResolver resourceResolver) {
Asset asset = null;
if (item.getWorkflowData().getPayloadType().equals(JCR_PATH)) {
final String path = item.getWorkflowData().getPayload().toString();
if (resourceResolver != null) {
final Resource resource = resourceResolver.getResource(path);
if (null != resource) {
asset = DamUtil.resolveToAsset(resource);
}
}
}
return asset;
}
/**
* Extract the paragraphs from a .doc file
* @param range over the whole .doc
* @return array of .doc Paragraphs
*/
private Paragraph[] getParagraphs(Range range) {
Paragraph[] paragraphs = new Paragraph[range.numParagraphs()];
for (int i = 0; i < paragraphs.length; i++) {
paragraphs[i] = range.getParagraph(i);
}
return paragraphs;
}
}