
// com.day.cq.dam.word.process.ExtractImagesProcess Maven / Gradle / Ivy
/*************************************************************************
*
* ADOBE CONFIDENTIAL
* ___________________
*
* Copyright 2012 Adobe Systems Incorporated
* All Rights Reserved.
*
* NOTICE: All information contained herein is, and remains
* the property of Adobe Systems Incorporated and its suppliers,
* if any. The intellectual and technical concepts contained
* herein are proprietary to Adobe Systems Incorporated and its
* suppliers and are protected by trade secret or copyright law.
* Dissemination of this information or reproduction of this material
* is strictly forbidden unless prior written permission is obtained
* from Adobe Systems Incorporated.
**************************************************************************/
package com.day.cq.dam.word.process;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import javax.jcr.RepositoryException;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Service;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.openxml4j.exceptions.OLE2NotOfficeXmlFileException;
import org.apache.poi.util.IOUtils;
import org.apache.poi.xwpf.usermodel.Document;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFPictureData;
import org.osgi.framework.Constants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.adobe.granite.workflow.exec.WorkflowProcess;
import com.day.cq.dam.api.Asset;
import com.day.cq.dam.commons.process.AbstractAssetWorkflowProcess;
import com.day.cq.workflow.WorkflowSession;
import com.day.cq.workflow.WorkflowException;
import com.day.cq.workflow.metadata.MetaDataMap;
import com.day.cq.workflow.exec.WorkItem;
/**
 * A {@link WorkflowProcess} for extracting images from a Word document.
 *
 * <p>Extracts all images embedded in a Word document (both the .docx and the .doc format) and
 * adds each of them to the payload asset as a sub-asset. Assets with any other MIME type are
 * left untouched.</p>
 *
 * @see WorkflowProcess
 */
@Component
@Service
@Properties({
    @Property(name = Constants.SERVICE_DESCRIPTION, value = "Extracts images from a Word document and adds them to the DAM as sub-assets."),
    @Property(name = Constants.SERVICE_VENDOR, value = "Adobe"),
    @Property(name = "process.label", value = "Extract Images From Word")})
public class ExtractImagesProcess extends AbstractAssetWorkflowProcess {

    private static final Logger log = LoggerFactory.getLogger(ExtractImagesProcess.class);

    /** MIME type identifying a .docx (Office Open XML) word processing document. */
    private static final String DOCX_MIME_TYPE =
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    /** Regular expression matched against the asset MIME type to detect the binary .doc format. */
    private static final String DOC_MIME_TYPE_PATTERN = "application.*msword";

    /** Fallback used when a .docx picture type has no entry in {@link #MIME_TYPES}. */
    private static final String DEFAULT_MIME_TYPE = "application/octet-stream";

    private static final String BMP_MIME_TYPE = "image/bmp";
    private static final String DIB_MIME_TYPE = "image/dib";
    private static final String EMF_MIME_TYPE = "image/x-emf";
    private static final String EPS_MIME_TYPE = "image/eps";
    private static final String GIF_MIME_TYPE = "image/gif";
    private static final String JPG_MIME_TYPE = "image/jpeg";
    private static final String PICT_MIME_TYPE = "image/pict";
    private static final String PNG_MIME_TYPE = "image/png";
    private static final String WMF_MIME_TYPE = "image/wmf";
    private static final String WPG_MIME_TYPE = "image/wpg";

    /**
     * Maps the {@code Document.PICTURE_TYPE_*} constants to MIME types. The map is filled once
     * during class initialization and only read afterwards, so no lazy, unsynchronized
     * population can be observed by concurrently running workflow steps.
     */
    private static final Map<Integer, String> MIME_TYPES = new HashMap<Integer, String>();

    static {
        MIME_TYPES.put(Integer.valueOf(Document.PICTURE_TYPE_BMP), BMP_MIME_TYPE);
        MIME_TYPES.put(Integer.valueOf(Document.PICTURE_TYPE_DIB), DIB_MIME_TYPE);
        MIME_TYPES.put(Integer.valueOf(Document.PICTURE_TYPE_EMF), EMF_MIME_TYPE);
        MIME_TYPES.put(Integer.valueOf(Document.PICTURE_TYPE_EPS), EPS_MIME_TYPE);
        MIME_TYPES.put(Integer.valueOf(Document.PICTURE_TYPE_GIF), GIF_MIME_TYPE);
        MIME_TYPES.put(Integer.valueOf(Document.PICTURE_TYPE_JPEG), JPG_MIME_TYPE);
        MIME_TYPES.put(Integer.valueOf(Document.PICTURE_TYPE_PICT), PICT_MIME_TYPE);
        MIME_TYPES.put(Integer.valueOf(Document.PICTURE_TYPE_PNG), PNG_MIME_TYPE);
        MIME_TYPES.put(Integer.valueOf(Document.PICTURE_TYPE_WMF), WMF_MIME_TYPE);
        MIME_TYPES.put(Integer.valueOf(Document.PICTURE_TYPE_WPG), WPG_MIME_TYPE);
    }

    /**
     * Extracts every picture from a binary (.doc) Word document and stores it as a sub-asset.
     *
     * @param asset   the Word document asset whose original rendition is read
     * @param session the workflow session whose JCR session is saved on success
     * @throws WorkflowException wrapping any failure other than
     *                           {@link OLE2NotOfficeXmlFileException}, which is only logged
     */
    private void extractFromDoc(Asset asset, WorkflowSession session) throws WorkflowException {
        log.info("Extracting images from: {}", asset.getPath());
        // Capture the current mode before anything can fail, so that the finally block
        // restores the real previous value instead of a hard-coded default.
        final boolean oldBatchMode = asset.isBatchMode();
        InputStream is = null;
        try {
            is = asset.getOriginal().getStream();
            HWPFDocument doc = new HWPFDocument(is);
            asset.setBatchMode(true);
            List<Picture> pics = doc.getPicturesTable().getAllPictures();
            log.debug("Found {} images to extract.", pics.size());
            for (Picture pic : pics) {
                addSubAsset(asset, pic.suggestFullFileName(), pic.getMimeType(), pic.getRawContent());
            }
            session.getSession().save();
            log.info("Done extracting images from: {}", asset.getPath());
        } catch (OLE2NotOfficeXmlFileException oe) {
            // Deliberately best-effort: a format mismatch is logged, the workflow continues.
            log.error("Error while extracting images from: " + asset.getPath(), oe);
        } catch (Throwable t) {
            throw new WorkflowException(t.getMessage(), t);
        } finally {
            restoreState(asset, session, oldBatchMode, is);
        }
    }

    /**
     * Extracts every picture from an Office Open XML (.docx) Word document and stores it as a
     * sub-asset.
     *
     * @param asset   the Word document asset whose original rendition is read
     * @param session the workflow session whose JCR session is saved on success
     * @throws WorkflowException wrapping any failure other than
     *                           {@link OLE2NotOfficeXmlFileException}, which is only logged
     */
    private void extractFromDocx(Asset asset, WorkflowSession session) throws WorkflowException {
        log.info("Extracting images from: {}", asset.getPath());
        // Capture the current mode before anything can fail, so that the finally block
        // restores the real previous value instead of a hard-coded default.
        final boolean oldBatchMode = asset.isBatchMode();
        InputStream is = null;
        try {
            is = asset.getOriginal().getStream();
            XWPFDocument doc = new XWPFDocument(is);
            asset.setBatchMode(true);
            List<XWPFPictureData> pics = doc.getAllPictures();
            log.debug("Found {} images to extract.", pics.size());
            for (XWPFPictureData pic : pics) {
                addSubAsset(asset, pic.getFileName(), getMimeType(pic.getPictureType()), pic.getData());
            }
            session.getSession().save();
            log.info("Done extracting images from: {}", asset.getPath());
        } catch (OLE2NotOfficeXmlFileException oe) {
            // Deliberately best-effort: a format mismatch is logged, the workflow continues.
            log.error("Error while extracting images from: " + asset.getPath(), oe);
        } catch (Throwable t) {
            throw new WorkflowException(t.getMessage(), t);
        } finally {
            restoreState(asset, session, oldBatchMode, is);
        }
    }

    /**
     * Adds one extracted picture to the asset as a sub-asset.
     *
     * @param asset    the asset receiving the sub-asset
     * @param filename the name of the sub-asset
     * @param mimeType the MIME type of the picture
     * @param data     the raw picture bytes
     */
    private static void addSubAsset(Asset asset, String filename, String mimeType, byte[] data) {
        InputStream stream = new BufferedInputStream(new ByteArrayInputStream(data));
        asset.addSubAsset(filename, mimeType, stream);
    }

    /**
     * Common cleanup for both extraction paths: refreshes the JCR session without keeping
     * changes, restores the asset's previous batch mode and closes the source stream.
     *
     * @param asset        the asset whose batch mode is restored
     * @param session      the workflow session whose JCR session is refreshed
     * @param oldBatchMode the batch mode the asset had before extraction started
     * @param is           the stream of the original rendition, may be {@code null}
     */
    private static void restoreState(Asset asset, WorkflowSession session, boolean oldBatchMode,
            InputStream is) {
        try {
            session.getSession().refresh(false);
        } catch (RepositoryException e) {
            // Cleanup must not mask the outcome of the extraction itself; record and move on.
            log.warn("Unable to refresh session after extracting images from: " + asset.getPath(), e);
        } finally {
            asset.setBatchMode(oldBatchMode);
            IOUtils.closeQuietly(is);
        }
    }

    /**
     * Translates a {@code Document.PICTURE_TYPE_*} constant into a MIME type.
     *
     * @param picType the picture type as reported by {@link XWPFPictureData#getPictureType()}
     * @return the matching MIME type, or {@code application/octet-stream} for unknown types
     */
    private String getMimeType(int picType) {
        String mimeType = MIME_TYPES.get(Integer.valueOf(picType));
        if (mimeType == null) {
            // this should not happen, but just in case use a generic mime type
            mimeType = DEFAULT_MIME_TYPE;
        }
        return mimeType;
    }

    /**
     * Resolves the payload asset and, depending on its MIME type, extracts the embedded images
     * from the .docx or .doc document. Payloads that do not resolve to an asset, assets without
     * a MIME type and assets of any other type are skipped.
     *
     * @param item    the work item carrying the payload
     * @param session the workflow session
     * @param args    the process arguments (not used)
     * @throws WorkflowException if the extraction fails
     */
    public void execute(WorkItem item, WorkflowSession session, MetaDataMap args)
            throws WorkflowException {
        final Asset asset = getAssetFromPayload(item, session.getSession());
        if (asset == null) {
            // NOTE(review): assumes getAssetFromPayload can return null for payloads that do
            // not resolve to an asset - confirm against AbstractAssetWorkflowProcess.
            log.warn("Payload does not resolve to an asset; skipping image extraction.");
            return;
        }
        final String assetMime = asset.getMimeType();
        if (assetMime == null) {
            log.debug("Asset {} has no MIME type; skipping image extraction.", asset.getPath());
            return;
        }
        if (DOCX_MIME_TYPE.equals(assetMime)) {
            extractFromDocx(asset, session);
        } else if (assetMime.matches(DOC_MIME_TYPE_PATTERN)) {
            extractFromDoc(asset, session);
        }
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy