All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.imixs.archive.documents.PDFXMLExtractorPlugin Maven / Gradle / Ivy

There is a newer version: 3.0.2
Show newest version
package org.imixs.archive.documents;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import javax.ejb.EJB;
import javax.xml.bind.JAXBException;
import javax.xml.transform.TransformerException;

import org.apache.pdfbox.cos.COSInputStream;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
import org.imixs.workflow.FileData;
import org.imixs.workflow.ItemCollection;
import org.imixs.workflow.engine.ReportService;
import org.imixs.workflow.engine.plugins.AbstractPlugin;
import org.imixs.workflow.exceptions.PluginException;
import org.imixs.workflow.util.XMLParser;
import org.imixs.workflow.xml.XMLDocumentAdapter;
import org.imixs.workflow.xml.XSLHandler;

/**
 * The PDFXMLExtractorPlugin extracts embedded XML files from a PDF
 * document and transforms the content into a Imixs XMLDocument. This data can
 * be added into the current workitem for further processing.
 * 

* The plugin is based on the Apache PDFBox project. The maven dependency need * to be added to a project *

* *

 * {@code
     
       org.apache.pdfbox
       pdfbox
       compile
     
   }
 * 
* * To activate the plugin, the BPMN event must contain the following item * definition * *
 * {@code
     
    *.xml
    myReport
   
   }
 * 
* * * * * @version 1.0 * @author rsoika */ public class PDFXMLExtractorPlugin extends AbstractPlugin { public static final String PDFXMLEXTRACTOR = "PDFXMLExtractor"; public static final String PARSING_EXCEPTION = "PARSING_EXCEPTION"; public static final String PLUGIN_ERROR = "PLUGIN_ERROR"; public static final String REPORT_ERROR = "REPORT_ERROR"; public static final String FILE_PATTERN_PDF = ".[pP][dD][fF]"; public static final String FILE_PATTERN_XML = ".[xX][mM][lL]"; @EJB ReportService reportService; private static Logger logger = Logger.getLogger(PDFXMLExtractorPlugin.class.getName()); /** * This method parses the content of an attached pdf file and extracts an * embedded XML file. This xml file will than be transformed by a given report * definition into a Imixs XMLDocument. The content of the XMLDocument is than * merged into the current document. * * @throws PluginException */ @Override public ItemCollection run(ItemCollection document, ItemCollection event) throws PluginException { byte[] xmlData = null; ItemCollection evalItemCollection = getWorkflowService().evalWorkflowResult(event, "item", document, false); if (evalItemCollection == null) { return document; } String processValue = evalItemCollection.getItemValueString(PDFXMLEXTRACTOR); if (!processValue.isEmpty()) { ItemCollection processData = XMLParser.parseItemStructure(processValue); String reportName = processData.getItemValueString("report"); String file_pattern = processData.getItemValueString("filename"); xmlData = getXMLFile(document, file_pattern); if (xmlData != null) { logger.info("...do something with the xml file.." + reportName); // load the report ItemCollection report = reportService.findReport(reportName); if (report == null) { throw new PluginException(PDFXMLExtractorPlugin.class.getSimpleName(), REPORT_ERROR, "unable to load report '" + reportName + "'. 
Please check model configuration"); } String xsl = report.getItemValueString("XSL").trim(); String encoding = report.getItemValueString("encoding"); // no encoding defined so take a default encoding // (UTF-8) if ("".equals(encoding)) { encoding = "UTF-8"; } byte[] byteData = null; ItemCollection resultItemCol = null; ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); try { String xml = new String(xmlData); XSLHandler.transform(new String(xml), xsl, encoding, outputStream); byteData = outputStream.toByteArray(); // create XMLDocument resultItemCol = XMLDocumentAdapter.readItemCollection(byteData); } catch (TransformerException | JAXBException | IOException e) { e.printStackTrace(); } finally { try { outputStream.close(); } catch (IOException e) { e.printStackTrace(); } } // merge the data.... if (resultItemCol != null) { document.replaceAllItems(resultItemCol.getAllItems()); } } } return document; } /** * This method searches attached PDF files of a workitem and extracts an * embedded XML file. *

* The method only returns the first embedded xml file and does not support * multiple xml files embedded in one pdf file. * * @param document * @param filePattern * @return * @throws PluginException */ public static byte[] getXMLFile(ItemCollection document, String file_pattern) throws PluginException { // verify all attached PDF files List filenames = document.getFileNames(); for (String filename : filenames) { if (Pattern.compile(file_pattern).matcher(filename).find()) { logger.info("...extract embedded XML from '" + filename + "'"); // we only support the first embedded XML file here! FileData pdfFileData = document.getFileData(filename); return getFirstEmbeddedXML(pdfFileData.getContent()); } } // no data found return null; } /** * This method converts a inputStream into a byte array. * * @param ins * @return * @throws IOException */ public static byte[] streamToByteArray(InputStream ins) throws IOException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); byte[] byteBuffer = new byte[1024]; int len; while ((len = ins.read(byteBuffer)) > -1) { baos.write(byteBuffer, 0, len); } baos.flush(); return baos.toByteArray(); } /** * This method extract the first XML file form a pdf file content * * @param content * @return a byte array with the xml data or null if no xml file was found in * the pdf file. 
*/ private static byte[] getFirstEmbeddedXML(byte[] content) { PDDocument doc = null; byte[] result = null; try { doc = PDDocument.load(content); PDDocumentNameDictionary namesDictionary = new PDDocumentNameDictionary(doc.getDocumentCatalog()); PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles(); if (efTree != null) { Map names = efTree.getNames(); if (names != null) { result = extractFirstXMLFile(names); } else { List> kids = efTree.getKids(); for (PDNameTreeNode node : kids) { names = node.getNames(); result = extractFirstXMLFile(names); } } } } catch (IOException e) { logger.warning("unable to load embedded xml : " + e.getMessage()); } finally { if (doc != null) { try { doc.close(); } catch (IOException e) { e.printStackTrace(); } } } return result; } /** * Helper method to extract the first XML file from a list of files * * @param names * @return * @throws IOException */ private static byte[] extractFirstXMLFile(Map names) throws IOException { for (Map.Entry entry : names.entrySet()) { PDComplexFileSpecification fileSpec = entry.getValue(); String filename = fileSpec.getFile(); // test if file is a xml file if (Pattern.compile(FILE_PATTERN_XML).matcher(filename).find()) { // we found an xml file! PDEmbeddedFile embeddedFile = getEmbeddedFile(fileSpec); COSInputStream inStream = embeddedFile.createInputStream(); return streamToByteArray(inStream); } } // no XML file found return null; } /** * Helper method to extract platform specific embedded file format. 
* * @param fileSpec * @return */ private static PDEmbeddedFile getEmbeddedFile(PDComplexFileSpecification fileSpec) { PDEmbeddedFile embeddedFile = null; if (fileSpec != null) { embeddedFile = fileSpec.getEmbeddedFileUnicode(); if (embeddedFile == null) { embeddedFile = fileSpec.getEmbeddedFileDos(); } if (embeddedFile == null) { embeddedFile = fileSpec.getEmbeddedFileMac(); } if (embeddedFile == null) { embeddedFile = fileSpec.getEmbeddedFileUnix(); } if (embeddedFile == null) { embeddedFile = fileSpec.getEmbeddedFile(); } } return embeddedFile; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy