org.imixs.archive.documents.OCRDocumentAdapter Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of imixs-archive-documents Show documentation
There is a newer version: 3.0.2
Show newest version
package org.imixs.archive.documents;

import java.util.List;
import java.util.logging.Logger;

import javax.inject.Inject;

import org.eclipse.microprofile.config.inject.ConfigProperty;
import org.imixs.archive.core.SnapshotService;
import org.imixs.workflow.ItemCollection;
import org.imixs.workflow.SignalAdapter;
import org.imixs.workflow.engine.WorkflowService;
import org.imixs.workflow.exceptions.AdapterException;
import org.imixs.workflow.exceptions.PluginException;

/**
 * The OCRDocumentAdapter is a SignalAdapter that extracts the text content for
 * a workitem by calling the TikaService.
 * 
 * The adapter expects the following environment setting:
 * 
 * TIKA_SERVICE_MODE: "MODEL"
 * 
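 * You can set additional options to be passed to the Tika Service. They are
 * read from the item "options" of the event's "tika" workflow result, for
 * example:
 * 
 * <pre>
 * {@code
 *      X-Tika-PDFocrStrategy=OCR_ONLY
 *      X-Tika-PDFOcrImageType=RGB
 *      X-Tika-PDFOcrDPI=400
 * }
 * </pre>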
 * 
 * 
 * @see OCRDocumentService
 * @version 1.0
 * @author rsoika
 */
public class OCRDocumentAdapter implements SignalAdapter {

    /**
     * Error code of the AdapterException thrown when the text extraction fails
     * with a RuntimeException.
     */
    public static final String OCR_ERROR = "OCR_ERROR";

    private static final Logger logger = Logger.getLogger(OCRDocumentAdapter.class.getName());

    /**
     * Configured service mode. The adapter only extracts text if this value
     * equals "model" (case-insensitive); the injected default is "auto".
     */
    @Inject
    @ConfigProperty(name = TikaService.ENV_OCR_SERVICE_MODE, defaultValue = "auto")
    String serviceMode;

    @Inject
    TikaService ocrService;

    @Inject
    WorkflowService workflowService;

    @Inject
    SnapshotService snapshotService;

    /**
     * Extracts the text for the given document by delegating to the
     * TikaService, passing along the snapshot found for the document.
     * <p>
     * The extraction is only performed if the service mode is "model". For any
     * other mode a warning is logged and the adapter does nothing.
     * <p>
     * Optional settings are read from the "tika" workflow result of the event:
     * <ul>
     * <li>"options" - list of options passed to the Tika service</li>
     * <li>"filepattern" - file pattern passed to the Tika service</li>
     * <li>"maxpdfpages" - page limit, only for pdf documents</li>
     * </ul>
     * If the workflow result cannot be evaluated (null), the options and the
     * file pattern are passed as null and the page limit as 0.
     *
     * @param document the workitem to extract the text for
     * @param event    the event providing the optional "tika" workflow result
     * @return the document instance that was passed in
     * @throws AdapterException if evaluating the workflow result or extracting
     *                          the text fails with a PluginException or a
     *                          RuntimeException
     */
    @SuppressWarnings("unchecked") // ItemCollection.getItemValue is assigned to a typed List
    @Override
    public ItemCollection execute(ItemCollection document, ItemCollection event) throws AdapterException {

        logger.info("......starting OCRDocumentAdapter mode=" + serviceMode);

        // guard: the adapter is only active in 'model' mode
        if (!"model".equalsIgnoreCase(serviceMode)) {
            logger.warning("unexpected TIKA_SERVICE_MODE=" + serviceMode
                    + " - running the OCRDocumentAdapter the env TIKA_SERVICE_MODE should be set to 'model'. Adapter will be ignored!");
            return document;
        }

        logger.finest("...running api adapter...");
        try {
            List<String> tikaOptions = null;
            String filePattern = null;
            int maxPdfPages = 0;

            // read optional tika options
            ItemCollection tikaConfig = workflowService.evalWorkflowResult(event, "tika", document, false);
            if (tikaConfig != null) {
                tikaOptions = tikaConfig.getItemValue("options");
                filePattern = tikaConfig.getItemValueString("filepattern");
                maxPdfPages = tikaConfig.getItemValueInteger("maxpdfpages"); // only for pdf documents
            }

            // extract text data....
            ocrService.extractText(document, snapshotService.findSnapshot(document), null, tikaOptions, filePattern,
                    maxPdfPages);
        } catch (PluginException e) {
            // keep the error context and error code of the plugin exception
            String message = "Tika OCRService - unable to extract text: " + e.getMessage();
            throw new AdapterException(e.getErrorContext(), e.getErrorCode(), message, e);
        } catch (RuntimeException e) {
            // we catch a runtimeException to avoid dead locks in the eventLog processing
            // issue #153
            String message = "Tika OCRService - unable to extract text: " + e.getMessage();
            throw new AdapterException(OCRDocumentAdapter.class.getSimpleName(), OCR_ERROR, message, e);
        }

        return document;
    }

}