All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.imixs.archive.documents.OCRDocumentAdapter Maven / Gradle / Ivy

There is a newer version: 3.0.2
Show newest version
package org.imixs.archive.documents;

import java.util.List;
import java.util.logging.Logger;

import javax.inject.Inject;

import org.eclipse.microprofile.config.inject.ConfigProperty;
import org.imixs.archive.core.SnapshotService;
import org.imixs.workflow.ItemCollection;
import org.imixs.workflow.SignalAdapter;
import org.imixs.workflow.engine.WorkflowService;
import org.imixs.workflow.exceptions.AdapterException;
import org.imixs.workflow.exceptions.PluginException;

/**
 * The TikaDocumentAdapter reacts on ProcessingEvent to auto extract the text
 * content.
 * 

* The adapter expect the following environment setting * * TIKA_SERVICE_MODE: "MODEL" * * You can set additional options to be passed to the Tika Service * *

* *

 * {@code
        X-Tika-PDFocrStrategy=OCR_ONLY
        X-Tika-PDFOcrImageType=RGB
        X-Tika-PDFOcrDPI=400
   }
 * 
* * @see OCRDocumentService * @version 1.0 * @author rsoika */ public class OCRDocumentAdapter implements SignalAdapter { public static final String OCR_ERROR = "OCR_ERROR"; private static Logger logger = Logger.getLogger(OCRDocumentAdapter.class.getName()); @Inject @ConfigProperty(name = TikaService.ENV_OCR_SERVICE_MODE, defaultValue = "auto") String serviceMode; @Inject TikaService ocrService; @Inject WorkflowService workflowService; @Inject SnapshotService snapshotService; /** * This method posts a text from an attachment to the Imixs-ML Analyse service * endpoint */ @SuppressWarnings("unchecked") @Override public ItemCollection execute(ItemCollection document, ItemCollection event) throws AdapterException { logger.info("......starting TikaDocumentAdapter mode=" + serviceMode); if ("model".equalsIgnoreCase(serviceMode)) { logger.finest("...running api adapter..."); try { List tikaOptions = null; String filePattern = null; int maxPdfPages=0; // read opitonal tika options ItemCollection evalItemCollection = workflowService.evalWorkflowResult(event, "tika", document, false); if (evalItemCollection != null) { tikaOptions = evalItemCollection.getItemValue("options"); filePattern = evalItemCollection.getItemValueString("filepattern"); maxPdfPages = evalItemCollection.getItemValueInteger("maxpdfpages"); // only for pdf documents } // extract text data.... ocrService.extractText(document, snapshotService.findSnapshot(document), null, tikaOptions, filePattern,maxPdfPages); } catch (PluginException e) { String message = "Tika OCRService - unable to extract text: " + e.getMessage(); throw new AdapterException(e.getErrorContext(), e.getErrorCode(), message, e); } catch (RuntimeException e) { // we catch a runtimeException to avoid dead locks in the eventLog processing // issue #153 String message = "Tika OCRService - unable to extract text: " + e.getMessage(); throw new AdapterException(OCRDocumentAdapter.class.getSimpleName(), OCR_ERROR, message, e); } } else { logger.warning("unexpected TIKA_SERVICE_MODE=" + serviceMode + " - running the OCRDocumentAdapter the env TIKA_SERVICE_MODE should be set to 'model'. Adapter will be ignored!"); } return document; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy