org.imixs.archive.documents.TikaService Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of imixs-archive-documents Show documentation
There is a newer version: 3.0.2
Show newest version
package org.imixs.archive.documents;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import javax.ejb.Stateless;
import javax.inject.Inject;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.eclipse.microprofile.config.inject.ConfigProperty;
import org.imixs.workflow.FileData;
import org.imixs.workflow.ItemCollection;
import org.imixs.workflow.exceptions.PluginException;

/**
 * The TikaService extracts the textual information from document attachments of
 * a workitem and stores the data into the $file attribute 'text'.
 * 
 * For the text extraction the service sends the content of a document to an
 * instance of an Apache Tika server via the Rest API. The environment variable
 * OCR_STRATEGY defines how PDF files will be scanned. Possible values are:
 * 

 * <ul>
 * <li>AUTO - The best OCR strategy is chosen by the Tika Server itself. This is
 * the default setting.</li>
 * <li>NO_OCR - OCR processing is disabled and text is extracted only from PDF
 * files including a raw text. If a pdf file does not contain raw text data no
 * text will be extracted!</li>
 * <li>OCR_ONLY - PDF files will always be OCR scanned even if the pdf file
 * contains text data.</li>
 * <li>OCR_AND_TEXT_EXTRACTION - OCR processing and raw text extraction is
 * performed. Note: This may result in a duplication of text and the mode is not
 * recommended.</li>
 * </ul>
 * 
 * The service expects a valid Rest API end-point to an instance of a Tika
 * Server defined by the configuration property 'ocr.service.endpoint'.
 * 

 * The environment parameter 'TIKA_SERVICE_MODE' must be set to 'auto' to enable
 * the service.
 * 

 * See also the project: https://github.com/imixs/imixs-docker/tree/master/tika
 * 
 * @version 1.1
 * @author rsoika
 */
@Stateless
public class TikaService {

    /** Name of the $file attribute holding the extracted text. */
    public static final String FILE_ATTRIBUTE_TEXT = "text";
    /** Name of the default character encoding. */
    public static final String DEFAULT_ENCODING = "UTF-8";
    /** Error code used when this service throws a PluginException. */
    public static final String PLUGIN_ERROR = "PLUGIN_ERROR";

    /** Config key of the Rest API end-point of the OCR service. */
    public static final String ENV_OCR_SERVICE_ENDPOINT = "ocr.service.endpoint";
    /** Config key of the OCR service mode. */
    public static final String ENV_OCR_SERVICE_MODE = "ocr.service.mode";
    /** Config key of the maximum file size (in bytes) to be scanned. */
    public static final String ENV_OCR_SERVICE_MAXFILESIZE = "ocr.service.maxfilesize";

    /**
     * Config key of the OCR strategy. Supported values: NO_OCR, OCR_ONLY,
     * OCR_AND_TEXT_EXTRACTION, AUTO (default).
     */
    public static final String ENV_OCR_STRATEGY = "ocr.strategy";

    public static final String OCR_STRATEGY_NO_OCR = "NO_OCR";
    public static final String OCR_STRATEGY_OCR_AND_TEXT_EXTRACTION = "OCR_AND_TEXT_EXTRACTION";
    public static final String OCR_STRATEGY_OCR_ONLY = "OCR_ONLY";
    public static final String OCR_STRATEGY_AUTO = "AUTO"; // default

    private static final Logger logger = Logger.getLogger(TikaService.class.getName());

    // The end-point is optional: the type argument is required so that the
    // config runtime knows which value type to inject.
    @Inject
    @ConfigProperty(name = ENV_OCR_SERVICE_ENDPOINT)
    Optional<String> serviceEndpoint;

    // OCR strategy, falls back to AUTO if not configured
    // (the field name is kept as is because other code refers to it).
    @Inject
    @ConfigProperty(name = ENV_OCR_STRATEGY, defaultValue = OCR_STRATEGY_AUTO)
    String ocrStategy;

    // Maximum size of bytes to be scanned (default is 5MB = 5242880 bytes)
    @Inject
    @ConfigProperty(name = ENV_OCR_SERVICE_MAXFILESIZE, defaultValue = "5242880")
    int ocrMaxFileSize;

    /**
     * Extracts the textual information from the file attachments of a workitem
     * using the OCR strategy currently set for this service.
     * <p>
     * This is a convenience variant of the six-argument {@code extractText}
     * method: it hands over the value of the {@code ocrStategy} field, no
     * additional tika options, no file pattern and a {@code maxPdfPages} value
     * of 0. See that method for the details of the text extraction.
     *
     * @param workitem - workitem with file attachments
     * @param snapshot - snapshot workitem, handed over unchanged
     * @throws PluginException if the delegate method throws it
     */
    public void extractText(ItemCollection workitem, ItemCollection snapshot) throws PluginException {
        // use the strategy held by this service instance
        final String configuredStrategy = this.ocrStategy;
        final int maxPdfPages = 0;
        // no tika options and no file pattern
        extractText(workitem, snapshot, configuredStrategy, null, null, maxPdfPages);
    }

    /**
     * Extracts the textual information from document attachments.
     * 

     * The method extracts the textual content for each new file attachment of a
     * given workitem. The text information is stored in the $file attribute 'text'.
     * 

     * For PDF files with textual content the method calls the method
     * 'extractTextFromPDF' using the PDFBox api. In other cases, the method sends
     * the content via a Rest API to the tika server for OCR processing.
     * 

     * The method also extracts files already stored in a snapshot workitem. In this
     * case the method tests if the $file attribute 'text' already exists.
     * 

     * An optional param 'filePattern' can be provided to extract text only from
     * attachments matching the given file pattern (regex).
     * 

     * The optional param 'maxPages' can be provided to reduce the size of PDF
     * documents to a maximum number of pages. This avoids blocking the tika service
     * by processing too large documents. For example only the first 5 pages can be
     * scanned.
     * 
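     * @param workitem         - workitem with file attachments
     * @param snapshot         - snapshot workitem used to fetch the origin file
     *                         data
     * @param _ocrStategy      - optional OCR strategy (AUTO, NO_OCR, OCR_ONLY,
     *                         OCR_AND_TEXT_EXTRACTION); if not null it replaces
     *                         the configured strategy
     * @param options          - optional tika header params
     * @param filePatternRegex - optional regular expression to match files
     * @param maxPdfPages      - maximum number of PDF pages, passed on to the OCR
     *                         processing
     * @throws PluginException e.g. if the OCR strategy is invalid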
     */
    public void extractText(ItemCollection workitem, ItemCollection snapshot, String _ocrStategy, List options,
            String filePatternRegex, int maxPdfPages) throws PluginException {
        boolean debug = logger.isLoggable(Level.FINE);
        Pattern filePattern = null;

        if (options == null) {
            options = new ArrayList();
        }

        // overwrite ocrmode?
        if (_ocrStategy != null) {
            this.ocrStategy = _ocrStategy;
        }

        // validate OCR MODE....
        if ("AUTO, NO_OCR, OCR_ONLY, OCR_AND_TEXT_EXTRACTION".indexOf(ocrStategy) == -1) {
            throw new PluginException(TikaService.class.getSimpleName(), PLUGIN_ERROR,
                    "Invalid TIKA_OCR_MODE - expected one of the following options: NO_OCR | OCR_ONLY | OCR_AND_TEXT_EXTRACTION");
        }

        // if the options did not already include the X-Tika-PDFOcrStrategy than we add
        // it now...
        boolean hasPDFOcrStrategy = options.stream()
                .anyMatch(s -> s.toLowerCase().startsWith("X-Tika-PDFOcrStrategy=".toLowerCase()));
        if (!hasPDFOcrStrategy) {
            // we do need to set a OcrStrategy from the environment...
            options.add("X-Tika-PDFOcrStrategy=" + ocrStategy);
        }

        // print tika options...
        if (debug) {
            logger.info("......  filepattern = "+filePatternRegex); 
            for (String opt : options) {
                logger.info("......  Tika Option = " + opt);
            }
        }
       
        // do we have a file pattern?
        if (filePatternRegex != null && !filePatternRegex.isEmpty()) {
            filePattern = Pattern.compile(filePatternRegex);
        }

        long l = System.currentTimeMillis();
        // List currentDmsList = DMSHandler.getDmsList(workitem);
        List files = workitem.getFileData();
        
        if (debug) {
            logger.info("... found " + files.size() +" files");
        }

        for (FileData fileData : files) {
            logger.fine("... processing file: "+fileData.getName()); 
            // do we have an optional file pattern?
            if (filePattern != null && !filePattern.matcher(fileData.getName()).find()) {
                // the file did not match the given pattern!
                logger.info("... filename does not match given pattern!"); 
                continue;
            }

            // do we need to parse the content?
            if (!hasOCRContent(fileData)) {
                if (debug) {
                    logger.info("... workitem has not OCRContent - fetching origin file data...");
                }
                // yes - fetch the origin fileData object....
                FileData originFileData = fetchOriginFileData(fileData, snapshot);
                if (originFileData != null) {
                    String textContent = null;
                    // extract the text content...
                    try {
                        // if the size of the file is greater then ENV_OCR_SERVICE_MAXFILESIZE,
                        // we ignore the file!
                        if (originFileData.getContent() != null
                                && originFileData.getContent().length > ocrMaxFileSize) {
                            logger.warning("The file size '" + fileData.getName() + "' excided the allowed max size of "
                                    + ocrMaxFileSize + " bytes (file size=" + originFileData.getContent().length + ")");
                            continue;
                        }
                        if (debug) {
                            logger.info("...text extraction '" + originFileData.getName() + "' content size=" +originFileData.getContent().length +" ...");
                        }

                        textContent = doORCProcessing(originFileData, options, maxPdfPages);

                        if (textContent == null) {
                            logger.warning("Unable to extract text-content for '" + fileData.getName() + "'");
                            textContent = "";
                        }                      
                        // store the ocrContent....
                        List