org.imixs.archive.documents.TikaService Maven / Gradle / Ivy
package org.imixs.archive.documents;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import javax.ejb.Stateless;
import javax.inject.Inject;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.eclipse.microprofile.config.inject.ConfigProperty;
import org.imixs.workflow.FileData;
import org.imixs.workflow.ItemCollection;
import org.imixs.workflow.exceptions.PluginException;
/**
* The TikaService extracts the textual information from document attachments of
* a workitem and stores the data into the $file attribute 'text'.
*
* For the text extraction the service sends the content of a document to an
* instance of an Apache Tika server via the Rest API. The environment variable
* OCR_STRATEGY defines how PDF files will be scanned. Possible values are:
*
* - AUTO - The best OCR strategy is chosen by the Tika Server itself. This is
* the default setting.
* - NO_OCR - OCR processing is disabled and text is extracted only from PDF
* files including a raw text. If a pdf file does not contain raw text data no
* text will be extracted!
* - OCR_ONLY - PDF files will always be OCR scanned even if the pdf file
* contains text data.
* - OCR_AND_TEXT_EXTRACTION - OCR processing and raw text extraction is
* performed. Note: This may result in a duplication of text and the mode is not
* recommended.
*
* The service expects a valid Rest API end-point to an instance of a Tika
* Server defined by the configuration property 'ocr.service.endpoint'.
*
* The environment parameter 'TIKA_SERVICE_MODE' must be set to 'auto' to enable
* the service.
*
* See also the project: https://github.com/imixs/imixs-docker/tree/master/tika
*
* @version 1.1
* @author rsoika
*/
@Stateless
public class TikaService {
/** Name of the $file attribute holding the extracted text. */
public static final String FILE_ATTRIBUTE_TEXT = "text";
/** Name of the default character encoding. */
public static final String DEFAULT_ENCODING = "UTF-8";
/** Error code used when this service raises a PluginException. */
public static final String PLUGIN_ERROR = "PLUGIN_ERROR";

/** Config key: Rest API end-point of the Tika server. */
public static final String ENV_OCR_SERVICE_ENDPOINT = "ocr.service.endpoint";
/** Config key: service mode. */
public static final String ENV_OCR_SERVICE_MODE = "ocr.service.mode";
/** Config key: maximum file size in bytes accepted for text extraction. */
public static final String ENV_OCR_SERVICE_MAXFILESIZE = "ocr.service.maxfilesize";
/**
 * Config key: OCR strategy. Supported values are NO_OCR, OCR_ONLY,
 * OCR_AND_TEXT_EXTRACTION and AUTO (default).
 */
public static final String ENV_OCR_STRATEGY = "ocr.strategy";

/** OCR strategy value: no OCR processing. */
public static final String OCR_STRATEGY_NO_OCR = "NO_OCR";
/** OCR strategy value: OCR processing and raw text extraction. */
public static final String OCR_STRATEGY_OCR_AND_TEXT_EXTRACTION = "OCR_AND_TEXT_EXTRACTION";
/** OCR strategy value: always OCR. */
public static final String OCR_STRATEGY_OCR_ONLY = "OCR_ONLY";
/** OCR strategy value: strategy is chosen automatically (default). */
public static final String OCR_STRATEGY_AUTO = "AUTO";

// the logger reference never changes, so it is declared final
private static final Logger logger = Logger.getLogger(TikaService.class.getName());

/**
 * Optional Tika end-point. The type parameter is required: MicroProfile Config
 * selects the converter from the Optional's type argument, a raw Optional can
 * not be resolved.
 */
@Inject
@ConfigProperty(name = ENV_OCR_SERVICE_ENDPOINT)
Optional<String> serviceEndpoint;

/**
 * Configured OCR strategy, AUTO if not set.
 *
 * NOTE(review): extractText(...) assigns this field when an explicit strategy
 * is passed in, so the value is not guaranteed to stay at the configured one.
 */
@Inject
@ConfigProperty(name = ENV_OCR_STRATEGY, defaultValue = OCR_STRATEGY_AUTO)
String ocrStategy;

/** Maximum size in bytes of a file to be scanned (default is 5MB). */
@Inject
@ConfigProperty(name = ENV_OCR_SERVICE_MAXFILESIZE, defaultValue = "5242880")
int ocrMaxFileSize;
/**
 * Extracts the text of the file attachments of a workitem using the default
 * settings of this service.
 *
 * This is a convenience variant. It delegates to the six-argument
 * {@code extractText} method and passes the current value of
 * {@code ocrStategy}, no additional tika options, no file pattern and a
 * {@code maxPdfPages} value of 0.
 *
 * @param workitem - workitem with file attachments
 * @param snapshot - snapshot workitem handed over to the delegate method
 * @throws PluginException raised by the delegate method
 */
public void extractText(ItemCollection workitem, ItemCollection snapshot) throws PluginException {
    // defaults: no filename filter and 0 as the page value
    final String noFilePattern = null;
    final int defaultMaxPdfPages = 0;
    // the options argument stays null - the delegate creates its own list
    this.extractText(workitem, snapshot, this.ocrStategy, null, noFilePattern, defaultMaxPdfPages);
}
/**
* Extracts the textual information from document attachments.
*
* The method extracts the textual content for each new file attachment of a
* given workitem. The text information is stored in the $file attribute 'text'.
*
* For PDF files with textual content the method calls the method
* 'extractTextFromPDF' using the PDFBox api. In other cases, the method sends
* the content via a Rest API to the tika server for OCR processing.
*
* The method also extracts files already stored in a snapshot workitem. In this
* case the method tests if the $file attribute 'text' already exists.
*
* An optional param 'filePattern' can be provided to extract text only from
* attachments matching the given file pattern (regex).
*
* The optional param 'maxPages' can be provided to reduce the size of PDF
* documents to a maximum of pages. This avoids blocking the tika service by
* processing too large documents. For example only the first 5 pages can be
* scanned.
*
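* @param workitem - workitem with file attachments
* @param snapshot - snapshot workitem used to fetch the origin file data
* @param _ocrStategy - optional OCR strategy (AUTO, NO_OCR, OCR_ONLY,
* OCR_AND_TEXT_EXTRACTION); if null the current strategy of the service is used
* @param options - optional tika header params
* @param filePatternRegex - optional regular expression to match files
* @param maxPdfPages - optional maximum number of PDF pages to be scanned
* @throws PluginException e.g. if the OCR strategy is not a supported value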
*/
public void extractText(ItemCollection workitem, ItemCollection snapshot, String _ocrStategy, List options,
String filePatternRegex, int maxPdfPages) throws PluginException {
boolean debug = logger.isLoggable(Level.FINE);
Pattern filePattern = null;
if (options == null) {
options = new ArrayList();
}
// overwrite ocrmode?
if (_ocrStategy != null) {
this.ocrStategy = _ocrStategy;
}
// validate OCR MODE....
if ("AUTO, NO_OCR, OCR_ONLY, OCR_AND_TEXT_EXTRACTION".indexOf(ocrStategy) == -1) {
throw new PluginException(TikaService.class.getSimpleName(), PLUGIN_ERROR,
"Invalid TIKA_OCR_MODE - expected one of the following options: NO_OCR | OCR_ONLY | OCR_AND_TEXT_EXTRACTION");
}
// if the options did not already include the X-Tika-PDFOcrStrategy than we add
// it now...
boolean hasPDFOcrStrategy = options.stream()
.anyMatch(s -> s.toLowerCase().startsWith("X-Tika-PDFOcrStrategy=".toLowerCase()));
if (!hasPDFOcrStrategy) {
// we do need to set a OcrStrategy from the environment...
options.add("X-Tika-PDFOcrStrategy=" + ocrStategy);
}
// print tika options...
if (debug) {
logger.info("...... filepattern = "+filePatternRegex);
for (String opt : options) {
logger.info("...... Tika Option = " + opt);
}
}
// do we have a file pattern?
if (filePatternRegex != null && !filePatternRegex.isEmpty()) {
filePattern = Pattern.compile(filePatternRegex);
}
long l = System.currentTimeMillis();
// List currentDmsList = DMSHandler.getDmsList(workitem);
List files = workitem.getFileData();
if (debug) {
logger.info("... found " + files.size() +" files");
}
for (FileData fileData : files) {
logger.fine("... processing file: "+fileData.getName());
// do we have an optional file pattern?
if (filePattern != null && !filePattern.matcher(fileData.getName()).find()) {
// the file did not match the given pattern!
logger.info("... filename does not match given pattern!");
continue;
}
// do we need to parse the content?
if (!hasOCRContent(fileData)) {
if (debug) {
logger.info("... workitem has not OCRContent - fetching origin file data...");
}
// yes - fetch the origin fileData object....
FileData originFileData = fetchOriginFileData(fileData, snapshot);
if (originFileData != null) {
String textContent = null;
// extract the text content...
try {
// if the size of the file is greater then ENV_OCR_SERVICE_MAXFILESIZE,
// we ignore the file!
if (originFileData.getContent() != null
&& originFileData.getContent().length > ocrMaxFileSize) {
logger.warning("The file size '" + fileData.getName() + "' excided the allowed max size of "
+ ocrMaxFileSize + " bytes (file size=" + originFileData.getContent().length + ")");
continue;
}
if (debug) {
logger.info("...text extraction '" + originFileData.getName() + "' content size=" +originFileData.getContent().length +" ...");
}
textContent = doORCProcessing(originFileData, options, maxPdfPages);
if (textContent == null) {
logger.warning("Unable to extract text-content for '" + fileData.getName() + "'");
textContent = "";
}
// store the ocrContent....
List