// org.imixs.workflow.documents.TikaDocumentService (Maven / Gradle / Ivy)
package org.imixs.workflow.documents;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Logger;
import javax.ejb.Stateless;
import javax.enterprise.event.Observes;
import javax.inject.Inject;
import org.eclipse.microprofile.config.inject.ConfigProperty;
import org.imixs.workflow.FileData;
import org.imixs.workflow.ItemCollection;
import org.imixs.workflow.engine.ProcessingEvent;
import org.imixs.workflow.exceptions.PluginException;
/**
* The TikaDocumentService extracts the textual information from document
* attachments. The service sends each new attached document to an instance of
* an Apache Tika Server to get the file content.
*
* The service expects the environment parameter 'TIKA_SERVICE_ENDPONT' (spelled
* exactly like this) to provide the Rest API end-point.
*
* See also the project: https://github.com/imixs/imixs-docker/tree/master/tika
*
* @version 1.0
* @author rsoika
*/
@Stateless
public class TikaDocumentService {
// Error code constant. Its use is not visible in this part of the file
// (presumably passed to a PluginException - TODO confirm).
public static final String PLUGIN_ERROR = "PLUGIN_ERROR";
// Name of the config property that holds the Tika Rest API end-point.
// NOTE: the key is spelled 'ENDPONT' (sic). Changing the spelling would change
// the property name that is looked up via @ConfigProperty below.
public static final String ENV_TIKA_SERVICE_ENDPONT = "TIKA_SERVICE_ENDPONT";
// Name of the config property that selects the service mode.
public static final String ENV_TIKA_SERVICE_MODE = "TIKA_SERVICE_MODE";
// java.util.logging logger named after this class.
private static Logger logger = Logger.getLogger(TikaDocumentService.class.getName());
// Tika end-point, injected from the configuration. Defaults to the empty
// string, in which case onBeforeProcess and extractText return immediately.
@Inject
@ConfigProperty(name = ENV_TIKA_SERVICE_ENDPONT, defaultValue = "")
String serviceEndpoint;
// Service mode, injected from the configuration. Defaults to "auto".
// onBeforeProcess only triggers the text extraction when the mode equals
// "auto" (compared case-insensitively).
@Inject
@ConfigProperty(name = ENV_TIKA_SERVICE_MODE, defaultValue = "auto")
String serviceMode;
/**
* Reacts on the ProcessingEvent. If the service mode is 'auto', this method
* sends the document content to the Tika server and updates the DMS
* information before the workitem is processed.
*
* @throws PluginException
*/
public void onBeforeProcess(@Observes ProcessingEvent processingEvent) throws PluginException {
    // Without a configured Tika end-point there is nothing to do.
    boolean endpointMissing = serviceEndpoint == null || serviceEndpoint.isEmpty();
    if (endpointMissing) {
        return;
    }

    // Only the 'auto' service mode extracts text as part of the processing
    // life cycle; any other mode leaves the workitem untouched here.
    boolean autoMode = "auto".equalsIgnoreCase(serviceMode);
    if (!autoMode) {
        return;
    }

    // React to the BEFORE_PROCESS phase only; other event types are ignored.
    if (processingEvent.getEventType() != ProcessingEvent.BEFORE_PROCESS) {
        return;
    }

    // Send the attachments of the current workitem to the Tika server.
    extractText(processingEvent.getDocument());
}
/**
* Extracts the textual information from document attachments.
*
* The method sends each new document to the tika server and updates the
* fileData attribute 'content'
*
* @param workitem
* @throws PluginException
*/
public void extractText(ItemCollection workitem) throws PluginException {
// read the Tika Service Enpoint
if (serviceEndpoint == null || serviceEndpoint.isEmpty()) {
return;
}
long l = System.currentTimeMillis();
// List currentDmsList = DMSHandler.getDmsList(workitem);
List files = workitem.getFileData();
for (FileData fileData : files) {
// We parse the file content if a new file content was added
byte[] fileContent = fileData.getContent();
// tesseract did not support any content type (e.g. application/octet-stream)
if (acceptContentType(fileData.getContentType()) && fileContent != null && fileContent.length > 1) {
// scan content...
try {
logger.info("...send " + fileData.getName() + " to tika server...");
// RestClient restClient = new RestClient(serviceEndpoint);
String result = put(serviceEndpoint, fileContent, fileData.getContentType(), "UTF-8");
if (result != null && !result.isEmpty()) {
List