org.imixs.workflow.documents.TikaDocumentService Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of imixs-archive-documents Show documentation
There is a newer version: 3.0.2
Show newest version
package org.imixs.workflow.documents;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Logger;

import javax.ejb.Stateless;
import javax.enterprise.event.Observes;
import javax.inject.Inject;

import org.eclipse.microprofile.config.inject.ConfigProperty;
import org.imixs.workflow.FileData;
import org.imixs.workflow.ItemCollection;
import org.imixs.workflow.engine.ProcessingEvent;
import org.imixs.workflow.exceptions.PluginException;

/**
 * The TikaDocumentService extracts the textual information from document
 * attachments. The service sends each new attached document to an instance of
 * an Apache Tika Server to get the file content.
 * <p>
 * The service expects the environment parameter 'TIKA_SERVICE_ENDPONT' to
 * provide the Rest API end-point.
 * <p>
 * See also the project: https://github.com/imixs/imixs-docker/tree/master/tika
 * 
 * @version 1.0
 * @author rsoika
 */
@Stateless
public class TikaDocumentService {

	// Error code constant; its use is not visible in this section of the file.
	public static final String PLUGIN_ERROR = "PLUGIN_ERROR";
	// Name of the config property that provides the Tika Rest API end-point.
	// NOTE: the spelling 'ENDPONT' is the actual property name read at runtime.
	public static final String ENV_TIKA_SERVICE_ENDPONT = "TIKA_SERVICE_ENDPONT";
	// Name of the config property that provides the service mode.
	public static final String ENV_TIKA_SERVICE_MODE = "TIKA_SERVICE_MODE";

	private static Logger logger = Logger.getLogger(TikaDocumentService.class.getName());

	
	// Tika end-point, injected from the property TIKA_SERVICE_ENDPONT. Defaults to
	// an empty string, in which case onBeforeProcess and extractText return
	// immediately.
	@Inject
	@ConfigProperty(name = ENV_TIKA_SERVICE_ENDPONT, defaultValue = "")
	String serviceEndpoint;
	
	// Service mode, injected from the property TIKA_SERVICE_MODE. Defaults to
	// "auto"; onBeforeProcess only triggers the text extraction in this mode
	// (compared case-insensitively).
	@Inject
	@ConfigProperty(name = ENV_TIKA_SERVICE_MODE, defaultValue = "auto")
	String serviceMode;

	
	

	/**
	 * Reacts to the ProcessingEvent. This method sends the document content to
	 * the Tika server and updates the DMS information.
	 * 
	 * @throws PluginException
	 */
	public void onBeforeProcess(@Observes ProcessingEvent processingEvent) throws PluginException {
		// no Tika end-point configured - the service is inactive
		boolean endpointMissing = serviceEndpoint == null || serviceEndpoint.isEmpty();
		if (endpointMissing) {
			return;
		}

		// the text extraction is only triggered by this observer in service mode 'auto'
		if (!"auto".equalsIgnoreCase(serviceMode)) {
			return;
		}

		// ignore all event types other than BEFORE_PROCESS
		if (processingEvent.getEventType() != ProcessingEvent.BEFORE_PROCESS) {
			return;
		}

		// extract the text content from the attachments of the processed document
		extractText(processingEvent.getDocument());
	}

	/**
	 * Extracts the textual information from document attachments.
	 * 

	 * The method sends each new document to the tika server and updates the
	 * fileData attribute 'content'
	 * 
	 * @param workitem
	 * @throws PluginException
	 */
	public void extractText(ItemCollection workitem) throws PluginException {
		// read the Tika Service Enpoint
		
		if (serviceEndpoint == null || serviceEndpoint.isEmpty()) {
			return;
		}

		long l = System.currentTimeMillis();
		// List currentDmsList = DMSHandler.getDmsList(workitem);
		List files = workitem.getFileData();

		for (FileData fileData : files) {
			// We parse the file content if a new file content was added
			byte[] fileContent = fileData.getContent();
			// tesseract did not support any content type (e.g. application/octet-stream)
			if (acceptContentType(fileData.getContentType()) && fileContent != null && fileContent.length > 1) {
				// scan content...
				try {
					logger.info("...send " + fileData.getName() + " to tika server...");
					// RestClient restClient = new RestClient(serviceEndpoint);

					String result = put(serviceEndpoint, fileContent, fileData.getContentType(), "UTF-8");

					if (result != null && !result.isEmpty()) {
						List