All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.batch.builders.BatchProcessBuilder Maven / Gradle / Ivy

There is a newer version: 3.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.batch.builders;


import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import javax.xml.parsers.DocumentBuilder;

import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import org.apache.tika.batch.BatchProcess;
import org.apache.tika.batch.ConsumersManager;
import org.apache.tika.batch.FileResource;
import org.apache.tika.batch.FileResourceCrawler;
import org.apache.tika.batch.Interrupter;
import org.apache.tika.batch.StatusReporter;
import org.apache.tika.exception.TikaException;
import org.apache.tika.util.ClassLoaderUtil;
import org.apache.tika.util.XMLDOMUtil;
import org.apache.tika.utils.XMLReaderUtils;


/**
 * Builds a BatchProcessor from a combination of runtime arguments and the
 * config file.
 */
public class BatchProcessBuilder {

    public final static int DEFAULT_MAX_QUEUE_SIZE = 1000;
    public final static String MAX_QUEUE_SIZE_KEY = "maxQueueSize";
    public final static String NUM_CONSUMERS_KEY = "numConsumers";

    /**
     * numConsumers is needed by both the crawler and the consumers. This utility method
     * is to be used to extract the number of consumers from a map of String key value pairs.
     * 

* If the value is "default", not a parseable integer or has a value < 1, * then AbstractConsumersBuilder's getDefaultNumConsumers() * * @param attrs attributes from which to select the NUM_CONSUMERS_KEY * @return number of consumers */ public static int getNumConsumers(Map attrs) { String nString = attrs.get(BatchProcessBuilder.NUM_CONSUMERS_KEY); if (nString == null || nString.equals("default")) { return AbstractConsumersBuilder.getDefaultNumConsumers(); } int n = -1; try { n = Integer.parseInt(nString); } catch (NumberFormatException e) { //swallow } if (n < 1) { n = AbstractConsumersBuilder.getDefaultNumConsumers(); } return n; } /** * Builds a BatchProcess from runtime arguments and a * input stream of a configuration file. With the exception of the QueueBuilder, * the builders choose how to adjudicate between * runtime arguments and the elements in the configuration file. *

* This does not close the InputStream! * * @param is inputStream * @param runtimeAttributes incoming runtime attributes * @return batch process * @throws java.io.IOException */ public BatchProcess build(InputStream is, Map runtimeAttributes) throws IOException { Document doc = null; try { DocumentBuilder docBuilder = XMLReaderUtils.getDocumentBuilder(); doc = docBuilder.parse(is); } catch (TikaException | SAXException e) { throw new IOException(e); } Node docElement = doc.getDocumentElement(); return build(docElement, runtimeAttributes); } /** * Builds a FileResourceBatchProcessor from runtime arguments and a * document node of a configuration file. With the exception of the QueueBuilder, * the builders choose how to adjudicate between * runtime arguments and the elements in the configuration file. * * @param docElement document element of the xml config file * @param incomingRuntimeAttributes runtime arguments * @return FileResourceBatchProcessor */ public BatchProcess build(Node docElement, Map incomingRuntimeAttributes) { //key components long timeoutThresholdMillis = XMLDOMUtil.getLong("timeoutThresholdMillis", incomingRuntimeAttributes, docElement); long timeoutCheckPulseMillis = XMLDOMUtil .getLong("timeoutCheckPulseMillis", incomingRuntimeAttributes, docElement); long pauseOnEarlyTerminationMillis = XMLDOMUtil .getLong("pauseOnEarlyTerminationMillis", incomingRuntimeAttributes, docElement); int maxAliveTimeSeconds = XMLDOMUtil.getInt("maxAliveTimeSeconds", incomingRuntimeAttributes, docElement); FileResourceCrawler crawler = null; ConsumersManager consumersManager = null; StatusReporter reporter = null; Interrupter interrupter = null; /* * TODO: This is a bit smelly. NumConsumers needs to be used by the crawler * and the consumers. This copies the incomingRuntimeAttributes and then * supplies the numConsumers from the commandline (if it exists) or from the config file * At least this creates an unmodifiable defensive copy of incomingRuntimeAttributes... */ Map runtimeAttributes = setNumConsumersInRuntimeAttributes(docElement, incomingRuntimeAttributes); //build queue ArrayBlockingQueue queue = buildQueue(docElement, runtimeAttributes); NodeList children = docElement.getChildNodes(); Map keyNodes = new HashMap<>(); for (int i = 0; i < children.getLength(); i++) { Node child = children.item(i); if (child.getNodeType() != Node.ELEMENT_NODE) { continue; } String nodeName = child.getNodeName(); keyNodes.put(nodeName, child); } //build consumers consumersManager = buildConsumersManager(keyNodes.get("consumers"), runtimeAttributes, queue); //build crawler crawler = buildCrawler(queue, keyNodes.get("crawler"), runtimeAttributes); if (keyNodes.containsKey("reporter")) { reporter = buildReporter(crawler, consumersManager, keyNodes.get("reporter"), runtimeAttributes); } if (keyNodes.containsKey("interrupter")) { interrupter = buildInterrupter(keyNodes.get("interrupter"), pauseOnEarlyTerminationMillis, runtimeAttributes); } BatchProcess proc = new BatchProcess(crawler, consumersManager, reporter, interrupter); if (timeoutThresholdMillis > -1) { proc.setTimeoutThresholdMillis(timeoutThresholdMillis); } if (pauseOnEarlyTerminationMillis > -1) { proc.setPauseOnEarlyTerminationMillis(pauseOnEarlyTerminationMillis); } if (timeoutCheckPulseMillis > -1) { proc.setTimeoutCheckPulseMillis(timeoutCheckPulseMillis); } proc.setMaxAliveTimeSeconds(maxAliveTimeSeconds); return proc; } private Interrupter buildInterrupter(Node node, long pauseOnEarlyTermination, Map runtimeAttributes) { Map attrs = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes); String className = attrs.get("builderClass"); if (className == null) { throw new RuntimeException("Need to specify class name in interrupter element"); } InterrupterBuilder builder = ClassLoaderUtil.buildClass(InterrupterBuilder.class, className); return builder.build(node, pauseOnEarlyTermination, runtimeAttributes); } private StatusReporter buildReporter(FileResourceCrawler crawler, ConsumersManager consumersManager, Node node, Map runtimeAttributes) { Map attrs = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes); String className = attrs.get("builderClass"); if (className == null) { throw new RuntimeException("Need to specify class name in reporter element"); } StatusReporterBuilder builder = ClassLoaderUtil.buildClass(StatusReporterBuilder.class, className); return builder.build(crawler, consumersManager, node, runtimeAttributes); } private Map setNumConsumersInRuntimeAttributes(Node docElement, Map incomingRuntimeAttributes) { Map runtimeAttributes = new HashMap<>(); for (Map.Entry e : incomingRuntimeAttributes.entrySet()) { runtimeAttributes.put(e.getKey(), e.getValue()); } //if this is set at runtime use that value if (runtimeAttributes.containsKey(NUM_CONSUMERS_KEY)) { return Collections.unmodifiableMap(runtimeAttributes); } Node ncNode = docElement.getAttributes().getNamedItem("numConsumers"); int numConsumers = -1; String numConsumersString = ncNode.getNodeValue(); try { numConsumers = Integer.parseInt(numConsumersString); } catch (NumberFormatException e) { //swallow and just use numConsumers } //TODO: should we have a max range check? if (numConsumers < 1) { numConsumers = AbstractConsumersBuilder.getDefaultNumConsumers(); } runtimeAttributes.put(NUM_CONSUMERS_KEY, Integer.toString(numConsumers)); return Collections.unmodifiableMap(runtimeAttributes); } //tries to get maxQueueSize from main element private ArrayBlockingQueue buildQueue(Node docElement, Map runtimeAttributes) { int maxQueueSize = DEFAULT_MAX_QUEUE_SIZE; String szString = runtimeAttributes.get(MAX_QUEUE_SIZE_KEY); if (szString == null) { Node szNode = docElement.getAttributes().getNamedItem(MAX_QUEUE_SIZE_KEY); if (szNode != null) { szString = szNode.getNodeValue(); } } if (szString != null) { try { maxQueueSize = Integer.parseInt(szString); } catch (NumberFormatException e) { //swallow } } if (maxQueueSize < 0) { maxQueueSize = DEFAULT_MAX_QUEUE_SIZE; } return new ArrayBlockingQueue<>(maxQueueSize); } private ConsumersManager buildConsumersManager(Node node, Map runtimeAttributes, ArrayBlockingQueue queue) { Map attrs = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes); String className = attrs.get("builderClass"); if (className == null) { throw new RuntimeException("Need to specify class name in consumers element"); } AbstractConsumersBuilder builder = ClassLoaderUtil.buildClass(AbstractConsumersBuilder.class, className); return builder.build(node, runtimeAttributes, queue); } private FileResourceCrawler buildCrawler(ArrayBlockingQueue queue, Node node, Map runtimeAttributes) { Map attrs = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes); String className = attrs.get("builderClass"); if (className == null) { throw new RuntimeException("Need to specify class name in crawler element"); } ICrawlerBuilder builder = ClassLoaderUtil.buildClass(ICrawlerBuilder.class, className); return builder.build(node, runtimeAttributes, queue); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy