All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.tika.pipes.pipesiterator.PipesIterator Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.pipes.pipesiterator;

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.tika.config.ConfigBase;
import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.InitializableProblemHandler;
import org.apache.tika.config.Param;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaTimeoutException;
import org.apache.tika.pipes.FetchEmitTuple;
import org.apache.tika.pipes.HandlerConfig;
import org.apache.tika.sax.BasicContentHandlerFactory;

/**
 * Abstract class that handles the testing for timeouts/thread safety
 * issues.  Concrete classes implement the blocking {@link #enqueue()}.
 * <p>
 * {@link #iterator()} starts a background thread that runs {@link #call()},
 * which in turn runs {@link #enqueue()} and then adds
 * {@link #COMPLETED_SEMAPHORE} to mark the end of the stream.  The returned
 * iterator drains the bounded queue that sits between the two threads.
 * <p>
 * If there's an exception in the enqueuing thread, the iterator will throw
 * a RuntimeException wrapping its cause.  It will throw an
 * IllegalStateException if next() is called once the stream is exhausted,
 * and a {@link TikaTimeoutException} if no tuple arrives within
 * {@code maxWaitMs}.  {@link #iterator()} may only be called once per instance.
 */
public abstract class PipesIterator extends ConfigBase
        implements Callable<Integer>, Iterable<FetchEmitTuple>, Initializable {

    /** Default for how long (in ms) to wait when offering to or polling from the queue. */
    public static final long DEFAULT_MAX_WAIT_MS = 300_000;

    /** Default capacity of the queue between the enqueuing thread and the iterator. */
    public static final int DEFAULT_QUEUE_SIZE = 1000;

    /**
     * Marker tuple that signals the end of the stream.  It is compared by
     * identity ({@code ==}), never by value.
     */
    public static final FetchEmitTuple COMPLETED_SEMAPHORE =
            new FetchEmitTuple(null, null, null, null, null, null);

    private static final Logger LOGGER = LoggerFactory.getLogger(PipesIterator.class);

    private long maxWaitMs = DEFAULT_MAX_WAIT_MS;
    //created in iterator(), immediately before the enqueuing thread is started
    private ArrayBlockingQueue<FetchEmitTuple> queue = null;
    private int queueSize = DEFAULT_QUEUE_SIZE;
    private String fetcherName;
    private String emitterName;
    private FetchEmitTuple.ON_PARSE_EXCEPTION onParseException =
            FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT;
    private BasicContentHandlerFactory.HANDLER_TYPE handlerType =
            BasicContentHandlerFactory.HANDLER_TYPE.TEXT;

    private HandlerConfig.PARSE_MODE parseMode = HandlerConfig.PARSE_MODE.RMETA;

    private int writeLimit = -1;
    private int maxEmbeddedResources = -1;

    //number of calls to tryToAdd(); only touched by the enqueuing thread
    private int added = 0;
    //non-null once iterator() has been called
    private FutureTask<Integer> futureTask;

    /**
     * Builds a PipesIterator from the "pipesIterator" item of a config file.
     *
     * @param tikaConfigFile path to the config file to read
     * @return the configured iterator
     * @throws IOException if the file cannot be opened or read
     * @throws TikaConfigException if the configuration is invalid
     */
    public static PipesIterator build(Path tikaConfigFile) throws IOException,
            TikaConfigException {
        try (InputStream is = Files.newInputStream(tikaConfigFile)) {
            return buildSingle(
                    "pipesIterator",
                    PipesIterator.class, is);
        }
    }

    public String getFetcherName() {
        return fetcherName;
    }

    @Field
    public void setFetcherName(String fetcherName) {
        this.fetcherName = fetcherName;
    }

    public String getEmitterName() {
        return emitterName;
    }

    @Field
    public void setEmitterName(String emitterName) {
        this.emitterName = emitterName;
    }

    /**
     * @param maxWaitMs maximum time in milliseconds to wait both when offering
     *                  a tuple to the queue and when polling the next tuple from it
     */
    @Field
    public void setMaxWaitMs(long maxWaitMs) {
        this.maxWaitMs = maxWaitMs;
    }

    /**
     * @param queueSize capacity of the queue created by {@link #iterator()};
     *                  has no effect once {@link #iterator()} has been called
     */
    @Field
    public void setQueueSize(int queueSize) {
        this.queueSize = queueSize;
    }

    public FetchEmitTuple.ON_PARSE_EXCEPTION getOnParseException() {
        return onParseException;
    }

    /**
     * @param onParseException either "skip" or "emit" (case insensitive)
     * @throws TikaConfigException if the value is anything else, including null
     */
    @Field
    public void setOnParseException(String onParseException) throws TikaConfigException {
        if ("skip".equalsIgnoreCase(onParseException)) {
            setOnParseException(FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP);
        } else if ("emit".equalsIgnoreCase(onParseException)) {
            setOnParseException(FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT);
        } else {
            throw new TikaConfigException("must be either 'skip' or 'emit': " + onParseException);
        }
    }

    public void setOnParseException(FetchEmitTuple.ON_PARSE_EXCEPTION onParseException) {
        this.onParseException = onParseException;
    }

    /**
     * @param handlerType name of the handler type; parsing is delegated to
     *                    {@link BasicContentHandlerFactory#parseHandlerType}
     *                    with TEXT passed as the default
     */
    @Field
    public void setHandlerType(String handlerType) {
        this.handlerType = BasicContentHandlerFactory
                .parseHandlerType(handlerType, BasicContentHandlerFactory.HANDLER_TYPE.TEXT);
    }

    @Field
    public void setWriteLimit(int writeLimit) {
        this.writeLimit = writeLimit;
    }

    @Field
    public void setMaxEmbeddedResources(int maxEmbeddedResources) {
        this.maxEmbeddedResources = maxEmbeddedResources;
    }

    @Field
    public void setParseMode(String parseModeString) {
        setParseMode(HandlerConfig.PARSE_MODE.parseMode(parseModeString));
    }

    public void setParseMode(HandlerConfig.PARSE_MODE parsePARSEMode) {
        this.parseMode = parsePARSEMode;
    }

    /**
     * Runs {@link #enqueue()} and then adds {@link #COMPLETED_SEMAPHORE}.
     * This is the body of the thread started by {@link #iterator()}.
     *
     * @return the number of calls made to {@link #tryToAdd(FetchEmitTuple)};
     *         note that this includes the final {@link #COMPLETED_SEMAPHORE}
     * @throws Exception whatever {@link #enqueue()} or
     *                   {@link #tryToAdd(FetchEmitTuple)} throws
     */
    @Override
    public Integer call() throws Exception {
        enqueue();
        tryToAdd(COMPLETED_SEMAPHORE);
        return added;
    }

    /**
     * @return a new HandlerConfig built from the currently configured handler type,
     *         parse mode, write limit and max embedded resources
     */
    protected HandlerConfig getHandlerConfig() {
        return new HandlerConfig(handlerType, parseMode, writeLimit, maxEmbeddedResources);
    }

    /**
     * Implementations add their tuples via {@link #tryToAdd(FetchEmitTuple)}.
     * They should not add {@link #COMPLETED_SEMAPHORE}; {@link #call()} does that
     * after this method returns.
     */
    protected abstract void enqueue() throws IOException, TimeoutException, InterruptedException;

    /**
     * Offers a tuple to the queue, waiting up to {@code maxWaitMs} for space.
     * The counter reported by {@link #call()} is incremented before the offer,
     * so it is incremented even if the offer times out.
     *
     * @param p tuple to add
     * @throws InterruptedException if interrupted while waiting for space
     * @throws TimeoutException if no space became available within {@code maxWaitMs}
     */
    protected void tryToAdd(FetchEmitTuple p) throws InterruptedException, TimeoutException {
        added++;
        boolean offered = queue.offer(p, maxWaitMs, TimeUnit.MILLISECONDS);
        if (!offered) {
            throw new TimeoutException("timed out while offering");
        }
    }

    @Override
    public void initialize(Map<String, Param> params) throws TikaConfigException {
        //no-op
    }

    @Override
    public void checkInitialization(InitializableProblemHandler problemHandler)
            throws TikaConfigException {
        //no-op
    }

    /**
     * Creates the queue, starts the enqueuing thread and returns an iterator
     * over the tuples that thread produces.
     *
     * @throws IllegalStateException if this has already been called on this instance
     */
    @Override
    public Iterator<FetchEmitTuple> iterator() {
        if (futureTask != null) {
            throw new IllegalStateException("Can't call iterator more than once!");
        }
        futureTask = new FutureTask<>(this);
        queue = new ArrayBlockingQueue<>(queueSize);
        new Thread(futureTask).start();
        return new TupleIterator();
    }

    /**
     * Iterator over the queue filled by the enqueuing thread.  Tuples are
     * polled lazily: a tuple that has already been taken off the queue is
     * always handed to the caller before the iterator waits for the next one.
     */
    private class TupleIterator implements Iterator<FetchEmitTuple> {
        //the tuple that has been polled but not yet handed out; null if none
        private FetchEmitTuple next = null;

        @Override
        public boolean hasNext() {
            if (next == null) {
                next = pollNext();
            }
            return next != COMPLETED_SEMAPHORE;
        }

        @Override
        public FetchEmitTuple next() {
            if (next == null) {
                //next() was called without a preceding hasNext()
                next = pollNext();
            }
            if (next == COMPLETED_SEMAPHORE) {
                throw new IllegalStateException(
                        "don't call next() after hasNext() has returned false!");
            }
            FetchEmitTuple ret = next;
            //don't poll eagerly here: a timeout or a failure in the enqueuing
            //thread while waiting for the following tuple must not cost us
            //the tuple we already have in hand
            next = null;
            return ret;
        }

        /**
         * Waits up to {@code maxWaitMs} for the next tuple, polling in 100ms
         * slices so that a failure in the enqueuing thread is noticed promptly.
         *
         * @return the next tuple, or {@link #COMPLETED_SEMAPHORE} if this
         *         thread was interrupted while waiting
         * @throws TikaTimeoutException if nothing arrived within {@code maxWaitMs}
         * @throws RuntimeException if the enqueuing thread failed
         */
        private FetchEmitTuple pollNext() throws TikaTimeoutException {

            FetchEmitTuple t = null;
            long start = System.currentTimeMillis();
            try {
                while (t == null && System.currentTimeMillis() - start < maxWaitMs) {
                    checkThreadOk();
                    t = queue.poll(100, TimeUnit.MILLISECONDS);
                }
            } catch (InterruptedException e) {
                LOGGER.warn("interrupted");
                //restore the flag so that callers can see the interrupt
                Thread.currentThread().interrupt();
                return COMPLETED_SEMAPHORE;
            }
            if (t == null) {
                throw new TikaTimeoutException(
                        "waited longer than " + maxWaitMs + "ms for the next tuple");
            }
            return t;
        }

        /**
         * This checks to make sure that the enqueuing thread hasn't terminated
         * with a failure.  Returns normally if the thread is still running or
         * has completed successfully.
         *
         * @throws RuntimeException wrapping the cause if there's been
         *                          an execution exception in the thread
         * @throws InterruptedException if thrown while retrieving the result
         */
        private void checkThreadOk() throws InterruptedException {
            if (futureTask.isDone()) {
                try {
                    futureTask.get();
                } catch (ExecutionException e) {
                    throw new RuntimeException(e.getCause());
                }
            }
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy