All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.fork.ForkParser Maven / Gradle / Ivy

There is a newer version: 2024.11.18598.20241113T125352Z-241000
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.fork;

import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.Set;

import org.apache.tika.config.Field;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

public class ForkParser extends AbstractParser implements Closeable {

    /** Serial version UID */
    private static final long serialVersionUID = -4962742892274663950L;

    //these are used by the legacy usage
    private final ClassLoader loader;
    private final Parser parser;

    //these are used when the server builds a parser via a directory
    //of jars, not via legacy bootstrap etc.
    private final Path tikaBin;
    private final ParserFactoryFactory parserFactoryFactory;

    /** Java command line */
    private List java = Arrays.asList("java", "-Xmx32m", "-Djava.awt.headless=true");

    /** Process pool size */
    @Field
    private int poolSize = 5;

    private int currentlyInUse = 0;

    private final Queue pool = new LinkedList<>();

    @Field
    private long serverPulseMillis = 1000;

    @Field
    private long serverParseTimeoutMillis = 60000;

    @Field
    private long serverWaitTimeoutMillis = 60000;

    @Field
    private int maxFilesProcessedPerClient = -1;

    /**
     * If you have a directory with, say, tike-app.jar and you want the child process/server to build a parser
     * and run it from that -- so that you can keep all of those dependencies out of your client code, use
     * this initializer.
     *
     * @param tikaBin directory containing the tika-app.jar or similar -- full jar including tika-core and all
     *                desired parsers and dependencies
     * @param factoryFactory
     */
    public ForkParser(Path tikaBin, ParserFactoryFactory factoryFactory) {
        loader = null;
        parser = null;
        this.tikaBin = tikaBin;
        this.parserFactoryFactory = factoryFactory;
    }

    /**
     * EXPERT
     * @param tikaBin directory containing the tika-app.jar or similar -- full jar including tika-core and all
     *                desired parsers and dependencies
     * @param parserFactoryFactory -- the factory to use to generate the parser factory in the child process/server
     * @param classLoader to use for all classes besides the parser in the child process/server
     */
    public ForkParser(Path tikaBin, ParserFactoryFactory parserFactoryFactory, ClassLoader classLoader) {
        parser = null;
        loader = classLoader;
        this.tikaBin = tikaBin;
        this.parserFactoryFactory = parserFactoryFactory;
    }

    /**
     * @param loader The ClassLoader to use 
     * @param parser the parser to delegate to. This one cannot be another ForkParser
     */
    public ForkParser(ClassLoader loader, Parser parser) {
        if (parser instanceof ForkParser) {
            throw new IllegalArgumentException("The underlying parser of a ForkParser should not be a ForkParser, but a specific implementation.");
        }
        this.tikaBin = null;
        this.parserFactoryFactory = null;
        this.loader = loader;
        this.parser = parser;
    }

    public ForkParser(ClassLoader loader) {
        this(loader, new AutoDetectParser());
    }

    public ForkParser() {
        this(ForkParser.class.getClassLoader());
    }

    /**
     * Returns the size of the process pool.
     *
     * @return process pool size
     */
    public synchronized int getPoolSize() {
        return poolSize;
    }

    /**
     * Sets the size of the process pool.
     *
     * @param poolSize process pool size
     */
    public synchronized void setPoolSize(int poolSize) {
        this.poolSize = poolSize;
    }

    /**
     * Returns the command used to start the forked server process.
     *
     * @return java command line
     * @deprecated since 1.8
     * @see ForkParser#getJavaCommandAsList()
     */
    @Deprecated
    public String getJavaCommand() {
        StringBuilder sb = new StringBuilder();
        for (String part : getJavaCommandAsList()) {
            sb.append(part).append(' ');
        }
        sb.deleteCharAt(sb.length() - 1);
        return sb.toString();
    }

    /**
     * Returns the command used to start the forked server process.
     * 

* Returned list is unmodifiable. * @return java command line args */ public List getJavaCommandAsList() { return Collections.unmodifiableList(java); } /** * Sets the command used to start the forked server process. * The arguments "-jar" and "/path/to/bootstrap.jar" * or "-cp" and "/path/to/tika_bin" are * appended to the given command when starting the process. * The default setting is {"java", "-Xmx32m"}. *

* Creates a defensive copy. * @param java java command line */ public void setJavaCommand(List java) { this.java = new ArrayList<>(java); } /** * Sets the command used to start the forked server process. * The given command line is split on whitespace and the arguments * "-jar" and "/path/to/bootstrap.jar" are appended to it when starting * the process. The default setting is "java -Xmx32m". * * @param java java command line * @deprecated since 1.8 * @see ForkParser#setJavaCommand(List) */ @Deprecated public void setJavaCommand(String java) { setJavaCommand(Arrays.asList(java.split(" "))); } public Set getSupportedTypes(ParseContext context) { return parser.getSupportedTypes(context); } /** * * This sends the objects to the server for parsing, and the server via * the proxies acts on the handler as if it were updating it directly. *

* If using a RecursiveParserWrapper, there are two options: *

*

*

    *
  1. Send in a class that extends {@link org.apache.tika.sax.RecursiveParserWrapperHandler}, * and the server will proxy back the data as best it can[0].
  2. *
  3. Send in a class that extends {@link AbstractRecursiveParserWrapperHandler} * and the server will act on the class but not proxy back the data. This * can be used, for example, if all you want to do is write to disc, extend * {@link AbstractRecursiveParserWrapperHandler} to write to disc when * {@link AbstractRecursiveParserWrapperHandler#endDocument(ContentHandler, Metadata)} * is called, and the server will take care of the writing via the handler.
  4. *
*

*

* NOTE:[0] "the server will proxy back the data as best it can". If the handler * implements Serializable and is actually serializable, the server will send it and the * {@link Metadata} back upon {@link org.apache.tika.sax.RecursiveParserWrapperHandler#endEmbeddedDocument(ContentHandler, Metadata)} * or {@link org.apache.tika.sax.RecursiveParserWrapperHandler#endEmbeddedDocument(ContentHandler, Metadata)}. * If the handler does not implement {@link java.io.Serializable} or if there is a * {@link java.io.NotSerializableException} thrown during serialization, the server will * call {@link ContentHandler#toString()} on the ContentHandler and set that value with the * {@link org.apache.tika.sax.RecursiveParserWrapperHandler#TIKA_CONTENT} key and then * serialize and proxy that data back. *

* * @param stream the document stream (input) * @param handler handler for the XHTML SAX events (output) * @param metadata document metadata (input and output) * @param context parse context * @throws IOException * @throws SAXException * @throws TikaException */ public void parse( InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { if (stream == null) { throw new NullPointerException("null stream"); } Throwable t; boolean alive = false; ForkClient client = acquireClient(); try { ContentHandler tee = (handler instanceof AbstractRecursiveParserWrapperHandler) ? handler : new TeeContentHandler( handler, new MetadataContentHandler(metadata)); t = client.call("parse", stream, tee, metadata, context); alive = true; } catch (TikaException te) { // Problem occurred on our side alive = true; throw te; } catch (IOException e) { // Problem occurred on the other side throw new TikaException( "Failed to communicate with a forked parser process." + " The process has most likely crashed due to some error" + " like running out of memory. A new process will be" + " started for the next parsing request.", e); } finally { releaseClient(client, alive); } if (t instanceof IOException) { throw (IOException) t; } else if (t instanceof SAXException) { throw (SAXException) t; } else if (t instanceof TikaException) { throw (TikaException) t; } else if (t != null) { throw new TikaException( "Unexpected error in forked server process", t); } } public synchronized void close() { for (ForkClient client : pool) { client.close(); } pool.clear(); poolSize = 0; } private synchronized ForkClient acquireClient() throws IOException, TikaException { while (true) { ForkClient client = pool.poll(); // Create a new process if there's room in the pool if (client == null && currentlyInUse < poolSize) { client = newClient(); } // Ping the process, and get rid of it if it's inactive if (client != null && !client.ping()) { client.close(); client = null; } if (client != null) { currentlyInUse++; return client; } else if (currentlyInUse >= poolSize) { try { wait(); } catch (InterruptedException e) { throw new TikaException( "Interrupted while waiting for a fork parser", e); } } } } private ForkClient newClient() throws IOException, TikaException { TimeoutLimits timeoutLimits = new TimeoutLimits(serverPulseMillis, serverParseTimeoutMillis, serverWaitTimeoutMillis); if (loader == null && parser == null && tikaBin != null && parserFactoryFactory != null) { return new ForkClient(tikaBin, parserFactoryFactory, java, timeoutLimits); } else if (loader != null && parser != null && tikaBin == null && parserFactoryFactory == null) { return new ForkClient(loader, parser, java, timeoutLimits); } else if (loader != null && parser == null && tikaBin != null && parserFactoryFactory != null) { return new ForkClient(tikaBin, parserFactoryFactory, loader, java, timeoutLimits); } else { //TODO: make this more useful throw new IllegalStateException("Unexpected combination of state items"); } } private synchronized void releaseClient(ForkClient client, boolean alive) { currentlyInUse--; if (currentlyInUse + pool.size() < poolSize && alive) { if (maxFilesProcessedPerClient > 0 && client.getFilesProcessed() >= maxFilesProcessedPerClient) { client.close(); } else { pool.offer(client); } notifyAll(); } else { client.close(); } } /** * The amount of time in milliseconds that the server * should wait before checking to see if the parse has timed out * or if the wait has timed out * The default is 5 seconds. * * @param serverPulseMillis milliseconds to sleep before checking if there has been any activity */ public void setServerPulseMillis(long serverPulseMillis) { this.serverPulseMillis = serverPulseMillis; } /** * The maximum amount of time allowed for the server to try to parse a file. * If more than this time elapses, the server shuts down, and the ForkParser * throws an exception. * * @param serverParseTimeoutMillis */ public void setServerParseTimeoutMillis(long serverParseTimeoutMillis) { this.serverParseTimeoutMillis = serverParseTimeoutMillis; } /** * The maximum amount of time allowed for the server to wait for a new request to parse * a file. The server will shutdown after this amount of time, and a new server will have * to be started by a new client. * @param serverWaitTimeoutMillis */ public void setServerWaitTimeoutMillis(long serverWaitTimeoutMillis) { this.serverWaitTimeoutMillis = serverWaitTimeoutMillis; } /** * If there is a slowly building memory leak in one of the parsers, * it is useful to set a limit on the number of files processed * by a server before it is shutdown and restarted. Default value is -1. * * @param maxFilesProcessedPerClient maximum number of files that a server can handle * before the parser shuts down a client and creates * a new process. If set to -1, the server is never restarted * because of the number of files handled. */ public void setMaxFilesProcessedPerServer(int maxFilesProcessedPerClient) { this.maxFilesProcessedPerClient = maxFilesProcessedPerClient; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy