// All downloads are free. Search and download functionality uses the official Maven repository.
//
// org.apache.tika.parser.external.ExternalParser Maven / Gradle / Ivy
//
// There is a newer version: 2024.11.18751.20241128T090041Z-241100
// Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.external;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.io.Serializable;
import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.tika.exception.TikaException;
import org.apache.tika.io.IOUtils;
import org.apache.tika.io.NullOutputStream;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import static java.nio.charset.StandardCharsets.UTF_8;

/**
 * Parser that uses an external program (like catdoc or pdf2txt) to extract
 * text content and metadata from a given document.
 * <p>
 * The document is handed to the program either on its standard input or,
 * when the command contains {@link #INPUT_FILE_TOKEN}, as a file path.
 * The extracted text is read either from the program's standard output or,
 * when the command contains {@link #OUTPUT_FILE_TOKEN}, from a temporary
 * file that the program is expected to write.
 */
public class ExternalParser extends AbstractParser {

    /**
     * Consumer contract
     *
     * @since Apache Tika 1.14
     */
    public interface LineConsumer extends Serializable {
        /**
         * Consume a line
         *
         * @param line a line of string
         */
        void consume(String line);

        /**
         * A null consumer
         */
        LineConsumer NULL = new LineConsumer() {
            @Override
            public void consume(String line) {
                // ignores
            }
        };
    }

    private static final long serialVersionUID = -1079128990650687037L;

    /**
     * The token, which if present in the Command string, will
     * be replaced with the input filename.
     * Alternately, the input data can be streamed over STDIN.
     */
    public static final String INPUT_FILE_TOKEN = "${INPUT}";
    /**
     * The token, which if present in the Command string, will
     * be replaced with the output filename.
     * Alternately, the output data can be collected on STDOUT.
     */
    public static final String OUTPUT_FILE_TOKEN = "${OUTPUT}";

    /**
     * Media types supported by the external program.
     */
    private Set<MediaType> supportedTypes = Collections.emptySet();

    /**
     * Regular expressions to run over the process output to extract
     * metadata, mapped to the metadata key each one fills in.
     * {@code null} or empty disables metadata extraction.
     */
    private Map<Pattern, String> metadataPatterns = null;

    /**
     * The external command to invoke.
     *
     * @see Runtime#exec(String[])
     */
    private String[] command = new String[]{"cat"};

    /**
     * A consumer for ignored Lines
     */
    private LineConsumer ignoredLineConsumer = LineConsumer.NULL;

    /**
     * Returns the media types supported by the external program.
     *
     * @param context parse context (not consulted)
     * @return the configured, unmodifiable set of media types
     */
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return getSupportedTypes();
    }

    /**
     * Returns the media types supported by the external program.
     *
     * @return the configured, unmodifiable set of media types
     */
    public Set<MediaType> getSupportedTypes() {
        return supportedTypes;
    }

    /**
     * Sets the media types supported by the external program. The given
     * set is copied, so later changes to it do not affect this parser.
     *
     * @param supportedTypes supported media types
     */
    public void setSupportedTypes(Set<MediaType> supportedTypes) {
        this.supportedTypes =
                Collections.unmodifiableSet(new HashSet<>(supportedTypes));
    }


    /**
     * Returns the command that is run for each document.
     *
     * @return the configured command array (not a copy)
     */
    public String[] getCommand() {
        return command;
    }

    /**
     * Sets the command to be run. This can include either of
     * {@link #INPUT_FILE_TOKEN} or {@link #OUTPUT_FILE_TOKEN}
     * if the command needs filenames.
     * <p>
     * A command given as a single string is split on spaces before the
     * tokens are substituted.
     *
     * @see Runtime#exec(String[])
     */
    public void setCommand(String... command) {
        this.command = command;
    }

    /**
     * Gets lines consumer
     *
     * @return consumer instance
     */
    public LineConsumer getIgnoredLineConsumer() {
        return ignoredLineConsumer;
    }

    /**
     * Set a consumer for the lines ignored by the parse functions
     *
     * @param ignoredLineConsumer consumer instance; {@code null} is treated
     *                            as {@link LineConsumer#NULL}
     */
    public void setIgnoredLineConsumer(LineConsumer ignoredLineConsumer) {
        // A null consumer would otherwise fail inside the metadata reader thread.
        this.ignoredLineConsumer =
                (ignoredLineConsumer != null) ? ignoredLineConsumer : LineConsumer.NULL;
    }

    /**
     * Returns the metadata extraction patterns.
     *
     * @return the configured pattern map, or {@code null} if none was set
     */
    public Map<Pattern, String> getMetadataExtractionPatterns() {
        return metadataPatterns;
    }

    /**
     * Sets the map of regular expression patterns and Metadata
     * keys. Any matching patterns will have the matching
     * metadata entries set.
     * <p>
     * When a pattern maps to a non-empty key, capture group 1 of a match is
     * added under that key. When it maps to {@code null} or an empty string,
     * capture group 1 is used as the key and capture group 2 as the value.
     * <p>
     * Set this to null to disable Metadata extraction.
     */
    public void setMetadataExtractionPatterns(Map<Pattern, String> patterns) {
        this.metadataPatterns = patterns;
    }


    /**
     * Executes the configured external command and passes the given document
     * stream as a simple XHTML document to the given SAX content handler.
     * Metadata is only extracted if {@link #setMetadataExtractionPatterns(Map)}
     * has been called to set patterns.
     *
     * @param stream   the document to parse
     * @param handler  receives the extracted text as XHTML SAX events
     * @param metadata receives any extracted metadata
     * @param context  parse context (not consulted)
     * @throws IOException   if the command cannot be started or a stream fails
     * @throws SAXException  if the XHTML SAX events could not be handled
     * @throws TikaException if the command cannot be started for a reason
     *                       other than an I/O error
     */
    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        XHTMLContentHandler xhtml =
                new XHTMLContentHandler(handler, metadata);

        TemporaryResources tmp = new TemporaryResources();
        try {
            parse(TikaInputStream.get(stream, tmp),
                    xhtml, metadata, tmp);
        } finally {
            // Removes any temporary input/output files created for this run.
            tmp.dispose();
        }
    }

    /**
     * Builds the concrete command line, runs it, and routes the process
     * streams to the content handler and metadata.
     *
     * @param stream   the document, also available as a file on demand
     * @param xhtml    receives the extracted text
     * @param metadata receives any extracted metadata
     * @param tmp      owner of the temporary files created for this run
     */
    private void parse(
            TikaInputStream stream, XHTMLContentHandler xhtml,
            Metadata metadata, TemporaryResources tmp)
            throws IOException, SAXException, TikaException {
        boolean inputToStdIn = true;
        boolean outputFromStdOut = true;
        boolean hasPatterns = (metadataPatterns != null && !metadataPatterns.isEmpty());

        File output = null;

        // Build our command; work on a copy so the configured command keeps its tokens
        String[] cmd;
        if (command.length == 1) {
            cmd = command[0].split(" ");
        } else {
            cmd = new String[command.length];
            System.arraycopy(command, 0, cmd, 0, command.length);
        }
        for (int i = 0; i < cmd.length; i++) {
            if (cmd[i].contains(INPUT_FILE_TOKEN)) {
                cmd[i] = cmd[i].replace(INPUT_FILE_TOKEN, stream.getFile().getPath());
                inputToStdIn = false;
            }
            if (cmd[i].contains(OUTPUT_FILE_TOKEN)) {
                // Create the output file once, so every occurrence of the
                // token names the same file that is read back below.
                if (output == null) {
                    output = tmp.createTemporaryFile();
                }
                outputFromStdOut = false;
                cmd[i] = cmd[i].replace(OUTPUT_FILE_TOKEN, output.getPath());
            }
        }

        // Execute. A start failure is reported to the caller instead of
        // being printed and then surfacing as a NullPointerException.
        Process process;
        try {
            if (cmd.length == 1) {
                process = Runtime.getRuntime().exec(cmd[0]);
            } else {
                process = Runtime.getRuntime().exec(cmd);
            }
        } catch (RuntimeException e) {
            throw new TikaException("Unable to start the external command", e);
        }

        try {
            if (inputToStdIn) {
                sendInput(process, stream);
            } else {
                // The program reads the file; signal end of input right away
                process.getOutputStream().close();
            }

            InputStream out = process.getInputStream();
            InputStream err = process.getErrorStream();

            if (hasPatterns) {
                extractMetadata(err, metadata);

                if (outputFromStdOut) {
                    extractOutput(out, xhtml);
                } else {
                    extractMetadata(out, metadata);
                }
            } else {
                ignoreStream(err);

                if (outputFromStdOut) {
                    extractOutput(out, xhtml);
                } else {
                    ignoreStream(out);
                }
            }
        } finally {
            try {
                process.waitFor();
            } catch (InterruptedException e) {
                // Keep the interrupt visible to the caller
                Thread.currentThread().interrupt();
            }
        }

        // Grab the output if we haven't already
        if (!outputFromStdOut) {
            extractOutput(new FileInputStream(output), xhtml);
        }
    }

    /**
     * Reads the given stream as UTF-8 text and writes it to the given XHTML
     * content handler as a complete document with a single paragraph.
     * The work is done in the calling thread, and the stream is closed
     * once fully processed.
     *
     * @param stream stream holding the extracted text
     * @param xhtml  XHTML content handler
     * @throws SAXException if the XHTML SAX events could not be handled
     * @throws IOException  if an input error occurred
     */
    private void extractOutput(InputStream stream, XHTMLContentHandler xhtml)
            throws SAXException, IOException {
        try (Reader reader = new InputStreamReader(stream, UTF_8)) {
            xhtml.startDocument();
            xhtml.startElement("p");
            char[] buffer = new char[1024];
            for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
                xhtml.characters(buffer, 0, n);
            }
            xhtml.endElement("p");
            xhtml.endDocument();
        }
    }

    /**
     * Starts a thread that sends the contents of the given input stream
     * to the standard input stream of the given process, and waits for
     * that thread to finish. Potential exceptions are ignored, and the
     * standard input stream is closed once fully processed so that the
     * process sees end of input. Note that the given input stream is not
     * closed by this method.
     *
     * @param process process
     * @param stream  input stream
     */
    private void sendInput(final Process process, final InputStream stream) {
        Thread t = new Thread() {
            @Override
            public void run() {
                try (OutputStream stdin = process.getOutputStream()) {
                    IOUtils.copy(stream, stdin);
                } catch (IOException ignored) {
                    // Best effort: feeding the process is allowed to fail
                }
            }
        };
        t.start();
        joinPreservingInterrupt(t);
    }


    /**
     * Starts a thread that reads and discards the contents of the
     * standard stream of the given process. Potential exceptions
     * are ignored, and the stream is closed once fully processed.
     * Note: calling this starts a new thread and blocks the current(caller) thread until the new thread dies
     *
     * @param stream stream to be ignored
     */
    private static void ignoreStream(final InputStream stream) {
        ignoreStream(stream, true);
    }

    /**
     * Starts a thread that reads and discards the contents of the
     * standard stream of the given process. Potential exceptions
     * are ignored, and the stream is closed once fully processed.
     *
     * @param stream       stream to sent to black hole (a k a null)
     * @param waitForDeath when {@code true} the caller thread will be blocked till the death of new thread.
     * @return The thread that is created and started
     */
    private static Thread ignoreStream(final InputStream stream, boolean waitForDeath) {
        Thread t = new Thread() {
            @Override
            public void run() {
                try {
                    IOUtils.copy(stream, new NullOutputStream());
                } catch (IOException ignored) {
                    // Best effort: the content is being discarded anyway
                } finally {
                    IOUtils.closeQuietly(stream);
                }
            }
        };
        t.start();
        if (waitForDeath) {
            joinPreservingInterrupt(t);
        }
        return t;
    }

    /**
     * Starts a thread that reads the given stream line by line as UTF-8,
     * applies every configured metadata pattern to each line, and waits
     * for that thread to finish. Lines that match no pattern are passed to
     * the ignored-line consumer. Read errors are ignored, and the stream
     * is closed once fully processed.
     *
     * @param stream   process stream to scan
     * @param metadata receives the extracted entries
     */
    private void extractMetadata(final InputStream stream, final Metadata metadata) {
        Thread t = new Thread() {
            @Override
            public void run() {
                // Closing the reader also closes the underlying stream
                try (BufferedReader reader =
                             new BufferedReader(new InputStreamReader(stream, UTF_8))) {
                    String line;
                    while ((line = reader.readLine()) != null) {
                        boolean consumed = false;
                        for (Map.Entry<Pattern, String> entry : metadataPatterns.entrySet()) {
                            Matcher m = entry.getKey().matcher(line);
                            if (m.find()) {
                                consumed = true;
                                String name = entry.getValue();
                                if (name != null && !name.isEmpty()) {
                                    metadata.add(name, m.group(1));
                                } else {
                                    // No fixed key: the pattern captures key and value
                                    metadata.add(m.group(1), m.group(2));
                                }
                            }
                        }
                        if (!consumed) {
                            ignoredLineConsumer.consume(line);
                        }
                    }
                } catch (IOException ignored) {
                    // Best effort: keep whatever metadata was read so far
                }
            }
        };
        t.start();
        joinPreservingInterrupt(t);
    }

    /**
     * Waits for the given thread to finish. If the wait is interrupted, the
     * interrupt status of the current thread is restored and the method
     * returns without waiting further.
     *
     * @param thread thread to wait for
     */
    private static void joinPreservingInterrupt(Thread thread) {
        try {
            thread.join();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Checks to see if the command can be run. Typically used with
     * something like "myapp --version" to check to see if "myapp"
     * is installed and on the path.
     *
     * @param checkCmd   The check command to run
     * @param errorValue What is considered an error value?
     *                   Defaults to 127 when none is given.
     * @return {@code true} if the command ran and did not exit with one of
     *         the error values, {@code false} otherwise
     */
    public static boolean check(String checkCmd, int... errorValue) {
        return check(new String[]{checkCmd}, errorValue);
    }

    /**
     * Checks to see if the command can be run. A single-element command is
     * run as a command line (so "myapp --version" is split into program and
     * arguments, as the parser itself does); a multi-element command is run
     * as program plus arguments.
     *
     * @param checkCmd   The check command to run
     * @param errorValue What is considered an error value?
     *                   Defaults to 127 when none is given.
     * @return {@code true} if the command ran and did not exit with one of
     *         the error values, {@code false} otherwise
     */
    public static boolean check(String[] checkCmd, int... errorValue) {
        if (errorValue.length == 0) {
            errorValue = new int[]{127};
        }

        try {
            Process process;
            if (checkCmd.length == 1) {
                // exec(String) tokenizes the line; exec(String[]) would look
                // for a program literally named "myapp --version"
                process = Runtime.getRuntime().exec(checkCmd[0]);
            } else {
                process = Runtime.getRuntime().exec(checkCmd);
            }
            Thread stdErrSuckerThread = ignoreStream(process.getErrorStream(), false);
            Thread stdOutSuckerThread = ignoreStream(process.getInputStream(), false);
            stdErrSuckerThread.join();
            stdOutSuckerThread.join();
            int result = process.waitFor();
            for (int err : errorValue) {
                if (result == err) {
                    return false;
                }
            }
            return true;
        } catch (IOException e) {
            // Some problem, command isn't there or is broken
            return false;
        } catch (InterruptedException ie) {
            // Interrupted while waiting; report failure but keep the interrupt visible
            Thread.currentThread().interrupt();
            return false;
        } catch (SecurityException se) {
            // External process execution is banned by the security manager
            return false;
        } catch (Error err) {
            if (err.getMessage() != null &&
                    (err.getMessage().contains("posix_spawn") ||
                            err.getMessage().contains("UNIXProcess"))) {
                //"Error forking command due to JVM locale bug
                //(see TIKA-1526 and SOLR-6387)"
                return false;
            }
            //throw if a different kind of error
            throw err;
        }
    }
}




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy