// All downloads are free. Search and download functionality uses the official Maven repository.

// org.apache.tika.embedder.ExternalEmbedder (Maven / Gradle / Ivy)

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.embedder;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.tika.exception.TikaException;
import org.apache.tika.io.IOUtils;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.external.ExternalParser;

import static java.nio.charset.StandardCharsets.UTF_8;

/**
 * Embedder that uses an external program (like sed or exiftool) to embed text
 * content and metadata into a given document.
 *
 * @since Apache Tika 1.3
 */
public class ExternalEmbedder implements Embedder {

    private static final long serialVersionUID = -2828829275642475697L;

    /**
     * Token to be replaced with a String array of metadata assignment command
     * arguments
     */
    public static final String METADATA_COMMAND_ARGUMENTS_TOKEN = "${METADATA}";

    /**
     * Token to be replaced with a String array of metadata assignment command
     * arguments
     */
    public static final String METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN = "${METADATA_SERIALIZED}";

    /**
     * Media types supported by the external program.
     */
    private Set supportedEmbedTypes = Collections.emptySet();

    /**
     * Mapping of Tika metadata to command line parameters.
     */
    private Map metadataCommandArguments = null;

    /**
     * The external command to invoke.
     *
     * @see Runtime#exec(String[])
     */
    private String[] command = new String[] {
            "sed", "-e",
            "$a\\\n" + METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN,
            ExternalParser.INPUT_FILE_TOKEN
    };

    private String commandAssignmentOperator = "=";
    private String commandAssignmentDelimeter = ", ";
    private String commandAppendOperator = "=";

    private boolean quoteAssignmentValues = false;

    private TemporaryResources tmp = new TemporaryResources();

    public Set getSupportedEmbedTypes(ParseContext context) {
        return getSupportedEmbedTypes();
    }

    public Set getSupportedEmbedTypes() {
        return supportedEmbedTypes;
    }

    public void setSupportedEmbedTypes(Set supportedEmbedTypes) {
        this.supportedEmbedTypes = Collections
                .unmodifiableSet(new HashSet(supportedEmbedTypes));
    }

    /**
     * Gets the command to be run. This can include either of
     * {@link #INPUT_FILE_TOKEN} or {@link #OUTPUT_FILE_TOKEN} if the command
     * needs filenames.
     *
     * @return
     */
    public String[] getCommand() {
        return command;
    }

    /**
     * Sets the command to be run. This can include either of
     * {@link #INPUT_FILE_TOKEN} or {@link #OUTPUT_FILE_TOKEN} if the command
     * needs filenames.
     *
     * @see Runtime#exec(String[])
     */
    public void setCommand(String... command) {
        this.command = command;
    }

    /**
     * Gets the assignment operator for the command line tool, i.e. "=".
     *
     * @return the assignment operator
     */
    public String getCommandAssignmentOperator() {
        return commandAssignmentOperator;
    }

    /**
     * Sets the assignment operator for the command line tool, i.e. "=".
     *
     * @param commandAssignmentOperator
     */
    public void setCommandAssignmentOperator(String commandAssignmentOperator) {
        this.commandAssignmentOperator = commandAssignmentOperator;
    }

    /**
     * Gets the delimiter for multiple assignments for the command line tool,
     * i.e. ", ".
     *
     * @return the assignment delimiter
     */
    public String getCommandAssignmentDelimeter() {
        return commandAssignmentDelimeter;
    }

    /**
     * Sets the delimiter for multiple assignments for the command line tool,
     * i.e. ", ".
     *
     * @param commandAssignmentDelimeter
     */
    public void setCommandAssignmentDelimeter(String commandAssignmentDelimeter) {
        this.commandAssignmentDelimeter = commandAssignmentDelimeter;
    }

    /**
     * Gets the operator to append rather than replace a value for the command
     * line tool, i.e. "+=".
     *
     * @return the append operator
     */
    public String getCommandAppendOperator() {
        return commandAppendOperator;
    }

    /**
     * Sets the operator to append rather than replace a value for the command
     * line tool, i.e. "+=".
     *
     * @param commandAppendOperator
     */
    public void setCommandAppendOperator(String commandAppendOperator) {
        this.commandAppendOperator = commandAppendOperator;
    }

    /**
     * Gets whether or not to quote assignment values, i.e. tag='value'. The
     * default is false.
     *
     * @return whether or not to quote assignment values
     */
    public boolean isQuoteAssignmentValues() {
        return quoteAssignmentValues;
    }

    /**
     * Sets whether or not to quote assignment values, i.e. tag='value'.
     *
     * @param quoteAssignmentValues
     */
    public void setQuoteAssignmentValues(boolean quoteAssignmentValues) {
        this.quoteAssignmentValues = quoteAssignmentValues;
    }

    /**
     * Gets the map of Metadata keys to command line parameters.
     *
     * @return the metadata to CLI param map
     */
    public Map getMetadataCommandArguments() {
        return metadataCommandArguments;
    }

    /**
     * Sets the map of Metadata keys to command line parameters. Set this to
     * null to disable Metadata embedding.
     *
     * @param arguments
     */
    public void setMetadataCommandArguments(Map arguments) {
        this.metadataCommandArguments = arguments;
    }

    /**
     * Constructs a collection of command line arguments responsible for setting
     * individual metadata fields based on the given metadata.
     *
     * @param metadata the metadata to embed
     * @return the metadata-related command line arguments
     */
    protected List getCommandMetadataSegments(Metadata metadata) {
        List commandMetadataSegments = new ArrayList();
        if (metadata == null || metadata.names() == null) {
            return commandMetadataSegments;
        }
        for (String metadataName : metadata.names()) {
            for (Property property : getMetadataCommandArguments().keySet()) {
                if (metadataName.equals(property.getName())) {
                    String[] metadataCommandArguments = getMetadataCommandArguments().get(property);
                    if (metadataCommandArguments != null) {
                        for (String metadataCommandArgument : metadataCommandArguments) {
                            if (metadata.isMultiValued(metadataName)) {
                                for (String metadataValue : metadata.getValues(metadataName)) {
                                    String assignmentValue = metadataValue;
                                    if (quoteAssignmentValues) {
                                        assignmentValue = "'" + assignmentValue + "'";
                                    }
                                    commandMetadataSegments.add(metadataCommandArgument
                                            + commandAppendOperator
                                            + assignmentValue);
                                }
                            } else {
                                String assignmentValue = metadata.get(metadataName);
                                if (quoteAssignmentValues) {
                                    assignmentValue = "'" + assignmentValue + "'";
                                }
                                commandMetadataSegments.add(metadataCommandArgument
                                        + commandAssignmentOperator
                                        + assignmentValue);
                            }
                        }
                    }
                }
            }
        }
        return commandMetadataSegments;
    }

    /**
     * Serializes a collection of metadata command line arguments into a single
     * string.
     *
     * @param metadataCommandArguments
     * @return the serialized metadata arguments string
     */
    protected static String serializeMetadata(
            List metadataCommandArguments) {
        if (metadataCommandArguments != null) {
            return Arrays.toString(metadataCommandArguments.toArray());
        }
        return "";
    }

    /**
     * Executes the configured external command and passes the given document
     * stream as a simple XHTML document to the given SAX content handler.
     * Metadata is only extracted if {@link #setMetadataCommandArguments(Map)}
     * has been called to set arguments.
     */
    public void embed(final Metadata metadata, final InputStream inputStream,
            final OutputStream outputStream, final ParseContext context)
            throws IOException, TikaException {

        boolean inputToStdIn = true;
        boolean outputFromStdOut = true;
        boolean hasMetadataCommandArguments =
                (metadataCommandArguments != null && !metadataCommandArguments.isEmpty());
        boolean serializeMetadataCommandArgumentsToken = false;
        boolean replacedMetadataCommandArgumentsToken = false;

        TikaInputStream tikaInputStream = TikaInputStream.get(inputStream);
        File tempOutputFile = null;

        List commandMetadataSegments = null;
        if (hasMetadataCommandArguments) {
            commandMetadataSegments = getCommandMetadataSegments(metadata);
        }

        // Build our command
        List origCmd = Arrays.asList(command);
        List cmd = new ArrayList();
        for (String commandSegment : origCmd) {
            if (commandSegment.indexOf(ExternalParser.INPUT_FILE_TOKEN) != -1) {
                commandSegment = commandSegment.replace(
                        ExternalParser.INPUT_FILE_TOKEN,
                        tikaInputStream.getFile().toString());
                inputToStdIn = false;
            }
            if (commandSegment.indexOf(ExternalParser.OUTPUT_FILE_TOKEN) != -1) {
                tempOutputFile = tmp.createTemporaryFile();
                commandSegment = commandSegment.replace(
                        ExternalParser.OUTPUT_FILE_TOKEN,
                        tempOutputFile.toString());
                outputFromStdOut = false;
            }
            if (commandSegment
                    .indexOf(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN) != -1) {
                serializeMetadataCommandArgumentsToken = true;
            }
            if (commandSegment.indexOf(METADATA_COMMAND_ARGUMENTS_TOKEN) != -1) {
                if (hasMetadataCommandArguments) {
                    for (String commandMetadataSegment : commandMetadataSegments) {
                        cmd.add(commandMetadataSegment);
                    }
                }
                replacedMetadataCommandArgumentsToken = true;
            } else {
                cmd.add(commandSegment);
            }
        }
        if (hasMetadataCommandArguments) {
            if (serializeMetadataCommandArgumentsToken) {
                // Find all metadata tokens and replace with encapsulated metadata
                int i = 0;
                for (String commandSegment : cmd) {
                    if (commandSegment
                            .indexOf(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN) != -1) {
                        commandSegment = commandSegment.replace(
                                METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN,
                                serializeMetadata(commandMetadataSegments));
                        cmd.set(i, commandSegment);
                    }
                    i++;
                }
            } else if (!replacedMetadataCommandArgumentsToken
                    && !serializeMetadataCommandArgumentsToken) {
                // Tack metadata onto the end of the cmd as arguments
                cmd.addAll(commandMetadataSegments);
            }
        }

        // Execute
        Process process;
        if (cmd.toArray().length == 1) {
            process = Runtime.getRuntime().exec(cmd.toArray(new String[] {})[0]);
        } else {
            process = Runtime.getRuntime().exec(cmd.toArray(new String[] {}));
        }

        ByteArrayOutputStream stdErrOutputStream = new ByteArrayOutputStream();

        try {
            sendStdErrToOutputStream(process, stdErrOutputStream);

            if (inputToStdIn) {
                sendInputStreamToStdIn(inputStream, process);
            } else {
                // We're not writing to std in this case so close
                process.getOutputStream().close();
            }

            if (outputFromStdOut) {
                sendStdOutToOutputStream(process, outputStream);
            } else {
                tmp.dispose();
                try {
                    process.waitFor();
                } catch (InterruptedException ignore) {
                }
                // The command is finished, read the output file into the given output stream
                InputStream tempOutputFileInputStream = TikaInputStream.get(tempOutputFile);
                IOUtils.copy(tempOutputFileInputStream, outputStream);
            }
        } finally {
            if (outputFromStdOut) {
                try {
                    process.waitFor();
                } catch (InterruptedException ignore) {
                }
            } else {
                try {
                    // Clean up temp output files
                    tempOutputFile.delete();
                } catch (Exception e) {
                }
            }
            if (!inputToStdIn) {
                // Close input file (and delete if created by up TemporaryResources.createTemporaryFile) 
                IOUtils.closeQuietly(tikaInputStream);
            }
            IOUtils.closeQuietly(outputStream);
            IOUtils.closeQuietly(stdErrOutputStream);
            if (process.exitValue() != 0) {
                throw new TikaException("There was an error executing the command line" +
                        "\nExecutable Command:\n\n" + cmd +
                        "\nExecutable Error:\n\n" + stdErrOutputStream.toString(UTF_8.name()));
            }
        }
    }

    /**
     * Creates a new thread for copying a given input stream to a given output stream.
     *
     * @param inputStream the source input stream
     * @param outputStream the target output stream
     */
    private void multiThreadedStreamCopy(
            final InputStream inputStream,
            final OutputStream outputStream) {
        new Thread(new Runnable() {
            public void run() {
                try {
                    IOUtils.copy(inputStream, outputStream);
                } catch (IOException e) {
                    System.out.println("ERROR: " + e.getMessage());
                }
            }
        }).start();
    }

    /**
     * Sends the contents of the given input stream to the
     * standard input of the given process. Potential exceptions are
     * ignored.
     * 

* Note that the given input stream is not closed by this method. * * @param process the process * @param inputStream the input stream to send to standard input of the process */ private void sendInputStreamToStdIn( final InputStream inputStream, final Process process) { multiThreadedStreamCopy(inputStream, process.getOutputStream()); } /** * Sends the standard output of the given * process to the given output stream. Potential exceptions are * ignored. *

* Note that the given output stream is not closed by this method. * * @param process the process * @param outputStream the putput stream to send to standard input of the process */ private void sendStdOutToOutputStream( final Process process, final OutputStream outputStream) { try { IOUtils.copy(process.getInputStream(), outputStream); } catch (IOException e) { System.out.println("ERROR: " + e.getMessage()); } } /** * Starts a thread that reads and discards the contents of the standard * stream of the given process. Potential exceptions are ignored, and the * stream is closed once fully processed. * * @param process the process * param outputStream the output stream to send to standard error of the process */ private void sendStdErrToOutputStream( final Process process, final OutputStream outputStream) { multiThreadedStreamCopy(process.getErrorStream(), outputStream); } /** * Checks to see if the command can be run. Typically used with something * like "myapp --version" to check to see if "myapp" is installed and on the * path. * * @param checkCmd the check command to run * @param errorValue what is considered an error value? * @return whether or not the check completed without error */ public static boolean check(String checkCmd, int... errorValue) { return check(new String[] { checkCmd }, errorValue); } /** * Checks to see if the command can be run. Typically used with something * like "myapp --version" to check to see if "myapp" is installed and on the * path. * * @param checkCmd the check command to run * @param errorValue what is considered an error value? * @return whether or not the check completed without error */ public static boolean check(String[] checkCmd, int... 
errorValue) { if (errorValue.length == 0) { errorValue = new int[] { 127 }; } try { Process process; if (checkCmd.length == 1) { process = Runtime.getRuntime().exec(checkCmd[0]); } else { process = Runtime.getRuntime().exec(checkCmd); } int result = process.waitFor(); for (int err : errorValue) { if (result == err) return false; } return true; } catch (IOException e) { // Some problem, command is there or is broken return false; } catch (InterruptedException ie) { // Some problem, command is there or is broken return false; } } }





// © 2015 - 2024 Weber Informatics LLC | Privacy Policy