// ![JAR search and dependency download from the Maven repository](/logo.png)
// org.apache.tika.embedder.ExternalEmbedder Maven / Gradle / Ivy
// Show all versions of aem-sdk-api Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.embedder;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.IOUtils;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.external.ExternalParser;
import static java.nio.charset.StandardCharsets.UTF_8;
/**
* Embedder that uses an external program (like sed or exiftool) to embed text
* content and metadata into a given document.
*
* @since Apache Tika 1.3
*/
public class ExternalEmbedder implements Embedder {
private static final long serialVersionUID = -2828829275642475697L;

/**
 * Token in a command segment that is replaced with the individual metadata
 * assignment command arguments, each added as its own command line argument.
 */
public static final String METADATA_COMMAND_ARGUMENTS_TOKEN = "${METADATA}";

/**
 * Token in a command segment that is replaced with all metadata assignment
 * command arguments serialized into a single String (see
 * {@link #serializeMetadata(List)}).
 */
public static final String METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN = "${METADATA_SERIALIZED}";

/**
 * Media types supported by the external program.
 */
private Set<MediaType> supportedEmbedTypes = Collections.emptySet();

/**
 * Mapping of Tika metadata to command line parameters. A {@code null} or
 * empty map disables metadata embedding.
 */
private Map<Property, String[]> metadataCommandArguments = null;

/**
 * The external command to invoke. The default runs {@code sed} with an
 * append ({@code $a}) expression carrying the serialized metadata against
 * the input file.
 *
 * @see Runtime#exec(String[])
 */
private String[] command = new String[] {
    "sed", "-e",
    "$a\\\n" + METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN,
    ExternalParser.INPUT_FILE_TOKEN
};

/** Operator placed between a command argument and a single metadata value. */
private String commandAssignmentOperator = "=";

/** Delimiter for multiple assignments; exposed through its getter and setter. */
private String commandAssignmentDelimeter = ", ";

/** Operator placed between a command argument and each value of a multi-valued field. */
private String commandAppendOperator = "=";

/** Whether metadata values are wrapped in single quotes, i.e. tag='value'. */
private boolean quoteAssignmentValues = false;

/** Temporary resources used to create the temporary output file for the command. */
private TemporaryResources tmp = new TemporaryResources();
/**
 * Gets the media types supported by the external program. The given context
 * is not consulted; this delegates to {@link #getSupportedEmbedTypes()}.
 *
 * @param context the parse context (unused)
 * @return the supported media types
 */
public Set<MediaType> getSupportedEmbedTypes(ParseContext context) {
    return getSupportedEmbedTypes();
}
/**
 * Gets the media types supported by the external program.
 *
 * @return the supported media types
 */
public Set<MediaType> getSupportedEmbedTypes() {
    return supportedEmbedTypes;
}
/**
 * Sets the media types supported by the external program. The given set is
 * copied and stored as an unmodifiable set, so later changes to the argument
 * do not affect this embedder.
 *
 * @param supportedEmbedTypes the supported media types; {@code null} is
 *        treated as an empty set
 */
public void setSupportedEmbedTypes(Set<MediaType> supportedEmbedTypes) {
    if (supportedEmbedTypes == null) {
        // Same value the field starts with: nothing is supported.
        this.supportedEmbedTypes = Collections.<MediaType>emptySet();
        return;
    }
    this.supportedEmbedTypes = Collections
            .unmodifiableSet(new HashSet<MediaType>(supportedEmbedTypes));
}
/**
 * Gets the command to be run. The segments can include either of
 * {@link ExternalParser#INPUT_FILE_TOKEN} or
 * {@link ExternalParser#OUTPUT_FILE_TOKEN} if the command needs filenames.
 *
 * @return the command segments; this is the internal array, not a copy
 */
public String[] getCommand() {
    return this.command;
}
/**
 * Sets the command to be run. The segments can include either of
 * {@link ExternalParser#INPUT_FILE_TOKEN} or
 * {@link ExternalParser#OUTPUT_FILE_TOKEN} if the command needs filenames.
 * The given array is stored as is, without copying.
 *
 * @param command the command segments
 * @see Runtime#exec(String[])
 */
public void setCommand(String... command) {
    String[] segments = command;
    this.command = segments;
}
/**
 * Gets the assignment operator for the command line tool, e.g. "=".
 *
 * @return the assignment operator
 */
public String getCommandAssignmentOperator() {
    return this.commandAssignmentOperator;
}
/**
 * Sets the assignment operator for the command line tool, e.g. "=".
 *
 * @param commandAssignmentOperator the operator placed between a command
 *        argument and its value
 */
public void setCommandAssignmentOperator(String commandAssignmentOperator) {
    String operator = commandAssignmentOperator;
    this.commandAssignmentOperator = operator;
}
/**
 * Gets the delimiter for multiple assignments for the command line tool,
 * e.g. ", ".
 *
 * @return the assignment delimiter
 */
public String getCommandAssignmentDelimeter() {
    return this.commandAssignmentDelimeter;
}
/**
 * Sets the delimiter for multiple assignments for the command line tool,
 * e.g. ", ".
 *
 * @param commandAssignmentDelimeter the assignment delimiter
 */
public void setCommandAssignmentDelimeter(String commandAssignmentDelimeter) {
    String delimiter = commandAssignmentDelimeter;
    this.commandAssignmentDelimeter = delimiter;
}
/**
 * Gets the operator used to append rather than replace a value for the
 * command line tool, e.g. "+=".
 *
 * @return the append operator
 */
public String getCommandAppendOperator() {
    return this.commandAppendOperator;
}
/**
 * Sets the operator used to append rather than replace a value for the
 * command line tool, e.g. "+=".
 *
 * @param commandAppendOperator the append operator
 */
public void setCommandAppendOperator(String commandAppendOperator) {
    String operator = commandAppendOperator;
    this.commandAppendOperator = operator;
}
/**
 * Tells whether assignment values are quoted, i.e. tag='value'. The default
 * is false.
 *
 * @return {@code true} if assignment values are quoted
 */
public boolean isQuoteAssignmentValues() {
    return this.quoteAssignmentValues;
}
/**
 * Sets whether assignment values are quoted, i.e. tag='value'.
 *
 * @param quoteAssignmentValues {@code true} to quote assignment values
 */
public void setQuoteAssignmentValues(boolean quoteAssignmentValues) {
    boolean quote = quoteAssignmentValues;
    this.quoteAssignmentValues = quote;
}
/**
 * Gets the map of Metadata keys to command line parameters.
 *
 * @return the metadata to CLI param map, or {@code null} if none has been
 *         set
 */
public Map<Property, String[]> getMetadataCommandArguments() {
    return metadataCommandArguments;
}
/**
 * Sets the map of Metadata keys to command line parameters. Set this to
 * null to disable Metadata embedding. The given map is stored as is,
 * without copying.
 *
 * @param arguments the metadata to CLI param map, or {@code null}
 */
public void setMetadataCommandArguments(Map<Property, String[]> arguments) {
    this.metadataCommandArguments = arguments;
}
/**
 * Constructs a collection of command line arguments responsible for setting
 * individual metadata fields based on the given metadata.
 * <p>
 * For every metadata name whose name equals that of a configured
 * {@link Property}, one segment is produced per configured command argument
 * and metadata value. Multi-valued fields use the append operator,
 * single-valued fields use the assignment operator. Values are wrapped in
 * single quotes when {@link #isQuoteAssignmentValues()} is true.
 *
 * @param metadata the metadata to embed
 * @return the metadata-related command line arguments; empty if the
 *         metadata is {@code null} or no metadata command arguments are
 *         configured
 */
protected List<String> getCommandMetadataSegments(Metadata metadata) {
    List<String> commandMetadataSegments = new ArrayList<String>();
    if (metadata == null || metadata.names() == null) {
        return commandMetadataSegments;
    }
    Map<Property, String[]> argumentsByProperty = getMetadataCommandArguments();
    if (argumentsByProperty == null) {
        // Metadata embedding is disabled; previously this dereferenced null.
        return commandMetadataSegments;
    }
    for (String metadataName : metadata.names()) {
        for (Map.Entry<Property, String[]> entry : argumentsByProperty.entrySet()) {
            if (!metadataName.equals(entry.getKey().getName())) {
                continue;
            }
            String[] commandArguments = entry.getValue();
            if (commandArguments == null) {
                continue;
            }
            boolean multiValued = metadata.isMultiValued(metadataName);
            for (String commandArgument : commandArguments) {
                if (multiValued) {
                    // Each value becomes its own "append" assignment.
                    for (String metadataValue : metadata.getValues(metadataName)) {
                        commandMetadataSegments.add(commandArgument
                                + commandAppendOperator
                                + quoteIfConfigured(metadataValue));
                    }
                } else {
                    commandMetadataSegments.add(commandArgument
                            + commandAssignmentOperator
                            + quoteIfConfigured(metadata.get(metadataName)));
                }
            }
        }
    }
    return commandMetadataSegments;
}

/** Wraps the value in single quotes when value quoting is enabled. */
private String quoteIfConfigured(String value) {
    return quoteAssignmentValues ? "'" + value + "'" : value;
}
/**
 * Serializes a collection of metadata command line arguments into a single
 * string, in the format produced by {@link Arrays#toString(Object[])}
 * (for example {@code [a=1, b=2]}).
 *
 * @param metadataCommandArguments the arguments to serialize, may be
 *        {@code null}
 * @return the serialized metadata arguments string, or an empty string if
 *         the given list is {@code null}
 */
protected static String serializeMetadata(
        List<String> metadataCommandArguments) {
    if (metadataCommandArguments == null) {
        return "";
    }
    return Arrays.toString(metadataCommandArguments.toArray());
}
/**
 * Executes the configured external command to embed the given metadata into
 * the document read from the given input stream, and writes the command's
 * result to the given output stream.
 * <p>
 * Each segment of the configured command is processed as follows:
 * <ul>
 * <li>{@link ExternalParser#INPUT_FILE_TOKEN} is replaced with the path of
 * a file holding the input; if no segment contains it, the input is piped
 * to the command's standard input instead.</li>
 * <li>{@link ExternalParser#OUTPUT_FILE_TOKEN} is replaced with the path of
 * a temporary file that is read back after the command finishes; if no
 * segment contains it, the command's standard output is used instead.</li>
 * <li>A segment containing {@link #METADATA_COMMAND_ARGUMENTS_TOKEN} is
 * replaced by the individual metadata arguments.</li>
 * <li>{@link #METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN} is replaced with
 * the serialized metadata arguments.</li>
 * </ul>
 * If neither metadata token is present, the metadata arguments are appended
 * to the end of the command. Metadata is only embedded if
 * {@link #setMetadataCommandArguments(Map)} has been called with a
 * non-empty map.
 * <p>
 * The given output stream is closed by this method.
 *
 * @param metadata the metadata to embed
 * @param inputStream the document to embed the metadata into
 * @param outputStream receives the output of the command
 * @param context the parse context (not used by this implementation)
 * @throws IOException if the command cannot be started or a stream fails
 * @throws TikaException if the command exits with a non-zero status, or if
 *         waiting for it was interrupted before it finished
 */
public void embed(final Metadata metadata, final InputStream inputStream,
        final OutputStream outputStream, final ParseContext context)
        throws IOException, TikaException {
    boolean inputToStdIn = true;
    boolean outputFromStdOut = true;
    boolean hasMetadataCommandArguments =
            (metadataCommandArguments != null && !metadataCommandArguments.isEmpty());
    boolean serializeMetadataCommandArgumentsToken = false;
    boolean replacedMetadataCommandArgumentsToken = false;

    TikaInputStream tikaInputStream = TikaInputStream.get(inputStream);
    File tempOutputFile = null;

    List<String> commandMetadataSegments = null;
    if (hasMetadataCommandArguments) {
        commandMetadataSegments = getCommandMetadataSegments(metadata);
    }

    // Build our command
    List<String> cmd = new ArrayList<String>();
    for (String commandSegment : command) {
        if (commandSegment.contains(ExternalParser.INPUT_FILE_TOKEN)) {
            commandSegment = commandSegment.replace(
                    ExternalParser.INPUT_FILE_TOKEN,
                    tikaInputStream.getFile().toString());
            inputToStdIn = false;
        }
        if (commandSegment.contains(ExternalParser.OUTPUT_FILE_TOKEN)) {
            tempOutputFile = tmp.createTemporaryFile();
            commandSegment = commandSegment.replace(
                    ExternalParser.OUTPUT_FILE_TOKEN,
                    tempOutputFile.toString());
            outputFromStdOut = false;
        }
        if (commandSegment.contains(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN)) {
            // Replaced in a second pass below, once the whole command is built.
            serializeMetadataCommandArgumentsToken = true;
        }
        if (commandSegment.contains(METADATA_COMMAND_ARGUMENTS_TOKEN)) {
            // The whole segment is replaced by the metadata arguments (or dropped
            // when there are none).
            if (hasMetadataCommandArguments) {
                cmd.addAll(commandMetadataSegments);
            }
            replacedMetadataCommandArgumentsToken = true;
        } else {
            cmd.add(commandSegment);
        }
    }
    if (hasMetadataCommandArguments) {
        if (serializeMetadataCommandArgumentsToken) {
            // Find all metadata tokens and replace with encapsulated metadata
            String serializedMetadata = serializeMetadata(commandMetadataSegments);
            for (int i = 0; i < cmd.size(); i++) {
                String commandSegment = cmd.get(i);
                if (commandSegment.contains(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN)) {
                    cmd.set(i, commandSegment.replace(
                            METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN,
                            serializedMetadata));
                }
            }
        } else if (!replacedMetadataCommandArgumentsToken) {
            // Tack metadata onto the end of the cmd as arguments
            cmd.addAll(commandMetadataSegments);
        }
    }

    // Execute. A single-segment command is passed as one String so that
    // Runtime.exec(String) tokenizes it.
    String[] cmdArray = cmd.toArray(new String[0]);
    Process process;
    if (cmdArray.length == 1) {
        process = Runtime.getRuntime().exec(cmdArray[0]);
    } else {
        process = Runtime.getRuntime().exec(cmdArray);
    }

    ByteArrayOutputStream stdErrOutputStream = new ByteArrayOutputStream();
    try {
        sendStdErrToOutputStream(process, stdErrOutputStream);
        if (inputToStdIn) {
            sendInputStreamToStdIn(inputStream, process);
        } else {
            // We're not writing to std in this case so close
            process.getOutputStream().close();
        }
        if (outputFromStdOut) {
            sendStdOutToOutputStream(process, outputStream);
        } else {
            // NOTE(review): dispose() runs before the command has finished and
            // before the output file is read - confirm it cannot remove
            // tempOutputFile prematurely.
            tmp.dispose();
            awaitTermination(process);
            // The command is finished, read the output file into the given output stream
            try (InputStream tempOutputFileInputStream = TikaInputStream.get(tempOutputFile)) {
                IOUtils.copy(tempOutputFileInputStream, outputStream);
            }
        }
    } finally {
        if (outputFromStdOut) {
            awaitTermination(process);
        } else {
            try {
                // Clean up temp output files
                tempOutputFile.delete();
            } catch (Exception ignored) {
                // Best-effort cleanup only.
            }
        }
        if (!inputToStdIn) {
            // Close input file (and delete if created by up TemporaryResources.createTemporaryFile)
            IOUtils.closeQuietly(tikaInputStream);
        }
        IOUtils.closeQuietly(outputStream);
        IOUtils.closeQuietly(stdErrOutputStream);

        int exitValue;
        try {
            exitValue = process.exitValue();
        } catch (IllegalThreadStateException stillRunning) {
            // Only reachable when waiting for the process was interrupted.
            process.destroy();
            throw new TikaException("Interrupted while waiting for the command line to finish" +
                    "\nExecutable Command:\n\n" + cmd);
        }
        if (exitValue != 0) {
            throw new TikaException("There was an error executing the command line" +
                    "\nExecutable Command:\n\n" + cmd +
                    "\nExecutable Error:\n\n" + stdErrOutputStream.toString(UTF_8.name()));
        }
    }
}

/** Waits for the process to exit, restoring the interrupt flag if interrupted. */
private static void awaitTermination(Process process) {
    try {
        process.waitFor();
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
    }
}
/**
 * Starts a new thread that copies the given input stream to the given
 * output stream. An {@link IOException} during the copy is reported on
 * standard output and otherwise ignored. Neither stream is closed here.
 *
 * @param inputStream the source input stream
 * @param outputStream the target output stream
 */
private void multiThreadedStreamCopy(
        final InputStream inputStream,
        final OutputStream outputStream) {
    Runnable copyTask = new Runnable() {
        @Override
        public void run() {
            try {
                IOUtils.copy(inputStream, outputStream);
            } catch (IOException copyFailure) {
                System.out.println("ERROR: " + copyFailure.getMessage());
            }
        }
    };
    Thread copier = new Thread(copyTask);
    copier.start();
}
/**
 * Sends the contents of the given input stream to the standard input of the
 * given process on a separate thread. Once the copy ends, the process's
 * standard input is closed so that a command reading until end-of-file can
 * terminate. An {@link IOException} during the copy is reported on standard
 * output and otherwise ignored.
 *
 * Note that the given input stream is not closed by this method.
 *
 * @param inputStream the input stream to send to standard input of the process
 * @param process the process
 */
private void sendInputStreamToStdIn(
        final InputStream inputStream,
        final Process process) {
    final OutputStream processStdIn = process.getOutputStream();
    new Thread(new Runnable() {
        @Override
        public void run() {
            try {
                IOUtils.copy(inputStream, processStdIn);
            } catch (IOException e) {
                System.out.println("ERROR: " + e.getMessage());
            } finally {
                // Signal end-of-input; without this the pipe stays open and the
                // command may wait for more data forever.
                IOUtils.closeQuietly(processStdIn);
            }
        }
    }).start();
}
/**
 * Copies the standard output of the given process to the given output
 * stream on the calling thread. An {@link IOException} during the copy is
 * reported on standard output and otherwise ignored.
 *
 * Note that the given output stream is not closed by this method.
 *
 * @param process the process
 * @param outputStream the output stream that receives the standard output of the process
 */
private void sendStdOutToOutputStream(
        final Process process,
        final OutputStream outputStream) {
    final InputStream processStdOut = process.getInputStream();
    try {
        IOUtils.copy(processStdOut, outputStream);
    } catch (IOException copyFailure) {
        System.out.println("ERROR: " + copyFailure.getMessage());
    }
}
/**
 * Starts a thread that copies the standard error stream of the given
 * process to the given output stream. Potential exceptions during the copy
 * are reported on standard output and otherwise ignored.
 *
 * @param process the process
 * @param outputStream the output stream that receives the standard error of the process
 */
private void sendStdErrToOutputStream(
        final Process process,
        final OutputStream outputStream) {
    final InputStream processStdErr = process.getErrorStream();
    multiThreadedStreamCopy(processStdErr, outputStream);
}
/**
 * Checks to see if the command can be run. Typically used with something
 * like "myapp --version" to check to see if "myapp" is installed and on the
 * path. The command is wrapped in a one-element array and handed to
 * {@link #check(String[], int...)}.
 *
 * @param checkCmd the check command to run
 * @param errorValue the exit values that are considered an error
 * @return whether or not the check completed without error
 */
public static boolean check(String checkCmd, int... errorValue) {
    String[] singleCommand = { checkCmd };
    return check(singleCommand, errorValue);
}
/**
 * Checks to see if the command can be run. Typically used with something
 * like "myapp --version" to check to see if "myapp" is installed and on the
 * path.
 * <p>
 * A single-element command is passed to {@link Runtime#exec(String)}, which
 * splits it into tokens; longer commands are passed to
 * {@link Runtime#exec(String[])} unchanged.
 *
 * @param checkCmd the check command to run
 * @param errorValue the exit values that are considered an error; defaults
 *        to 127 when none are given
 * @return whether or not the check completed without error; {@code false}
 *         if the command could not be started, the wait was interrupted, or
 *         the exit value is one of the error values
 */
public static boolean check(String[] checkCmd, int... errorValue) {
    if (errorValue == null || errorValue.length == 0) {
        errorValue = new int[] { 127 };
    }
    Process process = null;
    try {
        if (checkCmd.length == 1) {
            process = Runtime.getRuntime().exec(checkCmd[0]);
        } else {
            process = Runtime.getRuntime().exec(checkCmd);
        }
        int result = process.waitFor();
        for (int err : errorValue) {
            if (result == err) {
                return false;
            }
        }
        return true;
    } catch (IOException e) {
        // Some problem, command is not there or is broken
        return false;
    } catch (InterruptedException ie) {
        // Keep the interrupt visible to the caller instead of swallowing it.
        Thread.currentThread().interrupt();
        return false;
    } finally {
        if (process != null) {
            // Release the process and its pipes; also stops a command that is
            // still running after an interrupted wait.
            process.destroy();
        }
    }
}
}