// org.apache.tika.cli.BatchCommandLineBuilder Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.cli;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.SystemUtils;
/**
* This takes a TikaCLI commandline and builds the full commandline for
* org.apache.tika.batch.fs.FSBatchProcessCLI.
*
* The "default" batch config file that this relies on
* if no batch config file is specified on the commandline
* is: tika-batch/src/main/resources/.../default-tika-batch-config.xml
*/
class BatchCommandLineBuilder {
static Pattern JVM_OPTS_PATTERN = Pattern.compile("^(--?)J(.+)");
protected static String[] build(String[] args) throws IOException {
Map processArgs = new LinkedHashMap<>();
Map jvmOpts = new LinkedHashMap<>();
//take the args, and divide them into process args and options for
//the forked jvm process (i.e. log files, etc)
mapifyArgs(args, processArgs, jvmOpts);
//now modify processArgs in place
translateCommandLine(args, processArgs);
//maybe the user specified a different classpath?!
if (!jvmOpts.containsKey("-cp") && !jvmOpts.containsKey("--classpath")) {
String cp = System.getProperty("java.class.path");
jvmOpts.put("-cp", cp);
}
boolean hasLog4j = false;
for (String k : jvmOpts.keySet()) {
if (k.startsWith("-Dlog4j.configurationFile=")) {
hasLog4j = true;
break;
}
}
//use the log4j config file inside the app /resources/log4j2_batch_process.properties
if (!hasLog4j) {
jvmOpts.put("-Dlog4j.configurationFile=log4j2_batch_process.properties", "");
}
//now build the full command line
List fullCommand = new ArrayList<>();
fullCommand.add("java");
boolean foundHeadlessOption = false;
for (Map.Entry e : jvmOpts.entrySet()) {
fullCommand.add(e.getKey());
if (e.getValue().length() > 0) {
fullCommand.add(commandLineSafe(e.getValue()));
}
if (e.getKey().contains("java.awt.headless")) {
foundHeadlessOption = true;
}
}
//run in headless mode unless the user asks for something else TIKA-2434
if (!foundHeadlessOption) {
fullCommand.add("-Djava.awt.headless=true");
}
fullCommand.add("org.apache.tika.batch.fs.FSBatchProcessCLI");
//now add the process commands
for (Map.Entry e : processArgs.entrySet()) {
fullCommand.add(e.getKey());
if (e.getValue().length() > 0) {
fullCommand.add(commandLineSafe(e.getValue()));
}
}
return fullCommand.toArray(new String[0]);
}
protected static String commandLineSafe(String arg) {
if (arg == null) {
return arg;
}
//need to test for " " on windows, can't just add double quotes
//across platforms.
if (arg.contains(" ") && SystemUtils.IS_OS_WINDOWS) {
arg = "\"" + arg + "\"";
}
return arg;
}
/**
* Take the input args and separate them into args that belong on the commandline
* and those that belong as jvm args for the forked process.
*
* @param args -- literal args from TikaCLI commandline
* @param commandLine args that should be part of the batch commandline
* @param jvmArgs args that belong as jvm arguments for the forked process
*/
private static void mapifyArgs(final String[] args, final Map commandLine,
final Map jvmArgs) {
if (args.length == 0) {
return;
}
Matcher matcher = JVM_OPTS_PATTERN.matcher("");
for (int i = 0; i < args.length; i++) {
if (matcher.reset(args[i]).find()) {
String jvmArg = matcher.group(1) + matcher.group(2);
String v = "";
if (i < args.length - 1 && !args[i + 1].startsWith("-")) {
v = args[i + 1];
i++;
}
jvmArgs.put(jvmArg, v);
} else if (args[i].startsWith("-")) {
String k = args[i];
String v = "";
if (i < args.length - 1 && !args[i + 1].startsWith("-")) {
v = args[i + 1];
i++;
}
commandLine.put(k, v);
}
}
}
private static void translateCommandLine(String[] args, Map map)
throws IOException {
//if there are only two args and they are both directories, treat the first
//as input and the second as output.
if (args.length == 2 && !args[0].startsWith("-") && !args[1].startsWith("-")) {
Path candInput = Paths.get(args[0]);
Path candOutput = Paths.get(args[1]);
if (Files.isRegularFile(candOutput)) {
throw new IllegalArgumentException("Can't specify an existing file as the " +
"second argument for the output directory of a batch process");
}
if (Files.isDirectory(candInput)) {
map.put("-inputDir", args[0]);
map.put("-outputDir", args[1]);
}
}
//look for tikaConfig
for (String arg : args) {
if (arg.startsWith("--config=")) {
String configPath = arg.substring("--config=".length());
map.put("-c", configPath);
//now remove --config=x.config from the map :)
map.remove(arg);
break;
}
}
//now translate output types
if (map.containsKey("-h") || map.containsKey("--html")) {
map.remove("-h");
map.remove("--html");
map.put("-basicHandlerType", "html");
} else if (map.containsKey("-x") || map.containsKey("--xml")) {
map.remove("-x");
map.remove("--xml");
map.put("-basicHandlerType", "xml");
} else if (map.containsKey("-t") || map.containsKey("--text")) {
map.remove("-t");
map.remove("--text");
map.put("-basicHandlerType", "text");
} else if (map.containsKey("-m") || map.containsKey("--metadata")) {
map.remove("-m");
map.remove("--metadata");
map.put("-basicHandlerType", "ignore");
} else if (map.containsKey("-T") || map.containsKey("--text-main")) {
map.remove("-T");
map.remove("--text-main");
map.put("-basicHandlerType", "body");
}
if (map.containsKey("-J") || map.containsKey("--jsonRecursive")) {
map.remove("-J");
map.remove("--jsonRecursive");
map.put("-recursiveParserWrapper", "true");
}
if (map.containsKey("--inputDir") || map.containsKey("-i")) {
String v1 = map.remove("--inputDir");
String v2 = map.remove("-i");
String v = (v1 == null) ? v2 : v1;
map.put("-inputDir", v);
}
if (map.containsKey("--outputDir") || map.containsKey("-o")) {
String v1 = map.remove("--outputDir");
String v2 = map.remove("-o");
String v = (v1 == null) ? v2 : v1;
map.put("-outputDir", v);
}
}
}