All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.batch.fs.strawman.StrawManTikaAppDriver Maven / Gradle / Ivy

There is a newer version: 3.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.batch.fs.strawman;

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.MarkerFactory;

/**
 * Simple single-threaded class that calls tika-app against every file in a directory.
 * 

* This is exceedingly robust. One file per process. *

* However, you can use this to compare performance against tika-batch fs code. */ public class StrawManTikaAppDriver implements Callable { private static final Logger LOG = LoggerFactory.getLogger(StrawManTikaAppDriver.class); private static final AtomicInteger threadCount = new AtomicInteger(0); private final int totalThreads; private final int threadNum; private final Path inputRoot; private final Path outputRoot; private final Path fileList; private final String[] args; public StrawManTikaAppDriver(Path inputRoot, Path outputRoot, int totalThreads, Path fileList, String[] args) { this.inputRoot = inputRoot; this.outputRoot = outputRoot; this.fileList = fileList; this.args = args; threadNum = threadCount.getAndIncrement(); this.totalThreads = totalThreads; } public static String usage() { StringBuilder sb = new StringBuilder(); sb.append("Example usage:\n"); sb.append("java -cp org.apache.batch.fs.strawman.StrawManTikaAppDriver "); sb.append(" "); sb.append("java -jar tika-app-X.Xjar <...commandline arguments for tika-app>\n\n"); return sb.toString(); } public static void main(String[] args) { long start = System.currentTimeMillis(); if (args.length < 6) { System.err.println(StrawManTikaAppDriver.usage()); } Path inputDir = Paths.get(args[0]); Path outputDir = Paths.get(args[1]); int totalThreads = Integer.parseInt(args[2]); Path fileList = null; if (args.length > 3) { fileList = Paths.get(args[3]); if (!Files.isReadable(fileList)) { fileList = null; } } int initialParams = (fileList == null) ? 
3 : 4; List commandLine = new ArrayList<>(Arrays.asList(args).subList(initialParams, args.length)); totalThreads = Math.max(totalThreads, 1); ExecutorService ex = Executors.newFixedThreadPool(totalThreads); ExecutorCompletionService completionService = new ExecutorCompletionService<>(ex); for (int i = 0; i < totalThreads; i++) { StrawManTikaAppDriver driver = new StrawManTikaAppDriver(inputDir, outputDir, totalThreads, fileList, commandLine.toArray(new String[0])); completionService.submit(driver); } int totalFilesProcessed = 0; for (int i = 0; i < totalThreads; i++) { try { Future future = completionService.take(); if (future != null) { totalFilesProcessed += future.get(); } } catch (InterruptedException | ExecutionException e) { LOG.error(e.getMessage(), e); } } double elapsedSeconds = (double) (System.currentTimeMillis() - start) / (double) 1000; LOG.info("Processed {} in {} seconds", totalFilesProcessed, elapsedSeconds); ex.shutdownNow(); } @Override public Integer call() throws Exception { long start = System.currentTimeMillis(); TikaVisitor v = new TikaVisitor(); if (fileList != null) { TikaVisitor tikaVisitor = new TikaVisitor(); try (BufferedReader reader = Files .newBufferedReader(fileList, StandardCharsets.UTF_8)) { String line = reader.readLine(); while (line != null) { Path inputFile = inputRoot.resolve(line.trim()); if (Files.isReadable(inputFile)) { try { tikaVisitor.visitFile(inputFile, Files.readAttributes(inputFile, BasicFileAttributes.class)); } catch (IOException e) { LOG.warn("Problem with: " + inputFile, e); } } else { LOG.warn("Not readable: " + inputFile); } line = reader.readLine(); } } } else { Files.walkFileTree(inputRoot, v); } int processed = v.getProcessed(); double elapsedSecs = ((double) System.currentTimeMillis() - (double) start) / (double) 1000; LOG.info("Finished processing {} files in {} seconds.", processed, elapsedSecs); return processed; } private class TikaVisitor extends SimpleFileVisitor { private final AtomicInteger 
processed = new AtomicInteger(0); int getProcessed() { return processed.get(); } @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attr) { if (totalThreads > 1) { int hashCode = file.toAbsolutePath().toString().hashCode(); if (Math.abs(hashCode % totalThreads) != threadNum) { return FileVisitResult.CONTINUE; } } if (!file.startsWith(inputRoot)) { LOG.warn("File (" + file.toAbsolutePath() + ") doesn't start with input root (" + inputRoot.toAbsolutePath() + ")"); return FileVisitResult.CONTINUE; } Path relPath = inputRoot.relativize(file); String suffix = ".txt"; List commandLine = new ArrayList<>(); for (String arg : args) { commandLine.add(arg); if (arg.equals("-J")) { suffix = ".json"; } else if (arg.contains("-x")) { suffix = ".html"; } } String fullPath = file.toAbsolutePath().toString(); if (fullPath.contains(" ")) { fullPath = "\"" + fullPath + "\""; } commandLine.add(fullPath); Path outputFile = Paths.get(outputRoot.toAbsolutePath().toString(), relPath.toString() + suffix); try { Files.createDirectories(outputFile.getParent()); } catch (IOException e) { LOG.error(MarkerFactory.getMarker("FATAL"), "parent directory for {} was not made!", outputFile); throw new RuntimeException("couldn't make parent file for " + outputFile); } ProcessBuilder builder = new ProcessBuilder(); builder.command(commandLine); LOG.info("about to process: {}", file.toAbsolutePath()); builder.redirectOutput(outputFile.toFile()); builder.redirectError(ProcessBuilder.Redirect.INHERIT); Process proc; try { proc = builder.start(); } catch (IOException e) { LOG.error(e.getMessage(), e); return FileVisitResult.CONTINUE; } boolean finished = false; long totalTime = 180000;//3 minutes long pulse = 100; for (int i = 0; i < totalTime; i += pulse) { try { Thread.sleep(pulse); } catch (InterruptedException e) { //swallow } try { int exit = proc.exitValue(); finished = true; break; } catch (IllegalThreadStateException e) { //swallow } } if (!finished) { LOG.warn("Had to 
terminate process working on: {}", file.toAbsolutePath()); proc.destroyForcibly(); } try { proc.getOutputStream().flush(); proc.getOutputStream().close(); } catch (IOException e) { LOG.warn("couldn't close process outputstream", e); } processed.incrementAndGet(); return FileVisitResult.CONTINUE; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy