// org.apache.tika.batch.fs.FSDirectoryCrawler Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.batch.fs;
import java.io.IOException;
import java.nio.file.DirectoryIteratorException;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;

import org.apache.tika.batch.FileResource;
import org.apache.tika.batch.FileResourceCrawler;
/**
 * Crawler that walks a directory tree, starting at a given directory, and
 * offers every readable non-directory entry it finds to the crawler queue
 * through {@code tryToAdd}.
 * <p>
 * Within one directory the entries are visited in the configured
 * {@link CRAWL_ORDER}; the files of a directory are handled before any of
 * its subdirectories are descended into.
 */
public class FSDirectoryCrawler extends FileResourceCrawler {

    //root that is handed to every FSFileResource created by this crawler
    private final Path root;
    //directory in which the crawl begins
    private final Path startDirectory;
    private final Comparator<Path> pathComparator = new FileNameComparator();
    private final CRAWL_ORDER crawlOrder;

    /**
     * Creates a crawler that starts crawling at {@code root}.
     *
     * @param fileQueue    queue passed on to the superclass
     * @param numConsumers number of consumers, passed on to the superclass
     * @param root         root directory; also used as the start directory
     * @param crawlOrder   order in which the entries of a directory are visited
     * @throws RuntimeException if {@code root} is not a directory
     */
    public FSDirectoryCrawler(ArrayBlockingQueue<FileResource> fileQueue, int numConsumers,
                              Path root, CRAWL_ORDER crawlOrder) {
        super(fileQueue, numConsumers);
        this.root = root;
        this.startDirectory = root;
        this.crawlOrder = crawlOrder;
        if (!Files.isDirectory(startDirectory)) {
            throw new RuntimeException(
                    "Crawler couldn't find this directory:" + startDirectory.toAbsolutePath());
        }
    }

    /**
     * Creates a crawler that starts crawling at {@code startDirectory}, which
     * is expected to be located under {@code root}.
     *
     * @param fileQueue      queue passed on to the superclass
     * @param numConsumers   number of consumers, passed on to the superclass
     * @param root           root directory handed to the created file resources
     * @param startDirectory directory in which the crawl begins
     * @param crawlOrder     order in which the entries of a directory are visited
     * @throws RuntimeException if {@code startDirectory} is not a directory
     */
    public FSDirectoryCrawler(ArrayBlockingQueue<FileResource> fileQueue, int numConsumers,
                              Path root, Path startDirectory, CRAWL_ORDER crawlOrder) {
        super(fileQueue, numConsumers);
        this.root = root;
        this.startDirectory = startDirectory;
        this.crawlOrder = crawlOrder;
        //only verified when assertions are enabled (-ea)
        assert (startDirectory.toAbsolutePath().startsWith(root.toAbsolutePath()));
        if (!Files.isDirectory(startDirectory)) {
            throw new RuntimeException(
                    "Crawler couldn't find this directory:" + startDirectory.toAbsolutePath());
        }
    }

    /**
     * Starts the crawl at the start directory.
     *
     * @throws InterruptedException if the crawling thread is interrupted
     */
    public void start() throws InterruptedException {
        addFiles(startDirectory);
    }

    /**
     * Lists {@code directory}, offers each readable non-directory entry via
     * {@code tryToAdd}, and then recurses into the readable subdirectories.
     * <p>
     * A failure to list the directory is logged; whatever entries were
     * collected before the failure are still processed.
     *
     * @param directory directory to process; {@code null} is logged and ignored
     * @throws InterruptedException if the current thread's interrupt flag is set
     */
    private void addFiles(Path directory) throws InterruptedException {
        if (directory == null) {
            LOG.warn("FSFileAdder asked to process null directory?!");
            return;
        }

        List<Path> files = new ArrayList<>();
        boolean listingFailed = false;
        try (DirectoryStream<Path> ds = Files.newDirectoryStream(directory)) {
            for (Path p : ds) {
                files.add(p);
            }
        } catch (IOException | DirectoryIteratorException e) {
            //opening/closing the stream throws IOException, but a failure while
            //iterating surfaces as the unchecked DirectoryIteratorException
            listingFailed = true;
            LOG.warn("FSFileAdder couldn't read {}: {}", directory.toAbsolutePath(), e.getMessage(),
                    e);
        }

        if (files.isEmpty()) {
            //don't claim the directory is empty when we simply failed to read it
            if (!listingFailed) {
                LOG.info("Empty directory: {}", directory.toAbsolutePath());
            }
            return;
        }

        if (crawlOrder == CRAWL_ORDER.RANDOM) {
            Collections.shuffle(files);
        } else if (crawlOrder == CRAWL_ORDER.SORTED) {
            files.sort(pathComparator);
        }
        //any other order: keep the order in which the directory stream returned the entries

        int numFiles = 0;
        List<Path> directories = new LinkedList<>();
        for (Path f : files) {
            if (Thread.currentThread().isInterrupted()) {
                throw new InterruptedException("file adder interrupted");
            }
            if (!Files.isReadable(f)) {
                LOG.warn("Skipping -- {} -- file/directory is not readable", f.toAbsolutePath());
                continue;
            }
            if (Files.isDirectory(f)) {
                //subdirectories are visited only after all files of this directory
                directories.add(f);
                continue;
            }
            numFiles++;
            if (numFiles == 1) {
                handleFirstFileInDirectory(f);
            }
            int added = tryToAdd(new FSFileResource(root, f));
            if (added == FileResourceCrawler.STOP_NOW) {
                LOG.debug("crawler has hit a limit: {} : {}", f.toAbsolutePath(), added);
                return;
            }
            LOG.debug("trying to add: {} : {}", f.toAbsolutePath(), added);
        }

        for (Path f : directories) {
            addFiles(f);
        }
    }

    /**
     * Override this if you have any special handling
     * for the first actual file that the crawler comes across
     * in a directory. For example, it might be handy to call
     * mkdirs() on an output directory if your FileResourceConsumers
     * are writing to a file.
     *
     * @param f file to handle
     */
    public void handleFirstFileInDirectory(Path f) {
        //no-op
    }

    /**
     * Order in which the entries of a single directory are visited.
     */
    public enum CRAWL_ORDER {
        SORTED, //alphabetical order; necessary for cross-platform unit tests
        RANDOM, //shuffle
        OS_ORDER //operating system chooses
    }

    //simple lexical order for the file name, we don't really care about localization.
    //we do want this, though, because file.compareTo behaves differently
    //on different OS's.
    private static class FileNameComparator implements Comparator<Path> {

        /**
         * Compares the two paths by the string form of their file names.
         *
         * @return 0 if either path is {@code null}, otherwise the result of
         * {@link String#compareTo(String)} on the two file names
         */
        @Override
        public int compare(Path f1, Path f2) {
            if (f1 == null || f2 == null) {
                return 0;
            }
            return f1.getFileName().toString().compareTo(f2.getFileName().toString());
        }
    }
}