// org.apache.tika.batch.fs.FSDirectoryCrawler Maven / Gradle / Ivy
package org.apache.tika.batch.fs;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.nio.file.DirectoryIteratorException;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import org.apache.tika.batch.FileResource;
import org.apache.tika.batch.FileResourceCrawler;
/**
 * Crawler that walks a directory tree on the file system and offers every
 * readable regular file it finds to the crawler queue via {@code tryToAdd}.
 * <p>
 * Within a single directory all files are offered first; subdirectories are
 * descended into afterwards. The order in which the entries of a directory
 * are visited is controlled by {@link CRAWL_ORDER}.
 */
public class FSDirectoryCrawler extends FileResourceCrawler {

    /**
     * Order in which the entries of a single directory are visited.
     */
    public enum CRAWL_ORDER {
        SORTED, //alphabetical order; necessary for cross-platform unit tests
        RANDOM, //shuffle
        OS_ORDER //operating system chooses
    }

    private final Path root;
    private final Path startDirectory;
    private final Comparator<Path> pathComparator = new FileNameComparator();
    private final CRAWL_ORDER crawlOrder;

    /**
     * Creates a crawler that starts crawling at {@code root}.
     *
     * @param fileQueue    queue handed to the parent crawler
     * @param numConsumers number of consumers, handed to the parent crawler
     * @param root         root directory; also used as the start directory
     * @param crawlOrder   order in which directory entries are visited
     * @throws RuntimeException if {@code root} is not an existing directory
     */
    public FSDirectoryCrawler(ArrayBlockingQueue<FileResource> fileQueue,
                              int numConsumers, Path root, CRAWL_ORDER crawlOrder) {
        this(fileQueue, numConsumers, root, root, crawlOrder);
    }

    /**
     * Creates a crawler that starts crawling at {@code startDirectory}, which
     * is expected to be located at or below {@code root}.
     *
     * @param fileQueue      queue handed to the parent crawler
     * @param numConsumers   number of consumers, handed to the parent crawler
     * @param root           root directory passed to every created {@link FSFileResource}
     * @param startDirectory directory in which the crawl begins
     * @param crawlOrder     order in which directory entries are visited
     * @throws RuntimeException if {@code startDirectory} is not an existing directory
     */
    public FSDirectoryCrawler(ArrayBlockingQueue<FileResource> fileQueue,
                              int numConsumers, Path root, Path startDirectory,
                              CRAWL_ORDER crawlOrder) {
        super(fileQueue, numConsumers);
        this.root = root;
        this.startDirectory = startDirectory;
        this.crawlOrder = crawlOrder;
        // Sanity check only: evaluated solely when the JVM runs with assertions enabled.
        assert (startDirectory.toAbsolutePath().startsWith(root.toAbsolutePath()));
        if (!Files.isDirectory(startDirectory)) {
            throw new RuntimeException("Crawler couldn't find this directory:"
                    + startDirectory.toAbsolutePath());
        }
    }

    /**
     * Starts the crawl at the configured start directory.
     *
     * @throws InterruptedException if the crawling thread is interrupted
     */
    public void start() throws InterruptedException {
        addFiles(startDirectory);
    }

    /**
     * Offers all readable files of {@code directory} to the queue and then
     * recurses into its readable subdirectories.
     *
     * @param directory directory to process; {@code null} is logged and ignored
     * @throws InterruptedException if the current thread has been interrupted
     */
    private void addFiles(Path directory) throws InterruptedException {
        if (directory == null) {
            logger.warn("FSFileAdder asked to process null directory?!");
            return;
        }

        List<Path> files = new ArrayList<>();
        try (DirectoryStream<Path> ds = Files.newDirectoryStream(directory)) {
            for (Path p : ds) {
                files.add(p);
            }
        } catch (IOException | DirectoryIteratorException e) {
            // Opening/closing the stream fails with IOException, but a failure
            // while iterating surfaces as the unchecked DirectoryIteratorException.
            // Both are treated as best-effort: whatever was read so far is processed.
            logger.warn("FSFileAdder couldn't read " + directory.toAbsolutePath()
                    + ": " + e.getMessage(), e);
        }

        if (files.isEmpty()) {
            logger.info("Empty directory: " + directory.toAbsolutePath());
            return;
        }

        if (crawlOrder == CRAWL_ORDER.RANDOM) {
            Collections.shuffle(files);
        } else if (crawlOrder == CRAWL_ORDER.SORTED) {
            Collections.sort(files, pathComparator);
        }
        // OS_ORDER: keep whatever order the directory stream delivered.

        int numFiles = 0;
        List<Path> directories = new ArrayList<>();
        for (Path f : files) {
            if (Thread.currentThread().isInterrupted()) {
                throw new InterruptedException("file adder interrupted");
            }
            if (!Files.isReadable(f)) {
                logger.warn("Skipping -- " + f.toAbsolutePath()
                        + " -- file/directory is not readable");
                continue;
            }
            if (Files.isDirectory(f)) {
                // Subdirectories are deferred until all files of this directory are offered.
                directories.add(f);
                continue;
            }
            numFiles++;
            if (numFiles == 1) {
                handleFirstFileInDirectory(f);
            }
            int added = tryToAdd(new FSFileResource(root, f));
            if (added == FileResourceCrawler.STOP_NOW) {
                // Stop processing this directory; its subdirectories are not visited either.
                logger.debug("crawler has hit a limit: " + f.toAbsolutePath() + " : " + added);
                return;
            }
            logger.debug("trying to add: " + f.toAbsolutePath() + " : " + added);
        }

        for (Path f : directories) {
            addFiles(f);
        }
    }

    /**
     * Override this if you have any special handling
     * for the first actual file that the crawler comes across
     * in a directory. For example, it might be handy to call
     * mkdirs() on an output directory if your FileResourceConsumers
     * are writing to a file.
     *
     * @param f file to handle
     */
    public void handleFirstFileInDirectory(Path f) {
        //no-op
    }

    /**
     * Simple lexical order on the file name; we don't really care about localization.
     * We do want this, though, because Path/File compareTo behaves differently
     * on different OS's.
     * <p>
     * {@code null} paths sort before non-null paths, and a path without a
     * file name element is compared as the empty string, so the ordering
     * stays consistent with the {@link Comparator} contract.
     */
    private static final class FileNameComparator implements Comparator<Path> {
        @Override
        public int compare(Path f1, Path f2) {
            if (f1 == f2) {
                return 0;
            }
            if (f1 == null) {
                return -1;
            }
            if (f2 == null) {
                return 1;
            }
            return nameOf(f1).compareTo(nameOf(f2));
        }

        /** Returns the last name element of {@code p}, or "" if it has none. */
        private static String nameOf(Path p) {
            Path name = p.getFileName();
            return name == null ? "" : name.toString();
        }
    }
}