org.apache.tika.batch.fs.FSDirectoryCrawler Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of tika-batch Show documentation
There is a newer version: 1.0.18
package org.apache.tika.batch.fs;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;

import org.apache.tika.batch.FileResource;
import org.apache.tika.batch.FileResourceCrawler;

public class FSDirectoryCrawler extends FileResourceCrawler {

    public enum CRAWL_ORDER
    {
        SORTED, //alphabetical order; necessary for cross-platform unit tests
        RANDOM, //shuffle
        OS_ORDER //operating system chooses
    }

    private final Path root;
    private final Path startDirectory;
    private final Comparator pathComparator = new FileNameComparator();
    private CRAWL_ORDER crawlOrder;

    public FSDirectoryCrawler(ArrayBlockingQueue fileQueue,
                              int numConsumers, Path root, CRAWL_ORDER crawlOrder) {
        super(fileQueue, numConsumers);
        this.root = root;
        this.startDirectory = root;
        this.crawlOrder = crawlOrder;
        if (!Files.isDirectory(startDirectory)) {
            throw new RuntimeException("Crawler couldn't find this directory:" +
                    startDirectory.toAbsolutePath());
        }

    }

    public FSDirectoryCrawler(ArrayBlockingQueue fileQueue,
                              int numConsumers, Path root, Path startDirectory,
                              CRAWL_ORDER crawlOrder) {
        super(fileQueue, numConsumers);
        this.root = root;
        this.startDirectory = startDirectory;
        this.crawlOrder = crawlOrder;
        assert(startDirectory.toAbsolutePath().startsWith(root.toAbsolutePath()));

        if (! Files.isDirectory(startDirectory)) {
            throw new RuntimeException("Crawler couldn't find this directory:" + startDirectory.toAbsolutePath());
        }
    }

    public void start() throws InterruptedException {
        addFiles(startDirectory);
    }

    private void addFiles(Path directory) throws InterruptedException {

        if (directory == null) {
            logger.warn("FSFileAdder asked to process null directory?!");
            return;
        }

        List files = new ArrayList<>();
        try (DirectoryStream ds = Files.newDirectoryStream(directory)){
            for (Path p : ds) {
                files.add(p);
            }
        } catch (IOException e) {
            logger.warn("FSFileAdder couldn't read "+directory.toAbsolutePath() +
            ": "+e.getMessage(), e);
        }
        if (files.size() == 0) {
            logger.info("Empty directory: " + directory.toAbsolutePath());
            return;
        }


        if (crawlOrder == CRAWL_ORDER.RANDOM) {
            Collections.shuffle(files);
        } else if (crawlOrder == CRAWL_ORDER.SORTED) {
            Collections.sort(files, pathComparator);
        }

        int numFiles = 0;
        List directories = new LinkedList<>();
        for (Path f : files) {
            if (Thread.currentThread().isInterrupted()) {
                throw new InterruptedException("file adder interrupted");
            }
            if (!Files.isReadable(f)) {
                logger.warn("Skipping -- "+f.toAbsolutePath()+
                        " -- file/directory is not readable");
                continue;
            }
            if (Files.isDirectory(f)) {
                directories.add(f);
                continue;
            }
            numFiles++;
            if (numFiles == 1) {
                handleFirstFileInDirectory(f);
            }
            int added = tryToAdd(new FSFileResource(root, f));
            if (added == FileResourceCrawler.STOP_NOW) {
                logger.debug("crawler has hit a limit: "+f.toAbsolutePath() + " : " + added);
                return;
            }
            logger.debug("trying to add: "+f.toAbsolutePath() + " : " + added);
        }

        for (Path f : directories) {
            addFiles(f);
        }
    }

    /**
     * Override this if you have any special handling
     * for the first actual file that the crawler comes across
     * in a directory.  For example, it might be handy to call
     * mkdirs() on an output directory if your FileResourceConsumers
     * are writing to a file.
     *
     * @param f file to handle
     */
    public void handleFirstFileInDirectory(Path f) {
        //no-op
    }

    //simple lexical order for the file name, we don't really care about localization.
    //we do want this, though, because file.compareTo behaves differently
    //on different OS's.
    private class FileNameComparator implements Comparator {

        @Override
        public int compare(Path f1, Path f2) {
            if (f1 == null || f2 == null) {
                return 0;
            }
            return f1.getFileName().toString().compareTo(f2.getFileName().toString());
        }
    }
}