// Source listing of org.apache.tika.batch.fs.FSDirectoryCrawler from the tika-batch
// artifact (available via Maven / Gradle / Ivy).
// Note: there is a newer version of tika-batch: 3.0.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.batch.fs;


import java.io.IOException;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;

import org.apache.tika.batch.FileResource;
import org.apache.tika.batch.FileResourceCrawler;

/**
 * Crawler that walks a directory tree on the file system, depth-first, and offers
 * every readable non-directory entry to {@code tryToAdd} wrapped in an
 * {@link FSFileResource}.
 * <p>
 * Within a single directory all files are offered before any sub-directory is
 * descended into. The order of the entries inside a directory is controlled by
 * {@link CRAWL_ORDER}.
 */
public class FSDirectoryCrawler extends FileResourceCrawler {

    /** Root that is handed to every {@link FSFileResource} together with the file path. */
    private final Path root;
    /** Directory at which the crawl actually begins. */
    private final Path startDirectory;
    private final Comparator<Path> pathComparator = new FileNameComparator();
    private final CRAWL_ORDER crawlOrder;

    /**
     * Creates a crawler that starts crawling at {@code root}.
     *
     * @param fileQueue    queue passed through to the superclass
     * @param numConsumers number of consumers, passed through to the superclass
     * @param root         root directory; also used as the start directory
     * @param crawlOrder   order in which the entries of each directory are visited
     * @throws RuntimeException if {@code root} is not a directory
     */
    public FSDirectoryCrawler(ArrayBlockingQueue<FileResource> fileQueue, int numConsumers,
                              Path root, CRAWL_ORDER crawlOrder) {
        this(fileQueue, numConsumers, root, root, crawlOrder);
    }

    /**
     * Creates a crawler that starts crawling at {@code startDirectory}.
     *
     * @param fileQueue      queue passed through to the superclass
     * @param numConsumers   number of consumers, passed through to the superclass
     * @param root           root directory handed to every created {@link FSFileResource}
     * @param startDirectory directory at which to start crawling; expected (asserted)
     *                       to be located at or below {@code root}
     * @param crawlOrder     order in which the entries of each directory are visited
     * @throws RuntimeException if {@code startDirectory} is not a directory
     */
    public FSDirectoryCrawler(ArrayBlockingQueue<FileResource> fileQueue, int numConsumers,
                              Path root, Path startDirectory, CRAWL_ORDER crawlOrder) {
        super(fileQueue, numConsumers);
        this.root = root;
        this.startDirectory = startDirectory;
        this.crawlOrder = crawlOrder;
        assert (startDirectory.toAbsolutePath().startsWith(root.toAbsolutePath()));

        if (!Files.isDirectory(startDirectory)) {
            throw new RuntimeException(
                    "Crawler couldn't find this directory:" + startDirectory.toAbsolutePath());
        }
    }

    /**
     * Crawls the start directory and everything below it.
     *
     * @throws InterruptedException if the crawling thread is interrupted
     */
    public void start() throws InterruptedException {
        addFiles(startDirectory);
    }

    /**
     * Offers the files of {@code directory} via {@code tryToAdd} and then recurses into
     * its sub-directories.
     *
     * @param directory directory to process; {@code null} is logged and skipped
     * @return {@code false} if {@code tryToAdd} reported
     * {@link FileResourceCrawler#STOP_NOW} and the whole crawl should end,
     * {@code true} if the caller may continue with further directories
     * @throws InterruptedException if the current thread's interrupt flag is set
     */
    private boolean addFiles(Path directory) throws InterruptedException {

        if (directory == null) {
            LOG.warn("FSFileAdder asked to process null directory?!");
            return true;
        }

        List<Path> files = new ArrayList<>();
        try (DirectoryStream<Path> ds = Files.newDirectoryStream(directory)) {
            for (Path p : ds) {
                files.add(p);
            }
        } catch (IOException e) {
            //best effort: keep going with whatever entries were collected
            LOG.warn("FSFileAdder couldn't read {}: {}", directory.toAbsolutePath(), e.getMessage(),
                    e);
        }
        if (files.isEmpty()) {
            LOG.info("Empty directory: {}", directory.toAbsolutePath());
            return true;
        }


        if (crawlOrder == CRAWL_ORDER.RANDOM) {
            Collections.shuffle(files);
        } else if (crawlOrder == CRAWL_ORDER.SORTED) {
            files.sort(pathComparator);
        }
        //OS_ORDER: keep whatever order the directory stream returned

        int numFiles = 0;
        List<Path> directories = new LinkedList<>();
        for (Path f : files) {
            if (Thread.currentThread().isInterrupted()) {
                throw new InterruptedException("file adder interrupted");
            }
            if (!Files.isReadable(f)) {
                LOG.warn("Skipping -- {} -- file/directory is not readable", f.toAbsolutePath());
                continue;
            }
            if (Files.isDirectory(f)) {
                //sub-directories are handled after all files of this directory
                directories.add(f);
                continue;
            }
            numFiles++;
            if (numFiles == 1) {
                handleFirstFileInDirectory(f);
            }
            int added = tryToAdd(new FSFileResource(root, f));
            if (added == FileResourceCrawler.STOP_NOW) {
                LOG.debug("crawler has hit a limit: {} : {}", f.toAbsolutePath(), added);
                //signal every enclosing level to stop as well; otherwise the
                //parent would keep listing and probing sibling directories
                return false;
            }
            LOG.debug("trying to add: {} : {}", f.toAbsolutePath(), added);
        }

        for (Path subDirectory : directories) {
            if (!addFiles(subDirectory)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Override this if you have any special handling
     * for the first actual file that the crawler comes across
     * in a directory.  For example, it might be handy to call
     * mkdirs() on an output directory if your FileResourceConsumers
     * are writing to a file.
     *
     * @param f file to handle
     */
    public void handleFirstFileInDirectory(Path f) {
        //no-op
    }

    /**
     * Order in which the entries of a single directory are visited.
     */
    public enum CRAWL_ORDER {
        SORTED, //alphabetical order; necessary for cross-platform unit tests
        RANDOM, //shuffle
        OS_ORDER //operating system chooses
    }

    //simple lexical order for the file name, we don't really care about localization.
    //we do want this, though, because file.compareTo behaves differently
    //on different OS's.
    //null paths sort before non-null paths so that the ordering stays consistent.
    private static class FileNameComparator implements Comparator<Path> {

        @Override
        public int compare(Path f1, Path f2) {
            if (f1 == f2) {
                return 0;
            }
            if (f1 == null) {
                return -1;
            }
            if (f2 == null) {
                return 1;
            }
            return f1.getFileName().toString().compareTo(f2.getFileName().toString());
        }
    }
}