All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.tika.batch.fs.FSListCrawler Maven / Gradle / Ivy

There is a newer version: 1.0.18
Show newest version
package org.apache.tika.batch.fs;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.concurrent.ArrayBlockingQueue;

import org.apache.tika.batch.FileResource;
import org.apache.tika.batch.FileResourceCrawler;

/**
 * Class that "crawls" a list of files.
 * <p>
 * The list is a text file with one path per line; each line is joined onto
 * the root directory. Entries that do not exist or that are directories are
 * logged and skipped; every other entry is wrapped in an
 * {@link FSFileResource} and passed to {@code tryToAdd}.
 * <p>
 * An instance is single-use: the list reader is opened by the constructor
 * and closed when {@link #start()} returns or throws.
 */
public class FSListCrawler extends FileResourceCrawler {

    private final BufferedReader reader;
    private final Path root;

    /**
     * Constructor for a crawler that reads a list of files to process,
     * using {@link File} arguments and a charset name.
     *
     * NOTE(review): fileQueue is a raw type here; presumably it should be
     * ArrayBlockingQueue&lt;FileResource&gt; -- confirm against the superclass.
     *
     * @param fileQueue queue for batch
     * @param numConsumers number of consumers
     * @param root root input directory
     * @param list text file list (one file per line) of paths relative to
     *             the root for processing
     * @param encoding name of the charset of the list file
     * @throws FileNotFoundException if the list file cannot be opened
     * @throws UnsupportedEncodingException if the named charset is not supported
     * @deprecated use {@link #FSListCrawler(ArrayBlockingQueue, int, Path, Path, Charset)}
     * @see #FSListCrawler(ArrayBlockingQueue, int, Path, Path, Charset)
     */
    @Deprecated
    public FSListCrawler(ArrayBlockingQueue fileQueue,
                         int numConsumers, File root, File list, String encoding)
            throws FileNotFoundException, UnsupportedEncodingException {
        super(fileQueue, numConsumers);
        reader = openReader(list, encoding);
        this.root = Paths.get(root.toURI());
    }

    /**
     * Constructor for a crawler that reads a list of files to process.
     * <p>
     * The list should be paths relative to the root.
     *
     * @param fileQueue queue for batch
     * @param numConsumers number of consumers
     * @param root root input directory
     * @param list text file list (one file per line) of paths relative to
     *             the root for processing
     * @param charset charset of the file
     * @throws IOException if the list file cannot be opened
     */
    public FSListCrawler(ArrayBlockingQueue fileQueue,
                         int numConsumers, Path root, Path list, Charset charset)
            throws IOException {
        super(fileQueue, numConsumers);
        reader = Files.newBufferedReader(list, charset);
        this.root = root;
    }

    /**
     * Reads the list line by line and passes each existing, non-directory
     * file to {@code tryToAdd}.
     * <p>
     * The list reader is closed when this method exits, whether normally,
     * by interruption, or by a read failure, so this method must not be
     * called more than once per instance.
     *
     * @throws InterruptedException if the current thread is interrupted
     *         while the list is being processed
     * @throws RuntimeException wrapping the {@link IOException} if the list
     *         cannot be read
     */
    public void start() throws InterruptedException {
        try {
            String line = nextLine();

            while (line != null) {
                if (Thread.currentThread().isInterrupted()) {
                    throw new InterruptedException("file adder interrupted");
                }
                // String-joining (rather than root.resolve(line)) keeps an
                // entry with a leading separator underneath the root.
                Path f = Paths.get(root.toString(), line);
                if (!Files.exists(f)) {
                    logger.warn("File doesn't exist:" + f.toAbsolutePath());
                    line = nextLine();
                    continue;
                }
                if (Files.isDirectory(f)) {
                    logger.warn("File is a directory:" + f.toAbsolutePath());
                    line = nextLine();
                    continue;
                }
                tryToAdd(new FSFileResource(root, f));
                line = nextLine();
            }
        } finally {
            // Previously the reader (and its file handle) was never released.
            closeReader();
        }
    }

    /**
     * Returns the next line of the list, or {@code null} at end of file.
     * An {@link IOException} is rethrown wrapped in a {@link RuntimeException}.
     */
    private String nextLine() {
        String line = null;
        try {
            line = reader.readLine();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return line;
    }

    /**
     * Opens the list file with the named charset, closing the underlying
     * stream again if the reader cannot be constructed.
     */
    private static BufferedReader openReader(File list, String encoding)
            throws FileNotFoundException, UnsupportedEncodingException {
        FileInputStream in = new FileInputStream(list);
        boolean opened = false;
        try {
            BufferedReader r = new BufferedReader(new InputStreamReader(in, encoding));
            opened = true;
            return r;
        } finally {
            if (!opened) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // The failure that prevented opening is the one the
                    // caller needs to see; a close error would only mask it.
                }
            }
        }
    }

    /**
     * Closes the list reader; a failure to close is logged, not thrown, so
     * it cannot mask an exception already propagating out of start().
     */
    private void closeReader() {
        try {
            reader.close();
        } catch (IOException e) {
            logger.warn("Couldn't close file list reader", e);
        }
    }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy