// org.apache.tika.batch.FileResourceCrawler (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.batch;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.TimeUnit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.metadata.Metadata;
/**
 * Base class for crawlers that discover {@link FileResource}s and hand them to
 * consumers through a shared, bounded queue.
 * <p>
 * Subclasses implement {@link #start()} and call {@link #tryToAdd(FileResource)}
 * for every candidate resource. When {@link #start()} finishes (normally or not),
 * {@link #call()} tries to add one {@link PoisonFileResource} per consumer to the queue.
 * <p>
 * NOTE(review): {@code Callable} is left raw on purpose; the type argument expected
 * by callers is not determinable from this file.
 */
public abstract class FileResourceCrawler implements Callable {

    protected static final Logger LOG = LoggerFactory.getLogger(FileResourceCrawler.class);

    /** Status returned by {@link #tryToAdd}: the resource was rejected by the selector. */
    protected final static int SKIPPED = 0;
    /** Status returned by {@link #tryToAdd}: the resource was added to the queue. */
    protected final static int ADDED = 1;
    /** Status returned by {@link #tryToAdd}: a configured limit was reached; stop crawling. */
    protected final static int STOP_NOW = 2;

    //how long to pause if can't add to queue
    private static final long PAUSE_INCREMENT_MILLIS = 1000;

    private final ArrayBlockingQueue<FileResource> queue;
    private final int numConsumers;

    private volatile boolean hasCompletedCrawling = false;
    private volatile boolean shutDownNoPoison = false;
    private volatile boolean isActive = true;
    private volatile boolean timedOut = false;

    private int maxFilesToAdd = -1;
    private int maxFilesToConsider = -1;
    private long maxConsecWaitInMillis = 300000;//300,000ms = 5 minutes
    private DocumentSelector documentSelector = null;

    //number of files added to queue
    private int added = 0;
    //number of files considered including those that were rejected by documentSelector
    private int considered = 0;

    /**
     * @param queue        shared queue
     * @param numConsumers number of consumers (needs to know how many poisons to add when done)
     */
    public FileResourceCrawler(ArrayBlockingQueue<FileResource> queue, int numConsumers) {
        this.queue = queue;
        this.numConsumers = numConsumers;
    }

    /**
     * Implement this to control the addition of FileResources. Call {@link #tryToAdd}
     * to add FileResources to the queue.
     *
     * @throws InterruptedException if the crawl is interrupted or times out
     */
    public abstract void start() throws InterruptedException;

    /**
     * Runs {@link #start()}, marks the crawler inactive, then tries to add poison
     * to the queue so that consumers can stop.
     *
     * @return the number of resources considered and added by this crawler
     */
    @Override
    public FileResourceCrawlerFutureResult call() {
        try {
            start();
        } catch (InterruptedException e) {
            //this can be triggered by shutdownNow in BatchProcess
            //The interrupt flag is deliberately not restored here: shutdown()
            //still has to block on the queue to deliver the poison.
            LOG.info("InterruptedException in FileCrawler", e);
        } catch (Exception e) {
            LOG.error("Exception in FileResourceCrawler: {}", e.getMessage(), e);
        } finally {
            isActive = false;
        }

        try {
            shutdown();
        } catch (InterruptedException e) {
            //nothing more to do for the queue, but keep the interrupt visible
            //to whoever is running this task
            LOG.debug("interrupted while trying to add poison", e);
            Thread.currentThread().interrupt();
        }
        return new FileResourceCrawlerFutureResult(considered, added);
    }

    /**
     * Offers a resource to the queue if the document selector accepts it, waiting
     * (in {@value #PAUSE_INCREMENT_MILLIS} ms increments) while the queue is full.
     *
     * @param fileResource resource to add
     * @return int status of the attempt (SKIPPED, ADDED, STOP_NOW) to add the resource to the queue.
     * @throws InterruptedException if the thread is interrupted, or if the crawler had to wait
     *                              longer than the configured maximum for space in the queue
     */
    protected int tryToAdd(FileResource fileResource) throws InterruptedException {
        if (maxFilesToAdd > -1 && added >= maxFilesToAdd) {
            return STOP_NOW;
        }
        //">=" so that at most maxFilesToConsider files are considered,
        //mirroring the maxFilesToAdd check above
        if (maxFilesToConsider > -1 && considered >= maxFilesToConsider) {
            return STOP_NOW;
        }

        boolean isAdded = false;
        if (select(fileResource.getMetadata())) {
            long start = System.currentTimeMillis();
            while (!queue.offer(fileResource, PAUSE_INCREMENT_MILLIS, TimeUnit.MILLISECONDS)) {
                long elapsed = System.currentTimeMillis() - start;
                LOG.info("FileResourceCrawler is pausing. Queue is full: {} after {} ms",
                        queue.size(), elapsed);
                if (maxConsecWaitInMillis > -1 && elapsed > maxConsecWaitInMillis) {
                    timedOut = true;
                    String msg = "FileResourceCrawler had to wait longer (" + elapsed +
                            " ms) than allowed (" + maxConsecWaitInMillis + " ms)";
                    LOG.error(msg);
                    throw new InterruptedException(msg);
                }
                if (Thread.currentThread().isInterrupted()) {
                    LOG.info("FileResourceCrawler shutting down because of interrupted thread.");
                    throw new InterruptedException("FileResourceCrawler interrupted.");
                }
            }
            isAdded = true;
            added++;
        } else {
            LOG.debug("crawler did not select: {}", fileResource.getResourceId());
        }
        considered++;
        return (isAdded) ? ADDED : SKIPPED;
    }

    /**
     * Adds one {@link PoisonFileResource} per consumer to the queue, retrying while
     * the queue is full.
     * <p>
     * Gives up (without marking crawling as completed) if {@link #shutDownNoPoison()}
     * was called, the thread is interrupted, or the total time spent here exceeds
     * the configured maximum wait.
     * <p>
     * Warning! If the maximum wait is negative (no limit), this can try forever
     * in vain to add poison to a queue that nobody is draining.
     *
     * @throws InterruptedException if interrupted while blocked on the queue
     */
    private void shutdown() throws InterruptedException {
        LOG.debug("FileResourceCrawler entering shutdown");
        if (hasCompletedCrawling || shutDownNoPoison) {
            return;
        }

        int poisonsAdded = 0;
        long start = System.currentTimeMillis();
        while (poisonsAdded < numConsumers) {
            if (shutDownNoPoison) {
                LOG.debug("quitting the poison loop because shutDownNoPoison is now true");
                return;
            }
            if (Thread.currentThread().isInterrupted()) {
                LOG.debug("thread interrupted while trying to add poison");
                return;
            }
            if (queue.offer(new PoisonFileResource(), 1L, TimeUnit.SECONDS)) {
                poisonsAdded++;
                LOG.debug("added {} number of PoisonFileResource(s)", poisonsAdded);
                continue;
            }
            //queue was full for the whole offer timeout; retry unless out of time
            long elapsed = System.currentTimeMillis() - start;
            if (maxConsecWaitInMillis > -1 && elapsed > maxConsecWaitInMillis) {
                LOG.error("Crawler timed out while trying to add poison");
                return;
            }
        }
        hasCompletedCrawling = true;
    }

    /**
     * If the crawler stops for any reason, it is no longer active.
     *
     * @return whether crawler is active or not
     */
    public boolean isActive() {
        return isActive;
    }

    /**
     * @param maxConsecWaitInMillis maximum time in milliseconds to keep waiting for space
     *                              in the queue; a negative value disables the limit
     */
    public void setMaxConsecWaitInMillis(long maxConsecWaitInMillis) {
        this.maxConsecWaitInMillis = maxConsecWaitInMillis;
    }

    /**
     * @param documentSelector selector that decides which resources are added to the queue;
     *                         if {@code null}, every resource is selected
     */
    public void setDocumentSelector(DocumentSelector documentSelector) {
        this.documentSelector = documentSelector;
    }

    /**
     * @return number of files considered so far, including those rejected by the selector
     */
    public int getConsidered() {
        return considered;
    }

    /**
     * Decides whether a resource with the given metadata should be added to the queue.
     *
     * @param m metadata of the candidate resource
     * @return {@code true} if no document selector is set or the selector accepts {@code m}
     */
    protected boolean select(Metadata m) {
        //read the field once; it is null until setDocumentSelector is called
        DocumentSelector selector = documentSelector;
        return selector == null || selector.select(m);
    }

    /**
     * Maximum number of files to add. If {@code maxFilesToAdd} &lt; 0 (default),
     * then this crawler will add all documents.
     *
     * @param maxFilesToAdd maximum number of files to add to the queue
     */
    public void setMaxFilesToAdd(int maxFilesToAdd) {
        this.maxFilesToAdd = maxFilesToAdd;
    }

    /**
     * Maximum number of files to consider. A file is considered
     * whether or not the DocumentSelector selects a document.
     * <p>
     * If {@code maxFilesToConsider} &lt; 0 (default), then this crawler
     * will consider all documents.
     *
     * @param maxFilesToConsider maximum number of files to consider adding to the queue
     */
    public void setMaxFilesToConsider(int maxFilesToConsider) {
        this.maxFilesToConsider = maxFilesToConsider;
    }

    /**
     * Use sparingly. This iterates over the queue while holding the queue object's monitor.
     * <p>
     * NOTE(review): the monitor only excludes other code that also synchronizes on the
     * queue object; treat the result as a point-in-time snapshot.
     *
     * @return whether this queue contains no non-poison file resources
     */
    public boolean isQueueEmpty() {
        synchronized (queue) {
            for (FileResource resource : queue) {
                if (!(resource instanceof PoisonFileResource)) {
                    //one real resource is enough to answer; no need to count the rest
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Returns whether the crawler timed out while trying to add a resource
     * to the queue.
     * <p>
     * If the crawler timed out while trying to add poison, this is not
     * set to true.
     *
     * @return whether this was timed out or not
     */
    public boolean wasTimedOut() {
        return timedOut;
    }

    /**
     * @return number of files that this crawler added to the queue
     */
    public int getAdded() {
        return added;
    }

    /**
     * Set to true to shut down the FileResourceCrawler without
     * adding poison. Do this only if you've already called another mechanism
     * to request that consumers shut down. This prevents a potential deadlock issue
     * where the crawler is trying to add to the queue, but it is full.
     */
    public void shutDownNoPoison() {
        this.shutDownNoPoison = true;
    }
}