org.ow2.weblab.crawler.FolderCrawler Maven / Gradle / Ivy

/**
 * WEBLAB: Service oriented integration platform for media mining and intelligence applications
 * 
 * Copyright (C) 2004 - 2011 Cassidian, an EADS company
 * 
 * This library is free software; you can redistribute it and/or modify it under the terms of
 * the GNU Lesser General Public License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * 
 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License along with this
 * library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
 * Floor, Boston, MA 02110-1301 USA
 */

package org.ow2.weblab.crawler;

import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.ow2.weblab.content.api.ContentManager;
import org.ow2.weblab.core.extended.exception.WebLabCheckedException;
import org.ow2.weblab.core.extended.exception.WebLabUncheckedException;
import org.ow2.weblab.core.extended.factory.ResourceFactory;
import org.ow2.weblab.core.model.ComposedResource;
import org.ow2.weblab.core.model.Document;
import org.ow2.weblab.core.model.processing.WProcessingAnnotator;
import org.purl.dc.elements.DublinCoreAnnotator;
import org.purl.dc.terms.DCTermsAnnotator;

/**
 * Use this component to crawl a folder. This is a basic component: no threads, no complex timing, no data comparison. A real crawler could use multiple
 * instances of this component.
 * 
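 * <p>
 * A minimal usage sketch (the folder path is illustrative and assumes a default ContentManager configuration):
 * </p>
 * 
 * <pre>{@code
 * FolderCrawler crawler = new FolderCrawler("/data/to-crawl");
 * crawler.startCrawl();
 * ComposedResource batch = crawler.getCrawledDocuments(0, 50);
 * }</pre>
 * 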
 * @todo Migrate to Commons-IO
 * @author WebLab IPCC Team
 */
public class FolderCrawler {


	final protected ContentManager contentManager;


	final protected File folder;


	final protected FileFilter fileFilter;


	final protected FileFilter folderFilter;


	final protected int bufferSize = 10000;


	final protected boolean recursiveMode;


	final private List<File> crawledFiles = new ArrayList<File>();


	final private byte[] lock = new byte[0];


	protected final static String CRAWLER_ID = "crawlerFolder";


	protected final static String CRAWLER_CONTENT_ID = "crawlerFolderContent";


	// TODO The properties that allowed adding isExposedAs annotations have been removed; this part should be revised.
	// final public static String CONFIG_FILE = "FolderCrawler.config";
	//
	// public static final String EXPOSED_ROOT_PROPERTY_NAME = "exposedRoot";
	//
	// public static final String EXPOSED_AS_URI_PROPERTY_NAME = "exposedAsUri";
	//
	// final protected String exposedRoot;
	//
	// final protected String exposedAsUri;

	private static final Log LOG = LogFactory.getLog(FolderCrawler.class);


	private final static FileFilter FOLDER_FILTER = new FileFilter() {


		@Override
		public boolean accept(final File file) {
			return file.isDirectory();
		}
	};


	/**
	 * Constructor.
	 * 
	 * @param contentManager
	 *            The content manager
	 * @param folder
	 *            The folder to crawl
	 * @param fileFilter
	 *            The file filter to be used
	 * @param recursiveMode
	 *            Whether or not to crawl contained folders
	 * @param folderFilter
	 *            A filter on the folder
	 * @throws WebLabCheckedException
	 *             If one of the parameters is not correct
	 */
	public FolderCrawler(final ContentManager contentManager, final File folder, final FileFilter fileFilter, final boolean recursiveMode,
			final FileFilter folderFilter) throws WebLabCheckedException {
		super();
		if (contentManager == null) {
			throw new WebLabCheckedException("Content manager must not be null.");
		}
		this.contentManager = contentManager;
		if ((folder == null) || !folder.exists() || folder.isFile() || !folder.canRead()) {
			throw new WebLabCheckedException("Folder to crawl '" + folder + "' is invalid.");
		}
		this.folder = folder;
		this.recursiveMode = recursiveMode;
		this.fileFilter = fileFilter;
		this.folderFilter = folderFilter;

		// final Map props = PropertiesLoader.loadProperties(FolderCrawler.CONFIG_FILE);
		// this.exposedRoot = props.get(EXPOSED_ROOT_PROPERTY_NAME);
		// this.exposedAsUri = props.get(EXPOSED_AS_URI_PROPERTY_NAME);
	}


	/**
	 * Constructor.
	 * 
	 * @param contentManager
	 *            The content manager
	 * @param folder
	 *            The folder to crawl
	 * @param fileFilter
	 *            The file filter to be used
	 * @param recursiveMode
	 *            Whether or not to crawl contained folders
	 * @throws WebLabCheckedException
	 *             If one of the parameters is not correct or if the creation of
	 *             the mime info throws an exception.
	 */
	public FolderCrawler(final ContentManager contentManager, final File folder, final FileFilter fileFilter, final boolean recursiveMode)
			throws WebLabCheckedException {
		this(contentManager, folder, fileFilter, recursiveMode, FolderCrawler.FOLDER_FILTER);
	}


	/**
	 * Constructor.
	 * 
	 * @param folderToCrawl
	 *            The folder to crawl
	 * @param fileFilter
	 *            The file filter to be used
	 * @throws WebLabCheckedException
	 *             If one of the parameters is not correct or if the creation of
	 *             the mime info throws an exception.
	 */
	public FolderCrawler(final String folderToCrawl, final FileFilter fileFilter) throws WebLabCheckedException {
		this(folderToCrawl, fileFilter, false, FolderCrawler.FOLDER_FILTER);
	}


	/**
	 * Constructor.
	 * 
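	 * A sketch of a custom filter that restricts the crawl to a single extension (the ".txt" suffix is illustrative):
	 * 
	 * <pre>{@code
	 * FileFilter txtOnly = new FileFilter() {
	 *     public boolean accept(File file) {
	 *         return file.isFile() && file.getName().endsWith(".txt");
	 *     }
	 * };
	 * FolderCrawler crawler = new FolderCrawler("/data/to-crawl", txtOnly, true);
	 * }</pre>
	 * 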
	 * @param folderToCrawl
	 *            The folder to crawl
	 * @param fileFilter
	 *            The file filter to be used
	 * @param recursiveMode
	 *            Whether or not to crawl contained folders
	 * @throws WebLabCheckedException
	 *             If one of the parameters is not correct.
	 */
	public FolderCrawler(final String folderToCrawl, final FileFilter fileFilter, final boolean recursiveMode) throws WebLabCheckedException {
		this(ContentManager.getInstance(), new File(folderToCrawl), fileFilter, recursiveMode, FolderCrawler.FOLDER_FILTER);
	}


	/**
	 * Constructor.
	 * 
	 * @param folderToCrawl
	 *            The folder to crawl
	 * @param fileFilter
	 *            The file filter to be used
	 * @param recursiveMode
	 *            Whether or not to crawl contained folders
	 * @param folderFilter
	 *            The folder filter to be used
	 * @throws WebLabCheckedException
	 *             If one of the parameters is not correct.
	 */
	public FolderCrawler(final String folderToCrawl, final FileFilter fileFilter, final boolean recursiveMode, final FileFilter folderFilter)
			throws WebLabCheckedException {
		this(ContentManager.getInstance(), new File(folderToCrawl), fileFilter, recursiveMode, folderFilter);
	}


	/**
	 * Constructor. Uses a default file filter that accepts every regular file.
	 * 
	 * @param folderToCrawl
	 *            The folder to crawl
	 * @throws WebLabCheckedException
	 *             If one of the parameters is not correct.
	 */
	public FolderCrawler(final String folderToCrawl) throws WebLabCheckedException {
		this(folderToCrawl, new FileFilter() {


			@Override
			public boolean accept(final File file) {
				return file.isFile();
			}
		});
	}


	/**
	 * @return The number of files crawled.
	 */
	public int getNbFiles() {
		return this.crawledFiles.size();
	}


	/**
	 * Crawls the folder using the file filter and fills the crawled files list.
	 */
	public void startCrawl() {
		if ((this.folder == null) || (this.fileFilter == null)) {
			throw new WebLabUncheckedException("Folder to crawl and file filter should have been defined previously.");
		}
		synchronized (this.lock) {
			this.listAndAddFiles(this.folder);
		}
		FolderCrawler.LOG.info(this.crawledFiles.size() + " crawled files in FolderCrawler: " + this.toString());
		FolderCrawler.LOG.debug("Crawled files: " + this.crawledFiles);
	}


	/**
	 * @param newFolder
	 *            The folder to be crawled
	 */
	protected void listAndAddFiles(final File newFolder) {
		if (newFolder.isDirectory()) {
			FolderCrawler.LOG.debug("Add content of folder: " + newFolder.getAbsolutePath());
			final boolean trace = FolderCrawler.LOG.isTraceEnabled();

			// listFiles returns null if an I/O error prevents reading the folder
			final File[] files = newFolder.listFiles(this.fileFilter);
			if (files == null) {
				FolderCrawler.LOG.warn("Unable to list files in folder: " + newFolder.getAbsolutePath());
				return;
			}
			for (final File file : files) {
				if (!this.crawledFiles.contains(file)) {
					if (trace) {
						FolderCrawler.LOG.trace("Add file: " + file.getAbsolutePath());
					}
					this.crawledFiles.add(file);
				}
			}
			if (this.recursiveMode) {
				for (final File dir : newFolder.listFiles(this.folderFilter)) {
					this.listAndAddFiles(dir);
				}
			}
		}
	}


	/**
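	 * Builds a collection of documents from the crawled files, loading each file's content through the content manager and annotating the resulting
	 * documents. A minimal paging sketch (the batch size of 10 is illustrative; {@code crawler} has already been started):
	 * 
	 * <pre>{@code
	 * ComposedResource firstBatch = crawler.getCrawledDocuments(0, 10);
	 * ComposedResource secondBatch = crawler.getCrawledDocuments(10, 10);
	 * }</pre>
	 * 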
	 * @param offset
	 *            the starting point in the collection. If negative, 0 is used.
	 * @param limit
	 *            the maximum number of documents to return. If zero or negative, Integer.MAX_VALUE is used.
	 * @return A resource collection
	 */
	public ComposedResource getCrawledDocuments(final int offset, final int limit) {
		int theOffset = offset;
		int theLimit = limit;

		synchronized (this.lock) {
			final long time = System.currentTimeMillis();
			final ComposedResource col = ResourceFactory.createResource(FolderCrawler.CRAWLER_ID, "tempCollection-" + time, ComposedResource.class);

			if (this.crawledFiles.isEmpty()) {
				FolderCrawler.LOG.warn("Either you haven't done a startCrawl before or folder (" + this.folder + ") was empty.");
				return col;
			}

			if (theOffset >= this.crawledFiles.size()) {
				FolderCrawler.LOG.warn("Every files have already been crawled.");
				return col;
			}

			if (theOffset < 0) {
				FolderCrawler.LOG.warn("Offset was negative, 0 used instead.");
				theOffset = 0;
			}

			if (theLimit <= 0) {
				FolderCrawler.LOG.info("Limit was null or negative. Integer.MAX_VALUE will be used.");
				theLimit = Integer.MAX_VALUE;
			}

			int cpt = theOffset;
			boolean toContinue = true;
			do {
				if (cpt < this.crawledFiles.size()) {
					final File file = this.crawledFiles.get(cpt);
					if ((!file.exists()) || (!file.isFile()) || (!file.canRead())) {
					/*
					 * If the file's status changed between startCrawl and getCrawledDocuments, remove it from the list and continue the loop.
					 */
						this.crawledFiles.remove(cpt);
						FolderCrawler.LOG.warn("File (" + file + ") is not crawlable");
						continue;
					}

					final Document document = ResourceFactory.createResource(FolderCrawler.CRAWLER_ID, "file" + cpt, Document.class);

					FolderCrawler.LOG.debug("Loading file: " + file.getAbsolutePath());
					try {
						// contentUri =
						this.contentManager.writeNativeContent(new FileInputStream(file), document);
					} catch (final WebLabCheckedException wlce) {
						throw new WebLabUncheckedException("Unexpected error with content manager.", wlce);
					} catch (final FileNotFoundException e) {
						throw new WebLabUncheckedException("Cannot create an InputStream on file [" + file + "].", e);
					}

					writeWeblabAnnotations(document, file);
					
					col.getResource().add(document);
					cpt++;

					if ((cpt - theOffset) >= theLimit) {
						toContinue = false;
					}
				} else {
					toContinue = false;
				}
			} while (toContinue);
			FolderCrawler.LOG.info((this.crawledFiles.size() - cpt) + " files remaining in FolderCrawler: " + this.toString());
			return col;
		}

	}

	/**
	 * Writes the WebLab processing, Dublin Core and Dublin Core Terms annotations (gathering date, original file name and size, source path, extent and
	 * modification date) on the given document.
	 * 
	 * @param document
	 *            The document to annotate
	 * @param file
	 *            The crawled file from which the document was created
	 */
	protected void writeWeblabAnnotations(final Document document, final File file) {
		String path;
		try {
			path = file.getCanonicalPath();
		} catch (final IOException ioe) {
			FolderCrawler.LOG.warn("Unable to get canonical path of file: " + file.getAbsolutePath() + "; absolute path will be used instead.");
			path = file.getAbsolutePath();
		}
		
		// Add WebLab Processing Annotations
		final WProcessingAnnotator wpa = new WProcessingAnnotator(document);
		// wpa.writeNativeContent(contentUri); // this annotation is now added by the ContentManager
		wpa.writeGatheringDate(new Date());
		wpa.writeOriginalFileName(file.getName());
		wpa.writeOriginalFileSize(Long.valueOf(file.length()));

		// Add Dublin Core Annotation
		final DublinCoreAnnotator dca = new DublinCoreAnnotator(document);
		dca.writeSource(path);

		// Add Dublin Core Terms Annotations
		final DCTermsAnnotator dcta = new DCTermsAnnotator(document);
		dcta.writeExtent(file.length() + " bytes");
		dcta.writeModified(new Date(file.lastModified()));
	}


	@Override
	public String toString() {
		return "Folder to crawl: '" + this.folder.getAbsolutePath() + "'.";
	}
}



