
org.ow2.weblab.crawler.FolderCrawler
Use this component to crawl a folder. This is a basic component: no threads, no complex timings, no data comparison. A real crawler could use multiple instances of this component.
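A minimal usage sketch based on the constructors and accessors below (the folder path is illustrative, exception handling is omitted, and the ContentManager is assumed to be configured by the surrounding WebLab platform):

// Crawl a folder non-recursively, using the default file filter.
FolderCrawler crawler = new FolderCrawler("/data/to-crawl");
crawler.startCrawl();
System.out.println(crawler.getNbFiles() + " files found");
// Wrap the first 50 crawled files into WebLab Documents.
ComposedResource batch = crawler.getCrawledDocuments(0, 50);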
/**
* WEBLAB: Service oriented integration platform for media mining and intelligence applications
*
* Copyright (C) 2004 - 2011 Cassidian, an EADS company
*
* This library is free software; you can redistribute it and/or modify it under the terms of
* the GNU Lesser General Public License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License along with this
* library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
* Floor, Boston, MA 02110-1301 USA
*/
package org.ow2.weblab.crawler;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.ow2.weblab.content.api.ContentManager;
import org.ow2.weblab.core.extended.exception.WebLabCheckedException;
import org.ow2.weblab.core.extended.exception.WebLabUncheckedException;
import org.ow2.weblab.core.extended.factory.ResourceFactory;
import org.ow2.weblab.core.model.ComposedResource;
import org.ow2.weblab.core.model.Document;
import org.ow2.weblab.core.model.processing.WProcessingAnnotator;
import org.purl.dc.elements.DublinCoreAnnotator;
import org.purl.dc.terms.DCTermsAnnotator;
/**
* Use this component to crawl a folder. This is a basic component: no threads, no complex timings, no data comparison. A real crawler could use multiple
* instances of this component.
*
* @todo Migrate to Commons-IO
* @author WebLab IPCC Team
*/
public class FolderCrawler {
final protected ContentManager contentManager;
final protected File folder;
final protected FileFilter fileFilter;
final protected FileFilter folderFilter;
final protected int bufferSize = 10000;
final protected boolean recursiveMode;
final private List<File> crawledFiles = new ArrayList<File>();
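// A zero-length byte array serves as a cheap, dedicated lock object for the synchronized blocks below.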
final private byte[] lock = new byte[0];
protected final static String CRAWLER_ID = "crawlerFolder";
protected final static String CRAWLER_CONTENT_ID = "crawlerFolderContent";
// TODO: the properties that offered to add isExposedAs have been removed; this part should be revised
// final public static String CONFIG_FILE = "FolderCrawler.config";
//
// public static final String EXPOSED_ROOT_PROPERTY_NAME = "exposedRoot";
//
// public static final String EXPOSED_AS_URI_PROPERTY_NAME = "exposedAsUri";
//
// final protected String exposedRoot;
//
// final protected String exposedAsUri;
private static final Log LOG = LogFactory.getLog(FolderCrawler.class);
private final static FileFilter FOLDER_FILTER = new FileFilter() {
@Override
public boolean accept(final File file) {
return file.isDirectory();
}
};
/**
* Constructor
*
* @param contentManager
* The content manager
* @param folder
* The folder to crawl
* @param fileFilter
* The file filter to be used
* @param recursiveMode
* Whether or not to crawl contained folders
* @param folderFilter
* A filter on the folder
* @throws WebLabCheckedException
* If one of the parameters is not correct
*/
public FolderCrawler(final ContentManager contentManager, final File folder, final FileFilter fileFilter, final boolean recursiveMode,
final FileFilter folderFilter) throws WebLabCheckedException {
super();
if (contentManager == null) {
throw new WebLabCheckedException("Content manager must be well instanciated.");
}
this.contentManager = contentManager;
if (!folder.exists() || folder.isFile() || !folder.canRead()) {
throw new WebLabCheckedException("Folder to crawl '" + folder.getAbsolutePath() + "' is unvalid.");
}
this.folder = folder;
this.recursiveMode = recursiveMode;
this.fileFilter = fileFilter;
this.folderFilter = folderFilter;
// final Map props = PropertiesLoader.loadProperties(FolderCrawler.CONFIG_FILE);
// this.exposedRoot = props.get(EXPOSED_ROOT_PROPERTY_NAME);
// this.exposedAsUri = props.get(EXPOSED_AS_URI_PROPERTY_NAME);
}
/**
* Constructor
*
* @param contentManager
* The content manager
* @param folder
* The folder to crawl
* @param fileFilter
* The file filter to be used
* @param recursiveMode
* Whether or not to crawl contained folders
* @throws WebLabCheckedException
* If one of the parameters is not correct or if the creation of
* mimeinfo throws an exception.
*/
public FolderCrawler(final ContentManager contentManager, final File folder, final FileFilter fileFilter, final boolean recursiveMode)
throws WebLabCheckedException {
this(contentManager, folder, fileFilter, recursiveMode, FolderCrawler.FOLDER_FILTER);
}
/**
* Constructor
*
* @param folderToCrawl
* The folder to crawl
* @param fileFilter
* The file filter to be used
* @throws WebLabCheckedException
* If one of the parameters is not correct or if the creation of
* mimeinfo throws an exception.
*/
public FolderCrawler(final String folderToCrawl, final FileFilter fileFilter) throws WebLabCheckedException {
this(folderToCrawl, fileFilter, false, FolderCrawler.FOLDER_FILTER);
}
/**
* Constructor
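*
* <p>
* A hypothetical example (the path and suffix are illustrative): crawl only XML files while recursing into sub-folders.
* <pre>
* final FileFilter xmlOnly = new FileFilter() {
* public boolean accept(final File file) {
* return file.isFile() && file.getName().endsWith(".xml");
* }
* };
* final FolderCrawler crawler = new FolderCrawler("/data/in", xmlOnly, true);
* </pre>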
*
* @param folderToCrawl
* The folder to crawl
* @param fileFilter
* The file filter to be used
* @param recursiveMode
* Whether or not to crawl contained folders
* @throws WebLabCheckedException
* If one of the parameters is not correct.
*/
public FolderCrawler(final String folderToCrawl, final FileFilter fileFilter, final boolean recursiveMode) throws WebLabCheckedException {
this(ContentManager.getInstance(), new File(folderToCrawl), fileFilter, recursiveMode, FolderCrawler.FOLDER_FILTER);
}
/**
* Constructor
*
* @param folderToCrawl
* The folder to crawl
* @param fileFilter
* The file filter to be used
* @param recursiveMode
* Whether or not to crawl contained folders
* @param folderFilter
* The folder filter to be used
* @throws WebLabCheckedException
* If one of the parameters is not correct.
*/
public FolderCrawler(final String folderToCrawl, final FileFilter fileFilter, final boolean recursiveMode, final FileFilter folderFilter)
throws WebLabCheckedException {
this(ContentManager.getInstance(), new File(folderToCrawl), fileFilter, recursiveMode, folderFilter);
}
/**
* Constructor
*
* @param folderToCrawl
* The folder to crawl
* @throws WebLabCheckedException
* If the parameter is not correct.
*/
public FolderCrawler(final String folderToCrawl) throws WebLabCheckedException {
this(folderToCrawl, new FileFilter() {
@Override
public boolean accept(final File file) {
return file.isFile();
}
});
}
/**
* @return The number of files crawled.
*/
public int getNbFiles() {
return this.crawledFiles.size();
}
/**
* Crawls the folder using the file filter and fills the crawled files list.
*/
public void startCrawl() {
if ((this.folder == null) || (this.fileFilter == null)) {
throw new WebLabUncheckedException("Folder to crawl and file filter " + "should have been defined previously.");
}
synchronized (this.lock) {
this.listAndAddFiles(this.folder);
}
FolderCrawler.LOG.info(this.crawledFiles.size() + " crawled files in FolderCrawler: " + this.toString());
FolderCrawler.LOG.debug("Crawled files: " + this.folder);
}
/**
* @param newFolder
* The folder to be crawled
*/
protected void listAndAddFiles(final File newFolder) {
if (newFolder.isDirectory()) {
FolderCrawler.LOG.debug("Add content of folder: " + newFolder.getAbsolutePath());
final boolean trace = FolderCrawler.LOG.isTraceEnabled();
for (final File file : newFolder.listFiles(this.fileFilter)) {
if (!this.crawledFiles.contains(file)) {
if (trace) {
FolderCrawler.LOG.trace("Add file: " + file.getAbsolutePath());
}
this.crawledFiles.add(file);
}
}
if (this.recursiveMode) {
for (final File dir : newFolder.listFiles(this.folderFilter)) {
this.listAndAddFiles(dir);
}
}
}
}
/**
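* A hypothetical pagination loop over the crawled files (the batch size of 50 is arbitrary; startCrawl is assumed to have been called on {@code crawler} first):
* <pre>
* int offset = 0;
* ComposedResource batch;
* do {
* batch = crawler.getCrawledDocuments(offset, 50);
* offset += batch.getResource().size();
* } while (!batch.getResource().isEmpty());
* </pre>
*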
* @param offset
* the starting point in the collection. If negative, 0 is used.
* @param limit
* if negative or zero, Integer.MAX_VALUE is used.
* @return A resource collection
*/
public ComposedResource getCrawledDocuments(final int offset, final int limit) {
int theOffset = offset;
int theLimit = limit;
synchronized (this.lock) {
final long time = System.currentTimeMillis();
final ComposedResource col = ResourceFactory.createResource(FolderCrawler.CRAWLER_ID, "tempCollection-" + time, ComposedResource.class);
if (this.crawledFiles.isEmpty()) {
FolderCrawler.LOG.warn("Either you haven't done a startCrawl before or folder (" + this.folder + ") was empty.");
return col;
}
if (theOffset >= this.crawledFiles.size()) {
FolderCrawler.LOG.warn("Every files have already been crawled.");
return col;
}
if (theOffset < 0) {
FolderCrawler.LOG.warn("Offset was negative, 0 used instead.");
theOffset = 0;
}
if (theLimit <= 0) {
FolderCrawler.LOG.info("Limit was null or negative. Integer.MAX_VALUE will be used.");
theLimit = Integer.MAX_VALUE;
}
int cpt = theOffset;
boolean toContinue = true;
do {
if (cpt < this.crawledFiles.size()) {
final File file = this.crawledFiles.get(cpt);
if ((!file.exists()) || (!file.isFile()) || (!file.canRead())) {
/*
* If the file's status changed between startCrawl and getCrawledDocuments, we remove it from the list and continue the loop.
*/
this.crawledFiles.remove(cpt);
FolderCrawler.LOG.warn("File (" + file + ") is not crawlable");
continue;
}
final Document document = ResourceFactory.createResource(FolderCrawler.CRAWLER_ID, "file" + cpt, Document.class);
FolderCrawler.LOG.debug("Loading file: " + file.getAbsolutePath());
try {
// contentUri =
this.contentManager.writeNativeContent(new FileInputStream(file), document);
} catch (final WebLabCheckedException wlce) {
throw new WebLabUncheckedException("Unexpected error with content manager.", wlce);
} catch (final FileNotFoundException e) {
throw new WebLabUncheckedException("Cannot create an InputStream on file [" + file + "].", e);
}
writeWeblabAnnotations(document, file);
col.getResource().add(document);
cpt++;
if ((cpt - theOffset) >= theLimit) {
toContinue = false;
}
} else {
toContinue = false;
}
} while (toContinue);
FolderCrawler.LOG.info((this.crawledFiles.size() - cpt) + " files remaining in FolderCrawler: " + this.toString());
return col;
}
}
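/**
* Writes the WebLab processing annotations (gathering date, original file name and size), the Dublin Core source (the file path) and the Dublin Core
* Terms extent and modification date onto the document.
*
* @param document
* The document to annotate
* @param file
* The crawled file the metadata is read from
*/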
protected void writeWeblabAnnotations(final Document document, final File file) {
String path;
try {
path = file.getCanonicalPath();
} catch (final IOException ioe) {
FolderCrawler.LOG.warn("Unable to get canonical path of file: " + file.getAbsolutePath() + "; absolute path will be used instead.");
path = file.getAbsolutePath();
}
// Add WebLab Processing Annotations
final WProcessingAnnotator wpa = new WProcessingAnnotator(document);
// wpa.writeNativeContent(contentUri); // this annotation is now added by the ContentManager
wpa.writeGatheringDate(new Date());
wpa.writeOriginalFileName(file.getName());
wpa.writeOriginalFileSize(Long.valueOf(file.length()));
// Add Dublin Core Annotation
final DublinCoreAnnotator dca = new DublinCoreAnnotator(document);
dca.writeSource(path);
// Add Dublin Core Terms Annotations
final DCTermsAnnotator dcta = new DCTermsAnnotator(document);
dcta.writeExtent(file.length() + " bytes");
dcta.writeModified(new Date(file.lastModified()));
}
@Override
public String toString() {
return "Folder to crawl: '" + this.folder.getAbsolutePath() + "'.";
}
}