com.googlecode.fascinator.harvester.filesystem.FileSystemHarvester Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of plugin-harvester-filesystem Show documentation
Performs a batch harvest of any filesystem that can support Java
The newest version!
/* 
 * The Fascinator - File System Harvester Plugin
 * Copyright (C) 2009-2011 University of Southern Queensland
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
package com.googlecode.fascinator.harvester.filesystem;

import java.io.File;
import java.io.FileFilter;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.Set;
import java.util.Stack;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.googlecode.fascinator.api.harvester.HarvesterException;
import com.googlecode.fascinator.api.storage.DigitalObject;
import com.googlecode.fascinator.api.storage.StorageException;
import com.googlecode.fascinator.common.JsonSimple;
import com.googlecode.fascinator.common.harvester.impl.GenericHarvester;
import com.googlecode.fascinator.common.storage.StorageUtils;

/**
 * 
 * This plugin harvests files in a specified directory or a specified file on
 * the local file system. It can use a cache to do incremental harvests, which
 * only harvest files that have changed since the last time it was run.
 * 
 * 
 * Configuration
 * 
 * Sample configuration file for file system harvester: local-files.json
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * <table border="1">
 * <tr><th>Option</th><th>Description</th><th>Required</th><th>Default</th></tr>
 * <tr><td>baseDir</td><td>Path of directory or file to be harvested</td>
 * <td>Yes</td><td>${user.home}/Documents/public/</td></tr>
 * <tr><td>facetDir</td><td>Used to specify the top level directory for the
 * file_path facet</td><td>No</td><td>${user.home}/Documents/public/</td></tr>
 * <tr><td>ignoreFilter</td><td>Pipe-separated ('|') list of filename patterns
 * to ignore</td><td>No</td><td>.svn|.ice|.*|~*|Thumbs.db|.DS_Store</td></tr>
 * <tr><td>recursive</td><td>Set true to harvest files recursively</td>
 * <td>No</td><td>true</td></tr>
 * <tr><td>force</td><td>Force harvest the specified directory or file again
 * even when it's not modified (ignore cache)</td><td>No</td><td>false</td></tr>
 * <tr><td>link</td><td>Store the digital object as a link in the storage and
 * point to the original file in the file system</td><td>No</td><td>true</td></tr>
 * <tr><td>caching</td><td>Caching method to use. Valid entries are 'basic' and
 * 'hashed'</td><td>No</td><td>null</td></tr>
 * <tr><td>cacheId</td><td>The cache ID to use in the database if caching is in
 * use.</td><td>Yes (if valid 'caching' value is provided)</td><td>null</td></tr>
 * <tr><td>derbyHome</td><td>Path to use for the file store of the database.
 * Should match other Derby paths provided in the configuration file for the
 * application.</td><td>Yes (if valid 'caching' value is provided)</td>
 * <td>null</td></tr>
 * </table>
 * 
 * Caching
 * 
 * With regard to the underlying cache you have three options for
 * configuration:
 * 
 * No caching: All files will always be harvested. Be aware that without
 * caching there is no support for deletion.
 * Basic caching: The file is considered 'cached' if the last
 * modified date matches the database entry. On some operating systems (like
 * linux) this can provide a minimum of around 2 seconds of granularity. For
 * most purposes this is sufficient, and this cache is the most efficient.
 * Hashed caching: The entire contents of the file are SHA hashed and
 * the hash is stored in the database. The file is considered cached if the old
 * hash matches the new hash. This approach will only trigger a harvest if the
 * contents of the file really change, but it is quite slow across large data
 * sets and large files.
 * 
 * Deletion support is provided by any configured cache. After the standard
 * harvest is performed, any 'stale' cache entries are considered to be targets
 * for deletion. This is why the 'cacheId' is particularly important, because
 * you don't want cache entries from a different harvest configuration getting
 * deleted.
 * 
 * Examples
 * 
 * 
 * Harvesting the ${user.home}/Documents/public/ directory recursively. Files
 * whose names match a pattern specified in the ignoreFilter are ignored. The
 * harvest includes files in subdirectories, and does not re-harvest an
 * unmodified file if it exists in the cache database under the 'default'
 * cache.
 * 
 *  *   "harvester": {
 *      "type": "file-system",
 *      "file-system": {
 *          "targets": [
 *              {
 *                  "baseDir": "${user.home}/Documents/public/",
 *                  "facetDir": "${user.home}/Documents/public/",
 *                  "ignoreFilter": ".svn|.ice|.*|~*|Thumbs.db|.DS_Store",
 *                  "recursive": true,
 *                  "force": false,
 *                  "link": true
 *              }
 *          ],
 *          "caching": "basic",
 *          "cacheId": "default",
 *          "derbyHome" : "${fascinator.home}/database"
 *      }
 *  }
 * 
 * 
 * 
 * 
 * 
 * Rule file
 * 
 * Sample rule file for the file system harvester: local-files.py
 * 
 * 
 * Wiki Link
 * 
 * None
 * 
 * 
 * @author Oliver Lucido
 */
public class FileSystemHarvester extends GenericHarvester {

    /** default ignore list */
    private static final String DEFAULT_IGNORE_PATTERNS = ".svn";

    /** logging */
    private Logger log = LoggerFactory.getLogger(FileSystemHarvester.class);

    /** Harvesting targets */
    private List targets;

    /** Target index */
    private Integer targetIndex;

    /** Target index */
    private File nextFile;

    /** Stack of queued files to harvest */
    private Stack fileStack;

    /** Path data for facet */
    private String facetBase;

    /** whether or not there are more files to harvest */
    private boolean hasMore;

    /** filter used to ignore files matching specified patterns */
    private IgnoreFilter ignoreFilter;

    /** whether or not to recursively harvest */
    private boolean recursive;

    /** force harvesting all files */
    private boolean force;

    /** use links instead of copying */
    private boolean link;

    /** Render chains */
    private Map>> renderChains;

    /** Caching */
    private DerbyCache cache;

    /** Delete Support? */
    private boolean supportDeletes;

    /**
     * File filter used to ignore specified files
     */
    private class IgnoreFilter implements FileFilter {

        /** wildcard patterns of files to ignore */
        private String[] patterns;

        public IgnoreFilter(String[] patterns) {
            this.patterns = patterns;
        }

        @Override
        public boolean accept(File path) {
            for (String pattern : patterns) {
                if (FilenameUtils.wildcardMatch(path.getName(), pattern)) {
                    return false;
                }
            }
            return true;
        }
    }

    /**
     * File System Harvester Constructor
     */
    public FileSystemHarvester() {
        super("file-system", "File System Harvester");
    }

    /**
     * Initialisation of File system harvester plugin
     * 
     * @throws HarvesterException if fails to initialise
     */
    @Override
    public void init() throws HarvesterException {
        // Check for valid targests
        targets = getJsonConfig().getJsonSimpleList("harvester", "file-system",
                "targets");
        if (targets.isEmpty()) {
            throw new HarvesterException("No targets specified");
        }

        // Loop processing variables
        fileStack = new Stack();
        targetIndex = null;
        hasMore = true;

        // Caching
        try {
            cache = new DerbyCache(getJsonConfig());
            // Reset flags for deletion support
            cache.resetFlags();
            // But don't support deletes until we've seen 'add' traffic
            // otherwise the flags will all be unset and everything will
            // be flagged for deletion
            supportDeletes = false;
        } catch (Exception ex) {
            log.error("Error instantiating cache: ", ex);
            throw new HarvesterException(ex);
        }

        // Rendering: Order is significant
        renderChains = new LinkedHashMap>>();
        Map renderTypes = getJsonConfig().getJsonSimpleMap(
                "renderTypes");
        if (renderTypes != null) {
            for (Entry entry : renderTypes.entrySet()) {
                Map> details = new HashMap>();
                details.put("fileTypes",
                        entry.getValue().getStringList("fileTypes"));
                details.put("harvestQueue",
                        entry.getValue().getStringList("harvestQueue"));
                details.put("indexOnHarvest",
                        entry.getValue().getStringList("indexOnHarvest"));
                details.put("renderQueue",
                        entry.getValue().getStringList("renderQueue"));
                renderChains.put(entry.getKey(), details);
            }
        }

        // Prep the first file
        nextFile = getNextFile();
    }

    /**
     * Get the next file due to be harvested
     * 
     * @return The next file to harvest, null if none
     */
    private File getNextFile() {
        File next = null;
        if (fileStack.empty()) {
            next = getNextTarget();
        } else {
            next = fileStack.pop();
        }
        if (next == null) {
            hasMore = false;
        }
        return next;
    }

    /**
     * Retrieve the next file specified as a target in configuration
     * 
     * @return The next target file, null if none
     */
    private File getNextTarget() {
        // First execution
        if (targetIndex == null) {
            targetIndex = Integer.valueOf(0);
        } else {
            targetIndex++;
        }

        // We're finished
        if (targetIndex >= targets.size()) {
            return null;
        }

        // Get the next target
        JsonSimple target = targets.get(targetIndex);
        String path = target.getString(null, "baseDir");
        if (path == null) {
            log.warn("No path provided for target, skipping!");
            return getNextTarget();

        } else {
            File file = new File(path);
            if (!file.exists()) {
                log.warn("Path '{}' does not exist, skipping!", path);
                return getNextTarget();

            } else {
                log.info("Target file/directory found: '{}'", path);
                updateConfig(target, path);
                return file;
            }
        }
    }

    /**
     * Update harvest configuration when switching target path
     * 
     * @param tConfig The target configuration
     * @param path The path to the target (used as default facet)
     */
    private void updateConfig(JsonSimple tConfig, String path) {
        recursive = tConfig.getBoolean(false, "recursive");
        ignoreFilter = new IgnoreFilter(tConfig.getString(
                DEFAULT_IGNORE_PATTERNS, "ignoreFilter").split("\\|"));
        force = tConfig.getBoolean(false, "force");
        link = tConfig.getBoolean(false, "link");
        facetBase = tConfig.getString(path, "facetDir");
    }

    /**
     * Shutdown the plugin
     * 
     * @throws HarvesterException is there are errors
     */
    @Override
    public void shutdown() throws HarvesterException {
        if (cache != null) {
            try {
                cache.shutdown();
            } catch (Exception ex) {
                log.error("Error shutting down cache: ", ex);
                throw new HarvesterException(ex);
            }
        }
    }

    /**
     * Harvest the next set of files, and return their Object IDs
     * 
     * @return Set The set of object IDs just harvested
     * @throws HarvesterException is there are errors
     */
    @Override
    public Set getObjectIdList() throws HarvesterException {
        Set fileObjectIdList = new HashSet();

        // We had no valid targets
        if (nextFile == null) {
            hasMore = false;
            return fileObjectIdList;
        }

        // Normal logic
        if (nextFile.isDirectory()) {
            File[] children = nextFile.listFiles(ignoreFilter);
            for (File child : children) {
                if (child.isDirectory()) {
                    if (recursive) {
                        fileStack.push(child);
                    }
                } else {
                    harvestFile(fileObjectIdList, child);
                }
            }

        } else {
            harvestFile(fileObjectIdList, nextFile);
        }

        // Progess the stack and return
        nextFile = getNextFile();
        return fileObjectIdList;
    }

    /**
     * Harvest a file based on configuration
     * 
     * @param list The set of harvested IDs to add to
     * @param file The file to harvest
     * @throws HarvesterException is there are errors
     */
    private void harvestFile(Set list, File file)
            throws HarvesterException {
        // What OID will be used ID we did store this?
        String oid = StorageUtils.generateOid(file);
        // Check if it is in the cache, make sure the cache call come before
        // 'force' in the boolean OR so that the cache entry is 'touched'
        if (cache.hasChanged(oid, file) || force) {
            try {
                list.add(createDigitalObject(file));
            } catch (StorageException se) {
                log.warn("File not harvested {}: {}", file, se.getMessage());
            }
        }
    }

    /**
     * Check if there are more objects to harvest
     * 
     * @return true if there are more, false otherwise
     */
    @Override
    public boolean hasMoreObjects() {
        if (!hasMore) {
            // 'Add' harvesting must be run through to completeion before we
            // support deletes.
            supportDeletes = true;
        }
        return hasMore;
    }

    /**
     * Delete cached references to files which no longer exist and return the
     * set of IDs to delete from the system.
     * 
     * @return Set The set of object IDs deleted
     * @throws HarvesterException is there are errors
     */
    @Override
    public Set getDeletedObjectIdList() throws HarvesterException {
        if (!supportDeletes) {
            String msg = "This plugin only supports deletion if caching is"
                    + " enabled and all 'add' and 'update' harvesting has been"
                    + " processed first. Please ensure caching is configured"
                    + " correctly and that harvesting has continued until"
                    + " hasMoreObjects() returns false. ";
            throw new HarvesterException(msg);
        }

        // Make sure we don't get called twice
        supportDeletes = false;
        // Get our response data
        Set response = cache.getUnsetFlags();
        // Clean up the cache
        cache.purgeUnsetFlags();

        return response;
    }

    /**
     * Check if there are more objects to delete
     * 
     * @return true if there are more, false otherwise
     */
    @Override
    public boolean hasMoreDeletedObjects() {
        return supportDeletes;
    }

    /**
     * Create digital object
     * 
     * @param file File to be transformed to be digital object
     * @return object id of created digital object
     * @throws HarvesterException if fail to create the object
     * @throws StorageException if fail to save the file to the storage
     */
    private String createDigitalObject(File file) throws HarvesterException,
            StorageException {
        DigitalObject object = StorageUtils.storeFile(getStorage(), file, link);

        // update object metadata
        Properties props = object.getMetadata();
        props.setProperty("render-pending", "true");
        props.setProperty("file.path",
                FilenameUtils.separatorsToUnix(file.getAbsolutePath()));
        props.setProperty("base.file.path",
                FilenameUtils.separatorsToUnix(facetBase));

        // Store rendition information if we have it
        String ext = FilenameUtils.getExtension(file.getName());
        for (String chain : renderChains.keySet()) {
            Map> details = renderChains.get(chain);
            if (details.get("fileTypes").contains(ext)) {
                storeList(props, details, "harvestQueue");
                storeList(props, details, "indexOnHarvest");
                storeList(props, details, "renderQueue");
            }
        }

        object.close();
        return object.getId();
    }

    /**
     * Take a list of strings from a Java Map, concatenate the values together
     * and store them in a Properties object using the Map's original key.
     * 
     * @param props Properties object to store into
     * @param details The full Java Map
     * @param field The key to use in both objects
     */
    private void storeList(Properties props, Map> details,
            String field) {
        Set valueSet = new LinkedHashSet();
        // merge with original property value if exists
        String currentValue = props.getProperty(field, "");
        if (!"".equals(currentValue)) {
            String[] currentList = currentValue.split(",");
            valueSet.addAll(Arrays.asList(currentList));
        }
        valueSet.addAll(details.get(field));
        String joinedList = StringUtils.join(valueSet, ",");
        props.setProperty(field, joinedList);
    }
}
Option	Description	Required	Default
baseDir	Path of directory or file to be harvested	Yes	${user.home}/Documents/public/
facetDir	Used to specify the top level directory for the file_path facet	No	${user.home}/Documents/public/
ignoreFilter	Pipe-separated ('\|') list of filename patterns to ignore	No	.svn\|.ice\|.\|~\|Thumbs.db\|.DS_Store
recursive	Set true to harvest files recursively	No	true
force	Force harvest the specified directory or file again even when it's not * modified (ignore cache)	No	false
link	Store the digital object as a link in the storage and point to the * original file in the file system	No	true
caching	Caching method to use. Valid entries are 'basic' and 'hashed'	No	null
cacheId	The cache ID to use in the database if caching is in use.	Yes (if valid 'caching' value is provided)	null
derbyHome	Path to use for the file store of the database. Should match other Derby * paths provided in the configuration file for the application.	Yes (if valid 'caching' value is provided)	null