All downloads are free. Search and download functionality uses the official Maven repository.

org.duracloud.chunk.FileChunker Maven / Gradle / Ivy

The newest version!
/*
 * The contents of this file are subject to the license and copyright
 * detailed in the LICENSE and NOTICE files at the root of the source
 * tree and available online at
 *
 *     http://duracloud.org/license/
 */
package org.duracloud.chunk;

import java.io.BufferedInputStream;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.security.DigestInputStream;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Random;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.filefilter.IOFileFilter;
import org.apache.commons.io.input.AutoCloseInputStream;
import org.duracloud.chunk.error.NotFoundException;
import org.duracloud.chunk.stream.ChunkInputStream;
import org.duracloud.chunk.writer.AddContentResult;
import org.duracloud.chunk.writer.ContentWriter;
import org.duracloud.common.error.DuraCloudRuntimeException;
import org.duracloud.common.util.ChecksumUtil;
import org.duracloud.common.util.ExceptionUtil;
import org.duracloud.storage.util.StorageProviderUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This class provides the ability to loop over a directory of content which
 * may include files over 5-GB in size, chunk, and push them to a DataStore
 * encapsulated by the member: ContentWriter.
 * A ContentWriter may push to any DataStore, such as:
 * - a filesystem or
 * - a DuraCloud space
 * <p>
 * Content whose size does not exceed {@code options.getMaxChunkSize()} is
 * written as a single item; larger content is either chunked or, when
 * {@code options.isIgnoreLargeFiles()} is set, reported to the writer as
 * ignored.
 *
 * @author Andrew Woods
 *         Date: Feb 4, 2010
 */
public class FileChunker {

    private static final Logger log =
        LoggerFactory.getLogger(FileChunker.class);

    private final ContentWriter contentWriter;
    private final FileChunkerOptions options;

    /**
     * Creates a chunker that uses default {@link FileChunkerOptions}.
     *
     * @param contentWriter destination to which content is pushed
     */
    public FileChunker(
        ContentWriter contentWriter) {
        this(contentWriter, new FileChunkerOptions());
    }

    /**
     * Creates a chunker with explicit options.
     *
     * @param contentWriter destination to which content is pushed
     * @param options       chunk size, filters and large-file handling
     */
    public FileChunker(ContentWriter contentWriter,
                       FileChunkerOptions options) {
        this.contentWriter = contentWriter;
        this.options = options;
    }

    /**
     * Appends one CSV line per result held by the ContentWriter to the arg
     * outputFile. A header line is written first only when the file does not
     * yet exist. The report is encoded as UTF-8.
     *
     * @param outputFile report file; created if missing, appended to otherwise
     * @throws DuraCloudRuntimeException if the file cannot be opened or
     *                                   written
     */
    protected void writeReport(File outputFile) {
        StringBuilder sb = new StringBuilder();
        // Checked before the stream is opened: opening the stream in append
        // mode creates the file, which would always suppress the header.
        if (!outputFile.exists()) {
            sb.append("spaceId,contentId,md5,size,state\n");
        }

        // The report text is assembled before the file is opened so that a
        // failure while collecting results cannot leak an open stream.
        List<AddContentResult> results = contentWriter.getResults();
        for (AddContentResult result : results) {
            sb.append(result.getSpaceId());
            sb.append(",");
            sb.append(result.getContentId());
            sb.append(",");
            sb.append(result.getMd5());
            sb.append(",");
            sb.append(result.getContentSize());
            sb.append(",");
            sb.append(result.getState().name());
            sb.append("\n");
        }

        // try-with-resources: a failure on close is reported instead of
        // being silently dropped.
        try (OutputStream outputStream = getOutputStream(outputFile)) {
            outputStream.write(
                sb.toString().getBytes(StandardCharsets.UTF_8));
        } catch (IOException e) {
            throw new DuraCloudRuntimeException(e);
        }
    }

    /**
     * Opens the arg file for appending.
     *
     * @param outputFile file to open
     * @return stream positioned at the end of the file
     * @throws DuraCloudRuntimeException if the file cannot be opened
     */
    private OutputStream getOutputStream(File outputFile) {
        boolean append = true;
        try {
            return new FileOutputStream(outputFile, append);
        } catch (IOException e) {
            throw new DuraCloudRuntimeException(e);
        }
    }

    /**
     * This method pushes the content file to the space destSpaceId with the
     * content ID destContentId
     *
     * @param destSpaceId   of content destination
     * @param destContentId of content
     * @param fileChecksum  MD5 checksum of file or null if not known
     * @param file          to add
     * @throws DuraCloudRuntimeException if the file cannot be opened, the
     *                                   destination is not found, or the
     *                                   checksum of chunked content does not
     *                                   match fileChecksum
     */
    public void addContent(String destSpaceId,
                           String destContentId,
                           String fileChecksum,
                           File file) {
        Map<String, String> properties =
            StorageProviderUtil.createContentProperties(
                file.getAbsolutePath(),
                null);
        addContent(destSpaceId,
                   destContentId,
                   fileChecksum,
                   file.length(),
                   getInputStream(file),
                   properties);
    }

    /**
     * This method pushes the content stream to the space destSpaceId with the
     * content ID destContentId. The arg stream is closed before this method
     * returns, whether or not the push succeeds.
     *
     * @param destSpaceId   of content destination
     * @param destContentId of content
     * @param fileChecksum  MD5 checksum of file or null if not known
     * @param fileSize      number of bytes available from the stream
     * @param stream        to add
     * @param properties    user-defined properties associated with content
     * @throws DuraCloudRuntimeException if the destination is not found or
     *                                   the checksum of chunked content does
     *                                   not match fileChecksum
     */
    public void addContent(String destSpaceId,
                           String destContentId,
                           String fileChecksum,
                           long fileSize,
                           InputStream stream,
                           Map<String, String> properties) {
        try {
            doAddContent(destSpaceId,
                         destContentId,
                         fileChecksum,
                         fileSize,
                         getInputStream(stream),
                         properties);
        } catch (NotFoundException e) {
            throw new DuraCloudRuntimeException(e);
        }
    }

    /**
     * This method loops the arg baseDir and pushes the found content to the
     * arg destSpace. A failure on one file is logged and does not stop the
     * remaining files from being processed.
     *
     * @param baseDir     of content to push to DataStore
     * @param destSpaceId of content destination
     * @throws DuraCloudRuntimeException if baseDir is not a directory or no
     *                                   files match the configured filters
     */
    protected void addContentFrom(File baseDir, String destSpaceId) {

        Collection<File> files = listFiles(baseDir,
                                           options.getFileFilter(),
                                           options.getDirFilter());
        for (File file : files) {
            try {
                doAddContent(baseDir, destSpaceId, file);

            } catch (Exception e) {
                // Deliberately broad: one bad file must not abort the batch.
                String msg = "Error: " +
                             "Unable to addContentFrom [" +
                             baseDir +
                             ", " +
                             destSpaceId +
                             "] : " +
                             e.getMessage() +
                             "\n" +
                             ExceptionUtil.getStackTraceAsString(e);
                log.error(msg);
            }
        }
    }

    /**
     * Pushes a single file found under baseDir, deriving its content ID from
     * its path relative to baseDir. No checksum is supplied for verification.
     *
     * @param baseDir     directory the file was found under
     * @param destSpaceId of content destination
     * @param file        to add
     * @throws NotFoundException as thrown by the ContentWriter
     */
    private void doAddContent(File baseDir, String destSpaceId, File file)
        throws NotFoundException {
        Map<String, String> properties =
            StorageProviderUtil.createContentProperties(file.getAbsolutePath(),
                                                        null);
        // Resolved before the stream is opened so that an invalid path
        // cannot leave an open file handle behind.
        String destContentId = getContentId(baseDir, file);
        InputStream stream = getInputStream(file);
        doAddContent(destSpaceId,
                     destContentId,
                     null,
                     file.length(),
                     stream,
                     properties);
    }

    /**
     * Writes the arg stream as a single item, as chunks, or marks it as
     * ignored, depending on fileSize and the configured options. The arg
     * stream is always closed on exit, including when an exception is thrown.
     *
     * @param destSpaceId   of content destination
     * @param destContentId of content
     * @param fileChecksum  MD5 checksum of content or null if not known
     * @param fileSize      number of bytes available from the stream
     * @param stream        content to write
     * @param properties    user-defined properties associated with content
     * @throws NotFoundException         as thrown by the ContentWriter
     * @throws DuraCloudRuntimeException if the checksum recorded in the chunk
     *                                   manifest differs from fileChecksum
     */
    private void doAddContent(String destSpaceId,
                              String destContentId,
                              String fileChecksum,
                              long fileSize,
                              InputStream stream,
                              Map<String, String> properties)
        throws NotFoundException {
        long maxChunkSize = options.getMaxChunkSize();
        boolean ignoreLargeFiles = options.isIgnoreLargeFiles();
        boolean preserveChunkMD5s = options.isPreserveChunkMD5s();

        try {
            log.debug("loading file: {}[{}]", destContentId, fileSize);
            if (fileSize <= maxChunkSize) {
                BufferedInputStream buffStream =
                    new BufferedInputStream(stream);
                ChunkInputStream chunk = new ChunkInputStream(destContentId,
                                                              buffStream,
                                                              fileSize,
                                                              false);

                contentWriter.writeSingle(destSpaceId,
                                          fileChecksum,
                                          chunk,
                                          properties);

            } else if (!ignoreLargeFiles) {
                ChunkableContent chunkable =
                    new ChunkableContent(destContentId,
                                         stream,
                                         fileSize,
                                         maxChunkSize);
                chunkable.setPreserveChunkMD5s(preserveChunkMD5s);

                contentWriter.write(destSpaceId, chunkable, properties);

                // Verify final checksum
                if (fileChecksum != null) {
                    String finalChecksum =
                        chunkable.getManifest().getHeader().getSourceMD5();
                    if (!fileChecksum.equals(finalChecksum)) {
                        String err = "Final checksum of chunked content " +
                                     finalChecksum +
                                     " does not match provided checksum " +
                                     fileChecksum;
                        throw new DuraCloudRuntimeException(err);
                    }
                }

            } else {
                log.info("Ignoring: [{}]", destContentId);
                contentWriter.ignore(destSpaceId, destContentId, fileSize);
            }
        } finally {
            // Closed here rather than after the branches so the stream is
            // released when the writer or the checksum check fails.
            IOUtils.closeQuietly(stream);
        }
    }

    /**
     * Lists the files under baseDir that are accepted by the arg filters.
     *
     * @param baseDir    directory to search
     * @param fileFilter filter applied to files
     * @param dirFilter  filter applied to sub-directories
     * @return non-empty collection of matching files
     * @throws DuraCloudRuntimeException if baseDir is not a directory or no
     *                                   files are found
     */
    private Collection<File> listFiles(File baseDir,
                                       IOFileFilter fileFilter,
                                       IOFileFilter dirFilter) {
        if (!baseDir.isDirectory()) {
            throw new DuraCloudRuntimeException("Invalid dir: " + baseDir);
        }

        Collection<File> files =
            FileUtils.listFiles(baseDir, fileFilter, dirFilter);
        if (null == files || files.isEmpty()) {
            throw new DuraCloudRuntimeException("No files found: " + baseDir);
        }

        return files;
    }

    /**
     * This method defines the returned contentId as the path of the arg file
     * minus the path of the arg baseDir, in which the file was found.
     * A leading file separator is dropped and backslashes are converted to
     * forward slashes.
     *
     * @param baseDir dir that contained the arg file or one of its parents
     * @param file    for which contentId is to be found
     * @return contentId of arg file
     * @throws DuraCloudRuntimeException if the path of file does not begin
     *                                   with the path of baseDir
     */
    private String getContentId(File baseDir, File file) {
        String filePath = file.getPath();
        String basePath = baseDir.getPath();

        // The base path must be a true prefix; a match further along the
        // file path would silently yield a wrong content ID.
        if (!filePath.startsWith(basePath)) {
            StringBuilder sb = new StringBuilder("Invalid basePath for file: ");
            sb.append("b: '" + basePath + "', ");
            sb.append("f: '" + filePath + "'");
            throw new DuraCloudRuntimeException(sb.toString());
        }

        String contentId = filePath.substring(basePath.length());
        if (contentId.startsWith(File.separator)) {
            contentId = contentId.substring(File.separator.length());
        }
        // Replace backslash (\) with forward slash (/) for all content IDs
        return contentId.replace('\\', '/');
    }

    /**
     * Opens the arg file for reading.
     *
     * @param file to open
     * @return auto-closing stream over the file
     * @throws DuraCloudRuntimeException if the file cannot be opened
     */
    private InputStream getInputStream(File file) {
        try {
            return getInputStream(new FileInputStream(file));
        } catch (FileNotFoundException e) {
            throw new DuraCloudRuntimeException(e.getMessage(), e);
        }
    }

    /**
     * Wraps the arg stream in an {@link AutoCloseInputStream}.
     *
     * @param stream to wrap
     * @return wrapping stream
     */
    private InputStream getInputStream(InputStream stream) {
        return new AutoCloseInputStream(stream);
    }

    /**
     * This method generates a test file with random char content.
     * Each char is drawn from the code range 32 to 125 inclusive and the file
     * is written as UTF-8. The returned stream is opened over the finished
     * file; the caller is responsible for closing it.
     *
     * @param outFile of test file
     * @param size    number of chars written to the test file
     * @return DigestInputStream of test file
     * @throws IOException on error, including a failure to flush or close
     *                     the test file
     */
    public static DigestInputStream createTestContent(File outFile,
                                                      long size)
        throws IOException {
        final int BUF_SZ = 8192;
        final int MIN_CHAR = 32;
        final int MAX_CHAR_MINUS_MIN_CHAR = 126 - MIN_CHAR;

        Random r = new Random();
        // try-with-resources: the writer is released if a write fails, and a
        // failed final flush surfaces instead of leaving a truncated file.
        try (BufferedWriter bw = new BufferedWriter(
            new OutputStreamWriter(new FileOutputStream(outFile),
                                   StandardCharsets.UTF_8),
            BUF_SZ)) {
            for (long i = 0; i < size; ++i) {
                bw.write(r.nextInt(MAX_CHAR_MINUS_MIN_CHAR) + MIN_CHAR);
            }
        }

        return ChecksumUtil.wrapStream(new FileInputStream(outFile),
                                       ChecksumUtil.Algorithm.MD5);
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy