// All downloads are free. Search and download functionality uses the official Maven repository.
//
// org.dspace.pack.bagit.Bag Maven / Gradle / Ivy

/**
 * The contents of this file are subject to the license and copyright
 * detailed in the LICENSE and NOTICE files at the root of the source
 * tree and available online at
 *
 * http://www.dspace.org/license/
 */
package org.dspace.pack.bagit;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.security.DigestInputStream;
import java.security.DigestOutputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import java.util.zip.ZipOutputStream;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.stream.XMLStreamWriter;

import org.apache.commons.compress.archivers.ArchiveOutputStream;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;

import org.dspace.core.ConfigurationManager;
import org.dspace.curate.Utils;

// Warning - static import ahead!
import static javax.xml.stream.XMLStreamConstants.*;

/**
 * Bag represents a rudimentary bag conformant to LC Bagit spec - version 0.96.
 * A Bag is a directory with contents and a little structured metadata in it.
 * Although they don't have to be, these bags are 'wormy' - meaning that they
 * can be written only once (have no update semantics), and can be 'holey' -
 * meaning they can contain content by reference as well as inclusion. The
 * implementation provides 3 symmetrical interfaces for reading from and writing
 * to the bag: (1) A 'flat' reader and writer for line-oriented character
 * data, (2) an 'xml' reader and writer for XML documents, and (3) a 'raw'
 * stream-based reader and writer for uninterpreted binary data.
 * 
 * The bag also can serialize itself to a compressed archive file (supported
 * formats zip or tgz) or be deserialized from same or a stream, 
 * abiding by the serialization recommendations of the specification.
 * 
 * @author richardrodgers
 */
public class Bag {
    // coding constants
    // character encoding used for tag files and XML documents, and declared in bagit.txt
    private static final String ENCODING = "UTF-8";
    // message digest algorithm used for every manifest checksum
    private static final String CS_ALGO = "MD5";
    // BagIt specification version written to the declaration file
    private static final String BAGIT_VSN = "0.96";
    // default archive format / file suffix; this class only handles "zip" and "tgz"
    // NOTE(review): presumably null when the property is not configured - confirm
    private static final String DFLT_FMT = ConfigurationManager.getProperty("replicate", "packer.archfmt");

    // mandated file names
    // payload manifest: one "<checksum> <path>" line per payload file
    private static final String MANIF_FILE = "manifest-" + CS_ALGO.toLowerCase() + ".txt";
    // tag manifest: checksums of the tag files (manifest, fetch file, declaration)
    private static final String TAGMANIF_FILE = "tag" + MANIF_FILE;
    // bag declaration file
    private static final String DECL_FILE = "bagit.txt";
    // fetch file listing payload held by reference
    private static final String REF_FILE = "fetch.txt";

    // StAX factories shared by the XmlReader / XmlWriter helpers of this bag
    private XMLInputFactory inFactory = XMLInputFactory.newInstance();
    private XMLOutputFactory outFactory = XMLOutputFactory.newInstance();

    // writer for the tag manifest - receives the checksum of each closed tag file
    private FlatWriter tagWriter = null;
    // writer for the payload manifest - receives the checksum of each payload file
    private FlatWriter manWriter = null;
    // writer for the fetch file - created lazily on the first data reference
    private FlatWriter refWriter = null;

    // directory root of bag
    private File baseDir = null;
    // have all content and tag files been written?
    private boolean filled = false;

    /**
     * Constructor - builds a bag from the passed file in one of 3 ways:
     * (1) the file exists, is not a directory, and its suffix equals the
     *     default archive format - it is taken to be a serialized bag. The
     *     archive is exploded next to itself, into a directory named after
     *     the archive minus its suffix, and the bag is created as filled.
     * (2) the file already has a 'data' subdirectory - it is taken to be
     *     an existing bag structure and the bag is created as filled.
     * (3) otherwise - the 'data' subdirectory is created, the manifest
     *     writers are opened, and the bag is created unfilled.
     *
     * @param baseFile bag directory, or archive file of a serialized bag
     * @throws IOException if I/O error
     */
    public Bag(File baseFile) throws IOException
    {
        String baseName = baseFile.getName();
        int sfxIdx = baseName.lastIndexOf(".");

        // an archive is a plain existing file carrying the default format suffix
        boolean isArchive = false;
        if (sfxIdx != -1 && baseFile.exists() && ! baseFile.isDirectory())
        {
            isArchive = baseName.substring(sfxIdx + 1).equals(DFLT_FMT);
        }

        if (isArchive)
        {
            // the bag directory is a sibling of the archive file
            baseDir = new File(baseFile.getParent(), baseName.substring(0, sfxIdx));
            bagFile("data").mkdirs();
            inflate();
            return;
        }

        baseDir = baseFile;
        File payloadDir = bagFile("data");
        if (payloadDir.exists())
        {
            // existing structure - nothing more may be written
            filled = true;
            return;
        }

        // new bag: pregenerate the payload directory and open the manifests.
        // The tag manifest comes first as the payload manifest chains to it.
        payloadDir.mkdirs();
        tagWriter = new FlatWriter(bagFile(TAGMANIF_FILE), null, null);
        manWriter = new FlatWriter(bagFile(MANIF_FILE), null, tagWriter);
    }

    /**
     * Returns the BagIt specification version this implementation declares.
     *
     * @return the version string written to the bag declaration file
     */
    public static String getVersion()
    {
        final String version = BAGIT_VSN;
        return version;
    }

    /**
     * Returns the name of the bag, which is the name of its base directory.
     *
     * @return the last path element of the bag's root directory
     */
    public String getName()
    {
        final String dirName = baseDir.getName();
        return dirName;
    }

    /**
     * Tells whether the bag is complete. A filled bag rejects further
     * writes; an unfilled one cannot be deflated.
     *
     * @return true if all content and tag files have been written
     */
    public boolean isFilled()
    {
        return this.filled;
    }

    /**
     * Opens a line-oriented reader on a payload file.
     * Note that resolving the file creates any missing parent directories
     * below 'data', even when the file itself is absent.
     *
     * @param name path of the file relative to the 'data' directory
     * @return a reader on the file, or null if no such file exists
     * @throws IOException if the file cannot be opened
     */
    public FlatReader flatReader(String name) throws IOException
    {
        File source = dataFile(name);
        if (! source.exists())
        {
            return null;
        }
        return new FlatReader(source);
    }

    /**
     * Opens an XML reader on a payload file.
     * Note that resolving the file creates any missing parent directories
     * below 'data', even when the file itself is absent.
     *
     * @param name path of the file relative to the 'data' directory
     * @return a reader on the document, or null if no such file exists
     * @throws IOException if the file cannot be opened as an XML stream
     */
    public XmlReader xmlReader(String name) throws IOException
    {
        File source = dataFile(name);
        if (! source.exists())
        {
            return null;
        }
        return new XmlReader(source);
    }

    /**
     * Opens a line-oriented writer on a new payload file. When the writer
     * is closed, the file's checksum is recorded in the payload manifest.
     *
     * @param name path of the file relative to the 'data' directory
     * @return a writer on the file
     * @throws IOException if the file cannot be created
     * @throws IllegalStateException if the bag is already filled
     */
    public FlatWriter flatWriter(String name) throws IOException
    {
        if (! filled)
        {
            // manifest entries are bag-relative, hence the 'data/' prefix
            return new FlatWriter(dataFile(name), "data/" + name, manWriter);
        }
        throw new IllegalStateException("Cannot write to filled bag");
    }

    /**
     * Opens an XML writer on a new payload file. When the writer is
     * closed, the file's checksum is recorded in the payload manifest.
     *
     * @param name path of the file relative to the 'data' directory
     * @return a writer on the document
     * @throws IOException if the file cannot be created
     * @throws IllegalStateException if the bag is already filled
     */
    public XmlWriter xmlWriter(String name) throws IOException
    {
        if (! filled)
        {
            // manifest entries are bag-relative, hence the 'data/' prefix
            return new XmlWriter(dataFile(name), "data/" + name, manWriter);
        }
        throw new IllegalStateException("Cannot write to filled bag");
    }

    /**
     * Opens a raw stream on a payload file. The caller is responsible
     * for closing the returned stream.
     *
     * @param name path of the file relative to the 'data' directory
     * @return a stream on the file's bytes, or null if no such file exists
     * @throws IOException if the file cannot be opened
     */
    public InputStream dataStream(String name) throws IOException
    {
        File payload = dataFile(name);
        if (payload.exists())
        {
            return new FileInputStream(payload);
        }
        return null;
    }

    /**
     * Copies the passed stream into the payload area of the bag and records
     * its checksum in the payload manifest. The stream is consumed and closed,
     * whether or not the copy succeeds.
     *
     * @param relPath path of the new file relative to the 'data' directory
     * @param size length of the data in bytes - currently unused
     * @param is stream to read the data from
     * @throws IOException if the copy fails or the digest algorithm is unavailable
     * @throws IllegalStateException if the bag is already filled
     */
    public void addData(String relPath, long size, InputStream is) throws IOException
    {
        if (filled)
        {
            throw new IllegalStateException("Cannot add data to filled bag");
        }
        // obtain the digest before opening any file, so a failure leaks nothing
        MessageDigest digest;
        try
        {
            digest = MessageDigest.getInstance(CS_ALGO);
        }
        catch (NoSuchAlgorithmException nsaE)
        {
            throw new IOException("no algorithm: " + CS_ALGO, nsaE);
        }
        // wrap stream in digest stream
        DigestInputStream dis = new DigestInputStream(is, digest);
        try
        {
            FileOutputStream fos = new FileOutputStream(dataFile(relPath));
            try
            {
                // attempt to optimize copy in various ways - TODO
                Utils.copy(dis, fos);
            }
            finally
            {
                fos.close();
            }
        }
        finally
        {
            // closing the digest stream closes the wrapped stream as well
            dis.close();
        }
        // record checksum
        String brPath = "data/" + relPath;
        manWriter.writeProperty(Utils.toHex(digest.digest()), brPath);
    }

    /**
     * Adds payload by reference: appends a "url size path" line to the fetch
     * file rather than storing content in the bag. The fetch file is created
     * on the first call.
     *
     * @param relPath path the content would have relative to the 'data' directory
     * @param size length of the referenced content in bytes
     * @param url location the content can be fetched from
     * @throws IOException if the fetch file cannot be written
     * @throws IllegalStateException if the bag is already filled
     */
    public void addDataRef(String relPath, long size, String url) throws IOException
    {
        // same guard as addData: writing to a filled bag would reopen (and
        // truncate) the fetch file after the tag manifest has been finalized
        if (filled)
        {
            throw new IllegalStateException("Cannot add data reference to filled bag");
        }
        if (refWriter == null)
        {
            refWriter = new FlatWriter(bagFile(REF_FILE), null, tagWriter);
        }
        String brPath = "data/" + relPath;
        refWriter.writeLine(url + " " + size + " " + brPath);
    }
    
    /**
     * Lists the immediate children of the bag's 'data' directory.
     * Subdirectories are returned as entries and are not descended into.
     *
     * @return fixed-size list of the files directly under 'data'; empty if
     *         the directory is missing or cannot be listed
     * @throws IOException declared for interface stability
     */
    public List<File> listDataFiles() throws IOException
    {
        File[] payload = bagFile("data").listFiles();
        if (payload == null)
        {
            // listFiles yields null for a missing or unreadable directory
            payload = new File[0];
        }
        return Arrays.asList(payload);
    }

    /**
     * Returns the payload references of the bag, i.e. the raw lines of the
     * fetch file, each having the form "url size path".
     *
     * @return the fetch file lines in file order; empty if the bag has no fetch file
     * @throws IOException if the fetch file cannot be read
     */
    public List<String> getDataRefs() throws IOException
    {
        List<String> refList = new ArrayList<String>();
        File refFile = bagFile(REF_FILE);
        if (refFile.exists())
        {
            FlatReader reader = new FlatReader(refFile);
            try
            {
                String line = null;
                while ((line = reader.readLine()) != null)
                {
                    refList.add(line);
                }
            }
            finally
            {
                // release the file handle even if reading fails
                reader.close();
            }
        }
        return refList;
    }

    /**
     * Completes the bag: closes the payload manifest and the fetch file (if
     * any), writes the bag declaration, then closes the tag manifest and
     * marks the bag filled. Does nothing if the bag is already filled.
     *
     * @throws IOException if any of the tag files cannot be written
     */
    public void close() throws IOException
    {
        if (filled)
        {
            return;
        }
        // each closed tag file reports its checksum to the tag manifest,
        // so the tag manifest itself has to be closed last
        manWriter.close();
        if (refWriter != null)
        {
            refWriter.close();
        }
        // bagit declaration file
        FlatWriter declWriter = new FlatWriter(bagFile(DECL_FILE), null, tagWriter);
        declWriter.writeLine("BagIt-Version: " + BAGIT_VSN);
        declWriter.writeLine("Tag-File-Character-Encoding: " + ENCODING);
        declWriter.close();

        tagWriter.close();
        filled = true;
    }

    /**
     * Removes the bag from disk: deletes everything under the base
     * directory, then the base directory itself, and marks the bag unfilled.
     * Deletion failures are not reported.
     */
    public void empty()
    {
        // contents first - a directory can only be deleted once it is empty
        deleteDir(baseDir);
        baseDir.delete();
        this.filled = false;
    }

    /**
     * Recursively deletes the contents of a directory, leaving the directory
     * itself in place. Best effort: failed deletions are ignored, and a
     * missing or unreadable directory is skipped.
     *
     * @param dirFile directory whose contents are to be removed
     */
    private void deleteDir(File dirFile)
    {
        File[] children = dirFile.listFiles();
        if (children == null)
        {
            // not a directory, already gone, or unreadable - nothing to delete
            return;
        }
        for (File file : children)
        {
            if (file.isDirectory())
            {
                deleteDir(file);
            }
            file.delete();
        }
    }
    
    /**
     * Serializes this bag to an archive file in the default archive format,
     * placed alongside the bag directory.
     *
     * @return the archive file written
     * @throws IOException if the archive cannot be written
     */
    public File deflate() throws IOException
    {
        // 'in place' means the directory that holds the bag directory
        String parentDir = baseDir.getParent();
        return deflate(parentDir, DFLT_FMT);
    }
    
    /**
     * Serializes this bag to an archive file in the given format, placed
     * alongside the bag directory.
     *
     * @param fmt archive format, which is also used as the file suffix
     * @return the archive file written
     * @throws IOException if the archive cannot be written
     */
    public File deflate(String fmt) throws IOException
    {
        // 'in place' means the directory that holds the bag directory
        String parentDir = baseDir.getParent();
        return deflate(parentDir, fmt);
    }
    
    /**
     * Serializes this bag to an archive file in the given directory. The
     * file is named after the bag, with the format as its suffix.
     *
     * @param destDir directory to write the archive into
     * @param fmt archive format, which is also used as the file suffix
     * @return the archive file written
     * @throws IOException if the archive cannot be written
     * @throws IllegalStateException if the bag is not yet filled
     */
    public File deflate(String destDir, String fmt) throws IOException
    {
        File defFile = new File(destDir, baseDir.getName() + "." + fmt);
        FileOutputStream fout = new FileOutputStream(defFile);
        try
        {
            deflate(fout, fmt);
        }
        finally
        {
            // release the file handle even when deflation fails
            fout.close();
        }
        return defFile;
    }
    
    /**
     * Serializes this bag to the passed stream as a 'zip' or 'tgz' archive.
     * All entries sit under a top-level directory named after the bag. The
     * archive stream wrapping <code>out</code> is closed on completion.
     *
     * @param out stream to write the archive to
     * @param fmt archive format - "zip" or "tgz"
     * @throws IOException if the archive cannot be written, or the format
     *         is not supported
     * @throws IllegalStateException if the bag is not yet filled
     */
    public void deflate(OutputStream out, String fmt) throws IOException
    {
        if (! filled)
        {
            throw new IllegalStateException("Cannot deflate unfilled bag");
        }
        if ("zip".equals(fmt))
        {
            ZipOutputStream zout = new ZipOutputStream(
                                   new BufferedOutputStream(out));
            fillZip(baseDir, baseDir.getName(), zout);
            zout.close();
        }
        else if ("tgz".equals(fmt))
        {
            TarArchiveOutputStream tout = new TarArchiveOutputStream(
                                          new BufferedOutputStream(
                                          new GzipCompressorOutputStream(out)));
            fillArchive(baseDir, baseDir.getName(), tout);
            tout.close();
        }
        else
        {
            // fail loudly rather than hand back an empty 'archive'
            throw new IOException("Unsupported archive format: " + fmt);
        }
    }
    
    /**
     * Fills this bag from its archive file, which is expected alongside the
     * bag directory, named after the bag with the default format as suffix.
     *
     * @throws IOException if the archive cannot be read
     * @throws IllegalStateException if the bag is already filled
     */
    public final void inflate() throws IOException
    {
        // assume current directory & default format. Resolve through File so
        // that a bag directory without a parent path yields a relative name
        // instead of a path starting with the text "null"
        File archive = new File(baseDir.getParentFile(),
                                baseDir.getName() + "." + DFLT_FMT);
        inflate(archive.getPath());
    }
    
    /**
     * Fills this bag from the named archive file. The archive format is
     * taken from the file name's suffix.
     *
     * @param archFile path of the archive file
     * @throws IOException if the archive cannot be read
     * @throws IllegalStateException if the bag is already filled
     */
    public void inflate(String archFile) throws IOException
    {
        String fmt = archFile.substring(archFile.lastIndexOf(".") + 1);
        InputStream in = new FileInputStream(new File(archFile));
        try
        {
            inflate(in, fmt);
        }
        finally
        {
            // release the file handle even when inflation fails
            in.close();
        }
    }
    
    /**
     * Fills this bag from a 'zip' or 'tgz' archive read from the passed
     * stream. Entries are written relative to the directory containing the
     * bag directory, so the archive is expected to carry the bag directory
     * name as the first element of each entry name.
     *
     * @param in stream to read the archive from
     * @param fmt archive format - "zip" or "tgz"
     * @throws IOException if the archive cannot be read, an entry would be
     *         written outside of the bag's location, or the format is not
     *         supported
     * @throws IllegalStateException if the bag is already filled
     */
    public void inflate(InputStream in, String fmt) throws IOException
    {
        if (filled)
        {
            throw new IllegalStateException("Cannot inflate filled bag");
        }
        if ("zip".equals(fmt))
        {
            ZipInputStream zin = new ZipInputStream(in);
            ZipEntry entry = null;
            while((entry = zin.getNextEntry()) != null)
            {
                inflateEntry(zin, entry.getName(), entry.isDirectory());
            }
            zin.close();
        }
        else if ("tgz".equals(fmt))
        {
            TarArchiveInputStream tin = new TarArchiveInputStream(
                                        new GzipCompressorInputStream(in));
            TarArchiveEntry entry = null;
            while((entry = tin.getNextTarEntry()) != null)
            {
                inflateEntry(tin, entry.getName(), entry.isDirectory());
            }
            tin.close();
        }
        else
        {
            // do not mark the bag filled when nothing could be extracted
            throw new IOException("Unsupported archive format: " + fmt);
        }
        filled = true;
    }

    /**
     * Writes one archive entry to disk, refusing entry names that resolve
     * outside of the directory the bag lives in.
     *
     * @param src stream positioned at the start of the entry's content
     * @param entryName name of the entry as recorded in the archive
     * @param isDir whether the entry denotes a directory
     * @throws IOException if the entry is unsafe or cannot be written
     */
    private void inflateEntry(InputStream src, String entryName, boolean isDir) throws IOException
    {
        File outFile = new File(baseDir.getParent(), entryName);
        // guard against path traversal ('..' elements, absolute names)
        File destRoot = baseDir.getAbsoluteFile().getParentFile();
        if (destRoot != null)
        {
            String rootPath = destRoot.getCanonicalPath();
            if (! rootPath.endsWith(File.separator))
            {
                rootPath = rootPath + File.separator;
            }
            String outPath = outFile.getCanonicalPath() + File.separator;
            if (! outPath.startsWith(rootPath))
            {
                throw new IOException("Archive entry outside of bag location: " + entryName);
            }
        }
        if (isDir)
        {
            // directory entries carry no content
            outFile.mkdirs();
            return;
        }
        outFile.getParentFile().mkdirs();
        FileOutputStream fout = new FileOutputStream(outFile);
        try
        {
            Utils.copy(src, fout);
        }
        finally
        {
            fout.close();
        }
    }
    
    /**
     * Recursively adds the files below a directory to a tar archive stream.
     * Only files produce entries, so empty directories are not recorded.
     * Entry modification times are fixed at 0.
     *
     * @param dirFile directory whose content is to be added
     * @param relBase entry name prefix corresponding to dirFile
     * @param out archive stream to add the entries to
     * @throws IOException if a file cannot be read or an entry cannot be written
     */
    private void fillArchive(File dirFile, String relBase, ArchiveOutputStream out) throws IOException
    {
        File[] children = dirFile.listFiles();
        if (children == null)
        {
            // an incomplete archive must not pass silently
            throw new IOException("Unable to list directory: " + dirFile);
        }
        for (File file : children)
        {
            // archive entry names use '/' regardless of platform
            String relPath = relBase + "/" + file.getName();
            if (file.isDirectory())
            {
                fillArchive(file, relPath, out);
            }
            else
            {
                TarArchiveEntry entry = new TarArchiveEntry(relPath);
                entry.setSize(file.length());
                entry.setModTime(0L);
                out.putArchiveEntry(entry);
                FileInputStream fin = new FileInputStream(file);
                try
                {
                    Utils.copy(fin, out);
                    out.closeArchiveEntry();
                }
                finally
                {
                    fin.close();
                }
            }
        }
    }

    /**
     * Recursively adds the files below a directory to a zip stream.
     * Only files produce entries, so empty directories are not recorded.
     * Entry times are fixed at 0.
     *
     * @param dirFile directory whose content is to be added
     * @param relBase entry name prefix corresponding to dirFile
     * @param zout zip stream to add the entries to
     * @throws IOException if a file cannot be read or an entry cannot be written
     */
    private void fillZip(File dirFile, String relBase, ZipOutputStream zout) throws IOException
    {
        File[] children = dirFile.listFiles();
        if (children == null)
        {
            // an incomplete archive must not pass silently
            throw new IOException("Unable to list directory: " + dirFile);
        }
        for (File file : children)
        {
            // zip entry names use '/' regardless of platform
            String relPath = relBase + "/" + file.getName();
            if (file.isDirectory())
            {
                fillZip(file, relPath, zout);
            }
            else
            {
                ZipEntry entry = new ZipEntry(relPath);
                entry.setTime(0L);
                zout.putNextEntry(entry);
                FileInputStream fin = new FileInputStream(file);
                try
                {
                    Utils.copy(fin, zout);
                    zout.closeEntry();
                }
                finally
                {
                    fin.close();
                }
            }
        }
    }

    /**
     * Resolves a name to a file in the payload area of the bag, creating
     * the file's parent directories when they do not exist yet.
     *
     * @param name path of the file relative to the 'data' directory
     * @return the corresponding file, which itself may or may not exist
     */
    private File dataFile(String name)
    {
        // payload - ie. everything user-defined - is rooted at 'data'
        File payloadRoot = new File(baseDir, "data");
        File target = new File(payloadRoot, name);
        File enclosing = target.getParentFile();
        if (! enclosing.isDirectory())
        {
            enclosing.mkdirs();
        }
        return target;
    }

    /**
     * Resolves a name against the root directory of the bag.
     *
     * @param name path relative to the bag's base directory
     * @return the corresponding file, which may or may not exist
     */
    private File bagFile(String name)
    {
        final File resolved = new File(baseDir, name);
        return resolved;
    }

    // Assortment of small helper classes for reading & writing bag files
    // Writers capture the checksums of written files, needed for bag manifests

    /**
     * Reader for line-oriented character data held in a bag file.
     * Instances are obtained from the enclosing bag and should be closed
     * after use.
     */
    public class FlatReader
    {
        private BufferedReader reader = null;

        /**
         * Opens the file for reading.
         *
         * @param file the file to read
         * @throws IOException if the file cannot be opened
         */
        private FlatReader(File file) throws IOException
        {
            // decode with the bag encoding rather than the platform default,
            // matching the bytes FlatWriter produces
            reader = new BufferedReader(
                     new InputStreamReader(new FileInputStream(file), ENCODING));
        }

        /**
         * Reads the next line of the file.
         *
         * @return the line without its terminator, or null at end of file
         * @throws IOException if the file cannot be read
         */
        public String readLine() throws IOException
        {
            return reader.readLine();
        }

        /**
         * Closes the reader and the underlying file.
         *
         * @throws IOException if closing fails
         */
        public void close() throws IOException
        {
            reader.close();
        }
    }

    // wrapper for simple Stax-based XML reader
    /**
     * Pull-style reader for the simple XML documents held in a bag:
     * named stanzas containing flat lists of value elements.
     * Instances are obtained from the enclosing bag and should be closed
     * after use.
     */
    public class XmlReader
    {
        private XMLStreamReader reader = null;
        // file stream under the StAX reader, kept so close() can release it
        private InputStream in = null;

        /**
         * Opens the file as an XML stream.
         * NOTE(review): the shared input factory is used with default
         * settings; if bags may come from untrusted sources, consider
         * disabling DTD / external entity support on it.
         *
         * @param file the document to read
         * @throws IOException if the file cannot be opened or parsed
         */
        private XmlReader(File file) throws IOException
        {
            in = new FileInputStream(file);
            try
            {
                reader = inFactory.createXMLStreamReader(in, ENCODING);
            }
            catch (XMLStreamException xsE)
            {
                in.close();
                throw new IOException(xsE.getMessage(), xsE);
            }
        }

        /**
         * Advances the reader to the next start element with the given name.
         *
         * @param name local name of the element sought
         * @return true if positioned on such an element, false if the end
         *         of the document was reached first
         * @throws IOException if the document cannot be parsed
         */
        public boolean findStanza(String name) throws IOException
        {
            try
            {
                while(reader.hasNext())
                {
                    if (reader.next() == START_ELEMENT &&
                        reader.getLocalName().equals(name))
                    {
                        return true;
                    }
                }
            }
            catch (XMLStreamException xsE)
            {
                throw new IOException(xsE.getMessage(), xsE);
            }
            return false;
        }

        /**
         * Reads the next element as a value: its attributes and text.
         * Reading stops at the first end element encountered, so when that
         * is the end of the enclosing stanza the result is null.
         *
         * @return the value read, or null if an end element (or the end of
         *         the document) was met before any start element
         * @throws IOException if the document cannot be parsed
         */
        public Value nextValue() throws IOException
        {
            Value value = null;
            try
            {
                while(reader.hasNext())
                {
                    switch (reader.next())
                    {
                        case START_ELEMENT:
                            value = new Value();
                            int numAttrs = reader.getAttributeCount();
                            for (int idx = 0; idx < numAttrs; idx++)
                            {
                                value.addAttr(reader.getAttributeLocalName(idx),
                                              reader.getAttributeValue(idx));
                            }
                            break;
                        case ATTRIBUTE:
                            break;
                        case CHARACTERS:
                            // text outside of any value element is ignored
                            if (value != null)
                            {
                                // a parser may deliver one text node in
                                // several chunks - keep them all
                                String text = reader.getText();
                                value.val = (value.val == null) ? text
                                                                : value.val + text;
                            }
                            break;
                        case END_ELEMENT:
                            return value;
                        default:
                            break;
                    }
                }
            }
            catch (XMLStreamException xsE)
            {
                throw new IOException(xsE.getMessage(), xsE);
            }
            return value;
        }

        /**
         * Closes the XML reader and the file beneath it.
         *
         * @throws IOException if closing fails
         */
        public void close() throws IOException
        {
            try
            {
                reader.close();
            }
            catch (XMLStreamException xsE)
            {
                throw new IOException(xsE.getMessage(), xsE);
            }
            finally
            {
                // closing the StAX reader leaves its input source open
                in.close();
            }
        }
    }

    /**
     * Writer for line-oriented character data. Everything written is
     * digested; on close, the checksum is reported to an optional 'tail'
     * writer - which is how manifests get their entries.
     */
    public class FlatWriter
    {
        private String brPath = null;
        private OutputStream out = null;
        private DigestOutputStream dout = null;
        private FlatWriter tailWriter = null;
        // guards against reporting a second, meaningless checksum
        private boolean closed = false;

        /**
         * Opens the file for writing.
         *
         * @param file the file to write
         * @param brPath bag-relative path to report with the checksum, or
         *        null to use the plain file name
         * @param tailWriter writer to report the checksum to on close, or null
         * @throws IOException if the file cannot be created or the digest
         *         algorithm is unavailable
         */
        private FlatWriter(File file, String brPath, FlatWriter tailWriter) throws IOException
        {
            // obtain the digest before opening the file, so a failure leaks nothing
            MessageDigest digest;
            try
            {
                digest = MessageDigest.getInstance(CS_ALGO);
            }
            catch (NoSuchAlgorithmException nsae)
            {
                throw new IOException("no such algorithm: " + CS_ALGO, nsae);
            }
            out = new FileOutputStream(file);
            dout = new DigestOutputStream(out, digest);
            this.brPath = (brPath != null) ? brPath : file.getName();
            this.tailWriter = tailWriter;
        }

        /**
         * Writes a "key value" line.
         *
         * @param key first token of the line
         * @param value remainder of the line
         * @throws IOException if the file cannot be written
         */
        public void writeProperty(String key, String value) throws IOException
        {
            writeLine(key + " " + value);
        }

        /**
         * Writes a line, terminated by a newline, in the bag encoding.
         *
         * @param line the text to write, without terminator
         * @throws IOException if the file cannot be written
         */
        public void writeLine(String line) throws IOException
        {
            byte[] bytes = (line + "\n").getBytes(ENCODING);
            dout.write(bytes);
        }

        /**
         * Closes the file and reports its checksum to the tail writer, if
         * there is one. Calls after the first have no effect.
         *
         * @throws IOException if closing or reporting fails
         */
        public void close() throws IOException
        {
            if (closed)
            {
                return;
            }
            closed = true;
            dout.flush();
            dout.close();
            out.close();
            if (tailWriter != null)
            {
                tailWriter.writeProperty(
                       Utils.toHex(dout.getMessageDigest().digest()), brPath);
            }
        }
    }

    // Wrapper for simple Stax-based writer
    /**
     * Writer for the simple XML documents held in a bag: named stanzas
     * containing flat lists of value elements. Everything written is
     * digested; on close, the checksum is reported to an optional 'tail'
     * writer.
     */
    public class XmlWriter
    {
        private String brPath = null;
        private OutputStream out = null;
        private DigestOutputStream dout = null;
        private XMLStreamWriter writer = null;
        private FlatWriter tailWriter = null;
        // guards against reporting a second, meaningless checksum
        private boolean closed = false;

        /**
         * Opens the file and starts the XML document.
         *
         * @param file the file to write
         * @param brPath bag-relative path to report with the checksum, or
         *        null to use the plain file name
         * @param tailWriter writer to report the checksum to on close, or null
         * @throws IOException if the file cannot be created, the document
         *         cannot be started, or the digest algorithm is unavailable
         */
        private XmlWriter(File file, String brPath, FlatWriter tailWriter) throws IOException
        {
            // obtain the digest before opening the file, so a failure leaks nothing
            MessageDigest digest;
            try
            {
                digest = MessageDigest.getInstance(CS_ALGO);
            }
            catch (NoSuchAlgorithmException nsaE)
            {
                throw new IOException("no such algorithm: " + CS_ALGO, nsaE);
            }
            out = new FileOutputStream(file);
            dout = new DigestOutputStream(out, digest);
            try
            {
                writer = outFactory.createXMLStreamWriter(dout, ENCODING);
                writer.writeStartDocument(ENCODING, "1.0");
            }
            catch (XMLStreamException xsE)
            {
                out.close();
                throw new IOException(xsE.getMessage(), xsE);
            }
            this.brPath = (brPath != null) ? brPath : file.getName();
            this.tailWriter = tailWriter;
        }

        /**
         * Opens a stanza, i.e. writes a start element.
         *
         * @param name element name of the stanza
         * @throws IOException if the document cannot be written
         */
        public void startStanza(String name) throws IOException
        {
            try
            {
                writer.writeStartElement(name);
            }
            catch (XMLStreamException xsE)
            {
                throw new IOException(xsE.getMessage(), xsE);
            }
        }

        /**
         * Closes the stanza opened last.
         *
         * @throws IOException if the document cannot be written
         */
        public void endStanza() throws IOException
        {
            try
            {
                writer.writeEndElement();
            }
            catch (XMLStreamException xsE)
            {
                throw new IOException(xsE.getMessage(), xsE);
            }
        }

        /**
         * Writes a 'value' element with a 'name' attribute and text content.
         * Nothing is written if either argument is null.
         *
         * @param name content of the 'name' attribute
         * @param val text content of the element
         * @throws IOException if the document cannot be written
         */
        public void writeValue(String name, String val) throws IOException
        {
            if (name != null && val != null)
            {
                try
                {
                    writer.writeStartElement("value");
                    writer.writeAttribute("name", name);
                    writer.writeCharacters(val);
                    writer.writeEndElement();
                }
                catch (XMLStreamException xsE)
                {
                    throw new IOException(xsE.getMessage(), xsE);
                }
            }
        }

        /**
         * Writes a 'value' element carrying the attributes and text of the
         * passed value. Attributes with a null value are skipped, as is
         * null text.
         * NOTE(review): Value.name is not written back as a 'name'
         * attribute here - confirm whether that is intended.
         *
         * @param value the value to write
         * @throws IOException if the document cannot be written
         */
        public void writeValue(Value value) throws IOException
        {
            try
            {
                writer.writeStartElement("value");
                // a value holding nothing beyond a name has no attribute map
                if (value.attrs != null)
                {
                    for (Object attrName : value.attrs.keySet())
                    {
                        Object attrVal = value.attrs.get(attrName);
                        if (attrVal != null)
                        {
                            writer.writeAttribute(attrName.toString(),
                                                  attrVal.toString());
                        }
                    }
                }
                if (value.val != null)
                {
                    writer.writeCharacters(value.val);
                }
                writer.writeEndElement();
            }
            catch (XMLStreamException xsE)
            {
                throw new IOException(xsE.getMessage(), xsE);
            }
        }

        /**
         * Ends the document, closes the file, and reports its checksum to
         * the tail writer, if there is one. The file is closed even when
         * the document cannot be completed; no checksum is reported then.
         * Calls after the first have no effect.
         *
         * @throws IOException if completing, closing or reporting fails
         */
        public void close() throws IOException
        {
            if (closed)
            {
                return;
            }
            closed = true;
            try
            {
                writer.writeEndDocument();
                writer.flush();
                writer.close();
            }
            catch (XMLStreamException xsE)
            {
                throw new IOException(xsE.getMessage(), xsE);
            }
            finally
            {
                // closing the StAX writer leaves its output stream open
                dout.close();
            }
            if (tailWriter != null)
            {
                tailWriter.writeProperty(
                        Utils.toHex(dout.getMessageDigest().digest()), brPath);
            }
        }
    }

    /**
     * Simple holder for one 'value' element of a bag XML document: its
     * 'name' attribute, its text, and any further attributes.
     */
    public static class Value
    {
        // content of the 'name' attribute, if the element has one
        public String name = null;
        // text content of the element
        public String val = null;
        // attributes other than 'name', keyed by attribute name;
        // stays null until the first such attribute is added
        public Map<String, String> attrs = null;

        /**
         * Records an attribute. The 'name' attribute goes to the name
         * field, anything else to the attribute map, created on demand.
         *
         * @param name attribute name
         * @param val attribute value
         */
        public void addAttr(String name, String val)
        {
            if ("name".equals(name))
            {
                this.name = val;
            }
            else
            {
                if (attrs == null)
                {
                    attrs = new HashMap<String, String>();
                }
                attrs.put(name, val);
            }
        }
    }
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy