com.marklogic.contentpump.OutputArchive Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of mlcp Show documentation
MarkLogic Content Pump
There is a newer version: 11.3.1
/*
 * Copyright (c) 2020 MarkLogic Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.marklogic.contentpump;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.ZipEntry;
import java.util.zip.ZipException;
import java.util.zip.ZipOutputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.LocalFileSystem;

import com.marklogic.mapreduce.InternalConstants;

/**
 * Archive for export, create zip file(s).
 *
 * <p>Entries are appended to a zip file whose name is derived from the path
 * given to the constructor plus a counter shared by all instances (see
 * {@link #newPackagePath(String, int, int)}). When the bytes written to the
 * current zip would exceed {@link Integer#MAX_VALUE}, a new zip may be
 * started, but only on what the code treats as a metadata entry (even entry
 * count and {@code isExportDoc == false}).
 *
 * <p>NOTE(review): instances hold unsynchronized mutable state; presumably
 * each instance is used by a single writer thread -- confirm with callers.
 *
 * @author ali
 *
 */
public class OutputArchive implements InternalConstants {
    public static final Log LOG = LogFactory.getLog(OutputArchive.class);
    public static final String EXTENSION = ".zip";
    /** Bytes written to the current zip since it was opened. */
    private long currentFileBytes = 0;
    /** Stream of the zip currently being written; null until first write. */
    private ZipOutputStream outputStream;
    /** Path given to the constructor, always ending with the zip extension. */
    private String basePath;
    /** Path of the zip currently being written. */
    private String currPath;
    /** Counter shared by all instances, used to number the zip files. */
    private static final AtomicInteger fileCount = new AtomicInteger();
    /** Number of entries added to the current zip. */
    private int currentEntries;
    private Configuration conf;

    /**
     * Creates an archive writer. No file is created until the first write.
     *
     * @param path base path of the archive; {@link #EXTENSION} is appended
     *             unless the path already ends with it (ignoring case)
     * @param conf Hadoop configuration used to resolve the file system
     */
    public OutputArchive(String path, Configuration conf) {
        if (hasZipExtension(path)) {
            this.basePath = path;
        } else {
            this.basePath = path + EXTENSION;
        }
        this.conf = conf;
    }

    /**
     * Tests whether a path ends with {@link #EXTENSION}, ignoring case.
     * Uses regionMatches so the result does not depend on the default
     * locale (unlike toLowerCase()).
     */
    private static boolean hasZipExtension(String path) {
        int extLen = EXTENSION.length();
        // A negative offset (path shorter than the extension) yields false.
        return path.regionMatches(true, path.length() - extLen,
            EXTENSION, 0, extLen);
    }

    /**
     * Closes the current zip, if any, and opens the next one.
     *
     * @throws IOException if the target file already exists or cannot be
     *                     created, or if closing the previous zip fails
     */
    private void newOutputStream() throws IOException {
        // Close the previous archive first, while currPath still names it,
        // so that the debug message reports the archive actually closed.
        if (outputStream != null) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("closing output archive: " + currPath);
            }
            outputStream.flush();
            outputStream.close();
        }

        // Every archive, including the first, gets the shared counter
        // inserted into its name.
        int count = fileCount.getAndIncrement();
        currPath = newPackagePath(basePath, count, 6);
        currentFileBytes = 0;
        currentEntries = 0;

        Path zpath = new Path(currPath);
        FileSystem fs = zpath.getFileSystem(conf);
        if (fs.exists(zpath)) {
            throw new IOException(zpath + " already exists.");
        }

        if (LOG.isDebugEnabled()) {
            LOG.debug("Creating output archive: " + zpath);
            LOG.debug("Default charset: " + Charset.defaultCharset());
        }
        // Local file system: use the java.io API; any other file system:
        // use the Hadoop API.
        if (fs instanceof LocalFileSystem) {
            File f = new File(zpath.toUri().getPath());
            if (!f.exists()) {
                // A path without a directory component has no parent.
                File parent = f.getParentFile();
                if (parent != null) {
                    parent.mkdirs();
                }
                f.createNewFile();
            }
            FileOutputStream fos = new FileOutputStream(f, false);
            outputStream = new ZipOutputStream(new BufferedOutputStream(fos));
        } else {
            FSDataOutputStream fsout = fs.create(zpath, false);
            outputStream =
                    new ZipOutputStream(new BufferedOutputStream(fsout));
        }
    }

    /**
     * Builds the path of a numbered archive.
     *
     * <p>For a path ending with {@link #EXTENSION} (ignoring case) the
     * zero-padded count is inserted in front of the last '-' of the file
     * name, e.g. {@code /out/ts-XML.zip} becomes
     * {@code /out/ts-000000-XML.zip}. If the file name contains no '-',
     * the count is inserted in front of the extension instead. For any
     * other path, "-" and the unpadded count are appended.
     *
     * @param canonicalPath base path of the archive
     * @param count sequence number of the archive
     * @param width minimum number of digits; the count is zero-padded
     * @return the path of the numbered archive
     */
    protected static String newPackagePath(String canonicalPath, int count,
        int width) {
        if (!hasZipExtension(canonicalPath)) {
            return canonicalPath + "-" + count;
        }
        int extStart = canonicalPath.length() - EXTENSION.length();
        int dash = canonicalPath.lastIndexOf('-', extStart - 1);
        int slash = canonicalPath.lastIndexOf('/', extStart - 1);
        // Only a hyphen inside the file name is a valid insertion point; a
        // missing hyphen, or one that belongs to a directory name, falls
        // back to inserting right before the extension.
        int insertAt = dash > slash ? dash : extStart;
        return canonicalPath.substring(0, insertAt)
            + String.format("-%0" + width + "d", count)
            + canonicalPath.substring(insertAt);
    }

    /**
     * Copies up to {@code size} bytes from a stream into a new zip entry.
     *
     * <p>A {@link ZipException} is logged as a warning and not rethrown;
     * the entry is still counted in that case.
     *
     * @param uri name of the zip entry
     * @param is stream to read the entry content from
     * @param size expected number of bytes; when it equals
     *             {@link Integer#MAX_VALUE} no warning is logged if the
     *             stream ends early (presumably "length unknown" -- confirm
     *             with callers)
     * @param isExportDoc when true, this entry never starts a new zip
     * @throws IOException on any I/O failure other than a ZipException
     */
    public void write(String uri, InputStream is, long size,
            boolean isExportDoc)
    throws IOException {
        ZipEntry entry = new ZipEntry(uri);
        if (outputStream == null) {
            // The first entry always needs an archive, whatever its kind.
            newOutputStream();
        } else if (currentFileBytes > 0
            && currentFileBytes + size > Integer.MAX_VALUE
            && currentEntries % 2 == 0 && !isExportDoc) {
            //the file overflowed is metadata, create new zip
            newOutputStream();
        }
        long totalRead = 0;
        try {
            outputStream.putNextEntry(entry);
            // MAX_BUFFER_SIZE is not declared in this class; presumably
            // inherited from InternalConstants.
            long bufSize = Math.min(size, MAX_BUFFER_SIZE);
            byte[] buf = new byte[(int)bufSize];
            for (long toRead = size, read = 0; toRead > 0; toRead -= read) {
                read = is.read(buf, 0, (int)bufSize);
                if (read > 0) {
                    outputStream.write(buf, 0, (int)read);
                    totalRead += read;
                } else {
                    if (size != Integer.MAX_VALUE) {
                        LOG.warn("Premature EOF: uri=" + uri +
                            ",toRead=" + toRead);
                    }
                    break;
                }
            }
            outputStream.closeEntry();
        } catch (ZipException e) {
            LOG.warn("Exception caught: " + e.getMessage()
                + ", entry=" + entry.getName());
        }
        currentFileBytes += totalRead;
        currentEntries++;
    }

    /**
     * Writes a byte array as a new zip entry.
     *
     * @param outputPath name of the zip entry
     * @param bytes content of the entry
     * @param isExportDoc when true, this entry never starts a new zip
     * @return the number of bytes written, or 0 if a {@link ZipException}
     *         was caught (it is logged as a warning and the entry is not
     *         counted)
     * @throws NullPointerException if outputPath or bytes is null
     * @throws IOException on any I/O failure other than a ZipException
     */
    public long write(String outputPath, byte[] bytes, boolean isExportDoc)
    throws IOException {

        if (null == outputPath) {
            throw new NullPointerException("null path");
        }
        if (null == bytes) {
            throw new NullPointerException("null content bytes");
        }

        long total = bytes.length;
        ZipEntry entry = new ZipEntry(outputPath);

        if (outputStream == null) {
            newOutputStream();
        }

        if (currentFileBytes > 0
            && currentFileBytes + total > Integer.MAX_VALUE) {
            if (currentEntries % 2 == 0 && !isExportDoc) {
                //the file overflowed is metadata, create new zip
                newOutputStream();
            } else {
                //the file overflowed is doc, keep it in current zip
                LOG.warn("too many bytes in current package:" + currPath);
            }
        }

        try {
            outputStream.putNextEntry(entry);
            outputStream.write(bytes);
            outputStream.closeEntry();
        } catch (ZipException e) {
            LOG.warn("Exception caught: " + e.getMessage()
                + ", entry=" + entry.getName());
            return 0;
        }

        currentFileBytes += total;
        currentEntries++;

        return total;

    }

    /**
     * Flushes and closes the current zip, if one has been opened.
     *
     * @throws IOException if flushing or closing the stream fails
     */
    public void close() throws IOException {
        if (outputStream != null) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("closing output archive: " + currPath);
            }
            outputStream.flush();
            outputStream.close();
        }
    }

}