All downloads are free. Search and download functionality uses the official Maven repository.

com.marklogic.contentpump.CompressedDocumentReader Maven / Gradle / Ivy

There is a newer version: 11.3.1
Show newest version
/*
 * Copyright (c) 2020 MarkLogic Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.marklogic.contentpump;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import com.marklogic.contentpump.utilities.FileIterator;
import com.marklogic.mapreduce.CompressionCodec;
import com.marklogic.mapreduce.MarkLogicConstants;

/**
 * RecordReader for CompressedDocumentInputFormat.
 *
 * <p>Reads documents out of a ZIP archive (one record per non-empty entry)
 * or a GZIP file (the whole decompressed stream is a single record). When
 * the split path is a directory, the files it yields through a
 * {@link FileIterator} are processed one after another.
 *
 * @author ali
 *
 * @param <VALUEIN> type of the record value; this reader fills either a
 *        {@link Text} or a {@link BytesWritable} (see {@link #setValue})
 */
public class CompressedDocumentReader<VALUEIN> extends
    ImportRecordReader<VALUEIN> {
    public static final Log LOG = LogFactory.getLog(
            CompressedDocumentReader.class);
    /** Decompressing stream over the current file; null when none is open. */
    protected InputStream zipIn;
    /** Scratch buffer used when copying decompressed bytes. */
    protected byte[] buf = new byte[65536];
    /** False once the reader has determined there are no more records. */
    protected boolean hasNext = true;
    /** Codec resolved from configuration in {@link #initStream}. */
    protected CompressionCodec codec;
    /** Batch size read from {@code MarkLogicConstants.BATCH_SIZE}. */
    protected int batchSize;

    public CompressedDocumentReader() {}

    /**
     * Closes the current decompressing stream, if any.
     *
     * <p>The reference is cleared afterwards so that a later
     * {@link #nextKeyValue()} never reads from an already closed stream
     * (for example when the following {@link #initStream} returns early
     * without opening a new one).
     *
     * @throws IOException if closing the underlying stream fails
     */
    @Override
    public void close() throws IOException {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Closing " + file);
        }
        if (zipIn != null) {
            try {
                zipIn.close();
            } finally {
                zipIn = null;
            }
        }
    }

    /**
     * Reports coarse progress: 0 while more records may follow, 1 once the
     * reader is exhausted.
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return hasNext ? 0 : 1;
    }

    /**
     * Prepares the reader for the given split.
     *
     * <p>If the split path is a directory, a {@link FileIterator} is created
     * over it and the first split it yields is opened instead.
     *
     * @param inSplit split to read; cast to {@link FileSplit}
     * @param context task context used for configuration
     * @throws IOException if the file status cannot be read or the stream
     *         cannot be opened
     */
    @Override
    public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
        initConfig(context);
        batchSize = conf.getInt(MarkLogicConstants.BATCH_SIZE,
            MarkLogicConstants.DEFAULT_BATCH_SIZE);
        setFile(((FileSplit) inSplit).getPath());
        fs = file.getFileSystem(conf);
        FileStatus status = fs.getFileStatus(file);
        if (status.isDirectory()) {
            iterator = new FileIterator((FileSplit) inSplit, context);
            inSplit = iterator.next();
        }
        initStream(inSplit);
    }

    /**
     * Opens the file behind {@code inSplit} and wraps it in the
     * decompressing stream selected by the configured codec (ZIP when the
     * configuration does not name one).
     *
     * <p>On an unsupported codec an error is logged, the raw file stream is
     * closed and {@link #zipIn} is left untouched; the method does not throw
     * in that case.
     *
     * @param inSplit split whose file should be opened
     * @throws IOException if the compressed stream cannot be created
     */
    protected void initStream(InputSplit inSplit) throws IOException {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Starting " + file);
        }
        FSDataInputStream fileIn = openFile(inSplit, false);
        if (fileIn == null) {
            return;
        }

        // Locale.ROOT keeps the enum lookup independent of the JVM default
        // locale (e.g. Turkish dotted/dotless i).
        String codecString = conf.get(
            ConfigConstants.CONF_INPUT_COMPRESSION_CODEC,
            CompressionCodec.ZIP.toString()).toUpperCase(Locale.ROOT);
        try {
            codec = CompressionCodec.valueOf(codecString);
        } catch (IllegalArgumentException e) {
            // Report the configured string: codec itself was not assigned
            // by the failed lookup and may still be null here.
            String error = "Unsupported codec: " + codecString;
            LOG.error(error, new UnsupportedOperationException(error, e));
            fileIn.close();
            return;
        }
        switch (codec) {
        case ZIP:
            zipIn = new ZipInputStream(fileIn);
            break;
        case GZIP:
            try {
                zipIn = new GZIPInputStream(fileIn);
            } catch (IOException e) {
                // The GZIP header is read eagerly; do not leak the raw
                // stream when it is malformed.
                fileIn.close();
                throw e;
            }
            String uri = makeURIFromPath(file);
            String lowerUri = uri.toLowerCase(Locale.ROOT);
            if (lowerUri.endsWith(".gz") || lowerUri.endsWith(".gzip")) {
                // Drop the compression suffix from the document URI.
                uri = uri.substring(0, uri.lastIndexOf('.'));
            }
            setKey(uri, 0, 0, true);
            break;
        default:
            String error = "Unsupported codec: " + codec.name();
            LOG.error(error, new UnsupportedOperationException(error));
            fileIn.close();
        }
    }

    /**
     * Advances to the next record.
     *
     * <p>ZIP: moves to the next entry whose reported size is not 0, sets the
     * key from the entry name and loads the entry content as the value. When
     * the archive is exhausted and the directory iterator has more files, the
     * next file is opened and reading continues there.
     *
     * <p>GZIP: the whole decompressed stream becomes one record; the stream
     * is then closed and the reader is marked as exhausted.
     *
     * @return true if a record was produced, false when there is none left
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (zipIn == null) {
            hasNext = false;
            return false;
        }
        if (codec == CompressionCodec.ZIP) {
            ZipEntry zipEntry;
            ZipInputStream zis = (ZipInputStream) zipIn;
            while (true) {
                try {
                    zipEntry = zis.getNextEntry();
                    if (zipEntry == null) {
                        break;
                    }
                    if (zipEntry.getSize() == 0) {
                        continue;
                    }
                    subId = zipEntry.getName();
                    String uri = makeURIForZipEntry(file, subId);
                    // NOTE(review): setKey is defined in the superclass; a
                    // true result returns the record without reading the
                    // entry content - presumably the key was skipped.
                    if (setKey(uri, 0, 0, true)) {
                        return true;
                    }
                    setValue(zipEntry.getSize());
                    return true;
                } catch (IllegalArgumentException e) {
                    LOG.warn("Skipped a zip entry in : " + file.toUri()
                            + ", reason: " + e.getMessage());
                }
            }
        } else if (codec == CompressionCodec.GZIP) {
            setValue(0);
            zipIn.close();
            zipIn = null;
            hasNext = false;
            return true;
        } else {
            return false;
        }
        // Current archive is exhausted: move on to the next file, if any.
        if (iterator != null && iterator.hasNext()) {
            close();
            initStream(iterator.next());
            return nextKeyValue();
        } else {
            hasNext = false;
            return false;
        }
    }

    /**
     * Reads the rest of the current entry/stream from {@link #zipIn} and
     * stores it in the record value.
     *
     * <p>A {@link Text} value receives the bytes decoded with the configured
     * encoding; otherwise the value is treated as a {@link BytesWritable}.
     *
     * @param length expected uncompressed size used to presize the buffer;
     *        values {@code <= 0} mean unknown
     * @throws IOException if reading the stream fails
     */
    @SuppressWarnings("unchecked") // VALUEIN is BytesWritable on that branch
    protected void setValue(long length) throws IOException {
        ByteArrayOutputStream baos;
        if (length > 0) {
            // A length above Integer.MAX_VALUE can become negative after the
            // cast; ByteArrayOutputStream then throws
            // IllegalArgumentException, which the ZIP loop in nextKeyValue
            // catches to skip the entry.
            baos = new ByteArrayOutputStream((int) length);
        } else {
            baos = new ByteArrayOutputStream();
        }

        int size;
        while ((size = zipIn.read(buf, 0, buf.length)) != -1) {
            baos.write(buf, 0, size);
        }
        if (value instanceof Text) {
            ((Text) value).set(baos.toString(encoding));
        } else {
            if (batchSize > 1) {
                // Copy data since XCC won't do it when Content is created.
                value = (VALUEIN) new BytesWritable();
            }
            ((BytesWritable) value).set(baos.toByteArray(), 0, baos.size());
        }
        baos.close();
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy