All downloads are free. Search and download functionality uses the official Maven repository.

com.marklogic.spark.reader.file.GzipFileReader Maven / Gradle / Ivy

/*
 * Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
 */
package com.marklogic.spark.reader.file;

import com.marklogic.spark.ConnectorException;
import com.marklogic.spark.Util;
import org.apache.commons.io.IOUtils;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.connector.read.PartitionReader;
import org.apache.spark.unsafe.types.ByteArray;
import org.apache.spark.unsafe.types.UTF8String;

import java.io.IOException;
import java.io.InputStream;

/**
 * Reads the files listed in a {@link FilePartition} one at a time and returns one row per file that is read
 * successfully. Each row holds a URI derived from the file path (with any trailing ".gz" or ".gzip" extension
 * removed) in the first column and the bytes read from the file in the second column; the remaining six columns
 * are null.
 * <p>
 * When a file cannot be read, the failure is either rethrown or logged and skipped, depending on
 * {@code FileContext.isReadAbortOnFailure()}.
 */
class GzipFileReader implements PartitionReader<InternalRow> {

    private final FilePartition filePartition;
    private final FileContext fileContext;

    // Index into filePartition.getPaths() of the next file to attempt.
    private int nextFilePathIndex;
    // Row produced by the most recent successful call to next().
    private InternalRow rowToReturn;

    GzipFileReader(FilePartition filePartition, FileContext fileContext) {
        this.filePartition = filePartition;
        this.fileContext = fileContext;
    }

    /**
     * Advances to the next readable file in the partition and builds its row.
     *
     * @return true if a row is available via {@link #get()}; false once every path has been attempted
     * @throws RuntimeException if a file cannot be read and the file context is configured to abort on failure
     */
    @Override
    public boolean next() {
        // Loop instead of recursing on failure: a long run of unreadable files must not grow the call stack,
        // and each file's stream must be closed before the next file is opened.
        while (nextFilePathIndex < filePartition.getPaths().size()) {
            String currentFilePath = filePartition.getPaths().get(nextFilePathIndex);
            nextFilePathIndex++;
            InputStream gzipInputStream = null;
            try {
                gzipInputStream = fileContext.openFile(currentFilePath);
                byte[] content = extractGZIPContents(currentFilePath, gzipInputStream);
                String uri = makeURI(currentFilePath);
                this.rowToReturn = new GenericInternalRow(new Object[]{
                    UTF8String.fromString(uri), ByteArray.concat(content),
                    null, null, null, null, null, null
                });
                return true;
            } catch (RuntimeException ex) {
                if (fileContext.isReadAbortOnFailure()) {
                    throw ex;
                }
                // Best-effort mode: report the failure and fall through to the next path.
                Util.MAIN_LOGGER.warn("Unable to read file at {}; cause: {}", currentFilePath, ex.getMessage());
            } finally {
                // closeQuietly tolerates a null stream (openFile may have thrown) and ignores close failures.
                IOUtils.closeQuietly(gzipInputStream);
            }
        }
        return false;
    }

    /**
     * @return the row built by the most recent successful call to {@link #next()}
     */
    @Override
    public InternalRow get() {
        return rowToReturn;
    }

    @Override
    public void close() {
        // Nothing to close; each file's stream is closed within next().
    }

    /**
     * Reads all bytes from the given stream via the file context.
     *
     * @param currentFilePath path of the file being read; used only for the error message
     * @param gzipInputStream stream previously opened for {@code currentFilePath}
     * @return the bytes read from the stream
     * @throws ConnectorException wrapping any {@link IOException} raised while reading
     */
    private byte[] extractGZIPContents(String currentFilePath, InputStream gzipInputStream) {
        try {
            return fileContext.readBytes(gzipInputStream);
        } catch (IOException e) {
            throw new ConnectorException(String.format("Unable to read from gzip file at %s; cause: %s",
                currentFilePath, e.getMessage()), e);
        }
    }

    /**
     * Derives the row URI from a file path by dropping a trailing ".gzip" or ".gz" extension, if present.
     *
     * @param path the file path
     * @return the path without its gzip extension, or the path unchanged if it has none
     */
    private String makeURI(String path) {
        // Copied from MLCP.
        return path.endsWith(".gzip") || path.endsWith(".gz") ?
            path.substring(0, path.lastIndexOf(".")) :
            path;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy