All downloads are free. Search and download functionality uses the official Maven repository.

com.marklogic.spark.reader.file.GenericFileReader Maven / Gradle / Ivy

There is a newer version: 2.4.2
Show newest version
/*
 * Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
 */
package com.marklogic.spark.reader.file;

import com.marklogic.spark.ConnectorException;
import com.marklogic.spark.Util;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.connector.read.PartitionReader;
import org.apache.spark.unsafe.types.ByteArray;
import org.apache.spark.unsafe.types.UTF8String;

import java.io.IOException;
import java.io.InputStream;

/**
 * "Generic" = read each file as-is with no special processing.
 * <p>
 * Walks the paths of a single {@link FilePartition} in order and produces one row per file. Each row has
 * eight columns: the file path, the file content as a byte array, and six {@code null} values.
 * <p>
 * When a file cannot be read, the reader either throws a {@link ConnectorException} or logs a warning and
 * moves on to the next file, depending on {@code FileContext.isReadAbortOnFailure()}.
 */
class GenericFileReader implements PartitionReader<InternalRow> {

    private final FilePartition filePartition;
    private final FileContext fileContext;
    private final boolean isStreaming;

    // The row produced by the most recent successful call to next().
    private InternalRow nextRowToReturn;

    // Index of the next path in the partition that has not yet been attempted.
    private int filePathIndex;

    /**
     * @param filePartition supplies the file paths that this reader iterates over
     * @param fileContext   used to decode paths, open files, read bytes, and determine the streaming and
     *                      abort-on-failure settings
     */
    GenericFileReader(FilePartition filePartition, FileContext fileContext) {
        this.filePartition = filePartition;
        this.fileContext = fileContext;
        this.isStreaming = fileContext.isStreamingFiles();
    }

    /**
     * Advances to the next file that can be turned into a row.
     * <p>
     * Unreadable files are skipped iteratively rather than recursively, so the call stack does not grow with
     * the number of consecutive failures in a partition.
     *
     * @return {@code true} if a row is available via {@link #get()}; {@code false} once every path in the
     * partition has been attempted
     * @throws ConnectorException if a file cannot be read and the file context is configured to abort on failure
     */
    @Override
    public boolean next() {
        while (filePathIndex < filePartition.getPaths().size()) {
            // If streaming, we want to put the unaltered file path in the row. The writer can then decode it
            // and also use its original value as the URI, as the PUT v1/documents endpoint does not allow
            // e.g. spaces.
            final String originalFilePath = filePartition.getPaths().get(filePathIndex);
            final String path = this.isStreaming ? originalFilePath : fileContext.decodeFilePath(originalFilePath);

            // Advance before reading so that a failed file is never retried.
            filePathIndex++;
            try {
                nextRowToReturn = buildRow(path);
                return true;
            } catch (Exception ex) {
                String message = String.format("Unable to read file at %s; cause: %s", path, ex.getMessage());
                if (fileContext.isReadAbortOnFailure()) {
                    throw new ConnectorException(message, ex);
                }
                // Best-effort mode: report the failure and fall through to try the next path.
                Util.MAIN_LOGGER.warn(message);
            }
        }
        return false;
    }

    /**
     * @return the row built by the most recent successful call to {@link #next()}, or {@code null} if
     * {@link #next()} has not yet succeeded
     */
    @Override
    public InternalRow get() {
        return nextRowToReturn;
    }

    @Override
    public void close() throws IOException {
        // Nothing to close.
    }

    /**
     * Builds the row for a single file: its path, its content, and six null columns.
     * <p>
     * When streaming, the content column holds the serialized file context instead of the file's bytes.
     */
    private InternalRow buildRow(String path) throws Exception {
        byte[] content = this.isStreaming ?
            FileUtil.serializeFileContext(fileContext, path) :
            readFileIntoByteArray(path);

        return new GenericInternalRow(new Object[]{
            UTF8String.fromString(path),
            ByteArray.concat(content),
            null, null, null, null, null, null
        });
    }

    /**
     * Opens the file at the given path via the file context and reads it fully; the stream is always closed.
     */
    private byte[] readFileIntoByteArray(String path) throws IOException {
        try (InputStream inputStream = fileContext.openFile(path)) {
            return fileContext.readBytes(inputStream);
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy