All downloads are free. Search and download functionality uses the official Maven repository.

org.hpccsystems.spark.datasource.HpccRelation Maven / Gradle / Ivy

package org.hpccsystems.spark.datasource;

import org.hpccsystems.spark.datasource.*;

import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.LogManager;

import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.sources.BaseRelation;
import org.apache.spark.sql.sources.Filter;
import org.apache.spark.sql.sources.PrunedFilteredScan;
import org.apache.spark.sql.types.StructType;

import org.hpccsystems.spark.HpccFile;
import org.hpccsystems.spark.SparkSchemaTranslator;

/**
 * Spark SQL relation that reads an HPCC logical file through {@link HpccFile}.
 *
 * The underlying {@link HpccFile} is created lazily, on the first call to either
 * {@link #schema()} or {@link #buildScan(String[], Filter[])}, and is then reused
 * by later calls on the same relation instance.
 *
 * NOTE(review): the lazy initialisation is not synchronized; this class assumes
 * it is driven from a single thread -- confirm against callers.
 */
public class HpccRelation extends BaseRelation implements PrunedFilteredScan
{
    private static final Logger log = LogManager.getLogger(HpccRelation.class);

    // Lazily created handle to the HPCC file; null until first needed.
    private HpccFile          file = null;
    private final SQLContext  sqlContext;
    private final HpccOptions options;

    /**
     * Creates a relation for the file described by the given options.
     * No connection to HPCC is attempted here; the file is opened on first use.
     *
     * @param context the Spark SQL context returned by {@link #sqlContext()}
     * @param opts    file name, connection info and read options for the HPCC file
     */
    public HpccRelation(SQLContext context, HpccOptions opts)
    {
        sqlContext = context;
        options = opts;
    }

    /**
     * @return always {@code true}
     */
    @Override
    public boolean needConversion()
    {
        return true;
    }

    /**
     * Returns the Spark schema for the projected record definition of the HPCC file.
     *
     * If the file has not been opened yet it is opened here, and the project list
     * from the options (when present) is applied to it.
     *
     * @return the Spark schema translated from the file's projected record definition
     * @throws RuntimeException if the file cannot be opened or its record definition
     *                          cannot be translated; the original exception is the cause
     */
    @Override
    public StructType schema()
    {
        if (file == null)
        {
            try
            {
                // Configure a local instance first and publish it only on success,
                // so a failure here never leaves a half-configured file cached.
                HpccFile opened = openFile();
                if (options.projectList != null)
                {
                    opened.setProjectList(options.projectList);
                }
                file = opened;
            }
            catch (Exception e)
            {
                String error = "Unable to create HpccRDD with error: " + e.getMessage();
                log.error(error, e);
                // Keep the original exception as the cause so its stack trace survives.
                throw new RuntimeException(error, e);
            }
        }

        try
        {
            return SparkSchemaTranslator.toSparkSchema(file.getProjectedRecordDefinition());
        }
        catch (Exception e)
        {
            String error = "Unable to translate HPCC record defintion to Spark schema:" + e.getMessage();
            log.error(error, e);
            throw new RuntimeException(error, e);
        }
    }

    /**
     * @return the size estimate provided by the superclass
     */
    @Override
    public long sizeInBytes()
    {
        return super.sizeInBytes();
    }

    /**
     * @return the SQL context supplied at construction time
     */
    @Override
    public SQLContext sqlContext()
    {
        return sqlContext;
    }

    /**
     * Returns the given filters unchanged, i.e. none of them are reported as
     * handled by this relation.
     *
     * @param filters the filters offered for push-down
     * @return the same array that was passed in
     */
    @Override
    public Filter[] unhandledFilters(Filter[] filters)
    {
        return filters;
    }

    /**
     * Builds the RDD used to scan the HPCC file.
     *
     * The project list applied to the file is {@code options.projectList} when it
     * is set, otherwise the required columns joined with {@code ", "}. The
     * {@code filters} argument is not used; only {@code options.filterString},
     * when set, is applied to the file.
     *
     * @param requiredColumns the columns requested for the scan
     * @param filters         push-down filters (ignored by this implementation)
     * @return the RDD obtained from the HPCC file
     * @throws RuntimeException if the file cannot be opened or configured, or the RDD
     *                          cannot be created; the original exception is the cause
     */
    @Override
    public RDD buildScan(String[] requiredColumns, Filter[] filters)
    {
        String projectList = String.join(", ", requiredColumns);

        try
        {
            if (file == null)
            {
                file = openFile();
            }

            file.setFilePartRecordLimit(options.filePartLimit);

            // An explicit project list in the options overrides the requested columns.
            if (options.projectList != null)
            {
                projectList = options.projectList;
            }
            file.setProjectList(projectList);

            if (options.filterString != null)
            {
                file.setFilter(options.filterString);
            }

            return file.getRDD(sqlContext.sparkContext());
        }
        catch (Exception e)
        {
            String error = "Unable to create HpccRDD with error: " + e.getMessage();
            log.error(error, e);
            // Keep the original exception as the cause so its stack trace survives.
            throw new RuntimeException(error, e);
        }
    }

    /**
     * Opens the HPCC file named in the options and applies the configured
     * file-access expiry. Does not touch the {@code file} field.
     *
     * @return a new {@link HpccFile} with the expiry set
     * @throws Exception if constructing or configuring the file fails
     */
    private HpccFile openFile() throws Exception
    {
        HpccFile opened = new HpccFile(options.fileName, options.connectionInfo);
        opened.setFileAccessExpirySecs(options.expirySeconds);
        return opened;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy