// All Downloads are FREE. Search and download functionalities are using the official Maven repository.
// com.datastax.data.exploration.common.SparkOperator Maven / Gradle / Ivy

package com.datastax.data.exploration.common;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.javatuples.Pair;
import org.javatuples.Triplet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.List;

/**
 * 数据探索中Spark操作
 */
public class SparkOperator {

    private static final Logger logger = LoggerFactory.getLogger(SparkOperator.class);

    /**
     * 读取数据集
     * @param title 数据集是否包含标题
     * @param address 数据集位置
     * @param limit 数据集最大展示记录数
     * @return 数据集记录数,标题,数据
     */
    public static Triplet> getData(boolean title, String address, long limit) {
        SparkSession sparkSession = SparkSession.builder()
                .appName("datastax-insight")
                .master("local[*]")
                .getOrCreate();
        try {
            Dataset dataset = getDataset(sparkSession, title, address);
            long count = dataset.count();
            if (count < limit) {
                String[] columns = dataset.columns();
                List listRow = dataset.collectAsList();
                return new Triplet<>(count, columns, listRow);
            }
            return new Triplet<>(count, null, null);
        } catch (Exception e) {
            return new Triplet<>(0L, null, null);
        } finally {
            sparkSession.stop();
            logger.info(sparkSession.hashCode() + " sparkSession stop!");
        }
    }

    /**
     * 数据集抽样
     * @param title 数据集是否包含标题
     * @param address 数据集位置
     * @param size 数据集抽样记录数
     * @return 标题,数据
     */
    public static Pair> sample(boolean title, String address, int size) {
        SparkSession sparkSession = SparkSession.builder()
                .appName("datastax-insight")
                .master("local[*]")
                .getOrCreate();
        try {
            Dataset dataset = getDataset(sparkSession, title, address);
            String[] columns = dataset.columns();
            List listRow = dataset.javaRDD().takeSample(false, size);
            return new Pair<>(columns, listRow);
        } catch (Exception e) {
            return new Pair<>(null, null);
        } finally {
            sparkSession.stop();
            logger.info(sparkSession.hashCode() + " sparkSession stop!");
        }
    }

    private static Dataset getDataset(SparkSession sparkSession, boolean title, String address) {
        Dataset dataset;
        if (title) {
            dataset = sparkSession.read().option("header", "true").csv(address);
        } else {
            dataset = sparkSession.read().csv(address);
        }
        if(dataset == null) logger.info("read dataset error!");
        return dataset;
    }

    public static Pair> getData(boolean title, String address) {
        SparkSession sparkSession = SparkSession.builder()
                .appName("datastax-insight")
                .master("local[*]")
                .getOrCreate();
        try {
            Dataset dataset = getDataset(sparkSession, title, address);
            String[] columns = dataset.columns();
            List listRow = dataset.collectAsList();
            return new Pair<>(columns, listRow);
        } catch (Exception e) {
            return new Pair<>(null, null);
        } finally {
            sparkSession.stop();
            logger.info(sparkSession.hashCode() + " sparkSession stop!");
        }
    }

}




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy