org.datacleaner.spark.SparkAnalysisRunner Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of DataCleaner-env-spark
There is a newer version: 6.0.0
/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA  02110-1301  USA
 */
package org.datacleaner.spark;

import java.util.Collections;
import java.util.List;

import org.apache.metamodel.csv.CsvConfiguration;
import org.apache.metamodel.fixedwidth.FixedWidthConfiguration;
import org.apache.metamodel.util.Resource;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.datacleaner.api.AnalyzerResult;
import org.datacleaner.api.InputRow;
import org.datacleaner.connection.CsvDatastore;
import org.datacleaner.connection.Datastore;
import org.datacleaner.connection.FixedWidthDatastore;
import org.datacleaner.connection.JsonDatastore;
import org.datacleaner.job.AnalysisJob;
import org.datacleaner.job.runner.AnalysisResultFuture;
import org.datacleaner.job.runner.AnalysisRunner;
import org.datacleaner.spark.functions.AnalyzerResultReduceFunction;
import org.datacleaner.spark.functions.CsvParserFunction;
import org.datacleaner.spark.functions.ExtractAnalyzerResultFunction;
import org.datacleaner.spark.functions.FixedWidthParserFunction;
import org.datacleaner.spark.functions.JsonParserFunction;
import org.datacleaner.spark.functions.RowProcessingFunction;
import org.datacleaner.spark.functions.TuplesToTuplesFunction;
import org.datacleaner.spark.functions.ValuesToInputRowFunction;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import scala.Tuple2;

public class SparkAnalysisRunner implements AnalysisRunner {

    private static final Logger logger = LoggerFactory.getLogger(SparkAnalysisRunner.class);

    private final SparkJobContext _sparkJobContext;
    private final JavaSparkContext _sparkContext;

    private final Integer _minPartitions;

    public SparkAnalysisRunner(JavaSparkContext sparkContext, SparkJobContext sparkJobContext) {
        this(sparkContext, sparkJobContext, null);
    }

    public SparkAnalysisRunner(JavaSparkContext sparkContext, SparkJobContext sparkJobContext, Integer minPartitions) {
        _sparkContext = sparkContext;
        _sparkJobContext = sparkJobContext;
        if (minPartitions != null) {
            if (minPartitions > 0) {
                _minPartitions = minPartitions;
            } else {
                logger.warn(
                        "Minimum number of partitions needs to be a positive number, but specified: {}. Disregarding the value and inferring the number of partitions automatically",
                        minPartitions);
                _minPartitions = null;
            }
        } else {
            _minPartitions = null;
        }
    }

    @Override
    public AnalysisResultFuture run(AnalysisJob job) {
        return run();
    }

    public AnalysisResultFuture run() {
        _sparkJobContext.triggerOnJobStart();
        final AnalysisJob analysisJob = _sparkJobContext.getAnalysisJob();
        final Datastore datastore = analysisJob.getDatastore();

        final JavaRDD inputRowsRDD = openSourceDatastore(datastore);

        final JavaPairRDD namedAnalyzerResultsRDD;
        if (_sparkJobContext.getAnalysisJobBuilder().isDistributable()) {
            logger.info("Running the job in distributed mode");

            // TODO: We have yet to get more experience with this setting - do a
            // benchmark of what works best, true or false.
            final boolean preservePartitions = true;

            final JavaRDD> processedTuplesRdd = inputRowsRDD
                    .mapPartitionsWithIndex(new RowProcessingFunction(_sparkJobContext), preservePartitions);
            
            if (_sparkJobContext.isResultEnabled()) {
                final JavaPairRDD partialNamedAnalyzerResultsRDD = processedTuplesRdd
                        .mapPartitionsToPair(new TuplesToTuplesFunction(), preservePartitions);
                
                namedAnalyzerResultsRDD = partialNamedAnalyzerResultsRDD.reduceByKey(new AnalyzerResultReduceFunction(
                        _sparkJobContext));
            } else {
                // call count() to block and wait for RDD to be fully processed
                processedTuplesRdd.count();
                namedAnalyzerResultsRDD = null;
            }
        } else {
            logger.warn("Running the job in non-distributed mode");
            final JavaRDD coalescedInputRowsRDD = inputRowsRDD.coalesce(1);
            namedAnalyzerResultsRDD = coalescedInputRowsRDD.mapPartitionsToPair(new RowProcessingFunction(
                    _sparkJobContext));
            
            if (!_sparkJobContext.isResultEnabled()) {
                // call count() to block and wait for RDD to be fully processed
                namedAnalyzerResultsRDD.count();
            }
        }
        
        if (!_sparkJobContext.isResultEnabled()) {
            final List> results = Collections.emptyList();
            return new SparkAnalysisResultFuture(results, _sparkJobContext);
        }

        assert namedAnalyzerResultsRDD != null;
        final JavaPairRDD finalAnalyzerResultsRDD = namedAnalyzerResultsRDD
                .mapValues(new ExtractAnalyzerResultFunction());

        // log analyzer results
        final List> results = finalAnalyzerResultsRDD.collect();

        logger.info("Finished! Number of AnalyzerResult objects: {}", results.size());
        for (Tuple2 analyzerResultTuple : results) {
            final String key = analyzerResultTuple._1;
            final AnalyzerResult result = analyzerResultTuple._2;
            logger.info("AnalyzerResult (" + key + "):\n\n" + result + "\n");
        }

        _sparkJobContext.triggerOnJobEnd();
        return new SparkAnalysisResultFuture(results, _sparkJobContext);
    }

    private JavaRDD openSourceDatastore(Datastore datastore) {
        if (datastore instanceof CsvDatastore) {
            final CsvDatastore csvDatastore = (CsvDatastore) datastore;
            final Resource resource = csvDatastore.getResource();
            assert resource != null;
            final String datastorePath = resource.getQualifiedPath();

            final CsvConfiguration csvConfiguration = csvDatastore.getCsvConfiguration();

            final JavaRDD rawInput;
            if (_minPartitions != null) {
                rawInput = _sparkContext.textFile(datastorePath, _minPartitions);
            } else {
                rawInput = _sparkContext.textFile(datastorePath);
            }
            final JavaRDD parsedInput = rawInput.map(new CsvParserFunction(csvConfiguration));

            JavaPairRDD zipWithIndex = parsedInput.zipWithIndex();

            if (csvConfiguration.getColumnNameLineNumber() != CsvConfiguration.NO_COLUMN_NAME_LINE) {
                zipWithIndex = zipWithIndex.filter(new SkipHeaderLineFunction(csvConfiguration
                        .getColumnNameLineNumber()));
            }

            final JavaRDD inputRowsRDD = zipWithIndex.map(new ValuesToInputRowFunction(_sparkJobContext));

            return inputRowsRDD;
        } else if (datastore instanceof JsonDatastore) {
            final JsonDatastore jsonDatastore = (JsonDatastore) datastore;
            final String datastorePath = jsonDatastore.getResource().getQualifiedPath();
            final JavaRDD rawInput;
            if (_minPartitions != null) {
                rawInput = _sparkContext.textFile(datastorePath, _minPartitions);
            } else {
                rawInput = _sparkContext.textFile(datastorePath);
            }

            final JavaRDD parsedInput = rawInput.map(new JsonParserFunction(jsonDatastore));
            final JavaPairRDD zipWithIndex = parsedInput.zipWithIndex();
            final JavaRDD inputRowsRDD = zipWithIndex.map(new ValuesToInputRowFunction(_sparkJobContext));
            return inputRowsRDD;
        } else if (datastore instanceof FixedWidthDatastore) {

            final FixedWidthDatastore fixedWidthDatastore = (FixedWidthDatastore) datastore;

            final Resource resource = fixedWidthDatastore.getResource();
            final String datastorePath = resource.getQualifiedPath();
            final FixedWidthConfiguration fixedWidthConfiguration = fixedWidthDatastore.getConfiguration();
            final JavaRDD rawInput;
            if (_minPartitions != null) {
                rawInput = _sparkContext.textFile(datastorePath, _minPartitions);
            } else {
                rawInput = _sparkContext.textFile(datastorePath);
            }

            final JavaRDD parsedInput = rawInput.map(new FixedWidthParserFunction(fixedWidthConfiguration));

            JavaPairRDD zipWithIndex = parsedInput.zipWithIndex();

            if (fixedWidthConfiguration.getColumnNameLineNumber() != FixedWidthConfiguration.NO_COLUMN_NAME_LINE) {
                zipWithIndex = zipWithIndex.filter(new SkipHeaderLineFunction(fixedWidthConfiguration
                        .getColumnNameLineNumber()));
            }

            final JavaRDD inputRowsRDD = zipWithIndex.map(new ValuesToInputRowFunction(_sparkJobContext));
            return inputRowsRDD;
        }

        throw new UnsupportedOperationException("Unsupported datastore type or configuration: " + datastore);
    }
}