/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.spark;
import java.util.Collections;
import java.util.List;
import org.apache.metamodel.csv.CsvConfiguration;
import org.apache.metamodel.util.Resource;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.datacleaner.api.AnalyzerResult;
import org.datacleaner.api.InputRow;
import org.datacleaner.connection.CsvDatastore;
import org.datacleaner.connection.Datastore;
import org.datacleaner.connection.JsonDatastore;
import org.datacleaner.job.AnalysisJob;
import org.datacleaner.job.runner.AnalysisResultFuture;
import org.datacleaner.job.runner.AnalysisRunner;
import org.datacleaner.spark.functions.AnalyzerResultReduceFunction;
import org.datacleaner.spark.functions.CsvParserFunction;
import org.datacleaner.spark.functions.ExtractAnalyzerResultFunction;
import org.datacleaner.spark.functions.JsonParserFunction;
import org.datacleaner.spark.functions.RowProcessingFunction;
import org.datacleaner.spark.functions.TuplesToTuplesFunction;
import org.datacleaner.spark.functions.ValuesToInputRowFunction;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
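/**
 * An {@link AnalysisRunner} that executes the DataCleaner {@link AnalysisJob}
 * held by a {@link SparkJobContext} on Apache Spark. The source datastore is
 * read into a {@link JavaRDD} of {@link InputRow}s, rows are processed per
 * partition, and partial {@link AnalyzerResult}s are reduced into the final
 * result that is collected on the driver.
 *
 * <p>Minimal usage sketch (the construction of the {@link SparkJobContext} is
 * omitted here, since it depends on how configuration and job files are
 * supplied to the Spark application):
 *
 * <pre>
 * final JavaSparkContext sparkContext = new JavaSparkContext(new SparkConf().setAppName("DataCleaner job"));
 * final SparkJobContext jobContext = ...; // built from the DataCleaner configuration and analysis job XML
 * final AnalysisResultFuture result = new SparkAnalysisRunner(sparkContext, jobContext).run();
 * </pre>
 */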
public class SparkAnalysisRunner implements AnalysisRunner {
private static final Logger logger = LoggerFactory.getLogger(SparkAnalysisRunner.class);
private final SparkJobContext _sparkJobContext;
private final JavaSparkContext _sparkContext;
private final Integer _minPartitions;
public SparkAnalysisRunner(JavaSparkContext sparkContext, SparkJobContext sparkJobContext) {
this(sparkContext, sparkJobContext, null);
}
public SparkAnalysisRunner(JavaSparkContext sparkContext, SparkJobContext sparkJobContext, Integer minPartitions) {
_sparkContext = sparkContext;
_sparkJobContext = sparkJobContext;
if (minPartitions != null) {
if (minPartitions > 0) {
_minPartitions = minPartitions;
} else {
logger.warn(
"Minimum number of partitions needs to be a positive number, but specified: {}. Disregarding the value and inferring the number of partitions automatically",
minPartitions);
_minPartitions = null;
}
} else {
_minPartitions = null;
}
}
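    /**
     * {@inheritDoc}
     *
     * <p>Note that the passed {@code job} argument is ignored; the job defined
     * by the {@link SparkJobContext} is always the one that gets executed.
     */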
@Override
public AnalysisResultFuture run(AnalysisJob job) {
return run();
}
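    /**
     * Runs the analysis job defined by the {@link SparkJobContext} and returns
     * a future holding the collected {@link AnalyzerResult}s (empty when
     * result gathering is disabled).
     */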
public AnalysisResultFuture run() {
_sparkJobContext.triggerOnJobStart();
final AnalysisJob analysisJob = _sparkJobContext.getAnalysisJob();
final Datastore datastore = analysisJob.getDatastore();
        final JavaRDD<InputRow> inputRowsRDD = openSourceDatastore(datastore);
        final JavaPairRDD<String, NamedAnalyzerResult> namedAnalyzerResultsRDD;
if (_sparkJobContext.getAnalysisJobBuilder().isDistributable()) {
logger.info("Running the job in distributed mode");
// TODO: We have yet to get more experience with this setting - do a
// benchmark of what works best, true or false.
final boolean preservePartitions = true;
            final JavaRDD<Tuple2<String, NamedAnalyzerResult>> processedTuplesRdd = inputRowsRDD
                    .mapPartitionsWithIndex(new RowProcessingFunction(_sparkJobContext), preservePartitions);
if (_sparkJobContext.isResultEnabled()) {
                final JavaPairRDD<String, NamedAnalyzerResult> partialNamedAnalyzerResultsRDD = processedTuplesRdd
                        .mapPartitionsToPair(new TuplesToTuplesFunction<String, NamedAnalyzerResult>(), preservePartitions);
namedAnalyzerResultsRDD = partialNamedAnalyzerResultsRDD.reduceByKey(new AnalyzerResultReduceFunction(
_sparkJobContext));
} else {
// call count() to block and wait for RDD to be fully processed
processedTuplesRdd.count();
namedAnalyzerResultsRDD = null;
}
} else {
logger.warn("Running the job in non-distributed mode");
            final JavaRDD<InputRow> coalescedInputRowsRDD = inputRowsRDD.coalesce(1);
namedAnalyzerResultsRDD = coalescedInputRowsRDD.mapPartitionsToPair(new RowProcessingFunction(
_sparkJobContext));
if (!_sparkJobContext.isResultEnabled()) {
// call count() to block and wait for RDD to be fully processed
namedAnalyzerResultsRDD.count();
}
}
if (!_sparkJobContext.isResultEnabled()) {
            final List<Tuple2<String, AnalyzerResult>> results = Collections.emptyList();
return new SparkAnalysisResultFuture(results, _sparkJobContext);
}
assert namedAnalyzerResultsRDD != null;
        final JavaPairRDD<String, AnalyzerResult> finalAnalyzerResultsRDD = namedAnalyzerResultsRDD
                .mapValues(new ExtractAnalyzerResultFunction());
// log analyzer results
        final List<Tuple2<String, AnalyzerResult>> results = finalAnalyzerResultsRDD.collect();
logger.info("Finished! Number of AnalyzerResult objects: {}", results.size());
        for (Tuple2<String, AnalyzerResult> analyzerResultTuple : results) {
final String key = analyzerResultTuple._1;
final AnalyzerResult result = analyzerResultTuple._2;
logger.info("AnalyzerResult (" + key + "):\n\n" + result + "\n");
}
_sparkJobContext.triggerOnJobEnd();
return new SparkAnalysisResultFuture(results, _sparkJobContext);
}
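    /**
     * Reads the source {@link Datastore} into a {@link JavaRDD} of
     * {@link InputRow}s. CSV and JSON datastores are read as text files,
     * honoring the configured minimum number of partitions when one is set.
     */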
    private JavaRDD<InputRow> openSourceDatastore(Datastore datastore) {
if (datastore instanceof CsvDatastore) {
final CsvDatastore csvDatastore = (CsvDatastore) datastore;
final Resource resource = csvDatastore.getResource();
assert resource != null;
final String datastorePath = resource.getQualifiedPath();
final CsvConfiguration csvConfiguration = csvDatastore.getCsvConfiguration();
            final JavaRDD<String> rawInput;
if (_minPartitions != null) {
rawInput = _sparkContext.textFile(datastorePath, _minPartitions);
} else {
rawInput = _sparkContext.textFile(datastorePath);
}
            final JavaRDD<Object[]> parsedInput = rawInput.map(new CsvParserFunction(csvConfiguration));