org.qcri.rheem.profiler.spark.Main Maven / Gradle / Ivy
package org.qcri.rheem.profiler.spark;
import de.hpi.isg.profiledb.instrumentation.StopWatch;
import de.hpi.isg.profiledb.store.model.Experiment;
import de.hpi.isg.profiledb.store.model.Subject;
import de.hpi.isg.profiledb.store.model.TimeMeasurement;
import org.qcri.rheem.core.api.Configuration;
import org.qcri.rheem.core.util.RheemArrays;
import org.qcri.rheem.core.util.RheemCollections;
import org.qcri.rheem.profiler.data.DataGenerators;
import org.qcri.rheem.spark.platform.SparkPlatform;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
/**
* Starts a profiling run of Spark.
*/
public class Main {
public static void main(String[] args) {
if (args.length < 2) {
System.err.printf("Usage: java %s [[,]*]+ \n", Main.class);
System.exit(1);
}
String operator = args[0];
List> allCardinalities = new LinkedList<>();
for (int i = 1; i < args.length; i++) {
List cardinalities = Arrays.stream(args[i].split(",")).map(Long::valueOf).collect(Collectors.toList());
allCardinalities.add(cardinalities);
}
List results;
switch (operator) {
case "textsource":
results = profile(OperatorProfilers.createSparkTextFileSourceProfiler(), allCardinalities);
break;
case "collectionsource":
results = profile(OperatorProfilers.createSparkCollectionSourceProfiler(), allCardinalities);
break;
case "map":
results = profile(OperatorProfilers.createSparkMapProfiler(), allCardinalities);
break;
case "filter":
results = profile(OperatorProfilers.createSparkFilterProfiler(), allCardinalities);
break;
case "flatmap":
results = profile(OperatorProfilers.createSparkFlatMapProfiler(), allCardinalities);
break;
case "reduce":
results = profile(OperatorProfilers.createSparkReduceByProfiler(), allCardinalities);
break;
case "globalreduce":
results = profile(OperatorProfilers.createSparkGlobalReduceProfiler(), allCardinalities);
break;
case "distinct":
case "distinct-string":
results = profile(OperatorProfilers.createSparkDistinctProfiler(), allCardinalities);
break;
case "distinct-integer":
results = profile(OperatorProfilers.createSparkDistinctProfiler(
DataGenerators.createReservoirBasedIntegerSupplier(new ArrayList<>(), 0.7d, new Random(42)),
Integer.class,
new Configuration()
), allCardinalities);
break;
case "sort":
case "sort-string":
results = profile(OperatorProfilers.createSparkSortProfiler(), allCardinalities);
break;
case "sort-integer":
results = profile(OperatorProfilers.createSparkSortProfiler(
DataGenerators.createReservoirBasedIntegerSupplier(new ArrayList<>(), 0.7d, new Random(42)),
Integer.class,
new Configuration()
), allCardinalities);
break;
case "count":
results = profile(OperatorProfilers.createSparkCountProfiler(), allCardinalities);
break;
case "groupby":
results = profile(OperatorProfilers.createSparkMaterializedGroupByProfiler(), allCardinalities);
break;
case "join":
results = profile(OperatorProfilers.createSparkJoinProfiler(), allCardinalities);
break;
case "union":
results = profile(OperatorProfilers.createSparkUnionProfiler(), allCardinalities);
break;
case "cartesian":
results = profile(OperatorProfilers.createSparkCartesianProfiler(), allCardinalities);
break;
case "callbacksink":
results = profile(OperatorProfilers.createSparkLocalCallbackSinkProfiler(), allCardinalities);
break;
// case "word-count-split": {
// final Supplier randomStringSupplier = DataGenerators.createRandomStringSupplier(2, 10, new Random(42));
// results = profile(
// org.qcri.rheem.profiler.java.OperatorProfilers.createJavaFlatMapProfiler(
// () -> String.format("%s %s %s %s %s %s %s %s %s",
// randomStringSupplier.get(), randomStringSupplier.get(),
// randomStringSupplier.get(), randomStringSupplier.get(),
// randomStringSupplier.get(), randomStringSupplier.get(),
// randomStringSupplier.get(), randomStringSupplier.get(),
// randomStringSupplier.get()),
// str -> Arrays.asList(str.split(" ")),
// String.class,
// String.class
// ),
// cardinalities);
// break;
// }
// case "word-count-canonicalize": {
// final Supplier randomStringSupplier = DataGenerators.createRandomStringSupplier(2, 10, new Random(42));
// results = profile(
// org.qcri.rheem.profiler.java.OperatorProfilers.createJavaMapProfiler(
// randomStringSupplier,
// word -> new Tuple2<>(word.toLowerCase(), 1),
// String.class,
// Tuple2.class
// ),
// cardinalities
// );
// break;
// }
// case "word-count-count": {
// final Supplier stringSupplier = DataGenerators.createReservoirBasedStringSupplier(new ArrayList<>(), 0.7, new Random(42), 2, 10);
// results = profile(
// org.qcri.rheem.profiler.java.OperatorProfilers.createJavaReduceByProfiler(
// () -> new Tuple2<>(stringSupplier.get(), 1),
// pair -> pair.field0,
// (p1, p2) -> {
// p1.field1 += p2.field1;
// return p1;
// },
// cast(Tuple2.class),
// String.class
// ),
// cardinalities
// );
// break;
// }
default:
System.out.println("Unknown operator: " + operator);
return;
}
System.out.println();
System.out.println(RheemCollections.getAny(results).getCsvHeader());
results.forEach(result -> System.out.println(result.toCsvString()));
}
private static StopWatch createStopWatch() {
Experiment experiment = new Experiment("rheem-profiler", new Subject("Rheem", "0.1"));
return new StopWatch(experiment);
}
/**
* Run the {@code opProfiler} with all combinations that can be derived from {@code allCardinalities}.
*/
private static List profile(SparkOperatorProfiler opProfiler,
List> allCardinalities) {
return StreamSupport.stream(RheemCollections.streamedCrossProduct(allCardinalities).spliterator(), false)
.map(cardinalities -> profile(opProfiler, RheemArrays.toArray(cardinalities)))
.collect(Collectors.toList());
}
/**
* Run the {@code opProfiler} with the given {@code cardinalities}.
*/
private static SparkOperatorProfiler.Result profile(SparkOperatorProfiler opProfiler, long... cardinalities) {
System.out.printf("Profiling %s with %s data quanta.\n", opProfiler, RheemArrays.asList(cardinalities));
final StopWatch stopWatch = createStopWatch();
SparkOperatorProfiler.Result result = null;
try {
System.out.println("Prepare...");
final TimeMeasurement preparation = stopWatch.start("Preparation");
SparkPlatform.getInstance().warmUp(new Configuration());
opProfiler.prepare(cardinalities);
preparation.stop();
System.out.println("Execute...");
final TimeMeasurement execution = stopWatch.start("Execution");
result = opProfiler.run();
execution.stop();
} finally {
System.out.println("Clean up...");
final TimeMeasurement cleanUp = stopWatch.start("Clean up");
opProfiler.cleanUp();
cleanUp.stop();
System.out.println("Measurement:");
if (result != null) System.out.println(result);
System.out.println(stopWatch.toPrettyString());
System.out.println();
}
return result;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy