/*
* GRAKN.AI - THE KNOWLEDGE GRAPH
* Copyright (C) 2018 Grakn Labs Ltd
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

package ai.grakn.kb.internal.computer;

import org.apache.commons.configuration.ConfigurationUtils;
import org.apache.commons.configuration.FileConfiguration;
import org.apache.commons.configuration.PropertiesConfiguration;
import org.apache.commons.lang3.concurrent.BasicThreadFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.spark.HashPartitioner;
import org.apache.spark.Partitioner;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.launcher.SparkLauncher;
import org.apache.spark.serializer.KryoSerializer;
import org.apache.spark.storage.StorageLevel;
import org.apache.tinkerpop.gremlin.hadoop.Constants;
import org.apache.tinkerpop.gremlin.hadoop.process.computer.AbstractHadoopGraphComputer;
import org.apache.tinkerpop.gremlin.hadoop.process.computer.util.ComputerSubmissionHelper;
import org.apache.tinkerpop.gremlin.hadoop.structure.HadoopConfiguration;
import org.apache.tinkerpop.gremlin.hadoop.structure.HadoopGraph;
import org.apache.tinkerpop.gremlin.hadoop.structure.io.FileSystemStorage;
import org.apache.tinkerpop.gremlin.hadoop.structure.io.GraphFilterAware;
import org.apache.tinkerpop.gremlin.hadoop.structure.io.HadoopPoolShimService;
import org.apache.tinkerpop.gremlin.hadoop.structure.io.VertexWritable;
import org.apache.tinkerpop.gremlin.hadoop.structure.util.ConfUtil;
import org.apache.tinkerpop.gremlin.process.computer.ComputerResult;
import org.apache.tinkerpop.gremlin.process.computer.GraphComputer;
import org.apache.tinkerpop.gremlin.process.computer.MapReduce;
import org.apache.tinkerpop.gremlin.process.computer.Memory;
import org.apache.tinkerpop.gremlin.process.computer.VertexProgram;
import org.apache.tinkerpop.gremlin.process.computer.util.DefaultComputerResult;
import org.apache.tinkerpop.gremlin.process.computer.util.MapMemory;
import org.apache.tinkerpop.gremlin.process.traversal.TraversalStrategies;
import org.apache.tinkerpop.gremlin.process.traversal.util.TraversalInterruptedException;
import org.apache.tinkerpop.gremlin.spark.process.computer.payload.ViewIncomingPayload;
import org.apache.tinkerpop.gremlin.spark.process.computer.traversal.strategy.optimization.SparkInterceptorStrategy;
import org.apache.tinkerpop.gremlin.spark.process.computer.traversal.strategy.optimization.SparkSingleIterationStrategy;
import org.apache.tinkerpop.gremlin.spark.structure.Spark;
import org.apache.tinkerpop.gremlin.spark.structure.io.InputFormatRDD;
import org.apache.tinkerpop.gremlin.spark.structure.io.InputOutputHelper;
import org.apache.tinkerpop.gremlin.spark.structure.io.InputRDD;
import org.apache.tinkerpop.gremlin.spark.structure.io.OutputFormatRDD;
import org.apache.tinkerpop.gremlin.spark.structure.io.OutputRDD;
import org.apache.tinkerpop.gremlin.spark.structure.io.PersistedInputRDD;
import org.apache.tinkerpop.gremlin.spark.structure.io.PersistedOutputRDD;
import org.apache.tinkerpop.gremlin.spark.structure.io.SparkContextStorage;
import org.apache.tinkerpop.gremlin.spark.structure.io.gryo.GryoRegistrator;
import org.apache.tinkerpop.gremlin.spark.structure.io.gryo.kryoshim.unshaded.UnshadedKryoShimService;
import org.apache.tinkerpop.gremlin.structure.Direction;
import org.apache.tinkerpop.gremlin.structure.io.IoRegistry;
import org.apache.tinkerpop.gremlin.structure.io.Storage;
import org.apache.tinkerpop.gremlin.structure.io.gryo.kryoshim.KryoShimServiceLoader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadLocalRandom;

/**
 * This is a modified version of TinkerPop's Spark graph computer.
 * We changed its behaviour so that it won't destroy the RDD after every job.
 *
 * @author Jason Liu
 * @author Marko A. Rodriguez
 */
public final class GraknSparkComputer extends AbstractHadoopGraphComputer {
private static final Logger LOGGER = LoggerFactory.getLogger(GraknSparkComputer.class);
private final org.apache.commons.configuration.Configuration sparkConfiguration;
private boolean workersSet = false;
private final ThreadFactory threadFactoryBoss =
new BasicThreadFactory.Builder().namingPattern(GraknSparkComputer.class.getSimpleName() + "-boss").build();
private static final Set<String> KEYS_PASSED_IN_JVM_SYSTEM_PROPERTIES = new HashSet<>(Arrays.asList(
KryoShimServiceLoader.KRYO_SHIM_SERVICE,
IoRegistry.IO_REGISTRY));
private final ExecutorService computerService = Executors.newSingleThreadExecutor(threadFactoryBoss);
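// Register this computer's traversal strategies once: the default GraphComputer
// strategies plus TinkerPop's Spark-specific optimizations (the single-iteration
// and interceptor strategies imported above).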
static {
TraversalStrategies.GlobalCache.registerStrategies(GraknSparkComputer.class,
TraversalStrategies.GlobalCache.getStrategies(GraphComputer.class).clone().addStrategies(
SparkSingleIterationStrategy.instance(),
SparkInterceptorStrategy.instance()));
}
private String jobGroupId = null;
public GraknSparkComputer(final HadoopGraph hadoopGraph) {
super(hadoopGraph);
this.sparkConfiguration = new HadoopConfiguration();
ConfigurationUtils.copy(this.hadoopGraph.configuration(), this.sparkConfiguration);
}
@Override
public GraphComputer workers(final int workers) {
super.workers(workers);
if (this.sparkConfiguration.containsKey(SparkLauncher.SPARK_MASTER) &&
this.sparkConfiguration.getString(SparkLauncher.SPARK_MASTER).startsWith("local")) {
this.sparkConfiguration.setProperty(SparkLauncher.SPARK_MASTER, "local[" + this.workers + "]");
}
this.workersSet = true;
return this;
}
@Override
public GraphComputer configure(final String key, final Object value) {
this.sparkConfiguration.setProperty(key, value);
return this;
}
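/*
 * A minimal usage sketch. The graph and vertex program below are illustrative
 * assumptions, not part of this class:
 *
 *   GraknSparkComputer computer = new GraknSparkComputer(hadoopGraph);
 *   ComputerResult result = computer
 *           .workers(4)                               // rewrites "local[*]" to "local[4]" for local masters
 *           .configure("spark.executor.memory", "1g") // copied into the Spark configuration
 *           .program(PageRankVertexProgram.build().create(hadoopGraph))
 *           .submit()
 *           .get();
 */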
@Override
public Future<ComputerResult> submit() {
this.validateStatePriorToExecution();
return ComputerSubmissionHelper
.runWithBackgroundThread(exec -> submitWithExecutor(), "SparkSubmitter");
}
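/**
 * Cancels all Spark jobs in the job group created by the most recent call to
 * {@link #submit()}, if one exists.
 */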
public void cancelJobs() {
if (jobGroupId != null) {
Spark.getContext().cancelJobGroup(jobGroupId);
}
}
@SuppressWarnings("PMD.UnusedFormalParameter")
private Future<ComputerResult> submitWithExecutor() {
jobGroupId = Integer.toString(ThreadLocalRandom.current().nextInt(Integer.MAX_VALUE));
String jobDescription = this.vertexProgram == null ? this.mapReducers.toString() :
this.vertexProgram + "+" + this.mapReducers;
// Use a separate output location per job group so that jobs do not overwrite each other's output
this.sparkConfiguration.setProperty(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION,
this.sparkConfiguration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION) + "/" + jobGroupId);
updateConfigKeys(sparkConfiguration);
final Future<ComputerResult> result = computerService.submit(() -> {
final long startTime = System.currentTimeMillis();
//////////////////////////////////////////////////
/////// PROCESS SHIM AND SYSTEM PROPERTIES ///////
//////////////////////////////////////////////////
final String shimService = KryoSerializer.class.getCanonicalName().equals(this.sparkConfiguration.getString(Constants.SPARK_SERIALIZER, null)) ?
UnshadedKryoShimService.class.getCanonicalName() :
HadoopPoolShimService.class.getCanonicalName();
this.sparkConfiguration.setProperty(KryoShimServiceLoader.KRYO_SHIM_SERVICE, shimService);
///////////
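// Pass selected keys (the Kryo shim service and any IoRegistry) to the executor
// and driver JVMs as system properties, e.g. "-Dtinkerpop.gremlin.io.registry=...",
// so that Kryo serialization on remote workers sees the same configuration as the driver.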
final StringBuilder params = new StringBuilder();
this.sparkConfiguration.getKeys().forEachRemaining(key -> {
if (KEYS_PASSED_IN_JVM_SYSTEM_PROPERTIES.contains(key)) {
params.append(" -D").append("tinkerpop.").append(key).append("=").append(this.sparkConfiguration.getProperty(key));
System.setProperty("tinkerpop." + key, this.sparkConfiguration.getProperty(key).toString());
}
});
if (params.length() > 0) {
this.sparkConfiguration.setProperty(SparkLauncher.EXECUTOR_EXTRA_JAVA_OPTIONS,
(this.sparkConfiguration.getString(SparkLauncher.EXECUTOR_EXTRA_JAVA_OPTIONS, "") + params.toString()).trim());
this.sparkConfiguration.setProperty(SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS,
(this.sparkConfiguration.getString(SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS, "") + params.toString()).trim());
}
KryoShimServiceLoader.applyConfiguration(this.sparkConfiguration);
//////////////////////////////////////////////////
//////////////////////////////////////////////////
//////////////////////////////////////////////////
// apache and hadoop configurations that are used throughout the graph computer computation
final org.apache.commons.configuration.Configuration graphComputerConfiguration =
new HadoopConfiguration(this.sparkConfiguration);
if (!graphComputerConfiguration.containsKey(Constants.SPARK_SERIALIZER)) {
graphComputerConfiguration.setProperty(Constants.SPARK_SERIALIZER, KryoSerializer.class.getCanonicalName());
if (!graphComputerConfiguration.containsKey(Constants.SPARK_KRYO_REGISTRATOR)) {
graphComputerConfiguration.setProperty(Constants.SPARK_KRYO_REGISTRATOR,
GryoRegistrator.class.getCanonicalName());
}
}
graphComputerConfiguration.setProperty(Constants.GREMLIN_HADOOP_GRAPH_WRITER_HAS_EDGES,
this.persist.equals(GraphComputer.Persist.EDGES));
final Configuration hadoopConfiguration = ConfUtil.makeHadoopConfiguration(graphComputerConfiguration);
final Storage fileSystemStorage = FileSystemStorage.open(hadoopConfiguration);
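// Determine how the graph is read and written: file formats backed by HDFS
// versus RDDs persisted in the Spark context, plus optional partitioner/cache skips.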
final boolean inputFromHDFS = FileInputFormat.class.isAssignableFrom(
hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, Object.class));
final boolean inputFromSpark = PersistedInputRDD.class.isAssignableFrom(
hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, Object.class));
final boolean outputToHDFS = FileOutputFormat.class.isAssignableFrom(
hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, Object.class));
final boolean outputToSpark = PersistedOutputRDD.class.isAssignableFrom(
hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, Object.class));
final boolean skipPartitioner = graphComputerConfiguration.getBoolean(
Constants.GREMLIN_SPARK_SKIP_PARTITIONER, false);
final boolean skipPersist = graphComputerConfiguration.getBoolean(
Constants.GREMLIN_SPARK_SKIP_GRAPH_CACHE, false);
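// When reading from HDFS, resolve the search graph location to a fully
// qualified path and hand it to the file input format as the input directory.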
if (inputFromHDFS) {
String inputLocation = Constants
.getSearchGraphLocation(
hadoopConfiguration.get(Constants.GREMLIN_HADOOP_INPUT_LOCATION),
fileSystemStorage)
.orElse(null);
if (null != inputLocation) {
try {
graphComputerConfiguration.setProperty(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR,
FileSystem.get(hadoopConfiguration).getFileStatus(new Path(inputLocation)).getPath()
.toString());
hadoopConfiguration.set(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR,
FileSystem.get(hadoopConfiguration).getFileStatus(new Path(inputLocation)).getPath()
.toString());
} catch (final IOException e) {
throw new IllegalStateException(e.getMessage(), e);
}
}
}
final InputRDD inputRDD;
final OutputRDD outputRDD;
final boolean filtered;
try {
inputRDD = InputRDD.class.isAssignableFrom(
hadoopConfiguration.getClass(
Constants.GREMLIN_HADOOP_GRAPH_READER, Object.class)) ?
hadoopConfiguration.getClass(
Constants.GREMLIN_HADOOP_GRAPH_READER, InputRDD.class, InputRDD.class).newInstance() :
InputFormatRDD.class.newInstance();
outputRDD = OutputRDD.class.isAssignableFrom(
hadoopConfiguration.getClass(
Constants.GREMLIN_HADOOP_GRAPH_WRITER, Object.class)) ?
hadoopConfiguration.getClass(
Constants.GREMLIN_HADOOP_GRAPH_WRITER, OutputRDD.class, OutputRDD.class).newInstance() :
OutputFormatRDD.class.newInstance();
// if the input class can filter on load, then set the filters
if (inputRDD instanceof InputFormatRDD &&
GraphFilterAware.class.isAssignableFrom(hadoopConfiguration.getClass(
Constants.GREMLIN_HADOOP_GRAPH_READER, InputFormat.class, InputFormat.class))) {
GraphFilterAware.storeGraphFilter(
graphComputerConfiguration, hadoopConfiguration, this.graphFilter);
filtered = false;
} else if (inputRDD instanceof GraphFilterAware) {
((GraphFilterAware) inputRDD).setGraphFilter(this.graphFilter);
filtered = false;
} else filtered = this.graphFilter.hasFilter();
} catch (final InstantiationException | IllegalAccessException e) {
throw new IllegalStateException(e.getMessage(), e);
}
// create the spark context from the graph computer configuration
final JavaSparkContext sparkContext = new JavaSparkContext(Spark.create(hadoopConfiguration));
final Storage sparkContextStorage = SparkContextStorage.open();
sparkContext.setJobGroup(jobGroupId, jobDescription);
GraknSparkMemory memory = null;
// delete output location
final String outputLocation =
hadoopConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION, null);
if (null != outputLocation) {
if (outputToHDFS && fileSystemStorage.exists(outputLocation)) {
fileSystemStorage.rm(outputLocation);
}
if (outputToSpark && sparkContextStorage.exists(outputLocation)) {
sparkContextStorage.rm(outputLocation);
}
}
// the Spark application name will always be set by SparkContextStorage;
// log it here to make debugging easier
logger.debug(Constants.GREMLIN_HADOOP_SPARK_JOB_PREFIX +
(null == this.vertexProgram ? "No VertexProgram" :
this.vertexProgram) + "[" + this.mapReducers + "]");
// add the project jars to the cluster
this.loadJars(hadoopConfiguration, sparkContext);
updateLocalConfiguration(sparkContext, hadoopConfiguration);
// create a message-passing friendly rdd from the input rdd
boolean partitioned = false;
JavaPairRDD<Object, VertexWritable> loadedGraphRDD = inputRDD.readGraphRDD(graphComputerConfiguration, sparkContext);