
com.tinkerpop.gremlin.hadoop.process.computer.giraph.GiraphGraphComputer Maven / Gradle / Ivy
The newest version!
package com.tinkerpop.gremlin.hadoop.process.computer.giraph;
import com.tinkerpop.gremlin.hadoop.Constants;
import com.tinkerpop.gremlin.hadoop.process.computer.giraph.io.GiraphVertexInputFormat;
import com.tinkerpop.gremlin.hadoop.process.computer.giraph.io.GiraphVertexOutputFormat;
import com.tinkerpop.gremlin.hadoop.process.computer.util.MapReduceHelper;
import com.tinkerpop.gremlin.hadoop.process.computer.util.MemoryMapReduce;
import com.tinkerpop.gremlin.hadoop.structure.HadoopGraph;
import com.tinkerpop.gremlin.hadoop.structure.util.ConfUtil;
import com.tinkerpop.gremlin.hadoop.structure.util.HadoopHelper;
import com.tinkerpop.gremlin.process.computer.ComputerResult;
import com.tinkerpop.gremlin.process.computer.GraphComputer;
import com.tinkerpop.gremlin.process.computer.MapReduce;
import com.tinkerpop.gremlin.process.computer.VertexProgram;
import com.tinkerpop.gremlin.process.computer.util.ComputerDataStrategy;
import com.tinkerpop.gremlin.process.computer.util.GraphComputerHelper;
import com.tinkerpop.gremlin.process.computer.util.MapMemory;
import com.tinkerpop.gremlin.structure.Graph;
import com.tinkerpop.gremlin.structure.util.StringFactory;
import org.apache.commons.configuration.BaseConfiguration;
import org.apache.commons.configuration.Configuration;
import org.apache.commons.configuration.FileConfiguration;
import org.apache.commons.configuration.PropertiesConfiguration;
import org.apache.giraph.conf.GiraphConfiguration;
import org.apache.giraph.job.GiraphJob;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Future;
import java.util.stream.Stream;
/**
* @author Marko A. Rodriguez (http://markorodriguez.com)
*/
public class GiraphGraphComputer extends Configured implements GraphComputer, Tool {
public static final Logger LOGGER = LoggerFactory.getLogger(GiraphGraphComputer.class);
protected final HadoopGraph hadoopGraph;
protected GiraphConfiguration giraphConfiguration = new GiraphConfiguration();
private boolean executed = false;
private final Set mapReduces = new HashSet<>();
private VertexProgram vertexProgram;
private MapMemory memory = new MapMemory();
public GiraphGraphComputer(final HadoopGraph hadoopGraph) {
this.hadoopGraph = hadoopGraph;
final Configuration configuration = hadoopGraph.configuration();
configuration.getKeys().forEachRemaining(key -> this.giraphConfiguration.set(key, configuration.getProperty(key).toString()));
this.giraphConfiguration.setMasterComputeClass(GiraphMemory.class);
this.giraphConfiguration.setVertexClass(GiraphComputeVertex.class);
this.giraphConfiguration.setWorkerContextClass(GiraphWorkerContext.class);
this.giraphConfiguration.setOutEdgesClass(EmptyOutEdges.class);
this.giraphConfiguration.setClass("giraph.vertexIdClass", LongWritable.class, LongWritable.class);
this.giraphConfiguration.setClass("giraph.vertexValueClass", Text.class, Text.class);
this.giraphConfiguration.setVertexInputFormatClass(GiraphVertexInputFormat.class);
this.giraphConfiguration.setVertexOutputFormatClass(GiraphVertexOutputFormat.class);
}
@Override
public GraphComputer isolation(final Isolation isolation) {
if (!isolation.equals(Isolation.BSP))
throw GraphComputer.Exceptions.isolationNotSupported(isolation);
return this;
}
@Override
public GraphComputer program(final VertexProgram vertexProgram) {
this.vertexProgram = vertexProgram;
this.memory.addVertexProgramMemoryComputeKeys(this.vertexProgram);
final BaseConfiguration apacheConfiguration = new BaseConfiguration();
vertexProgram.storeState(apacheConfiguration);
ConfUtil.mergeApacheIntoHadoopConfiguration(apacheConfiguration, this.giraphConfiguration);
this.vertexProgram.getMessageCombiner().ifPresent(combiner -> this.giraphConfiguration.setCombinerClass(GiraphMessageCombiner.class));
return this;
}
@Override
public GraphComputer mapReduce(final MapReduce mapReduce) {
this.mapReduces.add(mapReduce);
return this;
}
public String toString() {
return StringFactory.graphComputerString(this);
}
@Override
public Future submit() {
if (this.executed)
throw Exceptions.computerHasAlreadyBeenSubmittedAVertexProgram();
else
this.executed = true;
// it is not possible execute a computer if it has no vertex program nor mapreducers
if (null == this.vertexProgram && this.mapReduces.isEmpty())
throw GraphComputer.Exceptions.computerHasNoVertexProgramNorMapReducers();
// it is possible to run mapreducers without a vertex program
if (null != this.vertexProgram)
GraphComputerHelper.validateProgramOnComputer(this, vertexProgram);
final long startTime = System.currentTimeMillis();
return CompletableFuture.supplyAsync(() -> {
try {
final FileSystem fs = FileSystem.get(this.giraphConfiguration);
this.loadJars(fs);
fs.delete(new Path(this.giraphConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION)), true);
ToolRunner.run(this, new String[]{});
// memory.keys().forEach(k -> LOGGER.error(k + "---" + memory.get(k)));
} catch (Exception e) {
//e.printStackTrace();
throw new IllegalStateException(e.getMessage(), e);
}
this.memory.setRuntime(System.currentTimeMillis() - startTime);
final Graph outputGraph = HadoopHelper.getOutputGraph(this.hadoopGraph);
return new ComputerResult(null == this.vertexProgram ? outputGraph : ComputerDataStrategy.wrapGraph(outputGraph, this.vertexProgram), this.memory.asImmutable());
});
}
@Override
public int run(final String[] args) {
try {
// it is possible to run graph computer without a vertex program (and thus, only map reduce jobs if they exist)
if (null != this.vertexProgram) {
final GiraphJob job = new GiraphJob(this.giraphConfiguration, Constants.GREMLIN_HADOOP_GIRAPH_JOB_PREFIX + this.vertexProgram);
final Path inputPath = new Path(this.giraphConfiguration.get(Constants.GREMLIN_HADOOP_INPUT_LOCATION));
if (!FileSystem.get(this.giraphConfiguration).exists(inputPath))
throw new IllegalArgumentException("The provided input path does not exist: " + inputPath);
FileInputFormat.setInputPaths(job.getInternalJob(), inputPath);
FileOutputFormat.setOutputPath(job.getInternalJob(), new Path(this.giraphConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION) + "/" + Constants.SYSTEM_G));
// job.getInternalJob().setJarByClass(GiraphGraphComputer.class);
LOGGER.info(Constants.GREMLIN_HADOOP_GIRAPH_JOB_PREFIX + this.vertexProgram);
if (!job.run(true)) {
throw new IllegalStateException("The GiraphGraphComputer job failed -- aborting all subsequent MapReduce jobs");
}
this.mapReduces.addAll(this.vertexProgram.getMapReducers());
// calculate main vertex program memory if desired (costs one mapreduce job)
if (this.giraphConfiguration.getBoolean(Constants.GREMLIN_HADOOP_DERIVE_MEMORY, false)) {
final Set memoryKeys = new HashSet(this.vertexProgram.getMemoryComputeKeys());
memoryKeys.add(Constants.SYSTEM_ITERATION);
this.giraphConfiguration.setStrings(Constants.GREMLIN_HADOOP_MEMORY_KEYS, (String[]) memoryKeys.toArray(new String[memoryKeys.size()]));
this.mapReduces.add(new MemoryMapReduce(memoryKeys));
}
}
// do map reduce jobs
for (final MapReduce mapReduce : this.mapReduces) {
this.memory.addMapReduceMemoryKey(mapReduce);
MapReduceHelper.executeMapReduceJob(mapReduce, this.memory, this.giraphConfiguration);
}
} catch (final Exception e) {
throw new IllegalStateException(e.getMessage(), e);
}
return 0;
}
private void loadJars(final FileSystem fs) {
final String hadoopGremlinLibsRemote = "hadoop-gremlin-libs";
if (this.giraphConfiguration.getBoolean(Constants.GREMLIN_HADOOP_JARS_IN_DISTRIBUTED_CACHE, true)) {
final String giraphGremlinLibsLocal = System.getenv(Constants.HADOOP_GREMLIN_LIBS);
if (null == giraphGremlinLibsLocal)
LOGGER.warn(Constants.HADOOP_GREMLIN_LIBS + " is not set -- proceeding regardless");
else {
final String[] paths = giraphGremlinLibsLocal.split(":");
for (final String path : paths) {
final File file = new File(path);
if (file.exists()) {
Stream.of(file.listFiles()).filter(f -> f.getName().endsWith(Constants.DOT_JAR)).forEach(f -> {
try {
final Path jarFile = new Path(fs.getHomeDirectory() + "/" + hadoopGremlinLibsRemote + "/" + f.getName());
fs.copyFromLocalFile(new Path(f.getPath()), jarFile);
try {
DistributedCache.addArchiveToClassPath(jarFile, this.giraphConfiguration, fs);
} catch (final Exception e) {
throw new RuntimeException(e.getMessage(), e);
}
} catch (Exception e) {
throw new IllegalStateException(e.getMessage(), e);
}
});
} else {
LOGGER.warn(path + " does not reference a valid directory -- proceeding regardless");
}
}
}
}
}
public static void main(final String[] args) throws Exception {
try {
final FileConfiguration configuration = new PropertiesConfiguration();
configuration.load(new File(args[0]));
final GiraphGraphComputer computer = new GiraphGraphComputer(HadoopGraph.open(configuration));
computer.program(VertexProgram.createVertexProgram(configuration)).submit().get();
} catch (Exception e) {
e.printStackTrace();
throw e;
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy