/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.crunch.impl.spark;

import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import org.apache.crunch.CachingOptions;
import org.apache.crunch.PCollection;
import org.apache.crunch.PipelineExecution;
import org.apache.crunch.PipelineResult;
import org.apache.crunch.impl.dist.DistributedPipeline;
import org.apache.crunch.impl.dist.collect.PCollectionImpl;
import org.apache.crunch.impl.spark.collect.SparkCollectFactory;
import org.apache.crunch.io.ReadableSource;
import org.apache.crunch.materialize.MaterializableIterable;
import org.apache.hadoop.conf.Configuration;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.storage.StorageLevel;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Map;

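/**
 * A Crunch {@code Pipeline} implementation that executes its jobs on Apache
 * Spark.
 *
 * <p>A minimal usage sketch (the paths and app name here are illustrative):
 * <pre>{@code
 * SparkPipeline pipeline = new SparkPipeline("local", "word-count");
 * PCollection<String> lines = pipeline.readTextFile("/path/to/input");
 * pipeline.writeTextFile(lines, "/path/to/output");
 * pipeline.done(); // runs the jobs and stops the JavaSparkContext
 * }</pre>
 */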
public class SparkPipeline extends DistributedPipeline {

  private static final Logger LOG = LoggerFactory.getLogger(SparkPipeline.class);

  private final String sparkConnect;
  private JavaSparkContext sparkContext;
  private final Map<PCollection<?>, StorageLevel> cachedCollections = Maps.newHashMap();

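  /**
   * Creates a pipeline that will lazily construct its own
   * {@code JavaSparkContext} from the given Spark master string
   * (e.g. {@code "local"} or a {@code spark://} URL).
   */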
  public SparkPipeline(String sparkConnect, String appName) {
    super(appName, new Configuration(), new SparkCollectFactory());
    this.sparkConnect = Preconditions.checkNotNull(sparkConnect);
  }

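  /**
   * Creates a pipeline that runs against an existing {@code JavaSparkContext}
   * supplied by the caller. Note that {@link #done()} will stop the supplied
   * context.
   */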
  public SparkPipeline(JavaSparkContext sparkContext, String appName) {
    super(appName, new Configuration(), new SparkCollectFactory());
    this.sparkContext = Preconditions.checkNotNull(sparkContext);
    this.sparkConnect = sparkContext.getSparkHome().orNull();
  }

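  /**
   * Returns a lazy {@code Iterable} view over the collection's contents. The
   * collection is registered to be written to a readable target the next time
   * the pipeline runs.
   */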
  @Override
  public <T> Iterable<T> materialize(PCollection<T> pcollection) {
    ReadableSource<T> readableSrc = getMaterializeSourceTarget(pcollection);
    MaterializableIterable<T> c = new MaterializableIterable<T>(this, readableSrc);
    if (!outputTargetsToMaterialize.containsKey(pcollection)) {
      outputTargetsToMaterialize.put((PCollectionImpl) pcollection, c);
    }
    return c;
  }

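  /**
   * Marks the collection to be cached by Spark using the storage level derived
   * from the given {@code CachingOptions}.
   */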
  @Override
  public <T> void cache(PCollection<T> pcollection, CachingOptions options) {
    cachedCollections.put(pcollection, toStorageLevel(options));
  }

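  // Translates Crunch's CachingOptions flags into a Spark StorageLevel; for
  // example, useDisk + useMemory + deserialized with a single replica is
  // equivalent to StorageLevel.MEMORY_AND_DISK.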
  private StorageLevel toStorageLevel(CachingOptions options) {
    return StorageLevel.apply(
        options.useDisk(),
        options.useMemory(),
        options.deserialized(),
        options.replicas());
  }

  @Override
  public PipelineResult run() {
    try {
      PipelineExecution exec = runAsync();
      exec.waitUntilDone();
      return exec.getResult();
    } catch (Exception e) {
      // TODO: How to handle this without changing the signature?
      LOG.error("Exception running pipeline", e);
      return PipelineResult.EMPTY;
    }
  }

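  /**
   * Submits the pipeline's jobs to Spark without blocking. Collections marked
   * for materialization are handed off to the {@code SparkRuntime}, and a
   * {@code JavaSparkContext} is created lazily from the connect string if one
   * was not supplied.
   */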
  @Override
  public PipelineExecution runAsync() {
    Map<PCollectionImpl<?>, MaterializableIterable> toMaterialize = Maps.newHashMap();
    for (PCollectionImpl<?> c : outputTargets.keySet()) {
      if (outputTargetsToMaterialize.containsKey(c)) {
        toMaterialize.put(c, outputTargetsToMaterialize.get(c));
        outputTargetsToMaterialize.remove(c);
      }
    }
    if (sparkContext == null) {
      this.sparkContext = new JavaSparkContext(sparkConnect, getName());
    }
    SparkRuntime runtime = new SparkRuntime(this, sparkContext, getConfiguration(), outputTargets, toMaterialize,
        cachedCollections);
    runtime.execute();
    outputTargets.clear();
    return runtime;
  }

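  /**
   * Completes any remaining work and then stops the underlying
   * {@code JavaSparkContext}, after which the pipeline cannot be reused.
   */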
  @Override
  public PipelineResult done() {
    PipelineResult res = super.done();
    if (sparkContext != null) {
      sparkContext.stop();
      sparkContext = null;
    }
    return res;
  }
}