/*
* Copyright 2015 data Artisans GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.dataartisans.flink.cascading.planner;
import cascading.flow.planner.FlowStepJob;
import cascading.management.state.ClientState;
import cascading.stats.FlowNodeStats;
import cascading.stats.FlowStepStats;
import com.dataartisans.flink.cascading.runtime.stats.AccumulatorCache;
import com.dataartisans.flink.cascading.util.FlinkConfigConstants;
import org.apache.flink.api.common.ExecutionMode;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.JobSubmissionResult;
import org.apache.flink.api.common.Plan;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.LocalEnvironment;
import org.apache.flink.client.program.Client;
import org.apache.flink.client.program.ContextEnvironment;
import org.apache.flink.client.program.JobWithJars;
import org.apache.flink.client.program.OptimizerPlanEnvironment;
import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.core.fs.Path;
import org.apache.flink.optimizer.DataStatistics;
import org.apache.flink.optimizer.Optimizer;
import org.apache.flink.optimizer.plan.OptimizedPlan;
import org.apache.flink.optimizer.plantranslate.JobGraphGenerator;
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.minicluster.FlinkMiniCluster;
import org.apache.flink.runtime.minicluster.LocalFlinkMiniCluster;
import org.apache.hadoop.conf.Configuration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.concurrent.duration.FiniteDuration;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
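/**
 * FlinkFlowStepJob submits a single Cascading flow step as a Flink job and tracks its execution.
 * The Cascading-built plan is compiled by Flink's optimizer into a JobGraph, submitted either to a
 * shared local mini cluster or to a remote cluster, and monitored through an accumulator-based
 * stats cache.
 */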
public class FlinkFlowStepJob extends FlowStepJob
{
private static final Logger LOG = LoggerFactory.getLogger( FlinkFlowStepJob.class );
private final Configuration currentConf;
private Client client;
private JobID jobID;
private Throwable jobException;
private List<String> classPath;
private final ExecutionEnvironment env;
private AccumulatorCache accumulatorCache;
private Future<JobSubmissionResult> jobSubmission;
private ExecutorService executorService = Executors.newFixedThreadPool(1);
private static final int accumulatorUpdateIntervalSecs = 10;
private static volatile FlinkMiniCluster localCluster;
private static volatile int localClusterUsers;
private static final Object lock = new Object();
private static final FiniteDuration DEFAULT_TIMEOUT = new FiniteDuration(60, TimeUnit.SECONDS);
public FlinkFlowStepJob( ClientState clientState, FlinkFlowStep flowStep, Configuration currentConf, List<String> classPath ) {
super(clientState, currentConf, flowStep, 1000, 60000, 60000);
this.currentConf = currentConf;
this.env = ((FlinkFlowStep)this.flowStep).getExecutionEnvironment();
this.classPath = classPath;
if( flowStep.isDebugEnabled() ) {
flowStep.logDebug("using polling interval: " + pollingInterval);
}
}
@Override
public Configuration getConfig() {
return currentConf;
}
@Override
protected FlowStepStats createStepStats(ClientState clientState) {
this.accumulatorCache = new AccumulatorCache(accumulatorUpdateIntervalSecs);
return new FlinkFlowStepStats(this.flowStep, clientState, accumulatorCache);
}
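// Cancels the running Flink job when the Cascading flow is stopped.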
protected void internalBlockOnStop() throws IOException {
if (jobSubmission != null && !jobSubmission.isDone()) {
try {
client.cancel(jobID);
} catch (Exception e) {
throw new IOException("An exception occurred while stopping the Flink job with ID: " + jobID + ": " + e.getMessage());
}
}
}
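// Translates the Cascading flow step into a Flink JobGraph and submits it asynchronously,
// either to a shared local mini cluster or to the cluster of the surrounding context environment.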
protected void internalNonBlockingStart() throws IOException {
Plan plan = env.createProgramPlan();
// set exchange mode, BATCH is default
String execMode = getConfig().get(FlinkConfigConstants.EXECUTION_MODE);
if (execMode == null || FlinkConfigConstants.EXECUTION_MODE_BATCH.equals(execMode)) {
env.getConfig().setExecutionMode(ExecutionMode.BATCH);
}
else if (FlinkConfigConstants.EXECUTION_MODE_PIPELINED.equals(execMode)) {
env.getConfig().setExecutionMode(ExecutionMode.PIPELINED);
}
else {
LOG.warn("Unknow value for '" + FlinkConfigConstants.EXECUTION_MODE + "' parameter. " +
"Only '" + FlinkConfigConstants.EXECUTION_MODE_BATCH + "' " +
"or '" + FlinkConfigConstants.EXECUTION_MODE_PIPELINED + "' supported. " +
"Using " + FlinkConfigConstants.EXECUTION_MODE_BATCH + " exchange by default.");
env.getConfig().setExecutionMode(ExecutionMode.BATCH);
}
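// compile the plan with Flink's optimizer and translate it into a JobGraph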
Optimizer optimizer = new Optimizer(new DataStatistics(), new org.apache.flink.configuration.Configuration());
OptimizedPlan optimizedPlan = optimizer.compile(plan);
final JobGraph jobGraph = new JobGraphGenerator().compileJobGraph(optimizedPlan);
for (String jarPath : classPath) {
jobGraph.addJar(new Path(jarPath));
}
jobID = jobGraph.getJobID();
accumulatorCache.setJobID(jobID);
if (isLocalExecution()) {
flowStep.logInfo("Executing in local mode.");
startLocalCluster();
org.apache.flink.configuration.Configuration config = new org.apache.flink.configuration.Configuration();
config.setString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, localCluster.hostname());
client = new Client(config);
client.setPrintStatusDuringExecution(env.getConfig().isSysoutLoggingEnabled());
} else if (isRemoteExecution()) {
flowStep.logInfo("Executing in cluster mode.");
try {
String path = this.getClass().getProtectionDomain().getCodeSource().getLocation().toURI().getPath();
jobGraph.addJar(new Path(path));
classPath.add(path);
} catch (URISyntaxException e) {
throw new IOException("Could not add the submission JAR as a dependency.");
}
client = ((ContextEnvironment) env).getClient();
}
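// build a user-code class loader that contains all JARs on the class path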
List<URL> fileList = new ArrayList<URL>(classPath.size());
for (String path : classPath) {
URL url;
try {
url = new URL(path);
} catch (MalformedURLException e) {
url = new URL("file://" + path);
}
fileList.add(url);
}
final ClassLoader loader =
JobWithJars.buildUserCodeClassLoader(fileList, Collections.<URL>emptyList(), getClass().getClassLoader());
accumulatorCache.setClient(client);
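// submit the job asynchronously; runBlocking() returns once the job has finished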
final Callable<JobSubmissionResult> callable = new Callable<JobSubmissionResult>() {
@Override
public JobSubmissionResult call() throws Exception {
return client.runBlocking(jobGraph, loader);
}
};
jobSubmission = executorService.submit(callable);
flowStep.logInfo("submitted Flink job: " + jobID);
}
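// Propagates the current job state to the Cascading node statistics.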
@Override
protected void updateNodeStatus( FlowNodeStats flowNodeStats ) {
try {
if (internalNonBlockingIsComplete() && internalNonBlockingIsSuccessful()) {
flowNodeStats.markSuccessful();
} else if(internalIsStartedRunning()) {
flowNodeStats.markRunning();
} else {
flowNodeStats.markFailed(jobException);
}
} catch (IOException e) {
flowStep.logError("Failed to update node status.");
}
}
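// Checks without blocking whether the job finished successfully. Once the job is done,
// the accumulator cache is flushed and the local cluster (if any) is released.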
protected boolean internalNonBlockingIsSuccessful() throws IOException {
try {
jobSubmission.get(0, TimeUnit.MILLISECONDS);
} catch (InterruptedException e) {
return false;
} catch (ExecutionException e) {
jobException = e.getCause();
return false;
} catch (TimeoutException e) {
return false;
}
boolean isDone = jobSubmission.isDone();
if (isDone) {
accumulatorCache.update(true);
accumulatorCache.setJobID(null);
accumulatorCache.setClient(null);
stopCluster();
}
return isDone;
}
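// For an OptimizerPlanEnvironment the plan is only translated to fetch the
// execution plan; the job itself must not be executed.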
@Override
public Throwable call()
{
if (env instanceof OptimizerPlanEnvironment) {
// We have an OptimizerPlanEnvironment.
// This environment is only used to fetch the Flink execution plan.
try {
// OptimizerPlanEnvironment does not execute the job but only builds the execution plan.
env.execute("plan generation");
}
// execute() throws a ProgramAbortException if everything goes well
catch(OptimizerPlanEnvironment.ProgramAbortException pae) {
// Forward call() to get Cascading's internal job stats right.
// The job will be skipped due to the overridden isSkipFlowStep method.
super.call();
// forward expected ProgramAbortException
return pae;
}
catch(Exception e) {
// forward unexpected exception
return e;
}
}
// forward to call() if we have a regular ExecutionEnvironment
return super.call();
}
protected boolean isSkipFlowStep() throws IOException
{
if (env instanceof OptimizerPlanEnvironment) {
// We have an OptimizerPlanEnvironment.
// This environment is only used to fetch the Flink execution plan.
// We do not want to execute the job in this case.
return true;
} else {
return super.isSkipFlowStep();
}
}
@Override
protected boolean isRemoteExecution() {
return env instanceof ContextEnvironment;
}
@Override
protected Throwable getThrowable() {
return jobException;
}
protected String internalJobId() {
return jobID.toString();
}
protected boolean internalNonBlockingIsComplete() throws IOException {
return jobSubmission.isDone();
}
protected void dumpDebugInfo() {
}
protected boolean internalIsStartedRunning() {
return jobSubmission != null;
}
private boolean isLocalExecution() {
return env instanceof LocalEnvironment;
}
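// Starts the shared local mini cluster on first use; subsequent steps reuse it
// and are tracked with a reference count.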
private void startLocalCluster() {
synchronized (lock) {
if (localCluster == null) {
org.apache.flink.configuration.Configuration configuration = new org.apache.flink.configuration.Configuration();
configuration.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, env.getParallelism() * 2);
localCluster = new LocalFlinkMiniCluster(configuration, false);
localCluster.start();
}
localClusterUsers++;
}
}
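// Releases the shared local cluster and shuts it down once the last user is done.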
private void stopCluster() {
synchronized (lock) {
if (localCluster != null) {
if (--localClusterUsers <= 0) {
localCluster.shutdown();
localCluster.awaitTermination();
localCluster = null;
localClusterUsers = 0;
}
}
if (executorService != null) {
executorService.shutdown();
}
}
}
}