// org.apache.flink.runtime.operators.DataSourceTask Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.operators;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.functions.RuntimeContext;
import org.apache.flink.api.common.io.InputFormat;
import org.apache.flink.api.common.io.RichInputFormat;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.api.common.typeutils.TypeSerializerFactory;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.io.InputSplit;
import org.apache.flink.metrics.Counter;
import org.apache.flink.metrics.SimpleCounter;
import org.apache.flink.runtime.execution.CancelTaskException;
import org.apache.flink.runtime.execution.Environment;
import org.apache.flink.runtime.io.network.api.writer.RecordWriter;
import org.apache.flink.runtime.jobgraph.InputOutputFormatContainer;
import org.apache.flink.runtime.jobgraph.OperatorID;
import org.apache.flink.runtime.jobgraph.tasks.AbstractInvokable;
import org.apache.flink.runtime.jobgraph.tasks.InputSplitProvider;
import org.apache.flink.runtime.jobgraph.tasks.InputSplitProviderException;
import org.apache.flink.runtime.metrics.groups.InternalOperatorIOMetricGroup;
import org.apache.flink.runtime.metrics.groups.InternalOperatorMetricGroup;
import org.apache.flink.runtime.operators.chaining.ChainedDriver;
import org.apache.flink.runtime.operators.chaining.ExceptionInChainedStubException;
import org.apache.flink.runtime.operators.util.DistributedRuntimeUDFContext;
import org.apache.flink.runtime.operators.util.TaskConfig;
import org.apache.flink.runtime.operators.util.metrics.CountingCollector;
import org.apache.flink.util.Collector;
import org.apache.flink.util.UserCodeClassLoader;
import org.apache.commons.lang3.tuple.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
/**
* DataSourceTask which is executed by a task manager. The task reads data and uses an {@link
* InputFormat} to create records from the input.
*
* @see org.apache.flink.api.common.io.InputFormat
*/
public class DataSourceTask extends AbstractInvokable {
private static final Logger LOG = LoggerFactory.getLogger(DataSourceTask.class);
private List> eventualOutputs;
// Output collector
private Collector output;
// InputFormat instance
private InputFormat format;
// type serializer for the input
private TypeSerializerFactory serializerFactory;
// Task configuration
private TaskConfig config;
// tasks chained to this data source
private ArrayList> chainedTasks;
// cancel flag
private volatile boolean taskCanceled = false;
/**
* Create an Invokable task and set its environment.
*
* @param environment The environment assigned to this invokable.
*/
public DataSourceTask(Environment environment) {
super(environment);
}
@Override
public void invoke() throws Exception {
// --------------------------------------------------------------------
// Initialize
// --------------------------------------------------------------------
initInputFormat();
LOG.debug(getLogString("Start registering input and output"));
try {
initOutputs(getEnvironment().getUserCodeClassLoader());
} catch (Exception ex) {
throw new RuntimeException(
"The initialization of the DataSource's outputs caused an error: "
+ ex.getMessage(),
ex);
}
LOG.debug(getLogString("Finished registering input and output"));
// --------------------------------------------------------------------
// Invoke
// --------------------------------------------------------------------
LOG.debug(getLogString("Starting data source operator"));
RuntimeContext ctx = createRuntimeContext();
final Counter numRecordsOut;
{
Counter tmpNumRecordsOut;
try {
InternalOperatorIOMetricGroup ioMetricGroup =
((InternalOperatorMetricGroup) ctx.getMetricGroup()).getIOMetricGroup();
ioMetricGroup.reuseInputMetricsForTask();
if (this.config.getNumberOfChainedStubs() == 0) {
ioMetricGroup.reuseOutputMetricsForTask();
}
tmpNumRecordsOut = ioMetricGroup.getNumRecordsOutCounter();
} catch (Exception e) {
LOG.warn("An exception occurred during the metrics setup.", e);
tmpNumRecordsOut = new SimpleCounter();
}
numRecordsOut = tmpNumRecordsOut;
}
Counter completedSplitsCounter = ctx.getMetricGroup().counter("numSplitsProcessed");
if (RichInputFormat.class.isAssignableFrom(this.format.getClass())) {
((RichInputFormat) this.format).setRuntimeContext(ctx);
LOG.debug(getLogString("Rich Source detected. Initializing runtime context."));
((RichInputFormat) this.format).openInputFormat();
LOG.debug(getLogString("Rich Source detected. Opening the InputFormat."));
}
ExecutionConfig executionConfig = getExecutionConfig();
boolean objectReuseEnabled = executionConfig.isObjectReuseEnabled();
LOG.debug(
"DataSourceTask object reuse: "
+ (objectReuseEnabled ? "ENABLED" : "DISABLED")
+ ".");
final TypeSerializer serializer = this.serializerFactory.getSerializer();
try {
// start all chained tasks
BatchTask.openChainedTasks(this.chainedTasks, this);
// get input splits to read
final Iterator splitIterator = getInputSplits();
// for each assigned input split
while (!this.taskCanceled && splitIterator.hasNext()) {
// get start and end
final InputSplit split = splitIterator.next();
LOG.debug(getLogString("Opening input split " + split.toString()));
final InputFormat format = this.format;
// open input format
format.open(split);
LOG.debug(getLogString("Starting to read input from split " + split.toString()));
try {
final Collector output =
new CountingCollector<>(this.output, numRecordsOut);
if (objectReuseEnabled) {
OT reuse = serializer.createInstance();
// as long as there is data to read
while (!this.taskCanceled && !format.reachedEnd()) {
OT returned;
if ((returned = format.nextRecord(reuse)) != null) {
output.collect(returned);
}
}
} else {
// as long as there is data to read
while (!this.taskCanceled && !format.reachedEnd()) {
OT returned;
if ((returned = format.nextRecord(serializer.createInstance()))
!= null) {
output.collect(returned);
}
}
}
if (LOG.isDebugEnabled() && !this.taskCanceled) {
LOG.debug(getLogString("Closing input split " + split.toString()));
}
} finally {
// close. We close here such that a regular close throwing an exception marks a
// task as failed.
format.close();
}
completedSplitsCounter.inc();
} // end for all input splits
// close all chained tasks letting them report failure
BatchTask.closeChainedTasks(this.chainedTasks, this);
// close the output collector
this.output.close();
} catch (Exception ex) {
// close the input, but do not report any exceptions, since we already have another root
// cause
try {
this.format.close();
} catch (Throwable ignored) {
}
BatchTask.cancelChainedTasks(this.chainedTasks);
ex = ExceptionInChainedStubException.exceptionUnwrap(ex);
if (ex instanceof CancelTaskException) {
// forward canceling exception
throw ex;
} else if (!this.taskCanceled) {
// drop exception, if the task was canceled
BatchTask.logAndThrowException(ex, this);
}
} finally {
BatchTask.clearWriters(eventualOutputs);
// --------------------------------------------------------------------
// Closing
// --------------------------------------------------------------------
if (this.format != null
&& RichInputFormat.class.isAssignableFrom(this.format.getClass())) {
((RichInputFormat) this.format).closeInputFormat();
LOG.debug(getLogString("Rich Source detected. Closing the InputFormat."));
}
}
if (!this.taskCanceled) {
LOG.debug(getLogString("Finished data source operator"));
} else {
LOG.debug(getLogString("Data source operator cancelled"));
}
}
@Override
public void cancel() throws Exception {
this.taskCanceled = true;
LOG.debug(getLogString("Cancelling data source operator"));
}
/**
* Initializes the InputFormat implementation and configuration.
*
* @throws RuntimeException Throws if instance of InputFormat implementation can not be
* obtained.
*/
private void initInputFormat() {
ClassLoader userCodeClassLoader = getUserCodeClassLoader();
// obtain task configuration (including stub parameters)
Configuration taskConf = getTaskConfiguration();
this.config = new TaskConfig(taskConf);
final Pair> operatorIdAndInputFormat;
InputOutputFormatContainer formatContainer =
new InputOutputFormatContainer(config, userCodeClassLoader);
try {
operatorIdAndInputFormat = formatContainer.getUniqueInputFormat();
this.format = operatorIdAndInputFormat.getValue();
// check if the class is a subclass, if the check is required
if (!InputFormat.class.isAssignableFrom(this.format.getClass())) {
throw new RuntimeException(
"The class '"
+ this.format.getClass().getName()
+ "' is not a subclass of '"
+ InputFormat.class.getName()
+ "' as is required.");
}
} catch (ClassCastException ccex) {
throw new RuntimeException(
"The stub class is not a proper subclass of " + InputFormat.class.getName(),
ccex);
}
Thread thread = Thread.currentThread();
ClassLoader original = thread.getContextClassLoader();
// configure the stub. catch exceptions here extra, to report them as originating from the
// user code
try {
thread.setContextClassLoader(userCodeClassLoader);
this.format.configure(formatContainer.getParameters(operatorIdAndInputFormat.getKey()));
} catch (Throwable t) {
throw new RuntimeException(
"The user defined 'configure()' method caused an error: " + t.getMessage(), t);
} finally {
thread.setContextClassLoader(original);
}
// get the factory for the type serializer
this.serializerFactory = this.config.getOutputSerializer(userCodeClassLoader);
}
/**
* Creates a writer for each output. Creates an OutputCollector which forwards its input to all
* writers. The output collector applies the configured shipping strategy.
*/
private void initOutputs(UserCodeClassLoader cl) throws Exception {
this.chainedTasks = new ArrayList>();
this.eventualOutputs = new ArrayList>();
this.output =
BatchTask.initOutputs(
this,
cl,
this.config,
this.chainedTasks,
this.eventualOutputs,
getExecutionConfig(),
getEnvironment().getAccumulatorRegistry().getUserMap());
}
// ------------------------------------------------------------------------
// Utilities
// ------------------------------------------------------------------------
/**
* Utility function that composes a string for logging purposes. The string includes the given
* message and the index of the task in its task group together with the number of tasks in the
* task group.
*
* @param message The main message for the log.
* @return The string ready for logging.
*/
private String getLogString(String message) {
return getLogString(message, this.getEnvironment().getTaskInfo().getTaskName());
}
/**
* Utility function that composes a string for logging purposes. The string includes the given
* message and the index of the task in its task group together with the number of tasks in the
* task group.
*
* @param message The main message for the log.
* @param taskName The name of the task.
* @return The string ready for logging.
*/
private String getLogString(String message, String taskName) {
return BatchTask.constructLogString(message, taskName, this);
}
private Iterator getInputSplits() {
final InputSplitProvider provider = getEnvironment().getInputSplitProvider();
return new Iterator() {
private InputSplit nextSplit;
private boolean exhausted;
@Override
public boolean hasNext() {
if (exhausted) {
return false;
}
if (nextSplit != null) {
return true;
}
final InputSplit split;
try {
split = provider.getNextInputSplit(getUserCodeClassLoader());
} catch (InputSplitProviderException e) {
throw new RuntimeException("Could not retrieve next input split.", e);
}
if (split != null) {
this.nextSplit = split;
return true;
} else {
exhausted = true;
return false;
}
}
@Override
public InputSplit next() {
if (this.nextSplit == null && !hasNext()) {
throw new NoSuchElementException();
}
final InputSplit tmp = this.nextSplit;
this.nextSplit = null;
return tmp;
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
};
}
public DistributedRuntimeUDFContext createRuntimeContext() {
Environment env = getEnvironment();
String sourceName = getEnvironment().getTaskInfo().getTaskName().split("->")[0].trim();
sourceName = sourceName.startsWith("CHAIN") ? sourceName.substring(6) : sourceName;
return new DistributedRuntimeUDFContext(
env.getJobInfo(),
env.getTaskInfo(),
env.getUserCodeClassLoader(),
getExecutionConfig(),
env.getDistributedCacheEntries(),
env.getAccumulatorRegistry().getUserMap(),
getEnvironment().getMetricGroup().getOrAddOperator(sourceName),
env.getExternalResourceInfoProvider());
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy