// eu.stratosphere.pact.runtime.task.DataSinkTask Maven / Gradle / Ivy
/***********************************************************************************************************************
* Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
**********************************************************************************************************************/
package eu.stratosphere.pact.runtime.task;
import java.io.IOException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import eu.stratosphere.api.common.io.FileOutputFormat;
import eu.stratosphere.api.common.io.FileOutputFormat.OutputDirectoryMode;
import eu.stratosphere.api.common.io.OutputFormat;
import eu.stratosphere.api.common.typeutils.TypeComparatorFactory;
import eu.stratosphere.api.common.typeutils.TypeSerializer;
import eu.stratosphere.api.common.typeutils.TypeSerializerFactory;
import eu.stratosphere.configuration.Configuration;
import eu.stratosphere.core.fs.FileSystem;
import eu.stratosphere.core.fs.FileSystem.WriteMode;
import eu.stratosphere.core.fs.Path;
import eu.stratosphere.core.io.IOReadableWritable;
import eu.stratosphere.nephele.execution.librarycache.LibraryCacheManager;
import eu.stratosphere.nephele.io.MutableReader;
import eu.stratosphere.nephele.io.MutableRecordReader;
import eu.stratosphere.nephele.io.MutableUnionRecordReader;
import eu.stratosphere.nephele.template.AbstractOutputTask;
import eu.stratosphere.pact.runtime.plugable.DeserializationDelegate;
import eu.stratosphere.pact.runtime.sort.UnilateralSortMerger;
import eu.stratosphere.pact.runtime.task.util.CloseableInputProvider;
import eu.stratosphere.pact.runtime.task.util.ReaderIterator;
import eu.stratosphere.pact.runtime.task.util.RecordReaderIterator;
import eu.stratosphere.pact.runtime.task.util.TaskConfig;
import eu.stratosphere.types.Record;
import eu.stratosphere.util.MutableObjectIterator;
/**
* DataSinkTask which is executed by a Nephele task manager.
* The task hands the data to an output format.
*
* @see OutputFormat
*/
public class DataSinkTask extends AbstractOutputTask {
public static final String DEGREE_OF_PARALLELISM_KEY = "sink.dop";
// Obtain DataSinkTask Logger
private static final Log LOG = LogFactory.getLog(DataSinkTask.class);
// --------------------------------------------------------------------------------------------
// OutputFormat instance. volatile, because the asynchronous canceller may access it
private volatile OutputFormat format;
// input reader
private MutableObjectIterator reader;
// input iterator
private MutableObjectIterator input;
// The serializer for the input type
private TypeSerializerFactory inputTypeSerializerFactory;
// local strategy
private CloseableInputProvider localStrategy;
// task configuration
private TaskConfig config;
// class loader for user code
private ClassLoader userCodeClassLoader;
// cancel flag
private volatile boolean taskCanceled;
@Override
public void registerInputOutput() {
if (LOG.isDebugEnabled()) {
LOG.debug(getLogString("Start registering input and output"));
}
// initialize OutputFormat
initOutputFormat();
// initialize input readers
try {
initInputReaders();
} catch (Exception e) {
throw new RuntimeException("Initializing the input streams failed" +
e.getMessage() == null ? "." : ": " + e.getMessage(), e);
}
if (LOG.isDebugEnabled()) {
LOG.debug(getLogString("Finished registering input and output"));
}
}
@Override
public void invoke() throws Exception
{
if (LOG.isDebugEnabled()) {
LOG.debug(getLogString("Starting data sink operator"));
}
try {
// initialize local strategies
switch (this.config.getInputLocalStrategy(0)) {
case NONE:
// nothing to do
localStrategy = null;
input = reader;
break;
case SORT:
// initialize sort local strategy
try {
// get type comparator
TypeComparatorFactory compFact = this.config.getInputComparator(0, this.userCodeClassLoader);
if (compFact == null) {
throw new Exception("Missing comparator factory for local strategy on input " + 0);
}
// initialize sorter
UnilateralSortMerger sorter = new UnilateralSortMerger(
getEnvironment().getMemoryManager(),
getEnvironment().getIOManager(),
this.reader, this, this.inputTypeSerializerFactory, compFact.createComparator(),
this.config.getMemoryInput(0), this.config.getFilehandlesInput(0),
this.config.getSpillingThresholdInput(0));
this.localStrategy = sorter;
this.input = sorter.getIterator();
} catch (Exception e) {
throw new RuntimeException("Initializing the input processing failed" +
e.getMessage() == null ? "." : ": " + e.getMessage(), e);
}
break;
default:
throw new RuntimeException("Invalid local strategy for DataSinkTask");
}
// read the reader and write it to the output
final TypeSerializer serializer = this.inputTypeSerializerFactory.getSerializer();
final MutableObjectIterator input = this.input;
final OutputFormat format = this.format;
IT record = serializer.createInstance();
// check if task has been canceled
if (this.taskCanceled) {
return;
}
if (LOG.isDebugEnabled()) {
LOG.debug(getLogString("Starting to produce output"));
}
// open
format.open(this.getEnvironment().getIndexInSubtaskGroup(), this.getEnvironment().getCurrentNumberOfSubtasks());
// work!
while (!this.taskCanceled && ((record = input.next(record)) != null)) {
format.writeRecord(record);
}
// close. We close here such that a regular close throwing an exception marks a task as failed.
if (!this.taskCanceled) {
this.format.close();
this.format = null;
}
}
catch (Exception ex) {
// drop, if the task was canceled
if (!this.taskCanceled) {
if (LOG.isErrorEnabled()) {
LOG.error(getLogString("Error in user code: " + ex.getMessage()), ex);
}
throw ex;
}
}
finally {
if (this.format != null) {
// close format, if it has not been closed, yet.
// This should only be the case if we had a previous error, or were canceled.
try {
this.format.close();
}
catch (Throwable t) {
if (LOG.isWarnEnabled()) {
LOG.warn(getLogString("Error closing the ouput format."), t);
}
}
}
// close local strategy if necessary
if (localStrategy != null) {
try {
this.localStrategy.close();
} catch (Throwable t) {
LOG.error("Error closing local strategy", t);
}
}
}
if (!this.taskCanceled) {
if (LOG.isDebugEnabled()) {
LOG.debug(getLogString("Finished data sink operator"));
}
}
else {
if (LOG.isDebugEnabled()) {
LOG.debug(getLogString("Data sink operator cancelled"));
}
}
}
@Override
public void cancel() throws Exception {
this.taskCanceled = true;
OutputFormat format = this.format;
if (format != null) {
try {
this.format.close();
} catch (Throwable t) {}
}
if (LOG.isDebugEnabled()) {
LOG.debug(getLogString("Cancelling data sink operator"));
}
}
/**
* Sets the class-loader to be used to load the user code.
*
* @param cl The class-loader to be used to load the user code.
*/
public void setUserCodeClassLoader(ClassLoader cl) {
this.userCodeClassLoader = cl;
}
/**
* Initializes the OutputFormat implementation and configuration.
*
* @throws RuntimeException
* Throws if instance of OutputFormat implementation can not be
* obtained.
*/
private void initOutputFormat() {
if (this.userCodeClassLoader == null) {
try {
this.userCodeClassLoader = LibraryCacheManager.getClassLoader(getEnvironment().getJobID());
} catch (IOException ioe) {
throw new RuntimeException("Library cache manager could not be instantiated.", ioe);
}
}
// obtain task configuration (including stub parameters)
Configuration taskConf = getTaskConfiguration();
taskConf.setClassLoader(this.userCodeClassLoader);
this.config = new TaskConfig(taskConf);
try {
this.format = config.>getStubWrapper(this.userCodeClassLoader).getUserCodeObject(OutputFormat.class, this.userCodeClassLoader);
// check if the class is a subclass, if the check is required
if (!OutputFormat.class.isAssignableFrom(this.format.getClass())) {
throw new RuntimeException("The class '" + this.format.getClass().getName() + "' is not a subclass of '" +
OutputFormat.class.getName() + "' as is required.");
}
}
catch (ClassCastException ccex) {
throw new RuntimeException("The stub class is not a proper subclass of " + OutputFormat.class.getName(), ccex);
}
// configure the stub. catch exceptions here extra, to report them as originating from the user code
try {
this.format.configure(this.config.getStubParameters());
}
catch (Throwable t) {
throw new RuntimeException("The user defined 'configure()' method in the Output Format caused an error: "
+ t.getMessage(), t);
}
}
/**
* Initializes the input readers of the DataSinkTask.
*
* @throws RuntimeException
* Thrown in case of invalid task input configuration.
*/
@SuppressWarnings("unchecked")
private void initInputReaders() throws Exception {
MutableReader> inputReader;
int numGates = 0;
// ---------------- create the input readers ---------------------
// in case where a logical input unions multiple physical inputs, create a union reader
final int groupSize = this.config.getGroupSize(0);
numGates += groupSize;
if (groupSize == 1) {
// non-union case
inputReader = new MutableRecordReader>(this);
} else if (groupSize > 1){
// union case
MutableRecordReader[] readers = new MutableRecordReader[groupSize];
for (int j = 0; j < groupSize; ++j) {
readers[j] = new MutableRecordReader(this);
}
inputReader = new MutableUnionRecordReader(readers);
} else {
throw new Exception("Illegal input group size in task configuration: " + groupSize);
}
this.inputTypeSerializerFactory = this.config.getInputSerializer(0, this.userCodeClassLoader);
if (this.inputTypeSerializerFactory.getDataType() == Record.class) {
// record specific deserialization
MutableReader reader = (MutableReader) inputReader;
this.reader = (MutableObjectIterator)new RecordReaderIterator(reader);
} else {
// generic data type serialization
MutableReader> reader = (MutableReader>) inputReader;
@SuppressWarnings({ "rawtypes" })
final MutableObjectIterator> iter = new ReaderIterator(reader, this.inputTypeSerializerFactory.getSerializer());
this.reader = (MutableObjectIterator)iter;
}
// final sanity check
if (numGates != this.config.getNumInputs()) {
throw new Exception("Illegal configuration: Number of input gates and group sizes are not consistent.");
}
}
// ------------------------------------------------------------------------
// Degree of parallelism & checks
// ------------------------------------------------------------------------
@Override
public int getMaximumNumberOfSubtasks() {
if (!(this.format instanceof FileOutputFormat>)) {
return -1;
}
final FileOutputFormat> fileOutputFormat = (FileOutputFormat>) this.format;
// ----------------- This code applies only to file inputs ------------------
final Path path = fileOutputFormat.getOutputFilePath();
final WriteMode writeMode = fileOutputFormat.getWriteMode();
final OutputDirectoryMode outDirMode = fileOutputFormat.getOutputDirectoryMode();
// Prepare output path and determine max DOP
try {
int dop = getTaskConfiguration().getInteger(DEGREE_OF_PARALLELISM_KEY, -1);
final FileSystem fs = path.getFileSystem();
if(dop == 1 && outDirMode == OutputDirectoryMode.PARONLY) {
// output is not written in parallel and should be written to a single file.
if(fs.isDistributedFS()) {
// prepare distributed output path
if(!fs.initOutPathDistFS(path, writeMode, false)) {
// output preparation failed! Cancel task.
throw new IOException("Output path could not be initialized.");
}
}
return 1;
} else {
// output should be written to a directory
if(fs.isDistributedFS()) {
// only distributed file systems can be initialized at start-up time.
if(!fs.initOutPathDistFS(path, writeMode, true)) {
throw new IOException("Output directory could not be created.");
}
}
return -1;
}
}
catch (IOException e) {
LOG.error("Could not access the file system to detemine the status of the output.", e);
throw new RuntimeException("I/O Error while accessing file", e);
}
}
// ------------------------------------------------------------------------
// Utilities
// ------------------------------------------------------------------------
/**
* Utility function that composes a string for logging purposes. The string includes the given message and
* the index of the task in its task group together with the number of tasks in the task group.
*
* @param message The main message for the log.
* @return The string ready for logging.
*/
private String getLogString(String message) {
return RegularPactTask.constructLogString(message, this.getEnvironment().getTaskName(), this);
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy