All Downloads are FREE. Search and download functionalities are using the official Maven repository.

eu.stratosphere.pact.runtime.task.DataSinkTask Maven / Gradle / Ivy

/***********************************************************************************************************************
 * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 **********************************************************************************************************************/

package eu.stratosphere.pact.runtime.task;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import eu.stratosphere.api.common.io.FileOutputFormat;
import eu.stratosphere.api.common.io.FileOutputFormat.OutputDirectoryMode;
import eu.stratosphere.api.common.io.OutputFormat;
import eu.stratosphere.api.common.typeutils.TypeComparatorFactory;
import eu.stratosphere.api.common.typeutils.TypeSerializer;
import eu.stratosphere.api.common.typeutils.TypeSerializerFactory;
import eu.stratosphere.configuration.Configuration;
import eu.stratosphere.core.fs.FileSystem;
import eu.stratosphere.core.fs.FileSystem.WriteMode;
import eu.stratosphere.core.fs.Path;
import eu.stratosphere.core.io.IOReadableWritable;
import eu.stratosphere.nephele.execution.librarycache.LibraryCacheManager;
import eu.stratosphere.nephele.io.MutableReader;
import eu.stratosphere.nephele.io.MutableRecordReader;
import eu.stratosphere.nephele.io.MutableUnionRecordReader;
import eu.stratosphere.nephele.template.AbstractOutputTask;
import eu.stratosphere.pact.runtime.plugable.DeserializationDelegate;
import eu.stratosphere.pact.runtime.sort.UnilateralSortMerger;
import eu.stratosphere.pact.runtime.task.util.CloseableInputProvider;
import eu.stratosphere.pact.runtime.task.util.ReaderIterator;
import eu.stratosphere.pact.runtime.task.util.RecordReaderIterator;
import eu.stratosphere.pact.runtime.task.util.TaskConfig;
import eu.stratosphere.types.Record;
import eu.stratosphere.util.MutableObjectIterator;

/**
 * DataSinkTask which is executed by a Nephele task manager.
 * The task hands the data to an output format.
 * 
 * @see OutputFormat
 */
public class DataSinkTask extends AbstractOutputTask {
	
	public static final String DEGREE_OF_PARALLELISM_KEY = "sink.dop";
	
	// Obtain DataSinkTask Logger
	private static final Log LOG = LogFactory.getLog(DataSinkTask.class);

	// --------------------------------------------------------------------------------------------
	
	// OutputFormat instance. volatile, because the asynchronous canceller may access it
	private volatile OutputFormat format;
	
	// input reader
	private MutableObjectIterator reader;
	
	// input iterator
	private MutableObjectIterator input;
	
	// The serializer for the input type
	private TypeSerializerFactory inputTypeSerializerFactory;
	
	// local strategy
	private CloseableInputProvider localStrategy;

	// task configuration
	private TaskConfig config;
	
	// class loader for user code
	private ClassLoader userCodeClassLoader;

	// cancel flag
	private volatile boolean taskCanceled;
	

	@Override
	public void registerInputOutput() {
		if (LOG.isDebugEnabled()) {
			LOG.debug(getLogString("Start registering input and output"));
		}

		// initialize OutputFormat
		initOutputFormat();
		
		// initialize input readers
		try {
			initInputReaders();
		} catch (Exception e) {
			throw new RuntimeException("Initializing the input streams failed" +
				e.getMessage() == null ? "." : ": " + e.getMessage(), e);
		}

		if (LOG.isDebugEnabled()) {
			LOG.debug(getLogString("Finished registering input and output"));
		}
	}


	@Override
	public void invoke() throws Exception
	{
		if (LOG.isDebugEnabled()) {
			LOG.debug(getLogString("Starting data sink operator"));
		}
		
		try {
			
			// initialize local strategies
			switch (this.config.getInputLocalStrategy(0)) {
			case NONE:
				// nothing to do
				localStrategy = null;
				input = reader;
				break;
			case SORT:
				// initialize sort local strategy
				try {
					// get type comparator
					TypeComparatorFactory compFact = this.config.getInputComparator(0, this.userCodeClassLoader);
					if (compFact == null) {
						throw new Exception("Missing comparator factory for local strategy on input " + 0);
					}
					
					// initialize sorter
					UnilateralSortMerger sorter = new UnilateralSortMerger(
							getEnvironment().getMemoryManager(), 
							getEnvironment().getIOManager(),
							this.reader, this, this.inputTypeSerializerFactory, compFact.createComparator(),
							this.config.getMemoryInput(0), this.config.getFilehandlesInput(0),
							this.config.getSpillingThresholdInput(0));
					
					this.localStrategy = sorter;
					this.input = sorter.getIterator();
				} catch (Exception e) {
					throw new RuntimeException("Initializing the input processing failed" +
						e.getMessage() == null ? "." : ": " + e.getMessage(), e);
				}
				break;
			default:
				throw new RuntimeException("Invalid local strategy for DataSinkTask");
			}
			
			// read the reader and write it to the output
			
			final TypeSerializer serializer = this.inputTypeSerializerFactory.getSerializer();
			final MutableObjectIterator input = this.input;
			final OutputFormat format = this.format;
			
			
			IT record = serializer.createInstance();
			
			// check if task has been canceled
			if (this.taskCanceled) {
				return;
			}

			if (LOG.isDebugEnabled()) {
				LOG.debug(getLogString("Starting to produce output"));
			}

			// open
			format.open(this.getEnvironment().getIndexInSubtaskGroup(), this.getEnvironment().getCurrentNumberOfSubtasks());

			// work!
			while (!this.taskCanceled && ((record = input.next(record)) != null)) {
				format.writeRecord(record);
			}
			
			// close. We close here such that a regular close throwing an exception marks a task as failed.
			if (!this.taskCanceled) {
				this.format.close();
				this.format = null;
			}
		}
		catch (Exception ex) {
			// drop, if the task was canceled
			if (!this.taskCanceled) {
				if (LOG.isErrorEnabled()) {
					LOG.error(getLogString("Error in user code: " + ex.getMessage()), ex);
				}
				throw ex;
			}
		}
		finally {
			if (this.format != null) {
				// close format, if it has not been closed, yet.
				// This should only be the case if we had a previous error, or were canceled.
				try {
					this.format.close();
				}
				catch (Throwable t) {
					if (LOG.isWarnEnabled()) {
						LOG.warn(getLogString("Error closing the ouput format."), t);
					}
				}
			}
			// close local strategy if necessary
			if (localStrategy != null) {
				try {
					this.localStrategy.close();
				} catch (Throwable t) {
					LOG.error("Error closing local strategy", t);
				}
			}
		}

		if (!this.taskCanceled) {
			if (LOG.isDebugEnabled()) {
				LOG.debug(getLogString("Finished data sink operator"));
			}
		}
		else {
			if (LOG.isDebugEnabled()) {
				LOG.debug(getLogString("Data sink operator cancelled"));
			}
		}
	}

	@Override
	public void cancel() throws Exception {
		this.taskCanceled = true;
		OutputFormat format = this.format;
		if (format != null) {
			try {
				this.format.close();
			} catch (Throwable t) {}
		}
		
		if (LOG.isDebugEnabled()) {
			LOG.debug(getLogString("Cancelling data sink operator"));
		}
	}
	
	/**
	 * Sets the class-loader to be used to load the user code.
	 * 
	 * @param cl The class-loader to be used to load the user code.
	 */
	public void setUserCodeClassLoader(ClassLoader cl) {
		this.userCodeClassLoader = cl;
	}

	/**
	 * Initializes the OutputFormat implementation and configuration.
	 * 
	 * @throws RuntimeException
	 *         Throws if instance of OutputFormat implementation can not be
	 *         obtained.
	 */
	private void initOutputFormat() {
		if (this.userCodeClassLoader == null) {
			try {
				this.userCodeClassLoader = LibraryCacheManager.getClassLoader(getEnvironment().getJobID());
			} catch (IOException ioe) {
				throw new RuntimeException("Library cache manager could not be instantiated.", ioe);
			}
		}
		// obtain task configuration (including stub parameters)
		Configuration taskConf = getTaskConfiguration();
		taskConf.setClassLoader(this.userCodeClassLoader);
		this.config = new TaskConfig(taskConf);

		try {
			this.format = config.>getStubWrapper(this.userCodeClassLoader).getUserCodeObject(OutputFormat.class, this.userCodeClassLoader);

			// check if the class is a subclass, if the check is required
			if (!OutputFormat.class.isAssignableFrom(this.format.getClass())) {
				throw new RuntimeException("The class '" + this.format.getClass().getName() + "' is not a subclass of '" + 
						OutputFormat.class.getName() + "' as is required.");
			}
		}
		catch (ClassCastException ccex) {
			throw new RuntimeException("The stub class is not a proper subclass of " + OutputFormat.class.getName(), ccex);
		}
		
		// configure the stub. catch exceptions here extra, to report them as originating from the user code 
		try {
			this.format.configure(this.config.getStubParameters());
		}
		catch (Throwable t) {
			throw new RuntimeException("The user defined 'configure()' method in the Output Format caused an error: " 
				+ t.getMessage(), t);
		}
	}

	/**
	 * Initializes the input readers of the DataSinkTask.
	 * 
	 * @throws RuntimeException
	 *         Thrown in case of invalid task input configuration.
	 */
	@SuppressWarnings("unchecked")
	private void initInputReaders() throws Exception {
		
		MutableReader inputReader;
		
		int numGates = 0;
		//  ---------------- create the input readers ---------------------
		// in case where a logical input unions multiple physical inputs, create a union reader
		final int groupSize = this.config.getGroupSize(0);
		numGates += groupSize;
		if (groupSize == 1) {
			// non-union case
			inputReader = new MutableRecordReader>(this);
		} else if (groupSize > 1){
			// union case
			
			MutableRecordReader[] readers = new MutableRecordReader[groupSize];
			for (int j = 0; j < groupSize; ++j) {
				readers[j] = new MutableRecordReader(this);
			}
			inputReader = new MutableUnionRecordReader(readers);
		} else {
			throw new Exception("Illegal input group size in task configuration: " + groupSize);
		}
		
		this.inputTypeSerializerFactory = this.config.getInputSerializer(0, this.userCodeClassLoader);
		
		if (this.inputTypeSerializerFactory.getDataType() == Record.class) {
			// record specific deserialization
			MutableReader reader = (MutableReader) inputReader;
			this.reader = (MutableObjectIterator)new RecordReaderIterator(reader);
		} else {
			// generic data type serialization
			MutableReader> reader = (MutableReader>) inputReader;
			@SuppressWarnings({ "rawtypes" })
			final MutableObjectIterator iter = new ReaderIterator(reader, this.inputTypeSerializerFactory.getSerializer());
			this.reader = (MutableObjectIterator)iter;
		}
		
		// final sanity check
		if (numGates != this.config.getNumInputs()) {
			throw new Exception("Illegal configuration: Number of input gates and group sizes are not consistent.");
		}
	}
	
	// ------------------------------------------------------------------------
	//                     Degree of parallelism & checks
	// ------------------------------------------------------------------------
	

	@Override
	public int getMaximumNumberOfSubtasks() {
		if (!(this.format instanceof FileOutputFormat)) {
			return -1;
		}
		
		final FileOutputFormat fileOutputFormat = (FileOutputFormat) this.format;
		
		// ----------------- This code applies only to file inputs ------------------
		
		final Path path = fileOutputFormat.getOutputFilePath();
		final WriteMode writeMode = fileOutputFormat.getWriteMode();
		final OutputDirectoryMode outDirMode = fileOutputFormat.getOutputDirectoryMode();

		// Prepare output path and determine max DOP		
		try {
			
			int dop = getTaskConfiguration().getInteger(DEGREE_OF_PARALLELISM_KEY, -1);
			final FileSystem fs = path.getFileSystem();
			
			if(dop == 1 && outDirMode == OutputDirectoryMode.PARONLY) {
				// output is not written in parallel and should be written to a single file.
				
				if(fs.isDistributedFS()) {
					// prepare distributed output path
					if(!fs.initOutPathDistFS(path, writeMode, false)) {
						// output preparation failed! Cancel task.
						throw new IOException("Output path could not be initialized.");
					}
				}
				
				return 1;
				
			} else {
				// output should be written to a directory
				
				if(fs.isDistributedFS()) {
					// only distributed file systems can be initialized at start-up time.
					if(!fs.initOutPathDistFS(path, writeMode, true)) {
						throw new IOException("Output directory could not be created.");
					}
				}
				
				return -1;
				
			}
		}
		catch (IOException e) {
			LOG.error("Could not access the file system to detemine the status of the output.", e);
			throw new RuntimeException("I/O Error while accessing file", e);
		}
	}

	// ------------------------------------------------------------------------
	//                               Utilities
	// ------------------------------------------------------------------------
	
	/**
	 * Utility function that composes a string for logging purposes. The string includes the given message and
	 * the index of the task in its task group together with the number of tasks in the task group.
	 * 
	 * @param message The main message for the log.
	 * @return The string ready for logging.
	 */
	private String getLogString(String message) {
		return RegularPactTask.constructLogString(message, this.getEnvironment().getTaskName(), this);
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy