All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.dstream.DStreamExecutionGraphBuilder Maven / Gradle / Ivy

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.dstream;

import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

import io.dstream.DStreamInvocationChain.DStreamInvocation;
import io.dstream.SerializableStreamAssets.SerBinaryOperator;
import io.dstream.SerializableStreamAssets.SerComparator;
import io.dstream.SerializableStreamAssets.SerFunction;
import io.dstream.function.BiFunctionToBinaryOperatorAdapter;
import io.dstream.function.DStreamToStreamAdapterFunction;
import io.dstream.function.KeyValueMappingFunction;
import io.dstream.function.ValuesAggregatingFunction;
import io.dstream.function.ValuesReducingFunction;
import io.dstream.support.Aggregators;
import io.dstream.utils.Assert;

/**
 * Builder of DStreamOperations
 */
final class DStreamExecutionGraphBuilder {
	
	private final DStreamInvocationChain invocationPipeline;
	
	private final Properties executionConfig;
	
	private final SerFunction>, Stream> shuffleResultNormalizer;
	

	private DStreamOperation currentStreamOperation;
	
	private int operationIdCounter;
	
	private boolean combiningStreams;
	
	
	/**
	 * Creates a builder for the given invocation chain.
	 *
	 * @param invocationPipeline chain of invocations to convert into operations
	 * @param executionConfig execution configuration consulted while building
	 */
	@SuppressWarnings("unchecked")
	DStreamExecutionGraphBuilder(DStreamInvocationChain invocationPipeline, Properties executionConfig){
		this.invocationPipeline = invocationPipeline;
		this.executionConfig = executionConfig;
		// Each entry value is cast to an Iterator; its elements are streamed sequentially
		// (non-parallel, ORDERED) and flattened into the resulting stream.
		this.shuffleResultNormalizer = stream -> stream
				.flatMap(entry -> StreamSupport.stream(
						Spliterators.spliteratorUnknownSize((Iterator<Object>) entry.getValue(), Spliterator.ORDERED),
						false));
	}
	
	/**
	 * Builds the execution graph for a stream that is not a dependency of another stream.
	 *
	 * @return the graph produced by {@link #doBuild(boolean)} invoked with {@code false}
	 */
	protected DStreamExecutionGraph build(){
		final boolean isDependent = false;
		return this.doBuild(isDependent);
	}
	
	/**
	 * Converts the recorded invocations into an ordered list of operations and wraps
	 * it in a {@link DStreamExecutionGraph}.
	 * <p>
	 * The 'isDependent' flag implies that the operations this method is about to build
	 * are a dependency of other operations (e.g., for cases such as join, union etc.).
	 * Basically, each process should have at least two operations - 'read -> write' with shuffle in between.
	 * For cases where we are building dependent operations the second operation is actually
	 * implicit (the operation that does the join, union etc.).
	 *
	 * @param isDependent true when the graph is being built as a dependency of another graph
	 * @return execution graph whose operations are listed from the root operation to the last one
	 */
	@SuppressWarnings("rawtypes")
	private DStreamExecutionGraph doBuild(boolean isDependent){
		this.invocationPipeline.getInvocations().forEach(this::addInvocation);

		if (this.requiresInitialSetOfOperations()){
			this.createDefaultExtractOperation();
		}
		if (this.requiresPostShuffleOperation(isDependent)){
			// The unmap function must be picked from the operation that precedes the new one
			SerFunction loadFunc = this.determineUnmapFunction(this.currentStreamOperation.getLastOperationName());
			this.currentStreamOperation = new DStreamOperation(this.operationIdCounter++, this.currentStreamOperation);
			this.currentStreamOperation.addStreamOperationFunction(Ops.load.name(), loadFunc);
		}

		// Operations are linked child -> parent, so collect them from the tail upwards
		// and reverse to obtain execution order.
		List<DStreamOperation> operationList = new ArrayList<>();
		DStreamOperation parent = this.currentStreamOperation;
		do {
			operationList.add(parent);
			parent = parent.getParent();
		} while (parent != null);
		Collections.reverse(operationList);

		return new DStreamExecutionGraph(
				this.invocationPipeline.getSourceElementType(),
				this.invocationPipeline.getSourceIdentifier(),
				Collections.unmodifiableList(operationList));
	}
	
	/**
	 * Routes a single invocation to the handler that matches the category of its operation.
	 * The operation is resolved from the invoked method's name. Categories are checked in
	 * this order: transformation, shuffle, stream reduce, stream comparator; an operation
	 * matching none of them is ignored.
	 *
	 * @param invocation the invocation to add to the operations being built
	 */
	private void addInvocation(DStreamInvocation invocation) {
		Ops operation = Ops.valueOf(invocation.getMethod().getName());

		if (Ops.isTransformation(operation)){
			this.addTransformationOperation(invocation);
			return;
		}

		if (Ops.isShuffle(operation)){
			boolean combinesStreams = operation.equals(Ops.join)
					|| operation.equals(Ops.union)
					|| operation.equals(Ops.unionAll);
			if (combinesStreams){
				this.addStreamsCombineOperation(invocation);
				this.combiningStreams = true;
				return;
			}

			// Any other shuffle ends the 'combining' state, even if no handler applies below
			this.combiningStreams = false;
			if (operation.equals(Ops.reduceValues) || operation.equals(Ops.aggregateValues)){
				this.addAggregationOperation(invocation);
			}
			else if (operation.equals(Ops.classify)){
				this.addClassifyOperation(invocation);
			}
			return;
		}

		if (Ops.isStreamReduce(operation)){
			this.addStreamReduceOperation(invocation);
			return;
		}

		if (Ops.isStreamComparator(operation)){
			this.addStreamComparatorOperation(invocation);
		}
	}
	
	/**
	 * Adds a 'classify' function to the current operation, creating the operation first
	 * if none exists. The function maps each element to a key/value pair using the
	 * classifier supplied as the first invocation argument for the key and the element
	 * itself as the value. When the current operation has a parent, a post shuffle
	 * normalizer operation is appended afterwards.
	 *
	 * @param invocation the 'classify' invocation; its first argument is the classifier
	 */
	@SuppressWarnings({ "rawtypes", "unchecked" })
	private void addClassifyOperation(DStreamInvocation invocation) {
		SerFunction classifier = (SerFunction) invocation.getArguments()[0];
		SerFunction keyValueMapper = new KeyValueMappingFunction<>(classifier, s -> s, null);

		boolean noOperationYet = this.currentStreamOperation == null;
		if (noOperationYet){
			this.currentStreamOperation = new DStreamOperation(this.operationIdCounter++, this.currentStreamOperation);
		}

		this.currentStreamOperation.addStreamOperationFunction(Ops.classify.name(), keyValueMapper);

		boolean hasParent = this.currentStreamOperation.getParent() != null;
		if (hasParent){
			this.addPostShuffleNormalizer();
		}
	}
	
	/**
	 * Adds a stream combining operation (join, union or unionAll).
	 * <p>
	 * When parallelism is configured and neither the parent of the current operation nor
	 * (if there is no parent) the current operation itself is a 'classify', parallelism
	 * must be 1. If the current operation is already a streams combiner its combiner is
	 * reused; otherwise a new combiner is created in a new operation. The stream passed as
	 * the first invocation argument is built as a dependent graph and registered with the
	 * current operation.
	 *
	 * @param invocation the join/union/unionAll invocation; its first argument is the
	 *        invocation chain of the stream being combined
	 */
	@SuppressWarnings("rawtypes")
	private void addStreamsCombineOperation(DStreamInvocation invocation){
		if (this.executionConfig.containsKey(DStreamConstants.PARALLELISM)){
			int parallelism = Integer.parseInt(this.executionConfig.getProperty(DStreamConstants.PARALLELISM));

			boolean classified = false;
			DStreamOperation current = this.currentStreamOperation;
			if (current != null){
				DStreamOperation parent = current.getParent();
				classified = parent != null ? parent.isClassify() : current.isClassify();
			}
			if (!classified){
				Assert.isTrue(parallelism == 1, "Combining streams without prior classification is not supported when parallelism is > 1");
			}
		}

		Ops operation = Ops.valueOf(invocation.getMethod().getName());
		Object[] arguments = invocation.getArguments();

		AbstractStreamMergingFunction streamsCombiner;
		if (this.currentStreamOperation != null && this.currentStreamOperation.isStreamsCombiner()){
			// Consecutive combine invocations share the combiner of the current operation
			streamsCombiner = (AbstractStreamMergingFunction)(SerFunction)this.currentStreamOperation.getStreamOperationFunction();
		}
		else {
			SerFunction firstStreamPreProcessor;
			if (this.currentStreamOperation == null){
				// Combining without any prior operation (hence without classification):
				// start from a default 'extract' operation and pass the first stream through as is.
				this.createDefaultExtractOperation();
				firstStreamPreProcessor = s -> s;
			}
			else {
				firstStreamPreProcessor = this.determineUnmapFunction(this.currentStreamOperation.getLastOperationName());
			}
			streamsCombiner = this.createStreamCombiner(operation.name(), firstStreamPreProcessor);

			this.currentStreamOperation = new DStreamOperation(this.operationIdCounter++, this.currentStreamOperation);
			this.currentStreamOperation.setStreamsCombiner(operation.name(), streamsCombiner);
		}

		// Build the stream being combined as a dependent graph using the same configuration
		DStreamInvocationChain dependentChain = (DStreamInvocationChain) arguments[0];
		DStreamExecutionGraph dependentGraph =
				new DStreamExecutionGraphBuilder(dependentChain, this.executionConfig).doBuild(true);
		int combinedStreamsCount = dependentChain.getStreamType().getTypeParameters().length;

		this.currentStreamOperation.addCombinableExecutionGraph(dependentGraph);
		streamsCombiner.addCheckPoint(combinedStreamsCount);
		if (invocation.getSupplementaryOperation() != null){
			streamsCombiner.addTransformationOrPredicate(Ops.filter.name(), invocation.getSupplementaryOperation());
		}
	}
	
	/**
	 * 
	 */
	@SuppressWarnings("unchecked")
	private void addTransformationOperation(DStreamInvocation invocation){
		Method method = invocation.getMethod();
		Object[] arguments = invocation.getArguments();
		Ops operation = Ops.valueOf(method.getName());
		
		SerFunction, Stream> currentStreamFunction = operation.equals(Ops.compute)
				? (SerFunction, Stream>) arguments[0]
						: new DStreamToStreamAdapterFunction(operation.name(), arguments.length > 0 ? arguments[0] : null);
		
		if (this.combiningStreams){
			SerFunction currentFunction = this.currentStreamOperation.getStreamOperationFunction();
			AbstractStreamMergingFunction joiner = (AbstractStreamMergingFunction) currentFunction;
			joiner.addTransformationOrPredicate(currentStreamFunction);
		}
		else {
			if (this.currentStreamOperation == null){
				this.currentStreamOperation = new DStreamOperation(this.operationIdCounter++);
			} 
			else if (this.currentStreamOperation.getLastOperationName().equals(Ops.classify.name())){
				this.addPostShuffleNormalizer();
			}
				
			this.currentStreamOperation.addStreamOperationFunction(operation.name(), currentStreamFunction);
		}
	}
	
	/**
	 * Adds a 'reduceValues' or 'aggregateValues' operation.
	 * <p>
	 * A key/value mapping function (key mapper and value mapper are the first two invocation
	 * arguments) is added to the current operation, followed by a new operation holding the
	 * values reducing (reduceValues) or values aggregating (aggregateValues) function.
	 * For reduceValues the aggregator is the third invocation argument; for aggregateValues
	 * an aggregate-to-list aggregator is used. The aggregator is also handed to the key/value
	 * mapper when the map-side-combine property for this operation is "true".
	 *
	 * @param invocation the reduceValues/aggregateValues invocation
	 */
	private void addAggregationOperation(DStreamInvocation invocation) {
		Method method = invocation.getMethod();
		Object[] arguments = invocation.getArguments();
		Ops operation = Ops.valueOf(method.getName());

		SerFunction keyMapper = (SerFunction) arguments[0];
		SerFunction valueMapper = (SerFunction) arguments[1];
		SerBinaryOperator valueAggregator = operation.equals(Ops.reduceValues)
				? (SerBinaryOperator)arguments[2]
						: new BiFunctionToBinaryOperatorAdapter(Aggregators::aggregateToList);

		// NOTE(review): when no operation exists yet the id used in the property name is 1,
		// while the operation created below takes its id from operationIdCounter - confirm intended.
		int operationId = this.currentStreamOperation == null ? 1 : this.currentStreamOperation.getId();
		String propertyName = DStreamConstants.MAP_SIDE_COMBINE + operationId + "_" + this.invocationPipeline.getSourceIdentifier();
		// getProperty (unlike the inherited getOrDefault) also consults the Properties defaults
		// and never requires a cast, so a non-String value cannot cause a ClassCastException.
		boolean mapSideCombine = Boolean.parseBoolean(this.executionConfig.getProperty(propertyName, "false"));

		@SuppressWarnings({ "unchecked", "rawtypes" })
		SerFunction kvMapper = new KeyValueMappingFunction(keyMapper, valueMapper, mapSideCombine ? valueAggregator : null);

		this.adjustCurrentStreamState();

		this.currentStreamOperation.addStreamOperationFunction(Ops.mapKeyValues.name(), kvMapper);

		DStreamOperation newStreamOperation = new DStreamOperation(this.operationIdCounter++, this.currentStreamOperation);
		SerFunction newStreamFunction = operation.equals(Ops.reduceValues)
				? new ValuesReducingFunction<>(valueAggregator)
						: new ValuesAggregatingFunction<>(valueAggregator);
		newStreamOperation.addStreamOperationFunction(operation.name(), newStreamFunction);
		this.currentStreamOperation = newStreamOperation;
	}
	
	/**
	 * Adds a comparator based operation to the current operation. The operation is named
	 * after the invoked method; the comparator is the single invocation argument, or null
	 * when the invocation does not have exactly one argument.
	 *
	 * @param invocation the comparator invocation
	 */
	private void addStreamComparatorOperation(DStreamInvocation invocation){
		Object[] arguments = invocation.getArguments();
		SerComparator comparator = null;
		if (arguments.length == 1){
			comparator = (SerComparator) arguments[0];
		}

		this.adjustCurrentStreamState();

		String operationName = invocation.getMethod().getName();
		this.currentStreamOperation.addStreamOperationFunction(
				operationName, new DStreamToStreamAdapterFunction(operationName, comparator));
	}
	
	/**
	 * 
	 */
	private void addStreamReduceOperation(DStreamInvocation invocation){	
		Ops operation = Ops.valueOf(invocation.getMethod().getName());
		SerFunction keyMapper = s -> 0;
		SerFunction valueMapper;
		SerBinaryOperator valueAggregator;
		
		if (operation.equals(Ops.count)){
			valueMapper = s -> 1L;
			valueAggregator = (a, b) -> ((long)a) + ((long)b);
		} 
		else if (operation.equals(Ops.reduce)) {
			valueMapper = s -> s;
			valueAggregator = (SerBinaryOperator) invocation.getArguments()[0];
		}
		else {
			throw new IllegalStateException("Unrecognized or unsupported operation: " + invocation.getMethod().getName());
		}
		@SuppressWarnings({ "unchecked", "rawtypes" })
		SerFunction kvMapper = new KeyValueMappingFunction(keyMapper, valueMapper, valueAggregator);
		
		this.adjustCurrentStreamState();
		
		this.currentStreamOperation.addStreamOperationFunction(Ops.mapKeyValues.name(), kvMapper);

		SerFunction>>,Stream> valueReducingFunction = 
				new ValuesReducingFunction(valueAggregator);
		DStreamOperation valueReducingOperation = new DStreamOperation(this.operationIdCounter++, this.currentStreamOperation);
			
		valueReducingOperation.addStreamOperationFunction(Ops.reduceValues.name(), valueReducingFunction);
		this.currentStreamOperation = valueReducingOperation;
		
		DStreamOperation mappingOperation = new DStreamOperation(this.operationIdCounter++, this.currentStreamOperation);
		mappingOperation.addStreamOperationFunction(Ops.map.name(), this.shuffleResultNormalizer);
		this.currentStreamOperation = mappingOperation;
	}
	
	/**
	 * Creates the function that combines streams for the named operation: a
	 * {@link StreamJoinerFunction} for 'join', otherwise a {@link StreamUnionFunction}
	 * whose first argument tells whether the operation is 'union'.
	 *
	 * @param operationName name of the combining operation
	 * @param firstStreamPreProcessingFunction function handed to the created combiner
	 * @return the newly created combiner
	 */
	@SuppressWarnings({"unchecked", "rawtypes" })
	private AbstractStreamMergingFunction createStreamCombiner(String operationName, SerFunction firstStreamPreProcessingFunction) {
		if (operationName.equals(Ops.join.name())){
			return new StreamJoinerFunction(firstStreamPreProcessingFunction);
		}
		boolean isUnion = operationName.equals(Ops.union.name());
		return new StreamUnionFunction(isUnion, firstStreamPreProcessingFunction);
	}
	
	/**
	 * Will ensure that:
	 * <ul>
	 * <li>if no stream operations were created, it will create an initial one,
	 * essentially creating an execution stage.</li>
	 * <li>if current operation is 'classify', it will create a post shuffle stage.</li>
	 * </ul>
	 */
	private void adjustCurrentStreamState(){
		DStreamOperation current = this.currentStreamOperation;
		if (current == null){
			current = new DStreamOperation(this.operationIdCounter++);
			this.currentStreamOperation = current;
		}
		if (current.isClassify()){
			this.addPostShuffleNormalizer();
		}
	}

	/**
	 * Determines which unmap function to use. For cases when previous operation was 'classify'
	 * the unmap function is {@link #shuffleResultNormalizer}, otherwise it will produce
	 * a pass thru function.
	 *
	 * @param lastOperationName name of the previous operation; may be null
	 * @return the function to apply to the previous operation's output
	 */
	private SerFunction determineUnmapFunction(String lastOperationName){
		if (Ops.classify.name().equals(lastOperationName)){
			return this.shuffleResultNormalizer;
		}
		return s -> s;
	}

	/**
	 * Appends a new operation, parented by the current one, whose 'load' function is
	 * the shuffle result normalizer, and makes it the current operation.
	 */
	private void addPostShuffleNormalizer(){
		DStreamOperation postShuffleOperation = new DStreamOperation(this.operationIdCounter++, this.currentStreamOperation);
		this.currentStreamOperation = postShuffleOperation;
		postShuffleOperation.addStreamOperationFunction(Ops.load.name(), this.shuffleResultNormalizer);
	}

	/**
	 * Will return true if this builder doesn't have any operations.
	 * Typical for cases where DStream is executed without any operations
	 * invoked (e.g., DStream.ofType(String.class, "wc").executeAs("WordCount"); )
	 */
	private boolean requiresInitialSetOfOperations(){
		boolean nothingBuiltYet = this.currentStreamOperation == null;
		return nothingBuiltYet;
	}

	/**
	 * Will return true if a post shuffle operation is required.
	 * Typical case is where this builder has only one operation and
	 * it's building a non-dependent stream operations.
	 *
	 * @param isDependent true when the operations are built as a dependency of other operations
	 */
	private boolean requiresPostShuffleOperation(boolean isDependent){
		boolean singleOperation = this.currentStreamOperation.getParent() == null;
		return singleOperation && !isDependent;
	}

	/**
	 * Will create an operation named 'extract' with a pass-thru function
	 * and make it the current operation.
	 */
	private void createDefaultExtractOperation(){
		DStreamOperation extractOperation = new DStreamOperation(this.operationIdCounter++);
		this.currentStreamOperation = extractOperation;
		extractOperation.addStreamOperationFunction(Ops.extract.name(), s -> s);
	}
}