eu.stratosphere.api.java.operators.TwoInputUdfOperator Maven / Gradle / Ivy
/***********************************************************************************************************************
*
* Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
*
**********************************************************************************************************************/
package eu.stratosphere.api.java.operators;
import java.lang.annotation.Annotation;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import eu.stratosphere.api.common.operators.DualInputSemanticProperties;
import eu.stratosphere.api.java.DataSet;
import eu.stratosphere.api.java.functions.FunctionAnnotation;
import eu.stratosphere.api.java.functions.SemanticPropUtil;
import eu.stratosphere.types.TypeInformation;
import eu.stratosphere.configuration.Configuration;
/**
* The TwoInputUdfOperator is the base class of all binary operators that execute
* user-defined functions (UDFs). The UDFs encapsulated by this operator are naturally UDFs that
* have two inputs (such as {@link JoinFunction} or {@link CoGroupFunction}).
*
* This class encapsulates utilities for the UDFs, such as broadcast variables, parameterization
* through configuration objects, and semantic properties.
*
* @param The data type of the first input data set.
* @param The data type of the second input data set.
* @param The data type of the returned data set.
*/
public abstract class TwoInputUdfOperator>
extends TwoInputOperator implements UdfOperator
{
private Configuration parameters;
private Map> broadcastVariables;
private DualInputSemanticProperties udfSemantics;
// --------------------------------------------------------------------------------------------
/**
* Creates a new operators with the two given data sets as inputs. The given result type
* describes the data type of the elements in the data set produced by the operator.
*
* @param input1 The data set for the first input.
* @param input2 The data set for the second input.
* @param resultType The type of the elements in the resulting data set.
*/
protected TwoInputUdfOperator(DataSet input1, DataSet input2, TypeInformation resultType) {
super(input1, input2, resultType);
}
protected void extractSemanticAnnotationsFromUdf(Class> udfClass) {
Set annotations = FunctionAnnotation.readDualConstantAnnotations(udfClass);
DualInputSemanticProperties dsp = SemanticPropUtil.getSemanticPropsDual(annotations,
getInput1Type(), getInput2Type(), getResultType());
setSemanticProperties(dsp);
}
// --------------------------------------------------------------------------------------------
// Fluent API methods
// --------------------------------------------------------------------------------------------
@Override
public O withParameters(Configuration parameters) {
this.parameters = parameters;
@SuppressWarnings("unchecked")
O returnType = (O) this;
return returnType;
}
@Override
public O withBroadcastSet(DataSet> data, String name) {
if (this.broadcastVariables == null) {
this.broadcastVariables = new HashMap>();
}
this.broadcastVariables.put(name, data);
@SuppressWarnings("unchecked")
O returnType = (O) this;
return returnType;
}
/**
* Adds a constant-set annotation for the first input of the UDF.
*
*
* Constant set annotations are used by the optimizer to infer the existence of data properties (sorted, partitioned, grouped).
* In certain cases, these annotations allow the optimizer to generate a more efficient execution plan which can lead to improved performance.
* Constant set annotations can only be specified if the first input and the output type of the UDF are of {@link Tuple} data types.
*
*
* A constant-set annotation is a set of constant field specifications. The constant field specification String "4->3" specifies, that this UDF copies the fourth field of
* an input tuple to the third field of the output tuple. Field references are zero-indexed.
*
*
* NOTICE: Constant set annotations are optional, but if given need to be correct. Otherwise, the program might produce wrong results!
*
* @param constantSetFirst A list of constant field specification Strings for the first input.
* @return This operator with an annotated constant field set for the first input.
*/
@SuppressWarnings("unchecked")
public O withConstantSetFirst(String... constantSetFirst) {
DualInputSemanticProperties dsp = SemanticPropUtil.getSemanticPropsDualFromString(constantSetFirst, null,
null, null, null, null, this.getInput1Type(), this.getInput2Type(), this.getResultType());
this.setSemanticProperties(dsp);
O returnType = (O) this;
return returnType;
}
/**
* Adds a constant-set annotation for the second input of the UDF.
*
*
* Constant set annotations are used by the optimizer to infer the existence of data properties (sorted, partitioned, grouped).
* In certain cases, these annotations allow the optimizer to generate a more efficient execution plan which can lead to improved performance.
* Constant set annotations can only be specified if the second input and the output type of the UDF are of {@link Tuple} data types.
*
*
* A constant-set annotation is a set of constant field specifications. The constant field specification String "4->3" specifies, that this UDF copies the fourth field of
* an input tuple to the third field of the output tuple. Field references are zero-indexed.
*
*
* NOTICE: Constant set annotations are optional, but if given need to be correct. Otherwise, the program might produce wrong results!
*
* @param constantSetSecond A list of constant field specification Strings for the second input.
* @return This operator with an annotated constant field set for the second input.
*/
@SuppressWarnings("unchecked")
public O withConstantSetSecond(String... constantSetSecond) {
DualInputSemanticProperties dsp = SemanticPropUtil.getSemanticPropsDualFromString(null, constantSetSecond,
null, null, null, null, this.getInput1Type(), this.getInput2Type(), this.getResultType());
this.setSemanticProperties(dsp);
O returnType = (O) this;
return returnType;
}
// --------------------------------------------------------------------------------------------
// Accessors
// --------------------------------------------------------------------------------------------
@Override
public Map> getBroadcastSets() {
return this.broadcastVariables == null ?
Collections.>emptyMap() :
Collections.unmodifiableMap(this.broadcastVariables);
}
@Override
public Configuration getParameters() {
return this.parameters;
}
@Override
public DualInputSemanticProperties getSematicProperties() {
return this.udfSemantics;
}
/**
* Sets the semantic properties for the user-defined function (UDF). The semantic properties
* define how fields of tuples and other objects are modified or preserved through this UDF.
* The configured properties can be retrieved via {@link UdfOperator#getSematicProperties()}.
*
* @param properties The semantic properties for the UDF.
* @see UdfOperator#getSematicProperties()
*/
public void setSemanticProperties(DualInputSemanticProperties properties) {
this.udfSemantics = properties;
}
}