All Downloads are FREE. Search and download functionalities are using the official Maven repository.

eu.stratosphere.api.java.operators.TwoInputUdfOperator Maven / Gradle / Ivy

There is a newer version: 0.5.2-hadoop2
Show newest version
/***********************************************************************************************************************
 *
 * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 *
 **********************************************************************************************************************/
package eu.stratosphere.api.java.operators;

import java.lang.annotation.Annotation;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import eu.stratosphere.api.common.operators.DualInputSemanticProperties;
import eu.stratosphere.api.java.DataSet;
import eu.stratosphere.api.java.functions.FunctionAnnotation;
import eu.stratosphere.api.java.functions.SemanticPropUtil;
import eu.stratosphere.types.TypeInformation;
import eu.stratosphere.configuration.Configuration;

/**
 * The TwoInputUdfOperator is the base class of all binary operators that execute
 * user-defined functions (UDFs). The UDFs encapsulated by this operator are naturally UDFs that
 * have two inputs (such as {@link JoinFunction} or {@link CoGroupFunction}).
 * 

* This class encapsulates utilities for the UDFs, such as broadcast variables, parameterization * through configuration objects, and semantic properties. * * @param The data type of the first input data set. * @param The data type of the second input data set. * @param The data type of the returned data set. */ public abstract class TwoInputUdfOperator> extends TwoInputOperator implements UdfOperator { private Configuration parameters; private Map> broadcastVariables; private DualInputSemanticProperties udfSemantics; // -------------------------------------------------------------------------------------------- /** * Creates a new operators with the two given data sets as inputs. The given result type * describes the data type of the elements in the data set produced by the operator. * * @param input1 The data set for the first input. * @param input2 The data set for the second input. * @param resultType The type of the elements in the resulting data set. */ protected TwoInputUdfOperator(DataSet input1, DataSet input2, TypeInformation resultType) { super(input1, input2, resultType); } protected void extractSemanticAnnotationsFromUdf(Class udfClass) { Set annotations = FunctionAnnotation.readDualConstantAnnotations(udfClass); DualInputSemanticProperties dsp = SemanticPropUtil.getSemanticPropsDual(annotations, getInput1Type(), getInput2Type(), getResultType()); setSemanticProperties(dsp); } // -------------------------------------------------------------------------------------------- // Fluent API methods // -------------------------------------------------------------------------------------------- @Override public O withParameters(Configuration parameters) { this.parameters = parameters; @SuppressWarnings("unchecked") O returnType = (O) this; return returnType; } @Override public O withBroadcastSet(DataSet data, String name) { if (this.broadcastVariables == null) { this.broadcastVariables = new HashMap>(); } this.broadcastVariables.put(name, data); @SuppressWarnings("unchecked") O returnType = (O) this; return returnType; } /** * Adds a constant-set annotation for the first input of the UDF. * *

* Constant set annotations are used by the optimizer to infer the existence of data properties (sorted, partitioned, grouped). * In certain cases, these annotations allow the optimizer to generate a more efficient execution plan which can lead to improved performance. * Constant set annotations can only be specified if the first input and the output type of the UDF are of {@link Tuple} data types. * *

* A constant-set annotation is a set of constant field specifications. The constant field specification String "4->3" specifies, that this UDF copies the fourth field of * an input tuple to the third field of the output tuple. Field references are zero-indexed. * *

* NOTICE: Constant set annotations are optional, but if given need to be correct. Otherwise, the program might produce wrong results! * * @param constantSetFirst A list of constant field specification Strings for the first input. * @return This operator with an annotated constant field set for the first input. */ @SuppressWarnings("unchecked") public O withConstantSetFirst(String... constantSetFirst) { DualInputSemanticProperties dsp = SemanticPropUtil.getSemanticPropsDualFromString(constantSetFirst, null, null, null, null, null, this.getInput1Type(), this.getInput2Type(), this.getResultType()); this.setSemanticProperties(dsp); O returnType = (O) this; return returnType; } /** * Adds a constant-set annotation for the second input of the UDF. * *

* Constant set annotations are used by the optimizer to infer the existence of data properties (sorted, partitioned, grouped). * In certain cases, these annotations allow the optimizer to generate a more efficient execution plan which can lead to improved performance. * Constant set annotations can only be specified if the second input and the output type of the UDF are of {@link Tuple} data types. * *

* A constant-set annotation is a set of constant field specifications. The constant field specification String "4->3" specifies, that this UDF copies the fourth field of * an input tuple to the third field of the output tuple. Field references are zero-indexed. * *

* NOTICE: Constant set annotations are optional, but if given need to be correct. Otherwise, the program might produce wrong results! * * @param constantSetSecond A list of constant field specification Strings for the second input. * @return This operator with an annotated constant field set for the second input. */ @SuppressWarnings("unchecked") public O withConstantSetSecond(String... constantSetSecond) { DualInputSemanticProperties dsp = SemanticPropUtil.getSemanticPropsDualFromString(null, constantSetSecond, null, null, null, null, this.getInput1Type(), this.getInput2Type(), this.getResultType()); this.setSemanticProperties(dsp); O returnType = (O) this; return returnType; } // -------------------------------------------------------------------------------------------- // Accessors // -------------------------------------------------------------------------------------------- @Override public Map> getBroadcastSets() { return this.broadcastVariables == null ? Collections.>emptyMap() : Collections.unmodifiableMap(this.broadcastVariables); } @Override public Configuration getParameters() { return this.parameters; } @Override public DualInputSemanticProperties getSematicProperties() { return this.udfSemantics; } /** * Sets the semantic properties for the user-defined function (UDF). The semantic properties * define how fields of tuples and other objects are modified or preserved through this UDF. * The configured properties can be retrieved via {@link UdfOperator#getSematicProperties()}. * * @param properties The semantic properties for the UDF. * @see UdfOperator#getSematicProperties() */ public void setSemanticProperties(DualInputSemanticProperties properties) { this.udfSemantics = properties; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy