// org.apache.hadoop.hive.ql.udf.ptf.TableFunctionEvaluator Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.udf.ptf;
import java.util.Iterator;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.exec.PTFOperator;
import org.apache.hadoop.hive.ql.exec.PTFPartition;
import org.apache.hadoop.hive.ql.exec.PTFPartition.PTFPartitionIterator;
import org.apache.hadoop.hive.ql.exec.PTFUtils;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.PTFDesc;
import org.apache.hadoop.hive.ql.plan.ptf.PartitionedTableFunctionDef;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
/*
* Interface Design:
* A TableFunction provides 2 interfaces of execution 'Batch' and 'Streaming'.
* - In Batch mode the contract is Partition in - Partition Out
* - In Streaming mode the contract is a stream of processRow calls - each of which may return 0 or more rows.
*
* A Partition is not just a batch of rows, it enables more than a single iteration of
* the i/p data: multiple passes, arbitrary access of input rows, relative navigation between
* rows (e.g. lead/lag fns). Most PTFs will work in batch mode.
*
* The Streaming mode gives up on the capabilities of Partitions for the benefit of smaller footprint,
* and faster processing. Window Function processing is an example of this: when there are only Ranking
* functions each row needs to be accessed once in the order it is provided; hence there is no need
* to hold all input rows in a Partition. The 'pattern' is: any time you want to only enhance/enrich
* an Input Row Streaming mode is the right choice. This is the fundamental difference between Ranking
* fns and UDAFs: Ranking functions keep the original data intact whereas UDAFs only return aggregate
* information.
*
* Finally we have provided a 'mixed' mode where a non Streaming TableFunction can provide its output
* as an Iterator. As far as we can tell, this is a special case for Windowing handling. If Windowing
* is the only or last TableFunction in a chain, it makes no sense to collect the output rows into an
* output Partition. We justify the pollution of the api by the observation that Windowing is a very
* common use case.
*
*/
/**
* Based on Hive {@link GenericUDAFEvaluator}. Break up the responsibility of the old AbstractTableFunction
* class into a Resolver and Evaluator.
*
* The Evaluator also holds onto the {@link PartitionedTableFunctionDef}. This provides information
* about the arguments to the function, the shape of the Input partition and the Partitioning details.
* The Evaluator is responsible for providing the 2 execute methods:
*
* - execute: which is invoked after the input is partitioned; the contract
* is, it is given an input Partition and must return an output Partition. The shape of the output
* Partition is obtained from the getOutputOI call.
*
* - transformRawInput: In the case where this function indicates that it will transform the raw input
* before it is fed through the partitioning mechanics, this function is called. Again the contract is
* it is given an input Partition and must return a Partition. The shape of the output Partition is
* obtained from the getRawInputOI() call.
*
*
*/
public abstract class TableFunctionEvaluator {
/*
 * Object inspector for the rows coming out of this PTF; read through getOutputOI()
 * and assigned through setOutputOI().
 *
 * Open question left by the original author: how is this different from the
 * OutputShape set on the TableDef?
 * This is the OI of the object coming out of the PTF.
 * It is put in an output Partition whose Serde is usually LazyBinarySerde.
 * So the next PTF (or Operator) in the chain gets a LazyBinaryStruct.
 */
transient protected StructObjectInspector OI;
/*
 * Object inspector for the raw input; read through getRawInputOI() and assigned
 * through setRawInputOI(). The same comment as for OI applies here.
 */
transient protected StructObjectInspector rawInputOI;
// Definition of the table function this evaluator works for; see getTableDef()/setTableDef().
protected PartitionedTableFunctionDef tableDef;
// PTF descriptor returned by getQueryDef(); execute() consults its isMapSide() flag.
protected PTFDesc ptfDesc;
// Backing flag for isTransformsRawInput()/setTransformsRawInput().
// NOTE(review): package-private, unlike the neighbouring protected fields - confirm this is intentional.
boolean transformsRawInput;
// NOTE(review): not referenced in the visible part of this class; presumably the Partition
// produced by execute() - confirm against the rest of the file.
transient protected PTFPartition outputPartition;
// NOTE(review): presumably signals that this function can consume its input in Streaming mode
// (see the Interface Design notes at the top of the file) - confirm against its users.
transient protected boolean canAcceptInputAsStream;
/*
 * Runs once when the class is initialized and hands the names "outputOI" and "rawInputOI"
 * to PTFUtils.makeTransient for this class.
 * NOTE(review): "outputOI" does not match the field name (the field is called OI). Presumably
 * the helper matches bean property names derived from getOutputOI()/getRawInputOI() rather
 * than field names, in which case the names are correct - confirm against PTFUtils.makeTransient.
 */
static {
PTFUtils.makeTransient(TableFunctionEvaluator.class, "outputOI", "rawInputOI");
}
/**
 * Returns the object inspector describing the rows that come out of this function.
 *
 * @return the inspector currently stored in {@code OI}
 */
public StructObjectInspector getOutputOI() {
  return this.OI;
}
/**
 * Stores the object inspector that {@code getOutputOI()} will return.
 *
 * @param outputOI the inspector to keep in {@code OI}
 */
protected void setOutputOI(final StructObjectInspector outputOI) {
  this.OI = outputOI;
}
/**
 * Returns the table function definition held by this evaluator.
 *
 * @return the definition currently stored in {@code tableDef}
 */
public PartitionedTableFunctionDef getTableDef() {
  return this.tableDef;
}
/**
 * Replaces the table function definition held by this evaluator.
 *
 * @param tDef the definition to keep in {@code tableDef}
 */
public void setTableDef(final PartitionedTableFunctionDef tDef) {
  tableDef = tDef;
}
/**
 * Returns the PTF descriptor held by this evaluator.
 *
 * @return the descriptor currently stored in {@code ptfDesc}
 */
protected PTFDesc getQueryDef() {
  return this.ptfDesc;
}
/**
 * Replaces the PTF descriptor held by this evaluator.
 *
 * @param ptfDesc the descriptor to keep; later returned by {@code getQueryDef()}
 */
protected void setQueryDef(final PTFDesc ptfDesc) {
  this.ptfDesc = ptfDesc;
}
/**
 * Returns the object inspector associated with the raw input of this function.
 *
 * @return the inspector currently stored in {@code rawInputOI}
 */
public StructObjectInspector getRawInputOI() {
  return this.rawInputOI;
}
/**
 * Stores the object inspector that {@code getRawInputOI()} will return.
 *
 * @param rawInputOI the inspector to keep
 */
protected void setRawInputOI(final StructObjectInspector rawInputOI) {
  this.rawInputOI = rawInputOI;
}
/**
 * Reports the current value of the {@code transformsRawInput} flag.
 *
 * @return the value last stored through {@code setTransformsRawInput}
 */
public boolean isTransformsRawInput() {
  return this.transformsRawInput;
}
/**
 * Updates the {@code transformsRawInput} flag.
 *
 * @param transformsRawInput the new flag value; later returned by {@code isTransformsRawInput()}
 */
public void setTransformsRawInput(final boolean transformsRawInput) {
  this.transformsRawInput = transformsRawInput;
}
public PTFPartition execute(PTFPartition iPart)
throws HiveException {
if ( ptfDesc.isMapSide() ) {
return transformRawInput(iPart);
}
PTFPartitionIterator