// NOTE(review): removed Maven-repository web-page banner text that was
// accidentally captured with this source file; it was not part of the
// original Java source and is not valid Java.
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.udf.ptf;

import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.exec.PTFOperator;
import org.apache.hadoop.hive.ql.exec.PTFPartition;
import org.apache.hadoop.hive.ql.exec.PTFPartition.PTFPartitionIterator;
import org.apache.hadoop.hive.ql.exec.PTFUtils;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.PTFDesc;
import org.apache.hadoop.hive.ql.plan.ptf.PartitionedTableFunctionDef;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;

/*
 * Interface Design:
 * A TableFunction provides 2 interfaces of execution 'Batch' and 'Streaming'.
 * - In Batch mode the contract is Partition in - Partition Out
 * - In Streaming mode the contract is a stream of processRow calls - each of which may return 0 or more rows.
 * 
 * A Partition is not just a batch of rows, it enables more than a single iteration of
 * the i/p data: multiple passes, arbitrary access of input rows, relative navigation between
 * rows(for e.g. lead/lag fns). Most PTFs will work in batch mode.
 * 
 * The Streaming mode gives up on the capabilities of Partitions for the benefit of smaller footprint,
 * and faster processing. Window Function processing is an e.g. of this: when there are only Ranking
 * functions each row needs to be accessed once in the order it is provided; hence there is no need 
 * to hold all input rows in a Partition. The 'pattern' is: any time you want to only enhance/enrich 
 * an Input Row Streaming mode is the right choice. This is the fundamental difference between Ranking
 * fns and UDAFs: Ranking functions keep the original data intact whereas UDAF only return aggregate
 * information.
 * 
 * Finally we have provided a 'mixed' mode where a non Streaming TableFunction can provide its output
 * as an Iterator. As far as we can tell, this is a special case for Windowing handling. If Windowing
 * is the only or last TableFunction in a chain, it makes no sense to collect the output rows into a 
 * output Partition. We justify the pollution of the api by the observation that Windowing is a very 
 * common use case.
 * 
 */

/**
 * Based on Hive {@link GenericUDAFEvaluator}. Break up the responsibility of the old
 * AbstractTableFunction class into a Resolver and Evaluator.
 * <p>
 * The Evaluator also holds onto the {@link PartitionedTableFunctionDef}. This provides
 * information about the arguments to the function, the shape of the Input partition and the
 * Partitioning details. The Evaluator is responsible for providing the 2 execute methods:
 * <ol>
 * <li><b>execute:</b> which is invoked after the input is partitioned; the contract is, it is
 * given an input Partition and must return an output Partition. The shape of the output
 * Partition is obtained from the getOutputOI call.</li>
 * <li><b>transformRawInput:</b> In the case where this function indicates that it will
 * transform the raw input before it is fed through the partitioning mechanics, this function is
 * called. Again the contract is it is given an input Partition and must return an Partition.
 * The shape of the output Partition is obtained from getRawInputOI() call.</li>
 * </ol>
 */
public abstract class TableFunctionEvaluator {
  /*
   * how is this different from the OutputShape set on the TableDef.
   * This is the OI of the object coming out of the PTF.
   * It is put in an output Partition whose Serde is usually LazyBinarySerde.
   * So the next PTF (or Operator) in the chain gets a LazyBinaryStruct.
   */
  transient protected StructObjectInspector OI;
  /*
   * same comment as OI applies here.
   */
  transient protected StructObjectInspector rawInputOI;
  // Definition of this PTF invocation: arguments, input/output shapes, partitioning.
  protected PartitionedTableFunctionDef tableDef;
  protected PTFDesc ptfDesc;
  // True when this PTF transforms the raw input before partitioning (see transformRawInput).
  boolean transformsRawInput;
  // Reused across partitions; reset() between uses, released in close().
  transient protected PTFPartition outputPartition;
  transient protected boolean canAcceptInputAsStream;

  static {
    // NOTE(review): "outputOI" does not match any field name here (the field is "OI"),
    // so presumably only "rawInputOI" is actually made transient by this call —
    // verify against PTFUtils.makeTransient before relying on it.
    PTFUtils.makeTransient(TableFunctionEvaluator.class, "outputOI", "rawInputOI");
  }

  /** Shape of the rows this PTF emits (see comment on {@code OI}). */
  public StructObjectInspector getOutputOI() {
    return OI;
  }

  protected void setOutputOI(StructObjectInspector outputOI) {
    OI = outputOI;
  }

  public PartitionedTableFunctionDef getTableDef() {
    return tableDef;
  }

  public void setTableDef(PartitionedTableFunctionDef tDef) {
    this.tableDef = tDef;
  }

  protected PTFDesc getQueryDef() {
    return ptfDesc;
  }

  protected void setQueryDef(PTFDesc ptfDesc) {
    this.ptfDesc = ptfDesc;
  }

  /** Shape of the rows produced by {@link #transformRawInput(PTFPartition)}. */
  public StructObjectInspector getRawInputOI() {
    return rawInputOI;
  }

  protected void setRawInputOI(StructObjectInspector rawInputOI) {
    this.rawInputOI = rawInputOI;
  }

  public boolean isTransformsRawInput() {
    return transformsRawInput;
  }

  public void setTransformsRawInput(boolean transformsRawInput) {
    this.transformsRawInput = transformsRawInput;
  }

  /**
   * Batch-mode entry point: Partition in, Partition out.
   * <p>
   * On the map side this delegates to {@link #transformRawInput(PTFPartition)}; otherwise it
   * wires up lead/lag functions on the input iterator, (re)uses the cached output partition and
   * hands both to the subclass {@code execute(pItr, oPart)}.
   *
   * @param iPart the input partition
   * @return the populated output partition
   * @throws HiveException on execution failure
   */
  public PTFPartition execute(PTFPartition iPart)
      throws HiveException {
    if (ptfDesc.isMapSide()) {
      return transformRawInput(iPart);
    }
    PTFPartitionIterator<Object> pItr = iPart.iterator();
    PTFOperator.connectLeadLagFunctionsToPartition(ptfDesc, pItr);

    if (outputPartition == null) {
      outputPartition = PTFPartition.create(ptfDesc.getCfg(),
          tableDef.getOutputShape().getSerde(),
          OI, tableDef.getOutputShape().getOI());
    } else {
      // Reuse the existing partition rather than reallocating per input partition.
      outputPartition.reset();
    }

    execute(pItr, outputPartition);
    return outputPartition;
  }

  /**
   * Subclass hook for batch-mode processing: read rows from {@code pItr} and append output rows
   * to {@code oPart}.
   */
  protected abstract void execute(PTFPartitionIterator<Object> pItr, PTFPartition oPart)
      throws HiveException;

  /**
   * Map-side entry point. Only valid when {@link #isTransformsRawInput()} is true; delegates to
   * {@link #_transformRawInput(PTFPartition)}.
   *
   * @throws HiveException if this PTF declares no map phase
   */
  protected PTFPartition transformRawInput(PTFPartition iPart)
      throws HiveException {
    if (!isTransformsRawInput()) {
      throw new HiveException(String.format(
          "Internal Error: mapExecute called on function (%s)that has no Map Phase",
          tableDef.getName()));
    }
    return _transformRawInput(iPart);
  }

  /** Default raw-input transform: does nothing. Subclasses with a map phase override this. */
  protected PTFPartition _transformRawInput(PTFPartition iPart)
      throws HiveException {
    return null;
  }

  /*
   * A TableFunction may be able to provide its Output as an Iterator.
   * In case it can then for Map-side processing and for the last PTF in a Reduce-side chain
   * we can forward rows one by one. This will save the time/space to populate and read an Output
   * Partition.
   */
  public boolean canIterateOutput() {
    return false;
  }

  /**
   * Iterator-mode entry point; only meaningful when {@link #canIterateOutput()} returns true,
   * in which case the subclass must also override this method — the base implementation always
   * throws.
   */
  public Iterator<Object> iterator(PTFPartitionIterator<Object> pItr) throws HiveException {
    if (ptfDesc.isMapSide()) {
      return transformRawInputIterator(pItr);
    }
    if (!canIterateOutput()) {
      throw new HiveException(
          "Internal error: iterator called on a PTF that cannot provide its output as an Iterator");
    }
    throw new HiveException(String.format(
        "Internal error: PTF %s, provides no iterator method",
        getClass().getName()));
  }

  /** Map-side counterpart of {@link #iterator}; base implementation always throws. */
  protected Iterator<Object> transformRawInputIterator(PTFPartitionIterator<Object> pItr)
      throws HiveException {
    if (!canIterateOutput()) {
      throw new HiveException(
          "Internal error: iterator called on a PTF that cannot provide its output as an Iterator");
    }
    throw new HiveException(String.format(
        "Internal error: PTF %s, provides no iterator method",
        getClass().getName()));
  }

  /*
   * A TableFunction may be able to accept its input as a stream.
   * In this case the contract is:
   * - startPartition must be invoked to give the PTF a chance to initialize stream processing.
   * - each input row is passed in via a processRow(or processRows) invocation. processRow
   *   can return 0 or more o/p rows.
   * - finishPartition is invoked to give the PTF a chance to finish processing and return any
   *   remaining o/p rows.
   */
  public boolean canAcceptInputAsStream() {
    return canAcceptInputAsStream;
  }

  /**
   * Gives a streaming-capable subclass the chance to set up stream processing; the base
   * implementation declares streaming unsupported.
   */
  public void initializeStreaming(Configuration cfg,
      StructObjectInspector inputOI,
      boolean isMapSide) throws HiveException {
    canAcceptInputAsStream = false;
  }

  /** Streaming: called once before the rows of a partition are fed in via processRow. */
  public void startPartition() throws HiveException {
    if (!canAcceptInputAsStream()) {
      throw new HiveException(String.format(
          "Internal error: PTF %s, doesn't support Streaming",
          getClass().getName()));
    }
  }

  /**
   * Streaming: process one input row.
   *
   * @return 0 or more output rows, or null when there is nothing to emit yet
   */
  public List<Object> processRow(Object row) throws HiveException {
    if (!canAcceptInputAsStream()) {
      throw new HiveException(String.format(
          "Internal error: PTF %s, doesn't support Streaming",
          getClass().getName()));
    }
    return null;
  }

  /**
   * Streaming: called after the last row of a partition.
   *
   * @return any remaining output rows, or null
   */
  public List<Object> finishPartition() throws HiveException {
    if (!canAcceptInputAsStream()) {
      throw new HiveException(String.format(
          "Internal error: PTF %s, doesn't support Streaming",
          getClass().getName()));
    }
    return null;
  }

  /** Releases the cached output partition. Safe to call more than once. */
  public void close() {
    if (outputPartition != null) {
      outputPartition.close();
    }
    outputPartition = null;
  }
}