// com.yahoo.sketches.pig.quantiles.DataToItemsSketch (Maven / Gradle / Ivy)
/*
* Copyright 2016, Yahoo! Inc.
* Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms.
*/
package com.yahoo.sketches.pig.quantiles;
import java.io.IOException;
import java.util.Comparator;
import org.apache.pig.Accumulator;
import org.apache.pig.Algebraic;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import com.yahoo.memory.Memory;
import com.yahoo.sketches.ArrayOfItemsSerDe;
import com.yahoo.sketches.quantiles.ItemsSketch;
import com.yahoo.sketches.quantiles.ItemsUnion;
/**
 * Builds ItemsSketch from data.
 * To assist Pig, this class implements both the Accumulator and Algebraic interfaces.
 * @param <T> type of item
 */
public abstract class DataToItemsSketch<T> extends EvalFunc<Tuple>
    implements Accumulator<Tuple>, Algebraic {

  // Shared factory used to wrap serialized sketches into result tuples.
  private static final TupleFactory tupleFactory_ = TupleFactory.getInstance();

  // With the single exception of the Accumulator interface, UDFs are stateless.
  // All parameters kept at the class level must be final, except for the accumUnion.
  private final int k_;
  private final Comparator<T> comparator_;
  private final ArrayOfItemsSerDe<T> serDe_;
  // Created lazily by accumulate() and dropped again by cleanup().
  private ItemsUnion<T> accumUnion_;

  // TOP LEVEL API

  /**
   * Base constructor.
   *
   * @param k parameter that determines the accuracy and size of the sketch.
   * The value of 0 means the default k, whatever it is in the sketches-core library
   * @param comparator for items of type T
   * @param serDe an instance of ArrayOfItemsSerDe for type T
   */
  public DataToItemsSketch(final int k, final Comparator<T> comparator,
      final ArrayOfItemsSerDe<T> serDe) {
    super();
    k_ = k;
    comparator_ = comparator;
    serDe_ = serDe;
  }
//@formatter:off
/**
 * Top-level exec function.
 * This method accepts an input Tuple containing a Bag of one or more inner <b>Datum Tuples</b>
 * and returns a single <b>Sketch</b> as a <b>Sketch Tuple</b>.
 *
 * <p>If a large number of calls is anticipated, leveraging either the <i>Algebraic</i> or
 * <i>Accumulator</i> interfaces is recommended. Pig normally handles this automatically.
 *
 * <p>Internally, this method presents the inner <b>Datum Tuples</b> to a new <b>Union</b>,
 * which is returned as a <b>Sketch Tuple</b>
 *
 * <p>Types below are in the form: Java data type: Pig DataType
 *
 * <p><b>Input Tuple</b>
 * <ul>
 *   <li>Tuple: TUPLE (Must contain only one field)
 *     <ul>
 *       <li>index 0: DataBag: BAG (May contain 0 or more Inner Tuples)
 *         <ul>
 *           <li>index 0: Tuple: TUPLE <b>Datum Tuple</b></li>
 *           <li>...</li>
 *           <li>index n-1: Tuple: TUPLE <b>Datum Tuple</b></li>
 *         </ul>
 *       </li>
 *     </ul>
 *   </li>
 * </ul>
 *
 * <b>Datum Tuple</b>
 * <ul>
 *   <li>Tuple: TUPLE (Must contain only one field)
 *     <ul>
 *       <li>index 0: T: some suitable Pig type convertible to T</li>
 *     </ul>
 *   </li>
 * </ul>
 *
 * <b>Sketch Tuple</b>
 * <ul>
 *   <li>Tuple: TUPLE (Contains exactly 1 field)
 *     <ul>
 *       <li>index 0: DataByteArray: BYTEARRAY = a serialized ItemsSketch object.</li>
 *     </ul>
 *   </li>
 * </ul>
 *
 * @param inputTuple A tuple containing a single bag, containing Datum Tuples.
 * @return Sketch Tuple. If inputTuple is null or empty, or its bag field is null,
 * returns empty sketch.
 * @see "org.apache.pig.EvalFunc.exec(org.apache.pig.data.Tuple)"
 * @throws IOException from Pig.
 */
// @formatter:on
@Override // TOP LEVEL EXEC
public Tuple exec(final Tuple inputTuple) throws IOException {
  // The exec is a stateless function. It operates on the input and returns a result.
  if (inputTuple != null && inputTuple.size() > 0) {
    final DataBag bag = (DataBag) inputTuple.get(0);
    // A null bag field is treated like missing input, the same way accumulate() treats it,
    // instead of failing with a NullPointerException in the loop below.
    if (bag != null) {
      final ItemsUnion union = k_ > 0
          ? ItemsUnion.getInstance(k_, comparator_)
          : ItemsUnion.getInstance(comparator_);
      for (final Tuple innerTuple : bag) {
        union.update(extractValue(innerTuple.get(0)));
      }
      final ItemsSketch resultSketch = union.getResultAndReset();
      if (resultSketch != null) {
        return tupleFactory_.newTuple(new DataByteArray(resultSketch.toByteArray(serDe_)));
      }
    }
  }
  // No usable input, or the union produced no result: return empty sketch
  final ItemsSketch sketch = k_ > 0
      ? ItemsSketch.getInstance(k_, comparator_)
      : ItemsSketch.getInstance(comparator_);
  return tupleFactory_.newTuple(new DataByteArray(sketch.toByteArray(serDe_)));
}
/**
 * Describes the output of this UDF: a TUPLE field holding a single BYTEARRAY field
 * named "Sketch". The outer field name is obtained from getSchemaName using the
 * lower-cased class name and the input schema.
 *
 * @param input schema of the input; may be null
 * @return the output schema, or null if input is null
 * @throws RuntimeException wrapping a FrontendException raised while building the schema
 */
@Override
public Schema outputSchema(final Schema input) {
  if (input == null) { return null; }
  try {
    final Schema tupleSchema = new Schema();
    tupleSchema.add(new Schema.FieldSchema("Sketch", DataType.BYTEARRAY));
    // Lower-case with a fixed locale so the generated name does not depend on the
    // default locale of the JVM running the script.
    final String udfName = this.getClass().getName().toLowerCase(java.util.Locale.ROOT);
    return new Schema(new Schema.FieldSchema(
        getSchemaName(udfName, input), tupleSchema, DataType.TUPLE));
  } catch (final FrontendException e) {
    throw new RuntimeException(e);
  }
}
// ACCUMULATOR INTERFACE
/**
 * Accumulator counterpart of exec(). Each call presents one more bag of Datum Tuples
 * to the union kept by this instance; nothing is serialized here. The union is created
 * on the first call that carries a non-null bag and is reused by later calls.
 *
 * @param inputTuple A tuple containing a single bag, containing Datum Tuples.
 * A null or empty tuple, or a null bag, leaves the state untouched.
 * @see #exec
 * @see "org.apache.pig.Accumulator.accumulate(org.apache.pig.data.Tuple)"
 * @throws IOException by Pig
 */
@Override
public void accumulate(final Tuple inputTuple) throws IOException {
  final boolean hasField = inputTuple != null && inputTuple.size() != 0;
  if (!hasField) {
    return;
  }
  final DataBag values = (DataBag) inputTuple.get(0);
  if (values == null) {
    return;
  }
  // Lazily create the union; k_ <= 0 selects the library default size.
  if (accumUnion_ == null) {
    if (k_ > 0) {
      accumUnion_ = ItemsUnion.getInstance(k_, comparator_);
    } else {
      accumUnion_ = ItemsUnion.getInstance(comparator_);
    }
  }
  for (final Tuple datum : values) {
    accumUnion_.update(extractValue(datum.get(0)));
  }
}
/**
 * Serializes whatever the union has collected through calls to {@link #accumulate}
 * and resets that union. When there is no union yet, or it yields no result,
 * an empty sketch is serialized instead.
 *
 * @return Sketch Tuple. (see {@link #exec} for return tuple format)
 * @see "org.apache.pig.Accumulator.getValue()"
 */
@Override
public Tuple getValue() {
  ItemsSketch outSketch = null;
  if (accumUnion_ != null) {
    outSketch = accumUnion_.getResultAndReset();
  }
  if (outSketch == null) {
    // nothing to report: fall back to an empty sketch
    if (k_ > 0) {
      outSketch = ItemsSketch.getInstance(k_, comparator_);
    } else {
      outSketch = ItemsSketch.getInstance(comparator_);
    }
  }
  final DataByteArray serialized = new DataByteArray(outSketch.toByteArray(serDe_));
  return tupleFactory_.newTuple(serialized);
}
/**
 * Drops the union held for the {@link Accumulator} interface so that the next
 * call to accumulate starts from a fresh one.
 *
 * @see "org.apache.pig.Accumulator.cleanup()"
 */
@Override
public void cleanup() {
  this.accumUnion_ = null;
}
/**
 * Converts a Pig object into an item of type T. The default is a plain cast;
 * subclasses override this when the conversion needs more than that.
 * @param object Pig object, which needs to be converted to type T
 * @return value of type T
 */
@SuppressWarnings("unchecked")
protected T extractValue(final Object object) {
  final T item = (T) object;
  return item;
}
// STATIC Initial Class only called by Pig
/**
 * Class used to calculate the initial pass of an Algebraic sketch operation.
 *
 * <p>The Initial class simply passes through all records unchanged so that they can be
 * processed by the intermediate processor instead.
 */
// Parameterized with Tuple to match the declared return type of exec().
// NOTE(review): Pig is understood to derive the UDF return type from this type argument;
// confirm against the Pig version in use.
public static class DataToItemsSketchInitial extends EvalFunc<Tuple> {
  // The Algebraic worker classes (Initial, IntermediateFinal) are static and stateless.
  // The constructors must mirror the main UDF class

  /**
   * Default constructor.
   */
  public DataToItemsSketchInitial() {}

  /**
   * Constructor with specific k. The argument is accepted only so that the signature
   * mirrors the main UDF; it is not used by this pass-through stage.
   * @param kStr string representation of k
   */
  public DataToItemsSketchInitial(final String kStr) {}

  /**
   * Returns the input unchanged.
   * @param inputTuple the tuple handed over by Pig
   * @return the same inputTuple instance
   * @throws IOException declared by the EvalFunc API
   */
  @Override
  public Tuple exec(final Tuple inputTuple) throws IOException {
    return inputTuple;
  }
}
// STATIC IntermediateFinal Class only called by Pig
/**
 * Class used to calculate the intermediate or final combiner pass of an Algebraic sketch
 * operation. This is called from the combiner, and may be called multiple times (from the mapper
 * and from the reducer). It will receive a bag of values returned by either the Intermediate
 * stage or the Initial stages, so it needs to be able to differentiate between and
 * interpret both types.
 * @param <T> type of item
 */
public static abstract class DataToItemsSketchIntermediateFinal<T> extends EvalFunc<Tuple> {
  // The Algebraic worker classes (Initial, IntermediateFinal) are static and stateless.
  // The constructors of the concrete class must mirror the ones in the main UDF class
  private final int k_;
  private final Comparator<T> comparator_;
  private final ArrayOfItemsSerDe<T> serDe_;

  /**
   * Constructor for the intermediate and final passes of an Algebraic function.
   *
   * @param k parameter that determines the accuracy and size of the sketch.
   * A value that is not positive selects the library default.
   * @param comparator for items of type T
   * @param serDe an instance of ArrayOfItemsSerDe for type T
   */
  public DataToItemsSketchIntermediateFinal(
      final int k, final Comparator<T> comparator, final ArrayOfItemsSerDe<T> serDe) {
    super();
    k_ = k;
    comparator_ = comparator;
    serDe_ = serDe;
  }

  /**
   * Override this if it takes more than a cast to convert from Pig type to type T
   * @param object Pig object, which needs to be converted to type T
   * @return value of type T
   */
  @SuppressWarnings("unchecked")
  protected T extractValue(final Object object) {
    return (T) object;
  }

  /**
   * Merges everything found in the bag at field 0 of the input into one sketch.
   * Each tuple of that bag carries in its field 0 either a bag of Datum Tuples
   * (raw values) or a DataByteArray (a serialized sketch); null fields are skipped.
   *
   * @param inputTuple a tuple whose field 0 is a bag of such tuples
   * @return a tuple with the serialized merged sketch; an empty sketch if inputTuple is
   * null or empty, its bag is null, or the union yields no result
   * @throws IllegalArgumentException if a field 0 is neither a DataBag nor a DataByteArray
   * @throws IOException declared by the EvalFunc API
   */
  @Override // IntermediateFinal exec
  public Tuple exec(final Tuple inputTuple) throws IOException { //throws is in API
    if (inputTuple != null && inputTuple.size() > 0) {
      final DataBag outerBag = (DataBag) inputTuple.get(0);
      // A null bag is handled like missing input rather than failing in the loop.
      if (outerBag != null) {
        final ItemsUnion<T> union = k_ > 0
            ? ItemsUnion.getInstance(k_, comparator_)
            : ItemsUnion.getInstance(comparator_);
        for (final Tuple dataTuple : outerBag) {
          final Object f0 = dataTuple.get(0);
          if (f0 == null) { continue; }
          if (f0 instanceof DataBag) {
            final DataBag innerBag = (DataBag) f0; // inputTuple.bag0.dataTupleN.f0:bag
            if (innerBag.size() == 0) { continue; }
            // If field 0 of a dataTuple is a Bag all innerTuples of this inner bag
            // will be passed into the union.
            // It is due to system bagged outputs from multiple mapper Initial functions.
            // The Intermediate stage was bypassed.
            for (final Tuple innerTuple : innerBag) {
              union.update(extractValue(innerTuple.get(0)));
            }
          } else if (f0 instanceof DataByteArray) { // inputTuple.bag0.dataTupleN.f0:DBA
            // If field 0 of a dataTuple is a DataByteArray we assume it is a sketch
            // due to system bagged outputs from multiple mapper Intermediate functions.
            // Each dataTuple.DBA:sketch will be merged into the union.
            final DataByteArray dba = (DataByteArray) f0;
            union.update(ItemsSketch.getInstance(Memory.wrap(dba.get()), comparator_, serDe_));
          } else {
            throw new IllegalArgumentException(
                "dataTuple.Field0: Is neither a DataBag nor a DataByteArray: "
                + f0.getClass().getName());
          }
        }
        final ItemsSketch<T> resultSketch = union.getResultAndReset();
        if (resultSketch != null) {
          return tupleFactory_.newTuple(new DataByteArray(resultSketch.toByteArray(serDe_)));
        }
      }
    }
    // return empty sketch
    final ItemsSketch<T> sketch = k_ > 0
        ? ItemsSketch.getInstance(k_, comparator_)
        : ItemsSketch.getInstance(comparator_);
    return tupleFactory_.newTuple(new DataByteArray(sketch.toByteArray(serDe_)));
  }
} // end IntermediateFinal
}