
com.yahoo.sketches.pig.theta.Intersect Maven / Gradle / Ivy
/*
* Copyright 2015, Yahoo! Inc.
* Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms.
*/
package com.yahoo.sketches.pig.theta;
import static com.yahoo.sketches.Util.DEFAULT_UPDATE_SEED;
import static com.yahoo.sketches.pig.theta.PigUtil.compactOrderedSketchToTuple;
import static com.yahoo.sketches.pig.theta.PigUtil.emptySketchTuple;
import static com.yahoo.sketches.pig.theta.PigUtil.extractBag;
import static com.yahoo.sketches.pig.theta.PigUtil.extractFieldAtIndex;
import static com.yahoo.sketches.pig.theta.PigUtil.extractTypeAtIndex;
import java.io.IOException;
import org.apache.pig.Accumulator;
import org.apache.pig.Algebraic;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import com.yahoo.sketches.memory.Memory;
import com.yahoo.sketches.memory.NativeMemory;
import com.yahoo.sketches.theta.CompactSketch;
import com.yahoo.sketches.theta.Intersection;
import com.yahoo.sketches.theta.SetOperation;
import com.yahoo.sketches.theta.Sketch;
/**
* This is a Pig UDF that performs the Intersection Set Operation on Sketches.
* To assist Pig, this class implements both the Accumulator and Algebraic interfaces.
*
* @author Lee Rhodes
*/
public class Intersect extends EvalFunc implements Accumulator, Algebraic {
//With the single exception of the Accumulator interface, UDFs are stateless.
//All parameters kept at the class level must be final, except for the accumUpdateSketch.
private final long seed_;
private final Tuple emptyCompactOrderedSketchTuple_;
private Intersection accumIntersection_;
//TOP LEVEL API
/**
* Default constructor to make pig validation happy. Assumes:
*
*/
public Intersect() {
this(DEFAULT_UPDATE_SEED);
}
/**
* Full string constructor.
*
* @param seedStr See Update Hash Seed.
*/
public Intersect(String seedStr) {
this(Long.parseLong(seedStr));
}
/**
* Base constructor.
*
* @param seed See Update Hash Seed.
*/
public Intersect(long seed) {
super();
this.seed_ = seed;
this.emptyCompactOrderedSketchTuple_ = emptySketchTuple(seed);
}
//@formatter:off
/************************************************************************************************
* Top-level exec function.
* This method accepts an input Tuple containing a Bag of one or more inner Sketch Tuples
* and returns a single updated Sketch as a Sketch Tuple.
*
* If a large number of calls are anticipated, leveraging either the Algebraic or
* Accumulator interfaces is recommended. Pig normally handles this automatically.
*
*
Internally, this method presents the inner Sketch Tuples to a new Intersection.
* The result is returned as a Sketch Tuple
*
*
Input Tuple
*
* - Tuple: TUPLE (Must contain only one field)
*
* - index 0: DataBag: BAG (May contain 0 or more Inner Tuples)
*
* - index 0: Tuple: TUPLE Sketch Tuple
* - ...
* - index n-1: Tuple: TUPLE Sketch Tuple
*
*
*
*
*
*
* Sketch Tuple
*
* - Tuple: TUPLE (Contains exactly 1 field)
*
* - index 0: DataByteArray: BYTEARRAY = The serialization of a Sketch object.
*
*
*
*
* @param inputTuple A tuple containing a single bag, containing Sketch Tuples.
* @return Sketch Tuple. If inputTuple is null or empty, returns empty sketch (8 bytes).
* @see "org.apache.pig.EvalFunc.exec(org.apache.pig.data.Tuple)"
*/
//@formatter:on
@Override //TOP LEVEL EXEC
public Tuple exec(Tuple inputTuple) throws IOException { //throws is in API
//The exec is a stateless function. It operates on the input and returns a result.
// It can only call static functions.
Intersection intersection = SetOperation.builder().setSeed(seed_).buildIntersection();
DataBag bag = extractBag(inputTuple);
if (bag == null) return emptyCompactOrderedSketchTuple_; //Configured with parent
updateIntersection(bag, intersection, seed_);
CompactSketch compactSketch = intersection.getResult(true, null);
return compactOrderedSketchToTuple(compactSketch);
}
@Override
public Schema outputSchema(Schema input) {
if (input != null) {
try {
Schema tupleSchema = new Schema();
tupleSchema.add(new Schema.FieldSchema("Sketch", DataType.BYTEARRAY));
return new Schema(new Schema.FieldSchema(getSchemaName(this
.getClass().getName().toLowerCase(), input), tupleSchema, DataType.TUPLE));
}
catch (FrontendException e) {
// fall through
}
}
return null;
}
//ACCUMULATOR INTERFACE
/*************************************************************************************************
* An Accumulator version of the standard exec() method. Like exec(),
* accumulator is called with a bag of Sketch Tuples. Unlike exec(), it doesn't serialize the
* sketch at the end. Instead, it can be called multiple times, each time with another bag of
* Sketch Tuples to be input to the Intersection.
*
* @param inputTuple A tuple containing a single bag, containing Sketch Tuples.
* @see #exec
* @see "org.apache.pig.Accumulator.accumulate(org.apache.pig.data.Tuple)"
* @throws IOException by Pig
*/
@Override
public void accumulate(Tuple inputTuple) throws IOException { //throws is in API
if (accumIntersection_ == null) {
accumIntersection_ = SetOperation.builder().setSeed(seed_).buildIntersection();
}
DataBag bag = extractBag(inputTuple);
if (bag == null) return;
updateIntersection(bag, accumIntersection_, seed_);
}
/**
* Returns the sketch that has been built up by multiple calls to {@link #accumulate}.
*
* @return Sketch Tuple. (see {@link #exec} for return tuple format)
* @see "org.apache.pig.Accumulator.getValue()"
*/
@Override
public Tuple getValue() {
if ((accumIntersection_ == null) || !accumIntersection_.hasResult()) {
throw new IllegalStateException(""
+ "The accumulate(Tuple) method must be called at least once with "
+ "a valid inputTuple.bag.SketchTuple prior to calling getValue().");
}
CompactSketch compactSketch = accumIntersection_.getResult(true, null);
return compactOrderedSketchToTuple(compactSketch);
}
/**
* Cleans up the UDF state after being called using the {@link Accumulator} interface.
*
* @see "org.apache.pig.Accumulator.cleanup()"
*/
@Override
public void cleanup() {
accumIntersection_ = null;
}
//ALGEBRAIC INTERFACE
/*************************************************************************************************/
@Override
public String getInitial() {
return Initial.class.getName();
}
@Override
public String getIntermed() {
return IntermediateFinal.class.getName();
}
@Override
public String getFinal() {
return IntermediateFinal.class.getName();
}
//TOP LEVEL PRIVATE STATIC METHODS
/*************************************************************************************************
* Updates an intersection from a bag of sketches
*
* @param bag A bag of sketchTuples.
* @param intersection The intersection to update
* @param seed to check against incoming sketches
*/
private static void updateIntersection(DataBag bag, Intersection intersection, long seed) {
//Bag is not empty. process each innerTuple in the bag
for (Tuple innerTuple : bag) {
//validate the inner Tuples
Object f0 = extractFieldAtIndex(innerTuple, 0);
if (f0 == null) {
continue;
}
Byte type = extractTypeAtIndex(innerTuple, 0);
// add only the first field of the innerTuple to the intersection
if (type == DataType.BYTEARRAY) {
DataByteArray dba = (DataByteArray) f0;
Memory srcMem = new NativeMemory(dba.get());
Sketch sketch = Sketch.wrap(srcMem, seed);
intersection.update(sketch);
}
else {
throw new IllegalArgumentException(
"Field type was not DataType.BYTEARRAY: " + type);
}
}
}
//STATIC Initial Class only called by Pig
/*************************************************************************************************
* Class used to calculate the initial pass of an Algebraic sketch operation.
*
*
* The Initial class simply passes through all records unchanged so that they can be
* processed by the intermediate processor instead.
*/
public static class Initial extends EvalFunc {
//The Algebraic worker classes (Initial, IntermediateFinal) are static and stateless.
//The constructors and final parameters must mirror the parent class as there is no linkage
// between them.
/**
* Default constructor to make pig validation happy.
*/
public Initial() {
this(Long.toString(DEFAULT_UPDATE_SEED));
}
/**
* Constructor for the initial pass of an Algebraic function. Pig will call this and pass the
* same constructor arguments as the original UDF. In this case the arguments are ignored.
*
* @param seedStr See Update Hash Seed.
*/
public Initial(String seedStr) {}
@Override //Initial exec
public Tuple exec(Tuple inputTuple) throws IOException { //throws is in API
return inputTuple;
}
}
// STATIC IntermediateFinal Class only called by Pig
/*************************************************************************************************
* Class used to calculate the intermediate or final combiner pass of an Algebraic intersection
* operation. This is called from the combiner, and may be called multiple times (from the mapper
* and from the reducer). It will receive a bag of values returned by either the Intermediate
* stage or the Initial stages, so it needs to be able to differentiate between and
* interpret both types.
*/
public static class IntermediateFinal extends EvalFunc {
//The Algebraic worker classes (Initial, IntermediateFinal) are static and stateless.
//The constructors and final parameters must mirror the parent class as there is no linkage
// between them.
private final long mySeed_;
private final Tuple myEmptyCompactOrderedSketchTuple_;
/**
* Default constructor to make pig validation happy. Assumes:
*
*/
public IntermediateFinal() {
this(DEFAULT_UPDATE_SEED);
}
/**
* Constructor with strings for the intermediate and final passes of an Algebraic function.
* Pig will call this and pass the same constructor arguments as the original UDF.
*
* @param seedStr See Update Hash Seed.
*/
public IntermediateFinal(String seedStr) {
this(Long.parseLong(seedStr));
}
/**
* Constructor with primitives for the intermediate and final passes of an Algebraic function.
* Pig will call this and pass the same constructor arguments as the Top Level UDF.
*
* @param seed See Update Hash Seed.
*/
public IntermediateFinal(long seed) {
this.mySeed_ = seed;
this.myEmptyCompactOrderedSketchTuple_ = emptySketchTuple(seed);
}
@Override //IntermediateFinal exec
public Tuple exec(Tuple inputTuple) throws IOException { //throws is in API
Intersection intersection = SetOperation.builder().setSeed(mySeed_).buildIntersection();
DataBag outerBag = extractBag(inputTuple); //InputTuple.bag0
if (outerBag == null) { //must have non-empty outer bag at field 0.
return myEmptyCompactOrderedSketchTuple_;
}
//Bag is not empty.
for (Tuple dataTuple : outerBag) {
Object f0 = extractFieldAtIndex(dataTuple, 0); //inputTuple.bag0.dataTupleN.f0
//must have non-null field zero
if (f0 == null) {
continue; //go to next dataTuple if there is one. If none, exception is thrown.
}
//f0 is not null
if (f0 instanceof DataBag) {
DataBag innerBag = (DataBag)f0; //inputTuple.bag0.dataTupleN.f0:bag
if (innerBag.size() == 0) {
continue; //go to next dataTuple if there is one. If none, exception is thrown.
}
//If field 0 of a dataTuple is again a Bag all tuples of this inner bag
// will be passed into the union.
//It is due to system bagged outputs from multiple mapper Initial functions.
//The Intermediate stage was bypassed.
updateIntersection(innerBag, intersection, mySeed_); //process all tuples of innerBag
}
else if (f0 instanceof DataByteArray) { //inputTuple.bag0.dataTupleN.f0:DBA
//If field 0 of a dataTuple is a DataByteArray we assume it is a sketch from a prior call
//It is due to system bagged outputs from multiple mapper Intermediate functions.
// Each dataTuple.DBA:sketch will merged into the union.
DataByteArray dba = (DataByteArray) f0;
Memory srcMem = new NativeMemory(dba.get());
Sketch sketch = Sketch.wrap(srcMem, mySeed_);
intersection.update(sketch);
}
else { // we should never get here.
throw new IllegalArgumentException("dataTuple.Field0: Is not a DataByteArray: "
+ f0.getClass().getName());
}
}
CompactSketch compactSketch = intersection.getResult(true, null);
return compactOrderedSketchToTuple(compactSketch);
}
} //End IntermediateFinal
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy