All downloads are free. Search and download functionality uses the official Maven repository.

com.yahoo.sketches.pig.theta.Merge Maven / Gradle / Ivy

/*
 * Copyright 2015, Yahoo! Inc.
 * Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms.
 */
package com.yahoo.sketches.pig.theta;

import static com.yahoo.sketches.Util.DEFAULT_NOMINAL_ENTRIES;
import static com.yahoo.sketches.Util.DEFAULT_UPDATE_SEED;
import static com.yahoo.sketches.Util.checkIfPowerOf2;
import static com.yahoo.sketches.Util.checkProbability;
import static com.yahoo.sketches.pig.theta.PigUtil.compactOrderedSketchToTuple;
import static com.yahoo.sketches.pig.theta.PigUtil.emptySketchTuple;
import static com.yahoo.sketches.pig.theta.PigUtil.extractBag;
import static com.yahoo.sketches.pig.theta.PigUtil.extractFieldAtIndex;
import static com.yahoo.sketches.pig.theta.PigUtil.extractTypeAtIndex;
import static com.yahoo.sketches.pig.theta.PigUtil.RF;

import java.io.IOException;

import org.apache.pig.Accumulator;
import org.apache.pig.Algebraic;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;

import com.yahoo.sketches.Util;
import com.yahoo.sketches.memory.Memory;
import com.yahoo.sketches.memory.NativeMemory;
import com.yahoo.sketches.theta.CompactSketch;
import com.yahoo.sketches.theta.SetOperation;
import com.yahoo.sketches.theta.Union;


/**
 * This is a Pig UDF that performs the Union Set Operation on Sketches. 
 * To assist Pig, this class implements both the Accumulator and Algebraic interfaces.
 * 
 * @author Lee Rhodes
 */
public class Merge extends EvalFunc implements Accumulator, Algebraic {
  //With the single exception of the Accumulator interface, UDFs are stateless.
  //All parameters kept at the class level must be final, except for the accumUpdateSketch.
  private final int nomEntries_;
  private final float p_;
  private final long seed_;
  private final Tuple emptyCompactOrderedSketchTuple_;
  private Union accumUnion_;
  
  //TOP LEVEL API
  
  /**
   * Default constructor to make pig validation happy.  Assumes:
   * 
   */
  public Merge() {
    this(DEFAULT_NOMINAL_ENTRIES, (float)(1.0), DEFAULT_UPDATE_SEED);
  }
  
  /**
   * String constructor. Assumes:
   * 
   * 
   * @param nomEntriesStr See Nominal Entries
   */
  public Merge(String nomEntriesStr) {
    this(Integer.parseInt(nomEntriesStr), (float)(1.0), DEFAULT_UPDATE_SEED);
  }
  
  /**
   * String constructor. Assumes:
   * 
   * 
   * @param nomEntriesStr See Nominal Entries
   * @param pStr See Sampling Probability, p.
   * Although this functionality is implemented for SketchUnions, it rarely makes sense to use it 
   * here. The proper use of upfront sampling is when building the sketches.
   */
  public Merge(String nomEntriesStr, String pStr) {
    this(Integer.parseInt(nomEntriesStr), Float.parseFloat(pStr), DEFAULT_UPDATE_SEED);
  }
  
  /**
   * Full string constructor.
   * 
   * @param nomEntriesStr See Nominal Entries.
   * @param pStr See Sampling Probability, p. 
   * Although this functionality is implemented for SketchUnions, it rarely makes sense to use it 
   * here. The proper use of upfront sampling is when building the sketches.
   * @param seedStr  See Update Hash Seed.
   */
  public Merge(String nomEntriesStr, String pStr, String seedStr) {
    this(Integer.parseInt(nomEntriesStr), Float.parseFloat(pStr), Long.parseLong(seedStr));
  }
  
  /**
   * Base constructor.
   * 
   * @param nomEntries See Nominal Entries.
   * @param p See Sampling Probability, p.
   * Although this functionality is implemented for SketchUnions, it rarely makes sense to use it 
   * here. The proper use of upfront sampling is when building the sketches.
   * @param seed  See Update Hash Seed.
   */
  public Merge(int nomEntries, float p, long seed) {
    super();
    this.nomEntries_ = nomEntries;
    this.p_ = p;
    this.seed_ = seed;
    this.emptyCompactOrderedSketchTuple_ = emptySketchTuple(seed);
    //Catch these errors during construction, don't wait for the exec to be called.
    checkIfPowerOf2(nomEntries, "nomEntries");
    checkProbability(p, "p");
    if (nomEntries < (1 << Util.MIN_LG_NOM_LONGS)) {
      throw new IllegalArgumentException("NomEntries too small: "+nomEntries+
          ", required: "+(1 << Util.MIN_LG_NOM_LONGS));
    }
  }
  
  //@formatter:off
  /************************************************************************************************
   * Top-level exec function.
   * This method accepts an input Tuple containing a Bag of one or more inner Sketch Tuples
   * and returns a single updated Sketch as a Sketch Tuple.
   * 
   * 

* If a large number of calls are anticipated, leveraging either the Algebraic or * Accumulator interfaces is recommended. Pig normally handles this automatically. * *

* Internally, this method presents the inner Sketch Tuples to a new Union. * The result is returned as a Sketch Tuple * *

* * Input Tuple *

    *
  • Tuple: TUPLE (Must contain only one field) *
      *
    • index 0: DataBag: BAG (May contain 0 or more Inner Tuples) *
        *
      • index 0: Tuple: TUPLE Sketch Tuple
      • *
      • ...
      • *
      • index n-1: Tuple: TUPLE Sketch Tuple
      • *
      *
    • *
    *
  • *
* * Sketch Tuple *
    *
  • Tuple: TUPLE (Contains exactly 1 field) *
      *
    • index 0: DataByteArray: BYTEARRAY = The serialization of a Sketch object.
    • *
    *
  • *
* * @param inputTuple A tuple containing a single bag, containing Sketch Tuples. * @return Sketch Tuple. If inputTuple is null or empty, returns empty sketch (8 bytes). * @see "org.apache.pig.EvalFunc.exec(org.apache.pig.data.Tuple)" */ //@formatter:on @Override //TOP LEVEL EXEC public Tuple exec(Tuple inputTuple) throws IOException { //throws is in API //The exec is a stateless function. It operates on the input and returns a result. // It can only call static functions. Union union = SetOperation.builder().setP(p_).setSeed(seed_).setResizeFactor(RF). buildUnion(nomEntries_); DataBag bag = extractBag(inputTuple); if (bag == null) { return emptyCompactOrderedSketchTuple_; //Configured with parent } updateUnion(bag, union); CompactSketch compactSketch = union.getResult(true, null); return compactOrderedSketchToTuple(compactSketch); } @Override public Schema outputSchema(Schema input) { if (input != null) { try { Schema tupleSchema = new Schema(); tupleSchema.add(new Schema.FieldSchema("Sketch", DataType.BYTEARRAY)); return new Schema(new Schema.FieldSchema(getSchemaName(this .getClass().getName().toLowerCase(), input), tupleSchema, DataType.TUPLE)); } catch (FrontendException e) { // fall through } } return null; } //ACCUMULATOR INTERFACE /************************************************************************************************* * An Accumulator version of the standard exec() method. Like exec(), * accumulator is called with a bag of Sketch Tuples. Unlike exec(), it doesn't serialize the * sketch at the end. Instead, it can be called multiple times, each time with another bag of * Sketch Tuples to be input to the Union. * * @param inputTuple A tuple containing a single bag, containing Sketch Tuples. 
* @see #exec * @see "org.apache.pig.Accumulator.accumulate(org.apache.pig.data.Tuple)" * @throws IOException by Pig */ @Override public void accumulate(Tuple inputTuple) throws IOException { //throws is in API if (accumUnion_ == null) { accumUnion_ = SetOperation.builder().setP(p_).setSeed(seed_).setResizeFactor(RF). buildUnion(nomEntries_); } DataBag bag = extractBag(inputTuple); if (bag == null) return; updateUnion(bag, accumUnion_); } /** * Returns the sketch that has been built up by multiple calls to {@link #accumulate}. * * @return Sketch Tuple. (see {@link #exec} for return tuple format) * @see "org.apache.pig.Accumulator.getValue()" */ @Override public Tuple getValue() { if (accumUnion_ == null) return emptyCompactOrderedSketchTuple_; //Configured with parent CompactSketch compactSketch = accumUnion_.getResult(true, null); return compactOrderedSketchToTuple(compactSketch); } /** * Cleans up the UDF state after being called using the {@link Accumulator} interface. * * @see "org.apache.pig.Accumulator.cleanup()" */ @Override public void cleanup() { accumUnion_ = null; } //ALGEBRAIC INTERFACE /*************************************************************************************************/ @Override public String getInitial() { return Initial.class.getName(); } @Override public String getIntermed() { return IntermediateFinal.class.getName(); } @Override public String getFinal() { return IntermediateFinal.class.getName(); } //TOP LEVEL PRIVATE STATIC METHODS /************************************************************************************************* * Updates a union from a bag of sketches * * @param bag A bag of sketchTuples. * @param union The union to update */ private static void updateUnion(DataBag bag, Union union) { //Bag is not empty. 
process each innerTuple in the bag for (Tuple innerTuple : bag) { //validate the inner Tuples Object f0 = extractFieldAtIndex(innerTuple, 0); if (f0 == null) { continue; } Byte type = extractTypeAtIndex(innerTuple, 0); if (type == null) { continue; } // add only the first field of the innerTuple to the union if (type == DataType.BYTEARRAY) { DataByteArray dba = (DataByteArray) f0; if (dba.size() > 0) { union.update(new NativeMemory(dba.get())); } } else { throw new IllegalArgumentException( "Field type was not DataType.BYTEARRAY: "+type); } } } //STATIC Initial Class only called by Pig /************************************************************************************************* * Class used to calculate the initial pass of an Algebraic sketch operation. * *

* The Initial class simply passes through all records unchanged so that they can be * processed by the intermediate processor instead.

*/ public static class Initial extends EvalFunc { //The Algebraic worker classes (Initial, IntermediateFinal) are static and stateless. //The constructors and final parameters must mirror the parent class as there is no linkage // between them. /** * Default constructor to make pig validation happy. */ public Initial() { this(Integer.toString(DEFAULT_NOMINAL_ENTRIES), "1.0", Long.toString(DEFAULT_UPDATE_SEED)); } /** * Constructor for the initial pass of an Algebraic function. Pig will call this and pass the * same constructor arguments as the original UDF. In this case the arguments are ignored. * * @param nomEntriesStr See Nominal Entries. */ public Initial(String nomEntriesStr) { this(nomEntriesStr, "1.0", Long.toString(DEFAULT_UPDATE_SEED)); } /** * Constructor for the initial pass of an Algebraic function. Pig will call this and pass the * same constructor arguments as the original UDF. In this case the arguments are ignored. * * @param nomEntriesStr See Nominal Entries. * @param pStr See Sampling Probability, p. * Although this functionality is implemented for SketchUnions, it rarely makes sense to use it * here. The proper use of upfront sampling is when building the sketches. */ public Initial(String nomEntriesStr, String pStr) { this(nomEntriesStr, pStr, Long.toString(DEFAULT_UPDATE_SEED)); } /** * Constructor for the initial pass of an Algebraic function. Pig will call this and pass the * same constructor arguments as the original UDF. In this case the arguments are ignored. * * @param nomEntriesStr See Nominal Entries. * @param pStr See Sampling Probability, p. * Although this functionality is implemented for SketchUnions, it rarely makes sense to use it * here. The proper use of upfront sampling is when building the sketches. * @param seedStr See Update Hash Seed. 
*/ public Initial(String nomEntriesStr, String pStr, String seedStr) {} @Override //Initial exec public Tuple exec(Tuple inputTuple) throws IOException { //throws is in API return inputTuple; } } // STATIC IntermediateFinal Class only called by Pig /************************************************************************************************* * Class used to calculate the intermediate or final combiner pass of an Algebraic union * operation. This is called from the combiner, and may be called multiple times (from the mapper * and from the reducer). It will receive a bag of values returned by either the Intermediate * stage or the Initial stages, so it needs to be able to differentiate between and * interpret both types. */ public static class IntermediateFinal extends EvalFunc { //The Algebraic worker classes (Initial, IntermediateFinal) are static and stateless. //The constructors and final parameters must mirror the parent class as there is no linkage // between them. private final int myNomEntries_; private final float myP_; private final long mySeed_; private final Tuple myEmptyCompactOrderedSketchTuple_; /** * Default constructor to make pig validation happy. Assumes: * */ public IntermediateFinal() { this(Integer.toString(DEFAULT_NOMINAL_ENTRIES), "1.0", Long.toString(DEFAULT_UPDATE_SEED)); } /** * Constructor for the intermediate and final passes of an Algebraic function. Pig will call * this and pass the same constructor arguments as the base UDF. Assumes: * * * @param nomEntriesStr See Nominal Entries. */ public IntermediateFinal(String nomEntriesStr) { this(nomEntriesStr, "1.0", Long.toString(DEFAULT_UPDATE_SEED)); } /** * Constructor for the intermediate and final passes of an Algebraic function. Pig will call * this and pass the same constructor arguments as the base UDF. Assumes: * * * @param nomEntriesStr See Nominal Entries. * @param pStr See Sampling Probability, p. 
*/ public IntermediateFinal(String nomEntriesStr, String pStr) { this(nomEntriesStr, pStr, Long.toString(DEFAULT_UPDATE_SEED)); } /** * Constructor with strings for the intermediate and final passes of an Algebraic function. * Pig will call this and pass the same constructor arguments as the original UDF. * * @param nomEntriesStr See Nominal Entries. * @param pStr See Sampling Probability, p. * @param seedStr See Update Hash Seed. */ public IntermediateFinal(String nomEntriesStr, String pStr, String seedStr) { this(Integer.parseInt(nomEntriesStr), Float.parseFloat(pStr), Long.parseLong(seedStr)); } /** * Constructor with primitives for the intermediate and final passes of an Algebraic function. * Pig will call this and pass the same constructor arguments as the Top Level UDF. * * @param nomEntries See Nominal Entries. * @param p See Sampling Probability, p. * @param seed See Update Hash Seed. */ public IntermediateFinal(int nomEntries, float p, long seed) { this.myNomEntries_ = nomEntries; this.myP_ = p; this.mySeed_ = seed; this.myEmptyCompactOrderedSketchTuple_ = emptySketchTuple(seed); } @Override //IntermediateFinal exec public Tuple exec(Tuple inputTuple) throws IOException { //throws is in API Union union = SetOperation.builder().setP(myP_).setSeed(mySeed_).setResizeFactor(RF). buildUnion(myNomEntries_); DataBag outerBag = extractBag(inputTuple); //InputTuple.bag0 if (outerBag == null) { //must have non-empty outer bag at field 0. return myEmptyCompactOrderedSketchTuple_; } //Bag is not empty. for (Tuple dataTuple : outerBag) { Object f0 = extractFieldAtIndex(dataTuple, 0); //inputTuple.bag0.dataTupleN.f0 //must have non-null field zero if (f0 == null) { continue; //go to next dataTuple if there is one. If none, exception is thrown. } //f0 is not null if (f0 instanceof DataBag) { DataBag innerBag = (DataBag)f0; //inputTuple.bag0.dataTupleN.f0:bag if (innerBag.size() == 0) { continue; //go to next dataTuple if there is one. If none, exception is thrown. 
} //If field 0 of a dataTuple is again a Bag all tuples of this inner bag // will be passed into the union. //It is due to system bagged outputs from multiple mapper Initial functions. //The Intermediate stage was bypassed. updateUnion(innerBag, union); //process all tuples of innerBag } else if (f0 instanceof DataByteArray) { //inputTuple.bag0.dataTupleN.f0:DBA //If field 0 of a dataTuple is a DataByteArray we assume it is a sketch from a prior call //It is due to system bagged outputs from multiple mapper Intermediate functions. // Each dataTuple.DBA:sketch will merged into the union. DataByteArray dba = (DataByteArray) f0; Memory srcMem = new NativeMemory(dba.get()); union.update(srcMem); } else { // we should never get here. throw new IllegalArgumentException("dataTuple.Field0: Is not a DataByteArray: " + f0.getClass().getName()); } } CompactSketch compactSketch = union.getResult(true, null); return compactOrderedSketchToTuple(compactSketch); } } //End IntermediateFinal }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy