All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.yahoo.sketches.pig.theta.DataToSketch Maven / Gradle / Ivy

There is a newer version: 0.13.0
Show newest version
/*
 * Copyright 2016, Yahoo! Inc.
 * Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms.
 */

package com.yahoo.sketches.pig.theta;

import static com.yahoo.sketches.Util.DEFAULT_NOMINAL_ENTRIES;
import static com.yahoo.sketches.Util.DEFAULT_UPDATE_SEED;
import static com.yahoo.sketches.Util.checkIfPowerOf2;
import static com.yahoo.sketches.Util.checkProbability;
import static com.yahoo.sketches.pig.theta.PigUtil.RF;
import static com.yahoo.sketches.pig.theta.PigUtil.compactOrderedSketchToTuple;
import static com.yahoo.sketches.pig.theta.PigUtil.emptySketchTuple;
import static com.yahoo.sketches.pig.theta.PigUtil.extractBag;
import static com.yahoo.sketches.pig.theta.PigUtil.extractFieldAtIndex;
import static com.yahoo.sketches.pig.theta.PigUtil.extractTypeAtIndex;

import java.io.IOException;

import org.apache.pig.Accumulator;
import org.apache.pig.Algebraic;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;

import com.yahoo.memory.Memory;
import com.yahoo.sketches.Util;
import com.yahoo.sketches.theta.CompactSketch;
import com.yahoo.sketches.theta.SetOperation;
import com.yahoo.sketches.theta.Union;

/**
 * This is a Pig UDF that builds Sketches from data.
 * To assist Pig, this class implements both the Accumulator and Algebraic interfaces.
 *
 * @author Lee Rhodes
 */
public class DataToSketch extends EvalFunc implements Accumulator, Algebraic {
  //With the single exception of the Accumulator interface, UDFs are stateless.
  //All parameters kept at the class level must be final, except for the accumUnion.
  private final int nomEntries_;
  private final float p_;
  private final long seed_;
  private final Tuple emptyCompactOrderedSketchTuple_;
  private Union accumUnion_;

  //TOP LEVEL API

  /**
   * Default constructor. Assumes:
   * 
   */
  public DataToSketch() {
    this(DEFAULT_NOMINAL_ENTRIES, (float)(1.0), DEFAULT_UPDATE_SEED);
  }

  /**
   * String constructor. Assumes:
   * 
   *
   * @param nomEntriesStr See Nominal Entries
   */
  public DataToSketch(final String nomEntriesStr) {
    this(Integer.parseInt(nomEntriesStr), (float)(1.0), DEFAULT_UPDATE_SEED);
  }

  /**
   * String constructor. Assumes:
   * 
   *
   * @param nomEntriesStr See Nominal Entries
   * @param pStr See Sampling Probability, p
   */
  public DataToSketch(final String nomEntriesStr, final String pStr) {
    this(Integer.parseInt(nomEntriesStr), Float.parseFloat(pStr), DEFAULT_UPDATE_SEED);
  }

  /**
   * Full string constructor.
   *
   * @param nomEntriesStr See Nominal Entries.
   * @param pStr See Sampling Probability, p.
   * @param seedStr  See Update Hash Seed.
   */
  public DataToSketch(final String nomEntriesStr, final String pStr, final String seedStr) {
    this(Integer.parseInt(nomEntriesStr), Float.parseFloat(pStr), Long.parseLong(seedStr));
  }

  /**
   * Base constructor.
   *
   * @param nomEntries See Nominal Entries.
   * @param p See Sampling Probability, p.
   * @param seed  See Update Hash Seed.
   */
  public DataToSketch(final int nomEntries, final float p, final long seed) {
    super();
    this.nomEntries_ = nomEntries;
    this.p_ = p;
    this.seed_ = seed;
    this.emptyCompactOrderedSketchTuple_ = emptySketchTuple(seed);
    //Catch these errors during construction, don't wait for the exec to be called.
    checkIfPowerOf2(nomEntries, "nomEntries");
    checkProbability(p, "p");
    if (nomEntries < (1 << Util.MIN_LG_NOM_LONGS)) {
      throw new IllegalArgumentException("NomEntries too small: " + nomEntries
          + ", required: " + (1 << Util.MIN_LG_NOM_LONGS));
    }
  }

  //@formatter:off
  /*************************************************************************************************
   * Top-level exec function.
   * This method accepts an input Tuple containing a Bag of one or more inner Datum Tuples
   * and returns a single updated Sketch as a Sketch Tuple.
   *
   * 

If a large number of calls is anticipated, leveraging either the Algebraic or * Accumulator interfaces is recommended. Pig normally handles this automatically. * *

Internally, this method presents the inner Datum Tuples to a new Sketch, * which is returned as a Sketch Tuple * *

Input Tuple *

    *
  • Tuple: TUPLE (Must contain only one field) *
      *
    • index 0: DataBag: BAG (May contain 0 or more Inner Tuples) *
        *
      • index 0: Tuple: TUPLE Datum Tuple
      • *
      • ...
      • *
      • index n-1: Tuple: TUPLE Datum Tuple
      • *
      *
    • *
    *
  • *
* * Datum Tuple *
    *
  • Tuple: TUPLE (Must contain only one field) *
      *
    • index 0: Java data type : Pig DataType: may be any one of: *
        *
      • Byte: BYTE
      • *
      • Integer: INTEGER
      • *
      • Long: LONG
      • *
      • Float: FLOAT
      • *
      • Double: DOUBLE
      • *
      • String: CHARARRAY
      • *
      • DataByteArray: BYTEARRAY
      • *
      *
    • *
    *
  • *
* * Sketch Tuple *
    *
  • Tuple: TUPLE (Contains exactly 1 field) *
      *
    • index 0: DataByteArray: BYTEARRAY = The serialization of a Sketch object.
    • *
    *
  • *
* * @param inputTuple A tuple containing a single bag, containing Datum Tuples. * @return Sketch Tuple. If inputTuple is null or empty, returns empty sketch (8 bytes). * @see "org.apache.pig.EvalFunc.exec(org.apache.pig.data.Tuple)" * @throws IOException from Pig. */ // @formatter:on @Override //TOP LEVEL EXEC public Tuple exec(final Tuple inputTuple) throws IOException { //throws is in API //The exec is a stateless function. It operates on the input and returns a result. // It can only call static functions. final Union union = newUnion(nomEntries_, p_, seed_); final DataBag bag = extractBag(inputTuple); if (bag == null) { return emptyCompactOrderedSketchTuple_; //Configured with parent } updateUnion(bag, union); //updates union with all elements of the bag final CompactSketch compOrdSketch = union.getResult(true, null); return compactOrderedSketchToTuple(compOrdSketch); } @Override public Schema outputSchema(final Schema input) { if (input != null) { try { final Schema tupleSchema = new Schema(); tupleSchema.add(new Schema.FieldSchema("Sketch", DataType.BYTEARRAY)); return new Schema(new Schema.FieldSchema(getSchemaName(this .getClass().getName().toLowerCase(), input), tupleSchema, DataType.TUPLE)); } catch (final FrontendException e) { // fall through } } return null; } //ACCUMULATOR INTERFACE /************************************************************************************************* * An Accumulator version of the standard exec() method. Like exec(), * accumulator is called with a bag of Datum Tuples. Unlike exec(), it doesn't serialize the * sketch at the end. Instead, it can be called multiple times, each time with another bag of * Datum Tuples to be input to the sketch. * * @param inputTuple A tuple containing a single bag, containing Datum Tuples. * @see #exec * @see "org.apache.pig.Accumulator.accumulate(org.apache.pig.data.Tuple)" * @throws IOException by Pig */ @Override public void accumulate(final Tuple inputTuple) throws IOException { //throws is in API if (accumUnion_ == null) { accumUnion_ = DataToSketch.newUnion(nomEntries_, p_, seed_); } final DataBag bag = extractBag(inputTuple); if (bag == null) { return; } updateUnion(bag, accumUnion_); } /** * Returns the sketch that has been built up by multiple calls to {@link #accumulate}. * * @return Sketch Tuple. (see {@link #exec} for return tuple format) * @see "org.apache.pig.Accumulator.getValue()" */ @Override public Tuple getValue() { if (accumUnion_ == null) { return emptyCompactOrderedSketchTuple_; //Configured with parent } final CompactSketch compOrdSketch = accumUnion_.getResult(true, null); return compactOrderedSketchToTuple(compOrdSketch); } /** * Cleans up the UDF state after being called using the {@link Accumulator} interface. * * @see "org.apache.pig.Accumulator.cleanup()" */ @Override public void cleanup() { accumUnion_ = null; } //ALGEBRAIC INTERFACE /*************************************************************************************************/ @Override public String getInitial() { return Initial.class.getName(); } @Override public String getIntermed() { return IntermediateFinal.class.getName(); } @Override public String getFinal() { return IntermediateFinal.class.getName(); } //TOP LEVEL PRIVATE STATIC METHODS /** * Return a new empty HeapUnion * @param nomEntries the given nominal entries * @param p the given probability p * @param seed the given seed * @return a new empty HeapUnion */ private static final Union newUnion(final int nomEntries, final float p, final long seed) { return SetOperation.builder() .setSeed(seed).setP(p).setResizeFactor(RF).setNominalEntries(nomEntries).buildUnion(); } /************************************************************************************************* * Updates a union with the data from the given bag. * * @param bag A bag of tuples to insert. * @param union the union to update */ private static void updateUnion(final DataBag bag, final Union union) { //Bag is not empty. process each innerTuple in the bag for (Tuple innerTuple : bag) { final Object f0 = extractFieldAtIndex(innerTuple, 0); //consider only field 0 if (f0 == null) { continue; } final Byte type = extractTypeAtIndex(innerTuple, 0); if (type == null) { continue; } switch (type) { case DataType.NULL: break; case DataType.BYTE: union.update((byte) f0); break; case DataType.INTEGER: union.update((int) f0); break; case DataType.LONG: union.update((long) f0); break; case DataType.FLOAT: union.update((float) f0); break; case DataType.DOUBLE: union.update((double) f0); break; case DataType.BYTEARRAY: { final DataByteArray dba = (DataByteArray) f0; union.update(dba.get()); //checks null, empty break; } case DataType.CHARARRAY: { union.update(f0.toString()); //checks null, empty break; } default: // types not handled throw new IllegalArgumentException("Field 0 of innerTuple must be one of " + "NULL, BYTE, INTEGER, LONG, FLOAT, DOUBLE, BYTEARRAY or CHARARRAY. " + "Given Type = " + DataType.findTypeName(type) + ", Object = " + f0.toString()); } //End switch } //End for } //STATIC Initial Class only called by Pig /************************************************************************************************* * Class used to calculate the initial pass of an Algebraic sketch operation. * *

* The Initial class simply passes through all records unchanged so that they can be * processed by the intermediate processor instead.

*/ public static class Initial extends EvalFunc { //The Algebraic worker classes (Initial, IntermediateFinal) are static and stateless. //The constructors and final parameters must mirror the parent class as there is no linkage // between them. /** * Default constructor to make pig validation happy. */ public Initial() { this(Integer.toString(Util.DEFAULT_NOMINAL_ENTRIES), "1.0", Long.toString(Util.DEFAULT_UPDATE_SEED)); } /** * Constructor for the initial pass of an Algebraic function. Pig will call this and pass the * same constructor arguments as the original UDF. In this case the arguments are ignored. * * @param nomEntriesStr See Nominal Entries. */ public Initial(final String nomEntriesStr) { this(nomEntriesStr, "1.0", Long.toString(Util.DEFAULT_UPDATE_SEED)); } /** * Constructor for the initial pass of an Algebraic function. Pig will call this and pass the * same constructor arguments as the original UDF. In this case the arguments are ignored. * * @param nomEntriesStr See Nominal Entries. * @param pStr See Sampling Probability, p. * * */ public Initial(final String nomEntriesStr, final String pStr) { this(nomEntriesStr, pStr, Long.toString(Util.DEFAULT_UPDATE_SEED)); } /** * Constructor for the initial pass of an Algebraic function. Pig will call this and pass the * same constructor arguments as the original UDF. In this case the arguments are ignored. * * @param nomEntriesStr See Nominal Entries. * @param pStr See Sampling Probability, p. * * * @param seedStr See Update Hash Seed. */ public Initial(final String nomEntriesStr, final String pStr, final String seedStr) {} @Override //Initial exec public Tuple exec(final Tuple inputTuple) throws IOException { //throws is in API return inputTuple; } } // STATIC IntermediateFinal Class only called by Pig /************************************************************************************************* * Class used to calculate the intermediate or final combiner pass of an Algebraic sketch * operation. This is called from the combiner, and may be called multiple times (from the mapper * and from the reducer). It will receive a bag of values returned by either the Intermediate * stage or the Initial stages, so it needs to be able to differentiate between and * interpret both types. */ public static class IntermediateFinal extends EvalFunc { //The Algebraic worker classes (Initial, IntermediateFinal) are static and stateless. //The constructors and final parameters must mirror the parent class as there is no linkage // between them. private final int myNomEntries_; private final float myP_; private final long mySeed_; private final Tuple myEmptyCompactOrderedSketchTuple_; /** * Default constructor to make pig validation happy. Assumes: * */ public IntermediateFinal() { this(Integer.toString(Util.DEFAULT_NOMINAL_ENTRIES), "1.0", Long.toString(Util.DEFAULT_UPDATE_SEED)); } /** * Constructor for the intermediate and final passes of an Algebraic function. Pig will call * this and pass the same constructor arguments as the base UDF. Assumes: * * * @param nomEntriesStr See Nominal Entries. */ public IntermediateFinal(final String nomEntriesStr) { this(nomEntriesStr, "1.0", Long.toString(Util.DEFAULT_UPDATE_SEED)); } /** * Constructor for the intermediate and final passes of an Algebraic function. Pig will call * this and pass the same constructor arguments as the base UDF. Assumes: * * * @param nomEntriesStr See Nominal Entries. * @param pStr See Sampling Probability, p. */ public IntermediateFinal(final String nomEntriesStr, final String pStr) { this(nomEntriesStr, pStr, Long.toString(Util.DEFAULT_UPDATE_SEED)); } /** * Constructor with strings for the intermediate and final passes of an Algebraic function. * Pig will call this and pass the same constructor arguments as the original UDF. * * @param nomEntriesStr See Nominal Entries. * @param pStr See Sampling Probability, p. * @param seedStr See Update Hash Seed. */ public IntermediateFinal(final String nomEntriesStr, final String pStr, final String seedStr) { this(Integer.parseInt(nomEntriesStr), Float.parseFloat(pStr), Long.parseLong(seedStr)); } /** * Constructor with primitives for the intermediate and final passes of an Algebraic function. * Pig will call this and pass the same constructor arguments as the Top Level UDF. * * @param nomEntries See Nominal Entries. * @param p See Sampling Probability, p. * @param seed See Update Hash Seed. */ public IntermediateFinal(final int nomEntries, final float p, final long seed) { this.myNomEntries_ = nomEntries; this.myP_ = p; this.mySeed_ = seed; this.myEmptyCompactOrderedSketchTuple_ = emptySketchTuple(seed); } @Override //IntermediateFinal exec public Tuple exec(final Tuple inputTuple) throws IOException { //throws is in API final Union union = newUnion(myNomEntries_, myP_, mySeed_); final DataBag outerBag = extractBag(inputTuple); //InputTuple.bag0 if (outerBag == null) { //must have non-empty outer bag at field 0. return myEmptyCompactOrderedSketchTuple_; //abort & return empty sketch } //Bag is not empty. for (Tuple dataTuple : outerBag) { final Object f0 = extractFieldAtIndex(dataTuple, 0); //inputTuple.bag0.dataTupleN.f0 //must have non-null field zero if (f0 == null) { continue; //go to next dataTuple if there is one } //f0 is not null if (f0 instanceof DataBag) { final DataBag innerBag = (DataBag)f0; //inputTuple.bag0.dataTupleN.f0:bag if (innerBag.size() == 0) { continue; } //If field 0 of a dataTuple is a Bag all innerTuples of this inner bag // will be passed into the union. //It is due to system bagged outputs from multiple mapper Initial functions. //The Intermediate stage was bypassed. updateUnion(innerBag, union); //process all tuples of innerBag } else if (f0 instanceof DataByteArray) { //inputTuple.bag0.dataTupleN.f0:DBA //If field 0 of a dataTuple is a DataByteArray we assume it is a sketch // due to system bagged outputs from multiple mapper Intermediate functions. // Each dataTuple.DBA:sketch will merged into the union. final DataByteArray dba = ((DataByteArray) f0); union.update(Memory.wrap(dba.get())); } else { // we should never get here. throw new IllegalArgumentException("dataTuple.Field0: Is not a DataByteArray: " + f0.getClass().getName()); } } //End for final CompactSketch compactSketch = union.getResult(true, null); return compactOrderedSketchToTuple(compactSketch); } } //End IntermediateFinal }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy