All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.yahoo.sketches.pig.quantiles.DataToItemsSketch Maven / Gradle / Ivy

There is a newer version: 0.13.0
Show newest version
/*
 * Copyright 2016, Yahoo! Inc.
 * Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms.
 */

package com.yahoo.sketches.pig.quantiles;

import java.io.IOException;
import java.util.Comparator;

import org.apache.pig.Accumulator;
import org.apache.pig.Algebraic;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;

import com.yahoo.memory.Memory;
import com.yahoo.sketches.ArrayOfItemsSerDe;
import com.yahoo.sketches.quantiles.ItemsSketch;
import com.yahoo.sketches.quantiles.ItemsUnion;

/**
 * Builds ItemsSketch from data.
 * To assist Pig, this class implements both the Accumulator and Algebraic interfaces.
 * @param  type of item
 */
public abstract class DataToItemsSketch extends EvalFunc
    implements Accumulator, Algebraic {

  private static final TupleFactory tupleFactory_ = TupleFactory.getInstance();

  // With the single exception of the Accumulator interface, UDFs are stateless.
  // All parameters kept at the class level must be final, except for the accumUnion.
  private final int k_;
  private final Comparator comparator_;
  private final ArrayOfItemsSerDe serDe_;
  private ItemsUnion accumUnion_;

  // TOP LEVEL API

  /**
   * Base constructor.
   *
   * @param k parameter that determines the accuracy and size of the sketch.
   * The value of 0 means the default k, whatever it is in the sketches-core library
   * @param comparator for items of type T
   * @param serDe an instance of ArrayOfItemsSerDe for type T
   */
  public DataToItemsSketch(final int k, final Comparator comparator,
      final ArrayOfItemsSerDe serDe) {
    super();
    k_ = k;
    comparator_ = comparator;
    serDe_ = serDe;
  }

  //@formatter:off
  /**
   * Top-level exec function.
   * This method accepts an input Tuple containing a Bag of one or more inner Datum Tuples
   * and returns a single Sketch as a Sketch Tuple.
   *
   * 

If a large number of calls is anticipated, leveraging either the Algebraic or * Accumulator interfaces is recommended. Pig normally handles this automatically. * *

Internally, this method presents the inner Datum Tuples to a new Union, * which is returned as a Sketch Tuple * *

Types below are in the form: Java data type: Pig DataType * *

Input Tuple *

    *
  • Tuple: TUPLE (Must contain only one field) *
      *
    • index 0: DataBag: BAG (May contain 0 or more Inner Tuples) *
        *
      • index 0: Tuple: TUPLE Datum Tuple
      • *
      • ...
      • *
      • index n-1: Tuple: TUPLE Datum Tuple
      • *
      *
    • *
    *
  • *
* * Datum Tuple *
    *
  • Tuple: TUPLE (Must contain only one field) *
      *
    • index 0: T: some suitable Pig type convertible to T
    • *
    *
  • *
* * Sketch Tuple *
    *
  • Tuple: TUPLE (Contains exactly 1 field) *
      *
    • index 0: DataByteArray: BYTEARRAY = a serialized QuantilesSketch object.
    • *
    *
  • *
* * @param inputTuple A tuple containing a single bag, containing Datum Tuples. * @return Sketch Tuple. If inputTuple is null or empty, returns empty sketch. * @see "org.apache.pig.EvalFunc.exec(org.apache.pig.data.Tuple)" * @throws IOException from Pig. */ // @formatter:on @Override // TOP LEVEL EXEC public Tuple exec(final Tuple inputTuple) throws IOException { //The exec is a stateless function. It operates on the input and returns a result. if (inputTuple != null && inputTuple.size() > 0) { final ItemsUnion union = k_ > 0 ? ItemsUnion.getInstance(k_, comparator_) : ItemsUnion.getInstance(comparator_); final DataBag bag = (DataBag) inputTuple.get(0); for (final Tuple innerTuple: bag) { union.update(extractValue(innerTuple.get(0))); } final ItemsSketch resultSketch = union.getResultAndReset(); if (resultSketch != null) { return tupleFactory_.newTuple(new DataByteArray(resultSketch.toByteArray(serDe_))); } } // return empty sketch final ItemsSketch sketch = k_ > 0 ? ItemsSketch.getInstance(k_, comparator_) : ItemsSketch.getInstance(comparator_); return tupleFactory_.newTuple(new DataByteArray(sketch.toByteArray(serDe_))); } @Override public Schema outputSchema(final Schema input) { if (input == null) { return null; } try { final Schema tupleSchema = new Schema(); tupleSchema.add(new Schema.FieldSchema("Sketch", DataType.BYTEARRAY)); return new Schema(new Schema.FieldSchema(getSchemaName( this.getClass().getName().toLowerCase(), input), tupleSchema, DataType.TUPLE)); } catch (final FrontendException e) { throw new RuntimeException(e); } } // ACCUMULATOR INTERFACE /** * An Accumulator version of the standard exec() method. Like exec(), * accumulator is called with a bag of Datum Tuples. Unlike exec(), it doesn't serialize the * sketch at the end. Instead, it can be called multiple times, each time with another bag of * Datum Tuples to be input to the Union. * * @param inputTuple A tuple containing a single bag, containing Datum Tuples. * @see #exec * @see "org.apache.pig.Accumulator.accumulate(org.apache.pig.data.Tuple)" * @throws IOException by Pig */ @Override public void accumulate(final Tuple inputTuple) throws IOException { if (inputTuple == null || inputTuple.size() == 0) { return; } final DataBag bag = (DataBag) inputTuple.get(0); if (bag == null) { return; } if (accumUnion_ == null) { accumUnion_ = k_ > 0 ? ItemsUnion.getInstance(k_, comparator_) : ItemsUnion.getInstance(comparator_); } for (final Tuple innerTuple: bag) { accumUnion_.update(extractValue(innerTuple.get(0))); } } /** * Returns the result of the Union that has been built up by multiple calls to {@link #accumulate}. * * @return Sketch Tuple. (see {@link #exec} for return tuple format) * @see "org.apache.pig.Accumulator.getValue()" */ @Override public Tuple getValue() { if (accumUnion_ != null) { final ItemsSketch resultSketch = accumUnion_.getResultAndReset(); if (resultSketch != null) { return tupleFactory_.newTuple(new DataByteArray(resultSketch.toByteArray(serDe_))); } } // return empty sketch final ItemsSketch sketch = k_ > 0 ? ItemsSketch.getInstance(k_, comparator_) : ItemsSketch.getInstance(comparator_); return tupleFactory_.newTuple(new DataByteArray(sketch.toByteArray(serDe_))); } /** * Cleans up the UDF state after being called using the {@link Accumulator} interface. * * @see "org.apache.pig.Accumulator.cleanup()" */ @Override public void cleanup() { accumUnion_ = null; } /** * Override this if it takes more than a cast to convert from Pig type to type T * @param object Pig object, which needs to be converted to type T * @return value of type T */ @SuppressWarnings("unchecked") protected T extractValue(final Object object) { return (T) object; } // STATIC Initial Class only called by Pig /** * Class used to calculate the initial pass of an Algebraic sketch operation. * *

* The Initial class simply passes through all records unchanged so that they can be * processed by the intermediate processor instead.

*/ public static class DataToItemsSketchInitial extends EvalFunc { // The Algebraic worker classes (Initial, IntermediateFinal) are static and stateless. // The constructors must mirror the main UDF class /** * Default constructor. */ public DataToItemsSketchInitial() {} /** * Constructor with specific k * @param kStr string representation of k */ public DataToItemsSketchInitial(final String kStr) {} @Override public Tuple exec(final Tuple inputTuple) throws IOException { return inputTuple; } } // STATIC IntermediateFinal Class only called by Pig /** * Class used to calculate the intermediate or final combiner pass of an Algebraic sketch * operation. This is called from the combiner, and may be called multiple times (from the mapper * and from the reducer). It will receive a bag of values returned by either the Intermediate * stage or the Initial stages, so it needs to be able to differentiate between and * interpret both types. * @param type of item */ public static abstract class DataToItemsSketchIntermediateFinal extends EvalFunc { // The Algebraic worker classes (Initial, IntermediateFinal) are static and stateless. // The constructors of the concrete class must mirror the ones in the main UDF class private final int k_; private final Comparator comparator_; private final ArrayOfItemsSerDe serDe_; /** * Constructor for the intermediate and final passes of an Algebraic function. * * @param k parameter that determines the accuracy and size of the sketch. * @param comparator for items of type T * @param serDe an instance of ArrayOfItemsSerDe for type T */ public DataToItemsSketchIntermediateFinal( final int k, final Comparator comparator, final ArrayOfItemsSerDe serDe) { super(); k_ = k; comparator_ = comparator; serDe_ = serDe; } /** * Override this if it takes more than a cast to convert from Pig type to type T * @param object Pig object, which needs to be converted to type T * @return value of type T */ @SuppressWarnings("unchecked") protected T extractValue(final Object object) { return (T) object; } @Override // IntermediateFinal exec public Tuple exec(final Tuple inputTuple) throws IOException { //throws is in API if (inputTuple != null && inputTuple.size() > 0) { final ItemsUnion union = k_ > 0 ? ItemsUnion.getInstance(k_, comparator_) : ItemsUnion.getInstance(comparator_); final DataBag outerBag = (DataBag) inputTuple.get(0); for (final Tuple dataTuple: outerBag) { final Object f0 = dataTuple.get(0); if (f0 == null) { continue; } if (f0 instanceof DataBag) { final DataBag innerBag = (DataBag) f0; // inputTuple.bag0.dataTupleN.f0:bag if (innerBag.size() == 0) { continue; } // If field 0 of a dataTuple is a Bag all innerTuples of this inner bag // will be passed into the union. // It is due to system bagged outputs from multiple mapper Initial functions. // The Intermediate stage was bypassed. for (final Tuple innerTuple: innerBag) { union.update(extractValue(innerTuple.get(0))); } } else if (f0 instanceof DataByteArray) { // inputTuple.bag0.dataTupleN.f0:DBA // If field 0 of a dataTuple is a DataByteArray we assume it is a sketch // due to system bagged outputs from multiple mapper Intermediate functions. // Each dataTuple.DBA:sketch will merged into the union. final DataByteArray dba = (DataByteArray) f0; union.update(ItemsSketch.getInstance(Memory.wrap(dba.get()), comparator_, serDe_)); } else { throw new IllegalArgumentException("dataTuple.Field0: Is not a DataByteArray: " + f0.getClass().getName()); } } final ItemsSketch resultSketch = union.getResultAndReset(); if (resultSketch != null) { return tupleFactory_.newTuple(new DataByteArray(resultSketch.toByteArray(serDe_))); } } // return empty sketch final ItemsSketch sketch = k_ > 0 ? ItemsSketch.getInstance(k_, comparator_) : ItemsSketch.getInstance(comparator_); return tupleFactory_.newTuple(new DataByteArray(sketch.toByteArray(serDe_))); } } // end IntermediateFinal }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy