All downloads are free. The search and download features use the official Maven repository.

com.yahoo.sketches.pig.cpc.DataToSketch Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2019, Verizon Media.
 * Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms.
 */

package com.yahoo.sketches.pig.cpc;

import static com.yahoo.sketches.Util.DEFAULT_UPDATE_SEED;

import java.io.IOException;

import org.apache.log4j.Logger;
import org.apache.pig.Accumulator;
import org.apache.pig.Algebraic;
import org.apache.pig.EvalFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;

import com.yahoo.sketches.cpc.CpcSketch;

/**
 * This is a Pig UDF that builds sketches from data.
 * This class implements both the Accumulator and Algebraic interfaces.
 *
 * @author Alexander Saydakov
 */
public class DataToSketch extends EvalFunc implements Accumulator, Algebraic {

  private DataByteArray emptySketch_; // this is to cash an empty sketch

  private final int lgK_;
  private final long seed_;
  private CpcSketch accumSketch_;
  private boolean isFirstCall_; // for logging

  /**
   * Constructor with default lgK and seed
   */
  public DataToSketch() {
    this(CpcSketch.DEFAULT_LG_K, DEFAULT_UPDATE_SEED);
  }

  /**
   * Constructor with given lgK as string and default seed
   *
   * @param lgK in a form of a String
   */
  public DataToSketch(final String lgK) {
    this(Integer.parseInt(lgK), DEFAULT_UPDATE_SEED);
  }

  /**
   * Constructor with given lgK and seed as strings
   *
   * @param lgK in a form of a String
   * @param seed in a form of a String
   */
  public DataToSketch(final String lgK, final String seed) {
    this(Integer.parseInt(lgK), Long.parseLong(seed));
  }

  /**
   * Base constructor.
   *
   * @param lgK parameter controlling the sketch size and accuracy
   * @param seed parameter to use during hashing
   */
  public DataToSketch(final int lgK, final long seed) {
    super();
    lgK_ = lgK;
    seed_ = seed;
  }

  /**
   * Top-level exec function.
   * This method accepts an input Tuple containing a Bag of one or more inner Datum Tuples
   * and returns a single serialized CpcSketch as a DataByteArray.
   *
   * Datum Tuple is a Tuple containing a single field, which can be one of the following
   * (Java type: Pig type):
   * 
    *
  • Byte: BYTE
  • *
  • Integer: INTEGER
  • *
  • Long: LONG
  • *
  • Float: FLOAT
  • *
  • Double: DOUBLE
  • *
  • String: CHARARRAY
  • *
  • DataByteArray: BYTEARRAY
  • *
* * @param inputTuple A tuple containing a single bag, containing Datum Tuples. * @return serialized CpcSketch * @see "org.apache.pig.EvalFunc.exec(org.apache.pig.data.Tuple)" * @throws IOException from Pig */ @Override public DataByteArray exec(final Tuple inputTuple) throws IOException { if (isFirstCall_) { Logger.getLogger(getClass()).info("Exec was used"); isFirstCall_ = false; } if (inputTuple == null || inputTuple.size() == 0) { if (emptySketch_ == null) { emptySketch_ = new DataByteArray(new CpcSketch(lgK_, seed_).toByteArray()); } return emptySketch_; } final CpcSketch sketch = new CpcSketch(lgK_, seed_); final DataBag bag = (DataBag) inputTuple.get(0); updateSketch(bag, sketch); return new DataByteArray(sketch.toByteArray()); } /** * An Accumulator version of the standard exec() method. Like exec(), * accumulator is called with a bag of Datum Tuples. Unlike exec(), it doesn't serialize the * result at the end. Instead, it can be called multiple times, each time with another bag of * Datum Tuples to be input to the sketch. * * @param inputTuple A tuple containing a single bag, containing Datum Tuples. * @see #exec * @see "org.apache.pig.Accumulator.accumulate(org.apache.pig.data.Tuple)" * @throws IOException by Pig */ @Override public void accumulate(final Tuple inputTuple) throws IOException { if (isFirstCall_) { Logger.getLogger(getClass()).info("Accumulator was used"); isFirstCall_ = false; } if (inputTuple == null || inputTuple.size() == 0) { return; } final DataBag bag = (DataBag) inputTuple.get(0); if (bag == null) { return; } if (accumSketch_ == null) { accumSketch_ = new CpcSketch(lgK_); } updateSketch(bag, accumSketch_); } /** * Returns the sketch that has been built up by multiple calls to {@link #accumulate}. 
* * @return serialized CpcSketch * @see "org.apache.pig.Accumulator.getValue()" */ @Override public DataByteArray getValue() { if (accumSketch_ == null) { if (emptySketch_ == null) { emptySketch_ = new DataByteArray(new CpcSketch(lgK_, seed_).toByteArray()); } return emptySketch_; } return new DataByteArray(accumSketch_.toByteArray()); } /** * Cleans up the UDF state after being called using the {@link Accumulator} interface. * * @see "org.apache.pig.Accumulator.cleanup()" */ @Override public void cleanup() { accumSketch_ = null; } @Override public String getInitial() { return AlgebraicInitial.class.getName(); } @Override public String getIntermed() { return DataToSketchAlgebraicIntermediate.class.getName(); } @Override public String getFinal() { return DataToSketchAlgebraicFinal.class.getName(); } static void updateSketch(final DataBag bag, final CpcSketch sketch) throws ExecException { // bag is not empty, process each innerTuple in the bag for (final Tuple innerTuple: bag) { final Object f0 = innerTuple.get(0); // consider only field 0 if (f0 == null) { continue; } final byte type = innerTuple.getType(0); switch (type) { case DataType.NULL: break; case DataType.BYTE: sketch.update((byte) f0); break; case DataType.INTEGER: sketch.update((int) f0); break; case DataType.LONG: sketch.update((long) f0); break; case DataType.FLOAT: sketch.update((float) f0); break; case DataType.DOUBLE: sketch.update((double) f0); break; case DataType.BYTEARRAY: { final DataByteArray dba = (DataByteArray) f0; sketch.update(dba.get()); break; } case DataType.CHARARRAY: { final String str = (String) f0; // conversion to char[] avoids costly UTF-8 encoding sketch.update(str.toCharArray()); break; } default: throw new IllegalArgumentException("Field 0 of innerTuple must be one of " + "NULL, BYTE, INTEGER, LONG, FLOAT, DOUBLE, BYTEARRAY or CHARARRAY. " + "Given Type = " + DataType.findTypeName(type) + ", Object = " + f0.toString()); } } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy