All downloads are free. The search and download functionality uses the official Maven repository.

com.yahoo.sketches.hive.theta.DataToSketchUDAF Maven / Gradle / Ivy

There is a newer version: 0.13.0
Show newest version
/*
 * Copyright 2016, Yahoo! Inc.
 * Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms.
 */

package com.yahoo.sketches.hive.theta;

import static com.yahoo.sketches.Util.DEFAULT_NOMINAL_ENTRIES;
import static com.yahoo.sketches.Util.DEFAULT_UPDATE_SEED;

import java.util.Arrays;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFParameterInfo;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;

@Description(
    name = "dataToSketch",
    value = "_FUNC_(expr, size, prob, seed) - "
        + "Compute a sketch of given size, sampling probability and seed on data 'expr'",
    extended = "Example:\n"
    + "> SELECT dataToSketch(val, 16384) FROM src;\n"
    + "The return value is a binary blob that can be operated on by other sketch related functions."
    + " The sketch size is optional, must be a power of 2 and "
    + "controls the relative error expected from the sketch."
    + " A size of 16384 can be expected to yield errors of roughly +-1.5% in the estimation of uniques."
    + " The default size is defined in the sketches-core library "
    + "and at the time of this writing was 4096 (about 3% error)."
    + " The sampling probability is optional and must be from 0 to 1. The default is 1 (no sampling)"
    + " The seed is optional, and using it is not recommended unless you really know why you need it")
public class DataToSketchUDAF extends AbstractGenericUDAFResolver {

  /**
   * Performs argument number and type validation. DataToSketch expects
   * to receive between one and four arguments.
   * 
    *
  • The first (required) is the value to add to the sketch and must be a primitive.
  • * *
  • The second (optional) is the sketch size to use. This must be an integral value * and must be constant.
  • * *
  • The third (optional) is the sampling probability and is a floating point value between * 0.0 and 1.0. It must be a constant
  • * *
  • The fourth (optional) is an update seed. * It must be an integral value and must be constant.
  • *
* * @see org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver * #getEvaluator(org.apache.hadoop.hive.ql.udf.generic.GenericUDAFParameterInfo) * * @param info Parameter info to validate * @return The GenericUDAFEvaluator that should be used to calculate the function. */ @Override public GenericUDAFEvaluator getEvaluator(final GenericUDAFParameterInfo info) throws SemanticException { final ObjectInspector[] parameters = info.getParameterObjectInspectors(); // Validate the correct number of parameters if (parameters.length < 1) { throw new UDFArgumentException("Please specify at least 1 argument"); } if (parameters.length > 4) { throw new UDFArgumentException("Please specify no more than 4 arguments"); } // Validate first parameter type ObjectInspectorValidator.validateCategoryPrimitive(parameters[0], 0); // Validate second argument if present if (parameters.length > 1) { ObjectInspectorValidator.validateIntegralParameter(parameters[1], 1); if (!ObjectInspectorUtils.isConstantObjectInspector(parameters[1])) { throw new UDFArgumentTypeException(1, "The second argument must be a constant"); } } // Validate third argument if present if (parameters.length > 2) { ObjectInspectorValidator.validateFloatingPointParameter(parameters[2], 2); if (!ObjectInspectorUtils.isConstantObjectInspector(parameters[2])) { throw new UDFArgumentTypeException(2, "The third argument must be a constant"); } } // Validate fourth argument if present if (parameters.length > 3) { ObjectInspectorValidator.validateIntegralParameter(parameters[3], 3); if (!ObjectInspectorUtils.isConstantObjectInspector(parameters[3])) { throw new UDFArgumentTypeException(3, "The fourth argument must be a constant"); } } return new DataToSketchEvaluator(); } public static class DataToSketchEvaluator extends UnionEvaluator { // FOR PARTIAL1 and COMPLETE modes: ObjectInspectors for original data private transient PrimitiveObjectInspector samplingProbabilityObjectInspector; /* * (non-Javadoc) * * @see * 
org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator#init(org.apache * .hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode, * org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector[]) */ @Override public ObjectInspector init(final Mode mode, final ObjectInspector[] parameters) throws HiveException { super.init(mode, parameters); if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) { // input is original data inputObjectInspector = (PrimitiveObjectInspector) parameters[0]; if (parameters.length > 1) { nominalEntriesObjectInspector = (PrimitiveObjectInspector) parameters[1]; } if (parameters.length > 2) { samplingProbabilityObjectInspector = (PrimitiveObjectInspector) parameters[2]; } if (parameters.length > 3) { seedObjectInspector = (PrimitiveObjectInspector) parameters[3]; } } else { // input for PARTIAL2 and FINAL is the output from PARTIAL1 intermediateObjectInspector = (StructObjectInspector) parameters[0]; } if (mode == Mode.PARTIAL1 || mode == Mode.PARTIAL2) { // intermediate results need to include the the nominal number of entries and the seed return ObjectInspectorFactory.getStandardStructObjectInspector( Arrays.asList(NOMINAL_ENTRIES_FIELD, SEED_FIELD, SKETCH_FIELD), Arrays.asList( PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(PrimitiveCategory.INT), PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(PrimitiveCategory.LONG), PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(PrimitiveCategory.BINARY) ) ); } // final results include just the sketch return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(PrimitiveCategory.BINARY); } /* * (non-Javadoc) * * @see * org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator#iterate(org * .apache * .hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.AggregationBuffer, * java.lang.Object[]) */ @Override public void iterate(final @SuppressWarnings("deprecation") AggregationBuffer agg, final Object[] parameters) throws 
HiveException { if (parameters[0] == null) { return; } final UnionState state = (UnionState) agg; if (!state.isInitialized()) { initializeState(state, parameters); } state.update(parameters[0], inputObjectInspector); } private void initializeState(final UnionState state, final Object[] parameters) { int sketchSize = DEFAULT_NOMINAL_ENTRIES; if (nominalEntriesObjectInspector != null) { sketchSize = PrimitiveObjectInspectorUtils.getInt(parameters[1], nominalEntriesObjectInspector); } float samplingProbability = UnionState.DEFAULT_SAMPLING_PROBABILITY; if (samplingProbabilityObjectInspector != null) { samplingProbability = PrimitiveObjectInspectorUtils.getFloat(parameters[2], samplingProbabilityObjectInspector); } long seed = DEFAULT_UPDATE_SEED; if (seedObjectInspector != null) { seed = PrimitiveObjectInspectorUtils.getLong(parameters[3], seedObjectInspector); } state.init(sketchSize, samplingProbability, seed); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy