All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.datasketches.hive.theta.IntersectSketchUDAF Maven / Gradle / Ivy

There is a newer version: 2.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.datasketches.hive.theta;

import static org.apache.datasketches.Util.DEFAULT_UPDATE_SEED;

import java.util.Arrays;

import org.apache.datasketches.hive.common.BytesWritableHelper;
import org.apache.datasketches.memory.Memory;
import org.apache.datasketches.theta.Intersection;
import org.apache.datasketches.theta.SetOperation;
import org.apache.datasketches.theta.Sketch;
import org.apache.datasketches.theta.Sketches;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFParameterInfo;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;

/**
 * Hive UDAF resolver for computing the intersection of serialized theta sketches.
 *
 * <p>The function takes one mandatory argument, the serialized sketch (BINARY), and one
 * optional argument, the update seed (integral). The actual aggregation is performed by
 * {@link IntersectSketchUDAFEvaluator}.
 */
@Description(
    name = "intersectSketch",
    value = "_FUNC_(sketch, seed) - Compute the intersection of sketches",
    extended = "Example:\n"
    + "> SELECT intersectSketch(sketch) FROM src;\n"
    + "The return value is a binary blob that contains a compact sketch, which can "
    + "be operated on by the other sketch-related functions. "
    + "The seed is optional, "
    + "and using it is not recommended unless you really know why you need it.")
@SuppressWarnings("deprecation")
public class IntersectSketchUDAF extends AbstractGenericUDAFResolver {

  /**
   * Validates the call-site arguments and returns the evaluator that performs the aggregation.
   *
   * @param info describes the arguments the function was invoked with
   * @return a new {@link IntersectSketchUDAFEvaluator}
   * @throws SemanticException if fewer than one or more than two arguments are given, or if
   *     the argument validation performed by {@code ObjectInspectorValidator} rejects a type
   */
  @Override
  public GenericUDAFEvaluator getEvaluator(final GenericUDAFParameterInfo info)
      throws SemanticException {
    final ObjectInspector[] inspectors = info.getParameterObjectInspectors();
    if (inspectors.length < 1) {
      throw new UDFArgumentException("Please specify at least 1 argument");
    }
    if (inspectors.length > 2) {
      // Report the position of the last (surplus) argument.
      throw new UDFArgumentTypeException(
          inspectors.length - 1, "Please specify no more than 2 arguments");
    }
    // Argument 0: the serialized sketch.
    ObjectInspectorValidator.validateGivenPrimitiveCategory(inspectors[0], 0,
        PrimitiveCategory.BINARY);
    // Argument 1 (optional): the seed.
    if (inspectors.length > 1) {
      ObjectInspectorValidator.validateIntegralParameter(inspectors[1], 1);
    }
    return new IntersectSketchUDAFEvaluator();
  }

  /**
   * Evaluator that intersects serialized theta sketches.
   *
   * <p>Partial results are exchanged as a struct with the fields {@value #SEED_FIELD} (long)
   * and {@value #SKETCH_FIELD} (binary) so that the seed travels together with the
   * intermediate sketch. The final result is just the serialized sketch.
   */
  static class IntersectSketchUDAFEvaluator extends GenericUDAFEvaluator {
    protected static final String SEED_FIELD = "seed";
    protected static final String SKETCH_FIELD = "sketch";

    // FOR PARTIAL1 and COMPLETE modes: ObjectInspectors for original data
    private transient PrimitiveObjectInspector inputObjectInspector;
    // Stays null when the optional seed argument was not supplied.
    protected transient PrimitiveObjectInspector seedObjectInspector;

    // FOR PARTIAL2 and FINAL modes: ObjectInspectors for partial aggregations
    protected transient StructObjectInspector intermediateObjectInspector;

    /**
     * Captures the inspectors required for the given mode and declares the output type.
     *
     * @param mode the aggregation mode this evaluator instance runs in
     * @param parameters for PARTIAL1 and COMPLETE the inspectors of the original arguments
     *     (sketch and, optionally, seed); otherwise a single inspector for the intermediate
     *     struct
     * @return for PARTIAL1 and PARTIAL2 a struct inspector with a long seed and a binary
     *     sketch; otherwise a writable binary inspector
     * @throws HiveException declared by the overridden method
     */
    @Override
    public ObjectInspector init(final Mode mode, final ObjectInspector[] parameters) throws HiveException {
      super.init(mode, parameters);
      if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) {
        this.inputObjectInspector = (PrimitiveObjectInspector) parameters[0];
        if (parameters.length > 1) {
          this.seedObjectInspector = (PrimitiveObjectInspector) parameters[1];
        }
      } else {
        this.intermediateObjectInspector = (StandardStructObjectInspector) parameters[0];
      }

      if (mode == Mode.PARTIAL1 || mode == Mode.PARTIAL2) {
        // intermediate results need to include the seed
        return ObjectInspectorFactory.getStandardStructObjectInspector(
          Arrays.asList(SEED_FIELD, SKETCH_FIELD),
          Arrays.asList(
            PrimitiveObjectInspectorFactory
              .getPrimitiveWritableObjectInspector(PrimitiveCategory.LONG),
            PrimitiveObjectInspectorFactory
              .getPrimitiveWritableObjectInspector(PrimitiveCategory.BINARY)
          )
        );
      }
      // final results include just the sketch
      return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(PrimitiveCategory.BINARY);
    }

    /**
     * Intersects one input row into the aggregation buffer.
     *
     * <p>Rows whose sketch is null are ignored entirely: they neither initialize the buffer
     * nor contribute to the result.
     *
     * @param buf the aggregation buffer, an {@link IntersectionState}
     * @param data the row arguments: {@code data[0]} is the serialized sketch and, when a seed
     *     argument was supplied, {@code data[1]} is the seed
     * @throws HiveException declared by the overridden method
     */
    @Override
    public void iterate(final AggregationBuffer buf,
        final Object[] data) throws HiveException {
      if (data[0] == null) { return; }
      // Extract and null-check the sketch BEFORE touching the buffer. Otherwise a null sketch
      // would leave behind an initialized intersection that has never been given a sketch, and
      // per the DataSketches Intersection contract asking such an intersection for its result
      // is not legal, so terminate()/terminatePartial() would fail instead of returning null.
      final byte[] serializedSketch = (byte[]) this.inputObjectInspector.getPrimitiveJavaObject(data[0]);
      if (serializedSketch == null) { return; }
      final IntersectionState state = (IntersectionState) buf;
      if (!state.isInitialized()) {
        // The seed is read once, from the first row that actually contributes a sketch.
        long seed = DEFAULT_UPDATE_SEED;
        if (this.seedObjectInspector != null) {
          seed = PrimitiveObjectInspectorUtils.getLong(data[1], this.seedObjectInspector);
        }
        state.init(seed);
      }
      state.update(Memory.wrap(serializedSketch));
    }

    /**
     * Produces the partial result for the given buffer.
     *
     * @param buf the aggregation buffer, an {@link IntersectionState}
     * @return a two-element list holding the seed as {@link LongWritable} and the serialized
     *     intermediate sketch as {@link BytesWritable}, or null if the buffer was never
     *     initialized
     * @throws HiveException declared by the overridden method
     */
    @Override
    public Object terminatePartial(final AggregationBuffer buf)
        throws HiveException {
      final IntersectionState state = (IntersectionState) buf;
      final Sketch intermediate = state.getResult();
      if (intermediate == null) { return null; }
      final byte[] bytes = intermediate.toByteArray();
      return Arrays.asList(
        new LongWritable(state.getSeed()),
        new BytesWritable(bytes)
      );
    }

    /**
     * Intersects a partial result, as produced by {@link #terminatePartial}, into the buffer.
     *
     * @param buf the aggregation buffer, an {@link IntersectionState}
     * @param data the intermediate struct (seed, sketch); null is ignored
     * @throws HiveException declared by the overridden method
     */
    @Override
    public void merge(final AggregationBuffer buf,
        final Object data) throws HiveException {
      if (data == null) { return; }
      final IntersectionState state = (IntersectionState) buf;
      if (!state.isInitialized()) {
        // The first partial result determines the seed used for this buffer.
        final long seed = ((LongWritable) this.intermediateObjectInspector.getStructFieldData(
            data, this.intermediateObjectInspector.getStructFieldRef(SEED_FIELD))).get();
        state.init(seed);
      }

      final Memory serializedSketch = BytesWritableHelper.wrapAsMemory(
          (BytesWritable) this.intermediateObjectInspector.getStructFieldData(
          data, this.intermediateObjectInspector.getStructFieldRef(SKETCH_FIELD)));
      state.update(serializedSketch);
    }

    /**
     * Produces the final result for the given buffer.
     *
     * @param buf the aggregation buffer, an {@link IntersectionState}
     * @return the serialized result sketch as {@link BytesWritable}, or null if the buffer was
     *     never initialized
     * @throws HiveException declared by the overridden method
     */
    @Override
    public Object terminate(final AggregationBuffer buf)
        throws HiveException {
      final IntersectionState state = (IntersectionState) buf;
      final Sketch resultSketch = state.getResult();
      if (resultSketch == null) { return null; }
      return new BytesWritable(resultSketch.toByteArray());
    }

    /**
     * Creates a fresh, uninitialized aggregation buffer.
     *
     * @return a new {@link IntersectionState}
     * @throws HiveException declared by the overridden method
     */
    @Override
    public AggregationBuffer getNewAggregationBuffer() throws HiveException {
      return new IntersectionState();
    }

    /**
     * Returns the given buffer to its uninitialized state.
     *
     * @param buf the aggregation buffer, an {@link IntersectionState}
     * @throws HiveException declared by the overridden method
     */
    @Override
    public void reset(final AggregationBuffer buf)
        throws HiveException {
      final IntersectionState state = (IntersectionState) buf;
      state.reset();
    }

    /**
     * Aggregation buffer holding the running intersection and the seed it was built with.
     * The buffer counts as initialized once {@link #init(long)} has created the intersection.
     */
    static class IntersectionState extends AbstractAggregationBuffer {
      private long seed_;
      private Intersection intersection_;

      /** Returns true once {@link #init(long)} has been called and no reset happened since. */
      boolean isInitialized() {
        return this.intersection_ != null;
      }

      /**
       * Remembers the seed and builds a new intersection operator configured with it.
       *
       * @param seed the seed to build the intersection with and to wrap incoming sketches with
       */
      void init(final long seed) {
        this.seed_ = seed;
        this.intersection_ = SetOperation.builder().setSeed(seed).buildIntersection();
      }

      /** Returns the seed passed to the most recent {@link #init(long)} call. */
      long getSeed() {
        return this.seed_;
      }

      /**
       * Wraps the serialized sketch using the stored seed and intersects it into the state.
       * Must only be called after {@link #init(long)}.
       *
       * @param serializedSketch memory holding a serialized sketch
       */
      void update(final Memory serializedSketch) {
        this.intersection_.intersect(Sketches.wrapSketch(serializedSketch, this.seed_));
      }

      /** Returns the current intersection result, or null if the state is not initialized. */
      Sketch getResult() {
        if (this.intersection_ == null) { return null; }
        return this.intersection_.getResult();
      }

      /** Drops the intersection so that the state reports itself as uninitialized again. */
      void reset() {
        this.intersection_ = null;
      }
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy