All Downloads are FREE. Search and download functionality uses the official Maven repository.

com.yahoo.sketches.pig.hash.MurmurHash3 Maven / Gradle / Ivy

There is a newer version: 0.13.0
Show newest version
/*
 * Copyright 2016, Yahoo! Inc.
 * Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms.
 */

package com.yahoo.sketches.pig.hash;

import static com.yahoo.sketches.hash.MurmurHash3.hash;
import static com.yahoo.sketches.hash.MurmurHash3Adaptor.hashToLongs;
import static com.yahoo.sketches.hash.MurmurHash3Adaptor.modulo;
import static java.lang.Math.min;
import static java.nio.charset.StandardCharsets.UTF_8;

import java.io.IOException;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;

/**
 * Computes a 128-bit MurmurHash3 hash of data input types: String, DataByteArray, Long, Integer,
 * Double, and Float. The 2nd optional parameter can be a Long or Integer seed. The 3rd optional
 * parameter can be a positive Integer modulus divisor. If the divisor is provided, the Integer
 * modulus remainder is computed on the entire 128-bit hash output treated as if it were a 128-bit
 * positive value.
 *
 * @author Lee Rhodes
 */
public class MurmurHash3 extends EvalFunc {
  private static final TupleFactory mTupleFactory = TupleFactory.getInstance();
  private int divisor_ = 0;

  /**
   * Top Level Exec Function.
   * 

* This method accepts an object to be hashed and returns a Hash Result Tuple. *

* * Hash Input Tuple *
    *
  • Tuple: TUPLE (Must contain 1 field and may contain 2 or 3):
    *
      *
    • index 0: Object to be hashed: one of String, DataByteArray, Long, Integer, Double, * Float.
    • *
    • index 1: Seed: Long or Integer
    • *
    • index 2: Modulus divisor: Integer; must be positive.
    • *
    *
  • *
* *

* Any other input tuple will throw an exception! *

* * Hash Result Tuple *
    *
  • Tuple: TUPLE (Contains 3 fields) *
      *
    • index 0: Long: LONG: The 1st 64 bits of the hash.
    • *
    • index 1: Long: LONG: The 2nd 64 bits of the hash.
    • *
    • index 2: Integer: INTEGER: Result of modulo operation, -1 if not computed. *
    *
  • *
* * @param input Hash Input Tuple. If null or empty exec returns null. */ @Override public Tuple exec(final Tuple input) throws IOException { if ((input == null) || (input.size() == 0)) { return null; } final Tuple out = mTupleFactory.newTuple(3); final long[] hashOut = extractInputs(input); if (hashOut == null) { return out; //contains 3 nulls } final long h0 = hashOut[0]; final long h1 = hashOut[1]; final int modResult = (divisor_ > 0L) ? modulo(h0, h1, divisor_) : -1; out.set(0, Long.valueOf(h0)); out.set(1, Long.valueOf(h1)); out.set(2, Integer.valueOf(modResult)); return out; } private long[] extractInputs(final Tuple input) throws IOException { //may return null final int sw = min(input.size(), 3); long seed = 0L; Object obj = null; long[] hashOut = null; switch (sw) { case 3: { //modulus divisor obj = input.get(2); if (obj != null) { if (obj instanceof Integer) { divisor_ = (Integer) obj; } else { throw new IllegalArgumentException("Modulus divisor must be an Integer."); } if (divisor_ <= 0) { throw new IllegalArgumentException("Modulus divisor must be greater than zero. " + divisor_); } } //divisor may be null. If so it will not be used. 
} //$FALL-THROUGH$ case 2: { //seed obj = input.get(1); if (obj != null) { if (obj instanceof Long) { seed = (Long) obj; } else if (obj instanceof Integer) { seed = (Integer) obj; } else { throw new IllegalArgumentException("Seed must be an Integer or Long"); } } } //$FALL-THROUGH$ case 1: { obj = input.get(0); final int type = input.getType(0); switch (type) { case 1: { //Null type, returns null break; } case 10: { //Integer final long[] data = { (Integer) obj }; hashOut = hash(data, seed); break; } case 15: { //Long final long[] data = { (Long) obj }; hashOut = hash(data, seed); break; } case 20: { //Float hashOut = hashToLongs((Float) obj, seed); break; } case 25: { //Double hashOut = hashToLongs((Double) obj, seed); break; } case 50: { //BYTEARRAY = DataByteArray final DataByteArray dba = (DataByteArray) obj; if (dba.size() == 0) { break; //Empty return null } hashOut = hash(dba.get(), seed); break; } case 55: { //CHARARRAY = String final String datum = (String) obj; if (datum.isEmpty()) { break; //Empty return null } hashOut = hash(datum.getBytes(UTF_8), seed); break; } default: { throw new IllegalArgumentException("Cannot use this DataType: " + type); } } } } return hashOut; } /** * The output consists of two longs, or 128 bits, plus the result of the modulo division if * specified. */ @Override public Schema outputSchema(final Schema input) { if (input != null) { try { final Schema tupleSchema = new Schema(); tupleSchema.add(new Schema.FieldSchema("Hash0", DataType.LONG)); tupleSchema.add(new Schema.FieldSchema("Hash1", DataType.LONG)); tupleSchema.add(new Schema.FieldSchema("ModuloResult", DataType.INTEGER)); return new Schema(new Schema.FieldSchema(getSchemaName(this .getClass().getName().toLowerCase(), input), tupleSchema, DataType.TUPLE)); } catch (final FrontendException e) { //fall through } } return null; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy