All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.spotify.scio.extra.hll.zetasketch.ZetaSketchHLL.scala Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2020 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.scio.extra.hll.zetasketch

import com.google.zetasketch.HyperLogLogPlusPlus
import com.spotify.scio.coders.Coder
import com.twitter.algebird.{Monoid, MonoidAggregator}

/**
 * This is a wrapper for internal HyperLogLogPlusPlus implementation.
 *
 * @param arrOpt
 *   - serialized byte array to construct the initial HyperLogLogPlusPlus instance.
 * @param elemOpt
 *   - first element to add to the constructed new HyperLogLogPlusPlus instance.
 */
final class ZetaSketchHll[T](arrOpt: Option[Array[Byte]], elemOpt: Option[T] = None)(implicit
  hp: HllPlus[T]
) extends Serializable {

  @transient private lazy val hll: HyperLogLogPlusPlus[hp.In] = (arrOpt, elemOpt) match {
    case (Some(arr), Some(elem)) => {
      val x = hp.hll(arr)
      x.add(hp.convert(elem))
      x
    }
    case (Some(arr), None) => hp.hll(arr)
    case (None, Some(elem)) => {
      val x = hp.hll()
      x.add(hp.convert(elem))
      x
    }
    case _ => hp.hll()
  }

  /**
   * Add a new element to this [[ZetaSketchHll]]. Need to return a new instance to maintain the
   * immutability of the [[hll]] value.
   * @param elem
   *   - new element to add.
   * @return
   *   - new [[ZetaSketchHll]] instance with the same type parameter.
   */
  def add(elem: T): ZetaSketchHll[T] =
    new ZetaSketchHll[T](Option(hll.serializeToByteArray()), Option(elem))

  /**
   * Merge both this and that [[ZetaSketchHll]] instances. Need to return a new instance to maintain
   * the immutability of this [[hll]] value and that [[hll]] value.
   * @param that
   *   - [[ZetaSketchHll]] to merge with this.
   * @return
   *   new instance of [[ZetaSketchHll]]
   */
  def merge(that: ZetaSketchHll[T]): ZetaSketchHll[T] = {
    val nhll = hp.hll(hll.serializeToByteArray())
    nhll.merge(that.hll.serializeToByteArray())
    new ZetaSketchHll[T](Option(nhll.serializeToByteArray()))
  }

  /** @return the byte array representation of the hll */
  def serializeToByteArray(): Array[Byte] = hll.serializeToByteArray()

  /** @return the estimated distinct count */
  def estimateSize(): Long = hll.result()

  /** @return precision used in hyperloglog++ algorithm. */
  def precision: Int = hll.getNormalPrecision

  /** @return sparse precision used in hyperloglog++ algorithm. */
  def sparsePrecision: Int = hll.getSparsePrecision
}

object ZetaSketchHll {
  def create[T: HllPlus](): ZetaSketchHll[T] = new ZetaSketchHll[T](None)

  def create[T: HllPlus](arr: Array[Byte]) = new ZetaSketchHll[T](Option(arr))

  def create[T](p: Int)(implicit hp: HllPlus[T]): ZetaSketchHll[T] = create(
    hp.hll(p).serializeToByteArray()
  )

  def create[T: HllPlus](elem: T): ZetaSketchHll[T] = new ZetaSketchHll[T](None, Option(elem))

  /** ZetaSketchHll [[com.twitter.algebird.Monoid]] impl */
  final case class ZetaSketchHllMonoid[T: HllPlus]() extends Monoid[ZetaSketchHll[T]] {
    override def zero: ZetaSketchHll[T] = ZetaSketchHll.create[T]()

    override def plus(x: ZetaSketchHll[T], y: ZetaSketchHll[T]): ZetaSketchHll[T] = x.merge(y)
  }

  /** ZetaSketchHll [[com.twitter.algebird.MonoidAggregator]] impl */
  final case class ZetaSketchHllAggregator[T: HllPlus]()
      extends MonoidAggregator[T, ZetaSketchHll[T], Long] {
    override def monoid: Monoid[ZetaSketchHll[T]] = ZetaSketchHllMonoid()

    override def prepare(input: T): ZetaSketchHll[T] = ZetaSketchHll.create[T](input)

    override def present(reduction: ZetaSketchHll[T]): Long = reduction.estimateSize()
  }

  /** Coder for ZetaSketchHll */
  implicit def coder[T: HllPlus]: Coder[ZetaSketchHll[T]] = {
    Coder.xmap[Array[Byte], ZetaSketchHll[T]](Coder.arrayByteCoder)(
      arr => ZetaSketchHll.create[T](arr),
      zt => zt.hll.serializeToByteArray()
    )
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy