All downloads are free. Search and download functionality uses the official Maven repository.

com.spotify.scio.extra.hll.zetasketch.syntax.SCollectionSyntax.scala Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2020 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.scio.extra.hll.zetasketch.syntax

import com.spotify.scio.extra.hll.zetasketch.ZetaSketchHll.ZetaSketchHllAggregator
import com.spotify.scio.extra.hll.zetasketch.{HllPlus, ZetaSketchHll}
import com.spotify.scio.values.SCollection

trait SCollectionSyntax {

  /** HyperLogLog++ (ZetaSketch) extension methods for an arbitrary [[SCollection]]. */
  implicit final class ZetaSCollection[T](private val scol: SCollection[T]) {

    /**
     * Wrap every element of the collection in its own [[ZetaSketchHll]]. Supported element types
     * are Int, Long, String and ByteString.
     *
     * @example
     *   {{{
     *   val input: SCollection[T] = ...
     *   val zCol: SCollection[ZetaSketchHll[T]] = input.asZetaSketchHll
     *   val approxDistCount: SCollection[Long] = zCol.sumHll.approxDistinctCount
     *   }}}
     *
     * [[ZetaSketchHll]] offers a few extra methods, e.g. to access the precision and the sparse
     * precision.
     *
     * @return
     *   an [[SCollection]] of [[ZetaSketchHll]] holding exactly as many elements as the input
     *   [[SCollection]].
     */
    def asZetaSketchHll(implicit hp: HllPlus[T]): SCollection[ZetaSketchHll[T]] =
      scol.map(elem => ZetaSketchHll.create[T](elem))

    /**
     * Estimate the number of distinct elements with the HyperLogLog++ algorithm. Supported
     * element types are Int, Long, String and ByteString.
     *
     * @example
     *   {{{
     *   val input: SCollection[T] = ...
     *   val approxDistCount: SCollection[Long] = input.approxDistinctCountWithZetaHll
     *   }}}
     *
     * @return
     *   an [[SCollection]] holding a single [[Long]] value.
     */
    def approxDistinctCountWithZetaHll(implicit hp: HllPlus[T]): SCollection[Long] =
      scol.aggregate(ZetaSketchHllAggregator[T]())
  }

  /** HyperLogLog++ (ZetaSketch) extension methods for a key-value [[SCollection]]. */
  implicit final class PairedZetaSCollection[K, V](private val kvScol: SCollection[(K, V)]) {

    /**
     * Wrap the value of every key-value pair in its own [[ZetaSketchHll]], leaving the key
     * untouched. Supported value types are Int, Long, String and ByteString.
     *
     * @example
     *   {{{
     *   val input: SCollection[(K, V)] = ...
     *   val zCol: SCollection[(K, ZetaSketchHll[V])] = input.asZetaSketchHllByKey
     *   val approxDistCount: SCollection[(K, Long)] = zCol.sumHllByKey.approxDistinctCountByKey
     *   }}}
     *
     * [[ZetaSketchHll]] offers a few extra methods, e.g. to access the precision and the sparse
     * precision.
     *
     * @return
     *   a key-value [[SCollection]] whose values are [[ZetaSketchHll]]s, with one pair for each
     *   pair of the input [[SCollection]].
     */
    def asZetaSketchHllByKey(implicit hp: HllPlus[V]): SCollection[(K, ZetaSketchHll[V])] =
      kvScol.mapValues(value => ZetaSketchHll.create[V](value))

    /**
     * Estimate, for every key, the number of distinct values with the HyperLogLog++ algorithm.
     * Supported value types are Int, Long, String and ByteString.
     *
     * @example
     *   {{{
     *   val input: SCollection[(K, V)] = ...
     *   val approxDistCount: SCollection[(K, Long)] = input.approxDistinctCountWithZetaHllByKey
     *   }}}
     *
     * @return
     *   an [[SCollection]] holding one [[Long]] value per unique key.
     */
    def approxDistinctCountWithZetaHllByKey(implicit hp: HllPlus[V]): SCollection[(K, Long)] =
      kvScol.aggregateByKey(ZetaSketchHllAggregator[V]())
  }

  /** Extension methods for an [[SCollection]] whose elements are already [[ZetaSketchHll]]s. */
  implicit final class ZetaSketchHllSCollection[T](
    private val scol: SCollection[ZetaSketchHll[T]]
  ) {

    /**
     * Combine the sketches of this collection by reducing them pairwise with
     * [[ZetaSketchHll]]'s `merge`.
     *
     * @return
     *   an [[SCollection]] holding the merged [[ZetaSketchHll]].
     */
    def sumHll: SCollection[ZetaSketchHll[T]] =
      scol.reduce((left, right) => left.merge(right))

    /**
     * Replace every sketch by the result of its `estimateSize()` call.
     *
     * @return
     *   an [[SCollection]] holding one [[Long]] estimate per input sketch.
     */
    def approxDistinctCount: SCollection[Long] =
      scol.map(sketch => sketch.estimateSize())
  }

  /** Extension methods for a key-value [[SCollection]] whose values are [[ZetaSketchHll]]s. */
  implicit final class ZetaSketchHllSCollectionKV[K, V](
    private val kvSCol: SCollection[(K, ZetaSketchHll[V])]
  ) {

    /**
     * Combine the sketches that share a key by reducing them pairwise with
     * [[ZetaSketchHll]]'s `merge`.
     *
     * @return
     *   a key-value [[SCollection]] pairing each key with its merged [[ZetaSketchHll]].
     */
    def sumHllByKey: SCollection[(K, ZetaSketchHll[V])] =
      kvSCol.reduceByKey((left, right) => left.merge(right))

    /**
     * Replace the sketch of every key-value pair by the result of its `estimateSize()` call,
     * leaving the key untouched.
     *
     * @return
     *   a key-value [[SCollection]] with one [[Long]] estimate per input pair.
     */
    def approxDistinctCountByKey: SCollection[(K, Long)] =
      kvSCol.mapValues(sketch => sketch.estimateSize())
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy