/*
 * Copyright 2019 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.scio.extra

import java.lang.Math.floorMod
import java.util.UUID

import com.spotify.scio.ScioContext
import com.spotify.scio.annotations.experimental
import com.spotify.scio.coders.Coder
import com.spotify.scio.extra.sparkey.instances.{
  CachedStringSparkeyReader,
  SparkeyReaderInstances,
  TypedSparkeyReader
}
import com.spotify.scio.util.Cache
import com.spotify.scio.values.{SCollection, SideInput}
import com.spotify.sparkey.{CompressionType, SparkeyReader}
import org.apache.beam.sdk.io.FileSystems
import org.apache.beam.sdk.transforms.{DoFn, View}
import org.apache.beam.sdk.values.PCollectionView
import org.slf4j.LoggerFactory

import scala.jdk.CollectionConverters._
import scala.util.hashing.MurmurHash3

/**
 * Main package for Sparkey side input APIs. Import all.
 *
 * {{{
 * import com.spotify.scio.extra.sparkey._
 * }}}
 *
 * To save an `SCollection[(String, String)]` to a Sparkey fileset:
 * {{{
 * val s = sc.parallelize(Seq("a" -> "one", "b" -> "two"))
 *
 * // temporary location
 * val s1: SCollection[SparkeyUri] = s.asSparkey
 *
 * // specific location
 * val s2: SCollection[SparkeyUri] = s.asSparkey("gs://<bucket>/<path>")
 *
 * // with multiple shards, sharded by MurmurHash3 of the key
 * val s3: SCollection[SparkeyUri] = s.asSparkey("gs://<bucket>/<path>", numShards=2)
 * }}}
 *
 * The result `SCollection[SparkeyUri]` can be converted to a side input:
 * {{{
 * val s: SCollection[SparkeyUri] = sc.parallelize(Seq("a" -> "one", "b" -> "two")).asSparkey
 * val side: SideInput[SparkeyReader] = s.asSparkeySideInput
 * }}}
 *
 * These two steps can be combined using syntactic sugar:
 * {{{
 * val side: SideInput[SparkeyReader] = sc
 *   .parallelize(Seq("a" -> "one", "b" -> "two"))
 *   .asSparkeySideInput
 * }}}
 *
 * An existing Sparkey file can also be converted to a side input directly:
 * {{{
 * sc.sparkeySideInput("gs:////")
 * }}}
 *
 * A sharded collection of Sparkey files can also be used as a side input by specifying a glob path:
 * {{{
 * sc.sparkeySideInput("gs://///part-*")
 * }}}
 *
 * `SparkeyReader` can be used like a lookup table in a side input operation:
 * {{{
 * val main: SCollection[String] = sc.parallelize(Seq("a", "b", "c"))
 * val side: SideInput[SparkeyReader] = sc
 *   .parallelize(Seq("a" -> "one", "b" -> "two"))
 *   .asSparkeySideInput
 *
 * main.withSideInputs(side)
 *   .map { (x, s) =>
 *     s(side).getOrElse(x, "unknown")
 *   }
 * }}}
 *
 * A `TypedSparkeyReader` can be used to do automatic decoding of JVM types from byte values:
 * {{{
 * val main: SCollection[String] = sc.parallelize(Seq("a", "b", "c"))
 * val side: SideInput[TypedSparkeyReader[MyObject]] = sc
 *   .typedSparkeySideInput("gs:////", MyObject.decode)
 *
 * val objects: SCollection[MyObject] = main
 *   .withSideInputs(side)
 *   .map { (x, s) => s(side).get(x) }
 *   .toSCollection
 * }}}
 *
 * A `TypedSparkeyReader` can also accept a Caffeine cache to reduce IO and deserialization load:
 * {{{
 * val main: SCollection[String] = sc.parallelize(Seq("a", "b", "c"))
 * val cache: Cache[String, MyObject] = ...
 * val side: SideInput[TypedSparkeyReader[MyObject]] = sc
 *   .typedSparkeySideInput("gs:////", MyObject.decode, cache)
 *
 * val objects: SCollection[MyObject] = main
 *   .withSideInputs(side)
 *   .map { (x, s) => s(side).get(x) }
 *   .toSCollection
 * }}}
 */
package object sparkey extends SparkeyReaderInstances {

  /** Enhanced version of [[ScioContext]] with Sparkey methods. */
  implicit class SparkeyScioContext(private val self: ScioContext) extends AnyVal {
    private def singleViewOf(basePath: String): PCollectionView[SparkeyUri] =
      self.parallelize(Seq(SparkeyUri(basePath, self.options))).applyInternal(View.asSingleton())

    private def shardedViewOf(basePath: String): PCollectionView[SparkeyUri] =
      self
        .parallelize(Seq[SparkeyUri](ShardedSparkeyUri(basePath, self.options)))
        .applyInternal(View.asSingleton())

    /**
     * Create a SideInput of `SparkeyReader` from a [[SparkeyUri]] base path, to be used with
     * [[com.spotify.scio.values.SCollection.withSideInputs SCollection.withSideInputs]].
     * If the provided base path ends with "*", it will be treated as a sharded collection of
     * Sparkey files.
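     *
     * For example, a minimal sketch (the `gs://<bucket>/<path>` locations below are placeholders):
     * {{{
     * // a single Sparkey fileset written to a known base path
     * val single: SideInput[SparkeyReader] = sc.sparkeySideInput("gs://<bucket>/<path>")
     *
     * // a sharded collection of Sparkey filesets, matched with a trailing glob
     * val sharded: SideInput[SparkeyReader] = sc.sparkeySideInput("gs://<bucket>/<path>/part-*")
     * }}}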
     */
    @experimental
    def sparkeySideInput(basePath: String): SideInput[SparkeyReader] = {
      val view = if (basePath.endsWith("*")) {
        val basePathWithoutGlobPart = basePath.split("/").dropRight(1).mkString("/")
        shardedViewOf(basePathWithoutGlobPart)
      } else {
        singleViewOf(basePath)
      }

      new SparkeySideInput(view)
    }

    /**
     * Create a SideInput of `TypedSparkeyReader` from a [[SparkeyUri]] base path, to be used with
     * [[com.spotify.scio.values.SCollection.withSideInputs SCollection.withSideInputs]].
     * The provided decoder function will map from the underlying byte array to a JVM type, and
     * the optional [[Cache]] object can be used to cache reads in memory after decoding.
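     *
     * For example, a minimal sketch (the path, `MyObject.decode` and the cache below are
     * placeholders):
     * {{{
     * val cache: Cache[String, MyObject] = ...
     * val side: SideInput[TypedSparkeyReader[MyObject]] =
     *   sc.typedSparkeySideInput("gs://<bucket>/<path>", MyObject.decode, cache)
     * }}}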
     */
    @experimental
    def typedSparkeySideInput[T](
      basePath: String,
      decoder: Array[Byte] => T,
      cache: Cache[String, T] = null
    ): SideInput[TypedSparkeyReader[T]] =
      sparkeySideInput(basePath).map(reader => new TypedSparkeyReader[T](reader, decoder, cache))

    /**
     * Create a SideInput of `CachedStringSparkeyReader` from a [[SparkeyUri]] base path, to be used
     * with [[com.spotify.scio.values.SCollection.withSideInputs SCollection.withSideInputs]].
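     *
     * For example, a minimal sketch (the path and cache below are placeholders):
     * {{{
     * val cache: Cache[String, String] = ...
     * val side: SideInput[CachedStringSparkeyReader] =
     *   sc.cachedStringSparkeySideInput("gs://<bucket>/<path>", cache)
     * }}}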
     */
    @experimental
    def cachedStringSparkeySideInput[T](
      basePath: String,
      cache: Cache[String, String]
    ): SideInput[CachedStringSparkeyReader] =
      sparkeySideInput(basePath).map(reader => new CachedStringSparkeyReader(reader, cache))
  }

  private val DefaultNumShards: Short = 1
  private val DefaultSideInputNumShards: Short = 64
  private val DefaultCompressionType: CompressionType = CompressionType.NONE
  private val DefaultCompressionBlockSize: Int = 0

  private def writeToSparkey[K, V](
    uri: SparkeyUri,
    maxMemoryUsage: Long,
    compressionType: CompressionType,
    compressionBlockSize: Int,
    elements: Iterable[(K, V)]
  )(implicit w: SparkeyWritable[K, V], koder: Coder[K], voder: Coder[V]): SparkeyUri = {
    val writer = new SparkeyWriter(uri, compressionType, compressionBlockSize, maxMemoryUsage)
    val it = elements.iterator
    while (it.hasNext) {
      val kv = it.next()
      w.put(writer, kv._1, kv._2)
    }
    writer.close()
    uri
  }

  /** Enhanced version of [[com.spotify.scio.values.SCollection SCollection]] with Sparkey methods. */
  implicit class SparkeyPairSCollection[K, V](@transient private val self: SCollection[(K, V)]) {
    private val logger = LoggerFactory.getLogger(this.getClass)

    /**
     * Write the key-value pairs of this SCollection as a Sparkey file to a specific location.
     *
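     * For example, a minimal sketch (the output path below is a placeholder):
     * {{{
     * val uris: SCollection[SparkeyUri] = sc
     *   .parallelize(Seq("a" -> "one", "b" -> "two"))
     *   .asSparkey("gs://<bucket>/<path>", numShards = 2)
     * }}}
     *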
     * @param path where to write the Sparkey files. Defaults to a temporary location.
     * @param maxMemoryUsage (optional) how much memory (in bytes) is allowed for writing
     *                       the index file.
     * @param numShards (optional) the number of shards to split this dataset into before writing.
     *                  One pair of Sparkey files will be written for each shard, sharded
     *                  by MurmurHash3 of the key mod the number of shards.
     * @return A singleton SCollection containing the [[SparkeyUri]] of the saved files.
     */
    @experimental
    def asSparkey(
      path: String = null,
      maxMemoryUsage: Long = -1,
      numShards: Short = DefaultNumShards,
      compressionType: CompressionType = DefaultCompressionType,
      compressionBlockSize: Int = DefaultCompressionBlockSize
    )(implicit
      w: SparkeyWritable[K, V],
      koder: Coder[K],
      voder: Coder[V]
    ): SCollection[SparkeyUri] = {
      require(numShards > 0, s"numShards must be greater than 0, found $numShards")
      if (compressionType != CompressionType.NONE) {
        require(
          compressionBlockSize > 0,
          s"Compression block size must be > 0 for $compressionType"
        )
      }

      val tempLocation = self.context.options.getTempLocation()
      val tempPath = s"$tempLocation/sparkey-${UUID.randomUUID}"
      val basePath = if (path == null) tempPath else path
      val nonShardedUri = SparkeyUri(basePath, self.context.options)
      require(!nonShardedUri.exists, s"Sparkey URI $nonShardedUri already exists")
      val uri = ShardedSparkeyUri(basePath, self.context.options)
      require(!uri.exists, s"Sparkey URI $uri already exists")

      logger.info(s"Saving as Sparkey with $numShards shards: $basePath")

      self.transform { collection =>
        val shards = collection
          .groupBy { case (k, _) => floorMod(w.shardHash(k), numShards).toShort }
          .map { case (shard, xs) =>
            shard -> writeToSparkey(
              uri.sparkeyUriForShard(shard, numShards),
              maxMemoryUsage,
              compressionType,
              compressionBlockSize,
              xs
            )
          }

        val shardsMap = shards.asMapSideInput

        val uris = shards.context
          .parallelize((0 until numShards).map(_.toShort))
          .withSideInputs(shardsMap)
          .map { case (shard, sideContext) =>
            sideContext(shardsMap).getOrElse(
              shard,
              writeToSparkey(
                uri.sparkeyUriForShard(shard, numShards),
                maxMemoryUsage,
                compressionType,
                compressionBlockSize,
                Iterable.empty[(K, V)]
              )
            )
          }
          .toSCollection

        uris.reifyAsListInGlobalWindow
          .map { _ =>
            if (numShards == 1) {
              val src = FileSystems
                .`match`(basePath + "/*")
                .metadata()
                .asScala
                .map(_.resourceId())
                .sortWith(_.getFilename < _.getFilename)
                .asJava
              val dst = SparkeyUri.extensions
                .map(ext => FileSystems.matchNewResource(s"$basePath$ext", false))
                .asJava

              FileSystems.rename(src, dst)

              nonShardedUri
            } else {
              uri
            }

          }
      }
    }

    /**
     * Write the key-value pairs of this SCollection as a Sparkey file to a temporary location.
     *
     * @return A singleton SCollection containing the [[SparkeyUri]] of the saved files.
     */
    @experimental
    def asSparkey(implicit
      w: SparkeyWritable[K, V],
      koder: Coder[K],
      voder: Coder[V]
    ): SCollection[SparkeyUri] = this.asSparkey()

    /**
     * Convert this SCollection to a SideInput, mapping key-value pairs of each window to a
     * `SparkeyReader`, to be used with
     * [[com.spotify.scio.values.SCollection.withSideInputs SCollection.withSideInputs]]. It is
     * required that each key of the input be associated with a single value.
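     *
     * For example, a minimal sketch using the default compression settings:
     * {{{
     * val side: SideInput[SparkeyReader] = sc
     *   .parallelize(Seq("a" -> "one", "b" -> "two"))
     *   .asSparkeySideInput(numShards = 16)
     * }}}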
     *
     * @param numShards the number of shards to use when writing the Sparkey file(s).
     */
    @experimental
    def asSparkeySideInput(
      numShards: Short = DefaultSideInputNumShards,
      compressionType: CompressionType = DefaultCompressionType,
      compressionBlockSize: Int = DefaultCompressionBlockSize
    )(implicit
      w: SparkeyWritable[K, V],
      koder: Coder[K],
      voder: Coder[V]
    ): SideInput[SparkeyReader] =
      self
        .asSparkey(
          numShards = numShards,
          compressionType = compressionType,
          compressionBlockSize = compressionBlockSize
        )
        .asSparkeySideInput

    /**
     * Convert this SCollection to a SideInput, mapping key-value pairs of each window to a
     * `SparkeyReader`, to be used with
     * [[com.spotify.scio.values.SCollection.withSideInputs SCollection.withSideInputs]]. It is
     * required that each key of the input be associated with a single value.
     */
    @experimental
    def asSparkeySideInput(implicit
      w: SparkeyWritable[K, V],
      koder: Coder[K],
      voder: Coder[V]
    ): SideInput[SparkeyReader] =
      self.asSparkeySideInput()

    /**
     * Convert this SCollection to a SideInput, mapping key-value pairs of each window to a
     * `SparkeyReader`, to be used with
     * [[com.spotify.scio.values.SCollection.withSideInputs SCollection.withSideInputs]]. It is
     * required that each key of the input be associated with a single value. The provided
     * decoder function will map from the underlying byte array to a JVM type.
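     *
     * For example, a minimal sketch (`MyObject.parse` is a placeholder that decodes the stored
     * bytes):
     * {{{
     * val side: SideInput[TypedSparkeyReader[MyObject]] = sc
     *   .parallelize(Seq("a" -> "one", "b" -> "two"))
     *   .asTypedSparkeySideInput[MyObject](bytes => MyObject.parse(new String(bytes)))
     * }}}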
     */
    @experimental
    def asTypedSparkeySideInput[T](decoder: Array[Byte] => T)(implicit
      w: SparkeyWritable[K, V],
      koder: Coder[K],
      voder: Coder[V]
    ): SideInput[TypedSparkeyReader[T]] =
      self.asSparkey.asTypedSparkeySideInput[T](decoder)

    /**
     * Convert this SCollection to a SideInput, mapping key-value pairs of each window to a
     * `SparkeyReader`, to be used with
     * [[com.spotify.scio.values.SCollection.withSideInputs SCollection.withSideInputs]]. It is
     * required that each key of the input be associated with a single value. The provided
     * [[Cache]] will be used to cache reads from the resulting [[SparkeyReader]].
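     *
     * For example, a minimal sketch (the cache and `MyObject.parse` below are placeholders):
     * {{{
     * val cache: Cache[String, MyObject] = ...
     * val side: SideInput[TypedSparkeyReader[MyObject]] = sc
     *   .parallelize(Seq("a" -> "one", "b" -> "two"))
     *   .asTypedSparkeySideInput[MyObject](cache, numShards = 16)(bytes =>
     *     MyObject.parse(new String(bytes))
     *   )
     * }}}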
     */
    @experimental
    def asTypedSparkeySideInput[T](
      cache: Cache[String, T],
      numShards: Short = DefaultSideInputNumShards,
      compressionType: CompressionType = DefaultCompressionType,
      compressionBlockSize: Int = DefaultCompressionBlockSize
    )(
      decoder: Array[Byte] => T
    )(implicit
      w: SparkeyWritable[K, V],
      koder: Coder[K],
      voder: Coder[V]
    ): SideInput[TypedSparkeyReader[T]] =
      self
        .asSparkey(
          numShards = numShards,
          compressionType = compressionType,
          compressionBlockSize = compressionBlockSize
        )
        .asTypedSparkeySideInput[T](cache)(decoder)

    /**
     * Convert this SCollection to a SideInput, mapping key-value pairs of each window to a
     * `CachedStringSparkeyReader`, to be used with
     * [[com.spotify.scio.values.SCollection.withSideInputs SCollection.withSideInputs]].
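     *
     * For example, a minimal sketch (the cache below is a placeholder):
     * {{{
     * val cache: Cache[String, String] = ...
     * val side: SideInput[CachedStringSparkeyReader] = sc
     *   .parallelize(Seq("a" -> "one", "b" -> "two"))
     *   .asCachedStringSparkeySideInput(cache, numShards = 16)
     * }}}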
     */
    @experimental
    def asCachedStringSparkeySideInput(
      cache: Cache[String, String],
      numShards: Short = DefaultSideInputNumShards,
      compressionType: CompressionType = DefaultCompressionType,
      compressionBlockSize: Int = DefaultCompressionBlockSize
    )(implicit
      w: SparkeyWritable[K, V],
      koder: Coder[K],
      voder: Coder[V]
    ): SideInput[CachedStringSparkeyReader] =
      self
        .asSparkey(
          numShards = numShards,
          compressionType = compressionType,
          compressionBlockSize = compressionBlockSize
        )
        .asCachedStringSparkeySideInput(cache)
  }

  /** Enhanced version of [[com.spotify.scio.values.SCollection SCollection]] with Sparkey methods. */
  implicit class SparkeySCollection(private val self: SCollection[SparkeyUri]) extends AnyVal {

    /**
     * Convert this SCollection to a SideInput of `SparkeyReader`, to be used with
     * [[com.spotify.scio.values.SCollection.withSideInputs SCollection.withSideInputs]].
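     *
     * For example, a minimal sketch:
     * {{{
     * val uris: SCollection[SparkeyUri] = sc
     *   .parallelize(Seq("a" -> "one", "b" -> "two"))
     *   .asSparkey
     * val side: SideInput[SparkeyReader] = uris.asSparkeySideInput
     * }}}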
     */
    @experimental
    def asSparkeySideInput: SideInput[SparkeyReader] = {
      val view = self.applyInternal(View.asSingleton())
      new SparkeySideInput(view)
    }

    /**
     * Convert this SCollection to a SideInput of `TypedSparkeyReader`, to be used with
     * [[com.spotify.scio.values.SCollection.withSideInputs SCollection.withSideInputs]].
     * The provided decoder function will map from the underlying byte array to a JVM type, and
     * the provided [[Cache]] object will be used to cache reads in memory after decoding.
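     *
     * For example, a minimal sketch (the URIs, cache and `MyObject.decode` below are
     * placeholders):
     * {{{
     * val uris: SCollection[SparkeyUri] = ...
     * val cache: Cache[String, MyObject] = ...
     * val side: SideInput[TypedSparkeyReader[MyObject]] =
     *   uris.asTypedSparkeySideInput(cache)(MyObject.decode)
     * }}}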
     */
    @experimental
    def asTypedSparkeySideInput[T](cache: Cache[String, T])(
      decoder: Array[Byte] => T
    ): SideInput[TypedSparkeyReader[T]] =
      asSparkeySideInput
        .map(reader => new TypedSparkeyReader[T](reader, decoder, cache))

    /**
     * Convert this SCollection to a SideInput of `TypedSparkeyReader`, to be used with
     * [[com.spotify.scio.values.SCollection.withSideInputs SCollection.withSideInputs]].
     * The provided decoder function will map from the underlying byte array to a JVM type;
     * no in-memory cache is used for decoded reads.
     */
    @experimental
    def asTypedSparkeySideInput[T](decoder: Array[Byte] => T): SideInput[TypedSparkeyReader[T]] =
      asSparkeySideInput
        .map(reader => new TypedSparkeyReader[T](reader, decoder, Cache.noOp))

    /**
     * Convert this SCollection to a SideInput of `CachedStringSparkeyReader`, to be used with
     * [[com.spotify.scio.values.SCollection.withSideInputs SCollection.withSideInputs]].
     */
    @experimental
    def asCachedStringSparkeySideInput(
      cache: Cache[String, String]
    ): SideInput[CachedStringSparkeyReader] =
      asSparkeySideInput
        .map(reader => new CachedStringSparkeyReader(reader, cache))
  }

  private class SparkeySideInput(val view: PCollectionView[SparkeyUri])
      extends SideInput[SparkeyReader] {
    override def updateCacheOnGlobalWindow: Boolean = false
    override def get[I, O](context: DoFn[I, O]#ProcessContext): SparkeyReader =
      SparkeySideInput.checkMemory(context.sideInput(view).getReader)
  }

  private object SparkeySideInput {
    private val logger = LoggerFactory.getLogger(this.getClass)
    def checkMemory(reader: SparkeyReader): SparkeyReader = {
      val memoryBytes = java.lang.management.ManagementFactory.getOperatingSystemMXBean
        .asInstanceOf[com.sun.management.OperatingSystemMXBean]
        .getTotalPhysicalMemorySize
      if (reader.getTotalBytes > memoryBytes) {
        logger.warn(
          "Sparkey size {} > total memory {}, look up performance will be severely degraded. " +
            "Increase memory or use faster SSD drives.",
          reader.getTotalBytes,
          memoryBytes
        )
      }
      reader
    }
  }

  sealed trait SparkeyWritable[K, V] extends Serializable {
    private[sparkey] def put(w: SparkeyWriter, key: K, value: V): Unit
    private[sparkey] def shardHash(key: K): Int
  }

  implicit val stringSparkeyWritable: SparkeyWritable[String, String] =
    new SparkeyWritable[String, String] {
      def put(w: SparkeyWriter, key: String, value: String): Unit =
        w.put(key, value)

      def shardHash(key: String): Int = MurmurHash3.stringHash(key, 1)
    }

  implicit val ByteArraySparkeyWritable: SparkeyWritable[Array[Byte], Array[Byte]] =
    new SparkeyWritable[Array[Byte], Array[Byte]] {
      def put(w: SparkeyWriter, key: Array[Byte], value: Array[Byte]): Unit =
        w.put(key, value)

      def shardHash(key: Array[Byte]): Int = MurmurHash3.bytesHash(key, 1)
    }
}
