All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.spotify.scio.extra.sparkey.instances.ShardedSparkeyReader.scala Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2020 Spotify AB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.spotify.scio.extra.sparkey.instances

import java.util

import com.spotify.sparkey.{IndexHeader, LogHeader, SparkeyReader}

import scala.util.hashing.MurmurHash3
import scala.jdk.CollectionConverters._

/**
 * A wrapper class around SparkeyReader that allows the reading of multiple Sparkey files, sharded
 * by their keys (via MurmurHash3). At most 32,768 Sparkey files are supported.
 *
 * A key is routed to the shard whose ID is the MurmurHash3 hash of the key (seed 1), floor-modded
 * by `numShards`. A shard ID with no entry in `sparkeys` is treated as an empty shard: lookups
 * routed to it return `null`.
 *
 * @param sparkeys
 *   a map of shard ID to sparkey reader
 * @param numShards
 *   the total count of shards used (needed for keying as some shards may be empty)
 */
class ShardedSparkeyReader(val sparkeys: Map[Short, SparkeyReader], val numShards: Short)
    extends SparkeyReader {

  /**
   * Computes the shard ID for a binary key.
   *
   * @param arr
   *   the key bytes
   * @return
   *   `floorMod(MurmurHash3.bytesHash(arr, 1), numShards)` as a `Short`
   */
  def hashKey(arr: Array[Byte]): Short =
    Math.floorMod(MurmurHash3.bytesHash(arr, 1), numShards.toInt).toShort

  /**
   * Computes the shard ID for a string key.
   *
   * @param str
   *   the key
   * @return
   *   `floorMod(MurmurHash3.stringHash(str, 1), numShards)` as a `Short`
   */
  def hashKey(str: String): Short =
    Math.floorMod(MurmurHash3.stringHash(str, 1), numShards.toInt).toShort

  /**
   * Looks up `key` in the shard it hashes to.
   *
   * @return
   *   whatever the shard's reader returns for `key`, or `null` if that shard has no reader
   */
  override def getAsString(key: String): String =
    sparkeys.get(hashKey(key)).map(_.getAsString(key)).orNull

  /**
   * Looks up `key` in the shard it hashes to.
   *
   * @return
   *   whatever the shard's reader returns for `key`, or `null` if that shard has no reader
   */
  override def getAsByteArray(key: Array[Byte]): Array[Byte] =
    sparkeys.get(hashKey(key)).map(_.getAsByteArray(key)).orNull

  /**
   * Looks up the entry for `key` in the shard it hashes to.
   *
   * @return
   *   whatever the shard's reader returns for `key`, or `null` if that shard has no reader
   */
  override def getAsEntry(key: Array[Byte]): SparkeyReader.Entry =
    sparkeys.get(hashKey(key)).map(_.getAsEntry(key)).orNull

  /** Not supported by this wrapper; always throws `NotImplementedError`. */
  override def getIndexHeader: IndexHeader =
    throw new NotImplementedError("ShardedSparkeyReader does not support getIndexHeader.")

  /** Not supported by this wrapper; always throws `NotImplementedError`. */
  override def getLogHeader: LogHeader =
    throw new NotImplementedError("ShardedSparkeyReader does not support getLogHeader.")

  /**
   * Creates a new sharded reader over a `duplicate()` of every underlying reader, with the same
   * `numShards`.
   */
  override def duplicate(): SparkeyReader = {
    val copies = sparkeys.map { case (shard, reader) => shard -> reader.duplicate() }
    new ShardedSparkeyReader(copies, numShards)
  }

  /**
   * Closes every underlying reader.
   *
   * All readers are attempted even if some fail, so that one failing shard does not leak the
   * others. If any `close()` fails, the first failure is rethrown with the remaining failures
   * attached as suppressed exceptions.
   */
  override def close(): Unit = {
    val failures: List[Throwable] = sparkeys.values.toList.flatMap { reader =>
      try {
        reader.close()
        Option.empty[Throwable]
      } catch {
        case scala.util.control.NonFatal(e) => Some(e)
      }
    }
    failures match {
      case first :: rest =>
        // addSuppressed rejects self-suppression, so skip the (unlikely) identical instance.
        rest.foreach(e => if (e ne first) first.addSuppressed(e))
        throw first
      case Nil => ()
    }
  }

  /**
   * Iterates over the entries of all shards, one shard after another (in the iteration order of
   * `sparkeys`). Returns an empty iterator when there are no readers.
   *
   * Each shard's iterator is obtained lazily, when the preceding shard has been exhausted.
   */
  override def iterator(): util.Iterator[SparkeyReader.Entry] =
    sparkeys.valuesIterator.flatMap(_.iterator().asScala).asJava

  /** Sum of `getLoadedBytes` over all underlying readers. */
  override def getLoadedBytes: Long = sparkeys.valuesIterator.map(_.getLoadedBytes).sum

  /** Sum of `getTotalBytes` over all underlying readers. */
  override def getTotalBytes: Long = sparkeys.valuesIterator.map(_.getTotalBytes).sum
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy