All downloads are free. Search and download functionality uses the official Maven repository.

com.couchbase.spark.rdd.KeyValueRDD.scala Maven / Gradle / Ivy

/*
 * Copyright (c) 2015 Couchbase, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.couchbase.spark.rdd


import java.net.InetAddress
import java.util.zip.CRC32

import com.couchbase.client.core.config.CouchbaseBucketConfig
import com.couchbase.client.core.message.cluster.{GetClusterConfigRequest, GetClusterConfigResponse}
import com.couchbase.client.java.document.Document
import com.couchbase.spark.Logging
import com.couchbase.spark.connection.{CouchbaseConfig, CouchbaseConnection, KeyValueAccessor}
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partition, SparkContext, TaskContext}

import scala.reflect.ClassTag
import rx.lang.scala.JavaConversions._

import scala.concurrent.duration.Duration

/**
 * A Spark [[Partition]] that carries a batch of document IDs together with an optional
 * network address associated with that batch.
 *
 * @param id     value reported as this partition's index
 * @param docIds document IDs carried by this partition
 * @param loc    address attached to this partition, if any
 */
class KeyValuePartition(id: Int, docIds: Seq[String], loc: Option[InetAddress]) extends Partition {

  /** Address attached to this partition, if one was supplied. */
  def location: Option[InetAddress] = loc

  /** Document IDs carried by this partition. */
  def ids: Seq[String] = docIds

  /** Index of this partition, as handed to the constructor. */
  override def index: Int = id

  /** Renders the three constructor values as `KeyValuePartition(id, docIds, loc)`. */
  override def toString: String =
    Seq[Any](id, docIds, loc).mkString("KeyValuePartition(", ", ", ")")
}

/**
 * RDD that loads a fixed list of documents by ID through [[KeyValueAccessor]].
 *
 * The IDs are grouped by a CRC32-based hash of the key when the bucket configuration is a
 * [[CouchbaseBucketConfig]]; each group becomes one [[KeyValuePartition]] whose location is
 * the node reported as master for that hash value. For any other bucket configuration all
 * IDs are placed in a single partition without a location.
 *
 * @param sc      the Spark context; also the source of the Couchbase configuration
 * @param ids     the document IDs to load
 * @param bname   the bucket to read from; when `null`, the first bucket of the Couchbase
 *                configuration is used
 * @param timeout optional timeout handed unchanged to [[KeyValueAccessor]]
 * @tparam D      the document type produced by this RDD
 */
class KeyValueRDD[D <: Document[_]]
  (@transient private val sc: SparkContext, ids: Seq[String], bname: String = null,
   timeout: Option[Duration] = None)
  (implicit ct: ClassTag[D])
  extends RDD[D](sc, Nil) {

  private val cbConfig = CouchbaseConfig(sc.getConf)
  private val bucketName = Option(bname).getOrElse(cbConfig.buckets.head.name)

  /** Loads the documents whose IDs are carried by the given [[KeyValuePartition]]. */
  override def compute(split: Partition, context: TaskContext): Iterator[D] = {
    val p = split.asInstanceOf[KeyValuePartition]
    new KeyValueAccessor[D](cbConfig, p.ids, bucketName, timeout).compute()
  }

  /**
   * Fetches the current cluster configuration (blocking) and derives the partitions from it.
   *
   * @return one partition per distinct key hash for Couchbase buckets, otherwise a single
   *         partition holding every ID
   */
  override protected def getPartitions: Array[Partition] = {
    val core = CouchbaseConnection().bucket(cbConfig, bucketName).core()

    val req = new GetClusterConfigRequest()
    val config = toScalaObservable(core.send[GetClusterConfigResponse](req))
      .map(c => {
        // Diagnostic output only; it does not indicate a problem, so it is not a warning.
        logDebug(c.config().bucketConfigs().toString)
        logDebug(bucketName)
        c.config().bucketConfig(bucketName)
      })
      .toBlocking
      .single

    config match {
      case bucketConfig: CouchbaseBucketConfig =>
        val numPartitions = bucketConfig.numberOfPartitions()
        // Go through an iterator so zipWithIndex cannot rebuild a Map: the position in the
        // resulting array must equal the index stored in each partition.
        ids
          .groupBy(id => partitionForKey(id, numPartitions))
          .iterator
          .zipWithIndex
          .map { case ((partition, docIds), idx) =>
            new KeyValuePartition(idx, docIds, masterAddress(bucketConfig, partition))
          }
          .toArray[Partition]
      case _ =>
        logWarning("Memcached preferred locations currently not supported.")
        Array[Partition](new KeyValuePartition(0, ids, None))
    }
  }

  /**
   * Returns the host name and host address of the partition's location, or `Nil` when the
   * partition has no location.
   */
  override protected def getPreferredLocations(split: Partition): Seq[String] =
    split.asInstanceOf[KeyValuePartition].location
      .map(addr => Seq(addr.getHostName, addr.getHostAddress))
      .getOrElse(Nil)

  /**
   * Hashes a document ID: bits 16..30 of the CRC32 of its UTF-8 bytes, masked with
   * `numPartitions - 1`. The mask only behaves like a modulo when `numPartitions` is a
   * power of two.
   */
  private def partitionForKey(id: String, numPartitions: Int): Int = {
    val crc32 = new CRC32()
    crc32.update(id.getBytes(java.nio.charset.StandardCharsets.UTF_8))
    val rv = (crc32.getValue >> 16) & 0x7fff
    rv.toInt & (numPartitions - 1)
  }

  /**
   * Resolves the address of the node reported as master for the given partition.
   *
   * The location is only a scheduling hint, so a missing master (negative node index) or an
   * unresolvable host name yields `None` instead of failing partition planning.
   */
  private def masterAddress(
      bucketConfig: CouchbaseBucketConfig,
      partition: Int): Option[InetAddress] = {
    val nodeIndex = bucketConfig.nodeIndexForMaster(partition, false)
    if (nodeIndex < 0) {
      logWarning(s"No master node known for partition $partition, using no preferred location.")
      None
    } else {
      val host = bucketConfig.nodeAtIndex(nodeIndex).hostname().address()
      try {
        Some(InetAddress.getByName(host))
      } catch {
        case e: java.net.UnknownHostException =>
          logWarning(s"Could not resolve host $host, using no preferred location.", e)
          None
      }
    }
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy