/*
 * Licensed to SequoiaDB (C) under one or more contributor license agreements.
 * See the NOTICE file distributed with this work for additional information
 * regarding copyright ownership. The SequoiaDB (C) licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package com.sequoiadb.spark
/**
 * Source File Name = SequoiadbRelation.scala
 * Description      = SequoiaDB Relation derived from PrunedFilteredScan
 * Restrictions     = N/A
 * Change Activity:
 * Date       Who                Description
 * ========   ================== ================================================
 * 20150305   Tao Wang           Initial Draft
 */
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.sources.{Filter, PrunedFilteredScan, BaseRelation, InsertableRelation}
import com.sequoiadb.spark.partitioner.SequoiadbPartitioner
import com.sequoiadb.spark.rdd.SequoiadbRDD
import com.sequoiadb.spark.schema.SequoiadbSchema
import com.sequoiadb.spark.schema.SequoiadbRowConverter
import com.sequoiadb.spark.io.SequoiadbWriter
import com.sequoiadb.exception.BaseException
/**
 * A SequoiaDB BaseRelation that can eliminate unneeded columns
 * and filter using selected predicates before producing
 * an RDD containing all matching tuples as Row objects.
 * @param config A SequoiaDB configuration
 * @param schemaProvided The optionally provided schema. If not provided,
 *                       it is inferred from the whole field projection of the
 *                       table referenced in the Spark SQL statement, using a
 *                       sample ratio (as the JSON data source does).
 * @param sqlContext An existing Spark SQL context.
 */
case class SequoiadbRelation(
    config: SequoiadbConfig,
    schemaProvided: Option[StructType] = None)(
    @transient val sqlContext: SQLContext) extends BaseRelation
  with PrunedFilteredScan
  with InsertableRelation {

  import SequoiadbRelation._

  private val rddPartitioner: SequoiadbPartitioner =
    new SequoiadbPartitioner(config)

  /**
   * Default schema to be used in case none was provided beforehand.
   * It is inferred by sampling the RDD generated for the Spark SQL statement,
   * using the configured sample ratio.
   */
  @transient private lazy val lazySchema =
    SequoiadbSchema(
      SequoiadbRDD(
        sqlContext,
        config,
        Option(rddPartitioner)),
      config[Double](SequoiadbConfig.SamplingRatio)).schema()

  /**
   * Either use the schema provided by the user, or automatically infer one
   * by sampling the collection.
   */
  override val schema: StructType = schemaProvided.getOrElse(lazySchema)
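
  // Note: because lazySchema is a lazy val, the sampling pass used for schema
  // inference only happens when no schema was provided; supplying an explicit
  // schema through schemaProvided avoids that extra scan of the collection.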

  /**
   * Build a scan that takes the required columns and filters as input.
   * @param requiredColumns Columns required by the query
   * @param filters Predicates to be pushed down to the data source
   */
  override def buildScan(
      requiredColumns: Array[String],
      filters: Array[Filter]): RDD[Row] = {
    val rdd = SequoiadbRDD(
      sqlContext,
      config,
      Option(rddPartitioner),
      requiredColumns,
      filters)
    SequoiadbRowConverter.asRow(pruneSchema(schema, requiredColumns), rdd)
  }
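
  // Note (illustration, not in the original source): for a query such as
  //   SELECT name FROM employee WHERE age > 30
  // Spark would typically call buildScan with requiredColumns covering both
  // "name" and "age" and filters containing GreaterThan("age", 30). The
  // filters are a pushdown hint: Spark may re-evaluate the same predicates on
  // the returned rows, so applying them in the data source is an optimization
  // rather than a correctness requirement.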

  /**
   * Insert the given DataFrame into the underlying SequoiaDB collection.
   * @param data Input DataFrame
   * @param overwrite Whether existing data should be overwritten
   *                  (not supported yet)
   */
  override def insert(data: DataFrame, overwrite: Boolean): Unit = {
    val schema = data.schema
    if (overwrite) {
      // we don't support truncate yet
      throw new BaseException("SDB_OPTION_NOT_SUPPORT")
    }
    data.foreachPartition(it => {
      // always write through the coord node specified in the config
      new SequoiadbWriter(config).save(it, schema)
      //it.map(row => SequoiadbRowConverter.rowAsDBObject(row, data.schema)))
    })
  }
}

object SequoiadbRelation {

  /**
   * Prune the whole schema so that it only contains
   * the columns required by the Spark SQL statement.
   * @param schema Whole field projection schema.
   * @param requiredColumns Required fields in the statement
   * @return A new pruned schema
   */
  def pruneSchema(
      schema: StructType,
      requiredColumns: Array[String]): StructType =
    StructType(
      requiredColumns.flatMap(column =>
        schema.fields.find(_.name == column)))
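
  // Example (illustration, not in the original source): for
  //   schema          = StructType(Seq(StructField("a", IntegerType),
  //                                    StructField("b", StringType),
  //                                    StructField("c", DoubleType)))
  //   requiredColumns = Array("c", "a")
  // pruneSchema returns a StructType holding the "c" and "a" fields in that
  // order: fields follow the order of requiredColumns, and any requested name
  // missing from the schema is silently dropped by the flatMap/find pair.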
}
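
// ---------------------------------------------------------------------------
// Usage sketch (added for illustration; not part of the original source).
// Assumes a Spark version with the DataFrameReader/DataFrameWriter API and a
// DefaultSource registered under the "com.sequoiadb.spark" format; the option
// keys below ("host", "collectionspace", "collection") are assumptions about
// typical connector configuration, not values taken from this file.
//
//   val df = sqlContext.read
//     .format("com.sequoiadb.spark")
//     .option("host", "sdbserver1:11810")
//     .option("collectionspace", "sample")
//     .option("collection", "employee")
//     .load()
//
//   // Column pruning and filter pushdown end up in buildScan above:
//   df.filter(df("age") > 30).select("name").show()
//
//   // Appending rows goes through insert(); overwrite mode is rejected
//   // with SDB_OPTION_NOT_SUPPORT.
//   df.write
//     .format("com.sequoiadb.spark")
//     .option("host", "sdbserver1:11810")
//     .option("collectionspace", "sample")
//     .option("collection", "employee_bak")
//     .mode(org.apache.spark.sql.SaveMode.Append)
//     .save()
// ---------------------------------------------------------------------------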