com.stratio.provider.mongodb.MongodbRelation.scala
A Spark SQL library for MongoDB
/*
* Licensed to STRATIO (C) under one or more contributor license agreements.
* See the NOTICE file distributed with this work for additional information
* regarding copyright ownership. The STRATIO (C) licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.stratio.provider.mongodb
import com.stratio.provider.DeepConfig
import com.stratio.provider.mongodb.partitioner.MongodbPartitioner
import com.stratio.provider.mongodb.rdd.MongodbRDD
import com.stratio.provider.mongodb.schema.{MongodbRowConverter, MongodbSchema}
import com.stratio.provider.mongodb.writer.MongodbSimpleWriter
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
import org.apache.spark.sql.sources.{InsertableRelation, BaseRelation, Filter, PrunedFilteredScan}
import org.apache.spark.sql.types._
/**
 * A MongoDB base relation that can eliminate unneeded columns
 * and filter using selected predicates before producing
 * an RDD containing all matching tuples as Row objects.
 * @param config A Deep configuration holding the MongoDB connection properties.
 * @param schemaProvided The optionally provided schema. If not provided, it is
 *                       inferred from the full field projection of the table
 *                       referenced in the Spark SQL statement, using a sampling
 *                       ratio (as the JSON data source does).
 * @param sqlContext An existing Spark SQL context.
 */
case class MongodbRelation(
  config: DeepConfig,
  schemaProvided: Option[StructType] = None)(
  @transient val sqlContext: SQLContext) extends BaseRelation
  with PrunedFilteredScan with InsertableRelation {

  import MongodbRelation._

  /** Partitioner used to split the MongoDB collection into Spark partitions. */
  private val rddPartitioner: MongodbPartitioner =
    new MongodbPartitioner(config)
  /**
   * Default schema, used when no schema has been provided.
   * It is inferred by scanning an RDD over the collection,
   * sampling documents according to the configured sampling ratio.
   */
  @transient private lazy val lazySchema =
    MongodbSchema(
      new MongodbRDD(sqlContext, config, rddPartitioner),
      config[Double](MongodbConfig.SamplingRatio)).schema()

  override val schema: StructType = schemaProvided.getOrElse(lazySchema)
  /**
   * Builds the scan over the MongoDB collection, pruning the schema to the
   * required columns and handing the given filters to the underlying RDD.
   */
  override def buildScan(
    requiredColumns: Array[String],
    filters: Array[Filter]): RDD[Row] = {

    val rdd = new MongodbRDD(
      sqlContext,
      config,
      rddPartitioner,
      requiredColumns,
      filters)

    MongodbRowConverter.asRow(pruneSchema(schema, requiredColumns), rdd)
  }

  /** Whether the target MongoDB collection is empty. */
  def isEmptyCollection: Boolean = new MongodbSimpleWriter(config).isEmpty
  /**
   * Inserts the given data into the MongoDB collection backing this relation.
   * @param data Data to insert.
   * @param overwrite Whether to drop the existing collection before inserting.
   */
  def insert(data: DataFrame, overwrite: Boolean): Unit = {
    if (overwrite) {
      new MongodbSimpleWriter(config).dropCollection
    }
    data.saveToMongodb(config)
  }
}
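// Usage sketch: the relation is normally instantiated by the library's data
// source, but it can also be exercised directly. The snippet below is a
// minimal illustration and assumes an already-built `config: DeepConfig`, an
// existing `sqlContext: SQLContext` and a `people: DataFrame`; the field
// names are made up for the example.
//
//   import org.apache.spark.sql.sources.EqualTo
//   import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
//
//   val schema = StructType(Seq(
//     StructField("name", StringType),
//     StructField("age", IntegerType)))
//
//   // With an explicit schema, no sampling takes place.
//   val relation = MongodbRelation(config, Some(schema))(sqlContext)
//
//   // Without a schema, `schema` is inferred by sampling the collection
//   // according to MongodbConfig.SamplingRatio.
//   val inferred = MongodbRelation(config)(sqlContext).schema
//
//   // Pruned, filtered scan: only `name` is materialized, for rows with age == 30.
//   val names = relation.buildScan(Array("name"), Array(EqualTo("age", 30)))
//
//   // Drop the existing collection, then write the contents of `people`.
//   relation.insert(people, overwrite = true)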
object MongodbRelation {

  /**
   * Prunes the full schema down to the columns required by the
   * Spark SQL statement.
   * @param schema Full field-projection schema.
   * @param requiredColumns Columns required by the statement.
   * @return A new, pruned schema.
   */
  def pruneSchema(
    schema: StructType,
    requiredColumns: Array[String]): StructType =
    StructType(
      requiredColumns.flatMap(column =>
        schema.fields.find(_.name == column)))
}
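For reference, pruneSchema keeps only the requested columns, in the order they are requested, and silently drops any name that is not present in the schema (the flatMap over find yields nothing for it). A small, self-contained sketch, with made-up field names:

import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import com.stratio.provider.mongodb.MongodbRelation

val schema = StructType(Seq(
  StructField("name", StringType),
  StructField("age", IntegerType),
  StructField("city", StringType)))

// Yields StructType(StructField(city,StringType,true), StructField(name,StringType,true));
// "unknown" is dropped because it does not appear in the schema.
val pruned = MongodbRelation.pruneSchema(schema, Array("city", "name", "unknown"))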