com.stratio.provider.mongodb.MongodbRelation.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of spark-mongodb-core Show documentation
A Spark SQL library for MongoDB
The newest version!
/*
 *  Licensed to STRATIO (C) under one or more contributor license agreements.
 *  See the NOTICE file distributed with this work for additional information
 *  regarding copyright ownership. The STRATIO (C) licenses this file
 *  to you under the Apache License, Version 2.0 (the
 *  "License"); you may not use this file except in compliance
 *  with the License. You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing,
 *  software distributed under the License is distributed on an
 *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 *  KIND, either express or implied. See the License for the
 *  specific language governing permissions and limitations
 *  under the License.
 */

package com.stratio.provider.mongodb

import com.stratio.provider.DeepConfig
import com.stratio.provider.mongodb.partitioner.MongodbPartitioner
import com.stratio.provider.mongodb.rdd.MongodbRDD
import com.stratio.provider.mongodb.schema.{MongodbRowConverter, MongodbSchema}
import com.stratio.provider.mongodb.writer.MongodbSimpleWriter
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
import org.apache.spark.sql.sources.{InsertableRelation, BaseRelation, Filter, PrunedFilteredScan}
import org.apache.spark.sql.types._

/**
 * Spark SQL relation over MongoDB. It mixes in [[PrunedFilteredScan]], so
 * scans receive the required columns and the candidate filters, and
 * [[InsertableRelation]], so DataFrames can be written back.
 * @param config A Deep configuration with the properties needed for MongoDB
 * @param schemaProvided Optional user-supplied schema. When absent, the
 *                       schema is inferred through [[MongodbSchema]] from a
 *                       [[MongodbRDD]] built without projection or filters,
 *                       using the sampling ratio read from the configuration.
 * @param sqlContext An existing Spark SQL context.
 */
case class MongodbRelation(
  config: DeepConfig,
  schemaProvided: Option[StructType] = None)(
  @transient val sqlContext: SQLContext) extends BaseRelation
with PrunedFilteredScan with InsertableRelation {

  import MongodbRelation._

  private val rddPartitioner: MongodbPartitioner = new MongodbPartitioner(config)

  /**
   * Fallback schema, computed on first access only.
   * It is obtained from [[MongodbSchema]] applied to an RDD created with
   * just the context, configuration and partitioner, together with the
   * configured sampling ratio.
   */
  @transient private lazy val lazySchema = {
    val sampleSource = new MongodbRDD(sqlContext, config, rddPartitioner)
    val samplingRatio = config[Double](MongodbConfig.SamplingRatio)
    MongodbSchema(sampleSource, samplingRatio).schema()
  }

  // Strict val: when no schema was provided, the inference above is
  // triggered while the relation is being constructed.
  override val schema: StructType = schemaProvided match {
    case Some(userSchema) => userSchema
    case None => lazySchema
  }

  /**
   * Build the RDD of rows for a scan.
   * @param requiredColumns Columns requested by the statement.
   * @param filters Filters handed over to the underlying [[MongodbRDD]].
   * @return Rows converted according to the schema pruned to `requiredColumns`.
   */
  override def buildScan(
    requiredColumns: Array[String],
    filters: Array[Filter]): RDD[Row] = {
    val scanRdd =
      new MongodbRDD(sqlContext, config, rddPartitioner, requiredColumns, filters)
    val prunedSchema = pruneSchema(schema, requiredColumns)
    MongodbRowConverter.asRow(prunedSchema, scanRdd)
  }

  /**
   * Result of `isEmpty` on a [[MongodbSimpleWriter]] created
   * from this relation's configuration.
   */
  def isEmptyCollection: Boolean = {
    val writer = new MongodbSimpleWriter(config)
    writer.isEmpty
  }

  /**
   * Save the given data through this relation's configuration.
   * @param data Data to insert.
   * @param overwrite When true, `dropCollection` is invoked on a freshly
   *                  created writer before the data is saved.
   */
  def insert(data: DataFrame, overwrite: Boolean): Unit = {
    if (overwrite) {
      val writer = new MongodbSimpleWriter(config)
      writer.dropCollection
    }
    data.saveToMongodb(config)
  }

}

object MongodbRelation {

  /**
   * Narrow a schema down to the columns required by a Spark SQL statement.
   * The resulting fields follow the order of `requiredColumns`; for each
   * column the first field with that exact name is taken, and a column
   * with no matching field contributes nothing.
   * @param schema Whole field projection schema.
   * @param requiredColumns Names of the fields required by the statement.
   * @return A new schema holding only the matched fields.
   */
  def pruneSchema(
    schema: StructType,
    requiredColumns: Array[String]): StructType = {
    val keptFields: Array[StructField] =
      for {
        columnName <- requiredColumns
        field <- schema.fields.find(candidate => candidate.name == columnName)
      } yield field
    StructType(keptFields)
  }

}