
/*
* Copyright 2016 MongoDB, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.mongodb.spark
import scala.collection.JavaConverters._
import scala.reflect.ClassTag
import scala.reflect.runtime.universe._
import org.apache.spark.SparkContext
import org.apache.spark.api.java.{JavaRDD, JavaSparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.sources.{Filter, IsNotNull}
import org.apache.spark.sql.types.StructType
import org.bson.conversions.Bson
import org.bson.{BsonDocument, Document}
import com.mongodb.client.MongoCollection
import com.mongodb.client.model.{InsertOneModel, ReplaceOneModel, UpdateOptions}
import com.mongodb.spark.DefaultHelper.DefaultsTo
import com.mongodb.spark.config.{ReadConfig, WriteConfig}
import com.mongodb.spark.rdd.MongoRDD
import com.mongodb.spark.rdd.api.java.JavaMongoRDD
import com.mongodb.spark.sql.MapFunctions.{documentToRow, rowToDocument}
import com.mongodb.spark.sql.{MongoInferSchema, helpers}
/**
* The MongoSpark helper allows easy creation of RDDs, DataFrames or Datasets from MongoDB.
*
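* A hedged usage sketch (not part of the original source; it assumes an active `SparkContext` named
* `sc` whose `SparkConf` already carries `spark.mongodb.input.uri` and `spark.mongodb.output.uri`):
*
* {{{
* import com.mongodb.spark.MongoSpark
* import org.bson.Document
*
* val rdd = MongoSpark.load(sc)                                          // MongoRDD[Document]
* MongoSpark.save(sc.parallelize(Seq(Document.parse("{ counter: 1 }"))))  // writes to the configured collection
* }}}
*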
* @since 1.0
*/
object MongoSpark {
private val DefaultMaxBatchSize = 512
/**
* The default source string for creating DataFrames from MongoDB
*/
val defaultSource = "com.mongodb.spark.sql.DefaultSource"
/**
* Create a builder for configuring the [[MongoSpark]]
*
* @return a MongoSpark Builder
*/
def builder(): Builder = new Builder
/**
* Load data from MongoDB
*
* @param sc the Spark context containing the MongoDB connection configuration
* @return a MongoRDD
*/
def load[D: ClassTag](sc: SparkContext)(implicit e: D DefaultsTo Document): MongoRDD[D] = load(sc, ReadConfig(sc))
/**
* Load data from MongoDB
*
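* For example (an illustrative sketch; the option values mirror the connector documentation and are
* placeholders for your own settings):
*
* {{{
* val readConfig = ReadConfig(Map("collection" -> "spark", "readPreference.name" -> "secondaryPreferred"), Some(ReadConfig(sc)))
* val rdd = MongoSpark.load(sc, readConfig)
* }}}
*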
* @param sc the Spark context containing the MongoDB connection configuration
* @param readConfig the read configuration to use
* @return a MongoRDD
*/
def load[D: ClassTag](sc: SparkContext, readConfig: ReadConfig)(implicit e: D DefaultsTo Document): MongoRDD[D] =
builder().sparkContext(sc).readConfig(readConfig).build().toRDD[D]()
/**
* Load data from MongoDB
*
* @param sparkSession the SparkSession containing the MongoDB connection configuration
* @tparam D The optional class defining the schema for the data
* @return a DataFrame
*/
def load[D <: Product: TypeTag](sparkSession: SparkSession): DataFrame =
load[D](sparkSession, ReadConfig(sparkSession.sparkContext))
/**
* Load data from MongoDB
*
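* For example (an illustrative sketch; it assumes an active `SparkSession` named `sparkSession`, and
* `Character` is a hypothetical case class describing the collection):
*
* {{{
* case class Character(name: String, age: Int)
* val df = MongoSpark.load[Character](sparkSession, ReadConfig(sparkSession.sparkContext))
* df.printSchema()
* }}}
*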
* @param sparkSession the SparkSession containing the MongoDB connection configuration
* @param readConfig the read configuration to use
* @tparam D The optional class defining the schema for the data
* @return a DataFrame
*/
def load[D <: Product: TypeTag](sparkSession: SparkSession, readConfig: ReadConfig): DataFrame =
builder().sparkSession(sparkSession).readConfig(readConfig).build().toDF[D]()
/**
* Load data from MongoDB
*
* @param sparkSession the SparkSession containing the MongoDB connection configuration
* @param readConfig the read configuration to use
* @param clazz the class of the data contained in the Dataset
* @tparam D The bean class defining the schema for the data
* @return a Dataset
*/
def load[D](sparkSession: SparkSession, readConfig: ReadConfig, clazz: Class[D]): Dataset[D] =
builder().sparkSession(sparkSession).readConfig(readConfig).build().toDS(clazz)
/**
* Save data to MongoDB
*
* Uses the `SparkConf` for the database and collection information
* Requires a codec for the data type
*
* @param rdd the RDD data to save to MongoDB
* @tparam D the type of the data in the RDD
*/
def save[D: ClassTag](rdd: RDD[D]): Unit = save(rdd, WriteConfig(rdd.sparkContext))
/**
* Save data to MongoDB
*
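* For example (a hedged sketch; `documents` stands for an existing `RDD[Document]` and the option
* values are placeholders):
*
* {{{
* val writeConfig = WriteConfig(Map("collection" -> "spark", "writeConcern.w" -> "majority"), Some(WriteConfig(sc)))
* MongoSpark.save(documents, writeConfig)
* }}}
*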
* @param rdd the RDD data to save to MongoDB
* @param writeConfig the writeConfig
* @tparam D the type of the data in the RDD
*/
def save[D: ClassTag](rdd: RDD[D], writeConfig: WriteConfig): Unit = {
val mongoConnector = MongoConnector(writeConfig.asOptions)
rdd.foreachPartition(iter => if (iter.nonEmpty) {
mongoConnector.withCollectionDo(writeConfig, { collection: MongoCollection[D] =>
iter.grouped(DefaultMaxBatchSize).foreach(batch => collection.insertMany(batch.toList.asJava))
})
})
}
/**
* Save data to MongoDB
*
* Uses the `SparkConf` for the database and collection information
*
* '''Note:''' If the dataset contains an `_id` field, the data will be upserted, replacing any existing documents in the collection.
*
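* For example (an illustrative sketch; it assumes an active `SparkSession` named `sparkSession`,
* `Person` is a hypothetical case class, and the output collection is configured on the `SparkConf`):
*
* {{{
* case class Person(_id: String, name: String)
* import sparkSession.implicits._
* val people = Seq(Person("1", "Ada"), Person("2", "Grace")).toDS()
* MongoSpark.save(people)   // documents with matching _id values are replaced via upsert
* }}}
*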
* @param dataset the dataset to save to MongoDB
* @tparam D the type of the data in the Dataset
* @since 1.1.0
*/
def save[D](dataset: Dataset[D]): Unit = save(dataset, WriteConfig(dataset.sparkSession.sparkContext.getConf))
/**
* Save data to MongoDB
*
* '''Note:''' If the dataset contains an `_id` field, the data will be upserted, replacing any existing documents in the collection.
*
* @param dataset the dataset to save to MongoDB
* @param writeConfig the writeConfig
* @tparam D the type of the data in the Dataset
* @since 1.1.0
*/
def save[D](dataset: Dataset[D], writeConfig: WriteConfig): Unit = {
val mongoConnector = MongoConnector(writeConfig.asOptions)
val documentRdd: RDD[BsonDocument] = dataset.toDF().rdd.map(row => rowToDocument(row))
if (dataset.schema.fields.exists(_.name == "_id")) {
documentRdd.foreachPartition(iter => if (iter.nonEmpty) {
mongoConnector.withCollectionDo(writeConfig, { collection: MongoCollection[BsonDocument] =>
iter.grouped(DefaultMaxBatchSize).foreach(batch => {
val updateOptions = new UpdateOptions().upsert(true)
val requests = batch.map(doc =>
Option(doc.get("_id")) match {
case Some(_id) => new ReplaceOneModel[BsonDocument](new BsonDocument("_id", _id), doc, updateOptions)
case None => new InsertOneModel[BsonDocument](doc)
})
collection.bulkWrite(requests.toList.asJava)
})
})
})
} else {
MongoSpark.save(documentRdd, writeConfig)
}
}
/**
* Save data to MongoDB
*
* Uses the `SparkConf` for the database and collection information
*
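* For example (illustrative only; `df` stands for any DataFrame, the collection name is a placeholder,
* and the remaining output settings are assumed to be on the `SparkConf`):
*
* {{{
* MongoSpark.save(df.write.option("collection", "copies").mode("overwrite"))
* }}}
*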
* @param dataFrameWriter the DataFrameWriter to save to MongoDB
*/
def save(dataFrameWriter: DataFrameWriter[_]): Unit = dataFrameWriter.format(defaultSource).save()
/**
* Save data to MongoDB
*
* @param dataFrameWriter the DataFrameWriter to save to MongoDB
* @param writeConfig the writeConfig
*/
def save(dataFrameWriter: DataFrameWriter[_], writeConfig: WriteConfig): Unit =
dataFrameWriter.format(defaultSource).options(writeConfig.asOptions).save()
/**
* Creates a DataFrameReader with `MongoDB` as the source
*
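* For example (a hedged sketch; it assumes an active `SparkSession` named `sparkSession` and the URI is
* a placeholder for your own connection string):
*
* {{{
* val df = MongoSpark.read(sparkSession).option("uri", "mongodb://127.0.0.1/test.myCollection").load()
* }}}
*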
* @param sparkSession the SparkSession
* @return the DataFrameReader
*/
def read(sparkSession: SparkSession): DataFrameReader = sparkSession.read.format("com.mongodb.spark.sql")
/**
* Creates a DataFrameWriter with the `MongoDB` underlying output data source.
*
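* For example (illustrative only; `ds` stands for any Dataset, the collection name is a placeholder,
* and the remaining output settings are assumed to be on the `SparkConf`):
*
* {{{
* MongoSpark.write(ds).option("collection", "copies").mode("overwrite").save()
* }}}
*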
* @param dataset the Dataset to convert into a DataFrameWriter
* @return the DataFrameWriter
*/
def write[T](dataset: Dataset[T]): DataFrameWriter[T] = dataset.write.format("com.mongodb.spark.sql")
/**
* Builder for configuring and creating a [[MongoSpark]]
*
* It requires a `SparkSession` or the `SparkContext`
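*
* A hedged construction sketch (it assumes an active `SparkSession` named `sparkSession`; the collection
* name is a placeholder and the remaining connection settings are expected on the `SparkConf`):
*
* {{{
* val rdd = MongoSpark.builder()
*   .sparkSession(sparkSession)
*   .option("collection", "myCollection")
*   .build()
*   .toRDD()                 // MongoRDD[Document]
* }}}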
*/
class Builder {
private var sparkSession: Option[SparkSession] = None
private var connector: Option[MongoConnector] = None
private var readConfig: Option[ReadConfig] = None
private var pipeline: Seq[Bson] = Nil
private var options: collection.Map[String, String] = Map()
def build(): MongoSpark = {
require(sparkSession.isDefined, "The SparkSession must be set, either explicitly or via the SparkContext")
val session = sparkSession.get
val readConf = if (readConfig.isDefined) ReadConfig(options, readConfig) else ReadConfig(session.sparkContext.getConf, options)
val mongoConnector = connector.getOrElse(MongoConnector(readConf))
val bsonDocumentPipeline = pipeline.map(x => x.toBsonDocument(classOf[Document], mongoConnector.codecRegistry))
new MongoSpark(session, mongoConnector, readConf, bsonDocumentPipeline)
}
/**
* Sets the SparkSession
*
* @param sparkSession the SparkSession for the RDD
*/
def sparkSession(sparkSession: SparkSession): Builder = {
this.sparkSession = Option(sparkSession)
this
}
/**
* Sets the SparkSession from the sparkContext
*
* @param sparkContext for the RDD
*/
def sparkContext(sparkContext: SparkContext): Builder = {
this.sparkSession = Option(SparkSession.builder().config(sparkContext.getConf).getOrCreate())
this
}
/**
* Sets the SparkSession from the javaSparkContext
*
* @param javaSparkContext for the RDD
*/
def javaSparkContext(javaSparkContext: JavaSparkContext): Builder = sparkContext(javaSparkContext.sc)
/**
* Sets the SparkSession from the sqlContext
*
* @param sqlContext the SQLContext for the RDD
*/
@deprecated("As of Spark 2.0 SQLContext was replaced by SparkSession. Use the SparkSession method instead", "2.0.0")
def sqlContext(sqlContext: SQLContext): Builder = sparkSession(sqlContext.sparkSession)
/**
* Append a configuration option
*
* These options can be used to configure all aspects of how to connect to MongoDB
*
* @param key the configuration key
* @param value the configuration value
*/
def option(key: String, value: String): Builder = {
this.options = this.options + (key -> value)
this
}
/**
* Set configuration options
*
* These options can configure all aspects of how to connect to MongoDB
*
* @param options the configuration options
*/
def options(options: collection.Map[String, String]): Builder = {
this.options = options
this
}
/**
* Set configuration options
*
* These options can configure all aspects of how to connect to MongoDB
*
* @param options the configuration options
*/
def options(options: java.util.Map[String, String]): Builder = {
this.options = options.asScala
this
}
/**
* Sets the [[com.mongodb.spark.MongoConnector]] to use
*
* @param connector the MongoConnector
*/
def connector(connector: MongoConnector): Builder = {
this.connector = Option(connector)
this
}
/**
* Sets the [[com.mongodb.spark.config.ReadConfig]] to use
*
* @param readConfig the readConfig
*/
def readConfig(readConfig: ReadConfig): Builder = {
this.readConfig = Option(readConfig)
this
}
/**
* Sets the aggregation pipeline to use
*
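* For example (an illustrative sketch; `sc` stands for an existing SparkContext and the match stage is
* parsed from a raw JSON string):
*
* {{{
* val pipeline = Seq(Document.parse("""{ "$match": { "status": "A" } }"""))
* val rdd = MongoSpark.builder().sparkContext(sc).pipeline(pipeline).build().toRDD()
* }}}
*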
* @param pipeline the aggregation pipeline
*/
def pipeline(pipeline: Seq[Bson]): Builder = {
this.pipeline = pipeline
this
}
}
/*
* Java API helpers
*/
/**
* Load data from MongoDB
*
* @param jsc the Spark context containing the MongoDB connection configuration
* @return a JavaMongoRDD
*/
def load(jsc: JavaSparkContext): JavaMongoRDD[Document] = builder().javaSparkContext(jsc).build().toJavaRDD()
/**
* Load data from MongoDB
*
* @param jsc the Spark context containing the MongoDB connection configuration
* @param readConfig the read configuration to use
* @return a JavaMongoRDD
*/
def load(jsc: JavaSparkContext, readConfig: ReadConfig): JavaMongoRDD[Document] =
builder().javaSparkContext(jsc).readConfig(readConfig).build().toJavaRDD()
/**
* Load data from MongoDB
*
* @param jsc the Spark context containing the MongoDB connection configuration
* @param readConfig the read configuration to use
* @param clazz the class of the data contained in the RDD
* @tparam D the type of the data in the RDD
* @return a JavaMongoRDD
*/
def load[D](jsc: JavaSparkContext, readConfig: ReadConfig, clazz: Class[D]): JavaMongoRDD[D] =
builder().javaSparkContext(jsc).readConfig(readConfig).build().toJavaRDD(clazz)
/**
* Load data from MongoDB
*
* @param jsc the Spark context containing the MongoDB connection configuration
* @param clazz the class of the data contained in the RDD
* @tparam D the type of the data in the RDD
* @return a JavaMongoRDD
*/
def load[D](jsc: JavaSparkContext, clazz: Class[D]): JavaMongoRDD[D] = builder().javaSparkContext(jsc).build().toJavaRDD(clazz)
/**
* Save data to MongoDB
*
* Uses the `SparkConf` for the database and collection information
*
* @param javaRDD the RDD data to save to MongoDB
*/
def save(javaRDD: JavaRDD[Document]): Unit = save(javaRDD, classOf[Document])
/**
* Save data to MongoDB
*
* Uses the `SparkConf` for the database and collection information
* Requires a codec for the data type
*
* @param javaRDD the RDD data to save to MongoDB
* @param clazz the class of the data contained in the RDD
* @tparam D the type of the data in the RDD
*/
def save[D](javaRDD: JavaRDD[D], clazz: Class[D]): Unit = {
notNull("javaRDD", javaRDD)
implicit def ct: ClassTag[D] = ClassTag(clazz)
save[D](javaRDD.rdd)
}
/**
* Save data to MongoDB
*
* Uses the `writeConfig` for the database and collection information
*
* @param javaRDD the RDD data to save to MongoDB
* @param writeConfig the [[com.mongodb.spark.config.WriteConfig]]
*/
def save(javaRDD: JavaRDD[Document], writeConfig: WriteConfig): Unit =
save(javaRDD, writeConfig, classOf[Document])
/**
* Save data to MongoDB
*
* Uses the `writeConfig` for the database and collection information
* Requires a codec for the data type
*
* @param javaRDD the RDD data to save to MongoDB
* @param writeConfig the [[com.mongodb.spark.config.WriteConfig]]
* @param clazz the class of the data contained in the RDD
* @tparam D the type of the data in the RDD
*/
def save[D](javaRDD: JavaRDD[D], writeConfig: WriteConfig, clazz: Class[D]): Unit = {
notNull("javaRDD", javaRDD)
notNull("writeConfig", writeConfig)
implicit def ct: ClassTag[D] = ClassTag(clazz)
save[D](javaRDD.rdd, writeConfig)
}
// Deprecated APIs
/**
* Load data from MongoDB
*
* @param sqlContext the SQLContext containing the MongoDB connection configuration
* @tparam D The optional class defining the schema for the data
* @return a DataFrame
*/
@deprecated("As of Spark 2.0 SQLContext was replaced by SparkSession. Use the SparkSession method instead", "2.0.0")
def load[D <: Product: TypeTag](sqlContext: SQLContext): DataFrame = load[D](sqlContext.sparkSession)
/**
* Load data from MongoDB
*
* @param sqlContext the SQLContext containing the MongoDB connection configuration
* @tparam D The optional class defining the schema for the data
* @return a DataFrame
*/
@deprecated("As of Spark 2.0 SQLContext was replaced by SparkSession. Use the SparkSession method instead", "2.0.0")
def load[D <: Product: TypeTag](sqlContext: SQLContext, readConfig: ReadConfig): DataFrame =
load(sqlContext.sparkSession, readConfig)
/**
* Load data from MongoDB
*
* @param sqlContext the SQL context containing the MongoDB connection configuration
* @param clazz the class of the data contained in the RDD
* @tparam D The bean class defining the schema for the data
* @return a DataFrame
*/
@deprecated("As of Spark 2.0 SQLContext was replaced by SparkSession. Use the SparkSession method instead", "2.0.0")
def load[D](sqlContext: SQLContext, readConfig: ReadConfig, clazz: Class[D]): DataFrame =
builder().sparkSession(sqlContext.sparkSession).readConfig(readConfig).build().toDF(clazz)
/**
* Creates a DataFrameReader with `MongoDB` as the source
*
* @param sqlContext the SQLContext
* @return the DataFrameReader
*/
@deprecated("As of Spark 2.0 SQLContext was replaced by SparkSession. Use the SparkSession method instead", "2.0.0")
def read(sqlContext: SQLContext): DataFrameReader = read(sqlContext.sparkSession)
}
/**
* The MongoSpark class, providing entry points to create RDDs, DataFrames and Datasets from a MongoDB collection
*
* '''Note:''' Creation of the class should be via [[MongoSpark$.builder]].
*
* @since 1.0
*/
case class MongoSpark(sparkSession: SparkSession, connector: MongoConnector, readConfig: ReadConfig, pipeline: Seq[BsonDocument]) {
private def rdd[D: ClassTag]()(implicit e: D DefaultsTo Document): MongoRDD[D] =
new MongoRDD[D](sparkSession, sparkSession.sparkContext.broadcast(connector), readConfig, pipeline)
if (readConfig.registerSQLHelperFunctions) {
helpers.UDF.registerFunctions(sparkSession)
}
/**
* Creates an `RDD` for the collection
*
* @tparam D the datatype for the collection
* @return a MongoRDD[D]
*/
def toRDD[D: ClassTag]()(implicit e: D DefaultsTo Document): MongoRDD[D] = rdd[D]
/**
* Creates a `JavaRDD` for the collection
*
* @return a JavaMongoRDD[Document]
*/
def toJavaRDD(): JavaMongoRDD[Document] = rdd[Document].toJavaRDD()
/**
* Creates a `JavaRDD` for the collection
*
* @param clazz the class of the data contained in the RDD
* @tparam D the type of the data in the RDD
* @return the javaRDD
*/
def toJavaRDD[D](clazz: Class[D]): JavaMongoRDD[D] = {
implicit def ct: ClassTag[D] = ClassTag(clazz)
rdd[D].toJavaRDD()
}
/**
* Creates a `DataFrame` based on the schema derived from the optional type.
*
* '''Note:''' Prefer [[toDS[T<:Product]()*]] as computations will be more efficient.
* The rdd must contain an `_id` for MongoDB versions < 3.2.
*
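* For example (illustrative only; `Character` is a hypothetical case class matching the collection and
* `mongoSpark` a built [[MongoSpark]] instance):
*
* {{{
* case class Character(name: String, age: Int)
* val df = mongoSpark.toDF[Character]()
* }}}
*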
* @tparam T The optional type of the data from MongoDB; if not provided, the schema will be inferred from the collection
* @return a DataFrame
*/
def toDF[T <: Product: TypeTag](): DataFrame = {
val schema: StructType = MongoInferSchema.reflectSchema[T]() match {
case Some(reflectedSchema) => reflectedSchema
case None => MongoInferSchema(toBsonDocumentRDD)
}
toDF(schema)
}
/**
* Creates a `DataFrame` based on the schema derived from the bean class.
*
* '''Note:''' Prefer [[toDS[T](beanClass:Class[T])*]] as computations will be more efficient.
*
* @param beanClass encapsulating the data from MongoDB
* @tparam T The bean class type to shape the data from MongoDB into
* @return a DataFrame
*/
def toDF[T](beanClass: Class[T]): DataFrame = toDF(MongoInferSchema.reflectSchema[T](beanClass))
/**
* Creates a `DataFrame` based on the provided schema.
*
* @param schema the schema representing the DataFrame.
* @return a DataFrame.
*/
def toDF(schema: StructType): DataFrame = {
sparkSession.read.format("com.mongodb.spark.sql")
.schema(schema)
.options(readConfig.asOptions)
.option("pipeline", pipeline.map(_.toJson).mkString("[", ",", "]"))
.load()
}
/**
* Creates a `Dataset` from the collection strongly typed to the provided case class.
*
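* For example (illustrative only; `Character` is a hypothetical case class and `mongoSpark` a built
* [[MongoSpark]] instance):
*
* {{{
* case class Character(name: String, age: Int)
* val ds: Dataset[Character] = mongoSpark.toDS[Character]()
* }}}
*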
* @tparam T The type of the data from MongoDB
* @return a Dataset
*/
def toDS[T <: Product: TypeTag: NotNothing](): Dataset[T] = {
val dataFrame: DataFrame = toDF[T]()
import dataFrame.sqlContext.implicits._
dataFrame.as[T]
}
/**
* Creates a `Dataset` from the RDD strongly typed to the provided java bean.
*
* @tparam T The type of the data from MongoDB
* @return a Dataset
*/
def toDS[T](beanClass: Class[T]): Dataset[T] = toDF[T](beanClass).as(Encoders.bean(beanClass))
private def toBsonDocumentRDD: MongoRDD[BsonDocument] = {
MongoSpark.builder()
.sparkSession(sparkSession)
.connector(connector)
.readConfig(readConfig)
.pipeline(pipeline)
.build()
.toRDD[BsonDocument]()
}
}