com.mongodb.spark.rdd.api.java.JavaMongoRDD.scala Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of mongo-spark-connector_2.12 Show documentation

mongo-spark-connector

There is a newer version: 10.2.3

Show newest version

/*
 * Copyright 2016 MongoDB, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.mongodb.spark.rdd.api.java

import java.util

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.sql.{DataFrame, Dataset}

import org.bson.conversions.Bson
import com.mongodb.spark.notNull
import com.mongodb.spark.rdd.MongoRDD

/**
 * Java specific version of [[com.mongodb.spark.rdd.MongoRDD]].
 *
 * @since 1.0
 */
case class JavaMongoRDD[D](override val rdd: MongoRDD[D])(implicit override val classTag: ClassTag[D]) extends JavaRDD[D](rdd)(classTag) {

  /**
   * Returns a copy with the specified aggregation pipeline
   *
   * @param pipeline the aggregation pipeline to use
   * @return the updated MongoJavaRDD
   */
  def withPipeline[B <: Bson](pipeline: util.List[B]): JavaMongoRDD[D] = {
    notNull("pipeline", pipeline)
    JavaMongoRDD(rdd.withPipeline(pipeline.asScala))
  }

  /**
   * Creates a `DataFrame` inferring the schema by sampling data from MongoDB.
   *
   * '''Note:''' Prefer [[toDS[T](beanClass:Class[T])*]] as any computations will be more efficient.
   *  The rdd must contain an `_id` for MongoDB versions < 3.2.
   *
   * @return a DataFrame
   */
  def toDF(): DataFrame = rdd.toDF()

  /**
   * Creates a `DataFrame` based on the schema derived from the bean class
   *
   * @param beanClass encapsulating the data from MongoDB
   * @tparam T The bean class type to shape the data from MongoDB into
   * @return a DataFrame
   */
  def toDF[T](beanClass: Class[T]): DataFrame = {
    notNull("beanClass", beanClass)
    rdd.toDF(beanClass)
  }

  /**
   * Creates a `Dataset` from the RDD strongly typed to the provided java bean.
   *
   * @param beanClass encapsulating the data from MongoDB
   * @tparam T The type of the data from MongoDB
   * @return a Dataset
   */
  def toDS[T](beanClass: Class[T]): Dataset[T] = {
    notNull("beanClass", beanClass)
    rdd.toDS(beanClass)
  }

}