// Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
// You can buy this project and download/modify it how often you want.
/*
Copyright (c) 2014 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package ml.dmlc.xgboost4j.scala.spark.rapids
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.{DataSource, HadoopFsRelation}
import scala.collection.JavaConverters._
class GpuDataReader(sparkSession: SparkSession) {
/**
* Specifies the input data source format.
*/
def format(source: String): GpuDataReader = {
// Only records the format name here; `load(paths: String*)` is what later looks it up
// in `sourceTypeMap` and passes it on as the data source class name.
this.source = source
// Return this reader so that calls can be chained.
this
}
/**
* Specifies the input schema to use when parsing data.
*/
def schema(schema: StructType): GpuDataReader = {
  // Option(...) turns a null schema into None instead of Some(null).
  val wrappedSchema = Option(schema)
  specifiedSchema = wrappedSchema
  // Return this reader so that calls can be chained.
  this
}
/**
* Specifies the schema by using the input DDL-formatted string.
*
* {{{
* spark.read.schema("a INT, b STRING, c DOUBLE").csv("test.csv")
* }}}
*/
def schema(schemaString: String): GpuDataReader = {
  // Parse the DDL text into a StructType first, then record it as the specified schema.
  val parsedSchema = StructType.fromDDL(schemaString)
  this.specifiedSchema = Option(parsedSchema)
  // Return this reader so that calls can be chained.
  this
}
/**
* Adds an input option for the underlying data source.
*/
def option(key: String, value: String): GpuDataReader = {
  // Build the key/value entry, then add it to the options accumulated so far.
  val entry = (key, value)
  extraOptions += entry
  // Return this reader so that calls can be chained.
  this
}
/**
* Adds an input option for the underlying data source.
*/
def option(key: String, value: Boolean): GpuDataReader = {
  // Options are kept as strings, so render the flag as text and reuse the String overload.
  val rendered: String = value.toString
  option(key, rendered)
}
/**
* Adds an input option for the underlying data source.
*/
def option(key: String, value: Long): GpuDataReader = {
  // Options are kept as strings, so render the number as text and reuse the String overload.
  val rendered: String = value.toString
  option(key, rendered)
}
/**
* Adds an input option for the underlying data source.
*/
def option(key: String, value: Double): GpuDataReader = {
  // Options are kept as strings, so render the number as text and reuse the String overload.
  val rendered: String = value.toString
  option(key, rendered)
}
/**
* (Scala-specific) Adds input options for the underlying data source.
*/
def options(options: scala.collection.Map[String, String]): GpuDataReader = {
// Add every supplied entry to the options accumulated on this reader.
// NOTE(review): whether an existing key is overwritten depends on how `extraOptions`
// is declared, which is not visible here; confirm against its definition.
extraOptions ++= options
// Return this reader so that calls can be chained.
this
}
/**
* Adds input options for the underlying data source.
*/
def options(options: java.util.Map[String, String]): GpuDataReader = {
  // View the Java map as a Scala map and delegate to the Scala-map overload.
  val scalaView: scala.collection.Map[String, String] = options.asScala
  this.options(scalaView)
  // Return this reader so that calls can be chained.
  this
}
/**
* Loads input in as a `GpuDataset`, for data sources that don't require a path (e.g. external
* key-value stores).
*/
def load(): GpuDataset = {
  // Expanding an empty Seq with `: _*` selects the varargs overload rather than
  // recursing into this zero-argument method.
  val noPaths = Seq.empty[String]
  load(noPaths: _*)
}
/**
* Loads input in as a `GpuDataset`, for data sources that require a path (e.g. data backed by
* a local or distributed file system).
*/
def load(path: String): GpuDataset = {
  // The single path travels as the "path" option; the varargs overload is then
  // invoked with no explicit paths.
  val readerWithPath = option("path", path)
  readerWithPath.load(Seq.empty[String]: _*)
}
/**
* Loads input in as a `GpuDataset`, for data sources that support multiple paths.
* Only works if the source is a HadoopFsRelationProvider.
*/
@scala.annotation.varargs
def load(paths: String*): GpuDataset = {
  // Normalize the format name with Locale.ROOT so the lookup does not depend on the JVM's
  // default locale (under a Turkish locale, for example, "I" lower-cases to a dotless "ı",
  // which would make an otherwise valid format name miss its map entry).
  val formatKey = source.toLowerCase(java.util.Locale.ROOT)
  // Formats without an entry in `sourceTypeMap` are rejected before any relation is resolved.
  val sourceType = sourceTypeMap.getOrElse(formatKey,
    throw new UnsupportedOperationException("Unsupported input format: " + source))
  // Snapshot the accumulated options as an immutable map; the same snapshot is used both
  // for resolving the relation and for building the dataset.
  val optionsMap = extraOptions.toMap
  val fsRelation = DataSource.apply(sparkSession,
    paths = paths,
    userSpecifiedSchema = specifiedSchema,
    className = source,
    options = optionsMap).resolveRelation()
  // The resolved relation is cast to HadoopFsRelation; any other relation type
  // fails here with a ClassCastException.
  createDataset(fsRelation.asInstanceOf[HadoopFsRelation], sourceType, optionsMap)
}
protected def createDataset(relation: HadoopFsRelation, sourceType: String,
    sourceOptions: Map[String, String]): GpuDataset = {
  // "asFloats" is consumed here rather than forwarded with the remaining options;
  // it is treated as true when the key is absent.
  val asFloats = sourceOptions.get("asFloats").fold(true)(_.toBoolean)
  val forwardedOptions = sourceOptions - "asFloats"
  new GpuDataset(relation, sourceType, forwardedOptions, asFloats)
}
/**
* Loads a CSV file and returns the result as a `GpuDataset`. See the documentation on the
* other overloaded `csv()` method for more details.
*/
def csv(path: String): GpuDataset = {
  // Dedicated single-argument overload so that callers which explicitly need a
  // one-parameter method keep working (see SPARK-16009).
  val singlePath = Seq(path)
  csv(singlePath: _*)
}
/**
* Loads CSV files and returns the result as a `GpuDataset`.
* You can set the following CSV-specific options to deal with CSV files:
* <ul>
* <li>`sep` (default `,`): sets a single character as a separator for each
* field and value.</li>
* <li>`quote` (default `"`): sets a single character used for escaping quoted values where
* the separator can be part of the value.</li>
* <li>`comment` (default empty string): sets a single character used for skipping lines
* beginning with this character. By default, it is disabled.</li>
* <li>`header` (default `false`): uses the first line as names of columns.</li>
* <li>`nullValue` (default empty string): sets the string representation of a null value.</li>
* </ul>
*/
@scala.annotation.varargs
def csv(paths: String*): GpuDataset = {
  // Select the "csv" format, then hand every path to the varargs `load`.
  val csvReader = format("csv")
  csvReader.load(paths: _*)
}
/**
* Loads a Parquet file, returning the result as a `GpuDataset`. See the documentation
* on the other overloaded `parquet()` method for more details.
*/
def parquet(path: String): GpuDataset = {
  // Dedicated single-argument overload so that callers which explicitly need a
  // one-parameter method keep working (see SPARK-16009).
  val singlePath = Seq(path)
  parquet(singlePath: _*)
}
/**
* Loads a Parquet file, returning the result as a `GpuDataset`.
*
* You can set the following Parquet-specific option(s) for reading Parquet files:
*
*
`mergeSchema` (default is the value specified in `spark.sql.parquet.mergeSchema`): sets
* whether we should merge schemas collected from all Parquet part-files. This will override
* `spark.sql.parquet.mergeSchema`.