All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.spark.sql.DataSourceRegistration.scala Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql

import org.apache.spark.SparkClassNotFoundException
import org.apache.spark.annotation.Evolving
import org.apache.spark.internal.Logging
import org.apache.spark.sql.errors.QueryCompilationErrors
import org.apache.spark.sql.execution.datasources.{DataSource, DataSourceManager}
import org.apache.spark.sql.execution.datasources.v2.python.UserDefinedPythonDataSource
import org.apache.spark.sql.internal.SQLConf

/**
 * Functions for registering user-defined data sources.
 * Use `SparkSession.dataSource` to access this.
 */
@Evolving
class DataSourceRegistration private[sql] (dataSourceManager: DataSourceManager)
  extends Logging {

  protected[sql] def registerPython(
      name: String,
      dataSource: UserDefinedPythonDataSource): Unit = {
    log.debug(
      s"""
         | Registering new Python data source:
         | name: $name
         | command: ${dataSource.dataSourceCls.command}
         | envVars: ${dataSource.dataSourceCls.envVars}
         | pythonIncludes: ${dataSource.dataSourceCls.pythonIncludes}
         | pythonExec: ${dataSource.dataSourceCls.pythonExec}
      """.stripMargin)

    checkDataSourceExists(name)

    dataSourceManager.registerDataSource(name, dataSource)
  }

  /**
   * Checks if the specified data source exists.
   *
   * This method allows for user-defined data sources to be registered even if they
   * have the same name as an existing data source in the registry. However, if the
   * data source can be successfully loaded and is not a user-defined one, an error
   * is thrown to prevent lookup errors with built-in or Scala/Java data sources.
   */
  private def checkDataSourceExists(name: String): Unit = {
    // Allow re-registration of user-defined data sources.
    // TODO(SPARK-46616): disallow re-registration of statically registered data sources.
    if (dataSourceManager.dataSourceExists(name)) return

    try {
      DataSource.lookupDataSource(name, SQLConf.get)
      throw QueryCompilationErrors.dataSourceAlreadyExists(name)
    } catch {
      case e: SparkClassNotFoundException if e.getErrorClass == "DATA_SOURCE_NOT_FOUND" => // OK
      case _: Throwable =>
        // If there are other errors when resolving the data source, it's unclear whether
        // it's safe to proceed. To prevent potential lookup errors, treat it as an existing
        // data source and prevent re-registration.
        throw QueryCompilationErrors.dataSourceAlreadyExists(name)
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy