/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.hudi.utilities.sources;

import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.utilities.SqlQueryBuilder;
import org.apache.hudi.utilities.config.JdbcSourceConfig;
import org.apache.hudi.utilities.exception.HoodieReadFromSourceException;
import org.apache.hudi.utilities.schema.SchemaProvider;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.DataFrameReader;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.storage.StorageLevel;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.InputStream;
import java.net.URI;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Set;

import static org.apache.hudi.common.util.ConfigUtils.checkRequiredConfigProperties;
import static org.apache.hudi.common.util.ConfigUtils.containsConfigProperty;
import static org.apache.hudi.common.util.ConfigUtils.getBooleanWithAltKeys;
import static org.apache.hudi.common.util.ConfigUtils.getStringWithAltKeys;
import static org.apache.hudi.common.util.ConfigUtils.stripPrefix;

/**
 * Reads data from RDBMS data sources.
 */

public class JdbcSource extends RowSource {

  private static final Logger LOG = LoggerFactory.getLogger(JdbcSource.class);
  private static final List<String> DB_LIMIT_CLAUSE = Arrays.asList("mysql", "postgresql", "h2");
  private static final String URI_JDBC_PREFIX = "jdbc:";

  public JdbcSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession,
                    SchemaProvider schemaProvider) {
    super(props, sparkContext, sparkSession, schemaProvider);
  }
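
  // Illustrative only: this source is typically wired into Hudi Streamer (HoodieStreamer) with
  //   --source-class org.apache.hudi.utilities.sources.JdbcSource
  // and configured through the properties file using the keys defined in JdbcSourceConfig
  // (URL, USER, DRIVER_CLASS, RDBMS_TABLE_NAME, IS_INCREMENTAL, ...); see that class for the
  // exact property names.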

  /**
   * Validates all user properties and prepares the {@link DataFrameReader} to read from RDBMS.
   *
   * @param session    The {@link SparkSession}.
   * @param properties The JDBC connection properties and data source options.
   * @return The {@link DataFrameReader} to read from RDBMS
   * @throws HoodieException
   */
  private static DataFrameReader validatePropsAndGetDataFrameReader(final SparkSession session,
                                                                    final TypedProperties properties)
      throws HoodieException {
    DataFrameReader dataFrameReader;
    InputStream passwordFileStream = null;
    try {
      dataFrameReader = session.read().format("jdbc");
      dataFrameReader = dataFrameReader.option(
          Config.URL_PROP, getStringWithAltKeys(properties, JdbcSourceConfig.URL));
      dataFrameReader = dataFrameReader.option(
          Config.USER_PROP, getStringWithAltKeys(properties, JdbcSourceConfig.USER));
      dataFrameReader = dataFrameReader.option(
          Config.DRIVER_PROP, getStringWithAltKeys(properties, JdbcSourceConfig.DRIVER_CLASS));
      dataFrameReader = dataFrameReader.option(
          Config.RDBMS_TABLE_PROP, getStringWithAltKeys(properties, JdbcSourceConfig.RDBMS_TABLE_NAME));

      if (containsConfigProperty(properties, JdbcSourceConfig.PASSWORD)) {
        LOG.info("Reading JDBC password from properties file....");
        dataFrameReader = dataFrameReader.option(Config.PASSWORD_PROP,
            getStringWithAltKeys(properties, JdbcSourceConfig.PASSWORD));
      } else if (containsConfigProperty(properties, JdbcSourceConfig.PASSWORD_FILE)
          && !StringUtils.isNullOrEmpty(getStringWithAltKeys(properties, JdbcSourceConfig.PASSWORD_FILE))) {
        LOG.info(String.format("Reading JDBC password from password file %s", getStringWithAltKeys(properties, JdbcSourceConfig.PASSWORD_FILE)));
        FileSystem fileSystem = FileSystem.get(session.sparkContext().hadoopConfiguration());
        passwordFileStream = fileSystem.open(new Path(getStringWithAltKeys(properties, JdbcSourceConfig.PASSWORD_FILE)));
        byte[] bytes = new byte[passwordFileStream.available()];
        passwordFileStream.read(bytes);
        dataFrameReader = dataFrameReader.option(Config.PASSWORD_PROP, new String(bytes));
      } else {
        throw new IllegalArgumentException(String.format("JDBCSource needs either a %s or %s to connect to RDBMS "
            + "datasource", JdbcSourceConfig.PASSWORD_FILE.key(), JdbcSourceConfig.PASSWORD.key()));
      }

      addExtraJdbcOptions(properties, dataFrameReader);

      if (getBooleanWithAltKeys(properties, JdbcSourceConfig.IS_INCREMENTAL)) {
        checkRequiredConfigProperties(properties, Collections.singletonList(JdbcSourceConfig.INCREMENTAL_COLUMN));
      }
      return dataFrameReader;
    } catch (Exception e) {
      throw new HoodieException("Failed to validate properties", e);
    } finally {
      IOUtils.closeStream(passwordFileStream);
    }
  }
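
  // Illustrative only: a minimal sketch of the plain Spark read that the reader assembled above
  // performs once loaded, with hypothetical connection values:
  //
  //   Dataset<Row> rows = session.read().format("jdbc")
  //       .option("url", "jdbc:postgresql://host:5432/db")
  //       .option("user", "hudi")
  //       .option("password", "<from PASSWORD or PASSWORD_FILE>")
  //       .option("driver", "org.postgresql.Driver")
  //       .option("dbtable", "my_table")
  //       .load();
  //
  // The option keys mirror the Config constants at the bottom of this class; the values are placeholders.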

  /**
   * Accepts Spark JDBC options from the user via EXTRA_OPTIONS and adds them to the {@link DataFrameReader}.
   *
   * <p>In plain Spark code you would do something like:
   * session.read().format("jdbc").option("fetchSize", 1000).option("timestampFormat", "yyyy-mm-dd hh:mm:ss")
   *
   * <p>The way to pass these properties to Hudi is through the config file. Any property starting with
   * hoodie.streamer.jdbc.extra.options. will be added. For example:
   * hoodie.streamer.jdbc.extra.options.fetchSize=100
   * hoodie.streamer.jdbc.extra.options.upperBound=1
   * hoodie.streamer.jdbc.extra.options.lowerBound=100
   *
   * @param properties      The JDBC connection properties and data source options.
   * @param dataFrameReader The {@link DataFrameReader} to which data source options will be added.
   */
  private static void addExtraJdbcOptions(TypedProperties properties, DataFrameReader dataFrameReader) {
    Set<Object> objects = properties.keySet();
    for (Object property : objects) {
      String prop = property.toString();
      Option<String> keyOption = stripPrefix(prop, JdbcSourceConfig.EXTRA_OPTIONS);
      if (keyOption.isPresent()) {
        String key = keyOption.get();
        String value = properties.getString(prop);
        if (!StringUtils.isNullOrEmpty(value)) {
          LOG.info(String.format("Adding %s -> %s to jdbc options", key, value));
          dataFrameReader.option(key, value);
        }
      }
    }
  }
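
  // Illustrative only: a minimal sketch of what the extra-options handling above amounts to.
  // Given the documented property
  //   hoodie.streamer.jdbc.extra.options.fetchSize=100
  // stripPrefix(...) yields the suffix "fetchSize", and the loop effectively calls
  //   dataFrameReader.option("fetchSize", "100");
  // so everything after the extra-options prefix is forwarded verbatim as a Spark JDBC option.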

  @Override
  protected Pair<Option<Dataset<Row>>, String> fetchNextBatch(Option<String> lastCkptStr, long sourceLimit) throws HoodieException {
    try {
      checkRequiredConfigProperties(props, Arrays.asList(
          JdbcSourceConfig.URL, JdbcSourceConfig.DRIVER_CLASS, JdbcSourceConfig.USER,
          JdbcSourceConfig.RDBMS_TABLE_NAME, JdbcSourceConfig.IS_INCREMENTAL));
      return fetch(lastCkptStr, sourceLimit);
    } catch (HoodieException e) {
      LOG.error("Exception while running JDBCSource ", e);
      throw e;
    } catch (Exception e) {
      LOG.error("Exception while running JDBCSource ", e);
      throw new HoodieException("Error fetching next batch from JDBC source. Last checkpoint: " + lastCkptStr.orElse(null), e);
    }
  }

  /**
   * Decides whether to do a full RDBMS table scan or an incremental scan based on the lastCkptStr. If a previous
   * checkpoint value exists, we do an incremental scan with a PPD query; otherwise we do a full scan. In certain cases
   * where the incremental query fails, we fall back to a full scan.
   *
   * @param lastCkptStr Last checkpoint.
   * @return The pair of {@link Dataset} and current checkpoint.
   */
  private Pair<Option<Dataset<Row>>, String> fetch(Option<String> lastCkptStr, long sourceLimit) {
    Dataset<Row> dataset;
    if (lastCkptStr.isPresent() && !StringUtils.isNullOrEmpty(lastCkptStr.get())) {
      dataset = incrementalFetch(lastCkptStr, sourceLimit);
    } else {
      LOG.info("No checkpoint references found. Doing a full rdbms table fetch");
      dataset = fullFetch(sourceLimit);
    }
    dataset.persist(StorageLevel.fromString(
        getStringWithAltKeys(props, JdbcSourceConfig.STORAGE_LEVEL, "MEMORY_AND_DISK_SER")));
    boolean isIncremental = getBooleanWithAltKeys(props, JdbcSourceConfig.IS_INCREMENTAL);
    Pair<Option<Dataset<Row>>, String> pair = Pair.of(Option.of(dataset), checkpoint(dataset, isIncremental, lastCkptStr));
    dataset.unpersist();
    return pair;
  }

  /**
   * Does an incremental scan with a PPD query prepared on the basis of the previous checkpoint.
   *
   * @param lastCheckpoint Last checkpoint.
   *                       Note that the records fetched will be exclusive of the last checkpoint
   *                       (i.e. incremental column value > lastCheckpoint).
   * @return The {@link Dataset} after incremental fetch from RDBMS.
   */
  private Dataset<Row> incrementalFetch(Option<String> lastCheckpoint, long sourceLimit) {
    try {
      final String ppdQuery = "(%s) rdbms_table";
      final SqlQueryBuilder queryBuilder = SqlQueryBuilder.select("*")
          .from(getStringWithAltKeys(props, JdbcSourceConfig.RDBMS_TABLE_NAME))
          .where(String.format(" %s > '%s'", getStringWithAltKeys(props, JdbcSourceConfig.INCREMENTAL_COLUMN), lastCheckpoint.get()));

      if (sourceLimit > 0) {
        URI jdbcURI = URI.create(getStringWithAltKeys(props, JdbcSourceConfig.URL).substring(URI_JDBC_PREFIX.length()));
        if (DB_LIMIT_CLAUSE.contains(jdbcURI.getScheme())) {
          queryBuilder.orderBy(getStringWithAltKeys(props, JdbcSourceConfig.INCREMENTAL_COLUMN)).limit(sourceLimit);
        }
      }
      String query = String.format(ppdQuery, queryBuilder.toString());
      LOG.info("PPD QUERY: " + query);
      LOG.info(String.format("Referenced last checkpoint and prepared new predicate pushdown query for jdbc pull %s", query));
      return validatePropsAndGetDataFrameReader(sparkSession, props).option(Config.RDBMS_TABLE_PROP, query).load();
    } catch (Exception e) {
      LOG.error("Error while performing an incremental fetch. Not all databases support the PPD query we generate to do an incremental scan", e);
      if (containsConfigProperty(props, JdbcSourceConfig.FALLBACK_TO_FULL_FETCH)
          && getBooleanWithAltKeys(props, JdbcSourceConfig.FALLBACK_TO_FULL_FETCH)) {
        LOG.warn("Falling back to full scan.");
        return fullFetch(sourceLimit);
      }
      throw e;
    }
  }
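
  // Illustrative only: with an incremental column "updated_at", a last checkpoint of
  // '2023-01-01 00:00:00', a table "orders" and a positive sourceLimit of 1000 against a
  // database listed in DB_LIMIT_CLAUSE, the PPD query built above takes roughly this shape:
  //
  //   (select * from orders where  updated_at > '2023-01-01 00:00:00'
  //    order by updated_at limit 1000) rdbms_table
  //
  // The column, table and checkpoint values here are made-up placeholders; the exact SQL text
  // depends on SqlQueryBuilder's formatting.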

  /**
   * Does a full scan on the RDBMS data source.
   *
   * @return The {@link Dataset} after running full scan.
   */
  private Dataset<Row> fullFetch(long sourceLimit) {
    final String ppdQuery = "(%s) rdbms_table";
    final SqlQueryBuilder queryBuilder = SqlQueryBuilder.select("*")
        .from(getStringWithAltKeys(props, JdbcSourceConfig.RDBMS_TABLE_NAME));
    if (sourceLimit > 0) {
      URI jdbcURI = URI.create(getStringWithAltKeys(props, JdbcSourceConfig.URL).substring(URI_JDBC_PREFIX.length()));
      if (DB_LIMIT_CLAUSE.contains(jdbcURI.getScheme())) {
        if (containsConfigProperty(props, JdbcSourceConfig.INCREMENTAL_COLUMN)) {
          queryBuilder.orderBy(getStringWithAltKeys(props, JdbcSourceConfig.INCREMENTAL_COLUMN)).limit(sourceLimit);
        } else {
          queryBuilder.limit(sourceLimit);
        }
      }
    }
    String query = String.format(ppdQuery, queryBuilder.toString());
    return validatePropsAndGetDataFrameReader(sparkSession, props).option(Config.RDBMS_TABLE_PROP, query).load();
  }

  private String checkpoint(Dataset<Row> rowDataset, boolean isIncremental, Option<String> lastCkptStr) {
    try {
      if (isIncremental) {
        Column incrementalColumn = rowDataset.col(getStringWithAltKeys(props, JdbcSourceConfig.INCREMENTAL_COLUMN));
        final String max = rowDataset.agg(functions.max(incrementalColumn).cast(DataTypes.StringType)).first().getString(0);
        LOG.info(String.format("Checkpointing column %s with value: %s ", incrementalColumn, max));
        if (max != null) {
          return max;
        }
        return lastCkptStr.isPresent() && !StringUtils.isNullOrEmpty(lastCkptStr.get()) ? lastCkptStr.get() : StringUtils.EMPTY_STRING;
      } else {
        return StringUtils.EMPTY_STRING;
      }
    } catch (Exception e) {
      LOG.error("Failed to checkpoint");
      throw new HoodieReadFromSourceException("Failed to checkpoint. Last checkpoint: " + lastCkptStr.orElse(null), e);
    }
  }

  /**
   * Inner class with config keys.
   */
  protected static class Config {

    private static final String URL_PROP = "url";

    /**
     * {@value #USER_PROP} used internally to build jdbc params.
     */
    private static final String USER_PROP = "user";

    /**
     * {@value #PASSWORD_PROP} used internally to build jdbc params.
     */
    private static final String PASSWORD_PROP = "password";

    /**
     * {@value #DRIVER_PROP} used internally to build jdbc params.
     */
    private static final String DRIVER_PROP = "driver";

    /**
     * {@value #RDBMS_TABLE_PROP} used internally for jdbc.
     */
    private static final String RDBMS_TABLE_PROP = "dbtable";
  }
}
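
// Illustrative only: for an incremental pull, the checkpoint returned by checkpoint(...) is the
// string-cast maximum of the incremental column over the fetched batch, conceptually equivalent to
//
//   SELECT CAST(MAX(<incremental column>) AS STRING) FROM <fetched batch>
//
// computed via Spark (rowDataset.agg(functions.max(col).cast(DataTypes.StringType))); when that
// maximum is null (e.g. an empty batch), the previous checkpoint is carried forward.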