/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.hudi.utilities.sources;

import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.utilities.SqlQueryBuilder;
import org.apache.hudi.utilities.config.JdbcSourceConfig;
import org.apache.hudi.utilities.exception.HoodieReadFromSourceException;
import org.apache.hudi.utilities.schema.SchemaProvider;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.DataFrameReader;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.storage.StorageLevel;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.InputStream;
import java.net.URI;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import static org.apache.hudi.common.util.ConfigUtils.checkRequiredConfigProperties;
import static org.apache.hudi.common.util.ConfigUtils.containsConfigProperty;
import static org.apache.hudi.common.util.ConfigUtils.getBooleanWithAltKeys;
import static org.apache.hudi.common.util.ConfigUtils.getStringWithAltKeys;
import static org.apache.hudi.common.util.ConfigUtils.stripPrefix;

/**
* Reads data from RDBMS data sources.
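 * <p>
 * A minimal, illustrative configuration; the keys follow the {@code hoodie.streamer.jdbc.*} convention of
 * {@link JdbcSourceConfig}, while the host, table and column names are hypothetical (verify the key names
 * against the {@link JdbcSourceConfig} shipped with your Hudi version):
 *
 * <pre>
 *   hoodie.streamer.jdbc.url=jdbc:postgresql://localhost:5432/testdb
 *   hoodie.streamer.jdbc.driver.class=org.postgresql.Driver
 *   hoodie.streamer.jdbc.user=hudi
 *   hoodie.streamer.jdbc.password.file=hdfs:///secrets/jdbc.password
 *   hoodie.streamer.jdbc.table.name=public.events
 *   hoodie.streamer.jdbc.incremental.pull=true
 *   hoodie.streamer.jdbc.table.incremental.column.name=updated_at
 * </pre>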
*/
public class JdbcSource extends RowSource {
private static final Logger LOG = LoggerFactory.getLogger(JdbcSource.class);
  private static final List<String> DB_LIMIT_CLAUSE = Arrays.asList("mysql", "postgresql", "h2");
private static final String URI_JDBC_PREFIX = "jdbc:";
public JdbcSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession,
SchemaProvider schemaProvider) {
super(props, sparkContext, sparkSession, schemaProvider);
  }

/**
* Validates all user properties and prepares the {@link DataFrameReader} to read from RDBMS.
*
* @param session The {@link SparkSession}.
* @param properties The JDBC connection properties and data source options.
* @return The {@link DataFrameReader} to read from RDBMS
 * @throws HoodieException if the source properties fail validation or the JDBC password cannot be resolved.
*/
private static DataFrameReader validatePropsAndGetDataFrameReader(final SparkSession session,
final TypedProperties properties)
throws HoodieException {
DataFrameReader dataFrameReader;
InputStream passwordFileStream = null;
try {
dataFrameReader = session.read().format("jdbc");
dataFrameReader = dataFrameReader.option(
Config.URL_PROP, getStringWithAltKeys(properties, JdbcSourceConfig.URL));
dataFrameReader = dataFrameReader.option(
Config.USER_PROP, getStringWithAltKeys(properties, JdbcSourceConfig.USER));
dataFrameReader = dataFrameReader.option(
Config.DRIVER_PROP, getStringWithAltKeys(properties, JdbcSourceConfig.DRIVER_CLASS));
dataFrameReader = dataFrameReader.option(
Config.RDBMS_TABLE_PROP, getStringWithAltKeys(properties, JdbcSourceConfig.RDBMS_TABLE_NAME));
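      // Resolve the JDBC password: an inline password property takes precedence; otherwise fall
      // back to a password file on the DFS. One of the two must be supplied.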
if (containsConfigProperty(properties, JdbcSourceConfig.PASSWORD)) {
LOG.info("Reading JDBC password from properties file....");
dataFrameReader = dataFrameReader.option(Config.PASSWORD_PROP,
getStringWithAltKeys(properties, JdbcSourceConfig.PASSWORD));
} else if (containsConfigProperty(properties, JdbcSourceConfig.PASSWORD_FILE)
&& !StringUtils.isNullOrEmpty(getStringWithAltKeys(properties, JdbcSourceConfig.PASSWORD_FILE))) {
        LOG.info("Reading JDBC password from password file {}", getStringWithAltKeys(properties, JdbcSourceConfig.PASSWORD_FILE));
        FileSystem fileSystem = FileSystem.get(session.sparkContext().hadoopConfiguration());
        Path passwordFilePath = new Path(getStringWithAltKeys(properties, JdbcSourceConfig.PASSWORD_FILE));
        passwordFileStream = fileSystem.open(passwordFilePath);
        // Size the buffer from the file status rather than InputStream#available(), and read fully:
        // a single read() call is not guaranteed to fill the buffer.
        byte[] bytes = new byte[(int) fileSystem.getFileStatus(passwordFilePath).getLen()];
        IOUtils.readFully(passwordFileStream, bytes, 0, bytes.length);
        dataFrameReader = dataFrameReader.option(Config.PASSWORD_PROP, new String(bytes));
} else {
throw new IllegalArgumentException(String.format("JDBCSource needs either a %s or %s to connect to RDBMS "
+ "datasource", JdbcSourceConfig.PASSWORD_FILE.key(), JdbcSourceConfig.PASSWORD.key()));
}
addExtraJdbcOptions(properties, dataFrameReader);
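      // Incremental pulls build range queries against a tracking column, so that column must be
      // configured whenever incremental mode is enabled.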
if (getBooleanWithAltKeys(properties, JdbcSourceConfig.IS_INCREMENTAL)) {
checkRequiredConfigProperties(properties, Collections.singletonList(JdbcSourceConfig.INCREMENTAL_COLUMN));
}
return dataFrameReader;
} catch (Exception e) {
throw new HoodieException("Failed to validate properties", e);
} finally {
IOUtils.closeStream(passwordFileStream);
}
}
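
  // Illustrative sketch (not part of the upstream source): with hypothetical property values,
  // the reader assembled above is equivalent to hand-writing the following Spark code:
  //
  //   Dataset<Row> rows = session.read().format("jdbc")
  //       .option("url", "jdbc:postgresql://localhost:5432/testdb")
  //       .option("driver", "org.postgresql.Driver")
  //       .option("user", "hudi")
  //       .option("password", "<resolved inline or from the password file>")
  //       .option("dbtable", "public.events")
  //       .load();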
  /**
   * Accepts Spark JDBC options supplied by the user under EXTRA_OPTIONS and adds them to the
   * {@link DataFrameReader}. In plain Spark code you would write something like:
   *
   * <pre>
   *   session.read().format("jdbc")
   *       .option("fetchSize", 1000)
   *       .option("timestampFormat", "yyyy-mm-dd hh:mm:ss")
   * </pre>
   *
   * The way to pass these options to Hudi is through the config file: any property whose key starts with
   * {@code hoodie.streamer.jdbc.extra.options.} has that prefix stripped and is added as a reader option.
   *
   * Example:
   * <pre>
   *   hoodie.streamer.jdbc.extra.options.fetchSize=100
   *   hoodie.streamer.jdbc.extra.options.lowerBound=1
   *   hoodie.streamer.jdbc.extra.options.upperBound=100
   * </pre>
   *
   * @param properties      The JDBC connection properties and data source options.
   * @param dataFrameReader The {@link DataFrameReader} to which data source options will be added.
   */
private static void addExtraJdbcOptions(TypedProperties properties, DataFrameReader dataFrameReader) {
    // Any property whose key carries the EXTRA_OPTIONS prefix (or one of its alternate prefixes)
    // is stripped of that prefix and handed to the DataFrameReader verbatim.
    Set<Object> objects = properties.keySet();
    for (Object property : objects) {
      String prop = property.toString();
      Option<String> keyWithPrefix = stripPrefix(prop, JdbcSourceConfig.EXTRA_OPTIONS);
      if (keyWithPrefix.isPresent()) {
        dataFrameReader.option(keyWithPrefix.get(), properties.getString(prop));
      }
    }
  }
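
  // Illustrative sketch (not part of the upstream source): given a hypothetical property
  //   hoodie.streamer.jdbc.extra.options.fetchsize=1000
  // stripPrefix() yields the key "fetchsize", so the loop above effectively calls
  //   dataFrameReader.option("fetchsize", "1000");
  // which Spark's JDBC data source interprets as the fetch-size hint for the driver.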