/*
* Copyright © 2015-2019 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package io.cdap.plugin.db.batch.source;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Strings;
import io.cdap.cdap.api.annotation.Description;
import io.cdap.cdap.api.annotation.Macro;
import io.cdap.cdap.api.annotation.Metadata;
import io.cdap.cdap.api.annotation.MetadataProperty;
import io.cdap.cdap.api.annotation.Name;
import io.cdap.cdap.api.annotation.Plugin;
import io.cdap.cdap.api.data.batch.Input;
import io.cdap.cdap.api.data.format.StructuredRecord;
import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.cdap.api.dataset.lib.KeyValue;
import io.cdap.cdap.api.plugin.PluginConfig;
import io.cdap.cdap.etl.api.Emitter;
import io.cdap.cdap.etl.api.FailureCollector;
import io.cdap.cdap.etl.api.PipelineConfigurer;
import io.cdap.cdap.etl.api.batch.BatchRuntimeContext;
import io.cdap.cdap.etl.api.batch.BatchSource;
import io.cdap.cdap.etl.api.batch.BatchSourceContext;
import io.cdap.cdap.etl.api.connector.Connector;
import io.cdap.plugin.ConnectionConfig;
import io.cdap.plugin.DBManager;
import io.cdap.plugin.DBRecord;
import io.cdap.plugin.FieldCase;
import io.cdap.plugin.StructuredRecordUtils;
import io.cdap.plugin.common.Asset;
import io.cdap.plugin.common.LineageRecorder;
import io.cdap.plugin.common.ReferenceBatchSource;
import io.cdap.plugin.common.ReferencePluginConfig;
import io.cdap.plugin.common.SourceInputFormatProvider;
import io.cdap.plugin.common.db.DBUtils;
import io.cdap.plugin.common.db.DriverCleanup;
import io.cdap.plugin.db.batch.TransactionIsolationLevel;
import io.cdap.plugin.db.common.DBBaseConfig;
import io.cdap.plugin.db.common.FQNGenerator;
import io.cdap.plugin.db.connector.DBConnector;
import io.cdap.plugin.db.connector.DBConnectorConfig;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.sql.Connection;
import java.sql.Driver;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.List;
import java.util.Properties;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import javax.annotation.Nullable;

/**
* Batch source to read from a DB table.
*/
@Plugin(type = BatchSource.PLUGIN_TYPE)
@Name(DBSource.NAME)
@Description("Reads from a database table(s) using a configurable SQL query." +
" Outputs one record for each row returned by the query.")
@Metadata(properties = {@MetadataProperty(key = Connector.PLUGIN_TYPE, value = DBConnector.NAME)})
public class DBSource extends ReferenceBatchSource<LongWritable, DBRecord, StructuredRecord> {
public static final String NAME = "Database";
private static final Logger LOG = LoggerFactory.getLogger(DBSource.class);
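// Used by removeConditionsClause(...) to strip the '$CONDITIONS' placeholder (together with an
// adjoining AND/OR or WHERE keyword) before the import query is executed once to derive the
// schema. For example:
//   "SELECT * FROM t WHERE $CONDITIONS"            -> "SELECT * FROM t"
//   "SELECT * FROM t WHERE $CONDITIONS AND id > 0" -> "SELECT * FROM t WHERE id > 0"
//   "SELECT * FROM t WHERE id > 0 AND $CONDITIONS" -> "SELECT * FROM t WHERE id > 0"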
private static final Pattern CONDITIONS_AND = Pattern.compile("\\$conditions (and|or)\\s+",
Pattern.CASE_INSENSITIVE);
private static final Pattern AND_CONDITIONS = Pattern.compile("\\s+(and|or) \\$conditions",
Pattern.CASE_INSENSITIVE);
private static final Pattern WHERE_CONDITIONS = Pattern.compile("\\s+where \\$conditions",
Pattern.CASE_INSENSITIVE);
private final DBSourceConfig sourceConfig;
private final DBManager dbManager;
private Class<? extends Driver> driverClass;
public DBSource(DBSourceConfig sourceConfig) {
super(new ReferencePluginConfig(sourceConfig.getReferenceName()));
this.sourceConfig = sourceConfig;
this.dbManager = new DBManager(sourceConfig.getConnection(), sourceConfig.getJdbcPluginType());
}
@Override
public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
super.configurePipeline(pipelineConfigurer);
FailureCollector collector = pipelineConfigurer.getStageConfigurer().getFailureCollector();
sourceConfig.validate(collector);
Schema configuredSchema = sourceConfig.getSchema(collector);
if (configuredSchema != null) {
pipelineConfigurer.getStageConfigurer().setOutputSchema(configuredSchema);
}
if (sourceConfig.containsMacro(DBConnectorConfig.JDBC_PLUGIN_NAME)) {
dbManager.validateCredentials(collector);
return;
}
Class<? extends Driver> driverClass = dbManager.validateJDBCPluginPipeline(pipelineConfigurer, getJDBCPluginId(),
collector);
// Throw an exception before deriving the schema from the database. The schema is derived using the
// import query, and it's possible that validation failed for the import query.
collector.getOrThrowException();
if (configuredSchema == null && !sourceConfig.containsMacro(DBSourceConfig.IMPORT_QUERY)) {
try {
pipelineConfigurer.getStageConfigurer().setOutputSchema(getSchema(driverClass, sourceConfig.patternToReplace,
sourceConfig.replaceWith));
} catch (IllegalAccessException | InstantiationException e) {
collector.addFailure(String.format("Failed to instantiate JDBC driver: %s", e.getMessage()), null);
} catch (SQLException e) {
collector.addFailure(
String.format("Encountered SQL error while getting query schema: %s", e.getMessage()), null);
}
}
}
@Override
public void prepareRun(BatchSourceContext context) {
FailureCollector collector = context.getFailureCollector();
sourceConfig.validate(collector);
collector.getOrThrowException();
LOG.debug("pluginName = {}; connectionString = {}; importQuery = {}; " +
"boundingQuery = {}; transaction isolation level: {}",
sourceConfig.getJdbcPluginName(),
sourceConfig.getConnectionString(), sourceConfig.getImportQuery(), sourceConfig.getBoundingQuery(),
sourceConfig.transactionIsolationLevel);
Configuration hConf = new Configuration();
hConf.clear();
// Load the plugin class to make sure it is available.
Class<? extends Driver> driverClass = context.loadPluginClass(getJDBCPluginId());
if (sourceConfig.getUser() == null && sourceConfig.getPassword() == null) {
DBConfiguration.configureDB(hConf, driverClass.getName(), sourceConfig.getConnectionString());
} else {
DBConfiguration.configureDB(hConf, driverClass.getName(), sourceConfig.getConnectionString(),
sourceConfig.getUser(), sourceConfig.getPassword());
}
DataDrivenETLDBInputFormat.setInput(hConf, DBRecord.class,
sourceConfig.getImportQuery(), sourceConfig.getBoundingQuery(),
sourceConfig.getEnableAutoCommit());
if (sourceConfig.transactionIsolationLevel != null) {
hConf.set(TransactionIsolationLevel.CONF_KEY, sourceConfig.transactionIsolationLevel);
}
if (sourceConfig.getConnectionArguments() != null) {
hConf.set(DBUtils.CONNECTION_ARGUMENTS, sourceConfig.getConnectionArguments());
}
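// With more than one split (or an unspecified split count), the data-driven input format replaces
// '$CONDITIONS' in the import query with per-split range predicates over the split-by field,
// bounded by the min/max values returned by the bounding query.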
if (sourceConfig.numSplits == null || sourceConfig.numSplits != 1) {
if (!sourceConfig.getImportQuery().contains("$CONDITIONS")) {
throw new IllegalArgumentException(String.format("Import Query %s must contain the string '$CONDITIONS'.",
sourceConfig.importQuery));
}
hConf.set(DBConfiguration.INPUT_ORDER_BY_PROPERTY, sourceConfig.splitBy);
}
if (sourceConfig.numSplits != null) {
hConf.setInt(MRJobConfig.NUM_MAPS, sourceConfig.numSplits);
}
if (sourceConfig.schema != null) {
hConf.set(DBUtils.OVERRIDE_SCHEMA, sourceConfig.schema);
}
if (sourceConfig.patternToReplace != null) {
hConf.set(DBUtils.PATTERN_TO_REPLACE, sourceConfig.patternToReplace);
}
if (sourceConfig.replaceWith != null) {
hConf.set(DBUtils.REPLACE_WITH, sourceConfig.replaceWith);
}
if (sourceConfig.fetchSize != null) {
hConf.setInt(DBUtils.FETCH_SIZE, sourceConfig.fetchSize);
}
// Create the external dataset before setting context properties
emitLineage(context);
context.setInput(Input.of(sourceConfig.getReferenceName(),
new SourceInputFormatProvider(DataDrivenETLDBInputFormat.class, hConf)));
}
@Override
public void initialize(BatchRuntimeContext context) throws Exception {
super.initialize(context);
driverClass = context.loadPluginClass(getJDBCPluginId());
}
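/**
 * Converts each row read by the input format into a {@link StructuredRecord}, normalizing
 * field-name casing according to the configured column name case.
 */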
@Override
public void transform(KeyValue<LongWritable, DBRecord> input, Emitter<StructuredRecord> emitter) throws Exception {
emitter.emit(StructuredRecordUtils.convertCase(
input.getValue().getRecord(), FieldCase.toFieldCase(sourceConfig.getColumnNameCase())));
}
@Override
public void destroy() {
try {
DBUtils.cleanup(driverClass);
} finally {
dbManager.destroy();
}
}
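// Identifier under which the JDBC driver plugin is registered and later loaded,
// e.g. "source.jdbc.mysql" for plugin type "jdbc" and plugin name "mysql" (illustrative values).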
private String getJDBCPluginId() {
return String.format("source.%s.%s", sourceConfig.getJdbcPluginType(), sourceConfig.getJdbcPluginName());
}
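/**
 * Derives the output schema by running the import query (with '$CONDITIONS' stripped and a
 * one-row limit) and mapping the JDBC result-set metadata to a CDAP {@link Schema}.
 */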
private Schema getSchema(Class<? extends Driver> driverClass, @Nullable String patternToReplace,
@Nullable String replaceWith)
throws IllegalAccessException, SQLException, InstantiationException {
DriverCleanup driverCleanup = loadPluginClassAndGetDriver(driverClass);
try (Connection connection = getConnection()) {
String query = sourceConfig.importQuery;
Statement statement = connection.createStatement();
statement.setMaxRows(1);
if (query.contains("$CONDITIONS")) {
query = removeConditionsClause(query);
}
ResultSet resultSet = statement.executeQuery(query);
return Schema.recordOf("outputSchema", DBUtils.getSchemaFields(resultSet, patternToReplace, replaceWith,
null));
} finally {
driverCleanup.destroy();
}
}
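/**
 * Strips the '$CONDITIONS' placeholder, along with a directly adjoining AND/OR or WHERE keyword,
 * so the query can be executed as-is when deriving the schema (see the pattern examples above).
 */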
@VisibleForTesting
static String removeConditionsClause(String importQueryString) {
String query = importQueryString;
query = CONDITIONS_AND.matcher(query).replaceAll("");
query = AND_CONDITIONS.matcher(query).replaceAll("");
query = WHERE_CONDITIONS.matcher(query).replaceAll("");
return query;
}
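/**
 * Ensures the JDBC driver is registered with {@link DriverManager}, returning a cleanup handle
 * that deregisters it once schema inspection is done.
 */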
private DriverCleanup loadPluginClassAndGetDriver(Class<? extends Driver> driverClass)
throws IllegalAccessException, InstantiationException, SQLException {
if (driverClass == null) {
throw new InstantiationException(
String.format("Unable to load JDBC driver class with plugin name %s", sourceConfig.getJdbcPluginName()));
}
try {
return DBUtils
.ensureJDBCDriverIsAvailable(driverClass, sourceConfig.getConnectionString(), sourceConfig.getJdbcPluginName(),
sourceConfig.getJdbcPluginType());
} catch (IllegalAccessException | InstantiationException | SQLException e) {
LOG.error("Unable to load or register driver {}", driverClass, e);
throw e;
}
}
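// Opens a short-lived connection for schema inspection using the configured connection string,
// user/password, and any extra connection arguments.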
private Connection getConnection() throws SQLException {
Properties properties =
ConnectionConfig.getConnectionArguments(sourceConfig.getConnectionArguments(),
sourceConfig.getUser(),
sourceConfig.getPassword());
return DriverManager.getConnection(sourceConfig.getConnectionString(), properties);
}
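// Builds a lineage recorder whose asset carries an FQN derived from the JDBC connection string
// and the reference name.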
protected LineageRecorder getLineageRecorder(BatchSourceContext context) {
Asset asset = Asset.builder(sourceConfig.getReferenceName())
.setFqn(FQNGenerator.constructFQN(sourceConfig.getConnectionString(), sourceConfig.getReferenceName())).build();
return new LineageRecorder(context, asset);
}
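// A minimal sketch of how these properties might appear in a pipeline configuration
// (property names match the constants in DBSourceConfig below; values are illustrative):
//   "properties": {
//     "importQuery": "SELECT * FROM users WHERE $CONDITIONS",
//     "boundingQuery": "SELECT MIN(id), MAX(id) FROM users",
//     "splitBy": "id",
//     "numSplits": "4"
//   }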
/**
* {@link PluginConfig} for {@link DBSource}
*/
public static class DBSourceConfig extends DBBaseConfig {
public static final String IMPORT_QUERY = "importQuery";
public static final String BOUNDING_QUERY = "boundingQuery";
public static final String SPLIT_BY = "splitBy";
public static final String NUM_SPLITS = "numSplits";
public static final String SCHEMA = "schema";
public static final String TRANSACTION_ISOLATION_LEVEL = "transactionIsolationLevel";
public static final String PATTERN_TO_REPLACE = "patternToReplace";
public static final String REPLACE_WITH = "replaceWith";
public static final String FETCH_SIZE = "fetchSize";
@Name(IMPORT_QUERY)
@Description("The SELECT query to use to import data from the specified table. " +
"You can specify an arbitrary number of columns to import, or import all columns using *. " +
"The Query should contain the '$CONDITIONS' string unless numSplits is set to one. " +
"For example, 'SELECT * FROM table WHERE $CONDITIONS'. The '$CONDITIONS' string" +
"will be replaced by 'splitBy' field limits specified by the bounding query.")
@Macro
String importQuery;
@Nullable
@Name(BOUNDING_QUERY)
@Description("Bounding Query should return the min and max of the " +
"values of the 'splitBy' field. For example, 'SELECT MIN(id),MAX(id) FROM table'. " +
"This is required unless numSplits is set to one.")
@Macro
String boundingQuery;
@Nullable
@Name(SPLIT_BY)
@Description("Field Name which will be used to generate splits. This is required unless numSplits is set to one.")
@Macro
String splitBy;
@Nullable
@Name(NUM_SPLITS)
@Description("The number of splits to generate. If set to one, the boundingQuery is not needed, " +
"and no $CONDITIONS string needs to be specified in the importQuery. If not specified, the " +
"execution framework will pick a value.")
@Macro
Integer numSplits;
@Nullable
@Name(TRANSACTION_ISOLATION_LEVEL)
@Description("The transaction isolation level for queries run by this sink. " +
"Defaults to TRANSACTION_SERIALIZABLE. See java.sql.Connection#setTransactionIsolation for more details. " +
"The Phoenix jdbc driver will throw an exception if the Phoenix database does not have transactions enabled " +
"and this setting is set to true. For drivers like that, this should be set to TRANSACTION_NONE.")
@Macro
public String transactionIsolationLevel;
@Nullable
@Name(SCHEMA)
@Description("The schema of records output by the source. This will be used in place of whatever schema comes " +
"back from the query. This should only be used if there is a bug in your jdbc driver. For example, if a column " +
"is not correctly getting marked as nullable.")
String schema;
@Nullable
@Name(PATTERN_TO_REPLACE)
@Description("The pattern to replace in the field name in the table, it is typically used with the " +
"Replace With config. If Replace With is not set, the pattern will be removed in the field name.")
String patternToReplace;
@Nullable
@Name(REPLACE_WITH)
@Description("The string that will be replaced in the field name in the table, it must be used with the " +
"Pattern To Replace config.")
String replaceWith;
@Nullable
@Name(FETCH_SIZE)
@Macro
@Description("The number of rows to fetch at a time per split. Larger fetch size can result in faster import, " +
"with the tradeoff of higher memory usage.")
Integer fetchSize;
@Nullable
private String getImportQuery() {
return cleanQuery(importQuery);
}
private String getBoundingQuery() {
return cleanQuery(boundingQuery);
}
@SuppressWarnings("checkstyle:WhitespaceAround")
private void validate(FailureCollector collector) {
boolean hasOneSplit = false;
if (!containsMacro(NUM_SPLITS) && numSplits != null) {
if (numSplits < 1) {
collector.addFailure("Number of Splits must be a positive number.", null).withConfigProperty(NUM_SPLITS);
}
if (numSplits == 1) {
hasOneSplit = true;
}
}
if (!containsMacro(TRANSACTION_ISOLATION_LEVEL) && transactionIsolationLevel != null) {
TransactionIsolationLevel.validate(transactionIsolationLevel, collector);
}
if (!containsMacro(IMPORT_QUERY) && Strings.isNullOrEmpty(importQuery)) {
collector.addFailure("Import Query must be specified.", null).withConfigProperty(IMPORT_QUERY);
}
if (!hasOneSplit && !containsMacro(IMPORT_QUERY) && !Strings.isNullOrEmpty(importQuery) &&
!getImportQuery().contains("$CONDITIONS")) {
collector.addFailure("Invalid Import Query.", String.format("Import Query %s must contain the " +
"string '$CONDITIONS'.", importQuery))
.withConfigProperty(IMPORT_QUERY);
}
if (!hasOneSplit && !containsMacro(SPLIT_BY) && Strings.isNullOrEmpty(splitBy)) {
collector.addFailure("Split-By Field Name must be specified if Number of Splits is not set to 1.",
null).withConfigProperty(SPLIT_BY).withConfigProperty(NUM_SPLITS);
}
if (!hasOneSplit && !containsMacro(BOUNDING_QUERY) && Strings.isNullOrEmpty(boundingQuery)) {
collector.addFailure("Bounding Query must be specified if Number of Splits is not set to 1.", null)
.withConfigProperty(BOUNDING_QUERY).withConfigProperty(NUM_SPLITS);
}
if (replaceWith != null && patternToReplace == null) {
collector.addFailure("Replace With is set but Pattern To Replace is not provided", null)
.withConfigProperty(REPLACE_WITH).withConfigProperty(PATTERN_TO_REPLACE);
}
if (!containsMacro(FETCH_SIZE) && fetchSize != null && fetchSize <= 0) {
collector.addFailure("Invalid fetch size.", "Fetch size must be a positive integer.")
.withConfigProperty(FETCH_SIZE);
}
}
@Nullable
private Schema getSchema(FailureCollector collector) {
try {
return Strings.isNullOrEmpty(schema) ? null : Schema.parseJson(schema);
} catch (IOException e) {
collector.addFailure(String.format("Invalid Schema : %s", e.getMessage()), null);
}
throw collector.getOrThrowException();
}
}
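// Registers the external dataset for this reference and records a field-level read operation
// covering every field in the resolved schema.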
private void emitLineage(BatchSourceContext context) {
Schema schema = sourceConfig.getSchema(context.getFailureCollector());
if (schema == null) {
schema = context.getOutputSchema();
}
LineageRecorder lineageRecorder = getLineageRecorder(context);
lineageRecorder.createExternalDataset(schema);
if (schema != null && schema.getFields() != null) {
lineageRecorder.recordRead("Read", "Read from DB.",
schema.getFields().stream().map(Schema.Field::getName).collect(Collectors.toList()));
}
}
}