org.apache.beam.sdk.io.jdbc.JdbcIO Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.sdk.io.jdbc;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import com.google.auto.value.AutoValue;
import java.io.Serializable;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.util.Random;
import javax.annotation.Nullable;
import javax.sql.DataSource;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.Flatten;
import org.apache.beam.sdk.transforms.GroupByKey;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.Values;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PDone;
import org.apache.commons.dbcp2.BasicDataSource;
/**
* IO to read and write data on JDBC.
*
* Reading from JDBC datasource
*
* JdbcIO source returns a bounded collection of {@code T} as a {@code PCollection}. T is the
* type returned by the provided {@link RowMapper}.
*
* To configure the JDBC source, you have to provide a {@link DataSourceConfiguration} using
* {@link DataSourceConfiguration#create(DataSource)} or
* {@link DataSourceConfiguration#create(String, String)} with either a
* {@link DataSource} (which must be {@link Serializable}) or the parameters needed to create it
* (driver class name and url). Optionally, {@link DataSourceConfiguration#withUsername(String)} and
* {@link DataSourceConfiguration#withPassword(String)} allows you to define DataSource username
* and password.
* For example:
*
*
{@code
* pipeline.apply(JdbcIO.>read()
* .withDataSourceConfiguration(JdbcIO.DataSourceConfiguration.create(
* "com.mysql.jdbc.Driver", "jdbc:mysql://hostname:3306/mydb")
* .withUsername("username")
* .withPassword("password"))
* .withQuery("select id,name from Person")
* .withRowMapper(new JdbcIO.RowMapper>() {
* public KV mapRow(ResultSet resultSet) throws Exception {
* return KV.of(resultSet.getInt(1), resultSet.getString(2));
* }
* })
* }
*
* Query parameters can be configured using a user-provided {@link StatementPreparator}.
* For example:
*
* {@code
* pipeline.apply(JdbcIO.>read()
* .withDataSourceConfiguration(JdbcIO.DataSourceConfiguration.create(
* "com.mysql.jdbc.Driver", "jdbc:mysql://hostname:3306/mydb",
* "username", "password"))
* .withQuery("select id,name from Person where name = ?")
* .withStatementPreparator(new JdbcIO.StatementPreparator() {
* public void setParameters(PreparedStatement preparedStatement) throws Exception {
* preparedStatement.setString(1, "Darwin");
* }
* })
* .withRowMapper(new JdbcIO.RowMapper>() {
* public KV mapRow(ResultSet resultSet) throws Exception {
* return KV.of(resultSet.getInt(1), resultSet.getString(2));
* }
* })
* }
*
* Writing to JDBC datasource
*
* JDBC sink supports writing records into a database. It writes a {@link PCollection} to the
* database by converting each T into a {@link PreparedStatement} via a user-provided {@link
* PreparedStatementSetter}.
*
*
Like the source, to configure the sink, you have to provide a {@link DataSourceConfiguration}.
*
*
{@code
* pipeline
* .apply(...)
* .apply(JdbcIO.>write()
* .withDataSourceConfiguration(JdbcIO.DataSourceConfiguration.create(
* "com.mysql.jdbc.Driver", "jdbc:mysql://hostname:3306/mydb")
* .withUsername("username")
* .withPassword("password"))
* .withStatement("insert into Person values(?, ?)")
* .withPreparedStatementSetter(new JdbcIO.PreparedStatementSetter>() {
* public void setParameters(KV element, PreparedStatement query) {
* query.setInt(1, kv.getKey());
* query.setString(2, kv.getValue());
* }
* })
* }
*
* NB: in case of transient failures, Beam runners may execute parts of JdbcIO.Write multiple
* times for fault tolerance. Because of that, you should avoid using {@code INSERT} statements,
* since that risks duplicating records in the database, or failing due to primary key conflicts.
* Consider using MERGE ("upsert")
* statements supported by your database instead.
*/
public class JdbcIO {
/**
* Read data from a JDBC datasource.
*
* @param Type of the data to be read.
*/
public static Read read() {
return new AutoValue_JdbcIO_Read.Builder().build();
}
/**
* Write data to a JDBC datasource.
*
* @param Type of the data to be written.
*/
public static Write write() {
return new AutoValue_JdbcIO_Write.Builder().build();
}
private JdbcIO() {}
/**
* An interface used by {@link JdbcIO.Read} for converting each row of the {@link ResultSet} into
* an element of the resulting {@link PCollection}.
*/
public interface RowMapper extends Serializable {
T mapRow(ResultSet resultSet) throws Exception;
}
/**
* A POJO describing a {@link DataSource}, either providing directly a {@link DataSource} or all
* properties allowing to create a {@link DataSource}.
*/
@AutoValue
public abstract static class DataSourceConfiguration implements Serializable {
@Nullable abstract String getDriverClassName();
@Nullable abstract String getUrl();
@Nullable abstract String getUsername();
@Nullable abstract String getPassword();
@Nullable abstract DataSource getDataSource();
abstract Builder builder();
@AutoValue.Builder
abstract static class Builder {
abstract Builder setDriverClassName(String driverClassName);
abstract Builder setUrl(String url);
abstract Builder setUsername(String username);
abstract Builder setPassword(String password);
abstract Builder setDataSource(DataSource dataSource);
abstract DataSourceConfiguration build();
}
public static DataSourceConfiguration create(DataSource dataSource) {
checkArgument(dataSource != null, "DataSourceConfiguration.create(dataSource) called with "
+ "null data source");
checkArgument(dataSource instanceof Serializable,
"DataSourceConfiguration.create(dataSource) called with a dataSource not Serializable");
return new AutoValue_JdbcIO_DataSourceConfiguration.Builder()
.setDataSource(dataSource)
.build();
}
public static DataSourceConfiguration create(String driverClassName, String url) {
checkArgument(driverClassName != null,
"DataSourceConfiguration.create(driverClassName, url) called with null driverClassName");
checkArgument(url != null,
"DataSourceConfiguration.create(driverClassName, url) called with null url");
return new AutoValue_JdbcIO_DataSourceConfiguration.Builder()
.setDriverClassName(driverClassName)
.setUrl(url)
.build();
}
public DataSourceConfiguration withUsername(String username) {
return builder().setUsername(username).build();
}
public DataSourceConfiguration withPassword(String password) {
return builder().setPassword(password).build();
}
private void populateDisplayData(DisplayData.Builder builder) {
if (getDataSource() != null) {
builder.addIfNotNull(DisplayData.item("dataSource", getDataSource().getClass().getName()));
} else {
builder.addIfNotNull(DisplayData.item("jdbcDriverClassName", getDriverClassName()));
builder.addIfNotNull(DisplayData.item("jdbcUrl", getUrl()));
builder.addIfNotNull(DisplayData.item("username", getUsername()));
}
}
Connection getConnection() throws Exception {
if (getDataSource() != null) {
return (getUsername() != null)
? getDataSource().getConnection(getUsername(), getPassword())
: getDataSource().getConnection();
} else {
BasicDataSource basicDataSource = new BasicDataSource();
basicDataSource.setDriverClassName(getDriverClassName());
basicDataSource.setUrl(getUrl());
basicDataSource.setUsername(getUsername());
basicDataSource.setPassword(getPassword());
return basicDataSource.getConnection();
}
}
}
/**
* An interface used by the JdbcIO Write to set the parameters of the {@link PreparedStatement}
* used to setParameters into the database.
*/
public interface StatementPreparator extends Serializable {
void setParameters(PreparedStatement preparedStatement) throws Exception;
}
/** A {@link PTransform} to read data from a JDBC datasource. */
@AutoValue
public abstract static class Read extends PTransform> {
@Nullable abstract DataSourceConfiguration getDataSourceConfiguration();
@Nullable abstract String getQuery();
@Nullable abstract StatementPreparator getStatementPreparator();
@Nullable abstract RowMapper getRowMapper();
@Nullable abstract Coder getCoder();
abstract Builder toBuilder();
@AutoValue.Builder
abstract static class Builder {
abstract Builder setDataSourceConfiguration(DataSourceConfiguration config);
abstract Builder setQuery(String query);
abstract Builder setStatementPreparator(StatementPreparator statementPreparator);
abstract Builder setRowMapper(RowMapper rowMapper);
abstract Builder setCoder(Coder coder);
abstract Read build();
}
public Read withDataSourceConfiguration(DataSourceConfiguration configuration) {
checkArgument(configuration != null, "JdbcIO.read().withDataSourceConfiguration"
+ "(configuration) called with null configuration");
return toBuilder().setDataSourceConfiguration(configuration).build();
}
public Read withQuery(String query) {
checkArgument(query != null, "JdbcIO.read().withQuery(query) called with null query");
return toBuilder().setQuery(query).build();
}
public Read withStatementPrepator(StatementPreparator statementPreparator) {
checkArgument(statementPreparator != null,
"JdbcIO.read().withStatementPreparator(statementPreparator) called "
+ "with null statementPreparator");
return toBuilder().setStatementPreparator(statementPreparator).build();
}
public Read withRowMapper(RowMapper rowMapper) {
checkArgument(rowMapper != null,
"JdbcIO.read().withRowMapper(rowMapper) called with null rowMapper");
return toBuilder().setRowMapper(rowMapper).build();
}
public Read withCoder(Coder coder) {
checkArgument(coder != null, "JdbcIO.read().withCoder(coder) called with null coder");
return toBuilder().setCoder(coder).build();
}
@Override
public PCollection expand(PBegin input) {
return input
.apply(Create.of(getQuery()))
.apply(ParDo.of(new ReadFn<>(this))).setCoder(getCoder())
// generate a random key followed by a GroupByKey and then ungroup
// to prevent fusion
// see https://cloud.google.com/dataflow/service/dataflow-service-desc#preventing-fusion
// for details
.apply(ParDo.of(new DoFn>() {
private Random random;
@Setup
public void setup() {
random = new Random();
}
@ProcessElement
public void processElement(ProcessContext context) {
context.output(KV.of(random.nextInt(), context.element()));
}
}))
.apply(GroupByKey.create())
.apply(Values.>create())
.apply(Flatten.iterables());
}
@Override
public void validate(PBegin input) {
checkState(getQuery() != null,
"JdbcIO.read() requires a query to be set via withQuery(query)");
checkState(getRowMapper() != null,
"JdbcIO.read() requires a rowMapper to be set via withRowMapper(rowMapper)");
checkState(getCoder() != null,
"JdbcIO.read() requires a coder to be set via withCoder(coder)");
checkState(getDataSourceConfiguration() != null,
"JdbcIO.read() requires a DataSource configuration to be set via "
+ "withDataSourceConfiguration(dataSourceConfiguration)");
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
super.populateDisplayData(builder);
builder.add(DisplayData.item("query", getQuery()));
builder.add(DisplayData.item("rowMapper", getRowMapper().getClass().getName()));
builder.add(DisplayData.item("coder", getCoder().getClass().getName()));
getDataSourceConfiguration().populateDisplayData(builder);
}
/** A {@link DoFn} executing the SQL query to read from the database. */
static class ReadFn extends DoFn {
private JdbcIO.Read spec;
private Connection connection;
private ReadFn(Read spec) {
this.spec = spec;
}
@Setup
public void setup() throws Exception {
connection = spec.getDataSourceConfiguration().getConnection();
}
@ProcessElement
public void processElement(ProcessContext context) throws Exception {
String query = context.element();
try (PreparedStatement statement = connection.prepareStatement(query)) {
if (this.spec.getStatementPreparator() != null) {
this.spec.getStatementPreparator().setParameters(statement);
}
try (ResultSet resultSet = statement.executeQuery()) {
while (resultSet.next()) {
context.output(spec.getRowMapper().mapRow(resultSet));
}
}
}
}
@Teardown
public void teardown() throws Exception {
if (connection != null) {
connection.close();
}
}
}
}
/**
* An interface used by the JdbcIO Write to set the parameters of the {@link PreparedStatement}
* used to setParameters into the database.
*/
public interface PreparedStatementSetter extends Serializable {
void setParameters(T element, PreparedStatement preparedStatement) throws Exception;
}
/** A {@link PTransform} to write to a JDBC datasource. */
@AutoValue
public abstract static class Write extends PTransform, PDone> {
@Nullable abstract DataSourceConfiguration getDataSourceConfiguration();
@Nullable abstract String getStatement();
@Nullable abstract PreparedStatementSetter getPreparedStatementSetter();
abstract Builder toBuilder();
@AutoValue.Builder
abstract static class Builder {
abstract Builder setDataSourceConfiguration(DataSourceConfiguration config);
abstract Builder setStatement(String statement);
abstract Builder setPreparedStatementSetter(PreparedStatementSetter setter);
abstract Write build();
}
public Write withDataSourceConfiguration(DataSourceConfiguration config) {
return toBuilder().setDataSourceConfiguration(config).build();
}
public Write withStatement(String statement) {
return toBuilder().setStatement(statement).build();
}
public Write withPreparedStatementSetter(PreparedStatementSetter setter) {
return toBuilder().setPreparedStatementSetter(setter).build();
}
@Override
public PDone expand(PCollection input) {
input.apply(ParDo.of(new WriteFn(this)));
return PDone.in(input.getPipeline());
}
@Override
public void validate(PCollection input) {
checkArgument(getDataSourceConfiguration() != null,
"JdbcIO.write() requires a configuration to be set via "
+ ".withDataSourceConfiguration(configuration)");
checkArgument(getStatement() != null,
"JdbcIO.write() requires a statement to be set via .withStatement(statement)");
checkArgument(getPreparedStatementSetter() != null,
"JdbcIO.write() requires a preparedStatementSetter to be set via "
+ ".withPreparedStatementSetter(preparedStatementSetter)");
}
private static class WriteFn extends DoFn {
private static final int DEFAULT_BATCH_SIZE = 1000;
private final Write spec;
private Connection connection;
private PreparedStatement preparedStatement;
private int batchCount;
public WriteFn(Write spec) {
this.spec = spec;
}
@Setup
public void setup() throws Exception {
connection = spec.getDataSourceConfiguration().getConnection();
connection.setAutoCommit(false);
preparedStatement = connection.prepareStatement(spec.getStatement());
}
@StartBundle
public void startBundle(Context context) {
batchCount = 0;
}
@ProcessElement
public void processElement(ProcessContext context) throws Exception {
T record = context.element();
preparedStatement.clearParameters();
spec.getPreparedStatementSetter().setParameters(record, preparedStatement);
preparedStatement.addBatch();
batchCount++;
if (batchCount >= DEFAULT_BATCH_SIZE) {
finishBundle(context);
}
}
@FinishBundle
public void finishBundle(Context context) throws Exception {
if (batchCount > 0) {
preparedStatement.executeBatch();
connection.commit();
batchCount = 0;
}
}
@Teardown
public void teardown() throws Exception {
try {
if (preparedStatement != null) {
preparedStatement.close();
}
} finally {
if (connection != null) {
connection.close();
}
}
}
}
}
}