All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.beam.sdk.io.jdbc.JdbcIO Maven / Gradle / Ivy

There is a newer version: 2.61.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io.jdbc;

import static com.google.common.base.Preconditions.checkArgument;

import com.google.auto.value.AutoValue;
import java.io.Serializable;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import javax.annotation.Nullable;
import javax.sql.DataSource;
import org.apache.beam.sdk.annotations.Experimental;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.Filter;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.Reshuffle;
import org.apache.beam.sdk.transforms.SerializableFunctions;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;
import org.apache.beam.sdk.values.PDone;
import org.apache.commons.dbcp2.BasicDataSource;

/**
 * IO to read and write data on JDBC.
 *
 * 

Reading from JDBC datasource

* *

JdbcIO source returns a bounded collection of {@code T} as a {@code PCollection}. T is the * type returned by the provided {@link RowMapper}. * *

To configure the JDBC source, you have to provide a {@link DataSourceConfiguration} using
* 1. {@link DataSourceConfiguration#create(DataSource)}(which must be {@link Serializable});
* 2. or {@link DataSourceConfiguration#create(String, String)}(driver class name and url). * Optionally, {@link DataSourceConfiguration#withUsername(String)} and * {@link DataSourceConfiguration#withPassword(String)} allows you to define username and password. * *

For example: *

{@code
 * pipeline.apply(JdbcIO.>read()
 *   .withDataSourceConfiguration(JdbcIO.DataSourceConfiguration.create(
 *          "com.mysql.jdbc.Driver", "jdbc:mysql://hostname:3306/mydb")
 *        .withUsername("username")
 *        .withPassword("password"))
 *   .withQuery("select id,name from Person")
 *   .withCoder(KvCoder.of(BigEndianIntegerCoder.of(), StringUtf8Coder.of()))
 *   .withRowMapper(new JdbcIO.RowMapper>() {
 *     public KV mapRow(ResultSet resultSet) throws Exception {
 *       return KV.of(resultSet.getInt(1), resultSet.getString(2));
 *     }
 *   })
 * );
 * }
* *

Query parameters can be configured using a user-provided {@link StatementPreparator}. * For example:

* *
{@code
 * pipeline.apply(JdbcIO.>read()
 *   .withDataSourceConfiguration(JdbcIO.DataSourceConfiguration.create(
 *       "com.mysql.jdbc.Driver", "jdbc:mysql://hostname:3306/mydb",
 *       "username", "password"))
 *   .withQuery("select id,name from Person where name = ?")
 *   .withCoder(KvCoder.of(BigEndianIntegerCoder.of(), StringUtf8Coder.of()))
 *   .withStatementPreparator(new JdbcIO.StatementPreparator() {
 *     public void setParameters(PreparedStatement preparedStatement) throws Exception {
 *       preparedStatement.setString(1, "Darwin");
 *     }
 *   })
 *   .withRowMapper(new JdbcIO.RowMapper>() {
 *     public KV mapRow(ResultSet resultSet) throws Exception {
 *       return KV.of(resultSet.getInt(1), resultSet.getString(2));
 *     }
 *   })
 * );
 * }
* *

Writing to JDBC datasource

* *

JDBC sink supports writing records into a database. It writes a {@link PCollection} to the * database by converting each T into a {@link PreparedStatement} via a user-provided {@link * PreparedStatementSetter}. * *

Like the source, to configure the sink, you have to provide a {@link DataSourceConfiguration}. * *

{@code
 * pipeline
 *   .apply(...)
 *   .apply(JdbcIO.>write()
 *      .withDataSourceConfiguration(JdbcIO.DataSourceConfiguration.create(
 *            "com.mysql.jdbc.Driver", "jdbc:mysql://hostname:3306/mydb")
 *          .withUsername("username")
 *          .withPassword("password"))
 *      .withStatement("insert into Person values(?, ?)")
 *      .withPreparedStatementSetter(new JdbcIO.PreparedStatementSetter>() {
 *        public void setParameters(KV element, PreparedStatement query)
 *          throws SQLException {
 *          query.setInt(1, kv.getKey());
 *          query.setString(2, kv.getValue());
 *        }
 *      })
 *    );
 * }
* *

NB: in case of transient failures, Beam runners may execute parts of JdbcIO.Write multiple * times for fault tolerance. Because of that, you should avoid using {@code INSERT} statements, * since that risks duplicating records in the database, or failing due to primary key conflicts. * Consider using MERGE ("upsert") * statements supported by your database instead. */ @Experimental(Experimental.Kind.SOURCE_SINK) public class JdbcIO { /** * Read data from a JDBC datasource. * * @param Type of the data to be read. */ public static Read read() { return new AutoValue_JdbcIO_Read.Builder().build(); } /** * Like {@link #read}, but executes multiple instances of the query substituting each element * of a {@link PCollection} as query parameters. * * @param Type of the data representing query parameters. * @param Type of the data to be read. */ public static ReadAll readAll() { return new AutoValue_JdbcIO_ReadAll.Builder().build(); } /** * Write data to a JDBC datasource. * * @param Type of the data to be written. */ public static Write write() { return new AutoValue_JdbcIO_Write.Builder().build(); } private JdbcIO() {} /** * An interface used by {@link JdbcIO.Read} for converting each row of the {@link ResultSet} into * an element of the resulting {@link PCollection}. */ @FunctionalInterface public interface RowMapper extends Serializable { T mapRow(ResultSet resultSet) throws Exception; } /** * A POJO describing a {@link DataSource}, either providing directly a {@link DataSource} or all * properties allowing to create a {@link DataSource}. */ @AutoValue public abstract static class DataSourceConfiguration implements Serializable { @Nullable abstract String getDriverClassName(); @Nullable abstract String getUrl(); @Nullable abstract String getUsername(); @Nullable abstract String getPassword(); @Nullable abstract String getConnectionProperties(); @Nullable abstract DataSource getDataSource(); abstract Builder builder(); @AutoValue.Builder abstract static class Builder { abstract Builder setDriverClassName(String driverClassName); abstract Builder setUrl(String url); abstract Builder setUsername(String username); abstract Builder setPassword(String password); abstract Builder setConnectionProperties(String connectionProperties); abstract Builder setDataSource(DataSource dataSource); abstract DataSourceConfiguration build(); } public static DataSourceConfiguration create(DataSource dataSource) { checkArgument(dataSource != null, "dataSource can not be null"); checkArgument(dataSource instanceof Serializable, "dataSource must be Serializable"); return new AutoValue_JdbcIO_DataSourceConfiguration.Builder() .setDataSource(dataSource) .build(); } public static DataSourceConfiguration create(String driverClassName, String url) { checkArgument(driverClassName != null, "driverClassName can not be null"); checkArgument(url != null, "url can not be null"); return new AutoValue_JdbcIO_DataSourceConfiguration.Builder() .setDriverClassName(driverClassName) .setUrl(url) .build(); } public DataSourceConfiguration withUsername(String username) { return builder().setUsername(username).build(); } public DataSourceConfiguration withPassword(String password) { return builder().setPassword(password).build(); } /** * Sets the connection properties passed to driver.connect(...). * Format of the string must be [propertyName=property;]* * *

NOTE - The "user" and "password" properties can be add via {@link #withUsername(String)}, * {@link #withPassword(String)}, so they do not need to be included here. */ public DataSourceConfiguration withConnectionProperties(String connectionProperties) { checkArgument(connectionProperties != null, "connectionProperties can not be null"); return builder().setConnectionProperties(connectionProperties).build(); } private void populateDisplayData(DisplayData.Builder builder) { if (getDataSource() != null) { builder.addIfNotNull(DisplayData.item("dataSource", getDataSource().getClass().getName())); } else { builder.addIfNotNull(DisplayData.item("jdbcDriverClassName", getDriverClassName())); builder.addIfNotNull(DisplayData.item("jdbcUrl", getUrl())); builder.addIfNotNull(DisplayData.item("username", getUsername())); } } DataSource buildDatasource() throws Exception{ if (getDataSource() != null) { return getDataSource(); } else { BasicDataSource basicDataSource = new BasicDataSource(); basicDataSource.setDriverClassName(getDriverClassName()); basicDataSource.setUrl(getUrl()); basicDataSource.setUsername(getUsername()); basicDataSource.setPassword(getPassword()); if (getConnectionProperties() != null) { basicDataSource.setConnectionProperties(getConnectionProperties()); } return basicDataSource; } } } /** * An interface used by the JdbcIO Write to set the parameters of the {@link PreparedStatement} * used to setParameters into the database. */ @FunctionalInterface public interface StatementPreparator extends Serializable { void setParameters(PreparedStatement preparedStatement) throws Exception; } /** Implementation of {@link #read}. */ @AutoValue public abstract static class Read extends PTransform> { @Nullable abstract DataSourceConfiguration getDataSourceConfiguration(); @Nullable abstract ValueProvider getQuery(); @Nullable abstract StatementPreparator getStatementPreparator(); @Nullable abstract RowMapper getRowMapper(); @Nullable abstract Coder getCoder(); abstract Builder toBuilder(); @AutoValue.Builder abstract static class Builder { abstract Builder setDataSourceConfiguration(DataSourceConfiguration config); abstract Builder setQuery(ValueProvider query); abstract Builder setStatementPreparator(StatementPreparator statementPreparator); abstract Builder setRowMapper(RowMapper rowMapper); abstract Builder setCoder(Coder coder); abstract Read build(); } public Read withDataSourceConfiguration(DataSourceConfiguration configuration) { checkArgument(configuration != null, "configuration can not be null"); return toBuilder().setDataSourceConfiguration(configuration).build(); } public Read withQuery(String query) { checkArgument(query != null, "query can not be null"); return withQuery(ValueProvider.StaticValueProvider.of(query)); } public Read withQuery(ValueProvider query) { checkArgument(query != null, "query can not be null"); return toBuilder().setQuery(query).build(); } public Read withStatementPreparator(StatementPreparator statementPreparator) { checkArgument(statementPreparator != null, "statementPreparator can not be null"); return toBuilder().setStatementPreparator(statementPreparator).build(); } public Read withRowMapper(RowMapper rowMapper) { checkArgument(rowMapper != null, "rowMapper can not be null"); return toBuilder().setRowMapper(rowMapper).build(); } public Read withCoder(Coder coder) { checkArgument(coder != null, "coder can not be null"); return toBuilder().setCoder(coder).build(); } @Override public PCollection expand(PBegin input) { checkArgument(getQuery() != null, "withQuery() is required"); checkArgument(getRowMapper() != null, "withRowMapper() is required"); checkArgument(getCoder() != null, "withCoder() is required"); checkArgument( getDataSourceConfiguration() != null, "withDataSourceConfiguration() is required"); return input .apply(Create.of((Void) null)) .apply( JdbcIO.readAll() .withDataSourceConfiguration(getDataSourceConfiguration()) .withQuery(getQuery()) .withCoder(getCoder()) .withRowMapper(getRowMapper()) .withParameterSetter( new PreparedStatementSetter() { @Override public void setParameters(Void element, PreparedStatement preparedStatement) throws Exception { if (getStatementPreparator() != null) { getStatementPreparator().setParameters(preparedStatement); } } })); } @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); builder.add(DisplayData.item("query", getQuery())); builder.add(DisplayData.item("rowMapper", getRowMapper().getClass().getName())); builder.add(DisplayData.item("coder", getCoder().getClass().getName())); getDataSourceConfiguration().populateDisplayData(builder); } } /** Implementation of {@link #readAll}. */ /** Implementation of {@link #read}. */ @AutoValue public abstract static class ReadAll extends PTransform, PCollection> { @Nullable abstract DataSourceConfiguration getDataSourceConfiguration(); @Nullable abstract ValueProvider getQuery(); @Nullable abstract PreparedStatementSetter getParameterSetter(); @Nullable abstract RowMapper getRowMapper(); @Nullable abstract Coder getCoder(); abstract Builder toBuilder(); @AutoValue.Builder abstract static class Builder { abstract Builder setDataSourceConfiguration( DataSourceConfiguration config); abstract Builder setQuery(ValueProvider query); abstract Builder setParameterSetter( PreparedStatementSetter parameterSetter); abstract Builder setRowMapper(RowMapper rowMapper); abstract Builder setCoder(Coder coder); abstract ReadAll build(); } public ReadAll withDataSourceConfiguration( DataSourceConfiguration configuration) { checkArgument(configuration != null, "JdbcIO.readAll().withDataSourceConfiguration" + "(configuration) called with null configuration"); return toBuilder().setDataSourceConfiguration(configuration).build(); } public ReadAll withQuery(String query) { checkArgument(query != null, "JdbcIO.readAll().withQuery(query) called with null query"); return withQuery(ValueProvider.StaticValueProvider.of(query)); } public ReadAll withQuery(ValueProvider query) { checkArgument(query != null, "JdbcIO.readAll().withQuery(query) called with null query"); return toBuilder().setQuery(query).build(); } public ReadAll withParameterSetter( PreparedStatementSetter parameterSetter) { checkArgument(parameterSetter != null, "JdbcIO.readAll().withParameterSetter(parameterSetter) called " + "with null statementPreparator"); return toBuilder().setParameterSetter(parameterSetter).build(); } public ReadAll withRowMapper(RowMapper rowMapper) { checkArgument(rowMapper != null, "JdbcIO.readAll().withRowMapper(rowMapper) called with null rowMapper"); return toBuilder().setRowMapper(rowMapper).build(); } public ReadAll withCoder(Coder coder) { checkArgument(coder != null, "JdbcIO.readAll().withCoder(coder) called with null coder"); return toBuilder().setCoder(coder).build(); } @Override public PCollection expand(PCollection input) { return input .apply( ParDo.of( new ReadFn<>( getDataSourceConfiguration(), getQuery(), getParameterSetter(), getRowMapper()))) .setCoder(getCoder()) .apply(new Reparallelize()); } @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); builder.add(DisplayData.item("query", getQuery())); builder.add(DisplayData.item("rowMapper", getRowMapper().getClass().getName())); builder.add(DisplayData.item("coder", getCoder().getClass().getName())); getDataSourceConfiguration().populateDisplayData(builder); } } /** A {@link DoFn} executing the SQL query to read from the database. */ private static class ReadFn extends DoFn { private final DataSourceConfiguration dataSourceConfiguration; private final ValueProvider query; private final PreparedStatementSetter parameterSetter; private final RowMapper rowMapper; private DataSource dataSource; private Connection connection; private ReadFn( DataSourceConfiguration dataSourceConfiguration, ValueProvider query, PreparedStatementSetter parameterSetter, RowMapper rowMapper) { this.dataSourceConfiguration = dataSourceConfiguration; this.query = query; this.parameterSetter = parameterSetter; this.rowMapper = rowMapper; } @Setup public void setup() throws Exception { dataSource = dataSourceConfiguration.buildDatasource(); connection = dataSource.getConnection(); } @ProcessElement public void processElement(ProcessContext context) throws Exception { try (PreparedStatement statement = connection.prepareStatement(query.get())) { parameterSetter.setParameters(context.element(), statement); try (ResultSet resultSet = statement.executeQuery()) { while (resultSet.next()) { context.output(rowMapper.mapRow(resultSet)); } } } } @Teardown public void teardown() throws Exception { connection.close(); if (dataSource instanceof AutoCloseable) { ((AutoCloseable) dataSource).close(); } } } /** * An interface used by the JdbcIO Write to set the parameters of the {@link PreparedStatement} * used to setParameters into the database. */ @FunctionalInterface public interface PreparedStatementSetter extends Serializable { void setParameters(T element, PreparedStatement preparedStatement) throws Exception; } /** A {@link PTransform} to write to a JDBC datasource. */ @AutoValue public abstract static class Write extends PTransform, PDone> { @Nullable abstract DataSourceConfiguration getDataSourceConfiguration(); @Nullable abstract String getStatement(); @Nullable abstract PreparedStatementSetter getPreparedStatementSetter(); abstract Builder toBuilder(); @AutoValue.Builder abstract static class Builder { abstract Builder setDataSourceConfiguration(DataSourceConfiguration config); abstract Builder setStatement(String statement); abstract Builder setPreparedStatementSetter(PreparedStatementSetter setter); abstract Write build(); } public Write withDataSourceConfiguration(DataSourceConfiguration config) { return toBuilder().setDataSourceConfiguration(config).build(); } public Write withStatement(String statement) { return toBuilder().setStatement(statement).build(); } public Write withPreparedStatementSetter(PreparedStatementSetter setter) { return toBuilder().setPreparedStatementSetter(setter).build(); } @Override public PDone expand(PCollection input) { checkArgument( getDataSourceConfiguration() != null, "withDataSourceConfiguration() is required"); checkArgument(getStatement() != null, "withStatement() is required"); checkArgument( getPreparedStatementSetter() != null, "withPreparedStatementSetter() is required"); input.apply(ParDo.of(new WriteFn(this))); return PDone.in(input.getPipeline()); } private static class WriteFn extends DoFn { private static final int DEFAULT_BATCH_SIZE = 1000; private final Write spec; private DataSource dataSource; private Connection connection; private PreparedStatement preparedStatement; private int batchCount; public WriteFn(Write spec) { this.spec = spec; } @Setup public void setup() throws Exception { dataSource = spec.getDataSourceConfiguration().buildDatasource(); connection = dataSource.getConnection(); connection.setAutoCommit(false); preparedStatement = connection.prepareStatement(spec.getStatement()); } @StartBundle public void startBundle() { batchCount = 0; } @ProcessElement public void processElement(ProcessContext context) throws Exception { T record = context.element(); preparedStatement.clearParameters(); spec.getPreparedStatementSetter().setParameters(record, preparedStatement); preparedStatement.addBatch(); batchCount++; if (batchCount >= DEFAULT_BATCH_SIZE) { executeBatch(); } } @FinishBundle public void finishBundle() throws Exception { executeBatch(); } private void executeBatch() throws SQLException { if (batchCount > 0) { preparedStatement.executeBatch(); connection.commit(); batchCount = 0; } } @Teardown public void teardown() throws Exception { try { if (preparedStatement != null) { preparedStatement.close(); } } finally { if (connection != null) { connection.close(); } if (dataSource instanceof AutoCloseable) { ((AutoCloseable) dataSource).close(); } } } } } private static class Reparallelize extends PTransform, PCollection> { @Override public PCollection expand(PCollection input) { // See https://issues.apache.org/jira/browse/BEAM-2803 // We use a combined approach to "break fusion" here: // (see https://cloud.google.com/dataflow/service/dataflow-service-desc#preventing-fusion) // 1) force the data to be materialized by passing it as a side input to an identity fn, // then 2) reshuffle it with a random key. Initial materialization provides some parallelism // and ensures that data to be shuffled can be generated in parallel, while reshuffling // provides perfect parallelism. // In most cases where a "fusion break" is needed, a simple reshuffle would be sufficient. // The current approach is necessary only to support the particular case of JdbcIO where // a single query may produce many gigabytes of query results. PCollectionView> empty = input .apply("Consume", Filter.by(SerializableFunctions.constant(false))) .apply(View.asIterable()); PCollection materialized = input.apply( "Identity", ParDo.of( new DoFn() { @ProcessElement public void process(ProcessContext c) { c.output(c.element()); } }) .withSideInputs(empty)); return materialized.apply(Reshuffle.viaRandomKey()); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy