All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.beam.sdk.io.jdbc.JdbcIO Maven / Gradle / Ivy

There is a newer version: 2.61.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io.jdbc;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;

import com.google.auto.value.AutoValue;

import java.io.Serializable;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.util.Random;

import javax.annotation.Nullable;
import javax.sql.DataSource;

import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.Flatten;
import org.apache.beam.sdk.transforms.GroupByKey;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.Values;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PDone;
import org.apache.commons.dbcp2.BasicDataSource;

/**
 * IO to read and write data on JDBC.
 *
 * 

Reading from JDBC datasource

* *

JdbcIO source returns a bounded collection of {@code T} as a {@code PCollection}. T is the * type returned by the provided {@link RowMapper}. * *

To configure the JDBC source, you have to provide a {@link DataSourceConfiguration} using * {@link DataSourceConfiguration#create(DataSource)} or * {@link DataSourceConfiguration#create(String, String)} with either a * {@link DataSource} (which must be {@link Serializable}) or the parameters needed to create it * (driver class name and url). Optionally, {@link DataSourceConfiguration#withUsername(String)} and * {@link DataSourceConfiguration#withPassword(String)} allows you to define DataSource username * and password. * For example: * *

{@code
 * pipeline.apply(JdbcIO.>read()
 *   .withDataSourceConfiguration(JdbcIO.DataSourceConfiguration.create(
 *          "com.mysql.jdbc.Driver", "jdbc:mysql://hostname:3306/mydb")
 *        .withUsername("username")
 *        .withPassword("password"))
 *   .withQuery("select id,name from Person")
 *   .withRowMapper(new JdbcIO.RowMapper>() {
 *     public KV mapRow(ResultSet resultSet) throws Exception {
 *       return KV.of(resultSet.getInt(1), resultSet.getString(2));
 *     }
 *   })
 * }
* *

Query parameters can be configured using a user-provided {@link StatementPreparator}. * For example:

* *
{@code
 * pipeline.apply(JdbcIO.>read()
 *   .withDataSourceConfiguration(JdbcIO.DataSourceConfiguration.create(
 *       "com.mysql.jdbc.Driver", "jdbc:mysql://hostname:3306/mydb",
 *       "username", "password"))
 *   .withQuery("select id,name from Person where name = ?")
 *   .withStatementPreparator(new JdbcIO.StatementPreparator() {
 *     public void setParameters(PreparedStatement preparedStatement) throws Exception {
 *       preparedStatement.setString(1, "Darwin");
 *     }
 *   })
 *   .withRowMapper(new JdbcIO.RowMapper>() {
 *     public KV mapRow(ResultSet resultSet) throws Exception {
 *       return KV.of(resultSet.getInt(1), resultSet.getString(2));
 *     }
 *   })
 * }
* *

Writing to JDBC datasource

* *

JDBC sink supports writing records into a database. It writes a {@link PCollection} to the * database by converting each T into a {@link PreparedStatement} via a user-provided {@link * PreparedStatementSetter}. * *

Like the source, to configure the sink, you have to provide a {@link DataSourceConfiguration}. * *

{@code
 * pipeline
 *   .apply(...)
 *   .apply(JdbcIO.>write()
 *      .withDataSourceConfiguration(JdbcIO.DataSourceConfiguration.create(
 *            "com.mysql.jdbc.Driver", "jdbc:mysql://hostname:3306/mydb")
 *          .withUsername("username")
 *          .withPassword("password"))
 *      .withStatement("insert into Person values(?, ?)")
 *      .withPreparedStatementSetter(new JdbcIO.PreparedStatementSetter>() {
 *        public void setParameters(KV element, PreparedStatement query) {
 *          query.setInt(1, kv.getKey());
 *          query.setString(2, kv.getValue());
 *        }
 *      })
 * }
* *

NB: in case of transient failures, Beam runners may execute parts of JdbcIO.Write multiple * times for fault tolerance. Because of that, you should avoid using {@code INSERT} statements, * since that risks duplicating records in the database, or failing due to primary key conflicts. * Consider using MERGE ("upsert") * statements supported by your database instead. */ public class JdbcIO { /** * Read data from a JDBC datasource. * * @param Type of the data to be read. */ public static Read read() { return new AutoValue_JdbcIO_Read.Builder().build(); } /** * Write data to a JDBC datasource. * * @param Type of the data to be written. */ public static Write write() { return new AutoValue_JdbcIO_Write.Builder().build(); } private JdbcIO() {} /** * An interface used by {@link JdbcIO.Read} for converting each row of the {@link ResultSet} into * an element of the resulting {@link PCollection}. */ public interface RowMapper extends Serializable { T mapRow(ResultSet resultSet) throws Exception; } /** * A POJO describing a {@link DataSource}, either providing directly a {@link DataSource} or all * properties allowing to create a {@link DataSource}. */ @AutoValue public abstract static class DataSourceConfiguration implements Serializable { @Nullable abstract String getDriverClassName(); @Nullable abstract String getUrl(); @Nullable abstract String getUsername(); @Nullable abstract String getPassword(); @Nullable abstract DataSource getDataSource(); abstract Builder builder(); @AutoValue.Builder abstract static class Builder { abstract Builder setDriverClassName(String driverClassName); abstract Builder setUrl(String url); abstract Builder setUsername(String username); abstract Builder setPassword(String password); abstract Builder setDataSource(DataSource dataSource); abstract DataSourceConfiguration build(); } public static DataSourceConfiguration create(DataSource dataSource) { checkArgument(dataSource != null, "DataSourceConfiguration.create(dataSource) called with " + "null data source"); checkArgument(dataSource instanceof Serializable, "DataSourceConfiguration.create(dataSource) called with a dataSource not Serializable"); return new AutoValue_JdbcIO_DataSourceConfiguration.Builder() .setDataSource(dataSource) .build(); } public static DataSourceConfiguration create(String driverClassName, String url) { checkArgument(driverClassName != null, "DataSourceConfiguration.create(driverClassName, url) called with null driverClassName"); checkArgument(url != null, "DataSourceConfiguration.create(driverClassName, url) called with null url"); return new AutoValue_JdbcIO_DataSourceConfiguration.Builder() .setDriverClassName(driverClassName) .setUrl(url) .build(); } public DataSourceConfiguration withUsername(String username) { return builder().setUsername(username).build(); } public DataSourceConfiguration withPassword(String password) { return builder().setPassword(password).build(); } private void populateDisplayData(DisplayData.Builder builder) { if (getDataSource() != null) { builder.addIfNotNull(DisplayData.item("dataSource", getDataSource().getClass().getName())); } else { builder.addIfNotNull(DisplayData.item("jdbcDriverClassName", getDriverClassName())); builder.addIfNotNull(DisplayData.item("jdbcUrl", getUrl())); builder.addIfNotNull(DisplayData.item("username", getUsername())); } } Connection getConnection() throws Exception { if (getDataSource() != null) { return (getUsername() != null) ? getDataSource().getConnection(getUsername(), getPassword()) : getDataSource().getConnection(); } else { BasicDataSource basicDataSource = new BasicDataSource(); basicDataSource.setDriverClassName(getDriverClassName()); basicDataSource.setUrl(getUrl()); basicDataSource.setUsername(getUsername()); basicDataSource.setPassword(getPassword()); return basicDataSource.getConnection(); } } } /** * An interface used by the JdbcIO Write to set the parameters of the {@link PreparedStatement} * used to setParameters into the database. */ public interface StatementPreparator extends Serializable { void setParameters(PreparedStatement preparedStatement) throws Exception; } /** A {@link PTransform} to read data from a JDBC datasource. */ @AutoValue public abstract static class Read extends PTransform> { @Nullable abstract DataSourceConfiguration getDataSourceConfiguration(); @Nullable abstract String getQuery(); @Nullable abstract StatementPreparator getStatementPreparator(); @Nullable abstract RowMapper getRowMapper(); @Nullable abstract Coder getCoder(); abstract Builder toBuilder(); @AutoValue.Builder abstract static class Builder { abstract Builder setDataSourceConfiguration(DataSourceConfiguration config); abstract Builder setQuery(String query); abstract Builder setStatementPreparator(StatementPreparator statementPreparator); abstract Builder setRowMapper(RowMapper rowMapper); abstract Builder setCoder(Coder coder); abstract Read build(); } public Read withDataSourceConfiguration(DataSourceConfiguration configuration) { checkArgument(configuration != null, "JdbcIO.read().withDataSourceConfiguration" + "(configuration) called with null configuration"); return toBuilder().setDataSourceConfiguration(configuration).build(); } public Read withQuery(String query) { checkArgument(query != null, "JdbcIO.read().withQuery(query) called with null query"); return toBuilder().setQuery(query).build(); } public Read withStatementPrepator(StatementPreparator statementPreparator) { checkArgument(statementPreparator != null, "JdbcIO.read().withStatementPreparator(statementPreparator) called " + "with null statementPreparator"); return toBuilder().setStatementPreparator(statementPreparator).build(); } public Read withRowMapper(RowMapper rowMapper) { checkArgument(rowMapper != null, "JdbcIO.read().withRowMapper(rowMapper) called with null rowMapper"); return toBuilder().setRowMapper(rowMapper).build(); } public Read withCoder(Coder coder) { checkArgument(coder != null, "JdbcIO.read().withCoder(coder) called with null coder"); return toBuilder().setCoder(coder).build(); } @Override public PCollection expand(PBegin input) { return input .apply(Create.of(getQuery())) .apply(ParDo.of(new ReadFn<>(this))).setCoder(getCoder()) // generate a random key followed by a GroupByKey and then ungroup // to prevent fusion // see https://cloud.google.com/dataflow/service/dataflow-service-desc#preventing-fusion // for details .apply(ParDo.of(new DoFn>() { private Random random; @Setup public void setup() { random = new Random(); } @ProcessElement public void processElement(ProcessContext context) { context.output(KV.of(random.nextInt(), context.element())); } })) .apply(GroupByKey.create()) .apply(Values.>create()) .apply(Flatten.iterables()); } @Override public void validate(PBegin input) { checkState(getQuery() != null, "JdbcIO.read() requires a query to be set via withQuery(query)"); checkState(getRowMapper() != null, "JdbcIO.read() requires a rowMapper to be set via withRowMapper(rowMapper)"); checkState(getCoder() != null, "JdbcIO.read() requires a coder to be set via withCoder(coder)"); checkState(getDataSourceConfiguration() != null, "JdbcIO.read() requires a DataSource configuration to be set via " + "withDataSourceConfiguration(dataSourceConfiguration)"); } @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); builder.add(DisplayData.item("query", getQuery())); builder.add(DisplayData.item("rowMapper", getRowMapper().getClass().getName())); builder.add(DisplayData.item("coder", getCoder().getClass().getName())); getDataSourceConfiguration().populateDisplayData(builder); } /** A {@link DoFn} executing the SQL query to read from the database. */ static class ReadFn extends DoFn { private JdbcIO.Read spec; private Connection connection; private ReadFn(Read spec) { this.spec = spec; } @Setup public void setup() throws Exception { connection = spec.getDataSourceConfiguration().getConnection(); } @ProcessElement public void processElement(ProcessContext context) throws Exception { String query = context.element(); try (PreparedStatement statement = connection.prepareStatement(query)) { if (this.spec.getStatementPreparator() != null) { this.spec.getStatementPreparator().setParameters(statement); } try (ResultSet resultSet = statement.executeQuery()) { while (resultSet.next()) { context.output(spec.getRowMapper().mapRow(resultSet)); } } } } @Teardown public void teardown() throws Exception { if (connection != null) { connection.close(); } } } } /** * An interface used by the JdbcIO Write to set the parameters of the {@link PreparedStatement} * used to setParameters into the database. */ public interface PreparedStatementSetter extends Serializable { void setParameters(T element, PreparedStatement preparedStatement) throws Exception; } /** A {@link PTransform} to write to a JDBC datasource. */ @AutoValue public abstract static class Write extends PTransform, PDone> { @Nullable abstract DataSourceConfiguration getDataSourceConfiguration(); @Nullable abstract String getStatement(); @Nullable abstract PreparedStatementSetter getPreparedStatementSetter(); abstract Builder toBuilder(); @AutoValue.Builder abstract static class Builder { abstract Builder setDataSourceConfiguration(DataSourceConfiguration config); abstract Builder setStatement(String statement); abstract Builder setPreparedStatementSetter(PreparedStatementSetter setter); abstract Write build(); } public Write withDataSourceConfiguration(DataSourceConfiguration config) { return toBuilder().setDataSourceConfiguration(config).build(); } public Write withStatement(String statement) { return toBuilder().setStatement(statement).build(); } public Write withPreparedStatementSetter(PreparedStatementSetter setter) { return toBuilder().setPreparedStatementSetter(setter).build(); } @Override public PDone expand(PCollection input) { input.apply(ParDo.of(new WriteFn(this))); return PDone.in(input.getPipeline()); } @Override public void validate(PCollection input) { checkArgument(getDataSourceConfiguration() != null, "JdbcIO.write() requires a configuration to be set via " + ".withDataSourceConfiguration(configuration)"); checkArgument(getStatement() != null, "JdbcIO.write() requires a statement to be set via .withStatement(statement)"); checkArgument(getPreparedStatementSetter() != null, "JdbcIO.write() requires a preparedStatementSetter to be set via " + ".withPreparedStatementSetter(preparedStatementSetter)"); } private static class WriteFn extends DoFn { private static final int DEFAULT_BATCH_SIZE = 1000; private final Write spec; private Connection connection; private PreparedStatement preparedStatement; private int batchCount; public WriteFn(Write spec) { this.spec = spec; } @Setup public void setup() throws Exception { connection = spec.getDataSourceConfiguration().getConnection(); connection.setAutoCommit(false); preparedStatement = connection.prepareStatement(spec.getStatement()); } @StartBundle public void startBundle(Context context) { batchCount = 0; } @ProcessElement public void processElement(ProcessContext context) throws Exception { T record = context.element(); preparedStatement.clearParameters(); spec.getPreparedStatementSetter().setParameters(record, preparedStatement); preparedStatement.addBatch(); batchCount++; if (batchCount >= DEFAULT_BATCH_SIZE) { finishBundle(context); } } @FinishBundle public void finishBundle(Context context) throws Exception { if (batchCount > 0) { preparedStatement.executeBatch(); connection.commit(); batchCount = 0; } } @Teardown public void teardown() throws Exception { try { if (preparedStatement != null) { preparedStatement.close(); } } finally { if (connection != null) { connection.close(); } } } } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy