/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.cloud.flink.bigquery.examples;

import org.apache.flink.api.connector.source.Boundedness;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.connector.base.DeliveryGuarantee;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.annotation.DataTypeHint;
import org.apache.flink.table.annotation.FunctionHint;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.functions.TableFunction;
import org.apache.flink.types.Row;

import com.google.cloud.flink.bigquery.sink.serializer.BigQueryTableSchemaProvider;
import com.google.cloud.flink.bigquery.table.config.BigQueryReadTableConfig;
import com.google.cloud.flink.bigquery.table.config.BigQuerySinkTableConfig;
import com.google.cloud.flink.bigquery.table.config.BigQueryTableConfig;
import org.apache.avro.generic.GenericRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static org.apache.flink.table.api.Expressions.$;
import static org.apache.flink.table.api.Expressions.call;

/**
 * A simple BigQuery table read and sink example with Flink's Table API.
 *
 * <p>The Flink pipeline will try to read the specified BigQuery table according to the command
 * line arguments, returning {@link GenericRecord} representing the rows, and print the result of
 * specified operations or write to a BigQuery table via sink.
 *
 * <ul>
 *   <li>Specify the BQ dataset and table with an optional row restriction. Users can configure a
 *       source mode, i.e. bounded or unbounded. Bounded implies that the BQ table will be read
 *       once at the time of execution, analogous to a batch job. Unbounded source implies that
 *       the BQ table will be periodically polled for new data. Resulting records can be written
 *       to another BQ table, with allowed delivery (write) guarantees at-least-once or
 *       exactly-once.
 * </ul>
 *
 * <p>The sequence of operations in both pipelines is: source > flatMap > sink.
 *
 * <p>Flink command line format is:
 *
 * <pre>
 * flink run {additional runtime params} {path to this jar}/BigQueryTableExample.jar
 *   --gcp-source-project {required; project ID containing the source table}
 *   --bq-source-dataset {required; name of dataset containing the source table}
 *   --bq-source-table {required; name of table to read}
 *   --gcp-sink-project {required; project ID containing the sink table}
 *   --bq-sink-dataset {required; name of dataset containing the sink table}
 *   --bq-sink-table {required; name of table to write to}
 *   --mode {optional; source read type. Allowed values are bounded (default), unbounded or hybrid}
 *   --restriction {optional; SQL filter applied at the BigQuery table before reading}
 *   --limit {optional; maximum records to read from BigQuery table}
 *   --checkpoint-interval {optional; milliseconds between state checkpoints}
 *   --partition-discovery-interval {optional; minutes between polling table for new data. Used in unbounded/hybrid mode}
 *   --delivery-guarantee {optional; sink consistency. Allowed values are at-least-once (default) or exactly-once}
 * </pre>
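 *
 * <p>For illustration only, a complete invocation could look like the following; the project,
 * dataset and table names here are placeholders, not values taken from this example:
 *
 * <pre>
 * flink run BigQueryTableExample.jar \
 *   --gcp-source-project my-project --bq-source-dataset my_dataset --bq-source-table my_table \
 *   --gcp-sink-project my-project --bq-sink-dataset my_dataset --bq-sink-table my_table_out \
 *   --mode unbounded --delivery-guarantee at-least-once --checkpoint-interval 60000
 * </pre>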
 */
public class BigQueryTableExample {

    private static final Logger LOG = LoggerFactory.getLogger(BigQueryTableExample.class);

    public static void main(String[] args) throws Exception {
        // Parse input arguments.
        final ParameterTool parameterTool = ParameterTool.fromArgs(args);
        if (parameterTool.getNumberOfParameters() < 1) {
            LOG.error(
                    "Missing parameters!\n"
                            + "Usage: flink run <additional runtime params> BigQueryTableExample.jar"
                            + " --gcp-source-project <gcp project id>"
                            + " --bq-source-dataset <dataset name>"
                            + " --bq-source-table <table name>"
                            + " --gcp-sink-project <gcp project id>"
                            + " --bq-sink-dataset <dataset name>"
                            + " --bq-sink-table <table name>"
                            + " --mode <bounded or unbounded>"
                            + " --restriction <row filter predicate>"
                            + " --limit <limit on records returned>"
                            + " --checkpoint-interval <milliseconds between state checkpoints>"
                            + " --partition-discovery-interval <minutes between polling for new data>"
                            + " --delivery-guarantee <at-least-once or exactly-once>");
            return;
        }

        /*
         * We will be reading avro generic records from BigQuery, and in this case we are assuming
         * the GOOGLE_APPLICATION_CREDENTIALS env variable will be present in the execution
         * runtime. In case of need to authenticate differently, the credentials builder (part of
         * the BigQueryConnectOptions) should enable capturing the credentials from various
         * sources.
         */
        String sourceGcpProjectName = parameterTool.getRequired("gcp-source-project");
        String sourceDatasetName = parameterTool.getRequired("bq-source-dataset");
        String sourceTableName = parameterTool.getRequired("bq-source-table");

        // Read - Optional Arguments
        Integer recordLimit = parameterTool.getInt("limit", -1);
        Long checkpointInterval = parameterTool.getLong("checkpoint-interval", 60000L);
        String rowRestriction = parameterTool.get("restriction", "").replace("\\u0027", "'");
        String mode = parameterTool.get("mode", "bounded");
        // Unbounded specific options.
        Integer partitionDiscoveryInterval =
                parameterTool.getInt("partition-discovery-interval", 10);

        // Sink Parameters
        String destGcpProjectName = parameterTool.getRequired("gcp-sink-project");
        String destDatasetName = parameterTool.getRequired("bq-sink-dataset");
        String destTableName = parameterTool.getRequired("bq-sink-table");
        String deliveryGuarantee = parameterTool.get("delivery-guarantee", "at-least-once");
        DeliveryGuarantee sinkMode;
        switch (deliveryGuarantee) {
            case "at-least-once":
                sinkMode = DeliveryGuarantee.AT_LEAST_ONCE;
                break;
            case "exactly-once":
                sinkMode = DeliveryGuarantee.EXACTLY_ONCE;
                break;
            default:
                throw new IllegalArgumentException(
                        String.format(
                                "Allowed values for delivery-guarantee are at-least-once or exactly-once. Found %s",
                                deliveryGuarantee));
        }

        switch (mode) {
            case "bounded":
                runBoundedTableAPIFlinkJob(
                        sourceGcpProjectName,
                        sourceDatasetName,
                        sourceTableName,
                        destGcpProjectName,
                        destDatasetName,
                        destTableName,
                        sinkMode,
                        rowRestriction,
                        recordLimit,
                        checkpointInterval);
                break;
            case "unbounded":
                runStreamingTableAPIFlinkJob(
                        sourceGcpProjectName,
                        sourceDatasetName,
                        sourceTableName,
                        destGcpProjectName,
                        destDatasetName,
                        destTableName,
                        sinkMode,
                        rowRestriction,
                        recordLimit,
                        checkpointInterval,
                        partitionDiscoveryInterval);
                break;
            default:
                throw new IllegalArgumentException(
                        "Allowed values for mode are bounded or unbounded. Found " + mode);
        }
    }

    /**
     * Bounded read and sink operation via Flink's Table API. The function is responsible for
     * reading a BigQuery table (having schema name STRING, number INTEGER, ts TIMESTAMP) in
     * bounded mode and then passing the obtained records via a flatmap. The flatmap appends a
     * string "_write_test" to the "name" field and writes the modified records back to another
     * BigQuery table.
     *
     * @param sourceGcpProjectName The GCP Project name of the source table.
     * @param sourceDatasetName Dataset name of the source table.
     * @param sourceTableName Source Table Name.
     * @param destGcpProjectName The GCP Project name of the destination table.
     * @param destDatasetName Dataset name of the destination table.
     * @param destTableName Destination Table Name.
     * @param sinkMode At-least-once or exactly-once write consistency.
     * @param rowRestriction String value, filtering the rows to be read.
     * @param limit Integer value, Number of rows to limit the read result.
     * @param checkpointInterval Long value, Interval between two check points (milliseconds).
     * @throws Exception in a case of error, obtaining Table Descriptor.
     */
    private static void runBoundedTableAPIFlinkJob(
            String sourceGcpProjectName,
            String sourceDatasetName,
            String sourceTableName,
            String destGcpProjectName,
            String destDatasetName,
            String destTableName,
            DeliveryGuarantee sinkMode,
            String rowRestriction,
            Integer limit,
            Long checkpointInterval)
            throws Exception {

        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.enableCheckpointing(checkpointInterval);
        final StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);
        tEnv.createTemporarySystemFunction("func", MyFlatMapFunction.class);

        // Declare Read Options.
        BigQueryTableConfig readTableConfig =
                BigQueryReadTableConfig.newBuilder()
                        .project(sourceGcpProjectName)
                        .dataset(sourceDatasetName)
                        .table(sourceTableName)
                        .limit(limit)
                        .rowRestriction(rowRestriction)
                        .boundedness(Boundedness.BOUNDED)
                        .build();

        // Register the Source Table
        tEnv.createTable(
                "bigQuerySourceTable",
                BigQueryTableSchemaProvider.getTableDescriptor(readTableConfig));

        // Read the table and pass to flatmap.
        Table sourceTable =
                tEnv.from("bigQuerySourceTable")
                        .select($("*"))
                        .flatMap(call("func", Row.of($("name"), $("number"), $("ts"))))
                        .as("name", "number", "ts");

        BigQueryTableConfig sinkTableConfig =
                BigQuerySinkTableConfig.newBuilder()
                        .project(destGcpProjectName)
                        .dataset(destDatasetName)
                        .table(destTableName)
                        .sinkParallelism(2)
                        .deliveryGuarantee(sinkMode)
                        .streamExecutionEnvironment(env)
                        .build();

        // Register the Sink Table
        tEnv.createTable(
                "bigQuerySinkTable",
                BigQueryTableSchemaProvider.getTableDescriptor(sinkTableConfig));

        // Insert the table sourceTable to the registered sinkTable
        sourceTable.executeInsert("bigQuerySinkTable");
    }

    /**
     * Unbounded read and sink operation via Flink's Table API. The function is responsible for
     * reading a BigQuery table (having schema name STRING, number INTEGER, ts TIMESTAMP) in
     * unbounded mode and then passing the obtained records via a flatmap. The flatmap appends a
     * string "_write_test" to the "name" field and writes the modified records back to another
     * BigQuery table.
     *
     * @param sourceGcpProjectName The GCP Project name of the source table.
     * @param sourceDatasetName Dataset name of the source table.
     * @param sourceTableName Source Table Name.
     * @param destGcpProjectName The GCP Project name of the destination table.
     * @param destDatasetName Dataset name of the destination table.
     * @param destTableName Destination Table Name.
     * @param sinkMode At-least-once or exactly-once write consistency.
     * @param rowRestriction String value, filtering the rows to be read.
     * @param limit Integer value, Number of rows to limit the read result.
     * @param checkpointInterval Long value, Interval between two check points (milliseconds).
     * @param partitionDiscoveryInterval Integer value, periodicity (in minutes) with which the
     *     source table is polled for new data.
     * @throws Exception in a case of error, obtaining Table Descriptor.
     */
    private static void runStreamingTableAPIFlinkJob(
            String sourceGcpProjectName,
            String sourceDatasetName,
            String sourceTableName,
            String destGcpProjectName,
            String destDatasetName,
            String destTableName,
            DeliveryGuarantee sinkMode,
            String rowRestriction,
            Integer limit,
            Long checkpointInterval,
            Integer partitionDiscoveryInterval)
            throws Exception {

        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.enableCheckpointing(checkpointInterval);
        final StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);
        tEnv.createTemporarySystemFunction("func", MyFlatMapFunction.class);

        // Declare Read Options.
        BigQueryTableConfig readTableConfig =
                BigQueryReadTableConfig.newBuilder()
                        .table(sourceTableName)
                        .project(sourceGcpProjectName)
                        .dataset(sourceDatasetName)
                        .limit(limit)
                        .rowRestriction(rowRestriction)
                        .partitionDiscoveryInterval(partitionDiscoveryInterval)
                        .boundedness(Boundedness.CONTINUOUS_UNBOUNDED)
                        .build();

        // Register the Source Table
        tEnv.createTable(
                "bigQuerySourceTable",
                BigQueryTableSchemaProvider.getTableDescriptor(readTableConfig));
        Table sourceTable = tEnv.from("bigQuerySourceTable");

        // Fetch entries in this sourceTable
        sourceTable = sourceTable.select($("*"));

        // Declare Write Options.
        BigQueryTableConfig sinkTableConfig =
                BigQuerySinkTableConfig.newBuilder()
                        .table(destTableName)
                        .project(destGcpProjectName)
                        .dataset(destDatasetName)
                        .sinkParallelism(2)
                        .deliveryGuarantee(sinkMode)
                        .streamExecutionEnvironment(env)
                        .build();

        // Register the Sink Table
        tEnv.createTable(
                "bigQuerySinkTable",
                BigQueryTableSchemaProvider.getTableDescriptor(sinkTableConfig));

        // Insert the table sourceTable to the registered sinkTable
        sourceTable =
                sourceTable
                        .flatMap(call("func", Row.of($("name"), $("number"), $("ts"))))
                        .as("name", "number", "ts");
        sourceTable.executeInsert("bigQuerySinkTable");
    }

    /**
     * Bounded read > join and sink operation via Flink's Table API. The function is responsible
     * for reading two BigQuery tables (having schema id STRING, name_left STRING or name_right
     * STRING) in bounded mode, joining them on the id column, and then writing the joined records
     * back to another BigQuery table.
     *
     * <p>This example is for reference only, and cannot be invoked from this class's main method.
     *
     * @param sourceGcpProjectName The GCP Project name of the source table.
     * @param sourceDatasetName Dataset name of the source table.
     * @param leftSourceTableName Source Table Name (left for Join).
     * @param rightSourceTableName Source Table Name (right for Join).
     * @param destGcpProjectName The GCP Project name of the destination table.
     * @param destDatasetName Dataset name of the destination table.
     * @param destTableName Destination Table Name.
     * @param sinkMode At-least-once or exactly-once write consistency.
     * @param rowRestriction String value, filtering the rows to be read.
     * @param limit Integer value, Number of rows to limit the read result.
     * @param checkpointInterval Long value, Interval between two check points (milliseconds).
     * @throws Exception in a case of error, obtaining Table Descriptor.
     */
    public static void runBoundedJoinFlinkJob(
            String sourceGcpProjectName,
            String sourceDatasetName,
            String leftSourceTableName,
            String rightSourceTableName,
            String destGcpProjectName,
            String destDatasetName,
            String destTableName,
            DeliveryGuarantee sinkMode,
            String rowRestriction,
            Integer limit,
            Long checkpointInterval)
            throws Exception {

        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.enableCheckpointing(checkpointInterval);
        final StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);
        tEnv.createTemporarySystemFunction("func", MyFlatMapFunction.class);

        // Declare Read Options.
        BigQueryTableConfig readTableConfig =
                BigQueryReadTableConfig.newBuilder()
                        .table(leftSourceTableName)
                        .project(sourceGcpProjectName)
                        .dataset(sourceDatasetName)
                        .limit(limit)
                        .rowRestriction(rowRestriction)
                        .boundedness(Boundedness.BOUNDED)
                        .build();

        // Register the left Source Table
        tEnv.createTable(
                "leftSourceTable",
                BigQueryTableSchemaProvider.getTableDescriptor(readTableConfig));

        readTableConfig =
                BigQueryReadTableConfig.newBuilder()
                        .table(rightSourceTableName)
                        .project(sourceGcpProjectName)
                        .dataset(sourceDatasetName)
                        .limit(limit)
                        .rowRestriction(rowRestriction)
                        .boundedness(Boundedness.BOUNDED)
                        .build();

        // Register the right Source Table
        tEnv.createTable(
                "rightSourceTable",
                BigQueryTableSchemaProvider.getTableDescriptor(readTableConfig));

        // Declare Write Options.
        BigQueryTableConfig sinkTableConfig =
                BigQuerySinkTableConfig.newBuilder()
                        .table(destTableName)
                        .project(destGcpProjectName)
                        .dataset(destDatasetName)
                        .deliveryGuarantee(sinkMode)
                        .streamExecutionEnvironment(env)
                        .build();

        // Register the Sink Table
        tEnv.createTable(
                "bigQuerySinkTable",
                BigQueryTableSchemaProvider.getTableDescriptor(sinkTableConfig));

        // Join Example - Table API
        // Table leftSourceTable = tEnv.from("leftSourceTable");
        // Table rightSourceTable = tEnv.from("rightSourceTable");
        // Table joinedTable =
        //         leftSourceTable
        //                 .renameColumns($("id").as("id_l"))
        //                 .join(rightSourceTable, $("id_l").isEqual($("id")))
        //                 .select($("id"), $("name_left"), $("name_right"));
        // joinedTable.executeInsert("bigQuerySinkTable");

        // Join Example - SQL
        tEnv.executeSql(
                "insert into bigQuerySinkTable Select leftSourceTable.id AS id, "
                        + "leftSourceTable.name_left AS name_left, rightSourceTable.name_right as name_right "
                        + "from leftSourceTable JOIN rightSourceTable ON "
                        + "leftSourceTable.id = rightSourceTable.id;");
    }

    /** Function to flatmap the Table API source Catalog Table. */
    @FunctionHint(
            input = @DataTypeHint("ROW<`name` STRING, `number` BIGINT, `ts` TIMESTAMP(6)>"),
            output = @DataTypeHint("ROW<`name` STRING, `number` BIGINT, `ts` TIMESTAMP(6)>"))
    public static class MyFlatMapFunction extends TableFunction<Row> {

        public void eval(Row row) {
            String str = (String) row.getField("name");
            collect(Row.of(str + "_write_test", row.getField("number"), row.getField("ts")));
        }
    }
}
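
// The following is a minimal, illustrative sketch (not part of the original example) of how the
// reference-only runBoundedJoinFlinkJob above could be invoked programmatically, since it is not
// reachable from BigQueryTableExample's main method. Every project, dataset and table name below
// is a placeholder, and the delivery guarantee and checkpoint interval are arbitrary choices.
class BigQueryJoinExampleRunner {

    public static void main(String[] args) throws Exception {
        BigQueryTableExample.runBoundedJoinFlinkJob(
                "source-project-id", // placeholder GCP project holding both source tables
                "source_dataset", // placeholder dataset holding both source tables
                "left_table", // placeholder left join input (id, name_left)
                "right_table", // placeholder right join input (id, name_right)
                "sink-project-id", // placeholder GCP project holding the destination table
                "sink_dataset", // placeholder destination dataset
                "joined_table", // placeholder destination table
                DeliveryGuarantee.AT_LEAST_ONCE, // at-least-once write consistency
                "", // no row restriction: read all rows
                -1, // no record limit
                60000L); // checkpoint every 60 seconds
    }
}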