/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.cloud.flink.bigquery.examples;

import org.apache.flink.api.connector.source.Boundedness;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.connector.base.DeliveryGuarantee;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.annotation.DataTypeHint;
import org.apache.flink.table.annotation.FunctionHint;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.functions.TableFunction;
import org.apache.flink.types.Row;

import com.google.cloud.flink.bigquery.sink.serializer.BigQueryTableSchemaProvider;
import com.google.cloud.flink.bigquery.table.config.BigQueryReadTableConfig;
import com.google.cloud.flink.bigquery.table.config.BigQuerySinkTableConfig;
import com.google.cloud.flink.bigquery.table.config.BigQueryTableConfig;
import org.apache.avro.generic.GenericRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static org.apache.flink.table.api.Expressions.$;
import static org.apache.flink.table.api.Expressions.call;

/**
 * A simple BigQuery table read and sink example with Flink's Table API.
 *
 * <p>The Flink pipeline will try to read the specified BigQuery table according to the command
 * line arguments, returning {@link GenericRecord} representing the rows, and print the result of
 * specified operations or write to a BigQuery table via sink.
 *
 * <ul>
 *   <li>Specify the BQ dataset and table with an optional row restriction. Users can configure a
 *       source mode, i.e. bounded or unbounded. Bounded implies that the BQ table will be read
 *       once at the time of execution, analogous to a batch job. Unbounded source implies that
 *       the BQ table will be periodically polled for new data. Resulting records can be written
 *       to another BQ table, with allowed delivery (write) guarantees at-least-once or
 *       exactly-once.
 * </ul>
 *
 * <p>The sequence of operations in both pipelines is: source > flatMap > sink.
 *
 * <p>Flink command line format is:
 *
 * <pre>
 * flink run {additional runtime params} {path to this jar}/BigQueryTableExample.jar
 *   --gcp-source-project {required; project ID containing the source table}
 *   --bq-source-dataset {required; name of dataset containing the source table}
 *   --bq-source-table {required; name of table to read}
 *   --gcp-sink-project {required; project ID containing the sink table}
 *   --bq-sink-dataset {required; name of dataset containing the sink table}
 *   --bq-sink-table {required; name of table to write to}
 *   --mode {optional; source read type. Allowed values are bounded (default), unbounded or hybrid}
 *   --restriction {optional; SQL filter applied at the BigQuery table before reading}
 *   --limit {optional; maximum records to read from BigQuery table}
 *   --checkpoint-interval {optional; milliseconds between state checkpoints}
 *   --partition-discovery-interval {optional; minutes between polling table for new data. Used in unbounded/hybrid mode}
 *   --delivery-guarantee {optional; sink consistency. Allowed values are at-least-once (default) or exactly-once}
 * </pre>
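 *
 * <p>For illustration only, a complete invocation could look like the following; the project,
 * dataset and table names here are placeholders, not values taken from this example:
 *
 * <pre>
 * flink run BigQueryTableExample.jar \
 *   --gcp-source-project my-project --bq-source-dataset my_dataset --bq-source-table my_table \
 *   --gcp-sink-project my-project --bq-sink-dataset my_dataset --bq-sink-table my_table_out \
 *   --mode unbounded --delivery-guarantee at-least-once --checkpoint-interval 60000
 * </pre>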
 */
public class BigQueryTableExample {

    private static final Logger LOG = LoggerFactory.getLogger(BigQueryTableExample.class);

    public static void main(String[] args) throws Exception {
        // Parse input arguments.
        final ParameterTool parameterTool = ParameterTool.fromArgs(args);
        if (parameterTool.getNumberOfParameters() < 1) {
            LOG.error(
                    "Missing parameters!\n"
                            + "Usage: flink run <additional runtime params> BigQueryTableExample.jar"
                            + " --gcp-source-project <gcp project id>"
                            + " --bq-source-dataset <dataset name>"
                            + " --bq-source-table <table name>"
                            + " --gcp-sink-project <gcp project id>"
                            + " --bq-sink-dataset <dataset name>"
                            + " --bq-sink-table <table name>"
                            + " --mode <bounded or unbounded>"
                            + " --restriction <row filter predicate>"
                            + " --limit <limit on records returned>"
                            + " --checkpoint-interval <milliseconds between state checkpoints>"
                            + " --partition-discovery-interval <minutes between polling for new data>"
                            + " --delivery-guarantee <at-least-once or exactly-once>");
            return;
        }

        /*
         * We will be reading avro generic records from BigQuery, and in this case we are assuming
         * the GOOGLE_APPLICATION_CREDENTIALS env variable will be present in the execution
         * runtime. In case of need to authenticate differently, the credentials builder (part of
         * the BigQueryConnectOptions) should enable capturing the credentials from various
         * sources.
         */
        String sourceGcpProjectName = parameterTool.getRequired("gcp-source-project");
        String sourceDatasetName = parameterTool.getRequired("bq-source-dataset");
        String sourceTableName = parameterTool.getRequired("bq-source-table");

        // Read - Optional Arguments
        Integer recordLimit = parameterTool.getInt("limit", -1);
        Long checkpointInterval = parameterTool.getLong("checkpoint-interval", 60000L);
        String rowRestriction = parameterTool.get("restriction", "").replace("\\u0027", "'");
        String mode = parameterTool.get("mode", "bounded");
        // Unbounded specific options.
        Integer partitionDiscoveryInterval =
                parameterTool.getInt("partition-discovery-interval", 10);

        // Sink Parameters
        String destGcpProjectName = parameterTool.getRequired("gcp-sink-project");
        String destDatasetName = parameterTool.getRequired("bq-sink-dataset");
        String destTableName = parameterTool.getRequired("bq-sink-table");
        String deliveryGuarantee = parameterTool.get("delivery-guarantee", "at-least-once");
        DeliveryGuarantee sinkMode;
        switch (deliveryGuarantee) {
            case "at-least-once":
                sinkMode = DeliveryGuarantee.AT_LEAST_ONCE;
                break;
            case "exactly-once":
                sinkMode = DeliveryGuarantee.EXACTLY_ONCE;
                break;
            default:
                throw new IllegalArgumentException(
                        String.format(
                                "Allowed values for delivery-guarantee are at-least-once or exactly-once. Found %s",
                                deliveryGuarantee));
        }

        switch (mode) {
            case "bounded":
                runBoundedTableAPIFlinkJob(
                        sourceGcpProjectName,
                        sourceDatasetName,
                        sourceTableName,
                        destGcpProjectName,
                        destDatasetName,
                        destTableName,
                        sinkMode,
                        rowRestriction,
                        recordLimit,
                        checkpointInterval);
                break;
            case "unbounded":
                runStreamingTableAPIFlinkJob(
                        sourceGcpProjectName,
                        sourceDatasetName,
                        sourceTableName,
                        destGcpProjectName,
                        destDatasetName,
                        destTableName,
                        sinkMode,
                        rowRestriction,
                        recordLimit,
                        checkpointInterval,
                        partitionDiscoveryInterval);
                break;
            default:
                throw new IllegalArgumentException(
                        "Allowed values for mode are bounded or unbounded. Found " + mode);
        }
    }

    /**
     * Bounded read and sink operation via Flink's Table API. The function is responsible for
     * reading a BigQuery table (having schema name STRING, number INTEGER, ts TIMESTAMP) in
     * bounded mode and then passing the obtained records via a flatmap. The flatmap appends a
     * string "_write_test" to the "name" field and writes the modified records back to another
     * BigQuery table.
     *
     * @param sourceGcpProjectName The GCP Project name of the source table.
     * @param sourceDatasetName Dataset name of the source table.
     * @param sourceTableName Source Table Name.
     * @param destGcpProjectName The GCP Project name of the destination table.
     * @param destDatasetName Dataset name of the destination table.
     * @param destTableName Destination Table Name.
     * @param sinkMode At-least-once or exactly-once write consistency.
     * @param rowRestriction String value, filtering the rows to be read.
     * @param limit Integer value, Number of rows to limit the read result.
     * @param checkpointInterval Long value, Interval between two check points (milliseconds).
     * @throws Exception in a case of error, obtaining Table Descriptor.
     */
    private static void runBoundedTableAPIFlinkJob(
            String sourceGcpProjectName,
            String sourceDatasetName,
            String sourceTableName,
            String destGcpProjectName,
            String destDatasetName,
            String destTableName,
            DeliveryGuarantee sinkMode,
            String rowRestriction,
            Integer limit,
            Long checkpointInterval)
            throws Exception {

        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.enableCheckpointing(checkpointInterval);
        final StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);
        tEnv.createTemporarySystemFunction("func", MyFlatMapFunction.class);

        // Declare Read Options.
        BigQueryTableConfig readTableConfig =
                BigQueryReadTableConfig.newBuilder()
                        .project(sourceGcpProjectName)
                        .dataset(sourceDatasetName)
                        .table(sourceTableName)
                        .limit(limit)
                        .rowRestriction(rowRestriction)
                        .boundedness(Boundedness.BOUNDED)
                        .build();

        // Register the Source Table
        tEnv.createTable(
                "bigQuerySourceTable",
                BigQueryTableSchemaProvider.getTableDescriptor(readTableConfig));

        // Read the table and pass to flatmap.
        Table sourceTable =
                tEnv.from("bigQuerySourceTable")
                        .select($("*"))
                        .flatMap(call("func", Row.of($("name"), $("number"), $("ts"))))
                        .as("name", "number", "ts");

        BigQueryTableConfig sinkTableConfig =
                BigQuerySinkTableConfig.newBuilder()
                        .project(destGcpProjectName)
                        .dataset(destDatasetName)
                        .table(destTableName)
                        .sinkParallelism(2)
                        .deliveryGuarantee(sinkMode)
                        .streamExecutionEnvironment(env)
                        .build();

        // Register the Sink Table
        tEnv.createTable(
                "bigQuerySinkTable",
                BigQueryTableSchemaProvider.getTableDescriptor(sinkTableConfig));

        // Insert the table sourceTable to the registered sinkTable
        sourceTable.executeInsert("bigQuerySinkTable");
    }

    /**
     * Unbounded read and sink operation via Flink's Table API. The function is responsible for
     * reading a BigQuery table (having schema name STRING, number INTEGER, ts TIMESTAMP) in
     * unbounded mode and then passing the obtained records via a flatmap. The flatmap appends a
     * string "_write_test" to the "name" field and writes the modified records back to another
     * BigQuery table.
     *
     * @param sourceGcpProjectName The GCP Project name of the source table.
     * @param sourceDatasetName Dataset name of the source table.
     * @param sourceTableName Source Table Name.
     * @param destGcpProjectName The GCP Project name of the destination table.
     * @param destDatasetName Dataset name of the destination table.
     * @param destTableName Destination Table Name.
     * @param sinkMode At-least-once or exactly-once write consistency.
     * @param rowRestriction String value, filtering the rows to be read.
     * @param limit Integer value, Number of rows to limit the read result.
     * @param checkpointInterval Long value, Interval between two check points (milliseconds).
     * @param partitionDiscoveryInterval Integer value, periodicity (in minutes) with which the
     *     source table is polled for new data.
     * @throws Exception in a case of error, obtaining Table Descriptor.
     */
    private static void runStreamingTableAPIFlinkJob(
            String sourceGcpProjectName,
            String sourceDatasetName,
            String sourceTableName,
            String destGcpProjectName,
            String destDatasetName,
            String destTableName,
            DeliveryGuarantee sinkMode,
            String rowRestriction,
            Integer limit,
            Long checkpointInterval,
            Integer partitionDiscoveryInterval)
            throws Exception {

        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.enableCheckpointing(checkpointInterval);
        final StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);
        tEnv.createTemporarySystemFunction("func", MyFlatMapFunction.class);

        // Declare Read Options.
        BigQueryTableConfig readTableConfig =
                BigQueryReadTableConfig.newBuilder()
                        .table(sourceTableName)
                        .project(sourceGcpProjectName)
                        .dataset(sourceDatasetName)
                        .limit(limit)
                        .rowRestriction(rowRestriction)
                        .partitionDiscoveryInterval(partitionDiscoveryInterval)
                        .boundedness(Boundedness.CONTINUOUS_UNBOUNDED)
                        .build();

        // Register the Source Table
        tEnv.createTable(
                "bigQuerySourceTable",
                BigQueryTableSchemaProvider.getTableDescriptor(readTableConfig));
        Table sourceTable = tEnv.from("bigQuerySourceTable");

        // Fetch entries in this sourceTable
        sourceTable = sourceTable.select($("*"));

        // Declare Write Options.
        BigQueryTableConfig sinkTableConfig =
                BigQuerySinkTableConfig.newBuilder()
                        .table(destTableName)
                        .project(destGcpProjectName)
                        .dataset(destDatasetName)
                        .sinkParallelism(2)
                        .deliveryGuarantee(sinkMode)
                        .streamExecutionEnvironment(env)
                        .build();

        // Register the Sink Table
        tEnv.createTable(
                "bigQuerySinkTable",
                BigQueryTableSchemaProvider.getTableDescriptor(sinkTableConfig));

        // Insert the table sourceTable to the registered sinkTable
        sourceTable =
                sourceTable
                        .flatMap(call("func", Row.of($("name"), $("number"), $("ts"))))
                        .as("name", "number", "ts");
        sourceTable.executeInsert("bigQuerySinkTable");
    }

    /**
     * Bounded read > join and sink operation via Flink's Table API. The function is responsible
     * for reading two BigQuery tables (having schema id STRING, name_left STRING or name_right
     * STRING) in bounded mode, joining them on the id column, and then writing the joined records
     * back to another BigQuery table.
     *
     * <p>This example is for reference only, and cannot be invoked from this class's main method.
     *
     * @param sourceGcpProjectName The GCP Project name of the source table.
     * @param sourceDatasetName Dataset name of the source table.
     * @param leftSourceTableName Source Table Name (left for Join).
     * @param rightSourceTableName Source Table Name (right for Join).
     * @param destGcpProjectName The GCP Project name of the destination table.
     * @param destDatasetName Dataset name of the destination table.
     * @param destTableName Destination Table Name.
     * @param sinkMode At-least-once or exactly-once write consistency.
     * @param rowRestriction String value, filtering the rows to be read.
     * @param limit Integer value, Number of rows to limit the read result.
     * @param checkpointInterval Long value, Interval between two check points (milliseconds).
     * @throws Exception in a case of error, obtaining Table Descriptor.
     */
    public static void runBoundedJoinFlinkJob(
            String sourceGcpProjectName,
            String sourceDatasetName,
            String leftSourceTableName,
            String rightSourceTableName,
            String destGcpProjectName,
            String destDatasetName,
            String destTableName,
            DeliveryGuarantee sinkMode,
            String rowRestriction,
            Integer limit,
            Long checkpointInterval)
            throws Exception {

        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.enableCheckpointing(checkpointInterval);
        final StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);
        tEnv.createTemporarySystemFunction("func", MyFlatMapFunction.class);

        // Declare Read Options.
        BigQueryTableConfig readTableConfig =
                BigQueryReadTableConfig.newBuilder()
                        .table(leftSourceTableName)
                        .project(sourceGcpProjectName)
                        .dataset(sourceDatasetName)
                        .limit(limit)
                        .rowRestriction(rowRestriction)
                        .boundedness(Boundedness.BOUNDED)
                        .build();

        // Register the left Source Table
        tEnv.createTable(
                "leftSourceTable",
                BigQueryTableSchemaProvider.getTableDescriptor(readTableConfig));

        readTableConfig =
                BigQueryReadTableConfig.newBuilder()
                        .table(rightSourceTableName)
                        .project(sourceGcpProjectName)
                        .dataset(sourceDatasetName)
                        .limit(limit)
                        .rowRestriction(rowRestriction)
                        .boundedness(Boundedness.BOUNDED)
                        .build();

        // Register the right Source Table
        tEnv.createTable(
                "rightSourceTable",
                BigQueryTableSchemaProvider.getTableDescriptor(readTableConfig));

        // Declare Write Options.
        BigQueryTableConfig sinkTableConfig =
                BigQuerySinkTableConfig.newBuilder()
                        .table(destTableName)
                        .project(destGcpProjectName)
                        .dataset(destDatasetName)
                        .deliveryGuarantee(sinkMode)
                        .streamExecutionEnvironment(env)
                        .build();

        // Register the Sink Table
        tEnv.createTable(
                "bigQuerySinkTable",
                BigQueryTableSchemaProvider.getTableDescriptor(sinkTableConfig));

        // Join Example - Table API
        // Table leftSourceTable = tEnv.from("leftSourceTable");
        // Table rightSourceTable = tEnv.from("rightSourceTable");
        // Table joinedTable =
        //         leftSourceTable
        //                 .renameColumns($("id").as("id_l"))
        //                 .join(rightSourceTable, $("id_l").isEqual($("id")))
        //                 .select($("id"), $("name_left"), $("name_right"));
        // joinedTable.executeInsert("bigQuerySinkTable");

        // Join Example - SQL
        tEnv.executeSql(
                "insert into bigQuerySinkTable Select leftSourceTable.id AS id, "
                        + "leftSourceTable.name_left AS name_left, rightSourceTable.name_right as name_right "
                        + "from leftSourceTable JOIN rightSourceTable ON "
                        + "leftSourceTable.id = rightSourceTable.id;");
    }

    /** Function to flatmap the Table API source Catalog Table. */
    @FunctionHint(
            input = @DataTypeHint("ROW<`name` STRING, `number` BIGINT, `ts` TIMESTAMP(6)>"),
            output = @DataTypeHint("ROW<`name` STRING, `number` BIGINT, `ts` TIMESTAMP(6)>"))
    public static class MyFlatMapFunction extends TableFunction<Row> {

        public void eval(Row row) {
            String str = (String) row.getField("name");
            collect(Row.of(str + "_write_test", row.getField("number"), row.getField("ts")));
        }
    }
}
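
// The following is a minimal, illustrative sketch (not part of the original example) of how the
// reference-only runBoundedJoinFlinkJob above could be invoked programmatically, since it is not
// reachable from BigQueryTableExample's main method. Every project, dataset and table name below
// is a placeholder, and the delivery guarantee and checkpoint interval are arbitrary choices.
class BigQueryJoinExampleRunner {

    public static void main(String[] args) throws Exception {
        BigQueryTableExample.runBoundedJoinFlinkJob(
                "source-project-id", // placeholder GCP project holding both source tables
                "source_dataset", // placeholder dataset holding both source tables
                "left_table", // placeholder left join input (id, name_left)
                "right_table", // placeholder right join input (id, name_right)
                "sink-project-id", // placeholder GCP project holding the destination table
                "sink_dataset", // placeholder destination dataset
                "joined_table", // placeholder destination table
                DeliveryGuarantee.AT_LEAST_ONCE, // at-least-once write consistency
                "", // no row restriction: read all rows
                -1, // no record limit
                60000L); // checkpoint every 60 seconds
    }
}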