All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.hudi.gcp.bigquery.BigQuerySyncTool Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.hudi.gcp.bigquery;

import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.util.HadoopConfigUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.VisibleForTesting;
import org.apache.hudi.sync.common.HoodieSyncTool;
import org.apache.hudi.sync.common.util.ManifestFileWriter;

import com.beust.jcommander.JCommander;
import com.google.cloud.bigquery.Schema;
import org.apache.hadoop.conf.Configuration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Collections;
import java.util.List;
import java.util.Properties;

import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_DATASET_NAME;
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_PARTITION_FIELDS;
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_SOURCE_URI;
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_SOURCE_URI_PREFIX;
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_TABLE_NAME;
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_USE_BQ_MANIFEST_FILE;
import static org.apache.hudi.gcp.bigquery.BigQuerySyncConfig.BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA;

/**
 * Tool to sync a hoodie table with a big query table. Either use it as an api
 * BigQuerySyncTool.syncHoodieTable(BigQuerySyncConfig) or as a command line java -cp hoodie-hive.jar BigQuerySyncTool [args]
 * 

* This utility will get the schema from the latest commit and will sync big query table schema. * * @Experimental */ public class BigQuerySyncTool extends HoodieSyncTool { private static final Logger LOG = LoggerFactory.getLogger(BigQuerySyncTool.class); private static final String SUFFIX_MANIFEST = "_manifest"; private static final String SUFFIX_VERSIONS = "_versions"; private final BigQuerySyncConfig config; private final String tableName; private final String manifestTableName; private final String versionsTableName; private final String snapshotViewName; private final ManifestFileWriter manifestFileWriter; private final HoodieBigQuerySyncClient bqSyncClient; private final HoodieTableMetaClient metaClient; private final BigQuerySchemaResolver bqSchemaResolver; public BigQuerySyncTool(Properties props) { this(props, HadoopConfigUtils.createHadoopConf(props), Option.empty()); } public BigQuerySyncTool(Properties props, Configuration configuration, Option metaClientOption) { // will build file writer, client, etc. 
from configs super(props, configuration); this.config = new BigQuerySyncConfig(props); this.tableName = config.getString(BIGQUERY_SYNC_TABLE_NAME); this.manifestTableName = tableName + SUFFIX_MANIFEST; this.versionsTableName = tableName + SUFFIX_VERSIONS; this.snapshotViewName = tableName; this.bqSyncClient = new HoodieBigQuerySyncClient(config, metaClientOption.orElseGet(() -> buildMetaClient(config))); // reuse existing meta client if not provided (only test cases will provide their own meta client) this.metaClient = bqSyncClient.getMetaClient(); this.manifestFileWriter = buildManifestFileWriterFromConfig(metaClient, config); this.bqSchemaResolver = BigQuerySchemaResolver.getInstance(); } @VisibleForTesting // allows us to pass in mocks for the writer and client BigQuerySyncTool(Properties properties, ManifestFileWriter manifestFileWriter, HoodieBigQuerySyncClient bigQuerySyncClient, HoodieTableMetaClient metaClient, BigQuerySchemaResolver bigQuerySchemaResolver) { super(properties); this.config = new BigQuerySyncConfig(props); this.tableName = config.getString(BIGQUERY_SYNC_TABLE_NAME); this.manifestTableName = tableName + SUFFIX_MANIFEST; this.versionsTableName = tableName + SUFFIX_VERSIONS; this.snapshotViewName = tableName; this.bqSyncClient = bigQuerySyncClient; this.metaClient = metaClient; this.manifestFileWriter = manifestFileWriter; this.bqSchemaResolver = bigQuerySchemaResolver; } private static ManifestFileWriter buildManifestFileWriterFromConfig(HoodieTableMetaClient metaClient, BigQuerySyncConfig config) { return ManifestFileWriter.builder() .setMetaClient(metaClient) .setUseFileListingFromMetadata(config.getBoolean(BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA)) .build(); } @Override public void syncHoodieTable() { switch (bqSyncClient.getTableType()) { case COPY_ON_WRITE: case MERGE_ON_READ: syncTable(bqSyncClient); break; default: throw new UnsupportedOperationException(bqSyncClient.getTableType() + " table type is not supported yet."); } } private 
boolean tableExists(HoodieBigQuerySyncClient bqSyncClient, String tableName) { if (bqSyncClient.tableExists(tableName)) { LOG.info("{} already exists. Skip table creation.", tableName); return true; } return false; } private void syncTable(HoodieBigQuerySyncClient bqSyncClient) { LOG.info("Sync hoodie table {} at base path {}", snapshotViewName, bqSyncClient.getBasePath()); if (!bqSyncClient.datasetExists()) { throw new HoodieBigQuerySyncException("Dataset not found: " + config.getString(BIGQUERY_SYNC_DATASET_NAME)); } List partitionFields = !StringUtils.isNullOrEmpty(config.getString(BIGQUERY_SYNC_SOURCE_URI_PREFIX)) ? config.getSplitStrings(BIGQUERY_SYNC_PARTITION_FIELDS) : Collections.emptyList(); Schema latestSchema = bqSchemaResolver.getTableSchema(metaClient, partitionFields); if (config.getBoolean(BIGQUERY_SYNC_USE_BQ_MANIFEST_FILE)) { manifestFileWriter.writeManifestFile(true); // if table does not exist, create it using the manifest file // if table exists but is not yet using manifest file or needs to be recreated with the big-lake connection ID, update it to use manifest file if (bqSyncClient.tableNotExistsOrDoesNotMatchSpecification(tableName)) { bqSyncClient.createOrUpdateTableUsingBqManifestFile( tableName, manifestFileWriter.getManifestSourceUri(true), config.getString(BIGQUERY_SYNC_SOURCE_URI_PREFIX), latestSchema); LOG.info("Completed table {} creation using the manifest file", tableName); } else { bqSyncClient.updateTableSchema(tableName, latestSchema, partitionFields); LOG.info("Synced schema for {}", tableName); } LOG.info("Sync table complete for {}", tableName); return; } manifestFileWriter.writeManifestFile(false); if (!tableExists(bqSyncClient, manifestTableName)) { bqSyncClient.createManifestTable(manifestTableName, manifestFileWriter.getManifestSourceUri(false)); LOG.info("Manifest table creation complete for {}", manifestTableName); } if (!tableExists(bqSyncClient, versionsTableName)) { bqSyncClient.createVersionsTable( versionsTableName, 
config.getString(BIGQUERY_SYNC_SOURCE_URI), config.getString(BIGQUERY_SYNC_SOURCE_URI_PREFIX), config.getSplitStrings(BIGQUERY_SYNC_PARTITION_FIELDS)); LOG.info("Versions table creation complete for {}", versionsTableName); } if (!tableExists(bqSyncClient, snapshotViewName)) { bqSyncClient.createSnapshotView(snapshotViewName, versionsTableName, manifestTableName); LOG.info("Snapshot view creation complete for {}", snapshotViewName); } LOG.info("Sync table complete for {}", snapshotViewName); } @Override public void close() throws Exception { super.close(); bqSyncClient.close(); } public static void main(String[] args) { final BigQuerySyncConfig.BigQuerySyncConfigParams params = new BigQuerySyncConfig.BigQuerySyncConfigParams(); JCommander cmd = JCommander.newBuilder().addObject(params).build(); cmd.parse(args); if (params.isHelp()) { cmd.usage(); System.exit(0); } new BigQuerySyncTool(params.toProps()).syncHoodieTable(); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy