io.cdap.plugin.gcp.bigquery.sink.lib.BigQueryOutputConfiguration Maven / Gradle / Ivy
/*
* Copyright © 2023 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package io.cdap.plugin.gcp.bigquery.sink.lib;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.api.services.bigquery.model.TimePartitioning;
import com.google.cloud.hadoop.io.bigquery.BigQueryConfiguration;
import com.google.cloud.hadoop.io.bigquery.BigQueryFileFormat;
import com.google.cloud.hadoop.io.bigquery.HadoopConfigurationProperty;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;
import java.io.IOException;
import java.util.List;
import java.util.Optional;
import static com.google.cloud.hadoop.io.bigquery.BigQueryConfiguration.OUTPUT_CLEANUP_TEMP;
import static com.google.cloud.hadoop.io.bigquery.BigQueryConfiguration.OUTPUT_DATASET_ID;
import static com.google.cloud.hadoop.io.bigquery.BigQueryConfiguration.OUTPUT_FILE_FORMAT;
import static com.google.cloud.hadoop.io.bigquery.BigQueryConfiguration.OUTPUT_FORMAT_CLASS;
import static com.google.cloud.hadoop.io.bigquery.BigQueryConfiguration.OUTPUT_PROJECT_ID;
import static com.google.cloud.hadoop.io.bigquery.BigQueryConfiguration.OUTPUT_TABLE_CREATE_DISPOSITION;
import static com.google.cloud.hadoop.io.bigquery.BigQueryConfiguration.OUTPUT_TABLE_ID;
import static com.google.cloud.hadoop.io.bigquery.BigQueryConfiguration.OUTPUT_TABLE_KMS_KEY_NAME;
import static com.google.cloud.hadoop.io.bigquery.BigQueryConfiguration.OUTPUT_TABLE_PARTITIONING;
import static com.google.cloud.hadoop.io.bigquery.BigQueryConfiguration.OUTPUT_TABLE_SCHEMA;
import static com.google.cloud.hadoop.io.bigquery.BigQueryConfiguration.OUTPUT_TABLE_WRITE_DISPOSITION;
import static com.google.cloud.hadoop.io.bigquery.BigQueryConfiguration.PROJECT_ID;
import static com.google.cloud.hadoop.util.ConfigurationUtil.getMandatoryConfig;
/**
* A container for configuration keys related to BigQuery indirect output formats. Alternatively,
* the properties can be set in the configuration xml files with proper values.
* - Moved from com.google.cloud.hadoop.io.bigquery bigquery-connector library
* - Update the table regex as per the requirement of flexible column names
* TODO(PLUGIN-1725): Migrate to Spark BigQuery connector.
*/
@InterfaceStability.Unstable
public class BigQueryOutputConfiguration {
/** A list of keys that are required for this output connector. */
public static final List> REQUIRED_PROPERTIES =
ImmutableList.of(OUTPUT_DATASET_ID, OUTPUT_TABLE_ID, OUTPUT_FILE_FORMAT, OUTPUT_FORMAT_CLASS);
/**
* A helper function to set the required output keys in the given configuration.
*
* @param conf the configuration to set the keys on.
* @param qualifiedOutputTableId the qualified id of the output table in the form: (Optional
* ProjectId):[DatasetId].[TableId]
. If the project id is missing, the default project
* id is attempted {@link BigQueryConfiguration#PROJECT_ID}.
* @param outputTableSchemaJson the schema of the BigQuery output table.
* @param outputGcsPath the path in GCS to stage data in. Example: 'gs://bucket/job'.
* @param outputFileFormat the formatting of the data being written by the output format class.
* @param outputFormatClass the file output format that will write files to GCS.
* @throws IOException
*/
@SuppressWarnings("rawtypes")
private static void configure(
Configuration conf,
String qualifiedOutputTableId,
String outputTableSchemaJson,
String outputGcsPath,
BigQueryFileFormat outputFileFormat,
Class extends FileOutputFormat> outputFormatClass)
throws IOException {
Preconditions.checkArgument(
!Strings.isNullOrEmpty(outputTableSchemaJson),
"outputTableSchemaJson must not be null or empty.");
TableReference outputTable = BigQueryStrings.parseTableReference(qualifiedOutputTableId);
configure(
conf,
outputTable.getProjectId(),
outputTable.getDatasetId(),
outputTable.getTableId(),
Optional.of(outputTableSchemaJson),
outputGcsPath,
outputFileFormat,
outputFormatClass);
}
/**
* A helper function to set the required output keys in the given configuration.
*
* @param conf the configuration to set the keys on.
* @param outputProjectId the id of the output project. If the project id is null, the default
* project id is attempted {@link BigQueryConfiguration#PROJECT_ID}.
* @param outputDatasetId the id of the output dataset.
* @param outputTableId the id of the output table.
* @param outputTableSchemaJson the schema of the BigQuery output table. If the schema is null,
* BigQuery will attempt to auto detect the schema. When using avro formatted data, a schema
* is not required as avro stores the schema in the file.
* @param outputGcsPath the path in GCS to stage data in. Example: 'gs://bucket/job'.
* @param outputFileFormat the formatting of the data being written by the output format class.
* @param outputFormatClass the file output format that will write files to GCS.
* @throws IOException
*/
@SuppressWarnings("rawtypes")
private static void configure(
Configuration conf,
String outputProjectId,
String outputDatasetId,
String outputTableId,
Optional outputTableSchemaJson,
String outputGcsPath,
BigQueryFileFormat outputFileFormat,
Class extends FileOutputFormat> outputFormatClass)
throws IOException {
// Use the default project ID as a backup.
if (Strings.isNullOrEmpty(outputProjectId)) {
outputProjectId = PROJECT_ID.get(conf, conf::get);
}
Preconditions.checkArgument(
!Strings.isNullOrEmpty(outputProjectId), "outputProjectId must not be null or empty.");
Preconditions.checkArgument(
!Strings.isNullOrEmpty(outputDatasetId), "outputDatasetId must not be null or empty.");
Preconditions.checkArgument(
!Strings.isNullOrEmpty(outputTableId), "outputTableId must not be null or empty.");
Preconditions.checkArgument(
!Strings.isNullOrEmpty(outputGcsPath), "outputGcsPath must not be null or empty.");
Preconditions.checkNotNull(outputFileFormat, "outputFileFormat must not be null.");
Preconditions.checkNotNull(outputFormatClass, "outputFormatClass must not be null.");
conf.set(OUTPUT_PROJECT_ID.getKey(), outputProjectId);
conf.set(OUTPUT_DATASET_ID.getKey(), outputDatasetId);
conf.set(OUTPUT_TABLE_ID.getKey(), outputTableId);
conf.set(OUTPUT_FILE_FORMAT.getKey(), outputFileFormat.name());
conf.setClass(OUTPUT_FORMAT_CLASS.getKey(), outputFormatClass, FileOutputFormat.class);
setFileOutputFormatOutputPath(conf, outputGcsPath);
// If a schema is provided, serialize it.
if (outputTableSchemaJson.isPresent()) {
TableSchema tableSchema = BigQueryTableHelper.parseTableSchema(outputTableSchemaJson.get());
String fieldsJson = BigQueryTableHelper.getTableFieldsJson(tableSchema);
conf.set(OUTPUT_TABLE_SCHEMA.getKey(), fieldsJson);
}
}
/**
* A helper function to set the required output keys in the given configuration.
*
* @param conf the configuration to set the keys on.
* @param qualifiedOutputTableId the qualified id of the output table in the form: (Optional
* ProjectId):[DatasetId].[TableId]
. If the project id is missing, the default project
* id is attempted {@link BigQueryConfiguration#PROJECT_ID}.
* @param outputTableSchema the schema of the BigQuery output table. If the schema is null,
* BigQuery will attempt to auto detect the schema. When using avro formatted data, a schema
* is not required as avro stores the schema in the file.
* @param outputGcsPath the path in GCS to stage data in. Example: 'gs://bucket/job'.
* @param outputFileFormat the formatting of the data being written by the output format class.
* @param outputFormatClass the file output format that will write files to GCS.
* @throws IOException
*/
@SuppressWarnings("rawtypes")
public static void configure(
Configuration conf,
String qualifiedOutputTableId,
BigQueryTableSchema outputTableSchema,
String outputGcsPath,
BigQueryFileFormat outputFileFormat,
Class extends FileOutputFormat> outputFormatClass)
throws IOException {
configure(
conf,
qualifiedOutputTableId,
BigQueryTableHelper.getTableSchemaJson(outputTableSchema.get()),
outputGcsPath,
outputFileFormat,
outputFormatClass);
}
/**
* A helper function to set the required output keys in the given configuration.
*
* This method will set the output table schema as auto-detected.
*
* @param conf the configuration to set the keys on.
* @param qualifiedOutputTableId the qualified id of the output table in the form: (Optional
* ProjectId):[DatasetId].[TableId]
. If the project id is missing, the default project
* id is attempted {@link BigQueryConfiguration#PROJECT_ID}.
* @param outputGcsPath the path in GCS to stage data in. Example: 'gs://bucket/job'.
* @param outputFileFormat the formatting of the data being written by the output format class.
* @param outputFormatClass the file output format that will write files to GCS.
* @throws IOException
*/
@SuppressWarnings("rawtypes")
public static void configureWithAutoSchema(
Configuration conf,
String qualifiedOutputTableId,
String outputGcsPath,
BigQueryFileFormat outputFileFormat,
Class extends FileOutputFormat> outputFormatClass)
throws IOException {
TableReference outputTable = BigQueryStrings.parseTableReference(qualifiedOutputTableId);
configure(
conf,
outputTable.getProjectId(),
outputTable.getDatasetId(),
outputTable.getTableId(),
/* outputTableSchemaJson= */ Optional.empty(),
outputGcsPath,
outputFileFormat,
outputFormatClass);
}
public static void setKmsKeyName(Configuration conf, String kmsKeyName) {
Preconditions.checkArgument(
!Strings.isNullOrEmpty(kmsKeyName), "kmsKeyName must not be null or empty.");
conf.set(OUTPUT_TABLE_KMS_KEY_NAME.getKey(), kmsKeyName);
}
/**
* Helper function that validates the output configuration. Ensures the project id, dataset id,
* and table id exist in the configuration. This also ensures that if a schema is provided, that
* it is properly formatted.
*
* @param conf the configuration to validate.
* @throws IOException if the configuration is missing a key, or there's an issue while parsing
* the schema in the configuration.
*/
public static void validateConfiguration(Configuration conf) throws IOException {
// Ensure the BigQuery output information is valid.
getMandatoryConfig(conf, REQUIRED_PROPERTIES);
// Run through the individual getters as they manage error handling.
getProjectId(conf);
getJobProjectId(conf);
getTableSchema(conf);
getFileFormat(conf);
getFileOutputFormat(conf);
getGcsOutputPath(conf);
}
/**
* Gets if the configuration flag to cleanup temporary data in GCS is enabled or not.
*
* @param conf the configuration to reference the key from.
* @return true if the flag is enabled or missing, false otherwise.
*/
public static boolean getCleanupTemporaryDataFlag(Configuration conf) {
return OUTPUT_CLEANUP_TEMP.get(conf, conf::getBoolean);
}
/**
* Gets the output dataset project id based on the given configuration.
*
*
If the {@link BigQueryConfiguration#OUTPUT_PROJECT_ID} is missing, this resolves to
* referencing the {@link BigQueryConfiguration#PROJECT_ID} key.
*
*
The load job can be configured with two project identifiers. Configuration key {@link
* BigQueryConfiguration#PROJECT_ID} can set the project on whose behalf to perform BigQuery load
* operation, while {@link BigQueryConfiguration#OUTPUT_PROJECT_ID} can be used to name the
* project that the target dataset belongs to.
*
* @param conf the configuration to reference the keys from.
* @return the project id based on the given configuration.
* @throws IOException if a required key is missing.
*/
public static String getProjectId(Configuration conf) throws IOException {
// Reference the default project ID as a backup.
String projectId = OUTPUT_PROJECT_ID.get(conf, conf::get);
if (Strings.isNullOrEmpty(projectId)) {
projectId = PROJECT_ID.get(conf, conf::get);
}
if (Strings.isNullOrEmpty(projectId)) {
throw new IOException(
"Must supply a value for configuration setting: " + OUTPUT_PROJECT_ID.getKey());
}
return projectId;
}
/**
* Gets the project id to be used to run BQ load job based on the given configuration.
*
*
If the {@link BigQueryConfiguration#PROJECT_ID} is missing, this resolves to referencing the
* {@link BigQueryConfiguration#OUTPUT_PROJECT_ID} key.
*
*
The load job can be configured with two project identifiers. Configuration key {@link
* BigQueryConfiguration#PROJECT_ID} can set the project on whose behalf to perform BigQuery load
* operation, while {@link BigQueryConfiguration#OUTPUT_PROJECT_ID} can be used to name the
* project that the target dataset belongs to.
*
* @param conf the configuration to reference the keys from.
* @return the project id based on the given configuration.
* @throws IOException if a required key is missing.
*/
public static String getJobProjectId(Configuration conf) throws IOException {
// Reference the default project ID as a backup.
String projectId = PROJECT_ID.get(conf, conf::get);
if (Strings.isNullOrEmpty(projectId)) {
projectId = OUTPUT_PROJECT_ID.get(conf, conf::get);
}
if (Strings.isNullOrEmpty(projectId)) {
throw new IOException(
"Must supply a value for configuration setting: " + PROJECT_ID.getKey());
}
return projectId;
}
/**
* Gets the output table reference based on the given configuration. If the {@link
* BigQueryConfiguration#OUTPUT_PROJECT_ID} is missing, this resolves to referencing the
* {@link BigQueryConfiguration#PROJECT_ID} key.
*
* @param conf the configuration to reference the keys from.
* @return a reference to the derived output table in the format of ":.".
* @throws IOException if a required key is missing.
*/
static TableReference getTableReference(Configuration conf) throws IOException {
// Ensure the BigQuery output information is valid.
String projectId = getProjectId(conf);
String datasetId = getMandatoryConfig(conf, OUTPUT_DATASET_ID);
String tableId = getMandatoryConfig(conf, OUTPUT_TABLE_ID);
return new TableReference().setProjectId(projectId).setDatasetId(datasetId).setTableId(tableId);
}
/**
* Gets the output table schema based on the given configuration.
*
* @param conf the configuration to reference the keys from.
* @return the derived table schema, absent value if no table schema exists in the configuration.
* @throws IOException if a table schema was set in the configuration but couldn't be parsed.
*/
static Optional getTableSchema(Configuration conf) throws IOException {
String fieldsJson = OUTPUT_TABLE_SCHEMA.get(conf, conf::get);
if (!Strings.isNullOrEmpty(fieldsJson)) {
try {
TableSchema tableSchema = BigQueryTableHelper.createTableSchemaFromFields(fieldsJson);
return Optional.of(BigQueryTableSchema.wrap(tableSchema));
} catch (IOException e) {
throw new IOException("Unable to parse key '" + OUTPUT_TABLE_SCHEMA.getKey() + "'.", e);
}
}
return Optional.empty();
}
/**
* Gets the output table time partitioning based on the given configuration.
*
* @param conf the configuration to reference the keys from.
* @return the derived table time partitioning, absent value if no table time partitioning exists
* in the configuration.
* @throws IOException if a table time partitioning was set in the configuration but couldn't be
* parsed.
*/
static Optional getTablePartitioning(Configuration conf)
throws IOException {
String fieldsJson = OUTPUT_TABLE_PARTITIONING.get(conf, conf::get);
if (!Strings.isNullOrEmpty(fieldsJson)) {
try {
TimePartitioning tablePartitioning = BigQueryTimePartitioning.getFromJson(fieldsJson);
return Optional.of(BigQueryTimePartitioning.wrap(tablePartitioning));
} catch (IOException e) {
throw new IOException(
"Unable to parse key '" + OUTPUT_TABLE_PARTITIONING.getKey() + "'.", e);
}
}
return Optional.empty();
}
/**
* Gets the output table KMS key name based on the given configuration.
*
* @param conf the configuration to reference the keys from.
* @return the KMS key name of the output table, null if no KMS key name exists in the
* configuration.
*/
public static String getKmsKeyName(Configuration conf) throws IOException {
return OUTPUT_TABLE_KMS_KEY_NAME.get(conf, conf::get);
}
/**
* Gets the stored output {@link BigQueryFileFormat} in the configuration.
*
* @param conf the configuration to reference the keys from.
* @return the stored output {@link BigQueryFileFormat} in the configuration.
* @throws IOException if file format value is missing from the configuration.
*/
public static BigQueryFileFormat getFileFormat(Configuration conf) throws IOException {
// Ensure the BigQuery output information is valid.
String fileFormatName = getMandatoryConfig(conf, OUTPUT_FILE_FORMAT);
return BigQueryFileFormat.fromName(fileFormatName);
}
/**
* Gets a configured instance of the stored {@link FileOutputFormat} in the configuration.
*
* @param conf the configuration to reference the keys from.
* @return a configured instance of the stored {@link FileOutputFormat} in the configuration.
* @throws IOException if there's an issue getting an instance of a FileOutputFormat from the
* configuration.
*/
@SuppressWarnings("rawtypes")
public static FileOutputFormat getFileOutputFormat(Configuration conf) throws IOException {
// Ensure the BigQuery output information is valid.
getMandatoryConfig(conf, OUTPUT_FORMAT_CLASS);
Class> confClass = OUTPUT_FORMAT_CLASS.get(conf, conf::getClass);
// Fail if the default value was used, or the class isn't a FileOutputFormat.
if (confClass == null) {
throw new IOException(
"Unable to resolve value for the configuration key '"
+ OUTPUT_FORMAT_CLASS.getKey()
+ "'.");
} else if (!FileOutputFormat.class.isAssignableFrom(confClass)) {
throw new IOException("The class " + confClass.getName() + " is not a FileOutputFormat.");
}
Class extends FileOutputFormat> fileOutputClass =
confClass.asSubclass(FileOutputFormat.class);
// Create a new instance and configure it if it's configurable.
return ReflectionUtils.newInstance(fileOutputClass, conf);
}
/**
* Gets the stored GCS output path in the configuration.
*
* @param conf the configuration to reference the keys from.
* @return the stored output path in the configuration.
* @throws IOException if the output path isn't set in the configuration, or the output path's
* file system isn't GCS.
*/
public static Path getGcsOutputPath(Configuration conf) throws IOException {
Job tempJob = new BigQueryOutputConfiguration.JobConfigurationAdapter(conf);
// Error if the output path is missing.
Path outputPath = FileOutputFormat.getOutputPath(tempJob);
if (outputPath == null) {
throw new IOException("FileOutputFormat output path not set.");
}
// Error if the output file system isn't GCS.
FileSystem fs = outputPath.getFileSystem(conf);
if (!"gs".equals(fs.getScheme())) {
throw new IOException("Output FileSystem must be GCS ('gs' scheme).");
}
return outputPath;
}
/**
* Gets the create disposition of the output table. This specifies if the job should create a
* table for loading data.
*
* @param conf the configuration to reference the keys from.
* @return the create disposition of the output table.
*/
public static String getCreateDisposition(Configuration conf) {
return OUTPUT_TABLE_CREATE_DISPOSITION.get(conf, conf::get);
}
/**
* Gets the write disposition of the output table. This specifies the action that occurs if the
* destination table already exists. By default, if the table already exists, BigQuery appends
* data to the output table.
*
* @param conf the configuration to reference the keys from.
* @return the write disposition of the output table.
*/
public static String getWriteDisposition(Configuration conf) {
return OUTPUT_TABLE_WRITE_DISPOSITION.get(conf, conf::get);
}
/**
* Sets the output path for FileOutputFormat.
*
* @param conf the configuration to pass to FileOutputFormat.
* @param outputPath the path to set as the output path.
* @throws IOException
*/
@VisibleForTesting
static void setFileOutputFormatOutputPath(Configuration conf, String outputPath)
throws IOException {
Job tempJob = new BigQueryOutputConfiguration.JobConfigurationAdapter(conf);
FileOutputFormat.setOutputPath(tempJob, new Path(outputPath));
}
/**
* This class provides a workaround for setting FileOutputFormat's output path. Creating a job
* with a configuration creates a defensive copy of the configuration for the job, meaning changes
* in either configuration will not be reflected in the other. Because FileOutputFormat requires a
* job for the API to set an output path, this adapter is used to ensure changes are propagated
* out to the wrapped configuration.
*/
private static class JobConfigurationAdapter extends Job {
private final Configuration config;
JobConfigurationAdapter(Configuration config) throws IOException {
super();
this.config = config;
}
@Override
public Configuration getConfiguration() {
return config;
}
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy