/*
* Copyright © 2022 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package io.cdap.plugin.gcp.dataplex.sink;
import com.google.api.gax.rpc.ApiException;
import com.google.auth.Credentials;
import com.google.auth.oauth2.GoogleCredentials;
import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.Dataset;
import com.google.cloud.bigquery.DatasetId;
import com.google.cloud.bigquery.Field;
import com.google.cloud.bigquery.Job;
import com.google.cloud.bigquery.JobConfiguration;
import com.google.cloud.bigquery.JobId;
import com.google.cloud.bigquery.JobStatistics;
import com.google.cloud.bigquery.Table;
import com.google.cloud.dataplex.v1.Asset;
import com.google.cloud.dataplex.v1.AssetName;
import com.google.cloud.dataplex.v1.CreateEntityRequest;
import com.google.cloud.dataplex.v1.DataplexServiceClient;
import com.google.cloud.dataplex.v1.Entity;
import com.google.cloud.dataplex.v1.EntityName;
import com.google.cloud.dataplex.v1.GetEntityRequest;
import com.google.cloud.dataplex.v1.MetadataServiceClient;
import com.google.cloud.dataplex.v1.StorageFormat;
import com.google.cloud.dataplex.v1.StorageSystem;
import com.google.cloud.dataplex.v1.UpdateEntityRequest;
import com.google.cloud.hadoop.io.bigquery.BigQueryConfiguration;
import com.google.cloud.kms.v1.CryptoKeyName;
import com.google.cloud.storage.Bucket;
import com.google.cloud.storage.Storage;
import com.google.cloud.storage.StorageException;
import com.google.common.base.Strings;
import io.cdap.cdap.api.annotation.Description;
import io.cdap.cdap.api.annotation.Name;
import io.cdap.cdap.api.annotation.Plugin;
import io.cdap.cdap.api.data.batch.Output;
import io.cdap.cdap.api.data.format.StructuredRecord;
import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.cdap.api.dataset.lib.KeyValue;
import io.cdap.cdap.api.plugin.InvalidPluginConfigException;
import io.cdap.cdap.api.plugin.InvalidPluginProperty;
import io.cdap.cdap.etl.api.Emitter;
import io.cdap.cdap.etl.api.FailureCollector;
import io.cdap.cdap.etl.api.PipelineConfigurer;
import io.cdap.cdap.etl.api.StageConfigurer;
import io.cdap.cdap.etl.api.StageMetrics;
import io.cdap.cdap.etl.api.batch.BatchSink;
import io.cdap.cdap.etl.api.batch.BatchSinkContext;
import io.cdap.cdap.etl.api.validation.FormatContext;
import io.cdap.cdap.etl.api.validation.ValidatingOutputFormat;
import io.cdap.plugin.common.LineageRecorder;
import io.cdap.plugin.common.batch.sink.SinkOutputFormatProvider;
import io.cdap.plugin.format.FileFormat;
import io.cdap.plugin.gcp.bigquery.sink.AbstractBigQuerySink;
import io.cdap.plugin.gcp.bigquery.sink.BigQuerySinkUtils;
import io.cdap.plugin.gcp.bigquery.sink.PartitionType;
import io.cdap.plugin.gcp.bigquery.sink.lib.BigQueryTableFieldSchema;
import io.cdap.plugin.gcp.bigquery.util.BigQueryConstants;
import io.cdap.plugin.gcp.bigquery.util.BigQueryUtil;
import io.cdap.plugin.gcp.common.CmekUtils;
import io.cdap.plugin.gcp.common.GCPUtils;
import io.cdap.plugin.gcp.dataplex.common.util.DataplexConstants;
import io.cdap.plugin.gcp.dataplex.common.util.DataplexUtil;
import io.cdap.plugin.gcp.dataplex.sink.config.DataplexBatchSinkConfig;
import io.cdap.plugin.gcp.gcs.GCSPath;
import io.cdap.plugin.gcp.gcs.StorageClient;
import io.cdap.plugin.gcp.gcs.sink.GCSBatchSink;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.UUID;
import java.util.stream.Collectors;
import javax.annotation.Nullable;
/**
* Batch Sink that writes data to Dataplex assets (BigQuery or Cloud Storage).
*
* {@code StructuredRecord} is the first parameter because that is what the
* sink will take as an input.
* Object is the second parameter because that is the key used
* by Hadoop's {@code TextOutputFormat}.
* {@code Object} is the third parameter because that is the value used by
* Hadoop's {@code TextOutputFormat}. All the plugins included with Hydrator operate on
* StructuredRecord.
*/
@Plugin(type = BatchSink.PLUGIN_TYPE)
@Name(DataplexBatchSink.NAME)
@Description("Ingests and processes data within Dataplex.")
public final class DataplexBatchSink extends BatchSink<StructuredRecord, Object, Object> {
public static final String NAME = "Dataplex";
private static final Logger LOG = LoggerFactory.getLogger(DataplexBatchSink.class);
private static final String RECORDS_UPDATED_METRIC = "records.updated";
// Stores the plugin config that was passed to this sink.
private final DataplexBatchSinkConfig config;
// UUID for the run. Will be used as bucket name for BigQuery assets.
// UUID is used since GCS bucket names must be globally unique.
private final UUID runUUID = UUID.randomUUID();
protected Configuration baseConfiguration;
protected BigQuery bigQuery;
private String outputPath;
private Asset asset;
private Entity entityBean = null;
public DataplexBatchSink(DataplexBatchSinkConfig config) {
this.config = config;
}
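/**
* Validates the plugin configuration at deployment time: checks the connection and service
* account settings, then validates the schema and asset configuration for the selected asset
* type (BigQuery dataset or storage bucket).
*/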
@Override
public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
super.configurePipeline(pipelineConfigurer);
StageConfigurer configurer = pipelineConfigurer.getStageConfigurer();
FailureCollector collector = configurer.getFailureCollector();
GoogleCredentials credentials = config.validateAndGetServiceAccountCredentials(collector);
try (DataplexServiceClient dataplexServiceClient = DataplexUtil.getDataplexServiceClient(credentials)) {
if (!config.getConnection().canConnect() || config.getServiceAccountType() == null ||
(config.isServiceAccountFilePath() && config.autoServiceAccountUnavailable()) ||
(config.tryGetProject() == null)) {
return;
}
Schema inputSchema = configurer.getInputSchema();
Schema configuredSchema = config.getSchema(collector);
config.validateAssetConfiguration(collector, dataplexServiceClient);
if (config.getAssetType().equals(DataplexConstants.BIGQUERY_DATASET_ASSET_TYPE)) {
config.validateBigQueryDataset(inputSchema, configuredSchema, collector, dataplexServiceClient);
return;
}
if (config.getAssetType().equals(DataplexConstants.STORAGE_BUCKET_ASSET_TYPE)) {
config.validateStorageBucket(collector);
config.validateFormatForStorageBucket(pipelineConfigurer, collector);
if (config.isUpdateDataplexMetadata()) {
prepareDataplexMetadataUpdate(collector, configuredSchema);
}
return;
}
} catch (IOException e) {
collector.addFailure(e.getMessage(), null);
}
}
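/**
* Prepares the run: fetches the configured Dataplex asset and delegates to the BigQuery dataset
* or storage bucket preparation path based on the asset type.
*/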
@Override
public void prepareRun(BatchSinkContext context) throws Exception {
FailureCollector collector = context.getFailureCollector();
GoogleCredentials credentials = config.validateAndGetServiceAccountCredentials(collector);
try (DataplexServiceClient dataplexServiceClient =
DataplexUtil.getDataplexServiceClient(credentials)) {
config.validateAssetConfiguration(collector, dataplexServiceClient);
asset =
dataplexServiceClient.getAsset(AssetName.newBuilder().setProject(config.tryGetProject())
.setLocation(config.getLocation())
.setLake(config.getLake()).setZone(config.getZone()).setAsset(config.getAsset()).build());
if (config.getAssetType().equals(DataplexConstants.BIGQUERY_DATASET_ASSET_TYPE)) {
config.validateBigQueryDataset(context.getInputSchema(), context.getOutputSchema(), collector,
dataplexServiceClient);
prepareRunBigQueryDataset(context);
}
if (config.getAssetType().equals(DataplexConstants.STORAGE_BUCKET_ASSET_TYPE)) {
config.validateStorageBucket(collector);
if (config.isUpdateDataplexMetadata()) {
prepareDataplexMetadataUpdate(collector, config.getSchema(collector));
}
prepareRunStorageBucket(context);
}
}
}
/**
* Emits the record as a key/value pair whose order depends on the asset type: for BigQuery
* dataset assets the record is emitted as the key, for storage bucket assets it is emitted as
* the value.
*
* @param input   the record to write
* @param emitter emitter for the output key/value pair
*/
@Override
public void transform(StructuredRecord input, Emitter<KeyValue<Object, Object>> emitter) {
if (this.config.getAssetType().equalsIgnoreCase(DataplexConstants.BIGQUERY_DATASET_ASSET_TYPE)) {
emitter.emit(new KeyValue<>(input, NullWritable.get()));
} else {
emitter.emit(new KeyValue<>(NullWritable.get(), input));
}
}
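/**
* Finalizes the run. For storage bucket assets this emits metrics and, on success with Dataplex
* metadata updates enabled, creates or updates the Dataplex entity. For BigQuery dataset assets
* this deletes the temporary GCS directory and emits metrics.
*/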
@Override
public void onRunFinish(boolean succeeded, BatchSinkContext context) {
if (this.config.getAssetType().equalsIgnoreCase(DataplexConstants.STORAGE_BUCKET_ASSET_TYPE)) {
emitMetricsForStorageBucket(succeeded, context);
// Create entity when pipeline run has succeeded to enable manual discovery in dataplex.
// CreateEntity API only allows CRUD operations for GCS assets
if (succeeded && config.isUpdateDataplexMetadata()) {
FailureCollector collector = context.getFailureCollector();
GoogleCredentials googleCredentials = config.validateAndGetServiceAccountCredentials(collector);
Schema schema = config.getSchema(collector);
if (schema == null) {
schema = context.getInputSchema();
}
String bucketName = "";
try {
bucketName = asset.getResourceSpec().getName();
} catch (StorageException e) {
throw new RuntimeException(
"Unable to read bucket name. See error details for more information ", e);
}
try (
DataplexServiceClient dataplexServiceClient =
DataplexUtil.getDataplexServiceClient(googleCredentials)
) {
String assetFullPath = DataplexConstants.STORAGE_BUCKET_PATH_PREFIX + bucketName +
"/" + config.getTable();
configureDataplexMetadataUpdate(googleCredentials, assetFullPath,
StorageSystem.CLOUD_STORAGE, schema);
} catch (ApiException | IOException e) {
throw new RuntimeException(
String.format("Unable to create entity for bucket %s. ", bucketName)
+ "See error details for more information.", e);
}
}
return;
}
Path gcsPath = new Path(DataplexConstants.STORAGE_BUCKET_PATH_PREFIX + runUUID);
try {
FileSystem fs = gcsPath.getFileSystem(baseConfiguration);
if (fs.exists(gcsPath)) {
fs.delete(gcsPath, true);
LOG.debug("Deleted temporary directory '{}'", gcsPath);
}
emitMetricsForBigQueryDataset(succeeded, context);
} catch (IOException e) {
LOG.warn("Failed to delete temporary directory '{}': {}", gcsPath, e.getMessage());
} catch (Exception exception) {
LOG.warn("Exception while trying to emit metric. No metric will be emitted for the number of affected rows.",
exception);
}
}
/**
* Prepares the run for a BigQuery dataset asset: creates the necessary resources and sets up the
* Hadoop configuration for the BigQuery output.
*
* @param context batch sink context
* @throws Exception if resources cannot be created or the configuration fails
*/
private void prepareRunBigQueryDataset(BatchSinkContext context) throws Exception {
FailureCollector collector = context.getFailureCollector();
Credentials credentials = config.getCredentials(collector);
String project = config.getProject();
String cmekKey = context.getArguments().get(CmekUtils.CMEK_KEY);
CryptoKeyName cmekKeyName = null;
if (!Strings.isNullOrEmpty(cmekKey)) {
cmekKeyName = CryptoKeyName.parse(cmekKey);
}
baseConfiguration = getBaseConfiguration(cmekKeyName);
// asset.getResourceSpec().getName() will be of format 'projects/datasetProjectName/datasets/datasetName'
String[] assetValues = asset.getResourceSpec().getName().split("/");
String datasetName = assetValues[assetValues.length - 1];
String datasetProject = assetValues[assetValues.length - 3];
bigQuery = GCPUtils.getBigQuery(datasetProject, credentials);
// Get required dataset ID and dataset instance (if it exists)
DatasetId datasetId = DatasetId.of(datasetProject, datasetName);
Dataset dataset = bigQuery.getDataset(datasetId);
String bucket = BigQueryUtil.getStagingBucketName(context.getArguments().asMap(), config.getLocation(),
dataset, null);
String fallbackBucketName = "dataplex-" + runUUID;
bucket = BigQuerySinkUtils.configureBucket(baseConfiguration, bucket, fallbackBucketName);
if (!context.isPreviewEnabled()) {
BigQuerySinkUtils.createResources(bigQuery, GCPUtils.getStorage(project, credentials),
DatasetId.of(datasetProject, datasetName),
bucket, config.getLocation(), cmekKeyName);
}
Schema configSchema = config.getSchema(collector);
Schema outputSchema = configSchema == null ? context.getInputSchema() : configSchema;
configureTable(outputSchema, datasetName, datasetProject, collector);
configureBigQuerySink();
initOutput(context, bigQuery,
config.getReferenceName(BigQueryUtil.getFQN(datasetProject, datasetName, config.getTable())),
config.getTable(), outputSchema, bucket, collector, datasetName, datasetProject);
}
/**
* Sets additional configuration on the AbstractBigQuerySink's Hadoop configuration.
*/
private void configureBigQuerySink() {
baseConfiguration.set(BigQueryConstants.CONFIG_JOB_ID, runUUID.toString());
if (config.getPartitionByField() != null) {
baseConfiguration.set(BigQueryConstants.CONFIG_PARTITION_BY_FIELD, config.getPartitionByField());
}
baseConfiguration.setBoolean(BigQueryConstants.CONFIG_REQUIRE_PARTITION_FILTER,
config.isRequirePartitionField());
if (config.getClusteringOrder() != null) {
baseConfiguration.set(BigQueryConstants.CONFIG_CLUSTERING_ORDER, config.getClusteringOrder());
}
baseConfiguration.set(BigQueryConstants.CONFIG_OPERATION, config.getOperation().name());
if (config.getTableKey() != null) {
baseConfiguration.set(BigQueryConstants.CONFIG_TABLE_KEY, config.getTableKey());
}
if (config.getDedupeBy() != null) {
baseConfiguration.set(BigQueryConstants.CONFIG_DEDUPE_BY, config.getDedupeBy());
}
if (config.getPartitionFilter() != null) {
baseConfiguration.set(BigQueryConstants.CONFIG_PARTITION_FILTER, config.getPartitionFilter());
}
PartitionType partitioningType = config.getPartitioningType();
baseConfiguration.setEnum(BigQueryConstants.CONFIG_PARTITION_TYPE, partitioningType);
if (config.getRangeStart() != null) {
baseConfiguration.setLong(BigQueryConstants.CONFIG_PARTITION_INTEGER_RANGE_START, config.getRangeStart());
}
if (config.getRangeEnd() != null) {
baseConfiguration.setLong(BigQueryConstants.CONFIG_PARTITION_INTEGER_RANGE_END, config.getRangeEnd());
}
if (config.getRangeInterval() != null) {
baseConfiguration.setLong(BigQueryConstants.CONFIG_PARTITION_INTEGER_RANGE_INTERVAL, config.getRangeInterval());
}
}
/**
* Sets the output table for the AbstractBigQuerySink's Hadoop configuration
*/
private void configureTable(Schema schema, String dataset, String datasetProject, FailureCollector collector) {
Table table = BigQueryUtil.getBigQueryTable(datasetProject, dataset,
config.getTable(),
config.getServiceAccount(),
config.isServiceAccountFilePath(),
collector);
baseConfiguration.setBoolean(BigQueryConstants.CONFIG_DESTINATION_TABLE_EXISTS, table != null);
List<String> tableFieldsNames = null;
if (table != null) {
tableFieldsNames = Objects.requireNonNull(table.getDefinition().getSchema()).getFields().stream()
.map(Field::getName).collect(Collectors.toList());
} else if (schema != null) {
tableFieldsNames = schema.getFields().stream()
.map(Schema.Field::getName).collect(Collectors.toList());
}
if (tableFieldsNames != null) {
baseConfiguration.set(BigQueryConstants.CONFIG_TABLE_FIELDS, String.join(",", tableFieldsNames));
}
}
/**
* Initializes the base configuration needed to load data into a BigQuery table.
*
* @return base configuration
*/
private Configuration getBaseConfiguration(@Nullable CryptoKeyName cmekKey) throws IOException {
Configuration baseConfiguration = BigQueryUtil.getBigQueryConfig(config.getServiceAccount(), config.getProject(),
cmekKey, config.getServiceAccountType());
baseConfiguration.setBoolean(BigQueryConstants.CONFIG_ALLOW_SCHEMA_RELAXATION,
config.isUpdateTableSchema());
baseConfiguration.setStrings(BigQueryConfiguration.OUTPUT_TABLE_WRITE_DISPOSITION.getKey(),
config.getWriteDisposition().name());
// This setting is needed because GCS has a default chunk size of 64MB, which is large enough to
// cause OOM issues when many tables are being written. See CDAP-16670.
String gcsChunkSize = "8388608";
baseConfiguration.set("fs.gs.outputstream.upload.chunk.size", gcsChunkSize);
return baseConfiguration;
}
/**
* Initializes output along with lineage recording for given table and its schema.
*
* @param context batch sink context
* @param bigQuery BigQuery client for the configured project
* @param outputName output name
* @param tableName table name
* @param tableSchema table schema
* @param bucket bucket name
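* @param collector failure collector
* @param dataset dataset name
* @param datasetProject project that owns the dataset
* @throws IOException if the output cannot be configured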
*/
protected void initOutput(BatchSinkContext context, BigQuery bigQuery, String outputName, String tableName,
@Nullable Schema tableSchema, String bucket,
FailureCollector collector, String dataset, String datasetProject) throws IOException {
LOG.debug("Init output for table '{}' with schema: {}", tableName, tableSchema);
List<BigQueryTableFieldSchema> fields = BigQuerySinkUtils.getBigQueryTableFields(bigQuery, tableName, tableSchema,
this.config.isUpdateTableSchema(), datasetProject, dataset, this.config.isTruncateTable(), collector);
Configuration configuration = new Configuration(baseConfiguration);
// Build GCS storage path for this bucket output.
DatasetId datasetId = DatasetId.of(datasetProject, dataset);
String temporaryGcsPath = BigQuerySinkUtils.getTemporaryGcsPath(bucket, runUUID.toString(), tableName);
BigQuerySinkUtils.configureOutput(configuration, datasetId, tableName, temporaryGcsPath, fields);
// Both emitLineage and setOutputFormat internally try to create an external dataset if it does not already exist.
// We call emitLineage before since it creates the dataset with schema which is used.
List<String> fieldNames = fields.stream()
.map(BigQueryTableFieldSchema::getName)
.collect(Collectors.toList());
String fqn = BigQueryUtil.getFQN(datasetProject, dataset, config.getTable());
String location = bigQuery.getDataset(datasetId).getLocation();
io.cdap.plugin.common.Asset lineageAsset = io.cdap.plugin.common.Asset.builder(
config.getReferenceName(fqn))
.setFqn(fqn).setLocation(location).build();
BigQuerySinkUtils.recordLineage(context, lineageAsset, tableSchema, fieldNames, null);
configuration.set(DataplexOutputFormatProvider.DATAPLEX_ASSET_TYPE, DataplexConstants.BIGQUERY_DATASET_ASSET_TYPE);
context.addOutput(Output.of(outputName, new DataplexOutputFormatProvider(configuration, tableSchema, null)));
}
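/**
* Emits the records.updated metric for a BigQuery dataset asset by looking up the BigQuery job
* for this run and reading the number of affected rows from its statistics.
*/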
void emitMetricsForBigQueryDataset(boolean succeeded, BatchSinkContext context) {
if (!succeeded) {
return;
}
Job queryJob = bigQuery.getJob(getJobId());
if (queryJob == null) {
LOG.warn("Unable to find BigQuery job. No metric will be emitted for the number of affected rows.");
return;
}
long totalRows = getTotalRows(queryJob);
LOG.info("Job {} affected {} rows", queryJob.getJobId(), totalRows);
//work around since StageMetrics count() only takes int as of now
int cap = 10000; // so the loop will not cause significant delays
long count = totalRows / Integer.MAX_VALUE;
if (count > cap) {
LOG.warn("Total record count is too high! Metric for the number of affected rows may not be updated correctly");
}
count = count < cap ? count : cap;
for (int i = 0; i <= count && totalRows > 0; i++) {
int rowCount = totalRows < Integer.MAX_VALUE ? (int) totalRows : Integer.MAX_VALUE;
context.getMetrics().count(AbstractBigQuerySink.RECORDS_UPDATED_METRIC, rowCount);
totalRows -= rowCount;
}
}
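// Job ID for this run, derived from the run UUID and the configured location.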
private JobId getJobId() {
return JobId.newBuilder().setLocation(config.getLocation()).setJob(runUUID.toString()).build();
}
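// Returns the number of output/affected rows for a LOAD or QUERY job; 0 if the job type is unknown.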
private long getTotalRows(Job queryJob) {
JobConfiguration.Type type = queryJob.getConfiguration().getType();
if (type == JobConfiguration.Type.LOAD) {
return ((JobStatistics.LoadStatistics) queryJob.getStatistics()).getOutputRows();
} else if (type == JobConfiguration.Type.QUERY) {
return ((JobStatistics.QueryStatistics) queryJob.getStatistics()).getNumDmlAffectedRows();
}
LOG.warn("Unable to identify BigQuery job type. No metric will be emitted for the number of affected rows.");
return 0;
}
/**
* Performs the prepareRun tasks for a storage bucket asset: validates the output format, creates
* the bucket if it does not exist, records lineage, and registers the output.
*
* @param context batch sink context
* @throws Exception if the bucket cannot be accessed or the output format is invalid
*/
private void prepareRunStorageBucket(BatchSinkContext context) throws Exception {
ValidatingOutputFormat validatingOutputFormat = validateOutputFormatForRun(context);
FailureCollector collector = context.getFailureCollector();
String cmekKey = context.getArguments().get(CmekUtils.CMEK_KEY);
CryptoKeyName cmekKeyName = null;
if (!Strings.isNullOrEmpty(cmekKey)) {
cmekKeyName = CryptoKeyName.parse(cmekKey);
}
Credentials credentials = config.getCredentials(collector);
Storage storage = GCPUtils.getStorage(config.getProject(), credentials);
Bucket bucket;
String bucketName = "";
try {
bucketName = asset.getResourceSpec().getName();
bucket = storage.get(bucketName);
} catch (StorageException e) {
throw new RuntimeException(
String.format("Unable to access or create bucket %s. ", bucketName)
+ "Ensure you entered the correct bucket path and have permissions for it.", e);
}
if (bucket == null) {
GCPUtils.createBucket(storage, bucketName, config.getLocation(), cmekKeyName);
}
String outputDir = getOutputDir(context.getLogicalStartTime());
Map<String, String> outputProperties = getStorageBucketOutputProperties(validatingOutputFormat, outputDir);
// record field level lineage information
// needs to happen before context.addOutput(), otherwise an external dataset without schema will be created.
Schema schema = config.getSchema(collector);
if (schema == null) {
schema = context.getInputSchema();
}
io.cdap.plugin.common.Asset asset = io.cdap.plugin.common.Asset.builder(
config.getReferenceName(outputDir))
.setFqn(outputDir).setLocation(config.getLocation()).build();
LineageRecorder lineageRecorder = new LineageRecorder(context, asset);
lineageRecorder.createExternalDataset(schema);
if (schema != null && schema.getFields() != null && !schema.getFields().isEmpty()) {
recordLineage(lineageRecorder,
schema.getFields().stream().map(Schema.Field::getName).collect(Collectors.toList()));
}
context.addOutput(Output.of(config.getReferenceName(outputDir),
new SinkOutputFormatProvider(validatingOutputFormat.getOutputFormatClassName(), outputProperties)));
}
/**
* Validates the output format for the run and returns the {@link ValidatingOutputFormat}.
*
* @param context batch sink context
* @throws Exception if the output format configuration is invalid
*/
private ValidatingOutputFormat validateOutputFormatForRun(BatchSinkContext context) throws Exception {
FailureCollector collector = context.getFailureCollector();
String format = config.getFormat().toString().toLowerCase(Locale.ROOT);
ValidatingOutputFormat validatingOutputFormat = getOutputFormatForRun(context);
FormatContext formatContext = new FormatContext(collector, context.getInputSchema());
config.validateOutputFormatProvider(formatContext, format, validatingOutputFormat);
collector.getOrThrowException();
return validatingOutputFormat;
}
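/**
* Builds the GCS file system properties for the output path, including the content type derived
* from the configured format.
*/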
protected Map<String, String> getFileSystemProperties() {
Map<String, String> properties = GCPUtils.getFileSystemProperties(config.getConnection(),
outputPath, new HashMap<>());
properties.put(GCSBatchSink.CONTENT_TYPE, config.getContentType(config.getFormat().toString()));
return properties;
}
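/**
* Builds the Hadoop output properties for a storage bucket asset: the output directory, the
* Dataplex asset type, file system properties, and format-specific settings.
*/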
protected Map<String, String> getStorageBucketOutputProperties(ValidatingOutputFormat validatingOutputFormat,
String outputDir) {
Map<String, String> outputProperties = new HashMap<>(validatingOutputFormat.getOutputFormatConfiguration());
outputProperties.put(FileOutputFormat.OUTDIR, outputDir);
outputProperties.put(DataplexOutputFormatProvider.DATAPLEX_OUTPUT_BASE_DIR, outputDir);
outputProperties.put(DataplexOutputFormatProvider.DATAPLEX_ASSET_TYPE, config.getAssetType());
outputProperties.putAll(getFileSystemProperties());
// Added to not create _SUCCESS file in the output directory.
outputProperties.put("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false");
// Added to not write metadata files for parquet format in the output directory.
if (config.getFormat().equals(FileFormat.PARQUET)) {
outputProperties.put("parquet.enable.summary-metadata", "false");
}
return outputProperties;
}
/**
* Instantiates and returns the {@link ValidatingOutputFormat} for the file format selected by the
* user, wrapped in a {@link DataplexOutputFormatProvider}.
*
* @param context batch sink context
* @return the validating output format for the configured file format
* @throws InstantiationException if the format plugin cannot be instantiated
*/
protected ValidatingOutputFormat getOutputFormatForRun(BatchSinkContext context) throws InstantiationException {
String fileFormat = config.getFormat().toString().toLowerCase();
try {
ValidatingOutputFormat validatingOutputFormat = context.newPluginInstance(fileFormat);
return new DataplexOutputFormatProvider(null, null, validatingOutputFormat);
} catch (InvalidPluginConfigException e) {
Set<String> properties = new HashSet<>(e.getMissingProperties());
for (InvalidPluginProperty invalidProperty : e.getInvalidProperties()) {
properties.add(invalidProperty.getName());
}
String errorMessage = String.format("Format '%s' cannot be used because properties %s were not provided or " +
"were invalid when the pipeline was deployed. Set the format to a " +
"different value, or re-create the pipeline with all required properties.",
fileFormat, properties);
throw new IllegalArgumentException(errorMessage, e);
}
}
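/**
* Records field-level write lineage for the given output fields.
*/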
protected void recordLineage(LineageRecorder lineageRecorder, List<String> outputFields) {
lineageRecorder.recordWrite("Write", "Wrote to Google Cloud Storage.", outputFields);
}
/**
* Returns the output directory path in the format gs://bucket/table/partition-key=timestamp/,
* built from the asset bucket, the configured table name, and the logical start time.
*
* @param logicalStartTime logical start time of the pipeline run
* @return the output directory path
*/
protected String getOutputDir(long logicalStartTime) {
String suffix = config.getSuffix();
String defaultTimestampFormat = "yyyy-MM-dd-HH-mm";
String tableName = config.getTable();
suffix = Strings.isNullOrEmpty(suffix) ? defaultTimestampFormat : suffix;
String timeSuffix = String.format("%s=%s",
DataplexConstants.STORAGE_BUCKET_PARTITION_KEY,
new SimpleDateFormat(suffix).format(logicalStartTime)
);
String configPath = GCSPath.SCHEME + asset.getResourceSpec().getName();
String finalPath = String.format("%s/%s/%s/", configPath, tableName, timeSuffix);
this.outputPath = finalPath;
return finalPath;
}
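/**
* Emits the records.updated metric for a storage bucket asset by reading record counts from the
* metadata of the blobs written under the output path.
*/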
private void emitMetricsForStorageBucket(boolean succeeded, BatchSinkContext context) {
if (!succeeded) {
return;
}
try {
StorageClient storageClient = StorageClient.create(config.getProject(), config.getServiceAccount(),
config.isServiceAccountFilePath(), null);
storageClient.mapMetaDataForAllBlobs(outputPath,
new MetricsEmitter(context.getMetrics())::emitMetrics);
} catch (Exception e) {
LOG.warn("Metrics for the number of affected rows in GCS Sink may be incorrect.", e);
}
}
/**
* Prepares the Dataplex metadata update when 'Update Dataplex Metadata' is enabled: fails if the
* schema contains the reserved partition key column, and fails if an existing entity's schema is
* not user-managed.
*
* @param collector failure collector
* @param schema    output schema
* @throws IOException if the metadata service client cannot be created
*/
private void prepareDataplexMetadataUpdate(FailureCollector collector, Schema schema)
throws IOException {
Optional<Schema.Field> partitionKey = Objects.requireNonNull(schema.getFields()).stream()
.filter(avroField -> avroField.getName().equals(DataplexConstants.STORAGE_BUCKET_PARTITION_KEY)).findAny();
if (partitionKey.isPresent()) {
collector.addFailure(
String.format("Field '%s' is used by dataplex sink to create time partitioned layout on GCS." +
" To avoid conflict, presence of a column with the name '%s' on the input schema is not allowed.",
DataplexConstants.STORAGE_BUCKET_PARTITION_KEY, DataplexConstants.STORAGE_BUCKET_PARTITION_KEY),
String.format(
"Remove '%s' field from the output schema or rename the '%s' field in the input schema by adding" +
" a transform step.",
DataplexConstants.STORAGE_BUCKET_PARTITION_KEY, DataplexConstants.STORAGE_BUCKET_PARTITION_KEY
)
);
}
String entityID = config.getTable().replaceAll("[^a-zA-Z0-9_]", "_");
try (MetadataServiceClient metadataServiceClient =
DataplexUtil.getMetadataServiceClient(config.getCredentials(collector))) {
entityBean =
metadataServiceClient.getEntity(GetEntityRequest.newBuilder().setName(EntityName.of(
config.tryGetProject(), config.getLocation(), config.getLake(), config.getZone(), entityID).toString())
.setView(GetEntityRequest.EntityView.FULL)
.build());
} catch (ApiException e) {
int statusCode = e.getStatusCode().getCode().getHttpStatusCode();
if (statusCode != 404) {
collector.addFailure("Unable to fetch entity information.", null);
}
}
if (entityBean != null && !entityBean.getSchema().getUserManaged()) {
collector.addFailure("Entity already exists, but the schema is not user-managed.", null);
}
}
/**
* Creates or updates the Dataplex entity for the written data when 'Update Dataplex Metadata' is
* enabled, and adds partition information for it.
*
* @param credentials   Google credentials
* @param assetFullPath full GCS path of the written data
* @param storageSystem storage system of the entity
* @param schema        output schema
* @throws IOException if the metadata service client cannot be created
*/
private void configureDataplexMetadataUpdate(
GoogleCredentials credentials, String assetFullPath, StorageSystem storageSystem, Schema schema
) throws IOException {
String entityID = config.getTable().replaceAll("[^a-zA-Z0-9_]", "_");
try (MetadataServiceClient metadataServiceClient =
DataplexUtil.getMetadataServiceClient(credentials)) {
com.google.cloud.dataplex.v1.Schema dataplexSchema = DataplexUtil.getDataplexSchema(schema);
Entity.Builder entityBuilder = Entity.newBuilder()
.setId(entityID)
.setAsset(config.getAsset())
.setDataPath(assetFullPath)
.setType(Entity.Type.TABLE)
.setSystem(storageSystem)
.setSchema(dataplexSchema)
.setFormat(StorageFormat
.newBuilder()
.setMimeType(DataplexUtil.getStorageFormatForEntity(config.getFormatStr()))
.build()
);
if (entityBean != null) {
try {
entityBean = metadataServiceClient.updateEntity(
UpdateEntityRequest.newBuilder()
.setEntity(entityBuilder
.setName(entityBean.getName())
.setEtag(entityBean.getEtag())
.build()
)
.build()
);
} catch (ApiException e) {
throw new RuntimeException(
String.format("%s: %s", "There was a problem updating the entity for metadata updates.", e.getMessage()));
}
} else {
try {
String entityParent = "projects/" + config.tryGetProject() +
"/locations/" + config.getLocation() +
"/lakes/" + config.getLake() +
"/zones/" + config.getZone();
entityBean = metadataServiceClient.createEntity(
CreateEntityRequest.newBuilder()
.setParent(entityParent)
.setEntity(entityBuilder.build())
.build()
);
} catch (ApiException e) {
throw new RuntimeException(
String.format("%s: %s", "There was a problem creating the entity for metadata updates.", e.getMessage()));
}
}
try {
DataplexUtil.addPartitionInfo(entityBean, credentials,
asset.getResourceSpec().getName(), config.getTable(), config.getProject());
} catch (ApiException e) {
// extract last 14 chars of the error message to make sure it's "already exists" and safe to ignore
String errMessage = e.getMessage().substring(e.getMessage().length() - 14);
if (!errMessage.equals("already exists")) {
throw new RuntimeException(String.format("Unable to add partition information for %s. ", entityID), e);
}
}
}
}
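/**
* Emits the records.updated metric from the record-count metadata attached to written blobs.
*/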
private static class MetricsEmitter {
private final StageMetrics stageMetrics;
private MetricsEmitter(StageMetrics stageMetrics) {
this.stageMetrics = stageMetrics;
}
public void emitMetrics(Map<String, String> metaData) {
long totalRows = extractRecordCount(metaData);
if (totalRows == 0) {
return;
}
// work around since StageMetrics count() only takes int as of now
int cap = 10000; // so the loop will not cause significant delays
long count = totalRows / Integer.MAX_VALUE;
if (count > cap) {
LOG.warn("Total record count is too high! Metric for the number of affected rows may not be updated correctly");
}
count = count < cap ? count : cap;
for (int i = 0; i <= count && totalRows > 0; i++) {
int rowCount = totalRows < Integer.MAX_VALUE ? (int) totalRows : Integer.MAX_VALUE;
stageMetrics.count(RECORDS_UPDATED_METRIC, rowCount);
totalRows -= rowCount;
}
}
private long extractRecordCount(Map<String, String> metadata) {
String value = metadata.get(GCSBatchSink.RECORD_COUNT);
return value == null ? 0L : Long.parseLong(value);
}
}
}