All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.cdap.plugin.gcp.dataplex.sink.DataplexBatchSink Maven / Gradle / Ivy

/*
 * Copyright © 2022 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package io.cdap.plugin.gcp.dataplex.sink;

import com.google.api.gax.rpc.ApiException;
import com.google.auth.Credentials;
import com.google.auth.oauth2.GoogleCredentials;
import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.Dataset;
import com.google.cloud.bigquery.DatasetId;
import com.google.cloud.bigquery.Field;
import com.google.cloud.bigquery.Job;
import com.google.cloud.bigquery.JobConfiguration;
import com.google.cloud.bigquery.JobId;
import com.google.cloud.bigquery.JobStatistics;
import com.google.cloud.bigquery.Table;
import com.google.cloud.dataplex.v1.Asset;
import com.google.cloud.dataplex.v1.AssetName;
import com.google.cloud.dataplex.v1.CreateEntityRequest;
import com.google.cloud.dataplex.v1.DataplexServiceClient;
import com.google.cloud.dataplex.v1.Entity;
import com.google.cloud.dataplex.v1.EntityName;
import com.google.cloud.dataplex.v1.GetEntityRequest;
import com.google.cloud.dataplex.v1.MetadataServiceClient;
import com.google.cloud.dataplex.v1.StorageFormat;
import com.google.cloud.dataplex.v1.StorageSystem;
import com.google.cloud.dataplex.v1.UpdateEntityRequest;
import com.google.cloud.hadoop.io.bigquery.BigQueryConfiguration;
import com.google.cloud.kms.v1.CryptoKeyName;
import com.google.cloud.storage.Bucket;
import com.google.cloud.storage.Storage;
import com.google.cloud.storage.StorageException;
import com.google.common.base.Strings;
import io.cdap.cdap.api.annotation.Description;
import io.cdap.cdap.api.annotation.Name;
import io.cdap.cdap.api.annotation.Plugin;
import io.cdap.cdap.api.data.batch.Output;
import io.cdap.cdap.api.data.format.StructuredRecord;
import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.cdap.api.dataset.lib.KeyValue;
import io.cdap.cdap.api.plugin.InvalidPluginConfigException;
import io.cdap.cdap.api.plugin.InvalidPluginProperty;
import io.cdap.cdap.etl.api.Emitter;
import io.cdap.cdap.etl.api.FailureCollector;
import io.cdap.cdap.etl.api.PipelineConfigurer;
import io.cdap.cdap.etl.api.StageConfigurer;
import io.cdap.cdap.etl.api.StageMetrics;
import io.cdap.cdap.etl.api.batch.BatchSink;
import io.cdap.cdap.etl.api.batch.BatchSinkContext;
import io.cdap.cdap.etl.api.validation.FormatContext;
import io.cdap.cdap.etl.api.validation.ValidatingOutputFormat;
import io.cdap.plugin.common.LineageRecorder;
import io.cdap.plugin.common.batch.sink.SinkOutputFormatProvider;
import io.cdap.plugin.format.FileFormat;
import io.cdap.plugin.gcp.bigquery.sink.AbstractBigQuerySink;
import io.cdap.plugin.gcp.bigquery.sink.BigQuerySinkUtils;
import io.cdap.plugin.gcp.bigquery.sink.PartitionType;
import io.cdap.plugin.gcp.bigquery.sink.lib.BigQueryTableFieldSchema;
import io.cdap.plugin.gcp.bigquery.util.BigQueryConstants;
import io.cdap.plugin.gcp.bigquery.util.BigQueryUtil;
import io.cdap.plugin.gcp.common.CmekUtils;
import io.cdap.plugin.gcp.common.GCPUtils;
import io.cdap.plugin.gcp.dataplex.common.util.DataplexConstants;
import io.cdap.plugin.gcp.dataplex.common.util.DataplexUtil;
import io.cdap.plugin.gcp.dataplex.sink.config.DataplexBatchSinkConfig;
import io.cdap.plugin.gcp.gcs.GCSPath;
import io.cdap.plugin.gcp.gcs.StorageClient;
import io.cdap.plugin.gcp.gcs.sink.GCSBatchSink;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.UUID;
import java.util.stream.Collectors;
import javax.annotation.Nullable;

/**
 * Batch Sink that writes data to Dataplex assets (BigQuery or Cloud Storage).
 * 

* {@code StructuredRecord} is the first parameter because that is what the * sink will take as an input. * Object is the second parameter because that is the key used * by Hadoop's {@code TextOutputFormat}. * {@code Object} is the third parameter because that is the value used by * Hadoop's {@code TextOutputFormat}. All the plugins included with Hydrator operate on * StructuredRecord. */ @Plugin(type = BatchSink.PLUGIN_TYPE) @Name(DataplexBatchSink.NAME) @Description("Ingests and processes data within Dataplex.") public final class DataplexBatchSink extends BatchSink { public static final String NAME = "Dataplex"; private static final Logger LOG = LoggerFactory.getLogger(DataplexBatchSink.class); private static final String RECORDS_UPDATED_METRIC = "records.updated"; // Usually, you will need a private variable to store the config that was passed to your class private final DataplexBatchSinkConfig config; // UUID for the run. Will be used as bucket name for BigQuery assets. // UUID is used since GCS bucket names must be globally unique. private final UUID runUUID = UUID.randomUUID(); protected Configuration baseConfiguration; protected BigQuery bigQuery; private String outputPath; private Asset asset; private Entity entityBean = null; public DataplexBatchSink(DataplexBatchSinkConfig config) { this.config = config; } @Override public void configurePipeline(PipelineConfigurer pipelineConfigurer) { super.configurePipeline(pipelineConfigurer); StageConfigurer configurer = pipelineConfigurer.getStageConfigurer(); FailureCollector collector = configurer.getFailureCollector(); GoogleCredentials credentials = config.validateAndGetServiceAccountCredentials(collector); try (DataplexServiceClient dataplexServiceClient = DataplexUtil.getDataplexServiceClient(credentials)) { if (!config.getConnection().canConnect() || config.getServiceAccountType() == null || (config.isServiceAccountFilePath() && config.autoServiceAccountUnavailable()) || (config.tryGetProject() == null)) { return; } Schema inputSchema = configurer.getInputSchema(); Schema configuredSchema = config.getSchema(collector); config.validateAssetConfiguration(collector, dataplexServiceClient); if (config.getAssetType().equals(DataplexConstants.BIGQUERY_DATASET_ASSET_TYPE)) { config.validateBigQueryDataset(inputSchema, configuredSchema, collector, dataplexServiceClient); return; } if (config.getAssetType().equals(DataplexConstants.STORAGE_BUCKET_ASSET_TYPE)) { config.validateStorageBucket(collector); config.validateFormatForStorageBucket(pipelineConfigurer, collector); if (config.isUpdateDataplexMetadata()) { prepareDataplexMetadataUpdate(collector, configuredSchema); } return; } } catch (IOException e) { collector.addFailure(e.getMessage(), null); } } @Override public void prepareRun(BatchSinkContext context) throws Exception { FailureCollector collector = context.getFailureCollector(); GoogleCredentials credentials = config.validateAndGetServiceAccountCredentials(collector); try (DataplexServiceClient dataplexServiceClient = DataplexUtil.getDataplexServiceClient(credentials)) { config.validateAssetConfiguration(collector, dataplexServiceClient); asset = dataplexServiceClient.getAsset(AssetName.newBuilder().setProject(config.tryGetProject()) .setLocation(config.getLocation()) .setLake(config.getLake()).setZone(config.getZone()).setAsset(config.getAsset()).build()); if (config.getAssetType().equals(DataplexConstants.BIGQUERY_DATASET_ASSET_TYPE)) { config.validateBigQueryDataset(context.getInputSchema(), context.getOutputSchema(), collector, dataplexServiceClient); prepareRunBigQueryDataset(context); } if (config.getAssetType().equals(DataplexConstants.STORAGE_BUCKET_ASSET_TYPE)) { config.validateStorageBucket(collector); if (config.isUpdateDataplexMetadata()) { prepareDataplexMetadataUpdate(collector, config.getSchema(collector)); } prepareRunStorageBucket(context); } } } /** * O/P Template parameters will get changed based on asset type. E.g. for BQ Dataset * * @param input * @param emitter */ @Override public void transform(StructuredRecord input, Emitter> emitter) { if (this.config.getAssetType().equalsIgnoreCase(DataplexConstants.BIGQUERY_DATASET_ASSET_TYPE)) { emitter.emit(new KeyValue<>(input, NullWritable.get())); } else { emitter.emit(new KeyValue<>(NullWritable.get(), input)); } } @Override public void onRunFinish(boolean succeeded, BatchSinkContext context) { if (this.config.getAssetType().equalsIgnoreCase(DataplexConstants.STORAGE_BUCKET_ASSET_TYPE)) { emitMetricsForStorageBucket(succeeded, context); // Create entity when pipeline run has succeeded to enable manual discovery in dataplex. // CreateEntity API only allows CRUD operations for GCS assets if (succeeded && config.isUpdateDataplexMetadata()) { FailureCollector collector = context.getFailureCollector(); GoogleCredentials googleCredentials = config.validateAndGetServiceAccountCredentials(collector); Schema schema = config.getSchema(collector); if (schema == null) { schema = context.getInputSchema(); } String bucketName = ""; try { bucketName = asset.getResourceSpec().getName(); } catch (StorageException e) { throw new RuntimeException( "Unable to read bucket name. See error details for more information ", e); } try ( DataplexServiceClient dataplexServiceClient = DataplexUtil.getDataplexServiceClient(googleCredentials) ) { String assetFullPath = DataplexConstants.STORAGE_BUCKET_PATH_PREFIX + bucketName + "/" + config.getTable(); configureDataplexMetadataUpdate(googleCredentials, assetFullPath, StorageSystem.CLOUD_STORAGE, schema); } catch (ApiException | IOException e) { throw new RuntimeException( String.format("Unable create entity for bucket %s. ", bucketName) + "See error details for more information.", e); } } return; } Path gcsPath = new Path(DataplexConstants.STORAGE_BUCKET_PATH_PREFIX + runUUID); try { FileSystem fs = gcsPath.getFileSystem(baseConfiguration); if (fs.exists(gcsPath)) { fs.delete(gcsPath, true); LOG.debug("Deleted temporary directory '{}'", gcsPath); } emitMetricsForBigQueryDataset(succeeded, context); } catch (IOException e) { LOG.warn("Failed to delete temporary directory '{}': {}", gcsPath, e.getMessage()); } catch (Exception exception) { LOG.warn("Exception while trying to emit metric. No metric will be emitted for the number of affected rows.", exception); } } /** * prepare Run for BigQuery Dataset Asset. It will create necessary resources, and configuration methods * * @param context * @throws Exception */ private void prepareRunBigQueryDataset(BatchSinkContext context) throws Exception { FailureCollector collector = context.getFailureCollector(); Credentials credentials = config.getCredentials(collector); String project = config.getProject(); String cmekKey = context.getArguments().get(CmekUtils.CMEK_KEY); CryptoKeyName cmekKeyName = null; if (!Strings.isNullOrEmpty(cmekKey)) { cmekKeyName = CryptoKeyName.parse(cmekKey); } baseConfiguration = getBaseConfiguration(cmekKeyName); // asset.getResourceSpec().getName() will be of format 'projects/datasetProjectName/datasets/datasetName' String[] assetValues = asset.getResourceSpec().getName().split("/"); String datasetName = assetValues[assetValues.length - 1]; String datasetProject = assetValues[assetValues.length - 3]; bigQuery = GCPUtils.getBigQuery(datasetProject, credentials); // Get required dataset ID and dataset instance (if it exists) DatasetId datasetId = DatasetId.of(datasetProject, datasetName); Dataset dataset = bigQuery.getDataset(datasetId); String bucket = BigQueryUtil.getStagingBucketName(context.getArguments().asMap(), config.getLocation(), dataset, null); String fallbackBucketName = "dataplex-" + runUUID; bucket = BigQuerySinkUtils.configureBucket(baseConfiguration, bucket, fallbackBucketName); if (!context.isPreviewEnabled()) { BigQuerySinkUtils.createResources(bigQuery, GCPUtils.getStorage(project, credentials), DatasetId.of(datasetProject, datasetName), bucket, config.getLocation(), cmekKeyName); } Schema configSchema = config.getSchema(collector); Schema outputSchema = configSchema == null ? context.getInputSchema() : configSchema; configureTable(outputSchema, datasetName, datasetProject, collector); configureBigQuerySink(); initOutput(context, bigQuery, config.getReferenceName(BigQueryUtil.getFQN(datasetProject, datasetName, config.getTable())), config.getTable(), outputSchema, bucket, collector, datasetName, datasetProject); } /** * Sets addition configuration for the AbstractBigQuerySink's Hadoop configuration */ private void configureBigQuerySink() { baseConfiguration.set(BigQueryConstants.CONFIG_JOB_ID, runUUID.toString()); if (config.getPartitionByField() != null) { baseConfiguration.set(BigQueryConstants.CONFIG_PARTITION_BY_FIELD, config.getPartitionByField()); } baseConfiguration.setBoolean(BigQueryConstants.CONFIG_REQUIRE_PARTITION_FILTER, config.isRequirePartitionField()); if (config.getClusteringOrder() != null) { baseConfiguration.set(BigQueryConstants.CONFIG_CLUSTERING_ORDER, config.getClusteringOrder()); } baseConfiguration.set(BigQueryConstants.CONFIG_OPERATION, config.getOperation().name()); if (config.getTableKey() != null) { baseConfiguration.set(BigQueryConstants.CONFIG_TABLE_KEY, config.getTableKey()); } if (config.getDedupeBy() != null) { baseConfiguration.set(BigQueryConstants.CONFIG_DEDUPE_BY, config.getDedupeBy()); } if (config.getPartitionFilter() != null) { baseConfiguration.set(BigQueryConstants.CONFIG_PARTITION_FILTER, config.getPartitionFilter()); } PartitionType partitioningType = config.getPartitioningType(); baseConfiguration.setEnum(BigQueryConstants.CONFIG_PARTITION_TYPE, partitioningType); if (config.getRangeStart() != null) { baseConfiguration.setLong(BigQueryConstants.CONFIG_PARTITION_INTEGER_RANGE_START, config.getRangeStart()); } if (config.getRangeEnd() != null) { baseConfiguration.setLong(BigQueryConstants.CONFIG_PARTITION_INTEGER_RANGE_END, config.getRangeEnd()); } if (config.getRangeInterval() != null) { baseConfiguration.setLong(BigQueryConstants.CONFIG_PARTITION_INTEGER_RANGE_INTERVAL, config.getRangeInterval()); } } /** * Sets the output table for the AbstractBigQuerySink's Hadoop configuration */ private void configureTable(Schema schema, String dataset, String datasetProject, FailureCollector collector) { Table table = BigQueryUtil.getBigQueryTable(datasetProject, dataset, config.getTable(), config.getServiceAccount(), config.isServiceAccountFilePath(), collector); baseConfiguration.setBoolean(BigQueryConstants.CONFIG_DESTINATION_TABLE_EXISTS, table != null); List tableFieldsNames = null; if (table != null) { tableFieldsNames = Objects.requireNonNull(table.getDefinition().getSchema()).getFields().stream() .map(Field::getName).collect(Collectors.toList()); } else if (schema != null) { tableFieldsNames = schema.getFields().stream() .map(Schema.Field::getName).collect(Collectors.toList()); } if (tableFieldsNames != null) { baseConfiguration.set(BigQueryConstants.CONFIG_TABLE_FIELDS, String.join(",", tableFieldsNames)); } } /** * Initialized base configuration needed to load data into BigQuery table. * * @return base configuration */ private Configuration getBaseConfiguration(@Nullable CryptoKeyName cmekKey) throws IOException { Configuration baseConfiguration = BigQueryUtil.getBigQueryConfig(config.getServiceAccount(), config.getProject(), cmekKey, config.getServiceAccountType()); baseConfiguration.setBoolean(BigQueryConstants.CONFIG_ALLOW_SCHEMA_RELAXATION, config.isUpdateTableSchema()); baseConfiguration.setStrings(BigQueryConfiguration.OUTPUT_TABLE_WRITE_DISPOSITION.getKey(), config.getWriteDisposition().name()); // this setting is needed because gcs has default chunk size of 64MB. This is large default chunk size which can // cause OOM issue if there are many tables being written. See this - CDAP-16670 String gcsChunkSize = "8388608"; baseConfiguration.set("fs.gs.outputstream.upload.chunk.size", gcsChunkSize); return baseConfiguration; } /** * Initializes output along with lineage recording for given table and its schema. * * @param context batch sink context * @param bigQuery big query client for the configured project * @param outputName output name * @param tableName table name * @param tableSchema table schema * @param bucket bucket name */ protected void initOutput(BatchSinkContext context, BigQuery bigQuery, String outputName, String tableName, @Nullable Schema tableSchema, String bucket, FailureCollector collector, String dataset, String datasetProject) throws IOException { LOG.debug("Init output for table '{}' with schema: {}", tableName, tableSchema); List fields = BigQuerySinkUtils.getBigQueryTableFields(bigQuery, tableName, tableSchema, this.config.isUpdateTableSchema(), datasetProject, dataset, this.config.isTruncateTable(), collector); Configuration configuration = new Configuration(baseConfiguration); // Build GCS storage path for this bucket output. DatasetId datasetId = DatasetId.of(datasetProject, dataset); String temporaryGcsPath = BigQuerySinkUtils.getTemporaryGcsPath(bucket, runUUID.toString(), tableName); BigQuerySinkUtils.configureOutput(configuration, datasetId, tableName, temporaryGcsPath, fields); // Both emitLineage and setOutputFormat internally try to create an external dataset if it does not already exist. // We call emitLineage before since it creates the dataset with schema which is used. List fieldNames = fields.stream() .map(BigQueryTableFieldSchema::getName) .collect(Collectors.toList()); String fqn = BigQueryUtil.getFQN(datasetProject, dataset, config.getTable()); String location = bigQuery.getDataset(datasetId).getLocation(); io.cdap.plugin.common.Asset lineageAsset = io.cdap.plugin.common.Asset.builder( config.getReferenceName(fqn)) .setFqn(fqn).setLocation(location).build(); BigQuerySinkUtils.recordLineage(context, lineageAsset, tableSchema, fieldNames, null); configuration.set(DataplexOutputFormatProvider.DATAPLEX_ASSET_TYPE, DataplexConstants.BIGQUERY_DATASET_ASSET_TYPE); context.addOutput(Output.of(outputName, new DataplexOutputFormatProvider(configuration, tableSchema, null))); } void emitMetricsForBigQueryDataset(boolean succeeded, BatchSinkContext context) { if (!succeeded) { return; } Job queryJob = bigQuery.getJob(getJobId()); if (queryJob == null) { LOG.warn("Unable to find BigQuery job. No metric will be emitted for the number of affected rows."); return; } long totalRows = getTotalRows(queryJob); LOG.info("Job {} affected {} rows", queryJob.getJobId(), totalRows); //work around since StageMetrics count() only takes int as of now int cap = 10000; // so the loop will not cause significant delays long count = totalRows / Integer.MAX_VALUE; if (count > cap) { LOG.warn("Total record count is too high! Metric for the number of affected rows may not be updated correctly"); } count = count < cap ? count : cap; for (int i = 0; i <= count && totalRows > 0; i++) { int rowCount = totalRows < Integer.MAX_VALUE ? (int) totalRows : Integer.MAX_VALUE; context.getMetrics().count(AbstractBigQuerySink.RECORDS_UPDATED_METRIC, rowCount); totalRows -= rowCount; } } private JobId getJobId() { return JobId.newBuilder().setLocation(config.getLocation()).setJob(runUUID.toString()).build(); } private long getTotalRows(Job queryJob) { JobConfiguration.Type type = queryJob.getConfiguration().getType(); if (type == JobConfiguration.Type.LOAD) { return ((JobStatistics.LoadStatistics) queryJob.getStatistics()).getOutputRows(); } else if (type == JobConfiguration.Type.QUERY) { return ((JobStatistics.QueryStatistics) queryJob.getStatistics()).getNumDmlAffectedRows(); } LOG.warn("Unable to identify BigQuery job type. No metric will be emitted for the number of affected rows."); return 0; } /** * This method will perform prepareRun tasks for Storage Bucket Asset. * * @param context * @throws Exception */ private void prepareRunStorageBucket(BatchSinkContext context) throws Exception { ValidatingOutputFormat validatingOutputFormat = validateOutputFormatForRun(context); FailureCollector collector = context.getFailureCollector(); String cmekKey = context.getArguments().get(CmekUtils.CMEK_KEY); CryptoKeyName cmekKeyName = null; if (!Strings.isNullOrEmpty(cmekKey)) { cmekKeyName = CryptoKeyName.parse(cmekKey); } Credentials credentials = config.getCredentials(collector); Storage storage = GCPUtils.getStorage(config.getProject(), credentials); Bucket bucket; String bucketName = ""; try { bucketName = asset.getResourceSpec().getName(); bucket = storage.get(bucketName); } catch (StorageException e) { throw new RuntimeException( String.format("Unable to access or create bucket %s. ", bucketName) + "Ensure you entered the correct bucket path and have permissions for it.", e); } if (bucket == null) { GCPUtils.createBucket(storage, bucketName, config.getLocation(), cmekKeyName); } String outputDir = getOutputDir(context.getLogicalStartTime()); Map outputProperties = getStorageBucketOutputProperties(validatingOutputFormat, outputDir); // record field level lineage information // needs to happen before context.addOutput(), otherwise an external dataset without schema will be created. Schema schema = config.getSchema(collector); if (schema == null) { schema = context.getInputSchema(); } io.cdap.plugin.common.Asset asset = io.cdap.plugin.common.Asset.builder( config.getReferenceName(outputDir)) .setFqn(outputDir).setLocation(config.getLocation()).build(); LineageRecorder lineageRecorder = new LineageRecorder(context, asset); lineageRecorder.createExternalDataset(schema); if (schema != null && schema.getFields() != null && !schema.getFields().isEmpty()) { recordLineage(lineageRecorder, schema.getFields().stream().map(Schema.Field::getName).collect(Collectors.toList())); } context.addOutput(Output.of(config.getReferenceName(outputDir), new SinkOutputFormatProvider(validatingOutputFormat.getOutputFormatClassName(), outputProperties))); } /** * Validates output format for run and return ValidatingOutputFormat * * @param context * @throws Exception */ private ValidatingOutputFormat validateOutputFormatForRun(BatchSinkContext context) throws Exception { FailureCollector collector = context.getFailureCollector(); String format = config.getFormat().toString().toLowerCase(Locale.ROOT); ValidatingOutputFormat validatingOutputFormat = getOutputFormatForRun(context); FormatContext formatContext = new FormatContext(collector, context.getInputSchema()); config.validateOutputFormatProvider(formatContext, format, validatingOutputFormat); collector.getOrThrowException(); return validatingOutputFormat; } protected Map getFileSystemProperties() { Map properties = GCPUtils.getFileSystemProperties(config.getConnection(), outputPath, new HashMap<>()); properties.put(GCSBatchSink.CONTENT_TYPE, config.getContentType(config.getFormat().toString())); return properties; } protected Map getStorageBucketOutputProperties(ValidatingOutputFormat validatingOutputFormat, String outputDir) { Map outputProperties = new HashMap<>(validatingOutputFormat.getOutputFormatConfiguration()); outputProperties.put(FileOutputFormat.OUTDIR, outputDir); outputProperties.put(DataplexOutputFormatProvider.DATAPLEX_OUTPUT_BASE_DIR, outputDir); outputProperties.put(DataplexOutputFormatProvider.DATAPLEX_ASSET_TYPE, config.getAssetType()); outputProperties.putAll(getFileSystemProperties()); // Added to not create _SUCCESS file in the output directory. outputProperties.put("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false"); // Added to not write metadata files for parquet format in the output directory. if (config.getFormat().equals(FileFormat.PARQUET)) { outputProperties.put("parquet.enable.summary-metadata", "false"); } return outputProperties; } /** * It will instantiate and return the ValidatingOutputFormat object based on file Format selected by user * * @param context * @return * @throws InstantiationException */ protected ValidatingOutputFormat getOutputFormatForRun(BatchSinkContext context) throws InstantiationException { String fileFormat = config.getFormat().toString().toLowerCase(); try { ValidatingOutputFormat validatingOutputFormat = context.newPluginInstance(fileFormat); return new DataplexOutputFormatProvider(null, null, validatingOutputFormat); } catch (InvalidPluginConfigException e) { Set properties = new HashSet<>(e.getMissingProperties()); for (InvalidPluginProperty invalidProperty : e.getInvalidProperties()) { properties.add(invalidProperty.getName()); } String errorMessage = String.format("Format '%s' cannot be used because properties %s were not provided or " + "were invalid when the pipeline was deployed. Set the format to a " + "different value, or re-create the pipeline with all required properties.", fileFormat, properties); throw new IllegalArgumentException(errorMessage, e); } } protected void recordLineage(LineageRecorder lineageRecorder, List outputFields) { lineageRecorder.recordWrite("Write", "Wrote to Google Cloud Storage.", outputFields); } /** * it will return the output directory path in format gs://bucket/folder * * @param logicalStartTime * @return */ protected String getOutputDir(long logicalStartTime) { String suffix = config.getSuffix(); String defaultTimestampFormat = "yyyy-MM-dd-HH-mm"; String tableName = config.getTable(); suffix = Strings.isNullOrEmpty(suffix) ? defaultTimestampFormat : suffix; String timeSuffix = String.format("%s=%s", DataplexConstants.STORAGE_BUCKET_PARTITION_KEY, new SimpleDateFormat(suffix).format(logicalStartTime) ); String configPath = GCSPath.SCHEME + asset.getResourceSpec().getName(); String finalPath = String.format("%s/%s/%s/", configPath, tableName, timeSuffix); this.outputPath = finalPath; return finalPath; } private void emitMetricsForStorageBucket(boolean succeeded, BatchSinkContext context) { if (!succeeded) { return; } try { StorageClient storageClient = StorageClient.create(config.getProject(), config.getServiceAccount(), config.isServiceAccountFilePath(), null); storageClient.mapMetaDataForAllBlobs(outputPath, new MetricsEmitter(context.getMetrics())::emitMetrics); } catch (Exception e) { LOG.warn("Metrics for the number of affected rows in GCS Sink maybe incorrect.", e); } } /** * Prepares metadata update in Dataplex if update dataplex metadata is enabled * * @param collector * @param schema * @throws IOException */ private void prepareDataplexMetadataUpdate(FailureCollector collector, Schema schema) throws IOException { Optional partitionKey = Objects.requireNonNull(schema.getFields()).stream() .filter(avroField -> avroField.getName().equals(DataplexConstants.STORAGE_BUCKET_PARTITION_KEY)).findAny(); if (partitionKey.isPresent()) { collector.addFailure( String.format("Field '%s' is used by dataplex sink to create time partitioned layout on GCS." + " To avoid conflict, presence of a column with the name '%s' on the input schema is not allowed.", DataplexConstants.STORAGE_BUCKET_PARTITION_KEY, DataplexConstants.STORAGE_BUCKET_PARTITION_KEY), String.format( "Remove '%s' field from the output schema or rename the '%s' field in the input schema by adding" + " a transform step.", DataplexConstants.STORAGE_BUCKET_PARTITION_KEY, DataplexConstants.STORAGE_BUCKET_PARTITION_KEY ) ); } String entityID = config.getTable().replaceAll("[^a-zA-Z0-9_]", "_"); try (MetadataServiceClient metadataServiceClient = DataplexUtil.getMetadataServiceClient(config.getCredentials(collector))) { entityBean = metadataServiceClient.getEntity(GetEntityRequest.newBuilder().setName(EntityName.of( config.tryGetProject(), config.getLocation(), config.getLake(), config.getZone(), entityID).toString()) .setView(GetEntityRequest.EntityView.FULL) .build()); } catch (ApiException e) { int statusCode = e.getStatusCode().getCode().getHttpStatusCode(); if (statusCode != 404) { collector.addFailure("Unable to fetch entity information.", null); } } if (entityBean != null && entityBean.getSchema().getUserManaged() == false) { collector.addFailure("Entity already exists, but the schema is not user-managed.", null); } } /** * Configures metadata update in Dataplex if update dataplex metadata is enabled * * @param credentials * @param assetFullPath * @param storageSystem * @param schema * @throws IOException */ private void configureDataplexMetadataUpdate( GoogleCredentials credentials, String assetFullPath, StorageSystem storageSystem, Schema schema ) throws IOException { String entityID = config.getTable().replaceAll("[^a-zA-Z0-9_]", "_"); try (MetadataServiceClient metadataServiceClient = DataplexUtil.getMetadataServiceClient(credentials)) { com.google.cloud.dataplex.v1.Schema dataplexSchema = DataplexUtil.getDataplexSchema(schema); Entity.Builder entityBuilder = Entity.newBuilder() .setId(entityID) .setAsset(config.getAsset()) .setDataPath(assetFullPath) .setType(Entity.Type.TABLE) .setSystem(storageSystem) .setSchema(dataplexSchema) .setFormat(StorageFormat .newBuilder() .setMimeType(DataplexUtil.getStorageFormatForEntity(config.getFormatStr())) .build() ); if (entityBean != null) { try { entityBean = metadataServiceClient.updateEntity( UpdateEntityRequest.newBuilder() .setEntity(entityBuilder .setName(entityBean.getName()) .setEtag(entityBean.getEtag()) .build() ) .build() ); } catch (ApiException e) { throw new RuntimeException( String.format("%s: %s", "There was a problem updating the entity for metadata updates.", e.getMessage())); } } else { try { String entityParent = "projects/" + config.tryGetProject() + "/locations/" + config.getLocation() + "/lakes/" + config.getLake() + "/zones/" + config.getZone(); entityBean = metadataServiceClient.createEntity( CreateEntityRequest.newBuilder() .setParent(entityParent) .setEntity(entityBuilder.build()) .build() ); } catch (ApiException e) { throw new RuntimeException( String.format("%s: %s", "There was a problem creating the entity for metadata updates.", e.getMessage())); } } try { DataplexUtil.addPartitionInfo(entityBean, credentials, asset.getResourceSpec().getName(), config.getTable(), config.getProject()); } catch (ApiException e) { // extract last 14 chars of the error message to make sure it's "already exists" and safe to ignore String errMessage = e.getMessage().substring(e.getMessage().length() - 14); if (!errMessage.equals("already exists")) { throw new RuntimeException(String.format("Unable to create add partition information for %s. ", entityID)); } } } } private static class MetricsEmitter { private final StageMetrics stageMetrics; private MetricsEmitter(StageMetrics stageMetrics) { this.stageMetrics = stageMetrics; } public void emitMetrics(Map metaData) { long totalRows = extractRecordCount(metaData); if (totalRows == 0) { return; } // work around since StageMetrics count() only takes int as of now int cap = 10000; // so the loop will not cause significant delays long count = totalRows / Integer.MAX_VALUE; if (count > cap) { LOG.warn("Total record count is too high! Metric for the number of affected rows may not be updated correctly"); } count = count < cap ? count : cap; for (int i = 0; i <= count && totalRows > 0; i++) { int rowCount = totalRows < Integer.MAX_VALUE ? (int) totalRows : Integer.MAX_VALUE; stageMetrics.count(RECORDS_UPDATED_METRIC, rowCount); totalRows -= rowCount; } } private long extractRecordCount(Map metadata) { String value = metadata.get(GCSBatchSink.RECORD_COUNT); return value == null ? 0L : Long.parseLong(value); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy