/*
 * Copyright © 2022 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package io.cdap.plugin.gcp.dataplex.source;

import com.google.auth.Credentials;
import com.google.auth.oauth2.GoogleCredentials;
import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.Dataset;
import com.google.cloud.bigquery.DatasetId;
import com.google.cloud.bigquery.TableDefinition;
import com.google.cloud.bigquery.TableId;
import com.google.cloud.dataplex.v1.DataplexServiceClient;
import com.google.cloud.dataplex.v1.Entity;
import com.google.cloud.dataplex.v1.EntityName;
import com.google.cloud.dataplex.v1.LakeName;
import com.google.cloud.dataplex.v1.MetadataServiceClient;
import com.google.cloud.dataplex.v1.StorageSystem;
import com.google.cloud.dataplex.v1.Task;
import com.google.cloud.storage.Storage;
import com.google.cloud.storage.StorageException;
import com.google.common.base.Strings;
import io.cdap.cdap.api.annotation.Description;
import io.cdap.cdap.api.annotation.Name;
import io.cdap.cdap.api.annotation.Plugin;
import io.cdap.cdap.api.data.batch.Input;
import io.cdap.cdap.api.data.format.StructuredRecord;
import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.cdap.api.dataset.lib.KeyValue;
import io.cdap.cdap.etl.api.Emitter;
import io.cdap.cdap.etl.api.FailureCollector;
import io.cdap.cdap.etl.api.PipelineConfigurer;
import io.cdap.cdap.etl.api.StageConfigurer;
import io.cdap.cdap.etl.api.batch.BatchRuntimeContext;
import io.cdap.cdap.etl.api.batch.BatchSinkContext;
import io.cdap.cdap.etl.api.batch.BatchSource;
import io.cdap.cdap.etl.api.batch.BatchSourceContext;
import io.cdap.cdap.etl.api.validation.ValidatingInputFormat;
import io.cdap.plugin.common.Asset;
import io.cdap.plugin.common.LineageRecorder;
import io.cdap.plugin.common.ReferenceNames;
import io.cdap.plugin.common.batch.JobUtils;
import io.cdap.plugin.gcp.bigquery.sink.BigQuerySinkUtils;
import io.cdap.plugin.gcp.bigquery.source.BigQueryAvroToStructuredTransformer;
import io.cdap.plugin.gcp.bigquery.source.BigQuerySourceUtils;
import io.cdap.plugin.gcp.bigquery.util.BigQueryConstants;
import io.cdap.plugin.gcp.bigquery.util.BigQueryUtil;
import io.cdap.plugin.gcp.common.GCPConnectorConfig;
import io.cdap.plugin.gcp.common.GCPUtils;
import io.cdap.plugin.gcp.dataplex.common.util.DataplexConstants;
import io.cdap.plugin.gcp.dataplex.common.util.DataplexUtil;
import io.cdap.plugin.gcp.dataplex.sink.config.DataplexBatchSinkConfig;
import io.cdap.plugin.gcp.dataplex.source.config.DataplexBatchSourceConfig;
import io.cdap.plugin.gcp.gcs.GCSPath;

import org.apache.avro.generic.GenericData;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.ExecutionException;
import java.util.stream.Collectors;

import javax.annotation.Nullable;

/**
 * Batch Source that reads data from Dataplex entities (BigQuery or Cloud Storage).
 * <p>
 * {@code Object} is the first parameter because that is the key used by Hadoop's {@code TextInputFormat}.
 * {@code Object} is the second parameter because that is the value used by Hadoop's {@code TextInputFormat}.
 * {@code StructuredRecord} is the third parameter because that is what this source outputs.
 * All the plugins included with Hydrator operate on StructuredRecord.
 */
@Plugin(type = BatchSource.PLUGIN_TYPE)
@Name(DataplexBatchSource.NAME)
@Description("Dataplex Source")
public class DataplexBatchSource extends BatchSource<Object, Object, StructuredRecord> {
  public static final String NAME = "Dataplex";
  private static final String BQ_TEMP_BUCKET_NAME_PREFIX = "dataplex-bq-source-bucket-";
  private static final String BQ_TEMP_BUCKET_NAME_TEMPLATE = BQ_TEMP_BUCKET_NAME_PREFIX + "%s";
  private static final String CONFIG_TEMPORARY_TABLE_NAME = "cdap.bq.source.temporary.table.name";
  private static final String GCS_TEMP_BUCKET_NAME = "dataplex-cdf-" + UUID.randomUUID();
  private static final String DATAPLEX_TASK_ARGS = "TASK_ARGS";
  private static final Logger LOG = LoggerFactory.getLogger(DataplexBatchSource.class);
  private static Entity entity;
  private static String dataset;
  private static String datasetProject;
  private static Schema outputSchema;
  private static String tableId;
  private final BigQueryAvroToStructuredTransformer transformer = new BigQueryAvroToStructuredTransformer();
  private final DataplexBatchSourceConfig config;
  private Configuration configuration;
  private String bucketPath;

  public DataplexBatchSource(DataplexBatchSourceConfig dataplexBatchSourceConfig) {
    this.config = dataplexBatchSourceConfig;
  }

  @Override
  public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
    super.configurePipeline(pipelineConfigurer);
    StageConfigurer configurer = pipelineConfigurer.getStageConfigurer();
    FailureCollector collector = configurer.getFailureCollector();
    if (!config.getConnection().canConnect() || config.getServiceAccountType() == null ||
      (config.isServiceAccountFilePath() && config.autoServiceAccountUnavailable()) ||
      (config.tryGetProject() == null)) {
      // ValidatingInputFormat plugin setup is mandatory. Otherwise the pipeline will fail at runtime for GCS entities.
      config.setupValidatingInputFormat(pipelineConfigurer, collector, null);
      return;
    }
    GoogleCredentials credentials = config.validateAndGetServiceAccountCredentials(collector);
    collector.getOrThrowException();
    try {
      entity = config.getAndValidateEntityConfiguration(collector, credentials);
    } catch (IOException e) {
      collector.addFailure(e.getCause().getMessage(), "Please check credentials");
      return;
    }
    if (entity == null) {
      config.setupValidatingInputFormat(pipelineConfigurer, collector, null);
      return;
    }
    if (entity.getSystem().equals(StorageSystem.BIGQUERY)) {
      getEntityValuesFromDataPathForBQEntities(entity.getDataPath());
      config.validateBigQueryDataset(collector, datasetProject, dataset, tableId);
      if (config.getSchema(collector) == null) {
        Schema configuredSchema = DataplexUtil.getTableSchema(entity.getSchema(), collector);
        configurer.setOutputSchema(configuredSchema);
      }
      return;
    }
    // for Cloud Storage entities
    config.checkMetastoreForGCSEntity(collector, credentials);
    config.setupValidatingInputFormat(pipelineConfigurer, collector, entity);
  }

  @Override
  public void prepareRun(BatchSourceContext context) throws Exception {
    FailureCollector collector = context.getFailureCollector();
    GoogleCredentials credentials = config.validateAndGetServiceAccountCredentials(collector);
    collector.getOrThrowException();
    entity = config.getAndValidateEntityConfiguration(collector, credentials);
    if (entity == null) {
      throw new IOException(String.format("Pipeline failed. Entity %s does not exist", config.getEntity()));
    }
    if (entity.getSystem().equals(StorageSystem.BIGQUERY)) {
      getEntityValuesFromDataPathForBQEntities(entity.getDataPath());
      config.validateBigQueryDataset(collector, datasetProject, dataset, tableId);
      prepareRunBigQueryDataset(context);
    } else {
      config.checkMetastoreForGCSEntity(collector, credentials);
      prepareRunStorageBucket(context);
    }
  }

  private void getEntityValuesFromDataPathForBQEntities(String dataPath) {
    // dataPath will be in the format 'projects/projectName/datasets/datasetName/tables/tableName'
    String[] entityValues = dataPath.split("/");
    if (entityValues.length >= 3) {
      dataset = entityValues[entityValues.length - 3];
      datasetProject = entityValues[1];
      tableId = entityValues[entityValues.length - 1];
    }
  }
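
  /*
   * Illustrative example (values are hypothetical): for the data path
   * 'projects/my-project/datasets/sales/tables/orders', the split above yields
   * datasetProject = "my-project", dataset = "sales" and tableId = "orders".
   */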

  @Override
  public void initialize(BatchRuntimeContext context) throws Exception {
    super.initialize(context);
    outputSchema = config.getSchema(context.getFailureCollector());
    try (MetadataServiceClient metadataServiceClient =
           DataplexUtil.getMetadataServiceClient(config.getCredentials(context.getFailureCollector()))) {
      // The entity is required later when the transform method is called.
      entity = metadataServiceClient.getEntity(EntityName.newBuilder().setProject(config.tryGetProject())
        .setLocation(config.getLocation()).setLake(config.getLake()).setZone(config.getZone())
        .setEntity(config.getEntity()).build());
    }
  }

  /**
   * Sets up all the configuration to run the job for BigQuery entities.
   *
   * @param context BatchSourceContext
   * @throws Exception
   */
  private void prepareRunBigQueryDataset(BatchSourceContext context) throws Exception {
    FailureCollector collector = context.getFailureCollector();
    outputSchema = DataplexUtil.getTableSchema(entity.getSchema(), collector);
    // Create BigQuery client
    String serviceAccount = config.getServiceAccount();
    Credentials credentials = config.getCredentials(collector);
    BigQuery bigQuery = GCPUtils.getBigQuery(datasetProject, credentials);
    // Temporary bucket path without the BQ template
    bucketPath = UUID.randomUUID().toString();
    configuration = BigQueryUtil.getBigQueryConfig(serviceAccount, config.getProject(), null,
      config.getServiceAccountType());
    // Configure the temporary GCS bucket to use
    String bucketName = BigQueryUtil.getStagingBucketName(context.getArguments().asMap(), config.getLocation(),
      bigQuery.getDataset(DatasetId.of(datasetProject, dataset)), null);
    String bucket = createBucket(configuration, config.getProject(), bigQuery, credentials, bucketName, bucketPath);
    // Configure service account credentials
    configureServiceAccount(configuration, config.getConnection());
    // Configure BQ source
    configureBigQuerySource();
    // Configure BigQuery input format.
    String temporaryGcsPath = BigQuerySourceUtils.getTemporaryGcsPath(bucket, bucketPath, bucketPath);
    BigQuerySourceUtils.configureBigQueryInput(configuration, DatasetId.of(datasetProject, dataset), tableId,
      temporaryGcsPath);
    configuration.set(DataplexConstants.DATAPLEX_ENTITY_TYPE, entity.getSystem().toString());
    TableDefinition.Type sourceTableType = config.getSourceTableType(datasetProject, dataset, tableId);
    emitLineage(context, outputSchema, sourceTableType);
    context.setInput(
      Input.of(config.getReferenceName(BigQueryUtil.getFQN(datasetProject, dataset, tableId)),
        new DataplexInputFormatProvider(configuration)));
  }

  /**
   * Sets BigQuery filter and partition values for filtered data.
   */
  private void configureBigQuerySource() {
    if (config.getPartitionFrom() != null) {
      configuration.set(BigQueryConstants.CONFIG_PARTITION_FROM_DATE, config.getPartitionFrom());
    }
    if (config.getPartitionTo() != null) {
      configuration.set(BigQueryConstants.CONFIG_PARTITION_TO_DATE, config.getPartitionTo());
    }
    if (config.getFilter() != null) {
      configuration.set(BigQueryConstants.CONFIG_FILTER, config.getFilter());
    }
  }

  private void emitLineage(BatchSourceContext context, Schema schema, TableDefinition.Type sourceTableType) {
    getEntityValuesFromDataPathForBQEntities(entity.getDataPath());
    String fqn = BigQueryUtil.getFQN(datasetProject, dataset, tableId);
    Asset asset = Asset.builder(
      config.getReferenceName(fqn)).setFqn(fqn).setLocation(config.getLocation()).build();
    LineageRecorder lineageRecorder = new LineageRecorder(context, asset);
    lineageRecorder.createExternalDataset(schema);
    String type = "table";
    if (TableDefinition.Type.VIEW == sourceTableType) {
      type = "view";
    } else if (TableDefinition.Type.MATERIALIZED_VIEW == sourceTableType) {
      type = "materialized view";
    }
    if (schema.getFields() != null) {
      this.recordLineage(lineageRecorder,
        schema.getFields().stream().map(Schema.Field::getName).collect(Collectors.toList()),
        String.format("Read from BigQuery Entity %s '%s' from Dataplex.", type, tableId));
    }
  }

  /**
   * Sets up all the configuration to run the job for Cloud Storage entities.
   *
   * @param context BatchSourceContext
   * @throws InstantiationException
   * @throws IOException
   * @throws ExecutionException
   * @throws InterruptedException
   */
  private void prepareRunStorageBucket(BatchSourceContext context) throws InstantiationException, IOException,
    ExecutionException, InterruptedException {
    FailureCollector collector = context.getFailureCollector();
    Job job = JobUtils.createInstance();
    configuration = job.getConfiguration();
    // Get the storage client and create a temporary bucket to store the task execution output.
    Storage storage = GCPUtils.getStorage(config.getProject(), config.getCredentials(collector));
    createBucket(configuration, storage, config.getLocation(), GCS_TEMP_BUCKET_NAME);
    String outputLocation = GCSPath.SCHEME + GCS_TEMP_BUCKET_NAME;
    String query = formatQuery(entity, context.isPreviewEnabled());
    // Create a Dataplex task to fetch the filtered data, and record it in the configuration.
    String taskId = createTask(outputLocation, query, collector);
    setConfigurationForDataplex(taskId);
    // Set up the validating input format.
    ValidatingInputFormat validatingInputFormat = config.getValidatingInputFormat(context);
    FileInputFormat.setInputDirRecursive(job, true);
    Schema schema = DataplexUtil.getTableSchema(entity.getSchema(), collector);
    io.cdap.plugin.common.Asset asset = io.cdap.plugin.common.Asset.builder(
        config.getReferenceName(entity.getDataPath()))
      .setFqn(entity.getDataPath()).setLocation(config.getLocation()).build();
    LineageRecorder lineageRecorder = new LineageRecorder(context, asset);
    lineageRecorder.createExternalDataset(schema);
    if (schema != null && schema.getFields() != null) {
      this.recordLineage(lineageRecorder,
        schema.getFields().stream().map(Schema.Field::getName).collect(Collectors.toList()),
        "Read from GCS entity in Dataplex.");
    }
    Iterator<Map.Entry<String, String>> propertiesIterator = config.getFileSystemProperties(outputLocation)
      .entrySet().iterator();
    while (propertiesIterator.hasNext()) {
      Map.Entry<String, String> entry = propertiesIterator.next();
      configuration.set(entry.getKey(), entry.getValue());
    }
    Path path = new Path(outputLocation);
    FileSystem pathFileSystem = FileSystem.get(path.toUri(), configuration);
    FileStatus[] fileStatus = pathFileSystem.globStatus(path);
    if (fileStatus == null) {
      throw new IOException(String.format("Input path %s does not exist", path));
    } else {
      FileInputFormat.addInputPath(job, path);
      Map<String, String> inputFormatConfiguration = validatingInputFormat.getInputFormatConfiguration();
      Iterator<Map.Entry<String, String>> inputFormatIterator = inputFormatConfiguration.entrySet().iterator();
      while (inputFormatIterator.hasNext()) {
        Map.Entry<String, String> propertyEntry = inputFormatIterator.next();
        configuration.set(propertyEntry.getKey(), propertyEntry.getValue());
      }
    }
    configuration.set(DataplexConstants.DATAPLEX_ENTITY_TYPE, entity.getSystem().toString());
    context.setInput(Input.of(config.getReferenceName(entity.getDataPath()),
      new DataplexInputFormatProvider(configuration)));
  }

  /**
   * Sets Dataplex properties in the configuration.
   *
   * @param taskId Dataplex task id
   */
  private void setConfigurationForDataplex(String taskId) {
    configuration.set(DataplexConstants.DATAPLEX_TASK_ID, taskId);
    configuration.set(DataplexConstants.DATAPLEX_PROJECT_ID, config.tryGetProject());
    configuration.set(DataplexConstants.DATAPLEX_LOCATION, config.getLocation());
    configuration.set(DataplexConstants.DATAPLEX_LAKE, config.getLake());
    configuration.set(DataplexConstants.SERVICE_ACCOUNT_TYPE, config.getServiceAccountType());
    String serviceAccountFilePath = config.getServiceAccountFilePath() != null ?
      config.getServiceAccountFilePath() : DataplexConstants.NONE;
    configuration.set(DataplexConstants.SERVICE_ACCOUNT_FILEPATH, serviceAccountFilePath);
  }

  private void recordLineage(LineageRecorder lineageRecorder, List<String> outputFields, String description) {
    lineageRecorder.recordRead("Read", description, outputFields);
  }

  @Override
  public void onRunFinish(boolean succeeded, BatchSourceContext context) {
    if (entity.getSystem().equals(StorageSystem.BIGQUERY)) {
      BigQuerySourceUtils.deleteGcsTemporaryDirectory(configuration, null, bucketPath);
      String temporaryTable = configuration.get(CONFIG_TEMPORARY_TABLE_NAME);
      Credentials credentials = config.getCredentials(context.getFailureCollector());
      BigQuery bigQuery = GCPUtils.getBigQuery(config.getProject(), credentials);
      bigQuery.delete(TableId.of(datasetProject, dataset, temporaryTable));
      LOG.debug("Deleted temporary table '{}'", temporaryTable);
    } else {
      Storage storage = GCPUtils.getStorage(config.tryGetProject(),
        config.getCredentials(context.getFailureCollector()));
      // Delete the directory first and then delete the empty storage bucket.
      BigQuerySourceUtils.deleteGcsTemporaryDirectory(configuration, GCS_TEMP_BUCKET_NAME, "projects");
      storage.delete(GCS_TEMP_BUCKET_NAME);
      LOG.debug("Deleted temporary bucket '{}'.", GCS_TEMP_BUCKET_NAME);
    }
  }

  /**
   * Creates the Dataplex task that runs the query and writes its output to the temporary GCS location.
   *
   * @param outputLocation GCS output path
   * @param query          query to run in Spark when the Dataplex task is executed
   * @param collector      FailureCollector
   * @return the created task's description, which is used as the task id
   * @throws IOException
   * @throws ExecutionException
   * @throws InterruptedException
   */
  private String createTask(String outputLocation, String query, FailureCollector collector)
    throws IOException, ExecutionException, InterruptedException {
    String taskArgs = "--output_location,%s, --output_format, %s";
    Task.TriggerSpec triggerSpec = Task.TriggerSpec.newBuilder().setType(Task.TriggerSpec.Type.ON_DEMAND).build();
    Task.ExecutionSpec executionSpec = Task.ExecutionSpec.newBuilder()
      .setServiceAccount(config.getServiceAccountEmail()).putArgs(DATAPLEX_TASK_ARGS,
        String.format(taskArgs, outputLocation, DataplexBatchSourceConfig.INPUT_FORMAT)).build();
    Task.SparkTaskConfig spark = Task.SparkTaskConfig.newBuilder().setSqlScript(query).build();
    Task task = Task.newBuilder().setTriggerSpec(triggerSpec).setDescription("task-" + UUID.randomUUID())
      .setExecutionSpec(executionSpec).setSpark(spark).build();
    try (DataplexServiceClient dataplexServiceClient =
           DataplexUtil.getDataplexServiceClient(config.getCredentials(collector))) {
      task = dataplexServiceClient.createTaskAsync(LakeName.newBuilder().setLake(config.getLake())
          .setProject(config.tryGetProject()).setLocation(config.getLocation()).build(), task, task.getDescription())
        .get();
    }
    return task.getDescription();
  }

  /**
   * Formats the query for Cloud Storage entities.
   *
   * @param entity           Dataplex entity
   * @param isPreviewEnabled whether the pipeline run is a preview
   * @return the formatted query
   */
  private String formatQuery(Entity entity, boolean isPreviewEnabled) {
    String queryTemplate = "select * from %s.%s %s";
    StringBuilder condition = new StringBuilder();
    if (!Strings.isNullOrEmpty(config.getFilter())) {
      condition.append("where ").append(config.getFilter());
    }
    // In preview mode, limit the job to 1000 records.
    condition.append(isPreviewEnabled ? " LIMIT 1000;" : ";");
    return String.format(queryTemplate, config.getZone(), entity.getId(), condition);
  }
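
  /*
   * Illustrative examples (values are hypothetical): with zone "raw_zone" and entity id "orders",
   * formatQuery produces
   *   "select * from raw_zone.orders where price > 10 LIMIT 1000;"  (filter "price > 10", preview enabled)
   *   "select * from raw_zone.orders ;"                             (no filter, preview disabled)
   */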

  @Override
  public void transform(KeyValue<Object, Object> input, Emitter<StructuredRecord> emitter) throws IOException {
    if (entity.getSystem().equals(StorageSystem.BIGQUERY)) {
      StructuredRecord transformed = outputSchema == null ?
        transformer.transform((GenericData.Record) input.getValue()) :
        transformer.transform((GenericData.Record) input.getValue(), outputSchema);
      emitter.emit(transformed);
    } else {
      emitter.emit((StructuredRecord) input.getValue());
    }
  }

  /**
   * Gets the bucket from the supplied configuration.
   * <p>
   * If the supplied configuration doesn't specify a bucket, a bucket will get auto created and the configuration
   * modified to auto-delete this bucket on completion.
   *
   * @param configuration Hadoop configuration instance
   * @param project       GCP projectId
   * @param bigQuery      BigQuery client
   * @param credentials   GCP credentials
   * @param bucket        bucket name
   * @param bucketPath    bucket path to use; will be used as a bucket name if needed
   * @return bucket name
   */
  private String createBucket(Configuration configuration, String project, BigQuery bigQuery, Credentials credentials,
                              @Nullable String bucket, String bucketPath) throws IOException {
    if (bucket == null) {
      bucket = String.format(BQ_TEMP_BUCKET_NAME_TEMPLATE, bucketPath);
      // By default, this option is false, meaning the job cannot delete the bucket.
      configuration.setBoolean("fs.gs.bucket.delete.enable", true);
    }
    // The dataset existence is validated before, so this cannot be null.
    Dataset bigQueryDataset = bigQuery.getDataset(DatasetId.of(datasetProject, dataset));
    createBucket(configuration, GCPUtils.getStorage(project, credentials), bigQueryDataset.getLocation(), bucket);
    return bucket;
  }

  /**
   * Sets up service account credentials in the supplied Hadoop configuration.
   *
   * @param configuration Hadoop Configuration instance
   * @param config        BigQuery connection configuration
   */
  private void configureServiceAccount(Configuration configuration, GCPConnectorConfig config) {
    if (config.getServiceAccount() != null) {
      configuration.set(BigQueryConstants.CONFIG_SERVICE_ACCOUNT, config.getServiceAccount());
      configuration.setBoolean(BigQueryConstants.CONFIG_SERVICE_ACCOUNT_IS_FILE, config.isServiceAccountFilePath());
    }
  }

  /**
   * Gets the bucket from the supplied configuration, creating it in GCS if needed.
   * <p>
   * If the bucket does not exist yet, it is created and the configuration is modified to auto-delete the bucket on
   * completion.
   *
   * @param configuration Hadoop configuration instance
   * @param storage       GCS Storage client
   * @param location      bucket location
   * @param bucket        GCS bucket name
   * @return bucket name
   */
  private String createBucket(Configuration configuration, Storage storage, String location,
                              @Nullable String bucket) throws IOException {
    // Create a new bucket if needed.
    if (storage != null && storage.get(bucket) == null) {
      try {
        configuration.setBoolean("fs.gs.bucket.delete.enable", true);
        GCPUtils.createBucket(storage, bucket, location, null);
      } catch (StorageException e) {
        if (e.getCode() == 409) {
          // A conflict means the bucket already exists. This most likely means multiple stages in the same pipeline
          // are trying to create the same bucket. Ignore this and move on, since all that matters is that the
          // bucket exists.
          return bucket;
        }
        throw new IOException(String.format("Unable to create Cloud Storage bucket '%s'. ", bucket), e);
      }
    }
    return bucket;
  }
}




