/*
 * Copyright © 2015-2021 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package io.cdap.plugin.gcp.gcs.sink;

import com.google.auth.Credentials;
import com.google.cloud.kms.v1.CryptoKeyName;
import com.google.cloud.storage.Storage;
import com.google.cloud.storage.StorageException;
import com.google.common.base.Strings;
import io.cdap.cdap.api.annotation.Description;
import io.cdap.cdap.api.annotation.Macro;
import io.cdap.cdap.api.annotation.Metadata;
import io.cdap.cdap.api.annotation.MetadataProperty;
import io.cdap.cdap.api.annotation.Name;
import io.cdap.cdap.api.annotation.Plugin;
import io.cdap.cdap.api.data.batch.Output;
import io.cdap.cdap.api.data.batch.OutputFormatProvider;
import io.cdap.cdap.api.data.format.StructuredRecord;
import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.cdap.api.dataset.lib.KeyValue;
import io.cdap.cdap.api.plugin.InvalidPluginConfigException;
import io.cdap.cdap.api.plugin.InvalidPluginProperty;
import io.cdap.cdap.api.plugin.PluginProperties;
import io.cdap.cdap.etl.api.Emitter;
import io.cdap.cdap.etl.api.FailureCollector;
import io.cdap.cdap.etl.api.PipelineConfigurer;
import io.cdap.cdap.etl.api.batch.BatchSink;
import io.cdap.cdap.etl.api.batch.BatchSinkContext;
import io.cdap.cdap.etl.api.connector.Connector;
import io.cdap.cdap.etl.api.validation.ValidatingOutputFormat;
import io.cdap.plugin.common.batch.sink.SinkOutputFormatProvider;
import io.cdap.plugin.format.FileFormat;
import io.cdap.plugin.gcp.common.CmekUtils;
import io.cdap.plugin.gcp.common.GCPUtils;
import io.cdap.plugin.gcp.gcs.connector.GCSConnector;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.HashMap;
import java.util.Map;
import java.util.stream.Collectors;
import javax.annotation.Nullable;


/**
 * {@link BatchSink} that writes the records of the latest pipeline run to Google Cloud Storage,
 * splitting them into one output directory per value of the configured split field.
 */
@Plugin(type = BatchSink.PLUGIN_TYPE)
@Name(GCSMultiBatchSink.NAME)
@Description("Writes records to one or more Avro, ORC, Parquet or Delimited format files in a directory " +
  "on Google Cloud Storage.")
@Metadata(properties = {@MetadataProperty(key = Connector.PLUGIN_TYPE, value = GCSConnector.NAME)})
public class GCSMultiBatchSink extends BatchSink<StructuredRecord, NullWritable, StructuredRecord> {
  private static final Logger LOG = LoggerFactory.getLogger(GCSMultiBatchSink.class);
  public static final String NAME = "GCSMultiFiles";
  private static final String TABLE_PREFIX = "multisink.";
  private static final String FORMAT_PLUGIN_ID = "format";
  private static final String SCHEMA_MACRO = "__provided_schema__";

  private final GCSMultiBatchSinkConfig config;

  public GCSMultiBatchSink(GCSMultiBatchSinkConfig config) {
    this.config = config;
  }

  @Override
  public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
    FailureCollector collector = pipelineConfigurer.getStageConfigurer().getFailureCollector();
    config.validate(collector);
    collector.getOrThrowException();

    // add schema as a macro since we don't know it until runtime
    PluginProperties.Builder formatPropertiesBuilder = PluginProperties.builder()
      .addAll(config.getProperties().getProperties());

    if (!config.getAllowFlexibleSchema()) {
      formatPropertiesBuilder.add("schema", String.format("${%s}", SCHEMA_MACRO));
    }

    PluginProperties formatProperties = formatPropertiesBuilder.build();

    if (!this.config.containsMacro("format")) {
      String format = config.getFormatName();
      OutputFormatProvider outputFormatProvider =
              pipelineConfigurer.usePlugin(ValidatingOutputFormat.PLUGIN_TYPE, format, FORMAT_PLUGIN_ID,
                      formatProperties);
      if (outputFormatProvider == null) {
        collector.addFailure(
                String.format("Could not find the '%s' output format plugin.", format), null)
                .withPluginNotFound(FORMAT_PLUGIN_ID, format, ValidatingOutputFormat.PLUGIN_TYPE);
      }
      return;
    }
    // deploy all format plugins if the format is a macro, so the required plugin is available when the macro is resolved
    for (FileFormat f: FileFormat.values()) {
      try {
        pipelineConfigurer.usePlugin(ValidatingOutputFormat.PLUGIN_TYPE, f.name().toLowerCase(),
                f.name().toLowerCase(), this.config.getRawProperties());
      } catch (InvalidPluginConfigException e) {
        LOG.warn("Failed to register format '{}', which means it cannot be used when the pipeline is run." +
                " Missing properties: {}, invalid properties: {}", new Object[]{f.name(),
                e.getMissingProperties(), e.getInvalidProperties().stream()
                .map(InvalidPluginProperty::getName).collect(Collectors.toList())});
      }
    }
  }
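
  // Illustrative note on the flow above (restating the code, not adding behavior): the per-table
  // schema is only known at runtime, so when flexible schemas are not allowed the format plugin is
  // registered at configure time with its 'schema' property set to the macro "${__provided_schema__}";
  // prepareRun() later fills that argument once per table before instantiating the format plugin in
  // configureMultiSinkWithSchema().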

  @Override
  public void prepareRun(BatchSinkContext context) throws IOException, InstantiationException {
    FailureCollector collector = context.getFailureCollector();
    config.validate(collector, context.getArguments().asMap());
    collector.getOrThrowException();

    Map<String, String> baseProperties = GCPUtils.getFileSystemProperties(config.connection,
                                                                          config.getPath(), new HashMap<>());
    Map<String, String> argumentCopy = new HashMap<>(context.getArguments().asMap());

    CryptoKeyName cmekKeyName = CmekUtils.getCmekKey(config.cmekKey, context.getArguments().asMap(), collector);
    collector.getOrThrowException();
    Boolean isServiceAccountFilePath = config.connection.isServiceAccountFilePath();
    if (isServiceAccountFilePath == null) {
      context.getFailureCollector().addFailure("Service account type is undefined.",
                                               "Must be `filePath` or `JSON`");
      context.getFailureCollector().getOrThrowException();
      return;
    }
    Credentials credentials = config.connection.getServiceAccount() == null ?
      null : GCPUtils.loadServiceAccountCredentials(config.connection.getServiceAccount(), isServiceAccountFilePath);
    Storage storage = GCPUtils.getStorage(config.connection.getProject(), credentials);
    try {
      if (storage.get(config.getBucket()) == null) {
        GCPUtils.createBucket(storage, config.getBucket(), config.getLocation(), cmekKeyName);
      }
    } catch (StorageException e) {
      // Add more descriptive error message
      throw new RuntimeException(
        String.format("Unable to access or create bucket %s. ", config.getBucket())
          + "Ensure you entered the correct bucket path and have permissions for it.", e);
    }

    if (config.getAllowFlexibleSchema()) {
      //Configure MultiSink with support for flexible schemas.
      configureSchemalessMultiSink(context, baseProperties, argumentCopy);
    } else {
      //Configure MultiSink with fixed schemas based on arguments.
      configureMultiSinkWithSchema(context, baseProperties, argumentCopy);
    }
  }

  @Override
  public void transform(StructuredRecord input,
                        Emitter<KeyValue<NullWritable, StructuredRecord>> emitter) {
    emitter.emit(new KeyValue<>(NullWritable.get(), input));
  }

  private void configureMultiSinkWithSchema(BatchSinkContext context,
                                            Map<String, String> baseProperties,
                                            Map<String, String> argumentCopy)
    throws IOException, InstantiationException {

    for (Map.Entry<String, String> argument : argumentCopy.entrySet()) {
      String key = argument.getKey();
      if (!key.startsWith(TABLE_PREFIX)) {
        continue;
      }
      String name = key.substring(TABLE_PREFIX.length());
      Schema schema = Schema.parseJson(argument.getValue());
      // TODO: (CDAP-14600) pass in schema as an argument instead of using macros and setting arguments
      // add better platform support to allow passing in arguments when instantiating a plugin
      context.getArguments().set(SCHEMA_MACRO, schema.toString());
      ValidatingOutputFormat validatingOutputFormat = context.newPluginInstance(FORMAT_PLUGIN_ID);

      Map<String, String> outputProperties = new HashMap<>(baseProperties);
      outputProperties.putAll(validatingOutputFormat.getOutputFormatConfiguration());
      outputProperties.putAll(RecordFilterOutputFormat.configure(validatingOutputFormat.getOutputFormatClassName(),
                                                                 config.splitField, name, schema));
      outputProperties.put(FileOutputFormat.OUTDIR, config.getOutputDir(context.getLogicalStartTime(), name));
      outputProperties.put(GCSBatchSink.CONTENT_TYPE, config.getContentType());
      context.addOutput(Output.of(
        config.getReferenceName() + "_" + name,
        new SinkOutputFormatProvider(RecordFilterOutputFormat.class.getName(), outputProperties)));
    }
  }
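
  // Illustrative sketch of the runtime arguments the method above consumes (the table name and
  // schema below are hypothetical, typically set by an upstream multi-table source):
  //   multisink.users = {"type":"record","name":"users","fields":[{"name":"id","type":"long"}]}
  // For that argument the method parses the schema, instantiates the configured format plugin, and
  // registers one output that writes matching records to <path>/users/<time suffix> through
  // RecordFilterOutputFormat.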

  private void configureSchemalessMultiSink(BatchSinkContext context,
                                            Map<String, String> baseProperties,
                                            Map<String, String> argumentCopy) throws InstantiationException {
    ValidatingOutputFormat validatingOutputFormat = context.newPluginInstance(FORMAT_PLUGIN_ID);

    Map<String, String> outputProperties = new HashMap<>(baseProperties);
    outputProperties.putAll(validatingOutputFormat.getOutputFormatConfiguration());
    outputProperties.putAll(DelegatingGCSOutputFormat.configure(validatingOutputFormat.getOutputFormatClassName(),
                                                                config.splitField,
                                                                config.getOutputBaseDir(),
                                                                config.getOutputSuffix(context.getLogicalStartTime())));
    outputProperties.put(GCSBatchSink.CONTENT_TYPE, config.getContentType());
    context.addOutput(Output.of(
      config.getReferenceName(),
      new SinkOutputFormatProvider(DelegatingGCSOutputFormat.class.getName(), outputProperties)));
  }

  /**
   * Sink configuration.
   */
  public static class GCSMultiBatchSinkConfig extends GCSBatchSink.GCSBatchSinkConfig {
    private static final String NAME_ALLOW_FLEXIBLE_SCHEMA = "allowFlexibleSchema";

    @Description("The codec to use when writing data. " +
      "The 'avro' format supports 'snappy' and 'deflate'. The parquet format supports 'snappy' and 'gzip'. " +
      "Other formats do not support compression.")
    @Nullable
    private String compressionCodec;

    @Description("The name of the field that will be used to determine which directory to write to.")
    private String splitField = "tablename";
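    // Hypothetical example of the split: with the default splitField "tablename", records whose
    // "tablename" field is "users" are written under the "users" directory created for the
    // corresponding "multisink.users" argument.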

    @Name(NAME_ALLOW_FLEXIBLE_SCHEMA)
    @Macro
    @Nullable
    @Description("Allow Flexible Schemas in output. If disabled, only records with schemas set as " +
      "arguments will be processed. If enabled, all records will be written as-is.")
    private Boolean allowFlexibleSchema;

    protected String getOutputDir(long logicalStartTime, String context) {
      return String.format("%s/%s/%s", getOutputBaseDir(), context, getOutputSuffix(logicalStartTime));
    }

    protected String getOutputBaseDir() {
      return getPath();
    }

    protected String getOutputSuffix(long logicalStartTime) {
      boolean suffixOk = !Strings.isNullOrEmpty(getSuffix());
      return suffixOk ? new SimpleDateFormat(getSuffix()).format(logicalStartTime) : "";
    }
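
    // Illustrative example (the pattern is a hypothetical 'suffix' value): a suffix of "yyyy-MM-dd"
    // is applied to the pipeline's logical start time via SimpleDateFormat, producing a dated
    // subdirectory such as "2021-01-01"; a null or empty suffix yields no time-based subdirectory.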

    public Boolean getAllowFlexibleSchema() {
      return allowFlexibleSchema != null ? allowFlexibleSchema : false;
    }
  }
}