/*
 * Copyright © 2018 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.gcp.bigquery;

import co.cask.cdap.api.annotation.Description;
import co.cask.cdap.api.annotation.Name;
import co.cask.cdap.api.annotation.Plugin;
import co.cask.cdap.api.data.batch.Output;
import co.cask.cdap.api.data.batch.OutputFormatProvider;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.dataset.lib.KeyValue;
import co.cask.cdap.etl.api.Emitter;
import co.cask.cdap.etl.api.PipelineConfigurer;
import co.cask.cdap.etl.api.batch.BatchRuntimeContext;
import co.cask.cdap.etl.api.batch.BatchSink;
import co.cask.cdap.etl.api.batch.BatchSinkContext;
import co.cask.hydrator.common.LineageRecorder;
import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.BigQueryException;
import com.google.cloud.bigquery.DatasetInfo;
import com.google.cloud.bigquery.Field;
import com.google.cloud.bigquery.FieldList;
import com.google.cloud.bigquery.LegacySQLTypeName;
import com.google.cloud.bigquery.Table;
import com.google.cloud.hadoop.io.bigquery.BigQueryFileFormat;
import com.google.cloud.hadoop.io.bigquery.output.BigQueryOutputConfiguration;
import com.google.cloud.hadoop.io.bigquery.output.BigQueryTableFieldSchema;
import com.google.cloud.hadoop.io.bigquery.output.BigQueryTableSchema;
import com.google.cloud.hadoop.io.bigquery.output.IndirectBigQueryOutputFormat;
import com.google.gson.JsonNull;
import com.google.gson.JsonObject;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.stream.Collectors;

/**
 * {@link BigQuerySink} is a plugin that allows users to write {@link StructuredRecord}s
 * to Google BigQuery.
 *
 * The plugin uses the native BigQuery output format to write data.
 */
@Plugin(type = "batchsink")
@Name(BigQuerySink.NAME)
@Description("This sink writes to a BigQuery table. "
  + "BigQuery is Google's serverless, highly scalable, enterprise data warehouse. "
  + "Data is first written to a temporary location on Google Cloud Storage, then loaded into BigQuery from there.")
public final class BigQuerySink extends BatchSink<StructuredRecord, JsonObject, NullWritable> {
  public static final String NAME = "BigQueryTable";
  private static final DateTimeFormatter dtf = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSSSSS");
  private static final Logger LOG = LoggerFactory.getLogger(BigQuerySink.class);

  private final BigQuerySinkConfig config;
  private Schema schema;
  private Configuration configuration;
  // UUID for the run. Will be used as bucket name if bucket is not provided.
  private UUID uuid;

  public BigQuerySink(BigQuerySinkConfig config) {
    this.config = config;
  }

  @Override
  public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
    config.validate(pipelineConfigurer.getStageConfigurer().getInputSchema());
    super.configurePipeline(pipelineConfigurer);
  }

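  /**
   * Prepares the batch run: creates the BigQuery dataset if it does not exist, validates the configured
   * schema against any existing table, and configures the Hadoop job to stage newline-delimited JSON in a
   * temporary GCS location from which BigQuery loads the data.
   */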
  @Override
  public void prepareRun(BatchSinkContext context) throws Exception {
    config.validate(context.getInputSchema());
    BigQuery bigquery = BigQueryUtils.getBigQuery(config.getServiceAccountFilePath(), config.getProject());
    // create dataset if it does not exist
    if (bigquery.getDataset(config.getDataset()) == null) {
      try {
        bigquery.create(DatasetInfo.newBuilder(config.getDataset()).build());
      } catch (BigQueryException e) {
        throw new RuntimeException("Exception occurred while creating dataset " + config.getDataset() + ".", e);
      }
    }

    // schema validation against bigquery table schema
    validateSchema();

    uuid = UUID.randomUUID();
    configuration = BigQueryUtils.getBigQueryConfig(config.getServiceAccountFilePath(), config.getProject());

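    // Build the BigQuery table schema from the configured output schema; every column is created as NULLABLE.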
    List<BigQueryTableFieldSchema> fields = new ArrayList<>();
    for (Schema.Field field : config.getSchema().getFields()) {
      String tableTypeName = getTableDataType(BigQueryUtils.getNonNullableSchema(field.getSchema())).name();
      BigQueryTableFieldSchema tableFieldSchema = new BigQueryTableFieldSchema()
        .setName(field.getName())
        .setType(tableTypeName)
        .setMode(Field.Mode.NULLABLE.name());
      fields.add(tableFieldSchema);
    }

    String bucket = config.getBucket();
    if (config.getBucket() == null) {
      bucket = uuid.toString();
      // By default this option is false, meaning the job cannot delete the bucket. Enable it only when the
      // bucket name is not provided, so the temporary bucket created for this run can be deleted.
      configuration.setBoolean("fs.gs.bucket.delete.enable", true);
    }

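    // Use this bucket for GCS staging and disable GCS filesystem and metadata caching for this run.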
    configuration.set("fs.gs.system.bucket", bucket);
    configuration.setBoolean("fs.gs.impl.disable.cache", true);
    configuration.setBoolean("fs.gs.metadata.cache.enable", false);
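    // Temporary staging path; the indirect output format writes newline-delimited JSON here before it is loaded into BigQuery.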
    String temporaryGcsPath = String.format("gs://%s/hadoop/input/%s", bucket, uuid);

    BigQueryOutputConfiguration.configure(
      configuration,
      String.format("%s.%s", config.getDataset(), config.getTable()),
      new BigQueryTableSchema().setFields(fields),
      temporaryGcsPath,
      BigQueryFileFormat.NEWLINE_DELIMITED_JSON,
      TextOutputFormat.class);

    // Both emitLineage and setOutputFormat internally try to create an external dataset if it does not already exist.
    // We call emitLineage first since it creates the dataset with the schema.
    emitLineage(context, fields);
    setOutputFormat(context);
  }

  @Override
  public void initialize(BatchRuntimeContext context) throws Exception {
    super.initialize(context);
    schema = config.getSchema();
  }

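  /**
   * Converts the input record into a {@link JsonObject} containing only the fields present in the output
   * schema and emits it as the key of the key-value pair consumed by the output format.
   */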
  @Override
  public void transform(StructuredRecord input, Emitter<KeyValue<JsonObject, NullWritable>> emitter) throws Exception {
    JsonObject object = new JsonObject();
    for (Schema.Field recordField : input.getSchema().getFields()) {
      // From all the fields in input record, decode only those fields that are present in output schema
      if (schema.getField(recordField.getName()) != null) {
        decodeSimpleTypes(object, recordField.getName(), input);
      }
    }
    emitter.emit(new KeyValue<>(object, NullWritable.get()));
  }

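  /**
   * Deletes the temporary GCS bucket created for this run. Does nothing if the user supplied a bucket name.
   */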
  @Override
  public void onRunFinish(boolean succeeded, BatchSinkContext context) {
    if (config.getBucket() == null) {
      Path gcsPath = new Path(String.format("gs://%s", uuid.toString()));
      try {
        FileSystem fs = gcsPath.getFileSystem(configuration);
        if (fs.exists(gcsPath)) {
          fs.delete(gcsPath, true);
        }
      } catch (IOException e) {
        LOG.warn("Failed to delete bucket " + gcsPath.toUri().getPath() + ", " + e.getMessage());
      }
    }
  }

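  /**
   * Maps a non-nullable CDAP {@link Schema} to the corresponding BigQuery {@link LegacySQLTypeName}.
   */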
  private LegacySQLTypeName getTableDataType(Schema schema) {
    Schema.LogicalType logicalType = schema.getLogicalType();

    if (logicalType != null) {
      switch (logicalType) {
        case DATE:
          return LegacySQLTypeName.DATE;
        case TIME_MILLIS:
        case TIME_MICROS:
          return LegacySQLTypeName.TIME;
        case TIMESTAMP_MILLIS:
        case TIMESTAMP_MICROS:
          return LegacySQLTypeName.TIMESTAMP;
        default:
          throw new IllegalStateException("Unsupported logical type " + logicalType);
      }
    }

    Schema.Type type = schema.getType();
    switch(type) {
      case INT:
      case LONG:
        return LegacySQLTypeName.INTEGER;
      case STRING:
        return LegacySQLTypeName.STRING;
      case FLOAT:
      case DOUBLE:
        return LegacySQLTypeName.FLOAT;
      case BOOLEAN:
        return LegacySQLTypeName.BOOLEAN;
      case BYTES:
        return LegacySQLTypeName.BYTES;
      default:
        throw new IllegalStateException("Unsupported type " + type);
    }
  }

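  /**
   * Registers {@link IndirectBigQueryOutputFormat} as the job's output format, passing along every entry of
   * the Hadoop configuration built in prepareRun.
   */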
  private void setOutputFormat(BatchSinkContext context) {
    context.addOutput(Output.of(config.getReferenceName(), new OutputFormatProvider() {
      @Override
      public String getOutputFormatClassName() {
        return IndirectBigQueryOutputFormat.class.getName();
      }

      @Override
      public Map<String, String> getOutputFormatConfiguration() {
        Map<String, String> config = new HashMap<>();
        for (Map.Entry<String, String> entry : configuration) {
          config.put(entry.getKey(), entry.getValue());
        }
        return config;
      }
    }));
  }

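  /**
   * Creates the external dataset for lineage purposes and records a write operation covering all configured
   * output fields.
   */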
  private void emitLineage(BatchSinkContext context, List<BigQueryTableFieldSchema> fields) {
    LineageRecorder lineageRecorder = new LineageRecorder(context, config.getReferenceName());
    lineageRecorder.createExternalDataset(config.getSchema());

    if (!fields.isEmpty()) {
      lineageRecorder.recordWrite("Write", "Wrote to BigQuery table.",
                                  fields.stream().map(BigQueryTableFieldSchema::getName).collect(Collectors.toList()));
    }
  }

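  /**
   * Adds a single field of the input record to the JSON object, converting date, time, and timestamp logical
   * types to the string representations expected by BigQuery when loading JSON.
   */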
  private static void decodeSimpleTypes(JsonObject json, String name, StructuredRecord input) {
    Object object = input.get(name);
    Schema schema = BigQueryUtils.getNonNullableSchema(input.getSchema().getField(name).getSchema());

    if (object == null) {
      json.add(name, JsonNull.INSTANCE);
      return;
    }

    Schema.LogicalType logicalType = schema.getLogicalType();
    if (logicalType != null) {
      switch (logicalType) {
        case DATE:
          json.addProperty(name, input.getDate(name).toString());
          break;
        case TIME_MILLIS:
        case TIME_MICROS:
          json.addProperty(name, input.getTime(name).toString());
          break;
        case TIMESTAMP_MILLIS:
        case TIMESTAMP_MICROS:
          // Timestamps in the JSON input should be in the format yyyy-MM-dd HH:mm:ss.SSSSSS
          json.addProperty(name, dtf.format(input.getTimestamp(name)));
          break;
        default:
          throw new IllegalStateException(String.format("Unsupported logical type %s", logicalType));
      }
      return;
    }

    Schema.Type type = schema.getType();
    switch (type) {
      case NULL:
        json.add(name, JsonNull.INSTANCE); // nothing much to do here.
        break;
      case INT:
      case LONG:
      case FLOAT:
      case DOUBLE:
        json.addProperty(name, (Number) object);
        break;
      case BOOLEAN:
        json.addProperty(name, (Boolean) object);
        break;
      case STRING:
        json.addProperty(name, object.toString());
        break;
      default:
        throw new IllegalStateException(String.format("Unsupported type %s", type));
    }
  }

  /**
   * Validates the output schema against the BigQuery table schema. Throws an {@link IllegalArgumentException}
   * if the output schema has fields that are not present in the BigQuery table, if a required BigQuery column
   * is missing from the output schema, or if the output schema field types do not match the BigQuery column types.
   */
  private void validateSchema() throws IOException {
    Table table = BigQueryUtils.getBigQueryTable(config.getServiceAccountFilePath(), config.getProject(),
                                                 config.getDataset(), config.getTable());
    if (table == null) {
      // Table does not exist, so no further validation is required.
      return;
    }

    com.google.cloud.bigquery.Schema bqSchema = table.getDefinition().getSchema();
    if (bqSchema == null) {
      // Table is created without schema, so no further validation is required.
      return;
    }

    FieldList bqFields = bqSchema.getFields();
    List<Schema.Field> outputSchemaFields = config.getSchema().getFields();

    // Output schema should not have fields that are not present in BigQuery table.
    List<String> diff = BigQueryUtils.getSchemaMinusBqFields(outputSchemaFields, bqFields);
    if (!diff.isEmpty()) {
      throw new IllegalArgumentException(
        String.format("The output schema does not match the BigQuery table schema for '%s.%s' table. " +
                        "The table does not contain the '%s' column(s).",
                      config.getDataset(), config.getTable(), diff));
    }

    // validate the missing columns in output schema are nullable fields in bigquery
    List<String> remainingBQFields = BigQueryUtils.getBqFieldsMinusSchema(bqFields, outputSchemaFields);
    for (String field : remainingBQFields) {
      if (bqFields.get(field).getMode() != Field.Mode.NULLABLE) {
        throw new IllegalArgumentException(
          String.format("The output schema does not match the BigQuery table schema for '%s.%s'. " +
                          "The table requires column '%s', which is not in the output schema.",
                        config.getDataset(), config.getTable(), field));
      }
    }

    // Match output schema field type with bigquery column type
    for (Schema.Field field : config.getSchema().getFields()) {
      BigQueryUtils.validateFieldSchemaMatches(bqFields.get(field.getName()),
                                               field, config.getDataset(), config.getTable());
    }
  }
}



