/*
* Copyright © 2019 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package io.cdap.plugin.gcp.bigquery.sink;

import io.cdap.cdap.api.data.batch.OutputFormatProvider;
import io.cdap.cdap.api.data.format.StructuredRecord;
import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.plugin.gcp.bigquery.util.BigQueryConstants;
import io.cdap.plugin.gcp.bigquery.util.BigQueryUtil;
import org.apache.avro.mapred.AvroKey;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import java.io.IOException;
import java.util.Map;
/**
 * Provides {@link MultiSinkOutputFormatDelegate} for writing output values to multiple tables.
 */
public class MultiSinkOutputFormatProvider implements OutputFormatProvider {
  private static final String FILTER_FIELD = "bq.multi.record.filter.field";
  private static final String FILTER_VALUE = "bq.multi.record.filter.value";

  private final Configuration config;

  public MultiSinkOutputFormatProvider(Configuration config,
                                       String tableName,
                                       Schema tableSchema,
                                       String filterField) {
    this.config = new Configuration(config);
    this.config.set(FILTER_VALUE, tableName);
    this.config.set(FILTER_FIELD, filterField);
    this.config.set(BigQueryConstants.CDAP_BQ_SINK_OUTPUT_SCHEMA, tableSchema.toString());
  }
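
  /* A hedged usage sketch, not part of the original source: a multi-table sink would
   * typically build one provider per target table, all sharing a base configuration.
   * The schema, the table name "customers", the filter field "tablename", and the
   * variable baseConf below are illustrative assumptions, not values defined by this
   * class.
   *
   *   Schema customersSchema = Schema.recordOf("customers",
   *     Schema.Field.of("id", Schema.of(Schema.Type.LONG)),
   *     Schema.Field.of("email", Schema.of(Schema.Type.STRING)));
   *   OutputFormatProvider provider =
   *     new MultiSinkOutputFormatProvider(baseConf, "customers", customersSchema, "tablename");
   */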

  @Override
  public String getOutputFormatClassName() {
    return MultiSinkOutputFormatDelegate.class.getName();
  }

  @Override
  public Map<String, String> getOutputFormatConfiguration() {
    Map<String, String> map = BigQueryUtil.configToMap(config);
    map.put(JobContext.OUTPUT_KEY_CLASS, AvroKey.class.getName());
    return map;
  }
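
  /* For reference, a sketch of what the returned configuration carries, assuming the
   * provider from the sketch above. The filter keys come from this class; the output
   * key class key is Hadoop's MRJobConfig.OUTPUT_KEY_CLASS; the values shown are
   * illustrative.
   *
   *   Map<String, String> conf = provider.getOutputFormatConfiguration();
   *   conf.get("bq.multi.record.filter.value");    // "customers"
   *   conf.get("bq.multi.record.filter.field");    // "tablename"
   *   conf.get("mapreduce.job.output.key.class");  // "org.apache.avro.mapred.AvroKey"
   */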

  /**
   * Uses {@link BigQueryOutputFormat} as the delegate and wraps its record writer in a
   * {@link FilterRecordWriter}, so that only records matching the configured filter field
   * and value are written out, re-shaped to the configured schema.
   */
  public static class MultiSinkOutputFormatDelegate extends OutputFormat<StructuredRecord, NullWritable> {
    private final OutputFormat<StructuredRecord, NullWritable> delegate;

    public MultiSinkOutputFormatDelegate() {
      this.delegate = new BigQueryOutputFormat();
    }

    @Override
    public RecordWriter<StructuredRecord, NullWritable> getRecordWriter(TaskAttemptContext taskAttemptContext)
      throws IOException, InterruptedException {
      Configuration conf = taskAttemptContext.getConfiguration();
      String filterField = conf.get(FILTER_FIELD);
      String filterValue = conf.get(FILTER_VALUE);
      Schema schema = Schema.parseJson(conf.get(BigQueryConstants.CDAP_BQ_SINK_OUTPUT_SCHEMA));
      @SuppressWarnings("unchecked")
      RecordWriter<StructuredRecord, NullWritable> recordWriter = delegate.getRecordWriter(taskAttemptContext);
      return new FilterRecordWriter(filterField, filterValue, schema, recordWriter);
    }

    @Override
    public void checkOutputSpecs(JobContext jobContext) throws IOException, InterruptedException {
      delegate.checkOutputSpecs(jobContext);
    }

    @Override
    public OutputCommitter getOutputCommitter(TaskAttemptContext taskAttemptContext)
      throws IOException, InterruptedException {
      return delegate.getOutputCommitter(taskAttemptContext);
    }
  }

  /**
   * Filters records by the configured filter field and value, projects the surviving
   * fields onto the given schema, and writes the result out through the delegate writer.
   */
  public static class FilterRecordWriter extends RecordWriter<StructuredRecord, NullWritable> {
    private final String filterField;
    private final String filterValue;
    private final Schema schema;
    private final RecordWriter<StructuredRecord, NullWritable> delegate;

    public FilterRecordWriter(String filterField,
                              String filterValue,
                              Schema schema,
                              RecordWriter<StructuredRecord, NullWritable> delegate) {
      this.filterField = filterField;
      this.filterValue = filterValue;
      this.schema = schema;
      this.delegate = delegate;
    }

    @Override
    public void write(StructuredRecord key, NullWritable value) throws IOException, InterruptedException {
      Object objectValue = key.get(filterField);
      if (objectValue == null) {
        return;
      }
      String name = (String) objectValue;
      // the filter field may hold a qualified "dataset.table" name; keep only the table part
      String[] split = name.split("\\.");
      if (split.length == 2) {
        name = split[1];
      }
      // drop records destined for a different table
      if (!filterValue.equalsIgnoreCase(name)) {
        return;
      }
      // re-build the record against the target table schema, omitting the filter field
      // and any fields that schema does not declare
      StructuredRecord.Builder builder = StructuredRecord.builder(schema);
      key.getSchema().getFields().stream()
        .filter(entry -> !filterField.equals(entry.getName()))
        .filter(entry -> schema.getField(entry.getName()) != null)
        .forEach(entry -> builder.set(entry.getName(), key.get(entry.getName())));
      delegate.write(builder.build(), value);
    }
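
    /* A hedged illustration of the filtering above, not part of the original source:
     * a record whose filter field holds "dataset.customers" passes when filterValue is
     * "customers" (the dataset prefix is stripped and matching is case-insensitive),
     * while a record holding "orders" is silently dropped. The inputSchema and field
     * names are illustrative assumptions.
     *
     *   StructuredRecord match = StructuredRecord.builder(inputSchema)
     *     .set("tablename", "dataset.customers").set("id", 1L).build();
     *   writer.write(match, NullWritable.get());  // forwarded to the delegate
     *
     *   StructuredRecord skip = StructuredRecord.builder(inputSchema)
     *     .set("tablename", "orders").set("id", 2L).build();
     *   writer.write(skip, NullWritable.get());   // filtered out; delegate not invoked
     */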

    // Converts a CDAP schema to its Avro counterpart by re-parsing the schema's JSON
    // string form. Note: this private helper is currently unused within this class.
    private org.apache.avro.Schema getAvroSchema(Schema cdapSchema) {
      return new org.apache.avro.Schema.Parser().parse(cdapSchema.toString());
    }

    @Override
    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
      delegate.close(context);
    }
}
}