
co.cask.cdap.etl.batch.connector.ConnectorSource

/*
 * Copyright © 2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.etl.batch.connector;

import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.dataset.lib.KeyValue;
import co.cask.cdap.api.dataset.lib.PartitionDetail;
import co.cask.cdap.api.dataset.lib.PartitionFilter;
import co.cask.cdap.api.dataset.lib.PartitionedFileSet;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetArguments;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetProperties;
import co.cask.cdap.api.dataset.lib.Partitioning;
import co.cask.cdap.api.workflow.WorkflowConfigurer;
import co.cask.cdap.etl.api.Emitter;
import co.cask.cdap.etl.api.batch.BatchSource;
import co.cask.cdap.etl.api.batch.BatchSourceContext;
import co.cask.cdap.format.StructuredRecordStringConverter;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.util.HashMap;
import java.util.Map;
import javax.annotation.Nullable;

/**
 * Internal batch source used as a connector between pipeline phases.
 * Though this extends BatchSource, it is not instantiated through the plugin framework, but is
 * created explicitly by the application. This is because we don't want it to show up as a plugin that
 * users can select and use, and because it relies on features not exposed in the etl api
 * (local workflow datasets).
 *
 * The batch connector is just a PartitionedFileSet, where each partition is named after the phase that
 * wrote to it. This way, multiple phases can use the same local PartitionedFileSet as a sink, and the
 * source will read data from all partitions.
 *
 * TODO: improve the storage format. It is currently a json representation of the record, which is
 * obviously not ideal.
 */
public class ConnectorSource extends BatchSource<LongWritable, Text, StructuredRecord> {
  static final Schema RECORD_WITH_SCHEMA = Schema.recordOf(
    "record",
    Schema.Field.of("schema", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("record", Schema.of(Schema.Type.STRING)));
  private final String datasetName;
  @Nullable
  private final Schema schema;

  public ConnectorSource(String datasetName, @Nullable Schema schema) {
    this.datasetName = datasetName;
    this.schema = schema;
  }

  // Not the standard configurePipeline method; a WorkflowConfigurer is needed to create a local dataset.
  // We may want to expose local datasets in cdap-etl-api, but that is a separate track.
  public void configure(WorkflowConfigurer workflowConfigurer) {
    Partitioning partitioning = Partitioning.builder()
      .addField("phase", Partitioning.FieldType.STRING)
      .build();
    workflowConfigurer.createLocalDataset(datasetName, PartitionedFileSet.class,
                                          PartitionedFileSetProperties.builder()
                                            .setPartitioning(partitioning)
                                            .setInputFormat(TextInputFormat.class)
                                            .setOutputFormat(TextOutputFormat.class)
                                            .build());
  }
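
  // Minimal usage sketch (hypothetical workflow class and dataset name, not part of this source): the
  // application calls configure() while configuring its workflow, e.g.
  //
  //   public class ETLWorkflow extends AbstractWorkflow {
  //     @Override
  //     protected void configure() {
  //       new ConnectorSource("phase1.to.phase2.connector", null).configure(getConfigurer());
  //       // ... add the programs for each pipeline phase ...
  //     }
  //   }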

  @Override
  public void prepareRun(BatchSourceContext context) throws Exception {
    Map<String, String> arguments = new HashMap<>();
    PartitionedFileSet inputFileset = context.getDataset(datasetName);
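    // Register every partition (one per phase that wrote to the connector dataset) as input,
    // so this source reads the records produced by all upstream phases.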
    for (PartitionDetail partitionDetail : inputFileset.getPartitions(PartitionFilter.ALWAYS_MATCH)) {
      PartitionedFileSetArguments.addInputPartition(arguments, partitionDetail);
    }
    context.setInput(datasetName, arguments);
  }

  @Override
  public void transform(KeyValue<LongWritable, Text> input,
                        Emitter<StructuredRecord> emitter) throws Exception {
    StructuredRecord output;
    String inputStr = input.getValue().toString();
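    // If no schema was set at configure time, each line is a JSON envelope carrying the record's schema
    // alongside the record itself; otherwise the line is the record serialized with the known schema.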
    if (schema == null) {
      StructuredRecord recordWithSchema =
        StructuredRecordStringConverter.fromJsonString(inputStr, RECORD_WITH_SCHEMA);
      Schema outputSchema = Schema.parseJson((String) recordWithSchema.get("schema"));
      output = StructuredRecordStringConverter.fromJsonString((String) recordWithSchema.get("record"), outputSchema);
    } else {
      output = StructuredRecordStringConverter.fromJsonString(inputStr, schema);
    }
    emitter.emit(output);
  }

}



