/*
* Copyright © 2017 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package io.cdap.plugin.kafka.batch.source;
import com.google.common.base.Strings;
import io.cdap.cdap.api.annotation.Description;
import io.cdap.cdap.api.annotation.Macro;
import io.cdap.cdap.api.annotation.Name;
import io.cdap.cdap.api.annotation.Plugin;
import io.cdap.cdap.api.data.batch.Input;
import io.cdap.cdap.api.data.format.FormatSpecification;
import io.cdap.cdap.api.data.format.RecordFormat;
import io.cdap.cdap.api.data.format.StructuredRecord;
import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.cdap.api.dataset.lib.KeyValue;
import io.cdap.cdap.etl.api.Emitter;
import io.cdap.cdap.etl.api.FailureCollector;
import io.cdap.cdap.etl.api.PipelineConfigurer;
import io.cdap.cdap.etl.api.StageConfigurer;
import io.cdap.cdap.etl.api.batch.BatchRuntimeContext;
import io.cdap.cdap.etl.api.batch.BatchSource;
import io.cdap.cdap.etl.api.batch.BatchSourceContext;
import io.cdap.cdap.format.RecordFormats;
import io.cdap.plugin.common.ConfigUtil;
import io.cdap.plugin.common.KeyValueListParser;
import io.cdap.plugin.common.LineageRecorder;
import io.cdap.plugin.common.SourceInputFormatProvider;
import io.cdap.plugin.common.batch.JobUtils;
import io.cdap.plugin.kafka.common.KafkaHelpers;
import io.cdap.plugin.kafka.connector.KafkaConnectorConfig;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.mapreduce.Job;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import java.io.IOException;
import java.net.URI;
import java.nio.ByteBuffer;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import javax.annotation.Nullable;
/**
 * Kafka batch source. Reads messages from a Kafka topic as a batch, and tracks the offsets that
 * were consumed so that a subsequent run can pick up where the previous one left off.
 */
@Plugin(type = BatchSource.PLUGIN_TYPE)
@Name(KafkaBatchSource.NAME)
@Description("Kafka batch source.")
public class KafkaBatchSource extends BatchSource<KafkaKey, KafkaMessage, StructuredRecord> {
public static final String NAME = "Kafka";
private final Kafka10BatchConfig config;
private List<KafkaRequest> kafkaRequests;
private Schema schema;
private RecordFormat<ByteBuffer, StructuredRecord> recordFormat;
private String messageField;
private FileContext fileContext;
private Path offsetsFile;
/**
* Config properties for the plugin.
*/
public static class Kafka10BatchConfig extends KafkaBatchConfig {
@Description("Additional kafka consumer properties to set.")
@Macro
@Nullable
private String kafkaProperties;
@Description("The kerberos principal used for the source when kerberos security is enabled for kafka.")
@Macro
@Nullable
private String principal;
@Description("The keytab location for the kerberos principal when kerberos security is enabled for kafka.")
@Macro
@Nullable
private String keytabLocation;
@Name(ConfigUtil.NAME_USE_CONNECTION)
@Nullable
@Description("Whether to use an existing connection.")
private Boolean useConnection;
@Name(ConfigUtil.NAME_CONNECTION)
@Macro
@Nullable
@Description("The connection to use.")
private KafkaConnectorConfig connection;
public Kafka10BatchConfig() {
super();
}
public Kafka10BatchConfig(String partitions, String topic, String initialPartitionOffsets) {
super(partitions, topic, initialPartitionOffsets);
}
public String getBrokers() {
return connection.getKafkaBrokers();
}
@Nullable
public String getPrincipal() {
return principal;
}
@Nullable
public String getKeytabLocation() {
return keytabLocation;
}
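/**
 * Parses the additional consumer properties into a map. The value is expected to be a
 * comma-separated list of key:value pairs, for example
 * {@code fetch.max.wait.ms:500,max.partition.fetch.bytes:1048576}.
 */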
public Map<String, String> getKafkaProperties() {
KeyValueListParser kvParser = new KeyValueListParser("\\s*,\\s*", ":");
Map<String, String> conf = new HashMap<>();
if (!Strings.isNullOrEmpty(kafkaProperties)) {
for (KeyValue<String, String> keyVal : kvParser.parse(kafkaProperties)) {
conf.put(keyVal.getKey(), keyVal.getValue());
}
}
return conf;
}
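/**
 * Validates the config, collecting failures for an invalid connection, an incomplete
 * Kerberos principal/keytab pair, or a malformed broker list.
 */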
public void validate(FailureCollector collector) {
super.validate(collector);
ConfigUtil.validateConnection(this, useConnection, connection, collector);
KafkaHelpers.validateKerberosSetting(principal, keytabLocation, collector);
if (connection != null && connection.getKafkaBrokers() != null) {
parseBrokerMap(connection.getKafkaBrokers(), collector);
}
}
}
public KafkaBatchSource(Kafka10BatchConfig config) {
this.config = config;
}
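// Deployment-time validation: validate the config and set the stage's output schema.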
@Override
public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
StageConfigurer stageConfigurer = pipelineConfigurer.getStageConfigurer();
FailureCollector failureCollector = stageConfigurer.getFailureCollector();
config.validate(failureCollector);
stageConfigurer.setOutputSchema(config.getSchema(failureCollector));
failureCollector.getOrThrowException();
}
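// Runtime preparation: load previously saved offsets (if an offset directory is configured),
// compute the Kafka requests for this run, record lineage, and register the input format.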
@Override
public void prepareRun(BatchSourceContext context) throws Exception {
Job job = JobUtils.createInstance();
Configuration conf = job.getConfiguration();
FailureCollector failureCollector = context.getFailureCollector();
KafkaPartitionOffsets partitionOffsets = config.getInitialPartitionOffsets(failureCollector);
Schema schema = config.getSchema(failureCollector);
Set<Integer> partitions = config.getPartitions(failureCollector);
failureCollector.getOrThrowException();
// If the offset directory is provided, try to load the file
if (!context.isPreviewEnabled() && config.getOffsetDir() != null) {
Path offsetDir = new Path(URI.create(config.getOffsetDir()));
fileContext = FileContext.getFileContext(offsetDir.toUri(), conf);
try {
fileContext.mkdir(offsetDir, new FsPermission("700"), true);
} catch (FileAlreadyExistsException e) {
// It's ok if the parent already exists
}
offsetsFile = KafkaBatchConfig.getOffsetFilePath(offsetDir, context.getNamespace(), context.getPipelineName());
// Load the offset from the offset file
partitionOffsets = KafkaPartitionOffsets.load(fileContext, offsetsFile);
}
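// Build the consumer configuration used by the input format to connect to Kafka.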
Map<String, String> kafkaConf = new HashMap<>();
kafkaConf.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, config.getBrokers());
// We save offsets in datasets, no need for Kafka to save them
kafkaConf.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
KafkaHelpers.setupKerberosLogin(kafkaConf, config.getPrincipal(), config.getKeytabLocation());
kafkaConf.putAll(config.getKafkaProperties());
kafkaRequests = KafkaInputFormat.saveKafkaRequests(conf, config.getTopic(), kafkaConf,
partitions,
config.getMaxNumberRecords(),
partitionOffsets);
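// Record field-level lineage for the fields that will be read from the topic.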
LineageRecorder lineageRecorder = new LineageRecorder(context, config.referenceName);
if (schema != null) {
lineageRecorder.createExternalDataset(schema);
if (schema.getFields() != null && !schema.getFields().isEmpty()) {
lineageRecorder.recordRead("Read", "Read from Kafka topic.",
schema.getFields().stream().map(Schema.Field::getName).collect(Collectors.toList()));
}
}
context.setInput(Input.of(config.referenceName,
new SourceInputFormatProvider(KafkaInputFormat.class, conf)));
}
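// After a successful (non-preview) run, persist the end offsets so the next run resumes from them.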
@Override
public void onRunFinish(boolean succeeded, BatchSourceContext context) {
if (context.isPreviewEnabled()) {
return;
}
if (succeeded && kafkaRequests != null && fileContext != null && offsetsFile != null) {
KafkaPartitionOffsets partitionOffsets = new KafkaPartitionOffsets(
kafkaRequests.stream().collect(Collectors.toMap(KafkaRequest::getPartition, KafkaRequest::getEndOffset)));
try {
KafkaPartitionOffsets.save(fileContext, offsetsFile, partitionOffsets);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
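// Per-task initialization: resolve the schema, pick the message field (the first field that is not
// the key, partition, or offset field), and create the record format if one is configured.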
@Override
public void initialize(BatchRuntimeContext context) throws Exception {
super.initialize(context);
schema = config.getSchema(context.getFailureCollector());
Schema messageSchema = config.getMessageSchema(context.getFailureCollector());
if (schema == null || messageSchema == null) {
return;
}
for (Schema.Field field : schema.getFields()) {
String name = field.getName();
if (!name.equals(config.getKeyField()) && !name.equals(config.getPartitionField()) &&
!name.equals(config.getOffsetField())) {
messageField = name;
break;
}
}
if (config.getFormat() != null) {
FormatSpecification spec = new FormatSpecification(config.getFormat(), messageSchema, new HashMap<>());
recordFormat = RecordFormats.createInitializedFormat(spec);
}
}
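// Turn each Kafka key/message pair into a StructuredRecord, emitting the payload either as raw
// bytes or parsed with the configured format.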
@Override
public void transform(KeyValue<KafkaKey, KafkaMessage> input, Emitter<StructuredRecord> emitter) {
StructuredRecord.Builder builder = StructuredRecord.builder(schema);
if (config.getKeyField() != null) {
builder.set(config.getKeyField(), input.getValue().getKey().array());
}
if (config.getPartitionField() != null) {
builder.set(config.getPartitionField(), input.getKey().getPartition());
}
if (config.getOffsetField() != null) {
builder.set(config.getOffsetField(), input.getKey().getOffset());
}
if (config.getFormat() == null) {
builder.set(messageField, getByteArray(input.getValue().getPayload()));
} else {
StructuredRecord messageRecord = recordFormat.read(input.getValue().getPayload());
for (Schema.Field field : messageRecord.getSchema().getFields()) {
String fieldName = field.getName();
builder.set(fieldName, messageRecord.get(fieldName));
}
}
emitter.emit(builder.build());
}
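// Extracts the bytes from a ByteBuffer, avoiding a copy when the buffer is backed by an array.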
private static byte[] getByteArray(ByteBuffer buffer) {
// return underlying byte array if available.
if (buffer.hasArray()) {
return buffer.array();
}
// Copy buffer contents into new array and return the array.
byte[] arr = new byte[buffer.remaining()];
buffer.get(arr);
return arr;
}
}