/*
* Copyright © 2018 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package io.cdap.plugin.kafka.source;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import io.cdap.cdap.api.annotation.Description;
import io.cdap.cdap.api.annotation.Macro;
import io.cdap.cdap.api.data.format.FormatSpecification;
import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.cdap.api.dataset.lib.KeyValue;
import io.cdap.cdap.etl.api.FailureCollector;
import io.cdap.cdap.format.RecordFormats;
import io.cdap.plugin.common.KeyValueListParser;
import io.cdap.plugin.common.ReferencePluginConfig;
import io.cdap.plugin.kafka.common.KafkaHelpers;
import org.apache.kafka.common.TopicPartition;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.annotation.Nullable;
/**
* Conf for Kafka streaming source.
*/
@SuppressWarnings("unused")
public class KafkaConfig extends ReferencePluginConfig implements Serializable {
private static final String NAME_SCHEMA = "schema";
private static final String NAME_BROKERS = "brokers";
private static final String NAME_PARTITIONS = "partitions";
private static final String NAME_MAX_RATE = "maxRatePerPartition";
private static final String NAME_INITIAL_PARTITION_OFFSETS = "initialPartitionOffsets";
private static final String NAME_TIMEFIELD = "timeField";
private static final String NAME_KEYFIELD = "keyField";
private static final String NAME_PARTITION_FIELD = "partitionField";
private static final String NAME_OFFSET_FIELD = "offsetField";
private static final String NAME_FORMAT = "format";
private static final String SEPARATOR = ":";
public static final String OFFSET_START_FROM_BEGINNING = "Start from beginning";
public static final String OFFSET_START_FROM_LAST_OFFSET = "Start from last processed offset";
public static final String OFFSET_START_FROM_SPECIFIC_OFFSET = "Start from specific offset";
private static final long serialVersionUID = 8069169417140954175L;
@Description("List of Kafka brokers specified in host1:port1,host2:port2 form. For example, " +
"host1.example.com:9092,host2.example.com:9092.")
@Macro
private String brokers;
@Description("Kafka topic to read from.")
@Macro
private String topic;
@Description("The topic partitions to read from. If not specified, all partitions will be read.")
@Nullable
@Macro
private String partitions;
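// Example (illustrative, using the comma-separated partition:offset format parsed in
// getInitialPartitionOffsets): "0:100,1:-1" starts partition 0 at offset 100 and
// partition 1 at the latest offset.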
@Description("The initial offset for each topic partition. If this is not specified, " +
"all partitions will have the same initial offset, which is determined by the defaultInitialOffset property. " +
"An offset of -2 means the smallest offset. An offset of -1 means the latest offset. " +
"Offsets are inclusive. If an offset of 5 is used, the message at offset 5 will be read.")
@Nullable
@Macro
private String initialPartitionOffsets;
@Description("The default initial offset for all topic partitions. Defaults to latest offset.")
@Nullable
@Macro
private Long defaultInitialOffset;
@Description("The initial offset for all topic partitions. " +
"Start from beginning means the smallest offset will be set. " +
"Start from last processed offset means the latest offset will be set. Defaults to null. " +
"If start from specific offset is selected default initial offset must be provided." +
"If you wish to set different initial offsets for different partitions, use the initialPartitionOffsets property.")
@Nullable
@Macro
private String initialOffset;
@Description("Output schema of the source, including the timeField and keyField. " +
"The fields excluding the timeField and keyField are used in conjunction with the format " +
"to parse Kafka payloads.")
private String schema;
@Description("Optional format of the Kafka event. Any format supported by CDAP is supported. " +
"For example, a value of 'csv' will attempt to parse Kafka payloads as comma-separated values. " +
"If no format is given, Kafka message payloads will be treated as bytes.")
@Nullable
private String format;
@Description("Optional name of the field containing the read time of the batch. " +
"If this is not set, no time field will be added to output records. " +
"If set, this field must be present in the schema property and must be a long.")
@Nullable
private String timeField;
@Description("Optional name of the field containing the message key. " +
"If this is not set, no key field will be added to output records. " +
"If set, this field must be present in the schema property and must be bytes.")
@Nullable
private String keyField;
@Description("Optional name of the field containing the kafka partition that was read from. " +
"If this is not set, no partition field will be added to output records. " +
"If set, this field must be present in the schema property and must be an integer.")
@Nullable
private String partitionField;
@Description("Optional name of the field containing the kafka offset that the message was read from. " +
"If this is not set, no offset field will be added to output records. " +
"If set, this field must be present in the schema property and must be a long.")
@Nullable
private String offsetField;
@Description("Max number of records to read per second per partition. 0 means there is no limit. Defaults to 1000.")
@Nullable
private Integer maxRatePerPartition;
@Description("Additional kafka consumer properties to set.")
@Macro
@Nullable
private String kafkaProperties;
@Description("The kerberos principal used for the source when kerberos security is enabled for kafka.")
@Macro
@Nullable
private String principal;
@Description("The keytab location for the kerberos principal when kerberos security is enabled for kafka.")
@Macro
@Nullable
private String keytabLocation;
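/**
 * Applies the documented defaults: start from the latest offset (-1) and read at most
 * 1000 records per second per partition.
 */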
public KafkaConfig() {
super("");
defaultInitialOffset = -1L;
maxRatePerPartition = 1000;
}
public String getTopic() {
return topic;
}
public String getBrokers() {
return brokers;
}
@Nullable
public String getTimeField() {
return Strings.isNullOrEmpty(timeField) ? null : timeField;
}
@Nullable
public String getKeyField() {
return Strings.isNullOrEmpty(keyField) ? null : keyField;
}
@Nullable
public String getPartitionField() {
return Strings.isNullOrEmpty(partitionField) ? null : partitionField;
}
@Nullable
public String getOffsetField() {
return Strings.isNullOrEmpty(offsetField) ? null : offsetField;
}
@Nullable
public String getFormat() {
return Strings.isNullOrEmpty(format) ? null : format;
}
@Nullable
public Integer getMaxRatePerPartition() {
return maxRatePerPartition;
}
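/**
 * Resolves the default initial offset. The initialOffset dropdown values map to Kafka's
 * sentinel offsets: -2L for "Start from beginning" and -1L for "Start from last processed
 * offset". For "Start from specific offset", or when no choice is made, the configured
 * defaultInitialOffset is returned.
 */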
@Nullable
public Long getDefaultInitialOffset() {
if (!containsMacro("initialOffset") && !Strings.isNullOrEmpty(initialOffset)) {
if (!initialOffset.equals(OFFSET_START_FROM_SPECIFIC_OFFSET)) {
if (initialOffset.equals(OFFSET_START_FROM_BEGINNING)) {
return -2L;
}
if (initialOffset.equals(OFFSET_START_FROM_LAST_OFFSET)) {
return -1L;
}
}
}
return defaultInitialOffset;
}
@Nullable
public Schema getSchema() {
try {
return Strings.isNullOrEmpty(schema) ? null : Schema.parseJson(schema);
} catch (IOException e) {
throw new IllegalArgumentException("Invalid schema : " + e.getMessage());
}
}
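/**
 * Same as {@link #getSchema()}, but an invalid schema is reported through the failure
 * collector instead of being thrown as an IllegalArgumentException.
 */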
@Nullable
public Schema getSchema(FailureCollector collector) {
try {
return Strings.isNullOrEmpty(schema) ? null : Schema.parseJson(schema);
} catch (IOException e) {
collector.addFailure("Invalid schema : " + e.getMessage(), null).withConfigProperty(NAME_SCHEMA);
}
throw collector.getOrThrowException();
}
/**
 * Gets the message schema from the schema property. If the time, key, partition, or offset
 * fields are present in the configured schema, they are removed, since they are populated from
 * record metadata rather than parsed from the message payload.
 */
public Schema getMessageSchema() {
Schema schema = getSchema();
List<Schema.Field> messageFields = new ArrayList<>();
boolean timeFieldExists = false;
boolean keyFieldExists = false;
boolean partitionFieldExists = false;
boolean offsetFieldExists = false;
for (Schema.Field field : schema.getFields()) {
String fieldName = field.getName();
Schema fieldSchema = field.getSchema().isNullable() ? field.getSchema().getNonNullable() : field.getSchema();
Schema.Type fieldType = fieldSchema.getType();
// if the field is not one of the time, key, partition, or offset fields, it is a message field.
if (fieldName.equals(timeField)) {
if (fieldType != Schema.Type.LONG || fieldSchema.getLogicalType() != null) {
throw new IllegalArgumentException("The time field must be of type long or nullable long.");
}
timeFieldExists = true;
} else if (fieldName.equals(keyField)) {
if (fieldType != Schema.Type.BYTES || fieldSchema.getLogicalType() != null) {
throw new IllegalArgumentException("The key field must be of type bytes or nullable bytes.");
}
keyFieldExists = true;
} else if (fieldName.equals(partitionField)) {
if (fieldType != Schema.Type.INT || fieldSchema.getLogicalType() != null) {
throw new IllegalArgumentException("The partition field must be of type int.");
}
partitionFieldExists = true;
} else if (fieldName.equals(offsetField)) {
if (fieldType != Schema.Type.LONG || fieldSchema.getLogicalType() != null) {
throw new IllegalArgumentException("The offset field must be of type long.");
}
offsetFieldExists = true;
} else {
messageFields.add(field);
}
}
if (messageFields.isEmpty()) {
throw new IllegalArgumentException(
"Schema must contain at least one other field besides the time and key fields.");
}
if (getTimeField() != null && !timeFieldExists) {
throw new IllegalArgumentException(String.format(
"timeField '%s' does not exist in the schema. Please add it to the schema.", timeField));
}
if (getKeyField() != null && !keyFieldExists) {
throw new IllegalArgumentException(String.format(
"keyField '%s' does not exist in the schema. Please add it to the schema.", keyField));
}
if (getPartitionField() != null && !partitionFieldExists) {
throw new IllegalArgumentException(String.format(
"partitionField '%s' does not exist in the schema. Please add it to the schema.", partitionField));
}
if (getOffsetField() != null && !offsetFieldExists) {
throw new IllegalArgumentException(String.format(
"offsetField '%s' does not exist in the schema. Please add it to the schema.", offsetFieldExists));
}
return Schema.recordOf("kafka.message", messageFields);
}
/**
 * Gets the message schema from the schema property, reporting problems through the failure
 * collector. If the time, key, partition, or offset fields are present in the configured
 * schema, they are removed, since they are populated from record metadata rather than parsed
 * from the message payload.
 */
public Schema getMessageSchema(FailureCollector collector) {
Schema schema = getSchema(collector);
List<Schema.Field> messageFields = new ArrayList<>();
boolean timeFieldExists = false;
boolean keyFieldExists = false;
boolean partitionFieldExists = false;
boolean offsetFieldExists = false;
for (Schema.Field field : schema.getFields()) {
String fieldName = field.getName();
Schema fieldSchema = field.getSchema().isNullable() ? field.getSchema().getNonNullable() : field.getSchema();
Schema.Type fieldType = fieldSchema.getType();
// if the field is not one of the time, key, partition, or offset fields, it is a message field.
if (fieldName.equals(timeField)) {
if (fieldType != Schema.Type.LONG || fieldSchema.getLogicalType() != null) {
collector.addFailure("The time field must be of type long or nullable long.", null)
.withConfigProperty(NAME_TIMEFIELD).withOutputSchemaField(timeField);
}
timeFieldExists = true;
} else if (fieldName.equals(keyField)) {
if (fieldType != Schema.Type.BYTES || fieldSchema.getLogicalType() != null) {
collector.addFailure("The key field must be of type bytes or nullable bytes.", null)
.withConfigProperty(NAME_KEYFIELD).withOutputSchemaField(keyField);
}
keyFieldExists = true;
} else if (fieldName.equals(partitionField)) {
if (fieldType != Schema.Type.INT || fieldSchema.getLogicalType() != null) {
collector.addFailure("The partition field must be of type int.", null)
.withConfigProperty(NAME_PARTITION_FIELD).withOutputSchemaField(partitionField);
}
partitionFieldExists = true;
} else if (fieldName.equals(offsetField)) {
if (fieldType != Schema.Type.LONG || fieldSchema.getLogicalType() != null) {
collector.addFailure("The offset field must be of type long.", null)
.withConfigProperty(NAME_OFFSET_FIELD).withOutputSchemaField(offsetField);
}
offsetFieldExists = true;
} else {
messageFields.add(field);
}
}
if (getTimeField() != null && !timeFieldExists) {
collector.addFailure(String.format("Time field '%s' must exist in schema.", timeField), null)
.withConfigProperty(NAME_TIMEFIELD);
}
if (getKeyField() != null && !keyFieldExists) {
collector.addFailure(String.format("Key field '%s' must exist in schema.", keyField), null)
.withConfigProperty(NAME_KEYFIELD);
}
if (getPartitionField() != null && !partitionFieldExists) {
collector.addFailure(String.format("Partition field '%s' must exist in schema.", partitionField), null)
.withConfigProperty(NAME_PARTITION_FIELD);
}
if (getOffsetField() != null && !offsetFieldExists) {
collector.addFailure(String.format("Offset field '%s' must exist in schema.", offsetField), null)
.withConfigProperty(NAME_OFFSET_FIELD);
}
if (messageFields.isEmpty()) {
collector.addFailure("Schema must contain at least one other field besides the time and key fields.", null);
throw collector.getOrThrowException();
}
return Schema.recordOf("kafka.message", messageFields);
}
/**
* Get the initial partition offsets for the specified partitions. If an initial offset is specified in the
* initialPartitionOffsets property, that value will be used. Otherwise, the defaultInitialOffset will be used.
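* For example (illustrative values), with partitionsToRead = [0, 1], defaultInitialOffset = -1,
* and initialPartitionOffsets = "0:100", partition 0 starts at offset 100 and partition 1 starts
* at the latest offset.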
*
* @param partitionsToRead the partitions to read
* @param collector failure collector
* @return initial partition offsets.
*/
public Map<TopicPartition, Long> getInitialPartitionOffsets(Set<Integer> partitionsToRead,
                                                            FailureCollector collector) {
Map<TopicPartition, Long> partitionOffsets = new HashMap<>();
// set default initial partitions
final Long defaultInitialOffset = getDefaultInitialOffset();
for (Integer partition : partitionsToRead) {
partitionOffsets.put(new TopicPartition(topic, partition), defaultInitialOffset);
}
// if initial partition offsets are specified, overwrite the defaults.
if (initialPartitionOffsets != null) {
for (KeyValue<String, String> partitionAndOffset : KeyValueListParser.DEFAULT.parse(initialPartitionOffsets)) {
String partitionStr = partitionAndOffset.getKey();
String offsetStr = partitionAndOffset.getValue();
int partition;
try {
partition = Integer.parseInt(partitionStr);
} catch (NumberFormatException e) {
collector.addFailure(
String.format("Invalid partition '%s' in initialPartitionOffsets.", partitionStr),
"Partition must be a valid integer.")
.withConfigElement(NAME_INITIAL_PARTITION_OFFSETS, partitionStr + SEPARATOR + offsetStr);
continue;
}
long offset;
try {
offset = Long.parseLong(offsetStr);
} catch (NumberFormatException e) {
collector.addFailure(
String.format("Invalid offset '%s' in initialPartitionOffsets for partition %d.", offsetStr, partition),
"Offset muse be a valid integer.")
.withConfigElement(NAME_INITIAL_PARTITION_OFFSETS, partitionStr + SEPARATOR + offsetStr);
continue;
}
partitionOffsets.put(new TopicPartition(topic, partition), offset);
}
}
return partitionOffsets;
}
/**
* @return broker host to broker port mapping.
*/
public Map<String, Integer> getBrokerMap(FailureCollector collector) {
  Map<String, Integer> brokerMap = new HashMap<>();
try {
Iterable<KeyValue<String, String>> parsed = KeyValueListParser.DEFAULT.parse(brokers);
for (KeyValue<String, String> hostAndPort : parsed) {
String host = hostAndPort.getKey();
String portStr = hostAndPort.getValue();
try {
brokerMap.put(host, Integer.parseInt(portStr));
} catch (NumberFormatException e) {
collector.addFailure(String.format("Invalid port '%s' for host '%s'.", portStr, host),
"It should be a valid port number.")
.withConfigElement(NAME_BROKERS, host + SEPARATOR + portStr);
}
}
} catch (IllegalArgumentException e) {
// ignore the parse error; an invalid brokers string is reported below when the map is empty.
}
if (brokerMap.isEmpty()) {
collector.addFailure("Kafka brokers must be provided in host:port format.", null)
.withConfigProperty(NAME_BROKERS);
}
return brokerMap;
}
/**
* @return set of partitions to read from. Returns an empty set if no partitions were specified.
*/
public Set<Integer> getPartitions(FailureCollector collector) {
  Set<Integer> partitionSet = new HashSet<>();
if (Strings.isNullOrEmpty(partitions)) {
return partitionSet;
}
for (String partition : Splitter.on(',').trimResults().split(partitions)) {
try {
partitionSet.add(Integer.parseInt(partition));
} catch (NumberFormatException e) {
collector.addFailure(String.format("Invalid partition '%s'.", partition), "Partitions must be integers.")
.withConfigElement(NAME_PARTITIONS, partition);
}
}
return partitionSet;
}
@Nullable
public String getPrincipal() {
return principal;
}
@Nullable
public String getKeytabLocation() {
return keytabLocation;
}
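/**
 * @return additional kafka consumer properties, parsed from a comma-separated list of
 * key:value pairs. Returns an empty map if none were specified.
 */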
public Map<String, String> getKafkaProperties() {
KeyValueListParser kvParser = new KeyValueListParser("\\s*,\\s*", ":");
Map<String, String> conf = new HashMap<>();
if (!Strings.isNullOrEmpty(kafkaProperties)) {
for (KeyValue<String, String> keyVal : kvParser.parse(kafkaProperties)) {
conf.put(keyVal.getKey(), keyVal.getValue());
}
}
return conf;
}
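/**
 * Validates the config, collecting as many failures as possible: broker format, partitions
 * and initial offsets, the rate limit, time/key field collisions, schema compatibility with
 * the chosen format, and the kerberos settings.
 */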
public void validate(FailureCollector collector) {
// brokers can be null since it is macro enabled.
if (!Strings.isNullOrEmpty(brokers)) {
getBrokerMap(collector);
}
Set<Integer> partitions = getPartitions(collector);
getInitialPartitionOffsets(partitions, collector);
if (maxRatePerPartition == null) {
  collector.addFailure("Max rate per partition must be provided.", null)
    .withConfigProperty(NAME_MAX_RATE);
} else if (maxRatePerPartition < 0) {
  collector.addFailure(String.format("Invalid maxRatePerPartition '%d'.", maxRatePerPartition),
                       "Rate must be 0 or greater.").withConfigProperty(NAME_MAX_RATE);
}
if (!Strings.isNullOrEmpty(timeField) && !Strings.isNullOrEmpty(keyField) && timeField.equals(keyField)) {
collector.addFailure(String.format(
"The timeField and keyField cannot both have the same name (%s).", timeField), null)
.withConfigProperty(NAME_TIMEFIELD).withConfigProperty(NAME_KEYFIELD);
}
Schema messageSchema = getMessageSchema(collector);
// if format is empty, there must be just a single message field of type bytes or nullable bytes.
if (Strings.isNullOrEmpty(format)) {
List<Schema.Field> messageFields = messageSchema.getFields();
if (messageFields.size() > 1) {
for (Schema.Field messageField : messageFields) {
collector.addFailure(
"Without a format, the schema must contain just a single message field of type bytes or nullable bytes.",
String.format("Remove field '%s'.", messageField.getName()))
.withOutputSchemaField(messageField.getName()).withConfigProperty(NAME_FORMAT);
}
return;
}
Schema.Field messageField = messageFields.get(0);
Schema messageFieldSchema = messageField.getSchema().isNullable() ? messageField.getSchema().getNonNullable() :
messageField.getSchema();
Schema.Type messageFieldType = messageFieldSchema.getType();
if (messageFieldType != Schema.Type.BYTES || messageFieldSchema.getLogicalType() != null) {
collector.addFailure(
String.format("Without a format, the message field must be of type bytes or nullable bytes, " +
"but field %s is of type %s.",
messageField.getName(), messageField.getSchema().getDisplayName()), null)
.withOutputSchemaField(messageField.getName()).withConfigProperty(NAME_FORMAT);
}
} else {
// otherwise, if there is a format, make sure we can instantiate it.
FormatSpecification formatSpec = new FormatSpecification(format, messageSchema, new HashMap<>());
try {
RecordFormats.createInitializedFormat(formatSpec);
} catch (Exception e) {
collector.addFailure(String.format(
"Unable to instantiate a message parser from format '%s': %s",
format, e.getMessage()), null).withStacktrace(e.getStackTrace()).withConfigProperty(NAME_FORMAT);
}
}
KafkaHelpers.validateKerberosSetting(principal, keytabLocation, collector);
}
}