/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.converter;

import com.google.common.base.Optional;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.WorkUnitState;
import gobblin.converter.filter.AvroProjectionConverter;
import gobblin.converter.filter.AvroSchemaFieldRemover;
import gobblin.metrics.kafka.KafkaSchemaRegistry;
import gobblin.metrics.kafka.KafkaSchemaRegistryFactory;
import gobblin.metrics.kafka.SchemaRegistryException;
import gobblin.util.AvroUtils;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.ExecutionException;
import javax.xml.bind.DatatypeConverter;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.Decoder;
import org.apache.avro.io.DecoderFactory;

/**
 * A converter that extracts the real schema and record from an envelope record.
 * Input schema: envelope schema. It must contain a schema-id field (default {@code payloadSchemaId},
 *               holding the schema registry key of the output schema) and a payload field (default
 *               {@code payload}, holding the serialized bytes of the output record).
 * Input record: a record conforming to the input schema.
 * Output schema: schema obtained from schema registry using key provided in input record's {@link #PAYLOAD_SCHEMA_ID_FIELD}
 * Output record: record corresponding to output schema obtained from input record's {@link #PAYLOAD_FIELD} as bytes
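 *
 * <p>For illustration only, an envelope schema using the default field names might look like
 * (field names are the defaults defined below, not mandated by the registry):
 * <pre>{@code
 * {
 *   "type": "record",
 *   "name": "Envelope",
 *   "fields": [
 *     {"name": "payloadSchemaId", "type": "string"},
 *     {"name": "payload", "type": "bytes"}
 *   ]
 * }
 * }</pre>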
 */
public class EnvelopeSchemaConverter extends Converter<Schema, String, GenericRecord, GenericRecord> {

  public static final String PAYLOAD_SCHEMA_ID_FIELD = "EnvelopeSchemaConverter.schemaIdField";
  public static final String PAYLOAD_FIELD = "EnvelopeSchemaConverter.payloadField";
  public static final String DEFAULT_PAYLOAD_SCHEMA_ID_FIELD = "payloadSchemaId";
  public static final String DEFAULT_PAYLOAD_FIELD = "payload";
  public static final String DEFAULT_KAFKA_SCHEMA_REGISTRY_FACTORY_CLASS = "gobblin.metrics.kafka.KafkaAvroSchemaRegistryFactory";
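
  // Illustrative job configuration overriding the defaults above. The converter is assumed to be
  // enabled through Gobblin's converter.classes property; the field-name values are hypothetical:
  //   converter.classes=gobblin.converter.EnvelopeSchemaConverter
  //   EnvelopeSchemaConverter.schemaIdField=mySchemaIdField
  //   EnvelopeSchemaConverter.payloadField=myPayloadField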

  private Optional<AvroSchemaFieldRemover> fieldRemover;
  private KafkaSchemaRegistry registry;
  private DecoderFactory decoderFactory;
  private LoadingCache<Schema, GenericDatumReader<GenericRecord>> readers;

  /**
   * To remove certain fields from the Avro schema or records of a topic/table, set property
   * {topic/table name}.remove.fields={comma-separated, fully qualified field names} in workUnit.
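   * Example (illustrative topic and field names): mytopic.remove.fields=header.memberId,auditInfo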
   */
  @Override
  public EnvelopeSchemaConverter init(WorkUnitState workUnit) {
    // Default to no field removal; a remover is configured only when the property is present.
    // (Leaving fieldRemover unassigned here would NPE later in convertRecord.)
    this.fieldRemover = Optional.absent();
    if (workUnit.contains(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY)) {
      String removeFieldsPropName =
          workUnit.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY) + AvroProjectionConverter.REMOVE_FIELDS;
      if (workUnit.contains(removeFieldsPropName)) {
        this.fieldRemover = Optional.of(new AvroSchemaFieldRemover(workUnit.getProp(removeFieldsPropName)));
      }
    }
    String registryFactoryField = workUnit.contains(KafkaSchemaRegistryFactory.KAFKA_SCHEMA_REGISTRY_FACTORY_CLASS) ?
        workUnit.getProp(KafkaSchemaRegistryFactory.KAFKA_SCHEMA_REGISTRY_FACTORY_CLASS) : DEFAULT_KAFKA_SCHEMA_REGISTRY_FACTORY_CLASS;
    try {
      KafkaSchemaRegistryFactory registryFactory =
          ((Class<? extends KafkaSchemaRegistryFactory>) Class.forName(registryFactoryField)).newInstance();
      this.registry = registryFactory.create(workUnit.getProperties());
    } catch (ClassNotFoundException | IllegalAccessException | InstantiationException e) {
      // Fail fast instead of silently returning a null converter.
      throw new RuntimeException("Unable to instantiate schema registry factory " + registryFactoryField, e);
    }
    this.decoderFactory = DecoderFactory.get();
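    // Cache one datum reader per payload schema so records sharing a schema reuse the same reader.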
    this.readers = CacheBuilder.newBuilder().build(new CacheLoader<Schema, GenericDatumReader<GenericRecord>>() {
      @Override
      public GenericDatumReader<GenericRecord> load(final Schema key) throws Exception {
        return new GenericDatumReader<>(key);
      }
    });
    return this;
  }

  /**
   * Returns the converter class name as a placeholder output schema; the actual schema is resolved
   * per record in {@link #convertRecord} via the schema registry.
   */
  @Override
  public String convertSchema(Schema inputSchema, WorkUnitState workUnit) throws SchemaConversionException {
    return EnvelopeSchemaConverter.class.getName();
  }

  /**
   * Get actual schema from registry and deserialize payload using it.
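   * For example (illustrative values), a record whose payloadSchemaId field holds "abc123" and whose
   * payload field holds hex-encoded serialized bytes yields that record deserialized with the
   * registry schema registered under "abc123".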
   */
  @Override
  public Iterable<GenericRecord> convertRecord(String outputSchema, GenericRecord inputRecord, WorkUnitState workUnit)
      throws DataConversionException {
    try {
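      // Resolve the configured envelope field names, falling back to the defaults.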
      String schemaIdField = workUnit.contains(PAYLOAD_SCHEMA_ID_FIELD) ?
          workUnit.getProp(PAYLOAD_SCHEMA_ID_FIELD) : DEFAULT_PAYLOAD_SCHEMA_ID_FIELD;
      String payloadField = workUnit.contains(PAYLOAD_FIELD) ?
          workUnit.getProp(PAYLOAD_FIELD) : DEFAULT_PAYLOAD_FIELD;
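      // Look up the payload schema in the registry using the id carried in the record.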
      String schemaKey = String.valueOf(inputRecord.get(schemaIdField));
      Schema payloadSchema = (Schema) this.registry.getSchemaByKey(schemaKey);
      byte[] payload = getPayload(inputRecord, payloadField);
      GenericRecord outputRecord = deserializePayload(payload, payloadSchema);
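      // Field removal rewrites only the schema; convertRecordSchema below projects the record onto it.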
      if (this.fieldRemover.isPresent()) {
        payloadSchema = this.fieldRemover.get().removeFields(payloadSchema);
      }
      return new SingleRecordIterable<>(AvroUtils.convertRecordSchema(outputRecord, payloadSchema));
    } catch (IOException | SchemaRegistryException | ExecutionException e) {
      throw new DataConversionException(e);
    }
  }

  /**
   * Gets the payload field from the input record as a {@link ByteBuffer} and decodes its
   * hex-string contents into the raw payload bytes.
   */
  public byte[] getPayload(GenericRecord inputRecord, String payloadFieldName) {
    ByteBuffer bb = (ByteBuffer) inputRecord.get(payloadFieldName);
    byte[] payloadBytes;
    if (bb.hasArray()) {
      payloadBytes = bb.array();
    } else {
      payloadBytes = new byte[bb.remaining()];
      bb.get(payloadBytes);
    }
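    // The payload bytes are expected to hold a UTF-8 hex string; decode it back to the raw Avro bytes.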
    String hexString = new String(payloadBytes, StandardCharsets.UTF_8);
    return DatatypeConverter.parseHexBinary(hexString);
  }

  /**
   * Deserializes the payload bytes using the given payload schema.
   */
  public GenericRecord deserializePayload(byte[] payload, Schema payloadSchema) throws IOException, ExecutionException {
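    // Wrap the payload in a binary decoder and read a single record, reusing a cached reader per schema.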
    Decoder decoder = this.decoderFactory.binaryDecoder(payload, null);
    GenericDatumReader<GenericRecord> reader = this.readers.get(payloadSchema);
    return reader.read(null, decoder);
  }
}