All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.hudi.avro.MercifulJsonConverter Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.avro;

import org.apache.hudi.avro.processors.DateLogicalTypeProcessor;
import org.apache.hudi.avro.processors.DecimalLogicalTypeProcessor;
import org.apache.hudi.avro.processors.DurationLogicalTypeProcessor;
import org.apache.hudi.avro.processors.EnumTypeProcessor;
import org.apache.hudi.avro.processors.FixedTypeProcessor;
import org.apache.hudi.avro.processors.JsonFieldProcessor;
import org.apache.hudi.avro.processors.LocalTimestampMicroLogicalTypeProcessor;
import org.apache.hudi.avro.processors.LocalTimestampMilliLogicalTypeProcessor;
import org.apache.hudi.avro.processors.Parser;
import org.apache.hudi.avro.processors.TimeMicroLogicalTypeProcessor;
import org.apache.hudi.avro.processors.TimeMilliLogicalTypeProcessor;
import org.apache.hudi.avro.processors.TimestampMicroLogicalTypeProcessor;
import org.apache.hudi.avro.processors.TimestampMilliLogicalTypeProcessor;
import org.apache.hudi.common.util.CollectionUtils;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieJsonConversionException;
import org.apache.hudi.exception.HoodieJsonToAvroConversionException;

import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.avro.Conversions;
import org.apache.avro.LogicalType;
import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Type;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericFixed;
import org.apache.avro.generic.GenericRecord;

import java.io.IOException;
import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.StandardCharsets;
import java.time.LocalDate;
import java.time.temporal.ChronoUnit;
import java.util.ArrayList;
import java.util.Collections;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Collectors;

/**
 * Converts Json record to Avro Generic Record.
 */
public class MercifulJsonConverter {

  // For each schema (keyed by full name), stores a mapping of schema field name to json field name to account for sanitization of fields
  private static final Map> SANITIZED_FIELD_MAPPINGS = new ConcurrentHashMap<>();
  private final Map fieldTypeProcessorMap;
  private final Map fieldLogicalTypeProcessorMap;

  protected final ObjectMapper mapper;

  protected final String invalidCharMask;
  protected final boolean shouldSanitize;

  /**
   * Creates a converter that uses a default {@link ObjectMapper} to deserialize a json string,
   * with sanitization disabled and {@code "__"} as the invalid-character mask.
   */
  public MercifulJsonConverter() {
    this(false, "__");
  }

  /**
   * Allows enabling sanitization and allows choice of invalidCharMask for sanitization.
   * A new default {@link ObjectMapper} is created for parsing.
   *
   * @param shouldSanitize  whether json field names are sanitized when matched against the schema
   * @param invalidCharMask mask passed to the sanitizer
   */
  public MercifulJsonConverter(boolean shouldSanitize, String invalidCharMask) {
    this(new ObjectMapper(), shouldSanitize, invalidCharMask);
  }

  /**
   * Allows a configured ObjectMapper to be passed for converting json records to avro record.
   *
   * @param mapper          mapper used to parse the json input
   * @param shouldSanitize  whether json field names are sanitized when matched against the schema
   * @param invalidCharMask mask passed to the sanitizer
   */
  public MercifulJsonConverter(ObjectMapper mapper, boolean shouldSanitize, String invalidCharMask) {
    this.mapper = mapper;
    this.shouldSanitize = shouldSanitize;
    this.invalidCharMask = invalidCharMask;
    // The processor maps are built last: their builders call the overridable generate*Handler()
    // factories, so the fields above are already assigned when a subclass override runs.
    this.fieldTypeProcessorMap = getFieldTypeProcessors();
    this.fieldLogicalTypeProcessorMap = getLogicalFieldTypeProcessors();
  }

  /**
   * Parses the json string into a map and converts that map to an Avro generic record.
   * NOTE: if sanitization is needed for avro conversion, the schema input to this method is already sanitized.
   *       During the conversion here, we sanitize the fields in the data
   *
   * @param json Json record
   * @param schema Schema
   * @return the record built for {@code schema}
   * @throws HoodieJsonToAvroConversionException wrapping any {@link IOException} from parsing or
   *         {@link HoodieException} raised during conversion
   */
  public GenericRecord convert(String json, Schema schema) {
    try {
      return convertJsonToAvro(mapper.readValue(json, Map.class), schema);
    } catch (IOException | HoodieException e) {
      throw new HoodieJsonToAvroConversionException("failed to convert json to avro", e);
    }
  }

  /**
   * Removes the cached sanitized-field-name mapping for the given schema full name.
   * Clear between fetches: the cache is static and keyed only by schema full name, so a stale
   * mapping can cause issues if the schema changes or if two tables share the same schemaFullName.
   *
   * @param schemaFullName full name of the schema whose cached mapping is dropped
   */
  public static void clearCache(String schemaFullName) {
    SANITIZED_FIELD_MAPPINGS.remove(schemaFullName);
  }

  /**
   * Builds a record of the given schema by looking up each schema field in the json map and
   * converting the value found. Fields with no (or a null) json value are left unset.
   */
  private GenericRecord convertJsonToAvro(Map inputJson, Schema schema) {
    GenericRecord result = new GenericData.Record(schema);
    for (Schema.Field field : schema.getFields()) {
      Object jsonValue;
      if (shouldSanitize) {
        // sanitized lookup goes through the per-schema field-name cache
        jsonValue = getFieldFromJson(field, inputJson, schema.getFullName(), invalidCharMask);
      } else {
        jsonValue = inputJson.get(field.name());
      }
      if (jsonValue == null) {
        continue;
      }
      result.put(field.pos(), convertJsonField(jsonValue, field.name(), field.schema()));
    }
    return result;
  }

  /**
   * Looks up the json value for a schema field when sanitization is enabled.
   *
   * <p>The lookup uses the per-schema cache of sanitized name to original json name. If the cache has
   * no entry for the field yet, every key of the given json is sanitized and recorded first. When the
   * resolved json name yields no value, the field's aliases are tried against the raw json keys.
   *
   * @param fieldSchema     schema field to resolve
   * @param inputJson       json object keyed by (un-sanitized) field name
   * @param schemaFullName  key of the per-schema mapping cache
   * @param invalidCharMask mask passed to {@code HoodieAvroUtils.sanitizeName}
   * @return the matching json value, or {@code null} when nothing matches
   */
  protected static Object getFieldFromJson(final Schema.Field fieldSchema, final Map<String, ?> inputJson, final String schemaFullName, final String invalidCharMask) {
    Map<String, String> schemaToJsonFieldNames = SANITIZED_FIELD_MAPPINGS.computeIfAbsent(schemaFullName, unused -> new ConcurrentHashMap<>());
    final String schemaFieldName = fieldSchema.name();
    if (!schemaToJsonFieldNames.containsKey(schemaFieldName)) {
      // if we don't have field mapping, proactively populate as many as possible based on input json
      for (String inputFieldName : inputJson.keySet()) {
        // we expect many fields won't need sanitization so check if un-sanitized field name is already present
        if (!schemaToJsonFieldNames.containsKey(inputFieldName)) {
          String sanitizedJsonFieldName = HoodieAvroUtils.sanitizeName(inputFieldName, invalidCharMask);
          // first json name seen for a sanitized name wins
          schemaToJsonFieldNames.putIfAbsent(sanitizedJsonFieldName, inputFieldName);
        }
      }
    }
    // fall back to the schema field name itself when no mapping was recorded
    Object match = inputJson.get(schemaToJsonFieldNames.getOrDefault(schemaFieldName, schemaFieldName));
    if (match != null) {
      return match;
    }
    // Check if there is an alias match
    for (String alias : fieldSchema.aliases()) {
      if (inputJson.containsKey(alias)) {
        return inputJson.get(alias);
      }
    }
    return null;
  }

  /**
   * Returns the non-null branch of a two-branch union: the second branch when the first is NULL,
   * otherwise the first. Callers are expected to have checked {@link #isOptional(Schema)} first.
   */
  private Schema getNonNull(Schema schema) {
    List<Schema> types = schema.getTypes();
    return types.get(0).getType() == Schema.Type.NULL ? types.get(1) : types.get(0);
  }

  /**
   * Tells whether the schema is a union of exactly two branches, one of which is NULL.
   */
  private boolean isOptional(Schema schema) {
    if (schema.getType() != Schema.Type.UNION) {
      return false;
    }
    List<Schema> branches = schema.getTypes();
    if (branches.size() != 2) {
      return false;
    }
    return branches.get(0).getType() == Schema.Type.NULL
        || branches.get(1).getType() == Schema.Type.NULL;
  }

  /**
   * Converts a single json value according to the given schema.
   *
   * <p>For an optional schema (two-branch union with NULL) a null value converts to {@code null} and
   * any other value is converted against the non-null branch. For every other schema a null value
   * is rejected.
   *
   * @param value  json value, may be {@code null}
   * @param name   name of the field being converted, used in error messages
   * @param schema schema the value must conform to
   * @return the converted Avro value
   * @throws HoodieJsonConversionException if {@code value} is null and the schema is not optional
   */
  protected Object convertJsonField(Object value, String name, Schema schema) {
    if (isOptional(schema)) {
      if (value == null) {
        return null;
      }
      schema = getNonNull(schema);
    } else if (value == null) {
      // Always fail on null for non-nullable schemas. The message must not dereference value here,
      // otherwise a NullPointerException is raised instead of the conversion exception.
      throw buildConversionException("null value is not allowed",
          name, schema, shouldSanitize, invalidCharMask);
    }

    return convertField(value, name, schema);
  }

  /**
   * Converts a non-null value using the processor selected for the schema.
   */
  private Object convertField(Object value, String name, Schema schema) {
    return getProcessorForSchema(schema).convertField(value, name, schema);
  }

  /**
   * Selects the processor for a schema. A non-empty {@code logicalType} property takes precedence,
   * then the schema's parsed logical type, and finally the plain Avro type.
   *
   * @param schema schema to find a processor for
   * @return the matching processor, never {@code null}
   */
  protected JsonFieldProcessor getProcessorForSchema(Schema schema) {
    // 3 cases to consider: customized logicalType, logicalType, and type.
    final String customizedLogicalType = schema.getProp("logicalType");
    final LogicalType logicalType = schema.getLogicalType();
    final Type type = schema.getType();

    final JsonFieldProcessor processor;
    if (customizedLogicalType == null || customizedLogicalType.isEmpty()) {
      if (logicalType == null) {
        processor = fieldTypeProcessorMap.get(type);
      } else {
        processor = fieldLogicalTypeProcessorMap.get(logicalType.getName());
      }
    } else {
      processor = fieldLogicalTypeProcessorMap.get(customizedLogicalType);
    }

    // checkArgument rejects schemas for which no processor is registered
    ValidationUtils.checkArgument(
        processor != null, String.format("JsonConverter cannot handle type: %s", type));
    return processor;
  }

  /**
   * Builds the unmodifiable lookup from plain avro type to the processor produced by the
   * matching {@code generate*TypeHandler()} factory.
   */
  private Map getFieldTypeProcessors() {
    Map byType = new EnumMap<>(Schema.Type.class);
    byType.put(Type.STRING, generateStringTypeHandler());
    byType.put(Type.BOOLEAN, generateBooleanTypeHandler());
    byType.put(Type.DOUBLE, generateDoubleTypeHandler());
    byType.put(Type.FLOAT, generateFloatTypeHandler());
    byType.put(Type.INT, generateIntTypeHandler());
    byType.put(Type.LONG, generateLongTypeHandler());
    byType.put(Type.ARRAY, generateArrayTypeHandler());
    byType.put(Type.RECORD, generateRecordTypeHandler());
    byType.put(Type.ENUM, generateEnumTypeHandler());
    byType.put(Type.MAP, generateMapTypeHandler());
    byType.put(Type.BYTES, generateBytesTypeHandler());
    byType.put(Type.FIXED, generateFixedTypeHandler());
    return Collections.unmodifiableMap(byType);
  }

  /**
   * Build logical type processor map, keyed by the value of each {@code AvroLogicalTypeEnum} entry.
   * The UUID logical type reuses the plain string handler.
   */
  private Map getLogicalFieldTypeProcessors() {
    return CollectionUtils.createImmutableMap(
      Pair.of(AvroLogicalTypeEnum.DECIMAL.getValue(), generateDecimalLogicalTypeHandler()),
      Pair.of(AvroLogicalTypeEnum.TIME_MICROS.getValue(), generateTimeMicroLogicalTypeHandler()),
      Pair.of(AvroLogicalTypeEnum.TIME_MILLIS.getValue(), generateTimeMilliLogicalTypeHandler()),
      Pair.of(AvroLogicalTypeEnum.DATE.getValue(), generateDateLogicalTypeHandler()),
      Pair.of(AvroLogicalTypeEnum.LOCAL_TIMESTAMP_MICROS.getValue(), generateLocalTimeStampMicroLogicalTypeHandler()),
      Pair.of(AvroLogicalTypeEnum.LOCAL_TIMESTAMP_MILLIS.getValue(), generateLocalTimeStampMilliLogicalTypeHandler()),
      Pair.of(AvroLogicalTypeEnum.TIMESTAMP_MICROS.getValue(), generateTimestampMicroLogicalTypeHandler()),
      Pair.of(AvroLogicalTypeEnum.TIMESTAMP_MILLIS.getValue(), generateTimestampMilliLogicalTypeHandler()),
      Pair.of(AvroLogicalTypeEnum.DURATION.getValue(), generateDurationLogicalTypeHandler()),
      Pair.of(AvroLogicalTypeEnum.UUID.getValue(), generateStringTypeHandler()));
  }

  /**
   * Creates the processor registered for the decimal logical type.
   */
  protected JsonFieldProcessor generateDecimalLogicalTypeHandler() {
    return new DecimalToAvroLogicalTypeProcessor();
  }

  /**
   * Creates the processor registered for the time-micros logical type.
   */
  protected JsonFieldProcessor generateTimeMicroLogicalTypeHandler() {
    return new TimeMicroLogicalTypeProcessor();
  }

  /**
   * Creates the processor registered for the time-millis logical type.
   */
  protected JsonFieldProcessor generateTimeMilliLogicalTypeHandler() {
    return new TimeMilliLogicalTypeProcessor();
  }

  /**
   * Creates the processor registered for the date logical type.
   */
  protected JsonFieldProcessor generateDateLogicalTypeHandler() {
    return new DateToAvroLogicalTypeProcessor();
  }

  /**
   * Creates the processor registered for the local-timestamp-micros logical type.
   */
  protected JsonFieldProcessor generateLocalTimeStampMicroLogicalTypeHandler() {
    return new LocalTimestampMicroLogicalTypeProcessor();
  }

  /**
   * Creates the processor registered for the local-timestamp-millis logical type.
   */
  protected JsonFieldProcessor generateLocalTimeStampMilliLogicalTypeHandler() {
    return new LocalTimestampMilliLogicalTypeProcessor();
  }

  /**
   * Creates the processor registered for the timestamp-micros logical type.
   */
  protected JsonFieldProcessor generateTimestampMicroLogicalTypeHandler() {
    return new TimestampMicroLogicalTypeProcessor();
  }

  /**
   * Creates the processor registered for the timestamp-millis logical type.
   */
  protected JsonFieldProcessor generateTimestampMilliLogicalTypeHandler() {
    return new TimestampMilliLogicalTypeProcessor();
  }

  /**
   * Creates the processor registered for the duration logical type.
   */
  protected JsonFieldProcessor generateDurationLogicalTypeHandler() {
    return new DurationToAvroLogicalTypeProcessor();
  }

  /**
   * Converts json input to the Avro representation of the decimal logical type.
   * This is an inner (non-static) class because raw fixed input is delegated to the enclosing
   * converter's fixed type handler.
   */
  private class DecimalToAvroLogicalTypeProcessor extends DecimalLogicalTypeProcessor {
    /**
     * @return a pair whose left side tells whether conversion succeeded and whose right side is the
     *         converted value ({@link ByteBuffer} for BYTES, {@link GenericFixed} for FIXED)
     */
    @Override
    public Pair convert(Object value, String name, Schema schema) {
      if (!isValidDecimalTypeConfig(schema)) {
        return Pair.of(false, null);
      }

      // Case 1: Input is a list. It is expected to be raw Fixed byte array input, and we only support
      // parsing it to Fixed avro type.
      if (value instanceof List && schema.getType() == Type.FIXED) {
        return generateFixedTypeHandler().convert(value, name, schema);
      }

      // Case 2: Input is a number or String number.
      LogicalTypes.Decimal decimalType = (LogicalTypes.Decimal) schema.getLogicalType();
      Pair<Boolean, BigDecimal> parseResult = parseObjectToBigDecimal(value, schema);
      if (Boolean.FALSE.equals(parseResult.getLeft())) {
        return Pair.of(false, null);
      }
      BigDecimal bigDecimal = parseResult.getRight();

      // Convert to the primitive Avro type that the decimal logical type is layered on.
      Conversions.DecimalConversion conversion = new Conversions.DecimalConversion();
      switch (schema.getType()) {
        case BYTES:
          ByteBuffer byteBuffer = conversion.toBytes(bigDecimal, schema, decimalType);
          return Pair.of(true, byteBuffer);
        case FIXED:
          GenericFixed fixedValue = conversion.toFixed(bigDecimal, schema, decimalType);
          return Pair.of(true, fixedValue);
        default: {
          // decimal is only handled on top of BYTES or FIXED
          return Pair.of(false, null);
        }
      }
    }
  }

  /**
   * Converts json input to an Avro fixed value for schemas whose logical type is duration.
   */
  private static class DurationToAvroLogicalTypeProcessor extends DurationLogicalTypeProcessor {

    /**
     * Convert the given object to Avro object with schema whose logical type is duration.
     *
     * @return a failed pair when the schema or the input does not pass validation, otherwise a
     *         successful pair holding a {@link GenericData.Fixed} with the integers written as
     *         little-endian 4-byte values in list order
     */
    @Override
    public Pair convert(Object value, String name, Schema schema) {

      if (!isValidDurationTypeConfig(schema)) {
        return Pair.of(false, null);
      }
      if (!isValidDurationInput(value)) {
        return Pair.of(false, null);
      }
      // After the validation the input can be safely cast to List with 3 elements.
      List<?> list = (List<?>) value;
      List<Integer> components = list.stream()
          .filter(Integer.class::isInstance)
          .map(Integer.class::cast)
          .collect(Collectors.toList());

      ByteBuffer buffer = ByteBuffer.allocate(schema.getFixedSize()).order(ByteOrder.LITTLE_ENDIAN);
      // NOTE(review): per the Avro duration logical type the components are months, days and
      // milliseconds, in that order; confirm isValidDurationInput enforces this.
      for (Integer component : components) {
        buffer.putInt(component);
      }
      return Pair.of(true, new GenericData.Fixed(schema, buffer.array()));
    }
  }

  /**
   * Converts json input to the Avro representation of the date logical type.
   * String input is parsed to a {@link LocalDate} and turned into the number of days since
   * 1970-01-01; the handling of other input is delegated to {@code convertCommon}.
   */
  private static class DateToAvroLogicalTypeProcessor extends DateLogicalTypeProcessor {

    @Override
    public Pair convert(
        Object value, String name, Schema schema) {
      return convertCommon(
          new Parser.IntParser() {
            @Override
            public Pair handleStringValue(String value) {
              if (!isWellFormedDateTime(value)) {
                return Pair.of(false, null);
              }
              Pair<Boolean, LocalDate> result = convertToLocalDate(value);
              if (!result.getLeft()) {
                return Pair.of(false, null);
              }
              LocalDate date = result.getRight();
              // narrowed to int without a range check
              int daysSinceEpoch = (int) ChronoUnit.DAYS.between(LocalDate.ofEpochDay(0), date);
              return Pair.of(true, daysSinceEpoch);
            }
          },
          value, schema);
    }
  }

  /**
   * Creates the processor for BOOLEAN: only {@link Boolean} input is accepted and is passed
   * through unchanged; anything else yields a failed pair.
   */
  protected JsonFieldProcessor generateBooleanTypeHandler() {
    return new JsonFieldProcessor() {
      @Override
      public Pair convert(Object value, String name, Schema schema) {
        if (!(value instanceof Boolean)) {
          return Pair.of(false, null);
        }
        return Pair.of(true, value);
      }
    };
  }

  /**
   * Creates the processor for INT. Numbers are narrowed with {@link Number#intValue()}, strings are
   * parsed with {@link Integer#valueOf(String)} (a malformed string propagates
   * {@link NumberFormatException}); any other input yields a failed pair.
   */
  protected JsonFieldProcessor generateIntTypeHandler() {
    return new JsonFieldProcessor() {
      @Override
      public Pair convert(Object value, String name, Schema schema) {
        if (value instanceof String) {
          return Pair.of(true, Integer.valueOf((String) value));
        }
        if (value instanceof Number) {
          return Pair.of(true, ((Number) value).intValue());
        }
        return Pair.of(false, null);
      }
    };
  }

  /**
   * Creates the processor for DOUBLE. Numbers are converted with {@link Number#doubleValue()},
   * strings are parsed with {@link Double#valueOf(String)} (a malformed string propagates
   * {@link NumberFormatException}); any other input yields a failed pair.
   */
  protected JsonFieldProcessor generateDoubleTypeHandler() {
    return new JsonFieldProcessor() {
      @Override
      public Pair convert(Object value, String name, Schema schema) {
        if (value instanceof String) {
          return Pair.of(true, Double.valueOf((String) value));
        }
        if (value instanceof Number) {
          return Pair.of(true, ((Number) value).doubleValue());
        }
        return Pair.of(false, null);
      }
    };
  }

  /**
   * Creates the processor for FLOAT. Numbers are converted with {@link Number#floatValue()},
   * strings are parsed with {@link Float#valueOf(String)} (a malformed string propagates
   * {@link NumberFormatException}); any other input yields a failed pair.
   */
  protected JsonFieldProcessor generateFloatTypeHandler() {
    return new JsonFieldProcessor() {
      @Override
      public Pair convert(Object value, String name, Schema schema) {
        if (value instanceof String) {
          return Pair.of(true, Float.valueOf((String) value));
        }
        if (value instanceof Number) {
          return Pair.of(true, ((Number) value).floatValue());
        }
        return Pair.of(false, null);
      }
    };
  }

  /**
   * Creates the processor for LONG. Numbers are converted with {@link Number#longValue()},
   * strings are parsed with {@link Long#valueOf(String)} (a malformed string propagates
   * {@link NumberFormatException}); any other input yields a failed pair.
   */
  protected JsonFieldProcessor generateLongTypeHandler() {
    return new JsonFieldProcessor() {
      @Override
      public Pair convert(Object value, String name, Schema schema) {
        if (value instanceof String) {
          return Pair.of(true, Long.valueOf((String) value));
        }
        if (value instanceof Number) {
          return Pair.of(true, ((Number) value).longValue());
        }
        return Pair.of(false, null);
      }
    };
  }

  /**
   * Creates the processor registered for the STRING type and for the UUID logical type.
   */
  private static JsonFieldProcessor generateStringTypeHandler() {
    return new StringProcessor();
  }

  /**
   * Processor for string values: strings pass through unchanged, any other value is serialized
   * to its json text.
   */
  private static class StringProcessor extends JsonFieldProcessor {
    private static final ObjectMapper STRING_MAPPER = new ObjectMapper();

    @Override
    public Pair convert(Object value, String name, Schema schema) {
      if (value instanceof String) {
        return Pair.of(true, value);
      }
      try {
        String serialized = STRING_MAPPER.writeValueAsString(value);
        return Pair.of(true, serialized);
      } catch (IOException ex) {
        // serialization failure is reported as a failed conversion rather than thrown
        return Pair.of(false, null);
      }
    }
  }

  /**
   * Creates the processor for BYTES: the value's {@code toString()} text is encoded as UTF-8 and
   * wrapped in a {@link ByteBuffer}. The charset is fixed so the result does not depend on the
   * platform default encoding.
   */
  protected JsonFieldProcessor generateBytesTypeHandler() {
    return new JsonFieldProcessor() {
      @Override
      public Pair convert(Object value, String name, Schema schema) {
        // Should return ByteBuffer (see GenericData.isBytes())
        return Pair.of(true, ByteBuffer.wrap(value.toString().getBytes(StandardCharsets.UTF_8)));
      }
    };
  }

  /**
   * Creates the processor registered for the FIXED type; the decimal processor also uses it
   * for list input on FIXED schemas.
   */
  protected JsonFieldProcessor generateFixedTypeHandler() {
    return new AvroFixedTypeProcessor();
  }

  /**
   * Wraps the result of the inherited {@code convertToJavaObject} in a {@link GenericData.Fixed}
   * bound to the schema.
   */
  private static class AvroFixedTypeProcessor extends FixedTypeProcessor {
    @Override
    public Pair convert(Object value, String name, Schema schema) {
      GenericData.Fixed fixed = new GenericData.Fixed(schema, convertToJavaObject(value, name, schema));
      return Pair.of(true, fixed);
    }
  }

  /**
   * Wraps the result of the inherited {@code convertToJavaObject} in a
   * {@link GenericData.EnumSymbol} bound to the schema.
   */
  private static class AvroEnumTypeProcessor extends EnumTypeProcessor {
    @Override
    public Pair convert(Object value, String name, Schema schema) {
      GenericData.EnumSymbol symbol =
          new GenericData.EnumSymbol(schema, convertToJavaObject(value, name, schema));
      return Pair.of(true, symbol);
    }
  }

  /**
   * Creates the processor registered for the ENUM type.
   */
  protected JsonFieldProcessor generateEnumTypeHandler() {
    return new AvroEnumTypeProcessor();
  }

  /**
   * Creates the processor for RECORD: the value is cast to a map and converted recursively into a
   * nested record of the given schema.
   */
  protected JsonFieldProcessor generateRecordTypeHandler() {
    return new JsonFieldProcessor() {
      @Override
      public Pair convert(Object value, String name, Schema schema) {
        GenericRecord nested = convertJsonToAvro((Map) value, schema);
        return Pair.of(true, nested);
      }
    };
  }

  /**
   * Creates the processor for ARRAY: the value is cast to a list and every element is converted
   * against the array's element schema, keeping the parent field name for error reporting.
   */
  protected JsonFieldProcessor generateArrayTypeHandler() {
    return new JsonFieldProcessor() {
      @Override
      public Pair convert(Object value, String name, Schema schema) {
        final Schema elementSchema = schema.getElementType();
        final List<Object> converted = new ArrayList<>();
        for (Object element : (List<?>) value) {
          converted.add(convertJsonField(element, name, elementSchema));
        }
        return Pair.of(true, new GenericData.Array<>(schema, converted));
      }
    };
  }

  /**
   * Creates the processor for MAP: the value is cast to a map and every entry value is converted
   * against the map's value schema; keys are copied as they are.
   */
  protected JsonFieldProcessor generateMapTypeHandler() {
    return new JsonFieldProcessor() {
      @Override
      public Pair convert(Object value, String name, Schema schema) {
        Schema valueSchema = schema.getValueType();
        Map<Object, Object> mapRes = new HashMap<>();
        for (Map.Entry<?, ?> entry : ((Map<?, ?>) value).entrySet()) {
          mapRes.put(entry.getKey(), convertJsonField(entry.getValue(), name, valueSchema));
        }
        return Pair.of(true, mapRes);
      }
    };
  }

  /**
   * Builds (does not throw) the exception describing a failed json to avro conversion.
   *
   * @param value           offending value, or a description of it
   * @param fieldName       name of the field being converted
   * @param schema          schema the value was converted against
   * @param shouldSanitize  when true the message also mentions the sanitization mask
   * @param invalidCharMask mask reported in the message when sanitization is enabled
   * @return the exception carrying the formatted message
   */
  protected HoodieJsonConversionException buildConversionException(Object value, String fieldName, Schema schema, boolean shouldSanitize, String invalidCharMask) {
    final String errorMsg = shouldSanitize
        ? String.format("Json to Avro Type conversion error for field %s, %s for %s. Field sanitization is enabled with a mask of %s.", fieldName, value, schema, invalidCharMask)
        : String.format("Json to Avro Type conversion error for field %s, %s for %s", fieldName, value, schema);
    return new HoodieJsonConversionException(errorMsg);
  }

}