/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.internal.schema.action;

import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.internal.schema.InternalSchema;
import org.apache.hudi.internal.schema.Type;
import org.apache.hudi.internal.schema.Types;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Auxiliary class that helps to merge the file schema and the query schema to produce
 * the final read schema for an avro/parquet file.
 */
public class InternalSchemaMerger {
  private final InternalSchema fileSchema;
  private final InternalSchema querySchema;
  // There are currently bugs in the Spark update/merge APIs: those operations incorrectly change a
  // column's nullability from optional to required. Until that bug is fixed we need to adapt:
  // if ignoreRequiredAttribute is true, the column's required attribute is ignored.
  private final boolean ignoreRequiredAttribute;
  // Whether to use the column type from the file schema when a column's type has changed.
  // Spark's parquet reader needs the original column type to read the data, otherwise it fails.
  // e.g. a column's type was StringType and has been changed to DecimalType: we must pass StringType
  // (not DecimalType) to the parquet reader, and convert the values from String to Decimal after reading.
  // The log reader does not need this parameter, since reWriteRecordWithNewSchema can rewrite records
  // directly: in the same example, DecimalType can be passed to reWriteRecordWithNewSchema as-is.
  private boolean useColumnTypeFromFileSchema = true;

  // Deal with renames.
  // Whether to use the column name from the file schema when a column has been renamed.
  // Spark's parquet reader needs the original column name to read the data, otherwise it reads nothing.
  // e.g. a column colOldName has been renamed to colNewName: we must pass colOldName (not colNewName)
  // to the parquet reader, and rename the column back after the data has been read.
  // The log reader does not need this parameter, since reWriteRecordWithNewSchema can rewrite records
  // directly: in the same example, colNewName can be passed to reWriteRecordWithNewSchema as-is.
  private boolean useColNameFromFileSchema = true;
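
  // Illustrative sketch of how the two flags above affect the merged read schema (the column names
  // and types here are assumptions for the example, not part of this class):
  //   file schema:  col1 : string              (field id 1)
  //   query schema: col1Renamed : decimal      (same field id 1, renamed and type-changed)
  // With useColNameFromFileSchema = true and useColumnTypeFromFileSchema = true (parquet read path),
  // mergeSchema() keeps "col1 : string", so the parquet reader can locate and decode the physical
  // column; the engine converts the values to decimal afterwards.
  // With both flags set to false (log read path), mergeSchema() yields "col1Renamed : decimal" and
  // reWriteRecordWithNewSchema rewrites the records to that schema directly.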

  private final Map<String, String> renamedFields = new HashMap<>();

  public InternalSchemaMerger(InternalSchema fileSchema, InternalSchema querySchema, boolean ignoreRequiredAttribute, boolean useColumnTypeFromFileSchema, boolean useColNameFromFileSchema) {
    this.fileSchema = fileSchema;
    this.querySchema = querySchema;
    this.ignoreRequiredAttribute = ignoreRequiredAttribute;
    this.useColumnTypeFromFileSchema = useColumnTypeFromFileSchema;
    this.useColNameFromFileSchema = useColNameFromFileSchema;
  }

  public InternalSchemaMerger(InternalSchema fileSchema, InternalSchema querySchema, boolean ignoreRequiredAttribute, boolean useColumnTypeFromFileSchema) {
    this(fileSchema, querySchema, ignoreRequiredAttribute, useColumnTypeFromFileSchema, true);
  }

  /**
   * Create final read schema to read avro/parquet file.
   *
   * @return read schema to read avro/parquet file.
   */
  public InternalSchema mergeSchema() {
    Types.RecordType record = (Types.RecordType) mergeType(querySchema.getRecord(), 0);
    return new InternalSchema(record);
  }
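
  // A minimal usage sketch (the schema variables are assumed to be obtained elsewhere, e.g. from the
  // table's schema evolution history; they are not produced by this class):
  //   InternalSchema fileSchema  = ...; // schema the data file was written with
  //   InternalSchema querySchema = ...; // schema requested by the query
  //   InternalSchema readSchema  =
  //       new InternalSchemaMerger(fileSchema, querySchema, true, true).mergeSchema();
  //   // readSchema keeps file-side names/types where they differ from the query schema,
  //   // so the parquet/avro reader can resolve the physical columns.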

  /**
   * Create final read schema to read avro/parquet file, together with the fields renamed
   * during the merge (query-schema name -> file-schema name).
   *
   * @return pair of the read schema and the renamed-field mapping.
   */
  public Pair<InternalSchema, Map<String, String>> mergeSchemaGetRenamed() {
    return Pair.of(mergeSchema(), renamedFields);
  }
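
  // Illustrative: when useColNameFromFileSchema is false and the query renamed file column "colOld"
  // to "colNew" (names assumed for the example), the returned map contains {"colNew" -> "colOld"},
  // i.e. query-schema name -> file-schema name, which callers can use to map the merged schema back
  // to the physical column names.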

  /**
   * Merge a single type from the query schema against the file schema.
   * This is an auxiliary function used by mergeSchema.
   */
  private Type mergeType(Type type, int currentTypeId) {
    switch (type.typeId()) {
      case RECORD:
        Types.RecordType record = (Types.RecordType) type;
        List<Type> newTypes = new ArrayList<>();
        for (Types.Field f : record.fields()) {
          Type newType = mergeType(f.type(), f.fieldId());
          newTypes.add(newType);
        }
        return Types.RecordType.get(buildRecordType(record.fields(), newTypes));
      case ARRAY:
        Types.ArrayType array = (Types.ArrayType) type;
        Type newElementType;
        Types.Field elementField = array.fields().get(0);
        newElementType = mergeType(elementField.type(), elementField.fieldId());
        return buildArrayType(array, newElementType);
      case MAP:
        Types.MapType map = (Types.MapType) type;
        Type newValueType = mergeType(map.valueType(), map.valueId());
        return buildMapType(map, newValueType);
      default:
        return buildPrimitiveType((Type.PrimitiveType) type, currentTypeId);
    }
  }

  private List<Types.Field> buildRecordType(List<Types.Field> oldFields, List<Type> newTypes) {
    List<Types.Field> newFields = new ArrayList<>();
    for (int i = 0; i < newTypes.size(); i++) {
      Type newType = newTypes.get(i);
      Types.Field oldField = oldFields.get(i);
      int fieldId = oldField.fieldId();
      String fullName = querySchema.findFullName(fieldId);
      if (fileSchema.findField(fieldId) != null) {
        if (fileSchema.findFullName(fieldId).equals(fullName)) {
          // the column name is unchanged; only the type may have changed, handle that.
          newFields.add(Types.Field.get(oldField.fieldId(), oldField.isOptional(), oldField.name(), newType, oldField.doc()));
        } else {
          // the column name differs from the file schema: the column was renamed, handle that.
          newFields.add(dealWithRename(fieldId, newType, oldField));
        }
      } else {
        // the field id is not present in the file schema; normalize the full name and check
        // whether a column with that name still exists there.
        fullName = normalizeFullName(fullName);
        if (fileSchema.findField(fullName) != null) {
          // a column with the same full name but a different id exists in the file schema
          // (the name was re-used after the original column was deleted), so suffix the name
          // to avoid resolving it against the old physical column.
          newFields.add(Types.Field.get(oldField.fieldId(), oldField.isOptional(), oldField.name() + "suffix", oldField.type(), oldField.doc()));
        } else {
          // the column was newly added.
          // Note: there are currently bugs in the Spark update/merge APIs that change a column's
          // nullability from optional to required, hence the ignoreRequiredAttribute workaround.
          if (ignoreRequiredAttribute) {
            newFields.add(Types.Field.get(oldField.fieldId(), true, oldField.name(), newType, oldField.doc()));
          } else {
            newFields.add(Types.Field.get(oldField.fieldId(), oldField.isOptional(), oldField.name(), newType, oldField.doc()));
          }
        }
      }
    }
    return newFields;
  }

  private Types.Field dealWithRename(int fieldId, Type newType, Types.Field oldField) {
    Types.Field fieldFromFileSchema = fileSchema.findField(fieldId);
    String nameFromFileSchema = fieldFromFileSchema.name();
    String nameFromQuerySchema = querySchema.findField(fieldId).name();
    String finalFieldName = useColNameFromFileSchema ? nameFromFileSchema : nameFromQuerySchema;
    Type typeFromFileSchema = fieldFromFileSchema.type();
    if (!useColNameFromFileSchema) {
      renamedFields.put(nameFromQuerySchema, nameFromFileSchema);
    }
    // The current design guarantees that nested-type changes are not allowed, so that case need not be considered here.
    if (newType.isNestedType()) {
      return Types.Field.get(oldField.fieldId(), oldField.isOptional(),
          finalFieldName, newType, oldField.doc());
    } else {
      return Types.Field.get(oldField.fieldId(), oldField.isOptional(),
          finalFieldName, useColumnTypeFromFileSchema ? typeFromFileSchema : newType, oldField.doc());
    }
  }

  private String normalizeFullName(String fullName) {
    // Find a parent rename and normalize the full name accordingly.
    // e.g. a nested struct field a(c, d) was renamed to aa in the query schema, then aa.d was deleted
    // and a field d was added back later: the query-side full name must be rewritten to use the
    // file-schema parent name before it is looked up in the file schema.
    String[] nameParts = fullName.split("\\.");
    String[] normalizedNameParts = new String[nameParts.length];
    System.arraycopy(nameParts, 0, normalizedNameParts, 0, nameParts.length);
    for (int j = 0; j < nameParts.length - 1; j++) {
      StringBuilder sb = new StringBuilder();
      for (int k = 0; k <= j; k++) {
        sb.append(nameParts[k]);
      }
      String parentName = sb.toString();
      int parentFieldIdFromQuerySchema = querySchema.findIdByName(parentName);
      String parentNameFromFileSchema = fileSchema.findFullName(parentFieldIdFromQuerySchema);
      if (parentNameFromFileSchema.isEmpty()) {
        break;
      }
      if (!parentNameFromFileSchema.equalsIgnoreCase(parentName)) {
        // find parent rename, update nameParts
        String[] parentNameParts = parentNameFromFileSchema.split("\\.");
        System.arraycopy(parentNameParts, 0, normalizedNameParts, 0, parentNameParts.length);
      }
    }
    return StringUtils.join(normalizedNameParts, ".");
  }
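
  // Illustrative: suppose the parent struct is named "a" in the file schema and has been renamed to
  // "aa" in the query schema (names assumed for the example). For a query-side full name "aa.d",
  // this method rewrites the parent prefix to the file-side name and returns "a.d", so the
  // fileSchema.findField(fullName) lookup in buildRecordType hits the physical column.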

  private Type buildArrayType(Types.ArrayType array, Type newType) {
    Types.Field elementField = array.fields().get(0);
    int elementId = elementField.fieldId();
    if (elementField.type() == newType) {
      return array;
    } else {
      return Types.ArrayType.get(elementId, elementField.isOptional(), newType);
    }
  }

  private Type buildMapType(Types.MapType map, Type newValue) {
    Types.Field valueField = map.fields().get(1);
    if (valueField.type() == newValue) {
      return map;
    } else {
      return Types.MapType.get(map.keyId(), map.valueId(), map.keyType(), newValue, map.isValueOptional());
    }
  }

  private Type buildPrimitiveType(Type.PrimitiveType typeFromQuerySchema, int currentPrimitiveTypeId) {
    Type typeFromFileSchema = fileSchema.findType(currentPrimitiveTypeId);
    if (typeFromFileSchema == null) {
      return typeFromQuerySchema;
    } else {
      return useColumnTypeFromFileSchema ? typeFromFileSchema : typeFromQuerySchema;
    }
  }
}




