org.apache.hudi.internal.schema.utils.InternalSchemaUtils Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of hudi-datahub-sync-bundle Show documentation
There is a newer version: 1.0.0-beta2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.internal.schema.utils;

import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.internal.schema.HoodieSchemaException;
import org.apache.hudi.internal.schema.InternalSchema;
import org.apache.hudi.internal.schema.Type;
import org.apache.hudi.internal.schema.Types;
import org.apache.hudi.internal.schema.Types.Field;

import java.util.ArrayList;
import java.util.Deque;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.stream.Collectors;

/**
 * Util methods to help us do some operations on InternalSchema.
 * eg: column prune, filter rebuild for query engine...
 */
public class InternalSchemaUtils {

  private InternalSchemaUtils() {
  }

  /**
   * Create project internalSchema, based on the project names which produced by query engine.
   * support nested project.
   *
   * @param schema a internal schema.
   * @param names project names produced by query engine.
   * @return a project internalSchema.
   */
  public static InternalSchema pruneInternalSchema(InternalSchema schema, List names) {
    // do check
    List prunedIds = names.stream().map(name -> {
      int id = schema.findIdByName(name);
      if (id == -1) {
        throw new IllegalArgumentException(String.format("cannot prune col: %s which does not exist in hudi table", name));
      }
      return id;
    }).collect(Collectors.toList());
    // find top parent field ID. eg: a.b.c, f.g.h, only collect id of a and f ignore all child field.
    List topParentFieldIds = new ArrayList<>();
    names.stream().forEach(f -> {
      int id = schema.findIdByName(f.split("\\.")[0]);
      if (!topParentFieldIds.contains(id)) {
        topParentFieldIds.add(id);
      }
    });
    return pruneInternalSchemaByID(schema, prunedIds, topParentFieldIds);
  }

  /**
   * Create project internalSchema.
   * support nested project.
   *
   * @param schema a internal schema.
   * @param fieldIds project col field_ids.
   * @return a project internalSchema.
   */
  public static InternalSchema pruneInternalSchemaByID(InternalSchema schema, List fieldIds, List topParentFieldIds) {
    Types.RecordType recordType = (Types.RecordType)pruneType(schema.getRecord(), fieldIds);
    // reorder top parent fields, since the recordType.fields() produced by pruneType maybe out of order.
    List newFields = new ArrayList<>();
    if (topParentFieldIds != null && !topParentFieldIds.isEmpty()) {
      for (int id : topParentFieldIds) {
        Types.Field f = recordType.field(id);
        if (f != null) {
          newFields.add(f);
        } else {
          throw new HoodieSchemaException(String.format("cannot find pruned id %s in currentSchema %s", id, schema.toString()));
        }
      }
    }
    return new InternalSchema(newFields.isEmpty() ? recordType : Types.RecordType.get(newFields));
  }

  /**
   * Project hudi type by projected cols field_ids
   * this is auxiliary function used by pruneInternalSchema.
   */
  private static Type pruneType(Type type, List fieldIds) {
    switch (type.typeId()) {
      case RECORD:
        Types.RecordType record = (Types.RecordType) type;
        List fields = record.fields();
        List newTypes = new ArrayList<>();
        for (Types.Field f : fields) {
          Type newType = pruneType(f.type(), fieldIds);
          if (fieldIds.contains(f.fieldId())) {
            newTypes.add(f.type());
          } else if (newType != null) {
            newTypes.add(newType);
          } else {
            newTypes.add(null);
          }
        }
        boolean changed = false;
        List newFields = new ArrayList<>();
        for (int i = 0; i < fields.size(); i++) {
          Types.Field oldField = fields.get(i);
          Type newType = newTypes.get(i);
          if (oldField.type() == newType) {
            newFields.add(oldField);
          } else if (newType != null) {
            changed = true;
            newFields.add(Types.Field.get(oldField.fieldId(), oldField.isOptional(), oldField.name(), newType, oldField.doc()));
          }
        }
        if (newFields.isEmpty()) {
          return null;
        }
        if (newFields.size() == fields.size() && !changed) {
          return record;
        } else {
          return Types.RecordType.get(newFields);
        }
      case ARRAY:
        Types.ArrayType array = (Types.ArrayType) type;
        Type newElementType = pruneType(array.elementType(), fieldIds);
        if (fieldIds.contains(array.elementId())) {
          return array;
        } else if (newElementType != null) {
          if (array.elementType() == newElementType) {
            return array;
          }
          return Types.ArrayType.get(array.elementId(), array.isElementOptional(), newElementType);
        }
        return null;
      case MAP:
        Types.MapType map = (Types.MapType) type;
        Type newValueType = pruneType(map.valueType(), fieldIds);
        if (fieldIds.contains(map.valueId())) {
          return map;
        } else if (newValueType != null) {
          if (map.valueType() == newValueType) {
            return map;
          }
          return Types.MapType.get(map.keyId(), map.valueId(), map.keyType(), newValueType, map.isValueOptional());
        }
        return null;
      default:
        return null;
    }
  }

  /**
   * A helper function to help correct the colName of pushed filters.
   *
   * @param name origin col name from pushed filters.
   * @param fileSchema the real schema of avro/parquet file.
   * @param querySchema the query schema which query engine produced.
   * @return a corrected name.
   */
  public static String reBuildFilterName(String name, InternalSchema fileSchema, InternalSchema querySchema) {
    int nameId = querySchema.findIdByName(name);
    if (nameId == -1) {
      throw new IllegalArgumentException(String.format("cannot find filter col name：%s from querySchema: %s", name, querySchema));
    }
    if (fileSchema.findField(nameId) == null) {
      // added operation found
      // the read file does not contain current col, so current colFilter is invalid
      return "";
    } else {
      if (name.equals(fileSchema.findFullName(nameId))) {
        // no change happened on current col
        return name;
      } else {
        // find rename operation on current col
        // return the name from fileSchema
        return fileSchema.findFullName(nameId);
      }
    }
  }

  /**
   * Collect all type changed cols to build a colPosition -> (newColType, oldColType) map.
   * only collect top level col changed. eg: a is a nest field(record(b int, d long), now a.b is changed from int to long,
   * only a will be collected, a.b will excluded.
   *
   * @param schema a type changed internalSchema
   * @param oldSchema an old internalSchema.
   * @return a map.
   */
  public static Map> collectTypeChangedCols(InternalSchema schema, InternalSchema oldSchema) {
    Set ids = schema.getAllIds();
    Set otherIds = oldSchema.getAllIds();
    Map> result = new HashMap<>();
    ids.stream().filter(f -> otherIds.contains(f)).forEach(f -> {
      if (!schema.findType(f).equals(oldSchema.findType(f))) {
        String[] fieldNameParts = schema.findFullName(f).split("\\.");
        String[] otherFieldNameParts = oldSchema.findFullName(f).split("\\.");
        String parentName = fieldNameParts[0];
        String otherParentName = otherFieldNameParts[0];
        if (fieldNameParts.length == otherFieldNameParts.length && schema.findIdByName(parentName) == oldSchema.findIdByName(otherParentName)) {
          int index = schema.findIdByName(parentName);
          int position = schema.getRecord().fields().stream().map(s -> s.fieldId()).collect(Collectors.toList()).indexOf(index);
          if (!result.containsKey(position)) {
            result.put(position, Pair.of(schema.findType(parentName), oldSchema.findType(otherParentName)));
          }
        }
      }
    });
    return result;
  }

  /**
   * Search target internalSchema by version number.
   *
   * @param versionId the internalSchema version to be search.
   * @param internalSchemas internalSchemas to be searched.
   * @return a internalSchema.
   */
  public static InternalSchema searchSchema(long versionId, List internalSchemas) {
    TreeMap treeMap = new TreeMap<>();
    internalSchemas.forEach(s -> treeMap.put(s.schemaId(), s));
    return searchSchema(versionId, treeMap);
  }

  /**
   * Search target internalSchema by version number.
   *
   * @param versionId the internalSchema version to be search.
   * @param treeMap internalSchemas collections to be searched.
   * @return a internalSchema.
   */
  public static InternalSchema searchSchema(long versionId, TreeMap treeMap) {
    if (treeMap.containsKey(versionId)) {
      return treeMap.get(versionId);
    } else {
      SortedMap headMap = treeMap.headMap(versionId);
      if (!headMap.isEmpty()) {
        return headMap.get(headMap.lastKey());
      }
    }
    return InternalSchema.getEmptyInternalSchema();
  }

  public static String createFullName(String name, Deque fieldNames) {
    String result = name;
    if (!fieldNames.isEmpty()) {
      List parentNames = new ArrayList<>();
      fieldNames.descendingIterator().forEachRemaining(parentNames::add);
      result = parentNames.stream().collect(Collectors.joining(".")) + "." + result;
    }
    return result;
  }

  /**
   * Try to find all renamed cols between oldSchema and newSchema.
   *
   * @param oldSchema oldSchema
   * @param newSchema newSchema which modified from oldSchema
   * @return renameCols Map. (k, v) -> (colNameFromNewSchema, colNameLastPartFromOldSchema)
   */
  public static Map collectRenameCols(InternalSchema oldSchema, InternalSchema newSchema) {
    List colNamesFromWriteSchema = oldSchema.getAllColsFullName();
    return colNamesFromWriteSchema.stream().filter(f -> {
      int fieldIdFromWriteSchema = oldSchema.findIdByName(f);
      // try to find the cols which has the same id, but have different colName;
      return newSchema.getAllIds().contains(fieldIdFromWriteSchema) && !newSchema.findFullName(fieldIdFromWriteSchema).equalsIgnoreCase(f);
    }).collect(Collectors.toMap(e -> newSchema.findFullName(oldSchema.findIdByName(e)), e -> {
      int lastDotIndex = e.lastIndexOf(".");
      return e.substring(lastDotIndex == -1 ? 0 : lastDotIndex + 1);
    }));
  }
}