/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.hadoop;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;
import org.apache.hadoop.util.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import static org.apache.hadoop.hive.serde.serdeConstants.TIMESTAMP_TYPE_NAME;
/**
* Utility functions copied from Hive's ColumnProjectionUtils.java.
* Copied here because calling these APIs directly raises NoSuchMethodError with certain
* Spark/Hive combinations: some of the methods are not available across Hive versions.
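*
* <p>A minimal usage sketch (the column names, ids, and types below are
* illustrative values, not part of the API):
* <pre>{@code
* Configuration conf = new Configuration();
* conf.set(HoodieColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0,2");
* conf.set(HoodieColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, "id,ts");
* conf.set(IOConstants.COLUMNS, "id,name,ts");
* conf.set(IOConstants.COLUMNS_TYPES, "int,string,timestamp");
* List<Integer> readIds = HoodieColumnProjectionUtils.getReadColumnIDs(conf); // [0, 2]
* boolean needsTs = HoodieColumnProjectionUtils.supportTimestamp(conf);       // true
* }</pre>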
*/
public class HoodieColumnProjectionUtils {
public static final Logger LOG = LoggerFactory.getLogger(HoodieColumnProjectionUtils.class);
public static final String READ_COLUMN_IDS_CONF_STR = "hive.io.file.readcolumn.ids";
/**
* A nested column path is the string from the root to the leaf, e.g. for
* c:struct_of(a:string,b:string), column a's path is c.a and column b's path is c.b.
*/
public static final String READ_COLUMN_NAMES_CONF_STR = "hive.io.file.readcolumn.names";
private static final String READ_COLUMN_IDS_CONF_STR_DEFAULT = "";
private static final String READ_COLUMN_NAMES_CONF_STR_DEFAULT = "";
/**
* Returns the list of column ids (zero-based) that is set in the given
* configuration, with duplicates removed.
*/
public static List<Integer> getReadColumnIDs(Configuration conf) {
String skips = conf.get(READ_COLUMN_IDS_CONF_STR, READ_COLUMN_IDS_CONF_STR_DEFAULT);
String[] list = StringUtils.split(skips);
List<Integer> result = new ArrayList<>(list.length);
for (String element : list) {
// the raw list may contain duplicates; add each id only once
Integer toAdd = Integer.parseInt(element);
if (!result.contains(toAdd)) {
result.add(toAdd);
}
// NOTE: some code correlates this list with the column-name list, and that list may
// still contain the duplicates that this method removes. As far as we can tell, no
// code actually uses the two lists together, so deduplicating only here is safe.
}
return result;
}
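/**
* Returns the array of read column names set in the given configuration,
* or an empty array when none are set.
*/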
public static String[] getReadColumnNames(Configuration conf) {
String colNames = conf.get(READ_COLUMN_NAMES_CONF_STR, READ_COLUMN_NAMES_CONF_STR_DEFAULT);
if (colNames != null && !colNames.isEmpty()) {
return colNames.split(",");
}
return new String[] {};
}
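/**
* Returns the names of all table columns from {@link IOConstants#COLUMNS},
* independent of which columns are projected for reading.
*/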
public static List<String> getIOColumns(Configuration conf) {
String colNames = conf.get(IOConstants.COLUMNS, "");
if (colNames != null && !colNames.isEmpty()) {
return Arrays.asList(colNames.split(","));
}
return new ArrayList<>();
}
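/**
* Returns the type names of all table columns, parsed from
* {@link IOConstants#COLUMNS_TYPES}, one entry per column in table order.
*/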
public static List<String> getIOColumnTypes(Configuration conf) {
String colTypes = conf.get(IOConstants.COLUMNS_TYPES, "");
if (colTypes != null && !colTypes.isEmpty()) {
return TypeInfoUtils.getTypeInfosFromTypeString(colTypes).stream()
.map(t -> t.getTypeName()).collect(Collectors.toList());
}
return new ArrayList<>();
}
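/**
* Zips the table column names and type names into (name, type) pairs;
* the two lists must have the same size.
*/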
public static List<Pair<String, String>> getIOColumnNameAndTypes(Configuration conf) {
List<String> names = getIOColumns(conf);
List<String> types = getIOColumnTypes(conf);
ValidationUtils.checkArgument(names.size() == types.size());
return IntStream.range(0, names.size()).mapToObj(idx -> Pair.of(names.get(idx), types.get(idx)))
.collect(Collectors.toList());
}
/**
* Returns whether any of the projected (read) columns contains a timestamp type.
*
* We expect the parquet-avro reader {@link org.apache.hudi.hadoop.avro.HoodieAvroParquetReader}
* to be used to read timestamp columns when the read columns contain a timestamp type.
*/
public static boolean supportTimestamp(Configuration conf) {
List<String> readCols = Arrays.asList(getReadColumnNames(conf));
if (readCols.isEmpty()) {
return false;
}
String colTypes = conf.get(IOConstants.COLUMNS_TYPES, "");
if (colTypes == null || colTypes.isEmpty()) {
return false;
}
ArrayList<TypeInfo> types = TypeInfoUtils.getTypeInfosFromTypeString(colTypes);
List<String> names = getIOColumns(conf);
return IntStream.range(0, names.size()).filter(i -> readCols.contains(names.get(i)))
.anyMatch(i -> typeContainsTimestamp(types.get(i)));
}
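/**
* Recursively checks whether the given type is, or contains, a timestamp:
* a list is checked via its element type, a map via its key and value types,
* and a struct/union via each of its member types.
* E.g. this returns true for {@code array<timestamp>} and false for
* {@code struct<a:string,b:int>}.
*/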
public static boolean typeContainsTimestamp(TypeInfo type) {
Category category = type.getCategory();
switch (category) {
case PRIMITIVE:
return type.getTypeName().equals(TIMESTAMP_TYPE_NAME);
case LIST:
ListTypeInfo listTypeInfo = (ListTypeInfo) type;
return typeContainsTimestamp(listTypeInfo.getListElementTypeInfo());
case MAP:
MapTypeInfo mapTypeInfo = (MapTypeInfo) type;
return typeContainsTimestamp(mapTypeInfo.getMapKeyTypeInfo())
|| typeContainsTimestamp(mapTypeInfo.getMapValueTypeInfo());
case STRUCT:
StructTypeInfo structTypeInfo = (StructTypeInfo) type;
return structTypeInfo.getAllStructFieldTypeInfos().stream()
.anyMatch(HoodieColumnProjectionUtils::typeContainsTimestamp);
case UNION:
UnionTypeInfo unionTypeInfo = (UnionTypeInfo) type;
return unionTypeInfo.getAllUnionObjectTypeInfos().stream()
.anyMatch(HoodieColumnProjectionUtils::typeContainsTimestamp);
default:
return false;
}
}
}