/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.io.parquet.read;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.ql.io.parquet.convert.DataWritableRecordConverter;
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.ql.optimizer.FieldNode;
import org.apache.hadoop.hive.ql.optimizer.NestedColumnFieldPruningUtils;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.StringUtils;
import org.apache.parquet.hadoop.api.InitContext;
import org.apache.parquet.hadoop.api.ReadSupport;
import org.apache.parquet.io.api.RecordMaterializer;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Type.Repetition;
import org.apache.parquet.schema.Types;

/**
 *
 * A DataWritableReadSupport
 *
 * Manages the translation between Hive and Parquet
 *
 */
public class DataWritableReadSupport extends ReadSupport<ArrayWritable> {

  public static final String HIVE_TABLE_AS_PARQUET_SCHEMA = "HIVE_TABLE_SCHEMA";
  public static final String PARQUET_COLUMN_INDEX_ACCESS = "parquet.column.index.access";
  private TypeInfo hiveTypeInfo;
  /**
   * Returns the list of column names parsed from a comma-separated string of
   * column names, with Hive virtual columns removed.
   *
   * @param columns Comma separated list of columns
   * @return List of column names with virtual columns removed
   */
  public static List<String> getColumnNames(final String columns) {
    return (List<String>) VirtualColumn.
        removeVirtualColumns(StringUtils.getStringCollection(columns));
  }

  /**
   * Returns a list of TypeInfo objects from a string that contains column
   * type names.
   *
   * @param types Comma separated list of types
   * @return A list of TypeInfo objects.
   */
  public static List<TypeInfo> getColumnTypes(final String types) {
    return TypeInfoUtils.getTypeInfosFromTypeString(types);
  }
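
  // Illustrative usage (not part of the original class; values assumed): the
  // column metadata arrives as plain strings from the job configuration, e.g.
  //
  //   List<String> names = getColumnNames("id,name,BLOCK__OFFSET__INSIDE__FILE");
  //   // -> ["id", "name"]   (the Hive virtual column is stripped)
  //   List<TypeInfo> types = getColumnTypes("int,string");
  //   // -> [TypeInfoFactory.intTypeInfo, TypeInfoFactory.stringTypeInfo]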

  /**
   * Searches for a fieldName in a Parquet GroupType, ignoring case.
   * GroupType#getType(String fieldName) is case sensitive, so we use this method instead.
   *
   * @param groupType Group of field types in which to search for fieldName
   * @param fieldName The name of the field being searched for
   * @return The Type object of the field if found; null otherwise.
   */
  private static Type getFieldTypeIgnoreCase(GroupType groupType, String fieldName) {
    for (Type type : groupType.getFields()) {
      if (type.getName().equalsIgnoreCase(fieldName)) {
        return type;
      }
    }

    return null;
  }

  /**
   * Searches for column names in a given Parquet group schema and returns their
   * corresponding Parquet schema types.
   *
   * @param schema Group schema in which to search for column names.
   * @param colNames List of column names.
   * @param colTypes List of column types.
   * @return List of Type objects of projected columns.
   */
  private static List<Type> getProjectedGroupFields(GroupType schema, List<String> colNames,
      List<TypeInfo> colTypes) {
    List<Type> schemaTypes = new ArrayList<Type>();

    ListIterator<String> columnIterator = colNames.listIterator();
    while (columnIterator.hasNext()) {
      TypeInfo colType = colTypes.get(columnIterator.nextIndex());
      String colName = columnIterator.next();

      Type fieldType = getFieldTypeIgnoreCase(schema, colName);
      if (fieldType == null) {
        schemaTypes.add(Types.optional(PrimitiveTypeName.BINARY).named(colName));
      } else {
        schemaTypes.add(getProjectedType(colType, fieldType));
      }
    }

    return schemaTypes;
  }

  private static Type getProjectedType(TypeInfo colType, Type fieldType) {
    switch (colType.getCategory()) {
      case STRUCT:
        List<Type> groupFields = getProjectedGroupFields(
          fieldType.asGroupType(),
          ((StructTypeInfo) colType).getAllStructFieldNames(),
          ((StructTypeInfo) colType).getAllStructFieldTypeInfos()
        );

        Type[] typesArray = groupFields.toArray(new Type[0]);
        return Types.buildGroup(fieldType.getRepetition())
          .addFields(typesArray)
          .named(fieldType.getName());
      case LIST:
        TypeInfo elemType = ((ListTypeInfo) colType).getListElementTypeInfo();
        if (elemType.getCategory() == ObjectInspector.Category.STRUCT) {
          Type subFieldType = fieldType.asGroupType().getType(0);
          if (!subFieldType.isPrimitive()) {
            String subFieldName = subFieldType.getName();
            Text name = new Text(subFieldName);
            if (name.equals(ParquetHiveSerDe.ARRAY) || name.equals(ParquetHiveSerDe.LIST)) {
              subFieldType = new GroupType(Repetition.REPEATED, subFieldName,
                getProjectedType(elemType, subFieldType.asGroupType().getType(0)));
            } else {
              subFieldType = getProjectedType(elemType, subFieldType);
            }
            return Types.buildGroup(Repetition.OPTIONAL).as(OriginalType.LIST).addFields(
              subFieldType).named(fieldType.getName());
          }
        }
        break;
      default:
    }
    return fieldType;
  }
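
  // Illustrative example (assumed, not from the original source): given a file
  // schema field
  //   optional group a { optional int32 x; optional int32 y; }
  // and a requested Hive type struct<x:int>, getProjectedType returns
  //   optional group a { optional int32 x; }
  // Struct fields requested by Hive but missing from the file schema come back
  // from getProjectedGroupFields as optional binary placeholders instead.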

  /**
   * Searches for column names in a given Parquet message schema and returns the projected
   * Parquet schema types.
   *
   * @param schema Message type schema where to search for column names.
   * @param colNames List of column names.
   * @param colTypes List of column types.
   * @return A MessageType object of projected columns.
   */
  public static MessageType getSchemaByName(MessageType schema, List<String> colNames,
      List<TypeInfo> colTypes) {
    List<Type> projectedFields = getProjectedGroupFields(schema, colNames, colTypes);
    Type[] typesArray = projectedFields.toArray(new Type[0]);

    return Types.buildMessage()
        .addFields(typesArray)
        .named(schema.getName());
  }
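
  // Usage sketch (assumed): projecting two columns out of a wider file schema.
  //
  //   MessageType fileSchema = MessageTypeParser.parseMessageType(
  //       "message t { optional int32 id; optional binary name; optional int32 age; }");
  //   MessageType projected = getSchemaByName(fileSchema,
  //       Arrays.asList("id", "name"),
  //       Arrays.asList(TypeInfoFactory.intTypeInfo, TypeInfoFactory.stringTypeInfo));
  //   // projected == message t { optional int32 id; optional binary name; }
  //
  // MessageTypeParser (org.apache.parquet.schema) is used here only for
  // illustration.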

  /**
   * Searches for columns by index in a given Parquet file schema, and returns their
   * corresponding Parquet schema types.
   *
   * @param schema Message schema where to search for column names.
   * @param colNames List of column names.
   * @param colIndexes List of column indexes.
   * @return A MessageType object of the column names found.
   */
  public static MessageType getSchemaByIndex(MessageType schema, List<String> colNames,
      List<Integer> colIndexes) {
    List<Type> schemaTypes = new ArrayList<Type>();

    for (Integer i : colIndexes) {
      if (i < colNames.size()) {
        if (i < schema.getFieldCount()) {
          schemaTypes.add(schema.getType(i));
        } else {
          //prefixing with '_mask_' to ensure no conflict with named
          //columns in the file schema
          schemaTypes.add(
            Types.optional(PrimitiveTypeName.BINARY).named("_mask_" + colNames.get(i)));
        }
      }
    }

    return new MessageType(schema.getName(), schemaTypes);
  }
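
  // Illustrative example (assumed): if the file schema has two fields but index 2
  // is requested for a column named "extra", the result contains the placeholder
  //   optional binary _mask_extra;
  // so the reader can materialize nulls without colliding with real file columns.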

  /**
   * Generates the projected schema from colIndexes and nested column paths. If a column is not
   * subject to nested pruning, it is added directly; otherwise a group type is built which
   * contains only the required sub-fields, using nestedColumnPaths.
   * @param schema original schema
   * @param colNames list of column names
   * @param colIndexes the indexes of the needed columns
   * @param nestedColumnPaths the paths for nested columns
   * @return a MessageType object of the projected columns
   */
  public static MessageType getProjectedSchema(
    MessageType schema,
    List<String> colNames,
    List<Integer> colIndexes,
    Set<String> nestedColumnPaths) {
    List<Type> schemaTypes = new ArrayList<Type>();

    Map<String, FieldNode> prunedCols = getPrunedNestedColumns(nestedColumnPaths);
    for (Integer i : colIndexes) {
      if (i < colNames.size()) {
        if (i < schema.getFieldCount()) {
          Type t = schema.getType(i);
          String tn = t.getName().toLowerCase();
          if (!prunedCols.containsKey(tn)) {
            schemaTypes.add(schema.getType(i));
          } else {
            if (t.isPrimitive()) {
              // For primitive type, add directly.
              schemaTypes.add(t);
            } else {
              // For group type, we need to build the projected group type with required leaves
              List<Type> g =
                projectLeafTypes(Arrays.asList(t), Arrays.asList(prunedCols.get(tn)));
              if (!g.isEmpty()) {
                schemaTypes.addAll(g);
              }
            }
          }
        } else {
          //prefixing with '_mask_' to ensure no conflict with named
          //columns in the file schema
          schemaTypes.add(Types.optional(PrimitiveTypeName.BINARY).named("_mask_" + colNames.get(i)));
        }
      }
    }

    return new MessageType(schema.getName(), schemaTypes);
  }
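
  // Illustrative example (assumed): with a file schema
  //   message t { optional group a { optional int32 x; optional int32 y; }
  //               optional int32 b; }
  // colIndexes = [0] and nestedColumnPaths = {"a.x"}, the projected schema is
  //   message t { optional group a { optional int32 x; } }
  // Column 'b' is dropped by the index projection and 'a.y' by nested pruning.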

  /**
   * Returns the columns which contain the required nested attribute levels.
   * E.g., given a struct a:<x:int, y:int> where 'x' is required and 'y' is not, the method
   * will return a pruned struct for 'a' which only contains the attribute 'x'.
   *
   * @param nestedColPaths the paths of the required nested attributes
   * @return a map from each column to its selected nested column paths; the keys are all lower-cased
   */
  private static Map<String, FieldNode> getPrunedNestedColumns(Set<String> nestedColPaths) {
    Map<String, FieldNode> resMap = new HashMap<>();
    if (nestedColPaths.isEmpty()) {
      return resMap;
    }
    for (String s : nestedColPaths) {
      String c = StringUtils.split(s, '.')[0].toLowerCase();
      if (!resMap.containsKey(c)) {
        FieldNode f = NestedColumnFieldPruningUtils.addNodeByPath(null, s);
        resMap.put(c, f);
      } else {
        resMap.put(c, NestedColumnFieldPruningUtils.addNodeByPath(resMap.get(c), s));
      }
    }
    return resMap;
  }
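
  // Illustrative example (assumed): nestedColPaths = {"a.x", "a.y.z"} yields a
  // single entry mapping "a" to the FieldNode tree a{x, y{z}}, so later pruning
  // keeps exactly those leaves (the tree form shown is informal shorthand).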

  private static GroupType buildProjectedGroupType(
    GroupType originalType,
    List<Type> types) {
    if (types == null || types.isEmpty()) {
      return null;
    }
    return new GroupType(originalType.getRepetition(), originalType.getName(), types);
  }

  private static List<Type> projectLeafTypes(
    List<Type> types,
    List<FieldNode> nodes) {
    List<Type> res = new ArrayList<>();
    if (nodes.isEmpty()) {
      return res;
    }
    Map<String, FieldNode> fieldMap = new HashMap<>();
    for (FieldNode n : nodes) {
      fieldMap.put(n.getFieldName().toLowerCase(), n);
    }
    for (Type type : types) {
      String tn = type.getName().toLowerCase();

      if (fieldMap.containsKey(tn)) {
        FieldNode f = fieldMap.get(tn);
        if (f.getNodes().isEmpty()) {
          // no child, no need for pruning
          res.add(type);
        } else {
          if (type instanceof GroupType) {
            GroupType groupType = type.asGroupType();
            List<Type> ts = projectLeafTypes(groupType.getFields(), f.getNodes());
            GroupType g = buildProjectedGroupType(groupType, ts);
            if (g != null) {
              res.add(g);
            }
          } else {
            throw new RuntimeException(
              "Primitive type " + f.getFieldName() + " cannot be pruned by nested field paths: "
                + f.toString());
          }
        }
      }
    }
    return res;
  }

  /**
   * Creates the readContext for the Parquet side with the requested schema during the init phase.
   *
   * @param context the InitContext carrying the configuration and the Parquet file schema
   * @return the Parquet ReadContext
   */
  @Override
  public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(InitContext context) {
    Configuration configuration = context.getConfiguration();
    MessageType fileSchema = context.getFileSchema();
    String columnNames = configuration.get(IOConstants.COLUMNS);
    Map<String, String> contextMetadata = new HashMap<String, String>();
    boolean indexAccess = configuration.getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false);

    if (columnNames != null) {
      List<String> columnNamesList = getColumnNames(columnNames);
      String columnTypes = configuration.get(IOConstants.COLUMNS_TYPES);
      List<TypeInfo> columnTypesList = getColumnTypes(columnTypes);

      MessageType tableSchema =
        getRequestedSchemaForIndexAccess(indexAccess, columnNamesList, columnTypesList, fileSchema);

      contextMetadata.put(HIVE_TABLE_AS_PARQUET_SCHEMA, tableSchema.toString());
      contextMetadata.put(PARQUET_COLUMN_INDEX_ACCESS, String.valueOf(indexAccess));
      this.hiveTypeInfo = TypeInfoFactory.getStructTypeInfo(columnNamesList, columnTypesList);

      return new ReadContext(getRequestedPrunedSchema(columnNamesList, tableSchema, configuration),
        contextMetadata);
    } else {
      contextMetadata.put(HIVE_TABLE_AS_PARQUET_SCHEMA, fileSchema.toString());
      return new ReadContext(fileSchema, contextMetadata);
    }
  }
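
  // Illustrative configuration driving init() (property values assumed):
  //
  //   Configuration conf = new Configuration();
  //   conf.set(IOConstants.COLUMNS, "id,name");
  //   conf.set(IOConstants.COLUMNS_TYPES, "int,string");
  //   conf.setBoolean(PARQUET_COLUMN_INDEX_ACCESS, false);
  //
  // With these set, init() resolves the requested schema by name and records the
  // Hive table schema under HIVE_TABLE_AS_PARQUET_SCHEMA in the context metadata.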

  /**
   * Computes the requested schema; used by the vectorized code path.
   * @param indexAccess whether columns are resolved by position rather than by name
   * @param columnNamesList list of column names
   * @param columnTypesList list of column types
   * @param fileSchema the Parquet file schema
   * @param configuration the job configuration
   * @return the requested MessageType
   */
  public static MessageType getRequestedSchema(
    boolean indexAccess,
    List<String> columnNamesList,
    List<TypeInfo> columnTypesList,
    MessageType fileSchema,
    Configuration configuration) {
    MessageType tableSchema =
      getRequestedSchemaForIndexAccess(indexAccess, columnNamesList, columnTypesList, fileSchema);

    List<Integer> indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration);
    //TODO Duplicated code for init method since vectorization reader path doesn't support Nested
    // column pruning so far. See HIVE-15156
    if (!ColumnProjectionUtils.isReadAllColumns(configuration) && !indexColumnsWanted.isEmpty()) {
      return DataWritableReadSupport
        .getSchemaByIndex(tableSchema, columnNamesList, indexColumnsWanted);
    } else {
      return tableSchema;
    }
  }
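
  // Usage sketch for the vectorized path (assumed): callers typically pass column
  // metadata already parsed from the configuration, e.g.
  //
  //   MessageType requested = DataWritableReadSupport.getRequestedSchema(false,
  //       getColumnNames(conf.get(IOConstants.COLUMNS)),
  //       getColumnTypes(conf.get(IOConstants.COLUMNS_TYPES)), fileSchema, conf);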

  private static MessageType getRequestedSchemaForIndexAccess(
    boolean indexAccess,
    List<String> columnNamesList,
    List<TypeInfo> columnTypesList,
    MessageType fileSchema) {
    if (indexAccess) {
      List<Integer> indexSequence = new ArrayList<Integer>();

      // Generates a sequence list of indexes
      for (int i = 0; i < columnNamesList.size(); i++) {
        indexSequence.add(i);
      }

      return getSchemaByIndex(fileSchema, columnNamesList, indexSequence);
    } else {
      return getSchemaByName(fileSchema, columnNamesList, columnTypesList);
    }
  }

  private static MessageType getRequestedPrunedSchema(
    List<String> columnNamesList,
    MessageType fileSchema,
    Configuration configuration) {
    Set<String> groupPaths = ColumnProjectionUtils.getNestedColumnPaths(configuration);
    List<Integer> indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration);
    if (!ColumnProjectionUtils.isReadAllColumns(configuration) && !indexColumnsWanted.isEmpty()) {
      return getProjectedSchema(fileSchema, columnNamesList, indexColumnsWanted, groupPaths);
    } else {
      return fileSchema;
    }
  }

  /**
   *
   * Creates the Hive read support used to interpret Parquet data as Hive records.
   *
   * @param configuration the job configuration (unused)
   * @param keyValueMetaData the file key/value metadata
   * @param fileSchema the Parquet file schema (unused)
   * @param readContext containing the requested schema and the schema of the Hive table
   * @return the RecordMaterializer for Hive
   */
  @Override
  public RecordMaterializer<ArrayWritable> prepareForRead(final Configuration configuration,
      final Map<String, String> keyValueMetaData, final MessageType fileSchema,
          final org.apache.parquet.hadoop.api.ReadSupport.ReadContext readContext) {
    final Map<String, String> metadata = readContext.getReadSupportMetadata();
    if (metadata == null) {
      throw new IllegalStateException("ReadContext not initialized properly. " +
        "Don't know the Hive Schema.");
    }
    String key = HiveConf.ConfVars.HIVE_PARQUET_TIMESTAMP_SKIP_CONVERSION.varname;
    if (!metadata.containsKey(key)) {
      metadata.put(key, String.valueOf(HiveConf.getBoolVar(
        configuration, HiveConf.ConfVars.HIVE_PARQUET_TIMESTAMP_SKIP_CONVERSION)));
    }
    return new DataWritableRecordConverter(readContext.getRequestedSchema(), metadata, hiveTypeInfo);
  }
}