All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.orc.OrcUtils Maven / Gradle / Ivy

There is a newer version: 2.2.0
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.orc;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import com.google.common.collect.Lists;

public class OrcUtils {

  /**
   * Returns selected columns as a boolean array with true value set for specified column names.
   * The result will contain number of elements equal to flattened number of columns.
   * For example:
   * selectedColumns - a,b,c
   * allColumns - a,b,c,d
   * If column c is a complex type, say list and other types are primitives then result will
   * be [false, true, true, true, true, true, false]
   * Index 0 is the root element of the struct which is set to false by default, index 1,2
   * corresponds to columns a and b. Index 3,4 correspond to column c which is list and
   * index 5 correspond to column d. After flattening list gets 2 columns.
   *
   * @param selectedColumns - comma separated list of selected column names
   * @param schema       - object schema
   * @return - boolean array with true value set for the specified column names
   */
  public static boolean[] includeColumns(String selectedColumns,
                                         TypeDescription schema) {
    int numFlattenedCols = schema.getMaximumId();
    boolean[] results = new boolean[numFlattenedCols + 1];
    if ("*".equals(selectedColumns)) {
      Arrays.fill(results, true);
      return results;
    }
    if (selectedColumns != null &&
        schema.getCategory() == TypeDescription.Category.STRUCT) {
      List fieldNames = schema.getFieldNames();
      List fields = schema.getChildren();
      for (String column: selectedColumns.split((","))) {
        TypeDescription col = findColumn(column, fieldNames, fields);
        if (col != null) {
          for(int i=col.getId(); i <= col.getMaximumId(); ++i) {
            results[i] = true;
          }
        }
      }
    }
    return results;
  }

  private static TypeDescription findColumn(String columnName,
                                            List fieldNames,
                                            List fields) {
    int i = 0;
    for(String fieldName: fieldNames) {
      if (fieldName.equalsIgnoreCase(columnName)) {
        return fields.get(i);
      } else {
        i += 1;
      }
    }
    return null;
  }

  public static List getOrcTypes(TypeDescription typeDescr) {
    List result = Lists.newArrayList();
    appendOrcTypes(result, typeDescr);
    return result;
  }

  private static void appendOrcTypes(List result, TypeDescription typeDescr) {
    OrcProto.Type.Builder type = OrcProto.Type.newBuilder();
    List children = typeDescr.getChildren();
    switch (typeDescr.getCategory()) {
    case BOOLEAN:
      type.setKind(OrcProto.Type.Kind.BOOLEAN);
      break;
    case BYTE:
      type.setKind(OrcProto.Type.Kind.BYTE);
      break;
    case SHORT:
      type.setKind(OrcProto.Type.Kind.SHORT);
      break;
    case INT:
      type.setKind(OrcProto.Type.Kind.INT);
      break;
    case LONG:
      type.setKind(OrcProto.Type.Kind.LONG);
      break;
    case FLOAT:
      type.setKind(OrcProto.Type.Kind.FLOAT);
      break;
    case DOUBLE:
      type.setKind(OrcProto.Type.Kind.DOUBLE);
      break;
    case STRING:
      type.setKind(OrcProto.Type.Kind.STRING);
      break;
    case CHAR:
      type.setKind(OrcProto.Type.Kind.CHAR);
      type.setMaximumLength(typeDescr.getMaxLength());
      break;
    case VARCHAR:
      type.setKind(OrcProto.Type.Kind.VARCHAR);
      type.setMaximumLength(typeDescr.getMaxLength());
      break;
    case BINARY:
      type.setKind(OrcProto.Type.Kind.BINARY);
      break;
    case TIMESTAMP:
      type.setKind(OrcProto.Type.Kind.TIMESTAMP);
      break;
    case DATE:
      type.setKind(OrcProto.Type.Kind.DATE);
      break;
    case DECIMAL:
      type.setKind(OrcProto.Type.Kind.DECIMAL);
      type.setPrecision(typeDescr.getPrecision());
      type.setScale(typeDescr.getScale());
      break;
    case LIST:
      type.setKind(OrcProto.Type.Kind.LIST);
      type.addSubtypes(children.get(0).getId());
      break;
    case MAP:
      type.setKind(OrcProto.Type.Kind.MAP);
      for(TypeDescription t: children) {
        type.addSubtypes(t.getId());
      }
      break;
    case STRUCT:
      type.setKind(OrcProto.Type.Kind.STRUCT);
      for(TypeDescription t: children) {
        type.addSubtypes(t.getId());
      }
      for(String field: typeDescr.getFieldNames()) {
        type.addFieldNames(field);
      }
      break;
    case UNION:
      type.setKind(OrcProto.Type.Kind.UNION);
      for(TypeDescription t: children) {
        type.addSubtypes(t.getId());
      }
      break;
    default:
      throw new IllegalArgumentException("Unknown category: " +
          typeDescr.getCategory());
    }
    result.add(type.build());
    if (children != null) {
      for(TypeDescription child: children) {
        appendOrcTypes(result, child);
      }
    }
  }

  /**
   * NOTE: This method ignores the subtype numbers in the TypeDescription rebuilds the subtype
   * numbers based on the length of the result list being appended.
   *
   * @param result
   * @param typeDescr
   */
  public static void appendOrcTypesRebuildSubtypes(List result,
      TypeDescription typeDescr) {

    int subtype = result.size();
    OrcProto.Type.Builder type = OrcProto.Type.newBuilder();
    boolean needsAdd = true;
    List children = typeDescr.getChildren();
    switch (typeDescr.getCategory()) {
    case BOOLEAN:
      type.setKind(OrcProto.Type.Kind.BOOLEAN);
      break;
    case BYTE:
      type.setKind(OrcProto.Type.Kind.BYTE);
      break;
    case SHORT:
      type.setKind(OrcProto.Type.Kind.SHORT);
      break;
    case INT:
      type.setKind(OrcProto.Type.Kind.INT);
      break;
    case LONG:
      type.setKind(OrcProto.Type.Kind.LONG);
      break;
    case FLOAT:
      type.setKind(OrcProto.Type.Kind.FLOAT);
      break;
    case DOUBLE:
      type.setKind(OrcProto.Type.Kind.DOUBLE);
      break;
    case STRING:
      type.setKind(OrcProto.Type.Kind.STRING);
      break;
    case CHAR:
      type.setKind(OrcProto.Type.Kind.CHAR);
      type.setMaximumLength(typeDescr.getMaxLength());
      break;
    case VARCHAR:
      type.setKind(OrcProto.Type.Kind.VARCHAR);
      type.setMaximumLength(typeDescr.getMaxLength());
      break;
    case BINARY:
      type.setKind(OrcProto.Type.Kind.BINARY);
      break;
    case TIMESTAMP:
      type.setKind(OrcProto.Type.Kind.TIMESTAMP);
      break;
    case DATE:
      type.setKind(OrcProto.Type.Kind.DATE);
      break;
    case DECIMAL:
      type.setKind(OrcProto.Type.Kind.DECIMAL);
      type.setPrecision(typeDescr.getPrecision());
      type.setScale(typeDescr.getScale());
      break;
    case LIST:
      type.setKind(OrcProto.Type.Kind.LIST);
      type.addSubtypes(++subtype);
      result.add(type.build());
      needsAdd = false;
      appendOrcTypesRebuildSubtypes(result, children.get(0));
      break;
    case MAP:
      {
        // Make room for MAP type.
        result.add(null);
  
        // Add MAP type pair in order to determine their subtype values.
        appendOrcTypesRebuildSubtypes(result, children.get(0));
        int subtype2 = result.size();
        appendOrcTypesRebuildSubtypes(result, children.get(1));
        type.setKind(OrcProto.Type.Kind.MAP);
        type.addSubtypes(subtype + 1);
        type.addSubtypes(subtype2);
        result.set(subtype, type.build());
        needsAdd = false;
      }
      break;
    case STRUCT:
      {
        List fieldNames = typeDescr.getFieldNames();

        // Make room for STRUCT type.
        result.add(null);

        List fieldSubtypes = new ArrayList(fieldNames.size());
        for(TypeDescription child: children) {
          int fieldSubtype = result.size();
          fieldSubtypes.add(fieldSubtype);
          appendOrcTypesRebuildSubtypes(result, child);
        }

        type.setKind(OrcProto.Type.Kind.STRUCT);

        for (int i = 0 ; i < fieldNames.size(); i++) {
          type.addSubtypes(fieldSubtypes.get(i));
          type.addFieldNames(fieldNames.get(i));
        }
        result.set(subtype, type.build());
        needsAdd = false;
      }
      break;
    case UNION:
      {
        // Make room for UNION type.
        result.add(null);

        List unionSubtypes = new ArrayList(children.size());
        for(TypeDescription child: children) {
          int unionSubtype = result.size();
          unionSubtypes.add(unionSubtype);
          appendOrcTypesRebuildSubtypes(result, child);
        }

        type.setKind(OrcProto.Type.Kind.UNION);
        for (int i = 0 ; i < children.size(); i++) {
          type.addSubtypes(unionSubtypes.get(i));
        }
        result.set(subtype, type.build());
        needsAdd = false;
      }
      break;
    default:
      throw new IllegalArgumentException("Unknown category: " + typeDescr.getCategory());
    }
    if (needsAdd) {
      result.add(type.build());
    }
  }

  /**
   * NOTE: This method ignores the subtype numbers in the OrcProto.Type rebuilds the subtype
   * numbers based on the length of the result list being appended.
   *
   * @param result
   * @param types
   * @param columnId
   */
  public static int appendOrcTypesRebuildSubtypes(List result,
      List types, int columnId) {

    OrcProto.Type oldType = types.get(columnId++);

    int subtype = result.size();
    OrcProto.Type.Builder builder = OrcProto.Type.newBuilder();
    boolean needsAdd = true;
    switch (oldType.getKind()) {
    case BOOLEAN:
      builder.setKind(OrcProto.Type.Kind.BOOLEAN);
      break;
    case BYTE:
      builder.setKind(OrcProto.Type.Kind.BYTE);
      break;
    case SHORT:
      builder.setKind(OrcProto.Type.Kind.SHORT);
      break;
    case INT:
      builder.setKind(OrcProto.Type.Kind.INT);
      break;
    case LONG:
      builder.setKind(OrcProto.Type.Kind.LONG);
      break;
    case FLOAT:
      builder.setKind(OrcProto.Type.Kind.FLOAT);
      break;
    case DOUBLE:
      builder.setKind(OrcProto.Type.Kind.DOUBLE);
      break;
    case STRING:
      builder.setKind(OrcProto.Type.Kind.STRING);
      break;
    case CHAR:
      builder.setKind(OrcProto.Type.Kind.CHAR);
      builder.setMaximumLength(oldType.getMaximumLength());
      break;
    case VARCHAR:
      builder.setKind(OrcProto.Type.Kind.VARCHAR);
      builder.setMaximumLength(oldType.getMaximumLength());
      break;
    case BINARY:
      builder.setKind(OrcProto.Type.Kind.BINARY);
      break;
    case TIMESTAMP:
      builder.setKind(OrcProto.Type.Kind.TIMESTAMP);
      break;
    case DATE:
      builder.setKind(OrcProto.Type.Kind.DATE);
      break;
    case DECIMAL:
      builder.setKind(OrcProto.Type.Kind.DECIMAL);
      builder.setPrecision(oldType.getPrecision());
      builder.setScale(oldType.getScale());
      break;
    case LIST:
      builder.setKind(OrcProto.Type.Kind.LIST);
      builder.addSubtypes(++subtype);
      result.add(builder.build());
      needsAdd = false;
      columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
      break;
    case MAP:
      {
        // Make room for MAP type.
        result.add(null);
  
        // Add MAP type pair in order to determine their subtype values.
        columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
        int subtype2 = result.size();
        columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
        builder.setKind(OrcProto.Type.Kind.MAP);
        builder.addSubtypes(subtype + 1);
        builder.addSubtypes(subtype2);
        result.set(subtype, builder.build());
        needsAdd = false;
      }
      break;
    case STRUCT:
      {
        List fieldNames = oldType.getFieldNamesList();

        // Make room for STRUCT type.
        result.add(null);

        List fieldSubtypes = new ArrayList(fieldNames.size());
        for(int i = 0 ; i < fieldNames.size(); i++) {
          int fieldSubtype = result.size();
          fieldSubtypes.add(fieldSubtype);
          columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
        }

        builder.setKind(OrcProto.Type.Kind.STRUCT);

        for (int i = 0 ; i < fieldNames.size(); i++) {
          builder.addSubtypes(fieldSubtypes.get(i));
          builder.addFieldNames(fieldNames.get(i));
        }
        result.set(subtype, builder.build());
        needsAdd = false;
      }
      break;
    case UNION:
      {
        int subtypeCount = oldType.getSubtypesCount();

        // Make room for UNION type.
        result.add(null);

        List unionSubtypes = new ArrayList(subtypeCount);
        for(int i = 0 ; i < subtypeCount; i++) {
          int unionSubtype = result.size();
          unionSubtypes.add(unionSubtype);
          columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
        }

        builder.setKind(OrcProto.Type.Kind.UNION);
        for (int i = 0 ; i < subtypeCount; i++) {
          builder.addSubtypes(unionSubtypes.get(i));
        }
        result.set(subtype, builder.build());
        needsAdd = false;
      }
      break;
    default:
      throw new IllegalArgumentException("Unknown category: " + oldType.getKind());
    }
    if (needsAdd) {
      result.add(builder.build());
    }
    return columnId;
  }

  /**
   * Translate the given rootColumn from the list of types to a TypeDescription.
   * @param types all of the types
   * @param rootColumn translate this type
   * @return a new TypeDescription that matches the given rootColumn
   */
  public static
        TypeDescription convertTypeFromProtobuf(List types,
                                                int rootColumn) {
    OrcProto.Type type = types.get(rootColumn);
    switch (type.getKind()) {
      case BOOLEAN:
        return TypeDescription.createBoolean();
      case BYTE:
        return TypeDescription.createByte();
      case SHORT:
        return TypeDescription.createShort();
      case INT:
        return TypeDescription.createInt();
      case LONG:
        return TypeDescription.createLong();
      case FLOAT:
        return TypeDescription.createFloat();
      case DOUBLE:
        return TypeDescription.createDouble();
      case STRING:
        return TypeDescription.createString();
      case CHAR:
      case VARCHAR: {
        TypeDescription result = type.getKind() == OrcProto.Type.Kind.CHAR ?
            TypeDescription.createChar() : TypeDescription.createVarchar();
        if (type.hasMaximumLength()) {
          result.withMaxLength(type.getMaximumLength());
        }
        return result;
      }
      case BINARY:
        return TypeDescription.createBinary();
      case TIMESTAMP:
        return TypeDescription.createTimestamp();
      case DATE:
        return TypeDescription.createDate();
      case DECIMAL: {
        TypeDescription result = TypeDescription.createDecimal();
        if (type.hasScale()) {
          result.withScale(type.getScale());
        }
        if (type.hasPrecision()) {
          result.withPrecision(type.getPrecision());
        }
        return result;
      }
      case LIST:
        return TypeDescription.createList(
            convertTypeFromProtobuf(types, type.getSubtypes(0)));
      case MAP:
        return TypeDescription.createMap(
            convertTypeFromProtobuf(types, type.getSubtypes(0)),
            convertTypeFromProtobuf(types, type.getSubtypes(1)));
      case STRUCT: {
        TypeDescription result = TypeDescription.createStruct();
        for(int f=0; f < type.getSubtypesCount(); ++f) {
          result.addField(type.getFieldNames(f),
              convertTypeFromProtobuf(types, type.getSubtypes(f)));
        }
        return result;
      }
      case UNION: {
        TypeDescription result = TypeDescription.createUnion();
        for(int f=0; f < type.getSubtypesCount(); ++f) {
          result.addUnionChild(
              convertTypeFromProtobuf(types, type.getSubtypes(f)));
        }
        return result;
      }
    }
    throw new IllegalArgumentException("Unknown ORC type " + type.getKind());
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy