All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.orc.OrcUtils Maven / Gradle / Ivy

There is a newer version: 3.1.2-23
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.orc;

import org.apache.orc.impl.ReaderImpl;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class OrcUtils {

  /**
   * Returns the selected columns as a boolean array indexed by flattened
   * column id, with {@code true} set for every selected column and for every
   * id up to that column's maximum id.
   * The result has {@code schema.getMaximumId() + 1} elements.
   * For example:
   * selectedColumns - a,b,c
   * allColumns - a,b,c,d
   * If column c is a complex type, say {@code list<string>}, and the other
   * types are primitives then the result will be
   * [false, true, true, true, true, false]
   * Index 0 is the root element of the struct, which is not set by a name
   * selection; indexes 1 and 2 correspond to columns a and b. Indexes 3 and 4
   * correspond to column c, because {@code list<string>} flattens into 2
   * columns, and index 5 corresponds to the unselected column d.
   * <p>
   * Names are compared ignoring case and surrounding whitespace. Names that
   * match no top-level field are silently skipped. If
   * {@code selectedColumns} is {@code null}, or the schema is not a struct,
   * every element of the result is {@code false}.
   *
   * @param selectedColumns comma separated list of selected column names, or
   *                        {@code "*"} to select every column (including the
   *                        root)
   * @param schema          object schema
   * @return boolean array with true value set for the specified column names
   */
  public static boolean[] includeColumns(String selectedColumns,
                                         TypeDescription schema) {
    int numFlattenedCols = schema.getMaximumId();
    boolean[] results = new boolean[numFlattenedCols + 1];
    if ("*".equals(selectedColumns)) {
      Arrays.fill(results, true);
      return results;
    }
    if (selectedColumns != null &&
        schema.getCategory() == TypeDescription.Category.STRUCT) {
      List<String> fieldNames = schema.getFieldNames();
      List<TypeDescription> fields = schema.getChildren();
      for (String column : selectedColumns.split(",")) {
        // Accept "a, b" style lists: whitespace around a name is not part of it.
        TypeDescription col = findColumn(column.trim(), fieldNames, fields);
        if (col != null) {
          // Mark the column itself and every id up to its maximum id.
          for (int i = col.getId(); i <= col.getMaximumId(); ++i) {
            results[i] = true;
          }
        }
      }
    }
    return results;
  }

  /**
   * Looks up a field by name, ignoring case, and returns the entry of
   * {@code fields} at the same position as the matching name.
   *
   * @param columnName the name to look for
   * @param fieldNames the field names, parallel to {@code fields}
   * @param fields the field types, parallel to {@code fieldNames}
   * @return the type of the first field whose name matches, or {@code null}
   *         if no name matches
   */
  private static TypeDescription findColumn(String columnName,
                                            List<String> fieldNames,
                                            List<TypeDescription> fields) {
    for (int i = 0; i < fieldNames.size(); ++i) {
      if (fieldNames.get(i).equalsIgnoreCase(columnName)) {
        return fields.get(i);
      }
    }
    return null;
  }

  /**
   * Converts a type tree into a new list of protobuf types: the given type
   * first, followed by its descendants.
   *
   * @param typeDescr the root of the type tree to convert
   * @return a new list holding the protobuf form of {@code typeDescr} and of
   *         every type below it
   */
  public static List<OrcProto.Type> getOrcTypes(TypeDescription typeDescr) {
    List<OrcProto.Type> result = new ArrayList<>();
    appendOrcTypes(result, typeDescr);
    return result;
  }

  /**
   * Appends the protobuf form of {@code typeDescr} to {@code result} and then
   * recursively appends each of its children, so a parent always precedes its
   * descendants. Subtype references are taken from the ids reported by the
   * children ({@code getId()}).
   *
   * @param result the list the protobuf types are appended to
   * @param typeDescr the type to convert
   * @throws IllegalArgumentException if the category of a type is not handled
   */
  private static void appendOrcTypes(List<OrcProto.Type> result,
                                     TypeDescription typeDescr) {
    OrcProto.Type.Builder type = OrcProto.Type.newBuilder();
    List<TypeDescription> children = typeDescr.getChildren();
    switch (typeDescr.getCategory()) {
    case BOOLEAN:
      type.setKind(OrcProto.Type.Kind.BOOLEAN);
      break;
    case BYTE:
      type.setKind(OrcProto.Type.Kind.BYTE);
      break;
    case SHORT:
      type.setKind(OrcProto.Type.Kind.SHORT);
      break;
    case INT:
      type.setKind(OrcProto.Type.Kind.INT);
      break;
    case LONG:
      type.setKind(OrcProto.Type.Kind.LONG);
      break;
    case FLOAT:
      type.setKind(OrcProto.Type.Kind.FLOAT);
      break;
    case DOUBLE:
      type.setKind(OrcProto.Type.Kind.DOUBLE);
      break;
    case STRING:
      type.setKind(OrcProto.Type.Kind.STRING);
      break;
    case CHAR:
      type.setKind(OrcProto.Type.Kind.CHAR);
      type.setMaximumLength(typeDescr.getMaxLength());
      break;
    case VARCHAR:
      type.setKind(OrcProto.Type.Kind.VARCHAR);
      type.setMaximumLength(typeDescr.getMaxLength());
      break;
    case BINARY:
      type.setKind(OrcProto.Type.Kind.BINARY);
      break;
    case TIMESTAMP:
      type.setKind(OrcProto.Type.Kind.TIMESTAMP);
      break;
    case DATE:
      type.setKind(OrcProto.Type.Kind.DATE);
      break;
    case DECIMAL:
      type.setKind(OrcProto.Type.Kind.DECIMAL);
      type.setPrecision(typeDescr.getPrecision());
      type.setScale(typeDescr.getScale());
      break;
    case LIST:
      type.setKind(OrcProto.Type.Kind.LIST);
      // Only the first child is referenced as the element type.
      type.addSubtypes(children.get(0).getId());
      break;
    case MAP:
      type.setKind(OrcProto.Type.Kind.MAP);
      for (TypeDescription t : children) {
        type.addSubtypes(t.getId());
      }
      break;
    case STRUCT:
      type.setKind(OrcProto.Type.Kind.STRUCT);
      for (TypeDescription t : children) {
        type.addSubtypes(t.getId());
      }
      for (String field : typeDescr.getFieldNames()) {
        type.addFieldNames(field);
      }
      break;
    case UNION:
      type.setKind(OrcProto.Type.Kind.UNION);
      for (TypeDescription t : children) {
        type.addSubtypes(t.getId());
      }
      break;
    default:
      throw new IllegalArgumentException("Unknown category: " +
          typeDescr.getCategory());
    }
    // The parent is emitted before any of its children.
    result.add(type.build());
    if (children != null) {
      for (TypeDescription child : children) {
        appendOrcTypes(result, child);
      }
    }
  }

  /**
   * Appends the protobuf form of {@code typeDescr} and of all of its
   * descendants to {@code result}.
   * <p>
   * NOTE: This method ignores the subtype numbers in the TypeDescription and
   * rebuilds the subtype numbers based on the length of the result list being
   * appended. Each type's number is the index it occupies in {@code result}.
   *
   * @param result the list the protobuf types are appended to; it may already
   *               contain entries, in which case numbering continues from its
   *               current size
   * @param typeDescr the type to convert
   * @throws IllegalArgumentException if the category of a type is not handled
   */
  public static void appendOrcTypesRebuildSubtypes(List<OrcProto.Type> result,
      TypeDescription typeDescr) {

    // Index this type will occupy in result.
    int subtype = result.size();
    OrcProto.Type.Builder type = OrcProto.Type.newBuilder();
    // Compound types insert themselves; primitives are appended at the end.
    boolean needsAdd = true;
    List<TypeDescription> children = typeDescr.getChildren();
    switch (typeDescr.getCategory()) {
    case BOOLEAN:
      type.setKind(OrcProto.Type.Kind.BOOLEAN);
      break;
    case BYTE:
      type.setKind(OrcProto.Type.Kind.BYTE);
      break;
    case SHORT:
      type.setKind(OrcProto.Type.Kind.SHORT);
      break;
    case INT:
      type.setKind(OrcProto.Type.Kind.INT);
      break;
    case LONG:
      type.setKind(OrcProto.Type.Kind.LONG);
      break;
    case FLOAT:
      type.setKind(OrcProto.Type.Kind.FLOAT);
      break;
    case DOUBLE:
      type.setKind(OrcProto.Type.Kind.DOUBLE);
      break;
    case STRING:
      type.setKind(OrcProto.Type.Kind.STRING);
      break;
    case CHAR:
      type.setKind(OrcProto.Type.Kind.CHAR);
      type.setMaximumLength(typeDescr.getMaxLength());
      break;
    case VARCHAR:
      type.setKind(OrcProto.Type.Kind.VARCHAR);
      type.setMaximumLength(typeDescr.getMaxLength());
      break;
    case BINARY:
      type.setKind(OrcProto.Type.Kind.BINARY);
      break;
    case TIMESTAMP:
      type.setKind(OrcProto.Type.Kind.TIMESTAMP);
      break;
    case DATE:
      type.setKind(OrcProto.Type.Kind.DATE);
      break;
    case DECIMAL:
      type.setKind(OrcProto.Type.Kind.DECIMAL);
      type.setPrecision(typeDescr.getPrecision());
      type.setScale(typeDescr.getScale());
      break;
    case LIST:
      type.setKind(OrcProto.Type.Kind.LIST);
      // The element type lands directly after the list itself.
      type.addSubtypes(++subtype);
      result.add(type.build());
      needsAdd = false;
      appendOrcTypesRebuildSubtypes(result, children.get(0));
      break;
    case MAP:
      {
        // Make room for MAP type.
        result.add(null);

        // Add MAP type pair in order to determine their subtype values.
        appendOrcTypesRebuildSubtypes(result, children.get(0));
        int subtype2 = result.size();
        appendOrcTypesRebuildSubtypes(result, children.get(1));
        type.setKind(OrcProto.Type.Kind.MAP);
        type.addSubtypes(subtype + 1);
        type.addSubtypes(subtype2);
        // Replace the placeholder now that both child positions are known.
        result.set(subtype, type.build());
        needsAdd = false;
      }
      break;
    case STRUCT:
      {
        List<String> fieldNames = typeDescr.getFieldNames();

        // Make room for STRUCT type.
        result.add(null);

        List<Integer> fieldSubtypes = new ArrayList<>(fieldNames.size());
        for (TypeDescription child : children) {
          int fieldSubtype = result.size();
          fieldSubtypes.add(fieldSubtype);
          appendOrcTypesRebuildSubtypes(result, child);
        }

        type.setKind(OrcProto.Type.Kind.STRUCT);

        for (int i = 0; i < fieldNames.size(); i++) {
          type.addSubtypes(fieldSubtypes.get(i));
          type.addFieldNames(fieldNames.get(i));
        }
        // Replace the placeholder now that every field position is known.
        result.set(subtype, type.build());
        needsAdd = false;
      }
      break;
    case UNION:
      {
        // Make room for UNION type.
        result.add(null);

        List<Integer> unionSubtypes = new ArrayList<>(children.size());
        for (TypeDescription child : children) {
          int unionSubtype = result.size();
          unionSubtypes.add(unionSubtype);
          appendOrcTypesRebuildSubtypes(result, child);
        }

        type.setKind(OrcProto.Type.Kind.UNION);
        for (int i = 0; i < children.size(); i++) {
          type.addSubtypes(unionSubtypes.get(i));
        }
        // Replace the placeholder now that every child position is known.
        result.set(subtype, type.build());
        needsAdd = false;
      }
      break;
    default:
      throw new IllegalArgumentException("Unknown category: " + typeDescr.getCategory());
    }
    if (needsAdd) {
      result.add(type.build());
    }
  }

  /**
   * Copies the type found at {@code types.get(columnId)}, together with the
   * entries that follow it and are consumed as its descendants, to the end of
   * {@code result}.
   * <p>
   * NOTE: This method ignores the subtype numbers in the OrcProto.Type and
   * rebuilds the subtype numbers based on the length of the result list being
   * appended. Children are read from {@code types} sequentially, directly
   * after their parent; the number of children read is 1 for a list, 2 for a
   * map, the field name count for a struct and the subtype count for a union.
   *
   * @param result the list the rebuilt protobuf types are appended to
   * @param types the source list of protobuf types
   * @param columnId the index in {@code types} of the type to copy
   * @return the index in {@code types} following the last entry consumed
   * @throws IllegalArgumentException if the kind of a type is not handled
   */
  public static int appendOrcTypesRebuildSubtypes(List<OrcProto.Type> result,
      List<OrcProto.Type> types, int columnId) {

    OrcProto.Type oldType = types.get(columnId++);

    // Index the rebuilt type will occupy in result.
    int subtype = result.size();
    OrcProto.Type.Builder builder = OrcProto.Type.newBuilder();
    // Compound types insert themselves; primitives are appended at the end.
    boolean needsAdd = true;
    switch (oldType.getKind()) {
    case BOOLEAN:
      builder.setKind(OrcProto.Type.Kind.BOOLEAN);
      break;
    case BYTE:
      builder.setKind(OrcProto.Type.Kind.BYTE);
      break;
    case SHORT:
      builder.setKind(OrcProto.Type.Kind.SHORT);
      break;
    case INT:
      builder.setKind(OrcProto.Type.Kind.INT);
      break;
    case LONG:
      builder.setKind(OrcProto.Type.Kind.LONG);
      break;
    case FLOAT:
      builder.setKind(OrcProto.Type.Kind.FLOAT);
      break;
    case DOUBLE:
      builder.setKind(OrcProto.Type.Kind.DOUBLE);
      break;
    case STRING:
      builder.setKind(OrcProto.Type.Kind.STRING);
      break;
    case CHAR:
      builder.setKind(OrcProto.Type.Kind.CHAR);
      builder.setMaximumLength(oldType.getMaximumLength());
      break;
    case VARCHAR:
      builder.setKind(OrcProto.Type.Kind.VARCHAR);
      builder.setMaximumLength(oldType.getMaximumLength());
      break;
    case BINARY:
      builder.setKind(OrcProto.Type.Kind.BINARY);
      break;
    case TIMESTAMP:
      builder.setKind(OrcProto.Type.Kind.TIMESTAMP);
      break;
    case DATE:
      builder.setKind(OrcProto.Type.Kind.DATE);
      break;
    case DECIMAL:
      builder.setKind(OrcProto.Type.Kind.DECIMAL);
      builder.setPrecision(oldType.getPrecision());
      builder.setScale(oldType.getScale());
      break;
    case LIST:
      builder.setKind(OrcProto.Type.Kind.LIST);
      // The element type lands directly after the list itself.
      builder.addSubtypes(++subtype);
      result.add(builder.build());
      needsAdd = false;
      columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
      break;
    case MAP:
      {
        // Make room for MAP type.
        result.add(null);

        // Add MAP type pair in order to determine their subtype values.
        columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
        int subtype2 = result.size();
        columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
        builder.setKind(OrcProto.Type.Kind.MAP);
        builder.addSubtypes(subtype + 1);
        builder.addSubtypes(subtype2);
        // Replace the placeholder now that both child positions are known.
        result.set(subtype, builder.build());
        needsAdd = false;
      }
      break;
    case STRUCT:
      {
        List<String> fieldNames = oldType.getFieldNamesList();

        // Make room for STRUCT type.
        result.add(null);

        List<Integer> fieldSubtypes = new ArrayList<>(fieldNames.size());
        for (int i = 0; i < fieldNames.size(); i++) {
          int fieldSubtype = result.size();
          fieldSubtypes.add(fieldSubtype);
          columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
        }

        builder.setKind(OrcProto.Type.Kind.STRUCT);

        for (int i = 0; i < fieldNames.size(); i++) {
          builder.addSubtypes(fieldSubtypes.get(i));
          builder.addFieldNames(fieldNames.get(i));
        }
        // Replace the placeholder now that every field position is known.
        result.set(subtype, builder.build());
        needsAdd = false;
      }
      break;
    case UNION:
      {
        int subtypeCount = oldType.getSubtypesCount();

        // Make room for UNION type.
        result.add(null);

        List<Integer> unionSubtypes = new ArrayList<>(subtypeCount);
        for (int i = 0; i < subtypeCount; i++) {
          int unionSubtype = result.size();
          unionSubtypes.add(unionSubtype);
          columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
        }

        builder.setKind(OrcProto.Type.Kind.UNION);
        for (int i = 0; i < subtypeCount; i++) {
          builder.addSubtypes(unionSubtypes.get(i));
        }
        // Replace the placeholder now that every child position is known.
        result.set(subtype, builder.build());
        needsAdd = false;
      }
      break;
    default:
      throw new IllegalArgumentException("Unknown category: " + oldType.getKind());
    }
    if (needsAdd) {
      result.add(builder.build());
    }
    return columnId;
  }

  /**
   * Checks whether the list of protobuf types from the file are valid or not.
   * A tree is accepted when every type has a kind, lists have exactly one
   * child, maps have exactly two, primitive types have none, and the children
   * of each type are numbered consecutively right after it.
   *
   * @param types the list of types from the protobuf
   * @param root the top of the tree to check
   * @return the next available id, i.e. the id following the last type that
   *         belongs to the tree rooted at {@code root}
   * @throws java.io.IOException if the tree is invalid
   */
  public static int isValidTypeTree(List<OrcProto.Type> types,
                                    int root) throws IOException  {
    if (root < 0 || root >= types.size()) {
      throw new IOException("Illegal type id " + root +
          ". The valid range is 0 to " + (types.size() - 1));
    }
    OrcProto.Type rootType = types.get(root);
    // Id the first child must have; advanced past each validated subtree.
    int current = root+1;
    List<Integer> children = rootType.getSubtypesList();
    if (!rootType.hasKind()) {
      throw new IOException("Type " + root + " has an unknown kind.");
    }
    // ensure that we have the right number of children
    switch(rootType.getKind()) {
      case LIST:
        if (children == null || children.size() != 1) {
          throw new IOException("Wrong number of type children in list " + root);
        }
        break;
      case MAP:
        if (children == null || children.size() != 2) {
          throw new IOException("Wrong number of type children in map " + root);
        }
        break;
      case UNION:
      case STRUCT:
        // any number of children is accepted here
        break;
      default:
        if (children != null && children.size() != 0) {
          throw new IOException("Type children under primitive type " + root);
        }
    }
    // ensure the children are also correct
    if (children != null) {
      for (int child : children) {
        if (child != current) {
          throw new IOException("Unexpected child type id " + child + " when " +
              current + " was expected.");
        }
        current = isValidTypeTree(types, current);
      }
    }
    return current;
  }
  /**
   * Translate the given rootColumn from the list of types to a TypeDescription.
   * Compound types are translated recursively by following their subtype ids
   * into {@code types}.
   *
   * @param types all of the types
   * @param rootColumn translate this type
   * @return a new TypeDescription that matches the given rootColumn
   * @throws FileFormatException if a LIST does not have exactly one subtype,
   *         a MAP does not have exactly two, a UNION has none, or a STRUCT
   *         has fewer field names than subtypes
   * @throws IllegalArgumentException if the kind of a type is not handled
   */
  public static TypeDescription convertTypeFromProtobuf(
      List<OrcProto.Type> types,
      int rootColumn) throws FileFormatException {
    OrcProto.Type type = types.get(rootColumn);
    switch (type.getKind()) {
      case BOOLEAN:
        return TypeDescription.createBoolean();
      case BYTE:
        return TypeDescription.createByte();
      case SHORT:
        return TypeDescription.createShort();
      case INT:
        return TypeDescription.createInt();
      case LONG:
        return TypeDescription.createLong();
      case FLOAT:
        return TypeDescription.createFloat();
      case DOUBLE:
        return TypeDescription.createDouble();
      case STRING:
        return TypeDescription.createString();
      case CHAR:
      case VARCHAR: {
        TypeDescription result = type.getKind() == OrcProto.Type.Kind.CHAR ?
            TypeDescription.createChar() : TypeDescription.createVarchar();
        // Only override the length when the file recorded one.
        if (type.hasMaximumLength()) {
          result.withMaxLength(type.getMaximumLength());
        }
        return result;
      }
      case BINARY:
        return TypeDescription.createBinary();
      case TIMESTAMP:
        return TypeDescription.createTimestamp();
      case DATE:
        return TypeDescription.createDate();
      case DECIMAL: {
        TypeDescription result = TypeDescription.createDecimal();
        // Scale is applied before precision, as in the original ordering.
        if (type.hasScale()) {
          result.withScale(type.getScale());
        }
        if (type.hasPrecision()) {
          result.withPrecision(type.getPrecision());
        }
        return result;
      }
      case LIST:
        if (type.getSubtypesCount() != 1) {
          throw new FileFormatException("LIST type should contain exactly " +
                  "one subtype but has " + type.getSubtypesCount());
        }
        return TypeDescription.createList(
            convertTypeFromProtobuf(types, type.getSubtypes(0)));
      case MAP:
        if (type.getSubtypesCount() != 2) {
          throw new FileFormatException("MAP type should contain exactly " +
                  "two subtypes but has " + type.getSubtypesCount());
        }
        return TypeDescription.createMap(
            convertTypeFromProtobuf(types, type.getSubtypes(0)),
            convertTypeFromProtobuf(types, type.getSubtypes(1)));
      case STRUCT: {
        // Every subtype needs a name; report a malformed file instead of
        // failing with an index error on getFieldNames(f).
        if (type.getFieldNamesCount() < type.getSubtypesCount()) {
          throw new FileFormatException("STRUCT type has " +
                  type.getSubtypesCount() + " subtypes but only " +
                  type.getFieldNamesCount() + " field names");
        }
        TypeDescription result = TypeDescription.createStruct();
        for (int f = 0; f < type.getSubtypesCount(); ++f) {
          result.addField(type.getFieldNames(f),
              convertTypeFromProtobuf(types, type.getSubtypes(f)));
        }
        return result;
      }
      case UNION: {
        if (type.getSubtypesCount() == 0) {
          throw new FileFormatException("UNION type should contain at least" +
                  " one subtype but has none");
        }
        TypeDescription result = TypeDescription.createUnion();
        for (int f = 0; f < type.getSubtypesCount(); ++f) {
          result.addUnionChild(
              convertTypeFromProtobuf(types, type.getSubtypes(f)));
        }
        return result;
      }
    }
    throw new IllegalArgumentException("Unknown ORC type " + type.getKind());
  }

  /**
   * Wraps each protobuf stripe description in a
   * {@code ReaderImpl.StripeInformationImpl}, preserving order.
   * <p>
   * NOTE(review): the declared return type is left as the raw {@code List}
   * because its intended element type is not visible here; confirm it before
   * parameterizing the return type.
   *
   * @param stripes the protobuf stripe descriptions
   * @return a new list with one wrapper per entry of {@code stripes}
   */
  public static List convertProtoStripesToStripes(
      List<OrcProto.StripeInformation> stripes) {
    List<ReaderImpl.StripeInformationImpl> result =
        new ArrayList<>(stripes.size());
    for (OrcProto.StripeInformation info : stripes) {
      result.add(new ReaderImpl.StripeInformationImpl(info));
    }
    return result;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy