All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.parquet.thrift.ThriftSchemaConvertVisitor Maven / Gradle / Ivy

There is a newer version: 1.15.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.thrift;

import static org.apache.parquet.schema.ConversionPatterns.listOfElements;
import static org.apache.parquet.schema.ConversionPatterns.listType;
import static org.apache.parquet.schema.ConversionPatterns.mapType;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
import static org.apache.parquet.schema.Type.Repetition.OPTIONAL;
import static org.apache.parquet.schema.Type.Repetition.REPEATED;
import static org.apache.parquet.schema.Type.Repetition.REQUIRED;
import static org.apache.parquet.schema.Types.primitive;

import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.ShouldNeverHappenException;
import org.apache.parquet.conf.HadoopParquetConfiguration;
import org.apache.parquet.conf.ParquetConfiguration;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Type.Repetition;
import org.apache.parquet.schema.Types.PrimitiveBuilder;
import org.apache.parquet.thrift.ConvertedField.Drop;
import org.apache.parquet.thrift.ConvertedField.Keep;
import org.apache.parquet.thrift.ConvertedField.SentinelUnion;
import org.apache.parquet.thrift.projection.FieldProjectionFilter;
import org.apache.parquet.thrift.projection.FieldsPath;
import org.apache.parquet.thrift.projection.ThriftProjectionException;
import org.apache.parquet.thrift.struct.ThriftField;
import org.apache.parquet.thrift.struct.ThriftType;
import org.apache.parquet.thrift.struct.ThriftType.BoolType;
import org.apache.parquet.thrift.struct.ThriftType.ByteType;
import org.apache.parquet.thrift.struct.ThriftType.DoubleType;
import org.apache.parquet.thrift.struct.ThriftType.EnumType;
import org.apache.parquet.thrift.struct.ThriftType.I16Type;
import org.apache.parquet.thrift.struct.ThriftType.I32Type;
import org.apache.parquet.thrift.struct.ThriftType.I64Type;
import org.apache.parquet.thrift.struct.ThriftType.ListType;
import org.apache.parquet.thrift.struct.ThriftType.MapType;
import org.apache.parquet.thrift.struct.ThriftType.SetType;
import org.apache.parquet.thrift.struct.ThriftType.StringType;
import org.apache.parquet.thrift.struct.ThriftType.StructType;
import org.apache.parquet.thrift.struct.ThriftType.StructType.StructOrUnionType;

/**
 * Visitor Class for converting a thrift definition to parquet message type.
 * Projection can be done by providing a {@link FieldProjectionFilter}
 *
 * <p>Every {@code visit} method returns a {@link ConvertedField}: a {@link Keep} carrying the
 * converted parquet type, a {@link Drop} when nothing below the visited field was selected,
 * or a {@link SentinelUnion} for a union of which only sentinel columns were retained.
 */
class ThriftSchemaConvertVisitor
    implements ThriftType.StateVisitor<ConvertedField, ThriftSchemaConvertVisitor.State> {
  private final FieldProjectionFilter fieldProjectionFilter;
  // when false, the "no partial projection" checks on map keys and set elements are skipped
  private final boolean doProjection;
  private final boolean keepOneOfEachUnion;
  // selects between listOfElements (true) and listType (false) when converting lists and sets
  private final boolean writeThreeLevelList;

  /**
   * Creates a visitor whose list layout is read from {@code configuration}.
   *
   * @param fieldProjectionFilter filter deciding which primitive columns are kept, must not be null
   * @param doProjection whether map keys and set elements are checked for partial projection
   * @param keepOneOfEachUnion whether an unselected union member is replaced by a sentinel column
   * @param configuration source of {@link ParquetWriteProtocol#WRITE_THREE_LEVEL_LISTS}; when null
   *     {@link ParquetWriteProtocol#WRITE_THREE_LEVEL_LISTS_DEFAULT} is used
   */
  private ThriftSchemaConvertVisitor(
      FieldProjectionFilter fieldProjectionFilter,
      boolean doProjection,
      boolean keepOneOfEachUnion,
      ParquetConfiguration configuration) {
    this(fieldProjectionFilter, doProjection, keepOneOfEachUnion, resolveWriteThreeLevelList(configuration));
  }

  /**
   * Creates a visitor with an already resolved list layout. Used for nested conversions so that
   * they share the layout of the visitor that spawned them.
   *
   * @param fieldProjectionFilter filter deciding which primitive columns are kept, must not be null
   * @param doProjection whether map keys and set elements are checked for partial projection
   * @param keepOneOfEachUnion whether an unselected union member is replaced by a sentinel column
   * @param writeThreeLevelList whether lists and sets are converted with {@code listOfElements}
   */
  private ThriftSchemaConvertVisitor(
      FieldProjectionFilter fieldProjectionFilter,
      boolean doProjection,
      boolean keepOneOfEachUnion,
      boolean writeThreeLevelList) {
    this.fieldProjectionFilter =
        Objects.requireNonNull(fieldProjectionFilter, "fieldProjectionFilter cannot be null");
    this.doProjection = doProjection;
    this.keepOneOfEachUnion = keepOneOfEachUnion;
    this.writeThreeLevelList = writeThreeLevelList;
  }

  /**
   * Reads the three-level-list flag from the configuration, falling back to the default
   * when no configuration is given.
   */
  private static boolean resolveWriteThreeLevelList(ParquetConfiguration configuration) {
    if (configuration == null) {
      return ParquetWriteProtocol.WRITE_THREE_LEVEL_LISTS_DEFAULT;
    }
    return configuration.getBoolean(
        ParquetWriteProtocol.WRITE_THREE_LEVEL_LISTS, ParquetWriteProtocol.WRITE_THREE_LEVEL_LISTS_DEFAULT);
  }

  /**
   * Creates a visitor for a nested re-conversion of part of the schema. It uses the given
   * filter and projection flag, and inherits this visitor's union handling and list layout
   * so that its output is structurally comparable with this visitor's output.
   */
  private ThriftSchemaConvertVisitor nestedVisitor(FieldProjectionFilter filter, boolean nestedDoProjection) {
    return new ThriftSchemaConvertVisitor(filter, nestedDoProjection, keepOneOfEachUnion, writeThreeLevelList);
  }

  /**
   * Converts a thrift struct to a parquet message type, keeping one column of each union
   * and using a newly created Hadoop {@link Configuration}.
   *
   * @param struct the thrift struct to convert
   * @param filter the projection filter to apply
   * @return the converted message type
   * @deprecated equivalent to calling
   *     {@link #convert(StructType, FieldProjectionFilter, boolean, Configuration)}
   *     with {@code true} and {@code new Configuration()}.
   */
  @Deprecated
  public static MessageType convert(StructType struct, FieldProjectionFilter filter) {
    return convert(struct, filter, true, new Configuration());
  }

  /**
   * Converts a thrift struct to a parquet message type, wrapping the Hadoop configuration
   * in a {@link HadoopParquetConfiguration}.
   *
   * @param struct the thrift struct to convert
   * @param filter the projection filter to apply
   * @param keepOneOfEachUnion whether an unselected union member is replaced by a sentinel column
   * @param conf the Hadoop configuration to read conversion settings from
   * @return the converted message type
   */
  public static MessageType convert(
      StructType struct, FieldProjectionFilter filter, boolean keepOneOfEachUnion, Configuration conf) {
    return convert(struct, filter, keepOneOfEachUnion, new HadoopParquetConfiguration(conf));
  }

  /**
   * Converts a thrift struct to a parquet message type named {@code "ParquetSchema"}.
   *
   * @param struct the thrift struct to convert
   * @param filter the projection filter to apply
   * @param keepOneOfEachUnion whether an unselected union member is replaced by a sentinel column
   * @param conf the configuration to read conversion settings from
   * @return the converted message type, made of the fields of the converted root group
   * @throws ThriftProjectionException if the conversion of the root struct is not a {@link Keep}
   */
  public static MessageType convert(
      StructType struct, FieldProjectionFilter filter, boolean keepOneOfEachUnion, ParquetConfiguration conf) {
    State state = new State(new FieldsPath(), REPEATED, "ParquetSchema");

    ConvertedField converted =
        struct.accept(new ThriftSchemaConvertVisitor(filter, true, keepOneOfEachUnion, conf), state);

    if (!converted.isKeep()) {
      throw new ThriftProjectionException("No columns have been selected");
    }

    return new MessageType(
        state.name, converted.asKeep().getType().asGroupType().getFields());
  }

  /**
   * @return the projection filter this visitor was created with
   * @deprecated this will be removed in 2.0.0.
   */
  @Deprecated
  public FieldProjectionFilter getFieldProjectionFilter() {
    return fieldProjectionFilter;
  }

  /**
   * Converts a thrift map. The key is converted as REQUIRED and the value as OPTIONAL.
   *
   * @return a {@link Drop} when neither key nor value is kept, otherwise a {@link Keep}
   *     holding the map type; when only the key is kept, the value is replaced by a
   *     conversion made with a {@link KeepOnlyFirstPrimitiveFilter}
   * @throws ThriftProjectionException if only the value is kept, or if projection is enabled
   *     and the kept key differs from the key converted with all columns
   */
  @Override
  public ConvertedField visit(MapType mapType, State state) {
    ThriftField keyField = mapType.getKey();
    ThriftField valueField = mapType.getValue();

    State keyState = new State(state.path.push(keyField), REQUIRED, "key");

    // TODO: This is a bug! this should be REQUIRED but changing this will
    // break the schema compatibility check against old data
    // Thrift does not support null / missing map values.
    State valueState = new State(state.path.push(valueField), OPTIONAL, "value");

    ConvertedField convertedKey = keyField.getType().accept(this, keyState);
    ConvertedField convertedValue = valueField.getType().accept(this, valueState);

    if (!convertedKey.isKeep()) {

      if (convertedValue.isKeep()) {
        throw new ThriftProjectionException(
            "Cannot select only the values of a map, you must keep the keys as well: " + state.path);
      }

      // neither key nor value was requested
      return new Drop(state.path);
    }

    // we are keeping the key, but we do not allow partial projections on keys
    // as that doesn't make sense when assembling back into a map.
    // NOTE: doProjections prevents us from infinite recursion here.
    if (doProjection) {
      // the nested visitor shares this visitor's list layout, so the two key
      // conversions can only differ because of projection
      ConvertedField fullConvKey =
          keyField.getType().accept(nestedVisitor(FieldProjectionFilter.ALL_COLUMNS, false), keyState);

      if (!fullConvKey.asKeep().getType().equals(convertedKey.asKeep().getType())) {
        throw new ThriftProjectionException(
            "Cannot select only a subset of the fields in a map key, for path " + state.path);
      }
    }

    // now, are we keeping the value?

    if (convertedValue.isKeep()) {
      // keep both key and value

      Type mapField = mapType(
          state.repetition,
          state.name,
          convertedKey.asKeep().getType(),
          convertedValue.asKeep().getType());

      return new Keep(state.path, mapField);
    }

    // keep only the key, not the value

    ConvertedField sentinelValue =
        valueField.getType().accept(nestedVisitor(new KeepOnlyFirstPrimitiveFilter(), true), valueState);

    Type mapField = mapType(
        state.repetition,
        state.name,
        convertedKey.asKeep().getType(),
        sentinelValue.asKeep().getType()); // signals to mapType method to project the value

    return new Keep(state.path, mapField);
  }

  /**
   * Shared conversion for thrift lists and sets.
   *
   * @param listLike the element field of the list or set
   * @param state the state of the list or set field itself
   * @param isSet whether the visited type is a set; sets do not allow partial projection
   * @return a {@link Keep} holding the list type when the element is kept, else a {@link Drop}
   * @throws ThriftProjectionException if this is a set, projection is enabled and the kept
   *     element differs from the element converted with all columns
   */
  private ConvertedField visitListLike(ThriftField listLike, State state, boolean isSet) {
    // the element keeps the path of the list itself; only its name and repetition
    // depend on the list layout being written
    State childState;
    if (writeThreeLevelList) {
      childState = new State(state.path, REQUIRED, "element");
    } else {
      childState = new State(state.path, REPEATED, state.name + "_tuple");
    }

    ConvertedField converted = listLike.getType().accept(this, childState);

    if (!converted.isKeep()) {
      return new Drop(state.path);
    }

    // doProjection prevents an infinite recursion here
    if (isSet && doProjection) {
      // the nested visitor shares this visitor's list layout, so the two element
      // conversions can only differ because of projection
      ConvertedField fullConv =
          listLike.getType().accept(nestedVisitor(FieldProjectionFilter.ALL_COLUMNS, false), childState);
      if (!converted.asKeep().getType().equals(fullConv.asKeep().getType())) {
        throw new ThriftProjectionException(
            "Cannot select only a subset of the fields in a set, for path " + state.path);
      }
    }

    Type elementType = converted.asKeep().getType();
    Type listField = writeThreeLevelList
        ? listOfElements(state.repetition, state.name, elementType)
        : listType(state.repetition, state.name, elementType);
    return new Keep(state.path, listField);
  }

  /** Converts a thrift set as a list that does not allow partial projection of its elements. */
  @Override
  public ConvertedField visit(SetType setType, State state) {
    return visitListLike(setType.getValues(), state, true);
  }

  /** Converts a thrift list. */
  @Override
  public ConvertedField visit(ListType listType, State state) {
    return visitListLike(listType.getValues(), state, false);
  }

  /**
   * Converts a thrift struct or union into a parquet group, one child at a time.
   *
   * @return a {@link Keep} when at least one child was kept by the filter, a
   *     {@link SentinelUnion} when only sentinel columns were collected, and a
   *     {@link Drop} when no child was collected at all
   */
  @Override
  public ConvertedField visit(StructType structType, State state) {

    // special care is taken when converting unions,
    // because we are actually both converting + projecting in
    // one pass, and unions need special handling when projecting.
    final boolean needsToKeepOneOfEachUnion = keepOneOfEachUnion && isUnion(structType.getStructOrUnionType());

    boolean hasSentinelUnionColumns = false;
    boolean hasNonSentinelUnionColumns = false;

    List<Type> convertedChildren = new ArrayList<>();

    for (ThriftField child : structType.getChildren()) {

      State childState = new State(state.path.push(child), getRepetition(child), child.getName());

      ConvertedField converted = child.getType().accept(this, childState);

      if (!converted.isKeep() && needsToKeepOneOfEachUnion) {
        // user is not keeping this "kind" of union, but we still need
        // to keep at least one of the primitives of this union around.
        // in order to know what "kind" of union each record is.
        // TODO: in the future, we should just filter these records out instead

        // re-do the recursion, with a new projection filter that keeps only
        // the first primitive it encounters
        ConvertedField firstPrimitive =
            child.getType().accept(nestedVisitor(new KeepOnlyFirstPrimitiveFilter(), true), childState);

        convertedChildren.add(firstPrimitive.asKeep().getType().withId(child.getFieldId()));
        hasSentinelUnionColumns = true;
      }

      if (converted.isSentinelUnion()) {
        // child field is a sentinel union that we should drop if possible
        if (childState.repetition == REQUIRED) {
          // but this field is required, so we may still need it
          convertedChildren.add(converted.asSentinelUnion().getType().withId(child.getFieldId()));
          hasSentinelUnionColumns = true;
        }
      } else if (converted.isKeep()) {
        // user has selected this column, so we keep it.
        convertedChildren.add(converted.asKeep().getType().withId(child.getFieldId()));
        hasNonSentinelUnionColumns = true;
      }
    }

    if (!hasNonSentinelUnionColumns && hasSentinelUnionColumns) {
      // this is a union, and user has not requested any of the children
      // of this union. We should drop this union, if possible, but
      // we may not be able to, so tag this as a sentinel.
      return new SentinelUnion(state.path, new GroupType(state.repetition, state.name, convertedChildren));
    }

    if (hasNonSentinelUnionColumns) {
      // user requested some of the fields of this struct, so we keep the struct
      return new Keep(state.path, new GroupType(state.repetition, state.name, convertedChildren));
    } else {
      // user requested none of the fields of this struct, so we drop it
      return new Drop(state.path);
    }
  }

  /** Converts a primitive column that carries no logical type annotation. */
  private ConvertedField visitPrimitiveType(PrimitiveTypeName type, State state) {
    return visitPrimitiveType(type, null, state);
  }

  /**
   * Converts a primitive column and asks the projection filter whether to keep it.
   *
   * @param type the parquet primitive type to produce
   * @param orig the logical type annotation to attach, or null for none
   * @param state the state of the visited field
   * @return a {@link Keep} holding the primitive type if the filter keeps {@code state.path},
   *     else a {@link Drop}
   */
  private ConvertedField visitPrimitiveType(PrimitiveTypeName type, LogicalTypeAnnotation orig, State state) {
    PrimitiveBuilder<PrimitiveType> b = primitive(type, state.repetition);

    if (orig != null) {
      b = b.as(orig);
    }

    if (fieldProjectionFilter.keep(state.path)) {
      return new Keep(state.path, b.named(state.name));
    } else {
      return new Drop(state.path);
    }
  }

  /** Converts a thrift enum to BINARY annotated as an enum. */
  @Override
  public ConvertedField visit(EnumType enumType, State state) {
    return visitPrimitiveType(BINARY, LogicalTypeAnnotation.enumType(), state);
  }

  /** Converts a thrift bool to BOOLEAN. */
  @Override
  public ConvertedField visit(BoolType boolType, State state) {
    return visitPrimitiveType(BOOLEAN, state);
  }

  /** Converts a thrift byte to INT32 annotated as a signed 8 bit integer. */
  @Override
  public ConvertedField visit(ByteType byteType, State state) {
    return visitPrimitiveType(INT32, LogicalTypeAnnotation.intType(8, true), state);
  }

  /** Converts a thrift double to DOUBLE. */
  @Override
  public ConvertedField visit(DoubleType doubleType, State state) {
    return visitPrimitiveType(DOUBLE, state);
  }

  /** Converts a thrift i16 to INT32 annotated as a signed 16 bit integer. */
  @Override
  public ConvertedField visit(I16Type i16Type, State state) {
    return visitPrimitiveType(INT32, LogicalTypeAnnotation.intType(16, true), state);
  }

  /** Converts a thrift i32 to INT32, carrying over its logical type annotation if it has one. */
  @Override
  public ConvertedField visit(I32Type i32Type, State state) {
    return i32Type.hasLogicalTypeAnnotation()
        ? visitPrimitiveType(INT32, i32Type.getLogicalTypeAnnotation(), state)
        : visitPrimitiveType(INT32, state);
  }

  /** Converts a thrift i64 to INT64, carrying over its logical type annotation if it has one. */
  @Override
  public ConvertedField visit(I64Type i64Type, State state) {
    return i64Type.hasLogicalTypeAnnotation()
        ? visitPrimitiveType(INT64, i64Type.getLogicalTypeAnnotation(), state)
        : visitPrimitiveType(INT64, state);
  }

  /**
   * Converts a thrift string to BINARY. A binary string carries over its own logical type
   * annotation if it has one; a non binary string is annotated as a string.
   */
  @Override
  public ConvertedField visit(StringType stringType, State state) {
    if (stringType.isBinary()) {
      return stringType.hasLogicalTypeAnnotation()
          ? visitPrimitiveType(BINARY, stringType.getLogicalTypeAnnotation(), state)
          : visitPrimitiveType(BINARY, state);
    } else {
      return visitPrimitiveType(BINARY, LogicalTypeAnnotation.stringType(), state);
    }
  }

  /** Converts a thrift uuid to FIXED_LEN_BYTE_ARRAY annotated as a UUID. */
  @Override
  public ConvertedField visit(ThriftType.UUIDType uuidType, State state) {
    return visitPrimitiveType(FIXED_LEN_BYTE_ARRAY, LogicalTypeAnnotation.uuidType(), state);
  }

  /**
   * @return true for UNION, false for STRUCT
   * @throws ShouldNeverHappenException for UNKNOWN or any other value
   */
  private static boolean isUnion(StructOrUnionType s) {
    switch (s) {
      case STRUCT:
        return false;
      case UNION:
        return true;
      case UNKNOWN:
        throw new ShouldNeverHappenException("Encountered UNKNOWN StructOrUnionType");
      default:
        throw new ShouldNeverHappenException("Unrecognized type: " + s);
    }
  }

  /**
   * Maps a thrift field requirement to a parquet repetition.
   *
   * @return REQUIRED for a REQUIRED field, OPTIONAL for an OPTIONAL or DEFAULT field
   * @throws IllegalArgumentException for any other requirement
   */
  private Type.Repetition getRepetition(ThriftField thriftField) {
    switch (thriftField.getRequirement()) {
      case REQUIRED:
        return REQUIRED;
      case OPTIONAL:
      case DEFAULT:
        // a field with the DEFAULT requirement is converted as OPTIONAL
        return OPTIONAL;
      default:
        throw new IllegalArgumentException("unknown requirement type: " + thriftField.getRequirement());
    }
  }

  /**
   * The state passed through the recursion as we traverse a thrift schema.
   */
  public static final class State {
    public final FieldsPath path; // current field path
    public final Type.Repetition repetition; // current repetition (no relation to parent's repetition)
    public final String name; // name of the field located at path

    /**
     * @param path the path of the field being visited
     * @param repetition the repetition to give the converted field
     * @param name the name to give the converted field
     */
    public State(FieldsPath path, Repetition repetition, String name) {
      this.path = path;
      this.repetition = repetition;
      this.name = name;
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy