All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.parquet.thrift.ThriftSchemaConverter Maven / Gradle / Ivy

There is a newer version: 1.14.2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.thrift;

import static org.apache.parquet.schema.Type.Repetition.REPEATED;

import com.twitter.elephantbird.thrift.TStructDescriptor;
import com.twitter.elephantbird.thrift.TStructDescriptor.Field;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.conf.HadoopParquetConfiguration;
import org.apache.parquet.conf.ParquetConfiguration;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Type;
import org.apache.parquet.thrift.projection.FieldProjectionFilter;
import org.apache.parquet.thrift.struct.ThriftField;
import org.apache.parquet.thrift.struct.ThriftField.Requirement;
import org.apache.parquet.thrift.struct.ThriftType;
import org.apache.parquet.thrift.struct.ThriftType.BoolType;
import org.apache.parquet.thrift.struct.ThriftType.ByteType;
import org.apache.parquet.thrift.struct.ThriftType.DoubleType;
import org.apache.parquet.thrift.struct.ThriftType.EnumType;
import org.apache.parquet.thrift.struct.ThriftType.EnumValue;
import org.apache.parquet.thrift.struct.ThriftType.I16Type;
import org.apache.parquet.thrift.struct.ThriftType.I32Type;
import org.apache.parquet.thrift.struct.ThriftType.I64Type;
import org.apache.parquet.thrift.struct.ThriftType.StringType;
import org.apache.parquet.thrift.struct.ThriftType.StructType;
import org.apache.parquet.thrift.struct.ThriftType.StructType.StructOrUnionType;
import org.apache.parquet.thrift.struct.ThriftTypeID;
import org.apache.thrift.TBase;
import org.apache.thrift.TEnum;
import org.apache.thrift.TUnion;
import org.apache.thrift.meta_data.FieldMetaData;

/**
 * Given a thrift class, this class converts it to parquet schema,
 * a {@link FieldProjectionFilter} can be specified for projection pushdown.
 */
public class ThriftSchemaConverter {
  private final FieldProjectionFilter fieldProjectionFilter;

  private ParquetConfiguration conf;

  public ThriftSchemaConverter() {
    this(FieldProjectionFilter.ALL_COLUMNS);
  }

  public ThriftSchemaConverter(Configuration configuration) {
    this(new HadoopParquetConfiguration(configuration));
  }

  public ThriftSchemaConverter(ParquetConfiguration configuration) {
    this();
    conf = configuration;
  }

  public ThriftSchemaConverter(Configuration configuration, FieldProjectionFilter fieldProjectionFilter) {
    this(new HadoopParquetConfiguration(configuration), fieldProjectionFilter);
  }

  public ThriftSchemaConverter(ParquetConfiguration configuration, FieldProjectionFilter fieldProjectionFilter) {
    this(fieldProjectionFilter);
    conf = configuration;
  }

  public ThriftSchemaConverter(FieldProjectionFilter fieldProjectionFilter) {
    this.fieldProjectionFilter = fieldProjectionFilter;
  }

  public MessageType convert(Class> thriftClass) {
    return convert(toStructType(thriftClass));
  }

  /**
   * struct is assumed to contain valid structOrUnionType metadata when used with this method.
   * This method may throw if structOrUnionType is unknown.
   * 

* Use convertWithoutProjection below to convert a StructType to MessageType * * @param struct the thrift type descriptor * @return the struct as a Parquet message type */ public MessageType convert(StructType struct) { MessageType messageType = ThriftSchemaConvertVisitor.convert(struct, fieldProjectionFilter, true, conf); fieldProjectionFilter.assertNoUnmatchedPatterns(); return messageType; } /** * struct is not required to have known structOrUnionType, which is useful * for converting a StructType from an (older) file schema to a MessageType * * @param struct the thrift type descriptor * @return the struct as a Parquet message type */ public static MessageType convertWithoutProjection(StructType struct) { return ThriftSchemaConvertVisitor.convert( struct, FieldProjectionFilter.ALL_COLUMNS, false, new Configuration()); } public static > StructOrUnionType structOrUnionType(Class klass) { return TUnion.class.isAssignableFrom(klass) ? StructOrUnionType.UNION : StructOrUnionType.STRUCT; } public static ThriftType.StructType toStructType(Class> thriftClass) { final TStructDescriptor struct = TStructDescriptor.getInstance(thriftClass); return toStructType(struct); } private static StructType toStructType(TStructDescriptor struct) { List fields = struct.getFields(); List children = new ArrayList(fields.size()); for (Field field : fields) { Requirement req = field.getFieldMetaData() == null ? Requirement.OPTIONAL : Requirement.fromType(field.getFieldMetaData().requirementType); children.add(toThriftField(field.getName(), field, req)); } return new StructType(children, structOrUnionType(struct.getThriftClass())); } /** * Returns whether the given type is the element type of a list or is a * synthetic group with one field that is the element type. This is * determined by checking whether the type can be a synthetic group and by * checking whether a potential synthetic group matches the expected * ThriftField. *

* This method never guesses because the expected ThriftField is known. * * @param repeatedType a type that may be the element type * @param thriftElement the expected Schema for list elements * @return {@code true} if the repeatedType is the element schema */ static boolean isListElementType(Type repeatedType, ThriftField thriftElement) { if (repeatedType.isPrimitive() || (repeatedType.asGroupType().getFieldCount() != 1) || (repeatedType.asGroupType().getType(0).isRepetition(REPEATED))) { // The repeated type must be the element type because it is an invalid // synthetic wrapper. Must be a group with one optional or required field return true; } else if (thriftElement != null && thriftElement.getType() instanceof StructType) { Set fieldNames = new HashSet(); for (ThriftField field : ((StructType) thriftElement.getType()).getChildren()) { fieldNames.add(field.getName()); } // If the repeated type is a subset of the structure of the ThriftField, // then it must be the element type. return fieldNames.contains(repeatedType.asGroupType().getFieldName(0)); } return false; } private static ThriftField toThriftField(String name, Field field, ThriftField.Requirement requirement) { ThriftType type; switch (ThriftTypeID.fromByte(field.getType())) { case STOP: case VOID: default: throw new UnsupportedOperationException("can't convert type of " + field); case BOOL: type = new BoolType(); break; case BYTE: type = new ByteType(); break; case DOUBLE: type = new DoubleType(); break; case I16: type = new I16Type(); break; case I32: type = new I32Type(); break; case I64: type = new I64Type(); break; case STRING: StringType stringType = new StringType(); FieldMetaData fieldMetaData = field.getFieldMetaData(); // There is no real binary type (see THRIFT-1920) in Thrift, // binary data is represented by String type with an additional binary flag. if (fieldMetaData != null && fieldMetaData.valueMetaData.isBinary()) { stringType.setBinary(true); } type = stringType; break; case STRUCT: type = toStructType(field.gettStructDescriptor()); break; case MAP: final Field mapKeyField = field.getMapKeyField(); final Field mapValueField = field.getMapValueField(); type = new ThriftType.MapType( toThriftField(mapKeyField.getName(), mapKeyField, requirement), toThriftField(mapValueField.getName(), mapValueField, requirement)); break; case SET: final Field setElemField = field.getSetElemField(); type = new ThriftType.SetType(toThriftField(setElemField.getName(), setElemField, requirement)); break; case LIST: final Field listElemField = field.getListElemField(); type = new ThriftType.ListType(toThriftField(listElemField.getName(), listElemField, requirement)); break; case UUID: case ENUM: if (field.isEnum()) { Collection enumValues = field.getEnumValues(); List values = new ArrayList<>(); for (TEnum tEnum : enumValues) { values.add(new EnumValue(tEnum.getValue(), tEnum.toString())); } type = new EnumType(values); } else { type = new ThriftType.UUIDType(); } } return new ThriftField(name, field.getId(), requirement, type); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy