All Downloads are FREE. Search and download functionalities are using the official Maven repository.

cz.o2.proxima.direct.bulk.fs.parquet.ParquetUtils Maven / Gradle / Ivy

/*
 * Copyright 2017-2023 O2 Czech Republic, a.s.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package cz.o2.proxima.direct.bulk.fs.parquet;

import cz.o2.proxima.annotations.Internal;
import cz.o2.proxima.direct.bulk.fs.parquet.StreamElementWriteSupport.ArrayWriter;
import cz.o2.proxima.repository.AttributeDescriptor;
import cz.o2.proxima.repository.AttributeFamilyDescriptor;
import cz.o2.proxima.scheme.AttributeValueType;
import cz.o2.proxima.scheme.SchemaDescriptors.SchemaTypeDescriptor;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import lombok.extern.slf4j.Slf4j;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Types;
import org.apache.parquet.schema.Types.GroupBuilder;

/** Static helpers that translate proxima attribute and schema descriptors into parquet types. */
@Slf4j
@Internal
class ParquetUtils {

  /** Names of the fixed top level columns; attribute columns must not reuse any of them. */
  private static final List<String> RESERVED_COLUMN_NAMES =
      Arrays.asList(
          ParquetFileFormat.PARQUET_COLUMN_NAME_KEY,
          ParquetFileFormat.PARQUET_COLUMN_NAME_UUID,
          ParquetFileFormat.PARQUET_COLUMN_NAME_OPERATION,
          ParquetFileFormat.PARQUET_COLUMN_NAME_TIMESTAMP,
          ParquetFileFormat.PARQUET_COLUMN_NAME_ATTRIBUTE,
          ParquetFileFormat.PARQUET_COLUMN_NAME_ATTRIBUTE_PREFIX);

  private ParquetUtils() {
    // utility class, not meant to be instantiated
  }

  /**
   * Create parquet schema for given {@link AttributeFamilyDescriptor}.
   *
   * <p>Every attribute of the family is converted to one parquet type whose name is prefixed with
   * the value configured under {@link ParquetFileFormat#PARQUET_CONFIG_VALUES_PREFIX_KEY_NAME}
   * (falling back to {@link ParquetFileFormat#PARQUET_DEFAULT_VALUES_NAME_PREFIX}).
   *
   * @param family attribute family
   * @return parquet message schema
   * @throws IllegalArgumentException when a prefixed attribute name collides with a reserved
   *     column name
   */
  static MessageType createParquetSchema(AttributeFamilyDescriptor family) {
    // The prefix is identical for all attributes of the family, so resolve it only once.
    final String attributeNamesPrefix =
        family
            .getCfg()
            .getOrDefault(
                ParquetFileFormat.PARQUET_CONFIG_VALUES_PREFIX_KEY_NAME,
                ParquetFileFormat.PARQUET_DEFAULT_VALUES_NAME_PREFIX)
            .toString();

    final List<Type> attributeTypes =
        family
            .getAttributes()
            .stream()
            .map(attribute -> createParquetSchema(attribute, attributeNamesPrefix))
            .collect(Collectors.toList());

    return createMessageWithFields(attributeTypes);
  }

  /**
   * Convert {@link AttributeDescriptor} to parquet type.
   *
   * <p>The parquet type is named by the prefix followed by the attribute name; for wildcard
   * attributes the attribute prefix ({@code toAttributePrefix(false)}) is used instead of the
   * name.
   *
   * @param attribute attribute
   * @param attributeNamesPrefix prefix used for parquet type name
   * @param <T> type
   * @return parquet type
   * @throws IllegalArgumentException when the resulting name collides with a reserved column name
   */
  static <T> Type createParquetSchema(
      AttributeDescriptor<T> attribute, String attributeNamesPrefix) {
    final SchemaTypeDescriptor<?> schema = attribute.getSchemaTypeDescriptor();
    final String attributeName =
        attributeNamesPrefix
            + (attribute.isWildcard() ? attribute.toAttributePrefix(false) : attribute.getName());
    validateAttributeName(attributeName);

    if (!schema.isStructureType()) {
      return mapSchemaTypeToParquet(schema, attributeName);
    }

    // Structure attributes become an optional group holding one parquet type per field.
    return Types.optionalGroup()
        .named(attributeName)
        .withNewFields(
            schema
                .asStructureTypeDescriptor()
                .getFields()
                .entrySet()
                .stream()
                .map(field -> mapSchemaTypeToParquet(field.getValue(), field.getKey()))
                .collect(Collectors.toList()));
  }

  /**
   * Map proxima {@link SchemaTypeDescriptor} to the corresponding parquet type.
   *
   * <p>All produced types are optional. Arrays and structures are mapped recursively.
   *
   * @param descriptor schema descriptor
   * @param name name for parquet type
   * @param <T> descriptor type
   * @return parquet {@link Type}
   * @throws IllegalStateException when the descriptor type has no parquet mapping
   */
  static <T> Type mapSchemaTypeToParquet(SchemaTypeDescriptor<T> descriptor, String name) {
    switch (descriptor.getType()) {
      case INT:
        return Types.optional(PrimitiveTypeName.INT32).named(name);
      case LONG:
        return Types.optional(PrimitiveTypeName.INT64).named(name);
      case DOUBLE:
        return Types.optional(PrimitiveTypeName.DOUBLE).named(name);
      case FLOAT:
        return Types.optional(PrimitiveTypeName.FLOAT).named(name);
      case BOOLEAN:
        return Types.optional(PrimitiveTypeName.BOOLEAN).named(name);
      case STRING:
        return Types.optional(PrimitiveTypeName.BINARY)
            .as(LogicalTypeAnnotation.stringType())
            .named(name);
      case ENUM:
        return Types.optional(PrimitiveTypeName.BINARY)
            .as(LogicalTypeAnnotation.enumType())
            .named(name);
      case BYTE:
        return Types.optional(PrimitiveTypeName.BINARY).named(name);
      case ARRAY:
        {
          final SchemaTypeDescriptor<?> valueTypeDescriptor =
              descriptor.asArrayTypeDescriptor().getValueDescriptor();

          // proxima byte array should be encoded as binary
          if (valueTypeDescriptor.isPrimitiveType()
              && valueTypeDescriptor.getType() == AttributeValueType.BYTE) {
            return mapSchemaTypeToParquet(valueTypeDescriptor, name);
          }

          return Types.optionalList()
              .setElementType(
                  mapSchemaTypeToParquet(valueTypeDescriptor, ArrayWriter.ELEMENT_FIELD_NAME))
              .named(name);
        }
      case STRUCTURE:
        {
          final GroupBuilder<GroupType> structure = Types.optionalGroup();
          descriptor
              .asStructureTypeDescriptor()
              .getFields()
              .forEach(
                  (fieldName, fieldDescriptor) ->
                      structure.addField(mapSchemaTypeToParquet(fieldDescriptor, fieldName)));

          return structure.named(name);
        }
      default:
        throw new IllegalStateException("Unexpected value: " + descriptor.getType());
    }
  }

  /**
   * Create base {@link MessageType} for storing {@link cz.o2.proxima.storage.StreamElement} with
   * additional fields.
   *
   * <p>The message starts with the required key, uuid, timestamp, operation, attribute and
   * attribute prefix columns, followed by the given fields in their original order.
   *
   * @param fieldsToAdd additional fields
   * @return message type for {@link cz.o2.proxima.storage.StreamElement}.
   */
  static MessageType createMessageWithFields(List<Type> fieldsToAdd) {
    final List<Type> fields =
        new ArrayList<>(
            Arrays.asList(
                Types.required(PrimitiveTypeName.BINARY)
                    .as(LogicalTypeAnnotation.stringType())
                    .named(ParquetFileFormat.PARQUET_COLUMN_NAME_KEY),
                Types.required(PrimitiveTypeName.BINARY)
                    // @TODO when parquet 1.12 is released we should be able to use logicalType UUID
                    // More info: https://issues.apache.org/jira/browse/PARQUET-1827
                    .as(LogicalTypeAnnotation.stringType())
                    .named(ParquetFileFormat.PARQUET_COLUMN_NAME_UUID),
                Types.required(PrimitiveTypeName.INT64)
                    .as(LogicalTypeAnnotation.timestampType(true, TimeUnit.MILLIS))
                    .named(ParquetFileFormat.PARQUET_COLUMN_NAME_TIMESTAMP),
                Types.required(PrimitiveTypeName.BINARY)
                    .as(LogicalTypeAnnotation.enumType())
                    .named(ParquetFileFormat.PARQUET_COLUMN_NAME_OPERATION),
                Types.required(PrimitiveTypeName.BINARY)
                    .as(LogicalTypeAnnotation.stringType())
                    .named(ParquetFileFormat.PARQUET_COLUMN_NAME_ATTRIBUTE),
                Types.required(PrimitiveTypeName.BINARY)
                    .as(LogicalTypeAnnotation.stringType())
                    .named(ParquetFileFormat.PARQUET_COLUMN_NAME_ATTRIBUTE_PREFIX)));
    fields.addAll(fieldsToAdd);
    return new MessageType(ParquetFileFormat.PARQUET_MESSAGE_NAME, fields);
  }

  /**
   * Check name for possible collision with top level fields.
   *
   * @param attributeName attribute name
   * @throws IllegalArgumentException for reserved names
   */
  private static void validateAttributeName(String attributeName) {
    if (RESERVED_COLUMN_NAMES.contains(attributeName)) {
      throw new IllegalArgumentException(
          String.format(
              "Attribute [%s] collides with reserved keyword. Please specify %s.",
              attributeName, ParquetFileFormat.PARQUET_CONFIG_VALUES_PREFIX_KEY_NAME));
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy