/*
 * Copyright 2017-2023 O2 Czech Republic, a.s.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package cz.o2.proxima.direct.bulk.fs.parquet;

import static cz.o2.proxima.direct.bulk.fs.parquet.ParquetFileFormat.PARQUET_COLUMN_NAME_ATTRIBUTE;
import static cz.o2.proxima.direct.bulk.fs.parquet.ParquetFileFormat.PARQUET_COLUMN_NAME_ATTRIBUTE_PREFIX;
import static cz.o2.proxima.direct.bulk.fs.parquet.ParquetFileFormat.PARQUET_COLUMN_NAME_KEY;
import static cz.o2.proxima.direct.bulk.fs.parquet.ParquetFileFormat.PARQUET_COLUMN_NAME_OPERATION;
import static cz.o2.proxima.direct.bulk.fs.parquet.ParquetFileFormat.PARQUET_COLUMN_NAME_TIMESTAMP;
import static cz.o2.proxima.direct.bulk.fs.parquet.ParquetFileFormat.PARQUET_COLUMN_NAME_UUID;

import com.google.common.base.Preconditions;
import cz.o2.proxima.annotations.Internal;
import cz.o2.proxima.direct.bulk.fs.parquet.ParquetFileFormat.Operation;
import cz.o2.proxima.repository.AttributeDescriptor;
import cz.o2.proxima.scheme.AttributeValueAccessor;
import cz.o2.proxima.scheme.AttributeValueType;
import cz.o2.proxima.scheme.SchemaDescriptors;
import cz.o2.proxima.scheme.SchemaDescriptors.SchemaTypeDescriptor;
import cz.o2.proxima.scheme.SchemaDescriptors.StructureTypeDescriptor;
import cz.o2.proxima.storage.StreamElement;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.hadoop.api.WriteSupport;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.io.api.RecordConsumer;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;

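/**
 * {@link WriteSupport} for writing {@link StreamElement StreamElements} into a parquet file
 * described by the given {@link MessageType} schema.
 */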
@Slf4j
@Internal
public class StreamElementWriteSupport extends WriteSupport<StreamElement> {

  private final MessageType parquetSchema;
  private final String attributeNamesPrefix;
  private final StreamElementWriter streamElementWriter;
  RecordConsumer recordConsumer;

  public StreamElementWriteSupport(MessageType parquetSchema, String attributeNamesPrefix) {
    this.parquetSchema = parquetSchema;
    this.attributeNamesPrefix = attributeNamesPrefix;
    this.streamElementWriter = new StreamElementWriter(parquetSchema);
  }

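  // Expose the configured attribute-name prefix in the parquet footer metadata; readers
  // presumably use it to map column names back to attribute names.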
  @Override
  public WriteContext init(Configuration configuration) {
    final Map<String, String> extraMetadata =
        Collections.singletonMap(
            ParquetFileFormat.PARQUET_CONFIG_VALUES_PREFIX_KEY_NAME, attributeNamesPrefix);
    return new WriteContext(parquetSchema, extraMetadata);
  }

  @Override
  public void prepareForWrite(RecordConsumer recordConsumer) {
    this.recordConsumer = recordConsumer;
  }

  @Override
  public void write(StreamElement record) {
    Preconditions.checkNotNull(this.recordConsumer, "RecordConsumer can not be null.");
    streamElementWriter.write(record);
  }

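  /**
   * Create a {@link Writer} for an attribute with the given schema descriptor. Writers for
   * complex types (arrays and structures) are composed recursively from the writers of their
   * element and field types.
   */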
  @SuppressWarnings("unchecked")
  <T> Writer<T> createWriter(
      SchemaTypeDescriptor<T> schema, String attributeName, GroupType parquetSchema) {
    Writer<T> writer;
    switch (schema.getType()) {
      case ARRAY:
        if (schema.asArrayTypeDescriptor().getValueType().equals(AttributeValueType.BYTE)) {
          // A byte array is written as a single binary value, not as a repeated group.
          writer =
              (Writer<T>)
                  createWriter(
                      schema.asArrayTypeDescriptor().getValueDescriptor(),
                      attributeName,
                      parquetSchema);
        } else {
          writer =
              (Writer<T>)
                  new ArrayWriter<>(
                      createWriter(
                          schema.asArrayTypeDescriptor().getValueDescriptor(),
                          ArrayWriter.ELEMENT_FIELD_NAME,
                          parquetSchema
                              .getType(attributeName)
                              .asGroupType()
                              .getType(ArrayWriter.LIST_FIELD_NAME)
                              .asGroupType()));
        }
        break;
      case STRUCTURE:
        writer =
            (Writer<T>)
                new StructureWriter(
                    schema.asStructureTypeDescriptor(),
                    parquetSchema.getType(attributeName).asGroupType());
        break;
      case INT:
        writer = (Writer<T>) new IntWriter();
        break;
      case LONG:
        writer = (Writer<T>) new LongWriter();
        break;
      case BOOLEAN:
        writer = (Writer<T>) new BooleanWriter();
        break;
      case ENUM:
      case STRING:
        writer = (Writer<T>) new StringWriter();
        break;
      case FLOAT:
        writer = (Writer<T>) new FloatWriter();
        break;
      case DOUBLE:
        writer = (Writer<T>) new DoubleWriter();
        break;
      case BYTE:
        writer = (Writer<T>) new BytesWriter();
        break;
      default:
        throw new UnsupportedOperationException("Unknown type " + schema.getType());
    }
    writer.setName(attributeName);
    writer.setIndex(parquetSchema.getFieldIndex(attributeName));
    return writer;
  }

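  /** Writer of a single parquet field holding a value of type {@code T}. */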
  interface Writer<T> {

    void setName(String name);

    void setIndex(int index);

    default void writeRawValue(T value) {
      throw new UnsupportedOperationException("Method writeRawValue should be overridden.");
    }

    void write(T value);
  }

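  /**
   * Top-level writer that materializes one {@link StreamElement} as one parquet record,
   * delegating each column to its field {@link Writer}.
   */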
  class StreamElementWriter extends GenericFieldWriter<StreamElement> {

    final Map<String, Writer<?>> writers = new HashMap<>();

    StreamElementWriter(GroupType schema) {
      final SchemaTypeDescriptor<String> stringType = SchemaDescriptors.strings();
      final SchemaTypeDescriptor<Long> longType = SchemaDescriptors.longs();
      final SchemaTypeDescriptor<String> operationType =
          SchemaDescriptors.enums(
              Arrays.stream(Operation.values())
                  .map(Operation::getValue)
                  .collect(Collectors.toList()));
      writers.put(
          PARQUET_COLUMN_NAME_KEY, createWriter(stringType, PARQUET_COLUMN_NAME_KEY, schema));
      writers.put(
          PARQUET_COLUMN_NAME_UUID, createWriter(stringType, PARQUET_COLUMN_NAME_UUID, schema));
      writers.put(
          PARQUET_COLUMN_NAME_TIMESTAMP,
          createWriter(longType, PARQUET_COLUMN_NAME_TIMESTAMP, schema));
      writers.put(
          PARQUET_COLUMN_NAME_OPERATION,
          createWriter(operationType, PARQUET_COLUMN_NAME_OPERATION, schema));
      writers.put(
          PARQUET_COLUMN_NAME_ATTRIBUTE,
          createWriter(stringType, PARQUET_COLUMN_NAME_ATTRIBUTE, schema));
      writers.put(
          PARQUET_COLUMN_NAME_ATTRIBUTE_PREFIX,
          createWriter(stringType, PARQUET_COLUMN_NAME_ATTRIBUTE_PREFIX, schema));
    }

    @Override
    public void write(StreamElement element) {

      final AttributeDescriptor<?> attributeDescriptor = element.getAttributeDescriptor();
      final String attributePrefix =
          attributeNamesPrefix
              + ((attributeDescriptor.isWildcard())
                  ? attributeDescriptor.toAttributePrefix(true) + "*"
                  : attributeDescriptor.toAttributePrefix());

      final Map<String, Object> row = new HashMap<>();
      row.put(PARQUET_COLUMN_NAME_KEY, element.getKey());
      row.put(PARQUET_COLUMN_NAME_UUID, element.getUuid());
      row.put(PARQUET_COLUMN_NAME_TIMESTAMP, element.getStamp());
      row.put(PARQUET_COLUMN_NAME_OPERATION, Operation.fromElement(element).getValue());
      row.put(PARQUET_COLUMN_NAME_ATTRIBUTE, element.getAttribute());
      row.put(PARQUET_COLUMN_NAME_ATTRIBUTE_PREFIX, attributePrefix);

      @SuppressWarnings("unchecked")
      final SchemaTypeDescriptor<Object> attributeSchema =
          (SchemaTypeDescriptor<Object>) element.getAttributeDescriptor().getSchemaTypeDescriptor();
      @SuppressWarnings("unchecked")
      AttributeValueAccessor<Object, Object> valueAccessor =
          (AttributeValueAccessor<Object, Object>)
              element.getAttributeDescriptor().getValueSerializer().getValueAccessor();

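      // Each StreamElement is written as a single parquet record.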
      recordConsumer.startMessage();

      Optional<Object> elementValue = element.getParsed();
      if (elementValue.isPresent()) {
        Object value = valueAccessor.valueOf(elementValue.get());
        final String storedAttributeName =
            attributeNamesPrefix + attributeDescriptor.toAttributePrefix(false);
        row.put(storedAttributeName, value);
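        // Value-column writers are created lazily, on the first element carrying
        // the given attribute.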
        writers.computeIfAbsent(
            storedAttributeName,
            name -> createWriter(attributeSchema, storedAttributeName, parquetSchema));
      }
      row.forEach(
          (name, value) -> {
            @SuppressWarnings("unchecked")
            Writer<Object> writer = (Writer<Object>) writers.get(name);
            writer.write(value);
          });
      recordConsumer.endMessage();
    }
  }

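  /** Base {@link Writer} that brackets the raw value with startField / endField calls. */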
  abstract class GenericFieldWriter<T> implements Writer<T> {

    @Setter String name;
    @Setter int index = -1;

    public void write(T value) {
      Preconditions.checkState(index >= 0, "Index can not be negative.");
      Preconditions.checkState(!name.isEmpty(), "Attribute name can not be empty.");
      recordConsumer.startField(name, index);
      writeRawValue(value);
      recordConsumer.endField(name, index);
    }
  }

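  /** Writer for structured (message-like) values represented as a map of field name to value. */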
  class StructureWriter extends GenericFieldWriter<Map<String, Object>> {

    final Map<String, Writer<Object>> fieldWriters = new HashMap<>();

    StructureWriter(StructureTypeDescriptor<?> descriptor, GroupType schema) {
      descriptor
          .getFields()
          .forEach(
              (name, type) -> {
                Writer<?> writer = createWriter(descriptor.getField(name), name, schema);
                @SuppressWarnings("unchecked")
                Writer<Object> cast = (Writer<Object>) writer;
                fieldWriters.put(name, cast);
              });
    }

    @Override
    public void writeRawValue(Map<String, Object> value) {
      recordConsumer.startGroup();
      fieldWriters.forEach(
          (name, writer) -> {
            if (value.containsKey(name)) {
              writer.write(value.get(name));
            }
          });
      recordConsumer.endGroup();
    }

    @Override
    public void write(Map<String, Object> value) {
      if (!value.isEmpty()) {
        recordConsumer.startField(name, index);
        writeRawValue(value);
        recordConsumer.endField(name, index);
      }
    }
  }

  class StringWriter extends GenericFieldWriter<String> {

    @Override
    public void writeRawValue(String value) {
      recordConsumer.addBinary(Binary.fromString(value));
    }
  }

  class IntWriter extends GenericFieldWriter<Integer> {

    @Override
    public void writeRawValue(Integer value) {
      recordConsumer.addInteger(value);
    }
  }

  class LongWriter extends GenericFieldWriter<Long> {

    @Override
    public void writeRawValue(Long value) {
      recordConsumer.addLong(value);
    }
  }

  class BooleanWriter extends GenericFieldWriter<Boolean> {

    @Override
    public void writeRawValue(Boolean value) {
      recordConsumer.addBoolean(value);
    }
  }

  class FloatWriter extends GenericFieldWriter<Float> {

    @Override
    public void writeRawValue(Float value) {
      recordConsumer.addFloat(value);
    }
  }

  class DoubleWriter extends GenericFieldWriter<Double> {

    @Override
    public void writeRawValue(Double value) {
      recordConsumer.addDouble(value);
    }
  }

  class BytesWriter extends GenericFieldWriter<byte[]> {

    @Override
    public void writeRawValue(byte[] value) {
      recordConsumer.addBinary(Binary.fromReusedByteArray(value));
    }
  }

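  /**
   * Writer for repeated values using the three-level parquet LIST layout: the field is a group
   * containing a repeated {@code list} group, which in turn holds the {@code element} field.
   */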
  class ArrayWriter<T> extends GenericFieldWriter<List<T>> {
    static final String ELEMENT_FIELD_NAME = "element";
    static final String LIST_FIELD_NAME = "list";

    final Writer<T> fieldWriter;

    ArrayWriter(Writer<T> fieldWriter) {
      this.fieldWriter = fieldWriter;
    }

    @Override
    public void write(List<T> value) {
      if (!value.isEmpty()) {
        recordConsumer.startField(name, index);
        recordConsumer.startGroup();
        recordConsumer.startField(
            LIST_FIELD_NAME, 0); // This is the wrapper group for the array field
        for (T val : value) {
          recordConsumer.startGroup();
          recordConsumer.startField(ELEMENT_FIELD_NAME, 0); // This is the mandatory inner field

          fieldWriter.writeRawValue(val);

          recordConsumer.endField(ELEMENT_FIELD_NAME, 0);
          recordConsumer.endGroup();
        }

        recordConsumer.endField(LIST_FIELD_NAME, 0);
        recordConsumer.endGroup();
        recordConsumer.endField(name, index);
      }
    }
  }
}
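
/*
 * Usage sketch (illustrative only; StreamElementParquetWriterBuilder is a hypothetical name,
 * not part of this module): a parquet WriteSupport is typically wired into a ParquetWriter
 * through a custom ParquetWriter.Builder whose getWriteSupport(Configuration) returns an
 * instance of this class; ParquetWriter#write(StreamElement) then lands in write() above.
 *
 *   class StreamElementParquetWriterBuilder
 *       extends ParquetWriter.Builder<StreamElement, StreamElementParquetWriterBuilder> {
 *
 *     private final MessageType schema;
 *     private final String attributeNamesPrefix;
 *
 *     StreamElementParquetWriterBuilder(
 *         OutputFile file, MessageType schema, String attributeNamesPrefix) {
 *       super(file);
 *       this.schema = schema;
 *       this.attributeNamesPrefix = attributeNamesPrefix;
 *     }
 *
 *     @Override
 *     protected StreamElementParquetWriterBuilder self() {
 *       return this;
 *     }
 *
 *     @Override
 *     protected WriteSupport<StreamElement> getWriteSupport(Configuration conf) {
 *       return new StreamElementWriteSupport(schema, attributeNamesPrefix);
 *     }
 *   }
 */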