/*
 * Copyright 2017-2023 O2 Czech Republic, a.s.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package cz.o2.proxima.direct.bulk.fs.parquet;

import static java.util.Optional.of;

import com.google.common.base.Preconditions;
import cz.o2.proxima.annotations.Internal;
import cz.o2.proxima.direct.bulk.fs.parquet.ParquetFileFormat.Operation;
import cz.o2.proxima.direct.bulk.fs.parquet.StreamElementMaterializer.ParquetColumnGroup.ParquetColumn;
import cz.o2.proxima.direct.bulk.fs.parquet.StreamElementWriteSupport.ArrayWriter;
import cz.o2.proxima.repository.AttributeDescriptor;
import cz.o2.proxima.repository.EntityDescriptor;
import cz.o2.proxima.scheme.AttributeValueAccessor;
import cz.o2.proxima.scheme.AttributeValueAccessors.StructureValue;
import cz.o2.proxima.scheme.SchemaDescriptors.SchemaTypeDescriptor;
import cz.o2.proxima.scheme.ValueSerializer;
import cz.o2.proxima.storage.StreamElement;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import javax.annotation.Nullable;
import lombok.Value;
import lombok.extern.slf4j.Slf4j;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.io.api.Converter;
import org.apache.parquet.io.api.GroupConverter;
import org.apache.parquet.io.api.PrimitiveConverter;
import org.apache.parquet.io.api.RecordMaterializer;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Type;

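// A minimal read-side sketch (not part of this file's API): it assumes a hypothetical
// ReadSupport subclass, "StreamElementReadSupport", whose prepareForRead() returns this
// materializer; the ParquetReader calls themselves are standard parquet-mr:
//
//   ParquetReader<StreamElement> reader =
//       ParquetReader.builder(new StreamElementReadSupport(entity, prefix), path).build();
//   StreamElement elem;
//   while ((elem = reader.read()) != null) {
//     process(elem); // process(...) stands in for application logic
//   }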
/** Class responsible for converting a parquet record into a {@link StreamElement}. */
@Slf4j
@Internal
public class StreamElementMaterializer extends RecordMaterializer<StreamElement> {

  private final ParquetColumnRecordConverter root;
  private final MessageType schema;
  private final EntityDescriptor entity;
  private final String attributeNamesPrefix;

  public StreamElementMaterializer(
      MessageType schema, EntityDescriptor entity, String attributeNamesPrefix) {
    this.schema = schema;
    this.entity = entity;
    this.attributeNamesPrefix = attributeNamesPrefix;
    this.root = new ParquetColumnRecordConverter(schema, null, null);
  }

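  /**
   * Assemble the current parquet record into a {@link StreamElement}. Records referencing an
   * attribute unknown to the entity yield {@code null} and are skipped.
   */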
  @Override
  public StreamElement getCurrentRecord() {

    Map<String, Object> record = recordToMap(schema, root.getCurrentRecord(), new HashMap<>());

    final String key = getColumnFromRow(ParquetFileFormat.PARQUET_COLUMN_NAME_KEY, record);
    final String operation =
        getColumnFromRow(ParquetFileFormat.PARQUET_COLUMN_NAME_OPERATION, record);
    final String attributeName =
        getColumnFromRow(ParquetFileFormat.PARQUET_COLUMN_NAME_ATTRIBUTE_PREFIX, record);
    Optional<AttributeDescriptor<Object>> attribute = entity.findAttribute(attributeName);
    if (!attribute.isPresent()) {
      // current attribute is not in entity -> skip
      log.info(
          "Skipping attribute [{}] which is not in current entity [{}].",
          attributeName,
          entity.getName());
      return null;
    }
    final String uuid = getColumnFromRow(ParquetFileFormat.PARQUET_COLUMN_NAME_UUID, record);
    final long timestamp =
        getColumnFromRow(ParquetFileFormat.PARQUET_COLUMN_NAME_TIMESTAMP, record);
    switch (Operation.of(operation)) {
      case DELETE:
        return StreamElement.delete(entity, attribute.get(), uuid, key, attributeName, timestamp);
      case DELETE_WILDCARD:
        return StreamElement.deleteWildcard(entity, attribute.get(), uuid, key, timestamp);
      case UPSERT:
        return StreamElement.upsert(
            entity,
            attribute.get(),
            uuid,
            key,
            attributeName,
            timestamp,
            getValueFromCurrentRowData(attribute.get(), record));
      default:
        throw new RecordMaterializationException("Unknown operation " + operation);
    }
  }

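  /**
   * Extract the serialized value bytes for the given attribute from the parsed row, delegating
   * conversion to the attribute's {@link ValueSerializer} and its value accessor.
   */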
  private byte[] getValueFromCurrentRowData(
      AttributeDescriptor<?> attribute, Map<String, Object> record) {
    final String storedAttributeName = attributeNamesPrefix + attribute.toAttributePrefix(false);
    @SuppressWarnings("unchecked")
    final ValueSerializer<Object> valueSerializer =
        (ValueSerializer<Object>) attribute.getValueSerializer();
    final AttributeValueAccessor<Object, Object> valueAccessor = valueSerializer.getValueAccessor();
    final SchemaTypeDescriptor<Object> attributeSchema = valueSerializer.getValueSchemaDescriptor();
    Object attributeValue;
    // Note: nothing is written to the parquet file when an empty element is written. In that
    // case creation of the empty value must be delegated to the serializer, which is why
    // record.getOrDefault is used below.
    if (attributeSchema.isStructureType()) {
      attributeValue = record.getOrDefault(storedAttributeName, new HashMap<>());
      @SuppressWarnings("unchecked")
      final StructureValue cast = StructureValue.of((Map<String, Object>) attributeValue);
      return valueSerializer.serialize(valueAccessor.createFrom(cast));
    } else {
      attributeValue = record.getOrDefault(storedAttributeName, new Object());
      return valueSerializer.serialize(valueAccessor.createFrom(attributeValue));
    }
  }

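  /** Read a required column from the parsed row, failing fast when it is missing. */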
  @SuppressWarnings("unchecked")
  private <V> V getColumnFromRow(String key, Map<String, Object> row) {
    return (V)
        Optional.ofNullable(row.get(key))
            .orElseThrow(
                () ->
                    new IllegalStateException("Unable to get required key [" + key + "] from row"));
  }

  @Override
  public GroupConverter getRootConverter() {
    return root;
  }

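  /**
   * Recursively flatten a converted {@link ParquetColumnGroup} into a map keyed by column name,
   * unwrapping LIST-annotated groups into plain java lists.
   */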
  private Map<String, Object> recordToMap(
      Type schema, ParquetColumnGroup record, Map<String, Object> current) {
    for (ParquetColumn value : record.getColumns()) {
      Type fieldType = schema.asGroupType().getType(value.getName());

      if (fieldType.isPrimitive()) {
        current.put(value.getName(), value.getValue());
      } else {
        LogicalTypeAnnotation typeAnnotation = fieldType.asGroupType().getLogicalTypeAnnotation();

        if (typeAnnotation != null && typeAnnotation.equals(LogicalTypeAnnotation.listType())) {
          final Type listType = fieldType.asGroupType().getType(ArrayWriter.LIST_FIELD_NAME);
          List<Object> listValues = new ArrayList<>();
          for (ParquetColumnGroup listRecord :
              ((ParquetColumnListGroup) value.getValue()).getListElements()) {
            Map<String, Object> v = recordToMap(listType, listRecord, new HashMap<>());
            listValues.add(v.get(ArrayWriter.ELEMENT_FIELD_NAME));
          }

          current.put(fieldType.getName(), listValues);

        } else {
          current.put(
              value.getName(),
              recordToMap(fieldType, (ParquetColumnGroup) value.getValue(), new HashMap<>()));
        }
      }
    }

    return current;
  }

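  /** Column group representing the contents of a repeated (LIST-annotated) field. */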
  public static class ParquetColumnListGroup extends ParquetColumnGroup {

    @Override
    List<ParquetColumnGroup> getListElements() {
      List<ParquetColumnGroup> elements = new ArrayList<>();
      for (ParquetColumn value : getColumns()) {
        if (value.getName().equals(ArrayWriter.LIST_FIELD_NAME)) {
          elements.add((ParquetColumnGroup) value.getValue());
        }
      }
      return elements;
    }
  }

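  /** Intermediate representation of a converted parquet group: a list of named columns. */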
  public static class ParquetColumnGroup {

    private final List<ParquetColumn> columns = new ArrayList<>();

    public void add(String name, Object value) {
      columns.add(new ParquetColumn(name, value));
    }

    List<ParquetColumnGroup> getListElements() {
      throw new UnsupportedOperationException("Unable to get list elements from a non-list object");
    }

    public List<ParquetColumn> getColumns() {
      return Collections.unmodifiableList(columns);
    }

    @Value
    public static class ParquetColumn {

      String name;
      Object value;
    }
  }

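  /** Record converter that materializes LIST-annotated groups as {@link ParquetColumnListGroup}. */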
  public static class ParquetColumnListRecordConverter extends ParquetColumnRecordConverter {

    public ParquetColumnListRecordConverter(
        GroupType schema, String name, ParquetColumnRecordConverter parent) {
      super(schema, name, parent);
    }

    @Override
    public void start() {
      record = new ParquetColumnListGroup();
    }
  }

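  /**
   * Group converter assembling nested parquet groups into {@link ParquetColumnGroup}s; on {@link
   * #end()} the finished group is attached to its parent's current record.
   */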
  public static class ParquetColumnRecordConverter extends GroupConverter {

    private final Converter[] converters;
    @Nullable private final String name;
    @Nullable private final ParquetColumnRecordConverter parent;

    protected ParquetColumnGroup record;

    public ParquetColumnRecordConverter(
        GroupType schema, String name, ParquetColumnRecordConverter parent) {
      this.converters = new Converter[schema.getFieldCount()];
      this.name = name;
      this.parent = parent;

      int i = 0;
      for (Type field : schema.getFields()) {
        converters[i++] = createConverter(field);
      }
    }

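    /**
     * Create a converter for the given field: string and enum primitives use {@link
     * StringConverter}, other primitives {@link NamedPrimitiveConverter}, LIST-annotated groups
     * {@link ParquetColumnListRecordConverter}, and any other group a nested {@link
     * ParquetColumnRecordConverter}.
     */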
    private Converter createConverter(Type field) {
      LogicalTypeAnnotation ltype = field.getLogicalTypeAnnotation();
      if (field.isPrimitive()) {
        if (ltype != null) {
          return ltype
              .accept(
                  new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<Converter>() {
                    @Override
                    public Optional<Converter> visit(
                        LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) {
                      return of(new StringConverter(field.getName()));
                    }

                    @Override
                    public Optional<Converter> visit(
                        LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) {
                      return of(new StringConverter(field.getName()));
                    }
                  })
              .orElse(new NamedPrimitiveConverter(field.getName()));
        }
        return new NamedPrimitiveConverter(field.getName());
      }

      GroupType groupType = field.asGroupType();
      if (ltype != null) {
        return ltype
            .accept(
                new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<Converter>() {
                  @Override
                  public Optional<Converter> visit(
                      LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) {
                    return of(
                        new ParquetColumnListRecordConverter(
                            groupType, field.getName(), ParquetColumnRecordConverter.this));
                  }
                })
            .orElse(new ParquetColumnRecordConverter(groupType, field.getName(), this));
      }
      return new ParquetColumnRecordConverter(groupType, field.getName(), this);
    }

    @Override
    public Converter getConverter(int fieldIndex) {
      return converters[fieldIndex];
    }

    public ParquetColumnGroup getCurrentRecord() {
      Preconditions.checkState(record != null, "getCurrentRecord() must be called after start().");
      return record;
    }

    @Override
    public void start() {
      record = new ParquetColumnGroup();
    }

    @Override
    public void end() {
      if (parent != null) {
        parent.getCurrentRecord().add(name, record);
      }
    }

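    /**
     * Primitive converter that stores each parsed value into the enclosing record under a fixed
     * column name.
     */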
    private class NamedPrimitiveConverter extends PrimitiveConverter {

      protected final String name;

      public NamedPrimitiveConverter(String name) {
        this.name = name;
      }

      @Override
      public void addBinary(Binary value) {
        record.add(name, value.getBytes());
      }

      @Override
      public void addBoolean(boolean value) {
        record.add(name, value);
      }

      @Override
      public void addDouble(double value) {
        record.add(name, value);
      }

      @Override
      public void addFloat(float value) {
        record.add(name, value);
      }

      @Override
      public void addInt(int value) {
        record.add(name, value);
      }

      @Override
      public void addLong(long value) {
        record.add(name, value);
      }
    }

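    /** Converter materializing UTF-8 {@link Binary} values as java {@link String}s. */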
    private class StringConverter extends NamedPrimitiveConverter {

      public StringConverter(String name) {
        super(name);
      }

      @Override
      public void addBinary(Binary value) {
        record.add(name, value.toStringUsingUTF8());
      }
    }
  }
}