All downloads are free. Search and download functionality uses the official Maven repository.

cz.o2.proxima.direct.bulk.fs.parquet.ProximaParquetReader Maven / Gradle / Ivy

/*
 * Copyright 2017-2023 O2 Czech Republic, a.s.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package cz.o2.proxima.direct.bulk.fs.parquet;

import com.google.common.base.Preconditions;
import com.google.common.collect.AbstractIterator;
import cz.o2.proxima.direct.bulk.Path;
import cz.o2.proxima.direct.bulk.Reader;
import cz.o2.proxima.repository.EntityDescriptor;
import cz.o2.proxima.storage.StreamElement;
import cz.o2.proxima.util.ExceptionUtils;
import java.io.IOException;
import java.nio.channels.Channels;
import java.nio.channels.SeekableByteChannel;
import java.util.Iterator;
import java.util.Map;
import lombok.extern.slf4j.Slf4j;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.api.InitContext;
import org.apache.parquet.hadoop.api.ReadSupport;
import org.apache.parquet.io.DelegatingSeekableInputStream;
import org.apache.parquet.io.InputFile;
import org.apache.parquet.io.SeekableInputStream;
import org.apache.parquet.io.api.RecordMaterializer;
import org.apache.parquet.schema.MessageType;

/**
 * {@link Reader} that reads {@link StreamElement}s from a single parquet file.
 *
 * <p>The file content is accessed through the {@link SeekableByteChannel} obtained from the given
 * {@link Path}. Every iterator returned by {@link #iterator()} pulls from the same underlying
 * {@link ParquetReader}, so elements consumed through one iterator are not seen by another.
 */
@Slf4j
public class ProximaParquetReader implements Reader {

  private final Path path;
  private final ParquetReader<StreamElement> reader;

  /**
   * Open a reader over the file at the given path.
   *
   * @param path path to read; {@code path.read()} must return a {@link SeekableByteChannel}
   * @param entity descriptor of the entity handed over to the element materializer
   * @throws IOException when the underlying parquet reader cannot be built
   */
  public ProximaParquetReader(Path path, EntityDescriptor entity) throws IOException {
    // Parquet needs random access to the file, hence the channel has to be seekable.
    final SeekableByteChannel channel = (SeekableByteChannel) path.read();
    this.reader = openReader(channel, entity);
    this.path = path;
  }

  /**
   * Build the parquet reader on top of the given channel, closing the channel when the build fails
   * so that the already opened resource does not leak.
   */
  private static ParquetReader<StreamElement> openReader(
      SeekableByteChannel channel, EntityDescriptor entity) throws IOException {
    try {
      final Configuration configuration = new Configuration(false);
      // Push-down filter for attributes cannot be used currently, see
      // https://github.com/O2-Czech-Republic/proxima-platform/issues/196 for details.
      return new ParquetReadBuilder(new BulkInputFile(channel), entity)
          .withConf(configuration)
          .build();
    } catch (IOException | RuntimeException ex) {
      try {
        // Closing an already closed channel has no effect, so this is safe even if the
        // failed build managed to close it on its own.
        channel.close();
      } catch (IOException closeEx) {
        ex.addSuppressed(closeEx);
      }
      throw ex;
    }
  }

  /** Close the underlying parquet reader, rethrowing any failure as an unchecked exception. */
  @Override
  public void close() {
    ExceptionUtils.unchecked(reader::close);
  }

  /** @return the path this reader was created for */
  @Override
  public Path getPath() {
    return path;
  }

  /**
   * Create an iterator over the elements remaining in the file.
   *
   * @return iterator that ends once the parquet reader returns {@code null}
   * @throws IllegalStateException (from the iterator) when reading the next element fails with
   *     {@link IOException}
   */
  @Override
  public Iterator<StreamElement> iterator() {
    return new AbstractIterator<StreamElement>() {
      @Override
      protected StreamElement computeNext() {
        try {
          final StreamElement element = reader.read();
          // null signals that the file has been fully consumed
          return element == null ? endOfData() : element;
        } catch (IOException e) {
          throw new IllegalStateException("Unable to compute next element.", e);
        }
      }
    };
  }

  /** Builder of {@link ParquetReader} producing {@link StreamElement}s of the given entity. */
  private static class ParquetReadBuilder extends ParquetReader.Builder<StreamElement> {

    private final EntityDescriptor entity;

    ParquetReadBuilder(InputFile file, EntityDescriptor entity) {
      super(file);
      this.entity = entity;
    }

    @Override
    protected ReadSupport<StreamElement> getReadSupport() {
      Preconditions.checkNotNull(entity, "Entity must be specified.");
      return new StreamElementReadSupport(entity);
    }
  }

  /** {@link ReadSupport} that materializes records of the file into {@link StreamElement}s. */
  private static class StreamElementReadSupport extends ReadSupport<StreamElement> {

    private final EntityDescriptor entity;

    public StreamElementReadSupport(EntityDescriptor entity) {
      this.entity = entity;
    }

    /** Request the complete file schema, i.e. no column projection is applied. */
    @Override
    public ReadContext init(InitContext context) {
      return new ReadContext(context.getFileSchema());
    }

    /**
     * Create the materializer, passing it the attribute names prefix stored in the file metadata
     * (empty string when the metadata do not contain it).
     */
    @Override
    public RecordMaterializer<StreamElement> prepareForRead(
        Configuration configuration,
        Map<String, String> keyValueMetaData,
        MessageType fileSchema,
        ReadContext readContext) {
      final String attributeNamesPrefix =
          keyValueMetaData.getOrDefault(
              ParquetFileFormat.PARQUET_CONFIG_VALUES_PREFIX_KEY_NAME, "");
      return new StreamElementMaterializer(fileSchema, entity, attributeNamesPrefix);
    }
  }

  /** {@link InputFile} backed by a {@link SeekableByteChannel}. */
  private static class BulkInputFile implements InputFile {

    private final SeekableByteChannel channel;

    BulkInputFile(SeekableByteChannel channel) {
      this.channel = channel;
    }

    @Override
    public long getLength() throws IOException {
      return channel.size();
    }

    /**
     * Create a stream view of the channel.
     *
     * <p>Position and seek are delegated directly to the channel, so every stream created by this
     * method shares the position of the single underlying channel.
     */
    @Override
    public SeekableInputStream newStream() {
      return new DelegatingSeekableInputStream(Channels.newInputStream(channel)) {

        @Override
        public long getPos() throws IOException {
          return channel.position();
        }

        @Override
        public void seek(long newPosition) throws IOException {
          channel.position(newPosition);
        }
      };
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy