All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.parquet.internal.column.columnindex.OffsetIndexBuilder Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.internal.column.columnindex;

import java.util.Formatter;

import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.longs.LongList;

/**
 * Builder implementation to create {@link OffsetIndex} objects during writing a parquet file.
 */
public class OffsetIndexBuilder {

  private static class OffsetIndexImpl implements OffsetIndex {
    private long[] offsets;
    private int[] compressedPageSizes;
    private long[] firstRowIndexes;

    @Override
    public String toString() {
      try (Formatter formatter = new Formatter()) {
        formatter.format("%-10s  %20s  %16s  %20s\n", "", "offset", "compressed size", "first row index");
        for (int i = 0, n = offsets.length; i < n; ++i) {
          formatter.format("page-%-5d  %20d  %16d  %20d\n", i, offsets[i], compressedPageSizes[i], firstRowIndexes[i]);
        }
        return formatter.toString();
      }
    }

    @Override
    public int getPageCount() {
      return offsets.length;
    }

    @Override
    public long getOffset(int pageIndex) {
      return offsets[pageIndex];
    }

    @Override
    public int getCompressedPageSize(int pageIndex) {
      return compressedPageSizes[pageIndex];
    }

    @Override
    public long getFirstRowIndex(int pageIndex) {
      return firstRowIndexes[pageIndex];
    }
    
    @Override
    public int getPageOrdinal(int pageIndex) {
      return pageIndex;
    }
  }

  private static final OffsetIndexBuilder NO_OP_BUILDER = new OffsetIndexBuilder() {
    @Override
    public void add(int compressedPageSize, long rowCount) {
    }

    @Override
    public void add(long offset, int compressedPageSize, long rowCount) {
    }
  };

  private final LongList offsets = new LongArrayList();
  private final IntList compressedPageSizes = new IntArrayList();
  private final LongList firstRowIndexes = new LongArrayList();
  private long previousOffset;
  private int previousPageSize;
  private long previousRowIndex;
  private long previousRowCount;

  /**
   * @return a no-op builder that does not collect values and therefore returns {@code null} at {@link #build(long)}
   */
  public static OffsetIndexBuilder getNoOpBuilder() {
    return NO_OP_BUILDER;
  }

  /**
   * @return an {@link OffsetIndexBuilder} instance to build an {@link OffsetIndex} object
   */
  public static OffsetIndexBuilder getBuilder() {
    return new OffsetIndexBuilder();
  }

  private OffsetIndexBuilder() {
  }

  /**
   * Adds the specified parameters to this builder. Used by the writers to building up {@link OffsetIndex} objects to be
   * written to the Parquet file.
   *
   * @param compressedPageSize
   *          the size of the page (including header)
   * @param rowCount
   *          the number of rows in the page
   */
  public void add(int compressedPageSize, long rowCount) {
    add(previousOffset + previousPageSize, compressedPageSize, previousRowIndex + previousRowCount);
    previousRowCount = rowCount;
  }

  /**
   * Adds the specified parameters to this builder. Used by the metadata converter to building up {@link OffsetIndex}
   * objects read from the Parquet file.
   *
   * @param offset
   *          the offset of the page in the file
   * @param compressedPageSize
   *          the size of the page (including header)
   * @param firstRowIndex
   *          the index of the first row in the page (within the row group)
   */
  public void add(long offset, int compressedPageSize, long firstRowIndex) {
    previousOffset = offset;
    offsets.add(offset);
    previousPageSize = compressedPageSize;
    compressedPageSizes.add(compressedPageSize);
    previousRowIndex = firstRowIndex;
    firstRowIndexes.add(firstRowIndex);
  }

  /**
   * Builds the offset index. Used by the metadata converter to building up {@link OffsetIndex}
   * objects read from the Parquet file.
   *
   * @return the newly created offset index or {@code null} if the {@link OffsetIndex} object would be empty
   */
  public OffsetIndex build() {
    return build(0);
  }

  /**
   * Builds the offset index. Used by the writers to building up {@link OffsetIndex} objects to be
   * written to the Parquet file.
   *
   * @param firstPageOffset
   *          the actual offset in the file to be used to translate all the collected offsets
   * @return the newly created offset index or {@code null} if the {@link OffsetIndex} object would be empty
   */
  public OffsetIndex build(long firstPageOffset) {
    if (compressedPageSizes.isEmpty()) {
      return null;
    }
    long[] offsets = this.offsets.toLongArray();
    if (firstPageOffset != 0) {
      for (int i = 0, n = offsets.length; i < n; ++i) {
        offsets[i] += firstPageOffset;
      }
    }
    OffsetIndexImpl offsetIndex = new OffsetIndexImpl();
    offsetIndex.offsets = offsets;
    offsetIndex.compressedPageSizes = compressedPageSizes.toIntArray();
    offsetIndex.firstRowIndexes = firstRowIndexes.toLongArray();

    return offsetIndex;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy