All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.parquet.hadoop.ColumnIndexFilterUtils Maven / Gradle / Ivy

There is a newer version: 1.11.9
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.hadoop;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Formatter;
import java.util.List;

import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.internal.column.columnindex.OffsetIndex;
import org.apache.parquet.internal.filter2.columnindex.RowRanges;

import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntList;

/**
 * Internal utility class to help at column index based filtering.
 */
class ColumnIndexFilterUtils {
  static class OffsetRange {
    private final long offset;
    private long length;

    private OffsetRange(long offset, int length) {
      this.offset = offset;
      this.length = length;
    }

    long getOffset() {
      return offset;
    }

    long getLength() {
      return length;
    }

    private boolean extend(long offset, int length) {
      if (this.offset + this.length == offset) {
        this.length += length;
        return true;
      } else {
        return false;
      }
    }
  }

  private static class FilteredOffsetIndex implements OffsetIndex {
    private final OffsetIndex offsetIndex;
    private final int[] indexMap;

    private FilteredOffsetIndex(OffsetIndex offsetIndex, int[] indexMap) {
      this.offsetIndex = offsetIndex;
      this.indexMap = indexMap;
    }
    
    @Override
    public int getPageOrdinal(int pageIndex) {
      return indexMap[pageIndex];
    }

    @Override
    public int getPageCount() {
      return indexMap.length;
    }

    @Override
    public long getOffset(int pageIndex) {
      return offsetIndex.getOffset(indexMap[pageIndex]);
    }

    @Override
    public int getCompressedPageSize(int pageIndex) {
      return offsetIndex.getCompressedPageSize(indexMap[pageIndex]);
    }

    @Override
    public long getFirstRowIndex(int pageIndex) {
      return offsetIndex.getFirstRowIndex(indexMap[pageIndex]);
    }

    @Override
    public long getLastRowIndex(int pageIndex, long totalRowCount) {
      int nextIndex = indexMap[pageIndex] + 1;
      return (nextIndex >= offsetIndex.getPageCount() ? totalRowCount : offsetIndex.getFirstRowIndex(nextIndex)) - 1;
    }

    @Override
    public String toString() {
      try (Formatter formatter = new Formatter()) {
        formatter.format("%-12s  %20s  %16s  %20s\n", "", "offset", "compressed size", "first row index");
        for (int i = 0, n = offsetIndex.getPageCount(); i < n; ++i) {
          int index = Arrays.binarySearch(indexMap, i);
          boolean isHidden = index < 0;
          formatter.format("%spage-%-5d  %20d  %16d  %20d\n",
              isHidden ? "- " : "  ",
              isHidden ? i : index,
              offsetIndex.getOffset(i),
              offsetIndex.getCompressedPageSize(i),
              offsetIndex.getFirstRowIndex(i));
        }
        return formatter.toString();
      }
    }
  }

  /*
   * Returns the filtered offset index containing only the pages which are overlapping with rowRanges.
   */
  static OffsetIndex filterOffsetIndex(OffsetIndex offsetIndex, RowRanges rowRanges, long totalRowCount) {
    IntList indexMap = new IntArrayList();
    for (int i = 0, n = offsetIndex.getPageCount(); i < n; ++i) {
      long from = offsetIndex.getFirstRowIndex(i);
      if (rowRanges.isOverlapping(from, offsetIndex.getLastRowIndex(i, totalRowCount))) {
        indexMap.add(i);
      }
    }
    return new FilteredOffsetIndex(offsetIndex, indexMap.toIntArray());
  }

  static List calculateOffsetRanges(OffsetIndex offsetIndex, ColumnChunkMetaData cm,
      long firstPageOffset) {
    List ranges = new ArrayList<>();
    int n = offsetIndex.getPageCount();
    if (n > 0) {
      OffsetRange currentRange = null;

      // Add a range for the dictionary page if required
      long rowGroupOffset = cm.getStartingPos();
      if (rowGroupOffset < firstPageOffset) {
        currentRange = new OffsetRange(rowGroupOffset, (int) (firstPageOffset - rowGroupOffset));
        ranges.add(currentRange);
      }

      for (int i = 0; i < n; ++i) {
        long offset = offsetIndex.getOffset(i);
        int length = offsetIndex.getCompressedPageSize(i);
        if (currentRange == null || !currentRange.extend(offset, length)) {
          currentRange = new OffsetRange(offset, length);
          ranges.add(currentRange);
        }
      }
    }
    return ranges;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy