// All downloads are free. Search and download functionality uses the official Maven repository.

// org.apache.parquet.filter2.compat.RowGroupFilter Maven / Gradle / Ivy

// There is a newer version: 1.11.9
// Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.filter2.compat;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Objects;

import org.apache.parquet.filter2.bloomfilterlevel.BloomFilterImpl;
import org.apache.parquet.filter2.compat.FilterCompat.Filter;
import org.apache.parquet.filter2.compat.FilterCompat.NoOpFilter;
import org.apache.parquet.filter2.compat.FilterCompat.Visitor;
import org.apache.parquet.filter2.dictionarylevel.DictionaryFilter;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.filter2.predicate.SchemaCompatibilityValidator;
import org.apache.parquet.filter2.statisticslevel.StatisticsFilter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.schema.MessageType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Given a {@link Filter} applies it to a list of BlockMetaData (row groups)
 * If the Filter is an {@link org.apache.parquet.filter.UnboundRecordFilter} or the no op filter,
 * no filtering will be performed.
 */
public class RowGroupFilter implements Visitor> {
  private final List blocks;
  private final MessageType schema;
  private final List levels;
  private final ParquetFileReader reader;

  public enum FilterLevel {
    STATISTICS,
    DICTIONARY,
    BLOOMFILTER
  }

  /**
   * @param filter a filter
   * @param blocks a list of block metadata to filter
   * @param schema the file schema
   * @return a filtered list of block metadata
   * @deprecated will be removed in 2.0.0.
   */
  @Deprecated
  public static List filterRowGroups(Filter filter, List blocks, MessageType schema) {
	  Objects.requireNonNull(filter, "filter cannot be null");
    return filter.accept(new RowGroupFilter(blocks, schema));
  }

  public static List filterRowGroups(List levels, Filter filter, List blocks, ParquetFileReader reader) {
    Objects.requireNonNull(filter, "filter cannot be null");
    return filter.accept(new RowGroupFilter(levels, blocks, reader));
  }

  @Deprecated
  private RowGroupFilter(List blocks, MessageType schema) {
    this.blocks = Objects.requireNonNull(blocks, "blocks cannnot be null");
    this.schema = Objects.requireNonNull(schema, "schema cannnot be null");
    this.levels = Collections.singletonList(FilterLevel.STATISTICS);
    this.reader = null;
  }

  private RowGroupFilter(List levels, List blocks, ParquetFileReader reader) {
    this.blocks = Objects.requireNonNull(blocks, "blocks cannnot be null");
    this.reader = Objects.requireNonNull(reader, "reader cannnot be null");
    this.schema = reader.getFileMetaData().getSchema();
    this.levels = levels;
  }

  @Override
  public List visit(FilterCompat.FilterPredicateCompat filterPredicateCompat) {
    FilterPredicate filterPredicate = filterPredicateCompat.getFilterPredicate();

    // check that the schema of the filter matches the schema of the file
    SchemaCompatibilityValidator.validate(filterPredicate, schema);

    List filteredBlocks = new ArrayList();

    for (BlockMetaData block : blocks) {
      boolean drop = false;

      if(levels.contains(FilterLevel.STATISTICS)) {
        drop = StatisticsFilter.canDrop(filterPredicate, block.getColumns());
      }

      if(!drop && levels.contains(FilterLevel.DICTIONARY)) {
        drop = DictionaryFilter.canDrop(filterPredicate, block.getColumns(), reader.getDictionaryReader(block));
      }

      if (!drop && levels.contains(FilterLevel.BLOOMFILTER)) {
        drop = BloomFilterImpl.canDrop(filterPredicate, block.getColumns(), reader.getBloomFilterDataReader(block));
      }

      if(!drop) {
        filteredBlocks.add(block);
      }
    }

    return filteredBlocks;
  }

  @Override
  public List visit(FilterCompat.UnboundRecordFilterCompat unboundRecordFilterCompat) {
    return blocks;
  }

  @Override
  public List visit(NoOpFilter noOpFilter) {
    return blocks;
  }
}




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy