org.apache.parquet.hadoop.ParquetInputFormat Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of parquet-hadoop Show documentation

There is a newer version: 1.14.4

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.hadoop;

import static java.lang.Boolean.TRUE;
import static org.apache.parquet.Preconditions.checkArgument;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.parquet.Preconditions;
import org.apache.parquet.conf.HadoopParquetConfiguration;
import org.apache.parquet.conf.ParquetConfiguration;
import org.apache.parquet.filter.UnboundRecordFilter;
import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.filter2.compat.FilterCompat.Filter;
import org.apache.parquet.filter2.compat.RowGroupFilter;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.hadoop.api.InitContext;
import org.apache.parquet.hadoop.api.ReadSupport;
import org.apache.parquet.hadoop.api.ReadSupport.ReadContext;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.GlobalMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.ConfigurationUtil;
import org.apache.parquet.hadoop.util.ContextUtil;
import org.apache.parquet.hadoop.util.HiddenFileFilter;
import org.apache.parquet.hadoop.util.SerializationUtil;
import org.apache.parquet.io.ParquetDecodingException;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * The input format to read a Parquet file.
 * <p>
 * It requires an implementation of {@link ReadSupport} to materialize the records.
 * <p>
 * The requestedSchema will control how the original records get projected by the loader.
 * It must be a subset of the original schema. Only the columns needed to reconstruct the records with the requestedSchema will be scanned.
 *
 * @param <T> the type of the materialized records
 * @see #READ_SUPPORT_CLASS
 * @see #UNBOUND_RECORD_FILTER
 * @see #STRICT_TYPE_CHECKING
 * @see #FILTER_PREDICATE
 * @see #TASK_SIDE_METADATA
 */
public class ParquetInputFormat extends FileInputFormat {

  private static final Logger LOG = LoggerFactory.getLogger(ParquetInputFormat.class);

  /**
   * key to configure the ReadSupport implementation
   */
  public static final String READ_SUPPORT_CLASS = "parquet.read.support.class";

  /**
   * key to configure the filter
   */
  public static final String UNBOUND_RECORD_FILTER = "parquet.read.filter";

  /**
   * key to configure type checking for conflicting schemas (default: true)
   */
  public static final String STRICT_TYPE_CHECKING = "parquet.strict.typing";

  /**
   * key to configure the filter predicate
   */
  public static final String FILTER_PREDICATE = "parquet.private.read.filter.predicate";

  /**
   * key to configure whether record-level filtering is enabled
   */
  public static final String RECORD_FILTERING_ENABLED = "parquet.filter.record-level.enabled";

  /**
   * key to configure whether row group stats filtering is enabled
   */
  public static final String STATS_FILTERING_ENABLED = "parquet.filter.stats.enabled";

  /**
   * key to configure whether row group dictionary filtering is enabled
   */
  public static final String DICTIONARY_FILTERING_ENABLED = "parquet.filter.dictionary.enabled";

  /**
   * key to configure whether column index filtering of pages is enabled
   */
  public static final String COLUMN_INDEX_FILTERING_ENABLED = "parquet.filter.columnindex.enabled";

  /**
   * key to configure whether page level checksum verification is enabled
   */
  public static final String PAGE_VERIFY_CHECKSUM_ENABLED = "parquet.page.verify-checksum.enabled";

  /**
   * key to configure whether row group bloom filtering is enabled
   */
  public static final String BLOOM_FILTERING_ENABLED = "parquet.filter.bloom.enabled";

  /**
   * Key to configure if off-heap buffer should be used for decryption
   */
  public static final String OFF_HEAP_DECRYPT_BUFFER_ENABLED = "parquet.decrypt.off-heap.buffer.enabled";

  /**
   * key to turn on or off task side metadata loading (default true).
   * If true, metadata is read on the task side and some tasks may finish immediately.
   * If false, metadata is read on the client, which is slower if there is a lot of metadata, but tasks will only be spawned if there is work to do.
   */
  public static final String TASK_SIDE_METADATA = "parquet.task.side.metadata";

  /**
   * key to turn off file splitting. See PARQUET-246.
   */
  public static final String SPLIT_FILES = "parquet.split.files";

  private static final int MIN_FOOTER_CACHE_SIZE = 100;

  /**
   * Key to enable/disable vectored io while reading parquet files:
   * {@value}.
   */
  public static final String HADOOP_VECTORED_IO_ENABLED = "parquet.hadoop.vectored.io.enabled";

  /**
   * Default value of parquet.hadoop.vectored.io.enabled is {@value}.
   */
  public static final boolean HADOOP_VECTORED_IO_DEFAULT = false;

  /**
   * Stores the task-side-metadata switch in the configuration of the given job.
   *
   * @param job              the job whose configuration is updated
   * @param taskSideMetadata the value written under {@link #TASK_SIDE_METADATA}
   */
  public static void setTaskSideMetaData(Job job, boolean taskSideMetadata) {
    Configuration jobConfiguration = ContextUtil.getConfiguration(job);
    jobConfiguration.setBoolean(TASK_SIDE_METADATA, taskSideMetadata);
  }

  /**
   * Reads the task-side-metadata switch.
   *
   * @param configuration the configuration to read from
   * @return the boolean stored under {@link #TASK_SIDE_METADATA}, or {@code true} when the key is absent
   */
  public static boolean isTaskSideMetaData(Configuration configuration) {
    boolean taskSide = configuration.getBoolean(TASK_SIDE_METADATA, TRUE);
    return taskSide;
  }

  /**
   * Stores the name of the read support class in the configuration of the given job.
   *
   * @param job              the job whose configuration is updated
   * @param readSupportClass the class whose name is written under {@link #READ_SUPPORT_CLASS}
   */
  public static void setReadSupportClass(Job job, Class readSupportClass) {
    Configuration jobConfiguration = ContextUtil.getConfiguration(job);
    jobConfiguration.set(READ_SUPPORT_CLASS, readSupportClass.getName());
  }

  /**
   * Stores the name of an unbound record filter class in the configuration of the given job.
   * Rejected when a filter predicate has already been stored in that configuration.
   *
   * @param job         the job whose configuration is updated
   * @param filterClass the class whose name is written under {@link #UNBOUND_RECORD_FILTER}
   */
  public static void setUnboundRecordFilter(Job job, Class filterClass) {
    Configuration jobConfiguration = ContextUtil.getConfiguration(job);
    boolean noPredicateConfigured = getFilterPredicate(jobConfiguration) == null;
    checkArgument(
        noPredicateConfigured,
        "You cannot provide an UnboundRecordFilter after providing a FilterPredicate");

    String filterClassName = filterClass.getName();
    jobConfiguration.set(UNBOUND_RECORD_FILTER, filterClassName);
  }

  /**
   * Looks up the class configured under {@link #UNBOUND_RECORD_FILTER}.
   *
   * @param configuration the configuration to read the class name from
   * @return the unbound record filter class resolved for that key
   * @deprecated use {@link #getFilter(Configuration)}
   */
  @Deprecated
  public static Class getUnboundRecordFilter(Configuration configuration) {
    Class filterClass =
        ConfigurationUtil.getClassFromConfig(configuration, UNBOUND_RECORD_FILTER, UnboundRecordFilter.class);
    return filterClass;
  }

  /**
   * Instantiates the unbound record filter configured under {@link #UNBOUND_RECORD_FILTER}.
   *
   * @param configuration the configuration to read the class name from
   * @return a new filter instance, or {@code null} when no class is resolved for the key
   * @throws BadConfigurationException if the class cannot be instantiated through its
   *                                   no-argument constructor
   */
  private static UnboundRecordFilter getUnboundRecordFilterInstance(ParquetConfiguration configuration) {
    Class<?> clazz =
        ConfigurationUtil.getClassFromConfig(configuration, UNBOUND_RECORD_FILTER, UnboundRecordFilter.class);
    if (clazz == null) {
      return null;
    }
    try {
      // Class.newInstance() is deprecated and rethrows checked constructor exceptions undeclared;
      // going through the Constructor wraps them so they surface as BadConfigurationException.
      return (UnboundRecordFilter) clazz.getDeclaredConstructor().newInstance();
    } catch (ReflectiveOperationException e) {
      throw new BadConfigurationException("could not instantiate unbound record filter class", e);
    }
  }

  /**
   * Stores the name of the read support class in the given job configuration.
   *
   * @param conf             the configuration to update
   * @param readSupportClass the class whose name is written under {@link #READ_SUPPORT_CLASS}
   */
  public static void setReadSupportClass(JobConf conf, Class readSupportClass) {
    String readSupportClassName = readSupportClass.getName();
    conf.set(READ_SUPPORT_CLASS, readSupportClassName);
  }

  /**
   * Looks up the class configured under {@link #READ_SUPPORT_CLASS}.
   *
   * @param configuration the configuration to read the class name from
   * @return the read support class resolved for that key
   */
  public static Class getReadSupportClass(Configuration configuration) {
    Class configuredClass =
        ConfigurationUtil.getClassFromConfig(configuration, READ_SUPPORT_CLASS, ReadSupport.class);
    return configuredClass;
  }

  /**
   * Stores a filter predicate in the configuration: its {@code toString()} form under
   * {@code FILTER_PREDICATE + ".human.readable"} and its serialized form under
   * {@link #FILTER_PREDICATE}. Rejected when an unbound record filter is already configured.
   *
   * @param configuration   the configuration to update
   * @param filterPredicate the predicate to store
   */
  public static void setFilterPredicate(Configuration configuration, FilterPredicate filterPredicate) {
    boolean noUnboundFilterConfigured = getUnboundRecordFilter(configuration) == null;
    checkArgument(
        noUnboundFilterConfigured,
        "You cannot provide a FilterPredicate after providing an UnboundRecordFilter");

    String readableKey = FILTER_PREDICATE + ".human.readable";
    configuration.set(readableKey, filterPredicate.toString());
    try {
      SerializationUtil.writeObjectToConfAsBase64(FILTER_PREDICATE, filterPredicate, configuration);
    } catch (IOException ioe) {
      throw new RuntimeException(ioe);
    }
  }

  /**
   * Hadoop-configuration variant: wraps the configuration and delegates to
   * {@link #getFilterPredicate(ParquetConfiguration)}.
   */
  private static FilterPredicate getFilterPredicate(Configuration configuration) {
    ParquetConfiguration wrapped = new HadoopParquetConfiguration(configuration);
    return getFilterPredicate(wrapped);
  }

  /**
   * Reads back the object stored under {@link #FILTER_PREDICATE}; an {@link IOException}
   * raised while reading is rethrown wrapped in a {@link RuntimeException}.
   */
  private static FilterPredicate getFilterPredicate(ParquetConfiguration configuration) {
    FilterPredicate predicate;
    try {
      predicate = SerializationUtil.readObjectFromConfAsBase64(FILTER_PREDICATE, configuration);
    } catch (IOException ioe) {
      throw new RuntimeException(ioe);
    }
    return predicate;
  }

  /**
   * Returns a non-null Filter, which is a wrapper around either a
   * FilterPredicate, an UnboundRecordFilter, or a no-op filter.
   * The Hadoop configuration is wrapped and handed to {@link #getFilter(ParquetConfiguration)}.
   *
   * @param conf a configuration
   * @return a filter for the unbound record filter specified in conf
   */
  public static Filter getFilter(Configuration conf) {
    ParquetConfiguration wrapped = new HadoopParquetConfiguration(conf);
    return getFilter(wrapped);
  }

  /**
   * Builds the filter from the filter predicate and the unbound record filter instance
   * found in the configuration, combined through {@link FilterCompat#get}.
   *
   * @param conf a configuration
   * @return the filter produced by {@code FilterCompat.get}
   */
  public static Filter getFilter(ParquetConfiguration conf) {
    FilterPredicate predicate = getFilterPredicate(conf);
    UnboundRecordFilter unboundFilter = getUnboundRecordFilterInstance(conf);
    return FilterCompat.get(predicate, unboundFilter);
  }

  private LruCache footersCache;

  private final Class> readSupportClass;

  /**
   * No-argument constructor; Hadoop will instantiate using this constructor.
   * The read support class is left unset.
   */
  public ParquetInputFormat() {
    readSupportClass = null;
  }

  /**
   * Constructor for subclasses, such as AvroParquetInputFormat, or wrappers.
   * <p>
   * Subclasses and wrappers may use this constructor to set the ReadSupport
   * class that will be used when reading instead of requiring the user to set
   * the read support property in their configuration.
   *
   * @param readSupportClass a ReadSupport subclass
   * @param <S>              the Java read support type
   */
  public <S extends ReadSupport<T>> ParquetInputFormat(Class<S> readSupportClass) {
    this.readSupportClass = readSupportClass;
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public RecordReader createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
      throws IOException, InterruptedException {
    Configuration taskConfiguration = ContextUtil.getConfiguration(taskAttemptContext);
    // Resolve the read support first, then the filter, both from the task's configuration.
    ReadSupport support = getReadSupport(taskConfiguration);
    Filter recordFilter = getFilter(taskConfiguration);
    return new ParquetRecordReader(support, recordFilter);
  }

  /**
   * @param configuration to find the configuration for the read support
   * @return the configured read support
   * @deprecated use getReadSupportInstance static methods instead
   */
  @Deprecated
  @SuppressWarnings("unchecked")
  ReadSupport getReadSupport(Configuration configuration) {
    return getReadSupportInstance(
        readSupportClass == null
            ? (Class>) getReadSupportClass(configuration)
            : readSupportClass);
  }

  /**
   * @param configuration to find the configuration for the read support
   * @param            the Java type of objects created by the ReadSupport
   * @return the configured read support
   */
  @SuppressWarnings("unchecked")
  public static  ReadSupport getReadSupportInstance(Configuration configuration) {
    return getReadSupportInstance((Class>) getReadSupportClass(configuration));
  }

  /**
   * @param readSupportClass to instantiate
   * @param               the Java type of objects created by the ReadSupport
   * @return the configured read support
   */
  @SuppressWarnings("unchecked")
  static  ReadSupport getReadSupportInstance(Class> readSupportClass) {
    try {
      return readSupportClass.newInstance();
    } catch (InstantiationException | IllegalAccessException e) {
      throw new BadConfigurationException("could not instantiate read support class", e);
    }
  }

  /**
   * Files are splittable unless {@link #SPLIT_FILES} is set to {@code false} in the
   * configuration of the given context.
   */
  @Override
  protected boolean isSplitable(JobContext context, Path filename) {
    Configuration contextConfiguration = ContextUtil.getConfiguration(context);
    return contextConfiguration.getBoolean(SPLIT_FILES, true);
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public List getSplits(JobContext jobContext) throws IOException {
    Configuration configuration = ContextUtil.getConfiguration(jobContext);
    List splits = new ArrayList();

    if (!isTaskSideMetaData(configuration)) {
      // Client-side metadata: plan the splits from the file footers.
      splits.addAll(getSplits(configuration, getFooters(jobContext)));
      return splits;
    }

    // Although not required by the API, some clients may depend on always
    // receiving ParquetInputSplit. Translation is required at some point.
    for (InputSplit split : super.getSplits(jobContext)) {
      Preconditions.checkArgument(split instanceof FileSplit, "Cannot wrap non-FileSplit: %s", split);
      FileSplit fileSplit = (FileSplit) split;
      splits.add(ParquetInputSplit.from(fileSplit));
    }
    return splits;
  }

  /**
   * Plans splits on the client from already-read footers. Split size bounds come from
   * {@code mapred.max.split.size} and {@code mapred.min.split.size}; the read context is
   * initialised from the metadata merged across all footers.
   *
   * @param configuration the configuration to connect to the file system
   * @param footers       the footers of the files to read
   * @return the splits for the footers
   * @throws IOException if there is an error while reading
   * @deprecated split planning using file footers will be removed
   */
  @Deprecated
  public List getSplits(Configuration configuration, List footers) throws IOException {
    boolean strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
    final long maxSplitSize = configuration.getLong("mapred.max.split.size", Long.MAX_VALUE);
    final long configuredMinSplitSize = configuration.getLong("mapred.min.split.size", 0L);
    final long minSplitSize = Math.max(getFormatMinSplitSize(), configuredMinSplitSize);
    boolean negativeBound = maxSplitSize < 0 || minSplitSize < 0;
    if (negativeBound) {
      throw new ParquetDecodingException("maxSplitSize or minSplitSize should not be negative: maxSplitSize = "
          + maxSplitSize + "; minSplitSize = " + minSplitSize);
    }

    GlobalMetaData globalMetaData = ParquetFileWriter.getGlobalMetaData(footers, strictTypeChecking);
    InitContext initContext =
        new InitContext(configuration, globalMetaData.getKeyValueMetaData(), globalMetaData.getSchema());
    ReadContext readContext = getReadSupport(configuration).init(initContext);

    ClientSideMetadataSplitStrategy strategy = new ClientSideMetadataSplitStrategy();
    return strategy.getSplits(configuration, footers, maxSplitSize, minSplitSize, readContext);
  }

  /*
   * Expands the directories returned by the parent listing into the files below them.
   * This is to support multi-level/recursive directory listing until
   * MAPREDUCE-1577 is fixed.
   */
  @Override
  protected List listStatus(JobContext jobContext) throws IOException {
    List topLevel = super.listStatus(jobContext);
    Configuration configuration = ContextUtil.getConfiguration(jobContext);
    return getAllFileRecursively(topLevel, configuration);
  }

  /**
   * Flattens a listing: plain files are kept as they are, directories are replaced by the
   * files found beneath them (filtered by {@link HiddenFileFilter#INSTANCE}).
   *
   * @param files the statuses to expand
   * @param conf  the configuration used to obtain the file system of each directory
   * @return the statuses of all files found
   * @throws IOException if listing a directory fails
   */
  private static List getAllFileRecursively(List files, Configuration conf)
      throws IOException {
    List collected = new ArrayList();
    for (FileStatus candidate : files) {
      if (!candidate.isDir()) {
        collected.add(candidate);
        continue;
      }
      Path directory = candidate.getPath();
      FileSystem directoryFs = directory.getFileSystem(conf);
      staticAddInputPathRecursively(collected, directoryFs, directory, HiddenFileFilter.INSTANCE);
    }
    LOG.info("Total input paths to process : {}", collected.size());
    return collected;
  }

  /**
   * Depth-first walk below {@code path}: every non-directory entry accepted by
   * {@code inputFilter} is appended to {@code result}; directories are descended into.
   */
  private static void staticAddInputPathRecursively(
      List result, FileSystem fs, Path path, PathFilter inputFilter) throws IOException {
    FileStatus[] children = fs.listStatus(path, inputFilter);
    for (FileStatus child : children) {
      if (!child.isDir()) {
        result.add(child);
        continue;
      }
      staticAddInputPathRecursively(result, fs, child.getPath(), inputFilter);
    }
  }

  /**
   * Returns the footers of all input files of the job, in the order of the statuses returned
   * by {@link #listStatus(JobContext)}. Footers with a current entry in this instance's footer
   * cache are served from it; the others are read and then offered to the cache.
   *
   * @param jobContext the current job context
   * @return the footers for the files
   * @throws IOException if there is an error while reading, or if no footer was obtained for
   *                     one of the files
   */
  public List<Footer> getFooters(JobContext jobContext) throws IOException {
    List<FileStatus> statuses = listStatus(jobContext);
    if (statuses.isEmpty()) {
      return Collections.emptyList();
    }
    Configuration config = ContextUtil.getConfiguration(jobContext);
    // Use LinkedHashMap to preserve the insertion order and ultimately to return the list of
    // footers in the same order as the list of file statuses returned from listStatus()
    Map<FileStatusWrapper, Footer> footersMap = new LinkedHashMap<>();
    Set<FileStatus> missingStatuses = new HashSet<>();
    Map<Path, FileStatusWrapper> missingStatusesMap = new HashMap<>();

    if (footersCache == null) {
      footersCache = new LruCache<>(Math.max(statuses.size(), MIN_FOOTER_CACHE_SIZE));
    }
    for (FileStatus status : statuses) {
      FileStatusWrapper statusWrapper = new FileStatusWrapper(status);
      FootersCacheValue cacheEntry = footersCache.getCurrentValue(statusWrapper);
      LOG.debug("Cache entry {}found for '{}'", cacheEntry == null ? "not " : "", status.getPath());
      if (cacheEntry != null) {
        footersMap.put(statusWrapper, cacheEntry.getFooter());
      } else {
        // Placeholder keeps the file's position; it is filled in once the footer has been read.
        footersMap.put(statusWrapper, null);
        missingStatuses.add(status);
        missingStatusesMap.put(status.getPath(), statusWrapper);
      }
    }
    // footersMap also holds the placeholders of the misses, so subtract them to get the hits.
    LOG.debug(
        "found {} footers in cache and adding up to {} missing footers to the cache",
        footersMap.size() - missingStatuses.size(),
        missingStatuses.size());

    if (!missingStatuses.isEmpty()) {
      List<Footer> newFooters = getFooters(config, missingStatuses);
      for (Footer newFooter : newFooters) {
        // Use the original file status objects to make sure we store a
        // conservative (older) modification time (i.e. in case the files and
        // footers were modified and it's not clear which version of the footers
        // we have)
        FileStatusWrapper fileStatus = missingStatusesMap.get(newFooter.getFile());
        if (fileStatus == null) {
          LOG.warn("Ignoring footer read for a file that was not requested: '{}'", newFooter.getFile());
          continue;
        }
        footersCache.put(fileStatus, new FootersCacheValue(fileStatus, newFooter));
        // Keep the footer locally as well so the result does not depend on the cache still
        // holding the entry. Re-putting an existing key does not change its position.
        footersMap.put(fileStatus, newFooter);
      }
    }

    List<Footer> footers = new ArrayList<>(statuses.size());
    for (Entry<FileStatusWrapper, Footer> footerEntry : footersMap.entrySet()) {
      Footer footer = footerEntry.getValue();
      if (footer == null) {
        throw new IOException("Could not read footer for file '" + footerEntry.getKey() + "'");
      }
      footers.add(footer);
    }

    return footers;
  }

  /**
   * List variant of {@link #getFooters(Configuration, Collection)}; simply forwards to it.
   *
   * @param configuration to connect to the file system
   * @param statuses      the files to open
   * @return the footers of the files
   * @throws IOException if there is an error while reading
   */
  public List getFooters(Configuration configuration, List statuses) throws IOException {
    // Typed as Collection so that the Collection overload is selected rather than this method.
    Collection asCollection = statuses;
    return getFooters(configuration, asCollection);
  }

  /**
   * Reads the footers of the given files through
   * {@code ParquetFileReader.readAllFootersInParallelUsingSummaryFiles}, passing along the
   * task-side-metadata setting of the configuration.
   *
   * @param configuration to connect to the file system
   * @param statuses      the files to open
   * @return the footers of the files
   * @throws IOException if there is an error while reading
   */
  public List getFooters(Configuration configuration, Collection statuses) throws IOException {
    LOG.debug("reading {} files", statuses.size());
    return ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(
        configuration, statuses, isTaskSideMetaData(configuration));
  }

  /**
   * Reads the footers of the job's input files and merges their metadata.
   *
   * @param jobContext the current job context
   * @return the merged metadata from the footers
   * @throws IOException if there is an error while reading
   */
  public GlobalMetaData getGlobalMetaData(JobContext jobContext) throws IOException {
    List jobFooters = getFooters(jobContext);
    return ParquetFileWriter.getGlobalMetaData(jobFooters);
  }

  /**
   * Cache value pairing a {@link org.apache.parquet.hadoop.Footer} with the modification
   * time of the file status it was stored for. The modification time is what decides
   * whether the cached footer is still current.
   */
  static final class FootersCacheValue implements LruCache.Value {
    private final long modificationTime;
    private final Footer footer;

    /**
     * @param status the file status whose modification time is recorded
     * @param footer the footer to cache; a new Footer with the same file and metadata is stored
     */
    public FootersCacheValue(FileStatusWrapper status, Footer footer) {
      modificationTime = status.getModificationTime();
      this.footer = new Footer(footer.getFile(), footer.getParquetMetadata());
    }

    /**
     * @param key the status to compare against
     * @return true when the recorded modification time is not older than the key's
     */
    @Override
    public boolean isCurrent(FileStatusWrapper key) {
      long currentModTime = key.getModificationTime();
      if (modificationTime >= currentModTime) {
        return true;
      }
      if (LOG.isDebugEnabled()) {
        LOG.debug(
            "The cache value for '{}' is not current: cached modification time={}, current modification time: {}",
            key,
            modificationTime,
            currentModTime);
      }
      return false;
    }

    public Footer getFooter() {
      return footer;
    }

    /**
     * @param otherValue the value to compare with, may be null
     * @return true when there is no other value or this one has a strictly later modification time
     */
    @Override
    public boolean isNewerThan(FootersCacheValue otherValue) {
      if (otherValue == null) {
        return true;
      }
      return modificationTime > otherValue.modificationTime;
    }

    public Path getPath() {
      return footer.getFile();
    }
  }

  /**
   * Thin wrapper around {@link org.apache.hadoop.fs.FileStatus} whose
   * {@code toString()} yields the file path. Equality and hash code are delegated to
   * the wrapped status.
   */
  static final class FileStatusWrapper {
    private final FileStatus status;

    /**
     * @param fileStatus the status to wrap
     * @throws IllegalArgumentException if {@code fileStatus} is null
     */
    public FileStatusWrapper(FileStatus fileStatus) {
      if (fileStatus != null) {
        status = fileStatus;
      } else {
        throw new IllegalArgumentException("FileStatus object cannot be null");
      }
    }

    public long getModificationTime() {
      return status.getModificationTime();
    }

    @Override
    public int hashCode() {
      return status.hashCode();
    }

    @Override
    public boolean equals(Object other) {
      if (!(other instanceof FileStatusWrapper)) {
        return false;
      }
      FileStatusWrapper that = (FileStatusWrapper) other;
      return status.equals(that.status);
    }

    @Override
    public String toString() {
      Path wrappedPath = status.getPath();
      return wrappedPath.toString();
    }
  }
}

class ClientSideMetadataSplitStrategy {
  /**
   * Wrapper of hdfs blocks that keeps track of which HDFS block is being used while the row
   * groups of a file are visited in order. Two cursors are maintained: the block containing
   * the start of the latest row group and the block containing its mid-point.
   */
  private static class HDFSBlocks {
    BlockLocation[] hdfsBlocks;
    // the hdfs block index corresponding to the start of a row group
    int currentStartHdfsBlockIndex = 0;
    // the hdfs block index corresponding to the mid-point of a row group, a split might be created only
    // when the midpoint of the rowgroup enters a new hdfs block
    int currentMidPointHDFSBlockIndex = 0;

    /**
     * @param hdfsBlocks the block locations of the file; the array is sorted in place by offset
     */
    private HDFSBlocks(BlockLocation[] hdfsBlocks) {
      this.hdfsBlocks = hdfsBlocks;
      Comparator<BlockLocation> comparator = new Comparator<BlockLocation>() {
        @Override
        public int compare(BlockLocation b1, BlockLocation b2) {
          // Long.compare avoids the overflow pitfall of comparing by subtraction.
          return Long.compare(b1.getOffset(), b2.getOffset());
        }
      };
      Arrays.sort(hdfsBlocks, comparator);
    }

    /**
     * @param hdfsBlockIndex index into the sorted block array
     * @return the offset of the last byte of that block
     */
    private long getHDFSBlockEndingPosition(int hdfsBlockIndex) {
      BlockLocation hdfsBlock = hdfsBlocks[hdfsBlockIndex];
      return hdfsBlock.getOffset() + hdfsBlock.getLength() - 1;
    }

    /**
     * Advances both cursors so that they point at the blocks containing the mid-point and the
     * start of the given row group.
     *
     * @param rowGroupMetadata the row group being visited
     * @return true if the mid point of row group is in a new hdfs block, and also move the currentHDFSBlock pointer to the correct index that contains the row group;
     * return false if the mid point of row group is in the same hdfs block
     * @throws ParquetDecodingException if the mid-point or the start lies beyond the last block
     */
    private boolean checkBelongingToANewHDFSBlock(BlockMetaData rowGroupMetadata) {
      boolean isNewHdfsBlock = false;
      long rowGroupMidPoint = rowGroupMetadata.getStartingPos() + (rowGroupMetadata.getCompressedSize() / 2);

      // if mid point is not in the current HDFS block any more, return true
      while (rowGroupMidPoint > getHDFSBlockEndingPosition(currentMidPointHDFSBlockIndex)) {
        isNewHdfsBlock = true;
        currentMidPointHDFSBlockIndex++;
        if (currentMidPointHDFSBlockIndex >= hdfsBlocks.length) {
          throw new ParquetDecodingException(
              "the row group is not in hdfs blocks in the file: midpoint of row groups is "
                  + rowGroupMidPoint
                  + ", the end of the hdfs block is "
                  + getHDFSBlockEndingPosition(currentMidPointHDFSBlockIndex - 1));
        }
      }

      while (rowGroupMetadata.getStartingPos() > getHDFSBlockEndingPosition(currentStartHdfsBlockIndex)) {
        currentStartHdfsBlockIndex++;
        if (currentStartHdfsBlockIndex >= hdfsBlocks.length) {
          // The index is now one past the last block, so report the end of the last block
          // (index - 1); indexing with the incremented value would be out of bounds.
          throw new ParquetDecodingException("The row group does not start in this file: row group offset is "
              + rowGroupMetadata.getStartingPos()
              + " but the end of hdfs blocks of file is "
              + getHDFSBlockEndingPosition(currentStartHdfsBlockIndex - 1));
        }
      }
      return isNewHdfsBlock;
    }

    /**
     * @return the block the start cursor currently points at
     */
    public BlockLocation getCurrentBlock() {
      return hdfsBlocks[currentStartHdfsBlockIndex];
    }
  }

  /**
   * Accumulates the row groups assigned to one split, together with the HDFS block the split
   * is associated with and the running sum of the row groups' compressed sizes.
   */
  static class SplitInfo {
    List rowGroups = new ArrayList();
    BlockLocation hdfsBlock;
    long compressedByteSize = 0L;

    public SplitInfo(BlockLocation currentBlock) {
      hdfsBlock = currentBlock;
    }

    /** Appends a row group and adds its compressed size to the running total. */
    private void addRowGroup(BlockMetaData rowGroup) {
      rowGroups.add(rowGroup);
      compressedByteSize = compressedByteSize + rowGroup.getCompressedSize();
    }

    public long getCompressedByteSize() {
      return compressedByteSize;
    }

    public List getRowGroups() {
      return rowGroups;
    }

    int getRowGroupCount() {
      return rowGroups.size();
    }

    /**
     * Builds the input split for the accumulated row groups.
     *
     * @param fileStatus          the file the row groups belong to
     * @param requestedSchema     the requested schema in its string form; only columns whose path
     *                            it contains count towards the split length
     * @param readSupportMetadata not read by this method
     * @return a split starting at the HDFS block offset and ending after the last row group
     * @throws IOException declared by the signature
     */
    public ParquetInputSplit getParquetInputSplit(
        FileStatus fileStatus, String requestedSchema, Map readSupportMetadata)
        throws IOException {
      MessageType requested = MessageTypeParser.parseMessageType(requestedSchema);

      // Sum the sizes of the column chunks that the requested schema actually selects.
      long length = 0;
      for (BlockMetaData block : this.getRowGroups()) {
        List columns = block.getColumns();
        for (ColumnChunkMetaData column : columns) {
          if (!requested.containsPath(column.getPath().toArray())) {
            continue;
          }
          length += column.getTotalSize();
        }
      }

      int rowGroupCount = this.getRowGroupCount();
      BlockMetaData lastRowGroup = this.getRowGroups().get(rowGroupCount - 1);
      long end = lastRowGroup.getStartingPos() + lastRowGroup.getTotalByteSize();

      long[] rowGroupOffsets = new long[this.getRowGroupCount()];
      for (int index = 0; index < rowGroupOffsets.length; index++) {
        BlockMetaData rowGroup = this.getRowGroups().get(index);
        rowGroupOffsets[index] = rowGroup.getStartingPos();
      }

      Path filePath = fileStatus.getPath();
      return new ParquetInputSplit(
          filePath, hdfsBlock.getOffset(), end, length, hdfsBlock.getHosts(), rowGroupOffsets);
    }
  }

  private static final Logger LOG = LoggerFactory.getLogger(ClientSideMetadataSplitStrategy.class);

  /**
   * Plans splits for the given footers: for each file the row groups are filtered with the
   * configured filter, and the surviving row groups are grouped into splits using the file's
   * block locations. Files with no surviving row group contribute no split.
   *
   * @param configuration the configuration used for the filter and the file systems
   * @param footers       the footers of the files to plan
   * @param maxSplitSize  upper bound handed to split generation
   * @param minSplitSize  lower bound handed to split generation
   * @param readContext   supplies the requested schema and the read support metadata
   * @return the planned splits
   * @throws IOException if file status or block locations cannot be obtained
   */
  List getSplits(
      Configuration configuration,
      List footers,
      long maxSplitSize,
      long minSplitSize,
      ReadContext readContext)
      throws IOException {
    List splits = new ArrayList();
    Filter filter = ParquetInputFormat.getFilter(configuration);

    long rowGroupsDropped = 0;
    long totalRowGroups = 0;

    for (Footer footer : footers) {
      final Path file = footer.getFile();
      LOG.debug("{}", file);
      FileSystem fs = file.getFileSystem(configuration);
      FileStatus fileStatus = fs.getFileStatus(file);
      ParquetMetadata parquetMetaData = footer.getParquetMetadata();
      List blocks = parquetMetaData.getBlocks();
      totalRowGroups += blocks.size();

      List filteredBlocks = RowGroupFilter.filterRowGroups(
          filter, blocks, parquetMetaData.getFileMetaData().getSchema());
      rowGroupsDropped += blocks.size() - filteredBlocks.size();

      if (!filteredBlocks.isEmpty()) {
        BlockLocation[] fileBlockLocations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
        String requestedSchema = readContext.getRequestedSchema().toString();
        splits.addAll(generateSplits(
            filteredBlocks,
            fileBlockLocations,
            fileStatus,
            requestedSchema,
            readContext.getReadSupportMetadata(),
            minSplitSize,
            maxSplitSize));
      }
    }

    boolean anyDropped = rowGroupsDropped > 0 && totalRowGroups > 0;
    if (!anyDropped) {
      LOG.info("There were no row groups that could be dropped due to filter predicates");
      return splits;
    }
    int percentDropped = (int) ((((double) rowGroupsDropped) / totalRowGroups) * 100);
    LOG.info(
        "Dropping {} row groups that do not pass filter predicate! ({}%)",
        rowGroupsDropped, percentDropped);
    return splits;
  }

  /**
   * Groups the row groups of one file into splits and converts each group into a
   * {@code ParquetInputSplit}.
   *
   * @param rowGroupBlocks      data blocks (row groups)
   * @param hdfsBlocksArray     hdfs blocks
   * @param fileStatus          the containing file
   * @param requestedSchema     the schema requested by the user
   * @param readSupportMetadata the metadata provided by the readSupport implementation in init
   * @param minSplitSize        the mapred.min.split.size
   * @param maxSplitSize        the mapred.max.split.size
   * @return the splits, one per group produced by {@code generateSplitInfo}
   * @throws IOException If hosts can't be retrieved for the HDFS block
   */
  static  List generateSplits(
      List rowGroupBlocks,
      BlockLocation[] hdfsBlocksArray,
      FileStatus fileStatus,
      String requestedSchema,
      Map readSupportMetadata,
      long minSplitSize,
      long maxSplitSize)
      throws IOException {

    List groupedRowGroups = generateSplitInfo(rowGroupBlocks, hdfsBlocksArray, minSplitSize, maxSplitSize);

    // turn each group of row groups into an input split
    List resultSplits = new ArrayList();
    for (SplitInfo group : groupedRowGroups) {
      resultSplits.add(group.getParquetInputSplit(fileStatus, requestedSchema, readSupportMetadata));
    }
    return resultSplits;
  }

  /**
   * Assigns the row groups of one file to splits.
   * A new split is started before a row group when either the row group's mid-point has
   * entered a new HDFS block and the current split is non-empty and has reached
   * {@code minSplitSize}, or the current split has reached {@code maxSplitSize}.
   *
   * @param rowGroupBlocks  the row groups of the file; the first element is read
   *                        unconditionally, so the list must not be empty
   * @param hdfsBlocksArray the HDFS blocks of the file; sorted in place by offset
   * @param minSplitSize    minimum compressed size of a split before a block boundary may end it
   * @param maxSplitSize    compressed size at which a split is ended regardless of block boundaries
   * @return the split descriptions, in row group order
   * @throws ParquetDecodingException if a size bound is negative or max is smaller than min
   */
  static List generateSplitInfo(
      List rowGroupBlocks, BlockLocation[] hdfsBlocksArray, long minSplitSize, long maxSplitSize) {
    List splitRowGroups;

    if (maxSplitSize < minSplitSize || maxSplitSize < 0 || minSplitSize < 0) {
      throw new ParquetDecodingException(
          "maxSplitSize and minSplitSize should be positive and max should be greater or equal to the minSplitSize: maxSplitSize = "
              + maxSplitSize + "; minSplitSize is " + minSplitSize);
    }
    HDFSBlocks hdfsBlocks = new HDFSBlocks(hdfsBlocksArray);
    // Called for its side effect: positions the block cursors on the first row group so that
    // the first split is created with the block that row group starts in.
    hdfsBlocks.checkBelongingToANewHDFSBlock(rowGroupBlocks.get(0));
    SplitInfo currentSplit = new SplitInfo(hdfsBlocks.getCurrentBlock());

    // assign rowGroups to splits
    splitRowGroups = new ArrayList();
    checkSorted(rowGroupBlocks); // assert row groups are sorted
    for (BlockMetaData rowGroupMetadata : rowGroupBlocks) {
      // checkBelongingToANewHDFSBlock must run for every row group (it advances the cursors),
      // which is why it is the first operand of the condition.
      if ((hdfsBlocks.checkBelongingToANewHDFSBlock(rowGroupMetadata)
              && currentSplit.getCompressedByteSize() >= minSplitSize
              && currentSplit.getCompressedByteSize() > 0)
          || currentSplit.getCompressedByteSize() >= maxSplitSize) {
        // create a new split
        splitRowGroups.add(currentSplit); // finish previous split
        currentSplit = new SplitInfo(hdfsBlocks.getCurrentBlock());
      }
      currentSplit.addRowGroup(rowGroupMetadata);
    }

    // the split still being filled when the loop ends is emitted only if it holds row groups
    if (currentSplit.getRowGroupCount() > 0) {
      splitRowGroups.add(currentSplit);
    }

    return splitRowGroups;
  }

  /**
   * Verifies that the row groups are ordered by non-decreasing starting position.
   *
   * @param rowGroupBlocks the row groups to check
   * @throws ParquetDecodingException if a row group starts before its predecessor
   */
  private static void checkSorted(List<BlockMetaData> rowGroupBlocks) {
    long previousOffset = 0L;
    for (BlockMetaData rowGroup : rowGroupBlocks) {
      long currentOffset = rowGroup.getStartingPos();
      if (currentOffset < previousOffset) {
        throw new ParquetDecodingException("row groups are not sorted: previous row groups starts at "
            + previousOffset + ", current row group starts at " + currentOffset);
      }
      // Remember this offset; without it every row group is only compared against 0.
      previousOffset = currentOffset;
    }
  }
}

    

    

    
            
    
            

    
        
            
                Related Artifacts
                
                     mysql-connector-java mysql
 facebook-messenger com.github.codedrinker
 selenium-java org.seleniumhq.selenium
 instagram-java com.github.sola92
 gson com.google.code.gson
 poi org.apache.poi
 httpclient org.apache.httpcomponents
 json org.json
 facebook-java-api com.google.code.facebook-java-api
 poi-ooxml org.apache.poi
 jackson-databind com.fasterxml.jackson.core
 junit junit
 primefaces org.primefaces
 ojdbc7 com.github.noraui
 jfoenix com.jfoenix
 testng org.testng
 json-simple com.googlecode.json-simple
 selenium-server org.seleniumhq.selenium
 itextpdf com.itextpdf
 spring-core org.springframework
                
            
        
        
            
                Related Groups
                
                     org.springframework
 org.apache.poi
 org.hibernate
 org.springframework.boot
 com.fasterxml.jackson.core
 com.itextpdf
 org.seleniumhq.selenium
 mysql
 org.finos.legend.engine
 org.apache.httpcomponents
 org.apache.logging.log4j
 org.openjfx
 org.apache.commons
 org.json
 com.google.guava
 com.google.zxing
 net.sf.jasperreports
 javax.xml.bind
 ojdbc
 com.google.code.facebook-java-api