
parquet.hadoop.ParquetFileReader Maven / Gradle / Ivy

There is a newer version: 1.6.0
/**
 * Copyright 2012 Twitter, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package parquet.hadoop;

import static parquet.Log.DEBUG;
import static parquet.bytes.BytesUtils.readIntLittleEndian;
import static parquet.format.Util.readPageHeader;
import static parquet.hadoop.ParquetFileWriter.MAGIC;
import static parquet.hadoop.ParquetFileWriter.PARQUET_METADATA_FILE;

import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.Utils;

import parquet.Log;
import parquet.bytes.BytesInput;
import parquet.column.ColumnDescriptor;
import parquet.column.page.DictionaryPage;
import parquet.column.page.Page;
import parquet.column.page.PageReadStore;
import parquet.format.PageHeader;
import parquet.format.converter.ParquetMetadataConverter;
import parquet.hadoop.CodecFactory.BytesDecompressor;
import parquet.hadoop.ColumnChunkPageReadStore.ColumnChunkPageReader;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.ColumnChunkMetaData;
import parquet.hadoop.metadata.ColumnPath;
import parquet.hadoop.metadata.ParquetMetadata;
import parquet.hadoop.util.counters.BenchmarkCounter;
import parquet.io.ParquetDecodingException;

/**
 * Internal implementation of the Parquet file reader as a block container
 *
 * @author Julien Le Dem
 *
 */
public class ParquetFileReader implements Closeable {

  private static final Log LOG = Log.getLog(ParquetFileReader.class);

  private static ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();

  /**
   * For each of the part files provided, check whether a summary file exists in its parent directory.
   * If a summary file is found it is used; otherwise the file footer is read.
   * @param configuration the Hadoop conf used to connect to the file system
   * @param partFiles the part files to read
   * @return the footers for those files, using the summary file when possible
   * @throws IOException if the footers can not be read
   */
  public static List<Footer> readAllFootersInParallelUsingSummaryFiles(final Configuration configuration, List<FileStatus> partFiles) throws IOException {
    // figure out the list of all parent directories of the part files
    Set<Path> parents = new HashSet<Path>();
    for (FileStatus part : partFiles) {
      parents.add(part.getPath().getParent());
    }

    // read the corresponding summary files if they exist
    Map<Path, Footer> cache = new HashMap<Path, Footer>();
    for (Path path : parents) {
      FileSystem fileSystem = path.getFileSystem(configuration);
      Path summaryFile = new Path(path, PARQUET_METADATA_FILE);
      if (fileSystem.exists(summaryFile)) {
        if (Log.INFO) LOG.info("reading summary file: " + summaryFile);
        List<Footer> footers = readSummaryFile(configuration, fileSystem.getFileStatus(summaryFile));
        for (Footer footer : footers) {
          // the folder may have been moved
          footer = new Footer(new Path(path, footer.getFile().getName()), footer.getParquetMetadata());
          cache.put(footer.getFile(), footer);
        }
      }
    }

    // keep only footers for files actually requested and read the file footer if not found in summaries
    List<Footer> result = new ArrayList<Footer>(partFiles.size());
    List<FileStatus> toRead = new ArrayList<FileStatus>();
    for (FileStatus part : partFiles) {
      Footer f = cache.get(part.getPath());
      if (f != null) {
        result.add(f);
      } else {
        toRead.add(part);
      }
    }

    if (toRead.size() > 0) {
      // read the footers of the files that did not have a summary file
      if (Log.INFO) LOG.info("reading another " + toRead.size() + " footers");
      result.addAll(readAllFootersInParallel(configuration, toRead));
    }

    return result;
  }

  public static List<Footer> readAllFootersInParallel(final Configuration configuration, List<FileStatus> partFiles) throws IOException {
    ExecutorService threadPool = Executors.newFixedThreadPool(5);
    try {
      List<Future<Footer>> footers = new ArrayList<Future<Footer>>();
      for (final FileStatus currentFile : partFiles) {
        footers.add(threadPool.submit(new Callable<Footer>() {
          @Override
          public Footer call() throws Exception {
            try {
              FileSystem fs = currentFile.getPath().getFileSystem(configuration);
              return readFooter(fs, currentFile);
            } catch (IOException e) {
              throw new IOException("Could not read footer for file " + currentFile, e);
            }
          }

          private Footer readFooter(final FileSystem fs, final FileStatus currentFile) throws IOException {
            ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(configuration, currentFile);
            return new Footer(currentFile.getPath(), parquetMetadata);
          }
        }));
      }
      List<Footer> result = new ArrayList<Footer>(footers.size());
      for (Future<Footer> futureFooter : footers) {
        try {
          result.add(futureFooter.get());
        } catch (InterruptedException e) {
          Thread.interrupted();
          throw new RuntimeException("The thread was interrupted", e);
        } catch (ExecutionException e) {
          throw new IOException("Could not read footer: " + e.getMessage(), e.getCause());
        }
      }
      return result;
    } finally {
      threadPool.shutdownNow();
    }
  }

  public static List<Footer> readAllFootersInParallel(Configuration configuration, FileStatus fileStatus) throws IOException {
    final FileSystem fs = fileStatus.getPath().getFileSystem(configuration);
    List<FileStatus> statuses;
    if (fileStatus.isDir()) {
      statuses = Arrays.asList(fs.listStatus(fileStatus.getPath(), new Utils.OutputFileUtils.OutputFilesFilter()));
    } else {
      statuses = new ArrayList<FileStatus>();
      statuses.add(fileStatus);
    }
    return readAllFootersInParallel(configuration, statuses);
  }

  public static List<Footer> readFooters(Configuration configuration, FileStatus pathStatus) throws IOException {
    try {
      if (pathStatus.isDir()) {
        Path summaryPath = new Path(pathStatus.getPath(), PARQUET_METADATA_FILE);
        FileSystem fs = summaryPath.getFileSystem(configuration);
        if (fs.exists(summaryPath)) {
          FileStatus summaryStatus = fs.getFileStatus(summaryPath);
          return readSummaryFile(configuration, summaryStatus);
        }
      }
    } catch (IOException e) {
      LOG.warn("can not read summary file for " + pathStatus.getPath(), e);
    }
    return readAllFootersInParallel(configuration, pathStatus);
  }

  public static List<Footer> readSummaryFile(Configuration configuration, FileStatus summaryStatus) throws IOException {
    final Path parent = summaryStatus.getPath().getParent();
    ParquetMetadata mergedFooters = readFooter(configuration, summaryStatus);
    Map<Path, ParquetMetadata> footers = new HashMap<Path, ParquetMetadata>();
    List<BlockMetaData> blocks = mergedFooters.getBlocks();
    for (BlockMetaData block : blocks) {
      String path = block.getPath();
      Path fullPath = new Path(parent, path);
      ParquetMetadata current = footers.get(fullPath);
      if (current == null) {
        current = new ParquetMetadata(mergedFooters.getFileMetaData(), new ArrayList<BlockMetaData>());
        footers.put(fullPath, current);
      }
      current.getBlocks().add(block);
    }
    List<Footer> result = new ArrayList<Footer>();
    for (Entry<Path, ParquetMetadata> entry : footers.entrySet()) {
      result.add(new Footer(entry.getKey(), entry.getValue()));
    }
    return result;
  }

  /**
   * Reads the metadata block in the footer of the file
   * @param configuration the Hadoop configuration
   * @param file the Parquet file
   * @return the metadata blocks in the footer
   * @throws IOException if an error occurs while reading the file
   */
  public static final ParquetMetadata readFooter(Configuration configuration, Path file) throws IOException {
    FileSystem fileSystem = file.getFileSystem(configuration);
    return readFooter(configuration, fileSystem.getFileStatus(file));
  }

  public static final List<Footer> readFooters(Configuration configuration, Path file) throws IOException {
    FileSystem fileSystem = file.getFileSystem(configuration);
    return readFooters(configuration, fileSystem.getFileStatus(file));
  }

  /**
   * Reads the metadata block in the footer of the file
   * @param configuration the Hadoop configuration
   * @param file the Parquet file
   * @return the metadata blocks in the footer
   * @throws IOException if an error occurs while reading the file
   */
  public static final ParquetMetadata readFooter(Configuration configuration, FileStatus file) throws IOException {
    FileSystem fileSystem = file.getPath().getFileSystem(configuration);
    FSDataInputStream f = fileSystem.open(file.getPath());
    try {
      long l = file.getLen();
      if (Log.DEBUG) LOG.debug("File length " + l);
      int FOOTER_LENGTH_SIZE = 4;
      if (l < MAGIC.length + FOOTER_LENGTH_SIZE + MAGIC.length) { // MAGIC + data + footer + footerIndex + MAGIC
        throw new RuntimeException(file.getPath() + " is not a Parquet file (too small)");
      }
      long footerLengthIndex = l - FOOTER_LENGTH_SIZE - MAGIC.length;
      if (Log.DEBUG) LOG.debug("reading footer index at " + footerLengthIndex);

      f.seek(footerLengthIndex);
      int footerLength = readIntLittleEndian(f);
      byte[] magic = new byte[MAGIC.length];
      f.readFully(magic);
      if (!Arrays.equals(MAGIC, magic)) {
        throw new RuntimeException(file.getPath() + " is not a Parquet file. expected magic number at tail " + Arrays.toString(MAGIC) + " but found " + Arrays.toString(magic));
      }
      long footerIndex = footerLengthIndex - footerLength;
      if (Log.DEBUG) LOG.debug("read footer length: " + footerLength + ", footer index: " + footerIndex);
      if (footerIndex < MAGIC.length || footerIndex >= footerLengthIndex) {
        throw new RuntimeException("corrupted file: the footer index is not within the file");
      }
      f.seek(footerIndex);
      return parquetMetadataConverter.readParquetMetadata(f);
    } finally {
      f.close();
    }
  }

  private CodecFactory codecFactory;

  private final List<BlockMetaData> blocks;
  private final FSDataInputStream f;
  private final Path filePath;
  private int currentBlock = 0;
  private Map<ColumnPath, ColumnDescriptor> paths = new HashMap<ColumnPath, ColumnDescriptor>();

  /**
   * @param configuration the Hadoop configuration
   * @param filePath the Parquet file to read
   * @param blocks the blocks (row groups) to read
   * @param columns the columns to read (identified by their path)
   * @throws IOException if the file can not be opened
   */
  public ParquetFileReader(Configuration configuration, Path filePath, List<BlockMetaData> blocks, List<ColumnDescriptor> columns) throws IOException {
    this.filePath = filePath;
    FileSystem fs = filePath.getFileSystem(configuration);
    this.f = fs.open(filePath);
    this.blocks = blocks;
    for (ColumnDescriptor col : columns) {
      paths.put(ColumnPath.get(col.getPath()), col);
    }
    this.codecFactory = new CodecFactory(configuration);
  }

  /**
   * Reads all the columns requested from the row group at the current file position.
   * @throws IOException if an error occurs while reading
   * @return the PageReadStore which can provide PageReaders for each column
   */
  public PageReadStore readNextRowGroup() throws IOException {
    if (currentBlock == blocks.size()) {
      return null;
    }
    BlockMetaData block = blocks.get(currentBlock);
    if (block.getRowCount() == 0) {
      throw new RuntimeException("Illegal row group of 0 rows");
    }
    ColumnChunkPageReadStore columnChunkPageReadStore = new ColumnChunkPageReadStore(block.getRowCount());
    for (ColumnChunkMetaData mc : block.getColumns()) {
      ColumnPath pathKey = mc.getPath();
      BenchmarkCounter.incrementTotalBytes(mc.getTotalSize());
      ColumnDescriptor columnDescriptor = paths.get(pathKey);
      if (columnDescriptor != null) {
        BenchmarkCounter.incrementBytesRead(mc.getTotalSize());
        List<Page> pagesInChunk = new ArrayList<Page>();
        List<DictionaryPage> dictionaryPagesInChunk = new ArrayList<DictionaryPage>();
        readColumnChunkPages(columnDescriptor, mc, pagesInChunk, dictionaryPagesInChunk);
        if (dictionaryPagesInChunk.size() > 1) {
          throw new ParquetDecodingException("more than one dictionary page: " + dictionaryPagesInChunk);
        }
        BytesDecompressor decompressor = codecFactory.getDecompressor(mc.getCodec());
        ColumnChunkPageReader columnChunkPageReader = new ColumnChunkPageReader(decompressor, pagesInChunk, dictionaryPagesInChunk.size() == 0 ? null : dictionaryPagesInChunk.get(0));
        columnChunkPageReadStore.addColumn(columnDescriptor, columnChunkPageReader);
      }
    }
    ++currentBlock;
    return columnChunkPageReadStore;
  }

  /**
   * Reads all of the pages in a given column chunk into the provided lists.
   * @param columnDescriptor the descriptor of the column to read
   * @param metadata the metadata of the column chunk
   * @param pagesInChunk the list to collect data pages into
   * @param dictionaryPagesInChunk the list to collect dictionary pages into
   * @throws IOException if an error occurs while reading the pages
   */
  private void readColumnChunkPages(ColumnDescriptor columnDescriptor, ColumnChunkMetaData metadata, List<Page> pagesInChunk, List<DictionaryPage> dictionaryPagesInChunk)
      throws IOException {
    long startingPos = metadata.getFirstDataPageOffset();
    if (metadata.getDictionaryPageOffset() > 0 && metadata.getDictionaryPageOffset() < startingPos) {
      // if there's a dictionary and it's before the first data page, start from there
      startingPos = metadata.getDictionaryPageOffset();
    }
    f.seek(startingPos);
    if (DEBUG) {
      LOG.debug(f.getPos() + ": start column chunk " + metadata.getPath() + " " + metadata.getType() + " count=" + metadata.getValueCount());
    }
    long valuesCountReadSoFar = 0;
    while (valuesCountReadSoFar < metadata.getValueCount()) {
      PageHeader pageHeader = readPageHeader(f);
      switch (pageHeader.type) {
        case DICTIONARY_PAGE:
          dictionaryPagesInChunk.add(
              new DictionaryPage(
                  BytesInput.copy(BytesInput.from(f, pageHeader.compressed_page_size)),
                  pageHeader.uncompressed_page_size,
                  pageHeader.dictionary_page_header.num_values,
                  parquetMetadataConverter.getEncoding(pageHeader.dictionary_page_header.encoding)
                  ));
          break;
        case DATA_PAGE:
          pagesInChunk.add(
              new Page(
                  BytesInput.copy(BytesInput.from(f, pageHeader.compressed_page_size)),
                  pageHeader.data_page_header.num_values,
                  pageHeader.uncompressed_page_size,
                  parquetMetadataConverter.getEncoding(pageHeader.data_page_header.repetition_level_encoding),
                  parquetMetadataConverter.getEncoding(pageHeader.data_page_header.definition_level_encoding),
                  parquetMetadataConverter.getEncoding(pageHeader.data_page_header.encoding)
                  ));
          valuesCountReadSoFar += pageHeader.data_page_header.num_values;
          break;
        default:
          if (DEBUG) LOG.debug("skipping page of type " + pageHeader.type + " of size " + pageHeader.compressed_page_size);
          f.skip(pageHeader.compressed_page_size);
          break;
      }
    }
    if (valuesCountReadSoFar != metadata.getValueCount()) {
      // Would be nice to have a CorruptParquetFileException or something as a subclass?
      throw new IOException(
          "Expected " + metadata.getValueCount() + " values in column chunk at " +
          filePath + " offset " + metadata.getFirstDataPageOffset() +
          " but got " + valuesCountReadSoFar + " values instead over " + pagesInChunk.size() +
          " pages ending at file offset " + f.getPos());
    }
  }

  @Override
  public void close() throws IOException {
    f.close();
    this.codecFactory.release();
  }
}
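
Below is a brief, illustrative usage sketch (separate from the class above): it reads the footer of a single Parquet file, constructs a reader over all of its row groups and columns, and iterates the row groups. The input path, the use of parquet.schema.MessageType.getColumns() to obtain the column descriptors, and the printed output are assumptions made for the example, not an API prescribed by this class.

// Illustrative sketch only. Assumes the Twitter-era parquet-hadoop and parquet-column
// artifacts (the same packages imported by the class above) are on the classpath.
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import parquet.column.ColumnDescriptor;
import parquet.column.page.PageReadStore;
import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.ParquetMetadata;
import parquet.schema.MessageType;

public class ReadAllRowGroups {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path file = new Path("hdfs:///tmp/example.parquet"); // hypothetical input file

    // Read the footer to obtain the row-group (block) metadata and the schema.
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file);
    MessageType schema = footer.getFileMetaData().getSchema();
    List<ColumnDescriptor> columns = schema.getColumns();

    // Read every row group for every column in the schema.
    ParquetFileReader reader = new ParquetFileReader(conf, file, footer.getBlocks(), columns);
    try {
      PageReadStore rowGroup;
      // readNextRowGroup() returns null once all blocks have been consumed.
      while ((rowGroup = reader.readNextRowGroup()) != null) {
        System.out.println("row group with " + rowGroup.getRowCount() + " rows");
      }
    } finally {
      reader.close();
    }
  }
}

The sketch only illustrates the order of calls the reader expects: read the footer, construct the reader with the blocks and columns to read, call readNextRowGroup() until it returns null, then close().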



