All downloads are free. Search and download functionality uses the official Maven repository.

parquet.hadoop.ParquetReader Maven / Gradle / Ivy

There is a newer version: 1.6.0
Show newest version
/**
 * Copyright 2012 Twitter, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package parquet.hadoop;

import static parquet.Preconditions.checkNotNull;

import java.io.Closeable;
import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import parquet.filter.UnboundRecordFilter;
import parquet.filter2.compat.FilterCompat;
import parquet.filter2.compat.FilterCompat.Filter;
import parquet.filter2.compat.RowGroupFilter;
import parquet.hadoop.api.InitContext;
import parquet.hadoop.api.ReadSupport;
import parquet.hadoop.api.ReadSupport.ReadContext;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.GlobalMetaData;
import parquet.schema.MessageType;

/**
 * Read records from a Parquet file.
 * TODO: too many constructors (https://issues.apache.org/jira/browse/PARQUET-39)
 */
public class ParquetReader<T> implements Closeable {

  private final ReadSupport<T> readSupport;
  private final Configuration conf;
  private final ReadContext readContext;
  private final Iterator<Footer> footersIterator;
  private final GlobalMetaData globalMetaData;
  private final Filter filter;

  // Reader for the footer currently being consumed; null before the first
  // read and after the last footer has been exhausted.
  private InternalParquetRecordReader<T> reader;

  /**
   * @param file the file to read
   * @param readSupport to materialize records
   * @throws IOException
   * @deprecated use {@link #builder(ReadSupport, Path)}
   */
  @Deprecated
  public ParquetReader(Path file, ReadSupport<T> readSupport) throws IOException {
    this(new Configuration(), file, readSupport, FilterCompat.NOOP);
  }

  /**
   * @param conf the configuration
   * @param file the file to read
   * @param readSupport to materialize records
   * @throws IOException
   * @deprecated use {@link #builder(ReadSupport, Path)}
   */
  @Deprecated
  public ParquetReader(Configuration conf, Path file, ReadSupport<T> readSupport) throws IOException {
    this(conf, file, readSupport, FilterCompat.NOOP);
  }

  /**
   * @param file the file to read
   * @param readSupport to materialize records
   * @param unboundRecordFilter the filter to use to filter records
   * @throws IOException
   * @deprecated use {@link #builder(ReadSupport, Path)}
   */
  @Deprecated
  public ParquetReader(Path file, ReadSupport<T> readSupport, UnboundRecordFilter unboundRecordFilter) throws IOException {
    this(new Configuration(), file, readSupport, FilterCompat.get(unboundRecordFilter));
  }

  /**
   * @param conf the configuration
   * @param file the file to read
   * @param readSupport to materialize records
   * @param unboundRecordFilter the filter to use to filter records
   * @throws IOException
   * @deprecated use {@link #builder(ReadSupport, Path)}
   */
  @Deprecated
  public ParquetReader(Configuration conf, Path file, ReadSupport<T> readSupport, UnboundRecordFilter unboundRecordFilter) throws IOException {
    this(conf, file, readSupport, FilterCompat.get(unboundRecordFilter));
  }

  /**
   * Lists the statuses under {@code file}, reads their footers, merges them
   * into a single global metadata object and initializes the read support
   * with the merged schema and key/value metadata.
   *
   * @param conf the configuration
   * @param file the path to read
   * @param readSupport to materialize records
   * @param filter the filter to apply; must not be null
   * @throws IOException if listing the path or reading the footers fails
   */
  private ParquetReader(Configuration conf,
                        Path file,
                        ReadSupport<T> readSupport,
                        Filter filter) throws IOException {
    this.readSupport = readSupport;
    this.filter = checkNotNull(filter, "filter");
    this.conf = conf;

    FileSystem fs = file.getFileSystem(conf);
    List<FileStatus> statuses = Arrays.asList(fs.listStatus(file));
    List<Footer> footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, statuses, false);
    this.footersIterator = footers.iterator();

    globalMetaData = ParquetFileWriter.getGlobalMetaData(footers);
    MessageType schema = globalMetaData.getSchema();
    Map<String, Set<String>> extraMetadata = globalMetaData.getKeyValueMetaData();
    readContext = readSupport.init(new InitContext(conf, extraMetadata, schema));
  }

  /**
   * Returns the next record, moving on to the next footer whenever the
   * current one has no more records.
   *
   * @return the next record or null if finished
   * @throws IOException if reading fails, or wrapping an
   *         {@link InterruptedException} raised by the underlying reader
   */
  public T read() throws IOException {
    try {
      // Loop instead of recursing so that a long run of footers without
      // records cannot grow the call stack.
      while (true) {
        if (reader != null && reader.nextKeyValue()) {
          return reader.getCurrentValue();
        }
        initReader();
        if (reader == null) {
          // No footer left to open: the input is exhausted.
          return null;
        }
      }
    } catch (InterruptedException e) {
      // Restore the interrupt status so callers can still observe it.
      Thread.currentThread().interrupt();
      throw new IOException(e);
    }
  }

  /**
   * Closes the current reader, if any, and opens a reader on the next footer.
   * Leaves {@code reader} null when there is no footer left.
   *
   * @throws IOException if closing or initializing a reader fails
   */
  private void initReader() throws IOException {
    if (reader != null) {
      reader.close();
      reader = null;
    }
    if (footersIterator.hasNext()) {
      Footer footer = footersIterator.next();

      List<BlockMetaData> blocks = footer.getParquetMetadata().getBlocks();
      List<BlockMetaData> filteredBlocks = RowGroupFilter.filterRowGroups(
          filter, blocks, footer.getParquetMetadata().getFileMetaData().getSchema());

      reader = new InternalParquetRecordReader<T>(readSupport, filter);
      reader.initialize(
          readContext.getRequestedSchema(),
          globalMetaData.getSchema(),
          footer.getParquetMetadata().getFileMetaData().getKeyValueMetaData(),
          readContext.getReadSupportMetadata(),
          footer.getFile(),
          filteredBlocks,
          conf);
    }
  }

  /**
   * Closes the reader that is currently open, if any.
   *
   * @throws IOException if closing the underlying reader fails
   */
  @Override
  public void close() throws IOException {
    if (reader != null) {
      reader.close();
    }
  }

  /**
   * Creates a builder for a reader on the given path.
   *
   * @param readSupport to materialize records; must not be null
   * @param path the path to read; must not be null
   * @param <T> the type of the materialized records
   * @return a new builder using a default {@link Configuration} and
   *         {@link FilterCompat#NOOP} as the filter
   */
  public static <T> Builder<T> builder(ReadSupport<T> readSupport, Path path) {
    return new Builder<T>(readSupport, path);
  }

  /**
   * Builder for {@link ParquetReader}; the configuration and the filter are optional.
   *
   * @param <T> the type of the materialized records
   */
  public static class Builder<T> {
    private final ReadSupport<T> readSupport;
    private final Path file;
    private Configuration conf;
    private Filter filter;

    private Builder(ReadSupport<T> readSupport, Path path) {
      this.readSupport = checkNotNull(readSupport, "readSupport");
      this.file = checkNotNull(path, "path");
      this.conf = new Configuration();
      this.filter = FilterCompat.NOOP;
    }

    /**
     * @param conf the configuration to use; must not be null
     * @return this builder
     */
    public Builder<T> withConf(Configuration conf) {
      this.conf = checkNotNull(conf, "conf");
      return this;
    }

    /**
     * @param filter the filter to apply; must not be null
     * @return this builder
     */
    public Builder<T> withFilter(Filter filter) {
      this.filter = checkNotNull(filter, "filter");
      return this;
    }

    /**
     * @return a reader configured from this builder
     * @throws IOException if the reader cannot be created
     */
    public ParquetReader<T> build() throws IOException {
      return new ParquetReader<T>(conf, file, readSupport, filter);
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy