/*
 * Copyright (c) 2016 Uber Technologies, Inc. ([email protected])
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *          http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.uber.hoodie.common.util;

import com.uber.hoodie.avro.HoodieAvroWriteSupport;
import com.uber.hoodie.common.BloomFilter;
import com.uber.hoodie.common.model.HoodieRecord;

import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.exception.HoodieIOException;
import com.uber.hoodie.exception.MetadataNotFoundException;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.avro.AvroReadSupport;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import static com.uber.hoodie.common.util.FSUtils.getFs;

/**
 * Utility functions for working with parquet files.
 */
public class ParquetUtils {

    /**
     * Read the rowKey list from the given parquet file.
     *
     * @param filePath the parquet file path
     * @return the set of row keys read from the file
     */
    public static Set<String> readRowKeysFromParquet(Path filePath) {
        Configuration conf = new Configuration();
        conf.addResource(getFs().getConf());
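        // Project only the Hoodie record key metadata column, as both the Avro read
        // schema and the requested projection, so the reader skips all other columns.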
        Schema readSchema = HoodieAvroUtils.getRecordKeySchema();
        AvroReadSupport.setAvroReadSchema(conf, readSchema);
        AvroReadSupport.setRequestedProjection(conf, readSchema);
        ParquetReader<Object> reader = null;
        Set<String> rowKeys = new HashSet<>();
        try {
            reader = AvroParquetReader.builder(filePath).withConf(conf).build();
            Object obj = reader.read();
            while (obj != null) {
                if (obj instanceof GenericRecord) {
                    rowKeys.add(((GenericRecord) obj).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString());
                }
                obj = reader.read();
            }
        } catch (IOException e) {
            throw new HoodieIOException("Failed to read row keys from Parquet " + filePath, e);

        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    // ignore
                }
            }
        }
        return rowKeys;
    }


    /**
     * Read the footer metadata from a parquet file, using the default filesystem configuration.
     *
     * @param parquetFilePath path to the parquet file
     * @return the parquet footer metadata
     */
    public static ParquetMetadata readMetadata(Path parquetFilePath) {
        return readMetadata(getFs().getConf(), parquetFilePath);
    }

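    /**
     * Read the footer metadata from a parquet file using the supplied configuration.
     *
     * @param conf            the Hadoop configuration to read with
     * @param parquetFilePath path to the parquet file
     * @return the parquet footer metadata
     */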
    public static ParquetMetadata readMetadata(Configuration conf, Path parquetFilePath) {
        ParquetMetadata footer;
        try {
            // TODO(vc): Should we use the parallel reading version here?
            footer = ParquetFileReader.readFooter(conf, parquetFilePath);
        } catch (IOException e) {
            throw new HoodieIOException("Failed to read footer for parquet " + parquetFilePath, e);
        }
        return footer;
    }


    /**
     * Get the schema of the given parquet file.
     *
     * @param parquetFilePath path to the parquet file
     * @return the parquet schema as a {@link MessageType}
     */
    public static MessageType readSchema(Path parquetFilePath) {
        return readMetadata(parquetFilePath).getFileMetaData().getSchema();
    }


    private static List<String> readParquetFooter(Path parquetFilePath, String... footerNames) {
        List<String> footerVals = new ArrayList<>();
        ParquetMetadata footer = readMetadata(parquetFilePath);
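        // HoodieAvroWriteSupport records entries such as the bloom filter and the
        // min/max record keys in the footer's key/value metadata at write time.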
        Map<String, String> metadata = footer.getFileMetaData().getKeyValueMetaData();
        for (String footerName : footerNames) {
            if (metadata.containsKey(footerName)) {
                footerVals.add(metadata.get(footerName));
            } else {
                throw new MetadataNotFoundException("Could not find footer entry in Parquet file. " +
                        "Looked for key " + footerName + " in " + parquetFilePath);
            }
        }
        return footerVals;
    }

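    /**
     * Read the Avro schema of the given parquet file, converted from its parquet schema.
     */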
    public static Schema readAvroSchema(Path parquetFilePath) {
        return new AvroSchemaConverter().convert(readSchema(parquetFilePath));
    }

    /**
     * Read out the bloom filter from the parquet file meta data.
     */
    public static BloomFilter readBloomFilterFromParquetMetadata(Path parquetFilePath) {
        String footerVal = readParquetFooter(parquetFilePath,
                HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY).get(0);
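        // The bloom filter is stored in the footer in its serialized string form;
        // the BloomFilter constructor rehydrates it from that representation.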
        return new BloomFilter(footerVal);
    }

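    /**
     * Read the min and max record keys recorded in the parquet footer.
     */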
    public static String[] readMinMaxRecordKeys(Path parquetFilePath) {
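        // The min/max record keys are written once per file and allow key-range checks
        // (e.g. during index lookups) without scanning the file's contents.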
        List<String> minMaxKeys = readParquetFooter(parquetFilePath, HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER,
                HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER);
        if (minMaxKeys.size() != 2) {
            throw new HoodieException(String.format(
                    "Could not read min/max record key out of footer correctly from %s. Read: %s",
                    parquetFilePath, minMaxKeys));
        }
        }
        return new String[]{minMaxKeys.get(0), minMaxKeys.get(1)};
    }

    /**
     * Read all Avro records out of the given parquet file.
     *
     * NOTE: This reads the entire file contents into memory, so it should be used with caution.
     *
     * @param filePath path to the parquet file
     * @return the list of records read
     */
    public static List<GenericRecord> readAvroRecords(Path filePath) {
        ParquetReader<Object> reader = null;
        List<GenericRecord> records = new ArrayList<>();
        try {
            reader = AvroParquetReader.builder(filePath).build();
            Object obj = reader.read();
            while (obj != null) {
                if (obj instanceof GenericRecord) {
                    records.add((GenericRecord) obj);
                }
                obj = reader.read();
            }
        } catch (IOException e) {
            throw new HoodieIOException("Failed to read avro records from Parquet " + filePath, e);

        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    // ignore
                }
            }
        }
        return records;
    }
}
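
/*
 * Usage sketch (illustrative, not part of the original class): how a caller might
 * combine these utilities on a single parquet file. The path below is hypothetical.
 *
 *   Path path = new Path("/some/partition/path/some-file.parquet");
 *   Set<String> rowKeys = ParquetUtils.readRowKeysFromParquet(path);
 *   Schema avroSchema = ParquetUtils.readAvroSchema(path);
 *   BloomFilter filter = ParquetUtils.readBloomFilterFromParquetMetadata(path);
 *   String[] minMaxKeys = ParquetUtils.readMinMaxRecordKeys(path);
 */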