All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.parquet.avro.AvroParquetWriter Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.avro;

import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.specific.SpecificData;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.ParquetProperties.WriterVersion;
import org.apache.parquet.conf.HadoopParquetConfiguration;
import org.apache.parquet.conf.ParquetConfiguration;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.api.WriteSupport;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.io.OutputFile;

/**
 * Write Avro records to a Parquet file.
 */
public class AvroParquetWriter extends ParquetWriter {

  /**
   * @param file a file path
   * @param   the Java type of records to read from the file
   * @return an Avro reader builder
   * @deprecated will be removed in 2.0.0; use {@link #builder(OutputFile)} instead.
   */
  @Deprecated
  public static  Builder builder(Path file) {
    return new Builder(file);
  }

  public static  Builder builder(OutputFile file) {
    return new Builder(file);
  }

  /**
   * Create a new {@link AvroParquetWriter}.
   *
   * @param file                 a file path
   * @param avroSchema           a schema for the write
   * @param compressionCodecName compression codec
   * @param blockSize            target block size
   * @param pageSize             target page size
   * @throws IOException if there is an error while writing
   */
  @Deprecated
  public AvroParquetWriter(
      Path file, Schema avroSchema, CompressionCodecName compressionCodecName, int blockSize, int pageSize)
      throws IOException {
    super(
        file,
        AvroParquetWriter.writeSupport(avroSchema, SpecificData.get()),
        compressionCodecName,
        blockSize,
        pageSize);
  }

  /**
   * Create a new {@link AvroParquetWriter}.
   *
   * @param file                 The file name to write to.
   * @param avroSchema           The schema to write with.
   * @param compressionCodecName Compression code to use, or CompressionCodecName.UNCOMPRESSED
   * @param blockSize            the block size threshold.
   * @param pageSize             See parquet write up. Blocks are subdivided into pages for alignment and other purposes.
   * @param enableDictionary     Whether to use a dictionary to compress columns.
   * @throws IOException if there is an error while writing
   */
  @Deprecated
  public AvroParquetWriter(
      Path file,
      Schema avroSchema,
      CompressionCodecName compressionCodecName,
      int blockSize,
      int pageSize,
      boolean enableDictionary)
      throws IOException {
    super(
        file,
        AvroParquetWriter.writeSupport(avroSchema, SpecificData.get()),
        compressionCodecName,
        blockSize,
        pageSize,
        enableDictionary,
        DEFAULT_IS_VALIDATING_ENABLED);
  }

  /**
   * Create a new {@link AvroParquetWriter}. The default block size is 128 MB. The default
   * page size is 1 MB. Default compression is no compression. (Inherited from {@link ParquetWriter})
   *
   * @param file       The file name to write to.
   * @param avroSchema The schema to write with.
   * @throws IOException if there is an error while writing
   */
  @Deprecated
  public AvroParquetWriter(Path file, Schema avroSchema) throws IOException {
    this(file, avroSchema, CompressionCodecName.UNCOMPRESSED, DEFAULT_BLOCK_SIZE, DEFAULT_PAGE_SIZE);
  }

  /**
   * Create a new {@link AvroParquetWriter}.
   *
   * @param file                 The file name to write to.
   * @param avroSchema           The schema to write with.
   * @param compressionCodecName Compression code to use, or CompressionCodecName.UNCOMPRESSED
   * @param blockSize            the block size threshold.
   * @param pageSize             See parquet write up. Blocks are subdivided into pages for alignment and other purposes.
   * @param enableDictionary     Whether to use a dictionary to compress columns.
   * @param conf                 The Configuration to use.
   * @throws IOException if there is an error while writing
   */
  @Deprecated
  public AvroParquetWriter(
      Path file,
      Schema avroSchema,
      CompressionCodecName compressionCodecName,
      int blockSize,
      int pageSize,
      boolean enableDictionary,
      Configuration conf)
      throws IOException {
    this(
        file,
        AvroParquetWriter.writeSupport(conf, avroSchema, SpecificData.get()),
        compressionCodecName,
        blockSize,
        pageSize,
        enableDictionary,
        DEFAULT_IS_VALIDATING_ENABLED,
        DEFAULT_WRITER_VERSION,
        conf);
  }

  /**
   * Create a new {@link AvroParquetWriter}.
   *
   * @param file                 The file name to write to.
   * @param writeSupport         The schema to write with.
   * @param compressionCodecName Compression code to use, or CompressionCodecName.UNCOMPRESSED
   * @param blockSize            the block size threshold.
   * @param pageSize             See parquet write up. Blocks are subdivided into pages for alignment and other purposes.
   * @param enableDictionary     Whether to use a dictionary to compress columns.
   * @param conf                 The Configuration to use.
   * @throws IOException
   */
  AvroParquetWriter(
      Path file,
      WriteSupport writeSupport,
      CompressionCodecName compressionCodecName,
      int blockSize,
      int pageSize,
      boolean enableDictionary,
      boolean enableValidation,
      WriterVersion writerVersion,
      Configuration conf)
      throws IOException {
    super(
        file,
        writeSupport,
        compressionCodecName,
        blockSize,
        pageSize,
        pageSize,
        enableDictionary,
        enableValidation,
        writerVersion,
        conf);
  }

  private static  WriteSupport writeSupport(Schema avroSchema, GenericData model) {
    return new AvroWriteSupport(new AvroSchemaConverter().convert(avroSchema), avroSchema, model);
  }

  private static  WriteSupport writeSupport(Configuration conf, Schema avroSchema, GenericData model) {
    return writeSupport(new HadoopParquetConfiguration(conf), avroSchema, model);
  }

  private static  WriteSupport writeSupport(ParquetConfiguration conf, Schema avroSchema, GenericData model) {
    return new AvroWriteSupport(new AvroSchemaConverter(conf).convert(avroSchema), avroSchema, model);
  }

  public static class Builder extends ParquetWriter.Builder> {
    private Schema schema = null;
    private GenericData model = null;

    private Builder(Path file) {
      super(file);
    }

    private Builder(OutputFile file) {
      super(file);
    }

    public Builder withSchema(Schema schema) {
      this.schema = schema;
      return this;
    }

    public Builder withDataModel(GenericData model) {
      this.model = model;
      return this;
    }

    @Override
    protected Builder self() {
      return this;
    }

    @Override
    protected WriteSupport getWriteSupport(Configuration conf) {
      return AvroParquetWriter.writeSupport(conf, schema, model);
    }

    @Override
    protected WriteSupport getWriteSupport(ParquetConfiguration conf) {
      return AvroParquetWriter.writeSupport(conf, schema, model);
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy