All Downloads are FREE. Search and download functionality uses the official Maven repository.

org.apache.parquet.avro.AvroParquetWriter Maven / Gradle / Ivy

The newest version!
/* 
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.avro;

import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.specific.SpecificData;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.ParquetProperties.WriterVersion;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.api.WriteSupport;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.io.OutputFile;

/**
 * Write Avro records to a Parquet file.
 */
public class AvroParquetWriter extends ParquetWriter {

  public static  Builder builder(Path file) {
    return new Builder(file);
  }

  public static  Builder builder(OutputFile file) {
    return new Builder(file);
  }

  /** Create a new {@link AvroParquetWriter}.
   *
   * @param file a file path
   * @param avroSchema a schema for the write
   * @param compressionCodecName compression codec
   * @param blockSize target block size
   * @param pageSize target page size
   * @throws IOException if there is an error while writing
   */
  @Deprecated
  public AvroParquetWriter(Path file, Schema avroSchema,
      CompressionCodecName compressionCodecName, int blockSize,
      int pageSize) throws IOException {
    super(file, AvroParquetWriter.writeSupport(avroSchema, SpecificData.get()),
	      compressionCodecName, blockSize, pageSize);
  }

  /** Create a new {@link AvroParquetWriter}.
   *
   * @param file The file name to write to.
   * @param avroSchema The schema to write with.
   * @param compressionCodecName Compression code to use, or CompressionCodecName.UNCOMPRESSED
   * @param blockSize the block size threshold.
   * @param pageSize See parquet write up. Blocks are subdivided into pages for alignment and other purposes.
   * @param enableDictionary Whether to use a dictionary to compress columns.
   * @throws IOException if there is an error while writing
   */
  @Deprecated
  public AvroParquetWriter(Path file, Schema avroSchema,
                           CompressionCodecName compressionCodecName, int blockSize,
                           int pageSize, boolean enableDictionary) throws IOException {
    super(file, AvroParquetWriter.writeSupport(avroSchema, SpecificData.get()),
        compressionCodecName, blockSize, pageSize, enableDictionary,
        DEFAULT_IS_VALIDATING_ENABLED);
  }

  /** Create a new {@link AvroParquetWriter}. The default block size is 50 MB.The default
   *  page size is 1 MB.  Default compression is no compression. (Inherited from {@link ParquetWriter})
   *
   * @param file The file name to write to.
   * @param avroSchema The schema to write with.
   * @throws IOException if there is an error while writing
   */
  @Deprecated
  public AvroParquetWriter(Path file, Schema avroSchema) throws IOException {
    this(file, avroSchema, CompressionCodecName.UNCOMPRESSED,
        DEFAULT_BLOCK_SIZE, DEFAULT_PAGE_SIZE);
  }

  /** Create a new {@link AvroParquetWriter}.
   *
   * @param file The file name to write to.
   * @param avroSchema The schema to write with.
   * @param compressionCodecName Compression code to use, or CompressionCodecName.UNCOMPRESSED
   * @param blockSize the block size threshold.
   * @param pageSize See parquet write up. Blocks are subdivided into pages for alignment and other purposes.
   * @param enableDictionary Whether to use a dictionary to compress columns.
   * @param conf The Configuration to use.
   * @throws IOException if there is an error while writing
   */
  @Deprecated
  public AvroParquetWriter(Path file, Schema avroSchema,
                           CompressionCodecName compressionCodecName,
                           int blockSize, int pageSize, boolean enableDictionary,
                           Configuration conf) throws IOException {
    this(file,
        AvroParquetWriter.writeSupport(conf, avroSchema, SpecificData.get()),
        compressionCodecName, blockSize, pageSize,
        enableDictionary, DEFAULT_IS_VALIDATING_ENABLED, DEFAULT_WRITER_VERSION,
        conf);
  }

  /**
   * Create a new {@link AvroParquetWriter}.
   *
   * @param file The file name to write to.
   * @param writeSupport The schema to write with.
   * @param compressionCodecName Compression code to use, or CompressionCodecName.UNCOMPRESSED
   * @param blockSize the block size threshold.
   * @param pageSize See parquet write up. Blocks are subdivided into pages for alignment and other purposes.
   * @param enableDictionary Whether to use a dictionary to compress columns.
   * @param conf The Configuration to use.
   * @throws IOException
   */
  AvroParquetWriter(Path file, WriteSupport writeSupport,
                           CompressionCodecName compressionCodecName,
                           int blockSize, int pageSize, boolean enableDictionary,
                           boolean enableValidation, WriterVersion writerVersion,
                           Configuration conf)
      throws IOException {
    super(file, writeSupport, compressionCodecName, blockSize, pageSize,
        pageSize, enableDictionary, enableValidation, writerVersion, conf);
  }

  private static  WriteSupport writeSupport(Schema avroSchema,
                                                  GenericData model) {
    return new AvroWriteSupport(
        new AvroSchemaConverter().convert(avroSchema), avroSchema, model);
  }

  private static  WriteSupport writeSupport(Configuration conf,
                                                  Schema avroSchema,
                                                  GenericData model) {
    return new AvroWriteSupport(
        new AvroSchemaConverter(conf).convert(avroSchema), avroSchema, model);
  }

  public static class Builder extends ParquetWriter.Builder> {
    private Schema schema = null;
    private GenericData model = SpecificData.get();

    private Builder(Path file) {
      super(file);
    }

    private Builder(OutputFile file) {
      super(file);
    }

    public Builder withSchema(Schema schema) {
      this.schema = schema;
      return this;
    }

    public Builder withDataModel(GenericData model) {
      this.model = model;
      return this;
    }

    @Override
    protected Builder self() {
      return this;
    }

    @Override
    protected WriteSupport getWriteSupport(Configuration conf) {
      return AvroParquetWriter.writeSupport(conf, schema, model);
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy