
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.parquet.cli.commands;

import com.beust.jcommander.Parameter;
import com.beust.jcommander.Parameters;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.cli.BaseCommand;
import org.apache.parquet.cli.util.Codecs;
import org.apache.parquet.cli.util.Schemas;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.slf4j.Logger;
import java.io.Closeable;
import java.io.IOException;
import java.util.List;

import static org.apache.avro.generic.GenericData.Record;
import static org.apache.parquet.cli.util.Expressions.filterSchema;
import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_1_0;
import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_2_0;

@Parameters(commandDescription="Create a Parquet file from a data file")
public class ConvertCommand extends BaseCommand {

  public ConvertCommand(Logger console) {
    super(console);
  }

  @Parameter(description = "")
  List targets;

  @Parameter(
      names={"-o", "--output"},
      description="Output file path",
      required=true)
  String outputPath = null;

  @Parameter(names = {"-s", "--schema"},
      description = "The file containing the Avro schema.")
  String avroSchemaFile;

  @Parameter(
      names = {"-c", "--column", "--columns"},
      description = "List of columns")
  List<String> columns;

  @Parameter(names = {"--compression-codec"},
      description = "A compression codec name.")
  String compressionCodecName = "GZIP";

  @Parameter(
      names={"--overwrite"},
      description="Overwrite the output file if it exists")
  boolean overwrite = false;

  @Parameter(
      names={"-2", "--format-version-2", "--writer-version-2"},
      description="Use Parquet format version 2",
      hidden = true)
  boolean v2 = false;

  @Parameter(names="--row-group-size", description="Target row group size")
  int rowGroupSize = ParquetWriter.DEFAULT_BLOCK_SIZE;

  @Parameter(names="--page-size", description="Target page size")
  int pageSize = ParquetWriter.DEFAULT_PAGE_SIZE;

  @Parameter(names="--dictionary-size", description="Max dictionary page size")
  int dictionaryPageSize = ParquetWriter.DEFAULT_PAGE_SIZE;

  @Override
  @SuppressWarnings("unchecked")
  public int run() throws IOException {
    Preconditions.checkArgument(targets != null && targets.size() == 1,
        "A data file is required.");

    String source = targets.get(0);

    CompressionCodecName codec = Codecs.parquetCodec(compressionCodecName);

    // Use the schema file if one was provided; otherwise derive the schema
    // from the source data file itself.
    Schema schema;
    if (avroSchemaFile != null) {
      schema = Schemas.fromAvsc(open(avroSchemaFile));
    } else {
      schema = getAvroSchema(source);
    }
    // Narrow the schema to the requested columns, if any were given.
    Schema projection = filterSchema(schema, columns);

    Path outPath = qualifiedPath(outputPath);
    FileSystem outFS = outPath.getFileSystem(getConf());
    if (overwrite && outFS.exists(outPath)) {
      console.debug("Deleting output file {} (already exists)", outPath);
      outFS.delete(outPath, true /* recursive */);
    }

    // Open the source as an iterable of Avro records, reading only the projected columns.
    Iterable<Record> reader = openDataFile(source, projection);
    boolean threw = true;
    long count = 0;
    try {
      try (ParquetWriter<Record> writer = AvroParquetWriter
          .<Record>builder(outPath)
          .withWriterVersion(v2 ? PARQUET_2_0 : PARQUET_1_0)
          .withConf(getConf())
          .withCompressionCodec(codec)
          .withRowGroupSize(rowGroupSize)
          // A dictionary page size of 0 disables dictionary encoding entirely;
          // otherwise enforce a small minimum page size.
          .withDictionaryPageSize(Math.max(dictionaryPageSize, 64))
          .withDictionaryEncoding(dictionaryPageSize != 0)
          .withPageSize(pageSize)
          .withDataModel(GenericData.get())
          .withSchema(projection)
          .build()) {
        for (Record record : reader) {
          writer.write(record);
          count += 1;
        }
      }
      threw = false;
    } catch (RuntimeException e) {
      throw new RuntimeException("Failed on record " + count, e);
    } finally {
      if (reader instanceof Closeable) {
        Closeables.close((Closeable) reader, threw);
      }
    }

    return 0;
  }

  @Override
  public List<String> getExamples() {
    return Lists.newArrayList(
        "# Create a Parquet file from an Avro file",
        "sample.avro -o sample.parquet",
        "# Create a Parquet file in S3 from a local Avro file",
        "path/to/sample.avro -o s3:/user/me/sample.parquet",
        "# Create a Parquet file from Avro data in S3",
        "s3:/data/path/sample.avro -o sample.parquet"
    );
  }
}
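
For readers who want the conversion without the CLI plumbing, the sketch below performs the same Avro-to-Parquet copy with AvroParquetWriter directly. It is a minimal example under stated assumptions, not the command's implementation: the class name AvroToParquetSketch and the input/output paths are placeholders, the schema is reused from the Avro container file rather than a separate .avsc, and GZIP is chosen to mirror the command's --compression-codec default. Schema projection, overwrite handling, and the row-group/page-size tuning options above are omitted.

// Minimal standalone sketch (assumed local paths) of the conversion run() performs.
import java.io.File;
import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class AvroToParquetSketch {
  public static void main(String[] args) throws IOException {
    File avroFile = new File("sample.avro");        // assumed input path
    Path parquetPath = new Path("sample.parquet");  // assumed output path

    try (DataFileReader<GenericRecord> reader =
        new DataFileReader<>(avroFile, new GenericDatumReader<GenericRecord>())) {
      // Reuse the schema embedded in the Avro container file.
      Schema schema = reader.getSchema();
      try (ParquetWriter<GenericRecord> writer = AvroParquetWriter
          .<GenericRecord>builder(parquetPath)
          .withSchema(schema)
          .withDataModel(GenericData.get())
          .withCompressionCodec(CompressionCodecName.GZIP) // mirrors the command's default
          .build()) {
        // Copy every record from the Avro file into the Parquet file.
        for (GenericRecord record : reader) {
          writer.write(record);
        }
      }
    }
  }
}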