All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.parquet.cli.commands.SchemaCommand Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.parquet.cli.commands;

import com.beust.jcommander.Parameter;
import com.beust.jcommander.Parameters;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.List;
import org.apache.avro.file.SeekableInput;
import org.apache.parquet.cli.BaseCommand;
import org.apache.parquet.cli.util.Formats;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.slf4j.Logger;

/**
 * CLI command that prints the schema of one or more data files.
 *
 * <p>By default the Avro schema is printed; with the hidden {@code --parquet} flag the Parquet
 * schema is printed instead, without converting it to Avro. When exactly one target is given the
 * schema can be written to a file with {@code -o/--output}; with several targets each schema is
 * logged to the console, prefixed by its source path.
 */
@Parameters(commandDescription = "Print the Avro schema for a file")
public class SchemaCommand extends BaseCommand {

  /**
   * Creates the command.
   *
   * @param console logger handed to the superclass; schemas are printed through it
   */
  public SchemaCommand(Logger console) {
    super(console);
  }

  /** Positional arguments: the file(s) whose schema should be printed. */
  @Parameter(description = "")
  List<String> targets;

  /** Destination file for the schema; {@code null} means print to the console. */
  @Parameter(
      names = {"-o", "--output"},
      description = "Output file path")
  String outputPath = null;

  /** Whether an existing output file may be replaced. */
  @Parameter(
      names = {"--overwrite"},
      description = "Overwrite the output file if it exists")
  boolean overwrite = false;

  /** When set, print the Parquet schema as-is instead of the Avro schema. */
  @Parameter(
      names = {"--parquet"},
      description = "Print a Parquet schema, without converting to Avro",
      hidden = true)
  boolean parquetSchema = false;

  /**
   * Prints the schema of every target, or writes the schema of a single target to
   * {@link #outputPath} when one is given.
   *
   * @return {@code 0} on success
   * @throws IllegalArgumentException if no target was given, or if several targets were given
   *     together with an output path
   * @throws IOException if a source cannot be read or the output cannot be written
   */
  @Override
  @SuppressWarnings("unchecked")
  public int run() throws IOException {
    // At least one target is required. Requiring exactly one here would make the
    // multi-target branch below unreachable.
    Preconditions.checkArgument(targets != null && !targets.isEmpty(), "Parquet file is required.");

    if (targets.size() > 1) {
      // Several schemas cannot share one output file, so they are only printed.
      Preconditions.checkArgument(outputPath == null, "Cannot output multiple schemas to file %s", outputPath);
      for (String source : targets) {
        console.info("{}: {}", source, getSchema(source));
      }
      return 0;
    }

    String source = targets.get(0);
    if (outputPath == null) {
      console.info(getSchema(source));
      return 0;
    }

    // Resolve the schema before opening the output so that an unreadable source does not
    // leave behind an empty (or truncated, with --overwrite) output file.
    String schema = getSchema(source);
    try (OutputStream out = overwrite ? create(outputPath) : createWithNoOverwrite(outputPath)) {
      out.write(schema.getBytes(StandardCharsets.UTF_8));
    }

    return 0;
  }

  /**
   * Returns usage examples for this command: comment lines starting with {@code #}, each
   * followed by the example arguments.
   */
  @Override
  public List<String> getExamples() {
    return Lists.newArrayList(
        "# Print the Avro schema for a Parquet file",
        "sample.parquet",
        "# Print the Avro schema for an Avro file",
        "sample.avro",
        "# Print the Avro schema for a JSON file",
        "sample.json");
  }

  /**
   * Returns the schema of {@code source} as text: the Parquet schema when
   * {@link #parquetSchema} is set, otherwise the pretty-printed Avro schema.
   *
   * @param source path of the file to inspect
   * @throws IOException if the file cannot be read
   */
  private String getSchema(String source) throws IOException {
    if (parquetSchema) {
      return getParquetSchema(source);
    }
    return getAvroSchema(source).toString(true);
  }

  /**
   * Reads the Parquet schema from the footer metadata of {@code source}.
   *
   * @param source path of the file to inspect
   * @return the string form of the file's Parquet schema
   * @throws IllegalArgumentException if the detected format is not Parquet
   * @throws IOException if the file cannot be read
   */
  private String getParquetSchema(String source) throws IOException {
    try (SeekableInput in = openSeekable(source)) {
      // NOTE(review): this cast relies on openSeekable returning an object that is also an
      // InputStream; that is not visible from this class - confirm against BaseCommand.
      Formats.Format format = Formats.detectFormat((InputStream) in);
      in.seek(0);

      switch (format) {
        case PARQUET:
          // The reader opens the file by path on its own; NO_FILTER reads the full footer.
          try (ParquetFileReader reader = new ParquetFileReader(
              getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER)) {
            return reader.getFileMetaData().getSchema().toString();
          }
        default:
          throw new IllegalArgumentException(
              String.format("Could not get a Parquet schema for format %s: %s", format, source));
      }
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy