/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.parquet.cli.commands;

import static org.apache.avro.generic.GenericData.Record;
import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_1_0;
import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_2_0;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.Parameters;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.List;
import java.util.Set;
import org.apache.avro.Schema;
import org.apache.avro.SchemaNormalization;
import org.apache.avro.generic.GenericData;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.cli.BaseCommand;
import org.apache.parquet.cli.csv.AvroCSV;
import org.apache.parquet.cli.csv.AvroCSVReader;
import org.apache.parquet.cli.csv.CSVProperties;
import org.apache.parquet.cli.util.Codecs;
import org.apache.parquet.cli.util.Schemas;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.ParquetWriter;
import org.slf4j.Logger;
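
/**
 * JCommander command that converts one or more CSV files into a single Parquet
 * file, using Avro {@link GenericData.Record} objects as the intermediate
 * representation. If no schema is given with {@code --schema}, a nullable
 * schema is inferred from the CSV input, and all inputs must infer the same
 * schema.
 *
 * <p>Illustrative invocation, assuming this command is registered as
 * {@code convert-csv} in the parquet-cli runner:
 * <pre>{@code
 * parquet convert-csv sample.csv -o sample.parquet --schema schema.avsc
 * }</pre>
 */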
@Parameters(commandDescription = "Create a file from CSV data")
public class ConvertCSVCommand extends BaseCommand {

  public ConvertCSVCommand(Logger console) {
    super(console);
  }
  @Parameter(description = "<csv path>")
  List<String> targets;

  @Parameter(
      names = {"-o", "--output"},
      description = "Output file path",
      required = true)
  String outputPath = null;

  @Parameter(
      names = {"-2", "--format-version-2", "--writer-version-2"},
      description = "Use Parquet format version 2",
      hidden = true)
  boolean v2 = false;

  @Parameter(names = "--delimiter", description = "Delimiter character")
  String delimiter = ",";

  @Parameter(names = "--escape", description = "Escape character")
  String escape = "\\";

  @Parameter(names = "--quote", description = "Quote character")
  String quote = "\"";

  @Parameter(names = "--no-header", description = "Don't use first line as CSV header")
  boolean noHeader = false;

  @Parameter(names = "--skip-lines", description = "Lines to skip before CSV start")
  int linesToSkip = 0;

  @Parameter(names = "--charset", description = "Character set name", hidden = true)
  String charsetName = Charset.defaultCharset().displayName();

  @Parameter(names = "--header", description = "Line to use as a header. Must match the CSV settings.")
  String header;

  @Parameter(names = "--require", description = "Do not allow null values for the given field")
  List<String> requiredFields;

  @Parameter(
      names = {"-s", "--schema"},
      description = "The file containing the Avro schema.")
  String avroSchemaFile;

  @Parameter(
      names = {"--compression-codec"},
      description = "A compression codec name.")
  String compressionCodecName = "GZIP";

  @Parameter(names = "--row-group-size", description = "Target row group size")
  int rowGroupSize = ParquetWriter.DEFAULT_BLOCK_SIZE;

  @Parameter(names = "--page-size", description = "Target page size")
  int pageSize = ParquetWriter.DEFAULT_PAGE_SIZE;

  @Parameter(names = "--dictionary-size", description = "Max dictionary page size")
  int dictionaryPageSize = ParquetWriter.DEFAULT_PAGE_SIZE;

  @Parameter(
      names = {"--overwrite"},
      description = "Remove any data already in the target view or dataset")
  boolean overwrite = false;

  @Override
  @SuppressWarnings("unchecked")
  public int run() throws IOException {
    Preconditions.checkArgument(targets != null && !targets.isEmpty(), "CSV path is required.");

    if (header != null) {
      // if a header is given on the command line, don't assume one is in the file
      noHeader = true;
    }
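
    // Bundle the CSV parsing options (delimiter, quoting, header handling,
    // charset) into one immutable properties object that is shared by schema
    // inference and record reading below.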
    CSVProperties props = new CSVProperties.Builder()
        .delimiter(delimiter)
        .escape(escape)
        .quote(quote)
        .header(header)
        .hasHeader(!noHeader)
        .linesToSkip(linesToSkip)
        .charset(charsetName)
        .build();
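
    // Resolve the record schema: prefer an explicit .avsc file; otherwise
    // infer a nullable schema from the CSV data itself.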
    Schema csvSchema = null;
    if (avroSchemaFile != null) {
      csvSchema = Schemas.fromAvsc(open(avroSchemaFile));
    } else {
      Set<String> required = ImmutableSet.of();
      if (requiredFields != null) {
        required = ImmutableSet.copyOf(requiredFields);
      }
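
      // Name the inferred record after the first input file, truncating the
      // filename at its first '.'.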
      String filename = new File(targets.get(0)).getName();
      String recordName;
      if (filename.contains(".")) {
        recordName = filename.substring(0, filename.indexOf("."));
      } else {
        recordName = filename;
      }

      // If the schema is not explicitly provided,
      // ensure that all input files share the same one.
      for (String target : targets) {
        Schema schema = AvroCSV.inferNullableSchema(recordName, open(target), props, required);
        if (csvSchema == null) {
          csvSchema = schema;
        } else if (!SchemaNormalization.toParsingForm(csvSchema)
            .equals(SchemaNormalization.toParsingForm(schema))) {
          throw new IllegalArgumentException(target + " seems to have a different schema from others. "
              + "Please specify the correct schema explicitly with the `--schema` option.");
        }
      }
    }
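
    // Write all inputs to a single Parquet file through the Avro object model.
    // The size defaults can be tuned with --row-group-size, --page-size, and
    // --dictionary-size; the hidden -2 flag selects the format version 2 writer.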
    try (ParquetWriter<Record> writer = AvroParquetWriter.<Record>builder(qualifiedPath(outputPath))
        .withWriterVersion(v2 ? PARQUET_2_0 : PARQUET_1_0)
        .withWriteMode(overwrite ? ParquetFileWriter.Mode.OVERWRITE : ParquetFileWriter.Mode.CREATE)
        .withCompressionCodec(Codecs.parquetCodec(compressionCodecName))
        .withDictionaryEncoding(true)
        .withDictionaryPageSize(dictionaryPageSize)
        .withPageSize(pageSize)
        .withRowGroupSize(rowGroupSize)
        .withDataModel(GenericData.get())
        .withConf(getConf())
        .withSchema(csvSchema)
        .build()) {
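      // Copy every record from each input, counting rows so that a parse or
      // write failure can be reported with its record position.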
      for (String target : targets) {
        long count = 0;
        try (AvroCSVReader<Record> reader =
            new AvroCSVReader<>(open(target), props, csvSchema, Record.class, true)) {
          for (Record record : reader) {
            writer.write(record);
            count++;
          }
        } catch (RuntimeException e) {
          throw new RuntimeException("Failed on record " + count + " in file " + target, e);
        }
      }
    }

    return 0;
  }

  @Override
  public List<String> getExamples() {
    return Lists.newArrayList(
        "# Create a Parquet file from a CSV file",
        "sample.csv -o sample.parquet --schema schema.avsc",
        "# Create a Parquet file in HDFS from local CSV",
        "path/to/sample.csv -o hdfs:/user/me/sample.parquet --schema schema.avsc");
  }
}