org.apache.orc.tools.convert.ConvertTool Maven / Gradle / Ivy
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.orc.tools.convert;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;
import org.apache.orc.tools.json.JsonSchemaFinder;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.GZIPInputStream;
/**
 * A conversion tool to convert CSV or JSON files into ORC files.
 *
 * <p>Input files are classified by file extension ({@code .json}/{@code .jsn},
 * {@code .csv} or {@code .orc}, each optionally followed by {@code .gz}).
 * Every input is read batch by batch and appended to a single ORC output
 * file. When no schema is given on the command line, one is derived from the
 * JSON and ORC inputs.
 */
public class ConvertTool {
  /** Timestamp pattern used when the {@code -t/--timestampformat} option is absent. */
  static final String DEFAULT_TIMESTAMP_FORMAT =
      "yyyy[[-][/]]MM[[-][/]]dd[['T'][ ]]HH:mm:ss[ ][XXX][X]";

  /** The input files, in the order they were given on the command line. */
  private final List<FileInformation> fileList;
  /** The schema of the output file; also handed to every input reader. */
  private final TypeDescription schema;
  private final char csvSeparator;
  private final char csvQuote;
  private final char csvEscape;
  private final int csvHeaderLines;
  private final String csvNullString;
  private final String timestampFormat;
  /** The writer for the single output file. */
  private final Writer writer;
  /** The batch that is reused to move rows from each reader to the writer. */
  private final VectorizedRowBatch batch;

  /**
   * Derive a schema from the given input files.
   *
   * <p>JSON files are scanned by a {@link JsonSchemaFinder} and the schemas
   * of ORC files are merged into the same finder. CSV files contribute
   * nothing, so at least one JSON or ORC file must be present.
   *
   * @param files the input files to inspect
   * @param conf the configuration used to open ORC files
   * @return the schema reported by the schema finder
   * @throws IOException if a file cannot be opened or read
   * @throws IllegalArgumentException if no JSON or ORC file was available
   */
  TypeDescription buildSchema(List<FileInformation> files,
                              Configuration conf) throws IOException {
    JsonSchemaFinder schemaFinder = new JsonSchemaFinder();
    int filesScanned = 0;
    for (FileInformation file : files) {
      if (file.format == Format.JSON) {
        System.err.println("Scanning " + file.path + " for schema");
        filesScanned += 1;
        // Close the stream as soon as the scan of this file is finished so
        // that a long list of inputs does not accumulate open file handles.
        try (java.io.Reader input =
                 file.getReader(file.filesystem.open(file.path))) {
          schemaFinder.addFile(input);
        }
      } else if (file.format == Format.ORC) {
        System.err.println("Merging schema from " + file.path);
        filesScanned += 1;
        Reader reader = OrcFile.createReader(file.path,
            OrcFile.readerOptions(conf)
                .filesystem(file.filesystem));
        schemaFinder.addSchema(reader.getSchema());
      }
    }
    if (filesScanned == 0) {
      throw new IllegalArgumentException("Please specify a schema using" +
          " --schema for converting CSV files.");
    }
    return schemaFinder.getSchema();
  }

  /** How the bytes of an input file are compressed. */
  enum Compression {
    NONE, GZIP
  }

  /** The kinds of input file that are recognized. */
  enum Format {
    JSON, CSV, ORC
  }

  /**
   * The location, size, format and compression of one input file.
   *
   * <p>This is an inner (non-static) class because its readers use the
   * schema and the CSV settings of the enclosing tool.
   */
  class FileInformation {
    private final Compression compression;
    private final Format format;
    private final Path path;
    private final FileSystem filesystem;
    private final Configuration conf;
    /** The length of the file as reported by the file system. */
    private final long size;

    /**
     * Look up the file and classify it by its extension.
     *
     * @param path the location of the input file
     * @param conf the configuration used to resolve the file system
     * @throws IOException if the file system or file status is unavailable
     * @throws IllegalArgumentException if the extension is missing or unknown
     */
    FileInformation(Path path, Configuration conf) throws IOException {
      this.path = path;
      this.conf = conf;
      this.filesystem = path.getFileSystem(conf);
      this.size = filesystem.getFileStatus(path).getLen();
      String name = path.getName();
      int lastDot = name.lastIndexOf(".");
      if (lastDot >= 0 && ".gz".equals(name.substring(lastDot))) {
        this.compression = Compression.GZIP;
        // Strip ".gz" so that the format is taken from the inner extension.
        name = name.substring(0, lastDot);
        lastDot = name.lastIndexOf(".");
      } else {
        this.compression = Compression.NONE;
      }
      if (lastDot >= 0) {
        String ext = name.substring(lastDot);
        if (".json".equals(ext) || ".jsn".equals(ext)) {
          format = Format.JSON;
        } else if (".csv".equals(ext)) {
          format = Format.CSV;
        } else if (".orc".equals(ext)) {
          format = Format.ORC;
        } else {
          throw new IllegalArgumentException("Unknown kind of file " + path);
        }
      } else {
        throw new IllegalArgumentException("No extension on file " + path);
      }
    }

    /**
     * Wrap a raw byte stream of this file in a UTF-8 character reader,
     * decompressing it first when the file is gzipped.
     *
     * @param input the raw bytes of the file
     * @return a reader over the decoded text
     * @throws IOException if the gzip header cannot be read; in that case
     *     {@code input} is closed before the exception is rethrown
     */
    java.io.Reader getReader(InputStream input) throws IOException {
      InputStream stream = input;
      if (compression == Compression.GZIP) {
        try {
          stream = new GZIPInputStream(input);
        } catch (IOException e) {
          // The GZIPInputStream constructor reads the header eagerly. None of
          // the callers keeps a handle that it could clean up, so release
          // the underlying stream here instead of leaking it.
          try {
            input.close();
          } catch (IOException closeFailure) {
            e.addSuppressed(closeFailure);
          }
          throw e;
        }
      }
      return new InputStreamReader(stream, StandardCharsets.UTF_8);
    }

    /**
     * Open this file as a reader of row batches in the tool's schema.
     *
     * @return a record reader matching the format of this file
     * @throws IOException if the file cannot be opened
     * @throws IllegalArgumentException if the format has no reader
     */
    public RecordReader getRecordReader() throws IOException {
      switch (format) {
        case ORC: {
          // Pass the already resolved file system, as buildSchema does.
          Reader reader = OrcFile.createReader(path,
              OrcFile.readerOptions(conf).filesystem(filesystem));
          return reader.rows(reader.options().schema(schema));
        }
        case JSON: {
          FSDataInputStream underlying = filesystem.open(path);
          return new JsonReader(getReader(underlying), underlying, size, schema);
        }
        case CSV: {
          FSDataInputStream underlying = filesystem.open(path);
          return new CsvReader(getReader(underlying), underlying, size, schema,
              csvSeparator, csvQuote, csvEscape, csvHeaderLines, csvNullString,
              timestampFormat);
        }
        default:
          throw new IllegalArgumentException("Unhandled format " + format +
              " for " + path);
      }
    }
  }

  /**
   * Entry point of the tool: build a converter from the arguments and run it.
   *
   * <p>Note that this is not the JVM's {@code main(String[])} signature; the
   * caller has to supply the configuration.
   *
   * @param conf the configuration to use for all file access
   * @param args the command line arguments
   * @throws IOException if reading or writing fails
   * @throws ParseException if the command line cannot be parsed
   */
  public static void main(Configuration conf,
                          String[] args) throws IOException, ParseException {
    new ConvertTool(conf, args).run();
  }

  /**
   * Create a {@link FileInformation} for each of the given file names.
   *
   * @param files the names of the input files
   * @param conf the configuration used to resolve the file systems
   * @return the file descriptions, in the same order as {@code files}
   * @throws IOException if a file cannot be looked up
   */
  List<FileInformation> buildFileList(String[] files,
                                      Configuration conf) throws IOException {
    List<FileInformation> result = new ArrayList<>(files.length);
    for (String fn : files) {
      result.add(new FileInformation(new Path(fn), conf));
    }
    return result;
  }

  /**
   * Parse the command line, determine the schema and create the ORC writer.
   *
   * <p>The schema is taken from the {@code -s} option when present and is
   * otherwise derived from the input files. The output goes to the file
   * named by {@code -o}, or to {@code output.orc} by default.
   *
   * @param conf the configuration to use for all file access
   * @param args the command line arguments
   * @throws IOException if an input cannot be inspected or the writer cannot
   *     be created
   * @throws ParseException if the command line cannot be parsed
   */
  public ConvertTool(Configuration conf,
                     String[] args) throws IOException, ParseException {
    CommandLine opts = parseOptions(args);
    fileList = buildFileList(opts.getArgs(), conf);
    if (opts.hasOption('s')) {
      this.schema = TypeDescription.fromString(opts.getOptionValue('s'));
    } else {
      this.schema = buildSchema(fileList, conf);
    }
    this.csvQuote = getCharOption(opts, 'q', '"');
    this.csvEscape = getCharOption(opts, 'e', '\\');
    this.csvSeparator = getCharOption(opts, 'S', ',');
    this.csvHeaderLines = getIntOption(opts, 'H', 0);
    this.csvNullString = opts.getOptionValue('n', "");
    this.timestampFormat = opts.getOptionValue("t", DEFAULT_TIMESTAMP_FORMAT);
    String outFilename = opts.hasOption('o')
        ? opts.getOptionValue('o') : "output.orc";
    writer = OrcFile.createWriter(new Path(outFilename),
        OrcFile.writerOptions(conf).setSchema(schema));
    batch = schema.createRowBatch();
  }

  /**
   * Copy every batch of every input file to the writer and then close the
   * writer.
   *
   * @throws IOException if reading an input or writing the output fails
   */
  void run() throws IOException {
    for (FileInformation file : fileList) {
      System.err.println("Processing " + file.path);
      RecordReader reader = file.getRecordReader();
      try {
        while (reader.nextBatch(batch)) {
          writer.addRowBatch(batch);
        }
      } finally {
        // Release the input even when reading or writing a batch fails.
        reader.close();
      }
    }
    // Only reached after every input was copied without an exception.
    writer.close();
  }

  /**
   * Read an integer option.
   *
   * @param opts the parsed command line
   * @param letter the short name of the option
   * @param mydefault the value to use when the option is absent
   * @return the parsed value of the option or {@code mydefault}
   * @throws NumberFormatException if the value is not an integer
   */
  private static int getIntOption(CommandLine opts, char letter, int mydefault) {
    if (opts.hasOption(letter)) {
      return Integer.parseInt(opts.getOptionValue(letter));
    } else {
      return mydefault;
    }
  }

  /**
   * Read a single character option. Only the first character of the value
   * is used.
   *
   * @param opts the parsed command line
   * @param letter the short name of the option
   * @param mydefault the value to use when the option is absent
   * @return the first character of the option value or {@code mydefault}
   * @throws IllegalArgumentException if the option is present but its value
   *     is empty
   */
  private static char getCharOption(CommandLine opts, char letter, char mydefault) {
    if (!opts.hasOption(letter)) {
      return mydefault;
    }
    String value = opts.getOptionValue(letter);
    // An empty value (for example -q "") has no first character; report it
    // as a usage error rather than as a StringIndexOutOfBoundsException.
    if (value == null || value.isEmpty()) {
      throw new IllegalArgumentException("Option -" + letter +
          " requires a single character value");
    }
    return value.charAt(0);
  }

  /**
   * Define the supported options and parse the command line.
   *
   * <p>When help is requested or no input file is given, the usage text is
   * printed and the JVM exits with status 1.
   *
   * @param args the command line arguments
   * @return the parsed command line
   * @throws ParseException if the arguments do not match the options
   */
  private static CommandLine parseOptions(String[] args) throws ParseException {
    Options options = new Options();

    options.addOption(
        Option.builder("h").longOpt("help").desc("Provide help").build());
    options.addOption(
        Option.builder("s").longOpt("schema").hasArg()
            .desc("The schema to write in to the file").build());
    options.addOption(
        Option.builder("o").longOpt("output").desc("Output filename")
            .hasArg().build());
    options.addOption(
        Option.builder("n").longOpt("null").desc("CSV null string")
            .hasArg().build());
    options.addOption(
        Option.builder("q").longOpt("quote").desc("CSV quote character")
            .hasArg().build());
    options.addOption(
        Option.builder("e").longOpt("escape").desc("CSV escape character")
            .hasArg().build());
    options.addOption(
        Option.builder("S").longOpt("separator").desc("CSV separator character")
            .hasArg().build());
    options.addOption(
        Option.builder("H").longOpt("header").desc("CSV header lines")
            .hasArg().build());
    options.addOption(
        Option.builder("t").longOpt("timestampformat").desc("Timestamp Format")
            .hasArg().build());

    CommandLine cli = new DefaultParser().parse(options, args);
    if (cli.hasOption('h') || cli.getArgs().length == 0) {
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp("convert", options);
      System.exit(1);
    }
    return cli;
  }
}