All downloads are free. Search and download functionalities use the official Maven repository.

org.apache.parquet.cli.BaseCommand Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.parquet.cli;

import com.beust.jcommander.internal.Lists;
import com.google.common.base.Preconditions;
import com.google.common.io.CharStreams;
import com.google.common.io.Resources;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.security.AccessController;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.SeekableInput;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ChecksumFileSystem;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.avro.AvroReadSupport;
import org.apache.parquet.cli.json.AvroJsonReader;
import org.apache.parquet.cli.util.Formats;
import org.apache.parquet.cli.util.GetClassLoader;
import org.apache.parquet.cli.util.Schemas;
import org.apache.parquet.cli.util.SeekableFSDataInputStream;
import org.apache.parquet.hadoop.ParquetReader;
import org.slf4j.Logger;

public abstract class BaseCommand implements Command, Configurable {

  private static final String RESOURCE_URI_SCHEME = "resource";
  private static final String STDIN_AS_SOURCE = "stdin";

  protected final Logger console;

  private Configuration conf = null;
  private LocalFileSystem localFS = null;

  public BaseCommand(Logger console) {
    this.console = console;
  }

  /**
   * @return FileSystem to use when no file system scheme is present in a path
   * @throws IOException if there is an error loading the default fs
   */
  public FileSystem defaultFS() throws IOException {
    if (localFS == null) {
      this.localFS = FileSystem.getLocal(getConf());
    }
    return localFS;
  }

  /**
   * Output content to the console or a file.
   * 

* This will not produce checksum files. * * @param content String content to write * @param console A {@link Logger} for writing to the console * @param filename The destination {@link Path} as a String * @throws IOException if there is an error while writing */ public void output(String content, Logger console, String filename) throws IOException { if (filename == null || "-".equals(filename)) { console.info(content); } else { try (FSDataOutputStream outgoing = create(filename)) { outgoing.write(content.getBytes(StandardCharsets.UTF_8)); outgoing.flush(); } } } /** * Creates a file and returns an open {@link FSDataOutputStream}. *

* If the file does not have a file system scheme, this uses the default FS. *

* This will not produce checksum files and will overwrite a file that * already exists. * * @param filename The filename to create * @return An open FSDataOutputStream * @throws IOException if there is an error creating the file */ public FSDataOutputStream create(String filename) throws IOException { return create(filename, true); } /** * Creates a file and returns an open {@link FSDataOutputStream}. *

* If the file does not have a file system scheme, this uses the default FS. *

* This will produce checksum files and will overwrite a file that already * exists. * * @param filename The filename to create * @return An open FSDataOutputStream * @throws IOException if there is an error creating the file */ public FSDataOutputStream createWithChecksum(String filename) throws IOException { return create(filename, false); } /** * Creates a file and returns an open {@link FSDataOutputStream}. *

* If the file does not have a file system scheme, this uses the default FS. *

* This will neither produce checksum files nor overwrite a file that already * exists. * * @param filename The filename to create * @return An open FSDataOutputStream * @throws IOException if there is an error creating the file */ public FSDataOutputStream createWithNoOverwrite(String filename) throws IOException { return create(filename, true, false); } private FSDataOutputStream create(String filename, boolean noChecksum) throws IOException { return create(filename, noChecksum, true); } private FSDataOutputStream create(String filename, boolean noChecksum, boolean overwrite) throws IOException { Path filePath = qualifiedPath(filename); // even though it was qualified using the default FS, it may not be in it FileSystem fs = filePath.getFileSystem(getConf()); if (noChecksum && fs instanceof ChecksumFileSystem) { fs = ((ChecksumFileSystem) fs).getRawFileSystem(); } return fs.create(filePath, overwrite); } /** * Returns a qualified {@link Path} for the {@code filename}. *

* If the file does not have a file system scheme, this uses the default FS. * * @param filename The filename to qualify * @return A qualified Path for the filename * @throws IOException if there is an error creating a qualified path */ public Path qualifiedPath(String filename) throws IOException { Path cwd = defaultFS().makeQualified(new Path(".")); return new Path(filename).makeQualified(defaultFS().getUri(), cwd); } /** * Returns a {@link URI} for the {@code filename} that is a qualified Path or * a resource URI. *

* If the file does not have a file system scheme, this uses the default FS. * * @param filename The filename to qualify * @return A qualified URI for the filename * @throws IOException if there is an error creating a qualified URI */ public URI qualifiedURI(String filename) throws IOException { try { URI fileURI = new URI(filename); if (RESOURCE_URI_SCHEME.equals(fileURI.getScheme())) { return fileURI; } } catch (URISyntaxException ignore) { } return qualifiedPath(filename).toUri(); } /** * Opens an existing file or resource. *

* If the file does not have a file system scheme, this uses the default FS. * * @param filename The filename to open. * @return An open InputStream with the file contents * @throws IOException if there is an error opening the file * @throws IllegalArgumentException If the file does not exist */ public InputStream open(String filename) throws IOException { if (STDIN_AS_SOURCE.equals(filename)) { return System.in; } URI uri = qualifiedURI(filename); if (RESOURCE_URI_SCHEME.equals(uri.getScheme())) { return Resources.getResource(uri.getRawSchemeSpecificPart()).openStream(); } else { Path filePath = new Path(uri); // even though it was qualified using the default FS, it may not be in it FileSystem fs = filePath.getFileSystem(getConf()); return fs.open(filePath); } } public SeekableInput openSeekable(String filename) throws IOException { Path path = qualifiedPath(filename); // even though it was qualified using the default FS, it may not be in it FileSystem fs = path.getFileSystem(getConf()); return new SeekableFSDataInputStream(fs, path); } @Override public void setConf(Configuration conf) { this.conf = conf; HadoopFileSystemURLStreamHandler.setDefaultConf(conf); } @Override public Configuration getConf() { // In case conf is null, we'll return an empty configuration // this can be on a local development machine return null != conf ? conf : createDefaultConf(); } private Configuration createDefaultConf() { Configuration conf = new Configuration(); // Support reading INT96 by default conf.setBoolean(AvroReadSupport.READ_INT96_AS_FIXED, true); return conf; } /** * Returns a {@link ClassLoader} for a set of jars and directories. 
* * @param jars A list of jar paths * @param paths A list of directories containing .class files * @return a classloader for the jars and paths * @throws MalformedURLException if a jar or path is invalid */ protected static ClassLoader loaderFor(List jars, List paths) throws MalformedURLException { return AccessController.doPrivileged(new GetClassLoader(urls(jars, paths))); } /** * Returns a {@link ClassLoader} for a set of jars. * * @param jars A list of jar paths * @return a classloader for the jars * @throws MalformedURLException if a URL is invalid */ protected static ClassLoader loaderForJars(List jars) throws MalformedURLException { return AccessController.doPrivileged(new GetClassLoader(urls(jars, null))); } /** * Returns a {@link ClassLoader} for a set of directories. * * @param paths A list of directories containing .class files * @return a classloader for the paths * @throws MalformedURLException if a path is invalid */ protected static ClassLoader loaderForPaths(List paths) throws MalformedURLException { return AccessController.doPrivileged(new GetClassLoader(urls(null, paths))); } private static List urls(List jars, List dirs) throws MalformedURLException { // check the additional jars and lib directories in the local FS final List urls = Lists.newArrayList(); if (dirs != null) { for (String lib : dirs) { // final URLs must end in '/' for URLClassLoader File path = lib.endsWith("/") ? 
new File(lib) : new File(lib + "/"); Preconditions.checkArgument(path.exists(), "Lib directory does not exist: %s", lib); Preconditions.checkArgument(path.isDirectory(), "Not a directory: %s", lib); Preconditions.checkArgument( path.canRead() && path.canExecute(), "Insufficient permissions to access lib directory: %s", lib); urls.add(path.toURI().toURL()); } } if (jars != null) { for (String jar : jars) { File path = new File(jar); Preconditions.checkArgument(path.exists(), "Jar files does not exist: %s", jar); Preconditions.checkArgument(path.isFile(), "Not a file: %s", jar); Preconditions.checkArgument(path.canRead(), "Cannot read jar file: %s", jar); urls.add(path.toURI().toURL()); } } return urls; } protected Iterable openDataFile(final String source, Schema projection) throws IOException { Formats.Format format = Formats.detectFormat(open(source)); switch (format) { case PARQUET: Configuration conf = new Configuration(getConf()); // TODO: add these to the reader builder AvroReadSupport.setRequestedProjection(conf, projection); AvroReadSupport.setAvroReadSchema(conf, projection); final ParquetReader parquet = AvroParquetReader.builder(qualifiedPath(source)) .disableCompatibility() .withDataModel(GenericData.get()) .withConf(conf) .build(); return new Iterable() { @Override public Iterator iterator() { return new Iterator() { private boolean hasNext = false; private D next = advance(); @Override public boolean hasNext() { return hasNext; } @Override public D next() { if (!hasNext) { throw new NoSuchElementException(); } D toReturn = next; this.next = advance(); return toReturn; } private D advance() { try { D next = parquet.read(); this.hasNext = (next != null); return next; } catch (IOException e) { throw new RuntimeException("Failed while reading Parquet file: " + source, e); } } @Override public void remove() { throw new UnsupportedOperationException("Remove is not supported"); } }; } }; case AVRO: Iterable avroReader = (Iterable) 
DataFileReader.openReader(openSeekable(source), new GenericDatumReader<>(projection)); return avroReader; default: if (source.endsWith("json")) { return new AvroJsonReader<>(open(source), projection); } else { Preconditions.checkArgument(projection == null, "Cannot select columns from text files"); Iterable text = CharStreams.readLines(new InputStreamReader(open(source))); return text; } } } protected Schema getAvroSchema(String source) throws IOException { Formats.Format format; try (SeekableInput in = openSeekable(source)) { format = Formats.detectFormat((InputStream) in); in.seek(0); switch (format) { case PARQUET: return Schemas.fromParquet(getConf(), qualifiedURI(source)); case AVRO: return Schemas.fromAvro(open(source)); case TEXT: if (source.endsWith("avsc")) { return Schemas.fromAvsc(open(source)); } else if (source.endsWith("json")) { return Schemas.fromJSON("json", open(source)); } default: } throw new IllegalArgumentException(String.format("Could not determine file format of %s.", source)); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy