io.cdap.plugin.gcp.common.FileSetUtil Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of google-cloud Show documentation
Plugins for Google Big Query
The newest version!
/*
 * Copyright © 2015, 2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package io.cdap.plugin.gcp.common;

import io.cdap.cdap.api.dataset.lib.FileSetProperties;

import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;


/**
 * Utilities for configuring file sets during pipeline configuration.
 */
public final class FileSetUtil {
  private static final String AVRO_OUTPUT_CODEC = "avro.output.codec";
  private static final String MAPRED_OUTPUT_COMPRESS = "mapred.output.compress";
  private static final String AVRO_SCHEMA_OUTPUT_KEY = "avro.schema.output.key";
  private static final String CODEC_SNAPPY = "snappy";
  private static final String CODEC_DEFLATE = "deflate";
  public static final Set AVRO_CODECS = new HashSet<>(Arrays.asList(CODEC_SNAPPY, CODEC_DEFLATE));
  private static final String CODEC_GZIP = "gzip";
  public static final Set PARQUET_CODECS = new HashSet<>(Arrays.asList(CODEC_SNAPPY, CODEC_GZIP));
  private static final String PARQUET_AVRO_SCHEMA = "parquet.avro.schema";
  private static final String PARQUET_COMPRESSION = "parquet.compression";
  public static final String NONE = "None";

  /**
   * Sets the compression options for an Avro file set format. Also, sets the schema output key to the schema provided.
   * The map-reduce output compression is set to true, and the compression codec can be set to one of
   * the following:
   * 
   * snappy
   * deflate
   * 
   *
   * @param format The format param is required in order to provide better error msg
   * @param compressionCodec compression code provided, can be either snappy or deflate
   * @param schema           output schema to be set as the schema output key for the file set
   * @param isOutputProperty boolean value to identify if the compression options are used as output property for
   *                         FilesetProperties.Builder
   * @return map of string to be set as configuration or output properties in FileSetProperties.Builder
   */
  public static Map getAvroCompressionConfiguration(String format, String compressionCodec,
                                                                    String schema, Boolean isOutputProperty) {
    Map conf = new HashMap<>();
    String prefix = "";
    if (isOutputProperty) {
      prefix = FileSetProperties.OUTPUT_PROPERTIES_PREFIX;
    }

    if (isCompressionRequired(format, compressionCodec, AVRO_CODECS)) {
      String codec = compressionCodec.toLowerCase();

      conf.put(prefix + MAPRED_OUTPUT_COMPRESS, "true");
      conf.put(prefix + AVRO_SCHEMA_OUTPUT_KEY, schema);
      conf.put(prefix + AVRO_OUTPUT_CODEC, codec);
    }

    return conf;
  }

  /**
   * Sets the compression options for an Parquet file set format.
   * Also, sets the schema output key to the schema provided.
   * The compression codec can be set to one of the following:
   * 
   * SNAPPY
   * GZIP
   * 
   *
   * @param format The format param is required in order to provide better error msg
   * @param compressionCodec compression code selected by user. Can be either snappy or gzip
   * @param schema           output schema to be set as the schema output key for the file set
   * @param isOutputProperty boolean value to identify if the compression options are as output property for
   *                         FilesetProperties Builder
   * @return map of string to be set as configuration or output properties in FileSetProperties.Builder
   */
  public static Map getParquetCompressionConfiguration(String format, String compressionCodec,
                                                                       String schema, Boolean isOutputProperty) {
    Map conf = new HashMap<>();
    String prefix = "";
    if (isOutputProperty) {
      prefix = FileSetProperties.OUTPUT_PROPERTIES_PREFIX;
    }
    conf.put(prefix + PARQUET_AVRO_SCHEMA, schema);
    if (isCompressionRequired(format, compressionCodec, PARQUET_CODECS)) {
      conf.put(prefix + PARQUET_COMPRESSION, compressionCodec.toUpperCase());
    }
    return conf;
  }

  /**
   * Checks if compression configuration is required and validates the if the codec is supported
   *
   * @param format The format to validate if the codec is supported
   * @param codec The codec to validate
   * @param supportedCodecs  The supported codecs
   * @return True if compression is required, False if compression is not required
   * @throws IllegalArgumentException in case the codec is not supported
   */
  public static boolean isCompressionRequired(String format, String codec, Set supportedCodecs) {

    if (codec != null && !codec.equalsIgnoreCase(NONE)) {
      if (!supportedCodecs.contains(codec.toLowerCase())) {
        throw new IllegalArgumentException("Unsupported compression codec " + codec + " for format " + format);
      }
      return true;
    }
    return false;
  }
}