
io.cdap.plugin.common.FileSetUtil
/*
 * Copyright © 2015, 2016-2019 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package io.cdap.plugin.common;

import io.cdap.cdap.api.data.schema.UnsupportedTypeException;
import io.cdap.cdap.api.dataset.DatasetProperties;
import io.cdap.cdap.api.dataset.lib.FileSetProperties;

/**
 * Utilities for configuring file sets during pipeline configuration.
 */
public class FileSetUtil {

  /**
   * Configure a file set to use the Parquet file format with a given schema. The schema is parsed
   * as an Avro schema, validated, and converted into a Hive schema. The file set is configured to use
   * Parquet input and output format, and also configured for Explore to use Parquet. The schema is
   * added to the file set properties in all the different required ways:
   * <ul>
   *   <li>As a top-level dataset property;</li>
   *   <li>As the schema for the input and output format;</li>
   *   <li>As the schema of the Hive table.</li>
   * </ul>
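   * <p>Example usage (a minimal sketch; the schema literal and variable names are illustrative):
   * <pre>{@code
   * String schemaJson = "{\"type\":\"record\",\"name\":\"rec\"," +
   *     "\"fields\":[{\"name\":\"id\",\"type\":\"long\"}]}";
   * FileSetProperties.Builder builder = FileSetProperties.builder();
   * FileSetUtil.configureParquetFileSet(schemaJson, builder);
   * DatasetProperties properties = builder.build();
   * }</pre>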
   *
   * @param configuredSchema the original schema configured for the table
   * @param properties a builder for the file set properties
   */
  public static void configureParquetFileSet(String configuredSchema, FileSetProperties.Builder properties) {
    String hiveSchema = parseHiveSchema(configuredSchema, configuredSchema);
    properties
      .setEnableExploreOnCreate(true)
      .setExploreFormat("parquet")
      .setExploreSchema(hiveSchema.substring(1, hiveSchema.length() - 1))
      .add(DatasetProperties.SCHEMA, configuredSchema);
  }

  /**
   * Configure a file set to use the ORC file format with a given schema. The schema is parsed,
   * validated, and converted into a Hive schema that is compatible with the ORC format. The file set
   * is configured to use ORC input and output format, and also configured for Explore to use Hive.
   * The schema is added to the file set properties in all the different required ways:
   * <ul>
   *   <li>As a top-level dataset property;</li>
   *   <li>As the schema for the input and output format;</li>
   *   <li>As the schema to be used by the ORC serde (which is used by Hive).</li>
   * </ul>
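   * <p>Example usage (a minimal sketch; the schema literal and variable names are illustrative):
   * <pre>{@code
   * String schemaJson = "{\"type\":\"record\",\"name\":\"rec\"," +
   *     "\"fields\":[{\"name\":\"name\",\"type\":\"string\"}]}";
   * FileSetProperties.Builder builder = FileSetProperties.builder();
   * FileSetUtil.configureORCFileSet(schemaJson, builder);
   * // the builder now carries the ORC input/output formats, the OrcSerde, and the Explore schema
   * }</pre>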
   *
   * @param configuredSchema the original schema configured for the table
   * @param properties a builder for the file set properties
   */
  public static void configureORCFileSet(String configuredSchema, FileSetProperties.Builder properties) {
    // TODO test if complex cases run with lowercase schema only
    String lowerCaseSchema = configuredSchema.toLowerCase();
    String hiveSchema = parseHiveSchema(lowerCaseSchema, configuredSchema);
    hiveSchema = hiveSchema.substring(1, hiveSchema.length() - 1);
    properties.setExploreInputFormat("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat")
      .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat")
      .setSerDe("org.apache.hadoop.hive.ql.io.orc.OrcSerde")
      .setExploreSchema(hiveSchema)
      .setEnableExploreOnCreate(true)
      .add(DatasetProperties.SCHEMA, configuredSchema)
      .build();
  }

  /**
   * Configure a file set to use the Avro file format with a given schema. The schema is parsed
   * as an Avro schema, validated, and converted into a Hive schema. The file set is configured to use
   * Avro key input and output format, and also configured for Explore to use Avro. The schema is added
   * to the file set properties in all the different required ways:
   * <ul>
   *   <li>As a top-level dataset property;</li>
   *   <li>As the schema for the input and output format;</li>
   *   <li>As the schema of the Hive table;</li>
   *   <li>As the schema to be used by the Avro serde (which is used by Hive).</li>
   * </ul>
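   * <p>Example usage (a minimal sketch; the schema literal and variable names are illustrative):
   * <pre>{@code
   * String schemaJson = "{\"type\":\"record\",\"name\":\"rec\"," +
   *     "\"fields\":[{\"name\":\"id\",\"type\":\"long\"}]}";
   * FileSetProperties.Builder builder = FileSetProperties.builder();
   * FileSetUtil.configureAvroFileSet(schemaJson, builder);
   * // the same schema JSON also becomes the "avro.schema.literal" table property
   * }</pre>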
   *
   * @param configuredSchema the original schema configured for the table
   * @param properties a builder for the file set properties
   */
  public static void configureAvroFileSet(String configuredSchema, FileSetProperties.Builder properties) {
    properties
      .setEnableExploreOnCreate(true)
      .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
      .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
      .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
      .setTableProperty("avro.schema.literal", configuredSchema)
      .add(DatasetProperties.SCHEMA, configuredSchema);
  }

  /*----- private helpers ----*/

  /**
   * Parses the given schema JSON as a CDAP schema and converts it into a Hive schema string,
   * wrapping parse and conversion failures in an {@link IllegalArgumentException}.
   */
  private static String parseHiveSchema(String schemaString, String configuredSchema) {
    try {
      return HiveSchemaConverter.toHiveSchema(io.cdap.cdap.api.data.schema.Schema.parseJson(schemaString));
    } catch (UnsupportedTypeException e) {
      throw new IllegalArgumentException("Schema " + configuredSchema + " is not supported as a Hive schema.", e);
    } catch (Exception e) {
      throw new IllegalArgumentException("Schema " + configuredSchema + " is invalid.", e);
    }
  }
}