All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.spark.aliyun.utils.OdpsUtils.scala Maven / Gradle / Ivy

There is a newer version: 0.28.4-public_2.4.3-1.0.4
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.aliyun.utils

import java.sql.SQLException

import com.aliyun.odps.`type`.TypeInfo
import com.aliyun.odps.account.AliyunAccount
import com.aliyun.odps.task.SQLTask
import com.aliyun.odps.{Partition, _}
import org.apache.spark.internal.Logging
import org.apache.spark.sql.types._

/**
 * Convenience wrapper around an [[Odps]] client for table, partition, schema
 * and SQL operations.
 *
 * Most methods call `odps.setDefaultProject`, i.e. they mutate the wrapped
 * client's default project as a side effect.
 *
 * @param odps The ODPS client all operations are delegated to.
 */
class OdpsUtils(odps: Odps) extends Logging {

  /**
   * Check whether a specific ODPS table and partition exist.
   *
   * Examples:
   * {{{
   * | Type of ODPS table | Table exist | Partition exist | Return         |
   * |--------------------|-------------|-----------------|----------------|
   * | Non-partitioned    | false       | -               | (false, false) |
   * | Non-partitioned    | true        | -               | (true, false)  |
   * | Partitioned        | true        | false           | (true, false)  |
   * | Partitioned        | true        | true            | (true, true)   |
   * | Partitioned        | false       | -               | (false, false) |
   * }}}
   *
   * @param project The name of ODPS project.
   * @param table The name of ODPS table.
   * @param pname The name of ODPS table partition, if partitioned table.
   * @return A pair `(tableExists, partitionExists)`.
   */
  def checkTableAndPartition(
      project: String,
      table: String,
      pname: String): (Boolean, Boolean) = {
    // Built before any table lookup, matching the original statement order.
    val partitionSpec_ = new PartitionSpec(pname)
    odps.setDefaultProject(project)
    val tables = odps.tables()
    if (!tables.exists(table)) {
      logWarning("table " + table + " do not exist!")
      (false, false)
    } else {
      // Partitions are matched on the string form of their PartitionSpec.
      val wanted = partitionSpec_.toString
      val partitionExist = tables.get(table).getPartitions
        .toArray(new Array[Partition](0))
        .exists(_.getPartitionSpec.toString.equals(wanted))
      (true, partitionExist)
    }
  }

  /**
   * Drop specific partition of ODPS table.
   *
   * A missing table or partition is treated as success.
   *
   * @param project The name of ODPS project.
   * @param table The name of ODPS table.
   * @param pname The name of ODPS table partition, if partitioned table.
   * @return Success or not. `false` only when an [[OdpsException]] was caught.
   */
  def dropPartition(
      project: String,
      table: String,
      pname: String): Boolean = {
    try {
      val (_, partitionE) = checkTableAndPartition(project, table, pname)
      if (partitionE) {
        odps.setDefaultProject(project)
        odps.tables().get(table).deletePartition(new PartitionSpec(pname))
      }
      true
    } catch {
      case e: OdpsException =>
        logError("somethings wrong happens when delete partition " +
          pname + " of " + table + ".")
        logError(e.getMessage)
        false
    }
  }

  /**
   * Drop specific ODPS table.
   *
   * A missing table is treated as success.
   *
   * @param project The name of ODPS project.
   * @param table The name of ODPS table.
   * @return Success or not. `false` only when an [[OdpsException]] was caught.
   */
  def dropTable(
      project: String,
      table: String): Boolean = {
    try {
      odps.setDefaultProject(project)
      val tables = odps.tables()
      // Existence is checked directly instead of going through
      // checkTableAndPartition with a placeholder partition name: that path
      // builds a PartitionSpec from a string that is not in key=value form and
      // lists the partitions of a table that is about to be deleted anyway.
      if (tables.exists(table)) {
        tables.delete(table)
      } else {
        logWarning("table " + table + " do not exist!")
      }
      true
    } catch {
      case e: OdpsException =>
        logError("somethings wrong happens when delete table " + table + ".")
        logError(e.getMessage)
        false
    }
  }

  /**
   * Create specific partition of ODPS table.
   *
   * @param project The name of ODPS project.
   * @param table The name of ODPS table.
   * @param pname The name of ODPS table partition, if partitioned table.
   * @return `true` if the partition already existed or was created; `false` if
   *         the table does not exist or creation raised an [[OdpsException]].
   */
  def createPartition(
      project: String,
      table: String,
      pname: String): Boolean = {
    val partitionSpec_ = new PartitionSpec(pname)
    // Also sets the default project on the client, which the create call relies on.
    val (tableE, partitionE) = checkTableAndPartition(project, table, pname)
    if (!tableE) {
      logWarning("table " + table + " do not exist, FAILED.")
      false
    } else if (partitionE) {
      logWarning("table " + table + " partition " + pname + " exist, " +
        "no need to create.")
      true
    } else {
      try {
        odps.tables().get(table).createPartition(partitionSpec_)
        true
      } catch {
        case e: OdpsException =>
          logError("somethings wrong happens when create table " + table +
            " partition " + pname + ".")
          // Log the cause as well, consistent with dropPartition/dropTable.
          logError(e.getMessage)
          false
      }
    }
  }

  /**
   * Get the table schema of ODPS table.
   *
   * @param project The name of ODPS project.
   * @param table The name of ODPS table.
   * @param isPartition If `true`, return the partition columns; otherwise the
   *                    regular columns.
   * @return The selected columns as `(name, type)` pairs.
   */
  def getTableSchema(project: String, table: String, isPartition: Boolean):
      Array[(String, TypeInfo)] = {
    odps.setDefaultProject(project)
    val schema = odps.tables().get(table).getSchema
    val columns = if (isPartition) schema.getPartitionColumns else schema.getColumns
    columns.toArray(new Array[Column](0)).map(c => (c.getName, c.getTypeInfo))
  }

  /**
   * Get information of specific column via column name.
   *
   * @param project The name of ODPS project.
   * @param table The name of ODPS table.
   * @param name The name of specific column.
   * @return Column index (as a string) and the Catalyst type's simple string.
   */
  def getColumnByName(project: String, table: String, name: String): (String, String) = {
    odps.setDefaultProject(project)
    val schema = odps.tables().get(table).getSchema
    val idx = schema.getColumnIndex(name)
    val colType = schema.getColumn(name).getTypeInfo
    val field = getCatalystType(name, colType, nullable = true)
    (idx.toString, field.dataType.simpleString)
  }

  /**
   * Get information of specific column via column index.
   *
   * @param project The name of ODPS project.
   * @param table The name of ODPS table.
   * @param idx The index of specific column.
   * @return Column name and the Catalyst type's simple string.
   */
  def getColumnByIdx(project: String, table: String, idx: Int): (String, String) = {
    odps.setDefaultProject(project)
    val column = odps.tables().get(table).getSchema.getColumn(idx)
    val name = column.getName
    // The column is already in hand; no need to look it up again by name.
    val field = getCatalystType(name, column.getTypeInfo, nullable = true)
    (name, field.dataType.simpleString)
  }

  /**
   * Run sql on ODPS.
   *
   * @param project The name of ODPS project.
   * @param sqlCmd An ODPS sql
   * @return The [[Instance]] returned by `SQLTask.run`, or `null` if an
   *         [[OdpsException]] was raised (the failure is logged).
   */
  def runSQL(project: String, sqlCmd: String): Instance = {
    odps.setDefaultProject(project)
    logInfo("SQL command: " + sqlCmd)
    try {
      SQLTask.run(odps, sqlCmd)
    } catch {
      case e: OdpsException =>
        // Route the stack trace through the logger rather than stderr; the
        // null result is kept for backward compatibility with callers.
        logError("Failed to run SQL command on ODPS.", e)
        null
    }
  }

  /**
   * Get all partition [[PartitionSpec]] of specific ODPS table.
   *
   * @param table The name of ODPS table.
   * @param project The name of ODPS project; when `null` the client's current
   *                default project is left unchanged.
   * @return All partition [[PartitionSpec]]
   */
  def getAllPartitionSpecs(table: String, project: String = null): Iterator[PartitionSpec] = {
    if (project != null) odps.setDefaultProject(project)
    odps.tables().get(table).getPartitions
      .toArray(new Array[Partition](0))
      .iterator
      .map(_.getPartitionSpec)
  }

  /**
   * Check if the table is a partition table
   *
   * @param table The name of ODPS table.
   * @param project The name of ODPS project; when `null` the client's current
   *                default project is left unchanged.
   * @return Whether the table is partitioned.
   */
  def isPartitionTable(table: String, project: String = null): Boolean = {
    if (project != null) odps.setDefaultProject(project)
    odps.tables().get(table).isPartitioned
  }

  /**
   * Check if the table exists
   *
   * @param table The name of ODPS table.
   * @param project The name of ODPS project; when `null` the client's current
   *                default project is left unchanged.
   * @return Whether the table exists.
   */
  def tableExist(table: String, project: String = null): Boolean = {
    if (project != null) odps.setDefaultProject(project)
    odps.tables().exists(table)
  }

  /**
   * Check if the partition exists in the table,
   * `partitionSpec` like `pt='xxx',ds='yyy'`
   *
   * The given string is compared verbatim against the string form of each
   * existing partition's [[PartitionSpec]].
   *
   * @param partitionSpec The partition spec string to look for.
   * @param table The name of ODPS table.
   * @param project The name of ODPS project; when `null` the client's current
   *                default project is left unchanged.
   * @return Whether a partition with that spec string exists.
   */
  def partitionExist(partitionSpec: String, table: String, project: String = null): Boolean = {
    if (project != null) odps.setDefaultProject(project)
    odps.tables().get(table).getPartitions
      .toArray(new Array[Partition](0))
      .exists(_.getPartitionSpec.toString.equals(partitionSpec))
  }

  /**
   * Map an ODPS column type to a Spark SQL [[StructField]].
   *
   * The field metadata carries the column name under `"name"` and a `"scale"`
   * of 0.
   *
   * @param columnName The column name, used for the field and its metadata.
   * @param columnType The ODPS type of the column.
   * @param nullable Whether the resulting field is nullable.
   * @return The corresponding Spark SQL field.
   * @throws SQLException if the ODPS type has no mapping here (interval types,
   *                      MAP, STRUCT, ARRAY, or any other unlisted type).
   */
  def getCatalystType(columnName: String, columnType: TypeInfo, nullable: Boolean):
      StructField = {
    val metadata = new MetadataBuilder()
      .putString("name", columnName)
      .putLong("scale", 0L)

    val answer = columnType.getOdpsType match {
      case OdpsType.BIGINT => LongType
      case OdpsType.BINARY => BinaryType
      case OdpsType.BOOLEAN => BooleanType
      case OdpsType.CHAR => StringType
      case OdpsType.DATE => DateType
      // NOTE(review): DATETIME is mapped to DateType, same as DATE. Confirm this
      // is intended, since DateType carries no time-of-day component.
      case OdpsType.DATETIME => DateType
      case OdpsType.DECIMAL => DecimalType.SYSTEM_DEFAULT
      case OdpsType.DOUBLE => DoubleType
      case OdpsType.FLOAT => FloatType
      case OdpsType.INT => IntegerType
      case OdpsType.SMALLINT => ShortType
      case OdpsType.STRING => StringType
      case OdpsType.TINYINT => ByteType
      case OdpsType.VARCHAR => StringType
      case OdpsType.TIMESTAMP => TimestampType
      case OdpsType.VOID => NullType
      case OdpsType.INTERVAL_DAY_TIME =>
        throw new SQLException(s"Unsupported type 'INTERVAL_DAY_TIME'")
      case OdpsType.INTERVAL_YEAR_MONTH =>
        throw new SQLException(s"Unsupported type 'INTERVAL_YEAR_MONTH'")
      case OdpsType.MAP =>
        throw new SQLException(s"Unsupported type 'MAP'")
      case OdpsType.STRUCT =>
        throw new SQLException(s"Unsupported type 'STRUCT'")
      case OdpsType.ARRAY =>
        throw new SQLException(s"Unsupported type 'ARRAY'")
      case _ => throw new SQLException(s"Unsupported type $columnType")
    }

    StructField(columnName, answer, nullable, metadata.build())
  }
}

object OdpsUtils {

  /**
   * Build an [[OdpsUtils]] from Aliyun credentials and an ODPS endpoint.
   *
   * @param accessKeyId The Aliyun access key id.
   * @param accessKeySecret The Aliyun access key secret.
   * @param odpsUrl The ODPS endpoint to set on the client.
   * @return An [[OdpsUtils]] wrapping a newly created [[Odps]] client.
   */
  def apply(accessKeyId: String, accessKeySecret: String, odpsUrl: String): OdpsUtils = {
    val account = new AliyunAccount(accessKeyId, accessKeySecret)
    val odps = new Odps(account)
    odps.setEndpoint(odpsUrl)
    new OdpsUtils(odps)
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy