com.databricks.spark.redshift.Conversions.scala

/*
 * Copyright 2015 TouchType Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.databricks.spark.redshift

import java.sql.Timestamp
import java.text.{DecimalFormat, DateFormat, FieldPosition, ParsePosition, SimpleDateFormat}
import java.util.Date

import scala.collection.mutable

import org.apache.spark.sql.types._
import org.apache.spark.sql.Row

/**
 * Data type conversions for data unloaded from Redshift
 */
private[redshift] object Conversions {

  /**
   * Parse a boolean using Redshift's UNLOAD bool syntax
   */
  private def parseBoolean(s: String): Boolean = {
    if (s == "t") true
    else if (s == "f") false
    else throw new IllegalArgumentException(s"Expected 't' or 'f' but got '$s'")
  }

  /**
   * Formatter for writing decimals unloaded from Redshift.
   *
   * Note that Java Formatters are NOT thread-safe, so you should not re-use instances of this
   * DecimalFormat across threads.
   */
  def createRedshiftDecimalFormat(): DecimalFormat = {
    val format = new DecimalFormat()
    format.setParseBigDecimal(true)
    format
  }
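  /*
   * Illustrative sketch (not part of the original source): with setParseBigDecimal(true),
   * parse() returns a java.math.BigDecimal rather than a Double, preserving the exact
   * precision of values unloaded from Redshift DECIMAL columns. The literal below is
   * hypothetical, and the result assumes a default locale that uses '.' as the decimal
   * separator.
   *
   *   val decimal = createRedshiftDecimalFormat()
   *     .parse("123456.78").asInstanceOf[java.math.BigDecimal]
   *   // decimal: java.math.BigDecimal = 123456.78
   */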

  /**
   * Formatter for parsing strings exported from Redshift DATE columns.
   * This formatter should not be used when saving dates back to Redshift; instead, use
   * [[RedshiftTimestampFormat]].
   *
   * Note that Java Formatters are NOT thread-safe, so you should not re-use instances of this
   * SimpleDateFormat across threads.
   */
  def createRedshiftDateFormat(): SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd")
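  /*
   * Illustrative sketch (not part of the original source): parsing a Redshift DATE string
   * into a java.sql.Date, mirroring what createRowConverter does for DateType fields.
   * The date literal is hypothetical.
   *
   *   val parsed = createRedshiftDateFormat().parse("2015-07-01")
   *   val sqlDate = new java.sql.Date(parsed.getTime)
   */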

  /**
   * Return a function that will convert arrays of strings conforming to the given schema to Rows.
   *
   * Note that instances of this function are NOT thread-safe.
   */
  def createRowConverter(schema: StructType): (Array[String]) => Row = {
    val timestampFormat = new RedshiftTimestampFormat
    val dateFormat = createRedshiftDateFormat()
    val decimalFormat = createRedshiftDecimalFormat()
    val conversionFunctions: Array[String => Any] = schema.fields.map { field =>
      field.dataType match {
        case ByteType => (data: String) => data.toByte
        case BooleanType => (data: String) => parseBoolean(data)
        case DateType => (data: String) => new java.sql.Date(dateFormat.parse(data).getTime)
        case DoubleType => (data: String) => data.toDouble
        case FloatType => (data: String) => data.toFloat
        case _: DecimalType =>
          (data: String) => decimalFormat.parse(data).asInstanceOf[java.math.BigDecimal]
        case IntegerType => (data: String) => data.toInt
        case LongType => (data: String) => data.toLong
        case ShortType => (data: String) => data.toShort
        case StringType => (data: String) => data
        case TimestampType => (data: String) => new Timestamp(timestampFormat.parse(data).getTime)
        case _ => (data: String) => data
      }
    }
    // As a performance optimization, re-use the same mutable Seq:
    val converted: mutable.IndexedSeq[Any] = mutable.IndexedSeq.fill(schema.length)(null)
    (fields: Array[String]) => {
      var i = 0
      while (i < schema.length) {
        val data = fields(i)
        converted(i) = if (data == null || data.isEmpty) null else conversionFunctions(i)(data)
        i += 1
      }
      Row.fromSeq(converted)
    }
  }
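  /*
   * Illustrative sketch (not part of the original source): applying a converter built from a
   * hypothetical three-column schema to one line of UNLOAD output. Null or empty input fields
   * become nulls in the resulting Row.
   *
   *   val schema = StructType(Seq(
   *     StructField("id", IntegerType),
   *     StructField("active", BooleanType),
   *     StructField("created", TimestampType)))
   *   val convert = Conversions.createRowConverter(schema)
   *   val row = convert(Array("42", "t", "2015-07-01 00:00:00.001"))
   *   // row: Row containing (42, true, Timestamp("2015-07-01 00:00:00.001"))
   */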
}

/**
 * Formatter for parsing strings exported from Redshift TIMESTAMP columns and for formatting
 * timestamps as strings when writing data back to Redshift via Avro.
 *
 * Redshift may or may not include the fractional seconds in UNLOAD output, and there are
 * apparently no clues about this in the table schema. This format delegates to one of two
 * underlying formats based on the length of the input string.
 *
 * Instances of this class are NOT thread-safe (because they rely on Java's DateFormat classes,
 * which are also not thread-safe).
 */
private[redshift] class RedshiftTimestampFormat extends DateFormat {

  // Imports and exports with Redshift require that timestamps are represented
  // as strings, using the following formats
  private val PATTERN_WITH_MILLIS = "yyyy-MM-dd HH:mm:ss.SSS"
  private val PATTERN_WITHOUT_MILLIS = "yyyy-MM-dd HH:mm:ss"

  private val redshiftTimestampFormatWithMillis = new SimpleDateFormat(PATTERN_WITH_MILLIS)
  private val redshiftTimestampFormatWithoutMillis = new SimpleDateFormat(PATTERN_WITHOUT_MILLIS)

  override def format(
      date: Date,
      toAppendTo: StringBuffer,
      fieldPosition: FieldPosition): StringBuffer = {
    // Always export with milliseconds, as they can just be zero if not specified
    redshiftTimestampFormatWithMillis.format(date, toAppendTo, fieldPosition)
  }

  override def parse(source: String, pos: ParsePosition): Date = {
    if (source.length < PATTERN_WITH_MILLIS.length) {
      redshiftTimestampFormatWithoutMillis.parse(source, pos)
    } else {
      redshiftTimestampFormatWithMillis.parse(source, pos)
    }
  }
}
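/*
 * Illustrative sketch (not part of the original source): the same formatter accepts both
 * timestamp shapes that Redshift UNLOAD can produce, and always emits milliseconds when
 * formatting. The timestamp literals are hypothetical.
 *
 *   val fmt = new RedshiftTimestampFormat
 *   val withMillis = fmt.parse("2015-07-01 12:34:56.789")  // delegates to the .SSS pattern
 *   val withoutMillis = fmt.parse("2015-07-01 12:34:56")   // delegates to the plain pattern
 *   fmt.format(withoutMillis)                              // "2015-07-01 12:34:56.000"
 */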

private[redshift] class RedshiftDateFormat extends DateFormat {

  // Imports and exports with Redshift require that dates are represented
  // as strings, using the following format
  private val PATTERN = "yyyy-MM-dd"

  private val redshiftDateFormat = new SimpleDateFormat(PATTERN)

  override def format(
       date: Date,
       toAppendTo: StringBuffer,
       fieldPosition: FieldPosition): StringBuffer = {
    redshiftDateFormat.format(date, toAppendTo, fieldPosition)
  }

  override def parse(source: String, pos: ParsePosition): Date = {
    redshiftDateFormat.parse(source, pos)
  }
}



