// org.apache.spark.sql.delta.actions.actions.scala (Maven / Gradle / Ivy)
// The newest version!
/*
* Copyright (2020) The Delta Lake Project Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.delta.actions
import java.net.URI
import java.sql.Timestamp
import org.apache.spark.sql.delta.util.JsonUtils
import com.fasterxml.jackson.annotation.{ JsonIgnore, JsonInclude }
import com.fasterxml.jackson.core.JsonGenerator
import com.fasterxml.jackson.databind.{ JsonSerializer, SerializerProvider }
import com.fasterxml.jackson.databind.annotation.{ JsonDeserialize, JsonSerialize }
import org.codehaus.jackson.annotate.JsonRawValue
import org.apache.spark.util.Utils
// scalastyle:off
import org.apache.spark.internal.Logging
import org.apache.spark.sql.Encoder
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.types.{ DataType, StructType }
// scalastyle:on
/**
 * Signals that a table's protocol version is greater than the version supported by
 * this client. The exception carries a fixed message asking the user to upgrade.
 */
class InvalidProtocolVersionException
  extends RuntimeException(
    "Delta protocol version is too new for this version of the " +
      "Databricks Runtime. Please upgrade to a newer release.")
/**
 * Error whose message states that the protocol version cannot be downgraded.
 *
 * @param oldProtocol the protocol being moved away from; rendered into the message
 * @param newProtocol the protocol being moved to; rendered into the message
 */
class ProtocolDowngradeException(oldProtocol: Protocol, newProtocol: Protocol)
  extends RuntimeException(
    s"Protocol version cannot be downgraded from $oldProtocol to $newProtocol")
/** Protocol version constants plus JSON and schema helpers for [[Action]]s. */
object Action {

  /** The maximum reader version of the protocol that this version of Delta understands. */
  val readerVersion: Int = 1

  /** The maximum writer version of the protocol that this version of Delta understands. */
  val writerVersion: Int = 2

  /** A [[Protocol]] built from [[readerVersion]] and [[writerVersion]]. */
  val protocolVersion: Protocol = Protocol(readerVersion, writerVersion)

  /**
   * Parses a JSON-encoded [[SingleAction]] envelope and returns the action it holds.
   *
   * @param json JSON text of one envelope
   * @return the result of `unwrap` on the parsed envelope
   */
  def fromJson(json: String): Action = {
    val envelope = JsonUtils.mapper.readValue[SingleAction](json)
    envelope.unwrap
  }

  /** Schema of [[SingleAction]] as reported by its `ExpressionEncoder`; built on first access. */
  lazy val logSchema = ExpressionEncoder[SingleAction].schema
}
/**
 * A single change to the state of a Delta table. Replaying an ordered sequence of
 * actions with [[InMemoryLogReplay]] derives the state of the table at a given
 * point in time.
 */
sealed trait Action {

  /** Places this action into a [[SingleAction]] envelope. */
  def wrap: SingleAction

  /** JSON text obtained by passing the wrapped action to `JsonUtils.toJson`. */
  def json: String = {
    val envelope = wrap
    JsonUtils.toJson(envelope)
  }
}
/**
 * Blocks older clients from reading or writing the log once a backwards
 * incompatible change has been made to the protocol. Readers and writers must
 * verify that they satisfy the minimum versions before doing anything else.
 *
 * Because this action makes it possible to shut out older clients explicitly when
 * a breaking change happens, clients should tolerate messages and fields they do
 * not understand.
 *
 * @param minReaderVersion minimum reader version; defaults to `Action.readerVersion`
 * @param minWriterVersion minimum writer version; defaults to `Action.writerVersion`
 */
case class Protocol(
    minReaderVersion: Int = Action.readerVersion,
    minWriterVersion: Int = Action.writerVersion)
  extends Action {

  /** Wraps this protocol in the `protocol` slot of a [[SingleAction]]. */
  override def wrap: SingleAction = SingleAction(protocol = this)

  /** Compact rendering of the form `(reader,writer)`; excluded from JSON output. */
  @JsonIgnore
  def simpleString: String =
    Seq(minReaderVersion, minWriterVersion).mkString("(", ",", ")")
}
/**
 * Records the committed version for a given application. This is what allows
 * operations such as streaming append to be idempotent.
 *
 * @param appId       identifier of the application
 * @param version     version committed by that application
 * @param lastUpdated optional value; the annotation makes Jackson read the content
 *                    of the `Option` as `java.lang.Long`
 */
case class SetTransaction(
    appId: String,
    version: Long,
    @JsonDeserialize(contentAs = classOf[java.lang.Long])
    lastUpdated: Option[Long])
  extends Action {

  /** Wraps this transaction marker in the `txn` slot of a [[SingleAction]]. */
  override def wrap: SingleAction = SingleAction(txn = this)
}
/** Common shape of the actions that add files to, or remove files from, the table. */
sealed trait FileAction extends Action {

  /** Path of the file, as a string. */
  val path: String

  /** Data-change flag carried by the action. */
  val dataChange: Boolean

  /** `path` parsed with `new URI(...)` on first access; excluded from JSON output. */
  @JsonIgnore
  lazy val pathAsUri: URI = new URI(path)
}
/**
 * Adds a new file to the table. When several [[AddFile]] actions share the same
 * `path`, only the metadata of the last one is kept.
 *
 * Construction fails (via `require`) when `path` is empty.
 *
 * @param path             path of the file; must be non-empty
 * @param partitionValues  partition column name to value; always included in JSON output
 * @param size             size of the file
 * @param modificationTime modification time of the file
 * @param dataChange       data-change flag for this action
 * @param stats            statistics string; `null` by default
 * @param tags             misc file-level metadata; `null` by default
 */
case class AddFile(
    path: String,
    @JsonInclude(JsonInclude.Include.ALWAYS)
    partitionValues: Map[String, String],
    size: Long,
    modificationTime: Long,
    dataChange: Boolean,
    // NOTE(review): this annotation is imported from the org.codehaus.jackson package,
    // unlike the com.fasterxml.jackson annotations used elsewhere in this file --
    // confirm whether the mapper actually honors it.
    @JsonRawValue
    stats: String = null,
    tags: Map[String, String] = null)
  extends FileAction {

  require(path.nonEmpty)

  /** Wraps this action in the `add` slot of a [[SingleAction]]. */
  override def wrap: SingleAction = SingleAction(add = this)

  /** Tombstone for this file built with the defaults of [[removeWithTimestamp]]. */
  def remove: RemoveFile = removeWithTimestamp()

  /**
   * Builds the [[RemoveFile]] tombstone for this file's path.
   *
   * @param timestamp  deletion timestamp; defaults to `System.currentTimeMillis()`
   * @param dataChange data-change flag for the tombstone; defaults to `true`
   */
  def removeWithTimestamp(
      timestamp: Long = System.currentTimeMillis(),
      dataChange: Boolean = true): RemoveFile = {
    // scalastyle:off
    RemoveFile(path = path, deletionTimestamp = Some(timestamp), dataChange = dataChange)
    // scalastyle:on
  }
}
object AddFile {

  /**
   * Misc file-level metadata.
   *
   * By convention clients may safely ignore any or all of these tags, and doing so
   * must never affect correctness.
   *
   * Information that does matter for correctness belongs in a field of the AddFile
   * action itself, together with a bump of the Delta protocol version.
   */
  object Tags {

    /** Base type of every tag key; `name` is the string used as the map key. */
    sealed abstract class KeyType(val name: String)

    /** [[ZCUBE_ID]]: identifier of the OPTIMIZE ZORDER BY job that this file was produced by */
    object ZCUBE_ID extends KeyType("ZCUBE_ID")

    /** [[ZCUBE_ZORDER_BY]]: ZOrdering of the corresponding ZCube */
    object ZCUBE_ZORDER_BY extends KeyType("ZCUBE_ZORDER_BY")
  }

  /** Converts a [[Tags.KeyType]] to the string used as key in the AddFile.tags Map[String, String]. */
  def tag(tagKey: Tags.KeyType): String = {
    tagKey.name
  }
}
/**
 * Logical removal of a file from the reservoir. It serves as a tombstone until the
 * file is deleted permanently.
 *
 * @param path              path of the removed file
 * @param deletionTimestamp optional deletion time; the annotation makes Jackson read
 *                          the content of the `Option` as `java.lang.Long`
 * @param dataChange        data-change flag; defaults to `true`
 */
// scalastyle:off
case class RemoveFile(
    path: String,
    @JsonDeserialize(contentAs = classOf[java.lang.Long])
    deletionTimestamp: Option[Long],
    dataChange: Boolean = true)
  extends FileAction {

  /** Wraps this tombstone in the `remove` slot of a [[SingleAction]]. */
  override def wrap: SingleAction = SingleAction(remove = this)

  /** `deletionTimestamp` when defined, otherwise `0L`; excluded from JSON output. */
  @JsonIgnore
  val delTimestamp: Long = deletionTimestamp.getOrElse(0L)
}
// scalastyle:on
/**
 * Format descriptor referenced by [[Metadata]].
 *
 * @param provider name of the format provider; defaults to `"parquet"`
 * @param options  provider options; empty by default
 */
case class Format(
    provider: String = "parquet",
    options: Map[String, String] = Map.empty)
/**
 * Updates the metadata of the table. Only the most recent [[Metadata]] update of a
 * table is kept. The writer is responsible for making sure that data already in
 * the table remains valid after any change.
 *
 * @param id               table id; `"testId"` when `Utils.isTesting`, else a random UUID
 * @param name             table name; `null` by default
 * @param description      table description; `null` by default
 * @param format           format descriptor; defaults to `Format()`
 * @param schemaString     JSON form of the schema; `null` by default
 * @param partitionColumns names of the partition columns; empty by default
 * @param configuration    configuration map; empty by default
 * @param createdTime      optional creation time; defaults to the current time in millis
 */
case class Metadata(
    id: String = if (Utils.isTesting) "testId" else java.util.UUID.randomUUID().toString,
    name: String = null,
    description: String = null,
    format: Format = Format(),
    schemaString: String = null,
    partitionColumns: Seq[String] = Nil,
    configuration: Map[String, String] = Map.empty,
    @JsonDeserialize(contentAs = classOf[java.lang.Long])
    createdTime: Option[Long] = Some(System.currentTimeMillis()))
  extends Action {

  // `schema` and `partitionSchema` must stay vals or lazy vals rather than defs:
  // parsing a StructType from JSON is extremely expensive and has caused
  // performance problems here in the past.

  /** The schema as a [[StructType]]; an empty struct when `schemaString` is null. */
  @JsonIgnore
  lazy val schema: StructType = {
    if (schemaString == null) {
      StructType.apply(Nil)
    } else {
      DataType.fromJson(schemaString).asInstanceOf[StructType]
    }
  }

  /** The partition columns looked up in [[schema]], in `partitionColumns` order. */
  @JsonIgnore
  lazy val partitionSchema: StructType = {
    val partitionFields = for (column <- partitionColumns) yield schema(column)
    new StructType(partitionFields.toArray)
  }

  /** Columns written out to files: every schema field that is not a partition column. */
  @JsonIgnore
  lazy val dataSchema: StructType = {
    val partitionNames = partitionColumns.toSet
    val dataFields = schema.filter(field => !partitionNames.contains(field.name))
    StructType(dataFields)
  }

  /** Wraps this metadata in the `metaData` slot of a [[SingleAction]]. */
  override def wrap: SingleAction = SingleAction(metaData = this)
}
/**
 * Contract for objects that describe a commit. A commit is identified by a version
 * and a timestamp. The timestamp originates from the remote storage
 * `lastModifiedTime` and may be adjusted for clock skew, which is why
 * `withTimestamp` exists.
 */
trait CommitMarker {

  /** Timestamp of the commit, in milliseconds after the epoch. */
  def getTimestamp: Long

  /**
   * Produces a copy of this object carrying the given timestamp.
   *
   * @param timestamp the timestamp the copy should report
   */
  def withTimestamp(timestamp: Long): CommitMarker

  /** Version of the commit. */
  def getVersion: Long
}
/**
 * Provenance information about changes to the table. This [[Action]] is not stored
 * in the checkpoint and comes with reduced compatibility guarantees. Its content
 * is best effort (i.e. it can be falsified by the writer).
 */
case class CommitInfo(
    // Leave the commit version unfilled during commit(). When a delta file is read,
    // the version can be inferred from the file name and filled in at that point.
    @JsonDeserialize(contentAs = classOf[java.lang.Long])
    version: Option[Long],
    timestamp: Timestamp,
    userId: Option[String],
    userName: Option[String],
    operation: String,
    @JsonSerialize(using = classOf[JsonMapSerializer])
    operationParameters: Map[String, String],
    job: Option[JobInfo],
    notebook: Option[NotebookInfo],
    clusterId: Option[String],
    @JsonDeserialize(contentAs = classOf[java.lang.Long])
    readVersion: Option[Long],
    isolationLevel: Option[String],
    /** Whether this commit has blindly appended without caring about existing files */
    isBlindAppend: Option[Boolean],
    operationMetrics: Option[Map[String, String]])
  extends Action
  with CommitMarker {

  /** Wraps this commit info in the `commitInfo` slot of a [[SingleAction]]. */
  override def wrap: SingleAction = SingleAction(commitInfo = this)

  /** Copy of this commit info whose `timestamp` is built from the given epoch millis. */
  override def withTimestamp(timestamp: Long): CommitInfo = {
    val replacement = new Timestamp(timestamp)
    copy(timestamp = replacement)
  }

  /** Epoch millis of `timestamp`. */
  override def getTimestamp: Long = timestamp.getTime

  /** The commit version; calls `get` on `version`, so it throws when `version` is `None`. */
  @JsonIgnore
  override def getVersion: Long = version.get
}
/** Job details attached to a commit; every field is a plain string. */
case class JobInfo(
    jobId: String,
    jobName: String,
    runId: String,
    jobOwnerId: String,
    triggerType: String)

object JobInfo {

  /**
   * Builds a [[JobInfo]] from a context map.
   *
   * @param context key/value context; read keys are `jobId`, `jobName`, `runId`,
   *                `jobOwnerId` and `jobTriggerType`
   * @return `None` when `jobId` is absent; otherwise a [[JobInfo]] whose remaining
   *         fields are `null` for any key missing from the map
   */
  def fromContext(context: Map[String, String]): Option[JobInfo] = {
    def valueOrNull(key: String): String = context.get(key).orNull

    for (id <- context.get("jobId")) yield {
      JobInfo(
        jobId = id,
        jobName = valueOrNull("jobName"),
        runId = valueOrNull("runId"),
        jobOwnerId = valueOrNull("jobOwnerId"),
        triggerType = valueOrNull("jobTriggerType"))
    }
  }
}
/** Notebook details attached to a commit. */
case class NotebookInfo(notebookId: String)

object NotebookInfo {

  /**
   * Builds a [[NotebookInfo]] from a context map.
   *
   * @param context key/value context; only the `notebookId` key is read
   * @return `None` when `notebookId` is absent, otherwise the wrapped id
   */
  def fromContext(context: Map[String, String]): Option[NotebookInfo] = {
    for (id <- context.get("notebookId")) yield NotebookInfo(id)
  }
}
object CommitInfo {

  /**
   * A [[CommitInfo]] in which only `version` is populated: optional fields are
   * `None` and the remaining fields are `null`.
   *
   * @param version version to store; defaults to `None`
   */
  def empty(version: Option[Long] = None): CommitInfo = {
    CommitInfo(
      version,
      null, // timestamp
      None, // userId
      None, // userName
      null, // operation
      null, // operationParameters
      None, // job
      None, // notebook
      None, // clusterId
      None, // readVersion
      None, // isolationLevel
      None, // isBlindAppend
      None // operationMetrics
    )
  }

  /**
   * Builds a [[CommitInfo]] with an unset version, deriving user, job, notebook and
   * cluster details from `commandContext`.
   *
   * @param time                commit time in epoch millis
   * @param operation           name of the operation
   * @param operationParameters parameters of the operation
   * @param commandContext      context map; the keys `userId`, `user` and `clusterId`
   *                            are read here, and the map is also passed to
   *                            `JobInfo.fromContext` and `NotebookInfo.fromContext`
   * @param readVersion         stored as-is
   * @param isolationLevel      stored as-is
   * @param isBlindAppend       stored as-is
   * @param operationMetrics    stored as-is
   */
  def apply(
      time: Long,
      operation: String,
      operationParameters: Map[String, String],
      commandContext: Map[String, String],
      readVersion: Option[Long],
      isolationLevel: Option[String],
      isBlindAppend: Option[Boolean],
      operationMetrics: Option[Map[String, String]]): CommitInfo = {
    // A null user or the literal "unknown" both count as "no user name".
    val resolvedUserName =
      commandContext.get("user").flatMap(Option(_)).filter(_ != "unknown")
    val commitTime = new Timestamp(time)

    CommitInfo(
      None, // version is left unfilled here
      commitTime,
      commandContext.get("userId"),
      resolvedUserName,
      operation,
      operationParameters,
      JobInfo.fromContext(commandContext),
      NotebookInfo.fromContext(commandContext),
      commandContext.get("clusterId"),
      readVersion,
      isolationLevel,
      isBlindAppend,
      operationMetrics
    )
  }
}
/**
 * Serialization helper providing a common envelope for actions: one nullable slot
 * per action type, each `null` by default.
 */
case class SingleAction(
    txn: SetTransaction = null,
    add: AddFile = null,
    remove: RemoveFile = null,
    metaData: Metadata = null,
    protocol: Protocol = null,
    commitInfo: CommitInfo = null) {

  /**
   * Returns the first non-null slot, checked in the order `add`, `remove`,
   * `metaData`, `txn`, `protocol`, `commitInfo`; `null` when every slot is null.
   */
  def unwrap: Action =
    if (add != null) add
    else if (remove != null) remove
    else if (metaData != null) metaData
    else if (txn != null) txn
    else if (protocol != null) protocol
    else if (commitInfo != null) commitInfo
    else null
}
/** Encoders for [[SingleAction]] and [[AddFile]], exposed as implicits. */
object SingleAction extends Logging {

  /** Evaluates `build`; any failure is logged with `logError` and then rethrown unchanged. */
  private def buildOrLog[T](build: => ExpressionEncoder[T]): ExpressionEncoder[T] = {
    try {
      build
    } catch {
      case e: Throwable =>
        logError(e.getMessage, e)
        throw e
    }
  }

  // Both encoders are created on first use and then kept; callers only ever
  // receive copies of them.
  private lazy val _encoder: ExpressionEncoder[SingleAction] =
    buildOrLog(ExpressionEncoder[SingleAction]())

  private lazy val _addFileEncoder: ExpressionEncoder[AddFile] =
    buildOrLog(ExpressionEncoder[AddFile]())

  /** A fresh copy of the cached [[SingleAction]] encoder on every call. */
  implicit def encoder: Encoder[SingleAction] = _encoder.copy()

  /** A fresh copy of the cached [[AddFile]] encoder on every call. */
  implicit def addFileEncoder: Encoder[AddFile] = _addFileEncoder.copy()
}
/** Serializes Maps containing JSON strings without extra escaping. */
class JsonMapSerializer extends JsonSerializer[Map[String, String]] {

  /**
   * Writes `parameters` as one JSON object.
   *
   * @param parameters map whose non-null values are already JSON text
   * @param jgen       generator receiving the output
   * @param provider   not used by this implementation
   */
  override def serialize(
      parameters: Map[String, String],
      jgen: JsonGenerator,
      provider: SerializerProvider): Unit = {
    jgen.writeStartObject()
    for ((key, value) <- parameters) {
      value match {
        case null =>
          jgen.writeNullField(key)
        case rawJson =>
          jgen.writeFieldName(key)
          // The value is already JSON text, so it is emitted as raw data.
          jgen.writeRawValue(rawJson)
      }
    }
    jgen.writeEndObject()
  }
}