// All downloads are free. Search and download functionality uses the official Maven repository.
//
// io.delta.sharing.spark.DeltaSharingSourceOffset.scala (Maven / Gradle / Ivy)

/*
 * Copyright (2021) The Delta Lake Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.delta.sharing.spark

// scalastyle:off import.ordering.noEmptyLine
import org.apache.spark.sql.connector.read.streaming.{Offset => OffsetV2}
import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset}
import org.json4s._
import org.json4s.jackson.JsonMethods.parse

import io.delta.sharing.client.util.JsonUtils

/**
 * Records how far a streaming query has progressed while reading changes from a Delta Sharing
 * server.
 *
 * @param sourceVersion     Version of the serialization format this offset was encoded with.
 * @param tableId           Id of the table being read. It is used to detect misconfiguration
 *                          when a query is restarted.
 * @param tableVersion      Table version that is currently being processed.
 * @param index             Position within the sequence of AddFiles of this table version, which
 *                          lets a large commit be split across several batches. The index is
 *                          derived by sorting on url.
 * @param isStartingVersion True when this offset belongs to a query that is still starting,
 *                          i.e. processing the data that was present in the table when the query
 *                          began, as opposed to processing data that arrived afterwards.
 */
case class DeltaSharingSourceOffset(
    sourceVersion: Long,
    tableId: String,
    tableVersion: Long,
    index: Long,
    isStartingVersion: Boolean)
  extends Offset {

  /** JSON representation of this offset, produced by `JsonUtils.toJson`. */
  override def json: String = {
    JsonUtils.toJson(this)
  }

  /**
   * Orders this offset against another offset of the same table: first by `tableVersion`, and
   * by `index` when the table versions are equal.
   *
   * The two offsets must carry the same `tableId`; this is enforced with an `assert`.
   *
   * @param otherOffset the offset to compare against.
   * @return 0 when both offsets are equivalent, a negative value when this offset is less than
   *         `otherOffset`, and a positive value when it is greater.
   */
  def compare(otherOffset: DeltaSharingSourceOffset): Int = {
    assert(
      tableId == otherOffset.tableId,
      "Comparing offsets that do not refer to the same table is disallowed.")

    java.lang.Long.compare(tableVersion, otherOffset.tableVersion) match {
      // Same table version: the position inside that version decides.
      case 0 => java.lang.Long.compare(index, otherOffset.index)
      case versionOrder => versionOrder
    }
  }
}

object DeltaSharingSourceOffset {

  /** Offset serialization version 1; also the highest version `validateSourceVersion` accepts. */
  val VERSION_1 = 1

  /**
   * Builds an offset directly from its five fields.
   *
   * @param sourceVersion     version of the serialization format of the offset.
   * @param tableId           id of the table being read.
   * @param tableVersion      table version being processed.
   * @param index             index within the AddFiles of `tableVersion`.
   * @param isStartingVersion whether the offset denotes a query that is starting.
   * @return a new [[DeltaSharingSourceOffset]] holding exactly the given values.
   */
  def apply(
      sourceVersion: Long,
      tableId: String,
      tableVersion: Long,
      index: Long,
      isStartingVersion: Boolean): DeltaSharingSourceOffset =
    new DeltaSharingSourceOffset(
      sourceVersion = sourceVersion,
      tableId = tableId,
      tableVersion = tableVersion,
      index = index,
      isStartingVersion = isStartingVersion)

  /**
   * Converts a generic streaming offset into a [[DeltaSharingSourceOffset]].
   *
   * An offset that already is a [[DeltaSharingSourceOffset]] is returned unchanged, without any
   * further checks. Any other offset is treated as serialized JSON: its `sourceVersion` is
   * validated, the JSON is deserialized, and the resulting `tableId` must equal `tableId`.
   *
   * @param tableId id of the table the query is expected to read from.
   * @param offset  the offset to convert.
   * @return the typed offset.
   * @throws Throwable the errors built by `DeltaSharingErrors` when the source version is
   *                   missing, invalid or unsupported, or when the deserialized offset refers
   *                   to a different table id.
   */
  def apply(tableId: String, offset: OffsetV2): DeltaSharingSourceOffset = offset match {
    case alreadyTyped: DeltaSharingSourceOffset =>
      alreadyTyped
    case serialized =>
      // Check the version field before attempting a full deserialization.
      validateSourceVersion(serialized.json)
      val decoded = JsonUtils.fromJson[DeltaSharingSourceOffset](serialized.json)
      if (decoded.tableId == tableId) {
        decoded
      } else {
        throw DeltaSharingErrors.nonExistentDeltaSharingTable(decoded.tableId)
      }
  }

  /**
   * Checks the `sourceVersion` field of a serialized offset.
   *
   * Fails when the field is absent, when it is not a JSON integer, or when it is greater than
   * the highest supported version ([[VERSION_1]]).
   *
   * @param json the serialized offset.
   */
  private def validateSourceVersion(json: String): Unit = {
    val highestSupported = VERSION_1

    val version = jsonOption(parse(json) \ "sourceVersion") match {
      case Some(JInt(number)) =>
        number.longValue
      case Some(unexpected) =>
        throw DeltaSharingErrors.invalidSourceVersion(unexpected.toString)
      case None =>
        throw DeltaSharingErrors.cannotFindSourceVersionException(json)
    }

    if (version > highestSupported) {
      throw DeltaSharingErrors.unsupportedTableReaderVersion(highestSupported, version)
    }
  }

  /** Wraps a JSON value in an `Option`, mapping `JNothing` to `None`. */
  private def jsonOption(json: JValue): Option[JValue] = json match {
    case JNothing => None
    case present: JValue => Some(present)
  }

  /**
   * Ensures that a pair of consecutive offsets only moves forward. Going backward could make the
   * query process the same data again and therefore duplicate it.
   *
   * @param previousOffset the earlier offset.
   * @param currentOffset  the later offset.
   * @throws IllegalStateException if `isStartingVersion` goes from false to true, or if
   *                               `previousOffset` compares greater than `currentOffset`.
   */
  def validateOffsets(
      previousOffset: DeltaSharingSourceOffset,
      currentOffset: DeltaSharingSourceOffset): Unit = {
    // Once the starting snapshot is done, a later offset must not claim to be in it again.
    val startingFlagReappeared =
      !previousOffset.isStartingVersion && currentOffset.isStartingVersion
    if (startingFlagReappeared) {
      throw new IllegalStateException(
        s"Found invalid offsets: 'isStartingVersion' fliped incorrectly. " +
          s"Previous: $previousOffset, Current: $currentOffset")
    }

    val wentBackward = previousOffset.compare(currentOffset) > 0
    if (wentBackward) {
      throw new IllegalStateException(
        s"Found invalid offsets. Previous: $previousOffset, Current: $currentOffset")
    }
  }
}




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy