All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.audienceproject.spark.dynamodb.datasource.ScanPartition.scala Maven / Gradle / Ivy

/**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  *
  * Copyright © 2019 AudienceProject. All rights reserved.
  */
package com.audienceproject.spark.dynamodb.datasource

import com.amazonaws.services.dynamodbv2.document.Item
import com.audienceproject.shaded.google.common.util.concurrent.RateLimiter
import com.audienceproject.spark.dynamodb.connector.DynamoConnector
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.sources.v2.reader.{InputPartition, InputPartitionReader}
import org.apache.spark.sql.types.{StructField, StructType}

import scala.collection.JavaConverters._

class ScanPartition(schema: StructType,
                    partitionIndex: Int,
                    connector: DynamoConnector,
                    filters: Array[Filter])
    extends InputPartition[InternalRow] {

    private val requiredColumns = schema.map(_.name)

    @transient
    private lazy val typeConversions = schema.collect({
        case StructField(name, dataType, _, _) => name -> TypeConversion(name, dataType)
    }).toMap

    override def createPartitionReader(): InputPartitionReader[InternalRow] = {
        if (connector.isEmpty) new EmptyReader
        else new PartitionReader
    }

    private class EmptyReader extends InputPartitionReader[InternalRow] {
        override def next(): Boolean = false

        override def get(): InternalRow = throw new IllegalStateException("Unable to call get() on empty iterator")

        override def close(): Unit = {}
    }

    private class PartitionReader extends InputPartitionReader[InternalRow] {

        private val pageIterator = connector.scan(partitionIndex, requiredColumns, filters).pages().iterator().asScala
        private val rateLimiter = RateLimiter.create(connector.readLimit)

        private var innerIterator: Iterator[InternalRow] = Iterator.empty

        private var currentRow: InternalRow = _
        private var proceed = false

        override def next(): Boolean = {
            proceed = true
            innerIterator.hasNext || {
                if (pageIterator.hasNext) {
                    nextPage()
                    next()
                }
                else false
            }
        }

        override def get(): InternalRow = {
            if (proceed) {
                currentRow = innerIterator.next()
                proceed = false
            }
            currentRow
        }

        override def close(): Unit = {}

        private def nextPage(): Unit = {
            val page = pageIterator.next()
            val result = page.getLowLevelResult
            Option(result.getScanResult.getConsumedCapacity).foreach(cap => rateLimiter.acquire(cap.getCapacityUnits.toInt max 1))
            innerIterator = result.getItems.iterator().asScala.map(itemToRow(requiredColumns))
        }

    }

    private def itemToRow(requiredColumns: Seq[String])(item: Item): InternalRow =
        if (requiredColumns.nonEmpty) InternalRow.fromSeq(requiredColumns.map(columnName => typeConversions(columnName)(item)))
        else InternalRow.fromSeq(item.asMap().asScala.values.toSeq.map(_.toString))

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy