org.yupana.spark.HDFSProgressSaver.scala
/*
 * Copyright 2019 Rusexpertiza LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.yupana.spark

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.spark.Partition
import org.yupana.hbase.HdfsFileUtils
import org.yupana.settings.Settings

import scala.io.Source
import scala.util.Using

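/**
  * Persists the progress of a partitioned Spark job to a file on HDFS so that an
  * interrupted run can resume from the partitions that have not completed yet.
  *
  * File layout: an "all partitions" header, one serialized partition per line,
  * a "completed partitions" header, then one line appended per finished partition.
  */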
class HDFSProgressSaver[P <: Partition](
    fileName: String,
    partitionStorable: PartitionStorable[P],
    settings: Settings
) extends ProgressSaver[P]
    with Serializable {

  private val allPartitionsHeader = "** All Partitions **"
  private val completedPartitionsHeader = "** Completed partitions **"

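  /** Overwrites the progress file with the full partition list, followed by the
    * completed-partitions header (with no entries yet). */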
  override def writePartitions(ps: Seq[P]): Unit = {
    val hBaseConfiguration = createHBaseConfiguration()
    HdfsFileUtils.saveDataToHdfs(
      fileName,
      hBaseConfiguration,
      os => {
        os.writeBytes(allPartitionsHeader + "\n")
        ps.foreach { partition =>
          os.writeBytes(partitionStorable.asString(partition) + "\n")
        }
        os.writeBytes(completedPartitionsHeader + "\n")
      }
    )
  }

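  /** Appends one finished partition to the completed section of the progress file. */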
  override def writeProgress(p: P): Unit = {
    val hBaseConfiguration = createHBaseConfiguration()
    HdfsFileUtils.appendDataToHdfs(
      fileName,
      hBaseConfiguration,
      os => os.writeBytes(partitionStorable.asString(p) + "\n")
    )
  }

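  /** Reads the progress file and returns the partitions not yet marked complete,
    * or an empty sequence if the file does not exist. */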
  override def readPartitions: Seq[P] = {
    val hBaseConfiguration = createHBaseConfiguration()

    if (HdfsFileUtils.isFileExists(fileName, hBaseConfiguration)) {
      val lines = HdfsFileUtils.readDataFromHdfs[List[String]](
        fileName,
        hBaseConfiguration,
        is => {
          Using.resource(Source.fromInputStream(is)) { s =>
            s.getLines().toList
          }
        }
      )
      if (!lines.headOption.contains(allPartitionsHeader)) {
        throw new IllegalArgumentException("Incorrect format of partitions file")
      }
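      // Lines between the two headers form the full partition list;
      // anything after the completed-partitions header records finished work.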
      val (allPartitions, tail) = lines.tail.span(_ != completedPartitionsHeader)
      val donePartitions = tail.drop(1).toSet
      allPartitions
        .filterNot(donePartitions.contains)
        .zipWithIndex
        .map { case (line, index) => partitionStorable.fromString(line, index) }
    } else Seq.empty
  }

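  /** The HBase configuration also carries the HDFS settings that HdfsFileUtils reads. */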
  private def createHBaseConfiguration(): Configuration = {
    val hBaseConfiguration = HBaseConfiguration.create()
    hBaseConfiguration.set("hbase.zookeeper.quorum", settings("hbase.zookeeper"))
    hBaseConfiguration.set("zookeeper.session.timeout", "180000")
    HdfsFileUtils.addHdfsPathToConfiguration(hBaseConfiguration, settings)
    hBaseConfiguration
  }
}
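
A minimal usage sketch, for context. DayPartition, its codec object, the file
path, and the resume flow are hypothetical; PartitionStorable's shape is
inferred from how this file calls it, and a configured Settings instance is
assumed to be available.

import org.apache.spark.Partition
import org.yupana.settings.Settings
import org.yupana.spark.{ HDFSProgressSaver, PartitionStorable }

// Hypothetical partition type: one partition per day, keyed by an ISO date.
case class DayPartition(index: Int, day: String) extends Partition

// Hypothetical codec matching the one-line-per-partition file format above.
object DayPartitionStorable extends PartitionStorable[DayPartition] {
  def asString(p: DayPartition): String = p.day
  def fromString(line: String, index: Int): DayPartition = DayPartition(index, line)
}

object ResumeExample {
  def run(settings: Settings): Unit = {
    val saver = new HDFSProgressSaver[DayPartition](
      "/yupana/progress.txt", // hypothetical HDFS path
      DayPartitionStorable,
      settings
    )

    // readPartitions returns an empty Seq when there is no progress file
    // (or when every listed partition has already been completed).
    val toProcess = saver.readPartitions match {
      case Seq() =>
        val all = (1 to 31).map(d => DayPartition(d - 1, f"2019-01-$d%02d"))
        saver.writePartitions(all) // register the full list; completed section starts empty
        all
      case remaining => remaining // resume with only the unfinished partitions
    }

    toProcess.foreach { p =>
      // ... process partition p here ...
      saver.writeProgress(p) // mark p complete by appending it to the file
    }
  }
}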