
geotrellis.spark.io.hadoop.HadoopAttributeStore.scala

GeoTrellis is an open source geographic data processing engine for high performance applications.
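The listing below is the source of HadoopAttributeStore, a BlobLayerAttributeStore implementation that persists per-layer attributes as JSON files in a _attributes directory under a root path on a Hadoop-compatible filesystem.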

package geotrellis.spark.io.hadoop

import geotrellis.spark._
import geotrellis.spark.io._
import geotrellis.spark.io.index.KeyIndex
import org.apache.avro.Schema

import spray.json._
import DefaultJsonProtocol._
import org.apache.hadoop.fs.Path
import org.apache.spark._
import java.io.PrintWriter

import org.apache.hadoop.conf.Configuration

import scala.util.matching.Regex

class HadoopAttributeStore(val rootPath: Path, val hadoopConfiguration: Configuration) extends BlobLayerAttributeStore {
  import HadoopAttributeStore._

  val attributePath = new Path(rootPath, "_attributes")
  val fs = attributePath.getFileSystem(hadoopConfiguration)

  // Create directory if it doesn't exist
  if(!fs.exists(attributePath)) {
    fs.mkdirs(attributePath)
  }

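  // Attribute files are named <layerName>___<zoom>___<attributeName>.json inside the _attributes directory.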
  def attributePath(layerId: LayerId, attributeName: String): Path = {
    val fname = s"${layerId.name}${SEP}${layerId.zoom}${SEP}${attributeName}.json"
    new Path(attributePath, fname)
  }

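  // Deletes every attribute file matching the given wildcard (relative to _attributes);
  // throws LayerNotFoundError if the layer has no attribute files at all.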
  private def delete(layerId: LayerId, path: Path): Unit = {
    if(!layerExists(layerId)) throw new LayerNotFoundError(layerId)
    HdfsUtils
      .listFiles(new Path(attributePath, path), hadoopConfiguration)
      .foreach(fs.delete(_, false))
    clearCache(layerId)
  }

  // Wildcards are relative to the _attributes directory; callers resolve them against attributePath.
  def attributeWildcard(attributeName: String): Path =
    new Path(s"*${SEP}${attributeName}.json")

  def layerWildcard(layerId: LayerId): Path =
    new Path(s"${layerId.name}${SEP}${layerId.zoom}${SEP}*.json")

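  // Reads a single attribute file, which stores a JSON-encoded (LayerId, value) tuple.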
  private def readFile[T: JsonFormat](path: Path): Option[(LayerId, T)] = {
    HdfsUtils
      .getLineScanner(path, hadoopConfiguration)
      .map{ in =>
        val txt =
          try {
            in.mkString
          }
          finally {
            in.close()
          }
        txt.parseJson.convertTo[(LayerId, T)]
      }
  }

  def read[T: JsonFormat](layerId: LayerId, attributeName: String): T =
    readFile[T](attributePath(layerId, attributeName)) match {
      case Some((id, value)) => value
      case None => throw new AttributeNotFoundError(attributeName, layerId)
    }

  def readAll[T: JsonFormat](attributeName: String): Map[LayerId, T] = {
    HdfsUtils
      .listFiles(new Path(attributePath, attributeWildcard(attributeName)), hadoopConfiguration)
      .map { path: Path =>
        readFile[T](path) match {
          case Some(tup) => tup
          case None => throw new LayerIOError(s"Unable to list $attributeName attributes from $path")
        }
      }
      .toMap
  }

  def write[T: JsonFormat](layerId: LayerId, attributeName: String, value: T): Unit = {
    val path = attributePath(layerId, attributeName)

    if(fs.exists(path)) {
      fs.delete(path, false)
    }

    val fdos = fs.create(path)
    val out = new PrintWriter(fdos)
    try {
      val s = (layerId, value).toJson.toString()
      out.println(s)
    } finally {
      out.close()
      fdos.close()
    }
  }

  def layerExists(layerId: LayerId): Boolean =
    HdfsUtils
      .listFiles(new Path(attributePath, s"*.json"), hadoopConfiguration)
      .exists { path: Path =>
        val List(name, zoomStr) = path.getName.split(SEP).take(2).toList
        layerId == LayerId(name, zoomStr.toInt)
      }

  def delete(layerId: LayerId): Unit = {
    delete(layerId, new Path(s"${layerId.name}${SEP}${layerId.zoom}${SEP}*.json"))
    clearCache(layerId)
  }

  def delete(layerId: LayerId, attributeName: String): Unit = {
    delete(layerId, new Path(s"${layerId.name}${SEP}${layerId.zoom}${SEP}${attributeName}.json"))
    clearCache(layerId, attributeName)
  }

  def layerIds: Seq[LayerId] =
    HdfsUtils
      .listFiles(new Path(attributePath, s"*.json"), hadoopConfiguration)
      .map { path: Path =>
        val List(name, zoomStr) = path.getName.split(SEP).take(2).toList
        LayerId(name, zoomStr.toInt)
      }
      .distinct

  def availableAttributes(layerId: LayerId): Seq[String] = {
    HdfsUtils
      .listFiles(new Path(attributePath, layerWildcard(layerId)), hadoopConfiguration)
      .map { path: Path =>
        val attributeRx(name, zoom, attribute) = path.getName
        attribute
      }
      .toVector
  }
}

object HadoopAttributeStore {
  final val SEP = "___"

  // Matches attribute file names of the form <layer>___<zoom>___<attribute>.json
  val attributeRx = {
    val slug = "[a-zA-Z0-9-]+"
    new Regex(s"""($slug)${SEP}($slug)${SEP}($slug)\\.json""", "layer", "zoom", "attribute")
  }

  def apply(rootPath: Path, config: Configuration): HadoopAttributeStore =
    new HadoopAttributeStore(rootPath, config)

  def apply(rootPath: Path)(implicit sc: SparkContext): HadoopAttributeStore =
    apply(rootPath, sc.hadoopConfiguration)
}
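
For orientation, a minimal usage sketch follows (not part of the source above). It assumes a live SparkContext and spray-json's DefaultJsonProtocol in scope; the object name AttributeStoreExample, the catalog path, the layer name and zoom, and the attribute name "history" are illustrative placeholders.

import geotrellis.spark._
import geotrellis.spark.io.hadoop.HadoopAttributeStore
import org.apache.hadoop.fs.Path
import org.apache.spark.SparkContext
import spray.json.DefaultJsonProtocol._

object AttributeStoreExample {
  // Assumes an existing SparkContext; its Hadoop configuration is reused by the store.
  def run(implicit sc: SparkContext): Unit = {
    // Hypothetical catalog root; substitute a path reachable from your cluster.
    val store = HadoopAttributeStore(new Path("hdfs://namenode/geotrellis/catalog"))

    val layerId = LayerId("example-layer", 12)

    // Writes <rootPath>/_attributes/example-layer___12___history.json
    store.write[String](layerId, "history", "ingested by example job")

    // Reads the value back; throws AttributeNotFoundError if the file is missing.
    val history = store.read[String](layerId, "history")
    println(history)

    // Enumerates every layer that has at least one attribute file.
    store.layerIds.foreach(println)
  }
}

Because each file stores a JSON-encoded (LayerId, value) pair, readAll can rebuild a Map[LayerId, T] for one attribute name across all layers from a single wildcard listing.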