All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.ucr.cs.bdlab.beast.io.SpatialFilePartitioner.scala Maven / Gradle / Ivy

There is a newer version: 0.10.1-RC2
Show newest version
/*
 * Copyright 2022 University of California, Riverside
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.ucr.cs.bdlab.beast.io

import edu.ucr.cs.bdlab.beast.geolite.EnvelopeNDLite
import edu.ucr.cs.bdlab.beast.io.SpatialFileRDD.{MaxSplitSize, MinSplitSize, SpatialFilePartition}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
import org.apache.hadoop.io.compress.{CompressionCodecFactory, SplittableCompressionCodec}
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.spark.internal.Logging

import scala.collection.mutable.ArrayBuffer

/**
 * An iterator over the file partitions of all files that match a set of extensions.
 * Paths are searched lazily, one at a time, which helps when only one file is needed and the input path
 * contains thousands of matching files. The given `files` iterator itself is consumed eagerly at
 * construction time to resolve the status of each starting path.
 *
 * As required by the [[Iterator]] contract, [[next]] throws a `NoSuchElementException` once all
 * partitions have been returned; use [[hasNext]] to test for more partitions.
 *
 * @param fileSystem the file system that contains all the given files
 * @param files the list of files or directories to start the search on
 * @param extensions (optional) list of extensions to limit the search. Each one contains the dot, e.g., ".zip"
 * @param recursive recursively go into subdirectories in the search. Default: false
 * @param skipHidden skip hidden files that begin with "." or "_". Default: true
 * @param useMaster in directories with a master file, use it to list the files instead of using the file system
 * @param splitFiles a function that tells whether a file should be split or not
 */
class SpatialFilePartitioner(fileSystem: FileSystem, files: Iterator[Path],
                             extensions: Array[String] = null,
                             recursive: Boolean = false,
                             skipHidden: Boolean = true,
                             useMaster: Boolean = true,
                             splitFiles: FileStatus => Boolean = SpatialFilePartitioner.splitCompressed)
  extends Iterator[SpatialFilePartition2] with Logging {

  /** The paths that should be searched next. Used as a stack: the last entry is searched first. */
  private val pathsToSearch = new ArrayBuffer[FileStatus]()
  for (file <- files)
    pathsToSearch.append(fileSystem.getFileStatus(file))

  /** Partitions that have been created but not returned yet */
  private val partitions = new ArrayBuffer[SpatialFilePartition2]()

  /**
   * A filter that is used to determine which paths to consider based on user preferences.
   * It combines the (optional) extension filter and the (optional) hidden-file filter; a path is accepted
   * only if all enabled sub-filters accept it. With no sub-filter enabled, every path is accepted.
   */
  private lazy val filter: FileStatus => Boolean = {
    var subFilters = Array[FileStatus => Boolean]()
    if (extensions != null) {
      // Extensions are matched case-insensitively against the end of the file name
      val lowerCaseExtensions = extensions.map(_.toLowerCase)
      val extensionFilter: FileStatus => Boolean = fileStatus => {
        if (fileStatus.isDirectory)
          true // Extensions are only applied to files not directories
        else {
          val name = fileStatus.getPath.getName.toLowerCase
          lowerCaseExtensions.exists(ext => name.endsWith(ext))
        }
      }
      subFilters = subFilters :+ extensionFilter
    }
    if (skipHidden) {
      val hiddenFilter: FileStatus => Boolean = fileStatus => {
        val name = fileStatus.getPath.getName
        // Hidden filter applies to directories and files
        !(name.startsWith("_") || name.startsWith("."))
      }
      subFilters = subFilters :+ hiddenFilter
    }
    val combinedFilter: FileStatus => Boolean = path => subFilters.forall(subFilter => subFilter(path))
    combinedFilter
  }

  /** Number of partitions that have been created so far. Also used to assign the index of the next partition. */
  private var numPartitionsCreated: Int = 0

  /** The record that will be returned next, or `null` when the iterator is exhausted */
  private var nextRecord: SpatialFilePartition2 = prefetchNext

  /**
   * Finds the next partition to return. Pending partitions are drained first (from the end of the buffer,
   * i.e., in reverse order of creation). When none is pending, paths are popped from [[pathsToSearch]] and
   * expanded until at least one partition is produced or there are no more paths to search.
   * @return the next partition, or `null` if there are no more partitions
   */
  private def prefetchNext: SpatialFilePartition2 = {
    while (partitions.nonEmpty || pathsToSearch.nonEmpty) {
      if (partitions.nonEmpty)
        return partitions.remove(partitions.length - 1)
      val pathToSearch = pathsToSearch.remove(pathsToSearch.size - 1)
      if (pathToSearch.isFile) {
        // A file, apply any user-provided filters before returning
        if (filter(pathToSearch)) {
          val locations = fileSystem.getFileBlockLocations(pathToSearch, 0, pathToSearch.getLen)
            .flatMap(_.getHosts).distinct
          // A single partition that covers the entire file. The trailing -1/null arguments fill the
          // numFeatures, numPoints, avgSideLength, and mbr slots (presumably meaning "unknown").
          val filePartition = SpatialFilePartition2(0, pathToSearch.getPath.toString,
            0, pathToSearch.getLen, locations, -1, -1, null, null)
          // Split the file if needed
          addFilePartitions(filePartition, partitions)
        }
      } else if (pathToSearch.isDirectory) {
        // A directory, return all files inside it while applying the filter
        val contents = fileSystem.listStatus(pathToSearch.getPath)
        val iMaster = if (!useMaster) -1 else contents.indexWhere(f => f.getPath.getName.startsWith("_master"))
        if (iMaster != -1) {
          // Override the contents based on the master file and ignore the user-provided filter
          // NOTE(review): '\t' is presumably the field separator of the master file -- confirm in CSVReaderLite
          val masterReader = new CSVReaderLite(fileSystem.open(contents(iMaster).getPath), '\t')
          try {
            val masterFilePartitions = masterReader.map(row => {
              val path = new Path(pathToSearch.getPath, row.getAs[String]("File Name"))
              // Use the file length from the master file to override the split length
              val length = row.getAs[String]("Data Size").trim.toLong
              // All columns after "Geometry" are coordinates: the first half holds the minimum
              // coordinate along each dimension and the second half holds the maximum coordinates
              val firstDimension = row.fieldIndex("Geometry") + 1
              val lastDimension = row.length - 1
              val numDimensions = (lastDimension - firstDimension + 1) / 2
              val minCoord = new Array[Double](numDimensions)
              val maxCoord = new Array[Double](numDimensions)
              for (d <- 0 until numDimensions) {
                minCoord(d) = row.getAs[String](firstDimension + d).trim.toDouble
                maxCoord(d) = row.getAs[String](firstDimension + numDimensions + d).trim.toDouble
              }
              val mbr = new EnvelopeNDLite(minCoord, maxCoord)
              val numRecords = row.getAs[String]("Record Count").trim.toLong
              val locations = fileSystem.getFileBlockLocations(path, 0, length)
                .flatMap(_.getHosts).distinct
              SpatialFilePartition2(0, path.toString, 0, length, locations, numRecords, -1, null, mbr)
            })
            // Consume the rows while the reader is still open
            for (masterFilePartition <- masterFilePartitions)
              addFilePartitions(masterFilePartition, partitions)
          } finally {
            masterReader.close()
          }
        } else {
          val matchingContents = contents.filter(status => filter(status))
          for (content <- matchingContents) {
            // Subdirectories are only followed in recursive mode
            if (recursive || !content.isDirectory)
              pathsToSearch.append(content)
          }
        }
      } else {
        logWarning(s"Do not know how to handle $pathToSearch")
      }

    }
    // No more paths to search. Return null to mark end of iterator
    null
  }

  /** Tells whether there is at least one more partition to return */
  override def hasNext: Boolean = nextRecord != null

  /**
   * Returns the next partition and prefetches the one after it.
   * @return the next partition; never `null`
   * @throws NoSuchElementException if there are no more partitions, i.e., [[hasNext]] is `false`
   */
  override def next(): SpatialFilePartition2 = {
    // Honor the Iterator contract rather than leaking the internal null end-marker to the caller
    if (nextRecord == null)
      throw new NoSuchElementException("next on exhausted SpatialFilePartitioner")
    val currentRecord = nextRecord
    nextRecord = prefetchNext
    currentRecord
  }

  /**
   * Adds the partitions of the given file to the given buffer. If [[splitFiles]] rejects the file, the
   * given partition is added as-is after assigning its index. Otherwise, the range
   * `[file.offset, file.end)` is cut into consecutive partitions of the split size, where the last
   * partition may be up to 10% bigger than the split size to avoid creating a tiny trailing partition.
   * Each created partition gets the next available index and carries over the statistics
   * (numFeatures, numPoints, avgSideLength, mbr) of the input partition unchanged.
   * @param file the partition that describes the file (or file range) to add
   * @param partitions the buffer to append the created partitions to
   */
  def addFilePartitions(file: SpatialFilePartition2, partitions: ArrayBuffer[SpatialFilePartition2]): Unit = {
    val fileStatus = fileSystem.getFileStatus(new Path(file.filePath))
    if (!splitFiles(fileStatus)) {
      file.index = numPartitionsCreated
      numPartitionsCreated += 1
      partitions.append(file)
    } else {
      // Split files
      val splitSize = {
        val minSize = fileSystem.getConf.getLong(MinSplitSize, 1)
        val maxSize = fileSystem.getConf.getLong(MaxSplitSize, Long.MaxValue)
        // Clamp to at least one byte. A configured minimum of zero combined with a zero block size
        // (or a zero maximum) would otherwise yield a zero split size and the loop below would never advance.
        Math.max(1L, Math.max(minSize, Math.min(maxSize, fileStatus.getBlockSize)))
      }
      val blkLocations = fileSystem.getFileBlockLocations(fileStatus, file.offset, file.length)
      var partitionStart = file.offset
      val fileEnd = file.end
      val SPLIT_SLOP = 1.1 // 10% slop
      while (partitionStart < fileEnd) {
        // The hosts of a partition are the hosts of the block that contains its first byte
        val containingBlock = blkLocations.find(bl =>
          partitionStart >= bl.getOffset && partitionStart < bl.getOffset + bl.getLength)
        assert(containingBlock.isDefined, s"No locations found for block at offset $partitionStart")
        // Only cut a full split if the remainder is more than 10% bigger than the split size
        val partitionEnd = if ((fileEnd - partitionStart).toDouble / splitSize > SPLIT_SLOP)
          partitionStart + splitSize
        else
          fileEnd
        val partitionID = numPartitionsCreated
        val spatialPartition = SpatialFilePartition2(partitionID, file.filePath, partitionStart,
          partitionEnd - partitionStart, containingBlock.get.getHosts,
          file.numFeatures, file.numPoints, file.avgSideLength, file.mbr)
        numPartitionsCreated += 1
        partitions.append(spatialPartition)
        partitionStart = partitionEnd
      }
    }
  }
}

object SpatialFilePartitioner {
  /** Name of the configuration entry that holds the minimum split size */
  val MinSplitSize: String = FileInputFormat.SPLIT_MINSIZE

  /** Name of the configuration entry that holds the maximum split size */
  val MaxSplitSize: String = FileInputFormat.SPLIT_MAXSIZE

  /** A codec factory that is created on first access from a default Hadoop configuration */
  lazy val codecFactory: CompressionCodecFactory =  new CompressionCodecFactory(new Configuration())

  /**
   * A splitter that accepts a file for splitting when no compression codec is found for its path
   * (non-compressed files) or when the codec found is a [[SplittableCompressionCodec]]
   * (block-compressed files). Files with any other codec are not split.
   * @return a function that tells whether the file with the given status should be split
   */
  def splitCompressed: FileStatus => Boolean = { fileStatus =>
    // Option(...) maps a null codec (no codec found) to None, for which forall is vacuously true
    Option(codecFactory.getCodec(fileStatus.getPath)).forall {
      case _: SplittableCompressionCodec => true
      case _ => false
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy