com.sparkutils.quality.BloomModel.scala
A Spark library for managing in-process data quality rules via Spark SQL
package com.sparkutils.quality
import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, File, FileInputStream, ObjectInputStream, ObjectOutputStream, Serializable}
import java.nio.{ByteOrder, IntBuffer}
import java.nio.channels.FileChannel
import org.apache.spark.util.sketch.BloomFilter
/**
 * Simple contains-style lookup for testing membership against a bloom filter
 */
trait BloomLookup {
  def apply(any: Any): Boolean = mightContain(any)
  def mightContain(any: Any): Boolean
}
case class SparkBloomFilter(bloom: BloomFilter) extends BloomLookup {
  override def mightContain(any: Any): Boolean =
    any match {
      // dispatch to the specialised Spark lookups where the type is known
      case s: String => bloom.mightContainString(s)
      case b: Array[Byte] => bloom.mightContainBinary(b)
      case l: Long => bloom.mightContainLong(l)
      case _ => bloom.mightContain(any)
    }
}
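/**
 * A minimal usage sketch (not part of the library API; the expected insertion
 * count and fpp are illustrative values): builds a Spark BloomFilter and
 * queries it through SparkBloomFilter.
 */
object SparkBloomFilterExample {
  def demo(): Unit = {
    val bloom = BloomFilter.create(1000L, 0.01)
    bloom.putString("alice")
    bloom.putLong(42L)
    val lookup = SparkBloomFilter(bloom)
    assert(lookup("alice")) // dispatches to mightContainString
    assert(lookup(42L))     // dispatches to mightContainLong
    // lookup("bob") is usually false, though bloom filters permit false positives
  }
}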
/**
 * Represents the shared file location of a bucketed bloom filter. There should be files named 0 to numBuckets - 1, each
 * containing the same number of bytes and representing one bucket.
 *
 * @param rootDir The directory which contains each bucket file
 * @param fpp The false positive probability (fpp) for this bloom - note it is informational only and will not be used in further processing
 * @param numBuckets The number of buckets within this bloom
 */
case class BloomModel(rootDir: String, fpp: Double, numBuckets: Int) extends Serializable {
  /**
   * Provides memory mapped buffers over the underlying bucket files
   * @return one little-endian IntBuffer per bucket
   */
  def maps: Seq[IntBuffer] =
    (0 until numBuckets).map { i =>
      val file = new File(rootDir, i.toString)
      val channel = new FileInputStream(file).getChannel
      try {
        // the mapping remains valid after the backing channel is closed
        channel.map(FileChannel.MapMode.READ_ONLY, 0, file.length()).order(ByteOrder.LITTLE_ENDIAN).asIntBuffer()
      } finally channel.close()
    }
  /**
   * Serializes the definition of these bucket files, not the underlying bytes of the bloom
   * @return the java-serialized form of this BloomModel
   */
  def serialize: Array[Byte] = {
    val bos = new ByteArrayOutputStream()
    val oos = new ObjectOutputStream(bos)
    oos.writeObject(this)
    oos.flush()
    val bytes = bos.toByteArray
    bos.close()
    bytes
  }
  /**
   * Reads each bucket file fully into memory, only performed when called
   * @return one byte array per bucket
   */
  def read: Array[Array[Byte]] = {
    val size = new File(rootDir, "0").length().toInt // safe as a bucket is never larger than a single array
    val ar = Array.ofDim[Array[Byte]](numBuckets)
    for { i <- 0 until numBuckets } {
      val f = new File(rootDir, i.toString)
      val dis = new DataInputStream(new FileInputStream(f))
      ar(i) = Array.ofDim[Byte](size)
      dis.readFully(ar(i)) // read may return short counts, readFully fills the whole buffer
      dis.close()
    }
    ar
  }
  /**
   * Removes sibling bloom directories under the parent of rootDir, leaving this bloom's files in place
   */
  def cleanupOthers(): Unit = {
    val file = new File(rootDir)
    val name = file.getName
    val siblings = file.getParentFile.listFiles().filterNot(_.getName == name)
    // listFiles returns null for non-directories, guard before deleting contents
    siblings.foreach(s => Option(s.listFiles()).getOrElse(Array.empty[File]).foreach(_.delete()))
    siblings.foreach(_.delete())
  }
  /**
   * Removes this bloom's files, advised only after you have processed or otherwise saved its results. It will remove everything under this bloom id
   */
  def removeThisBloom(): Unit = {
    val file = new File(rootDir)
    val siblings = file.getParentFile.listFiles()
    siblings.foreach(s => Option(s.listFiles()).getOrElse(Array.empty[File]).foreach(_.delete()))
    siblings.foreach(_.delete())
  }
}
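/**
 * A directory layout sketch (the path and bucket size are hypothetical): writes
 * numBuckets equally sized bucket files so that maps and read can load them back.
 */
object BloomModelFilesExample {
  import java.io.FileOutputStream
  def demo(): Unit = {
    val dir = new File("/tmp/blooms/current")
    dir.mkdirs()
    val model = BloomModel(dir.getPath, fpp = 0.01, numBuckets = 2)
    (0 until model.numBuckets).foreach { i =>
      val fos = new FileOutputStream(new File(dir, i.toString))
      fos.write(Array.fill[Byte](8)(0)) // every bucket must contain the same number of bytes
      fos.close()
    }
    val buffers = model.maps // memory mapped little-endian IntBuffers, one per bucket
    val bytes = model.read   // eager copy of each bucket into heap arrays
    assert(buffers.size == bytes.length)
  }
}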
object BloomModel {
  /**
   * Deserializes from the bytes, which must have been created by a compatible BloomModel#serialize
   * @param storageFormat the java-serialized bytes of a BloomModel
   * @return the deserialized BloomModel
   */
  def deserialize(storageFormat: Array[Byte]): BloomModel = {
    val ios = new ByteArrayInputStream(storageFormat)
    val oos = new ObjectInputStream(ios)
    val bloomModel = oos.readObject().asInstanceOf[BloomModel]
    oos.close()
    ios.close()
    bloomModel
  }
}
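/**
 * A round-trip sketch (the path and values are hypothetical): a BloomModel's
 * definition survives serialize/deserialize, the bucket bytes themselves stay on disk.
 */
object BloomModelSerializationExample {
  def demo(): Unit = {
    val model = BloomModel("/tmp/blooms/current", fpp = 0.01, numBuckets = 4)
    val bytes = model.serialize
    val restored = BloomModel.deserialize(bytes)
    assert(restored == model) // case class equality over rootDir, fpp and numBuckets
  }
}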