scray.hdfs.index.HDFSBlobResolver.scala
Scray adapter to read BLOBs from HDFS
package scray.hdfs.index
import java.io.ByteArrayOutputStream
import java.io.DataOutputStream
import java.net.URL
import java.util.Arrays
import java.util.UUID
import java.util.concurrent.locks.ReentrantReadWriteLock
import scala.collection.mutable.HashMap
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.io.BooleanWritable
import org.apache.hadoop.io.BytesWritable
import org.apache.hadoop.io.DoubleWritable
import org.apache.hadoop.io.FloatWritable
import org.apache.hadoop.io.IntWritable
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.ObjectWritable
import org.apache.hadoop.io.ShortWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.Writable
import com.typesafe.scalalogging.LazyLogging
import de.greenrobot.common.hash.Murmur3F
import scray.hdfs.hadoop.URLWritable
import scray.hdfs.hadoop.UUIDWritable
import scray.querying.description.TableIdentifier
import scray.querying.source.store.BlobResolver
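/**
* Resolves BLOBs stored in HDFS: keys are hashed with 128-bit Murmur3 (scoped to the table identifier),
* index files map those hashes to (blob file, offset) positions, and blobs are then read at that offset.
*/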
class HDFSBlobResolver[T <: Text](ti: TableIdentifier, directory: String) extends BlobResolver[Text] with LazyLogging {
import HDFSBlobResolver._
import IndexFilenameStructure.FileTypes._
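// HDFS handle plus a scanner that enumerates the index/blob file sets found under the given directory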
private val fs = getFileSystem()
private val directoryScanner = new IndexFilenameStructure(fs, directory)
/**
* returns a FileSystem handle; the blob directory URI is set as fs.defaultFS
*/
private def getFileSystem(): FileSystem = {
val config = new Configuration()
config.set("fs.defaultFS", directory)
FileSystem.get(config)
}
/**
* reads index files until a match for the key has been found.
* If found, returns the name of the blob file containing the key and the blob's position within that file.
*/
def readAllIndexesUntilKeyIsfound(key: ArrayBytes, files: List[CombinedFiles]): Option[(String, Long)] = {
//logger.info("readAllIndexesUntilKeyIsfo")
if(files.isEmpty) {
None
} else {
val headFiles = files.head.getFileSet
val indexFile = headFiles.find(sf => sf.getType == INDEX.toString())
val longOption = indexFile.flatMap { idxFile =>
IndexFileReader.getIndexForKey(fs, idxFile.getNameWithPath, key, ti)
}
longOption.orElse {
readAllIndexesUntilKeyIsfound(key, files.tail)
}
}
}
/**
* reads all index files and loads their key positions into the shared index cache
*/
def readAllIndexes(files: List[CombinedFiles]): Unit = {
files.foreach { combined =>
val indexFile = combined.getFileSet.find(sf => sf.getType == INDEX.toString())
indexFile.foreach(idxFile => IndexFileReader.updateCache(fs, idxFile.getNameWithPath, ti))
}
}
// TODO: the correct algorithm would first check a bloom filter for the newest entry of the key.
// Current lookup order: blob cache -> cached index position -> load all indexes and retry the blob cache.
def getBlob(key: Text): Option[Array[Byte]] = {
// logger.info("getBlob:" + key.toString())
val hashedKey = new ArrayBytes(computeHash(key, ti))
val blob = HDFSBlobResolver.getCachedBlob(hashedKey).orElse {
val files = directoryScanner.getFiles
logger.info(s"known files: ${files}")
// scan index-cache for existing entries
HDFSBlobResolver.getCachedIdxPos(hashedKey).flatMap { filepos =>
BlobFileReader.getBlobForPosition(fs, filepos._1, hashedKey, key.toString(), ti, filepos._2)
}.orElse {
// fall back: load all index entries into the cache, then look the blob up again
readAllIndexes(files)
HDFSBlobResolver.getCachedBlob(hashedKey)
// if we did not find the key, we need to find an index which contains it
/* readAllIndexesUntilKeyIsfound(hashedKey, files).flatMap { filepos =>
logger.info(s"Found key: ${key} in index")
BlobFileReader.getBlobForPosition(fs, filepos._1, hashedKey, ti, filepos._2)
} */
}
}
// logger.info(s"Test result = $blob")
blob
}
}
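// Minimal usage sketch. The TableIdentifier shape and the HDFS URI below are assumptions for illustration,
// not taken from this file:
//
//   val ti       = TableIdentifier("hdfs", "blobdb", "blobtable")   // assumed (dbSystem, dbId, tableId) fields
//   val resolver = new HDFSBlobResolver[Text](ti, "hdfs://namenode:8020/data/blobs")
//   val blob: Option[Array[Byte]] = resolver.getBlob(new Text("some-key"))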
object HDFSBlobResolver extends LazyLogging {
// cache for index and positions
val indexlock = new ReentrantReadWriteLock()
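// blob file names are interned to short ids so each index entry only stores a (fileId, offset) pair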
var filenumber: Short = 0
val idxmap = new HashMap[ArrayBytes, (Short, Long)]()
val filesMap = new HashMap[Short, String]()
val filesMapReversed = new HashMap[String, Short]()
// cache for blobs
val lock = new ReentrantReadWriteLock()
val blobmap = new HashMap[ArrayBytes, Array[Byte]]
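// note: both caches only ever grow in this class; entries are added but never evicted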
// test helper: returns an arbitrary entry from the index cache
def getAnyCachedIdxPos(): (ArrayBytes, String, Long) = {
indexlock.readLock().lock()
try {
val a = idxmap.iterator.next()
logger.info(a.toString())
(a._1, filesMap.get(a._2._1).get, a._2._2)
//get(key).map(pos => (filesMap.get(pos._1).get, pos._2))
} finally {
indexlock.readLock().unlock()
}
}
def getCachedIdxPos(key: ArrayBytes): Option[(String, Long)] = {
indexlock.readLock().lock()
try {
logger.debug(s"Request entry for key ${key} \t ${idxmap.get(key)}")
idxmap.get(key).map(pos => (filesMap.get(pos._1).get, pos._2))
} finally {
indexlock.readLock().unlock()
}
}
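// interns the blob file name (assigning the next short id if unseen) and records the key's (fileId, offset)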
def putIntoIndexCache(key: ArrayBytes, blobfileName: String, position: Long): Unit = {
indexlock.writeLock().lock()
try {
val fileNameNumber = filesMapReversed.get(blobfileName).getOrElse {
val currfilenumber = filenumber
filesMap += ((filenumber, blobfileName))
filesMapReversed += ((blobfileName, filenumber))
filenumber = (filenumber + 1).toShort
currfilenumber
}
idxmap.put(key, (fileNameNumber, position))
} finally {
indexlock.writeLock().unlock()
}
}
def getCachedBlob(key: ArrayBytes): Option[Array[Byte]] = {
lock.readLock().lock()
try {
blobmap.get(key)
} finally {
lock.readLock().unlock()
}
}
def putBlobIntoCache(key: ArrayBytes, blob: Array[Byte]): Unit = {
lock.writeLock().lock()
try {
blobmap.put(key, blob)
} finally {
lock.writeLock().unlock()
}
}
// internally we use murmur3f (128Bits) to represent the Blob keys...
def computeHash[T <: Writable](key: T, ti: TableIdentifier): Array[Byte] = {
val hasher = new Murmur3F()
val bos = new ByteArrayOutputStream()
val dos = new DataOutputStream(bos)
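// the digest covers the serialized key plus the table coordinates, so equal keys in different tables hash differently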
key.write(dos)
dos.writeUTF(ti.dbId)
dos.writeUTF(ti.tableId)
hasher.update(bos.toByteArray())
// don't need to close these streams as close does nothing
hasher.getValueBytesBigEndian
}
// // internally we use murmur3f (128Bits) to represent the Blob keys...
// def computeHash(key: Array[Byte], ti: TableIdentifier): Array[Byte] = {
// val hasher = new Murmur3F()
// val bos = new ByteArrayOutputStream()
// val dos = new DataOutputStream(bos)
// dos.write(key)
// dos.writeUTF(ti.dbId)
// dos.writeUTF(ti.tableId)
// hasher.update(bos.toByteArray())
// // don't need to close these streams as close does nothing
// hasher.getValueBytesBigEndian
// }
def byteTransformHadoopType(input: Text): Array[Byte] = input.toString().getBytes("UTF-8")
def transformHadoopTypes(input: Any): Writable = input match {
case writable: Writable => writable
case str: String => new Text(str)
case int: Int => new IntWritable(int)
case int: Integer => new IntWritable(int)
case lng: Long => new LongWritable(lng)
case dbl: Double => new DoubleWritable(dbl)
case flt: Float => new FloatWritable(flt)
case srt: Short => new ShortWritable(srt)
case bol: Boolean => new BooleanWritable(bol)
case uid: UUID => new UUIDWritable(uid)
case url: URL => new URLWritable(url)
case bi: Array[Byte] => new BytesWritable(bi)
case o: Object => new ObjectWritable(o)
// the Object case above catches any remaining reference type; only null falls through to a MatchError
}
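// example (hypothetical values): transformHadoopTypes("my-key") yields a Text, and
// computeHash(new Text("my-key"), ti) yields the 16-byte Murmur3 digest used as the cache key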
/**
* wraps a byte array to provide value-based equals/hashCode so key hashes can be used as HashMap keys
*/
class ArrayBytes(val bytes: Array[Byte]) {
override def equals(obj: Any): Boolean = obj match {
case that: ArrayBytes => Arrays.equals(bytes, that.bytes)
case _ => false
}
override def hashCode(): Int = Arrays.hashCode(bytes)
}
}