
// com.microsoft.azure.synapse.ml.vw.VowpalWabbitMurmurWithPrefix.scala (Maven / Gradle / Ivy)
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.
package com.microsoft.azure.synapse.ml.vw
import org.vowpalwabbit.spark.VowpalWabbitMurmur
import java.nio.charset.StandardCharsets
/**
  * VW style murmur hash with pre-hashing of an initially specified prefix.
  *
  * The UTF-8 bytes of `prefix` are written once into a reusable byte buffer. Each call to
  * `hash` encodes the requested characters as UTF-8 directly behind the prefix bytes and hashes
  * the combined byte range, which avoids building a concatenated string per call.
  *
  * Every call to `hash` overwrites the shared buffer, so concurrent calls on the same instance
  * would interfere with each other.
  *
  * @param prefix the prefix for each hashed value.
  * @param maxSize maximum size (in UTF-16 chars) of the string to be hashed through the buffer;
  *                longer inputs are hashed via plain string concatenation instead.
  */
class VowpalWabbitMurmurWithPrefix(val prefix: String, val maxSize: Int = 2 * 1024) extends Serializable {
  // Scratch buffer layout: [prefix bytes][encoded input bytes].
  // 4 bytes per char is a safe upper bound for the input (a BMP char needs at most 3 bytes and a
  // surrogate pair needs 4 bytes for 2 chars). The prefix length is added on top of that so a long
  // prefix (or a small maxSize) can never eat into the room reserved for the input.
  val ys: Array[Byte] = {
    // pre-populate the buffer with the prefix - we could go so far as to keep the intermediate hash state :)
    val prefixBytes = prefix.getBytes(StandardCharsets.UTF_8)
    val buffer = new Array[Byte](prefixBytes.length + maxSize * 4)
    Array.copy(prefixBytes, 0, buffer, 0, prefixBytes.length)
    buffer
  }

  // Offset of the first byte after the prefix, i.e. where the encoded input starts.
  // Re-derived from the prefix rather than cached in an extra field so that the set of fields of
  // this Serializable class stays the same.
  val ysStart: Int = prefix.getBytes(StandardCharsets.UTF_8).length

  /**
    * Hashes `prefix + str`.
    *
    * @param str the string to append to the prefix.
    * @param namespaceHash the seed passed through to the murmur hash.
    * @return the hash value.
    */
  def hash(str: String, namespaceHash: Int): Int =
    hash(str, 0, str.length, namespaceHash)

  /**
    * Hashes `prefix + str.substring(start, end)` without materializing the substring when the
    * range has at most `maxSize` chars.
    *
    * @param str the string holding the characters to hash.
    * @param start index of the first char to include.
    * @param end index one past the last char to include.
    * @param namespaceHash the seed passed through to the murmur hash.
    * @return the hash value.
    * @throws Exception if the range contains an unpaired surrogate (buffer path only).
    */
  def hash(str: String, start: Int, end: Int, namespaceHash: Int): Int = {
    if (end - start > maxSize)
      // too long for the pre-allocated buffer: fall back to hashing the concatenated string
      VowpalWabbitMurmur.hash(prefix + str.substring(start, end), namespaceHash)
    else {
      // adapted from https://stackoverflow.com/questions/5513144/converting-char-to-byte/20604909#20604909
      // copy sub part
      var i = start
      var j = ysStart // i for chars; j for bytes

      // fill ys with the UTF-8 bytes of str[start, end)
      while (i < end) { //scalastyle:ignore while
        val c = str.charAt(i)
        if (c < 0x80) {
          // 1 byte: ASCII
          ys(j) = c.toByte
          i = i + 1
          j = j + 1
        } else if (c < 0x800) {
          // 2 bytes
          ys(j) = (0xc0 | (c >> 6)).toByte
          ys(j + 1) = (0x80 | (c & 0x3f)).toByte
          i = i + 1
          j = j + 2
        } else if (Character.isHighSurrogate(c)) {
          // a high surrogate needs its low surrogate inside [start, end); this triggers when the
          // range ends on a dangling high surrogate
          if (end - i < 2) throw new Exception("overflow")
          val d = str.charAt(i + 1)
          val uc: Int =
            if (Character.isLowSurrogate(d))
              Character.toCodePoint(c, d)
            else
              throw new Exception("malformed")
          // 4 bytes: supplementary code point
          ys(j) = (0xf0 | (uc >> 18)).toByte
          ys(j + 1) = (0x80 | ((uc >> 12) & 0x3f)).toByte
          ys(j + 2) = (0x80 | ((uc >> 6) & 0x3f)).toByte
          ys(j + 3) = (0x80 | (uc & 0x3f)).toByte
          i = i + 2 // 2 chars
          j = j + 4
        } else if (Character.isLowSurrogate(c)) {
          // low surrogate without a preceding high surrogate
          throw new Exception("malformed")
        } else {
          // 3 bytes: remaining BMP chars
          ys(j) = (0xe0 | (c >> 12)).toByte
          ys(j + 1) = (0x80 | ((c >> 6) & 0x3f)).toByte
          ys(j + 2) = (0x80 | (c & 0x3f)).toByte
          i = i + 1
          j = j + 3
        }
      }

      // hash prefix bytes + encoded input bytes in one go
      VowpalWabbitMurmur.hash(ys, 0, j, namespaceHash)
    }
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy