All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.github.mrpowers.spark.stringmetric.unsafe.UTF8StringFunctions Maven / Gradle / Ivy

The newest version!
package com.github.mrpowers.spark.stringmetric.unsafe;

import org.apache.spark.unsafe.types.UTF8String;

public class UTF8StringFunctions {
    // Wish these methods weren't private in UTF8String:
    // - bytesOfCodePointInUTF8
    // - numBytesForFirstByte
    // https://github.com/apache/spark/blob/master/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
    private static byte[] bytesOfCodePointInUTF8 = {
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00..0x0F
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10..0x1F
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20..0x2F
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30..0x3F
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40..0x4F
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50..0x5F
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60..0x6F
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70..0x7F
        // Continuation bytes cannot appear as the first byte
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80..0x8F
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90..0x9F
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0..0xAF
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0..0xBF
        0, 0, // 0xC0..0xC1 - disallowed in UTF-8
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC2..0xCF
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0..0xDF
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0..0xEF
        4, 4, 4, 4, 4, // 0xF0..0xF4
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 0xF5..0xFF - disallowed in UTF-8
    };

    /**
     * Returns the number of bytes for a code point with the first byte as `b`
     * @param b The first byte of a code point
     */
    private static int numBytesForFirstByte(final byte b) {
        final int offset = b & 0xFF;
        byte numBytes = bytesOfCodePointInUTF8[offset];
        return (numBytes == 0) ? 1: numBytes; // Skip the first byte disallowed in UTF-8
    }

    // Adapted from org.apache.commons.text.similarity.HammingDistance
    public static int hammingDistance(UTF8String left, UTF8String right) {
        int n = left.numChars();
        if (n != right.numChars()) {
            throw new java.lang.IllegalArgumentException(
                "Hamming distance is only defined for strings of same length!"
            );
        }

        int distance = 0;
        byte[] leftBytes = left.getBytes();
        byte[] rightBytes = right.getBytes();
        
        int leftIdx = 0;
        int rightIdx = 0;
        int c = 0;
        while (c < n) {
            int leftCharSize = numBytesForFirstByte(leftBytes[leftIdx]);
            int rightCharSize = numBytesForFirstByte(rightBytes[rightIdx]);

            if (leftCharSize != rightCharSize) {
                distance++;
            } else {
                for (int i = 0; i < leftCharSize; i++) {
                    if (leftBytes[i + leftIdx] != rightBytes[i + rightIdx]) {
                        distance++;
                        break;
                    }
                }
            }

            leftIdx += leftCharSize;
            rightIdx += rightCharSize;
            c++; // would rather be writing this than Java!
        }
        return distance;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy