/*
 * Listing of se.kuseman.payloadbuilder.api.execution.UTF8String from the payloadbuilder-api artifact
 * ("Payloadbuilder API Module", newest version; published for Maven / Gradle / Ivy).
 */
package se.kuseman.payloadbuilder.api.execution;

import static java.util.Objects.requireNonNull;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;

import se.kuseman.payloadbuilder.api.catalog.Column.Type;
import se.kuseman.payloadbuilder.api.catalog.ResolvedType;

/**
 * A string that can be backed by a {@link java.lang.String}, by a range of UTF-8 encoded bytes, or by both. Whichever representation is missing is created lazily from the other one on first use, which
 * lets operations such as comparison, hashing and concatenation work directly on the underlying bytes. NOTE! {@link ValueVector} is implemented here to let a single string become a literal value
 * vector of itself (one row, never null, type String) to avoid creating a separate literal vector.
 *
 * <p>
 * The lazy conversions write to plain (non-final, non-volatile) fields without any synchronization.
 * </p>
 */
public class UTF8String implements Comparable<UTF8String>, ValueVector
{
    /** Per thread reusable builder used when concatenating instances that all carry a java String */
    private static final ThreadLocal<StringBuilder> BUILDER = ThreadLocal.withInitial(() -> new StringBuilder(1024));

    // The shared constants eagerly load their bytes so both representations exist before they are handed out
    public static final UTF8String EMPTY = UTF8String.from("")
            .loadBytes();
    public static final UTF8String COMMA = UTF8String.from(",")
            .loadBytes();
    private static final UTF8String TRUE = UTF8String.from("true")
            .loadBytes();
    private static final UTF8String FALSE = UTF8String.from("false")
            .loadBytes();

    /** Seed of the byte based hash code */
    private static final int START = 17;
    /** Multiplier of the byte based hash code */
    private static final int CONSTANT = 37;

    /** Java string representation, null until provided at construction or materialized by {@link #toString()} */
    private String string;

    /** UTF-8 bytes, null until provided at construction or materialized by {@link #getBytesInternal()} */
    private byte[] bytes;
    /** Index in {@link #bytes} of the first byte of this string */
    private int offset;
    /** Number of bytes in {@link #bytes} that belongs to this string */
    private int length;

    private UTF8String(String string)
    {
        this.string = string;
    }

    private UTF8String(byte[] bytes, int offset, int length)
    {
        this.bytes = requireNonNull(bytes, "bytes");
        this.offset = offset;
        this.length = length;
    }

    /** Return a copy of the underlying utf8 bytes for this string */
    public byte[] getBytes()
    {
        getBytesInternal();
        return Arrays.copyOfRange(bytes, offset, offset + length);
    }

    /**
     * Copy the utf8 bytes of this instance into the start of the destination byte array. Caller is responsible for correct length, see {@link #getByteLength()}.
     *
     * @param destination Array that receives {@link #getByteLength()} bytes starting at index 0
     */
    public void getBytes(byte[] destination)
    {
        getBytesInternal();
        System.arraycopy(this.bytes, offset, destination, 0, length);
    }

    // ValueVector

    /** A string is a vector with exactly one row */
    @Override
    public int size()
    {
        return 1;
    }

    @Override
    public ResolvedType type()
    {
        return ResolvedType.of(Type.String);
    }

    /** The single row of this vector is never null */
    @Override
    public boolean isNull(int row)
    {
        return false;
    }

    /** Returns this instance regardless of row */
    @Override
    public UTF8String getString(int row)
    {
        return this;
    }

    // End ValueVector

    /**
     * Compare this string to another string. The ordering is by Unicode code point (which is the same as unsigned byte order of the UTF-8 encoding) no matter which representation the two instances
     * currently hold.
     */
    @Override
    public int compareTo(UTF8String that)
    {
        if (this == that)
        {
            return 0;
        }

        if (string != null
                && that.string != null)
        {
            // Plain String#compareTo orders by UTF-16 code unit which differs from the byte ordering below
            // when supplementary characters are involved, hence the code point aware comparison
            return compareCodePointOrder(string, that.string);
        }

        // UTF8 can be compared lexicographically by unsigned byte comparison
        byte[] thisBytes = getBytesInternal();
        int thisOffset = this.offset;
        byte[] thatBytes = that.getBytesInternal();
        int thatOffset = that.offset;

        int size = thisOffset + Math.min(this.length, that.length);

        while (thisOffset < size)
        {
            int a = thisBytes[thisOffset++] & 0xff;
            int b = thatBytes[thatOffset++] & 0xff;
            int diff = a - b;
            if (diff != 0)
            {
                return diff;
            }
        }

        // Equal up to the shortest length, the shorter one sorts first
        return this.length - that.length;
    }

    /**
     * Compare two java strings in code point order. For well formed UTF-16 this yields the same sign as an unsigned byte comparison of the strings UTF-8 encodings.
     */
    private static int compareCodePointOrder(String a, String b)
    {
        int limit = Math.min(a.length(), b.length());
        for (int i = 0; i < limit; i++)
        {
            char ca = a.charAt(i);
            char cb = b.charAt(i);
            if (ca != cb)
            {
                boolean surrogateA = Character.isSurrogate(ca);
                boolean surrogateB = Character.isSurrogate(cb);
                if (surrogateA != surrogateB)
                {
                    // A surrogate is part of a code point above U+FFFF and hence sorts after every
                    // non surrogate char, even those in U+E000..U+FFFF that have a larger char value
                    return surrogateA ? 1
                            : -1;
                }
                return ca - cb;
            }
        }
        return a.length() - b.length();
    }

    @Override
    public int hashCode()
    {
        // NOTE! We must always use hash code from bytes, we cannot mix
        // hash code from string and bytes since then hash match don't work
        getBytesInternal();

        // TODO: cache
        int result = START;
        final int end = offset + length;
        for (int i = offset; i < end; i++)
        {
            result = result * CONSTANT + bytes[i];
        }
        return result;
    }

    /** Two instances are equal when they represent the same text, compared by java string if both have one and otherwise byte by byte */
    @Override
    public boolean equals(Object obj)
    {
        if (obj == this)
        {
            return true;
        }
        // instanceof is false for null so no separate null check is needed
        if (!(obj instanceof UTF8String that))
        {
            return false;
        }

        if (string != null
                && that.string != null)
        {
            return string.equals(that.string);
        }

        // Load bytes of both sides before reading length/offset since those are set when loading
        byte[] bytes1 = getBytesInternal();
        byte[] bytes2 = that.getBytesInternal();

        if (length != that.length)
        {
            return false;
        }

        for (int i = 0; i < length; i++)
        {
            if (bytes1[offset + i] != bytes2[that.offset + i])
            {
                return false;
            }
        }
        return true;
    }

    /** Returns true if this instance has a {@link java.lang.String} instance. */
    public boolean hasString()
    {
        return string != null;
    }

    /** Return the byte length of this instance. */
    public int getByteLength()
    {
        getBytesInternal();
        return length;
    }

    /**
     * Return the backing byte array, encoding the java string to UTF-8 first if no bytes exist yet. The returned array is the internal one (not a copy) and the string occupies
     * {@link #length} bytes starting at {@link #offset}.
     */
    private byte[] getBytesInternal()
    {
        if (bytes != null)
        {
            return bytes;
        }
        else if (string != null)
        {
            bytes = string.getBytes(StandardCharsets.UTF_8);
            length = bytes.length;
            offset = 0;
        }
        return bytes;
    }

    /** Force the bytes to be loaded and return this instance, used for the shared constants */
    private UTF8String loadBytes()
    {
        getBytesInternal();
        return this;
    }

    // TODO: More string operations can be done here when codePoints are implemented
    // startsWith, endsWith, replace, subString

    /** Return a string representation of this instance. The decoded string is kept for subsequent calls */
    @Override
    public String toString()
    {
        if (string != null)
        {
            return string;
        }

        string = new String(bytes, offset, length, StandardCharsets.UTF_8);
        return string;
    }

    /** Concats provided strings with a comma delimiter */
    public static UTF8String concat(List<UTF8String> strings)
    {
        return concat(COMMA, strings);
    }

    /**
     * Concats provided strings with a delimiter.
     *
     * @param delimeter Delimiter to put between each string
     * @param strings Strings to concat, must not be null
     * @return {@link #EMPTY} if strings is empty, the sole element if strings has one element, else a new string
     */
    public static UTF8String concat(UTF8String delimeter, List<UTF8String> strings)
    {
        requireNonNull(strings);
        if (strings.isEmpty())
        {
            return EMPTY;
        }
        else if (strings.size() == 1)
        {
            return strings.get(0);
        }

        int count = strings.size();

        // If all have java String then we should not use byte concat
        boolean allHaveStrings = delimeter.string != null;
        for (int i = 0; i < count; i++)
        {
            if (strings.get(i).string == null)
            {
                allHaveStrings = false;
                break;
            }
        }

        if (allHaveStrings)
        {
            return concatStringBuilder(delimeter, strings);
        }

        // Calculate the total size up front so the result can be allocated once.
        // getByteLength also makes sure the bytes of every part are loaded
        int size = 0;
        for (int i = 0; i < count; i++)
        {
            size += strings.get(i)
                    .getByteLength();
        }
        size += (count - 1) * delimeter.getByteLength();

        byte[] bytes = new byte[size];

        int offset = 0;
        for (int i = 0; i < count; i++)
        {
            UTF8String str = strings.get(i);
            System.arraycopy(str.getBytesInternal(), str.offset, bytes, offset, str.length);
            offset += str.length;
            // Don't add a last delimiter
            if (i < count - 1
                    && delimeter.length > 0)
            {
                System.arraycopy(delimeter.getBytesInternal(), delimeter.offset, bytes, offset, delimeter.length);
                offset += delimeter.length;
            }
        }

        return utf8(bytes);
    }

    /** Concat strings via their java strings, only valid when the delimiter and all strings have a java string */
    private static UTF8String concatStringBuilder(UTF8String delimeter, List<UTF8String> strings)
    {
        int count = strings.size();
        StringBuilder sb = getBuilder();
        for (int i = 0; i < count; i++)
        {
            sb.append(strings.get(i).string);
            if (i < count - 1)
            {
                sb.append(delimeter.string);
            }
        }
        return new UTF8String(sb.toString());
    }

    /** Return the current threads reusable builder, emptied */
    private static StringBuilder getBuilder()
    {
        StringBuilder sb = BUILDER.get();
        sb.setLength(0);
        return sb;
    }

    /**
     * Creates a string from provided object. Booleans map to the shared "true"/"false" instances, a {@link UTF8String} is returned as is and a byte array is wrapped (not copied) and assumed to be
     * UTF-8. Will fallback to {@link String#valueOf(Object)} if no known type is found.
     */
    public static UTF8String from(Object object)
    {
        if (object instanceof Boolean bool)
        {
            return from(bool.booleanValue());
        }
        else if (object instanceof UTF8String utf8s)
        {
            return utf8s;
        }
        else if (object instanceof byte[] bytes)
        {
            // Assume utf8 bytes
            return new UTF8String(bytes, 0, bytes.length);
        }
        return from(String.valueOf(object));
    }

    /** Return the shared "true" or "false" instance for provided value */
    public static UTF8String from(boolean value)
    {
        return value ? TRUE
                : FALSE;
    }

    /**
     * Creates a string backed by provided java string. NOTE! The argument is not validated, an instance created from null has neither string nor bytes and fails with a
     * {@link NullPointerException} on e.g. {@link #toString()} and {@link #getBytes()}.
     */
    public static UTF8String from(String string)
    {
        return new UTF8String(string);
    }

    /** Creates a string backed by provided utf8 bytes. The array is wrapped, not copied */
    public static UTF8String utf8(byte[] bytes)
    {
        return utf8(bytes, 0, bytes.length);
    }

    /** Creates a string backed by a range of provided utf8 bytes. The array is wrapped, not copied */
    public static UTF8String utf8(byte[] bytes, int offset, int length)
    {
        return new UTF8String(bytes, offset, length);
    }

    /** Create a utf8 string from latin (ISO-8859-1) encoded bytes, see {@link #latin(byte[], int, int)} */
    public static UTF8String latin(byte[] bytes)
    {
        return latin(bytes, 0, bytes.length);
    }

    /**
     * Create a utf8 string from latin encoded bytes. NOTE! Recommended usage is utf8 since this method allocates some when converting bytes.
     */
    public static UTF8String latin(byte[] bytes, int offset, int length)
    {
        ByteBuffer buffer = ByteBuffer.wrap(bytes, offset, length);
        CharBuffer charBuffer = StandardCharsets.ISO_8859_1.decode(buffer);
        ByteBuffer encoded = StandardCharsets.UTF_8.encode(charBuffer);
        // Address the readable range through the buffer's own bookkeeping rather than assuming it starts at index 0
        return utf8(encoded.array(), encoded.arrayOffset() + encoded.position(), encoded.remaining());
    }
}