org.neo4j.values.storable.UTF8StringValue Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of neo4j-values Show documentation
Show all versions of neo4j-values Show documentation
Neo4j property value system.
The newest version!
/*
* Copyright (c) "Neo4j"
* Neo4j Sweden AB [https://neo4j.com]
*
* This file is part of Neo4j.
*
* Neo4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package org.neo4j.values.storable;
import static org.neo4j.memory.HeapEstimator.shallowSizeOfInstance;
import static org.neo4j.memory.HeapEstimator.sizeOf;
import static org.neo4j.values.storable.Values.utf8Value;
import static org.neo4j.values.utils.ValueMath.HASH_CONSTANT;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.ArrayUtils;
import org.neo4j.hashing.HashFunction;
/*
* Behaves just like a normal StringValue, but is backed by a UTF-8 encoded byte array and
* only decodes that array into a String lazily, when necessary.
*
*/
public final class UTF8StringValue extends StringValue {
private static final long SHALLOW_SIZE = shallowSizeOfInstance(UTF8StringValue.class);
/** Used for removing the high order bit from byte. */
private static final int HIGH_BIT_MASK = 0b0111_1111;
/** Used for detecting non-continuation bytes. For example {@code 0b10xx_xxxx}. */
private static final int NON_CONTINUATION_BIT_MASK = 0b0100_0000;
private volatile String value;
private final byte[] bytes;
private final int offset;
private final int byteLength;
/**
 * Creates a value that views {@code length} bytes of {@code bytes}, starting at {@code offset}.
 * The array is stored as-is (no copy is made) and is expected to hold UTF-8 encoded text.
 *
 * @param bytes  backing array, must not be {@code null} (only checked when assertions are enabled)
 * @param offset index of the first byte belonging to this value
 * @param length number of bytes belonging to this value
 */
UTF8StringValue(byte[] bytes, int offset, int length) {
    assert bytes != null;
    this.byteLength = length;
    this.offset = offset;
    this.bytes = bytes;
}
/**
 * Hands the raw UTF-8 slice backing this value to the writer, without first decoding it
 * into a {@link String}.
 *
 * @param writer receiver of the bytes
 * @param <E>    exception type the writer may throw
 * @throws E if the writer fails
 */
@Override
public <E extends Exception> void writeTo(ValueWriter<E> writer) throws E {
    writer.writeUTF8(bytes, offset, byteLength);
}
/**
 * Compares this value with another one. Two {@code UTF8StringValue}s are compared directly
 * on their byte slices; any other kind of value is delegated to the superclass.
 *
 * @param value the value to compare with
 * @return {@code true} if the values are considered equal
 */
@Override
public boolean equals(Value value) {
    if (!(value instanceof UTF8StringValue other)) {
        return super.equals(value);
    }
    final int thisEnd = offset + byteLength;
    final int otherEnd = other.offset + other.byteLength;
    return Arrays.equals(bytes, offset, thisEnd, other.bytes, other.offset, otherEnd);
}
/**
 * Returns the decoded string, decoding the byte slice as UTF-8 on first use and caching the
 * result in the {@code value} field. The field is re-checked under the monitor of this
 * instance so that the decoding happens at most once.
 *
 * @return the string represented by the byte slice
 */
@Override
protected String value() {
    String decoded = value;
    if (decoded != null) {
        return decoded;
    }
    synchronized (this) {
        decoded = value;
        if (decoded == null) {
            decoded = new String(bytes, offset, byteLength, StandardCharsets.UTF_8);
            value = decoded;
        }
        return decoded;
    }
}
/**
 * Returns the length of this string measured in code points, computed by scanning the
 * byte slice (the string itself is not decoded).
 *
 * @return number of code points in this value
 */
@Override
public int length() {
    final int codePoints = numberOfCodePoints(bytes, offset, byteLength);
    return codePoints;
}
/**
 * Tells whether this value contains no bytes at all, either because the slice has zero
 * length or because the backing array itself is empty.
 *
 * @return {@code true} if this value represents the empty string
 */
@Override
public boolean isEmpty() {
    if (byteLength == 0) {
        return true;
    }
    return bytes.length == 0;
}
/**
 * Estimates the heap usage of this value as the shallow size of the instance plus the
 * size reported for the backing array. Note that the whole array is passed to
 * {@code sizeOf}, not only the slice covered by this value.
 *
 * @return estimated number of bytes used on the heap
 */
@Override
public long estimatedHeapUsage() {
    final long backingArraySize = sizeOf(bytes);
    return SHALLOW_SIZE + backingArraySize;
}
/**
 * Counts the code points in a slice of UTF-8 encoded bytes without decoding them.
 *
 * @param bytes      array holding the encoded text
 * @param offset     index of the first byte of the slice
 * @param byteLength number of bytes in the slice
 * @return number of code points found in the slice
 */
private static int numberOfCodePoints(byte[] bytes, int offset, int byteLength) {
    final int end = offset + byteLength;
    int codePoints = 0;
    int pos = offset;
    while (pos < end) {
        byte lead = bytes[pos];
        if (lead >= 0) {
            // High bit clear: a single-byte (ASCII) code point.
            pos++;
        } else {
            // The number of leading one-bits of the first byte is the number of bytes
            // in the sequence: 110xxxxx -> 2, 1110xxxx -> 3, 11110xxx -> 4.
            do {
                pos++;
                lead = (byte) (lead << 1);
            } while (lead < 0);
        }
        codePoints++;
    }
    return codePoints;
}
/**
 * Computes the hash of this value from its code points: starting from 1, every code point
 * is folded in as {@code HASH_CONSTANT * hash + codePoint}. An empty value hashes to 0.
 *
 * @return the hash code to be memoized
 */
@Override
protected int computeHashToMemoize() {
    if (byteLength == 0 || bytes.length == 0) {
        return 0;
    }
    final int end = offset + byteLength;
    final CodePointCursor cursor = new CodePointCursor(bytes, offset);
    int result = 1;
    while (cursor.i < end) {
        final int codePoint = (int) cursor.nextCodePoint();
        result = HASH_CONSTANT * result + codePoint;
    }
    return result;
}
/**
 * Feeds this value into the given hash function. Code points are consumed in pairs and
 * packed into one {@code long} per update: the first of the pair in the upper 32 bits, the
 * second (or 0 if there is none) in the lower bits. Finally the number of code points read
 * is fed in as well.
 *
 * @param hashFunction the hash function to update with
 * @param hash         the intermediate hash to continue from
 * @return the updated intermediate hash
 */
@Override
public long updateHash(HashFunction hashFunction, long hash) {
    final int end = offset + byteLength;
    final CodePointCursor cursor = new CodePointCursor(bytes, offset);
    long running = hash;
    while (cursor.i < end) {
        final long upper = cursor.nextCodePoint() << 32;
        final long lower = cursor.i < end ? cursor.nextCodePoint() : 0L;
        running = hashFunction.update(running, upper + lower);
    }
    return hashFunction.update(running, cursor.codePointCount);
}
/**
 * Forward cursor over UTF-8 encoded bytes that decodes one code point per call.
 * The cursor does no bounds checking of its own; callers compare the current position
 * against the end of their slice before asking for the next code point.
 */
public static final class CodePointCursor {
    private final byte[] values;
    /** Index of the first byte of the next code point to read. */
    private int i;
    /** Number of calls made to {@link #nextCodePoint()} so far. */
    private int codePointCount;

    /**
     * @param values array holding the encoded text
     * @param offset index of the first byte to read
     */
    public CodePointCursor(byte[] values, int offset) {
        this.values = values;
        this.i = offset;
    }

    /**
     * Decodes the code point starting at the current position and moves past it.
     *
     * @return the decoded code point
     * @throws IllegalArgumentException if the leading byte announces a sequence length other
     *                                  than 2, 3 or 4 bytes
     */
    public long nextCodePoint() {
        codePointCount++;
        final byte lead = values[i];
        if (lead >= 0) {
            // High bit clear: a single-byte (ASCII) code point.
            i++;
            return lead;
        }
        // Multi-byte layouts:
        //   110xxxxx 10xxxxxx
        //   1110xxxx 10xxxxxx 10xxxxxx
        //   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        // Shifting out the leading one-bits gives the sequence length and leaves the payload
        // bits of the first byte, shifted left by that length, in 'payload'.
        int width = 0;
        byte payload = lead;
        do {
            width++;
            payload = (byte) (payload << 1);
        } while (payload < 0);
        final int decoded = codePoint(values, payload, i, width);
        i += width;
        return decoded;
    }
}
/**
 * Backward cursor over UTF-8 encoded bytes that decodes one code point per call, starting
 * from the last byte of a slice. The cursor does no bounds checking of its own; callers
 * must stop before it moves past the start of their slice.
 */
public static final class ReverseCodePointCursor {
    private final byte[] values;
    /** Index of the last byte of the next code point to read (moving backwards). */
    private int i;

    /**
     * @param values     array holding the encoded text
     * @param offset     index of the first byte of the slice
     * @param byteLength number of bytes in the slice
     */
    public ReverseCodePointCursor(byte[] values, int offset, int byteLength) {
        this.values = values;
        this.i = offset + byteLength - 1;
    }

    /**
     * Decodes the code point ending at the current position and moves in front of it.
     *
     * @return the decoded code point
     * @throws IllegalArgumentException if the sequence found is not 2, 3 or 4 bytes long
     */
    public long previousCodePoint() {
        byte current = values[i];
        if (current >= 0) {
            // High bit clear: a single-byte (ASCII) code point.
            i--;
            return current;
        }
        // Multi-byte layouts:
        //   110xxxxx 10xxxxxx
        //   1110xxxx 10xxxxxx 10xxxxxx
        //   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        // Walk backwards over continuation bytes (10xxxxxx) until the leading byte is found.
        int width = 1;
        for (; (current & NON_CONTINUATION_BIT_MASK) == 0; width++) {
            current = values[--i];
        }
        // Shift the length marker out of the leading byte, as the decoder expects.
        final byte payload = (byte) (current << width);
        final int decoded = codePoint(values, payload, i, width);
        i -= 1;
        return decoded;
    }
}
/**
 * Returns the part of this string that starts at code point {@code start} and is at most
 * {@code length} code points long. The result shares the backing array with this value.
 * If {@code start} lies at or beyond the end of the string the empty string is returned; if
 * fewer than {@code length} code points remain, the result runs to the end of the string.
 *
 * @param start  index, in code points, of the first code point to include
 * @param length maximum number of code points to include
 * @return the requested substring
 * @throws IndexOutOfBoundsException if {@code start} or {@code length} is negative
 */
@Override
public TextValue substring(int start, int length) {
    if (start < 0 || length < 0) {
        throw new IndexOutOfBoundsException("Cannot handle negative start index nor negative length");
    }
    if (length == 0) {
        return StringValue.EMPTY;
    }
    final int limit = offset + byteLength;
    final int endCodePoint = start + length;
    int firstByte = -1;
    int pastLastByte = -1;
    int pos = offset;
    // 'seen' is the number of code points that lie in front of 'pos'.
    for (int seen = 0; pos < limit; seen++) {
        if (seen == start) {
            firstByte = pos;
        }
        if (seen == endCodePoint) {
            pastLastByte = pos;
            break;
        }
        byte lead = bytes[pos];
        if (lead >= 0) {
            // High bit clear: a single-byte (ASCII) code point.
            pos++;
        } else {
            // One byte per leading one-bit of the first byte.
            do {
                pos++;
                lead = (byte) (lead << 1);
            } while (lead < 0);
        }
    }
    if (firstByte < 0) {
        return StringValue.EMPTY;
    }
    final int stop = pastLastByte < 0 ? limit : pastLastByte;
    return new UTF8StringValue(bytes, firstByte, stop - firstByte);
}
/**
 * Removes leading and trailing whitespace. The result shares the backing array with this
 * value; an empty value is returned unchanged.
 *
 * @return the trimmed value, or {@code StringValue.EMPTY} if only whitespace was found
 */
@Override
public TextValue trim() {
    if (byteLength == 0 || bytes.length == 0) {
        return this;
    }
    final int firstKept = trimLeftIndexWhitespace();
    final int lastKept = trimRightIndexWhitespace();
    if (lastKept < firstKept) {
        return StringValue.EMPTY;
    }
    final int keptBytes = lastKept + 1 - firstKept;
    return new UTF8StringValue(bytes, firstKept, Math.max(keptBytes, 0));
}
/**
 * Removes leading whitespace. The result shares the backing array with this value; when
 * there is nothing to remove this very instance is returned.
 *
 * @return the value without leading whitespace
 */
@Override
public TextValue ltrim() {
    final byte[] values = bytes;
    if (byteLength == 0 || values.length == 0) {
        return this;
    }
    final int firstKept = trimLeftIndexWhitespace();
    assert (firstKept <= values.length);
    final int skipped = firstKept - offset;
    if (skipped == 0) {
        return this;
    }
    return new UTF8StringValue(values, firstKept, byteLength - skipped);
}
/**
 * Removes trailing whitespace. The result shares the backing array with this value; an
 * empty value is returned unchanged.
 *
 * @return the value without trailing whitespace, or {@code StringValue.EMPTY} if only
 *         whitespace was found
 */
@Override
public TextValue rtrim() {
    if (bytes.length == 0 || byteLength == 0) {
        return this;
    }
    final int endIndex = trimRightIndexWhitespace();
    // When everything is whitespace the scan ends just in front of the slice, which is
    // index offset - 1 and not necessarily a negative index. Compare against the start of
    // the slice so that values with a non-zero offset also yield the shared empty value.
    if (endIndex < offset) {
        return StringValue.EMPTY;
    }
    return new UTF8StringValue(bytes, offset, endIndex + 1 - offset);
}
/**
 * Removes, from both ends, every code point that occurs in {@code trimCharacterString}.
 * The result shares the backing array with this value; an empty value is returned
 * unchanged.
 *
 * @param trimCharacterString the set of code points to strip
 * @return the trimmed value, or {@code StringValue.EMPTY} if nothing remains
 */
@Override
public TextValue trim(TextValue trimCharacterString) {
    if (byteLength == 0 || bytes.length == 0) {
        return this;
    }
    final int firstKept = trimLeftIndex(trimCharacterString);
    final int lastKept = trimRightIndex(trimCharacterString);
    if (lastKept < firstKept) {
        return StringValue.EMPTY;
    }
    final int keptBytes = lastKept + 1 - firstKept;
    return new UTF8StringValue(bytes, firstKept, Math.max(keptBytes, 0));
}
/**
 * Removes, from the start, every code point that occurs in {@code trimCharacterString}.
 * The result shares the backing array with this value; when there is nothing to remove
 * this very instance is returned.
 *
 * @param trimCharacterString the set of code points to strip
 * @return the value without the leading trim characters
 */
@Override
public TextValue ltrim(TextValue trimCharacterString) {
    final byte[] values = bytes;
    if (byteLength == 0 || values.length == 0) {
        return this;
    }
    final int firstKept = trimLeftIndex(trimCharacterString);
    assert (firstKept <= values.length);
    final int skipped = firstKept - offset;
    if (skipped == 0) {
        return this;
    }
    return new UTF8StringValue(values, firstKept, byteLength - skipped);
}
/**
 * Removes, from the end, every code point that occurs in {@code trimCharacterString}.
 * The result shares the backing array with this value; an empty value is returned
 * unchanged.
 *
 * @param trimCharacterString the set of code points to strip
 * @return the value without the trailing trim characters, or {@code StringValue.EMPTY} if
 *         nothing remains
 */
@Override
public TextValue rtrim(TextValue trimCharacterString) {
    if (bytes.length == 0 || byteLength == 0) {
        return this;
    }
    final int endIndex = trimRightIndex(trimCharacterString);
    // "Nothing left" means the last kept byte lies in front of this slice. Comparing against
    // 0 only works for slices that start at index 0; for any other offset it would let a
    // zero or even negative length through to the constructor.
    if (endIndex < offset) {
        return StringValue.EMPTY;
    }
    return new UTF8StringValue(bytes, offset, endIndex + 1 - offset);
}
/**
 * Concatenates this value with another text value. When the other value is also a
 * {@code UTF8StringValue} the two byte slices are copied into a fresh array; otherwise the
 * two values are concatenated as strings.
 *
 * @param other the value to append
 * @return a value holding this text followed by the other text
 */
@Override
public TextValue plus(TextValue other) {
    if (!(other instanceof UTF8StringValue rhs)) {
        return Values.stringValue(stringValue() + other.stringValue());
    }
    final byte[] joined = new byte[byteLength + rhs.byteLength];
    System.arraycopy(bytes, offset, joined, 0, byteLength);
    System.arraycopy(rhs.bytes, rhs.offset, joined, byteLength, rhs.byteLength);
    return utf8Value(joined);
}
/**
 * Tells whether this text begins with the given text. Two {@code UTF8StringValue}s are
 * compared on their bytes; otherwise the comparison is done on the decoded strings.
 *
 * @param other the expected prefix
 * @return {@code true} if this value starts with {@code other}
 */
@Override
public boolean startsWith(TextValue other) {
    return other instanceof UTF8StringValue prefix
            ? startsWith(prefix, 0)
            : value().startsWith(other.stringValue());
}
/**
 * Tells whether this text ends with the given text. Two {@code UTF8StringValue}s are
 * compared on their bytes, aligning the other value with the end of this slice; otherwise
 * the comparison is done on the decoded strings.
 *
 * @param other the expected suffix
 * @return {@code true} if this value ends with {@code other}
 */
@Override
public boolean endsWith(TextValue other) {
    if (!(other instanceof UTF8StringValue suffix)) {
        return value().endsWith(other.stringValue());
    }
    final int alignedStart = byteLength - suffix.byteLength;
    return startsWith(suffix, alignedStart);
}
/**
 * Tells whether the given text occurs somewhere inside this text. Two
 * {@code UTF8StringValue}s are searched byte by byte; otherwise the search is done on the
 * decoded strings.
 *
 * @param other the text to look for
 * @return {@code true} if {@code other} occurs in this value
 */
@Override
public boolean contains(TextValue other) {
    if (!(other instanceof final UTF8StringValue substring)) {
        return value().contains(other.stringValue());
    }
    final int needleLength = substring.byteLength;
    if (byteLength == 0) {
        // The empty string only contains the empty string.
        return needleLength == 0;
    }
    if (needleLength == 0) {
        return true;
    }
    if (needleLength > byteLength) {
        return false;
    }
    final byte[] needle = substring.bytes;
    final int needleStart = substring.offset;
    final byte firstByte = needle[needleStart];
    // Last position in this slice at which the whole needle still fits.
    final int lastCandidate = offset + byteLength - needleLength;
    for (int candidate = offset; candidate <= lastCandidate; candidate++) {
        if (bytes[candidate] != firstByte) {
            continue;
        }
        // First byte matches, check the remaining bytes.
        int matched = 1;
        while (matched < needleLength && bytes[candidate + matched] == needle[needleStart + matched]) {
            matched++;
        }
        if (matched == needleLength) {
            return true;
        }
    }
    return false;
}
/**
 * Tells whether the bytes of {@code prefix} are found in this slice at byte position
 * {@code startPos}, counted from the start of the slice.
 *
 * @param prefix   the bytes to look for
 * @param startPos byte position, relative to the start of this slice, to compare from
 * @return {@code false} if {@code startPos} is negative, if {@code prefix} has more bytes
 *         than this value, or if any byte differs; {@code true} otherwise
 */
private boolean startsWith(UTF8StringValue prefix, int startPos) {
    final int prefixLength = prefix.byteLength;
    if (startPos < 0 || prefixLength > byteLength) {
        return false;
    }
    final int thisStart = offset + startPos;
    final int prefixStart = prefix.offset;
    for (int k = 0; k < prefixLength; k++) {
        if (bytes[thisStart + k] != prefix.bytes[prefixStart + k]) {
            return false;
        }
    }
    return true;
}
/**
 * Returns this text with its code points in reverse order. The bytes of a multi-byte code
 * point are moved as one unit so that every code point stays intact. The result is backed
 * by a new array.
 *
 * @return the reversed text, or {@code StringValue.EMPTY} for an empty value
 */
@Override
public TextValue reverse() {
    if (byteLength == 0 || bytes.length == 0) {
        return StringValue.EMPTY;
    }
    final int end = offset + byteLength;
    final byte[] reversed = new byte[byteLength];
    int src = offset;
    while (src < end) {
        final byte lead = bytes[src];
        if (lead >= 0) {
            // Single-byte (ASCII) code point: mirror its position in the new array.
            reversed[end - 1 - src] = lead;
            src++;
        } else {
            // Multi-byte layouts:
            //   110xxxxx 10xxxxxx
            //   1110xxxx 10xxxxxx 10xxxxxx
            //   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            // The number of leading one-bits is the length of the sequence.
            int width = 0;
            for (byte shifted = lead; shifted < 0; shifted = (byte) (shifted << 1)) {
                width++;
            }
            // Copy the sequence in its original byte order to the mirrored position,
            // e.g. [A, b1,b2, B] -> [B, b1,b2, A].
            System.arraycopy(bytes, src, reversed, end - src - width, width);
            src += width;
        }
    }
    return new UTF8StringValue(reversed, 0, reversed.length);
}
/**
 * Orders this text relative to another one. Two {@code UTF8StringValue}s are compared
 * directly on their byte slices; any other kind of text is delegated to the superclass.
 *
 * @param other the text to compare with
 * @return a negative number, zero or a positive number if this text sorts before, equal to
 *         or after {@code other}
 */
@Override
public int compareTo(TextValue other) {
    if (other instanceof UTF8StringValue otherUTF8) {
        return byteArrayCompare(
                bytes, offset, byteLength, otherUTF8.bytes, otherUTF8.offset, otherUTF8.byteLength);
    }
    return super.compareTo(other);
}
/**
 * Compares two byte slices lexicographically, treating every byte as an unsigned value.
 * If one slice is a prefix of the other, the shorter one sorts first.
 *
 * @return the difference of the first pair of differing bytes (as unsigned values), or the
 *         difference of the lengths if no such pair exists
 */
private static int byteArrayCompare(
        byte[] value1, int value1Offset, int value1Length, byte[] value2, int value2Offset, int value2Length) {
    final int common = Math.min(value1Length, value2Length);
    for (int k = 0; k < common; k++) {
        final int left = Byte.toUnsignedInt(value1[value1Offset + k]);
        final int right = Byte.toUnsignedInt(value2[value2Offset + k]);
        if (left != right) {
            return left - right;
        }
    }
    return value1Length - value2Length;
}
/**
 * Creates a matcher for the given pattern over the decoded string of this value.
 *
 * @param pattern the pattern to match against
 * @return a matcher over this text
 */
@Override
protected Matcher matcher(Pattern pattern) {
    final String decoded = value();
    return pattern.matcher(decoded);
}
/**
 * Finds the index, in the backing array, of the first byte of the first code point that is
 * not whitespace according to {@link Character#isWhitespace(int)}.
 *
 * @return that index, or the index just past the slice if every code point is whitespace
 */
private int trimLeftIndexWhitespace() {
    final int end = offset + byteLength;
    int pos = offset;
    while (pos < end) {
        final byte lead = bytes[pos];
        final int width;
        final int decoded;
        if (lead >= 0) {
            // High bit clear: a single-byte (ASCII) code point.
            width = 1;
            decoded = lead;
        } else {
            // Multi-byte layouts:
            //   110xxxxx 10xxxxxx
            //   1110xxxx 10xxxxxx 10xxxxxx
            //   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            // The number of leading one-bits is the length of the sequence.
            int leadingOnes = 0;
            byte payload = lead;
            do {
                leadingOnes++;
                payload = (byte) (payload << 1);
            } while (payload < 0);
            decoded = codePoint(bytes, payload, pos, leadingOnes);
            width = leadingOnes;
        }
        if (!Character.isWhitespace(decoded)) {
            return pos;
        }
        pos += width;
    }
    return pos;
}
/**
 * Finds the index, in the backing array, of the first byte of the first code point that
 * does not occur in {@code trimCharacterString}.
 *
 * @param trimCharacterString the set of code points to skip
 * @return that index; the start of the slice if {@code trimCharacterString} is empty; or the
 *         index just past the slice if every code point is to be skipped
 */
private int trimLeftIndex(TextValue trimCharacterString) {
    final int[] trimCodePoints = trimCharacterString.stringValue().codePoints().toArray();
    if (trimCharacterString.isEmpty()) {
        return offset;
    }
    final int end = offset + byteLength;
    final CodePointCursor cursor = new CodePointCursor(bytes, offset);
    while (cursor.i < end) {
        // Remember where this code point starts before the cursor moves past it.
        final int candidate = cursor.i;
        final int decoded = (int) cursor.nextCodePoint();
        if (!ArrayUtils.contains(trimCodePoints, decoded)) {
            return candidate;
        }
    }
    return cursor.i;
}
/**
 * Finds the index, in the backing array, of the last byte of the last code point that is
 * not whitespace according to {@link Character#isWhitespace(int)}.
 *
 * @return that index, or the index just in front of the slice if every code point is
 *         whitespace
 */
private int trimRightIndexWhitespace() {
    int pos = offset + byteLength - 1;
    while (pos >= offset) {
        byte current = bytes[pos];
        if (current >= 0) {
            // High bit clear: a single-byte (ASCII) code point.
            if (!Character.isWhitespace(current)) {
                return pos;
            }
        } else {
            // Multi-byte layouts:
            //   110xxxxx 10xxxxxx
            //   1110xxxx 10xxxxxx 10xxxxxx
            //   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            // Walk backwards over continuation bytes (10xxxxxx) until the leading byte.
            int width = 1;
            for (; (current & NON_CONTINUATION_BIT_MASK) == 0; width++) {
                current = bytes[--pos];
            }
            // Shift the length marker out of the leading byte, as the decoder expects.
            final byte payload = (byte) (current << width);
            final int decoded = codePoint(bytes, payload, pos, width);
            if (!Character.isWhitespace(decoded)) {
                // 'pos' is at the leading byte; report the last byte of the sequence.
                return Math.min(pos + width - 1, bytes.length - 1);
            }
        }
        pos--;
    }
    return pos;
}
/**
 * Finds the index, in the backing array, of the last byte of the last code point that does
 * not occur in {@code trimCharacterString}.
 *
 * @param trimCharacterString the set of code points to skip
 * @return that index; the last byte of the slice if {@code trimCharacterString} is empty; or
 *         {@code offset - 1} (the index just in front of the slice) if every code point is to
 *         be skipped
 */
private int trimRightIndex(TextValue trimCharacterString) {
    final int lastByte = offset + byteLength - 1;
    final int[] trimCodePoints = trimCharacterString.stringValue().codePoints().toArray();
    if (trimCharacterString.isEmpty()) {
        return lastByte;
    }
    final var cursor = new ReverseCodePointCursor(bytes, offset, byteLength);
    // Scan down to and including the first byte of this slice. Stopping at index 0 instead
    // would skip the first byte of a slice that starts at 0 (so a string made up solely of
    // trim characters would keep its first one) and would run into bytes in front of a slice
    // that starts later in the array.
    while (cursor.i >= offset) {
        // Remember where this code point ends before the cursor moves in front of it.
        final int candidate = cursor.i;
        final int decoded = (int) cursor.previousCodePoint();
        if (!ArrayUtils.contains(trimCodePoints, decoded)) {
            return candidate;
        }
    }
    return offset - 1;
}
/**
 * Identifies how this value is represented.
 *
 * @return always {@code ValueRepresentation.UTF8_TEXT}
 */
@Override
public ValueRepresentation valueRepresentation() {
    final ValueRepresentation representation = ValueRepresentation.UTF8_TEXT;
    return representation;
}
/**
 * Returns the backing array itself, not a copy. The array is returned in full: it is not
 * narrowed to the slice (offset and byte length) that this value covers.
 *
 * @return the backing byte array
 */
public byte[] bytes() {
    return this.bytes;
}
/**
 * Assembles the code point of a multi-byte UTF-8 sequence.
 *
 * @param bytes       array holding the encoded text
 * @param currentByte the leading byte of the sequence after it has been shifted left by
 *                    {@code bytesNeeded} bits, i.e. with its length marker removed
 * @param i           index in {@code bytes} of the leading byte
 * @param bytesNeeded number of bytes in the sequence
 * @return the decoded code point
 * @throws IllegalArgumentException if {@code bytesNeeded} is not 2, 3 or 4
 */
private static int codePoint(byte[] bytes, byte currentByte, int i, int bytesNeeded) {
    return switch (bytesNeeded) {
        case 2 -> {
            final int head = currentByte << 4;
            final int tail = bytes[i + 1] & HIGH_BIT_MASK;
            yield head | tail;
        }
        case 3 -> {
            final int head = currentByte << 9;
            final int middle = (bytes[i + 1] & HIGH_BIT_MASK) << 6;
            final int tail = bytes[i + 2] & HIGH_BIT_MASK;
            yield head | middle | tail;
        }
        case 4 -> {
            final int head = currentByte << 14;
            final int second = (bytes[i + 1] & HIGH_BIT_MASK) << 12;
            final int third = (bytes[i + 2] & HIGH_BIT_MASK) << 6;
            final int tail = bytes[i + 3] & HIGH_BIT_MASK;
            yield head | second | third | tail;
        }
        default -> throw new IllegalArgumentException("Malformed UTF8 value");
    };
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy