net.sf.saxon.str.ZenoString Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of Saxon-HE Show documentation
Show all versions of Saxon-HE Show documentation
The XSLT and XQuery Processor
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2018-2022 Saxonica Limited
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
package net.sf.saxon.str;
import net.sf.saxon.expr.sort.EmptyIntIterator;
import net.sf.saxon.transpile.CSharpReplaceBody;
import net.sf.saxon.z.IntIterator;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.function.IntPredicate;
/**
* A ZenoString is an implementation of UnicodeString that comprises a list
* of segments representing substrings of the total string. By convention the
* segments are not themselves ZenoStrings, so the structure is a shallow tree.
* An index holds pointers to the segments and their offsets within the string
* as a whole; this is used to locate the codepoint at any particular location
* in the string.
*
* The segments will always be non-empty. An empty string contains no segments.
*
* The key to the performance of the data structure (and its name) is
* the algorithm for consolidating segments when strings are concatenated,
* so as to keep the number of segments increasing logarithmically with
* the string size, with short segments at the extremities to allow efficient
* further concatenation at the ends.
*
* For further details see the paper by Michael Kay at Balisage 2021.
*/
public class ZenoString extends UnicodeString {
/**
 * The segments making up the string, in order. Per the class contract, no segment is
 * empty and (by convention) no segment is itself a ZenoString.
 */
private List<UnicodeString> segments = new ArrayList<>();
/**
 * For each entry in {@code segments}, the offset of its first code point within the
 * string as a whole. Always the same size as {@code segments}.
 */
private List<Long> offsets = new ArrayList<>();
/**
 * Creates a ZenoString with no segments, which therefore represents the empty string.
 */
private ZenoString() {
    // Nothing to do: both lists are initialised empty by their field initialisers.
}
/**
 * Creates a ZenoString made up of exactly one segment.
 *
 * @param content the sole segment, which is recorded as starting at offset zero
 */
private ZenoString(UnicodeString content) {
    offsets.add(0L);
    segments.add(content);
}
/**
 * A shared ZenoString instance that holds no segments and so represents the
 * zero-length string.
 */
public static final ZenoString EMPTY = new ZenoString();
/**
 * Obtain a ZenoString holding the same characters as a supplied UnicodeString.
 * A value that is already a ZenoString is returned unchanged; an empty value
 * yields a new ZenoString with no segments; anything else becomes the single
 * segment of a new ZenoString.
 *
 * @param content the supplied UnicodeString
 * @return the resulting ZenoString
 */
public static ZenoString of(UnicodeString content) {
    if (content instanceof ZenoString) {
        return (ZenoString) content;
    }
    if (content.isEmpty()) {
        return new ZenoString();
    }
    return new ZenoString(content);
}
/**
 * Locate the segment that holds the character at a given position in the string.
 *
 * @param offset the position of the character within the whole string; must be
 *               at least zero and less than the length of the string
 * @return the index (within the segment list) of the segment containing that character
 * @throws IndexOutOfBoundsException if the string has no segments, or if the
 *                                   supplied offset lies outside the string
 */
private int segmentForOffset(long offset) {
    if (segments.isEmpty()) {
        throw new IndexOutOfBoundsException("ZenoString is empty");
    }
    final int lastIndex = offsets.size() - 1;
    final int found = binarySearch(offset, 0, lastIndex);
    if (found >= 0) {
        return found;
    }
    throw new IndexOutOfBoundsException("Index " + offset + " out of range 0-" + (length()-1));
}
/**
 * Binary search over the segment start offsets for the segment containing a
 * given position. Relies on {@code offsets} being in ascending order.
 *
 * @param offset the position being sought, relative to the whole string
 * @param start  the lowest segment index to consider (inclusive)
 * @param end    the highest segment index to consider (inclusive)
 * @return the index of the segment whose range covers {@code offset}, or -1 if
 *         the candidate segment does not contain it
 */
private int binarySearch(long offset, int start, int end) {
    int low = start;
    int high = end;
    // Narrow the range to a single candidate; the midpoint is rounded upwards so
    // that "low = mid" always makes progress.
    while (low != high) {
        int mid = low + (high - low + 1) / 2;
        if (offsets.get(mid) > offset) {
            high = mid - 1;
        } else {
            low = mid;
        }
    }
    long segmentStart = offsets.get(low);
    long segmentEnd = segmentStart + segments.get(low).length();
    boolean inside = segmentStart <= offset && offset < segmentEnd;
    return inside ? low : -1;
}
/**
 * Get an iterator over the code points present in the string. The iterator
 * walks the segments in order, delegating to each segment's own code point
 * iterator in turn.
 *
 * @return an iterator that delivers the individual code points
 */
@Override
@CSharpReplaceBody(code="return new Saxon.Impl.Overrides.ZenoStringCodepoints(segments);")
public IntIterator codePoints() {
    if (isEmpty()) {
        return EmptyIntIterator.getInstance();
    }
    return new IntIterator() {
        // Iterates over the segments themselves
        final Iterator<UnicodeString> outerIterator = segments.iterator();
        // Iterates over the code points of the current segment; null between segments
        IntIterator innerIterator;

        @Override
        public boolean hasNext() {
            if (innerIterator == null) {
                return outerIterator.hasNext();
            } else if (innerIterator.hasNext()) {
                return true;
            } else {
                innerIterator = null;
                return outerIterator.hasNext();
            }
        }

        @Override
        public int next() {
            // Also advance when the current segment is exhausted, so that next()
            // behaves correctly even if the caller did not call hasNext() first.
            if (innerIterator == null || !innerIterator.hasNext()) {
                if (outerIterator.hasNext()) {
                    innerIterator = outerIterator.next().codePoints();
                } else {
                    throw new NoSuchElementException();
                }
            }
            return innerIterator.next();
        }
    };
}
/**
 * Get the length of the string, computed as the start offset of the final
 * segment plus that segment's own length.
 *
 * @return the number of code points in the string (zero if there are no segments)
 */
@Override
public long length() {
    if (segments.isEmpty()) {
        return 0L;
    }
    final int last = segments.size() - 1;
    return offsets.get(last) + segments.get(last).length();
}
/**
 * Ask whether the string is empty. Because segments are never empty, this is
 * the case exactly when there are no segments.
 *
 * @return true if the length of the string is zero
 */
@Override
public boolean isEmpty() {
    return segments.size() == 0;
}
/**
 * Get the number of bits needed to hold all the characters in this string,
 * which is the largest width reported by any segment.
 *
 * @return 7 for ascii characters, 8 for latin-1, 16 for BMP, 24 for general Unicode.
 */
@Override
public int getWidth() {
    int widest = 7;
    for (UnicodeString segment : segments) {
        final int segmentWidth = segment.getWidth();
        if (segmentWidth == 24) {
            // Nothing can be wider, so there is no need to look further
            return 24;
        }
        if (segmentWidth > widest) {
            widest = segmentWidth;
        }
    }
    return widest;
}
/**
* Get the position of the first occurrence of the specified codepoint,
* starting the search at a given position in the string
*
* @param codePoint the sought codePoint
* @param from the position from which the search should start (0-based), in the
* range 0 to length()-1
* @return the position (0-based) of the first occurrence found, or -1 if not found
* @throws IndexOutOfBoundsException if the from
value is out of range
*/
@Override
public long indexOf(int codePoint, long from) {
from = Math.max(from, 0);
if (from >= length()) {
return -1L;
}
int first = segmentForOffset(from);
for (int i=first; i= 0) {
return pos + offset;
}
}
return -1;
}
/**
 * Get the position of the first code point that satisfies a supplied predicate,
 * starting the search at a given position in the string
 *
 * @param predicate the condition that the sought code point must satisfy
 * @param from the position from which the search should start (0-based). A negative
 *             value is treated as zero; a value at or beyond the end of the string
 *             (which includes any value when the string is empty) gives a result of -1.
 * @return the position (0-based) of the first matching code point, or -1 if none is found
 */
@Override
public long indexWhere(IntPredicate predicate, long from) {
    // Same treatment of the start position as indexOf(): without this, an empty
    // string or an out-of-range start made segmentForOffset() throw.
    from = Math.max(from, 0);
    if (from >= length()) {
        return -1L;
    }
    int first = segmentForOffset(from);
    for (int i = first; i < segments.size(); i++) {
        UnicodeString segment = segments.get(i);
        long offset = offsets.get(i);
        // Only the first segment is searched from a mid-point; later ones from their start
        long pos = segment.indexWhere(predicate, i == first ? from - offset : 0);
        if (pos >= 0) {
            // Convert the segment-relative position to a position in the whole string
            return pos + offset;
        }
    }
    return -1;
}
/**
 * Get the code point at a given position in the string, by locating the
 * segment that covers the position and asking that segment for the code point.
 *
 * @param index the given position (0-based)
 * @return the code point found at that position
 * @throws IndexOutOfBoundsException if the index is out of range
 */
@Override
public int codePointAt(long index) {
    final int segmentIndex = segmentForOffset(index);
    final long positionInSegment = index - offsets.get(segmentIndex);
    return segments.get(segmentIndex).codePointAt(positionInSegment);
}
/**
* Get a substring of this codepoint sequence, with a given start and end position
*
* @param start the start position (0-based): that is, the position of the first
* code point to be included
* @param end the end position (0-based): specifically, the position of the first
* code point not to be included
*/
@Override
public UnicodeString substring(long start, long end) {
checkSubstringBounds(start, end);
if (start == end) {
return EmptyUnicodeString.getInstance();
} else if (start + 1 == end) {
return new UnicodeChar(codePointAt(start));
}
int first = segmentForOffset(start);
int last = segmentForOffset(end-1);
if (first == last) {
UnicodeString segment = segments.get(first);
long offset = offsets.get(first);
return segment.substring(start - offset, end - offset);
} else {
ZenoString z = ZenoString.of(segments.get(first).substring(start - offsets.get(first)));
for (int i=first+1; i(segments);
z.segments.addAll(((ZenoString)other).segments);
z.offsets = new ArrayList<>(offsets);
long len = length();
for (long offset : ((ZenoString) other).offsets) {
z.offsets.add(offset + len);
}
return (len < 32 || other.length() < 32 ? z.consolidate0() : z);
} else {
ZenoString z = new ZenoString();
z.segments = new ArrayList<>(segments);
z.offsets = new ArrayList<>(offsets);
z.segments.add(other);
z.offsets.add(length());
return z.consolidate0();
}
}
/**
 * Ask each segment in turn to copy itself into a byte array using its 8-bit
 * copy operation, each segment being written immediately after the previous one.
 *
 * @param target the array to be written to
 * @param offset the position in the array at which the first segment is written
 */
@Override
void copy8bit(byte[] target, int offset) {
    int position = offset;
    for (UnicodeString segment : segments) {
        segment.copy8bit(target, position);
        position += segment.length32();
    }
}
/**
 * Ask each segment in turn to copy itself into a char array using its 16-bit
 * copy operation, each segment being written immediately after the previous one.
 *
 * @param target the array to be written to
 * @param offset the position in the array at which the first segment is written
 */
@Override
void copy16bit(char[] target, int offset) {
    int position = offset;
    for (UnicodeString segment : segments) {
        segment.copy16bit(target, position);
        position += segment.length32();
    }
}
/**
 * Ask each segment in turn to copy itself into a byte array using its 24-bit
 * copy operation. The write position advances by three array entries for each
 * code point of the segment just written.
 *
 * @param target the array to be written to
 * @param offset the position in the array at which the first segment is written
 */
@Override
void copy24bit(byte[] target, int offset) {
    int position = offset;
    for (UnicodeString segment : segments) {
        segment.copy24bit(target, position);
        position += 3 * segment.length32();
    }
}
/**
 * Merge adjacent segments, scanning from the penultimate segment back to the
 * first. A segment is merged with its right-hand neighbour when it is no longer
 * than either neighbour (the first segment is treated as having a left-hand
 * neighbour of length zero), or when the pair together has at most 32 code points.
 * Works in-situ, so must only be applied to an instance that is not yet visible
 * to callers.
 *
 * @return this ZenoString, after consolidation
 */
private ZenoString consolidate() {
    final int lastIndex = segments.size() - 1;
    // Length of the segment immediately to the right of the one being examined
    long rightLength = segments.get(lastIndex).length();
    for (int idx = lastIndex - 1; idx >= 0; idx--) {
        final UnicodeString current = segments.get(idx);
        final long currentLength = current.length();
        final long leftLength = (idx > 0) ? segments.get(idx - 1).length() : 0;
        final boolean noLongerThanNeighbours = currentLength <= rightLength && currentLength <= leftLength;
        final boolean pairIsShort = currentLength + rightLength <= 32;
        if (noLongerThanNeighbours || pairIsShort) {
            final UnicodeString merged = concatSegments(current, segments.get(idx + 1));
            segments.set(idx, merged);
            //Instrumentation.count("charCopy", segments.get(i).length32() + segments.get(i + 1).length32());
            //Instrumentation.count("copyOperations");
            segments.remove(idx + 1);
            offsets.remove(idx + 1);
            rightLength = merged.length();
        } else {
            rightLength = currentLength;
        }
    }
    //showSegmentLengths();
    return this;
}
/**
 * Send every segment, in order, to a UnicodeWriter.
 *
 * @param writer the writer to which the string is to be written
 * @throws IOException if the writer fails to accept a segment
 */
public void writeSegments(UnicodeWriter writer) throws IOException {
    final int count = segments.size();
    for (int i = 0; i < count; i++) {
        final UnicodeString segment = segments.get(i);
        writer.write(segment);
    }
}
/**
 * Build a single new string holding the characters of two strings, by copying
 * both into a freshly allocated array. The representation is chosen from the
 * wider of the two operands: a Twine8 if both fit in 8 bits, a Twine16 if both
 * fit in 16 bits, and a Twine24 otherwise.
 *
 * @param left  the string supplying the leading characters
 * @param right the string supplying the trailing characters
 * @return a new string containing the characters of left followed by those of right
 */
public static UnicodeString concatSegments(UnicodeString left, UnicodeString right) {
    final int leftLength = left.length32();
    final int totalLength = leftLength + right.length32();
    final int width = Math.max(left.getWidth(), right.getWidth());
    if (width <= 8) {
        final byte[] bytes = new byte[totalLength];
        left.copy8bit(bytes, 0);
        right.copy8bit(bytes, leftLength);
        return new Twine8(bytes);
    }
    if (width <= 16) {
        final char[] chars = new char[totalLength];
        left.copy16bit(chars, 0);
        right.copy16bit(chars, leftLength);
        return new Twine16(chars);
    }
    // Three array entries are used for each code point
    final byte[] bytes = new byte[totalLength * 3];
    left.copy24bit(bytes, 0);
    right.copy24bit(bytes, leftLength * 3);
    return new Twine24(bytes);
}
/**
 * Merge adjacent segments, scanning from the penultimate segment back to the
 * first: a segment is merged with its right-hand neighbour whenever its length
 * is less than 1.1 times the length of that neighbour. Works in-situ, so must
 * only be applied to an instance that is not yet visible to callers.
 *
 * @return this ZenoString, after consolidation
 */
private ZenoString consolidate0() {
    int idx = segments.size() - 2;
    while (idx >= 0) {
        final UnicodeString current = segments.get(idx);
        final UnicodeString neighbour = segments.get(idx + 1);
        if (current.length() < neighbour.length() * 1.1) {
            segments.set(idx, concatSegments(current, neighbour));
            segments.remove(idx + 1);
            offsets.remove(idx + 1);
        }
        idx--;
    }
    //verifySegmentLengths();
    return this;
}
/**
 * Merge adjacent segments working inwards from both ends of the segment list.
 * The first pass runs forwards over the first half and merges a segment with its
 * right-hand neighbour when their combined length is below 32 shifted left by the
 * segment index. The second pass runs backwards from the last segment and merges
 * a segment with its left-hand neighbour when their combined length is below 32
 * shifted left by the number of steps taken so far. Works in-situ.
 *
 * @return this ZenoString, after consolidation
 */
private ZenoString consolidate1() {
    // Computed once, before any merging: it is not adjusted as segments are removed
    final int halfway = segments.size() / 2;
    for (int idx = 0; idx < halfway - 1; idx++) {
        final UnicodeString current = segments.get(idx);
        final UnicodeString following = segments.get(idx + 1);
        final long limit = 32L << idx;
        if (current.length() + following.length() < limit) {
            final UnicodeString merged = current.concat(following);
            segments.remove(idx + 1);
            offsets.remove(idx + 1);
            segments.set(idx, merged);
        }
    }
    int distance = 0;
    for (int idx = segments.size() - 1; idx > halfway; idx--) {
        // The distance counter advances on every step, whether or not a merge happens
        final long limit = 32L << distance;
        distance++;
        final UnicodeString current = segments.get(idx);
        final UnicodeString preceding = segments.get(idx - 1);
        if (current.length() + preceding.length() < limit) {
            final UnicodeString merged = preceding.concat(current);
            segments.remove(idx);
            offsets.remove(idx);
            segments.set(idx - 1, merged);
        }
    }
    //showSegmentLengths();
    return this;
}
/**
 * Get an equivalent UnicodeString that uses the most economical representation
 * available. No segments gives the empty-string singleton; one segment gives
 * that segment itself; a short string (fewer than 32 segments, fewer than 256
 * code points, width at most 16 bits) gives a BMPString; anything else is
 * returned unchanged.
 *
 * @return an equivalent UnicodeString
 */
@Override
public UnicodeString economize() {
    final int segmentCount = segments.size();
    if (segmentCount == 0) {
        return EmptyUnicodeString.getInstance();
    }
    if (segmentCount == 1) {
        return segments.get(0);
    }
    final boolean shortBmpString = segmentCount < 32 && length() < 256 && getWidth() <= 16;
    if (shortBmpString) {
        // Return a single wrapped Java String, for economy of any subsequent toString() operations.
        return new BMPString(toString());
    }
    return this;
}
/**
 * Convert to a Java String by appending the string form of each segment in order.
 *
 * @return the contents of this ZenoString as a String
 */
@Override
public String toString() {
    StringBuilder sb = new StringBuilder();
    for (UnicodeString str : segments) {
        sb.append(str.toString());
    }
    return sb.toString();
}
/**
 * This method is for diagnostics and unit testing only: it exposes
 * the lengths of the internal segments. This is an implementation detail
 * that is subject to change and does not affect the exposed functionality.
 *
 * @return the lengths of the segments, in segment order, as a newly created list
 */
public List<Long> debugSegmentLengths() {
    List<Long> result = new ArrayList<>(segments.size());
    for (UnicodeString str : segments) {
        result.add(str.length());
    }
    return result;
}
// Diagnostic method: prints the length of every segment to System.err on one
// line, each length followed by a comma and a space.
private void showSegmentLengths() {
    final StringBuilder lengths = new StringBuilder();
    for (UnicodeString segment : segments) {
        lengths.append(segment.length()).append(", ");
    }
    System.err.println(lengths);
}
private void verifySegmentLengths() {
long total = 0;
for (int i = 0; i= 0 && start < z.length()) {
// long next = start + 1000;
// if (next > z.length()) {
// break;
// }
// result = result.concat(z.substring(start, next)).concat(StringConstants.ASTERISK);
// start = next+1;
// occurrences++;
// }
// System.err.println("Replacements: " + occurrences);
// result.showSegmentLengths();
// }
//
// private static void alphabet() {
// String alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
// alphabet = alphabet + alphabet + alphabet + alphabet;
// ZenoString z = new ZenoString();
// for (int i=0; i