net.sf.saxon.str.LargeTextBuffer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of Saxon-HE Show documentation
Show all versions of Saxon-HE Show documentation
The XSLT and XQuery Processor
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2018-2022 Saxonica Limited
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
package net.sf.saxon.str;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * A buffer for accumulating a large amount of text, held as a sequence of segments
 * rather than as one contiguous array.
 * The segments (other than the last) have a fixed size of 65536 codepoints,
 * and each segment may use one byte per codepoint, two bytes per codepoint, or three bytes per
 * codepoint, depending on the largest codepoint present in that segment.
 * This is more efficient than a buffer backed by a contiguous array of characters
 * in cases where the size is likely to grow very large, and where substring operations
 * are rare. As used within the TinyTree, extraction of the string value of a node
 * requires character copying only in the case where the value crosses segment
 * boundaries.
 */
public final class LargeTextBuffer {
private final static int BITS = 16;
private final static int SEGLEN = 1 << BITS;
private final static int MASK = SEGLEN - 1;
private List completeSegments;
private Segment lastSegment;
private int lastSegmentLength;
private interface Segment {
/**
* Get the number of bits-per-character in this segment (8, 16, or 24)
* @return the number of bits per character
*/
int getWidth();
/**
* Return a Segment that contains the existing content of this segment, but
* stretched if necessary to accommodate a given length and width
* @param oldLength the number of characters (codepoints) currently used
* in the segment
* @param newLength the number of characters (codepoints) that the new segment
* must have room for
* @param newWidth the number of bits-per-character that the new segment must
* have room for
* @return either this Segment, or a replacement
*/
Segment stretch(int oldLength, int newLength, int newWidth);
/**
* Get the content of this segment as a {@code UnicodeString}
*/
UnicodeString asUnicodeString();
/**
* Get a substring of this segment
* @param start the start offset (in codepoints)
* @param end the end offset (in codepoints)
* @return the substring
*/
UnicodeString substring(int start, int end);
}
/**
* A Segment comprising 8-bit characters (codepoints in the range 0-255)
*/
private static class Segment8 implements Segment {
public byte[] bytes;
public Segment8(byte[] bytes) {
this.bytes = bytes;
}
@Override
public int getWidth() {
return 8;
}
@Override
public Segment stretch(int oldLength, int newLength, int newWidth) {
if (newWidth <= 8) {
if (newLength > bytes.length) {
bytes = Arrays.copyOf(bytes, Math.max(newLength, oldLength*2));
}
return this;
} else if (newWidth == 16) {
char[] array16 = new char[newLength];
// if (Configuration.isAssertionsEnabled()) {
// new Slice8(bytes, 0, oldLength).verifyCharacters();
// }
StringTool.copy8to16(bytes, 0, array16, 0, oldLength);
return new Segment16(array16);
} else {
byte[] array24 = new byte[newLength*3];
StringTool.copy8to24(bytes, 0, array24, 0, oldLength);
return new Segment24(array24);
}
}
@Override
public UnicodeString asUnicodeString() {
return new Twine8(bytes);
}
@Override
public UnicodeString substring(int start, int end) {
return new Slice8(bytes, start, end);
}
}
/**
* A Segment comprising 16-bit characters (codepoints in the range 0-65535)
*/
private static class Segment16 implements Segment {
public char[] chars;
/**
* Construct the segment
*
* @param chars an array of chars holding the codepoints, arranged
* as two bytes per codepoint; the caller warrants that the
* char array contains no surrogates.
*/
public Segment16(char[] chars) {
this.chars = chars;
}
@Override
public int getWidth() {
return 16;
}
@Override
public Segment stretch(int oldLength, int newLength, int newWidth) {
if (newWidth <= 16) {
if (newLength > chars.length) {
chars = Arrays.copyOf(chars, Math.max(newLength, oldLength * 2));
}
return this;
} else {
byte[] array24 = new byte[newLength * 3];
StringTool.copy16to24(chars, 0, array24, 0, oldLength);
return new Segment24(array24);
}
}
@Override
public UnicodeString asUnicodeString() {
return new Twine16(chars);
}
@Override
public UnicodeString substring(int start, int end) {
return new Slice16(chars, start, end);
}
}
/**
* A Segment comprising 24-bit characters (any Unicode codepoints)
*/
private static class Segment24 implements Segment {
public byte[] bytes;
/**
* Construct the segment
* @param bytes an array of bytes holding the codepoints, arranged
* as three bytes per codepoint.
*/
public Segment24(byte[] bytes) {
this.bytes = bytes;
}
@Override
public int getWidth() {
return 24;
}
@Override
public Segment stretch(int oldLength, int newLength, int newWidth) {
if (newLength * 3 > bytes.length ) {
bytes = Arrays.copyOf(bytes, Math.max(newLength * 3, oldLength * 6));
}
return this;
}
@Override
public UnicodeString substring(int start, int length) {
return new Slice24(bytes, start, length);
}
@Override
public UnicodeString asUnicodeString() {
return new Twine24(bytes);
}
}
/**
* Create an empty LargeTextBuffer with default space allocation
*/
public LargeTextBuffer() {
completeSegments = new ArrayList<>(4);
lastSegment = new Segment8(new byte[0]);
lastSegmentLength = 0;
}
private void addSegment(Segment segment) {
completeSegments.add(segment);
}
private Segment getSegment(int n) {
if (n == completeSegments.size()) {
return lastSegment;
} else {
return completeSegments.get(n);
}
}
public void appendUnicodeString(UnicodeString chars) {
int spaceAvailableInLastSegment = SEGLEN - lastSegmentLength;
//System.err.println("appendUnicodeString " + chars.length() + " " + lastSegmentLength);
long charsSupplied = chars.length();
if (charsSupplied < spaceAvailableInLastSegment) {
extendLastSegment(chars);
} else {
long start = 0;
extendLastSegment(chars.substring(0, spaceAvailableInLastSegment));
charsSupplied -= spaceAvailableInLastSegment;
start += spaceAvailableInLastSegment;
while (charsSupplied > SEGLEN) {
//System.err.println("appendUnicodeString start=" + start + " supplied=" + charsSupplied);
extendLastSegment(chars.substring(start, start + SEGLEN));
charsSupplied -= SEGLEN;
start += SEGLEN;
}
if (charsSupplied > 0) {
//System.err.println("appendUnicodeStringZ start=" + start + " supplied=" + charsSupplied);
extendLastSegment(chars.substring(start, start + charsSupplied));
}
}
}
private void extendLastSegment(UnicodeString chars) {
//System.err.println("Extend last segment from " + lastSegmentLength + " with " + chars.length());
lastSegment = lastSegment.stretch(lastSegmentLength, lastSegmentLength + chars.length32(), chars.getWidth());
if (lastSegment instanceof Segment8) {
chars.copy8bit(((Segment8)lastSegment).bytes, lastSegmentLength);
} else if (lastSegment instanceof Segment16) {
chars.copy16bit(((Segment16)lastSegment).chars, lastSegmentLength);
} else {
assert lastSegment instanceof Segment24;
chars.copy24bit(((Segment24)lastSegment).bytes, lastSegmentLength*3);
}
lastSegmentLength += chars.length32();
// if (Configuration.isAssertionsEnabled()) {
// lastSegment.substring(0, lastSegmentLength).verifyCharacters();
// }
if (lastSegmentLength == SEGLEN) {
addSegment(lastSegment);
lastSegment = new Segment8(new byte[1024]);
lastSegmentLength = 0;
}
//showSegmentLengths();
}
private void showSegmentLengths() {
StringBuilder sb = new StringBuilder();
for (Segment s : completeSegments) {
sb.append(s.asUnicodeString().length()).append(", ");
}
sb.append(lastSegmentLength);
System.err.println(sb);
}
/**
* Returns a new character sequence that is a subsequence of this sequence.
* The subsequence starts with the character at the specified index and
* ends with the character at index end - 1. The length of the
* returned sequence is end - start, so if start == end
* then an empty sequence is returned.
*
* @param start the start index, inclusive (codepoints, not bytes)
* @param end the end index, exclusive (codepoints, not bytes)
* @return the specified subsequence
* @throws IndexOutOfBoundsException if start or end are negative,
* if end is greater than length(),
* or if start is greater than end
*/
/*@NotNull*/
public UnicodeString substring(int start, int end) {
int firstSeg = start >> BITS;
int lastSeg = (end - 1) >> BITS;
int lastCP = end & MASK;
if (lastCP == 0) {
lastCP = SEGLEN;
}
if (firstSeg == lastSeg) {
// String falls entirely within one segment
try {
Segment seg = getSegment(firstSeg);
return seg.substring(start & MASK, lastCP);
} catch (ArrayIndexOutOfBoundsException e) {
e.printStackTrace();
throw e;
}
} else {
// Concatenate strings from two or more segments
//System.err.println("Cross-segment s=" + start + " e=" + end);
UnicodeBuilder ub = new UnicodeBuilder();
int segNr = firstSeg;
ub.accept(getSegment(segNr++).substring(start & MASK, SEGLEN));
while (segNr < lastSeg) {
ub.accept(getSegment(segNr++).asUnicodeString());
}
ub.accept(getSegment(lastSeg).substring(0, lastCP));
return ub.toUnicodeString();
}
}
public void close() {
if (lastSegment != null && lastSegmentLength > 0) {
addSegment(lastSegment);
}
lastSegment = null;
}
public int length() {
return completeSegments.size()*SEGLEN + (lastSegment == null ? 0 : lastSegmentLength);
}
/**
* Set the length. If this exceeds the current length, this method is a no-op.
* If this is less than the current length, characters beyond the specified point
* are deleted.
*
* @param newLength the new length
*/
public void setLength(int newLength) {
// used to remove a text node if it's found to be a duplicate
if (newLength < length()) {
int segCount = completeSegments.size();
if (newLength <= segCount * SEGLEN) {
// drop the current "last segment", and make the last segment in the completed list
// the new "last segment"
lastSegment = completeSegments.get(segCount - 1);
completeSegments.remove(segCount - 1);
}
lastSegmentLength = newLength & MASK;
}
}
}