
net.sf.saxon.tree.tiny.LargeStringBuffer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of Saxon-HE Show documentation
Show all versions of Saxon-HE Show documentation
The XSLT and XQuery Processor
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2015 Saxonica Limited.
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
package net.sf.saxon.tree.tiny;
import net.sf.saxon.tree.util.FastStringBuffer;
/**
* This is an implementation of the CharSequence interface: it implements
* a CharSequence as a list of arrays of characters (the individual arrays are known
* as segments). The segments have a fixed size of 65536 characters.
*
* This is more efficient than a buffer backed by a contiguous array of characters
* in cases where the size is likely to grow very large, and where substring operations
* are rare. As used within the TinyTree, extraction of the string value of a node
* requires character copying only in the case where the value crosses segment
* boundaries.
*/
public final class LargeStringBuffer implements AppendableCharSequence {
// Number of low-order bits of a character index that address a position inside a segment
private final static int BITS = 16;
// Capacity of every segment in characters (1 << 16 = 65536)
private final static int SEGLEN = 1 << BITS;
// Mask that extracts the within-segment offset from a character index
private final static int MASK = SEGLEN - 1;
// Variant of LargeStringBuffer using fixed-length segments
// Index of segments: character i is stored at data[i >> BITS][i & MASK]
private char[][] data;
private int length; // total length of the CharSequence
// Number of leading slots of data that currently hold a segment; the next segment is stored at this index
private int segmentsUsed;
/**
* Create an empty LargeStringBuffer with default space allocation
*/
public LargeStringBuffer() {
    // Start with no content and no segments registered ...
    this.length = 0;
    this.segmentsUsed = 0;
    // ... and an index with room for a single segment; the segment itself is allocated on first append
    this.data = new char[1][];
}
/**
* Expand the data structure. Note this only involves expanding the "index" (the list of
* segments), it does not cause any character data to be copied.
*
* @param seg the new segment to be added.
*/
private void addSegment(char[] seg) {
    final int capacity = data.length;
    if (segmentsUsed >= capacity) {
        // The index is full: refuse to go beyond 32768 segments, otherwise double the index.
        if (segmentsUsed == 32768) {
            throw new IllegalStateException("Source document too large: more than 1G characters in text nodes");
        }
        final char[][] grown = new char[capacity * 2][];
        // Only the segment references are copied, never the character data itself
        System.arraycopy(data, 0, grown, 0, segmentsUsed);
        data = grown;
    }
    // Register the new segment in the next free slot
    data[segmentsUsed++] = seg;
}
/**
* Append a CharSequence to this LargeStringBuffer
*
* @param s the data to be appended
*/
public void append(CharSequence s) {
// Although we provide variants of this method for different subtypes, Java decides which to use based
// on the static type of the operand. We want to use the right method based on the dynamic type, to avoid
// creating objects and copying strings unnecessarily. So we do a dynamic dispatch. (This is only necessary
// of course because the CharSequence class offers no getChars() method).
if (s instanceof CompressedWhitespace) {
FastStringBuffer fsb = new FastStringBuffer(FastStringBuffer.C64);
((CompressedWhitespace)s).uncompress(fsb);
append(fsb);
return;
}
final int len = s.length();
char[] firstSeg;
int firstSegOffset = length & MASK;
if (firstSegOffset == 0) {
firstSeg = new char[SEGLEN];
addSegment(firstSeg);
} else {
firstSeg = data[length>>BITS];
}
int firstSegLen;
int fullSegments;
int lastSegLen;
if (len <= SEGLEN - firstSegOffset) {
// all fits in the current segment
firstSegLen = len;
fullSegments = 0;
lastSegLen = 0;
} else {
firstSegLen = SEGLEN - firstSegOffset;
fullSegments = (len - firstSegLen) >> BITS;
lastSegLen = (len - firstSegLen) & MASK;
}
if (s instanceof CharSlice) {
((CharSlice)s).getChars(0, firstSegLen, firstSeg, firstSegOffset);
int start = firstSegLen;
for (int i=0; i 0) {
char[] seg = new char[SEGLEN];
addSegment(seg);
((CharSlice)s).getChars(start, len, seg, 0);
}
length += len;
} else if (s instanceof FastStringBuffer) {
((FastStringBuffer)s).getChars(0, firstSegLen, firstSeg, firstSegOffset);
int start = firstSegLen;
for (int i=0; i 0) {
char[] seg = new char[SEGLEN];
addSegment(seg);
((FastStringBuffer)s).getChars(start, len, seg, 0);
}
length += len;
} else {
if (!(s instanceof String)) {
s = s.toString();
}
((String)s).getChars(0, firstSegLen, firstSeg, firstSegOffset);
int start = firstSegLen;
for (int i=0; i 0) {
char[] seg = new char[SEGLEN];
addSegment(seg);
((String)s).getChars(start, len, seg, 0);
}
length += len;
}
}
/**
* Returns the length of this character sequence. The length is the number
* of 16-bit UTF-16 characters in the sequence.
*
* @return the number of characters in this sequence
*/
public int length() {
    // The running total maintained by append() and setLength()
    return this.length;
}
/**
* Set the length. If this exceeds the current length, this method is a no-op.
* If this is less than the current length, characters beyond the specified point
* are deleted.
*
* @param length the new length
*/
public void setLength(int length) {
    if (length >= this.length) {
        // Never extends the buffer: requests at or beyond the current length are ignored
        return;
    }
    final int wholeSegments = length / SEGLEN;
    final boolean partialSegment = (length & MASK) != 0;
    this.length = length;
    // A partly filled trailing segment still counts as being in use
    this.segmentsUsed = partialSegment ? wholeSegments + 1 : wholeSegments;
}
/**
* Returns the character at the specified index. An index ranges from zero
* to length() - 1. The first character of the sequence is at
* index zero, the next at index one, and so on, as for array
* indexing.
*
* @param index the index of the character to be returned
* @return the specified character
* @throws IndexOutOfBoundsException if the index argument is negative or not less than
* length()
*/
public char charAt(int index) {
    final boolean inRange = index >= 0 && index < length;
    if (!inRange) {
        throw new IndexOutOfBoundsException(String.valueOf(index));
    }
    // High bits select the segment, low bits the position within it
    final char[] segment = data[index >> BITS];
    return segment[index & MASK];
}
/**
* Returns a new character sequence that is a subsequence of this sequence.
* The subsequence starts with the character at the specified index and
* ends with the character at index end - 1. The length of the
* returned sequence is end - start, so if start == end
* then an empty sequence is returned.
*
* @param start the start index, inclusive
* @param end the end index, exclusive
* @return the specified subsequence
* @throws IndexOutOfBoundsException if start or end are negative,
* if end is greater than length(),
* or if start is greater than end
*/
/*@NotNull*/
public CharSequence subSequence(int start, int end) {
    // Enforce the documented contract instead of silently reading padding or stale characters
    if (start < 0 || end > length || start > end) {
        throw new IndexOutOfBoundsException(
                "subSequence(" + start + ", " + end + ") on a buffer of length " + length);
    }
    final int firstSeg = start >> BITS;
    final int lastSeg = (end - 1) >> BITS;
    if (firstSeg == lastSeg) {
        // Entirely within one segment: return a view of it, no character copying needed
        return new CharSlice(data[firstSeg], start & MASK, end - start);
    }
    if (start == end) {
        // An empty range that starts on a segment boundary gives lastSeg < firstSeg. It must not
        // reach the multi-segment copy below, which would copy a whole segment (or dereference
        // a segment that was never allocated) and then append a negative number of characters.
        return "";
    }
    // The range crosses at least one segment boundary, so the characters have to be copied
    final FastStringBuffer fsb = new FastStringBuffer(end - start);
    final int firstSegLen = SEGLEN - (start & MASK);
    fsb.append(data[firstSeg], start & MASK, firstSegLen);
    int doneTo = start + firstSegLen;
    for (int seg = firstSeg + 1; ; seg++) {
        // Compare by subtraction: doneTo + SEGLEN could overflow for offsets close to the int limit
        final int remaining = end - doneTo;
        if (remaining > SEGLEN) {
            fsb.append(data[seg]);
            doneTo += SEGLEN;
        } else {
            fsb.append(data[seg], 0, remaining);
            break;
        }
    }
    return fsb;
}
/**
* Convert to a string
*/
public String toString() {
    if (length == 0) {
        // An empty buffer may not have allocated any segment yet (the constructor creates only
        // the segment index), so there is nothing to take a subsequence of.
        return "";
    }
    return subSequence(0, length).toString();
}
/**
* Compare equality
*/
public boolean equals(Object other) {
    if (!(other instanceof CharSequence)) {
        return false;
    }
    // Any CharSequence with the same string value is considered equal
    final String mine = toString();
    return mine.equals(other.toString());
}
/**
* Generate a hash code
*/
public int hashCode() {
    // Same algorithm as String#hashCode(), but not cached.
    // Only the first length characters take part: slots of the segment index that hold no
    // segment are null, and the unused tail of the last segment (or anything left behind by
    // setLength) is not part of the value compared by equals().
    int h = 0;
    int remaining = length;
    for (int seg = 0; remaining > 0; seg++) {
        final char[] chars = data[seg];
        final int used = Math.min(remaining, SEGLEN);
        for (int i = 0; i < used; i++) {
            h = 31 * h + chars[i];
        }
        remaining -= used;
    }
    return h;
}
/**
* Returns a new character sequence that is a subsequence of this sequence.
* Unlike subSequence, this is guaranteed to return a String.
*
* @param start index of the first character to be included
* @param end index of the character after the last one to be included
* @return the substring at the given position
*/
public String substring(int start, int end) {
    // Delegate to subSequence and force the result into a String
    final CharSequence slice = subSequence(start, end);
    return slice.toString();
}
/**
* Write the value to a writer
*
* @param writer the writer to which the value is to be written
* @throws java.io.IOException if an error occurs downstream
*/
public void write(java.io.Writer writer) throws java.io.IOException {
    // Stream the content one segment at a time rather than first building a single String
    // holding a copy of the whole buffer; an empty buffer writes nothing.
    int remaining = length;
    for (int seg = 0; remaining > 0; seg++) {
        // Every segment is full except possibly the last one
        final int used = Math.min(remaining, SEGLEN);
        writer.write(data[seg], 0, used);
        remaining -= used;
    }
}
/**
* Produce diagnostic dump
*/
// public void dumpDataStructure() {
// System.err.println("** Segments:");
// for (int s = 0; s < segments.size(); s++) {
// System.err.println(" SEG " + s + " start offset " + startOffsets[s] + " length "
// + ((FastStringBuffer)segments.get(s)).length());
// }
// }
// public static void main(String[] args) {
// LargeStringBuffer lsb = new LargeStringBuffer();
// for (int i=0; i<30; i++) {
// char[] chars = new char[i*5000];
// Arrays.fill(chars, 'x');
// lsb.append(new String(chars));
// lsb.append("");
// }
// for (int i=0; i
© 2015 - 2025 Weber Informatics LLC | Privacy Policy