All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.sf.saxon.tree.tiny.LargeStringBuffer Maven / Gradle / Ivy

There is a newer version: 12.5
Show newest version
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2015 Saxonica Limited.
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

package net.sf.saxon.tree.tiny;

import net.sf.saxon.tree.util.FastStringBuffer;


/**
 * This is an implementation of the CharSequence interface: it implements
 * a CharSequence as a list of arrays of characters (the individual arrays are known
 * as segments). The segments have a fixed size of 65536 characters.
 * 

* This is more efficient than a buffer backed by a contiguous array of characters * in cases where the size is likely to grow very large, and where substring operations * are rare. As used within the TinyTree, extraction of the string value of a node * requires character copying only in the case where the value crosses segment * boundaries. */ public final class LargeStringBuffer implements AppendableCharSequence { private final static int BITS = 16; private final static int SEGLEN = 1 << BITS; private final static int MASK = SEGLEN - 1; // Variant of LargeStringBuffer using fixed-length segments private char[][] data; private int length; // total length of the CharSequence private int segmentsUsed; /** * Create an empty LargeStringBuffer with default space allocation */ public LargeStringBuffer() { data = new char[1][]; segmentsUsed = 0; length = 0; } /** * Expand the data structure. Note this only involves expanding the "index" (the list of * segments), it does not cause any character data to be copied. * * @param seg the new segment to be added. */ private void addSegment(char[] seg) { int segs = data.length; if (segmentsUsed + 1 > segs) { if (segmentsUsed == 32768) { throw new IllegalStateException("Source document too large: more than 1G characters in text nodes"); } char[][] d2 = new char[segs * 2][]; System.arraycopy(data, 0, d2, 0, segmentsUsed); data = d2; } data[segmentsUsed++] = seg; } /** * Append a CharSequence to this LargeStringBuffer * * @param s the data to be appended */ public void append(CharSequence s) { // Although we provide variants of this method for different subtypes, Java decides which to use based // on the static type of the operand. We want to use the right method based on the dynamic type, to avoid // creating objects and copying strings unnecessarily. So we do a dynamic dispatch. (This is only necessary // of course because the CharSequence class offers no getChars() method). if (s instanceof CompressedWhitespace) { FastStringBuffer fsb = new FastStringBuffer(FastStringBuffer.C64); ((CompressedWhitespace)s).uncompress(fsb); append(fsb); return; } final int len = s.length(); char[] firstSeg; int firstSegOffset = length & MASK; if (firstSegOffset == 0) { firstSeg = new char[SEGLEN]; addSegment(firstSeg); } else { firstSeg = data[length>>BITS]; } int firstSegLen; int fullSegments; int lastSegLen; if (len <= SEGLEN - firstSegOffset) { // all fits in the current segment firstSegLen = len; fullSegments = 0; lastSegLen = 0; } else { firstSegLen = SEGLEN - firstSegOffset; fullSegments = (len - firstSegLen) >> BITS; lastSegLen = (len - firstSegLen) & MASK; } if (s instanceof CharSlice) { ((CharSlice)s).getChars(0, firstSegLen, firstSeg, firstSegOffset); int start = firstSegLen; for (int i=0; i 0) { char[] seg = new char[SEGLEN]; addSegment(seg); ((CharSlice)s).getChars(start, len, seg, 0); } length += len; } else if (s instanceof FastStringBuffer) { ((FastStringBuffer)s).getChars(0, firstSegLen, firstSeg, firstSegOffset); int start = firstSegLen; for (int i=0; i 0) { char[] seg = new char[SEGLEN]; addSegment(seg); ((FastStringBuffer)s).getChars(start, len, seg, 0); } length += len; } else { if (!(s instanceof String)) { s = s.toString(); } ((String)s).getChars(0, firstSegLen, firstSeg, firstSegOffset); int start = firstSegLen; for (int i=0; i 0) { char[] seg = new char[SEGLEN]; addSegment(seg); ((String)s).getChars(start, len, seg, 0); } length += len; } } /** * Returns the length of this character sequence. The length is the number * of 16-bit UTF-16 characters in the sequence.

* * @return the number of characters in this sequence */ public int length() { return length; } /** * Set the length. If this exceeds the current length, this method is a no-op. * If this is less than the current length, characters beyond the specified point * are deleted. * * @param length the new length */ public void setLength(int length) { if (length < this.length) { int usedInLastSegment = length & MASK; this.length = length; this.segmentsUsed = length / SEGLEN + (usedInLastSegment == 0 ? 0 : 1); } } /** * Returns the character at the specified index. An index ranges from zero * to length() - 1. The first character of the sequence is at * index zero, the next at index one, and so on, as for array * indexing.

* * @param index the index of the character to be returned * @return the specified character * @throws IndexOutOfBoundsException if the index argument is negative or not less than * length() */ public char charAt(int index) { if (index < 0 || index >= length) { throw new IndexOutOfBoundsException(index + ""); } return data[index >> BITS][index & MASK]; } /** * Returns a new character sequence that is a subsequence of this sequence. * The subsequence starts with the character at the specified index and * ends with the character at index end - 1. The length of the * returned sequence is end - start, so if start == end * then an empty sequence is returned.

* * @param start the start index, inclusive * @param end the end index, exclusive * @return the specified subsequence * @throws IndexOutOfBoundsException if start or end are negative, * if end is greater than length(), * or if start is greater than end */ /*@NotNull*/ public CharSequence subSequence(int start, int end) { int firstSeg = start >> BITS; int lastSeg = (end - 1) >> BITS; if (firstSeg == lastSeg) { return new CharSlice(data[firstSeg], start & MASK, end - start); } else { FastStringBuffer fsb = new FastStringBuffer(end - start); int firstSegLen = SEGLEN - (start & MASK); fsb.append(data[firstSeg], start & MASK, firstSegLen); int doneTo = start + firstSegLen; while (true) { firstSeg++; if (doneTo + SEGLEN < end) { fsb.append(data[firstSeg]); doneTo += SEGLEN; } else { fsb.append(data[firstSeg], 0, end - doneTo); break; } } return fsb; } } /** * Convert to a string */ public String toString() { return subSequence(0, length).toString(); } /** * Compare equality */ public boolean equals(Object other) { return other instanceof CharSequence && toString().equals(other.toString()); } /** * Generate a hash code */ public int hashCode() { // Same algorithm as String#hashCode(), but not cached int h = 0; for (char[] chars : data) { for (int i = 0; i < SEGLEN; i++) { h = 31 * h + chars[i]; } } return h; } /** * Returns a new character sequence that is a subsequence of this sequence. * Unlike subSequence, this is guaranteed to return a String. * * @param start index of the first character to be included * @param end index of the character after the last one to be included * @return the substring at the given position */ public String substring(int start, int end) { return subSequence(start, end).toString(); } /** * Write the value to a writer * * @param writer the writer to which the value is to be written * @throws java.io.IOException if an error occurs downstream */ public void write(java.io.Writer writer) throws java.io.IOException { writer.write(toString()); } /** * Produce diagnostic dump */ // public void dumpDataStructure() { // System.err.println("** Segments:"); // for (int s = 0; s < segments.size(); s++) { // System.err.println(" SEG " + s + " start offset " + startOffsets[s] + " length " // + ((FastStringBuffer)segments.get(s)).length()); // } // } // public static void main(String[] args) { // LargeStringBuffer lsb = new LargeStringBuffer(); // for (int i=0; i<30; i++) { // char[] chars = new char[i*5000]; // Arrays.fill(chars, 'x'); // lsb.append(new String(chars)); // lsb.append(""); // } // for (int i=0; i




© 2015 - 2025 Weber Informatics LLC | Privacy Policy