// net.sf.saxon.str.LargeTextBuffer Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of Saxon-HE Show documentation
// The XSLT and XQuery Processor
// There is a newer version: 12.5
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2018-2022 Saxonica Limited
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

package net.sf.saxon.str;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;


/**
 * A growable buffer of Unicode codepoints, held as a list of segments rather than as
 * one contiguous array. Text is added with {@link #appendUnicodeString}, read back
 * with {@link #substring}, and may be truncated with {@link #setLength}.
 *
 * <p>The segments (other than the last) have a fixed size of 65536 codepoints,
 * which may use one byte per codepoint, two bytes per codepoint, or three bytes per
 * codepoint, depending on the largest codepoint present in the segment.
 * This is more efficient than a buffer backed by a contiguous array of characters
 * in cases where the size is likely to grow very large, and where substring operations
 * are rare. As used within the TinyTree, extraction of the string value of a node
 * requires character copying only in the case where the value crosses segment
 * boundaries.</p>
 */

public final class LargeTextBuffer {

    /** log2 of the segment size */
    private static final int BITS = 16;
    /** Number of codepoints held by every completed segment */
    private static final int SEGLEN = 1 << BITS;
    /** Mask extracting the offset-within-segment from a codepoint offset */
    private static final int MASK = SEGLEN - 1;

    /** Segments that precede the segment currently being filled */
    private final List<Segment> completeSegments;
    /** The segment currently being filled; null once {@link #close()} has been called */
    private Segment lastSegment;
    /** Number of codepoints in use in the last (partially filled) segment */
    private int lastSegmentLength;


    private interface Segment {
        /**
         * Get the number of bits-per-character in this segment (8, 16, or 24)
         * @return the number of bits per character
         */
        int getWidth();

        /**
         * Return a Segment that contains the existing content of this segment, but
         * stretched if necessary to accommodate a given length and width
         * @param oldLength the number of characters (codepoints) currently used
         *                  in the segment
         * @param newLength the number of characters (codepoints) that the new segment
         *               must have room for
         * @param newWidth the number of bits-per-character that the new segment must
         *              have room for
         * @return either this Segment, or a replacement
         */
        Segment stretch(int oldLength, int newLength, int newWidth);

        /**
         * Get the content of this segment as a {@code UnicodeString}. The result wraps
         * the whole backing array, so it includes any slots that are allocated but not
         * yet in use.
         * @return the entire backing array viewed as a string
         */

        UnicodeString asUnicodeString();

        /**
         * Get a substring of this segment
         * @param start the start offset (in codepoints)
         * @param end the end offset (in codepoints)
         * @return the substring
         */
        UnicodeString substring(int start, int end);

    }

    /**
     * Compute the capacity (in codepoints) to which a segment's backing array should grow.
     * The array at least doubles, so that repeated small appends are amortised, but the
     * doubling is capped at SEGLEN because a segment never holds more than that.
     *
     * @param oldLength the number of codepoints currently in use
     * @param newLength the number of codepoints that must fit
     * @return the new capacity, never less than {@code newLength}
     */
    private static int grownCapacity(int oldLength, int newLength) {
        return Math.max(newLength, Math.min(oldLength * 2, SEGLEN));
    }

    /**
     * A Segment comprising 8-bit characters (codepoints in the range 0-255)
     */
    private static class Segment8 implements Segment {
        public byte[] bytes;

        public Segment8(byte[] bytes) {
            this.bytes = bytes;
        }

        @Override
        public int getWidth() {
            return 8;
        }

        @Override
        public Segment stretch(int oldLength, int newLength, int newWidth) {
            if (newWidth <= 8) {
                // Same width: grow the array in place if it is too small
                if (newLength > bytes.length) {
                    bytes = Arrays.copyOf(bytes, grownCapacity(oldLength, newLength));
                }
                return this;
            } else if (newWidth == 16) {
                // Widen to two bytes per codepoint, copying the content already present
                char[] array16 = new char[newLength];
                StringTool.copy8to16(bytes, 0, array16, 0, oldLength);
                return new Segment16(array16);
            } else {
                // Widen to three bytes per codepoint, copying the content already present
                byte[] array24 = new byte[newLength * 3];
                StringTool.copy8to24(bytes, 0, array24, 0, oldLength);
                return new Segment24(array24);
            }
        }

        @Override
        public UnicodeString asUnicodeString() {
            return new Twine8(bytes);
        }

        @Override
        public UnicodeString substring(int start, int end) {
            return new Slice8(bytes, start, end);
        }

    }

    /**
     * A Segment comprising 16-bit characters (codepoints in the range 0-65535)
     */

    private static class Segment16 implements Segment {
        public char[] chars;

        /**
         * Construct the segment
         *
         * @param chars an array of chars holding the codepoints, arranged
         *              as two bytes per codepoint; the caller warrants that the
         *              char array contains no surrogates.
         */

        public Segment16(char[] chars) {
            this.chars = chars;
        }

        @Override
        public int getWidth() {
            return 16;
        }

        @Override
        public Segment stretch(int oldLength, int newLength, int newWidth) {
            if (newWidth <= 16) {
                // Same (or narrower) width: grow the array in place if it is too small
                if (newLength > chars.length) {
                    chars = Arrays.copyOf(chars, grownCapacity(oldLength, newLength));
                }
                return this;
            } else {
                // Widen to three bytes per codepoint, copying the content already present
                byte[] array24 = new byte[newLength * 3];
                StringTool.copy16to24(chars, 0, array24, 0, oldLength);
                return new Segment24(array24);
            }
        }

        @Override
        public UnicodeString asUnicodeString() {
            return new Twine16(chars);
        }

        @Override
        public UnicodeString substring(int start, int end) {
            return new Slice16(chars, start, end);
        }
    }

    /**
     * A Segment comprising 24-bit characters (any Unicode codepoints)
     */

    private static class Segment24 implements Segment {
        public byte[] bytes;

        /**
         * Construct the segment
         * @param bytes an array of bytes holding the codepoints, arranged
         *              as three bytes per codepoint.
         */

        public Segment24(byte[] bytes) {
            this.bytes = bytes;
        }

        @Override
        public int getWidth() {
            return 24;
        }

        @Override
        public Segment stretch(int oldLength, int newLength, int newWidth) {
            // This is already the widest representation, so newWidth needs no action;
            // only the capacity (three bytes per codepoint) may have to grow.
            if (newLength * 3 > bytes.length) {
                bytes = Arrays.copyOf(bytes, grownCapacity(oldLength, newLength) * 3);
            }
            return this;
        }

        @Override
        public UnicodeString substring(int start, int end) {
            return new Slice24(bytes, start, end);
        }

        @Override
        public UnicodeString asUnicodeString() {
            return new Twine24(bytes);
        }

    }

    /**
     * Create an empty LargeTextBuffer with default space allocation
     */

    public LargeTextBuffer() {
        completeSegments = new ArrayList<>(4);
        lastSegment = new Segment8(new byte[0]);
        lastSegmentLength = 0;
    }

    private void addSegment(Segment segment) {
        completeSegments.add(segment);
    }

    /**
     * Get the segment with a given index, where the index one past the list of
     * completed segments denotes the segment currently being filled.
     *
     * @param n the zero-based segment number
     * @return the segment; null if {@code n} denotes the current segment and the
     *         buffer has been closed
     */
    private Segment getSegment(int n) {
        if (n == completeSegments.size()) {
            return lastSegment;
        } else {
            return completeSegments.get(n);
        }
    }

    /**
     * Append a string to the end of the buffer, splitting it across segment
     * boundaries where necessary.
     *
     * @param chars the string to be appended
     * @throws IllegalStateException if the buffer has been closed
     */
    public void appendUnicodeString(UnicodeString chars) {
        if (lastSegment == null) {
            throw new IllegalStateException("Cannot append to a LargeTextBuffer that has been closed");
        }
        int spaceAvailableInLastSegment = SEGLEN - lastSegmentLength;
        long charsSupplied = chars.length();
        if (charsSupplied < spaceAvailableInLastSegment) {
            // Everything fits in the current segment without filling it
            extendLastSegment(chars);
        } else {
            // Fill the current segment exactly (which starts a fresh one) ...
            long start = 0;
            extendLastSegment(chars.substring(0, spaceAvailableInLastSegment));
            charsSupplied -= spaceAvailableInLastSegment;
            start += spaceAvailableInLastSegment;
            // ... then add whole segments while more than a segment's worth remains ...
            while (charsSupplied > SEGLEN) {
                extendLastSegment(chars.substring(start, start + SEGLEN));
                charsSupplied -= SEGLEN;
                start += SEGLEN;
            }
            // ... and finally whatever is left (at most SEGLEN codepoints)
            if (charsSupplied > 0) {
                extendLastSegment(chars.substring(start, start + charsSupplied));
            }
        }
    }

    /**
     * Append a string to the current segment. The caller ensures that the string
     * fits within the space remaining in that segment. If the segment becomes
     * full it is moved to the list of completed segments and a new, empty
     * 8-bit segment is started.
     *
     * @param chars the string to append
     */
    private void extendLastSegment(UnicodeString chars) {
        // stretch() may replace the segment with a wider one, so re-test its type afterwards
        lastSegment = lastSegment.stretch(lastSegmentLength, lastSegmentLength + chars.length32(), chars.getWidth());
        if (lastSegment instanceof Segment8) {
            chars.copy8bit(((Segment8)lastSegment).bytes, lastSegmentLength);
        } else if (lastSegment instanceof Segment16) {
            chars.copy16bit(((Segment16)lastSegment).chars, lastSegmentLength);
        } else {
            assert lastSegment instanceof Segment24;
            // three bytes per codepoint, hence the byte offset is three times the length
            chars.copy24bit(((Segment24)lastSegment).bytes, lastSegmentLength*3);
        }
        lastSegmentLength += chars.length32();
        if (lastSegmentLength == SEGLEN) {
            addSegment(lastSegment);
            lastSegment = new Segment8(new byte[1024]);
            lastSegmentLength = 0;
        }
    }

    /**
     * Diagnostic aid: print to {@code System.err} the length of the backing array of
     * each completed segment, followed by the number of codepoints in the last segment.
     */
    private void showSegmentLengths() {
        StringBuilder sb = new StringBuilder();
        for (Segment s : completeSegments) {
            sb.append(s.asUnicodeString().length()).append(", ");
        }
        sb.append(lastSegmentLength);
        System.err.println(sb);
    }


    /**
     * Returns a new character sequence that is a subsequence of this sequence.
     * The subsequence starts with the character at the specified index and
     * ends with the character at index end - 1.  The length of the
     * returned sequence is end - start, so if start == end
     * then an empty sequence is returned.
     *
     * @param start the start index, inclusive (codepoints, not bytes)
     * @param end   the end index, exclusive (codepoints, not bytes)
     * @return the specified subsequence
     * @throws IndexOutOfBoundsException if start or end are negative,
     *                                   if end is greater than length(),
     *                                   or if start is greater than end
     */
    /*@NotNull*/
    public UnicodeString substring(int start, int end) {
        if (start < 0 || end < start || end > length()) {
            throw new IndexOutOfBoundsException(
                    "substring(" + start + ", " + end + ") is out of range for a buffer of length " + length());
        }
        if (start == end) {
            // Handled separately: the segment arithmetic below works on (end - 1), which for an
            // empty range at offset 0 or on a segment boundary falls in the wrong segment.
            return new UnicodeBuilder().toUnicodeString();
        }
        int firstSeg = start >> BITS;
        int lastSeg = (end - 1) >> BITS;
        // Exclusive end offset within the last segment, in the range 1..SEGLEN
        int lastCP = ((end - 1) & MASK) + 1;
        if (firstSeg == lastSeg) {
            // String falls entirely within one segment
            return getSegment(firstSeg).substring(start & MASK, lastCP);
        } else {
            // Concatenate strings from two or more segments
            UnicodeBuilder ub = new UnicodeBuilder();
            int segNr = firstSeg;
            ub.accept(getSegment(segNr++).substring(start & MASK, SEGLEN));
            while (segNr < lastSeg) {
                // Take exactly SEGLEN codepoints rather than the whole backing array,
                // so that any unused capacity is never included
                ub.accept(getSegment(segNr++).substring(0, SEGLEN));
            }
            ub.accept(getSegment(lastSeg).substring(0, lastCP));
            return ub.toUnicodeString();
        }
    }

    /**
     * Mark the buffer as complete. A non-empty last segment is moved to the list of
     * completed segments; thereafter no more text can be appended.
     * Calling this method more than once has no further effect.
     */
    public void close() {
        if (lastSegment != null && lastSegmentLength > 0) {
            addSegment(lastSegment);
        }
        lastSegment = null;
    }

    /**
     * Get the number of codepoints in the buffer.
     *
     * @return the length of the buffer in codepoints
     */
    public int length() {
        int segCount = completeSegments.size();
        if (lastSegment == null) {
            // Closed: a partially-filled final segment (if any) is now the last entry of the
            // list, so it must be counted at its true length, not as a full segment
            return lastSegmentLength > 0
                    ? (segCount - 1) * SEGLEN + lastSegmentLength
                    : segCount * SEGLEN;
        }
        return segCount * SEGLEN + lastSegmentLength;
    }

    /**
     * Set the length. If this exceeds the current length, this method is a no-op.
     * If this is less than the current length, characters beyond the specified point
     * are deleted.
     *
     * @param newLength the new length
     * @throws IllegalArgumentException if newLength is negative
     */

    public void setLength(int newLength) {
        // used to remove a text node if it's found to be a duplicate
        if (newLength < 0) {
            throw new IllegalArgumentException("Negative length: " + newLength);
        }
        if (newLength < length()) {
            // Number of full segments that survive the truncation
            int retained = newLength >> BITS;
            if (retained < completeSegments.size()) {
                // The cut falls inside a completed segment: that segment becomes the new
                // "last segment", and it and everything after it leave the completed list
                lastSegment = completeSegments.get(retained);
                completeSegments.subList(retained, completeSegments.size()).clear();
            }
            lastSegmentLength = newLength & MASK;
        }
    }

}