All downloads are free. Search and download functionality uses the official Maven repository.

net.sf.saxon.str.UnicodeBuilder Maven / Gradle / Ivy

There is a newer version: 12.5
Show newest version
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2018-2022 Saxonica Limited
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

package net.sf.saxon.str;

import net.sf.saxon.om.Item;
import net.sf.saxon.om.SequenceIterator;
import net.sf.saxon.type.AtomicType;
import net.sf.saxon.value.StringValue;
import net.sf.saxon.z.IntIterator;

import java.io.IOException;
import java.util.Arrays;

/**
 * Builder class to construct a UnicodeString by appending text incrementally
 */

public final class UnicodeBuilder implements UniStringConsumer, UnicodeWriter {

    // The data held by the UnicodeBuilder is in two parts: an archive part
    // of arbitrary length, held as a ZenoString, and an active part which
    // is typically up to 65535 characters. For short strings the archive part
    // is always empty. The active part is held as an integer array, 32 bits per
    // character.

    // As characters are added to the active part, the variable "bits" is used
    // to track the widest character added so far, which is subsequently used
    // to reduce the memory requirement for storing the string.

    // Active part of the content, one codepoint per entry; only the first "used" entries are meaningful
    private int[] codepoints;
    // Number of codepoints currently held in the active part
    private int used;
    // Accumulated mask: codepoints appended individually are OR-ed in, and bulk appends OR in a
    // mask derived from the width of the appended string. Used to pick the storage width.
    private int bits;
    // Archived part: content moved out of the active part once the active part has grown large
    private ZenoString archive = ZenoString.EMPTY;

    /**
     * Create a Unicode builder with an initial allocation of 256 codepoints.
     * Delegates to {@link #UnicodeBuilder(int)}.
     */
    public UnicodeBuilder() {
        // 256 is also the size that clear() re-allocates
        this(256);
    }

    /**
     * Create a Unicode builder whose active buffer initially has room for a given
     * number of codepoints.
     *
     * @param allocate the initial space allocation, in codepoints (32-bit integers)
     */
    public UnicodeBuilder(int allocate) {
        this.codepoints = new int[allocate];
    }

    /**
     * Append a single character, which must not be a surrogate. (This overload is needed
     * for C#, because implicit conversion of char to int isn't supported there.)
     *
     * @param ch the character
     * @return this builder, with the new character added
     */
    public UnicodeBuilder append(char ch) {
        // Widen to int explicitly so that the append(int) overload is selected,
        // rather than this method calling itself
        final int codePoint = (int) ch;
        return append(codePoint);
    }

    /**
     * Append one Unicode character to the content.
     *
     * @param codePoint the Unicode codepoint to add. It is the caller's responsibility
     *                  to ensure that this is not a surrogate
     * @return this builder, with the new character added
     */

    public UnicodeBuilder append(int codePoint) {
        ensureCapacity(1);
        final int slot = used;
        used = slot + 1;
        codepoints[slot] = codePoint;
        // Remember which bits have been seen, so the narrowest storage can be chosen later
        bits |= codePoint;
        return this;
    }

    /**
     * Append a sequence of Unicode characters to the content.
     *
     * @param codePoints an iterator delivering the codepoints to be added; it is
     *                   consumed until it reports no further values
     * @return this builder, with the new characters added
     */

    public UnicodeBuilder append(IntIterator codePoints) {
        boolean more = codePoints.hasNext();
        while (more) {
            append(codePoints.next());
            more = codePoints.hasNext();
        }
        return this;
    }

    /**
     * Append a Java string to the content. It is the caller's responsibility to ensure
     * that the string consists entirely of characters in the Latin-1 character set.
     *
     * @param str the string to be appended
     * @return this builder, with the new string added
     */

    public UnicodeBuilder appendLatin(String str) {
        // Wrap the Java string so it can be handed to the bulk append
        final BMPString wrapped = new BMPString(str);
        return append(wrapped);
    }

    /**
     * Append the string value of every item in a sequence, in order, with no separator.
     *
     * @param iter the sequence of items; it is read until it returns null
     * @return this builder, with the new items added
     */

    public UnicodeBuilder appendAll(SequenceIterator iter) {
        // Note: used from bytecode
        for (Item current = iter.next(); current != null; current = iter.next()) {
            append(current.getUnicodeStringValue());
        }
        return this;
    }

    /**
     * Append a Java CharSequence to the content. The sequence may contain arbitrary
     * characters, including well-formed surrogate pairs.
     *
     * @param str the string to be appended
     * @return this builder, with the new string added
     */

    public UnicodeBuilder append(CharSequence str) {
        // Obtain the content as codepoints and add them one at a time
        final IntIterator points = StringTool.codePoints(str);
        return append(points);
    }

    /**
     * Append the content of a UnicodeString.
     *
     * @param str the string to be appended. Its length is obtained using
     *            {@code length32()}, so it is currently restricted to the range of a
     *            32-bit integer.
     * @return this builder, with the new string added
     */

    public UnicodeBuilder append(UnicodeString str) {
        final int count = str.length32();
        if (count != 0) {
            ensureCapacity(count);
            for (IntIterator source = str.codePoints(); source.hasNext(); ) {
                codepoints[used++] = source.next();
            }
            // Rather than inspecting each codepoint, widen the mask according to the
            // width reported by the appended string
            final int width = str.getWidth();
            if (width > 16) {
                bits |= 0xffffff;
            } else if (width > 8) {
                bits |= 0xffff;
            }
        }
        return this;
    }

    /**
     * Get the total number of codepoints held by the builder: those in the archive
     * plus those in the active part.
     *
     * @return the size in codepoints
     */
    public long length() {
        final long archived = archive.length();
        return archived + used;
    }

    /**
     * Ask whether the builder currently holds no content at all.
     *
     * @return true if both the archive and the active part are empty
     */
    public boolean isEmpty() {
        if (!archive.isEmpty()) {
            return false;
        }
        return used == 0;
    }

    /**
     * Ensure the active buffer has room for a given number of additional codepoints.
     *
     * <p>If the active part already holds more than 65535 codepoints, it is first moved
     * to the archive so that the active array does not grow without limit. The array is
     * then enlarged if necessary; the new length is found by repeated doubling, and the
     * content is copied once.</p>
     *
     * @param required the number of codepoints that need to be added to the buffer
     * @throws OutOfMemoryError if the active part would need more entries than a Java
     *                          array can hold
     */

    private void ensureCapacity(int required) {
        // For very long strings, archive what we've already accumulated as a ZenoString
        if (used > 65535) {
            archive = archive.concat(getActivePart());
            used = 0;
            // Only bits above the low byte influence the width chosen by getActivePart(),
            // so this is equivalent to starting again with 8-bit content
            bits = 0xff;
        }
        // Work in long arithmetic so that a very large request cannot wrap around
        final long needed = (long) used + required;
        if (needed <= codepoints.length) {
            return;
        }
        if (needed > Integer.MAX_VALUE) {
            throw new OutOfMemoryError("Required length of UnicodeBuilder buffer exceeds array limit: " + needed);
        }
        // Start from at least 1: doubling a zero-length array would never make progress
        long newLength = Math.max(codepoints.length, 1);
        while (newLength < needed) {
            newLength <<= 1;
        }
        codepoints = Arrays.copyOf(codepoints, (int) Math.min(newLength, Integer.MAX_VALUE));
    }


    /**
     * Construct a UnicodeString holding the complete content of this builder, that is,
     * the archived part followed by the active part.
     *
     * @return the constructed {@link UnicodeString}
     */

    public UnicodeString toUnicodeString() {
        final boolean nothingArchived = archive.isEmpty();
        final UnicodeString active = getActivePart();
        if (nothingArchived) {
            // Short strings never reach the archive, so the active part is the whole value
            return active;
        }
        return archive.concat(active);
    }

    /**
     * Get the contents of the active part as a UnicodeString, using the narrowest of the
     * 8-bit, 16-bit and 24-bit representations that the accumulated {@code bits} mask allows.
     *
     * @return a UnicodeString representing the active part of the builder's data
     */

    private UnicodeString getActivePart() {
        final int count = used;
        if ((bits & 0xff0000) != 0) {
            // Something beyond 16 bits has been seen: use 24-bit codes
            return new Twine24(codepoints, count);
        }
        if ((bits & 0xff00) == 0) {
            // Nothing beyond 8 bits has been seen: use 8-bit codes
            final byte[] narrow = new byte[count];
            int pos = 0;
            while (pos < count) {
                narrow[pos] = (byte) (codepoints[pos] & 0xff);
                pos++;
            }
            return new Twine8(narrow);
        }
        // Otherwise use 16-bit codes
        final char[] units = new char[count];
        int pos = 0;
        while (pos < count) {
            units[pos] = (char) (codepoints[pos] & 0xffff);
            pos++;
        }
        return new Twine16(units);
    }

    /**
     * Construct a StringValue of a given type from the contents of this builder.
     *
     * @param type the required type, for example BuiltInAtomicType.STRING or
     *             BuiltInAtomicType.UNTYPED_ATOMIC. The caller warrants that the value is
     *             a valid instance of this type; no validation or whitespace normalization
     *             is carried out here
     * @return the constructed StringValue
     */

    public StringValue toStringItem(AtomicType type) {
        final UnicodeString content = toUnicodeString();
        return new StringValue(content, type);
    }

    /**
     * Return a string containing the character content of this builder. The content is
     * first assembled as a UnicodeString and then converted to a Java String.
     *
     * @return the character content of this builder as a Java String
     */

    @Override
    public String toString() {
        return toUnicodeString().toString();
    }


    /**
     * Reset this builder to the empty state: the archive is discarded, the active
     * buffer is replaced by a fresh array of 256 codepoints, and the width tracking
     * is cleared.
     */

    public void clear() {
        used = 0;
        bits = 0;
        codepoints = new int[256];
        archive = ZenoString.EMPTY;
    }

    /**
     * Expand a byte array from 1-byte-per-character to 2-bytes-per-character
     * @param in the input byte array
     * @param start the start offset in bytes
     * @param used the end offset in bytes
     * @param allocate the number of code points to allow for in the output byte array
     * @return the new byte array
     */

    public static byte[] expand1to2(byte[] in, int start, int used, int allocate) {
        byte[] result = new byte[allocate*2];
        for (int i=start, j=0; i




© 2015 - 2024 Weber Informatics LLC | Privacy Policy