net.sf.saxon.str.UnicodeBuilder Maven / Gradle / Ivy

Go to download
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2018-2022 Saxonica Limited
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

package net.sf.saxon.str;

import net.sf.saxon.om.Item;
import net.sf.saxon.om.SequenceIterator;
import net.sf.saxon.type.AtomicType;
import net.sf.saxon.value.StringValue;
import net.sf.saxon.z.IntIterator;

import java.io.IOException;

/**
 * Builder class to construct a UnicodeString by appending text incrementally
 */

public final class UnicodeBuilder implements UniStringConsumer, UnicodeWriter {

    /**
     * The text accumulated so far. It starts out as a ZenoString built from the empty
     * string instance; the append methods and clear() change it by reassigning this
     * field. No access modifier is given, so the field has package visibility.
     */
    ZenoString content = ZenoString.of(EmptyUnicodeString.getInstance());

    /**
     * Create a Unicode builder. The constructor body is empty: the initial (empty)
     * content is supplied by the initializer of the {@code content} field.
     */
    public UnicodeBuilder() {

    }

    /**
     * Create a Unicode builder with an initial space allocation. The argument is accepted
     * only so that the signature matches callers that pass a capacity; the body is empty,
     * so the builder is set up exactly as by the no-argument constructor.
     * @param allocate the initial space allocation (currently ignored)
     */
    public UnicodeBuilder(int allocate) {  // used by bytecode, for compatibility with StringBuilder

    }

    /**
     * Append a single character, which must not be a surrogate. (This overload is needed
     * for C#, because implicit conversion of char to int isn't supported there.)
     * @param ch the character to add
     * @return this builder, with the new character added
     */
    public UnicodeBuilder append(char ch) {
        // Wrap the character as a one-character string, then join it onto the content
        final UnicodeChar single = new UnicodeChar(ch);
        content = content.concat(single);
        return this;
    }

    /**
     * Append one Unicode character, identified by its codepoint, to the content.
     * @param codePoint the Unicode codepoint to add. No check is made here: the caller is
     *                  responsible for ensuring that this is not a surrogate
     * @return this builder, with the new character added
     */

    public UnicodeBuilder append(int codePoint) {
        // Wrap the codepoint as a one-character string, then join it onto the content
        final UnicodeChar single = new UnicodeChar(codePoint);
        content = content.concat(single);
        return this;
    }

    /**
     * Append every codepoint delivered by an iterator, in iteration order. The iterator
     * is consumed until {@code hasNext()} returns false.
     * @param codePoints the iterator supplying the codepoints to be added
     * @return this builder, with the new characters added
     */

    public UnicodeBuilder append(IntIterator codePoints) {
        for (; codePoints.hasNext(); ) {
            append(codePoints.next());
        }
        return this;
    }

    /**
     * Append a Java string to the content. This method performs no check of its own: the
     * caller is responsible for ensuring that the string consists entirely of characters
     * in the Latin-1 character set.
     *
     * @param str the string to be appended
     * @return this builder, with the new string added
     */

    public UnicodeBuilder appendLatin(String str) {
        // The string is wrapped as a BMPString and appended as a single unit
        final BMPString wrapped = new BMPString(str);
        return append(wrapped);
    }

    /**
     * Append the Unicode string value of each item in a sequence, one after another with
     * no separator. The iterator is read until {@code next()} returns null.
     * @param iter the iterator over the sequence of items
     * @return this builder, with the string values of the items added
     */

    public UnicodeBuilder appendAll(SequenceIterator iter) {
        // Note: used from bytecode
        for (Item current = iter.next(); current != null; current = iter.next()) {
            append(current.getUnicodeStringValue());
        }
        return this;
    }

    /**
     * Append a Java CharSequence to the content. The sequence may contain arbitrary
     * characters, including well formed surrogate pairs. The value is converted by
     * {@code StringTool.codePoints} and the result is appended.
     *
     * @param str the character sequence to be appended
     * @return this builder, with the new string added
     */

    public UnicodeBuilder append(CharSequence str) {
        // NOTE(review): an earlier commented-out draft here compared
        // StringTool.getStringLength(str) with str.length() to detect input without
        // surrogate pairs; no such special case is implemented.
        append(StringTool.codePoints(str));
        return this;
    }

    /**
     * Append a UnicodeString to the content, by concatenating it onto the text
     * accumulated so far.
     *
     * @param str the string to be appended. The length is currently restricted to 2^31.
     * @return this builder, with the new string added
     */

    public UnicodeBuilder append(UnicodeString str) {
        final ZenoString extended = content.concat(str);
        content = extended;
        return this;
    }

    /**
     * Get the current size of the builder, as reported by the accumulated content.
     * @return the size in codepoints
     */
    public long length() {
        final long size = content.length();
        return size;
    }

    /**
     * Ask whether the builder currently holds no content. The answer is delegated to
     * the accumulated content.
     * @return true if the size is zero
     */
    public boolean isEmpty() {
        final boolean nothingHeld = content.isEmpty();
        return nothingHeld;
    }

    /**
     * Deliver the contents of this builder as a UnicodeString. The value returned is
     * the result of calling {@code economize()} on the accumulated content.
     * @return the constructed {@link UnicodeString}
     */

    public UnicodeString toUnicodeString() {
        final UnicodeString result = content.economize();
        return result;
    }

    /**
     * Deliver the contents of this builder as a StringValue with a given type label.
     * @param type the required type, for example BuiltInAtomicType.STRING or
     *             BuiltInAtomicType.UNTYPED_ATOMIC. The caller warrants that the value is
     *             a valid instance of this type: this method carries out no validation
     *             and no whitespace normalization
     * @return the constructed StringValue
     */

    public StringValue toStringItem(AtomicType type) {
        // Same value as toUnicodeString(), labelled with the requested type
        final UnicodeString value = toUnicodeString();
        return new StringValue(value, type);
    }

    /**
     * Return the character content of this builder as a Java String. The conversion is
     * delegated to the {@code toString()} method of the accumulated content.
     * @return the character content of this builder as a Java String
     */

    @Override
    public String toString() {
        return content.toString();
    }



    /**
     * Discard everything appended so far, leaving the builder empty. The content is
     * replaced by a new ZenoString built from the empty string instance, which is the
     * same expression as the field's initial value.
     */

    public void clear() {
        final ZenoString fresh = ZenoString.of(EmptyUnicodeString.getInstance());
        content = fresh;
    }

    /**
     * Expand a byte array from 1-byte-per-character to 2-bytes-per-character
     * @param in the input byte array
     * @param start the start offset in bytes
     * @param used the end offset in bytes
     * @param allocate the number of code points to allow for in the output byte array
     * @return the new byte array
     */

    public static byte[] expand1to2(byte[] in, int start, int used, int allocate) {
        byte[] result = new byte[allocate*2];
        for (int i=start, j=0; i