
net.sf.saxon.str.UnicodeBuilder Maven / Gradle / Ivy
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2018-2022 Saxonica Limited
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
package net.sf.saxon.str;
import net.sf.saxon.om.Item;
import net.sf.saxon.om.SequenceIterator;
import net.sf.saxon.type.AtomicType;
import net.sf.saxon.value.StringValue;
import net.sf.saxon.z.IntIterator;
import java.io.IOException;
/**
* Builder class to construct a UnicodeString by appending text incrementally
*/
public final class UnicodeBuilder implements UniStringConsumer, UnicodeWriter {
// The text accumulated so far. It starts out wrapping the empty string instance; the append
// methods replace it with the result of concat(), and clear() resets it to a fresh empty value.
ZenoString content = ZenoString.of(EmptyUnicodeString.getInstance());
/**
* Create a Unicode builder. The constructor body is empty; the initial (empty) content
* comes from the field initializer of {@code content}.
*/
public UnicodeBuilder() {
}
/**
 * Create a Unicode builder, accepting an initial space allocation so that the
 * constructor signature is compatible with {@code StringBuilder}.
 *
 * @param allocate the initial space allocation (currently ignored)
 */
public UnicodeBuilder(int allocate) {
    // Used by bytecode. The size hint is not consulted, so this is equivalent to
    // the no-argument constructor.
    this();
}
/**
 * Append a single character, which must not be a surrogate. (This overload is needed
 * for C#, because implicit conversion of char to int isn't supported there.)
 *
 * @param ch the character
 * @return the builder, with the new character added
 */
public UnicodeBuilder append(char ch) {
    final UnicodeChar single = new UnicodeChar(ch);
    content = content.concat(single);
    return this;
}
/**
 * Append a single Unicode character to the content.
 *
 * @param codePoint the Unicode codepoint. The caller is responsible for ensuring
 *                  that this is not a surrogate
 * @return the builder, with the new character added
 */
public UnicodeBuilder append(int codePoint) {
    final UnicodeChar single = new UnicodeChar(codePoint);
    content = content.concat(single);
    return this;
}
/**
 * Append all the values delivered by an iterator, one at a time and in iteration
 * order. The iterator is read until {@code hasNext()} returns false.
 *
 * @param codePoints iterator supplying the values to be appended
 * @return the builder, with the new content added
 */
public UnicodeBuilder append(IntIterator codePoints) {
    for (; codePoints.hasNext(); ) {
        append(codePoints.next());
    }
    return this;
}
/**
 * Append a Java string to the content. The caller is responsible for ensuring that
 * the string consists entirely of characters in the Latin-1 character set; no check
 * is made in this method.
 *
 * @param str the string to be appended
 * @return the builder, with the new string added
 */
public UnicodeBuilder appendLatin(String str) {
    final BMPString wrapped = new BMPString(str);
    return append(wrapped);
}
/**
 * Append the string values of all the items in a sequence, with no separator.
 * The iterator is read until it returns null.
 *
 * @param iter the sequence of items
 * @return this builder, with the new items added
 */
public UnicodeBuilder appendAll(SequenceIterator iter) {
    // Note: used from bytecode
    for (Item current = iter.next(); current != null; current = iter.next()) {
        append(current.getUnicodeStringValue());
    }
    return this;
}
/**
 * Append a Java CharSequence to the content. This may contain arbitrary characters,
 * including well-formed surrogate pairs.
 *
 * @param str the string to be appended
 * @return the builder, with the new string added
 */
public UnicodeBuilder append(CharSequence str) {
    // Obtain the content through StringTool.codePoints and append what it delivers.
    final IntIterator codePoints = StringTool.codePoints(str);
    return append(codePoints);
}
/**
 * Append a UnicodeString object to the content.
 *
 * @param str the string to be appended. The length is currently restricted to 2^31.
 * @return the builder, with the new string added
 */
public UnicodeBuilder append(UnicodeString str) {
    final ZenoString extended = content.concat(str);
    content = extended;
    return this;
}
/**
 * Get the number of codepoints currently in the builder.
 *
 * @return the size in codepoints
 */
public long length() {
    final long codePointCount = content.length();
    return codePointCount;
}
/**
 * Ask whether the content of the builder is empty.
 *
 * @return true if the size is zero
 */
public boolean isEmpty() {
    final boolean nothingHeld = content.isEmpty();
    return nothingHeld;
}
/**
 * Construct a UnicodeString whose value is formed from the contents of this builder.
 * The value returned is the result of calling {@code economize()} on the content.
 *
 * @return the constructed {@link UnicodeString}
 */
public UnicodeString toUnicodeString() {
    final UnicodeString result = content.economize();
    return result;
}
/**
 * Construct a StringItem whose value is formed from the contents of this builder.
 *
 * @param type the required type, for example BuiltInAtomicType.STRING or
 *             BuiltInAtomicType.UNTYPED_ATOMIC. The caller warrants that the value is
 *             a valid instance of this type. No validation or whitespace normalization
 *             is carried out
 * @return the constructed StringItem
 */
public StringValue toStringItem(AtomicType type) {
    final UnicodeString value = toUnicodeString();
    return new StringValue(value, type);
}
/**
 * Return a string containing the character content of this builder.
 *
 * @return the character content of this builder as a Java String
 */
@Override
public String toString() {
    return content.toString();
}
/**
 * Reset the contents of this builder to be empty. The previous content is replaced
 * by a new value wrapping the empty string instance.
 */
public void clear() {
    final ZenoString fresh = ZenoString.of(EmptyUnicodeString.getInstance());
    content = fresh;
}
/**
* Expand a byte array from 1-byte-per-character to 2-bytes-per-character
* @param in the input byte array
* @param start the start offset in bytes
* @param used the end offset in bytes
* @param allocate the number of code points to allow for in the output byte array
* @return the new byte array
*/
public static byte[] expand1to2(byte[] in, int start, int used, int allocate) {
byte[] result = new byte[allocate*2];
for (int i=start, j=0; i
© 2015 - 2025 Weber Informatics LLC | Privacy Policy