All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.sf.saxon.value.StringValue Maven / Gradle / Ivy

There is a newer version: 10.5
Show newest version
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2013 Saxonica Limited.
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

package net.sf.saxon.value;

import net.sf.saxon.expr.XPathContext;
import net.sf.saxon.lib.StringCollator;
import net.sf.saxon.regex.BMPString;
import net.sf.saxon.regex.UnicodeString;
import net.sf.saxon.tree.iter.UnfailingIterator;
import net.sf.saxon.tree.util.FastStringBuffer;
import net.sf.saxon.type.AtomicType;
import net.sf.saxon.type.BuiltInAtomicType;


/**
 * An atomic value of type xs:string. This class is also used for types derived from xs:string.
 * Subclasses of StringValue are used for xs:untypedAtomic and xs:anyURI values.
 */

public class StringValue extends AtomicValue {

    // Shared instances for a few frequently used string values.
    /*@NotNull*/ public static final StringValue EMPTY_STRING = new StringValue("");
    /*@NotNull*/ public static final StringValue SINGLE_SPACE = new StringValue(" ");
    /*@NotNull*/ public static final StringValue TRUE = new StringValue("true");
    /*@NotNull*/ public static final StringValue FALSE = new StringValue("false");

    // Pre-populate the cached UnicodeString of each shared constant with a BMPString,
    // so that it does not have to be built on first use.
    static {
        EMPTY_STRING.unicodeString = new BMPString("");
        SINGLE_SPACE.unicodeString = new BMPString(" ");
        TRUE.unicodeString = new BMPString("true");
        FALSE.unicodeString = new BMPString("false");
    }

    // We hold the value as a CharSequence (it may be a StringBuffer rather than a string)
    // But the first time this is converted to a string, we keep it as a string

    protected CharSequence value;     // may be zero-length, will never be null
    // Cached UnicodeString form of the value; stays null until it is built or supplied
    protected UnicodeString unicodeString = null;

    /**
     * No-argument constructor reserved for subtypes. The new instance wraps the
     * zero-length string and carries the built-in xs:string type label.
     */

    protected StringValue() {
        this.typeLabel = BuiltInAtomicType.STRING;
        this.value = "";
    }

    /**
     * Create an xs:string value wrapping the supplied character sequence.
     * Any kind of CharSequence may be wrapped (typically a String, but a
     * StringBuffer is also possible); it is the caller's responsibility to make
     * sure the sequence is not modified afterwards.
     * @param value the String value. Null is taken as equivalent to "".
     */

    public StringValue(/*@Nullable*/ CharSequence value) {
        if (value == null) {
            this.value = "";
        } else {
            this.value = value;
        }
        this.typeLabel = BuiltInAtomicType.STRING;
    }

    /**
     * Constructor. Note that although a StringValue may wrap any kind of CharSequence
     * (usually a String, but it can also be, for example, a StringBuffer), the caller
     * is responsible for ensuring that the value is immutable.
     * @param value the String value. Null is taken as equivalent to "", as in the
     *  single-argument constructor.
     * @param typeLabel the type of the value to be created. The caller must ensure that this is
     *  a type derived from string and that the string is valid against this type.
     */

    public StringValue(/*@Nullable*/ CharSequence value, AtomicType typeLabel) {
        // The value field is documented as never being null; normalize here so that
        // methods such as isZeroLength() cannot fail later with a NullPointerException.
        this.value = (value == null ? "" : value);
        this.typeLabel = typeLabel;
    }


    /**
     * Assert that the string is known to contain no surrogate pairs. The assertion
     * is recorded by wrapping the current value in a BMPString and caching that
     * as the UnicodeString form of this value.
     */

    public void setContainsNoSurrogates() {
        this.unicodeString = new BMPString(this.value);
    }

    /**
     * Make a copy of this atomic value that carries a different type label. The copy
     * shares the character content and any cached UnicodeString with this value.
     *
     * @param typeLabel the type label of the new copy. The caller is responsible for checking that
     *                  the value actually conforms to this type.
     * @return the newly created copy
     */

    public AtomicValue copyAsSubType(AtomicType typeLabel) {
        StringValue copy = new StringValue(value);
        copy.typeLabel = typeLabel;
        copy.unicodeString = unicodeString;
        return copy;
    }

    /**
     * Get the primitive type of this value, which is the same answer as
     * getItemType().getPrimitiveItemType() would give. The primitive types are
     * the 19 primitive types of XML Schema, plus xs:integer, xs:dayTimeDuration,
     * xs:yearMonthDuration and xs:untypedAtomic. For external objects, the result
     * is AnyAtomicType.
     *
     * @return this implementation always returns the built-in xs:string type
     */

    public BuiltInAtomicType getPrimitiveType() {
        return BuiltInAtomicType.STRING;
    }

    /**
     * Factory method. In contrast to the constructor, no new StringValue is allocated
     * when the input is a zero-length string: the shared EMPTY_STRING instance is returned
     * (other strings might be treated the same way in future).
     * @param value the String value. Null is taken as equivalent to "".
     * @return the corresponding StringValue
     */

    /*@NotNull*/ public static StringValue makeStringValue(/*@Nullable*/ CharSequence value) {
        boolean hasContent = value != null && value.length() != 0;
        return hasContent ? new StringValue(value) : StringValue.EMPTY_STRING;
    }

    /**
     * Factory method taking a UnicodeString. The supplied UnicodeString is retained as the
     * cached code-point form of the new value, so it does not need to be rebuilt later.
     * For a zero-length input the shared EMPTY_STRING instance is returned.
     * @param unicode the string content. Null is taken as equivalent to a zero-length
     * string, for consistency with the CharSequence version of this factory method.
     * @return the corresponding StringValue
     */

    /*@NotNull*/ public static StringValue makeStringValue(/*@Nullable*/ UnicodeString unicode) {
        if (unicode == null || unicode.length() == 0) {
            return EMPTY_STRING;
        }
        StringValue sv = new StringValue(unicode.getCharSequence());
        sv.unicodeString = unicode;
        return sv;
    }

    /**
     * Get the string value as a CharSequence.
     *
     * @return the character sequence held by this value, without copying it
     */

    public final CharSequence getPrimitiveStringValue() {
        return value;
    }

    /**
     * Set the value of the item as a CharSequence.
     * 

For system use only. In principle, a StringValue is immutable. However, in special circumstances, * if it is newly constructed, the content can be changed to reflect the effect of the whiteSpace facet.

* @param value the value of the string */ public final void setStringValueCS(CharSequence value) { this.value = value; } /** * Get the length of this string, as defined in XPath. This is not the same as the Java length, * as a Unicode surrogate pair counts as a single character * @return the length of the string in Unicode code points */ public int getStringLength() { if (unicodeString == null) { makeUnicodeString(); } return unicodeString.length(); } public void makeUnicodeString() { unicodeString = UnicodeString.makeUnicodeString(value); } public UnicodeString getUnicodeString() { if (unicodeString == null) { makeUnicodeString(); } return unicodeString; } /** * Get the length of a string, as defined in XPath. This is not the same as the Java length, * as a Unicode surrogate pair counts as a single character. * @param s The string whose length is required * @return the length of the string in Unicode code points */ public static int getStringLength(/*@NotNull*/ CharSequence s) { int n = 0; for (int i = 0; i < s.length(); i++) { int c = (int) s.charAt(i); if (c < 55296 || c > 56319) n++; // don't count high surrogates, i.e. D800 to DBFF } return n; } /** * Determine whether the string is a zero-length string. This may * be more efficient than testing whether the length is equal to zero * @return true if the string is zero length */ public boolean isZeroLength() { return value.length() == 0; } /** * Determine whether the string contains surrogate pairs * @return true if the string contains any non-BMP characters */ public boolean containsSurrogatePairs() { if (unicodeString != null) { return unicodeString.length() != value.length(); } return UnicodeString.containsSurrogatePairs(value); } /** * Ask whether the string is known to contain no surrogate pairs. 
* @return true if it is known to contain no surrogates, false if the answer is not known */ public boolean isKnownToContainNoSurrogates() { return unicodeString instanceof BMPString; } /** * Iterate over a string, returning a sequence of integers representing the Unicode code-point values * @return an iterator over the characters (Unicode code points) in the string */ /*@NotNull*/ public UnfailingIterator iterateCharacters() { return new CharacterIterator(); } /** * Expand a string containing surrogate pairs into an array of 32-bit characters * @param s the string to be expanded * @return an array of integers representing the Unicode code points */ /*@NotNull*/ public static int[] expand(/*@NotNull*/ CharSequence s) { int[] array = new int[getStringLength(s)]; int o = 0; for (int i = 0; i < s.length(); i++) { int charval; int c = s.charAt(i); if (c >= 55296 && c <= 56319) { // we'll trust the data to be sound charval = ((c - 55296) * 1024) + ((int) s.charAt(i + 1) - 56320) + 65536; i++; } else { charval = c; } array[o++] = charval; } return array; } /** * Contract an array of integers containing Unicode codepoints into a Java string * @param codes an array of integers representing the Unicode code points * @param used the number of items in the array that are actually used * @return the constructed string */ /*@NotNull*/ public static CharSequence contract(/*@NotNull*/ int[] codes, int used) { FastStringBuffer sb = new FastStringBuffer(codes.length); for (int i=0; i 0; } /*@NotNull*/ public String toString() { return "\"" + value + '\"'; } /** * Get a Comparable value that implements the XML Schema comparison semantics for this value. * Returns null if the value is not comparable according to XML Schema rules. This implementation * returns the underlying Java string, which works because strings will only be compared for * equality, not for ordering, and the equality rules for strings in XML schema are the same as in Java. 
*/

    public Comparable getSchemaComparable() {
        return asString();
    }

    /**
     * Determine whether two atomic values are identical, as determined by XML Schema rules. This is a stronger
     * test than equality (even schema-equality); for example two dateTime values are not identical unless
     * they are in the same timezone.
     * <p>Note that even this check ignores the type annotation of the value. The integer 3 and the short 3
     * are considered identical, even though they are not fully interchangeable. "Identical" means the
     * same point in the value space, regardless of type annotation.</p>
     * <p>NaN is identical to itself.</p>
     *
     * @param v the other value to be compared with this one
     * @return true if the two values are identical, false otherwise.
     */

    public boolean isIdentical(/*@NotNull*/ AtomicValue v) {
        if (!(v instanceof StringValue)) {
            return false;
        }
        // Both values must agree on whether they are anyURI values...
        if ((this instanceof AnyURIValue) != (v instanceof AnyURIValue)) {
            return false;
        }
        // ...and on whether they are untypedAtomic values
        if ((this instanceof UntypedAtomicValue) != (v instanceof UntypedAtomicValue)) {
            return false;
        }
        return asString().equals(((StringValue) v).asString());
    }

    /**
     * Produce a diagnostic representation of the contents of the string
     *
     * @param s the string
     * @return a string in which every char outside the printable ASCII range
     *         (0x20 to 0x7e) is replaced by a \ uXXXX escape with four upper-case hex digits
     */

    /*@NotNull*/ public static String diagnosticDisplay(/*@NotNull*/ String s) {
        final String hexDigits = "0123456789ABCDEF";
        FastStringBuffer out = new FastStringBuffer(s.length());
        for (int pos = 0; pos < s.length(); pos++) {
            char ch = s.charAt(pos);
            boolean printable = ch >= 0x20 && ch <= 0x7e;
            if (printable) {
                out.append(ch);
                continue;
            }
            out.append("\\u");
            // Emit the four hex digits, most significant nibble first
            out.append(hexDigits.charAt((ch >> 12) & 0xF));
            out.append(hexDigits.charAt((ch >> 8) & 0xF));
            out.append(hexDigits.charAt((ch >> 4) & 0xF));
            out.append(hexDigits.charAt(ch & 0xF));
        }
        return out.toString();
    }

    /**
     * CharacterIterator is used to iterate over the characters in a string,
     * returning them as integers representing the Unicode code-point.
*/

    public final class CharacterIterator implements UnfailingIterator {

        int inpos = 0;        // 0-based index of the current Java char
        int outpos = 0;       // 1-based value of position() function
        int current = -1;     // Unicode codepoint most recently returned

        /**
         * Create an iterator over a string
         */

        public CharacterIterator() {
        }

        /**
         * Deliver the next Unicode code point of the enclosing string value.
         *
         * @return the code point as an Int64Value, or null once the string is exhausted
         */

        /*@Nullable*/ public Int64Value next() {
            if (inpos >= value.length()) {
                // Exhausted: position() reports -1 from now on
                outpos = -1;
                return null;
            }
            int c = value.charAt(inpos++);
            if (c < 0xD800 || c > 0xDBFF) {
                // Not a high surrogate: the char is the code point
                current = c;
            } else {
                // High surrogate: combine with the following char. We'll trust the data to be sound
                try {
                    current = ((c - 0xD800) * 0x400) + ((int) value.charAt(inpos++) - 0xDC00) + 0x10000;
                } catch (StringIndexOutOfBoundsException e) {
                    System.err.println("Invalid surrogate at end of string");
                    System.err.println(diagnosticDisplay(value.toString()));
                    e.printStackTrace();
                    throw e;
                }
            }
            outpos++;
            return new Int64Value(current);
        }

        /**
         * Get the code point most recently returned by next().
         *
         * @return the current code point, or null if the position is less than 1
         */

        /*@Nullable*/ public Int64Value current() {
            return (outpos < 1) ? null : new Int64Value(current);
        }

        /**
         * Get the current position.
         *
         * @return the 1-based position of the current item; 0 before the first call
         *         of next(), and -1 after the end has been reached
         */

        public int position() {
            return outpos;
        }

        /**
         * Close the iterator. This implementation does nothing.
         */

        public void close() {
        }

        /**
         * Get a new iterator over the same string value, positioned at the start.
         *
         * @return a fresh CharacterIterator
         */

        /*@NotNull*/ public UnfailingIterator getAnother() {
            return new CharacterIterator();
        }

        /**
         * Get properties of this iterator, as a bit-significant integer.
         *
         * @return the properties of this iterator. This will be some combination of
         *         properties such as {@link #GROUNDED} and {@link #LAST_POSITION_FINDER}. It is always
         *         acceptable to return the value zero, indicating that there are no known special properties.
         */

        public int getProperties() {
            return 0;
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy