All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.sf.saxon.tree.tiny.CompressedWhitespace Maven / Gradle / Ivy

There is a newer version: 12.5
Show newest version
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2015 Saxonica Limited.
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

package net.sf.saxon.tree.tiny;

import net.sf.saxon.tree.util.FastStringBuffer;

import java.io.OutputStream;
import java.io.Writer;

/**
 * This class provides a compressed representation of a sequence of whitespace characters. The representation
 * is a sequence of bytes: in each byte the top two bits indicate which whitespace character is used
 * (x9, xA, xD, or x20) and the bottom six bits indicate the number of such characters. A zero byte is a filler.
 * We don't compress the sequence if it would occupy more than 8 bytes, because that's the space we've got available
 * in the TinyTree arrays.
 */

public class CompressedWhitespace implements CharSequence {

    /*@NotNull*/ private static char[] WHITE_CHARS = {0x09, 0x0A, 0x0D, 0x20};
    /*@NotNull*/ private static int[] CODES =
            {-1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, -1, -1, 2, -1, -1,
                    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                    3};

    private long value;

    public CompressedWhitespace(long compressedValue) {
        value = compressedValue;
    }

    /**
     * Attempt to compress a CharSequence
     *
     * @param in the CharSequence to be compressed
     * @return the compressed sequence if it can be compressed; or the original CharSequence otherwise
     */

    /*@NotNull*/
    public static CharSequence compress(/*@NotNull*/ CharSequence in) {
        final int inlen = in.length();
        if (inlen == 0) {
            return in;
        }
        int runlength = 1;
        int outlength = 0;
        for (int i = 0; i < inlen; i++) {
            final char c = in.charAt(i);
            if (c <= 32 && CODES[c] > 0) {
                if (i == inlen - 1 || c != in.charAt(i + 1) || runlength == 63) {
                    runlength = 1;
                    outlength++;
                    if (outlength > 8) {
                        return in;
                    }
                } else {
                    runlength++;
                }
            } else {
                return in;
            }
        }
        int ix = 0;
        runlength = 1;
        int[] out = new int[outlength];
        for (int i = 0; i < inlen; i++) {
            final char c = in.charAt(i);
            if (i == inlen - 1 || c != in.charAt(i + 1) || runlength == 63) {
                out[ix++] = (CODES[c] << 6) | runlength;
                runlength = 1;
            } else {
                runlength++;
            }
        }
        long value = 0;
        for (int i = 0; i < outlength; i++) {
            value = (value << 8) | out[i];
        }
        value <<= (8 * (8 - outlength));
        return new CompressedWhitespace(value);
    }

    /**
     * Uncompress the whitespace to a FastStringBuffer
     *
     * @param buffer the buffer to which the whitespace is to be appended. The parameter may be
     *               null, in which case a new buffer is created.
     * @return the FastStringBuffer to which the whitespace has been appended. If a buffer was
     *         supplied in the argument, this will be the same buffer.
     */

    public FastStringBuffer uncompress(/*@Nullable*/ FastStringBuffer buffer) {
        if (buffer == null) {
            buffer = new FastStringBuffer(length());
        }
        uncompress(value, buffer);
        return buffer;
    }

    public static void uncompress(long value, /*@NotNull*/ FastStringBuffer buffer) {
        for (int s = 56; s >= 0; s -= 8) {
            byte b = (byte) ((value >>> s) & 0xff);
            if (b == 0) {
                break;
            }
            char c = WHITE_CHARS[b >>> 6 & 0x3];
            int len = (b & 0x3f);
            buffer.ensureCapacity(len);
            for (int j = 0; j < len; j++) {
                buffer.append(c);
            }
        }
    }

    public long getCompressedValue() {
        return value;
    }

    public int length() {
        int count = 0;
        long val = value;
        for (int s = 56; s >= 0; s -= 8) {
            int c = (int) ((val >>> s) & 0x3f);
            if (c == 0) {
                break;
            }
            count += c;
        }
        return count;
    }

    /**
     * Returns the char value at the specified index.  An index ranges from zero
     * to length() - 1.  The first char value of the sequence is at
     * index zero, the next at index one, and so on, as for array
     * indexing. 

*

*

If the char value specified by the index is a * surrogate, the surrogate * value is returned. * * @param index the index of the char value to be returned * @return the specified char value * @throws IndexOutOfBoundsException if the index argument is negative or not less than * length() */ public char charAt(int index) { int count = 0; final long val = value; for (int s = 56; s >= 0; s -= 8) { byte b = (byte) ((val >>> s) & 0xff); if (b == 0) { break; } count += (b & 0x3f); if (count > index) { return WHITE_CHARS[b >>> 6 & 0x3]; } } throw new IndexOutOfBoundsException(index + ""); } /** * Returns a new CharSequence that is a subsequence of this sequence. * The subsequence starts with the char value at the specified index and * ends with the char value at index end - 1. The length * (in chars) of the * returned sequence is end - start, so if start == end * then an empty sequence is returned.

* * @param start the start index, inclusive * @param end the end index, exclusive * @return the specified subsequence * @throws IndexOutOfBoundsException if start or end are negative, * if end is greater than length(), * or if start is greater than end */ public CharSequence subSequence(int start, int end) { return uncompress(null).subSequence(start, end); } /** * Indicates whether some other object is "equal to" this one. */ public boolean equals(/*@NotNull*/ Object obj) { if (obj instanceof CompressedWhitespace) { return value == ((CompressedWhitespace) obj).value; } return uncompress(null).equals(obj); } /** * Returns a hash code value for the object. */ public int hashCode() { return uncompress(null).hashCode(); } /** * Returns a string representation of the object. */ public String toString() { return uncompress(null).toString(); } /** * Write the value to a Writer * * @param writer the writer to write to * @throws java.io.IOException if an error occurs downstream */ public void write(/*@NotNull*/ Writer writer) throws java.io.IOException { final long val = value; for (int s = 56; s >= 0; s -= 8) { final byte b = (byte) ((val >>> s) & 0xff); if (b == 0) { break; } final char c = WHITE_CHARS[b >>> 6 & 0x3]; final int len = (b & 0x3f); for (int j = 0; j < len; j++) { writer.write(c); } } } /** * Write the value to a Writer with escaping of special characters * * @param specialChars identifies which characters are considered special * @param writer the writer to write to * @throws java.io.IOException if an error occurs downstream */ public void writeEscape(boolean[] specialChars, /*@NotNull*/ Writer writer) throws java.io.IOException { final long val = value; for (int s = 56; s >= 0; s -= 8) { final byte b = (byte) ((val >>> s) & 0xff); if (b == 0) { break; } final char c = WHITE_CHARS[b >>> 6 & 0x3]; final int len = (b & 0x3f); if (specialChars[c]) { String e = ""; if (c == '\n') { e = " "; } else if (c == '\r') { e = " "; } else if (c == '\t') { e = " "; } for (int j = 0; j < len; j++) { writer.write(e); } } else { for (int j = 0; j < len; j++) { writer.write(c); } } } } /** * Write the value to a UTF-8 OutputStream with escaping of special characters * * @param specialChars array of booleans indicating which characters need to be XML-escaped * @param stream the output stream to write to * @throws java.io.IOException if an error occurs downstream */ public void writeEscape(boolean[] specialChars, /*@NotNull*/ OutputStream stream) throws java.io.IOException { final long val = value; for (int s = 56; s >= 0; s -= 8) { final byte b = (byte) ((val >>> s) & 0xff); if (b == 0) { break; } final char c = WHITE_CHARS[b >>> 6 & 0x3]; final int len = (b & 0x3f); if (specialChars[c]) { byte[] e; if (c == '\n') { e = ESCAPE_N; } else if (c == '\r') { e = ESCAPE_R; } else { e = ESCAPE_T; } for (int j = 0; j < len; j++) { stream.write(e); } } else { for (int j = 0; j < len; j++) { stream.write(c); } } } } /*@NotNull*/ private static byte[] ESCAPE_N = {'&', '#', 'x', 'A', ';'}; /*@NotNull*/ private static byte[] ESCAPE_R = {'&', '#', 'x', 'D', ';'}; /*@NotNull*/ private static byte[] ESCAPE_T = {'&', '#', 'x', '9', ';'}; }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy