All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.exist.util.CompressedWhitespace Maven / Gradle / Ivy

There is a newer version: 6.3.0
Show newest version
package org.exist.util;

import java.io.Writer;

/**
 * This class provides a compressed representation of a sequence of whitespace characters. The representation
 * is a sequence of bytes: in each byte the top two bits indicate which whitespace character is used
 * (x9, xA, xD, or x20) and the bottom six bits indicate the number of such characters. A zero byte is a filler.
 * We don't compress the sequence if it would occupy more than 8 bytes, because that's the space we've got available
 * in the TinyTree arrays.
 */
public class CompressedWhitespace implements CharSequence {

    /**
     * The whitespace characters that can be encoded: tab (x9), line feed (xA),
     * carriage return (xD) and space (x20). The top two bits of a compressed byte
     * are used as the index into this array.
     * NOTE(review): never reassigned in the visible code; looks like it could be final - confirm.
     */
    private static char[] WHITE_CHARS = {0x09, 0x0A, 0x0D, 0x20};

    /**
     * The packed encoding. Bytes are read from the most significant byte downwards;
     * each non-zero byte describes one run, and a zero byte ends the sequence.
     */
    private long value;

    /**
     * Creates a compressed whitespace sequence around an already-encoded value.
     *
     * @param compressedValue the packed run-length encoding; it is stored as given,
     *                        without any validation
     */
    public CompressedWhitespace(long compressedValue) {
        value = compressedValue;
    }

    /**
     * Attempt to compress a CharSequence
     * @param in the CharSequence to be compressed
     * @return the compressed sequence if it can be compressed; or the original CharSequence otherwise
     */

    public static CharSequence compress(CharSequence in) {
        final int inlen = in.length();
        if (inlen == 0) {
            return in;
        }
        int runlength = 1;
        int outlength = 0;
        for (int i=0; i= 0) {
                if (i == inlen-1 || c != in.charAt(i+1) || runlength == 63) {
                    runlength = 1;
                    outlength++;
                    if (outlength > 8) {
                        return in;
                    }
                } else {
                    runlength++;
                }
            } else {
                return in;
            }
        }
        int ix = 0;
        runlength = 1;
        final int[] out = new int[outlength];
        for (int i=0; i=0; s-=8) {
            final byte b = (byte)((val>>>s) & 0xff);
            if (b == 0) {
                break;
            }
            final char c = WHITE_CHARS[b>>>6 & 0x3];
            final int len = (b & 0x3f);
            for (int j=0; j=0; s-=8) {
            int c = (int)((val>>>s) & 0x3f);
            if (c == 0) {
                break;
            }
            count += c;
        }
        return count;
    }

    /**
     * Returns the whitespace character at the given position of the expanded
     * sequence, without expanding it.
     *
     * The packed value is scanned from the most significant byte downwards. Each
     * non-zero byte contributes a run whose length is held in its low six bits;
     * the first run whose cumulative length exceeds {@code index} supplies the
     * character, selected by the byte's top two bits.
     *
     * @param index the zero-based index of the char value to be returned
     * @return the whitespace character at that index
     * @throws IndexOutOfBoundsException if the index argument is negative or not less than
     *                                   length()
     */
    @Override
    public char charAt(int index) {
        // Without this guard a negative index would satisfy "count > index" on the
        // very first run and silently return that run's character.
        if (index < 0) {
            throw new IndexOutOfBoundsException(index+"");
        }
        int count = 0;
        final long val = value;
        for (int s=56; s>=0; s-=8) {
            final byte b = (byte)((val>>>s) & 0xff);
            if (b == 0) {
                // a zero byte is filler: no further runs follow
                break;
            }
            count += (b & 0x3f);
            if (count > index) {
                // b is sign-extended when promoted to int, so mask after shifting
                return WHITE_CHARS[b>>>6 & 0x3];
            }
        }
        throw new IndexOutOfBoundsException(index+"");
    }

    /**
     * Returns the part of this sequence between two indexes.
     *
     * The work is delegated to the object returned by {@code uncompress(null)},
     * so the result is whatever that object's {@code subSequence} produces for
     * the same arguments.
     *
     * @param start the start index, inclusive
     * @param end   the end index, exclusive
     * @return the subsequence covering {@code start} to {@code end - 1}; its length
     *         is {@code end - start}, so it is empty when {@code start == end}
     * @throws IndexOutOfBoundsException if start or end are negative,
     *                                   if end is greater than length(),
     *                                   or if start is greater than end
     */
    @Override
    public CharSequence subSequence(int start, int end) {
        final CharSequence expanded = uncompress(null);
        return expanded.subSequence(start, end);
    }

    /**
     * Compares this sequence with another object.
     *
     * Two CompressedWhitespace instances are equal when their packed values are
     * identical. For any other argument the decision is delegated to the
     * {@code equals} method of the object returned by {@code uncompress(null)}.
     *
     * @param obj the object to compare with
     * @return true if the objects are considered equal under the rules above
     */
    @Override
    public boolean equals(Object obj) {
        if (!(obj instanceof CompressedWhitespace)) {
            return uncompress(null).equals(obj);
        }
        final CompressedWhitespace other = (CompressedWhitespace) obj;
        return value == other.value;
    }

    /**
     * Returns the hash code of the object produced by {@code uncompress(null)}.
     *
     * @return the hash code of the uncompressed form
     */
    @Override
    public int hashCode() {
        final Object expanded = uncompress(null);
        return expanded.hashCode();
    }

    /**
     * Returns the string form of the object produced by {@code uncompress(null)}.
     *
     * @return the uncompressed form, converted with its own {@code toString}
     */
    @Override
    public String toString() {
        final Object expanded = uncompress(null);
        return expanded.toString();
    }

    /**
     * Write the value to a Writer
     *
     * @param writer the writer
     *
     * @throws java.io.IOException if an error occurs whilst writing
     */
    public void write(Writer writer) throws java.io.IOException {
        final long val = value;
        for (int s=56; s>=0; s-=8) {
            final byte b = (byte)((val>>>s) & 0xff);
            if (b == 0) {
                break;
            }
            final char c = WHITE_CHARS[b>>>6 & 0x3];
            final int len = (b & 0x3f);
            for (int j=0; j=0; s-=8) {
            final byte b = (byte)((val>>>s) & 0xff);
            if (b == 0) {
                break;
            }
            final char c = WHITE_CHARS[b>>>6 & 0x3];
            final int len = (b & 0x3f);
            if (specialChars[c]) {
                String e = "";
                if (c=='\n') {
                    e = "
";
                } else if (c=='\r') {
                    e = "
";
                } else if (c=='\t') {
                    e = "	";
                }
                for (int j=0; j




© 2015 - 2025 Weber Informatics LLC | Privacy Policy