All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.vladsch.flexmark.util.sequence.builder.tree.Segment Maven / Gradle / Ivy

There is a newer version: 0.64.8
Show newest version
package com.vladsch.flexmark.util.sequence.builder.tree;

import com.vladsch.flexmark.util.sequence.BasedSequence;
import com.vladsch.flexmark.util.sequence.builder.Seg;
import org.jetbrains.annotations.NotNull;

import static com.vladsch.flexmark.util.misc.Utils.escapeJavaString;

/**
 * A single segment of a SegmentedSequence, read from its serialized form stored in a byte[]
 */
public abstract class Segment {
    // @formatter:off
    // Bits 5-7 of a serialized segment's first (header) byte: select the segment kind (one of the 8 TYPE_* kinds below).
    final static int TYPE_MASK              = 0b0000_0000_1110_0000;

    // Bit 4 of the header byte: when set, the low 4 bits of the header byte hold the value itself
    // (anchor offset or text length, both < 16) and no separate size bytes follow.
    final static int TYPE_NO_SIZE_BYTES     = 0b0000_0000_0001_0000;
    // Bits 0-1 of the header byte: (number of bytes used to store the start offset) - 1.
    final static int TYPE_START_BYTES       = 0b0000_0000_0000_0011;
    // Bits 2-3 of the header byte: (number of bytes used to store the length) - 1.
    final static int TYPE_LENGTH_BYTES      = 0b0000_0000_0000_1100;

    // Segment kinds, compared against (header & TYPE_MASK).
    final static int TYPE_ANCHOR            = 0b0000_0000_0000_0000;
    final static int TYPE_BASE              = 0b0000_0000_0010_0000;
    final static int TYPE_TEXT              = 0b0000_0000_0100_0000;
    final static int TYPE_REPEATED_TEXT     = 0b0000_0000_0110_0000;
    final static int TYPE_TEXT_ASCII        = 0b0000_0000_1000_0000;
    final static int TYPE_REPEATED_ASCII    = 0b0000_0000_1010_0000;
    final static int TYPE_REPEATED_SPACE    = 0b0000_0000_1100_0000;
    final static int TYPE_REPEATED_EOL      = 0b0000_0000_1110_0000;

    // Descriptor flags kept only in SegType.flags; they lie above the low byte, so they are dropped
    // when the header is narrowed to a byte in addSegBytes and never appear in the serialized data.
    final static int TYPE_HAS_OFFSET        = 0b0000_0001_0000_0000;
    final static int TYPE_HAS_LENGTH        = 0b0000_0010_0000_0000;
    final static int TYPE_HAS_BOTH          = 0b0000_0011_0000_0000;  // TYPE_HAS_OFFSET | TYPE_HAS_LENGTH
    final static int TYPE_HAS_CHAR          = 0b0000_0100_0000_0000;  // payload: one 2-byte char
    final static int TYPE_HAS_CHARS         = 0b0000_1000_0000_0000;  // payload: 2 bytes per char
    final static int TYPE_HAS_BYTE          = 0b0001_0000_0000_0000;  // payload: one 1-byte char
    final static int TYPE_HAS_BYTES         = 0b0010_0000_0000_0000;  // payload: 1 byte per char

// @formatter:on

    /**
     * Kind of a serialized segment. Each constant combines its TYPE_* kind bits with the TYPE_HAS_*
     * flags describing which fields and payload follow the header byte.
     */
    public enum SegType {
        ANCHOR(TYPE_ANCHOR | TYPE_HAS_OFFSET),
        BASE(TYPE_BASE | TYPE_HAS_BOTH),
        TEXT(TYPE_TEXT | TYPE_HAS_LENGTH | TYPE_HAS_CHARS),
        REPEATED_TEXT(TYPE_REPEATED_TEXT | TYPE_HAS_LENGTH | TYPE_HAS_CHAR),
        TEXT_ASCII(TYPE_TEXT_ASCII | TYPE_HAS_LENGTH | TYPE_HAS_BYTES),
        REPEATED_ASCII(TYPE_REPEATED_ASCII | TYPE_HAS_LENGTH | TYPE_HAS_BYTE),
        REPEATED_SPACE(TYPE_REPEATED_SPACE | TYPE_HAS_LENGTH),
        REPEATED_EOL(TYPE_REPEATED_EOL | TYPE_HAS_LENGTH),
        ;

        /** Kind bits combined with the TYPE_HAS_* descriptor flags of this type. */
        public final int flags;

        SegType(int flags) {
            this.flags = flags;
        }

        /**
         * @param flags bits to test
         * @return true if every bit of {@code flags} is set in this type's flags
         */
        public boolean hasAll(int flags) {
            return (flags & ~this.flags) == 0;
        }

        /** @return true if the serialized form stores a length */
        public boolean hasLength() {
            return (flags & TYPE_HAS_LENGTH) == TYPE_HAS_LENGTH;
        }

        /** @return true if the serialized form stores a start offset */
        public boolean hasOffset() {
            return (flags & TYPE_HAS_OFFSET) == TYPE_HAS_OFFSET;
        }

        /** @return true if the serialized form stores both a start offset and a length */
        public boolean hasBoth() {
            return (flags & TYPE_HAS_BOTH) == TYPE_HAS_BOTH;
        }

        /** @return true if the payload is a single 2-byte char */
        public boolean hasChar() {
            return (flags & TYPE_HAS_CHAR) == TYPE_HAS_CHAR;
        }

        /** @return true if the payload is a run of 2-byte chars */
        public boolean hasChars() {
            return (flags & TYPE_HAS_CHARS) == TYPE_HAS_CHARS;
        }

        /** @return true if the payload is a single 1-byte char */
        public boolean hasByte() {
            return (flags & TYPE_HAS_BYTE) == TYPE_HAS_BYTE;
        }

        /** @return true if the payload is a run of 1-byte chars */
        public boolean hasBytes() {
            return (flags & TYPE_HAS_BYTES) == TYPE_HAS_BYTES;
        }

        /**
         * Maps a header value to its segment type; only the TYPE_MASK bits are considered.
         *
         * @param segTypeMask header byte value (other bits are ignored)
         * @return segment type selected by the kind bits
         */
        public static SegType fromTypeMask(int segTypeMask) {
            final SegType segType;

            switch (segTypeMask & TYPE_MASK) {
                case TYPE_ANCHOR:
                    segType = ANCHOR;
                    break;
                case TYPE_BASE:
                    segType = BASE;
                    break;
                case TYPE_TEXT:
                    segType = TEXT;
                    break;
                case TYPE_REPEATED_TEXT:
                    segType = REPEATED_TEXT;
                    break;
                case TYPE_TEXT_ASCII:
                    segType = TEXT_ASCII;
                    break;
                case TYPE_REPEATED_ASCII:
                    segType = REPEATED_ASCII;
                    break;
                case TYPE_REPEATED_SPACE:
                    segType = REPEATED_SPACE;
                    break;
                case TYPE_REPEATED_EOL:
                    segType = REPEATED_EOL;
                    break;
                default:
                    throw new IllegalStateException(String.format("Invalid text type %02x", segTypeMask));
            }

            return segType;
        }
    }

    /**
     * Tests whether every bit of {@code mask} is set in {@code flags}.
     *
     * @param flags value to test
     * @param mask  bits that must all be set
     * @return true if no bit of {@code mask} is missing from {@code flags}
     */
    public boolean hasAll(int flags, int mask) {
        return (mask & ~flags) == 0;
    }

    protected final int pos;  // position of segment in aggr length table
    protected final byte[] bytes;  // array holding the serialized segment data
    protected final int byteOffset;  // index in bytes of this segment's header (type) byte
    protected final int startIndex;  // index of this segment's first char, in segmented sequence coordinates

    /**
     * Stores the location of a serialized segment; the array is kept by reference, not copied.
     *
     * @param pos        position of segment in aggr length table
     * @param bytes      array holding the serialized segment
     * @param byteOffset index in {@code bytes} of the segment's header byte
     * @param startIndex index of the segment's first char in segmented sequence coordinates
     */
    public Segment(int pos, byte[] bytes, int byteOffset, int startIndex) {
        this.bytes = bytes;
        this.byteOffset = byteOffset;
        this.startIndex = startIndex;
        this.pos = pos;
    }

    /**
     * @return position of this segment in the aggr length table, as given at construction
     */
    public int getPos() {
        return this.pos;
    }

    /**
     * @return the array holding the serialized segment; this is the array given at construction, not a copy
     */
    public byte[] getBytes() {
        return this.bytes;
    }

    /**
     * @return index in {@link #getBytes()} of this segment's header byte
     */
    public final int getByteOffset() {
        return this.byteOffset;
    }

    /**
     * @return index of this segment's first char in segmented sequence coordinates
     */
    public final int getStartIndex() {
        return this.startIndex;
    }

    /**
     * @return index just past this segment's last char in segmented sequence coordinates
     */
    public final int getEndIndex() {
        final int segmentLength = length();
        return startIndex + segmentLength;
    }

    /**
     * @param index index in segmented sequence coordinates
     * @return true if {@code index} lies outside {@code [startIndex, startIndex + length())}
     */
    public boolean notInSegment(int index) {
        return !(index >= startIndex && index < startIndex + length());
    }

    /**
     * @param offset offset to test against this segment's offset range
     * @return true if {@code offset} lies outside {@code [getStartOffset(), getEndOffset())}
     */
    public boolean offsetNotInSegment(int offset) {
        if (offset < getStartOffset()) {
            return true;
        }
        return offset >= getEndOffset();
    }

    /**
     * @return segment type decoded from the header byte at {@code bytes[byteOffset]}
     */
    public final SegType getType() {
        final byte header = bytes[byteOffset];
        return SegType.fromTypeMask(header);
    }

    /**
     * @return number of bytes of the serialized form, computed from this segment's type, start offset and length
     */
    public final int getByteLength() {
        final SegType segType = getType();
        final int segStart = getStartOffset();
        final int segLength = length();
        return getSegByteLength(segType, segStart, segLength);
    }

    /** @return number of characters in this segment */
    public abstract int length();
    /** @return true if the segment refers to the base sequence (true for Base, false for Text in this file) */
    public abstract boolean isBase();
    /** @return true if the segment is a zero-length anchor (Base reports this when start and end offsets are equal) */
    public abstract boolean isAnchor();
    /** @return true if the segment carries its own text (true for Text, false for Base in this file) */
    public abstract boolean isText();
    /** @return true for text stored in 1-byte/ASCII form (TEXT_ASCII, REPEATED_ASCII, REPEATED_SPACE, REPEATED_EOL); false for Base */
    public abstract boolean isFirst256Start();
    /** @return true for text made of one repeated character (the REPEATED_* types); false for Base */
    public abstract boolean isRepeatedTextEnd();
    /** @return start offset in the base sequence; Text segments return -1 */
    public abstract int getStartOffset();
    /** @return end offset in the base sequence; Text segments return -1 */
    public abstract int getEndOffset();
    /** @return characters of this segment, indexed from 0 */
    public abstract CharSequence getCharSequence();
    /**
     * get char at index
     *
     * @param index index in segmented sequence coordinates. index offset must be subtracted to convert to segment coordinates
     * @return character at given index in segmented sequence
     */
    public abstract char charAt(int index);

    /**
     * Short description of the segment: {@code [start)} for an anchor, {@code [start, end)} for a base
     * range, {@code NNx'c'} for repeated text longer than one char, otherwise the quoted and escaped text
     * (only the first and last 10 chars when longer than 20). Text for which {@link #isFirst256Start()}
     * is true is prefixed with {@code a:}.
     */
    @Override
    public String toString() {
        if (isBase()) {
            final String range = isAnchor()
                    ? String.valueOf(getStartOffset())
                    : getStartOffset() + ", " + getEndOffset();
            return "[" + range + ")";
        }

        final CharSequence text = getCharSequence();

        if (isRepeatedTextEnd() && length() > 1) {
            final String prefix = isFirst256Start() ? "a:" : "";
            return prefix + length() + "x'" + escapeJavaString(text.subSequence(0, 1)) + "'";
        }

        final int textLength = text.length();
        final String shown;
        if (textLength <= 20) {
            shown = text.toString();
        } else {
            // long text: keep only the head and tail, joined by an ellipsis
            shown = text.subSequence(0, 10).toString() + "…" + text.subSequence(textLength - 10, textLength).toString();
        }

        final String opening = isFirst256Start() ? "a:'" : "'";
        return opening + escapeJavaString(shown) + "'";
    }

    /**
     * Segment that refers to the base sequence: either a range {@code [startOffset, endOffset)} of it or a
     * zero-length anchor at a single offset. The offsets are decoded from the serialized bytes on construction.
     */
    static class Base extends Segment {
        protected final int startOffset;
        protected final int endOffset;
        protected final @NotNull BasedSequence baseSeq;

        /**
         * Decodes an ANCHOR or BASE segment.
         *
         * @param pos           position of segment in aggr length table
         * @param bytes         array holding the serialized segment
         * @param byteOffset    index in {@code bytes} of the segment's header byte
         * @param indexOffset   index of the segment's first char in segmented sequence coordinates
         * @param basedSequence base sequence the decoded offsets refer to
         */
        public Base(int pos, byte[] bytes, int byteOffset, int indexOffset, @NotNull BasedSequence basedSequence) {
            super(pos, bytes, byteOffset, indexOffset);

            baseSeq = basedSequence;

            // header byte: kind in the TYPE_MASK bits, size information in the low bits
            int type = bytes[byteOffset++] & 0x00ff;
            if ((type & TYPE_MASK) == TYPE_ANCHOR) {
                if (hasAll(type, TYPE_NO_SIZE_BYTES)) {
                    // offset < 16 is stored directly in the low 4 bits of the header
                    endOffset = startOffset = type & 0x000f;
                } else {
                    // low 2 bits hold (number of offset bytes - 1)
                    int intBytes = type & TYPE_START_BYTES;
                    endOffset = startOffset = getInt(bytes, byteOffset, intBytes + 1);
                }
            } else {
                assert !hasAll(type, TYPE_NO_SIZE_BYTES);

                // start offset bytes come first, followed by the length bytes
                int intBytes = type & TYPE_START_BYTES;
                startOffset = getInt(bytes, byteOffset, intBytes + 1);
                byteOffset += intBytes + 1;

                int lengthBytes = (type & TYPE_LENGTH_BYTES) >> 2;
                endOffset = startOffset + getInt(bytes, byteOffset, lengthBytes + 1);
            }
        }

        @Override
        public int length() {
            return endOffset - startOffset;
        }

        @Override
        public boolean isBase() {
            return true;
        }

        /**
         * @return true when the segment is empty, i.e. start and end offsets are equal
         */
        @Override
        public boolean isAnchor() {
            return startOffset == endOffset;
        }

        @Override
        public boolean isText() {
            return false;
        }

        @Override
        public boolean isFirst256Start() {
            return false;
        }

        @Override
        public boolean isRepeatedTextEnd() {
            return false;
        }

        @Override
        public int getStartOffset() {
            return startOffset;
        }

        @Override
        public int getEndOffset() {
            return endOffset;
        }

        /**
         * @param index index in segmented sequence coordinates
         * @return character of the base sequence corresponding to {@code index}
         * @throws IndexOutOfBoundsException if {@code index} is outside {@code [startIndex, startIndex + length())}
         */
        @Override
        public char charAt(int index) {
            if (index < startIndex || index - startIndex >= length()) {
                // parenthesize the sum so the upper bound is added numerically, not concatenated as text
                throw new IndexOutOfBoundsException("index " + index + " out of bounds [" + startIndex + ", " + (startIndex + length()) + ")");
            }
            return baseSeq.charAt(startOffset + index - startIndex);
        }

        @Override
        public CharSequence getCharSequence() {
            return baseSeq.subSequence(startOffset, endOffset);
        }
    }

    /**
     * Common part of the char sequences that read their characters from the serialized byte array.
     * A sub-sequence shares the array and {@code byteOffset} and differs only in {@code startOffset} and length.
     */
    abstract static class TextCharSequenceBase implements CharSequence {
        protected final byte[] bytes;
        protected final int byteOffset;  // byte offset of first byte of chars for original base sequence
        protected final int startOffset; // offset, in chars, of this sequence's first char from byteOffset
        protected final int length;

        public TextCharSequenceBase(byte[] bytes, int byteOffset, int startOffset, int length) {
            this.length = length;
            this.startOffset = startOffset;
            this.byteOffset = byteOffset;
            this.bytes = bytes;
        }

        @Override
        public int length() {
            return length;
        }

        @Override
        public abstract char charAt(int index);

        /**
         * Creates a sequence of the same kind over the same bytes.
         *
         * @param startOffset offset, in chars, of the first char from {@code byteOffset}
         * @param length      number of chars
         */
        abstract CharSequence create(int startOffset, int length);

        @Override
        public CharSequence subSequence(int startIndex, int endIndex) {
            final boolean validRange = startIndex >= 0 && startIndex <= endIndex && endIndex <= length;
            if (!validRange) {
                throw new IndexOutOfBoundsException("Invalid index range [" + startIndex + ", " + endIndex + "] out of bounds [0, " + length() + ")");
            }
            return create(startOffset + startIndex, endIndex - startIndex);
        }

        @NotNull
        @Override
        public String toString() {
            final StringBuilder out = new StringBuilder();
            int i = 0;
            while (i < length) {
                out.append(charAt(i));
                i++;
            }
            return out.toString();
        }
    }

    /**
     * Char sequence over text serialized as 2 bytes per char.
     */
    static class TextCharSequence extends TextCharSequenceBase {
        public TextCharSequence(byte[] bytes, int byteOffset, int startOffset, int length) {
            super(bytes, byteOffset, startOffset, length);
        }

        @Override
        public char charAt(int index) {
            if (index >= 0 && index < length) {
                // each char occupies two bytes
                return getChar(bytes, byteOffset + (startOffset + index) * 2);
            }
            throw new IndexOutOfBoundsException("index " + index + " out of bounds [0, " + length + ")");
        }

        @Override
        CharSequence create(int startOffset, int length) {
            return new TextCharSequence(bytes, byteOffset, startOffset, length);
        }
    }

    /**
     * Char sequence over text serialized as 1 byte per char; each byte is read as an unsigned value.
     */
    static class TextAsciiCharSequence extends TextCharSequenceBase {
        public TextAsciiCharSequence(byte[] bytes, int byteOffset, int startOffset, int length) {
            super(bytes, byteOffset, startOffset, length);
        }

        @Override
        public char charAt(int index) {
            if (index >= 0 && index < length) {
                final byte stored = bytes[byteOffset + startOffset + index];
                return (char) (stored & 0x00ff);
            }
            throw new IndexOutOfBoundsException("index " + index + " out of bounds [0, " + length + ")");
        }

        @Override
        CharSequence create(int startOffset, int length) {
            return new TextAsciiCharSequence(bytes, byteOffset, startOffset, length);
        }
    }

    /**
     * Char sequence consisting of one character repeated {@code length} times.
     */
    static class TextRepeatedSequence implements CharSequence {
        protected final char c;
        protected final int length;

        public TextRepeatedSequence(char c, int length) {
            this.length = length;
            this.c = c;
        }

        @Override
        public int length() {
            return length;
        }

        @Override
        public char charAt(int index) {
            if (index >= 0 && index < length) {
                return c;
            }
            throw new IndexOutOfBoundsException("index " + index + " out of bounds [0, " + length + ")");
        }

        @Override
        public CharSequence subSequence(int startIndex, int endIndex) {
            final boolean validRange = startIndex >= 0 && startIndex <= endIndex && endIndex <= length;
            if (!validRange) {
                throw new IndexOutOfBoundsException("Invalid index range [" + startIndex + ", " + endIndex + "] out of bounds [0, " + length() + ")");
            }
            return new TextRepeatedSequence(c, endIndex - startIndex);
        }

        @NotNull
        @Override
        public String toString() {
            final StringBuilder out = new StringBuilder();
            int remaining = length;
            while (remaining > 0) {
                out.append(c);
                remaining--;
            }
            return out.toString();
        }
    }

    /**
     * Segment carrying its own text rather than referring to the base sequence. The text is decoded
     * lazily through a {@link CharSequence} view over the serialized bytes, or is a single repeated char.
     */
    static class Text extends Segment {
        protected final @NotNull CharSequence chars;

        /**
         * Decodes one of the text segment kinds.
         *
         * @param pos         position of segment in aggr length table
         * @param bytes       array holding the serialized segment
         * @param byteOffset  index in {@code bytes} of the segment's header byte
         * @param indexOffset index of the segment's first char in segmented sequence coordinates
         * @throws IllegalStateException if the header's kind bits are not a text kind
         */
        public Text(int pos, byte[] bytes, int byteOffset, int indexOffset) {
            super(pos, bytes, byteOffset, indexOffset);
            int type = bytes[byteOffset++] & 0x00ff;
            int segTypeMask = type & TYPE_MASK;

            int length;

            if (hasAll(type, TYPE_NO_SIZE_BYTES)) {
                // length < 16 is stored directly in the low 4 bits of the header
                length = type & 0x000f;
            } else {
                // bits 2-3 hold (number of length bytes - 1)
                int lengthBytes = (type & TYPE_LENGTH_BYTES) >> 2;
                length = getInt(bytes, byteOffset, lengthBytes + 1);
                byteOffset += lengthBytes + 1;
            }

            // byteOffset now points at the payload, if the kind has one
            switch (segTypeMask) {
                case TYPE_TEXT:
                    chars = new TextCharSequence(bytes, byteOffset, 0, length);
                    break;

                case TYPE_TEXT_ASCII:
                    chars = new TextAsciiCharSequence(bytes, byteOffset, 0, length);
                    break;

                case TYPE_REPEATED_TEXT:
                    chars = new TextRepeatedSequence(getChar(bytes, byteOffset), length);
                    break;

                case TYPE_REPEATED_ASCII:
                    chars = new TextRepeatedSequence((char) (0x00ff & bytes[byteOffset]), length);
                    break;

                case TYPE_REPEATED_SPACE:
                    chars = new TextRepeatedSequence(' ', length);
                    break;

                case TYPE_REPEATED_EOL:
                    chars = new TextRepeatedSequence('\n', length);
                    break;

                default:
                    throw new IllegalStateException("Invalid text type " + segTypeMask);
            }
        }

        @Override
        public int length() {
            return chars.length();
        }

        /**
         * @param index index in segmented sequence coordinates
         * @return character of this segment's text corresponding to {@code index}
         * @throws IndexOutOfBoundsException if {@code index} is outside {@code [startIndex, startIndex + length())}
         */
        @Override
        public char charAt(int index) {
            if (index < startIndex || index - startIndex >= chars.length()) {
                // parenthesize the sum so the upper bound is added numerically, not concatenated as text
                throw new IndexOutOfBoundsException("index " + index + " out of bounds [" + startIndex + ", " + (startIndex + chars.length()) + ")");
            }
            return chars.charAt(index - startIndex);
        }

        @Override
        public boolean isBase() {
            return false;
        }

        @Override
        public boolean isAnchor() {
            return false;
        }

        @Override
        public boolean isText() {
            return true;
        }

        /**
         * @return kind bits of this segment's header byte
         */
        int textType() {
            return bytes[byteOffset] & TYPE_MASK;
        }

        /**
         * @return true for the kinds stored in 1-byte form: TEXT_ASCII, REPEATED_ASCII, REPEATED_SPACE, REPEATED_EOL
         */
        @Override
        public boolean isFirst256Start() {
            int textType = textType();
            return textType == TYPE_TEXT_ASCII || textType == TYPE_REPEATED_ASCII || textType == TYPE_REPEATED_SPACE || textType == TYPE_REPEATED_EOL;
        }

        /**
         * @return true for the repeated-char kinds: REPEATED_TEXT, REPEATED_ASCII, REPEATED_SPACE, REPEATED_EOL
         */
        @Override
        public boolean isRepeatedTextEnd() {
            int textType = textType();
            return textType == TYPE_REPEATED_TEXT || textType == TYPE_REPEATED_ASCII || textType == TYPE_REPEATED_SPACE || textType == TYPE_REPEATED_EOL;
        }

        /**
         * @return always -1, text segments have no offset in the base sequence
         */
        @Override
        public int getStartOffset() {
            return -1;
        }

        /**
         * @return always -1, text segments have no offset in the base sequence
         */
        @Override
        public int getEndOffset() {
            return -1;
        }

        @NotNull
        @Override
        public CharSequence getCharSequence() {
            return chars;
        }
    }

    /**
     * Creates the segment object for the serialized segment whose header byte is at {@code byteOffset}.
     *
     * @param bytes         array holding the serialized segment
     * @param byteOffset    index in {@code bytes} of the segment's header byte
     * @param pos           position of segment in aggr length table
     * @param indexOffset   index of the segment's first char in segmented sequence coordinates
     * @param basedSequence base sequence, used by anchor and base segments
     * @return {@link Base} for ANCHOR/BASE kinds, {@link Text} for all text kinds
     */
    public static Segment getSegment(byte[] bytes, int byteOffset, int pos, int indexOffset, @NotNull BasedSequence basedSequence) {
        final int kind = bytes[byteOffset] & TYPE_MASK;

        if (kind == TYPE_ANCHOR || kind == TYPE_BASE) {
            return new Base(pos, bytes, byteOffset, indexOffset, basedSequence);
        }

        switch (kind) {
            case TYPE_TEXT:
            case TYPE_REPEATED_TEXT:
            case TYPE_TEXT_ASCII:
            case TYPE_REPEATED_ASCII:
            case TYPE_REPEATED_SPACE:
            case TYPE_REPEATED_EOL:
                return new Text(pos, bytes, byteOffset, indexOffset);

            default:
                throw new IllegalStateException("Invalid text type " + kind);
        }
    }

    /**
     * Chooses the serialized type for a builder segment.
     *
     * @param seg       segment to classify
     * @param textChars text the segment's text start refers to; read only for repeated 1-byte text
     * @return type to serialize {@code seg} as
     * @throws IllegalStateException if {@code seg} is neither base nor text
     */
    public static SegType getSegType(@NotNull Seg seg, @NotNull CharSequence textChars) {
        if (seg.isBase()) {
            return seg.isAnchor() ? SegType.ANCHOR : SegType.BASE;
        }

        if (!seg.isText()) {
            throw new IllegalStateException("Unknown seg type " + seg);
        }

        final boolean first256Start = seg.isFirst256Start();
        final boolean repeatedTextEnd = seg.isRepeatedTextEnd();

        if (!first256Start) {
            return repeatedTextEnd ? SegType.REPEATED_TEXT : SegType.TEXT;
        }

        // ascii text
        if (!repeatedTextEnd) {
            return SegType.TEXT_ASCII;
        }

        // repeated chars: space and EOL have dedicated types that need no payload
        switch (textChars.charAt(seg.getTextStart())) {
            case ' ':
                return SegType.REPEATED_SPACE;
            case '\n':
                return SegType.REPEATED_EOL;
            default:
                return SegType.REPEATED_ASCII;
        }
    }

    /**
     * Number of extra bytes needed to store an offset that may be packed into the header byte.
     *
     * @param offset offset to store
     * @return 0 if {@code offset < 16} (fits in the header byte), otherwise 1 to 4
     */
    public static int getOffsetBytes(int offset) {
        if (offset < 16) {
            return 0;
        }
        if (offset < 256) {
            return 1;
        }
        if (offset < 65536) {
            return 2;
        }
        return offset < 65536 * 256 ? 3 : 4;
    }

    /**
     * Number of extra bytes needed to store a length that may be packed into the header byte.
     *
     * @param length length to store
     * @return 0 if {@code length < 16} (fits in the header byte), otherwise 1 to 4
     */
    public static int getLengthBytes(int length) {
        if (length >= 65536 * 256) {
            return 4;
        }
        if (length >= 65536) {
            return 3;
        }
        if (length >= 256) {
            return 2;
        }
        return length >= 16 ? 1 : 0;
    }

    /**
     * Number of bytes needed to store a value that always gets at least one byte.
     *
     * @param length value to store
     * @return 1 if {@code length < 256}, 2 if {@code < 65536}, 3 if {@code < 16777216}, otherwise 4
     */
    public static int getIntBytes(int length) {
        if (length < 256) {
            return 1;
        }
        if (length < 65536) {
            return 2;
        }
        return length < 65536 * 256 ? 3 : 4;
    }

    /**
     * Computes the serialized size of a segment: one header byte, plus size bytes, plus payload.
     *
     * @param segType   type of the segment
     * @param segStart  start offset, used only by types that store an offset
     * @param segLength length of the segment
     * @return number of bytes the serialized segment occupies
     */
    public static int getSegByteLength(@NotNull Segment.SegType segType, int segStart, int segLength) {
        // header byte
        int byteCount = 1;

        // size bytes
        if (segType.hasBoth()) {
            byteCount += getIntBytes(segStart) + getIntBytes(segLength);
        } else if (segType.hasOffset()) {
            byteCount += getOffsetBytes(segStart);
        } else if (segType.hasLength()) {
            byteCount += getLengthBytes(segLength);
        }

        // payload bytes
        if (segType.hasChar()) {
            byteCount += 2;
        } else if (segType.hasChars()) {
            byteCount += 2 * segLength;
        } else if (segType.hasByte()) {
            byteCount += 1;
        } else if (segType.hasBytes()) {
            byteCount += segLength;
        }

        return byteCount;
    }

    /**
     * Computes the serialized size of a builder segment.
     *
     * @param seg       segment to measure
     * @param textChars text used to determine the segment's type
     * @return number of bytes the serialized segment occupies
     */
    public static int getSegByteLength(@NotNull Seg seg, @NotNull CharSequence textChars) {
        return getSegByteLength(getSegType(seg, textChars), seg.getSegStart(), seg.length());
    }

    /**
     * Writes the low {@code count} bytes of {@code value}, most significant byte first.
     *
     * @param bytes  destination array
     * @param offset index of the first byte to write
     * @param value  value to store
     * @param count  number of bytes to write, 1 to 4; any other count writes nothing
     * @return index just past the last byte written
     */
    public static int addIntBytes(byte[] bytes, int offset, int value, int count) {
        if (count < 1 || count > 4) {
            return offset;
        }

        int cursor = offset;
        for (int shift = 8 * (count - 1); shift >= 0; shift -= 8) {
            bytes[cursor++] = (byte) (value >>> shift);
        }
        return cursor;
    }

    /**
     * Reads a value stored in {@code count} bytes, most significant byte first.
     *
     * @param bytes  source array
     * @param offset index of the first byte to read
     * @param count  number of bytes to read, 1 to 4; any other count reads nothing
     * @return decoded value, 0 when {@code count} is outside 1 to 4
     */
    public static int getInt(byte[] bytes, int offset, int count) {
        if (count < 1 || count > 4) {
            return 0;
        }

        int value = 0;
        for (int i = 0; i < count; i++) {
            value = (value << 8) | (bytes[offset + i] & 0x00ff);
        }
        return value;
    }

    /**
     * Writes a char as two bytes, high byte first.
     *
     * @param bytes  destination array
     * @param offset index of the first byte to write
     * @param c      char to store
     * @return {@code offset + 2}
     */
    public static int addChar(byte[] bytes, int offset, char c) {
        bytes[offset] = (byte) (c >>> 8);
        bytes[offset + 1] = (byte) c;
        return offset + 2;
    }

    /**
     * Reads a char stored as two bytes, high byte first.
     *
     * @param bytes  source array
     * @param offset index of the high byte
     * @return decoded char
     */
    public static char getChar(byte[] bytes, int offset) {
        final int high = bytes[offset] & 0x00ff;
        final int low = bytes[offset + 1] & 0x00ff;
        return (char) ((high << 8) | low);
    }

    /**
     * Writes {@code chars[start, end)} as two bytes per char, high byte first.
     *
     * @param bytes  destination array
     * @param offset index of the first byte to write
     * @param chars  source of the characters
     * @param start  index of the first char to write
     * @param end    index just past the last char to write
     * @return index just past the last byte written
     */
    public static int addChars(byte[] bytes, int offset, @NotNull CharSequence chars, int start, int end) {
        int cursor = offset;
        int i = start;
        while (i < end) {
            final char c = chars.charAt(i++);
            bytes[cursor++] = (byte) (c >>> 8);
            bytes[cursor++] = (byte) c;
        }
        return cursor;
    }

    /**
     * Writes a char as a single byte; the char is asserted to be below 256.
     *
     * @param bytes  destination array
     * @param offset index of the byte to write
     * @param c      char to store
     * @return {@code offset + 1}
     */
    public static int addCharAscii(byte[] bytes, int offset, char c) {
        assert c < 256;
        bytes[offset] = (byte) c;
        return offset + 1;
    }

    /**
     * Writes {@code chars[start, end)} as one byte per char; each char is asserted to be below 256.
     *
     * @param bytes  destination array
     * @param offset index of the first byte to write
     * @param chars  source of the characters
     * @param start  index of the first char to write
     * @param end    index just past the last char to write
     * @return index just past the last byte written
     */
    public static int addCharsAscii(byte[] bytes, int offset, @NotNull CharSequence chars, int start, int end) {
        int cursor = offset;
        int i = start;
        while (i < end) {
            final char c = chars.charAt(i++);
            assert c < 256;
            bytes[cursor++] = (byte) c;
        }
        return cursor;
    }

    /**
     * Reads a char stored as a single byte, treating the byte as unsigned.
     *
     * @param bytes  source array
     * @param offset index of the byte to read
     * @return char in the range 0 to 255
     */
    public static char getCharAscii(byte[] bytes, int offset) {
        final int unsigned = bytes[offset] & 0x00ff;
        return (char) unsigned;
    }

    /**
     * Serializes a builder segment: header byte, then size bytes (offset and/or length unless packed
     * into the header), then the text payload if the segment type has one.
     *
     * @param bytes     destination array
     * @param offset    index of the first byte to write
     * @param seg       segment to serialize
     * @param textChars text the segment's text start/end refer to
     * @return index just past the last byte written
     */
    public static int addSegBytes(byte[] bytes, int offset, @NotNull Seg seg, @NotNull CharSequence textChars) {
        final SegType segType = getSegType(seg, textChars);
        final int segLength = seg.length();
        // only the low byte of the flags reaches the header, narrowing drops the TYPE_HAS_* bits
        final int typeFlags = segType.flags;
        int cursor = offset;

        if (segType.hasOffset()) {
            final int segStart = seg.getStart();

            if (!segType.hasLength()) {
                // offset only
                final int offsetBytes = getOffsetBytes(segStart);
                if (offsetBytes != 0) {
                    bytes[cursor++] = (byte) (typeFlags | (offsetBytes - 1));
                    cursor = addIntBytes(bytes, cursor, segStart, offsetBytes);
                } else {
                    // small offset is packed into the header byte
                    assert segStart < 16;
                    bytes[cursor++] = (byte) (typeFlags | TYPE_NO_SIZE_BYTES | segStart);
                }
            } else {
                // offset and length: both always get at least one byte
                final int offsetBytes = getIntBytes(segStart);
                final int intBytes = getIntBytes(segLength);

                bytes[cursor++] = (byte) (typeFlags | (offsetBytes - 1) | ((intBytes - 1) << 2));
                cursor = addIntBytes(bytes, cursor, segStart, offsetBytes);
                cursor = addIntBytes(bytes, cursor, segLength, intBytes);
            }
        } else if (segType.hasLength()) {
            // length only
            final int lengthBytes = getLengthBytes(segLength);
            if (lengthBytes != 0) {
                bytes[cursor++] = (byte) (typeFlags | ((lengthBytes - 1) << 2));
                cursor = addIntBytes(bytes, cursor, segLength, lengthBytes);
            } else {
                // small length is packed into the header byte
                assert segLength < 16;
                bytes[cursor++] = (byte) (typeFlags | TYPE_NO_SIZE_BYTES | segLength);
            }
        }

        // payload
        if (segType.hasChar()) {
            cursor = addChar(bytes, cursor, textChars.charAt(seg.getTextStart()));
        } else if (segType.hasChars()) {
            cursor = addChars(bytes, cursor, textChars, seg.getTextStart(), seg.getTextEnd());
        } else if (segType.hasByte()) {
            cursor = addCharAscii(bytes, cursor, textChars.charAt(seg.getTextStart()));
        } else if (segType.hasBytes()) {
            cursor = addCharsAscii(bytes, cursor, textChars, seg.getTextStart(), seg.getTextEnd());
        }

        return cursor;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy