All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.xmlbeans.impl.regex.RangeToken Maven / Gradle / Ivy

There is a newer version: 2024.11.18751.20241128T090041Z-241100
Show newest version
/*   Copyright 2004 The Apache Software Foundation
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.apache.xmlbeans.impl.regex;

/**
 * This class represents a character class such as [a-z] or a period.
 */
final class RangeToken extends Token implements java.io.Serializable {

    int[] ranges;
    boolean sorted;
    boolean compacted;
    RangeToken icaseCache = null;
    int[] map = null;
    int nonMapIndex;

    RangeToken(int type) {
        super(type);
        this.setSorted(false);
    }

    // for RANGE or NRANGE
    protected void addRange(int start, int end) {
        this.icaseCache = null;
        //System.err.println("Token#addRange(): "+start+" "+end);
        int r1, r2;
        if (start <= end) {
            r1 = start;
            r2 = end;
        } else {
            r1 = end;
            r2 = start;
        }

        if (this.ranges == null) {
            this.ranges = new int[2];
            this.ranges[0] = r1;
            this.ranges[1] = r2;
            this.setSorted(true);
        } else {
            int pos = this.ranges.length;
            if (this.ranges[pos - 1] + 1 == r1) {
                this.ranges[pos - 1] = r2;
                return;
            }
            int[] temp = new int[pos + 2];
            System.arraycopy(this.ranges, 0, temp, 0, pos);
            this.ranges = temp;
            if (this.ranges[pos - 1] >= r1) {
                this.setSorted(false);
            }
            this.ranges[pos++] = r1;
            this.ranges[pos] = r2;
            if (!this.sorted) {
                this.sortRanges();
            }
        }
    }

    private boolean isSorted() {
        return this.sorted;
    }

    private void setSorted(boolean sort) {
        this.sorted = sort;
        if (!sort) {
            this.compacted = false;
        }
    }

    private boolean isCompacted() {
        return this.compacted;
    }

    private void setCompacted() {
        this.compacted = true;
    }

    protected void sortRanges() {
        if (this.isSorted()) {
            return;
        }
        if (this.ranges == null) {
            return;
        }
        //System.err.println("Do sorting: "+this.ranges.length);

        // Bubble sort
        // Why? -- In many cases,
        //         this.ranges has few elements.
        for (int i = this.ranges.length - 4; i >= 0; i -= 2) {
            for (int j = 0; j <= i; j += 2) {
                if (this.ranges[j] > this.ranges[j + 2]
                    || this.ranges[j] == this.ranges[j + 2] && this.ranges[j + 1] > this.ranges[j + 3]) {
                    int tmp;
                    tmp = this.ranges[j + 2];
                    this.ranges[j + 2] = this.ranges[j];
                    this.ranges[j] = tmp;
                    tmp = this.ranges[j + 3];
                    this.ranges[j + 3] = this.ranges[j + 1];
                    this.ranges[j + 1] = tmp;
                }
            }
        }
        this.setSorted(true);
    }

    /**
     * this.ranges is sorted.
     */
    @SuppressWarnings("ConstantConditions")
    protected void compactRanges() {
        boolean DEBUG = false;
        if (this.ranges == null || this.ranges.length <= 2) {
            return;
        }
        if (this.isCompacted()) {
            return;
        }
        int base = 0;                           // Index of writing point
        int target = 0;                         // Index of processing point

        while (target < this.ranges.length) {
            if (base != target) {
                this.ranges[base] = this.ranges[target++];
                this.ranges[base + 1] = this.ranges[target++];
            } else {
                target += 2;
            }
            int baseend = this.ranges[base + 1];
            while (target < this.ranges.length) {
                if (baseend + 1 < this.ranges[target]) {
                    break;
                }
                if (baseend + 1 == this.ranges[target]) {
                    if (DEBUG) {
                        System.err.println("Token#compactRanges(): Compaction: [" + this.ranges[base]
                                           + ", " + this.ranges[base + 1]
                                           + "], [" + this.ranges[target]
                                           + ", " + this.ranges[target + 1]
                                           + "] -> [" + this.ranges[base]
                                           + ", " + this.ranges[target + 1]
                                           + "]");
                    }
                    this.ranges[base + 1] = this.ranges[target + 1];
                    baseend = this.ranges[base + 1];
                    target += 2;
                } else if (baseend >= this.ranges[target + 1]) {
                    if (DEBUG) {
                        System.err.println("Token#compactRanges(): Compaction: [" + this.ranges[base]
                                           + ", " + this.ranges[base + 1]
                                           + "], [" + this.ranges[target]
                                           + ", " + this.ranges[target + 1]
                                           + "] -> [" + this.ranges[base]
                                           + ", " + this.ranges[base + 1]
                                           + "]");
                    }
                    target += 2;
                } else if (baseend < this.ranges[target + 1]) {
                    if (DEBUG) {
                        System.err.println("Token#compactRanges(): Compaction: [" + this.ranges[base]
                                           + ", " + this.ranges[base + 1]
                                           + "], [" + this.ranges[target]
                                           + ", " + this.ranges[target + 1]
                                           + "] -> [" + this.ranges[base]
                                           + ", " + this.ranges[target + 1]
                                           + "]");
                    }
                    this.ranges[base + 1] = this.ranges[target + 1];
                    baseend = this.ranges[base + 1];
                    target += 2;
                } else {
                    throw new RuntimeException("Token#compactRanges(): Internel Error: ["
                                               + this.ranges[base]
                                               + "," + this.ranges[base + 1]
                                               + "] [" + this.ranges[target]
                                               + "," + this.ranges[target + 1] + "]");
                }
            } // while
            base += 2;
        }

        if (base != this.ranges.length) {
            int[] result = new int[base];
            System.arraycopy(this.ranges, 0, result, 0, base);
            this.ranges = result;
        }
        this.setCompacted();
    }

    protected void mergeRanges(Token token) {
        RangeToken tok = (RangeToken) token;
        this.sortRanges();
        tok.sortRanges();
        if (tok.ranges == null) {
            return;
        }
        this.icaseCache = null;
        this.setSorted(true);
        if (this.ranges == null) {
            this.ranges = new int[tok.ranges.length];
            System.arraycopy(tok.ranges, 0, this.ranges, 0, tok.ranges.length);
            return;
        }
        int[] result = new int[this.ranges.length + tok.ranges.length];
        for (int i = 0, j = 0, k = 0; i < this.ranges.length || j < tok.ranges.length; ) {
            if (i >= this.ranges.length) {
                result[k++] = tok.ranges[j++];
                result[k++] = tok.ranges[j++];
            } else if (j >= tok.ranges.length) {
                result[k++] = this.ranges[i++];
                result[k++] = this.ranges[i++];
            } else if (tok.ranges[j] < this.ranges[i]
                       || tok.ranges[j] == this.ranges[i] && tok.ranges[j + 1] < this.ranges[i + 1]) {
                result[k++] = tok.ranges[j++];
                result[k++] = tok.ranges[j++];
            } else {
                result[k++] = this.ranges[i++];
                result[k++] = this.ranges[i++];
            }
        }
        this.ranges = result;
    }

    protected void subtractRanges(Token token) {
        if (token.type == NRANGE) {
            this.intersectRanges(token);
            return;
        }
        RangeToken tok = (RangeToken) token;
        if (tok.ranges == null || this.ranges == null) {
            return;
        }
        this.icaseCache = null;
        this.sortRanges();
        this.compactRanges();
        tok.sortRanges();
        tok.compactRanges();

        //System.err.println("Token#substractRanges(): Entry: "+this.ranges.length+", "+tok.ranges.length);

        int[] result = new int[this.ranges.length + tok.ranges.length];
        int wp = 0, src = 0, sub = 0;
        while (src < this.ranges.length && sub < tok.ranges.length) {
            int srcbegin = this.ranges[src];
            int srcend = this.ranges[src + 1];
            int subbegin = tok.ranges[sub];
            int subend = tok.ranges[sub + 1];
            if (srcend < subbegin) {            // Not overlapped
                // src: o-----o
                // sub:         o-----o
                // res: o-----o
                // Reuse sub
                result[wp++] = this.ranges[src++];
                result[wp++] = this.ranges[src++];
            } else if (srcend >= subbegin
                       && srcbegin <= subend) { // Overlapped
                // src:    o--------o
                // sub:  o----o
                // sub:      o----o
                // sub:          o----o
                // sub:  o------------o
                if (subbegin <= srcbegin && srcend <= subend) {
                    // src:    o--------o
                    // sub:  o------------o
                    // res: empty
                    // Reuse sub
                    src += 2;
                } else if (subbegin <= srcbegin) {
                    // src:    o--------o
                    // sub:  o----o
                    // res:       o-----o
                    // Reuse src(=res)
                    this.ranges[src] = subend + 1;
                    sub += 2;
                } else if (srcend <= subend) {
                    // src:    o--------o
                    // sub:          o----o
                    // res:    o-----o
                    // Reuse sub
                    result[wp++] = srcbegin;
                    result[wp++] = subbegin - 1;
                    src += 2;
                } else {
                    // src:    o--------o
                    // sub:      o----o
                    // res:    o-o    o-o
                    // Reuse src(=right res)
                    result[wp++] = srcbegin;
                    result[wp++] = subbegin - 1;
                    this.ranges[src] = subend + 1;
                    sub += 2;
                }
            } else if (subend < srcbegin) {
                // Not overlapped
                // src:          o-----o
                // sub: o----o
                sub += 2;
            } else {
                throw new RuntimeException("Token#subtractRanges(): Internal Error: [" + this.ranges[src]
                                           + "," + this.ranges[src + 1]
                                           + "] - [" + tok.ranges[sub]
                                           + "," + tok.ranges[sub + 1]
                                           + "]");
            }
        }
        while (src < this.ranges.length) {
            result[wp++] = this.ranges[src++];
            result[wp++] = this.ranges[src++];
        }
        this.ranges = new int[wp];
        System.arraycopy(result, 0, this.ranges, 0, wp);
        // this.ranges is sorted and compacted.
    }

    /**
     * @param token Ignore whether it is NRANGE or not.
     */
    protected void intersectRanges(Token token) {
        RangeToken tok = (RangeToken) token;
        if (tok.ranges == null || this.ranges == null) {
            return;
        }
        this.icaseCache = null;
        this.sortRanges();
        this.compactRanges();
        tok.sortRanges();
        tok.compactRanges();

        int[] result = new int[this.ranges.length + tok.ranges.length];
        int wp = 0, src1 = 0, src2 = 0;
        while (src1 < this.ranges.length && src2 < tok.ranges.length) {
            int src1begin = this.ranges[src1];
            int src1end = this.ranges[src1 + 1];
            int src2begin = tok.ranges[src2];
            int src2end = tok.ranges[src2 + 1];
            if (src1end < src2begin) {          // Not overlapped
                // src1: o-----o
                // src2:         o-----o
                // res:  empty
                // Reuse src2
                src1 += 2;
            } else if (src1end >= src2begin
                       && src1begin <= src2end) { // Overlapped
                // src1:    o--------o
                // src2:  o----o
                // src2:      o----o
                // src2:          o----o
                // src2:  o------------o
                if (src2begin <= src1begin && src1end <= src2end) {
                    // src1:    o--------o
                    // src2:  o------------o
                    // res:     o--------o
                    // Reuse src2
                    result[wp++] = src1begin;
                    result[wp++] = src1end;
                    src1 += 2;
                } else if (src2begin <= src1begin) {
                    // src1:    o--------o
                    // src2:  o----o
                    // res:     o--o
                    // Reuse the rest of src1
                    result[wp++] = src1begin;
                    result[wp++] = src2end;
                    this.ranges[src1] = src2end + 1;
                    src2 += 2;
                } else if (src1end <= src2end) {
                    // src1:    o--------o
                    // src2:          o----o
                    // res:           o--o
                    // Reuse src2
                    result[wp++] = src2begin;
                    result[wp++] = src1end;
                    src1 += 2;
                } else {
                    // src1:    o--------o
                    // src2:      o----o
                    // res:       o----o
                    // Reuse the rest of src1
                    result[wp++] = src2begin;
                    result[wp++] = src2end;
                    this.ranges[src1] = src2end + 1;
                }
            } else if (src2end < src1begin) {
                // Not overlapped
                // src1:          o-----o
                // src2: o----o
                src2 += 2;
            } else {
                throw new RuntimeException("Token#intersectRanges(): Internal Error: ["
                                           + this.ranges[src1]
                                           + "," + this.ranges[src1 + 1]
                                           + "] & [" + tok.ranges[src2]
                                           + "," + tok.ranges[src2 + 1]
                                           + "]");
            }
        }
        while (src1 < this.ranges.length) {
            result[wp++] = this.ranges[src1++];
            result[wp++] = this.ranges[src1++];
        }
        this.ranges = new int[wp];
        System.arraycopy(result, 0, this.ranges, 0, wp);
        // this.ranges is sorted and compacted.
    }

    /**
     * for RANGE: Creates complement.
     * for NRANGE: Creates the same meaning RANGE.
     */
    static Token complementRanges(Token token) {
        if (token.type != RANGE && token.type != NRANGE) {
            throw new IllegalArgumentException("Token#complementRanges(): must be RANGE: " + token.type);
        }
        RangeToken tok = (RangeToken) token;
        tok.sortRanges();
        tok.compactRanges();
        int len = tok.ranges.length + 2;
        if (tok.ranges[0] == 0) {
            len -= 2;
        }
        int last = tok.ranges[tok.ranges.length - 1];
        if (last == UTF16_MAX) {
            len -= 2;
        }
        RangeToken ret = Token.createRange();
        ret.ranges = new int[len];
        int wp = 0;
        if (tok.ranges[0] > 0) {
            ret.ranges[wp++] = 0;
            ret.ranges[wp++] = tok.ranges[0] - 1;
        }
        for (int i = 1; i < tok.ranges.length - 2; i += 2) {
            ret.ranges[wp++] = tok.ranges[i] + 1;
            ret.ranges[wp++] = tok.ranges[i + 1] - 1;
        }
        if (last != UTF16_MAX) {
            ret.ranges[wp++] = last + 1;
            ret.ranges[wp] = UTF16_MAX;
        }
        ret.setCompacted();
        return ret;
    }

    synchronized RangeToken getCaseInsensitiveToken() {
        if (this.icaseCache != null) {
            return this.icaseCache;
        }

        RangeToken uppers = this.type == Token.RANGE ? Token.createRange() : Token.createNRange();
        for (int i = 0; i < this.ranges.length; i += 2) {
            for (int ch = this.ranges[i]; ch <= this.ranges[i + 1]; ch++) {
                if (ch > 0xffff) {
                    uppers.addRange(ch, ch);
                } else {
                    char uch = Character.toUpperCase((char) ch);
                    uppers.addRange(uch, uch);
                }
            }
        }
        RangeToken lowers = this.type == Token.RANGE ? Token.createRange() : Token.createNRange();
        for (int i = 0; i < uppers.ranges.length; i += 2) {
            for (int ch = uppers.ranges[i]; ch <= uppers.ranges[i + 1]; ch++) {
                if (ch > 0xffff) {
                    lowers.addRange(ch, ch);
                } else {
                    char uch = Character.toUpperCase((char) ch);
                    lowers.addRange(uch, uch);
                }
            }
        }
        lowers.mergeRanges(uppers);
        lowers.mergeRanges(this);
        lowers.compactRanges();

        this.icaseCache = lowers;
        return lowers;
    }

    boolean match(int ch) {
        if (this.map == null) {
            this.createMap();
        }
        boolean ret;
        if (this.type == RANGE) {
            if (ch < MAPSIZE) {
                return (this.map[ch / 32] & (1 << (ch & 0x1f))) != 0;
            }
            ret = false;
            for (int i = this.nonMapIndex; i < this.ranges.length; i += 2) {
                if (this.ranges[i] <= ch && ch <= this.ranges[i + 1]) {
                    return true;
                }
            }
        } else {
            if (ch < MAPSIZE) {
                return (this.map[ch / 32] & (1 << (ch & 0x1f))) == 0;
            }
            ret = true;
            for (int i = this.nonMapIndex; i < this.ranges.length; i += 2) {
                if (this.ranges[i] <= ch && ch <= this.ranges[i + 1]) {
                    return false;
                }
            }
        }
        return ret;
    }

    private static final int MAPSIZE = 256;

    private void createMap() {
        int asize = MAPSIZE / 32;                 // 32 is the number of bits in `int'.
        // CHANGE(radup) we need a new map, since this is not synchronized
        // and if we init the instance map with 0's it's going to be trouble
        // -this.map = new int[asize];
        // -this.nonMapIndex = this.ranges.length;
        // -for (int i = 0; i < asize; i++) this.map[i] = 0;
        int[] localmap = new int[asize]; // +
        int localnonMapIndex = this.ranges.length; // +
        for (int i = 0; i < asize; i++) {
            localmap[i] = 0; // + redundant
        }
        for (int i = 0; i < this.ranges.length; i += 2) {
            int s = this.ranges[i];
            int e = this.ranges[i + 1];
            if (s < MAPSIZE) {
                for (int j = s; j <= e && j < MAPSIZE; j++) {
                    localmap[j / 32] |= 1 << (j & 0x1f); // s&0x1f : 0-31
                }
            } else {
                localnonMapIndex = i;
                break;
            }
            if (e >= MAPSIZE) {
                localnonMapIndex = i;
                break;
            }
        }
        this.nonMapIndex = localnonMapIndex; // +
        this.map = localmap; // +
        //for (int i = 0;  i < asize;  i ++)  System.err.println("Map: "+Integer.toString(this.map[i], 16));
    }

    public String toString(int options) {
        String ret;
        if (this.type == RANGE) {
            if (this == Token.token_dot) {
                ret = ".";
            } else if (this == Token.token_0to9) {
                ret = "\\d";
            } else if (this == Token.token_wordchars) {
                ret = "\\w";
            } else if (this == Token.token_spaces) {
                ret = "\\s";
            } else {
                StringBuilder sb = new StringBuilder();
                sb.append("[");
                for (int i = 0; i < this.ranges.length; i += 2) {
                    if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0) {
                        sb.append(",");
                    }
                    if (this.ranges[i] == this.ranges[i + 1]) {
                        sb.append(escapeCharInCharClass(this.ranges[i]));
                    } else {
                        sb.append(escapeCharInCharClass(this.ranges[i]));
                        sb.append('-');
                        sb.append(escapeCharInCharClass(this.ranges[i + 1]));
                    }
                }
                sb.append("]");
                ret = sb.toString();
            }
        } else {
            if (this == Token.token_not_0to9) {
                ret = "\\D";
            } else if (this == Token.token_not_wordchars) {
                ret = "\\W";
            } else if (this == Token.token_not_spaces) {
                ret = "\\S";
            } else {
                StringBuilder sb = new StringBuilder();
                sb.append("[^");
                for (int i = 0; i < this.ranges.length; i += 2) {
                    if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0) {
                        sb.append(",");
                    }
                    if (this.ranges[i] == this.ranges[i + 1]) {
                        sb.append(escapeCharInCharClass(this.ranges[i]));
                    } else {
                        sb.append(escapeCharInCharClass(this.ranges[i]));
                        sb.append('-');
                        sb.append(escapeCharInCharClass(this.ranges[i + 1]));
                    }
                }
                sb.append("]");
                ret = sb.toString();
            }
        }
        return ret;
    }

    private static String escapeCharInCharClass(int ch) {
        String ret;
        switch (ch) {
            case '[':
            case ']':
            case '-':
            case '^':
            case ',':
            case '\\':
                ret = "\\" + (char) ch;
                break;
            case '\f':
                ret = "\\f";
                break;
            case '\n':
                ret = "\\n";
                break;
            case '\r':
                ret = "\\r";
                break;
            case '\t':
                ret = "\\t";
                break;
            case 0x1b:
                ret = "\\e";
                break;
            //case 0x0b:  ret = "\\v";  break;
            default:
                if (ch < 0x20) {
                    String pre = "0" + Integer.toHexString(ch);
                    ret = "\\x" + pre.substring(pre.length() - 2);
                } else if (ch >= 0x10000) {
                    String pre = "0" + Integer.toHexString(ch);
                    ret = "\\v" + pre.substring(pre.length() - 6);
                } else {
                    ret = "" + (char) ch;
                }
        }
        return ret;
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy