// org.bidib.wizard.common.highlight.Scanner (listing header: Maven / Gradle / Ivy, "Go to download")

package org.bidib.wizard.common.highlight;

import java.util.HashMap;
import java.util.Locale;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Public domain, no restrictions, Ian Holyer, University of Bristol.

/**
 * 
 * <p>
 * A Scanner object provides a lexical analyser and a resulting token array. Incremental rescanning is supported, e.g.
 * for use in a token colouring editor. This is a base class dealing with plain text, which can be extended to support
 * other languages.
 * </p>
 * <p>
 * The actual text is assumed to be held elsewhere, e.g. in a document. The {@code change()} method is called to
 * report the position and length of a change in the text, and the {@code scan()} method is called to perform
 * scanning or rescanning. For example, to scan an entire document held in a character array {@code text} in one
 * go:
 * </p>
 * 
 * <pre>
 * scanner.change(0, 0, text.length);
 * scanner.scan(text, 0, text.length);
 * </pre>
 * 
 * <p>
 * For incremental scanning, the {@code position()} method is used to find the text position at which rescanning
 * should start. For example, a syntax highlighter might contain this code:
 * </p>
 * 
 * <pre>{@code
 * // Where to start rehighlighting, and a segment object
 * int firstRehighlightToken;
 * Segment segment;
 * 
 * ...
 * 
 * // Whenever the text changes, e.g. on an insert or remove or read.
 * firstRehighlightToken = scanner.change(offset, oldLength, newLength);
 * repaint();
 * 
 * ...
 * 
 * // in repaintComponent
 * int offset = scanner.position();
 * if (offset < 0) return;
 * int tokensToRedo = 0;
 * int amount = 100;
 * while (tokensToRedo == 0 && offset >= 0)
 * {
 *    int length = doc.getLength() - offset;
 *    if (length > amount) length = amount;
 *    try { doc.getText(offset, length, text); }
 *    catch (BadLocationException e) { return; }
 *    tokensToRedo = scanner.scan(text.array, text.offset, text.count);
 *    offset = scanner.position();
 *    amount = 2*amount;
 * }
 * for (int i = 0; i < tokensToRedo; i++)
 * {
 *    Token t = scanner.getToken(firstRehighlightToken + i);
 *    int length = t.symbol.name.length();
 *    int type = t.symbol.type;
 *    doc.setCharacterAttributes (t.position, length, styles[type], false);
 * }
 * firstRehighlightToken += tokensToRedo;
 * if (offset >= 0) repaint(2);
 * }</pre>
 * 
 * <p>
 * Note that {@code change} can be called at any time, even between calls to {@code scan}. Only a small number
 * of characters are passed to {@code scan} at a time, so that only a small burst of scanning is done; this prevents
 * the program's user interface from freezing.
 * </p>
 */
public abstract class Scanner implements TokenTypes {

    private static final Logger LOGGER = LoggerFactory.getLogger(Scanner.class);

    /**
     * Read one token from the current text buffer, beginning at {@link #start} and never reading at or beyond
     * {@link #end}. The method moves {@link #start} past the token and returns the type of the token just scanned.
     * <p>
     * The scanner state is a representative token type. It is either the state left after the last call to read, or the
     * type of the old token at the same position if rescanning, or WHITESPACE if at the start of a document. The method
     * succeeds in all cases, returning whitespace or comment or error tokens where necessary. Each line of a multi-line
     * comment is treated as a separate token, to improve incremental rescanning. If the buffer does not extend to the
     * end of the document, the last token returned for the buffer may be incomplete.
     * </p>
     * <p>
     * The read method can be overridden to implement different languages. This default version ignores the state and
     * splits plain text into whitespace runs, words, numbers and single punctuation characters.
     * </p>
     * 
     * @return the token type: WHITESPACE, WORD, NUMBER, PUNCTUATION or UNRECOGNIZED
     */
    protected int read() {
        char c = buffer[start];
        int type;
        // Ignore the state, since there is only one.
        if (Character.isWhitespace(c)) {
            // A maximal run of whitespace is a single token.
            type = WHITESPACE;
            while (++start < end) {
                if (!Character.isWhitespace(buffer[start])) {
                    break;
                }
            }
        }
        else if (Character.isLetter(c)) {
            // A word starts with a letter and continues with letters, digits, '-', '\'' and '_'.
            type = WORD;
            while (++start < end) {
                c = buffer[start];
                if (Character.isLetter(c) || Character.isDigit(c)) {
                    continue;
                }
                if (c == '-' || c == '\'' || c == '_') {
                    continue;
                }
                break;
            }
        }
        else if (Character.isDigit(c)) {
            // A number starts with a digit and continues with digits and '.'.
            type = NUMBER;
            while (++start < end) {
                c = buffer[start];
                if (!Character.isDigit(c) && c != '.') {
                    break;
                }
            }
        }
        else if (c >= '!' && c <= '~') {
            // Printable ASCII that is neither letter nor digit. Both bounds must hold: with '||' the test is
            // always true and the UNRECOGNIZED branch below can never be reached.
            type = PUNCTUATION;
            start++;
        }
        else {
            // Anything else (e.g. non-whitespace control characters, non-ASCII symbols) is a one-character token.
            type = UNRECOGNIZED;
            start++;
        }

        return type;
    }

    /**
     * The current buffer of text being scanned.
     */
    protected char[] buffer;

    /**
     * The current offset within the buffer, at which to scan the next token.
     */
    protected int start;

    /**
     * The end offset in the buffer.
     */
    protected int end;

    /**
     * The current scanner state, as a representative token type.
     */
    protected int state = WHITESPACE;

    // The array of tokens forms a gap buffer. The total length of the text is
    // tracked, and tokens after the gap have (negative) positions relative to
    // the end of the text. While scanning, the gap represents the area to be
    // scanned, no tokens after the gap can be taken as valid, and in
    // particular the end-of-text sentinel token is after the gap.

    private Token[] tokens;

    // Index of the first free slot of the gap; tokens[0..gap) are before the gap.
    private int gap;

    // Index of the first token after the gap; tokens[endgap..length) are after the gap.
    private int endgap;

    // Total length of the text as reported through change().
    private int textLength;

    // True between a change() and the scan() call that completes the rescan.
    private boolean scanning;

    // Text position at which the next scan() segment is expected to start.
    private int position;

    // When true, token names (except STRING tokens) are lower-cased before symbol lookup.
    private boolean isCaseInsensitive;

    /**
     * The symbol table can be accessed by initSymbolTable or lookup, if they are overridden.
     * Symbols are inserted with {@code symbolTable.put(sym, sym)} and extracted with
     * {@code symbolTable.get(sym)}.
     */
    protected HashMap<Symbol, Symbol> symbolTable;

    /**
     * Create a new Scanner representing an empty text document. For non-incremental scanning, use change() to report
     * the document size, then pass the entire text to the scan() method in one go, or if coming from an input stream, a
     * bufferful at a time.
     * <p>
     * Note that {@link #initSymbolTable()} is invoked from this constructor, i.e. before any subclass constructor body
     * has run; implementations must not rely on subclass state that is initialised later.
     * </p>
     */
    Scanner() {
        tokens = new Token[1];
        gap = 0;
        endgap = 0;
        textLength = 0;
        symbolTable = new HashMap<>();
        initSymbolTable();
        // The token array always contains an end-of-text sentinel, stored after the gap.
        Symbol endOfText = new Symbol(WHITESPACE, "");
        tokens[0] = new Token(endOfText, 0);
        scanning = false;
        position = 0;
    }

    /**
     * @return true if token names are lower-cased before they are looked up in the symbol table
     */
    public boolean isCaseInsensitive() {
        return isCaseInsensitive;
    }

    /**
     * @param isCaseInsensitive
     *            true to lower-case token names (except those of STRING tokens) before symbol lookup
     */
    public void setCaseInsensitive(boolean isCaseInsensitive) {
        this.isCaseInsensitive = isCaseInsensitive;
    }

    // Move the gap to a new index within the tokens array. When preparing to
    // pass a token back to a caller, this is used to ensure that the token's
    // position is relative to the start of the text and not the end.

    private void moveGap(int newgap) {
        if (scanning) {
            throw new IllegalStateException("moveGap called while scanning");
        }
        if (newgap < 0 || newgap > gap + tokens.length - endgap) {
            throw new IllegalArgumentException("bad argument to moveGap");
        }
        if (gap < newgap) {
            // Pull tokens from after the gap to before it, making their positions start-relative.
            while (gap < newgap) {
                tokens[endgap].position += textLength;
                tokens[gap++] = tokens[endgap++];
            }
        }
        else if (gap > newgap) {
            // Push tokens from before the gap to after it, making their positions end-relative.
            while (gap > newgap) {
                tokens[--endgap] = tokens[--gap];
                tokens[endgap].position -= textLength;
            }
        }
    }

    /**
     * Find the number of available valid tokens, not counting tokens in or after any area yet to be rescanned.
     * 
     * @return the number of tokens before the gap while scanning, otherwise the total number of stored tokens
     */
    public int size() {
        if (scanning) {
            return gap;
        }
        else {
            return gap + tokens.length - endgap;
        }
    }

    /**
     * Find the n'th token, or null if it is not currently valid.
     * 
     * @param n
     *            the token index
     * @return the token, or null if {@code n} is negative or not less than {@link #size()}
     */
    public Token getToken(int n) {
        // Bounding by size() covers both the "still to be rescanned" case and an index past the last token;
        // the latter would otherwise surface as an exception from moveGap instead of the documented null.
        if (n < 0 || n >= size()) {
            return null;
        }
        if (n >= gap) {
            // Only reachable when not scanning: bring the token before the gap so its position is start-relative.
            moveGap(n + 1);
        }
        return tokens[n];
    }

    /**
     * Find the index of the valid token starting before, but nearest to, text position p. This uses an O(log(n)) binary
     * chop search.
     * <p>
     * NOTE(review): while a rescan is pending and no token before the gap is valid (gap == 0), this method indexes the
     * token array at -1 and throws ArrayIndexOutOfBoundsException; callers should check {@link #size()} first.
     * </p>
     * 
     * @param p
     *            the text position
     * @return the index of the token found
     */
    public int find(int p) {
        int startPos = 0;
        int endPos;
        int mid;
        int midpos;
        if (!scanning) {
            // Move every token before the gap so that all positions are start-relative.
            moveGap(gap + tokens.length - endgap);
        }
        endPos = gap - 1;
        if (p > tokens[endPos].position) {
            return endPos;
        }
        while (endPos > startPos + 1) {
            mid = (startPos + endPos) >>> 1;
            midpos = tokens[mid].position;
            if (p > midpos) {
                startPos = mid;
            }
            else {
                endPos = mid;
            }
        }
        return startPos;
    }

    /**
     * Report the position of an edit, the length of the text being replaced, and the length of the replacement text, to
     * prepare for rescanning. The call returns the index of the token at which rescanning will start.
     * 
     * @param start
     *            the text position of the edit
     * @param len
     *            the length of the text being replaced
     * @param newLen
     *            the length of the replacement text
     * @return the index of the token at which rescanning will start
     * @throws IllegalArgumentException
     *             if an argument is negative or the replaced range extends beyond the current text length
     */
    public int change(int start, int len, int newLen) {
        // "start > textLength - len" is "start + len > textLength" without the risk of int overflow.
        if (start < 0 || len < 0 || newLen < 0 || start > textLength - len) {
            throw new IllegalArgumentException("change(" + start + "," + len + "," + newLen + ")");
        }
        textLength += newLen - len;
        int endPos = start + newLen;
        if (scanning) {
            // A rescan is already pending: just discard already-scanned tokens that start after the edit.
            while (gap > 0 && tokens[gap - 1].position > start) {
                gap--;
            }
        }
        else {
            if (endgap == tokens.length) {
                // Make sure the end-of-text sentinel is after the gap.
                moveGap(gap - 1);
            }
            scanning = true;
            // Position the gap at the edit: tokens starting before it go before the gap ...
            while (tokens[endgap].position + textLength < start) {
                tokens[endgap].position += textLength;
                tokens[gap++] = tokens[endgap++];
            }
            // ... and tokens starting after it go after the gap.
            while (gap > 0 && tokens[gap - 1].position > start) {
                tokens[--endgap] = tokens[--gap];
                tokens[endgap].position -= textLength;
            }
        }
        return restartBeforeGap(endPos);
    }

    // Shared tail of change(): back the gap up over up to two tokens preceding the edit (presumably because
    // the edit may alter how the preceding text is tokenised - confirm), set the restart position and state from
    // the token at the new gap index, and skip old tokens after the gap that start before the end of the edit.

    private int restartBeforeGap(int endPos) {
        if (gap > 0) {
            gap--;
        }
        if (gap > 0) {
            gap--;
            position = tokens[gap].position;
            state = tokens[gap].symbol.type;
        }
        else {
            position = 0;
            state = WHITESPACE;
        }
        while (tokens[endgap].position + textLength < endPos) {
            endgap++;
        }
        return gap;
    }

    /**
     * Find out at what text position any remaining scanning work should start, or -1 if scanning is complete.
     * 
     * @return the text position at which to continue scanning, or -1 if no rescan is pending
     */
    public int position() {
        if (!scanning) {
            return -1;
        }
        else {
            return position;
        }
    }

    /**
     * Create the initial symbol table. Implementations can enter keywords, for example. This method is called from the
     * constructor, after {@link #symbolTable} has been created.
     */
    protected abstract void initSymbolTable();

    // Reuse this symbol object to create each new symbol, then look it up in
    // the symbol table, to replace it by a shared version to minimize space.

    private final Symbol symbol = new Symbol(0, null);

    /**
     * Lookup a symbol in the symbol table. This can be overridden to implement keyword detection, for example. The
     * default implementation just uses the table to ensure that there is only one shared occurrence of each symbol.
     * 
     * @param type
     *            the token type
     * @param name
     *            the token text
     * @return the shared symbol for the given type and name, newly entered into the table if it was not present
     */
    protected Symbol lookup(int type, String name) {
        symbol.type = type;
        symbol.name = name;
        Symbol sym = symbolTable.get(symbol);
        if (sym != null) {
            return sym;
        }
        sym = new Symbol(type, name);
        symbolTable.put(sym, sym);
        return sym;
    }

    /**
     * Scan or rescan a given read-only segment of text. The segment is assumed to represent a portion of the document
     * starting at position(). Return the number of tokens added by this call. If the result is 0, the call should be
     * retried with a longer segment.
     * <p>
     * NOTE(review): the check that stopped before a possibly incomplete token at the end of a segment which does not
     * reach the end of the document ({@code if (start >= end && !all) break;}) is disabled in this version, so such a
     * token is recorded as scanned. TODO confirm that this is intended.
     * </p>
     * 
     * @param array
     *            the character array holding the segment
     * @param offset
     *            the index of the first character of the segment within {@code array}
     * @param length
     *            the number of characters in the segment
     * @return the number of tokens added before the gap by this call
     * @throws IllegalStateException
     *             if no rescan is pending, i.e. change() has not been called since scanning last completed
     * @throws IllegalArgumentException
     *             if the segment would extend beyond the end of the text
     */
    public int scan(char[] array, int offset, int length) {
        if (!scanning) {
            throw new IllegalStateException("scan called when not scanning");
        }
        if (position + length > textLength) {
            throw new IllegalArgumentException("scan too much");
        }
        int startGap = gap;

        buffer = array;
        start = offset;
        end = start + length;
        while (start < end) {
            int tokenStart = start;
            int type = read();

            // Whitespace is not stored as a token; it only advances the position.
            if (type != WHITESPACE) {
                try {
                    LOGGER.debug("start: {}, tokenStart: {}", start, tokenStart);
                    String name = new String(buffer, tokenStart, start - tokenStart);
                    LOGGER.debug("name: '{}', type: {}", name, type);

                    if (isCaseInsensitive() && type != STRING) {
                        // Locale.ROOT keeps the folding independent of the default locale (e.g. Turkish 'I').
                        name = name.toLowerCase(Locale.ROOT);
                    }
                    Symbol sym = lookup(type, name);
                    Token t = new Token(sym, position);
                    if (gap >= endgap) {
                        // The gap is full: grow the array to make room for one more token.
                        checkCapacity(gap + tokens.length - endgap + 1);
                    }
                    tokens[gap++] = t;
                }
                catch (StringIndexOutOfBoundsException ex) {
                    // Raised by the String constructor if read() left start outside the buffer; the token is dropped.
                    LOGGER.warn("uupppssss.", ex);
                }

            }

            // Try to synchronise

            // Skip old tokens after the gap that start before the token just read.
            while (tokens[endgap].position + textLength < position) {
                endgap++;
            }
            if (position + start - tokenStart == textLength) {
                // Reached the end of the text.
                scanning = false;
            }
            else if (gap > 0 && tokens[endgap].position + textLength == position && tokens[endgap].symbol.type == type) {
                // The token just read coincides in position and type with an old token after the gap: drop the old
                // one and treat the remaining old tokens as valid again.
                endgap++;
                scanning = false;
                break;
            }
            position += start - tokenStart;
        }
        checkCapacity(gap + tokens.length - endgap);
        return gap - startGap;
    }

    // Change the size of the gap buffer, doubling it if it fills up, and
    // halving if it becomes less than a quarter full.

    private void checkCapacity(int capacity) {
        int oldCapacity = tokens.length;
        if (capacity <= oldCapacity && 4 * capacity >= oldCapacity) {
            return;
        }
        Token[] oldTokens = tokens;
        int newCapacity;
        if (capacity > oldCapacity) {
            newCapacity = oldCapacity * 2;
            if (newCapacity < capacity) {
                newCapacity = capacity;
            }
        }
        else {
            newCapacity = capacity * 2;
        }

        // Copy the tokens before the gap to the front and the tokens after the gap to the back of the new array.
        tokens = new Token[newCapacity];
        System.arraycopy(oldTokens, 0, tokens, 0, gap);
        int n = oldCapacity - endgap;
        System.arraycopy(oldTokens, endgap, tokens, newCapacity - n, n);
        endgap = newCapacity - n;
    }

    /**
     * Log, at debug level, the index and the stored start and end position of every token outside the gap. The marker
     * "... " is emitted in front of the first token after the gap.
     */
    void print() {
        if (!LOGGER.isDebugEnabled()) {
            // Avoid building the dump when it would be discarded anyway.
            return;
        }
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < tokens.length; i++) {
            if (i >= gap && i < endgap) {
                continue;
            }
            if (i == endgap) {
                sb.append("... ");
            }

            sb.append(i).append(":").append(tokens[i].position);
            sb.append("-").append(tokens[i].position + tokens[i].symbol.name.length());
            sb.append(" ");
        }
        LOGGER.debug(sb.toString());
    }
}