All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.bidib.wizard.common.highlight.Scanner Maven / Gradle / Ivy

package org.bidib.wizard.common.highlight;

import java.util.HashMap;
import java.util.Locale;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Public domain, no restrictions, Ian Holyer, University of Bristol.

/**
 * 

* A Scanner object provides a lexical analyser and a resulting token array. Incremental rescanning is supported, e.g. * for use in a token colouring editor. This is a base class dealing with plain text, which can be extended to support * other languages. * *

* The actual text is assumed to be held elsewhere, e.g. in a document. The change() method is called to * report the position and length of a change in the text, and the scan() method is called to perform * scanning or rescanning. For example, to scan an entire document held in a character array text in one * go: * *

* *
 * scanner.change(0, 0, text.length);
 * scanner.scan(text, 0, text.length);
 * 
* *
* *

* For incremental scanning, the position() method is used to find the text position at which rescanning * should start. For example, a syntax highlighter might contain this code: * *

* *
 * // Where to start rehighlighting, and a segment object
 * int firstRehighlightToken;
 * Segment segment;
 * 
 * ...
 * 
 * // Whenever the text changes, e.g. on an insert or remove or read.
 * firstRehighlightToken = scanner.change(offset, oldLength, newLength);
 * repaint();
 * 
 * ...
 * 
 * // in repaintComponent
 * int offset = scanner.position();
 * if (offset < 0) return;
 * int tokensToRedo = 0;
 * int amount = 100;
 * while (tokensToRedo == 0 && offset >= 0)
 * {
 *    int length = doc.getLength() - offset;
 *    if (length > amount) length = amount;
 *    try { doc.getText(offset, length, text); }
 *    catch (BadLocationException e) { return; }
 *    tokensToRedo = scanner.scan(text.array, text.offset, text.count);
 *    offset = scanner.position();
 *    amount = 2*amount;
 * }
 * for (int i = 0; i < tokensToRedo; i++)
 * {
 *    Token t = scanner.getToken(firstRehighlightToken + i);
 *    int length = t.symbol.name.length();
 *    int type = t.symbol.type;
 *    doc.setCharacterAttributes (t.position, length, styles[type], false);
 * }
 * firstRehighlightToken += tokensToRedo;
 * if (offset >= 0) repaint(2);
 * 
* *
* *

* Note that change can be called at any time, even between calls to scan. Only small number * of characters are passed to scan so that only a small burst of scanning is done, to prevent the * program's user interface from freezing. */ public abstract class Scanner implements TokenTypes { private static final Logger LOGGER = LoggerFactory.getLogger(Scanner.class); /** *

* Read one token from the start of the current text buffer, given the start offset, end offset, and current scanner * state. The method moves the start offset past the token, updates the scanner state, and returns the type of the * token just scanned. * *

* The scanner state is a representative token type. It is either the state left after the last call to read, or the * type of the old token at the same position if rescanning, or WHITESPACE if at the start of a document. The method * succeeds in all cases, returning whitespace or comment or error tokens where necessary. Each line of a multi-line * comment is treated as a separate token, to improve incremental rescanning. If the buffer does not extend to the * end of the document, the last token returned for the buffer may be incomplete and the caller must rescan it. The * read method can be overridden to implement different languages. The default version splits plain text into words, * numbers and punctuation. */ protected int read() { char c = buffer[start]; int type; // Ignore the state, since there is only one. if (Character.isWhitespace(c)) { type = WHITESPACE; while (++start < end) { if (!Character.isWhitespace(buffer[start])) { break; } } } else if (Character.isLetter(c)) { type = WORD; while (++start < end) { c = buffer[start]; if (Character.isLetter(c) || Character.isDigit(c)) { continue; } if (c == '-' || c == '\'' || c == '_') { continue; } break; } } else if (Character.isDigit(c)) { type = NUMBER; while (++start < end) { c = buffer[start]; if (!Character.isDigit(c) && c != '.') { break; } } } else if (c >= '!' || c <= '~') { type = PUNCTUATION; start++; } else { type = UNRECOGNIZED; start++; } // state = WHITESPACE; return type; } /** * The current buffer of text being scanned. */ protected char[] buffer; /** * The current offset within the buffer, at which to scan the next token. */ protected int start; /** * The end offset in the buffer. */ protected int end; /** * The current scanner state, as a representative token type. */ protected int state = WHITESPACE; // The array of tokens forms a gap buffer. The total length of the text is // tracked, and tokens after the gap have (negative) positions relative to // the end of the text. 
While scanning, the gap represents the area to be // scanned, no tokens after the gap can be taken as valid, and in // particular the end-of-text sentinel token is after the gap. private Token[] tokens; private int gap; private int endgap; private int textLength; private boolean scanning; private int position; private boolean isCaseInsensitive; /** * The symbol table can be accessed by initSymbolTable or lookup, if they are overridden. * Symbols are inserted with symbolTable.put(sym,sym) and extracted with * symbolTable.get(sym). */ protected HashMap symbolTable; /** * Create a new Scanner representing an empty text document. For non-incremental scanning, use change() to report * the document size, then pass the entire text to the scan() method in one go, or if coming from an input stream, a * bufferful at a time. */ Scanner() { tokens = new Token[1]; gap = 0; endgap = 0; textLength = 0; symbolTable = new HashMap(); initSymbolTable(); Symbol endOfText = new Symbol(WHITESPACE, ""); tokens[0] = new Token(endOfText, 0); scanning = false; position = 0; } public boolean isCaseInsensitive() { return isCaseInsensitive; } public void setCaseInsensitive(boolean isCaseInsensitive) { this.isCaseInsensitive = isCaseInsensitive; } // Move the gap to a new index within the tokens array. When preparing to // pass a token back to a caller, this is used to ensure that the token's // position is relative to the start of the text and not the end. 
private void moveGap(int newgap) { if (scanning) { throw new RuntimeException("moveGap called while scanning"); } if (newgap < 0 || newgap > gap + tokens.length - endgap) { throw new RuntimeException("bad argument to moveGap"); } if (gap < newgap) { while (gap < newgap) { tokens[endgap].position += textLength; tokens[gap++] = tokens[endgap++]; } } else if (gap > newgap) { while (gap > newgap) { tokens[--endgap] = tokens[--gap]; tokens[endgap].position -= textLength; } } } /** * Find the number of available valid tokens, not counting tokens in or after any area yet to be rescanned. */ public int size() { if (scanning) { return gap; } else { return gap + tokens.length - endgap; } } /** * Find the n'th token, or null if it is not currently valid. */ public Token getToken(int n) { if (n < 0 || n >= gap && scanning) { return null; } if (n >= gap) { moveGap(n + 1); } return tokens[n]; } /** * Find the index of the valid token starting before, but nearest to, text position p. This uses an O(log(n)) binary * chop search. */ public int find(int p) { int startPos = 0; int endPos; int mid; int midpos; if (!scanning) { moveGap(gap + tokens.length - endgap); } endPos = gap - 1; if (p > tokens[endPos].position) { return endPos; } while (endPos > startPos + 1) { mid = (startPos + endPos) >>> 1; midpos = tokens[mid].position; if (p > midpos) { startPos = mid; } else { endPos = mid; } } return startPos; } /** * Report the position of an edit, the length of the text being replaced, and the length of the replacement text, to * prepare for rescanning. The call returns the index of the token at which rescanning will start. 
*/ public int change(int start, int len, int newLen) { if (start < 0 || len < 0 || newLen < 0 || start + len > textLength) { throw new RuntimeException("change(" + start + "," + len + "," + newLen + ")"); } textLength += newLen - len; int endPos = start + newLen; if (scanning) { while (gap > 0 && tokens[gap - 1].position > start) { gap--; } if (gap > 0) { gap--; } if (gap > 0) { gap--; position = tokens[gap].position; state = tokens[gap].symbol.type; } else { position = 0; state = WHITESPACE; } while (tokens[endgap].position + textLength < endPos) { endgap++; } return gap; } if (endgap == tokens.length) { moveGap(gap - 1); } scanning = true; while (tokens[endgap].position + textLength < start) { tokens[endgap].position += textLength; tokens[gap++] = tokens[endgap++]; } while (gap > 0 && tokens[gap - 1].position > start) { tokens[--endgap] = tokens[--gap]; tokens[endgap].position -= textLength; } if (gap > 0) { gap--; } if (gap > 0) { gap--; position = tokens[gap].position; state = tokens[gap].symbol.type; } else { position = 0; state = WHITESPACE; } while (tokens[endgap].position + textLength < endPos) { endgap++; } return gap; } /** * Find out at what text position any remaining scanning work should start, or -1 if scanning is complete. */ public int position() { if (!scanning) { return -1; } else { return position; } } /** * Create the initial symbol table. This can be overridden to enter keywords, for example. The default * implementation does nothing. */ protected abstract void initSymbolTable(); // Reuse this symbol object to create each new symbol, then look it up in // the symbol table, to replace it by a shared version to minimize space. private final Symbol symbol = new Symbol(0, null); /** * Lookup a symbol in the symbol table. This can be overridden to implement keyword detection, for example. The * default implementation just uses the table to ensure that there is only one shared occurrence of each symbol. 
*/ protected Symbol lookup(int type, String name) { symbol.type = type; symbol.name = name; Symbol sym = symbolTable.get(symbol); if (sym != null) { return sym; } sym = new Symbol(type, name); symbolTable.put(sym, sym); return sym; } /** * Scan or rescan a given read-only segment of text. The segment is assumed to represent a portion of the document * starting at position(). Return the number of tokens successfully scanned, excluding any partial * token at the end of the text segment but not at the end of the document. If the result is 0, the call should be * retried with a longer segment. */ public int scan(char[] array, int offset, int length) { if (!scanning) { throw new RuntimeException("scan called when not scanning"); } if (position + length > textLength) { throw new RuntimeException("scan too much"); } boolean all = position + length == textLength; end = start + length; int startGap = gap; buffer = array; start = offset; end = start + length; while (start < end) { int tokenStart = start; int type = read(); // TODO why ? 
// if (start >= end && !all) { // break; // } if (type != WHITESPACE) { try { LOGGER.debug("start: {}, tokenStart: {}", start, tokenStart); String name = new String(buffer, tokenStart, start - tokenStart); LOGGER.debug("name: '{}', type: {}", name, type); if (isCaseInsensitive() && type != STRING) { name = name.toLowerCase(); } Symbol sym = lookup(type, name); Token t = new Token(sym, position); if (gap >= endgap) { checkCapacity(gap + tokens.length - endgap + 1); } tokens[gap++] = t; } catch (StringIndexOutOfBoundsException ex) { LOGGER.warn("uupppssss.", ex); } } // Try to synchronise while (tokens[endgap].position + textLength < position) { endgap++; } if (position + start - tokenStart == textLength) { scanning = false; } else if (gap > 0 && tokens[endgap].position + textLength == position && tokens[endgap].symbol.type == type) { endgap++; scanning = false; break; } position += start - tokenStart; } checkCapacity(gap + tokens.length - endgap); return gap - startGap; } // Change the size of the gap buffer, doubling it if it fills up, and // halving if it becomes less than a quarter full. private void checkCapacity(int capacity) { int oldCapacity = tokens.length; if (capacity <= oldCapacity && 4 * capacity >= oldCapacity) { return; } Token[] oldTokens = tokens; int newCapacity; if (capacity > oldCapacity) { newCapacity = oldCapacity * 2; if (newCapacity < capacity) { newCapacity = capacity; } } else { newCapacity = capacity * 2; } tokens = new Token[newCapacity]; System.arraycopy(oldTokens, 0, tokens, 0, gap); int n = oldCapacity - endgap; System.arraycopy(oldTokens, endgap, tokens, newCapacity - n, n); endgap = newCapacity - n; } void print() { StringBuilder sb = new StringBuilder(); for (int i = 0; i < tokens.length; i++) { if (i >= gap && i < endgap) { continue; } if (i == endgap) { sb.append("... 
"); } sb.append(i).append(":").append(tokens[i].position); sb.append("-").append(tokens[i].position + tokens[i].symbol.name.length()); sb.append(" "); } LOGGER.debug(sb.toString()); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy