astra.ast.core.ADTTokenizer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of astra-compiler Show documentation
Core compiler artifact for the ASTRA Language
There is a newer version: 1.4.3
package astra.ast.core;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Stack;

public class ADTTokenizer {
	private int tabSpace = 4;
	private StringBuffer tokenBuffer = new StringBuffer();
	private StringBuffer contentBuffer = new StringBuffer();
	private List lines = new ArrayList<>();
	private List tokens = new LinkedList<>();
	private InputStream in;
	private int line = 1;
	private int column = 0;


	private int beginLine = 1;
	private int beginColumn = 0;
	private int count = 0;
	private Token lastToken;
	private Stack back = new Stack();
	
	private static List speechActParts = new LinkedList();
	static {
		speechActParts.add("accept");
		speechActParts.add("inform");
		speechActParts.add("not");
		speechActParts.add("query");
		speechActParts.add("reject");
		speechActParts.add("request");
	}

	public ADTTokenizer(InputStream in) {
		this.in = in;
		lines.add("IGNORE");
		lines.add("");
	}

	public void setTabSpace(int tabSpace) {
		this.tabSpace = tabSpace;
	}
	
	public Token nextToken() throws ParseException {
		return lastToken = generateNextToken();
	}
	
	private Token generateNextToken() throws ParseException {
		// remove any tokens in the token buffer first before parsing more tokens
		if (!tokens.isEmpty()) {
			return tokens.remove(0);
		}
		
		// find the next token(s)
		Character ch;
		boolean quit = false;
		while (!quit && (ch = readCharacter()) != null) {
			switch (ch) {
			case ' ':
				endOfToken();
				column++;
				beginLine = line;
				beginColumn = column;
				quit = !tokens.isEmpty();
				break;
			case '\t':
				endOfToken();
				column+=tabSpace;
				beginLine = line;
				beginColumn = column;
				quit = !tokens.isEmpty();
				break;
			case '/':
				column++;
				StringBuffer buf = new StringBuffer();
				buf.append((char)ch);
				char ch2 = readCharacter();
				buf.append(ch2);
				column++;
				switch (ch2) {
				case '/':
					// Single Line Comment
					while ((ch2 = readCharacter()) != '\n') {
						buf.append(ch2);
						column++;
					}
					line++;
					column=0;
					break;
				case '*':
					// Multi Line Comment
					boolean inComment = true;
					while (inComment) {
						do {
							Character character = readCharacter();
							if (character == null) throw new ParseException("Multiline comment not terminated correctly", beginLine, beginColumn, count);
							ch2 = character;

							buf.append(ch2);
							if (ch2== '\n') {
								column=0;
								line++;
							} else {
								column++;
							}
						} while (ch2 != '*');

						ch2 = readCharacter();
						buf.append(ch2);
						switch (ch2) {
						case '/':
							inComment = false;
							column++;
							beginLine = line;
							beginColumn = column;
							break;
						case '\r':
							break;
						case '\n':
							column=0;
							line++;
							break;
						default:
							column++;
						}
					}
					break;
				default:
					// Treat this a a divide (/)
					column -= 2;
					endOfToken();
					column++;
					addCharacterToken((char) ch);
					beginLine = line;
					beginColumn = column;
					column++;
					tokenBuffer.append(ch2);
					quit = !tokens.isEmpty();
					break;
				}
				break;
			case '\"':
				if (tokenBuffer.length() > 0) throw new ParseException("Unexpected start of string", line, column, 1);
			case '\'':
				if (tokenBuffer.length() > 0) throw new ParseException("Unexpected start of character", line, column, 1);
				tokenBuffer.append((char) ch);
				column++;
				do {
					Character character = readCharacter();
					if (character == null) throw new ParseException("Unterminated String: " + tokenBuffer.toString(), line, column, tokenBuffer.length());
					ch2 = character;
					tokenBuffer.append(ch2);
					column++;

					if (ch2 == '\\') {
						char ch3 = readCharacter();
						tokenBuffer.append(ch3);
						column++;
					}
				}
				while (ch2 != ch);
				break;
			case '\n':
//				System.out.println("new line: " + line);
				column = 0;
				line++;
				beginLine = line;
				beginColumn = column;
				break;
			case '-':
				if (speechActParts.contains(tokenBuffer.toString())) {
					column++;
					tokenBuffer.append((char) ch);
					break;
				}
			case '+':
			case '%':
			case '*':
			case '.':
			case ',':
			case '!':
			case '[':
			case ']':
			case ':':
			case '=':
			case '(':
			case ')':
			case '{':
			case '}':
			case ';':
			case '~':
			case '&':
			case '|':
			case '>':
			case '<':
			case '$':
				endOfToken();
				beginLine = line;
				beginColumn = column;
				column++;
				addCharacterToken((char) ch);
				beginLine = line;
				beginColumn = column;
				quit = !tokens.isEmpty();
				break;
			case '0':
			case '1':
			case '2':
			case '3':
			case '4':
			case '5':
			case '6':
			case '7':
			case '8':
			case '9':
				column++;
				tokenBuffer.append((char) ch);
				if (tokenBuffer.length() == 1) {
					try {
						if (in.available() > 0) {
							ch2 = readCharacter();
							while (ch2 >= '0' && ch2 <='9' && in.available() > 0) {
								tokenBuffer.append(ch2);
								column++;
								ch2 = readCharacter();
							}
							if (ch2 >= '0' && ch2 <= '9') {
								tokenBuffer.append(ch2);
								column++;
							} else  if (ch2 == '.') {
								tokenBuffer.append(ch2);
								column++;
								ch2 = readCharacter();
								while (ch2 >= '0' && ch2 <='9' && in.available() > 0) {
									tokenBuffer.append(ch2);
									column++;
									ch2 = readCharacter();
								}
								if (ch2 >= '0' && ch2 <= '9') {
									tokenBuffer.append(ch2);
									column++;
									endOfToken();
								} else if (ch2 == 'f') {
									tokenBuffer.append(ch2);
									endOfToken();
								} else {
									column -= 1;
									endOfToken();
									back(ch2);
								}
							} else if (ch2 == 'l') {
								tokenBuffer.append(ch2);
								endOfToken();
							} else {
								column -= 1;
								endOfToken();
								back(ch2);
								// Line below commented out because it does not push ch2 back onto the stack
								// for processing in the next round if it is the last character...
								// Not sure why this was here previously, hopefully testing will clarify...
								// if (in.available() > 0) back(ch2);
							}
						}
					} catch (IOException e) {
						e.printStackTrace();
					}
				}
				break;
			default:
				column++;
				tokenBuffer.append((char) ch);
			}
			
			try {
				quit = tokens.isEmpty() && in.available() == 0;
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
		
		if (tokenBuffer.length() > 0) {
			endOfToken();
		}
		if (tokens.isEmpty()) return Token.EOF_TOKEN;
//		throw new ParseException("Unexected end of file", line, 0, 0);
//		System.out.println("generated: " + tokens.get(0).token);
		return tokens.remove(0);
	}
	
	private void back(Character ch) {
		back.push(ch);
		count--;
	}
	
	private Character readCharacter() {
		try {
			int ch = -1;
			if (!back.isEmpty()) {
				ch = (int) back.pop().charValue();
			} else {
				ch = in.read();
			}
			count++;
			if (ch > -1) {
				contentBuffer.append((char) ch);
				lines.set(lines.size()-1,contentBuffer.toString());
				if (((char) ch) == '\n') {
					contentBuffer = new StringBuffer();
					lines.add(contentBuffer.toString());
				}
				return (char) ch;
			}
		} catch (IOException e) {
			// This should not happen
		}
		return null;
	}

	int startOfToken = 0;
	
	private void endOfToken() throws ParseException {
		String tok = tokenBuffer.toString().trim();
		if (!tok.isEmpty()) tokens.add(new Token(tok, beginLine, beginColumn, line, column, startOfToken, count));
		tokenBuffer = new StringBuffer();
		startOfToken = count;
	}

	private void addCharacterToken(char ch) throws ParseException {
		tokens.add(new Token("" + ch, beginLine, beginColumn, line, column, count-1, count));
		tokenBuffer = new StringBuffer();
	}

	public String getContents() {
		return contentBuffer.toString();
	}

	public String getSource(Token tok, Token tok2) {
//		System.out.println("'" + tok.token + "' / type: " + tok.type + " (" + tok.beginColumn + ", " + tok.endColumn + ")");
//		System.out.println("'" + tok2.token + "' / type: " + tok2.type + " (" + tok2.beginColumn + ", " + tok2.endColumn + ")");
//		System.out.println(tok.token + " -> " + tok2.token);
		String out = "";
		for (int i=tok.beginLine; i <= tok2.endLine; i++) {
			int start = 0;
			int end = lines.get(i).length();
			if (i == tok.beginLine && tok.beginColumn <= end) {
				start = tok.beginColumn;
			}
			if (i == tok2.endLine) {
				if (tok2.endColumn < end) end = tok2.endColumn;
				
			}
//			System.out.println("start: " + start + " / end: " + end);
			out += lines.get(i).substring(start, end);
		}
		return out;
	}

	public Token getLastToken() {
		return lastToken;
	}

	public void back(Token token) {
		tokens.add(0, token);
	}
	public Token peek() throws ParseException {
		Token tok = nextToken();
		back(tok);
		return tok;
	}
}