
// com.articulate.sigma.semRewrite.Lexer Maven / Gradle / Ivy
package com.articulate.sigma.semRewrite;
/*
Author: Adam Pease [email protected]
Stephan Schulz
A simple lexical analyser that converts a string into a sequence of
tokens. Java's StreamTokenizer can't be used since it only can
"push back" one token.
This will convert a string into a sequence of
tokens that can be inspected and processed in-order. It is a bit
of an overkill for a simple application, but makes actual
parsing later much easier and more robust than a quicker hack.
Initialize the Lexer with a String or a filename then
iterate through the tokens with next() or testTok() to check
for expected token types and error if it's not an expected type.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program ; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston,
MA 02111-1307 USA
*/
import java.io.*;
import java.util.*;
import java.util.regex.*;
import java.text.*;
import com.articulate.sigma.KBmanager;
/** ***************************************************************
 * A line-oriented lexical analyser.  Input is read one line at a
 * time from a LineNumberReader; at the current position every
 * pattern in tokenDefs is tried in insertion order and the first
 * one that matches at that position wins.  Tokens that were
 * inspected with look() are kept on tokenStack and handed back by
 * the following next().
 *
 * The fields type and literal always describe the token most
 * recently produced by the pattern matcher (they are not touched
 * when a token is merely popped back off tokenStack).
 */
public class Lexer {

    // Token type names.  These strings are the keys of tokenDefs and
    // the values stored in the "type" field.
    public static final String NoToken = "No Token";
    public static final String WhiteSpace = "White Space";
    public static final String Newline = "Newline";
    public static final String SemiComment = "SemiComment";
    public static final String Directive = "Directive";
    public static final String Ident = "Identifier";
    public static final String Number = "Positive or negative Integer or real";
    public static final String QuotedString = "Quoted string";
    public static final String FullStop = ". (full stop)";
    public static final String OpenPar = "(";
    public static final String ClosePar = ")";
    public static final String OpenBracket = "{";
    public static final String CloseBracket = "}";
    public static final String Or = "|";
    public static final String Plus = "+";
    public static final String Comma = ",";
    public static final String Implies = "==>";
    public static final String OptImplies = "?=>";
    public static final String Clause = "/-";
    public static final String Var = "Variable";
    public static final String Negation = "-";
    public static final String Stop = "stop";
    public static final String Zero = "!";
    public static final String EOFToken = "*EOF*";

    public String filename = "";
    public String type = "";      // type of the most recently matched token
    public String literal = "";   // text of the most recently matched token
    public String line = null;    // the line currently being tokenized
    public String SZS = "";
    public int pos = 0; // character position on the current line
    public LineNumberReader input = null;

    /** Tokens that have been looked at but not yet consumed. */
    public ArrayDeque<String> tokenStack = new ArrayDeque<String>();

    /** This map contains all of the compiled Pattern objects that
     * will be used by methods in this file, keyed by token type.
     * Iteration (insertion) order is the matching priority. */
    public static LinkedHashMap<String,Pattern> tokenDefs = new LinkedHashMap<String,Pattern>();

    public static ArrayList<String> andOr = new ArrayList<String>();
    public static ArrayList<String> binaryRel = new ArrayList<String>();
    public static ArrayList<String> quant = new ArrayList<String>();

    /** Set once the default token table has been installed. */
    private static boolean initialized = false;

    /** ***************************************************************
     * Create a lexer with no input source.  The "input" field must be
     * assigned before tokens can be read.
     */
    public Lexer() {

        init();
    }

    /** ***************************************************************
     * Create a lexer that tokenizes the given string.
     */
    public Lexer(String s) {

        init();
        input = new LineNumberReader(new StringReader(s));
        filename = "";
    }

    /** ***************************************************************
     * Create a lexer that tokenizes the contents of a text file.
     * If the file cannot be opened the problem is reported on
     * standard output and "input" is left null.
     */
    public Lexer(File f) {

        init();
        try {
            input = new LineNumberReader(new FileReader(f));
        }
        catch (FileNotFoundException fnf) {
            System.out.println("Error in Lexer(): File not found: " + f);
            System.out.println(fnf.getMessage());
            fnf.printStackTrace();
        }
    }

    /** ***************************************************************
     * Read the contents of a text file into a String, decoding the
     * bytes with the platform default charset.
     * @throws RuntimeException wrapping the IOException if the file
     * cannot be opened or read
     */
    public String file2string(File f) {

        String result = null;
        DataInputStream in = null;
        try {
            byte[] buffer = new byte[(int) f.length()];
            in = new DataInputStream(new FileInputStream(f));
            in.readFully(buffer);
            result = new String(buffer);
        }
        catch (IOException e) {
            throw new RuntimeException("IO problem in fileToString", e);
        }
        finally {
            // "in" is still null when opening the file failed; closing it
            // unconditionally would replace the real error with an NPE.
            if (in != null) {
                try {
                    in.close();
                }
                catch (IOException e) { /* ignore it */
                }
            }
        }
        return result;
    }

    /** ***************************************************************
     * @return the line number reported by the underlying
     * LineNumberReader, or 0 if there is no input source.
     */
    public int linepos() {

        if (input == null)
            return 0;
        return input.getLineNumber();
    }

    /** ***************************************************************
     * Set up the regular expressions to recognize each token type.
     * Called from every constructor, so it must be idempotent: the
     * default patterns are compiled only once (or again if the table
     * has been emptied) and the shared lists never receive duplicates.
     *
     * Note that Ident matches any run of characters other than comma
     * and parentheses, so with this ordering the Stop pattern that
     * follows it is never reached.
     */
    private static void init() {

        if (!initialized || tokenDefs.isEmpty()) {
            tokenDefs.put(FullStop, Pattern.compile("\\."));
            tokenDefs.put(OpenPar, Pattern.compile("\\("));
            tokenDefs.put(ClosePar, Pattern.compile("\\)"));
            tokenDefs.put(OpenBracket, Pattern.compile("\\{"));
            tokenDefs.put(CloseBracket, Pattern.compile("\\}"));
            tokenDefs.put(Comma, Pattern.compile(","));
            tokenDefs.put(Or, Pattern.compile("\\|"));
            tokenDefs.put(Implies, Pattern.compile("==>"));
            tokenDefs.put(OptImplies, Pattern.compile("\\?=>"));
            tokenDefs.put(Clause, Pattern.compile("/-"));
            tokenDefs.put(Plus, Pattern.compile("\\+"));
            tokenDefs.put(Var, Pattern.compile("\\?[a-zA-Z][_a-z0-9_A-Z]*\\*?"));
            tokenDefs.put(Newline, Pattern.compile("\\n"));
            tokenDefs.put(WhiteSpace, Pattern.compile("\\s+"));
            //tokenDefs.put(Ident, Pattern.compile("\\\"?\\'?[0-9a-zA-Z]\\'?[_\\-a-z0-9_A-Z ]+\\*?\\\"?"));
            tokenDefs.put(Number, Pattern.compile("-?[0-9]?[0-9\\.]+(,\\d\\d\\d)*[^,()]+"));
            tokenDefs.put(Negation, Pattern.compile("-"));
            tokenDefs.put(Zero, Pattern.compile("\\!"));
            tokenDefs.put(SemiComment, Pattern.compile(";[^\\n]*"));
            tokenDefs.put(Directive, Pattern.compile("#[^\\n]*"));
            tokenDefs.put(QuotedString, Pattern.compile("'[^']*'"));
            tokenDefs.put(Ident, Pattern.compile("[^,()]+"));
            tokenDefs.put(Stop, Pattern.compile("stop"));
            initialized = true;
        }
        addOnce(andOr, Comma);
        addOnce(andOr, Or);
        addOnce(binaryRel, Implies);
        addOnce(binaryRel, OptImplies);
    }

    /** ***************************************************************
     * Append item to list unless it is already present.
     */
    private static void addOnce(ArrayList<String> list, String item) {

        if (!list.contains(item))
            list.add(item);
    }

    /** ***************************************************************
     * @return the next token type without consuming it.
     */
    public String lookType() throws ParseException {

        look();
        return type;
    }

    /** ***************************************************************
     * @return the next token without consuming it.  The token is
     * read with next() and pushed onto tokenStack so that the
     * following next() returns it again.
     */
    public String look() throws ParseException {

        String res = next();
        tokenStack.push(res);
        return res;
    }

    /** ***************************************************************
     * @return the literal value of the next token, i.e. the string
     * generating the token.
     */
    public String lookLit() throws ParseException {

        look();
        return literal;
    }

    /** ***************************************************************
     * Take a list of expected token types.
     * @return True if the next token is expected, False otherwise.
     */
    public boolean testTok(ArrayList<String> tokens) throws ParseException {

        look();
        for (int i = 0; i < tokens.size(); i++) {
            if (type.equals(tokens.get(i)))
                return true;
        }
        return false;
    }

    /** ***************************************************************
     * Convenience method for a single expected token type.
     * @return True if the next token has that type, False otherwise.
     */
    public boolean testTok(String tok) throws ParseException {

        ArrayList<String> tokens = new ArrayList<String>();
        tokens.add(tok);
        return testTok(tokens);
    }

    /** ***************************************************************
     * Take an expected token type.  If the next token is not of
     * that type, exit with an error.  Otherwise do nothing.
     * @throws ParseException if the next token has a different type
     */
    public void checkTok(String tok) throws ParseException {

        ArrayList<String> tokens = new ArrayList<String>();
        tokens.add(tok);
        checkTok(tokens);
    }

    /** ***************************************************************
     * Take a list of expected token types. If the next token is
     * not among the expected ones, exit with an error. Otherwise do
     * nothing.
     * @throws ParseException if the next token's type is not listed
     */
    public void checkTok(ArrayList<String> tokens) throws ParseException {

        if (testTok(tokens))
            return;
        throw new ParseException("Error in Lexer.checkTok(): Unexpected token '" + type + "'",linepos());
    }

    /** ***************************************************************
     * Take an expected token type. If the next token is
     * the same as the expected one, consume and return it. Otherwise, exit
     * with an error.
     * @return the token matching the type of the input
     * @throws ParseException if the next token has a different type
     */
    public String acceptTok(String token) throws ParseException {

        checkTok(token);
        return next();
    }

    /** ***************************************************************
     * Take a list of expected token types. If the next token is
     * among the expected ones, consume and return it. Otherwise, exit
     * with an error.
     * @return the token matching one of the types in the inputs
     * @throws ParseException if the next token's type is not listed
     */
    public String acceptTok(ArrayList<String> tokens) throws ParseException {

        checkTok(tokens);
        return next();
    }

    /** ***************************************************************
     * @param litval an expected literal string.
     * @return True if the next token's string value is the same as
     * the input, False otherwise.
     */
    public boolean testLit(String litval) throws ParseException {

        ArrayList<String> litvals = new ArrayList<String>();
        litvals.add(litval);
        return testLit(litvals);
    }

    /** ***************************************************************
     * @param litvals a list of expected literal strings
     * @return True if the next token's string value is among the input
     * strings and false otherwise.
     */
    public boolean testLit(ArrayList<String> litvals) throws ParseException {

        lookLit();
        for (int i = 0; i < litvals.size(); i++) {
            if (literal.equals(litvals.get(i)))
                return true;
        }
        return false;
    }

    /** ***************************************************************
     * Take an expected literal string. If the next token's
     * literal is not the expected one, exit with an
     * error. Otherwise do nothing.
     */
    private void checkLit(String litval) throws ParseException {

        ArrayList<String> litvals = new ArrayList<String>();
        litvals.add(litval);
        checkLit(litvals);
    }

    /** ***************************************************************
     * Take a list of expected literal strings. If the next token's
     * literal is not among the expected ones, exit with an
     * error. Otherwise do nothing.
     */
    private void checkLit(ArrayList<String> litvals) throws ParseException {

        // testLit() has already looked at the token, so "literal" is current.
        if (!testLit(litvals))
            throw new ParseException("Error in Lexer.checkLit(): " + literal + " not in " + litvals, linepos());
    }

    /** ***************************************************************
     * Take a list of expected literal strings. If the next token's
     * literal is among the expected ones, consume and return the
     * literal. Otherwise, exit with an error.
     */
    public String acceptLit(ArrayList<String> litvals) throws ParseException {

        checkLit(litvals);
        return next();
    }

    /** ***************************************************************
     * Take an expected literal string. If the next token's
     * literal is the expected one, consume and return the
     * literal. Otherwise, exit with an error.
     */
    public String acceptLit(String litval) throws ParseException {

        checkLit(litval);
        return next();
    }

    /** ***************************************************************
     * @return next semantically relevant token (tokens of type
     * WhiteSpace and SemiComment are skipped)
     */
    public String next() throws ParseException {

        String res = nextUnfiltered();
        while ((type.equals(WhiteSpace) || type.equals(SemiComment)) &&
                !res.equals(EOFToken)) {
            res = nextUnfiltered();
        }
        return res;
    }

    /** ***************************************************************
     * @return next token, including tokens, such as whitespace and
     * comments, that are ignored by most languages.  A token waiting
     * on tokenStack is returned first; otherwise the current line is
     * matched against tokenDefs, reading further (non-empty) lines
     * from the input as needed.  EOFToken is returned when the input
     * is exhausted or cannot be read.
     * @throws ParseException if no token definition matches at the
     * current position, or if a new line is needed and there is no
     * input source
     */
    public String nextUnfiltered() throws ParseException {

        if (!tokenStack.isEmpty())
            return tokenStack.pop();

        if (line == null || line.length() <= pos) {
            if (input == null)
                throw new ParseException("Error in Lexer.nextUnfiltered(): no input source", 0);
            try {
                do {
                    line = input.readLine();
                } while (line != null && line.length() == 0);
                pos = 0;
            }
            catch (IOException ioe) {
                System.out.println("Error in Lexer.nextUnfiltered()");
                System.out.println(ioe.getMessage());
                ioe.printStackTrace();
                // Record the EOF type as well, otherwise callers that test
                // "type" (e.g. lex()) would never see the end of input.
                type = EOFToken;
                return EOFToken;
            }
            if (line == null) {
                type = EOFToken;
                return EOFToken;
            }
        }

        // Go through all the token definitions and process the first one that matches
        String rest = line.substring(pos);
        for (Map.Entry<String,Pattern> def : tokenDefs.entrySet()) {
            Matcher m = def.getValue().matcher(rest);
            if (m.lookingAt()) {
                literal = m.group();
                pos = pos + m.end();
                type = def.getKey();
                return literal;
            }
        }

        // Nothing matched: quote about four characters around the failure point.
        String snippet;
        if (pos + 4 > line.length())
            snippet = line.substring(Math.max(pos - 4, 0), line.length());
        else
            snippet = line.substring(pos, pos + 4);
        throw new ParseException("Error in Lexer.nextUnfiltered(): no matches in token list for " +
                snippet + "... at line " + linepos(), pos);
    }

    /** ***************************************************************
     * Return a list of all tokens in the source.
     */
    public ArrayList<String> lex() throws ParseException {

        ArrayList<String> res = new ArrayList<String>();
        while (!testTok(EOFToken))
            res.add(next());
        return res;
    }

    /** ***************************************************************
     * Return a list of the types of all tokens in the source.
     */
    public ArrayList<String> lexTypes() throws ParseException {

        ArrayList<String> res = new ArrayList<String>();
        while (!testTok(EOFToken)) {
            String tokType = lookType();
            next(); // consume the token whose type was just recorded
            res.add(tokType);
        }
        return res;
    }

    /** ***************************************************************
     ** ***************************************************************
     * Sample inputs for the self tests below.
     */
    private static String example1 = "sense(212345678,?E), nsubj(?E,?X), dobj(?E,?Y) ==> " +
            "{(exists (?X ?E ?Y) " +
            "(and " +
            "(instance ?X Organization) " +
            "(instance ?Y Human)}" +
            "(instance ?E Hiring)" +
            "(agent ?E ?X) " +
            "(patient ?E ?Y)))}.";
    private static String example2 = "bank2";
    private static String example3 = "at*";
    private static String example4 = "num(PM-6, 8:30-5)";
    private static String example5 = "name(John-6, \"John\")";

    /** ***************************************************************
     * Lex example1 and example2 completely and report success.
     */
    private static void testLex() {

        System.out.println("-------------------------------------------------");
        System.out.println("INFO in Lexer.testLex(): example2: " + example2);
        Lexer lex1 = new Lexer(example1);
        Lexer lex2 = new Lexer(example2);
        try {
            lex1.lex();
            System.out.println("INFO in Lexer.testLex(): completed parsing example 1: " + example1);
            lex2.lex();
            System.out.println("INFO in Lexer.testLex(): completed parsing example 2: " + example2);
        }
        catch (Exception e) {
            System.out.println(e.getMessage());
            e.printStackTrace();
        }
    }

    /** ***************************************************************
     * Test acceptTok()
     */
    private static void testString() {

        System.out.println("-------------------------------------------------");
        System.out.println("INFO in Lexer.testString()");
        Lexer lex1 = new Lexer(example3);
        try {
            System.out.println(lex1.acceptTok(Ident));
        }
        catch (Exception e) {
            System.out.println(e.getMessage());
            e.printStackTrace();
        }
    }

    /** ***************************************************************
     * Test that the start of example1 is split into the expected
     * token types.
     */
    private static void testTerm() {

        System.out.println("-------------------------------------------------");
        System.out.println("INFO in Lexer.testTerm()");
        Lexer lex1 = new Lexer(example1);
        try {
            lex1.acceptTok(Ident);    // sense
            lex1.acceptTok(OpenPar);  // (
            lex1.acceptTok(Number);   // 212345678
            lex1.acceptTok(Comma);    // ,
            lex1.acceptTok(Var);      // ?E
            lex1.acceptTok(ClosePar); // )
            lex1.acceptTok(Comma);    // ,
            lex1.acceptTok(Ident);    // nsubj
            lex1.acceptTok(OpenPar);  // (
            // ...
        }
        catch (Exception e) {
            System.out.println(e.getMessage());
            e.printStackTrace();
        }
    }

    /** ***************************************************************
     * Compare two ArrayLists element by element for equality.
     */
    private static boolean compareArrays(ArrayList<String> s1, ArrayList<String> s2) {

        if (s1.size() != s2.size())
            return false;
        for (int i = 0; i < s1.size(); i++)
            if (!s1.get(i).equals(s2.get(i)))
                return false;
        return true;
    }

    /** ***************************************************************
     * Check the positive case of acceptLit().
     */
    private static void testAcceptLit() {

        System.out.println("-------------------------------------------------");
        System.out.println("INFO in Lexer.testAcceptLit()");
        Lexer lex = new Lexer(example1);
        try {
            lex.acceptLit("sense");
            lex.acceptLit("(");
            lex.acceptLit("212345678");
            lex.acceptLit(",");
            lex.acceptLit("?E");
            lex.acceptLit(")");
            lex.acceptLit(",");
            lex.acceptLit("nsubj");
            lex.acceptLit("(");
        }
        catch (Exception e) {
            System.out.println(e.getMessage());
            e.printStackTrace();
        }
    }

    /** ***************************************************************
     * Provoke different errors.
     */
    private static void testErrors() {

        System.out.println("-------------------------------------------------");
        System.out.println("INFO in Lexer.testErrors(): Should throw three errors");
        Lexer lex = null;
        try {
            lex = new Lexer(example1);
            lex.look();
        }
        catch (Exception e) {
            System.out.println(e.getMessage());
            e.printStackTrace();
        }
        try {
            lex = new Lexer(example1);
            lex.checkTok(Implies);
        }
        catch (Exception e) {
            System.out.println(e.getMessage());
            e.printStackTrace();
        }
        try {
            lex = new Lexer(example1);
            lex.checkLit("abc");
        }
        catch (Exception e) {
            System.out.println(e.getMessage());
            e.printStackTrace();
        }
    }

    /** ***************************************************************
     * Try the Number pattern on "8:30" and print the first six
     * tokens of example4.
     */
    private static void testAcceptClause() {

        System.out.println("-------------------------------------------------");
        System.out.println("INFO in Lexer.testAcceptClause()");
        Lexer lex = new Lexer(example4);
        try {
            Pattern value = tokenDefs.get(Number);
            Matcher m = value.matcher("8:30");
            if (m.lookingAt()) {
                System.out.println("parse ok");
            }
            for (int i = 0; i < 6; i++)
                System.out.println(lex.next());
        }
        catch (Exception e) {
            System.out.println(e.getMessage());
            e.printStackTrace();
        }
    }

    /** ***************************************************************
     * Try the Ident pattern on "5th-6".
     */
    private static void testAcceptClause3() {

        System.out.println("-------------------------------------------------");
        System.out.println("INFO in Lexer.testAcceptClause3()");
        init(); // make sure tokenDefs is populated before it is queried
        try {
            Pattern value = tokenDefs.get(Ident);
            Matcher m = value.matcher("5th-6");
            if (m.lookingAt()) {
                System.out.println("parse ok");
            }
        }
        catch (Exception e) {
            System.out.println(e.getMessage());
            e.printStackTrace();
        }
    }

    /** ***************************************************************
     * Print the first six tokens of example5.
     */
    private static void testAcceptClause2() {

        System.out.println("-------------------------------------------------");
        System.out.println("INFO in Lexer.testAcceptClause2()");
        Lexer lex = new Lexer(example5);
        try {
            for (int i = 0; i < 6; i++)
                System.out.println(lex.next());
        }
        catch (Exception e) {
            System.out.println(e.getMessage());
            e.printStackTrace();
        }
    }

    /** ***************************************************************
     * Command line entry point.  "-s text" prints the tokens and the
     * token types of the given text, "-h" prints help, and anything
     * else runs some of the built-in tests.
     */
    public static void main(String[] args) {

        System.out.println("INFO in Lexer.main()");
        if (args != null && args.length > 1 && args[0].equals("-s")) {
            Lexer lex = new Lexer(args[1]);
            try {
                System.out.println(lex.lex());
                lex = new Lexer(args[1]);
                System.out.println(lex.lexTypes());
            }
            catch (Exception e) {
                System.out.println(e.getMessage());
                e.printStackTrace();
            }
        }
        else if (args != null && args.length > 0 && args[0].equals("-h")) {
            System.out.println("Semantic Rewriting with SUMO, Sigma and E");
            System.out.println(" options:");
            System.out.println(" -h - show this help screen");
            System.out.println(" -s - runs one conversion of one quoted input");
            System.out.println(" with no options this falls through to some tests.");
        }
        else {
            //testString();
            testLex();
            testTerm();
            //testAcceptLit();
            //testErrors();
            //testAcceptClause3();
        }
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy