All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.igormaznitsa.prologparser.tokenizer.Tokenizer Maven / Gradle / Ivy

Go to download

It is a handwritten prolog parser, it allows to parse prolog sources written in Edinburgh Prolog style

The newest version!
/*
 * Copyright (c) 2011-2018 Igor Maznitsa. All rights reserved.
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.igormaznitsa.prologparser.tokenizer;

import static com.igormaznitsa.prologparser.ParserContext.FLAG_BLOCK_COMMENTS;
import static com.igormaznitsa.prologparser.ParserContext.FLAG_COMMENTS_AS_ATOMS;
import static com.igormaznitsa.prologparser.ParserContext.FLAG_ZERO_QUOTATION_ALLOWS_WHITESPACE_CHAR;
import static com.igormaznitsa.prologparser.ParserContext.FLAG_ZERO_QUOTATION_CHARCODE;
import static com.igormaznitsa.prologparser.tokenizer.TokenizerState.ATOM;
import static com.igormaznitsa.prologparser.tokenizer.TokenizerState.LOOK_FOR;
import static com.igormaznitsa.prologparser.tokenizer.TokenizerState.OPERATOR;
import static com.igormaznitsa.prologparser.tokenizer.TokenizerState.STRING;
import static com.igormaznitsa.prologparser.utils.StringUtils.isCharAllowedForUnquotedAtom;

import com.igormaznitsa.prologparser.ParserContext;
import com.igormaznitsa.prologparser.PrologParser;
import com.igormaznitsa.prologparser.exceptions.CriticalUnexpectedError;
import com.igormaznitsa.prologparser.exceptions.PrologParserException;
import com.igormaznitsa.prologparser.terms.OpContainer;
import com.igormaznitsa.prologparser.terms.PrologAtom;
import com.igormaznitsa.prologparser.terms.PrologFloat;
import com.igormaznitsa.prologparser.terms.PrologInt;
import com.igormaznitsa.prologparser.terms.PrologTerm;
import com.igormaznitsa.prologparser.terms.PrologVar;
import com.igormaznitsa.prologparser.terms.Quotation;
import com.igormaznitsa.prologparser.utils.Koi7CharOpMap;
import com.igormaznitsa.prologparser.utils.StringBuilderEx;
import com.igormaznitsa.prologparser.utils.StringUtils;
import java.io.IOException;
import java.io.PushbackReader;
import java.io.Reader;
import java.math.BigInteger;

/**
 * Internal tokenizer to extract next token from reader.
 */
public final class Tokenizer implements AutoCloseable {

  private final StringBuilderEx internalStringBuffer;
  private final StringBuilderEx specialCharBuffer;
  private final StringBuilderEx insideCharBuffer;
  private final boolean blockCommentsAllowed;
  private final boolean returnCommentsAsToken;
  private final boolean zeroSingleQuotationAllowed;
  private final boolean zeroQuotationAllowsWhitespaceChar;
  private final Reader reader;
  private final PrologParser parser;
  private final Koi7CharOpMap metaOperators;
  private TokenizerResult lastPushedTerm;
  private int prevTokenLine;
  private int prevTokenPos;
  private int lastTokenLine;
  private int lastTokenPos;
  private int prevPos;
  private int prevLine;
  private int pos;
  private int line;

  public Tokenizer(
      final PrologParser parser,
      final Koi7CharOpMap metaOperators,
      final Reader reader
  ) {
    super();
    this.metaOperators = metaOperators;
    this.reader = reader;
    this.parser = parser;

    final int maxAllowedCharBufferSize = parser.getContext() == null ? Integer.MAX_VALUE :
        parser.getContext().getMaxTokenizerBufferLength();
    this.returnCommentsAsToken = parser.getContext() != null
        && ((parser.getContext().getFlags() & FLAG_COMMENTS_AS_ATOMS) != 0);
    this.blockCommentsAllowed = parser.getContext() != null
        && ((parser.getContext().getFlags() & FLAG_BLOCK_COMMENTS) != 0);
    this.zeroSingleQuotationAllowed = parser.getContext() != null
        && ((parser.getContext().getFlags() & FLAG_ZERO_QUOTATION_CHARCODE) != 0);
    this.zeroQuotationAllowsWhitespaceChar = parser.getContext() != null
        && ((parser.getContext().getFlags()
        & FLAG_ZERO_QUOTATION_ALLOWS_WHITESPACE_CHAR) != 0);

    this.internalStringBuffer = new StringBuilderEx(32, maxAllowedCharBufferSize);
    this.specialCharBuffer = new StringBuilderEx(8, maxAllowedCharBufferSize);
    this.insideCharBuffer = new StringBuilderEx(8, maxAllowedCharBufferSize);

    this.pos = 1;
    this.line = 1;
    this.prevPos = 1;
    this.prevLine = 1;
  }

  public static boolean isCharAllowedForRadix(final char chr, final int radix) {
    if (radix == 10) {
      return Character.isDigit(chr);
    } else if (radix < 10) {
      return chr >= '0' && chr < ('0' + radix);
    } else {
      if (chr >= '0' && chr <= '9') {
        return true;
      }
      final int diff = radix - 10;
      if (chr >= 'A' && chr < ('A' + diff)) {
        return true;
      }
      return chr >= 'a' && chr < ('a' + diff);
    }
  }

  TokenizerResult getLastPushed() {
    return this.lastPushedTerm;
  }

  private synchronized int doReadChar() throws IOException {
    int ch;
    if (this.insideCharBuffer.isEmpty()) {
      ch = this.reader.read();
    } else {
      ch = this.insideCharBuffer.pop();
    }

    this.prevPos = this.pos;
    this.prevLine = this.line;
    if (ch == '\n') {
      this.pos = 1;
      this.line++;
    } else {
      if (ch >= 0) {
        this.pos++;
      }
    }
    return ch;
  }

  public void calcDiffAndPushResultBack(final String etalon, final StringBuilderEx buffer) {
    int chars = buffer.length() - etalon.length();
    int bufferPosition = buffer.length() - 1;

    int locPos = this.pos;
    int locPosPrev = this.prevPos;
    int locLine = this.line;
    int locLinePrev = this.prevLine;

    while (chars > 0) {
      final char ch = buffer.charAt(bufferPosition--);
      this.insideCharBuffer.push(ch);
      chars--;
      locPos = Math.max(1, locPos - 1);

      locPosPrev = locPos;
      if (ch == '\n') {
        locLine = Math.max(1, locLine - 1);
        locLinePrev = locLine;
      }
    }

    this.pos = locPos;
    this.prevPos = locPosPrev;
    this.line = locLine;
    this.prevLine = locLinePrev;
  }

  public void push(final char ch) {
    this.insideCharBuffer.push(ch);
    if (ch == '\n') {
      this.pos = 1;
      this.line = Math.max(1, this.line - 1);
    } else {
      this.pos = Math.max(1, this.pos - 1);
    }
  }

  @Override
  public void close() throws IOException {
    this.close(true);
  }

  /**
   * Close the tokenizer and clear internal buffers,
   *
   * @param closeReader true if underlying reader also should be closed, false otherwise
   * @throws IOException if any transport error during close
   */
  public void close(final boolean closeReader) throws IOException {
    try {
      if (this.reader instanceof PushbackReader) {
        final PushbackReader pushBackReader = (PushbackReader) this.reader;
        while (!this.internalStringBuffer.isEmpty()) {
          pushBackReader.unread(this.internalStringBuffer.pop());
        }
      }
    } finally {
      this.lastPushedTerm = null;
      this.internalStringBuffer.clear();

      if (closeReader) {
        this.reader.close();
      }
    }
  }

  public boolean hasOperatorStartsWith(final String operatorNameStartSubstring) {
    boolean result = false;
    if (this.metaOperators.contains(operatorNameStartSubstring)) {
      result = true;
    } else if (parser != null) {
      final ParserContext ctx = parser.getContext();
      if (ctx != null) {
        result = ctx.hasOpStartsWith(parser, operatorNameStartSubstring);
      }
    }
    return result;
  }

  public OpContainer findOperatorForName(final String operatorName) {
    OpContainer result = null;

    // check meta-operators as the first ones
    if (operatorName.length() == 1) {
      result = this.metaOperators.get(operatorName);
    }
    if (result == null) {
      // check user defined operators because a user can replace a system operator
      final ParserContext ctx = parser.getContext();
      if (ctx != null) {
        result = ctx.findOpForName(parser, operatorName);
      }
    }

    return result;
  }

  public OpContainer findOperatorForSingleChar(final char c) {
    OpContainer result = this.metaOperators.get(c);
    if (result == null) {
      return findOperatorForName(String.valueOf(c));
    }
    return result;
  }

  public void push(final TokenizerResult object) {
    if (this.lastPushedTerm != null) {
      throw new IllegalStateException("There is already pushed term");
    }
    this.lastPushedTerm = object;
  }

  public TokenizerResult peek() {
    TokenizerResult result;
    if (this.lastPushedTerm == null) {
      result = this.readNextToken();
      this.push(result);
    } else {
      result = this.lastPushedTerm;
    }
    return result;
  }

  public int getLastTokenPos() {
    return this.lastPushedTerm == null ? this.lastTokenPos : this.prevTokenPos;
  }

  public int getLastTokenLine() {
    return this.lastPushedTerm == null ? this.lastTokenLine : this.prevTokenLine;
  }

  public void fixPosition() {
    this.prevTokenLine = this.lastTokenLine;
    this.prevTokenPos = this.lastTokenPos;
    this.lastTokenLine = this.line;
    this.lastTokenPos = this.pos - 1;
  }

  private String skipTillBlockCommentEnd(final boolean accumulateText) throws IOException {
    final StringBuilder result = accumulateText ? new StringBuilder() : null;
    boolean starCharDetected = false;
    while (true) {
      final int readChar = this.doReadChar();
      if (readChar < 0) {
        break;
      } else if (readChar == '/') {
        if (starCharDetected) {
          if (accumulateText) {
            result.setLength(result.length() - 1);
          }
          break;
        } else {
          if (accumulateText) {
            result.append((char) readChar);
          }
        }
      } else {
        starCharDetected = readChar == '*';
        if (accumulateText) {
          result.append((char) readChar);
        }
      }
    }
    return accumulateText ? result.toString() : null;
  }

  private String skipTillNextLine(final boolean accumulateText) throws IOException {
    final StringBuilder result = accumulateText ? new StringBuilder() : null;

    while (true) {
      final int readChar = this.doReadChar();
      if (readChar < 0 || readChar == '\n') {
        break;
      }
      if (accumulateText) {
        result.append((char) readChar);
      }
    }
    return accumulateText ? result.toString() : null;
  }

  public TokenizerResult pop() {
    try {
      return this.lastPushedTerm;
    } finally {
      this.lastPushedTerm = null;
    }
  }

  /**
   * Read next token
   *
   * @return next token or null if not found or stream ended
   */
  public TokenizerResult readNextToken() {

    if (this.lastPushedTerm != null) {
      return this.pop();
    }

    Quotation currentQuoting = Quotation.NONE;

    int currentRadix = 10;
    char detectedRadixChar = ' ';

    TokenizerState state = LOOK_FOR;
    boolean specCharDetected = false;
    boolean charCodeAsInt = false;

    this.internalStringBuffer.clear();
    this.specialCharBuffer.clear();

    final StringBuilderEx locInternalStringBuffer = this.internalStringBuffer;
    final StringBuilderEx locSpecialCharBuffer = this.specialCharBuffer;

    final boolean commentsAsAtoms = this.returnCommentsAsToken;

    OpContainer lastFoundFullOperator = null;
    boolean letterOrDigitOnly = false;
    boolean foundUnderscoreInNumber = false;

    try {
      while (true) {
        final int readChar = this.doReadChar();

        if (readChar < 0) {
          switch (state) {
            case LOOK_FOR:
              return null;
            case FLOAT:
            case INTEGER:
            case ATOM: {
              if (foundUnderscoreInNumber) {
                throw new PrologParserException(
                    "Contains unexpected underscore: " + locInternalStringBuffer,
                    this.prevLine, this.prevPos);
              }

              if (state == TokenizerState.FLOAT && locInternalStringBuffer.isLastChar('.')) {
                // non-ended float then it is an integer number ended by the '.' operator
                push('.');
                // it is Integer
                return new TokenizerResult(
                    makeTermFromString(locInternalStringBuffer.toStringExcludeLastChar(),
                        currentRadix, currentQuoting,
                        TokenizerState.INTEGER),
                    TokenizerState.INTEGER,
                    getLastTokenLine(),
                    getLastTokenPos()
                );
              } else {
                // it is just integer number or an atom
                final String text = locInternalStringBuffer.toString();
                return new TokenizerResult(
                    makeTermFromString(text, currentRadix, PrologTerm.findQuotation(text), state),
                    state,
                    getLastTokenLine(),
                    getLastTokenPos()
                );
              }
            }
            case VAR: {
              if (locInternalStringBuffer.isSingleChar('_')) {
                return new TokenizerResult(
                    new PrologVar(),
                    state,
                    getLastTokenLine(),
                    getLastTokenPos()
                );
              } else {
                return new TokenizerResult(
                    new PrologVar(locInternalStringBuffer.toString()),
                    state,
                    getLastTokenLine(),
                    getLastTokenPos()
                );
              }
            }
            case STRING: {
              throw new PrologParserException("Non-completed string: " + locInternalStringBuffer,
                  this.lastTokenLine, this.lastTokenPos);
            }
            case OPERATOR: {
              if (lastFoundFullOperator == null) {
                return new TokenizerResult(
                    makeTermFromString(locInternalStringBuffer.toString(), currentRadix,
                        currentQuoting, state),
                    state,
                    getLastTokenLine(),
                    getLastTokenPos()
                );
              } else {
                calcDiffAndPushResultBack(lastFoundFullOperator.getText(), locInternalStringBuffer);
                return new TokenizerResult(
                    lastFoundFullOperator,
                    state,
                    getLastTokenLine(),
                    getLastTokenPos()
                );
              }
            }
            default: {
              throw new PrologParserException(
                  "Non-completed term (" + state + "): " + locInternalStringBuffer,
                  this.prevLine, this.prevPos);
            }
          }
        }

        final char chr = (char) readChar;

        if (state != STRING
            && this.blockCommentsAllowed
            && chr == '*'
            && this.internalStringBuffer.isLastChar('/')
        ) {
          if (this.internalStringBuffer.isSingleChar('/')) {
            this.internalStringBuffer.pop();
            state = this.internalStringBuffer.isEmpty() ? LOOK_FOR : state;
          } else if (state == OPERATOR) {
            throw new PrologParserException("Operator can be mixed with comment block: "
                + this.internalStringBuffer + chr, this.getLastTokenLine(), this.getLastTokenPos());
          }

          if (commentsAsAtoms) {
            final String commentText = this.skipTillBlockCommentEnd(true);
            return new TokenizerResult(
                new PrologAtom(commentText, Quotation.COMMENT_BLOCK),
                state,
                this.getLastTokenLine(),
                this.getLastTokenPos()
            );
          } else {
            this.skipTillBlockCommentEnd(false);
          }
        } else {
          switch (state) {
            case LOOK_FOR: {
              if (Character.isWhitespace(chr) || Character.isISOControl(chr)) {
                continue;
              }

              switch (chr) {
                case '%': {
                  this.fixPosition();
                  final String text = skipTillNextLine(commentsAsAtoms);
                  if (commentsAsAtoms) {
                    return new TokenizerResult(
                        new PrologAtom(text, Quotation.COMMENT_LINE),
                        state,
                        this.getLastTokenLine(),
                        this.getLastTokenPos()
                    );
                  }
                }
                break;
                case '_': {
                  fixPosition();
                  locInternalStringBuffer.append(chr);
                  state = TokenizerState.VAR;
                }
                break;
                case '\'': {
                  fixPosition();
                  currentQuoting = Quotation.SINGLE;
                  state = TokenizerState.STRING;
                }
                break;
                case '\"': {
                  fixPosition();
                  currentQuoting = Quotation.DOUBLE;
                  state = TokenizerState.STRING;
                }
                break;
                case '`': {
                  fixPosition();
                  currentQuoting = Quotation.BACK_TICK;
                  state = TokenizerState.STRING;
                }
                break;

                default: {
                  fixPosition();

                  locInternalStringBuffer.append(chr);

                  if (Character.isUpperCase(chr)) {
                    state = TokenizerState.VAR;
                  } else {
                    letterOrDigitOnly = Character.isLetterOrDigit(chr);
                    final String operator = String.valueOf(chr);

                    if (hasOperatorStartsWith(operator)) {
                      lastFoundFullOperator = findOperatorForName(operator);
                      state = TokenizerState.OPERATOR;
                    } else {
                      if (Character.isDigit(chr)) {
                        state = TokenizerState.INTEGER;
                      } else {
                        state = TokenizerState.ATOM;
                      }
                    }
                  }
                }
                break;
              }
            }
            break;
            case ATOM: {
              if (chr == '_') {
                locInternalStringBuffer.append(chr);
              } else if (Character.isISOControl(chr) || Character.isWhitespace(chr)) {
                final String text = locInternalStringBuffer.toString();

                if (currentQuoting == Quotation.NONE) {
                  final OpContainer operator = findOperatorForName(text);
                  if (operator != null) {
                    return new TokenizerResult(
                        operator,
                        TokenizerState.OPERATOR,
                        getLastTokenLine(),
                        getLastTokenPos());
                  }
                }

                return new TokenizerResult(
                    makeTermFromString(text, currentRadix, PrologTerm.findQuotation(text), state),
                    state,
                    getLastTokenLine(),
                    getLastTokenPos());
              } else if ((chr == '\'' || chr == '\"' || chr == '`')
                  || (letterOrDigitOnly != Character.isLetterOrDigit(chr))
                  || (!Character.isLetter(chr) && findOperatorForSingleChar(chr) != null)) {
                push(chr);
                final String text = locInternalStringBuffer.toString();

                if (currentQuoting == Quotation.NONE) {
                  final OpContainer operator = findOperatorForName(text);
                  if (operator != null) {
                    return new TokenizerResult(
                        operator,
                        TokenizerState.OPERATOR,
                        getLastTokenLine(),
                        getLastTokenPos());
                  }
                }
                return new TokenizerResult(
                    makeTermFromString(text, currentRadix, PrologTerm.findQuotation(text), state),
                    state,
                    getLastTokenLine(),
                    getLastTokenPos());

              } else {
                locInternalStringBuffer.append(chr);
              }
            }
            break;
            case INTEGER: {
              if (isCharAllowedForRadix(chr, currentRadix)) {
                foundUnderscoreInNumber = false;
                locInternalStringBuffer.append(chr);
              } else if (chr == '_') {
                if (foundUnderscoreInNumber) {
                  throw new PrologParserException("Duplicated underscore char in integer",
                      this.prevLine, this.prevPos);
                } else {
                  foundUnderscoreInNumber = true;
                }
              } else {
                if (chr == '.' || chr == 'e' || chr == 'E') {
                  if (foundUnderscoreInNumber) {
                    throw new PrologParserException("Underscore is not allowed before E or dot",
                        this.prevLine, this.prevPos);
                  }

                  locInternalStringBuffer.append(chr);
                  state = TokenizerState.FLOAT;
                } else {
                  if (foundUnderscoreInNumber) {
                    throw new PrologParserException("Unexpected underscore", this.prevLine,
                        this.prevPos);
                  }

                  if (chr == '\'') {
                    if (locInternalStringBuffer.isSingleChar('0')) {
                      if (this.zeroSingleQuotationAllowed) {
                        state = STRING;
                        charCodeAsInt = true;
                        locInternalStringBuffer.clear();
                      } else {
                        push(chr);
                        return new TokenizerResult(
                            makeTermFromString(locInternalStringBuffer.toString(), currentRadix,
                                currentQuoting, state),
                            TokenizerState.INTEGER,
                            getLastTokenLine(),
                            getLastTokenPos());
                      }
                    } else {
                      currentRadix = Integer.parseInt(locInternalStringBuffer.toString());
                      if (currentRadix < 2 || currentRadix > 36) {
                        throw new PrologParserException("Radix must be 2..36: " + currentRadix,
                            getLastTokenLine(),
                            getLastTokenPos());
                      }
                      locInternalStringBuffer.clear();
                    }
                  } else {
                    boolean radixCharFound = false;
                    if (currentRadix == 10 && locInternalStringBuffer.isSingleChar('0')) {
                      switch (chr) {
                        case 'x':
                          radixCharFound = true;
                          currentRadix = 16;
                          detectedRadixChar = chr;
                          locInternalStringBuffer.clear();
                          break;
                        case 'o':
                          radixCharFound = true;
                          currentRadix = 8;
                          detectedRadixChar = chr;
                          locInternalStringBuffer.clear();
                          break;
                        case 'b':
                          radixCharFound = true;
                          currentRadix = 2;
                          detectedRadixChar = chr;
                          locInternalStringBuffer.clear();
                          break;
                        default:
                          break;
                      }
                    }
                    if (!radixCharFound) {
                      push(chr);
                      if (locInternalStringBuffer.isEmpty() && detectedRadixChar != ' ') {
                        push(detectedRadixChar);
                        locInternalStringBuffer.append('0');
                      }
                      return new TokenizerResult(
                          makeTermFromString(locInternalStringBuffer.toString(), currentRadix,
                              currentQuoting, state),
                          TokenizerState.INTEGER,
                          getLastTokenLine(),
                          getLastTokenPos());
                    }
                  }
                }
              }
            }
            break;
            case FLOAT: {
              if (Character.isDigit(chr)) {
                foundUnderscoreInNumber = false;
                locInternalStringBuffer.append(chr);
              } else if (chr == '_') {
                if (foundUnderscoreInNumber || locInternalStringBuffer.isLastChar('.')) {
                  throw new PrologParserException(
                      "Underscore after dot in number: " + locInternalStringBuffer,
                      this.prevLine, this.prevPos);
                } else {
                  foundUnderscoreInNumber = true;
                }
              } else {
                switch (chr) {
                  case '-':
                  case '+':
                    if (locInternalStringBuffer.isLastChar('e')) {
                      locInternalStringBuffer.append(chr);
                    } else {
                      push(chr);
                      return new TokenizerResult(
                          makeTermFromString(locInternalStringBuffer.toString(), currentRadix,
                              currentQuoting,
                              TokenizerState.FLOAT),
                          TokenizerState.FLOAT,
                          getLastTokenLine(),
                          getLastTokenPos());
                    }
                    break;
                  case 'e':
                  case 'E':
                    if (foundUnderscoreInNumber) {
                      throw new PrologParserException("Underscore is not allowed before E",
                          this.prevLine, this.prevPos);
                    }
                    if (locInternalStringBuffer.lastIndexOf('e') < 0) {
                      locInternalStringBuffer.append('e');
                    } else {
                      push(chr);
                      return new TokenizerResult(
                          makeTermFromString(locInternalStringBuffer.toStringExcludeLastChar(),
                              currentRadix, currentQuoting,
                              TokenizerState.FLOAT),
                          TokenizerState.FLOAT,
                          getLastTokenLine(),
                          getLastTokenPos());
                    }
                    break;
                  default:
                    push(chr);

                    if (foundUnderscoreInNumber) {
                      throw new PrologParserException("Unexpected underscore", this.prevLine,
                          this.prevPos);
                    }

                    if (locInternalStringBuffer.isLastChar('.')) {
                      // it was an integer
                      push('.');
                      return new TokenizerResult(
                          makeTermFromString(locInternalStringBuffer.toStringExcludeLastChar(),
                              currentRadix, currentQuoting,
                              TokenizerState.INTEGER),
                          TokenizerState.INTEGER,
                          getLastTokenLine(),
                          getLastTokenPos());
                    } else {
                      // it is a float
                      if (!Character.isDigit(locInternalStringBuffer.getLastChar())) {
                        throw new PrologParserException("Unexpected end of float: "
                            + locInternalStringBuffer, this.prevLine, this.prevPos);
                      }
                      return new TokenizerResult(
                          makeTermFromString(locInternalStringBuffer.toString(), currentRadix,
                              currentQuoting, state),
                          state,
                          getLastTokenLine(),
                          getLastTokenPos()
                      );
                    }
                }
              }
            }
            break;
            case OPERATOR: {
              if (chr != '_' && letterOrDigitOnly != Character.isLetterOrDigit(chr)) {
                push(chr);

                if (lastFoundFullOperator == null || letterOrDigitOnly) {
                  final String textInBuffer = locInternalStringBuffer.toString();

                  if (lastFoundFullOperator != null
                      && lastFoundFullOperator.getText().equals(textInBuffer)) {
                    return new TokenizerResult(
                        lastFoundFullOperator,
                        state,
                        getLastTokenLine(),
                        getLastTokenPos()
                    );
                  } else {
                    return new TokenizerResult(
                        makeTermFromString(textInBuffer, currentRadix, currentQuoting, ATOM),
                        ATOM,
                        getLastTokenLine(),
                        getLastTokenPos()
                    );
                  }
                } else {
                  calcDiffAndPushResultBack(
                      lastFoundFullOperator.getText(), locInternalStringBuffer);
                  return new TokenizerResult(
                      lastFoundFullOperator,
                      state,
                      getLastTokenLine(),
                      getLastTokenPos()
                  );
                }
              } else {
                final OpContainer previouslyDetectedOperator = lastFoundFullOperator;

                locInternalStringBuffer.append(chr);
                final String operator = locInternalStringBuffer.toString();
                lastFoundFullOperator = findOperatorForName(operator);

                if (previouslyDetectedOperator == null) {
                  if (!hasOperatorStartsWith(operator)) {
                    if (hasOperatorStartsWith(String.valueOf(chr))) {
                      // next char can be the start char of an
                      // operator, so we need get back it into the
                      // buffer
                      locInternalStringBuffer.pop();
                      push(chr);
                    }
                    state = TokenizerState.ATOM;
                  }
                } else {
                  if (lastFoundFullOperator == null) {
                    if (hasOperatorStartsWith(operator)) {
                      lastFoundFullOperator = previouslyDetectedOperator;
                    } else {
                      if (letterOrDigitOnly) {
                        state = TokenizerState.ATOM;
                      } else {
                        calcDiffAndPushResultBack(
                            previouslyDetectedOperator.getText(), locInternalStringBuffer);
                        return new TokenizerResult(
                            previouslyDetectedOperator,
                            state, getLastTokenLine(),
                            getLastTokenPos()
                        );
                      }
                    }
                  } else {
                    if (!hasOperatorStartsWith(operator)) {
                      calcDiffAndPushResultBack(
                          previouslyDetectedOperator.getText(), locInternalStringBuffer);
                      return new TokenizerResult(
                          previouslyDetectedOperator,
                          state,
                          getLastTokenLine(),
                          getLastTokenPos());
                    }
                  }
                }
              }
            }
            break;
            case STRING: {
              if (specCharDetected) {
                if (locSpecialCharBuffer.isEmpty() && chr == '\n') {
                  specCharDetected = false;

                  locInternalStringBuffer.append('\n');
                  if (charCodeAsInt) {
                    return new TokenizerResult(
                        makeTermFromString(locInternalStringBuffer.toString(), currentRadix,
                            currentQuoting, state),
                        state,
                        getLastTokenLine(),
                        getLastTokenPos()
                    );
                  }
                } else if (!Character.isISOControl(chr)) {
                  locSpecialCharBuffer.append(chr);
                  final StringUtils.UnescapeResult result =
                      StringUtils.tryUnescapeCharacter(locSpecialCharBuffer);
                  if (result.isError()) {
                    throw new PrologParserException("Detected wrong escape char: \\"
                        + locSpecialCharBuffer, this.prevLine, this.prevPos);
                  } else if (!result.doesNeedMore()) {
                    specCharDetected = false;
                    if (charCodeAsInt) {
                      return new TokenizerResult(
                          new PrologInt(result.getDecoded()),
                          state,
                          getLastTokenLine(),
                          getLastTokenPos()
                      );
                    } else {
                      locInternalStringBuffer.append(result.getDecoded());
                    }
                  }
                } else {
                  if (!(chr == '\r' && locSpecialCharBuffer.isEmpty())) {
                    throw new PrologParserException(
                        "Unexpected char: 0x" + Integer.toHexString(chr), this.prevLine,
                        this.prevPos);
                  }
                }
              } else {
                switch (chr) {
                  case '\'':
                    if (currentQuoting == Quotation.SINGLE) {
                      return new TokenizerResult(
                          makeTermFromString(locInternalStringBuffer.toString(), currentRadix,
                              currentQuoting, state),
                          state,
                          getLastTokenLine(),
                          getLastTokenPos()
                      );
                    } else {
                      if (charCodeAsInt) {
                        throw new PrologParserException("Char ''' must be escaped in such case",
                            this.prevLine, this.prevPos);
                      } else {
                        locInternalStringBuffer.append(chr);
                      }
                    }
                    break;
                  case '`':
                    if (currentQuoting == Quotation.BACK_TICK) {
                      return new TokenizerResult(
                          makeTermFromString(locInternalStringBuffer.toString(), currentRadix,
                              currentQuoting, state),
                          state,
                          getLastTokenLine(),
                          getLastTokenPos()
                      );
                    } else {
                      if (charCodeAsInt) {
                        return new TokenizerResult(
                            new PrologInt(chr),
                            state,
                            getLastTokenLine(),
                            getLastTokenPos()
                        );
                      } else {
                        locInternalStringBuffer.append(chr);
                      }
                    }
                    break;
                  case '\"':
                    if (currentQuoting == Quotation.DOUBLE) {
                      return new TokenizerResult(
                          makeTermFromString(locInternalStringBuffer.toString(), currentRadix,
                              currentQuoting, state),
                          state,
                          getLastTokenLine(),
                          getLastTokenPos()
                      );
                    } else {
                      if (charCodeAsInt) {
                        return new TokenizerResult(
                            new PrologInt(chr),
                            state,
                            getLastTokenLine(),
                            getLastTokenPos()
                        );
                      } else {
                        locInternalStringBuffer.append(chr);
                      }
                    }
                    break;
                  case '\\':
                    specCharDetected = true;
                    locSpecialCharBuffer.clear();
                    break;
                  default:
                    final char theChar;

                    if (Character.isISOControl(chr)) {
                      throw new PrologParserException(
                          "Unexpected control char: 0x" + Integer.toHexString(chr), this.prevLine,
                          this.prevPos);
                    } else {
                      theChar = chr;
                    }
                    if (charCodeAsInt) {
                      if (Character.isWhitespace(chr) && !this.zeroQuotationAllowsWhitespaceChar) {
                        throw new PrologParserException(
                            "Unexpected whitespace char: 0x" + Integer.toHexString(chr),
                            this.prevLine, this.prevPos);
                      } else {
                        return new TokenizerResult(
                            new PrologInt(theChar),
                            state,
                            getLastTokenLine(),
                            getLastTokenPos()
                        );
                      }
                    } else {
                      locInternalStringBuffer.append(theChar);
                    }
                    break;
                }
              }
            }
            break;
            case VAR: {
              if (!isCharAllowedForUnquotedAtom(chr)) {
                push(chr);
                if (locInternalStringBuffer.isSingleChar('_')) {
                  return new TokenizerResult(new PrologVar(), state, getLastTokenLine(),
                      getLastTokenPos());
                }
                return new TokenizerResult(new PrologVar(locInternalStringBuffer.toString()), state,
                    getLastTokenLine(), getLastTokenPos());
              } else {
                locInternalStringBuffer.append(chr);
              }
            }
            break;
            default:
              throw new CriticalUnexpectedError();
          }
        }
      }
    } catch (IOException ex) {
      throw new PrologParserException("IO exception during read char", this.prevLine, this.prevPos,
          ex);
    }
  }

  public PrologTerm makeTermFromString(final String str, final int radix,
                                       final Quotation quotingType, final TokenizerState state) {
    PrologTerm result;

    switch (state) {
      case INTEGER: {
        try {
          result = radix == 10 ? new PrologInt(str) : new PrologInt(new BigInteger(str, radix));
        } catch (NumberFormatException ex) {
          result = null;
        }
      }
      break;
      case FLOAT: {
        try {
          result = new PrologFloat(str);
        } catch (NumberFormatException ex) {
          result = null;
        }
      }
      break;
      default: {
        result = null;
      }
      break;
    }

    if (result == null) {
      result = new PrologAtom(str, quotingType);
    }

    return result;
  }

  public int getLine() {
    return this.line;
  }

  public int getPos() {
    return this.pos;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy