
// org.jparsec.Terminals (artifact-listing header; not part of the original source)
/*****************************************************************************
* Copyright (C) jparsec.org *
* ------------------------------------------------------------------------- *
* Licensed under the Apache License, Version 2.0 (the "License"); *
* you may not use this file except in compliance with the License. *
* You may obtain a copy of the License at *
* *
* http://www.apache.org/licenses/LICENSE-2.0 *
* *
* Unless required by applicable law or agreed to in writing, software *
* distributed under the License is distributed on an "AS IS" BASIS, *
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
* See the License for the specific language governing permissions and *
* limitations under the License. *
*****************************************************************************/
package org.jparsec;
import static java.util.Arrays.asList;
import static org.jparsec.internal.util.Checks.checkArgument;
import java.util.ArrayList;
import java.util.Collection;
import java.util.function.Function;
import org.jparsec.Tokens.Fragment;
import org.jparsec.Tokens.ScientificNotation;
import org.jparsec.Tokens.Tag;
import org.jparsec.internal.annotations.Private;
import org.jparsec.internal.util.Objects;
import org.jparsec.internal.util.Strings;
import org.jparsec.internal.util.Checks;
/**
 * Provides convenient API to build lexer and parsers for terminals.
 * The following example is a parser snippet for Java generic type expression such as
 * {@code List<String>}: {@code
 *   Terminals terms = Terminals
 *       .operators("?", "<", ">", ",")
 *       .words(Scanners.IDENTIFIER)
 *       .keywords("super", "extends")
 *       .build();
 *   Parser<String> typeName = Terminals.identifier();
 *   Parser<?> wildcardWithUpperBound = terms.phrase("?", "extends");
 *   ...
 *   parser.from(terms.tokenizer(), Scanners.WHITESPACES.optional()).parse("List<String>");
 * }
 *
 * @author Ben Yu
 */
public final class Terminals extends Lexicon {
/**
 * {@link Parser} that recognizes reserved word tokens,
 * i.e. {@link Tokens.Fragment} tokens tagged as {@link Tag#RESERVED}.
 * {@link Fragment#text()} is returned as the parser result.
 */
public static final Parser<String> RESERVED = fragment(Tag.RESERVED);
/** Entry point for parser and tokenizers of character literal. */
public static final class CharLiteral {
  private CharLiteral() {}

  /** {@link Parser} that recognizes {@link Character} tokens. */
  public static final Parser<Character> PARSER =
      Parsers.tokenType(Character.class, "character literal");

  /**
   * A tokenizer that parses single quoted character literal (escaped by {@code '\'}),
   * and then converts the character to a {@link Character} token.
   */
  public static final Parser<Character> SINGLE_QUOTE_TOKENIZER =
      Scanners.SINGLE_QUOTE_CHAR.map(TokenizerMaps.SINGLE_QUOTE_CHAR);
}
/** Entry point for parser and tokenizers of string literal. */
public static final class StringLiteral {
  private StringLiteral() {}

  /** {@link Parser} that recognizes {@link String} tokens. */
  public static final Parser<String> PARSER = Parsers.tokenType(String.class, "string literal");

  /**
   * A tokenizer that parses double quoted string literal (escaped by {@code '\'}),
   * and transforms the quoted content by applying escape characters.
   */
  public static final Parser<String> DOUBLE_QUOTE_TOKENIZER =
      Scanners.DOUBLE_QUOTE_STRING.map(TokenizerMaps.DOUBLE_QUOTE_STRING);

  /**
   * A tokenizer that parses single quoted string literal (single quote is escaped with
   * another single quote), and transforms the quoted content by applying escape characters.
   */
  public static final Parser<String> SINGLE_QUOTE_TOKENIZER =
      Scanners.SINGLE_QUOTE_STRING.map(TokenizerMaps.SINGLE_QUOTE_STRING);
}
/** Entry point for parser and tokenizers of integral number literal represented as {@link Long}. */
public static final class LongLiteral {
  private LongLiteral() {}

  /** {@link Parser} that recognizes {@link Long} tokens. */
  public static final Parser<Long> PARSER = Parsers.tokenType(Long.class, "integer literal");

  /**
   * A tokenizer that parses a decimal integer number (valid patterns are: {@code 1, 10, 123}),
   * and converts the string to a {@link Long} value.
   */
  public static final Parser<Long> DEC_TOKENIZER =
      Scanners.DEC_INTEGER.map(TokenizerMaps.DEC_AS_LONG);

  /**
   * A tokenizer that parses a octal integer number (valid patterns are:
   * {@code 0, 07, 017, 0371} etc.), and converts the string to a {@link Long} value.
   *
   * <p>An octal number has to start with 0.
   */
  public static final Parser<Long> OCT_TOKENIZER =
      Scanners.OCT_INTEGER.map(TokenizerMaps.OCT_AS_LONG);

  /**
   * A tokenizer that parses a hex integer number (valid patterns are:
   * {@code 0x1, 0Xff, 0xFe1} etc.), and converts the string to a {@link Long} value.
   *
   * <p>A hex number has to start with either 0x or 0X.
   */
  public static final Parser<Long> HEX_TOKENIZER =
      Scanners.HEX_INTEGER.map(TokenizerMaps.HEX_AS_LONG);

  /**
   * A tokenizer that parses decimal, hex, and octal numbers and converts the string to a
   * {@code Long} value. Hex is tried first so the {@code 0x} prefix is not mistaken for octal.
   */
  public static final Parser<Long> TOKENIZER =
      Parsers.or(HEX_TOKENIZER, DEC_TOKENIZER, OCT_TOKENIZER);
}
/** Entry point for any arbitrary integer literal represented as a {@link String}. */
public static final class IntegerLiteral {
  private IntegerLiteral() {}

  /**
   * {@link Parser} that recognizes {@link Tokens.Fragment} tokens tagged as {@link Tag#INTEGER}.
   */
  public static final Parser<String> PARSER = fragment(Tag.INTEGER);

  /**
   * A tokenizer that parses a integer number (valid patterns are: {@code 0, 00, 1, 10})
   * and returns a {@link Fragment} token tagged as {@link Tag#INTEGER}.
   */
  public static final Parser<Fragment> TOKENIZER =
      Scanners.INTEGER.map(TokenizerMaps.INTEGER_FRAGMENT);
}
/** Entry point for parser and tokenizers of decimal number literal represented as {@link String}.*/
public static final class DecimalLiteral {
  private DecimalLiteral() {}

  /**
   * {@link Parser} that recognizes {@link Tokens.Fragment} tokens tagged as {@link Tag#DECIMAL}.
   */
  public static final Parser<String> PARSER = fragment(Tag.DECIMAL);

  /**
   * A tokenizer that parses a decimal number (valid patterns are: {@code 1, 2.3, 00, 0., .23})
   * and returns a {@link Fragment} token tagged as {@link Tag#DECIMAL}.
   */
  public static final Parser<Fragment> TOKENIZER =
      Scanners.DECIMAL.map(TokenizerMaps.DECIMAL_FRAGMENT);
}
/** Entry point for parser and tokenizers of scientific notation literal. */
public static final class ScientificNumberLiteral {
  private ScientificNumberLiteral() {}

  /** {@link Parser} that recognizes {@link ScientificNotation} tokens. */
  public static final Parser<ScientificNotation> PARSER =
      Parsers.tokenType(ScientificNotation.class, "scientific number literal");

  /**
   * A tokenizer that parses a scientific notation and converts the string to a
   * {@link ScientificNotation} value.
   */
  public static final Parser<ScientificNotation> TOKENIZER =
      Scanners.SCIENTIFIC_NOTATION.map(TokenizerMaps.SCIENTIFIC_NOTATION);
}
/** Entry point for parser and tokenizers of regular identifier. */
public static final class Identifier {
  private Identifier() {}

  /**
   * {@link Parser} that recognizes identifier tokens,
   * i.e. {@link Tokens.Fragment} tokens tagged as {@link Tag#IDENTIFIER}.
   * {@link Fragment#text()} is returned as the parser result.
   */
  public static final Parser<String> PARSER = fragment(Tag.IDENTIFIER);

  /**
   * A tokenizer that parses any identifier and returns a {@link Fragment} token tagged as
   * {@link Tag#IDENTIFIER}.
   *
   * <p>An identifier starts with an alphabetic character or underscore,
   * and is followed by 0 or more alphanumeric characters or underscore.
   */
  public static final Parser<Fragment> TOKENIZER =
      Scanners.IDENTIFIER.map(TokenizerMaps.IDENTIFIER_FRAGMENT);
}
/** Wraps the given {@link Lexicon}'s word table and tokenizer in a {@code Terminals} facade. */
private Terminals(Lexicon lexicon) {
super(lexicon.words, lexicon.tokenizer);
}
/**
 * Returns a {@link Terminals} object for lexing and parsing the operators with names specified in
 * {@code ops}, and for lexing and parsing the keywords case insensitively. Parsers for operators
 * and keywords can be obtained through {@link #token}; parsers for identifiers through
 * {@link #identifier}.
 *
 * <p>In detail, keywords and operators are lexed as {@link Tokens.Fragment} with
 * {@link Tag#RESERVED} tag. Words that are not among {@code keywords} are lexed as
 * {@code Fragment} with {@link Tag#IDENTIFIER} tag.
 *
 * <p>A word is defined as an alphanumeric string that starts with {@code [_a-zA-Z]},
 * with 0 or more {@code [0-9_a-zA-Z]} following.
 *
 * @param ops the operator names.
 * @param keywords the keyword names.
 * @return the Terminals instance.
 * @deprecated Use {@code operators(ops)
 *     .words(Scanners.IDENTIFIER)
 *     .caseInsensitiveKeywords(keywords)
 *     .build()} instead.
 */
@Deprecated
public static Terminals caseInsensitive(String[] ops, String[] keywords) {
return operators(ops).words(Scanners.IDENTIFIER).caseInsensitiveKeywords(asList(keywords)).build();
}
/**
 * Returns a {@link Terminals} object for lexing and parsing the operators with names specified in
 * {@code ops}, and for lexing and parsing the keywords case sensitively. Parsers for operators
 * and keywords can be obtained through {@link #token}; parsers for identifiers through
 * {@link #identifier}.
 *
 * <p>In detail, keywords and operators are lexed as {@link Tokens.Fragment} with
 * {@link Tag#RESERVED} tag. Words that are not among {@code keywords} are lexed as
 * {@code Fragment} with {@link Tag#IDENTIFIER} tag.
 *
 * <p>A word is defined as an alphanumeric string that starts with {@code [_a-zA-Z]},
 * with 0 or more {@code [0-9_a-zA-Z]} following.
 *
 * @param ops the operator names.
 * @param keywords the keyword names.
 * @return the Terminals instance.
 * @deprecated Use {@code operators(ops)
 *     .words(Scanners.IDENTIFIER)
 *     .keywords(keywords)
 *     .build()} instead.
 */
@Deprecated
public static Terminals caseSensitive(String[] ops, String[] keywords) {
return operators(ops).words(Scanners.IDENTIFIER).keywords(asList(keywords)).build();
}
/**
 * Returns a {@link Terminals} object for lexing and parsing the operators with names specified in
 * {@code ops}, and for lexing and parsing the keywords case insensitively. Parsers for operators
 * and keywords can be obtained through {@link #token}; parsers for identifiers through
 * {@link #identifier}.
 *
 * <p>In detail, keywords and operators are lexed as {@link Tokens.Fragment} with
 * {@link Tag#RESERVED} tag. Words that are not among {@code keywords} are lexed as
 * {@code Fragment} with {@link Tag#IDENTIFIER} tag.
 *
 * @param wordScanner the scanner that returns a word in the language.
 * @param ops the operator names.
 * @param keywords the keyword names.
 * @return the Terminals instance.
 * @deprecated Use {@code operators(ops)
 *     .words(wordScanner)
 *     .caseInsensitiveKeywords(keywords)
 *     .build()} instead.
 */
@Deprecated
public static Terminals caseInsensitive(
    Parser<String> wordScanner, String[] ops, String[] keywords) {
  return operators(ops)
      .words(wordScanner)
      .caseInsensitiveKeywords(keywords)
      .build();
}
/**
 * Returns a {@link Terminals} object for lexing and parsing the operators with names specified in
 * {@code ops}, and for lexing and parsing the keywords case sensitively. Parsers for operators
 * and keywords can be obtained through {@link #token}; parsers for identifiers through
 * {@link #identifier}.
 *
 * <p>In detail, keywords and operators are lexed as {@link Tokens.Fragment} with
 * {@link Tag#RESERVED} tag. Words that are not among {@code keywords} are lexed as
 * {@code Fragment} with {@link Tag#IDENTIFIER} tag.
 *
 * @param wordScanner the scanner that returns a word in the language.
 * @param ops the operator names.
 * @param keywords the keyword names.
 * @return the Terminals instance.
 * @deprecated Use {@code operators(ops)
 *     .words(wordScanner)
 *     .keywords(keywords)
 *     .build()} instead.
 */
@Deprecated
public static Terminals caseSensitive(
    Parser<String> wordScanner, String[] ops, String[] keywords) {
  return operators(ops)
      .words(wordScanner)
      .keywords(keywords)
      .build();
}
/**
 * Returns a {@link Terminals} object for lexing and parsing the operators with names specified in
 * {@code ops}, and for lexing and parsing the keywords case insensitively. Parsers for operators
 * and keywords can be obtained through {@link #token}; parsers for identifiers through
 * {@link #identifier}.
 *
 * <p>In detail, keywords and operators are lexed as {@link Tokens.Fragment} with
 * {@link Tag#RESERVED} tag. Words that are not among {@code keywords} are lexed as
 * {@code Fragment} with {@link Tag#IDENTIFIER} tag.
 *
 * @param wordScanner the scanner that returns a word in the language.
 * @param ops the operator names.
 * @param keywords the keyword names.
 * @param wordMap maps the text to a token value for non-keywords recognized by
 *     {@code wordScanner}.
 * @return the Terminals instance.
 * @deprecated Use {@code operators(ops)
 *     .words(wordScanner)
 *     .tokenizeWordsWith(wordMap)
 *     .caseInsensitiveKeywords(keywords)
 *     .build()} instead.
 */
@Deprecated
public static Terminals caseInsensitive(
    Parser<String> wordScanner, String[] ops, String[] keywords, Function<String, ?> wordMap) {
  return operators(ops)
      .words(wordScanner)
      .caseInsensitiveKeywords(keywords)
      .tokenizeWordsWith(wordMap)
      .build();
}
/**
 * Returns a {@link Terminals} object for lexing and parsing the operators with names specified in
 * {@code ops}, and for lexing and parsing the keywords case sensitively. Parsers for operators
 * and keywords can be obtained through {@link #token}; parsers for identifiers through
 * {@link #identifier}.
 *
 * <p>In detail, keywords and operators are lexed as {@link Tokens.Fragment} with
 * {@link Tag#RESERVED} tag. Words that are not among {@code keywords} are lexed as
 * {@code Fragment} with {@link Tag#IDENTIFIER} tag.
 *
 * @param wordScanner the scanner that returns a word in the language.
 * @param ops the operator names.
 * @param keywords the keyword names.
 * @param wordMap maps the text to a token value for non-keywords recognized by
 *     {@code wordScanner}.
 * @return the Terminals instance.
 * @deprecated Use {@code operators(ops)
 *     .words(wordScanner)
 *     .tokenizeWordsWith(wordMap)
 *     .keywords(keywords)
 *     .build()} instead.
 */
@Deprecated
public static Terminals caseSensitive(
    Parser<String> wordScanner, String[] ops, String[] keywords, Function<String, ?> wordMap) {
  return operators(ops)
      .words(wordScanner)
      .keywords(keywords)
      .tokenizeWordsWith(wordMap)
      .build();
}
/**
 * Returns a {@link Terminals} object for lexing the operators with names specified in
 * {@code ops}. Operators are lexed as {@link Tokens.Fragment} with {@link Tag#RESERVED} tag.
 * For example, to get the parser for operator "?", simply call {@code token("?")}.
 *
 * <p>If words and keywords need to be parsed, they can be configured via {@link #words}.
 *
 * @param ops the operator names.
 * @return the Terminals instance.
 */
public static Terminals operators(String... ops) {
return operators(asList(ops));
}
/**
 * Returns a {@link Terminals} object for lexing the operators with names specified in
 * {@code ops}. Operators are lexed as {@link Tokens.Fragment} with {@link Tag#RESERVED} tag.
 * For example, to get the parser for operator "?", simply call {@code token("?")}.
 *
 * <p>If words and keywords need to be parsed, they can be configured via {@link #words}.
 *
 * @param ops the operator names.
 * @return the Terminals instance.
 * @since 2.2
 */
public static Terminals operators(Collection<String> ops) {
  return new Terminals(Operators.lexicon(ops));
}
/**
 * Starts to build a new {@code Terminals} instance that recognizes words not already recognized
 * by {@code this} {@code Terminals} instance (typically operators).
 *
 * <p>By default identifiers are recognized through {@link #identifier} during token-level
 * parsing phase. Use {@link Builder#tokenizeWordsWith} to tokenize differently, and choose an
 * alternative token-level parser accordingly.
 *
 * @param wordScanner defines words recognized by the new instance
 * @since 2.2
 */
public Builder words(Parser<String> wordScanner) {
  return new Builder(wordScanner);
}
/**
 * Builds {@link Terminals} instance by defining the words and keywords recognized.
 *
 * <p>The following example implements a calculator with logical operators: {@code
 *   Terminals terms = Terminals
 *       .operators("<", ">", "=", ">=", "<=")
 *       .words(Scanners.IDENTIFIER)
 *       .caseInsensitiveKeywords("and", "or")
 *       .build();
 *   Parser<String> var = Terminals.identifier();
 *   Parser<Integer> integer = Terminals.IntegerLiteral.PARSER.map(...);
 *   Parser<Token> and = terms.token("and");
 *   Parser<Token> lessThan = terms.token("<");
 *   ...
 *   Parser<?> parser = grammar.from(
 *       terms.tokenizer().or(IntegerLiteral.TOKENIZER), Scanners.WHITESPACES.optional());
 * }
 *
 * @since 2.2
 */
public final class Builder {

  // Character-level scanner that recognizes a word (keyword or plain word).
  private final Parser<String> wordScanner;

  // Keywords to reserve; the last keywords()/caseInsensitiveKeywords() call wins.
  private Collection<String> keywords = new ArrayList<>();

  // Case sensitivity applied when matching keywords.
  private StringCase stringCase = StringCase.CASE_SENSITIVE;

  // Maps a non-keyword word's text to its token value; identifier fragments by default.
  private Function<String, ?> wordTokenMap = TokenizerMaps.IDENTIFIER_FRAGMENT;

  Builder(Parser<String> wordScanner) {
    this.wordScanner = Checks.checkNotNull(wordScanner);
  }

  /**
   * Defines keywords. Keywords are special words with their own grammar rules.
   * To get the parser for a keyword, call {@code token(keyword)}.
   *
   * <p>Note that if you call {@link #keywords} or {@link #caseInsensitiveKeywords} multiple
   * times on the same {@link Builder} instance, the last call overwrites previous calls.
   */
  public Builder keywords(@SuppressWarnings("hiding") String... keywords) {
    return keywords(asList(keywords));
  }

  /**
   * Defines keywords. Keywords are special words with their own grammar rules.
   * To get the parser for a keyword, call {@code token(keyword)}.
   *
   * <p>Note that if you call {@link #keywords} or {@link #caseInsensitiveKeywords} multiple
   * times on the same {@link Builder} instance, the last call overwrites previous calls.
   */
  public Builder keywords(@SuppressWarnings("hiding") Collection<String> keywords) {
    this.keywords = keywords;
    this.stringCase = StringCase.CASE_SENSITIVE;
    return this;
  }

  /**
   * Defines case insensitive keywords. Keywords are special words with their own grammar
   * rules. To get the parser for a keyword, call {@code token(keyword)}.
   *
   * <p>Note that if you call {@link #keywords} or {@link #caseInsensitiveKeywords} multiple
   * times on the same {@link Builder} instance, the last call overwrites previous calls.
   */
  public Builder caseInsensitiveKeywords(@SuppressWarnings("hiding") String... keywords) {
    return caseInsensitiveKeywords(asList(keywords));
  }

  /**
   * Defines case insensitive keywords. Keywords are special words with their own grammar
   * rules. To get the parser for a keyword, call {@code token(keyword)}.
   *
   * <p>Note that if you call {@link #keywords} or {@link #caseInsensitiveKeywords} multiple
   * times on the same {@link Builder} instance, the last call overwrites previous calls.
   */
  public Builder caseInsensitiveKeywords(@SuppressWarnings("hiding") Collection<String> keywords) {
    this.keywords = keywords;
    this.stringCase = StringCase.CASE_INSENSITIVE;
    return this;
  }

  /** Configures alternative tokenization strategy for words (except keywords). */
  public Builder tokenizeWordsWith(Function<String, ?> wordMap) {
    this.wordTokenMap = Checks.checkNotNull(wordMap);
    return this;
  }

  /** Builds a new {@link Terminals} instance that recognizes words defined in this builder. */
  public Terminals build() {
    return new Terminals(
        union(Keywords.lexicon(wordScanner, keywords, stringCase, wordTokenMap)));
  }
}
/**
 * Returns a {@link Parser} that recognizes identifiers (a.k.a words, variable names etc).
 * Equivalent to {@link Identifier#PARSER}.
 *
 * @since 2.2
 */
public static Parser<String> identifier() {
  return Identifier.PARSER;
}
/**
 * Returns a {@link Parser} that recognizes {@link Tokens.Fragment} token values
 * tagged with one of {@code tags}, yielding the fragment's text.
 */
public static Parser<String> fragment(final Object... tags) {
  return Parsers.token(fromFragment(tags));
}
/**
 * Returns a {@link TokenMap} object that only recognizes {@link Tokens.Fragment} token values
 * tagged with one of {@code tags}; non-fragment tokens and fragments with other tags map to
 * {@code null} (not recognized).
 */
static TokenMap<String> fromFragment(final Object... tags) {
  return new TokenMap<String>() {
    @Override public String map(final Token token) {
      final Object val = token.value();
      if (val instanceof Fragment) {
        Fragment c = (Fragment) val;
        // Only fragments carrying one of the requested tags are recognized.
        if (!Objects.in(c.tag(), tags)) return null;
        return c.text();
      }
      else return null;
    }
    @Override public String toString() {
      // Human-readable name used in error messages: single tag as-is, several in brackets.
      if (tags.length == 0) return "";
      if (tags.length == 1) return String.valueOf(tags[0]);
      return "[" + Strings.join(", ", tags) + "]";
    }
  };
}
/**
 * Checks that no string appears in both {@code a} and {@code b};
 * {@code Checks.checkArgument} fails on the first duplicate found.
 */
@Private static void checkDup(Iterable<String> a, Iterable<String> b) {
  for (String s1 : a) {
    for (String s2 : b) {
      Checks.checkArgument(!s1.equals(s2), "%s duplicated", s1);
    }
  }
}
}