All Downloads are FREE. Search and download functionality uses the official Maven repository.

com.squarespace.cldrengine.message.MessagePatternParser Maven / Gradle / Ivy

The newest version!
package com.squarespace.cldrengine.message;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import com.squarespace.cldrengine.message.MessageMatcher.State;

/**
 * Hand-implemented parser for ICU message format. Designed to be compact and fast vs. other implementations. The parser
 * produces an instruction tree which can be cached for repeated use, and is intended to be evaluated by a separate
 * engine.
 *
 * Note: The 'choice' formatter is not implemented since it is deprecated.
 *
 * See ICU docs for details on syntax:
 * https://unicode-org.github.io/icu-docs/apidoc/released/icu4j/com/ibm/icu/text/MessageFormat.html
 *
 * Rationale
 *
 * The decision to hand-implement the parser was made for two reasons: (1) to avoid extra runtime dependencies (parser
 * generators, e.g. pegjs), and (2) to provide control over memory allocation, garbage generation, and other aspects
 * that impact parser performance.
 *
 * A major consideration is size of the resulting parser code. The OpenJS project 'messageformat-parser' which is
 * generated by Peg.js is 30kB minified. It also requires other dependencies for the plural calculations, where this is
 * already supported in our library via @phensley/plurals
 *
 * See: https://unpkg.com/messageformat-parser/parser.js
 */
public class MessagePatternParser {

  /** Shared instruction emitted whenever a construct produces no output. */
  private static final MessageCode NOOP = new MessageCode(MessageOpType.NOOP);

  private static final char LEFT = '{';
  private static final char RIGHT = '}';
  private static final char MINUS = '-';
  private static final char APOS = '\'';
  private static final char POUND = '#';

  /** The complete message pattern being parsed. */
  private final String str;

  /** Cached length of {@link #str}. */
  private final int len;

  /** Low-level matcher used to recognize arguments, formatter names, choices and options. */
  private final MessageMatcher matcher;

  /** When true, apostrophes are ordinary text rather than escape delimiters. */
  private final boolean disableEscapes;

  /**
   * Creates a parser for a single message pattern.
   *
   * @param formatters passed through unchanged to the {@link MessageMatcher} (presumably the names of the
   *        formatters it should recognize; the element type is not visible from this class)
   * @param str the message pattern to parse; must not be null
   * @param disableEscapes when true, apostrophes are treated as plain text instead of escape delimiters
   */
  public MessagePatternParser(Collection formatters, String str, boolean disableEscapes) {
    this.str = str;
    this.len = str.length();
    this.matcher = new MessageMatcher(formatters, str);
    this.disableEscapes = disableEscapes;
  }

  /**
   * Parses the entire pattern.
   *
   * @return the instruction for the whole pattern; a no-op instruction when the pattern produces no nodes
   */
  public MessageCode parse() {
    return outer(new State(0, len), null);
  }

  /**
   * Parses the range {@code [r.s, r.e)} as message text which may contain '{' ... '}' tags and, unless escapes are
   * disabled, apostrophe-escaped sections. The state's {@code s} offset is advanced as input is consumed and is at
   * or past {@code r.e} on return.
   *
   * @param r the range of the pattern to parse
   * @param argsub when non-null, each unescaped '#' in plain text is replaced by an argument substitution
   *        instruction; when null the parser is in the outermost scope and '#' is plain text
   * @return a single instruction, a block of instructions, or a no-op if nothing was produced
   */
  public MessageCode outer(MessageMatcher.State r, Object argsub) {
    // Parsed instruction nodes, in pattern order
    List<MessageCode> n = new ArrayList<>();

    // Plain text seen since the last emitted node
    StringBuilder buf = new StringBuilder();

    while (r.s < r.e) {
      char c = str.charAt(r.s);

      // Only '{' and the apostrophe start a special section; everything else is plain text
      switch (c) {
        case LEFT: {
          // Flush pending text first so that nodes stay in pattern order
          if (buf.length() > 0) {
            n.add(textarg(buf.toString(), argsub));
            buf.setLength(0);
          }

          int k = seek(r.s, r.e);
          if (k == -1) {
            // No matching '}' inside the range: the remainder is emitted as text
            n.add(textarg(str.substring(r.s, r.e), argsub));
            r.s = r.e;
          } else if (hidden(r.s)) {
            // Tag is hidden from the processor: emit it as text with the '-' marker removed
            n.add(new MessageTextCode(LEFT + str.substring(r.s + 2, k + 1)));
            r.s = k;
          } else {
            // Process the tag interior, which excludes both braces
            MessageCode child = inner(new MessageMatcher.State(r.s + 1, k));
            if (child != null) {
              n.add(child);
            } else if (argsub != null && r.s + 1 != k) {
              // Unparseable, non-empty tag outside the outermost scope: push its interior as text
              n.add(textarg(str.substring(r.s + 1, k), argsub));
            }
            r.s = k;
          }
          break;
        }

        case APOS: {
          if (disableEscapes) {
            buf.append(c);
            break;
          }

          int k = r.s + 1;
          if (k < r.e && str.charAt(k) == APOS) {
            // A doubled apostrophe stands for one literal apostrophe
            buf.append(c);
            r.s = k;
            break;
          }

          // Skip the opening apostrophe and look for the closing one. The search is clamped to the end of the
          // range so that a range handed to this method can never pull in text from beyond it.
          r.s = k;
          k = str.indexOf(APOS, r.s);
          if (k == -1 || k > r.e) {
            k = r.e;
          }

          // Escaped text goes straight into the buffer. NOTE(review): the buffer is still passed through
          // textarg() when it is flushed, so a '#' in this text is not actually exempt from substitution.
          buf.append(str, r.s, k);
          r.s = k;
          break;
        }

        default:
          buf.append(c);
          break;
      }

      // Step past the character (or closing delimiter) that was just handled
      r.s++;
    }

    // Push any trailing text
    if (buf.length() > 0) {
      n.add(textarg(buf.toString(), argsub));
    }

    return flatten(n);
  }

  /**
   * Parses the interior of a tag, i.e. the text between '{' and its matching '}'.
   *
   * @param r the range covering the tag interior
   * @return the parsed instruction, or null if no arguments could be matched or the text following the arguments
   *         is not a formatter name known to the matcher
   */
  public MessageCode inner(MessageMatcher.State r) {
    MessageMatcher m = this.matcher;

    // Skip any optional leading spaces
    m.spaces(r);

    // At least one argument is required, otherwise this tag fails
    List args = m.arguments(r);
    if (args == null) {
      return null;
    }

    // Nothing usable after the arguments means this is a simple argument reference
    if (!m.spaces(r) || m.complete(r)) {
      return new MessageArgCode(args.get(0));
    }

    // See if any of our known formatters are present
    String name = m.formatter(r);
    if (name == null) {
      return null;
    }
    m.spaces(r);

    // Dispatch on the formatter name; anything not built in is a simple formatter
    switch (name) {
      case "plural":
        return plural(args, PluralNumberType.CARDINAL, r);

      case "selectordinal":
        return plural(args, PluralNumberType.ORDINAL, r);

      case "select":
        return select(args, r);

      default:
        return simple(args, name, r);
    }
  }

  /**
   * Parses a nested tag sequence '{' ... '}' starting at {@code r.s}, after skipping leading spaces. On success
   * {@code r.s} is moved just past the closing '}'.
   *
   * @param r the range to read the tag from
   * @param argsub argument used for '#' substitution inside the nested block
   * @return the parsed block (the literal tag text for a hidden tag), or null if the next character is not '{' or
   *         the tag has no matching '}' within the range
   */
  protected MessageCode tag(MessageMatcher.State r, Object argsub) {
    matcher.spaces(r);

    // Ensure we see a tag start next
    if (matcher.character(r) != LEFT) {
      return null;
    }

    // Find the matching end delimiter. Without one there is no well-formed tag to parse; carrying on with -1
    // would build an invalid substring range and reset the position to the start of the pattern.
    int k = seek(r.s, r.e);
    if (k == -1) {
      return null;
    }

    // Parse the nested block and skip over it
    MessageCode node = hidden(r.s)
        ? new MessageTextCode(LEFT + str.substring(r.s + 2, k + 1))
        : outer(new MessageMatcher.State(r.s + 1, k), argsub);
    r.s = k + 1;
    return node;
  }

  /**
   * Parses the body of a plural instruction: an optional offset followed by one or more
   * {@code choice '{' block '}'} pairs.
   *
   * @param args arguments of the enclosing tag; the first is used for '#' substitution inside the blocks
   * @param type whether the plural is cardinal or ordinal
   * @param r the range holding the remainder of the tag interior
   * @return the plural instruction, or a no-op if no choice was parsed or a choice is not followed by a
   *         well-formed block
   */
  protected MessageCode plural(List args, PluralNumberType type, MessageMatcher.State r) {
    MessageMatcher m = this.matcher;

    int offset = m.pluralOffset(r);
    m.spaces(r);

    List<PluralChoice> choices = new ArrayList<>();
    do {
      // Parse a plural choice; stop at the first thing that is not one
      String choice = m.pluralChoice(r);
      if (choice == null) {
        break;
      }

      // Parse a tag into a block of instructions
      MessageCode block = tag(r, args.get(0));
      if (block == null) {
        return NOOP;
      }

      // A leading '=' marks an exact match, anything else is a plural category
      PluralChoice node = choice.charAt(0) == '='
          ? new PluralChoice(PluralChoiceType.EXACT, choice.substring(1), block)
          : new PluralChoice(PluralChoiceType.CATEGORY, choice, block);

      // Append and skip spaces
      choices.add(node);
      m.spaces(r);
    } while (!m.complete(r));

    // If we parsed no choices, emit a no-op
    if (choices.isEmpty()) {
      return NOOP;
    }
    return new MessagePluralCode(args, offset, type, choices);
  }

  /**
   * Parses the body of a select instruction: one or more {@code identifier '{' block '}'} pairs.
   *
   * @param args arguments of the enclosing tag; the first is used for '#' substitution inside the blocks
   * @param r the range holding the remainder of the tag interior
   * @return the select instruction, or a no-op if no choice was parsed or a choice is not followed by a
   *         well-formed block
   */
  protected MessageCode select(List args, MessageMatcher.State r) {
    List<SelectChoice> choices = new ArrayList<>();
    do {
      // Parse an identifier to be used as the select choice
      String ident = matcher.identifier(r);
      if (ident == null) {
        break;
      }

      // Parse a tag into a block of instructions
      MessageCode block = tag(r, args.get(0));
      if (block == null) {
        return NOOP;
      }

      // Append and skip to the next choice
      choices.add(new SelectChoice(ident, block));
      matcher.spaces(r);
    } while (!matcher.complete(r));

    // If we parsed no choices, just emit a no-op
    if (choices.isEmpty()) {
      return NOOP;
    }
    return new MessageSelectCode(args, choices);
  }

  /**
   * Builds the instruction for a simple formatter with zero or more options.
   *
   * @param args arguments of the enclosing tag
   * @param name the formatter name
   * @param r the range holding the formatter options
   * @return the simple formatter instruction
   */
  protected MessageCode simple(List args, String name, MessageMatcher.State r) {
    return new MessageSimpleCode(name, args, matcher.options(r));
  }

  /**
   * Seek to the matching '}' character at the same nesting level. Unless escapes are disabled, apostrophe-escaped
   * sections are skipped so that braces inside them are not counted. This adds some redundant scanning of the
   * string but simplifies the parsing logic in other areas: callers only ever deal with a tag whose '{' has a
   * corresponding '}'.
   *
   * @param i offset to start scanning from; callers in this class pass the offset of a '{'
   * @param j exclusive end offset of the scan
   * @return the offset of the matching '}', or -1 if the braces are unbalanced within the range or an escaped
   *         section is never closed
   */
  protected int seek(int i, int j) {
    // Track nesting depth
    int d = 0;

    loop: while (i < j) {
      char c = str.charAt(i);
      switch (c) {
        case LEFT:
          d++;
          break;

        case RIGHT:
          d--;
          if (d == 0) {
            // Back at the starting level, we're done
            break loop;
          }
          break;

        case APOS:
          if (disableEscapes) {
            // Apostrophes are plain text in this mode, so they must not hide any braces. This keeps the
            // brace matching in agreement with how outer() reads the same characters.
            break;
          }
          if (i + 1 < len && str.charAt(i + 1) == APOS) {
            // Skip single escaped apostrophe
            i++;
          } else {
            // Find matching apostrophe
            int k = str.indexOf(APOS, i + 1);
            if (k == -1) {
              // No closing apostrophe, so the rest of the string is escaped
              return -1;
            }
            // Skip over the escaped section; the increment below steps past the closing apostrophe
            i = k;
          }
          break;
      }

      i++;
    }

    // A non-zero depth means the braces never balanced
    return d == 0 ? i : -1;
  }

  /**
   * Emits text, replacing every '#' with an argument substitution instruction when substitution is active.
   *
   * @param s the text to emit
   * @param argsub when null no substitution is performed; only its presence is checked here, the value itself
   *        is not stored in the emitted instructions
   * @return a plain text instruction, or the flattened sequence of text and substitution instructions
   */
  protected MessageCode textarg(String s, Object argsub) {
    // If no argument substitution is requested, return plain text
    if (argsub == null) {
      return new MessageTextCode(s);
    }

    // If no '#' character is found, return plain text
    int j = s.indexOf(POUND);
    if (j == -1) {
      return new MessageTextCode(s);
    }

    // Here 'i' marks the start of the text not yet emitted and 'j' the position of the next '#'
    List<MessageCode> n = new ArrayList<>();
    int i = 0;
    while (j != -1) {
      // Push the text leading up to the '#'
      if (i < j) {
        n.add(new MessageTextCode(s.substring(i, j)));
      }

      // Add a substitution op in place of the '#'
      n.add(new MessageCode(MessageOpType.ARGSUB));

      // Skip over '#' and search for the next occurrence
      i = j + 1;
      j = s.indexOf(POUND, i);
    }

    // Push trailing text
    if (i < s.length()) {
      n.add(new MessageTextCode(s.substring(i)));
    }

    return flatten(n);
  }

  /**
   * Collapses a list of instructions into a single instruction.
   *
   * @param n the instructions, in order
   * @return a no-op for an empty list, the sole element for a single-element list, otherwise a block wrapping
   *         the list
   */
  protected MessageCode flatten(List<MessageCode> n) {
    switch (n.size()) {
      case 0:
        return NOOP;
      case 1:
        return n.get(0);
      default:
        return new MessageBlockCode(n);
    }
  }

  /**
   * Reports whether the tag whose '{' is at offset {@code i} is hidden, i.e. the brace is directly followed by '-'.
   */
  private boolean hidden(int i) {
    int next = i + 1;
    return next < len && str.charAt(next) == MINUS;
  }
}