com.squarespace.cldrengine.message.MessagePatternParser Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of cldr-engine Show documentation
Squarespace cldr-engine
The newest version!
package com.squarespace.cldrengine.message;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import com.squarespace.cldrengine.message.MessageMatcher.State;

/**
 * Hand-implemented parser for ICU message format. Designed to be compact and fast vs. other implementations. The parser
 * produces an instruction tree which can be cached for repeated use, and is intended to be evaluated by a separate
 * engine.
 *
 * Note: The 'choice' formatter is not implemented since it is deprecated.
 *
 * See ICU docs for details on syntax:
 * https://unicode-org.github.io/icu-docs/apidoc/released/icu4j/com/ibm/icu/text/MessageFormat.html
 *
 * Rationale
 *
 * The decision to hand-implement the parser was made for two reasons:
 * 1. Avoiding extra runtime dependencies (parser generators, e.g. pegjs, etc.).
 * 2. Providing control over memory allocation, garbage generation, and other aspects that impact parser performance.
 *
 * A major consideration is size of the resulting parser code. The OpenJS project 'messageformat-parser' which is
 * generated by Peg.js is 30kB minified. It also requires other dependencies for the plural calculations, where this is
 * already supported in our library via @phensley/plurals
 *
 * See: https://unpkg.com/messageformat-parser/parser.js
 */
public class MessagePatternParser {

  // Shared instruction node carrying the NOOP op type.
  private static final MessageCode NOOP = new MessageCode(MessageOpType.NOOP);

  // Characters with special meaning in the pattern syntax.
  // LEFT / RIGHT are the tag delimiters.
  private static final char LEFT = '{';
  private static final char RIGHT = '}';
  // A MINUS immediately after LEFT marks a hidden tag, which is emitted as text.
  private static final char MINUS = '-';
  // APOS starts escaped text, or a literal apostrophe when doubled (unless escapes are disabled).
  private static final char APOS = '\'';
  // NOTE(review): presumably the placeholder substituted in plain text runs; the escape handling
  // notes that '#' is not substituted inside apostrophe-quoted text.
  private static final char POUND = '#';

  // The raw pattern being parsed.
  private final String str;
  // Cached length of the pattern.
  private final int len;
  // Matcher constructed over the same pattern and the caller-supplied formatters.
  private final MessageMatcher matcher;
  // When true, apostrophes are copied to the output verbatim instead of acting as escapes.
  private final boolean disableEscapes;

  /**
   * Creates a parser bound to a single message pattern.
   *
   * @param formatters collection handed to the {@link MessageMatcher} built for this pattern
   * @param str the message pattern to parse
   * @param disableEscapes when true, apostrophes are kept as literal characters rather than starting escaped text
   */
  public MessagePatternParser(Collection formatters, String str, boolean disableEscapes) {
    this.disableEscapes = disableEscapes;
    this.str = str;
    // Length is read before the matcher is built, so a null pattern fails here first
    this.len = this.str.length();
    this.matcher = new MessageMatcher(formatters, this.str);
  }

  /**
   * Parses the entire pattern, from its first character through its last.
   *
   * @return the instruction node produced by {@code outer} over the full pattern, with a null argument substitution
   */
  public MessageCode parse() {
    return outer(new MessageMatcher.State(0, len), null);
  }

  /**
   * Parses the range {@code [r.s, r.e)} of the pattern into an instruction node.
   *
   * Plain text is accumulated and emitted through {@code textarg}; brace-delimited tags are located with
   * {@code seek} and their interior is handed to {@code inner}; apostrophe-quoted text is copied literally
   * unless escapes are disabled.
   *
   * @param r range to parse; {@code r.s} is advanced as characters are consumed
   * @param argsub value forwarded to {@code textarg} for every text run; {@code null} marks the outermost scope
   * @return the result of flattening the accumulated nodes
   */
  public MessageCode outer(MessageMatcher.State r, Object argsub) {
    // Accumulate parsed instruction nodes
    List<MessageCode> n = new ArrayList<>();

    // Accumulate plain text characters
    StringBuilder buf = new StringBuilder();

    while (r.s < r.e) {
      char c = str.charAt(r.s);

      // Look for characters which mark the start of a special section
      switch (c) {
        case LEFT: {
          // Push non-empty buffer, then reuse it for the next text run
          if (buf.length() > 0) {
            n.add(textarg(buf.toString(), argsub));
            buf.setLength(0);
          }

          // A '-' directly after the opening brace hides the tag from the processor
          int sn = r.s + 1;
          boolean hidden = sn < len && str.charAt(sn) == MINUS;

          int k = seek(r.s, r.e);
          if (k == -1) {
            // No matching close found: emit the remainder of the range as text
            n.add(textarg(str.substring(r.s, r.e), argsub));
            r.s = r.e;
          } else if (hidden) {
            // Tag is hidden from processor, emit as text with the '-' marker dropped
            n.add(new MessageTextCode(LEFT + str.substring(r.s + 2, k + 1)));

            // Skip over hidden tag
            r.s = k;
          } else {
            // Process tag interior
            MessageCode child = inner(new MessageMatcher.State(r.s + 1, k));
            if (child == null) {
              // If we're not in the outermost scope, push the non-empty interior as text
              if (argsub != null && r.s + 1 != k) {
                n.add(textarg(str.substring(r.s + 1, k), argsub));
              }
            } else {
              n.add(child);
            }

            // Skip over processed tag
            r.s = k;
          }
          break;
        }

        case APOS: {
          if (disableEscapes) {
            buf.append(c);
          } else {
            int k = r.s + 1;
            if (k < len && c == str.charAt(k)) {
              // Convert double apostrophe to single
              buf.append(c);
              r.s++;

            } else {
              // Skip over apostrophe
              r.s++;

              // Capture string wrapped in apostrophes. indexOf scans the whole pattern, so a
              // closing apostrophe found past the end of this range must not be used: treat the
              // quote as unterminated and stop at the range end instead of copying text that
              // lies outside the range.
              k = str.indexOf(c, r.s);
              if (k == -1 || k > r.e) {
                k = r.e;
              }

              // Since this is escaped text, append it without substituting '#'
              buf.append(str, r.s, k);

              // Skip over escaped text
              r.s = k;
            }
          }
          break;
        }

        default:
          // Append plain character to output buffer
          buf.append(c);
          break;
      }
      r.s++;
    }

    // Push any trailing characters
    if (buf.length() > 0) {
      n.add(textarg(buf.toString(), argsub));
    }

    // Flatten blocks
    return flatten(n);
  }

  public MessageCode inner(MessageMatcher.State r) {
    MessageMatcher m = this.matcher;

    // Skip any optional leading spaces
    m.spaces(r);

    // See if we have any arguments. we must have at least one or we fail this tag.
    List