// com.google.gwt.i18n.shared.BidiUtils, from the gwt-servlet artifact (Maven / Gradle / Ivy).
// Note: there is a newer version of this artifact: 2.10.0.
/*
 * Copyright 2010 Google Inc.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package com.google.gwt.i18n.shared;

import com.google.gwt.i18n.client.HasDirection.Direction;
import com.google.gwt.regexp.shared.RegExp;
import com.google.gwt.regexp.shared.SplitResult;

/**
 * Utility functions for performing common Bidi tests on strings.
 *
 * <p>All tests are based on regular expressions built from simplified
 * character classes for strongly LTR and strongly RTL characters, so the
 * results are practical approximations rather than a complete implementation
 * of the Unicode standard. Use {@link #get()} to obtain the shared instance.
 */
public class BidiUtils {

  /**
   * A practical pattern to identify strong LTR characters. This pattern is not
   * completely correct according to the Unicode standard. It is simplified
   * for performance and small code size.
   * 
   * This is volatile to prevent the compiler from inlining this constant in
   * various references below.
   */
  private static volatile String LTR_CHARS =
    "A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02B8\u0300-\u0590\u0800-\u1FFF" +
    "\u2C00-\uFB1C\uFDFE-\uFE6F\uFEFD-\uFFFF";

  /**
   * A practical pattern to identify strong RTL characters. This pattern is not
   * completely correct according to the Unicode standard. It is simplified for
   * performance and small code size.
   * 
   * This is volatile to prevent the compiler from inlining this constant in
   * various references below.
   */
  private static volatile String RTL_CHARS =
      "\u0591-\u07FF\uFB1D-\uFDFD\uFE70-\uFEFC";

  /**
   * Regular expression to check if the first strongly directional character in
   * a string is LTR: any run of non-RTL characters from the start, followed by
   * an LTR character.
   */
  private static final RegExp FIRST_STRONG_IS_LTR_RE =
      RegExp.compile("^[^" + RTL_CHARS + "]*[" + LTR_CHARS + ']');

  /**
   * Regular expression to check if the first strongly directional character in
   * a string is RTL: any run of non-LTR characters from the start, followed by
   * an RTL character.
   */
  private static final RegExp FIRST_STRONG_IS_RTL_RE =
      RegExp.compile("^[^" + LTR_CHARS + "]*[" + RTL_CHARS + ']');

  /**
   * Regular expression to check if a string contains any LTR characters.
   */
  private static final RegExp HAS_ANY_LTR_RE =
      RegExp.compile("[" + LTR_CHARS + ']');

  /**
   * Regular expression to check if a string contains any RTL characters.
   */
  private static final RegExp HAS_ANY_RTL_RE =
      RegExp.compile("[" + RTL_CHARS + ']');

  /**
   * Regular expression to check if a string contains any numerals. Used to
   * differentiate between completely neutral strings and those containing
   * numbers, which are weakly LTR.
   */
  private static final RegExp HAS_NUMERALS_RE = RegExp.compile("\\d");

  /**
   * Simplified regular expression for an HTML tag (opening or closing) or an
   * HTML escape. We might want to skip over such expressions when estimating
   * the text directionality. Compiled with the global flag so that every
   * occurrence is replaced by {@link #stripHtmlIfNeeded(String, boolean)}.
   */
  private static final RegExp SKIP_HTML_RE =
      RegExp.compile("<[^>]*>|&[^;]+;", "g");

  /**
   * Regular expression to check if a string looks like something that must
   * always be LTR even in RTL text, e.g. a URL. When estimating the
   * directionality of text containing these, we treat these as weakly LTR, like
   * numbers.
   *
   * Matches tokens that start with either {@code http://} or {@code https://},
   * so that secure URLs are not miscounted as strongly LTR words.
   */
  private static final RegExp IS_REQUIRED_LTR_RE =
      RegExp.compile("^https?://.*");

  /**
   * Regular expressions to check if the last strongly-directional character in
   * a piece of text is LTR: an LTR character followed only by non-RTL
   * characters up to the end.
   */
  private static final RegExp LAST_STRONG_IS_LTR_RE =
      RegExp.compile("[" + LTR_CHARS + "][^" + RTL_CHARS + "]*$");

  /**
   * Regular expressions to check if the last strongly-directional character in
   * a piece of text is RTL: an RTL character followed only by non-LTR
   * characters up to the end.
   */
  private static final RegExp LAST_STRONG_IS_RTL_RE =
      RegExp.compile("[" + RTL_CHARS + "][^" + LTR_CHARS + "]*$");

  /**
   * This constant defines the threshold of RTL directionality: the fraction of
   * strongly directional words that must be exceeded by RTL words for
   * {@link #estimateDirection(String)} to return RTL.
   */
  private static final float RTL_DETECTION_THRESHOLD = 0.40f;

  /**
   * Regular expression to split a string into "words" for directionality
   * estimation based on relative word counts.
   */
  private static final RegExp WORD_SEPARATOR_RE = RegExp.compile("\\s+");

  /**
   * An instance of BidiUtils, to be returned by {@link #get()}.
   */
  private static final BidiUtils INSTANCE = new BidiUtils();

  /**
   * Get an instance of BidiUtils.
   * @return An instance of BidiUtils
   */
  public static BidiUtils get() {
    return INSTANCE;
  }

  /**
   * Not instantiable from outside this class; use {@link #get()}.
   */
  private BidiUtils() {
  }

  /**
   * Like {@link #endsWithLtr(String, boolean)}, but assumes {@code str} is not
   * HTML / HTML-escaped.
   * @param str the string to check
   * @return whether LTR exit directionality was detected
   */
  public boolean endsWithLtr(String str) {
    return LAST_STRONG_IS_LTR_RE.test(str);
  }

  /**
   * Check whether the last strongly-directional character in the string is LTR.
   * @param str the string to check
   * @param isHtml whether str is HTML / HTML-escaped
   * @return whether LTR exit directionality was detected
   */
  public boolean endsWithLtr(String str, boolean isHtml) {
    return endsWithLtr(stripHtmlIfNeeded(str, isHtml));
  }

  /**
   * Like {@link #endsWithRtl(String, boolean)}, but assumes {@code str} is not
   * HTML / HTML-escaped.
   * @param str the string to check
   * @return whether RTL exit directionality was detected
   */
  public boolean endsWithRtl(String str) {
    return LAST_STRONG_IS_RTL_RE.test(str);
  }

  /**
   * Check whether the last strongly-directional character in the string is RTL.
   * @param str the string to check
   * @param isHtml whether str is HTML / HTML-escaped
   * @return whether RTL exit directionality was detected
   */
  public boolean endsWithRtl(String str, boolean isHtml) {
    return endsWithRtl(stripHtmlIfNeeded(str, isHtml));
  }

  /**
   * Like {@link #estimateDirection(String, boolean)}, but assumes {@code str}
   * is not HTML / HTML-escaped.
   * @param str the string to check
   * @return the string's directionality
   */
  public Direction estimateDirection(String str) {
    int rtlCount = 0;
    int total = 0;
    boolean hasWeaklyLtr = false;
    SplitResult tokens = WORD_SEPARATOR_RE.split(str);
    for (int i = 0; i < tokens.length(); i++) {
      String token = tokens.get(i);
      if (startsWithRtl(token)) {
        // Strongly RTL word.
        rtlCount++;
        total++;
      } else if (IS_REQUIRED_LTR_RE.test(token)) {
        // URL-like token: weakly LTR, so it is not counted as a strong word
        // even though it contains LTR characters. Must be tested before
        // hasAnyLtr for that reason.
        hasWeaklyLtr = true;
      } else if (hasAnyLtr(token)) {
        // Strongly LTR word.
        total++;
      } else if (HAS_NUMERALS_RE.test(token)) {
        // Numbers are weakly LTR.
        hasWeaklyLtr = true;
      }
    }

    if (total == 0) {
      // No strongly directional words: weak LTR content decides between LTR
      // and neutral.
      return hasWeaklyLtr ? Direction.LTR : Direction.DEFAULT;
    }
    return (float) rtlCount / total > RTL_DETECTION_THRESHOLD
        ? Direction.RTL : Direction.LTR;
  }

  /**
   * Estimates the directionality of a string based on relative word counts.
   * If the number of RTL words is above a certain percentage of the total
   * number of strongly directional words, returns RTL.
   * Otherwise, if any words are strongly or weakly LTR, returns LTR.
   * Otherwise, returns DEFAULT, which is used to mean "neutral".
   * Numbers are counted as weakly LTR.
   * @param str the string to check
   * @param isHtml whether {@code str} is HTML / HTML-escaped. Use this to
   *        ignore HTML tags and escapes that would otherwise be mistaken for
   *        LTR text.
   * @return the string's directionality
   */
  public Direction estimateDirection(String str, boolean isHtml) {
    return estimateDirection(stripHtmlIfNeeded(str, isHtml));
  }

  /**
   * Like {@link #hasAnyLtr(String, boolean)}, but assumes {@code str} is not
   * HTML / HTML-escaped.
   * @param str the string to be tested
   * @return whether the string contains any LTR characters
   */
  public boolean hasAnyLtr(String str) {
    return HAS_ANY_LTR_RE.test(str);
  }

  /**
   * Checks if the given string has any LTR characters in it.
   * @param str the string to be tested
   * @param isHtml whether str is HTML / HTML-escaped
   * @return whether the string contains any LTR characters
   */
  public boolean hasAnyLtr(String str, boolean isHtml) {
    return hasAnyLtr(stripHtmlIfNeeded(str, isHtml));
  }

  /**
   * Like {@link #hasAnyRtl(String, boolean)}, but assumes {@code str} is not
   * HTML / HTML-escaped.
   * @param str the string to be tested
   * @return whether the string contains any RTL characters
   */
  public boolean hasAnyRtl(String str) {
    return HAS_ANY_RTL_RE.test(str);
  }

  /**
   * Checks if the given string has any RTL characters in it.
   * @param str the string to be tested
   * @param isHtml whether str is HTML / HTML-escaped
   * @return whether the string contains any RTL characters
   */
  public boolean hasAnyRtl(String str, boolean isHtml) {
    return hasAnyRtl(stripHtmlIfNeeded(str, isHtml));
  }

  /**
   * Like {@link #startsWithLtr(String, boolean)}, but assumes {@code str} is
   * not HTML / HTML-escaped.
   * @param str the string to check
   * @return whether LTR entry directionality was detected
   */
  public boolean startsWithLtr(String str) {
    return FIRST_STRONG_IS_LTR_RE.test(str);
  }

  /**
   * Check whether the first strongly-directional character in the string is
   * LTR.
   * @param str the string to check
   * @param isHtml whether str is HTML / HTML-escaped
   * @return whether LTR entry directionality was detected
   */
  public boolean startsWithLtr(String str, boolean isHtml) {
    return startsWithLtr(stripHtmlIfNeeded(str, isHtml));
  }

  /**
   * Like {@link #startsWithRtl(String, boolean)}, but assumes {@code str} is
   * not HTML / HTML-escaped.
   * @param str the string to check
   * @return whether RTL entry directionality was detected
   */
  public boolean startsWithRtl(String str) {
    return FIRST_STRONG_IS_RTL_RE.test(str);
  }

  /**
   * Check whether the first strongly-directional character in the string is
   * RTL.
   * @param str the string to check
   * @param isHtml whether {@code str} is HTML / HTML-escaped
   * @return whether RTL entry directionality was detected
   */
  public boolean startsWithRtl(String str, boolean isHtml) {
    return startsWithRtl(stripHtmlIfNeeded(str, isHtml));
  }

  /**
   * Returns the input text with spaces instead of HTML tags or HTML escapes, if
   * isStripNeeded is true. Else returns the input as is.
   * Useful for text directionality estimation.
   * Note: the function should not be used in other contexts; it is not 100%
   * correct, but rather a good-enough implementation for directionality
   * estimation purposes.
   * @param str the text to process
   * @param isStripNeeded whether HTML tags and escapes should be replaced
   * @return {@code str} with each tag or escape replaced by a single space if
   *         {@code isStripNeeded} is true, otherwise {@code str} unchanged
   */
  String stripHtmlIfNeeded(String str, boolean isStripNeeded) {
    return isStripNeeded ? SKIP_HTML_RE.replace(str, " ") : str;
  }
}