com.google.re2j.Unicode Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of re2j-td Show documentation
The newest version!
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Many of these were derived from the corresponding Go functions in
// http://code.google.com/p/go/source/browse/src/pkg/unicode/letter.go

package com.google.re2j;

import static java.nio.charset.StandardCharsets.UTF_8;

/**
 * Utilities for dealing with Unicode better than Java does.
 *
 * @author [email protected] (Alan Donovan)
 */
class Unicode {

  // Rune and UTF8 sequences are the same.
  static final int RUNE_SELF = 0x80;

  // The highest legal rune value.
  static final int MAX_RUNE = 0x10FFFF;

  // The highest legal ASCII value.
  static final int MAX_ASCII = 0x7f;

  // The highest legal Latin-1 value.
  static final int MAX_LATIN1 = 0xFF;

  private static final int MAX_CASE = 3;

  // Represents invalid code points.
  private static final int REPLACEMENT_CHAR = 0xFFFD;

  // Minimum and maximum runes involved in folding.
  // Checked during test.
  static final int MIN_FOLD = 0x0041;
  static final int MAX_FOLD = 0x1044f;

  // Maximum bytes per rune
  static final int UTF_MAX = 4;

  // is32 uses binary search to test whether rune is in the specified
  // slice of 32-bit ranges.
  // TODO(adonovan): opt: consider using int[n*3] instead of int[n][3].
  private static boolean is32(int[][] ranges, int r) {
    // binary search over ranges
    for (int lo = 0, hi = ranges.length; lo < hi; ) {
      int m = lo + (hi - lo) / 2;
      int[] range = ranges[m];  // [lo, hi, stride]
      if (range[0] <= r && r <= range[1]) {
        return ((r - range[0]) % range[2]) == 0;
      }
      if (r < range[0]) {
        hi = m;
      } else {
        lo = m + 1;
      }
    }
    return false;
  }

  // is tests whether rune is in the specified table of ranges.
  private static boolean is(int[][] ranges, int r) {
    // common case: rune is ASCII or Latin-1, so use linear search.
    if (r <= MAX_LATIN1) {
      for (int[] range : ranges) {  // range = [lo, hi, stride]
        if (r > range[1]) {
          continue;
        }
        if (r < range[0]) {
          return false;
        }
        return ((r - range[0]) % range[2]) == 0;
      }
      return false;
    }
    return ranges.length > 0 &&
        r >= ranges[0][0] &&
        is32(ranges, r);
  }

  static byte[] codePointToUtf8(int codePoint) {
    return new String(Character.toChars(codePoint)).getBytes(UTF_8);
  }

  static int maxRune(int len) {
    int b; // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax)
    if (len == 1) {
      b = 7;
    } else {
      b = 8 - (len + 1) + 6 * (len - 1);
    }
    return (1 << b) - 1; // maximum Rune for b bits.
  }

  // isUpper reports whether the rune is an upper case letter.
  static boolean isUpper(int r) {
    // See comment in isGraphic.
    if (r <= MAX_LATIN1) {
      return Character.isUpperCase((char) r);
    }
    return is(UnicodeTables.Upper, r);
  }

  // isLower reports whether the rune is a lower case letter.
  static boolean isLower(int r) {
    // See comment in isGraphic.
    if (r <= MAX_LATIN1) {
      return Character.isLowerCase((char) r);
    }
    return is(UnicodeTables.Lower, r);
  }

  // isTitle reports whether the rune is a title case letter.
  static boolean isTitle(int r) {
    if (r <= MAX_LATIN1) {
      return false;
    }
    return is(UnicodeTables.Title, r);
  }

  // isPrint reports whether the rune is printable (Unicode L/M/N/P/S or ' ').
  static boolean isPrint(int r) {
    if (r <= MAX_LATIN1) {
      return r >= 0x20 && r < 0x7F ||
             r >= 0xA1 && r != 0xAD;
    }
    return is(UnicodeTables.L, r) ||
           is(UnicodeTables.M, r) ||
           is(UnicodeTables.N, r) ||
           is(UnicodeTables.P, r) ||
           is(UnicodeTables.S, r);
  }

  // A case range is conceptually a record:
  // class CaseRange {
  //   int lo, hi;
  //   int upper, lower, title;
  // }
  // but flattened as an int[5].

  // to maps the rune using the specified case mapping.
  private static int to(int kase, int r, int[][] caseRange) {
    if (kase < 0 || MAX_CASE <= kase) {
      return REPLACEMENT_CHAR; // as reasonable an error as any
    }
    // binary search over ranges
    for (int lo = 0, hi = caseRange.length; lo < hi; ) {
      int m = lo + (hi - lo) / 2;
      int[] cr = caseRange[m];  // cr = [lo, hi, upper, lower, title]
      int crlo = cr[0];
      int crhi = cr[1];
      if (crlo <= r && r <= crhi) {
        int delta = cr[2 + kase];
        if (delta > MAX_RUNE) {
          // In an Upper-Lower sequence, which always starts with
          // an UpperCase letter, the real deltas always look like:
          //      {0, 1, 0}    UpperCase (Lower is next)
          //      {-1, 0, -1}  LowerCase (Upper, Title are previous)
          // The characters at even offsets from the beginning of the
          // sequence are upper case; the ones at odd offsets are lower.
          // The correct mapping can be done by clearing or setting the low
          // bit in the sequence offset.
          // The constants UpperCase and TitleCase are even while LowerCase
          // is odd so we take the low bit from kase.
          return crlo + (((r - crlo) & ~1) | (kase & 1));
        }
        return r + delta;
      }
      if (r < crlo) {
        hi = m;
      } else {
        lo = m + 1;
      }
    }
    return r;
  }

  // to maps the rune to the specified case: UpperCase, LowerCase, or TitleCase.
  private static int to(int kase, int r) {
    return to(kase, r, UnicodeTables.CASE_RANGES);
  }

  // toUpper maps the rune to upper case.
  static int toUpper(int r) {
    if (r <= MAX_ASCII) {
      if ('a' <= r && r <= 'z') {
        r -= 'a' - 'A';
      }
      return r;
    }
    return to(UnicodeTables.UpperCase, r);
  }

  // toLower maps the rune to lower case.
  static int toLower(int r) {
    if (r <= MAX_ASCII) {
      if ('A' <= r && r <= 'Z') {
        r += 'a' - 'A';
      }
      return r;
    }
    return to(UnicodeTables.LowerCase, r);
  }

  // simpleFold iterates over Unicode code points equivalent under
  // the Unicode-defined simple case folding.  Among the code points
  // equivalent to rune (including rune itself), SimpleFold returns the
  // smallest r >= rune if one exists, or else the smallest r >= 0.
  //
  // For example:
  //      SimpleFold('A') = 'a'
  //      SimpleFold('a') = 'A'
  //
  //      SimpleFold('K') = 'k'
  //      SimpleFold('k') = '\u212A' (Kelvin symbol, K)
  //      SimpleFold('\u212A') = 'K'
  //
  //      SimpleFold('1') = '1'
  //
  // Derived from Go's unicode.SimpleFold.
  //
  static int simpleFold(int r) {
    // Consult caseOrbit table for special cases.
    int lo = 0;
    int hi = UnicodeTables.CASE_ORBIT.length;
    while (lo < hi) {
      int m = lo + (hi - lo) / 2;
      if (UnicodeTables.CASE_ORBIT[m][0] < r) {
        lo = m + 1;
      } else {
        hi = m;
      }
    }
    if (lo < UnicodeTables.CASE_ORBIT.length &&
        UnicodeTables.CASE_ORBIT[lo][0] == r) {
      return UnicodeTables.CASE_ORBIT[lo][1];
    }

    // No folding specified.  This is a one- or two-element
    // equivalence class containing rune and toLower(rune)
    // and toUpper(rune) if they are different from rune.
    int l = toLower(r);
    if (l != r) {
      return l;
    }
    return toUpper(r);
  }

  private Unicode() {}  // uninstantiable

}