All Downloads are FREE. Search and download functionalities are using the official Maven repository.

fm.common.Normalize.scala Maven / Gradle / Ivy

/*
 * Copyright 2016 Frugal Mechanic (http://frugalmechanic.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package fm.common

import java.lang.{StringBuilder => JavaStringBuilder}
import scala.collection.mutable.Builder

object Normalize {
  /**
   * Accent-stripping entry point; the work is delegated entirely to `ASCIIUtil.convertToASCII`.
   *
   * @param s the string to convert
   * @return whatever `ASCIIUtil.convertToASCII` returns for `s`
   */
  def stripAccents(s: String): String = {
    val asciiForm: String = ASCIIUtil.convertToASCII(s)
    asciiForm
  }

  /**
   * Replaces any non-alphanumeric characters with collapsed spaces
   */
  // Normalizes `s` so that a run of non-alphanumeric characters produces a single space
  // (see handleChar). Returns "" when `s` is null.
  // NOTE(review): this copy of the method looks truncated -- closing braces, the calls into
  // handleChar, any append for letters/digits and the final result expression are not visible.
  // Confirm against the upstream source before relying on the notes below.
  def lowerAlphanumericWithSpaces(s: String): String = {
    // Null input short-circuits to the empty string
    if (null == s) return ""

    val normalized: String = unicodeNormalization(s)
    val sb: JavaStringBuilder = new JavaStringBuilder()
    var i: Int = 0
    // Tracks the last character handled (or ' ' after a separator was emitted); starts at 0,
    // which is != ' ', so a leading non-alphanumeric character still takes the space branch
    var prevCh: Char = 0

    // Processes one character (either a raw char or one char of its ASCII expansion)
    def handleChar(ch: Char): Unit = {
      if (Character.isLetterOrDigit(ch)) {
        // NOTE(review): only the prevCh update is visible for letters/digits; an append to sb
        // is presumably missing from this copy -- confirm
        prevCh = ch
      } else if (prevCh != ' ') {
        // Non-alphanumeric following a non-space: emit exactly one separator space
        sb.append(' ')
        prevCh = ' '

    // Walk the normalized string one Char at a time
    while (i < normalized.length) {
      val rawCh: Char = normalized.charAt(i)
      // Presumably null when ASCIIUtil has no ASCII expansion for rawCh -- confirm against ASCIIUtil
      val expandedChars: String = ASCIIUtil.toASCIICharsOrNull(rawCh)

      if (null == expandedChars) {
        // NOTE(review): empty branch in this copy; a handleChar(rawCh) call looks to be missing
      } else {
        // Iterate over every char of the expansion
        var j: Int = 0
        while (j < expandedChars.length) {
          j += 1

      i += 1
  /**
   * Removes any non-alphanumeric characters and strips accents (when it can be converted to a single character) - Only allocates a new string if the passed in string is not already normalized
   *
   * Note: This logic should match reverseLowerAlphanumeric() -- EXCEPT that this implementation now only allocates if it needs to
   */
  // Runs the shared implementation with position tracking switched off and keeps only the
  // normalized string, i.e. the first element of the returned pair.
  def lowerAlphanumeric(s: String): String = {
    val result: (String, Array[Int]) = lowerAlphanumericWithPositionsImpl(s, includePositions = false)
    result._1
  }
  /**
   * Removes any non-alphanumeric characters and strips accents (when it can be converted to a single character) - Only allocates a new string if the passed in string is not already normalized
   *
   * In addition to the normalized string this returns an array of positions for the characters that were kept.
   *
   * Note: This logic should match reverseLowerAlphanumeric() -- EXCEPT that this implementation now only allocates if it needs to
   */
  // Runs the shared implementation with position tracking switched on and returns its
  // (normalized string, positions) pair unchanged.
  def lowerAlphanumericWithPositions(s: String): (String, Array[Int]) = {
    lowerAlphanumericWithPositionsImpl(s, includePositions = true)
  }
  /**
   * The implementation for both lowerAlphanumeric and lowerAlphanumericWithPositions
   */
  // Shared worker behind lowerAlphanumeric and lowerAlphanumericWithPositions.
  //
  // - Returns ("", Array()) when `s` is null.
  // - `res` / `pos` stay null until handleChar first sees a character that is not a
  //   letter/digit or is not already lower case; if that never happens the input string
  //   itself is returned (no new string is built).
  // - When includePositions is false the second element of the result is null.
  //
  // NOTE(review): this copy of the method looks truncated -- several closing braces, the calls
  // into handleChar, the append onto `res` and the "trim pos" expression are not visible.
  // Confirm against the upstream source before relying on the notes below.
  private def lowerAlphanumericWithPositionsImpl(s: String, includePositions: Boolean): (String, Array[Int]) = {
    if (null == s) return ("", Array())

    val normalized: String = unicodeNormalization(s)

    var res: JavaStringBuilder = null // The lowerAlphanumeric chars
    var pos: ImmutableArrayBuilder[Int] = null // Original positions the lowerAlphanumeric chars came from

    // Index into `normalized`; read by handleChar through the closure
    var i: Int = 0

    // Processes one character (either a raw char or one char of its ASCII expansion)
    def handleChar(ch: Char): Unit = {
      // First character that breaks the "already normalized" assumption triggers lazy allocation
      if (null == res && (!Character.isLetterOrDigit(ch) || ch != Character.toLowerCase(ch))) {
        // The original string is not normalized so we need to initialize res and copy over everything so far
        res = new JavaStringBuilder(s.length)
        // Positions builder is created via makePositionsArray with fillLength = i, i.e. for the chars seen so far
        if (includePositions) pos = makePositionsArray(normalized.length, i)

        // Copy over everything so far
        if (i > 0) {
          // Chars [0, i) of `normalized` were all accepted as-is up to this point
          res.append(normalized, 0, i)

      // Normal case of building up our new string
      if (null != res) {
        if (Character.isLetterOrDigit(ch)) {
          // NOTE(review): only the position bookkeeping is visible here; the append of the
          // character onto `res` is presumably missing from this copy -- confirm
          if (includePositions) pos += i

    // Walk the normalized string one Char at a time
    while (i < normalized.length) {
      val rawCh: Char = normalized.charAt(i)
      // Presumably null when ASCIIUtil has no ASCII expansion for rawCh -- confirm against ASCIIUtil
      val expandedChars: String = ASCIIUtil.toASCIICharsOrNull(rawCh)

      if (null == expandedChars) {
        // NOTE(review): empty branch in this copy; a handleChar(rawCh) call looks to be missing
      } else {
        // Iterate over every char of the expansion
        var j: Int = 0
        while (j < expandedChars.length) {
          j += 1

      i += 1

    // If res is null then the original string is already normalized
    // NOTE(review): this returns `s`, not `normalized`; if unicodeNormalization changed the
    // string but every resulting char was already lower-case alphanumeric, the un-normalized
    // input is what gets returned -- confirm that this is intended
    val normalizedString: String = if (null == res) s else res.toString

    val normalizedPositions: Array[Int] = if (includePositions) {
      if (null == pos) {
        // If pos is null then res was null so we just need to fill with 0..normalizedString.length
        makePositionsArray(normalizedString.length, normalizedString.length).toArray
      } else {
        // Otherwise trim the pos array to the same length as the normalized string
        // NOTE(review): the trimming expression itself is not visible in this copy
    } else null

    (normalizedString, normalizedPositions)
  // Used by lowerAlphanumericWithPositionsImpl
  //
  // Creates a positions builder and assigns i at slot i for every i in [0, fillLength).
  // NOTE(review): `length` is not referenced in the visible lines (possibly a capacity hint that
  // was lost), and the closing braces / returned value are missing from this copy -- confirm
  // against the upstream source.
  private def makePositionsArray(length: Int, fillLength: Int): ImmutableArrayBuilder[Int] = {
    val arr: ImmutableArrayBuilder[Int] = ImmutableArray.newBuilder
    var i: Int = 0
    // Identity fill: position i maps to original index i
    while (i < fillLength) {
      arr(i) = i
      i += 1
  /**
   * Given the original string and a normalized substring, extract the original version of the normalized substring.
   *
   * e.g. Original: "Foo B.O.S.C.H. Bar"  Normalized: "bosch"  Result: "B.O.S.C.H."
   *
   * Note: This logic should match lowerAlphanumeric
   */
  // Finds `normalized` inside the lower-alphanumeric form of `original` and maps the match back
  // to a substring of the unicode-normalized original. Returns None when either argument is
  // null/blank or when no match is found.
  // NOTE(review): closing braces are missing from this copy of the method.
  def reverseLowerAlphanumeric(original: String, normalized: String): Option[String] = {
    if (original.isNullOrBlank || normalized.isNullOrBlank) return None

    // All indexes below refer to this unicode-normalized form, not to `original` itself
    val unicodeNormalizedOriginal: String = unicodeNormalization(original)
    val (lowerAlphaNumericOriginal: String, positions: Array[Int]) = lowerAlphanumericWithPositions(unicodeNormalizedOriginal)
    // First occurrence of the normalized needle in the normalized haystack
    val matchIdx: Int = lowerAlphaNumericOriginal.indexOf(normalized)
    if (matchIdx < 0) None else {
      // Index of the first matched character
      val startIdx: Int = positions(matchIdx)
      // Index of the LAST matched character (inclusive at this point)
      var endIdx: Int = positions(matchIdx + normalized.length - 1)
      // Upper bound for extending the match: the position of the next normalized character,
      // or the end of the string when the match reaches the end of the normalized form
      val maxEndIdx: Int = if (matchIdx + normalized.length >= lowerAlphaNumericOriginal.length) unicodeNormalizedOriginal.length else positions(matchIdx + normalized.length)
      // Take any additional non-whitespace up to the next normalized character
      while (endIdx < maxEndIdx && !Character.isWhitespace(unicodeNormalizedOriginal.charAt(endIdx))) {
        endIdx += 1
      // endIdx is used as the exclusive end of the extracted range
      Some(unicodeNormalizedOriginal.substring(startIdx, endIdx))
  /**
   * Collects the words of `s` into a new array.
   *
   * The words are produced by the `lowerAlphaNumericWords(s, buf)` overload, which appends each
   * word to the builder it is given; this overload supplies an array builder and returns its
   * contents.
   *
   * @param s the string to split into words
   * @return the words appended by the builder-based overload, in order
   */
  def lowerAlphaNumericWords(s: String): Array[String] = {
    val builder = Array.newBuilder[String]
    lowerAlphaNumericWords(s, builder)
    // Materialize whatever the builder-based overload appended
    builder.result()
  }
  // Splits `s` into words and appends each completed word to `buf`. A word is a run of
  // characters that are letters, digits or '.'; any other character ends the current word.
  // Does nothing when `s` is null.
  // NOTE(review): this copy of the method looks truncated -- closing braces, the calls into
  // handleChar and the append of valid characters onto `sb` are not visible. Confirm against
  // the upstream source before relying on the notes below.
  def lowerAlphaNumericWords(s: String, buf: Builder[String,_]): Unit = {
    if (null == s) return

    val normalized: String = unicodeNormalization(s)
    var i: Int = 0

    // Accumulates the word currently being built; replaced with a fresh builder after each word
    var sb: JavaStringBuilder = new JavaStringBuilder()

    // Processes one character (either a raw char or one char of its ASCII expansion)
    def handleChar(ch: Char): Unit = {
      // If its a valid character (alphanumeric or a dot) add it to the StringBuilder
      if (Character.isLetterOrDigit(ch) || ch == '.') {
        // NOTE(review): empty branch in this copy; the append onto sb looks to be missing
      } else if (sb.length > 0) {
        // Otherwise we have a complete word, add it to the result buffer
        buf += sb.toString
        sb = new JavaStringBuilder()

    // Walk the normalized string one Char at a time
    while (i < normalized.length) {
      val rawCh: Char = normalized.charAt(i)
      // Presumably null when ASCIIUtil has no ASCII expansion for rawCh -- confirm against ASCIIUtil
      val expandedChars: String = ASCIIUtil.toASCIICharsOrNull(rawCh)

      if (null == expandedChars) {
        // NOTE(review): empty branch in this copy; a handleChar(rawCh) call looks to be missing
      } else {
        // Iterate over every char of the expansion
        var j: Int = 0
        while (j < expandedChars.length) {
          j += 1
      i += 1

    // If there is anything left in the StringBuilder, add it to the result buffer
    if (sb.length > 0) buf += sb.toString

  /**
   * Removes ISO control characters from `s`, with the exception of the tab character which is
   * kept. Line feeds and carriage returns are ISO control characters and are therefore removed.
   *
   * @param s the string to clean; must not be null
   * @return `s` without its ISO control characters (tabs preserved)
   */
  def stripControl(s: String): String = {
    // Filtering a String already yields a String, so no char-array round trip is needed
    s.filter { ch => !Character.isISOControl(ch) || '\t' == ch }
  }

  /**
   * Keeps only the digits, '.' and '-' characters of `s`, in their original order.
   *
   * Digits are detected with `Character.isDigit`, so non-ASCII Unicode digits are kept as well.
   * No validation is performed on the result: it is not guaranteed to be a parseable number.
   *
   * @param s the string to filter; must not be null
   * @return the digit, '.' and '-' characters of `s`
   */
  def numeric(s: String): String = {
    // Filtering a String already yields a String, so no char-array round trip is needed
    s.filter { ch => Character.isDigit(ch) || '.' == ch || '-' == ch }
  }
  /** The word separator character for urlName */
  private[this] val SepChar: Char = '-'
  /** These characters should be transformed into the SepChar in urlName */
  private[this] val ReplaceWithSepChars: Set[Char] = Set('_', '\\', '/', ' ')
  /** These characters should be expanded into words in urlName */
  private[this] val ExpandCharMap: Map[Char, String] = Map(
    '&' -> "and",
    '+' -> "plus",
    '"' -> "inch"
  )
  /**
   * Transform the string into something that is URL Friendly.
   */
  // Builds a URL-friendly name from `raw`; returns "" when `raw` is null.
  // Characters are classified in one pass: SepChar / ReplaceWithSepChars act as separators
  // (lastCharWasSep prevents runs of them), ExpandCharMap keys are treated as expandable
  // characters surrounded by separators, and letters/digits reset the separator flag. Any other
  // character matches none of the branches. A leading and a trailing SepChar are removed at the end.
  // NOTE(review): this copy of the method looks truncated -- closing braces, most appends onto
  // `sb` and the final result expression are not visible. Confirm against the upstream source.
  def urlName(raw: String): String = {
    if (null == raw) return ""

    // 2015-02-19 - This additional step was added to strip accented chars
    val s: String = stripAccents(raw)

    val sb: JavaStringBuilder = new JavaStringBuilder(s.length)
    var i: Int = 0
    // True while the most recently handled character resulted in (or was) a separator
    var lastCharWasSep: Boolean = false
    while (i < s.length) {
      val ch: Char = s.charAt(i)
      if (ch == SepChar || ReplaceWithSepChars.contains(ch)) {
        // Separator-like character: only the first of a run is acted upon
        if (!lastCharWasSep) {
          // NOTE(review): only the flag update is visible; the append of SepChar is presumably missing
          lastCharWasSep = true
      } else if (ExpandCharMap.contains(ch)) {
        // Expandable character: make sure it is preceded by a separator
        if (!lastCharWasSep) sb.append(SepChar)
        // NOTE(review): the append of the expansion itself is not visible in this copy
        lastCharWasSep = true
      } else if (Character.isLetterOrDigit(ch)) {
        // NOTE(review): only the flag update is visible; the append of the character is presumably missing
        lastCharWasSep = false

      i += 1

    // We can end up with a leading and/or trailing SepChar so lets remove those
    if (sb.length() > 0 && sb.charAt(0) == SepChar) sb.deleteCharAt(0)
    if (sb.length() > 0 && sb.charAt(sb.length - 1) == SepChar) sb.deleteCharAt(sb.length - 1)
  /**
   * Converts an ASCII Character to its Unicode Full Width equivalent
   *
   * scala> val a = (33 to 126).map{ _.toChar }
   * a: scala.collection.immutable.IndexedSeq[Char] = Vector(!, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, :, ;, <, =, >, ?, @, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V, W, X, Y, Z, [, \, ], ^, _, `, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, {, |, }, ~)
   *
   * scala> val b = (65281 to 65374).map{ _.toChar }
   * b: scala.collection.immutable.IndexedSeq[Char] = Vector(！, ＂, ＃, ＄, ％, ＆, ＇, （, ）, ＊, ＋, ，, －, ．, ／, ０, １, ２, ３, ４, ５, ６, ７, ８, ９, ：, ；, ＜, ＝, ＞, ？, ＠, Ａ, Ｂ, Ｃ, Ｄ, Ｅ, Ｆ, Ｇ, Ｈ, Ｉ, Ｊ, Ｋ, Ｌ, Ｍ, Ｎ, Ｏ, Ｐ, Ｑ, Ｒ, Ｓ, Ｔ, Ｕ, Ｖ, Ｗ, Ｘ, Ｙ, Ｚ, ［, ＼, ］, ＾, ＿, ｀, ａ, ｂ, ｃ, ｄ, ｅ, ｆ, ｇ, ｈ, ｉ, ｊ, ｋ, ｌ, ｍ, ｎ, ｏ, ｐ, ｑ, ｒ, ｓ, ｔ, ｕ, ｖ, ｗ, ｘ, ｙ, ｚ, ｛, ｜, ｝, ～)
   *
   * scala> (a zip b)
   * res44: scala.collection.immutable.IndexedSeq[(Char, Char)] = Vector((!,！), (",＂), (#,＃), ($,＄), (%,％), (&,＆), (',＇), ((,（), (),）), (*,＊), (+,＋), (,,，), (-,－), (.,．), (/,／), (0,０), (1,１), (2,２), (3,３), (4,４), (5,５), (6,６), (7,７), (8,８), (9,９), (:,：), (;,；), (<,＜), (=,＝), (>,＞), (?,？), (@,＠), (A,Ａ), (B,Ｂ), (C,Ｃ), (D,Ｄ), (E,Ｅ), (F,Ｆ), (G,Ｇ), (H,Ｈ), (I,Ｉ), (J,Ｊ), (K,Ｋ), (L,Ｌ), (M,Ｍ), (N,Ｎ), (O,Ｏ), (P,Ｐ), (Q,Ｑ), (R,Ｒ), (S,Ｓ), (T,Ｔ), (U,Ｕ), (V,Ｖ), (W,Ｗ), (X,Ｘ), (Y,Ｙ), (Z,Ｚ), ([,［), (\,＼), (],］), (^,＾), (_,＿), (`,｀), (a,ａ), (b,ｂ), (c,ｃ), (d,ｄ), (e,ｅ), (f,ｆ), (g,ｇ), (h,ｈ), (i,ｉ), (j,ｊ), (k,ｋ), (l,ｌ), (m,ｍ), (n,ｎ), (o,ｏ), (p,ｐ), (q,ｑ), (r,ｒ), (s,ｓ), (t,ｔ), (u,ｕ), (v,ｖ), (w,ｗ), (x,ｘ), (y,ｙ), (z,ｚ), ({,｛), (|,｜), (},｝), (~,～))
   */
  // Space maps to code point 12288 (U+3000); the printable ASCII range 33..126 ('!'..'~') is
  // shifted up by 65248 into 65281..65374; every other character is returned unchanged.
  def toFullWidth(ch: Char): Char = {
    val code: Int = ch.toInt
    if (code == ' '.toInt) 12288.toChar
    else if (code < 33 || code > 126) ch
    else (code + 65248).toChar
  }
  /**
   * Converts ASCII Characters in a String to their Unicode Full Width equivalent
   */
  // String overload: applies the Char overload of toFullWidth to every character of `s` and
  // returns the resulting string (same length as `s`). `s` must not be null.
  def toFullWidth(s: String): String = s.map { ch => toFullWidth(ch) }

  /**
   * Used by the various lowerAlphanumeric methods.
   *
   * Per the comment in the body, the intended result is the NFKC-normalized form of `s`.
   * NOTE(review): the normalizing expression and the closing brace are not visible in this
   * copy of the file -- confirm the implementation against the upstream source.
   */
  private[common] def unicodeNormalization(s: String): String = {
    // Unicode Normalization Form KC (NFKC) - "Compatibility decomposition, followed by canonical composition."

© 2015 - 2025 Weber Informatics LLC | Privacy Policy