// fm.common.Normalize.scala (Maven / Gradle / Ivy)
/*
* Copyright 2016 Frugal Mechanic (http://frugalmechanic.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package fm.common
import java.lang.{StringBuilder => JavaStringBuilder}
import scala.collection.mutable.Builder
object Normalize {
/**
 * Converts the string to its ASCII form by delegating to ASCIIUtil.convertToASCII.
 */
def stripAccents(s: String): String = {
  ASCIIUtil.convertToASCII(s)
}
/**
 * Lower-cases the letters and digits of the string and replaces every run of
 * other characters with a single space. The input is first passed through
 * unicodeNormalization, and each character is expanded through
 * ASCIIUtil.toASCIICharsOrNull when an expansion is available.
 *
 * Leading and trailing spaces are trimmed. A null input yields "".
 */
def lowerAlphanumericWithSpaces(s: String): String = {
  if (null == s) "" else {
    val text: String = unicodeNormalization(s)
    val out: JavaStringBuilder = new JavaStringBuilder()

    // The last character category we emitted; ' ' means "a separator is already pending"
    var last: Char = 0

    def emit(c: Char): Unit = {
      if (Character.isLetterOrDigit(c)) {
        out.append(Character.toLowerCase(c))
        last = c
      } else if (last != ' ') {
        // Collapse consecutive non-alphanumerics into one space
        out.append(' ')
        last = ' '
      }
    }

    var idx: Int = 0
    while (idx < text.length) {
      val c: Char = text.charAt(idx)
      val expanded: String = ASCIIUtil.toASCIICharsOrNull(c)
      if (expanded eq null) emit(c) else expanded.foreach { e => emit(e) }
      idx += 1
    }

    out.toString.trim
  }
}
/**
 * Lower-cases the string and removes every character that is not a letter or digit.
 * All of the work is done by lowerAlphanumericWithPositionsImpl (without position tracking),
 * which avoids building a new string when it finds nothing to change.
 *
 * A null input yields "".
 *
 * Note: This logic should match reverseLowerAlphanumeric()
 */
def lowerAlphanumeric(s: String): String = {
  lowerAlphanumericWithPositionsImpl(s, includePositions = false)._1
}
/**
 * Same normalization as lowerAlphanumeric, but also returns an array that maps each
 * character of the result back to the index it came from. The indexes refer to the
 * string after it has been passed through unicodeNormalization.
 *
 * A null input yields ("", empty array).
 *
 * Note: This logic should match reverseLowerAlphanumeric()
 */
def lowerAlphanumericWithPositions(s: String): (String, Array[Int]) = {
  lowerAlphanumericWithPositionsImpl(s, includePositions = true)
}
/**
 * The implementation for both lowerAlphanumeric and lowerAlphanumericWithPositions.
 *
 * The string is passed through unicodeNormalization, each character is expanded through
 * ASCIIUtil.toASCIICharsOrNull when an expansion exists, and only letters/digits are kept
 * (lower-cased). No StringBuilder is allocated until the first character that actually
 * changes the output is seen.
 *
 * @param s                the string to normalize (null yields ("", empty array))
 * @param includePositions when true the second tuple element maps every result character to the
 *                         index (within the unicode-normalized form of s) it was produced from;
 *                         when false the second tuple element is null
 * @return the normalized string and (optionally) the source positions
 */
private def lowerAlphanumericWithPositionsImpl(s: String, includePositions: Boolean): (String, Array[Int]) = {
  if (null == s) return ("", Array())

  val normalized: String = unicodeNormalization(s)

  var res: JavaStringBuilder = null // The lowerAlphanumeric chars (null while the input is still "as-is")
  var pos: ImmutableArrayBuilder[Int] = null // Original positions the lowerAlphanumeric chars came from
  var i: Int = 0

  // Leaves the "nothing has changed so far" fast path: everything before index i was kept
  // verbatim, so copy it over and record an identity mapping for those positions.
  def startBuilding(): Unit = {
    res = new JavaStringBuilder(normalized.length)
    if (includePositions) pos = makePositionsArray(normalized.length, i)
    if (i > 0) res.append(normalized, 0, i)
  }

  def handleChar(ch: Char): Unit = {
    // A dropped character or an upper-case character means the output differs from the input
    if (null == res && (!Character.isLetterOrDigit(ch) || ch != Character.toLowerCase(ch))) startBuilding()

    // Normal case of building up our new string
    if (null != res && Character.isLetterOrDigit(ch)) {
      res.append(Character.toLowerCase(ch))
      if (includePositions) pos += i
    }
  }

  while (i < normalized.length) {
    val rawCh: Char = normalized.charAt(i)
    val expandedChars: String = ASCIIUtil.toASCIICharsOrNull(rawCh)

    if (null == expandedChars) {
      handleChar(rawCh)
    } else {
      // The expansion replaces rawCh in the output. If it is anything other than rawCh itself
      // the output differs from the input even when every expanded char is already a
      // lower-case letter/digit, so the fast path must be abandoned here.
      if (null == res && (expandedChars.length != 1 || expandedChars.charAt(0) != rawCh)) startBuilding()

      var j: Int = 0
      while (j < expandedChars.length) {
        handleChar(expandedChars.charAt(j))
        j += 1
      }
    }

    i += 1
  }

  // If res is null then the unicode-normalized string is already fully normalized. Hand back
  // the caller's instance when unicode normalization did not change anything, otherwise the
  // unicode-normalized string (never the un-normalized input).
  val normalizedString: String = {
    if (null != res) res.toString
    else if (normalized == s) s
    else normalized
  }

  val normalizedPositions: Array[Int] = if (includePositions) {
    if (null == pos) {
      // If pos is null then res was null so we just need to fill with 0..normalizedString.length
      makePositionsArray(normalizedString.length, normalizedString.length).toArray
    } else {
      pos.toArray
    }
  } else null

  (normalizedString, normalizedPositions)
}
// Used by lowerAlphanumericWithPositionsImpl.
// Creates a new builder and sets slot k to the value k for every k in [0, fillLength).
// NOTE(review): the `length` parameter is not used by this method - confirm whether it was
// meant to be a size hint for the builder.
private def makePositionsArray(length: Int, fillLength: Int): ImmutableArrayBuilder[Int] = {
  val builder: ImmutableArrayBuilder[Int] = ImmutableArray.newBuilder
  for (idx <- 0 until fillLength) {
    builder(idx) = idx
  }
  builder
}
/**
 * Given the original string and a normalized substring, extract the original version of the normalized substring.
 * e.g. Original: "Foo B.O.S.C.H. Bar" Normalized: "bosch" Result: "B.O.S.C.H."
 *
 * The returned text is taken from the unicode-normalized form of `original` (see unicodeNormalization).
 * After the last matched character, any following non-whitespace characters are included up to
 * (but not including) the next character that contributes to the normalized string.
 *
 * Note: This logic should match lowerAlphanumeric
 *
 * @param original   the un-normalized text to search in
 * @param normalized the lowerAlphanumeric form of the text to find
 * @return None if either argument is null/blank or `normalized` does not occur, otherwise the matching original text
 */
def reverseLowerAlphanumeric(original: String, normalized: String): Option[String] = {
  if (original.isNullOrBlank || normalized.isNullOrBlank) return None

  val unicodeNormalizedOriginal: String = unicodeNormalization(original)
  val (lowerAlphaNumericOriginal: String, positions: Array[Int]) = lowerAlphanumericWithPositions(unicodeNormalizedOriginal)

  val matchIdx: Int = lowerAlphaNumericOriginal.indexOf(normalized)

  if (matchIdx < 0) None else {
    // Index (in the normalized string) of the first character after the match
    val afterMatchIdx: Int = matchIdx + normalized.length

    val startIdx: Int = positions(matchIdx)

    // substring() takes an exclusive end, so begin one past the source character of the last
    // matched character. This guarantees that character is always part of the result, even
    // when the next normalized character maps to the very same source index (a source
    // character that expands to several normalized characters).
    var endIdx: Int = positions(afterMatchIdx - 1) + 1

    val maxEndIdx: Int = if (afterMatchIdx >= lowerAlphaNumericOriginal.length) unicodeNormalizedOriginal.length else positions(afterMatchIdx)

    // Take any additional non-whitespace up to the next normalized character
    while (endIdx < maxEndIdx && !Character.isWhitespace(unicodeNormalizedOriginal.charAt(endIdx))) {
      endIdx += 1
    }

    Some(unicodeNormalizedOriginal.substring(startIdx, endIdx))
  }
}
/**
 * Splits the string into lower-cased words and returns them as an array.
 * The words are collected by the Builder-based overload of this method.
 */
def lowerAlphaNumericWords(s: String): Array[String] = {
  val words = Array.newBuilder[String]
  lowerAlphaNumericWords(s, words)
  words.result()
}
/**
 * Splits the string into lower-cased words and appends each word to `buf`.
 *
 * A word is a maximal run of letters, digits and '.' characters; any other character ends
 * the current word. The input is first passed through unicodeNormalization and each
 * character is expanded through ASCIIUtil.toASCIICharsOrNull when an expansion exists.
 * A null input appends nothing.
 */
def lowerAlphaNumericWords(s: String, buf: Builder[String,_]): Unit = {
  if (null != s) {
    val text: String = unicodeNormalization(s)
    val word: JavaStringBuilder = new JavaStringBuilder()

    // Emits the pending word (if any) and resets the accumulator for the next one
    def flushWord(): Unit = {
      if (word.length > 0) {
        buf += word.toString
        word.setLength(0)
      }
    }

    def accept(c: Char): Unit = {
      val partOfWord: Boolean = Character.isLetterOrDigit(c) || c == '.'
      if (partOfWord) {
        word.append(Character.toLowerCase(c))
      } else {
        flushWord()
      }
    }

    var idx: Int = 0
    while (idx < text.length) {
      val c: Char = text.charAt(idx)
      val expanded: String = ASCIIUtil.toASCIICharsOrNull(c)
      if (expanded eq null) accept(c) else expanded.foreach { e => accept(e) }
      idx += 1
    }

    // Whatever is still pending at the end of the input is the final word
    flushWord()
  }
}
/**
 * Removes every ISO control character (as defined by Character.isISOControl) from the
 * string, except for the tab character which is kept.
 *
 * @param s the string to clean
 * @return the cleaned string, or "" when s is null (consistent with the other normalizers in this object)
 */
def stripControl(s: String): String = {
  if (null == s) ""
  else s.filter { ch => '\t' == ch || !Character.isISOControl(ch) }
}
/**
 * Keeps only the characters that are digits (per Character.isDigit, which also accepts
 * non-ASCII digits), '.' or '-'. No validation is done on the result, so it is not
 * guaranteed to be a parseable number.
 *
 * @param s the string to filter
 * @return the filtered string, or "" when s is null (consistent with the other normalizers in this object)
 */
def numeric(s: String): String = {
  if (null == s) ""
  else s.filter { ch => Character.isDigit(ch) || '.' == ch || '-' == ch }
}
/** The word separator character for urlName */
private[this] val SepChar: Char = '-'

/** Characters that urlName turns into the SepChar */
private[this] val ReplaceWithSepChars: Set[Char] = Set('_', '\\', '/', ' ')

/** Characters that urlName spells out as words */
private[this] val ExpandCharMap: Map[Char, String] = Map(
  '&' -> "and",
  '+' -> "plus",
  '"' -> "inch"
)

/**
 * Transform the string into something that is URL Friendly.
 *
 * Accents are stripped first (see stripAccents). After that:
 *  - SepChar and the ReplaceWithSepChars become a single SepChar (consecutive ones are collapsed)
 *  - the ExpandCharMap characters become their word surrounded by SepChars
 *  - letters and digits are lower-cased
 *  - every other character is dropped
 *
 * A leading and/or trailing SepChar is removed from the result. A null input yields "".
 */
def urlName(raw: String): String = {
  if (null == raw) "" else {
    // 2015-02-19 - This additional step was added to strip accented chars
    val stripped: String = stripAccents(raw)
    val out: JavaStringBuilder = new JavaStringBuilder(stripped.length)

    var prevWasSep: Boolean = false
    var idx: Int = 0

    while (idx < stripped.length) {
      val c: Char = stripped.charAt(idx)

      if (c == SepChar || ReplaceWithSepChars.contains(c)) {
        // Only emit a separator if we are not already sitting on one
        if (!prevWasSep) {
          out.append(SepChar)
          prevWasSep = true
        }
      } else {
        ExpandCharMap.get(c) match {
          case Some(word) =>
            if (!prevWasSep) out.append(SepChar)
            out.append(word)
            out.append(SepChar)
            prevWasSep = true

          case None =>
            if (Character.isLetterOrDigit(c)) {
              out.append(Character.toLowerCase(c))
              prevWasSep = false
            }
        }
      }

      idx += 1
    }

    // We can end up with a leading and/or trailing SepChar so lets remove those
    if (out.length() > 0 && out.charAt(0) == SepChar) out.deleteCharAt(0)
    if (out.length() > 0 && out.charAt(out.length() - 1) == SepChar) out.deleteCharAt(out.length() - 1)

    out.toString
  }
}
/**
 * Converts an ASCII character to its Unicode full-width equivalent.
 *
 *  - the space character (U+0020) becomes the ideographic space (U+3000)
 *  - the printable ASCII characters '!' (U+0021) through '~' (U+007E) become the
 *    full-width forms U+FF01 through U+FF5E (a fixed offset of 0xFEE0)
 *  - every other character is returned unchanged
 *
 * @param ch the character to convert
 * @return the full-width form of ch, or ch itself when it has no full-width form here
 */
def toFullWidth(ch: Char): Char = {
  if (ch == ' ') '\u3000'
  else if (ch >= '!' && ch <= '~') (ch + 0xFEE0).toChar
  else ch
}

/**
 * Converts the ASCII characters in a string to their Unicode full-width equivalents
 * by applying the Char version of toFullWidth to every character.
 *
 * @param s the string to convert
 * @return the converted string, or "" when s is null (consistent with the other normalizers in this object)
 */
def toFullWidth(s: String): String = {
  if (null == s) ""
  else s.map { ch => toFullWidth(ch) }
}
/**
 * Used by the various lowerAlphanumeric methods.
 *
 * Applies Unicode Normalization Form KC (NFKC) - "Compatibility decomposition, followed by
 * canonical composition." - by delegating to UnicodeNormalization.normalizeNFKC.
 */
private[common] def unicodeNormalization(s: String): String = UnicodeNormalization.normalizeNFKC(s)
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy