// ![JAR search and dependency download from the Maven repository](/logo.png)
// sanskritnlp.transliteration.romanScripts.scala Maven / Gradle / Ivy
package sanskritnlp.transliteration
import java.util.Collections
import org.slf4j.LoggerFactory
import scala.util.matching.Regex
import scala.util.matching.Regex.Match
// Point of entry: toDevanagari()
// Read that function, and the logic will be clear.
trait RomanScript extends IndicScript {
// Logger obtained for the runtime class of whatever object mixes in this trait.
val log = LoggerFactory.getLogger(this.getClass)
// All tables below default to null in this trait.
// NOTE(review): presumably every concrete script overrides them with real tables — confirm against implementers.

// Roman -> Devanagari tables.
// Merged into the final context-free replacement pass of toDevanagari().
val romanToDevaIndependentVowels: Map[String, String] = null
// Keys drive replaceRomanDependentVowels(); values are matched again in replaceRomanConsonantsFollowedByVowels().
val romanToDevaDependentVowels: Map[String, String] = null
// Merged into the final context-free replacement pass of toDevanagari().
val romanToDevaConsonants: Map[String, String] = null
// Keys are the roman consonants matched before a vowel; values are also used by
// replaceDevanagariConsonants() to detect a trailing bare consonant.
val romanToDevaConsonantsNoVirama: Map[String, String] = null
// Merged into the final context-free replacement pass of toDevanagari().
val romanToDevaContextFreeReplacements: Map[String, String] = null

// Devanagari -> roman tables.
// Only the keys are used here: they build the consonant regexes in replaceDevanagariConsonants().
val devaConsonantsNoViramaToRomanVirama: Map[String, String] = null
// Within this trait this table is only printed by debugString().
val devaConsonantsNoViramaToRoman: Map[String, String] = null
// Applied by fromDevanagari() after the dependent vowels.
val devaIndependentVowelsToRoman: Map[String, String] = null
// Keys build the vowel-sign regexes in replaceDevanagariConsonants(); the table is applied by fromDevanagari().
val devaDependentVowelsToRoman: Map[String, String] = null
// Applied at the end of replaceDevanagariConsonants(), after the viramas have been inserted.
val devaConsonantsToRoman: Map[String, String] = null
// Roman spelling of the inherent vowel: appended after an inserted virama in
// replaceDevanagariConsonants() and accepted as a vowel in replaceRomanConsonantsFollowedByVowels().
val aToRoman: String = null
// Last table applied by fromDevanagari().
val devaToRomanGeneral: Map[String, String] = null
// isEncoding() reports true when the input contains any of these strings.
val distinctCharacters: List[String] = null
// When true, toDevanagari() lower-cases its input before transliterating.
val caseNeutral = false
// Maps each independent Devanagari vowel to its dependent (vowel-sign) form.
// The inherent vowel "अ" has no sign, hence the empty string.
val devaIndependentToDependent: Map[String, String] = Map(
  "अ" -> "",
  "आ" -> "ा",
  "इ" -> "ि",
  "ई" -> "ी",
  "उ" -> "ु",
  "ऊ" -> "ू",
  "ऋ" -> "ृ",
  "ॠ" -> "ॄ",
  "ऌ" -> "ॢ",
  "ॡ" -> "ॣ",
  "ए" -> "े",
  "ऐ" -> "ै",
  "ओ" -> "ो",
  "औ" -> "ौ"
)
/** Dumps the main conversion tables of this script to standard output, one per line. */
def debugString() = {
  // Kept in the original print order.
  val tablesInPrintOrder: Seq[Any] = Seq(
    romanToDevaIndependentVowels,
    romanToDevaDependentVowels,
    romanToDevaConsonants,
    romanToDevaConsonantsNoVirama,
    romanToDevaContextFreeReplacements,
    devaConsonantsNoViramaToRomanVirama,
    devaConsonantsNoViramaToRoman,
    devaDependentVowelsToRoman,
    aToRoman,
    devaToRomanGeneral
  )
  tablesInPrintOrder.foreach(table => println(table))
}
/**
 * Builds a regex consisting of one capturing group that matches any of `keys` literally.
 *
 * Every key is quoted with [[scala.util.matching.Regex.quote]], so regex metacharacters that
 * occur in transliteration keys (`^`, `~`, `*`, `+`, `?`, `$`, `(`, `[`, `\`, ...) are matched
 * as plain text. Previously only `.` and `|` were escaped, which made keys such as `R^i`
 * impossible to match.
 *
 * @param keys the literal strings to match
 * @return a regex of the form `(key1|key2|...)` with exactly one capturing group
 */
def makeRegexFromKeys(keys: Iterable[String]): Regex = {
  keys.map(key => Regex.quote(key)).mkString("(", "|", ")").r
}
/**
 * Replaces every occurrence of a key of `mapping` in `str_in` with the corresponding value,
 * processing longer keys before shorter ones.
 *
 * One pass is made per distinct key length, in descending order (the original author notes
 * List(3, 2, 1) for HK), and each pass sees the output of the previous one.
 *
 * NOTE(review): values are handed to `replaceAllIn` as replacement strings, so a `$` or `\`
 * inside a value is interpreted by the regex engine rather than emitted literally.
 *
 * @param str_in  text to transform
 * @param mapping key -> replacement table
 * @return the text after all passes
 */
def replaceKeysLongestFirst(str_in: String, mapping: Map[String, String]): String = {
  val lengthsDescending = mapping.keySet.map(_.length).toList.sorted(Ordering[Int].reverse)
  lengthsDescending.foldLeft(str_in) { (text, keyLength) =>
    val sameLengthEntries = mapping.filter { case (key, _) => key.length() == keyLength }
    val keyPattern = makeRegexFromKeys(sameLengthEntries.keys)
    keyPattern.replaceAllIn(text, (hit: Match) => hit match {
      case keyPattern(key) => mapping(key)
    })
  }
}
/**
 * Rewrites each roman vowel that directly follows a roman consonant into the value that
 * `vowelMap` gives for it; the consonant itself is left in roman form.
 *
 * @param str_in   text to transform
 * @param vowelMap roman vowel -> replacement, for the vowels handled in this pass
 * @return the text with the matched vowels replaced
 */
def replaceRomanDependentVowels(str_in: String, vowelMap: Map[String, String]): String = {
  val consonantPart = makeRegexFromKeys(romanToDevaConsonantsNoVirama.keys).toString()
  val vowelPart = makeRegexFromKeys(vowelMap.keys).toString()
  val consonantThenVowel = (consonantPart + vowelPart).r
  consonantThenVowel.replaceAllIn(str_in, (hit: Match) => hit match {
    case consonantThenVowel(consonant, vowelKey) => consonant + vowelMap(vowelKey)
  })
}
/**
 * Applies the two-argument overload once per distinct key length of
 * `romanToDevaDependentVowels`, longest keys first, so that a long vowel spelling is
 * consumed before any shorter spelling it contains.
 *
 * @param str_in text to transform
 * @return the text after all passes
 */
def replaceRomanDependentVowels(str_in: String): String = {
  val lengthsDescending =
    romanToDevaDependentVowels.keySet.map(_.length).toList.sorted(Ordering[Int].reverse)
  lengthsDescending.foldLeft(str_in) { (text, keyLength) =>
    val vowelsOfThisLength = romanToDevaDependentVowels.filter { case (key, _) => key.length() == keyLength }
    replaceRomanDependentVowels(text, vowelsOfThisLength)
  }
}
/** Logs `str_in` and the result of running replaceRomanDependentVowels on it. */
def test_replaceRomanDependentVowels(str_in: String): Unit = {
  log.info(s"Test string : $str_in")
  log.info(s"Result : ${replaceRomanDependentVowels(str_in)}")
}
/**
 * Converts each roman consonant from `consonantMapNoVirama` that is directly followed by a
 * vowel into its virama-less Devanagari form. The vowel is either an already converted
 * dependent vowel sign (a value of `romanToDevaDependentVowels`), which is kept, or the
 * inherent vowel `aToRoman`, which is dropped.
 *
 * The inherent vowel is recognised by comparing with `aToRoman`, the same string the regex
 * matches. Previously a hard-coded "a" was stripped, which left the vowel in the output for
 * any script whose `aToRoman` is not "a".
 *
 * @param str_in               text to transform
 * @param consonantMapNoVirama roman consonant -> Devanagari consonant without virama
 * @return the text with the matched consonant+vowel pairs converted
 */
def replaceRomanConsonantsFollowedByVowels(str_in: String, consonantMapNoVirama: Map[String, String]): String = {
  val consonantPart = makeRegexFromKeys(consonantMapNoVirama.keys).toString()
  val vowelPart = makeRegexFromKeys(romanToDevaDependentVowels.values ++ List(aToRoman)).toString()
  val consonantThenVowel = (consonantPart + vowelPart).r
  consonantThenVowel.replaceAllIn(str_in, (hit: Match) => hit match {
    case consonantThenVowel(consonant, vowel) =>
      // The inherent vowel is implicit in a bare Devanagari consonant, so it is not written.
      val vowelSign = if (vowel == aToRoman) "" else vowel
      consonantMapNoVirama(consonant) + vowelSign
  })
}
/**
 * Applies the two-argument overload once per distinct key length of
 * `romanToDevaConsonantsNoVirama`, longest keys first (the original author notes
 * List(3, 2, 1) for HK).
 *
 * Assumption: this should only be called after replaceRomanDependentVowels.
 *
 * @param str_in text to transform
 * @return the text after all passes
 */
def replaceRomanConsonantsFollowedByVowels(str_in: String): String = {
  val lengthsDescending =
    romanToDevaConsonantsNoVirama.keySet.map(_.length).toList.sorted(Ordering[Int].reverse)
  lengthsDescending.foldLeft(str_in) { (text, keyLength) =>
    val consonantsOfThisLength = romanToDevaConsonantsNoVirama.filter { case (key, _) => key.length() == keyLength }
    replaceRomanConsonantsFollowedByVowels(text, consonantsOfThisLength)
  }
}
/**
 * Logs `str_in` and the result of running it through replaceRomanDependentVowels,
 * replaceKeysLongestFirst (with the dependent-vowel table) and finally
 * replaceRomanConsonantsFollowedByVowels.
 */
def test_replaceRomanConsonantsFollowedByVowels(str_in: String): Unit = {
  log.info(s"Test string : $str_in")
  val vowelsAfterConsonantsDone = replaceRomanDependentVowels(str_in)
  val vowelKeysReplaced = replaceKeysLongestFirst(vowelsAfterConsonantsDone, romanToDevaDependentVowels)
  log.info(s"Result : ${replaceRomanConsonantsFollowedByVowels(vowelKeysReplaced)}")
}
/**
 * Transliterates roman text into Devanagari.
 *
 * Steps:
 *  1. if `caseNeutral` is set, lower-case the input;
 *  2. convert vowels that follow a consonant (replaceRomanDependentVowels);
 *  3. convert consonants that are followed by a vowel (replaceRomanConsonantsFollowedByVowels);
 *  4. convert everything that remains with the consonant, context-free and
 *     independent-vowel tables, longest keys first.
 *
 * Best effort: if a table lookup fails with NoSuchElementException the failure is logged and
 * the original, unmodified input is returned.
 *
 * @param str_in roman text
 * @return the Devanagari text, or `str_in` itself when conversion fails
 */
override def toDevanagari(str_in: String): String = {
  // Locale.ROOT keeps lower-casing independent of the JVM default locale
  // (a Turkish locale, for example, would turn "I" into a dotless "ı").
  val normalized = if (caseNeutral) str_in.toLowerCase(java.util.Locale.ROOT) else str_in
  try {
    val vowelsAfterConsonantsDone = replaceRomanDependentVowels(normalized)
    val consonantsBeforeVowelsDone = replaceRomanConsonantsFollowedByVowels(vowelsAfterConsonantsDone)
    replaceKeysLongestFirst(
      consonantsBeforeVowelsDone,
      romanToDevaConsonants ++ romanToDevaContextFreeReplacements ++ romanToDevaIndependentVowels)
  } catch {
    case e: java.util.NoSuchElementException =>
      // Keep the fallback, but make the failure visible instead of swallowing it.
      log.warn("toDevanagari failed; returning the input unchanged", e)
      str_in
  }
}
/**
 * Converts the Devanagari consonants of `str_in` to roman, making the inherent vowel explicit.
 *
 * Steps:
 *  1. consonant + vowel sign           -> consonant + virama + vowel sign;
 *  2. consonant followed by any character other than a virama or vowel sign
 *                                       -> consonant + virama + `aToRoman`;
 *  3. a bare consonant at the very end -> consonant + virama + `aToRoman`;
 *  4. the resulting text is converted with `devaConsonantsToRoman`, longest keys first.
 *
 * Empty input is returned unchanged (it used to throw NoSuchElementException from `last`).
 *
 * @param str_in Devanagari text
 * @return the text with consonants romanised; vowel signs and other characters are untouched
 */
def replaceDevanagariConsonants(str_in: String): String = {
  val virama = "्"
  val consonantGroup = "(" + devaConsonantsNoViramaToRomanVirama.keys.mkString("|") + ")"
  val vowelSigns = devaDependentVowelsToRoman.keys

  // Step 1: separate every consonant from a following vowel sign with a virama.
  val consonantVowelPattern = (consonantGroup + "(" + vowelSigns.mkString("|") + ")").r
  val viramaBeforeVowelSigns = consonantVowelPattern.replaceAllIn(str_in, (m: Match) => m match {
    case consonantVowelPattern(consonant, vowel) => consonant + virama + vowel
  })

  // Step 2: the lookahead keeps the following character out of the match.
  val consonantNonVowelPattern =
    (consonantGroup + "(?=[^" + virama + vowelSigns.mkString("") + "])").r
  val inherentVowelInside = consonantNonVowelPattern.replaceAllIn(
    viramaBeforeVowelSigns, (m: Match) => m.group(0) + virama + aToRoman)

  // Step 3: the lookahead cannot match at end of input, so handle a trailing consonant here.
  // lastOption keeps empty input from throwing.
  val bareConsonants = romanToDevaConsonantsNoVirama.values.toSet
  val endsWithBareConsonant =
    inherentVowelInside.lastOption.exists(ch => bareConsonants.contains(ch.toString))
  val inherentVowelEverywhere =
    if (endsWithBareConsonant) inherentVowelInside + virama + aToRoman else inherentVowelInside

  // Step 4
  replaceKeysLongestFirst(inherentVowelEverywhere, devaConsonantsToRoman)
}
/**
 * Tells whether `str_in` contains at least one of the strings in `distinctCharacters`.
 *
 * @param str_in text to inspect
 * @return true if any entry of `distinctCharacters` occurs in `str_in`
 */
def isEncoding(str_in: String): Boolean = {
  distinctCharacters.exists(marker => str_in.contains(marker))
}
/**
 * Transliterates Devanagari text into this roman script.
 *
 * Consonants are converted first (replaceDevanagariConsonants); then the dependent-vowel,
 * independent-vowel and general tables are applied in that order, each longest keys first.
 *
 * @param str_in Devanagari text
 * @return the roman text
 */
override def fromDevanagari(str_in: String): String = {
  val consonantsDone = replaceDevanagariConsonants(str_in)
  val tablesInOrder = List(
    devaDependentVowelsToRoman,
    devaIndependentVowelsToRoman,
    devaToRomanGeneral
  )
  tablesInOrder.foldLeft(consonantsDone) { (text, table) =>
    replaceKeysLongestFirst(text, table)
  }
}
/**
 * Turns the character after each backslash back into roman text.
 *
 * Every backslash followed by one character (plus an optional virama) has that character run
 * through fromDevanagari; the backslash itself is kept. For example `\न्` can become `\n`
 * again, depending on the script's tables.
 *
 * The replacement is passed through [[scala.util.matching.Regex.quoteReplacement]], so a `$`
 * or `\` in the restored text is emitted literally. Previously input such as `\$` or `\\`
 * threw IllegalArgumentException from the regex engine.
 *
 * @param str_in text that may contain transliterated escape sequences
 * @return the text with the escaped characters restored to roman
 */
def restoreEscapeSequences(str_in: String): String = {
  val escapePattern = """\\(.्?)""".r
  escapePattern.replaceAllIn(str_in, (m: Match) => m match {
    case escapePattern(escaped) =>
      Regex.quoteReplacement("\\" + fromDevanagari(escaped))
  })
}
/** Logs the result of restoreEscapeSequences on a fixed sample string. */
def test_restoreEscapeSequences() = {
  val sample = """हरिः ॐ १ 1ad\न् \त्"""
  val restored = restoreEscapeSequences(sample)
  log.info(restored)
}
/**
 * Converts the text between each `romanStart` ... `romanEnd` pair back to roman with
 * fromDevanagari, leaving the delimiters as they appear in the input.
 *
 * ASSUMPTION: `romanStart` and `romanEnd` are regex fragments; escape literal characters
 * appropriately (e.g. "\\{#" and "#\\}"). They must not contain capturing groups of their own.
 *
 * The delimiters written to the output are the text that actually matched, and the whole
 * replacement is passed through [[scala.util.matching.Regex.quoteReplacement]]. Previously
 * the regex source of the delimiters was reused as replacement text and the roman text was
 * unquoted, so a `$` or `\` between the delimiters threw or was mangled.
 *
 * @param str_in     text to transform
 * @param romanStart regex fragment matching the opening delimiter
 * @param romanEnd   regex fragment matching the closing delimiter
 * @return the text with every delimited span converted to roman
 */
def restoreRomanBetweenStrings(str_in: String, romanStart: String, romanEnd: String): String = {
  val delimitedSpan = (romanStart + """(.+?)""" + romanEnd).r
  delimitedSpan.replaceAllIn(str_in, (m: Match) => m match {
    case delimitedSpan(inner) =>
      // Recover the delimiters exactly as they appeared around the captured text.
      val opening = str_in.substring(m.start, m.start(1))
      val closing = str_in.substring(m.end(1), m.end)
      Regex.quoteReplacement(opening + fromDevanagari(inner) + closing)
  })
}
/** Logs the result of restoreRomanBetweenStrings on a fixed sample, using `{#` and `#}` as delimiters. */
def test_restoreRomanBetweenStrings() = {
  val sample = """हरिः ॐ १ {#Pअगे#} 1ad {#आन्द्#} \न् \त्"""
  val restored = restoreRomanBetweenStrings(sample, "\\{#", "#\\}")
  log.info(restored)
}
/** Logs the two intermediate conversion stages for `str_in`, then the full toDevanagari result. */
def test_toDevanagari(str_in : String) = {
  // Intermediate stages first, so a wrong final result can be traced to its step.
  test_replaceRomanDependentVowels(str_in)
  test_replaceRomanConsonantsFollowedByVowels(str_in)
  val converted = toDevanagari(str_in)
  log.info(converted)
}
/** Logs `str_in` (a built-in Devanagari sample by default) and its fromDevanagari conversion. */
def test_fromDevanagari(str_in : String = "असय औषधिः ग्रन्थः! ॡकारो।ऽस्ति। नास्ति लेशोऽपि संशयः। कीलकम्? कूपिः? कष्ठं भोः। शङ्कर! ज्ञानम्। सञ्जीवय। १२३४५.. ॐ तत्।") = {
  log.info(s"Input: $str_in")
  log.info(s"Output: ${fromDevanagari(str_in)}")
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy