
// com.jetbrains.pluginverifier.misc.HtmlEscaper.kt (Maven / Gradle / Ivy)
/*
* Copyright 2000-2023 JetBrains s.r.o. and other contributors. Use of this source code is governed by the Apache 2.0 license that can be found in the LICENSE file.
*/
package com.jetbrains.pluginverifier.misc
import java.io.IOException
import java.io.StringWriter
import java.io.Writer
import java.lang.Character.*
import java.util.*
/**
 * HTML 4.0 String escaper.
 *
 * Code from Apache `commons-text`, adjusted for Kotlin and simplified.
 *
 * On construction the file-level `escapeLookup` table is copied into an internal map.
 * [escape] then replaces every occurrence of a table key with its mapped value and copies
 * all other characters through unchanged. The escape methods only read the internal state.
 */
class HtmlEscaper {
  /** The mapping to be used in translation. */
  private val lookupMap = mutableMapOf<String, String>()

  /** The first character of each key in the lookupMap. */
  private val prefixSet = BitSet()

  /** The length of the shortest key in the lookupMap. */
  private val shortest: Int

  /** The length of the longest key in the lookupMap. */
  private val longest: Int

  init {
    var currentShortest = Int.MAX_VALUE
    var currentLongest = 0
    for ((key, value) in escapeLookup) {
      lookupMap[key.toString()] = value.toString()
      // Remember the leading char so non-matching input can be rejected without a map lookup.
      prefixSet.set(key[0].code)
      currentShortest = minOf(currentShortest, key.length)
      currentLongest = maxOf(currentLongest, key.length)
    }
    shortest = currentShortest
    longest = currentLongest
  }

  /**
   * Escapes [input] and returns the result as a new string.
   *
   * @param input the text to escape
   * @return [input] with every lookup-table key replaced by its mapped value
   */
  fun escape(input: CharSequence): String {
    // Pre-size the buffer at twice the input length to leave room for replacements.
    val writer = StringWriter(input.length * 2)
    escape(input, writer)
    return writer.toString()
  }

  /**
   * Tries to translate the text starting at [index] and writes the replacement to [writer].
   *
   * @param input the text being escaped
   * @param index the position in [input] at which to look for a key
   * @param writer the destination for the replacement text
   * @return the number of code points consumed from [input], or `0` when no key matches at [index]
   */
  @Throws(IOException::class)
  private fun escape(input: CharSequence, index: Int, writer: Writer): Int {
    // check if translation exists for the input at position index
    if (!prefixSet[input[index].code]) return 0
    // never look past the end of the input
    val max = minOf(longest, input.length - index)
    // implement greedy algorithm by trying maximum match first
    for (length in max downTo shortest) {
      val candidate = input.subSequence(index, index + length)
      val replacement = lookupMap[candidate.toString()] ?: continue
      writer.write(replacement)
      return codePointCount(candidate, 0, candidate.length)
    }
    return 0
  }

  /**
   * Escapes the whole of [input] into [writer], one code point (or one matched key) at a time.
   *
   * @param input the text to escape
   * @param writer the destination for the escaped text
   */
  @Throws(IOException::class)
  private fun escape(input: CharSequence, writer: Writer) {
    var pos = 0
    val len = input.length
    while (pos < len) {
      val consumed = escape(input, pos, writer)
      if (consumed == 0) {
        // inlined implementation of Character.toChars(Character.codePointAt(input, pos))
        // avoids allocating temp char arrays and duplicate checks
        val c1 = input[pos]
        writer.write(c1.code)
        pos++
        // copy the second half of a surrogate pair together with the first
        if (isHighSurrogate(c1) && pos < len) {
          val c2 = input[pos]
          if (isLowSurrogate(c2)) {
            writer.write(c2.code)
            pos++
          }
        }
        continue
      }
      // contract with translators is that they have to understand code points
      // and they just took care of a surrogate pair
      repeat(consumed) {
        pos += charCount(codePointAt(input, pos))
      }
    }
  }
}
/**
 * Maps each character that must be escaped to its HTML 4.0 named character entity reference.
 *
 * The table has three parts: the U+00A0..U+00FF (ISO 8859-1) range, the HTML 4.0 extended
 * entities (symbols, Greek letters, punctuation and similar), and the four basic markup
 * characters `"`, `&`, `<` and `>`.
 */
private val escapeLookup: Map<String, String> = mapOf(
  // ISO 8859-1
  "\u00A0" to "&nbsp;",
  "\u00A1" to "&iexcl;",
  "\u00A2" to "&cent;",
  "\u00A3" to "&pound;",
  "\u00A4" to "&curren;",
  "\u00A5" to "&yen;",
  "\u00A6" to "&brvbar;",
  "\u00A7" to "&sect;",
  "\u00A8" to "&uml;",
  "\u00A9" to "&copy;",
  "\u00AA" to "&ordf;",
  "\u00AB" to "&laquo;",
  "\u00AC" to "&not;",
  "\u00AD" to "&shy;",
  "\u00AE" to "&reg;",
  "\u00AF" to "&macr;",
  "\u00B0" to "&deg;",
  "\u00B1" to "&plusmn;",
  "\u00B2" to "&sup2;",
  "\u00B3" to "&sup3;",
  "\u00B4" to "&acute;",
  "\u00B5" to "&micro;",
  "\u00B6" to "&para;",
  "\u00B7" to "&middot;",
  "\u00B8" to "&cedil;",
  "\u00B9" to "&sup1;",
  "\u00BA" to "&ordm;",
  "\u00BB" to "&raquo;",
  "\u00BC" to "&frac14;",
  "\u00BD" to "&frac12;",
  "\u00BE" to "&frac34;",
  "\u00BF" to "&iquest;",
  "\u00C0" to "&Agrave;",
  "\u00C1" to "&Aacute;",
  "\u00C2" to "&Acirc;",
  "\u00C3" to "&Atilde;",
  "\u00C4" to "&Auml;",
  "\u00C5" to "&Aring;",
  "\u00C6" to "&AElig;",
  "\u00C7" to "&Ccedil;",
  "\u00C8" to "&Egrave;",
  "\u00C9" to "&Eacute;",
  "\u00CA" to "&Ecirc;",
  "\u00CB" to "&Euml;",
  "\u00CC" to "&Igrave;",
  "\u00CD" to "&Iacute;",
  "\u00CE" to "&Icirc;",
  "\u00CF" to "&Iuml;",
  "\u00D0" to "&ETH;",
  "\u00D1" to "&Ntilde;",
  "\u00D2" to "&Ograve;",
  "\u00D3" to "&Oacute;",
  "\u00D4" to "&Ocirc;",
  "\u00D5" to "&Otilde;",
  "\u00D6" to "&Ouml;",
  "\u00D7" to "&times;",
  "\u00D8" to "&Oslash;",
  "\u00D9" to "&Ugrave;",
  "\u00DA" to "&Uacute;",
  "\u00DB" to "&Ucirc;",
  "\u00DC" to "&Uuml;",
  "\u00DD" to "&Yacute;",
  "\u00DE" to "&THORN;",
  "\u00DF" to "&szlig;",
  "\u00E0" to "&agrave;",
  "\u00E1" to "&aacute;",
  "\u00E2" to "&acirc;",
  "\u00E3" to "&atilde;",
  "\u00E4" to "&auml;",
  "\u00E5" to "&aring;",
  "\u00E6" to "&aelig;",
  "\u00E7" to "&ccedil;",
  "\u00E8" to "&egrave;",
  "\u00E9" to "&eacute;",
  "\u00EA" to "&ecirc;",
  "\u00EB" to "&euml;",
  "\u00EC" to "&igrave;",
  "\u00ED" to "&iacute;",
  "\u00EE" to "&icirc;",
  "\u00EF" to "&iuml;",
  "\u00F0" to "&eth;",
  "\u00F1" to "&ntilde;",
  "\u00F2" to "&ograve;",
  "\u00F3" to "&oacute;",
  "\u00F4" to "&ocirc;",
  "\u00F5" to "&otilde;",
  "\u00F6" to "&ouml;",
  "\u00F7" to "&divide;",
  "\u00F8" to "&oslash;",
  "\u00F9" to "&ugrave;",
  "\u00FA" to "&uacute;",
  "\u00FB" to "&ucirc;",
  "\u00FC" to "&uuml;",
  "\u00FD" to "&yacute;",
  "\u00FE" to "&thorn;",
  "\u00FF" to "&yuml;",
  // HTML 4.0 Extended
  "\u0192" to "&fnof;",
  "\u0391" to "&Alpha;",
  "\u0392" to "&Beta;",
  "\u0393" to "&Gamma;",
  "\u0394" to "&Delta;",
  "\u0395" to "&Epsilon;",
  "\u0396" to "&Zeta;",
  "\u0397" to "&Eta;",
  "\u0398" to "&Theta;",
  "\u0399" to "&Iota;",
  "\u039A" to "&Kappa;",
  "\u039B" to "&Lambda;",
  "\u039C" to "&Mu;",
  "\u039D" to "&Nu;",
  "\u039E" to "&Xi;",
  "\u039F" to "&Omicron;",
  "\u03A0" to "&Pi;",
  "\u03A1" to "&Rho;",
  "\u03A3" to "&Sigma;",
  "\u03A4" to "&Tau;",
  "\u03A5" to "&Upsilon;",
  "\u03A6" to "&Phi;",
  "\u03A7" to "&Chi;",
  "\u03A8" to "&Psi;",
  "\u03A9" to "&Omega;",
  "\u03B1" to "&alpha;",
  "\u03B2" to "&beta;",
  "\u03B3" to "&gamma;",
  "\u03B4" to "&delta;",
  "\u03B5" to "&epsilon;",
  "\u03B6" to "&zeta;",
  "\u03B7" to "&eta;",
  "\u03B8" to "&theta;",
  "\u03B9" to "&iota;",
  "\u03BA" to "&kappa;",
  "\u03BB" to "&lambda;",
  "\u03BC" to "&mu;",
  "\u03BD" to "&nu;",
  "\u03BE" to "&xi;",
  "\u03BF" to "&omicron;",
  "\u03C0" to "&pi;",
  "\u03C1" to "&rho;",
  "\u03C2" to "&sigmaf;",
  "\u03C3" to "&sigma;",
  "\u03C4" to "&tau;",
  "\u03C5" to "&upsilon;",
  "\u03C6" to "&phi;",
  "\u03C7" to "&chi;",
  "\u03C8" to "&psi;",
  "\u03C9" to "&omega;",
  "\u03D1" to "&thetasym;",
  "\u03D2" to "&upsih;",
  "\u03D6" to "&piv;",
  "\u2022" to "&bull;",
  "\u2026" to "&hellip;",
  "\u2032" to "&prime;",
  "\u2033" to "&Prime;",
  "\u203E" to "&oline;",
  "\u2044" to "&frasl;",
  "\u2118" to "&weierp;",
  "\u2111" to "&image;",
  "\u211C" to "&real;",
  "\u2122" to "&trade;",
  "\u2135" to "&alefsym;",
  "\u2190" to "&larr;",
  "\u2191" to "&uarr;",
  "\u2192" to "&rarr;",
  "\u2193" to "&darr;",
  "\u2194" to "&harr;",
  "\u21B5" to "&crarr;",
  "\u21D0" to "&lArr;",
  "\u21D1" to "&uArr;",
  "\u21D2" to "&rArr;",
  "\u21D3" to "&dArr;",
  "\u21D4" to "&hArr;",
  "\u2200" to "&forall;",
  "\u2202" to "&part;",
  "\u2203" to "&exist;",
  "\u2205" to "&empty;",
  "\u2207" to "&nabla;",
  "\u2208" to "&isin;",
  "\u2209" to "&notin;",
  "\u220B" to "&ni;",
  "\u220F" to "&prod;",
  "\u2211" to "&sum;",
  "\u2212" to "&minus;",
  "\u2217" to "&lowast;",
  "\u221A" to "&radic;",
  "\u221D" to "&prop;",
  "\u221E" to "&infin;",
  "\u2220" to "&ang;",
  "\u2227" to "&and;",
  "\u2228" to "&or;",
  "\u2229" to "&cap;",
  "\u222A" to "&cup;",
  "\u222B" to "&int;",
  "\u2234" to "&there4;",
  "\u223C" to "&sim;",
  "\u2245" to "&cong;",
  "\u2248" to "&asymp;",
  "\u2260" to "&ne;",
  "\u2261" to "&equiv;",
  "\u2264" to "&le;",
  "\u2265" to "&ge;",
  "\u2282" to "&sub;",
  "\u2283" to "&sup;",
  "\u2284" to "&nsub;",
  "\u2286" to "&sube;",
  "\u2287" to "&supe;",
  "\u2295" to "&oplus;",
  "\u2297" to "&otimes;",
  "\u22A5" to "&perp;",
  "\u22C5" to "&sdot;",
  "\u2308" to "&lceil;",
  "\u2309" to "&rceil;",
  "\u230A" to "&lfloor;",
  "\u230B" to "&rfloor;",
  "\u2329" to "&lang;",
  "\u232A" to "&rang;",
  "\u25CA" to "&loz;",
  "\u2660" to "&spades;",
  "\u2663" to "&clubs;",
  "\u2665" to "&hearts;",
  "\u2666" to "&diams;",
  "\u0152" to "&OElig;",
  "\u0153" to "&oelig;",
  "\u0160" to "&Scaron;",
  "\u0161" to "&scaron;",
  "\u0178" to "&Yuml;",
  "\u02C6" to "&circ;",
  "\u02DC" to "&tilde;",
  "\u2002" to "&ensp;",
  "\u2003" to "&emsp;",
  "\u2009" to "&thinsp;",
  "\u200C" to "&zwnj;",
  "\u200D" to "&zwj;",
  "\u200E" to "&lrm;",
  "\u200F" to "&rlm;",
  "\u2013" to "&ndash;",
  "\u2014" to "&mdash;",
  "\u2018" to "&lsquo;",
  "\u2019" to "&rsquo;",
  "\u201A" to "&sbquo;",
  "\u201C" to "&ldquo;",
  "\u201D" to "&rdquo;",
  "\u201E" to "&bdquo;",
  "\u2020" to "&dagger;",
  "\u2021" to "&Dagger;",
  "\u2030" to "&permil;",
  "\u2039" to "&lsaquo;",
  "\u203A" to "&rsaquo;",
  "\u20AC" to "&euro;",
  // basic escapes
  "\"" to "&quot;",
  "&" to "&amp;",
  "<" to "&lt;",
  ">" to "&gt;"
)
/**
 * Shared escaper instance, created on first use.
 *
 * Constructing an [HtmlEscaper] copies the whole lookup table into a fresh map, so building one
 * per call is wasted work. Reuse is safe because its escape methods only read that state.
 */
private val sharedHtmlEscaper: HtmlEscaper by lazy { HtmlEscaper() }

/**
 * Escapes this string with [HtmlEscaper].
 *
 * @return the result of [HtmlEscaper.escape] applied to this string
 */
fun String.escapeHtml4(): String = sharedHtmlEscaper.escape(this)
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy