// All downloads are free. Search and download functionality uses the official Maven repository.

// com.jetbrains.pluginverifier.misc.HtmlEscaper.kt (Maven / Gradle / Ivy)

/*
 * Copyright 2000-2023 JetBrains s.r.o. and other contributors. Use of this source code is governed by the Apache 2.0 license that can be found in the LICENSE file.
 */

package com.jetbrains.pluginverifier.misc

import java.io.IOException
import java.io.StringWriter
import java.io.Writer
import java.lang.Character.*
import java.util.*

/**
 * HTML 4.0 String escaper.
 *
 * Code from Apache `commons-text`, adjusted for Kotlin and simplified.
 */
class HtmlEscaper {
    /**
     * The mapping to be used in translation.
     *
     * Explicit type arguments are required: the map starts empty, so the compiler has nothing to infer them from.
     */
    private val lookupMap = mutableMapOf<String, String>()

    /** The first character of each key in the lookupMap.  */
    private val prefixSet = BitSet()

    /** The length of the shortest key in the lookupMap.  */
    private val shortest: Int

    /** The length of the longest key in the lookupMap.  */
    private val longest: Int

    init {
        // Copy the file-level table into this instance and record the key-length bounds
        // plus the set of first characters, which drive the lookup in escape().
        var currentShortest = Int.MAX_VALUE
        var currentLongest = 0
        for ((key, value) in escapeLookup) {
            lookupMap[key.toString()] = value.toString()
            prefixSet.set(key[0].code)
            currentShortest = minOf(currentShortest, key.length)
            currentLongest = maxOf(currentLongest, key.length)
        }
        shortest = currentShortest
        longest = currentLongest
    }

    /**
     * Escapes [input] and returns the result as a new string.
     *
     * Every sequence that has an entry in the lookup table is replaced by its mapped value;
     * all other characters are copied through unchanged.
     *
     * @param input the text to escape
     * @return the escaped text
     */
    fun escape(input: CharSequence): String {
        // Pre-size the buffer at twice the input length to leave room for replacements.
        val writer = StringWriter(input.length * 2)
        escape(input, writer)
        return writer.toString()
    }

    /**
     * Tries to translate the sequence starting at [index] in [input].
     *
     * On a match the replacement is written to [writer].
     *
     * @return the number of code points consumed from [input], or `0` when nothing at [index] matches
     */
    @Throws(IOException::class)
    private fun escape(input: CharSequence, index: Int, writer: Writer): Int {
        // check if translation exists for the input at position index
        if (!prefixSet[input[index].code]) return 0

        // never look past the end of the input
        val max = minOf(longest, input.length - index)

        // implement greedy algorithm by trying maximum match first
        for (i in max downTo shortest) {
            val subSeq = input.subSequence(index, index + i)
            val result = lookupMap[subSeq.toString()]
            if (result != null) {
                writer.write(result)
                return codePointCount(subSeq, 0, subSeq.length)
            }
        }
        return 0
    }

    /**
     * Walks [input] from start to end, writing either the translation or the original
     * code point for each position to [writer].
     */
    @Throws(IOException::class)
    private fun escape(input: CharSequence, writer: Writer) {
        var pos = 0
        val len = input.length
        while (pos < len) {
            val consumed = escape(input, pos, writer)
            if (consumed == 0) {
                // inlined implementation of Character.toChars(Character.codePointAt(input, pos))
                // avoids allocating temp char arrays and duplicate checks
                val c1 = input[pos]
                writer.write(c1.code)
                pos++
                if (isHighSurrogate(c1) && pos < len) {
                    val c2 = input[pos]
                    if (isLowSurrogate(c2)) {
                        // keep the surrogate pair together
                        writer.write(c2.code)
                        pos++
                    }
                }
                continue
            }
            // `consumed` counts code points, not chars, so advance one code point at a time
            // to step over any surrogate pairs the translation covered
            repeat(consumed) {
                pos += charCount(codePointAt(input, pos))
            }
        }
    }
}

/**
 * Translation table used by the HTML 4.0 escaper in this file.
 *
 * Each key is a single character and each value is the named HTML 4 character entity reference
 * (`&name;`) that replaces it. The table has three groups: ISO-8859-1 characters, the HTML 4.0
 * extended set, and the basic markup characters `"`, `&`, `<` and `>`.
 */
private val escapeLookup: Map<String, String> = mapOf(
  // ISO-8859-1
  "\u00A0" to "&nbsp;",
  "\u00A1" to "&iexcl;",
  "\u00A2" to "&cent;",
  "\u00A3" to "&pound;",
  "\u00A4" to "&curren;",
  "\u00A5" to "&yen;",
  "\u00A6" to "&brvbar;",
  "\u00A7" to "&sect;",
  "\u00A8" to "&uml;",
  "\u00A9" to "&copy;",
  "\u00AA" to "&ordf;",
  "\u00AB" to "&laquo;",
  "\u00AC" to "&not;",
  "\u00AD" to "&shy;",
  "\u00AE" to "&reg;",
  "\u00AF" to "&macr;",
  "\u00B0" to "&deg;",
  "\u00B1" to "&plusmn;",
  "\u00B2" to "&sup2;",
  "\u00B3" to "&sup3;",
  "\u00B4" to "&acute;",
  "\u00B5" to "&micro;",
  "\u00B6" to "&para;",
  "\u00B7" to "&middot;",
  "\u00B8" to "&cedil;",
  "\u00B9" to "&sup1;",
  "\u00BA" to "&ordm;",
  "\u00BB" to "&raquo;",
  "\u00BC" to "&frac14;",
  "\u00BD" to "&frac12;",
  "\u00BE" to "&frac34;",
  "\u00BF" to "&iquest;",
  "\u00C0" to "&Agrave;",
  "\u00C1" to "&Aacute;",
  "\u00C2" to "&Acirc;",
  "\u00C3" to "&Atilde;",
  "\u00C4" to "&Auml;",
  "\u00C5" to "&Aring;",
  "\u00C6" to "&AElig;",
  "\u00C7" to "&Ccedil;",
  "\u00C8" to "&Egrave;",
  "\u00C9" to "&Eacute;",
  "\u00CA" to "&Ecirc;",
  "\u00CB" to "&Euml;",
  "\u00CC" to "&Igrave;",
  "\u00CD" to "&Iacute;",
  "\u00CE" to "&Icirc;",
  "\u00CF" to "&Iuml;",
  "\u00D0" to "&ETH;",
  "\u00D1" to "&Ntilde;",
  "\u00D2" to "&Ograve;",
  "\u00D3" to "&Oacute;",
  "\u00D4" to "&Ocirc;",
  "\u00D5" to "&Otilde;",
  "\u00D6" to "&Ouml;",
  "\u00D7" to "&times;",
  "\u00D8" to "&Oslash;",
  "\u00D9" to "&Ugrave;",
  "\u00DA" to "&Uacute;",
  "\u00DB" to "&Ucirc;",
  "\u00DC" to "&Uuml;",
  "\u00DD" to "&Yacute;",
  "\u00DE" to "&THORN;",
  "\u00DF" to "&szlig;",
  "\u00E0" to "&agrave;",
  "\u00E1" to "&aacute;",
  "\u00E2" to "&acirc;",
  "\u00E3" to "&atilde;",
  "\u00E4" to "&auml;",
  "\u00E5" to "&aring;",
  "\u00E6" to "&aelig;",
  "\u00E7" to "&ccedil;",
  "\u00E8" to "&egrave;",
  "\u00E9" to "&eacute;",
  "\u00EA" to "&ecirc;",
  "\u00EB" to "&euml;",
  "\u00EC" to "&igrave;",
  "\u00ED" to "&iacute;",
  "\u00EE" to "&icirc;",
  "\u00EF" to "&iuml;",
  "\u00F0" to "&eth;",
  "\u00F1" to "&ntilde;",
  "\u00F2" to "&ograve;",
  "\u00F3" to "&oacute;",
  "\u00F4" to "&ocirc;",
  "\u00F5" to "&otilde;",
  "\u00F6" to "&ouml;",
  "\u00F7" to "&divide;",
  "\u00F8" to "&oslash;",
  "\u00F9" to "&ugrave;",
  "\u00FA" to "&uacute;",
  "\u00FB" to "&ucirc;",
  "\u00FC" to "&uuml;",
  "\u00FD" to "&yacute;",
  "\u00FE" to "&thorn;",
  "\u00FF" to "&yuml;",
  // HTML 4.0 Extended
  "\u0192" to "&fnof;",
  "\u0391" to "&Alpha;",
  "\u0392" to "&Beta;",
  "\u0393" to "&Gamma;",
  "\u0394" to "&Delta;",
  "\u0395" to "&Epsilon;",
  "\u0396" to "&Zeta;",
  "\u0397" to "&Eta;",
  "\u0398" to "&Theta;",
  "\u0399" to "&Iota;",
  "\u039A" to "&Kappa;",
  "\u039B" to "&Lambda;",
  "\u039C" to "&Mu;",
  "\u039D" to "&Nu;",
  "\u039E" to "&Xi;",
  "\u039F" to "&Omicron;",
  "\u03A0" to "&Pi;",
  "\u03A1" to "&Rho;",
  "\u03A3" to "&Sigma;",
  "\u03A4" to "&Tau;",
  "\u03A5" to "&Upsilon;",
  "\u03A6" to "&Phi;",
  "\u03A7" to "&Chi;",
  "\u03A8" to "&Psi;",
  "\u03A9" to "&Omega;",
  "\u03B1" to "&alpha;",
  "\u03B2" to "&beta;",
  "\u03B3" to "&gamma;",
  "\u03B4" to "&delta;",
  "\u03B5" to "&epsilon;",
  "\u03B6" to "&zeta;",
  "\u03B7" to "&eta;",
  "\u03B8" to "&theta;",
  "\u03B9" to "&iota;",
  "\u03BA" to "&kappa;",
  "\u03BB" to "&lambda;",
  "\u03BC" to "&mu;",
  "\u03BD" to "&nu;",
  "\u03BE" to "&xi;",
  "\u03BF" to "&omicron;",
  "\u03C0" to "&pi;",
  "\u03C1" to "&rho;",
  "\u03C2" to "&sigmaf;",
  "\u03C3" to "&sigma;",
  "\u03C4" to "&tau;",
  "\u03C5" to "&upsilon;",
  "\u03C6" to "&phi;",
  "\u03C7" to "&chi;",
  "\u03C8" to "&psi;",
  "\u03C9" to "&omega;",
  "\u03D1" to "&thetasym;",
  "\u03D2" to "&upsih;",
  "\u03D6" to "&piv;",
  "\u2022" to "&bull;",
  "\u2026" to "&hellip;",
  "\u2032" to "&prime;",
  "\u2033" to "&Prime;",
  "\u203E" to "&oline;",
  "\u2044" to "&frasl;",
  "\u2118" to "&weierp;",
  "\u2111" to "&image;",
  "\u211C" to "&real;",
  "\u2122" to "&trade;",
  "\u2135" to "&alefsym;",
  "\u2190" to "&larr;",
  "\u2191" to "&uarr;",
  "\u2192" to "&rarr;",
  "\u2193" to "&darr;",
  "\u2194" to "&harr;",
  "\u21B5" to "&crarr;",
  "\u21D0" to "&lArr;",
  "\u21D1" to "&uArr;",
  "\u21D2" to "&rArr;",
  "\u21D3" to "&dArr;",
  "\u21D4" to "&hArr;",
  "\u2200" to "&forall;",
  "\u2202" to "&part;",
  "\u2203" to "&exist;",
  "\u2205" to "&empty;",
  "\u2207" to "&nabla;",
  "\u2208" to "&isin;",
  "\u2209" to "&notin;",
  "\u220B" to "&ni;",
  "\u220F" to "&prod;",
  "\u2211" to "&sum;",
  "\u2212" to "&minus;",
  "\u2217" to "&lowast;",
  "\u221A" to "&radic;",
  "\u221D" to "&prop;",
  "\u221E" to "&infin;",
  "\u2220" to "&ang;",
  "\u2227" to "&and;",
  "\u2228" to "&or;",
  "\u2229" to "&cap;",
  "\u222A" to "&cup;",
  "\u222B" to "&int;",
  "\u2234" to "&there4;",
  "\u223C" to "&sim;",
  "\u2245" to "&cong;",
  "\u2248" to "&asymp;",
  "\u2260" to "&ne;",
  "\u2261" to "&equiv;",
  "\u2264" to "&le;",
  "\u2265" to "&ge;",
  "\u2282" to "&sub;",
  "\u2283" to "&sup;",
  "\u2284" to "&nsub;",
  "\u2286" to "&sube;",
  "\u2287" to "&supe;",
  "\u2295" to "&oplus;",
  "\u2297" to "&otimes;",
  "\u22A5" to "&perp;",
  "\u22C5" to "&sdot;",
  "\u2308" to "&lceil;",
  "\u2309" to "&rceil;",
  "\u230A" to "&lfloor;",
  "\u230B" to "&rfloor;",
  "\u2329" to "&lang;",
  "\u232A" to "&rang;",
  "\u25CA" to "&loz;",
  "\u2660" to "&spades;",
  "\u2663" to "&clubs;",
  "\u2665" to "&hearts;",
  "\u2666" to "&diams;",
  "\u0152" to "&OElig;",
  "\u0153" to "&oelig;",
  "\u0160" to "&Scaron;",
  "\u0161" to "&scaron;",
  "\u0178" to "&Yuml;",
  "\u02C6" to "&circ;",
  "\u02DC" to "&tilde;",
  "\u2002" to "&ensp;",
  "\u2003" to "&emsp;",
  "\u2009" to "&thinsp;",
  "\u200C" to "&zwnj;",
  "\u200D" to "&zwj;",
  "\u200E" to "&lrm;",
  "\u200F" to "&rlm;",
  "\u2013" to "&ndash;",
  "\u2014" to "&mdash;",
  "\u2018" to "&lsquo;",
  "\u2019" to "&rsquo;",
  "\u201A" to "&sbquo;",
  "\u201C" to "&ldquo;",
  "\u201D" to "&rdquo;",
  "\u201E" to "&bdquo;",
  "\u2020" to "&dagger;",
  "\u2021" to "&Dagger;",
  "\u2030" to "&permil;",
  "\u2039" to "&lsaquo;",
  "\u203A" to "&rsaquo;",
  "\u20AC" to "&euro;",
  // basic escapes
  "\"" to "&quot;",
  "&" to "&amp;",
  "<" to "&lt;",
  ">" to "&gt;"
)

/**
 * Escaper shared by every [escapeHtml4] call, created on first use.
 *
 * [HtmlEscaper] fills its lookup structures in its initializer and only reads them afterwards,
 * so one instance is reused instead of rebuilding the tables on each call.
 */
private val sharedHtmlEscaper: HtmlEscaper by lazy { HtmlEscaper() }

/**
 * Returns a copy of this string escaped by [HtmlEscaper.escape].
 *
 * @return the escaped string
 */
fun String.escapeHtml4(): String = sharedHtmlEscaper.escape(this)




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy