// All Downloads are FREE. Search and download functionalities are using the official Maven repository.

// commonMain.ure.UreCore.kt Maven / Gradle / Ivy

package pl.mareklangiewicz.ure.core


import pl.mareklangiewicz.annotations.*
import pl.mareklangiewicz.bad.*
import pl.mareklangiewicz.text.*
import pl.mareklangiewicz.ure.*
import kotlin.jvm.JvmInline
import kotlin.text.RegexOption.*



/**
 * IR ("intermediate representation") is the traditional regular expression text.
 * It is not meant to be read by humans; [toString] returns the wrapped [str] unchanged.
 */
@JvmInline
value class IR @DelicateApi internal constructor(val str: String) {
    override fun toString(): String {
        return str
    }
}


/** Wraps this string, as is, in [IR]. */
@OptIn(DelicateApi::class)
private val String.asIR: IR
    get() = IR(this)

/**
 * Ure (Micro Regular Expressions) - general info:
 * Multiplatform Kotlin frontend / DSL for regular expressions. Traditional regular expressions are used only
 * as [IR] (intermediate representation), which is then compiled to the standard kotlin.text.Regex.
 * The developer builds regular expressions with a nice DSL instead of writing them by hand.
 *
 * Reference links to RE engines/backends docs, etc.:
 * https://kotlinlang.org/api/latest/jvm/stdlib/kotlin.text/-regex/
 * https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html
 * https://docs.oracle.com/javase/tutorial/essential/regex/quant.html
 * https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp
 * https://www.w3schools.com/jsref/jsref_obj_regexp.asp
 * https://www.regular-expressions.info/ (comprehensive information, notes about different implementations)
 * https://regexr.com/
 * https://regex101.com/ (nice but closed source)
 */
sealed interface Ure {

    /** Generates the [IR] for this [Ure]. */
    fun toIR(): IR

    /**
     * Generates [IR] that is safe to combine with quantifiers, alternations, etc.
     * A non-capturing group is added around the content, but only when needed.
     * For example, [UreConcatenation] with more than one element is wrapped.
     * (UreConcatenation with zero elements is wrapped too - so, for example, an external quantifier
     * applies only to the empty concatenation.)
     */
    fun toClosedIR(): IR

    /**
     * Compiles this [Ure] with the default options: only MULTILINE is set.
     * DOT_MATCHES_ALL is deliberately NOT set, so the ".": [chAnyInLine] stays a single line matcher.
     * Use explicit [chAnyAtAll] instead of changing the "." meaning all the time.
     * IMPORTANT:
     * All normal val/fun ureSth... assume that DOT_MATCHES_ALL is DISABLED, and MULTILINE is ENABLED,
     * so there is no need to enable/disable it all the time "just to make sure".
     */
    @OptIn(DelicateApi::class, NotPortableApi::class)
    fun compile(): Regex = compileWithOptions(MULTILINE)

    /** Compiles the [IR] of this [Ure] to [Regex] with exactly the given [options]. */
    @DelicateApi("Usually code using Ure assumes default options, so changing options can create hard to find issues.")
    @NotPortableApi("Some options work only on some platforms. Check docs for each used platform.")
    fun compileWithOptions(vararg options: RegexOption): Regex {
        val pattern = toIR().str
        return Regex(pattern, options.toSet())
    }
}

/**
 * Marker interface for [Ure] kinds that are not capturing groups themselves.
 * https://www.regular-expressions.info/brackets.html
 */
sealed interface UreNonCapturing: Ure

/**
 * Marker interface for capturing [Ure] kinds (see [UreNumbered] and [UreNamed]).
 * https://www.regular-expressions.info/brackets.html
 */
sealed interface UreCapturing: Ure

/**
 * Marker interface for capturing [Ure] kinds that get a group number (implemented by [UreNumberedGroup]).
 * https://www.regular-expressions.info/brackets.html
 */
sealed interface UreNumbered: UreCapturing

/**
 * Marker interface for named capturing [Ure] kinds (implemented by [UreNamedGroup]).
 * A named group is also automatically numbered (in most implementations),
 * but different regex implementations can number them differently.
 * So it is better not to mix [UreNamedGroup] with [UreNumberedGroup] in one [Ure],
 * and to choose one way of capturing instead. See "Numbers for Named Capturing Groups" here:
 * https://www.regular-expressions.info/named.html
 */
sealed interface UreNamed: UreNumbered

/**
 * The first way it matches becomes the only way (backtracking info gets removed if there was any).
 * So there is no trying whether it could eat more or fewer chars (in a particular place) when something later failed.
 * [UreAtomic] only means that this [Ure] is known to be atomic. Some others can also be atomic in practice.
 * https://www.regular-expressions.info/atomic.html
 */
sealed interface UreAtomic: UreNonCapturing

/**
 * Marker interface for anchors (implemented by [UreAnchorPreDef] and [UreLookGroup]).
 * https://www.regular-expressions.info/anchors.html
 */
sealed interface UreAnchor: UreAtomic

/**
 * Also known as a character set: https://www.regular-expressions.info/charclass.html
 * Note:
 *   It can sometimes match more than one char technically.
 *   For example, most (all?) emoji "code points" take two "code units" (16b chars).
 *   Such 32b encoding is also called "surrogate pair".
 */
sealed interface UreCharClass: UreAtomic {
    /**
     * Generates the [IR] to be used when this char class is placed inside another char class.
     * It can differ from [toIR]: implementations may escape different characters or drop the outer brackets.
     */
    fun toIRInCharClass(): IR
}

/**
 * Sequence of [Ure] tokens matched one after another.
 * It is also the receiver of the building DSL: `+ure` adds a token, and `x(..) of ure` / `1..3 of ure`
 * add a quantified token.
 */
@JvmInline
value class UreConcatenation internal constructor(val tokens: MutableList<Ure> = mutableListOf()) : UreNonCapturing {
    // TODO_someday: make tokens publicly List, when kotlin have this feature:
    // https://youtrack.jetbrains.com/issue/KT-14663/Support-having-a-public-and-a-private-type-for-the-same-property

    private val debugWithClosedTokens: Boolean get() = false
        // In case of difficult issues: try to temporarily change it to true see if matching changes somehow.

    /**
     * Joins the IR of all tokens. Alternations and raw IR tokens are always added in their "closed" form,
     * so they can not leak into neighbouring tokens.
     */
    override fun toIR(): IR = when (tokens.size) {
        0 -> "".asIR
        else -> tokens.joinToString("") {
            if (it is UreAlternation || it is UreWithRawIR || debugWithClosedTokens) it.toClosedIR().str else it.toIR().str
        }.asIR
    }

    /** A single token is closed by itself; any other number of tokens (also zero) gets a non-capturing group. */
    override fun toClosedIR() = when (tokens.size) {
        1 -> tokens[0].toClosedIR()
        else -> this.groupNonCapt().toIR() // In case 0, we also want to wrap it in groupNonCapt!
        // To avoid issues when outside operator captures something else instead of empty concatenation.
        // I decided NOT to throw IllegalStateError in case 0, so we can always monitor IR in half-baked UREs.
        // (Like when creating UREs with some @Composable UI)
    }

    // Can't decide if this syntax is better in case of "1 of ..."; let's leave it for now.
    // TODO_later: rethink syntax when context receivers become multiplatform.
    //   Maybe somehow force '+' in other cases too, but I don't want to force some syntax with additional parentheses.
    /** Appends this [Ure] to [tokens]. */
    operator fun Ure.unaryPlus() { tokens.add(this) }

    /** Quantifier description used by the `x(..) of ..` syntax. */
    class UreX internal constructor(val times: IntRange, val reluctant: Boolean, val possessive: Boolean)

    fun x(times: IntRange, reluctant: Boolean = false, possessive: Boolean = false) = UreX(times, reluctant, possessive)
    fun x(times: Int) = x(times..times)

    /** Appends [ure] quantified according to this [UreX]. */
    infix fun UreX.of(ure: Ure) {
        tokens.add(ure.times(times, reluctant, possessive))
    }

    /** Appends the [Ure] built by [init], quantified according to this [UreX]. */
    infix fun UreX.of(init: UreConcatenation.() -> Unit) {
        this of ure(init = init)
    }

    infix fun IntRange.of(ure: Ure) = x(this) of ure
    infix fun Int.of(ure: Ure) = x(this) of ure
    infix fun IntRange.of(init: UreConcatenation.() -> Unit) = x(this) of init
    infix fun Int.of(init: UreConcatenation.() -> Unit) = x(this) of init
}

/** Matches either [first] or [second]. Both sides are emitted in their closed form, separated by "|". */
data class UreAlternation internal constructor(val first: Ure, val second: Ure) : UreNonCapturing {
    override fun toIR(): IR {
        val left = first.toClosedIR().str
        val right = second.toClosedIR().str
        return (left + "|" + right).asIR
    }

    // The alternation itself is closed by wrapping it in a non-capturing group.
    override fun toClosedIR() = groupNonCapt().toIR()
}

/** Common base of all groups: the IR is always "(" + [typeIR] + IR of [content] + ")". */
sealed interface UreGroup : Ure {
    val content: Ure

    /** Not a full IR - just the part (directly after the opening parenthesis) that signifies the type of group. */
    val typeIR: IR

    // It looks like all possible typeIR prefixes cannot be confused with first content IR characters.
    // (meaning: RE designers thought about it, so I don't have to be extra careful here.)
    override fun toIR(): IR {
        val prefix = typeIR.str
        val body = content.toIR().str
        return "($prefix$body)".asIR
    }

    /** Group is always "closed" - it has parentheses outside. */
    override fun toClosedIR(): IR = toIR()
}

/** Capturing group with a [name]; its IR is "(?<name>content)". */
data class UreNamedGroup internal constructor(override val content: Ure, val name: String) : UreGroup, UreNamed {
    override val typeIR: IR
        get() = "?<$name>".asIR
}

/** Group that only wraps [content] and does not capture; its IR is "(?:content)". */
@JvmInline
value class UreNonCapturingGroup internal constructor(override val content: Ure) : UreGroup, UreNonCapturing {
    override val typeIR: IR
        get() = "?:".asIR
}

/** Plain capturing group; it has no type prefix, so its IR is "(content)". */
@JvmInline
value class UreNumberedGroup internal constructor(override val content: Ure) : UreGroup, UreNumbered {
    override val typeIR: IR
        get() = "".asIR
}


/**
 * Atomic group; its IR is "(?>content)".
 * https://www.regular-expressions.info/atomic.html
 */
@JvmInline
@NotPortableApi("Does NOT even compile (Ure.compile) on JS.")
value class UreAtomicGroup internal constructor(override val content: Ure) : UreGroup, UreAtomic {
    override val typeIR: IR
        get() = "?>".asIR
}

/**
 * Common base for [Ure]s that change regex options.
 * It holds the sets of options to [enable] and to [disable], and renders them as inline flag letters
 * (see [optionsCode]), for example "i-m".
 */
sealed class UreChangeOptions @DelicateApi @NotPortableApi protected constructor(
): UreNonCapturing {
    abstract val enable: Set<RegexOption>
    abstract val disable: Set<RegexOption>

    // run it in init of final class
    /** Requires that at least one option is provided and that no option is both enabled and disabled. */
    protected fun reqCorrectOptions() {
        req((enable intersect disable).isEmpty()) { "Can not enable and disable the same option at the same time" }
        req(enable.isNotEmpty() || disable.isNotEmpty()) { "No options provided" }
    }

    /** Inline flag letter of a single option; unsupported options fail via [bad]. */
    @Suppress("GrazieInspection")
    protected val RegexOption.code
        get() = when (this) {
            // Note: Kotlin stdlib RegexOption will probably evolve, so I'll enable more options here in the future.
            // See also: https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html
            IGNORE_CASE -> "i"
            MULTILINE -> "m"
            // LITERAL -> bad { "Looks like not even supported by kotlin stdlib RegexOption in common code." }
            // UNIX_LINES -> "d" // bad { "Looks like not even supported by kotlin stdlib RegexOption in common code." }
            // COMMENTS -> "x" // bad { "not really supported... maybe in UreIR, but I wouldn't use it." }
            // DOT_MATCHES_ALL -> "s" // bad { "Looks like not even supported by kotlin stdlib RegexOption in common code." }
            // CANON_EQ -> bad { "Looks like not even supported by kotlin stdlib RegexOption." }
            // UNICODE_CASE -> "u" // bad { "Looks like not even supported by kotlin stdlib RegexOption in common code." }
            else -> bad { "RegexOption: $this is not supported." }
        }

    private val oec get() = enable.code // letters of enabled options
    private val odc get() = disable.code.let { if (it.isEmpty()) it else "-$it" } // "-" + letters of disabled options
    private val Set<RegexOption>.code get() = joinToString("") { it.code }

    /** Letters of enabled options, followed by "-" and letters of disabled options (if there are any). */
    protected val optionsCode get() = "$oec$odc"
}

/**
 * Group which changes regex options only for its [content]; its IR is "(?flags:content)",
 * where flags are the letters of [enable]d options, then "-" and letters of [disable]d options (if any).
 * Options are validated at construction time.
 */
data class UreChangeOptionsGroup @DelicateApi @NotPortableApi internal constructor(
    override val content: Ure,
    override val enable: Set<RegexOption> = emptySet(),
    override val disable: Set<RegexOption> = emptySet(),
) : UreChangeOptions(), UreGroup {
    init { reqCorrectOptions() }
    override val typeIR get() = "?$optionsCode:".asIR
}

/**
 * Changes regex options ([RegexOption]) from this point ahead. Very problematic construct.
 * It is much safer to use [UreChangeOptionsGroup] instead of [UreChangeOptionsAhead].
 * Or even safer not to change options at all, so all [Ure]s are interpreted the same way.
 * Its IR is "(?flags)"; options are validated at construction time.
 */
@DelicateApi("Makes the whole Ure very difficult to analyze.", ReplaceWith("UreChangeOptionsGroup"))
@SecondaryApi("Use UreChangeOptionsGroup", ReplaceWith("UreChangeOptionsGroup"))
@NotPortableApi("Does NOT even compile (Ure.compile) on JS.", ReplaceWith("UreChangeOptionsGroup"))
data class UreChangeOptionsAhead internal constructor(
    override val enable: Set<RegexOption> = emptySet(),
    override val disable: Set<RegexOption> = emptySet(),
) : UreChangeOptions(), UreNonCapturing {
    init { reqCorrectOptions() }
    override fun toClosedIR(): IR = toIR() // already has parentheses outside
    override fun toIR(): IR = "(?$optionsCode)".asIR
}


// Note: For delicate/not portable reasons, see
//   https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Lookbehind_assertion#description
//   search: "This behavior is reasonable...Therefore, it starts... Regexes in some other languages forbid..."
/**
 * Lookaround group. The group type is selected by [ahead] and [positive]:
 * lookahead "(?=...)", negative lookahead "(?!...)", lookbehind "(?<=...)", negative lookbehind "(?<!...)".
 * The [not] operator returns the same group with [positive] flipped.
 */
data class UreLookGroup @DelicateApi @NotPortableApi internal constructor(
    override val content: Ure,
    val ahead: Boolean = true,
    val positive: Boolean = true,
) : UreGroup, UreAnchor {
    override val typeIR
        get() = when {
            ahead && positive -> "?="
            ahead -> "?!"
            positive -> "?<="
            else -> "?<!"
        }.asIR
    @OptIn(DelicateApi::class, NotPortableApi::class)
    operator fun not() = UreLookGroup(content, ahead, !positive)
}


/**
 * Backreference to a capturing group: either by number ("\nr") or by name ("\k<name>").
 * Exactly one of [nr] and [name] has to be provided.
 */
data class UreGroupRef internal constructor(val nr: Int? = null, val name: String? = null) : UreAtomic {
    init {
        if (nr != null && name != null) bad { "Can not reference capturing group by both nr ($nr) and name ($name)" }
        if (nr == null && name == null) bad { "Either nr or name has to be provided for the group reference" }
    }

    override fun toIR(): IR = when {
        nr != null -> "\\$nr"
        else -> "\\k<$name>"
    }.asIR

    override fun toClosedIR(): IR = toIR()
}

/**
 * Repeats [content] the given number of [times].
 * By default, it's "greedy" - tries to match as many "times" as possible, but backs off one by one when about to fail.
 * @param times - Uses shorter notation when appropriate, like: 0..1 -> "?"; 0..MAX -> "*"; 1..MAX -> "+"
 * @param reluctant - Tries to eat as little "times" as possible. Opposite to default "greedy" behavior.
 * @param possessive - It's like more greedy than default greedy. Never backtracks - fails instead. Just as [UreAtomicGroup].
 */
data class UreQuantifier internal constructor(
    val content: Ure,
    val times: IntRange,
    val reluctant: Boolean = false,
    val possessive: Boolean = false,
) : UreNonCapturing {
    init {
        if (reluctant && possessive) bad { "UreQuantifier can't be reluctant and possessive at the same time" }
    }

    val greedy get() = !(reluctant || possessive)

    override fun toIR(): IR {
        // Exactly once: no quantifier at all, just the content as is.
        if (times == 1..1) return content.toIR()

        val min = times.first
        val max = times.last
        val timesStr = when {
            times == 0..1 -> "?"
            times == 0..MAX -> "*"
            times == 1..MAX -> "+"
            max == min -> "{$min}"
            max == MAX -> "{$min,}" // Note: skipping min is not implicit 0, it's an incorrect syntax.
            else -> "{$min,$max}"
        }
        val suffixStr = when {
            reluctant -> "?"
            possessive -> "+"
            else -> "" // greedy (init guarantees that reluctant and possessive are never both set)
        }
        return (content.toClosedIR().str + timesStr + suffixStr).asIR
    }

    // Has to be wrapped, because stacking quantifiers doesn't compile in different cases, especially on JS
    // (see TestUreQuantifiersEtc.kt: "dangling quantifiers", etc.)
    // TODO_someday: Optimize: carefully multiply min and max when content is also UreQuantifier
    override fun toClosedIR() = groupNonCapt().toIR()
}

/**
 * Represents exactly one character (code point in Unicode). Will be automatically escaped if needed.
 * @param str can contain more than one jvm char in cases when one codepoint in utf16 takes more than one char,
 * but it does not accept regexes representing special characters, like "\\t", or "\\n" - use single backslash,
 * so kotlin compiler changes "\n" into actual newline character, etc;
 * UreCharExact.toIR() will recreate necessary regex (like \n or \x{hhhhh}) for weird characters.
 * Only surrogate pair case is not portable. It compiles to \x{hhhhh} which works only on JVM.
 * Note: On JS there is \u{hhhhh} syntax instead of \x{hhhhh},
 * but I really don't want to create different IR for different platforms.
 * It all should be the same common implementation, except actual regex matching (which is outside Ure).
 * (usecases like: tool on website creating IR to copy&paste to different places)
 */
@JvmInline value class UreCharExact @NotPortableApi internal constructor(val str: String) : UreCharClass {
    init {
        req(str.isNotEmpty()) { "Empty char point." }
        req(str.isSingleUnicodeCharacter) { "Looks like more than one char point." }
    }
    override fun toClosedIR() = toIR()
    override fun toIR(): IR = toIR(justQuote = str[0].isMeta)
    override fun toIRInCharClass(): IR = toIR(justQuote = str[0].isMetaInCharClass)

    /** @param justQuote when true, the character is meta in the current context, so it only gets a backslash. */
    private fun toIR(justQuote: Boolean): IR {
        if (justQuote) return "\\$str".asIR
        // Below, we know it's not meta-like character in this context.
        val encoded = when (str) {
            "\t" -> "\\t" // tab
            "\n" -> "\\n" // newline
            "\r" -> "\\r" // carriage-return
            "\u000C" -> "\\f" // form-feed
            "\u0007" -> "\\a" // alert bell
            "\u001B" -> "\\e" // escape
            // Note: other ascii control chars are encoded by encodedCodePoint as "\\x$hex" which is fine.
            else ->
                // TODO_someday: make sure all ascii printable are fine (we already checked it's not meta in this context).
                if (str.length == 1 && str[0].isAsciiPrintable) str else encodedCodePoint()
        }
        return encoded.asIR
    }

    /** Hex escape of the code point: \xhh below 0x100, \uhhhh below 0x10000, \x{h...} otherwise. */
    @OptIn(ExperimentalStdlibApi::class)
    private fun encodedCodePoint(): String {
        val p = str.toSingleCodePoint()
        return when {
            p < 0x100 -> "\\x${p.toUByte().toHexString()}" // ascii control chars are also represented this way
            p < 0x10000 -> "\\u${p.toUShort().toHexString()}"
            else -> "\\x{${p.toHexString(HexFormat { number { removeLeadingZeros = true } })}}"
        }
    }

}

/** True for characters with special meaning outside character classes: https://www.regular-expressions.info/characters.html */
private val Char.isMeta: Boolean
    get() = "\\[].^\$?*+{}|()".indexOf(this) >= 0

/** True for characters with special meaning inside character classes: https://www.regular-expressions.info/charclass.html */
private val Char.isMetaInCharClass: Boolean
    get() = "\\[]^-".indexOf(this) >= 0

/**
 * Predefined anchor identified by a single character [name]: one of ^ $ b B A G Z z.
 * The ^ and $ are emitted as is; all others are prefixed with a backslash.
 * https://www.regular-expressions.info/anchors.html
 * https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#bounds
 */
@JvmInline value class UreAnchorPreDef @NotPortableApi internal constructor(val name: Char) : UreAnchor {
    init { req(name.isNameOfAnchorPreDef) { "Incorrect name of predefined anchor: $name" } }

    override fun toIR(): IR = when (name) {
        '^', '$' -> "$name"
        else -> "\\$name"
    }.asIR

    override fun toClosedIR(): IR = toIR()

    /** Only the b and B anchors can be negated (by switching the case of the name). */
    @OptIn(NotPortableApi::class) operator fun not(): UreAnchorPreDef = when (name) {
        'b', 'B' -> UreAnchorPreDef(name.switchCase())
        else -> bad { "The anchor: $name can't be negated." }
    }

    companion object {
        private val Char.isNameOfAnchorPreDef get() = this in "^\$bBAGZz"
    }
}


/**
 * Predefined character class (also known as shorthand character set),
 * identified by a single character [name]: one of . d D h H s S v V w W.
 * The dot is emitted as is; all others are prefixed with a backslash.
 * https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#predef
 * https://www.regular-expressions.info/shorthand.html
 */
@JvmInline value class UreCharClassPreDef @DelicateApi internal constructor(val name: Char) : UreCharClass {
    init { req(name.isNameOfPreDefCC) { "Incorrect name of predefined character class: $name" } }

    override fun toIR(): IR = when (name) {
        '.' -> "$name"
        else -> "\\$name"
    }.asIR

    override fun toClosedIR(): IR = toIR()
    override fun toIRInCharClass(): IR = toIR()

    /** Negates by switching the case of the name; the dot can not be negated. */
    @OptIn(DelicateApi::class) operator fun not(): UreCharClassPreDef = when (name) {
        '.' -> bad { "The chAnyInLine can't be negated." }
        else -> UreCharClassPreDef(name.switchCase())
    }

    companion object {
        private val Char.isNameOfPreDefCC get() = this in ".dDhHsSvVwW"
    }
}


/**
 * Union of character classes: matches a char matched by any of the [tokens] (or by none of them when not [positive]).
 * Requires at least one token. A positive union of a single token is emitted without any brackets.
 */
data class UreCharClassUnion @NotPortableApi internal constructor(val tokens: List<UreCharClass>, val positive: Boolean = true) : UreCharClass {
    init { req(tokens.isNotEmpty()) { "No tokens in UreCharClassUnion." } }
    override fun toIR(): IR = if (tokens.size == 1 && positive) tokens[0].toIR()
        else tokens.joinToString("", if (positive) "[" else "[^", "]") { it.toIRInCharClass().str }.asIR
    override fun toClosedIR(): IR = toIR()
    // Inside another char class, a positive union drops its brackets; a negative one keeps "[^...]".
    override fun toIRInCharClass(): IR = tokens.joinToString("", if (positive) "" else "[^", if (positive) "" else "]") { it.toIRInCharClass().str }.asIR
    @OptIn(NotPortableApi::class)
    operator fun not() = UreCharClassUnion(tokens, !positive)
}

// TODO_later: analyze if some special kotlin progression/range would fit here better
/**
 * Range of characters: "[from-to]", or "[^from-to]" when not [positive].
 * Inside another char class, a positive range drops its brackets; a negative one keeps them.
 */
data class UreCharClassRange @NotPortableApi constructor(val from: UreCharClass, val to: UreCharClass, val positive: Boolean = true) : UreCharClass {
    // The part between the brackets (with "^" at the beginning when negated).
    private val content: String
        get() = buildString {
            if (!positive) append("^")
            append(from.toIRInCharClass().str)
            append("-")
            append(to.toIRInCharClass().str)
        }

    override fun toIR(): IR = ("[" + content + "]").asIR
    override fun toClosedIR(): IR = toIR()
    override fun toIRInCharClass(): IR = when {
        positive -> content.asIR
        else -> toIR()
    }

    @OptIn(NotPortableApi::class)
    operator fun not() = UreCharClassRange(from, to, !positive)
}

/**
 * Intersection of character classes: [tokens] joined with "&&" inside "[...]" (or "[^...]" when not [positive]).
 * This class is not only not-portable, but also VERY DELICATE.
 * Please always write unit tests to make sure it behaves as expected on platforms you're using.
 * There are weird inconsistencies when regex engines interpret intersections of unions, negated intersections, etc etc.
 * Some are described here: https://www.regular-expressions.info/charclassintersect.html
 * Some are reproduced in fun testUreCharClasses in TestUreCharClasses.cmn.kt
 * Usual workaround for weird behavior is to wrap some parts in additional chOfAny(token).
 */
data class UreCharClassIntersect @NotPortableApi @DelicateApi internal constructor(val tokens: List<UreCharClass>, val positive: Boolean = true) : UreCharClass {
    override fun toIR(): IR = tokens.joinToString("&&", if (positive) "[" else "[^", "]") { it.toIRInCharClass().str }.asIR
    override fun toClosedIR(): IR = toIR()
    override fun toIRInCharClass(): IR = toIR() // this class is delicate enough, so let's not try to drop brackets here
    @NotPortableApi @DelicateApi
    operator fun not() = UreCharClassIntersect(tokens, !positive)
}

/** Character class defined by a property: "\p{prop}", or "\P{prop}" when not [positive]. */
data class UreCharClassProp @NotPortableApi internal constructor(val prop: String, val positive: Boolean = true) : UreCharClass {
    override fun toIR(): IR {
        val letter = if (positive) "p" else "P"
        return "\\${letter}{${prop}}".asIR
    }
    override fun toClosedIR(): IR = toIR()
    override fun toIRInCharClass(): IR = toIR()

    @OptIn(NotPortableApi::class)
    operator fun not() = UreCharClassProp(prop, positive = !positive)
}

/**
 * Dirty way to inject whole regexes fast. Avoid if possible.
 * The raw [ir] is emitted unchanged. The closed form wraps it in a non-capturing group,
 * unless it is a single character or a backslash followed by a single character.
 */
@JvmInline value class UreWithRawIR @DelicateApi @NotPortableApi internal constructor(val ir: IR) : Ure {
    override fun toIR(): IR = ir

    override fun toClosedIR(): IR {
        if (isClosed) return ir
        return groupNonCapt().toIR()
    }

    // TODO_someday: analyze more carefully and drop grouping when actually not needed.
    private val isClosed: Boolean
        get() {
            val raw = ir.str
            return raw.length == 1 || (raw.length == 2 && raw[0] == '\\')
        }
}

/**
 * Matches [str] literally, using the "\Q...\E" quoting syntax.
 * Any "\E" inside [str] is emitted as "\E\\E\Q" (close the quote, literal backslash and E, reopen the quote),
 * the same way java.util.regex.Pattern.quote does it. Otherwise it would end the quote too early
 * and the rest of [str] would be interpreted as a regular expression.
 */
@JvmInline value class UreQuote @NotPortableApi internal constructor(val str: String) : UreAtomic {
    override fun toClosedIR(): IR = toIR()
    override fun toIR(): IR {
        val safe = str.replace("\\E", "\\E\\\\E\\Q")
        return "\\Q$safe\\E".asIR
    }
}

/**
 * Matches [str] literally: every meta character gets a backslash, other characters are emitted as is.
 * Could be implemented as [UreConcatenation] of each character, but it's better to have a smaller tree.
 */
@JvmInline value class UreText internal constructor(val str: String) : UreAtomic {
    // The closed form is always wrapped in a non-capturing group.
    override fun toClosedIR(): IR = groupNonCapt().toIR()

    override fun toIR(): IR = buildString {
        for (ch in str) {
            if (ch.isMeta) append("\\")
            append(ch)
        }
    }.asIR
}





// © 2015 - 2025 Weber Informatics LLC | Privacy Policy