All Downloads are FREE. Search and download functionalities are using the official Maven repository.

jvmMain.com.xebia.functional.tokenizer.regex.kt Maven / Gradle / Ivy

package com.xebia.functional.tokenizer

actual val p50k_regex: Regex
  get() = Regex("""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

actual val cl100k_base_regex: Regex
  get() =
    Regex(
      """(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""",
      RegexOption.IGNORE_CASE,
    )

actual val o200k_base_regex: Regex
  get() =
    Regex(
      listOf(
          """[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
          """[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
          """\p{N}{1,3}""",
          """ ?[^\s\p{L}\p{N}]+[\r\n/]*""",
          """\s*[\r\n]+""",
          """\s+(?!\S)""",
          """\s+""",
        )
        .joinToString("|"),
      RegexOption.IGNORE_CASE,
    )




© 2015 - 2024 Weber Informatics LLC | Privacy Policy