All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.xenomachina.parser.RegexTokenizer.kt Maven / Gradle / Ivy

There is a newer version: 0.0.3
Show newest version
// Copyright © 2017 Laurence Gonsalves
//
// This file is part of kessel, a library which can be found at
// http://github.com/xenomachina/kessel
//
// This library is free software; you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License as published by the
// Free Software Foundation; either version 2.1 of the License, or (at your
// option) any later version.
//
// This library is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
// for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with this library; if not, see http://www.gnu.org/licenses/

package com.xenomachina.parser

import java.util.regex.MatchResult
import java.util.regex.Matcher
import kotlin.coroutines.experimental.buildSequence

/**
 * Constructs a token of type `T` from a [MatchResult].
 */
typealias TokenConstructor<T> = (MatchResult) -> T

/**
 * A `Tokenizer` converts a [CharSequence] into a [Sequence] of tokens of type `T`.
 */
// NOTE(review): the generic parameter lists in this file were lost in extraction and have been
// reconstructed from usage. Confirm the declared variance of `T` against the upstream source.
interface Tokenizer<T> {
    /**
     * Tokenize chars, keeping track of position.
     *
     * @param positionTracker supplies the starting position and advances it past each token's text
     * @param chars the input to tokenize
     * @return the tokens, each wrapped in a [Positioned] carrying its start and end position
     */
    // NOTE(review): assumes Positioned's type arguments are ordered <position, value> — confirm
    // against the declaration of Positioned.
    fun <P> tokenize(positionTracker: PositionTracker<P>, chars: CharSequence): Sequence<Positioned<P, T>>

    /**
     * Tokenize chars.
     *
     * Delegates to the position-tracking overload with [NoOpPositionTracker] and keeps only each
     * token's value.
     */
    fun tokenize(chars: CharSequence): Sequence<T> =
        tokenize(NoOpPositionTracker, chars).map { it.value }
}

/**
 * A `RegexTokenizer` acts as a mapping from [Regex] objects to [TokenConstructor] objects.
 *
 * A `RegexTokenizer` is stateless and reentrant. It is safe to reuse an instance of `RegexTokenizer`, or even to use
 * it on multiple sequences concurrently.
 *
 * When multiple regexes match the input, the longest match wins. Ties go to the earliest match, i.e. the pair
 * that appears first in [regexToToken].
 *
 * @param regexToToken the regexes to try at each position, each paired with the constructor used to build a
 * token from its match
 */
class RegexTokenizer<T>(
    vararg regexToToken: Pair<Regex, TokenConstructor<T>>
) : Tokenizer<T> {

    // Regexes are converted to java.util.regex patterns once, at construction; the per-input
    // Matcher objects are created inside tokenize.
    private val patternsToHandlers = regexToToken.map { (regex, f) -> regex.toPattern() to f }.toList()

    /**
     * Converts the specified [CharSequence] into a [Sequence] of tokens of
     * type `T`. Starting at the beginning of the `CharSequence`, each `Regex`
     * is tested for a match, and the `TokenConstructor` associated with the
     * longest match is used to construct a token. It then advances to the next
     * position in the `CharSequence`.
     *
     * Zero-length matches are never selected. If no regex produces a non-empty match at the
     * current position, the `TODO` call below is reached, which throws [NotImplementedError].
     *
     * @param positionTracker supplies the starting position and advances it past each token's text
     * @param chars the input to tokenize
     * @return the tokens, each paired with the positions immediately before and after it
     */
    override fun <P> tokenize(
        positionTracker: PositionTracker<P>,
        chars: CharSequence
    ): Sequence<Positioned<P, T>> = buildSequence {
        val length = chars.length

        // Matchers are created here, inside the sequence builder, not shared on the instance.
        val matchersToHandlers = patternsToHandlers.map { (pattern, f) -> pattern.matcher(chars) to f }

        var index = 0
        var pos = positionTracker.start()
        while (index < length) {
            var bestMatcherToHandler: Pair<Matcher, TokenConstructor<T>>? = null
            var bestLen = 0
            for (matcherToHandler in matchersToHandlers) {
                val matcher = matcherToHandler.first
                // Restrict the matcher to the unconsumed tail; lookingAt() anchors at its start.
                matcher.region(index, length)
                if (matcher.lookingAt()) {
                    val matchLen = matcher.end() - matcher.start()
                    // Strict '>' keeps the first of several equally long matches and, because
                    // bestLen starts at 0, rejects empty matches (which would never advance index).
                    if (matchLen > bestLen) {
                        bestMatcherToHandler = matcherToHandler
                        bestLen = matchLen
                    }
                }
            }

            if (bestMatcherToHandler == null) {
                TODO("add fallback handling")
            } else {
                val (matcher, handler) = bestMatcherToHandler
                val nextPos = positionTracker.next(pos, matcher.group())
                // toMatchResult() hands the handler a snapshot of the match.
                yield(Positioned(pos, handler(matcher.toMatchResult()), nextPos))
                pos = nextPos
                index += bestLen
            }
        }
    }
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy