commonMain.piacenti.dslmaker.ExpressionMatcher.kt Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of dsl-maker-js Show documentation
Show all versions of dsl-maker-js Show documentation
Kotlin multiplatform library to facilitate creation of DSLs with ANTLR or a simple built in parser
package piacenti.dslmaker
import piacenti.dslmaker.abstraction.ProductionStep
import piacenti.dslmaker.errors.ParserException
import piacenti.dslmaker.interfaces.MatchData
import piacenti.dslmaker.structures.*
import piacenti.dslmaker.structures.derivationgraph.DerivationGraph
import piacenti.dslmaker.structures.derivationgraph.DerivationNode
/**
* @param
* @author Piacenti
*/
open class ExpressionMatcher {
/**
* @return
*/
lateinit var graph: DerivationGraph
protected set
protected lateinit var text: String
/**
* @return
*/
lateinit var grammar: Grammar<*>
protected set
var highestSuccessfulIndex: Int = 0
protected lateinit var lastSuccessStack: MutableList>
lateinit var expectedTokenNotMatched: MutableSet
lateinit var possibleValidTokensIfContinuingParsing: MutableSet
protected set
protected lateinit var matchCache: MutableMap
fun match(graph: DerivationGraph, grammar: Grammar<*>, text: String, matchAllTokens: Boolean,
startIndex: Int = 0): FoundIndex {
this.graph = graph
this.grammar = grammar
this.text = text
// need to organize things better to allow caching, if I only cache successful results than results that depend on failures and expectedTokens not matched
//will not work correctly
// int key = grammar.hashCode() + text.hashCode();
// if (resultCache.get(key) != null)
// {
// processResults(resultCache.get(key));
// return resultCache.get(key);
// }
// else
run {
resetGatherers()
//grammar says where to start the parse
val start = graph.getSubGraphs()[grammar.startProduction]?:throw ParserException("Start production has not been defined")
val result: FoundIndex
try {
result = matchProduction(start, startIndex, StepStack(), false)
if (matchAllTokens && !result.matchTokens.isEmpty() &&
result.matchTokens.last().endIndex < text.length - 1 || !result.found) {
throw ParserException("")
}
if (result.found) {
expectedTokenNotMatched.clear()
updateAST(text, result.astNode!!)
updateFireParametersWithAST(result.fireParameters, result.astNode)
fixIndexes(result.astNode!!)
processResults(result)
}
} catch (e: ParserException) {
throw throwException(text, e, null)
}
// resultCache.put(key, result);
return result
}
}
private fun fixIndexes(astNode: ASTNode) {
astNode.forEachRecursive({
it.endIndex=it.endIndex-1
})
}
private fun updateFireParametersWithAST(fireParameters: ArrayList, astNode: ASTNode?) {
fireParameters.forEach { fireParameters1 ->
astNode!!.forEachRecursive({ tastNode ->
if (tastNode.type === fireParameters1.node.step && fireParameters1.startIndex == tastNode.startIndex &&
fireParameters1.endIndex == tastNode.endIndex) {
fireParameters1.astNode = tastNode
}
})
}
}
private fun updateAST(text: String, astNode: ASTNode) {
astNode.children.forEach { node ->
node.parent = astNode
updateAST(text, node)
}
}
private fun processResults(result: FoundIndex) {
for (i in 0 until result.fireParameters.size) {
val p = result.fireParameters[i]
fireSuccessAction(p)
}
}
private fun throwException(text: String, e: ParserException?, result: FoundIndex?): ParserException {
val subList: String
val index: Int = if (result != null && !result.matchTokens.isEmpty()) {
result.matchTokens.last().endIndex
} else {
highestSuccessfulIndex
}
subList = if (index + 1 < text.length) {
text.substring(index + 1, text.length)
} else {
""
}
return ParserException(
(if (e != null) e.message + "\n" else "") + "Not all tokens were matched, highest index reached " + highestSuccessfulIndex + "\nhighest success result index: " + index +
"\nlast success stacks: " + lastSuccessStack + "" + "\nexpected: " + expectedTokenNotMatched + "" + "\nremaining text: " + subList)
}
private fun resetGatherers() {
highestSuccessfulIndex = 0
lastSuccessStack = ArrayList()
expectedTokenNotMatched = HashSet()
possibleValidTokensIfContinuingParsing = HashSet()
matchCache = HashMap()
}
/**
* This method does all the work of the matching. It consists of two recursive operations. It starts with the production being parsed which is a node that
* contains the information given in the Grammar. The nodes in the grammar are built so that parts of a grammar that are in sequence are represented by
* a chain of parent child nodes. If there are certain points of a grammar that may vary then the parent may have more than one child. For example, if a production
* says X->abc|adf|a then node 'a' has two children 'b' and 'd' while 'b' has only 'c' as child and 'd' has only 'f' as child. Node a in that case may also be treated as a leaf
* node if the longer branches are not matched so the program always tries to match the longer branches first. As indicated a match of a production is only reached
* when a leaf node is matched, that is a node without children. This should be simple enough except for when the grammar is recursive which means that they
* always have children, in those situations nodes that can also be leaf nodes are used for matching. For example, a production X -> aX|bX|a|b will generate
* a node structure that has X as the root and two children 'a' and 'b' which in turn have X as child. This would end in an infinite loop if 'a' and 'b' were
* not marked as leaf nodes. This also means that a poorly designed grammar can lead to stack overflows due to the recursion. Had X been defined as X -> aX|bX,
* this would definitely generate a stack overflow. Another important note about the structure is that every root node is marked as root whild child nodes are not
* so X in X -> aX|bX|a|b is marked as root node even when it comes as a child of 'a' and 'b' while other productions present in the current production are not
* if X -> aX|bY|a|b and Y -> aY|b then for production X, every appearance of X is marked as root while the appearances of Y are not, but Y is marked as root
* inside its own production. This is done because as the program finds productions it recursively call this method to parse them but to avoid complications
* and to mark the match only when the toplevel production is given, the program skips the root nodes so that it keeps going down its children until a match is
* made and mark the match as happening to the top level of the production. For example, if we did not have this in place the match for X would
* happen 4 times for abab, now most people are not going to be interested in the individual pieces of the recursive production but rather the whole of it
* which is abab. If there was for sure an interest on matching each individually then multiple productions would be more appropriate such as
* X -> AX|BX|A|B, A->a, B->b.
*
*
* Whenever the program finds another production as it is parsing the nodes recursively for the current production, it calls again the matchProduction method
* passing the grammar root production node for that node as the start. This node is the node stored in the map of production built from the grammar definitions
* which means that it will be a node marked as root and it not the same as the node currently being parsed. So for X -> aX|bYd|a|b and Y -> yX|y if
* when parsing X we getText to the Y node that node has child 'd' while the actual Grammar node which marked as root is the production which has child y.
* So the actual production node is passed as start node to the matchProduction method. If a match of Y occurs after that call is made and the current Y node has
* no children, then it would mark X as matched, but since Y has 'd' as a child it goes on to match that node also before marking X as matched.
*
*
* The match object is kept as a method level object so that when the match happens anywhere down the branch in the recursion it updates it for the production
* and stop the matching for that production.
*
*
* Every time a new production is called the stack is udpated so that we know how deep we are in the matching hierarchy of productions that depend on one
* another. This stack is also used by the user to be able to have a single action address items parsed at different levels. The startIndex has the start
* point match for the current production while the result of the match will contain the final matched index so that a substring of the text represent the match.
* Note that once a production is matched it adds to the index of the result and that is done because after the call to the method that index may advance
* several steps instead of single step as it happens when doing a simple token match.
*
*
* When a match for a production occurs a fire parameter is created and added to the result. After all matching has occurred the final result should only
* contain the Fire parameters that should actually be used to fire the action associated to the production. The stack is passed so that the level of call
* may be determined so that same action may be used to deal with objects that may be present at different levels of the structure
*
* @param start
* @param startIndex
* @param stack
* @return
*/
protected open fun matchProduction(start: DerivationNode, startIndex: Int, stack: StepStack,
ignoreActions: Boolean): FoundIndex {
val key = StringBuilder().append(start.step.hashCode()).append(startIndex).toString()
val foundIndex = matchCache[key]
if (foundIndex != null) {
return foundIndex
} else {
stack.add(start.step)
//Depth first search approach
val result = FoundIndex(startIndex, false)
result.fullPath.add(start.step)
val branchedAccumulation = AccumulationParameters()
val action = object : LoopTraverse {
override fun call(node: DerivationNode, index: Int,
accumulatedParameters: AccumulationParameters) {
var index = index
//if node is another production then go to that tree to deal with it
val subGraphNode = graph.getSubGraphs()[node.step]
if (subGraphNode != null) {
val temp: FoundIndex = if (node.step === start.step) {
matchProduction(subGraphNode, index, stack, true)
} else {
matchProduction(subGraphNode, index, stack, false)
}
if (temp.found) {
//assign index when changed because in recursive calls you will never fall in the condition that has no children
index = temp.index
result.index = index
accumulatedParameters.branchedMatchTokens.addAll(temp.matchTokens)
accumulatedParameters.branchedFireParameters.addAll(temp.fireParameters)
accumulatedParameters.branchedFullPath.addAll(temp.fullPath)
accumulatedParameters.astNodes.add(temp.astNode!!)
} else {
return
}
} else {
//=============BASE FAIL CASES==========================
if (node.step.isProduction) {
throw ParserException("Unsupported Production: " + node.step)
}
if (index > text.length) {
return
}
//step must be of same type as token step, if not, do not continue recursion for its children
val evalText = text.substring(index)
val m = ("(?s)^\\s*(?:" + node.step.regexDefinition + ")").genericRegex().find(evalText)
if (m == null) {
if (index > highestSuccessfulIndex || (index == 0 && highestSuccessfulIndex == 0)) {
expectedTokenNotMatched.add(node.step)
}
if (evalText.isBlank()) {
possibleValidTokensIfContinuingParsing.add(node.step)
possibleValidTokensIfContinuingParsing.add(start.step)
}
return
}
//================================================
val endIndex = index + m.range.end
if (endIndex > highestSuccessfulIndex) {
expectedTokenNotMatched = HashSet()
highestSuccessfulIndex = endIndex
if (evalText.isNotBlank())
possibleValidTokensIfContinuingParsing = HashSet()
}
accumulatedParameters.branchedMatchTokens.add(
TokenMatch(endIndex, index, m.groupValues[0].trim(), node.step))
//add AST node for terminal
addASTNodeToTerminal(node, index, accumulatedParameters, endIndex)
if (node.step.regexDefinition != "") index += m.range.end
}
if (node.children.isEmpty()) {
setFoundResult(index, accumulatedParameters, result, ignoreActions, start, startIndex, stack)
return
}
for (child in node.children) {
if (!result.found) {
call(child, index, accumulatedParameters.copy())
} else {
break
}
}
//if nothing worked and this node can be a leaf then mark as found stopping here
if (node.isLeaf && !result.found) {
setFoundResult(index, accumulatedParameters, result, ignoreActions, start, startIndex, stack)
return
}
}
}
for (child in start.children) {
if (!result.found) {
action.call(child, startIndex, branchedAccumulation.copy())
} else {
break
}
}
if (result.found) {
if (result.index == highestSuccessfulIndex) {
lastSuccessStack = ArrayList()
val list = mutableListOf()
list.addAll(stack.getAsList())
lastSuccessStack.add(list)
} else if (result.index > startIndex && result.index - 1 == highestSuccessfulIndex) {
val list = mutableListOf()
list.addAll(stack.getAsList())
lastSuccessStack.add(list)
}
} else if (startIndex > highestSuccessfulIndex) {
expectedTokenNotMatched.add(start.step)
} else {
result.fullPath.removeLast()
}
if (!result.found && startIndex > highestSuccessfulIndex) {
possibleValidTokensIfContinuingParsing.add(start.step)
}
stack.removeLast()
matchCache[key] = result
return result
}
}
protected fun addASTNodeToTerminal(node: DerivationNode, index: Int,
accumulatedParameters: AccumulationParameters, endIndex: Int) {
val terminalNode = ASTNode(originalText = text)
terminalNode.type = node.step
terminalNode.endIndex = endIndex
terminalNode.startIndex = index
accumulatedParameters.astNodes.add(terminalNode)
}
fun setFoundResult(index: Int, accumulatedParameters: AccumulationParameters, result: FoundIndex,
ignoreActions: Boolean, start: DerivationNode, startIndex: Int,
stack: StepStack) {
result.astNode = ASTNode(originalText = text)
result.astNode!!.startIndex = startIndex
result.astNode!!.endIndex = index
result.astNode!!.type = start.step
result.astNode!!.children.addAll(accumulatedParameters.astNodes)
result.found = true
result.index = index
result.fireParameters.addAll(accumulatedParameters.branchedFireParameters)
result.matchTokens.addAll(accumulatedParameters.branchedMatchTokens)
result.fullPath.addAll(accumulatedParameters.branchedFullPath)
addAction(index, result, ignoreActions, start, startIndex, stack, result.astNode)
}
private fun addAction(index: Int, result: FoundIndex, ignoreActions: Boolean, start: DerivationNode, startIndex: Int,
stack: StepStack, astNode: ASTNode?) {
if (!ignoreActions && startIndex != index && grammar.productions[start.step]!!.action != null) {
//somehow the astNode added to the firedParameter gets disconnected from the rest of the AST in some cases. Moving the logic so that we set the nodes as a post process
//my theory is that it matches the same rule from two different branches, one that failed and the second that succeeds. But since we cache the results we are stuck with
//the fire AST node from the first failed branch which gets disconnected from the rest of the code
result.fireParameters.add(FireParameters(start, startIndex, index, stack, result.matchTokens))
}
}
protected open fun fireSuccessAction(p: FireParameters) {
//getText subList that pertains to this production match
val subString = text.substring(p.startIndex, p.endIndex)
val str = filterText(p, subString).toString().trim()
val whiteSpaceOffset = getBeginningWhiteSpaceOffset(subString)
grammar.productions[p.node.step]!!.action!!(MatchData(str, p.stack, p.matchTokens, p.startIndex,
p.startIndex + whiteSpaceOffset, p.endIndex, p.astNode!!))
}
protected fun getBeginningWhiteSpaceOffset(subString: String): Int {
val whiteSpaceBeginning = "^\\s*".toRegex().find(subString)
var whiteSpaceOffset = 0
if (whiteSpaceBeginning != null) {
whiteSpaceOffset = whiteSpaceBeginning.range.end
}
return whiteSpaceOffset
}
protected fun filterText(p: FireParameters, subString: String): StringBuilder {
var subStringCleaned = subString
val str = StringBuilder()
for (regexToken in p.matchTokens) {
val m = ("(?s)^\\s*(?:" + regexToken.token!!.regexDefinition + ")").genericRegex().find(subStringCleaned)
str.append(m!!.groupValues[regexToken.token!!.matchFilter])
subStringCleaned = subStringCleaned.substring(m.range.end)
}
return str
}
fun find(graph: DerivationGraph, grammar: Grammar<*>, text: String, delimiterPattern: String?) {
var index = 0
while (index < text.length) {
index = try {
val result = match(graph, grammar, text, false, index)
if (!result.found) {
incrementByDelimiterIndex(delimiterPattern, index, text)
} else {
result.index
}
} catch (e: ParserException) {
incrementByDelimiterIndex(delimiterPattern, index, text)
}
}
}
private fun incrementByDelimiterIndex(delimiterPattern: String?, startIndex: Int, text: String): Int {
var index = startIndex
if (delimiterPattern != null) {
val sub = text.substring(index)
val m = "(?s)(?:${delimiterPattern})".genericRegex().find(sub)
if (m != null) {
index += m.range.end
} else {
index++
}
} else {
index++
}
return index
}
protected interface LoopTraverse {
fun call(node: DerivationNode, tokenIndex: Int, accumulatedParameters: AccumulationParameters)
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy