From 22e19930cdaa4ba2ffd3e7d3172061de4ce16721 Mon Sep 17 00:00:00 2001 From: Alex Semin Date: Tue, 22 Aug 2023 00:03:27 +0200 Subject: [PATCH] Fix false positive for token first char heuristic --- .../me/alllex/parsus/parser/ChoiceParser.kt | 9 +++---- .../kotlin/me/alllex/parsus/parser/Lexer.kt | 20 +++++++++++----- .../kotlin/me/alllex/parsus/TokenTests.kt | 24 +++++++++++++++++++ 3 files changed, 41 insertions(+), 12 deletions(-) create mode 100644 src/commonTest/kotlin/me/alllex/parsus/TokenTests.kt diff --git a/src/commonMain/kotlin/me/alllex/parsus/parser/ChoiceParser.kt b/src/commonMain/kotlin/me/alllex/parsus/parser/ChoiceParser.kt index 098da62..0cf8c46 100644 --- a/src/commonMain/kotlin/me/alllex/parsus/parser/ChoiceParser.kt +++ b/src/commonMain/kotlin/me/alllex/parsus/parser/ChoiceParser.kt @@ -15,15 +15,12 @@ internal class ChoiceParser( val pendingUnknownFirstTokens = mutableListOf>() for (parser in parsers) { if (parser.hasUnknownFirstTokens()) { - values.forEach { it.add(parser) } pendingUnknownFirstTokens += parser + values.forEach { it += parser } } else { for (token in parser.firstTokens) { - val parsersForToken = getOrPut(token) { mutableListOf() } - if (parsersForToken.isEmpty()) { - parsersForToken += pendingUnknownFirstTokens - } - parsersForToken.add(parser) + val parsersForToken = getOrPut(token) { pendingUnknownFirstTokens.toMutableList() } + parsersForToken += parser } } } diff --git a/src/commonMain/kotlin/me/alllex/parsus/parser/Lexer.kt b/src/commonMain/kotlin/me/alllex/parsus/parser/Lexer.kt index 2491385..87ed8b4 100644 --- a/src/commonMain/kotlin/me/alllex/parsus/parser/Lexer.kt +++ b/src/commonMain/kotlin/me/alllex/parsus/parser/Lexer.kt @@ -18,10 +18,20 @@ internal class Lexer( init { tokensByFirstChar = mutableMapOf>() + val unknownFirstCharTokens = mutableListOf() for (token in tokens) { val firstChars = token.firstChars - for (c in firstChars) { - tokensByFirstChar.getOrPut(c) { mutableListOf() }.add(token) + if (firstChars.isEmpty()) { + // If the token first char is unknown, then the first char heuristic cannot be applied. + // Therefore, we assume that such tokens can start with any character and put them in appropriate buckets + // to ensure the token priority correctness. + unknownFirstCharTokens += token + tokensByFirstChar.values.forEach { it += token } + } else { + for (c in firstChars) { + tokensByFirstChar.getOrPut(c) { unknownFirstCharTokens.toMutableList() } + .add(token) + } } } } @@ -54,14 +64,12 @@ internal class Lexer( if (fromIndex < input.length) { val nextChar = input[fromIndex] val byFirstChar = tokensByFirstChar[nextChar].orEmpty() - for (i in byFirstChar.indices) { - val token = byFirstChar[i] + for (token in byFirstChar) { matchImpl(fromIndex, token)?.let { return it } } } - for (i in tokens.indices) { - val token = tokens[i] + for (token in tokens) { matchImpl(fromIndex, token)?.let { return it } } return null diff --git a/src/commonTest/kotlin/me/alllex/parsus/TokenTests.kt b/src/commonTest/kotlin/me/alllex/parsus/TokenTests.kt new file mode 100644 index 0000000..776a8d2 --- /dev/null +++ b/src/commonTest/kotlin/me/alllex/parsus/TokenTests.kt @@ -0,0 +1,24 @@ +package me.alllex.parsus + +import assertk.assertions.isEqualTo +import me.alllex.parsus.parser.Grammar +import me.alllex.parsus.parser.map +import me.alllex.parsus.parser.or +import me.alllex.parsus.token.literalToken +import me.alllex.parsus.token.regexToken +import kotlin.test.Test + +class TokenTests { + + @Test + fun literalTokenThatPrefixesRegexTokenWithHigherPriority() { + object : Grammar() { + val r by regexToken("abba") map 1 + val ab by literalToken("ab") map 2 + override val root by r or ab + }.run { + assertParsed("abba").isEqualTo(1) + } + } + +}