From 22e19930cdaa4ba2ffd3e7d3172061de4ce16721 Mon Sep 17 00:00:00 2001
From: Alex Semin <alllexsm@gmail.com>
Date: Tue, 22 Aug 2023 00:03:27 +0200
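Subject: [PATCH] Fix false positive for token first char heuristic

Tokens with an empty set of known first characters were not placed in any
first-char bucket, so the lexer could match a lower-priority token found in
the bucket for the next input character before ever trying them. Such
tokens are now added to every bucket in token order.
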
---
 .../me/alllex/parsus/parser/ChoiceParser.kt   |  9 +++----
 .../kotlin/me/alllex/parsus/parser/Lexer.kt   | 20 +++++++++++-----
 .../kotlin/me/alllex/parsus/TokenTests.kt     | 24 +++++++++++++++++++
 3 files changed, 41 insertions(+), 12 deletions(-)
 create mode 100644 src/commonTest/kotlin/me/alllex/parsus/TokenTests.kt
diff --git a/src/commonMain/kotlin/me/alllex/parsus/parser/ChoiceParser.kt b/src/commonMain/kotlin/me/alllex/parsus/parser/ChoiceParser.kt
index 098da62..0cf8c46 100644
--- a/src/commonMain/kotlin/me/alllex/parsus/parser/ChoiceParser.kt
+++ b/src/commonMain/kotlin/me/alllex/parsus/parser/ChoiceParser.kt
@@ -15,15 +15,12 @@ internal class ChoiceParser<out T>(
                 val pendingUnknownFirstTokens = mutableListOf<Parser<T>>()
                 for (parser in parsers) {
                     if (parser.hasUnknownFirstTokens()) {
-                        values.forEach { it.add(parser) }
                         pendingUnknownFirstTokens += parser
+                        values.forEach { it += parser }
                     } else {
                         for (token in parser.firstTokens) {
-                            val parsersForToken = getOrPut(token) { mutableListOf() }
-                            if (parsersForToken.isEmpty()) {
-                                parsersForToken += pendingUnknownFirstTokens
-                            }
-                            parsersForToken.add(parser)
+                            val parsersForToken = getOrPut(token) { pendingUnknownFirstTokens.toMutableList() }
+                            parsersForToken += parser
                         }
                     }
                 }
diff --git a/src/commonMain/kotlin/me/alllex/parsus/parser/Lexer.kt b/src/commonMain/kotlin/me/alllex/parsus/parser/Lexer.kt
index 2491385..87ed8b4 100644
--- a/src/commonMain/kotlin/me/alllex/parsus/parser/Lexer.kt
+++ b/src/commonMain/kotlin/me/alllex/parsus/parser/Lexer.kt
@@ -18,10 +18,20 @@ internal class Lexer(
 
     init {
         tokensByFirstChar = mutableMapOf<Char, MutableList<Token>>()
+        val unknownFirstCharTokens = mutableListOf<Token>()
         for (token in tokens) {
             val firstChars = token.firstChars
-            for (c in firstChars) {
-                tokensByFirstChar.getOrPut(c) { mutableListOf() }.add(token)
+            if (firstChars.isEmpty()) {
+                // If the token's first char is unknown, the first-char heuristic cannot be applied to it.
+                // Therefore, we assume that such a token can start with any character and add it to every bucket
+                // (both the existing ones and any created later) so that token priority is preserved.
+                unknownFirstCharTokens += token
+                tokensByFirstChar.values.forEach { it += token }
+            } else {
+                for (c in firstChars) {
+                    tokensByFirstChar.getOrPut(c) { unknownFirstCharTokens.toMutableList() }
+                        .add(token)
+                }
             }
         }
     }
@@ -54,14 +64,12 @@ internal class Lexer(
         if (fromIndex < input.length) {
             val nextChar = input[fromIndex]
             val byFirstChar = tokensByFirstChar[nextChar].orEmpty()
-            for (i in byFirstChar.indices) {
-                val token = byFirstChar[i]
+            for (token in byFirstChar) {
                 matchImpl(fromIndex, token)?.let { return it }
             }
         }
 
-        for (i in tokens.indices) {
-            val token = tokens[i]
+        for (token in tokens) {
             matchImpl(fromIndex, token)?.let { return it }
         }
         return null
diff --git a/src/commonTest/kotlin/me/alllex/parsus/TokenTests.kt b/src/commonTest/kotlin/me/alllex/parsus/TokenTests.kt
new file mode 100644
index 0000000..776a8d2
--- /dev/null
+++ b/src/commonTest/kotlin/me/alllex/parsus/TokenTests.kt
@@ -0,0 +1,24 @@
+package me.alllex.parsus
+
+import assertk.assertions.isEqualTo
+import me.alllex.parsus.parser.Grammar
+import me.alllex.parsus.parser.map
+import me.alllex.parsus.parser.or
+import me.alllex.parsus.token.literalToken
+import me.alllex.parsus.token.regexToken
+import kotlin.test.Test
+
+class TokenTests {
+
+    @Test
+    fun literalTokenThatPrefixesRegexTokenWithHigherPriority() {
+        object : Grammar<Int>() {
+            val r by regexToken("abba") map 1
+            val ab by literalToken("ab") map 2
+            override val root by r or ab
+        }.run {
+            assertParsed("abba").isEqualTo(1)
+        }
+    }
+
+}