Merge pull request #25 from alllex/error-descriptions

User-friendly unmatched token errors
alllex · Oct 7, 2023 · bc6ead1
2 parents 801f4d7 + 4876834
commit bc6ead1
Show file tree

Hide file tree

Showing 8 changed files with 276 additions and 23 deletions.
diff --git a/buildSrc/build.gradle.kts b/buildSrc/build.gradle.kts
@@ -8,5 +8,5 @@ dependencies {
     implementation("org.jetbrains.kotlin:kotlin-gradle-plugin:$kotlinVer")
     implementation("org.jetbrains.kotlin:kotlin-allopen:$kotlinVer")
     implementation("org.jetbrains.kotlinx:kotlinx-benchmark-plugin:0.4.8")
-    implementation("org.jetbrains.dokka:dokka-gradle-plugin:1.8.20")
+    implementation("org.jetbrains.dokka:dokka-gradle-plugin:1.9.0")
 }
diff --git a/src/commonMain/kotlin/me/alllex/parsus/parser/ParseResult.kt b/src/commonMain/kotlin/me/alllex/parsus/parser/ParseResult.kt
@@ -2,6 +2,7 @@ package me.alllex.parsus.parser
 
 import me.alllex.parsus.token.Token
 import me.alllex.parsus.token.TokenMatch
+import me.alllex.parsus.util.replaceNonPrintable
 
 /**
  * Result of a parse that is either a [parsed value][ParsedValue]
@@ -23,20 +24,96 @@ abstract class ParseError : ParseResult<Nothing>() {
      */
     abstract val offset: Int
 
-    override fun toString(): String = "ParseError"
+    open val contextProvider: ParseErrorContextProvider? get() = null
+
+    abstract fun describe(): String
+
+    override fun toString(): String = describe()
+
+    protected fun format(message: String, messageAtOffset: String): String = buildString {
+        append(message)
+        contextProvider?.getParseErrorContext(offset)?.run {
+            appendLine()
+            append(" ".repeat(lookBehind)).append(messageAtOffset)
+            appendLine()
+            append(" ".repeat(lookBehind)).append("| offset=$offset (or after ignored tokens)")
+            appendLine()
+            appendLine(replaceNonPrintable(inputSection))
+            if (previousTokenMatch != null) {
+                append("^".repeat(previousTokenMatch.length.coerceAtLeast(1)))
+                append(" Previous token: ${previousTokenMatch.token} at offset=${previousTokenMatch.offset}")
+                appendLine()
+            }
+        }
+    }
 }
 
-data class UnmatchedToken(val expected: Token, override val offset: Int) : ParseError()
+data class ParseErrorContext(
+    val inputSection: String,
+    val lookBehind: Int,
+    val lookAhead: Int,
+    val previousTokenMatch: TokenMatch?,
+)
 
-data class MismatchedToken(val expected: Token, val found: TokenMatch) : ParseError() {
+fun interface ParseErrorContextProvider {
+    fun getParseErrorContext(offset: Int): ParseErrorContext?
+}
+
+data class UnmatchedToken(
+    val expected: Token,
+    override val offset: Int,
+    override val contextProvider: ParseErrorContextProvider? = null
+) : ParseError() {
+
+    override fun toString(): String = describe()
+
+    override fun describe(): String = format(
+        message = "Unmatched token at offset=$offset, when expected: $expected",
+        messageAtOffset = "Expected token: $expected"
+    )
+}
+
+data class MismatchedToken(
+    val expected: Token,
+    val found: TokenMatch,
+    override val contextProvider: ParseErrorContextProvider? = null,
+) : ParseError() {
     override val offset: Int get() = found.offset
+    override fun toString(): String = describe()
+    override fun describe(): String = format(
+        message = "Mismatched token at offset=$offset, when expected: $expected, got: ${found.token}",
+        messageAtOffset = "Expected token: $expected at offset=$offset, got: ${found.token}"
+    )
+}
+
+data class NoMatchingToken(
+    override val offset: Int,
+) : ParseError() {
+
+    override fun toString(): String = describe()
+    override fun describe(): String = format(
+        message = "No matching token at offset=$offset",
+        messageAtOffset = "No matching token"
+    )
+}
+
+data class NoViableAlternative(
+    override val offset: Int,
+) : ParseError() {
+    override fun toString(): String = describe()
+    override fun describe(): String = format(
+        message = "None of the alternatives succeeded at offset=$offset",
+        messageAtOffset = "None of the alternatives succeeded"
+    )
+}
+
+data class NotEnoughRepetition(override val offset: Int, val expectedAtLeast: Int, val actualCount: Int) : ParseError() {
+    override fun toString(): String = describe()
+    override fun describe(): String = "Expected at least $expectedAtLeast, found $actualCount"
 }
-data class NoMatchingToken(override val offset: Int) : ParseError()
-data class NoViableAlternative(override val offset: Int) : ParseError()
-data class NotEnoughRepetition(override val offset: Int, val expectedAtLeast: Int, val actualCount: Int) : ParseError()
 
 class ParseException(val error: ParseError) : Exception() {
-    override fun toString(): String = "ParseException($error)"
+    override fun toString(): String = "ParseException: ${error.describe()}"
 }
 
 inline fun <T, R> ParseResult<T>.map(f: (T) -> R): ParseResult<R> {

diff --git a/src/commonMain/kotlin/me/alllex/parsus/parser/ParsingContext.kt b/src/commonMain/kotlin/me/alllex/parsus/parser/ParsingContext.kt
@@ -20,9 +20,12 @@ internal class ParsingContext(
     private val debugMode: Boolean = false
 ) : ParsingScope {
 
+    private val inputLength = tokenizer.input.length
+
     private var backtrackCont: Continuation<ParseError>? = null
     private var cont: Continuation<Any?>? = null
     private var position: Int = 0
+    private var lastTokenMatchContext = LastTokenMatchContext(tokenizer.input, currentOffset = 0)
     private var result: Result<Any?> = PENDING_RESULT
 
     fun <T> runParser(parser: Parser<T>): ParseResult<T> {
@@ -60,13 +63,23 @@ internal class ParsingContext(
     override fun tryParse(token: Token): ParseResult<TokenMatch> {
         val fromIndex = this.position
         val match = tokenizer.findMatchOf(fromIndex, token)
-            ?: return UnmatchedToken(token, fromIndex)
-        // TODO: clean up, as this should not happen anymore
-        if (match.token != token) return MismatchedToken(token, match)
-        this.position = match.offset + match.length
+            ?: return UnmatchedToken(token, fromIndex, getParseErrorContextProviderOrNull())
+
+        // This can only happen with EagerTokenizer
+        if (match.token != token) return MismatchedToken(token, match, getParseErrorContextProviderOrNull())
+
+        val newPosition = match.nextOffset.coerceAtMost(inputLength)
+        this.position = newPosition
+        this.lastTokenMatchContext.currentOffset = newPosition
+        this.lastTokenMatchContext.lastMatch = match
+
         return ParsedValue(match)
     }
 
+    private fun getParseErrorContextProviderOrNull(): ParseErrorContextProvider {
+        return this.lastTokenMatchContext
+    }
+
     override suspend fun fail(error: ParseError): Nothing {
         suspendCoroutineUninterceptedOrReturn<Any?> {
             withCont(backtrackCont) // may be null
@@ -164,3 +177,39 @@ internal class ParsingContext(
         }
     }
 }
+
+internal class LastTokenMatchContext(
+    val input: String,
+    var currentOffset: Int,
+    var lastMatch: TokenMatch? = null,
+) : ParseErrorContextProvider {
+
+    override fun toString() = "LastTokenMatchContext(currentOffset=$currentOffset, lastMatch=$lastMatch)"
+
+    override fun getParseErrorContext(offset: Int): ParseErrorContext? {
+        if (offset != currentOffset) {
+            return null
+        }
+
+        val lastMatch = this.lastMatch
+        val lookAhead = 20
+        return if (lastMatch == null || lastMatch.nextOffset != offset) {
+            ParseErrorContext(
+                inputSection = getInputSection(offset, offset + lookAhead),
+                lookBehind = 0,
+                lookAhead = lookAhead,
+                previousTokenMatch = null
+            )
+        } else {
+            ParseErrorContext(
+                inputSection = getInputSection(lastMatch.offset, lastMatch.nextOffset + lookAhead),
+                lookBehind = lastMatch.length,
+                lookAhead = lookAhead,
+                previousTokenMatch = lastMatch
+            )
+        }
+    }
+
+    private fun getInputSection(inputSectionStart: Int, inputSectionStop: Int) =
+        input.substring(inputSectionStart, inputSectionStop.coerceAtMost(input.length))
+}
diff --git a/src/commonMain/kotlin/me/alllex/parsus/token/TokenMatch.kt b/src/commonMain/kotlin/me/alllex/parsus/token/TokenMatch.kt
@@ -8,4 +8,9 @@ data class TokenMatch(
     val token: Token,
     val offset: Int,
     val length: Int,
-)
+) {
+    /**
+     * Offset of the next character after the match.
+     */
+    val nextOffset: Int get() = offset + length
+}
diff --git a/src/commonMain/kotlin/me/alllex/parsus/trace/TokenMatchingTrace.kt b/src/commonMain/kotlin/me/alllex/parsus/trace/TokenMatchingTrace.kt
@@ -3,6 +3,7 @@ package me.alllex.parsus.trace
 import me.alllex.parsus.annotations.ExperimentalParsusApi
 import me.alllex.parsus.token.Token
 import me.alllex.parsus.token.TokenMatch
+import me.alllex.parsus.util.replaceNonPrintable
 
 
 @ExperimentalParsusApi
@@ -82,13 +83,3 @@ fun formatTokenMatchingTrace(
     }
     return sb.toString()
 }
-
-private fun replaceNonPrintable(char: Char): Char {
-    return when (char) {
-        ' ' -> '␣' // U+2423 OPEN BOX
-        '\n' -> '␤' // U+2424 SYMBOL FOR NEWLINE
-        '\r' -> '␍' // U+240D SYMBOL FOR CARRIAGE RETURN
-        '\t' -> '␉' // U+2409 SYMBOL FOR HORIZONTAL TABULATION
-        else -> char
-    }
-}
diff --git a/src/commonMain/kotlin/me/alllex/parsus/util/text.kt b/src/commonMain/kotlin/me/alllex/parsus/util/text.kt
@@ -0,0 +1,19 @@
+package me.alllex.parsus.util
+
+internal fun replaceNonPrintable(string: String): String {
+    return buildString {
+        for (char in string) {
+            append(replaceNonPrintable(char))
+        }
+    }
+}
+
+internal fun replaceNonPrintable(char: Char): Char {
+    return when (char) {
+        ' ' -> '␣' // U+2423 OPEN BOX
+        '\n' -> '␤' // U+2424 SYMBOL FOR NEWLINE
+        '\r' -> '␍' // U+240D SYMBOL FOR CARRIAGE RETURN
+        '\t' -> '␉' // U+2409 SYMBOL FOR HORIZONTAL TABULATION
+        else -> char
+    }
+}
diff --git a/src/commonTest/kotlin/me/alllex/parsus/ParseErrorTest.kt b/src/commonTest/kotlin/me/alllex/parsus/ParseErrorTest.kt
@@ -0,0 +1,97 @@
+package me.alllex.parsus
+
+import assertk.assertions.isEqualTo
+import assertk.assertions.prop
+import me.alllex.parsus.parser.Grammar
+import me.alllex.parsus.parser.ParseError
+import me.alllex.parsus.parser.map
+import me.alllex.parsus.parser.times
+import me.alllex.parsus.token.literalToken
+import me.alllex.parsus.token.regexToken
+import kotlin.test.Test
+
+class ParseErrorTest {
+
+    @Test
+    fun unmatchedTokenErrorsProvideUserFriendlyDescriptions() {
+        object : Grammar<String>() {
+            val ab by literalToken("ab")
+            val cd by literalToken("cd")
+            override val root by ab * cd map { (v1, v2) -> "${v1.text}-${v2.text}" }
+        }.run {
+
+            assertParsed("abcd").isEqualTo("ab-cd")
+
+            assertNotParsed("abab").prop(ParseError::describe).isEqualTo(
+                "Unmatched token at offset=2, when expected: LiteralToken('cd')\n" + """
+                      Expected token: LiteralToken('cd')
+                      | offset=2 (or after ignored tokens)
+                    abab
+                    ^^ Previous token: LiteralToken('ab') at offset=0
+                """.trimIndent() + "\n"
+            )
+
+            assertNotParsed("cd").prop(ParseError::describe).isEqualTo(
+                "Unmatched token at offset=0, when expected: LiteralToken('ab')\n" + """
+                    Expected token: LiteralToken('ab')
+                    | offset=0 (or after ignored tokens)
+                    cd
+                """.trimIndent() + "\n"
+            )
+
+            assertNotParsed("abcdab").prop(ParseError::describe).isEqualTo(
+                "Unmatched token at offset=4, when expected: Token(EOF)\n" + """
+                      Expected token: Token(EOF)
+                      | offset=4 (or after ignored tokens)
+                    cdab
+                    ^^ Previous token: LiteralToken('cd') at offset=2
+                """.trimIndent() + "\n"
+            )
+        }
+    }
+
+    @Test
+    fun lastMatchDescriptionIsPresentWhenThereAreIgnoredTokensInBetween() {
+        object : Grammar<String>() {
+            val ws by literalToken(" ", ignored = true)
+            val ab by literalToken("ab")
+            val cd by literalToken("cd")
+            override val root by ab * cd map { (v1, v2) -> "${v1.text}-${v2.text}" }
+        }.run {
+            assertParsed("ab cd").isEqualTo("ab-cd")
+
+            assertNotParsed("ab ab").prop(ParseError::describe).isEqualTo(
+                "Unmatched token at offset=2, when expected: LiteralToken('cd')\n" + """
+                      Expected token: LiteralToken('cd')
+                      | offset=2 (or after ignored tokens)
+                    ab␣ab
+                    ^^ Previous token: LiteralToken('ab') at offset=0
+                """.trimIndent() + "\n"
+            )
+        }
+    }
+
+    @Test
+    fun unprintableCharactersAreReplacedInErrors() {
+        object : Grammar<String>() {
+            val ws by regexToken("\\s+")
+            val ab by literalToken("ab")
+            @Suppress("unused")
+            val cd by literalToken("cd")
+            override val root by ws * ab map { (v1, v2) -> "${v1.text}-${v2.text}" }
+        }.run {
+            assertParsed(" \t\r\nab").isEqualTo(" \t\r\n-ab")
+
+            assertNotParsed(" \t\r\ncd").prop(ParseError::describe).isEqualTo(
+                "Unmatched token at offset=4, when expected: LiteralToken('ab')\n" + """
+                        Expected token: LiteralToken('ab')
+                        | offset=4 (or after ignored tokens)
+                    ␣␉␍␤cd
+                    ^^^^ Previous token: RegexToken(ws [\s+]) at offset=0
+                """.trimIndent() + "\n"
+            )
+        }
+
+    }
+
+}
diff --git a/src/commonTest/kotlin/me/alllex/parsus/TokenTests.kt b/src/commonTest/kotlin/me/alllex/parsus/TokenTests.kt
@@ -83,4 +83,19 @@ class TokenTests {
         }
     }
 
+    @Test
+    fun explicitEofMatchesDoNotOverflowInputLength() {
+        object : Grammar<List<TokenMatch>>() {
+            val ab by literalToken("ab")
+            val eof by EofToken
+            override val root by ab * eof * eof map { it.toList() }
+        }.run {
+            assertParsed("ab").isEqualTo(listOf(
+                TokenMatch(ab, 0, 2),
+                TokenMatch(EofToken, 2, 1),
+                TokenMatch(EofToken, 2, 1),
+            ))
+        }
+    }
+
 }