src/cpp-common: add bt2c::StrScanner class

This patch adds the `bt2c::StrScanner` class, defined in `str-scanner.hpp` and implemented in `str-scanner.cpp`. A string scanner is a simple lexical scanner. This one is a stripped-down version of yactfr's `yactfr::internal::StrScanner`, stripped-down because yactfr uses this to parse TSDL, therefore it needs more features. What's left for the `bt2c::StrScanner` version is: tryScanLitStr(): Tries to scan a double-quoted literal string, possibly containing escape sequences. tryScanConstInt(): Tries to scan a constant unsigned or signed decimal integer string. tryScanConstReal(): Tries to scan a real number string. tryScanToken(): Tries to scan an exact string. skipWhitespaces(): Skips the next whitespaces. See the specific comments in `str-scanner.hpp` for more details. I could have used the `GScanner` API [1], as we do to parse the value of the `--params` CLI option of `babeltrace2`, but: * `yactfr::internal::StrScanner` is already working, tested, and documented. `bt2c::StrScanner` is a much lighter version of it. * Should we ever make an effort to remove the GLib dependency, this part will already be done. * The `GScanner` API doesn't support `\u` escape sequences in literal strings (needed for JSON strings) out of the box, so we'd need this part on our side anyway. `bt2c::StrScanner` could eventually replace `GScanner` elsewhere in the tree, but it would require a few more features (which already exist in `yactfr::internal::StrScanner`). This is part of an effort to implement a JSON parser to support CTF2‑SPECRC‑4.0 [2]. [1]: https://docs.gtk.org/glib/struct.Scanner.html [2]: https://diamon.org/ctf/files/CTF2-SPECRC-4.0.html Signed-off-by: Philippe Proulx <eeppeliteloop@gmail.com> Change-Id: I8317917124218618278611794f32a67be4b9a6dd Reviewed-on: https://review.lttng.org/c/babeltrace/+/7410
simark · Apr 17, 2024 · 9bb1e61 · 9bb1e61
1 parent e6b1287
commit 9bb1e61
Show file tree

Hide file tree

Showing 3 changed files with 773 additions and 0 deletions.
diff --git a/src/Makefile.am b/src/Makefile.am
@@ -175,6 +175,8 @@ cpp_common_libcpp_common_la_SOURCES = \
 	cpp-common/bt2c/safe-ops.hpp \
 	cpp-common/bt2c/span.hpp \
 	cpp-common/bt2c/std-int.hpp \
+	cpp-common/bt2c/str-scanner.cpp \
+	cpp-common/bt2c/str-scanner.hpp \
 	cpp-common/bt2c/text-loc.cpp \
 	cpp-common/bt2c/text-loc.hpp \
 	cpp-common/bt2c/text-loc-str.cpp \

diff --git a/src/cpp-common/bt2c/str-scanner.cpp b/src/cpp-common/bt2c/str-scanner.cpp
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2015-2022 Philippe Proulx <pproulx@efficios.com>
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <cmath>
+#include <regex>
+
+#include "str-scanner.hpp"
+
+namespace bt2c {
+
+/*
+ * Regular expression which matches, at the very beginning of the
+ * target, a JSON-style real number: an integer part followed with a
+ * fraction part, an exponent part, or both.
+ *
+ * The lookahead assertion makes sure that at least one of the fraction
+ * and exponent parts exists, so that a plain integer doesn't match. It
+ * accepts an optional exponent sign so that exponent-only numbers such
+ * as `1e-5` and `2E+3` match too.
+ */
+const std::regex StrScanner::_mRealRegex {
+    "^"                       /* Start of target */
+    "-?"                      /* Optional negation */
+    "(?:0|[1-9]\\d*)"         /* Integer part */
+    "(?=\\.\\d|[eE][+-]?\\d)" /* Assertion: need fraction/exponent part */
+    "(?:\\.\\d+)?"            /* Optional fraction part */
+    "(?:[eE][+-]?\\d+)?",     /* Optional exponent part */
+    std::regex::optimize};
+
+/*
+ * Builds a string scanner over the characters from `begin` up to
+ * `end`, remembering `baseOffset` and `textLocStrFmt` as is.
+ *
+ * Both the current position and the beginning of the current line
+ * initially are `begin`. The scanner's own logger is built from
+ * `logger` and the `STR-SCANNER` tag.
+ */
+StrScanner::StrScanner(const char * const begin, const char * const end,
+                       const std::size_t baseOffset, const bt2c::Logger& logger,
+                       const TextLocStrFmt textLocStrFmt) :
+    _mBegin {begin},
+    _mEnd {end},
+    _mAt {begin},
+    _mLineBegin {begin},
+    _mBaseOffset {baseOffset},
+    _mLogger {logger, "STR-SCANNER"},
+    _mTextLocStrFmt {textLocStrFmt}
+{
+    /* Everything is done by the member initializers above */
+}
+
+/*
+ * Builds a string scanner over the characters from `begin` up to
+ * `end` with a base offset of zero.
+ *
+ * Delegates to the constructor which accepts a base offset.
+ */
+StrScanner::StrScanner(const char * const begin,
+                       const char * const end,
+                       const bt2c::Logger& logger,
+                       const TextLocStrFmt textLocStrFmt) :
+    StrScanner {begin, end, 0, logger, textLocStrFmt}
+{
+    /* Nothing else to do: the delegated constructor did it all */
+}
+
+/*
+ * Brings this scanner back to the beginning of its wrapped string:
+ * the current position and the beginning of the current line become
+ * the beginning of the string, and the line count becomes zero.
+ */
+void StrScanner::reset()
+{
+    /* Rewind the current position first */
+    this->at(_mBegin);
+
+    /* Then forget everything about the lines seen so far */
+    _mLineBegin = _mBegin;
+    _mNbLines = 0;
+}
+
+/*
+ * Advances the current position past any run of whitespace
+ * characters (space, horizontal tab, vertical tab, carriage return,
+ * and newline), stopping at the first other character or at the end
+ * of the wrapped string.
+ *
+ * Calls _checkNewline() for each newline before skipping it.
+ */
+void StrScanner::skipWhitespaces() noexcept
+{
+    while (!this->isDone()) {
+        const auto curChar = *_mAt;
+
+        if (curChar == '\n') {
+            /* Newline: let the line bookkeeping see it before skipping */
+            this->_checkNewline();
+        } else if (curChar != ' ' && curChar != '\t' && curChar != '\v' && curChar != '\r') {
+            /* Not a whitespace: stop here */
+            return;
+        }
+
+        /* Skip this whitespace */
+        this->_incrAt();
+    }
+}
+
+/*
+ * Returns the current text location of this scanner (loc()) formatted
+ * as a string with textLocStr() and the configured text location
+ * string format.
+ */
+std::string StrScanner::_locStr() const
+{
+    const auto curLoc = this->loc();
+
+    return textLocStr(curLoc, _mTextLocStrFmt);
+}
+
+/*
+ * Decodes the four hexadecimal digits found at `at` (what follows `\u`
+ * in an escape sequence) as a Unicode codepoint and appends its UTF-8
+ * encoding to the string buffer.
+ *
+ * `at` must point to at least four readable characters.
+ *
+ * Throws `bt2::Error` (after appending a cause) when one of the four
+ * characters isn't a hexadecimal digit or when the codepoint is a
+ * surrogate (U+D800 to U+DFFF, inclusively), which is not supported.
+ */
+void StrScanner::_appendEscapedUnicodeChar(const char * const at)
+{
+    /* Create array of four hex characters */
+    const std::string hexCpBuf {at, at + 4};
+
+    /* Validate hex characters */
+    for (const auto ch : hexCpBuf) {
+        /*
+         * std::isxdigit() requires a value which is representable as
+         * `unsigned char`: passing a negative `char` is undefined
+         * behaviour.
+         */
+        if (!std::isxdigit(static_cast<unsigned char>(ch))) {
+            BT_CPPLOGE_APPEND_CAUSE_AND_THROW(
+                bt2::Error, "[{}] In `\\u` escape sequence: unexpected character `{:c}`.",
+                this->_locStr(), ch);
+        }
+    }
+
+    /* Convert hex characters to integral codepoint (always works) */
+    const auto cp = std::strtoull(hexCpBuf.data(), nullptr, 16);
+
+    /*
+     * Append UTF-8 bytes from integral codepoint.
+     *
+     * See <https://en.wikipedia.org/wiki/UTF-8#Encoding>.
+     */
+    if (cp <= 0x7f) {
+        /* Single byte (ASCII) */
+        _mStrBuf.push_back(static_cast<char>(cp));
+    } else if (cp <= 0x7ff) {
+        /* Two bytes */
+        _mStrBuf.push_back(static_cast<char>((cp >> 6) + 0xc0));
+        _mStrBuf.push_back(static_cast<char>((cp & 0x3f) + 0x80));
+    } else if (cp >= 0xd800 && cp <= 0xdfff) {
+        /*
+         * Unsupported surrogate pairs.
+         *
+         * The surrogate range starts at U+D800 inclusively.
+         */
+        BT_CPPLOGE_APPEND_CAUSE_AND_THROW(
+            bt2::Error, "[{}] In `\\u` escape sequence: unsupported surrogate codepoint U+{:x}.",
+            this->_locStr(), static_cast<unsigned int>(cp));
+    } else {
+        /* Three bytes: four hex digits can't exceed U+FFFF */
+        BT_ASSERT(cp <= 0xffff);
+        _mStrBuf.push_back(static_cast<char>((cp >> 12) + 0xe0));
+        _mStrBuf.push_back(static_cast<char>(((cp >> 6) & 0x3f) + 0x80));
+        _mStrBuf.push_back(static_cast<char>((cp & 0x3f) + 0x80));
+    }
+}
+
+/*
+ * Tries to decode an escape sequence at the current position and to
+ * append the resulting character(s) to the string buffer.
+ *
+ * An escape sequence is `\` followed with `"`, `\`, or any character
+ * of the null-terminated `escapeSeqStartList`. `"` and `\` are always
+ * accepted, even when `escapeSeqStartList` is empty.
+ *
+ * On success, advances the current position past the escape sequence
+ * and returns `true`. Otherwise, leaves the current position untouched
+ * and returns `false`.
+ *
+ * Throws `bt2::Error` (after appending a cause) on an invalid `\u`
+ * escape sequence.
+ */
+bool StrScanner::_tryAppendEscapedChar(const char * const escapeSeqStartList)
+{
+    if (this->charsLeft() < 2) {
+        /* Need at least `\` and another character */
+        return false;
+    }
+
+    if (_mAt[0] != '\\') {
+        /* Not an escape sequence */
+        return false;
+    }
+
+    const auto escapedChar = _mAt[1];
+
+    /* `"` and `\` are always valid, whatever `escapeSeqStartList` */
+    auto isEscapeSeq = escapedChar == '"' || escapedChar == '\\';
+
+    /* Otherwise, try each character of `escapeSeqStartList` */
+    for (auto escapeSeqStart = escapeSeqStartList; !isEscapeSeq && *escapeSeqStart != '\0';
+         ++escapeSeqStart) {
+        isEscapeSeq = escapedChar == *escapeSeqStart;
+    }
+
+    if (!isEscapeSeq) {
+        /* `\` followed with an unaccepted character */
+        return false;
+    }
+
+    /* Escape sequence detected */
+    if (escapedChar == 'u') {
+        /* `\u` escape sequence */
+        if (this->charsLeft() < 6) {
+            /* Need `\u` + four hex characters */
+            BT_CPPLOGE_APPEND_CAUSE_AND_THROW(
+                bt2::Error, "[{}] `\\u` escape sequence needs four hexadecimal digits.",
+                this->_locStr());
+        }
+
+        this->_appendEscapedUnicodeChar(_mAt + 2);
+        this->_incrAt(6);
+        return true;
+    }
+
+    /* Single-character escape sequence */
+    switch (escapedChar) {
+    case 'a':
+        _mStrBuf.push_back('\a');
+        break;
+    case 'b':
+        _mStrBuf.push_back('\b');
+        break;
+    case 'f':
+        _mStrBuf.push_back('\f');
+        break;
+    case 'n':
+        _mStrBuf.push_back('\n');
+        break;
+    case 'r':
+        _mStrBuf.push_back('\r');
+        break;
+    case 't':
+        _mStrBuf.push_back('\t');
+        break;
+    case 'v':
+        _mStrBuf.push_back('\v');
+        break;
+    default:
+        /* As is */
+        _mStrBuf.push_back(escapedChar);
+        break;
+    }
+
+    this->_incrAt(2);
+    return true;
+}
+
+/*
+ * Tries to scan a double-quoted literal string at the current
+ * position, after skipping whitespaces, decoding the escape sequences
+ * which _tryAppendEscapedChar() accepts for `escapeSeqStartList`.
+ *
+ * On success, advances the current position past the closing `"` and
+ * returns a pointer to the internal string buffer which contains the
+ * decoded string: the next call overwrites this buffer.
+ *
+ * Returns `nullptr` when there's no literal string at the current
+ * position or when the closing `"` is missing; in both of those cases
+ * the position and line bookkeeping saved after skipping whitespaces
+ * are restored.
+ *
+ * Throws `bt2::Error` (after appending a cause) when the string
+ * contains a control character or an invalid `\u` escape sequence.
+ */
+const std::string *StrScanner::tryScanLitStr(const char * const escapeSeqStartList)
+{
+    this->skipWhitespaces();
+
+    /* Backup if we can't completely scan */
+    const auto initAt = _mAt;
+    const auto initLineBegin = _mLineBegin;
+    const auto initNbLines = _mNbLines;
+
+    const auto restoreInitState = [&] {
+        this->at(initAt);
+        _mLineBegin = initLineBegin;
+        _mNbLines = initNbLines;
+    };
+
+    /* First character: must be `"` */
+    const auto c = this->_tryScanAnyChar();
+
+    if (c < 0) {
+        return nullptr;
+    }
+
+    if (c != '"') {
+        /* Not a literal string */
+        restoreInitState();
+        return nullptr;
+    }
+
+    /* Reset string buffer */
+    _mStrBuf.clear();
+
+    /*
+     * Scan inner string, processing escape sequences during the
+     * process.
+     */
+    while (!this->isDone()) {
+        /*
+         * Check for illegal control character.
+         *
+         * std::iscntrl() requires a value which is representable as
+         * `unsigned char`: passing a negative `char` (any UTF-8 byte
+         * greater than 0x7f where `char` is signed) is undefined
+         * behaviour.
+         */
+        const auto uch = static_cast<unsigned char>(*_mAt);
+
+        if (std::iscntrl(uch)) {
+            BT_CPPLOGE_APPEND_CAUSE_AND_THROW(
+                bt2::Error, "[{}] Illegal control character 0x{:02x} in literal string.",
+                this->_locStr(), static_cast<unsigned int>(uch));
+        }
+
+        /* Try to append an escaped character first */
+        if (this->_tryAppendEscapedChar(escapeSeqStartList)) {
+            continue;
+        }
+
+        /* End of literal string? */
+        if (*_mAt == '"') {
+            /* Skip `"` */
+            this->_incrAt();
+            return &_mStrBuf;
+        }
+
+        /*
+         * Check for newline.
+         *
+         * NOTE(review): a newline is normally a control character,
+         * therefore rejected above; kept as a safety net.
+         */
+        this->_checkNewline();
+
+        /* Append regular character and go to next one */
+        _mStrBuf.push_back(*_mAt);
+        this->_incrAt();
+    }
+
+    /* Couldn't find end of string */
+    restoreInitState();
+    return nullptr;
+}
+
+/*
+ * Tries to scan the exact null-terminated string `token` at the
+ * current position, after skipping whitespaces.
+ *
+ * Returns `true` and leaves the current position just after the token
+ * on success. Otherwise, returns `false` and puts the current position
+ * back where it was after skipping whitespaces.
+ */
+bool StrScanner::tryScanToken(const char * const token) noexcept
+{
+    this->skipWhitespaces();
+
+    /* Where to come back to if the token doesn't completely match */
+    const auto startAt = _mAt;
+
+    for (auto expected = token; *expected != '\0'; ++expected) {
+        /*
+         * Fail if the wrapped string ends before the end of the
+         * token, or if the current character isn't the expected one.
+         */
+        if (_mAt == _mEnd || *_mAt != *expected) {
+            this->at(startAt);
+            return false;
+        }
+
+        this->_incrAt();
+    }
+
+    /* Whole token matched */
+    return true;
+}
+
+/*
+ * Tries to scan a real number (JSON format, with a fraction and/or
+ * exponent part) at the current position, after skipping whitespaces.
+ *
+ * On success, advances the current position past the number and
+ * returns its value. Returns `bt2s::nullopt`, without advancing past
+ * the skipped whitespaces, when there's no such number or when
+ * std::strtod() cannot represent it (range error).
+ */
+bt2s::optional<double> StrScanner::tryScanConstReal() noexcept
+{
+    this->skipWhitespaces();
+
+    /*
+     * Validate JSON number format (with fraction and/or exponent part).
+     *
+     * This is needed because std::strtod() accepts more formats which
+     * JSON doesn't support.
+     */
+    std::cmatch match;
+
+    if (!std::regex_search(_mAt, _mEnd, match, _mRealRegex)) {
+        return bt2s::nullopt;
+    }
+
+    /*
+     * Parse a null-terminated copy of the matched text only.
+     *
+     * std::strtod() needs a null-terminated string, whereas the
+     * wrapped string is only known to be bounded by `_mEnd`: parsing
+     * from `_mAt` directly could read past the end of the buffer.
+     */
+    const std::string realStr {match[0].first, match[0].second};
+    char *strEnd = nullptr;
+
+    /* Clear any stale error so that the `ERANGE` check is reliable */
+    errno = 0;
+
+    const auto val = std::strtod(realStr.c_str(), &strEnd);
+
+    if (val == HUGE_VAL || (val == 0 && strEnd == realStr.c_str()) || errno == ERANGE) {
+        /* Couldn't parse */
+        errno = 0;
+        return bt2s::nullopt;
+    }
+
+    /*
+     * Success: advance the character pointer by what std::strtod()
+     * consumed and return the value.
+     */
+    this->at(_mAt + (strEnd - realStr.c_str()));
+    return val;
+}
+
+} /* namespace bt2c */