forked from efficios/babeltrace
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
src/cpp-common: add
bt2c::StrScanner
class
This patch adds the `bt2c::StrScanner` class, defined in `str-scanner.hpp` and implemented in `str-scanner.cpp`. A string scanner is a simple lexical scanner. This one is a stripped-down version of yactfr's `yactfr::internal::StrScanner`, stripped-down because yactfr uses this to parse TSDL, therefore it needs more features. What's left for the `bt2c::StrScanner` version is: tryScanLitStr(): Tries to scan a double-quoted literal string, possibly containing escape sequences. tryScanConstInt(): Tries to scan a constant unsigned or signed decimal integer string. tryScanConstReal(): Tries to scan a real number string. tryScanToken(): Tries to scan an exact string. skipWhitespaces(): Skips the next whitespaces. See the specific comments in `str-scanner.hpp` for more details. I could have used the `GScanner` API [1], as we do to parse the value of the `--params` CLI option of `babeltrace2`, but: * `yactfr::internal::StrScanner` is already working, tested, and documented. `bt2c::StrScanner` is a much lighter version of it. * Should we ever make an effort to remove the GLib dependency, this part will already be done. * The `GScanner` API doesn't support `\u` escape sequences in literal strings (needed for JSON strings) out of the box, so we'd need this part on our side anyway. `bt2c::StrScanner` could eventually replace `GScanner` elsewhere in the tree, but it would require a few more features (which already exist in `yactfr::internal::StrScanner`). This is part of an effort to implement a JSON parser to support CTF2‑SPECRC‑4.0 [2]. [1]: https://docs.gtk.org/glib/struct.Scanner.html [2]: https://diamon.org/ctf/files/CTF2-SPECRC-4.0.html Signed-off-by: Philippe Proulx <eeppeliteloop@gmail.com> Change-Id: I8317917124218618278611794f32a67be4b9a6dd Reviewed-on: https://review.lttng.org/c/babeltrace/+/7410
- Loading branch information
Showing
3 changed files
with
773 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,306 @@ | ||
/* | ||
* Copyright (c) 2015-2022 Philippe Proulx <pproulx@efficios.com> | ||
* | ||
* SPDX-License-Identifier: MIT | ||
*/ | ||
|
||
#include <cmath> | ||
#include <regex> | ||
|
||
#include "str-scanner.hpp" | ||
|
||
namespace bt2c { | ||
|
||
/*
 * Regular expression which matches, at the very beginning of the
 * target, a JSON-style real number: an integer part which MUST be
 * followed by a fraction part, an exponent part, or both.
 *
 * The lookahead assertion is what rejects plain integers. It accepts
 * an exponent having an explicit sign (for example `1e+5` or `2E-3`)
 * so that it agrees with the optional exponent part which follows.
 */
const std::regex StrScanner::_mRealRegex {
    "^"                       /* Start of target */
    "-?"                      /* Optional negation */
    "(?:0|[1-9]\\d*)"         /* Integer part */
    "(?=\\.\\d|[eE][+-]?\\d)" /* Assertion: need fraction/exponent part */
    "(?:\\.\\d+)?"            /* Optional fraction part */
    "(?:[eE][+-]?\\d+)?",     /* Optional exponent part */
    std::regex::optimize};
|
||
StrScanner::StrScanner(const char * const begin, const char * const end, | ||
const std::size_t baseOffset, const bt2c::Logger& logger, | ||
const TextLocStrFmt textLocStrFmt) : | ||
_mBegin {begin}, | ||
_mEnd {end}, _mAt {begin}, _mLineBegin {begin}, | ||
_mBaseOffset {baseOffset}, _mLogger {logger, "STR-SCANNER"}, _mTextLocStrFmt {textLocStrFmt} | ||
{ | ||
} | ||
|
||
StrScanner::StrScanner(const char * const begin, const char * const end, const bt2c::Logger& logger, | ||
const TextLocStrFmt textLocStrFmt) : | ||
StrScanner {begin, end, 0, logger, textLocStrFmt} | ||
{ | ||
} | ||
|
||
void StrScanner::reset() | ||
{ | ||
this->at(_mBegin); | ||
_mNbLines = 0; | ||
_mLineBegin = _mBegin; | ||
} | ||
|
||
void StrScanner::skipWhitespaces() noexcept | ||
{ | ||
while (!this->isDone()) { | ||
switch (*_mAt) { | ||
case '\n': | ||
this->_checkNewline(); | ||
/* Fall through */ | ||
case ' ': | ||
case '\t': | ||
case '\v': | ||
case '\r': | ||
this->_incrAt(); | ||
break; | ||
default: | ||
return; | ||
} | ||
} | ||
} | ||
|
||
std::string StrScanner::_locStr() const | ||
{ | ||
return textLocStr(this->loc(), _mTextLocStrFmt); | ||
} | ||
|
||
void StrScanner::_appendEscapedUnicodeChar(const char * const at) | ||
{ | ||
/* Create array of four hex characters */ | ||
const std::string hexCpBuf {at, at + 4}; | ||
|
||
/* Validate hex characters */ | ||
for (const auto ch : hexCpBuf) { | ||
if (!std::isxdigit(ch)) { | ||
BT_CPPLOGE_APPEND_CAUSE_AND_THROW( | ||
bt2::Error, "[{}] In `\\u` escape sequence: unexpected character `{:c}`.", | ||
this->_locStr(), ch); | ||
} | ||
} | ||
|
||
/* Convert hex characters to integral codepoint (always works) */ | ||
const auto cp = std::strtoull(hexCpBuf.data(), nullptr, 16); | ||
|
||
/* | ||
* Append UTF-8 bytes from integral codepoint. | ||
* | ||
* See <https://en.wikipedia.org/wiki/UTF-8#Encoding>. | ||
*/ | ||
if (cp <= 0x7f) { | ||
_mStrBuf.push_back(cp); | ||
} else if (cp <= 0x7ff) { | ||
_mStrBuf.push_back(static_cast<char>((cp >> 6) + 0xc0)); | ||
_mStrBuf.push_back(static_cast<char>((cp & 0x3f) + 0x80)); | ||
} else if (cp > 0xd800 && cp <= 0xdfff) { | ||
/* Unsupported surrogate pairs */ | ||
BT_CPPLOGE_APPEND_CAUSE_AND_THROW( | ||
bt2::Error, "[{}] In `\\u` escape sequence: unsupported surrogate codepoint U+{:x}.", | ||
this->_locStr(), static_cast<unsigned int>(cp)); | ||
} else { | ||
BT_ASSERT(cp <= 0xffff); | ||
_mStrBuf.push_back(static_cast<char>((cp >> 12) + 0xe0)); | ||
_mStrBuf.push_back(static_cast<char>(((cp >> 6) & 0x3f) + 0x80)); | ||
_mStrBuf.push_back(static_cast<char>((cp & 0x3f) + 0x80)); | ||
} | ||
} | ||
|
||
bool StrScanner::_tryAppendEscapedChar(const char * const escapeSeqStartList) | ||
{ | ||
if (this->charsLeft() < 2) { | ||
/* Need at least `\` and another character */ | ||
return false; | ||
} | ||
|
||
if (_mAt[0] != '\\') { | ||
/* Not an escape sequence */ | ||
return false; | ||
} | ||
|
||
auto escapeSeqStart = escapeSeqStartList; | ||
|
||
/* Try each character of `escapeSeqStartList` */ | ||
while (*escapeSeqStart != '\0') { | ||
if (_mAt[1] == '"' || _mAt[1] == '\\' || _mAt[1] == *escapeSeqStart) { | ||
/* Escape sequence detected */ | ||
if (_mAt[1] == 'u') { | ||
/* `\u` escape sequence */ | ||
if (this->charsLeft() < 6) { | ||
/* Need `\u` + four hex characters */ | ||
BT_CPPLOGE_APPEND_CAUSE_AND_THROW( | ||
bt2::Error, "[{}] `\\u` escape sequence needs four hexadecimal digits.", | ||
this->_locStr()); | ||
} | ||
|
||
this->_appendEscapedUnicodeChar(_mAt + 2); | ||
this->_incrAt(6); | ||
} else { | ||
/* Single-character escape sequence */ | ||
switch (_mAt[1]) { | ||
case 'a': | ||
_mStrBuf.push_back('\a'); | ||
break; | ||
case 'b': | ||
_mStrBuf.push_back('\b'); | ||
break; | ||
case 'f': | ||
_mStrBuf.push_back('\f'); | ||
break; | ||
case 'n': | ||
_mStrBuf.push_back('\n'); | ||
break; | ||
case 'r': | ||
_mStrBuf.push_back('\r'); | ||
break; | ||
case 't': | ||
_mStrBuf.push_back('\t'); | ||
break; | ||
case 'v': | ||
_mStrBuf.push_back('\v'); | ||
break; | ||
default: | ||
/* As is */ | ||
_mStrBuf.push_back(_mAt[1]); | ||
break; | ||
} | ||
|
||
this->_incrAt(2); | ||
} | ||
|
||
return true; | ||
} | ||
|
||
++escapeSeqStart; | ||
} | ||
|
||
return false; | ||
} | ||
|
||
const std::string *StrScanner::tryScanLitStr(const char * const escapeSeqStartList) | ||
{ | ||
this->skipWhitespaces(); | ||
|
||
/* Backup if we can't completely scan */ | ||
const auto initAt = _mAt; | ||
const auto initLineBegin = _mLineBegin; | ||
const auto initNbLines = _mNbLines; | ||
|
||
/* First character: `"` or alpha */ | ||
auto c = this->_tryScanAnyChar(); | ||
|
||
if (c < 0) { | ||
return nullptr; | ||
} | ||
|
||
if (c != '"') { | ||
/* Not a literal string */ | ||
this->at(initAt); | ||
_mLineBegin = initLineBegin; | ||
_mNbLines = initNbLines; | ||
return nullptr; | ||
} | ||
|
||
/* Reset string buffer */ | ||
_mStrBuf.clear(); | ||
|
||
/* | ||
* Scan inner string, processing escape sequences during the | ||
* process. | ||
*/ | ||
while (!this->isDone()) { | ||
/* Check for illegal control character */ | ||
if (std::iscntrl(*_mAt)) { | ||
BT_CPPLOGE_APPEND_CAUSE_AND_THROW( | ||
bt2::Error, "[{}] Illegal control character 0x{:02x} in literal string.", | ||
this->_locStr(), static_cast<unsigned int>(*_mAt)); | ||
} | ||
|
||
/* Try to append an escaped character first */ | ||
if (this->_tryAppendEscapedChar(escapeSeqStartList)) { | ||
continue; | ||
} | ||
|
||
/* End of literal string? */ | ||
if (*_mAt == '"') { | ||
/* Skip `"` */ | ||
this->_incrAt(); | ||
return &_mStrBuf; | ||
} | ||
|
||
/* Check for newline */ | ||
this->_checkNewline(); | ||
|
||
/* Append regular character and go to next one */ | ||
_mStrBuf.push_back(*_mAt); | ||
this->_incrAt(); | ||
} | ||
|
||
/* Couldn't find end of string */ | ||
this->at(initAt); | ||
_mLineBegin = initLineBegin; | ||
_mNbLines = initNbLines; | ||
return nullptr; | ||
} | ||
|
||
bool StrScanner::tryScanToken(const char * const token) noexcept | ||
{ | ||
this->skipWhitespaces(); | ||
|
||
/* Backup if we can't completely scan */ | ||
const auto initAt = _mAt; | ||
|
||
/* Try to scan token completely */ | ||
auto tokenAt = token; | ||
|
||
while (*tokenAt != '\0' && _mAt != _mEnd) { | ||
if (*_mAt != *tokenAt) { | ||
/* Mismatch */ | ||
this->at(initAt); | ||
return false; | ||
} | ||
|
||
this->_incrAt(); | ||
++tokenAt; | ||
} | ||
|
||
if (*tokenAt != '\0') { | ||
/* Wrapped string ends before end of token */ | ||
this->at(initAt); | ||
return false; | ||
} | ||
|
||
/* Success */ | ||
return true; | ||
} | ||
|
||
/*
 * Tries to scan a real number at the current position, after having
 * skipped any leading whitespace.
 *
 * The number must have the JSON number format with a fraction part, an
 * exponent part, or both: a plain integer isn't a real number for this
 * method.
 *
 * Returns the value, with the current position moved just after the
 * number, on success. Returns `bt2s::nullopt` if there's no real
 * number at the current position or if std::strtod() reports that its
 * value is out of range. In that case the current position is the one
 * following the skipped whitespace.
 *
 * NOTE(review): std::strtod() honours the current C locale. This
 * assumes that its decimal-point character is `.`; confirm against
 * the callers.
 */
bt2s::optional<double> StrScanner::tryScanConstReal() noexcept
{
    this->skipWhitespaces();

    /*
     * Validate JSON number format (with fraction and/or exponent part).
     *
     * This is needed because std::strtod() accepts more formats which
     * JSON doesn't support.
     */
    std::cmatch match;

    if (!std::regex_search(_mAt, _mEnd, match, _mRealRegex)) {
        return bt2s::nullopt;
    }

    /*
     * Parse a null-terminated copy of the matched characters only.
     *
     * std::strtod() needs a null-terminated string and knows nothing
     * about `_mEnd`: parsing from `_mAt` directly could read beyond
     * the end of the wrapped string.
     */
    const std::string realStr {match[0].first, match[0].second};
    char *strEnd = nullptr;

    /* Clear any stale error so that `ERANGE` below is really ours */
    errno = 0;

    const auto val = std::strtod(realStr.c_str(), &strEnd);
    const auto parsedLen = strEnd - realStr.c_str();

    if (val == HUGE_VAL || val == -HUGE_VAL || parsedLen == 0 || errno == ERANGE) {
        /* Couldn't parse */
        errno = 0;
        return bt2s::nullopt;
    }

    /* Success: update character pointer and return value */
    this->at(_mAt + parsedLen);
    return val;
}
|
||
} /* namespace bt2c */ |
Oops, something went wrong.