Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: reduce std::string_view creation #59

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions include/ada/checkers.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ namespace ada::checkers {
return view.size() >= prefix.size() && (view.substr(0, prefix.size()) == prefix);
}

ada_really_inline constexpr bool is_ipv4(std::string_view view) noexcept;

} // namespace ada::checkers

#endif //ADA_CHECKERS_H
4 changes: 2 additions & 2 deletions include/ada/parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@

namespace ada::parser {
// first_percent should be = plain.find('%')
std::optional<std::string> to_ascii(std::string_view plain, bool be_strict, size_t first_percent);
bool to_ascii(std::optional<std::string>& out, std::string_view plain, bool be_strict, size_t first_percent);

bool parse_opaque_host(std::optional<std::string>& out, std::string_view input);
bool parse_ipv6(std::optional<std::string>& out, std::string_view input);
bool parse_ipv4(std::optional<std::string>& out, std::string_view input);
bool parse_host(std::optional<std::string>& out, std::string_view input, bool is_not_special, bool input_is_ascii);
bool parse_host(std::optional<std::string>& out, std::string_view input, bool is_not_special);

url parse_url(std::string_view user_input,
std::optional<ada::url> base_url = std::nullopt,
Expand Down
6 changes: 4 additions & 2 deletions include/ada/serializers.h
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
#ifndef ADA_SERIALIZERS_H
#define ADA_SERIALIZERS_H

#include "ada/common_defs.h"

#include <array>
#include <optional>
#include <string>

namespace ada::serializers {

size_t find_longest_sequence_of_ipv6_pieces(const std::array<uint16_t, 8> address) noexcept;
size_t find_longest_sequence_of_ipv6_pieces(const std::array<uint16_t, 8>& address) noexcept;

// An IPv6 address is a 128-bit unsigned integer that identifies a network address.
std::string ipv6(const std::array<uint16_t, 8> address) noexcept;
std::string ipv6(const std::array<uint16_t, 8>& address) noexcept;

// An IPv4 address is a 32-bit unsigned integer that identifies a network address.
std::string ipv4(const uint64_t address) noexcept;
Expand Down
2 changes: 2 additions & 0 deletions include/ada/state.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

#include "common_defs.h"

#include <string>

namespace ada {

enum class state {
Expand Down
2 changes: 2 additions & 0 deletions include/ada/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include "common_defs.h"
#include <string>
#include <optional>

namespace ada::unicode {

Expand All @@ -24,6 +25,7 @@ namespace ada::unicode {
std::string percent_decode(const std::string_view input, size_t first_percent);
std::string percent_encode(const std::string_view input, const uint8_t character_set[]);
void percent_encode_character(const char input, const uint8_t character_set[], std::string& out);
ada_really_inline bool to_lower_ascii_string(std::optional<std::string>& out, size_t first_percent) noexcept;

} // namespace ada::unicode

Expand Down
1 change: 1 addition & 0 deletions src/ada.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "ada.h"
#include "checkers.cpp"
#include "unicode.cpp"
#include "serializers.cpp"
#include "implementation.cpp"
Expand Down
21 changes: 21 additions & 0 deletions src/checkers.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#include "ada.h"
#include <algorithm>

namespace ada::checkers {

/**
 * Reports whether `view` "ends in a number": its last dot-separated label
 * (ignoring one trailing dot) is either made only of digits, or has a hex
 * prefix followed only by lowercase hex digits. The caller uses a true result
 * to decide that the host must go through IPv4 parsing.
 *
 * @param view host text to inspect; may be empty, in which case false is returned.
 * @return true if the last label looks like a decimal or hexadecimal number.
 */
ada_really_inline constexpr bool is_ipv4(std::string_view view) noexcept {
  // Ignore a single trailing dot. Checking back() on a non-empty view, rather
  // than comparing rfind('.') with size() - 1, matters for empty input: there
  // rfind returns npos and size() - 1 wraps around to the same value, which
  // would lead to remove_suffix(1) on an empty view (undefined behaviour).
  if(!view.empty() && view.back() == '.') {
    view.remove_suffix(1);
  }
  const size_t last_dot = view.rfind('.');
  const std::string_view number = (last_dot == std::string_view::npos) ? view : view.substr(last_dot+1);
  if(number.empty()) { return false; }
  /** Optimization opportunity: we have basically identified the last number of the
      ipv4 if we return true here. We might as well parse it and have at least one
      number parsed when we get to parse_ipv4. */
  if(std::all_of(number.begin(), number.end(), ada::checkers::is_digit)) { return true; }
  // The first two characters are skipped on the assumption that has_hex_prefix
  // matched a two-character prefix (presumably "0x" - confirm against checkers.h).
  return (checkers::has_hex_prefix(number) && std::all_of(number.begin()+2, number.end(), ada::unicode::is_lowercase_hex));
}

} // namespace ada::checkers
110 changes: 55 additions & 55 deletions src/parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <iostream>
#include <numeric>

#include <optional>
#include <string_view>
#include <unicode/utypes.h>
#include <unicode/uidna.h>
Expand All @@ -26,7 +27,7 @@ namespace ada::parser {
* to_ascii does not expect the input to be percent decoded. This is
* mostly used to conform with the test suite.
*/
std::optional<std::string> to_ascii(const std::string_view plain, const bool be_strict, size_t first_percent) {
bool to_ascii(std::optional<std::string>& out, const std::string_view plain, const bool be_strict, size_t first_percent) {
anonrig marked this conversation as resolved.
Show resolved Hide resolved
std::string percent_decoded_buffer;
std::string_view input = plain;
if(first_percent != std::string_view::npos) {
Expand All @@ -42,25 +43,25 @@ namespace ada::parser {

UIDNA* uidna = uidna_openUTS46(options, &status);
if (U_FAILURE(status)) {
return std::nullopt;
return false;
}

UIDNAInfo info = UIDNA_INFO_INITIALIZER;
std::string result(255, ' ');
out = std::string(255, 0);
int32_t length = uidna_nameToASCII_UTF8(uidna,
input.data(),
int32_t(input.length()),
result.data(), int32_t(result.capacity()),
out.value().data(), 255,
&info,
&status);

if (status == U_BUFFER_OVERFLOW_ERROR) {
status = U_ZERO_ERROR;
result.resize(length);
out.value().resize(length);
length = uidna_nameToASCII_UTF8(uidna,
input.data(),
int32_t(input.length()),
result.data(), int32_t(result.capacity()),
out.value().data(), length,
&info,
&status);
}
Expand All @@ -84,13 +85,16 @@ namespace ada::parser {
uidna_close(uidna);

if (U_FAILURE(status) || info.errors != 0 || length == 0) {
return std::nullopt;
out = std::nullopt;
return false;
}

result.resize(length);
if(std::any_of(result.begin(), result.end(), ada::unicode::is_forbidden_domain_code_point)) { return std::nullopt; }

return result;
out.value().resize(length);
if(std::any_of(out.value().begin(), out.value().end(), ada::unicode::is_forbidden_domain_code_point)) {
out = std::nullopt;
return false;
}
return true;
}

/**
Expand Down Expand Up @@ -363,11 +367,14 @@ namespace ada::parser {
/**
* @see https://url.spec.whatwg.org/#host-parsing
*/
bool parse_host(std::optional<std::string>& out, const std::string_view input, bool is_not_special, bool input_is_ascii) {
bool parse_host(std::optional<std::string>& out, const std::string_view input, bool is_not_special) {
//
// Note: this function assumes that `input` is not empty. Make sure we can
// guarantee that.
//
#if ADA_DEVELOP_MODE
if(input.empty()) { abort(); }
#endif
// If input starts with U+005B ([), then:
if (input[0] == '[') {
// If input does not end with U+005D (]), validation error, return failure.
Expand All @@ -389,44 +396,29 @@ namespace ada::parser {
// The most common case is an ASCII input, in which case we do not need to call the expensive 'to_ascii'
// if a few conditions are met: no '%' and no 'xn-' subsequence.
size_t first_percent = input.find('%');
// most input strings will be ASCII which may enable some optimizations.
bool is_ascii = !input.empty() && 128>(std::reduce(input.begin(), input.end(), uint8_t(input[0]), std::bit_or<uint8_t>()));

// if simple_case is true, there is a good chance we might be able to use the fast path.
bool simple_case = input_is_ascii && (first_percent == std::string_view::npos);

// This function attempts to convert an ASCII string to a lower-case version.
// Once the lower cased version has been materialized, we check for the presence
// of the substring 'xn-', if it is found (unlikely), we then call the expensive 'to_ascii'.
auto to_lower_ascii_string = [first_percent](std::string_view view) -> std::optional<std::string> {
if(std::any_of(view.begin(), view.end(), ada::unicode::is_forbidden_domain_code_point)) { return std::nullopt; }
std::string result(view);
std::transform(result.begin(), result.end(), result.begin(),[](char c) -> char {
return (uint8_t((c|0x20) - 0x61) <= 25 ? (c|0x20) : c);}
);
return (result.find("xn-") == std::string_view::npos) ? result : to_ascii(view, false, first_percent);
};
bool simple_case = (is_ascii && (first_percent == std::string_view::npos));

// This is required since `to_lower_ascii_string` assumes non-null `out` parameter
if (simple_case) {
out = input;
}

// In the simple case, we call to_lower_ascii_string above, or else, we fall back on the expensive case.
out = simple_case ? to_lower_ascii_string(input) : to_ascii(input, false, first_percent);
bool is_valid = (is_ascii && (first_percent == std::string_view::npos)) ?
unicode::to_lower_ascii_string(out, first_percent) :
to_ascii(out, input, false, first_percent);

// If asciiDomain is failure, validation error, return failure.
if (!out.has_value()) {
if (!out.has_value() || !is_valid) {
return false;
}

// If asciiDomain ends in a number, then return the result of IPv4 parsing asciiDomain.
auto is_ipv4 = [](std::string_view view) {
size_t last_dot = view.rfind('.');
if(last_dot == view.size() - 1) {
view.remove_suffix(1);
last_dot = view.rfind('.');
}
std::string_view number = (last_dot == std::string_view::npos) ? view : view.substr(last_dot+1);
if(number.empty()) { return false; }
/** Optimization opportunity: we have basically identified the last number of the
ipv4 if we return true here. We might as well parse it and have at least one
number parsed when we get to parse_ipv4. */
if(std::all_of(number.begin(), number.end(), ada::checkers::is_digit)) { return true; }
return (checkers::has_hex_prefix(number) && std::all_of(number.begin()+2, number.end(), ::ada::unicode::is_lowercase_hex));
};
if(is_ipv4(*out)) {
if(checkers::is_ipv4(*out)) {
return parse_ipv4(out, *out);
}
return true;
Expand All @@ -451,9 +443,6 @@ namespace ada::parser {
ada::url url = optional_url.has_value() ? std::move(optional_url.value()) : ada::url();
// From this point forward, optional_url should not be used.

// most input strings will be ASCII which may enable some optimizations.
const bool is_ascii = !user_input.empty() && 128>(std::reduce(user_input.begin(), user_input.end(), uint8_t(user_input[0]), std::bit_or<uint8_t>()));

std::string tmp_buffer;
std::string_view internal_input;
if(std::any_of(user_input.begin(), user_input.end(), ada::unicode::is_ascii_tab_or_newline)) {
Expand Down Expand Up @@ -487,13 +476,6 @@ namespace ada::parser {
pointer_start = state_override.has_value() ? internal_input.begin() : url_data.begin();
pointer_end = state_override.has_value() ? internal_input.end() : url_data.end();

// most URLs have no @. Having no @ tells us that we don't have to worry about AUTHORITY. Of course,
// we could have @ and still not have to worry about AUTHORITY.
// TODO: Instead of just collecting a bool, collect the location of the '@' and do something useful with it.
// TODO: We could do various processing early on, using a single pass over the string to collect
// information about it, e.g., telling us whether there is a @ and if so, where (or how many).
const bool contains_ampersand = (std::find(pointer_start, pointer_end, '@') != pointer_end);

// Let pointer be a pointer for input.
std::string_view::iterator pointer = pointer_start;

Expand Down Expand Up @@ -648,6 +630,13 @@ namespace ada::parser {
break;
}
case ada::state::AUTHORITY: {
// most URLs have no @. Having no @ tells us that we don't have to worry about AUTHORITY. Of course,
// we could have @ and still not have to worry about AUTHORITY.
// TODO: Instead of just collecting a bool, collect the location of the '@' and do something useful with it.
// TODO: We could do various processing early on, using a single pass over the string to collect
// information about it, e.g., telling us whether there is a @ and if so, where (or how many).
const bool contains_ampersand = (std::find(pointer, pointer_end, '@') != pointer_end);

if(!contains_ampersand) {
// TODO: This is a waste of time, we should never have arrived here.
pointer--;
Expand Down Expand Up @@ -925,7 +914,7 @@ namespace ada::parser {
}

// Let host be the result of host parsing buffer with url is not special.
url.is_valid = parse_host(url.host, host_view, !url.is_special(), is_ascii);
url.is_valid = parse_host(url.host, host_view, !url.is_special());

// Set url’s host to host, buffer to the empty string, and state to port state.
state = ada::state::PORT;
Expand All @@ -949,7 +938,12 @@ namespace ada::parser {
}

// Let host be the result of host parsing host_view with url is not special.
url.is_valid = parse_host(url.host, host_view, !url.is_special(), is_ascii);
if (host_view.empty()) {
url.is_valid = true;
url.host = "";
} else {
url.is_valid = parse_host(url.host, host_view, !url.is_special());
}

// Set url’s host to host, and state to path start state.
state = ada::state::PATH_START;
Expand Down Expand Up @@ -1003,6 +997,12 @@ namespace ada::parser {
if ((pointer == pointer_end) || ((*pointer != '/') && (*pointer != '\\'))) {
pointer--;
}

// Optimization: Avoiding going into PATH state improves the performance of urls ending with /.
if (std::next(pointer) == pointer_end) {
url.path = "/";
return url;
}
}
// Otherwise, if state override is not given and c is U+003F (?),
// set url’s query to the empty string and state to query state.
Expand Down Expand Up @@ -1137,7 +1137,7 @@ namespace ada::parser {
}
else {
// Let host be the result of host parsing buffer with url is not special.
url.is_valid = parse_host(url.host, file_host_buffer, !url.is_special(), is_ascii);
url.is_valid = parse_host(url.host, file_host_buffer, !url.is_special());

// If host is "localhost", then set host to the empty string.
if (url.host.has_value() && url.host.value() == "localhost") {
Expand Down
6 changes: 3 additions & 3 deletions src/serializers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@

namespace ada::serializers {

size_t find_longest_sequence_of_ipv6_pieces(const std::array<uint16_t, 8> address) noexcept {
size_t find_longest_sequence_of_ipv6_pieces(const std::array<uint16_t, 8>& address) noexcept {
size_t max_index = -1;
size_t max_length = 1;
size_t current_start = -1;
size_t current_length = 0;

for (size_t i = 0; i < address.size(); i++) {
for (size_t i = 0; i < 8; i++) {
if (address[i] != 0) {
if (current_length > max_length) {
max_index = current_start;
Expand All @@ -36,7 +36,7 @@ namespace ada::serializers {
/**
* @see https://url.spec.whatwg.org/#concept-ipv6-serializer
*/
std::string ipv6(const std::array<uint16_t, 8> address) noexcept {
std::string ipv6(const std::array<uint16_t, 8>& address) noexcept {
// Let output be the empty string.
std::string output{};

Expand Down
18 changes: 18 additions & 0 deletions src/unicode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -283,4 +283,22 @@ namespace ada::unicode {
}
}

// This function attempts to convert an ASCII string to a lower-case version, in place.
// Once the lower cased version has been materialized, we check for the presence
// of the substring 'xn-', if it is found (unlikely), we then call the expensive 'to_ascii'.
//
// `out` must already hold the string to process (checked only in ADA_DEVELOP_MODE);
// it is both the input and the output. `first_percent` is forwarded unchanged to
// 'to_ascii'. Returns false if the string contains a forbidden domain code point
// or if 'to_ascii' fails; returns true otherwise.
ada_really_inline bool to_lower_ascii_string(std::optional<std::string>& out, size_t first_percent) noexcept {
#if ADA_DEVELOP_MODE
  if(!out.has_value()) { abort(); }
#endif
  std::string& host = out.value();
  if(std::any_of(host.begin(), host.end(), ada::unicode::is_forbidden_domain_code_point)) { return false; }
  // Lower-case 'A'..'Z' by setting bit 0x20; every other byte is left untouched.
  std::transform(host.begin(), host.end(), host.begin(), [](char c) -> char {
    return (uint8_t((c|0x20) - 0x61) <= 25 ? (c|0x20) : c);}
  );
  if (host.find("xn-") == std::string::npos) {
    return true;
  }

  // 'to_ascii' assigns a fresh buffer to `out` before it reads its input, so a
  // view into `out` itself would dangle (or be overwritten). Hand it a private
  // copy of the lowered host instead.
  const std::string lowered(host);
  return ada::parser::to_ascii(out, lowered, false, first_percent);
}

} // namespace ada::unicode
7 changes: 4 additions & 3 deletions tests/wpt_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -203,14 +203,15 @@ bool toascii_encoding() {
} else if (element.type() == ondemand::json_type::object) {
ondemand::object object = element.get_object();
std::string_view input = object["input"];
auto output = ada::parser::to_ascii(input, false, input.find("%")).value_or("");
std::optional<std::string> output;
ada::parser::to_ascii(output, input, false, input.find('%'));
auto expected_output = object["output"];

if (expected_output.type() == ondemand::json_type::string) {
std::string_view stringified_output = expected_output.get_string();
TEST_ASSERT(output, stringified_output, "Should have been equal");
TEST_ASSERT(output.value_or(""), stringified_output, "Should have been equal");
} else if (expected_output.is_null()) {
TEST_ASSERT(output, "", "Should have been empty");
TEST_ASSERT(output.value_or(""), "", "Should have been empty");
}
}
}
Expand Down