Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: reduce std::string_view creation #59

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions include/ada/parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@

namespace ada::parser {
// first_percent should be = plain.find('%')
std::optional<std::string> to_ascii(std::string_view plain, bool be_strict, size_t first_percent);
bool to_ascii(std::optional<std::string>& out, std::string_view plain, bool be_strict, size_t first_percent);

bool parse_opaque_host(std::optional<std::string>& out, std::string_view input);
bool parse_ipv6(std::optional<std::string>& out, std::string_view input);
bool parse_ipv4(std::optional<std::string>& out, std::string_view input);
bool parse_host(std::optional<std::string>& out, std::string_view input, bool is_not_special, bool input_is_ascii);
bool parse_host(std::optional<std::string>& out, std::string_view input, bool is_not_special);

url parse_url(std::string_view user_input,
std::optional<ada::url> base_url = std::nullopt,
Expand Down
6 changes: 4 additions & 2 deletions include/ada/serializers.h
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
#ifndef ADA_SERIALIZERS_H
#define ADA_SERIALIZERS_H

#include "ada/common_defs.h"

#include <array>
#include <optional>
#include <string>

namespace ada::serializers {

size_t find_longest_sequence_of_ipv6_pieces(const std::array<uint16_t, 8> address) noexcept;
size_t find_longest_sequence_of_ipv6_pieces(const uint16_t* address) noexcept;

// An IPv6 address is a 128-bit unsigned integer that identifies a network address.
std::string ipv6(const std::array<uint16_t, 8> address) noexcept;
std::string ipv6(const uint16_t* address) noexcept;

// An IPv4 address is a 32-bit unsigned integer that identifies a network address.
std::string ipv4(const uint64_t address) noexcept;
Expand Down
2 changes: 2 additions & 0 deletions include/ada/state.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

#include "common_defs.h"

#include <string>

namespace ada {

enum class state {
Expand Down
2 changes: 2 additions & 0 deletions include/ada/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include "common_defs.h"
#include <string>
#include <optional>

namespace ada::unicode {

Expand All @@ -24,6 +25,7 @@ namespace ada::unicode {
std::string percent_decode(const std::string_view input, size_t first_percent);
std::string percent_encode(const std::string_view input, const uint8_t character_set[]);
void percent_encode_character(const char input, const uint8_t character_set[], std::string& out);
ada_really_inline bool to_lower_ascii_string(std::optional<std::string>& out, size_t first_percent) noexcept;

} // namespace ada::unicode

Expand Down
70 changes: 35 additions & 35 deletions src/parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <iostream>
#include <numeric>

#include <optional>
#include <string_view>
#include <unicode/utypes.h>
#include <unicode/uidna.h>
Expand All @@ -26,7 +27,7 @@ namespace ada::parser {
* to_ascii does not expect the input to be percent decoded. This is
* mostly used to conform with the test suite.
*/
std::optional<std::string> to_ascii(const std::string_view plain, const bool be_strict, size_t first_percent) {
bool to_ascii(std::optional<std::string>& out, const std::string_view plain, const bool be_strict, size_t first_percent) {
anonrig marked this conversation as resolved.
Show resolved Hide resolved
std::string percent_decoded_buffer;
std::string_view input = plain;
if(first_percent != std::string_view::npos) {
Expand All @@ -42,25 +43,25 @@ namespace ada::parser {

UIDNA* uidna = uidna_openUTS46(options, &status);
if (U_FAILURE(status)) {
return std::nullopt;
return false;
}

UIDNAInfo info = UIDNA_INFO_INITIALIZER;
std::string result(255, ' ');
out = std::string(255, 0);
int32_t length = uidna_nameToASCII_UTF8(uidna,
input.data(),
int32_t(input.length()),
result.data(), int32_t(result.capacity()),
out.value().data(), 255,
&info,
&status);

if (status == U_BUFFER_OVERFLOW_ERROR) {
status = U_ZERO_ERROR;
result.resize(length);
out.value().resize(length);
length = uidna_nameToASCII_UTF8(uidna,
input.data(),
int32_t(input.length()),
result.data(), int32_t(result.capacity()),
out.value().data(), length,
&info,
&status);
}
Expand All @@ -84,13 +85,16 @@ namespace ada::parser {
uidna_close(uidna);

if (U_FAILURE(status) || info.errors != 0 || length == 0) {
return std::nullopt;
out = std::nullopt;
return false;
}

result.resize(length);
if(std::any_of(result.begin(), result.end(), ada::unicode::is_forbidden_domain_code_point)) { return std::nullopt; }

return result;
out.value().resize(length);
if(std::any_of(out.value().begin(), out.value().end(), ada::unicode::is_forbidden_domain_code_point)) {
out = std::nullopt;
return false;
}
return true;
}

/**
Expand Down Expand Up @@ -356,22 +360,22 @@ namespace ada::parser {
return false;
}

out = ada::serializers::ipv6(address);
out = ada::serializers::ipv6(address.data());
return true;
}

/**
* @see https://url.spec.whatwg.org/#host-parsing
*/
bool parse_host(std::optional<std::string>& out, const std::string_view input, bool is_not_special, bool input_is_ascii) {
bool parse_host(std::optional<std::string>& out, const std::string_view input, bool is_not_special) {
//
// Note: this function assumes that input is not empty. Make sure we can
// guarantee that.
//
// If input starts with U+005B ([), then:
if (input[0] == '[') {
// If input does not end with U+005D (]), validation error, return failure.
if (input.back() != ']') {
if (input[input.size()-1] != ']') {
anonrig marked this conversation as resolved.
Show resolved Hide resolved
return false;
}

Expand All @@ -389,25 +393,24 @@ namespace ada::parser {
// The most common case is an ASCII input, in which case we do not need to call the expensive 'to_ascii'
// if a few conditions are met: no '%' and no 'xn-' subsequence.
size_t first_percent = input.find('%');
// most input strings will be ASCII which may enable some optimizations.
bool is_ascii = !input.empty() && 128>(std::reduce(input.begin(), input.end(), uint8_t(input[0]), std::bit_or<uint8_t>()));

// if simple_case is true, there is a good chance we might be able to use the fast path.
bool simple_case = input_is_ascii && (first_percent == std::string_view::npos);

// This function attempts to convert an ASCII string to a lower-case version.
// Once the lower cased version has been materialized, we check for the presence
// of the substring 'xn-', if it is found (unlikely), we then call the expensive 'to_ascii'.
auto to_lower_ascii_string = [first_percent](std::string_view view) -> std::optional<std::string> {
if(std::any_of(view.begin(), view.end(), ada::unicode::is_forbidden_domain_code_point)) { return std::nullopt; }
std::string result(view);
std::transform(result.begin(), result.end(), result.begin(),[](char c) -> char {
return (uint8_t((c|0x20) - 0x61) <= 25 ? (c|0x20) : c);}
);
return (result.find("xn-") == std::string_view::npos) ? result : to_ascii(view, false, first_percent);
};
bool simple_case = (is_ascii && (first_percent == std::string_view::npos));

// This is required since `to_lower_ascii_string` assumes non-null `out` parameter
if (simple_case) {
out = input;
}

// In the simple case, we call to_lower_ascii_string above, or else, we fall back on the expensive case.
out = simple_case ? to_lower_ascii_string(input) : to_ascii(input, false, first_percent);
bool is_valid = (is_ascii && (first_percent == std::string_view::npos)) ?
unicode::to_lower_ascii_string(out, first_percent) :
to_ascii(out, input, false, first_percent);

// If asciiDomain is failure, validation error, return failure.
if (!out.has_value()) {
if (!out.has_value() || !is_valid) {
return false;
}

Expand Down Expand Up @@ -451,9 +454,6 @@ namespace ada::parser {
ada::url url = optional_url.has_value() ? std::move(optional_url.value()) : ada::url();
// From this point forward, optional_url should not be used.

// most input strings will be ASCII which may enable some optimizations.
const bool is_ascii = !user_input.empty() && 128>(std::reduce(user_input.begin(), user_input.end(), uint8_t(user_input[0]), std::bit_or<uint8_t>()));

std::string tmp_buffer;
std::string_view internal_input;
if(std::any_of(user_input.begin(), user_input.end(), ada::unicode::is_ascii_tab_or_newline)) {
Expand Down Expand Up @@ -925,7 +925,7 @@ namespace ada::parser {
}

// Let host be the result of host parsing buffer with url is not special.
url.is_valid = parse_host(url.host, host_view, !url.is_special(), is_ascii);
url.is_valid = parse_host(url.host, host_view, !url.is_special());

// Set url’s host to host, buffer to the empty string, and state to port state.
state = ada::state::PORT;
Expand All @@ -949,7 +949,7 @@ namespace ada::parser {
}

// Let host be the result of host parsing host_view with url is not special.
url.is_valid = parse_host(url.host, host_view, !url.is_special(), is_ascii);
url.is_valid = parse_host(url.host, host_view, !url.is_special());

// Set url’s host to host, and state to path start state.
state = ada::state::PATH_START;
Expand Down Expand Up @@ -1137,7 +1137,7 @@ namespace ada::parser {
}
else {
// Let host be the result of host parsing buffer with url is not special.
url.is_valid = parse_host(url.host, file_host_buffer, !url.is_special(), is_ascii);
url.is_valid = parse_host(url.host, file_host_buffer, !url.is_special());

// If host is "localhost", then set host to the empty string.
if (url.host.has_value() && url.host.value() == "localhost") {
Expand Down
7 changes: 3 additions & 4 deletions src/serializers.cpp
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
#include <array>
#include <string>

namespace ada::serializers {

size_t find_longest_sequence_of_ipv6_pieces(const std::array<uint16_t, 8> address) noexcept {
size_t find_longest_sequence_of_ipv6_pieces(const uint16_t* address) noexcept {
size_t max_index = -1;
size_t max_length = 1;
size_t current_start = -1;
size_t current_length = 0;

for (size_t i = 0; i < address.size(); i++) {
for (size_t i = 0; i < 8; i++) {
if (address[i] != 0) {
if (current_length > max_length) {
max_index = current_start;
Expand All @@ -36,7 +35,7 @@ namespace ada::serializers {
/**
* @see https://url.spec.whatwg.org/#concept-ipv6-serializer
*/
std::string ipv6(const std::array<uint16_t, 8> address) noexcept {
std::string ipv6(const uint16_t* address) noexcept {
anonrig marked this conversation as resolved.
Show resolved Hide resolved
// Let output be the empty string.
std::string output{};

Expand Down
15 changes: 15 additions & 0 deletions src/unicode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -283,4 +283,19 @@ namespace ada::unicode {
}
}

// Lower-cases, in place, the ASCII host string held by `out`.
//
// On entry `out` must already contain the candidate host. Returns false,
// leaving `out` untouched, when `out` is empty or when the host contains a
// forbidden domain code point. Otherwise the host is lower-cased in place and:
//   - if it contains no "xn-" subsequence, returns true;
//   - otherwise (unlikely) the lower-cased host is handed to the expensive
//     ada::parser::to_ascii, whose return value is returned and which
//     writes its outcome into `out`.
//
// `first_percent` is forwarded unchanged to ada::parser::to_ascii.
ada_really_inline bool to_lower_ascii_string(std::optional<std::string>& out, size_t first_percent) noexcept {
  // This function is noexcept: calling value() on an empty optional would
  // throw and terminate the program, so report failure instead.
  if (!out.has_value()) { return false; }

  std::string& host = *out;
  if (std::any_of(host.begin(), host.end(), ada::unicode::is_forbidden_domain_code_point)) { return false; }

  std::transform(host.begin(), host.end(), host.begin(), [](char c) -> char {
    // OR-ing 0x20 maps 'A'..'Z' onto 'a'..'z'; the range test keeps every
    // other byte unchanged.
    return (uint8_t((c|0x20) - 0x61) <= 25 ? (c|0x20) : c);
  });

  if (host.find("xn-") == std::string::npos) {
    return true;
  }

  // to_ascii assigns a fresh buffer to `out` before it reads its input, so a
  // string_view into `out` would dangle. Hand it an independent copy instead.
  const std::string lowered = host;
  return ada::parser::to_ascii(out, lowered, false, first_percent);
}

} // namespace ada::unicode
7 changes: 4 additions & 3 deletions tests/wpt_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -203,14 +203,15 @@ bool toascii_encoding() {
} else if (element.type() == ondemand::json_type::object) {
ondemand::object object = element.get_object();
std::string_view input = object["input"];
auto output = ada::parser::to_ascii(input, false, input.find("%")).value_or("");
std::optional<std::string> output;
ada::parser::to_ascii(output, input, false, input.find("%"));
anonrig marked this conversation as resolved.
Show resolved Hide resolved
auto expected_output = object["output"];

if (expected_output.type() == ondemand::json_type::string) {
std::string_view stringified_output = expected_output.get_string();
TEST_ASSERT(output, stringified_output, "Should have been equal");
TEST_ASSERT(output.value_or(""), stringified_output, "Should have been equal");
} else if (expected_output.is_null()) {
TEST_ASSERT(output, "", "Should have been empty");
TEST_ASSERT(output.value_or(""), "", "Should have been empty");
}
}
}
Expand Down