ada-url · anonrig · Jan 16, 2023 · Jan 16, 2023 · Jan 16, 2023 · Jan 16, 2023
diff --git a/include/ada/parser.h b/include/ada/parser.h
@@ -10,12 +10,12 @@
 
 namespace ada::parser {
   // first_percent should be  = plain.find('%')
-  std::optional<std::string> to_ascii(std::string_view plain, bool be_strict, size_t first_percent);
+  bool to_ascii(std::optional<std::string>& out, std::string_view plain, bool be_strict, size_t first_percent);
 
   bool parse_opaque_host(std::optional<std::string>& out, std::string_view input);
   bool parse_ipv6(std::optional<std::string>& out, std::string_view input);
   bool parse_ipv4(std::optional<std::string>& out, std::string_view input);
-  bool parse_host(std::optional<std::string>& out, std::string_view input, bool is_not_special, bool input_is_ascii);
+  bool parse_host(std::optional<std::string>& out, std::string_view input, bool is_not_special);
 
   url parse_url(std::string_view user_input,
                 std::optional<ada::url> base_url = std::nullopt,

diff --git a/include/ada/serializers.h b/include/ada/serializers.h
@@ -1,16 +1,18 @@
 #ifndef ADA_SERIALIZERS_H
 #define ADA_SERIALIZERS_H
 
+#include "ada/common_defs.h"
+
 #include <array>
 #include <optional>
 #include <string>
 
 namespace ada::serializers {
 
-  size_t find_longest_sequence_of_ipv6_pieces(const std::array<uint16_t, 8> address) noexcept;
+  size_t find_longest_sequence_of_ipv6_pieces(const uint16_t* address) noexcept;
 
   // An IPv6 address is a 128-bit unsigned integer that identifies a network address.
-  std::string ipv6(const std::array<uint16_t, 8> address) noexcept;
+  std::string ipv6(const uint16_t* address) noexcept;
 
   // An IPv4 address is a 32-bit unsigned integer that identifies a network address.
   std::string ipv4(const uint64_t address) noexcept;

diff --git a/include/ada/state.h b/include/ada/state.h
@@ -3,6 +3,8 @@
 
 #include "common_defs.h"
 
+#include <string>
+
 namespace ada {
 
   enum class state {

diff --git a/include/ada/unicode.h b/include/ada/unicode.h
@@ -3,6 +3,7 @@
 
 #include "common_defs.h"
 #include <string>
+#include <optional>
 
 namespace ada::unicode {
 
@@ -24,6 +25,7 @@ namespace ada::unicode {
   std::string percent_decode(const std::string_view input, size_t first_percent);
   std::string percent_encode(const std::string_view input, const uint8_t character_set[]);
   void percent_encode_character(const char input, const uint8_t character_set[], std::string& out);
+  ada_really_inline bool to_lower_ascii_string(std::optional<std::string>& out, size_t first_percent) noexcept;
 
 } // namespace ada::unicode
 

diff --git a/src/parser.cpp b/src/parser.cpp
@@ -12,6 +12,7 @@
 #include <iostream>
 #include <numeric>
 
+#include <optional>
 #include <string_view>
 #include <unicode/utypes.h>
 #include <unicode/uidna.h>
@@ -26,7 +27,7 @@ namespace ada::parser {
    * to_ascii does not expect the input to be percent decoded. This is
    * mostly used to conform with the test suite.
    */
-  std::optional<std::string> to_ascii(const std::string_view plain, const bool be_strict, size_t first_percent) {
+  bool to_ascii(std::optional<std::string>& out, const std::string_view plain, const bool be_strict, size_t first_percent) {
     std::string percent_decoded_buffer;
     std::string_view input = plain;
     if(first_percent != std::string_view::npos) {
@@ -42,25 +43,25 @@ namespace ada::parser {
 
     UIDNA* uidna = uidna_openUTS46(options, &status);
     if (U_FAILURE(status)) {
-      return std::nullopt;
+      return false;
     }
 
     UIDNAInfo info = UIDNA_INFO_INITIALIZER;
-    std::string result(255, ' ');
+    out = std::string(255, 0);
     int32_t length = uidna_nameToASCII_UTF8(uidna,
                                          input.data(),
                                          int32_t(input.length()),
-                                         result.data(), int32_t(result.capacity()),
+                                         out.value().data(), 255,
                                          &info,
                                          &status);
 
     if (status == U_BUFFER_OVERFLOW_ERROR) {
       status = U_ZERO_ERROR;
-      result.resize(length);
+      out.value().resize(length);
       length = uidna_nameToASCII_UTF8(uidna,
                                      input.data(),
                                      int32_t(input.length()),
-                                     result.data(), int32_t(result.capacity()),
+                                     out.value().data(), length,
                                      &info,
                                      &status);
     }
@@ -84,13 +85,16 @@ namespace ada::parser {
     uidna_close(uidna);
 
     if (U_FAILURE(status) || info.errors != 0 || length == 0) {
-      return std::nullopt;
+      out = std::nullopt;
+      return false;
     }
 
-    result.resize(length);
-    if(std::any_of(result.begin(), result.end(), ada::unicode::is_forbidden_domain_code_point)) { return std::nullopt; }
-
-    return result;
+    out.value().resize(length);
+    if(std::any_of(out.value().begin(), out.value().end(), ada::unicode::is_forbidden_domain_code_point)) {
+      out = std::nullopt;
+      return false;
+    }
+    return true;
   }
 
   /**
@@ -356,22 +360,22 @@ namespace ada::parser {
       return false;
     }
 
-    out = ada::serializers::ipv6(address);
+    out = ada::serializers::ipv6(address.data());
     return true;
   }
 
   /**
    * @see https://url.spec.whatwg.org/#host-parsing
    */
-  bool parse_host(std::optional<std::string>& out, const std::string_view input, bool is_not_special, bool input_is_ascii) {
+  bool parse_host(std::optional<std::string>& out, const std::string_view input, bool is_not_special) {
     //
     // Note: this function assumes that parse_host is not empty. Make sure we can
     // guarantee that.
     //
     // If input starts with U+005B ([), then:
     if (input[0] == '[') {
       // If input does not end with U+005D (]), validation error, return failure.
-      if (input.back() != ']') {
+      if (input[input.size()-1] != ']') {
         return false;
       }
 
@@ -389,25 +393,24 @@ namespace ada::parser {
     // The most common case is an ASCII input, in which case we do not need to call the expensive 'to_ascii'
     // if a few conditions are met: no '%' and no 'xn-' subsequence.
     size_t first_percent = input.find('%');
+    // most input strings will be ASCII which may enable some optimizations.
+    bool is_ascii = !input.empty() && 128>(std::reduce(input.begin(), input.end(), uint8_t(input[0]), std::bit_or<uint8_t>()));
+
     // if simple_case is true, there is a good chance we might be able to use the fast path.
-    bool simple_case = input_is_ascii && (first_percent == std::string_view::npos);
-
-    // This function attemps to convert an ASCII string to a lower-case version.
-    // Once the lower cased version has been materialized, we check for the presence
-    // of the substring 'xn-', if it is found (unlikely), we then call the expensive 'to_ascii'.
-    auto to_lower_ascii_string = [first_percent](std::string_view view) -> std::optional<std::string> {
-      if(std::any_of(view.begin(), view.end(), ada::unicode::is_forbidden_domain_code_point)) { return std::nullopt; }
-      std::string result(view);
-      std::transform(result.begin(), result.end(), result.begin(),[](char c) -> char {
-        return (uint8_t((c|0x20) - 0x61) <= 25 ? (c|0x20) : c);}
-      );
-      return (result.find("xn-") == std::string_view::npos) ? result : to_ascii(view, false, first_percent);
-    };
+    bool simple_case = (is_ascii && (first_percent == std::string_view::npos));
+
+    // This is required since `to_lower_ascii_string` assumes non-null `out` parameter
+    if (simple_case) {
+      out = input;
+    }
+
     // In the simple case, we call to_lower_ascii_string above, or else, we fall back on the expensive case.
-    out = simple_case ? to_lower_ascii_string(input) : to_ascii(input, false, first_percent);
+    bool is_valid = (is_ascii && (first_percent == std::string_view::npos)) ?
+      unicode::to_lower_ascii_string(out, first_percent) :
+      to_ascii(out, input, false, first_percent);
 
     // If asciiDomain is failure, validation error, return failure.
-    if (!out.has_value()) {
+    if (!out.has_value() || !is_valid) {
       return false;
     }
 
@@ -451,9 +454,6 @@ namespace ada::parser {
     ada::url url = optional_url.has_value() ? std::move(optional_url.value()) : ada::url();
     // From this point forward, optional_url should not be used.
 
-    // most input strings will be ASCII which may enable some optimizations.
-    const bool is_ascii = !user_input.empty() && 128>(std::reduce(user_input.begin(), user_input.end(), uint8_t(user_input[0]), std::bit_or<uint8_t>()));
-
     std::string tmp_buffer;
     std::string_view internal_input;
     if(std::any_of(user_input.begin(), user_input.end(), ada::unicode::is_ascii_tab_or_newline)) {
@@ -925,7 +925,7 @@ namespace ada::parser {
             }
 
             // Let host be the result of host parsing buffer with url is not special.
-            url.is_valid = parse_host(url.host, host_view, !url.is_special(), is_ascii);
+            url.is_valid = parse_host(url.host, host_view, !url.is_special());
 
             // Set url’s host to host, buffer to the empty string, and state to port state.
             state = ada::state::PORT;
@@ -949,7 +949,7 @@ namespace ada::parser {
             }
 
             // Let host be the result of host parsing host_view with url is not special.
-            url.is_valid = parse_host(url.host, host_view, !url.is_special(), is_ascii);
+            url.is_valid = parse_host(url.host, host_view, !url.is_special());
 
             // Set url’s host to host, and state to path start state.
             state = ada::state::PATH_START;
@@ -1137,7 +1137,7 @@ namespace ada::parser {
           }
           else {
             // Let host be the result of host parsing buffer with url is not special.
-            url.is_valid = parse_host(url.host, file_host_buffer, !url.is_special(), is_ascii);
+            url.is_valid = parse_host(url.host, file_host_buffer, !url.is_special());
 
             // If host is "localhost", then set host to the empty string.
             if (url.host.has_value() && url.host.value() == "localhost") {

diff --git a/src/serializers.cpp b/src/serializers.cpp
@@ -1,15 +1,14 @@
-#include <array>
 #include <string>
 
 namespace ada::serializers {
 
-  size_t find_longest_sequence_of_ipv6_pieces(const std::array<uint16_t, 8> address) noexcept {
+  size_t find_longest_sequence_of_ipv6_pieces(const uint16_t* address) noexcept {
     size_t max_index = -1;
     size_t max_length = 1;
     size_t current_start = -1;
     size_t current_length = 0;
 
-    for (size_t i = 0; i < address.size(); i++) {
+    for (size_t i = 0; i < 8; i++) {
       if (address[i] != 0) {
         if (current_length > max_length) {
           max_index = current_start;
@@ -36,7 +35,7 @@ namespace ada::serializers {
   /**
    * @see https://url.spec.whatwg.org/#concept-ipv6-serializer
    */
-  std::string ipv6(const std::array<uint16_t, 8> address) noexcept {
+  std::string ipv6(const uint16_t* address) noexcept {
     // Let output be the empty string.
     std::string output{};
 

diff --git a/src/unicode.cpp b/src/unicode.cpp
@@ -283,4 +283,19 @@ namespace ada::unicode {
       }
   }
 
+  // Lower-cases, in place, the ASCII host text held in `out` (which must hold a value).
+  // Returns false if it contains a forbidden domain code point. If the lower-cased text
+  // contains 'xn-' (unlikely), the result of the expensive 'to_ascii' is returned instead.
+  ada_really_inline bool to_lower_ascii_string(std::optional<std::string>& out, size_t first_percent) noexcept {
+    std::string& host = out.value();
+    if(std::any_of(host.begin(), host.end(), ada::unicode::is_forbidden_domain_code_point)) { return false; }
+    std::transform(host.begin(), host.end(), host.begin(), [](char c) -> char {
+      return (uint8_t((c|0x20) - 0x61) <= 25 ? (c|0x20) : c); });
+    if (host.find("xn-") == std::string_view::npos) { return true; }
+    // to_ascii reassigns `out` before it reads its input view, so a view into `out`
+    // would dangle; hand it a private copy that outlives the call (rare path).
+    const std::string lowered(host);
+    return ada::parser::to_ascii(out, lowered, false, first_percent);
+  }
+
 } // namespace ada::unicode
diff --git a/tests/wpt_tests.cpp b/tests/wpt_tests.cpp
@@ -203,14 +203,15 @@ bool toascii_encoding() {
     } else if (element.type() == ondemand::json_type::object) {
       ondemand::object object = element.get_object();
       std::string_view input = object["input"];
-      auto output = ada::parser::to_ascii(input, false, input.find("%")).value_or("");
+      std::optional<std::string> output;
+      ada::parser::to_ascii(output, input, false, input.find("%"));
       auto expected_output = object["output"];
 
       if (expected_output.type() == ondemand::json_type::string) {
         std::string_view stringified_output = expected_output.get_string();
-        TEST_ASSERT(output, stringified_output, "Should have been equal");
+        TEST_ASSERT(output.value_or(""), stringified_output, "Should have been equal");
       } else if (expected_output.is_null()) {
-        TEST_ASSERT(output, "", "Should have been empty");
+        TEST_ASSERT(output.value_or(""), "", "Should have been empty");
       }
     }
   }