ada-url · anonrig · Jan 16, 2023 · Jan 16, 2023 · Jan 16, 2023 · Jan 16, 2023
diff --git a/include/ada/checkers.h b/include/ada/checkers.h
@@ -54,6 +54,8 @@ namespace ada::checkers {
     return view.size() >= prefix.size() && (view.substr(0, prefix.size()) == prefix);
   }
 
+  ada_really_inline constexpr bool is_ipv4(std::string_view view) noexcept;
+
 } // namespace ada::checkers
 
 #endif //ADA_CHECKERS_H
diff --git a/include/ada/parser.h b/include/ada/parser.h
@@ -10,12 +10,12 @@
 
 namespace ada::parser {
   // first_percent should be  = plain.find('%')
-  std::optional<std::string> to_ascii(std::string_view plain, bool be_strict, size_t first_percent);
+  bool to_ascii(std::optional<std::string>& out, std::string_view plain, bool be_strict, size_t first_percent);
 
   bool parse_opaque_host(std::optional<std::string>& out, std::string_view input);
   bool parse_ipv6(std::optional<std::string>& out, std::string_view input);
   bool parse_ipv4(std::optional<std::string>& out, std::string_view input);
-  bool parse_host(std::optional<std::string>& out, std::string_view input, bool is_not_special, bool input_is_ascii);
+  bool parse_host(std::optional<std::string>& out, std::string_view input, bool is_not_special);
 
   url parse_url(std::string_view user_input,
                 std::optional<ada::url> base_url = std::nullopt,

diff --git a/include/ada/serializers.h b/include/ada/serializers.h
@@ -1,16 +1,18 @@
 #ifndef ADA_SERIALIZERS_H
 #define ADA_SERIALIZERS_H
 
+#include "ada/common_defs.h"
+
 #include <array>
 #include <optional>
 #include <string>
 
 namespace ada::serializers {
 
-  size_t find_longest_sequence_of_ipv6_pieces(const std::array<uint16_t, 8> address) noexcept;
+  size_t find_longest_sequence_of_ipv6_pieces(const std::array<uint16_t, 8>& address) noexcept;
 
   // An IPv6 address is a 128-bit unsigned integer that identifies a network address.
-  std::string ipv6(const std::array<uint16_t, 8> address) noexcept;
+  std::string ipv6(const std::array<uint16_t, 8>& address) noexcept;
 
   // An IPv4 address is a 32-bit unsigned integer that identifies a network address.
   std::string ipv4(const uint64_t address) noexcept;

diff --git a/include/ada/state.h b/include/ada/state.h
@@ -3,6 +3,8 @@
 
 #include "common_defs.h"
 
+#include <string>
+
 namespace ada {
 
   enum class state {

diff --git a/include/ada/unicode.h b/include/ada/unicode.h
@@ -3,6 +3,7 @@
 
 #include "common_defs.h"
 #include <string>
+#include <optional>
 
 namespace ada::unicode {
 
@@ -24,6 +25,7 @@ namespace ada::unicode {
   std::string percent_decode(const std::string_view input, size_t first_percent);
   std::string percent_encode(const std::string_view input, const uint8_t character_set[]);
   void percent_encode_character(const char input, const uint8_t character_set[], std::string& out);
+  ada_really_inline bool to_lower_ascii_string(std::optional<std::string>& out, size_t first_percent) noexcept;
 
 } // namespace ada::unicode
 

diff --git a/src/ada.cpp b/src/ada.cpp
@@ -1,4 +1,5 @@
 #include "ada.h"
+#include "checkers.cpp"
 #include "unicode.cpp"
 #include "serializers.cpp"
 #include "implementation.cpp"

diff --git a/src/checkers.cpp b/src/checkers.cpp
@@ -0,0 +1,37 @@
+#include "ada.h"
+#include <algorithm>
+
+namespace ada::checkers {
+
+  /**
+   * Reports whether the last dot-separated label of `view` looks like a number,
+   * i.e. whether the host has to go through IPv4 parsing.
+   *
+   * One trailing dot is ignored. The last label qualifies when every character
+   * satisfies checkers::is_digit, or when it passes checkers::has_hex_prefix and
+   * everything after the two prefix characters satisfies
+   * unicode::is_lowercase_hex.
+   *
+   * @param view host text to inspect; an empty view yields false.
+   * @return true if the last label is numeric as described above.
+   */
+  ada_really_inline constexpr bool is_ipv4(std::string_view view) noexcept {
+    // Guard the empty case explicitly: rfind() would return npos, which equals
+    // size() - 1 after unsigned wrap-around, and the remove_suffix(1) below would
+    // then run on an empty view (undefined behavior).
+    if(view.empty()) { return false; }
+    size_t last_dot = view.rfind('.');
+    if(last_dot == view.size() - 1) {
+      view.remove_suffix(1);
+      last_dot = view.rfind('.');
+    }
+    std::string_view number = (last_dot == std::string_view::npos) ? view : view.substr(last_dot+1);
+    if(number.empty()) { return false; }
+    /** Optimization opportunity: we have basically identified the last number of the
+        ipv4 if we return true here. We might as well parse it and have at least one
+        number parsed when we get to parse_ipv4. */
+    if(std::all_of(number.begin(), number.end(), ada::checkers::is_digit)) { return true; }
+    return (checkers::has_hex_prefix(number) && std::all_of(number.begin()+2, number.end(), ada::unicode::is_lowercase_hex));
+  }
+
+} // namespace ada::checkers
diff --git a/src/parser.cpp b/src/parser.cpp
@@ -12,6 +12,7 @@
 #include <iostream>
 #include <numeric>
 
+#include <optional>
 #include <string_view>
 #include <unicode/utypes.h>
 #include <unicode/uidna.h>
@@ -26,7 +27,7 @@ namespace ada::parser {
    * to_ascii does not expect the input to be percent decoded. This is
    * mostly used to conform with the test suite.
    */
-  std::optional<std::string> to_ascii(const std::string_view plain, const bool be_strict, size_t first_percent) {
+  bool to_ascii(std::optional<std::string>& out, const std::string_view plain, const bool be_strict, size_t first_percent) {
     std::string percent_decoded_buffer;
     std::string_view input = plain;
     if(first_percent != std::string_view::npos) {
@@ -42,25 +43,25 @@ namespace ada::parser {
 
     UIDNA* uidna = uidna_openUTS46(options, &status);
     if (U_FAILURE(status)) {
-      return std::nullopt;
+      return false;
     }
 
     UIDNAInfo info = UIDNA_INFO_INITIALIZER;
-    std::string result(255, ' ');
+    out = std::string(255, 0);
     int32_t length = uidna_nameToASCII_UTF8(uidna,
                                          input.data(),
                                          int32_t(input.length()),
-                                         result.data(), int32_t(result.capacity()),
+                                         out.value().data(), 255,
                                          &info,
                                          &status);
 
     if (status == U_BUFFER_OVERFLOW_ERROR) {
       status = U_ZERO_ERROR;
-      result.resize(length);
+      out.value().resize(length);
       length = uidna_nameToASCII_UTF8(uidna,
                                      input.data(),
                                      int32_t(input.length()),
-                                     result.data(), int32_t(result.capacity()),
+                                     out.value().data(), length,
                                      &info,
                                      &status);
     }
@@ -84,13 +85,16 @@ namespace ada::parser {
     uidna_close(uidna);
 
     if (U_FAILURE(status) || info.errors != 0 || length == 0) {
-      return std::nullopt;
+      out = std::nullopt;
+      return false;
     }
 
-    result.resize(length);
-    if(std::any_of(result.begin(), result.end(), ada::unicode::is_forbidden_domain_code_point)) { return std::nullopt; }
-
-    return result;
+    out.value().resize(length);
+    if(std::any_of(out.value().begin(), out.value().end(), ada::unicode::is_forbidden_domain_code_point)) {
+      out = std::nullopt;
+      return false;
+    }
+    return true;
   }
 
   /**
@@ -363,11 +367,14 @@ namespace ada::parser {
   /**
    * @see https://url.spec.whatwg.org/#host-parsing
    */
-  bool parse_host(std::optional<std::string>& out, const std::string_view input, bool is_not_special, bool input_is_ascii) {
+  bool parse_host(std::optional<std::string>& out, const std::string_view input, bool is_not_special) {
     //
     // Note: this function assumes that parse_host is not empty. Make sure we can
     // guarantee that.
     //
+#if ADA_DEVELOP_MODE
+    if(input.empty()) { abort(); }
+#endif
     // If input starts with U+005B ([), then:
     if (input[0] == '[') {
       // If input does not end with U+005D (]), validation error, return failure.
@@ -389,44 +396,29 @@ namespace ada::parser {
     // The most common case is an ASCII input, in which case we do not need to call the expensive 'to_ascii'
     // if a few conditions are met: no '%' and no 'xn-' subsequence.
     size_t first_percent = input.find('%');
+    // most input strings will be ASCII which may enable some optimizations.
+    bool is_ascii = !input.empty() && 128>(std::reduce(input.begin(), input.end(), uint8_t(input[0]), std::bit_or<uint8_t>()));
+
     // if simple_case is true, there is a good chance we might be able to use the fast path.
-    bool simple_case = input_is_ascii && (first_percent == std::string_view::npos);
-
-    // This function attemps to convert an ASCII string to a lower-case version.
-    // Once the lower cased version has been materialized, we check for the presence
-    // of the substring 'xn-', if it is found (unlikely), we then call the expensive 'to_ascii'.
-    auto to_lower_ascii_string = [first_percent](std::string_view view) -> std::optional<std::string> {
-      if(std::any_of(view.begin(), view.end(), ada::unicode::is_forbidden_domain_code_point)) { return std::nullopt; }
-      std::string result(view);
-      std::transform(result.begin(), result.end(), result.begin(),[](char c) -> char {
-        return (uint8_t((c|0x20) - 0x61) <= 25 ? (c|0x20) : c);}
-      );
-      return (result.find("xn-") == std::string_view::npos) ? result : to_ascii(view, false, first_percent);
-    };
+    bool simple_case = (is_ascii && (first_percent == std::string_view::npos));
+
+    // This is required since `to_lower_ascii_string` assumes non-null `out` parameter
+    if (simple_case) {
+      out = input;
+    }
+
     // In the simple case, we call to_lower_ascii_string above, or else, we fall back on the expensive case.
-    out = simple_case ? to_lower_ascii_string(input) : to_ascii(input, false, first_percent);
+    bool is_valid = (is_ascii && (first_percent == std::string_view::npos)) ?
+      unicode::to_lower_ascii_string(out, first_percent) :
+      to_ascii(out, input, false, first_percent);
 
     // If asciiDomain is failure, validation error, return failure.
-    if (!out.has_value()) {
+    if (!out.has_value() || !is_valid) {
       return false;
     }
 
     // If asciiDomain ends in a number, then return the result of IPv4 parsing asciiDomain.
-    auto is_ipv4 = [](std::string_view view) {
-      size_t last_dot = view.rfind('.');
-      if(last_dot == view.size() - 1) {
-        view.remove_suffix(1);
-        last_dot = view.rfind('.');
-      }
-      std::string_view number = (last_dot == std::string_view::npos) ? view : view.substr(last_dot+1);
-      if(number.empty()) { return false; }
-      /** Optimization opportunity: we have basically identified the last number of the
-      ipv4 if we return true here. We might as well parse it and have at least one
-      number parsed when we get to parse_ipv4. */
-      if(std::all_of(number.begin(), number.end(), ada::checkers::is_digit)) { return true; }
-      return (checkers::has_hex_prefix(number) && std::all_of(number.begin()+2, number.end(), ::ada::unicode::is_lowercase_hex));
-    };
-    if(is_ipv4(*out)) {
+    if(checkers::is_ipv4(*out)) {
       return parse_ipv4(out, *out);
     }
     return true;
@@ -451,9 +443,6 @@ namespace ada::parser {
     ada::url url = optional_url.has_value() ? std::move(optional_url.value()) : ada::url();
     // From this point forward, optional_url should not be used.
 
-    // most input strings will be ASCII which may enable some optimizations.
-    const bool is_ascii = !user_input.empty() && 128>(std::reduce(user_input.begin(), user_input.end(), uint8_t(user_input[0]), std::bit_or<uint8_t>()));
-
     std::string tmp_buffer;
     std::string_view internal_input;
     if(std::any_of(user_input.begin(), user_input.end(), ada::unicode::is_ascii_tab_or_newline)) {
@@ -487,13 +476,6 @@ namespace ada::parser {
     pointer_start = state_override.has_value() ? internal_input.begin() : url_data.begin();
     pointer_end = state_override.has_value() ? internal_input.end() : url_data.end();
 
-    // most URLs have no @. Having no @ tells us that we don't have to worry about AUTHORITY. Of course,
-    // we could have @ and still not have to worry about AUTHORITY.
-    // TODO: Instead of just collecting a bool, collect the location of the '@' and do something useful with it.
-    // TODO: We could do various processing early on, using a single pass over the string to collect
-    // information about it, e.g., telling us whether there is a @ and if so, where (or how many).
-    const bool contains_ampersand = (std::find(pointer_start, pointer_end, '@') != pointer_end);
-
     // Let pointer be a pointer for input.
     std::string_view::iterator pointer = pointer_start;
 
@@ -648,6 +630,13 @@ namespace ada::parser {
           break;
         }
         case ada::state::AUTHORITY: {
+          // most URLs have no @. Having no @ tells us that we don't have to worry about AUTHORITY. Of course,
+          // we could have @ and still not have to worry about AUTHORITY.
+          // TODO: Instead of just collecting a bool, collect the location of the '@' and do something useful with it.
+          // TODO: We could do various processing early on, using a single pass over the string to collect
+          // information about it, e.g., telling us whether there is a @ and if so, where (or how many).
+          const bool contains_ampersand = (std::find(pointer, pointer_end, '@') != pointer_end);
+
           if(!contains_ampersand) {
             // TODO: This is a waste of time, we should never have arrived here.
             pointer--;
@@ -925,7 +914,7 @@ namespace ada::parser {
             }
 
             // Let host be the result of host parsing buffer with url is not special.
-            url.is_valid = parse_host(url.host, host_view, !url.is_special(), is_ascii);
+            url.is_valid = parse_host(url.host, host_view, !url.is_special());
 
             // Set url’s host to host, buffer to the empty string, and state to port state.
             state = ada::state::PORT;
@@ -949,7 +938,12 @@ namespace ada::parser {
             }
 
             // Let host be the result of host parsing host_view with url is not special.
-            url.is_valid = parse_host(url.host, host_view, !url.is_special(), is_ascii);
+            if (host_view.empty()) {
+              url.is_valid = true;
+              url.host = "";
+            } else {
+              url.is_valid = parse_host(url.host, host_view, !url.is_special());
+            }
 
             // Set url’s host to host, and state to path start state.
             state = ada::state::PATH_START;
@@ -1003,6 +997,12 @@ namespace ada::parser {
             if ((pointer == pointer_end) || ((*pointer != '/') && (*pointer != '\\'))) {
               pointer--;
             }
+
+            // Optimization: Avoiding going into PATH state improves the performance of urls ending with /.
+            if (std::next(pointer) == pointer_end) {
+              url.path = "/";
+              return url;
+            }
           }
           // Otherwise, if state override is not given and c is U+003F (?),
           // set url’s query to the empty string and state to query state.
@@ -1137,7 +1137,7 @@ namespace ada::parser {
           }
           else {
             // Let host be the result of host parsing buffer with url is not special.
-            url.is_valid = parse_host(url.host, file_host_buffer, !url.is_special(), is_ascii);
+            url.is_valid = parse_host(url.host, file_host_buffer, !url.is_special());
 
             // If host is "localhost", then set host to the empty string.
             if (url.host.has_value() && url.host.value() == "localhost") {

diff --git a/src/serializers.cpp b/src/serializers.cpp
@@ -3,13 +3,13 @@
 
 namespace ada::serializers {
 
-  size_t find_longest_sequence_of_ipv6_pieces(const std::array<uint16_t, 8> address) noexcept {
+  size_t find_longest_sequence_of_ipv6_pieces(const std::array<uint16_t, 8>& address) noexcept {
     size_t max_index = -1;
     size_t max_length = 1;
     size_t current_start = -1;
     size_t current_length = 0;
 
-    for (size_t i = 0; i < address.size(); i++) {
+    for (size_t i = 0; i < 8; i++) {
       if (address[i] != 0) {
         if (current_length > max_length) {
           max_index = current_start;
@@ -36,7 +36,7 @@ namespace ada::serializers {
   /**
    * @see https://url.spec.whatwg.org/#concept-ipv6-serializer
    */
-  std::string ipv6(const std::array<uint16_t, 8> address) noexcept {
+  std::string ipv6(const std::array<uint16_t, 8>& address) noexcept {
     // Let output be the empty string.
     std::string output{};
 

diff --git a/src/unicode.cpp b/src/unicode.cpp
@@ -283,4 +283,36 @@ namespace ada::unicode {
       }
   }
 
+  /**
+   * Lower-cases, in place, the ASCII host text held by `out`.
+   *
+   * Once the lower-cased version has been materialized, we look for the
+   * substring 'xn-'; if it is found (unlikely), we fall back on the expensive
+   * ada::parser::to_ascii.
+   *
+   * @param out must hold a value on entry (develop builds abort otherwise).
+   * @param first_percent forwarded unchanged to ada::parser::to_ascii.
+   * @return false, with `out` left unmodified, when the text contains a
+   *         forbidden domain code point; true when lower-casing was enough;
+   *         otherwise whatever ada::parser::to_ascii returns.
+   */
+  ada_really_inline bool to_lower_ascii_string(std::optional<std::string>& out, size_t first_percent) noexcept {
+#if ADA_DEVELOP_MODE
+    if(!out.has_value()) { abort(); }
+#endif
+    std::string& host = out.value();
+    if(std::any_of(host.begin(), host.end(), ada::unicode::is_forbidden_domain_code_point)) { return false; }
+    std::transform(host.begin(), host.end(), host.begin(), [](char c) -> char {
+      return (uint8_t((c|0x20) - 0x61) <= 25 ? (c|0x20) : c);}
+    );
+    if (host.find("xn-") == std::string_view::npos) {
+      return true;
+    }
+
+    // to_ascii reassigns `out` before it reads its input, so the input must not
+    // alias the string stored in `out`; hand it a private copy instead.
+    const std::string lowered(host);
+    return ada::parser::to_ascii(out, lowered, false, first_percent);
+  }
+
 } // namespace ada::unicode
diff --git a/tests/wpt_tests.cpp b/tests/wpt_tests.cpp
@@ -203,14 +203,15 @@ bool toascii_encoding() {
     } else if (element.type() == ondemand::json_type::object) {
       ondemand::object object = element.get_object();
       std::string_view input = object["input"];
-      auto output = ada::parser::to_ascii(input, false, input.find("%")).value_or("");
+      std::optional<std::string> output;
+      ada::parser::to_ascii(output, input, false, input.find('%'));
       auto expected_output = object["output"];
 
       if (expected_output.type() == ondemand::json_type::string) {
         std::string_view stringified_output = expected_output.get_string();
-        TEST_ASSERT(output, stringified_output, "Should have been equal");
+        TEST_ASSERT(output.value_or(""), stringified_output, "Should have been equal");
       } else if (expected_output.is_null()) {
-        TEST_ASSERT(output, "", "Should have been empty");
+        TEST_ASSERT(output.value_or(""), "", "Should have been empty");
       }
     }
   }