feat: add improved protocol detection and handle messy strings in makeURL
TheNaubit · Mar 6, 2024 · 4808c3b · 4808c3b
1 parent 1c663c8
commit 4808c3b
Show file tree

Hide file tree

Showing 5 changed files with 525 additions and 51 deletions.
diff --git a/README.md b/README.md
@@ -83,6 +83,9 @@ Feel free to test and explore and if later on you need more guidance, read the w
     <tr>
       <td>👌 1kB <a href="https://bundlephobia.com/package/@nauverse/make-url@latest">minified and gzipped</a></td>
     </tr>
+    <tr>
+      <td>🔒 Reliable. Even when you pass really messed up strings, it finds a way to build a valid URL (just check the tests for some examples)</td>
+    </tr>
     <tr>
       <td>✍️ TypeScript types provided</td>
     </tr>

diff --git a/src/lib/helpers.ts b/src/lib/helpers.ts
@@ -0,0 +1,381 @@
+import type {
+  IConfig,
+  IDomainData,
+  IProtocolData,
+  IURLArrayData
+} from "../types";
+
+/**
+ * Reports whether a string parses as a URL whose hostname looks like a real domain (i.e. it contains at least one dot).
+ *
+ * @param str - The string to detect the domain from. Important: To detect a valid domain, the URL must contain a non-relative protocol (http:// or https://).
+ * @returns `{ hasDomain: true, domain }` where `domain` is the parsed `URL#hostname`, or `{ hasDomain: false, domain: "" }` when parsing throws or the hostname has no dot.
+ * @example
+ * detectDomainInString("https://example.com/blog");
+ * // => { hasDomain: true, domain: "example.com" }
+ * detectDomainInString("http://example/blog");
+ * // => { hasDomain: false, domain: "" }
+ */
+export function detectDomainInString(str: string): IDomainData {
+  // We rely on "new URL(...)" to do the parsing
+  // If it throws, the string is not a valid URL, so we treat it as having no domain
+  try {
+    const url = new URL(str);
+
+    // A successful parse does not guarantee a valid domain. For example: "https://example/blog" does not throw
+
+    // So we also require the hostname to contain at least one dot, which gives us greater confidence that it is a domain
+    if (url.hostname.split(".").length < 2) {
+      throw null; // Thrown only to jump to the catch block below, which builds the "no domain" result
+    }
+
+    return {
+      hasDomain: true,
+      domain: url.hostname
+    };
+  } catch {
+    return {
+      hasDomain: false,
+      domain: ""
+    };
+  }
+}
+
+/**
+ * Detects the protocol at the start of a given string (the checks run on a trimmed, lowercased copy of it).
+ * @param str - The string to detect the protocol from.
+ * @returns An object where `protocol` is "http", "https", "relative" (the string starts with "//") or "none"; for "relative", `hasProtocol` is true only when a domain is also detected.
+ * @example
+ * detectProtocolInString("https://example.com/blog");
+ * // => { hasProtocol: true, protocol: "https" }
+ * detectProtocolInString("example.com/blog");
+ * // => { hasProtocol: false, protocol: "none" }
+ * detectProtocolInString("//example.com/blog");
+ * // => { hasProtocol: true, protocol: "relative" }
+ * detectProtocolInString("//example/blog");
+ * // => { hasProtocol: false, protocol: "relative" }
+ */
+export function detectProtocolInString(str: string): IProtocolData {
+  const cleanedStr = str.trim().toLowerCase();
+
+  if (cleanedStr.startsWith("//")) {
+    const urlToCheck = `https:${cleanedStr}`; // The domain detector only works with URLs that have a non-relative protocol, so we prepend a fake one
+    const detectDomainData = detectDomainInString(urlToCheck);
+    return {
+      hasProtocol: detectDomainData.hasDomain, // A leading "//" alone is not enough: without a valid domain after it, we do not count it as a protocol
+      protocol: "relative" // Reported as "relative" whether or not a domain was found
+    };
+  }
+
+  // Regex that checks if the string starts with "http://" or "https://"
+  const hasProtocol = /^(http|https):\/\//.test(cleanedStr);
+  return {
+    hasProtocol,
+    protocol: hasProtocol
+      ? (cleanedStr.split("://")[0] as "http" | "https")
+      : "none"
+  };
+}
+
+/**
+ * Generates a temporary URL string by merging an array of fragments.
+ * This generated URL is not safe and should only be used for detecting the domain.
+ * Important: This should only be used when we need to detect stuff in the URL like in the `detectDomainInString` function. It should NEVER be used to generate a valid URL. For that you should use the `safeStringArrayToURLString` function.
+ *
+ * @param array - An array of string fragments to be merged.
+ * @param hasProtocol - Whether the first fragment of the array is the protocol; when true, that fragment is dropped and is NOT part of the returned string.
+ * @returns The merged string, without the protocol fragment.
+ * @example
+ * getUnsafeMergedURLString(["https://", "example.com", "blog"], true);
+ * // => "example.com/blog"
+ */
+function getUnsafeMergedURLString(
+  array: Array<string>,
+  hasProtocol: boolean
+): string {
+  // We need to generate a temporary URL to detect the domain
+  // This generated URL is not really safe, meaning it could contain
+  // unescaped characters, wrong protocols, etc. But we don't care about that, since we only need it to detect the domain
+  // Just don't use it for anything else
+  return (
+    array
+      // If the array has a protocol, we skip the first fragment
+      .slice(hasProtocol ? 1 : 0)
+      // We filter out empty (or whitespace-only) fragments; this is only for detecting the domain, so we don't care about them
+      .filter(v => v.trim() !== "")
+      // We could join using the `.join` method, but we need more control
+      // over how to join, so we use the `.map` method
+      .map((v, index) => {
+        // The first item, and any item that starts with a dot (potentially a continuation of the domain, like ".com"), gets no "/" prefix
+        if (index === 0 || v.startsWith(".")) return v;
+        // Anything else is prefixed with "/"
+        else return `/${v}`;
+      })
+      // The separators were already added to each fragment above, so we just concatenate the array without any separator
+      .join("")
+  );
+}
+
+/**
+ * Rearranges the fragments so that the protocol (the first `protocolIndex` characters of the concatenated fragments) becomes a single fragment. It does not merge or reorder anything after the protocol.
+ *
+ * @param array - The array of fragments.
+ * @param protocolIndex - The character offset, counted over the concatenated fragments, at which the protocol ends.
+ * @returns A new array of fragments in which the protocol is a single fragment, followed by the remaining fragments.
+ * @example
+ * extractProtocolFromArray(["ht", "tps:", "//example.com", "blog"], 8);
+ * // => ["https://", "example.com", "blog"]
+ */
+function extractProtocolFromArray(
+  array: Array<string>,
+  protocolIndex: number
+): Array<string> {
+  let currentLength = 0;
+  let protocolFragment = "";
+  const returnedFragments: Array<string> = [];
+
+  array.forEach(fragment => {
+    // An empty fragment is kept as-is once at least one fragment has already been emitted
+    // Before that it is not pushed: it falls through to the checks below, where appending it changes nothing,
+    // so leading empty fragments can not break the logic that assembles the protocol
+    if (fragment === "" && returnedFragments.length > 0) {
+      returnedFragments.push(fragment);
+      return;
+    }
+    // If the whole fragment ends before (or exactly at) the protocol index, it belongs entirely to the protocol
+    if (currentLength + fragment.length <= protocolIndex) {
+      protocolFragment += fragment;
+      currentLength += fragment.length;
+      // Otherwise, if the protocol ends in the middle of this fragment, we split it: the head completes the protocol and the tail is emitted as its own fragment
+    } else if (currentLength < protocolIndex) {
+      const sliceIndex = protocolIndex - currentLength;
+      protocolFragment += fragment.slice(0, sliceIndex);
+
+      if (sliceIndex < fragment.length) {
+        returnedFragments.push(protocolFragment);
+        protocolFragment = "";
+        returnedFragments.push(fragment.slice(sliceIndex));
+      }
+
+      currentLength += sliceIndex;
+      // Otherwise the protocol is already complete: we flush it (if still pending) and pass the fragment through unchanged
+    } else {
+      if (protocolFragment !== "") {
+        returnedFragments.push(protocolFragment);
+        protocolFragment = "";
+      }
+      returnedFragments.push(fragment);
+    }
+  });
+
+  if (protocolFragment !== "") {
+    returnedFragments.push(protocolFragment);
+    protocolFragment = "";
+  }
+
+  return returnedFragments;
+}
+
+/**
+ * Rearranges an array of URL fragments so that the detected domain becomes a single fragment (placed right after the protocol fragment, if there is one).
+ * Important: The array is expected to be the output of the extractProtocolFromArray function (when a protocol was detected) or the plain fragment list, which is how the safeStringArrayAssembler function calls it.
+ *
+ * @param array - The array of URL fragments.
+ * @param hasProtocolExtracted - Whether the first fragment of `array` is an already-extracted protocol; if so, it is skipped during detection and added back at the front of the result.
+ * @returns An array of URL fragments with the domain merged into one fragment; the input array is returned as is when no domain is detected.
+ * @example
+ * extractDomainFromArray(["https://", "example", "", ".com", "blog"], true);
+ * // => ["https://", "example.com", "blog"]
+ * extractDomainFromArray(["example", "", ".com", "blog"], false);
+ * // => ["example.com", "blog"]
+ */
+function extractDomainFromArray(
+  array: Array<string>,
+  hasProtocolExtracted: boolean
+): Array<string> {
+  // We need to generate a temporary URL to detect the domain
+  // This generated URL is not really safe, meaning it could contain
+  // unescaped characters, wrong protocols, etc. But we don't care about that, since we only need it to detect the domain
+  // Just don't use it for anything else
+  const tempURL = getUnsafeMergedURLString(array, hasProtocolExtracted);
+
+  // The resulting URL has no protocol (it is dropped if it was there, which also avoids relative protocols, incompatible with the `detectDomainInString` function)
+  // But the `detectDomainInString` function needs an input URL with a protocol, so we add a fake one
+  const domainData = detectDomainInString(`https://${tempURL}`);
+
+  // If the URL does not contain a domain, there is nothing to "extract"/"sort", so we return the array as is
+  if (!domainData.hasDomain) return array;
+
+  // If it contains a domain, we compute the offset in tempURL right after its last character. NOTE(review): the domain comes from URL#hostname, which may not match the input text exactly (e.g. letter casing); indexOf would then return -1 and this offset would be wrong - TODO confirm
+  const domainIndex =
+    tempURL.indexOf(domainData.domain) + domainData.domain.length;
+
+  // We need to create an array with the domain fragments but without the protocol (if it has one)
+  const safeArray = array.slice(hasProtocolExtracted ? 1 : 0);
+
+  let domainFragment = "";
+  let currentLength = 0;
+
+  // This will be the array of fragments we will return
+  let returnedFragments: Array<string> = [];
+
+  safeArray.forEach(fragment => {
+    // Empty fragments found before the domain is complete are dropped; those found after it are kept
+    if (fragment === "") {
+      // We need to check first if the current length is greater than or equal to the domain index, because that means we already found the full domain in the array
+      if (currentLength >= domainIndex) {
+        // If the returnedFragments has some item, that means we already found the domain and we already saved it, so we can just push the empty fragment into the array
+        if (returnedFragments.length > 0) {
+          returnedFragments.push(fragment);
+        } else {
+          // If the returnedFragments is empty, that means we haven't saved the domain yet into the array
+          // But since we already found it, we can just push it to the array
+          returnedFragments.push(domainFragment);
+          domainFragment = "";
+          // And then we can push the empty fragment into the array
+          returnedFragments.push(fragment);
+        }
+      }
+      return;
+    }
+
+    // If the whole fragment ends before (or exactly at) the domain index, it belongs entirely to the domain
+    if (currentLength + fragment.length <= domainIndex) {
+      domainFragment += fragment;
+      currentLength += fragment.length;
+    } else if (currentLength < domainIndex) {
+      const sliceIndex = domainIndex - currentLength;
+      domainFragment += fragment.slice(0, sliceIndex);
+
+      // If the sliceIndex is less than the length of the fragment, we push the domain fragment into the array and then we push the rest of the fragment
+      if (sliceIndex < fragment.length) {
+        returnedFragments.push(domainFragment);
+        domainFragment = "";
+        returnedFragments.push(fragment.slice(sliceIndex));
+      }
+
+      currentLength += sliceIndex;
+      // Otherwise the domain is already complete, so the fragment is passed through unchanged
+    } else {
+      // If the domain fragment is not empty, we push it to the array
+      if (domainFragment !== "") {
+        returnedFragments.push(domainFragment);
+        domainFragment = "";
+      }
+
+      // Then we push the fragment to the array
+      returnedFragments.push(fragment);
+    }
+  });
+
+  // If the domain fragment is not empty, we push it to the array
+  if (domainFragment !== "") {
+    returnedFragments.push(domainFragment);
+    domainFragment = "";
+  }
+
+  // If the array had a protocol we have to add it back at the beginning
+  // of the array before returning it
+  if (hasProtocolExtracted) {
+    returnedFragments = [array[0], ...returnedFragments];
+  }
+
+  return returnedFragments;
+}
+
+/**
+ * Assembles a safe string array: optionally filters out empty strings, detects the protocol and regroups the fragments so the protocol and the domain are each a single fragment.
+ * @param fragments - An array of strings representing URL fragments.
+ * @param config - The configuration object; only `allowEmptyPathSegments` is read here (the examples below assume it is false).
+ * @returns An object containing the assembled URL fragments, information about the presence of a protocol, and the detected protocol.
+ * @example
+ * safeStringArrayAssembler(["https://", "example.com", "blog"], config);
+ * // => { array: ["https://", "example.com", "blog"], hasProtocol: true, protocol: "https" }
+ * safeStringArrayAssembler(["example.com", "blog"], config);
+ * // => { array: ["example.com", "blog"], hasProtocol: false, protocol: "none" }
+ * safeStringArrayAssembler(["//", "example.com", "blog"], config);
+ * // => { array: ["//", "example.com", "blog"], hasProtocol: true, protocol: "relative" }
+ * safeStringArrayAssembler(["https://example.com", "blog", ""], config);
+ * // => { array: ["https://", "example.com", "blog"], hasProtocol: true, protocol: "https" }
+ */
+export function safeStringArrayAssembler(
+  fragments: Array<string>,
+  config: IConfig
+): IURLArrayData {
+  // We start from a shallow copy of the input fragments
+  let filteredFragments = [...fragments];
+
+  // If "allowEmptyPathSegments" is false, we can clean empty path segments
+  // but if it is true, we can not because those empty fragments will be joined later with slashes, so technically the final URL wouldn't be what we expect.
+  // For example, if we have ["example.com", "", "blog"], the final URL should be "example.com//blog" instead of "example.com/blog" if "allowEmptyPathSegments" is true
+  if (!config.allowEmptyPathSegments) {
+    filteredFragments = filteredFragments.filter(
+      fragment => fragment.trim() !== ""
+    );
+  }
+
+  // Now we need to detect if it contains a protocol
+  const potentiallyWrongJoinedURL = filteredFragments.join(""); // We don't use the getUnsafeMergedURLString function here because it requires the protocol to have been detected already
+  const { hasProtocol, protocol } = detectProtocolInString(
+    potentiallyWrongJoinedURL
+  );
+
+  // We need to join all the fragments until the end of the protocol (if it has one)
+  let returnedFragments: Array<string> = filteredFragments;
+
+  if (hasProtocol) {
+    // If it has a protocol, we need to find the index of the end of the protocol
+    const protocolIndex =
+      potentiallyWrongJoinedURL.indexOf(
+        protocol === "relative" ? "//" : "://"
+      ) + (protocol === "relative" ? 2 : 3);
+
+    // And then sort and merge items in the array so the first item is the whole protocol
+    returnedFragments = extractProtocolFromArray(
+      filteredFragments,
+      protocolIndex
+    );
+
+    // We now know that the first fragment is the protocol (if there are any fragments)
+    // We need to check if there is a second fragment and if there is one, we need to remove any leading slashes it might contain
+    // That way we can be sure the final URL won't contain things like "https:///example.com/blog"
+    if (returnedFragments.length > 1) {
+      returnedFragments[1] = returnedFragments[1].replace(/^\/*/, "");
+    }
+  }
+
+  // Whether there is a protocol or not, we need to check if there is a domain, and if there is one, we need to sort and merge items in the array so the domain is a single fragment in the first position (if there is no protocol) or in the second position (if there is a protocol)
+  returnedFragments = extractDomainFromArray(returnedFragments, hasProtocol);
+
+  return {
+    array: returnedFragments,
+    hasProtocol,
+    protocol
+  };
+}
+
+/**
+ * Converts a safe string array to a URL string.
+ * IMPORTANT: This function assumes the array is safe, meaning it has been processed by the safeStringArrayAssembler function.
+ * The generated string is a potentially valid URL but is not guaranteed to be valid.
+ * @param urlArrayData - The data containing the array and whether it has a protocol (the `protocol` field itself is not read here).
+ * @returns The URL string: the fragments joined with "/", except that the protocol fragment (if any) is prepended without a separator.
+ * @example
+ * safeStringArrayToURLString({ array: ["https://", "example.com", "blog"], hasProtocol: true, protocol: "https" });
+ * // => "https://example.com/blog"
+ * safeStringArrayToURLString({ array: ["example.com", "blog"], hasProtocol: false, protocol: "none" });
+ * // => "example.com/blog"
+ */
+export function safeStringArrayToURLString(
+  urlArrayData: IURLArrayData
+): string {
+  const { array, hasProtocol } = urlArrayData;
+
+  if (hasProtocol) {
+    // If it has a protocol, the first fragment is the protocol: it is prepended as is, and only the remaining fragments are joined with "/"
+    const slicedArray = array.length > 1 ? array.slice(1) : [];
+    return `${array.length > 0 ? array[0] : ""}${slicedArray.join("/")}`;
+  }
+
+  return array.join("/");
+}