From 94eae70205ed8c1c52f5138ab959f1afc6d3a37a Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 1 Mar 2024 12:04:14 -0500 Subject: [PATCH 1/3] Replace most uses of urllib with yarl --- dandi/dandiapi.py | 11 ++++++----- dandi/utils.py | 32 ++++++++++++++++++-------------- setup.cfg | 1 + 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py index 95960825e..a7169c3af 100644 --- a/dandi/dandiapi.py +++ b/dandi/dandiapi.py @@ -14,13 +14,13 @@ from time import sleep, time from types import TracebackType from typing import TYPE_CHECKING, Any, Dict, List, Optional -from urllib.parse import quote_plus, urlparse, urlunparse import click from dandischema import models from pydantic import BaseModel, Field, PrivateAttr import requests import tenacity +from yarl import URL from . import get_logger from .consts import ( @@ -1521,7 +1521,7 @@ def get_content_url( else: raise # reraise since we need to figure out how to handle such a case if strip_query: - url = urlunparse(urlparse(url)._replace(query="")) + url = str(URL(url).with_query(None)) return url def get_download_file_iter( @@ -1970,9 +1970,10 @@ def get_download_file_iter( Returns a function that when called (optionally with an offset into the file to start downloading at) returns a generator of chunks of the file """ - prefix = quote_plus(str(self)) - url = self.client.get_url( - f"/zarr/{self.zarr_id}/files?prefix={prefix}&download=true" + url = str( + URL(self.client.get_url(f"/zarr/{self.zarr_id}/files/")).with_query( + {"prefix": str(self), "download": "true"} + ) ) def downloader(start_at: int = 0) -> Iterator[bytes]: diff --git a/dandi/utils.py b/dandi/utils.py index ec06d8ce7..b7f9ed5b5 100644 --- a/dandi/utils.py +++ b/dandi/utils.py @@ -23,13 +23,14 @@ import traceback import types from typing import IO, Any, List, Optional, Protocol, TypeVar, Union -from urllib.parse import parse_qs, urlparse, urlunparse import dateutil.parser +from multidict import MultiDict # dependency of yarl from pydantic import BaseModel, Field import requests import ruamel.yaml from semantic_version import Version +from yarl import URL from . import __version__, get_logger from .consts import DandiInstance, known_instances, known_instances_rev @@ -567,8 +568,9 @@ def get_instance(dandi_instance_id: str | DandiInstance) -> DandiInstance: else: instance = None is_api = False - bits = urlparse(redirector_url) - redirector_url = urlunparse((bits[0], bits[1], "", "", "", "")) + redirector_url = str( + URL(redirector_url).with_path("").with_query(None).with_fragment(None) + ) else: dandi_id = dandi_instance_id instance = known_instances[dandi_id] @@ -619,13 +621,13 @@ def _get_instance( if dandi_id is None: # Don't use pydantic.AnyHttpUrl, as that sets the `port` attribute even # if it's not present in the string. - bits = urlparse(api_url) - if bits.hostname is not None: - dandi_id = bits.hostname - if bits.port is not None: + u = URL(api_url) + if u.host is not None: + dandi_id = u.host + if (port := u.explicit_port) is not None: if ":" in dandi_id: dandi_id = f"[{dandi_id}]" - dandi_id += f":{bits.port}" + dandi_id += f":{port}" else: dandi_id = api_url return DandiInstance( @@ -790,12 +792,14 @@ def is_page2_url(page1: str, page2: str) -> bool: Tests whether the URL ``page2`` is the same as ``page1`` but with the ``page`` query parameter set to ``2`` """ - bits1 = urlparse(page1) - params1 = parse_qs(bits1.query) - params1["page"] = ["2"] - bits2 = urlparse(page2) - params2 = parse_qs(bits2.query) - return (bits1[:3], params1, bits1.fragment) == (bits2[:3], params2, bits2.fragment) + url1 = URL(page1) + params1 = MultiDict(url1.query) + params1["page"] = "2" + url1 = url1.with_query(None) + url2 = URL(page2) + params2 = url2.query + url2 = url2.with_query(None) + return (url1, sorted(params1.items())) == (url2, sorted(params2.items())) def exclude_from_zarr(path: Path) -> bool: diff --git a/setup.cfg b/setup.cfg index f6ca8b1c8..0e0ebcd8b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -55,6 +55,7 @@ install_requires = ruamel.yaml >=0.15, <1 semantic-version tenacity + yarl ~= 1.9 zarr ~= 2.10 zarr_checksum ~= 0.4.0 zip_safe = False From 1610bf95011cf71c8ba71127ec04b6bc2c5ce775 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 1 Mar 2024 12:14:11 -0500 Subject: [PATCH 2/3] Use yarl in `is_same_url()` --- dandi/delete.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/dandi/delete.py b/dandi/delete.py index cae325968..214fde211 100644 --- a/dandi/delete.py +++ b/dandi/delete.py @@ -6,6 +6,7 @@ from pathlib import Path import click +from yarl import URL from .consts import DRAFT, ZARR_EXTENSIONS, DandiInstance, dandiset_metadata_file from .dandiapi import DandiAPIClient, RemoteAsset, RemoteDandiset @@ -246,5 +247,8 @@ def find_local_asset(filepath: str) -> tuple[str, str]: def is_same_url(url1: str, url2: str) -> bool: - # TODO: Use a real URL library like furl, purl, or yarl - return url1.rstrip("/") == url2.rstrip("/") + u1 = URL(url1) + u1 = u1.with_path(u1.path.rstrip("/")) + u2 = URL(url2) + u2 = u2.with_path(u2.path.rstrip("/")) + return u1 == u2 From e58df6254f9767c355b69eb9d13e7ecd9e52b322 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Tue, 5 Mar 2024 07:42:07 -0500 Subject: [PATCH 3/3] Add tests for `is_same_url()` --- dandi/tests/test_delete.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/dandi/tests/test_delete.py b/dandi/tests/test_delete.py index 310b1205e..2b33367ad 100644 --- a/dandi/tests/test_delete.py +++ b/dandi/tests/test_delete.py @@ -8,7 +8,7 @@ from .fixtures import DandiAPI, SampleDandiset from ..consts import DRAFT, dandiset_metadata_file from ..dandiapi import RESTFullAPIClient -from ..delete import delete +from ..delete import delete, is_same_url from ..download import download from ..exceptions import NotFoundError from ..utils import list_paths @@ -439,3 +439,16 @@ def test_delete_zarr_path( assert list_paths(tmp_path) == [ tmp_path / zarr_dandiset.dandiset_id / "dandiset.yaml" ] + + +@pytest.mark.parametrize( + "url1,url2,r", + [ + ("https://example.com/api", "https://example.com/api/", True), + ("https://example.com/api", "http://example.com/api", False), + ("https://example.com/api", "HTTPS://EXAMPLE.COM/api", True), + ("https://example.珠宝/api", "https://example.xn--pbt977c/api", True), + ], +) +def test_is_same_url(url1: str, url2: str, r: bool) -> None: + assert is_same_url(url1, url2) is r