diff --git a/.gitignore b/.gitignore index e5de7a7..7421867 100644 --- a/.gitignore +++ b/.gitignore @@ -504,7 +504,9 @@ tmp # rule files are generated by rules/generate_rules.py src/warc2zim/rules.py +tests/test_fuzzy_rules.py javascript/src/fuzzyRules.js +javascript/test/fuzzyRules.js # wombatSetup.js is generated with rollup src/warc2zim/statics/wombatSetup.js diff --git a/CHANGELOG.md b/CHANGELOG.md index fdb6df0..9fb15f9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,12 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Generate fuzzy rules tests in Python and Javascript (#284) + ### Fixed - Handle case where the redirect target is bad / unsupported (#332 and #356) - Fixed WARC files handling order to follow creation order (#366) - Remove subsequent slashes in URLs, both in Python and JS (#365) - Ignore non HTTP(S) WARC records (#351) +- Fix `vimeo_cdn_fix` fuzzy rule for proper operation in Javascript (#348) ## [2.0.3] - 2024-07-24 diff --git a/javascript/.prettierignore b/javascript/.prettierignore index bc94d38..d07ae34 100644 --- a/javascript/.prettierignore +++ b/javascript/.prettierignore @@ -1 +1,2 @@ src/fuzzyRules.js +test/fuzzyRules.js diff --git a/javascript/test/fuzzyRules.js b/javascript/test/fuzzyRules.js deleted file mode 100644 index f958328..0000000 --- a/javascript/test/fuzzyRules.js +++ /dev/null @@ -1,35 +0,0 @@ -import test from 'ava'; - -import { applyFuzzyRules } from '../src/wombatSetup.js'; - -test('i.ytimg.com_1', (t) => { - t.is( - applyFuzzyRules( - 'i.ytimg.com/vi/-KpLmsAR23I/maxresdefault.jpg?sqp=-oaymwEmCIAKENAF8quKqQMa8AEB-AH-CYAC0AWKAgwIABABGHIgTyg-MA8=&rs=AOn4CLDr-FmDmP3aCsD84l48ygBmkwHg-g', - ), - 'i.ytimg.com.fuzzy.replayweb.page/vi/-KpLmsAR23I/thumbnail.jpg', - ); -}); - -test('i.ytimg.com_2', (t) => { - t.is( - applyFuzzyRules( - 
'i.ytimg.com/vi/-KpLmsAR23I/maxresdefault.png?sqp=-oaymwEmCIAKENAF8quKqQMa8AEB-AH-CYAC0AWKAgwIABABGHIgTyg-MA8=&rs=AOn4CLDr-FmDmP3aCsD84l48ygBmkwHg-g', - ), - 'i.ytimg.com.fuzzy.replayweb.page/vi/-KpLmsAR23I/thumbnail.png', - ); -}); - -test('i.ytimg.com_3', (t) => { - t.is( - applyFuzzyRules('i.ytimg.com/vi/-KpLmsAR23I/maxresdefault.jpg'), - 'i.ytimg.com.fuzzy.replayweb.page/vi/-KpLmsAR23I/thumbnail.jpg', - ); -}); - -test('i.ytimg.com_4', (t) => { - t.is( - applyFuzzyRules('i.ytimg.com/vi/-KpLmsAR23I/max-res.default.jpg'), - 'i.ytimg.com.fuzzy.replayweb.page/vi/-KpLmsAR23I/thumbnail.jpg', - ); -}); diff --git a/rules/generate_rules.py b/rules/generate_rules.py index 9df34c0..6e0d9e7 100644 --- a/rules/generate_rules.py +++ b/rules/generate_rules.py @@ -13,12 +13,21 @@ FUZZY_RULES = yaml.safe_load(rules_src.read_text())["fuzzyRules"] +for rule in FUZZY_RULES: + if "name" not in rule: + raise SystemExit("Fuzzy rule is missing a name") + if "tests" not in rule or len(rule["tests"]) == 0: + raise SystemExit("Fuzzy rule is missing test cases") + + PY2JS_RULE_RX = re.compile(r"\\(\d)", re.ASCII) # Do not escape anything, we want to generate code as-is, it won't be interpreted as # HTML anyway JINJA_ENV = Environment(autoescape=False) # noqa: S701 +### Generate Javascript code + js_code_template = """// THIS IS AN AUTOMATICALLY GENERATED FILE, DO NOT MODIFY DIRECTLY export const fuzzyRules = [ @@ -49,6 +58,50 @@ ) print("JS rules generation completed successfully") +### Generate Javascript tests + +js_test_template = """// THIS IS AN AUTOMATICALLY GENERATED FILE, DO NOT MODIFY DIRECTLY + +import test from 'ava'; + +import { applyFuzzyRules } from '../src/wombatSetup.js'; + +{% for rule in FUZZY_RULES %} +{% for test in rule['tests'] %} +test('fuzzyrules_{{rule['name']}}_{{loop.index}}', (t) => { + t.is( + applyFuzzyRules( + '{{test['raw_url']}}', + ), + '{{test['raw_url'] if test['unchanged'] else test['fuzzified_url']}}', + ); +}); +{% endfor %} +{% endfor %} +""" + 
+js_parent = Path(__file__).joinpath("../../javascript/test").resolve() +if not js_parent.exists(): + # This skip is useful mostly for CI operations when working on the Python part + print("Skipping JS tests generation, target folder is missing") +else: + (js_parent / "fuzzyRules.js").write_text( + JINJA_ENV.from_string(js_test_template).render( + FUZZY_RULES=[ + { + "name": rule["name"], + "tests": rule["tests"], + "match": rule["pattern"].replace("\\", "\\\\"), + "replace": PY2JS_RULE_RX.sub(r"$\1", rule["replace"]), + } + for rule in FUZZY_RULES + ] + ) + ) + print("JS tests generation completed successfully") + +### Generate Python code + py_code_template = """# THIS IS AN AUTOMATICALLY GENERATED FILE, DO NOT MODIFY DIRECTLY FUZZY_RULES = [ @@ -69,3 +122,53 @@ JINJA_ENV.from_string(py_code_template).render(FUZZY_RULES=FUZZY_RULES) ) print("Python rules generation completed successfully") + +### Generate Python tests + +py_test_template = """# THIS IS AN AUTOMATICALLY GENERATED FILE, DO NOT MODIFY DIRECTLY + +import pytest + +from warc2zim.url_rewriting import apply_fuzzy_rules + +from .utils import ContentForTests + +{% for rule in FUZZY_RULES %} +@pytest.fixture( + params=[ +{% for test in rule['tests'] %} +{% if test['unchanged'] %} + ContentForTests( + "{{ test['raw_url'] }}", + ), +{% else %} + ContentForTests( + "{{ test['raw_url'] }}", + "{{ test['fuzzified_url'] }}", + ), +{% endif %} +{% endfor %} + ] +) +def {{ rule['name'] }}_case(request): + yield request.param + + +def test_fuzzyrules_{{ rule['name'] }}({{ rule['name'] }}_case): + assert ( + apply_fuzzy_rules({{ rule['name'] }}_case.input_str) + == {{ rule['name'] }}_case.expected_str + ) +{% endfor %} + +""" + +py_parent = Path(__file__).joinpath("../../tests").resolve() +if not py_parent.exists(): + # This skip is useful mostly for CI operations when working on the JS part + print("Skipping Python tests generation, target folder is missing") +else: + (py_parent / 
"test_fuzzy_rules.py").absolute().write_text( + JINJA_ENV.from_string(py_test_template).render(FUZZY_RULES=FUZZY_RULES) + ) + print("Python tests generation completed successfully") diff --git a/rules/rules.yaml b/rules/rules.yaml index 06150a5..6e07777 100644 --- a/rules/rules.yaml +++ b/rules/rules.yaml @@ -16,24 +16,170 @@ # Generic rules are also ommitted on purpose, we don't need them # fuzzyRules: - - pattern: .*googlevideo.com/(videoplayback(?=\?)).*[?&](id=[^&]+).* + - name: googlevideo_com + pattern: .*googlevideo.com/(videoplayback(?=\?)).*[?&](id=[^&]+).* replace: youtube.fuzzy.replayweb.page/\1?\2 - - pattern: (?:www\.)?youtube(?:-nocookie)?\.com/(get_video_info\?).*(video_id=[^&]+).* + tests: + - raw_url: foobargooglevideo.com/videoplayback?id=1576&key=value + fuzzified_url: youtube.fuzzy.replayweb.page/videoplayback?id=1576 + - raw_url: foobargooglevideo.com/videoplayback?some=thing&id=1576 + fuzzified_url: youtube.fuzzy.replayweb.page/videoplayback?id=1576 + - raw_url: foobargooglevideo.com/videoplayback?some=thing&id=1576&key=value + fuzzified_url: youtube.fuzzy.replayweb.page/videoplayback?id=1576 + - raw_url: foobargooglevideo.com/videoplaybackandfoo?some=thing&id=1576&key=value + unchanged: true # videoplayback is not followed by `?` + - raw_url: foobargoogle_video.com/videoplaybackandfoo?some=thing&id=1576&key=value + unchanged: true # No googlevideo.com in url + - name: youtube_video_info + pattern: (?:www\.)?youtube(?:-nocookie)?\.com/(get_video_info\?).*(video_id=[^&]+).* replace : youtube.fuzzy.replayweb.page/\1\2 - - pattern: i\.ytimg\.com\/vi\/(.*?)\/.*?\.(\w*?)(?:\?.*|$) + tests: + - raw_url: www.youtube.com/get_video_info?video_id=123ah + fuzzified_url: youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah + - raw_url: www.youtube.com/get_video_info?foo=bar&video_id=123ah + fuzzified_url: youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah + - raw_url: www.youtube.com/get_video_info?video_id=123ah&foo=bar + fuzzified_url: 
youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah + - raw_url: youtube.com/get_video_info?video_id=123ah + fuzzified_url: youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah + - raw_url: youtube-nocookie.com/get_video_info?video_id=123ah + fuzzified_url: youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah + - raw_url: www.youtube-nocookie.com/get_video_info?video_id=123ah + fuzzified_url: youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah + - raw_url: www.youtube-nocookie.com/get_video_info?foo=bar + unchanged: true # no video_id parameter + - raw_url: www.youtubeqnocookie.com/get_video_info?video_id=123ah + unchanged: true # improper hostname + - name: youtube_thumbnails + pattern: i\.ytimg\.com\/vi\/(.*?)\/.*?\.(\w*?)(?:\?.*|$) replace : i.ytimg.com.fuzzy.replayweb.page/vi/\1/thumbnail.\2 - - pattern: ([^?]+)\?[\d]+$ + tests: + - raw_url: i.ytimg.com/vi/-KpLmsAR23I/maxresdefault.jpg?sqp=-oaymwEmCIAKENAF8quKqQMa8AEB-AH-CYAC0AWKAgwIABABGHIgTyg-MA8=&rs=AOn4CLDr-FmDmP3aCsD84l48ygBmkwHg-g + fuzzified_url: i.ytimg.com.fuzzy.replayweb.page/vi/-KpLmsAR23I/thumbnail.jpg + - raw_url: i.ytimg.com/vi/-KpLmsAR23I/maxresdefault.png?sqp=-oaymwEmCIAKENAF8quKqQMa8AEB-AH-CYAC0AWKAgwIABABGHIgTyg-MA8=&rs=AOn4CLDr-FmDmP3aCsD84l48ygBmkwHg-g + fuzzified_url: i.ytimg.com.fuzzy.replayweb.page/vi/-KpLmsAR23I/thumbnail.png + - raw_url: i.ytimg.com/vi/-KpLmsAR23I/maxresdefault.jpg + fuzzified_url: i.ytimg.com.fuzzy.replayweb.page/vi/-KpLmsAR23I/thumbnail.jpg + - raw_url: i.ytimg.com/vi/-KpLmsAR23I/max-res.default.jpg + fuzzified_url: i.ytimg.com.fuzzy.replayweb.page/vi/-KpLmsAR23I/thumbnail.jpg + - name: trim_digits_only + pattern: ([^?]+)\?[\d]+$ replace : \1 - - pattern: (?:www\.)?youtube(?:-nocookie)?\.com\/(youtubei\/[^?]+).*(videoId[^&]+).* + tests: + - raw_url: www.example.com/page?1234 + fuzzified_url: www.example.com/page + - raw_url: www.example.com/page?foo=1234 + unchanged: true + - raw_url: www.example.com/page1234 + unchanged: true + - raw_url: 
www.example.com/page?foo=bar&1234 + unchanged: true + - raw_url: www.example.com/page?1234=bar + unchanged: true + - raw_url: www.example.com/page?1234&foo=bar + unchanged: true + - name: youtubei + pattern: (?:www\.)?youtube(?:-nocookie)?\.com\/(youtubei\/[^?]+).*(videoId[^&]+).* replace : youtube.fuzzy.replayweb.page/\1?\2 - - pattern: (?:www\.)?youtube(?:-nocookie)?\.com/embed/([^?]+).* + tests: + - raw_url: www.youtube-nocookie.com/youtubei/page/?videoId=123ah + fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah + - raw_url: youtube-nocookie.com/youtubei/page/?videoId=123ah + fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah + - raw_url: youtube.com/youtubei/page/?videoId=123ah + fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah + - raw_url: www.youtube.com/youtubei/page/?videoId=123ah + fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah + - raw_url: youtube.com/youtubei/page/videoId=123ah + fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah + - raw_url: youtube.com/youtubei/page/videoIdqqq=123ah + fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoIdqqq=123ah + - raw_url: youtube.com/youtubei/page/videoId=123ah&foo=bar + fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah + - raw_url: youtube.com/youtubei/page/?foo=bar&videoId=123ah + fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah + - raw_url: youtube.com/youtubei/page/foo=bar&videoId=123ah + fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/foo=bar&?videoId=123ah + - raw_url: youtube.com/youtubei/?videoId=123ah + unchanged: true + - name: youtube_embed + pattern: (?:www\.)?youtube(?:-nocookie)?\.com/embed/([^?]+).* replace : youtube.fuzzy.replayweb.page/embed/\1 - # next one is a custom warc2zim rule intended to fix Vimeo support - - pattern: 
.*(?:gcs-vimeo|vod|vod-progressive|vod-adaptive)\.akamaized\.net.*/(.+?.mp4)\?.*range=(.*?)(?:&|$) + tests: + - raw_url: www.youtube-nocookie.com/embed/foo + fuzzified_url: youtube.fuzzy.replayweb.page/embed/foo + - raw_url: www.youtube-nocookie.com/embed/bar + fuzzified_url: youtube.fuzzy.replayweb.page/embed/bar + - raw_url: www.youtube-nocookie.com/embed/foo/bar + fuzzified_url: youtube.fuzzy.replayweb.page/embed/foo/bar + - raw_url: www.youtube.com/embed/foo + fuzzified_url: youtube.fuzzy.replayweb.page/embed/foo + - raw_url: youtube.com/embed/foo + fuzzified_url: youtube.fuzzy.replayweb.page/embed/foo + - raw_url: youtube-nocookie.com/embed/foo + fuzzified_url: youtube.fuzzy.replayweb.page/embed/foo + - raw_url: youtube.com/embed/foo?bar=alice + fuzzified_url: youtube.fuzzy.replayweb.page/embed/foo + + - name: vimeo_cdn_fix # custom warc2zim rule intended to fix Vimeo support + pattern: .*(?:gcs-vimeo|vod|vod-progressive|vod-adaptive)\.akamaized\.net.*\/(.+?.mp4)\?.*range=(.*?)(?:&.*|$) replace : vimeo-cdn.fuzzy.replayweb.page/\1?range=\2 - - pattern: .*(?:gcs-vimeo|vod|vod-progressive)\.akamaized\.net.*?/([\d/]+.mp4)$ + tests: + - raw_url: gcs-vimeo.akamaized.net/123.mp4?range=123-456 + fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456 + - raw_url: vod.akamaized.net/123.mp4?range=123-456 + fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456 + - raw_url: vod-progressive.akamaized.net/123.mp4?range=123-456 + fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456 + - raw_url: vod-adaptive.akamaized.net/123.mp4?range=123-456 + fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456 + - raw_url: vod.akamaized.net/123.mp4?foo=bar&range=123-456 + fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456 + - raw_url: vod.akamaized.net/123.mp4?foo=bar&range=123-456&bar=foo + fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456 + - raw_url: 
vod.akamaized.net/123.mp4?range=123-456&bar=foo + fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456 + - raw_url: foovod.akamaized.net/123.mp4?range=123-456 + fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456 + - raw_url: vod.akamaized.net/1/23.mp4?range=123-456 + fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/23.mp4?range=123-456 + - raw_url: vod.akamaized.net/a/23.mp4?range=123-456 + fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/23.mp4?range=123-456 + - raw_url: vod.akamaized.net/foo/bar/23.mp4?range=123-456 + fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/23.mp4?range=123-456 + - raw_url: foo.akamaized.net/123.mp4?range=123-456 + unchanged: true + - name: vimeo_cdn + pattern: .*(?:gcs-vimeo|vod|vod-progressive)\.akamaized\.net.*?\/([\d/]+.mp4)$ replace : vimeo-cdn.fuzzy.replayweb.page/\1 - - pattern: .*player.vimeo.com/(video/[\d]+)\?.* + tests: + - raw_url: vod.akamaized.net/23.mp4 + fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/23.mp4 + - raw_url: vod.akamaized.net/23/12332.mp4 + fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/23/12332.mp4 + - raw_url: https://vod-progressive.akamaized.net/exp=1635528595~acl=%2Fvimeo-prod-skyfire-std-us%2F01%2F4423%2F13%2F347119375%2F1398505169.mp4~hmac=27c31f1990aab5e5429f7f7db5b2dcbcf8d2f5c92184d53102da36920d33d53e/vimeo-prod-skyfire-std-us/01/4423/13/347119375/1398505169.mp4 + fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4 + - name: vimeo_player + pattern: .*player.vimeo.com\/(video\/[\d]+)\?.* replace : vimeo.fuzzy.replayweb.page/\1 - - pattern: .*i\.vimeocdn\.com\/(.*)\?.* + tests: + - raw_url: player.vimeo.com/video/1234?foo=bar + fuzzified_url: vimeo.fuzzy.replayweb.page/video/1234 + - raw_url: foo.player.vimeo.com/video/1234?foo=bar + fuzzified_url: vimeo.fuzzy.replayweb.page/video/1234 + - raw_url: player.vimeo.com/video/1234?foo + fuzzified_url: vimeo.fuzzy.replayweb.page/video/1234 + - raw_url: player.vimeo.com/video/1/23?foo=bar + unchanged: true + - 
raw_url: player.vimeo.com/video/123a?foo=bar + unchanged: true + - raw_url: player.vimeo.com/video/?foo=bar + unchanged: true + - name: i_vimeo_cdn + pattern: .*i\.vimeocdn\.com\/(.*)\?.* replace : i.vimeocdn.fuzzy.replayweb.page/\1 + tests: + - raw_url: i.vimeocdn.com/image/1234?foo=bar + fuzzified_url: i.vimeocdn.fuzzy.replayweb.page/image/1234 + - raw_url: i.vimeocdn.com/something/a456?foo + fuzzified_url: i.vimeocdn.fuzzy.replayweb.page/something/a456 diff --git a/tests/test_fuzzy_rules.py b/tests/test_fuzzy_rules.py deleted file mode 100644 index 1b18386..0000000 --- a/tests/test_fuzzy_rules.py +++ /dev/null @@ -1,352 +0,0 @@ -import pytest - -from warc2zim.url_rewriting import apply_fuzzy_rules - -from .utils import ContentForTests - - -@pytest.fixture( - params=[ - ContentForTests( - "foobargooglevideo.com/videoplayback?id=1576&key=value", - "youtube.fuzzy.replayweb.page/videoplayback?id=1576", - ), - ContentForTests( - "foobargooglevideo.com/videoplayback?some=thing&id=1576", - "youtube.fuzzy.replayweb.page/videoplayback?id=1576", - ), - ContentForTests( - "foobargooglevideo.com/videoplayback?some=thing&id=1576&key=value", - "youtube.fuzzy.replayweb.page/videoplayback?id=1576", - ), - # videoplayback is not followed by `?` - ContentForTests( - "foobargooglevideo.com/videoplaybackandfoo?some=thing&id=1576&key=value" - ), - # No googlevideo.com in url - ContentForTests( - "foobargoogle_video.com/videoplaybackandfoo?some=thing&id=1576&key=value" - ), - ] -) -def google_videos_case(request): - yield request.param - - -def test_fuzzyrules_google_videos(google_videos_case): - assert ( - apply_fuzzy_rules(google_videos_case.input_str) - == google_videos_case.expected_str - ) - - -@pytest.fixture( - params=[ - ContentForTests( - "www.youtube.com/get_video_info?video_id=123ah", - "youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah", - ), - ContentForTests( - "www.youtube.com/get_video_info?foo=bar&video_id=123ah", - 
"youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah", - ), - ContentForTests( - "www.youtube.com/get_video_info?video_id=123ah&foo=bar", - "youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah", - ), - ContentForTests( - "youtube.com/get_video_info?video_id=123ah", - "youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah", - ), - ContentForTests( - "youtube-nocookie.com/get_video_info?video_id=123ah", - "youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah", - ), - ContentForTests( - "www.youtube-nocookie.com/get_video_info?video_id=123ah", - "youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah", - ), - # no video_id parameter - ContentForTests( - "www.youtube-nocookie.com/get_video_info?foo=bar", - ), - # improper hostname - ContentForTests( - "www.youtubeqnocookie.com/get_video_info?video_id=123ah", - ), - ] -) -def google_video_info_case(request): - yield request.param - - -def test_fuzzyrules_google_video_infos(google_video_info_case): - assert ( - apply_fuzzy_rules(google_video_info_case.input_str) - == google_video_info_case.expected_str - ) - - -@pytest.fixture( - params=[ - ContentForTests( - "i.ytimg.com/vi/-KpLmsAR23I/maxresdefault.jpg?sqp=-oaymwEmCIAKENAF8quKqQMa8" - "AEB-AH-CYAC0AWKAgwIABABGHIgTyg-MA8=&rs=AOn4CLDr-FmDmP3aCsD84l48ygBmkwHg-g", - "i.ytimg.com.fuzzy.replayweb.page/vi/-KpLmsAR23I/thumbnail.jpg", - ), - ContentForTests( - "i.ytimg.com/vi/-KpLmsAR23I/maxresdefault.png?sqp=-oaymwEmCIAKENAF8quKqQMa8" - "AEB-AH-CYAC0AWKAgwIABABGHIgTyg-MA8=&rs=AOn4CLDr-FmDmP3aCsD84l48ygBmkwHg-g", - "i.ytimg.com.fuzzy.replayweb.page/vi/-KpLmsAR23I/thumbnail.png", - ), - ContentForTests( - "i.ytimg.com/vi/-KpLmsAR23I/maxresdefault.jpg", - "i.ytimg.com.fuzzy.replayweb.page/vi/-KpLmsAR23I/thumbnail.jpg", - ), - ContentForTests( - "i.ytimg.com/vi/-KpLmsAR23I/max-res.default.jpg", - "i.ytimg.com.fuzzy.replayweb.page/vi/-KpLmsAR23I/thumbnail.jpg", - ), - ] -) -def youtube_thumbnails_case(request): - yield request.param - - -def 
test_fuzzyrules_youtube_thumbnails(youtube_thumbnails_case): - assert ( - apply_fuzzy_rules(youtube_thumbnails_case.input_str) - == youtube_thumbnails_case.expected_str - ) - - -@pytest.fixture( - params=[ - ContentForTests( - "www.example.com/page?1234", - "www.example.com/page", - ), - ContentForTests( - "www.example.com/page?foo=1234", - ), - ContentForTests( - "www.example.com/page1234", - ), - ContentForTests( - "www.example.com/page?foo=bar&1234", - ), - ContentForTests( - "www.example.com/page?1234=bar", - ), - ContentForTests( - "www.example.com/page?1234&foo=bar", - ), - ] -) -def trim_digits_only_query_case(request): - yield request.param - - -def test_fuzzyrules_trim_digits_only_query(trim_digits_only_query_case): - assert ( - apply_fuzzy_rules(trim_digits_only_query_case.input_str) - == trim_digits_only_query_case.expected_str - ) - - -@pytest.fixture( - params=[ - ContentForTests( - "www.youtube-nocookie.com/youtubei/page/?videoId=123ah", - "youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah", - ), - ContentForTests( - "youtube-nocookie.com/youtubei/page/?videoId=123ah", - "youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah", - ), - ContentForTests( - "youtube.com/youtubei/page/?videoId=123ah", - "youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah", - ), - ContentForTests( - "www.youtube.com/youtubei/page/?videoId=123ah", - "youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah", - ), - ContentForTests( - "youtube.com/youtubei/page/videoId=123ah", - "youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah", - ), - ContentForTests( - "youtube.com/youtubei/page/videoIdqqq=123ah", - "youtube.fuzzy.replayweb.page/youtubei/page/?videoIdqqq=123ah", - ), - ContentForTests( - "youtube.com/youtubei/page/videoId=123ah&foo=bar", - "youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah", - ), - ContentForTests( - "youtube.com/youtubei/page/?foo=bar&videoId=123ah", - "youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah", - ), - 
ContentForTests( - "youtube.com/youtubei/page/foo=bar&videoId=123ah", - "youtube.fuzzy.replayweb.page/youtubei/page/foo=bar&?videoId=123ah", - ), - ContentForTests( - "youtube.com/youtubei/?videoId=123ah", - ), - ] -) -def youtubei_case(request): - yield request.param - - -def test_fuzzyrules_youtubei(youtubei_case): - assert apply_fuzzy_rules(youtubei_case.input_str) == youtubei_case.expected_str - - -@pytest.fixture( - params=[ - ContentForTests( - "www.youtube-nocookie.com/embed/foo", - "youtube.fuzzy.replayweb.page/embed/foo", - ), - ContentForTests( - "www.youtube-nocookie.com/embed/bar", - "youtube.fuzzy.replayweb.page/embed/bar", - ), - ContentForTests( - "www.youtube-nocookie.com/embed/foo/bar", - "youtube.fuzzy.replayweb.page/embed/foo/bar", - ), - ContentForTests( - "www.youtube.com/embed/foo", - "youtube.fuzzy.replayweb.page/embed/foo", - ), - ContentForTests( - "youtube.com/embed/foo", - "youtube.fuzzy.replayweb.page/embed/foo", - ), - ContentForTests( - "youtube-nocookie.com/embed/foo", - "youtube.fuzzy.replayweb.page/embed/foo", - ), - ContentForTests( - "youtube.com/embed/foo?bar=alice", - "youtube.fuzzy.replayweb.page/embed/foo", - ), - ] -) -def youtube_embed_case(request): - yield request.param - - -def test_fuzzyrules_youtube_embed(youtube_embed_case): - assert ( - apply_fuzzy_rules(youtube_embed_case.input_str) - == youtube_embed_case.expected_str - ) - - -@pytest.fixture( - params=[ - ContentForTests( - "gcs-vimeo.akamaized.net/123.mp4?range=123-456", - "vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456", - ), - ContentForTests( - "vod.akamaized.net/123.mp4?range=123-456", - "vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456", - ), - ContentForTests( - "vod-progressive.akamaized.net/123.mp4?range=123-456", - "vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456", - ), - ContentForTests( - "vod-adaptive.akamaized.net/123.mp4?range=123-456", - "vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456", - ), - ContentForTests( - 
"vod.akamaized.net/123.mp4?foo=bar&range=123-456", - "vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456", - ), - ContentForTests( - "vod.akamaized.net/123.mp4?foo=bar&range=123-456&bar=foo", - "vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456", - ), - ContentForTests( - "vod.akamaized.net/123.mp4?range=123-456&bar=foo", - "vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456", - ), - ContentForTests( - "foovod.akamaized.net/123.mp4?range=123-456", - "vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456", - ), - ContentForTests( - "vod.akamaized.net/1/23.mp4?range=123-456", - "vimeo-cdn.fuzzy.replayweb.page/23.mp4?range=123-456", - ), - ContentForTests( - "vod.akamaized.net/a/23.mp4?range=123-456", - "vimeo-cdn.fuzzy.replayweb.page/23.mp4?range=123-456", - ), - ContentForTests( - "vod.akamaized.net/foo/bar/23.mp4?range=123-456", - "vimeo-cdn.fuzzy.replayweb.page/23.mp4?range=123-456", - ), - ContentForTests( - "foo.akamaized.net/123.mp4?range=123-456", - ), - ContentForTests( - "vod.akamaized.net/23.mp4", - "vimeo-cdn.fuzzy.replayweb.page/23.mp4", - ), - ContentForTests( - "vod.akamaized.net/23/12332.mp4", - "vimeo-cdn.fuzzy.replayweb.page/23/12332.mp4", - ), - ContentForTests( - "https://vod-progressive.akamaized.net/exp=1635528595" - "~acl=%2Fvimeo-prod-skyfire-std-us" - "%2F01%2F4423%2F13%2F347119375%2F1398505169.mp4" - "~hmac=27c31f1990aab5e5429f7f7db5b2dcbcf8d2f5c92184d53102da36920d33d53e" - "/vimeo-prod-skyfire-std-us/01/4423/13/347119375/1398505169.mp4", - "vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4", - ), - ] -) -def vimeo_cdn_case(request): - yield request.param - - -def test_fuzzyrules_vimeo_cdn(vimeo_cdn_case): - assert apply_fuzzy_rules(vimeo_cdn_case.input_str) == vimeo_cdn_case.expected_str - - -@pytest.fixture( - params=[ - ContentForTests( - "player.vimeo.com/video/123?foo=bar", - "vimeo.fuzzy.replayweb.page/video/123", - ), - ContentForTests( - "foo.player.vimeo.com/video/123?foo=bar", - 
"vimeo.fuzzy.replayweb.page/video/123", - ), - ContentForTests( - "player.vimeo.com/video/1/23?foo=bar", - ), - ContentForTests( - "player.vimeo.com/video/123a?foo=bar", - ), - ContentForTests( - "player.vimeo.com/video/?foo=bar", - ), - ] -) -def vimeo_host_case(request): - yield request.param - - -def test_fuzzyrules_vimeo_host(vimeo_host_case): - assert apply_fuzzy_rules(vimeo_host_case.input_str) == vimeo_host_case.expected_str