Skip to content

Commit

Permalink
Merge pull request #348 from openzim/generate_fuzzy_tests
Browse files Browse the repository at this point in the history
Generate fuzzy rules tests
  • Loading branch information
benoit74 authored Jul 30, 2024
2 parents 95790fa + 8f05986 commit deb7df6
Show file tree
Hide file tree
Showing 7 changed files with 268 additions and 398 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -504,7 +504,9 @@ tmp

# rule files are generated by rules/generate_rules.py
src/warc2zim/rules.py
tests/test_fuzzy_rules.py
javascript/src/fuzzyRules.js
javascript/test/fuzzyRules.js

# wombatSetup.js is generated with rollup
src/warc2zim/statics/wombatSetup.js
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Changed

- Generate fuzzy rules tests in Python and Javascript (#284)

### Fixed

- Handle case where the redirect target is bad / unsupported (#332 and #356)
- Fixed WARC files handling order to follow creation order (#366)
- Remove subsequent slashes in URLs, both in Python and JS (#365)
- Ignore non HTTP(S) WARC records (#351)
- Fix `vimeo_cdn_fix` fuzzy rule for proper operation in Javascript (#348)

## [2.0.3] - 2024-07-24

Expand Down
1 change: 1 addition & 0 deletions javascript/.prettierignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
src/fuzzyRules.js
test/fuzzyRules.js
35 changes: 0 additions & 35 deletions javascript/test/fuzzyRules.js

This file was deleted.

103 changes: 103 additions & 0 deletions rules/generate_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,21 @@

FUZZY_RULES = yaml.safe_load(rules_src.read_text())["fuzzyRules"]

# Fail fast on incomplete rules before generating anything: every rule must carry a
# name and at least one test case.
for rule in FUZZY_RULES:
    if "name" not in rule:
        raise SystemExit("Fuzzy rule is missing a name")
    # An empty `tests:` key in YAML loads as None, on which len() would raise a
    # TypeError; a truthiness check covers missing, None and empty list alike.
    if not rule.get("tests"):
        raise SystemExit("Fuzzy rule is missing test cases")


PY2JS_RULE_RX = re.compile(r"\\(\d)", re.ASCII)

# Do not escape anything, we want to generate code as-is, it won't be interpreted as
# HTML anyway
JINJA_ENV = Environment(autoescape=False) # noqa: S701

### Generate Javascript code

js_code_template = """// THIS IS AN AUTOMATICALLY GENERATED FILE, DO NOT MODIFY DIRECTLY
export const fuzzyRules = [
Expand Down Expand Up @@ -49,6 +58,50 @@
)
print("JS rules generation completed successfully")

### Generate Javascript tests

# Jinja template of the generated JS test file. For every test case of every rule in
# FUZZY_RULES it emits one ava test named `fuzzyrules_<rule name>_<loop.index>`
# asserting that applyFuzzyRules(raw_url) returns `raw_url` itself when the case is
# flagged `unchanged`, and `fuzzified_url` otherwise.
# NOTE(review): URLs are interpolated verbatim inside single-quoted JS literals, so a
# test URL containing a quote or a backslash would presumably yield a broken file.
js_test_template = """// THIS IS AN AUTOMATICALLY GENERATED FILE, DO NOT MODIFY DIRECTLY
import test from 'ava';
import { applyFuzzyRules } from '../src/wombatSetup.js';
{% for rule in FUZZY_RULES %}
{% for test in rule['tests'] %}
test('fuzzyrules_{{rule['name']}}_{{loop.index}}', (t) => {
t.is(
applyFuzzyRules(
'{{test['raw_url']}}',
),
'{{test['raw_url'] if test['unchanged'] else test['fuzzified_url']}}',
);
});
{% endfor %}
{% endfor %}
"""

# Target folder is located relative to this script, not to the working directory
js_parent = Path(__file__).joinpath("../../javascript/test").resolve()
if not js_parent.exists():
    # This skip is useful mostly for CI operations when working on the Python part
    print("Skipping JS tests generation, target folder is missing")
else:
    # The test template only reads the `name` and `tests` of each rule, so rules are
    # passed as loaded from YAML: no JS-converted `match` / `replace` is needed here
    (js_parent / "fuzzyRules.js").write_text(
        JINJA_ENV.from_string(js_test_template).render(FUZZY_RULES=FUZZY_RULES)
    )
    print("JS tests generation completed successfully")

### Generate Python code

py_code_template = """# THIS IS AN AUTOMATICALLY GENERATED FILE, DO NOT MODIFY DIRECTLY
FUZZY_RULES = [
Expand All @@ -69,3 +122,53 @@
JINJA_ENV.from_string(py_code_template).render(FUZZY_RULES=FUZZY_RULES)
)
print("Python rules generation completed successfully")

### Generate Python tests

# Jinja template of the generated pytest module. For every rule in FUZZY_RULES it
# emits a fixture `<rule name>_case` parametrized with one ContentForTests per test
# case (built from `raw_url` only when the case is flagged `unchanged`, from `raw_url`
# and `fuzzified_url` otherwise) and a `test_fuzzyrules_<rule name>` function asserting
# that apply_fuzzy_rules(case.input_str) equals case.expected_str.
# NOTE(review): URLs are interpolated verbatim inside double-quoted Python literals, so
# a test URL containing a double quote or a backslash would presumably yield a broken
# file.
py_test_template = """# THIS IS AN AUTOMATICALLY GENERATED FILE, DO NOT MODIFY DIRECTLY
import pytest
from warc2zim.url_rewriting import apply_fuzzy_rules
from .utils import ContentForTests
{% for rule in FUZZY_RULES %}
@pytest.fixture(
params=[
{% for test in rule['tests'] %}
{% if test['unchanged'] %}
ContentForTests(
"{{ test['raw_url'] }}",
),
{% else %}
ContentForTests(
"{{ test['raw_url'] }}",
"{{ test['fuzzified_url'] }}",
),
{% endif %}
{% endfor %}
]
)
def {{ rule['name'] }}_case(request):
yield request.param
def test_fuzzyrules_{{ rule['name'] }}({{ rule['name'] }}_case):
assert (
apply_fuzzy_rules({{ rule['name'] }}_case.input_str)
== {{ rule['name'] }}_case.expected_str
)
{% endfor %}
"""

# Target folder is located relative to this script, not to the working directory
py_parent = Path(__file__).joinpath("../../tests").resolve()
if py_parent.exists():
    py_tests_file = py_parent / "test_fuzzy_rules.py"
    py_tests_code = JINJA_ENV.from_string(py_test_template).render(
        FUZZY_RULES=FUZZY_RULES
    )
    py_tests_file.absolute().write_text(py_tests_code)
    print("Python tests generation completed successfully")
else:
    # Skipping is mostly useful for CI operations when working on the JS part only
    print("Skipping Python tests generation, target folder is missing")
168 changes: 157 additions & 11 deletions rules/rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,24 +16,170 @@
# Generic rules are also omitted on purpose, we don't need them
#
fuzzyRules:
- pattern: .*googlevideo.com/(videoplayback(?=\?)).*[?&](id=[^&]+).*
- name: googlevideo_com
pattern: .*googlevideo.com/(videoplayback(?=\?)).*[?&](id=[^&]+).*
replace: youtube.fuzzy.replayweb.page/\1?\2
- pattern: (?:www\.)?youtube(?:-nocookie)?\.com/(get_video_info\?).*(video_id=[^&]+).*
tests:
- raw_url: foobargooglevideo.com/videoplayback?id=1576&key=value
fuzzified_url: youtube.fuzzy.replayweb.page/videoplayback?id=1576
- raw_url: foobargooglevideo.com/videoplayback?some=thing&id=1576
fuzzified_url: youtube.fuzzy.replayweb.page/videoplayback?id=1576
- raw_url: foobargooglevideo.com/videoplayback?some=thing&id=1576&key=value
fuzzified_url: youtube.fuzzy.replayweb.page/videoplayback?id=1576
- raw_url: foobargooglevideo.com/videoplaybackandfoo?some=thing&id=1576&key=value
unchanged: true # videoplayback is not followed by `?`
- raw_url: foobargoogle_video.com/videoplaybackandfoo?some=thing&id=1576&key=value
unchanged: true # No googlevideo.com in url
- name: youtube_video_info
pattern: (?:www\.)?youtube(?:-nocookie)?\.com/(get_video_info\?).*(video_id=[^&]+).*
replace : youtube.fuzzy.replayweb.page/\1\2
- pattern: i\.ytimg\.com\/vi\/(.*?)\/.*?\.(\w*?)(?:\?.*|$)
tests:
- raw_url: www.youtube.com/get_video_info?video_id=123ah
fuzzified_url: youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah
- raw_url: www.youtube.com/get_video_info?foo=bar&video_id=123ah
fuzzified_url: youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah
- raw_url: www.youtube.com/get_video_info?video_id=123ah&foo=bar
fuzzified_url: youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah
- raw_url: youtube.com/get_video_info?video_id=123ah
fuzzified_url: youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah
- raw_url: youtube-nocookie.com/get_video_info?video_id=123ah
fuzzified_url: youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah
- raw_url: www.youtube-nocookie.com/get_video_info?video_id=123ah
fuzzified_url: youtube.fuzzy.replayweb.page/get_video_info?video_id=123ah
- raw_url: www.youtube-nocookie.com/get_video_info?foo=bar
unchanged: true # no video_id parameter
- raw_url: www.youtubeqnocookie.com/get_video_info?video_id=123ah
unchanged: true # improper hostname
- name: youtube_thumbnails
pattern: i\.ytimg\.com\/vi\/(.*?)\/.*?\.(\w*?)(?:\?.*|$)
replace : i.ytimg.com.fuzzy.replayweb.page/vi/\1/thumbnail.\2
- pattern: ([^?]+)\?[\d]+$
tests:
- raw_url: i.ytimg.com/vi/-KpLmsAR23I/maxresdefault.jpg?sqp=-oaymwEmCIAKENAF8quKqQMa8AEB-AH-CYAC0AWKAgwIABABGHIgTyg-MA8=&rs=AOn4CLDr-FmDmP3aCsD84l48ygBmkwHg-g
fuzzified_url: i.ytimg.com.fuzzy.replayweb.page/vi/-KpLmsAR23I/thumbnail.jpg
- raw_url: i.ytimg.com/vi/-KpLmsAR23I/maxresdefault.png?sqp=-oaymwEmCIAKENAF8quKqQMa8AEB-AH-CYAC0AWKAgwIABABGHIgTyg-MA8=&rs=AOn4CLDr-FmDmP3aCsD84l48ygBmkwHg-g
fuzzified_url: i.ytimg.com.fuzzy.replayweb.page/vi/-KpLmsAR23I/thumbnail.png
- raw_url: i.ytimg.com/vi/-KpLmsAR23I/maxresdefault.jpg
fuzzified_url: i.ytimg.com.fuzzy.replayweb.page/vi/-KpLmsAR23I/thumbnail.jpg
- raw_url: i.ytimg.com/vi/-KpLmsAR23I/max-res.default.jpg
fuzzified_url: i.ytimg.com.fuzzy.replayweb.page/vi/-KpLmsAR23I/thumbnail.jpg
- name: trim_digits_only
pattern: ([^?]+)\?[\d]+$
replace : \1
- pattern: (?:www\.)?youtube(?:-nocookie)?\.com\/(youtubei\/[^?]+).*(videoId[^&]+).*
tests:
- raw_url: www.example.com/page?1234
fuzzified_url: www.example.com/page
- raw_url: www.example.com/page?foo=1234
unchanged: true
- raw_url: www.example.com/page1234
unchanged: true
- raw_url: www.example.com/page?foo=bar&1234
unchanged: true
- raw_url: www.example.com/page?1234=bar
unchanged: true
- raw_url: www.example.com/page?1234&foo=bar
unchanged: true
- name: youtubei
pattern: (?:www\.)?youtube(?:-nocookie)?\.com\/(youtubei\/[^?]+).*(videoId[^&]+).*
replace : youtube.fuzzy.replayweb.page/\1?\2
- pattern: (?:www\.)?youtube(?:-nocookie)?\.com/embed/([^?]+).*
tests:
- raw_url: www.youtube-nocookie.com/youtubei/page/?videoId=123ah
fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah
- raw_url: youtube-nocookie.com/youtubei/page/?videoId=123ah
fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah
- raw_url: youtube.com/youtubei/page/?videoId=123ah
fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah
- raw_url: www.youtube.com/youtubei/page/?videoId=123ah
fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah
- raw_url: youtube.com/youtubei/page/videoId=123ah
fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah
- raw_url: youtube.com/youtubei/page/videoIdqqq=123ah
fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoIdqqq=123ah
- raw_url: youtube.com/youtubei/page/videoId=123ah&foo=bar
fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah
- raw_url: youtube.com/youtubei/page/?foo=bar&videoId=123ah
fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/?videoId=123ah
- raw_url: youtube.com/youtubei/page/foo=bar&videoId=123ah
fuzzified_url: youtube.fuzzy.replayweb.page/youtubei/page/foo=bar&?videoId=123ah
- raw_url: youtube.com/youtubei/?videoId=123ah
unchanged: true
- name: youtube_embed
pattern: (?:www\.)?youtube(?:-nocookie)?\.com/embed/([^?]+).*
replace : youtube.fuzzy.replayweb.page/embed/\1
# next one is a custom warc2zim rule intended to fix Vimeo support
- pattern: .*(?:gcs-vimeo|vod|vod-progressive|vod-adaptive)\.akamaized\.net.*/(.+?.mp4)\?.*range=(.*?)(?:&|$)
tests:
- raw_url: www.youtube-nocookie.com/embed/foo
fuzzified_url: youtube.fuzzy.replayweb.page/embed/foo
- raw_url: www.youtube-nocookie.com/embed/bar
fuzzified_url: youtube.fuzzy.replayweb.page/embed/bar
- raw_url: www.youtube-nocookie.com/embed/foo/bar
fuzzified_url: youtube.fuzzy.replayweb.page/embed/foo/bar
- raw_url: www.youtube.com/embed/foo
fuzzified_url: youtube.fuzzy.replayweb.page/embed/foo
- raw_url: youtube.com/embed/foo
fuzzified_url: youtube.fuzzy.replayweb.page/embed/foo
- raw_url: youtube-nocookie.com/embed/foo
fuzzified_url: youtube.fuzzy.replayweb.page/embed/foo
- raw_url: youtube.com/embed/foo?bar=alice
fuzzified_url: youtube.fuzzy.replayweb.page/embed/foo

- name: vimeo_cdn_fix # custom warc2zim rule intended to fix Vimeo support
pattern: .*(?:gcs-vimeo|vod|vod-progressive|vod-adaptive)\.akamaized\.net.*\/(.+?.mp4)\?.*range=(.*?)(?:&.*|$)
replace : vimeo-cdn.fuzzy.replayweb.page/\1?range=\2
- pattern: .*(?:gcs-vimeo|vod|vod-progressive)\.akamaized\.net.*?/([\d/]+.mp4)$
tests:
- raw_url: gcs-vimeo.akamaized.net/123.mp4?range=123-456
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456
- raw_url: vod.akamaized.net/123.mp4?range=123-456
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456
- raw_url: vod-progressive.akamaized.net/123.mp4?range=123-456
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456
- raw_url: vod-adaptive.akamaized.net/123.mp4?range=123-456
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456
- raw_url: vod.akamaized.net/123.mp4?foo=bar&range=123-456
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456
- raw_url: vod.akamaized.net/123.mp4?foo=bar&range=123-456&bar=foo
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456
- raw_url: vod.akamaized.net/123.mp4?range=123-456&bar=foo
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456
- raw_url: foovod.akamaized.net/123.mp4?range=123-456
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456
- raw_url: vod.akamaized.net/1/23.mp4?range=123-456
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/23.mp4?range=123-456
- raw_url: vod.akamaized.net/a/23.mp4?range=123-456
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/23.mp4?range=123-456
- raw_url: vod.akamaized.net/foo/bar/23.mp4?range=123-456
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/23.mp4?range=123-456
- raw_url: foo.akamaized.net/123.mp4?range=123-456
unchanged: true
- name: vimeo_cdn
pattern: .*(?:gcs-vimeo|vod|vod-progressive)\.akamaized\.net.*?\/([\d/]+.mp4)$
replace : vimeo-cdn.fuzzy.replayweb.page/\1
- pattern: .*player.vimeo.com/(video/[\d]+)\?.*
tests:
- raw_url: vod.akamaized.net/23.mp4
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/23.mp4
- raw_url: vod.akamaized.net/23/12332.mp4
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/23/12332.mp4
- raw_url: https://vod-progressive.akamaized.net/exp=1635528595~acl=%2Fvimeo-prod-skyfire-std-us%2F01%2F4423%2F13%2F347119375%2F1398505169.mp4~hmac=27c31f1990aab5e5429f7f7db5b2dcbcf8d2f5c92184d53102da36920d33d53e/vimeo-prod-skyfire-std-us/01/4423/13/347119375/1398505169.mp4
fuzzified_url: vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4
- name: vimeo_player
pattern: .*player.vimeo.com\/(video\/[\d]+)\?.*
replace : vimeo.fuzzy.replayweb.page/\1
- pattern: .*i\.vimeocdn\.com\/(.*)\?.*
tests:
- raw_url: player.vimeo.com/video/1234?foo=bar
fuzzified_url: vimeo.fuzzy.replayweb.page/video/1234
- raw_url: foo.player.vimeo.com/video/1234?foo=bar
fuzzified_url: vimeo.fuzzy.replayweb.page/video/1234
- raw_url: player.vimeo.com/video/1234?foo
fuzzified_url: vimeo.fuzzy.replayweb.page/video/1234
- raw_url: player.vimeo.com/video/1/23?foo=bar
unchanged: true
- raw_url: player.vimeo.com/video/123a?foo=bar
unchanged: true
- raw_url: player.vimeo.com/video/?foo=bar
unchanged: true
- name: i_vimeo_cdn
pattern: .*i\.vimeocdn\.com\/(.*)\?.*
replace : i.vimeocdn.fuzzy.replayweb.page/\1
tests:
- raw_url: i.vimeocdn.com/image/1234?foo=bar
fuzzified_url: i.vimeocdn.fuzzy.replayweb.page/image/1234
- raw_url: i.vimeocdn.com/something/a456?foo
fuzzified_url: i.vimeocdn.fuzzy.replayweb.page/something/a456
Loading

0 comments on commit deb7df6

Please sign in to comment.