Skip to content

Commit

Permalink
Handle http and https url patterns in regulations checkup flow
Browse files Browse the repository at this point in the history
  • Loading branch information
VincentAntoine committed Mar 6, 2024
1 parent baff9ff commit 1b660f1
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 13 deletions.
2 changes: 1 addition & 1 deletion datascience/src/pipeline/flows/regulations_checkup.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def add_article_id(regulations: pd.DataFrame, url_column: str) -> pd.DataFrame:
"""
legipeche_regex = re.compile(
(
- r"^http://legipeche\.metier\."
+ r"^https?://legipeche\.metier\."
r"e2\.rie\.gouv\.fr/"
r"(?:[a-zA-Z0-9-]*)"
r"-a(?P<article_id>\d+)"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@ INSERT INTO public.legipeche (
extraction_datetime_utc, extraction_occurence, page_title, page_url, document_title, document_url
) VALUES
( '2021-3-2 14:25', 'previous', 'Some old page', 'http://legipeche.metier.e2.rie.gouv.fr/deleted-regulation-a671.html', 'Some old reg text', 'http://some.thing'),
- ( '2021-3-2 14:25', 'previous', 'Med. sea regulation', 'http://legipeche.metier.e2.rie.gouv.fr/regulation-with-unstable-url-a689.html', 'Med reg text', 'http://med.reg'),
+ ( '2021-3-2 14:25', 'previous', 'Med. sea regulation', 'https://legipeche.metier.e2.rie.gouv.fr/regulation-with-unstable-url-a689.html', 'Med reg text', 'http://med.reg'),
( '2021-3-2 14:25', 'previous', 'Bretagne regulation', 'http://legipeche.metier.e2.rie.gouv.fr/some-regulation-a666.html', 'Bretagne reg text', 'http://bzh.reg'),
( '2021-3-2 14:25', 'previous', 'Bretagne modified reg', 'http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html', 'Bretagne modified reg 1', 'http://bzh.other_1'),
( '2021-3-2 14:25', 'previous', 'Bretagne modified reg', 'http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html', 'Bretagne modified reg 2', 'http://bzh.other_2'),
( '2021-3-3 14:25', 'latest', 'Bretagne modified reg', 'http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html', 'Bretagne modified reg 1', 'http://bzh.other_1'),
( '2021-3-3 14:25', 'latest', 'Bretagne modified reg', 'http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html', 'Bretagne modified reg 3', 'http://bzh.other_3'),
( '2021-3-3 14:25', 'latest', 'Bretagne regulation', 'http://legipeche.metier.e2.rie.gouv.fr/some-regulation-a666.html', 'Bretagne reg text', 'http://bzh.reg'),
( '2021-3-2 14:25', 'previous', 'Unused regulation', 'http://legipeche.metier.e2.rie.gouv.fr/unused-regulation-a670.html', 'Unused reg text', 'http://unused.reg'),
- ( '2021-3-3 14:25', 'latest', 'Med. sea regulation', 'http://legipeche.metier.e2.rie.gouv.fr/regulation-with-unstable-url-a689.html', 'Med reg text', 'http://med.reg'),
+ ( '2021-3-3 14:25', 'latest', 'Med. sea regulation', 'https://legipeche.metier.e2.rie.gouv.fr/regulation-with-unstable-url-a689.html', 'Med reg text', 'http://med.reg'),
( '2021-3-3 14:25', 'latest', 'Unused regulation', 'http://legipeche.metier.e2.rie.gouv.fr/unused-regulation-a670.html', 'Unused reg text', 'http://unused.reg'),
( '2021-3-3 14:25', 'latest', 'Unused regulation 2', 'http://legipeche.metier.e2.rie.gouv.fr/other-unused-regulation-a675.html', 'Unused reg text', 'http://unused2.reg');
Original file line number Diff line number Diff line change
Expand Up @@ -155,57 +155,57 @@ def legipeche_regulations() -> pd.DataFrame:
"Bretagne modified reg",
"Bretagne modified reg",
"Unused regulation 2",
"Med. sea regulation",
"Bretagne regulation",
"Unused regulation",
"Med. sea regulation",
"Some old page",
"Bretagne modified reg",
"Bretagne modified reg",
"Med. sea regulation",
"Bretagne regulation",
"Unused regulation",
"Med. sea regulation",
],
"page_url": [
"http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html",
"http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html",
"http://legipeche.metier.e2.rie.gouv.fr/other-unused-regulation-a675.html",
"http://legipeche.metier.e2.rie.gouv.fr/regulation-with-unstable-url-a689.html",
"http://legipeche.metier.e2.rie.gouv.fr/some-regulation-a666.html",
"http://legipeche.metier.e2.rie.gouv.fr/unused-regulation-a670.html",
"https://legipeche.metier.e2.rie.gouv.fr/regulation-with-unstable-url-a689.html",
"http://legipeche.metier.e2.rie.gouv.fr/deleted-regulation-a671.html",
"http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html",
"http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html",
"http://legipeche.metier.e2.rie.gouv.fr/regulation-with-unstable-url-a689.html",
"http://legipeche.metier.e2.rie.gouv.fr/some-regulation-a666.html",
"http://legipeche.metier.e2.rie.gouv.fr/unused-regulation-a670.html",
"https://legipeche.metier.e2.rie.gouv.fr/regulation-with-unstable-url-a689.html",
],
"document_title": [
"Bretagne modified reg 1",
"Bretagne modified reg 3",
"Unused reg text",
"Med reg text",
"Bretagne reg text",
"Unused reg text",
"Med reg text",
"Some old reg text",
"Bretagne modified reg 1",
"Bretagne modified reg 2",
"Med reg text",
"Bretagne reg text",
"Unused reg text",
"Med reg text",
],
"document_url": [
"http://bzh.other_1",
"http://bzh.other_3",
"http://unused2.reg",
"http://med.reg",
"http://bzh.reg",
"http://unused.reg",
"http://med.reg",
"http://some.thing",
"http://bzh.other_1",
"http://bzh.other_2",
"http://med.reg",
"http://bzh.reg",
"http://unused.reg",
"http://med.reg",
],
}
)
Expand All @@ -228,15 +228,15 @@ def legipeche_regulations_with_id(legipeche_regulations) -> pd.DataFrame:
"668",
"668",
"675",
"689",
"666",
"670",
"689",
"671",
"668",
"668",
"689",
"666",
"670",
"689",
]
)
return regulations
Expand Down

0 comments on commit 1b660f1

Please sign in to comment.