From 1b660f116225c04cf0c1a296d17a0fce00295145 Mon Sep 17 00:00:00 2001 From: Vincent Date: Wed, 6 Mar 2024 14:14:30 +0100 Subject: [PATCH] Handle http and https url patterns in regulations checkup flow --- .../src/pipeline/flows/regulations_checkup.py | 2 +- .../V666.6__Reset_test_legipeche.sql | 4 ++-- .../test_flows/test_regulations_checkup.py | 20 +++++++++---------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/datascience/src/pipeline/flows/regulations_checkup.py b/datascience/src/pipeline/flows/regulations_checkup.py index e6561e68d9..fbd4695b15 100644 --- a/datascience/src/pipeline/flows/regulations_checkup.py +++ b/datascience/src/pipeline/flows/regulations_checkup.py @@ -170,7 +170,7 @@ def add_article_id(regulations: pd.DataFrame, url_column: str) -> pd.DataFrame: """ legipeche_regex = re.compile( ( - r"^http://legipeche\.metier\." + r"^https?://legipeche\.metier\." r"e2\.rie\.gouv\.fr/" r"(?:[a-zA-Z0-9-]*)" r"-a(?P\d+)" diff --git a/datascience/tests/test_data/remote_database/V666.6__Reset_test_legipeche.sql b/datascience/tests/test_data/remote_database/V666.6__Reset_test_legipeche.sql index 894a2256b8..0dbefe21da 100644 --- a/datascience/tests/test_data/remote_database/V666.6__Reset_test_legipeche.sql +++ b/datascience/tests/test_data/remote_database/V666.6__Reset_test_legipeche.sql @@ -4,7 +4,7 @@ INSERT INTO public.legipeche ( extraction_datetime_utc, extraction_occurence, page_title, page_url, document_title, document_url ) VALUES ( '2021-3-2 14:25', 'previous', 'Some old page', 'http://legipeche.metier.e2.rie.gouv.fr/deleted-regulation-a671.html', 'Some old reg text', 'http://some.thing'), - ( '2021-3-2 14:25', 'previous', 'Med. sea regulation', 'http://legipeche.metier.e2.rie.gouv.fr/regulation-with-unstable-url-a689.html', 'Med reg text', 'http://med.reg'), + ( '2021-3-2 14:25', 'previous', 'Med. sea regulation', 'https://legipeche.metier.e2.rie.gouv.fr/regulation-with-unstable-url-a689.html', 'Med reg text', 'http://med.reg'), ( '2021-3-2 14:25', 'previous', 'Bretagne regulation', 'http://legipeche.metier.e2.rie.gouv.fr/some-regulation-a666.html', 'Bretagne reg text', 'http://bzh.reg'), ( '2021-3-2 14:25', 'previous', 'Bretagne modified reg', 'http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html', 'Bretagne modified reg 1', 'http://bzh.other_1'), ( '2021-3-2 14:25', 'previous', 'Bretagne modified reg', 'http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html', 'Bretagne modified reg 2', 'http://bzh.other_2'), @@ -12,6 +12,6 @@ INSERT INTO public.legipeche ( ( '2021-3-3 14:25', 'latest', 'Bretagne modified reg', 'http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html', 'Bretagne modified reg 3', 'http://bzh.other_3'), ( '2021-3-3 14:25', 'latest', 'Bretagne regulation', 'http://legipeche.metier.e2.rie.gouv.fr/some-regulation-a666.html', 'Bretagne reg text', 'http://bzh.reg'), ( '2021-3-2 14:25', 'previous', 'Unused regulation', 'http://legipeche.metier.e2.rie.gouv.fr/unused-regulation-a670.html', 'Unused reg text', 'http://unused.reg'), - ( '2021-3-3 14:25', 'latest', 'Med. sea regulation', 'http://legipeche.metier.e2.rie.gouv.fr/regulation-with-unstable-url-a689.html', 'Med reg text', 'http://med.reg'), + ( '2021-3-3 14:25', 'latest', 'Med. sea regulation', 'https://legipeche.metier.e2.rie.gouv.fr/regulation-with-unstable-url-a689.html', 'Med reg text', 'http://med.reg'), ( '2021-3-3 14:25', 'latest', 'Unused regulation', 'http://legipeche.metier.e2.rie.gouv.fr/unused-regulation-a670.html', 'Unused reg text', 'http://unused.reg'), ( '2021-3-3 14:25', 'latest', 'Unused regulation 2', 'http://legipeche.metier.e2.rie.gouv.fr/other-unused-regulation-a675.html', 'Unused reg text', 'http://unused2.reg'); \ No newline at end of file diff --git a/datascience/tests/test_pipeline/test_flows/test_regulations_checkup.py b/datascience/tests/test_pipeline/test_flows/test_regulations_checkup.py index cab2ebe458..4fbac3deac 100644 --- a/datascience/tests/test_pipeline/test_flows/test_regulations_checkup.py +++ b/datascience/tests/test_pipeline/test_flows/test_regulations_checkup.py @@ -155,57 +155,57 @@ def legipeche_regulations() -> pd.DataFrame: "Bretagne modified reg", "Bretagne modified reg", "Unused regulation 2", - "Med. sea regulation", "Bretagne regulation", "Unused regulation", + "Med. sea regulation", "Some old page", "Bretagne modified reg", "Bretagne modified reg", - "Med. sea regulation", "Bretagne regulation", "Unused regulation", + "Med. sea regulation", ], "page_url": [ "http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html", "http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html", "http://legipeche.metier.e2.rie.gouv.fr/other-unused-regulation-a675.html", - "http://legipeche.metier.e2.rie.gouv.fr/regulation-with-unstable-url-a689.html", "http://legipeche.metier.e2.rie.gouv.fr/some-regulation-a666.html", "http://legipeche.metier.e2.rie.gouv.fr/unused-regulation-a670.html", + "https://legipeche.metier.e2.rie.gouv.fr/regulation-with-unstable-url-a689.html", "http://legipeche.metier.e2.rie.gouv.fr/deleted-regulation-a671.html", "http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html", "http://legipeche.metier.e2.rie.gouv.fr/modified-regulation-a668.html", - "http://legipeche.metier.e2.rie.gouv.fr/regulation-with-unstable-url-a689.html", "http://legipeche.metier.e2.rie.gouv.fr/some-regulation-a666.html", "http://legipeche.metier.e2.rie.gouv.fr/unused-regulation-a670.html", + "https://legipeche.metier.e2.rie.gouv.fr/regulation-with-unstable-url-a689.html", ], "document_title": [ "Bretagne modified reg 1", "Bretagne modified reg 3", "Unused reg text", - "Med reg text", "Bretagne reg text", "Unused reg text", + "Med reg text", "Some old reg text", "Bretagne modified reg 1", "Bretagne modified reg 2", - "Med reg text", "Bretagne reg text", "Unused reg text", + "Med reg text", ], "document_url": [ "http://bzh.other_1", "http://bzh.other_3", "http://unused2.reg", - "http://med.reg", "http://bzh.reg", "http://unused.reg", + "http://med.reg", "http://some.thing", "http://bzh.other_1", "http://bzh.other_2", - "http://med.reg", "http://bzh.reg", "http://unused.reg", + "http://med.reg", ], } ) @@ -228,15 +228,15 @@ def legipeche_regulations_with_id(legipeche_regulations) -> pd.DataFrame: "668", "668", "675", - "689", "666", "670", + "689", "671", "668", "668", - "689", "666", "670", + "689", ] ) return regulations