
feat: manage large resources exceptions differently #148

Merged Sep 23, 2024 · 41 commits, all by bolinocroustibat

Commits:
f1e8f24  feat: add create resources_exceptions table migration (Aug 27, 2024)
87c4780  docs: update changelog (Aug 27, 2024)
2b744f2  feat: create migrations (Aug 27, 2024)
b022bea  feat: add create resource exception endpoint (Aug 27, 2024)
d9c4191  feat: use resources_expections table instead of LARGE_RESOURCES_EXCEP… (Aug 27, 2024)
0190b51  feat: update migration to create resources_exceptions table (Aug 27, 2024)
4508648  Merge branch 'main' into manage-resource-exceptions (Sep 2, 2024)
526a7bb  Merge branch 'main' into manage-resource-exceptions (Sep 2, 2024)
27632c7  fix: fix types (Sep 2, 2024)
18edcd1  feat: resources_exceptions.table_indexes instead of resources_excepti… (Sep 2, 2024)
5088a51  fix: fix method to check if resource is an exception, and clean Resou… (Sep 2, 2024)
39b7c28  fix: fix types (Sep 2, 2024)
fa3a963  fix: fix migration when dropping tables with foreign keys (Sep 3, 2024)
aa7240f  fix: remove inserting data in resources_exceptions table (Sep 3, 2024)
9f1950c  fix: fix project metadata loading (#157) (Sep 3, 2024)
d4fc584  fix: fix method to check if resource is an exception (Sep 3, 2024)
f1058ce  Merge branch 'main' into manage-resource-exceptions (Sep 3, 2024)
bb17714  tests: fix tests (Sep 3, 2024)
a9fd037  fix: fix loading of table_indexes column (Sep 3, 2024)
f1461da  tests: add setup_resources_exceptions in conftest and fix test_except… (Sep 3, 2024)
17c2541  tests: remove useless args (Sep 3, 2024)
5178358  fix: fix index creation logic (Sep 3, 2024)
1f4e210  docs: fix doctsring (Sep 3, 2024)
d0005d0  docs: fix docstrings (Sep 4, 2024)
37d64c9  Merge branch 'main' into manage-resource-exceptions (Sep 5, 2024)
f9c79d1  fix: fix CRUD method to insert a resource exception with dict of inde… (Sep 6, 2024)
0f06634  fix: fix create table with unique index (Sep 6, 2024)
c3a81df  feat: add list of allowed indexes (Sep 6, 2024)
61a3af5  tests: add tests (Sep 6, 2024)
ccf77dc  fix: fix index creation logic, use slugify (Sep 6, 2024)
d0cb339  fix: only one type of index type for now (Sep 6, 2024)
cae924b  tests: finish testing index creation (Sep 6, 2024)
75261f2  feat: add routes to get and delete resources exceptions (Sep 6, 2024)
dcefa29  fix: fix SQL query bug due to typo (Sep 9, 2024)
938bb78  fix: revert cli.py which was only used for testing (Sep 9, 2024)
a15fcef  Merge branch 'main' into manage-resource-exceptions (Sep 10, 2024)
d6a5a2a  Merge branch 'main' into manage-resource-exceptions (Sep 10, 2024)
0281b8e  tests: fix wrong test file location (Sep 10, 2024)
56b1aba  Merge branch 'main' into manage-resource-exceptions (Sep 11, 2024)
baabec5  Merge branch 'main' into manage-resource-exceptions (Sep 12, 2024)
c933b16  docs: better naming in SQL query (Sep 23, 2024)

Files changed:
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -62,6 +62,7 @@
- Refactor routes URLs to be more RESTful and separate legacy routes code from new routes code [#132](https://github.com/datagouv/hydra/pull/132)
- Display app version and environment in health check endpoint [#164](https://github.com/datagouv/hydra/pull/164)
- Use ENVIRONMENT from config file instead of env var [#165](https://github.com/datagouv/hydra/pull/165)
- Manage large resources exceptions differently [#148](https://github.com/datagouv/hydra/pull/148)

## 1.0.1 (2023-01-04)

405 changes: 238 additions & 167 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -30,6 +30,7 @@ minio = "7.2.7"
pyarrow = "16.1.0"
python-dateutil = "^2.8.2"
python-magic = "^0.4.25"
python-slugify = "^8.0.4"
progressist = "^0.1.0"
redis = "^4.1.4"
rq = "^1.11.1"
21 changes: 21 additions & 0 deletions tests/conftest.py
@@ -1,5 +1,6 @@
import asyncio
import hashlib
import logging
import os
import uuid
from datetime import datetime
@@ -17,17 +18,22 @@
from udata_hydra.app import app_factory
from udata_hydra.db.check import Check
from udata_hydra.db.resource import Resource
from udata_hydra.db.resource_exception import ResourceException
from udata_hydra.logger import stop_sentry

DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://postgres:postgres@localhost:5433/postgres")
RESOURCE_ID = "c4e3a9fb-4415-488e-ba57-d05269b27adf"
RESOURCE_EXCEPTION_ID = "d4e3a9fb-4415-488e-ba57-d05269b27adf"
RESOURCE_EXCEPTION_TABLE_INDEXES = {"Nom": "index", "N° de certificat": "index"}
RESOURCE_URL = "https://example.com/resource-1"
DATASET_ID = "601ddcfc85a59c3a45c2435a"
NOT_EXISTING_RESOURCE_ID = "5d0b2b91-b21b-4120-83ef-83f818ba2451"
pytestmark = pytest.mark.asyncio

nest_asyncio.apply()

log = logging.getLogger("udata-hydra")


def dummy(return_value=None):
"""
@@ -135,6 +141,21 @@ def setup_catalog(catalog_content, rmock)
run("load_catalog", url=catalog)


@pytest.fixture
async def setup_catalog_with_resource_exception(setup_catalog):
"""Setup a catalog with a resource that is too large to be processed
Columns for the resource RESOURCE_EXCEPTION_ID:
['__id', 'Nom', 'Prenom', 'Societe', 'Adresse', 'CP', 'Ville', 'Tel1', 'Tel2', 'email', 'Organisme', 'Org Cofrac', 'Type de certificat', 'N° de certificat', 'Date début validité', 'Date fin validité']
"""
await Resource.insert(
dataset_id=DATASET_ID, resource_id=RESOURCE_EXCEPTION_ID, url="http://example.com/"
)
await ResourceException.insert(
resource_id=RESOURCE_EXCEPTION_ID,
table_indexes=RESOURCE_EXCEPTION_TABLE_INDEXES,
)


@pytest.fixture
def produce_mock(mocker):
mocker.patch("udata_hydra.crawl.process_check_data.send", dummy())
51 changes: 5 additions & 46 deletions tests/test_analysis/test_analysis_csv.py
@@ -8,7 +8,6 @@
from yarl import URL

from tests.conftest import RESOURCE_ID, RESOURCE_URL
from udata_hydra import config
from udata_hydra.analysis.csv import analyse_csv, csv_to_db
from udata_hydra.db.resource import Resource

@@ -85,46 +84,6 @@ async def test_analyse_csv_big_file(setup_catalog, rmock, db, fake_check, produce_mock):
assert profile["total_lines"] == expected_count


async def test_exception_analysis(setup_catalog, rmock, db, fake_check, produce_mock):
"""
Tests that exception resources (files that are too large to be normally processed) are indeed processed.
"""
save_config = config.MAX_FILESIZE_ALLOWED
config.override(MAX_FILESIZE_ALLOWED={"csv": 5000})
await db.execute(
f"UPDATE catalog SET resource_id = '{config.LARGE_RESOURCES_EXCEPTIONS[0]}' WHERE id=1"
)
check = await fake_check(resource_id=config.LARGE_RESOURCES_EXCEPTIONS[0])
filename, expected_count = ("20190618-annuaire-diagnostiqueurs.csv", 45522)
url = check["url"]
table_name = hashlib.md5(url.encode("utf-8")).hexdigest()
with open(f"tests/data/{filename}", "rb") as f:
data = f.read()
rmock.get(url, status=200, body=data)

# Check resource status before analysis
resource = await Resource.get(config.LARGE_RESOURCES_EXCEPTIONS[0])
assert resource["status"] is None

# Analyse the CSV
await analyse_csv(check_id=check["id"])

# Check resource status after analysis
resource = await Resource.get(config.LARGE_RESOURCES_EXCEPTIONS[0])
assert resource["status"] is None

count = await db.fetchrow(f'SELECT count(*) AS count FROM "{table_name}"')
assert count["count"] == expected_count
profile = await db.fetchrow(
"SELECT csv_detective FROM tables_index WHERE resource_id = $1", check["resource_id"]
)
profile = json.loads(profile["csv_detective"])
for attr in ("header", "columns", "formats", "profile"):
assert profile[attr]
assert profile["total_lines"] == expected_count
config.override(MAX_FILESIZE_ALLOWED=save_config)


@pytest.mark.parametrize(
"line_expected",
(
@@ -155,7 +114,7 @@ async def test_csv_to_db_simple_type_casting(db, line_expected, clean_db):
"header": list(columns.keys()),
"columns": columns,
}
await csv_to_db(fp.name, inspection, "test_table")
await csv_to_db(file_path=fp.name, inspection=inspection, table_name="test_table")
res = list(await db.fetch("SELECT * FROM test_table"))
assert len(res) == 1
cols = ["__id", "int", "float", "string", "bool"]
@@ -200,7 +159,7 @@ async def test_csv_to_db_complex_type_casting(db, line_expected, clean_db):
"columns": columns,
}
# Insert the data
await csv_to_db(fp.name, inspection, "test_table")
await csv_to_db(file_path=fp.name, inspection=inspection, table_name="test_table")
res = list(await db.fetch("SELECT * FROM test_table"))
assert len(res) == 1
cols = ["__id", "json", "date", "datetime"]
@@ -227,7 +186,7 @@ async def test_basic_sql_injection(db, clean_db):
"columns": columns,
}
# Insert the data
await csv_to_db(fp.name, inspection, "test_table")
await csv_to_db(file_path=fp.name, inspection=inspection, table_name="test_table")
res = await db.fetchrow("SELECT * FROM test_table")
assert res[injection] == "test"

@@ -249,7 +208,7 @@ async def test_percentage_column(db, clean_db):
"columns": columns,
}
# Insert the data
await csv_to_db(fp.name, inspection, "test_table")
await csv_to_db(file_path=fp.name, inspection=inspection, table_name="test_table")
res = await db.fetchrow("SELECT * FROM test_table")
assert res["% mon pourcent"] == "test"

@@ -271,7 +230,7 @@ async def test_reserved_column_name(db, clean_db):
"columns": columns,
}
# Insert the data
await csv_to_db(fp.name, inspection, "test_table")
await csv_to_db(file_path=fp.name, inspection=inspection, table_name="test_table")
res = await db.fetchrow("SELECT * FROM test_table")
assert res["xmin__hydra_renamed"] == "test"

72 changes: 72 additions & 0 deletions tests/test_analysis/test_analysis_csv_exceptions.py
@@ -0,0 +1,72 @@
import hashlib
import json
import logging

import pytest
from asyncpg import Record

from tests.conftest import RESOURCE_EXCEPTION_ID, RESOURCE_EXCEPTION_TABLE_INDEXES
from udata_hydra import config
from udata_hydra.analysis.csv import analyse_csv
from udata_hydra.db.resource import Resource
from udata_hydra.db.resource_exception import ResourceException
from udata_hydra.utils.db import get_columns_with_indexes

pytestmark = pytest.mark.asyncio


log = logging.getLogger("udata-hydra")


async def test_exception_analysis(
setup_catalog_with_resource_exception, rmock, db, fake_check, produce_mock
):
"""
Tests that exception resources (files that are too large to be normally processed) are indeed processed.
"""
# Change config to accept large files
save_config = config.MAX_FILESIZE_ALLOWED
config.override(MAX_FILESIZE_ALLOWED={"csv": 5000})
Review comment from a contributor, on the config.override call above:

Would it make sense to have config be a context manager, to make sure that its value gets restored to the original even when the test fails (and thus the end of its code doesn't get executed)? Maybe make it a fixture?

Reply from the author (bolinocroustibat):

Yes, that would indeed be easier, good idea. I'll keep that for another PR.
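
A minimal sketch of the fixture the reviewer suggests, for illustration only (the fixture name is hypothetical; it assumes config.override and MAX_FILESIZE_ALLOWED behave as used in this test):

import pytest

from udata_hydra import config


@pytest.fixture
def allow_big_csv():
    # Hypothetical fixture: raise the CSV size limit for the test,
    # then restore the saved value even if the test body fails
    # (pytest runs the code after `yield` as teardown regardless).
    saved = config.MAX_FILESIZE_ALLOWED
    config.override(MAX_FILESIZE_ALLOWED={"csv": 5000})
    yield
    config.override(MAX_FILESIZE_ALLOWED=saved)

A test would then simply request allow_big_csv instead of saving and restoring the config inline.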


# Create a previous fake check for the resource
check = await fake_check(resource_id=RESOURCE_EXCEPTION_ID)
filename, expected_count = ("20190618-annuaire-diagnostiqueurs.csv", 45522)
url = check["url"]
table_name = hashlib.md5(url.encode("utf-8")).hexdigest()
with open(f"tests/data/{filename}", "rb") as f:
data = f.read()
rmock.get(url, status=200, body=data)

# Check resource status before analysis
resource = await Resource.get(RESOURCE_EXCEPTION_ID)
assert resource["status"] is None

# Analyse the CSV
await analyse_csv(check_id=check["id"])

# Check resource status after analysis
resource = await Resource.get(RESOURCE_EXCEPTION_ID)
assert resource["status"] is None

# Check the table has been created in CSV DB, with the expected number of rows, and get the columns
row: Record = await db.fetchrow(f'SELECT *, count(*) over () AS count FROM "{table_name}"')
assert row["count"] == expected_count

# Check if indexes have been created for the table
expected_columns_with_indexes = list(RESOURCE_EXCEPTION_TABLE_INDEXES.keys())
expected_columns_with_indexes.append("__id")
indexes: list[Record] | None = await get_columns_with_indexes(table_name)
assert indexes
for idx in indexes:
assert idx["table_name"] == table_name
assert idx["column_name"] in expected_columns_with_indexes

# Check the profile has been saved in the tables_index
profile = await db.fetchrow(
"SELECT csv_detective FROM tables_index WHERE resource_id = $1", check["resource_id"]
)
profile = json.loads(profile["csv_detective"])
for attr in ("header", "columns", "formats", "profile"):
assert profile[attr]
assert profile["total_lines"] == expected_count
config.override(MAX_FILESIZE_ALLOWED=save_config)
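
The helper get_columns_with_indexes imported above from udata_hydra.utils.db is not part of this diff. A plausible sketch of such a helper, assuming the context.pool("csv") accessor used elsewhere in this PR and the standard PostgreSQL catalog tables (the actual implementation may differ):

from asyncpg import Record

from udata_hydra import context


async def get_columns_with_indexes(table_name: str) -> list[Record] | None:
    """Return one row per indexed column of the given table, with
    table_name, index_name and column_name keys, or None if there are none."""
    q = """
        SELECT t.relname AS table_name,
               i.relname AS index_name,
               a.attname AS column_name
        FROM pg_class t
        JOIN pg_index ix ON t.oid = ix.indrelid
        JOIN pg_class i ON i.oid = ix.indexrelid
        JOIN pg_attribute a ON a.attrelid = t.oid AND a.attnum = ANY(ix.indkey)
        WHERE t.relkind = 'r' AND t.relname = $1
    """
    db = await context.pool("csv")
    rows = await db.fetch(q, table_name)
    return rows or None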
3 changes: 1 addition & 2 deletions tests/test_parquet_export.py
@@ -1,5 +1,4 @@
from io import BytesIO
from typing import Optional

import pyarrow.parquet as pq
import pytest
@@ -27,7 +26,7 @@ async def test_parquet_conversion(
):
filename, expected_count = file_and_count
file_path = f"tests/data/{filename}"
inspection: Optional[dict] = await perform_csv_inspection(file_path)
inspection: dict | None = await perform_csv_inspection(file_path)
assert inspection
columns = inspection["columns"]
columns = {
53 changes: 45 additions & 8 deletions udata_hydra/analysis/csv.py
@@ -12,6 +12,7 @@
from csv_detective.detection import engine_to_file
from csv_detective.explore_csv import routine as csv_detective_routine
from progressist import ProgressBar
from slugify import slugify
from sqlalchemy import (
JSON,
BigInteger,
@@ -26,7 +27,7 @@
Table,
)
from sqlalchemy.dialects.postgresql import asyncpg
from sqlalchemy.schema import CreateTable
from sqlalchemy.schema import CreateIndex, CreateTable, Index
from str2bool import str2bool
from str2float import str2float

@@ -36,6 +37,7 @@
from udata_hydra.db import compute_insert_query
from udata_hydra.db.check import Check
from udata_hydra.db.resource import Resource
from udata_hydra.db.resource_exception import ResourceException
from udata_hydra.utils import Reader, Timer, download_resource, queue, send
from udata_hydra.utils.minio import MinIOClient
from udata_hydra.utils.parquet import save_as_parquet
@@ -121,12 +123,14 @@ async def analyse_csv(
# Update resource status to ANALYSING_CSV
await Resource.update(resource_id, {"status": "ANALYSING_CSV"})

exceptions = config.LARGE_RESOURCES_EXCEPTIONS
# Check if the resource is in the exceptions table
# If it is, get the table_indexes to use them later
exception: Record | None = await ResourceException.get_by_resource_id(resource_id)
table_indexes: dict | None = json.loads(exception["table_indexes"]) if exception else None

timer = Timer("analyse-csv")
assert any(_ is not None for _ in (check_id, url))
url: str = check.get("url") or url
exception_file = str(check.get("resource_id", "")) in exceptions

headers = json.loads(check.get("headers") or "{}")
tmp_file = (
@@ -135,7 +139,7 @@
else await download_resource(
url=url,
headers=headers,
max_size_allowed=None if exception_file else int(config.MAX_FILESIZE_ALLOWED["csv"]),
max_size_allowed=None if exception else int(config.MAX_FILESIZE_ALLOWED["csv"]),
)
)
table_name = hashlib.md5(url.encode("utf-8")).hexdigest()
@@ -151,6 +155,7 @@
file_path=tmp_file.name,
inspection=csv_inspection,
table_name=table_name,
table_indexes=table_indexes,
resource_id=resource_id,
debug_insert=debug_insert,
)
@@ -207,17 +212,45 @@ def smart_cast(_type: str, value, failsafe: bool = False) -> Any:
return None


def compute_create_table_query(table_name: str, columns: list) -> str:
def compute_create_table_query(
table_name: str, columns: dict, indexes: dict[str, str] | None = None
) -> str:
"""Use sqlalchemy to build a CREATE TABLE statement that should not be vulnerable to injections"""

metadata = MetaData()
table = Table(table_name, metadata, Column("__id", Integer, primary_key=True))

for col_name, col_type in columns.items():
table.append_column(Column(col_name, PYTHON_TYPE_TO_PG.get(col_type, String)))
compiled = CreateTable(table).compile(dialect=asyncpg.dialect())

if indexes:
for col_name, index_type in indexes.items():
if index_type not in config.SQL_INDEXES_TYPES_SUPPORTED:
log.error(
f'Index type "{index_type}" is unknown or not supported yet! Index for column {col_name} was not created.'
)
continue

else:
if index_type == "index":
index_name = f"{table_name}_{slugify(col_name)}_idx"
table.append_constraint(Index(index_name, col_name))
# TODO: other index types. Not easy with sqlalchemy, maybe use raw sql?

compiled_query = CreateTable(table).compile(dialect=asyncpg.dialect())
query: str = compiled_query.string

# Add the index creation queries to the main query
for index in table.indexes:
log.debug(f'Creating index "{index.name}" on table "{table_name}"')
query_idx = CreateIndex(index).compile(dialect=asyncpg.dialect())
query: str = query + ";" + query_idx.string

# compiled query will want to write "%% mon pourcent" VARCHAR but will fail when querying "% mon pourcent"
# also, "% mon pourcent" works well in pg as a column
# TODO: dirty hack, maybe find an alternative
return compiled.string.replace("%%", "%")
query = query.replace("%%", "%")
return query


def generate_records(file_path: str, inspection: dict, columns: dict) -> Iterator[list]:
Expand Down Expand Up @@ -269,6 +302,7 @@ async def csv_to_db(
file_path: str,
inspection: dict,
table_name: str,
table_indexes: dict[str, str] | None = None,
resource_id: str | None = None,
debug_insert: bool = False,
) -> None:
@@ -302,8 +336,11 @@ async def csv_to_db(
q = f'DROP TABLE IF EXISTS "{table_name}"'
db = await context.pool("csv")
await db.execute(q)
q = compute_create_table_query(table_name, columns)

# Create table
q = compute_create_table_query(table_name=table_name, columns=columns, indexes=table_indexes)
await db.execute(q)

# this use postgresql COPY from an iterator, it's fast but might be difficult to debug
if not debug_insert:
# NB: also see copy_to_table for a file source
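
For illustration, here is a standalone sketch of the CreateTable/CreateIndex compilation technique that compute_create_table_query above relies on (table and column names are made up; assumes SQLAlchemy with its built-in asyncpg dialect):

from sqlalchemy import Column, Integer, MetaData, String, Table
from sqlalchemy.dialects.postgresql import asyncpg
from sqlalchemy.schema import CreateIndex, CreateTable, Index

metadata = MetaData()
# "__id" mirrors the surrogate primary key the function always adds
table = Table("demo_table", metadata, Column("__id", Integer, primary_key=True))
table.append_column(Column("Nom", String))
# A plain B-tree index, the only index type the PR supports for now
table.append_constraint(Index("demo_table_nom_idx", "Nom"))

ddl = CreateTable(table).compile(dialect=asyncpg.dialect()).string
for index in table.indexes:
    ddl += ";" + CreateIndex(index).compile(dialect=asyncpg.dialect()).string
print(ddl)  # CREATE TABLE "demo_table" (...); CREATE INDEX demo_table_nom_idx ...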