fix: add primary keys when upserting in Postgres (#2819)

aws · May 15, 2024 · b3f215e · b3f215e
1 parent 8ea7427
commit b3f215e
Show file tree

Hide file tree

Showing 2 changed files with 4 additions and 42 deletions.
diff --git a/awswrangler/postgresql.py b/awswrangler/postgresql.py
@@ -79,6 +79,7 @@ def _create_table(
     index: bool,
     dtype: dict[str, str] | None,
     varchar_lengths: dict[str, int] | None,
+    unique_keys: list[str] | None = None,
 ) -> None:
     if mode == "overwrite":
         if overwrite_method in ["drop", "cascade"]:
@@ -101,6 +102,8 @@ def _create_table(
         converter_func=_data_types.pyarrow2postgresql,
     )
     cols_str: str = "".join([f"{_identifier(k)} {v},\n" for k, v in postgresql_types.items()])[:-2]
+    if unique_keys:
+        cols_str += f",\nUNIQUE ({', '.join([_identifier(k) for k in unique_keys])})"
     sql = f"CREATE TABLE IF NOT EXISTS {_identifier(schema)}.{_identifier(table)} (\n{cols_str})"
     _logger.debug("Create table query:\n%s", sql)
     cursor.execute(sql)
@@ -619,6 +622,7 @@ def to_sql(
                 index=index,
                 dtype=dtype,
                 varchar_lengths=varchar_lengths,
+                unique_keys=upsert_conflict_columns or insert_conflict_columns,
             )
             if index:
                 df.reset_index(level=df.index.names, inplace=True)

diff --git a/tests/unit/test_postgresql.py b/tests/unit/test_postgresql.py
@@ -285,16 +285,6 @@ def test_dfs_are_equal_for_different_chunksizes(postgresql_table, postgresql_con
 
 
 def test_upsert(postgresql_table, postgresql_con):
-    create_table_sql = (
-        f"CREATE TABLE public.{postgresql_table} "
-        "(c0 varchar NULL PRIMARY KEY,"
-        "c1 int NULL DEFAULT 42,"
-        "c2 int NOT NULL);"
-    )
-    with postgresql_con.cursor() as cursor:
-        cursor.execute(create_table_sql)
-        postgresql_con.commit()
-
     df = pd.DataFrame({"c0": ["foo", "bar"], "c2": [1, 2]})
 
     with pytest.raises(wr.exceptions.InvalidArgumentValue):
@@ -369,17 +359,6 @@ def test_upsert(postgresql_table, postgresql_con):
 
 
 def test_upsert_multiple_conflict_columns(postgresql_table, postgresql_con):
-    create_table_sql = (
-        f"CREATE TABLE public.{postgresql_table} "
-        "(c0 varchar NULL PRIMARY KEY,"
-        "c1 int NOT NULL,"
-        "c2 int NOT NULL,"
-        "UNIQUE (c1, c2));"
-    )
-    with postgresql_con.cursor() as cursor:
-        cursor.execute(create_table_sql)
-        postgresql_con.commit()
-
     df = pd.DataFrame({"c0": ["foo", "bar"], "c1": [1, 2], "c2": [3, 4]})
     upsert_conflict_columns = ["c1", "c2"]
 
@@ -437,16 +416,6 @@ def test_upsert_multiple_conflict_columns(postgresql_table, postgresql_con):
 
 
 def test_insert_ignore_duplicate_columns(postgresql_table, postgresql_con):
-    create_table_sql = (
-        f"CREATE TABLE public.{postgresql_table} "
-        "(c0 varchar NULL PRIMARY KEY,"
-        "c1 int NULL DEFAULT 42,"
-        "c2 int NOT NULL);"
-    )
-    with postgresql_con.cursor() as cursor:
-        cursor.execute(create_table_sql)
-        postgresql_con.commit()
-
     df = pd.DataFrame({"c0": ["foo", "bar"], "c2": [1, 2]})
 
     wr.postgresql.to_sql(
@@ -501,17 +470,6 @@ def test_insert_ignore_duplicate_columns(postgresql_table, postgresql_con):
 
 
 def test_insert_ignore_duplicate_multiple_columns(postgresql_table, postgresql_con):
-    create_table_sql = (
-        f"CREATE TABLE public.{postgresql_table} "
-        "(c0 varchar NULL PRIMARY KEY,"
-        "c1 int NOT NULL,"
-        "c2 int NOT NULL,"
-        "UNIQUE (c1, c2));"
-    )
-    with postgresql_con.cursor() as cursor:
-        cursor.execute(create_table_sql)
-        postgresql_con.commit()
-
     df = pd.DataFrame({"c0": ["foo", "bar"], "c1": [1, 2], "c2": [3, 4]})
     insert_conflict_columns = ["c1", "c2"]