stac-utils · bitner · Jun 13, 2022 · Jun 3, 2022 · Jun 3, 2022 · Jun 13, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,13 @@
 # Changelog
+
+## [Unreleased]
+
+### Fixed
+
+- Fixed failure of pypgstac load for large items [#121](https://github.com/stac-utils/pgstac/pull/121)
+
 ## [v0.6.4]
+
 ### Fixed
 - Fixed casts for numeric data when a property is not in the queryables table to use the type from the incoming json filter
 - Fixed issue loader grouping an unordered iterable by partition, speeding up loads of items with mixed partitions [#116](https://github.com/stac-utils/pgstac/pull/116)

diff --git a/pypgstac/pypgstac/load.py b/pypgstac/pypgstac/load.py
@@ -20,7 +20,6 @@
     Generator,
     TextIO,
 )
-import csv
 import orjson
 import psycopg
 from orjson import JSONDecodeError
@@ -512,6 +511,8 @@ def read_dehydrated(self, file: Union[Path, str] = "stdin") -> Generator:
         if isinstance(file, str):
             open_file: Any = open_std(file, "r")
             with open_file as f:
+                # Note: if 'content' is changed to be anything
+                # but the last field, the logic below will break.
                 fields = [
                     "id",
                     "geometry",
@@ -520,8 +521,21 @@ def read_dehydrated(self, file: Union[Path, str] = "stdin") -> Generator:
                     "end_datetime",
                     "content",
                 ]
-                csvreader = csv.DictReader(f, fields, delimiter="\t")
-                for item in csvreader:
+
+                for line in f:
+                    tab_split = line.split("\t")
+                    item = {}
+                    for i, field in enumerate(fields):
+                        if field == "content":
+                            # Join the remaining splits in case
+                            # there were any tabs in the JSON content.
+                            content_value = "\t".join(tab_split[i:])
+                            # Replace double-escaped quotes (\\") that can be
+                            # written on export and cause failures.
+                            content_value = content_value.replace(r'\\"', r"\"")
+                            item[field] = content_value
+                        else:
+                            item[field] = tab_split[i]
                     item["partition"] = self._partition_update(item)
                     yield item
 

diff --git a/pypgstac/tests/test_load.py b/pypgstac/tests/test_load.py
@@ -320,3 +320,22 @@ def test_s1_grd_load_and_query(loader: Loader) -> None:
     )[0]
     item = res["features"][0]
     pystac.Item.from_dict(item).validate()
+
+
+def test_load_dehydrated(loader: Loader) -> None:
+    """Test loader for items dumped directly out of item table."""
+    collections = [
+        HERE / "data-files" / "hydration" / "collections" / "chloris-biomass.json",
+    ]
+
+    for collection in collections:
+        loader.load_collections(
+            str(collection),
+            insert_mode=Methods.ignore,
+        )
+
+    dehydrated_items = HERE / "data-files" / "load" / "dehydrated.txt"
+
+    loader.load_items(
+        str(dehydrated_items), insert_mode=Methods.insert, dehydrated=True
+    )