From 9d93adbfda77bf9d14f231382645739f0b16572c Mon Sep 17 00:00:00 2001
From: tycho garen
Date: Wed, 20 Mar 2024 07:09:11 -0400
Subject: [PATCH] chore: bson/json comparison

---
 tests/bench_json.py    | 95 +++++++++++-----------------------
 tests/conftest.py      |  2 +-
 tests/fixtures/data.py | 99 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 125 insertions(+), 71 deletions(-)
 create mode 100644 tests/fixtures/data.py

diff --git a/tests/bench_json.py b/tests/bench_json.py
index ef041d0c7..6182cc05e 100644
--- a/tests/bench_json.py
+++ b/tests/bench_json.py
@@ -3,21 +3,12 @@
 # and read_json(), with the serde_json+streaming JSON code which has
 # more relaxed parsing with regards to newlines.
 import pathlib
-import os
-import json
-import random
-import string
 import logging
 
 import pytest
 import psycopg2
-import tests.tools
-
-VALUES_SET = string.ascii_uppercase + string.digits
 
 logger = logging.getLogger("json")
-
-
 CASES = [
     "c16.r256",
     "c16.r512",
     "c32.r256",
     "c32.r512",
     "glob.n16",
     "glob.n64",
     "glob.n256",
     "glob.n512",
@@ -30,69 +21,17 @@
 ]
 
 
-@pytest.fixture(scope="session")
-def generated_bench_data(
-    tmp_path_factory: pytest.TempPathFactory,
-) -> dict[str, pathlib.Path]:
-    out: dict[str, pathlib.Path] = {}
-
-    with tests.tools.timed(logger, "generating-test-data"):
-        tmpdir = tmp_path_factory.mktemp(basename="json-bench-singles", numbered=False)
-        for col, row in [(16, 256), (16, 512), (32, 256), (32, 512)]:
-            out[f"c{col}.r{row}"] = write_random_json_file(
-                path=tmpdir.joinpath(f"c{col}.r{row}.json"), column_count=col, row_count=row
-            )
-
-        globdir = tmp_path_factory.mktemp(basename="json-bench-globs", numbered=False)
-        for num in [16, 64, 256, 512]:
-            tmpdir = globdir.joinpath(f"n{num}")
-            test_path = tmpdir.joinpath("*.json")
-            out[f"glob.n{num}"] = test_path
-            os.mkdir(tmpdir)
-            logger.info(f"added glob test at '{test_path}'")
-            for i in range(num):
-                write_random_json_file(
-                    tmpdir.joinpath(f"benchdata.{i}.json"), column_count=16, row_count=512
-                )
-
-    logger.info(f"wrote {len(out)} test cases; {len(CASES)} registered")
-
-    return out
-
-
-def write_random_json_file(
-    path: pathlib.Path,
-    column_count: int,
-    row_count: int,
-) -> pathlib.Path:
-    vals = [
-        "".join(random.choices(VALUES_SET, k=4)),
-        "".join(random.choices(VALUES_SET, k=8)),
-        "".join(random.choices(VALUES_SET, k=16)),
-        "".join(random.choices(VALUES_SET, k=32)),
-        "".join(random.choices(VALUES_SET, k=64)),
-    ]
-
-    with open(path, "w") as f:
-        for idx, rc in enumerate(range(row_count)):
-            doc = {}
-            for cc in range(column_count):
-                if cc % 4 == 0:
-                    doc[f"{cc}.{idx}"] = random.randint(0, (column_count + 1) * (row_count + 1))
-                else:
-                    doc[f"{cc}.{idx}"] = random.choice(vals)
-
-            json.dump(doc, f)
-            f.write("\n")
-
-    return path
-
-
 @pytest.mark.parametrize(
     "case_name,read_fn_name",
     [
         bench
-        for pair in [[(item, "read_json"), (item, "read_ndjson")] for item in CASES]
+        for pair in [
+            [
+                (item, "read_json"),
+                (item, "read_ndjson"),
+            ]
+            for item in CASES
+        ]
         for bench in pair
     ],
 )
@@ -100,18 +39,34 @@ def write_random_json_file(
 def test_json_function(
     glaredb_connection: psycopg2.extensions.connection,
     tmp_path_factory: pytest.TempPathFactory,
-    generated_bench_data: dict[str, pathlib.Path],
+    generated_json_bench_data: dict[str, pathlib.Path],
     case_name: str,
     read_fn_name: str,
     benchmark: callable,
 ):
-    path = generated_bench_data[case_name]
+    path = generated_json_bench_data[case_name]
 
     logger.info(f"using test data at '{path}' for {case_name}")
 
     benchmark(run_query_operation, glaredb_connection, path, read_fn_name)
 
 
+@pytest.mark.parametrize("case_name", CASES)
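+# Unlike the JSON benchmark above, which parametrizes (case, reader) pairs to
+# cover both read_json() and read_ndjson(), BSON has a single reader here, so
+# this test parametrizes the case names only and passes "read_bson" directly.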
+@pytest.mark.benchmark
+def test_bson_function(
+    glaredb_connection: psycopg2.extensions.connection,
+    tmp_path_factory: pytest.TempPathFactory,
+    generated_bson_bench_data: dict[str, pathlib.Path],
+    case_name: str,
+    benchmark: callable,
+):
+    path = generated_bson_bench_data[case_name]
+
+    logger.info(f"using test data at '{path}' for {case_name}")
+
+    benchmark(run_query_operation, glaredb_connection, path, "read_bson")
+
+
 def run_query_operation(
     glaredb_connection: psycopg2.extensions.connection,
     path: pathlib.Path,
diff --git a/tests/conftest.py b/tests/conftest.py
index 179651923..811a6d3ca 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1 +1 @@
-pytest_plugins = ["tests.fixtures.glaredb"]
+pytest_plugins = ["tests.fixtures.glaredb", "tests.fixtures.data"]
diff --git a/tests/fixtures/data.py b/tests/fixtures/data.py
new file mode 100644
index 000000000..08d15006e
--- /dev/null
+++ b/tests/fixtures/data.py
@@ -0,0 +1,99 @@
+import json
+import logging
+import string
+import random
+import pathlib
+import os
+
+import pytest
+import bson
+
+import tests.tools
+
+VALUES_SET = string.ascii_uppercase + string.digits
+logger = logging.getLogger("fixture.data")
+
+
+@pytest.fixture(scope="session")
+def generated_json_bench_data(tmp_path_factory: pytest.TempPathFactory) -> dict[str, pathlib.Path]:
+    def _dump(obj, file):
+        json.dump(obj, file)
+        file.write("\n")
+
+    return generate_bench_data(tmp_path_factory, "json", _dump)
+
+
+@pytest.fixture(scope="session")
+def generated_bson_bench_data(tmp_path_factory: pytest.TempPathFactory) -> dict[str, pathlib.Path]:
+    def _dump(obj, file):
+        file.write(bson.encode(obj))
+
+    return generate_bench_data(tmp_path_factory, "bson", _dump)
+
+
+def generate_bench_data(
+    tmp_path_factory: pytest.TempPathFactory,
+    ext: str,
+    dump: callable,
+) -> dict[str, pathlib.Path]:
+    out: dict[str, pathlib.Path] = {}
+    modes = {"json": "w", "bson": "wb"}
+
+    with tests.tools.timed(logger, "generating-test-data"):
+        tmpdir = tmp_path_factory.mktemp(basename=f"{ext}-bench-singles", numbered=False)
+        for col, row in [(16, 256), (16, 512), (32, 256), (32, 512)]:
+            out[f"c{col}.r{row}"] = write_test_docs(
+                path=tmpdir.joinpath(f"c{col}.r{row}.{ext}"),
+                column_count=col,
+                row_count=row,
+                dump=dump,
+                mode=modes[ext],
+            )
+
+        globdir = tmp_path_factory.mktemp(basename=f"{ext}-bench-globs", numbered=False)
+        for num in [16, 64, 256, 512]:
+            tmpdir = globdir.joinpath(f"n{num}")
+            test_path = tmpdir.joinpath(f"*.{ext}")
+            out[f"glob.n{num}"] = test_path
+            os.mkdir(tmpdir)
+            logger.info(f"added glob test at '{test_path}'")
+            for i in range(num):
+                write_test_docs(
+                    tmpdir.joinpath(f"benchdata.{i}.{ext}"),
+                    column_count=16,
+                    row_count=512,
+                    dump=dump,
+                    mode=modes[ext],
+                )
+
+    logger.info(f"wrote {len(out)} {ext} test cases")
+
+    return out
+
+
+def write_test_docs(
+    path: pathlib.Path,
+    column_count: int,
+    row_count: int,
+    dump: callable,
+    mode: str,
+) -> pathlib.Path:
+    vals = [
+        "".join(random.choices(VALUES_SET, k=4)),
+        "".join(random.choices(VALUES_SET, k=8)),
+        "".join(random.choices(VALUES_SET, k=16)),
+        "".join(random.choices(VALUES_SET, k=32)),
+        "".join(random.choices(VALUES_SET, k=64)),
+    ]
+
+    with open(path, mode) as f:
+        for idx in range(row_count):
+            doc = {}
+            for cc in range(column_count):
+                if cc % 4 == 0:
+                    doc[f"{cc}.{idx}"] = random.randint(0, (column_count + 1) * (row_count + 1))
+                else:
+                    doc[f"{cc}.{idx}"] = random.choice(vals)
+            dump(doc, f)
+
+    return path
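
For a quick sanity check of the generated fixtures outside of pytest, a minimal sketch (assuming
pymongo's bson package, the same module the fixture imports; the path in the trailing comment is
hypothetical, since the session tmpdir location varies):

    import pathlib

    import bson  # pymongo's bson package, as imported by tests/fixtures/data.py


    def count_docs(path: pathlib.Path) -> int:
        """Count the documents in one generated fixture file."""
        if path.suffix == ".json":
            # the JSON writer emits newline-delimited documents, one per line
            with open(path) as f:
                return sum(1 for line in f if line.strip())
        # BSON files are length-prefixed documents laid end to end;
        # bson.decode_file_iter() walks them without loading the whole file
        with open(path, "rb") as f:
            return sum(1 for _ in bson.decode_file_iter(f))


    # a c16.r512 fixture should hold 512 documents of 16 fields each, e.g.:
    # count_docs(pathlib.Path("/tmp/json-bench-singles/c16.r512.json"))  # -> 512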