From 9d93adbfda77bf9d14f231382645739f0b16572c Mon Sep 17 00:00:00 2001
From: tycho garen
Date: Wed, 20 Mar 2024 07:09:11 -0400
Subject: [PATCH] chore: bson/json comparison

---
 tests/bench_json.py    | 95 +++++++++++-----------------------
 tests/conftest.py      |  2 +-
 tests/fixtures/data.py | 99 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 125 insertions(+), 71 deletions(-)
 create mode 100644 tests/fixtures/data.py

diff --git a/tests/bench_json.py b/tests/bench_json.py
index ef041d0c7..6182cc05e 100644
--- a/tests/bench_json.py
+++ b/tests/bench_json.py
@@ -3,21 +3,12 @@
 # and read_json(), with the serde_json+streaming JSON code which has
 # more relaxed parsing with regards to newlines.
 import pathlib
-import os
-import json
-import random
-import string
 import logging
 
 import pytest
 import psycopg2
-import tests.tools
-
-VALUES_SET = string.ascii_uppercase + string.digits
 
 logger = logging.getLogger("json")
-
-
 CASES = [
     "c16.r256",
     "c16.r512",
     "c32.r256",
     "c32.r512",
     "glob.n16",
     "glob.n64",
     "glob.n256",
     "glob.n512",
@@ -30,69 +21,17 @@
 ]
 
 
-@pytest.fixture(scope="session")
-def generated_bench_data(
-    tmp_path_factory: pytest.TempPathFactory,
-) -> dict[str, pathlib.Path]:
-    out: dict[str, pathlib.Path] = {}
-
-    with tests.tools.timed(logger, "generating-test-data"):
-        tmpdir = tmp_path_factory.mktemp(basename="json-bench-singles", numbered=False)
-        for col, row in [(16, 256), (16, 512), (32, 256), (32, 512)]:
-            out[f"c{col}.r{row}"] = write_random_json_file(
-                path=tmpdir.joinpath(f"c{col}.r{row}.json"), column_count=col, row_count=row
-            )
-
-        globdir = tmp_path_factory.mktemp(basename="json-bench-globs", numbered=False)
-        for num in [16, 64, 256, 512]:
-            tmpdir = globdir.joinpath(f"n{num}")
-            test_path = tmpdir.joinpath("*.json")
-            out[f"glob.n{num}"] = test_path
-            os.mkdir(tmpdir)
-            logger.info(f"added glob test at '{test_path}'")
-            for i in range(num):
-                write_random_json_file(
-                    tmpdir.joinpath(f"benchdata.{i}.json"), column_count=16, row_count=512
-                )
-
-    logger.info(f"wrote {len(out)} test cases; {len(CASES)} registered")
-
-    return out
-
-
-def write_random_json_file(
-    path: pathlib.Path,
-    column_count: int,
-    row_count: int,
-) -> pathlib.Path:
-    vals = [
-        "".join(random.choices(VALUES_SET, k=4)),
-        "".join(random.choices(VALUES_SET, k=8)),
-        "".join(random.choices(VALUES_SET, k=16)),
-        "".join(random.choices(VALUES_SET, k=32)),
-        "".join(random.choices(VALUES_SET, k=64)),
-    ]
-
-    with open(path, "w") as f:
-        for idx, rc in enumerate(range(row_count)):
-            doc = {}
-            for cc in range(column_count):
-                if cc % 4 == 0:
-                    doc[f"{cc}.{idx}"] = random.randint(0, (column_count + 1) * (row_count + 1))
-                else:
-                    doc[f"{cc}.{idx}"] = random.choice(vals)
-
-            json.dump(doc, f)
-            f.write("\n")
-
-    return path
-
-
 @pytest.mark.parametrize(
     "case_name,read_fn_name",
     [
         bench
-        for pair in [[(item, "read_json"), (item, "read_ndjson")] for item in CASES]
+        for pair in [
+            [
+                (item, "read_json"),
+                (item, "read_ndjson"),
+            ]
+            for item in CASES
+        ]
         for bench in pair
     ],
 )
@@ -100,18 +39,34 @@ def write_random_json_file(
 def test_json_function(
     glaredb_connection: psycopg2.extensions.connection,
     tmp_path_factory: pytest.TempPathFactory,
-    generated_bench_data: dict[str, pathlib.Path],
+    generated_json_bench_data: dict[str, pathlib.Path],
     case_name: str,
     read_fn_name: str,
     benchmark: callable,
 ):
-    path = generated_bench_data[case_name]
+    path = generated_json_bench_data[case_name]
 
     logger.info(f"using test data at '{path}' for {case_name}")
 
     benchmark(run_query_operation, glaredb_connection, path, read_fn_name)
 
 
+@pytest.mark.parametrize("case_name", CASES)
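+# Unlike the JSON benchmark above, which parametrizes (case, reader) pairs to
+# cover both read_json() and read_ndjson(), BSON has a single reader here, so
+# this test parametrizes the case names only and passes "read_bson" directly.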
+@pytest.mark.benchmark
+def test_bson_function(
+    glaredb_connection: psycopg2.extensions.connection,
+    tmp_path_factory: pytest.TempPathFactory,
+    generated_bson_bench_data: dict[str, pathlib.Path],
+    case_name: str,
+    benchmark: callable,
+):
+    path = generated_bson_bench_data[case_name]
+
+    logger.info(f"using test data at '{path}' for {case_name}")
+
+    benchmark(run_query_operation, glaredb_connection, path, "read_bson")
+
+
 def run_query_operation(
     glaredb_connection: psycopg2.extensions.connection,
     path: pathlib.Path,
diff --git a/tests/conftest.py b/tests/conftest.py
index 179651923..811a6d3ca 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1 +1 @@
-pytest_plugins = ["tests.fixtures.glaredb"]
+pytest_plugins = ["tests.fixtures.glaredb", "tests.fixtures.data"]
diff --git a/tests/fixtures/data.py b/tests/fixtures/data.py
new file mode 100644
index 000000000..08d15006e
--- /dev/null
+++ b/tests/fixtures/data.py
@@ -0,0 +1,99 @@
+import json
+import logging
+import string
+import random
+import pathlib
+import os
+
+import pytest
+import bson
+
+import tests.tools
+
+VALUES_SET = string.ascii_uppercase + string.digits
+logger = logging.getLogger("fixture.data")
+
+
+@pytest.fixture(scope="session")
+def generated_json_bench_data(tmp_path_factory: pytest.TempPathFactory) -> dict[str, pathlib.Path]:
+    def _dump(obj, file):
+        json.dump(obj, file)
+        file.write("\n")
+
+    return generate_bench_data(tmp_path_factory, "json", _dump)
+
+
+@pytest.fixture(scope="session")
+def generated_bson_bench_data(tmp_path_factory: pytest.TempPathFactory) -> dict[str, pathlib.Path]:
+    def _dump(obj, file):
+        file.write(bson.encode(obj))
+
+    return generate_bench_data(tmp_path_factory, "bson", _dump)
+
+
+def generate_bench_data(
+    tmp_path_factory: pytest.TempPathFactory,
+    ext: str,
+    dump: callable,
+) -> dict[str, pathlib.Path]:
+    out: dict[str, pathlib.Path] = {}
+    modes = {"json": "w", "bson": "wb"}
+
+    with tests.tools.timed(logger, "generating-test-data"):
+        tmpdir = tmp_path_factory.mktemp(basename=f"{ext}-bench-singles", numbered=False)
+        for col, row in [(16, 256), (16, 512), (32, 256), (32, 512)]:
+            out[f"c{col}.r{row}"] = write_test_docs(
+                path=tmpdir.joinpath(f"c{col}.r{row}.{ext}"),
+                column_count=col,
+                row_count=row,
+                dump=dump,
+                mode=modes[ext],
+            )
+
+        globdir = tmp_path_factory.mktemp(basename=f"{ext}-bench-globs", numbered=False)
+        for num in [16, 64, 256, 512]:
+            tmpdir = globdir.joinpath(f"n{num}")
+            test_path = tmpdir.joinpath(f"*.{ext}")
+            out[f"glob.n{num}"] = test_path
+            os.mkdir(tmpdir)
+            logger.info(f"added glob test at '{test_path}'")
+            for i in range(num):
+                write_test_docs(
+                    tmpdir.joinpath(f"benchdata.{i}.{ext}"),
+                    column_count=16,
+                    row_count=512,
+                    dump=dump,
+                    mode=modes[ext],
+                )
+
+    logger.info(f"wrote {len(out)} {ext} test cases")
+
+    return out
+
+
+def write_test_docs(
+    path: pathlib.Path,
+    column_count: int,
+    row_count: int,
+    dump: callable,
+    mode: str,
+) -> pathlib.Path:
+    vals = [
+        "".join(random.choices(VALUES_SET, k=4)),
+        "".join(random.choices(VALUES_SET, k=8)),
+        "".join(random.choices(VALUES_SET, k=16)),
+        "".join(random.choices(VALUES_SET, k=32)),
+        "".join(random.choices(VALUES_SET, k=64)),
+    ]
+
+    with open(path, mode) as f:
+        for idx in range(row_count):
+            doc = {}
+            for cc in range(column_count):
+                if cc % 4 == 0:
+                    doc[f"{cc}.{idx}"] = random.randint(0, (column_count + 1) * (row_count + 1))
+                else:
+                    doc[f"{cc}.{idx}"] = random.choice(vals)
+            dump(doc, f)
+
+    return path
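
For a quick sanity check of the generated fixtures outside of pytest, a minimal sketch (assuming
pymongo's bson package, the same module the fixture imports; the path in the trailing comment is
hypothetical, since the session tmpdir location varies):

    import pathlib

    import bson  # pymongo's bson package, as imported by tests/fixtures/data.py


    def count_docs(path: pathlib.Path) -> int:
        """Count the documents in one generated fixture file."""
        if path.suffix == ".json":
            # the JSON writer emits newline-delimited documents, one per line
            with open(path) as f:
                return sum(1 for line in f if line.strip())
        # BSON files are length-prefixed documents laid end to end;
        # bson.decode_file_iter() walks them without loading the whole file
        with open(path, "rb") as f:
            return sum(1 for _ in bson.decode_file_iter(f))


    # a c16.r512 fixture should hold 512 documents of 16 fields each, e.g.:
    # count_docs(pathlib.Path("/tmp/json-bench-singles/c16.r512.json"))  # -> 512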