Skip to content

Commit

Permalink
Feature flag empty index
Browse files Browse the repository at this point in the history
  • Loading branch information
Vasil Pashov committed Apr 5, 2024
1 parent e6c4f28 commit 3f5f783
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 7 deletions.
21 changes: 14 additions & 7 deletions python/arcticdb/version_store/_normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ def check_is_utc_if_newer_pandas(*args, **kwargs):

# Two-field named tuple: `item` (an NPDDataFrame) paired with `metadata` (a NormalizationMetadata).
# NOTE(review): presumably the value handed back by the normalizers in this module - confirm against callers.
NormalizedInput = NamedTuple("NormalizedInput", [("item", NPDDataFrame), ("metadata", NormalizationMetadata)])

def _allow_empty_index():
    """Return whether the empty-index feature flag is switched on.

    The flag is read from the ``ARCTICDB_ALLOW_EMPTY_INDEX`` environment variable on every call, so changes
    made to the environment at runtime (e.g. by test setup/teardown) take effect immediately.

    Returns:
        bool: ``True`` when the variable is set to ``1`` or ``true`` (case-insensitive, surrounding
        whitespace ignored); ``False`` when it is unset or holds any other value.
    """
    # Strip before comparing: a value such as "1 " (easy to produce with `set VAR=1 && cmd` on Windows or
    # from a hand-edited .env file) would otherwise silently leave the feature disabled.
    flag = os.getenv("ARCTICDB_ALLOW_EMPTY_INDEX", "0")
    return flag.strip().lower() in ("1", "true")

# To simplify unit testing of the serialization logic, this maps the _FrameData object exposed from C++.
class FrameData(
Expand Down Expand Up @@ -370,9 +372,9 @@ def _denormalize_single_index(item, norm_meta):
name = norm_meta.index.name if norm_meta.index.name else None
return RangeIndex(start=norm_meta.index.start, stop=stop, step=norm_meta.index.step, name=name)
else:
return Index([])
return Index([]) if _allow_empty_index() else None
else:
return Index([])
return Index([]) if _allow_empty_index() else RangeIndex(start=0, stop=0, step=1)
# this means that the index is not a datetime index and it's been represented as a regular field in the stream
item.index_columns.append(item.names.pop(0))

Expand Down Expand Up @@ -525,7 +527,8 @@ def denormalize(self, item, norm_meta):
class _PandasNormalizer(Normalizer):
def _index_to_records(self, df, pd_norm, dynamic_strings, string_max_len):
index = df.index
if len(index) == 0 and len(df.select_dtypes(include="category").columns) == 0:
empty_index = len(index) == 0 and len(df.select_dtypes(include="category").columns) == 0
if empty_index and _allow_empty_index():
index_norm = pd_norm.index
index_norm.is_physically_stored = False
index = Index([])
Expand All @@ -551,11 +554,15 @@ def _index_to_records(self, df, pd_norm, dynamic_strings, string_max_len):
index = df.index
else:
is_not_range_index = not isinstance(index, RangeIndex)
df_has_rows = not(len(index) == 0 and len(df.select_dtypes(include="category").columns) == 0)
index_norm = pd_norm.index
index_norm.is_physically_stored = is_not_range_index and df_has_rows
if not df_has_rows:
index = Index([])
if _allow_empty_index():
index_norm.is_physically_stored = is_not_range_index and not empty_index
if empty_index:
index = Index([])
else:
if IS_PANDAS_TWO and isinstance(index, RangeIndex) and empty_index:
index = DatetimeIndex([])
index_norm.is_physically_stored = not isinstance(index, RangeIndex)

return _normalize_single_index(index, list(index.names), index_norm, dynamic_strings, string_max_len)

Expand Down
11 changes: 11 additions & 0 deletions python/tests/unit/arcticdb/version_store/test_empty_column_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,20 @@
"""
import sys
from math import nan
import os
import pandas as pd
from pandas.testing import assert_frame_equal
import numpy as np
import pytest


_ENV_FLAG = "ARCTICDB_ALLOW_EMPTY_INDEX"
# Value the flag had before this module's tests started; None means the variable was not set at all.
_saved_flag_value = None


def setup_module(module):
    """Enable the empty-index feature flag for every test in this module.

    The value present beforehand is remembered so that ``teardown_module`` can put it back.
    """
    global _saved_flag_value
    _saved_flag_value = os.environ.get(_ENV_FLAG)
    os.environ[_ENV_FLAG] = "1"


def teardown_module(module):
    """Restore the feature flag to the state it had before ``setup_module`` ran.

    Restoring (rather than forcing "0") keeps a value the developer or CI exported from being clobbered, and
    removes the variable again when it was not set originally, so later test modules see the environment
    they started with.
    """
    global _saved_flag_value
    if _saved_flag_value is None:
        os.environ.pop(_ENV_FLAG, None)
    else:
        os.environ[_ENV_FLAG] = _saved_flag_value
    _saved_flag_value = None


class DtypeGenerator:
"""
Can generate representative subset of all supported dtypes. Can generate by category (e.g. int, float, etc...) or
Expand Down Expand Up @@ -584,6 +593,7 @@ def append_index(self, request):
@pytest.fixture(autouse=True)
def create_empty_column(self, lmdb_version_store_static_and_dynamic, dtype, empty_index):
    """Write an empty single-column frame under "sym" before each test, then hand control to the test.

    After writing, the symbol is read back and its index is asserted to equal an empty ``pd.Index``.
    """
    store = lmdb_version_store_static_and_dynamic
    empty_frame = pd.DataFrame({"col": []}, dtype=dtype, index=empty_index)
    store.write("sym", empty_frame)
    # Sanity-check the starting state the tests rely on: the stored index reads back as an empty Index.
    stored_index = store.read("sym").data.index
    assert stored_index.equals(pd.Index([]))
    yield

def test_integer(self, lmdb_version_store_static_and_dynamic, int_dtype, dtype, append_index):
Expand Down Expand Up @@ -744,6 +754,7 @@ class TestCanUpdateEmptyColumn:
@pytest.fixture(autouse=True)
def create_empty_column(self, lmdb_version_store_static_and_dynamic, dtype, empty_index):
    """Seed symbol "sym" with a zero-row, one-column DataFrame ahead of every test in the class.

    The frame is built with the parametrised ``dtype`` and ``empty_index``; the fixture then verifies that
    reading the symbol back yields an index equal to ``pd.Index([])`` before yielding to the test.
    """
    lib = lmdb_version_store_static_and_dynamic
    lib.write("sym", pd.DataFrame({"col": []}, dtype=dtype, index=empty_index))
    # Read the symbol back straight away so a wrong initial index fails here, not inside the test body.
    round_tripped = lib.read("sym").data
    expected_index = pd.Index([])
    assert round_tripped.index.equals(expected_index)
    yield

def update_index(self):
Expand Down

0 comments on commit 3f5f783

Please sign in to comment.