Skip to content

Commit

Permalink
Add domain handling to dataframe write code (#1308)
Browse files (browse the repository at this point in the history)
  • Loading branch information
ivirshup authored Oct 28, 2024
1 parent: d1f7719 · commit: 1ea21f7
Show file tree
Hide file tree
Showing 6 changed files with 19 additions and 5 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -74,6 +74,7 @@ def add_dataframe(coll: CollectionBase, key: str, value_range: range) -> None:
]
),
index_column_names=["soma_joinid"],
+ domain=[(min(value_range), max(value_range))],
)
df.write(
pa.Table.from_pydict(
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -32,7 +32,10 @@ def create_census_summary(

# write to a SOMA dataframe
with info_collection.add_new_dataframe(
- CENSUS_SUMMARY_NAME, schema=pa.Schema.from_pandas(df, preserve_index=False), index_column_names=["soma_joinid"]
+ CENSUS_SUMMARY_NAME,
+ schema=pa.Schema.from_pandas(df, preserve_index=False),
+ index_column_names=["soma_joinid"],
+ domain=[(df["soma_joinid"].min(), df["soma_joinid"].max())],
) as summary:
summary.write(pa.Table.from_pandas(df, preserve_index=False))

Expand All @@ -57,5 +60,6 @@ def create_census_info_organisms(
CENSUS_INFO_ORGANISMS_NAME,
schema=pa.Schema.from_pandas(df, preserve_index=False),
index_column_names=["soma_joinid"],
+ domain=[(df["soma_joinid"].min(), df["soma_joinid"].max())],
) as summary:
summary.write(pa.Table.from_pandas(df, preserve_index=False))
Original file line number | Diff line number | Diff line change
Expand Up @@ -77,6 +77,9 @@ def create_dataset_manifest(info_collection: soma.Collection, datasets: list[Dat

# write to a SOMA dataframe
with info_collection.add_new_dataframe(
- CENSUS_DATASETS_NAME, schema=schema, index_column_names=["soma_joinid"]
+ CENSUS_DATASETS_NAME,
+ schema=schema,
+ index_column_names=["soma_joinid"],
+ domain=[(manifest_df["soma_joinid"].min(), manifest_df["soma_joinid"].max())],
) as manifest:
manifest.write(pa.Table.from_pandas(manifest_df, preserve_index=False, schema=schema))
Original file line number | Diff line number | Diff line change
Expand Up @@ -160,7 +160,7 @@ def write_obs_dataframe(self) -> None:
assert self.experiment is not None
_assert_open_for_write(self.experiment)

- obs_df = CENSUS_OBS_TABLE_SPEC.recategoricalize(self.obs_df)
+ obs_df = cast(pd.DataFrame, CENSUS_OBS_TABLE_SPEC.recategoricalize(self.obs_df))
obs_schema = CENSUS_OBS_TABLE_SPEC.to_arrow_schema(obs_df)

# create `obs`
Expand All @@ -169,6 +169,7 @@ def write_obs_dataframe(self) -> None:
schema=obs_schema,
index_column_names=["soma_joinid"],
platform_config=CENSUS_OBS_PLATFORM_CONFIG,
+ domain=[(obs_df["soma_joinid"].min(), obs_df["soma_joinid"].max())],
)

if obs_df is None or obs_df.empty:
Expand All @@ -186,7 +187,7 @@ def write_var_dataframe(self) -> None:

rna_measurement = self.experiment.ms[MEASUREMENT_RNA_NAME]

- var_df = CENSUS_VAR_TABLE_SPEC.recategoricalize(self.var_df)
+ var_df = cast(pd.DataFrame, CENSUS_VAR_TABLE_SPEC.recategoricalize(self.var_df))
var_schema = CENSUS_VAR_TABLE_SPEC.to_arrow_schema(var_df)

# create `var` in the measurement
Expand All @@ -195,6 +196,7 @@ def write_var_dataframe(self) -> None:
schema=var_schema,
index_column_names=["soma_joinid"],
platform_config=CENSUS_VAR_PLATFORM_CONFIG,
+ domain=[(var_df["soma_joinid"].min(), var_df["soma_joinid"].max())],
)

if var_df is None or var_df.empty:
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -29,7 +29,10 @@ def create_census_summary_cell_counts(

# write to a SOMA dataframe
with info_collection.add_new_dataframe(
- CENSUS_SUMMARY_CELL_COUNTS_NAME, schema=schema, index_column_names=["soma_joinid"]
+ CENSUS_SUMMARY_CELL_COUNTS_NAME,
+ schema=schema,
+ index_column_names=["soma_joinid"],
+ domain=[(df["soma_joinid"].min(), df["soma_joinid"].max())],
) as cell_counts:
cell_counts.write(pa.Table.from_pandas(df, preserve_index=False, schema=schema))

Expand Down
1 change: 1 addition & 0 deletions tools/cellxgene_census_builder/tests/test_builder.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -188,6 +188,7 @@ def test_unicode_support(tmp_path: pathlib.Path) -> None:
uri=os.path.join(tmp_path, "unicode_support"),
schema=pa.Schema.from_pandas(pd_df, preserve_index=False),
index_column_names=["soma_joinid"],
+ domain=[(pd_df["soma_joinid"].min(), pd_df["soma_joinid"].max())],
) as s_df:
s_df.write(pa.Table.from_pandas(pd_df, preserve_index=False))

Expand Down

0 comments on commit 1ea21f7

Please sign in to comment.