Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Output compression to ignore timestamps #119

Merged
merged 9 commits into from
Jan 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
language: python
python:
- "3.7"
- "3.8"
- "3.9"
install:
- pip install -r requirements.txt
script:
- pip install pytest-cov codecov
- py.test -v --cov-report=xml --cov=pycytominer pycytominer/tests/
Expand Down
4 changes: 2 additions & 2 deletions pycytominer/aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def aggregate(
operation="median",
output_file="none",
subset_data_df="none",
compression=None,
compression_options=None,
float_format=None,
):
"""
Expand Down Expand Up @@ -79,7 +79,7 @@ def aggregate(
output(
df=population_df,
output_filename=output_file,
compression=compression,
compression_options=compression_options,
float_format=float_format,
)
else:
Expand Down
6 changes: 3 additions & 3 deletions pycytominer/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def annotate(
external_metadata="none",
external_join_left="none",
external_join_right="none",
compression=None,
compression_options=None,
float_format=None,
):
"""
Expand All @@ -48,7 +48,7 @@ def annotate(
metadata information
external_join_left - [default: "none"] the merge column in the profile metadata
external_join_right - [default: "none"] the merge column in the external metadata
compression - the mechanism to compress [default: None] See cyto_utils/output.py for options.
compression_options - the compression arguments to use when writing the output file [default: None]. See cyto_utils/output.py for options.
float_format - decimal precision to use in writing output file [default: None]
For example, use "%.3g" for 3 decimal precision.

Expand Down Expand Up @@ -180,7 +180,7 @@ def annotate(
output(
df=annotated,
output_filename=output_file,
compression=compression,
compression_options=compression_options,
float_format=float_format,
)
else:
Expand Down
8 changes: 4 additions & 4 deletions pycytominer/consensus.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def consensus(
operation="median",
features="infer",
output_file="none",
compression=None,
compression_options=None,
float_format=None,
modz_args={"method": "spearman"},
):
Expand All @@ -38,8 +38,8 @@ def consensus(
:type output_file: str
:param modz_args: Additional custom arguments passed as kwargs if operation="modz". See pycytominer.cyto_utils.modz for more details.
:type modz_args: dict
:param compression: the method to compress output data, defaults to None. See pycytominer.cyto_utils.output.py for options
:type compression: str
:param compression_options: the method to compress output data, defaults to None. See pycytominer.cyto_utils.output.py for options
:type compression_options: str, dict
:param float_format: decimal precision to use in writing output file, defaults to None. For example, use "%.3g" for 3 decimal precision.

:Example:
Expand Down Expand Up @@ -102,7 +102,7 @@ def consensus(
output(
df=consensus_df,
output_filename=output_file,
compression=compression,
compression_options=compression_options,
float_format=float_format,
)
else:
Expand Down
8 changes: 4 additions & 4 deletions pycytominer/cyto_utils/cells.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ def aggregate_compartment(
def merge_single_cells(
self,
sc_output_file="none",
compression=None,
compression_options=None,
float_format=None,
single_cell_normalize=False,
normalize_args=None,
Expand Down Expand Up @@ -429,7 +429,7 @@ def merge_single_cells(
output(
df=sc_df,
output_filename=sc_output_file,
compression=compression,
compression_options=compression_options,
float_format=float_format,
)
else:
Expand All @@ -439,7 +439,7 @@ def aggregate_profiles(
self,
compute_subsample=False,
output_file="none",
compression=None,
compression_options=None,
float_format=None,
aggregate_args=None,
):
Expand Down Expand Up @@ -492,7 +492,7 @@ def aggregate_profiles(
output(
df=aggregated,
output_filename=self.output_file,
compression=compression,
compression_options=compression_options,
float_format=float_format,
)
else:
Expand Down
130 changes: 78 additions & 52 deletions pycytominer/cyto_utils/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,73 +6,99 @@
import warnings
import pandas as pd

compress_options = {"gzip": ".gz", "bz2": ".bz2", "zip": ".zip", "xz": ".xz", None: ""}
compress_options = ["gzip", None]


def output(df, output_filename, compression="gzip", float_format=None):
"""
Given an output file and compression options, write file to disk
def output(
df,
output_filename,
sep=",",
float_format=None,
compression_options={"method": "gzip", "mtime": 1},
):
"""Given an output file and compression options, write file to disk

Arguments:
df - a pandas dataframe that will be written to file
output_filename - a string or path object that stores location of file
compression - the mechanism to compress [default: "gzip"]
float_format - decimal precision to use in writing output file [default: None]
For example, use "%.3g" for 3 decimal precision.
:param df: a pandas dataframe that will be written to file
:type df: pandas.DataFrame
:param output_filename: a string or path object that stores location of file
:type output_filename: str
:param sep: file delimiter
:type sep: str
:param float_format: decimal precision to use in writing output file [default: None]
:type float_format: str
:param compression_options: compression arguments passed as the `compression` input to pandas.to_csv() [default: {"method": "gzip", "mtime": 1}]
:type compression_options: str, dict

Return:
Nothing, write df to file
"""
:Example:

# Extract suffixes from the provided output file name
filename, output_file_extension = os.path.splitext(output_filename)
basefilename, non_compression_suffix = os.path.splitext(filename)

# if no additional suffix was provided, make it a csv
if len(non_compression_suffix) == 0 and output_file_extension not in [
".csv",
".tsv",
]:
output_filename = "{}.csv".format(output_filename)

# Set the delimiter
delim = ","
if non_compression_suffix == ".tsv":
delim = "\t"

# Determine the compression suffix
compression_suffix = infer_compression_suffix(compression=compression)
if output_file_extension in compress_options.values():
if output_file_extension != compression_suffix:
warnings.warn(
"The output file has a compression file extension ('{}') that is different than what is specified in 'compression' ('{}'). Defaulting to output filename suffix.".format(
output_file_extension, compression_suffix
)
)
compression = "infer"
else:
output_filename = "{}{}".format(output_filename, compression_suffix)
import pandas as pd
from pycytominer.cyto_utils import output

data_df = pd.concat(
[
pd.DataFrame(
{
"Metadata_Plate": "X",
"Metadata_Well": "a",
"Cells_x": [0.1, 0.3, 0.8],
"Nuclei_y": [0.5, 0.3, 0.1],
}
),
pd.DataFrame(
{
"Metadata_Plate": "X",
"Metadata_Well": "b",
"Cells_x": [0.4, 0.2, -0.5],
"Nuclei_y": [-0.8, 1.2, -0.5],
}
),
]
).reset_index(drop=True)

output_file = "test.csv.gz"
output(
df=data_df,
output_filename=output_file,
sep=",",
compression_options={"method": "gzip", "mtime": 1},
float_format=None,
)
"""
# Make sure the compression method is supported
compression_options = set_compression_method(compression=compression_options)

df.to_csv(
path_or_buf=output_filename,
sep=delim,
sep=sep,
index=False,
float_format=float_format,
compression=compression,
compression=compression_options,
)


def infer_compression_suffix(compression="gzip"):
def set_compression_method(compression):
"""Set the compression options

:param compression: indicating compression options
:type compression: str, dict
"""
Determine the compression suffix

Arguments:
compression - the mechanism to compress [default: "gzip"]
if compression is None:
compression = {"method": None}

if isinstance(compression, str):
compression = {"method": compression}

check_compression_method(compression["method"])
return compression


def check_compression_method(compression):
"""Ensure compression options are set properly

:param compression: the compression used to output data
:type compression: str
"""
assert (
compression in compress_options
), "{} is not supported, select one of {}".format(
compression, list(compress_options.keys())
)

return compress_options[compression]
), "{} is not supported, select one of {}".format(compression, compress_options)
4 changes: 2 additions & 2 deletions pycytominer/feature_select.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def feature_select(
corr_method="pearson",
freq_cut=0.05,
unique_cut=0.1,
compression=None,
compression_options=None,
float_format=None,
blocklist_file=None,
outlier_cutoff=15,
Expand Down Expand Up @@ -144,7 +144,7 @@ def feature_select(
output(
df=selected_df,
output_filename=output_file,
compression=compression,
compression_options=compression_options,
float_format=float_format,
)
else:
Expand Down
4 changes: 2 additions & 2 deletions pycytominer/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def normalize(
samples="all",
method="standardize",
output_file="none",
compression=None,
compression_options=None,
float_format=None,
spherize_center=True,
spherize_method="ZCA-cor",
Expand Down Expand Up @@ -104,7 +104,7 @@ def normalize(
output(
df=normalized,
output_filename=output_file,
compression=compression,
compression_options=compression_options,
float_format=float_format,
)
else:
Expand Down
2 changes: 1 addition & 1 deletion pycytominer/tests/test_annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def test_annotate_compress():
join_on=["Metadata_well_position", "Metadata_Well"],
add_metadata_id_to_platemap=False,
output_file=compress_file,
compression="gzip",
compression_options={"method": "gzip"},
)

result = annotate(
Expand Down
4 changes: 3 additions & 1 deletion pycytominer/tests/test_cyto_utils/test_cells.py
Original file line number Diff line number Diff line change
Expand Up @@ -389,7 +389,9 @@ def test_aggregate_subsampling_profile():
def test_aggregate_subsampling_profile_compress():
compress_file = os.path.join(tmpdir, "test_aggregate_compress.csv.gz")

_ = ap_subsample.aggregate_profiles(output_file=compress_file, compression="gzip")
_ = ap_subsample.aggregate_profiles(
output_file=compress_file, compression_options={"method": "gzip"}
)
result = pd.read_csv(compress_file)

expected_result = pd.DataFrame(
Expand Down
Loading