diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 824eccd..4e805dd 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,3 +1,9 @@ +-------------------- +[0.1.1] - 2021-04-XX +-------------------- + +- Add support for new columns in tskit. (benjeffery, #39, #42). + -------------------- [0.1.0] - 2019-05-10 -------------------- diff --git a/requirements/CI-tests-conda.txt b/requirements/CI-tests-conda.txt index 1c21b0b..f5ee0bc 100644 --- a/requirements/CI-tests-conda.txt +++ b/requirements/CI-tests-conda.txt @@ -1,4 +1,5 @@ humanize==3.4.1 +h5py==3.2.1 msprime==1.0.0 pytest==6.2.3 pytest-cov==2.11.1 diff --git a/requirements/development.txt b/requirements/development.txt index a8609e2..f801ebe 100644 --- a/requirements/development.txt +++ b/requirements/development.txt @@ -1,6 +1,7 @@ codecov coverage flake8 +h5py pre-commit pytest pytest-cov diff --git a/tests/files/1.0.0.trees b/tests/files/1.0.0.trees new file mode 100644 index 0000000..c30951a Binary files /dev/null and b/tests/files/1.0.0.trees differ diff --git a/tests/files/1.0.0.trees.tsz b/tests/files/1.0.0.trees.tsz new file mode 100644 index 0000000..ca14d5b Binary files /dev/null and b/tests/files/1.0.0.trees.tsz differ diff --git a/tests/test_cli.py b/tests/test_cli.py index 1d14e07..fd01d88 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -31,7 +31,6 @@ import msprime import numpy as np -import pytest import tskit import tszip @@ -43,6 +42,9 @@ class TestException(Exception): Custom exception we can throw for testing. """ + # We don't want pytest to use this as a class to test + __test__ = False + def capture_output(func, *args, **kwargs): """ @@ -189,7 +191,6 @@ def setUp(self): def tearDown(self): del self.tmpdir - @pytest.mark.xfail def test_simple(self): self.assertTrue(self.trees_path.exists()) self.run_tszip([str(self.trees_path)]) @@ -199,7 +200,6 @@ def test_simple(self): ts = tszip.decompress(outpath) self.assertEqual(ts.tables, self.ts.tables) - @pytest.mark.xfail def test_suffix(self): self.assertTrue(self.trees_path.exists()) self.run_tszip([str(self.trees_path), "-S", ".XYZasdf"]) @@ -221,7 +221,6 @@ def test_variants_only(self): G2 = self.ts.genotype_matrix() self.assertTrue(np.array_equal(G1, G2)) - @pytest.mark.xfail def test_keep(self): self.assertTrue(self.trees_path.exists()) self.run_tszip([str(self.trees_path), "--keep"]) @@ -231,7 +230,6 @@ def test_keep(self): ts = tszip.decompress(outpath) self.assertEqual(ts.tables, self.ts.tables) - @pytest.mark.xfail def test_overwrite(self): self.assertTrue(self.trees_path.exists()) outpath = pathlib.Path(str(self.trees_path) + ".tsz") @@ -255,7 +253,6 @@ def test_no_overwrite(self): f"'{outpath}' already exists; use --force to overwrite" ) - @pytest.mark.xfail def test_bad_file_format(self): self.assertTrue(self.trees_path.exists()) with open(str(self.trees_path), "w") as f: @@ -283,7 +280,6 @@ def setUp(self): def tearDown(self): del self.tmpdir - @pytest.mark.xfail def test_simple(self): self.assertTrue(self.compressed_path.exists()) self.run_decompress([str(self.compressed_path)]) @@ -293,7 +289,6 @@ def test_simple(self): ts = tskit.load(str(outpath)) self.assertEqual(ts.tables, self.ts.tables) - @pytest.mark.xfail def test_suffix(self): suffix = ".XYGsdf" self.compressed_path = self.compressed_path.with_suffix(suffix) @@ -306,7 +301,6 @@ def test_suffix(self): ts = tskit.load(str(outpath)) self.assertEqual(ts.tables, self.ts.tables) - @pytest.mark.xfail def test_keep(self): self.assertTrue(self.compressed_path.exists()) self.run_decompress([str(self.compressed_path), "--keep"]) @@ -316,7 +310,6 @@ def test_keep(self): ts = tskit.load(str(outpath)) self.assertEqual(ts.tables, self.ts.tables) - @pytest.mark.xfail def test_overwrite(self): self.assertTrue(self.compressed_path.exists()) outpath = self.trees_path @@ -346,7 +339,6 @@ def test_decompress_bad_suffix(self): "Compressed file must have 'asdf' suffix" ) - @pytest.mark.xfail def test_bad_file_format(self): self.assertTrue(self.compressed_path.exists()) with open(str(self.compressed_path), "w") as f: diff --git a/tests/test_compression.py b/tests/test_compression.py index 531bf8c..eb52f22 100644 --- a/tests/test_compression.py +++ b/tests/test_compression.py @@ -28,7 +28,6 @@ import msprime import numpy as np -import pytest import tskit import zarr @@ -106,20 +105,17 @@ class RoundTripMixin: Set of example tree sequences that we should be able to round trip. """ - @pytest.mark.xfail def test_small_msprime_no_recomb(self): ts = msprime.simulate(10, mutation_rate=2, random_seed=2) self.assertGreater(ts.num_sites, 2) self.verify(ts) - @pytest.mark.xfail def test_small_msprime_recomb(self): ts = msprime.simulate(10, recombination_rate=2, mutation_rate=2, random_seed=2) self.assertGreater(ts.num_sites, 2) self.assertGreater(ts.num_trees, 2) self.verify(ts) - @pytest.mark.xfail def test_small_msprime_migration(self): ts = msprime.simulate( population_configurations=[ @@ -137,7 +133,6 @@ def test_small_msprime_migration(self): self.assertGreater(ts.num_trees, 2) self.verify(ts) - @pytest.mark.xfail def test_small_msprime_top_level_metadata(self): ts = msprime.simulate(10, recombination_rate=2, mutation_rate=2, random_seed=2) self.assertGreater(ts.num_sites, 2) @@ -151,7 +146,6 @@ def test_small_msprime_top_level_metadata(self): tables.metadata = {"my_int": 1234} self.verify(tables.tree_sequence()) - @pytest.mark.xfail def test_small_msprime_individuals_metadata(self): ts = msprime.simulate(10, recombination_rate=1, mutation_rate=2, random_seed=2) self.assertGreater(ts.num_sites, 2) @@ -159,7 +153,9 @@ def test_small_msprime_individuals_metadata(self): tables = ts.dump_tables() tables.nodes.clear() for j, node in enumerate(ts.nodes()): - tables.individuals.add_row(flags=j, location=[j] * j, metadata=b"x" * j) + tables.individuals.add_row( + flags=j, location=[j] * j, parents=[j - 1] * j, metadata=b"x" * j + ) tables.nodes.add_row( flags=node.flags, population=node.population, @@ -169,6 +165,7 @@ def test_small_msprime_individuals_metadata(self): ) tables.populations.clear() tables.populations.add_row(metadata=b"X" * 1024) + tables.sort() self.verify(tables.tree_sequence()) def test_small_msprime_complex_mutations(self): @@ -200,6 +197,36 @@ def test_mutation_parent_example(self): tables.mutations.add_row(site=0, node=0, parent=0, derived_state="A") self.verify(tables.tree_sequence()) + def test_all_fields(self): + demography = msprime.Demography() + demography.add_population(name="A", initial_size=10_000) + demography.add_population(name="B", initial_size=5_000) + demography.add_population(name="C", initial_size=1_000) + demography.add_population_split(time=1000, derived=["A", "B"], ancestral="C") + ts = msprime.sim_ancestry( + samples={"A": 1, "B": 1}, + demography=demography, + random_seed=42, + record_migrations=True, + ) + ts = msprime.sim_mutations(ts, rate=1, random_seed=42) + tables = ts.dump_tables() + for name, table in tables.name_map.items(): + if name not in ["provenances", "edges"]: + table.metadata_schema = tskit.MetadataSchema({"codec": "json"}) + metadatas = [f'{{"foo":"n_{name}_{u}"}}' for u in range(len(table))] + metadata, metadata_offset = tskit.pack_strings(metadatas) + table.set_columns( + **{ + **table.asdict(), + "metadata": metadata, + "metadata_offset": metadata_offset, + } + ) + tables.metadata_schema = tskit.MetadataSchema({"codec": "json"}) + tables.metadata = "Test metadata" + self.verify(tables.tree_sequence()) + class TestGenotypeRoundTrip(unittest.TestCase, RoundTripMixin): """ diff --git a/tests/test_legacy.py b/tests/test_legacy.py new file mode 100644 index 0000000..6c334e2 --- /dev/null +++ b/tests/test_legacy.py @@ -0,0 +1,37 @@ +# MIT License +# +# Copyright (c) 2021 Tskit Developers +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +""" +Tests for files from previous releases. +""" +import pathlib + +import tskit + +import tszip + + +class Testv1: + def test_decompress(self): + files = pathlib.Path(__file__).parent / "files" + ts = tszip.decompress(files / "1.0.0.trees.tsz") + ts2 = tskit.load(files / "1.0.0.trees") + assert ts == ts2 diff --git a/tszip/compression.py b/tszip/compression.py index 7a42a49..2eeda6e 100644 --- a/tszip/compression.py +++ b/tszip/compression.py @@ -29,6 +29,7 @@ import tempfile import warnings import zipfile +from typing import Mapping import humanize import numcodecs @@ -158,16 +159,47 @@ def compress_zarr(ts, root, variants_only=False): if variants_only: logging.info("Using lossy variants-only compression") - # Reduce to site topology and quantise node times. Note that we will remove + # Reduce to site topology. Note that we will remove # any sites, individuals and populations here that have no references. ts = ts.simplify(reduce_to_site_topology=True) - tables = ts.tables + + tables = ts.tables + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + # When using a zipfile in Zarr we get some harmless warnings. See + # https://zarr.readthedocs.io/en/stable/api/storage.html#zarr.storage.ZipStore + root.attrs["format_name"] = FORMAT_NAME + root.attrs["format_version"] = FORMAT_VERSION + root.attrs["sequence_length"] = tables.sequence_length + root.attrs["provenance"] = provenance_dict + + columns = {} + for key, value in tables.asdict().items(): + if isinstance(value, dict): + for sub_key, sub_value in value.items(): + columns[f"{key}/{sub_key}"] = sub_value + else: + columns[key] = value + + if variants_only: time = np.unique(tables.nodes.time) - node_time = np.searchsorted(time, tables.nodes.time) - else: - tables = ts.tables - node_time = tables.nodes.time + columns["node/time"] = np.searchsorted(time, tables.nodes.time) + # Encoding array is a tuple so must be converted + columns["encoding_version"] = np.asarray(columns["encoding_version"]) + + # Sequence length is stored as an attr for compatibility with older versions of tszip + del columns["sequence_length"] + + # Schemas and metadata need to be converted to arrays + for name in columns: + if name.endswith("metadata_schema"): + columns[name] = np.frombuffer(columns[name].encode("utf-8"), np.int8) + if name.endswith("metadata"): + columns[name] = np.frombuffer(columns[name], np.int8) + + # Some columns benefit from being quantised coordinates = np.unique( np.hstack( [ @@ -180,78 +212,18 @@ def compress_zarr(ts, root, variants_only=False): ] ) ) - - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - # When using a zipfile in Zarr we get some harmless warnings. See - # https://zarr.readthedocs.io/en/stable/api/storage.html#zarr.storage.ZipStore - root.attrs["format_name"] = FORMAT_NAME - root.attrs["format_version"] = FORMAT_VERSION - root.attrs["sequence_length"] = tables.sequence_length - root.attrs["provenance"] = provenance_dict - if tables.metadata_schema.schema is not None: - root.attrs["metadata_schema"] = tables.metadata_schema.schema - root.attrs["metadata"] = tables.metadata - - columns = [ - Column("coordinates", coordinates), - Column("individuals/flags", tables.individuals.flags), - Column("individuals/location", tables.individuals.location), - Column( - "individuals/location_offset", - tables.individuals.location_offset, - delta_filter=True, - ), - Column("individuals/metadata", tables.individuals.metadata), - Column( - "individuals/metadata_offset", - tables.individuals.metadata_offset, - delta_filter=True, - ), - Column("nodes/time", node_time), - Column("nodes/flags", tables.nodes.flags), - Column("nodes/population", tables.nodes.population), - Column("nodes/individual", tables.nodes.individual), - Column("nodes/metadata", tables.nodes.metadata), - Column( - "nodes/metadata_offset", tables.nodes.metadata_offset, delta_filter=True - ), - # Delta filtering makes storage slightly worse for everything except parent. - Column("edges/left", np.searchsorted(coordinates, tables.edges.left)), - Column("edges/right", np.searchsorted(coordinates, tables.edges.right)), - Column("edges/parent", tables.edges.parent, delta_filter=True), - Column("edges/child", tables.edges.child), - Column("migrations/left", np.searchsorted(coordinates, tables.migrations.left)), - Column( - "migrations/right", np.searchsorted(coordinates, tables.migrations.right) - ), - Column("migrations/node", tables.migrations.node), - Column("migrations/source", tables.migrations.source), - Column("migrations/dest", tables.migrations.dest), - Column("migrations/time", tables.migrations.time), - Column( - "sites/position", - np.searchsorted(coordinates, tables.sites.position), - delta_filter=True, - ), - Column("sites/ancestral_state", tables.sites.ancestral_state), - Column("sites/ancestral_state_offset", tables.sites.ancestral_state_offset), - Column("sites/metadata", tables.sites.metadata), - Column("sites/metadata_offset", tables.sites.metadata_offset), - Column("mutations/site", tables.mutations.site), - Column("mutations/node", tables.mutations.node), - Column("mutations/parent", tables.mutations.parent), - Column("mutations/derived_state", tables.mutations.derived_state), - Column("mutations/derived_state_offset", tables.mutations.derived_state_offset), - Column("mutations/metadata", tables.mutations.metadata), - Column("mutations/metadata_offset", tables.mutations.metadata_offset), - Column("populations/metadata", tables.populations.metadata), - Column("populations/metadata_offset", tables.populations.metadata_offset), - Column("provenances/timestamp", tables.provenances.timestamp), - Column("provenances/timestamp_offset", tables.provenances.timestamp_offset), - Column("provenances/record", tables.provenances.record), - Column("provenances/record_offset", tables.provenances.record_offset), - ] + columns["coordinates"] = coordinates + for name in [ + "edges/left", + "edges/right", + "migrations/left", + "migrations/right", + "sites/position", + ]: + columns[name] = np.searchsorted(coordinates, columns[name]) + + # Some columns benefit from additional options + delta_filter_cols = ["edges/parent", "sites/position"] # Note: we're not providing any options to set this here because Blosc+Zstd seems to # have a clear advantage in compression performance and speed. There is very little @@ -261,8 +233,10 @@ def compress_zarr(ts, root, variants_only=False): compressor = numcodecs.Blosc( cname="zstd", clevel=9, shuffle=numcodecs.Blosc.SHUFFLE ) - for column in columns: - column.compress(root, compressor) + for name, data in columns.items(): + Column( + name, data, delta_filter="_offset" in name or name in delta_filter_cols + ).compress(root, compressor) def check_format(root): @@ -307,76 +281,35 @@ def load_zarr(path): def decompress_zarr(root): - tables = tskit.TableCollection(root.attrs["sequence_length"]) coordinates = root["coordinates"][:] - if "metadata_schema" in root.attrs and "metadata" in root.attrs: - tables.metadata_schema = tskit.MetadataSchema(root.attrs["metadata_schema"]) - tables.metadata = root.attrs["metadata"] - - tables.individuals.set_columns( - flags=root["individuals/flags"], - location=root["individuals/location"], - location_offset=root["individuals/location_offset"], - metadata=root["individuals/metadata"], - metadata_offset=root["individuals/metadata_offset"], - ) - - tables.nodes.set_columns( - flags=root["nodes/flags"], - time=root["nodes/time"], - population=root["nodes/population"], - individual=root["nodes/individual"], - metadata=root["nodes/metadata"], - metadata_offset=root["nodes/metadata_offset"], - ) - - tables.edges.set_columns( - left=coordinates[root["edges/left"]], - right=coordinates[root["edges/right"]], - parent=root["edges/parent"], - child=root["edges/child"], - ) - - tables.migrations.set_columns( - left=coordinates[root["migrations/left"]], - right=coordinates[root["migrations/right"]], - node=root["migrations/node"], - source=root["migrations/source"], - dest=root["migrations/dest"], - time=root["migrations/time"], - ) - - tables.sites.set_columns( - position=coordinates[root["sites/position"]], - ancestral_state=root["sites/ancestral_state"], - ancestral_state_offset=root["sites/ancestral_state_offset"], - metadata=root["sites/metadata"], - metadata_offset=root["sites/metadata_offset"], - ) - - tables.mutations.set_columns( - site=root["mutations/site"], - node=root["mutations/node"], - parent=root["mutations/parent"], - derived_state=root["mutations/derived_state"], - derived_state_offset=root["mutations/derived_state_offset"], - metadata=root["mutations/metadata"], - metadata_offset=root["mutations/metadata_offset"], - ) - - tables.populations.set_columns( - metadata=root["populations/metadata"], - metadata_offset=root["populations/metadata_offset"], - ) - - tables.provenances.set_columns( - timestamp=root["provenances/timestamp"], - timestamp_offset=root["provenances/timestamp_offset"], - record=root["provenances/record"], - record_offset=root["provenances/record_offset"], - ) - - return tables.tree_sequence() + dict_repr = {"sequence_length": root.attrs["sequence_length"]} + + quantised_arrays = [ + "edges/left", + "edges/right", + "migrations/left", + "migrations/right", + "sites/position", + ] + for key, value in root.items(): + if isinstance(value, Mapping): + for sub_key, sub_value in value.items(): + if f"{key}/{sub_key}" in quantised_arrays: + dict_repr.setdefault(key, {})[sub_key] = coordinates[sub_value] + elif sub_key.endswith("metadata_schema"): + dict_repr.setdefault(key, {})[sub_key] = bytes(sub_value).decode( + "utf-8" + ) + else: + dict_repr.setdefault(key, {})[sub_key] = sub_value + elif key.endswith("metadata_schema"): + dict_repr[key] = bytes(value).decode("utf-8") + elif key.endswith("metadata"): + dict_repr[key] = bytes(value) + else: + dict_repr[key] = value + + return tskit.TableCollection.fromdict(dict_repr).tree_sequence() def print_summary(path, verbosity=0):