From 7a67fe3c09e94fabd02d43be5d61bbe6064296a7 Mon Sep 17 00:00:00 2001 From: chrishavlin Date: Mon, 24 Jul 2023 11:34:20 -0500 Subject: [PATCH 1/2] use _sorted_chunk_iterator to reduce code dupe missed one fix ahf modification --- yt/frontends/adaptahop/io.py | 19 +++++++------------ yt/frontends/ahf/io.py | 22 +++++++++++----------- yt/frontends/gadget_fof/io.py | 14 ++------------ yt/frontends/halo_catalog/io.py | 15 ++------------- yt/frontends/http_stream/io.py | 13 ++----------- yt/frontends/owls_subfind/io.py | 14 ++------------ yt/frontends/rockstar/io.py | 16 +++------------- yt/frontends/sdf/io.py | 12 ++---------- yt/frontends/stream/io.py | 15 ++------------- yt/frontends/tipsy/io.py | 6 +----- yt/frontends/ytdata/io.py | 21 +++------------------ yt/utilities/io_handler.py | 25 ++++++++++++++----------- 12 files changed, 51 insertions(+), 141 deletions(-) diff --git a/yt/frontends/adaptahop/io.py b/yt/frontends/adaptahop/io.py index 58d6040fcd..de0b42b316 100644 --- a/yt/frontends/adaptahop/io.py +++ b/yt/frontends/adaptahop/io.py @@ -36,16 +36,11 @@ def _yield_coordinates(self, data_file): def _read_particle_coords(self, chunks, ptf): # This will read chunks and yield the results. - chunks = list(chunks) - data_files = set() # Only support halo reading for now. assert len(ptf) == 1 assert list(ptf.keys())[0] == "halos" ptype = "halos" - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - for data_file in sorted(data_files, key=attrgetter("filename")): + for data_file in self._sorted_chunk_iterator(chunks): pcount = ( data_file.ds.parameters["nhalos"] + data_file.ds.parameters["nsubs"] ) @@ -56,14 +51,10 @@ def _read_particle_coords(self, chunks, ptf): def _read_particle_fields(self, chunks, ptf, selector): # Now we have all the sizes, and we can allocate - chunks = list(chunks) - data_files = set() + # Only support halo reading for now. assert len(ptf) == 1 assert list(ptf.keys())[0] == "halos" - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) def iterate_over_attributes(attr_list): for attr, *_ in attr_list: @@ -76,7 +67,7 @@ def iterate_over_attributes(attr_list): attr_pos = partial(_find_attr_position, halo_attributes=halo_attributes) - for data_file in sorted(data_files, key=attrgetter("filename")): + for data_file in self._sorted_chunk_iterator(chunks): pcount = ( data_file.ds.parameters["nhalos"] + data_file.ds.parameters["nsubs"] ) @@ -194,6 +185,10 @@ def members(self, ihalo): members = fpu.read_attrs(todo.pop(0))["particle_identities"] return members + def _sorted_chunk_iterator(self, chunks): + data_files = self._get_data_files(chunks) + yield from sorted(data_files, key=attrgetter("filename")) + def _todo_from_attributes(attributes: ATTR_T, halo_attributes: ATTR_T): # Helper function to generate a list of read-skip instructions given a list of diff --git a/yt/frontends/ahf/io.py b/yt/frontends/ahf/io.py index df9e83dc80..374155e1ff 100644 --- a/yt/frontends/ahf/io.py +++ b/yt/frontends/ahf/io.py @@ -16,7 +16,8 @@ def _read_particle_coords(self, chunks, ptf): # This needs to *yield* a series of tuples of (ptype, (x, y, z), hsml). # chunks is a list of chunks, and ptf is a dict where the keys are # ptypes and the values are lists of fields. 
- for data_file in self._get_data_files(chunks, ptf): + self._validate_particle_ptf(ptf) + for data_file in self._sorted_chunk_iterator(chunks): pos = data_file._get_particle_positions("halos") x, y, z = (pos[:, i] for i in range(3)) yield "halos", (x, y, z), 0.0 @@ -34,7 +35,8 @@ def _read_particle_fields(self, chunks, ptf, selector): # reading ptype, field and applying the selector to the data read in. # Selector objects have a .select_points(x,y,z) that returns a mask, so # you need to do your masking here. - for data_file in self._get_data_files(chunks, ptf): + self._validate_particle_ptf(ptf) + for data_file in self._sorted_chunk_iterator(chunks): si, ei = data_file.start, data_file.end cols = [] for field_list in ptf.values(): @@ -67,15 +69,13 @@ def _identify_fields(self, data_file): # Helper methods - def _get_data_files(self, chunks, ptf): + @staticmethod + def _validate_particle_ptf(ptf): # Only support halo reading for now. assert len(ptf) == 1 assert list(ptf.keys())[0] == "halos" - # Get data_files - chunks = list(chunks) - data_files = set() - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - data_files = sorted(data_files, key=attrgetter("filename")) - yield from data_files + + def _sorted_chunk_iterator(self, chunks): + # yield from sorted list of data_files + data_files = self._get_data_files(chunks) + yield from sorted(data_files, key=attrgetter("filename")) diff --git a/yt/frontends/gadget_fof/io.py b/yt/frontends/gadget_fof/io.py index e6fb351761..66377e179c 100644 --- a/yt/frontends/gadget_fof/io.py +++ b/yt/frontends/gadget_fof/io.py @@ -21,12 +21,7 @@ def _read_fluid_selection(self, chunks, selector, fields, size): def _read_particle_coords(self, chunks, ptf): # This will read chunks and yield the results. - chunks = list(chunks) - data_files = set() - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): with h5py.File(data_file.filename, mode="r") as f: for ptype in sorted(ptf): coords = data_file._get_particle_positions(ptype, f=f) @@ -71,12 +66,7 @@ def _read_offset_particle_field(self, field, data_file, fh): def _read_particle_fields(self, chunks, ptf, selector): # Now we have all the sizes, and we can allocate - chunks = list(chunks) - data_files = set() - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): si, ei = data_file.start, data_file.end with h5py.File(data_file.filename, mode="r") as f: for ptype, field_list in sorted(ptf.items()): diff --git a/yt/frontends/halo_catalog/io.py b/yt/frontends/halo_catalog/io.py index ad73ae00c3..a5aea371be 100644 --- a/yt/frontends/halo_catalog/io.py +++ b/yt/frontends/halo_catalog/io.py @@ -17,17 +17,12 @@ def _read_fluid_selection(self, chunks, selector, fields, size): def _read_particle_coords(self, chunks, ptf): # This will read chunks and yield the results. - chunks = list(chunks) - data_files = set() # Only support halo reading for now. 
assert len(ptf) == 1 assert list(ptf.keys())[0] == "halos" ptype = "halos" - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) pn = "particle_position_%s" - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): with h5py.File(data_file.filename, mode="r") as f: units = parse_h5_attr(f[pn % "x"], "units") pos = data_file._get_particle_positions(ptype, f=f) @@ -46,17 +41,11 @@ def _yield_coordinates(self, data_file): yield "halos", pos def _read_particle_fields(self, chunks, ptf, selector): - # Now we have all the sizes, and we can allocate - chunks = list(chunks) - data_files = set() # Only support halo reading for now. assert len(ptf) == 1 assert list(ptf.keys())[0] == "halos" - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) pn = "particle_position_%s" - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): si, ei = data_file.start, data_file.end with h5py.File(data_file.filename, mode="r") as f: for ptype, field_list in sorted(ptf.items()): diff --git a/yt/frontends/http_stream/io.py b/yt/frontends/http_stream/io.py index 539a814ca0..59bc46c310 100644 --- a/yt/frontends/http_stream/io.py +++ b/yt/frontends/http_stream/io.py @@ -33,12 +33,7 @@ def _identify_fields(self, data_file): return f, {} def _read_particle_coords(self, chunks, ptf): - chunks = list(chunks) - data_files = set() - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): for ptype in ptf: s = self._open_stream(data_file, (ptype, "Coordinates")) c = np.frombuffer(s, dtype="float64") @@ -47,11 +42,7 @@ def _read_particle_coords(self, chunks, ptf): def _read_particle_fields(self, chunks, ptf, selector): # Now we have all the sizes, and we can allocate - data_files = set() - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): for ptype, field_list in sorted(ptf.items()): s = self._open_stream(data_file, (ptype, "Coordinates")) c = np.frombuffer(s, dtype="float64") diff --git a/yt/frontends/owls_subfind/io.py b/yt/frontends/owls_subfind/io.py index 66358ee820..a145cd59ea 100644 --- a/yt/frontends/owls_subfind/io.py +++ b/yt/frontends/owls_subfind/io.py @@ -20,12 +20,7 @@ def _read_fluid_selection(self, chunks, selector, fields, size): def _read_particle_coords(self, chunks, ptf): # This will read chunks and yield the results. 
- chunks = list(chunks) - data_files = set() - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): with h5py.File(data_file.filename, mode="r") as f: for ptype in sorted(ptf): pcount = data_file.total_particles[ptype] @@ -67,12 +62,7 @@ def _read_offset_particle_field(self, field, data_file, fh): def _read_particle_fields(self, chunks, ptf, selector): # Now we have all the sizes, and we can allocate - chunks = list(chunks) - data_files = set() - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): with h5py.File(data_file.filename, mode="r") as f: for ptype, field_list in sorted(ptf.items()): pcount = data_file.total_particles[ptype] diff --git a/yt/frontends/rockstar/io.py b/yt/frontends/rockstar/io.py index 7d59a441e6..bb49d56e63 100644 --- a/yt/frontends/rockstar/io.py +++ b/yt/frontends/rockstar/io.py @@ -19,16 +19,12 @@ def _read_fluid_selection(self, chunks, selector, fields, size): def _read_particle_coords(self, chunks, ptf): # This will read chunks and yield the results. - chunks = list(chunks) - data_files = set() + # Only support halo reading for now. assert len(ptf) == 1 assert list(ptf.keys())[0] == "halos" ptype = "halos" - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): pcount = data_file.header["num_halos"] if pcount == 0: continue @@ -37,16 +33,10 @@ def _read_particle_coords(self, chunks, ptf): yield "halos", (pos[:, i] for i in range(3)), 0.0 def _read_particle_fields(self, chunks, ptf, selector): - # Now we have all the sizes, and we can allocate - chunks = list(chunks) - data_files = set() # Only support halo reading for now. 
assert len(ptf) == 1 assert list(ptf.keys())[0] == "halos" - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): si, ei = data_file.start, data_file.end pcount = data_file.header["num_halos"] if pcount == 0: diff --git a/yt/frontends/sdf/io.py b/yt/frontends/sdf/io.py index 00d0481f0a..d14408a17f 100644 --- a/yt/frontends/sdf/io.py +++ b/yt/frontends/sdf/io.py @@ -15,13 +15,9 @@ def _read_fluid_selection(self, chunks, selector, fields, size): raise NotImplementedError def _read_particle_coords(self, chunks, ptf): - chunks = list(chunks) - data_files = set() assert len(ptf) == 1 assert ptf.keys()[0] == "dark_matter" - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) + data_files = self._get_data_files(chunks) assert len(data_files) == 1 for _data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): yield "dark_matter", ( @@ -31,13 +27,9 @@ def _read_particle_coords(self, chunks, ptf): ), 0.0 def _read_particle_fields(self, chunks, ptf, selector): - chunks = list(chunks) - data_files = set() assert len(ptf) == 1 assert ptf.keys()[0] == "dark_matter" - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) + data_files = self._get_data_files(chunks) assert len(data_files) == 1 for _data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): for ptype, field_list in sorted(ptf.items()): diff --git a/yt/frontends/stream/io.py b/yt/frontends/stream/io.py index 058ca90522..b4a7ab71b0 100644 --- a/yt/frontends/stream/io.py +++ b/yt/frontends/stream/io.py @@ -104,9 +104,7 @@ def __init__(self, ds): super().__init__(ds) def _read_particle_coords(self, chunks, ptf): - for data_file in sorted( - self._get_data_files(chunks), key=lambda x: (x.filename, x.start) - ): + for data_file in self._sorted_chunk_iterator(chunks): f = self.fields[data_file.filename] # This double-reads for ptype in sorted(ptf): @@ -117,19 +115,10 @@ def _read_particle_coords(self, chunks, ptf): ), 0.0 def _read_smoothing_length(self, chunks, ptf, ptype): - for data_file in sorted( - self._get_data_files(chunks), key=lambda x: (x.filename, x.start) - ): + for data_file in self._sorted_chunk_iterator(chunks): f = self.fields[data_file.filename] return f[ptype, "smoothing_length"] - def _get_data_files(self, chunks): - data_files = set() - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - return data_files - def _read_particle_data_file(self, data_file, ptf, selector=None): return_data = {} f = self.fields[data_file.filename] diff --git a/yt/frontends/tipsy/io.py b/yt/frontends/tipsy/io.py index a2a35a7293..820d0c0907 100644 --- a/yt/frontends/tipsy/io.py +++ b/yt/frontends/tipsy/io.py @@ -85,12 +85,8 @@ def _fill_fields(self, fields, vals, hsml, mask, data_file): return rv def _read_particle_coords(self, chunks, ptf): - data_files = set() - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) chunksize = self.ds.index.chunksize - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): poff = data_file.field_offsets tp = data_file.total_particles f = open(data_file.filename, "rb") diff --git a/yt/frontends/ytdata/io.py b/yt/frontends/ytdata/io.py index 875cf413ad..318c42bf24 100644 --- a/yt/frontends/ytdata/io.py +++ b/yt/frontends/ytdata/io.py @@ -191,12 
+191,7 @@ def _yield_coordinates(self, data_file): def _read_particle_coords(self, chunks, ptf): # This will read chunks and yield the results. - chunks = list(chunks) - data_files = set() - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): index_mask = slice(data_file.start, data_file.end) with h5py.File(data_file.filename, mode="r") as f: for ptype in sorted(ptf): @@ -271,12 +266,7 @@ class IOHandlerYTSpatialPlotHDF5(IOHandlerYTDataContainerHDF5): def _read_particle_coords(self, chunks, ptf): # This will read chunks and yield the results. - chunks = list(chunks) - data_files = set() - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): with h5py.File(data_file.filename, mode="r") as f: for ptype in sorted(ptf): pcount = data_file.total_particles[ptype] @@ -292,12 +282,7 @@ def _read_particle_coords(self, chunks, ptf): def _read_particle_fields(self, chunks, ptf, selector): # Now we have all the sizes, and we can allocate - chunks = list(chunks) - data_files = set() - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): all_count = self._count_particles(data_file) with h5py.File(data_file.filename, mode="r") as f: for ptype, field_list in sorted(ptf.items()): diff --git a/yt/utilities/io_handler.py b/yt/utilities/io_handler.py index 983941e720..ee0e6da909 100644 --- a/yt/utilities/io_handler.py +++ b/yt/utilities/io_handler.py @@ -224,31 +224,34 @@ def _read_particle_selection( def _read_particle_fields(self, chunks, ptf, selector): # Now we have all the sizes, and we can allocate - data_files = set() - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): data_file_data = self._read_particle_data_file(data_file, ptf, selector) # temporary trickery so it's still an iterator, need to adjust # the io_handler.BaseIOHandler.read_particle_selection() method # to not use an iterator. yield from data_file_data.items() - -# As a note: we don't *actually* want this to be how it is forever. There's no -# reason we need to have the fluid and particle IO handlers separated. But, -# for keeping track of which frontend is which, this is a useful abstraction. -class BaseParticleIOHandler(BaseIOHandler): - def _sorted_chunk_iterator(self, chunks): + @staticmethod + def _get_data_files(chunks): chunks = list(chunks) data_files = set() for chunk in chunks: for obj in chunk.objs: data_files.update(obj.data_files) + return data_files + + def _sorted_chunk_iterator(self, chunks): + data_files = self._get_data_files(chunks) yield from sorted(data_files, key=lambda x: (x.filename, x.start)) +# As a note: we don't *actually* want this to be how it is forever. There's no +# reason we need to have the fluid and particle IO handlers separated. But, +# for keeping track of which frontend is which, this is a useful abstraction. 
+class BaseParticleIOHandler(BaseIOHandler): + pass + + class IOHandlerExtracted(BaseIOHandler): _dataset_type = "extracted" From c3784b0684b36cff693216dde153bcc6a8b6492e Mon Sep 17 00:00:00 2001 From: chrishavlin Date: Tue, 25 Jul 2023 09:13:06 -0500 Subject: [PATCH 2/2] move ptf halo validation to functions --- yt/frontends/ahf/io.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/yt/frontends/ahf/io.py b/yt/frontends/ahf/io.py index 374155e1ff..7fbb1e40e2 100644 --- a/yt/frontends/ahf/io.py +++ b/yt/frontends/ahf/io.py @@ -16,7 +16,10 @@ def _read_particle_coords(self, chunks, ptf): # This needs to *yield* a series of tuples of (ptype, (x, y, z), hsml). # chunks is a list of chunks, and ptf is a dict where the keys are # ptypes and the values are lists of fields. - self._validate_particle_ptf(ptf) + + # Only support halo reading for now. + assert len(ptf) == 1 + assert list(ptf.keys())[0] == "halos" for data_file in self._sorted_chunk_iterator(chunks): pos = data_file._get_particle_positions("halos") x, y, z = (pos[:, i] for i in range(3)) @@ -35,7 +38,9 @@ def _read_particle_fields(self, chunks, ptf, selector): # reading ptype, field and applying the selector to the data read in. # Selector objects have a .select_points(x,y,z) that returns a mask, so # you need to do your masking here. - self._validate_particle_ptf(ptf) + # Only support halo reading for now. + assert len(ptf) == 1 + assert list(ptf.keys())[0] == "halos" for data_file in self._sorted_chunk_iterator(chunks): si, ei = data_file.start, data_file.end cols = [] @@ -67,14 +72,6 @@ def _identify_fields(self, data_file): fields = [("halos", f) for f in data_file.col_names] return fields, {} - # Helper methods - - @staticmethod - def _validate_particle_ptf(ptf): - # Only support halo reading for now. - assert len(ptf) == 1 - assert list(ptf.keys())[0] == "halos" - def _sorted_chunk_iterator(self, chunks): # yield from sorted list of data_files data_files = self._get_data_files(chunks)
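
Note for readers skimming the diff: below is a minimal, self-contained sketch of the helper pattern these patches consolidate into BaseIOHandler. _get_data_files() gathers the unique data files referenced by a set of chunks, and _sorted_chunk_iterator() yields them in a deterministic order. The DataFile, Obj, Chunk, and IOHandlerSketch names and the sample filenames are stand-ins invented for illustration only; they are not part of the yt API.

    from operator import attrgetter
    from typing import NamedTuple

    class DataFile(NamedTuple):
        # hypothetical stand-in for a yt data-file record
        filename: str
        start: int
        end: int

    class Obj(NamedTuple):
        # hypothetical stand-in for a chunk object carrying .data_files
        data_files: tuple

    class Chunk(NamedTuple):
        # hypothetical stand-in for a yt index chunk
        objs: tuple

    class IOHandlerSketch:
        @staticmethod
        def _get_data_files(chunks):
            # collect the unique data files referenced by all chunk objects
            data_files = set()
            for chunk in chunks:
                for obj in chunk.objs:
                    data_files.update(obj.data_files)
            return data_files

        def _sorted_chunk_iterator(self, chunks):
            # yield data files in a deterministic (filename, start) order
            data_files = self._get_data_files(chunks)
            yield from sorted(data_files, key=lambda x: (x.filename, x.start))

    class SingleFileFrontendIOSketch(IOHandlerSketch):
        def _sorted_chunk_iterator(self, chunks):
            # frontends such as adaptahop and ahf override the iterator to
            # sort by filename alone, as in the hunks above
            data_files = self._get_data_files(chunks)
            yield from sorted(data_files, key=attrgetter("filename"))

    df_a = DataFile("halos_0001.h5", 0, 100)
    df_b = DataFile("halos_0000.h5", 0, 100)
    chunks = [Chunk((Obj((df_a, df_b)),)), Chunk((Obj((df_b,)),))]
    for df in IOHandlerSketch()._sorted_chunk_iterator(chunks):
        print(df.filename, df.start)  # halos_0000.h5 first, each file once

The set plus sort gives a reproducible read order regardless of how the index decomposed the chunks, which is why the deduplication and ordering live once in the base class instead of being repeated in every frontend.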