From 7a67fe3c09e94fabd02d43be5d61bbe6064296a7 Mon Sep 17 00:00:00 2001 From: chrishavlin Date: Mon, 24 Jul 2023 11:34:20 -0500 Subject: [PATCH 1/2] use _sorted_chunk_iterator to reduce code dupe missed one fix ahf modification --- yt/frontends/adaptahop/io.py | 19 +++++++------------ yt/frontends/ahf/io.py | 22 +++++++++++----------- yt/frontends/gadget_fof/io.py | 14 ++------------ yt/frontends/halo_catalog/io.py | 15 ++------------- yt/frontends/http_stream/io.py | 13 ++----------- yt/frontends/owls_subfind/io.py | 14 ++------------ yt/frontends/rockstar/io.py | 16 +++------------- yt/frontends/sdf/io.py | 12 ++---------- yt/frontends/stream/io.py | 15 ++------------- yt/frontends/tipsy/io.py | 6 +----- yt/frontends/ytdata/io.py | 21 +++------------------ yt/utilities/io_handler.py | 25 ++++++++++++++----------- 12 files changed, 51 insertions(+), 141 deletions(-) diff --git a/yt/frontends/adaptahop/io.py b/yt/frontends/adaptahop/io.py index 58d6040fcd..de0b42b316 100644 --- a/yt/frontends/adaptahop/io.py +++ b/yt/frontends/adaptahop/io.py @@ -36,16 +36,11 @@ def _yield_coordinates(self, data_file): def _read_particle_coords(self, chunks, ptf): # This will read chunks and yield the results. - chunks = list(chunks) - data_files = set() # Only support halo reading for now. assert len(ptf) == 1 assert list(ptf.keys())[0] == "halos" ptype = "halos" - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - for data_file in sorted(data_files, key=attrgetter("filename")): + for data_file in self._sorted_chunk_iterator(chunks): pcount = ( data_file.ds.parameters["nhalos"] + data_file.ds.parameters["nsubs"] ) @@ -56,14 +51,10 @@ def _read_particle_coords(self, chunks, ptf): def _read_particle_fields(self, chunks, ptf, selector): # Now we have all the sizes, and we can allocate - chunks = list(chunks) - data_files = set() + # Only support halo reading for now. assert len(ptf) == 1 assert list(ptf.keys())[0] == "halos" - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) def iterate_over_attributes(attr_list): for attr, *_ in attr_list: @@ -76,7 +67,7 @@ def iterate_over_attributes(attr_list): attr_pos = partial(_find_attr_position, halo_attributes=halo_attributes) - for data_file in sorted(data_files, key=attrgetter("filename")): + for data_file in self._sorted_chunk_iterator(chunks): pcount = ( data_file.ds.parameters["nhalos"] + data_file.ds.parameters["nsubs"] ) @@ -194,6 +185,10 @@ def members(self, ihalo): members = fpu.read_attrs(todo.pop(0))["particle_identities"] return members + def _sorted_chunk_iterator(self, chunks): + data_files = self._get_data_files(chunks) + yield from sorted(data_files, key=attrgetter("filename")) + def _todo_from_attributes(attributes: ATTR_T, halo_attributes: ATTR_T): # Helper function to generate a list of read-skip instructions given a list of diff --git a/yt/frontends/ahf/io.py b/yt/frontends/ahf/io.py index df9e83dc80..374155e1ff 100644 --- a/yt/frontends/ahf/io.py +++ b/yt/frontends/ahf/io.py @@ -16,7 +16,8 @@ def _read_particle_coords(self, chunks, ptf): # This needs to *yield* a series of tuples of (ptype, (x, y, z), hsml). # chunks is a list of chunks, and ptf is a dict where the keys are # ptypes and the values are lists of fields. 
- for data_file in self._get_data_files(chunks, ptf): + self._validate_particle_ptf(ptf) + for data_file in self._sorted_chunk_iterator(chunks): pos = data_file._get_particle_positions("halos") x, y, z = (pos[:, i] for i in range(3)) yield "halos", (x, y, z), 0.0 @@ -34,7 +35,8 @@ def _read_particle_fields(self, chunks, ptf, selector): # reading ptype, field and applying the selector to the data read in. # Selector objects have a .select_points(x,y,z) that returns a mask, so # you need to do your masking here. - for data_file in self._get_data_files(chunks, ptf): + self._validate_particle_ptf(ptf) + for data_file in self._sorted_chunk_iterator(chunks): si, ei = data_file.start, data_file.end cols = [] for field_list in ptf.values(): @@ -67,15 +69,13 @@ def _identify_fields(self, data_file): # Helper methods - def _get_data_files(self, chunks, ptf): + @staticmethod + def _validate_particle_ptf(ptf): # Only support halo reading for now. assert len(ptf) == 1 assert list(ptf.keys())[0] == "halos" - # Get data_files - chunks = list(chunks) - data_files = set() - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - data_files = sorted(data_files, key=attrgetter("filename")) - yield from data_files + + def _sorted_chunk_iterator(self, chunks): + # yield from sorted list of data_files + data_files = self._get_data_files(chunks) + yield from sorted(data_files, key=attrgetter("filename")) diff --git a/yt/frontends/gadget_fof/io.py b/yt/frontends/gadget_fof/io.py index e6fb351761..66377e179c 100644 --- a/yt/frontends/gadget_fof/io.py +++ b/yt/frontends/gadget_fof/io.py @@ -21,12 +21,7 @@ def _read_fluid_selection(self, chunks, selector, fields, size): def _read_particle_coords(self, chunks, ptf): # This will read chunks and yield the results. - chunks = list(chunks) - data_files = set() - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): with h5py.File(data_file.filename, mode="r") as f: for ptype in sorted(ptf): coords = data_file._get_particle_positions(ptype, f=f) @@ -71,12 +66,7 @@ def _read_offset_particle_field(self, field, data_file, fh): def _read_particle_fields(self, chunks, ptf, selector): # Now we have all the sizes, and we can allocate - chunks = list(chunks) - data_files = set() - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): si, ei = data_file.start, data_file.end with h5py.File(data_file.filename, mode="r") as f: for ptype, field_list in sorted(ptf.items()): diff --git a/yt/frontends/halo_catalog/io.py b/yt/frontends/halo_catalog/io.py index ad73ae00c3..a5aea371be 100644 --- a/yt/frontends/halo_catalog/io.py +++ b/yt/frontends/halo_catalog/io.py @@ -17,17 +17,12 @@ def _read_fluid_selection(self, chunks, selector, fields, size): def _read_particle_coords(self, chunks, ptf): # This will read chunks and yield the results. - chunks = list(chunks) - data_files = set() # Only support halo reading for now. 
assert len(ptf) == 1 assert list(ptf.keys())[0] == "halos" ptype = "halos" - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) pn = "particle_position_%s" - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): with h5py.File(data_file.filename, mode="r") as f: units = parse_h5_attr(f[pn % "x"], "units") pos = data_file._get_particle_positions(ptype, f=f) @@ -46,17 +41,11 @@ def _yield_coordinates(self, data_file): yield "halos", pos def _read_particle_fields(self, chunks, ptf, selector): - # Now we have all the sizes, and we can allocate - chunks = list(chunks) - data_files = set() # Only support halo reading for now. assert len(ptf) == 1 assert list(ptf.keys())[0] == "halos" - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) pn = "particle_position_%s" - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): si, ei = data_file.start, data_file.end with h5py.File(data_file.filename, mode="r") as f: for ptype, field_list in sorted(ptf.items()): diff --git a/yt/frontends/http_stream/io.py b/yt/frontends/http_stream/io.py index 539a814ca0..59bc46c310 100644 --- a/yt/frontends/http_stream/io.py +++ b/yt/frontends/http_stream/io.py @@ -33,12 +33,7 @@ def _identify_fields(self, data_file): return f, {} def _read_particle_coords(self, chunks, ptf): - chunks = list(chunks) - data_files = set() - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): for ptype in ptf: s = self._open_stream(data_file, (ptype, "Coordinates")) c = np.frombuffer(s, dtype="float64") @@ -47,11 +42,7 @@ def _read_particle_coords(self, chunks, ptf): def _read_particle_fields(self, chunks, ptf, selector): # Now we have all the sizes, and we can allocate - data_files = set() - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): for ptype, field_list in sorted(ptf.items()): s = self._open_stream(data_file, (ptype, "Coordinates")) c = np.frombuffer(s, dtype="float64") diff --git a/yt/frontends/owls_subfind/io.py b/yt/frontends/owls_subfind/io.py index 66358ee820..a145cd59ea 100644 --- a/yt/frontends/owls_subfind/io.py +++ b/yt/frontends/owls_subfind/io.py @@ -20,12 +20,7 @@ def _read_fluid_selection(self, chunks, selector, fields, size): def _read_particle_coords(self, chunks, ptf): # This will read chunks and yield the results. 
- chunks = list(chunks) - data_files = set() - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): with h5py.File(data_file.filename, mode="r") as f: for ptype in sorted(ptf): pcount = data_file.total_particles[ptype] @@ -67,12 +62,7 @@ def _read_offset_particle_field(self, field, data_file, fh): def _read_particle_fields(self, chunks, ptf, selector): # Now we have all the sizes, and we can allocate - chunks = list(chunks) - data_files = set() - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): with h5py.File(data_file.filename, mode="r") as f: for ptype, field_list in sorted(ptf.items()): pcount = data_file.total_particles[ptype] diff --git a/yt/frontends/rockstar/io.py b/yt/frontends/rockstar/io.py index 7d59a441e6..bb49d56e63 100644 --- a/yt/frontends/rockstar/io.py +++ b/yt/frontends/rockstar/io.py @@ -19,16 +19,12 @@ def _read_fluid_selection(self, chunks, selector, fields, size): def _read_particle_coords(self, chunks, ptf): # This will read chunks and yield the results. - chunks = list(chunks) - data_files = set() + # Only support halo reading for now. assert len(ptf) == 1 assert list(ptf.keys())[0] == "halos" ptype = "halos" - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): pcount = data_file.header["num_halos"] if pcount == 0: continue @@ -37,16 +33,10 @@ def _read_particle_coords(self, chunks, ptf): yield "halos", (pos[:, i] for i in range(3)), 0.0 def _read_particle_fields(self, chunks, ptf, selector): - # Now we have all the sizes, and we can allocate - chunks = list(chunks) - data_files = set() # Only support halo reading for now. 
assert len(ptf) == 1 assert list(ptf.keys())[0] == "halos" - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): si, ei = data_file.start, data_file.end pcount = data_file.header["num_halos"] if pcount == 0: diff --git a/yt/frontends/sdf/io.py b/yt/frontends/sdf/io.py index 00d0481f0a..d14408a17f 100644 --- a/yt/frontends/sdf/io.py +++ b/yt/frontends/sdf/io.py @@ -15,13 +15,9 @@ def _read_fluid_selection(self, chunks, selector, fields, size): raise NotImplementedError def _read_particle_coords(self, chunks, ptf): - chunks = list(chunks) - data_files = set() assert len(ptf) == 1 assert ptf.keys()[0] == "dark_matter" - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) + data_files = self._get_data_files(chunks) assert len(data_files) == 1 for _data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): yield "dark_matter", ( @@ -31,13 +27,9 @@ def _read_particle_coords(self, chunks, ptf): ), 0.0 def _read_particle_fields(self, chunks, ptf, selector): - chunks = list(chunks) - data_files = set() assert len(ptf) == 1 assert ptf.keys()[0] == "dark_matter" - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) + data_files = self._get_data_files(chunks) assert len(data_files) == 1 for _data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): for ptype, field_list in sorted(ptf.items()): diff --git a/yt/frontends/stream/io.py b/yt/frontends/stream/io.py index 058ca90522..b4a7ab71b0 100644 --- a/yt/frontends/stream/io.py +++ b/yt/frontends/stream/io.py @@ -104,9 +104,7 @@ def __init__(self, ds): super().__init__(ds) def _read_particle_coords(self, chunks, ptf): - for data_file in sorted( - self._get_data_files(chunks), key=lambda x: (x.filename, x.start) - ): + for data_file in self._sorted_chunk_iterator(chunks): f = self.fields[data_file.filename] # This double-reads for ptype in sorted(ptf): @@ -117,19 +115,10 @@ def _read_particle_coords(self, chunks, ptf): ), 0.0 def _read_smoothing_length(self, chunks, ptf, ptype): - for data_file in sorted( - self._get_data_files(chunks), key=lambda x: (x.filename, x.start) - ): + for data_file in self._sorted_chunk_iterator(chunks): f = self.fields[data_file.filename] return f[ptype, "smoothing_length"] - def _get_data_files(self, chunks): - data_files = set() - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - return data_files - def _read_particle_data_file(self, data_file, ptf, selector=None): return_data = {} f = self.fields[data_file.filename] diff --git a/yt/frontends/tipsy/io.py b/yt/frontends/tipsy/io.py index a2a35a7293..820d0c0907 100644 --- a/yt/frontends/tipsy/io.py +++ b/yt/frontends/tipsy/io.py @@ -85,12 +85,8 @@ def _fill_fields(self, fields, vals, hsml, mask, data_file): return rv def _read_particle_coords(self, chunks, ptf): - data_files = set() - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) chunksize = self.ds.index.chunksize - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): poff = data_file.field_offsets tp = data_file.total_particles f = open(data_file.filename, "rb") diff --git a/yt/frontends/ytdata/io.py b/yt/frontends/ytdata/io.py index 875cf413ad..318c42bf24 100644 --- a/yt/frontends/ytdata/io.py +++ b/yt/frontends/ytdata/io.py @@ -191,12 
+191,7 @@ def _yield_coordinates(self, data_file): def _read_particle_coords(self, chunks, ptf): # This will read chunks and yield the results. - chunks = list(chunks) - data_files = set() - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): index_mask = slice(data_file.start, data_file.end) with h5py.File(data_file.filename, mode="r") as f: for ptype in sorted(ptf): @@ -271,12 +266,7 @@ class IOHandlerYTSpatialPlotHDF5(IOHandlerYTDataContainerHDF5): def _read_particle_coords(self, chunks, ptf): # This will read chunks and yield the results. - chunks = list(chunks) - data_files = set() - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): with h5py.File(data_file.filename, mode="r") as f: for ptype in sorted(ptf): pcount = data_file.total_particles[ptype] @@ -292,12 +282,7 @@ def _read_particle_coords(self, chunks, ptf): def _read_particle_fields(self, chunks, ptf, selector): # Now we have all the sizes, and we can allocate - chunks = list(chunks) - data_files = set() - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): all_count = self._count_particles(data_file) with h5py.File(data_file.filename, mode="r") as f: for ptype, field_list in sorted(ptf.items()): diff --git a/yt/utilities/io_handler.py b/yt/utilities/io_handler.py index 983941e720..ee0e6da909 100644 --- a/yt/utilities/io_handler.py +++ b/yt/utilities/io_handler.py @@ -224,31 +224,34 @@ def _read_particle_selection( def _read_particle_fields(self, chunks, ptf, selector): # Now we have all the sizes, and we can allocate - data_files = set() - for chunk in chunks: - for obj in chunk.objs: - data_files.update(obj.data_files) - for data_file in sorted(data_files, key=lambda x: (x.filename, x.start)): + for data_file in self._sorted_chunk_iterator(chunks): data_file_data = self._read_particle_data_file(data_file, ptf, selector) # temporary trickery so it's still an iterator, need to adjust # the io_handler.BaseIOHandler.read_particle_selection() method # to not use an iterator. yield from data_file_data.items() - -# As a note: we don't *actually* want this to be how it is forever. There's no -# reason we need to have the fluid and particle IO handlers separated. But, -# for keeping track of which frontend is which, this is a useful abstraction. -class BaseParticleIOHandler(BaseIOHandler): - def _sorted_chunk_iterator(self, chunks): + @staticmethod + def _get_data_files(chunks): chunks = list(chunks) data_files = set() for chunk in chunks: for obj in chunk.objs: data_files.update(obj.data_files) + return data_files + + def _sorted_chunk_iterator(self, chunks): + data_files = self._get_data_files(chunks) yield from sorted(data_files, key=lambda x: (x.filename, x.start)) +# As a note: we don't *actually* want this to be how it is forever. There's no +# reason we need to have the fluid and particle IO handlers separated. But, +# for keeping track of which frontend is which, this is a useful abstraction. 
+class BaseParticleIOHandler(BaseIOHandler): + pass + + class IOHandlerExtracted(BaseIOHandler): _dataset_type = "extracted" From c3784b0684b36cff693216dde153bcc6a8b6492e Mon Sep 17 00:00:00 2001 From: chrishavlin Date: Tue, 25 Jul 2023 09:13:06 -0500 Subject: [PATCH 2/2] move ptf halo validation to functions --- yt/frontends/ahf/io.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/yt/frontends/ahf/io.py b/yt/frontends/ahf/io.py index 374155e1ff..7fbb1e40e2 100644 --- a/yt/frontends/ahf/io.py +++ b/yt/frontends/ahf/io.py @@ -16,7 +16,10 @@ def _read_particle_coords(self, chunks, ptf): # This needs to *yield* a series of tuples of (ptype, (x, y, z), hsml). # chunks is a list of chunks, and ptf is a dict where the keys are # ptypes and the values are lists of fields. - self._validate_particle_ptf(ptf) + + # Only support halo reading for now. + assert len(ptf) == 1 + assert list(ptf.keys())[0] == "halos" for data_file in self._sorted_chunk_iterator(chunks): pos = data_file._get_particle_positions("halos") x, y, z = (pos[:, i] for i in range(3)) @@ -35,7 +38,9 @@ def _read_particle_fields(self, chunks, ptf, selector): # reading ptype, field and applying the selector to the data read in. # Selector objects have a .select_points(x,y,z) that returns a mask, so # you need to do your masking here. - self._validate_particle_ptf(ptf) + # Only support halo reading for now. + assert len(ptf) == 1 + assert list(ptf.keys())[0] == "halos" for data_file in self._sorted_chunk_iterator(chunks): si, ei = data_file.start, data_file.end cols = [] @@ -67,14 +72,6 @@ def _identify_fields(self, data_file): fields = [("halos", f) for f in data_file.col_names] return fields, {} - # Helper methods - - @staticmethod - def _validate_particle_ptf(ptf): - # Only support halo reading for now. - assert len(ptf) == 1 - assert list(ptf.keys())[0] == "halos" - def _sorted_chunk_iterator(self, chunks): # yield from sorted list of data_files data_files = self._get_data_files(chunks)
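
Note for readers skimming the diff: below is a minimal, self-contained sketch of the helper pattern these patches consolidate into BaseIOHandler. _get_data_files() gathers the unique data files referenced by a set of chunks, and _sorted_chunk_iterator() yields them in a deterministic order. The DataFile, Obj, Chunk, and IOHandlerSketch names and the sample filenames are stand-ins invented for illustration only; they are not part of the yt API.

    from operator import attrgetter
    from typing import NamedTuple

    class DataFile(NamedTuple):
        # hypothetical stand-in for a yt data-file record
        filename: str
        start: int
        end: int

    class Obj(NamedTuple):
        # hypothetical stand-in for a chunk object carrying .data_files
        data_files: tuple

    class Chunk(NamedTuple):
        # hypothetical stand-in for a yt index chunk
        objs: tuple

    class IOHandlerSketch:
        @staticmethod
        def _get_data_files(chunks):
            # collect the unique data files referenced by all chunk objects
            data_files = set()
            for chunk in chunks:
                for obj in chunk.objs:
                    data_files.update(obj.data_files)
            return data_files

        def _sorted_chunk_iterator(self, chunks):
            # yield data files in a deterministic (filename, start) order
            data_files = self._get_data_files(chunks)
            yield from sorted(data_files, key=lambda x: (x.filename, x.start))

    class SingleFileFrontendIOSketch(IOHandlerSketch):
        def _sorted_chunk_iterator(self, chunks):
            # frontends such as adaptahop and ahf override the iterator to
            # sort by filename alone, as in the hunks above
            data_files = self._get_data_files(chunks)
            yield from sorted(data_files, key=attrgetter("filename"))

    df_a = DataFile("halos_0001.h5", 0, 100)
    df_b = DataFile("halos_0000.h5", 0, 100)
    chunks = [Chunk((Obj((df_a, df_b)),)), Chunk((Obj((df_b,)),))]
    for df in IOHandlerSketch()._sorted_chunk_iterator(chunks):
        print(df.filename, df.start)  # halos_0000.h5 first, each file once

The set plus sort gives a reproducible read order regardless of how the index decomposed the chunks, which is why the deduplication and ordering live once in the base class instead of being repeated in every frontend.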