Intermediate output logging enablement - Inspector logging (#4293)

Summary: Pull Request resolved: #4293 This diff parses the logged intermediate outputs in etdump into Inspector objects. It pretty much automatically works because the infra has already been built out for non-delegate intermediate output logging. The only change needed here is to add delegate debug id to `InstructionEventSignature` and `DebugEventSignature` so they group delegated ops correctly. Design doc: https://docs.google.com/document/d/1qGHsgd-roqtxPz4CrUlqGrKaAtbaf9bArziMuwHD0So/edit Reviewed By: Jack-Khuu Differential Revision: D59840296 fbshipit-source-id: 04f22d4520584090f3b37b83386f704cc7a2c271
pytorch · Jul 18, 2024 · 4a88318 · 4a88318
1 parent 1b0bf1c
commit 4a88318
Show file tree

Hide file tree

Showing 6 changed files with 255 additions and 5 deletions.
diff --git a/sdk/etdump/schema_flatcc.py b/sdk/etdump/schema_flatcc.py
@@ -95,6 +95,8 @@ class Value:
 class DebugEvent:
     chain_index: int
     instruction_id: int
+    delegate_debug_id_int: Optional[int]
+    delegate_debug_id_str: Optional[str]
     debug_entry: Value
 
 

diff --git a/sdk/etdump/tests/serialize_test.py b/sdk/etdump/tests/serialize_test.py
@@ -85,6 +85,8 @@ def get_sample_etdump_flatcc() -> flatcc.ETDumpFlatCC:
                         debug_event=flatcc.DebugEvent(
                             chain_index=1,
                             instruction_id=0,
+                            delegate_debug_id_str="56",
+                            delegate_debug_id_int=-1,
                             debug_entry=flatcc.Value(
                                 val=flatcc.ValueType.TENSOR.value,
                                 tensor=flatcc.Tensor(

diff --git a/sdk/inspector/_inspector.py b/sdk/inspector/_inspector.py
@@ -65,6 +65,8 @@
 class InstructionEventSignature:
     instruction_id: int
     chain_index: int
+    delegate_id: Optional[int] = None
+    delegate_id_str: Optional[str] = None
 
 
 # Aggregated Runtime Events for a single instruction
@@ -92,8 +94,12 @@ def gen_from_events(run_events: List[flatcc.Event]) -> List["InstructionEvent"]:
 
             # Get existing InstructionEvent or insert a new one
             signature = InstructionEventSignature(
-                populated_event.instruction_id, populated_event.chain_index
+                instruction_id=populated_event.instruction_id,
+                chain_index=populated_event.chain_index,
+                delegate_id=populated_event.delegate_debug_id_int,
+                delegate_id_str=populated_event.delegate_debug_id_str,
             )
+
             instruction_event = instruction_events.setdefault(
                 signature, InstructionEvent(signature=signature)
             )
@@ -104,6 +110,7 @@ def gen_from_events(run_events: List[flatcc.Event]) -> List["InstructionEvent"]:
                     instruction_event.profile_events = []
                 instruction_event.profile_events.append(populated_event)
             elif isinstance(populated_event, DebugEvent):
+                # Ignore run_output events
                 if not is_debug_output(populated_event.debug_entry):
                     if instruction_event.debug_events is None:
                         instruction_event.debug_events = []
@@ -139,7 +146,9 @@ def _gen_from_event(event: ProfileEvent) -> "ProfileEventSignature":
 # Signature of a DebugEvent
 @dataclass(frozen=True, order=True)
 class DebugEventSignature:
-    instruction_id: Optional[int]
+    instruction_id: Optional[int] = -1
+    delegate_id: Optional[int] = None
+    delegate_id_str: Optional[str] = None
 
     @staticmethod
     def _gen_from_event(event: DebugEvent) -> "DebugEventSignature":
@@ -150,7 +159,9 @@ def _gen_from_event(event: DebugEvent) -> "DebugEventSignature":
         The Signature will convert these back to the intended None value
         """
         return DebugEventSignature(
-            event.instruction_id if event.instruction_id != -1 else None
+            event.instruction_id if event.instruction_id != -1 else None,
+            event.delegate_debug_id_int if event.delegate_debug_id_int != -1 else None,
+            event.delegate_debug_id_str if event.delegate_debug_id_str != "" else None,
         )
 
 
@@ -562,6 +573,7 @@ def _populate_debugging_related_fields(
         Fields Updated:
             debug_data
         """
+
         debug_data: List[flatcc.Value] = []
         for event in events:
             if (debug_events := event.debug_events) is None:

diff --git a/sdk/inspector/tests/event_blocks_test.py b/sdk/inspector/tests/event_blocks_test.py
@@ -58,6 +58,55 @@ def _gen_sample_profile_event(
             end_time=time[1],
         )
 
+    @staticmethod
+    def _gen_sample_debug_event(
+        instruction_id: int,
+        delegate_debug_id: Optional[Union[int, str]] = None,
+    ) -> flatcc.DebugEvent:
+        """
+        Helper for generating test DebugEvents
+
+        Notably:
+        - delegate_debug_id takes either the str or int representation
+        """
+        delegate_debug_id_int = (
+            delegate_debug_id if isinstance(delegate_debug_id, int) else -1
+        )
+        delegate_debug_id_str = (
+            delegate_debug_id if isinstance(delegate_debug_id, str) else ""
+        )
+
+        return flatcc.DebugEvent(
+            chain_index=0,
+            instruction_id=instruction_id,
+            delegate_debug_id_int=delegate_debug_id_int,
+            delegate_debug_id_str=delegate_debug_id_str,
+            debug_entry=flatcc.Value(
+                val=flatcc.ValueType.TENSOR.value,
+                tensor=flatcc.Tensor(
+                    scalar_type=flatcc.ScalarType.INT,
+                    sizes=[1],
+                    strides=[1],
+                    offset=12345,
+                ),
+                tensor_list=flatcc.TensorList(
+                    [
+                        flatcc.Tensor(
+                            scalar_type=flatcc.ScalarType.INT,
+                            sizes=[1],
+                            strides=[1],
+                            offset=12345,
+                        )
+                    ]
+                ),
+                int_value=flatcc.Int(1),
+                float_value=flatcc.Float(1.0),
+                double_value=flatcc.Double(1.0),
+                bool_value=flatcc.Bool(False),
+                output=None,
+            ),
+        )
+
     @staticmethod
     def _get_sample_etdump_flatcc() -> flatcc.ETDumpFlatCC:
         """
@@ -123,17 +172,145 @@ def _get_sample_etdump_flatcc() -> flatcc.ETDumpFlatCC:
 
         return ETDumpFlatCC(version=0, run_data=[run_data_1, run_data_2, run_data_3])
 
+    @staticmethod
+    def _get_sample_etdump_flatcc_inconsistent_debug_data() -> flatcc.ETDumpFlatCC:
+        debug_event_1 = TestEventBlock._gen_sample_debug_event(
+            instruction_id=1, delegate_debug_id=100
+        )
+        run_data_1 = flatcc.RunData(
+            name="signature_a",
+            bundled_input_index=-1,
+            allocators=[],
+            events=[
+                flatcc.Event(
+                    allocation_event=None,
+                    debug_event=debug_event_1,
+                    profile_event=None,
+                ),
+            ],
+        )
+
+        debug_event_2 = TestEventBlock._gen_sample_debug_event(
+            instruction_id=1, delegate_debug_id=100
+        )
+        # Modify this debug event so it's different from debug_event_1
+        debug_event_2.debug_entry.tensor.sizes = [2]  # pyre-ignore
+        run_data_2 = flatcc.RunData(
+            name="signature_a",
+            bundled_input_index=-1,
+            allocators=[],
+            events=[
+                flatcc.Event(
+                    allocation_event=None,
+                    debug_event=debug_event_2,
+                    profile_event=None,
+                ),
+            ],
+        )
+        return ETDumpFlatCC(version=0, run_data=[run_data_1, run_data_2])
+
+    @staticmethod
+    def _get_sample_etdump_flatcc_profiling_and_debugging() -> flatcc.ETDumpFlatCC:
+        """
+        Helper for getting a sample ETDumpFlatCC object with 3 RunData:
+        - run_data_1 has signature_a with (debug_event_1, profile_event_1)
+        - run_data_2 has the same signature with run_data_1 and same debug event, but different profiling times
+        - run_data_3 has signature_b with (debug_event_3, profile_event_3) and (not debug event, profile_event_4)
+        """
+        profile_event_1 = TestEventBlock._gen_sample_profile_event(
+            name="profile_1", instruction_id=1, time=(0, 1), delegate_debug_id=100
+        )
+        debug_event_1 = TestEventBlock._gen_sample_debug_event(
+            instruction_id=1, delegate_debug_id=100
+        )
+        run_data_1 = flatcc.RunData(
+            name="signature_a",
+            bundled_input_index=-1,
+            allocators=[],
+            events=[
+                flatcc.Event(
+                    allocation_event=None,
+                    debug_event=None,
+                    profile_event=profile_event_1,
+                ),
+                flatcc.Event(
+                    allocation_event=None,
+                    debug_event=debug_event_1,
+                    profile_event=None,
+                ),
+            ],
+        )
+
+        profile_event_2 = TestEventBlock._gen_sample_profile_event(
+            name="profile_1", instruction_id=1, time=(2, 4), delegate_debug_id=100
+        )
+        debug_event_2 = TestEventBlock._gen_sample_debug_event(
+            instruction_id=1, delegate_debug_id=100
+        )
+        run_data_2 = flatcc.RunData(
+            name="signature_a",
+            bundled_input_index=-1,
+            allocators=[],
+            events=[
+                flatcc.Event(
+                    allocation_event=None,
+                    debug_event=None,
+                    profile_event=profile_event_2,
+                ),
+                flatcc.Event(
+                    allocation_event=None,
+                    debug_event=debug_event_2,
+                    profile_event=None,
+                ),
+            ],
+        )
+
+        profile_event_3 = TestEventBlock._gen_sample_profile_event(
+            name="profile_3", instruction_id=1, time=(5, 6), delegate_debug_id=100
+        )
+        debug_event_3 = TestEventBlock._gen_sample_debug_event(
+            instruction_id=1, delegate_debug_id=100
+        )
+        profile_event_4 = TestEventBlock._gen_sample_profile_event(
+            name="profile_4", instruction_id=2, time=(7, 8), delegate_debug_id=100
+        )
+        run_data_3 = flatcc.RunData(
+            name="signature_b",
+            bundled_input_index=-1,
+            allocators=[],
+            events=[
+                flatcc.Event(
+                    allocation_event=None,
+                    debug_event=debug_event_3,
+                    profile_event=None,
+                ),
+                flatcc.Event(
+                    allocation_event=None,
+                    debug_event=None,
+                    profile_event=profile_event_3,
+                ),
+                flatcc.Event(
+                    allocation_event=None,
+                    debug_event=None,
+                    profile_event=profile_event_4,
+                ),
+            ],
+        )
+
+        return ETDumpFlatCC(version=0, run_data=[run_data_1, run_data_2, run_data_3])
+
     # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tests ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
     def test_gen_from_etdump(self) -> None:
         """
         Test "e2e" generation of EventBlocks given an ETDump
-            - Generated via EventBock.gen_from_etdump
+            - Generated via EventBlock.gen_from_etdump
 
         Specifically it tests for external correctness:
         - Correct number of EventBlocks
         - Correct number of Events and Raw Data values (iterations)
         """
+
         etdump: ETDumpFlatCC = TestEventBlock._get_sample_etdump_flatcc()
         blocks: List[EventBlock] = EventBlock._gen_from_etdump(etdump)
 
@@ -147,6 +324,52 @@ def test_gen_from_etdump(self) -> None:
                 run_counts.add((len(block.events), len(perf_data.raw)))
         self.assertSetEqual(run_counts, {(1, 2), (2, 1)})
 
+    def test_gen_from_etdump_profiling_and_debugging(self) -> None:
+        """
+        Test "e2e" generation of EventBlocks given an ETDump with both profiling and debugging events
+            - Generated via EventBlock.gen_from_etdump
+
+        Specifically it tests for external correctness:
+        - Correct number of EventBlocks
+        - Correct number of raw perf_data and debug_data for each Event
+        """
+        etdump: ETDumpFlatCC = (
+            TestEventBlock._get_sample_etdump_flatcc_profiling_and_debugging()
+        )
+        blocks: List[EventBlock] = EventBlock._gen_from_etdump(etdump)
+
+        self.assertEqual(len(blocks), 2, f"Expected 2 runs, got {len(blocks)}")
+
+        # One EventBlock should have 1 event with 2 iterations
+        # and 1 debug data (because we only populate debug data in the first iteration)
+        self.assertEqual(len(blocks[0].events), 1)
+        if (perf_data := blocks[0].events[0].perf_data) is not None:
+            self.assertEqual(len(perf_data.raw), 2)
+        self.assertEqual(len(blocks[0].events[0].debug_data), 1)
+
+        # The other EventBlock should have 2 events with 1 iterations, and only the fist event has debug data
+        self.assertEqual(len(blocks[1].events), 2)
+        perf_data = blocks[1].events[0].perf_data
+        self.assertIsNotNone(perf_data)
+        self.assertEqual(len(perf_data.raw), 1)
+
+        perf_data = blocks[1].events[1].perf_data
+        self.assertIsNotNone(perf_data)
+        self.assertEqual(len(perf_data.raw), 1)
+        self.assertEqual(len(blocks[1].events[0].debug_data), 1)
+        self.assertEqual(len(blocks[1].events[1].debug_data), 0)
+
+    def test_gen_from_etdump_inconsistent_debug_data(self) -> None:
+        """
+        Make sure AssertionError is thrown when intermediate outputs are different across
+        different iterations of a model run
+        """
+        etdump: ETDumpFlatCC = (
+            TestEventBlock._get_sample_etdump_flatcc_inconsistent_debug_data()
+        )
+        with self.assertRaises(AssertionError):
+            EventBlock._gen_from_etdump(etdump)
+
     def test_inspector_event_generation(self) -> None:
         """
         Test Inspector.Event derivation from various ProfileEvent cases
@@ -187,7 +410,8 @@ def _test_profile_event_generation(
             self.assertEqual(profile_signature, expected_signature)
 
             event_signature = EventSignature(
-                instruction_id=instruction_id, profile_event_signature=profile_signature
+                instruction_id=instruction_id,
+                profile_event_signature=profile_signature,
             )
 
             # Test Event Generation

diff --git a/sdk/inspector/tests/inspector_test.py b/sdk/inspector/tests/inspector_test.py
@@ -283,6 +283,8 @@ def test_populate_debugging_related_fields_raises_for_inconsistent_events(self):
         debug_event_0 = flatcc.DebugEvent(
             chain_index=1,
             instruction_id=0,
+            delegate_debug_id_int=1,
+            delegate_debug_id_str=None,
             debug_entry=flatcc.Value(
                 val=flatcc.ValueType.TENSOR.value,
                 tensor=flatcc.Tensor(
@@ -304,6 +306,8 @@ def test_populate_debugging_related_fields_raises_for_inconsistent_events(self):
         debug_event_1 = flatcc.DebugEvent(
             chain_index=1,
             instruction_id=0,
+            delegate_debug_id_int=1,
+            delegate_debug_id_str=None,
             debug_entry=flatcc.Value(
                 val=flatcc.ValueType.TENSOR.value,
                 tensor=flatcc.Tensor(
@@ -346,6 +350,8 @@ def test_populate_debugging_related_fields_passes_for_consistent_events(self):
         debug_event_0 = flatcc.DebugEvent(
             chain_index=1,
             instruction_id=0,
+            delegate_debug_id_int=1,
+            delegate_debug_id_str=None,
             debug_entry=flatcc.Value(
                 val=flatcc.ValueType.TENSOR.value,
                 tensor=flatcc.Tensor(
@@ -367,6 +373,8 @@ def test_populate_debugging_related_fields_passes_for_consistent_events(self):
         debug_event_1 = flatcc.DebugEvent(
             chain_index=1,
             instruction_id=0,
+            delegate_debug_id_int=1,
+            delegate_debug_id_str=None,
             debug_entry=flatcc.Value(
                 val=flatcc.ValueType.TENSOR.value,
                 tensor=flatcc.Tensor(

diff --git a/sdk/inspector/tests/inspector_utils_test.py b/sdk/inspector/tests/inspector_utils_test.py
@@ -76,6 +76,8 @@ def test_find_populated_event(self):
         debug_event = flatcc.DebugEvent(
             chain_index=1,
             instruction_id=0,
+            delegate_debug_id_str="56",
+            delegate_debug_id_int=-1,
             debug_entry=flatcc.Value(
                 val=flatcc.ValueType.TENSOR.value,
                 tensor=flatcc.Tensor(