diff --git a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py index dc944e68c7f55..7079d9678b2fd 100644 --- a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py +++ b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py @@ -51,8 +51,9 @@ def test_statistic_case1(self): profilerstep_node = HostPythonNode('ProfileStep#1', profiler.TracerEventType.ProfileStep, 0, 400, 1000, 1001) - dataloader_node = HostPythonNode( - 'Dataloader', profiler.TracerEventType.Forward, 5, 15, 1000, 1001) + dataloader_node = HostPythonNode('Dataloader', + profiler.TracerEventType.Dataloader, 5, + 15, 1000, 1001) mobilenet_node = HostPythonNode( 'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001) yolonet_node = HostPythonNode( @@ -155,7 +156,7 @@ def test_statistic_case1(self): profiler.TracerEventType.ProfileStep), 400) self.assertEqual( time_range_summary.get_cpu_range_sum( - profiler.TracerEventType.Forward), 100) + profiler.TracerEventType.Forward), 90) self.assertEqual( time_range_summary.get_cpu_range_sum( profiler.TracerEventType.Backward), 80) @@ -185,12 +186,12 @@ def test_statistic_case1(self): profiler.TracerEventType.Communication), 5) self.assertEqual(len(event_summary.items), 2) self.assertEqual(len(event_summary.userdefined_items), 1) - self.assertEqual(len(event_summary.model_perspective_items), 4) + self.assertEqual(len(event_summary.model_perspective_items), 5) self.assertEqual(len(event_summary.memory_manipulation_items), 1) self.assertEqual(event_summary.items['conv2d'].cpu_time, 15) self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 25) self.assertEqual( - event_summary.model_perspective_items['Forward'].cpu_time, 100) + event_summary.model_perspective_items['Forward'].cpu_time, 90) self.assertEqual( event_summary.model_perspective_items['Forward'].general_gpu_time, 135) @@ -217,8 +218,9 @@ def test_statistic_case2(self): profiler.TracerEventType.ProfileStep, 0, 400, 1000, 1001) - dataloader_node = HostPythonNode( - 'Dataloader', profiler.TracerEventType.Forward, 5, 15, 1000, 1001) + dataloader_node = HostPythonNode('Dataloader', + profiler.TracerEventType.Dataloader, 5, + 15, 1000, 1001) mobilenet_node = HostPythonNode( 'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001) @@ -372,7 +374,7 @@ def test_statistic_case2(self): profiler.TracerEventType.ProfileStep), 400) self.assertEqual( time_range_summary.get_cpu_range_sum( - profiler.TracerEventType.Forward), 100) + profiler.TracerEventType.Forward), 90) self.assertEqual( time_range_summary.get_cpu_range_sum( profiler.TracerEventType.Backward), 80) @@ -417,12 +419,12 @@ def test_statistic_case2(self): distributed_summary.overlap_range), 85) self.assertEqual(len(event_summary.items), 4) self.assertEqual(len(event_summary.userdefined_items), 1) - self.assertEqual(len(event_summary.model_perspective_items), 4) + self.assertEqual(len(event_summary.model_perspective_items), 5) self.assertEqual(len(event_summary.memory_manipulation_items), 1) self.assertEqual(event_summary.items['conv2d'].cpu_time, 15) self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 25) self.assertEqual( - event_summary.model_perspective_items['Forward'].cpu_time, 100) + event_summary.model_perspective_items['Forward'].cpu_time, 90) self.assertEqual( event_summary.model_perspective_items['Forward'].general_gpu_time, 315) @@ -441,6 +443,86 @@ def test_statistic_case2(self): thread_sep=False, time_unit='ms')) + def test_statistic_case3(self): + # for coverage, test all time is 0 + root_node = HostPythonNode('Root Node', + profiler.TracerEventType.UserDefined, 0, + float('inf'), 1000, 1001) + profilerstep_node = HostPythonNode('ProfileStep#1', + profiler.TracerEventType.ProfileStep, + 0, 400, 1000, 1001) + dataloader_node = HostPythonNode('Dataloader', + profiler.TracerEventType.Dataloader, 5, + 15, 1000, 1001) + mobilenet_node = HostPythonNode( + 'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001) + + backward_node = HostPythonNode('Gradient Backward', + profiler.TracerEventType.Backward, 120, + 200, 1000, 1001) + optimization_node = HostPythonNode( + 'Optimization', profiler.TracerEventType.Optimization, 220, 300, + 1000, 1001) + userdefined_node = HostPythonNode('Communication Time', + profiler.TracerEventType.UserDefined, + 60, 70, 1000, 1001) + + conv2d_node = HostPythonNode( + 'conv2d', profiler.TracerEventType.Operator, 25, 25, 1000, 1001) + + conv2d_infer_shape = HostPythonNode( + 'conv2d::infer_shape', profiler.TracerEventType.OperatorInner, 25, + 25, 1000, 1001) + conv2d_compute = HostPythonNode('conv2d::compute', + profiler.TracerEventType.OperatorInner, + 25, 25, 1000, 1001) + conv2d_launchkernel = HostPythonNode( + 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 25, 25, + 1000, 1001) + + conv2d_kernel = DevicePythonNode( + 'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 35, 0, 0, 0) + another_kernel = DevicePythonNode( + 'void phi::funcs::VectorizedBroadcastKernel, phi::funcs::AddFunctor>()', + profiler.TracerEventType.Kernel, 35, 35, 0, 0, 0) + root_node.children_node.append(profilerstep_node) + profilerstep_node.children_node.extend([ + dataloader_node, mobilenet_node, userdefined_node, backward_node, + optimization_node + ]) + mobilenet_node.children_node.append(conv2d_node) + conv2d_node.children_node.extend([conv2d_infer_shape, conv2d_compute]) + conv2d_compute.runtime_node.append(conv2d_launchkernel) + conv2d_launchkernel.device_node.append(conv2d_kernel) + conv2d_launchkernel.device_node.append(another_kernel) + thread_tree = {'thread1001': root_node} + extra_info = { + 'Process Cpu Utilization': '1.02', + 'System Cpu Utilization': '0.68' + } + statistic_data = profiler.profiler_statistic.StatisticData(thread_tree, + extra_info) + time_range_summary = statistic_data.time_range_summary + event_summary = statistic_data.event_summary + + self.assertEqual(event_summary.items['conv2d'].cpu_time, 0) + self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 0) + self.assertEqual(event_summary.userdefined_items['Communication Time'] + .general_gpu_time, 0) + for sort_key in [ + profiler.SortedKeys.CPUTotal, profiler.SortedKeys.CPUMax, + profiler.SortedKeys.CPUMin, profiler.SortedKeys.CPUAvg, + profiler.SortedKeys.GPUTotal, profiler.SortedKeys.GPUMax, + profiler.SortedKeys.GPUMin, profiler.SortedKeys.GPUAvg + ]: + print( + profiler.profiler_statistic._build_table( + statistic_data, + sorted_by=sort_key, + op_detail=True, + thread_sep=False, + time_unit='ms')) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py index 7465a8e80ff8e..422dbe4ce359f 100755 --- a/python/paddle/profiler/profiler_statistic.py +++ b/python/paddle/profiler/profiler_statistic.py @@ -78,15 +78,19 @@ def __init__(self, hostnode): self.self_gpu_time = 0 self.general_gpu_time = 0 # besides kernel, include time of gpu events like memcpy and memset self.self_general_gpu_time = 0 + self.is_terminal_operator_node = True def cal_statistic(self): for child in self.children_node: child.cal_statistic() + if child.is_terminal_operator_node == False: + self.is_terminal_operator_node = False for rt in self.runtime_node: rt.cal_statistic() - self.cpu_time = self.hostnode.end_ns - self.hostnode.start_ns for child in self.children_node: + if child.type == TracerEventType.Operator: + self.is_terminal_operator_node = False self.gpu_time += child.gpu_time self.general_gpu_time += child.general_gpu_time self.self_cpu_time -= (child.end_ns - child.start_ns) @@ -419,10 +423,10 @@ def add_item(self, node): for runtimenode in node.runtime_node: for devicenode in runtimenode.device_node: - if devicenode.name not in self.devices: - self.devices[devicenode.name] = EventSummary.DeviceItem( - devicenode.name) - self.devices[devicenode.name].add_item(devicenode) + name = devicenode.name + if name not in self.devices: + self.devices[name] = EventSummary.DeviceItem(name) + self.devices[name].add_item(devicenode) class GeneralItem: def __init__(self, name): @@ -489,6 +493,7 @@ def __init__(self): dict) # for userdefined summary self.model_perspective_items = {} # for model summary self.memory_manipulation_items = {} # for memory manipulation summary + self.kernel_items = {} # for kernel summary def parse(self, nodetrees): r""" @@ -508,6 +513,7 @@ def parse(self, nodetrees): self.add_memory_manipulation_item(host_statistic_node) else: self.add_userdefined_item(host_statistic_node) + self.add_kernel_item(host_statistic_nodes[0]) for threadid, root_statistic_node in node_statistic_trees.items(): deque = collections.deque() @@ -525,11 +531,7 @@ def parse(self, nodetrees): deque.append(child) def add_operator_item(self, operator_node): - have_inner = False - for child in operator_node.children_node: - if child.type == TracerEventType.OperatorInner: - have_inner = True - if have_inner == False: + if operator_node.is_terminal_operator_node == False: return if operator_node.name not in self.items: self.items[operator_node.name] = EventSummary.OperatorItem( @@ -585,6 +587,15 @@ def add_model_perspective_item(self, model_perspective_node): self.model_perspective_items[name] = EventSummary.GeneralItem(name) self.model_perspective_items[name].add_item(model_perspective_node) + def add_kernel_item(self, root_node): + device_nodes = get_device_nodes(root_node) + for device_node in device_nodes: + if device_node.type == TracerEventType.Kernel: + name = device_node.name + if name not in self.kernel_items: + self.kernel_items[name] = EventSummary.DeviceItem(name) + self.kernel_items[name].add_item(device_node) + class StatisticData: r""" @@ -752,6 +763,9 @@ def format_ratio(ratio, indent=0): cpu_call_times[ event_type] = statistic_data.event_summary.model_perspective_items[ event_type_name].call + cpu_type_time[ + event_type] = statistic_data.event_summary.model_perspective_items[ + event_type_name].cpu_time gpu_time_range = collections.defaultdict(list) for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items( @@ -800,7 +814,6 @@ def format_ratio(ratio, indent=0): append( "Note:\nIn this table, We sum up all collected events in terms of event type.\n" "The time of events collected on host are presented as CPU Time, and as GPU Time if on device.\n" - "The time with ratio 100% is the base time for calculating ratio. \n" "Events with different types may overlap or inclusion, e.g. Operator includes OperatorInner, so the sum of ratios is not 100%.\n" "The time of events in the same type with overlap will not calculate twice, and all time is summed after merged.\n" "Example:\n" @@ -820,13 +833,18 @@ def format_ratio(ratio, indent=0): all_row_values = [] accmulation_time = 0 gpu_accmulation_time = 0 - gpu_total_time = 0 + gpu_total_time = statistic_data.event_summary.model_perspective_items[ + 'ProfileStep'].general_gpu_time for name in [ 'ProfileStep', 'Dataloader', 'Forward', 'Backward', 'Optimization' ]: if name in model_perspective_items: item = model_perspective_items[name] + if gpu_total_time == 0: + gpu_ratio = 0 + else: + gpu_ratio = float(item.general_gpu_time) / gpu_total_time name = '{}'.format( name) if 'ProfileStep' in name else ' {}'.format(name) row_values = [ @@ -850,17 +868,19 @@ def format_ratio(ratio, indent=0): item.max_gpu_time, unit=time_unit), format_time( item.min_gpu_time, unit=time_unit), - format_ratio(float(item.gpu_time) / total_time)) + format_ratio(gpu_ratio)) ] all_row_values.append(row_values) if 'ProfileStep' not in name: accmulation_time += item.cpu_time - gpu_accmulation_time += item.gpu_time - else: - gpu_total_time = item.gpu_time + gpu_accmulation_time += item.general_gpu_time other_time = total_time - accmulation_time other_gpu_time = gpu_total_time - gpu_accmulation_time + if gpu_total_time == 0: + gpu_ratio = 0 + else: + gpu_ratio = float(other_gpu_time) / gpu_total_time row_values = [ ' Others', '-', '{} / - / - / - / {}'.format( format_time( @@ -869,7 +889,7 @@ def format_ratio(ratio, indent=0): '{} / - / - / - / {}'.format( format_time( other_gpu_time, unit=time_unit), - format_ratio(float(other_gpu_time) / total_time)) + format_ratio(gpu_ratio)) ] all_row_values.append(row_values) # Calculate the column width @@ -913,7 +933,6 @@ def format_ratio(ratio, indent=0): append( "Note:\nIn this table, GPU time is the sum of all device(GPU) events called in the phase.\n" "Unlike overview summary, if two device(GPU) events execute on different streams with overlap time, we sum them directly here.\n" - "The time with ratio 100% is the base time for calculating ratio. \n" ) append('-' * line_length) append('') @@ -981,7 +1000,6 @@ def format_ratio(ratio, indent=0): "Note:\nCommunication time: Communication Event time, Communication Op time and its kernel time on gpu.\n" "Computation time: Kernel time, except kernels belong to communication(nccl kernels).\n" "Overlap time: Communication time intersects with computation time.\n" - "The time with ratio 100% is the base time for calculating ratio. \n" "Example:\n" "Communication:\n" " CPU: |_________________|\n" @@ -1040,8 +1058,22 @@ def format_ratio(ratio, indent=0): elif sorted_by == SortedKeys.GPUMin: sorted_items = sorted( items.items(), key=lambda x: x[1].min_general_gpu_time) + total_op_cpu_time = 0 + total_op_gpu_time = 0 + + for name, item in sorted_items: + total_op_cpu_time += item.cpu_time + total_op_gpu_time += item.general_gpu_time for name, item in sorted_items: + if total_op_cpu_time == 0: + cpu_ratio = 0 + else: + cpu_ratio = float(item.cpu_time) / total_op_cpu_time + if total_op_gpu_time == 0: + gpu_ratio = 0 + else: + gpu_ratio = float(item.general_gpu_time) / total_op_gpu_time row_values = [ name, item.call, '{} / {} / {} / {} / {}'.format( format_time( @@ -1052,7 +1084,7 @@ def format_ratio(ratio, indent=0): item.max_cpu_time, unit=time_unit), format_time( item.min_cpu_time, unit=time_unit), - format_ratio(float(item.cpu_time) / total_time)), + format_ratio(cpu_ratio)), '{} / {} / {} / {} / {}'.format( format_time( item.general_gpu_time, unit=time_unit), @@ -1062,13 +1094,22 @@ def format_ratio(ratio, indent=0): item.max_general_gpu_time, unit=time_unit), format_time( item.min_general_gpu_time, unit=time_unit), - format_ratio( - float(item.general_gpu_time) / total_time)) + format_ratio(gpu_ratio)) ] all_row_values.append(row_values) if op_detail: for innerop_name, innerop_node in item.operator_inners.items( ): + if item.cpu_time == 0: + cpu_ratio = 0 + else: + cpu_ratio = float( + innerop_node.cpu_time) / item.cpu_time + if item.general_gpu_time == 0: + gpu_ratio = 0 + else: + gpu_ratio = float(innerop_node.general_gpu_time + ) / item.general_gpu_time if len(innerop_name) + 2 > name_column_width: innerop_name = innerop_name[:name_column_width - 5] innerop_name += "..." @@ -1083,8 +1124,7 @@ def format_ratio(ratio, indent=0): innerop_node.max_cpu_time, unit=time_unit), format_time( innerop_node.min_cpu_time, unit=time_unit), - format_ratio( - float(innerop_node.cpu_time) / total_time)), + format_ratio(cpu_ratio)), '{} / {} / {} / {} / {}'.format( format_time( innerop_node.general_gpu_time, @@ -1098,13 +1138,17 @@ def format_ratio(ratio, indent=0): format_time( innerop_node.min_general_gpu_time, unit=time_unit), - format_ratio( - float(innerop_node.general_gpu_time) / - total_time)) + format_ratio(gpu_ratio)) ] all_row_values.append(row_values) for device_node_name, device_node in innerop_node.devices.items( ): + if innerop_node.general_gpu_time == 0: + gpu_ratio = 0 + else: + gpu_ratio = float( + device_node. + gpu_time) / innerop_node.general_gpu_time if len(device_node_name) + 4 > name_column_width: device_node_name = device_node_name[: name_column_width @@ -1125,12 +1169,15 @@ def format_ratio(ratio, indent=0): format_time( device_node.min_gpu_time, unit=time_unit), - format_ratio( - float(device_node.gpu_time) / - total_time)) + format_ratio(gpu_ratio)) ] all_row_values.append(row_values) for device_node_name, device_node in item.devices.items(): + if item.general_gpu_time == 0: + gpu_ratio = 0 + else: + gpu_ratio = float( + device_node.gpu_time) / item.general_gpu_time if len(device_node_name) + 2 > name_column_width: device_node_name = device_node_name[: name_column_width @@ -1148,8 +1195,7 @@ def format_ratio(ratio, indent=0): device_node.max_gpu_time, unit=time_unit), format_time( device_node.min_gpu_time, unit=time_unit), - format_ratio( - float(device_node.gpu_time) / total_time)) + format_ratio(gpu_ratio)) ] all_row_values.append(row_values) # Calculate the column width @@ -1197,11 +1243,106 @@ def format_ratio(ratio, indent=0): append('') append('') + ###### Print Kernel Summary Report ###### + if statistic_data.event_summary.kernel_items: + all_row_values = [] + kernel_items = statistic_data.event_summary.kernel_items + if sorted_by == SortedKeys.GPUAvg: + sorted_items = sorted( + kernel_items.items(), + key=lambda x: x[1].avg_gpu_time, + reverse=True) + elif sorted_by == SortedKeys.GPUMax: + sorted_items = sorted( + kernel_items.items(), + key=lambda x: x[1].max_gpu_time, + reverse=True) + elif sorted_by == SortedKeys.GPUMin: + sorted_items = sorted( + kernel_items.items(), key=lambda x: x[1].min_gpu_time) + else: + sorted_items = sorted( + kernel_items.items(), key=lambda x: x[1].gpu_time, reverse=True) + + total_kernel_gpu_time = 0 + for name, item in sorted_items: + total_kernel_gpu_time += item.gpu_time + for name, item in sorted_items: + if total_kernel_gpu_time == 0: + gpu_ratio = 0 + else: + gpu_ratio = float(item.gpu_time) / total_kernel_gpu_time + row_values = [ + name, + item.call, + '{} / {} / {} / {} / {}'.format( + format_time( + item.gpu_time, unit=time_unit), + format_time( + item.avg_gpu_time, unit=time_unit), + format_time( + item.max_gpu_time, unit=time_unit), + format_time( + item.min_gpu_time, unit=time_unit), + format_ratio(gpu_ratio)), + ] + all_row_values.append(row_values) + + headers = ['Name', 'Calls', 'GPU Total / Avg / Max / Min / Ratio(%)'] + # Calculate the column width + name_column_width = 90 + calltime_width = 6 + gpu_data_description_width = 40 + for row_values in all_row_values: + if isinstance(row_values[1], + int) and len(str(row_values[1])) > calltime_width: + calltime_width = len(str(row_values[1])) + if len(row_values[2]) > gpu_data_description_width: + gpu_data_description_width = len(row_values[2]) + + row_format_list = [""] + header_sep_list = [""] + line_length_list = [-SPACING_SIZE] + add_column(name_column_width) + add_column(calltime_width) + add_column(gpu_data_description_width) + + row_format = row_format_list[0] + header_sep = header_sep_list[0] + line_length = line_length_list[0] + + # construct table string + append(add_title(line_length, "Kernel Summary")) + append('Time unit: {}'.format(time_unit)) + append(header_sep) + append(row_format.format(*headers)) + append(header_sep) + for row_values in all_row_values: + indx = row_values[0].find('(') + if indx != -1: + name = row_values[0][:indx] + else: + name = row_values[0] + if len(name) > name_column_width: + row_values[0] = name[:name_column_width - 3] + '...' + else: + row_values[0] = name + append(row_format.format(*row_values)) + append(header_sep) + append('') + append('') + ###### Print Memory Manipulation Summary Report ###### if statistic_data.event_summary.memory_manipulation_items: all_row_values = [] memory_manipulation_items = statistic_data.event_summary.memory_manipulation_items + gpu_total_time = statistic_data.event_summary.model_perspective_items[ + 'ProfileStep'].general_gpu_time for name, item in memory_manipulation_items.items(): + if gpu_total_time == 0: + gpu_ratio = 0 + else: + gpu_ratio = float(item.general_gpu_time) / gpu_total_time row_values = [ name, item.call, @@ -1224,7 +1365,7 @@ def format_ratio(ratio, indent=0): item.max_general_gpu_time, unit=time_unit), format_time( item.min_general_gpu_time, unit=time_unit), - format_ratio(float(item.general_gpu_time) / total_time)), + format_ratio(gpu_ratio)), ] all_row_values.append(row_values) @@ -1274,6 +1415,8 @@ def format_ratio(ratio, indent=0): ###### Print UserDefined Summary Report ###### if statistic_data.event_summary.userdefined_items: all_row_values = [] + gpu_total_time = statistic_data.event_summary.model_perspective_items[ + 'ProfileStep'].general_gpu_time if thread_sep == True: userdefined_thread_items = statistic_data.event_summary.userdefined_thread_items else: @@ -1319,6 +1462,10 @@ def format_ratio(ratio, indent=0): items.items(), key=lambda x: x[1].min_general_gpu_time) for name, item in sorted_items: + if gpu_total_time == 0: + gpu_ratio = 0 + else: + gpu_ratio = float(item.general_gpu_time) / gpu_total_time row_values = [ name, item.call, @@ -1341,8 +1488,7 @@ def format_ratio(ratio, indent=0): item.max_general_gpu_time, unit=time_unit), format_time( item.min_general_gpu_time, unit=time_unit), - format_ratio( - float(item.general_gpu_time) / total_time)), + format_ratio(gpu_ratio)), ] all_row_values.append(row_values)