Aligning Benchmarking and PyBuda repositories.
vcanicTT committed Apr 26, 2024
1 parent 8251137 commit 1839b59
Showing 23 changed files with 407 additions and 199 deletions.
36 changes: 35 additions & 1 deletion benchmark.py
@@ -57,6 +57,32 @@
logger.setLevel(logging.INFO)


def print_benchmark_envs():

    # If requested via an environment variable, print all environment variables.
    # It can be useful in CI jobs to get the state of the environment variables before the test session starts.
    print_env_variables = bool(int(os.environ.get("PYTEST_PRINT_ENV_VARIABLES", "0")))
    if print_env_variables:
        pybuda_specific_vars = {}
        tt_backend_specific_vars = {}
        print(f"####### Environment variables - Count: {len(os.environ)} #######")
        for key, value in os.environ.items():
            print(f"{key}={value}")
            if key.startswith("PYBUDA_") or key.startswith("GOLDEN_"):
                pybuda_specific_vars[key] = value
            elif key.startswith("TT_BACKEND_"):
                tt_backend_specific_vars[key] = value

        print(f"####### PYBUDA specific environment variables - Count: {len(pybuda_specific_vars)} #######")
        for key, value in pybuda_specific_vars.items():
            print(f"{key}={value}")

        print(f"####### TT_BACKEND specific environment variables - Count: {len(tt_backend_specific_vars)} #######")
        for key, value in tt_backend_specific_vars.items():
            print(f"{key}={value}")



def run(
    args,
    model: Any,
@@ -169,14 +195,19 @@ def run(
print(f"Pybuda successfully compiled model to: {args.save_tti}")
exit(0)

if "verify_cfg" in model.keys():
verify_cfg = model["verify_cfg"]
else:
verify_cfg = pybuda.verify.VerifyConfig(verify_pybuda_codegen_vs_framework=True)

# Compilation run
monitor_thread = threading.Thread(target=benchmark_run.cpu_usage_monitor)
monitor_thread.start()
benchmark_run.start_compilation_timer()
output_q = pybuda.initialize_pipeline(
training=args.training,
sample_inputs=sample_inputs,
_verify_cfg=pybuda.verify.VerifyConfig(verify_pybuda_codegen_vs_framework=True),
_verify_cfg=verify_cfg,
sample_targets=targets,
)
benchmark_run.stop_monitoring = True
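
With this change a model entry can carry its own verification settings instead of the previously hard-coded codegen-vs-framework check. A sketch of what such an entry might look like (the "verify_cfg" key is from this commit; the dict shape and the module object are assumptions):

    model = {
        "model": pybuda_module,  # hypothetical object built earlier by the model function
        "verify_cfg": pybuda.verify.VerifyConfig(verify_pybuda_codegen_vs_framework=False),
    }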
@@ -330,6 +361,9 @@ def pop_outputs_thread(output_q):


if __name__ == "__main__":

    print_benchmark_envs()

    # Arguments
    parser = argparse.ArgumentParser(description="Benchmark a model on TT hardware")
    parser.add_argument("-m", "--model", help="Model to benchmark (i.e. bert)")
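
For reference, the model selector is the only argument visible in this hunk; a quick sketch of the resulting parse (any other flags are elided in the diff and not reproduced here):

    args = parser.parse_args(["-m", "bert"])
    assert args.model == "bert"  # argparse derives the attribute name from "--model"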
13 changes: 6 additions & 7 deletions benchmark/models/bert/bert.py
@@ -27,17 +27,16 @@ def bert(training: bool, task: str, config: str, microbatch: int, device: str, d
os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "1"
os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "1"
os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1"
os.environ["PYBUDA_EXP_APPROX"] = "1"
if data_type == "Bfp8_b":
if pybuda.detect_available_devices()[0] != BackendDevice.Grayskull:
os.environ["PYBUDA_FORK_JOIN_BUF_QUEUES"] = "1"
os.environ["PYBUDA_EXP_APPROX"] = "1"
pybuda.config.configure_mixed_precision(op_type="add", output_df=pybuda.DataFormat.Float16_b)
pybuda.config.configure_mixed_precision(op_type="subtract", output_df=pybuda.DataFormat.Float16_b)
pybuda.config.configure_mixed_precision(op_type="reciprocal", output_df=pybuda.DataFormat.Float16_b)

available_devices = pybuda.detect_available_devices()
if available_devices[0] == BackendDevice.Grayskull:
os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{18*1024}"
if config == "large":
pybuda.config.override_op_size("gelu_103", (3, 1))
if data_type == "Fp16_b":
os.environ["PYBUDA_ENABLE_HOST_INPUT_NOP_BUFFERING"] = "1" #overlay blob issue on bfp8
os.environ["PYBUDA_RIBBON2_OPTIMIZATION_ITERATIONS"] = "10"

# Set model parameters based on chosen task and model configuration
if task == "na":
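
A small aside on the Grayskull branch above: the f-string just bakes the arithmetic into a decimal string before it lands in the environment.

    assert f"{18*1024}" == "18432"  # 18 KiB of extra overlay blob budget, as a plain string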
22 changes: 13 additions & 9 deletions benchmark/models/deit/deit.py
@@ -24,18 +24,22 @@ def deit(training: bool, task: str, config: str, microbatch: int, device: str, d
        compiler_cfg.balancer_policy = "Ribbon"
        os.environ["PYBUDA_RIBBON2"] = "1"

    # These are about to be enabled by default.
    #
    if data_type != "Bfp8_b":
        os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "1"
        os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "1"
        os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1"
        os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "1"
    # These are about to be enabled by default.
    #
    os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "1"
    os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "1"
    os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "1"
    os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1"
    os.environ["PYBUDA_RIBBON2_CONSERVATIVE_OPTIMIZATION_ITERATIONS"] = "10"

    if data_type == "Fp16_b":
        os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES_APPLY_FILTERING"] = "1"
        os.environ["PYBUDA_TEMP_DISABLE_MODEL_KB_PROLOGUE_BW"] = "1"

    if data_type == "Bfp8_b":
        os.environ["PYBUDA_FORK_JOIN_BUF_QUEUES"] = "1"
        pybuda.config.configure_mixed_precision(op_type="reciprocal", output_df=pybuda.DataFormat.Float16_b)
        os.environ["PYBUDA_RIBBON2_OPTIMIZATION_ITERATIONS"] = "10"
        os.environ["PYBUDA_TEMP_BALANCER_MODEL_PCIE_BW"] = "0"
        os.environ["PYBUDA_FUSE_DF_OVERRIDE"] = "0"

    # Set model parameters based on chosen task and model configuration
    img_res = 224
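
Note that these overrides clobber any values already present in the caller's environment. Where a user-supplied setting should win, a setdefault-style guard is the usual alternative (the hrnet file below uses an explicit membership check for the same purpose); this line is a sketch, not code from the repo:

    os.environ.setdefault("PYBUDA_RIBBON2_CONSERVATIVE_OPTIMIZATION_ITERATIONS", "10")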
15 changes: 15 additions & 0 deletions benchmark/models/falcon/falcon.py
@@ -22,6 +22,21 @@ def falcon(
    training: bool, task: str, config: str, microbatch: int, device: str, data_type: str, benchmark_run: BenchmarkRun
):

    import os
    import pybuda
    compiler_cfg = pybuda.config._get_global_compiler_config()

    if compiler_cfg.balancer_policy == "default":
        compiler_cfg.balancer_policy = "Ribbon"
        os.environ["PYBUDA_RIBBON2"] = "1"

    os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "1"
    os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "1"
    os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1"
    os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "1"
    os.environ["PYBUDA_EXP_APPROX"] = "1"
    os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "233472"

    # Set model parameters based on chosen task and model configuration
    if task in ["na", "hellaswag", "text_summarization", "alpacaeval"]:
        if config == "7b":
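
Unlike the other files, falcon hard-codes the overlay blob budget as a literal. For the record, the value is the same KiB-style arithmetic written out:

    assert 233472 == 228 * 1024  # a 228 KiB budget, versus bert's 18 KiB above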
2 changes: 1 addition & 1 deletion benchmark/models/falcon/utils/pybudify.py
@@ -381,7 +381,7 @@ def __init__(

        pybuda_arch = {
            "grayskull": pybuda.BackendDevice.Grayskull,
-           "wormhole": pybuda.BackendDevice.Wormhole,
+           # "wormhole": pybuda.BackendDevice.Wormhole,
            "wormhole_b0": pybuda.BackendDevice.Wormhole_B0,
        }[arch]

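
With the "wormhole" entry commented out, the bare dict lookup now raises a plain KeyError for arch="wormhole". A sketch of a friendlier failure mode, assuming only the two remaining entries (not part of this commit):

    supported = {
        "grayskull": pybuda.BackendDevice.Grayskull,
        "wormhole_b0": pybuda.BackendDevice.Wormhole_B0,
    }
    if arch not in supported:
        raise ValueError(f"Unsupported arch {arch!r}; expected one of {sorted(supported)}")
    pybuda_arch = supported[arch]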
44 changes: 35 additions & 9 deletions benchmark/models/flant5/flant5_past_cache_enc_dec.py
@@ -61,28 +61,54 @@ def flant5_past_cache_enc_dec(training: bool, task: str, config: str, microbatch
    import pybuda
    from pybuda.pybudaglobal import TILE_DIM

    # Add PyBUDA configurations
    # ---------------------------------------------------------------------------------------- #
    # flan-T5, START
    # ---------------------------------------------------------------------------------------- #

    compiler_cfg = pybuda.config._get_global_compiler_config()

    if compiler_cfg.balancer_policy == "default":
        compiler_cfg.balancer_policy = "Ribbon"
        os.environ["PYBUDA_RIBBON2"] = "1"

    # These are about to be enabled by default.
    #
    os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "1"
    os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "1"
    os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1"
    os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "1"
    os.environ["PYBUDA_EXP_APPROX"] = "1"

    # ---------------------------------------------------------------------------------------- #
    # flan-T5, END
    # ---------------------------------------------------------------------------------------- #

    # ---------------------------------------------------------------------------------------- #
    # Generate T5 past cache encoder-decoder, START
    # ---------------------------------------------------------------------------------------- #

    # T5 past cache encoder-decoder overrides (I)
    # Flags
    os.environ["PYBUDA_PAD_OUTPUT_BUFFER"] = "1"
    os.environ["TT_BACKEND_MULTI_THREADED_PUSH"] = "1"
    os.environ["PYBUDA_DISABLE_DYNAMIC_DRAM"] = "1"
    os.environ["PYBUDA_EXTRA_L1_MARGIN"] = "120000"
    os.environ["PYBUDA_FORCE_SEQUENTIAL"] = "1"
    os.environ["PYBUDA_NLP_MANUAL_TARGET"] = "26000"
    os.environ["TT_BACKEND_DRAM_POLLING_FREQUENCY"] = "64"
    os.environ["TT_BACKEND_PROFILER"] = "1"
    os.environ["TT_BACKEND_EPOCH_BIN_NUM_SLOTS"] = "64"
    os.environ["PYBUDA_ROTATE_PAST_CACHE_PARAMS"] = "1"

    # Compiler configurations
    compiler_cfg = pybuda.config._get_global_compiler_config()
    compiler_cfg.enable_t_streaming = True
    compiler_cfg.enable_tvm_cpu_fallback = False
    compiler_cfg.default_df_override = pybuda._C.Float16_b
    compiler_cfg.default_dram_parameters = False
    compiler_cfg.input_queues_on_host = True
    compiler_cfg.enable_amp_light()
    compiler_cfg.compile_subgraphs = True
    compiler_cfg.enable_link_past_cache_ios = True

    # ---------------------------------------------------------------------------------------- #
    # Generate T5 past cache encoder-decoder, END
    # ---------------------------------------------------------------------------------------- #
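
The hunk fetches _get_global_compiler_config() twice, once in the flan-T5 block and again under "Compiler configurations". Judging by the name it returns a process-wide global, so the second fetch should be redundant but harmless; a sketch of that assumption:

    cfg_a = pybuda.config._get_global_compiler_config()
    cfg_b = pybuda.config._get_global_compiler_config()
    assert cfg_a is cfg_b  # assumption: a shared global object, not a fresh copy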

    # Set model parameters based on chosen task and model configuration
    if task in ["na", "text_classification", "text_summarization"]:
        if config == "small":
40 changes: 22 additions & 18 deletions benchmark/models/hrnet/hrnet.py
@@ -34,34 +34,30 @@ def hrnet(training: bool, task: str, config: str, microbatch: int, device: str,
    from pybuda._C.backend_api import BackendDevice

    compiler_cfg = pybuda.config._get_global_compiler_config()
    compiler_cfg.enable_auto_transposing_placement = True

    if compiler_cfg.balancer_policy == "default":
        compiler_cfg.balancer_policy = "Ribbon"
        os.environ["PYBUDA_RIBBON2"] = "1"
        os.environ["PYBUDA_RIBBON2_OPTIMIZATION_ITERATIONS"] = "10"
        os.environ["PYBUDA_SUPRESS_T_FACTOR_MM"] = "46"
        os.environ["PYBUDA_LEGACY_KERNEL_BROADCAST"] = "1"

    if data_type != "Bfp8_b":
        os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "1"
        os.environ["PYBUDA_RIBBON2_DISABLE_NON_MATMUL_UTIL"] = "1"
        os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "1"
        os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1"
        os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "1"
        os.environ["PYBUDA_SUPRESS_T_FACTOR_MM"] = "46"  # removing causes hang #2139
        os.environ["PYBUDA_ENABLE_HOST_INPUT_NOP_BUFFERING"] = "1"

    # These are about to be enabled by default.
    #
    os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "1"
    os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "1"
    os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1"
    os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "1"

    if data_type == "Fp16_b":
        # Hangs with autotranspose on #2542
        compiler_cfg.enable_auto_transposing_placement = False
        os.environ["PYBUDA_RIBBON2_OPTIMIZATION_ITERATIONS"] = "10"

    # Manually enable amp light for Ribbon
    if compiler_cfg.balancer_policy == "Ribbon":
        compiler_cfg.enable_amp_light()

    if config == "v2_w64":
        if "TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE" not in os.environ:
            os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{10*1024}"
        available_devices = pybuda.detect_available_devices()
        if available_devices:
            if available_devices[0] == BackendDevice.Grayskull:
                pybuda.config.set_epoch_break("add_618")
                pybuda.config.insert_buffering_nop("add_442", ["add_471"], nop_count=20)

    # Set model parameters based on chosen task and model configuration
    img_res = 224
    target_microbatch = 32
@@ -82,6 +78,14 @@
model_name = "hrnetv2_w48"
elif config == "v2_w64":
model_name = "hrnetv2_w64"
model_name = "hrnetv2_w64"
if data_type == "Bfp8_b":
if "TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE" not in os.environ:
os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{10*1024}"
available_devices = pybuda.detect_available_devices()
if available_devices:
if available_devices[0] == BackendDevice.Grayskull:
pybuda.config._internal_insert_fj_buffering_nop('add_312', ['add_341'], nop_count=2)
else:
raise RuntimeError("Unknown config")

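
The config-to-model-name chain above grows by one elif per variant. A table-driven equivalent, shown only with the names visible in this hunk (the real chain has more branches):

    names = {"v2_w48": "hrnetv2_w48", "v2_w64": "hrnetv2_w64"}  # partial, for illustration
    try:
        model_name = names[config]
    except KeyError:
        raise RuntimeError("Unknown config")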
16 changes: 10 additions & 6 deletions benchmark/models/inception_v4/inception_v4.py
@@ -22,16 +22,20 @@ def inception_v4(training: bool, task: str, config: str, microbatch: int, device

    # Configurations
    compiler_cfg = pybuda.config._get_global_compiler_config()  # load global compiler config object
    compiler_cfg.enable_auto_transposing_placement = True

    if compiler_cfg.balancer_policy == "default":
        compiler_cfg.balancer_policy = "Ribbon"
        os.environ["PYBUDA_RIBBON2"] = "1"

    os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "1"
    if data_type != "Bfp8_b":
        os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "1"
        os.environ["PYBUDA_OP_MODEL_COMPARE_VERSION"] = "1"
    else:
        os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "1"

    # These are about to be enabled by default.
    #
    os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "1"
    os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "1"
    os.environ["PYBUDA_RIBBON2_CONSERVATIVE_OPTIMIZATION_ITERATIONS"] = "10"

    if data_type == "Bfp8_b":
        os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "1"

    if config == "224":
        model_name = "inception_v4"
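
The data_type strings these scripts branch on ("Fp16_b", "Bfp8_b") presumably mirror pybuda's data formats; an assumed correspondence, for orientation only (the actual dispatch lives elsewhere in benchmark.py):

    ASSUMED_FORMATS = {
        "Fp16_b": pybuda.DataFormat.Float16_b,
        "Bfp8_b": pybuda.DataFormat.Bfp8_b,  # assumption: block-float-8 sibling of Float16_b
    }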
18 changes: 12 additions & 6 deletions benchmark/models/mobilenet_v1/mobilenet_v1.py
@@ -24,20 +24,26 @@ def mobilenetv1(training: bool, task: str, config: str, microbatch: int, device:
        compiler_cfg.balancer_policy = "Ribbon"
        os.environ["PYBUDA_RIBBON2"] = "1"

    os.environ["PYBUDA_SUPRESS_T_FACTOR_MM"] = "8"
    os.environ["PYBUDA_ENABLE_HOST_INPUT_NOP_BUFFERING"] = "1"

    # These are about to be enabled by default.
    #
    os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "1"
    if data_type != "Bfp8_b":
        os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "1"
        os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "1"
        os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1"
    os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "1"
    os.environ["PYBUDA_RIBBON2_CONSERVATIVE_OPTIMIZATION_ITERATIONS"] = "10"

    if data_type == "Fp16_b":
        os.environ["PYBUDA_SUPRESS_T_FACTOR_MM"] = "40"
        os.environ["PYBUDA_TEMP_DISABLE_MODEL_KB_PROLOGUE_BW"] = "1"

    if data_type == "Bfp8_b":
        # tenstorrent/pybuda#2228
        os.environ["PYBUDA_LEGACY_KERNEL_BROADCAST"] = "1"
        os.environ["PYBUDA_FUSE_DF_OVERRIDE"] = "0"
        pybuda.config.configure_mixed_precision(name_regex="input.*add.*", output_df=pybuda.DataFormat.Float16_b)
        pybuda.config.configure_mixed_precision(op_type="add", output_df=pybuda.DataFormat.Float16_b)
        pybuda.config.configure_mixed_precision(op_type="depthwise", output_df=pybuda.DataFormat.Float16_b)
        pybuda.config.configure_mixed_precision(op_type="multiply", math_fidelity=pybuda.MathFidelity.HiFi2)
        pybuda.config.configure_mixed_precision(op_type="depthwise", output_df=pybuda.DataFormat.Float16_b, math_fidelity=pybuda.MathFidelity.HiFi2)

    # Set model parameters based on chosen task and model configuration
    if config == "192":
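
The name_regex override above is an ordinary regular expression matched against op names. A quick sanity check of the pattern against a hypothetical op name, assuming standard re.match semantics:

    import re

    assert re.match("input.*add.*", "input_1_add_5") is not None  # input-side add ops match
    assert re.match("input.*add.*", "multiply_7") is None         # unrelated ops do not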
33 changes: 18 additions & 15 deletions benchmark/models/mobilenet_v2/mobilenet_v2.py
@@ -28,28 +28,31 @@ def mobilenetv2(training: bool, task: str, config: str, microbatch: int, device:
    # These are about to be enabled by default.
    #
    os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "1"
    if data_type != "Bfp8_b":
        os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "1"
        os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1"
        os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "1"
        os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1"
    os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "1"
    os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1"
    os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "1"

    if pybuda.detect_available_devices()[0] != BackendDevice.Grayskull:
        os.environ["PYBUDA_MAXIMIZE_SPARSE_UBLOCK"] = "1"
        os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1"
        os.environ["PYBUDA_RIBBON2_OPTIMIZATION_ITERATIONS"] = "10"
        os.environ["PYBUDA_TEMP_ELT_UNARY_ESTIMATES_LEGACY"] = "1"

    if data_type == "Fp16_b":
        os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1"
        os.environ["PYBUDA_ENABLE_HOST_INPUT_NOP_BUFFERING"] = "1"

    if data_type == "Bfp8_b":
        pybuda.config.configure_mixed_precision(name_regex="input.*add.*", output_df=pybuda.DataFormat.Float16_b)
        pybuda.config.configure_mixed_precision(op_type="add", output_df=pybuda.DataFormat.Float16_b)
        pybuda.config.configure_mixed_precision(
-           op_type="depthwise",
-           input_df={
-               1: (pybuda.DataFormat.Float16_b, False),
-           },
-           output_df=pybuda.DataFormat.Float16_b,
+           op_type="depthwise",
+           input_df={1: (pybuda.DataFormat.Float16_b, False),},
+           output_df=pybuda.DataFormat.Float16_b,
+           math_fidelity=pybuda.MathFidelity.HiFi2
        )
        pybuda.config.configure_mixed_precision(op_type="multiply", math_fidelity=pybuda.MathFidelity.HiFi2)
        pybuda.config.configure_mixed_precision(op_type="matmul", math_fidelity=pybuda.MathFidelity.HiFi2)

    if pybuda.detect_available_devices()[0] != BackendDevice.Grayskull:
        os.environ["PYBUDA_MAXIMIZE_SPARSE_UBLOCK"] = "1"
        os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1"
        os.environ["PYBUDA_RIBBON2_OPTIMIZATION_ITERATIONS"] = "10"
        os.environ["PYBUDA_TEMP_ELT_UNARY_ESTIMATES_LEGACY"] = "1"

    # Set model parameters based on chosen task and model configuration
    if config == "224":
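
Both Grayskull checks in this file index detect_available_devices() directly, so they assume at least one device is attached; the hrnet hunk above guards against an empty list first. The guarded form, as a sketch:

    devices = pybuda.detect_available_devices()
    if devices and devices[0] != BackendDevice.Grayskull:
        os.environ["PYBUDA_MAXIMIZE_SPARSE_UBLOCK"] = "1"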
