Merge pull request IntersectMBO#5779 from IntersectMBO/bench-master
workbench: UTxO scaling + LMDB benchmarks, improved Nomad cluster handling
mgmeier authored Apr 16, 2024
2 parents 3f5181c + 6c619b1 commit 0cc5b08
Showing 10 changed files with 267 additions and 50 deletions.
9 changes: 6 additions & 3 deletions Makefile
@@ -75,9 +75,9 @@ ps: ## Plain-text list of profiles
## Profile-based cluster shells (autogenerated targets)
##
PROFILES_BASE := default default-p2p plutus plutus-secp-ecdsa plutus-secp-schnorr oldtracing idle tracer-only
PROFILES_FAST := fast fast-p2p fast-plutus fast-notracer fast-oldtracing
PROFILES_FAST := fast fast-p2p fast-plutus fast-notracer fast-oldtracing faststartup-24M
PROFILES_CI_TEST := ci-test ci-test-p2p ci-test-plutus ci-test-notracer ci-test-rtview ci-test-dense10
PROFILES_CI_BENCH := ci-bench ci-bench-p2p ci-bench-plutus ci-bench-plutus-secp-ecdsa ci-bench-plutus-secp-schnorr ci-bench-notracer ci-bench-rtview
PROFILES_CI_BENCH := ci-bench ci-bench-p2p ci-bench-plutus ci-bench-plutus-secp-ecdsa ci-bench-plutus-secp-schnorr ci-bench-notracer ci-bench-rtview ci-bench-lmdb
PROFILES_TRACE_BENCH := trace-bench trace-bench-notracer trace-bench-oldtracing trace-bench-rtview
PROFILES_TRACE_FULL := trace-full trace-full-rtview
PROFILES_EPOCHTRANS := epoch-transition
@@ -100,7 +100,9 @@ PROFILES_NOMAD_PERF := default-nomadperf ci-test-nomadperf ci-bench-nomadp
PROFILES_NOMAD_PERF += plutus-nomadperf fast-nomadperf latency-nomadperf
PROFILES_NOMAD_PERF_NOP2P := default-nomadperf-nop2p oldtracing-nomadperf-nop2p ci-test-nomadperf-nop2p ci-bench-nomadperf-nop2p
PROFILES_NOMAD_PERF_NOP2P += value-nomadperf-nop2p value-oldtracing-nomadperf-nop2p plutus-nomadperf-nop2p fast-nomadperf-nop2p
PROFILES_NOMAD_PERFSSD := fast-nomadperfssd
PROFILES_NOMAD_PERFSSD := value-nomadperfssd fast-nomadperfssd latency-nomadperfssd
# single node profiles on the NomadSSD cluster on AWS
PROFILES_UTXOSCALE_SOLO := utxoscale-solo-24M64G-nomadperfssd utxoscale-solo-12M64G-nomadperfssd utxoscale-solo-12M16G-nomadperfssd

LOCAL_PROFILES += $(PROFILES_BASE)
LOCAL_PROFILES += $(PROFILES_FAST)
@@ -120,6 +122,7 @@ LOCAL_PROFILES += $(PROFILES_VENDOR)
CLOUD_PROFILES += $(PROFILES_NOMAD_PERF)
CLOUD_PROFILES += $(PROFILES_NOMAD_PERF_NOP2P)
CLOUD_PROFILES += $(PROFILES_NOMAD_PERFSSD)
CLOUD_PROFILES += $(PROFILES_UTXOSCALE_SOLO)


## Note: to enable a shell for a profile, just add its name (one of names from 'make ps') to SHELL_PROFILES
30 changes: 29 additions & 1 deletion nix/nixos/cardano-node-service.nix
@@ -128,6 +128,11 @@ let
];
};
instanceDbPath = cfg.databasePath i;
utxoLmdbParams = ["--v1-lmdb-ledger-db-backend"]
++ lib.optionals (cfg.lmdbDatabasePath i != null)
[ "--ssd-database-dir ${cfg.lmdbDatabasePath i}"
# "--ssd-snapshot-tables"
];
cmd = builtins.filter (x: x != "") [
"${cfg.executable} run"
"--config ${nodeConfigFile}"
@@ -143,7 +148,8 @@ let
"--tracer-socket-path-accept ${cfg.tracerSocketPathAccept i}"
] ++ lib.optionals (cfg.tracerSocketPathConnect i != null) [
"--tracer-socket-path-connect ${cfg.tracerSocketPathConnect i}"
] ++ consensusParams.${cfg.nodeConfig.Protocol} ++ cfg.extraArgs ++ cfg.rtsArgs;
] ++ lib.optionals (cfg.withUtxoHdLmdb i) utxoLmdbParams
++ consensusParams.${cfg.nodeConfig.Protocol} ++ cfg.extraArgs ++ cfg.rtsArgs;
in ''
echo "Starting: ${concatStringsSep "\"\n echo \"" cmd}"
echo "..or, once again, in a single line:"
@@ -350,6 +356,16 @@ in {
description = ''Node database path, for each instance.'';
};

lmdbDatabasePath = mkOption {
type = funcToOr nullOrStr;
default = null;
apply = x : if builtins.isFunction x then x else if x == null then _: null else _: x;
description = ''
Node UTxO-HD LMDB path for performant disk I/O, for each instance.
This could point to a direct-access SSD, with a specifically created journal-less file system and optimized mount options.
'';
};

socketPath = mkOption {
type = funcToOr types.str;
default = i : "${runtimeDir i}/node.socket";
@@ -648,6 +664,13 @@ in {
default = false;
};

withUtxoHdLmdb = mkOption {
type = funcToOr types.bool;
default = false;
apply = x: if builtins.isFunction x then x else _: x;
description = ''On an UTxO-HD enabled node, the in-memory backend is the default. This activates the on-disk backend (LMDB) instead.'';
};

extraArgs = mkOption {
type = types.listOf types.str;
default = [];
@@ -692,6 +715,7 @@ in {
config = mkIf cfg.enable ( let
stateDirBase = "/var/lib/";
runDirBase = "/run/";
lmdbPaths = filter (x: x != null) (map (e: cfg.lmdbDatabasePath e) (builtins.genList lib.trivial.id cfg.instances));
genInstanceConf = f: listToAttrs (if cfg.instances > 1
then genList (i: let n = "cardano-node-${toString i}"; in nameValuePair n (f n i)) cfg.instances
else [ (nameValuePair "cardano-node" (f "cardano-node" 0)) ]); in lib.mkMerge [
@@ -793,6 +817,10 @@ in {
assertion = !(cfg.systemdSocketActivation && cfg.useNewTopology);
message = "Systemd socket activation cannot be used with p2p topology due to a systemd socket re-use issue.";
}
{
assertion = (length lmdbPaths) == (length (lib.lists.unique lmdbPaths));
message = "When configuring multiple LMDB enabled nodes on one instance, lmdbDatabasePath must be unique.";
}
];
}
]);
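
The two new options compose per node instance. Below is a minimal host-configuration sketch, assuming the module is imported as services.cardano-node (as in this repository's NixOS module); the instance count and the /ssd1 mount point are illustrative placeholders, not part of this diff.

{ ... }:
{
  services.cardano-node = {
    enable = true;
    instances = 2;
    # New option: on a UTxO-HD enabled node, switch from the default
    # in-memory backend to the on-disk LMDB backend.
    withUtxoHdLmdb = true;
    # New option: per-instance LMDB directory on fast storage; the added
    # assertion requires these paths to be unique across node instances
    # on the same machine.
    lmdbDatabasePath = i: "/ssd1/lmdb-node-${toString i}";
  };
}
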
56 changes: 42 additions & 14 deletions nix/workbench/backend/nomad-job.nix
@@ -476,14 +476,28 @@ let
}
];
};

# The Consul namespace in which group and task-level services within the
# group will be registered. Use of template to access Consul KV will read
# from the specified Consul namespace. Specifying namespace takes
# precedence over the -consul-namespace command line argument in job run.
# namespace = "";
# Not available as the documentations says: Extraneous JSON object property; No argument or block type is named "namespace".

}
//
# If it needs host volumes add the constraints (can't be "null" or "[]".)
### - https://developer.hashicorp.com/nomad/tutorials/stateful-workloads/stateful-workloads-host-volumes
(lib.optionalAttrs (profileData.value.cluster.nomad.host_volumes != null) {
volume = lib.listToAttrs (lib.lists.imap0
(i: v: {
# Internal name, reference to mount in this group's tasks below.
name = "volume-${taskName}-${toString i}";
value = {
type = "host"; # We only support type "host".
read_only = v.read_only;
# How it is named in the Nomad Client's config.
# https://developer.hashicorp.com/nomad/docs/configuration/client#host_volume-block
source = v.source;
};
})
profileData.value.cluster.nomad.host_volumes
);
})
//
{
# The task stanza creates an individual unit of work, such as a Docker
# container, web application, or batch processing.
# https://developer.hashicorp.com/nomad/docs/job-specification/task
@@ -557,12 +571,12 @@ let
# address of an AWS EC2 instance set this to
# ${attr.unique.platform.aws.public-ipv4}.
address =
# When using the dedicated P&T Nomad cluster on AWS we use public
# IPs/routing, all the other cloud runs are behind a VPC/firewall.
# Local runs just use 12.0.0.1.
if lib.strings.hasInfix "-nomadperf" profileData.profileName
# When using dedicated Nomad clusters on AWS we want to use public
# IPs/routing, all the other cloud runs will run behind a
# VPC/firewall.
if profileData.value.cluster.aws.use_public_routing
then "\${attr.unique.platform.aws.public-ipv4}"
else ""
else "" # Local runs just use 127.0.0.1.
;
# Specifies the port to advertise for this service. The value of
# port depends on which address_mode is being used:
Expand Down Expand Up @@ -591,6 +605,20 @@ let
check = null;
};

# If it needs host volumes mount them (defined above if any).
volume_mount = if profileData.value.cluster.nomad.host_volumes != null
then lib.lists.imap0
(i: v: {
# Internal name, defined above in the group's specification.
volume = "volume-${taskName}-${toString i}";
# Where it is going to be mounted inside the Task.
destination = v.destination;
read_only = v.read_only;
})
profileData.value.cluster.nomad.host_volumes
else null
;

# Specifies the set of templates to render for the task. Templates can
# be used to inject both static and dynamic configuration with data
# populated from environment variables, Consul and Vault.
@@ -1363,7 +1391,7 @@ let
[
# Address string to
(
if lib.strings.hasInfix "-nomadperf" profileData.profileName
if profileData.value.cluster.aws.use_public_routing
then ''--host-addr {{ env "attr.unique.platform.aws.local-ipv4" }}''
else ''--host-addr 0.0.0.0''
)
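
The group-level volume attribute and the task-level volume_mount list above are generated from the same cluster.nomad.host_volumes entries. A minimal sketch of the result for one entry, where taskName "node-0" is hypothetical and the volume value is the example documented in prof0-defaults.jq further below:

# Input (profile): cluster.nomad.host_volumes =
#   [ { source = "ssd1"; destination = "/ssd1"; read_only = false; } ]
{
  # Group level: declares the host volume under an internal name, referring
  # to the name it carries in the Nomad Client's config ("source").
  volume."volume-node-0-0" = {
    type      = "host";
    read_only = false;
    source    = "ssd1";
  };
  # Task level: mounts that internal name at a path inside the task.
  volume_mount = [
    {
      volume      = "volume-node-0-0";
      destination = "/ssd1";
      read_only   = false;
    }
  ];
}
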
17 changes: 14 additions & 3 deletions nix/workbench/backend/nomad.sh
@@ -2323,12 +2323,23 @@ backend_nomad() {
# If the node in "${generator_task}" quits generators fails with:
# tx-generator: MuxError MuxBearerClosed "<socket: 12> closed when reading data, waiting on next header True"
# Service binary 'tx-generator' returned status: 1
msg "$(yellow "WARNING: supervisord program \"generator\" (inside Nomad Task \"${generator_task}\") quit with an error exit code!")"
# Give the node where tx-generator runs some time to quit.
msg "$(yellow " Waiting 60s to check the status of supervisord program \"${generator_task}\" (inside Nomad Task \"${generator_task}\")")"
sleep 30
if backend_nomad is-task-program-running "${dir}" "${generator_task}" "${generator_task}" 5
then
# This was not expected!
# But check it wasn't a race condition of a stopping cluster!
if ! test -f "${dir}"/flag/cluster-stopping
then
msg "$(red "ERROR: supervisord program \"generator\" (inside Nomad Task \"${generator_task}\") quit with an error exit code while supervisord program \"${generator_task}\" (inside Nomad Task \"${generator_task}\") is still running!")"
# The tx-generator can fail because something happened with
# the nodes (out of memory?), this gives the nodes more time
# to shutdown properly and/or show any possible cause of
# trouble before being killed.
msg "$(yellow "WARNING: Waiting one minute so nodes are not killed immediately")"
sleep 60
touch "${dir}"/flag/cluster-stopping
fatal "Generator quit unexpectedly!!!"
fi
@@ -2337,14 +2348,14 @@ backend_nomad() {
touch "${dir}"/generator/quit
# Show the warning and continue with the counter
echo -ne "\n"
msg "$(yellow "WARNING: supervisord program \"generator\" (inside Nomad Task \"${generator_task}\" quit with an error exit code")"
msg "$(yellow "WARNING: supervisord program \"generator\" (inside Nomad Task \"${generator_task}\") quit with an error exit code but expected when supervisord program \"${generator_task}\" (inside Nomad Task \"${generator_task}\") is not running")"
msg_ne "nomad: $(blue Waiting) until all pool nodes are stopped: 000000"
fi
else
touch "${dir}"/generator/quit
# Show the warning and continue with the counter
echo -ne "\n"
msg "$(yellow "WARNING: supervisord program \"generator\" (inside Nomad Task \"${generator_task}\" quit with a non-error exit code")"
msg "$(yellow "WARNING: supervisord program \"generator\" (inside Nomad Task \"${generator_task}\") quit with a non-error exit code")"
msg_ne "nomad: $(blue Waiting) until all pool nodes are stopped: 000000"
fi
fi # Finish generator checks.
@@ -3855,7 +3866,7 @@ client {
# Specifies an arbitrary string used to logically group client nodes by
# user-defined class. This can be used during job placement as a filter.
node_class = "perf" # Using the "world.dev.cardano.org" testing class for "perf".
node_class = "" # Make sure we are not using namespaces locally.
# "artifact" parameters (fail fast!!!)
######################################
37 changes: 19 additions & 18 deletions nix/workbench/backend/supervisor-conf.nix
@@ -120,24 +120,25 @@ let
startretries = 0;
# Seconds it needs to stay running to consider the start successful
# In cases with a big genesis file, like the "value" profile with ~600
# mega, if this file has an error the node can fail after the 5 seconds
# we use as default for the other programs and the error will be catched
# later by the healthcheck service with a misleading message.
# We found with our reference machines (c5.2xlarge, 16 MB and 8 cores),
# when running the "value" profile, that with 50 seconds at least one
# node was assummed successful (its socket was created). So to the
# default 5 we add 45 seconds when the UTxO size is the one of the
# "value" profile and seconds proportionaly to this for the others.
### derived.utxo_generated
### - fast: 18000 (Default of 5s is OK)
### - ci-test: 18000 (Default of 5s is OK)
### - default: 43200 (Default of 5s is OK)
### - plutus: 61200 (Default of 5s is OK)
### - forge-stress-pre: 72000
### - forge-stress-large: 144000
### - value: 1536000 (30s more needed)
### - chainsync-early-alonzo: 31104000
startsecs = 5 + (profileData.derived.utxo_generated / (1536000 / 50));
# mega, if this file has a format error the node can fail after the 5
# seconds we use as default for the other "program"s and the error will
# be caught later by the healthcheck service with a misleading message.
# We found with our AWS reference machines (c5.2xlarge, 16 MB and 8
# cores), when running the "value" profile, that with 50 seconds at
# least one node was assumed successful (its socket was created). So to
# the default 5 we add 50 seconds when the UTxO set size is the one of
# the "value" profile and seconds proportionally to this for the others.
# Not directly related to "genesis.extra_future_offset" or
# "derived.genesis_future_offset".
### derived.dataset_measure
### - fast: 0 (Default of 5s is OK)
### - ci-test: 0 (Default of 5s is OK)
### - default: 0 (Default of 5s is OK)
### - plutus: 0 (Default of 5s is OK)
### - forge-stress-pre: 5000000
### - forge-stress-large: 11300000
### - value: 5000000 (50s more needed)
startsecs = 5 + (profileData.derived.dataset_measure / (5000000 / 50));
})
nodeSpecs))
//
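
Worked out, the new formula startsecs = 5 + dataset_measure / (5000000 / 50) yields the following thresholds for the figures listed in the comment; a small Nix sketch, with the profile values hard-coded here instead of read from profileData.derived:

let
  startsecs = m: 5 + (m / (5000000 / 50));          # i.e. 5 + m / 100000
in {
  fast-ci-test-default-plutus = startsecs 0;        #   5 s
  forge-stress-pre            = startsecs 5000000;  #  55 s
  forge-stress-large          = startsecs 11300000; # 118 s
  value                       = startsecs 5000000;  #  55 s
}
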
2 changes: 1 addition & 1 deletion nix/workbench/nomad.sh
@@ -429,7 +429,7 @@ EOL
local nomad_class
nomad_class="$(jq -r .cluster.nomad.class "${WB_SHELL_PROFILE_DATA}"/profile.json)"
local perf_nodes
perf_nodes="$(nomad node status -filter 'Status=="ready"' -filter "NodeClass==\"${nomad_class}\"" -json)"
perf_nodes="$(nomad node status -filter "Status==\"ready\" and NodeClass==\"${nomad_class}\"" -json)"
# Create the base JSON string but without the "attributes" because those
# are only available when fetching the status of individual nodes.
local nodes_json
9 changes: 8 additions & 1 deletion nix/workbench/profile/prof0-defaults.jq
@@ -61,10 +61,12 @@ def era_defaults($era):

, node:
{ rts_flags_override: []
, heap_limit: null ## optional: heap limit in MB (translates to RTS flag -M)
, shutdown_on_slot_synced: null
, shutdown_on_block_synced: null
, tracing_backend: "trace-dispatcher" ## or "iohk-monitoring"
, tracer: true
, utxo_lmdb: false ## use LMDB backend (instead of default in-mem) on a UTxO-HD node; will be ignored by non-UTxO-HD nodes
, verbatim:
{
}
@@ -96,20 +98,25 @@ def era_defaults($era):
{ producer: {cores: 2, memory: 15000, memory_max: 16000}
, explorer: {cores: 2, memory: 15000, memory_max: 16000}
}
# Volumes like {source: "ssd1", destination: "/ssd1", read_only: false}
, host_volumes: null
, fetch_logs_ssh: false
}
, aws:
{ instance_type:
{ producer: "c5.2xlarge"
, explorer: "m5.4xlarge"
}
# "attr.unique.platform.aws.public-ipv4" to bind and service definition.
, use_public_routing: false
}
, minimun_storage:
{ producer: 12582912 # 12×1024×1024
, explorer: 14155776 # 13.5×1024×1024
}
, keep_running: false
, ssd_directory: null
}

}

} | (.common * (.[$era] // {}));
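
For reference, a rough sketch of how the new defaults are picked up on the Nix side, following the consumers shown earlier in this diff; the ./profile.json path, the nixpkgs lib import and the exact attribute paths into the profile are assumptions made for illustration:

let
  lib     = (import <nixpkgs> {}).lib;
  profile = builtins.fromJSON (builtins.readFile ./profile.json);
  # cardano-node-service.nix: the LMDB flag is only added when utxo_lmdb is set.
  lmdbArgs = lib.optionals profile.node.utxo_lmdb
    [ "--v1-lmdb-ledger-db-backend" ];
  # nomad-job.nix: bind/advertise the public IPv4 only on clusters that use
  # public routing (the dedicated P&T clusters on AWS).
  bindAddr = if profile.cluster.aws.use_public_routing
    then "\${attr.unique.platform.aws.public-ipv4}"
    else "";
in { inherit lmdbArgs bindAddr; }
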