diff --git a/Makefile b/Makefile index 1dd249e5beb..a711507eed3 100644 --- a/Makefile +++ b/Makefile @@ -75,9 +75,9 @@ ps: ## Plain-text list of profiles ## Profile-based cluster shells (autogenerated targets) ## PROFILES_BASE := default default-p2p plutus plutus-secp-ecdsa plutus-secp-schnorr oldtracing idle tracer-only -PROFILES_FAST := fast fast-p2p fast-plutus fast-notracer fast-oldtracing +PROFILES_FAST := fast fast-p2p fast-plutus fast-notracer fast-oldtracing faststartup-24M PROFILES_CI_TEST := ci-test ci-test-p2p ci-test-plutus ci-test-notracer ci-test-rtview ci-test-dense10 -PROFILES_CI_BENCH := ci-bench ci-bench-p2p ci-bench-plutus ci-bench-plutus-secp-ecdsa ci-bench-plutus-secp-schnorr ci-bench-notracer ci-bench-rtview +PROFILES_CI_BENCH := ci-bench ci-bench-p2p ci-bench-plutus ci-bench-plutus-secp-ecdsa ci-bench-plutus-secp-schnorr ci-bench-notracer ci-bench-rtview ci-bench-lmdb PROFILES_TRACE_BENCH := trace-bench trace-bench-notracer trace-bench-oldtracing trace-bench-rtview PROFILES_TRACE_FULL := trace-full trace-full-rtview PROFILES_EPOCHTRANS := epoch-transition @@ -100,7 +100,9 @@ PROFILES_NOMAD_PERF := default-nomadperf ci-test-nomadperf ci-bench-nomadp PROFILES_NOMAD_PERF += plutus-nomadperf fast-nomadperf latency-nomadperf PROFILES_NOMAD_PERF_NOP2P := default-nomadperf-nop2p oldtracing-nomadperf-nop2p ci-test-nomadperf-nop2p ci-bench-nomadperf-nop2p PROFILES_NOMAD_PERF_NOP2P += value-nomadperf-nop2p value-oldtracing-nomadperf-nop2p plutus-nomadperf-nop2p fast-nomadperf-nop2p -PROFILES_NOMAD_PERFSSD := fast-nomadperfssd +PROFILES_NOMAD_PERFSSD := value-nomadperfssd fast-nomadperfssd latency-nomadperfssd +# single node profiles on the NomadSSD cluster on AWS +PROFILES_UTXOSCALE_SOLO := utxoscale-solo-24M64G-nomadperfssd utxoscale-solo-12M64G-nomadperfssd utxoscale-solo-12M16G-nomadperfssd LOCAL_PROFILES += $(PROFILES_BASE) LOCAL_PROFILES += $(PROFILES_FAST) @@ -120,6 +122,7 @@ LOCAL_PROFILES += $(PROFILES_VENDOR) CLOUD_PROFILES += $(PROFILES_NOMAD_PERF) CLOUD_PROFILES += $(PROFILES_NOMAD_PERF_NOP2P) CLOUD_PROFILES += $(PROFILES_NOMAD_PERFSSD) +CLOUD_PROFILES += $(PROFILES_UTXOSCALE_SOLO) ## Note: to enable a shell for a profile, just add its name (one of names from 'make ps') to SHELL_PROFILES diff --git a/nix/nixos/cardano-node-service.nix b/nix/nixos/cardano-node-service.nix index ce603fba52d..b29a57f74ca 100644 --- a/nix/nixos/cardano-node-service.nix +++ b/nix/nixos/cardano-node-service.nix @@ -128,6 +128,11 @@ let ]; }; instanceDbPath = cfg.databasePath i; + utxoLmdbParams = ["--v1-lmdb-ledger-db-backend"] + ++ lib.optionals (cfg.lmdbDatabasePath i != null) + [ "--ssd-database-dir ${cfg.lmdbDatabasePath i}" + # "--ssd-snapshot-tables" + ]; cmd = builtins.filter (x: x != "") [ "${cfg.executable} run" "--config ${nodeConfigFile}" @@ -143,7 +148,8 @@ let "--tracer-socket-path-accept ${cfg.tracerSocketPathAccept i}" ] ++ lib.optionals (cfg.tracerSocketPathConnect i != null) [ "--tracer-socket-path-connect ${cfg.tracerSocketPathConnect i}" - ] ++ consensusParams.${cfg.nodeConfig.Protocol} ++ cfg.extraArgs ++ cfg.rtsArgs; + ] ++ lib.optionals (cfg.withUtxoHdLmdb i) utxoLmdbParams + ++ consensusParams.${cfg.nodeConfig.Protocol} ++ cfg.extraArgs ++ cfg.rtsArgs; in '' echo "Starting: ${concatStringsSep "\"\n echo \"" cmd}" echo "..or, once again, in a single line:" @@ -350,6 +356,16 @@ in { description = ''Node database path, for each instance.''; }; + lmdbDatabasePath = mkOption { + type = funcToOr nullOrStr; + default = null; + apply = x : if builtins.isFunction x then x 
else if x == null then _: null else _: x; + description = '' + Node UTxO-HD LMDB path for performant disk I/O, for each instance. + This could point to a direct-access SSD, with a specifically created journal-less file system and optimized mount options. + ''; + }; + socketPath = mkOption { type = funcToOr types.str; default = i : "${runtimeDir i}/node.socket"; @@ -648,6 +664,13 @@ in { default = false; }; + withUtxoHdLmdb = mkOption { + type = funcToOr types.bool; + default = false; + apply = x: if builtins.isFunction x then x else _: x; + description = ''On an UTxO-HD enabled node, the in-memory backend is the default. This activates the on-disk backend (LMDB) instead.''; + }; + extraArgs = mkOption { type = types.listOf types.str; default = []; @@ -692,6 +715,7 @@ in { config = mkIf cfg.enable ( let stateDirBase = "/var/lib/"; runDirBase = "/run/"; + lmdbPaths = filter (x: x != null) (map (e: cfg.lmdbDatabasePath e) (builtins.genList lib.trivial.id cfg.instances)); genInstanceConf = f: listToAttrs (if cfg.instances > 1 then genList (i: let n = "cardano-node-${toString i}"; in nameValuePair n (f n i)) cfg.instances else [ (nameValuePair "cardano-node" (f "cardano-node" 0)) ]); in lib.mkMerge [ @@ -793,6 +817,10 @@ in { assertion = !(cfg.systemdSocketActivation && cfg.useNewTopology); message = "Systemd socket activation cannot be used with p2p topology due to a systemd socket re-use issue."; } + { + assertion = (length lmdbPaths) == (length (lib.lists.unique lmdbPaths)); + message = "When configuring multiple LMDB enabled nodes on one instance, lmdbDatabasePath must be unique."; + } ]; } ]); diff --git a/nix/workbench/backend/nomad-job.nix b/nix/workbench/backend/nomad-job.nix index c5811b385c7..11ff8b1684d 100644 --- a/nix/workbench/backend/nomad-job.nix +++ b/nix/workbench/backend/nomad-job.nix @@ -476,14 +476,28 @@ let } ]; }; - - # The Consul namespace in which group and task-level services within the - # group will be registered. Use of template to access Consul KV will read - # from the specified Consul namespace. Specifying namespace takes - # precedence over the -consul-namespace command line argument in job run. - # namespace = ""; - # Not available as the documentations says: Extraneous JSON object property; No argument or block type is named "namespace". - + } + // + # If it needs host volumes add the constraints (can't be "null" or "[]".) + ### - https://developer.hashicorp.com/nomad/tutorials/stateful-workloads/stateful-workloads-host-volumes + (lib.optionalAttrs (profileData.value.cluster.nomad.host_volumes != null) { + volume = lib.listToAttrs (lib.lists.imap0 + (i: v: { + # Internal name, reference to mount in this group's tasks below. + name = "volume-${taskName}-${toString i}"; + value = { + type = "host"; # We only support type "host". + read_only = v.read_only; + # How it is named in the Nomad Client's config. + # https://developer.hashicorp.com/nomad/docs/configuration/client#host_volume-block + source = v.source; + }; + }) + profileData.value.cluster.nomad.host_volumes + ); + }) + // + { # The task stanza creates an individual unit of work, such as a Docker # container, web application, or batch processing. # https://developer.hashicorp.com/nomad/docs/job-specification/task @@ -557,12 +571,12 @@ let # address of an AWS EC2 instance set this to # ${attr.unique.platform.aws.public-ipv4}. address = - # When using the dedicated P&T Nomad cluster on AWS we use public - # IPs/routing, all the other cloud runs are behind a VPC/firewall. - # Local runs just use 12.0.0.1. 
-            if lib.strings.hasInfix "-nomadperf" profileData.profileName
+            # When using dedicated Nomad clusters on AWS we want to use public
+            # IPs/routing; all the other cloud runs stay behind a
+            # VPC/firewall.
+            if profileData.value.cluster.aws.use_public_routing
             then "\${attr.unique.platform.aws.public-ipv4}"
-            else ""
+            else "" # Local runs just use 127.0.0.1.
           ;
           # Specifies the port to advertise for this service. The value of
           # port depends on which address_mode is being used:
@@ -591,6 +605,20 @@ let
             check = null;
           };
 
+          # If host volumes are needed, mount them (defined above, if any).
+          volume_mount = if profileData.value.cluster.nomad.host_volumes != null
+            then lib.lists.imap0
+              (i: v: {
+                # Internal name, defined above in the group's specification.
+                volume = "volume-${taskName}-${toString i}";
+                # Where it is going to be mounted inside the Task.
+                destination = v.destination;
+                read_only = v.read_only;
+              })
+              profileData.value.cluster.nomad.host_volumes
+            else null
+          ;
+
           # Specifies the set of templates to render for the task. Templates can
           # be used to inject both static and dynamic configuration with data
           # populated from environment variables, Consul and Vault.
@@ -1363,7 +1391,7 @@ let
               [
                 # Address string to
                 (
-                  if lib.strings.hasInfix "-nomadperf" profileData.profileName
+                  if profileData.value.cluster.aws.use_public_routing
                   then ''--host-addr {{ env "attr.unique.platform.aws.local-ipv4" }}''
                   else ''--host-addr 0.0.0.0''
                 )
diff --git a/nix/workbench/backend/nomad.sh b/nix/workbench/backend/nomad.sh
index 81e750ccc28..0e9a3d2e6f0 100644
--- a/nix/workbench/backend/nomad.sh
+++ b/nix/workbench/backend/nomad.sh
@@ -2323,12 +2323,23 @@ backend_nomad() {
          # If the node in "${generator_task}" quits generators fails with:
          # tx-generator: MuxError MuxBearerClosed " closed when reading data, waiting on next header True"
          # Service binary 'tx-generator' returned status: 1
+         msg "$(yellow "WARNING: supervisord program \"generator\" (inside Nomad Task \"${generator_task}\") quit with an error exit code!")"
+         # Give the node where tx-generator runs some time to quit.
+         msg "$(yellow "         Waiting 30s to check the status of supervisord program \"${generator_task}\" (inside Nomad Task \"${generator_task}\")")"
+         sleep 30
          if backend_nomad is-task-program-running "${dir}" "${generator_task}" "${generator_task}" 5
          then
            # This was not expected!
            # But check it wasn't a race condition of a stopping cluster!
            if ! test -f "${dir}"/flag/cluster-stopping
            then
+             msg "$(red "ERROR: supervisord program \"generator\" (inside Nomad Task \"${generator_task}\") quit with an error exit code while supervisord program \"${generator_task}\" (inside Nomad Task \"${generator_task}\") is still running!")"
+             # The tx-generator can fail because something happened to
+             # the nodes (out of memory?); this gives the nodes more time
+             # to shut down properly and/or show any possible cause of
+             # trouble before being killed.
+             msg "$(yellow "WARNING: Waiting one minute so nodes are not killed immediately")"
+             sleep 60
              touch "${dir}"/flag/cluster-stopping
              fatal "Generator quit unexpectedly!!!"
           fi
@@ -2337,14 +2348,14 @@ backend_nomad() {
            touch "${dir}"/generator/quit
            # Show the warning and continue with the counter
            echo -ne "\n"
-           msg "$(yellow "WARNING: supervisord program \"generator\" (inside Nomad Task \"${generator_task}\" quit with an error exit code")"
+           msg "$(yellow "WARNING: supervisord program \"generator\" (inside Nomad Task \"${generator_task}\") quit with an error exit code, but this is expected when supervisord program \"${generator_task}\" (inside Nomad Task \"${generator_task}\") is not running")"
            msg_ne "nomad: $(blue Waiting) until all pool nodes are stopped: 000000"
          fi
        else
          touch "${dir}"/generator/quit
          # Show the warning and continue with the counter
          echo -ne "\n"
-         msg "$(yellow "WARNING: supervisord program \"generator\" (inside Nomad Task \"${generator_task}\" quit with a non-error exit code")"
+         msg "$(yellow "WARNING: supervisord program \"generator\" (inside Nomad Task \"${generator_task}\") quit with a non-error exit code")"
          msg_ne "nomad: $(blue Waiting) until all pool nodes are stopped: 000000"
        fi
      fi # Finish generator checks.
@@ -3855,7 +3866,7 @@ client {
 
   # Specifies an arbitrary string used to logically group client nodes by
   # user-defined class. This can be used during job placement as a filter.
-  node_class = "perf" # Using the "world.dev.cardano.org" testing class for "perf".
+  node_class = "" # Make sure we are not using node classes locally.
 
   # "artifact" parameters (fail fast!!!)
   ######################################
diff --git a/nix/workbench/backend/supervisor-conf.nix b/nix/workbench/backend/supervisor-conf.nix
index be8f82c2241..052362e35e7 100644
--- a/nix/workbench/backend/supervisor-conf.nix
+++ b/nix/workbench/backend/supervisor-conf.nix
@@ -120,24 +120,25 @@ let
       startretries = 0;
       # Seconds it needs to stay running to consider the start successful
       # In cases with a big genesis file, like the "value" profile with ~600
-      # mega, if this file has an error the node can fail after the 5 seconds
-      # we use as default for the other programs and the error will be catched
-      # later by the healthcheck service with a misleading message.
-      # We found with our reference machines (c5.2xlarge, 16 MB and 8 cores),
-      # when running the "value" profile, that with 50 seconds at least one
-      # node was assummed successful (its socket was created). So to the
-      # default 5 we add 45 seconds when the UTxO size is the one of the
-      # "value" profile and seconds proportionaly to this for the others.
-      ### derived.utxo_generated
-      ### - fast: 18000 (Default of 5s is OK)
-      ### - ci-test: 18000 (Default of 5s is OK)
-      ### - default: 43200 (Default of 5s is OK)
-      ### - plutus: 61200 (Default of 5s is OK)
-      ### - forge-stress-pre: 72000
-      ### - forge-stress-large: 144000
-      ### - value: 1536000 (30s more needed)
-      ### - chainsync-early-alonzo: 31104000
-      startsecs = 5 + (profileData.derived.utxo_generated / (1536000 / 50));
+      # mega, if this file has a format error the node can fail after the 5
+      # seconds we use as default for the other "program" entries and the error
+      # will be caught later by the healthcheck service with a misleading message.
+      # We found with our AWS reference machines (c5.2xlarge, 16 GB and 8
+      # cores), when running the "value" profile, that with 50 seconds at
+      # least one node was assumed successful (its socket was created). So we
+      # add 50 seconds to the default 5 when the UTxO set size is that of the
+      # "value" profile, and scale the extra seconds proportionally for the others.
+ # Not directly related to "genesis.extra_future_offset" or + # "derived.genesis_future_offset". + ### derived.dataset_measure + ### - fast: 0 (Default of 5s is OK) + ### - ci-test: 0 (Default of 5s is OK) + ### - default: 0 (Default of 5s is OK) + ### - plutus: 0 (Default of 5s is OK) + ### - forge-stress-pre: 5000000 + ### - forge-stress-large: 11300000 + ### - value: 5000000 (50s more needed) + startsecs = 5 + (profileData.derived.dataset_measure / (5000000 / 50)); }) nodeSpecs)) // diff --git a/nix/workbench/nomad.sh b/nix/workbench/nomad.sh index 479b953a575..960ab5f0c96 100644 --- a/nix/workbench/nomad.sh +++ b/nix/workbench/nomad.sh @@ -429,7 +429,7 @@ EOL local nomad_class nomad_class="$(jq -r .cluster.nomad.class "${WB_SHELL_PROFILE_DATA}"/profile.json)" local perf_nodes - perf_nodes="$(nomad node status -filter 'Status=="ready"' -filter "NodeClass==\"${nomad_class}\"" -json)" + perf_nodes="$(nomad node status -filter "Status==\"ready\" and NodeClass==\"${nomad_class}\"" -json)" # Create the base JSON string but without the "attributes" because those # are only available when fetching the status of individual nodes. local nodes_json diff --git a/nix/workbench/profile/prof0-defaults.jq b/nix/workbench/profile/prof0-defaults.jq index e233d45bccf..d0f1f76f455 100644 --- a/nix/workbench/profile/prof0-defaults.jq +++ b/nix/workbench/profile/prof0-defaults.jq @@ -61,10 +61,12 @@ def era_defaults($era): , node: { rts_flags_override: [] + , heap_limit: null ## optional: heap limit in MB (translates to RTS flag -M) , shutdown_on_slot_synced: null , shutdown_on_block_synced: null , tracing_backend: "trace-dispatcher" ## or "iohk-monitoring" , tracer: true + , utxo_lmdb: false ## use LMDB backend (instead of default in-mem) on a UTxO-HD node; will be ignored by non-UTxO-HD nodes , verbatim: { } @@ -96,6 +98,8 @@ def era_defaults($era): { producer: {cores: 2, memory: 15000, memory_max: 16000} , explorer: {cores: 2, memory: 15000, memory_max: 16000} } + # Volumes like {source: "ssd1", destination: "/ssd1", read_only: false} + , host_volumes: null , fetch_logs_ssh: false } , aws: @@ -103,13 +107,16 @@ def era_defaults($era): { producer: "c5.2xlarge" , explorer: "m5.4xlarge" } + # "attr.unique.platform.aws.public-ipv4" to bind and service definition. + , use_public_routing: false } , minimun_storage: { producer: 12582912 # 12×1024×1024 , explorer: 14155776 # 13.5×1024×1024 } , keep_running: false + , ssd_directory: null } - } + } | (.common * (.[$era] // {})); diff --git a/nix/workbench/profile/prof1-variants.jq b/nix/workbench/profile/prof1-variants.jq index 97632c82c72..900a1e9f460 100644 --- a/nix/workbench/profile/prof1-variants.jq +++ b/nix/workbench/profile/prof1-variants.jq @@ -1,5 +1,18 @@ import "epoch-timeline" as timeline; +## For the Nomad perf-ssd cluster, we might want to artificially +## cap the large RAM resources the instances provide. +def nomad_memory_limit($limit): + { nomad: + { resources: + { producer: + { memory: $limit + , memory_max: $limit + } + } + } + }; + def all_profile_variants: 1024 as $Ki | 1000000 as $M @@ -52,6 +65,12 @@ def all_profile_variants: , delegators: (0.2 * $M) } } as $dataset_small + | + { genesis: + { utxo: (24 * $M) + , delegators: (1.2 * $M) + } + } as $dataset_24m | { genesis: { utxo: (30 * $M) @@ -193,6 +212,7 @@ def all_profile_variants: { producer: "c5.2xlarge" , explorer: "m5.4xlarge" } + , use_public_routing: true } # We are requiring 10.5GB on the explorer node and 9GB on the others. 
, minimun_storage: @@ -214,16 +234,21 @@ def all_profile_variants: { namespace: "perf-ssd" , class: "perf-ssd" , resources: - { producer: {cores: 32, memory: 64000, memory_max: 64000} - , explorer: {cores: 32, memory: 64000, memory_max: 64000} + { producer: {cores: 16, memory: 120000, memory_max: 124000} + , explorer: {cores: 16, memory: 120000, memory_max: 124000} } + , host_volumes: [ + {source: "ssd1", destination: "/ssd1", read_only: false} + , {source: "ssd2", destination: "/ssd2", read_only: false} + ] , fetch_logs_ssh: true } , aws: { instance_type: - { producer: "c5.9xlarge" - , explorer: null + { producer: "r5d.4xlarge" + , explorer: "r5d.4xlarge" } + , use_public_routing: true } , minimun_storage: null , keep_running: true @@ -256,6 +281,15 @@ def all_profile_variants: } } ) as $nomad_perfssd_unicircle + | + ($nomad_perfssd * + { composition: + { locations: ["eu-central-1", "us-east-1", "ap-southeast-2"] + , topology: "torus-dense" + , with_explorer: true + } + } + ) as $nomad_perfssd_dense | ## ### Definition vocabulary: filtering @@ -275,8 +309,8 @@ def all_profile_variants: } as $compressed_timescale | { genesis: - { epoch_length: 1800 - , parameter_k: 9 + { epoch_length: 1200 + , parameter_k: 6 } } as $small_timescale | @@ -541,6 +575,14 @@ def all_profile_variants: ($model_timescale * $nomad_perf_tps_saturation_value * { scenario: "fixed-loaded" }) as $scenario_nomad_perf + | + ($model_timescale * $nomad_perf_tps_saturation_value * + { scenario: "fixed-loaded" + }) as $scenario_nomad_perfssd + | + ($small_timescale * $nomad_perf_tps_saturation_value * + { scenario: "fixed-loaded" + }) as $scenario_nomad_perfssd_solo | ($model_timescale * $model_tps_saturation_value * { scenario: "fixed-loaded" @@ -608,6 +650,23 @@ def all_profile_variants: } , desc: "AWS c5-2xlarge cluster dataset, 7 epochs" }) as $nomad_perf_base + | + ($scenario_nomad_perfssd * $compose_fiftytwo * $dataset_oct2021 * $for_8ep * + { node: + { shutdown_on_slot_synced: 64000 + } + , analysis: + { filters: ["epoch3+", "size-full"] + } + , generator: + { init_cooldown: 45 + } + , genesis: + { funds_balance: 20000000000000 + , max_block_size: 88000 + } + , desc: "AWS c5-2xlarge cluster dataset, 7 epochs" + }) as $nomad_perfssd_base | ($scenario_nomad_perf * $compose_fiftytwo * $dataset_oct2021 * $for_9ep * $plutus_base * $plutus_loop_counter * { node: @@ -628,8 +687,29 @@ def all_profile_variants: }) as $nomad_perf_plutus_base | ($scenario_latency * $compose_fiftytwo * $dataset_empty * $no_filtering * - { desc: "AWS c5-2xlarge cluster, stop when all latency services stop" + { desc: "AWS perf class cluster, stop when all latency services stop" }) as $nomad_perf_latency_base + | + ($scenario_latency * $compose_fiftytwo * $dataset_empty * $no_filtering * + { desc: "AWS perf-ssd class cluster, stop when all latency services stop" + }) as $nomad_perfssd_latency_base + | + ($scenario_nomad_perfssd_solo * $solo * $dataset_24m * + { node: + { shutdown_on_slot_synced: 7200 + } + , analysis: + { filters: ["epoch3+", "size-full"] + } + , generator: + { epochs: 6 + } + , genesis: + { funds_balance: 20000000000000 + , max_block_size: 88000 + } + , desc: "AWS c5[d]-9xlarge utxoscale dataset, 6 epochs" + }) as $nomad_perfssd_solo_base | ($scenario_model * $quadruplet * $dataset_current * $for_7ep * { node: @@ -709,6 +789,13 @@ def all_profile_variants: ### Actual profiles ## + ### Profile templates + ### + # UTxO scaling on a single node, mainnet blocksize, ~2h runtime (6 epochs) - default: 24mio UTxO, 64GB RAM cap + 
($nomad_perfssd_solo_base * $nomad_perfssd_unicircle * $costmodel_v8_preview * $p2p + ) as $utxoscale_solo_template + | + ### First, auto-named profiles: ### ## Short slots: @@ -798,6 +885,11 @@ def all_profile_variants: { name: "fast-oldtracing" } + ## Fast variants: single node with large, varying dataset sizes + , $fast_base * $solo * $dataset_24m * + { name: "faststartup-24M" + } + ## CI variants: test duration, 3 blocks , $citest_base * { name: "ci-test" @@ -849,6 +941,11 @@ def all_profile_variants: , $cibench_base * $with_rtview * { name: "ci-bench-rtview" } + , $cibench_base * $p2p * + { name: "ci-bench-lmdb" + , node: { utxo_lmdb: true } + , cluster: { ssd_directory: "/tmp" } + } , $cibench_base * $nomad_perf_torus * $p2p * { name: "ci-bench-nomadperf" , desc: "ci-bench on P&T exclusive cluster" @@ -927,6 +1024,9 @@ def all_profile_variants: , $nomad_perf_base * $nomad_perf_dense * $p2p * $costmodel_v8_preview * { name: "value-nomadperf" } + , $nomad_perfssd_base * $nomad_perfssd_dense * $p2p * $costmodel_v8_preview * + { name: "value-nomadperfssd" + } , $nomad_perf_base * $nomad_perf_dense * $p2p * $costmodel_v8_preview * $old_tracing * { name: "value-oldtracing-nomadperf" } @@ -936,6 +1036,9 @@ def all_profile_variants: , $nomad_perf_latency_base * $nomad_perf_dense * $p2p * $costmodel_v8_preview * { name: "latency-nomadperf" } + , $nomad_perfssd_latency_base * $nomad_perfssd_dense * $p2p * $costmodel_v8_preview * + { name: "latency-nomadperfssd" + } ## P&T Nomad cluster: 52 nodes, 3 regions, value-only (with old tracing variant) and Plutus, no P2P flavour , $nomad_perf_base * $nomad_perf_dense * $costmodel_v8_preview * @@ -955,10 +1058,30 @@ def all_profile_variants: , $fast_base * $compose_fiftytwo * $nomad_perf_dense * $costmodel_v8_preview * { name: "fast-nomadperf-nop2p" } - , $fast_base * $solo * $nomad_perfssd_unicircle * $costmodel_v8_preview * $p2p * + , $fast_base * $compose_fiftytwo * $nomad_perfssd_dense * $costmodel_v8_preview * $p2p * { name: "fast-nomadperfssd" } +## P&T NomadSSD cluster: UTxO scale benchmarks on a single node + , $utxoscale_solo_template * + { name: "utxoscale-solo-24M64G-nomadperfssd" + } + , $utxoscale_solo_template * + { name: "utxoscale-solo-12M64G-nomadperfssd" + , genesis: + { utxo: (12 * $M) + } + } + , $utxoscale_solo_template * + { name: "utxoscale-solo-12M16G-nomadperfssd" + , genesis: + { utxo: (12 * $M) + } + , node: + { heap_limit: 16384 + } + } + ## Model value variant: 7 epochs (128GB RAM needed; 16GB for testing locally) , $model_base * $costmodel_v8_preview * { name: "model-value" diff --git a/nix/workbench/profile/prof3-derived.jq b/nix/workbench/profile/prof3-derived.jq index dfd1894c94b..5a5e4b5ba13 100644 --- a/nix/workbench/profile/prof3-derived.jq +++ b/nix/workbench/profile/prof3-derived.jq @@ -86,6 +86,16 @@ def add_derived_params: | .node as $node | .genesis.shelley.protocolParams as $pparams +## The perf-ssd machines have abundant physical RAM, and Nomad uses cgroups to constrain resources. +## To also influence RTS / GC behaviour, -M needs to be used, as the RTS infers a heap limit from +## the system's ulimit, not the cgroup limit. 
+| $node.rts_flags_override as $rtsflags +| $node.heap_limit as $heap_limit +| (if $heap_limit | type == "number" + then $rtsflags + [("-M" + ($heap_limit | tostring) + "m")] + else $rtsflags + end) as $rtsflags_derived + ## Absolute durations: | ($gsis.epoch_length * $gsis.slot_duration) as $epoch_duration | ($gsis.slot_duration / $gsis.active_slots_coeff) as $block_duration @@ -191,7 +201,7 @@ def add_derived_params: { tx_count: $generator_tx_count } , node: - { + { rts_flags_override: $rtsflags_derived } , analysis: { minimum_chain_density: ($gsis.active_slots_coeff * 0.5) diff --git a/nix/workbench/service/nodes.nix b/nix/workbench/service/nodes.nix index 4d2ccc97355..05f66cf0102 100644 --- a/nix/workbench/service/nodes.nix +++ b/nix/workbench/service/nodes.nix @@ -68,6 +68,12 @@ let topology = "topology.json"; nodeConfigFile = "config.json"; + # Allow for local clusters to have multiple LMDB directories in the same physical ssd_directory + withUtxoHdLmdb = profile.node.utxo_lmdb; + lmdbDatabasePath = + if (profile.cluster ? "ssd_directory" && profile.cluster.ssd_directory != null) + then "${profile.cluster.ssd_directory}/lmdb-node-${toString i}" + else null; ## Combine: ## 0. baseNodeConfig (coming cardanoLib's testnet environ)
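To make the node.heap_limit wiring introduced in prof0-defaults.jq and prof3-derived.jq easier to check, here is a minimal jq sketch of the expected derivation. It is illustrative only: the helper name derive_rts_flags and the standalone invocations are not part of this patch; the sketch simply mirrors the "-M" logic added above and can be run with `jq -n -f <file>`.

  # Sketch (not part of the patch): mirrors how prof3-derived.jq is expected to
  # turn a numeric node.heap_limit (in MB) into an RTS "-M<n>m" flag while
  # leaving rts_flags_override untouched otherwise.
  def derive_rts_flags($node):
    $node.rts_flags_override
    + (if ($node.heap_limit | type) == "number"
       then ["-M" + ($node.heap_limit | tostring) + "m"]
       else []
       end);

  # The utxoscale-solo-12M16G-nomadperfssd profile sets heap_limit: 16384 ...
  derive_rts_flags({rts_flags_override: [], heap_limit: 16384}),   # => ["-M16384m"]
  # ... while profiles keeping the heap_limit: null default are unaffected.
  derive_rts_flags({rts_flags_override: [], heap_limit: null})     # => []

Under this assumption, the utxoscale-solo-12M16G-nomadperfssd node should start with "-M16384m" in its RTS flags, capping the GHC heap at 16 GB even though the r5d.4xlarge hosts in the perf-ssd class provide far more physical RAM.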