Merge pull request IntersectMBO#5779 from IntersectMBO/bench-master
workbench: UTxO scaling + LMDB benchmarks, improved Nomad cluster handling
mgmeier authored Apr 16, 2024
2 parents 3f5181c + 6c619b1 commit 0cc5b08
Showing 10 changed files with 267 additions and 50 deletions.
9 changes: 6 additions & 3 deletions Makefile
@@ -75,9 +75,9 @@ ps: ## Plain-text list of profiles
## Profile-based cluster shells (autogenerated targets)
##
PROFILES_BASE := default default-p2p plutus plutus-secp-ecdsa plutus-secp-schnorr oldtracing idle tracer-only
PROFILES_FAST := fast fast-p2p fast-plutus fast-notracer fast-oldtracing
PROFILES_FAST := fast fast-p2p fast-plutus fast-notracer fast-oldtracing faststartup-24M
PROFILES_CI_TEST := ci-test ci-test-p2p ci-test-plutus ci-test-notracer ci-test-rtview ci-test-dense10
PROFILES_CI_BENCH := ci-bench ci-bench-p2p ci-bench-plutus ci-bench-plutus-secp-ecdsa ci-bench-plutus-secp-schnorr ci-bench-notracer ci-bench-rtview
PROFILES_CI_BENCH := ci-bench ci-bench-p2p ci-bench-plutus ci-bench-plutus-secp-ecdsa ci-bench-plutus-secp-schnorr ci-bench-notracer ci-bench-rtview ci-bench-lmdb
PROFILES_TRACE_BENCH := trace-bench trace-bench-notracer trace-bench-oldtracing trace-bench-rtview
PROFILES_TRACE_FULL := trace-full trace-full-rtview
PROFILES_EPOCHTRANS := epoch-transition
@@ -100,7 +100,9 @@ PROFILES_NOMAD_PERF := default-nomadperf ci-test-nomadperf ci-bench-nomadp
PROFILES_NOMAD_PERF += plutus-nomadperf fast-nomadperf latency-nomadperf
PROFILES_NOMAD_PERF_NOP2P := default-nomadperf-nop2p oldtracing-nomadperf-nop2p ci-test-nomadperf-nop2p ci-bench-nomadperf-nop2p
PROFILES_NOMAD_PERF_NOP2P += value-nomadperf-nop2p value-oldtracing-nomadperf-nop2p plutus-nomadperf-nop2p fast-nomadperf-nop2p
PROFILES_NOMAD_PERFSSD := fast-nomadperfssd
PROFILES_NOMAD_PERFSSD := value-nomadperfssd fast-nomadperfssd latency-nomadperfssd
# single node profiles on the NomadSSD cluster on AWS
PROFILES_UTXOSCALE_SOLO := utxoscale-solo-24M64G-nomadperfssd utxoscale-solo-12M64G-nomadperfssd utxoscale-solo-12M16G-nomadperfssd

LOCAL_PROFILES += $(PROFILES_BASE)
LOCAL_PROFILES += $(PROFILES_FAST)
@@ -120,6 +122,7 @@ LOCAL_PROFILES += $(PROFILES_VENDOR)
CLOUD_PROFILES += $(PROFILES_NOMAD_PERF)
CLOUD_PROFILES += $(PROFILES_NOMAD_PERF_NOP2P)
CLOUD_PROFILES += $(PROFILES_NOMAD_PERFSSD)
CLOUD_PROFILES += $(PROFILES_UTXOSCALE_SOLO)


## Note: to enable a shell for a profile, just add its name (one of names from 'make ps') to SHELL_PROFILES
30 changes: 29 additions & 1 deletion nix/nixos/cardano-node-service.nix
@@ -128,6 +128,11 @@ let
];
};
instanceDbPath = cfg.databasePath i;
utxoLmdbParams = ["--v1-lmdb-ledger-db-backend"]
++ lib.optionals (cfg.lmdbDatabasePath i != null)
[ "--ssd-database-dir ${cfg.lmdbDatabasePath i}"
# "--ssd-snapshot-tables"
];
cmd = builtins.filter (x: x != "") [
"${cfg.executable} run"
"--config ${nodeConfigFile}"
@@ -143,7 +148,8 @@ let
"--tracer-socket-path-accept ${cfg.tracerSocketPathAccept i}"
] ++ lib.optionals (cfg.tracerSocketPathConnect i != null) [
"--tracer-socket-path-connect ${cfg.tracerSocketPathConnect i}"
] ++ consensusParams.${cfg.nodeConfig.Protocol} ++ cfg.extraArgs ++ cfg.rtsArgs;
] ++ lib.optionals (cfg.withUtxoHdLmdb i) utxoLmdbParams
++ consensusParams.${cfg.nodeConfig.Protocol} ++ cfg.extraArgs ++ cfg.rtsArgs;
in ''
echo "Starting: ${concatStringsSep "\"\n echo \"" cmd}"
echo "..or, once again, in a single line:"
@@ -350,6 +356,16 @@ in {
description = ''Node database path, for each instance.'';
};

lmdbDatabasePath = mkOption {
type = funcToOr nullOrStr;
default = null;
apply = x : if builtins.isFunction x then x else if x == null then _: null else _: x;
description = ''
Node UTxO-HD LMDB path for performant disk I/O, for each instance.
This could point to a direct-access SSD, with a specifically created journal-less file system and optimized mount options.
'';
};

socketPath = mkOption {
type = funcToOr types.str;
default = i : "${runtimeDir i}/node.socket";
@@ -648,6 +664,13 @@ in {
default = false;
};

withUtxoHdLmdb = mkOption {
type = funcToOr types.bool;
default = false;
apply = x: if builtins.isFunction x then x else _: x;
description = ''On an UTxO-HD enabled node, the in-memory backend is the default. This activates the on-disk backend (LMDB) instead.'';
};

extraArgs = mkOption {
type = types.listOf types.str;
default = [];
@@ -692,6 +715,7 @@ in {
config = mkIf cfg.enable ( let
stateDirBase = "/var/lib/";
runDirBase = "/run/";
lmdbPaths = filter (x: x != null) (map (e: cfg.lmdbDatabasePath e) (builtins.genList lib.trivial.id cfg.instances));
genInstanceConf = f: listToAttrs (if cfg.instances > 1
then genList (i: let n = "cardano-node-${toString i}"; in nameValuePair n (f n i)) cfg.instances
else [ (nameValuePair "cardano-node" (f "cardano-node" 0)) ]); in lib.mkMerge [
@@ -793,6 +817,10 @@ in {
assertion = !(cfg.systemdSocketActivation && cfg.useNewTopology);
message = "Systemd socket activation cannot be used with p2p topology due to a systemd socket re-use issue.";
}
{
assertion = (length lmdbPaths) == (length (lib.lists.unique lmdbPaths));
message = "When configuring multiple LMDB enabled nodes on one instance, lmdbDatabasePath must be unique.";
}
];
}
]);
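
The two new options compose per node instance. Below is a minimal host-configuration sketch, assuming the module is imported as services.cardano-node (as in this repository's NixOS module); the instance count and the /ssd1 mount point are illustrative placeholders, not part of this diff.

{ ... }:
{
  services.cardano-node = {
    enable = true;
    instances = 2;
    # New option: on a UTxO-HD enabled node, switch from the default
    # in-memory backend to the on-disk LMDB backend.
    withUtxoHdLmdb = true;
    # New option: per-instance LMDB directory on fast storage; the added
    # assertion requires these paths to be unique across node instances
    # on the same machine.
    lmdbDatabasePath = i: "/ssd1/lmdb-node-${toString i}";
  };
}
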
56 changes: 42 additions & 14 deletions nix/workbench/backend/nomad-job.nix
@@ -476,14 +476,28 @@ let
}
];
};

# The Consul namespace in which group and task-level services within the
# group will be registered. Use of template to access Consul KV will read
# from the specified Consul namespace. Specifying namespace takes
# precedence over the -consul-namespace command line argument in job run.
# namespace = "";
# Not available as the documentations says: Extraneous JSON object property; No argument or block type is named "namespace".

}
//
# If it needs host volumes add the constraints (can't be "null" or "[]".)
### - https://developer.hashicorp.com/nomad/tutorials/stateful-workloads/stateful-workloads-host-volumes
(lib.optionalAttrs (profileData.value.cluster.nomad.host_volumes != null) {
volume = lib.listToAttrs (lib.lists.imap0
(i: v: {
# Internal name, reference to mount in this group's tasks below.
name = "volume-${taskName}-${toString i}";
value = {
type = "host"; # We only support type "host".
read_only = v.read_only;
# How it is named in the Nomad Client's config.
# https://developer.hashicorp.com/nomad/docs/configuration/client#host_volume-block
source = v.source;
};
})
profileData.value.cluster.nomad.host_volumes
);
})
//
{
# The task stanza creates an individual unit of work, such as a Docker
# container, web application, or batch processing.
# https://developer.hashicorp.com/nomad/docs/job-specification/task
@@ -557,12 +571,12 @@ let
# address of an AWS EC2 instance set this to
# ${attr.unique.platform.aws.public-ipv4}.
address =
# When using the dedicated P&T Nomad cluster on AWS we use public
# IPs/routing, all the other cloud runs are behind a VPC/firewall.
# Local runs just use 12.0.0.1.
if lib.strings.hasInfix "-nomadperf" profileData.profileName
# When using dedicated Nomad clusters on AWS we want to use public
# IPs/routing, all the other cloud runs will run behind a
# VPC/firewall.
if profileData.value.cluster.aws.use_public_routing
then "\${attr.unique.platform.aws.public-ipv4}"
else ""
else "" # Local runs just use 127.0.0.1.
;
# Specifies the port to advertise for this service. The value of
# port depends on which address_mode is being used:
Expand Down Expand Up @@ -591,6 +605,20 @@ let
check = null;
};

# If it needs host volumes mount them (defined above if any).
volume_mount = if profileData.value.cluster.nomad.host_volumes != null
then lib.lists.imap0
(i: v: {
# Internal name, defined above in the group's specification.
volume = "volume-${taskName}-${toString i}";
# Where it is going to be mounted inside the Task.
destination = v.destination;
read_only = v.read_only;
})
profileData.value.cluster.nomad.host_volumes
else null
;

# Specifies the set of templates to render for the task. Templates can
# be used to inject both static and dynamic configuration with data
# populated from environment variables, Consul and Vault.
@@ -1363,7 +1391,7 @@ let
[
# Address string to
(
if lib.strings.hasInfix "-nomadperf" profileData.profileName
if profileData.value.cluster.aws.use_public_routing
then ''--host-addr {{ env "attr.unique.platform.aws.local-ipv4" }}''
else ''--host-addr 0.0.0.0''
)
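
The group-level volume attribute and the task-level volume_mount list above are generated from the same cluster.nomad.host_volumes entries. A minimal sketch of the result for one entry, where taskName "node-0" is hypothetical and the volume value is the example documented in prof0-defaults.jq further below:

# Input (profile): cluster.nomad.host_volumes =
#   [ { source = "ssd1"; destination = "/ssd1"; read_only = false; } ]
{
  # Group level: declares the host volume under an internal name, referring
  # to the name it carries in the Nomad Client's config ("source").
  volume."volume-node-0-0" = {
    type      = "host";
    read_only = false;
    source    = "ssd1";
  };
  # Task level: mounts that internal name at a path inside the task.
  volume_mount = [
    {
      volume      = "volume-node-0-0";
      destination = "/ssd1";
      read_only   = false;
    }
  ];
}
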
17 changes: 14 additions & 3 deletions nix/workbench/backend/nomad.sh
@@ -2323,12 +2323,23 @@ backend_nomad() {
# If the node in "${generator_task}" quits generators fails with:
# tx-generator: MuxError MuxBearerClosed "<socket: 12> closed when reading data, waiting on next header True"
# Service binary 'tx-generator' returned status: 1
msg "$(yellow "WARNING: supervisord program \"generator\" (inside Nomad Task \"${generator_task}\") quit with an error exit code!")"
# Give the node where tx-generator runs some time to quit.
msg "$(yellow " Waiting 60s to check the status of supervisord program \"${generator_task}\" (inside Nomad Task \"${generator_task}\")")"
sleep 30
if backend_nomad is-task-program-running "${dir}" "${generator_task}" "${generator_task}" 5
then
# This was not expected!
# But check it wasn't a race condition of a stopping cluster!
if ! test -f "${dir}"/flag/cluster-stopping
then
msg "$(red "ERROR: supervisord program \"generator\" (inside Nomad Task \"${generator_task}\") quit with an error exit code while supervisord program \"${generator_task}\" (inside Nomad Task \"${generator_task}\") is still running!")"
# The tx-generator can fail because something happened with
# the nodes (out of memory?), this gives the nodes more time
# to shutdown properly and/or show any possible cause of
# trouble before being killed.
msg "$(yellow "WARNING: Waiting one minute so nodes are not killed immediately")"
sleep 60
touch "${dir}"/flag/cluster-stopping
fatal "Generator quit unexpectedly!!!"
fi
@@ -2337,14 +2348,14 @@ backend_nomad() {
touch "${dir}"/generator/quit
# Show the warning and continue with the counter
echo -ne "\n"
msg "$(yellow "WARNING: supervisord program \"generator\" (inside Nomad Task \"${generator_task}\" quit with an error exit code")"
msg "$(yellow "WARNING: supervisord program \"generator\" (inside Nomad Task \"${generator_task}\") quit with an error exit code but expected when supervisord program \"${generator_task}\" (inside Nomad Task \"${generator_task}\") is not running")"
msg_ne "nomad: $(blue Waiting) until all pool nodes are stopped: 000000"
fi
else
touch "${dir}"/generator/quit
# Show the warning and continue with the counter
echo -ne "\n"
msg "$(yellow "WARNING: supervisord program \"generator\" (inside Nomad Task \"${generator_task}\" quit with a non-error exit code")"
msg "$(yellow "WARNING: supervisord program \"generator\" (inside Nomad Task \"${generator_task}\") quit with a non-error exit code")"
msg_ne "nomad: $(blue Waiting) until all pool nodes are stopped: 000000"
fi
fi # Finish generator checks.
@@ -3855,7 +3866,7 @@ client {
# Specifies an arbitrary string used to logically group client nodes by
# user-defined class. This can be used during job placement as a filter.
node_class = "perf" # Using the "world.dev.cardano.org" testing class for "perf".
node_class = "" # Make sure we are not using namespaces locally.
# "artifact" parameters (fail fast!!!)
######################################
37 changes: 19 additions & 18 deletions nix/workbench/backend/supervisor-conf.nix
@@ -120,24 +120,25 @@ let
startretries = 0;
# Seconds it needs to stay running to consider the start successful
# In cases with a big genesis file, like the "value" profile with ~600
# mega, if this file has an error the node can fail after the 5 seconds
# we use as default for the other programs and the error will be catched
# later by the healthcheck service with a misleading message.
# We found with our reference machines (c5.2xlarge, 16 MB and 8 cores),
# when running the "value" profile, that with 50 seconds at least one
# node was assummed successful (its socket was created). So to the
# default 5 we add 45 seconds when the UTxO size is the one of the
# "value" profile and seconds proportionaly to this for the others.
### derived.utxo_generated
### - fast: 18000 (Default of 5s is OK)
### - ci-test: 18000 (Default of 5s is OK)
### - default: 43200 (Default of 5s is OK)
### - plutus: 61200 (Default of 5s is OK)
### - forge-stress-pre: 72000
### - forge-stress-large: 144000
### - value: 1536000 (30s more needed)
### - chainsync-early-alonzo: 31104000
startsecs = 5 + (profileData.derived.utxo_generated / (1536000 / 50));
# mega, if this file has a format error the node can fail after the 5
# seconds we use as default for the other "program"s and the error will
# be caught later by the healthcheck service with a misleading message.
# We found with our AWS reference machines (c5.2xlarge, 16 MB and 8
# cores), when running the "value" profile, that with 50 seconds at
# least one node was assumed successful (its socket was created). So to
# the default 5 we add 50 seconds when the UTxO set size is the one of
# the "value" profile and seconds proportionally to this for the others.
# Not directly related to "genesis.extra_future_offset" or
# "derived.genesis_future_offset".
### derived.dataset_measure
### - fast: 0 (Default of 5s is OK)
### - ci-test: 0 (Default of 5s is OK)
### - default: 0 (Default of 5s is OK)
### - plutus: 0 (Default of 5s is OK)
### - forge-stress-pre: 5000000
### - forge-stress-large: 11300000
### - value: 5000000 (50s more needed)
startsecs = 5 + (profileData.derived.dataset_measure / (5000000 / 50));
})
nodeSpecs))
//
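
Worked out, the new formula startsecs = 5 + dataset_measure / (5000000 / 50) yields the following thresholds for the figures listed in the comment; a small Nix sketch, with the profile values hard-coded here instead of read from profileData.derived:

let
  startsecs = m: 5 + (m / (5000000 / 50));          # i.e. 5 + m / 100000
in {
  fast-ci-test-default-plutus = startsecs 0;        #   5 s
  forge-stress-pre            = startsecs 5000000;  #  55 s
  forge-stress-large          = startsecs 11300000; # 118 s
  value                       = startsecs 5000000;  #  55 s
}
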
2 changes: 1 addition & 1 deletion nix/workbench/nomad.sh
@@ -429,7 +429,7 @@ EOL
local nomad_class
nomad_class="$(jq -r .cluster.nomad.class "${WB_SHELL_PROFILE_DATA}"/profile.json)"
local perf_nodes
perf_nodes="$(nomad node status -filter 'Status=="ready"' -filter "NodeClass==\"${nomad_class}\"" -json)"
perf_nodes="$(nomad node status -filter "Status==\"ready\" and NodeClass==\"${nomad_class}\"" -json)"
# Create the base JSON string but without the "attributes" because those
# are only available when fetching the status of individual nodes.
local nodes_json
9 changes: 8 additions & 1 deletion nix/workbench/profile/prof0-defaults.jq
@@ -61,10 +61,12 @@ def era_defaults($era):

, node:
{ rts_flags_override: []
, heap_limit: null ## optional: heap limit in MB (translates to RTS flag -M)
, shutdown_on_slot_synced: null
, shutdown_on_block_synced: null
, tracing_backend: "trace-dispatcher" ## or "iohk-monitoring"
, tracer: true
, utxo_lmdb: false ## use LMDB backend (instead of default in-mem) on a UTxO-HD node; will be ignored by non-UTxO-HD nodes
, verbatim:
{
}
@@ -96,20 +98,25 @@ def era_defaults($era):
{ producer: {cores: 2, memory: 15000, memory_max: 16000}
, explorer: {cores: 2, memory: 15000, memory_max: 16000}
}
# Volumes like {source: "ssd1", destination: "/ssd1", read_only: false}
, host_volumes: null
, fetch_logs_ssh: false
}
, aws:
{ instance_type:
{ producer: "c5.2xlarge"
, explorer: "m5.4xlarge"
}
# "attr.unique.platform.aws.public-ipv4" to bind and service definition.
, use_public_routing: false
}
, minimun_storage:
{ producer: 12582912 # 12×1024×1024
, explorer: 14155776 # 13.5×1024×1024
}
, keep_running: false
, ssd_directory: null
}

}

} | (.common * (.[$era] // {}));
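
For reference, a rough sketch of how the new defaults are picked up on the Nix side, following the consumers shown earlier in this diff; the ./profile.json path, the nixpkgs lib import and the exact attribute paths into the profile are assumptions made for illustration:

let
  lib     = (import <nixpkgs> {}).lib;
  profile = builtins.fromJSON (builtins.readFile ./profile.json);
  # cardano-node-service.nix: the LMDB flag is only added when utxo_lmdb is set.
  lmdbArgs = lib.optionals profile.node.utxo_lmdb
    [ "--v1-lmdb-ledger-db-backend" ];
  # nomad-job.nix: bind/advertise the public IPv4 only on clusters that use
  # public routing (the dedicated P&T clusters on AWS).
  bindAddr = if profile.cluster.aws.use_public_routing
    then "\${attr.unique.platform.aws.public-ipv4}"
    else "";
in { inherit lmdbArgs bindAddr; }
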