workbench: UTxO scaling + LMDB benchmarks, improved Nomad cluster handling #5779

Merged · 15 commits · Apr 16, 2024
9 changes: 6 additions & 3 deletions Makefile
@@ -75,9 +75,9 @@ ps: ## Plain-text list of profiles
## Profile-based cluster shells (autogenerated targets)
##
PROFILES_BASE := default default-p2p plutus plutus-secp-ecdsa plutus-secp-schnorr oldtracing idle tracer-only
PROFILES_FAST := fast fast-p2p fast-plutus fast-notracer fast-oldtracing
PROFILES_FAST := fast fast-p2p fast-plutus fast-notracer fast-oldtracing faststartup-24M
PROFILES_CI_TEST := ci-test ci-test-p2p ci-test-plutus ci-test-notracer ci-test-rtview ci-test-dense10
PROFILES_CI_BENCH := ci-bench ci-bench-p2p ci-bench-plutus ci-bench-plutus-secp-ecdsa ci-bench-plutus-secp-schnorr ci-bench-notracer ci-bench-rtview
PROFILES_CI_BENCH := ci-bench ci-bench-p2p ci-bench-plutus ci-bench-plutus-secp-ecdsa ci-bench-plutus-secp-schnorr ci-bench-notracer ci-bench-rtview ci-bench-lmdb
PROFILES_TRACE_BENCH := trace-bench trace-bench-notracer trace-bench-oldtracing trace-bench-rtview
PROFILES_TRACE_FULL := trace-full trace-full-rtview
PROFILES_EPOCHTRANS := epoch-transition
@@ -100,7 +100,9 @@ PROFILES_NOMAD_PERF := default-nomadperf ci-test-nomadperf ci-bench-nomadp
PROFILES_NOMAD_PERF += plutus-nomadperf fast-nomadperf latency-nomadperf
PROFILES_NOMAD_PERF_NOP2P := default-nomadperf-nop2p oldtracing-nomadperf-nop2p ci-test-nomadperf-nop2p ci-bench-nomadperf-nop2p
PROFILES_NOMAD_PERF_NOP2P += value-nomadperf-nop2p value-oldtracing-nomadperf-nop2p plutus-nomadperf-nop2p fast-nomadperf-nop2p
PROFILES_NOMAD_PERFSSD := fast-nomadperfssd
PROFILES_NOMAD_PERFSSD := value-nomadperfssd fast-nomadperfssd latency-nomadperfssd
# single node profiles on the NomadSSD cluster on AWS
PROFILES_UTXOSCALE_SOLO := utxoscale-solo-24M64G-nomadperfssd utxoscale-solo-12M64G-nomadperfssd utxoscale-solo-12M16G-nomadperfssd

LOCAL_PROFILES += $(PROFILES_BASE)
LOCAL_PROFILES += $(PROFILES_FAST)
@@ -120,6 +122,7 @@ LOCAL_PROFILES += $(PROFILES_VENDOR)
CLOUD_PROFILES += $(PROFILES_NOMAD_PERF)
CLOUD_PROFILES += $(PROFILES_NOMAD_PERF_NOP2P)
CLOUD_PROFILES += $(PROFILES_NOMAD_PERFSSD)
CLOUD_PROFILES += $(PROFILES_UTXOSCALE_SOLO)


## Note: to enable a shell for a profile, just add its name (one of names from 'make ps') to SHELL_PROFILES
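The new profile names become available through the autogenerated targets described above. A minimal usage sketch, assuming the usual `make ps` / `make <profile-name>` conventions apply to the newly added entries:

```bash
# List all profiles and pick out the ones added in this PR
make ps | grep -E 'lmdb|utxoscale|faststartup|nomadperfssd'

# Enter the autogenerated cluster shell for the new LMDB CI benchmark profile
# (assuming ci-bench-lmdb is exposed via SHELL_PROFILES as noted above)
make ci-bench-lmdb
```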
30 changes: 29 additions & 1 deletion nix/nixos/cardano-node-service.nix
@@ -128,6 +128,11 @@ let
];
};
instanceDbPath = cfg.databasePath i;
utxoLmdbParams = ["--v1-lmdb-ledger-db-backend"]
++ lib.optionals (cfg.lmdbDatabasePath i != null)
[ "--ssd-database-dir ${cfg.lmdbDatabasePath i}"
# "--ssd-snapshot-tables"
];
cmd = builtins.filter (x: x != "") [
"${cfg.executable} run"
"--config ${nodeConfigFile}"
Expand All @@ -143,7 +148,8 @@ let
"--tracer-socket-path-accept ${cfg.tracerSocketPathAccept i}"
] ++ lib.optionals (cfg.tracerSocketPathConnect i != null) [
"--tracer-socket-path-connect ${cfg.tracerSocketPathConnect i}"
] ++ consensusParams.${cfg.nodeConfig.Protocol} ++ cfg.extraArgs ++ cfg.rtsArgs;
] ++ lib.optionals (cfg.withUtxoHdLmdb i) utxoLmdbParams
++ consensusParams.${cfg.nodeConfig.Protocol} ++ cfg.extraArgs ++ cfg.rtsArgs;
in ''
echo "Starting: ${concatStringsSep "\"\n echo \"" cmd}"
echo "..or, once again, in a single line:"
@@ -350,6 +356,16 @@ in {
description = ''Node database path, for each instance.'';
};

lmdbDatabasePath = mkOption {
type = funcToOr nullOrStr;
default = null;
apply = x : if builtins.isFunction x then x else if x == null then _: null else _: x;
description = ''
Node UTxO-HD LMDB path for performant disk I/O, for each instance.
This could point to a direct-access SSD, with a specifically created journal-less file system and optimized mount options.
'';
};

socketPath = mkOption {
type = funcToOr types.str;
default = i : "${runtimeDir i}/node.socket";
@@ -648,6 +664,13 @@ in {
default = false;
};

withUtxoHdLmdb = mkOption {
type = funcToOr types.bool;
default = false;
apply = x: if builtins.isFunction x then x else _: x;
description = ''On a UTxO-HD enabled node, the in-memory backend is the default. This activates the on-disk backend (LMDB) instead.'';
};

extraArgs = mkOption {
type = types.listOf types.str;
default = [];
@@ -692,6 +715,7 @@ in {
config = mkIf cfg.enable ( let
stateDirBase = "/var/lib/";
runDirBase = "/run/";
lmdbPaths = filter (x: x != null) (map (e: cfg.lmdbDatabasePath e) (builtins.genList lib.trivial.id cfg.instances));
genInstanceConf = f: listToAttrs (if cfg.instances > 1
then genList (i: let n = "cardano-node-${toString i}"; in nameValuePair n (f n i)) cfg.instances
else [ (nameValuePair "cardano-node" (f "cardano-node" 0)) ]); in lib.mkMerge [
@@ -793,6 +817,10 @@ in {
assertion = !(cfg.systemdSocketActivation && cfg.useNewTopology);
message = "Systemd socket activation cannot be used with p2p topology due to a systemd socket re-use issue.";
}
{
assertion = (length lmdbPaths) == (length (lib.lists.unique lmdbPaths));
message = "When configuring multiple LMDB enabled nodes on one instance, lmdbDatabasePath must be unique.";
}
];
}
]);
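For illustration, this is roughly the command-line fragment that `utxoLmdbParams` contributes when `withUtxoHdLmdb` evaluates to true for an instance and `lmdbDatabasePath` is set; the config and database paths below are hypothetical placeholders, and `--ssd-snapshot-tables` stays commented out as in the module:

```bash
# Sketch of the resulting node invocation (paths are placeholders)
cardano-node run \
  --config /etc/cardano-node/config-0.json \
  --database-path /var/lib/cardano-node/db-0 \
  --v1-lmdb-ledger-db-backend \
  --ssd-database-dir /ssd1/cardano-node-0
```

The uniqueness assertion above then guards against two instances on the same machine pointing `--ssd-database-dir` at the same directory.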
56 changes: 42 additions & 14 deletions nix/workbench/backend/nomad-job.nix
@@ -476,14 +476,28 @@ let
}
];
};

# The Consul namespace in which group and task-level services within the
# group will be registered. Use of template to access Consul KV will read
# from the specified Consul namespace. Specifying namespace takes
# precedence over the -consul-namespace command line argument in job run.
# namespace = "";
# Not available as the documentation says: Extraneous JSON object property; No argument or block type is named "namespace".

}
//
# If it needs host volumes add the constraints (can't be "null" or "[]".)
### - https://developer.hashicorp.com/nomad/tutorials/stateful-workloads/stateful-workloads-host-volumes
(lib.optionalAttrs (profileData.value.cluster.nomad.host_volumes != null) {
volume = lib.listToAttrs (lib.lists.imap0
(i: v: {
# Internal name, reference to mount in this group's tasks below.
name = "volume-${taskName}-${toString i}";
value = {
type = "host"; # We only support type "host".
read_only = v.read_only;
# How it is named in the Nomad Client's config.
# https://developer.hashicorp.com/nomad/docs/configuration/client#host_volume-block
source = v.source;
};
})
profileData.value.cluster.nomad.host_volumes
);
})
//
{
# The task stanza creates an individual unit of work, such as a Docker
# container, web application, or batch processing.
# https://developer.hashicorp.com/nomad/docs/job-specification/task
@@ -557,12 +571,12 @@ let
# address of an AWS EC2 instance set this to
# ${attr.unique.platform.aws.public-ipv4}.
address =
# When using the dedicated P&T Nomad cluster on AWS we use public
# IPs/routing, all the other cloud runs are behind a VPC/firewall.
# Local runs just use 12.0.0.1.
if lib.strings.hasInfix "-nomadperf" profileData.profileName
# When using dedicated Nomad clusters on AWS we want to use public
# IPs/routing, all the other cloud runs will run behind a
# VPC/firewall.
if profileData.value.cluster.aws.use_public_routing
then "\${attr.unique.platform.aws.public-ipv4}"
else ""
else "" # Local runs just use 127.0.0.1.
;
# Specifies the port to advertise for this service. The value of
# port depends on which address_mode is being used:
@@ -591,6 +605,20 @@
check = null;
};

# If it needs host volumes mount them (defined above if any).
volume_mount = if profileData.value.cluster.nomad.host_volumes != null
then lib.lists.imap0
(i: v: {
# Internal name, defined above in the group's specification.
volume = "volume-${taskName}-${toString i}";
# Where it is going to be mounted inside the Task.
destination = v.destination;
read_only = v.read_only;
})
profileData.value.cluster.nomad.host_volumes
else null
;

# Specifies the set of templates to render for the task. Templates can
# be used to inject both static and dynamic configuration with data
# populated from environment variables, Consul and Vault.
@@ -1363,7 +1391,7 @@ let
[
# Address string to
(
if lib.strings.hasInfix "-nomadperf" profileData.profileName
if profileData.value.cluster.aws.use_public_routing
then ''--host-addr {{ env "attr.unique.platform.aws.local-ipv4" }}''
else ''--host-addr 0.0.0.0''
)
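A quick way to confirm the host-volume wiring is to look for the group-level `volume` blocks and the task-level `volume_mount` entries in the rendered job; the job file name below is an assumption, adjust it to wherever the backend materializes the JSON job spec:

```bash
# Collect every "volume" and "volume_mount" object in the rendered job JSON
jq '[.. | objects | .volume?       // empty]' nomad-job.json
jq '[.. | objects | .volume_mount? // empty]' nomad-job.json
```

Each entry of `cluster.nomad.host_volumes` (shape `{source, destination, read_only}`, per the profile defaults further down) should yield one element in each list, named `volume-<taskName>-<index>`.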
17 changes: 14 additions & 3 deletions nix/workbench/backend/nomad.sh
@@ -2323,12 +2323,23 @@ backend_nomad() {
# If the node in "${generator_task}" quits generators fails with:
# tx-generator: MuxError MuxBearerClosed "<socket: 12> closed when reading data, waiting on next header True"
# Service binary 'tx-generator' returned status: 1
msg "$(yellow "WARNING: supervisord program \"generator\" (inside Nomad Task \"${generator_task}\") quit with an error exit code!")"
# Give the node where tx-generator runs some time to quit.
msg "$(yellow " Waiting 60s to check the status of supervisord program \"${generator_task}\" (inside Nomad Task \"${generator_task}\")")"
sleep 30
if backend_nomad is-task-program-running "${dir}" "${generator_task}" "${generator_task}" 5
then
# This was not expected!
# But check it wasn't a race condition of a stopping cluster!
if ! test -f "${dir}"/flag/cluster-stopping
then
msg "$(red "ERROR: supervisord program \"generator\" (inside Nomad Task \"${generator_task}\") quit with an error exit code while supervisord program \"${generator_task}\" (inside Nomad Task \"${generator_task}\") is still running!")"
# The tx-generator can fail because something happened with
# the nodes (out of memory?), this gives the nodes more time
# to shutdown properly and/or show any possible cause of
# trouble before being killed.
msg "$(yellow "WARNING: Waiting one minute so nodes are not killed immediately")"
sleep 60
touch "${dir}"/flag/cluster-stopping
fatal "Generator quit unexpectedly!!!"
fi
@@ -2337,14 +2348,14 @@
touch "${dir}"/generator/quit
# Show the warning and continue with the counter
echo -ne "\n"
msg "$(yellow "WARNING: supervisord program \"generator\" (inside Nomad Task \"${generator_task}\" quit with an error exit code")"
msg "$(yellow "WARNING: supervisord program \"generator\" (inside Nomad Task \"${generator_task}\") quit with an error exit code but expected when supervisord program \"${generator_task}\" (inside Nomad Task \"${generator_task}\") is not running")"
msg_ne "nomad: $(blue Waiting) until all pool nodes are stopped: 000000"
fi
else
touch "${dir}"/generator/quit
# Show the warning and continue with the counter
echo -ne "\n"
msg "$(yellow "WARNING: supervisord program \"generator\" (inside Nomad Task \"${generator_task}\" quit with a non-error exit code")"
msg "$(yellow "WARNING: supervisord program \"generator\" (inside Nomad Task \"${generator_task}\") quit with a non-error exit code")"
msg_ne "nomad: $(blue Waiting) until all pool nodes are stopped: 000000"
fi
fi # Finish generator checks.
@@ -3855,7 +3866,7 @@ client {

# Specifies an arbitrary string used to logically group client nodes by
# user-defined class. This can be used during job placement as a filter.
node_class = "perf" # Using the "world.dev.cardano.org" testing class for "perf".
node_class = "" # Make sure we are not using namespaces locally.

# "artifact" parameters (fail fast!!!)
######################################
37 changes: 19 additions & 18 deletions nix/workbench/backend/supervisor-conf.nix
@@ -120,24 +120,25 @@ let
startretries = 0;
# Seconds it needs to stay running to consider the start successful
# In cases with a big genesis file, like the "value" profile with ~600
# mega, if this file has an error the node can fail after the 5 seconds
# we use as default for the other programs and the error will be catched
# later by the healthcheck service with a misleading message.
# We found with our reference machines (c5.2xlarge, 16 MB and 8 cores),
# when running the "value" profile, that with 50 seconds at least one
# node was assummed successful (its socket was created). So to the
# default 5 we add 45 seconds when the UTxO size is the one of the
# "value" profile and seconds proportionaly to this for the others.
### derived.utxo_generated
### - fast: 18000 (Default of 5s is OK)
### - ci-test: 18000 (Default of 5s is OK)
### - default: 43200 (Default of 5s is OK)
### - plutus: 61200 (Default of 5s is OK)
### - forge-stress-pre: 72000
### - forge-stress-large: 144000
### - value: 1536000 (30s more needed)
### - chainsync-early-alonzo: 31104000
startsecs = 5 + (profileData.derived.utxo_generated / (1536000 / 50));
# mega, if this file has a format error the node can fail after the 5
# seconds we use as default for the other "program"s and the error will
# be caught later by the healthcheck service with a misleading message.
# We found with our AWS reference machines (c5.2xlarge, 16 MB and 8
# cores), when running the "value" profile, that with 50 seconds at
# least one node was assumed successful (its socket was created). So to
# the default 5 we add 50 seconds when the UTxO set size is the one of
# the "value" profile and seconds proportionally to this for the others.
# Not directly related to "genesis.extra_future_offset" or
# "derived.genesis_future_offset".
### derived.dataset_measure
### - fast: 0 (Default of 5s is OK)
### - ci-test: 0 (Default of 5s is OK)
### - default: 0 (Default of 5s is OK)
### - plutus: 0 (Default of 5s is OK)
### - forge-stress-pre: 5000000
### - forge-stress-large: 11300000
### - value: 5000000 (50s more needed)
startsecs = 5 + (profileData.derived.dataset_measure / (5000000 / 50));
})
nodeSpecs))
//
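The `startsecs` arithmetic is easiest to sanity-check with the numbers from the `derived.dataset_measure` table in the comment above; a quick shell check:

```bash
# "value" profile: dataset_measure = 5000000  ->  5 + 50 = 55 seconds
echo $(( 5 + 5000000 / (5000000 / 50) ))    # prints 55

# forge-stress-large: dataset_measure = 11300000  ->  5 + 113 = 118 seconds
echo $(( 5 + 11300000 / (5000000 / 50) ))   # prints 118

# fast / ci-test / default: dataset_measure = 0  ->  the plain 5-second default
echo $(( 5 + 0 / (5000000 / 50) ))          # prints 5
```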
2 changes: 1 addition & 1 deletion nix/workbench/nomad.sh
@@ -429,7 +429,7 @@ EOL
local nomad_class
nomad_class="$(jq -r .cluster.nomad.class "${WB_SHELL_PROFILE_DATA}"/profile.json)"
local perf_nodes
perf_nodes="$(nomad node status -filter 'Status=="ready"' -filter "NodeClass==\"${nomad_class}\"" -json)"
perf_nodes="$(nomad node status -filter "Status==\"ready\" and NodeClass==\"${nomad_class}\"" -json)"
# Create the base JSON string but without the "attributes" because those
# are only available when fetching the status of individual nodes.
local nodes_json
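The change above merges what used to be two separate `-filter` flags into a single filter expression (with repeated flags, presumably only one of the predicates actually applied). With the class resolved to, say, `perf`, the resulting call looks like:

```bash
# Combined Nomad node filter: both predicates in one expression
nomad node status -filter 'Status=="ready" and NodeClass=="perf"' -json
```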
9 changes: 8 additions & 1 deletion nix/workbench/profile/prof0-defaults.jq
@@ -61,10 +61,12 @@ def era_defaults($era):

, node:
{ rts_flags_override: []
, heap_limit: null ## optional: heap limit in MB (translates to RTS flag -M)
, shutdown_on_slot_synced: null
, shutdown_on_block_synced: null
, tracing_backend: "trace-dispatcher" ## or "iohk-monitoring"
, tracer: true
, utxo_lmdb: false ## use LMDB backend (instead of default in-mem) on a UTxO-HD node; will be ignored by non-UTxO-HD nodes
, verbatim:
{
}
@@ -96,20 +98,25 @@
{ producer: {cores: 2, memory: 15000, memory_max: 16000}
, explorer: {cores: 2, memory: 15000, memory_max: 16000}
}
# Volumes like {source: "ssd1", destination: "/ssd1", read_only: false}
, host_volumes: null
, fetch_logs_ssh: false
}
, aws:
{ instance_type:
{ producer: "c5.2xlarge"
, explorer: "m5.4xlarge"
}
# "attr.unique.platform.aws.public-ipv4" to bind and service definition.
, use_public_routing: false
}
, minimun_storage:
{ producer: 12582912 # 12×1024×1024
, explorer: 14155776 # 13.5×1024×1024
}
, keep_running: false
, ssd_directory: null
}

}

} | (.common * (.[$era] // {}));
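To see how these defaults land in a materialized profile, the new keys can be pulled out of `profile.json` (the same file `nomad.sh` reads above); a small inspection sketch, with key paths mirroring the defaults structure in this file:

```bash
# Show the UTxO-HD / LMDB and cluster-related defaults added in this PR
jq '{ utxo_lmdb:      .node.utxo_lmdb
    , heap_limit:     .node.heap_limit
    , host_volumes:   .cluster.nomad.host_volumes
    , public_routing: .cluster.aws.use_public_routing
    , ssd_directory:  .cluster.ssd_directory
    }' "${WB_SHELL_PROFILE_DATA}"/profile.json
```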