Skip to content

Commit

Permalink
cfgen:implement ondemand template
Browse files Browse the repository at this point in the history
Summary: title says it

Differential Revision: D57641591

fbshipit-source-id: 66b4c3818a9ca3aecad19cd1f9601fdee913d993
  • Loading branch information
Chengxiong Ruan authored and facebook-github-bot committed May 22, 2024
1 parent faf5964 commit 79f9a3e
Show file tree
Hide file tree
Showing 6 changed files with 402 additions and 19 deletions.
148 changes: 131 additions & 17 deletions src/oomd/cfgen/src/cfgen.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,18 +67,20 @@ fn devserver_json_config(node: &Node, attrs: &ConfigParams) -> json::JsonValue {
}
}

fn od_json_config(_attrs: &ConfigParams) -> json::JsonValue {
// TODO(chengxiong): implement this.
json::object! {}
/// Assembles the top-level oomd JSON config for the OnDemand template.
///
/// The `rulesets` array is built in this order: system overview, protection
/// against high memory pressure, every ruleset returned by
/// `rules_restart_cgroup_on_mem_threshold`, the senpai drop-in ruleset, and
/// protection against low swap. The result also carries `CONFIG_VERSION`
/// under `version`.
fn od_json_config(attrs: &ConfigParams) -> json::JsonValue {
    // Order matters: it is the order the rulesets appear in the output.
    let mut rulesets: json::Array = vec![
        rule_system_overview(attrs),
        rule_protection_against_high_memory_pressure(attrs),
    ];
    rulesets.extend(rules_restart_cgroup_on_mem_threshold(attrs));
    rulesets.push(rule_senpai_drop_in_ruleset(attrs));
    rulesets.push(rule_od_protection_against_low_swap(attrs));

    json::object! {
        "rulesets": rulesets,
        "version": CONFIG_VERSION,
    }
}

fn rule_system_overview(attrs: &ConfigParams) -> json::JsonValue {
let cgroup = if [HostType::DevServer, HostType::OnDemand].contains(&attrs.host_type) {
attrs.oomd2.oomd_target.as_str()
} else {
"workload.slice"
};

let mut rule = json::object! {
"name": "system overview",
"silence-logs": "engine",
Expand All @@ -88,7 +90,7 @@ fn rule_system_overview(attrs: &ConfigParams) -> json::JsonValue {
{
"name": "dump_cgroup_overview",
"args": {
"cgroup": cgroup,
"cgroup": attrs.oomd2.oomd_target.as_str(),
}
}
]
Expand Down Expand Up @@ -243,7 +245,7 @@ fn rule_protection_against_heavy_workload_thrashing_detectors(
}

_ = slow_growing_mem_pressure_detector.push(json::object! {
"name": "pressure_rising_beyong",
"name": "pressure_rising_beyond",
"args": {
"cgroup": attrs.fbtax2.workload_monitoring_slice.as_str(),
"resource": "memory",
Expand Down Expand Up @@ -416,7 +418,7 @@ fn rule_senpai_ruleset(attrs: &ConfigParams) -> json::JsonValue {
fn rule_senpai_drop_in_ruleset(attrs: &ConfigParams) -> json::JsonValue {
json::object! {
"name": "senpai drop-in ruleset",
"silence-logs": "engine",
"silence-logs": if attrs.host_type == HostType::OnDemand {"engine,plugins"} else {"engine"},
"drop-in": {
"actions": true,
"disable-on-drop-in": true,
Expand Down Expand Up @@ -566,6 +568,97 @@ fn rule_user_session_protection(node: &Node, attrs: &ConfigParams) -> json::Json
}
}

/// Builds the "protection against high memory pressure" ruleset.
///
/// Drop-in is enabled for detectors and actions; `disable-on-drop-in` comes
/// from `oomd_disable_on_drop_in`. The ruleset has two detector groups, each
/// pairing a pressure plugin with the plugin stored under the
/// `memory_reclaim` key, all watching `oomd_target`:
///
/// * "detects fast growing memory pressure" uses the `pressure_above` plugin
///   with `oomd_high_threshold` / `oomd_high_threshold_duration`.
/// * "detects slow growing memory pressure" uses the
///   `pressure_rising_beyond` plugin with `oomd_threshold` /
///   `oomd_threshold_duration`.
///
/// The single action is the `kill_by_memory_size_or_growth` plugin applied to
/// `oomd_action_target`, with `dry` emitted as the string `"true"` or
/// `"false"`.
///
/// # Panics
///
/// Panics if `attrs.oomd2.plugins` lacks any of the plugin keys indexed here.
fn rule_protection_against_high_memory_pressure(attrs: &ConfigParams) -> json::JsonValue {
    let oomd = &attrs.oomd2;

    // Values that appear more than once in the ruleset below.
    let monitored_cgroup = oomd.oomd_target.as_str();
    let reclaim_plugin = oomd.plugins["memory_reclaim"].as_str();
    let reclaim_duration = oomd.oomd_reclaim_duation.as_str();
    // oomd plugin args are strings, so the flag is spelled out rather than
    // emitted as a JSON boolean.
    let dry_run = if oomd.oomd_dry { "true" } else { "false" };

    json::object! {
        "name": "protection against high memory pressure",
        "drop-in": {
            "detectors": true,
            "actions": true,
            "disable-on-drop-in": oomd.oomd_disable_on_drop_in,
        },
        "detectors": [
            [
                "detects fast growing memory pressure",
                {
                    "name": oomd.plugins["pressure_above"].as_str(),
                    "args": {
                        "cgroup": monitored_cgroup,
                        "resource": "memory",
                        "threshold": oomd.oomd_high_threshold.as_str(),
                        "duration": oomd.oomd_high_threshold_duration.as_str(),
                    }
                },
                {
                    "name": reclaim_plugin,
                    "args": {
                        "cgroup": monitored_cgroup,
                        "duration": reclaim_duration,
                    }
                }
            ],
            [
                "detects slow growing memory pressure",
                {
                    "name": oomd.plugins["pressure_rising_beyond"].as_str(),
                    "args": {
                        "cgroup": monitored_cgroup,
                        "resource": "memory",
                        "threshold": oomd.oomd_threshold.as_str(),
                        "duration": oomd.oomd_threshold_duration.as_str(),
                    }
                },
                {
                    "name": reclaim_plugin,
                    "args": {
                        "cgroup": monitored_cgroup,
                        "duration": reclaim_duration,
                    }
                }
            ]
        ],
        "actions": [
            {
                "name": oomd.plugins["kill_by_memory_size_or_growth"].as_str(),
                "args": {
                    "cgroup": oomd.oomd_action_target.as_str(),
                    "dry": dry_run,
                }
            }
        ]
    }
}

/// Builds the OnDemand "protection against low swap" ruleset.
///
/// Drop-in is enabled for detectors and actions; `disable-on-drop-in` comes
/// from `oomd_disable_on_drop_in`. The only detector group, "free swap goes
/// below 5 percent", runs the plugin stored under the `swap_free` key with a
/// fixed `threshold_pct` of `"5"`. The only action runs the plugin stored
/// under `kill_by_swap_usage` against `oomd_action_target`, with `dry`
/// emitted as the string `"true"` or `"false"`.
///
/// # Panics
///
/// Panics if `attrs.oomd2.plugins` lacks the `swap_free` or
/// `kill_by_swap_usage` key.
fn rule_od_protection_against_low_swap(attrs: &ConfigParams) -> json::JsonValue {
    let oomd = &attrs.oomd2;

    let detector_plugin = oomd.plugins["swap_free"].as_str();
    let action_plugin = oomd.plugins["kill_by_swap_usage"].as_str();
    // Plugin args are strings, so the flag is spelled out as text.
    let dry_run = if oomd.oomd_dry { "true" } else { "false" };

    json::object! {
        "name": "protection against low swap",
        "drop-in": {
            "detectors": true,
            "actions": true,
            "disable-on-drop-in": oomd.oomd_disable_on_drop_in,
        },
        "detectors": [
            [
                "free swap goes below 5 percent",
                {
                    "name": detector_plugin,
                    "args": {
                        "threshold_pct": "5",
                    }
                }
            ]
        ],
        "actions": [
            {
                "name": action_plugin,
                "args": {
                    "cgroup": oomd.oomd_action_target.as_str(),
                    "dry": dry_run,
                }
            }
        ]
    }
}

fn get_attributes(node: &Node) -> ConfigParams {
ConfigParams {
host_type: get_host_type(node),
Expand Down Expand Up @@ -597,14 +690,13 @@ fn get_attributes(node: &Node) -> ConfigParams {
"senpai" => "senpai",
)),
oomd_dry: true,
oomd_disable_on_drop_in: false,
oomd_target: String::from("system.slice"),
oomd_action_target: String::from("system.slice"),
oomd_disable_on_drop_in: true,
oomd_target: oomd2_oomd_target(node),
oomd_action_target: String::from("system.slice/*"),
oomd_high_threshold: String::from("80"),
oomd_high_threshold_duration: String::from("60"),
oomd_threshold: String::from("60"),
oomd_threshold_duration: String::from("90"),
oomd_min_swap_pct: String::from("15"),
oomd_restart_threshold: oomd2_oomd_restart_threshold(),
oomd_reclaim_duation: String::from("10"),
oomd_post_action_delay: String::from("15"),
Expand All @@ -625,7 +717,7 @@ fn get_attributes(node: &Node) -> ConfigParams {
memory_high_timeout_ms: String::from("20"),
scuba_logger_dataset: String::from("perfpipe_senpai_events"),
},
disable_senpai_dropin: false,
disable_senpai_dropin: disable_senpai_dropin(node),
}
}

Expand Down Expand Up @@ -707,12 +799,33 @@ fn senpai_limit_min_bytes(node: &Node) -> Option<String> {
None
}

/// Returns the cgroup target string used for `oomd_target`, chosen by the
/// node's host type.
///
/// * `DevServer` hosts get `system.slice`.
/// * `OnDemand` hosts get a comma-separated pair: `system.slice` and the
///   `quicksand*.service` units under `workload.slice/workload-tw.slice`.
/// * Every other host type gets `workload.slice`.
fn oomd2_oomd_target(node: &Node) -> String {
    let target = match get_host_type(node) {
        HostType::DevServer => "system.slice",
        HostType::OnDemand => {
            "system.slice,workload.slice/workload-tw.slice/quicksand*.service"
        }
        _ => "workload.slice",
    };
    target.to_owned()
}

/// Reports whether the senpai drop-in should be disabled for `node`.
///
/// Returns `true` only when the node's host type is `HostType::OnDemand`;
/// every other host type yields `false`. The result populates
/// `ConfigParams::disable_senpai_dropin`.
fn disable_senpai_dropin(node: &Node) -> bool {
    get_host_type(node) == HostType::OnDemand
}

fn get_host_type(node: &Node) -> HostType {
// TODO(chengxiong): add logic to determine host types.
if node.hostname_prefix() == "twshared".into() {
return HostType::TwShared;
}

if node.hostname_prefix() == "od".into() {
return HostType::OnDemand;
}

if node.is_devserver() {
return HostType::DevServer;
}
Expand All @@ -736,6 +849,7 @@ mod tests {
#[rstest]
#[case::shard99("twshared2434.02.cco1", HostType::TwShared)]
#[case::shard99("devvm3170.cln0", HostType::DevServer)]
#[case::shard99("od2228.eag1", HostType::OnDemand)]
fn test_get_host_type(#[case] hostname: &str, #[case] expected: HostType) {
let node = FakeNodeBuilder::new().hostname(hostname).build();
assert_eq!(get_host_type(&node), expected);
Expand Down
3 changes: 1 addition & 2 deletions src/oomd/cfgen/src/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ pub struct FBTax2Attributes {
}

pub struct Oomd2Attributes {
pub blacklisted_jobs: Vec<String>,
pub blacklisted_jobs: Vec<&'static str>,
pub disable_swap_protection: bool,
pub kill_target: String,
pub plugins: BTreeMap<String, String>,
Expand All @@ -46,7 +46,6 @@ pub struct Oomd2Attributes {
pub oomd_high_threshold_duration: String,
pub oomd_threshold: String,
pub oomd_threshold_duration: String,
pub oomd_min_swap_pct: String,
pub oomd_restart_threshold: BTreeMap<String, OomdRestartThreshold>,
pub oomd_reclaim_duation: String,
pub oomd_post_action_delay: String,
Expand Down
93 changes: 93 additions & 0 deletions src/oomd/cfgen/test/cfgen_test_inputs/ondemand.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
@generated SignedSource<<bb1f0a1f74c2f6d1c6ca34e21fa3ca51>>
@codegen-command arc cfgen update-inputs fb-oomd
{
"fqdn": "od2228.eag1.facebook.com",
"region": "utah",
"clusterType": "SERVICE_GENERIC_NON_MEMCACHE",
"modelId": 341072,
"kernelRelease": "5.19.0-0_fbk12_11583_g0bef9520ca2b",
"serverType": "TYPE_X_SEARCH",
"experiments": [],
"cpuArchitecture": "cooperlake",
"metalosRootfs": false,
"provisioningConfig": {
"ethtoolByInterface": {
"eth0": {
"maxChannelsCombined": 52
}
},
"cpuCoreCount": 26,
"parentModelId": 338998,
"recoveryEnvironment": false,
"deviceType": "SERVER",
"datacenter": "eag1",
"cluster": "05",
"memTotal": 66870956032,
"osVersion": {
"distribution_name": "CentOS Stream release",
"version": 9,
"is_in_ramdisk": false,
"is_metalos": false
},
"pciByAddress": {
"0000:65:00.0": {
"vendor_id": 5555,
"device_id": 4125,
"class_code": 131072,
"board_part_number": "MCX623435MC-CDAE_FB"
}
},
"static_smc_tiers": [],
"machine": "x86_64"
},
"bootConfig": {
"ethtoolByInterface": {
"eth0": {
"driver": "mlx5_core",
"driver_version": "5.19.0-0_fbk12_11583_g0bef9520c",
"firmware_version": "22.32.1206 (FB_0000000018)",
"bus_info": "0000:65:00.0"
}
}
},
"runtimeConfig": {
"hasHighPrivCert": true,
"regionRoutableCluster": "eag1.02",
"block_devices": {
"block_devices": {
"nvme0n1": {
"size_bytes": 256055095296,
"is_rotational": false,
"model": "HFS512GDE9X083N",
"serial": "2621CDA6N79781110H6O",
"physical_block_size": 512,
"logical_block_size": 512,
"is_root": true
},
"nvme1n1": {
"size_bytes": 1800360124416,
"is_rotational": false,
"model": "MZOL21T9HCJR-00AFB",
"serial": "S5X8NG0T524955",
"physical_block_size": 4096,
"logical_block_size": 4096,
"is_root": false
}
}
},
"dynamic_smc_tiers": [],
"cluster_state": "CLUSTER_IN_USE",
"installed_platforms": [
"platform010",
"platform010-compat"
],
"device_nics_enum": [
"ETH0",
"SVC0"
]
},
"reservationConfig": {
"active_machine_materialization_id": "",
"current_reservation_host_profile_id": "NEWLY_PROVISIONED_PROFILE"
}
}
4 changes: 4 additions & 0 deletions src/oomd/cfgen/test/cfgen_test_manifest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,7 @@ library_samples:
- devvm
- twshared_vll_shard00
# Add more samples from https://fburl.com/code/vjwmkoa1 if needed
samples:
ondemand:
# A random host with od hostname prefix.
production_host: od2228.eag1.facebook.com
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
@generated SignedSource<<31b3f2f747768088bd5523d8e690bfac>>
@codegen-command arc cfgen update-outputs fb-oomd
[Service]
[Unit]
Loading

0 comments on commit 79f9a3e

Please sign in to comment.