From eae57d8e64601bb61f18cbf4a6d3b9f7d378d66a Mon Sep 17 00:00:00 2001
From: Di Xu
Date: Fri, 10 May 2019 10:54:32 +0800
Subject: [PATCH] adapt dlws jobs (#2604)

---
 docs/alerting/watchdog-metrics.md             |   8 +
 .../deploy/job-exporter.yaml.template         |   2 +
 src/job-exporter/src/docker_inspect.py        |   8 +-
 src/job-exporter/src/main.py                  |  26 +-
 src/job-exporter/src/nvidia.py                |   5 +-
 .../test/data/dlts_docker_inspect.json        | 403 ++++++++++++++++++
 src/job-exporter/test/test_docker_inspect.py  |  15 +
 src/watchdog/src/watchdog.py                  | 279 +++++++++---
 src/watchdog/test/data/dlws_nodes_list.json   | 158 +++++++
 .../dlws_nodes_list_with_unschedulable.json   | 159 +++++++
 src/watchdog/test/data/nodes_list.json        |  12 +
 src/watchdog/test/test_watchdog.py            |  75 +++-
 12 files changed, 1070 insertions(+), 80 deletions(-)
 create mode 100644 src/job-exporter/test/data/dlts_docker_inspect.json
 create mode 100644 src/watchdog/test/data/dlws_nodes_list.json
 create mode 100644 src/watchdog/test/data/dlws_nodes_list_with_unschedulable.json

diff --git a/docs/alerting/watchdog-metrics.md b/docs/alerting/watchdog-metrics.md
index e62fec843b..36b18c06a7 100644
--- a/docs/alerting/watchdog-metrics.md
+++ b/docs/alerting/watchdog-metrics.md
@@ -57,6 +57,14 @@ vi watchdog-xx.log
 | ---------- | ----------- |
 | k8s_api_server_count | use label `error` to represent status, if `error` != "ok", means k8s api server is not functioning correctly |
 
+## K8s Resource Metrics
+| Metric name| Description |
+| ---------- | ----------- |
+| k8s_node_gpu_total | Total number of GPUs on the node |
+| k8s_node_gpu_available | Total GPU count minus the GPU count used by pods on the node; 0 if the node is marked as unschedulable |
+| k8s_node_gpu_reserved | If a node is marked as unschedulable via `kubectl cordon $node`, all of its unused GPUs are counted as reserved |
+
+
 ## Other Metrics
 | Metric name| Description |
 | ---------- | ----------- |
diff --git a/src/job-exporter/deploy/job-exporter.yaml.template b/src/job-exporter/deploy/job-exporter.yaml.template
index 8a9c4682ba..31dd0eb463 100644
--- a/src/job-exporter/deploy/job-exporter.yaml.template
+++ b/src/job-exporter/deploy/job-exporter.yaml.template
@@ -68,6 +68,8 @@ spec:
           value: {{ cluster_cfg["job-exporter"]["logging-level"] }}
         - name: NV_DRIVER
           value: /var/drivers/nvidia/current
+        - name: NVIDIA_VISIBLE_DEVICES
+          value: all
         volumeMounts:
         - mountPath: /var/run/docker.sock
           name: docker-socket
diff --git a/src/job-exporter/src/docker_inspect.py b/src/job-exporter/src/docker_inspect.py
index 4d35dcb206..c95418314c 100644
--- a/src/job-exporter/src/docker_inspect.py
+++ b/src/job-exporter/src/docker_inspect.py
@@ -49,7 +49,7 @@ def __eq__(self, o):
 
 
 keys = {"PAI_JOB_NAME", "PAI_USER_NAME", "PAI_CURRENT_TASK_ROLE_NAME", "GPU_ID",
-        "PAI_TASK_INDEX"}
+        "PAI_TASK_INDEX", "DLWS_JOB_ID", "DLWS_USER_NAME"}
 
 
 def parse_docker_inspect(inspect_output):
@@ -79,15 +79,13 @@ def parse_docker_inspect(inspect_output):
     pid = utils.walk_json_field_safe(obj, 0, "State", "Pid")
 
     return InspectResult(
-            m.get("PAI_USER_NAME"),
-            m.get("PAI_JOB_NAME"),
+            m.get("PAI_USER_NAME") or m.get("DLWS_USER_NAME"),
+            m.get("PAI_JOB_NAME") or m.get("DLWS_JOB_ID"),
             m.get("PAI_CURRENT_TASK_ROLE_NAME"),
             m.get("PAI_TASK_INDEX"),
             m.get("GPU_ID"),
             pid)
 
-    return {"env": envs, "labels": labels, "pid": pid}
-
 def inspect(container_id, histogram, timeout):
     try:
         result = utils.exec_cmd(
diff --git a/src/job-exporter/src/main.py b/src/job-exporter/src/main.py
index 982c71179f..bd857b9a91 100644
--- a/src/job-exporter/src/main.py
+++ b/src/job-exporter/src/main.py
@@ -63,7 +63,12 @@ def config_environ():
             os.path.join(driver_path, "lib") + os.pathsep +
\ os.path.join(driver_path, "lib64") - logger.debug("LD_LIBRARY_PATH is %s", os.environ["LD_LIBRARY_PATH"]) + driver_bin_path = os.path.join(driver_path, "bin") + os.environ["PATH"] = os.environ["PATH"] + ":" + driver_bin_path + + logger.debug("LD_LIBRARY_PATH is %s, PATH is %s", + os.environ["LD_LIBRARY_PATH"], + os.environ["PATH"]) def try_remove_old_prom_file(path): @@ -82,16 +87,17 @@ def get_gpu_count(path): logger.debug("hostname is %s, ip is %s", hostname, ip) - with open(path) as f: - gpu_config = json.load(f) + if os.path.isfile(path): + with open(path) as f: + gpu_config = json.load(f) + + if hostname is not None and gpu_config["nodes"].get(hostname) is not None: + return gpu_config["nodes"][hostname]["gpuCount"] + elif ip is not None and gpu_config["nodes"].get(ip) is not None: + return gpu_config["nodes"][ip]["gpuCount"] - if hostname is not None and gpu_config["nodes"].get(hostname) is not None: - return gpu_config["nodes"][hostname]["gpuCount"] - elif ip is not None and gpu_config["nodes"].get(ip) is not None: - return gpu_config["nodes"][ip]["gpuCount"] - else: - logger.warning("failed to find gpu count from config %s", gpu_config) - return 0 + logger.warning("failed to find gpu count from config %s", path) + return 0 def register_stack_trace_dump(): diff --git a/src/job-exporter/src/nvidia.py b/src/job-exporter/src/nvidia.py index c9af57d4fe..aa0d3af261 100644 --- a/src/job-exporter/src/nvidia.py +++ b/src/job-exporter/src/nvidia.py @@ -162,11 +162,8 @@ def parse_smi_xml_result(smi): return result def nvidia_smi(histogram, timeout): - driver_path = os.environ["NV_DRIVER"] - bin_path = os.path.join(driver_path, "bin/nvidia-smi") - try: - smi_output = utils.exec_cmd([bin_path, "-q", "-x"], + smi_output = utils.exec_cmd(["nvidia-smi", "-q", "-x"], histogram=histogram, timeout=timeout) return parse_smi_xml_result(smi_output) diff --git a/src/job-exporter/test/data/dlts_docker_inspect.json b/src/job-exporter/test/data/dlts_docker_inspect.json new file mode 100644 index 0000000000..f7f114aa09 --- /dev/null +++ b/src/job-exporter/test/data/dlts_docker_inspect.json @@ -0,0 +1,403 @@ +[ + { + "Id": "e1a9cf8a0ad00a0d5bd2ea250c266ac48aa95940b83190433102a6b05675bf96", + "Created": "2019-04-18T06:31:21.961595409Z", + "Path": "bash", + "Args": [ + "/job/launch-0c435eee-d31f-43d5-a1b3-442845fa1d0c.sh" + ], + "State": { + "Status": "running", + "Running": true, + "Paused": false, + "Restarting": false, + "OOMKilled": false, + "Dead": false, + "Pid": 3533, + "ExitCode": 0, + "Error": "", + "StartedAt": "2019-04-18T06:31:36.178067796Z", + "FinishedAt": "0001-01-01T00:00:00Z" + }, + "Image": "sha256:ee0a1ed21fb27d93bff04b000841d28d77e767ba435a1fdb2033c5c1b373cf55", + "ResolvConfPath": "/etc/resolv.conf", + "HostnamePath": "/data/lib/docker/containers/82a8cc32dbd40766690f0955abf1f2d20db82077217898f85ef68dcf60232ad1/hostname", + "HostsPath": "/var/lib/kubelet/pods/611006c3-61a3-11e9-9957-000d3a1707fc/etc-hosts", + "LogPath": "/data/lib/docker/containers/e1a9cf8a0ad00a0d5bd2ea250c266ac48aa95940b83190433102a6b05675bf96/e1a9cf8a0ad00a0d5bd2ea250c266ac48aa95940b83190433102a6b05675bf96-json.log", + "Name": "/k8s_0c435eee-d31f-43d5-a1b3-442845fa1d0c_0c435eee-d31f-43d5-a1b3-442845fa1d0c_default_611006c3-61a3-11e9-9957-000d3a1707fc_0", + "RestartCount": 0, + "Driver": "overlay2", + "Platform": "linux", + "MountLabel": "", + "ProcessLabel": "", + "AppArmorProfile": "docker-default", + "ExecIDs": null, + "HostConfig": { + "Binds": [ + "/etc/resolv.conf:/etc/resolv.conf", + 
"/dlwsdata/work/dixu:/home/dixu", + "/dlwsdata/work/dixu/jobs/190418/0c435eee-d31f-43d5-a1b3-442845fa1d0c:/job", + "/dlwsdata/storage/imagenet:/data", + "/dlwsdata/work/dixu/.ssh:/home/dixu/.ssh:ro", + "/var/lib/kubelet/pods/611006c3-61a3-11e9-9957-000d3a1707fc/volumes/kubernetes.io~empty-dir/dshm:/dev/shm", + "/var/lib/kubelet/pods/611006c3-61a3-11e9-9957-000d3a1707fc/volumes/kubernetes.io~secret/default-token-xqzf9:/var/run/secrets/kubernetes.io/serviceaccount:ro", + "/var/lib/kubelet/pods/611006c3-61a3-11e9-9957-000d3a1707fc/etc-hosts:/etc/hosts", + "/var/lib/kubelet/pods/611006c3-61a3-11e9-9957-000d3a1707fc/containers/0c435eee-d31f-43d5-a1b3-442845fa1d0c/12a87052:/dev/termination-log" + ], + "ContainerIDFile": "", + "LogConfig": { + "Type": "json-file", + "Config": {} + }, + "NetworkMode": "container:82a8cc32dbd40766690f0955abf1f2d20db82077217898f85ef68dcf60232ad1", + "PortBindings": null, + "RestartPolicy": { + "Name": "", + "MaximumRetryCount": 0 + }, + "AutoRemove": false, + "VolumeDriver": "", + "VolumesFrom": null, + "CapAdd": null, + "CapDrop": null, + "Dns": null, + "DnsOptions": null, + "DnsSearch": null, + "ExtraHosts": null, + "GroupAdd": null, + "IpcMode": "container:82a8cc32dbd40766690f0955abf1f2d20db82077217898f85ef68dcf60232ad1", + "Cgroup": "", + "Links": null, + "OomScoreAdj": 999, + "PidMode": "", + "Privileged": false, + "PublishAllPorts": false, + "ReadonlyRootfs": false, + "SecurityOpt": [ + "seccomp=unconfined" + ], + "UTSMode": "", + "UsernsMode": "", + "ShmSize": 67108864, + "Runtime": "nvidia", + "ConsoleSize": [ + 0, + 0 + ], + "Isolation": "", + "CpuShares": 1024, + "Memory": 0, + "NanoCpus": 0, + "CgroupParent": "/kubepods/burstable/pod611006c3-61a3-11e9-9957-000d3a1707fc", + "BlkioWeight": 0, + "BlkioWeightDevice": null, + "BlkioDeviceReadBps": null, + "BlkioDeviceWriteBps": null, + "BlkioDeviceReadIOps": null, + "BlkioDeviceWriteIOps": null, + "CpuPeriod": 0, + "CpuQuota": 0, + "CpuRealtimePeriod": 0, + "CpuRealtimeRuntime": 0, + "CpusetCpus": "", + "CpusetMems": "", + "Devices": [], + "DeviceCgroupRules": null, + "DiskQuota": 0, + "KernelMemory": 0, + "MemoryReservation": 0, + "MemorySwap": 0, + "MemorySwappiness": null, + "OomKillDisable": false, + "PidsLimit": 0, + "Ulimits": null, + "CpuCount": 0, + "CpuPercent": 0, + "IOMaximumIOps": 0, + "IOMaximumBandwidth": 0, + "MaskedPaths": [ + "/proc/asound", + "/proc/acpi", + "/proc/kcore", + "/proc/keys", + "/proc/latency_stats", + "/proc/timer_list", + "/proc/timer_stats", + "/proc/sched_debug", + "/proc/scsi", + "/sys/firmware" + ], + "ReadonlyPaths": [ + "/proc/bus", + "/proc/fs", + "/proc/irq", + "/proc/sys", + "/proc/sysrq-trigger" + ] + }, + "GraphDriver": { + "Data": { + "LowerDir": 
"/data/lib/docker/overlay2/72151ab5ff423e77b3cd8b630184608d229cbb1f567a9bf7193116e5eecc5060-init/diff:/data/lib/docker/overlay2/acfef063aca95a1298a01d345bbd78582dd51bfb8786a16b5671e33eac96746f/diff:/data/lib/docker/overlay2/31f7d9efccb65a14dc23ce17b6dbac5eff1fac35bf97c965b34996d851ff6d1d/diff:/data/lib/docker/overlay2/e89ad2b850ed61f0bdc66c6292a6a8e14a76a4b3320831af563c91061e9fe7b6/diff:/data/lib/docker/overlay2/e243dd5be35a82b04b5e39fdaadbb7fa4ad769e06ae2205af0835fff522c193a/diff:/data/lib/docker/overlay2/ec6566e670ff00ef479a98400cee0ac36a88621928bfa713830dd7711cbcf62f/diff:/data/lib/docker/overlay2/af88915a503c89dd25206d355d30a13c75a49f27fbd1a3a190f18853d1365fc2/diff:/data/lib/docker/overlay2/6cc52eaee8a51b53a35a531b7f30506c44936fc8feec7c89fb7bc726989f50d9/diff:/data/lib/docker/overlay2/45b910fadb95ec6d08d39565bd2efb0c64f91f159620cbdf9896a2333c730d3f/diff:/data/lib/docker/overlay2/c58a45f6ccec97c062f6ac15c9a4fbfe1297008cb3c97266788161311e38ba66/diff:/data/lib/docker/overlay2/46d8b5a01a6e6c122fcfc6a6b6845a1baa8d2c6391414363be4ddf0ae136d377/diff:/data/lib/docker/overlay2/2fa00776b11abd79b67d3dc7d4ed6d34d0f8ea1e51e6ee5c1ee6c5939b2b4aa4/diff:/data/lib/docker/overlay2/82093c30847f81d398aad62ce3af0163ee7a3891b8c55c707a146ab69a600220/diff:/data/lib/docker/overlay2/447bedcd0ceac2f7f794fb01029bb81cbec0dcedc4e313b710a5ba6e8e9e5842/diff:/data/lib/docker/overlay2/a9820704ffc18f6ecb97ec03f15bdb0a5d63d39a4df9933dfd398338f5a57e94/diff:/data/lib/docker/overlay2/3c525d697511952e95d2536c6eb6fbc6020f0d2b1d78d025b8faa106d29dab17/diff:/data/lib/docker/overlay2/e1e79aacdabc9cb9c5929f2aec9c85f54022727a753b7558bca62daca86ba592/diff:/data/lib/docker/overlay2/2489dc456813d7b48bf1008d64846013103342b5d8b464424814dfd10253bdbf/diff:/data/lib/docker/overlay2/b00ae5899efef74b0aee509c24cf7b709f6a50a0bfbfe88c6c2f6f2cbe7b5127/diff:/data/lib/docker/overlay2/a902e9872075c97b4c54f2f0ae7d5d388db8b38780fa091e1f8e8a40e5266c8a/diff:/data/lib/docker/overlay2/df9a00088c7486e5bfa974c1576ac5a700e09d4ed601c4f177ce2c1d0bb37d97/diff:/data/lib/docker/overlay2/cd6fcfe7805a8d5adce5c36e65a753dda04f1932799905364707f8d320db8d44/diff:/data/lib/docker/overlay2/f521a15cf2aba16e21812e18138e978a9cddebb9018fb4d6d26a40e19cce80e4/diff:/data/lib/docker/overlay2/a6138afc2e6f51c7c7af8f5489511b8e2749fa4c998274acc9995af36f74241f/diff:/data/lib/docker/overlay2/b58eeaba82d11ef82379c1e85c1923d2ad8c35fcae1695f416bb6e4880380c19/diff:/data/lib/docker/overlay2/becb17f194415ece39e91426ae5182632fc13694d73bad414a8639238e5a5856/diff:/data/lib/docker/overlay2/d72cf469f5669609674441b2a99ebf9a438f1428ce0e76e0498c2e33ec3aa7ec/diff:/data/lib/docker/overlay2/4845d90ff2e9779e48edbec8d9637d7f0ea6ee9a4e514ca44fac397153e09bcc/diff:/data/lib/docker/overlay2/c732c2fa2c716850ffcb4a8d99a89b0cf98ec007212243fda43fe6e47a541e31/diff:/data/lib/docker/overlay2/4003f388d4b4bc5c6301fcdaccbda331e9f123566cdc511320ef7b0dd48e547e/diff:/data/lib/docker/overlay2/8d7c2ded6546c2357827ef63c02e7b9e6b0f513ec00a938a443382f3e77e0743/diff:/data/lib/docker/overlay2/fbf94ec74c2ef8899caa34cd6ceb8a24e40bfeacef1c8df2a7480005764a1bce/diff:/data/lib/docker/overlay2/60211c809b7f4d35c0c10542fbf2d98f3ab595e5344962bf639a6d4423450370/diff:/data/lib/docker/overlay2/c37768a7ff37fc09e74f405bc43346ceb93501a748cab3562de8739409760a57/diff:/data/lib/docker/overlay2/43d5c7c2cc0e5552ab6c1fc79884ec6c7ea8af8dbf0d409674e13501854da0fc/diff:/data/lib/docker/overlay2/0063d873e8bf59c501eae61344e82e8deeb7de78380d50e4b7da579967a0d7ed/diff:/data/lib/docker/overlay2/f18fb9fbdffc608879d0c3c0e7e94b0363d4e2f83908e69dd3b76e89123a4493/di
ff:/data/lib/docker/overlay2/bc17c791c97ac6f0ba7f8f04a5961d13c1f39972e1c9ce772d2e3e3ceaaf73a9/diff:/data/lib/docker/overlay2/5d075ef85806b1ad57f87cb36885b3213d217da28d08d9087948913b4b4832f9/diff:/data/lib/docker/overlay2/20a4a09494d2e22faf90cba06b47651e0abe2d1a83ed0350c6a9b1c550491521/diff:/data/lib/docker/overlay2/685bba86a073e670857261de47fd3d75afbf5881e68adb38222c1db505e726cd/diff:/data/lib/docker/overlay2/4861f6e118e3aca0b957bdb7c52f9696cfea158accc5e490e9ad4aa73b6e6aa4/diff:/data/lib/docker/overlay2/4c30c034d6d31b45e28df5566ff83138b2e9572cccb49551a3d8237240356261/diff:/data/lib/docker/overlay2/6162a88ed8598d54c4ea5c79e81bd714fd214b2c750843f40f3fd051b4d83a2a/diff:/data/lib/docker/overlay2/904d09c62c4419182145dc67d7dd485bbf89a728bff4e55c4eacf1bd10ae83de/diff:/data/lib/docker/overlay2/5f9010164b907678c212a2902c71d78a57825e37a4f39e4ad170d45c8a3a2f9b/diff:/data/lib/docker/overlay2/cfa6df1b93d137aaaac993e1511572914b6c4438f26ebd9c28a383ff57186f8b/diff:/data/lib/docker/overlay2/3d66bebd75c111c88688e69d54abd96c84d29962617a90ae4f96353233f925aa/diff:/data/lib/docker/overlay2/05b91a8fc50deabd6cea89b25c417e8319cc10b72a955c2cf593e4acd17750cf/diff:/data/lib/docker/overlay2/2353183a17aabdd994698f5ddf0df3e9fb46074eabeb8afbf3af52bdd9c81b8d/diff:/data/lib/docker/overlay2/4da6ded5ad62a760b2006258d9122e6fc5066ba9b575a543bab1925f86701cc1/diff:/data/lib/docker/overlay2/6a88dc2682e1aa07c0cbe800f1d70a99c624987b9f186f82162dd3d920ce8128/diff:/data/lib/docker/overlay2/6b288b7534b6fa6adb250f3e2d664e53fe486982a24a499277277e40507761a4/diff:/data/lib/docker/overlay2/444fb401e05a5ccc6649cdb85d99c5c44dc2648c0c2634912afc19df9bc151cc/diff:/data/lib/docker/overlay2/3e9737948414f3243702a9eebf2ba04883b588b1255e2f851c5909fbe502f058/diff:/data/lib/docker/overlay2/43e5859e24e69793cb76c88581ee99919c6f6ecdd1663d8e02bc56ab234dc0ac/diff:/data/lib/docker/overlay2/50271d3291362ca76fded64856fd01f20155d8cb3ecbd108d123a1a4750886a4/diff:/data/lib/docker/overlay2/adb42419536658b8dcc53737ae8fe57c834436824504ce4506dd37464241e6b4/diff:/data/lib/docker/overlay2/7276e287a6bc5d5ace8f812c796f667bc1fe3d1ec1c18de6b68620d76cc1027a/diff:/data/lib/docker/overlay2/32be844cd8996d63da23f3684745317e327779bc8ab305b4f848a49072d1becf/diff:/data/lib/docker/overlay2/b7fc0989966fb857aeefa8fbb35fa6ffae5175d9276c733f688ff98d87a3384e/diff:/data/lib/docker/overlay2/2130a7207cfbeac906cecfdb950df561186818c1f6971c938f07b0f301c68574/diff:/data/lib/docker/overlay2/0874cb839f1092d81a083700ac7955ce0927c72a96474b6adeecde88dfde4588/diff:/data/lib/docker/overlay2/f2863241711d278c92cddd981d1156118a1b6a1e4dc8162bbb0399f283c752c7/diff", + "MergedDir": "/data/lib/docker/overlay2/72151ab5ff423e77b3cd8b630184608d229cbb1f567a9bf7193116e5eecc5060/merged", + "UpperDir": "/data/lib/docker/overlay2/72151ab5ff423e77b3cd8b630184608d229cbb1f567a9bf7193116e5eecc5060/diff", + "WorkDir": "/data/lib/docker/overlay2/72151ab5ff423e77b3cd8b630184608d229cbb1f567a9bf7193116e5eecc5060/work" + }, + "Name": "overlay2" + }, + "Mounts": [ + { + "Type": "bind", + "Source": "/etc/resolv.conf", + "Destination": "/etc/resolv.conf", + "Mode": "", + "RW": true, + "Propagation": "rprivate" + }, + { + "Type": "bind", + "Source": "/dlwsdata/work/dixu/.ssh", + "Destination": "/home/dixu/.ssh", + "Mode": "ro", + "RW": false, + "Propagation": "rprivate" + }, + { + "Type": "bind", + "Source": "/var/lib/kubelet/pods/611006c3-61a3-11e9-9957-000d3a1707fc/volumes/kubernetes.io~empty-dir/dshm", + "Destination": "/dev/shm", + "Mode": "", + "RW": true, + "Propagation": "rprivate" + }, + { + "Type": "bind", + "Source": 
"/var/lib/kubelet/pods/611006c3-61a3-11e9-9957-000d3a1707fc/etc-hosts", + "Destination": "/etc/hosts", + "Mode": "", + "RW": true, + "Propagation": "rprivate" + }, + { + "Type": "bind", + "Source": "/dlwsdata/work/dixu", + "Destination": "/home/dixu", + "Mode": "", + "RW": true, + "Propagation": "rprivate" + }, + { + "Type": "bind", + "Source": "/dlwsdata/work/dixu/jobs/190418/0c435eee-d31f-43d5-a1b3-442845fa1d0c", + "Destination": "/job", + "Mode": "", + "RW": true, + "Propagation": "rprivate" + }, + { + "Type": "bind", + "Source": "/dlwsdata/storage/imagenet", + "Destination": "/data", + "Mode": "", + "RW": true, + "Propagation": "rprivate" + }, + { + "Type": "bind", + "Source": "/var/lib/kubelet/pods/611006c3-61a3-11e9-9957-000d3a1707fc/volumes/kubernetes.io~secret/default-token-xqzf9", + "Destination": "/var/run/secrets/kubernetes.io/serviceaccount", + "Mode": "ro", + "RW": false, + "Propagation": "rprivate" + }, + { + "Type": "bind", + "Source": "/var/lib/kubelet/pods/611006c3-61a3-11e9-9957-000d3a1707fc/containers/0c435eee-d31f-43d5-a1b3-442845fa1d0c/12a87052", + "Destination": "/dev/termination-log", + "Mode": "", + "RW": true, + "Propagation": "rprivate" + } + ], + "Config": { + "Hostname": "0c435eee-d31f-43d5-a1b3-442845fa1d0c", + "Domainname": "", + "User": "0", + "AttachStdin": false, + "AttachStdout": false, + "AttachStderr": false, + "ExposedPorts": { + "1280/tcp": {}, + "1443/tcp": {}, + "180/tcp": {}, + "6006/tcp": {}, + "8888/tcp": {} + }, + "Tty": false, + "OpenStdin": false, + "StdinOnce": false, + "Env": [ + "NVIDIA_VISIBLE_DEVICES=GPU-7c583998-b3ff-a885-8979-2d32d334cde4", + "DLWS_USER_NAME=dixu", + "JOB_ID=0c435eee-d31f-43d5-a1b3-442845fa1d0c", + "DLWS_JOB_ID=0c435eee-d31f-43d5-a1b3-442845fa1d0c", + "DLWS_NUM_GPU_PER_WORKER=1", + "POD_IP=10.2.48.5", + "LD_LIBRARY_PATH=/usr/local/nvidia/lib64/", + "FAMILY_TOKEN=98012454719049418c85ae134680583a", + "DLWS_REST_API=None", + "DLWS_NUM_WORKER=1", + "POD_NAME=0c435eee-d31f-43d5-a1b3-442845fa1d0c", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_8888_PORT_8888_TCP_PROTO=tcp", + "ZK_HEADLESS_PORT_3888_TCP_ADDR=10.3.1.100", + "METRICS_PROMETHEUS_SERVICE_PORT=9090", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_22_SERVICE_HOST=10.3.135.168", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_22_PORT_22_TCP_ADDR=10.3.135.168", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_8888_SERVICE_PORT=8888", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_8888_PORT=tcp://10.3.73.43:8888", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_8888_PORT_8888_TCP_PORT=8888", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_8888_SERVICE_PORT=8888", + "ZK_HEADLESS_PORT_2888_TCP_ADDR=10.3.1.100", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_22_PORT_22_TCP_ADDR=10.3.9.191", + "METRICS_PROMETHEUS_SERVICE_HOST=10.3.106.89", + "METRICS_PROMETHEUS_PORT_9090_TCP=tcp://10.3.106.89:9090", + "KUBERNETES_PORT_443_TCP_PROTO=tcp", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_8888_PORT_8888_TCP_PROTO=tcp", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_8888_PORT_8888_TCP_PORT=8888", + "ZK_HEADLESS_SERVICE_PORT_LEADER_ELECTION=3888", + "ZK_HEADLESS_SERVICE_PORT_CLIENT=2181", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_6006_PORT_6006_TCP_PROTO=tcp", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_6006_SERVICE_PORT_INTERACTIVE=6006", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_6006_PORT_6006_TCP=tcp://10.3.15.252:6006", + "METRICS_PROMETHEUS_PORT_9090_TCP_PORT=9090", + 
"INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_22_PORT=tcp://10.3.135.168:22", + "ZK_HEADLESS_SERVICE_PORT=2888", + "ZK_HEADLESS_PORT_2181_TCP=tcp://10.3.1.100:2181", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_6006_SERVICE_HOST=10.3.15.252", + "METRICS_PROMETHEUS_PORT=tcp://10.3.106.89:9090", + "KUBERNETES_PORT_443_TCP_ADDR=10.3.0.1", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_8888_PORT_8888_TCP=tcp://10.3.16.47:8888", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_6006_PORT_6006_TCP_PORT=6006", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_6006_PORT_6006_TCP_PORT=6006", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_22_PORT_22_TCP=tcp://10.3.135.168:22", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_8888_PORT_8888_TCP=tcp://10.3.73.43:8888", + "ZK_HEADLESS_PORT_2181_TCP_ADDR=10.3.1.100", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_6006_SERVICE_PORT=6006", + "KUBERNETES_SERVICE_HOST=10.3.0.1", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_8888_SERVICE_PORT_INTERACTIVE=8888", + "ZK_HEADLESS_PORT_2888_TCP=tcp://10.3.1.100:2888", + "ZK_HEADLESS_PORT_3888_TCP=tcp://10.3.1.100:3888", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_6006_SERVICE_PORT_INTERACTIVE=6006", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_22_PORT_22_TCP_PROTO=tcp", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_8888_PORT_8888_TCP_ADDR=10.3.73.43", + "KUBERNETES_PORT_443_TCP_PORT=443", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_8888_SERVICE_PORT_INTERACTIVE=8888", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_8888_PORT=tcp://10.3.16.47:8888", + "ZK_HEADLESS_PORT_2888_TCP_PORT=2888", + "ZK_HEADLESS_PORT_2181_TCP_PROTO=tcp", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_22_PORT_22_TCP_PORT=22", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_6006_PORT=tcp://10.3.154.214:6006", + "KUBERNETES_SERVICE_PORT_HTTPS=443", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_6006_PORT_6006_TCP_PROTO=tcp", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_22_PORT_22_TCP_PORT=22", + "ZK_HEADLESS_PORT=tcp://10.3.1.100:2888", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_6006_PORT_6006_TCP_ADDR=10.3.15.252", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_22_PORT=tcp://10.3.9.191:22", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_6006_PORT_6006_TCP_ADDR=10.3.154.214", + "KUBERNETES_SERVICE_PORT=443", + "ZK_HEADLESS_SERVICE_PORT_SERVER=2888", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_6006_PORT=tcp://10.3.15.252:6006", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_22_SERVICE_PORT=22", + "METRICS_PROMETHEUS_SERVICE_PORT_WEB=9090", + "METRICS_PROMETHEUS_PORT_9090_TCP_PROTO=tcp", + "KUBERNETES_PORT=tcp://10.3.0.1:443", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_8888_PORT_8888_TCP_ADDR=10.3.16.47", + "ZK_HEADLESS_SERVICE_HOST=10.3.1.100", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_6006_SERVICE_HOST=10.3.154.214", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_22_SERVICE_PORT=22", + "ZK_HEADLESS_PORT_3888_TCP_PROTO=tcp", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_6006_PORT_6006_TCP=tcp://10.3.154.214:6006", + "METRICS_PROMETHEUS_PORT_9090_TCP_ADDR=10.3.106.89", + "KUBERNETES_PORT_443_TCP=tcp://10.3.0.1:443", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_8888_SERVICE_HOST=10.3.16.47", + "ZK_HEADLESS_PORT_2888_TCP_PROTO=tcp", + "ZK_HEADLESS_PORT_3888_TCP_PORT=3888", + "ZK_HEADLESS_PORT_2181_TCP_PORT=2181", + 
"INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_6006_SERVICE_PORT=6006", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_22_PORT_22_TCP=tcp://10.3.9.191:22", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_22_SERVICE_HOST=10.3.9.191", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_22_SERVICE_PORT_INTERACTIVE=22", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_22_PORT_22_TCP_PROTO=tcp", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_22_SERVICE_PORT_INTERACTIVE=22", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_8888_SERVICE_HOST=10.3.73.43", + "PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "CUDA_VERSION=9.0.176", + "CUDA_PKG_VERSION=9-0=9.0.176-1", + "NVIDIA_DRIVER_CAPABILITIES=compute,utility", + "NVIDIA_REQUIRE_CUDA=cuda>=9.0", + "BAZEL_VERSION=0.11.0", + "CI_BUILD_PYTHON=python3", + "TF_NEED_CUDA=1", + "TF_CUDA_COMPUTE_CAPABILITIES=3.0,3.5,5.2,6.0,6.1", + "TF_CUDA_VERSION=9.0", + "TF_CUDNN_VERSION=7", + "APACHE_RUN_USER=www-data", + "APACHE_RUN_GROUP=www-data", + "APACHE_LOG_DIR=/var/log/apache2", + "DOTNET_CLI_TELEMETRY_OPTOUT=1" + ], + "Cmd": null, + "Healthcheck": { + "Test": [ + "NONE" + ] + }, + "Image": "dlws/tutorial-tensorflow@sha256:cf850a71c7d54bdf0e41f544d2661677919ec6f26ab833ef8a7b39ec0ef429d1", + "Volumes": null, + "WorkingDir": "/tensorflow", + "Entrypoint": [ + "bash", + "/job/launch-0c435eee-d31f-43d5-a1b3-442845fa1d0c.sh" + ], + "OnBuild": null, + "Labels": { + "annotation.io.kubernetes.container.hash": "789c3bdd", + "annotation.io.kubernetes.container.restartCount": "0", + "annotation.io.kubernetes.container.terminationMessagePath": "/dev/termination-log", + "annotation.io.kubernetes.container.terminationMessagePolicy": "File", + "annotation.io.kubernetes.pod.terminationGracePeriod": "30", + "com.nvidia.build.id": "63756748", + "com.nvidia.build.ref": "2b1c8edf8d79830ad811baff9630adb3bcb5db46", + "com.nvidia.cuda.version": "9.0.176", + "com.nvidia.volumes.needed": "nvidia_driver", + "io.kubernetes.container.logpath": "/var/log/pods/611006c3-61a3-11e9-9957-000d3a1707fc/0c435eee-d31f-43d5-a1b3-442845fa1d0c_0.log", + "io.kubernetes.container.name": "0c435eee-d31f-43d5-a1b3-442845fa1d0c", + "io.kubernetes.docker.type": "container", + "io.kubernetes.pod.name": "0c435eee-d31f-43d5-a1b3-442845fa1d0c", + "io.kubernetes.pod.namespace": "default", + "io.kubernetes.pod.uid": "611006c3-61a3-11e9-9957-000d3a1707fc", + "io.kubernetes.sandbox.id": "82a8cc32dbd40766690f0955abf1f2d20db82077217898f85ef68dcf60232ad1", + "maintainer": "Craig Citro " + } + }, + "NetworkSettings": { + "Bridge": "", + "SandboxID": "", + "HairpinMode": false, + "LinkLocalIPv6Address": "", + "LinkLocalIPv6PrefixLen": 0, + "Ports": {}, + "SandboxKey": "", + "SecondaryIPAddresses": null, + "SecondaryIPv6Addresses": null, + "EndpointID": "", + "Gateway": "", + "GlobalIPv6Address": "", + "GlobalIPv6PrefixLen": 0, + "IPAddress": "", + "IPPrefixLen": 0, + "IPv6Gateway": "", + "MacAddress": "", + "Networks": {} + } + } +] diff --git a/src/job-exporter/test/test_docker_inspect.py b/src/job-exporter/test/test_docker_inspect.py index e0a63574e8..454cfc2c9a 100644 --- a/src/job-exporter/test/test_docker_inspect.py +++ b/src/job-exporter/test/test_docker_inspect.py @@ -76,5 +76,20 @@ def test_parse_docker_inspect_BUGFIX(self): 30332) self.assertEqual(target_inspect_info, inspect_info) + def test_adapt_dlts_jobs(self): + sample_path = "data/dlts_docker_inspect.json" + with open(sample_path, "r") as f: + docker_inspect = f.read() 
+ + inspect_info = parse_docker_inspect(docker_inspect) + target_inspect_info = InspectResult( + "dixu", + "0c435eee-d31f-43d5-a1b3-442845fa1d0c", + None, + None, + "GPU-7c583998-b3ff-a885-8979-2d32d334cde4", + 3533) + self.assertEqual(target_inspect_info, inspect_info) + if __name__ == '__main__': unittest.main() diff --git a/src/watchdog/src/watchdog.py b/src/watchdog/src/watchdog.py index 55fc4fc829..70b65fb51d 100644 --- a/src/watchdog/src/watchdog.py +++ b/src/watchdog/src/watchdog.py @@ -28,6 +28,8 @@ import signal import faulthandler import gc +import re +import collections import yaml import prometheus_client @@ -55,28 +57,80 @@ list_pods_histogram = Histogram("k8s_api_list_pods_latency_seconds", "Response latency for list pods from k8s api (seconds)") +list_ns_histogram = Histogram("k8s_api_list_ns_latency_seconds", + "Response latency for list namespaces from k8s api (seconds)") + list_nodes_histogram = Histogram("k8s_api_list_nodes_latency_seconds", "Response latency for list nodes from k8s api (seconds)") def gen_pai_pod_gauge(): return GaugeMetricFamily("pai_pod_count", "count of pai pod", - labels=["service_name", "name", "phase", "host_ip", + labels=["service_name", "name", "namespace", "phase", "host_ip", "initialized", "pod_scheduled", "ready"]) def gen_pai_container_gauge(): return GaugeMetricFamily("pai_container_count", "count of container pod", - labels=["service_name", "pod_name", "name", "state", "host_ip", "ready"]) + labels=["service_name", "pod_name", "name", "namespace", "state", + "host_ip", "ready"]) def gen_pai_node_gauge(): return GaugeMetricFamily("pai_node_count", "count of pai node", - labels=["name", "disk_pressure", "memory_pressure", "out_of_disk", "ready"]) + labels=["name", "disk_pressure", "memory_pressure", "out_of_disk", "ready", "unschedulable"]) def gen_k8s_api_gauge(): return GaugeMetricFamily("k8s_api_server_count", "count of k8s api server", labels=["error", "host_ip"]) +def gen_k8s_node_gpu_available(): + return GaugeMetricFamily("k8s_node_gpu_available", "gpu available on k8s node", + labels=["host_ip"]) + +# reserved gpu means gpu not allocated to tasks and the node is being marked as +# unschedulable. 
+def gen_k8s_node_gpu_reserved(): + return GaugeMetricFamily("k8s_node_gpu_reserved", "gpu reserved on k8s node", + labels=["host_ip"]) + +def gen_k8s_node_gpu_total(): + return GaugeMetricFamily("k8s_node_gpu_total", "gpu total on k8s node", + labels=["host_ip"]) + ##### watchdog will generate above metrics +def walk_json_field_safe(obj, *fields): + """ for example a=[{"a": {"b": 2}}] + walk_json_field_safe(a, 0, "a", "b") will get 2 + walk_json_field_safe(a, 0, "not_exist") will get None + """ + try: + for f in fields: + obj = obj[f] + return obj + except: + return None + +def convert_to_byte(data): + data = data.lower() + number = float(re.findall(r"[0-9.]+", data)[0]) + if "t" in data: + return number * 10 ** 12 + elif "g" in data: + return number * 10 ** 9 + elif "m" in data: + return number * 10 ** 6 + elif "k" in data: + return number * 10 ** 3 + elif "ti" in data: + return number * 2 ** 40 + elif "gi" in data: + return number * 2 ** 30 + elif "mi" in data: + return number * 2 ** 20 + elif "ki" in data: + return number * 2 ** 10 + else: + return number + class AtomicRef(object): """ a thread safe way to store and get object, should not modify data get from this ref """ def __init__(self): @@ -120,16 +174,36 @@ def catch_exception(fn, msg, default, *args, **kwargs): logger.exception(msg) return default +class PodInfo(object): + def __init__(self, name, gpu): + self.name = name + self.gpu = gpu -def parse_pod_item(pai_pod_gauge, pai_container_gauge, pod): + def __repr__(self): + return "%s: %s" % (self.name, self.gpu) + +def parse_pod_item(pod, namespace, pai_pod_gauge, pai_container_gauge, pods_info): """ add metrics to pai_pod_gauge or pai_container_gauge if successfully paesed pod. Because we are parsing json outputed by k8s, its format is subjected to change, we should test if field exists before accessing it to avoid KeyError """ pod_name = pod["metadata"]["name"] + host_ip = walk_json_field_safe(pod, "status", "hostIP") or "unscheduled" + + used_gpu = 0 + containers = walk_json_field_safe(pod, "spec", "containers") + if containers is not None: + for container in containers: + req_gpu = int(walk_json_field_safe(container, "resources", "requests", + "nvidia.com/gpu") or 0) + limit_gpu = int(walk_json_field_safe(container, "resources", "limits", + "nvidia.com/gpu") or 0) + used_gpu += max(req_gpu, limit_gpu) + pods_info[host_ip].append(PodInfo(pod_name, used_gpu)) + labels = pod["metadata"].get("labels") if labels is None or "app" not in labels: - logger.warning("unkown pod %s", pod["metadata"]["name"]) + logger.info("unknown pod %s", pod["metadata"]["name"]) return None service_name = labels["app"] # get pai service name from label @@ -141,10 +215,6 @@ def parse_pod_item(pai_pod_gauge, pai_container_gauge, pod): else: phase = "unknown" - host_ip = "unscheduled" # can not specify None here, None will cause None exception - if status.get("hostIP") is not None: - host_ip = status["hostIP"] - initialized = pod_scheduled = ready = "unknown" conditions = status.get("conditions") @@ -163,7 +233,7 @@ def parse_pod_item(pai_pod_gauge, pai_container_gauge, pod): error_counter.labels(type="unknown_pod_cond").inc() logger.error("unexpected condition %s in pod %s", cond_t, pod_name) - pai_pod_gauge.add_metric([service_name, pod_name, phase, host_ip, + pai_pod_gauge.add_metric([service_name, pod_name, namespace, phase, host_ip, initialized, pod_scheduled, ready], 1) # generate pai_containers @@ -189,19 +259,21 @@ def parse_pod_item(pai_pod_gauge, pai_container_gauge, pod): container_state = 
list(state.keys())[0].lower() pai_container_gauge.add_metric([service_name, pod_name, container_name, - container_state, host_ip, str(ready).lower()], 1) + namespace, container_state, host_ip, str(ready).lower()], 1) - return pai_pod_gauge, pai_container_gauge - -def process_pods_status(pai_pod_gauge, pai_container_gauge, podsJsonObject): +def process_pods_status(pods_object, namespace, pai_pod_gauge, pai_container_gauge, + pods_info): def _map_fn(item): return catch_exception(parse_pod_item, "catch exception when parsing pod item", None, - pai_pod_gauge, pai_container_gauge, item) + item, + namespace, + pai_pod_gauge, pai_container_gauge, + pods_info) - list(map(_map_fn, podsJsonObject["items"])) + list(map(_map_fn, pods_object["items"])) def collect_healthz(gauge, histogram, scheme, address, port, url, ca_path, headers): @@ -217,52 +289,163 @@ def collect_healthz(gauge, histogram, scheme, address, port, url, ca_path, heade gauge.add_metric([error, address], 1) -def collect_k8s_component(k8s_gauge, api_server_scheme, api_server_ip, api_server_port, ca_path, headers): +def collect_k8s_component(api_server_scheme, api_server_ip, api_server_port, ca_path, headers): + k8s_gauge = gen_k8s_api_gauge() + collect_healthz(k8s_gauge, api_healthz_histogram, api_server_scheme, api_server_ip, api_server_port, "/healthz", ca_path, headers) -def parse_node_item(pai_node_gauge, node): - name = node["metadata"]["name"] + return [k8s_gauge] + + +def parse_node_item(node, pai_node_gauge, + node_gpu_avail, node_gpu_total, node_gpu_reserved, + pods_info): + + ip = None + + addresses = walk_json_field_safe(node, "status", "addresses") + if addresses is not None: + for addr in addresses: + if addr.get("type") == "InternalIP": + ip = addr.get("address") - disk_pressure = memory_pressure = out_of_disk = ready = "unknown" + if ip is None: + ip = node["metadata"]["name"] + + disk_pressure = memory_pressure = out_of_disk = ready = unschedulable = "unknown" if node.get("status") is not None: status = node["status"] - if status.get("conditions") is not None: - conditions = status["conditions"] - + conditions = walk_json_field_safe(status, "conditions") + if conditions is not None: for cond in conditions: cond_t = cond["type"] - status = cond["status"].lower() + node_status = cond["status"].lower() if cond_t == "DiskPressure": - disk_pressure = status + disk_pressure = node_status elif cond_t == "MemoryPressure": - memory_pressure = status + memory_pressure = node_status elif cond_t == "OutOfDisk": - out_of_disk = status + out_of_disk = node_status elif cond_t == "Ready": - ready = status + ready = node_status else: error_counter.labels(type="unknown_node_cond").inc() - logger.error("unexpected condition %s in node %s", cond_t, name) + logger.error("unexpected condition %s in node %s", cond_t, ip) + + # https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node/node-allocatable.md + # [Allocatable] = [Node Capacity] - [Kube-Reserved] - [System-Reserved] - [Hard-Eviction-Threshold] + total_gpu = 0 + + allocatable = walk_json_field_safe(status, "allocatable") + if allocatable is not None: + gpu1 = int(walk_json_field_safe(allocatable, "alpha.kubernetes.io/nvidia-gpu") or "0") + gpu2 = int(walk_json_field_safe(allocatable, "nvidia.com/gpu") or "0") + + total_gpu = max(gpu1, gpu2) + node_gpu_total.add_metric([ip], total_gpu) + else: + capacity = walk_json_field_safe(status, "capacity") + if capacity is not None: + gpu1 = int(walk_json_field_safe(capacity, "alpha.kubernetes.io/nvidia-gpu") or 
"0") + gpu2 = int(walk_json_field_safe(capacity, "nvidia.com/gpu") or "0") + total_gpu = max(gpu1. gpu2) + + node_gpu_total.add_metric([ip], total_gpu) + + # Because k8s api's node api do not record how much resource left for + # allocation, so we have to compute it ourselves. + used_gpu = 0 + + if pods_info.get(ip) is not None: + for pod in pods_info[ip]: + used_gpu += pod.gpu + + # if a node is marked as unschedulable, the available gpu will be 0 + # and reserved gpu will be `total - used` + if walk_json_field_safe(node, "spec", "unschedulable") != True: + node_gpu_avail.add_metric([ip], max(0, total_gpu - used_gpu)) + node_gpu_reserved.add_metric([ip], 0) + else: + node_gpu_avail.add_metric([ip], 0) + node_gpu_reserved.add_metric([ip], max(0, total_gpu - used_gpu)) + else: + logger.warning("unexpected structure of node %s: %s", ip, json.dumps(node)) + + unschedulable_s = walk_json_field_safe(node, "spec", "unschedulable") + if unschedulable_s is True: + unschedulable = "true" else: - logger.warning("unexpected structure of node %s: %s", name, json.dumps(node)) + unschedulable = "false" - pai_node_gauge.add_metric([name, disk_pressure, memory_pressure, out_of_disk, ready], 1) + pai_node_gauge.add_metric([ip, disk_pressure, memory_pressure, out_of_disk, ready, unschedulable], 1) - return pai_node_gauge +def process_nodes_status(nodes_object, pods_info): + pai_node_gauge = gen_pai_node_gauge() + node_gpu_avail = gen_k8s_node_gpu_available() + node_gpu_reserved = gen_k8s_node_gpu_reserved() + node_gpu_total = gen_k8s_node_gpu_total() -def process_nodes_status(pai_node_gauge, nodesJsonObject): def _map_fn(item): return catch_exception(parse_node_item, "catch exception when parsing node item", None, - pai_node_gauge, item) + item, + pai_node_gauge, + node_gpu_avail, + node_gpu_total, + node_gpu_reserved, + pods_info) + + list(map(_map_fn, nodes_object["items"])) + + return [pai_node_gauge, + node_gpu_avail, node_gpu_total, node_gpu_reserved] + + +def process_pods(k8s_api_addr, ca_path, headers, pods_info): + list_namespace_url = "{}/api/v1/namespaces".format(k8s_api_addr) + + ns_object = request_with_histogram(list_namespace_url, list_ns_histogram, + ca_path, headers) + + namespaces = [] + + ns_items = walk_json_field_safe(ns_object, "items") + if ns_items is not None: + for ns in ns_items: + ns_name = walk_json_field_safe(ns, "metadata", "name") + if ns_name is not None: + namespaces.append(ns_name) - list(map(_map_fn, nodesJsonObject["items"])) + pai_pod_gauge = gen_pai_pod_gauge() + pai_container_gauge = gen_pai_container_gauge() + + for ns in namespaces: + list_pods_url = "{}/api/v1/namespaces/{}/pods".format(k8s_api_addr, ns) + try: + pods_object = request_with_histogram(list_pods_url, list_pods_histogram, + ca_path, headers) + process_pods_status(pods_object, ns, pai_pod_gauge, pai_container_gauge, + pods_info) + except Exception as e: + error_counter.labels(type="parse").inc() + logger.exception("failed to process pods from namespace %s", ns) + + return [pai_pod_gauge, pai_container_gauge] + + +def process_nodes(k8s_api_addr, ca_path, headers, pods_info): + list_nodes_url = "{}/api/v1/nodes/".format(k8s_api_addr) + + nodes_object = request_with_histogram(list_nodes_url, list_nodes_histogram, + ca_path, headers) + + return process_nodes_status(nodes_object, pods_info) def load_machine_list(configFilePath): @@ -352,33 +535,21 @@ def loop(args, atomic_ref): bearer = bearer_file.read() headers = {'Authorization': "Bearer {}".format(bearer)} - list_pods_url = 
"{}/api/v1/namespaces/default/pods/".format(address) - list_nodes_url = "{}/api/v1/nodes/".format(address) - while True: - # these gauge is generate on each iteration - pai_pod_gauge = gen_pai_pod_gauge() - pai_container_gauge = gen_pai_container_gauge() - pai_node_gauge = gen_pai_node_gauge() - k8s_gauge = gen_k8s_api_gauge() - + result = [] try: - # 1. check service level status - podsStatus = request_with_histogram(list_pods_url, list_pods_histogram, ca_path, headers) - process_pods_status(pai_pod_gauge, pai_container_gauge, podsStatus) + pods_info = collections.defaultdict(lambda : []) + + result.extend(process_pods(address, ca_path, headers, pods_info)) - # 2. check nodes level status - nodes_status = request_with_histogram(list_nodes_url, list_nodes_histogram, ca_path, headers) - process_nodes_status(pai_node_gauge, nodes_status) + result.extend(process_nodes(address, ca_path, headers, pods_info)) - # 3. check k8s level status - collect_k8s_component(k8s_gauge, api_server_scheme, api_server_ip, api_server_port, ca_path, headers) + result.extend(collect_k8s_component(api_server_scheme, api_server_ip, api_server_port, ca_path, headers)) except Exception as e: error_counter.labels(type="unknown").inc() logger.exception("watchdog failed in one iteration") - atomic_ref.get_and_set([pai_pod_gauge, pai_container_gauge, pai_node_gauge, - k8s_gauge]) + atomic_ref.get_and_set(result) time.sleep(float(args.interval)) diff --git a/src/watchdog/test/data/dlws_nodes_list.json b/src/watchdog/test/data/dlws_nodes_list.json new file mode 100644 index 0000000000..433053d071 --- /dev/null +++ b/src/watchdog/test/data/dlws_nodes_list.json @@ -0,0 +1,158 @@ +{ + "kind": "NodeList", + "apiVersion": "v1", + "metadata": { + "selfLink": "/api/v1/nodes", + "resourceVersion": "6263195" + }, + "items": [ + { + "metadata": { + "name": "dltsp40-infra01", + "selfLink": "/api/v1/nodes/dltsp40-infra01", + "uid": "13334d00-4cc9-11e9-9957-000d3a1707fc", + "resourceVersion": "6263192", + "creationTimestamp": "2019-03-22T17:36:50Z", + "labels": { + "kubernetes.io/hostname": "dltsp40-infra01", + "yarnrm2": "active", + "freeflowrouter": "active", + "elasticsearch": "active", + "hdfsdatanode": "active", + "yarnrm1": "active", + "collectd-node-agent": "active", + "detectron": "active", + "fluentd-es-config-v0.1.0": "active", + "grafana": "active", + "hdfsjournal": "active", + "hdfsstandby": "active", + "journalnode": "active", + "nginx": "active", + "cloud-fluentd-es-v2.0.2": "active", + "webportal": "active", + "google-cadvisor": "active", + "hdfsnn1": "active", + "kibana": "active", + "namenode1": "active", + "sparknode": "active", + "all": "active", + "infrastructure": "active", + "nvidia-device-plugin-daemonset": "active", + "nvidiaheartbeat": "active", + "recogserver": "active", + "restfulapi": "active", + "yarnnodemanager": "active", + "zk": "active", + "datanode": "active", + "zookeeper": "active", + "default": "active", + "elasticsearch-logging": "active", + "jobmanager": "active", + "mysql": "active", + "zk-config": "active", + "cloud-collectd-node-agent": "active", + "beta.kubernetes.io/os": "linux", + "cloud-fluentd-es-config-v0.1.0": "active", + "fluentd-es-v2.0.2": "active", + "hdfsformat": "active", + "hdfsnn2": "active", + "zk-headless": "active", + "FragmentGPUJob": "active", + "dlws-grafana": "active", + "influxdb": "active", + "beta.kubernetes.io/arch": "amd64" + }, + "annotations": { + "node.alpha.kubernetes.io/ttl": "0", + "volumes.kubernetes.io/controller-managed-attach-detach": "true" + } + }, + 
"spec": { + "externalID": "dltsp40-infra01", + "providerID": "aztools://dltsp40-infra01", + "taints": [ + { + "key": "node-role.kubernetes.io/master", + "effect": "NoSchedule" + } + ] + }, + "status": { + "capacity": { + "alpha.kubernetes.io/nvidia-gpu": "4", + "cpu": "16", + "memory": "57709692Ki", + "pods": "110" + }, + "allocatable": { + "cpu": "16", + "memory": "57607292Ki", + "pods": "110", + "alpha.kubernetes.io/nvidia-gpu": "4" + }, + "conditions": [ + { + "type": "OutOfDisk", + "status": "False", + "lastHeartbeatTime": "2019-04-18T08:44:11Z", + "lastTransitionTime": "2019-03-22T17:36:49Z", + "reason": "KubeletHasSufficientDisk", + "message": "kubelet has sufficient disk space available" + }, + { + "type": "MemoryPressure", + "status": "False", + "lastHeartbeatTime": "2019-04-18T08:44:11Z", + "lastTransitionTime": "2019-03-22T17:36:49Z", + "reason": "KubeletHasSufficientMemory", + "message": "kubelet has sufficient memory available" + }, + { + "type": "DiskPressure", + "status": "False", + "lastHeartbeatTime": "2019-04-18T08:44:11Z", + "lastTransitionTime": "2019-03-22T17:36:49Z", + "reason": "KubeletHasNoDiskPressure", + "message": "kubelet has no disk pressure" + }, + { + "type": "Ready", + "status": "True", + "lastHeartbeatTime": "2019-04-18T08:44:11Z", + "lastTransitionTime": "2019-03-22T17:37:10Z", + "reason": "KubeletReady", + "message": "kubelet is posting ready status. AppArmor enabled" + } + ], + "addresses": [ + { + "type": "InternalIP", + "address": "192.168.255.1" + }, + { + "type": "Hostname", + "address": "dltsp40-infra01" + } + ], + "daemonEndpoints": { + "kubeletEndpoint": { + "Port": 10250 + } + }, + "nodeInfo": { + "machineID": "0fcc50243f694b94b64db96ab895ee7c", + "systemUUID": "29e4606f-2997-0345-b916-11050543c01b", + "bootID": "32861647-ea09-44c7-82dd-38c92a831ebf", + "kernelVersion": "4.18.0-1013-azure", + "osImage": "Ubuntu 18.04.2 LTS", + "containerRuntimeVersion": "docker://18.9.3", + "kubeletVersion": "v1.9.0", + "kubeProxyVersion": "v1.9.0", + "operatingSystem": "linux", + "architecture": "amd64" + }, + "images": [] + } + } + ] +} diff --git a/src/watchdog/test/data/dlws_nodes_list_with_unschedulable.json b/src/watchdog/test/data/dlws_nodes_list_with_unschedulable.json new file mode 100644 index 0000000000..dc70851fe1 --- /dev/null +++ b/src/watchdog/test/data/dlws_nodes_list_with_unschedulable.json @@ -0,0 +1,159 @@ +{ + "kind": "NodeList", + "apiVersion": "v1", + "metadata": { + "selfLink": "/api/v1/nodes", + "resourceVersion": "6263195" + }, + "items": [ + { + "metadata": { + "name": "dltsp40-infra01", + "selfLink": "/api/v1/nodes/dltsp40-infra01", + "uid": "13334d00-4cc9-11e9-9957-000d3a1707fc", + "resourceVersion": "6263192", + "creationTimestamp": "2019-03-22T17:36:50Z", + "labels": { + "kubernetes.io/hostname": "dltsp40-infra01", + "yarnrm2": "active", + "freeflowrouter": "active", + "elasticsearch": "active", + "hdfsdatanode": "active", + "yarnrm1": "active", + "collectd-node-agent": "active", + "detectron": "active", + "fluentd-es-config-v0.1.0": "active", + "grafana": "active", + "hdfsjournal": "active", + "hdfsstandby": "active", + "journalnode": "active", + "nginx": "active", + "cloud-fluentd-es-v2.0.2": "active", + "webportal": "active", + "google-cadvisor": "active", + "hdfsnn1": "active", + "kibana": "active", + "namenode1": "active", + "sparknode": "active", + "all": "active", + "infrastructure": "active", + "nvidia-device-plugin-daemonset": "active", + "nvidiaheartbeat": "active", + "recogserver": "active", + "restfulapi": "active", + 
"yarnnodemanager": "active", + "zk": "active", + "datanode": "active", + "zookeeper": "active", + "default": "active", + "elasticsearch-logging": "active", + "jobmanager": "active", + "mysql": "active", + "zk-config": "active", + "cloud-collectd-node-agent": "active", + "beta.kubernetes.io/os": "linux", + "cloud-fluentd-es-config-v0.1.0": "active", + "fluentd-es-v2.0.2": "active", + "hdfsformat": "active", + "hdfsnn2": "active", + "zk-headless": "active", + "FragmentGPUJob": "active", + "dlws-grafana": "active", + "influxdb": "active", + "beta.kubernetes.io/arch": "amd64" + }, + "annotations": { + "node.alpha.kubernetes.io/ttl": "0", + "volumes.kubernetes.io/controller-managed-attach-detach": "true" + } + }, + "spec": { + "externalID": "dltsp40-infra01", + "providerID": "aztools://dltsp40-infra01", + "unschedulable": true, + "taints": [ + { + "key": "node-role.kubernetes.io/master", + "effect": "NoSchedule" + } + ] + }, + "status": { + "capacity": { + "alpha.kubernetes.io/nvidia-gpu": "4", + "cpu": "16", + "memory": "57709692Ki", + "pods": "110" + }, + "allocatable": { + "cpu": "16", + "memory": "57607292Ki", + "pods": "110", + "alpha.kubernetes.io/nvidia-gpu": "4" + }, + "conditions": [ + { + "type": "OutOfDisk", + "status": "False", + "lastHeartbeatTime": "2019-04-18T08:44:11Z", + "lastTransitionTime": "2019-03-22T17:36:49Z", + "reason": "KubeletHasSufficientDisk", + "message": "kubelet has sufficient disk space available" + }, + { + "type": "MemoryPressure", + "status": "False", + "lastHeartbeatTime": "2019-04-18T08:44:11Z", + "lastTransitionTime": "2019-03-22T17:36:49Z", + "reason": "KubeletHasSufficientMemory", + "message": "kubelet has sufficient memory available" + }, + { + "type": "DiskPressure", + "status": "False", + "lastHeartbeatTime": "2019-04-18T08:44:11Z", + "lastTransitionTime": "2019-03-22T17:36:49Z", + "reason": "KubeletHasNoDiskPressure", + "message": "kubelet has no disk pressure" + }, + { + "type": "Ready", + "status": "True", + "lastHeartbeatTime": "2019-04-18T08:44:11Z", + "lastTransitionTime": "2019-03-22T17:37:10Z", + "reason": "KubeletReady", + "message": "kubelet is posting ready status. 
AppArmor enabled" + } + ], + "addresses": [ + { + "type": "InternalIP", + "address": "192.168.255.1" + }, + { + "type": "Hostname", + "address": "dltsp40-infra01" + } + ], + "daemonEndpoints": { + "kubeletEndpoint": { + "Port": 10250 + } + }, + "nodeInfo": { + "machineID": "0fcc50243f694b94b64db96ab895ee7c", + "systemUUID": "29e4606f-2997-0345-b916-11050543c01b", + "bootID": "32861647-ea09-44c7-82dd-38c92a831ebf", + "kernelVersion": "4.18.0-1013-azure", + "osImage": "Ubuntu 18.04.2 LTS", + "containerRuntimeVersion": "docker://18.9.3", + "kubeletVersion": "v1.9.0", + "kubeProxyVersion": "v1.9.0", + "operatingSystem": "linux", + "architecture": "amd64" + }, + "images": [] + } + } + ] +} diff --git a/src/watchdog/test/data/nodes_list.json b/src/watchdog/test/data/nodes_list.json index 18ae5acc82..38bce834c5 100644 --- a/src/watchdog/test/data/nodes_list.json +++ b/src/watchdog/test/data/nodes_list.json @@ -18,6 +18,18 @@ "externalID": "10.151.40.4" }, "status": { + "capacity": { + "alpha.kubernetes.io/nvidia-gpu": "0", + "cpu": "16", + "memory": "57709692Ki", + "pods": "110" + }, + "allocatable": { + "cpu": "16", + "memory": "57607292Ki", + "pods": "110", + "alpha.kubernetes.io/nvidia-gpu": "0" + }, "conditions": [ { "type": "OutOfDisk", diff --git a/src/watchdog/test/test_watchdog.py b/src/watchdog/test/test_watchdog.py index 27c6e84055..3beadfbe2a 100644 --- a/src/watchdog/test/test_watchdog.py +++ b/src/watchdog/test/test_watchdog.py @@ -22,6 +22,7 @@ import json import logging import logging.config +import collections import prometheus_client @@ -62,10 +63,12 @@ def get_data_test_input(self, path): def test_parse_pods_status(self): obj = json.loads(self.get_data_test_input("data/pods_list.json")) - pod_gauge = watchdog.gen_pai_node_gauge() + pod_gauge = watchdog.gen_pai_pod_gauge() container_gauge = watchdog.gen_pai_container_gauge() + pod_info = collections.defaultdict(lambda : []) - watchdog.process_pods_status(pod_gauge, container_gauge, obj) + watchdog.process_pods_status(obj, "default", + pod_gauge, container_gauge, pod_info) self.assertTrue(len(pod_gauge.samples) > 0) self.assertTrue(len(container_gauge.samples) > 0) @@ -73,19 +76,22 @@ def test_parse_pods_status(self): def test_process_nodes_status(self): obj = json.loads(self.get_data_test_input("data/nodes_list.json")) - gauge = watchdog.gen_pai_node_gauge() + gauges = watchdog.process_nodes_status(obj, {}) - watchdog.process_nodes_status(gauge, obj) + self.assertTrue(len(gauges) == 4) - self.assertTrue(len(gauge.samples) > 0) + for gauge in gauges: + self.assertTrue(len(gauge.samples) > 0) def test_process_pods_with_no_condition(self): obj = json.loads(self.get_data_test_input("data/no_condtion_pod.json")) pod_gauge = watchdog.gen_pai_pod_gauge() container_gauge = watchdog.gen_pai_container_gauge() + pod_info = collections.defaultdict(lambda : []) - watchdog.process_pods_status(pod_gauge, container_gauge, obj) + watchdog.process_pods_status(obj, "default", + pod_gauge, container_gauge, pod_info) self.assertTrue(len(pod_gauge.samples) > 0) self.assertEqual(0, len(container_gauge.samples)) @@ -97,8 +103,10 @@ class CustomCollector(object): def collect(self): pod_gauge = watchdog.gen_pai_pod_gauge() container_gauge = watchdog.gen_pai_container_gauge() + pod_info = collections.defaultdict(lambda : []) - watchdog.process_pods_status(pod_gauge, container_gauge, obj) + watchdog.process_pods_status(obj, "default", + pod_gauge, container_gauge, pod_info) yield pod_gauge yield container_gauge @@ -108,5 +116,58 @@ def collect(self): # 
expect no exception
         prometheus_client.write_to_textfile("/tmp/test_watchdog.prom", registry)
 
+    def test_process_dlws_nodes_status(self):
+        obj = json.loads(self.get_data_test_input("data/dlws_nodes_list.json"))
+
+        pod_info = collections.defaultdict(lambda : [])
+        pod_info["192.168.255.1"].append(watchdog.PodInfo("job1", 2))
+        gauges = watchdog.process_nodes_status(obj, pod_info)
+
+        self.assertTrue(len(gauges) == 4)
+
+        self.assertEqual("k8s_node_gpu_available", gauges[1].name)
+        self.assertEqual(1, len(gauges[1].samples))
+        self.assertEqual(2, gauges[1].samples[0].value)
+        self.assertEqual("k8s_node_gpu_total", gauges[2].name)
+        self.assertEqual(1, len(gauges[2].samples))
+        self.assertEqual(4, gauges[2].samples[0].value)
+        self.assertEqual("k8s_node_gpu_reserved", gauges[3].name)
+        self.assertEqual(1, len(gauges[3].samples))
+        self.assertEqual(0, gauges[3].samples[0].value)
+
+        for gauge in gauges:
+            self.assertTrue(len(gauge.samples) > 0)
+
+        for gauge in gauges[1:]:
+            self.assertEqual("192.168.255.1", gauge.samples[0].labels["host_ip"])
+
+    def test_process_dlws_nodes_status_with_unschedulable(self):
+        obj = json.loads(self.get_data_test_input("data/dlws_nodes_list_with_unschedulable.json"))
+
+        pod_info = collections.defaultdict(lambda : [])
+        pod_info["192.168.255.1"].append(watchdog.PodInfo("job1", 2))
+        gauges = watchdog.process_nodes_status(obj, pod_info)
+
+        self.assertTrue(len(gauges) == 4)
+
+        self.assertEqual("pai_node_count", gauges[0].name)
+        self.assertEqual(1, len(gauges[0].samples))
+        self.assertEqual("true", gauges[0].samples[0].labels["unschedulable"])
+        self.assertEqual("k8s_node_gpu_available", gauges[1].name)
+        self.assertEqual(1, len(gauges[1].samples))
+        self.assertEqual(0, gauges[1].samples[0].value)
+        self.assertEqual("k8s_node_gpu_total", gauges[2].name)
+        self.assertEqual(1, len(gauges[2].samples))
+        self.assertEqual(4, gauges[2].samples[0].value)
+        self.assertEqual("k8s_node_gpu_reserved", gauges[3].name)
+        self.assertEqual(1, len(gauges[3].samples))
+        self.assertEqual(2, gauges[3].samples[0].value)
+
+        for gauge in gauges:
+            self.assertTrue(len(gauge.samples) > 0)
+
+        for gauge in gauges[1:]:
+            self.assertEqual("192.168.255.1", gauge.samples[0].labels["host_ip"])
+
 if __name__ == '__main__':
     unittest.main()