
Commit

emit reserved gpu
xudifsd committed Apr 28, 2019
1 parent c5ea05a commit 9f6b9f2
Showing 3 changed files with 207 additions and 5 deletions.
21 changes: 18 additions & 3 deletions src/watchdog/src/watchdog.py
@@ -85,6 +85,12 @@ def gen_k8s_node_gpu_available():
return GaugeMetricFamily("k8s_node_gpu_available", "gpu available on k8s node",
labels=["host_ip"])

# reserved gpu means gpu that is not allocated to any task while the node is
# marked as unschedulable.
def gen_k8s_node_gpu_reserved():
return GaugeMetricFamily("k8s_node_gpu_reserved", "gpu reserved on k8s node",
labels=["host_ip"])

def gen_k8s_node_gpu_total():
return GaugeMetricFamily("k8s_node_gpu_total", "gpu total on k8s node",
labels=["host_ip"])
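
For readers unfamiliar with the metrics library, the gauge generators above use prometheus_client's GaugeMetricFamily. Below is a minimal sketch of how the new reserved gauge is populated and scraped, with an illustrative host IP and GPU count (not values from this commit):

# Minimal, illustrative sketch of the new gauge; host IP and value are made up.
from prometheus_client.core import GaugeMetricFamily

reserved = GaugeMetricFamily("k8s_node_gpu_reserved",
                             "gpu reserved on k8s node",
                             labels=["host_ip"])
reserved.add_metric(["192.168.255.1"], 2)  # 2 GPUs reserved on one node

# With recent prometheus_client versions each sample is a namedtuple carrying
# the metric name, the label dict, and the value.
for sample in reserved.samples:
    print(sample.name, sample.labels, sample.value)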
@@ -293,7 +299,7 @@ def collect_k8s_component(api_server_scheme, api_server_ip, api_server_port, ca_


def parse_node_item(node, pai_node_gauge,
node_gpu_avail, node_gpu_total,
node_gpu_avail, node_gpu_total, node_gpu_reserved,
pods_info):

ip = None
@@ -358,7 +364,14 @@ def parse_node_item(node, pai_node_gauge,
for pod in pods_info[ip]:
used_gpu += pod.gpu

node_gpu_avail.add_metric([ip], max(0, total_gpu - used_gpu))
# if a node is marked as unschedulable, the available gpu will be 0
# and reserved gpu will be `total - used`
if walk_json_field_safe(node, "spec", "unschedulable") != True:
node_gpu_avail.add_metric([ip], max(0, total_gpu - used_gpu))
node_gpu_reserved.add_metric([ip], 0)
else:
node_gpu_avail.add_metric([ip], 0)
node_gpu_reserved.add_metric([ip], max(0, total_gpu - used_gpu))
else:
logger.warning("unexpected structure of node %s: %s", ip, json.dumps(node))
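
To make the new branch concrete: with the numbers used by the unit test added in this commit (4 GPUs total, 2 in use), a cordoned node reports 0 available and 2 reserved GPUs. A small sketch of that arithmetic, where safe_get is a simplified, hypothetical stand-in for walk_json_field_safe:

# Illustrative sketch of the available/reserved split; safe_get is a
# hypothetical stand-in for walk_json_field_safe, written only for this example.
def safe_get(obj, *fields):
    for field in fields:
        if not isinstance(obj, dict) or field not in obj:
            return None
        obj = obj[field]
    return obj

node = {"spec": {"unschedulable": True}}  # mirrors the cordoned-node fixture below
total_gpu, used_gpu = 4, 2                # numbers used by the new unit test

if safe_get(node, "spec", "unschedulable") != True:
    available, reserved = max(0, total_gpu - used_gpu), 0
else:
    available, reserved = 0, max(0, total_gpu - used_gpu)

assert (available, reserved) == (0, 2)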

@@ -368,6 +381,7 @@
def process_nodes_status(nodes_object, pods_info):
pai_node_gauge = gen_pai_node_gauge()
node_gpu_avail = gen_k8s_node_gpu_available()
node_gpu_reserved = gen_k8s_node_gpu_reserved()
node_gpu_total = gen_k8s_node_gpu_total()

def _map_fn(item):
@@ -378,12 +392,13 @@ def _map_fn(item):
pai_node_gauge,
node_gpu_avail,
node_gpu_total,
node_gpu_reserved,
pods_info)

list(map(_map_fn, nodes_object["items"]))

return [pai_node_gauge,
node_gpu_avail, node_gpu_total]
node_gpu_avail, node_gpu_total, node_gpu_reserved]


def process_pods(k8s_api_addr, ca_path, headers, pods_info):
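The diff does not show how the gauge list returned by process_nodes_status reaches the exporter. As a rough, hypothetical sketch only (not the actual watchdog wiring), such a list of metric families can be served through a prometheus_client custom collector:

# Hypothetical wiring sketch, not taken from this commit: expose a list of
# metric families (like the one returned by process_nodes_status) via a
# custom collector.
from prometheus_client import start_http_server
from prometheus_client.core import REGISTRY

class NodeGpuCollector(object):
    def __init__(self, get_families):
        # get_families is a callable returning a list of metric families,
        # e.g. lambda: process_nodes_status(nodes_object, pods_info)
        self.get_families = get_families

    def collect(self):
        for family in self.get_families():
            yield family

# REGISTRY.register(NodeGpuCollector(get_families))
# start_http_server(9101)  # port number is illustrative
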
159 changes: 159 additions & 0 deletions src/watchdog/test/data/dlws_nodes_list_with_unschedulable.json
@@ -0,0 +1,159 @@
{
"kind": "NodeList",
"apiVersion": "v1",
"metadata": {
"selfLink": "/api/v1/nodes",
"resourceVersion": "6263195"
},
"items": [
{
"metadata": {
"name": "dltsp40-infra01",
"selfLink": "/api/v1/nodes/dltsp40-infra01",
"uid": "13334d00-4cc9-11e9-9957-000d3a1707fc",
"resourceVersion": "6263192",
"creationTimestamp": "2019-03-22T17:36:50Z",
"labels": {
"kubernetes.io/hostname": "dltsp40-infra01",
"yarnrm2": "active",
"freeflowrouter": "active",
"elasticsearch": "active",
"hdfsdatanode": "active",
"yarnrm1": "active",
"collectd-node-agent": "active",
"detectron": "active",
"fluentd-es-config-v0.1.0": "active",
"grafana": "active",
"hdfsjournal": "active",
"hdfsstandby": "active",
"journalnode": "active",
"nginx": "active",
"cloud-fluentd-es-v2.0.2": "active",
"webportal": "active",
"google-cadvisor": "active",
"hdfsnn1": "active",
"kibana": "active",
"namenode1": "active",
"sparknode": "active",
"all": "active",
"infrastructure": "active",
"nvidia-device-plugin-daemonset": "active",
"nvidiaheartbeat": "active",
"recogserver": "active",
"restfulapi": "active",
"yarnnodemanager": "active",
"zk": "active",
"datanode": "active",
"zookeeper": "active",
"default": "active",
"elasticsearch-logging": "active",
"jobmanager": "active",
"mysql": "active",
"zk-config": "active",
"cloud-collectd-node-agent": "active",
"beta.kubernetes.io/os": "linux",
"cloud-fluentd-es-config-v0.1.0": "active",
"fluentd-es-v2.0.2": "active",
"hdfsformat": "active",
"hdfsnn2": "active",
"zk-headless": "active",
"FragmentGPUJob": "active",
"dlws-grafana": "active",
"influxdb": "active",
"beta.kubernetes.io/arch": "amd64"
},
"annotations": {
"node.alpha.kubernetes.io/ttl": "0",
"volumes.kubernetes.io/controller-managed-attach-detach": "true"
}
},
"spec": {
"externalID": "dltsp40-infra01",
"providerID": "aztools://dltsp40-infra01",
"unschedulable": true,
"taints": [
{
"key": "node-role.kubernetes.io/master",
"effect": "NoSchedule"
}
]
},
"status": {
"capacity": {
"alpha.kubernetes.io/nvidia-gpu": "4",
"cpu": "16",
"memory": "57709692Ki",
"pods": "110"
},
"allocatable": {
"cpu": "16",
"memory": "57607292Ki",
"pods": "110",
"alpha.kubernetes.io/nvidia-gpu": "4"
},
"conditions": [
{
"type": "OutOfDisk",
"status": "False",
"lastHeartbeatTime": "2019-04-18T08:44:11Z",
"lastTransitionTime": "2019-03-22T17:36:49Z",
"reason": "KubeletHasSufficientDisk",
"message": "kubelet has sufficient disk space available"
},
{
"type": "MemoryPressure",
"status": "False",
"lastHeartbeatTime": "2019-04-18T08:44:11Z",
"lastTransitionTime": "2019-03-22T17:36:49Z",
"reason": "KubeletHasSufficientMemory",
"message": "kubelet has sufficient memory available"
},
{
"type": "DiskPressure",
"status": "False",
"lastHeartbeatTime": "2019-04-18T08:44:11Z",
"lastTransitionTime": "2019-03-22T17:36:49Z",
"reason": "KubeletHasNoDiskPressure",
"message": "kubelet has no disk pressure"
},
{
"type": "Ready",
"status": "True",
"lastHeartbeatTime": "2019-04-18T08:44:11Z",
"lastTransitionTime": "2019-03-22T17:37:10Z",
"reason": "KubeletReady",
"message": "kubelet is posting ready status. AppArmor enabled"
}
],
"addresses": [
{
"type": "InternalIP",
"address": "192.168.255.1"
},
{
"type": "Hostname",
"address": "dltsp40-infra01"
}
],
"daemonEndpoints": {
"kubeletEndpoint": {
"Port": 10250
}
},
"nodeInfo": {
"machineID": "0fcc50243f694b94b64db96ab895ee7c",
"systemUUID": "29e4606f-2997-0345-b916-11050543c01b",
"bootID": "32861647-ea09-44c7-82dd-38c92a831ebf",
"kernelVersion": "4.18.0-1013-azure",
"osImage": "Ubuntu 18.04.2 LTS",
"containerRuntimeVersion": "docker://18.9.3",
"kubeletVersion": "v1.9.0",
"kubeProxyVersion": "v1.9.0",
"operatingSystem": "linux",
"architecture": "amd64"
},
"images": []
}
}
]
}
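
The parts of this fixture that the new logic appears to consume are spec.unschedulable, the node's GPU count (capacity and allocatable both report 4 here; which one the watchdog reads is not visible in this diff), and the InternalIP address. A short hedged sketch of pulling those fields out of the fixture, using the same relative path as the test below:

# Hedged sketch: read the fixture fields that drive the reserved-GPU metric.
import json

with open("data/dlws_nodes_list_with_unschedulable.json") as f:
    node = json.load(f)["items"][0]

unschedulable = node["spec"].get("unschedulable", False)  # True in this fixture
gpu_total = int(node["status"]["allocatable"]["alpha.kubernetes.io/nvidia-gpu"])  # 4
host_ip = next(addr["address"] for addr in node["status"]["addresses"]
               if addr["type"] == "InternalIP")            # "192.168.255.1"

print(unschedulable, gpu_total, host_ip)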
32 changes: 30 additions & 2 deletions src/watchdog/test/test_watchdog.py
@@ -78,7 +78,7 @@ def test_process_nodes_status(self):

gauges = watchdog.process_nodes_status(obj, {})

self.assertTrue(len(gauges) == 3)
self.assertTrue(len(gauges) == 4)

for gauge in gauges:
self.assertTrue(len(gauge.samples) > 0)
@@ -123,14 +123,42 @@ def test_process_dlws_nodes_status(self):
pod_info["192.168.255.1"].append(watchdog.PodInfo("job1", 2))
gauges = watchdog.process_nodes_status(obj, pod_info)

self.assertTrue(len(gauges) == 3)
self.assertTrue(len(gauges) == 4)

self.assertEqual("k8s_node_gpu_available", gauges[1].name)
self.assertEqual(1, len(gauges[1].samples))
self.assertEqual(2, gauges[1].samples[0].value)
self.assertEqual("k8s_node_gpu_total", gauges[2].name)
self.assertEqual(1, len(gauges[2].samples))
self.assertEqual(4, gauges[2].samples[0].value)
self.assertEqual("k8s_node_gpu_reserved", gauges[3].name)
self.assertEqual(1, len(gauges[3].samples))
self.assertEqual(0, gauges[3].samples[0].value)

for gauge in gauges:
self.assertTrue(len(gauge.samples) > 0)

for gauge in gauges[1:]:
self.assertEqual("192.168.255.1", gauge.samples[0].labels["host_ip"])

def test_process_dlws_nodes_status_with_unschedulable(self):
obj = json.loads(self.get_data_test_input("data/dlws_nodes_list_with_unschedulable.json"))

pod_info = collections.defaultdict(lambda : [])
pod_info["192.168.255.1"].append(watchdog.PodInfo("job1", 2))
gauges = watchdog.process_nodes_status(obj, pod_info)

self.assertTrue(len(gauges) == 4)

self.assertEqual("k8s_node_gpu_available", gauges[1].name)
self.assertEqual(1, len(gauges[1].samples))
self.assertEqual(0, gauges[1].samples[0].value)
self.assertEqual("k8s_node_gpu_total", gauges[2].name)
self.assertEqual(1, len(gauges[2].samples))
self.assertEqual(4, gauges[2].samples[0].value)
self.assertEqual("k8s_node_gpu_reserved", gauges[3].name)
self.assertEqual(1, len(gauges[3].samples))
self.assertEqual(2, gauges[3].samples[0].value)

for gauge in gauges:
self.assertTrue(len(gauge.samples) > 0)
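
Taken together, the two tests pin down the intended relationship between the gauges: on any node, available + reserved + used equals the GPU total, with available forced to zero once the node is unschedulable. A small illustrative check of that invariant (names are made up for this sketch; because of the max(0, ...) clamp it only holds while used does not exceed total):

# Illustrative consistency check of the semantics exercised by the two tests.
def split_gpus(total, used, unschedulable):
    """Return (available, reserved) the way parse_node_item reports them."""
    free = max(0, total - used)
    return (0, free) if unschedulable else (free, 0)

for unschedulable in (False, True):
    available, reserved = split_gpus(4, 2, unschedulable)
    assert available + reserved + 2 == 4  # used=2, total=4, as in the tests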
