From eae57d8e64601bb61f18cbf4a6d3b9f7d378d66a Mon Sep 17 00:00:00 2001
From: Di Xu
Date: Fri, 10 May 2019 10:54:32 +0800
Subject: [PATCH] adapt dlws jobs (#2604)

---
 docs/alerting/watchdog-metrics.md             |   8 +
 .../deploy/job-exporter.yaml.template         |   2 +
 src/job-exporter/src/docker_inspect.py        |   8 +-
 src/job-exporter/src/main.py                  |  26 +-
 src/job-exporter/src/nvidia.py                |   5 +-
 .../test/data/dlts_docker_inspect.json        | 403 ++++++++++++++++++
 src/job-exporter/test/test_docker_inspect.py  |  15 +
 src/watchdog/src/watchdog.py                  | 279 +++++++++---
 src/watchdog/test/data/dlws_nodes_list.json   | 158 +++++++
 .../dlws_nodes_list_with_unschedulable.json   | 159 +++++++
 src/watchdog/test/data/nodes_list.json        |  12 +
 src/watchdog/test/test_watchdog.py            |  75 +++-
 12 files changed, 1070 insertions(+), 80 deletions(-)
 create mode 100644 src/job-exporter/test/data/dlts_docker_inspect.json
 create mode 100644 src/watchdog/test/data/dlws_nodes_list.json
 create mode 100644 src/watchdog/test/data/dlws_nodes_list_with_unschedulable.json

diff --git a/docs/alerting/watchdog-metrics.md b/docs/alerting/watchdog-metrics.md
index e62fec843b..36b18c06a7 100644
--- a/docs/alerting/watchdog-metrics.md
+++ b/docs/alerting/watchdog-metrics.md
@@ -57,6 +57,14 @@ vi watchdog-xx.log
 | ---------- | ----------- |
 | k8s_api_server_count | use label `error` to represent status, if `error` != "ok", means k8s api server is not functioning correctly |
 
+## K8s Resource Metrics
+| Metric name| Description |
+| ---------- | ----------- |
+| k8s_node_gpu_total | Total number of GPUs on the node |
+| k8s_node_gpu_available | Total GPU count minus the GPU count used by pods on the node; 0 if the node is marked as unschedulable |
+| k8s_node_gpu_reserved | If a node is marked as unschedulable via `kubectl cordon $node`, all of its unused GPUs are counted as reserved |
+
+
 ## Other Metrics
 | Metric name| Description |
 | ---------- | ----------- |
diff --git a/src/job-exporter/deploy/job-exporter.yaml.template b/src/job-exporter/deploy/job-exporter.yaml.template
index 8a9c4682ba..31dd0eb463 100644
--- a/src/job-exporter/deploy/job-exporter.yaml.template
+++ b/src/job-exporter/deploy/job-exporter.yaml.template
@@ -68,6 +68,8 @@ spec:
           value: {{ cluster_cfg["job-exporter"]["logging-level"] }}
         - name: NV_DRIVER
           value: /var/drivers/nvidia/current
+        - name: NVIDIA_VISIBLE_DEVICES
+          value: all
         volumeMounts:
         - mountPath: /var/run/docker.sock
           name: docker-socket
diff --git a/src/job-exporter/src/docker_inspect.py b/src/job-exporter/src/docker_inspect.py
index 4d35dcb206..c95418314c 100644
--- a/src/job-exporter/src/docker_inspect.py
+++ b/src/job-exporter/src/docker_inspect.py
@@ -49,7 +49,7 @@ def __eq__(self, o):
 
 
 keys = {"PAI_JOB_NAME", "PAI_USER_NAME", "PAI_CURRENT_TASK_ROLE_NAME", "GPU_ID",
-        "PAI_TASK_INDEX"}
+        "PAI_TASK_INDEX", "DLWS_JOB_ID", "DLWS_USER_NAME"}
 
 
 def parse_docker_inspect(inspect_output):
@@ -79,15 +79,13 @@ def parse_docker_inspect(inspect_output):
     pid = utils.walk_json_field_safe(obj, 0, "State", "Pid")
 
     return InspectResult(
-            m.get("PAI_USER_NAME"),
-            m.get("PAI_JOB_NAME"),
+            m.get("PAI_USER_NAME") or m.get("DLWS_USER_NAME"),
+            m.get("PAI_JOB_NAME") or m.get("DLWS_JOB_ID"),
             m.get("PAI_CURRENT_TASK_ROLE_NAME"),
             m.get("PAI_TASK_INDEX"),
             m.get("GPU_ID"),
             pid)
 
-    return {"env": envs, "labels": labels, "pid": pid}
-
 def inspect(container_id, histogram, timeout):
     try:
         result = utils.exec_cmd(
diff --git a/src/job-exporter/src/main.py b/src/job-exporter/src/main.py
index 982c71179f..bd857b9a91 100644
--- a/src/job-exporter/src/main.py
+++ b/src/job-exporter/src/main.py
@@ -63,7 +63,12 @@ def config_environ():
             os.path.join(driver_path, "lib") + os.pathsep +
\ os.path.join(driver_path, "lib64") - logger.debug("LD_LIBRARY_PATH is %s", os.environ["LD_LIBRARY_PATH"]) + driver_bin_path = os.path.join(driver_path, "bin") + os.environ["PATH"] = os.environ["PATH"] + ":" + driver_bin_path + + logger.debug("LD_LIBRARY_PATH is %s, PATH is %s", + os.environ["LD_LIBRARY_PATH"], + os.environ["PATH"]) def try_remove_old_prom_file(path): @@ -82,16 +87,17 @@ def get_gpu_count(path): logger.debug("hostname is %s, ip is %s", hostname, ip) - with open(path) as f: - gpu_config = json.load(f) + if os.path.isfile(path): + with open(path) as f: + gpu_config = json.load(f) + + if hostname is not None and gpu_config["nodes"].get(hostname) is not None: + return gpu_config["nodes"][hostname]["gpuCount"] + elif ip is not None and gpu_config["nodes"].get(ip) is not None: + return gpu_config["nodes"][ip]["gpuCount"] - if hostname is not None and gpu_config["nodes"].get(hostname) is not None: - return gpu_config["nodes"][hostname]["gpuCount"] - elif ip is not None and gpu_config["nodes"].get(ip) is not None: - return gpu_config["nodes"][ip]["gpuCount"] - else: - logger.warning("failed to find gpu count from config %s", gpu_config) - return 0 + logger.warning("failed to find gpu count from config %s", path) + return 0 def register_stack_trace_dump(): diff --git a/src/job-exporter/src/nvidia.py b/src/job-exporter/src/nvidia.py index c9af57d4fe..aa0d3af261 100644 --- a/src/job-exporter/src/nvidia.py +++ b/src/job-exporter/src/nvidia.py @@ -162,11 +162,8 @@ def parse_smi_xml_result(smi): return result def nvidia_smi(histogram, timeout): - driver_path = os.environ["NV_DRIVER"] - bin_path = os.path.join(driver_path, "bin/nvidia-smi") - try: - smi_output = utils.exec_cmd([bin_path, "-q", "-x"], + smi_output = utils.exec_cmd(["nvidia-smi", "-q", "-x"], histogram=histogram, timeout=timeout) return parse_smi_xml_result(smi_output) diff --git a/src/job-exporter/test/data/dlts_docker_inspect.json b/src/job-exporter/test/data/dlts_docker_inspect.json new file mode 100644 index 0000000000..f7f114aa09 --- /dev/null +++ b/src/job-exporter/test/data/dlts_docker_inspect.json @@ -0,0 +1,403 @@ +[ + { + "Id": "e1a9cf8a0ad00a0d5bd2ea250c266ac48aa95940b83190433102a6b05675bf96", + "Created": "2019-04-18T06:31:21.961595409Z", + "Path": "bash", + "Args": [ + "/job/launch-0c435eee-d31f-43d5-a1b3-442845fa1d0c.sh" + ], + "State": { + "Status": "running", + "Running": true, + "Paused": false, + "Restarting": false, + "OOMKilled": false, + "Dead": false, + "Pid": 3533, + "ExitCode": 0, + "Error": "", + "StartedAt": "2019-04-18T06:31:36.178067796Z", + "FinishedAt": "0001-01-01T00:00:00Z" + }, + "Image": "sha256:ee0a1ed21fb27d93bff04b000841d28d77e767ba435a1fdb2033c5c1b373cf55", + "ResolvConfPath": "/etc/resolv.conf", + "HostnamePath": "/data/lib/docker/containers/82a8cc32dbd40766690f0955abf1f2d20db82077217898f85ef68dcf60232ad1/hostname", + "HostsPath": "/var/lib/kubelet/pods/611006c3-61a3-11e9-9957-000d3a1707fc/etc-hosts", + "LogPath": "/data/lib/docker/containers/e1a9cf8a0ad00a0d5bd2ea250c266ac48aa95940b83190433102a6b05675bf96/e1a9cf8a0ad00a0d5bd2ea250c266ac48aa95940b83190433102a6b05675bf96-json.log", + "Name": "/k8s_0c435eee-d31f-43d5-a1b3-442845fa1d0c_0c435eee-d31f-43d5-a1b3-442845fa1d0c_default_611006c3-61a3-11e9-9957-000d3a1707fc_0", + "RestartCount": 0, + "Driver": "overlay2", + "Platform": "linux", + "MountLabel": "", + "ProcessLabel": "", + "AppArmorProfile": "docker-default", + "ExecIDs": null, + "HostConfig": { + "Binds": [ + "/etc/resolv.conf:/etc/resolv.conf", + 
"/dlwsdata/work/dixu:/home/dixu", + "/dlwsdata/work/dixu/jobs/190418/0c435eee-d31f-43d5-a1b3-442845fa1d0c:/job", + "/dlwsdata/storage/imagenet:/data", + "/dlwsdata/work/dixu/.ssh:/home/dixu/.ssh:ro", + "/var/lib/kubelet/pods/611006c3-61a3-11e9-9957-000d3a1707fc/volumes/kubernetes.io~empty-dir/dshm:/dev/shm", + "/var/lib/kubelet/pods/611006c3-61a3-11e9-9957-000d3a1707fc/volumes/kubernetes.io~secret/default-token-xqzf9:/var/run/secrets/kubernetes.io/serviceaccount:ro", + "/var/lib/kubelet/pods/611006c3-61a3-11e9-9957-000d3a1707fc/etc-hosts:/etc/hosts", + "/var/lib/kubelet/pods/611006c3-61a3-11e9-9957-000d3a1707fc/containers/0c435eee-d31f-43d5-a1b3-442845fa1d0c/12a87052:/dev/termination-log" + ], + "ContainerIDFile": "", + "LogConfig": { + "Type": "json-file", + "Config": {} + }, + "NetworkMode": "container:82a8cc32dbd40766690f0955abf1f2d20db82077217898f85ef68dcf60232ad1", + "PortBindings": null, + "RestartPolicy": { + "Name": "", + "MaximumRetryCount": 0 + }, + "AutoRemove": false, + "VolumeDriver": "", + "VolumesFrom": null, + "CapAdd": null, + "CapDrop": null, + "Dns": null, + "DnsOptions": null, + "DnsSearch": null, + "ExtraHosts": null, + "GroupAdd": null, + "IpcMode": "container:82a8cc32dbd40766690f0955abf1f2d20db82077217898f85ef68dcf60232ad1", + "Cgroup": "", + "Links": null, + "OomScoreAdj": 999, + "PidMode": "", + "Privileged": false, + "PublishAllPorts": false, + "ReadonlyRootfs": false, + "SecurityOpt": [ + "seccomp=unconfined" + ], + "UTSMode": "", + "UsernsMode": "", + "ShmSize": 67108864, + "Runtime": "nvidia", + "ConsoleSize": [ + 0, + 0 + ], + "Isolation": "", + "CpuShares": 1024, + "Memory": 0, + "NanoCpus": 0, + "CgroupParent": "/kubepods/burstable/pod611006c3-61a3-11e9-9957-000d3a1707fc", + "BlkioWeight": 0, + "BlkioWeightDevice": null, + "BlkioDeviceReadBps": null, + "BlkioDeviceWriteBps": null, + "BlkioDeviceReadIOps": null, + "BlkioDeviceWriteIOps": null, + "CpuPeriod": 0, + "CpuQuota": 0, + "CpuRealtimePeriod": 0, + "CpuRealtimeRuntime": 0, + "CpusetCpus": "", + "CpusetMems": "", + "Devices": [], + "DeviceCgroupRules": null, + "DiskQuota": 0, + "KernelMemory": 0, + "MemoryReservation": 0, + "MemorySwap": 0, + "MemorySwappiness": null, + "OomKillDisable": false, + "PidsLimit": 0, + "Ulimits": null, + "CpuCount": 0, + "CpuPercent": 0, + "IOMaximumIOps": 0, + "IOMaximumBandwidth": 0, + "MaskedPaths": [ + "/proc/asound", + "/proc/acpi", + "/proc/kcore", + "/proc/keys", + "/proc/latency_stats", + "/proc/timer_list", + "/proc/timer_stats", + "/proc/sched_debug", + "/proc/scsi", + "/sys/firmware" + ], + "ReadonlyPaths": [ + "/proc/bus", + "/proc/fs", + "/proc/irq", + "/proc/sys", + "/proc/sysrq-trigger" + ] + }, + "GraphDriver": { + "Data": { + "LowerDir": 
"/data/lib/docker/overlay2/72151ab5ff423e77b3cd8b630184608d229cbb1f567a9bf7193116e5eecc5060-init/diff:/data/lib/docker/overlay2/acfef063aca95a1298a01d345bbd78582dd51bfb8786a16b5671e33eac96746f/diff:/data/lib/docker/overlay2/31f7d9efccb65a14dc23ce17b6dbac5eff1fac35bf97c965b34996d851ff6d1d/diff:/data/lib/docker/overlay2/e89ad2b850ed61f0bdc66c6292a6a8e14a76a4b3320831af563c91061e9fe7b6/diff:/data/lib/docker/overlay2/e243dd5be35a82b04b5e39fdaadbb7fa4ad769e06ae2205af0835fff522c193a/diff:/data/lib/docker/overlay2/ec6566e670ff00ef479a98400cee0ac36a88621928bfa713830dd7711cbcf62f/diff:/data/lib/docker/overlay2/af88915a503c89dd25206d355d30a13c75a49f27fbd1a3a190f18853d1365fc2/diff:/data/lib/docker/overlay2/6cc52eaee8a51b53a35a531b7f30506c44936fc8feec7c89fb7bc726989f50d9/diff:/data/lib/docker/overlay2/45b910fadb95ec6d08d39565bd2efb0c64f91f159620cbdf9896a2333c730d3f/diff:/data/lib/docker/overlay2/c58a45f6ccec97c062f6ac15c9a4fbfe1297008cb3c97266788161311e38ba66/diff:/data/lib/docker/overlay2/46d8b5a01a6e6c122fcfc6a6b6845a1baa8d2c6391414363be4ddf0ae136d377/diff:/data/lib/docker/overlay2/2fa00776b11abd79b67d3dc7d4ed6d34d0f8ea1e51e6ee5c1ee6c5939b2b4aa4/diff:/data/lib/docker/overlay2/82093c30847f81d398aad62ce3af0163ee7a3891b8c55c707a146ab69a600220/diff:/data/lib/docker/overlay2/447bedcd0ceac2f7f794fb01029bb81cbec0dcedc4e313b710a5ba6e8e9e5842/diff:/data/lib/docker/overlay2/a9820704ffc18f6ecb97ec03f15bdb0a5d63d39a4df9933dfd398338f5a57e94/diff:/data/lib/docker/overlay2/3c525d697511952e95d2536c6eb6fbc6020f0d2b1d78d025b8faa106d29dab17/diff:/data/lib/docker/overlay2/e1e79aacdabc9cb9c5929f2aec9c85f54022727a753b7558bca62daca86ba592/diff:/data/lib/docker/overlay2/2489dc456813d7b48bf1008d64846013103342b5d8b464424814dfd10253bdbf/diff:/data/lib/docker/overlay2/b00ae5899efef74b0aee509c24cf7b709f6a50a0bfbfe88c6c2f6f2cbe7b5127/diff:/data/lib/docker/overlay2/a902e9872075c97b4c54f2f0ae7d5d388db8b38780fa091e1f8e8a40e5266c8a/diff:/data/lib/docker/overlay2/df9a00088c7486e5bfa974c1576ac5a700e09d4ed601c4f177ce2c1d0bb37d97/diff:/data/lib/docker/overlay2/cd6fcfe7805a8d5adce5c36e65a753dda04f1932799905364707f8d320db8d44/diff:/data/lib/docker/overlay2/f521a15cf2aba16e21812e18138e978a9cddebb9018fb4d6d26a40e19cce80e4/diff:/data/lib/docker/overlay2/a6138afc2e6f51c7c7af8f5489511b8e2749fa4c998274acc9995af36f74241f/diff:/data/lib/docker/overlay2/b58eeaba82d11ef82379c1e85c1923d2ad8c35fcae1695f416bb6e4880380c19/diff:/data/lib/docker/overlay2/becb17f194415ece39e91426ae5182632fc13694d73bad414a8639238e5a5856/diff:/data/lib/docker/overlay2/d72cf469f5669609674441b2a99ebf9a438f1428ce0e76e0498c2e33ec3aa7ec/diff:/data/lib/docker/overlay2/4845d90ff2e9779e48edbec8d9637d7f0ea6ee9a4e514ca44fac397153e09bcc/diff:/data/lib/docker/overlay2/c732c2fa2c716850ffcb4a8d99a89b0cf98ec007212243fda43fe6e47a541e31/diff:/data/lib/docker/overlay2/4003f388d4b4bc5c6301fcdaccbda331e9f123566cdc511320ef7b0dd48e547e/diff:/data/lib/docker/overlay2/8d7c2ded6546c2357827ef63c02e7b9e6b0f513ec00a938a443382f3e77e0743/diff:/data/lib/docker/overlay2/fbf94ec74c2ef8899caa34cd6ceb8a24e40bfeacef1c8df2a7480005764a1bce/diff:/data/lib/docker/overlay2/60211c809b7f4d35c0c10542fbf2d98f3ab595e5344962bf639a6d4423450370/diff:/data/lib/docker/overlay2/c37768a7ff37fc09e74f405bc43346ceb93501a748cab3562de8739409760a57/diff:/data/lib/docker/overlay2/43d5c7c2cc0e5552ab6c1fc79884ec6c7ea8af8dbf0d409674e13501854da0fc/diff:/data/lib/docker/overlay2/0063d873e8bf59c501eae61344e82e8deeb7de78380d50e4b7da579967a0d7ed/diff:/data/lib/docker/overlay2/f18fb9fbdffc608879d0c3c0e7e94b0363d4e2f83908e69dd3b76e89123a4493/di
ff:/data/lib/docker/overlay2/bc17c791c97ac6f0ba7f8f04a5961d13c1f39972e1c9ce772d2e3e3ceaaf73a9/diff:/data/lib/docker/overlay2/5d075ef85806b1ad57f87cb36885b3213d217da28d08d9087948913b4b4832f9/diff:/data/lib/docker/overlay2/20a4a09494d2e22faf90cba06b47651e0abe2d1a83ed0350c6a9b1c550491521/diff:/data/lib/docker/overlay2/685bba86a073e670857261de47fd3d75afbf5881e68adb38222c1db505e726cd/diff:/data/lib/docker/overlay2/4861f6e118e3aca0b957bdb7c52f9696cfea158accc5e490e9ad4aa73b6e6aa4/diff:/data/lib/docker/overlay2/4c30c034d6d31b45e28df5566ff83138b2e9572cccb49551a3d8237240356261/diff:/data/lib/docker/overlay2/6162a88ed8598d54c4ea5c79e81bd714fd214b2c750843f40f3fd051b4d83a2a/diff:/data/lib/docker/overlay2/904d09c62c4419182145dc67d7dd485bbf89a728bff4e55c4eacf1bd10ae83de/diff:/data/lib/docker/overlay2/5f9010164b907678c212a2902c71d78a57825e37a4f39e4ad170d45c8a3a2f9b/diff:/data/lib/docker/overlay2/cfa6df1b93d137aaaac993e1511572914b6c4438f26ebd9c28a383ff57186f8b/diff:/data/lib/docker/overlay2/3d66bebd75c111c88688e69d54abd96c84d29962617a90ae4f96353233f925aa/diff:/data/lib/docker/overlay2/05b91a8fc50deabd6cea89b25c417e8319cc10b72a955c2cf593e4acd17750cf/diff:/data/lib/docker/overlay2/2353183a17aabdd994698f5ddf0df3e9fb46074eabeb8afbf3af52bdd9c81b8d/diff:/data/lib/docker/overlay2/4da6ded5ad62a760b2006258d9122e6fc5066ba9b575a543bab1925f86701cc1/diff:/data/lib/docker/overlay2/6a88dc2682e1aa07c0cbe800f1d70a99c624987b9f186f82162dd3d920ce8128/diff:/data/lib/docker/overlay2/6b288b7534b6fa6adb250f3e2d664e53fe486982a24a499277277e40507761a4/diff:/data/lib/docker/overlay2/444fb401e05a5ccc6649cdb85d99c5c44dc2648c0c2634912afc19df9bc151cc/diff:/data/lib/docker/overlay2/3e9737948414f3243702a9eebf2ba04883b588b1255e2f851c5909fbe502f058/diff:/data/lib/docker/overlay2/43e5859e24e69793cb76c88581ee99919c6f6ecdd1663d8e02bc56ab234dc0ac/diff:/data/lib/docker/overlay2/50271d3291362ca76fded64856fd01f20155d8cb3ecbd108d123a1a4750886a4/diff:/data/lib/docker/overlay2/adb42419536658b8dcc53737ae8fe57c834436824504ce4506dd37464241e6b4/diff:/data/lib/docker/overlay2/7276e287a6bc5d5ace8f812c796f667bc1fe3d1ec1c18de6b68620d76cc1027a/diff:/data/lib/docker/overlay2/32be844cd8996d63da23f3684745317e327779bc8ab305b4f848a49072d1becf/diff:/data/lib/docker/overlay2/b7fc0989966fb857aeefa8fbb35fa6ffae5175d9276c733f688ff98d87a3384e/diff:/data/lib/docker/overlay2/2130a7207cfbeac906cecfdb950df561186818c1f6971c938f07b0f301c68574/diff:/data/lib/docker/overlay2/0874cb839f1092d81a083700ac7955ce0927c72a96474b6adeecde88dfde4588/diff:/data/lib/docker/overlay2/f2863241711d278c92cddd981d1156118a1b6a1e4dc8162bbb0399f283c752c7/diff", + "MergedDir": "/data/lib/docker/overlay2/72151ab5ff423e77b3cd8b630184608d229cbb1f567a9bf7193116e5eecc5060/merged", + "UpperDir": "/data/lib/docker/overlay2/72151ab5ff423e77b3cd8b630184608d229cbb1f567a9bf7193116e5eecc5060/diff", + "WorkDir": "/data/lib/docker/overlay2/72151ab5ff423e77b3cd8b630184608d229cbb1f567a9bf7193116e5eecc5060/work" + }, + "Name": "overlay2" + }, + "Mounts": [ + { + "Type": "bind", + "Source": "/etc/resolv.conf", + "Destination": "/etc/resolv.conf", + "Mode": "", + "RW": true, + "Propagation": "rprivate" + }, + { + "Type": "bind", + "Source": "/dlwsdata/work/dixu/.ssh", + "Destination": "/home/dixu/.ssh", + "Mode": "ro", + "RW": false, + "Propagation": "rprivate" + }, + { + "Type": "bind", + "Source": "/var/lib/kubelet/pods/611006c3-61a3-11e9-9957-000d3a1707fc/volumes/kubernetes.io~empty-dir/dshm", + "Destination": "/dev/shm", + "Mode": "", + "RW": true, + "Propagation": "rprivate" + }, + { + "Type": "bind", + "Source": 
"/var/lib/kubelet/pods/611006c3-61a3-11e9-9957-000d3a1707fc/etc-hosts", + "Destination": "/etc/hosts", + "Mode": "", + "RW": true, + "Propagation": "rprivate" + }, + { + "Type": "bind", + "Source": "/dlwsdata/work/dixu", + "Destination": "/home/dixu", + "Mode": "", + "RW": true, + "Propagation": "rprivate" + }, + { + "Type": "bind", + "Source": "/dlwsdata/work/dixu/jobs/190418/0c435eee-d31f-43d5-a1b3-442845fa1d0c", + "Destination": "/job", + "Mode": "", + "RW": true, + "Propagation": "rprivate" + }, + { + "Type": "bind", + "Source": "/dlwsdata/storage/imagenet", + "Destination": "/data", + "Mode": "", + "RW": true, + "Propagation": "rprivate" + }, + { + "Type": "bind", + "Source": "/var/lib/kubelet/pods/611006c3-61a3-11e9-9957-000d3a1707fc/volumes/kubernetes.io~secret/default-token-xqzf9", + "Destination": "/var/run/secrets/kubernetes.io/serviceaccount", + "Mode": "ro", + "RW": false, + "Propagation": "rprivate" + }, + { + "Type": "bind", + "Source": "/var/lib/kubelet/pods/611006c3-61a3-11e9-9957-000d3a1707fc/containers/0c435eee-d31f-43d5-a1b3-442845fa1d0c/12a87052", + "Destination": "/dev/termination-log", + "Mode": "", + "RW": true, + "Propagation": "rprivate" + } + ], + "Config": { + "Hostname": "0c435eee-d31f-43d5-a1b3-442845fa1d0c", + "Domainname": "", + "User": "0", + "AttachStdin": false, + "AttachStdout": false, + "AttachStderr": false, + "ExposedPorts": { + "1280/tcp": {}, + "1443/tcp": {}, + "180/tcp": {}, + "6006/tcp": {}, + "8888/tcp": {} + }, + "Tty": false, + "OpenStdin": false, + "StdinOnce": false, + "Env": [ + "NVIDIA_VISIBLE_DEVICES=GPU-7c583998-b3ff-a885-8979-2d32d334cde4", + "DLWS_USER_NAME=dixu", + "JOB_ID=0c435eee-d31f-43d5-a1b3-442845fa1d0c", + "DLWS_JOB_ID=0c435eee-d31f-43d5-a1b3-442845fa1d0c", + "DLWS_NUM_GPU_PER_WORKER=1", + "POD_IP=10.2.48.5", + "LD_LIBRARY_PATH=/usr/local/nvidia/lib64/", + "FAMILY_TOKEN=98012454719049418c85ae134680583a", + "DLWS_REST_API=None", + "DLWS_NUM_WORKER=1", + "POD_NAME=0c435eee-d31f-43d5-a1b3-442845fa1d0c", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_8888_PORT_8888_TCP_PROTO=tcp", + "ZK_HEADLESS_PORT_3888_TCP_ADDR=10.3.1.100", + "METRICS_PROMETHEUS_SERVICE_PORT=9090", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_22_SERVICE_HOST=10.3.135.168", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_22_PORT_22_TCP_ADDR=10.3.135.168", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_8888_SERVICE_PORT=8888", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_8888_PORT=tcp://10.3.73.43:8888", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_8888_PORT_8888_TCP_PORT=8888", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_8888_SERVICE_PORT=8888", + "ZK_HEADLESS_PORT_2888_TCP_ADDR=10.3.1.100", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_22_PORT_22_TCP_ADDR=10.3.9.191", + "METRICS_PROMETHEUS_SERVICE_HOST=10.3.106.89", + "METRICS_PROMETHEUS_PORT_9090_TCP=tcp://10.3.106.89:9090", + "KUBERNETES_PORT_443_TCP_PROTO=tcp", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_8888_PORT_8888_TCP_PROTO=tcp", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_8888_PORT_8888_TCP_PORT=8888", + "ZK_HEADLESS_SERVICE_PORT_LEADER_ELECTION=3888", + "ZK_HEADLESS_SERVICE_PORT_CLIENT=2181", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_6006_PORT_6006_TCP_PROTO=tcp", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_6006_SERVICE_PORT_INTERACTIVE=6006", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_6006_PORT_6006_TCP=tcp://10.3.15.252:6006", + "METRICS_PROMETHEUS_PORT_9090_TCP_PORT=9090", + 
"INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_22_PORT=tcp://10.3.135.168:22", + "ZK_HEADLESS_SERVICE_PORT=2888", + "ZK_HEADLESS_PORT_2181_TCP=tcp://10.3.1.100:2181", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_6006_SERVICE_HOST=10.3.15.252", + "METRICS_PROMETHEUS_PORT=tcp://10.3.106.89:9090", + "KUBERNETES_PORT_443_TCP_ADDR=10.3.0.1", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_8888_PORT_8888_TCP=tcp://10.3.16.47:8888", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_6006_PORT_6006_TCP_PORT=6006", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_6006_PORT_6006_TCP_PORT=6006", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_22_PORT_22_TCP=tcp://10.3.135.168:22", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_8888_PORT_8888_TCP=tcp://10.3.73.43:8888", + "ZK_HEADLESS_PORT_2181_TCP_ADDR=10.3.1.100", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_6006_SERVICE_PORT=6006", + "KUBERNETES_SERVICE_HOST=10.3.0.1", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_8888_SERVICE_PORT_INTERACTIVE=8888", + "ZK_HEADLESS_PORT_2888_TCP=tcp://10.3.1.100:2888", + "ZK_HEADLESS_PORT_3888_TCP=tcp://10.3.1.100:3888", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_6006_SERVICE_PORT_INTERACTIVE=6006", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_22_PORT_22_TCP_PROTO=tcp", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_8888_PORT_8888_TCP_ADDR=10.3.73.43", + "KUBERNETES_PORT_443_TCP_PORT=443", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_8888_SERVICE_PORT_INTERACTIVE=8888", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_8888_PORT=tcp://10.3.16.47:8888", + "ZK_HEADLESS_PORT_2888_TCP_PORT=2888", + "ZK_HEADLESS_PORT_2181_TCP_PROTO=tcp", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_22_PORT_22_TCP_PORT=22", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_6006_PORT=tcp://10.3.154.214:6006", + "KUBERNETES_SERVICE_PORT_HTTPS=443", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_6006_PORT_6006_TCP_PROTO=tcp", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_22_PORT_22_TCP_PORT=22", + "ZK_HEADLESS_PORT=tcp://10.3.1.100:2888", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_6006_PORT_6006_TCP_ADDR=10.3.15.252", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_22_PORT=tcp://10.3.9.191:22", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_6006_PORT_6006_TCP_ADDR=10.3.154.214", + "KUBERNETES_SERVICE_PORT=443", + "ZK_HEADLESS_SERVICE_PORT_SERVER=2888", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_6006_PORT=tcp://10.3.15.252:6006", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_22_SERVICE_PORT=22", + "METRICS_PROMETHEUS_SERVICE_PORT_WEB=9090", + "METRICS_PROMETHEUS_PORT_9090_TCP_PROTO=tcp", + "KUBERNETES_PORT=tcp://10.3.0.1:443", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_8888_PORT_8888_TCP_ADDR=10.3.16.47", + "ZK_HEADLESS_SERVICE_HOST=10.3.1.100", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_6006_SERVICE_HOST=10.3.154.214", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_22_SERVICE_PORT=22", + "ZK_HEADLESS_PORT_3888_TCP_PROTO=tcp", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_6006_PORT_6006_TCP=tcp://10.3.154.214:6006", + "METRICS_PROMETHEUS_PORT_9090_TCP_ADDR=10.3.106.89", + "KUBERNETES_PORT_443_TCP=tcp://10.3.0.1:443", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_8888_SERVICE_HOST=10.3.16.47", + "ZK_HEADLESS_PORT_2888_TCP_PROTO=tcp", + "ZK_HEADLESS_PORT_3888_TCP_PORT=3888", + "ZK_HEADLESS_PORT_2181_TCP_PORT=2181", + 
"INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_6006_SERVICE_PORT=6006", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_22_PORT_22_TCP=tcp://10.3.9.191:22", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_22_SERVICE_HOST=10.3.9.191", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_22_SERVICE_PORT_INTERACTIVE=22", + "INTERACTIVE_0C435EEE_D31F_43D5_A1B3_442845FA1D0C_22_PORT_22_TCP_PROTO=tcp", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_22_SERVICE_PORT_INTERACTIVE=22", + "INTERACTIVE_5EB0E55C_263A_43DF_9203_6D85350FD8DC_8888_SERVICE_HOST=10.3.73.43", + "PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "CUDA_VERSION=9.0.176", + "CUDA_PKG_VERSION=9-0=9.0.176-1", + "NVIDIA_DRIVER_CAPABILITIES=compute,utility", + "NVIDIA_REQUIRE_CUDA=cuda>=9.0", + "BAZEL_VERSION=0.11.0", + "CI_BUILD_PYTHON=python3", + "TF_NEED_CUDA=1", + "TF_CUDA_COMPUTE_CAPABILITIES=3.0,3.5,5.2,6.0,6.1", + "TF_CUDA_VERSION=9.0", + "TF_CUDNN_VERSION=7", + "APACHE_RUN_USER=www-data", + "APACHE_RUN_GROUP=www-data", + "APACHE_LOG_DIR=/var/log/apache2", + "DOTNET_CLI_TELEMETRY_OPTOUT=1" + ], + "Cmd": null, + "Healthcheck": { + "Test": [ + "NONE" + ] + }, + "Image": "dlws/tutorial-tensorflow@sha256:cf850a71c7d54bdf0e41f544d2661677919ec6f26ab833ef8a7b39ec0ef429d1", + "Volumes": null, + "WorkingDir": "/tensorflow", + "Entrypoint": [ + "bash", + "/job/launch-0c435eee-d31f-43d5-a1b3-442845fa1d0c.sh" + ], + "OnBuild": null, + "Labels": { + "annotation.io.kubernetes.container.hash": "789c3bdd", + "annotation.io.kubernetes.container.restartCount": "0", + "annotation.io.kubernetes.container.terminationMessagePath": "/dev/termination-log", + "annotation.io.kubernetes.container.terminationMessagePolicy": "File", + "annotation.io.kubernetes.pod.terminationGracePeriod": "30", + "com.nvidia.build.id": "63756748", + "com.nvidia.build.ref": "2b1c8edf8d79830ad811baff9630adb3bcb5db46", + "com.nvidia.cuda.version": "9.0.176", + "com.nvidia.volumes.needed": "nvidia_driver", + "io.kubernetes.container.logpath": "/var/log/pods/611006c3-61a3-11e9-9957-000d3a1707fc/0c435eee-d31f-43d5-a1b3-442845fa1d0c_0.log", + "io.kubernetes.container.name": "0c435eee-d31f-43d5-a1b3-442845fa1d0c", + "io.kubernetes.docker.type": "container", + "io.kubernetes.pod.name": "0c435eee-d31f-43d5-a1b3-442845fa1d0c", + "io.kubernetes.pod.namespace": "default", + "io.kubernetes.pod.uid": "611006c3-61a3-11e9-9957-000d3a1707fc", + "io.kubernetes.sandbox.id": "82a8cc32dbd40766690f0955abf1f2d20db82077217898f85ef68dcf60232ad1", + "maintainer": "Craig Citro " + } + }, + "NetworkSettings": { + "Bridge": "", + "SandboxID": "", + "HairpinMode": false, + "LinkLocalIPv6Address": "", + "LinkLocalIPv6PrefixLen": 0, + "Ports": {}, + "SandboxKey": "", + "SecondaryIPAddresses": null, + "SecondaryIPv6Addresses": null, + "EndpointID": "", + "Gateway": "", + "GlobalIPv6Address": "", + "GlobalIPv6PrefixLen": 0, + "IPAddress": "", + "IPPrefixLen": 0, + "IPv6Gateway": "", + "MacAddress": "", + "Networks": {} + } + } +] diff --git a/src/job-exporter/test/test_docker_inspect.py b/src/job-exporter/test/test_docker_inspect.py index e0a63574e8..454cfc2c9a 100644 --- a/src/job-exporter/test/test_docker_inspect.py +++ b/src/job-exporter/test/test_docker_inspect.py @@ -76,5 +76,20 @@ def test_parse_docker_inspect_BUGFIX(self): 30332) self.assertEqual(target_inspect_info, inspect_info) + def test_adapt_dlts_jobs(self): + sample_path = "data/dlts_docker_inspect.json" + with open(sample_path, "r") as f: + docker_inspect = f.read() 
+ + inspect_info = parse_docker_inspect(docker_inspect) + target_inspect_info = InspectResult( + "dixu", + "0c435eee-d31f-43d5-a1b3-442845fa1d0c", + None, + None, + "GPU-7c583998-b3ff-a885-8979-2d32d334cde4", + 3533) + self.assertEqual(target_inspect_info, inspect_info) + if __name__ == '__main__': unittest.main() diff --git a/src/watchdog/src/watchdog.py b/src/watchdog/src/watchdog.py index 55fc4fc829..70b65fb51d 100644 --- a/src/watchdog/src/watchdog.py +++ b/src/watchdog/src/watchdog.py @@ -28,6 +28,8 @@ import signal import faulthandler import gc +import re +import collections import yaml import prometheus_client @@ -55,28 +57,80 @@ list_pods_histogram = Histogram("k8s_api_list_pods_latency_seconds", "Response latency for list pods from k8s api (seconds)") +list_ns_histogram = Histogram("k8s_api_list_ns_latency_seconds", + "Response latency for list namespaces from k8s api (seconds)") + list_nodes_histogram = Histogram("k8s_api_list_nodes_latency_seconds", "Response latency for list nodes from k8s api (seconds)") def gen_pai_pod_gauge(): return GaugeMetricFamily("pai_pod_count", "count of pai pod", - labels=["service_name", "name", "phase", "host_ip", + labels=["service_name", "name", "namespace", "phase", "host_ip", "initialized", "pod_scheduled", "ready"]) def gen_pai_container_gauge(): return GaugeMetricFamily("pai_container_count", "count of container pod", - labels=["service_name", "pod_name", "name", "state", "host_ip", "ready"]) + labels=["service_name", "pod_name", "name", "namespace", "state", + "host_ip", "ready"]) def gen_pai_node_gauge(): return GaugeMetricFamily("pai_node_count", "count of pai node", - labels=["name", "disk_pressure", "memory_pressure", "out_of_disk", "ready"]) + labels=["name", "disk_pressure", "memory_pressure", "out_of_disk", "ready", "unschedulable"]) def gen_k8s_api_gauge(): return GaugeMetricFamily("k8s_api_server_count", "count of k8s api server", labels=["error", "host_ip"]) +def gen_k8s_node_gpu_available(): + return GaugeMetricFamily("k8s_node_gpu_available", "gpu available on k8s node", + labels=["host_ip"]) + +# reserved gpu means gpu not allocated to tasks and the node is being marked as +# unschedulable. 
+def gen_k8s_node_gpu_reserved(): + return GaugeMetricFamily("k8s_node_gpu_reserved", "gpu reserved on k8s node", + labels=["host_ip"]) + +def gen_k8s_node_gpu_total(): + return GaugeMetricFamily("k8s_node_gpu_total", "gpu total on k8s node", + labels=["host_ip"]) + ##### watchdog will generate above metrics +def walk_json_field_safe(obj, *fields): + """ for example a=[{"a": {"b": 2}}] + walk_json_field_safe(a, 0, "a", "b") will get 2 + walk_json_field_safe(a, 0, "not_exist") will get None + """ + try: + for f in fields: + obj = obj[f] + return obj + except: + return None + +def convert_to_byte(data): + data = data.lower() + number = float(re.findall(r"[0-9.]+", data)[0]) + if "t" in data: + return number * 10 ** 12 + elif "g" in data: + return number * 10 ** 9 + elif "m" in data: + return number * 10 ** 6 + elif "k" in data: + return number * 10 ** 3 + elif "ti" in data: + return number * 2 ** 40 + elif "gi" in data: + return number * 2 ** 30 + elif "mi" in data: + return number * 2 ** 20 + elif "ki" in data: + return number * 2 ** 10 + else: + return number + class AtomicRef(object): """ a thread safe way to store and get object, should not modify data get from this ref """ def __init__(self): @@ -120,16 +174,36 @@ def catch_exception(fn, msg, default, *args, **kwargs): logger.exception(msg) return default +class PodInfo(object): + def __init__(self, name, gpu): + self.name = name + self.gpu = gpu -def parse_pod_item(pai_pod_gauge, pai_container_gauge, pod): + def __repr__(self): + return "%s: %s" % (self.name, self.gpu) + +def parse_pod_item(pod, namespace, pai_pod_gauge, pai_container_gauge, pods_info): """ add metrics to pai_pod_gauge or pai_container_gauge if successfully paesed pod. Because we are parsing json outputed by k8s, its format is subjected to change, we should test if field exists before accessing it to avoid KeyError """ pod_name = pod["metadata"]["name"] + host_ip = walk_json_field_safe(pod, "status", "hostIP") or "unscheduled" + + used_gpu = 0 + containers = walk_json_field_safe(pod, "spec", "containers") + if containers is not None: + for container in containers: + req_gpu = int(walk_json_field_safe(container, "resources", "requests", + "nvidia.com/gpu") or 0) + limit_gpu = int(walk_json_field_safe(container, "resources", "limits", + "nvidia.com/gpu") or 0) + used_gpu += max(req_gpu, limit_gpu) + pods_info[host_ip].append(PodInfo(pod_name, used_gpu)) + labels = pod["metadata"].get("labels") if labels is None or "app" not in labels: - logger.warning("unkown pod %s", pod["metadata"]["name"]) + logger.info("unknown pod %s", pod["metadata"]["name"]) return None service_name = labels["app"] # get pai service name from label @@ -141,10 +215,6 @@ def parse_pod_item(pai_pod_gauge, pai_container_gauge, pod): else: phase = "unknown" - host_ip = "unscheduled" # can not specify None here, None will cause None exception - if status.get("hostIP") is not None: - host_ip = status["hostIP"] - initialized = pod_scheduled = ready = "unknown" conditions = status.get("conditions") @@ -163,7 +233,7 @@ def parse_pod_item(pai_pod_gauge, pai_container_gauge, pod): error_counter.labels(type="unknown_pod_cond").inc() logger.error("unexpected condition %s in pod %s", cond_t, pod_name) - pai_pod_gauge.add_metric([service_name, pod_name, phase, host_ip, + pai_pod_gauge.add_metric([service_name, pod_name, namespace, phase, host_ip, initialized, pod_scheduled, ready], 1) # generate pai_containers @@ -189,19 +259,21 @@ def parse_pod_item(pai_pod_gauge, pai_container_gauge, pod): container_state = 
list(state.keys())[0].lower() pai_container_gauge.add_metric([service_name, pod_name, container_name, - container_state, host_ip, str(ready).lower()], 1) + namespace, container_state, host_ip, str(ready).lower()], 1) - return pai_pod_gauge, pai_container_gauge - -def process_pods_status(pai_pod_gauge, pai_container_gauge, podsJsonObject): +def process_pods_status(pods_object, namespace, pai_pod_gauge, pai_container_gauge, + pods_info): def _map_fn(item): return catch_exception(parse_pod_item, "catch exception when parsing pod item", None, - pai_pod_gauge, pai_container_gauge, item) + item, + namespace, + pai_pod_gauge, pai_container_gauge, + pods_info) - list(map(_map_fn, podsJsonObject["items"])) + list(map(_map_fn, pods_object["items"])) def collect_healthz(gauge, histogram, scheme, address, port, url, ca_path, headers): @@ -217,52 +289,163 @@ def collect_healthz(gauge, histogram, scheme, address, port, url, ca_path, heade gauge.add_metric([error, address], 1) -def collect_k8s_component(k8s_gauge, api_server_scheme, api_server_ip, api_server_port, ca_path, headers): +def collect_k8s_component(api_server_scheme, api_server_ip, api_server_port, ca_path, headers): + k8s_gauge = gen_k8s_api_gauge() + collect_healthz(k8s_gauge, api_healthz_histogram, api_server_scheme, api_server_ip, api_server_port, "/healthz", ca_path, headers) -def parse_node_item(pai_node_gauge, node): - name = node["metadata"]["name"] + return [k8s_gauge] + + +def parse_node_item(node, pai_node_gauge, + node_gpu_avail, node_gpu_total, node_gpu_reserved, + pods_info): + + ip = None + + addresses = walk_json_field_safe(node, "status", "addresses") + if addresses is not None: + for addr in addresses: + if addr.get("type") == "InternalIP": + ip = addr.get("address") - disk_pressure = memory_pressure = out_of_disk = ready = "unknown" + if ip is None: + ip = node["metadata"]["name"] + + disk_pressure = memory_pressure = out_of_disk = ready = unschedulable = "unknown" if node.get("status") is not None: status = node["status"] - if status.get("conditions") is not None: - conditions = status["conditions"] - + conditions = walk_json_field_safe(status, "conditions") + if conditions is not None: for cond in conditions: cond_t = cond["type"] - status = cond["status"].lower() + node_status = cond["status"].lower() if cond_t == "DiskPressure": - disk_pressure = status + disk_pressure = node_status elif cond_t == "MemoryPressure": - memory_pressure = status + memory_pressure = node_status elif cond_t == "OutOfDisk": - out_of_disk = status + out_of_disk = node_status elif cond_t == "Ready": - ready = status + ready = node_status else: error_counter.labels(type="unknown_node_cond").inc() - logger.error("unexpected condition %s in node %s", cond_t, name) + logger.error("unexpected condition %s in node %s", cond_t, ip) + + # https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node/node-allocatable.md + # [Allocatable] = [Node Capacity] - [Kube-Reserved] - [System-Reserved] - [Hard-Eviction-Threshold] + total_gpu = 0 + + allocatable = walk_json_field_safe(status, "allocatable") + if allocatable is not None: + gpu1 = int(walk_json_field_safe(allocatable, "alpha.kubernetes.io/nvidia-gpu") or "0") + gpu2 = int(walk_json_field_safe(allocatable, "nvidia.com/gpu") or "0") + + total_gpu = max(gpu1, gpu2) + node_gpu_total.add_metric([ip], total_gpu) + else: + capacity = walk_json_field_safe(status, "capacity") + if capacity is not None: + gpu1 = int(walk_json_field_safe(capacity, "alpha.kubernetes.io/nvidia-gpu") or 
"0") + gpu2 = int(walk_json_field_safe(capacity, "nvidia.com/gpu") or "0") + total_gpu = max(gpu1. gpu2) + + node_gpu_total.add_metric([ip], total_gpu) + + # Because k8s api's node api do not record how much resource left for + # allocation, so we have to compute it ourselves. + used_gpu = 0 + + if pods_info.get(ip) is not None: + for pod in pods_info[ip]: + used_gpu += pod.gpu + + # if a node is marked as unschedulable, the available gpu will be 0 + # and reserved gpu will be `total - used` + if walk_json_field_safe(node, "spec", "unschedulable") != True: + node_gpu_avail.add_metric([ip], max(0, total_gpu - used_gpu)) + node_gpu_reserved.add_metric([ip], 0) + else: + node_gpu_avail.add_metric([ip], 0) + node_gpu_reserved.add_metric([ip], max(0, total_gpu - used_gpu)) + else: + logger.warning("unexpected structure of node %s: %s", ip, json.dumps(node)) + + unschedulable_s = walk_json_field_safe(node, "spec", "unschedulable") + if unschedulable_s is True: + unschedulable = "true" else: - logger.warning("unexpected structure of node %s: %s", name, json.dumps(node)) + unschedulable = "false" - pai_node_gauge.add_metric([name, disk_pressure, memory_pressure, out_of_disk, ready], 1) + pai_node_gauge.add_metric([ip, disk_pressure, memory_pressure, out_of_disk, ready, unschedulable], 1) - return pai_node_gauge +def process_nodes_status(nodes_object, pods_info): + pai_node_gauge = gen_pai_node_gauge() + node_gpu_avail = gen_k8s_node_gpu_available() + node_gpu_reserved = gen_k8s_node_gpu_reserved() + node_gpu_total = gen_k8s_node_gpu_total() -def process_nodes_status(pai_node_gauge, nodesJsonObject): def _map_fn(item): return catch_exception(parse_node_item, "catch exception when parsing node item", None, - pai_node_gauge, item) + item, + pai_node_gauge, + node_gpu_avail, + node_gpu_total, + node_gpu_reserved, + pods_info) + + list(map(_map_fn, nodes_object["items"])) + + return [pai_node_gauge, + node_gpu_avail, node_gpu_total, node_gpu_reserved] + + +def process_pods(k8s_api_addr, ca_path, headers, pods_info): + list_namespace_url = "{}/api/v1/namespaces".format(k8s_api_addr) + + ns_object = request_with_histogram(list_namespace_url, list_ns_histogram, + ca_path, headers) + + namespaces = [] + + ns_items = walk_json_field_safe(ns_object, "items") + if ns_items is not None: + for ns in ns_items: + ns_name = walk_json_field_safe(ns, "metadata", "name") + if ns_name is not None: + namespaces.append(ns_name) - list(map(_map_fn, nodesJsonObject["items"])) + pai_pod_gauge = gen_pai_pod_gauge() + pai_container_gauge = gen_pai_container_gauge() + + for ns in namespaces: + list_pods_url = "{}/api/v1/namespaces/{}/pods".format(k8s_api_addr, ns) + try: + pods_object = request_with_histogram(list_pods_url, list_pods_histogram, + ca_path, headers) + process_pods_status(pods_object, ns, pai_pod_gauge, pai_container_gauge, + pods_info) + except Exception as e: + error_counter.labels(type="parse").inc() + logger.exception("failed to process pods from namespace %s", ns) + + return [pai_pod_gauge, pai_container_gauge] + + +def process_nodes(k8s_api_addr, ca_path, headers, pods_info): + list_nodes_url = "{}/api/v1/nodes/".format(k8s_api_addr) + + nodes_object = request_with_histogram(list_nodes_url, list_nodes_histogram, + ca_path, headers) + + return process_nodes_status(nodes_object, pods_info) def load_machine_list(configFilePath): @@ -352,33 +535,21 @@ def loop(args, atomic_ref): bearer = bearer_file.read() headers = {'Authorization': "Bearer {}".format(bearer)} - list_pods_url = 
"{}/api/v1/namespaces/default/pods/".format(address) - list_nodes_url = "{}/api/v1/nodes/".format(address) - while True: - # these gauge is generate on each iteration - pai_pod_gauge = gen_pai_pod_gauge() - pai_container_gauge = gen_pai_container_gauge() - pai_node_gauge = gen_pai_node_gauge() - k8s_gauge = gen_k8s_api_gauge() - + result = [] try: - # 1. check service level status - podsStatus = request_with_histogram(list_pods_url, list_pods_histogram, ca_path, headers) - process_pods_status(pai_pod_gauge, pai_container_gauge, podsStatus) + pods_info = collections.defaultdict(lambda : []) + + result.extend(process_pods(address, ca_path, headers, pods_info)) - # 2. check nodes level status - nodes_status = request_with_histogram(list_nodes_url, list_nodes_histogram, ca_path, headers) - process_nodes_status(pai_node_gauge, nodes_status) + result.extend(process_nodes(address, ca_path, headers, pods_info)) - # 3. check k8s level status - collect_k8s_component(k8s_gauge, api_server_scheme, api_server_ip, api_server_port, ca_path, headers) + result.extend(collect_k8s_component(api_server_scheme, api_server_ip, api_server_port, ca_path, headers)) except Exception as e: error_counter.labels(type="unknown").inc() logger.exception("watchdog failed in one iteration") - atomic_ref.get_and_set([pai_pod_gauge, pai_container_gauge, pai_node_gauge, - k8s_gauge]) + atomic_ref.get_and_set(result) time.sleep(float(args.interval)) diff --git a/src/watchdog/test/data/dlws_nodes_list.json b/src/watchdog/test/data/dlws_nodes_list.json new file mode 100644 index 0000000000..433053d071 --- /dev/null +++ b/src/watchdog/test/data/dlws_nodes_list.json @@ -0,0 +1,158 @@ +{ + "kind": "NodeList", + "apiVersion": "v1", + "metadata": { + "selfLink": "/api/v1/nodes", + "resourceVersion": "6263195" + }, + "items": [ + { + "metadata": { + "name": "dltsp40-infra01", + "selfLink": "/api/v1/nodes/dltsp40-infra01", + "uid": "13334d00-4cc9-11e9-9957-000d3a1707fc", + "resourceVersion": "6263192", + "creationTimestamp": "2019-03-22T17:36:50Z", + "labels": { + "kubernetes.io/hostname": "dltsp40-infra01", + "yarnrm2": "active", + "freeflowrouter": "active", + "elasticsearch": "active", + "hdfsdatanode": "active", + "yarnrm1": "active", + "collectd-node-agent": "active", + "detectron": "active", + "fluentd-es-config-v0.1.0": "active", + "grafana": "active", + "hdfsjournal": "active", + "hdfsstandby": "active", + "journalnode": "active", + "nginx": "active", + "cloud-fluentd-es-v2.0.2": "active", + "webportal": "active", + "google-cadvisor": "active", + "hdfsnn1": "active", + "kibana": "active", + "namenode1": "active", + "sparknode": "active", + "all": "active", + "infrastructure": "active", + "nvidia-device-plugin-daemonset": "active", + "nvidiaheartbeat": "active", + "recogserver": "active", + "restfulapi": "active", + "yarnnodemanager": "active", + "zk": "active", + "datanode": "active", + "zookeeper": "active", + "default": "active", + "elasticsearch-logging": "active", + "jobmanager": "active", + "mysql": "active", + "zk-config": "active", + "cloud-collectd-node-agent": "active", + "beta.kubernetes.io/os": "linux", + "cloud-fluentd-es-config-v0.1.0": "active", + "fluentd-es-v2.0.2": "active", + "hdfsformat": "active", + "hdfsnn2": "active", + "zk-headless": "active", + "FragmentGPUJob": "active", + "dlws-grafana": "active", + "influxdb": "active", + "beta.kubernetes.io/arch": "amd64" + }, + "annotations": { + "node.alpha.kubernetes.io/ttl": "0", + "volumes.kubernetes.io/controller-managed-attach-detach": "true" + } + }, + 
"spec": { + "externalID": "dltsp40-infra01", + "providerID": "aztools://dltsp40-infra01", + "taints": [ + { + "key": "node-role.kubernetes.io/master", + "effect": "NoSchedule" + } + ] + }, + "status": { + "capacity": { + "alpha.kubernetes.io/nvidia-gpu": "4", + "cpu": "16", + "memory": "57709692Ki", + "pods": "110" + }, + "allocatable": { + "cpu": "16", + "memory": "57607292Ki", + "pods": "110", + "alpha.kubernetes.io/nvidia-gpu": "4" + }, + "conditions": [ + { + "type": "OutOfDisk", + "status": "False", + "lastHeartbeatTime": "2019-04-18T08:44:11Z", + "lastTransitionTime": "2019-03-22T17:36:49Z", + "reason": "KubeletHasSufficientDisk", + "message": "kubelet has sufficient disk space available" + }, + { + "type": "MemoryPressure", + "status": "False", + "lastHeartbeatTime": "2019-04-18T08:44:11Z", + "lastTransitionTime": "2019-03-22T17:36:49Z", + "reason": "KubeletHasSufficientMemory", + "message": "kubelet has sufficient memory available" + }, + { + "type": "DiskPressure", + "status": "False", + "lastHeartbeatTime": "2019-04-18T08:44:11Z", + "lastTransitionTime": "2019-03-22T17:36:49Z", + "reason": "KubeletHasNoDiskPressure", + "message": "kubelet has no disk pressure" + }, + { + "type": "Ready", + "status": "True", + "lastHeartbeatTime": "2019-04-18T08:44:11Z", + "lastTransitionTime": "2019-03-22T17:37:10Z", + "reason": "KubeletReady", + "message": "kubelet is posting ready status. AppArmor enabled" + } + ], + "addresses": [ + { + "type": "InternalIP", + "address": "192.168.255.1" + }, + { + "type": "Hostname", + "address": "dltsp40-infra01" + } + ], + "daemonEndpoints": { + "kubeletEndpoint": { + "Port": 10250 + } + }, + "nodeInfo": { + "machineID": "0fcc50243f694b94b64db96ab895ee7c", + "systemUUID": "29e4606f-2997-0345-b916-11050543c01b", + "bootID": "32861647-ea09-44c7-82dd-38c92a831ebf", + "kernelVersion": "4.18.0-1013-azure", + "osImage": "Ubuntu 18.04.2 LTS", + "containerRuntimeVersion": "docker://18.9.3", + "kubeletVersion": "v1.9.0", + "kubeProxyVersion": "v1.9.0", + "operatingSystem": "linux", + "architecture": "amd64" + }, + "images": [] + } + } + ] +} diff --git a/src/watchdog/test/data/dlws_nodes_list_with_unschedulable.json b/src/watchdog/test/data/dlws_nodes_list_with_unschedulable.json new file mode 100644 index 0000000000..dc70851fe1 --- /dev/null +++ b/src/watchdog/test/data/dlws_nodes_list_with_unschedulable.json @@ -0,0 +1,159 @@ +{ + "kind": "NodeList", + "apiVersion": "v1", + "metadata": { + "selfLink": "/api/v1/nodes", + "resourceVersion": "6263195" + }, + "items": [ + { + "metadata": { + "name": "dltsp40-infra01", + "selfLink": "/api/v1/nodes/dltsp40-infra01", + "uid": "13334d00-4cc9-11e9-9957-000d3a1707fc", + "resourceVersion": "6263192", + "creationTimestamp": "2019-03-22T17:36:50Z", + "labels": { + "kubernetes.io/hostname": "dltsp40-infra01", + "yarnrm2": "active", + "freeflowrouter": "active", + "elasticsearch": "active", + "hdfsdatanode": "active", + "yarnrm1": "active", + "collectd-node-agent": "active", + "detectron": "active", + "fluentd-es-config-v0.1.0": "active", + "grafana": "active", + "hdfsjournal": "active", + "hdfsstandby": "active", + "journalnode": "active", + "nginx": "active", + "cloud-fluentd-es-v2.0.2": "active", + "webportal": "active", + "google-cadvisor": "active", + "hdfsnn1": "active", + "kibana": "active", + "namenode1": "active", + "sparknode": "active", + "all": "active", + "infrastructure": "active", + "nvidia-device-plugin-daemonset": "active", + "nvidiaheartbeat": "active", + "recogserver": "active", + "restfulapi": "active", + 
"yarnnodemanager": "active", + "zk": "active", + "datanode": "active", + "zookeeper": "active", + "default": "active", + "elasticsearch-logging": "active", + "jobmanager": "active", + "mysql": "active", + "zk-config": "active", + "cloud-collectd-node-agent": "active", + "beta.kubernetes.io/os": "linux", + "cloud-fluentd-es-config-v0.1.0": "active", + "fluentd-es-v2.0.2": "active", + "hdfsformat": "active", + "hdfsnn2": "active", + "zk-headless": "active", + "FragmentGPUJob": "active", + "dlws-grafana": "active", + "influxdb": "active", + "beta.kubernetes.io/arch": "amd64" + }, + "annotations": { + "node.alpha.kubernetes.io/ttl": "0", + "volumes.kubernetes.io/controller-managed-attach-detach": "true" + } + }, + "spec": { + "externalID": "dltsp40-infra01", + "providerID": "aztools://dltsp40-infra01", + "unschedulable": true, + "taints": [ + { + "key": "node-role.kubernetes.io/master", + "effect": "NoSchedule" + } + ] + }, + "status": { + "capacity": { + "alpha.kubernetes.io/nvidia-gpu": "4", + "cpu": "16", + "memory": "57709692Ki", + "pods": "110" + }, + "allocatable": { + "cpu": "16", + "memory": "57607292Ki", + "pods": "110", + "alpha.kubernetes.io/nvidia-gpu": "4" + }, + "conditions": [ + { + "type": "OutOfDisk", + "status": "False", + "lastHeartbeatTime": "2019-04-18T08:44:11Z", + "lastTransitionTime": "2019-03-22T17:36:49Z", + "reason": "KubeletHasSufficientDisk", + "message": "kubelet has sufficient disk space available" + }, + { + "type": "MemoryPressure", + "status": "False", + "lastHeartbeatTime": "2019-04-18T08:44:11Z", + "lastTransitionTime": "2019-03-22T17:36:49Z", + "reason": "KubeletHasSufficientMemory", + "message": "kubelet has sufficient memory available" + }, + { + "type": "DiskPressure", + "status": "False", + "lastHeartbeatTime": "2019-04-18T08:44:11Z", + "lastTransitionTime": "2019-03-22T17:36:49Z", + "reason": "KubeletHasNoDiskPressure", + "message": "kubelet has no disk pressure" + }, + { + "type": "Ready", + "status": "True", + "lastHeartbeatTime": "2019-04-18T08:44:11Z", + "lastTransitionTime": "2019-03-22T17:37:10Z", + "reason": "KubeletReady", + "message": "kubelet is posting ready status. 
AppArmor enabled" + } + ], + "addresses": [ + { + "type": "InternalIP", + "address": "192.168.255.1" + }, + { + "type": "Hostname", + "address": "dltsp40-infra01" + } + ], + "daemonEndpoints": { + "kubeletEndpoint": { + "Port": 10250 + } + }, + "nodeInfo": { + "machineID": "0fcc50243f694b94b64db96ab895ee7c", + "systemUUID": "29e4606f-2997-0345-b916-11050543c01b", + "bootID": "32861647-ea09-44c7-82dd-38c92a831ebf", + "kernelVersion": "4.18.0-1013-azure", + "osImage": "Ubuntu 18.04.2 LTS", + "containerRuntimeVersion": "docker://18.9.3", + "kubeletVersion": "v1.9.0", + "kubeProxyVersion": "v1.9.0", + "operatingSystem": "linux", + "architecture": "amd64" + }, + "images": [] + } + } + ] +} diff --git a/src/watchdog/test/data/nodes_list.json b/src/watchdog/test/data/nodes_list.json index 18ae5acc82..38bce834c5 100644 --- a/src/watchdog/test/data/nodes_list.json +++ b/src/watchdog/test/data/nodes_list.json @@ -18,6 +18,18 @@ "externalID": "10.151.40.4" }, "status": { + "capacity": { + "alpha.kubernetes.io/nvidia-gpu": "0", + "cpu": "16", + "memory": "57709692Ki", + "pods": "110" + }, + "allocatable": { + "cpu": "16", + "memory": "57607292Ki", + "pods": "110", + "alpha.kubernetes.io/nvidia-gpu": "0" + }, "conditions": [ { "type": "OutOfDisk", diff --git a/src/watchdog/test/test_watchdog.py b/src/watchdog/test/test_watchdog.py index 27c6e84055..3beadfbe2a 100644 --- a/src/watchdog/test/test_watchdog.py +++ b/src/watchdog/test/test_watchdog.py @@ -22,6 +22,7 @@ import json import logging import logging.config +import collections import prometheus_client @@ -62,10 +63,12 @@ def get_data_test_input(self, path): def test_parse_pods_status(self): obj = json.loads(self.get_data_test_input("data/pods_list.json")) - pod_gauge = watchdog.gen_pai_node_gauge() + pod_gauge = watchdog.gen_pai_pod_gauge() container_gauge = watchdog.gen_pai_container_gauge() + pod_info = collections.defaultdict(lambda : []) - watchdog.process_pods_status(pod_gauge, container_gauge, obj) + watchdog.process_pods_status(obj, "default", + pod_gauge, container_gauge, pod_info) self.assertTrue(len(pod_gauge.samples) > 0) self.assertTrue(len(container_gauge.samples) > 0) @@ -73,19 +76,22 @@ def test_parse_pods_status(self): def test_process_nodes_status(self): obj = json.loads(self.get_data_test_input("data/nodes_list.json")) - gauge = watchdog.gen_pai_node_gauge() + gauges = watchdog.process_nodes_status(obj, {}) - watchdog.process_nodes_status(gauge, obj) + self.assertTrue(len(gauges) == 4) - self.assertTrue(len(gauge.samples) > 0) + for gauge in gauges: + self.assertTrue(len(gauge.samples) > 0) def test_process_pods_with_no_condition(self): obj = json.loads(self.get_data_test_input("data/no_condtion_pod.json")) pod_gauge = watchdog.gen_pai_pod_gauge() container_gauge = watchdog.gen_pai_container_gauge() + pod_info = collections.defaultdict(lambda : []) - watchdog.process_pods_status(pod_gauge, container_gauge, obj) + watchdog.process_pods_status(obj, "default", + pod_gauge, container_gauge, pod_info) self.assertTrue(len(pod_gauge.samples) > 0) self.assertEqual(0, len(container_gauge.samples)) @@ -97,8 +103,10 @@ class CustomCollector(object): def collect(self): pod_gauge = watchdog.gen_pai_pod_gauge() container_gauge = watchdog.gen_pai_container_gauge() + pod_info = collections.defaultdict(lambda : []) - watchdog.process_pods_status(pod_gauge, container_gauge, obj) + watchdog.process_pods_status(obj, "default", + pod_gauge, container_gauge, pod_info) yield pod_gauge yield container_gauge @@ -108,5 +116,58 @@ def collect(self): # 
expect no exception
         prometheus_client.write_to_textfile("/tmp/test_watchdog.prom", registry)
 
+    def test_process_dlws_nodes_status(self):
+        obj = json.loads(self.get_data_test_input("data/dlws_nodes_list.json"))
+
+        pod_info = collections.defaultdict(lambda : [])
+        pod_info["192.168.255.1"].append(watchdog.PodInfo("job1", 2))
+        gauges = watchdog.process_nodes_status(obj, pod_info)
+
+        self.assertTrue(len(gauges) == 4)
+
+        self.assertEqual("k8s_node_gpu_available", gauges[1].name)
+        self.assertEqual(1, len(gauges[1].samples))
+        self.assertEqual(2, gauges[1].samples[0].value)
+        self.assertEqual("k8s_node_gpu_total", gauges[2].name)
+        self.assertEqual(1, len(gauges[2].samples))
+        self.assertEqual(4, gauges[2].samples[0].value)
+        self.assertEqual("k8s_node_gpu_reserved", gauges[3].name)
+        self.assertEqual(1, len(gauges[3].samples))
+        self.assertEqual(0, gauges[3].samples[0].value)
+
+        for gauge in gauges:
+            self.assertTrue(len(gauge.samples) > 0)
+
+        for gauge in gauges[1:]:
+            self.assertEqual("192.168.255.1", gauge.samples[0].labels["host_ip"])
+
+    def test_process_dlws_nodes_status_with_unschedulable(self):
+        obj = json.loads(self.get_data_test_input("data/dlws_nodes_list_with_unschedulable.json"))
+
+        pod_info = collections.defaultdict(lambda : [])
+        pod_info["192.168.255.1"].append(watchdog.PodInfo("job1", 2))
+        gauges = watchdog.process_nodes_status(obj, pod_info)
+
+        self.assertTrue(len(gauges) == 4)
+
+        self.assertEqual("pai_node_count", gauges[0].name)
+        self.assertEqual(1, len(gauges[0].samples))
+        self.assertEqual("true", gauges[0].samples[0].labels["unschedulable"])
+        self.assertEqual("k8s_node_gpu_available", gauges[1].name)
+        self.assertEqual(1, len(gauges[1].samples))
+        self.assertEqual(0, gauges[1].samples[0].value)
+        self.assertEqual("k8s_node_gpu_total", gauges[2].name)
+        self.assertEqual(1, len(gauges[2].samples))
+        self.assertEqual(4, gauges[2].samples[0].value)
+        self.assertEqual("k8s_node_gpu_reserved", gauges[3].name)
+        self.assertEqual(1, len(gauges[3].samples))
+        self.assertEqual(2, gauges[3].samples[0].value)
+
+        for gauge in gauges:
+            self.assertTrue(len(gauge.samples) > 0)
+
+        for gauge in gauges[1:]:
+            self.assertEqual("192.168.255.1", gauge.samples[0].labels["host_ip"])
+
 if __name__ == '__main__':
     unittest.main()