Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

adapt dlws jobs #2604

Merged
merged 9 commits into from
May 10, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/alerting/watchdog-metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,14 @@ vi watchdog-xx.log
| ---------- | ----------- |
| k8s_api_server_count | Uses the label `error` to represent status; if `error` != "ok", the k8s API server is not functioning correctly |

## K8s resource Metrics
| Metric name| Description |
| ---------- | ----------- |
| k8s_node_gpu_total | Total number of GPUs on the node |
| k8s_node_gpu_available | Total GPU count minus the number of GPUs in use |
| k8s_node_gpu_reserved | If the node is marked as unschedulable via `kubectl cordon $node`, all unused GPUs are counted as reserved |


## Other Metrics
| Metric name| Description |
| ---------- | ----------- |
Expand Down
2 changes: 2 additions & 0 deletions src/job-exporter/deploy/job-exporter.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ spec:
value: {{ cluster_cfg["job-exporter"]["logging-level"] }}
- name: NV_DRIVER
value: /var/drivers/nvidia/current
- name: NVIDIA_VISIBLE_DEVICES
value: all
volumeMounts:
- mountPath: /var/run/docker.sock
name: docker-socket
Expand Down
8 changes: 3 additions & 5 deletions src/job-exporter/src/docker_inspect.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def __eq__(self, o):


keys = {"PAI_JOB_NAME", "PAI_USER_NAME", "PAI_CURRENT_TASK_ROLE_NAME", "GPU_ID",
"PAI_TASK_INDEX"}
"PAI_TASK_INDEX", "DLWS_JOB_ID", "DLWS_USER_NAME"}


def parse_docker_inspect(inspect_output):
Expand Down Expand Up @@ -79,15 +79,13 @@ def parse_docker_inspect(inspect_output):
pid = utils.walk_json_field_safe(obj, 0, "State", "Pid")

return InspectResult(
m.get("PAI_USER_NAME"),
m.get("PAI_JOB_NAME"),
m.get("PAI_USER_NAME") or m.get("DLWS_USER_NAME"),
m.get("PAI_JOB_NAME") or m.get("DLWS_JOB_ID"),
m.get("PAI_CURRENT_TASK_ROLE_NAME"),
m.get("PAI_TASK_INDEX"),
m.get("GPU_ID"),
pid)

return {"env": envs, "labels": labels, "pid": pid}

def inspect(container_id, histogram, timeout):
try:
result = utils.exec_cmd(
Expand Down
26 changes: 16 additions & 10 deletions src/job-exporter/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,12 @@ def config_environ():
os.path.join(driver_path, "lib") + os.pathsep + \
os.path.join(driver_path, "lib64")

logger.debug("LD_LIBRARY_PATH is %s", os.environ["LD_LIBRARY_PATH"])
driver_bin_path = os.path.join(driver_path, "bin")
os.environ["PATH"] = os.environ["PATH"] + ":" + driver_bin_path

logger.debug("LD_LIBRARY_PATH is %s, PATH is %s",
os.environ["LD_LIBRARY_PATH"],
os.environ["PATH"])


def try_remove_old_prom_file(path):
Expand All @@ -82,16 +87,17 @@ def get_gpu_count(path):

logger.debug("hostname is %s, ip is %s", hostname, ip)

with open(path) as f:
gpu_config = json.load(f)
if os.path.isfile(path):
with open(path) as f:
gpu_config = json.load(f)

if hostname is not None and gpu_config["nodes"].get(hostname) is not None:
return gpu_config["nodes"][hostname]["gpuCount"]
elif ip is not None and gpu_config["nodes"].get(ip) is not None:
return gpu_config["nodes"][ip]["gpuCount"]

if hostname is not None and gpu_config["nodes"].get(hostname) is not None:
return gpu_config["nodes"][hostname]["gpuCount"]
elif ip is not None and gpu_config["nodes"].get(ip) is not None:
return gpu_config["nodes"][ip]["gpuCount"]
else:
logger.warning("failed to find gpu count from config %s", gpu_config)
return 0
logger.warning("failed to find gpu count from config %s", path)
return 0


def register_stack_trace_dump():
Expand Down
5 changes: 1 addition & 4 deletions src/job-exporter/src/nvidia.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,11 +162,8 @@ def parse_smi_xml_result(smi):
return result

def nvidia_smi(histogram, timeout):
driver_path = os.environ["NV_DRIVER"]
bin_path = os.path.join(driver_path, "bin/nvidia-smi")

try:
smi_output = utils.exec_cmd([bin_path, "-q", "-x"],
smi_output = utils.exec_cmd(["nvidia-smi", "-q", "-x"],
histogram=histogram, timeout=timeout)

return parse_smi_xml_result(smi_output)
Expand Down
403 changes: 403 additions & 0 deletions src/job-exporter/test/data/dlts_docker_inspect.json

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions src/job-exporter/test/test_docker_inspect.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,5 +76,20 @@ def test_parse_docker_inspect_BUGFIX(self):
30332)
self.assertEqual(target_inspect_info, inspect_info)

def test_adapt_dlts_jobs(self):
    """parse_docker_inspect should fall back to DLWS_USER_NAME/DLWS_JOB_ID
    when the PAI_* environment variables are absent (DLWS-launched jobs)."""
    # Resolve the fixture relative to this test file instead of the current
    # working directory, so the test passes regardless of where the test
    # runner is invoked from.
    import os
    sample_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               "data", "dlts_docker_inspect.json")
    with open(sample_path, "r") as f:
        docker_inspect = f.read()

    inspect_info = parse_docker_inspect(docker_inspect)
    target_inspect_info = InspectResult(
        "dixu",  # username taken from DLWS_USER_NAME
        "0c435eee-d31f-43d5-a1b3-442845fa1d0c",  # job id from DLWS_JOB_ID
        None,  # PAI_CURRENT_TASK_ROLE_NAME is not set for DLWS jobs
        None,  # PAI_TASK_INDEX is not set for DLWS jobs
        "GPU-7c583998-b3ff-a885-8979-2d32d334cde4",  # GPU_ID (UUID form)
        3533)  # container pid from State.Pid
    self.assertEqual(target_inspect_info, inspect_info)

# Allow running this test module directly (python test_docker_inspect.py)
# in addition to discovery via a test runner.
if __name__ == '__main__':
    unittest.main()
Loading