adapt dlws jobs (#2604)
xudifsd authored May 10, 2019
1 parent 0eb286f commit eae57d8
Showing 12 changed files with 1,070 additions and 80 deletions.
8 changes: 8 additions & 0 deletions docs/alerting/watchdog-metrics.md
@@ -57,6 +57,14 @@ vi watchdog-xx.log
| ---------- | ----------- |
| k8s_api_server_count | Uses the label `error` to represent status; if `error` != "ok", the k8s API server is not functioning correctly |

+## K8s Resource Metrics
+| Metric name| Description |
+| ---------- | ----------- |
+| k8s_node_gpu_total | Total GPU count on the node |
+| k8s_node_gpu_available | Total GPU count minus used GPU count |
+| k8s_node_gpu_reserved | If the node is marked unschedulable via `kubectl cordon $node`, all unused GPUs are deemed reserved |


## Other Metrics
| Metric name| Description |
| ---------- | ----------- |
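The three new gauges are related. Below is a minimal Python sketch of that relationship, following the table's definitions literally; the function name and inputs are hypothetical, not watchdog's actual code.

```python
# Minimal sketch (not watchdog's implementation) of how the three new gauges
# relate for a single node, reading the table's definitions literally.
def node_gpu_gauges(total, used, unschedulable):
    """Return (k8s_node_gpu_total, _available, _reserved) for one node."""
    unused = max(total - used, 0)
    available = unused                         # total GPU count minus used GPU count
    reserved = unused if unschedulable else 0  # unused GPUs on a cordoned node
    return total, available, reserved

# A cordoned node with 8 GPUs, 3 of them in use: 5 unused GPUs are deemed reserved.
assert node_gpu_gauges(8, 3, unschedulable=True) == (8, 5, 5)
```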
2 changes: 2 additions & 0 deletions src/job-exporter/deploy/job-exporter.yaml.template
@@ -68,6 +68,8 @@ spec:
value: {{ cluster_cfg["job-exporter"]["logging-level"] }}
- name: NV_DRIVER
value: /var/drivers/nvidia/current
+- name: NVIDIA_VISIBLE_DEVICES
+  value: all
volumeMounts:
- mountPath: /var/run/docker.sock
name: docker-socket
8 changes: 3 additions & 5 deletions src/job-exporter/src/docker_inspect.py
@@ -49,7 +49,7 @@ def __eq__(self, o):


keys = {"PAI_JOB_NAME", "PAI_USER_NAME", "PAI_CURRENT_TASK_ROLE_NAME", "GPU_ID",
"PAI_TASK_INDEX"}
"PAI_TASK_INDEX", "DLWS_JOB_ID", "DLWS_USER_NAME"}


def parse_docker_inspect(inspect_output):
@@ -79,15 +79,13 @@ def parse_docker_inspect(inspect_output):
    pid = utils.walk_json_field_safe(obj, 0, "State", "Pid")

    return InspectResult(
-            m.get("PAI_USER_NAME"),
-            m.get("PAI_JOB_NAME"),
+            m.get("PAI_USER_NAME") or m.get("DLWS_USER_NAME"),
+            m.get("PAI_JOB_NAME") or m.get("DLWS_JOB_ID"),
            m.get("PAI_CURRENT_TASK_ROLE_NAME"),
            m.get("PAI_TASK_INDEX"),
            m.get("GPU_ID"),
            pid)

-    return {"env": envs, "labels": labels, "pid": pid}

def inspect(container_id, histogram, timeout):
    try:
        result = utils.exec_cmd(
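The docker_inspect change makes the exporter fall back to DLWS environment variables when the PAI ones are absent. A small illustrative snippet of that fallback, using the values from the new test case further down; the dictionary itself is hypothetical.

```python
# Illustrative only: the PAI -> DLWS fallback applied to a hypothetical
# container environment that carries DLWS variables but no PAI ones.
dlws_env = {
    "DLWS_USER_NAME": "dixu",
    "DLWS_JOB_ID": "0c435eee-d31f-43d5-a1b3-442845fa1d0c",
}

username = dlws_env.get("PAI_USER_NAME") or dlws_env.get("DLWS_USER_NAME")
job_name = dlws_env.get("PAI_JOB_NAME") or dlws_env.get("DLWS_JOB_ID")

assert username == "dixu"
assert job_name == "0c435eee-d31f-43d5-a1b3-442845fa1d0c"
```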
26 changes: 16 additions & 10 deletions src/job-exporter/src/main.py
@@ -63,7 +63,12 @@ def config_environ():
os.path.join(driver_path, "lib") + os.pathsep + \
os.path.join(driver_path, "lib64")

logger.debug("LD_LIBRARY_PATH is %s", os.environ["LD_LIBRARY_PATH"])
driver_bin_path = os.path.join(driver_path, "bin")
os.environ["PATH"] = os.environ["PATH"] + ":" + driver_bin_path

logger.debug("LD_LIBRARY_PATH is %s, PATH is %s",
os.environ["LD_LIBRARY_PATH"],
os.environ["PATH"])


def try_remove_old_prom_file(path):
@@ -82,16 +87,17 @@ def get_gpu_count(path):

logger.debug("hostname is %s, ip is %s", hostname, ip)

with open(path) as f:
gpu_config = json.load(f)
if os.path.isfile(path):
with open(path) as f:
gpu_config = json.load(f)

if hostname is not None and gpu_config["nodes"].get(hostname) is not None:
return gpu_config["nodes"][hostname]["gpuCount"]
elif ip is not None and gpu_config["nodes"].get(ip) is not None:
return gpu_config["nodes"][ip]["gpuCount"]

if hostname is not None and gpu_config["nodes"].get(hostname) is not None:
return gpu_config["nodes"][hostname]["gpuCount"]
elif ip is not None and gpu_config["nodes"].get(ip) is not None:
return gpu_config["nodes"][ip]["gpuCount"]
else:
logger.warning("failed to find gpu count from config %s", gpu_config)
return 0
logger.warning("failed to find gpu count from config %s", path)
return 0


def register_stack_trace_dump():
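Two things change in main.py: config_environ() now also appends the driver's bin directory to PATH (so nvidia-smi can be invoked by name, see nvidia.py below), and get_gpu_count() no longer assumes the gpu config file exists. A minimal sketch of the new guard, with a hypothetical helper name and path, not the exporter's own code:

```python
# Minimal sketch of the new guard in get_gpu_count(): read the gpu config
# only if the file exists, otherwise warn and report zero GPUs instead of
# failing on a missing file. Helper name and arguments are hypothetical.
import json
import logging
import os

logger = logging.getLogger(__name__)

def gpu_count_from_config(path, node_name):
    if os.path.isfile(path):
        with open(path) as f:
            gpu_config = json.load(f)
        node = gpu_config["nodes"].get(node_name)
        if node is not None:
            return node["gpuCount"]
    logger.warning("failed to find gpu count from config %s", path)
    return 0
```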
5 changes: 1 addition & 4 deletions src/job-exporter/src/nvidia.py
@@ -162,11 +162,8 @@ def parse_smi_xml_result(smi):
    return result

def nvidia_smi(histogram, timeout):
-    driver_path = os.environ["NV_DRIVER"]
-    bin_path = os.path.join(driver_path, "bin/nvidia-smi")
-
    try:
-        smi_output = utils.exec_cmd([bin_path, "-q", "-x"],
+        smi_output = utils.exec_cmd(["nvidia-smi", "-q", "-x"],
                histogram=histogram, timeout=timeout)

        return parse_smi_xml_result(smi_output)
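With the PATH change in main.py, nvidia.py can call the bare nvidia-smi command instead of building an absolute path from NV_DRIVER. A sketch of the combined behaviour; the default driver path mirrors the deployment template above, and subprocess stands in for the exporter's utils.exec_cmd helper.

```python
# Sketch only: extend PATH with $NV_DRIVER/bin as main.py now does, then
# invoke the bare "nvidia-smi" as nvidia.py now does.
import os
import subprocess

driver_path = os.environ.get("NV_DRIVER", "/var/drivers/nvidia/current")
os.environ["PATH"] = os.environ["PATH"] + os.pathsep + os.path.join(driver_path, "bin")

# Query GPU state as XML, with the same flags the exporter passes.
smi_xml = subprocess.check_output(["nvidia-smi", "-q", "-x"], timeout=10)
```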
403 changes: 403 additions & 0 deletions src/job-exporter/test/data/dlts_docker_inspect.json

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions src/job-exporter/test/test_docker_inspect.py
@@ -76,5 +76,20 @@ def test_parse_docker_inspect_BUGFIX(self):
            30332)
        self.assertEqual(target_inspect_info, inspect_info)

+    def test_adapt_dlts_jobs(self):
+        sample_path = "data/dlts_docker_inspect.json"
+        with open(sample_path, "r") as f:
+            docker_inspect = f.read()
+
+        inspect_info = parse_docker_inspect(docker_inspect)
+        target_inspect_info = InspectResult(
+            "dixu",
+            "0c435eee-d31f-43d5-a1b3-442845fa1d0c",
+            None,
+            None,
+            "GPU-7c583998-b3ff-a885-8979-2d32d334cde4",
+            3533)
+        self.assertEqual(target_inspect_info, inspect_info)
+
if __name__ == '__main__':
    unittest.main()
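One way to exercise the new test case locally (illustrative, not part of this commit); it runs the whole test module from the directory that holds the new sample JSON:

```python
# Run the docker_inspect tests from src/job-exporter/test so that the
# relative path "data/dlts_docker_inspect.json" resolves.
import subprocess

subprocess.run(
    ["python3", "-m", "unittest", "-v", "test_docker_inspect"],
    cwd="src/job-exporter/test",
    check=True,
)
```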