Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

adapt dlws jobs #2604

Merged
merged 9 commits into from
May 10, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/alerting/watchdog-metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,14 @@ vi watchdog-xx.log
| ---------- | ----------- |
| k8s_api_server_count | Uses the label `error` to represent status; if `error` != "ok", the k8s API server is not functioning correctly |

## K8s resource Metrics
| Metric name| Description |
| ---------- | ----------- |
| k8s_node_gpu_total | Total number of GPUs on the node |
| k8s_node_gpu_available | Total GPU count minus the number of GPUs in use |
| k8s_node_gpu_reserved | If the node is marked as unschedulable via `kubectl cordon $node`, all unused GPUs are counted as reserved |


## Other Metrics
| Metric name| Description |
| ---------- | ----------- |
Expand Down
2 changes: 2 additions & 0 deletions src/job-exporter/deploy/job-exporter.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ spec:
value: {{ cluster_cfg["job-exporter"]["logging-level"] }}
- name: NV_DRIVER
value: /var/drivers/nvidia/current
- name: NVIDIA_VISIBLE_DEVICES
value: all
volumeMounts:
- mountPath: /var/run/docker.sock
name: docker-socket
Expand Down
8 changes: 3 additions & 5 deletions src/job-exporter/src/docker_inspect.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def __eq__(self, o):


keys = {"PAI_JOB_NAME", "PAI_USER_NAME", "PAI_CURRENT_TASK_ROLE_NAME", "GPU_ID",
"PAI_TASK_INDEX"}
"PAI_TASK_INDEX", "DLWS_JOB_ID", "DLWS_USER_NAME"}


def parse_docker_inspect(inspect_output):
Expand Down Expand Up @@ -79,15 +79,13 @@ def parse_docker_inspect(inspect_output):
pid = utils.walk_json_field_safe(obj, 0, "State", "Pid")

return InspectResult(
m.get("PAI_USER_NAME"),
m.get("PAI_JOB_NAME"),
m.get("PAI_USER_NAME") or m.get("DLWS_USER_NAME"),
m.get("PAI_JOB_NAME") or m.get("DLWS_JOB_ID"),
m.get("PAI_CURRENT_TASK_ROLE_NAME"),
m.get("PAI_TASK_INDEX"),
m.get("GPU_ID"),
pid)

return {"env": envs, "labels": labels, "pid": pid}

def inspect(container_id, histogram, timeout):
try:
result = utils.exec_cmd(
Expand Down
26 changes: 16 additions & 10 deletions src/job-exporter/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,12 @@ def config_environ():
os.path.join(driver_path, "lib") + os.pathsep + \
os.path.join(driver_path, "lib64")

logger.debug("LD_LIBRARY_PATH is %s", os.environ["LD_LIBRARY_PATH"])
driver_bin_path = os.path.join(driver_path, "bin")
os.environ["PATH"] = os.environ["PATH"] + ":" + driver_bin_path

logger.debug("LD_LIBRARY_PATH is %s, PATH is %s",
os.environ["LD_LIBRARY_PATH"],
os.environ["PATH"])


def try_remove_old_prom_file(path):
Expand All @@ -82,16 +87,17 @@ def get_gpu_count(path):

logger.debug("hostname is %s, ip is %s", hostname, ip)

with open(path) as f:
gpu_config = json.load(f)
if os.path.isfile(path):
with open(path) as f:
gpu_config = json.load(f)

if hostname is not None and gpu_config["nodes"].get(hostname) is not None:
return gpu_config["nodes"][hostname]["gpuCount"]
elif ip is not None and gpu_config["nodes"].get(ip) is not None:
return gpu_config["nodes"][ip]["gpuCount"]

if hostname is not None and gpu_config["nodes"].get(hostname) is not None:
return gpu_config["nodes"][hostname]["gpuCount"]
elif ip is not None and gpu_config["nodes"].get(ip) is not None:
return gpu_config["nodes"][ip]["gpuCount"]
else:
logger.warning("failed to find gpu count from config %s", gpu_config)
return 0
logger.warning("failed to find gpu count from config %s", path)
return 0


def register_stack_trace_dump():
Expand Down
5 changes: 1 addition & 4 deletions src/job-exporter/src/nvidia.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,11 +162,8 @@ def parse_smi_xml_result(smi):
return result

def nvidia_smi(histogram, timeout):
driver_path = os.environ["NV_DRIVER"]
bin_path = os.path.join(driver_path, "bin/nvidia-smi")

try:
smi_output = utils.exec_cmd([bin_path, "-q", "-x"],
smi_output = utils.exec_cmd(["nvidia-smi", "-q", "-x"],
histogram=histogram, timeout=timeout)

return parse_smi_xml_result(smi_output)
Expand Down
403 changes: 403 additions & 0 deletions src/job-exporter/test/data/dlts_docker_inspect.json

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions src/job-exporter/test/test_docker_inspect.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,5 +76,20 @@ def test_parse_docker_inspect_BUGFIX(self):
30332)
self.assertEqual(target_inspect_info, inspect_info)

def test_adapt_dlts_jobs(self):
    """parse_docker_inspect should fall back to DLWS_USER_NAME/DLWS_JOB_ID
    when the PAI_* environment variables are absent (DLWS-launched jobs)."""
    # Resolve the fixture relative to this test file instead of the current
    # working directory, so the test passes regardless of where the test
    # runner is invoked from.
    import os
    sample_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               "data", "dlts_docker_inspect.json")
    with open(sample_path, "r") as f:
        docker_inspect = f.read()

    inspect_info = parse_docker_inspect(docker_inspect)
    target_inspect_info = InspectResult(
        "dixu",  # username taken from DLWS_USER_NAME
        "0c435eee-d31f-43d5-a1b3-442845fa1d0c",  # job id from DLWS_JOB_ID
        None,  # PAI_CURRENT_TASK_ROLE_NAME is not set for DLWS jobs
        None,  # PAI_TASK_INDEX is not set for DLWS jobs
        "GPU-7c583998-b3ff-a885-8979-2d32d334cde4",  # GPU_ID (UUID form)
        3533)  # container pid from State.Pid
    self.assertEqual(target_inspect_info, inspect_info)

# Allow running this test module directly (python test_docker_inspect.py)
# in addition to discovery via a test runner.
if __name__ == '__main__':
    unittest.main()
Loading