Skip to content

Commit

Permalink
Add cluster log utilities
Browse files Browse the repository at this point in the history
  • Loading branch information
jacobtomlinson committed Nov 17, 2023
1 parent b6ba0c3 commit ff962d2
Show file tree
Hide file tree
Showing 2 changed files with 152 additions and 7 deletions.
33 changes: 33 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,36 @@ https://dbc-dp-xxxx.cloud.databricks.com/driver-proxy/o/xxxx/xx-xxx-xxxx/8087/st
![](https://user-images.githubusercontent.com/1610850/281442274-450d41c6-2eb6-42a1-8de6-c4a1a1b84193.png)

![](https://user-images.githubusercontent.com/1610850/281441285-9b84d5f1-d58a-45dc-9354-7385e1599d1f.png)

### Troubleshooting with cluster logs

If you're experiencing problems starting your Dask Databricks cluster then viewing logs for your init scripts can help narrow down the problem.

When you create your cluster we recommend that you [configure your logs](https://docs.databricks.com/en/clusters/configure.html#cluster-log-delivery) to write to somewhere like `dbfs:/cluster_init_logs`.

To make viewing these logs a little easier we've included a couple of CLI utilities in `dask-databricks` to help you navigate them.

#### Listing clusters

You can get a full list of clusters with available logs using the `dask databricks logs ls <path>` command, where the path is the DBFS location you configured your logs to write to.

```console
$ dask databricks logs ls dbfs:/cluster_init_logs

Cluster Start time Node Count Node IPs
──────────────────────────────────────────────────────────────────────────────────────
1234-987654-a1b2c3d4 Nov 16 2023 10:36 2 10.0.0.1, 10.0.0.2
```

#### Viewing logs

Once you have your cluster ID you can view the logs from the latest launch of that cluster with `dask databricks logs cat <path> <cluster>`.

```console
$ dask databricks logs cat dbfs:/cluster_init_logs 1234-987654-a1b2c3d4
Cluster: 1234-987654-a1b2c3d4
Start time: Nov 16 2023 10:36
10.0.0.1: Start Python bootstrap
10.0.0.1: PYSPARK_PYTHON is /databricks/python3/bin/python
...
```
126 changes: 119 additions & 7 deletions dask_databricks/cli.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,30 @@
import logging
import os
import random
import socket
import subprocess
import sys
import time
from datetime import datetime

import click
from rich import box
from rich.color import ANSI_COLOR_NAMES
from rich.console import Console
from rich.logging import RichHandler
from rich.table import Table

console = Console()

NODE_COLOURS = ["medium_spring_green", "light_steel_blue1", "wheat1", "medium_orchid"]

# Generate list of random colours from rich
# import random
# from rich.color import Color
#
# for i in range(100):
# colour = Color.random()
#     print(f'"{colour.name}",', end=" ")


def get_logger():
Expand Down Expand Up @@ -59,21 +77,115 @@ def logs():
"""View cluster init logs."""


@logs.command()
@click.argument("path")
def ls(path):
def _get_logs_at_path(path):
try:
from databricks.sdk.runtime import dbutils
except ImportError:
raise RuntimeError("Please install databricks-sdk.")
clusters = {}

log_files = []
for cluster in dbutils.fs.ls(path):
cluster_id = cluster.path.split("/")[-1]
clusters[cluster_id] = {}
for node in dbutils.fs.ls(cluster.path + "/init_scripts"):
for log in dbutils.fs.ls(node.path):
log_files.append(log.path)
for log in sorted(log_files, key=lambda x: x.split("/")[-1]):
print(log)
filename = log.path.split("/")[-1]
channel = filename.split(".")[-2]
datetime = "_".join(filename.split("_")[:2])
node_name = log.path.split("/")[-2].split("_", 1)[-1].replace("_", ".")
if datetime not in clusters[cluster_id]:
clusters[cluster_id][datetime] = {}

if node_name not in clusters[cluster_id][datetime]:
clusters[cluster_id][datetime][node_name] = {}

clusters[cluster_id][datetime][node_name][channel] = log.path
return clusters


def _get_node_color(i):
if i < len(NODE_COLOURS):
return NODE_COLOURS[i]
else:
return random.choice(list(ANSI_COLOR_NAMES))


def _prettify_launch_time(launch_time):
return datetime.strptime(launch_time, "%Y%m%d_%H%M%S").strftime("%b %d %Y %H:%M")


@logs.command()
@click.argument("path")
@click.option("--show-filenames", help="Show filenames in the output", is_flag=True, default=False, show_default=True)
def ls(path, show_filenames):
# TODO add flag to list filenames
table = Table(box=box.SIMPLE_HEAD)
table.add_column("Cluster", style="cyan", no_wrap=True)
table.add_column("Start time", style="plum2")
table.add_column("Node Count")
table.add_column("Node IPs")
if show_filenames:
table.add_column("Filenames")
with console.status("[bright_black]Finding logs..."):
clusters = _get_logs_at_path(path)
for cluster in clusters:
first = True
for launch_time in sorted(clusters[cluster], reverse=True):
pretty_launch_time = _prettify_launch_time(launch_time)
cluster_name = cluster if first else ""
node_list = ", ".join(
f"[{_get_node_color(i)}]{name}[/{_get_node_color(i)}]"
for i, name in enumerate(clusters[cluster][launch_time])
)
data = [cluster_name, pretty_launch_time, str(len(clusters[cluster][launch_time])), node_list]
if show_filenames:
filenames = ""
for i, node in enumerate(clusters[cluster][launch_time]):
for channel in ["stdout", "stderr"]:
node_colour = _get_node_color(i)
filenames += f"[{node_colour}]{clusters[cluster][launch_time][node][channel]}[/{node_colour}]\n"
data.append(filenames)
table.add_row(*data)
first = False

console.print(table)


@logs.command()
@click.argument("path")
@click.argument("cluster")
def cat(path, cluster):
# TODO add a flag for selecting which start time to view
# TODO add a flag to filter which nodes to view logs for
try:
from databricks.sdk.runtime import dbutils
except ImportError:
raise RuntimeError("Please install databricks-sdk.")

with console.status("[bright_black]Finding logs..."):
clusters = _get_logs_at_path(path)

if cluster not in clusters:
console.print(f"Cluster {cluster} not found.", style="bold red", highlight=False)
console.print(
f"Hint: Try running dask [b i]databricks logs ls {path}[/b i] to list clusters.",
style="bright_black",
highlight=False,
)
sys.exit(1)

most_recent = sorted(clusters[cluster].keys())[-1]

console.print(f"Cluster: {cluster}", style="bold cyan", highlight=False)
console.print(f"Start time: {_prettify_launch_time(most_recent)}", style="bold cyan", highlight=False)

for i, node in enumerate(clusters[cluster][most_recent]):
for channel in ["stdout", "stderr"]:
for line in dbutils.fs.head(clusters[cluster][most_recent][node][channel], 65536).split("\n"):
node_colour = _get_node_color(i)
console.print(
f"[{node_colour}]{node}[/{node_colour}]: {line}", style="grey89" if channel == "stdout" else "plum4"
)


if __name__ == "__main__":
Expand Down

0 comments on commit ff962d2

Please sign in to comment.