Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Basic implementation of Troubleshooter class for Overwatcher #37

Merged
merged 22 commits into from
Nov 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
be1a255
Add some initial files
albireox Nov 6, 2024
854ba76
Merge branch 'main' into troubleshooter
albireox Nov 6, 2024
913d0cd
Improve basic structure of the troubleshooter module
albireox Nov 6, 2024
5f6aaec
Try to fix building docker image for pull request
albireox Nov 6, 2024
e8fd7fd
Attempt with ${{ github.head_ref || github.ref_name }}
albireox Nov 6, 2024
1bf9481
Merge branch 'main' into troubleshooter
albireox Nov 6, 2024
33110b3
Add categorisation methods to the ErrorCode enum
albireox Nov 6, 2024
7ed39cc
Merge remote-tracking branch 'origin/main' into troubleshooter
albireox Nov 6, 2024
a21ffa7
Very basic but complete implementation of the troubleshooter
albireox Nov 6, 2024
b2d6921
Some tweaks to OverwatcherTask logging
albireox Nov 6, 2024
5c59c11
Fix bug in twilight flats recipe for extra flats
albireox Nov 7, 2024
ba546d4
Do not require overwatcher to be enabled for calibrations if dome doe…
albireox Nov 7, 2024
5f2ccc5
Merge branch 'main' into troubleshooter
albireox Nov 7, 2024
9efda29
The troubleshooter blocks the observing loop while troubleshooting
albireox Nov 7, 2024
c32a66e
Disable overwatcher and run cleanup in safety after closing dome
albireox Nov 7, 2024
11631ee
Add option to disable overwatcher on shutdown
albireox Nov 7, 2024
9697ab9
Run a cleanup when disabling the observe loop with immediate=True
albireox Nov 7, 2024
815ec4c
Add placeholder for emitting a critical error in the troubleshooter
albireox Nov 7, 2024
163ec02
Add MJD to the notification record in the DB
albireox Nov 7, 2024
6635a41
Merge branch 'main' into troubleshooter
albireox Nov 7, 2024
e839246
Merge branch 'main' into troubleshooter
albireox Nov 7, 2024
351525f
Add max_start_time to quick_cals and bias_sequence calibrations
albireox Nov 7, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions src/gort/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,46 @@ class ErrorCode(Enum):
CALIBRATION_ERROR = 900
UNKNOWN_ERROR = 9999

def is_telescope_error(self) -> bool:
    """Returns True if the error is related to the telescope.

    Telescope errors are those whose code value lies in the half-open
    range [100, 200).

    """

    return 100 <= self.value < 200

def is_ag_error(self) -> bool:
    """Returns True if the error is related to the autoguider.

    Autoguider errors are those whose code value lies in the half-open
    range [200, 300).

    """

    return 200 <= self.value < 300

def is_spectrograph_error(self) -> bool:
    """Returns True if the error is related to the spectrograph.

    Spectrograph errors are those whose code value lies in the half-open
    range [300, 400).

    """

    return 300 <= self.value < 400

def is_nps_error(self) -> bool:
    """Returns True if the error is related to the NPS.

    NPS errors are those whose code value lies in the half-open
    range [400, 500).

    """

    return 400 <= self.value < 500

def is_enclosure_error(self) -> bool:
    """Returns True if the error is related to the enclosure.

    Enclosure errors are those whose code value lies in the half-open
    range [500, 600).

    """

    return 500 <= self.value < 600

def is_guiding_error(self) -> bool:
    """Returns True if the error is related to the guider.

    Guiding errors are those whose code value lies in the half-open
    range [600, 700).

    """

    return 600 <= self.value < 700

def is_scheduler_error(self) -> bool:
    """Returns True if the error is related to the scheduler.

    Scheduler errors are those whose code value lies in the half-open
    range [700, 800).

    """

    return 700 <= self.value < 800

def is_observer_error(self) -> bool:
    """Returns True if the error is related to the observer.

    Observer errors are those whose code value lies in the half-open
    range [800, 900).

    """

    return 800 <= self.value < 900


class GuiderStatus(Flag):
"""Maskbits with the guider status."""
Expand Down
4 changes: 2 additions & 2 deletions src/gort/etc/calibrations.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
- name: quick_cals
recipe: quick_cals
min_start_time: 1800
max_start_time: null
max_start_time: 3600
time_mode: secs_after_sunset
after: null
required: true
Expand All @@ -25,7 +25,7 @@
- name: bias_sequence
recipe: bias_sequence
min_start_time: null
max_start_time: null
max_start_time: 7200
time_mode: null
after: quick_cals
required: true
Expand Down
12 changes: 12 additions & 0 deletions src/gort/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,18 @@ class OverwatcherError(GortError):
pass


class TroubleshooterCriticalError(OverwatcherError):
    """Critical troubleshooter failure that will shut down the system."""


class TroubleshooterTimeoutError(OverwatcherError):
    """Raised when the troubleshooter times out while running a recipe."""


class RemoteCommandError(GortError):
"""An error in a remote command to an actor."""

Expand Down
4 changes: 2 additions & 2 deletions src/gort/overwatcher/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,11 @@ class OverwatcherBaseTask:
keep_alive: ClassVar[bool] = True
restart_on_error: ClassVar[bool] = True

def __init__(self):
def __init__(self, log: LogNamespace | None = None):
self._task_runner: asyncio.Task | None = None
self._heartbeat_task: asyncio.Task | None = None

self._log: Mock | LogNamespace = Mock()
self._log: Mock | LogNamespace = log or Mock()

async def run(self):
"""Runs the task."""
Expand Down
3 changes: 2 additions & 1 deletion src/gort/overwatcher/helpers/notifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

import httpx

from sdsstools import Configuration
from sdsstools import Configuration, get_sjd
from sdsstools.utils import GatheringTaskGroup

from gort.core import LogNamespace
Expand Down Expand Up @@ -189,6 +189,7 @@ async def notify(
[
{
"date": datetime.datetime.now(tz=datetime.UTC),
"mjd": get_sjd("LCO"),
"level": level,
"message": message,
"payload": json.dumps(payload),
Expand Down
60 changes: 19 additions & 41 deletions src/gort/overwatcher/observer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,12 @@

from astropy.time import Time

from gort.enums import ErrorCode
from gort.exceptions import GortError
from gort.exceptions import GortError, TroubleshooterTimeoutError
from gort.exposure import Exposure
from gort.overwatcher import OverwatcherModule
from gort.overwatcher.core import OverwatcherModuleTask
from gort.tile import Tile
from gort.tools import cancel_task, run_in_executor, set_tile_status
from gort.tools import cancel_task, run_in_executor


if TYPE_CHECKING:
Expand Down Expand Up @@ -174,6 +173,10 @@ async def stop_observing(
)
self.observe_loop = await cancel_task(self.observe_loop)

# The guiders may have been left running or the spectrograph may still
# be exposing. Clean up to avoid issues.
await self.gort.cleanup(readout=False)

else:
await self.overwatcher.notify(
f"Stopping observations after this tile. Reason: {reason}"
Expand All @@ -197,6 +200,9 @@ async def observe_loop_task(self):
exp: Exposure | list[Exposure] | bool = False

try:
# Wait in case the troubleshooter is doing something.
await self.overwatcher.troubleshooter.wait_until_ready(300)

# We want to avoid re-acquiring the tile between dithers. We call
# the scheduler here and control the dither position loop ourselves.
tile: Tile = await run_in_executor(Tile.from_scheduler)
Expand All @@ -217,6 +223,8 @@ async def observe_loop_task(self):
await self.gort.guiders.focus()

for dpos in tile.dither_positions:
await self.overwatcher.troubleshooter.wait_until_ready(300)

# The exposure will complete in 900 seconds + acquisition + readout
self.next_exposure_completes = time() + 90 + 900 + 60

Expand All @@ -242,46 +250,16 @@ async def observe_loop_task(self):
except asyncio.CancelledError:
break

except Exception as err:
# TODO: this should be moved to the troubleshooting module, but
# for now handling it here.

if isinstance(err, GortError):
# If the acquisition failed, disable the tile and try again.
if err.error_code == ErrorCode.ACQUISITION_FAILED:
tile_id: int | None = err.payload.get("tile_id", None)
if tile_id is None:
await notify(
'Cannot disable tile without a "tile_id. '
"Continuing observations without disabling tile.",
level="error",
)
else:
await set_tile_status(tile_id, enabled=False)
await notify(
f"tile_id={tile_id} has been disabled. "
"Continuing observations.",
level="warning",
)

# If the scheduler cannot find a tile, wait a minute and try again.
elif err.error_code == ErrorCode.SCHEDULER_CANNOT_FIND_TILE:
await notify(
"The scheduler was not able to find a valid tile to "
"observe. Waiting 60 seconds before trying again.",
level="warning",
)
await asyncio.sleep(60)
continue

# No specific troubleshooting available. Report the error,
# do a cleanup and try again.
except TroubleshooterTimeoutError:
await notify(
f"An error occurred during the observation: {err} "
"Running the cleanup recipe.",
level="error",
"The troubleshooter timed out after 300 seconds. "
"Cancelling observations.",
level="critical",
)
await self.gort.cleanup(readout=False)
break

except Exception as err:
await self.overwatcher.troubleshooter.handle(err)

finally:
if self.is_cancelling:
Expand Down
25 changes: 20 additions & 5 deletions src/gort/overwatcher/overwatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from gort.overwatcher.helpers import DomeHelper
from gort.overwatcher.helpers.notifier import NotifierMixIn
from gort.overwatcher.helpers.tasks import DailyTasks
from gort.overwatcher.troubleshooter.troubleshooter import Troubleshooter


@dataclasses.dataclass
Expand All @@ -49,7 +50,11 @@ def __init__(self, overwatcher: Overwatcher):
super().__init__()

self.overwatcher = overwatcher
self.log = self.overwatcher.log

# A bit confusing, but _log is used internally, mainly for
# OverwatcherBaseTask.run(), and log is for external use.
self._log = self.overwatcher.log
self.log = self._log


class OverwatcherMainTask(OverwatcherTask):
Expand Down Expand Up @@ -228,7 +233,7 @@ async def handle_reenable(self):
if self._pending_close_dome:
return

self.log.info("Undoing the cancellation of the observing loop.")
self._log.info("Undoing the cancellation of the observing loop.")
observer._cancelling = False
self.overwatcher.gort.observer.cancelling = False

Expand Down Expand Up @@ -289,7 +294,7 @@ def __init__(
self.state.dry_run = dry_run

self.dome = DomeHelper(self)

self.troubleshooter = Troubleshooter(self)
self.tasks: list[OverwatcherTask] = [
OverwatcherMainTask(self),
OverwatcherPingTask(self),
Expand Down Expand Up @@ -339,18 +344,25 @@ async def shutdown(
reason: str = "undefined",
retry: bool = True,
park: bool = True,
disable_overwatcher: bool = False,
):
"""Shuts down the observatory."""

# Check if the dome is already closed, then do nothing.
if await self.dome.is_closing():
dome_closed = await self.dome.is_closing()
enabled = self.state.enabled
observing = self.observer.is_observing

if dome_closed and not enabled and not observing:
return

if not reason.endswith("."):
reason += "."

await self.notify(f"Triggering shutdown. Reason: {reason}", level="warning")

if disable_overwatcher:
await self.notify("The Overwatcher will be disabled.", level="warning")

if not self.state.dry_run:
stop = asyncio.create_task(self.observer.stop_observing(immediate=True))
shutdown = asyncio.create_task(self.dome.shutdown(retry=retry, park=park))
Expand All @@ -367,6 +379,9 @@ async def shutdown(
error=err,
)

if disable_overwatcher:
self.state.enabled = False

async def cancel(self):
"""Cancels the overwatcher tasks."""

Expand Down
8 changes: 8 additions & 0 deletions src/gort/overwatcher/safety.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,14 @@ async def task(self):
)
await self.module.close_dome()

# Now run a shutdown. This should not try to close the dome
# since that's already done, but it will stop the observe loop,
# clean-up, etc.
await self.overwatcher.shutdown(
reason="safety alerts detected",
disable_overwatcher=True,
)

elif self.failed:
# We have failed closing the dome as a last resort. We have issued
# a critical alert. We don't try closing the dome again.
Expand Down
12 changes: 12 additions & 0 deletions src/gort/overwatcher/troubleshooter/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# @Author: José Sánchez-Gallego (gallegoj@uw.edu)
# @Date: 2024-11-05
# @Filename: __init__.py
# @License: BSD 3-clause (http://www.opensource.org/licenses/BSD-3-Clause)

from __future__ import annotations

from .recipes import TroubleshooterRecipe
from .troubleshooter import Troubleshooter
Loading
Loading