Skip to content

Commit

Permalink
do cross-check with condor_q and condor_history (#395)
Browse files Browse the repository at this point in the history
* do cross-check with condor_q and condor_history

* <bot> update requirements-docs.txt

* <bot> update requirements-tests.txt

* <bot> update requirements.txt

* actually get history

* save timestamp

* make sure tests won't fail on the edge of day/hour boundaries by mocking the current time

* catch both old and new submit dir formats

* separate out condor and stderr reset reasons

* try harder to set the site and failure reason

* don't overwrite reason

* adjust check time to account for initial JEL loading

* add bus error and connection timeout as transient errors

---------

Co-authored-by: github-actions <github-actions@github.com>
  • Loading branch information
dsschult and github-actions authored Oct 19, 2024
1 parent 4a81549 commit 9ff909f
Show file tree
Hide file tree
Showing 6 changed files with 372 additions and 91 deletions.
16 changes: 14 additions & 2 deletions iceprod/server/grid.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,31 +283,35 @@ async def task_idle(self, task: GridTask):
if e.response.status_code != 404:
raise

async def task_processing(self, task: GridTask):
async def task_processing(self, task: GridTask, site: str | None = None):
"""
Tell IceProd API a task is now processing (put in the "processing" status).
Args:
task: IceProd task info
site: computing site the task is running at
"""
if not task.task_id or not task.instance_id:
raise RuntimeError("Either task_id or instance_id is empty")

args = {
'instance_id': task.instance_id,
}
if site:
args['site'] = site
try:
await self.rest_client.request('POST', f'/tasks/{task.task_id}/task_actions/processing', args)
except requests.exceptions.HTTPError as e:
if e.response.status_code != 404:
raise

async def task_reset(self, task: GridTask, reason: str | None = None):
async def task_reset(self, task: GridTask, reason: str | None = None, stats: dict | None = None):
"""
Tell IceProd API a task should be reset back to the "waiting" status.
Args:
task: IceProd task info
stats: task resource statistics
reason: A reason for failure
"""
if not task.task_id or not task.instance_id:
Expand All @@ -316,6 +320,10 @@ async def task_reset(self, task: GridTask, reason: str | None = None):
args = {
'instance_id': task.instance_id,
}
if stats:
args['resources'] = stats.get('resources', {})
if site := stats.get('site'):
args['site'] = site
if reason:
args['reason'] = reason
try:
Expand Down Expand Up @@ -354,6 +362,8 @@ async def task_failure(self, task: GridTask, reason: str | None = None, stats: d
args['reason'] = reason
if stats:
args['resources'] = stats.get('resources', {})
if site := stats.get('site'):
args['site'] = site
try:
await self.rest_client.request('POST', f'/tasks/{task.task_id}/task_actions/failed', args)
except requests.exceptions.HTTPError as e:
Expand Down Expand Up @@ -391,6 +401,8 @@ async def task_success(self, task: GridTask, stats: dict | None = None, stdout:
}
if stats:
resources = stats.get('resources', {})
if site := stats.get('site'):
args['site'] = site
if 'time' in resources:
args['time_used'] = resources['time']*3600.

Expand Down
Loading

0 comments on commit 9ff909f

Please sign in to comment.