-
-
Notifications
You must be signed in to change notification settings - Fork 718
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix regression where unknown tasks were allowed to be stolen #5392
Merged
fjetter
merged 2 commits into
dask:main
from
fjetter:delete_tests_steal_unknown_functions
Oct 13, 2021
Merged
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -399,10 +399,15 @@ class WorkerState: | |
.. attribute:: processing: {TaskState: cost} | ||
|
||
A dictionary of tasks that have been submitted to this worker. | ||
Each task state is asssociated with the expected cost in seconds | ||
Each task state is associated with the expected cost in seconds | ||
of running that task, summing both the task's expected computation | ||
time and the expected communication time of its result. | ||
|
||
If a task is already executing on the worker and the excecution time is | ||
twice the learned average TaskGroup duration, this will be set to twice | ||
the current executing time. If the task is unknown, the default task | ||
duration is used instead of the TaskGroup average. | ||
|
||
Multiple tasks may be submitted to a worker in advance and the worker | ||
will run them eventually, depending on its execution resources | ||
(but see :doc:`work-stealing`). | ||
|
@@ -900,6 +905,18 @@ def name(self) -> str: | |
def all_durations(self) -> "defaultdict[str, float]": | ||
return self._all_durations | ||
|
||
@ccall | ||
@exceptval(check=False) | ||
def add_duration(self, action: str, start: double, stop: double): | ||
duration = stop - start | ||
self._all_durations[action] += duration | ||
if action == "compute": | ||
old = self._duration_average | ||
if old < 0: | ||
self._duration_average = duration | ||
else: | ||
self._duration_average = 0.5 * duration + 0.5 * old | ||
|
||
@property | ||
def duration_average(self) -> double: | ||
return self._duration_average | ||
|
@@ -1062,6 +1079,18 @@ def nbytes_total(self): | |
def duration(self) -> double: | ||
return self._duration | ||
|
||
@ccall | ||
@exceptval(check=False) | ||
def add_duration(self, action: str, start: double, stop: double): | ||
duration = stop - start | ||
self._all_durations[action] += duration | ||
if action == "compute": | ||
if self._stop < stop: | ||
self._stop = stop | ||
self._start = self._start or start | ||
self._duration += duration | ||
self._prefix.add_duration(action, start, stop) | ||
|
||
@property | ||
def types(self) -> set: | ||
return self._types | ||
|
@@ -2568,6 +2597,8 @@ def set_duration_estimate(self, ts: TaskState, ws: WorkerState) -> double: | |
If a task takes longer than twice the current average duration we | ||
estimate the task duration to be 2x current-runtime, otherwise we set it | ||
to be the average duration. | ||
|
||
See also ``_remove_from_processing`` | ||
""" | ||
exec_time: double = ws._executing.get(ts, 0) | ||
duration: double = self.get_task_duration(ts) | ||
|
@@ -2577,7 +2608,11 @@ def set_duration_estimate(self, ts: TaskState, ws: WorkerState) -> double: | |
else: | ||
comm: double = self.get_comm_cost(ts, ws) | ||
total_duration = duration + comm | ||
old = ws._processing.get(ts, 0) | ||
ws._processing[ts] = total_duration | ||
self._total_occupancy += total_duration - old | ||
ws._occupancy += total_duration - old | ||
|
||
return total_duration | ||
|
||
def transition_waiting_processing(self, key): | ||
|
@@ -2602,10 +2637,8 @@ def transition_waiting_processing(self, key): | |
return recommendations, client_msgs, worker_msgs | ||
worker = ws._address | ||
|
||
duration_estimate = self.set_duration_estimate(ts, ws) | ||
self.set_duration_estimate(ts, ws) | ||
ts._processing_on = ws | ||
ws._occupancy += duration_estimate | ||
self._total_occupancy += duration_estimate | ||
ts.state = "processing" | ||
self.consume_resources(ts, ws) | ||
self.check_idle_saturated(ws) | ||
|
@@ -2684,7 +2717,6 @@ def transition_processing_memory( | |
worker_msgs: dict = {} | ||
try: | ||
ts: TaskState = self._tasks[key] | ||
tg: TaskGroup = ts._group | ||
|
||
assert worker | ||
assert isinstance(worker, str) | ||
|
@@ -2719,57 +2751,26 @@ def transition_processing_memory( | |
} | ||
] | ||
|
||
has_compute_startstop: bool = False | ||
compute_start: double | ||
compute_stop: double | ||
############################# | ||
# Update Timing Information # | ||
############################# | ||
if startstops: | ||
startstop: dict | ||
for startstop in startstops: | ||
stop = startstop["stop"] | ||
start = startstop["start"] | ||
action = startstop["action"] | ||
if not has_compute_startstop and action == "compute": | ||
compute_start = start | ||
compute_stop = stop | ||
has_compute_startstop = True | ||
|
||
# record timings of all actions -- a cheaper way of | ||
# getting timing info compared with get_task_stream() | ||
ts._prefix._all_durations[action] += stop - start | ||
tg._all_durations[action] += stop - start | ||
ts._group.add_duration( | ||
stop=startstop["stop"], | ||
start=startstop["start"], | ||
action=startstop["action"], | ||
) | ||
|
||
############################# | ||
# Update Timing Information # | ||
############################# | ||
if has_compute_startstop and ws._processing.get(ts, True): | ||
# Update average task duration for worker | ||
old_duration: double = ts._prefix._duration_average | ||
new_duration: double = compute_stop - compute_start | ||
avg_duration: double | ||
if old_duration < 0: | ||
avg_duration = new_duration | ||
else: | ||
avg_duration = 0.5 * old_duration + 0.5 * new_duration | ||
|
||
ts._prefix._duration_average = avg_duration | ||
tg._duration += new_duration | ||
tg._start = tg._start or compute_start | ||
if tg._stop < compute_stop: | ||
tg._stop = compute_stop | ||
|
||
s: set = self._unknown_durations.pop(ts._prefix._name, None) | ||
tts: TaskState | ||
if s: | ||
for tts in s: | ||
if tts._processing_on is not None: | ||
wws = tts._processing_on | ||
comm: double = self.get_comm_cost(tts, wws) | ||
old: double = wws._processing[tts] | ||
new: double = avg_duration + comm | ||
diff: double = new - old | ||
wws._processing[tts] = new | ||
wws._occupancy += diff | ||
self._total_occupancy += diff | ||
s: set = self._unknown_durations.pop(ts._prefix._name, set()) | ||
tts: TaskState | ||
for tts in s: | ||
if tts._processing_on: | ||
self.set_duration_estimate(tts, tts._processing_on) | ||
steal = self.extensions.get("stealing") | ||
if steal: | ||
steal.put_key_in_stealable(tts) | ||
|
||
############################ | ||
# Update State Information # | ||
|
@@ -3314,10 +3315,14 @@ def get_comm_cost(self, ts: TaskState, ws: WorkerState) -> double: | |
return nbytes / self._bandwidth | ||
|
||
@ccall | ||
def get_task_duration(self, ts: TaskState, default: double = -1) -> double: | ||
""" | ||
Get the estimated computation cost of the given task | ||
(not including any communication cost). | ||
def get_task_duration(self, ts: TaskState) -> double: | ||
"""Get the estimated computation cost of the given task (not including | ||
any communication cost). | ||
|
||
If no data has been observed, value of | ||
`distributed.scheduler.default-task-durations` are used. If none is set | ||
for this task, `distributed.scheduler.unknown-task-duration` is used | ||
instead. | ||
""" | ||
duration: double = ts._prefix._duration_average | ||
if duration >= 0: | ||
|
@@ -3327,7 +3332,7 @@ def get_task_duration(self, ts: TaskState, default: double = -1) -> double: | |
if s is None: | ||
self._unknown_durations[ts._prefix._name] = s = set() | ||
s.add(ts) | ||
return default if default >= 0 else self.UNKNOWN_TASK_DURATION | ||
return self.UNKNOWN_TASK_DURATION | ||
|
||
@ccall | ||
@exceptval(check=False) | ||
|
@@ -7586,7 +7591,6 @@ def reevaluate_occupancy(self, worker_index: Py_ssize_t = 0): | |
try: | ||
if self.status == Status.closed: | ||
return | ||
|
||
last = time() | ||
next_time = timedelta(seconds=0.1) | ||
|
||
|
@@ -7713,6 +7717,8 @@ def _remove_from_processing( | |
) -> str: # -> str | None | ||
""" | ||
Remove *ts* from the set of processing tasks. | ||
|
||
See also ``Scheduler.set_duration_estimate`` | ||
""" | ||
ws: WorkerState = ts._processing_on | ||
ts._processing_on = None # type: ignore | ||
|
@@ -7870,6 +7876,7 @@ def _task_to_msg(state: SchedulerState, ts: TaskState, duration: double = -1) -> | |
ws: WorkerState | ||
dts: TaskState | ||
|
||
# FIXME: The duration attribute is not used on worker. We could safe ourselves the time to compute and submit this | ||
if duration < 0: | ||
duration = state.get_task_duration(ts) | ||
|
||
|
@@ -7942,27 +7949,19 @@ def _task_to_client_msgs(state: SchedulerState, ts: TaskState) -> dict: | |
@exceptval(check=False) | ||
def _reevaluate_occupancy_worker(state: SchedulerState, ws: WorkerState): | ||
"""See reevaluate_occupancy""" | ||
old: double = ws._occupancy | ||
new: double = 0 | ||
diff: double | ||
ts: TaskState | ||
est: double | ||
old = ws._occupancy | ||
for ts in ws._processing: | ||
est = state.set_duration_estimate(ts, ws) | ||
new += est | ||
state.set_duration_estimate(ts, ws) | ||
|
||
ws._occupancy = new | ||
diff = new - old | ||
state._total_occupancy += diff | ||
state.check_idle_saturated(ws) | ||
|
||
# significant increase in duration | ||
if new > old * 1.3: | ||
steal = state._extensions.get("stealing") | ||
if steal is not None: | ||
for ts in ws._processing: | ||
steal.remove_key_from_stealable(ts) | ||
steal.put_key_in_stealable(ts) | ||
steal = state.extensions.get("stealing") | ||
if not steal: | ||
return | ||
if ws._occupancy > old * 1.3 or old > ws._occupancy * 1.3: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There are also cases where the occupancy actually significantly reduced but we might still want to reapply stealing. Until #5379 is fixed, this might increase the likelihood of that deadlock but we should allow deviations in both directions |
||
for ts in ws._processing: | ||
steal.remove_key_from_stealable(ts) | ||
steal.put_key_in_stealable(ts) | ||
|
||
|
||
@cfunc | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This regression was partially introduced since the occupancy update here would not put a previously unknown task into the stealing whitelist. I didn't wan tto add on top of this function and decided to go for a refactoring instead