Skip to content

Commit

Permalink
Make oom_kill logic less aggressive (AcademySoftwareFoundation#1388)
Browse files Browse the repository at this point in the history
The OOM-kill logic was impacting more jobs than necessary when
trying to protect a host from reaching an OOM state. This change
filters the list of jobs so that only jobs using more memory
than they had initially reserved are targeted.
  • Loading branch information
DiegoTavares authored Jun 20, 2024
1 parent 2752ad5 commit b3e433b
Show file tree
Hide file tree
Showing 6 changed files with 11 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,8 @@ public boolean increaseReservedMemory(ProcInterface p, long value) {
"host.pk_host = ? " +
"AND " +
"proc.int_mem_reserved != 0 " +
"AND " +
"proc.int_virt_used >= proc.int_mem_pre_reserved " +
"ORDER BY " +
"proc.int_virt_used / proc.int_mem_pre_reserved DESC " +
") AS t1 LIMIT 1";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -649,7 +649,7 @@ private VirtualProc killWorstMemoryOffender(final DispatchHost host) {
return proc;
}
catch (EmptyResultDataAccessException e) {
logger.error(host.name + " is under OOM and no proc is running on it.");
logger.error(host.name + " is under OOM and no proc is memory overboard.");
return null;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ void setHostStatistics(HostInterface host,
void unbookProc(ProcInterface proc);

/**
* Returns the proc that is most delinquent on memory allocation
* For a given host, returns the proc that is using the most memory above what it had initially reserved
* @param h
* @return
*/
Expand Down
2 changes: 1 addition & 1 deletion cuebot/src/main/resources/opencue.properties
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ dispatcher.booking_queue.max_pool_size=6
dispatcher.booking_queue.queue_capacity=1000

# Fraction of used memory (1.0 = 100%) above which a host is considered at risk of triggering the oom-killer
dispatcher.oom_max_safe_used_memory_threshold=0.95
dispatcher.oom_max_safe_used_memory_threshold=0.98

# How much can a frame exceed its reserved memory.
# - 0.5 means 50% above reserve
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -596,7 +596,9 @@ public void testFindReservedMemoryOffender() {

// Increase the memory usage as frames are added
procDao.updateProcMemoryUsage(frame,
1000*i, 1000*i, 1000*i, 1000*i, 0, 0, children);
1000*i, 1000*i,
Dispatcher.MEM_RESERVED_DEFAULT*i, Dispatcher.MEM_RESERVED_DEFAULT*i,
0, 0, children);
i++;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -548,6 +548,7 @@ public void testMemoryAggressionMemoryWarning() {
.setLayerId(proc1.getLayerId())
.setFrameId(proc1.getFrameId())
.setResourceId(proc1.getProcId())
.setVsize(CueUtil.GB2)
.setRss(CueUtil.GB2)
.setMaxRss(CueUtil.GB2)
.build();
Expand All @@ -558,6 +559,7 @@ public void testMemoryAggressionMemoryWarning() {
.setLayerId(proc2.getLayerId())
.setFrameId(proc2.getFrameId())
.setResourceId(proc2.getProcId())
.setVsize(CueUtil.GB4)
.setRss(CueUtil.GB4)
.setMaxRss(CueUtil.GB4)
.build();
Expand All @@ -569,6 +571,7 @@ public void testMemoryAggressionMemoryWarning() {
.setLayerId(proc3.getLayerId())
.setFrameId(proc3.getFrameId())
.setResourceId(proc3.getProcId())
.setVsize(memoryUsedProc3)
.setRss(memoryUsedProc3)
.setMaxRss(memoryUsedProc3)
.build();
Expand Down

0 comments on commit b3e433b

Please sign in to comment.