diff --git a/VERSION.in b/VERSION.in index c43e1055f..f3040840f 100644 --- a/VERSION.in +++ b/VERSION.in @@ -1 +1 @@ -0.12 +0.13 diff --git a/cuebot/src/main/java/com/imageworks/spcue/DispatchFrame.java b/cuebot/src/main/java/com/imageworks/spcue/DispatchFrame.java index db946b90c..781401165 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/DispatchFrame.java +++ b/cuebot/src/main/java/com/imageworks/spcue/DispatchFrame.java @@ -43,7 +43,9 @@ public class DispatchFrame extends FrameEntity implements FrameInterface { public int maxCores; public boolean threadable; public long minMemory; - public long minGpu; + public int minGpus; + public int maxGpus; + public long minGpuMemory; public String services; } diff --git a/cuebot/src/main/java/com/imageworks/spcue/DispatchHost.java b/cuebot/src/main/java/com/imageworks/spcue/DispatchHost.java index e1b3cc8f2..495d0a9b1 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/DispatchHost.java +++ b/cuebot/src/main/java/com/imageworks/spcue/DispatchHost.java @@ -35,13 +35,16 @@ public class DispatchHost extends Entity public int cores; public int idleCores; + public int gpus; + public int idleGpus; + // Basically an 0 = auto, 1 = all. public int threadMode; public long memory; public long idleMemory; - public long gpu; - public long idleGpu; + public long gpuMemory; + public long idleGpuMemory; public String tags; public String os; @@ -53,11 +56,13 @@ public class DispatchHost extends Entity * booked to this host. 
*/ public int strandedCores = 0; + public int strandedGpus = 0; // To reserve resources for future gpu job long idleMemoryOrig = 0; int idleCoresOrig = 0; - long idleGpuOrig = 0; + long idleGpuMemoryOrig = 0; + int idleGpusOrig = 0; public String getHostId() { return id; @@ -72,7 +77,7 @@ public String getFacilityId() { } @Override - public boolean hasAdditionalResources(int minCores, long minMemory, long minGpu) { + public boolean hasAdditionalResources(int minCores, long minMemory, int minGpus, long minGpuMemory) { if (idleCores < minCores) { return false; @@ -80,7 +85,10 @@ public boolean hasAdditionalResources(int minCores, long minMemory, long minGpu) else if (idleMemory < minMemory) { return false; } - else if (idleGpu < minGpu) { + else if (idleGpus < minGpus) { + return false; + } + else if (idleGpuMemory < minGpuMemory) { return false; } @@ -88,10 +96,11 @@ else if (idleGpu < minGpu) { } @Override - public void useResources(int coreUnits, long memory, long gpu) { + public void useResources(int coreUnits, long memory, int gpuUnits, long gpuMemory) { idleCores = idleCores - coreUnits; idleMemory = idleMemory - memory; - idleGpu = idleGpu - gpu; + idleGpus = idleGpus - gpuUnits; + idleGpuMemory = idleGpuMemory - gpuMemory; } /** @@ -99,14 +108,16 @@ public void useResources(int coreUnits, long memory, long gpu) { * */ public void removeGpu() { - if (idleGpu > 0 && idleGpuOrig == 0) { + if (idleGpuMemory > 0 && idleGpuMemoryOrig == 0) { idleMemoryOrig = idleMemory; idleCoresOrig = idleCores; - idleGpuOrig = idleGpu; + idleGpuMemoryOrig = idleGpuMemory; + idleGpusOrig = idleGpus; idleMemory = idleMemory - Math.min(CueUtil.GB4, idleMemory); idleCores = idleCores - Math.min(100, idleCores); - idleGpu = 0; + idleGpuMemory = idleGpuMemory - Math.min(CueUtil.GB4, idleGpuMemory); + idleGpus = idleGpus - Math.min(1, idleGpus); } } @@ -115,14 +126,16 @@ public void removeGpu() { * */ public void restoreGpu() { - if (idleGpuOrig > 0) { + if (idleGpuMemoryOrig > 0) { 
idleMemory = idleMemoryOrig; idleCores = idleCoresOrig; - idleGpu = idleGpuOrig; + idleGpuMemory = idleGpuMemoryOrig; + idleGpus = idleGpusOrig; idleMemoryOrig = 0; idleCoresOrig = 0; - idleGpuOrig = 0; + idleGpuMemoryOrig = 0; + idleGpusOrig = 0; } } } diff --git a/cuebot/src/main/java/com/imageworks/spcue/ExecutionSummary.java b/cuebot/src/main/java/com/imageworks/spcue/ExecutionSummary.java index a13529ad8..afe85121a 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/ExecutionSummary.java +++ b/cuebot/src/main/java/com/imageworks/spcue/ExecutionSummary.java @@ -28,6 +28,9 @@ public class ExecutionSummary { public long coreTime; public long coreTimeSuccess; public long coreTimeFail; + public long gpuTime; + public long gpuTimeSuccess; + public long gpuTimeFail; public long highMemoryKb; public long getHighMemoryKb() { @@ -69,5 +72,29 @@ public long getCoreTimeFail() { public void setCoreTimeFail(long coreTimeFail) { this.coreTimeFail = coreTimeFail; } + + public long getGpuTime() { + return gpuTime; + } + + public void setGpuTime(long gpuTime) { + this.gpuTime = gpuTime; + } + + public long getGpuTimeSuccess() { + return gpuTimeSuccess; + } + + public void setGpuTimeSuccess(long gpuTimeSuccess) { + this.gpuTimeSuccess = gpuTimeSuccess; + } + + public long getGpuTimeFail() { + return gpuTimeFail; + } + + public void setGpuTimeFail(long gpuTimeFail) { + this.gpuTimeFail = gpuTimeFail; + } } diff --git a/cuebot/src/main/java/com/imageworks/spcue/GroupDetail.java b/cuebot/src/main/java/com/imageworks/spcue/GroupDetail.java index b67a53b77..cd9f8a998 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/GroupDetail.java +++ b/cuebot/src/main/java/com/imageworks/spcue/GroupDetail.java @@ -23,11 +23,16 @@ public class GroupDetail extends Entity implements GroupInterface, DepartmentInt public int jobMinCores = -1; public int jobMaxCores = -1; + public int jobMinGpus = -1; + public int jobMaxGpus = -1; public int jobPriority = -1; public int minCores = -1; public int 
maxCores = -1; + public int minGpus = -1; + public int maxGpus = -1; + public String parentId = null; public String showId; public String deptId; diff --git a/cuebot/src/main/java/com/imageworks/spcue/HostEntity.java b/cuebot/src/main/java/com/imageworks/spcue/HostEntity.java index 96defaf61..5a019e8f1 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/HostEntity.java +++ b/cuebot/src/main/java/com/imageworks/spcue/HostEntity.java @@ -36,10 +36,12 @@ public class HostEntity extends Entity implements HostInterface { public int procs; public int cores; public int idleCores; - public int memory; - public int idleMemory; - public int gpu; - public int idleGpu; + public long memory; + public long idleMemory; + public int gpus; + public int idleGpus; + public long gpuMemory; + public long idleGpuMemory; public boolean unlockAtBoot; @@ -57,10 +59,12 @@ public HostEntity(Host grpcHost) { this.nimbyEnabled = grpcHost.getNimbyEnabled(); this.cores = (int) grpcHost.getCores(); this.idleCores = (int) grpcHost.getIdleCores(); - this.memory = (int) grpcHost.getMemory(); - this.idleMemory = (int) grpcHost.getIdleMemory(); - this.gpu = (int) grpcHost.getGpu(); - this.idleGpu = (int) grpcHost.getIdleGpu(); + this.memory = grpcHost.getMemory(); + this.idleMemory = grpcHost.getIdleMemory(); + this.gpus = (int) grpcHost.getGpus(); + this.idleGpus = (int) grpcHost.getIdleGpus(); + this.gpuMemory = grpcHost.getGpuMemory(); + this.idleGpuMemory = grpcHost.getIdleGpuMemory(); } public String getHostId() { diff --git a/cuebot/src/main/java/com/imageworks/spcue/Inherit.java b/cuebot/src/main/java/com/imageworks/spcue/Inherit.java index 73651c33d..1fdb23336 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/Inherit.java +++ b/cuebot/src/main/java/com/imageworks/spcue/Inherit.java @@ -28,6 +28,8 @@ public enum Inherit { Priority, MinCores, MaxCores, + MinGpus, + MaxGpus, All } diff --git a/cuebot/src/main/java/com/imageworks/spcue/JobDetail.java 
b/cuebot/src/main/java/com/imageworks/spcue/JobDetail.java index 29286ffe3..dad6f8a6d 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/JobDetail.java +++ b/cuebot/src/main/java/com/imageworks/spcue/JobDetail.java @@ -46,12 +46,15 @@ public class JobDetail extends JobEntity implements JobInterface, DepartmentInte public int priority = 1; public int minCoreUnits = 100; public int maxCoreUnits = 200000; + public int minGpuUnits = 0; + public int maxGpuUnits = 1000; public boolean isLocal = false; public String localHostName; public int localMaxCores; - public int localMaxMemory; + public long localMaxMemory; public int localThreadNumber; - public int localMaxGpu; + public int localMaxGpus; + public long localMaxGpuMemory; public String getDepartmentId() { return deptId; diff --git a/cuebot/src/main/java/com/imageworks/spcue/LayerDetail.java b/cuebot/src/main/java/com/imageworks/spcue/LayerDetail.java index 3b473f8c1..565995d9d 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/LayerDetail.java +++ b/cuebot/src/main/java/com/imageworks/spcue/LayerDetail.java @@ -32,9 +32,11 @@ public class LayerDetail extends LayerEntity implements LayerInterface { public LayerType type; public int minimumCores; public int maximumCores; + public int minimumGpus; + public int maximumGpus; public boolean isThreadable; public long minimumMemory; - public long minimumGpu; + public long minimumGpuMemory; public int chunkSize; public int timeout; public int timeout_llu; @@ -116,12 +118,20 @@ public void setMinimumMemory(long minimumMemory) { this.minimumMemory = minimumMemory; } - public long getMinimumGpu() { - return minimumGpu; + public int getMinimumGpus() { + return minimumGpus; } - public void setMinimumGpu(long minimumGpu) { - this.minimumGpu = minimumGpu; + public void setMinimumGpus(int minimumGpus) { + this.minimumGpus = minimumGpus; + } + + public long getMinimumGpuMemory() { + return minimumGpuMemory; + } + + public void setMinimumGpuMemory(long minimumGpuMemory) { + 
this.minimumGpuMemory = minimumGpuMemory; } public int getChunkSize() { diff --git a/cuebot/src/main/java/com/imageworks/spcue/LocalHostAssignment.java b/cuebot/src/main/java/com/imageworks/spcue/LocalHostAssignment.java index cc6287253..3e073fa73 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/LocalHostAssignment.java +++ b/cuebot/src/main/java/com/imageworks/spcue/LocalHostAssignment.java @@ -35,11 +35,13 @@ public class LocalHostAssignment extends Entity private int idleCoreUnits; private long idleMemory; - private long idleGpu; + private int idleGpuUnits; + private long idleGpuMemory; private long maxMemory; - private long maxGpu; + private long maxGpuMemory; private int maxCoreUnits; + private int maxGpuUnits; private int threads; @@ -52,15 +54,16 @@ public class LocalHostAssignment extends Entity public LocalHostAssignment() { } - public LocalHostAssignment(int maxCores, int threads, long maxMemory, long maxGpu) { + public LocalHostAssignment(int maxCores, int threads, long maxMemory, int maxGpus, long maxGpuMemory) { this.maxCoreUnits = maxCores; this.threads = threads; this.maxMemory = maxMemory; - this.maxGpu = maxGpu; + this.maxGpuUnits = maxGpus; + this.maxGpuMemory = maxGpuMemory; } @Override - public boolean hasAdditionalResources(int minCores, long minMemory, long minGpu) { + public boolean hasAdditionalResources(int minCores, long minMemory, int minGpus, long minGpuMemory) { if (idleCoreUnits < minCores) { return false; @@ -68,7 +71,10 @@ public boolean hasAdditionalResources(int minCores, long minMemory, long minGpu) else if (idleMemory < minMemory) { return false; } - else if (idleGpu < minGpu) { + else if (idleGpuUnits < minGpus) { + return false; + } + else if (idleGpuMemory < minGpuMemory) { return false; } @@ -76,10 +82,11 @@ else if (idleGpu < minGpu) { } @Override - public void useResources(int coreUnits, long memory, long gpu) { + public void useResources(int coreUnits, long memory, int gpuUnits, long gpuMemory) { idleCoreUnits = 
idleCoreUnits - coreUnits; idleMemory = idleMemory - memory; - idleGpu = idleGpu - gpu; + idleGpuUnits = idleGpuUnits - gpuUnits; + idleGpuMemory = idleGpuMemory - gpuMemory; } public int getThreads() { @@ -110,16 +117,24 @@ public long getIdleMemory() { return this.idleMemory; } - public long getMaxGpu() { - return maxGpu; + public int getMaxGpuUnits() { + return maxGpuUnits; + } + + public void setMaxGpuUnits(int maxGpuUnits) { + this.maxGpuUnits = maxGpuUnits; + } + + public long getMaxGpuMemory() { + return maxGpuMemory; } - public void setMaxGpu(long maxGpu) { - this.maxGpu = maxGpu; + public void setMaxGpuMemory(long maxGpuMemory) { + this.maxGpuMemory = maxGpuMemory; } - public long getIdleGpu() { - return this.idleGpu; + public long getIdleGpuMemory() { + return this.idleGpuMemory; } public int getIdleCoreUnits() { @@ -134,8 +149,16 @@ public void setIdleMemory(long idleMemory) { this.idleMemory = idleMemory; } - public void setIdleGpu(long idleGpu) { - this.idleGpu = idleGpu; + public int getIdleGpuUnits() { + return this.idleGpuUnits; + } + + public void setIdleGpuUnits(int idleGpuUnits) { + this.idleGpuUnits = idleGpuUnits; + } + + public void setIdleGpuMemory(long idleGpuMemory) { + this.idleGpuMemory = idleGpuMemory; } public String getHostId() { diff --git a/cuebot/src/main/java/com/imageworks/spcue/ResourceUsage.java b/cuebot/src/main/java/com/imageworks/spcue/ResourceUsage.java index aae8921e4..b45af0838 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/ResourceUsage.java +++ b/cuebot/src/main/java/com/imageworks/spcue/ResourceUsage.java @@ -25,9 +25,10 @@ public class ResourceUsage { private final long coreTimeSeconds; + private final long gpuTimeSeconds; private final long clockTimeSeconds; - public ResourceUsage(long clockTime, int corePoints) { + public ResourceUsage(long clockTime, int corePoints, int gpuPoints) { if (clockTime < 1) { clockTime = 1; @@ -38,14 +39,21 @@ public ResourceUsage(long clockTime, int corePoints) { coreTime = 1; } 
+ long gpuTime = clockTime * gpuPoints; + clockTimeSeconds = clockTime; coreTimeSeconds = coreTime; + gpuTimeSeconds = gpuTime; } public long getCoreTimeSeconds() { return coreTimeSeconds; } + public long getGpuTimeSeconds() { + return gpuTimeSeconds; + } + public long getClockTimeSeconds() { return clockTimeSeconds; } diff --git a/cuebot/src/main/java/com/imageworks/spcue/ServiceEntity.java b/cuebot/src/main/java/com/imageworks/spcue/ServiceEntity.java index 373877e69..16d03c5c5 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/ServiceEntity.java +++ b/cuebot/src/main/java/com/imageworks/spcue/ServiceEntity.java @@ -40,6 +40,17 @@ public class ServiceEntity extends Entity { */ public int maxCores = 0; + /** + * Determines the default minimum gpus per frame. + */ + public int minGpus = 0; + + /** + * Determines the default maximum gpus per frame. 0 indicates + * the feature is disabled. + */ + public int maxGpus = 0; + /** * Determines the default minimum memory per frame. */ @@ -48,7 +59,7 @@ public class ServiceEntity extends Entity { /** * Determines the default minimum gpu per frame. */ - public long minGpu = Dispatcher.GPU_RESERVED_DEFAULT; + public long minGpuMemory = Dispatcher.MEM_GPU_RESERVED_DEFAULT; /** * Determines the default tags. 
diff --git a/cuebot/src/main/java/com/imageworks/spcue/ShowEntity.java b/cuebot/src/main/java/com/imageworks/spcue/ShowEntity.java index 8a4d768af..1d2f675e1 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/ShowEntity.java +++ b/cuebot/src/main/java/com/imageworks/spcue/ShowEntity.java @@ -25,6 +25,8 @@ public class ShowEntity extends Entity implements ShowInterface { public boolean paused; public int defaultMinCores; public int defaultMaxCores; + public int defaultMinGpus; + public int defaultMaxGpus; public String[] commentMail; public String getShowId() { diff --git a/cuebot/src/main/java/com/imageworks/spcue/StrandedGpus.java b/cuebot/src/main/java/com/imageworks/spcue/StrandedGpus.java new file mode 100644 index 000000000..91b9ad76a --- /dev/null +++ b/cuebot/src/main/java/com/imageworks/spcue/StrandedGpus.java @@ -0,0 +1,44 @@ + +/* + * Copyright Contributors to the OpenCue Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + + +package com.imageworks.spcue; + +public final class StrandedGpus { + + /** + * The maximum time this object should be valid. 
+ */ + private static final long MAX_AGE_MILLIS = 5000l; + + private final int gpus; + private final long expireTime = System.currentTimeMillis() + MAX_AGE_MILLIS; + + public StrandedGpus(int gpus) { + this.gpus = gpus; + } + + public int getGpus() { + return this.gpus; + } + + public boolean isExpired() { + return System.currentTimeMillis() > expireTime; + } +} + diff --git a/cuebot/src/main/java/com/imageworks/spcue/VirtualProc.java b/cuebot/src/main/java/com/imageworks/spcue/VirtualProc.java index 4316b708d..28b54799d 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/VirtualProc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/VirtualProc.java @@ -36,7 +36,11 @@ public class VirtualProc extends FrameEntity implements ProcInterface { public long memoryMax; public long virtualMemoryUsed; public long virtualMemoryMax; - public long gpuReserved; + + public int gpusReserved; + public long gpuMemoryReserved; + public long gpuMemoryUsed; + public long gpuMemoryMax; public boolean unbooked; public boolean usageRecorded = false; @@ -91,8 +95,8 @@ public static final VirtualProc build(DispatchHost host, DispatchFrame frame) { proc.coresReserved = frame.minCores; proc.memoryReserved = frame.minMemory; - // This reserves all the gpu memory on a host for one frame - proc.gpuReserved = (frame.minGpu > 0) ? 
host.idleGpu : 0; + proc.gpusReserved = frame.minGpus; + proc.gpuMemoryReserved = frame.minGpuMemory; /* * Frames that are announcing cores less than 100 are not multi-threaded @@ -208,7 +212,8 @@ public static final VirtualProc build(DispatchHost host, proc.coresReserved = lja.getThreads() * 100; proc.memoryReserved = frame.minMemory; - proc.gpuReserved = frame.minGpu; + proc.gpusReserved = frame.minGpus; + proc.gpuMemoryReserved = frame.minGpuMemory; int wholeCores = (int) (Math.floor(host.idleCores / 100.0)); if (wholeCores == 0) { diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/BookingDao.java b/cuebot/src/main/java/com/imageworks/spcue/dao/BookingDao.java index 9fe9703cb..f3bb09915 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/BookingDao.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/BookingDao.java @@ -38,6 +38,15 @@ public interface BookingDao { */ boolean updateMaxCores(LocalHostAssignment l, int maxCoreUnits); + /** + * Updates the maximum number of gpus the given local + * host assignment should use. + * + * @param l + * @return + */ + boolean updateMaxGpus(LocalHostAssignment l, int gpus); + /** * Updates the maximum amount of memory a given local host * assignment should use. @@ -54,7 +63,7 @@ public interface BookingDao { * @param l * @return */ - boolean updateMaxGpu(LocalHostAssignment l, long maxGpu); + boolean updateMaxGpuMemory(LocalHostAssignment l, long maxGpuMemory); /** * Create a new LocalHostAssignment attached to the given job. @@ -150,6 +159,16 @@ void insertLocalHostAssignment(HostInterface host, FrameInterface frame, */ int getCoreUsageDifference(LocalHostAssignment l, int coreUnits); + /** + * Return the difference between the number of assigned gpus and + * the given gpuUnits. + * + * @param l + * @param gpuUnits + * @return + */ + int getGpuUsageDifference(LocalHostAssignment l, int gpuUnits); + /** * Allocate additional cores from the given host. 
* @@ -168,6 +187,24 @@ void insertLocalHostAssignment(HostInterface host, FrameInterface frame, */ boolean deallocateCoresFromHost(HostInterface h, int cores); + /** + * Allocate additional gpus from the given host. + * + * @param h + * @param gpus + * @return + */ + boolean allocateGpusFromHost(HostInterface h, int gpus); + + /** + * Deallocate gpus from the given host, returning them to its pool. + * + * @param h + * @param gpus + * @return + */ + boolean deallocateGpusFromHost(HostInterface h, int gpus); + /** * Return true if the Host has a resource deficit. A * deficit can occur if there are more resources in use than the diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/GroupDao.java b/cuebot/src/main/java/com/imageworks/spcue/dao/GroupDao.java index 181062df6..dfb49dd9c 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/GroupDao.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/GroupDao.java @@ -137,6 +137,40 @@ public interface GroupDao { */ public void updateMinCores(GroupInterface group, int value); + + /** + * Sets the maximum number of gpus the group should be running. + * + * @param group + * @param value + */ + void updateDefaultJobMaxGpus(GroupInterface group, int value); + + /** + * Sets the minimum number of gpus the group should be running. 
+ * + * @param group + * @param value + */ + void updateDefaultJobMinGpus(GroupInterface group, int value); + + /** + * Sets the maximum number of gpus for this group + * + * @param group + * @param value + */ + public void updateMaxGpus(GroupInterface group, int value); + + /** + * Set the minimum number of gpus for this group + * + * @param group + * @param value + */ + + public void updateMinGpus(GroupInterface group, int value); + /** * Renames the group * @@ -186,6 +220,14 @@ public interface GroupDao { */ boolean isOverMinCores(JobInterface job); + /** + * Returns true if the group of the specified job is at or over its min gpus + * + * @param job + * @return + */ + boolean isOverMinGpus(JobInterface job); + /** * Returns true if the group is managed. * diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/HostDao.java b/cuebot/src/main/java/com/imageworks/spcue/dao/HostDao.java index 04cd49f46..768bcdbd2 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/HostDao.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/HostDao.java @@ -263,8 +263,8 @@ public interface HostDao { * @param freeSwap long * @param totalMcp long * @param freeMcp long - * @param totalGpu long - * @param freeGpu long + * @param totalGpuMemory long + * @param freeGpuMemory long * @param load int * @param os String */ @@ -272,7 +272,7 @@ void updateHostStats(HostInterface host, long totalMemory, long freeMemory, long totalSwap, long freeSwap, long totalMcp, long freeMcp, - long totalGpu, long freeGpu, + long totalGpuMemory, long freeGpuMemory, int load, Timestamp bootTime, String os); /** @@ -293,6 +293,16 @@ void updateHostStats(HostInterface host, */ int getStrandedCoreUnits(HostInterface h); + /** + * Return the number of whole stranded gpus on this host. The host must have + * less than Dispatcher.MEM_STRANDED_THRESHHOLD for the gpus to be + * considered stranded. 
+ * + * @param h HostInterface + * @return int + */ + int getStrandedGpus(HostInterface h); + /** * Return true if the host is preferring a particular show. * diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/JobDao.java b/cuebot/src/main/java/com/imageworks/spcue/dao/JobDao.java index 4ffaf2f43..3882f95a7 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/JobDao.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/JobDao.java @@ -59,6 +59,24 @@ public interface JobDao { */ public void updateMinCores(GroupInterface g, int cores); + /** + * Updates all jobs in the specified group to the + * max gpu value. + * + * @param g + * @param gpus + */ + public void updateMaxGpus(GroupInterface g, int gpus); + + /** + * Updates all jobs in the specified group to the + * min gpu value. + * + * @param g + * @param gpus + */ + public void updateMinGpus(GroupInterface g, int gpus); + /** * Updates all jobs in the specified group to the * set priority. @@ -255,6 +273,39 @@ public interface JobDao { */ boolean isOverMaxCores(JobInterface job, int coreUnits); + /** + * returns true if job is over its minimum gpus + * + * @param job + * @return boolean + */ + boolean isOverMinGpus(JobInterface job); + + /** + * returns true if job is over max gpus + * + * @param job + * @return + */ + boolean isOverMaxGpus(JobInterface job); + + /** + * returns true if job is at its max gpus + * + * @param job + * @return + */ + boolean isAtMaxGpus(JobInterface job); + + /** + * Return true if adding given gpus to the job + * will set the job over its max gpus value. 
+ * + * @param job + * @param gpus + * @return + */ + boolean isOverMaxGpus(JobInterface job, int gpus); /** * sets the jobs new priority value @@ -280,6 +331,22 @@ public interface JobDao { */ void updateMaxCores(JobInterface j, int v); + /** + * sets the jobs new min gpu value + * + * @param j + * @param v + */ + void updateMinGpus(JobInterface j, int v); + + /** + * sets the jobs new max gpu value + * + * @param j + * @param v + */ + void updateMaxGpus(JobInterface j, int v); + /** * Update a job's paused state * diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/LayerDao.java b/cuebot/src/main/java/com/imageworks/spcue/dao/LayerDao.java index 243cbbce9..ba8295462 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/LayerDao.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/LayerDao.java @@ -131,6 +131,15 @@ public interface LayerDao { */ void updateLayerMinCores(LayerInterface layer, int val); + + /** + * update the number of gpus the layer requires + * + * @param layer + * @param val + */ + void updateLayerMinGpus(LayerInterface layer, int val); + /** * update the amount of memory required by all subsequent * running frames in the specified layer. @@ -147,7 +156,7 @@ public interface LayerDao { * @param layer * @param val */ - void updateLayerMinGpu(LayerInterface layer, long gpu); + void updateLayerMinGpuMemory(LayerInterface layer, long val); /** * Update a layer with new host tags. @@ -207,9 +216,9 @@ public interface LayerDao { * value is larger than the current value * * @param layer - * @param gpu + * @param val */ - void increaseLayerMinGpu(LayerInterface layer, long gpu); + void increaseLayerMinGpuMemory(LayerInterface layer, long val); /** * Tries to find a max RSS value for layer in the specified job. The @@ -256,10 +265,10 @@ public interface LayerDao { * job with the new gpu requirement. 
* * @param job - * @param gpu + * @param mem * @param type */ - void updateMinGpu(JobInterface job, long gpu, LayerType type); + void updateMinGpuMemory(JobInterface job, long mem, LayerType type); /** * Update all layers of the set type in the specified job @@ -271,6 +280,16 @@ public interface LayerDao { */ void updateMinCores(JobInterface job, int cores, LayerType type); + /** + * Update all layers of the set type in the specified job + * with the new min gpus requirement. + * + * @param job + * @param gpus + * @param type + */ + void updateMinGpus(JobInterface job, int gpus, LayerType type); + /** * Update a layer's max cores value, which limits how * much threading can go on. @@ -395,6 +414,16 @@ public interface LayerDao { */ void updateLayerMaxCores(LayerInterface layer, int val); + /** + * Set the layer's max gpus value to the given int. The + * max gpu value will not allow the dispatcher to + * book over the given number of gpus. + * + * @param layer + * @param val + */ + void updateLayerMaxGpus(LayerInterface layer, int val); + /** * Add a limit to the given layer. * diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/ProcDao.java b/cuebot/src/main/java/com/imageworks/spcue/dao/ProcDao.java index 5efdd01d2..31e49a208 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/ProcDao.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/ProcDao.java @@ -54,7 +54,7 @@ public interface ProcDao { * @return */ - long getReservedGpu(ProcInterface proc); + long getReservedGpuMemory(ProcInterface proc); /** * Return the proc that has exceeded its reserved memory by the largest factor. 
diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/ShowDao.java b/cuebot/src/main/java/com/imageworks/spcue/dao/ShowDao.java index 1853662a5..f0cdcbba7 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/ShowDao.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/ShowDao.java @@ -81,6 +81,21 @@ public interface ShowDao { */ void updateShowDefaultMaxCores(ShowInterface s, int val); + /** + * + * @param s + * @param val + */ + void updateShowDefaultMinGpus(ShowInterface s, int val); + + /** + * + * @param s + * @param val + */ + void updateShowDefaultMaxGpus(ShowInterface s, int val); + + /** * Disabling this would stop new proc assignement. The show would get no new * procs, but any procs already assigned to a job would continue to diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/BookingDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/BookingDaoJdbc.java index 550cddc17..08e1634aa 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/BookingDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/BookingDaoJdbc.java @@ -55,12 +55,14 @@ public class BookingDaoJdbc extends "int_mem_idle,"+ "int_cores_max,"+ "int_cores_idle,"+ - "int_gpu_idle,"+ - "int_gpu_max,"+ + "int_gpu_mem_idle,"+ + "int_gpu_mem_max,"+ + "int_gpus_max,"+ + "int_gpus_idle,"+ "int_threads "+ ") " + "VALUES " + - "(?,?,?,?,?,?,?,?,?,?,?,?,?)"; + "(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"; @Override public void insertLocalHostAssignment(HostInterface h, JobInterface job, LocalHostAssignment l) { @@ -71,7 +73,8 @@ public void insertLocalHostAssignment(HostInterface h, JobInterface job, LocalHo l.setType(RenderPartitionType.JOB_PARTITION); l.setIdleCoreUnits(l.getMaxCoreUnits()); l.setIdleMemory(l.getMaxMemory()); - l.setIdleGpu(l.getMaxGpu()); + l.setIdleGpuUnits(l.getMaxGpuUnits()); + l.setIdleGpuMemory(l.getMaxGpuMemory()); getJdbcTemplate().update( INSERT_LOCAL_JOB_ASSIGNMENT, @@ -85,8 +88,10 @@ public void 
insertLocalHostAssignment(HostInterface h, JobInterface job, LocalHo l.getMaxMemory(), l.getMaxCoreUnits(), l.getMaxCoreUnits(), - l.getMaxGpu(), - l.getMaxGpu(), + l.getMaxGpuMemory(), + l.getMaxGpuMemory(), + l.getMaxGpuUnits(), + l.getMaxGpuUnits(), l.getThreads()); } @@ -100,7 +105,8 @@ public void insertLocalHostAssignment(HostInterface h, LayerInterface layer, Loc l.setType(RenderPartitionType.LAYER_PARTITION); l.setIdleCoreUnits(l.getMaxCoreUnits()); l.setIdleMemory(l.getMaxMemory()); - l.setIdleGpu(l.getMaxGpu()); + l.setIdleGpuUnits(l.getMaxGpuUnits()); + l.setIdleGpuMemory(l.getMaxGpuMemory()); getJdbcTemplate().update( INSERT_LOCAL_JOB_ASSIGNMENT, @@ -114,8 +120,10 @@ public void insertLocalHostAssignment(HostInterface h, LayerInterface layer, Loc l.getMaxMemory(), l.getMaxCoreUnits(), l.getMaxCoreUnits(), - l.getMaxGpu(), - l.getMaxGpu(), + l.getMaxGpuMemory(), + l.getMaxGpuMemory(), + l.getMaxGpuUnits(), + l.getMaxGpuUnits(), l.getThreads()); } @@ -130,7 +138,8 @@ public void insertLocalHostAssignment(HostInterface h, FrameInterface frame, Loc l.setType(RenderPartitionType.FRAME_PARTITION); l.setIdleCoreUnits(l.getMaxCoreUnits()); l.setIdleMemory(l.getMaxMemory()); - l.setIdleGpu(l.getMaxGpu()); + l.setIdleGpuUnits(l.getMaxGpuUnits()); + l.setIdleGpuMemory(l.getMaxGpuMemory()); getJdbcTemplate().update( INSERT_LOCAL_JOB_ASSIGNMENT, @@ -144,8 +153,10 @@ public void insertLocalHostAssignment(HostInterface h, FrameInterface frame, Loc l.getMaxMemory(), l.getMaxCoreUnits(), l.getMaxCoreUnits(), - l.getMaxGpu(), - l.getMaxGpu(), + l.getMaxGpuMemory(), + l.getMaxGpuMemory(), + l.getMaxGpuUnits(), + l.getMaxGpuUnits(), l.getThreads()); } public static final RowMapper LJA_MAPPER = @@ -155,11 +166,13 @@ public LocalHostAssignment mapRow(final ResultSet rs, int rowNum) throws SQLExce l.id = rs.getString("pk_host_local"); l.setMaxCoreUnits(rs.getInt("int_cores_max")); l.setMaxMemory(rs.getLong("int_mem_max")); - l.setMaxGpu(rs.getLong("int_gpu_max")); + 
l.setMaxGpuUnits(rs.getInt("int_gpus_max")); + l.setMaxGpuMemory(rs.getLong("int_gpu_mem_max")); l.setThreads(rs.getInt("int_threads")); l.setIdleCoreUnits(rs.getInt("int_cores_idle")); l.setIdleMemory(rs.getLong("int_mem_idle")); - l.setIdleGpu(rs.getLong("int_gpu_idle")); + l.setIdleGpuUnits(rs.getInt("int_gpus_idle")); + l.setIdleGpuMemory(rs.getLong("int_gpu_mem_idle")); l.setJobId(rs.getString("pk_job")); l.setLayerId(rs.getString("pk_layer")); l.setFrameId(rs.getString("pk_frame")); @@ -180,8 +193,10 @@ public LocalHostAssignment mapRow(final ResultSet rs, int rowNum) throws SQLExce "int_mem_max,"+ "int_cores_idle,"+ "int_cores_max,"+ - "int_gpu_idle,"+ - "int_gpu_max,"+ + "int_gpu_mem_idle,"+ + "int_gpu_mem_max,"+ + "int_gpus_idle,"+ + "int_gpus_max,"+ "int_threads, "+ "str_type " + "FROM " + @@ -257,6 +272,13 @@ public int getCoreUsageDifference(LocalHostAssignment l, int coreUnits) { Integer.class, coreUnits, l.getId()); } + @Override + public int getGpuUsageDifference(LocalHostAssignment l, int gpuUnits) { + return getJdbcTemplate().queryForObject( + "SELECT ? - int_gpus_max FROM host_local WHERE pk_host_local=?", + Integer.class, gpuUnits, l.getId()); + } + private static final String UPDATE_MAX_CORES = "UPDATE " + "host_local " + @@ -272,6 +294,21 @@ public boolean updateMaxCores(LocalHostAssignment l, int coreUnits) { coreUnits, coreUnits, l.getId()) > 0; } + private static final String UPDATE_MAX_GPUS = + "UPDATE " + + "host_local " + + "SET " + + "int_gpus_idle = int_gpus_idle + (? - int_gpus_max), " + + "int_gpus_max = ? "+ + "WHERE " + + "pk_host_local = ? 
"; + + @Override + public boolean updateMaxGpus(LocalHostAssignment l, int gpuUnits) { + return getJdbcTemplate().update(UPDATE_MAX_GPUS, + gpuUnits, gpuUnits, l.getId()) > 0; + } + private static final String UPDATE_MAX_MEMORY = "UPDATE " + "host_local " + @@ -287,19 +324,19 @@ public boolean updateMaxMemory(LocalHostAssignment l, long maxMemory) { UPDATE_MAX_MEMORY, maxMemory, maxMemory, l.getId()) > 0; } - private static final String UPDATE_MAX_GPU = + private static final String UPDATE_MAX_GPU_MEMORY = "UPDATE " + "host_local " + "SET " + - "int_gpu_idle = int_gpu_idle + (? - int_gpu_max), " + - "int_gpu_max = ? "+ + "int_gpu_mem_idle = int_gpu_mem_idle + (? - int_gpu_mem_max), " + + "int_gpu_mem_max = ? "+ "WHERE " + "pk_host_local = ? "; @Override - public boolean updateMaxGpu(LocalHostAssignment l, long maxGpu) { + public boolean updateMaxGpuMemory(LocalHostAssignment l, long maxGpuMemory) { return getJdbcTemplate().update( - UPDATE_MAX_GPU, maxGpu, maxGpu, l.getId()) > 0; + UPDATE_MAX_GPU_MEMORY, maxGpuMemory, maxGpuMemory, l.getId()) > 0; } @Override @@ -331,6 +368,26 @@ public boolean allocateCoresFromHost(HostInterface h, int cores) { } + /** + * + * @param h HostInterface + * @param gpus int + * @return boolean + */ + @Override + public boolean allocateGpusFromHost(HostInterface h, int gpus) { + + try { + return getJdbcTemplate().update( + "UPDATE host SET int_gpus_idle = int_gpus_idle - ? 
" + + "WHERE pk_host = ?", + gpus, h.getHostId()) > 0; + } catch (DataAccessException e) { + throw new ResourceReservationFailureException("Failed to allocate " + + gpus + " GPU from host, " + e); + } + } + /** * * @param h HostInterface @@ -349,12 +406,31 @@ public boolean deallocateCoresFromHost(HostInterface h, int cores) { } } + /** + * + * @param h HostInterface + * @param gpus int + * @return boolean + */ + @Override + public boolean deallocateGpusFromHost(HostInterface h, int gpus) { + try { + return getJdbcTemplate().update( + "UPDATE host SET int_gpus_idle = int_gpus_idle + ? WHERE pk_host = ?", + gpus, h.getHostId()) > 0; + } catch (DataAccessException e) { + throw new ResourceReservationFailureException("Failed to de-allocate " + + gpus + " GPU from host, " + e); + } + } + @Override public boolean hasResourceDeficit(HostInterface host) { return getJdbcTemplate().queryForObject( "SELECT COUNT(1) FROM host_local WHERE " + "(int_cores_max < int_cores_max - int_cores_idle OR " + - "int_gpu_max < int_gpu_max - int_gpu_idle OR " + + "int_gpus_max < int_gpus_max - int_gpus_idle OR " + + "int_gpu_mem_max < int_gpu_mem_max - int_gpu_mem_idle OR " + "int_mem_max < int_mem_max - int_mem_idle) AND " + "host_local.pk_host= ?", Integer.class, host.getHostId()) > 0; diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java index 0443f691a..fb267ddbd 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java @@ -52,6 +52,12 @@ public class DispatchQuery { "OR " + "folder_resource.int_cores < folder_resource.int_max_cores " + ") " + + "AND " + + "(" + + "folder_resource.int_max_gpus = -1 " + + "OR " + + "folder_resource.int_gpus < folder_resource.int_max_gpus " + + ") " + "AND job.str_state = 'PENDING' " + "AND job.b_paused = false " + "AND job.pk_show = ? 
" + @@ -66,8 +72,10 @@ public class DispatchQuery { "AND layer.int_cores_min <= ? " + "AND layer.int_mem_min <= ? " + "AND (CASE WHEN layer.b_threadable = true THEN 1 ELSE 0 END) >= ? " + - "AND layer.int_gpu_min BETWEEN ? AND ? " + + "AND layer.int_gpus_min <= ? " + + "AND layer.int_gpu_mem_min BETWEEN ? AND ? " + "AND job_resource.int_cores + layer.int_cores_min < job_resource.int_max_cores " + + "AND job_resource.int_gpus + layer.int_gpus_min < job_resource.int_max_gpus " + "AND host.str_tags ~* ('(?x)' || layer.str_tags) " + "AND host.str_name = ? " + "AND layer.pk_layer IN (" + @@ -165,7 +173,7 @@ public class DispatchQuery { "AND " + "l.int_mem_min <= host_local.int_mem_idle " + "AND " + - "l.int_gpu_min <= host_local.int_gpu_idle " + + "l.int_gpu_mem_min <= host_local.int_gpu_mem_idle " + "AND " + "l.pk_layer IN (" + "SELECT " + @@ -219,6 +227,8 @@ public class DispatchQuery { "folder.pk_folder = folder_resource.pk_folder " + "AND " + "(folder_resource.int_max_cores = -1 OR folder_resource.int_cores < folder_resource.int_max_cores) " + + "AND " + + "(folder_resource.int_max_gpus = -1 OR folder_resource.int_gpus < folder_resource.int_max_gpus) " + "AND " + "job_resource.float_tier < 1.00 " + "AND " + @@ -263,7 +273,9 @@ public class DispatchQuery { "AND " + "l.int_mem_min <= ? " + "AND " + - "l.int_gpu_min = ? " + + "l.int_gpus_min <= ? " + + "AND " + + "l.int_gpu_mem_min = ? " + "AND " + "h.str_tags ~* ('(?x)' || l.str_tags) " + "AND " + @@ -320,10 +332,14 @@ public class DispatchQuery { "folder.pk_folder = folder_resource.pk_folder " + "AND " + "(folder_resource.int_max_cores = -1 OR folder_resource.int_cores < folder_resource.int_max_cores) " + + "AND " + + "(folder_resource.int_max_gpus = -1 OR folder_resource.int_gpus < folder_resource.int_max_gpus) " + "AND " + "job_resource.int_priority > ?" 
+ "AND " + "job_resource.int_cores < job_resource.int_max_cores " + + "AND " + + "job_resource.int_gpus < job_resource.int_max_gpus " + "AND " + "job.str_state = 'PENDING' " + "AND " + @@ -360,7 +376,9 @@ public class DispatchQuery { "AND " + "l.int_mem_min <= ? " + "AND " + - "l.int_gpu_min = ? " + + "l.int_gpus_min <= ? " + + "AND " + + "l.int_gpu_mem_min = ? " + "AND " + "h.str_tags ~* ('(?x)' || l.str_tags) " + "AND " + @@ -417,7 +435,9 @@ public class DispatchQuery { "int_cores_min, " + "int_cores_max, " + "int_mem_min, " + - "int_gpu_min, " + + "int_gpus_min, " + + "int_gpus_max, " + + "int_gpu_mem_min, " + "str_cmd, " + "str_range, " + "int_chunk_size, " + @@ -450,7 +470,9 @@ public class DispatchQuery { "layer.int_cores_min, " + "layer.int_cores_max, " + "layer.int_mem_min, " + - "layer.int_gpu_min, " + + "layer.int_gpus_min, " + + "layer.int_gpus_max, " + + "layer.int_gpu_mem_min, " + "layer.str_cmd, " + "layer.str_range, " + "layer.int_chunk_size, " + @@ -468,7 +490,9 @@ public class DispatchQuery { "AND " + "layer.int_mem_min <= ? " + "AND " + - "layer.int_gpu_min BETWEEN ? AND ? " + + "layer.int_gpus_min <= ? " + + "AND " + + "layer.int_gpu_mem_min BETWEEN ? AND ? 
" + "AND " + "frame.str_state='WAITING' " + "AND " + @@ -524,9 +548,11 @@ public class DispatchQuery { "layer_type, " + "int_cores_min, " + "int_cores_max, " + + "int_gpus_min, " + + "int_gpus_max, " + "b_threadable, " + "int_mem_min, " + - "int_gpu_min, " + + "int_gpu_mem_min, " + "str_cmd, " + "str_range, " + "int_chunk_size, " + @@ -557,9 +583,11 @@ public class DispatchQuery { "layer.str_type AS layer_type, " + "layer.int_cores_min, " + "layer.int_cores_max, " + + "layer.int_gpus_min, " + + "layer.int_gpus_max, " + "layer.b_threadable, " + "layer.int_mem_min, " + - "layer.int_gpu_min, " + + "layer.int_gpu_mem_min, " + "layer.str_cmd, " + "layer.str_range, " + "layer.int_chunk_size, " + @@ -579,7 +607,9 @@ public class DispatchQuery { "AND " + "(CASE WHEN layer.b_threadable = true THEN 1 ELSE 0 END) >= ? " + "AND " + - "layer.int_gpu_min BETWEEN ? AND ? " + + "layer.int_gpus_min <= ? " + + "AND " + + "layer.int_gpu_mem_min BETWEEN ? AND ? " + "AND " + "frame.str_state='WAITING' " + "AND " + @@ -636,7 +666,9 @@ public class DispatchQuery { "int_cores_min, " + "int_cores_max, " + "int_mem_min, " + - "int_gpu_min, " + + "int_gpus_min, " + + "int_gpus_max, " + + "int_gpu_mem_min, " + "str_cmd, " + "str_range, " + "int_chunk_size, " + @@ -669,7 +701,9 @@ public class DispatchQuery { "layer.int_cores_min, " + "layer.int_cores_max, " + "layer.int_mem_min, " + - "layer.int_gpu_min, " + + "layer.int_gpus_min, " + + "layer.int_gpus_max, " + + "layer.int_gpu_mem_min, " + "layer.str_cmd, " + "layer.str_range, " + "layer.int_chunk_size, " + @@ -685,7 +719,7 @@ public class DispatchQuery { "AND " + "layer.int_mem_min <= ? " + "AND " + - "layer.int_gpu_min <= ? " + + "layer.int_gpu_mem_min <= ? 
" + "AND " + "frame.str_state='WAITING' " + "AND " + @@ -739,9 +773,11 @@ public class DispatchQuery { "layer_type, " + "int_cores_min, " + "int_cores_max, " + + "int_gpus_min, " + + "int_gpus_max, " + "b_threadable, " + "int_mem_min, " + - "int_gpu_min, " + + "int_gpu_mem_min, " + "str_cmd, " + "str_range, " + "int_chunk_size, " + @@ -774,7 +810,9 @@ public class DispatchQuery { "layer.int_cores_max, " + "layer.b_threadable, " + "layer.int_mem_min, " + - "layer.int_gpu_min, " + + "layer.int_gpus_min, " + + "layer.int_gpus_max, " + + "layer.int_gpu_mem_min, " + "layer.str_cmd, " + "layer.str_range, " + "layer.int_chunk_size, " + @@ -790,7 +828,7 @@ public class DispatchQuery { "AND " + "layer.int_mem_min <= ? " + "AND " + - "layer.int_gpu_min <= ? " + + "layer.int_gpu_mem_min <= ? " + "AND " + "frame.str_state='WAITING' " + "AND " + @@ -849,7 +887,9 @@ public class DispatchQuery { "int_cores_min, " + "int_cores_max, " + "int_mem_min, " + - "int_gpu_min, " + + "int_gpus_min, " + + "int_gpus_max, " + + "int_gpu_mem_min, " + "str_cmd, " + "str_range, " + "int_chunk_size, " + @@ -882,7 +922,9 @@ public class DispatchQuery { "layer.int_cores_min, " + "layer.int_cores_max, " + "layer.int_mem_min, " + - "layer.int_gpu_min, " + + "layer.int_gpus_min, " + + "layer.int_gpus_max, " + + "layer.int_gpu_mem_min, " + "layer.str_cmd, " + "layer.str_range, " + "layer.int_chunk_size, " + @@ -900,7 +942,9 @@ public class DispatchQuery { "AND " + "layer.int_mem_min <= ? " + "AND " + - "layer.int_gpu_min = ? " + + "layer.int_gpus_min <= ? " + + "AND " + + "layer.int_gpu_mem_min <= ? 
" + "AND " + "frame.str_state='WAITING' " + "AND " + @@ -958,7 +1002,9 @@ public class DispatchQuery { "int_cores_max, " + "b_threadable, " + "int_mem_min, " + - "int_gpu_min, " + + "int_gpus_min, " + + "int_gpus_max, " + + "int_gpu_mem_min, " + "str_cmd, " + "str_range, " + "int_chunk_size, " + @@ -991,7 +1037,9 @@ public class DispatchQuery { "layer.int_cores_max, " + "layer.b_threadable, " + "layer.int_mem_min, " + - "layer.int_gpu_min, " + + "layer.int_gpus_min, " + + "layer.int_gpus_max, " + + "layer.int_gpu_mem_min, " + "layer.str_cmd, " + "layer.str_range, " + "layer.int_chunk_size, " + @@ -1011,7 +1059,9 @@ public class DispatchQuery { "AND " + "(CASE WHEN layer.b_threadable = true THEN 1 ELSE 0 END) >= ? " + "AND " + - "layer.int_gpu_min <= ? " + + "layer.int_gpus_min <= ? " + + "AND " + + "layer.int_gpu_mem_min <= ? " + "AND " + "frame.str_state='WAITING' " + "AND " + @@ -1068,7 +1118,9 @@ public class DispatchQuery { "int_cores_min, " + "int_cores_max, " + "int_mem_min, " + - "int_gpu_min, " + + "int_gpus_min, " + + "int_gpus_max, " + + "int_gpu_mem_min, " + "str_cmd, " + "str_range, " + "int_chunk_size, " + @@ -1100,7 +1152,9 @@ public class DispatchQuery { "layer.b_threadable, " + "layer.int_cores_min, " + "layer.int_mem_min, " + - "layer.int_gpu_min, " + + "layer.int_gpus_min, " + + "layer.int_gpus_max, " + + "layer.int_gpu_mem_min, " + "layer.int_cores_max, " + "layer.str_cmd, " + "layer.str_range, " + @@ -1117,7 +1171,7 @@ public class DispatchQuery { "AND " + "layer.int_mem_min <= ? " + "AND " + - "layer.int_gpu_min <= ? " + + "layer.int_gpu_mem_min <= ? 
" + "AND " + "frame.str_state='WAITING' " + "AND " + @@ -1173,7 +1227,9 @@ public class DispatchQuery { "int_cores_max, " + "b_threadable, " + "int_mem_min, " + - "int_gpu_min, " + + "int_gpus_min, " + + "int_gpus_max, " + + "int_gpu_mem_min, " + "str_cmd, " + "str_range, " + "int_chunk_size, " + @@ -1206,7 +1262,9 @@ public class DispatchQuery { "layer.int_cores_max, " + "layer.b_threadable, " + "layer.int_mem_min, " + - "layer.int_gpu_min, " + + "layer.int_gpus_min, " + + "layer.int_gpus_max, " + + "layer.int_gpu_mem_min, " + "layer.str_cmd, " + "layer.str_range, " + "layer.int_chunk_size, " + @@ -1222,7 +1280,7 @@ public class DispatchQuery { "AND " + "layer.int_mem_min <= ? " + "AND " + - "layer.int_gpu_min <= ? " + + "layer.int_gpu_mem_min <= ? " + "AND " + "frame.str_state='WAITING' " + "AND " + diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatcherDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatcherDaoJdbc.java index d3e50525d..10f506e9b 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatcherDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatcherDaoJdbc.java @@ -190,11 +190,12 @@ private Set findDispatchJobs(DispatchHost host, int numJobs, boolean shu s.getShowId(), host.getFacilityId(), host.os, host.idleCores, host.idleMemory, threadMode(host.threadMode), - (host.idleGpu > 0) ? 1: 0, host.idleGpu, + host.idleGpus, + (host.idleGpuMemory > 0) ? 1 : 0, host.idleGpuMemory, host.getName(), numJobs * 10)); if (result.size() < 1) { - if (host.gpu == 0) { + if (host.gpuMemory == 0) { s.skip(host.tags, host.idleCores, host.idleMemory); } } @@ -225,7 +226,8 @@ public Set findDispatchJobs(DispatchHost host, GroupInterface g) { g.getGroupId(),host.getFacilityId(), host.os, host.idleCores, host.idleMemory, threadMode(host.threadMode), - (host.idleGpu > 0) ? 1: 0, host.idleGpu, + host.idleGpus, + (host.idleGpuMemory > 0) ? 
1 : 0, host.idleGpuMemory, host.getName(), 50)); return result; @@ -240,7 +242,7 @@ public List findNextDispatchFrames(JobInterface job, FIND_LOCAL_DISPATCH_FRAME_BY_JOB_AND_PROC, FrameDaoJdbc.DISPATCH_FRAME_MAPPER, proc.memoryReserved, - proc.gpuReserved, + proc.gpuMemoryReserved, job.getJobId(), limit); } @@ -250,7 +252,8 @@ public List findNextDispatchFrames(JobInterface job, FrameDaoJdbc.DISPATCH_FRAME_MAPPER, proc.coresReserved, proc.memoryReserved, - (proc.gpuReserved > 0) ? 1: 0, proc.gpuReserved, + proc.gpusReserved, + (proc.gpuMemoryReserved > 0) ? 1 : 0, proc.gpuMemoryReserved, job.getJobId(), proc.hostName, job.getJobId(), limit); } @@ -264,7 +267,7 @@ public List findNextDispatchFrames(JobInterface job, return getJdbcTemplate().query( FIND_LOCAL_DISPATCH_FRAME_BY_JOB_AND_HOST, FrameDaoJdbc.DISPATCH_FRAME_MAPPER, - host.idleMemory, host.idleGpu, job.getJobId(), + host.idleMemory, host.idleGpuMemory, job.getJobId(), limit); } else { @@ -273,7 +276,8 @@ public List findNextDispatchFrames(JobInterface job, FrameDaoJdbc.DISPATCH_FRAME_MAPPER, host.idleCores, host.idleMemory, threadMode(host.threadMode), - (host.idleGpu > 0) ? 1: 0, host.idleGpu, + host.idleGpus, + (host.idleGpuMemory > 0) ? 
1 : 0, host.idleGpuMemory, job.getJobId(), host.getName(), job.getJobId(), limit); } @@ -288,7 +292,7 @@ public List findNextDispatchFrames(LayerInterface layer, return getJdbcTemplate().query( FIND_LOCAL_DISPATCH_FRAME_BY_LAYER_AND_PROC, FrameDaoJdbc.DISPATCH_FRAME_MAPPER, - proc.memoryReserved, proc.gpuReserved, + proc.memoryReserved, proc.gpuMemoryReserved, layer.getLayerId(), limit); } @@ -297,7 +301,7 @@ public List findNextDispatchFrames(LayerInterface layer, FIND_DISPATCH_FRAME_BY_LAYER_AND_PROC, FrameDaoJdbc.DISPATCH_FRAME_MAPPER, proc.coresReserved, proc.memoryReserved, - proc.gpuReserved, + proc.gpusReserved, proc.gpuMemoryReserved, layer.getLayerId(), layer.getLayerId(), proc.hostName, limit); } @@ -311,7 +315,7 @@ public List findNextDispatchFrames(LayerInterface layer, return getJdbcTemplate().query( FIND_LOCAL_DISPATCH_FRAME_BY_LAYER_AND_HOST, FrameDaoJdbc.DISPATCH_FRAME_MAPPER, - host.idleMemory, host.idleGpu, layer.getLayerId(), + host.idleMemory, host.idleGpuMemory, layer.getLayerId(), limit); } else { @@ -320,7 +324,7 @@ public List findNextDispatchFrames(LayerInterface layer, FrameDaoJdbc.DISPATCH_FRAME_MAPPER, host.idleCores, host.idleMemory, threadMode(host.threadMode), - host.idleGpu, layer.getLayerId(), layer.getLayerId(), + host.idleGpus, host.idleGpuMemory, layer.getLayerId(), layer.getLayerId(), host.getName(), limit); } } @@ -345,7 +349,7 @@ public boolean findUnderProcedJob(JobInterface excludeJob, VirtualProc proc) { Integer.class, excludeJob.getShowId(), proc.getFacilityId(), proc.os, excludeJob.getShowId(), proc.getFacilityId(), proc.os, - proc.coresReserved, proc.memoryReserved, proc.gpuReserved, + proc.coresReserved, proc.memoryReserved, proc.gpusReserved, proc.gpuMemoryReserved, proc.hostName) > 0; } catch (org.springframework.dao.EmptyResultDataAccessException e) { return false; @@ -363,7 +367,7 @@ public boolean higherPriorityJobExists(JobDetail baseJob, VirtualProc proc) { HIGHER_PRIORITY_JOB_BY_FACILITY_EXISTS, Boolean.class, 
baseJob.priority, proc.getFacilityId(), proc.os, proc.getFacilityId(), proc.os, - proc.coresReserved, proc.memoryReserved, proc.gpuReserved, + proc.coresReserved, proc.memoryReserved, proc.gpusReserved, proc.gpuMemoryReserved, proc.hostName); } catch (org.springframework.dao.EmptyResultDataAccessException e) { return false; @@ -384,7 +388,8 @@ public Set findDispatchJobs(DispatchHost host, show.getShowId(), host.getFacilityId(), host.os, host.idleCores, host.idleMemory, threadMode(host.threadMode), - (host.idleGpu > 0) ? 1: 0, host.idleGpu, + host.idleGpus, + (host.idleGpuMemory > 0) ? 1 : 0, host.idleGpuMemory, host.getName(), numJobs * 10)); return result; diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/FrameDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/FrameDaoJdbc.java index d7aeef3b5..e905d8e35 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/FrameDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/FrameDaoJdbc.java @@ -66,7 +66,9 @@ public class FrameDaoJdbc extends JdbcDaoSupport implements FrameDao { "ts_updated = current_timestamp, " + "int_version = int_version + 1, " + "int_total_past_core_time = int_total_past_core_time + " + - "round(INTERVAL_TO_SECONDS(current_timestamp - ts_started) * int_cores / 100) " + + "round(INTERVAL_TO_SECONDS(current_timestamp - ts_started) * int_cores / 100)," + + "int_total_past_gpu_time = int_total_past_gpu_time + " + + "round(INTERVAL_TO_SECONDS(current_timestamp - ts_started) * int_gpus) " + "WHERE " + "frame.pk_frame = ? 
" + "AND " + @@ -93,7 +95,9 @@ public boolean updateFrameStopped(FrameInterface frame, FrameState state, "int_mem_max_used = ?, " + "int_version = int_version + 1, " + "int_total_past_core_time = int_total_past_core_time + " + - "round(INTERVAL_TO_SECONDS(current_timestamp + interval '1' second - ts_started) * int_cores / 100) " + + "round(INTERVAL_TO_SECONDS(current_timestamp + interval '1' second - ts_started) * int_cores / 100), " + + "int_total_past_gpu_time = int_total_past_gpu_time + " + + "round(INTERVAL_TO_SECONDS(current_timestamp + interval '1' second - ts_started) * int_gpus) " + "WHERE " + "frame.pk_frame = ? " + "AND " + @@ -149,7 +153,8 @@ public boolean updateFrameCleared(FrameInterface frame) { "str_host = ?, " + "int_cores = ?, " + "int_mem_reserved = ?, " + - "int_gpu_reserved = ?, " + + "int_gpus = ?, " + + "int_gpu_mem_reserved = ?, " + "ts_updated = current_timestamp, " + "ts_started = current_timestamp, " + "ts_stopped = null, " + @@ -200,7 +205,7 @@ public void updateFrameStarted(VirtualProc proc, FrameInterface frame) { int result = getJdbcTemplate().update(UPDATE_FRAME_STARTED, FrameState.RUNNING.toString(), proc.hostName, proc.coresReserved, - proc.memoryReserved, proc.gpuReserved, frame.getFrameId(), + proc.memoryReserved, proc.gpusReserved, proc.gpuMemoryReserved, frame.getFrameId(), FrameState.WAITING.toString(), frame.getVersion()); if (result == 0) { @@ -226,7 +231,8 @@ public void updateFrameStarted(VirtualProc proc, FrameInterface frame) { "str_host=?, " + "int_cores=?, "+ "int_mem_reserved = ?, " + - "int_gpu_reserved = ?, " + + "int_gpus = ?, " + + "int_gpu_mem_reserved = ?, " + "ts_updated = current_timestamp, " + "ts_started = current_timestamp, " + "ts_stopped = null, "+ @@ -240,7 +246,7 @@ public void updateFrameStarted(VirtualProc proc, FrameInterface frame) { public boolean updateFrameFixed(VirtualProc proc, FrameInterface frame) { return getJdbcTemplate().update(UPDATE_FRAME_FIXED, FrameState.RUNNING.toString(), 
proc.hostName, proc.coresReserved, - proc.memoryReserved, proc.gpuReserved, frame.getFrameId()) == 1; + proc.memoryReserved, proc.gpusReserved, proc.gpuMemoryReserved, frame.getFrameId()) == 1; } @Override @@ -276,7 +282,9 @@ public DispatchFrame mapRow(ResultSet rs, int rowNum) throws SQLException { frame.maxCores = rs.getInt("int_cores_max"); frame.threadable = rs.getBoolean("b_threadable"); frame.minMemory = rs.getLong("int_mem_min"); - frame.minGpu = rs.getLong("int_gpu_min"); + frame.minGpus = rs.getInt("int_gpus_min"); + frame.maxGpus = rs.getInt("int_gpus_max"); + frame.minGpuMemory = rs.getLong("int_gpu_mem_min"); frame.version = rs.getInt("int_version"); frame.services = rs.getString("str_services"); return frame; @@ -308,7 +316,9 @@ public DispatchFrame mapRow(ResultSet rs, int rowNum) throws SQLException { "layer.int_cores_max,"+ "layer.b_threadable,"+ "layer.int_mem_min, "+ - "layer.int_gpu_min, "+ + "layer.int_gpus_min,"+ + "layer.int_gpus_max,"+ + "layer.int_gpu_mem_min, "+ "layer.str_range, "+ "layer.int_chunk_size, " + "layer.str_services " + @@ -402,7 +412,7 @@ public FrameDetail mapRow(ResultSet rs, int rowNum) throws SQLException { frame.version = rs.getInt("int_version"); if (rs.getString("str_host") != null) { - frame.lastResource = String.format("%s/%d",rs.getString("str_host"),rs.getInt("int_cores")); + frame.lastResource = String.format("%s/%d/%d",rs.getString("str_host"),rs.getInt("int_cores"),rs.getInt("int_gpus")); } else { frame.lastResource = ""; @@ -931,7 +941,8 @@ public ResourceUsage mapRow(ResultSet rs, int rowNum) throws SQLException { return new ResourceUsage( rs.getLong("int_clock_time"), - rs.getInt("int_cores")); + rs.getInt("int_cores"), + rs.getInt("int_gpus")); } }; @@ -947,7 +958,8 @@ public ResourceUsage getResourceUsage(FrameInterface f) { "SELECT " + "COALESCE(interval_to_seconds(current_timestamp - ts_started), 1) " + "AS int_clock_time, " + - "COALESCE(int_cores, 100) AS int_cores " + + "COALESCE(int_cores, 100) AS 
int_cores," + + "int_gpus " + "FROM " + "frame " + "WHERE " + diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/GroupDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/GroupDaoJdbc.java index 9051131ea..b502bb680 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/GroupDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/GroupDaoJdbc.java @@ -232,6 +232,73 @@ public boolean isOverMinCores(JobInterface job) { Integer.class, job.getJobId()) > 0; } + @Override + public void updateDefaultJobMaxGpus(GroupInterface group, int value) { + if (value <= 0) { value = CueUtil.FEATURE_DISABLED; } + if (value < CueUtil.ONE_CORE && value != CueUtil.FEATURE_DISABLED) { + String msg = "The default max cores for a job must " + + "be greater than a single core"; + throw new IllegalArgumentException(msg); + } + getJdbcTemplate().update( + "UPDATE folder SET int_job_max_gpus=? WHERE pk_folder=?", + value, group.getId()); + } + + @Override + public void updateDefaultJobMinGpus(GroupInterface group, int value) { + if (value <= 0) { value = CueUtil.FEATURE_DISABLED; } + if (value < CueUtil.ONE_CORE && value != CueUtil.FEATURE_DISABLED) { + String msg = "The default min cores for a job must " + + "be greater than a single core"; + throw new IllegalArgumentException(msg); + } + getJdbcTemplate().update( + "UPDATE folder SET int_job_min_gpu=? WHERE pk_folder=?", + value, group.getId()); + } + + @Override + public void updateMaxGpus(GroupInterface group, int value) { + if (value < 0) { value = CueUtil.FEATURE_DISABLED; } + if (value < CueUtil.ONE_CORE && value != CueUtil.FEATURE_DISABLED) { + String msg = "The group max cores feature must " + + "be a whole core or greater, pass in: " + value; + throw new IllegalArgumentException(msg); + } + + getJdbcTemplate().update( + "UPDATE folder_resource SET int_max_gpu=? 
WHERE pk_folder=?", + value, group.getId()); + } + + @Override + public void updateMinGpus(GroupInterface group, int value) { + if (value < 0) { value = 0; } + getJdbcTemplate().update( + "UPDATE folder_resource SET int_min_gpus=? WHERE pk_folder=?", + value, group.getId()); + } + + private static final String IS_OVER_MIN_GPUS = + "SELECT " + + "COUNT(1) " + + "FROM " + + "job,"+ + "folder_resource fr "+ + "WHERE " + + "job.pk_folder = fr.pk_folder " + + "AND " + + "fr.int_gpus > fr.int_min_gpus " + + "AND "+ + "job.pk_job = ?"; + + @Override + public boolean isOverMinGpus(JobInterface job) { + return getJdbcTemplate().queryForObject(IS_OVER_MIN_GPUS, + Integer.class, job.getJobId()) > 0; + } + @Override public void updateDefaultJobPriority(GroupInterface group, int value) { if (value < 0) { value = CueUtil.FEATURE_DISABLED; } @@ -251,6 +318,8 @@ public void updateDefaultJobPriority(GroupInterface group, int value) { "folder.pk_folder, " + "folder.int_job_max_cores,"+ "folder.int_job_min_cores,"+ + "folder.int_job_max_gpus,"+ + "folder.int_job_min_gpus,"+ "folder.int_job_priority,"+ "folder.str_name,"+ "folder.pk_parent_folder,"+ @@ -258,7 +327,9 @@ public void updateDefaultJobPriority(GroupInterface group, int value) { "folder.pk_dept,"+ "folder_level.int_level, " + "folder_resource.int_min_cores,"+ - "folder_resource.int_max_cores " + + "folder_resource.int_max_cores," + + "folder_resource.int_min_gpus,"+ + "folder_resource.int_max_gpus " + "FROM " + "folder, "+ "folder_level, " + @@ -273,6 +344,8 @@ public void updateDefaultJobPriority(GroupInterface group, int value) { "folder.pk_folder, " + "folder.int_job_max_cores,"+ "folder.int_job_min_cores,"+ + "folder.int_job_max_gpus,"+ + "folder.int_job_min_gpus,"+ "folder.int_job_priority,"+ "folder.str_name,"+ "folder.pk_parent_folder,"+ @@ -280,7 +353,9 @@ public void updateDefaultJobPriority(GroupInterface group, int value) { "folder.pk_dept,"+ "folder_level.int_level, " + "folder_resource.int_min_cores,"+ - 
"folder_resource.int_max_cores " + + "folder_resource.int_max_cores," + + "folder_resource.int_min_gpus,"+ + "folder_resource.int_max_gpus " + "FROM " + "folder, "+ "folder_level, " + @@ -393,6 +468,8 @@ public GroupDetail mapRow(ResultSet rs, int rowNum) throws SQLException { group.id = rs.getString("pk_folder"); group.jobMaxCores = rs.getInt("int_job_max_cores"); group.jobMinCores = rs.getInt("int_job_min_cores"); + group.jobMaxGpus = rs.getInt("int_job_max_gpus"); + group.jobMinGpus = rs.getInt("int_job_min_gpus"); group.jobPriority = rs.getInt("int_job_priority"); group.name = rs.getString("str_name"); group.parentId = rs.getString("pk_parent_folder"); diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/HostDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/HostDaoJdbc.java index 1efd6b597..5c106335c 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/HostDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/HostDaoJdbc.java @@ -71,10 +71,12 @@ public HostEntity mapRow(ResultSet rs, int rowNum) throws SQLException { host.unlockAtBoot = rs.getBoolean("b_unlock_boot"); host.cores = rs.getInt("int_cores"); host.idleCores = rs.getInt("int_cores_idle"); - host.memory = rs.getInt("int_mem"); - host.idleMemory = rs.getInt("int_mem_idle"); - host.gpu = rs.getInt("int_gpu"); - host.idleGpu = rs.getInt("int_gpu_idle"); + host.memory = rs.getLong("int_mem"); + host.idleMemory = rs.getLong("int_mem_idle"); + host.gpus = rs.getInt("int_gpus"); + host.idleGpus = rs.getInt("int_gpus_idle"); + host.gpuMemory = rs.getLong("int_gpu_mem"); + host.idleGpuMemory = rs.getLong("int_gpu_mem_idle"); host.dateBooted = rs.getDate("ts_booted"); host.dateCreated = rs.getDate("ts_created"); host.datePinged = rs.getDate("ts_ping"); @@ -110,8 +112,10 @@ public HostInterface mapRow(final ResultSet rs, int rowNum) throws SQLException "host.int_cores_idle,"+ "host.int_mem,"+ "host.int_mem_idle,"+ - "host.int_gpu,"+ - 
"host.int_gpu_idle,"+ + "host.int_gpus,"+ + "host.int_gpus_idle,"+ + "host.int_gpu_mem,"+ + "host.int_gpu_mem_idle,"+ "host.ts_created,"+ "host.str_name, " + "host_stat.str_state,"+ @@ -199,12 +203,14 @@ public DispatchHost mapRow(ResultSet rs, int rowNum) throws SQLException { host.facilityId = rs.getString("pk_facility"); host.name = rs.getString("str_name"); host.lockState = LockState.valueOf(rs.getString("str_lock_state")); - host.memory = rs.getInt("int_mem"); + host.memory = rs.getLong("int_mem"); host.cores = rs.getInt("int_cores"); - host.gpu= rs.getInt("int_gpu"); - host.idleMemory= rs.getInt("int_mem_idle"); + host.gpus = rs.getInt("int_gpus"); + host.gpuMemory = rs.getLong("int_gpu_mem"); + host.idleMemory= rs.getLong("int_mem_idle"); host.idleCores = rs.getInt("int_cores_idle"); - host.idleGpu= rs.getInt("int_gpu_idle"); + host.idleGpuMemory = rs.getLong("int_gpu_mem_idle"); + host.idleGpus = rs.getInt("int_gpus_idle"); host.isNimby = rs.getBoolean("b_nimby"); host.threadMode = rs.getInt("int_thread_mode"); host.tags = rs.getString("str_tags"); @@ -225,8 +231,10 @@ public DispatchHost mapRow(ResultSet rs, int rowNum) throws SQLException { "host.int_cores_idle, " + "host.int_mem,"+ "host.int_mem_idle, "+ - "host.int_gpu,"+ - "host.int_gpu_idle, "+ + "host.int_gpus, "+ + "host.int_gpus_idle, " + + "host.int_gpu_mem,"+ + "host.int_gpu_mem_idle, "+ "host.b_nimby, "+ "host.int_thread_mode, "+ "host.str_tags, " + @@ -276,12 +284,14 @@ public DispatchHost getDispatchHost(String id) { "int_cores_idle, " + "int_mem,"+ "int_mem_idle,"+ - "int_gpu,"+ - "int_gpu_idle,"+ + "int_gpus, " + + "int_gpus_idle, " + + "int_gpu_mem,"+ + "int_gpu_mem_idle,"+ "str_fqdn, " + "int_thread_mode "+ ") " + - "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)", + "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)", "INSERT INTO " + "host_stat " + @@ -290,8 +300,8 @@ public DispatchHost getDispatchHost(String id) { "pk_host,"+ "int_mem_total, " + "int_mem_free,"+ - "int_gpu_total, " + - "int_gpu_free,"+ + 
"int_gpu_mem_total, " + + "int_gpu_mem_free,"+ "int_swap_total, " + "int_swap_free,"+ "int_mcp_total, " + @@ -347,28 +357,17 @@ public void insertRenderHost(RenderHost host, AllocationInterface a, boolean use os = Dispatcher.OS_DEFAULT; } - long totalGpu; - if (host.getAttributesMap().containsKey("totalGpu")) - totalGpu = Integer.parseInt(host.getAttributesMap().get("totalGpu")); - else - totalGpu = 0; - - long freeGpu; - if (host.getAttributesMap().containsKey("freeGpu")) - freeGpu = Integer.parseInt(host.getAttributesMap().get("freeGpu")); - else - freeGpu = 0; - - getJdbcTemplate().update(INSERT_HOST_DETAIL[0], hid, a.getAllocationId(), name, host.getNimbyEnabled(), LockState.OPEN.toString(), host.getNumProcs(), coreUnits, coreUnits, - memUnits, memUnits, totalGpu, totalGpu, + memUnits, memUnits, + host.getNumGpus(), host.getNumGpus(), + host.getTotalGpuMem(), host.getTotalGpuMem(), fqdn, threadMode.getNumber()); getJdbcTemplate().update(INSERT_HOST_DETAIL[1], hid, hid, host.getTotalMem(), host.getFreeMem(), - totalGpu, freeGpu, + host.getTotalGpuMem(), host.getFreeGpuMem(), host.getTotalSwap(), host.getFreeSwap(), host.getTotalMcp(), host.getFreeMcp(), host.getLoad(), new Timestamp(host.getBootTime() * 1000l), @@ -396,8 +395,8 @@ public CallableStatement createCallableStatement(Connection con) throws SQLExcep "int_swap_free = ?, "+ "int_mcp_total = ?, " + "int_mcp_free = ?, " + - "int_gpu_total = ?, " + - "int_gpu_free = ?, " + + "int_gpu_mem_total = ?, " + + "int_gpu_mem_free = ?, " + "int_load = ?," + "ts_booted = ?, " + "ts_ping = current_timestamp, "+ @@ -410,7 +409,7 @@ public void updateHostStats(HostInterface host, long totalMemory, long freeMemory, long totalSwap, long freeSwap, long totalMcp, long freeMcp, - long totalGpu, long freeGpu, + long totalGpuMemory, long freeGpuMemory, int load, Timestamp bootTime, String os) { @@ -420,7 +419,7 @@ public void updateHostStats(HostInterface host, getJdbcTemplate().update(UPDATE_RENDER_HOST, totalMemory, 
freeMemory, totalSwap, - freeSwap, totalMcp, freeMcp, totalGpu, freeGpu, load, + freeSwap, totalMcp, freeMcp, totalGpuMemory, freeGpuMemory, load, bootTime, os, host.getHostId()); } @@ -440,12 +439,8 @@ public void updateHostResources(HostInterface host, HostReport report) { long memory = convertMemoryUnits(report.getHost()); int cores = report.getHost().getNumProcs() * report.getHost().getCoresPerProc(); - - long totalGpu; - if (report.getHost().getAttributesMap().containsKey("totalGpu")) - totalGpu = Integer.parseInt(report.getHost().getAttributesMap().get("totalGpu")); - else - totalGpu = 0; + long gpu_memory = report.getHost().getTotalGpuMem(); + int gpus = report.getHost().getNumGpus(); getJdbcTemplate().update( "UPDATE " + @@ -456,16 +451,20 @@ public void updateHostResources(HostInterface host, HostReport report) { "int_cores_idle=?," + "int_mem=?," + "int_mem_idle=?, " + - "int_gpu=?," + - "int_gpu_idle=? " + + "int_gpus=?," + + "int_gpus_idle=?," + + "int_gpu_mem=?," + + "int_gpu_mem_idle=? " + "WHERE " + "pk_host=? 
"+ "AND " + "int_cores = int_cores_idle " + "AND " + - "int_mem = int_mem_idle", + "int_mem = int_mem_idle " + + "AND " + + "int_gpus = int_gpus_idle", report.getHost().getNimbyEnabled(), cores, cores, - memory, memory, totalGpu, totalGpu, host.getId()); + memory, memory, gpus, gpus, gpu_memory, gpu_memory, host.getId()); } @Override @@ -628,6 +627,18 @@ public int getStrandedCoreUnits(HostInterface h) { } } + @Override + public int getStrandedGpus(HostInterface h) { + try { + int idle_gpus = getJdbcTemplate().queryForObject( + "SELECT int_gpus_idle FROM host WHERE pk_host = ?", + Integer.class, h.getHostId()); + return idle_gpus; + } catch (EmptyResultDataAccessException e) { + return 0; + } + } + private static final String IS_HOST_UP = "SELECT " + "COUNT(1) " + diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/JobDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/JobDaoJdbc.java index 3705324be..8009b247c 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/JobDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/JobDaoJdbc.java @@ -116,6 +116,8 @@ public JobDetail mapRow(ResultSet rs, int rowNum) throws SQLException { job.logDir = rs.getString("str_log_dir"); job.maxCoreUnits = rs.getInt("int_max_cores"); job.minCoreUnits = rs.getInt("int_min_cores"); + job.maxGpuUnits = rs.getInt("int_max_gpus"); + job.minGpuUnits = rs.getInt("int_min_gpus"); job.name = rs.getString("str_name"); job.priority = rs.getInt("int_priority"); job.shot = rs.getString("str_shot"); @@ -218,6 +220,8 @@ public boolean isJobComplete(JobInterface job) { "job.int_max_retries,"+ "job_resource.int_max_cores,"+ "job_resource.int_min_cores,"+ + "job_resource.int_max_gpus,"+ + "job_resource.int_min_gpus,"+ "job_resource.int_priority,"+ "show.str_name AS show_name, " + "dept.str_name AS dept_name, "+ @@ -364,6 +368,32 @@ public void updateMaxCores(JobInterface j, int v) { v, j.getJobId()); } + @Override + public void 
updateMinGpus(GroupInterface g, int v) { + getJdbcTemplate().update("UPDATE job_resource SET int_min_gpus=? WHERE " + + "pk_job IN (SELECT pk_job FROM job WHERE pk_folder=?)", + v, g.getGroupId()); + } + + @Override + public void updateMaxGpus(GroupInterface g, int v) { + getJdbcTemplate().update("UPDATE job_resource SET int_max_gpus=? WHERE " + + "pk_job IN (SELECT pk_job FROM job WHERE pk_folder=?)", + v, g.getGroupId()); + } + + @Override + public void updateMinGpus(JobInterface j, int v) { + getJdbcTemplate().update("UPDATE job_resource SET int_min_gpus=? WHERE pk_job=?", + v, j.getJobId()); + } + + @Override + public void updateMaxGpus(JobInterface j, int v) { + getJdbcTemplate().update("UPDATE job_resource SET int_max_gpus=? WHERE pk_job=?", + v, j.getJobId()); + } + @Override public void updatePaused(JobInterface j, boolean b) { getJdbcTemplate().update("UPDATE job SET b_paused=? WHERE pk_job=?", @@ -632,6 +662,60 @@ public boolean isAtMaxCores(JobInterface job) { Integer.class, job.getJobId()) > 0; } + private static final String IS_JOB_OVER_MIN_GPUS = + "SELECT " + + "COUNT(1) " + + "FROM " + + "job_resource " + + "WHERE " + + "job_resource.pk_job = ? " + + "AND " + + "job_resource.int_gpus > job_resource.int_min_gpus"; + + @Override + public boolean isOverMinGpus(JobInterface job) { + return getJdbcTemplate().queryForObject(IS_JOB_OVER_MIN_GPUS, + Integer.class, job.getJobId()) > 0; + } + + private static final String IS_JOB_OVER_MAX_GPUS = + "SELECT " + + "COUNT(1) " + + "FROM " + + "job_resource " + + "WHERE " + + "job_resource.pk_job = ? " + + "AND " + + "job_resource.int_gpus + ? 
> job_resource.int_max_gpus"; + + @Override + public boolean isOverMaxGpus(JobInterface job) { + return getJdbcTemplate().queryForObject(IS_JOB_OVER_MAX_GPUS, + Integer.class, job.getJobId(), 0) > 0; + } + + @Override + public boolean isOverMaxGpus(JobInterface job, int gpu) { + return getJdbcTemplate().queryForObject(IS_JOB_OVER_MAX_GPUS, + Integer.class, job.getJobId(), gpu) > 0; + } + + private static final String IS_JOB_AT_MAX_GPUS = + "SELECT " + + "COUNT(1) " + + "FROM " + + "job_resource " + + "WHERE " + + "job_resource.pk_job = ? " + + "AND " + + "job_resource.int_gpus >= job_resource.int_max_gpus "; + + @Override + public boolean isAtMaxGpus(JobInterface job) { + return getJdbcTemplate().queryForObject(IS_JOB_AT_MAX_GPUS, + Integer.class, job.getJobId()) > 0; + } + @Override public void updateMaxFrameRetries(JobInterface j, int max_retries) { if (max_retries < 0) { @@ -685,8 +769,10 @@ public FrameStateTotals mapRow(ResultSet rs, int rowNum) throws SQLException { private static final String GET_EXECUTION_SUMMARY = "SELECT " + "job_usage.int_core_time_success,"+ - "job_usage.int_core_time_fail," + - "job_mem.int_max_rss " + + "job_usage.int_core_time_fail,"+ + "job_usage.int_gpu_time_success,"+ + "job_usage.int_gpu_time_fail,"+ + "job_mem.int_max_rss " + "FROM " + "job," + "job_usage, "+ @@ -707,6 +793,9 @@ public ExecutionSummary mapRow(ResultSet rs, int rowNum) throws SQLException { e.coreTimeSuccess = rs.getLong("int_core_time_success"); e.coreTimeFail = rs.getLong("int_core_time_fail"); e.coreTime = e.coreTimeSuccess + e.coreTimeFail; + e.gpuTimeSuccess = rs.getLong("int_gpu_time_success"); + e.gpuTimeFail = rs.getLong("int_gpu_time_fail"); + e.gpuTime = e.gpuTimeSuccess + e.gpuTimeFail; e.highMemoryKb = rs.getLong("int_max_rss"); return e; @@ -795,6 +884,20 @@ public void updateParent(JobInterface job, GroupDetail dest, Inherit[] inherits) } break; + case MinGpus: + if (dest.jobMinGpus != CueUtil.FEATURE_DISABLED) { + query.append("int_min_gpus=?,"); + 
values.add(dest.jobMinGpus); + } + break; + + case MaxGpus: + if (dest.jobMaxGpus != CueUtil.FEATURE_DISABLED) { + query.append("int_max_gpus=?,"); + values.add(dest.jobMaxGpus); + } + break; + case All: if (dest.jobPriority != CueUtil.FEATURE_DISABLED) { query.append("int_priority=?,"); @@ -810,6 +913,16 @@ public void updateParent(JobInterface job, GroupDetail dest, Inherit[] inherits) query.append("int_max_cores=?,"); values.add(dest.jobMaxCores); } + + if (dest.jobMinGpus != CueUtil.FEATURE_DISABLED) { + query.append("int_min_gpus=?,"); + values.add(dest.jobMinGpus); + } + + if (dest.jobMaxGpus != CueUtil.FEATURE_DISABLED) { + query.append("int_max_gpus=?,"); + values.add(dest.jobMaxGpus); + } break; } } @@ -851,6 +964,8 @@ public void updateParent(JobInterface job, GroupDetail dest, Inherit[] inherits) "job_stat.int_waiting_count != 0" + "AND " + "job_resource.int_cores < job_resource.int_max_cores " + + "AND " + + "job_resource.int_gpus < job_resource.int_max_gpus " + "AND " + "job.pk_facility = ? " + "LIMIT 1"; @@ -922,11 +1037,13 @@ public void updateUsage(JobInterface job, ResourceUsage usage, int exitStatus) { "job_usage " + "SET " + "int_core_time_success = int_core_time_success + ?," + + "int_gpu_time_success = int_gpu_time_success + ?," + "int_clock_time_success = int_clock_time_success + ?,"+ "int_frame_success_count = int_frame_success_count + 1 " + "WHERE " + "pk_job = ? 
", usage.getCoreTimeSeconds(), + usage.getGpuTimeSeconds(), usage.getClockTimeSeconds(), job.getJobId()); diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/LayerDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/LayerDaoJdbc.java index 26654f392..212963519 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/LayerDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/LayerDaoJdbc.java @@ -205,7 +205,8 @@ public LayerDetail mapRow(ResultSet rs, int rowNum) throws SQLException { layer.range = rs.getString("str_range"); layer.minimumCores = rs.getInt("int_cores_min"); layer.minimumMemory = rs.getLong("int_mem_min"); - layer.minimumGpu = rs.getLong("int_gpu_min"); + layer.minimumGpus = rs.getInt("int_gpus_min"); + layer.minimumGpuMemory = rs.getLong("int_gpu_mem_min"); layer.type = LayerType.valueOf(rs.getString("str_type")); layer.tags = Sets.newHashSet( rs.getString("str_tags").replaceAll(" ", "").split("\\|")); @@ -311,12 +312,14 @@ public LayerInterface getLayer(String id) { "int_cores_max, "+ "b_threadable, " + "int_mem_min, " + - "int_gpu_min, " + + "int_gpus_min, "+ + "int_gpus_max, "+ + "int_gpu_mem_min, " + "str_services, " + "int_timeout," + "int_timeout_llu " + ") " + - "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"; + "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"; @Override public void insertLayerDetail(LayerDetail l) { @@ -326,7 +329,7 @@ public void insertLayerDetail(LayerDetail l) { l.range, l.chunkSize, l.dispatchOrder, StringUtils.join(l.tags," | "), l.type.toString(), l.minimumCores, l.maximumCores, l.isThreadable, - l.minimumMemory, l.minimumGpu, StringUtils.join(l.services,","), + l.minimumMemory, l.minimumGpus, l.maximumGpus, l.minimumGpuMemory, StringUtils.join(l.services,","), l.timeout, l.timeout_llu); } @@ -340,9 +343,9 @@ public void updateLayerMinMemory(LayerInterface layer, long val) { } @Override - public void updateLayerMinGpu(LayerInterface layer, long gpu) { - 
getJdbcTemplate().update("UPDATE layer SET int_gpu_min=? WHERE pk_layer=?", - gpu, layer.getLayerId()); + public void updateLayerMinGpuMemory(LayerInterface layer, long kb) { + getJdbcTemplate().update("UPDATE layer SET int_gpu_mem_min=? WHERE pk_layer=?", + kb, layer.getLayerId()); } private static final String BALANCE_MEM = @@ -392,9 +395,9 @@ public void increaseLayerMinMemory(LayerInterface layer, long val) { } @Override - public void increaseLayerMinGpu(LayerInterface layer, long gpu) { - getJdbcTemplate().update("UPDATE layer SET int_gpu_min=? WHERE pk_layer=? AND int_gpu_min < ?", - gpu, layer.getLayerId(), gpu); + public void increaseLayerMinGpuMemory(LayerInterface layer, long kb) { + getJdbcTemplate().update("UPDATE layer SET int_gpu_mem_min=? WHERE pk_layer=? AND int_gpu_mem_min < ?", + kb, layer.getLayerId(), kb); } @Override @@ -412,6 +415,18 @@ public void updateLayerMaxCores(LayerInterface layer, int val) { val, layer.getLayerId()); } + @Override + public void updateLayerMinGpus(LayerInterface layer, int val) { + getJdbcTemplate().update("UPDATE layer SET int_gpus_min=? WHERE pk_layer=?", + val, layer.getLayerId()); + } + + @Override + public void updateLayerMaxGpus(LayerInterface layer, int val) { + getJdbcTemplate().update("UPDATE layer SET int_gpus_max=? 
WHERE pk_layer=?", + val, layer.getLayerId()); + } + private static final String UPDATE_LAYER_MAX_RSS = "UPDATE " + "layer_mem " + @@ -489,6 +504,8 @@ public FrameStateTotals mapRow(ResultSet rs, int rowNum) throws SQLException { "SELECT " + "layer_usage.int_core_time_success,"+ "layer_usage.int_core_time_fail," + + "layer_usage.int_gpu_time_success,"+ + "layer_usage.int_gpu_time_fail," + "layer_usage.int_clock_time_success," + "layer_mem.int_max_rss " + "FROM " + @@ -512,6 +529,9 @@ public ExecutionSummary mapRow(ResultSet rs, int rowNum) throws SQLException { e.coreTimeSuccess = rs.getLong("int_core_time_success"); e.coreTimeFail = rs.getLong("int_core_time_fail"); e.coreTime = e.coreTimeSuccess + e.coreTimeFail; + e.gpuTimeSuccess = rs.getLong("int_gpu_time_success"); + e.gpuTimeFail = rs.getLong("int_gpu_time_fail"); + e.gpuTime = e.gpuTimeSuccess + e.gpuTimeFail; e.highMemoryKb = rs.getLong("int_max_rss"); return e; } @@ -608,10 +628,10 @@ public void updateMinMemory(JobInterface job, long mem, LayerType type) { } @Override - public void updateMinGpu(JobInterface job, long gpu, LayerType type) { + public void updateMinGpuMemory(JobInterface job, long kb, LayerType type) { getJdbcTemplate().update( - "UPDATE layer SET int_gpu_min=? WHERE pk_job=? AND str_type=?", - gpu, job.getJobId(), type.toString()); + "UPDATE layer SET int_gpu_mem_min=? WHERE pk_job=? AND str_type=?", + kb, job.getJobId(), type.toString()); } @Override @@ -621,6 +641,13 @@ public void updateMinCores(JobInterface job, int cores, LayerType type) { cores, job.getJobId(), type.toString()); } + @Override + public void updateMinGpus(JobInterface job, int gpus, LayerType type) { + getJdbcTemplate().update( + "UPDATE layer SET int_gpus_min=? WHERE pk_job=? 
AND str_type=?", + gpus, job.getJobId(), type.toString()); + } + @Override public void updateThreadable(LayerInterface layer, boolean threadable) { getJdbcTemplate().update( @@ -664,6 +691,8 @@ public void enableMemoryOptimizer(LayerInterface layer, boolean value) { "layer.pk_layer = ? " + "AND " + "layer.int_cores_min = 100 " + + "AND " + + "layer.int_gpus_min = 0 " + "AND " + "str_tags LIKE '%general%' " + "AND " + @@ -686,7 +715,8 @@ public boolean isOptimizable(LayerInterface l, int succeeded, float avg) { private static final String THREAD_STATS = "SELECT " + "avg(interval_to_seconds(ts_stopped - ts_started)) AS avg, " + - "int_cores " + + "int_cores, " + + "int_gpus " + "FROM " + "frame " + "WHERE " + @@ -695,8 +725,11 @@ public boolean isOptimizable(LayerInterface l, int succeeded, float avg) { "frame.int_checkpoint_count = 0 " + "AND " + "int_cores > 0 " + + "AND " + + "int_gpus > 0 " + "GROUP BY " + - "int_cores " + + "int_cores, " + + "int_gpus " + "ORDER BY " + "int_cores DESC "; @@ -724,11 +757,13 @@ public void updateUsage(LayerInterface layer, ResourceUsage usage, int exitStatu "layer_usage " + "SET " + "int_core_time_success = int_core_time_success + ?," + + "int_gpu_time_success = int_gpu_time_success + ?," + "int_clock_time_success = int_clock_time_success + ?,"+ "int_frame_success_count = int_frame_success_count + 1 " + "WHERE " + "pk_layer = ? 
", usage.getCoreTimeSeconds(), + usage.getGpuTimeSeconds(), usage.getClockTimeSeconds(), layer.getLayerId()); diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/NestedWhiteboardDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/NestedWhiteboardDaoJdbc.java index f2cff28a5..924a65a96 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/NestedWhiteboardDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/NestedWhiteboardDaoJdbc.java @@ -73,8 +73,12 @@ public CachedJobWhiteboardMapper(NestedJobWhiteboardMapper result) { "folder.int_job_priority as int_def_job_priority, " + "folder.int_job_min_cores as int_def_job_min_cores, " + "folder.int_job_max_cores as int_def_job_max_cores, " + + "folder.int_job_min_gpus as int_def_job_min_gpus, " + + "folder.int_job_max_gpus as int_def_job_max_gpus, " + "folder_resource.int_min_cores AS folder_min_cores, " + "folder_resource.int_max_cores AS folder_max_cores, " + + "folder_resource.int_min_gpus AS folder_min_gpus, " + + "folder_resource.int_max_gpus AS folder_max_gpus, " + "folder_level.int_level, " + "job.pk_job, " + "job.str_name, " + @@ -101,13 +105,18 @@ public CachedJobWhiteboardMapper(NestedJobWhiteboardMapper result) { "job_stat.int_succeeded_count, " + "job_usage.int_core_time_success, " + "job_usage.int_core_time_fail, " + + "job_usage.int_gpu_time_success, " + + "job_usage.int_gpu_time_fail, " + "job_usage.int_frame_success_count, " + "job_usage.int_frame_fail_count, " + "job_usage.int_clock_time_high, " + "job_usage.int_clock_time_success, " + "(job_resource.int_cores + job_resource.int_local_cores) AS int_cores, " + + "(job_resource.int_gpus + job_resource.int_local_gpus) AS int_gpus, " + "job_resource.int_min_cores, " + + "job_resource.int_min_gpus, " + "job_resource.int_max_cores, " + + "job_resource.int_max_gpus, " + "job_mem.int_max_rss " + "FROM " + "show, " + @@ -165,8 +174,12 @@ public NestedGroup mapRow(ResultSet rs, int rowNum) throws 
SQLException { .setDefaultJobPriority(rs.getInt("int_def_job_priority")) .setDefaultJobMinCores(Convert.coreUnitsToCores(rs.getInt("int_def_job_min_cores"))) .setDefaultJobMaxCores(Convert.coreUnitsToCores(rs.getInt("int_def_job_max_cores"))) + .setDefaultJobMinGpus(rs.getInt("int_def_job_min_gpus")) + .setDefaultJobMaxGpus(rs.getInt("int_def_job_max_gpus")) .setMaxCores(Convert.coreUnitsToCores(rs.getInt("folder_max_cores"))) .setMinCores(Convert.coreUnitsToCores(rs.getInt("folder_min_cores"))) + .setMaxGpus(rs.getInt("folder_max_gpus")) + .setMinGpus(rs.getInt("folder_min_gpus")) .setLevel(rs.getInt("int_level")) .setDepartment(rs.getString("dept_name")) .build(); @@ -254,6 +267,8 @@ private static final NestedJob mapResultSetToJob(ResultSet rs) throws SQLExcepti .setLogDir(rs.getString("str_log_dir")) .setMaxCores(Convert.coreUnitsToCores(rs.getInt("int_max_cores"))) .setMinCores(Convert.coreUnitsToCores(rs.getInt("int_min_cores"))) + .setMaxGpus(rs.getInt("int_max_gpus")) + .setMinGpus(rs.getInt("int_min_gpus")) .setName(rs.getString("str_name")) .setPriority(rs.getInt("int_priority")) .setShot(rs.getString("str_shot")) @@ -295,8 +310,10 @@ private static final NestedJob mapResultSetToJob(ResultSet rs) throws SQLExcepti "host_stat.ts_ping, " + "host.int_cores, " + "host.int_cores_idle, " + - "host.int_gpu, " + - "host.int_gpu_idle, " + + "host.int_gpus, " + + "host.int_gpus_idle, " + + "host.int_gpu_mem, " + + "host.int_gpu_mem_idle, " + "host.int_mem, " + "host.int_mem_idle, " + "host.str_lock_state, " + @@ -310,15 +327,16 @@ private static final NestedJob mapResultSetToJob(ResultSet rs) throws SQLExcepti "host_stat.int_swap_free, " + "host_stat.int_mcp_total, " + "host_stat.int_mcp_free, " + - "host_stat.int_gpu_total, " + - "host_stat.int_gpu_free, " + + "host_stat.int_gpu_mem_total, " + + "host_stat.int_gpu_mem_free, " + "host_stat.int_load, " + "proc.pk_proc, " + "proc.int_cores_reserved AS proc_cores, " + + "proc.int_gpus_reserved AS proc_gpus, " +
"proc.int_mem_reserved AS proc_memory, " + "proc.int_mem_used AS used_memory, " + "proc.int_mem_max_used AS max_memory, " + - "proc.int_gpu_reserved AS proc_gpu, " + + "proc.int_gpu_mem_reserved AS proc_gpu_memory, " + "proc.ts_ping, " + "proc.ts_booked, " + "proc.ts_dispatched, " + @@ -445,10 +463,13 @@ public NestedHost mapRow(ResultSet rs, int row) throws SQLException { proc = NestedProc.newBuilder() .setId(pid) .setName(CueUtil.buildProcName(host.getName(), - rs.getInt("proc_cores"))) + rs.getInt("proc_cores"), + rs.getInt("proc_gpus"))) .setReservedCores(Convert.coreUnitsToCores( rs.getInt("proc_cores"))) + .setReservedGpus(rs.getInt("proc_gpus")) .setReservedMemory(rs.getLong("proc_memory")) + .setReservedGpuMemory(rs.getLong("proc_gpu_memory")) .setUsedMemory(rs.getLong("used_memory")) .setFrameName(rs.getString("frame_name")) .setJobName(rs.getString("job_name")) diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ProcDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ProcDaoJdbc.java index be8643b7e..ba9f33c1f 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ProcDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ProcDaoJdbc.java @@ -108,9 +108,12 @@ public boolean deleteVirtualProc(VirtualProc proc) { "int_mem_reserved, " + "int_mem_pre_reserved, " + "int_mem_used, "+ - "int_gpu_reserved, " + + "int_gpus_reserved, " + + "int_gpu_mem_reserved, " + + "int_gpu_mem_pre_reserved, " + + "int_gpu_mem_used, " + "b_local " + - ") VALUES (?,?,?,?,?,?,?,?,?,?,?,?) "; + ") VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?) 
"; public void insertVirtualProc(VirtualProc proc) { proc.id = SqlUtil.genKeyRandom(); @@ -121,7 +124,9 @@ public void insertVirtualProc(VirtualProc proc) { proc.getLayerId(), proc.getJobId(), proc.getFrameId(), proc.coresReserved, proc.memoryReserved, proc.memoryReserved, Dispatcher.MEM_RESERVED_MIN, - proc.gpuReserved, proc.isLocalDispatch); + proc.gpusReserved, proc.gpuMemoryReserved, + proc.gpuMemoryReserved, Dispatcher.MEM_GPU_RESERVED_MIN, + proc.isLocalDispatch); // Update all of the resource counts procCreated(proc); @@ -278,7 +283,9 @@ public VirtualProc mapRow(ResultSet rs, int rowNum) throws SQLException { proc.coresReserved =rs.getInt("int_cores_reserved"); proc.memoryReserved = rs.getLong("int_mem_reserved"); proc.memoryMax = rs.getLong("int_mem_max_used"); - proc.gpuReserved = rs.getLong("int_gpu_reserved"); + proc.gpusReserved = rs.getInt("int_gpus_reserved"); + proc.gpuMemoryReserved = rs.getLong("int_gpu_mem_reserved"); + proc.gpuMemoryMax = rs.getLong("int_gpu_mem_max_used"); proc.virtualMemoryMax = rs.getLong("int_virt_max_used"); proc.virtualMemoryUsed = rs.getLong("int_virt_used"); proc.memoryUsed = rs.getLong("int_mem_used"); @@ -305,7 +312,10 @@ public VirtualProc mapRow(ResultSet rs, int rowNum) throws SQLException { "proc.int_mem_reserved,"+ "proc.int_mem_max_used,"+ "proc.int_mem_used,"+ - "proc.int_gpu_reserved,"+ + "proc.int_gpus_reserved,"+ + "proc.int_gpu_mem_reserved,"+ + "proc.int_gpu_mem_max_used,"+ + "proc.int_gpu_mem_used,"+ "proc.int_virt_max_used,"+ "proc.int_virt_used,"+ "host.str_name AS host_name, " + @@ -551,7 +561,10 @@ public boolean increaseReservedMemory(ProcInterface p, long value) { "int_mem_reserved," + "int_mem_max_used,"+ "int_mem_used,"+ - "int_gpu_reserved," + + "int_gpus_reserved," + + "int_gpu_mem_reserved," + + "int_gpu_mem_max_used," + + "int_gpu_mem_used," + "int_virt_max_used,"+ "int_virt_used,"+ "host_name, " + @@ -578,9 +591,9 @@ public long getReservedMemory(ProcInterface proc) { Long.class, 
proc.getProcId()); } - public long getReservedGpu(ProcInterface proc) { + public long getReservedGpuMemory(ProcInterface proc) { return getJdbcTemplate().queryForObject( - "SELECT int_gpu_reserved FROM proc WHERE pk_proc=?", + "SELECT int_gpu_mem_reserved FROM proc WHERE pk_proc=?", Long.class, proc.getProcId()); } @@ -694,22 +707,24 @@ private void procDestroyed(VirtualProc proc) { "SET " + "int_cores_idle = int_cores_idle + ?," + "int_mem_idle = int_mem_idle + ?, " + - "int_gpu_idle = int_gpu_idle + ? " + + "int_gpus_idle = int_gpus_idle + ?," + + "int_gpu_mem_idle = int_gpu_mem_idle + ? " + "WHERE " + "pk_host = ?", - proc.coresReserved, proc.memoryReserved, proc.gpuReserved, proc.getHostId()); + proc.coresReserved, proc.memoryReserved, proc.gpusReserved, proc.gpuMemoryReserved, proc.getHostId()); if (!proc.isLocalDispatch) { getJdbcTemplate().update( "UPDATE " + "subscription " + "SET " + - "int_cores = int_cores - ? " + + "int_cores = int_cores - ?," + + "int_gpus = int_gpus - ? " + "WHERE " + "pk_show = ? " + "AND " + "pk_alloc = ?", - proc.coresReserved, proc.getShowId(), + proc.coresReserved, proc.gpusReserved, proc.getShowId(), proc.getAllocationId()); } @@ -717,10 +732,11 @@ private void procDestroyed(VirtualProc proc) { "UPDATE " + "layer_resource " + "SET " + - "int_cores = int_cores - ? " + + "int_cores = int_cores - ?," + + "int_gpus = int_gpus - ? " + "WHERE " + "pk_layer = ?", - proc.coresReserved, proc.getLayerId()); + proc.coresReserved, proc.gpusReserved, proc.getLayerId()); if (!proc.isLocalDispatch) { @@ -728,33 +744,36 @@ private void procDestroyed(VirtualProc proc) { "UPDATE " + "job_resource " + "SET " + - "int_cores = int_cores - ? " + + "int_cores = int_cores - ?," + + "int_gpus = int_gpus - ? " + "WHERE " + "pk_job = ?", - proc.coresReserved, proc.getJobId()); + proc.coresReserved, proc.gpusReserved, proc.getJobId()); getJdbcTemplate().update( "UPDATE " + "folder_resource " + "SET " + - "int_cores = int_cores - ? 
" + + "int_cores = int_cores - ?," + + "int_gpus = int_gpus - ? " + "WHERE " + "pk_folder = " + "(SELECT pk_folder FROM job WHERE pk_job=?)", - proc.coresReserved, proc.getJobId()); + proc.coresReserved, proc.gpusReserved, proc.getJobId()); getJdbcTemplate().update( "UPDATE " + "point " + "SET " + - "int_cores = int_cores - ? " + + "int_cores = int_cores - ?, " + + "int_gpus = int_gpus - ? " + "WHERE " + "pk_dept = " + "(SELECT pk_dept FROM job WHERE pk_job=?) " + "AND " + "pk_show = " + "(SELECT pk_show FROM job WHERE pk_job=?) ", - proc.coresReserved, proc.getJobId(), proc.getJobId()); + proc.coresReserved, proc.gpusReserved, proc.getJobId(), proc.getJobId()); } if (proc.isLocalDispatch) { @@ -763,10 +782,11 @@ private void procDestroyed(VirtualProc proc) { "UPDATE " + "job_resource " + "SET " + - "int_local_cores = int_local_cores - ? " + + "int_local_cores = int_local_cores - ?, " + + "int_local_gpus = int_local_gpus - ? " + "WHERE " + "pk_job = ?", - proc.coresReserved, proc.getJobId()); + proc.coresReserved, proc.gpusReserved, proc.getJobId()); getJdbcTemplate().update( "UPDATE " + @@ -774,14 +794,16 @@ private void procDestroyed(VirtualProc proc) { "SET " + "int_cores_idle = int_cores_idle + ?, " + "int_mem_idle = int_mem_idle + ?, " + - "int_gpu_idle = int_gpu_idle + ? " + + "int_gpus_idle = int_gpus_idle + ?, " + + "int_gpu_mem_idle = int_gpu_mem_idle + ? " + "WHERE " + "pk_job = ? " + "AND " + "pk_host = ? ", proc.coresReserved, proc.memoryReserved, - proc.gpuReserved, + proc.gpusReserved, + proc.gpuMemoryReserved, proc.getJobId(), proc.getHostId()); } @@ -802,10 +824,11 @@ private void procCreated(VirtualProc proc) { "SET " + "int_cores_idle = int_cores_idle - ?," + "int_mem_idle = int_mem_idle - ?, " + - "int_gpu_idle = int_gpu_idle - ? " + + "int_gpus_idle = int_gpus_idle - ?," + + "int_gpu_mem_idle = int_gpu_mem_idle - ? 
" + "WHERE " + "pk_host = ?", - proc.coresReserved, proc.memoryReserved, proc.gpuReserved, proc.getHostId()); + proc.coresReserved, proc.memoryReserved, proc.gpusReserved, proc.gpuMemoryReserved, proc.getHostId()); /** @@ -817,12 +840,13 @@ private void procCreated(VirtualProc proc) { "UPDATE " + "subscription " + "SET " + - "int_cores = int_cores + ? " + + "int_cores = int_cores + ?," + + "int_gpus = int_gpus + ? " + "WHERE " + "pk_show = ? " + "AND " + "pk_alloc = ?", - proc.coresReserved, proc.getShowId(), + proc.coresReserved, proc.gpusReserved, proc.getShowId(), proc.getAllocationId()); } @@ -830,10 +854,11 @@ private void procCreated(VirtualProc proc) { "UPDATE " + "layer_resource " + "SET " + - "int_cores = int_cores + ? " + + "int_cores = int_cores + ?," + + "int_gpus = int_gpus + ? " + "WHERE " + "pk_layer = ?", - proc.coresReserved, proc.getLayerId()); + proc.coresReserved, proc.gpusReserved, proc.getLayerId()); if (!proc.isLocalDispatch) { @@ -841,33 +866,36 @@ private void procCreated(VirtualProc proc) { "UPDATE " + "job_resource " + "SET " + - "int_cores = int_cores + ? " + + "int_cores = int_cores + ?," + + "int_gpus = int_gpus + ? " + "WHERE " + "pk_job = ?", - proc.coresReserved, proc.getJobId()); + proc.coresReserved, proc.gpusReserved, proc.getJobId()); getJdbcTemplate().update( "UPDATE " + "folder_resource " + "SET " + - "int_cores = int_cores + ? " + + "int_cores = int_cores + ?," + + "int_gpus = int_gpus + ? " + "WHERE " + "pk_folder = " + "(SELECT pk_folder FROM job WHERE pk_job=?)", - proc.coresReserved, proc.getJobId()); + proc.coresReserved, proc.gpusReserved, proc.getJobId()); getJdbcTemplate().update( "UPDATE " + "point " + "SET " + - "int_cores = int_cores + ? " + + "int_cores = int_cores + ?," + + "int_gpus = int_gpus + ? " + "WHERE " + "pk_dept = " + "(SELECT pk_dept FROM job WHERE pk_job=?) " + "AND " + "pk_show = " + "(SELECT pk_show FROM job WHERE pk_job=?) 
", - proc.coresReserved, proc.getJobId(), proc.getJobId()); + proc.coresReserved, proc.gpusReserved, proc.getJobId(), proc.getJobId()); } if (proc.isLocalDispatch) { @@ -876,23 +904,28 @@ private void procCreated(VirtualProc proc) { "UPDATE " + "job_resource " + "SET " + - "int_local_cores = int_local_cores + ? " + + "int_local_cores = int_local_cores + ?," + + "int_local_gpus = int_local_gpus + ? " + "WHERE " + "pk_job = ?", - proc.coresReserved, proc.getJobId()); + proc.coresReserved, proc.gpusReserved, proc.getJobId()); getJdbcTemplate().update( "UPDATE " + "host_local " + "SET " + "int_cores_idle = int_cores_idle - ?, " + - "int_mem_idle = int_mem_idle - ? " + + "int_mem_idle = int_mem_idle - ?," + + "int_gpus_idle = int_gpus_idle - ?, " + + "int_gpu_mem_idle = int_gpu_mem_idle - ? " + "WHERE " + "pk_job = ? " + "AND " + "pk_host = ?", proc.coresReserved, proc.memoryReserved, + proc.gpusReserved, + proc.gpuMemoryReserved, proc.getJobId(), proc.getHostId()); } diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ServiceDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ServiceDaoJdbc.java index b31d9ade0..6330cc8cb 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ServiceDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ServiceDaoJdbc.java @@ -60,7 +60,9 @@ public ServiceEntity mapRow(ResultSet rs, int rowNum) throws SQLException { s.minCores = rs.getInt("int_cores_min"); s.maxCores = rs.getInt("int_cores_max"); s.minMemory = rs.getLong("int_mem_min"); - s.minGpu = rs.getLong("int_gpu_min"); + s.minGpus = rs.getInt("int_gpus_min"); + s.maxGpus = rs.getInt("int_gpus_max"); + s.minGpuMemory = rs.getLong("int_gpu_mem_min"); s.threadable = rs.getBoolean("b_threadable"); s.tags = splitTags(rs.getString("str_tags")); s.timeout = rs.getInt("int_timeout"); @@ -79,7 +81,9 @@ public ServiceOverrideEntity mapRow(ResultSet rs, int rowNum) s.minCores = rs.getInt("int_cores_min"); s.maxCores = 
rs.getInt("int_cores_max"); s.minMemory = rs.getLong("int_mem_min"); - s.minGpu = rs.getLong("int_gpu_min"); + s.minGpus = rs.getInt("int_gpus_min"); + s.maxGpus = rs.getInt("int_gpus_max"); + s.minGpuMemory = rs.getLong("int_gpu_mem_min"); s.threadable = rs.getBoolean("b_threadable"); s.tags = splitTags(rs.getString("str_tags")); s.showId = rs.getString("pk_show"); @@ -97,7 +101,9 @@ public ServiceOverrideEntity mapRow(ResultSet rs, int rowNum) "service.int_cores_min," + "service.int_cores_max," + "service.int_mem_min," + - "service.int_gpu_min," + + "service.int_gpus_min," + + "service.int_gpus_max," + + "service.int_gpu_mem_min," + "service.str_tags, " + "service.int_timeout, " + "service.int_timeout_llu " + @@ -119,7 +125,9 @@ public ServiceEntity get(String id) { "show_service.int_cores_min," + "show_service.int_cores_max, "+ "show_service.int_mem_min," + - "show_service.int_gpu_min," + + "show_service.int_gpus_min," + + "show_service.int_gpus_max, "+ + "show_service.int_gpu_mem_min," + "show_service.str_tags," + "show_service.int_timeout," + "show_service.int_timeout_llu," + @@ -167,18 +175,21 @@ public boolean isOverridden(String service, String show) { "int_cores_min," + "int_cores_max, "+ "int_mem_min," + - "int_gpu_min," + + "int_gpus_min," + + "int_gpus_max, "+ + "int_gpu_mem_min," + "str_tags," + "int_timeout," + "int_timeout_llu " + - ") VALUES (?,?,?,?,?,?,?,?,?,?)"; + ") VALUES (?,?,?,?,?,?,?,?,?,?,?,?)"; @Override public void insert(ServiceEntity service) { service.id = SqlUtil.genKeyRandom(); getJdbcTemplate().update(INSERT_SERVICE, service.id, service.name, service.threadable, service.minCores, - service.maxCores, service.minMemory, service.minGpu, + service.maxCores, service.minMemory, + service.minGpus, service.maxGpus, service.minGpuMemory, StringUtils.join(service.tags.toArray(), " | "), service.timeout, service.timeout_llu); } @@ -194,11 +205,13 @@ public void insert(ServiceEntity service) { "int_cores_min," + "int_cores_max," + 
"int_mem_min," + - "int_gpu_min," + + "int_gpus_min," + + "int_gpus_max," + + "int_gpu_mem_min," + "str_tags," + "int_timeout," + "int_timeout_llu " + - ") VALUES (?,?,?,?,?,?,?,?,?,?,?)"; + ") VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)"; @Override public void insert(ServiceOverrideEntity service) { @@ -206,7 +219,7 @@ public void insert(ServiceOverrideEntity service) { getJdbcTemplate().update(INSERT_SERVICE_WITH_SHOW, service.id, service.showId, service.name, service.threadable, service.minCores, service.maxCores, service.minMemory, - service.minGpu, joinTags(service.tags), + service.minGpus, service.maxGpus, service.minGpuMemory, joinTags(service.tags), service.timeout, service.timeout_llu); } @@ -219,7 +232,9 @@ service.minGpu, joinTags(service.tags), "int_cores_min=?," + "int_cores_max=?,"+ "int_mem_min=?," + - "int_gpu_min=?," + + "int_gpus_min=?," + + "int_gpus_max=?," + + "int_gpu_mem_min=?," + "str_tags=?," + "int_timeout=?," + "int_timeout_llu=? " + @@ -230,7 +245,7 @@ service.minGpu, joinTags(service.tags), public void update(ServiceEntity service) { getJdbcTemplate().update(UPDATE_SERVICE, service.name, service.threadable, service.minCores, service.maxCores, - service.minMemory, service.minGpu, joinTags(service.tags), + service.minMemory, service.minGpus, service.maxGpus, service.minGpuMemory, joinTags(service.tags), service.timeout, service.timeout_llu, service.getId()); } @@ -243,7 +258,9 @@ service.minMemory, service.minGpu, joinTags(service.tags), "int_cores_min=?," + "int_cores_max=?," + "int_mem_min=?," + - "int_gpu_min=?," + + "int_gpus_min=?," + + "int_gpus_max=?," + + "int_gpu_mem_min=?," + "str_tags=?," + "int_timeout=?," + "int_timeout_llu=? 
" + @@ -254,7 +271,7 @@ service.minMemory, service.minGpu, joinTags(service.tags), public void update(ServiceOverrideEntity service) { getJdbcTemplate().update(UPDATE_SERVICE_WITH_SHOW, service.name, service.threadable, service.minCores, service.maxCores, - service.minMemory, service.minGpu, joinTags(service.tags), + service.minMemory, service.minGpus, service.maxGpus, service.minGpuMemory, joinTags(service.tags), service.timeout, service.timeout_llu, service.getId()); } diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ShowDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ShowDaoJdbc.java index 893455be3..add49a178 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ShowDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/ShowDaoJdbc.java @@ -44,6 +44,8 @@ public ShowEntity mapRow(ResultSet rs, int rowNum) throws SQLException { show.id = rs.getString("pk_show"); show.defaultMaxCores = rs.getInt("int_default_max_cores"); show.defaultMinCores = rs.getInt("int_default_min_cores"); + show.defaultMaxGpus = rs.getInt("int_default_max_gpus"); + show.defaultMinGpus = rs.getInt("int_default_min_gpus"); show.active = rs.getBoolean("b_active"); if (rs.getString("str_comment_email") != null) { @@ -61,6 +63,8 @@ public ShowEntity mapRow(ResultSet rs, int rowNum) throws SQLException { "show.pk_show, " + "show.int_default_max_cores, " + "show.int_default_min_cores, " + + "show.int_default_max_gpus, " + + "show.int_default_min_gpus, " + "show.str_name, " + "show.b_active, " + "show.str_comment_email " + @@ -72,6 +76,8 @@ public ShowEntity mapRow(ResultSet rs, int rowNum) throws SQLException { "show.pk_show, " + "show.int_default_max_cores, " + "show.int_default_min_cores, " + + "show.int_default_max_gpus, " + + "show.int_default_min_gpus, " + "show_alias.str_name, " + "show.b_active, " + "show.str_comment_email " + @@ -101,6 +107,8 @@ public ShowEntity getShowDetail(String id) { "show.pk_show, " + 
"show.int_default_max_cores, " + "show.int_default_min_cores, " + + "show.int_default_max_gpus, " + + "show.int_default_min_gpus, " + "show.str_name, " + "show.b_active, " + "show.str_comment_email " + @@ -180,6 +188,18 @@ public void updateShowDefaultMaxCores(ShowInterface s, int val) { val, s.getShowId()); } + public void updateShowDefaultMinGpus(ShowInterface s, int val) { + getJdbcTemplate().update( + "UPDATE show SET int_default_min_gpus=? WHERE pk_show=?", + val, s.getShowId()); + } + + public void updateShowDefaultMaxGpus(ShowInterface s, int val) { + getJdbcTemplate().update( + "UPDATE show SET int_default_max_gpus=? WHERE pk_show=?", + val, s.getShowId()); + } + @Override public void updateBookingEnabled(ShowInterface s, boolean enabled) { getJdbcTemplate().update( diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/WhiteboardDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/WhiteboardDaoJdbc.java index 95712d605..cb3aede1f 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/WhiteboardDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/WhiteboardDaoJdbc.java @@ -895,7 +895,10 @@ public RenderPartition mapRow(ResultSet rs, int rowNum) throws SQLException { .setThreads(rs.getInt("int_threads")) .setMaxMemory(rs.getLong("int_mem_max")) .setMemory( rs.getLong("int_mem_max") - rs.getLong("int_mem_idle")) - .setMaxGpu(rs.getLong("int_gpu_max")) + .setGpus(rs.getInt("int_gpus_max") - rs.getInt("int_gpus_idle")) + .setMaxGpus(rs.getInt("int_gpus_max")) + .setGpuMemory(rs.getLong("int_gpu_mem_max") - rs.getLong("int_gpu_mem_idle")) + .setMaxGpuMemory(rs.getLong("int_gpu_mem_max")) .setHost(SqlUtil.getString(rs,"str_host_name")) .setJob(SqlUtil.getString(rs,"str_job_name")) .setRenderPartType(RenderPartitionType.valueOf(SqlUtil.getString(rs,"str_type"))) @@ -947,11 +950,13 @@ public Proc mapRow(ResultSet rs, int row) throws SQLException { return Proc.newBuilder() 
.setId(SqlUtil.getString(rs,"pk_proc")) .setName(CueUtil.buildProcName(SqlUtil.getString(rs,"host_name"), - rs.getInt("int_cores_reserved"))) + rs.getInt("int_cores_reserved"), rs.getInt("int_gpus_reserved"))) .setReservedCores(Convert.coreUnitsToCores(rs.getInt("int_cores_reserved"))) .setReservedMemory(rs.getLong("int_mem_reserved")) - .setReservedGpu(rs.getLong("int_gpu_reserved")) + .setReservedGpus(rs.getInt("int_gpus_reserved")) + .setReservedGpuMemory(rs.getLong("int_gpu_mem_reserved")) .setUsedMemory(rs.getLong("int_mem_used")) + .setUsedGpuMemory(rs.getLong("int_gpu_mem_used")) .setFrameName(SqlUtil.getString(rs, "frame_name")) .setJobName(SqlUtil.getString(rs,"job_name")) .setGroupName(SqlUtil.getString(rs,"folder_name")) @@ -1006,20 +1011,22 @@ public static NestedHost.Builder mapNestedHostBuilder(ResultSet rs) throws SQLEx .setFreeMcp(rs.getLong("int_mcp_free")) .setFreeMemory(rs.getLong("int_mem_free")) .setFreeSwap(rs.getLong("int_swap_free")) - .setFreeGpu(rs.getLong("int_gpu_free")) + .setFreeGpuMemory(rs.getLong("int_gpu_mem_free")) .setLoad(rs.getInt("int_load")) .setNimbyEnabled(rs.getBoolean("b_nimby")) .setCores(Convert.coreUnitsToCores(rs.getInt("int_cores"))) .setIdleCores(Convert.coreUnitsToCores(rs.getInt("int_cores_idle"))) .setMemory(rs.getLong("int_mem")) .setIdleMemory(rs.getLong("int_mem_idle")) - .setGpu(rs.getLong("int_gpu")) - .setIdleGpu(rs.getLong("int_gpu_idle")) + .setGpus(rs.getInt("int_gpus")) + .setIdleGpus(rs.getInt("int_gpus_idle")) + .setGpuMemory(rs.getLong("int_gpu_mem")) + .setIdleGpuMemory(rs.getLong("int_gpu_mem_idle")) .setState(HardwareState.valueOf(SqlUtil.getString(rs,"host_state"))) .setTotalMcp(rs.getLong("int_mcp_total")) .setTotalMemory(rs.getLong("int_mem_total")) .setTotalSwap(rs.getLong("int_swap_total")) - .setTotalGpu(rs.getLong("int_gpu_total")) + .setTotalGpuMemory(rs.getLong("int_gpu_mem_total")) .setPingTime((int) (rs.getTimestamp("ts_ping").getTime() / 1000)) 
.setLockState(LockState.valueOf(SqlUtil.getString(rs,"str_lock_state"))) .setHasComment(rs.getBoolean("b_comment")) @@ -1041,20 +1048,22 @@ public static Host.Builder mapHostBuilder(ResultSet rs) throws SQLException { builder.setFreeMcp(rs.getLong("int_mcp_free")); builder.setFreeMemory(rs.getLong("int_mem_free")); builder.setFreeSwap(rs.getLong("int_swap_free")); - builder.setFreeGpu(rs.getLong("int_gpu_free")); + builder.setFreeGpuMemory(rs.getLong("int_gpu_mem_free")); builder.setLoad(rs.getInt("int_load")); builder.setNimbyEnabled(rs.getBoolean("b_nimby")); builder.setCores(Convert.coreUnitsToCores(rs.getInt("int_cores"))); builder.setIdleCores(Convert.coreUnitsToCores(rs.getInt("int_cores_idle"))); builder.setMemory(rs.getLong("int_mem")); builder.setIdleMemory(rs.getLong("int_mem_idle")); - builder.setGpu(rs.getLong("int_gpu")); - builder.setIdleGpu(rs.getLong("int_gpu_idle")); + builder.setGpus(rs.getInt("int_gpus")); + builder.setIdleGpus(rs.getInt("int_gpus_idle")); + builder.setGpuMemory(rs.getLong("int_gpu_mem")); + builder.setIdleGpuMemory(rs.getLong("int_gpu_mem_idle")); builder.setState(HardwareState.valueOf(SqlUtil.getString(rs,"host_state"))); builder.setTotalMcp(rs.getLong("int_mcp_total")); builder.setTotalMemory(rs.getLong("int_mem_total")); builder.setTotalSwap(rs.getLong("int_swap_total")); - builder.setTotalGpu(rs.getLong("int_gpu_total")); + builder.setTotalGpuMemory(rs.getLong("int_gpu_mem_total")); builder.setPingTime((int) (rs.getTimestamp("ts_ping").getTime() / 1000)); builder.setLockState(LockState.valueOf(SqlUtil.getString(rs,"str_lock_state"))); builder.setHasComment(rs.getBoolean("b_comment")); @@ -1109,6 +1118,11 @@ public Allocation mapRow(ResultSet rs, int rowNum) throws SQLException { .setIdleCores(Convert.coreUnitsToCores(rs.getInt("int_idle_cores"))) .setRunningCores(Convert.coreUnitsToCores(rs.getInt("int_running_cores"))) .setLockedCores(Convert.coreUnitsToCores(rs.getInt("int_locked_cores"))) + .setGpus(rs.getInt("int_gpus")) 
+ .setAvailableGpus(rs.getInt("int_available_gpus")) + .setIdleGpus(rs.getInt("int_idle_gpus")) + .setRunningGpus(rs.getInt("int_running_gpus")) + .setLockedGpus(rs.getInt("int_locked_gpus")) .setHosts(rs.getInt("int_hosts")) .setDownHosts(rs.getInt("int_down_hosts")) .setLockedHosts(rs.getInt("int_locked_hosts")) @@ -1128,6 +1142,7 @@ public Group mapRow(ResultSet rs, int rowNum) throws SQLException { .setDependFrames(rs.getInt("int_depend_count")) .setPendingJobs(rs.getInt("int_job_count")) .setReservedCores(Convert.coreUnitsToCores(rs.getInt("int_cores"))) + .setReservedGpus(rs.getInt("int_gpus")) .build(); return Group.newBuilder() .setId(SqlUtil.getString(rs,"pk_folder")) @@ -1136,8 +1151,12 @@ public Group mapRow(ResultSet rs, int rowNum) throws SQLException { .setDefaultJobPriority(rs.getInt("int_job_priority")) .setDefaultJobMinCores(Convert.coreUnitsToCores(rs.getInt("int_job_min_cores"))) .setDefaultJobMaxCores(Convert.coreUnitsToCores(rs.getInt("int_job_max_cores"))) + .setDefaultJobMinGpus(rs.getInt("int_job_min_gpus")) + .setDefaultJobMaxGpus(rs.getInt("int_job_max_gpus")) .setMaxCores(Convert.coreUnitsToCores(rs.getInt("int_max_cores"))) .setMinCores(Convert.coreUnitsToCores(rs.getInt("int_min_cores"))) + .setMaxGpus(rs.getInt("int_max_gpus")) + .setMinGpus(rs.getInt("int_min_gpus")) .setLevel(rs.getInt("int_level")) .setParentId(SqlUtil.getString(rs, "pk_parent_folder")) .setGroupStats(stats) @@ -1153,6 +1172,8 @@ public Job mapRow(ResultSet rs, int rowNum) throws SQLException { .setLogDir(SqlUtil.getString(rs, "str_log_dir")) .setMaxCores(Convert.coreUnitsToCores(rs.getInt("int_max_cores"))) .setMinCores(Convert.coreUnitsToCores(rs.getInt("int_min_cores"))) + .setMaxGpus(rs.getInt("int_max_gpus")) + .setMinGpus(rs.getInt("int_min_gpus")) .setName(SqlUtil.getString(rs,"str_name")) .setPriority(rs.getInt("int_priority")) .setShot(SqlUtil.getString(rs,"str_shot")) @@ -1189,6 +1210,7 @@ public static JobStats mapJobStats(ResultSet rs) throws 
SQLException { JobStats.Builder statsBuilder = JobStats.newBuilder() .setReservedCores(Convert.coreUnitsToCores(rs.getInt("int_cores"))) + .setReservedGpus(rs.getInt("int_gpus")) .setMaxRss(rs.getLong("int_max_rss")) .setTotalFrames(rs.getInt("int_frame_count")) .setTotalLayers(rs.getInt("int_layer_count")) @@ -1202,6 +1224,9 @@ public static JobStats mapJobStats(ResultSet rs) throws SQLException { .setFailedCoreSec(rs.getLong("int_core_time_fail")) .setRenderedCoreSec(rs.getLong("int_core_time_success")) .setTotalCoreSec( rs.getLong("int_core_time_fail") + rs.getLong("int_core_time_success")) + .setFailedGpuSec(rs.getLong("int_gpu_time_fail")) + .setRenderedGpuSec(rs.getLong("int_gpu_time_success")) + .setTotalGpuSec(rs.getLong("int_gpu_time_fail") + rs.getLong("int_gpu_time_success")) .setRenderedFrameCount( rs.getLong("int_frame_success_count")) .setFailedFrameCount(rs.getLong("int_frame_fail_count")) .setHighFrameSec(rs.getInt("int_clock_time_high")); @@ -1236,7 +1261,9 @@ public Layer mapRow(ResultSet rs, int rowNum) throws SQLException { .setMaxCores(Convert.coreUnitsToCores(rs.getInt("int_cores_max"))) .setIsThreadable(rs.getBoolean("b_threadable")) .setMinMemory(rs.getLong("int_mem_min")) - .setMinGpu(rs.getLong("int_gpu_min")) + .setMinGpus(rs.getInt("int_gpus_min")) + .setMaxGpus(rs.getInt("int_gpus_max")) + .setMinGpuMemory(rs.getLong("int_gpu_mem_min")) .setType(LayerType.valueOf(SqlUtil.getString(rs,"str_type"))) .addAllTags(Sets.newHashSet( SqlUtil.getString(rs,"str_tags"). 
@@ -1249,6 +1276,7 @@ public Layer mapRow(ResultSet rs, int rowNum) throws SQLException { LayerStats.Builder statsBuilder = LayerStats.newBuilder() .setReservedCores(Convert.coreUnitsToCores(rs.getInt("int_cores"))) + .setReservedGpus(rs.getInt("int_gpus")) .setMaxRss(rs.getLong("int_max_rss")) .setTotalFrames(rs.getInt("int_total_count")) .setWaitingFrames(rs.getInt("int_waiting_count")) @@ -1263,6 +1291,9 @@ public Layer mapRow(ResultSet rs, int rowNum) throws SQLException { .setRenderedCoreSec(rs.getLong("int_core_time_success")) .setTotalCoreSec( rs.getLong("int_core_time_fail") + rs.getLong("int_core_time_success")) + .setFailedGpuSec(rs.getLong("int_gpu_time_fail")) + .setRenderedGpuSec(rs.getLong("int_gpu_time_success")) + .setTotalGpuSec(rs.getLong("int_gpu_time_fail") + rs.getLong("int_gpu_time_success")) .setRenderedFrameCount( rs.getLong("int_frame_success_count")) .setFailedFrameCount(rs.getLong("int_frame_fail_count")) .setHighFrameSec(rs.getInt("int_clock_time_high")) @@ -1301,6 +1332,7 @@ public Subscription mapRow(ResultSet rs, int rowNum) throws SQLException { .setBurst(rs.getInt("int_burst")) .setName(rs.getString("name")) .setReservedCores(rs.getInt("int_cores")) + .setReservedGpus(rs.getInt("int_gpus")) .setSize(rs.getInt("int_size")) .setAllocationName(rs.getString("alloc_name")) .setShowName(rs.getString("show_name")) @@ -1321,9 +1353,10 @@ public UpdatedFrame mapRow(ResultSet rs, int rowNum) throws SQLException { .setUsedMemory(rs.getInt("int_mem_used")); if (SqlUtil.getString(rs, "str_host") != null) { - builder.setLastResource(String.format(Locale.ROOT, "%s/%2.2f", + builder.setLastResource(String.format(Locale.ROOT, "%s/%2.2f/%d", SqlUtil.getString(rs, "str_host"), - Convert.coreUnitsToCores(rs.getInt("int_cores")))); + Convert.coreUnitsToCores(rs.getInt("int_cores")), + rs.getInt("int_gpus"))); } else { builder.setLastResource(""); } @@ -1360,14 +1393,14 @@ public Frame mapRow(ResultSet rs, int rowNum) throws SQLException { 
.setLayerName(SqlUtil.getString(rs,"layer_name")) .setUsedMemory(rs.getLong("int_mem_used")) .setReservedMemory(rs.getLong("int_mem_reserved")) - .setReservedGpu(rs.getLong("int_gpu_reserved")) + .setReservedGpuMemory(rs.getLong("int_gpu_mem_reserved")) .setCheckpointState(CheckpointState.valueOf( SqlUtil.getString(rs,"str_checkpoint_state"))) .setCheckpointCount(rs.getInt("int_checkpoint_count")); if (SqlUtil.getString(rs,"str_host") != null) { builder.setLastResource(CueUtil.buildProcName(SqlUtil.getString(rs,"str_host"), - rs.getInt("int_cores"))); + rs.getInt("int_cores"), rs.getInt("int_gpus"))); } else { builder.setLastResource(""); } @@ -1388,9 +1421,12 @@ public Frame mapRow(ResultSet rs, int rowNum) throws SQLException { } builder.setTotalCoreTime(rs.getInt("int_total_past_core_time")); + builder.setTotalGpuTime(rs.getInt("int_total_past_gpu_time")); if (builder.getState() == FrameState.RUNNING) { builder.setTotalCoreTime(builder.getTotalCoreTime() + (int)(System.currentTimeMillis() / 1000 - builder.getStartTime()) * rs.getInt("int_cores") / 100); + builder.setTotalGpuTime(builder.getTotalGpuTime() + + (int)(System.currentTimeMillis() / 1000 - builder.getStartTime()) * rs.getInt("int_gpus")); } return builder.build(); } @@ -1406,7 +1442,9 @@ public Service mapRow(ResultSet rs, int rowNum) throws SQLException { .setMinCores(rs.getInt("int_cores_min")) .setMaxCores(rs.getInt("int_cores_max")) .setMinMemory(rs.getInt("int_mem_min")) - .setMinGpu(rs.getInt("int_gpu_min")) + .setMinGpus(rs.getInt("int_gpus_min")) + .setMaxGpus(rs.getInt("int_gpus_max")) + .setMinGpuMemory(rs.getInt("int_gpu_mem_min")) .addAllTags(Lists.newArrayList(ServiceDaoJdbc.splitTags( SqlUtil.getString(rs,"str_tags")))) .setTimeout(rs.getInt("int_timeout")) @@ -1425,7 +1463,9 @@ public ServiceOverride mapRow(ResultSet rs, int rowNum) throws SQLException { .setMinCores(rs.getInt("int_cores_min")) .setMaxCores(rs.getInt("int_cores_max")) .setMinMemory(rs.getInt("int_mem_min")) - 
.setMinGpu(rs.getInt("int_gpu_min")) + .setMinGpus(rs.getInt("int_gpus_min")) + .setMaxGpus(rs.getInt("int_gpus_max")) + .setMinGpuMemory(rs.getInt("int_gpu_mem_min")) .addAllTags(Lists.newArrayList(ServiceDaoJdbc.splitTags( SqlUtil.getString(rs,"str_tags")))) .setTimeout(rs.getInt("int_timeout")) @@ -1450,6 +1490,7 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { .setRenderedFrameCount(rs.getLong("int_frame_success_count")) .setFailedFrameCount(rs.getLong("int_frame_fail_count")) .setReservedCores(Convert.coreUnitsToCores(rs.getInt("int_cores"))) + .setReservedGpus(rs.getInt("int_gpus")) .setPendingJobs(rs.getInt("int_job_count")) .build(); return Show.newBuilder() @@ -1458,6 +1499,8 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { .setActive(rs.getBoolean("b_active")) .setDefaultMaxCores(Convert.coreUnitsToCores(rs.getInt("int_default_max_cores"))) .setDefaultMinCores(Convert.coreUnitsToCores(rs.getInt("int_default_min_cores"))) + .setDefaultMaxGpus(rs.getInt("int_default_max_gpus")) + .setDefaultMinGpus(rs.getInt("int_default_min_gpus")) .setBookingEnabled(rs.getBoolean("b_booking_enabled")) .setDispatchEnabled(rs.getBoolean("b_dispatch_enabled")) .setCommentEmail(SqlUtil.getString(rs,"str_comment_email")) @@ -1513,13 +1556,15 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { "frame.str_state,"+ "frame.str_host,"+ "frame.int_cores,"+ + "frame.int_gpus,"+ "frame.int_mem_max_used," + "frame.int_mem_used, " + "frame.int_mem_reserved, " + - "frame.int_gpu_reserved, " + + "frame.int_gpu_mem_reserved, " + "frame.str_checkpoint_state,"+ "frame.int_checkpoint_count,"+ "frame.int_total_past_core_time,"+ + "frame.int_total_past_gpu_time,"+ "layer.str_name AS layer_name," + "job.str_name AS job_name "+ "FROM "+ @@ -1556,7 +1601,10 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { "proc.int_mem_reserved, " + "proc.int_mem_used, " + "proc.int_mem_max_used, " + - "proc.int_gpu_reserved, " + + 
"proc.int_gpus_reserved, " + + "proc.int_gpu_mem_reserved, " + + "proc.int_gpu_mem_used, " + + "proc.int_gpu_mem_max_used, " + "proc.ts_ping, " + "proc.ts_booked, " + "proc.ts_dispatched, " + @@ -1593,6 +1641,7 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { "frame.str_state,"+ "frame.str_host,"+ "frame.int_cores,"+ + "frame.int_gpus,"+ "frame.ts_llu,"+ "COALESCE(proc.int_mem_max_used, frame.int_mem_max_used) AS int_mem_max_used," + "COALESCE(proc.int_mem_used, frame.int_mem_used) AS int_mem_used " + @@ -1617,6 +1666,11 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { "vs_alloc_usage.int_running_cores,"+ "vs_alloc_usage.int_available_cores,"+ "vs_alloc_usage.int_locked_cores,"+ + "vs_alloc_usage.int_gpus,"+ + "vs_alloc_usage.int_idle_gpus,"+ + "vs_alloc_usage.int_running_gpus,"+ + "vs_alloc_usage.int_available_gpus,"+ + "vs_alloc_usage.int_locked_gpus,"+ "vs_alloc_usage.int_hosts,"+ "vs_alloc_usage.int_locked_hosts,"+ "vs_alloc_usage.int_down_hosts "+ @@ -1650,6 +1704,8 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { "str_ti_task,"+ "int_cores,"+ "int_min_cores,"+ + "int_gpus,"+ + "int_min_gpus,"+ "b_managed " + "FROM " + "point," + @@ -1672,6 +1728,8 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { "str_ti_task,"+ "int_cores,"+ "int_min_cores,"+ + "int_gpus,"+ + "int_min_gpus,"+ "b_managed " + "FROM " + "point," + @@ -1702,11 +1760,13 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { "host_local.pk_host_local,"+ "host_local.int_cores_idle,"+ "host_local.int_cores_max,"+ + "host_local.int_gpus_idle,"+ + "host_local.int_gpus_max,"+ "host_local.int_threads,"+ "host_local.int_mem_idle,"+ "host_local.int_mem_max,"+ - "host_local.int_gpu_idle,"+ - "host_local.int_gpu_max,"+ + "host_local.int_gpu_mem_idle,"+ + "host_local.int_gpu_mem_max,"+ "host_local.str_type,"+ "(SELECT str_name FROM host WHERE host.pk_host = host_local.pk_host) " + "AS str_host_name,"+ @@ -1775,6 
+1835,10 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { "folder.int_job_max_cores," + "folder_resource.int_min_cores,"+ "folder_resource.int_max_cores,"+ + "folder.int_job_min_gpus," + + "folder.int_job_max_gpus," + + "folder_resource.int_min_gpus,"+ + "folder_resource.int_max_gpus,"+ "folder.b_default, " + "folder_level.int_level, " + "c.int_waiting_count, " + @@ -1782,7 +1846,8 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { "c.int_running_count,"+ "c.int_dead_count,"+ "c.int_job_count,"+ - "c.int_cores " + + "c.int_cores," + + "c.int_gpus " + "FROM " + "folder, " + "folder_level," + @@ -1817,6 +1882,8 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { "job.str_log_dir," + "job_resource.int_max_cores," + "job_resource.int_min_cores," + + "job_resource.int_max_gpus," + + "job_resource.int_min_gpus," + "job.str_name," + "job.str_shot,"+ "job.str_state,"+ @@ -1843,12 +1910,15 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { "job_stat.int_succeeded_count, "+ "job_usage.int_core_time_success, "+ "job_usage.int_core_time_fail, " + + "job_usage.int_gpu_time_success, "+ + "job_usage.int_gpu_time_fail, " + "job_usage.int_frame_success_count, "+ "job_usage.int_frame_fail_count, "+ "job_usage.int_clock_time_high,"+ "job_usage.int_clock_time_success,"+ "job_mem.int_max_rss,"+ - "(job_resource.int_cores + job_resource.int_local_cores) AS int_cores " + + "(job_resource.int_cores + job_resource.int_local_cores) AS int_cores," + + "(job_resource.int_gpus + job_resource.int_local_gpus) AS int_gpus " + "FROM " + "job,"+ "folder,"+ @@ -1885,6 +1955,8 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { "layer_stat.int_succeeded_count," + "layer_usage.int_core_time_success," + "layer_usage.int_core_time_fail, "+ + "layer_usage.int_gpu_time_success," + + "layer_usage.int_gpu_time_fail, "+ "layer_usage.int_frame_success_count, "+ "layer_usage.int_frame_fail_count, "+ 
"layer_usage.int_clock_time_low, "+ @@ -1892,7 +1964,8 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { "layer_usage.int_clock_time_success," + "layer_usage.int_clock_time_fail," + "layer_mem.int_max_rss,"+ - "layer_resource.int_cores " + + "layer_resource.int_cores," + + "layer_resource.int_gpus " + "FROM " + "layer, " + "job," + @@ -1923,6 +1996,8 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { "layer_stat.int_succeeded_count, " + "layer_usage.int_core_time_success, " + "layer_usage.int_core_time_fail, " + + "layer_usage.int_gpu_time_success, " + + "layer_usage.int_gpu_time_fail, " + "layer_usage.int_frame_success_count, " + "layer_usage.int_frame_fail_count, " + "layer_usage.int_clock_time_low, " + @@ -1931,6 +2006,7 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { "layer_usage.int_clock_time_fail, " + "layer_mem.int_max_rss, " + "layer_resource.int_cores, " + + "layer_resource.int_gpus, " + "limit_names.str_limit_names " + "FROM " + "layer " + @@ -1976,6 +2052,7 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { "COALESCE(vs_show_stat.int_running_count,0) AS int_running_count," + "COALESCE(vs_show_stat.int_dead_count,0) AS int_dead_count," + "COALESCE(vs_show_resource.int_cores,0) AS int_cores, " + + "COALESCE(vs_show_resource.int_gpus,0) AS int_gpus, " + "COALESCE(vs_show_stat.int_job_count,0) AS int_job_count " + "FROM " + "show " + @@ -1992,7 +2069,9 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { "service.int_cores_min," + "service.int_cores_max," + "service.int_mem_min," + - "service.int_gpu_min," + + "service.int_gpus_min," + + "service.int_gpus_max," + + "service.int_gpu_mem_min," + "service.str_tags," + "service.int_timeout," + "service.int_timeout_llu " + @@ -2007,7 +2086,9 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { "show_service.int_cores_min," + "show_service.int_cores_max," + "show_service.int_mem_min," + - 
"show_service.int_gpu_min," + + "show_service.int_gpus_min," + + "show_service.int_gpus_max," + + "show_service.int_gpu_mem_min," + "show_service.str_tags," + "show_service.int_timeout," + "show_service.int_timeout_llu " + @@ -2023,6 +2104,8 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { "task.str_shot,"+ "task.int_min_cores + task.int_adjust_cores AS int_min_cores, "+ "task.int_adjust_cores, " + + "task.int_min_gpus + task.int_adjust_gpus AS int_min_gpus, "+ + "task.int_adjust_gpus, " + "dept.str_name AS str_dept "+ "FROM " + "task,"+ @@ -2045,8 +2128,10 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { "host.int_cores_idle,"+ "host.int_mem,"+ "host.int_mem_idle,"+ - "host.int_gpu,"+ - "host.int_gpu_idle,"+ + "host.int_gpus,"+ + "host.int_gpus_idle,"+ + "host.int_gpu_mem,"+ + "host.int_gpu_mem_idle,"+ "host.str_tags,"+ "host.str_lock_state,"+ "host.b_comment,"+ @@ -2058,8 +2143,8 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { "host_stat.int_swap_free,"+ "host_stat.int_mcp_total,"+ "host_stat.int_mcp_free,"+ - "host_stat.int_gpu_total,"+ - "host_stat.int_gpu_free,"+ + "host_stat.int_gpu_mem_total,"+ + "host_stat.int_gpu_mem_free,"+ "host_stat.int_load, " + "alloc.str_name AS alloc_name " + "FROM " + @@ -2097,6 +2182,7 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { "subscription.int_burst, " + "subscription.int_size, " + "subscription.int_cores, " + + "subscription.int_gpus, " + "show.str_name AS show_name, " + "alloc.str_name AS alloc_name, " + "facility.str_name AS facility_name " + @@ -2135,10 +2221,14 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException { "frame.int_mem_max_used," + "frame.int_mem_used, " + "frame.int_mem_reserved, " + - "frame.int_gpu_reserved, " + + "frame.int_gpus,"+ + "frame.int_gpu_mem_max_used, " + + "frame.int_gpu_mem_used, " + + "frame.int_gpu_mem_reserved, " + "frame.str_checkpoint_state,"+ "frame.int_checkpoint_count,"+ 
"frame.int_total_past_core_time,"+ + "frame.int_total_past_gpu_time,"+ "layer.str_name AS layer_name," + "job.str_name AS job_name, "+ "ROW_NUMBER() OVER " + diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/AbstractDispatcher.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/AbstractDispatcher.java index ddf3b2a2b..73f5aef73 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/AbstractDispatcher.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/AbstractDispatcher.java @@ -129,6 +129,7 @@ public boolean dispatchHost(DispatchFrame frame, VirtualProc proc) { dispatchSummary(proc, frame, "Booking"); DispatchSupport.bookedProcs.getAndIncrement(); DispatchSupport.bookedCores.addAndGet(proc.coresReserved); + DispatchSupport.bookedGpus.addAndGet(proc.gpusReserved); return true; } catch (FrameReservationException fre) { /* @@ -222,8 +223,10 @@ private static void dispatchSummary(VirtualProc p, DispatchFrame f, String type) " cores / " + CueUtil.KbToMb(p.memoryReserved) + " memory / " + - p.gpuReserved + - " gpu on " + + p.gpusReserved + + " gpus / " + + CueUtil.KbToMb(p.gpuMemoryReserved) + + " gpu memory " + p.getName() + " to " + f.show + "/" + f.shot; logger.info(msg); diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/CoreUnitDispatcher.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/CoreUnitDispatcher.java index d57caf3e9..beacefd97 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/CoreUnitDispatcher.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/CoreUnitDispatcher.java @@ -135,7 +135,8 @@ private List dispatchJobs(DispatchHost host, Set jobs) { if (!host.hasAdditionalResources( Dispatcher.CORE_POINTS_RESERVED_MIN, Dispatcher.MEM_RESERVED_MIN, - Dispatcher.GPU_RESERVED_MIN)) { + Dispatcher.GPU_UNITS_RESERVED_MIN, + Dispatcher.MEM_GPU_RESERVED_MIN)) { return procs; } @@ -179,7 +180,8 @@ private Set getGpuJobs(DispatchHost host, ShowInterface show) { if 
(host.hasAdditionalResources( Dispatcher.CORE_POINTS_RESERVED_DEFAULT, Dispatcher.MEM_RESERVED_MIN, - 1)) { + Dispatcher.GPU_UNITS_RESERVED_DEFAULT, + Dispatcher.MEM_GPU_RESERVED_DEFAULT)) { if (show == null) jobs = dispatchSupport.findDispatchJobs(host, getIntProperty("dispatcher.job_query_max")); @@ -262,11 +264,12 @@ public List dispatchHost(DispatchHost host, JobInterface job) { if (host.idleCores < frame.minCores || host.idleMemory < frame.minMemory || - host.idleGpu < frame.minGpu) { + host.idleGpus < frame.minGpus || + host.idleGpuMemory < frame.minGpuMemory) { break; } - if (!dispatchSupport.isJobBookable(job, proc.coresReserved)) { + if (!dispatchSupport.isJobBookable(job, proc.coresReserved, proc.gpusReserved)) { break; } @@ -289,17 +292,19 @@ public void wrapDispatchFrame() { DispatchSupport.bookedProcs.getAndIncrement(); DispatchSupport.bookedCores.addAndGet(proc.coresReserved); + DispatchSupport.bookedGpus.addAndGet(proc.gpusReserved); if (host.strandedCores > 0) { dispatchSupport.pickupStrandedCores(host); break; } - host.useResources(proc.coresReserved, proc.memoryReserved, proc.gpuReserved); + host.useResources(proc.coresReserved, proc.memoryReserved, proc.gpusReserved, proc.gpuMemoryReserved); if (!host.hasAdditionalResources( Dispatcher.CORE_POINTS_RESERVED_MIN, Dispatcher.MEM_RESERVED_MIN, - Dispatcher.GPU_RESERVED_MIN)) { + Dispatcher.GPU_UNITS_RESERVED_MIN, + Dispatcher.MEM_GPU_RESERVED_MIN)) { break; } else if (procs.size() >= getIntProperty("dispatcher.job_frame_dispatch_max")) { @@ -398,8 +403,10 @@ private void dispatchSummary(VirtualProc p, DispatchFrame f, String type) { " cores / " + CueUtil.KbToMb(p.memoryReserved) + " memory / " + - p.gpuReserved + - " gpu on " + + p.gpusReserved + + " gpus / " + + CueUtil.KbToMb(p.gpuMemoryReserved) + + " gpu memory " + p.getName() + " to " + f.show + "/" + f.shot; logger.trace(msg); diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupport.java 
b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupport.java index ebdd5082d..47dac264a 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupport.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupport.java @@ -77,6 +77,11 @@ public interface DispatchSupport { */ static final AtomicLong bookedCores = new AtomicLong(0); + /** + * Long for counting how many gpus have been booked + */ + static final AtomicLong bookedGpus = new AtomicLong(0); + /** * Long for counting how many procs have been booked */ @@ -122,6 +127,16 @@ public interface DispatchSupport { */ static final AtomicLong strandedCoresCount = new AtomicLong(0); + /** + * Count number of picked up gpus. + */ + static final AtomicLong pickedUpGpusCount = new AtomicLong(0); + + /** + * Count number of stranded gpus. + */ + static final AtomicLong strandedGpusCount = new AtomicLong(0); + /** * Set the proc's frame assignment to null; * @@ -456,7 +471,7 @@ void updateProcMemoryUsage(FrameInterface frame, long rss, long maxRss, long vsi * @param job * @return */ - boolean isJobBookable(JobInterface job, int coreUnits); + boolean isJobBookable(JobInterface job, int coreUnits, int gpuUnits); /** * Return true if the specified show is at or over its @@ -511,6 +526,40 @@ void updateProcMemoryUsage(FrameInterface frame, long rss, long maxRss, long vsi */ void determineIdleCores(DispatchHost host, int load); + /** + * Pickup any gpus that were stranded on the given host. + * + * @param host + */ + void pickupStrandedGpus(DispatchHost host); + + /** + * Return true if the host has stranded gpus. + * + * @param host + * @return + */ + boolean hasStrandedGpus(HostInterface host); + + /** + * Add stranded gpus for the given host. Stranded + * gpus will automatically be added to the next frame dispatched + * from the host to make up for gpus stranded with no memory. 
+ * + * @param host + * @param gpus + */ + void strandGpus(DispatchHost host, int gpus); + + /** + * Lowers the perceived idle gpus on a machine if + * the load is over certain threshold. + * + * @param host + * @param load + */ + void determineIdleGpus(DispatchHost host, int load); + /** * Return a set of job IDs that can take the given host. * diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java index 3e3d82b2f..ad1d8196c 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java @@ -42,6 +42,7 @@ import com.imageworks.spcue.ResourceUsage; import com.imageworks.spcue.ShowInterface; import com.imageworks.spcue.StrandedCores; +import com.imageworks.spcue.StrandedGpus; import com.imageworks.spcue.VirtualProc; import com.imageworks.spcue.dao.BookingDao; import com.imageworks.spcue.dao.DispatcherDao; @@ -82,6 +83,9 @@ public class DispatchSupportService implements DispatchSupport { private ConcurrentHashMap strandedCores = new ConcurrentHashMap(); + private ConcurrentHashMap strandedGpus = + new ConcurrentHashMap(); + @Override public void pickupStrandedCores(DispatchHost host) { logger.info(host + "picked up stranded cores"); @@ -113,6 +117,35 @@ public void strandCores(DispatchHost host, int cores) { strandedCoresCount.getAndIncrement(); } + @Override + public void pickupStrandedGpus(DispatchHost host) { + logger.info(host + "picked up stranded gpu"); + pickedUpGpusCount.getAndIncrement(); + strandedGpus.remove(host.getHostId()); + } + + @Override + public boolean hasStrandedGpus(HostInterface host) { + StrandedGpus stranded = strandedGpus.get(host.getHostId()); + if (stranded == null) { + return false; + } + if (stranded.isExpired()) { + return false; + } + + return true; + } + + @Override + public void 
strandGpus(DispatchHost host, int gpus) { + logger.info(host + " found " + gpus + ", stranded gpu"); + host.strandedGpus = gpus; + strandedGpus.putIfAbsent(host.getHostId(), new StrandedGpus(gpus)); + strandedGpusCount.getAndIncrement(); + } + + @Transactional(readOnly = true) public List findNextDispatchFrames(JobInterface job, VirtualProc proc, int limit) { return dispatcherDao.findNextDispatchFrames(job, proc, limit); @@ -245,7 +278,7 @@ public boolean isJobBookable(JobInterface job) { @Override @Transactional(propagation = Propagation.REQUIRED, readOnly=true) - public boolean isJobBookable(JobInterface job, int coreUnits) { + public boolean isJobBookable(JobInterface job, int coreUnits, int gpuUnits) { if (!jobDao.hasPendingFrames(job)) { return false; @@ -255,6 +288,10 @@ public boolean isJobBookable(JobInterface job, int coreUnits) { return false; } + if (jobDao.isOverMaxGpus(job, gpuUnits)) { + return false; + } + return true; } @@ -363,6 +400,7 @@ public RunFrame prepareRqdRunFrame(VirtualProc proc, DispatchFrame frame) { .setLayerId(frame.getLayerId()) .setResourceId(proc.getProcId()) .setNumCores(proc.coresReserved) + .setNumGpus(proc.gpusReserved) .setStartTime(System.currentTimeMillis()) .setIgnoreNimby(proc.isLocalDispatch) .putAllEnvironment(jobDao.getEnvironment(frame)) @@ -370,6 +408,8 @@ public RunFrame prepareRqdRunFrame(VirtualProc proc, DispatchFrame frame) { .putEnvironment("CUE3", "1") .putEnvironment("CUE_THREADS", String.valueOf(threads)) .putEnvironment("CUE_MEMORY", String.valueOf(proc.memoryReserved)) + .putEnvironment("CUE_GPUS", String.valueOf(proc.gpusReserved)) + .putEnvironment("CUE_GPU_MEMORY", String.valueOf(proc.gpuMemoryReserved)) .putEnvironment("CUE_LOG_PATH", frame.logDir) .putEnvironment("CUE_RANGE", frame.range) .putEnvironment("CUE_CHUNK", String.valueOf(frame.chunkSize)) @@ -575,6 +615,14 @@ public void determineIdleCores(DispatchHost host, int load) { } } + @Override + public void determineIdleGpus(DispatchHost host, int 
load) { + int idleGpu = host.gpus - load; + if (idleGpu < host.idleGpus) { + host.idleGpus = idleGpu; + } + } + public DispatcherDao getDispatcherDao() { return dispatcherDao; } diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/Dispatcher.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/Dispatcher.java index d29c51f9c..3440fb595 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/Dispatcher.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/Dispatcher.java @@ -44,6 +44,10 @@ public interface Dispatcher { // The minimum amount of core points you can assign to a frame. public static final int CORE_POINTS_RESERVED_MIN = 10; + // The default and minimum number of gpu units you can assign to a frame. + public static final int GPU_UNITS_RESERVED_DEFAULT = 0; + public static final int GPU_UNITS_RESERVED_MIN = 0; + // Amount of load per core a host can have before the perceived // number of idle cores is modified to reflect load conditions // on the host. @@ -69,13 +73,13 @@ public interface Dispatcher { // The default amount of gpu memory reserved for a frame if no gpu memory // reservation settings are specified - public static final long GPU_RESERVED_DEFAULT = 0; + public static final long MEM_GPU_RESERVED_DEFAULT = 0; // The minimum amount of gpu memory that can be assigned to a frame. - public static final long GPU_RESERVED_MIN = 0; + public static final long MEM_GPU_RESERVED_MIN = 0; // The maximum amount of gpu memory that can be assigned to a frame.
- public static final long GPU_RESERVED_MAX = CueUtil.GB4; + public static final long MEM_GPU_RESERVED_MAX = CueUtil.GB * 1024; // Return value for cleared frame public static final int EXIT_STATUS_FRAME_CLEARED = 299; diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/FrameCompleteHandler.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/FrameCompleteHandler.java index fe2482720..20afd374c 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/FrameCompleteHandler.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/FrameCompleteHandler.java @@ -401,7 +401,7 @@ else if (report.getHost().getNimbyLocked()) { // Then check for higher priority jobs // If not, rebook this job if (job.autoUnbook && proc.coresReserved >= 100) { - if (jobManager.isOverMinCores(job)) { + if (jobManager.isOverMinCores(job) && jobManager.isOverMinGpus(job)) { try { boolean unbook = diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java index 69b602430..34e02021e 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java @@ -135,18 +135,6 @@ public void handleHostReport(HostReport report, boolean isBoot) { long startTime = System.currentTimeMillis(); try { - long totalGpu; - if (report.getHost().getAttributesMap().containsKey("totalGpu")) - totalGpu = Integer.parseInt(report.getHost().getAttributesMap().get("totalGpu")); - else - totalGpu = 0; - - long freeGpu; - if (report.getHost().getAttributesMap().containsKey("freeGpu")) - freeGpu = Integer.parseInt(report.getHost().getAttributesMap().get("freeGpu")); - else - freeGpu = 0; - long swapOut = 0; if (report.getHost().getAttributesMap().containsKey("swapout")) { swapOut = Integer.parseInt(report.getHost().getAttributesMap().get("swapout")); @@ -163,7 +151,7 @@ public void 
handleHostReport(HostReport report, boolean isBoot) { rhost.getTotalMem(), rhost.getFreeMem(), rhost.getTotalSwap(), rhost.getFreeSwap(), rhost.getTotalMcp(), rhost.getFreeMcp(), - totalGpu, freeGpu, + rhost.getTotalGpuMem(), rhost.getFreeGpuMem(), rhost.getLoad(), new Timestamp(rhost.getBootTime() * 1000l), rhost.getAttributesMap().get("SP_OS")); diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/LocalDispatcher.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/LocalDispatcher.java index 55497b83e..23bf6f73a 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/LocalDispatcher.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/LocalDispatcher.java @@ -111,7 +111,8 @@ private List dispatchHost(DispatchHost host, JobInterface job, */ if (!lha.hasAdditionalResources(lha.getThreads() * 100, frame.minMemory, - frame.minGpu)) { + frame.minGpus, + frame.minGpuMemory)) { continue; } @@ -141,10 +142,11 @@ private List dispatchHost(DispatchHost host, JobInterface job, * This should stay here and not go into VirtualProc * or else the count will be off if you fail to book. */ - lha.useResources(proc.coresReserved, proc.memoryReserved, proc.gpuReserved); + lha.useResources(proc.coresReserved, proc.memoryReserved, proc.gpusReserved, proc.gpuMemoryReserved); if (!lha.hasAdditionalResources(lha.getThreads() * 100, Dispatcher.MEM_RESERVED_MIN, - Dispatcher.GPU_RESERVED_MIN)) { + Dispatcher.GPU_UNITS_RESERVED_MIN, + Dispatcher.MEM_GPU_RESERVED_MIN)) { break; } @@ -196,7 +198,8 @@ private List dispatchHost(DispatchHost host, LayerInterface layer, */ if (!lha.hasAdditionalResources(lha.getThreads() * 100, frame.minMemory, - frame.minGpu)) { + frame.minGpus, + frame.minGpuMemory)) { continue; } @@ -226,10 +229,11 @@ private List dispatchHost(DispatchHost host, LayerInterface layer, * This should stay here and not go into VirtualProc * or else the count will be off if you fail to book. 
*/ - lha.useResources(proc.coresReserved, proc.memoryReserved, proc.gpuReserved); + lha.useResources(proc.coresReserved, proc.memoryReserved, proc.gpusReserved, proc.gpuMemoryReserved); if (!lha.hasAdditionalResources(100, Dispatcher.MEM_RESERVED_MIN, - Dispatcher.GPU_RESERVED_MIN)) { + Dispatcher.GPU_UNITS_RESERVED_MIN, + Dispatcher.MEM_GPU_RESERVED_MIN)) { break; } @@ -272,7 +276,8 @@ private List dispatchHost(DispatchHost host, FrameInterface frame, DispatchFrame dframe = jobManager.getDispatchFrame(frame.getId()); if (!lha.hasAdditionalResources(lha.getMaxCoreUnits(), dframe.minMemory, - dframe.minGpu)) { + lha.getMaxGpuUnits(), + dframe.minGpuMemory)) { return procs; } @@ -382,7 +387,8 @@ private void prepHost(DispatchHost host, LocalHostAssignment lha) { host.isLocalDispatch = true; host.idleCores = lha.getIdleCoreUnits(); host.idleMemory = lha.getIdleMemory(); - host.idleGpu = lha.getIdleGpu(); + host.idleGpus = lha.getIdleGpuUnits(); + host.idleGpuMemory = lha.getIdleGpuMemory(); } diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/RedirectManager.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/RedirectManager.java index a3519f10e..24b1681e9 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/RedirectManager.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/RedirectManager.java @@ -258,6 +258,8 @@ public boolean addRedirect(VirtualProc proc, GroupInterface group, DispatchHost host = hostManager.getDispatchHost(proc.getHostId()); host.idleCores = proc.coresReserved; host.idleMemory = proc.memoryReserved; + host.idleGpus = proc.gpusReserved; + host.idleGpuMemory = proc.gpuMemoryReserved; if (dispatchSupport.findDispatchJobs(host, group).size() < 1) { logger.info("Failed to find a pending job in group: " + group.getName()); diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/ResourceContainer.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/ResourceContainer.java index 
c829eb390..0d1141bc1 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/ResourceContainer.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/ResourceContainer.java @@ -27,19 +27,21 @@ public interface ResourceContainer { * * @param minCores * @param minMemory - * @param minGpu + * @param minGpus + * @param minGpuMemory * @return */ - public boolean hasAdditionalResources(int minCores, long minMemory, long minGpu); + public boolean hasAdditionalResources(int minCores, long minMemory, int minGpus, long minGpuMemory); /** * Subtract the given resources from the grand totals. * * @param coreUnits * @param memory - * @param gpu + * @param gpuUnits + * @param gpuMemory */ - public void useResources(int coreUnits, long memory, long gpu); + public void useResources(int coreUnits, long memory, int gpuUnits, long gpuMemory); } diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/commands/DispatchBookHost.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/commands/DispatchBookHost.java index c8971a0f4..594444573 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/commands/DispatchBookHost.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/commands/DispatchBookHost.java @@ -92,14 +92,16 @@ else if (job != null) { if (host.hasAdditionalResources( Dispatcher.CORE_POINTS_RESERVED_MIN, Dispatcher.MEM_RESERVED_MIN, - Dispatcher.GPU_RESERVED_MIN)) { + Dispatcher.GPU_UNITS_RESERVED_MIN, + Dispatcher.MEM_GPU_RESERVED_MIN)) { dispatcher.dispatchHost(host); } if (host.hasAdditionalResources( Dispatcher.CORE_POINTS_RESERVED_MIN, Dispatcher.MEM_RESERVED_MIN, - Dispatcher.GPU_RESERVED_MIN)) { + Dispatcher.GPU_UNITS_RESERVED_MIN, + Dispatcher.MEM_GPU_RESERVED_MIN)) { dispatcher.dispatchHostToAllShows(host); } } diff --git a/cuebot/src/main/java/com/imageworks/spcue/servant/ManageFrame.java b/cuebot/src/main/java/com/imageworks/spcue/servant/ManageFrame.java index 7cc811cfb..27511fb94 100644 --- 
a/cuebot/src/main/java/com/imageworks/spcue/servant/ManageFrame.java +++ b/cuebot/src/main/java/com/imageworks/spcue/servant/ManageFrame.java @@ -269,7 +269,7 @@ public void addRenderPartition(FrameAddRenderPartitionRequest request, lha.setThreads(request.getThreads()); lha.setMaxCoreUnits(request.getMaxCores() * 100); lha.setMaxMemory(request.getMaxMemory()); - lha.setMaxGpu(request.getMaxGpu()); + lha.setMaxGpuMemory(request.getMaxGpuMemory()); lha.setType(RenderPartitionType.FRAME_PARTITION); if (localBookingSupport.bookLocal(frame, request.getHost(), request.getUsername(), lha)) { diff --git a/cuebot/src/main/java/com/imageworks/spcue/servant/ManageGroup.java b/cuebot/src/main/java/com/imageworks/spcue/servant/ManageGroup.java index 8fd7e10ce..b8f3cd43e 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/servant/ManageGroup.java +++ b/cuebot/src/main/java/com/imageworks/spcue/servant/ManageGroup.java @@ -55,6 +55,10 @@ import com.imageworks.spcue.grpc.job.GroupSetDefJobMaxCoresResponse; import com.imageworks.spcue.grpc.job.GroupSetDefJobMinCoresRequest; import com.imageworks.spcue.grpc.job.GroupSetDefJobMinCoresResponse; +import com.imageworks.spcue.grpc.job.GroupSetDefJobMaxGpusRequest; +import com.imageworks.spcue.grpc.job.GroupSetDefJobMaxGpusResponse; +import com.imageworks.spcue.grpc.job.GroupSetDefJobMinGpusRequest; +import com.imageworks.spcue.grpc.job.GroupSetDefJobMinGpusResponse; import com.imageworks.spcue.grpc.job.GroupSetDefJobPriorityRequest; import com.imageworks.spcue.grpc.job.GroupSetDefJobPriorityResponse; import com.imageworks.spcue.grpc.job.GroupSetDeptRequest; @@ -65,6 +69,10 @@ import com.imageworks.spcue.grpc.job.GroupSetMaxCoresResponse; import com.imageworks.spcue.grpc.job.GroupSetMinCoresRequest; import com.imageworks.spcue.grpc.job.GroupSetMinCoresResponse; +import com.imageworks.spcue.grpc.job.GroupSetMaxGpusRequest; +import com.imageworks.spcue.grpc.job.GroupSetMaxGpusResponse; +import 
com.imageworks.spcue.grpc.job.GroupSetMinGpusRequest; +import com.imageworks.spcue.grpc.job.GroupSetMinGpusResponse; import com.imageworks.spcue.grpc.job.GroupSetNameRequest; import com.imageworks.spcue.grpc.job.GroupSetNameResponse; import com.imageworks.spcue.grpc.job.Job; @@ -189,6 +197,24 @@ public void setDefaultJobMinCores(GroupSetDefJobMinCoresRequest request, StreamO responseObserver.onCompleted(); } + @Override + public void setDefaultJobMaxGpus(GroupSetDefJobMaxGpusRequest request, + StreamObserver responseObserver) { + GroupInterface group = getGroupInterface(request.getGroup()); + groupManager.setGroupDefaultJobMaxGpus(group, request.getMaxGpus()); + responseObserver.onNext(GroupSetDefJobMaxGpusResponse.newBuilder().build()); + responseObserver.onCompleted(); + } + + @Override + public void setDefaultJobMinGpus(GroupSetDefJobMinGpusRequest request, + StreamObserver responseObserver) { + GroupInterface group = getGroupInterface(request.getGroup()); + groupManager.setGroupDefaultJobMinGpus(group, request.getMinGpus()); + responseObserver.onNext(GroupSetDefJobMinGpusResponse.newBuilder().build()); + responseObserver.onCompleted(); + } + @Override public void setName(GroupSetNameRequest request, StreamObserver responseObserver) { GroupInterface group = getGroupInterface(request.getGroup()); @@ -262,6 +288,24 @@ public void setMinCores(GroupSetMinCoresRequest request, responseObserver.onCompleted(); } + @Override + public void setMaxGpus(GroupSetMaxGpusRequest request, + StreamObserver responseObserver) { + GroupInterface group = getGroupInterface(request.getGroup()); + groupManager.setGroupMaxGpus(group, request.getMaxGpus()); + responseObserver.onNext(GroupSetMaxGpusResponse.newBuilder().build()); + responseObserver.onCompleted(); + } + + @Override + public void setMinGpus(GroupSetMinGpusRequest request, + StreamObserver responseObserver) { + GroupInterface group = getGroupInterface(request.getGroup()); + groupManager.setGroupMinGpus(group, 
request.getMinGpus()); + responseObserver.onNext(GroupSetMinGpusResponse.newBuilder().build()); + responseObserver.onCompleted(); + } + public GroupDao getGroupDao() { return groupDao; } diff --git a/cuebot/src/main/java/com/imageworks/spcue/servant/ManageJob.java b/cuebot/src/main/java/com/imageworks/spcue/servant/ManageJob.java index dd00476cd..c13177a74 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/servant/ManageJob.java +++ b/cuebot/src/main/java/com/imageworks/spcue/servant/ManageJob.java @@ -124,10 +124,14 @@ import com.imageworks.spcue.grpc.job.JobSetGroupResponse; import com.imageworks.spcue.grpc.job.JobSetMaxCoresRequest; import com.imageworks.spcue.grpc.job.JobSetMaxCoresResponse; +import com.imageworks.spcue.grpc.job.JobSetMaxGpusRequest; +import com.imageworks.spcue.grpc.job.JobSetMaxGpusResponse; import com.imageworks.spcue.grpc.job.JobSetMaxRetriesRequest; import com.imageworks.spcue.grpc.job.JobSetMaxRetriesResponse; import com.imageworks.spcue.grpc.job.JobSetMinCoresRequest; import com.imageworks.spcue.grpc.job.JobSetMinCoresResponse; +import com.imageworks.spcue.grpc.job.JobSetMinGpusRequest; +import com.imageworks.spcue.grpc.job.JobSetMinGpusResponse; import com.imageworks.spcue.grpc.job.JobSetPriorityRequest; import com.imageworks.spcue.grpc.job.JobSetPriorityResponse; import com.imageworks.spcue.grpc.job.JobStaggerFramesRequest; @@ -376,6 +380,36 @@ public void setMinCores(JobSetMinCoresRequest request, StreamObserver responseObserver) { + try{ + setupJobData(request.getJob()); + jobDao.updateMaxGpus(job, request.getVal()); + responseObserver.onNext(JobSetMaxGpusResponse.newBuilder().build()); + responseObserver.onCompleted(); + } + catch (EmptyResultDataAccessException e) { + responseObserver.onError(Status.INTERNAL + .withDescription("Failed to find job data") + .asRuntimeException()); + } + } + + @Override + public void setMinGpus(JobSetMinGpusRequest request, StreamObserver responseObserver) { + try{ + 
setupJobData(request.getJob()); + jobDao.updateMinGpus(job, request.getVal()); + responseObserver.onNext(JobSetMinGpusResponse.newBuilder().build()); + responseObserver.onCompleted(); + } + catch (EmptyResultDataAccessException e) { + responseObserver.onError(Status.INTERNAL + .withDescription("Failed to find job data") + .asRuntimeException()); + } + } + @Override public void setPriority(JobSetPriorityRequest request, StreamObserver responseObserver) { try{ @@ -772,7 +806,8 @@ public void addRenderPartition(JobAddRenderPartRequest request, StreamObserver responseObserver) { + updateLayer(request.getLayer()); + jobManager.setLayerMinGpus(layer, request.getMinGpus()); + responseObserver.onNext(LayerSetMinGpusResponse.newBuilder().build()); + responseObserver.onCompleted(); + } + @Override public void setMinMemory(LayerSetMinMemoryRequest request, StreamObserver responseObserver) { updateLayer(request.getLayer()); @@ -234,10 +246,11 @@ public void setMinMemory(LayerSetMinMemoryRequest request, StreamObserver responseObserver) { + public void setMinGpuMemory(LayerSetMinGpuMemoryRequest request, + StreamObserver responseObserver) { updateLayer(request.getLayer()); - layerDao.updateLayerMinGpu(layer, request.getGpu()); - responseObserver.onNext(LayerSetMinGpuResponse.newBuilder().build()); + layerDao.updateLayerMinGpuMemory(layer, request.getGpuMemory()); + responseObserver.onNext(LayerSetMinGpuMemoryResponse.newBuilder().build()); responseObserver.onCompleted(); } @@ -388,7 +401,8 @@ public void addRenderPartition(LayerAddRenderPartitionRequest request, lha.setThreads(request.getThreads()); lha.setMaxCoreUnits(request.getMaxCores() * 100); lha.setMaxMemory(request.getMaxMemory()); - lha.setMaxGpu(request.getMaxGpu()); + lha.setMaxGpuUnits(request.getMaxGpus()); + lha.setMaxGpuMemory(request.getMaxGpuMemory()); lha.setType(RenderPartitionType.LAYER_PARTITION); if (localBookingSupport.bookLocal(layer, request.getHost(), request.getUsername(), lha)) { RenderPartition 
partition = whiteboard.getRenderPartition(lha); @@ -449,6 +463,14 @@ public void setMaxCores(LayerSetMaxCoresRequest request, StreamObserver responseObserver) { + updateLayer(request.getLayer()); + jobManager.setLayerMaxGpus(layer, request.getMaxGpus()); + responseObserver.onNext(LayerSetMaxGpusResponse.newBuilder().build()); + responseObserver.onCompleted(); + } + public DependManager getDependManager() { return dependManager; } diff --git a/cuebot/src/main/java/com/imageworks/spcue/servant/ManageRenderPartition.java b/cuebot/src/main/java/com/imageworks/spcue/servant/ManageRenderPartition.java index 88e53ee95..413f1982c 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/servant/ManageRenderPartition.java +++ b/cuebot/src/main/java/com/imageworks/spcue/servant/ManageRenderPartition.java @@ -45,7 +45,7 @@ public void delete(RenderPartDeleteRequest request, StreamObserver responseObserver) { LocalHostAssignment localJobAssign = getLocalHostAssignment(request.getRenderPartition()); - bookingManager.setMaxResources(localJobAssign, request.getCores(), request.getMemory(), request.getGpu()); + bookingManager.setMaxResources(localJobAssign, request.getCores(), request.getMemory(), request.getGpus(), request.getGpuMemory()); responseObserver.onNext(RenderPartSetMaxResourcesResponse.newBuilder().build()); responseObserver.onCompleted(); } diff --git a/cuebot/src/main/java/com/imageworks/spcue/servant/ManageService.java b/cuebot/src/main/java/com/imageworks/spcue/servant/ManageService.java index eae767006..70a15f3bf 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/servant/ManageService.java +++ b/cuebot/src/main/java/com/imageworks/spcue/servant/ManageService.java @@ -55,7 +55,9 @@ public void createService(ServiceCreateServiceRequest request, service.minCores = request.getData().getMinCores(); service.maxCores = request.getData().getMaxCores(); service.minMemory = request.getData().getMinMemory(); - service.minGpu = request.getData().getMinGpu(); + 
service.minGpus = request.getData().getMinGpus(); + service.maxGpus = request.getData().getMaxGpus(); + service.minGpuMemory = request.getData().getMinGpuMemory(); service.tags = Sets.newLinkedHashSet(request.getData().getTagsList()); service.threadable = request.getData().getThreadable(); service.timeout = request.getData().getTimeout(); @@ -129,7 +131,9 @@ private ServiceEntity toServiceEntity(Service service) { entity.minCores = service.getMinCores(); entity.maxCores = service.getMaxCores(); entity.minMemory = service.getMinMemory(); - entity.minGpu = service.getMinGpu(); + entity.minGpus = service.getMinGpus(); + entity.maxGpus = service.getMaxGpus(); + entity.minGpuMemory = service.getMinGpuMemory(); entity.tags = new LinkedHashSet<> (service.getTagsList()); entity.threadable = service.getThreadable(); entity.timeout = service.getTimeout(); diff --git a/cuebot/src/main/java/com/imageworks/spcue/servant/ManageServiceOverride.java b/cuebot/src/main/java/com/imageworks/spcue/servant/ManageServiceOverride.java index bd90575b5..ed3d46107 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/servant/ManageServiceOverride.java +++ b/cuebot/src/main/java/com/imageworks/spcue/servant/ManageServiceOverride.java @@ -67,7 +67,9 @@ private ServiceEntity toServiceEntity(Service service) { entity.minCores = service.getMinCores(); entity.maxCores = service.getMaxCores(); entity.minMemory = service.getMinMemory(); - entity.minGpu = service.getMinGpu(); + entity.minGpus = service.getMinGpus(); + entity.maxGpus = service.getMaxGpus(); + entity.minGpuMemory = service.getMinGpuMemory(); entity.tags = new LinkedHashSet<>(service.getTagsList()); entity.threadable = service.getThreadable(); entity.timeout = service.getTimeout(); diff --git a/cuebot/src/main/java/com/imageworks/spcue/servant/ManageShow.java b/cuebot/src/main/java/com/imageworks/spcue/servant/ManageShow.java index 6dd94a5ff..6e5fbcbe8 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/servant/ManageShow.java +++ 
b/cuebot/src/main/java/com/imageworks/spcue/servant/ManageShow.java @@ -93,6 +93,10 @@ import com.imageworks.spcue.grpc.show.ShowSetDefaultMaxCoresResponse; import com.imageworks.spcue.grpc.show.ShowSetDefaultMinCoresRequest; import com.imageworks.spcue.grpc.show.ShowSetDefaultMinCoresResponse; +import com.imageworks.spcue.grpc.show.ShowSetDefaultMaxGpusRequest; +import com.imageworks.spcue.grpc.show.ShowSetDefaultMaxGpusResponse; +import com.imageworks.spcue.grpc.show.ShowSetDefaultMinGpusRequest; +import com.imageworks.spcue.grpc.show.ShowSetDefaultMinGpusResponse; import com.imageworks.spcue.grpc.subscription.Subscription; import com.imageworks.spcue.grpc.subscription.SubscriptionSeq; import com.imageworks.spcue.service.AdminManager; @@ -257,6 +261,24 @@ public void setDefaultMinCores(ShowSetDefaultMinCoresRequest request, responseObserver.onCompleted(); } + @Override + public void setDefaultMaxGpus(ShowSetDefaultMaxGpusRequest request, + StreamObserver responseObserver) { + ShowEntity show = getShowEntity(request.getShow()); + showDao.updateShowDefaultMaxGpus(show, request.getMaxGpus()); + responseObserver.onNext(ShowSetDefaultMaxGpusResponse.newBuilder().build()); + responseObserver.onCompleted(); + } + + @Override + public void setDefaultMinGpus(ShowSetDefaultMinGpusRequest request, + StreamObserver responseObserver) { + ShowEntity show = getShowEntity(request.getShow()); + showDao.updateShowDefaultMinGpus(show, request.getMinGpus()); + responseObserver.onNext(ShowSetDefaultMinGpusResponse.newBuilder().build()); + responseObserver.onCompleted(); + } + @Override public void findFilter(ShowFindFilterRequest request, StreamObserver responseObserver) { @@ -361,7 +383,9 @@ public void createServiceOverride(ShowCreateServiceOverrideRequest request, service.minCores = requestService.getMinCores(); service.maxCores = requestService.getMaxCores(); service.minMemory = requestService.getMinMemory(); - service.minGpu = requestService.getMinGpu(); + service.minGpus = 
requestService.getMinGpus(); + service.maxGpus = requestService.getMaxGpus(); + service.minGpuMemory = requestService.getMinGpuMemory(); service.tags = Sets.newLinkedHashSet(requestService.getTagsList()); service.threadable = requestService.getThreadable(); serviceManager.createService(service); diff --git a/cuebot/src/main/java/com/imageworks/spcue/service/BookingManager.java b/cuebot/src/main/java/com/imageworks/spcue/service/BookingManager.java index 6c348eeb4..b5cfb8455 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/service/BookingManager.java +++ b/cuebot/src/main/java/com/imageworks/spcue/service/BookingManager.java @@ -122,9 +122,10 @@ public void createLocalHostAssignment(DispatchHost host, * @param l * @param maxCoreUnits * @param maxMemory - * @param maxGpu + * @param maxGpuUnits + * @param maxGpuMemory */ - void setMaxResources(LocalHostAssignment l, int maxCoreUnits, long maxMemory, long maxGpu); + void setMaxResources(LocalHostAssignment l, int maxCoreUnits, long maxMemory, int maxGpuUnits, long maxGpuMemory); /** * Remove a LocalHostAssignment if there are no procs assigned to it. 
diff --git a/cuebot/src/main/java/com/imageworks/spcue/service/BookingManagerService.java b/cuebot/src/main/java/com/imageworks/spcue/service/BookingManagerService.java index 02b2fe948..1a2b6cee2 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/service/BookingManagerService.java +++ b/cuebot/src/main/java/com/imageworks/spcue/service/BookingManagerService.java @@ -72,7 +72,7 @@ public boolean hasActiveLocalFrames(HostInterface host) { @Override public void setMaxResources(LocalHostAssignment l, int maxCoreUnits, - long maxMemory, long maxGpu) { + long maxMemory, int maxGpuUnits, long maxGpuMemory) { HostInterface host = hostDao.getHost(l.getHostId()); @@ -84,8 +84,12 @@ public void setMaxResources(LocalHostAssignment l, int maxCoreUnits, bookingDao.updateMaxMemory(l, maxMemory); } - if (maxGpu > 0) { - bookingDao.updateMaxGpu(l, maxGpu); + if (maxGpuUnits > 0) { + bookingDao.updateMaxGpus(l, maxGpuUnits); + } + + if (maxGpuMemory > 0) { + bookingDao.updateMaxGpuMemory(l, maxGpuMemory); } } diff --git a/cuebot/src/main/java/com/imageworks/spcue/service/GroupManager.java b/cuebot/src/main/java/com/imageworks/spcue/service/GroupManager.java index 9017304f8..2e3cf70be 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/service/GroupManager.java +++ b/cuebot/src/main/java/com/imageworks/spcue/service/GroupManager.java @@ -34,6 +34,10 @@ public interface GroupManager { void setGroupMinCores(GroupInterface g, int coreUnits); void setGroupDefaultJobMinCores(GroupInterface g, int coreUnits); void setGroupDefaultJobMaxCores(GroupInterface g, int coreUnits); + void setGroupMaxGpus(GroupInterface g, int gpuUnits); + void setGroupMinGpus(GroupInterface g, int gpuUnits); + void setGroupDefaultJobMinGpus(GroupInterface g, int gpuUnits); + void setGroupDefaultJobMaxGpus(GroupInterface g, int gpuUnits); void setGroupDefaultJobPriority(GroupInterface g, int priority); /** diff --git a/cuebot/src/main/java/com/imageworks/spcue/service/GroupManagerService.java 
b/cuebot/src/main/java/com/imageworks/spcue/service/GroupManagerService.java index 7e785c0ea..89fc25193 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/service/GroupManagerService.java +++ b/cuebot/src/main/java/com/imageworks/spcue/service/GroupManagerService.java @@ -78,6 +78,32 @@ public void setGroupMinCores(GroupInterface g, int coreUnits) { groupDao.updateMinCores(g,coreUnits); } + @Override + public void setGroupDefaultJobMaxGpus(GroupInterface g, int gpuUnits) { + groupDao.updateDefaultJobMaxGpus(g,gpuUnits); + if (gpuUnits != CueUtil.FEATURE_DISABLED && !groupDao.isManaged(g)) { + jobDao.updateMaxGpus(g, gpuUnits); + } + } + + @Override + public void setGroupDefaultJobMinGpus(GroupInterface g, int gpuUnits) { + groupDao.updateDefaultJobMinGpus(g,gpuUnits); + if (gpuUnits != CueUtil.FEATURE_DISABLED && !groupDao.isManaged(g)) { + jobDao.updateMinGpus(g, gpuUnits); + } + } + + @Override + public void setGroupMaxGpus(GroupInterface g, int gpuUnits) { + groupDao.updateMaxGpus(g, gpuUnits); + } + + @Override + public void setGroupMinGpus(GroupInterface g, int gpuUnits) { + groupDao.updateMinGpus(g, gpuUnits); + } + @Override public void setGroupParent(GroupInterface group, GroupInterface newParent) { groupDao.updateGroupParent(group, newParent); diff --git a/cuebot/src/main/java/com/imageworks/spcue/service/HostManager.java b/cuebot/src/main/java/com/imageworks/spcue/service/HostManager.java index 19704e65e..8b176c77e 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/service/HostManager.java +++ b/cuebot/src/main/java/com/imageworks/spcue/service/HostManager.java @@ -112,8 +112,8 @@ public interface HostManager { * @param freeSwap * @param totalMcp * @param freeMcp - * @param totalGpu - * @param freeGpu + * @param totalGpuMemory + * @param freeGpuMemory * @param load * @param bootTime * @param os @@ -122,7 +122,7 @@ void setHostStatistics(HostInterface host, long totalMemory, long freeMemory, long totalSwap, long freeSwap, long totalMcp, long 
freeMcp, - long totalGpu, long freeGpu, + long totalGpuMemory, long freeGpuMemory, int load, Timestamp bootTime, String os); @@ -212,6 +212,11 @@ void setHostStatistics(HostInterface host, */ int getStrandedCoreUnits(HostInterface h); + /** + * Return the number of stranded gpus on the host. + */ + int getStrandedGpuUnits(HostInterface h); + /** * Return true of the host prefers a particular show. * diff --git a/cuebot/src/main/java/com/imageworks/spcue/service/HostManagerService.java b/cuebot/src/main/java/com/imageworks/spcue/service/HostManagerService.java index 9bbaaa6e4..ccd355889 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/service/HostManagerService.java +++ b/cuebot/src/main/java/com/imageworks/spcue/service/HostManagerService.java @@ -124,7 +124,7 @@ public void setHostStatistics(HostInterface host, long totalMemory, long freeMemory, long totalSwap, long freeSwap, long totalMcp, long freeMcp, - long totalGpu, long freeGpu, + long totalGpuMemory, long freeGpuMemory, int load, Timestamp bootTime, String os) { @@ -132,7 +132,7 @@ public void setHostStatistics(HostInterface host, totalMemory, freeMemory, totalSwap, freeSwap, totalMcp, freeMcp, - totalGpu, freeGpu, + totalGpuMemory, freeGpuMemory, load, bootTime, os); } @@ -268,6 +268,12 @@ public int getStrandedCoreUnits(HostInterface h) { return hostDao.getStrandedCoreUnits(h); } + @Override + @Transactional(propagation = Propagation.REQUIRED, readOnly=true) + public int getStrandedGpuUnits(HostInterface h) { + return hostDao.getStrandedGpus(h); + } + @Override @Transactional(propagation = Propagation.REQUIRED, readOnly=true) public boolean verifyRunningProc(String procId, String frameId) { diff --git a/cuebot/src/main/java/com/imageworks/spcue/service/JobLauncher.java b/cuebot/src/main/java/com/imageworks/spcue/service/JobLauncher.java index e84bebb16..14f2a5741 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/service/JobLauncher.java +++
b/cuebot/src/main/java/com/imageworks/spcue/service/JobLauncher.java @@ -109,7 +109,8 @@ public void launch(final JobSpec spec) { lha.setThreads(d.localThreadNumber); lha.setMaxCoreUnits(d.localMaxCores * 100); lha.setMaxMemory(d.localMaxMemory); - lha.setMaxGpu(d.localMaxGpu); + lha.setMaxGpuUnits(d.localMaxGpus); + lha.setMaxGpuMemory(d.localMaxGpuMemory); lha.setType(RenderPartitionType.JOB_PARTITION); try { diff --git a/cuebot/src/main/java/com/imageworks/spcue/service/JobManager.java b/cuebot/src/main/java/com/imageworks/spcue/service/JobManager.java index 4ab1b2120..6ab4bb38e 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/service/JobManager.java +++ b/cuebot/src/main/java/com/imageworks/spcue/service/JobManager.java @@ -292,6 +292,14 @@ public interface JobManager { */ boolean isOverMinCores(JobInterface job); + /** + * Return true if the given job is booked greater than min gpus. + * + * @param job + * @return + */ + boolean isOverMinGpus(JobInterface job); + /** * Increase the layer memory requirement to given KB value. * @@ -453,6 +461,22 @@ public interface JobManager { */ void setLayerMinCores(LayerInterface layer, int coreUnits); + /** + * Update the max gpu value for the given layer. + * + * @param layer + * @param gpuUnits + */ + void setLayerMaxGpus(LayerInterface layer, int gpuUnits); + + /** + * Update the min gpu value for the given layer. + * + * @param layer + * @param gpuUnits + */ + void setLayerMinGpus(LayerInterface layer, int gpuUnits); + /** * Add a limit to the given layer. 
* diff --git a/cuebot/src/main/java/com/imageworks/spcue/service/JobManagerService.java b/cuebot/src/main/java/com/imageworks/spcue/service/JobManagerService.java index 68821ed64..a4f6f1ebb 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/service/JobManagerService.java +++ b/cuebot/src/main/java/com/imageworks/spcue/service/JobManagerService.java @@ -135,6 +135,12 @@ public boolean isOverMinCores(JobInterface job) { return jobDao.isOverMinCores(job); } + @Override + @Transactional(propagation = Propagation.REQUIRED, readOnly=true) + public boolean isOverMinGpus(JobInterface job) { + return jobDao.isOverMinGpus(job); + } + @Transactional(propagation = Propagation.REQUIRED, readOnly=true) public DispatchJob getDispatchJob(String id) { return jobDao.getDispatchJob(id); @@ -450,6 +456,16 @@ public void setLayerMaxCores(LayerInterface layer, int coreUnits) { layerDao.updateLayerMaxCores(layer, coreUnits); } + @Override + public void setLayerMinGpus(LayerInterface layer, int gpu) { + layerDao.updateLayerMinGpus(layer, gpu); + } + + @Override + public void setLayerMaxGpus(LayerInterface layer, int gpu) { + layerDao.updateLayerMaxGpus(layer, gpu); + } + @Override public void addLayerLimit(LayerInterface layer, String limitId) { layerDao.addLimit(layer, limitId); diff --git a/cuebot/src/main/java/com/imageworks/spcue/service/JobSpec.java b/cuebot/src/main/java/com/imageworks/spcue/service/JobSpec.java index 1e498eaf4..30bf7cfd3 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/service/JobSpec.java +++ b/cuebot/src/main/java/com/imageworks/spcue/service/JobSpec.java @@ -286,11 +286,17 @@ private BuildableJob handleJobTag(Element jobTag) { if (local.getAttributeValue("cores") != null) job.localMaxCores = Integer.parseInt(local.getAttributeValue("cores")); if (local.getAttributeValue("memory") != null) - job.localMaxMemory = Integer.parseInt(local.getAttributeValue("memory")); + job.localMaxMemory = Long.parseLong(local.getAttributeValue("memory")); if 
(local.getAttributeValue("threads") != null) job.localThreadNumber = Integer.parseInt(local.getAttributeValue("threads")); - if (local.getAttributeValue("gpu") != null) - job.localMaxGpu = Integer.parseInt(local.getAttributeValue("gpu")); + if (local.getAttributeValue("gpus") != null) + job.localMaxGpus = Integer.parseInt(local.getAttributeValue("gpus")); + if (local.getAttributeValue("gpu") != null) { + logger.warn(job.name + " localbook has the deprecated gpu. Use gpu_memory."); + job.localMaxGpuMemory = Long.parseLong(local.getAttributeValue("gpu")); + } + if (local.getAttributeValue("gpu_memory") != null) + job.localMaxGpuMemory = Long.parseLong(local.getAttributeValue("gpu_memory")); } job.maxCoreUnits = 20000; @@ -423,11 +429,12 @@ private void handleLayerTags(BuildableJob buildableJob, Element jobTag) { determineResourceDefaults(layerTag, buildableJob, layer); determineChunkSize(layerTag, layer); determineMinimumCores(layerTag, layer); + determineMinimumGpus(layerTag, layer); determineThreadable(layerTag, layer); determineTags(buildableJob, layer, layerTag); determineMinimumMemory(buildableJob, layerTag, layer, buildableLayer); - determineMinimumGpu(buildableJob, layerTag, layer); + determineMinimumGpuMemory(buildableJob, layerTag, layer); // set a timeout value on the layer if (layerTag.getChildTextTrim("timeout") != null) { @@ -521,44 +528,53 @@ else if (minMemory < Dispatcher.MEM_RESERVED_MIN) { } /** - * If the gpu option is set, set minimumGpu to that supplied value + * If the gpu_memory option is set, set minimumGpuMemory to that supplied value * * @param layerTag * @param layer */ - private void determineMinimumGpu(BuildableJob buildableJob, Element layerTag, + private void determineMinimumGpuMemory(BuildableJob buildableJob, Element layerTag, LayerDetail layer) { - if (layerTag.getChildTextTrim("gpu") == null) { + String gpu = layerTag.getChildTextTrim("gpu"); + String gpuMemory = layerTag.getChildTextTrim("gpu_memory"); + if (gpu == null && 
gpuMemory == null) { return; } - long minGpu; - String memory = layerTag.getChildTextTrim("gpu").toLowerCase(); + String memory = null; + if (gpu != null) { + logger.warn(buildableJob.detail.name + "/" + layer.name + + " has the deprecated gpu. Use gpu_memory."); + memory = gpu.toLowerCase(); + } + if (gpuMemory != null) + memory = gpuMemory.toLowerCase(); + long minGpuMemory; try { - minGpu = convertMemoryInput(memory); + minGpuMemory = convertMemoryInput(memory); // Some quick sanity checks to make sure gpu memory hasn't gone // over or under reasonable defaults. - if (minGpu> Dispatcher.GPU_RESERVED_MAX) { + if (minGpuMemory > Dispatcher.MEM_GPU_RESERVED_MAX) { throw new SpecBuilderException("Gpu memory requirements exceed " + "maximum. Are you specifying the correct units?"); } - else if (minGpu < Dispatcher.GPU_RESERVED_MIN) { + else if (minGpuMemory < Dispatcher.MEM_GPU_RESERVED_MIN) { logger.warn(buildableJob.detail.name + "/" + layer.name + "Specified too little gpu memory, defaulting to: " + - Dispatcher.GPU_RESERVED_MIN); - minGpu = Dispatcher.GPU_RESERVED_MIN; + Dispatcher.MEM_GPU_RESERVED_MIN); + minGpuMemory = Dispatcher.MEM_GPU_RESERVED_MIN; } - layer.minimumGpu = minGpu; + layer.minimumGpuMemory = minGpuMemory; } catch (Exception e) { logger.info("Error setting gpu memory for " + buildableJob.detail.name + "/" + layer.name + " failed, reason: " + e + ". Using default."); - layer.minimumGpu = Dispatcher.GPU_RESERVED_DEFAULT; + layer.minimumGpuMemory = Dispatcher.MEM_GPU_RESERVED_DEFAULT; } } @@ -598,6 +614,20 @@ private void determineMinimumCores(Element layerTag, LayerDetail layer) { layer.minimumCores = corePoints; } + /** + * Gpu is a int. 
+ * + * If no gpus element is present, layer.minimumGpus is left unchanged + * (this method only assigns it when the element exists). + */ + private void determineMinimumGpus(Element layerTag, LayerDetail layer) { + + String gpus = layerTag.getChildTextTrim("gpus"); + if (gpus != null) { + layer.minimumGpus = Integer.valueOf(gpus); + } + } + private void determineChunkSize(Element layerTag, LayerDetail layer) { layer.chunkSize = Integer.parseInt(layerTag.getChildTextTrim("chunk")); } @@ -702,7 +732,9 @@ private void determineResourceDefaults(Element layerTag, layer.maximumCores = primaryService.maxCores; layer.minimumCores = primaryService.minCores; layer.minimumMemory = primaryService.minMemory; - layer.minimumGpu = primaryService.minGpu; + layer.maximumGpus = primaryService.maxGpus; + layer.minimumGpus = primaryService.minGpus; + layer.minimumGpuMemory = primaryService.minGpuMemory; layer.tags.addAll(primaryService.tags); layer.services.addAll(services); layer.limits.addAll(limits); diff --git a/cuebot/src/main/java/com/imageworks/spcue/util/CueUtil.java b/cuebot/src/main/java/com/imageworks/spcue/util/CueUtil.java index a7d89e7ee..f8b028cfa 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/util/CueUtil.java +++ b/cuebot/src/main/java/com/imageworks/spcue/util/CueUtil.java @@ -207,11 +207,11 @@ public static final String KbToMb(long kb) { return String.format("%dMB", kb / 1024); } - public static final long convertKbToFakeKb64bit(int Kb) { + public static final long convertKbToFakeKb64bit(long Kb) { return (long) (Math.ceil((Kb * 0.0009765625) * 0.0009765625) * 1048576) - Dispatcher.MEM_RESERVED_SYSTEM; } - public static final long convertKbToFakeKb32bit(int Kb) { + public static final long convertKbToFakeKb32bit(long Kb) { return (long) (Math.floor((Kb * 0.0009765625) * 0.0009765625) * 1048576) - Dispatcher.MEM_RESERVED_SYSTEM; } @@ -235,10 +235,10 @@ public final static String buildFrameName(LayerInterface layer, int num) { return String.format("%04d-%s", num, layer.getName()); } - public
final static String buildProcName(String host, int cores) { - return String.format(Locale.ROOT, "%s/%4.2f", host, Convert.coreUnitsToCores(cores)); - + public final static String buildProcName(String host, int cores, int gpus) { + return String.format(Locale.ROOT, "%s/%4.2f/%d", host, Convert.coreUnitsToCores(cores), gpus); } + /** * for logging how long an operation took * diff --git a/cuebot/src/main/resources/conf/ddl/postgres/migrations/V11__Support_multiple_GPU.sql b/cuebot/src/main/resources/conf/ddl/postgres/migrations/V11__Support_multiple_GPU.sql new file mode 100644 index 000000000..acb292586 --- /dev/null +++ b/cuebot/src/main/resources/conf/ddl/postgres/migrations/V11__Support_multiple_GPU.sql @@ -0,0 +1,1078 @@ +-- Support multiple GPU + +-- frame_history + +ALTER TABLE frame_history ADD COLUMN int_gpus INT DEFAULT 0 NOT NULL; +ALTER TABLE frame_history ADD COLUMN int_gpu_mem_reserved BIGINT DEFAULT 0 NOT NULL; +ALTER TABLE frame_history ADD COLUMN int_gpu_mem_max_used BIGINT DEFAULT 0 NOT NULL; + + +-- show_service + +ALTER TABLE show_service RENAME COLUMN int_gpu_min TO int_gpu_mem_min; +ALTER TABLE show_service ALTER COLUMN int_gpu_mem_min TYPE BIGINT; +ALTER TABLE show_service ADD COLUMN int_gpus_min INT DEFAULT 0 NOT NULL; +ALTER TABLE show_service ADD COLUMN int_gpus_max INT DEFAULT 0 NOT NULL; + +ALTER INDEX i_show_service_int_gpu_min RENAME TO i_show_service_int_gpu_mem_min;; +CREATE INDEX i_show_service_int_gpus_min ON show_service (int_gpus_min); + + +-- host_local + +DROP TRIGGER verify_host_local ON host_local; +ALTER TABLE host_local ALTER COLUMN int_mem_max TYPE BIGINT; +ALTER TABLE host_local ALTER COLUMN int_mem_idle TYPE BIGINT; +ALTER TABLE host_local RENAME COLUMN int_gpu_idle TO int_gpu_mem_idle; +ALTER TABLE host_local ALTER COLUMN int_gpu_mem_idle TYPE BIGINT; +ALTER TABLE host_local RENAME COLUMN int_gpu_max TO int_gpu_mem_max; +ALTER TABLE host_local ALTER COLUMN int_gpu_mem_max TYPE BIGINT; +ALTER TABLE host_local ADD COLUMN 
int_gpus_idle INT DEFAULT 0 NOT NULL; +ALTER TABLE host_local ADD COLUMN int_gpus_max INT DEFAULT 0 NOT NULL; + +CREATE INDEX i_host_local_int_gpus_idle ON host_local (int_gpus_idle); +CREATE INDEX i_host_local_int_gpus_max ON host_local (int_gpus_max); + + +-- service + +ALTER TABLE service RENAME COLUMN int_gpu_min TO int_gpu_mem_min; +ALTER TABLE service ALTER COLUMN int_gpu_mem_min TYPE BIGINT; +ALTER TABLE service ADD COLUMN int_gpus_min INT DEFAULT 0 NOT NULL; +ALTER TABLE service ADD COLUMN int_gpus_max INT DEFAULT 0 NOT NULL; + +ALTER INDEX i_service_int_gpu_min RENAME TO i_service_int_gpu_mem_min; +CREATE INDEX i_service_int_gpus_min ON service (int_gpus_min); + + +-- job_local + +ALTER TABLE job_local ADD COLUMN int_gpus INT DEFAULT 0 NOT NULL; +ALTER TABLE job_local ADD COLUMN int_max_gpus INT DEFAULT 0 NOT NULL; + + +-- task + +ALTER TABLE task ADD COLUMN int_min_gpus INT DEFAULT 0 NOT NULL; +ALTER TABLE task ADD COLUMN int_adjust_gpus INT DEFAULT 0 NOT NULL; + + +-- point + +ALTER TABLE point ADD COLUMN int_gpus INT DEFAULT 0 NOT NULL; +ALTER TABLE point ADD COLUMN int_min_gpus INT DEFAULT 0 NOT NULL; + + +-- folder_resource + +ALTER TABLE folder_resource ADD COLUMN int_gpus INT DEFAULT 0 NOT NULL; +ALTER TABLE folder_resource ADD COLUMN int_max_gpus INT DEFAULT -1 NOT NULL; +ALTER TABLE folder_resource ADD COLUMN int_min_gpus INT DEFAULT 0 NOT NULL; + +CREATE INDEX i_folder_res_int_max_gpus ON folder_resource (int_max_gpus); + + +-- layer_history + +ALTER TABLE layer_history ADD COLUMN int_gpus_min INT DEFAULT 0 NOT NULL; +ALTER TABLE layer_history ADD COLUMN int_gpu_time_success BIGINT DEFAULT 0 NOT NULL; +ALTER TABLE layer_history ADD COLUMN int_gpu_time_fail BIGINT DEFAULT 0 NOT NULL; +ALTER TABLE layer_history ADD COLUMN int_gpu_mem_min BIGINT DEFAULT 0 NOT NULL; +ALTER TABLE layer_history ADD COLUMN int_gpu_mem_max BIGINT DEFAULT 0 NOT NULL; + + +-- job_history + +ALTER TABLE job_history ADD COLUMN int_gpu_time_success BIGINT DEFAULT 0 NOT NULL; 
+ALTER TABLE job_history ADD COLUMN int_gpu_time_fail BIGINT DEFAULT 0 NOT NULL; +ALTER TABLE job_history ADD COLUMN int_gpu_mem_max BIGINT DEFAULT 0 NOT NULL; + + +-- job_usage + +ALTER TABLE job_usage ADD COLUMN int_gpu_time_success BIGINT DEFAULT 0 NOT NULL; +ALTER TABLE job_usage ADD COLUMN int_gpu_time_fail BIGINT DEFAULT 0 NOT NULL; + + +-- job_resource + +ALTER TABLE job_resource ALTER COLUMN int_max_rss TYPE BIGINT; +ALTER TABLE job_resource ALTER COLUMN int_max_vss TYPE BIGINT; +ALTER TABLE job_resource ADD COLUMN int_gpus INT DEFAULT 0 NOT NULL; +ALTER TABLE job_resource ADD COLUMN int_min_gpus INT DEFAULT 0 NOT NULL; +ALTER TABLE job_resource ADD COLUMN int_max_gpus INT DEFAULT 100 NOT NULL; +ALTER TABLE job_resource ADD COLUMN int_local_gpus INT DEFAULT 0 NOT NULL; +ALTER TABLE job_resource ADD COLUMN int_gpu_mem_max BIGINT DEFAULT 0 NOT NULL; + +CREATE INDEX i_job_resource_gpus_min_max ON job_resource (int_min_gpus, int_max_gpus); +CREATE INDEX i_job_resource_gpus ON job_resource (int_gpus); +CREATE INDEX i_job_resource_max_gpus ON job_resource (int_max_gpus); + + +-- subscription + +ALTER TABLE subscription ADD COLUMN int_gpus INT DEFAULT 0 NOT NULL; + + +-- show + +ALTER TABLE show ADD COLUMN int_default_min_gpus INT DEFAULT 100 NOT NULL; +ALTER TABLE show ADD COLUMN int_default_max_gpus INT DEFAULT 100000 NOT NULL; + + +-- proc + +ALTER TABLE proc RENAME COLUMN int_gpu_reserved TO int_gpu_mem_reserved; +ALTER TABLE proc ALTER COLUMN int_gpu_mem_reserved TYPE BIGINT; +ALTER TABLE proc ADD COLUMN int_gpus_reserved INT DEFAULT 0 NOT NULL; +ALTER TABLE proc ADD COLUMN int_gpu_mem_used BIGINT DEFAULT 0 NOT NULL; +ALTER TABLE proc ADD COLUMN int_gpu_mem_max_used BIGINT DEFAULT 0 NOT NULL; +ALTER TABLE proc ADD COLUMN int_gpu_mem_pre_reserved BIGINT DEFAULT 0 NOT NULL; + +ALTER INDEX i_proc_int_gpu_reserved RENAME TO i_proc_int_gpu_mem_reserved; + + +-- layer_usage + +ALTER TABLE layer_usage ADD COLUMN int_gpu_time_success BIGINT DEFAULT 0 NOT NULL; +ALTER 
TABLE layer_usage ADD COLUMN int_gpu_time_fail BIGINT DEFAULT 0 NOT NULL; + + +-- layer_mem + +ALTER TABLE layer_mem ALTER COLUMN int_max_rss TYPE BIGINT; +ALTER TABLE layer_mem ALTER COLUMN int_max_vss TYPE BIGINT; +ALTER TABLE layer_mem ADD COLUMN int_gpu_mem_max BIGINT DEFAULT 0 NOT NULL; + + +-- layer_resource + +ALTER TABLE layer_resource ALTER COLUMN int_max_rss TYPE BIGINT; +ALTER TABLE layer_resource ALTER COLUMN int_max_vss TYPE BIGINT; +ALTER TABLE layer_resource ADD COLUMN int_gpus INT DEFAULT 0 NOT NULL; +ALTER TABLE layer_resource ADD COLUMN int_gpu_mem_max BIGINT DEFAULT 0 NOT NULL; + + +-- layer + +ALTER TABLE layer RENAME COLUMN int_gpu_min TO int_gpu_mem_min; +ALTER TABLE layer ALTER COLUMN int_gpu_mem_min TYPE BIGINT; +ALTER TABLE layer ADD COLUMN int_gpus_min INT DEFAULT 0 NOT NULL; +ALTER TABLE layer ADD COLUMN int_gpus_max INT DEFAULT 0 NOT NULL; /* GPU counts are INT, like the other int_gpus_min/int_gpus_max columns in this migration */ + +ALTER INDEX i_layer_int_gpu_min RENAME TO i_layer_int_gpu_mem_min; +CREATE INDEX i_layer_cores_gpus_mem ON layer (int_cores_min, int_gpus_min, int_mem_min, int_gpu_mem_min); +CREATE INDEX i_layer_cores_gpus_mem_thread ON layer (int_cores_min, int_gpus_min, int_mem_min, int_gpu_mem_min, b_threadable); + + +-- job_mem + +ALTER TABLE job_mem ALTER COLUMN int_max_rss TYPE BIGINT; +ALTER TABLE job_mem ALTER COLUMN int_max_vss TYPE BIGINT; +ALTER TABLE job_mem ADD COLUMN int_gpu_mem_max BIGINT DEFAULT 0 NOT NULL; + + +-- job + +ALTER TABLE job ADD COLUMN int_min_gpus INT DEFAULT 0 NOT NULL; +ALTER TABLE job ADD COLUMN int_max_gpus INT DEFAULT 100000 NOT NULL; + + +-- host_stat + +ALTER TABLE host_stat RENAME COLUMN int_gpu_total TO int_gpu_mem_total; +ALTER TABLE host_stat ALTER COLUMN int_gpu_mem_total TYPE BIGINT; +ALTER TABLE host_stat RENAME COLUMN int_gpu_free TO int_gpu_mem_free; +ALTER TABLE host_stat ALTER COLUMN int_gpu_mem_free TYPE BIGINT; + +ALTER INDEX i_host_stat_int_gpu_total RENAME TO i_host_stat_int_gpu_mem_total; +ALTER INDEX i_host_stat_int_gpu_free RENAME TO
i_host_stat_int_gpu_mem_free; + + +-- host + +ALTER TABLE host RENAME COLUMN int_gpu TO int_gpu_mem; +ALTER TABLE host ALTER COLUMN int_gpu_mem TYPE BIGINT; +ALTER TABLE host RENAME COLUMN int_gpu_idle TO int_gpu_mem_idle; +ALTER TABLE host ALTER COLUMN int_gpu_mem_idle TYPE BIGINT; +ALTER TABLE host ADD COLUMN int_gpus INT DEFAULT 0 NOT NULL; +ALTER TABLE host ADD COLUMN int_gpus_idle INT DEFAULT 0 NOT NULL; /* GPU counts are INT, like host_local.int_gpus_idle; only GPU memory needs BIGINT */ + +CREATE INDEX i_host_int_gpu_mem ON host (int_gpu_mem); +CREATE INDEX i_host_int_gpu_mem_idle ON host (int_gpu_mem_idle); +CREATE INDEX i_host_int_gpus ON host (int_gpus); +CREATE INDEX i_host_int_gpus_idle ON host (int_gpus_idle); + + +-- frame + +ALTER TABLE frame RENAME COLUMN int_gpu_reserved TO int_gpu_mem_reserved; +ALTER TABLE frame ALTER COLUMN int_gpu_mem_reserved TYPE BIGINT; +ALTER TABLE frame ADD COLUMN int_gpu_mem_used BIGINT DEFAULT 0 NOT NULL; +ALTER TABLE frame ADD COLUMN int_gpu_mem_max_used BIGINT DEFAULT 0 NOT NULL; +ALTER TABLE frame ADD COLUMN int_gpus INT DEFAULT 0 NOT NULL; +ALTER TABLE frame ADD COLUMN int_total_past_gpu_time INT DEFAULT 0 NOT NULL; + +ALTER INDEX i_frame_int_gpu_reserved RENAME TO i_frame_int_gpu_mem_reserved; + + +-- folder + +ALTER TABLE folder ADD COLUMN int_job_min_gpus INT DEFAULT -1 NOT NULL; +ALTER TABLE folder ADD COLUMN int_job_max_gpus INT DEFAULT -1 NOT NULL; +ALTER TABLE folder ADD COLUMN int_min_gpus INT DEFAULT 0 NOT NULL; +ALTER TABLE folder ADD COLUMN int_max_gpus INT DEFAULT -1 NOT NULL; + + +-- Views + +DROP VIEW vs_show_resource; +CREATE VIEW vs_show_resource (pk_show, int_cores, int_gpus) AS + SELECT + job.pk_show, + SUM(int_cores) AS int_cores, SUM(int_gpus) AS int_gpus + FROM + job, + job_resource + WHERE + job.pk_job = job_resource.pk_job + AND + job.str_state='PENDING' + GROUP BY + job.pk_show; + + +DROP VIEW vs_job_resource; +CREATE VIEW vs_job_resource (pk_job, int_procs, int_cores, int_gpus, int_mem_reserved) AS + SELECT + job.pk_job, + COUNT(proc.pk_proc) AS int_procs, +
COALESCE(SUM(int_cores_reserved),0) AS int_cores, + COALESCE(SUM(int_gpus_reserved),0) AS int_gpus, + COALESCE(SUM(int_mem_reserved),0) AS int_mem_reserved + FROM + job LEFT JOIN proc ON (proc.pk_job = job.pk_job) + GROUP BY + job.pk_job; + + +DROP VIEW vs_alloc_usage; +CREATE VIEW vs_alloc_usage (pk_alloc, int_cores, int_idle_cores, int_running_cores, int_locked_cores, int_available_cores, int_gpus, int_idle_gpus, int_running_gpus, int_locked_gpus, int_available_gpus, int_hosts, int_locked_hosts, int_down_hosts) AS + SELECT + alloc.pk_alloc, + COALESCE(SUM(host.int_cores),0) AS int_cores, + COALESCE(SUM(host.int_cores_idle),0) AS int_idle_cores, + COALESCE(SUM(host.int_cores - host.int_cores_idle),0) as int_running_cores, + COALESCE((SELECT SUM(int_cores) FROM host WHERE host.pk_alloc=alloc.pk_alloc AND (str_lock_state='NIMBY_LOCKED' OR str_lock_state='LOCKED')),0) AS int_locked_cores, + COALESCE((SELECT SUM(int_cores_idle) FROM host h,host_stat hs WHERE h.pk_host = hs.pk_host AND h.pk_alloc=alloc.pk_alloc AND h.str_lock_state='OPEN' AND hs.str_state ='UP'),0) AS int_available_cores, + COALESCE(SUM(host.int_gpus),0) AS int_gpus, + COALESCE(SUM(host.int_gpus_idle),0) AS int_idle_gpus, + COALESCE(SUM(host.int_gpus - host.int_gpus_idle),0) as int_running_gpus, + COALESCE((SELECT SUM(int_gpus) FROM host WHERE host.pk_alloc=alloc.pk_alloc AND (str_lock_state='NIMBY_LOCKED' OR str_lock_state='LOCKED')),0) AS int_locked_gpus, + COALESCE((SELECT SUM(int_gpus_idle) FROM host h,host_stat hs WHERE h.pk_host = hs.pk_host AND h.pk_alloc=alloc.pk_alloc AND h.str_lock_state='OPEN' AND hs.str_state ='UP'),0) AS int_available_gpus, + COUNT(host.pk_host) AS int_hosts, + (SELECT COUNT(*) FROM host WHERE host.pk_alloc=alloc.pk_alloc AND str_lock_state='LOCKED') AS int_locked_hosts, + (SELECT COUNT(*) FROM host h,host_stat hs WHERE h.pk_host = hs.pk_host AND h.pk_alloc=alloc.pk_alloc AND hs.str_state='DOWN') AS int_down_hosts + FROM + alloc LEFT JOIN host ON (alloc.pk_alloc = 
host.pk_alloc) + GROUP BY + alloc.pk_alloc; + + +DROP VIEW vs_folder_counts; +CREATE VIEW vs_folder_counts (pk_folder, int_depend_count, int_waiting_count, int_running_count, int_dead_count, int_cores, int_gpus, int_job_count) AS + SELECT + folder.pk_folder, + COALESCE(SUM(int_depend_count),0) AS int_depend_count, + COALESCE(SUM(int_waiting_count),0) AS int_waiting_count, + COALESCE(SUM(int_running_count),0) AS int_running_count, + COALESCE(SUM(int_dead_count),0) AS int_dead_count, + COALESCE(SUM(int_cores),0) AS int_cores, + COALESCE(SUM(int_gpus),0) AS int_gpus, + COALESCE(COUNT(job.pk_job),0) AS int_job_count +FROM + folder + LEFT JOIN + job ON (folder.pk_folder = job.pk_folder AND job.str_state='PENDING') + LEFT JOIN + job_stat ON (job.pk_job = job_stat.pk_job) + LEFT JOIN + job_resource ON (job.pk_job = job_resource.pk_job) + GROUP BY + folder.pk_folder; + + +DROP VIEW v_history_frame; +CREATE VIEW v_history_frame (pk_frame_history, pk_frame, pk_layer, pk_job, str_name, str_state, + int_mem_reserved, int_mem_max_used, int_cores, int_gpu_mem_reserved, int_gpu_mem_max_used, int_gpus, + str_host, int_exit_status, str_alloc_name, + b_alloc_billable, str_facility_name, int_ts_started, int_ts_stopped, int_checkpoint_count, + str_show_name, dt_last_modified) AS + SELECT + fh.PK_FRAME_HISTORY, + fh.PK_FRAME, + fh.PK_LAYER, + fh.PK_JOB, + fh.STR_NAME, + fh.STR_STATE, + fh.INT_MEM_RESERVED, + fh.INT_MEM_MAX_USED, + fh.INT_CORES, + fh.INT_GPU_MEM_RESERVED, + fh.INT_GPU_MEM_MAX_USED, + fh.INT_GPUS, + fh.STR_HOST, + fh.INT_EXIT_STATUS, + a.STR_NAME STR_ALLOC_NAME, + a.B_BILLABLE B_ALLOC_BILLABLE, + f.STR_NAME STR_FACILITY_NAME, + fh.INT_TS_STARTED, + fh.INT_TS_STOPPED, + fh.INT_CHECKPOINT_COUNT, + null str_show_name, + fh.dt_last_modified + FROM frame_history fh + JOIN job_history jh + ON fh.pk_job = jh.pk_job + LEFT OUTER JOIN alloc a + ON fh.pk_alloc = a.pk_alloc + LEFT OUTER JOIN facility f + ON a.pk_facility = f.pk_facility + WHERE fh.dt_last_modified >= (SELECT 
dt_begin FROM history_period) + AND fh.dt_last_modified < (SELECT dt_end FROM history_period); + + +DROP VIEW v_history_job; +CREATE VIEW v_history_job (pk_job, str_name, str_shot, str_user, int_core_time_success, int_core_time_fail, int_gpu_time_success, int_gpu_time_fail, int_frame_count, int_layer_count, int_waiting_count, int_dead_count, int_depend_count, int_eaten_count, int_succeeded_count, int_running_count, int_max_rss, int_gpu_mem_max, b_archived, str_facility_name, str_dept_name, int_ts_started, int_ts_stopped, str_show_name, dt_last_modified) AS + select +jh.PK_JOB, +jh.STR_NAME, +jh.STR_SHOT, +jh.STR_USER, +jh.INT_CORE_TIME_SUCCESS, +jh.INT_CORE_TIME_FAIL, +jh.INT_GPU_TIME_SUCCESS, +jh.INT_GPU_TIME_FAIL, +jh.INT_FRAME_COUNT, +jh.INT_LAYER_COUNT, +jh.INT_WAITING_COUNT, +jh.INT_DEAD_COUNT, +jh.INT_DEPEND_COUNT, +jh.INT_EATEN_COUNT, +jh.INT_SUCCEEDED_COUNT, +jh.INT_RUNNING_COUNT, +jh.INT_MAX_RSS, +jh.INT_GPU_MEM_MAX, +jh.B_ARCHIVED, +f.str_name STR_FACILITY_NAME, +d.str_name str_dept_name, +jh.INT_TS_STARTED, +jh.INT_TS_STOPPED, +s.str_name str_show_name, +jh.dt_last_modified +from job_history jh, show s, facility f, dept d +where jh.pk_show = s.pk_show +and jh.pk_facility = f.pk_facility +and jh.pk_dept = d.pk_dept +and ( + jh.dt_last_modified >= ( + select dt_begin + from history_period + ) + or + jh.int_ts_stopped = 0 +); + + +DROP VIEW v_history_layer; +CREATE VIEW v_history_layer (pk_layer, pk_job, str_name, str_type, int_cores_min, + int_mem_min, int_gpus_min, int_gpu_mem_min, int_core_time_success, int_core_time_fail, + int_gpu_time_success, int_gpu_time_fail, int_frame_count, int_layer_count, + int_waiting_count, int_dead_count, int_depend_count, int_eaten_count, int_succeeded_count, + int_running_count, int_max_rss, int_gpu_mem_max, b_archived, str_services, str_show_name, dt_last_modified) AS + SELECT +lh.PK_LAYER, +lh.PK_JOB, +lh.STR_NAME, +lh.STR_TYPE, +lh.INT_CORES_MIN, +lh.INT_MEM_MIN, +lh.INT_GPUS_MIN, +lh.INT_GPU_MEM_MIN, 
+lh.INT_CORE_TIME_SUCCESS, +lh.INT_CORE_TIME_FAIL, +lh.INT_GPU_TIME_SUCCESS, +lh.INT_GPU_TIME_FAIL, +lh.INT_FRAME_COUNT, +lh.INT_LAYER_COUNT, +lh.INT_WAITING_COUNT, +lh.INT_DEAD_COUNT, +lh.INT_DEPEND_COUNT, +lh.INT_EATEN_COUNT, +lh.INT_SUCCEEDED_COUNT, +lh.INT_RUNNING_COUNT, +lh.INT_MAX_RSS, +lh.INT_GPU_MEM_MAX, +lh.B_ARCHIVED, +lh.STR_SERVICES, +s.str_name str_show_name, +lh.dt_last_modified +from layer_history lh, job_history jh, show s +where lh.pk_job = jh.pk_job +and jh.pk_show = s.pk_show +and jh.dt_last_modified >= ( + select dt_begin + from history_period +) +and jh.dt_last_modified < ( + select dt_end + from history_period +); + + +-- Types + +ALTER TYPE JobStatType ADD ATTRIBUTE int_gpu_time_success BIGINT; +ALTER TYPE JobStatType ADD ATTRIBUTE int_gpu_time_fail BIGINT; +ALTER TYPE JobStatType ADD ATTRIBUTE int_gpu_mem_max BIGINT; + +ALTER TYPE LayerStatType ADD ATTRIBUTE int_gpu_time_success BIGINT; +ALTER TYPE LayerStatType ADD ATTRIBUTE int_gpu_time_fail BIGINT; +ALTER TYPE LayerStatType ADD ATTRIBUTE int_gpu_mem_max BIGINT; + + +-- Functions + +CREATE OR REPLACE FUNCTION recalculate_subs() +RETURNS VOID AS $body$ +DECLARE + r RECORD; +BEGIN + -- + -- resets subscription.int_cores and int_gpus, then sets them to the sums reserved by procs per show and alloc + -- + UPDATE subscription SET int_cores = 0; + UPDATE subscription SET int_gpus = 0; + FOR r IN + SELECT proc.pk_show, alloc.pk_alloc, sum(proc.int_cores_reserved) as c, sum(proc.int_gpus_reserved) as d + FROM proc, host, alloc + WHERE proc.pk_host = host.pk_host AND host.pk_alloc = alloc.pk_alloc + GROUP BY proc.pk_show, alloc.pk_alloc + LOOP + UPDATE subscription SET int_cores = r.c, int_gpus = r.d WHERE pk_alloc=r.pk_alloc AND pk_show=r.pk_show; + + END LOOP; +END; +$body$ +LANGUAGE PLPGSQL; + + +CREATE OR REPLACE FUNCTION tmp_populate_folder() +RETURNS VOID AS $body$ +DECLARE + t RECORD; +BEGIN + FOR t IN + SELECT pk_folder, pk_show, sum(int_cores) AS c, sum(int_gpus) AS d + FROM job, job_resource + WHERE job.pk_job = job_resource.pk_job +
GROUP by pk_folder, pk_show + LOOP + UPDATE folder_resource SET int_cores = t.c, int_gpus = t.d WHERE pk_folder = t.pk_folder; + /* COMMIT removed: transaction control is not allowed inside a PL/pgSQL function; the caller's transaction commits the updates */ + END LOOP; +END; +$body$ +LANGUAGE PLPGSQL; + + +CREATE OR REPLACE FUNCTION tmp_populate_point() +RETURNS VOID AS $body$ +DECLARE + t RECORD; +BEGIN + FOR t IN + SELECT pk_dept, pk_show, sum(int_cores) AS c, sum(int_gpus) AS d + FROM job, job_resource + WHERE job.pk_job = job_resource.pk_job + GROUP BY pk_dept, pk_show + LOOP + UPDATE point SET int_cores = t.c , int_gpus = t.d WHERE pk_show = t.pk_show AND pk_dept = t.pk_dept; + END LOOP; +END; +$body$ +LANGUAGE PLPGSQL; + + +CREATE OR REPLACE FUNCTION tmp_populate_sub() +RETURNS VOID AS $body$ +DECLARE + t RECORD; +BEGIN + FOR t IN + SELECT proc.pk_show, host.pk_alloc, sum(int_cores_reserved) AS c, sum(int_gpus_reserved) AS d + FROM proc, host + WHERE proc.pk_host = host.pk_host + GROUP BY proc.pk_show, host.pk_alloc + LOOP + UPDATE subscription SET int_cores = t.c, int_gpus = t.d WHERE pk_show = t.pk_show AND pk_alloc = t.pk_alloc; + END LOOP; +END; +$body$ +LANGUAGE PLPGSQL; + + +CREATE OR REPLACE FUNCTION trigger__after_job_moved() +RETURNS TRIGGER AS $body$ +DECLARE + int_core_count INT; + int_gpu_count INT; +BEGIN + SELECT int_cores, int_gpus INTO int_core_count, int_gpu_count + FROM job_resource WHERE pk_job = NEW.pk_job; + + IF int_core_count > 0 THEN + UPDATE folder_resource SET int_cores = int_cores + int_core_count + WHERE pk_folder = NEW.pk_folder; + + UPDATE folder_resource SET int_cores = int_cores - int_core_count + WHERE pk_folder = OLD.pk_folder; + END IF; + + IF int_gpu_count > 0 THEN + UPDATE folder_resource SET int_gpus = int_gpus + int_gpu_count + WHERE pk_folder = NEW.pk_folder; + + UPDATE folder_resource SET int_gpus = int_gpus - int_gpu_count + WHERE pk_folder = OLD.pk_folder; + END IF; + RETURN NULL; +END +$body$ +LANGUAGE PLPGSQL; + + +CREATE OR REPLACE FUNCTION trigger__before_delete_job() +RETURNS TRIGGER AS $body$ +DECLARE + js JobStatType; +BEGIN +
SELECT /* SELECT ... INTO a composite variable assigns by position: columns follow JobStatType's attribute order, with the GPU attributes last because ALTER TYPE ... ADD ATTRIBUTE appends them */ + job_usage.int_core_time_success, + job_usage.int_core_time_fail, + job_stat.int_waiting_count, + job_stat.int_dead_count, + job_stat.int_depend_count, + job_stat.int_eaten_count, + job_stat.int_succeeded_count, + job_stat.int_running_count, + job_mem.int_max_rss, + job_usage.int_gpu_time_success, + job_usage.int_gpu_time_fail, + job_mem.int_gpu_mem_max + INTO + js + FROM + job_mem, + job_usage, + job_stat + WHERE + job_usage.pk_job = job_mem.pk_job + AND + job_stat.pk_job = job_mem.pk_job + AND + job_mem.pk_job = OLD.pk_job; + + UPDATE + job_history + SET + pk_dept = OLD.pk_dept, + int_core_time_success = js.int_core_time_success, + int_core_time_fail = js.int_core_time_fail, + int_gpu_time_success = js.int_gpu_time_success, + int_gpu_time_fail = js.int_gpu_time_fail, + int_frame_count = OLD.int_frame_count, + int_layer_count = OLD.int_layer_count, + int_waiting_count = js.int_waiting_count, + int_dead_count = js.int_dead_count, + int_depend_count = js.int_depend_count, + int_eaten_count = js.int_eaten_count, + int_succeeded_count = js.int_succeeded_count, + int_running_count = js.int_running_count, + int_max_rss = js.int_max_rss, + int_gpu_mem_max = js.int_gpu_mem_max, + b_archived = true, + int_ts_stopped = COALESCE(epoch(OLD.ts_stopped), epoch(current_timestamp)) + WHERE + pk_job = OLD.pk_job; + + DELETE FROM depend WHERE pk_job_depend_on=OLD.pk_job OR pk_job_depend_er=OLD.pk_job; + DELETE FROM frame WHERE pk_job=OLD.pk_job; + DELETE FROM layer WHERE pk_job=OLD.pk_job; + DELETE FROM job_env WHERE pk_job=OLD.pk_job; + DELETE FROM job_stat WHERE pk_job=OLD.pk_job; + DELETE FROM job_resource WHERE pk_job=OLD.pk_job; + DELETE FROM job_usage WHERE pk_job=OLD.pk_job; + DELETE FROM job_mem WHERE pk_job=OLD.pk_job; + DELETE FROM comments WHERE pk_job=OLD.pk_job; + + RETURN OLD; +END +$body$ +LANGUAGE PLPGSQL; + + +CREATE OR REPLACE FUNCTION trigger__after_job_finished() +RETURNS TRIGGER AS $body$ +DECLARE + ts INT := cast(epoch(current_timestamp) as integer); +
js JobStatType; + ls LayerStatType; + one_layer RECORD; +BEGIN + SELECT /* positional assignment into js: GPU columns go last, matching the attributes appended to JobStatType */ + job_usage.int_core_time_success, + job_usage.int_core_time_fail, + job_stat.int_waiting_count, + job_stat.int_dead_count, + job_stat.int_depend_count, + job_stat.int_eaten_count, + job_stat.int_succeeded_count, + job_stat.int_running_count, + job_mem.int_max_rss, + job_usage.int_gpu_time_success, + job_usage.int_gpu_time_fail, + job_mem.int_gpu_mem_max + INTO + js + FROM + job_mem, + job_usage, + job_stat + WHERE + job_usage.pk_job = job_mem.pk_job + AND + job_stat.pk_job = job_mem.pk_job + AND + job_mem.pk_job = NEW.pk_job; + + UPDATE + job_history + SET + pk_dept = NEW.pk_dept, + int_core_time_success = js.int_core_time_success, + int_core_time_fail = js.int_core_time_fail, + int_gpu_time_success = js.int_gpu_time_success, + int_gpu_time_fail = js.int_gpu_time_fail, + int_frame_count = NEW.int_frame_count, + int_layer_count = NEW.int_layer_count, + int_waiting_count = js.int_waiting_count, + int_dead_count = js.int_dead_count, + int_depend_count = js.int_depend_count, + int_eaten_count = js.int_eaten_count, + int_succeeded_count = js.int_succeeded_count, + int_running_count = js.int_running_count, + int_max_rss = js.int_max_rss, + int_gpu_mem_max = js.int_gpu_mem_max, + int_ts_stopped = ts + WHERE + pk_job = NEW.pk_job; + + FOR one_layer IN (SELECT pk_layer from layer where pk_job = NEW.pk_job) + LOOP + SELECT /* positional assignment into ls: GPU columns go last, matching the attributes appended to LayerStatType */ + layer_usage.int_core_time_success, + layer_usage.int_core_time_fail, + layer_stat.int_total_count, + layer_stat.int_waiting_count, + layer_stat.int_dead_count, + layer_stat.int_depend_count, + layer_stat.int_eaten_count, + layer_stat.int_succeeded_count, + layer_stat.int_running_count, + layer_mem.int_max_rss, + layer_usage.int_gpu_time_success, + layer_usage.int_gpu_time_fail, + layer_mem.int_gpu_mem_max + INTO + ls + FROM + layer_mem, + layer_usage, + layer_stat + WHERE + layer_usage.pk_layer = layer_mem.pk_layer + AND + layer_stat.pk_layer = layer_mem.pk_layer + AND +
layer_mem.pk_layer = one_layer.pk_layer; + + UPDATE + layer_history + SET + int_core_time_success = ls.int_core_time_success, + int_core_time_fail = ls.int_core_time_fail, + int_gpu_time_success = ls.int_gpu_time_success, + int_gpu_time_fail = ls.int_gpu_time_fail, + int_frame_count = ls.int_total_count, + int_waiting_count = ls.int_waiting_count, + int_dead_count = ls.int_dead_count, + int_depend_count = ls.int_depend_count, + int_eaten_count = ls.int_eaten_count, + int_succeeded_count = ls.int_succeeded_count, + int_running_count = ls.int_running_count, + int_max_rss = ls.int_max_rss, + int_gpu_mem_max = ls.int_gpu_mem_max + WHERE + pk_layer = one_layer.pk_layer; + END LOOP; + + /** + * Delete any local core assignments from this job. + **/ + DELETE FROM job_local WHERE pk_job=NEW.pk_job; + + RETURN NEW; +END; +$body$ +LANGUAGE PLPGSQL; + + +CREATE OR REPLACE FUNCTION trigger__after_job_dept_update() +RETURNS TRIGGER AS $body$ +DECLARE + int_running_cores INT; + int_running_gpus INT; +BEGIN + /** + * Handles the accounting for moving a job between departments. + **/ + SELECT int_cores, int_gpus INTO int_running_cores, int_running_gpus + FROM job_resource WHERE pk_job = NEW.pk_job; + + IF int_running_cores > 0 THEN + UPDATE point SET int_cores = int_cores + int_running_cores + WHERE pk_dept = NEW.pk_dept AND pk_show = NEW.pk_show; + + UPDATE point SET int_cores = int_cores - int_running_cores + WHERE pk_dept = OLD.pk_dept AND pk_show = OLD.pk_show; + END IF; + + IF int_running_gpus > 0 THEN + UPDATE point SET int_gpus = int_gpus + int_running_gpus + WHERE pk_dept = NEW.pk_dept AND pk_show = NEW.pk_show; + + UPDATE point SET int_gpus = int_gpus - int_running_gpus + WHERE pk_dept = OLD.pk_dept AND pk_show = OLD.pk_show; + END IF; + + RETURN NULL; +END; +$body$ +LANGUAGE PLPGSQL; + + +CREATE OR REPLACE FUNCTION trigger__verify_host_local() +RETURNS TRIGGER AS $body$ +BEGIN + /** + * Check to see if the new cores exceeds max cores. 
This check is only + * done if NEW.int_max_cores is equal to OLD.int_max_cores and + * NEW.int_cores > OLD.int_cores, otherwise this error will be thrown + * when people lower the max. + **/ + IF NEW.int_cores_idle < 0 THEN + RAISE EXCEPTION 'host local doesnt have enough idle cores.'; + END IF; + + IF NEW.int_mem_idle < 0 THEN + RAISE EXCEPTION 'host local doesnt have enough idle memory'; + END IF; + + IF NEW.int_gpus_idle < 0 THEN + RAISE EXCEPTION 'host local doesnt have enough GPU idle cores.'; + END IF; + + IF NEW.int_gpu_mem_idle < 0 THEN + RAISE EXCEPTION 'host local doesnt have enough GPU idle memory.'; + END IF; + + RETURN NEW; +END; +$body$ +LANGUAGE PLPGSQL; + +CREATE TRIGGER verify_host_local BEFORE UPDATE ON host_local +FOR EACH ROW + WHEN ((NEW.int_cores_max = OLD.int_cores_max AND NEW.int_mem_max = OLD.int_mem_max) AND + (NEW.int_cores_idle != OLD.int_cores_idle OR NEW.int_mem_idle != OLD.int_mem_idle) AND + (NEW.int_gpus_max = OLD.int_gpus_max AND NEW.int_gpu_mem_max = OLD.int_gpu_mem_max) AND + (NEW.int_gpus_idle != OLD.int_gpus_idle OR NEW.int_gpu_mem_idle != OLD.int_gpu_mem_idle)) + EXECUTE PROCEDURE trigger__verify_host_local(); + + +CREATE OR REPLACE FUNCTION trigger__after_insert_layer() +RETURNS TRIGGER AS $body$ +BEGIN + INSERT INTO layer_stat (pk_layer_stat, pk_layer, pk_job) VALUES (NEW.pk_layer, NEW.pk_layer, NEW.pk_job); + INSERT INTO layer_resource (pk_layer_resource, pk_layer, pk_job) VALUES (NEW.pk_layer, NEW.pk_layer, NEW.pk_job); + INSERT INTO layer_usage (pk_layer_usage, pk_layer, pk_job) VALUES (NEW.pk_layer, NEW.pk_layer, NEW.pk_job); + INSERT INTO layer_mem (pk_layer_mem, pk_layer, pk_job) VALUES (NEW.pk_layer, NEW.pk_layer, NEW.pk_job); + + INSERT INTO layer_history + (pk_layer, pk_job, str_name, str_type, int_cores_min, int_mem_min, int_gpus_min, int_gpu_mem_min, b_archived,str_services) + VALUES + (NEW.pk_layer, NEW.pk_job, NEW.str_name, NEW.str_type, NEW.int_cores_min, NEW.int_mem_min, NEW.int_gpus_min, NEW.int_gpu_mem_min, 
false, NEW.str_services); + + RETURN NEW; +END; +$body$ +LANGUAGE PLPGSQL; + + +CREATE OR REPLACE FUNCTION trigger__before_delete_layer() +RETURNS TRIGGER AS $body$ +DECLARE + js LayerStatType; +BEGIN + SELECT + layer_usage.int_core_time_success, + layer_usage.int_core_time_fail, + layer_usage.int_gpu_time_success, + layer_usage.int_gpu_time_fail, + layer_stat.int_total_count, + layer_stat.int_waiting_count, + layer_stat.int_dead_count, + layer_stat.int_depend_count, + layer_stat.int_eaten_count, + layer_stat.int_succeeded_count, + layer_stat.int_running_count, + layer_mem.int_max_rss, + layer_mem.int_gpu_mem_max + INTO + js + FROM + layer_mem, + layer_usage, + layer_stat + WHERE + layer_usage.pk_layer = layer_mem.pk_layer + AND + layer_stat.pk_layer = layer_mem.pk_layer + AND + layer_mem.pk_layer = OLD.pk_layer; + + UPDATE + layer_history + SET + int_core_time_success = js.int_core_time_success, + int_core_time_fail = js.int_core_time_fail, + int_gpu_time_success = js.int_gpu_time_success, + int_gpu_time_fail = js.int_gpu_time_fail, + int_frame_count = js.int_total_count, + int_waiting_count = js.int_waiting_count, + int_dead_count = js.int_dead_count, + int_depend_count = js.int_depend_count, + int_eaten_count = js.int_eaten_count, + int_succeeded_count = js.int_succeeded_count, + int_running_count = js.int_running_count, + int_max_rss = js.int_max_rss, + int_gpu_mem_max = js.int_gpu_mem_max, + b_archived = true + WHERE + pk_layer = OLD.pk_layer; + + DELETE FROM layer_resource where pk_layer=OLD.pk_layer; + DELETE FROM layer_stat where pk_layer=OLD.pk_layer; + DELETE FROM layer_usage where pk_layer=OLD.pk_layer; + DELETE FROM layer_env where pk_layer=OLD.pk_layer; + DELETE FROM layer_mem where pk_layer=OLD.pk_layer; + DELETE FROM layer_output where pk_layer=OLD.pk_layer; + + RETURN OLD; +END; +$body$ +LANGUAGE PLPGSQL; + + +CREATE OR REPLACE FUNCTION trigger__verify_host_resources() +RETURNS TRIGGER AS $body$ +BEGIN + IF NEW.int_cores_idle < 0 THEN + RAISE 
EXCEPTION 'unable to allocate additional core units'; + END IF; + + If NEW.int_mem_idle < 0 THEN + RAISE EXCEPTION 'unable to allocate additional memory'; + END IF; + + If NEW.int_gpus_idle < 0 THEN + RAISE EXCEPTION 'unable to allocate additional GPU units'; + END IF; + + If NEW.int_gpu_mem_idle < 0 THEN + RAISE EXCEPTION 'unable to allocate additional GPU memory'; + END IF; + RETURN NEW; +END; +$body$ +LANGUAGE PLPGSQL; + +DROP TRIGGER verify_host_resources ON host; +CREATE TRIGGER verify_host_resources BEFORE UPDATE ON host +FOR EACH ROW + WHEN (NEW.int_cores_idle != OLD.int_cores_idle + OR NEW.int_mem_idle != OLD.int_mem_idle + OR NEW.int_gpus_idle != OLD.int_gpus_idle + OR NEW.int_gpu_mem_idle != OLD.int_gpu_mem_idle) + EXECUTE PROCEDURE trigger__verify_host_resources(); + + +CREATE OR REPLACE FUNCTION trigger__verify_job_resources() +RETURNS TRIGGER AS $body$ +BEGIN + /** + * Check to see if the new cores exceeds max cores. This check is only + * done if NEW.int_max_cores is equal to OLD.int_max_cores and + * NEW.int_cores > OLD.int_cores, otherwise this error will be thrown + * at the wrong time. 
+ **/ + IF NEW.int_cores > NEW.int_max_cores THEN + RAISE EXCEPTION 'job has exceeded max cores'; + END IF; + IF NEW.int_gpus > NEW.int_max_gpus THEN + RAISE EXCEPTION 'job has exceeded max GPU units'; + END IF; + RETURN NEW; +END; +$body$ +LANGUAGE PLPGSQL; + +DROP TRIGGER verify_job_resources ON job_resource; +CREATE TRIGGER verify_job_resources BEFORE UPDATE ON job_resource +FOR EACH ROW + WHEN (NEW.int_max_cores = OLD.int_max_cores AND NEW.int_cores > OLD.int_cores OR + NEW.int_max_gpus = OLD.int_max_gpus AND NEW.int_gpus > OLD.int_gpus) + EXECUTE PROCEDURE trigger__verify_job_resources(); + + +CREATE OR REPLACE FUNCTION trigger__update_proc_update_layer() +RETURNS TRIGGER AS $body$ +DECLARE + lr RECORD; +BEGIN + FOR lr IN ( + SELECT + pk_layer + FROM + layer_stat + WHERE + pk_layer IN (OLD.pk_layer, NEW.pk_layer) + ORDER BY layer_stat.pk_layer DESC + ) LOOP + + IF lr.pk_layer = OLD.pk_layer THEN + + UPDATE layer_resource SET + int_cores = int_cores - OLD.int_cores_reserved, + int_gpus = int_gpus - OLD.int_gpus_reserved + WHERE + pk_layer = OLD.pk_layer; + + ELSE + + UPDATE layer_resource SET + int_cores = int_cores + NEW.int_cores_reserved, + int_gpus = int_gpus + NEW.int_gpus_reserved + WHERE + pk_layer = NEW.pk_layer; + END IF; + + END LOOP; + RETURN NULL; +END; +$body$ +LANGUAGE PLPGSQL; + + +CREATE OR REPLACE FUNCTION trigger__frame_history_open() +RETURNS TRIGGER AS $body$ +DECLARE + str_pk_alloc VARCHAR(36) := null; + int_checkpoint INT := 0; +BEGIN + + IF OLD.str_state = 'RUNNING' THEN + + IF NEW.int_exit_status = 299 THEN + + EXECUTE 'DELETE FROM frame_history WHERE int_ts_stopped = 0 AND pk_frame=$1' USING + NEW.pk_frame; + + ELSE + If NEW.str_state = 'CHECKPOINT' THEN + int_checkpoint := 1; + END IF; + + EXECUTE + 'UPDATE + frame_history + SET + int_mem_max_used=$1, + int_gpu_mem_max_used=$2, + int_ts_stopped=$3, + int_exit_status=$4, + int_checkpoint_count=$5 + WHERE + int_ts_stopped = 0 AND pk_frame=$6' + USING + NEW.int_mem_max_used, + 
NEW.int_gpu_mem_max_used, + epoch(current_timestamp), + NEW.int_exit_status, + int_checkpoint, + NEW.pk_frame; + END IF; + END IF; + + IF NEW.str_state = 'RUNNING' THEN + + SELECT pk_alloc INTO str_pk_alloc FROM host WHERE str_name=NEW.str_host; + + EXECUTE + 'INSERT INTO + frame_history + ( + pk_frame, + pk_layer, + pk_job, + str_name, + str_state, + int_cores, + int_mem_reserved, + int_gpus, + int_gpu_mem_reserved, + str_host, + int_ts_started, + pk_alloc + ) + VALUES + ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12)' + USING NEW.pk_frame, + NEW.pk_layer, + NEW.pk_job, + NEW.str_name, + 'RUNNING', + NEW.int_cores, + NEW.int_mem_reserved, + NEW.int_gpus, + NEW.int_gpu_mem_reserved, + NEW.str_host, + epoch(current_timestamp), + str_pk_alloc; + END IF; + RETURN NULL; + +END; +$body$ +LANGUAGE PLPGSQL; diff --git a/cuebot/src/main/resources/public/dtd/cjsl-1.12.dtd b/cuebot/src/main/resources/public/dtd/cjsl-1.12.dtd new file mode 100644 index 000000000..222e04cfc --- /dev/null +++ b/cuebot/src/main/resources/public/dtd/cjsl-1.12.dtd @@ -0,0 +1,97 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/BookingDaoTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/BookingDaoTests.java index 8c09dfde4..c6c03d604 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/BookingDaoTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/BookingDaoTests.java @@ -109,8 +109,8 @@ public DispatchHost createHost() { .setState(HardwareState.UP) .setFacility("spi") .addTags("general") - .putAttributes("freeGpu", String.format("%d", CueUtil.MB512)) - .putAttributes("totalGpu", String.format("%d", CueUtil.MB512)) + .setFreeGpuMem((int) CueUtil.MB512) + .setTotalGpuMem((int) CueUtil.MB512) .build(); DispatchHost dh = hostManager.createHost(host); 
hostManager.setAllocation(dh, @@ -138,7 +138,7 @@ public void insertLocalJobAssignment() { LocalHostAssignment lja = new LocalHostAssignment(); lja.setMaxCoreUnits(200); lja.setMaxMemory(CueUtil.GB4); - lja.setMaxGpu(1); + lja.setMaxGpuMemory(1); lja.setThreads(2); bookingDao.insertLocalHostAssignment(h, j, lja); @@ -149,7 +149,7 @@ public void insertLocalJobAssignment() { Integer.class, j.getJobId())); assertEquals(Integer.valueOf(1), jdbcTemplate.queryForObject( - "SELECT int_gpu_max FROM host_local WHERE pk_job=?", + "SELECT int_gpu_mem_max FROM host_local WHERE pk_job=?", Integer.class, j.getJobId())); assertEquals(Integer.valueOf(200), jdbcTemplate.queryForObject( @@ -161,7 +161,7 @@ public void insertLocalJobAssignment() { Long.class, j.getJobId())); assertEquals(Integer.valueOf(1), jdbcTemplate.queryForObject( - "SELECT int_gpu_max FROM host_local WHERE pk_job=?", + "SELECT int_gpu_mem_max FROM host_local WHERE pk_job=?", Integer.class, j.getJobId())); assertEquals(Integer.valueOf(200), jdbcTemplate.queryForObject( @@ -185,7 +185,7 @@ public void insertLocalLayerAssignment() { LocalHostAssignment lja = new LocalHostAssignment(); lja.setMaxCoreUnits(200); lja.setMaxMemory(CueUtil.GB4); - lja.setMaxGpu(1); + lja.setMaxGpuMemory(1); lja.setThreads(2); bookingDao.insertLocalHostAssignment(h, layer, lja); @@ -212,7 +212,7 @@ public void insertLocalLayerAssignment() { Long.class, j.getJobId())); assertEquals(Integer.valueOf(1), jdbcTemplate.queryForObject( - "SELECT int_gpu_max FROM host_local WHERE pk_job=?", + "SELECT int_gpu_mem_max FROM host_local WHERE pk_job=?", Integer.class, j.getJobId())); assertEquals(Integer.valueOf(200), jdbcTemplate.queryForObject( @@ -237,7 +237,7 @@ public void insertLocalFrameAssignment() { LocalHostAssignment lja = new LocalHostAssignment(); lja.setMaxCoreUnits(200); lja.setMaxMemory(CueUtil.GB4); - lja.setMaxGpu(1); + lja.setMaxGpuMemory(1); lja.setThreads(2); bookingDao.insertLocalHostAssignment(h, frame, lja); @@ -264,7 +264,7 
@@ public void insertLocalFrameAssignment() { Long.class, j.getJobId())); assertEquals(Integer.valueOf(1), jdbcTemplate.queryForObject( - "SELECT int_gpu_max FROM host_local WHERE pk_job=?", + "SELECT int_gpu_mem_max FROM host_local WHERE pk_job=?", Integer.class, j.getJobId())); assertEquals(Integer.valueOf(200), jdbcTemplate.queryForObject( @@ -288,7 +288,7 @@ public void testGetLocalJobAssignment() { lja.setMaxCoreUnits(200); lja.setMaxMemory(CueUtil.GB4); lja.setThreads(2); - lja.setMaxGpu(1); + lja.setMaxGpuMemory(1); bookingDao.insertLocalHostAssignment(h, j, lja); @@ -297,7 +297,7 @@ public void testGetLocalJobAssignment() { assertEquals(lja.getMaxCoreUnits(), lja2.getMaxCoreUnits()); assertEquals(lja.getMaxMemory(), lja2.getMaxMemory()); - assertEquals(lja.getMaxGpu(), lja2.getMaxGpu()); + assertEquals(lja.getMaxGpuMemory(), lja2.getMaxGpuMemory()); assertEquals(lja.getThreads(), lja2.getThreads()); } @@ -314,7 +314,7 @@ public void testGetRenderPartition() { lja.setMaxCoreUnits(200); lja.setMaxMemory(CueUtil.GB4); lja.setThreads(2); - lja.setMaxGpu(1); + lja.setMaxGpuMemory(1); bookingDao.insertLocalHostAssignment(h, j, lja); @@ -324,7 +324,7 @@ public void testGetRenderPartition() { assertEquals(lja.getMaxCoreUnits(), lja2.getMaxCoreUnits()); assertEquals(lja.getMaxMemory(), lja2.getMaxMemory()); assertEquals(lja.getThreads(), lja2.getThreads()); - assertEquals(lja.getMaxGpu(), lja2.getMaxGpu()); + assertEquals(lja.getMaxGpuMemory(), lja2.getMaxGpuMemory()); RenderPartition rp = whiteboard.getRenderPartition(lja2); @@ -332,9 +332,9 @@ public void testGetRenderPartition() { assertEquals(lja2.getMaxMemory(), rp.getMaxMemory()); assertEquals(lja2.getThreads(), rp.getThreads()); logger.info("--------------------"); - logger.info(lja2.getMaxGpu()); - logger.info(rp.getMaxGpu()); - assertEquals(lja2.getMaxGpu(), rp.getMaxGpu()); + logger.info(lja2.getMaxGpuMemory()); + logger.info(rp.getMaxGpuMemory()); + assertEquals(lja2.getMaxGpuMemory(), 
rp.getMaxGpuMemory()); assertEquals(h.getName(), rp.getHost()); assertEquals(j.getName(), rp.getJob()); } @@ -351,7 +351,7 @@ public void testGetProcs() { lja.setMaxCoreUnits(200); lja.setMaxMemory(CueUtil.GB4); lja.setThreads(2); - lja.setMaxGpu(1); + lja.setMaxGpuMemory(1); bookingDao.insertLocalHostAssignment(h, j, lja); @@ -370,7 +370,7 @@ public void updateMaxCores() { lja.setMaxCoreUnits(200); lja.setMaxMemory(CueUtil.GB4); lja.setThreads(2); - lja.setMaxGpu(1); + lja.setMaxGpuMemory(1); bookingDao.insertLocalHostAssignment(h, j, lja); assertTrue(bookingDao.updateMaxCores(lja, 100)); @@ -403,7 +403,7 @@ public void updateMaxMemory() { lja.setMaxCoreUnits(200); lja.setMaxMemory(CueUtil.GB4); lja.setThreads(2); - lja.setMaxGpu(1); + lja.setMaxGpuMemory(1); bookingDao.insertLocalHostAssignment(h, j, lja); bookingDao.updateMaxMemory(lja, CueUtil.GB2); @@ -424,7 +424,7 @@ public void updateMaxMemory() { @Test @Transactional @Rollback(true) - public void updateMaxGpu() { + public void updateMaxGpuMemory() { DispatchHost h = createHost(); JobDetail j = launchJob(); @@ -433,7 +433,7 @@ public void updateMaxGpu() { lja.setMaxCoreUnits(200); lja.setMaxMemory(CueUtil.GB4); lja.setThreads(2); - lja.setMaxGpu(1); + lja.setMaxGpuMemory(1); bookingDao.insertLocalHostAssignment(h, j, lja); bookingDao.updateMaxMemory(lja, CueUtil.GB2); @@ -442,15 +442,15 @@ public void updateMaxGpu() { assertEquals(CueUtil.GB2, lj2.getIdleMemory()); assertEquals(CueUtil.GB2, lj2.getMaxMemory()); - assertEquals(1, lj2.getMaxGpu()); + assertEquals(1, lj2.getMaxGpuMemory()); - bookingDao.updateMaxGpu(lja, 2); + bookingDao.updateMaxGpuMemory(lja, 2); lj2 = bookingDao.getLocalJobAssignment(lja.id); assertEquals(CueUtil.GB2, lj2.getIdleMemory()); assertEquals(CueUtil.GB2, lj2.getMaxMemory()); - assertEquals(2, lj2.getMaxGpu()); + assertEquals(2, lj2.getMaxGpuMemory()); } } diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/CommentDaoTests.java 
b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/CommentDaoTests.java index 886400823..668e666e9 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/CommentDaoTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/CommentDaoTests.java @@ -153,8 +153,8 @@ public void testInsertCommentOnHost() { .addTags("linux") .setState(HardwareState.UP) .setFacility("spi") - .putAttributes("freeGpu", String.format("%d", CueUtil.MB512)) - .putAttributes("totalGpu", String.format("%d", CueUtil.MB512)) + .setFreeGpuMem((int) CueUtil.MB512) + .setTotalGpuMem((int) CueUtil.MB512) .build(); CommentDetail d = new CommentDetail(); diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/DeedDaoTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/DeedDaoTests.java index 23a93a2c1..a04e7e5e6 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/DeedDaoTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/DeedDaoTests.java @@ -86,8 +86,8 @@ public DispatchHost createHost() { .addTags("general") .setState(HardwareState.UP) .setFacility("spi") - .putAttributes("freeGpu", String.format("%d", CueUtil.MB512)) - .putAttributes("totalGpu", String.format("%d", CueUtil.MB512)) + .setFreeGpuMem((int) CueUtil.MB512) + .setTotalGpuMem((int) CueUtil.MB512) .build(); DispatchHost dh = hostManager.createHost(host); diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/DispatcherDaoTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/DispatcherDaoTests.java index abf9e34a7..a400bb26a 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/DispatcherDaoTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/DispatcherDaoTests.java @@ -374,7 +374,7 @@ public void testFindDispatchJobsByLocal() { lja.setThreads(1); lja.setMaxMemory(CueUtil.GB16); lja.setMaxCoreUnits(200); - lja.setMaxGpu(1); + 
lja.setMaxGpuMemory(1); bookingDao.insertLocalHostAssignment(host, job, lja); jobs = dispatcherDao.findLocalDispatchJobs(host); diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/FrameDaoTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/FrameDaoTests.java index c18c78a28..8d64d918e 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/FrameDaoTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/FrameDaoTests.java @@ -60,6 +60,7 @@ import com.imageworks.spcue.service.JobLauncher; import com.imageworks.spcue.service.JobManager; import com.imageworks.spcue.test.AssumingPostgresEngine; +import com.imageworks.spcue.util.CueUtil; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; @@ -126,8 +127,8 @@ public void create() { .addAllTags(ImmutableList.of("mcore", "4core", "8g")) .setState(HardwareState.UP) .setFacility("spi") - .putAttributes("freeGpu", "512") - .putAttributes("totalGpu", "512") + .setFreeGpuMem((int) CueUtil.MB512) + .setTotalGpuMem((int) CueUtil.MB512) .build(); hostManager.createHost(host); diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/HostDaoTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/HostDaoTests.java index b53e1078a..df965893b 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/HostDaoTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/HostDaoTests.java @@ -100,8 +100,8 @@ public static RenderHost buildRenderHost(String name) { .addAllTags(ImmutableList.of("linux", "64bit")) .setState(HardwareState.UP) .setFacility("spi") - .putAttributes("freeGpu", String.format("%d", CueUtil.MB512)) - .putAttributes("totalGpu", String.format("%d", CueUtil.MB512)) + .setFreeGpuMem((int) CueUtil.MB512) + .setTotalGpuMem((int) CueUtil.MB512) .build(); return host; diff --git 
a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/JobDaoTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/JobDaoTests.java index 07a32f184..597489592 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/JobDaoTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/JobDaoTests.java @@ -683,7 +683,7 @@ public void testUpdateUsage() { JobInterface job = jobDao.findJob(spec.getJobs().get(0).detail.name); /** 60 seconds of 100 core units **/ - ResourceUsage usage = new ResourceUsage(60, 33); + ResourceUsage usage = new ResourceUsage(60, 33, 0); assertTrue(usage.getClockTimeSeconds() > 0); assertTrue(usage.getCoreTimeSeconds() > 0); diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/LayerDaoTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/LayerDaoTests.java index 938107f36..06864a9bc 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/LayerDaoTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/LayerDaoTests.java @@ -471,12 +471,12 @@ public void updateMinMemory() { @Test @Transactional @Rollback(true) - public void updateMinGpu() { - long gpu = CueUtil.GB; + public void updateMinGpuMemory() { + long mem = CueUtil.GB; LayerDetail layer = getLayer(); - layerDao.updateMinGpu(layer, gpu, LayerType.RENDER); - assertEquals(Long.valueOf(gpu),jdbcTemplate.queryForObject( - "SELECT int_gpu_min FROM layer WHERE pk_layer=?", + layerDao.updateMinGpuMemory(layer, mem, LayerType.RENDER); + assertEquals(Long.valueOf(mem),jdbcTemplate.queryForObject( + "SELECT int_gpu_mem_min FROM layer WHERE pk_layer=?", Long.class, layer.getLayerId())); } @@ -590,7 +590,7 @@ public void testUpdateUsage() { Integer.class, layer.getId())); /** 60 seconds of 100 core units **/ - ResourceUsage usage = new ResourceUsage(60, 33); + ResourceUsage usage = new ResourceUsage(60, 33, 0); assertTrue(usage.getClockTimeSeconds() > 0); 
assertTrue(usage.getCoreTimeSeconds() > 0); diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/ProcDaoTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/ProcDaoTests.java index c43d98180..6c9efc3e5 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/ProcDaoTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/ProcDaoTests.java @@ -633,7 +633,7 @@ public void testGetReservedMemory() { @Test @Transactional @Rollback(true) - public void testGetReservedGpu() { + public void testGetReservedGpuMemory() { DispatchHost host = createHost(); JobDetail job = launchJob(); @@ -645,11 +645,11 @@ public void testGetReservedGpu() { procDao.insertVirtualProc(proc); VirtualProc _proc = procDao.findVirtualProc(frame); - assertEquals(Long.valueOf(Dispatcher.GPU_RESERVED_DEFAULT), jdbcTemplate.queryForObject( - "SELECT int_gpu_reserved FROM proc WHERE pk_proc=?", + assertEquals(Long.valueOf(Dispatcher.MEM_GPU_RESERVED_DEFAULT), jdbcTemplate.queryForObject( + "SELECT int_gpu_mem_reserved FROM proc WHERE pk_proc=?", Long.class, _proc.id)); - assertEquals(Dispatcher.GPU_RESERVED_DEFAULT, - procDao.getReservedGpu(_proc)); + assertEquals(Dispatcher.MEM_GPU_RESERVED_DEFAULT, + procDao.getReservedGpuMemory(_proc)); } @Test diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/ServiceDaoTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/ServiceDaoTests.java index fee824fc1..16168f245 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/ServiceDaoTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/ServiceDaoTests.java @@ -70,7 +70,7 @@ public void testInsertService() { s.timeout = 0; s.timeout_llu = 0; s.minMemory = CueUtil.GB4; - s.minGpu = CueUtil.GB; + s.minGpuMemory = CueUtil.GB; s.threadable = false; s.tags.addAll(Sets.newHashSet(new String[] { "general"})); @@ -88,7 +88,7 @@ public void testUpdateService() { s.timeout 
= 0; s.timeout_llu = 0; s.minMemory = CueUtil.GB4; - s.minGpu = CueUtil.GB; + s.minGpuMemory = CueUtil.GB; s.threadable = false; s.tags.addAll(Sets.newHashSet(new String[] { "general"})); @@ -100,7 +100,7 @@ public void testUpdateService() { s.timeout = 0; s.timeout_llu = 0; s.minMemory = CueUtil.GB8; - s.minGpu = CueUtil.GB2; + s.minGpuMemory = CueUtil.GB2; s.threadable = true; s.tags = Sets.newLinkedHashSet(); s.tags.add("linux"); @@ -125,7 +125,7 @@ public void testDeleteService() { s.timeout = 0; s.timeout_llu = 0; s.minMemory = CueUtil.GB4; - s.minGpu = CueUtil.GB; + s.minGpuMemory = CueUtil.GB; s.threadable = false; s.tags.addAll(Sets.newHashSet(new String[] { "general"})); @@ -149,7 +149,7 @@ public void testInsertServiceOverride() { s.timeout = 0; s.timeout_llu = 0; s.minMemory = CueUtil.GB4; - s.minGpu = CueUtil.GB; + s.minGpuMemory = CueUtil.GB; s.threadable = false; s.tags.addAll(Sets.newHashSet(new String[] { "general"})); s.showId = "00000000-0000-0000-0000-000000000000"; @@ -168,7 +168,7 @@ public void testUpdateServiceOverride() { s.timeout = 0; s.timeout_llu = 0; s.minMemory = CueUtil.GB4; - s.minGpu = CueUtil.GB2; + s.minGpuMemory = CueUtil.GB2; s.threadable = false; s.tags.addAll(Sets.newHashSet(new String[] { "general"})); s.showId = "00000000-0000-0000-0000-000000000000"; @@ -182,7 +182,7 @@ public void testUpdateServiceOverride() { s.timeout = 10; s.timeout_llu = 10; s.minMemory = CueUtil.GB8; - s.minGpu = CueUtil.GB4; + s.minGpuMemory = CueUtil.GB4; s.threadable = true; s.tags = Sets.newLinkedHashSet(); s.tags.add("linux"); @@ -195,7 +195,7 @@ public void testUpdateServiceOverride() { assertEquals(s.timeout, s1.timeout); assertEquals(s.timeout_llu, s1.timeout_llu); assertEquals(s.minMemory, s1.minMemory); - assertEquals(s.minGpu, s1.minGpu); + assertEquals(s.minGpuMemory, s1.minGpuMemory); assertEquals(s.threadable, s1.threadable); assertEquals(s.tags.toArray()[0], s1.tags.toArray()[0]); } @@ -210,7 +210,7 @@ public void 
testDeleteServiceOverride() { s.timeout = 0; s.timeout_llu = 0; s.minMemory = CueUtil.GB4; - s.minGpu = CueUtil.GB; + s.minGpuMemory = CueUtil.GB; s.threadable = false; s.tags.addAll(Sets.newHashSet(new String[] { "general"})); s.showId = "00000000-0000-0000-0000-000000000000"; diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/ShowDaoTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/ShowDaoTests.java index b27114554..d430ab3b0 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/ShowDaoTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/ShowDaoTests.java @@ -84,8 +84,8 @@ public DispatchHost createHost() { .addTags("general") .setState(HardwareState.UP) .setFacility("spi") - .putAttributes("freeGpu", String.format("%d", CueUtil.MB512)) - .putAttributes("totalGpu", String.format("%d", CueUtil.MB512)) + .setFreeGpuMem((int) CueUtil.MB512) + .setTotalGpuMem((int) CueUtil.MB512) .build(); DispatchHost dh = hostManager.createHost(host); diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/WhiteboardDaoTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/WhiteboardDaoTests.java index 99449337b..8807514d4 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/WhiteboardDaoTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/WhiteboardDaoTests.java @@ -278,8 +278,8 @@ public RenderHost getRenderHost() { .setCoresPerProc(400) .setState(HardwareState.DOWN) .setFacility("spi") - .putAttributes("freeGpu", String.format("%d", CueUtil.MB512)) - .putAttributes("totalGpu", String.format("%d", CueUtil.MB512)) + .setFreeGpuMem((int) CueUtil.MB512) + .setTotalGpuMem((int) CueUtil.MB512) .build(); return host; } @@ -1246,7 +1246,7 @@ public void getRenderPartition() { jobLauncher.launch(new File("src/test/resources/conf/jobspec/jobspec_dispatch_test.xml")); JobDetail job = 
jobManager.findJobDetail("pipe-dev.cue-testuser_shell_dispatch_test_v1"); - LocalHostAssignment lba = new LocalHostAssignment(800, 8, CueUtil.GB8, 1); + LocalHostAssignment lba = new LocalHostAssignment(800, 8, CueUtil.GB8, 1, 1); bookingManager.createLocalHostAssignment(hd, job, lba); whiteboardDao.getRenderPartition(lba); @@ -1263,7 +1263,7 @@ public void getRenderPartitionsByHost() { jobLauncher.launch(new File("src/test/resources/conf/jobspec/jobspec_dispatch_test.xml")); JobDetail job = jobManager.findJobDetail("pipe-dev.cue-testuser_shell_dispatch_test_v1"); - LocalHostAssignment lba = new LocalHostAssignment(800, 8, CueUtil.GB8, 1); + LocalHostAssignment lba = new LocalHostAssignment(800, 8, CueUtil.GB8, 1, 1); bookingManager.createLocalHostAssignment(hd, job, lba); assertEquals(1, whiteboardDao.getRenderPartitions(hd).getRenderPartitionsCount()); diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/CoreUnitDispatcherGpuJobTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/CoreUnitDispatcherGpuJobTests.java index d99041b65..4cc1c1f03 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/CoreUnitDispatcherGpuJobTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/CoreUnitDispatcherGpuJobTests.java @@ -114,8 +114,8 @@ public void createHost() { .setState(HardwareState.UP) .setFacility("spi") .putAttributes("SP_OS", "Linux") - .putAttributes("freeGpu", String.format("%d", CueUtil.MB512)) - .putAttributes("totalGpu", String.format("%d", CueUtil.MB512)) + .setFreeGpuMem((int) CueUtil.MB512) + .setTotalGpuMem((int) CueUtil.MB512) .build(); hostManager.createHost(host, @@ -153,7 +153,7 @@ public void testDispatchGpuRemovedHostToNonGpuJob() { host.idleMemory = host.idleMemory - Math.min(CueUtil.GB4, host.idleMemory); host.idleCores = host.idleCores - Math.min(100, host.idleCores); - host.idleGpu = 0; + host.idleGpuMemory = 0; List procs = dispatcher.dispatchHost(host, job); assertEquals(0, 
procs.size()); } diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/CoreUnitDispatcherGpuTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/CoreUnitDispatcherGpuTests.java index 9318258ad..0a4f6b74a 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/CoreUnitDispatcherGpuTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/CoreUnitDispatcherGpuTests.java @@ -114,8 +114,8 @@ public void createHost() { .setState(HardwareState.UP) .setFacility("spi") .putAttributes("SP_OS", "Linux") - .putAttributes("freeGpu", String.format("%d", CueUtil.MB512)) - .putAttributes("totalGpu", String.format("%d", CueUtil.MB512)) + .setFreeGpuMem((int) CueUtil.MB512) + .setTotalGpuMem((int) CueUtil.MB512) .build(); hostManager.createHost(host, @@ -153,7 +153,7 @@ public void testDispatchGpuRemovedHostToNonGpuJob() { host.idleMemory = host.idleMemory - Math.min(CueUtil.GB4, host.idleMemory); host.idleCores = host.idleCores - Math.min(100, host.idleCores); - host.idleGpu = 0; + host.idleGpuMemory = 0; List procs = dispatcher.dispatchHost(host, job); assertEquals(1, procs.size()); } @@ -202,17 +202,20 @@ public void testDispatchHostRemoveRestoreGpu() { long idleMemoryOrig = host.idleMemory; int idleCoresOrig = host.idleCores; - long idleGpuOrig = host.idleGpu; + long idleGpuMemoryOrig = host.idleGpuMemory; + int idleGpusOrig = host.idleGpus; host.removeGpu(); - assertEquals(0, host.idleGpu); + assertEquals(0, host.idleGpuMemory); + assertEquals(0, host.idleGpus); assertEquals(idleMemoryOrig - CueUtil.GB4, host.idleMemory); assertEquals(idleCoresOrig - 100, host.idleCores); host.restoreGpu(); assertEquals(idleMemoryOrig, host.idleMemory); assertEquals(idleCoresOrig, host.idleCores); - assertEquals(idleGpuOrig, host.idleGpu); + assertEquals(idleGpuMemoryOrig, host.idleGpuMemory); + assertEquals(idleGpusOrig, host.idleGpus); } @Test @@ -222,7 +225,7 @@ public void dispatchProcToJob() { DispatchHost host = 
getHost(); JobDetail job = getJob(); - host.idleGpu = 0; + host.idleGpuMemory = 0; List procs = dispatcher.dispatchHost(host, job); VirtualProc proc = procs.get(0); dispatcher.dispatchProcToJob(proc, job); diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/CoreUnitDispatcherGpusJobTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/CoreUnitDispatcherGpusJobTests.java new file mode 100644 index 000000000..4972b8f9b --- /dev/null +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/CoreUnitDispatcherGpusJobTests.java @@ -0,0 +1,277 @@ + +/* + * Copyright Contributors to the OpenCue Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + + +package com.imageworks.spcue.test.dispatcher; + +import java.io.File; +import java.util.List; +import javax.annotation.Resource; + +import org.junit.Before; +import org.junit.Test; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.transaction.annotation.Transactional; + +import com.imageworks.spcue.DispatchFrame; +import com.imageworks.spcue.DispatchHost; +import com.imageworks.spcue.LayerDetail; +import com.imageworks.spcue.JobDetail; +import com.imageworks.spcue.VirtualProc; +import com.imageworks.spcue.dao.criteria.FrameSearchFactory; +import com.imageworks.spcue.dao.FrameDao; +import com.imageworks.spcue.dao.LayerDao; +import com.imageworks.spcue.depend.LayerOnLayer; +import com.imageworks.spcue.dispatcher.DispatchSupport; +import com.imageworks.spcue.dispatcher.Dispatcher; +import com.imageworks.spcue.grpc.host.HardwareState; +import com.imageworks.spcue.grpc.job.FrameState; +import com.imageworks.spcue.grpc.report.RenderHost; +import com.imageworks.spcue.service.AdminManager; +import com.imageworks.spcue.service.DependManager; +import com.imageworks.spcue.service.HostManager; +import com.imageworks.spcue.service.JobLauncher; +import com.imageworks.spcue.service.JobManager; +import com.imageworks.spcue.test.TransactionalTest; +import com.imageworks.spcue.util.CueUtil; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +@ContextConfiguration +public class CoreUnitDispatcherGpusJobTests extends TransactionalTest { + + @Resource + JobManager jobManager; + + @Resource + JobLauncher jobLauncher; + + @Resource + HostManager hostManager; + + @Resource + AdminManager adminManager; + + @Resource + Dispatcher dispatcher; + + @Resource + DispatchSupport dispatchSupport; + + @Resource + LayerDao layerDao; + + @Resource + FrameDao frameDao; + + @Resource + FrameSearchFactory frameSearchFactory; + + @Resource + DependManager 
dependManager; + + private static final String HOSTNAME = "beta"; + + private static final String CPU_JOB = "pipe-default-testuser_test_cpu"; + + private static final String GPU_JOB = "pipe-default-testuser_test_gpu"; + + private static final String GPU_OVERBOOK_JOB = "pipe-default-testuser_test_gpu_overbook"; + + @Before + public void launchJob() { + jobLauncher.testMode = true; + jobLauncher.launch( + new File("src/test/resources/conf/jobspec/jobspec_dispatch_gpus_test.xml")); + } + + @Before + public void setTestMode() { + dispatcher.setTestMode(true); + } + + @Before + public void createHost() { + RenderHost host = RenderHost.newBuilder() + .setName(HOSTNAME) + .setBootTime(1192369572) + .setFreeMcp(76020) + .setFreeMem((int) CueUtil.GB8) + .setFreeSwap(20760) + .setLoad(0) + .setTotalMcp(195430) + .setTotalMem(CueUtil.GB8) + .setTotalSwap(CueUtil.GB2) + .setNimbyEnabled(false) + .setNumProcs(40) + .setCoresPerProc(100) + .addTags("test") + .setState(HardwareState.UP) + .setFacility("spi") + .putAttributes("SP_OS", "Linux") + .setNumGpus(8) + .setFreeGpuMem(CueUtil.GB32) + .setTotalGpuMem(CueUtil.GB32) + .build(); + + hostManager.createHost(host, + adminManager.findAllocationDetail("spi", "general")); + } + + public DispatchHost getHost() { + return hostManager.findDispatchHost(HOSTNAME); + } + + @Test + @Transactional + @Rollback(true) + public void testDispatchHost() { + DispatchHost host = getHost(); + + List procs = dispatcher.dispatchHost(host); + // All jobs are paused. procs should be empty. + assertTrue(procs.isEmpty()); + } + + @Test + @Transactional + @Rollback(true) + public void testDispatchCpuJob() { + JobDetail job = jobManager.findJobDetail(CPU_JOB); + jobManager.setJobPaused(job, false); + + DispatchHost host = getHost(); + List procs = dispatcher.dispatchHost(host, job); + // Cuebot doesn't dispatch non-GPU job to GPU host. procs should be empty. 
+ assertTrue(procs.isEmpty()); + } + + @Test + @Transactional + @Rollback(true) + public void testDispatchGpuJob() { + JobDetail job = jobManager.findJobDetail(GPU_JOB); + jobManager.setJobPaused(job, false); + + DispatchHost host = getHost(); + List<VirtualProc> procs = dispatcher.dispatchHost(host, job); + + /* + * The job contains 4 layers. + * - test_gpus_0_layer gpus=0 gpu_memory=1 + * - test_gpu_memory_0_layer gpus=1 gpu_memory=0 + * - test_gpus_1_layer gpus=1 gpu_memory=1 + * - test_gpus_4_layer gpus=4 gpu_memory=7g + * + * Cuebot doesn't dispatch test_gpu_memory_0_layer because gpu_memory is 0. + * Also job_frame_dispatch_max is 2, + * the procs should be test_gpus_0_layer and test_gpus_1_layer. + */ + assertEquals(2, procs.size()); + + VirtualProc proc0 = procs.get(0); + LayerDetail layer0 = layerDao.findLayerDetail(job, "test_gpus_0_layer"); + assertEquals(layer0.id, proc0.layerId); + assertEquals(100, proc0.coresReserved); + assertEquals(3355443, proc0.memoryReserved); + assertEquals(0, proc0.gpusReserved); + assertEquals(1048576, proc0.gpuMemoryReserved); + + VirtualProc proc1 = procs.get(1); + LayerDetail layer1 = layerDao.findLayerDetail(job, "test_gpus_1_layer"); + assertEquals(layer1.id, proc1.layerId); + assertEquals(100, proc1.coresReserved); + assertEquals(3355443, proc1.memoryReserved); + assertEquals(1, proc1.gpusReserved); + assertEquals(1048576, proc1.gpuMemoryReserved); + } + + @Test + @Transactional + @Rollback(true) + public void testDispatchGpuJobWithDependency() { + JobDetail job = jobManager.findJobDetail(GPU_JOB); + LayerDetail dl0 = layerDao.findLayerDetail(job, "test_gpus_0_layer"); + LayerDetail dl1 = layerDao.findLayerDetail(job, "test_gpu_memory_0_layer"); + LayerOnLayer depend = new LayerOnLayer(dl0, dl1); + dependManager.createDepend(depend); + jobManager.setJobPaused(job, false); + + DispatchHost host = getHost(); + List<VirtualProc> procs = dispatcher.dispatchHost(host, job); + + /* + * The job contains 4 layers.
+ * - test_gpus_0_layer gpus=0 gpu_memory=1 + * - test_gpu_memory_0_layer gpus=1 gpu_memory=0 + * - test_gpus_1_layer gpus=1 gpu_memory=1 + * - test_gpus_4_kayer gpus=4 gpu_memory=7g + * + * Cuebot doesn't dispatch test_gpu_memory_0_layer because gpu_memory is 0. + * And test_gpus_0_layer depends on test_gpu_memory_0_layer. + * So the procs should be test_gpus_1_layer and test_gpus_4_layer. + */ + assertEquals(2, procs.size()); + + VirtualProc proc0 = procs.get(0); + LayerDetail layer0 = layerDao.findLayerDetail(job, "test_gpus_1_layer"); + assertEquals(layer0.id, proc0.layerId); + assertEquals(100, proc0.coresReserved); + assertEquals(3355443, proc0.memoryReserved); + assertEquals(1, proc0.gpusReserved); + assertEquals(1048576, proc0.gpuMemoryReserved); + + VirtualProc proc1 = procs.get(1); + LayerDetail layer1 = layerDao.findLayerDetail(job, "test_gpus_4_layer"); + assertEquals(layer1.id, proc1.layerId); + assertEquals(100, proc1.coresReserved); + assertEquals(3355443, proc1.memoryReserved); + assertEquals(4, proc1.gpusReserved); + assertEquals(7340032, proc1.gpuMemoryReserved); + } + + @Test + @Transactional + @Rollback(true) + public void testDispatchGpuOverbookJob() { + JobDetail job = jobManager.findJobDetail(GPU_OVERBOOK_JOB); + jobManager.setJobPaused(job, false); + + DispatchHost host = getHost(); + List procs = dispatcher.dispatchHost(host, job); + + /* + * The job contains 2 layers. + * - test_gpus_6_layer gpus=6 gpu_memory=1 + * - test_gpus_3_layer gpus=3 gpu_memory=1 + * the procs should be only test_gpus_6_layer since host only has 8 GPUs. 
+ */ + assertEquals(1, procs.size()); + + VirtualProc proc0 = procs.get(0); + LayerDetail layer0 = layerDao.findLayerDetail(job, "test_gpus_6_layer"); + assertEquals(layer0.id, proc0.layerId); + assertEquals(100, proc0.coresReserved); + assertEquals(3355443, proc0.memoryReserved); + assertEquals(6, proc0.gpusReserved); + assertEquals(1048576, proc0.gpuMemoryReserved); + } +} + diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/DispatchSupportTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/DispatchSupportTests.java index baa2353fb..98c60fd9c 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/DispatchSupportTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/DispatchSupportTests.java @@ -110,8 +110,8 @@ public void createHost() { .setState(HardwareState.UP) .setFacility("spi") .putAttributes("SP_OS", "Linux") - .putAttributes("freeGpu", String.format("%d", CueUtil.MB512)) - .putAttributes("totalGpu", String.format("%d", CueUtil.MB512)) + .setFreeGpuMem((int) CueUtil.MB512) + .setTotalGpuMem((int) CueUtil.MB512) .build(); hostManager.createHost(host, diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/FrameCompleteHandlerTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/FrameCompleteHandlerTests.java new file mode 100644 index 000000000..8888e0453 --- /dev/null +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/FrameCompleteHandlerTests.java @@ -0,0 +1,236 @@ + +/* + * Copyright Contributors to the OpenCue Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + + +package com.imageworks.spcue.test.dispatcher; + +import java.io.File; +import java.util.List; +import javax.annotation.Resource; + +import org.junit.Before; +import org.junit.Test; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.transaction.annotation.Transactional; + +import com.imageworks.spcue.DispatchHost; +import com.imageworks.spcue.FrameInterface; +import com.imageworks.spcue.JobDetail; +import com.imageworks.spcue.LayerDetail; +import com.imageworks.spcue.VirtualProc; +import com.imageworks.spcue.dao.LayerDao; +import com.imageworks.spcue.dispatcher.Dispatcher; +import com.imageworks.spcue.dispatcher.FrameCompleteHandler; +import com.imageworks.spcue.grpc.host.HardwareState; +import com.imageworks.spcue.grpc.report.FrameCompleteReport; +import com.imageworks.spcue.grpc.report.RenderHost; +import com.imageworks.spcue.grpc.report.RunningFrameInfo; +import com.imageworks.spcue.service.AdminManager; +import com.imageworks.spcue.service.HostManager; +import com.imageworks.spcue.service.JobLauncher; +import com.imageworks.spcue.service.JobManager; +import com.imageworks.spcue.test.TransactionalTest; +import com.imageworks.spcue.util.CueUtil; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +@ContextConfiguration +public class FrameCompleteHandlerTests extends TransactionalTest { + + @Resource + AdminManager adminManager; + + @Resource + 
FrameCompleteHandler frameCompleteHandler; + + @Resource + HostManager hostManager; + + @Resource + JobLauncher jobLauncher; + + @Resource + JobManager jobManager; + + @Resource + LayerDao layerDao; + + @Resource + Dispatcher dispatcher; + + private static final String HOSTNAME = "beta"; + + @Before + public void setTestMode() { + dispatcher.setTestMode(true); + } + + @Before + public void launchJob() { + jobLauncher.testMode = true; + jobLauncher.launch( + new File("src/test/resources/conf/jobspec/jobspec_gpus_test.xml")); + } + + @Before + public void createHost() { + RenderHost host = RenderHost.newBuilder() + .setName(HOSTNAME) + .setBootTime(1192369572) + .setFreeMcp(76020) + .setFreeMem((int) CueUtil.GB8) + .setFreeSwap(20760) + .setLoad(0) + .setTotalMcp(195430) + .setTotalMem(CueUtil.GB8) + .setTotalSwap(CueUtil.GB2) + .setNimbyEnabled(false) + .setNumProcs(40) + .setCoresPerProc(100) + .setState(HardwareState.UP) + .setFacility("spi") + .putAttributes("SP_OS", "Linux") + .setNumGpus(8) + .setFreeGpuMem(CueUtil.GB16 * 8) + .setTotalGpuMem(CueUtil.GB16 * 8) + .build(); + + hostManager.createHost(host, + adminManager.findAllocationDetail("spi", "general")); + } + + public DispatchHost getHost() { + return hostManager.findDispatchHost(HOSTNAME); + } + + @Test + @Transactional + @Rollback(true) + public void testGpuReport() { + JobDetail job = jobManager.findJobDetail("pipe-default-testuser_test0"); + LayerDetail layer = layerDao.findLayerDetail(job, "layer0"); + jobManager.setJobPaused(job, false); + + DispatchHost host = getHost(); + List procs = dispatcher.dispatchHost(host); + assertEquals(1, procs.size()); + VirtualProc proc = procs.get(0); + + assertEquals(7, host.idleGpus); + assertEquals(CueUtil.GB16 * 8 - CueUtil.GB, host.idleGpuMemory); + + RunningFrameInfo info = RunningFrameInfo.newBuilder() + .setJobId(proc.getJobId()) + .setLayerId(proc.getLayerId()) + .setFrameId(proc.getFrameId()) + .setResourceId(proc.getProcId()) + .build(); + 
FrameCompleteReport report = FrameCompleteReport.newBuilder() + .setFrame(info) + .setExitStatus(0) + .build(); + frameCompleteHandler.handleFrameCompleteReport(report); + + assertTrue(jobManager.isLayerComplete(layer)); + assertTrue(jobManager.isJobComplete(job)); + } + + @Test + @Transactional + @Rollback(true) + public void testGpuReportMultiple() { + JobDetail job0 = jobManager.findJobDetail("pipe-default-testuser_test0"); + LayerDetail layer0_0 = layerDao.findLayerDetail(job0, "layer0"); + jobManager.setJobPaused(job0, false); + + JobDetail job1 = jobManager.findJobDetail("pipe-default-testuser_test1"); + LayerDetail layer1_0 = layerDao.findLayerDetail(job1, "layer0"); + jobManager.setJobPaused(job1, false); + + DispatchHost host = getHost(); + List procs = dispatcher.dispatchHost(host); + assertEquals(2, procs.size()); + + assertEquals(4, host.idleGpus); + assertEquals(CueUtil.GB16 * 8 - CueUtil.GB2, host.idleGpuMemory); + + for (VirtualProc proc : procs) { + RunningFrameInfo info = RunningFrameInfo.newBuilder() + .setJobId(proc.getJobId()) + .setLayerId(proc.getLayerId()) + .setFrameId(proc.getFrameId()) + .setResourceId(proc.getProcId()) + .build(); + FrameCompleteReport report = FrameCompleteReport.newBuilder() + .setFrame(info) + .setExitStatus(0) + .build(); + frameCompleteHandler.handleFrameCompleteReport(report); + } + + assertTrue(jobManager.isLayerComplete(layer0_0)); + assertTrue(jobManager.isJobComplete(job0)); + assertTrue(jobManager.isLayerComplete(layer1_0)); + assertTrue(jobManager.isJobComplete(job1)); + } + + @Test + @Transactional + @Rollback(true) + public void testGpuReportOver() { + JobDetail job1 = jobManager.findJobDetail("pipe-default-testuser_test1"); + LayerDetail layer1_0 = layerDao.findLayerDetail(job1, "layer0"); + jobManager.setJobPaused(job1, false); + + JobDetail job2 = jobManager.findJobDetail("pipe-default-testuser_test2"); + LayerDetail layer2_0 = layerDao.findLayerDetail(job2, "layer0"); + jobManager.setJobPaused(job2, 
false); + + DispatchHost host = getHost(); + List procs = dispatcher.dispatchHost(host); + assertEquals(1, procs.size()); + + assertTrue(host.idleGpus == 5 || host.idleGpus == 2); + assertEquals(CueUtil.GB16 * 8 - CueUtil.GB, host.idleGpuMemory); + + for (VirtualProc proc : procs) { + RunningFrameInfo info = RunningFrameInfo.newBuilder() + .setJobId(proc.getJobId()) + .setLayerId(proc.getLayerId()) + .setFrameId(proc.getFrameId()) + .setResourceId(proc.getProcId()) + .build(); + FrameCompleteReport report = FrameCompleteReport.newBuilder() + .setFrame(info) + .setExitStatus(0) + .build(); + frameCompleteHandler.handleFrameCompleteReport(report); + } + + assertEquals(1, + (jobManager.isLayerComplete(layer1_0) ? 1 : 0) + + (jobManager.isLayerComplete(layer2_0) ? 1 : 0)); + assertEquals(1, + (jobManager.isJobComplete(job1) ? 1 : 0) + + (jobManager.isJobComplete(job2) ? 1 : 0)); + } +} + diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/HostReportHandlerGpuTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/HostReportHandlerGpuTests.java new file mode 100644 index 000000000..dee9d0792 --- /dev/null +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/HostReportHandlerGpuTests.java @@ -0,0 +1,124 @@ + +/* + * Copyright Contributors to the OpenCue Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + + +package com.imageworks.spcue.test.dispatcher; + +import javax.annotation.Resource; + +import org.junit.Before; +import org.junit.Test; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.transaction.annotation.Transactional; + +import com.imageworks.spcue.DispatchHost; +import com.imageworks.spcue.dispatcher.Dispatcher; +import com.imageworks.spcue.dispatcher.HostReportHandler; +import com.imageworks.spcue.grpc.host.HardwareState; +import com.imageworks.spcue.grpc.host.LockState; +import com.imageworks.spcue.grpc.report.CoreDetail; +import com.imageworks.spcue.grpc.report.HostReport; +import com.imageworks.spcue.grpc.report.RenderHost; +import com.imageworks.spcue.service.AdminManager; +import com.imageworks.spcue.service.HostManager; +import com.imageworks.spcue.test.TransactionalTest; +import com.imageworks.spcue.util.CueUtil; + +import static org.junit.Assert.assertEquals; + +@ContextConfiguration +public class HostReportHandlerGpuTests extends TransactionalTest { + + @Resource + AdminManager adminManager; + + @Resource + HostManager hostManager; + + @Resource + HostReportHandler hostReportHandler; + + @Resource + Dispatcher dispatcher; + + private static final String HOSTNAME = "beta"; + + @Before + public void setTestMode() { + dispatcher.setTestMode(true); + } + + private static CoreDetail getCoreDetail(int total, int idle, int booked, int locked) { + return CoreDetail.newBuilder() + .setTotalCores(total) + .setIdleCores(idle) + .setBookedCores(booked) + .setLockedCores(locked) + .build(); + } + + private DispatchHost getHost() { + return hostManager.findDispatchHost(HOSTNAME); + } + + private static RenderHost getRenderHost() { + return RenderHost.newBuilder() + .setName(HOSTNAME) + .setBootTime(1192369572) + .setFreeMcp(76020) + .setFreeMem(53500) + .setFreeSwap(20760) + .setLoad(0) + .setTotalMcp(195430) + .setTotalMem(1048576L * 4096) + 
.setTotalSwap(20960) + .setNimbyEnabled(false) + .setNumProcs(2) + .setCoresPerProc(100) + .addTags("test") + .setState(HardwareState.UP) + .setFacility("spi") + .putAttributes("SP_OS", "Linux") + .setNumGpus(64) + .setFreeGpuMem(1048576L * 2000) + .setTotalGpuMem(1048576L * 2048) + .build(); + } + + @Test + @Transactional + @Rollback(true) + public void testHandleHostReport() { + CoreDetail cores = getCoreDetail(200, 200, 0, 0); + HostReport report = HostReport.newBuilder() + .setHost(getRenderHost()) + .setCoreInfo(cores) + .build(); + + hostReportHandler.handleHostReport(report, true); + DispatchHost host = getHost(); + assertEquals(LockState.OPEN, host.lockState); + assertEquals(4294443008L, host.memory); + assertEquals(64, host.gpus); + assertEquals(64, host.idleGpus); + assertEquals(1048576L * 2048, host.gpuMemory); + assertEquals(2147483648L, host.idleGpuMemory); + } +} + diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/HostReportHandlerTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/HostReportHandlerTests.java index f24375cce..f4d706cdb 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/HostReportHandlerTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/HostReportHandlerTests.java @@ -104,8 +104,8 @@ private static RenderHost getRenderHost() { .setState(HardwareState.UP) .setFacility("spi") .putAttributes("SP_OS", "Linux") - .putAttributes("freeGpu", String.format("%d", CueUtil.MB512)) - .putAttributes("totalGpu", String.format("%d", CueUtil.MB512)) + .setFreeGpuMem((int) CueUtil.MB512) + .setTotalGpuMem((int) CueUtil.MB512) .build(); } diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/LocalDispatcherTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/LocalDispatcherTests.java index 88f195111..97a270085 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/LocalDispatcherTests.java +++
b/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/LocalDispatcherTests.java @@ -109,8 +109,8 @@ public void createHost() { .setFacility("spi") .addTags("test") .putAttributes("SP_OS", "Linux") - .putAttributes("freeGpu", String.format("%d", CueUtil.MB512)) - .putAttributes("totalGpu", String.format("%d", CueUtil.MB512)) + .setFreeGpuMem((int) CueUtil.MB512) + .setTotalGpuMem((int) CueUtil.MB512) .build(); hostManager.createHost(host, @@ -171,7 +171,7 @@ public void testDispatchHostAutoDetectLayer() { JobDetail job = getJob(); LayerInterface layer = jobManager.getLayers(job).get(0); - LocalHostAssignment lba = new LocalHostAssignment(300, 1, CueUtil.GB8, 1); + LocalHostAssignment lba = new LocalHostAssignment(300, 1, CueUtil.GB8, 0, 0); bookingManager.createLocalHostAssignment(host, layer, lba); List procs = localDispatcher.dispatchHost(host); @@ -205,7 +205,7 @@ public void testDispatchHostAutoDetectFrame() { LayerInterface layer = jobManager.getLayers(job).get(0); FrameInterface frame = jobManager.findFrame(layer, 5); - LocalHostAssignment lba = new LocalHostAssignment(200, 1, CueUtil.GB8, 1); + LocalHostAssignment lba = new LocalHostAssignment(200, 1, CueUtil.GB8, 0, 0); bookingManager.createLocalHostAssignment(host, frame, lba); List procs = localDispatcher.dispatchHost(host); @@ -228,7 +228,7 @@ public void testDispatchHostToLocalJob() { DispatchHost host = getHost(); JobDetail job = getJob(); - LocalHostAssignment lba = new LocalHostAssignment(200, 1, CueUtil.GB8, 1); + LocalHostAssignment lba = new LocalHostAssignment(200, 1, CueUtil.GB8, 0, 0); bookingManager.createLocalHostAssignment(host, job, lba); List procs = localDispatcher.dispatchHost(host, job); @@ -258,7 +258,7 @@ public void testDispatchHostToLocalLayer() { JobDetail job = getJob(); LayerInterface layer = jobManager.getLayers(job).get(0); - LocalHostAssignment lba = new LocalHostAssignment(300, 1, CueUtil.GB8, 1); + LocalHostAssignment lba = new LocalHostAssignment(300, 1, CueUtil.GB8, 
0, 0); bookingManager.createLocalHostAssignment(host, layer, lba); List procs = localDispatcher.dispatchHost(host, layer); @@ -292,7 +292,7 @@ public void testDispatchHostToLocalFrame() { LayerInterface layer = jobManager.getLayers(job).get(0); FrameInterface frame = jobManager.findFrame(layer, 5); - LocalHostAssignment lba = new LocalHostAssignment(200, 1, CueUtil.GB8, 1); + LocalHostAssignment lba = new LocalHostAssignment(200, 1, CueUtil.GB8, 0, 0); bookingManager.createLocalHostAssignment(host, frame, lba); List procs = localDispatcher.dispatchHost(host, frame); @@ -317,7 +317,7 @@ public void testDispatchHostToLocalFrameTwice() { LayerInterface layer = jobManager.getLayers(job).get(0); FrameInterface frame = jobManager.findFrame(layer, 5); - LocalHostAssignment lba = new LocalHostAssignment(200, 1, CueUtil.GB8, 1); + LocalHostAssignment lba = new LocalHostAssignment(200, 1, CueUtil.GB8, 0, 0); bookingManager.createLocalHostAssignment(host, frame, lba); List procs = localDispatcher.dispatchHost(host, frame); @@ -345,7 +345,7 @@ public void testDispatchHostToLocalJobDeficit() { DispatchHost host = getHost(); JobDetail job = getJob(); - LocalHostAssignment lba = new LocalHostAssignment(800, 8, CueUtil.GB8, 1); + LocalHostAssignment lba = new LocalHostAssignment(800, 8, CueUtil.GB8, 0, 0); bookingManager.createLocalHostAssignment(host, job, lba); List procs = localDispatcher.dispatchHost(host, job); @@ -365,7 +365,7 @@ public void testDispatchHostToLocalJobDeficit() { * Now, lower our min cores to create a deficit. 
*/ assertFalse(bookingManager.hasResourceDeficit(host)); - bookingManager.setMaxResources(lba, 700, 0, 1); + bookingManager.setMaxResources(lba, 700, 0, 0, 0); assertTrue(bookingManager.hasResourceDeficit(host)); } } diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/TestBookingQueue.java b/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/TestBookingQueue.java index a34cc1d3e..7502e0687 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/TestBookingQueue.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/dispatcher/TestBookingQueue.java @@ -74,8 +74,8 @@ public void create() { .setState(HardwareState.UP) .setFacility("spi") .addAllTags(ImmutableList.of("mcore", "4core", "8g")) - .putAttributes("freeGpu", String.format("%d", CueUtil.MB512)) - .putAttributes("totalGpu", String.format("%d", CueUtil.MB512)) + .setFreeGpuMem((int) CueUtil.MB512) + .setTotalGpuMem((int) CueUtil.MB512) .build(); hostManager.createHost(host); diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/service/BookingManagerTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/service/BookingManagerTests.java index e94705898..9b6813c33 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/service/BookingManagerTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/service/BookingManagerTests.java @@ -125,8 +125,8 @@ public DispatchHost createHost() { .setState(HardwareState.UP) .setFacility("spi") .addTags("general") - .putAttributes("freeGpu", String.format("%d", CueUtil.MB512)) - .putAttributes("totalGpu", String.format("%d", CueUtil.MB512)) + .setFreeGpuMem((int) CueUtil.MB512) + .setTotalGpuMem((int) CueUtil.MB512) .build(); DispatchHost dh = hostManager.createHost(host); @@ -319,23 +319,23 @@ public void setMaxResources() { /* * Lower the cores. 
*/ - bookingManager.setMaxResources(lja, 100, CueUtil.GB2, CueUtil.MB256); + bookingManager.setMaxResources(lja, 100, CueUtil.GB2, 1, CueUtil.MB256); LocalHostAssignment l2 = bookingManager.getLocalHostAssignment(lja.id); assertEquals(100, l2.getMaxCoreUnits()); assertEquals(CueUtil.GB2, l2.getMaxMemory()); - assertEquals(CueUtil.MB256, l2.getMaxGpu()); + assertEquals(CueUtil.MB256, l2.getMaxGpuMemory()); /* * Raise the values. */ - bookingManager.setMaxResources(lja, 200, CueUtil.GB4, CueUtil.MB512); + bookingManager.setMaxResources(lja, 200, CueUtil.GB4, 1, CueUtil.MB512); l2 = bookingManager.getLocalHostAssignment(lja.id); assertEquals(200, l2.getMaxCoreUnits()); assertEquals(CueUtil.GB4, l2.getMaxMemory()); - assertEquals(CueUtil.MB512, l2.getMaxGpu()); + assertEquals(CueUtil.MB512, l2.getMaxGpuMemory()); } @Test @@ -351,7 +351,7 @@ public void setIllegalMaxResources() { LocalHostAssignment lja = new LocalHostAssignment(); lja.setMaxCoreUnits(200); lja.setMaxMemory(CueUtil.GB4); - lja.setMaxGpu(CueUtil.MB512); + lja.setMaxGpuMemory(CueUtil.MB512); lja.setThreads(2); bookingManager.createLocalHostAssignment(h, j, lja); @@ -359,7 +359,7 @@ public void setIllegalMaxResources() { /* * Raise the cores too high */ - bookingManager.setMaxResources(lja, 800, CueUtil.GB2, 0); + bookingManager.setMaxResources(lja, 800, CueUtil.GB2, 0, 0); } @Test diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/service/HostManagerTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/service/HostManagerTests.java index ed89219da..cf86e5362 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/service/HostManagerTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/service/HostManagerTests.java @@ -114,8 +114,8 @@ public DispatchHost createHost() { .setState(HardwareState.UP) .setFacility("spi") .addAllTags(ImmutableList.of("linux", "64bit")) - .putAttributes("freeGpu", "512") - .putAttributes("totalGpu", "512") + .setFreeGpuMem((int) CueUtil.MB512) + 
.setTotalGpuMem((int) CueUtil.MB512) .build(); hostDao.insertRenderHost(host, diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/service/JobManagerTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/service/JobManagerTests.java index b2446fe20..3be56bf06 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/service/JobManagerTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/service/JobManagerTests.java @@ -463,7 +463,7 @@ public void optimizeLayer() { .stream() .limit(5) .forEach(frame -> frameDao.updateFrameState(frame, FrameState.SUCCEEDED)); - layerDao.updateUsage(layer, new ResourceUsage(100, 3500 * 5), 0); + layerDao.updateUsage(layer, new ResourceUsage(100, 3500 * 5, 0), 0); // Test to make sure our optimization jobManager.optimizeLayer(layer, 100, CueUtil.MB512, 120); diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/service/JobSpecTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/service/JobSpecTests.java index e68daa551..d3f94dce7 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/service/JobSpecTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/service/JobSpecTests.java @@ -29,6 +29,8 @@ import org.springframework.test.context.junit4.AbstractTransactionalJUnit4SpringContextTests; import org.springframework.test.context.support.AnnotationConfigContextLoader; +import com.imageworks.spcue.BuildableJob; +import com.imageworks.spcue.LayerDetail; import com.imageworks.spcue.SpecBuilderException; import com.imageworks.spcue.config.TestAppConfig; import com.imageworks.spcue.service.JobLauncher; @@ -95,4 +97,21 @@ public void testParseInvalidShot() { "Shot names must be alpha numeric, no dashes or punctuation."); } } + + @Test + public void testParseGpuSuccess() { + String xml = readJobSpec("jobspec_1_12.xml"); + JobSpec spec = jobLauncher.parse(xml); + assertEquals(spec.getDoc().getDocType().getPublicID(), + "SPI Cue Specification Language"); + 
assertEquals(spec.getDoc().getDocType().getSystemID(), + "http://localhost:8080/spcue/dtd/cjsl-1.12.dtd"); + assertEquals(spec.getJobs().size(), 1); + BuildableJob job = spec.getJobs().get(0); + assertEquals(job.detail.name, "testing-default-testuser_test"); + LayerDetail layer = job.getBuildableLayers().get(0).layerDetail; + assertEquals(layer.getMinimumGpus(), 1); + assertEquals(layer.getMinimumGpuMemory(), 1048576); + } + } diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/service/OwnerManagerTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/service/OwnerManagerTests.java index e11bab099..224dcac75 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/service/OwnerManagerTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/service/OwnerManagerTests.java @@ -82,8 +82,8 @@ public DispatchHost createHost() { .setState(HardwareState.UP) .setFacility("spi") .addTags("general") - .putAttributes("freeGpu", String.format("%d", CueUtil.MB512)) - .putAttributes("totalGpu", String.format("%d", CueUtil.MB512)) + .setFreeGpuMem((int) CueUtil.MB512) + .setTotalGpuMem((int) CueUtil.MB512) .build(); DispatchHost dh = hostManager.createHost(host); diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/service/ServiceManagerTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/service/ServiceManagerTests.java index 3573cbe59..5354d763e 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/service/ServiceManagerTests.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/service/ServiceManagerTests.java @@ -84,7 +84,7 @@ public void testCreateService() { s.name = "dillweed"; s.minCores = 100; s.minMemory = CueUtil.GB4; - s.minGpu = CueUtil.GB2; + s.minGpuMemory = CueUtil.GB2; s.threadable = false; s.timeout = 0; s.timeout_llu = 0; @@ -105,7 +105,7 @@ public void testOverrideExistingService() { s.timeout = 10; s.timeout_llu = 10; s.minMemory = CueUtil.GB8; - s.minGpu = CueUtil.GB2; + s.minGpuMemory = CueUtil.GB2; s.threadable 
= false; s.tags.addAll(Sets.newHashSet("general")); s.showId = "00000000-0000-0000-0000-000000000000"; @@ -118,7 +118,7 @@ public void testOverrideExistingService() { assertEquals(10, newService.timeout); assertEquals(10, newService.timeout_llu); assertEquals(CueUtil.GB8, newService.minMemory); - assertEquals(CueUtil.GB2, newService.minGpu); + assertEquals(CueUtil.GB2, newService.minGpuMemory); assertFalse(newService.threadable); assertTrue(s.tags.contains("general")); @@ -127,7 +127,7 @@ public void testOverrideExistingService() { // now check the original is back. newService = serviceManager.getService("arnold", s.showId); assertEquals(100, newService.minCores); - assertEquals(0, newService.minGpu); + assertEquals(0, newService.minGpuMemory); } @Test @@ -151,7 +151,7 @@ public void testJobLaunch() { assertEquals(shell.minCores, shellLayer.minimumCores); assertEquals(shell.minMemory, shellLayer.minimumMemory); - assertEquals(shell.minGpu, shellLayer.minimumGpu); + assertEquals(shell.minGpuMemory, shellLayer.minimumGpuMemory); assertFalse(shellLayer.isThreadable); assertEquals(shell.tags, shellLayer.tags); assertThat(shellLayer.services, contains("shell", "katana", "unknown")); @@ -164,7 +164,7 @@ public void testJobLaunch() { assertEquals(cuda.minCores, cudaLayer.minimumCores); assertEquals(cuda.minMemory, cudaLayer.minimumMemory); - assertEquals(cuda.minGpu, cudaLayer.minimumGpu); + assertEquals(cuda.minGpuMemory, cudaLayer.minimumGpuMemory); assertFalse(cudaLayer.isThreadable); assertEquals(cuda.tags, cudaLayer.tags); assertThat(cudaLayer.services, contains("cuda")); diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/util/CueUtilTester.java b/cuebot/src/test/java/com/imageworks/spcue/test/util/CueUtilTester.java index 9bfc19e41..d3a4abe76 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/test/util/CueUtilTester.java +++ b/cuebot/src/test/java/com/imageworks/spcue/test/util/CueUtilTester.java @@ -154,9 +154,9 @@ public void 
testCoreUnitsToCoresWithScale() { @Test public void testBuildProcName() { - assertEquals("drack100/1.00", CueUtil.buildProcName("drack100",100)); - assertEquals("drack100/1.40", CueUtil.buildProcName("drack100",140)); - assertEquals("drack100/2.01", CueUtil.buildProcName("drack100",201)); + assertEquals("drack100/1.00/1", CueUtil.buildProcName("drack100",100,1)); + assertEquals("drack100/1.40/0", CueUtil.buildProcName("drack100",140,0)); + assertEquals("drack100/2.01/2", CueUtil.buildProcName("drack100",201,2)); } @Test diff --git a/cuebot/src/test/resources/conf/ddl/postgres/test_data.sql b/cuebot/src/test/resources/conf/ddl/postgres/test_data.sql index b5596b91d..14c56afcc 100644 --- a/cuebot/src/test/resources/conf/ddl/postgres/test_data.sql +++ b/cuebot/src/test/resources/conf/ddl/postgres/test_data.sql @@ -102,7 +102,7 @@ Insert into SERVICE (PK_SERVICE,STR_NAME,B_THREADABLE,INT_CORES_MIN,INT_MEM_MIN, Insert into SERVICE (PK_SERVICE,STR_NAME,B_THREADABLE,INT_CORES_MIN,INT_MEM_MIN,STR_TAGS) values ('AAAAAAAA-AAAA-AAAA-AAAA-AAAAAAAAAA15','makemovie',false,50,1048576,'util') -Insert into SERVICE (PK_SERVICE,STR_NAME,B_THREADABLE,INT_CORES_MIN,INT_MEM_MIN,STR_TAGS,INT_CORES_MAX,INT_GPU_MIN) values ('488c75f0-eae4-4dd0-83e0-29b982adbbff','cuda',true,100,3354624,'cuda',0,262144) +Insert into SERVICE (PK_SERVICE,STR_NAME,B_THREADABLE,INT_CORES_MIN,INT_MEM_MIN,STR_TAGS,INT_CORES_MAX,INT_GPU_MEM_MIN) values ('488c75f0-eae4-4dd0-83e0-29b982adbbff','cuda',true,100,3354624,'cuda',0,262144) Insert into CONFIG (PK_CONFIG,STR_KEY,INT_VALUE,LONG_VALUE,STR_VALUE,B_VALUE) values ('00000000-0000-0000-0000-000000000005','MAX_FRAME_RETRIES',16,0,null,false) diff --git a/cuebot/src/test/resources/conf/dtd/cjsl-1.12.dtd b/cuebot/src/test/resources/conf/dtd/cjsl-1.12.dtd new file mode 100644 index 000000000..222e04cfc --- /dev/null +++ b/cuebot/src/test/resources/conf/dtd/cjsl-1.12.dtd @@ -0,0 +1,97 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/cuebot/src/test/resources/conf/jobspec/jobspec_1_12.xml b/cuebot/src/test/resources/conf/jobspec/jobspec_1_12.xml new file mode 100644 index 000000000..65036370b --- /dev/null +++ b/cuebot/src/test/resources/conf/jobspec/jobspec_1_12.xml @@ -0,0 +1,49 @@ + + + + + + + + + local + testing + default + testuser + 9860 + + + False + 2 + False + + + + echo $CUE_GPU_CORES + 1-10 + 1 + 1 + 1 + + + shell + + + + + + diff --git a/cuebot/src/test/resources/conf/jobspec/jobspec_dispatch_gpus_test.xml b/cuebot/src/test/resources/conf/jobspec/jobspec_dispatch_gpus_test.xml new file mode 100644 index 000000000..dd3ce459b --- /dev/null +++ b/cuebot/src/test/resources/conf/jobspec/jobspec_dispatch_gpus_test.xml @@ -0,0 +1,133 @@ + + + + + + + + + spi + pipe + default + testuser + 9860 + + + True + 2 + False + + + + echo $CUE_GPU_CORES + 1-10 + 1 + 1 + + + shell + + + + + echo $CUE_GPU_CORES + 1-10 + 1 + 1 + + + shell + + + + + echo $CUE_GPU_CORES + 1-10 + 1 + 1 + 1 + + + shell + + + + + echo $CUE_GPU_CORES + 1-10 + 1 + 4 + 7g + + + shell + + + + + + + True + 2 + False + + + + echo $CUE_GPU_CORES + 1-10 + 1 + 6 + 1 + + + shell + + + + + echo $CUE_GPU_CORES + 1-10 + 1 + 3 + 7g + + + shell + + + + + + + True + 2 + False + + + + echo CPU + 1-10 + 1 + + + shell + + + + + + diff --git a/cuebot/src/test/resources/conf/jobspec/jobspec_gpus_test.xml b/cuebot/src/test/resources/conf/jobspec/jobspec_gpus_test.xml new file mode 100644 index 000000000..3b6b00ae7 --- /dev/null +++ b/cuebot/src/test/resources/conf/jobspec/jobspec_gpus_test.xml @@ -0,0 +1,76 @@ + + + + + + + + + spi + pipe + default + testuser + 9860 + + + True + + + true + 0 + 1 + 1 + 1 + + shell + + + + + + + True + + + true + 0 + 1 + 3 + 1 + + shell + + + + + + + True + + + true + 0 + 1 + 6 + 1 + + shell + + + + + diff --git a/cuegui/cuegui/CueJobMonitorTree.py b/cuegui/cuegui/CueJobMonitorTree.py index 8d1307771..3c8fe9633 100644 --- 
a/cuegui/cuegui/CueJobMonitorTree.py +++ b/cuegui/cuegui/CueJobMonitorTree.py @@ -98,53 +98,71 @@ def __init__(self, parent): data=lambda job: "%.02f" % job.data.job_stats.reserved_cores, sort=lambda job: job.data.job_stats.reserved_cores, tip="The number of reserved cores.") - self.addColumn("Wait", 45, id=6, + self.addColumn("Gpus", 55, id=6, + data=lambda job: "%d" % job.data.job_stats.reserved_gpus, + sort=lambda job: job.data.job_stats.reserved_gpus, + tip="The number of reserved gpus.") + self.addColumn("Wait", 45, id=7, data=lambda job: job.data.job_stats.waiting_frames, sort=lambda job: job.data.job_stats.waiting_frames, tip="The number of waiting frames.") - self.addColumn("Depend", 55, id=7, + self.addColumn("Depend", 55, id=8, data=lambda job: job.data.job_stats.depend_frames, sort=lambda job: job.data.job_stats.depend_frames, tip="The number of dependent frames.") - self.addColumn("Total", 50, id=8, + self.addColumn("Total", 50, id=9, data=lambda job: job.data.job_stats.total_frames, sort=lambda job: job.data.job_stats.total_frames, tip="The total number of frames.") - self.addColumn("_Booking Bar", 150, id=9, + self.addColumn("_Booking Bar", 150, id=10, delegate=cuegui.ItemDelegate.JobBookingBarDelegate) - self.addColumn("Min", 38, id=10, + self.addColumn("Min", 38, id=11, data=lambda job: "%.0f" % job.data.min_cores, sort=lambda job: job.data.min_cores, tip="The minimum number of running cores that the cuebot\n" "will try to maintain.") - self.addColumn("Max", 38, id=11, + self.addColumn("Max", 38, id=12, data=lambda job: "%.0f" % job.data.max_cores, sort=lambda job: job.data.max_cores, tip="The maximum number of running cores that the cuebot\n" "will allow.") + self.addColumn("Min Gpus", 38, id=13, + data=lambda job: "%d" % job.data.min_gpus, + sort=lambda job: job.data.min_gpus, + tip="The minimum number of running gpus that the cuebot\n" + "will try to maintain.") + self.addColumn("Max Gpus", 38, id=14, + data=lambda job: "%d" % job.data.max_gpus, 
+ sort=lambda job: job.data.max_gpus, + tip="The maximum number of running gpus that the cuebot\n" + "will allow.") self.addColumn( - "Age", 50, id=12, + "Age", 50, id=15, data=lambda job: cuegui.Utils.secondsToHHHMM(self.currtime - job.data.start_time), sort=lambda job: self.currtime - job.data.start_time, tip="The HOURS:MINUTES since the job was launched.") - self.addColumn("Pri", 30, id=13, + self.addColumn("Pri", 30, id=16, data=lambda job: job.data.priority, sort=lambda job: job.data.priority, tip="The job priority. The cuebot uses this as a suggestion\n" "to determine what job needs the next available matching\n" "resource.") - self.addColumn("ETA", 65, id=14, + self.addColumn("ETA", 65, id=17, data=lambda job: "", tip="(Inacurate and disabled until a better solution exists)\n" "A very rough estimate of the number of HOURS:MINUTES\n" "it will be before the entire job is done.") - self.addColumn("MaxRss", 60, id=15, + self.addColumn("MaxRss", 60, id=18, data=lambda job: cuegui.Utils.memoryToString(job.data.job_stats.max_rss), sort=lambda job: job.data.job_stats.max_rss, tip="The most memory used at one time by any single frame.") - self.addColumn("_Blank", 20, id=16, + self.addColumn("MaxGpuMem", 60, id=19, + data=lambda job: cuegui.Utils.memoryToString(job.data.job_stats.max_gpu_mem), + sort=lambda job: job.data.job_stats.max_gpu_mem, + tip="The most gpu memory used at one time by any single frame.") + self.addColumn("_Blank", 20, id=20, tip="Spacer") - self.addColumn("Progress", 0, id=17, + self.addColumn("Progress", 0, id=21, delegate=cuegui.ItemDelegate.JobThinProgressBarDelegate, tip="A visual overview of the job progress.\n" "Green \t is succeeded\n" @@ -164,23 +182,31 @@ def __init__(self, parent): self.addColumn("", 0, id=5, data=lambda group: "%.2f" % group.data.stats.reserved_cores) self.addColumn("", 0, id=6, + data=lambda group: "%d" % group.data.stats.reserved_gpus) + self.addColumn("", 0, id=7, data=lambda group: group.data.stats.waiting_frames) 
- self.addColumn("", 0, id=7) self.addColumn("", 0, id=8) - self.addColumn("", 0, id=9, - data=lambda group: (group.data.min_cores or "")) + self.addColumn("", 0, id=9) self.addColumn("", 0, id=10, + data=lambda group: (group.data.min_cores or "")) + self.addColumn("", 0, id=11, data=lambda group: ( group.data.max_cores > 0 and group.data.max_cores or "")) - self.addColumn("", 0, id=11) - self.addColumn("", 0, id=12) - self.addColumn("", 0, id=13) + self.addColumn("", 0, id=12, + data=lambda group: (group.data.min_gpus or "")) + self.addColumn("", 0, id=13, + data=lambda group: ( + group.data.max_gpus > 0 and group.data.max_gpus or "")) self.addColumn("", 0, id=14) self.addColumn("", 0, id=15) - self.addColumn("", 0, id=16, + self.addColumn("", 0, id=16) + self.addColumn("", 0, id=17) + self.addColumn("", 0, id=18) + self.addColumn("", 0, id=19) + self.addColumn("", 0, id=20, data=lambda group: (group.data.department != "Unknown" and group.data.department or "")) - self.addColumn("", 0, id=17) + self.addColumn("", 0, id=21) cuegui.AbstractTreeWidget.AbstractTreeWidget.__init__(self, parent) @@ -528,6 +554,8 @@ def contextMenuEvent(self, e): menu.addSeparator() self.__menuActions.jobs().addAction(menu, "setMinCores") self.__menuActions.jobs().addAction(menu, "setMaxCores") + self.__menuActions.jobs().addAction(menu, "setMinGpu") + self.__menuActions.jobs().addAction(menu, "setMaxGpu") self.__menuActions.jobs().addAction(menu, "setPriority") self.__menuActions.jobs().addAction(menu, "setMaxRetries") if counts["job"] == 1: diff --git a/cuegui/cuegui/FrameMonitorTree.py b/cuegui/cuegui/FrameMonitorTree.py index be3362095..41df65209 100644 --- a/cuegui/cuegui/FrameMonitorTree.py +++ b/cuegui/cuegui/FrameMonitorTree.py @@ -100,25 +100,29 @@ def __init__(self, parent): data=lambda job, frame: (self.getCores(frame, format_as_string=True) or ""), sort=lambda job, frame: (self.getCores(frame)), tip="The number of cores a frame is using") - self.addColumn("Host", 120, id=6, + 
self.addColumn("GPUs", 55, id=6, + data=lambda job, frame: (self.getGpus(frame, format_as_string=True) or ""), + sort=lambda job, frame: (self.getGpus(frame)), + tip="The number of gpus a frame is using") + self.addColumn("Host", 120, id=7, data=lambda job, frame: frame.data.last_resource, sort=lambda job, frame: frame.data.last_resource, tip="The last or current resource that the frame used or is using.") - self.addColumn("Retries", 55, id=7, + self.addColumn("Retries", 55, id=8, data=lambda job, frame: frame.data.retry_count, sort=lambda job, frame: frame.data.retry_count, tip="The number of times that each frame has had to retry.") - self.addColumn("_CheckpointEnabled", 20, id=8, + self.addColumn("_CheckpointEnabled", 20, id=9, data=lambda job, frame: "", sort=lambda job, frame: ( frame.data.checkpoint_state == opencue.api.job_pb2.ENABLED), tip="A green check mark here indicates the frame has written out at least " "1 checkpoint segment.") - self.addColumn("CheckP", 55, id=9, + self.addColumn("CheckP", 55, id=10, data=lambda job, frame: frame.data.checkpoint_count, sort=lambda job, frame: frame.data.checkpoint_count, tip="The number of times a frame has been checkpointed.") - self.addColumn("Runtime", 70, id=10, + self.addColumn("Runtime", 70, id=11, data=lambda job, frame: (cuegui.Utils.secondsToHMMSS( frame.data.start_time and frame.data.stop_time and @@ -138,7 +142,7 @@ def __init__(self, parent): tip="The amount of HOURS:MINUTES:SECONDS that the frame\n" "has run for or last ran for.\n") - self.addColumn("LLU", 70, id=11, + self.addColumn("LLU", 70, id=12, data=lambda job, frame: (frame.data.state == opencue.api.job_pb2.RUNNING and self.frameLogDataBuffer.getLastLineData( job, frame)[FrameLogDataBuffer.LLU] or ""), @@ -150,7 +154,7 @@ def __init__(self, parent): "time without an update is an indication of a stuck\n" "frame for most types of jobs") - self.addColumn("Memory", 60, id=12, + self.addColumn("Memory", 60, id=13, data=lambda job, frame: ( 
frame.data.state == opencue.api.job_pb2.RUNNING and cuegui.Utils.memoryToString(frame.data.used_memory) or @@ -162,7 +166,20 @@ def __init__(self, parent): "If a frame is not running:\n" "\t The most memory this frame has used at one time.") - self.addColumn("Remain", 70, id=13, + self.addColumn("GPU Memory", 60, id=14, + data=lambda job, frame: ( + frame.data.state == opencue.api.job_pb2.RUNNING and + cuegui.Utils.memoryToString(frame.data.used_gpu_memory) or + cuegui.Utils.memoryToString(frame.data.max_gpu_memory)), + sort=lambda job, frame: (frame.data.state == opencue.api.job_pb2.RUNNING and + frame.data.used_gpu_memory or + frame.data.max_gpu_memory), + tip="If a frame is running:\n" + "\t The amount of GPU memory currently used by the frame.\n" + "If a frame is not running:\n" + "\t The most GPU memory this frame has used at one time.") + + self.addColumn("Remain", 70, id=15, data=lambda job, frame: (frame.data.state == opencue.api.job_pb2.RUNNING and self.frameEtaDataBuffer.getEtaFormatted(job, frame) or ""), @@ -170,16 +187,16 @@ def __init__(self, parent): self.frameEtaDataBuffer.getEta(job, frame) or -1), tip="Hours:Minutes:Seconds remaining.") - self.addColumn("Start Time", 100, id=14, + self.addColumn("Start Time", 100, id=16, data=lambda job, frame: (self.getTimeString(frame.data.start_time) or ""), sort=lambda job, frame: (self.getTimeString(frame.data.start_time) or ""), tip="The time the frame was started or retried.") - self.addColumn("Stop Time", 100, id=15, + self.addColumn("Stop Time", 100, id=17, data=lambda job, frame: (self.getTimeString(frame.data.stop_time) or ""), sort=lambda job, frame: (self.getTimeString(frame.data.stop_time) or ""), tip="The time that the frame finished or died.") - self.addColumn("Last Line", 0, id=16, + self.addColumn("Last Line", 0, id=18, data=lambda job, frame: (frame.data.state == opencue.api.job_pb2.RUNNING and self.frameLogDataBuffer.getLastLineData( job, frame)[FrameLogDataBuffer.LASTLINE] or ""), @@ -240,7 
+257,7 @@ def getCores(frame, format_as_string=False): """Gets the number of cores a frame is using.""" cores = None - m = re.search(r".*\/(\d+\.?\d*)", frame.data.last_resource) + m = re.search(r".*\/(\d+\.?\d*)\/.*", frame.data.last_resource) if m: cores = float(m.group(1)) @@ -249,6 +266,20 @@ def getCores(frame, format_as_string=False): return cores + @staticmethod + def getGpus(frame, format_as_string=False): + """Gets the number of gpus a frame is using.""" + gpus = None + + m = re.search(r".*\/.*\/(\d+)", frame.data.last_resource) + if m: + gpus = m.group(1) + + if not format_as_string: + gpus = int(gpus) + + return gpus + @staticmethod def getTimeString(timestamp): """Gets a timestamp formatted as a string.""" diff --git a/cuegui/cuegui/GroupDialog.py b/cuegui/cuegui/GroupDialog.py index 2c59f405c..2a38d906a 100644 --- a/cuegui/cuegui/GroupDialog.py +++ b/cuegui/cuegui/GroupDialog.py @@ -56,6 +56,11 @@ def __init__(self, parentGroup, modifyGroup, defaults, parent): __minCores = defaults["minCores"] __maxCores = defaults["maxCores"] + __defaultJobMinGpus = defaults["defaultJobMinGpus"] + __defaultJobMaxGpus = defaults["defaultJobMaxGpus"] + __minGpus = defaults["minGpus"] + __maxGpus = defaults["maxGpus"] + self.setWindowTitle(__title) layout.addWidget(QtWidgets.QLabel(__message, self), 0, 1, 1, 3) @@ -90,8 +95,25 @@ def __init__(self, parentGroup, modifyGroup, defaults, parent): __modify and __maxCores != -1.0, __maxCores, 1) + (self._defaultJobMinGpusCheck, self._defaultJobMinGpusValue) = \ + self.__createToggleSpinBox("Job Default Minimum Gpus", 8, + __modify and __defaultJobMinGpus != -1, + __defaultJobMinGpus, 1) + (self._defaultJobMaxGpusCheck, self._defaultJobMaxGpusValue) = \ + self.__createToggleSpinBox("Job Default Maximum Gpus", 9, + __modify and __defaultJobMaxGpus != -1, + __defaultJobMaxGpus, 1) + (self._minGpusCheck, self._minGpusValue) = \ + self.__createToggleSpinBox("Group Minimum Gpus", 10, + __modify and __minGpus != 0, + __minGpus) + 
(self._maxGpusCheck, self._maxGpusValue) = \ + self.__createToggleSpinBox("Group Maximum Gpus", 11, + __modify and __maxGpus != -1, + __maxGpus, 1) + self.__createButtons( - QtWidgets.QDialogButtonBox.Save | QtWidgets.QDialogButtonBox.Cancel, 8, 3) + QtWidgets.QDialogButtonBox.Save | QtWidgets.QDialogButtonBox.Cancel, 12, 3) def __createToggleDoubleSpinBox( self, text, row, startEnabled = False, currentValue = 0, minValue = 0): @@ -169,6 +191,26 @@ def accept(self): float(self._maxCoresValue.value()), __group.data.max_cores, float(-1)) + self.__setValue(self._defaultJobMinGpusCheck, + __group.setDefaultJobMinGpus, + float(self._defaultJobMinGpusValue.value()), + __group.data.default_job_min_gpus, -1) + + self.__setValue(self._defaultJobMaxGpusCheck, + __group.setDefaultJobMaxGpus, + float(self._defaultJobMaxGpusValue.value()), + __group.data.default_job_max_gpus, -1) + + self.__setValue(self._minGpusCheck, + __group.setMinGpus, + float(self._minGpusValue.value()), + __group.data.min_gpus, 0) + + self.__setValue(self._maxGpusCheck, + __group.setMaxGpus, + float(self._maxGpusValue.value()), + __group.data.max_gpus, -1) + self.close() @staticmethod @@ -195,7 +237,11 @@ def __init__(self, modifyGroup, parent=None): "defaultJobMinCores": modifyGroup.data.default_job_min_cores, "defaultJobMaxCores": modifyGroup.data.default_job_max_cores, "minCores": modifyGroup.data.min_cores, - "maxCores": modifyGroup.data.max_cores} + "maxCores": modifyGroup.data.max_cores, + "defaultJobMinGpus": modifyGroup.data.default_job_min_gpus, + "defaultJobMaxGpus": modifyGroup.data.default_job_max_gpus, + "minGpus": modifyGroup.data.min_gpus, + "maxGpus": modifyGroup.data.max_gpus} GroupDialog.__init__(self, None, modifyGroup, defaults, parent) @@ -212,5 +258,9 @@ def __init__(self, parentGroup, parent=None): "defaultJobMinCores": 1.0, "defaultJobMaxCores": 1.0, "minCores": 0.0, - "maxCores": 1.0} + "maxCores": 1.0, + "defaultJobMinGpus": 0, + "defaultJobMaxGpus": 0, + "minGpus": 0, + 
"maxGpus": 0} GroupDialog.__init__(self, parentGroup, None, defaults, parent) diff --git a/cuegui/cuegui/HostMonitorTree.py b/cuegui/cuegui/HostMonitorTree.py index f9e5a7c90..3574b4c71 100644 --- a/cuegui/cuegui/HostMonitorTree.py +++ b/cuegui/cuegui/HostMonitorTree.py @@ -78,9 +78,9 @@ def __init__(self, parent): data=lambda host: cuegui.Utils.memoryToString(host.data.free_memory), sort=lambda host: host.data.free_memory, tip="The amount of used memory (red) vs available gpu memory (green)") - self.addColumn("GPU", 60, id=6, - data=lambda host: cuegui.Utils.memoryToString(host.data.free_gpu), - sort=lambda host: host.data.free_gpu, + self.addColumn("GPU Memory", 60, id=6, + data=lambda host: cuegui.Utils.memoryToString(host.data.free_gpu_memory), + sort=lambda host: host.data.free_gpu_memory, delegate=cuegui.ItemDelegate.HostGpuBarDelegate, tip="The amount of used gpu memory (red) vs available gpu memory (green)") self.addColumn("freeMcp", 60, id=7, @@ -105,27 +105,36 @@ def __init__(self, parent): data=lambda host: cuegui.Utils.memoryToString(host.data.idle_memory), sort=lambda host: host.data.idle_memory, tip="The amount of unreserved memory.") - self.addColumn("GPU", 50, id=12, - data=lambda host: cuegui.Utils.memoryToString(host.data.gpu), - sort=lambda host: host.data.gpu, + self.addColumn("GPUs", 50, id=12, + data=lambda host: "%d" % host.data.gpus, + sort=lambda host: host.data.gpus, + tip="The total number of gpus.\n\n" + "On a frame it is the number of gpus reserved.") + self.addColumn("Idle GPUs", 40, id=13, + data=lambda host: "%d" % host.data.idle_gpus, + sort=lambda host: host.data.idle_gpus, + tip="The number of gpus that are not reserved.") + self.addColumn("GPU Mem", 50, id=14, + data=lambda host: cuegui.Utils.memoryToString(host.data.gpu_memory), + sort=lambda host: host.data.gpu_memory, tip="The total amount of reservable gpu memory.\n\n" "On a frame it is the amount of gpu memory reserved.") - self.addColumn("Idle", 50, id=13, - data=lambda 
host: cuegui.Utils.memoryToString(host.data.idle_gpu), - sort=lambda host: host.data.idle_gpu, + self.addColumn("Gpu Mem Idle", 50, id=15, + data=lambda host: cuegui.Utils.memoryToString(host.data.idle_gpu_memory), + sort=lambda host: host.data.idle_gpu_memory, tip="The amount of unreserved gpu memory.") - self.addColumn("Ping", 50, id=14, + self.addColumn("Ping", 50, id=16, data=lambda host: int(time.time() - host.data.ping_time), sort=lambda host: host.data.ping_time, tip="The number of seconds since the cuebot last received\n" "a report from the host. A host is configured to report\n" "in every 60 seconds so a number larger than this\n" "indicates a problem") - self.addColumn("Hardware", 70, id=15, + self.addColumn("Hardware", 70, id=17, data=lambda host: HardwareState.Name(host.data.state), tip="The state of the hardware as Up or Down.\n\n" "On a frame it is the amount of memory used.") - self.addColumn("Locked", 90, id=16, + self.addColumn("Locked", 90, id=18, data=lambda host: LockState.Name(host.data.lock_state), tip="A host can be:\n" "Locked \t\t It was manually locked to prevent booking\n" @@ -133,12 +142,12 @@ def __init__(self, parent): "NimbyLocked \t It is a desktop machine and there is\n" "\t\t someone actively using it or not enough \n" "\t\t resources are available on a desktop.") - self.addColumn("ThreadMode", 80, id=17, + self.addColumn("ThreadMode", 80, id=19, data=lambda host: ThreadMode.Name(host.data.thread_mode), tip="A frame that runs on this host will:\n" "All: Use all cores.\n" "Auto: Use the number of cores as decided by the cuebot.\n") - self.addColumn("Tags/Job", 50, id=18, + self.addColumn("Tags/Job", 50, id=20, data=lambda host: ",".join(host.data.tags), tip="The tags applied to the host.\n\n" "On a frame it is the name of the job.") @@ -340,7 +349,8 @@ def data(self, col, role): self.rpcObject.data.total_memory] if role == QtCore.Qt.UserRole + 3: - return [self.rpcObject.data.total_gpu - self.rpcObject.data.free_gpu, - 
self.rpcObject.data.total_gpu] + return [self.rpcObject.data.total_gpu_memory - + self.rpcObject.data.free_gpu_memory, + self.rpcObject.data.total_gpu_memory] return cuegui.Constants.QVARIANT_NULL diff --git a/cuegui/cuegui/LayerDialog.py b/cuegui/cuegui/LayerDialog.py index c337dda80..4c79b805f 100644 --- a/cuegui/cuegui/LayerDialog.py +++ b/cuegui/cuegui/LayerDialog.py @@ -117,12 +117,12 @@ def __init__(self, layers, parent=None): self.mem_max_kb = int(self.mem_max_gb * 1024 * 1024) self.mem_min_kb = int(self.mem_min_gb * 1024 * 1024) - self.gpu_max_kb = 2 * 1024 * 1024 - self.gpu_min_kb = 0 - self.gpu_tick_kb = 256 * 1024 - self.gpu_max_gb = 2.0 - self.gpu_min_gb = 0.0 - self.gpu_tick_gb = .25 + self.gpu_mem_max_kb = 256 * 1024 * 1024 + self.gpu_mem_min_kb = 0 + self.gpu_mem_tick_kb = 256 * 1024 + self.gpu_mem_max_gb = 256.0 + self.gpu_mem_min_gb = 0.0 + self.gpu_mem_tick_gb = .25 self.__group = QtWidgets.QGroupBox("Resource Options", self) @@ -180,16 +180,28 @@ def __init__(self, layers, parent=None): # Limits self.__limits = LayerLimitsWidget(self.__layers, self) + # Min gpus + self.__min_gpus = QtWidgets.QSpinBox(self) + self.__min_gpus.setValue(0) + self.__min_gpus.setRange(0, int(self._cfg().get('max_gpus', 16))) + self.__min_gpus.setSingleStep(1) + + # Max gpus + self.__max_gpus = QtWidgets.QSpinBox(self) + self.__max_gpus.setRange(0, int(self._cfg().get('max_gpus', 16))) + self.__max_gpus.setSingleStep(1) + # GPU Memory - self.__gpu = SlideSpinner(self) - self.__gpu.slider.setMinimumWidth(200) - self.__gpu.slider.setRange(self.gpu_min_kb, self.gpu_max_kb // self.gpu_tick_kb) - self.__gpu.slider.setTickInterval(1) - self.__gpu.slider.setSingleStep(1) - self.__gpu.slider.setPageStep(1) - self.__gpu.spinner.setSuffix(' GB') - self.__gpu.spinner.setRange(self.gpu_min_gb, self.gpu_max_gb) - self.__gpu.spinner.setSingleStep(self.gpu_tick_gb) + self.__gpu_mem = SlideSpinner(self) + self.__gpu_mem.slider.setMinimumWidth(200) + 
self.__gpu_mem.slider.setRange(self.gpu_mem_min_kb, + self.gpu_mem_max_kb // self.gpu_mem_tick_kb) + self.__gpu_mem.slider.setTickInterval(1) + self.__gpu_mem.slider.setSingleStep(1) + self.__gpu_mem.slider.setPageStep(1) + self.__gpu_mem.spinner.setSuffix(' GB') + self.__gpu_mem.spinner.setRange(self.gpu_mem_min_gb, self.gpu_mem_max_gb) + self.__gpu_mem.spinner.setSingleStep(self.gpu_mem_tick_gb) # Our dialog buttons. self.__buttons = QtWidgets.QDialogButtonBox(QtWidgets.QDialogButtonBox.Save | @@ -200,16 +212,18 @@ def __init__(self, layers, parent=None): # Setup signals self.__mem.slider.valueChanged.connect(self.__translateToMemSpinbox) self.__mem.spinner.valueChanged.connect(self.__translateToMemSlider) - self.__gpu.slider.valueChanged.connect(self.__translateToGpuSpinbox) - self.__gpu.spinner.valueChanged.connect(self.__translateToGpuSlider) + self.__gpu_mem.slider.valueChanged.connect(self.__translateToGpuMemSpinbox) + self.__gpu_mem.spinner.valueChanged.connect(self.__translateToGpuMemSlider) self.__buttons.accepted.connect(self.verify) self.__buttons.rejected.connect(self.reject) # Set actual values once signals are setup self.__mem.slider.setValue(self.getMaxMemory()) - self.__gpu.slider.setValue(self.getMaxGpu()) + self.__gpu_mem.slider.setValue(self.getMaxGpuMemory()) self.__core.setValue(self.getMinCores()) self.__max_cores.setValue(self.getMaxCores()) + self.__min_gpus.setValue(self.getMinGpus()) + self.__max_gpus.setValue(self.getMaxGpus()) self.__timeout.setValue(self.getTimeout()) self.__timeout_llu.setValue(self.getTimeoutLLU()) @@ -236,8 +250,16 @@ def __init__(self, layers, parent=None): self.__thread, True), multiSelect)) + layout.addWidget(EnableableItem(LayerPropertiesItem("Min GPUs:", + self.__min_gpus, + False), + multiSelect)) + layout.addWidget(EnableableItem(LayerPropertiesItem("Max GPUs:", + self.__max_gpus, + False), + multiSelect)) layout.addWidget(EnableableItem(LayerPropertiesItem("Minimum Gpu Memory:", - self.__gpu, + 
self.__gpu_mem, False), multiSelect)) layout.addWidget(EnableableItem(LayerPropertiesItem("Timeout:", @@ -280,8 +302,8 @@ def verify(self): if mem_value < self.mem_min_kb or mem_value > self.mem_max_kb: warning("The memory setting is too high.") return False - gpu_value = self.__gpu.slider.value() - if gpu_value < self.gpu_min_kb or gpu_value > self.gpu_max_kb: + gpu_mem_value = self.__gpu_mem.slider.value() + if gpu_mem_value < self.gpu_mem_min_kb or gpu_mem_value > self.gpu_mem_max_kb: warning("The gpu memory setting is too high.") return False @@ -302,8 +324,8 @@ def apply(self): layer.setMaxCores(self.__max_cores.value() * 100.0) if self.__thread.isEnabled(): layer.setThreadable(self.__thread.isChecked()) - if self.__gpu.isEnabled(): - layer.setMinGpu(self.__gpu.slider.value() * self.gpu_tick_kb) + if self.__gpu_mem.isEnabled(): + layer.setMinGpuMemory(self.__gpu_mem.slider.value() * self.gpu_mem_tick_kb) if self.__timeout.isEnabled(): layer.setTimeout(self.__timeout.value()) if self.__timeout_llu.isEnabled(): @@ -322,9 +344,9 @@ def getMaxMemory(self): result = layer.data.min_memory return result - def getMaxGpu(self): - """Gets the layer max GPU.""" - return max([layer.data.min_gpu // self.gpu_tick_kb for layer in self.__layers]) + def getMaxGpuMemory(self): + """Gets the layer max GPU memory.""" + return max([layer.data.min_gpu_memory // self.gpu_mem_tick_kb for layer in self.__layers]) def getMinCores(self): """Gets the layer min cores.""" @@ -342,6 +364,22 @@ def getMaxCores(self): result = layer.data.max_cores return result + def getMinGpus(self): + """Gets the layer min gpus.""" + result = 0 + for layer in self.__layers: + if layer.data.min_gpus > result: + result = layer.data.min_gpus + return result + + def getMaxGpus(self): + """Gets the layer max gpus.""" + result = 0 + for layer in self.__layers: + if layer.data.max_gpus > result: + result = layer.data.max_gpus + return result + def getThreading(self): """Gets whether the layer is threadable.""" 
result = False @@ -382,12 +420,11 @@ def __translateToMemSpinbox(self, value): def __translateToMemSlider(self, value): self.__mem.slider.setValue(int(value * 1048576.0)) - def __translateToGpuSpinbox(self, value): - self.__gpu.spinner.setValue(float(value * self.gpu_tick_kb) / 1024.0 / 1024.0) - - def __translateToGpuSlider(self, value): - self.__gpu.slider.setValue(int(value * 1024.0 * 1024.0) // self.gpu_tick_kb) + def __translateToGpuMemSpinbox(self, value): + self.__gpu_mem.spinner.setValue(float(value * self.gpu_mem_tick_kb) / 1024.0 / 1024.0) + def __translateToGpuMemSlider(self, value): + self.__gpu_mem.slider.setValue(int(value * 1024.0 * 1024.0) // self.gpu_mem_tick_kb) class LayerTagsWidget(QtWidgets.QWidget): """ diff --git a/cuegui/cuegui/LayerMonitorTree.py b/cuegui/cuegui/LayerMonitorTree.py index fdf0c249e..5b15450b6 100644 --- a/cuegui/cuegui/LayerMonitorTree.py +++ b/cuegui/cuegui/LayerMonitorTree.py @@ -74,65 +74,70 @@ def __init__(self, parent): "will reserve for its use. If the frame begins to use\n" "more memory than this, the cuebot will increase this\n" "number.") - self.addColumn("Gpu", 40, id=8, - data=lambda layer: cuegui.Utils.memoryToString(layer.data.min_gpu), - sort=lambda layer: layer.data.min_gpu, + self.addColumn("Gpus", 45, id=8, + data=lambda layer: "%d" % layer.data.min_gpus, + sort=lambda layer: layer.data.min_gpus, + tip="The number of gpus that the frames in this layer\n" + "will reserve as a minimum.") + self.addColumn("Gpu Memory", 40, id=9, + data=lambda layer: cuegui.Utils.memoryToString(layer.data.min_gpu_memory), + sort=lambda layer: layer.data.min_gpu_memory, tip="The amount of gpu memory each frame in this layer\n" "will reserve for its use. 
Note that we may not have\n" "machines as much gpu memory as you request.") self.addColumn( - "MaxRss", 60, id=9, + "MaxRss", 60, id=10, data=lambda layer: cuegui.Utils.memoryToString(layer.data.layer_stats.max_rss), sort=lambda layer: layer.data.layer_stats.max_rss, tip="Maximum amount of memory used by any frame in\n" "this layer at any time since the job was launched.") - self.addColumn("Total", 40, id=10, + self.addColumn("Total", 40, id=11, data=lambda layer: layer.data.layer_stats.total_frames, sort=lambda layer: layer.data.layer_stats.total_frames, tip="Total number of frames in this layer.") - self.addColumn("Done", 40, id=11, + self.addColumn("Done", 40, id=12, data=lambda layer: layer.data.layer_stats.succeeded_frames, sort=lambda layer: layer.data.layer_stats.succeeded_frames, tip="Total number of done frames in this layer.") - self.addColumn("Run", 40, id=12, + self.addColumn("Run", 40, id=13, data=lambda layer: layer.data.layer_stats.running_frames, sort=lambda layer: layer.data.layer_stats.running_frames, tip="Total number or running frames in this layer.") - self.addColumn("Depend", 53, id=13, + self.addColumn("Depend", 53, id=14, data=lambda layer: layer.data.layer_stats.depend_frames, sort=lambda layer: layer.data.layer_stats.depend_frames, tip="Total number of dependent frames in this layer.") - self.addColumn("Wait", 40, id=14, + self.addColumn("Wait", 40, id=15, data=lambda layer: layer.data.layer_stats.waiting_frames, sort=lambda layer: layer.data.layer_stats.waiting_frames, tip="Total number of waiting frames in this layer.") - self.addColumn("Eaten", 40, id=15, + self.addColumn("Eaten", 40, id=16, data=lambda layer: layer.data.layer_stats.eaten_frames, sort=lambda layer: layer.data.layer_stats.eaten_frames, tip="Total number of eaten frames in this layer.") - self.addColumn("Dead", 40, id=16, + self.addColumn("Dead", 40, id=17, data=lambda layer: layer.data.layer_stats.dead_frames, sort=lambda layer: layer.data.layer_stats.dead_frames, 
tip="Total number of dead frames in this layer.") self.addColumn( - "Avg", 65, id=17, + "Avg", 65, id=18, data=lambda layer: cuegui.Utils.secondsToHHMMSS(layer.data.layer_stats.avg_frame_sec), sort=lambda layer: layer.data.layer_stats.avg_frame_sec, tip="Average number of HOURS:MINUTES:SECONDS per frame\nin this layer.") - self.addColumn("Tags", 100, id=18, + self.addColumn("Tags", 100, id=19, data=lambda layer: " | ".join(layer.data.tags), tip="The tags define what resources may be booked on\n" "frames in this layer.") - self.addColumn("Progress", 100, id=19, + self.addColumn("Progress", 100, id=20, delegate=cuegui.ItemDelegate.ProgressDelegate, data=lambda layer: layer.percentCompleted(), sort=lambda layer: layer.percentCompleted(), tip="Progress for the Layer") - self.addColumn("Timeout", 45, id=20, + self.addColumn("Timeout", 45, id=21, data=lambda layer: cuegui.Utils.secondsToHHHMM(layer.data.timeout*60), sort=lambda layer: layer.data.timeout, tip="Timeout for the frames, Hours:Minutes") - self.addColumn("Timeout LLU", 45, id=21, + self.addColumn("Timeout LLU", 45, id=22, data=lambda layer: cuegui.Utils.secondsToHHHMM(layer.data.timeout_llu*60), sort=lambda layer: layer.data.timeout_llu, tip="Timeout for a frames\' LLU, Hours:Minutes") diff --git a/cuegui/cuegui/MenuActions.py b/cuegui/cuegui/MenuActions.py index 75f1bbc4a..68f0b6cc8 100644 --- a/cuegui/cuegui/MenuActions.py +++ b/cuegui/cuegui/MenuActions.py @@ -269,6 +269,38 @@ def setMaxCores(self, rpcObjects=None): job.setMaxCores(float(value)) self._update() + setMinGpu_info = ["Set Minimum Gpu...", "Set Job(s) Minimum Gpu", "configure"] + def setMinGpu(self, rpcObjects=None): + jobs = self._getOnlyJobObjects(rpcObjects) + if jobs: + current = max([job.data.min_gpus for job in jobs]) + title = "Set Minimum Gpu" + body = "Please enter the new minimum gpu value:" + (value, choice) = QtWidgets.QInputDialog.getDouble(self._caller, + title, body, + current, + 0, 50000, 0) + if choice: + for job in jobs: + 
job.setMinGpus(int(value)) + self._update() + + setMaxGpu_info = ["Set Maximum Gpu...", "Set Job(s) Maximum Gpu", "configure"] + def setMaxGpu(self, rpcObjects=None): + jobs = self._getOnlyJobObjects(rpcObjects) + if jobs: + current = max([job.data.max_gpus for job in jobs]) + title = "Set Maximum Gpu" + body = "Please enter the new maximum gpu value:" + (value, choice) = QtWidgets.QInputDialog.getDouble(self._caller, + title, body, + current, + 0, 50000, 0) + if choice: + for job in jobs: + job.setMaxGpus(int(value)) + self._update() + setPriority_info = ["Set Priority...", None, "configure"] def setPriority(self, rpcObjects=None): @@ -1460,6 +1492,24 @@ def clearRepair(self, rpcObjects=None): host.setHardwareState(down) self._update() + setThreadModeAuto_info = ["Thread Mode Auto", None, "configure"] + def setThreadModeAuto(self, rpcObjects=None): + for host in self._getOnlyHostObjects(rpcObjects): + host.setThreadMode("AUTO") + self._update() + + setThreadModeAll_info = ["Thread Mode All", None, "configure"] + def setThreadModeAll(self, rpcObjects=None): + for host in self._getOnlyHostObjects(rpcObjects): + host.setThreadMode("ALL") + self._update() + + setThreadModeVariable_info = ["Thread Mode Variable", None, "configure"] + def setThreadModeVariable(self, rpcObjects=None): + for host in self._getOnlyHostObjects(rpcObjects): + host.setThreadMode("VARIABLE") + self._update() + class ProcActions(AbstractActions): """Actions for procs.""" diff --git a/cuegui/cuegui/ProcMonitorTree.py b/cuegui/cuegui/ProcMonitorTree.py index efbf0c86f..9f67dabe3 100644 --- a/cuegui/cuegui/ProcMonitorTree.py +++ b/cuegui/cuegui/ProcMonitorTree.py @@ -60,7 +60,7 @@ def __init__(self, parent): tip="The amount of memory used.") self.addColumn( "GPU Used", 100, id=5, - data=lambda proc: cuegui.Utils.memoryToString(proc.data.reserved_gpu), + data=lambda proc: cuegui.Utils.memoryToString(proc.data.reserved_gpu_memory), tip="The amount of gpu memory used.") self.addColumn( "Age", 60, 
id=6, diff --git a/cuegui/cuegui/ServiceDialog.py b/cuegui/cuegui/ServiceDialog.py index 0288168f5..f88e568c0 100644 --- a/cuegui/cuegui/ServiceDialog.py +++ b/cuegui/cuegui/ServiceDialog.py @@ -61,11 +61,11 @@ def __init__(self, parent=None): self.min_memory = QtWidgets.QSpinBox(self) self.min_memory.setRange(512, int(self._cfg().get('max_memory', 48)) * 1024) self.min_memory.setValue(3276) - self.min_gpu = QtWidgets.QSpinBox(self) - self.min_gpu.setRange(self.gpu_min_mb, self.gpu_max_mb) - self.min_gpu.setValue(self.gpu_min_mb) - self.min_gpu.setSingleStep(self.gpu_tick_mb) - self.min_gpu.setSuffix(" MB") + self.min_gpu_memory = QtWidgets.QSpinBox(self) + self.min_gpu_memory.setRange(self.gpu_min_mb, self.gpu_max_mb) + self.min_gpu_memory.setValue(self.gpu_min_mb) + self.min_gpu_memory.setSingleStep(self.gpu_tick_mb) + self.min_gpu_memory.setSuffix(" MB") self.timeout = QtWidgets.QSpinBox(self) self.timeout.setRange(0, 4320) self.timeout.setValue(0) @@ -84,7 +84,7 @@ def __init__(self, parent=None): layout.addWidget(QtWidgets.QLabel("Min Memory MB:", self), 4, 0) layout.addWidget(self.min_memory, 4, 1) layout.addWidget(QtWidgets.QLabel("Min Gpu Memory MB:", self), 5, 0) - layout.addWidget(self.min_gpu, 5, 1) + layout.addWidget(self.min_gpu_memory, 5, 1) layout.addWidget(QtWidgets.QLabel("Timeout (in minutes):", self), 6, 0) layout.addWidget(self.timeout, 6, 1) layout.addWidget(QtWidgets.QLabel("Timeout LLU (in minutes):", self), 7, 0) @@ -124,7 +124,7 @@ def setService(self, service): self.min_cores.setValue(service.data.min_cores) self.max_cores.setValue(service.data.max_cores) self.min_memory.setValue(service.data.min_memory // 1024) - self.min_gpu.setValue(service.data.min_gpu // 1024) + self.min_gpu_memory.setValue(service.data.min_gpu_memory // 1024) self._tags_w.set_tags(service.data.tags) self.timeout.setValue(service.data.timeout) self.timeout_llu.setValue(service.data.timeout_llu) @@ -141,7 +141,7 @@ def new(self): self.min_cores.setValue(100) 
self.max_cores.setValue(100) self.min_memory.setValue(3276) - self.min_gpu.setValue(self.gpu_min_mb) + self.min_gpu_memory.setValue(self.gpu_min_mb) self.timeout.setValue(0) self.timeout_llu.setValue(0) self._tags_w.set_tags(['general']) @@ -168,7 +168,7 @@ def save(self): service.setMinCores(self.min_cores.value()) service.setMaxCores(self.max_cores.value()) service.setMinMemory(self.min_memory.value() * 1024) - service.setMinGpu(self.min_gpu.value() * 1024) + service.setMinGpu(self.min_gpu_memory.value() * 1024) service.setTimeout(self.timeout.value()) service.setTimeoutLLU(self.timeout_llu.value()) service.setTags(self._tags_w.get_tags()) diff --git a/cuegui/cuegui/config/cue_resources.yaml b/cuegui/cuegui/config/cue_resources.yaml index d54cfcbfc..501b6aff4 100644 --- a/cuegui/cuegui/config/cue_resources.yaml +++ b/cuegui/cuegui/config/cue_resources.yaml @@ -10,6 +10,9 @@ max_cores: 32 max_memory: 128 +max_gpus: 8 +max_gpu_memory: 128 + # Redirect Plugin maximum allowed core-hour cutoff. 
# Users will not be able to search for procs with frames that have been diff --git a/cuegui/tests/FrameMonitorTree_tests.py b/cuegui/tests/FrameMonitorTree_tests.py index e28e8229f..c3c2b4963 100644 --- a/cuegui/tests/FrameMonitorTree_tests.py +++ b/cuegui/tests/FrameMonitorTree_tests.py @@ -120,7 +120,7 @@ def test_tickFullUpdate(self, getFramesMock, getUpdatedFramesMock): def test_getCores(self): frame = opencue.wrappers.frame.Frame( - opencue.compiled_proto.job_pb2.Frame(last_resource='foo/125.82723')) + opencue.compiled_proto.job_pb2.Frame(last_resource='foo/125.82723/0')) self.assertEqual(125.82723, self.frameMonitorTree.getCores(frame)) self.assertEqual('125.83', self.frameMonitorTree.getCores(frame, format_as_string=True)) diff --git a/cuegui/tests/LayerDialog_tests.py b/cuegui/tests/LayerDialog_tests.py index 1f9624d59..5e515775d 100644 --- a/cuegui/tests/LayerDialog_tests.py +++ b/cuegui/tests/LayerDialog_tests.py @@ -55,13 +55,15 @@ def setUp(self, get_stub_mock, get_layer_mock, get_limits_mock): 'layer1Id': opencue.wrappers.layer.Layer( opencue.compiled_proto.job_pb2.Layer( id='layer1Id', name='layer1Name', range='1-5', tags=['tag1', 'tag2'], - min_cores=1, max_cores=3, is_threadable=False, min_memory=2097152, min_gpu=1, + min_cores=1, max_cores=3, is_threadable=False, + min_memory=2097152, min_gpu_memory=1, chunk_size=1, timeout=30, timeout_llu=1, memory_optimizer_enabled=True, limits=['limit1Name', 'limit2Name'])), 'layer2Id': opencue.wrappers.layer.Layer( opencue.compiled_proto.job_pb2.Layer( id='layer2Id', name='layer2Name', range='2-22', tags=['tag2', 'tag3'], - min_cores=2, max_cores=2, is_threadable=True, min_memory=6291456, min_gpu=2, + min_cores=2, max_cores=2, is_threadable=True, + min_memory=6291456, min_gpu_memory=2, chunk_size=5, timeout=60, timeout_llu=5, memory_optimizer_enabled=False, limits=['limit2Name', 'limit3Name'])), } @@ -124,12 +126,12 @@ def test__should_display_current_values(self): 
self.assertTrue(self.layer_properties_dialog._LayerPropertiesDialog__thread.isChecked()) self.assertEqual( - int(self.layer_properties_dialog.gpu_min_gb * 1024 * 1024), - self.layer_properties_dialog._LayerPropertiesDialog__gpu.slider.minimum()) + int(self.layer_properties_dialog.gpu_mem_min_gb * 1024 * 1024), + self.layer_properties_dialog._LayerPropertiesDialog__gpu_mem.slider.minimum()) self.assertEqual( - int(self.layer_properties_dialog.gpu_max_gb * 1024 * 1024) // - int(self.layer_properties_dialog.gpu_tick_gb * 1024 * 1024), - self.layer_properties_dialog._LayerPropertiesDialog__gpu.slider.maximum()) + int(self.layer_properties_dialog.gpu_mem_max_gb * 1024 * 1024) // + int(self.layer_properties_dialog.gpu_mem_tick_gb * 1024 * 1024), + self.layer_properties_dialog._LayerPropertiesDialog__gpu_mem.slider.maximum()) # Layer with the highest timeout determines the initial value. self.assertEqual(60, self.layer_properties_dialog._LayerPropertiesDialog__timeout.value()) @@ -163,13 +165,13 @@ def test__should_fail_on_memory_too_low(self): self.assertFalse(self.layer_properties_dialog.verify()) def test__should_fail_on_gpu_too_high(self): - self.layer_properties_dialog._LayerPropertiesDialog__gpu.slider.setValue( - self.layer_properties_dialog.gpu_max_kb * 2) + self.layer_properties_dialog._LayerPropertiesDialog__gpu_mem.slider.setValue( + self.layer_properties_dialog.gpu_mem_max_kb * 2) self.assertFalse(self.layer_properties_dialog.verify()) def test__should_fail_on_gpu_too_low(self): - self.layer_properties_dialog._LayerPropertiesDialog__gpu.slider.setValue( - self.layer_properties_dialog.gpu_min_kb / 3) + self.layer_properties_dialog._LayerPropertiesDialog__gpu_mem.slider.setValue( + self.layer_properties_dialog.gpu_mem_min_kb / 3) self.assertFalse(self.layer_properties_dialog.verify()) def test__should_apply_new_settings(self): @@ -206,9 +208,10 @@ def test__should_apply_new_settings(self): 
self.layer_properties_dialog._LayerPropertiesDialog__thread.parent().parent().enable(True) self.layer_properties_dialog._LayerPropertiesDialog__thread.setChecked(new_is_threadable) - new_min_gpu = 6 - self.layer_properties_dialog._LayerPropertiesDialog__gpu.parent().parent().enable(True) - self.layer_properties_dialog._LayerPropertiesDialog__gpu.slider.setValue(new_min_gpu) + new_min_gpu_memory = 6 + self.layer_properties_dialog._LayerPropertiesDialog__gpu_mem.parent().parent().enable(True) + self.layer_properties_dialog._LayerPropertiesDialog__gpu_mem.slider.setValue( + new_min_gpu_memory) new_timeout = 20 self.layer_properties_dialog._LayerPropertiesDialog__timeout.parent().parent().enable(True) @@ -239,10 +242,10 @@ def test__should_apply_new_settings(self): layer2_mock.setMaxCores.assert_called_with(100 * new_max_cores) layer1_mock.setThreadable.assert_called_with(new_is_threadable) layer2_mock.setThreadable.assert_called_with(new_is_threadable) - layer1_mock.setMinGpu.assert_called_with( - new_min_gpu * self.layer_properties_dialog.gpu_tick_kb) - layer2_mock.setMinGpu.assert_called_with( - new_min_gpu * self.layer_properties_dialog.gpu_tick_kb) + layer1_mock.setMinGpuMemory.assert_called_with( + new_min_gpu_memory * self.layer_properties_dialog.gpu_mem_tick_kb) + layer2_mock.setMinGpuMemory.assert_called_with( + new_min_gpu_memory * self.layer_properties_dialog.gpu_mem_tick_kb) layer1_mock.setTimeout.assert_called_with(new_timeout) layer2_mock.setTimeout.assert_called_with(new_timeout) layer1_mock.setTimeoutLLU.assert_called_with(new_timeout_llu) diff --git a/proto/facility.proto b/proto/facility.proto index 49c95537c..ae7f3d4b8 100644 --- a/proto/facility.proto +++ b/proto/facility.proto @@ -105,6 +105,11 @@ message AllocationStats { int32 hosts = 6; int32 locked_hosts = 7; int32 down_hosts = 8; + float gpus = 9; + float available_gpus = 10; + float idle_gpus = 11; + float running_gpus = 12; + float locked_gpus = 13; } diff --git a/proto/host.proto 
b/proto/host.proto index 90c957816..f2193bdb9 100644 --- a/proto/host.proto +++ b/proto/host.proto @@ -249,16 +249,16 @@ message Host { float idle_cores = 7; int64 memory = 8; int64 idle_memory = 9; - int64 gpu = 10; - int64 idle_gpu = 11; + int64 gpu_memory = 10; + int64 idle_gpu_memory = 11; int64 total_swap = 12; int64 total_memory = 13; - int64 total_gpu = 14; + int64 total_gpu_memory = 14; int64 total_mcp = 15; int64 free_swap = 16; int64 free_memory = 17; int64 free_mcp = 18; - int64 free_gpu = 19; + int64 free_gpu_memory = 19; int32 load = 20; int32 boot_time = 21; int32 ping_time = 22; @@ -267,6 +267,8 @@ message Host { HardwareState state = 25; LockState lock_state = 26; ThreadMode thread_mode = 27; + float gpus = 28; + float idle_gpus = 29; } message HostSearchCriteria { @@ -292,16 +294,16 @@ message NestedHost { float idle_cores = 7; int64 memory = 8; int64 idle_memory = 9; - int64 gpu = 10; - int64 idle_gpu = 11; + int64 gpu_memory = 10; + int64 idle_gpu_memory = 11; int64 total_swap = 12; int64 total_memory = 13; - int64 total_gpu = 14; + int64 total_gpu_memory = 14; int64 total_mcp = 15; int64 free_swap = 16; int64 free_memory = 17; int64 free_mcp = 18; - int64 free_gpu = 19; + int64 free_gpu_memory = 19; int32 load = 20; int32 boot_time = 21; int32 ping_time = 22; @@ -311,6 +313,8 @@ message NestedHost { LockState lock_state = 26; ThreadMode thread_mode = 27; NestedProcSeq procs = 28; + float gpus = 29; + float idle_gpus = 30; } message NestedHostSeq { @@ -328,7 +332,7 @@ message NestedProc { int32 bookedTime = 8; int32 dispatch_time = 9; int64 reserved_memory = 10; - int64 reserverd_gpu = 11; + int64 reserved_gpu_memory = 11; int64 used_memory = 12; float reserved_cores = 13; bool unbooked = 14; @@ -336,6 +340,8 @@ message NestedProc { string redirect_target = 16; repeated string services = 17; NestedHost parent = 18; + int64 used_gpu_memory = 19; + float reserved_gpus = 20; } message NestedProcSeq { @@ -360,13 +366,15 @@ message Proc { int32 
bookedTime = 8; int32 dispatch_time = 9; int64 reserved_memory = 10; - int64 reserved_gpu = 11; + int64 reserved_gpu_memory = 11; int64 used_memory = 12; float reserved_cores = 13; bool unbooked = 14; string log_path = 15; string redirect_target = 16; repeated string services = 17; + int64 used_gpu_memory = 18; + float reserved_gpus = 19; } message ProcSearchCriteria { diff --git a/proto/job.proto b/proto/job.proto index 4a74f3aa8..240fba609 100644 --- a/proto/job.proto +++ b/proto/job.proto @@ -100,6 +100,12 @@ service GroupInterface { // Set the Default Job Min Core values to all in the provided group rpc SetDefaultJobMinCores(GroupSetDefJobMinCoresRequest) returns (GroupSetDefJobMinCoresResponse); + // Set the Default Job Max Gpu values to all in the provided group + rpc SetDefaultJobMaxGpus(GroupSetDefJobMaxGpusRequest) returns (GroupSetDefJobMaxGpusResponse); + + // Set the Default Job Min Gpu values to all in the provided group + rpc SetDefaultJobMinGpus(GroupSetDefJobMinGpusRequest) returns (GroupSetDefJobMinGpusResponse); + // Set the Default Job Priority values to all in the provided group rpc SetDefaultJobPriority(GroupSetDefJobPriorityRequest) returns (GroupSetDefJobPriorityResponse); @@ -115,6 +121,12 @@ service GroupInterface { // Set the groups Min Cores values rpc SetMinCores(GroupSetMinCoresRequest) returns (GroupSetMinCoresResponse); + // Set the group's Max Gpu value + rpc SetMaxGpus(GroupSetMaxGpusRequest) returns (GroupSetMaxGpusResponse); + + // Set the groups Min Gpu values + rpc SetMinGpus(GroupSetMinGpusRequest) returns (GroupSetMinGpusResponse); + // Set the groups name rpc SetName(GroupSetNameRequest) returns (GroupSetNameResponse); } @@ -248,6 +260,12 @@ service JobInterface { // Sets the minimum number of procs that can run on this job rpc SetMinCores(JobSetMinCoresRequest) returns (JobSetMinCoresResponse); + // Sets the maximum number of Gpu that can run on this job + rpc SetMaxGpus(JobSetMaxGpusRequest) returns (JobSetMaxGpusResponse); 
+ + // Sets the minimum number of Gpu that can run on this job + rpc SetMinGpus(JobSetMinGpusRequest) returns (JobSetMinGpusResponse); + // Sets the job priority rpc SetPriority(JobSetPriorityRequest) returns (JobSetPriorityResponse); @@ -331,7 +349,16 @@ service LayerInterface { // Set the Min Cores for this layer rpc SetMinCores(LayerSetMinCoresRequest) returns (LayerSetMinCoresResponse); - // Set the Min gpu value for the layer + // The maximum number of Gpu to run on a given frame within this layer. + rpc SetMaxGpus(LayerSetMaxGpusRequest) returns (LayerSetMaxGpusResponse); + + // Set the Min Gpus for this layer + rpc SetMinGpus(LayerSetMinGpusRequest) returns (LayerSetMinGpusResponse); + + // Set the Min gpu memory value for the layer + rpc SetMinGpuMemory(LayerSetMinGpuMemoryRequest) returns (LayerSetMinGpuMemoryResponse); + + // [Deprecated] Set the Min gpu memory value for the layer rpc SetMinGpu(LayerSetMinGpuRequest) returns (LayerSetMinGpuResponse); // Set the Min Memory value for the layer @@ -450,12 +477,15 @@ message Frame { int64 max_rss = 11; int64 used_memory = 12; int64 reserved_memory = 13; - int64 reserved_gpu = 14; + int64 reserved_gpu_memory = 14; string last_resource = 15; CheckpointState checkpoint_state = 16; int32 checkpoint_count = 17; int32 total_core_time = 18; int32 llu_time = 19; + int32 total_gpu_time = 20; + int64 max_gpu_memory = 21; + int64 used_gpu_memory = 22; } // Object for frame searching @@ -499,6 +529,8 @@ message UpdatedFrame { int64 used_memory = 8; string last_resource = 9; int32 llu_time = 10; + int64 max_gpu_memory = 11; + int64 used_gpu_memory = 12; } message UpdatedFrameSeq { @@ -528,6 +560,10 @@ message Group { int32 level = 9; string parent_id = 10; GroupStats group_stats = 11; + float default_job_min_gpus = 12; + float default_job_max_gpus = 13; + float min_gpus = 14; + float max_gpus = 15; } message GroupSeq { @@ -541,6 +577,7 @@ message GroupStats { int32 waiting_frames = 4; int32 pending_jobs = 5; float 
reserved_cores = 6; + float reserved_gpus = 7; } // JOB ---- @@ -567,6 +604,8 @@ message Job { int32 start_time = 18; int32 stop_time = 19; JobStats job_stats = 20; + float min_gpus = 21; + float max_gpus = 22; } // Use to filter the job search. Please note that by searching for non-pending jobs, the output is limited to 200 jobs @@ -606,6 +645,11 @@ message JobStats { int64 failed_core_sec = 18; int64 max_rss = 19; float reserved_cores = 20; + int64 total_gpu_sec = 21; + int64 rendered_gpu_sec = 22; + int64 failed_gpu_sec = 23; + float reserved_gpus = 24; + int64 max_gpu_memory = 25; } // LAYER ---- @@ -618,7 +662,7 @@ message Layer { float max_cores = 6; bool is_threadable = 7; int64 min_memory = 8; - int64 min_gpu = 9; + int64 min_gpu_memory = 9; int32 chunk_size = 10; int32 dispatch_order = 11; LayerType type = 12; @@ -631,6 +675,8 @@ message Layer { repeated string limits = 17; int32 timeout = 18; int32 timeout_llu = 19; + float min_gpus = 20; + float max_gpus = 21; } message LayerSeq { @@ -658,6 +704,11 @@ message LayerStats { int64 failed_core_sec = 18; int64 max_rss = 19; float reserved_cores = 20; + int64 total_gpu_sec = 21; + int64 rendered_gpu_sec = 22; + int64 failed_gpu_sec = 23; + float reserved_gpus = 24; + int64 max_gpu_memory = 25; } // NestedGroup --- @@ -675,6 +726,10 @@ message NestedGroup { NestedGroupSeq groups = 11; repeated string jobs = 12; GroupStats stats = 13; + float default_job_min_gpus = 14; + float default_job_max_gpus = 15; + float min_gpus = 16; + float max_gpus = 17; } message NestedGroupSeq { @@ -706,6 +761,8 @@ message NestedJob { int32 stop_time = 19; NestedGroup parent = 20; JobStats stats = 21; + float min_gpus = 22; + float max_gpus = 23; } @@ -719,8 +776,9 @@ message FrameAddRenderPartitionRequest { int32 threads = 3; int32 max_cores = 4; int64 max_memory = 5; - int64 max_gpu = 6; + int64 max_gpu_memory = 6; string username = 7; + int32 max_gpus = 8; } message FrameAddRenderPartitionResponse { @@ -944,6 +1002,22 @@ message 
GroupSetDefJobMinCoresRequest { message GroupSetDefJobMinCoresResponse {} // Empty +// SetDefaultJobMaxGpus +message GroupSetDefJobMaxGpusRequest { + Group group = 1; + int32 max_gpus = 2; +} + +message GroupSetDefJobMaxGpusResponse {} // Empty + +// SetDefaultJobMinGpus +message GroupSetDefJobMinGpusRequest { + Group group = 1; + int32 min_gpus = 2; +} + +message GroupSetDefJobMinGpusResponse {} // Empty + // SetDefJobPriority message GroupSetDefJobPriorityRequest { Group group = 1; @@ -984,6 +1058,22 @@ message GroupSetMinCoresRequest { message GroupSetMinCoresResponse {} // Empty +// SetMaxGpus +message GroupSetMaxGpusRequest { + Group group = 1; + int32 max_gpus = 2; +} + +message GroupSetMaxGpusResponse {} // Empty + +// SetMinGpus +message GroupSetMinGpusRequest { + Group group = 1; + int32 min_gpus = 2; +} + +message GroupSetMinGpusResponse {} // Empty + // SetName message GroupSetNameRequest { Group group = 1; @@ -1008,8 +1098,9 @@ message JobAddRenderPartRequest { int32 threads = 3; int32 max_cores = 4; int64 max_memory = 5; - int64 max_gpu = 6; + int64 max_gpu_memory = 6; string username = 7; + int32 max_gpus = 8; } message JobAddRenderPartResponse { @@ -1296,6 +1387,14 @@ message JobSetMaxCoresRequest { message JobSetMaxCoresResponse {} // Empty +// SetMaxGpus +message JobSetMaxGpusRequest { + Job job = 1; + int32 val = 2; +} + +message JobSetMaxGpusResponse {} // Empty + // SetMaxRetries message JobSetMaxRetriesRequest { Job job = 1; @@ -1312,6 +1411,14 @@ message JobSetMinCoresRequest { message JobSetMinCoresResponse {} // Empty +// SetMinGpus +message JobSetMinGpusRequest { + Job job = 1; + int32 val = 2; +} + +message JobSetMinGpusResponse {} // Empty + // SetPriority message JobSetPriorityRequest { Job job = 1; @@ -1346,8 +1453,9 @@ message LayerAddRenderPartitionRequest { int32 threads = 3; int32 max_cores = 4; int64 max_memory = 5; - int64 max_gpu = 6; + int64 max_gpu_memory = 6; string username = 7; + int32 max_gpus = 8; } message 
LayerAddRenderPartitionResponse { @@ -1545,14 +1653,39 @@ message LayerSetMinCoresRequest { message LayerSetMinCoresResponse {} // Empty -// SetMinGpu +// [Deprecated] SetMinGpu message LayerSetMinGpuRequest { - Layer layer = 1; - int64 gpu = 2; + Layer layer = 1 [deprecated=true]; + int64 gpu = 2 [deprecated=true]; } +// [Deprecated] message LayerSetMinGpuResponse {} // Empty +// SetMaxGpus +message LayerSetMaxGpusRequest { + Layer layer = 1; + int32 max_gpus = 2; +} + +message LayerSetMaxGpusResponse {} // Empty + +// SetMinGpus +message LayerSetMinGpusRequest { + Layer layer = 1; + int32 min_gpus = 2; +} + +message LayerSetMinGpusResponse {} // Empty + +// SetMinGpuMemory +message LayerSetMinGpuMemoryRequest { + Layer layer = 1; + int64 gpu_memory = 2; +} + +message LayerSetMinGpuMemoryResponse {} // Empty + // SetMinMemory message LayerSetMinMemoryRequest { Layer layer = 1; diff --git a/proto/renderPartition.proto b/proto/renderPartition.proto index 9d7b554f3..c8553ed3c 100644 --- a/proto/renderPartition.proto +++ b/proto/renderPartition.proto @@ -40,8 +40,11 @@ message RenderPartition { int64 memory = 8; int32 max_cores = 9; int64 max_memory = 10; - int64 max_gpu = 11; + int64 max_gpu_memory = 11; int32 threads = 12; + int32 gpus = 13; + int32 max_gpus = 14; + int64 gpu_memory = 15; } message RenderPartitionSeq { @@ -61,7 +64,8 @@ message RenderPartSetMaxResourcesRequest { RenderPartition render_partition = 1; int32 cores = 2; int64 memory = 3; - int64 gpu = 4; + int64 gpu_memory = 4; + int32 gpus = 5; } message RenderPartSetMaxResourcesResponse {} // Empty diff --git a/proto/report.proto b/proto/report.proto index 7a1fffdb2..d53103663 100644 --- a/proto/report.proto +++ b/proto/report.proto @@ -59,17 +59,20 @@ message RenderHost { string facility= 4; // The name of the facility that the host is in int32 num_procs = 5; // the number of physical procs on this machine int32 cores_per_proc = 6; // the number of cores per proc - int32 total_swap = 7; // the total 
size of the swap in kB - int32 total_mem = 8; // the total size of the main memory pool in kB - int32 total_mcp = 9; // the total size of MCP in kB - int32 free_swap = 10; // the current amount of free swap in kB - int32 free_mem = 11; // the current amount of free memory in kB - int32 free_mcp = 12; // the current amount of free MCP in kB + int64 total_swap = 7; // the total size of the swap in kB + int64 total_mem = 8; // the total size of the main memory pool in kB + int64 total_mcp = 9; // the total size of MCP in kB + int64 free_swap = 10; // the current amount of free swap in kB + int64 free_mem = 11; // the current amount of free memory in kB + int64 free_mcp = 12; // the current amount of free MCP in kB int32 load = 13; // the current load on the proc int32 boot_time = 14; // the time the proc was booted repeated string tags = 15; // an array of default tags that are added to the host record host.HardwareState state = 16; // hardware state for the host map attributes = 17; // additional data can be provided about the host + int32 num_gpus = 18; // the number of physical GPU's + int64 free_gpu_mem = 19; // the current amount of free gpu memory in kB + int64 total_gpu_mem = 20; // the total size of gpu memory in kB }; message RunningFrameInfo { @@ -87,6 +90,9 @@ message RunningFrameInfo { int64 vsize = 12; // kB map attributes = 13; //additional data can be provided about the running frame int64 llu_time = 14; + int32 num_gpus = 15; + int64 max_used_gpu_memory = 16; // kB + int64 used_gpu_memory = 17; // kB }; diff --git a/proto/rqd.proto b/proto/rqd.proto index a4b83f0c3..f67add41b 100644 --- a/proto/rqd.proto +++ b/proto/rqd.proto @@ -110,6 +110,7 @@ message RunFrame { bool ignore_nimby = 20; map environment = 21; map attributes = 22; + int32 num_gpus = 23; } message RunFrameSeq { diff --git a/proto/service.proto b/proto/service.proto index 8b554b388..23633bc40 100644 --- a/proto/service.proto +++ b/proto/service.proto @@ -46,11 +46,13 @@ message Service { 
bool threadable = 3; int32 min_cores = 4; int32 max_cores = 5; - int32 min_memory = 6; - int32 min_gpu = 7; + int64 min_memory = 6; + int64 min_gpu_memory = 7; repeated string tags = 8; int32 timeout = 9; int32 timeout_llu = 10; + int32 min_gpus = 11; + int32 max_gpus = 12; } message ServiceSeq { diff --git a/proto/show.proto b/proto/show.proto index ca237cc54..f0fbf34b4 100644 --- a/proto/show.proto +++ b/proto/show.proto @@ -99,6 +99,12 @@ service ShowInterface { // sets a show's default min procs rpc SetDefaultMinCores(ShowSetDefaultMinCoresRequest) returns (ShowSetDefaultMinCoresResponse); + + // sets a show's default max Gpus + rpc SetDefaultMaxGpus(ShowSetDefaultMaxGpusRequest) returns (ShowSetDefaultMaxGpusResponse); + + // sets a show's default min Gpus + rpc SetDefaultMinGpus(ShowSetDefaultMinGpusRequest) returns (ShowSetDefaultMinGpusResponse); } @@ -114,6 +120,8 @@ message Show { bool dispatch_enabled = 7; bool active = 8; ShowStats show_stats = 9; + float default_min_gpus = 10; + float default_max_gpus = 11; } message ShowSeq { @@ -130,6 +138,7 @@ message ShowStats { int64 rendered_frame_count = 7; int64 failed_frame_count = 8; float reserved_cores = 9; + float reserved_gpus = 10; } @@ -375,3 +384,19 @@ message ShowSetDefaultMinCoresRequest { } message ShowSetDefaultMinCoresResponse {} // Empty + +// SetDefaultMaxGpus +message ShowSetDefaultMaxGpusRequest { + Show show = 1; + int32 max_gpus = 2; +} + +message ShowSetDefaultMaxGpusResponse {} // Empty + +// SetDefaultMinGpus +message ShowSetDefaultMinGpusRequest { + Show show = 1; + int32 min_gpus = 2; +} + +message ShowSetDefaultMinGpusResponse {} // Empty diff --git a/proto/subscription.proto b/proto/subscription.proto index 8c7817da4..8ac100e7e 100644 --- a/proto/subscription.proto +++ b/proto/subscription.proto @@ -41,6 +41,7 @@ message Subscription { int32 size = 6; int32 burst = 7; int32 reserved_cores = 8; + int32 reserved_gpus = 9; } message SubscriptionSeq { diff --git 
a/pycue/opencue/wrappers/group.py b/pycue/opencue/wrappers/group.py index 59685cc8b..1c8175e57 100644 --- a/pycue/opencue/wrappers/group.py +++ b/pycue/opencue/wrappers/group.py @@ -67,6 +67,24 @@ def setMinCores(self, value): self.stub.SetMinCores(job_pb2.GroupSetMinCoresRequest(group=self.data, min_cores=value), timeout=Cuebot.Timeout) + def setMaxGpus(self, value): + """Sets the maximum gpus of everything in the group. + + :type value: int + :param value: new maximum number of gpus + """ + self.stub.SetMaxGpus(job_pb2.GroupSetMaxGpusRequest(group=self.data, max_gpus=value), + timeout=Cuebot.Timeout) + + def setMinGpus(self, value): + """Sets the minimum gpus of everything the group. + + :type value: int + :param value: new minimum number of gpus + """ + self.stub.SetMinGpus(job_pb2.GroupSetMinGpusRequest(group=self.data, min_gpus=value), + timeout=Cuebot.Timeout) + def setDefaultJobPriority(self, value): """Sets the default job priority for everything in the group. @@ -97,6 +115,26 @@ def setDefaultJobMaxCores(self, value): job_pb2.GroupSetDefJobMaxCoresRequest(group=self.data, max_cores=value), timeout=Cuebot.Timeout) + def setDefaultJobMinGpus(self, value): + """Sets the default job minimum gpus for everything in the group. + + :type value: int + :param value: new default job minimum gpus + """ + self.stub.SetDefaultJobMinGpus( + job_pb2.GroupSetDefJobMinGpusRequest(group=self.data, min_gpus=value), + timeout=Cuebot.Timeout) + + def setDefaultJobMaxGpus(self, value): + """Sets the default job maximum gpus for everything in the group. + + :type value: int + :param value: new default job maximum gpus + """ + self.stub.SetDefaultJobMaxGpus( + job_pb2.GroupSetDefJobMaxGpusRequest(group=self.data, max_gpus=value), + timeout=Cuebot.Timeout) + def getGroups(self): """Returns child groups of this group. 
diff --git a/pycue/opencue/wrappers/job.py b/pycue/opencue/wrappers/job.py index eb977a280..733c91e61 100644 --- a/pycue/opencue/wrappers/job.py +++ b/pycue/opencue/wrappers/job.py @@ -126,6 +126,20 @@ def setMaxCores(self, maxCores): self.stub.SetMaxCores(job_pb2.JobSetMaxCoresRequest(job=self.data, val=maxCores), timeout=Cuebot.Timeout) + def setMinGpus(self, minGpus): + """Sets the minimum gpus value + :type minGpus: int + :param minGpus: New minimum gpus value""" + self.stub.SetMinGpus(job_pb2.JobSetMinGpusRequest(job=self.data, val=minGpus), + timeout=Cuebot.Timeout) + + def setMaxGpus(self, maxGpus): + """Sets the maximum gpus value + :type maxGpus: int + :param maxGpus: New maximum gpus value""" + self.stub.SetMaxGpus(job_pb2.JobSetMaxGpusRequest(job=self.data, val=maxGpus), + timeout=Cuebot.Timeout) + def setPriority(self, priority): """Sets the job priority. @@ -211,7 +225,7 @@ def setAutoEating(self, value): self.stub.SetAutoEat(job_pb2.JobSetAutoEatRequest(job=self.data, value=value), timeout=Cuebot.Timeout) - def addRenderPartition(self, hostname, threads, max_cores, num_mem, max_gpu): + def addRenderPartition(self, hostname, threads, max_cores, num_mem, max_gpus, max_gpu_memory): """Adds a render partition to the job. 
:type hostname: str @@ -222,8 +236,10 @@ def addRenderPartition(self, hostname, threads, max_cores, num_mem, max_gpu): :param max_cores: max cores enabled for the partition :type num_mem: int :param num_mem: amount of memory reserved for the partition - :type max_gpu: int - :param max_gpu: max gpu cores enabled for the partition + :type max_gpus: int + :param max_gpus: max gpu cores enabled for the partition + :type max_gpu_memory: int + :param max_gpu_memory: amount of gpu memory reserved for the partition """ self.stub.AddRenderPartition( job_pb2.JobAddRenderPartRequest(job=self.data, @@ -231,7 +247,8 @@ def addRenderPartition(self, hostname, threads, max_cores, num_mem, max_gpu): threads=threads, max_cores=max_cores, max_memory=num_mem, - max_gpu=max_gpu, + max_gpus=max_gpus, + max_gpu_memory=max_gpu_memory, username=os.getenv("USER", "unknown"))) def getWhatDependsOnThis(self): @@ -492,6 +509,20 @@ def maxCores(self): """ return self.data.max_cores + def minGpus(self): + """Returns the minimum number of gpus the job needs. + :rtype: int + :return: job's min gpus + """ + return self.data.min_gpus + + def maxGpus(self): + """Returns the maximum number of gpus the job will use. + :rtype: int + :return: job's max gpus + """ + return self.data.max_gpus + def os(self): """Returns the job's operating system. @@ -823,6 +854,18 @@ def setMaxCores(self, maxCores): """ self.asJob().setMaxCores(maxCores) + def setMinGpus(self, minGpus): + """Sets the minimum gpus value + :type minGpus: int + :param minGpus: New minimum gpus value""" + self.asJob().setMinGpus(minGpus) + + def setMaxGpus(self, maxGpus): + """Sets the maximum gpus value + :type maxGpus: int + :param maxGpus: New maximum gpus value""" + self.asJob().setMaxGpus(maxGpus) + def setPriority(self, priority): """Sets the job priority. 
diff --git a/pycue/opencue/wrappers/layer.py b/pycue/opencue/wrappers/layer.py index 7c33b68e7..0e34985bf 100644 --- a/pycue/opencue/wrappers/layer.py +++ b/pycue/opencue/wrappers/layer.py @@ -140,14 +140,30 @@ def setMinCores(self, cores): job_pb2.LayerSetMinCoresRequest(layer=self.data, cores=cores/100.0), timeout=Cuebot.Timeout) - def setMinGpu(self, gpu): + def setMaxGpus(self, max_gpus): + """Sets the maximum number of gpus that this layer requires. + :type max_gpus: int + :param max_gpus: gpu cores""" + return self.stub.SetMaxGpus( + job_pb2.LayerSetMaxGpusRequest(layer=self.data, max_gpus=max_gpus), + timeout=Cuebot.Timeout) + + def setMinGpus(self, min_gpus): + """Sets the minimum number of gpus that this layer requires. + :type min_gpus: int + :param min_gpus: gpu cores""" + return self.stub.SetMinGpus( + job_pb2.LayerSetMinGpusRequest(layer=self.data, min_gpus=min_gpus), + timeout=Cuebot.Timeout) + + def setMinGpuMemory(self, gpu_memory): """Sets the minimum number of gpu memory that this layer requires. - :type gpu: int - :param gpu: gpu value + :type gpu_memory: int + :param gpu_memory: gpu_memory value """ - return self.stub.SetMinGpu( - job_pb2.LayerSetMinGpuRequest(layer=self.data, gpu=gpu), + return self.stub.SetMinGpuMemory( + job_pb2.LayerSetMinGpuMemoryRequest(layer=self.data, gpu_memory=gpu_memory), timeout=Cuebot.Timeout) def setMinMemory(self, memory): @@ -401,6 +417,12 @@ def coresReserved(self): """ return self.data.layer_stats.reserved_cores + def gpusReserved(self): + """Returns the number of gpus reserved on this layer + :rtype: float + :return: gpus reserved""" + return self.data.layer_stats.reserved_gpus + def minCores(self): """Returns the minimum number of cores that frames in this layer require. 
@@ -409,6 +431,12 @@ def minCores(self): """ return self.data.min_cores + def minGpus(self): + """Returns the minimum number of gpus that frames in this layer require + :rtype: int + :return: Minimum number of gpus required""" + return self.data.min_gpus + def minMemory(self): """Returns the minimum amount of memory that frames in this layer require. diff --git a/pycue/opencue/wrappers/show.py b/pycue/opencue/wrappers/show.py index 750d05645..9fff4b47d 100644 --- a/pycue/opencue/wrappers/show.py +++ b/pycue/opencue/wrappers/show.py @@ -167,6 +167,32 @@ def setDefaultMinCores(self, mincores): timeout=Cuebot.Timeout) return response + def setDefaultMaxGpus(self, maxgpus): + """Sets the default maximum number of gpus + that new jobs are launched with. + :type: float + :param: value to set maxGpus to + :rtype: show_pb2.ShowSetDefaultMaxGpusResponse + :return: response is empty + """ + response = self.stub.SetDefaultMaxGpus(show_pb2.ShowSetDefaultMaxGpusRequest( + show=self.data, max_gpu=maxgpus), + timeout=Cuebot.Timeout) + return response + + def setDefaultMinGpus(self, mingpus): + """Sets the default minimum number of gpus + all new jobs are launched with. + :type: float + :param: value to set minGpus to + :rtype: show_pb2.ShowSetDefaultMinGpusResponse + :return: response is empty + """ + response = self.stub.SetDefaultMinGpus(show_pb2.ShowSetDefaultMinGpusRequest( + show=self.data, min_gpu=mingpus), + timeout=Cuebot.Timeout) + return response + def findFilter(self, name): """Finds a filter by name. 
diff --git a/pycue/tests/wrappers/layer_test.py b/pycue/tests/wrappers/layer_test.py index cf8fb0c33..4f5578681 100644 --- a/pycue/tests/wrappers/layer_test.py +++ b/pycue/tests/wrappers/layer_test.py @@ -201,18 +201,18 @@ def testSetMaxCores(self, getStubMock): job_pb2.LayerSetMaxCoresRequest(layer=layer.data, cores=testCoresActual), timeout=mock.ANY) - def testSetMinGpu(self, getStubMock): + def testSetMinGpuMemory(self, getStubMock): stubMock = mock.Mock() - stubMock.SetMinGpu.return_value = job_pb2.LayerSetMinGpuResponse() + stubMock.SetMinGpuMemory.return_value = job_pb2.LayerSetMinGpuResponse() getStubMock.return_value = stubMock testCores = 100 layer = opencue.wrappers.layer.Layer( job_pb2.Layer(name=TEST_LAYER_NAME)) - layer.setMinGpu(testCores) + layer.setMinGpuMemory(testCores) - stubMock.SetMinGpu.assert_called_with( - job_pb2.LayerSetMinGpuRequest(layer=layer.data, gpu=testCores), + stubMock.SetMinGpuMemory.assert_called_with( + job_pb2.LayerSetMinGpuMemoryRequest(layer=layer.data, gpu_memory=testCores), timeout=mock.ANY) def testSetMinMemory(self, getStubMock): diff --git a/pyoutline/etc/outline.cfg b/pyoutline/etc/outline.cfg index 6d0f129a4..fafbfa636 100644 --- a/pyoutline/etc/outline.cfg +++ b/pyoutline/etc/outline.cfg @@ -5,7 +5,7 @@ wrapper_dir = %(home)s/wrappers user_dir = bin_dir = %(home)s/bin backend = cue -spec_version = 1.11 +spec_version = 1.12 facility = local domain = example.com maxretries = 2 diff --git a/pyoutline/outline/backend/cue.py b/pyoutline/outline/backend/cue.py index 6cd03105d..333b5ac59 100644 --- a/pyoutline/outline/backend/cue.py +++ b/pyoutline/outline/backend/cue.py @@ -323,6 +323,30 @@ def _serialize(launcher, use_pycuerun): if layer.get_arg("memory"): sub_element(spec_layer, "memory", "%s" % (layer.get_arg("memory"))) + gpus = None + if layer.get_arg("gpus"): + if spec_version >= Version("1.12"): + gpus = layer.get_arg("gpus") + else: + _warning_spec_version(spec_version, "gpus") + + gpu_memory = None + if 
layer.get_arg("gpu_memory"): + if spec_version >= Version("1.12"): + gpu_memory = layer.get_arg("gpu_memory") + else: + _warning_spec_version(spec_version, "gpu_memory") + + if gpus or gpu_memory: + # Cuebot expects non-zero positive value on gpus and gpu_memory + if gpus is None: + gpus = 1 + if gpu_memory is None: + gpu_memory = "1g" + + sub_element(spec_layer, "gpus", "%d" % gpus) + sub_element(spec_layer, "gpu_memory", "%s" % gpu_memory) + if layer.get_arg("timeout"): if spec_version >= Version("1.10"): sub_element(spec_layer, "timeout", "%s" % (layer.get_arg("timeout"))) diff --git a/pyoutline/tests/specver_test.py b/pyoutline/tests/specver_test.py index 3244e6807..f4c077d4b 100644 --- a/pyoutline/tests/specver_test.py +++ b/pyoutline/tests/specver_test.py @@ -63,3 +63,24 @@ def test_1_11(self): self.assertEqual(root.find("job/layers/layer/timeout").text, "420") self.assertEqual(root.find("job/layers/layer/timeout_llu").text, "4200") self.assertEqual(root.find("job/priority").text, "42") + + def _makeGpuSpec(self): + ol = outline.Outline(name="spec_version_test") + layer = outline.modules.shell.Shell("test_layer", command=["/bin/ls"]) + layer.set_arg("gpus", 4) + layer.set_arg("gpu_memory", 8 * 1024 * 1024) + ol.add_layer(layer) + l = outline.cuerun.OutlineLauncher(ol) + return Et.fromstring(l.serialize()) + + def test_gpu_1_11(self): + outline.config.set("outline", "spec_version", "1.11") + root = self._makeGpuSpec() + self.assertIsNone(root.find("job/layers/layer/gpus")) + self.assertIsNone(root.find("job/layers/layer/gpus_memory")) + + def test_gpu_1_12(self): + outline.config.set("outline", "spec_version", "1.12") + root = self._makeGpuSpec() + self.assertEqual(root.find("job/layers/layer/gpus").text, "4") + self.assertEqual(root.find("job/layers/layer/gpu_memory").text, "8388608") diff --git a/rqd/rqd/rqcore.py b/rqd/rqd/rqcore.py index dd58e35a6..b7dcae31a 100644 --- a/rqd/rqd/rqcore.py +++ b/rqd/rqd/rqcore.py @@ -90,7 +90,7 @@ def 
__createEnvVariables(self): self.frameEnv["maxframetime"] = "0" self.frameEnv["minspace"] = "200" self.frameEnv["CUE3"] = "True" - self.frameEnv["CUE_GPU_MEMORY"] = str(self.rqCore.machine.getGpuMemory()) + self.frameEnv["CUE_GPU_MEMORY"] = str(self.rqCore.machine.getGpuMemoryFree()) self.frameEnv["SP_NOMYCSHRC"] = "1" for key in self.runFrame.environment: @@ -103,6 +103,10 @@ def __createEnvVariables(self): len(self.runFrame.attributes['CPU_LIST'].split(',')))) self.frameEnv['CUE_HT'] = "True" + # Add GPU's to use all assigned GPU cores + if 'GPU_LIST' in self.runFrame.attributes: + self.frameEnv['CUE_GPU_CORES'] = self.runFrame.attributes['GPU_LIST'] + def _createCommandFile(self, command): """Creates a file that subprocess. Popen then executes. @type command: string @@ -187,6 +191,8 @@ def __writeFooter(self): print("%-20s%s" % ("endTime", time.ctime(self.endTime)), file=self.rqlog) print("%-20s%s" % ("maxrss", self.frameInfo.maxRss), file=self.rqlog) + print("%-20s%s" % ("maxUsedGpuMemory", + self.frameInfo.maxUsedGpuMemory), file=self.rqlog) print("%-20s%s" % ("utime", self.frameInfo.utime), file=self.rqlog) print("%-20s%s" % ("stime", self.frameInfo.stime), file=self.rqlog) print("%-20s%s" % ("renderhost", self.rqCore.machine.getHostname()), file=self.rqlog) @@ -531,7 +537,9 @@ def run(self): # Delay keeps the cuebot from spamming failing booking requests time.sleep(10) finally: - self.rqCore.releaseCores(self.runFrame.num_cores, runFrame.attributes.get('CPU_LIST')) + self.rqCore.releaseCores(self.runFrame.num_cores, runFrame.attributes.get('CPU_LIST'), + runFrame.attributes.get('GPU_LIST') + if 'GPU_LIST' in self.runFrame.attributes else None) self.rqCore.deleteFrame(self.runFrame.frame_id) @@ -733,7 +741,7 @@ def killAllFrame(self, reason): pass time.sleep(1) - def releaseCores(self, reqRelease, releaseHT=None): + def releaseCores(self, reqRelease, releaseHT=None, releaseGpus=None): """The requested number of cores are released @type reqRelease: int @param 
reqRelease: Number of cores to release, 100 = 1 physical core""" @@ -753,6 +761,9 @@ def releaseCores(self, reqRelease, releaseHT=None): if releaseHT: self.machine.releaseHT(releaseHT) + if releaseGpus: + self.machine.releaseGpus(releaseGpus) + finally: self.__threadLock.release() @@ -851,6 +862,11 @@ def launchFrame(self, runFrame): if reserveHT: runFrame.attributes['CPU_LIST'] = reserveHT + if runFrame.num_gpus: + reserveGpus = self.machine.reserveGpus(runFrame.num_gpus) + if reserveGpus: + runFrame.attributes['GPU_LIST'] = reserveGpus + # They must be available at this point, reserve them # pylint: disable=no-member self.cores.idle_cores -= runFrame.num_cores diff --git a/rqd/rqd/rqmachine.py b/rqd/rqd/rqmachine.py index 915c13afc..3da883111 100644 --- a/rqd/rqd/rqmachine.py +++ b/rqd/rqd/rqmachine.py @@ -74,6 +74,7 @@ def __init__(self, rqCore, coreInfo): self.__rqCore = rqCore self.__coreInfo = coreInfo self.__tasksets = set() + self.__gpusets = set() if platform.system() == 'Linux': self.__vmstat = rqd.rqswap.VmStat() @@ -97,6 +98,7 @@ def __init__(self, rqCore, coreInfo): self.__pidHistory = {} self.setupHT() + self.setupGpu() def isNimbySafeToRunJobs(self): """Returns False if nimby should be triggered due to resource limits""" @@ -276,6 +278,14 @@ def rssUpdate(self, frames): frame.rss = rss frame.maxRss = max(rss, frame.maxRss) + if 'GPU_LIST' in frame.runFrame.attributes: + usedGpuMemory = 0 + for unitId in frame.runFrame.attributes.get('GPU_LIST').split(','): + usedGpuMemory += self.getGpuMemoryUsed(unitId) + + frame.usedGpuMemory = usedGpuMemory + frame.maxUsedGpuMemory = max(usedGpuMemory, frame.maxUsedGpuMemory) + if os.path.exists(frame.runFrame.log_dir_file): stat = os.stat(frame.runFrame.log_dir_file).st_mtime frame.lluTime = int(stat) @@ -315,44 +325,71 @@ def getBootTime(self): return int(line.split()[1]) return 0 + @rqd.rqutil.Memoize + def getGpuCount(self): + """Returns the total gpu's on the machine""" + return self.__getGpuValues()['count'] 
+ @rqd.rqutil.Memoize def getGpuMemoryTotal(self): """Returns the total gpu memory in kb for CUE_GPU_MEMORY""" return self.__getGpuValues()['total'] - def getGpuMemory(self): + def getGpuMemoryFree(self): """Returns the available gpu memory in kb for CUE_GPU_MEMORY""" return self.__getGpuValues()['free'] + def getGpuMemoryUsed(self, unitId): + """Returns the used gpu memory in kb for the given gpu unit""" + usedMemory = self.__getGpuValues()['used'] + return usedMemory[unitId] if unitId in usedMemory else 0 + # pylint: disable=attribute-defined-outside-init + def __resetGpuResults(self): + self.gpuResults = {'count': 0, 'total': 0, 'free': 0, 'used': {}, 'updated': 0} + def __getGpuValues(self): if not hasattr(self, 'gpuNotSupported'): if not hasattr(self, 'gpuResults'): - self.gpuResults = {'total': 0, 'free': 0, 'updated': 0} + self.__resetGpuResults() if not rqd.rqconstants.ALLOW_GPU: self.gpuNotSupported = True return self.gpuResults - if self.gpuResults['updated'] > time.time() - 60: + if self.gpuResults['updated'] > int(time.time()) - 60: return self.gpuResults try: - # /shots/spi/home/bin/spinux1/cudaInfo - # /shots/spi/home/bin/rhel7/cudaInfo - cudaInfo = subprocess.getoutput('/usr/local/spi/rqd3/cudaInfo') - if 'There is no device supporting CUDA' in cudaInfo: - self.gpuNotSupported = True - else: - results = cudaInfo.splitlines()[-1].split() - # TotalMem 1023 Mb FreeMem 968 Mb - # The int(math.ceil(int(x) / 32.0) * 32) rounds up to the next multiple of 32 - self.gpuResults['total'] = ( - int(math.ceil(int(results[1]) / 32.0) * 32) * KILOBYTE) - self.gpuResults['free'] = int(results[4]) * KILOBYTE - self.gpuResults['updated'] = time.time() + nvidia_smi = subprocess.getoutput( + 'nvidia-smi --query-gpu=memory.total,memory.free,count' + ' --format=csv,noheader') + total = 0 + free = 0 + count = 0 + unitId = 0 + for line in nvidia_smi.splitlines(): + # Example "16130 MiB, 16103 MiB, 8" + # 1 MiB = 1048.576 KB + l = line.split() + unitTotal = 
math.ceil(int(l[0]) * 1048.576) + unitFree = math.ceil(int(l[2]) * 1048.576) + total += unitTotal + free += unitFree + count = int(l[-1]) + self.gpuResults['used'][str(unitId)] = unitTotal - unitFree + unitId += 1 + + self.gpuResults['total'] = int(total) + self.gpuResults['free'] = int(free) + self.gpuResults['count'] = count + self.gpuResults['updated'] = int(time.time()) # pylint: disable=broad-except except Exception as e: + self.gpuNotSupported = True + self.__resetGpuResults() log.warning( - 'Failed to get FreeMem from cudaInfo due to: %s at %s', + 'Failed to query nvidia-smi due to: %s at %s', e, traceback.extract_tb(sys.exc_info()[2])) + else: + self.__resetGpuResults() return self.gpuResults def __getSwapout(self): @@ -592,7 +629,10 @@ def updateMachineStats(self): self.__renderHost.free_swap = freeSwapMem self.__renderHost.free_mem = freeMem + cachedMem - self.__renderHost.attributes['freeGpu'] = str(self.getGpuMemory()) + self.__renderHost.num_gpus = self.getGpuCount() + self.__renderHost.total_gpu_mem = self.getGpuMemoryTotal() + self.__renderHost.free_gpu_mem = self.getGpuMemoryFree() + self.__renderHost.attributes['swapout'] = self.__getSwapout() elif platform.system() == 'Darwin': @@ -647,6 +687,10 @@ def setupHT(self): if self.__enabledHT(): self.__tasksets = set(range(self.__coreInfo.total_cores // 100)) + def setupGpu(self): + """ Setup rqd for Gpus """ + self.__gpusets = set(range(self.getGpuCount())) + def reserveHT(self, reservedCores): """ Reserve cores for use by taskset taskset -c 0,1,8,9 COMMAND @@ -698,3 +742,32 @@ def releaseHT(self, reservedHT): for core in reservedHT.split(','): if int(core) < self.__coreInfo.total_cores // 100: self.__tasksets.add(int(core)) + + def reserveGpus(self, reservedGpus): + """ Reserve gpus + @type reservedGpus: int + @param reservedGpus: The total gpus reserved by the frame. + @rtype: string + @return: The gpu-list. 
ex: '0,1,8,9' + """ + if len(self.__gpusets) < reservedGpus: + err = 'Not launching, insufficient GPUs to reserve based on reservedGpus' + log.critical(err) + raise rqd.rqexceptions.CoreReservationFailureException(err) + + gpusets = [] + for _ in range(reservedGpus): + gpu = self.__gpusets.pop() + gpusets.append(str(gpu)) + + return ','.join(gpusets) + + def releaseGpus(self, reservedGpus): + """ Release gpus + @type: string + @param: The gpu-list to release. ex: '0,1,8,9' + """ + log.debug('GPU set: Releasing gpu - %s', reservedGpus) + for gpu in reservedGpus.split(','): + if int(gpu) < self.getGpuCount(): + self.__gpusets.add(int(gpu)) diff --git a/rqd/rqd/rqnetwork.py b/rqd/rqd/rqnetwork.py index da5bee5cc..43abc8229 100644 --- a/rqd/rqd/rqnetwork.py +++ b/rqd/rqd/rqnetwork.py @@ -62,6 +62,10 @@ def __init__(self, rqCore, runFrame): self.vsize = 0 self.maxVsize = 0 + self.numGpus = 0 + self.usedGpuMemory = 0 + self.maxUsedGpuMemory = 0 + self.realtime = 0 self.utime = 0 self.stime = 0 @@ -84,7 +88,10 @@ def runningFrameInfo(self): max_vsize=self.maxVsize, vsize=self.vsize, attributes=self.runFrame.attributes, - llu_time=self.lluTime + llu_time=self.lluTime, + num_gpus=self.numGpus, + max_used_gpu_memory=self.maxUsedGpuMemory, + used_gpu_memory=self.usedGpuMemory ) return runningFrameInfo diff --git a/rqd/tests/rqconstants_tests.py b/rqd/tests/rqconstants_tests.py index 0df71790c..46da55f39 100644 --- a/rqd/tests/rqconstants_tests.py +++ b/rqd/tests/rqconstants_tests.py @@ -41,7 +41,6 @@ from .rqmachine_tests import ( CPUINFO, - CUDAINFO, LOADAVG_LOW_USAGE, MEMINFO_MODERATE_USAGE, PROC_STAT, @@ -78,7 +77,7 @@ def decorator(*args, **kwargs): return decorator -@mock.patch("subprocess.getoutput", new=mock.MagicMock(return_value=CUDAINFO)) +@mock.patch("subprocess.getoutput", new=mock.MagicMock(return_value="")) @mock.patch.object( rqd.rqutil.Memoize, "isCached", new=mock.MagicMock(return_value=False) ) diff --git a/rqd/tests/rqmachine_tests.py 
b/rqd/tests/rqmachine_tests.py index 7c7a650e1..b4bfe7bad 100644 --- a/rqd/tests/rqmachine_tests.py +++ b/rqd/tests/rqmachine_tests.py @@ -153,17 +153,13 @@ '16781318 0 0 0 0 17 4 0 0 0 0 0 6303248 6304296 23932928 140725890743234 ' '140725890743420 140725890743420 140725890744298 0') -CUDAINFO = ' TotalMem 1023 Mb FreeMem 968 Mb' - -@mock.patch('subprocess.getoutput', new=mock.MagicMock(return_value=CUDAINFO)) @mock.patch.object(rqd.rqutil.Memoize, 'isCached', new=mock.MagicMock(return_value=False)) @mock.patch('platform.system', new=mock.MagicMock(return_value='Linux')) @mock.patch('os.statvfs', new=mock.MagicMock()) @mock.patch('rqd.rqutil.getHostname', new=mock.MagicMock(return_value='arbitrary-hostname')) class MachineTests(pyfakefs.fake_filesystem_unittest.TestCase): - @mock.patch('subprocess.getoutput', new=mock.MagicMock(return_value=CUDAINFO)) @mock.patch('os.statvfs', new=mock.MagicMock()) @mock.patch('platform.system', new=mock.MagicMock(return_value='Linux')) def setUp(self): @@ -318,29 +314,39 @@ def test_getBootTime(self): self.assertEqual(1569882758, self.machine.getBootTime()) - @mock.patch( - 'subprocess.getoutput', - new=mock.MagicMock(return_value=' TotalMem 1023 Mb FreeMem 968 Mb')) - def test_getGpuMemoryTotal(self): + def _resetGpuStat(self): if hasattr(self.machine, 'gpuNotSupported'): delattr(self.machine, 'gpuNotSupported') if hasattr(self.machine, 'gpuResults'): delattr(self.machine, 'gpuResults') - rqd.rqconstants.ALLOW_GPU = True - - self.assertEqual(1048576, self.machine.getGpuMemoryTotal()) - @mock.patch( - 'subprocess.getoutput', - new=mock.MagicMock(return_value=' TotalMem 1023 Mb FreeMem 968 Mb')) - def test_getGpuMemory(self): - if hasattr(self.machine, 'gpuNotSupported'): - delattr(self.machine, 'gpuNotSupported') - if hasattr(self.machine, 'gpuResults'): - delattr(self.machine, 'gpuResults') - rqd.rqconstants.ALLOW_GPU = True + @mock.patch.object( + rqd.rqconstants, 'ALLOW_GPU', new=mock.MagicMock(return_value=True)) + 
@mock.patch('subprocess.getoutput', + new=mock.MagicMock(return_value='16130 MiB, 16119 MiB, 1')) + def test_getGpuStat(self): + self._resetGpuStat() + self.assertEqual(1, self.machine.getGpuCount()) + self.assertEqual(16913531, self.machine.getGpuMemoryTotal()) + self.assertEqual(16901997, self.machine.getGpuMemoryFree()) - self.assertEqual(991232, self.machine.getGpuMemory()) + @mock.patch.object( + rqd.rqconstants, 'ALLOW_GPU', new=mock.MagicMock(return_value=True)) + @mock.patch('subprocess.getoutput', + new=mock.MagicMock(return_value="""\ +16130 MiB, 16103 MiB, 8 +16130 MiB, 16119 MiB, 8 +16130 MiB, 16119 MiB, 8 +16130 MiB, 16119 MiB, 8 +16130 MiB, 4200 MiB, 8 +16130 MiB, 16119 MiB, 8 +16130 MiB, 16119 MiB, 8 +16130 MiB, 16119 MiB, 8""")) + def test_multipleGpus(self): + self._resetGpuStat() + self.assertEqual(8, self.machine.getGpuCount()) + self.assertEqual(135308248, self.machine.getGpuMemoryTotal()) + self.assertEqual(122701222, self.machine.getGpuMemoryFree()) def test_getPathEnv(self): self.assertEqual( @@ -365,16 +371,12 @@ def test_reboot(self, popenMock): popenMock.assert_called_with(['/usr/bin/sudo', '/sbin/reboot', '-f']) - @mock.patch( - 'subprocess.getoutput', - new=mock.MagicMock(return_value=' TotalMem 1023 Mb FreeMem 968 Mb')) def test_getHostInfo(self): # pylint: disable=no-member hostInfo = self.machine.getHostInfo() self.assertEqual(4105212, hostInfo.free_swap) self.assertEqual(25699176, hostInfo.free_mem) - self.assertEqual('991232', hostInfo.attributes['freeGpu']) self.assertEqual('0', hostInfo.attributes['swapout']) self.assertEqual(25, hostInfo.load) self.assertEqual(False, hostInfo.nimby_enabled)