This repository has been archived by the owner on May 8, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
nvidia.go
173 lines (161 loc) · 5.55 KB
/
nvidia.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
package nvidia
import (
"bytes"
"encoding/csv"
"os/exec"
"strconv"
"sync"
"time"
//"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
"github.com/xxxserxxx/gotop/v4/devices"
)
// init allocates the plugin's caches and registers its callbacks with gotop.
//
// The Register*(f) functions tell gotop which plugin functions to call to
// fetch temperature, memory, and usage data; RegisterStartup() sets the
// function gotop calls once everything else has been done and the plugin
// should start collecting data.
//
// A single run of the nvidia program returns *all* of the data this plugin
// reports, but gotop calls each update function separately during every
// cycle. Running the tool from each of them would mean 3 (or more)
// invocations per cycle, so instead a background job runs the tool
// periodically and stores the results in maps; the update functions only
// copy data out of those maps.
func init() {
	// The caches are allocated before any callback is registered.
	_temps = map[string]int{}
	_mems = map[string]devices.MemoryInfo{}
	_cpus = map[string]int{}
	errors = map[string]error{}

	devices.RegisterTemp(updateNvidiaTemp)
	devices.RegisterMem(updateNvidiaMem)
	devices.RegisterCPU(updateNvidiaUsage)

	// Reset the cache mutex to its zero (unlocked) value.
	lock = sync.Mutex{}

	devices.RegisterStartup(startup)
}
// updateNvidiaTemp copies data from the local _temps cache into the passed-in
// return-value map. It is called once per cycle by gotop.
//
// The returned map is a snapshot of the cached errors, taken while holding
// the cache lock. Returning the cache itself would let the caller iterate it
// while the background job is writing to it, which is a data race (and a
// fatal "concurrent map iteration and map write" at runtime). The result is
// never nil; it is empty when no errors are cached.
func updateNvidiaTemp(temps map[string]int) map[string]error {
	lock.Lock()
	defer lock.Unlock()

	for name, temp := range _temps {
		temps[name] = temp
	}

	// Copy rather than alias: the caller reads this map after the lock has
	// been released.
	errs := make(map[string]error, len(errors))
	for name, err := range errors {
		errs[name] = err
	}
	return errs
}
// updateNvidiaMem copies data from the local _mems cache into the passed-in
// return-value map. It is called once per cycle by gotop.
//
// The returned map is a snapshot of the cached errors, taken while holding
// the cache lock. Returning the cache itself would let the caller iterate it
// while the background job is writing to it, which is a data race (and a
// fatal "concurrent map iteration and map write" at runtime). The result is
// never nil; it is empty when no errors are cached.
func updateNvidiaMem(mems map[string]devices.MemoryInfo) map[string]error {
	lock.Lock()
	defer lock.Unlock()

	for name, mem := range _mems {
		mems[name] = mem
	}

	// Copy rather than alias: the caller reads this map after the lock has
	// been released.
	errs := make(map[string]error, len(errors))
	for name, err := range errors {
		errs[name] = err
	}
	return errs
}
// updateNvidiaUsage copies data from the local _cpus cache into the passed-in
// return-value map. It is called once per cycle by gotop. The second
// argument is ignored.
//
// The returned map is a snapshot of the cached errors, taken while holding
// the cache lock. Returning the cache itself would let the caller iterate it
// while the background job is writing to it, which is a data race (and a
// fatal "concurrent map iteration and map write" at runtime). The result is
// never nil; it is empty when no errors are cached.
func updateNvidiaUsage(cpus map[string]int, _ bool) map[string]error {
	lock.Lock()
	defer lock.Unlock()

	for name, usage := range _cpus {
		cpus[name] = usage
	}

	// Copy rather than alias: the caller reads this map after the lock has
	// been released.
	errs := make(map[string]error, len(errors))
	for name, err := range errors {
		errs[name] = err
	}
	return errs
}
// refreshError reports an `nvidia-refresh` option value that parsed as a
// duration but cannot be used as a refresh period. The underlying string is
// the offending option value.
type refreshError string

// Error implements the error interface.
func (e refreshError) Error() string {
	return "nvidia-refresh must be a positive duration, got " + strconv.Quote(string(e))
}

// startup is called once by gotop, and forks a goroutine to call the nvidia
// tool periodically and update the cached cpu, memory, and temperature
// values that are used by the update*() functions to return data to gotop.
//
// The vars argument contains command-line arguments to allow the plugin
// to change runtime options; the only option currently supported is the
// `nvidia-refresh` arg, which is expected to be a time.Duration value and
// sets how frequently the nvidia tool is called to refresh the data. When
// the option is absent the refresh period is one second.
//
// startup returns an error, without collecting any data or starting the
// background job, if `nvidia-refresh` cannot be parsed as a duration or is
// not greater than zero.
func startup(vars map[string]string) error {
	// Get the refresh period from the passed-in command-line/config
	// file options.
	refresh := time.Second
	if v, ok := vars["nvidia-refresh"]; ok {
		d, err := time.ParseDuration(v)
		if err != nil {
			return err
		}
		// time.Tick returns a nil channel for a non-positive duration, which
		// would leave the background job blocked forever with no indication
		// that anything is wrong. Reject such values up front instead.
		if d <= 0 {
			return refreshError(v)
		}
		refresh = d
	}

	// Update once to populate the device names, for the widgets.
	update()

	// Fork off a long-running job to call the nvidia tool periodically,
	// parse out the values, and put them in the cache. The job runs for the
	// remaining lifetime of the process.
	go func() {
		for range time.Tick(refresh) {
			update()
		}
	}()
	return nil
}
// Package-level state shared between the background job that runs the nvidia
// tool and the update*() functions that gotop calls.
var (
	// lock is held by update() while it writes the caches below, and by the
	// update*() functions while they copy data out of them.
	lock sync.Mutex

	// Latest values parsed from the nvidia tool's output, keyed by device
	// name ("<name>.<index>").
	_temps map[string]int
	_mems  map[string]devices.MemoryInfo
	_cpus  map[string]int

	// errors holds the errors generated by the background job running the
	// nvidia tool; they are returned to gotop when it calls the update*()
	// functions.
	errors map[string]error
)
// update calls the nvidia tool, parses the output, and caches the results
// in the various _* maps. The metric data parsed is: name, index,
// temperature.gpu, utilization.gpu, utilization.memory, memory.total,
// memory.free, memory.used
//
// If this function encounters an error calling `nvidia-smi`, it caches the
// error and returns immediately. We expect exec errors only when the tool
// isn't available, or when it fails for some reason; no exec error cases
// are recoverable. This does **not** stop the cache job; that will continue
// to run and continue to call update().
func update() {
bs, err := exec.Command(
"nvidia-smi",
"--query-gpu=name,index,temperature.gpu,utilization.gpu,memory.total,memory.used",
"--format=csv,noheader,nounits").Output()
if err != nil {
errors["nvidia"] = err
//bs = []byte("GeForce GTX 1080 Ti, 0, 31, 9, 11175, 206")
return
}
csvReader := csv.NewReader(bytes.NewReader(bs))
csvReader.TrimLeadingSpace = true
records, err := csvReader.ReadAll()
if err != nil {
errors["nvidia"] = err
return
}
// Ensure we're not trying to modify the caches while they're being read by the update() functions.
lock.Lock()
defer lock.Unlock()
// Errors during parsing are recorded, but do not stop parsing.
for _, row := range records {
// The name of the devices is the nvidia-smi "<name>.<index>"
name := row[0] + "." + row[1]
if _temps[name], err = strconv.Atoi(row[2]); err != nil {
errors[name] = err
}
if _cpus[name], err = strconv.Atoi(row[3]); err != nil {
errors[name] = err
}
t, err := strconv.Atoi(row[4])
if err != nil {
errors[name] = err
}
u, err := strconv.Atoi(row[5])
if err != nil {
errors[name] = err
}
_mems[name] = devices.MemoryInfo{
Total: 1048576 * uint64(t),
Used: 1048576 * uint64(u),
UsedPercent: (float64(u) / float64(t)) * 100.0,
}
}
}