Add NVIDIA self-heal tools and per-GPU SAT status
This commit is contained in:
@@ -28,6 +28,12 @@ var apiListNvidiaGPUs = func(a *app.App) ([]platform.NvidiaGPU, error) {
|
||||
}
|
||||
return a.ListNvidiaGPUs()
|
||||
}
|
||||
var apiListNvidiaGPUStatuses = func(a *app.App) ([]platform.NvidiaGPUStatus, error) {
|
||||
if a == nil {
|
||||
return nil, fmt.Errorf("app not configured")
|
||||
}
|
||||
return a.ListNvidiaGPUStatuses()
|
||||
}
|
||||
|
||||
// ── Job ID counter ────────────────────────────────────────────────────────────
|
||||
|
||||
@@ -788,6 +794,42 @@ func (h *handler) handleAPIGNVIDIAGPUs(w http.ResponseWriter, _ *http.Request) {
|
||||
writeJSON(w, gpus)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIGNVIDIAGPUStatuses(w http.ResponseWriter, _ *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
gpus, err := apiListNvidiaGPUStatuses(h.opts.App)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
if gpus == nil {
|
||||
gpus = []platform.NvidiaGPUStatus{}
|
||||
}
|
||||
writeJSON(w, gpus)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIGNVIDIAReset(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
var req struct {
|
||||
Index int `json:"index"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
return
|
||||
}
|
||||
result, err := h.opts.App.ResetNvidiaGPU(req.Index)
|
||||
status := "ok"
|
||||
if err != nil {
|
||||
status = "error"
|
||||
}
|
||||
writeJSON(w, map[string]string{"status": status, "output": result.Body})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
|
||||
Reference in New Issue
Block a user