Add NVIDIA self-heal tools and per-GPU SAT status

This commit is contained in:
Mikhail Chusavitin
2026-04-07 20:20:05 +03:00
parent 93cfa78e8c
commit 531d1ca366
13 changed files with 863 additions and 14 deletions

View File

@@ -28,6 +28,12 @@ var apiListNvidiaGPUs = func(a *app.App) ([]platform.NvidiaGPU, error) {
}
return a.ListNvidiaGPUs()
}
var apiListNvidiaGPUStatuses = func(a *app.App) ([]platform.NvidiaGPUStatus, error) {
if a == nil {
return nil, fmt.Errorf("app not configured")
}
return a.ListNvidiaGPUStatuses()
}
// ── Job ID counter ────────────────────────────────────────────────────────────
@@ -788,6 +794,42 @@ func (h *handler) handleAPIGNVIDIAGPUs(w http.ResponseWriter, _ *http.Request) {
writeJSON(w, gpus)
}
func (h *handler) handleAPIGNVIDIAGPUStatuses(w http.ResponseWriter, _ *http.Request) {
if h.opts.App == nil {
writeError(w, http.StatusServiceUnavailable, "app not configured")
return
}
gpus, err := apiListNvidiaGPUStatuses(h.opts.App)
if err != nil {
writeError(w, http.StatusInternalServerError, err.Error())
return
}
if gpus == nil {
gpus = []platform.NvidiaGPUStatus{}
}
writeJSON(w, gpus)
}
func (h *handler) handleAPIGNVIDIAReset(w http.ResponseWriter, r *http.Request) {
if h.opts.App == nil {
writeError(w, http.StatusServiceUnavailable, "app not configured")
return
}
var req struct {
Index int `json:"index"`
}
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
writeError(w, http.StatusBadRequest, "invalid request body")
return
}
result, err := h.opts.App.ResetNvidiaGPU(req.Index)
status := "ok"
if err != nil {
status = "error"
}
writeJSON(w, map[string]string{"status": status, "output": result.Body})
}
func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
if h.opts.App == nil {
writeError(w, http.StatusServiceUnavailable, "app not configured")