Add NVIDIA self-heal tools and per-GPU SAT status

This commit is contained in:
Mikhail Chusavitin
2026-04-07 20:20:05 +03:00
parent 93cfa78e8c
commit 531d1ca366
13 changed files with 863 additions and 14 deletions

View File

@@ -122,6 +122,8 @@ type satRunner interface {
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error)
ResetNvidiaGPU(index int) (string, error)
RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
@@ -521,6 +523,15 @@ func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
return a.sat.ListNvidiaGPUs()
}
func (a *App) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
return a.sat.ListNvidiaGPUStatuses()
}
func (a *App) ResetNvidiaGPU(index int) (ActionResult, error) {
out, err := a.sat.ResetNvidiaGPU(index)
return ActionResult{Title: fmt.Sprintf("Reset NVIDIA GPU %d", index), Body: strings.TrimSpace(out)}, err
}
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir