Compare commits

..

2 Commits
v6.3 ... v6.4

Author SHA1 Message Date
Mikhail Chusavitin
0a4bb596f6 Improve install-to-RAM verification for ISO boots 2026-04-07 20:21:06 +03:00
Mikhail Chusavitin
531d1ca366 Add NVIDIA self-heal tools and per-GPU SAT status 2026-04-07 20:20:05 +03:00
17 changed files with 931 additions and 24 deletions

View File

@@ -122,6 +122,8 @@ type satRunner interface {
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error)
ResetNvidiaGPU(index int) (string, error)
RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
@@ -521,6 +523,15 @@ func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
return a.sat.ListNvidiaGPUs() return a.sat.ListNvidiaGPUs()
} }
func (a *App) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
return a.sat.ListNvidiaGPUStatuses()
}
func (a *App) ResetNvidiaGPU(index int) (ActionResult, error) {
out, err := a.sat.ResetNvidiaGPU(index)
return ActionResult{Title: fmt.Sprintf("Reset NVIDIA GPU %d", index), Body: strings.TrimSpace(out)}, err
}
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) { func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
if strings.TrimSpace(baseDir) == "" { if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir baseDir = DefaultSATBaseDir

View File

@@ -135,6 +135,8 @@ type fakeSAT struct {
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error) listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
runAMDPackFn func(string) (string, error) runAMDPackFn func(string) (string, error)
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error) listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
listNvidiaGPUStatusesFn func() ([]platform.NvidiaGPUStatus, error)
resetNvidiaGPUFn func(int) (string, error)
} }
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) { func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
@@ -201,6 +203,20 @@ func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
return nil, nil return nil, nil
} }
func (f fakeSAT) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
if f.listNvidiaGPUStatusesFn != nil {
return f.listNvidiaGPUStatusesFn()
}
return nil, nil
}
func (f fakeSAT) ResetNvidiaGPU(index int) (string, error) {
if f.resetNvidiaGPUFn != nil {
return f.resetNvidiaGPUFn(index)
}
return "", nil
}
func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) { func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
return f.runMemoryFn(baseDir) return f.runMemoryFn(baseDir)
} }
@@ -805,6 +821,9 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
for _, want := range []string{ for _, want := range []string{
"/system/ip-link.txt", "/system/ip-link.txt",
"/system/ip-link-stats.txt", "/system/ip-link-stats.txt",
"/system/kernel-aer-nvidia.txt",
"/system/lspci-nvidia-bridges-vv.txt",
"/system/pcie-aer-sysfs.txt",
"/system/ethtool-info.txt", "/system/ethtool-info.txt",
"/system/ethtool-link.txt", "/system/ethtool-link.txt",
"/system/ethtool-module.txt", "/system/ethtool-module.txt",

View File

@@ -3,6 +3,7 @@ package app
import ( import (
"os" "os"
"path/filepath" "path/filepath"
"strconv"
"sort" "sort"
"strings" "strings"
@@ -18,6 +19,7 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *C
} }
if summary, ok := loadLatestSATSummary(baseDir, "gpu-nvidia-"); ok { if summary, ok := loadLatestSATSummary(baseDir, "gpu-nvidia-"); ok {
applyGPUVendorSAT(snap.PCIeDevices, "nvidia", summary) applyGPUVendorSAT(snap.PCIeDevices, "nvidia", summary)
applyNvidiaPerGPUStatus(snap.PCIeDevices, baseDir)
} }
if summary, ok := loadLatestSATSummary(baseDir, "memory-"); ok { if summary, ok := loadLatestSATSummary(baseDir, "memory-"); ok {
applyMemorySAT(snap.Memory, summary) applyMemorySAT(snap.Memory, summary)
@@ -32,6 +34,100 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *C
applyComponentStatusDB(snap, db) applyComponentStatusDB(snap, db)
} }
type nvidiaPerGPUStatus struct {
runStatus string
reason string
}
func applyNvidiaPerGPUStatus(devs []schema.HardwarePCIeDevice, baseDir string) {
statusByIndex, ts, ok := loadLatestNvidiaPerGPUStatus(baseDir)
if !ok {
return
}
for i := range devs {
if devs[i].Telemetry == nil {
continue
}
rawIdx, ok := devs[i].Telemetry["nvidia_gpu_index"]
if !ok {
continue
}
idx, ok := telemetryInt(rawIdx)
if !ok {
continue
}
st, ok := statusByIndex[idx]
if !ok {
continue
}
status, description, ok := satKeyStatus(st.runStatus, firstNonEmpty(strings.TrimSpace(st.reason), "nvidia GPU SAT"))
if !ok {
continue
}
mergeComponentStatusPreferDetail(&devs[i].HardwareComponentStatus, ts, status, description)
}
}
func loadLatestNvidiaPerGPUStatus(baseDir string) (map[int]nvidiaPerGPUStatus, string, bool) {
matches, err := filepath.Glob(filepath.Join(baseDir, "gpu-nvidia-*"))
if err != nil || len(matches) == 0 {
return nil, "", false
}
sort.Strings(matches)
runDir := matches[len(matches)-1]
summaryRaw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
if err != nil {
return nil, "", false
}
summaryKV := parseKeyValueSummary(string(summaryRaw))
runAtUTC := strings.TrimSpace(summaryKV["run_at_utc"])
files, err := filepath.Glob(filepath.Join(runDir, "gpu-*-status.txt"))
if err != nil || len(files) == 0 {
return nil, "", false
}
out := make(map[int]nvidiaPerGPUStatus, len(files))
for _, file := range files {
raw, err := os.ReadFile(file)
if err != nil {
continue
}
kv := parseKeyValueSummary(string(raw))
idx, err := strconv.Atoi(strings.TrimSpace(kv["gpu_index"]))
if err != nil {
continue
}
out[idx] = nvidiaPerGPUStatus{
runStatus: strings.ToUpper(strings.TrimSpace(kv["run_status"])),
reason: strings.TrimSpace(kv["reason"]),
}
}
if len(out) == 0 {
return nil, "", false
}
return out, runAtUTC, true
}
func telemetryInt(v any) (int, bool) {
switch value := v.(type) {
case int:
return value, true
case int32:
return int(value), true
case int64:
return int(value), true
case float64:
return int(value), true
case string:
n, err := strconv.Atoi(strings.TrimSpace(value))
if err != nil {
return 0, false
}
return n, true
default:
return 0, false
}
}
type satSummary struct { type satSummary struct {
runAtUTC string runAtUTC string
overall string overall string
@@ -176,6 +272,31 @@ func mergeComponentStatus(component *schema.HardwareComponentStatus, changedAt,
} }
} }
func mergeComponentStatusPreferDetail(component *schema.HardwareComponentStatus, changedAt, satStatus, description string) {
if component == nil || satStatus == "" {
return
}
current := strings.TrimSpace(ptrString(component.Status))
newSeverity := statusSeverity(satStatus)
currentSeverity := statusSeverity(current)
if current == "" || current == "Unknown" || newSeverity > currentSeverity {
mergeComponentStatus(component, changedAt, satStatus, description)
return
}
if newSeverity == currentSeverity && strings.TrimSpace(description) != "" {
component.Status = appStringPtr(satStatus)
component.ErrorDescription = appStringPtr(description)
if strings.TrimSpace(changedAt) != "" {
component.StatusChangedAt = appStringPtr(changedAt)
component.StatusHistory = append(component.StatusHistory, schema.HardwareStatusHistory{
Status: satStatus,
ChangedAt: changedAt,
Details: appStringPtr(description),
})
}
}
}
func statusSeverity(status string) int { func statusSeverity(status string) int {
switch strings.TrimSpace(status) { switch strings.TrimSpace(status) {
case "Critical": case "Critical":

View File

@@ -59,3 +59,51 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status) t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
} }
} }
func TestApplyLatestSATStatusesMarksNvidiaGPUByPerGPUStatusFile(t *testing.T) {
baseDir := t.TempDir()
runDir := filepath.Join(baseDir, "gpu-nvidia-20260407-162123")
if err := os.MkdirAll(runDir, 0755); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte("run_at_utc=2026-04-07T16:21:23Z\noverall_status=FAILED\n"), 0644); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(runDir, "gpu-1-status.txt"), []byte("gpu_index=1\ngpu_name=NVIDIA H100 PCIe\nrun_status=FAILED\nreason=GPU requires reset\n"), 0644); err != nil {
t.Fatal(err)
}
class := "VideoController"
manufacturer := "NVIDIA Corporation"
bdf0 := "0000:4b:00.0"
bdf1 := "0000:4f:00.0"
snap := schema.HardwareSnapshot{
PCIeDevices: []schema.HardwarePCIeDevice{
{
DeviceClass: &class,
Manufacturer: &manufacturer,
BDF: &bdf0,
Telemetry: map[string]any{"nvidia_gpu_index": 0},
},
{
DeviceClass: &class,
Manufacturer: &manufacturer,
BDF: &bdf1,
Telemetry: map[string]any{"nvidia_gpu_index": 1},
},
},
}
applyLatestSATStatuses(&snap, baseDir, nil)
if snap.PCIeDevices[1].Status == nil || *snap.PCIeDevices[1].Status != "Critical" {
t.Fatalf("gpu1 status=%v want Critical", snap.PCIeDevices[1].Status)
}
if snap.PCIeDevices[1].ErrorDescription == nil || *snap.PCIeDevices[1].ErrorDescription != "GPU requires reset failed" {
got := "<nil>"
if snap.PCIeDevices[1].ErrorDescription != nil {
got = *snap.PCIeDevices[1].ErrorDescription
}
t.Fatalf("gpu1 error=%q want per-gpu reason", got)
}
}

View File

@@ -40,7 +40,36 @@ var supportBundleCommands = []struct {
{name: "system/mount.txt", cmd: []string{"mount"}}, {name: "system/mount.txt", cmd: []string{"mount"}},
{name: "system/df-h.txt", cmd: []string{"df", "-h"}}, {name: "system/df-h.txt", cmd: []string{"df", "-h"}},
{name: "system/dmesg.txt", cmd: []string{"dmesg"}}, {name: "system/dmesg.txt", cmd: []string{"dmesg"}},
{name: "system/kernel-aer-nvidia.txt", cmd: []string{"sh", "-c", `
if command -v dmesg >/dev/null 2>&1; then
dmesg | grep -iE 'AER|NVRM|Xid|pcieport|nvidia' || echo "no AER/NVRM/Xid kernel messages found"
else
echo "dmesg not found"
fi
`}},
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}}, {name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
if ! command -v lspci >/dev/null 2>&1; then
echo "lspci not found"
exit 0
fi
found=0
for gpu in $(lspci -Dn | awk '$3 ~ /^10de:/ {print $1}'); do
found=1
echo "=== GPU $gpu ==="
lspci -s "$gpu" -vv 2>&1 || true
bridge=$(basename "$(readlink -f "/sys/bus/pci/devices/$gpu/.." 2>/dev/null)" 2>/dev/null)
if [ -n "$bridge" ] && [ "$bridge" != "$gpu" ]; then
echo
echo "=== UPSTREAM $bridge for $gpu ==="
lspci -s "$bridge" -vv 2>&1 || true
fi
echo
done
if [ "$found" -eq 0 ]; then
echo "no NVIDIA PCI devices found"
fi
`}},
{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", ` {name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
for d in /sys/bus/pci/devices/*/; do for d in /sys/bus/pci/devices/*/; do
vendor=$(cat "$d/vendor" 2>/dev/null) vendor=$(cat "$d/vendor" 2>/dev/null)
@@ -51,6 +80,30 @@ for d in /sys/bus/pci/devices/*/; do
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)" printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
done done
done done
`}},
{name: "system/pcie-aer-sysfs.txt", cmd: []string{"sh", "-c", `
found=0
for dev in /sys/bus/pci/devices/*; do
[ -e "$dev" ] || continue
bdf=$(basename "$dev")
block=""
for f in aer_dev_correctable aer_dev_fatal aer_dev_nonfatal aer_rootport_total_err_cor aer_rootport_total_err_fatal aer_rootport_total_err_nonfatal; do
if [ -r "$dev/$f" ]; then
if [ -z "$block" ]; then
block=1
found=1
echo "=== $bdf ==="
fi
printf " %-30s %s\n" "$f" "$(cat "$dev/$f" 2>/dev/null)"
fi
done
if [ -n "$block" ]; then
echo
fi
done
if [ "$found" -eq 0 ]; then
echo "no PCIe AER sysfs counters found"
fi
`}}, `}},
{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", ` {name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
if ! command -v ethtool >/dev/null 2>&1; then if ! command -v ethtool >/dev/null 2>&1; then

View File

@@ -13,6 +13,7 @@ import (
const nvidiaVendorID = 0x10de const nvidiaVendorID = 0x10de
type nvidiaGPUInfo struct { type nvidiaGPUInfo struct {
Index int
BDF string BDF string
Serial string Serial string
VBIOS string VBIOS string
@@ -132,6 +133,7 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
} }
info := nvidiaGPUInfo{ info := nvidiaGPUInfo{
Index: parseRequiredInt(rec[0]),
BDF: bdf, BDF: bdf,
Serial: strings.TrimSpace(rec[2]), Serial: strings.TrimSpace(rec[2]),
VBIOS: strings.TrimSpace(rec[3]), VBIOS: strings.TrimSpace(rec[3]),
@@ -187,6 +189,14 @@ func parseMaybeInt(v string) *int {
return &n return &n
} }
func parseRequiredInt(v string) int {
n, err := strconv.Atoi(strings.TrimSpace(v))
if err != nil {
return 0
}
return n
}
func pcieLinkGenLabel(gen int) string { func pcieLinkGenLabel(gen int) string {
return fmt.Sprintf("Gen%d", gen) return fmt.Sprintf("Gen%d", gen)
} }
@@ -240,6 +250,10 @@ func setPCIeFallback(dev *schema.HardwarePCIeDevice) {
} }
func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) { func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
if dev.Telemetry == nil {
dev.Telemetry = map[string]any{}
}
dev.Telemetry["nvidia_gpu_index"] = info.Index
if info.TemperatureC != nil { if info.TemperatureC != nil {
dev.TemperatureC = info.TemperatureC dev.TemperatureC = info.TemperatureC
} }

View File

@@ -86,6 +86,9 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
if out[0].Firmware == nil || *out[0].Firmware != "96.00.1F.00.02" { if out[0].Firmware == nil || *out[0].Firmware != "96.00.1F.00.02" {
t.Fatalf("firmware: got %v", out[0].Firmware) t.Fatalf("firmware: got %v", out[0].Firmware)
} }
if out[0].Telemetry == nil || out[0].Telemetry["nvidia_gpu_index"] != 0 {
t.Fatalf("telemetry nvidia_gpu_index: got %#v", out[0].Telemetry)
}
if out[0].Status == nil || *out[0].Status != statusWarning { if out[0].Status == nil || *out[0].Status != statusWarning {
t.Fatalf("status: got %v", out[0].Status) t.Fatalf("status: got %v", out[0].Status)
} }

View File

@@ -116,25 +116,47 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
if err := ctx.Err(); err != nil { if err := ctx.Err(); err != nil {
return err return err
} }
if err := exec.Command("mount", "--bind", dstDir, "/run/live/medium").Run(); err != nil {
log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err)) mediumRebound := false
if err := bindMount(dstDir, "/run/live/medium"); err != nil {
log(fmt.Sprintf("Warning: rebind /run/live/medium → %s failed: %v", dstDir, err))
} else {
mediumRebound = true
} }
log("Verifying live medium now served from RAM...") log("Verifying live medium now served from RAM...")
status := s.LiveBootSource() status := s.LiveBootSource()
if err := verifyInstallToRAMStatus(status); err != nil { if err := verifyInstallToRAMStatus(status, dstDir, mediumRebound, log); err != nil {
return err return err
} }
log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status))) if status.InRAM {
log("Done. Installation media can be safely disconnected.") log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
}
log("Done. Squashfs files are in RAM. Installation media can be safely disconnected.")
return nil return nil
} }
func verifyInstallToRAMStatus(status LiveBootSource) error { func verifyInstallToRAMStatus(status LiveBootSource, dstDir string, mediumRebound bool, log func(string)) error {
if status.InRAM { if status.InRAM {
return nil return nil
} }
return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s", describeLiveBootSource(status))
// The live medium mount was not redirected to RAM. This is expected when
// booting from an ISO/CD-ROM: the squashfs loop device has a non-zero
// offset and LOOP_CHANGE_FD cannot be used; the bind mount also fails
// because the CD-ROM mount is in use. Check whether files were at least
// copied to the tmpfs directory — that is sufficient for safe disconnection
// once the kernel has paged in all actively-used data.
files, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
if len(files) > 0 {
if !mediumRebound {
log(fmt.Sprintf("Note: squashfs copied to RAM (%s) but /run/live/medium still shows the original source.", dstDir))
log("This is normal for CD-ROM boots. For a fully transparent RAM boot, add 'toram' to the kernel parameters.")
}
return nil
}
return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s and no squashfs found in %s", describeLiveBootSource(status), dstDir)
} }
func describeLiveBootSource(status LiveBootSource) string { func describeLiveBootSource(status LiveBootSource) string {
@@ -247,7 +269,31 @@ func findLoopForFile(backingFile string) (string, error) {
return "", fmt.Errorf("no loop device found for %s", backingFile) return "", fmt.Errorf("no loop device found for %s", backingFile)
} }
// loopDeviceOffset returns the byte offset configured for the loop device,
// or -1 if it cannot be determined.
func loopDeviceOffset(loopDev string) int64 {
out, err := exec.Command("losetup", "--json", loopDev).Output()
if err != nil {
return -1
}
var result struct {
Loopdevices []struct {
Offset int64 `json:"offset"`
} `json:"loopdevices"`
}
if err := json.Unmarshal(out, &result); err != nil || len(result.Loopdevices) == 0 {
return -1
}
return result.Loopdevices[0].Offset
}
func reassociateLoopDevice(loopDev, newFile string) error { func reassociateLoopDevice(loopDev, newFile string) error {
// LOOP_CHANGE_FD requires lo_offset == 0. ISO/CD-ROM loop devices are
// typically set up with a non-zero offset (squashfs lives inside the ISO),
// so the ioctl returns EINVAL. Detect this early for a clear error message.
if off := loopDeviceOffset(loopDev); off > 0 {
return fmt.Errorf("loop device has non-zero offset (%d bytes, typical for ISO/CD-ROM) — LOOP_CHANGE_FD not supported; use 'toram' kernel parameter for RAM boot", off)
}
if err := exec.Command("losetup", "--replace", loopDev, newFile).Run(); err == nil { if err := exec.Command("losetup", "--replace", loopDev, newFile).Run(); err == nil {
return nil return nil
} }

View File

@@ -26,3 +26,8 @@ func loopChangeFD(loopDev, newFile string) error {
} }
return nil return nil
} }
// bindMount binds src over dst using the syscall directly (avoids exec PATH issues).
func bindMount(src, dst string) error {
return syscall.Mount(src, dst, "", syscall.MS_BIND, "")
}

View File

@@ -7,3 +7,7 @@ import "errors"
func loopChangeFD(loopDev, newFile string) error { func loopChangeFD(loopDev, newFile string) error {
return errors.New("LOOP_CHANGE_FD not available on this platform") return errors.New("LOOP_CHANGE_FD not available on this platform")
} }
func bindMount(src, dst string) error {
return errors.New("bind mount not available on this platform")
}

View File

@@ -33,14 +33,17 @@ func TestInferLiveBootKind(t *testing.T) {
func TestVerifyInstallToRAMStatus(t *testing.T) { func TestVerifyInstallToRAMStatus(t *testing.T) {
t.Parallel() t.Parallel()
if err := verifyInstallToRAMStatus(LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}); err != nil { dstDir := t.TempDir()
if err := verifyInstallToRAMStatus(LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}, dstDir, false, nil); err != nil {
t.Fatalf("expected success for RAM-backed status, got %v", err) t.Fatalf("expected success for RAM-backed status, got %v", err)
} }
err := verifyInstallToRAMStatus(LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"})
err := verifyInstallToRAMStatus(LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"}, dstDir, false, nil)
if err == nil { if err == nil {
t.Fatal("expected verification failure when media is still on USB") t.Fatal("expected verification failure when media is still on USB")
} }
if got := err.Error(); got != "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1)" { if got := err.Error(); got != "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1) and no squashfs found in "+dstDir {
t.Fatalf("error=%q", got) t.Fatalf("error=%q", got)
} }
} }

View File

@@ -88,6 +88,37 @@ type NvidiaGPU struct {
MemoryMB int `json:"memory_mb"` MemoryMB int `json:"memory_mb"`
} }
type NvidiaGPUStatus struct {
Index int `json:"index"`
Name string `json:"name"`
BDF string `json:"bdf,omitempty"`
Serial string `json:"serial,omitempty"`
Status string `json:"status"`
RawLine string `json:"raw_line,omitempty"`
NeedsReset bool `json:"needs_reset"`
ParseFailure bool `json:"parse_failure,omitempty"`
}
type nvidiaGPUHealth struct {
Index int
Name string
NeedsReset bool
RawLine string
ParseFailure bool
}
type nvidiaGPUStatusFile struct {
Index int
Name string
RunStatus string
Reason string
Health string
HealthRaw string
Observed bool
Selected bool
FailingJob string
}
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi. // AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
type AMDGPUInfo struct { type AMDGPUInfo struct {
Index int `json:"index"` Index int `json:"index"`
@@ -269,6 +300,72 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
return gpus, nil return gpus, nil
} }
func (s *System) ListNvidiaGPUStatuses() ([]NvidiaGPUStatus, error) {
out, err := satExecCommand(
"nvidia-smi",
"--query-gpu=index,name,pci.bus_id,serial,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total",
"--format=csv,noheader,nounits",
).Output()
if err != nil {
return nil, fmt.Errorf("nvidia-smi: %w", err)
}
var gpus []NvidiaGPUStatus
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
line = strings.TrimSpace(line)
if line == "" {
continue
}
parts := strings.Split(line, ",")
if len(parts) < 4 {
gpus = append(gpus, NvidiaGPUStatus{RawLine: line, Status: "UNKNOWN", ParseFailure: true})
continue
}
idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
if err != nil {
gpus = append(gpus, NvidiaGPUStatus{RawLine: line, Status: "UNKNOWN", ParseFailure: true})
continue
}
upper := strings.ToUpper(line)
needsReset := strings.Contains(upper, "GPU REQUIRES RESET")
status := "OK"
if needsReset {
status = "RESET_REQUIRED"
}
gpus = append(gpus, NvidiaGPUStatus{
Index: idx,
Name: strings.TrimSpace(parts[1]),
BDF: normalizeNvidiaBusID(strings.TrimSpace(parts[2])),
Serial: strings.TrimSpace(parts[3]),
Status: status,
RawLine: line,
NeedsReset: needsReset,
})
}
sort.Slice(gpus, func(i, j int) bool { return gpus[i].Index < gpus[j].Index })
return gpus, nil
}
func normalizeNvidiaBusID(v string) string {
v = strings.TrimSpace(strings.ToLower(v))
parts := strings.Split(v, ":")
if len(parts) == 3 && len(parts[0]) > 4 {
parts[0] = parts[0][len(parts[0])-4:]
return strings.Join(parts, ":")
}
return v
}
func (s *System) ResetNvidiaGPU(index int) (string, error) {
if index < 0 {
return "", fmt.Errorf("gpu index must be >= 0")
}
raw, err := satExecCommand("nvidia-smi", "-r", "-i", strconv.Itoa(index)).CombinedOutput()
if len(raw) == 0 && err == nil {
raw = []byte("GPU reset completed.\n")
}
return string(raw), err
}
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs. // RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
// Measures collective communication bandwidth over NVLink/PCIe. // Measures collective communication bandwidth over NVLink/PCIe.
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
@@ -604,7 +701,7 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}}, satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}}, satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
satJob{name: "04-dcgmi-diag.log", cmd: diagArgs}, satJob{name: "04-dcgmi-diag.log", cmd: diagArgs, gpuIndices: gpuIndices},
) )
} }
@@ -652,11 +749,23 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
var summary strings.Builder var summary strings.Builder
stats := satStats{} stats := satStats{}
nvidiaPack := strings.HasPrefix(prefix, "gpu-nvidia")
perGPU := map[int]*nvidiaGPUStatusFile{}
selectedGPUIndices := map[int]struct{}{}
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339)) fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
for _, job := range jobs { for _, job := range jobs {
if ctx.Err() != nil { if ctx.Err() != nil {
break break
} }
for _, idx := range job.gpuIndices {
selectedGPUIndices[idx] = struct{}{}
status := perGPU[idx]
if status == nil {
status = &nvidiaGPUStatusFile{Index: idx}
perGPU[idx] = status
}
status.Selected = true
}
cmd := make([]string, 0, len(job.cmd)) cmd := make([]string, 0, len(job.cmd))
for _, arg := range job.cmd { for _, arg := range job.cmd {
cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir)) cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
@@ -665,10 +774,37 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
var out []byte var out []byte
var err error var err error
if job.collectGPU { if nvidiaPack && nvidiaJobNeedsHealthCheck(job) {
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc) if msg, healthErr := checkNvidiaJobHealth(job.gpuIndices); healthErr != nil {
} else { if logFunc != nil {
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc) logFunc(msg)
}
out = []byte(msg + "\n")
err = healthErr
}
}
if err == nil {
if job.collectGPU {
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
} else {
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
}
}
if nvidiaPack && nvidiaJobNeedsHealthCheck(job) {
if msg, healthErr := checkNvidiaJobHealth(job.gpuIndices); healthErr != nil {
if logFunc != nil {
logFunc(msg)
}
if len(out) > 0 && !bytes.HasSuffix(out, []byte("\n")) {
out = append(out, '\n')
}
out = append(out, []byte(msg+"\n")...)
if err == nil {
err = healthErr
}
}
} }
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil { if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
@@ -679,6 +815,11 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
} }
status, rc := classifySATResult(job.name, out, err) status, rc := classifySATResult(job.name, out, err)
stats.Add(status) stats.Add(status)
if nvidiaPack && len(job.gpuIndices) > 0 && nvidiaJobNeedsHealthCheck(job) {
for _, idx := range job.gpuIndices {
updateNvidiaGPUStatus(perGPU, idx, status, job.name, string(out))
}
}
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log") key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc) fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
fmt.Fprintf(&summary, "%s_status=%s\n", key, status) fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
@@ -687,6 +828,11 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil { if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
return "", err return "", err
} }
if nvidiaPack {
if err := writeNvidiaGPUStatusFiles(runDir, stats.Overall(), perGPU, selectedGPUIndices); err != nil {
return "", err
}
}
archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz") archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
if err := createTarGz(archive, runDir); err != nil { if err := createTarGz(archive, runDir); err != nil {
@@ -695,6 +841,197 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
return archive, nil return archive, nil
} }
func updateNvidiaGPUStatus(perGPU map[int]*nvidiaGPUStatusFile, idx int, status, jobName, detail string) {
entry := perGPU[idx]
if entry == nil {
entry = &nvidiaGPUStatusFile{Index: idx}
perGPU[idx] = entry
}
if nvidiaSATStatusSeverity(status) >= nvidiaSATStatusSeverity(entry.RunStatus) {
entry.RunStatus = status
entry.FailingJob = jobName
entry.Reason = firstLine(detail)
}
}
func writeNvidiaGPUStatusFiles(runDir, overall string, perGPU map[int]*nvidiaGPUStatusFile, selected map[int]struct{}) error {
health, err := readNvidiaGPUHealth()
if err == nil {
for _, gpu := range health {
entry := perGPU[gpu.Index]
if entry == nil {
entry = &nvidiaGPUStatusFile{Index: gpu.Index}
perGPU[gpu.Index] = entry
}
entry.Name = gpu.Name
entry.Observed = true
entry.HealthRaw = gpu.RawLine
if gpu.NeedsReset {
entry.Health = "RESET_REQUIRED"
if entry.RunStatus == "" || nvidiaSATStatusSeverity("FAILED") >= nvidiaSATStatusSeverity(entry.RunStatus) {
entry.RunStatus = "FAILED"
if strings.TrimSpace(entry.Reason) == "" {
entry.Reason = "GPU requires reset"
}
}
} else {
entry.Health = "OK"
}
}
}
for idx := range selected {
entry := perGPU[idx]
if entry == nil {
entry = &nvidiaGPUStatusFile{Index: idx}
perGPU[idx] = entry
}
entry.Selected = true
}
var indices []int
for idx := range perGPU {
indices = append(indices, idx)
}
sort.Ints(indices)
for _, idx := range indices {
entry := perGPU[idx]
if entry.RunStatus == "" {
entry.RunStatus = overall
}
if entry.Health == "" {
entry.Health = "UNKNOWN"
}
if entry.Name == "" {
entry.Name = "unknown"
}
var body strings.Builder
fmt.Fprintf(&body, "gpu_index=%d\n", entry.Index)
fmt.Fprintf(&body, "gpu_name=%s\n", entry.Name)
fmt.Fprintf(&body, "selected=%t\n", entry.Selected)
fmt.Fprintf(&body, "observed=%t\n", entry.Observed)
fmt.Fprintf(&body, "run_status=%s\n", entry.RunStatus)
fmt.Fprintf(&body, "health_status=%s\n", entry.Health)
if strings.TrimSpace(entry.FailingJob) != "" {
fmt.Fprintf(&body, "failing_job=%s\n", entry.FailingJob)
}
if strings.TrimSpace(entry.Reason) != "" {
fmt.Fprintf(&body, "reason=%s\n", entry.Reason)
}
if strings.TrimSpace(entry.HealthRaw) != "" {
fmt.Fprintf(&body, "health_raw=%s\n", entry.HealthRaw)
}
if err := os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-status.txt", idx)), []byte(body.String()), 0644); err != nil {
return err
}
}
return nil
}
func nvidiaSATStatusSeverity(status string) int {
switch strings.ToUpper(strings.TrimSpace(status)) {
case "FAILED":
return 3
case "PARTIAL", "UNSUPPORTED":
return 2
case "OK":
return 1
default:
return 0
}
}
func firstLine(s string) string {
s = strings.TrimSpace(s)
if s == "" {
return ""
}
if idx := strings.IndexByte(s, '\n'); idx >= 0 {
return strings.TrimSpace(s[:idx])
}
return s
}
func nvidiaJobNeedsHealthCheck(job satJob) bool {
if job.collectGPU {
return true
}
name := strings.ToLower(strings.TrimSpace(job.name))
return strings.Contains(name, "dcgmi") ||
strings.Contains(name, "gpu-burn") ||
strings.Contains(name, "gpu-stress") ||
strings.Contains(name, "dcgmproftester")
}
func checkNvidiaJobHealth(selected []int) (string, error) {
health, err := readNvidiaGPUHealth()
if err != nil {
return "", nil
}
var bad []nvidiaGPUHealth
selectedSet := make(map[int]struct{}, len(selected))
for _, idx := range selected {
selectedSet[idx] = struct{}{}
}
for _, gpu := range health {
if len(selectedSet) > 0 {
if _, ok := selectedSet[gpu.Index]; !ok {
continue
}
}
if gpu.NeedsReset {
bad = append(bad, gpu)
}
}
if len(bad) == 0 {
return "", nil
}
lines := make([]string, 0, len(bad)+1)
lines = append(lines, "NVIDIA GPU health check failed:")
for _, gpu := range bad {
lines = append(lines, fmt.Sprintf("gpu %d (%s) requires reset: %s", gpu.Index, gpu.Name, gpu.RawLine))
}
return strings.Join(lines, "\n"), errors.New("nvidia gpu requires reset")
}
func readNvidiaGPUHealth() ([]nvidiaGPUHealth, error) {
out, err := satExecCommand(
"nvidia-smi",
"--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total",
"--format=csv,noheader,nounits",
).Output()
if err != nil {
return nil, fmt.Errorf("nvidia-smi: %w", err)
}
return parseNvidiaGPUHealth(string(out)), nil
}
func parseNvidiaGPUHealth(raw string) []nvidiaGPUHealth {
var gpus []nvidiaGPUHealth
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
line = strings.TrimSpace(line)
if line == "" {
continue
}
parts := strings.Split(line, ",")
if len(parts) < 2 {
gpus = append(gpus, nvidiaGPUHealth{RawLine: line, ParseFailure: true})
continue
}
idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
if err != nil {
gpus = append(gpus, nvidiaGPUHealth{RawLine: line, ParseFailure: true})
continue
}
upper := strings.ToUpper(line)
gpus = append(gpus, nvidiaGPUHealth{
Index: idx,
Name: strings.TrimSpace(parts[1]),
NeedsReset: strings.Contains(upper, "GPU REQUIRES RESET"),
RawLine: line,
})
}
return gpus
}
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string, logFunc func(string)) ([]byte, error) { func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string, logFunc func(string)) ([]byte, error) {
start := time.Now().UTC() start := time.Now().UTC()
resolvedCmd, err := resolveSATCommand(cmd) resolvedCmd, err := resolveSATCommand(cmd)
@@ -818,6 +1155,11 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
// nvidia-smi on a machine with no NVIDIA GPU // nvidia-smi on a machine with no NVIDIA GPU
strings.Contains(text, "couldn't communicate with the nvidia driver") || strings.Contains(text, "couldn't communicate with the nvidia driver") ||
strings.Contains(text, "no nvidia gpu") || strings.Contains(text, "no nvidia gpu") ||
// Some NVMe firmwares start self-test but never expose progress to nvme-cli
// while waiting, so the CLI stops polling without proving device failure.
(strings.Contains(name, "self-test") &&
strings.Contains(text, "no progress for") &&
strings.Contains(text, "stop waiting")) ||
(strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) { (strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) {
return "UNSUPPORTED", rc return "UNSUPPORTED", rc
} }

View File

@@ -216,6 +216,74 @@ func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
} }
} }
func TestParseNvidiaGPUHealthDetectsResetRequired(t *testing.T) {
t.Parallel()
got := parseNvidiaGPUHealth("0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n")
if len(got) != 2 {
t.Fatalf("len=%d want 2", len(got))
}
if got[0].NeedsReset {
t.Fatalf("gpu0 unexpectedly marked reset-required")
}
if !got[1].NeedsReset {
t.Fatalf("gpu1 should be marked reset-required: %#v", got[1])
}
}
func TestCheckNvidiaJobHealthReturnsErrorForSelectedResetRequiredGPU(t *testing.T) {
oldExecCommand := satExecCommand
satExecCommand = func(name string, args ...string) *exec.Cmd {
if name == "nvidia-smi" {
return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'")
}
return exec.Command(name, args...)
}
t.Cleanup(func() { satExecCommand = oldExecCommand })
msg, err := checkNvidiaJobHealth([]int{1})
if err == nil {
t.Fatal("expected health check error")
}
if !strings.Contains(msg, "gpu 1") || !strings.Contains(strings.ToLower(msg), "requires reset") {
t.Fatalf("unexpected message: %q", msg)
}
}
func TestWriteNvidiaGPUStatusFilesCreatesPerGPUFiles(t *testing.T) {
dir := t.TempDir()
oldExecCommand := satExecCommand
satExecCommand = func(name string, args ...string) *exec.Cmd {
if name == "nvidia-smi" {
return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'")
}
return exec.Command(name, args...)
}
t.Cleanup(func() { satExecCommand = oldExecCommand })
perGPU := map[int]*nvidiaGPUStatusFile{
0: {Index: 0, RunStatus: "OK"},
1: {Index: 1, RunStatus: "FAILED", FailingJob: "02-dcgmi-targeted-stress.log", Reason: "NVIDIA GPU health check failed:"},
}
if err := writeNvidiaGPUStatusFiles(dir, "FAILED", perGPU, map[int]struct{}{0: {}, 1: {}}); err != nil {
t.Fatalf("writeNvidiaGPUStatusFiles error: %v", err)
}
raw, err := os.ReadFile(filepath.Join(dir, "gpu-1-status.txt"))
if err != nil {
t.Fatalf("ReadFile gpu-1-status.txt: %v", err)
}
text := string(raw)
if !strings.Contains(text, "run_status=FAILED") {
t.Fatalf("missing run status:\n%s", text)
}
if !strings.Contains(text, "health_status=RESET_REQUIRED") {
t.Fatalf("missing health status:\n%s", text)
}
if !strings.Contains(text, "failing_job=02-dcgmi-targeted-stress.log") {
t.Fatalf("missing failing job:\n%s", text)
}
}
func TestResolveDCGMProfTesterCommandUsesVersionedBinary(t *testing.T) { func TestResolveDCGMProfTesterCommandUsesVersionedBinary(t *testing.T) {
oldLookPath := satLookPath oldLookPath := satLookPath
satLookPath = func(file string) (string, error) { satLookPath = func(file string) (string, error) {
@@ -341,6 +409,7 @@ func TestClassifySATResult(t *testing.T) {
}{ }{
{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"}, {name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"}, {name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
{name: "nvme wait timeout without progress", job: "nvme-device-self-test", out: "Short Device self-test started\nWaiting for self test completion...\nno progress for 78 seconds, stop waiting", err: errors.New("rc 1"), status: "UNSUPPORTED"},
{name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"}, {name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
{name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"}, {name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
} }

View File

@@ -28,6 +28,12 @@ var apiListNvidiaGPUs = func(a *app.App) ([]platform.NvidiaGPU, error) {
} }
return a.ListNvidiaGPUs() return a.ListNvidiaGPUs()
} }
var apiListNvidiaGPUStatuses = func(a *app.App) ([]platform.NvidiaGPUStatus, error) {
if a == nil {
return nil, fmt.Errorf("app not configured")
}
return a.ListNvidiaGPUStatuses()
}
// ── Job ID counter ──────────────────────────────────────────────────────────── // ── Job ID counter ────────────────────────────────────────────────────────────
@@ -788,6 +794,42 @@ func (h *handler) handleAPIGNVIDIAGPUs(w http.ResponseWriter, _ *http.Request) {
writeJSON(w, gpus) writeJSON(w, gpus)
} }
func (h *handler) handleAPIGNVIDIAGPUStatuses(w http.ResponseWriter, _ *http.Request) {
if h.opts.App == nil {
writeError(w, http.StatusServiceUnavailable, "app not configured")
return
}
gpus, err := apiListNvidiaGPUStatuses(h.opts.App)
if err != nil {
writeError(w, http.StatusInternalServerError, err.Error())
return
}
if gpus == nil {
gpus = []platform.NvidiaGPUStatus{}
}
writeJSON(w, gpus)
}
func (h *handler) handleAPIGNVIDIAReset(w http.ResponseWriter, r *http.Request) {
if h.opts.App == nil {
writeError(w, http.StatusServiceUnavailable, "app not configured")
return
}
var req struct {
Index int `json:"index"`
}
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
writeError(w, http.StatusBadRequest, "invalid request body")
return
}
result, err := h.opts.App.ResetNvidiaGPU(req.Index)
status := "ok"
if err != nil {
status = "error"
}
writeJSON(w, map[string]string{"status": status, "output": result.Body})
}
func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) { func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
if h.opts.App == nil { if h.opts.App == nil {
writeError(w, http.StatusServiceUnavailable, "app not configured") writeError(w, http.StatusServiceUnavailable, "app not configured")

View File

@@ -2476,7 +2476,7 @@ func renderNetwork() string {
func renderServicesInline() string { func renderServicesInline() string {
return `<p style="font-size:13px;color:var(--muted);margin-bottom:10px">` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `</p> return `<p style="font-size:13px;color:var(--muted);margin-bottom:10px">` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `</p>
<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="restartGPUDrivers()">Restart GPU Drivers</button><button class="btn btn-sm btn-secondary" onclick="loadServices()">&#8635; Refresh</button></div> <div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="loadServices()">&#8635; Refresh</button></div>
<div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div> <div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
<div id="svc-out" style="display:none;margin-top:12px"> <div id="svc-out" style="display:none;margin-top:12px">
<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px"> <div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
@@ -2547,11 +2547,6 @@ function svcAction(btn, name, action) {
btn.disabled = false; btn.disabled = false;
}); });
} }
function restartGPUDrivers() {
var btn = document.querySelector('[onclick*="restartGPUDrivers"]');
if (!btn) { svcAction({textContent:'',disabled:false}, 'bee-nvidia', 'restart'); return; }
svcAction(btn, 'bee-nvidia', 'restart');
}
loadServices(); loadServices();
</script>` </script>`
} }
@@ -2811,6 +2806,124 @@ loadDisplays();
</script>` </script>`
} }
func renderNvidiaSelfHealInline() string {
return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.</p>
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:12px">
<button id="nvidia-restart-btn" class="btn btn-secondary" onclick="nvidiaRestartDrivers()">Restart GPU Drivers</button>
<button class="btn btn-sm btn-secondary" onclick="loadNvidiaSelfHeal()">&#8635; Refresh</button>
</div>
<div id="nvidia-self-heal-status" style="font-size:13px;color:var(--muted);margin-bottom:12px">Loading NVIDIA GPU status...</div>
<div id="nvidia-self-heal-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
<div id="nvidia-self-heal-out" style="display:none;margin-top:12px">
<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
<span id="nvidia-self-heal-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
<span id="nvidia-self-heal-out-status" style="font-size:12px"></span>
</div>
<div id="nvidia-self-heal-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
</div>
<script>
function nvidiaSelfHealShowResult(label, status, output) {
var out = document.getElementById('nvidia-self-heal-out');
var term = document.getElementById('nvidia-self-heal-terminal');
var statusEl = document.getElementById('nvidia-self-heal-out-status');
var labelEl = document.getElementById('nvidia-self-heal-out-label');
out.style.display = 'block';
labelEl.textContent = label;
term.textContent = output || '(no output)';
term.scrollTop = term.scrollHeight;
if (status === 'ok') {
statusEl.textContent = '✓ done';
statusEl.style.color = 'var(--ok-fg, #2c662d)';
} else {
statusEl.textContent = '✗ failed';
statusEl.style.color = 'var(--crit-fg, #9f3a38)';
}
}
function nvidiaRestartDrivers() {
var btn = document.getElementById('nvidia-restart-btn');
var original = btn.textContent;
btn.disabled = true;
btn.textContent = 'Restarting...';
nvidiaSelfHealShowResult('restart bee-nvidia', 'ok', 'Running...');
fetch('/api/services/action', {
method:'POST',
headers:{'Content-Type':'application/json'},
body:JSON.stringify({name:'bee-nvidia', action:'restart'})
}).then(r=>r.json()).then(d => {
nvidiaSelfHealShowResult('restart bee-nvidia', d.status || 'error', d.output || d.error || '(no output)');
setTimeout(function() {
loadServices();
loadNvidiaSelfHeal();
}, 800);
}).catch(e => {
nvidiaSelfHealShowResult('restart bee-nvidia', 'error', 'Request failed: ' + e);
}).finally(() => {
btn.disabled = false;
btn.textContent = original;
});
}
function nvidiaResetGPU(index, btn) {
var original = btn.textContent;
btn.disabled = true;
btn.textContent = 'Resetting...';
nvidiaSelfHealShowResult('reset gpu ' + index, 'ok', 'Running...');
fetch('/api/gpu/nvidia-reset', {
method:'POST',
headers:{'Content-Type':'application/json'},
body:JSON.stringify({index:index})
}).then(r=>r.json()).then(d => {
nvidiaSelfHealShowResult('reset gpu ' + index, d.status || 'error', d.output || '(no output)');
setTimeout(loadNvidiaSelfHeal, 1000);
}).catch(e => {
nvidiaSelfHealShowResult('reset gpu ' + index, 'error', 'Request failed: ' + e);
}).finally(() => {
btn.disabled = false;
btn.textContent = original;
});
}
function loadNvidiaSelfHeal() {
var status = document.getElementById('nvidia-self-heal-status');
var table = document.getElementById('nvidia-self-heal-table');
status.textContent = 'Loading NVIDIA GPU status...';
status.style.color = 'var(--muted)';
table.innerHTML = '<p style="color:var(--muted);font-size:13px">Loading...</p>';
fetch('/api/gpu/nvidia-status').then(r=>r.json()).then(gpus => {
if (!Array.isArray(gpus) || gpus.length === 0) {
status.textContent = 'No NVIDIA GPUs detected or nvidia-smi is unavailable.';
table.innerHTML = '';
return;
}
status.textContent = gpus.length + ' NVIDIA GPU(s) detected.';
const rows = gpus.map(g => {
const serial = g.serial || '';
const bdf = g.bdf || '';
const id = serial || bdf || ('gpu-' + g.index);
const badge = g.status === 'OK' ? 'badge-ok' : g.status === 'RESET_REQUIRED' ? 'badge-err' : 'badge-warn';
const details = [];
if (serial) details.push('serial ' + serial);
if (bdf) details.push('bdf ' + bdf);
if (g.parse_failure && g.raw_line) details.push(g.raw_line);
return '<tr>'
+ '<td style="white-space:nowrap">' + g.index + '</td>'
+ '<td>' + (g.name || 'unknown') + '</td>'
+ '<td style="font-family:monospace">' + id + '</td>'
+ '<td><span class="badge ' + badge + '">' + (g.status || 'UNKNOWN') + '</span>'
+ (details.length ? '<div style="margin-top:4px;font-size:12px;color:var(--muted)">' + details.join(' | ') + '</div>' : '')
+ '</td>'
+ '<td style="white-space:nowrap"><button class="btn btn-sm btn-secondary" onclick="nvidiaResetGPU(' + g.index + ', this)">Reset GPU</button></td>'
+ '</tr>';
}).join('');
table.innerHTML = '<table><tr><th>GPU</th><th>Model</th><th>ID</th><th>Status</th><th>Action</th></tr>' + rows + '</table>';
}).catch(e => {
status.textContent = 'Error loading NVIDIA GPU status: ' + e;
status.style.color = 'var(--crit-fg, #9f3a38)';
table.innerHTML = '';
});
}
loadNvidiaSelfHeal();
</script>`
}
// ── Tools ───────────────────────────────────────────────────────────────────── // ── Tools ─────────────────────────────────────────────────────────────────────
func renderTools() string { func renderTools() string {
@@ -2871,6 +2984,9 @@ function installToRAM() {
<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">&#8635; Check</button></div> <div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">&#8635; Check</button></div>
<div class="card-body"><div id="tools-table"><p style="color:var(--muted);font-size:13px">Checking...</p></div></div></div> <div class="card-body"><div id="tools-table"><p style="color:var(--muted);font-size:13px">Checking...</p></div></div></div>
<div class="card"><div class="card-head">NVIDIA Self Heal</div><div class="card-body">` +
renderNvidiaSelfHealInline() + `</div></div>
<div class="card"><div class="card-head">Network</div><div class="card-body">` + <div class="card"><div class="card-head">Network</div><div class="card-body">` +
renderNetworkInline() + `</div></div> renderNetworkInline() + `</div></div>

View File

@@ -302,6 +302,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
// GPU presence / tools // GPU presence / tools
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence) mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs) mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs)
mux.HandleFunc("GET /api/gpu/nvidia-status", h.handleAPIGNVIDIAGPUStatuses)
mux.HandleFunc("POST /api/gpu/nvidia-reset", h.handleAPIGNVIDIAReset)
mux.HandleFunc("GET /api/gpu/tools", h.handleAPIGPUTools) mux.HandleFunc("GET /api/gpu/tools", h.handleAPIGPUTools)
// System // System

View File

@@ -591,7 +591,7 @@ func TestTasksPageRendersOpenLinksAndPaginationControls(t *testing.T) {
} }
} }
func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) { func TestToolsPageRendersNvidiaSelfHealSection(t *testing.T) {
handler := NewHandler(HandlerOptions{}) handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder() rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tools", nil)) handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tools", nil))
@@ -599,11 +599,20 @@ func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
t.Fatalf("status=%d", rec.Code) t.Fatalf("status=%d", rec.Code)
} }
body := rec.Body.String() body := rec.Body.String()
if !strings.Contains(body, `NVIDIA Self Heal`) {
t.Fatalf("tools page missing nvidia self heal section: %s", body)
}
if !strings.Contains(body, `Restart GPU Drivers`) { if !strings.Contains(body, `Restart GPU Drivers`) {
t.Fatalf("tools page missing restart gpu drivers button: %s", body) t.Fatalf("tools page missing restart gpu drivers button: %s", body)
} }
if !strings.Contains(body, `restartGPUDrivers()`) { if !strings.Contains(body, `nvidiaRestartDrivers()`) {
t.Fatalf("tools page missing restartGPUDrivers action: %s", body) t.Fatalf("tools page missing nvidiaRestartDrivers action: %s", body)
}
if !strings.Contains(body, `/api/gpu/nvidia-status`) {
t.Fatalf("tools page missing nvidia status api usage: %s", body)
}
if !strings.Contains(body, `nvidiaResetGPU(`) {
t.Fatalf("tools page missing nvidiaResetGPU action: %s", body)
} }
if !strings.Contains(body, `id="boot-source-text"`) { if !strings.Contains(body, `id="boot-source-text"`) {
t.Fatalf("tools page missing boot source field: %s", body) t.Fatalf("tools page missing boot source field: %s", body)