Add NVIDIA self-heal tools and per-GPU SAT status
This commit is contained in:
@@ -122,6 +122,8 @@ type satRunner interface {
|
||||
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
|
||||
ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error)
|
||||
ResetNvidiaGPU(index int) (string, error)
|
||||
RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
@@ -521,6 +523,15 @@ func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||
return a.sat.ListNvidiaGPUs()
|
||||
}
|
||||
|
||||
func (a *App) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
|
||||
return a.sat.ListNvidiaGPUStatuses()
|
||||
}
|
||||
|
||||
func (a *App) ResetNvidiaGPU(index int) (ActionResult, error) {
|
||||
out, err := a.sat.ResetNvidiaGPU(index)
|
||||
return ActionResult{Title: fmt.Sprintf("Reset NVIDIA GPU %d", index), Body: strings.TrimSpace(out)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
|
||||
@@ -135,6 +135,8 @@ type fakeSAT struct {
|
||||
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
||||
runAMDPackFn func(string) (string, error)
|
||||
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
||||
listNvidiaGPUStatusesFn func() ([]platform.NvidiaGPUStatus, error)
|
||||
resetNvidiaGPUFn func(int) (string, error)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
||||
@@ -201,6 +203,20 @@ func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
|
||||
if f.listNvidiaGPUStatusesFn != nil {
|
||||
return f.listNvidiaGPUStatusesFn()
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) ResetNvidiaGPU(index int) (string, error) {
|
||||
if f.resetNvidiaGPUFn != nil {
|
||||
return f.resetNvidiaGPUFn(index)
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
||||
return f.runMemoryFn(baseDir)
|
||||
}
|
||||
@@ -805,6 +821,9 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
for _, want := range []string{
|
||||
"/system/ip-link.txt",
|
||||
"/system/ip-link-stats.txt",
|
||||
"/system/kernel-aer-nvidia.txt",
|
||||
"/system/lspci-nvidia-bridges-vv.txt",
|
||||
"/system/pcie-aer-sysfs.txt",
|
||||
"/system/ethtool-info.txt",
|
||||
"/system/ethtool-link.txt",
|
||||
"/system/ethtool-module.txt",
|
||||
|
||||
@@ -3,6 +3,7 @@ package app
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
@@ -18,6 +19,7 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *C
|
||||
}
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "gpu-nvidia-"); ok {
|
||||
applyGPUVendorSAT(snap.PCIeDevices, "nvidia", summary)
|
||||
applyNvidiaPerGPUStatus(snap.PCIeDevices, baseDir)
|
||||
}
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "memory-"); ok {
|
||||
applyMemorySAT(snap.Memory, summary)
|
||||
@@ -32,6 +34,100 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *C
|
||||
applyComponentStatusDB(snap, db)
|
||||
}
|
||||
|
||||
type nvidiaPerGPUStatus struct {
|
||||
runStatus string
|
||||
reason string
|
||||
}
|
||||
|
||||
func applyNvidiaPerGPUStatus(devs []schema.HardwarePCIeDevice, baseDir string) {
|
||||
statusByIndex, ts, ok := loadLatestNvidiaPerGPUStatus(baseDir)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
for i := range devs {
|
||||
if devs[i].Telemetry == nil {
|
||||
continue
|
||||
}
|
||||
rawIdx, ok := devs[i].Telemetry["nvidia_gpu_index"]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
idx, ok := telemetryInt(rawIdx)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
st, ok := statusByIndex[idx]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
status, description, ok := satKeyStatus(st.runStatus, firstNonEmpty(strings.TrimSpace(st.reason), "nvidia GPU SAT"))
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
mergeComponentStatusPreferDetail(&devs[i].HardwareComponentStatus, ts, status, description)
|
||||
}
|
||||
}
|
||||
|
||||
func loadLatestNvidiaPerGPUStatus(baseDir string) (map[int]nvidiaPerGPUStatus, string, bool) {
|
||||
matches, err := filepath.Glob(filepath.Join(baseDir, "gpu-nvidia-*"))
|
||||
if err != nil || len(matches) == 0 {
|
||||
return nil, "", false
|
||||
}
|
||||
sort.Strings(matches)
|
||||
runDir := matches[len(matches)-1]
|
||||
summaryRaw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return nil, "", false
|
||||
}
|
||||
summaryKV := parseKeyValueSummary(string(summaryRaw))
|
||||
runAtUTC := strings.TrimSpace(summaryKV["run_at_utc"])
|
||||
files, err := filepath.Glob(filepath.Join(runDir, "gpu-*-status.txt"))
|
||||
if err != nil || len(files) == 0 {
|
||||
return nil, "", false
|
||||
}
|
||||
out := make(map[int]nvidiaPerGPUStatus, len(files))
|
||||
for _, file := range files {
|
||||
raw, err := os.ReadFile(file)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
kv := parseKeyValueSummary(string(raw))
|
||||
idx, err := strconv.Atoi(strings.TrimSpace(kv["gpu_index"]))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
out[idx] = nvidiaPerGPUStatus{
|
||||
runStatus: strings.ToUpper(strings.TrimSpace(kv["run_status"])),
|
||||
reason: strings.TrimSpace(kv["reason"]),
|
||||
}
|
||||
}
|
||||
if len(out) == 0 {
|
||||
return nil, "", false
|
||||
}
|
||||
return out, runAtUTC, true
|
||||
}
|
||||
|
||||
func telemetryInt(v any) (int, bool) {
|
||||
switch value := v.(type) {
|
||||
case int:
|
||||
return value, true
|
||||
case int32:
|
||||
return int(value), true
|
||||
case int64:
|
||||
return int(value), true
|
||||
case float64:
|
||||
return int(value), true
|
||||
case string:
|
||||
n, err := strconv.Atoi(strings.TrimSpace(value))
|
||||
if err != nil {
|
||||
return 0, false
|
||||
}
|
||||
return n, true
|
||||
default:
|
||||
return 0, false
|
||||
}
|
||||
}
|
||||
|
||||
type satSummary struct {
|
||||
runAtUTC string
|
||||
overall string
|
||||
@@ -176,6 +272,31 @@ func mergeComponentStatus(component *schema.HardwareComponentStatus, changedAt,
|
||||
}
|
||||
}
|
||||
|
||||
func mergeComponentStatusPreferDetail(component *schema.HardwareComponentStatus, changedAt, satStatus, description string) {
|
||||
if component == nil || satStatus == "" {
|
||||
return
|
||||
}
|
||||
current := strings.TrimSpace(ptrString(component.Status))
|
||||
newSeverity := statusSeverity(satStatus)
|
||||
currentSeverity := statusSeverity(current)
|
||||
if current == "" || current == "Unknown" || newSeverity > currentSeverity {
|
||||
mergeComponentStatus(component, changedAt, satStatus, description)
|
||||
return
|
||||
}
|
||||
if newSeverity == currentSeverity && strings.TrimSpace(description) != "" {
|
||||
component.Status = appStringPtr(satStatus)
|
||||
component.ErrorDescription = appStringPtr(description)
|
||||
if strings.TrimSpace(changedAt) != "" {
|
||||
component.StatusChangedAt = appStringPtr(changedAt)
|
||||
component.StatusHistory = append(component.StatusHistory, schema.HardwareStatusHistory{
|
||||
Status: satStatus,
|
||||
ChangedAt: changedAt,
|
||||
Details: appStringPtr(description),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func statusSeverity(status string) int {
|
||||
switch strings.TrimSpace(status) {
|
||||
case "Critical":
|
||||
|
||||
@@ -59,3 +59,51 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
|
||||
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyLatestSATStatusesMarksNvidiaGPUByPerGPUStatusFile(t *testing.T) {
|
||||
baseDir := t.TempDir()
|
||||
runDir := filepath.Join(baseDir, "gpu-nvidia-20260407-162123")
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte("run_at_utc=2026-04-07T16:21:23Z\noverall_status=FAILED\n"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(runDir, "gpu-1-status.txt"), []byte("gpu_index=1\ngpu_name=NVIDIA H100 PCIe\nrun_status=FAILED\nreason=GPU requires reset\n"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
class := "VideoController"
|
||||
manufacturer := "NVIDIA Corporation"
|
||||
bdf0 := "0000:4b:00.0"
|
||||
bdf1 := "0000:4f:00.0"
|
||||
snap := schema.HardwareSnapshot{
|
||||
PCIeDevices: []schema.HardwarePCIeDevice{
|
||||
{
|
||||
DeviceClass: &class,
|
||||
Manufacturer: &manufacturer,
|
||||
BDF: &bdf0,
|
||||
Telemetry: map[string]any{"nvidia_gpu_index": 0},
|
||||
},
|
||||
{
|
||||
DeviceClass: &class,
|
||||
Manufacturer: &manufacturer,
|
||||
BDF: &bdf1,
|
||||
Telemetry: map[string]any{"nvidia_gpu_index": 1},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||
|
||||
if snap.PCIeDevices[1].Status == nil || *snap.PCIeDevices[1].Status != "Critical" {
|
||||
t.Fatalf("gpu1 status=%v want Critical", snap.PCIeDevices[1].Status)
|
||||
}
|
||||
if snap.PCIeDevices[1].ErrorDescription == nil || *snap.PCIeDevices[1].ErrorDescription != "GPU requires reset failed" {
|
||||
got := "<nil>"
|
||||
if snap.PCIeDevices[1].ErrorDescription != nil {
|
||||
got = *snap.PCIeDevices[1].ErrorDescription
|
||||
}
|
||||
t.Fatalf("gpu1 error=%q want per-gpu reason", got)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -40,7 +40,36 @@ var supportBundleCommands = []struct {
|
||||
{name: "system/mount.txt", cmd: []string{"mount"}},
|
||||
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
||||
{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
|
||||
{name: "system/kernel-aer-nvidia.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v dmesg >/dev/null 2>&1; then
|
||||
dmesg | grep -iE 'AER|NVRM|Xid|pcieport|nvidia' || echo "no AER/NVRM/Xid kernel messages found"
|
||||
else
|
||||
echo "dmesg not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v lspci >/dev/null 2>&1; then
|
||||
echo "lspci not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for gpu in $(lspci -Dn | awk '$3 ~ /^10de:/ {print $1}'); do
|
||||
found=1
|
||||
echo "=== GPU $gpu ==="
|
||||
lspci -s "$gpu" -vv 2>&1 || true
|
||||
bridge=$(basename "$(readlink -f "/sys/bus/pci/devices/$gpu/.." 2>/dev/null)" 2>/dev/null)
|
||||
if [ -n "$bridge" ] && [ "$bridge" != "$gpu" ]; then
|
||||
echo
|
||||
echo "=== UPSTREAM $bridge for $gpu ==="
|
||||
lspci -s "$bridge" -vv 2>&1 || true
|
||||
fi
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no NVIDIA PCI devices found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
|
||||
for d in /sys/bus/pci/devices/*/; do
|
||||
vendor=$(cat "$d/vendor" 2>/dev/null)
|
||||
@@ -51,6 +80,30 @@ for d in /sys/bus/pci/devices/*/; do
|
||||
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
||||
done
|
||||
done
|
||||
`}},
|
||||
{name: "system/pcie-aer-sysfs.txt", cmd: []string{"sh", "-c", `
|
||||
found=0
|
||||
for dev in /sys/bus/pci/devices/*; do
|
||||
[ -e "$dev" ] || continue
|
||||
bdf=$(basename "$dev")
|
||||
block=""
|
||||
for f in aer_dev_correctable aer_dev_fatal aer_dev_nonfatal aer_rootport_total_err_cor aer_rootport_total_err_fatal aer_rootport_total_err_nonfatal; do
|
||||
if [ -r "$dev/$f" ]; then
|
||||
if [ -z "$block" ]; then
|
||||
block=1
|
||||
found=1
|
||||
echo "=== $bdf ==="
|
||||
fi
|
||||
printf " %-30s %s\n" "$f" "$(cat "$dev/$f" 2>/dev/null)"
|
||||
fi
|
||||
done
|
||||
if [ -n "$block" ]; then
|
||||
echo
|
||||
fi
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no PCIe AER sysfs counters found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v ethtool >/dev/null 2>&1; then
|
||||
|
||||
Reference in New Issue
Block a user