Fix GPU model propagation, export filenames, PSU/service status, and chart perf
- nvidia.go: add Name field to nvidiaGPUInfo, include model name in nvidia-smi query, set dev.Model in enrichPCIeWithNVIDIAData
- pages.go: fix duplicate GPU count in validate card summary (4 GPU: 4 x … → 4 x … GPU); fix PSU UNKNOWN fallback from hw.PowerSupplies; treat activating/deactivating/reloading service states as OK in Runtime Health
- support_bundle.go: use "150405" time format (no colons) for exFAT compat
- sat.go / benchmark.go / platform_stress.go / sat_fan_stress.go: remove .tar.gz archive creation from export dirs — export packs everything itself
- charts_svg.go: add min-max downsampling (1400 pt cap) for SVG chart perf
- benchmark_report.go / sat.go: normalize GPU fallback to "Unknown GPU"

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -349,6 +349,9 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
|
||||
writeRow("GPU", hwDescribeGPU(hw), runtimeStatusBadge(gpuRow.Status))
|
||||
|
||||
psuRow := aggregateComponentStatus("PSU", records, nil, []string{"psu:"})
|
||||
if psuRow.Status == "UNKNOWN" && len(hw.PowerSupplies) > 0 {
|
||||
psuRow.Status = hwPSUStatus(hw.PowerSupplies)
|
||||
}
|
||||
writeRow("PSU", hwDescribePSU(hw), runtimeStatusBadge(psuRow.Status))
|
||||
|
||||
if nicDesc := hwDescribeNIC(hw); nicDesc != "" {
|
||||
@@ -506,6 +509,31 @@ func hwDescribeGPU(hw schema.HardwareSnapshot) string {
|
||||
return strings.Join(parts, ", ")
|
||||
}
|
||||
|
||||
// hwPSUStatus returns "OK", "CRITICAL", "WARNING", or "UNKNOWN" based on
|
||||
// PSU statuses from the audit snapshot. Used as fallback when component-status.json
|
||||
// has no psu: records yet (e.g. first boot before audit writes them).
|
||||
func hwPSUStatus(psus []schema.HardwarePowerSupply) string {
|
||||
worst := "UNKNOWN"
|
||||
for _, psu := range psus {
|
||||
if psu.Status == nil {
|
||||
continue
|
||||
}
|
||||
switch strings.ToUpper(strings.TrimSpace(*psu.Status)) {
|
||||
case "CRITICAL":
|
||||
return "CRITICAL"
|
||||
case "WARNING":
|
||||
if worst != "CRITICAL" {
|
||||
worst = "WARNING"
|
||||
}
|
||||
case "OK":
|
||||
if worst == "UNKNOWN" {
|
||||
worst = "OK"
|
||||
}
|
||||
}
|
||||
}
|
||||
return worst
|
||||
}
|
||||
|
||||
// hwDescribePSU returns a summary like "2× 1600 W" or "2× PSU".
|
||||
func hwDescribePSU(hw schema.HardwareSnapshot) string {
|
||||
n := len(hw.PowerSupplies)
|
||||
@@ -742,7 +770,13 @@ func buildRuntimeServicesRow(health schema.RuntimeHealth) runtimeHealthRow {
|
||||
nonActive := make([]string, 0)
|
||||
for _, svc := range health.Services {
|
||||
state := strings.TrimSpace(strings.ToLower(svc.Status))
|
||||
if state != "active" {
|
||||
// "activating" and "deactivating" are transient states for oneshot services
|
||||
// (RemainAfterExit=yes) — the service is running normally, not failed.
|
||||
// Only "failed" and "inactive" (after services should be running) are problems.
|
||||
switch state {
|
||||
case "active", "activating", "deactivating", "reloading":
|
||||
// OK — service is running or transitioning normally
|
||||
default:
|
||||
nonActive = append(nonActive, svc.Name+"="+svc.Status)
|
||||
}
|
||||
}
|
||||
@@ -1777,6 +1811,11 @@ func formatValidateDeviceSummary(total int, models map[string]int, unit string)
|
||||
if total != 1 {
|
||||
label += "s"
|
||||
}
|
||||
// If there is only one model the leading count duplicates the per-model
|
||||
// count already in parts (e.g. "4 GPU: 4 x RTX …" → "4 x RTX …").
|
||||
if len(parts) == 1 {
|
||||
return parts[0] + " " + label
|
||||
}
|
||||
return fmt.Sprintf("%d %s: %s", total, label, strings.Join(parts, ", "))
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user