Fix GPU model propagation, export filenames, PSU/service status, and chart perf

- nvidia.go: add Name field to nvidiaGPUInfo, include model name in
  nvidia-smi query, set dev.Model in enrichPCIeWithNVIDIAData
- pages.go: fix duplicate GPU count in validate card summary (4 GPU: 4 x …
  → 4 x … GPU); fix PSU UNKNOWN fallback from hw.PowerSupplies; treat
  activating/deactivating/reloading service states as OK in Runtime Health
- support_bundle.go: use "150405" time format (no colons) for exFAT compat
- sat.go / benchmark.go / platform_stress.go / sat_fan_stress.go: remove
  .tar.gz archive creation from export dirs — export packs everything itself
- charts_svg.go: add min-max downsampling (1400 pt cap) for SVG chart perf
- benchmark_report.go / sat.go: normalize GPU fallback to "Unknown GPU"

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-11 09:59:16 +03:00
parent bb1218ddd4
commit ba16021cdb
9 changed files with 263 additions and 31 deletions

View File

@@ -213,7 +213,7 @@ func BuildSupportBundle(exportDir string) (string, error) {
now := time.Now().UTC() now := time.Now().UTC()
date := now.Format("2006-01-02") date := now.Format("2006-01-02")
tod := now.Format("15:04:05") tod := now.Format("150405")
ver := bundleVersion() ver := bundleVersion()
model := serverModelForBundle() model := serverModelForBundle()
sn := serverSerialForBundle() sn := serverSerialForBundle()

View File

@@ -335,11 +335,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
return "", fmt.Errorf("write summary.txt: %w", err) return "", fmt.Errorf("write summary.txt: %w", err)
} }
archive := filepath.Join(baseDir, "gpu-benchmark-"+ts+".tar.gz") return runDir, nil
if err := createTarGz(archive, runDir); err != nil {
return "", fmt.Errorf("pack benchmark archive: %w", err)
}
return archive, nil
} }
func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) NvidiaBenchmarkOptions { func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) NvidiaBenchmarkOptions {

View File

@@ -90,7 +90,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
for _, gpu := range result.GPUs { for _, gpu := range result.GPUs {
name := strings.TrimSpace(gpu.Name) name := strings.TrimSpace(gpu.Name)
if name == "" { if name == "" {
name = "Unknown" name = "Unknown GPU"
} }
interconnect := "-" interconnect := "-"
if gpu.Scores.InterconnectScore > 0 { if gpu.Scores.InterconnectScore > 0 {

View File

@@ -161,13 +161,7 @@ func (s *System) RunPlatformStress(
} }
_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644) _ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
// Pack tar.gz return runDir, nil
archivePath := filepath.Join(baseDir, "platform-stress-"+stamp+".tar.gz")
if err := packPlatformDir(runDir, archivePath); err != nil {
return "", fmt.Errorf("pack archive: %w", err)
}
_ = os.RemoveAll(runDir)
return archivePath, nil
} }
// collectPhase samples live metrics every second until ctx is done. // collectPhase samples live metrics every second until ctx is done.

View File

@@ -662,11 +662,7 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, e
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil { if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
return "", err return "", err
} }
archive := filepath.Join(baseDir, "storage-"+ts+".tar.gz") return runDir, nil
if err := createTarGz(archive, runDir); err != nil {
return "", err
}
return archive, nil
} }
type satJob struct { type satJob struct {
@@ -852,11 +848,7 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
} }
} }
archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz") return runDir, nil
if err := createTarGz(archive, runDir); err != nil {
return "", err
}
return archive, nil
} }
func updateNvidiaGPUStatus(perGPU map[int]*nvidiaGPUStatusFile, idx int, status, jobName, detail string) { func updateNvidiaGPUStatus(perGPU map[int]*nvidiaGPUStatusFile, idx int, status, jobName, detail string) {
@@ -919,7 +911,7 @@ func writeNvidiaGPUStatusFiles(runDir, overall string, perGPU map[int]*nvidiaGPU
entry.Health = "UNKNOWN" entry.Health = "UNKNOWN"
} }
if entry.Name == "" { if entry.Name == "" {
entry.Name = "unknown" entry.Name = "Unknown GPU"
} }
var body strings.Builder var body strings.Builder
fmt.Fprintf(&body, "gpu_index=%d\n", entry.Index) fmt.Fprintf(&body, "gpu_index=%d\n", entry.Index)

View File

@@ -223,11 +223,7 @@ func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanS
return "", err return "", err
} }
archive := filepath.Join(baseDir, "fan-stress-"+ts+".tar.gz") return runDir, nil
if err := createTarGz(archive, runDir); err != nil {
return "", err
}
return archive, nil
} }
func applyFanStressDefaults(opts *FanStressOptions) { func applyFanStressDefaults(opts *FanStressOptions) {

View File

@@ -83,6 +83,10 @@ func renderMetricChartSVG(title string, labels []string, times []time.Time, data
} }
} }
// Downsample to at most ~1400 points (one per pixel) before building SVG.
times, datasets = downsampleTimeSeries(times, datasets, 1400)
pointCount = len(times)
statsLabel := chartStatsLabel(datasets) statsLabel := chartStatsLabel(datasets)
legendItems := []metricChartSeries{} legendItems := []metricChartSeries{}
@@ -196,6 +200,19 @@ func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, s
} }
} }
// Downsample to at most ~1400 points before building SVG.
{
datasets := make([][]float64, len(series))
for i := range series {
datasets[i] = series[i].Values
}
times, datasets = downsampleTimeSeries(times, datasets, 1400)
pointCount = len(times)
for i := range series {
series[i].Values = datasets[i]
}
}
scales := make([]chartScale, len(series)) scales := make([]chartScale, len(series))
for i := range series { for i := range series {
min, max := chartSeriesBounds(series[i].Values) min, max := chartSeriesBounds(series[i].Values)
@@ -626,6 +643,87 @@ func writeTimelineBoundaries(b *strings.Builder, layout chartLayout, start, end
b.WriteString(`</g>` + "\n") b.WriteString(`</g>` + "\n")
} }
// downsampleTimeSeries reduces the time series to at most maxPts points using
// min-max bucketing. Each bucket contributes the index of its min and max value
// (using the first full-length dataset as the reference series). All parallel
// datasets are sampled at those same indices so all series stay aligned.
// If len(times) <= maxPts the inputs are returned unchanged.
func downsampleTimeSeries(times []time.Time, datasets [][]float64, maxPts int) ([]time.Time, [][]float64) {
n := len(times)
if n <= maxPts || maxPts <= 0 {
return times, datasets
}
buckets := maxPts / 2
if buckets < 1 {
buckets = 1
}
// Use the first dataset that has the same length as times as the reference
// for deciding which two indices to keep per bucket.
var ref []float64
for _, ds := range datasets {
if len(ds) == n {
ref = ds
break
}
}
selected := make([]int, 0, maxPts)
bucketSize := float64(n) / float64(buckets)
for b := 0; b < buckets; b++ {
lo := int(math.Round(float64(b) * bucketSize))
hi := int(math.Round(float64(b+1) * bucketSize))
if hi > n {
hi = n
}
if lo >= hi {
continue
}
if ref == nil {
selected = append(selected, lo)
if hi-1 != lo {
selected = append(selected, hi-1)
}
continue
}
minIdx, maxIdx := lo, lo
for i := lo + 1; i < hi; i++ {
if ref[i] < ref[minIdx] {
minIdx = i
}
if ref[i] > ref[maxIdx] {
maxIdx = i
}
}
if minIdx <= maxIdx {
selected = append(selected, minIdx)
if maxIdx != minIdx {
selected = append(selected, maxIdx)
}
} else {
selected = append(selected, maxIdx)
if minIdx != maxIdx {
selected = append(selected, minIdx)
}
}
}
outTimes := make([]time.Time, len(selected))
for i, idx := range selected {
outTimes[i] = times[idx]
}
outDatasets := make([][]float64, len(datasets))
for d, ds := range datasets {
if len(ds) != n {
outDatasets[d] = ds
continue
}
out := make([]float64, len(selected))
for i, idx := range selected {
out[i] = ds[idx]
}
outDatasets[d] = out
}
return outTimes, outDatasets
}
func chartXForTime(ts, start, end time.Time, left, right int) float64 { func chartXForTime(ts, start, end time.Time, left, right int) float64 {
if !end.After(start) { if !end.After(start) {
return float64(left+right) / 2 return float64(left+right) / 2

View File

@@ -349,6 +349,9 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
writeRow("GPU", hwDescribeGPU(hw), runtimeStatusBadge(gpuRow.Status)) writeRow("GPU", hwDescribeGPU(hw), runtimeStatusBadge(gpuRow.Status))
psuRow := aggregateComponentStatus("PSU", records, nil, []string{"psu:"}) psuRow := aggregateComponentStatus("PSU", records, nil, []string{"psu:"})
if psuRow.Status == "UNKNOWN" && len(hw.PowerSupplies) > 0 {
psuRow.Status = hwPSUStatus(hw.PowerSupplies)
}
writeRow("PSU", hwDescribePSU(hw), runtimeStatusBadge(psuRow.Status)) writeRow("PSU", hwDescribePSU(hw), runtimeStatusBadge(psuRow.Status))
if nicDesc := hwDescribeNIC(hw); nicDesc != "" { if nicDesc := hwDescribeNIC(hw); nicDesc != "" {
@@ -506,6 +509,31 @@ func hwDescribeGPU(hw schema.HardwareSnapshot) string {
return strings.Join(parts, ", ") return strings.Join(parts, ", ")
} }
// hwPSUStatus returns "OK", "CRITICAL", "WARNING", or "UNKNOWN" based on
// PSU statuses from the audit snapshot. Used as fallback when component-status.json
// has no psu: records yet (e.g. first boot before audit writes them).
func hwPSUStatus(psus []schema.HardwarePowerSupply) string {
worst := "UNKNOWN"
for _, psu := range psus {
if psu.Status == nil {
continue
}
switch strings.ToUpper(strings.TrimSpace(*psu.Status)) {
case "CRITICAL":
return "CRITICAL"
case "WARNING":
if worst != "CRITICAL" {
worst = "WARNING"
}
case "OK":
if worst == "UNKNOWN" {
worst = "OK"
}
}
}
return worst
}
// hwDescribePSU returns a summary like "2× 1600 W" or "2× PSU". // hwDescribePSU returns a summary like "2× 1600 W" or "2× PSU".
func hwDescribePSU(hw schema.HardwareSnapshot) string { func hwDescribePSU(hw schema.HardwareSnapshot) string {
n := len(hw.PowerSupplies) n := len(hw.PowerSupplies)
@@ -742,7 +770,13 @@ func buildRuntimeServicesRow(health schema.RuntimeHealth) runtimeHealthRow {
nonActive := make([]string, 0) nonActive := make([]string, 0)
for _, svc := range health.Services { for _, svc := range health.Services {
state := strings.TrimSpace(strings.ToLower(svc.Status)) state := strings.TrimSpace(strings.ToLower(svc.Status))
if state != "active" { // "activating" and "deactivating" are transient states for oneshot services
// (RemainAfterExit=yes) — the service is running normally, not failed.
// Only "failed" and "inactive" (after services should be running) are problems.
switch state {
case "active", "activating", "deactivating", "reloading":
// OK — service is running or transitioning normally
default:
nonActive = append(nonActive, svc.Name+"="+svc.Status) nonActive = append(nonActive, svc.Name+"="+svc.Status)
} }
} }
@@ -1777,6 +1811,11 @@ func formatValidateDeviceSummary(total int, models map[string]int, unit string)
if total != 1 { if total != 1 {
label += "s" label += "s"
} }
// If there is only one model the leading count duplicates the per-model
// count already in parts (e.g. "4 GPU: 4 x RTX …" → "4 x RTX …").
if len(parts) == 1 {
return parts[0] + " " + label
}
return fmt.Sprintf("%d %s: %s", total, label, strings.Join(parts, ", ")) return fmt.Sprintf("%d %s: %s", total, label, strings.Join(parts, ", "))
} }

View File

@@ -0,0 +1,117 @@
# GPU Model Name Propagation
How GPU model names are detected, stored, and displayed throughout the project.
---
## Detection Sources
There are **two separate pipelines** for GPU model names — they use different structs and don't share state.
### Pipeline A — Live / SAT (nvidia-smi query at runtime)
**File:** `audit/internal/platform/sat.go`
- `ListNvidiaGPUs()``NvidiaGPU.Name` (field: `name`, from `nvidia-smi --query-gpu=index,name,...`)
- `ListNvidiaGPUStatuses()``NvidiaGPUStatus.Name`
- Used by: GPU selection UI, live metrics labels, burn/stress test logic
### Pipeline B — Benchmark results
**File:** `audit/internal/platform/benchmark.go`, line 124
- `queryBenchmarkGPUInfo(selected)``benchmarkGPUInfo.Name`
- Stored in `BenchmarkGPUResult.Name` (`json:"name,omitempty"`)
- Used by: benchmark history table, benchmark report
### Pipeline C — Hardware audit JSON (PCIe schema)
**File:** `audit/internal/schema/hardware.go`
- `HardwarePCIeDevice.Model *string` (field name is **Model**, not Name)
- For AMD GPUs: populated by `audit/internal/collector/amdgpu.go` from `info.Product`
- For NVIDIA GPUs: **NOT populated** by `audit/internal/collector/nvidia.go` — the NVIDIA enricher sets telemetry/status but skips the Model field
- Used by: hardware summary page (`hwDescribeGPU` in `pages.go:487`)
---
## Key Inconsistency: NVIDIA PCIe Model is Never Set
`audit/internal/collector/nvidia.go``enrichPCIeWithNVIDIAData()` enriches NVIDIA PCIe devices with telemetry and status but does **not** populate `HardwarePCIeDevice.Model`.
This means:
- Hardware summary page shows "Unknown GPU" for all NVIDIA devices (falls back at `pages.go:486`)
- AMD GPUs do have their model populated
The fix would be: copy `gpu.Name` from the SAT pipeline into `dev.Model` inside `enrichPCIeWithNVIDIAData`.
---
## Benchmark History "Unknown GPU" Issue
**Symptom:** Benchmark history table shows "GPU #N — Unknown GPU" columns instead of real GPU model names.
**Root cause:** `BenchmarkGPUResult.Name` has tag `json:"name,omitempty"`. If `queryBenchmarkGPUInfo()` fails (warns at `benchmark.go:126`) or returns empty names, the Name field is never set and is omitted from JSON. Loaded results have empty Name → falls back to "Unknown GPU" at `pages.go:2226, 2237`.
This happens for:
- Older result files saved before the `Name` field was added
- Runs where nvidia-smi query failed before the benchmark started
---
## Fallback Strings — Current State
| Location | File | Fallback string |
|---|---|---|
| Hardware summary (PCIe) | `pages.go:486` | `"Unknown GPU"` |
| Benchmark report summary | `benchmark_report.go:43` | `"Unknown GPU"` |
| Benchmark report scorecard | `benchmark_report.go:93` | `"Unknown"` ← inconsistent |
| Benchmark report detail | `benchmark_report.go:122` | `"Unknown GPU"` |
| Benchmark history per-GPU col | `pages.go:2226` | `"Unknown GPU"` |
| Benchmark history parallel col | `pages.go:2237` | `"Unknown GPU"` |
| SAT status file write | `sat.go:922` | `"unknown"` ← lowercase, inconsistent |
| GPU selection API | `api.go:163` | `"GPU N"` (no "Unknown") |
**Rule:** all UI fallbacks should use `"Unknown GPU"`. The two outliers are `benchmark_report.go:93` (`"Unknown"`) and `sat.go:922` (`"unknown"`).
---
## GPU Selection UI
**File:** `audit/internal/webui/pages.go`
- Source: `GET /api/gpus``api.go``ListNvidiaGPUs()` → live nvidia-smi
- Render: `'GPU ' + gpu.index + ' — ' + gpu.name + ' · ' + mem`
- Fallback: `gpu.name || 'GPU ' + idx` (JS, line ~1432)
This always shows the correct model because it queries nvidia-smi live. It is **not** connected to benchmark result data.
---
## Data Flow Summary
```
nvidia-smi (live)
└─ ListNvidiaGPUs() → NvidiaGPU.Name
├─ GPU selection UI (always correct)
├─ Live metrics labels (charts_svg.go)
└─ SAT/burn status file (sat.go)
nvidia-smi (at benchmark start)
└─ queryBenchmarkGPUInfo() → benchmarkGPUInfo.Name
└─ BenchmarkGPUResult.Name (json:"name,omitempty")
├─ Benchmark report
└─ Benchmark history table columns
nvidia-smi / lspci (audit collection)
└─ HardwarePCIeDevice.Model (NVIDIA: NOT populated; AMD: populated)
└─ Hardware summary page hwDescribeGPU()
```
---
## What Needs Fixing
1. **NVIDIA PCIe Model**`enrichPCIeWithNVIDIAData()` should set `dev.Model = &gpu.Name`
2. **Fallback consistency**`benchmark_report.go:93` should say `"Unknown GPU"` not `"Unknown"`; `sat.go:922` should say `"Unknown GPU"` not `"unknown"`
3. **Old benchmark JSONs** — no fix possible for already-saved results with missing names (display-only issue)