Fix GPU model propagation, export filenames, PSU/service status, and chart perf

- nvidia.go: add Name field to nvidiaGPUInfo, include model name in nvidia-smi query, set dev.Model in enrichPCIeWithNVIDIAData
- pages.go: fix duplicate GPU count in validate card summary (4 GPU: 4 x … → 4 x … GPU); fix PSU UNKNOWN fallback from hw.PowerSupplies; treat activating/deactivating/reloading service states as OK in Runtime Health
- support_bundle.go: use "150405" time format (no colons) for exFAT compat
- sat.go / benchmark.go / platform_stress.go / sat_fan_stress.go: remove .tar.gz archive creation from export dirs — export packs everything itself
- charts_svg.go: add min-max downsampling (1400 pt cap) for SVG chart perf
- benchmark_report.go / sat.go: normalize GPU fallback to "Unknown GPU"

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-11 09:59:16 +03:00
parent bb1218ddd4
commit ba16021cdb
9 changed files with 263 additions and 31 deletions
@@ -213,7 +213,7 @@ func BuildSupportBundle(exportDir string) (string, error) {
 	now := time.Now().UTC()
 	date := now.Format("2006-01-02")
-	tod := now.Format("15:04:05")
+	tod := now.Format("150405")
 	ver := bundleVersion()
 	model := serverModelForBundle()
 	sn := serverSerialForBundle()
@@ -335,11 +335,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		return "", fmt.Errorf("write summary.txt: %w", err)
 	}
-	archive := filepath.Join(baseDir, "gpu-benchmark-"+ts+".tar.gz")
+	return runDir, nil
 	if err := createTarGz(archive, runDir); err != nil {
 		return "", fmt.Errorf("pack benchmark archive: %w", err)
 	}
 	return archive, nil
 }
 func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) NvidiaBenchmarkOptions {
@@ -90,7 +90,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
 	for _, gpu := range result.GPUs {
 		name := strings.TrimSpace(gpu.Name)
 		if name == "" {
-			name = "Unknown"
+			name = "Unknown GPU"
 		}
 		interconnect := "-"
 		if gpu.Scores.InterconnectScore > 0 {
@@ -161,13 +161,7 @@ func (s *System) RunPlatformStress(
 	}
 	_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
-	// Pack tar.gz
+	return runDir, nil
 	archivePath := filepath.Join(baseDir, "platform-stress-"+stamp+".tar.gz")
 	if err := packPlatformDir(runDir, archivePath); err != nil {
 		return "", fmt.Errorf("pack archive: %w", err)
 	}
 	_ = os.RemoveAll(runDir)
 	return archivePath, nil
 }
 // collectPhase samples live metrics every second until ctx is done.
@@ -662,11 +662,7 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, e
 	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
 		return "", err
 	}
-	archive := filepath.Join(baseDir, "storage-"+ts+".tar.gz")
+	return runDir, nil
 	if err := createTarGz(archive, runDir); err != nil {
 		return "", err
 	}
 	return archive, nil
 }
 type satJob struct {
@@ -852,11 +848,7 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
 		}
 	}
-	archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
+	return runDir, nil
 	if err := createTarGz(archive, runDir); err != nil {
 		return "", err
 	}
 	return archive, nil
 }
 func updateNvidiaGPUStatus(perGPU map[int]*nvidiaGPUStatusFile, idx int, status, jobName, detail string) {
@@ -919,7 +911,7 @@ func writeNvidiaGPUStatusFiles(runDir, overall string, perGPU map[int]*nvidiaGPU
 			entry.Health = "UNKNOWN"
 		}
 		if entry.Name == "" {
-			entry.Name = "unknown"
+			entry.Name = "Unknown GPU"
 		}
 		var body strings.Builder
 		fmt.Fprintf(&body, "gpu_index=%d\n", entry.Index)
@@ -223,11 +223,7 @@ func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanS
 		return "", err
 	}
-	archive := filepath.Join(baseDir, "fan-stress-"+ts+".tar.gz")
+	return runDir, nil
 	if err := createTarGz(archive, runDir); err != nil {
 		return "", err
 	}
 	return archive, nil
 }
 func applyFanStressDefaults(opts *FanStressOptions) {
@@ -83,6 +83,10 @@ func renderMetricChartSVG(title string, labels []string, times []time.Time, data
 		}
 	}
 	// Downsample to at most ~1400 points (one per pixel) before building SVG.
 	times, datasets = downsampleTimeSeries(times, datasets, 1400)
 	pointCount = len(times)
 	statsLabel := chartStatsLabel(datasets)
 	legendItems := []metricChartSeries{}
@@ -196,6 +200,19 @@ func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, s
 		}
 	}
 	// Downsample to at most ~1400 points before building SVG.
 	{
 		datasets := make([][]float64, len(series))
 		for i := range series {
 			datasets[i] = series[i].Values
 		}
 		times, datasets = downsampleTimeSeries(times, datasets, 1400)
 		pointCount = len(times)
 		for i := range series {
 			series[i].Values = datasets[i]
 		}
 	}
 	scales := make([]chartScale, len(series))
 	for i := range series {
 		min, max := chartSeriesBounds(series[i].Values)
@@ -626,6 +643,87 @@ func writeTimelineBoundaries(b *strings.Builder, layout chartLayout, start, end
 	b.WriteString(`</g>` + "\n")
 }
 // downsampleTimeSeries reduces a time series to at most maxPts points using
 // min-max bucketing, so short spikes and dips survive the reduction.
 //
 // The first and last samples are always kept so the covered time span does not
 // shrink. The samples in between are split into equal-width buckets and each
 // bucket contributes the indices of its minimum and maximum value, emitted in
 // chronological order. Extremes are taken from the first dataset whose length
 // equals len(times); when no dataset qualifies, each bucket contributes its
 // first and last index instead.
 //
 // Every dataset with len(times) entries is sampled at the same indices, so all
 // series stay aligned with the returned times. Datasets of any other length
 // are passed through untouched.
 //
 // If len(times) <= maxPts or maxPts <= 0 the inputs are returned unchanged
 // (the same slices, not copies).
 func downsampleTimeSeries(times []time.Time, datasets [][]float64, maxPts int) ([]time.Time, [][]float64) {
 	n := len(times)
 	if n <= maxPts || maxPts <= 0 {
 		return times, datasets
 	}
 	// Use the first dataset that has the same length as times as the reference
 	// for deciding which two indices to keep per bucket.
 	var ref []float64
 	for _, ds := range datasets {
 		if len(ds) == n {
 			ref = ds
 			break
 		}
 	}
 	// Index 0 is always kept. From here on n > maxPts >= 1, so n >= 2.
 	selected := make([]int, 0, maxPts)
 	selected = append(selected, 0)
 	if maxPts >= 2 {
 		// Two slots are reserved for the endpoints; every interior bucket can
 		// emit up to two indices, which keeps the total within maxPts.
 		buckets := (maxPts - 2) / 2
 		if buckets > 0 {
 			// Interior samples are indices 1 .. n-2.
 			bucketSize := float64(n-2) / float64(buckets)
 			for b := 0; b < buckets; b++ {
 				lo := 1 + int(math.Round(float64(b)*bucketSize))
 				hi := 1 + int(math.Round(float64(b+1)*bucketSize))
 				if hi > n-1 {
 					hi = n - 1
 				}
 				if lo >= hi {
 					continue
 				}
 				// Without a reference series fall back to the bucket edges.
 				first, second := lo, hi-1
 				if ref != nil {
 					minIdx, maxIdx := lo, lo
 					for i := lo + 1; i < hi; i++ {
 						if ref[i] < ref[minIdx] {
 							minIdx = i
 						}
 						if ref[i] > ref[maxIdx] {
 							maxIdx = i
 						}
 					}
 					// Emit the two extremes in time order.
 					if minIdx <= maxIdx {
 						first, second = minIdx, maxIdx
 					} else {
 						first, second = maxIdx, minIdx
 					}
 				}
 				selected = append(selected, first)
 				if second != first {
 					selected = append(selected, second)
 				}
 			}
 		}
 		selected = append(selected, n-1)
 	}
 	outTimes := make([]time.Time, len(selected))
 	for i, idx := range selected {
 		outTimes[i] = times[idx]
 	}
 	outDatasets := make([][]float64, len(datasets))
 	for d, ds := range datasets {
 		if len(ds) != n {
 			// Not parallel to times: leave as-is rather than guess an alignment.
 			outDatasets[d] = ds
 			continue
 		}
 		out := make([]float64, len(selected))
 		for i, idx := range selected {
 			out[i] = ds[idx]
 		}
 		outDatasets[d] = out
 	}
 	return outTimes, outDatasets
 }
 func chartXForTime(ts, start, end time.Time, left, right int) float64 {
 	if !end.After(start) {
 		return float64(left+right) / 2
@@ -349,6 +349,9 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
 	writeRow("GPU", hwDescribeGPU(hw), runtimeStatusBadge(gpuRow.Status))
 	psuRow := aggregateComponentStatus("PSU", records, nil, []string{"psu:"})
 	if psuRow.Status == "UNKNOWN" && len(hw.PowerSupplies) > 0 {
 		psuRow.Status = hwPSUStatus(hw.PowerSupplies)
 	}
 	writeRow("PSU", hwDescribePSU(hw), runtimeStatusBadge(psuRow.Status))
 	if nicDesc := hwDescribeNIC(hw); nicDesc != "" {
@@ -506,6 +509,31 @@ func hwDescribeGPU(hw schema.HardwareSnapshot) string {
 	return strings.Join(parts, ", ")
 }
 // hwPSUStatus collapses the per-PSU status strings from the audit snapshot
 // into a single value: "CRITICAL", "WARNING", "OK", or "UNKNOWN".
 //
 // Statuses are compared case-insensitively after trimming whitespace. PSUs
 // with a nil status, or with a status outside those three values, do not
 // influence the result. The most severe status seen wins; "UNKNOWN" is
 // returned when no PSU reports a recognized status.
 //
 // The hardware summary card uses this as a fallback when the aggregated
 // component status for the PSU row is still "UNKNOWN".
 func hwPSUStatus(psus []schema.HardwarePowerSupply) string {
 	sawWarning, sawOK := false, false
 	for i := range psus {
 		raw := psus[i].Status
 		if raw == nil {
 			continue
 		}
 		normalized := strings.ToUpper(strings.TrimSpace(*raw))
 		if normalized == "CRITICAL" {
 			// Nothing can outrank CRITICAL, so stop scanning.
 			return "CRITICAL"
 		}
 		if normalized == "WARNING" {
 			sawWarning = true
 		} else if normalized == "OK" {
 			sawOK = true
 		}
 	}
 	if sawWarning {
 		return "WARNING"
 	}
 	if sawOK {
 		return "OK"
 	}
 	return "UNKNOWN"
 }
 // hwDescribePSU returns a summary like "2× 1600 W" or "2× PSU".
 func hwDescribePSU(hw schema.HardwareSnapshot) string {
 	n := len(hw.PowerSupplies)
@@ -742,7 +770,13 @@ func buildRuntimeServicesRow(health schema.RuntimeHealth) runtimeHealthRow {
 	nonActive := make([]string, 0)
 	for _, svc := range health.Services {
 		state := strings.TrimSpace(strings.ToLower(svc.Status))
-		if state != "active" {
+		// "activating" and "deactivating" are transient states for oneshot services
 		// (RemainAfterExit=yes) — the service is running normally, not failed.
 		// Only "failed" and "inactive" (after services should be running) are problems.
 		switch state {
 		case "active", "activating", "deactivating", "reloading":
 			// OK — service is running or transitioning normally
 		default:
 			nonActive = append(nonActive, svc.Name+"="+svc.Status)
 		}
 	}
@@ -1777,6 +1811,11 @@ func formatValidateDeviceSummary(total int, models map[string]int, unit string)
 	if total != 1 {
 		label += "s"
 	}
 	// If there is only one model the leading count duplicates the per-model
 	// count already in parts (e.g. "4 GPU: 4 x RTX …" → "4 x RTX …").
 	if len(parts) == 1 {
 		return parts[0] + " " + label
 	}
 	return fmt.Sprintf("%d %s: %s", total, label, strings.Join(parts, ", "))
 }
@@ -0,0 +1,117 @@
 # GPU Model Name Propagation
 How GPU model names are detected, stored, and displayed throughout the project.
 ---
 ## Detection Sources
 There are **three separate pipelines** for GPU model names — they use different structs and don't share state.
 ### Pipeline A — Live / SAT (nvidia-smi query at runtime)
 **File:** `audit/internal/platform/sat.go`
 - `ListNvidiaGPUs()` → `NvidiaGPU.Name` (field: `name`, from `nvidia-smi --query-gpu=index,name,...`)
 - `ListNvidiaGPUStatuses()` → `NvidiaGPUStatus.Name`
 - Used by: GPU selection UI, live metrics labels, burn/stress test logic
 ### Pipeline B — Benchmark results
 **File:** `audit/internal/platform/benchmark.go`, line 124
 - `queryBenchmarkGPUInfo(selected)` → `benchmarkGPUInfo.Name`
 - Stored in `BenchmarkGPUResult.Name` (`json:"name,omitempty"`)
 - Used by: benchmark history table, benchmark report
 ### Pipeline C — Hardware audit JSON (PCIe schema)
 **File:** `audit/internal/schema/hardware.go`
 - `HardwarePCIeDevice.Model *string` (field name is **Model**, not Name)
 - For AMD GPUs: populated by `audit/internal/collector/amdgpu.go` from `info.Product`
 - For NVIDIA GPUs: **NOT populated** by `audit/internal/collector/nvidia.go` — the NVIDIA enricher sets telemetry/status but skips the Model field
 - Used by: hardware summary page (`hwDescribeGPU` in `pages.go:487`)
 ---
 ## Key Inconsistency: NVIDIA PCIe Model is Never Set
 `audit/internal/collector/nvidia.go` — `enrichPCIeWithNVIDIAData()` enriches NVIDIA PCIe devices with telemetry and status but does **not** populate `HardwarePCIeDevice.Model`.
 This means:
 - Hardware summary page shows "Unknown GPU" for all NVIDIA devices (falls back at `pages.go:486`)
 - AMD GPUs do have their model populated
 The fix would be: copy `gpu.Name` from the SAT pipeline into `dev.Model` inside `enrichPCIeWithNVIDIAData`.
 ---
 ## Benchmark History "Unknown GPU" Issue
 **Symptom:** Benchmark history table shows "GPU #N — Unknown GPU" columns instead of real GPU model names.
 **Root cause:** `BenchmarkGPUResult.Name` has tag `json:"name,omitempty"`. If `queryBenchmarkGPUInfo()` fails (warns at `benchmark.go:126`) or returns empty names, the Name field is never set and is omitted from JSON. Loaded results have empty Name → falls back to "Unknown GPU" at `pages.go:2226, 2237`.
 This happens for:
 - Older result files saved before the `Name` field was added
 - Runs where nvidia-smi query failed before the benchmark started
 ---
 ## Fallback Strings — Current State
 | Location | File | Fallback string |
 |---|---|---|
 | Hardware summary (PCIe) | `pages.go:486` | `"Unknown GPU"` |
 | Benchmark report summary | `benchmark_report.go:43` | `"Unknown GPU"` |
 | Benchmark report scorecard | `benchmark_report.go:93` | `"Unknown"` ← inconsistent |
 | Benchmark report detail | `benchmark_report.go:122` | `"Unknown GPU"` |
 | Benchmark history per-GPU col | `pages.go:2226` | `"Unknown GPU"` |
 | Benchmark history parallel col | `pages.go:2237` | `"Unknown GPU"` |
 | SAT status file write | `sat.go:922` | `"unknown"` ← lowercase, inconsistent |
 | GPU selection API | `api.go:163` | `"GPU N"` (no "Unknown") |
 **Rule:** all UI fallbacks should use `"Unknown GPU"`. The two outliers are `benchmark_report.go:93` (`"Unknown"`) and `sat.go:922` (`"unknown"`).
 ---
 ## GPU Selection UI
 **File:** `audit/internal/webui/pages.go`
 - Source: `GET /api/gpus` → `api.go` → `ListNvidiaGPUs()` → live nvidia-smi
 - Render: `'GPU ' + gpu.index + ' — ' + gpu.name + ' · ' + mem`
 - Fallback: `gpu.name || 'GPU ' + idx` (JS, line ~1432)
 This always shows the correct model because it queries nvidia-smi live. It is **not** connected to benchmark result data.
 ---
 ## Data Flow Summary
 ```
 nvidia-smi (live)
  └─ ListNvidiaGPUs() → NvidiaGPU.Name
       ├─ GPU selection UI (always correct)
       ├─ Live metrics labels (charts_svg.go)
       └─ SAT/burn status file (sat.go)
 nvidia-smi (at benchmark start)
  └─ queryBenchmarkGPUInfo() → benchmarkGPUInfo.Name
       └─ BenchmarkGPUResult.Name (json:"name,omitempty")
            ├─ Benchmark report
            └─ Benchmark history table columns
 nvidia-smi / lspci (audit collection)
  └─ HardwarePCIeDevice.Model (NVIDIA: NOT populated; AMD: populated)
       └─ Hardware summary page hwDescribeGPU()
 ```
 ---
 ## What Needs Fixing
 1. **NVIDIA PCIe Model** — `enrichPCIeWithNVIDIAData()` should set `dev.Model = &gpu.Name`
 2. **Fallback consistency** — `benchmark_report.go:93` should say `"Unknown GPU"` not `"Unknown"`; `sat.go:922` should say `"Unknown GPU"` not `"unknown"`
 3. **Old benchmark JSONs** — no fix possible for already-saved results with missing names (display-only issue)