Fix GPU model propagation, export filenames, PSU/service status, and chart perf

- nvidia.go: add Name field to nvidiaGPUInfo, include model name in
  nvidia-smi query, set dev.Model in enrichPCIeWithNVIDIAData
- pages.go: fix duplicate GPU count in validate card summary (4 GPU: 4 x …
  → 4 x … GPU); fix PSU UNKNOWN fallback from hw.PowerSupplies; treat
  activating/deactivating/reloading service states as OK in Runtime Health
- support_bundle.go: use "150405" time format (no colons) for exFAT compat
- sat.go / benchmark.go / platform_stress.go / sat_fan_stress.go: remove
  .tar.gz archive creation from export dirs — export packs everything itself
- charts_svg.go: add min-max downsampling (1400 pt cap) for SVG chart perf
- benchmark_report.go / sat.go: normalize GPU fallback to "Unknown GPU"

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-11 09:59:16 +03:00
parent bb1218ddd4
commit ba16021cdb
9 changed files with 263 additions and 31 deletions

View File

@@ -213,7 +213,7 @@ func BuildSupportBundle(exportDir string) (string, error) {
now := time.Now().UTC()
date := now.Format("2006-01-02")
tod := now.Format("15:04:05")
tod := now.Format("150405")
ver := bundleVersion()
model := serverModelForBundle()
sn := serverSerialForBundle()

View File

@@ -335,11 +335,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
return "", fmt.Errorf("write summary.txt: %w", err)
}
archive := filepath.Join(baseDir, "gpu-benchmark-"+ts+".tar.gz")
if err := createTarGz(archive, runDir); err != nil {
return "", fmt.Errorf("pack benchmark archive: %w", err)
}
return archive, nil
return runDir, nil
}
func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) NvidiaBenchmarkOptions {

View File

@@ -90,7 +90,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
for _, gpu := range result.GPUs {
name := strings.TrimSpace(gpu.Name)
if name == "" {
name = "Unknown"
name = "Unknown GPU"
}
interconnect := "-"
if gpu.Scores.InterconnectScore > 0 {

View File

@@ -161,13 +161,7 @@ func (s *System) RunPlatformStress(
}
_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
// Pack tar.gz
archivePath := filepath.Join(baseDir, "platform-stress-"+stamp+".tar.gz")
if err := packPlatformDir(runDir, archivePath); err != nil {
return "", fmt.Errorf("pack archive: %w", err)
}
_ = os.RemoveAll(runDir)
return archivePath, nil
return runDir, nil
}
// collectPhase samples live metrics every second until ctx is done.

View File

@@ -662,11 +662,7 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, e
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
return "", err
}
archive := filepath.Join(baseDir, "storage-"+ts+".tar.gz")
if err := createTarGz(archive, runDir); err != nil {
return "", err
}
return archive, nil
return runDir, nil
}
type satJob struct {
@@ -852,11 +848,7 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
}
}
archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
if err := createTarGz(archive, runDir); err != nil {
return "", err
}
return archive, nil
return runDir, nil
}
func updateNvidiaGPUStatus(perGPU map[int]*nvidiaGPUStatusFile, idx int, status, jobName, detail string) {
@@ -919,7 +911,7 @@ func writeNvidiaGPUStatusFiles(runDir, overall string, perGPU map[int]*nvidiaGPU
entry.Health = "UNKNOWN"
}
if entry.Name == "" {
entry.Name = "unknown"
entry.Name = "Unknown GPU"
}
var body strings.Builder
fmt.Fprintf(&body, "gpu_index=%d\n", entry.Index)

View File

@@ -223,11 +223,7 @@ func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanS
return "", err
}
archive := filepath.Join(baseDir, "fan-stress-"+ts+".tar.gz")
if err := createTarGz(archive, runDir); err != nil {
return "", err
}
return archive, nil
return runDir, nil
}
func applyFanStressDefaults(opts *FanStressOptions) {

View File

@@ -83,6 +83,10 @@ func renderMetricChartSVG(title string, labels []string, times []time.Time, data
}
}
// Downsample to at most ~1400 points (one per pixel) before building SVG.
times, datasets = downsampleTimeSeries(times, datasets, 1400)
pointCount = len(times)
statsLabel := chartStatsLabel(datasets)
legendItems := []metricChartSeries{}
@@ -196,6 +200,19 @@ func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, s
}
}
// Downsample to at most ~1400 points before building SVG.
{
datasets := make([][]float64, len(series))
for i := range series {
datasets[i] = series[i].Values
}
times, datasets = downsampleTimeSeries(times, datasets, 1400)
pointCount = len(times)
for i := range series {
series[i].Values = datasets[i]
}
}
scales := make([]chartScale, len(series))
for i := range series {
min, max := chartSeriesBounds(series[i].Values)
@@ -626,6 +643,87 @@ func writeTimelineBoundaries(b *strings.Builder, layout chartLayout, start, end
b.WriteString(`</g>` + "\n")
}
// downsampleTimeSeries reduces a time series to at most maxPts points using
// min-max bucketing.
//
// The samples are split into maxPts/2 contiguous buckets. For each bucket the
// indices of the minimum and maximum value of the reference series are kept
// (in chronological order), so spikes and dips survive the reduction. The
// reference series is the first dataset whose length equals len(times). When
// no dataset has that length, the first and last index of every bucket are
// kept instead.
//
// Every dataset whose length equals len(times) is sampled at the selected
// indices so all series stay aligned with the returned timestamps. Datasets of
// any other length are passed through untouched.
//
// If len(times) <= maxPts, or maxPts <= 0, the inputs are returned unchanged.
// With maxPts == 1 a min/max pair does not fit in the budget, so a single
// point is kept: the reference maximum, or the last sample when there is no
// reference series.
func downsampleTimeSeries(times []time.Time, datasets [][]float64, maxPts int) ([]time.Time, [][]float64) {
	n := len(times)
	if maxPts <= 0 || n <= maxPts {
		return times, datasets
	}

	// A bucket normally contributes two points, so the bucket count is half
	// the point budget. A budget of one cannot hold a pair; emit one point per
	// bucket in that case so the result never exceeds maxPts.
	perBucket := 2
	if maxPts < 2 {
		perBucket = 1
	}
	buckets := maxPts / perBucket

	// Use the first dataset that has the same length as times as the reference
	// for deciding which indices to keep per bucket.
	var ref []float64
	for _, ds := range datasets {
		if len(ds) == n {
			ref = ds
			break
		}
	}

	selected := make([]int, 0, buckets*perBucket)
	bucketSize := float64(n) / float64(buckets)
	for b := 0; b < buckets; b++ {
		lo := int(math.Round(float64(b) * bucketSize))
		hi := int(math.Round(float64(b+1) * bucketSize))
		if hi > n {
			hi = n
		}
		if lo >= hi {
			continue
		}

		// Without a reference series fall back to the bucket edges.
		minIdx, maxIdx := lo, hi-1
		if ref != nil {
			minIdx, maxIdx = lo, lo
			for i := lo + 1; i < hi; i++ {
				if ref[i] < ref[minIdx] {
					minIdx = i
				}
				if ref[i] > ref[maxIdx] {
					maxIdx = i
				}
			}
		}

		if perBucket == 1 {
			selected = append(selected, maxIdx)
			continue
		}

		// Emit the pair in chronological order so timestamps stay monotonic.
		first, second := minIdx, maxIdx
		if first > second {
			first, second = second, first
		}
		selected = append(selected, first)
		if second != first {
			selected = append(selected, second)
		}
	}

	outTimes := make([]time.Time, len(selected))
	for i, idx := range selected {
		outTimes[i] = times[idx]
	}

	outDatasets := make([][]float64, len(datasets))
	for d, ds := range datasets {
		if len(ds) != n {
			// Not parallel to times: cannot be sampled by index, keep as is.
			outDatasets[d] = ds
			continue
		}
		out := make([]float64, len(selected))
		for i, idx := range selected {
			out[i] = ds[idx]
		}
		outDatasets[d] = out
	}
	return outTimes, outDatasets
}
func chartXForTime(ts, start, end time.Time, left, right int) float64 {
if !end.After(start) {
return float64(left+right) / 2

View File

@@ -349,6 +349,9 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
writeRow("GPU", hwDescribeGPU(hw), runtimeStatusBadge(gpuRow.Status))
psuRow := aggregateComponentStatus("PSU", records, nil, []string{"psu:"})
if psuRow.Status == "UNKNOWN" && len(hw.PowerSupplies) > 0 {
psuRow.Status = hwPSUStatus(hw.PowerSupplies)
}
writeRow("PSU", hwDescribePSU(hw), runtimeStatusBadge(psuRow.Status))
if nicDesc := hwDescribeNIC(hw); nicDesc != "" {
@@ -506,6 +509,31 @@ func hwDescribeGPU(hw schema.HardwareSnapshot) string {
return strings.Join(parts, ", ")
}
// hwPSUStatus folds the per-PSU statuses from the audit snapshot into a single
// summary value. It returns "CRITICAL" as soon as any PSU reports it,
// otherwise "WARNING" if any PSU reports it, otherwise "OK" if any PSU reports
// it, and "UNKNOWN" when no PSU carries one of those statuses (nil status or
// any other text). Statuses are compared case-insensitively after trimming
// surrounding whitespace.
//
// Used as a fallback when component-status.json has no psu: records yet
// (e.g. first boot before audit writes them).
func hwPSUStatus(psus []schema.HardwarePowerSupply) string {
	sawWarning, sawOK := false, false
	for i := range psus {
		raw := psus[i].Status
		if raw == nil {
			continue
		}
		normalized := strings.ToUpper(strings.TrimSpace(*raw))
		if normalized == "CRITICAL" {
			// Nothing can outrank CRITICAL, so stop scanning.
			return "CRITICAL"
		}
		if normalized == "WARNING" {
			sawWarning = true
		} else if normalized == "OK" {
			sawOK = true
		}
	}
	if sawWarning {
		return "WARNING"
	}
	if sawOK {
		return "OK"
	}
	return "UNKNOWN"
}
// hwDescribePSU returns a summary like "2× 1600 W" or "2× PSU".
func hwDescribePSU(hw schema.HardwareSnapshot) string {
n := len(hw.PowerSupplies)
@@ -742,7 +770,13 @@ func buildRuntimeServicesRow(health schema.RuntimeHealth) runtimeHealthRow {
nonActive := make([]string, 0)
for _, svc := range health.Services {
state := strings.TrimSpace(strings.ToLower(svc.Status))
if state != "active" {
// "activating", "deactivating" and "reloading" are transient states (seen e.g.
// for oneshot services with RemainAfterExit=yes) — the service is running or
// transitioning normally, not failed. Any other state, such as "failed" or
// "inactive" (after services should be running), is reported as a problem.
switch state {
case "active", "activating", "deactivating", "reloading":
// OK — service is running or transitioning normally
default:
nonActive = append(nonActive, svc.Name+"="+svc.Status)
}
}
@@ -1777,6 +1811,11 @@ func formatValidateDeviceSummary(total int, models map[string]int, unit string)
if total != 1 {
label += "s"
}
// If there is only one model the leading count duplicates the per-model
// count already in parts (e.g. "4 GPU: 4 x RTX …" → "4 x RTX …").
if len(parts) == 1 {
return parts[0] + " " + label
}
return fmt.Sprintf("%d %s: %s", total, label, strings.Join(parts, ", "))
}