Compare commits

...

2 Commits
v8.26 ... v8.28

Author SHA1 Message Date
bac89bb6e5 Add real-data duration estimates to validate tab profiles
- Add SATEstimated* constants to sat.go derived from _v8 production logs,
  with a rule to recalculate them whenever the script changes
- Extend validateInventory with NvidiaGPUCount to make estimates GPU-aware
- Update all validate card duration strings: CPU, memory, storage, NVIDIA GPU,
  targeted stress/power, pulse test, NCCL, nvbandwidth
- Fix nvbandwidth description ("intended to stay short" → actual ~45 min)
- Top-level profile labels show computed total including GPU count

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-18 10:51:15 +03:00
7a618da1f9 Redesign system power chart as stacked per-PSU area chart
- Add PSUReading struct and PSUs []PSUReading to LiveMetricSample
- Sample per-PSU input watts from IPMI SDR entity 10.x (Power Supply)
- Render stacked filled-area SVG chart (one layer per PSU, cumulative total)
- Fall back to single-line chart on systems with ≤1 PSU in SDR

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-18 10:42:00 +03:00
7 changed files with 455 additions and 69 deletions

View File

@@ -18,11 +18,19 @@ type LiveMetricSample struct {
Fans []FanReading `json:"fans"`
Temps []TempReading `json:"temps"`
PowerW float64 `json:"power_w"`
PSUs []PSUReading `json:"psus,omitempty"`
CPULoadPct float64 `json:"cpu_load_pct"`
MemLoadPct float64 `json:"mem_load_pct"`
GPUs []GPUMetricRow `json:"gpus"`
}
// PSUReading is a per-slot power supply input power reading.
type PSUReading struct {
Slot int `json:"slot"`
Name string `json:"name"`
PowerW float64 `json:"power_w"`
}
// TempReading is a named temperature sensor value.
type TempReading struct {
Name string `json:"name"`
@@ -57,6 +65,9 @@ func SampleLiveMetrics() LiveMetricSample {
// System power — returns 0 if unavailable
s.PowerW = sampleSystemPower()
// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
s.PSUs = samplePSUPower()
// CPU load — from /proc/stat
s.CPULoadPct = sampleCPULoadPct()
@@ -326,3 +337,65 @@ func compactAmbientTempName(chip, name string) string {
}
return chip + " / " + name
}
// samplePSUPower reads per-PSU input power via IPMI SDR.
// It parses `ipmitool sdr elist full` output looking for Power Supply entity
// sensors (entity ID "10.N") that report a value in Watts.
// Returns nil when IPMI is unavailable or no PSU Watt sensors exist.
func samplePSUPower() []PSUReading {
out, err := exec.Command("ipmitool", "sdr", "elist", "full").Output()
if err != nil || len(out) == 0 {
return nil
}
// map slot → reading (keep highest-watt value per slot in case of duplicates)
type entry struct {
name string
powerW float64
}
bySlot := map[int]entry{}
for _, line := range strings.Split(string(out), "\n") {
parts := strings.Split(line, "|")
if len(parts) < 5 {
continue
}
entityID := strings.TrimSpace(parts[3]) // e.g. "10.1"
if !strings.HasPrefix(entityID, "10.") {
continue // not a Power Supply entity
}
slotStr := strings.TrimPrefix(entityID, "10.")
slot, err := strconv.Atoi(slotStr)
if err != nil {
continue
}
valueField := strings.TrimSpace(parts[4]) // e.g. "740.00 Watts"
if !strings.Contains(strings.ToLower(valueField), "watts") {
continue
}
valueFields := strings.Fields(valueField)
if len(valueFields) < 2 {
continue
}
w, err := strconv.ParseFloat(valueFields[0], 64)
if err != nil || w <= 0 {
continue
}
sensorName := strings.TrimSpace(parts[0])
if existing, ok := bySlot[slot]; !ok || w > existing.powerW {
bySlot[slot] = entry{name: sensorName, powerW: w}
}
}
if len(bySlot) == 0 {
return nil
}
slots := make([]int, 0, len(bySlot))
for s := range bySlot {
slots = append(slots, s)
}
sort.Ints(slots)
psus := make([]PSUReading, 0, len(slots))
for _, s := range slots {
e := bySlot[s]
psus = append(psus, PSUReading{Slot: s, Name: e.name, PowerW: e.powerW})
}
return psus
}

View File

@@ -20,6 +20,54 @@ import (
"time"
)
// Estimated wall-clock durations for each SAT/validate test, derived from real
// production logs in _benchmark/_v8/.
//
// Rule: whenever the commands, timeout parameters, or number of sub-jobs inside
// the corresponding Run*Pack function change, re-measure the wall-clock duration
// from actual task logs and update the matching constant here.
//
// Sources:
// - SATEstimatedCPUValidateSec: xFusion v8.6 — 62 s
// - SATEstimatedMemoryValidateSec: xFusion v8.6 — 68 s
// - SATEstimatedNvidiaGPUValidatePerGPUSec: xFusion v8.6/v8.22 — 7787 s/GPU
// - SATEstimatedNvidiaGPUStressPerGPUSec: xFusion v8.6/v8.22 — 444448 s/GPU
// - SATEstimatedNvidiaTargetedStressPerGPUSec: xFusion v8.6/v8.22 — 347348 s/GPU (300 s default + overhead)
// - SATEstimatedNvidiaTargetedPowerPerGPUSec: MSI v8.22 / xFusion v8.6 — 346351 s/GPU
// - SATEstimatedNvidiaPulseTestSec: xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
// - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22 — 210384 s / 8 GPU (all simultaneous)
// - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22 — 2 6642 688 s / 8 GPU (all simultaneous)
const (
// CPU stress: stress-ng 60 s + lscpu/sensors overhead.
SATEstimatedCPUValidateSec = 65
// CPU stress: stress-ng 1800 s (stress mode default).
SATEstimatedCPUStressSec = 1800
// RAM: memtester 256 MB / 1 pass.
SATEstimatedMemoryValidateSec = 70
// RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
SATEstimatedMemoryStressSec = 140
// NVIDIA dcgmi diag Level 2 (medium), per GPU, sequential.
SATEstimatedNvidiaGPUValidatePerGPUSec = 85
// NVIDIA dcgmi diag Level 3 (targeted stress), per GPU, sequential.
SATEstimatedNvidiaGPUStressPerGPUSec = 450
// NVIDIA dcgmi targeted_stress 300 s + overhead, per GPU, sequential.
SATEstimatedNvidiaTargetedStressPerGPUSec = 350
// NVIDIA dcgmi targeted_power 300 s + overhead, per GPU, sequential.
SATEstimatedNvidiaTargetedPowerPerGPUSec = 350
// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU).
SATEstimatedNvidiaPulseTestSec = 5000
// NCCL all_reduce_perf, all GPUs simultaneously.
SATEstimatedNvidiaInterconnectSec = 300
// nvbandwidth, all GPUs simultaneously. Tool runs all built-in tests
// without a user-configurable time limit; duration is determined by nvbandwidth itself.
SATEstimatedNvidiaBandwidthSec = 2700
)
var (
satExecCommand = exec.Command
satLookPath = exec.LookPath

View File

@@ -462,6 +462,127 @@ func synthesizeChartTimes(times []time.Time, count int) []time.Time {
return out
}
// renderStackedMetricChartSVG renders a stacked area chart where each dataset
// is visually "stacked" on top of the previous one. Intended for multi-PSU
// power charts where the filled area of each PSU shows its individual
// contribution and the total height equals the combined draw.
func renderStackedMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
pointCount := len(labels)
if len(times) > pointCount {
pointCount = len(times)
}
if pointCount == 0 {
pointCount = 1
labels = []string{""}
times = []time.Time{{}}
}
if len(labels) < pointCount {
padded := make([]string, pointCount)
copy(padded, labels)
labels = padded
}
if len(times) < pointCount {
times = synthesizeChartTimes(times, pointCount)
}
for i := range datasets {
if len(datasets[i]) == 0 {
datasets[i] = make([]float64, pointCount)
}
}
times, datasets = downsampleTimeSeries(times, datasets, 1400)
pointCount = len(times)
// Build cumulative sums per time point.
cumulative := make([][]float64, len(datasets)+1)
for i := range cumulative {
cumulative[i] = make([]float64, pointCount)
}
for i, ds := range datasets {
for j, v := range ds {
cumulative[i+1][j] = cumulative[i][j] + v
}
}
// Scale is based on the total (top cumulative row).
total := cumulative[len(cumulative)-1]
yMin := floatPtr(0)
if yMax == nil {
yMax = autoMax120(total)
}
scale := singleAxisChartScale([][]float64{total}, yMin, yMax)
legendItems := make([]metricChartSeries, len(datasets))
for i, name := range names {
color := metricChartPalette[i%len(metricChartPalette)]
legendItems[i] = metricChartSeries{Name: name, Color: color, Values: datasets[i]}
}
// Stats label from totals.
statsLabel := chartStatsLabel([][]float64{total})
layout := singleAxisChartLayout(canvasHeight, len(legendItems))
start, end := chartTimeBounds(times)
var b strings.Builder
writeSVGOpen(&b, layout.Width, layout.Height)
writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
writeTimelineIdleSpans(&b, layout, start, end, timeline)
writeVerticalGrid(&b, layout, times, pointCount, 8)
writeHorizontalGrid(&b, layout, scale)
writeTimelineBoundaries(&b, layout, start, end, timeline)
writePlotBorder(&b, layout)
writeSingleAxisY(&b, layout, scale)
writeXAxisLabels(&b, layout, times, labels, start, end, 8)
// Draw stacked areas from top to bottom so lower layers are visible.
for i := len(datasets) - 1; i >= 0; i-- {
writeStackedArea(&b, layout, times, start, end, cumulative[i], cumulative[i+1], scale, legendItems[i].Color)
}
// Draw border polylines on top.
for i := len(datasets) - 1; i >= 0; i-- {
writeSeriesPolyline(&b, layout, times, start, end, cumulative[i+1], scale, legendItems[i].Color)
}
writeLegend(&b, layout, legendItems)
writeSVGClose(&b)
return []byte(b.String()), nil
}
// writeStackedArea draws a filled polygon between two cumulative value arrays
// (baseline and top), using the given color at 55% opacity.
func writeStackedArea(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, baseline, top []float64, scale chartScale, color string) {
n := len(top)
if n == 0 {
return
}
if len(baseline) < n {
baseline = make([]float64, n)
}
// Forward path along top values, then backward along baseline values.
var points strings.Builder
for i := 0; i < n; i++ {
x := chartXForTime(chartPointTime(times, i), start, end, layout.PlotLeft, layout.PlotRight)
y := chartYForValue(valueClamp(top[i], scale), scale, layout.PlotTop, layout.PlotBottom)
if i > 0 {
points.WriteByte(' ')
}
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
points.WriteByte(',')
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
}
for i := n - 1; i >= 0; i-- {
x := chartXForTime(chartPointTime(times, i), start, end, layout.PlotLeft, layout.PlotRight)
y := chartYForValue(valueClamp(baseline[i], scale), scale, layout.PlotTop, layout.PlotBottom)
points.WriteByte(' ')
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
points.WriteByte(',')
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
}
fmt.Fprintf(b, `<polygon points="%s" fill="%s" fill-opacity="0.55" stroke="none"/>`+"\n", points.String(), color)
}
func writeSVGOpen(b *strings.Builder, width, height int) {
fmt.Fprintf(b, `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">`+"\n", width, height, width, height)
}

View File

@@ -1378,15 +1378,64 @@ setInterval(loadMetricsLayout, 5000);
// ── Validate (Acceptance Tests) ───────────────────────────────────────────────
type validateInventory struct {
CPU string
Memory string
Storage string
NVIDIA string
AMD string
CPU string
Memory string
Storage string
NVIDIA string
AMD string
NvidiaGPUCount int
AMDGPUCount int
}
// validateFmtDur formats a duration in seconds as a human-readable "~N min" or "~N s" string.
func validateFmtDur(secs int) string {
if secs < 120 {
return fmt.Sprintf("~%d s", secs)
}
mins := (secs + 29) / 60
return fmt.Sprintf("~%d min", mins)
}
// validateTotalValidateSec returns the estimated wall-clock duration of
// "Validate one by one" in Validate mode for n NVIDIA GPUs.
func validateTotalValidateSec(n int) int {
if n < 0 {
n = 0
}
total := platform.SATEstimatedCPUValidateSec +
platform.SATEstimatedMemoryValidateSec +
n*platform.SATEstimatedNvidiaGPUValidatePerGPUSec +
platform.SATEstimatedNvidiaInterconnectSec +
platform.SATEstimatedNvidiaBandwidthSec
return total
}
// validateTotalStressSec returns the estimated wall-clock duration of
// "Validate one by one" in Stress mode for n NVIDIA GPUs.
func validateTotalStressSec(n int) int {
if n < 0 {
n = 0
}
total := platform.SATEstimatedCPUStressSec +
platform.SATEstimatedMemoryStressSec +
n*platform.SATEstimatedNvidiaGPUStressPerGPUSec +
n*platform.SATEstimatedNvidiaTargetedStressPerGPUSec +
n*platform.SATEstimatedNvidiaTargetedPowerPerGPUSec +
platform.SATEstimatedNvidiaPulseTestSec +
platform.SATEstimatedNvidiaInterconnectSec +
platform.SATEstimatedNvidiaBandwidthSec
return total
}
func renderValidate(opts HandlerOptions) string {
inv := loadValidateInventory(opts)
n := inv.NvidiaGPUCount
validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
stressTotalStr := validateFmtDur(validateTotalStressSec(n))
gpuNote := ""
if n > 0 {
gpuNote = fmt.Sprintf(" (%d GPU)", n)
}
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
@@ -1396,10 +1445,10 @@ func renderValidate(opts HandlerOptions) string {
<div class="validate-profile-col">
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (~3060 min)</span></label>
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (` + stressTotalStr + gpuNote + `)</span></label>
</div>
<div class="validate-profile-col validate-profile-action">
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~515 min total); Stress is thorough (~3060 min total).</p>
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.</p>
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
<div style="margin-top:12px">
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
@@ -1413,19 +1462,19 @@ func renderValidate(opts HandlerOptions) string {
inv.CPU,
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
`60s in Validate, 30 min in Stress.`,
validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`,
)) +
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
inv.Memory,
`Runs a RAM validation pass and records memory state around the test.`,
`<code>free</code>, <code>memtester</code>`,
`256 MB / 1 pass in Validate, 512 MB / 1 pass in Stress.`,
validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`,
)) +
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
inv.Storage,
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
`Short self-test in Validate, extended self-test in Stress.`,
`Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`,
)) +
`</div>
<div style="height:1px;background:var(--border);margin:16px 0"></div>
@@ -1450,14 +1499,33 @@ func renderValidate(opts HandlerOptions) string {
inv.NVIDIA,
`Runs NVIDIA diagnostics and board inventory checks.`,
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
`Level 2 in Validate, Level 3 in Stress. Runs one GPU at a time on the selected NVIDIA GPUs.`,
func() string {
perV := platform.SATEstimatedNvidiaGPUValidatePerGPUSec
perS := platform.SATEstimatedNvidiaGPUStressPerGPUSec
if n > 0 {
return fmt.Sprintf("Validate: %s/GPU × %d = %s (Level 2, sequential). Stress: %s/GPU × %d = %s (Level 3, sequential).",
validateFmtDur(perV), n, validateFmtDur(perV*n),
validateFmtDur(perS), n, validateFmtDur(perS*n))
}
return fmt.Sprintf("Validate: %s/GPU (Level 2, sequential). Stress: %s/GPU (Level 3, sequential).",
validateFmtDur(perV), validateFmtDur(perS))
}(),
)) +
`<div id="sat-card-nvidia-targeted-stress">` +
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
inv.NVIDIA,
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
`<code>dcgmi diag targeted_stress</code>`,
`Skipped in Validate mode. Runs after dcgmi diag in Stress mode. Runs one GPU at a time on the selected NVIDIA GPUs.<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
func() string {
per := platform.SATEstimatedNvidiaTargetedStressPerGPUSec
s := "Skipped in Validate. "
if n > 0 {
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
} else {
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
}
return s + `<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
}(),
)) +
`</div>` +
`<div id="sat-card-nvidia-targeted-power">` +
@@ -1465,7 +1533,16 @@ func renderValidate(opts HandlerOptions) string {
inv.NVIDIA,
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
`<code>dcgmi diag targeted_power</code>`,
`Skipped in Validate mode. Runs in Stress mode only. Runs one GPU at a time.<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
func() string {
per := platform.SATEstimatedNvidiaTargetedPowerPerGPUSec
s := "Skipped in Validate. "
if n > 0 {
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
} else {
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
}
return s + `<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
}(),
)) +
`</div>` +
`<div id="sat-card-nvidia-pulse">` +
@@ -1473,7 +1550,7 @@ func renderValidate(opts HandlerOptions) string {
inv.NVIDIA,
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
`<code>dcgmi diag pulse_test</code>`,
`Skipped in Validate mode. Runs in Stress mode only. Runs all selected GPUs simultaneously — synchronous pulsing is required to stress the PSU.<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
`Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
)) +
`</div>` +
`<div id="sat-card-nvidia-interconnect">` +
@@ -1481,7 +1558,7 @@ func renderValidate(opts HandlerOptions) string {
inv.NVIDIA,
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
`<code>all_reduce_perf</code> (NCCL tests)`,
`Runs in Validate and Stress. Uses all selected GPUs simultaneously (requires ≥2) and is kept short so it fits the Validate flow.`,
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
)) +
`</div>` +
`<div id="sat-card-nvidia-bandwidth">` +
@@ -1489,7 +1566,7 @@ func renderValidate(opts HandlerOptions) string {
inv.NVIDIA,
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
`<code>nvbandwidth</code>`,
`Runs in Validate and Stress across all selected GPUs simultaneously. Intended to stay short enough for Validate.`,
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
)) +
`</div>` +
`</div>
@@ -1922,6 +1999,8 @@ func loadValidateInventory(opts HandlerOptions) validateInventory {
out.Storage = formatValidateDeviceSummary(storageTotal, storageCounts, "device")
out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU")
out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU")
out.NvidiaGPUCount = nvidiaTotal
out.AMDGPUCount = amdTotal
return out
}

View File

@@ -575,12 +575,14 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
}
timeline := metricsTimelineSegments(samples, time.Now())
if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
buf, ok, err := renderGPUOverviewChartSVG(idx, samples, timeline)
var overviewOk bool
var buf []byte
buf, overviewOk, err = renderGPUOverviewChartSVG(idx, samples, timeline)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
if !ok {
if !overviewOk {
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
return
}
@@ -589,23 +591,37 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
_, _ = w.Write(buf)
return
}
datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
if !ok {
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
return
}
buf, err := renderMetricChartSVG(
title,
labels,
sampleTimes(samples),
datasets,
names,
yMin,
yMax,
chartCanvasHeightForPath(path, len(names)),
timeline,
)
var buf []byte
if stacked {
buf, err = renderStackedMetricChartSVG(
title,
labels,
sampleTimes(samples),
datasets,
names,
yMax,
chartCanvasHeightForPath(path, len(names)),
timeline,
)
} else {
buf, err = renderMetricChartSVG(
title,
labels,
sampleTimes(samples),
datasets,
names,
yMin,
yMax,
chartCanvasHeightForPath(path, len(names)),
timeline,
)
}
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
@@ -615,12 +631,8 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
_, _ = w.Write(buf)
}
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
var datasets [][]float64
var names []string
var title string
var yMin, yMax *float64
labels := sampleTimeLabels(samples)
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) (datasets [][]float64, names []string, labels []string, title string, yMin, yMax *float64, stacked bool, ok bool) {
labels = sampleTimeLabels(samples)
switch {
case path == "server-load":
@@ -656,15 +668,41 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
case path == "server-power":
title = "System Power"
power := make([]float64, len(samples))
for i, s := range samples {
power[i] = s.PowerW
// Use per-PSU stacked chart when PSU SDR data is available.
// Collect the union of PSU slots seen across all samples.
psuSlots := psuSlotsFromSamples(samples)
if len(psuSlots) > 1 {
// Build one dataset per PSU slot.
psuDatasets := make([][]float64, len(psuSlots))
psuNames := make([]string, len(psuSlots))
for si, slot := range psuSlots {
ds := make([]float64, len(samples))
for i, s := range samples {
for _, psu := range s.PSUs {
if psu.Slot == slot {
ds[i] = psu.PowerW
break
}
}
}
psuDatasets[si] = normalizePowerSeries(ds)
psuNames[si] = fmt.Sprintf("PSU %d", slot)
}
datasets = psuDatasets
names = psuNames
stacked = true
yMax = autoMax120(psuStackedTotal(psuDatasets))
} else {
power := make([]float64, len(samples))
for i, s := range samples {
power[i] = s.PowerW
}
power = normalizePowerSeries(power)
datasets = [][]float64{power}
names = []string{"Power W"}
yMin = floatPtr(0)
yMax = autoMax120(power)
}
power = normalizePowerSeries(power)
datasets = [][]float64{power}
names = []string{"Power W"}
yMin = floatPtr(0)
yMax = autoMax120(power)
case path == "server-fans":
title = "Fan RPM"
@@ -707,7 +745,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
case strings.HasPrefix(path, "gpu/"):
idx, sub, ok := parseGPUChartPath(path)
if !ok {
return nil, nil, nil, "", nil, nil, false
return nil, nil, nil, "", nil, nil, false, false
}
switch sub {
case "load":
@@ -715,7 +753,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
if util == nil && mem == nil {
return nil, nil, nil, "", nil, nil, false
return nil, nil, nil, "", nil, nil, false, false
}
datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
names = []string{"Load %", "Mem %"}
@@ -725,7 +763,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
title = gpuDisplayLabel(idx) + " Temperature"
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
if temp == nil {
return nil, nil, nil, "", nil, nil, false
return nil, nil, nil, "", nil, nil, false, false
}
datasets = [][]float64{temp}
names = []string{"Temp °C"}
@@ -735,7 +773,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
title = gpuDisplayLabel(idx) + " Core Clock"
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
if clock == nil {
return nil, nil, nil, "", nil, nil, false
return nil, nil, nil, "", nil, nil, false, false
}
datasets = [][]float64{clock}
names = []string{"Core Clock MHz"}
@@ -744,7 +782,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
title = gpuDisplayLabel(idx) + " Memory Clock"
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
if clock == nil {
return nil, nil, nil, "", nil, nil, false
return nil, nil, nil, "", nil, nil, false, false
}
datasets = [][]float64{clock}
names = []string{"Memory Clock MHz"}
@@ -753,7 +791,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
title = gpuDisplayLabel(idx) + " Power"
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
if power == nil {
return nil, nil, nil, "", nil, nil, false
return nil, nil, nil, "", nil, nil, false, false
}
datasets = [][]float64{power}
names = []string{"Power W"}
@@ -761,10 +799,10 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
}
default:
return nil, nil, nil, "", nil, nil, false
return nil, nil, nil, "", nil, nil, false, false
}
return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
return datasets, names, labels, title, yMin, yMax, stacked, len(datasets) > 0
}
func parseGPUChartPath(path string) (idx int, sub string, ok bool) {
@@ -930,6 +968,37 @@ func normalizePowerSeries(ds []float64) []float64 {
return out
}
// psuSlotsFromSamples returns the sorted list of PSU slot numbers seen across samples.
func psuSlotsFromSamples(samples []platform.LiveMetricSample) []int {
seen := map[int]struct{}{}
for _, s := range samples {
for _, p := range s.PSUs {
seen[p.Slot] = struct{}{}
}
}
slots := make([]int, 0, len(seen))
for s := range seen {
slots = append(slots, s)
}
sort.Ints(slots)
return slots
}
// psuStackedTotal returns the point-by-point sum of all PSU datasets (for scale calculation).
func psuStackedTotal(datasets [][]float64) []float64 {
if len(datasets) == 0 {
return nil
}
n := len(datasets[0])
total := make([]float64, n)
for _, ds := range datasets {
for i, v := range ds {
total[i] += v
}
}
return total
}
func normalizeFanSeries(ds []float64) []float64 {
if len(ds) == 0 {
return nil

View File

@@ -120,7 +120,7 @@ func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
},
}
datasets, names, labels, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
datasets, names, labels, title, _, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
if !ok {
t.Fatal("chartDataFromSamples returned ok=false")
}
@@ -164,7 +164,7 @@ func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
},
}
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
datasets, names, _, title, _, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
if !ok {
t.Fatal("chartDataFromSamples returned ok=false")
}
@@ -209,7 +209,7 @@ func TestChartDataFromSamplesIncludesGPUClockCharts(t *testing.T) {
},
}
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
datasets, names, _, title, _, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
if !ok {
t.Fatal("gpu-all-clock returned ok=false")
}
@@ -754,9 +754,9 @@ func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
body := rec.Body.String()
for _, needle := range []string{
`NVIDIA Interconnect (NCCL)`,
`Runs in Validate and Stress.`,
`Validate and Stress:`,
`NVIDIA Bandwidth (NVBandwidth)`,
`Intended to stay short enough for Validate.`,
`nvbandwidth runs all built-in tests without a time limit`,
} {
if !strings.Contains(body, needle) {
t.Fatalf("validate page missing %q: %s", needle, body)

View File

@@ -171,21 +171,17 @@ func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeli
}
return gpuDisplayLabel(idx) + " Overview", buf, true
}
datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
if !ok {
return "", nil, false
}
buf, err := renderMetricChartSVG(
title,
labels,
sampleTimes(samples),
datasets,
names,
yMin,
yMax,
chartCanvasHeightForPath(path, len(names)),
timeline,
)
var buf []byte
var err error
if stacked {
buf, err = renderStackedMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
} else {
buf, err = renderMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMin, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
}
if err != nil {
return "", nil, false
}