Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| bac89bb6e5 | |||
| 7a618da1f9 |
@@ -18,11 +18,19 @@ type LiveMetricSample struct {
|
|||||||
Fans []FanReading `json:"fans"`
|
Fans []FanReading `json:"fans"`
|
||||||
Temps []TempReading `json:"temps"`
|
Temps []TempReading `json:"temps"`
|
||||||
PowerW float64 `json:"power_w"`
|
PowerW float64 `json:"power_w"`
|
||||||
|
PSUs []PSUReading `json:"psus,omitempty"`
|
||||||
CPULoadPct float64 `json:"cpu_load_pct"`
|
CPULoadPct float64 `json:"cpu_load_pct"`
|
||||||
MemLoadPct float64 `json:"mem_load_pct"`
|
MemLoadPct float64 `json:"mem_load_pct"`
|
||||||
GPUs []GPUMetricRow `json:"gpus"`
|
GPUs []GPUMetricRow `json:"gpus"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// PSUReading is a per-slot power supply input power reading.
// Values are produced by samplePSUPower from IPMI SDR sensors that belong to
// a Power Supply entity (entity ID "10.N") and report a value in Watts.
|
||||||
|
type PSUReading struct {
|
||||||
|
Slot int `json:"slot"` // N from the IPMI entity ID "10.N"
|
||||||
|
Name string `json:"name"` // IPMI sensor name the reading was taken from
|
||||||
|
PowerW float64 `json:"power_w"` // sensor value in watts
|
||||||
|
}
|
||||||
|
|
||||||
// TempReading is a named temperature sensor value.
|
// TempReading is a named temperature sensor value.
|
||||||
type TempReading struct {
|
type TempReading struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
@@ -57,6 +65,9 @@ func SampleLiveMetrics() LiveMetricSample {
|
|||||||
// System power — returns 0 if unavailable
|
// System power — returns 0 if unavailable
|
||||||
s.PowerW = sampleSystemPower()
|
s.PowerW = sampleSystemPower()
|
||||||
|
|
||||||
|
// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
|
||||||
|
s.PSUs = samplePSUPower()
|
||||||
|
|
||||||
// CPU load — from /proc/stat
|
// CPU load — from /proc/stat
|
||||||
s.CPULoadPct = sampleCPULoadPct()
|
s.CPULoadPct = sampleCPULoadPct()
|
||||||
|
|
||||||
@@ -326,3 +337,65 @@ func compactAmbientTempName(chip, name string) string {
|
|||||||
}
|
}
|
||||||
return chip + " / " + name
|
return chip + " / " + name
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// samplePSUPower reads per-PSU input power via IPMI SDR.
|
||||||
|
// It parses `ipmitool sdr elist full` output looking for Power Supply entity
|
||||||
|
// sensors (entity ID "10.N") that report a value in Watts.
|
||||||
|
// Returns nil when IPMI is unavailable or no PSU Watt sensors exist.
|
||||||
|
func samplePSUPower() []PSUReading {
|
||||||
|
out, err := exec.Command("ipmitool", "sdr", "elist", "full").Output()
|
||||||
|
if err != nil || len(out) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
// map slot → reading (keep highest-watt value per slot in case of duplicates)
|
||||||
|
type entry struct {
|
||||||
|
name string
|
||||||
|
powerW float64
|
||||||
|
}
|
||||||
|
bySlot := map[int]entry{}
|
||||||
|
for _, line := range strings.Split(string(out), "\n") {
|
||||||
|
parts := strings.Split(line, "|")
|
||||||
|
if len(parts) < 5 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
entityID := strings.TrimSpace(parts[3]) // e.g. "10.1"
|
||||||
|
if !strings.HasPrefix(entityID, "10.") {
|
||||||
|
continue // not a Power Supply entity
|
||||||
|
}
|
||||||
|
slotStr := strings.TrimPrefix(entityID, "10.")
|
||||||
|
slot, err := strconv.Atoi(slotStr)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
valueField := strings.TrimSpace(parts[4]) // e.g. "740.00 Watts"
|
||||||
|
if !strings.Contains(strings.ToLower(valueField), "watts") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
valueFields := strings.Fields(valueField)
|
||||||
|
if len(valueFields) < 2 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
w, err := strconv.ParseFloat(valueFields[0], 64)
|
||||||
|
if err != nil || w <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
sensorName := strings.TrimSpace(parts[0])
|
||||||
|
if existing, ok := bySlot[slot]; !ok || w > existing.powerW {
|
||||||
|
bySlot[slot] = entry{name: sensorName, powerW: w}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(bySlot) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
slots := make([]int, 0, len(bySlot))
|
||||||
|
for s := range bySlot {
|
||||||
|
slots = append(slots, s)
|
||||||
|
}
|
||||||
|
sort.Ints(slots)
|
||||||
|
psus := make([]PSUReading, 0, len(slots))
|
||||||
|
for _, s := range slots {
|
||||||
|
e := bySlot[s]
|
||||||
|
psus = append(psus, PSUReading{Slot: s, Name: e.name, PowerW: e.powerW})
|
||||||
|
}
|
||||||
|
return psus
|
||||||
|
}
|
||||||
|
|||||||
@@ -20,6 +20,54 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Estimated wall-clock durations for each SAT/validate test, derived from real
|
||||||
|
// production logs in _benchmark/_v8/.
|
||||||
|
//
|
||||||
|
// Rule: whenever the commands, timeout parameters, or number of sub-jobs inside
|
||||||
|
// the corresponding Run*Pack function change, re-measure the wall-clock duration
|
||||||
|
// from actual task logs and update the matching constant here.
|
||||||
|
//
|
||||||
|
// Sources:
|
||||||
|
// - SATEstimatedCPUValidateSec: xFusion v8.6 — 62 s
|
||||||
|
// - SATEstimatedMemoryValidateSec: xFusion v8.6 — 68 s
|
||||||
|
// - SATEstimatedNvidiaGPUValidatePerGPUSec: xFusion v8.6/v8.22 — 77–87 s/GPU
|
||||||
|
// - SATEstimatedNvidiaGPUStressPerGPUSec: xFusion v8.6/v8.22 — 444–448 s/GPU
|
||||||
|
// - SATEstimatedNvidiaTargetedStressPerGPUSec: xFusion v8.6/v8.22 — 347–348 s/GPU (300 s default + overhead)
|
||||||
|
// - SATEstimatedNvidiaTargetedPowerPerGPUSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU
|
||||||
|
// - SATEstimatedNvidiaPulseTestSec: xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
|
||||||
|
// - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
|
||||||
|
// - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous)
|
||||||
|
const (
|
||||||
|
// CPU validate: stress-ng 60 s + lscpu/sensors overhead (62 s measured, rounded up).
|
||||||
|
SATEstimatedCPUValidateSec = 65
|
||||||
|
// CPU stress: stress-ng 1800 s (stress mode default); nominal run time, no overhead margin.
|
||||||
|
SATEstimatedCPUStressSec = 1800
|
||||||
|
|
||||||
|
// RAM validate: memtester 256 MB / 1 pass (68 s measured, rounded up).
|
||||||
|
SATEstimatedMemoryValidateSec = 70
|
||||||
|
// RAM stress: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
|
||||||
|
SATEstimatedMemoryStressSec = 140
|
||||||
|
|
||||||
|
// NVIDIA dcgmi diag Level 2 (medium), per GPU, sequential (77–87 s/GPU measured).
|
||||||
|
SATEstimatedNvidiaGPUValidatePerGPUSec = 85
|
||||||
|
// NVIDIA dcgmi diag Level 3 (targeted stress), per GPU, sequential (444–448 s/GPU measured).
|
||||||
|
SATEstimatedNvidiaGPUStressPerGPUSec = 450
|
||||||
|
|
||||||
|
// NVIDIA dcgmi targeted_stress 300 s + overhead, per GPU, sequential (347–348 s/GPU measured).
|
||||||
|
SATEstimatedNvidiaTargetedStressPerGPUSec = 350
|
||||||
|
// NVIDIA dcgmi targeted_power 300 s + overhead, per GPU, sequential (346–351 s/GPU measured).
|
||||||
|
SATEstimatedNvidiaTargetedPowerPerGPUSec = 350
|
||||||
|
|
||||||
|
// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU); 4 926 s measured on 8 GPUs.
|
||||||
|
SATEstimatedNvidiaPulseTestSec = 5000
|
||||||
|
|
||||||
|
// NCCL all_reduce_perf, all GPUs simultaneously; 210–384 s measured on 8 GPUs.
|
||||||
|
SATEstimatedNvidiaInterconnectSec = 300
|
||||||
|
// nvbandwidth, all GPUs simultaneously. Tool runs all built-in tests
|
||||||
|
// without a user-configurable time limit; duration is determined by nvbandwidth itself.
|
||||||
|
SATEstimatedNvidiaBandwidthSec = 2700
|
||||||
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
satExecCommand = exec.Command
|
satExecCommand = exec.Command
|
||||||
satLookPath = exec.LookPath
|
satLookPath = exec.LookPath
|
||||||
|
|||||||
@@ -462,6 +462,127 @@ func synthesizeChartTimes(times []time.Time, count int) []time.Time {
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// renderStackedMetricChartSVG renders a stacked area chart where each dataset
|
||||||
|
// is visually "stacked" on top of the previous one. Intended for multi-PSU
|
||||||
|
// power charts where the filled area of each PSU shows its individual
|
||||||
|
// contribution and the total height equals the combined draw.
|
||||||
|
func renderStackedMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
|
||||||
|
pointCount := len(labels)
|
||||||
|
if len(times) > pointCount {
|
||||||
|
pointCount = len(times)
|
||||||
|
}
|
||||||
|
if pointCount == 0 {
|
||||||
|
pointCount = 1
|
||||||
|
labels = []string{""}
|
||||||
|
times = []time.Time{{}}
|
||||||
|
}
|
||||||
|
if len(labels) < pointCount {
|
||||||
|
padded := make([]string, pointCount)
|
||||||
|
copy(padded, labels)
|
||||||
|
labels = padded
|
||||||
|
}
|
||||||
|
if len(times) < pointCount {
|
||||||
|
times = synthesizeChartTimes(times, pointCount)
|
||||||
|
}
|
||||||
|
for i := range datasets {
|
||||||
|
if len(datasets[i]) == 0 {
|
||||||
|
datasets[i] = make([]float64, pointCount)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
times, datasets = downsampleTimeSeries(times, datasets, 1400)
|
||||||
|
pointCount = len(times)
|
||||||
|
|
||||||
|
// Build cumulative sums per time point.
|
||||||
|
cumulative := make([][]float64, len(datasets)+1)
|
||||||
|
for i := range cumulative {
|
||||||
|
cumulative[i] = make([]float64, pointCount)
|
||||||
|
}
|
||||||
|
for i, ds := range datasets {
|
||||||
|
for j, v := range ds {
|
||||||
|
cumulative[i+1][j] = cumulative[i][j] + v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Scale is based on the total (top cumulative row).
|
||||||
|
total := cumulative[len(cumulative)-1]
|
||||||
|
yMin := floatPtr(0)
|
||||||
|
if yMax == nil {
|
||||||
|
yMax = autoMax120(total)
|
||||||
|
}
|
||||||
|
scale := singleAxisChartScale([][]float64{total}, yMin, yMax)
|
||||||
|
|
||||||
|
legendItems := make([]metricChartSeries, len(datasets))
|
||||||
|
for i, name := range names {
|
||||||
|
color := metricChartPalette[i%len(metricChartPalette)]
|
||||||
|
legendItems[i] = metricChartSeries{Name: name, Color: color, Values: datasets[i]}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stats label from totals.
|
||||||
|
statsLabel := chartStatsLabel([][]float64{total})
|
||||||
|
|
||||||
|
layout := singleAxisChartLayout(canvasHeight, len(legendItems))
|
||||||
|
start, end := chartTimeBounds(times)
|
||||||
|
|
||||||
|
var b strings.Builder
|
||||||
|
writeSVGOpen(&b, layout.Width, layout.Height)
|
||||||
|
writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
|
||||||
|
writeTimelineIdleSpans(&b, layout, start, end, timeline)
|
||||||
|
writeVerticalGrid(&b, layout, times, pointCount, 8)
|
||||||
|
writeHorizontalGrid(&b, layout, scale)
|
||||||
|
writeTimelineBoundaries(&b, layout, start, end, timeline)
|
||||||
|
writePlotBorder(&b, layout)
|
||||||
|
writeSingleAxisY(&b, layout, scale)
|
||||||
|
writeXAxisLabels(&b, layout, times, labels, start, end, 8)
|
||||||
|
|
||||||
|
// Draw stacked areas from top to bottom so lower layers are visible.
|
||||||
|
for i := len(datasets) - 1; i >= 0; i-- {
|
||||||
|
writeStackedArea(&b, layout, times, start, end, cumulative[i], cumulative[i+1], scale, legendItems[i].Color)
|
||||||
|
}
|
||||||
|
// Draw border polylines on top.
|
||||||
|
for i := len(datasets) - 1; i >= 0; i-- {
|
||||||
|
writeSeriesPolyline(&b, layout, times, start, end, cumulative[i+1], scale, legendItems[i].Color)
|
||||||
|
}
|
||||||
|
|
||||||
|
writeLegend(&b, layout, legendItems)
|
||||||
|
writeSVGClose(&b)
|
||||||
|
return []byte(b.String()), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// writeStackedArea draws a filled polygon between two cumulative value arrays
|
||||||
|
// (baseline and top), using the given color at 55% opacity.
|
||||||
|
func writeStackedArea(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, baseline, top []float64, scale chartScale, color string) {
|
||||||
|
n := len(top)
|
||||||
|
if n == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if len(baseline) < n {
|
||||||
|
baseline = make([]float64, n)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Forward path along top values, then backward along baseline values.
|
||||||
|
var points strings.Builder
|
||||||
|
for i := 0; i < n; i++ {
|
||||||
|
x := chartXForTime(chartPointTime(times, i), start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
y := chartYForValue(valueClamp(top[i], scale), scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
if i > 0 {
|
||||||
|
points.WriteByte(' ')
|
||||||
|
}
|
||||||
|
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
|
||||||
|
points.WriteByte(',')
|
||||||
|
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
|
||||||
|
}
|
||||||
|
for i := n - 1; i >= 0; i-- {
|
||||||
|
x := chartXForTime(chartPointTime(times, i), start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
y := chartYForValue(valueClamp(baseline[i], scale), scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
points.WriteByte(' ')
|
||||||
|
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
|
||||||
|
points.WriteByte(',')
|
||||||
|
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
|
||||||
|
}
|
||||||
|
fmt.Fprintf(b, `<polygon points="%s" fill="%s" fill-opacity="0.55" stroke="none"/>`+"\n", points.String(), color)
|
||||||
|
}
|
||||||
|
|
||||||
func writeSVGOpen(b *strings.Builder, width, height int) {
|
func writeSVGOpen(b *strings.Builder, width, height int) {
|
||||||
fmt.Fprintf(b, `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">`+"\n", width, height, width, height)
|
fmt.Fprintf(b, `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">`+"\n", width, height, width, height)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1383,10 +1383,59 @@ type validateInventory struct {
|
|||||||
Storage string
|
Storage string
|
||||||
NVIDIA string
|
NVIDIA string
|
||||||
AMD string
|
AMD string
|
||||||
|
NvidiaGPUCount int
|
||||||
|
AMDGPUCount int
|
||||||
|
}
|
||||||
|
|
||||||
|
// validateFmtDur renders a duration given in seconds as an approximate,
// human-readable string: "~N s" below two minutes, "~N min" from two minutes
// up. Minutes are rounded to the nearest whole minute, except that a
// remainder of exactly 30 s rounds down.
func validateFmtDur(secs int) string {
	if secs >= 120 {
		return fmt.Sprintf("~%d min", (secs+29)/60)
	}
	return fmt.Sprintf("~%d s", secs)
}
|
||||||
|
|
||||||
|
// validateTotalValidateSec returns the estimated wall-clock duration of
|
||||||
|
// "Validate one by one" in Validate mode for n NVIDIA GPUs.
|
||||||
|
func validateTotalValidateSec(n int) int {
|
||||||
|
if n < 0 {
|
||||||
|
n = 0
|
||||||
|
}
|
||||||
|
total := platform.SATEstimatedCPUValidateSec +
|
||||||
|
platform.SATEstimatedMemoryValidateSec +
|
||||||
|
n*platform.SATEstimatedNvidiaGPUValidatePerGPUSec +
|
||||||
|
platform.SATEstimatedNvidiaInterconnectSec +
|
||||||
|
platform.SATEstimatedNvidiaBandwidthSec
|
||||||
|
return total
|
||||||
|
}
|
||||||
|
|
||||||
|
// validateTotalStressSec returns the estimated wall-clock duration of
|
||||||
|
// "Validate one by one" in Stress mode for n NVIDIA GPUs.
|
||||||
|
func validateTotalStressSec(n int) int {
|
||||||
|
if n < 0 {
|
||||||
|
n = 0
|
||||||
|
}
|
||||||
|
total := platform.SATEstimatedCPUStressSec +
|
||||||
|
platform.SATEstimatedMemoryStressSec +
|
||||||
|
n*platform.SATEstimatedNvidiaGPUStressPerGPUSec +
|
||||||
|
n*platform.SATEstimatedNvidiaTargetedStressPerGPUSec +
|
||||||
|
n*platform.SATEstimatedNvidiaTargetedPowerPerGPUSec +
|
||||||
|
platform.SATEstimatedNvidiaPulseTestSec +
|
||||||
|
platform.SATEstimatedNvidiaInterconnectSec +
|
||||||
|
platform.SATEstimatedNvidiaBandwidthSec
|
||||||
|
return total
|
||||||
}
|
}
|
||||||
|
|
||||||
func renderValidate(opts HandlerOptions) string {
|
func renderValidate(opts HandlerOptions) string {
|
||||||
inv := loadValidateInventory(opts)
|
inv := loadValidateInventory(opts)
|
||||||
|
n := inv.NvidiaGPUCount
|
||||||
|
validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
|
||||||
|
stressTotalStr := validateFmtDur(validateTotalStressSec(n))
|
||||||
|
gpuNote := ""
|
||||||
|
if n > 0 {
|
||||||
|
gpuNote = fmt.Sprintf(" (%d GPU)", n)
|
||||||
|
}
|
||||||
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
|
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
|
||||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||||
|
|
||||||
@@ -1396,10 +1445,10 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
<div class="validate-profile-col">
|
<div class="validate-profile-col">
|
||||||
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
||||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
||||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (~30–60 min)</span></label>
|
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (` + stressTotalStr + gpuNote + `)</span></label>
|
||||||
</div>
|
</div>
|
||||||
<div class="validate-profile-col validate-profile-action">
|
<div class="validate-profile-col validate-profile-action">
|
||||||
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~5–15 min total); Stress is thorough (~30–60 min total).</p>
|
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.</p>
|
||||||
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
||||||
<div style="margin-top:12px">
|
<div style="margin-top:12px">
|
||||||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||||
@@ -1413,19 +1462,19 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.CPU,
|
inv.CPU,
|
||||||
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
|
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
|
||||||
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
|
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
|
||||||
`60s in Validate, 30 min in Stress.`,
|
validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`,
|
||||||
)) +
|
)) +
|
||||||
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
|
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
|
||||||
inv.Memory,
|
inv.Memory,
|
||||||
`Runs a RAM validation pass and records memory state around the test.`,
|
`Runs a RAM validation pass and records memory state around the test.`,
|
||||||
`<code>free</code>, <code>memtester</code>`,
|
`<code>free</code>, <code>memtester</code>`,
|
||||||
`256 MB / 1 pass in Validate, 512 MB / 1 pass in Stress.`,
|
validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`,
|
||||||
)) +
|
)) +
|
||||||
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||||||
inv.Storage,
|
inv.Storage,
|
||||||
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
|
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
|
||||||
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
||||||
`Short self-test in Validate, extended self-test in Stress.`,
|
`Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`,
|
||||||
)) +
|
)) +
|
||||||
`</div>
|
`</div>
|
||||||
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||||
@@ -1450,14 +1499,33 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Runs NVIDIA diagnostics and board inventory checks.`,
|
`Runs NVIDIA diagnostics and board inventory checks.`,
|
||||||
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||||
`Level 2 in Validate, Level 3 in Stress. Runs one GPU at a time on the selected NVIDIA GPUs.`,
|
func() string {
|
||||||
|
perV := platform.SATEstimatedNvidiaGPUValidatePerGPUSec
|
||||||
|
perS := platform.SATEstimatedNvidiaGPUStressPerGPUSec
|
||||||
|
if n > 0 {
|
||||||
|
return fmt.Sprintf("Validate: %s/GPU × %d = %s (Level 2, sequential). Stress: %s/GPU × %d = %s (Level 3, sequential).",
|
||||||
|
validateFmtDur(perV), n, validateFmtDur(perV*n),
|
||||||
|
validateFmtDur(perS), n, validateFmtDur(perS*n))
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("Validate: %s/GPU (Level 2, sequential). Stress: %s/GPU (Level 3, sequential).",
|
||||||
|
validateFmtDur(perV), validateFmtDur(perS))
|
||||||
|
}(),
|
||||||
)) +
|
)) +
|
||||||
`<div id="sat-card-nvidia-targeted-stress">` +
|
`<div id="sat-card-nvidia-targeted-stress">` +
|
||||||
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
||||||
`<code>dcgmi diag targeted_stress</code>`,
|
`<code>dcgmi diag targeted_stress</code>`,
|
||||||
`Skipped in Validate mode. Runs after dcgmi diag in Stress mode. Runs one GPU at a time on the selected NVIDIA GPUs.<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
func() string {
|
||||||
|
per := platform.SATEstimatedNvidiaTargetedStressPerGPUSec
|
||||||
|
s := "Skipped in Validate. "
|
||||||
|
if n > 0 {
|
||||||
|
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
|
||||||
|
} else {
|
||||||
|
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
|
||||||
|
}
|
||||||
|
return s + `<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
|
||||||
|
}(),
|
||||||
)) +
|
)) +
|
||||||
`</div>` +
|
`</div>` +
|
||||||
`<div id="sat-card-nvidia-targeted-power">` +
|
`<div id="sat-card-nvidia-targeted-power">` +
|
||||||
@@ -1465,7 +1533,16 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
||||||
`<code>dcgmi diag targeted_power</code>`,
|
`<code>dcgmi diag targeted_power</code>`,
|
||||||
`Skipped in Validate mode. Runs in Stress mode only. Runs one GPU at a time.<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
func() string {
|
||||||
|
per := platform.SATEstimatedNvidiaTargetedPowerPerGPUSec
|
||||||
|
s := "Skipped in Validate. "
|
||||||
|
if n > 0 {
|
||||||
|
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
|
||||||
|
} else {
|
||||||
|
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
|
||||||
|
}
|
||||||
|
return s + `<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
|
||||||
|
}(),
|
||||||
)) +
|
)) +
|
||||||
`</div>` +
|
`</div>` +
|
||||||
`<div id="sat-card-nvidia-pulse">` +
|
`<div id="sat-card-nvidia-pulse">` +
|
||||||
@@ -1473,7 +1550,7 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
|
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
|
||||||
`<code>dcgmi diag pulse_test</code>`,
|
`<code>dcgmi diag pulse_test</code>`,
|
||||||
`Skipped in Validate mode. Runs in Stress mode only. Runs all selected GPUs simultaneously — synchronous pulsing is required to stress the PSU.<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
`Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||||
)) +
|
)) +
|
||||||
`</div>` +
|
`</div>` +
|
||||||
`<div id="sat-card-nvidia-interconnect">` +
|
`<div id="sat-card-nvidia-interconnect">` +
|
||||||
@@ -1481,7 +1558,7 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
|
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
|
||||||
`<code>all_reduce_perf</code> (NCCL tests)`,
|
`<code>all_reduce_perf</code> (NCCL tests)`,
|
||||||
`Runs in Validate and Stress. Uses all selected GPUs simultaneously (requires ≥2) and is kept short so it fits the Validate flow.`,
|
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
|
||||||
)) +
|
)) +
|
||||||
`</div>` +
|
`</div>` +
|
||||||
`<div id="sat-card-nvidia-bandwidth">` +
|
`<div id="sat-card-nvidia-bandwidth">` +
|
||||||
@@ -1489,7 +1566,7 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
||||||
`<code>nvbandwidth</code>`,
|
`<code>nvbandwidth</code>`,
|
||||||
`Runs in Validate and Stress across all selected GPUs simultaneously. Intended to stay short enough for Validate.`,
|
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
|
||||||
)) +
|
)) +
|
||||||
`</div>` +
|
`</div>` +
|
||||||
`</div>
|
`</div>
|
||||||
@@ -1922,6 +1999,8 @@ func loadValidateInventory(opts HandlerOptions) validateInventory {
|
|||||||
out.Storage = formatValidateDeviceSummary(storageTotal, storageCounts, "device")
|
out.Storage = formatValidateDeviceSummary(storageTotal, storageCounts, "device")
|
||||||
out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU")
|
out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU")
|
||||||
out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU")
|
out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU")
|
||||||
|
out.NvidiaGPUCount = nvidiaTotal
|
||||||
|
out.AMDGPUCount = amdTotal
|
||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -575,12 +575,14 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
|||||||
}
|
}
|
||||||
timeline := metricsTimelineSegments(samples, time.Now())
|
timeline := metricsTimelineSegments(samples, time.Now())
|
||||||
if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
|
if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
|
||||||
buf, ok, err := renderGPUOverviewChartSVG(idx, samples, timeline)
|
var overviewOk bool
|
||||||
|
var buf []byte
|
||||||
|
buf, overviewOk, err = renderGPUOverviewChartSVG(idx, samples, timeline)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if !ok {
|
if !overviewOk {
|
||||||
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -589,13 +591,26 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
|||||||
_, _ = w.Write(buf)
|
_, _ = w.Write(buf)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
|
datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
|
||||||
if !ok {
|
if !ok {
|
||||||
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
buf, err := renderMetricChartSVG(
|
var buf []byte
|
||||||
|
if stacked {
|
||||||
|
buf, err = renderStackedMetricChartSVG(
|
||||||
|
title,
|
||||||
|
labels,
|
||||||
|
sampleTimes(samples),
|
||||||
|
datasets,
|
||||||
|
names,
|
||||||
|
yMax,
|
||||||
|
chartCanvasHeightForPath(path, len(names)),
|
||||||
|
timeline,
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
buf, err = renderMetricChartSVG(
|
||||||
title,
|
title,
|
||||||
labels,
|
labels,
|
||||||
sampleTimes(samples),
|
sampleTimes(samples),
|
||||||
@@ -606,6 +621,7 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
|||||||
chartCanvasHeightForPath(path, len(names)),
|
chartCanvasHeightForPath(path, len(names)),
|
||||||
timeline,
|
timeline,
|
||||||
)
|
)
|
||||||
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||||
return
|
return
|
||||||
@@ -615,12 +631,8 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
|||||||
_, _ = w.Write(buf)
|
_, _ = w.Write(buf)
|
||||||
}
|
}
|
||||||
|
|
||||||
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
|
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) (datasets [][]float64, names []string, labels []string, title string, yMin, yMax *float64, stacked bool, ok bool) {
|
||||||
var datasets [][]float64
|
labels = sampleTimeLabels(samples)
|
||||||
var names []string
|
|
||||||
var title string
|
|
||||||
var yMin, yMax *float64
|
|
||||||
labels := sampleTimeLabels(samples)
|
|
||||||
|
|
||||||
switch {
|
switch {
|
||||||
case path == "server-load":
|
case path == "server-load":
|
||||||
@@ -656,6 +668,31 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
|
|
||||||
case path == "server-power":
|
case path == "server-power":
|
||||||
title = "System Power"
|
title = "System Power"
|
||||||
|
// Use per-PSU stacked chart when PSU SDR data is available.
|
||||||
|
// Collect the union of PSU slots seen across all samples.
|
||||||
|
psuSlots := psuSlotsFromSamples(samples)
|
||||||
|
if len(psuSlots) > 1 {
|
||||||
|
// Build one dataset per PSU slot.
|
||||||
|
psuDatasets := make([][]float64, len(psuSlots))
|
||||||
|
psuNames := make([]string, len(psuSlots))
|
||||||
|
for si, slot := range psuSlots {
|
||||||
|
ds := make([]float64, len(samples))
|
||||||
|
for i, s := range samples {
|
||||||
|
for _, psu := range s.PSUs {
|
||||||
|
if psu.Slot == slot {
|
||||||
|
ds[i] = psu.PowerW
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
psuDatasets[si] = normalizePowerSeries(ds)
|
||||||
|
psuNames[si] = fmt.Sprintf("PSU %d", slot)
|
||||||
|
}
|
||||||
|
datasets = psuDatasets
|
||||||
|
names = psuNames
|
||||||
|
stacked = true
|
||||||
|
yMax = autoMax120(psuStackedTotal(psuDatasets))
|
||||||
|
} else {
|
||||||
power := make([]float64, len(samples))
|
power := make([]float64, len(samples))
|
||||||
for i, s := range samples {
|
for i, s := range samples {
|
||||||
power[i] = s.PowerW
|
power[i] = s.PowerW
|
||||||
@@ -665,6 +702,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
names = []string{"Power W"}
|
names = []string{"Power W"}
|
||||||
yMin = floatPtr(0)
|
yMin = floatPtr(0)
|
||||||
yMax = autoMax120(power)
|
yMax = autoMax120(power)
|
||||||
|
}
|
||||||
|
|
||||||
case path == "server-fans":
|
case path == "server-fans":
|
||||||
title = "Fan RPM"
|
title = "Fan RPM"
|
||||||
@@ -707,7 +745,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
case strings.HasPrefix(path, "gpu/"):
|
case strings.HasPrefix(path, "gpu/"):
|
||||||
idx, sub, ok := parseGPUChartPath(path)
|
idx, sub, ok := parseGPUChartPath(path)
|
||||||
if !ok {
|
if !ok {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
switch sub {
|
switch sub {
|
||||||
case "load":
|
case "load":
|
||||||
@@ -715,7 +753,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
||||||
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
||||||
if util == nil && mem == nil {
|
if util == nil && mem == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
|
datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
|
||||||
names = []string{"Load %", "Mem %"}
|
names = []string{"Load %", "Mem %"}
|
||||||
@@ -725,7 +763,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
title = gpuDisplayLabel(idx) + " Temperature"
|
title = gpuDisplayLabel(idx) + " Temperature"
|
||||||
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||||
if temp == nil {
|
if temp == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
datasets = [][]float64{temp}
|
datasets = [][]float64{temp}
|
||||||
names = []string{"Temp °C"}
|
names = []string{"Temp °C"}
|
||||||
@@ -735,7 +773,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
title = gpuDisplayLabel(idx) + " Core Clock"
|
title = gpuDisplayLabel(idx) + " Core Clock"
|
||||||
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
|
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
|
||||||
if clock == nil {
|
if clock == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
datasets = [][]float64{clock}
|
datasets = [][]float64{clock}
|
||||||
names = []string{"Core Clock MHz"}
|
names = []string{"Core Clock MHz"}
|
||||||
@@ -744,7 +782,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
title = gpuDisplayLabel(idx) + " Memory Clock"
|
title = gpuDisplayLabel(idx) + " Memory Clock"
|
||||||
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
|
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
|
||||||
if clock == nil {
|
if clock == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
datasets = [][]float64{clock}
|
datasets = [][]float64{clock}
|
||||||
names = []string{"Memory Clock MHz"}
|
names = []string{"Memory Clock MHz"}
|
||||||
@@ -753,7 +791,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
title = gpuDisplayLabel(idx) + " Power"
|
title = gpuDisplayLabel(idx) + " Power"
|
||||||
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
||||||
if power == nil {
|
if power == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
datasets = [][]float64{power}
|
datasets = [][]float64{power}
|
||||||
names = []string{"Power W"}
|
names = []string{"Power W"}
|
||||||
@@ -761,10 +799,10 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
}
|
}
|
||||||
|
|
||||||
default:
|
default:
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
|
|
||||||
return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
|
return datasets, names, labels, title, yMin, yMax, stacked, len(datasets) > 0
|
||||||
}
|
}
|
||||||
|
|
||||||
func parseGPUChartPath(path string) (idx int, sub string, ok bool) {
|
func parseGPUChartPath(path string) (idx int, sub string, ok bool) {
|
||||||
@@ -930,6 +968,37 @@ func normalizePowerSeries(ds []float64) []float64 {
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// psuSlotsFromSamples returns the sorted list of PSU slot numbers seen across samples.
|
||||||
|
func psuSlotsFromSamples(samples []platform.LiveMetricSample) []int {
|
||||||
|
seen := map[int]struct{}{}
|
||||||
|
for _, s := range samples {
|
||||||
|
for _, p := range s.PSUs {
|
||||||
|
seen[p.Slot] = struct{}{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
slots := make([]int, 0, len(seen))
|
||||||
|
for s := range seen {
|
||||||
|
slots = append(slots, s)
|
||||||
|
}
|
||||||
|
sort.Ints(slots)
|
||||||
|
return slots
|
||||||
|
}
|
||||||
|
|
||||||
|
// psuStackedTotal returns the point-by-point sum of all PSU datasets (for scale calculation).
|
||||||
|
func psuStackedTotal(datasets [][]float64) []float64 {
|
||||||
|
if len(datasets) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
n := len(datasets[0])
|
||||||
|
total := make([]float64, n)
|
||||||
|
for _, ds := range datasets {
|
||||||
|
for i, v := range ds {
|
||||||
|
total[i] += v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return total
|
||||||
|
}
|
||||||
|
|
||||||
func normalizeFanSeries(ds []float64) []float64 {
|
func normalizeFanSeries(ds []float64) []float64 {
|
||||||
if len(ds) == 0 {
|
if len(ds) == 0 {
|
||||||
return nil
|
return nil
|
||||||
|
|||||||
@@ -120,7 +120,7 @@ func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
datasets, names, labels, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
datasets, names, labels, title, _, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||||
if !ok {
|
if !ok {
|
||||||
t.Fatal("chartDataFromSamples returned ok=false")
|
t.Fatal("chartDataFromSamples returned ok=false")
|
||||||
}
|
}
|
||||||
@@ -164,7 +164,7 @@ func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
datasets, names, _, title, _, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||||
if !ok {
|
if !ok {
|
||||||
t.Fatal("chartDataFromSamples returned ok=false")
|
t.Fatal("chartDataFromSamples returned ok=false")
|
||||||
}
|
}
|
||||||
@@ -209,7 +209,7 @@ func TestChartDataFromSamplesIncludesGPUClockCharts(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
|
datasets, names, _, title, _, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
|
||||||
if !ok {
|
if !ok {
|
||||||
t.Fatal("gpu-all-clock returned ok=false")
|
t.Fatal("gpu-all-clock returned ok=false")
|
||||||
}
|
}
|
||||||
@@ -754,9 +754,9 @@ func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
|
|||||||
body := rec.Body.String()
|
body := rec.Body.String()
|
||||||
for _, needle := range []string{
|
for _, needle := range []string{
|
||||||
`NVIDIA Interconnect (NCCL)`,
|
`NVIDIA Interconnect (NCCL)`,
|
||||||
`Runs in Validate and Stress.`,
|
`Validate and Stress:`,
|
||||||
`NVIDIA Bandwidth (NVBandwidth)`,
|
`NVIDIA Bandwidth (NVBandwidth)`,
|
||||||
`Intended to stay short enough for Validate.`,
|
`nvbandwidth runs all built-in tests without a time limit`,
|
||||||
} {
|
} {
|
||||||
if !strings.Contains(body, needle) {
|
if !strings.Contains(body, needle) {
|
||||||
t.Fatalf("validate page missing %q: %s", needle, body)
|
t.Fatalf("validate page missing %q: %s", needle, body)
|
||||||
|
|||||||
@@ -171,21 +171,17 @@ func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeli
|
|||||||
}
|
}
|
||||||
return gpuDisplayLabel(idx) + " Overview", buf, true
|
return gpuDisplayLabel(idx) + " Overview", buf, true
|
||||||
}
|
}
|
||||||
datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
|
datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
|
||||||
if !ok {
|
if !ok {
|
||||||
return "", nil, false
|
return "", nil, false
|
||||||
}
|
}
|
||||||
buf, err := renderMetricChartSVG(
|
var buf []byte
|
||||||
title,
|
var err error
|
||||||
labels,
|
if stacked {
|
||||||
sampleTimes(samples),
|
buf, err = renderStackedMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
|
||||||
datasets,
|
} else {
|
||||||
names,
|
buf, err = renderMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMin, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
|
||||||
yMin,
|
}
|
||||||
yMax,
|
|
||||||
chartCanvasHeightForPath(path, len(names)),
|
|
||||||
timeline,
|
|
||||||
)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", nil, false
|
return "", nil, false
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user