Compare commits

4 Commits

| Author | SHA1 | Date |
|---|---|---|
| | 18e24a9aa5 | |
| | e306250da7 | |
| | c5b2081ac9 | |
| | 434528083e | |
```diff
@@ -94,9 +94,13 @@ var (
 )
 
 // benchmarkPrecisionPhases lists the precision categories run as individual
 // steady-state windows before the combined steady pass. Order is from lowest
 // to highest power draw so thermal ramp-up is gradual.
-var benchmarkPrecisionPhases = []string{"int8", "fp8", "fp16", "fp32", "fp64", "fp4"}
+//
+// fp64 and fp4 are intentionally disabled for now: both are currently unstable
+// on the target fleet and can abort the mixed steady stage after the earlier
+// phases already collected useful telemetry.
+var benchmarkPrecisionPhases = []string{"int8", "fp8", "fp16", "fp32"}
 
 func computeCapabilityCode(raw string) int {
     raw = strings.TrimSpace(raw)
```
```diff
@@ -124,6 +128,15 @@ func benchmarkSupportedPrecisions(computeCapability string) []string {
     return out
 }
 
+func benchmarkPrecisionEnabled(category string) bool {
+    switch category {
+    case "int8", "fp8", "fp16", "fp16_bf16", "fp32", "fp32_tf32":
+        return true
+    default:
+        return false
+    }
+}
+
 func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, precisions []string, metricStage func(string) string) (planLabels []string, planPhases []benchmarkPlannedPhase, basePhaseSec int, mixedPhaseSec int) {
     if len(precisions) == 0 {
         precisions = append([]string(nil), benchmarkPrecisionPhases...)
```
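Note: the new `benchmarkPrecisionEnabled` gate is the single switch every scoring path now goes through. A standalone sketch of how it composes with the old phase list (the switch body is copied from the hunk above; the surrounding harness is illustrative):

```go
package main

import "fmt"

// Reimplementation of the gate for illustration only; it mirrors the
// benchmarkPrecisionEnabled switch added in this diff.
func precisionEnabled(category string) bool {
	switch category {
	case "int8", "fp8", "fp16", "fp16_bf16", "fp32", "fp32_tf32":
		return true
	default:
		return false
	}
}

func main() {
	// The pre-change phase list, including the now-disabled categories.
	phases := []string{"int8", "fp8", "fp16", "fp32", "fp64", "fp4"}
	var enabled []string
	for _, p := range phases {
		if precisionEnabled(p) {
			enabled = append(enabled, p)
		}
	}
	fmt.Println(enabled) // [int8 fp8 fp16 fp32] — fp64 and fp4 are filtered out
}
```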
```diff
@@ -514,6 +527,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
             appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx), &metricTimelineSec, float64(spec.CooldownSec))
         }
 
+        applyBenchmarkSteadyFallback(&gpuResult)
         gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
         gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
         if anomaly := detectPowerAnomaly(metricRows, idx); anomaly != "" {
```
```diff
@@ -1108,6 +1122,7 @@ type benchmarkCoolingSample struct {
     AvgFanRPM             float64
     AvgFanDutyCyclePct    float64
     FanDutyCycleAvailable bool
+    FanDutyCycleEstimated bool
 }
 
 func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
@@ -1120,6 +1135,7 @@ func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
         samples[i].FanAvgRPM = fanSample.AvgFanRPM
         samples[i].FanDutyCyclePct = fanSample.AvgFanDutyCyclePct
         samples[i].FanDutyCycleAvailable = fanSample.FanDutyCycleAvailable
+        samples[i].FanDutyCycleEstimated = fanSample.FanDutyCycleEstimated
     }
     return samples, nil
 }
@@ -1127,11 +1143,12 @@ func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
 func sampleBenchmarkCoolingSample() benchmarkCoolingSample {
     fans, _ := sampleFanSpeeds()
     avgRPM, _, _ := fanRPMStats(fans)
-    dutyPct, dutyAvailable := sampleFanDutyCyclePct()
+    dutyPct, dutyAvailable, dutyEstimated := sampleFanDutyCyclePctFromFans(fans)
     return benchmarkCoolingSample{
         AvgFanRPM:             avgRPM,
         AvgFanDutyCyclePct:    dutyPct,
         FanDutyCycleAvailable: dutyAvailable,
+        FanDutyCycleEstimated: dutyEstimated,
     }
 }
 
```
```diff
@@ -1373,44 +1390,91 @@ func summarizeBenchmarkCooling(rows []GPUMetricRow) *BenchmarkCoolingSummary {
     }
     var rpmValues []float64
     var dutyValues []float64
+    var dutyEstimated bool
     for _, row := range rows {
         if row.FanAvgRPM > 0 {
             rpmValues = append(rpmValues, row.FanAvgRPM)
         }
         if row.FanDutyCycleAvailable {
             dutyValues = append(dutyValues, row.FanDutyCyclePct)
+            if row.FanDutyCycleEstimated {
+                dutyEstimated = true
+            }
         }
     }
     if len(rpmValues) == 0 && len(dutyValues) == 0 {
         return nil
     }
     summary := &BenchmarkCoolingSummary{
         Available:             true,
         AvgFanRPM:             benchmarkMean(rpmValues),
+        FanDutyCycleEstimated: dutyEstimated,
     }
     if len(dutyValues) > 0 {
         summary.FanDutyCycleAvailable = true
         summary.AvgFanDutyCyclePct = benchmarkMean(dutyValues)
         summary.P95FanDutyCyclePct = benchmarkPercentile(dutyValues, 95)
+        if summary.FanDutyCycleEstimated {
+            summary.Notes = append(summary.Notes, "fan duty cycle is estimated from the highest fan RPM observed since boot; treat it as an approximation, not a direct PWM reading")
+        }
     } else {
         summary.Notes = append(summary.Notes, "fan duty cycle unavailable on this host; RPM-only fan telemetry was collected")
     }
     return summary
 }
 
+func benchmarkTelemetryAvailable(summary BenchmarkTelemetrySummary) bool {
+    return summary.Samples > 0 || summary.DurationSec > 0
+}
+
+func benchmarkPrecisionSteadyFallback(phases []BenchmarkPrecisionSteadyPhase) (BenchmarkTelemetrySummary, string, bool) {
+    var (
+        best      BenchmarkTelemetrySummary
+        bestLabel string
+        found     bool
+    )
+    for _, phase := range phases {
+        if !benchmarkTelemetryAvailable(phase.Steady) {
+            continue
+        }
+        if !found ||
+            phase.Steady.DurationSec > best.DurationSec ||
+            (phase.Steady.DurationSec == best.DurationSec && phase.Steady.P95PowerW > best.P95PowerW) {
+            best = phase.Steady
+            bestLabel = phase.Precision
+            found = true
+        }
+    }
+    return best, bestLabel, found
+}
+
+func applyBenchmarkSteadyFallback(gpu *BenchmarkGPUResult) {
+    if gpu == nil || benchmarkTelemetryAvailable(gpu.Steady) {
+        return
+    }
+    if fallback, label, ok := benchmarkPrecisionSteadyFallback(gpu.PrecisionSteady); ok {
+        gpu.Steady = fallback
+        gpu.Notes = append(gpu.Notes,
+            fmt.Sprintf("mixed steady telemetry unavailable; reporting steady-state fallback from %s precision phase", label))
+    }
+}
+
 func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
     score := BenchmarkScorecard{}
 
     // SyntheticScore: sum of fp32-equivalent TOPS from per-precision phases.
     // Each precision ran alone with full GPU dedicated — peak capability.
     for _, p := range gpu.PrecisionSteady {
+        if !benchmarkPrecisionEnabled(p.Precision) {
+            continue
+        }
         score.SyntheticScore += p.WeightedTeraOpsPerSec
     }
 
     // MixedScore: sum of fp32-equivalent TOPS from the combined phase.
     // All precisions compete simultaneously — closer to real inference workloads.
     for _, p := range gpu.PrecisionResults {
-        if p.Supported {
+        if p.Supported && benchmarkPrecisionEnabled(p.Category) {
             score.MixedScore += p.WeightedTeraOpsPerSec
         }
     }
```
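Note: the fallback picks the phase with the longest steady window and breaks ties on higher P95 power, so a short but hot window never outranks a longer one. A self-contained sketch of the same selection rule with invented phases (`Phase` stands in for `BenchmarkPrecisionSteadyPhase`):

```go
package main

import "fmt"

// Phase is a stand-in for BenchmarkPrecisionSteadyPhase. The selection mirrors
// benchmarkPrecisionSteadyFallback: prefer the longest steady window, break
// ties with the higher P95 power.
type Phase struct {
	Precision   string
	DurationSec float64
	P95PowerW   float64
}

func pickFallback(phases []Phase) (Phase, bool) {
	var best Phase
	found := false
	for _, p := range phases {
		if p.DurationSec <= 0 {
			continue // no usable telemetry for this phase
		}
		if !found || p.DurationSec > best.DurationSec ||
			(p.DurationSec == best.DurationSec && p.P95PowerW > best.P95PowerW) {
			best, found = p, true
		}
	}
	return best, found
}

func main() {
	phases := []Phase{
		{"int8", 60, 310},
		{"fp16", 60, 420}, // same duration, higher P95 power: wins
		{"fp32", 0, 0},    // aborted phase, skipped
	}
	if best, ok := pickFallback(phases); ok {
		fmt.Println("fallback from", best.Precision) // fallback from fp16
	}
}
```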
```diff
@@ -1441,10 +1505,17 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
     // so CV reflects genuine power regulation, not workload switching).
     if len(gpu.PrecisionSteady) > 0 {
         var sum float64
+        var count int
         for _, p := range gpu.PrecisionSteady {
+            if !benchmarkPrecisionEnabled(p.Precision) {
+                continue
+            }
             sum += clampScore(100 - p.Steady.PowerCVPct*3)
+            count++
+        }
+        if count > 0 {
+            score.PowerSustainScore = sum / float64(count)
         }
-        score.PowerSustainScore = sum / float64(len(gpu.PrecisionSteady))
     } else if gpu.Steady.PowerCVPct > 0 {
         score.PowerSustainScore = clampScore(100 - gpu.Steady.PowerCVPct*3)
     }
```
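Note: to get a feel for the `100 - CV×3` scale used here, a power CV of 5 % maps to a sustain score of 85, and anything above roughly 33 % clamps to 0. A minimal sketch, assuming `clampScore` bounds its argument to [0, 100] as the name suggests:

```go
package main

import "fmt"

// clamp mirrors what clampScore presumably does: bound a score to [0, 100].
func clamp(v float64) float64 {
	if v < 0 {
		return 0
	}
	if v > 100 {
		return 100
	}
	return v
}

func main() {
	for _, cvPct := range []float64{1, 5, 12, 40} {
		fmt.Printf("power CV %.0f %% -> sustain score %.0f\n", cvPct, clamp(100-cvPct*3))
	}
	// power CV 1 %  -> 97
	// power CV 5 %  -> 85
	// power CV 12 % -> 64
	// power CV 40 % -> 0 (clamped)
}
```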
```diff
@@ -2512,6 +2583,7 @@ func runNvidiaBenchmarkParallel(
     // Score and finalize each GPU.
     for _, idx := range selected {
         r := gpuResults[idx]
+        applyBenchmarkSteadyFallback(r)
         r.Scores = scoreBenchmarkGPUResult(*r)
         r.DegradationReasons = detectBenchmarkDegradationReasons(*r, result.Normalization.Status)
         pr := parseResults[idx]
```
```diff
@@ -2694,18 +2766,21 @@ func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
     return cl
 }
 
-// runBenchmarkPowerCalibration runs targeted_power per GPU and actively watches
-// throttle counters. If a GPU starts throttling, the current targeted_power run
-// is canceled immediately, the power limit is reduced, and a fresh full cycle
-// is started again from the beginning. The selected reduced power limit stays
-// active for the main benchmark and is restored by the caller afterwards.
+// runBenchmarkPowerCalibration runs targeted_power for the supplied GPU set and
+// actively watches throttle counters. seedLimits, when provided, are treated as
+// the starting point for this calibration pass rather than as immutable fixed
+// limits. This matters during cumulative ramp-up: once an additional GPU is
+// introduced, every already-active GPU must be revalidated under the new
+// thermal state instead of assuming its previous single-step limit is still
+// valid. The selected reduced power limits stay active for the main benchmark
+// and are restored by the caller afterwards.
 func runBenchmarkPowerCalibration(
     ctx context.Context,
     verboseLog, runDir string,
     gpuIndices []int,
     infoByIndex map[int]benchmarkGPUInfo,
     logFunc func(string),
-    fixedLimits map[int]int,
+    seedLimits map[int]int,
 ) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) {
     const calibDurationSec = 120
     const maxDerateW = 150
```
```diff
@@ -2739,7 +2814,6 @@ func runBenchmarkPowerCalibration(
         err error
     }
 
-
     // gpuCalibState holds per-GPU binary search state during parallel calibration.
     type gpuCalibState struct {
         idx int
```
```diff
@@ -2796,19 +2870,20 @@ func runBenchmarkPowerCalibration(
             hi:    appliedLimitW + 1, // not yet tested, not yet confirmed unstable
             calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
         }
-        if fixedLimits != nil {
-            if fixedW, ok := fixedLimits[idx]; ok {
-                // This GPU's limit was established in a prior ramp step and must
-                // remain unchanged. Apply it immediately and skip the binary search.
-                if canDerate && fixedW > 0 {
-                    _ = setBenchmarkPowerLimit(ctx, verboseLog, idx, fixedW)
+        if seedLimits != nil {
+            if seedW, ok := seedLimits[idx]; ok && seedW > 0 {
+                // A previously validated limit is only a starting point. Re-run
+                // targeted_power under the current multi-GPU thermal load and derate
+                // again if this step shows new throttling.
+                if canDerate {
+                    _ = setBenchmarkPowerLimit(ctx, verboseLog, idx, seedW)
                 }
-                s.appliedLimitW = fixedW
-                s.calib.AppliedPowerLimitW = float64(fixedW)
-                s.calib.Completed = true
-                s.converged = true
+                s.appliedLimitW = seedW
+                s.hi = seedW + 1
+                s.calib.AppliedPowerLimitW = float64(seedW)
+                s.calib.Derated = seedW < s.originalLimitW
                 s.calib.Notes = append(s.calib.Notes,
-                    fmt.Sprintf("fixed limit: %d W (held from prior ramp step)", fixedW))
+                    fmt.Sprintf("seed limit: %d W (revalidating under current thermal load)", seedW))
             }
         }
         states = append(states, s)
```
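Note: the seeding arithmetic is the interesting part. Reading `hi` as an exclusive upper bound (per the "not yet confirmed unstable" comment), setting `hi = seedW + 1` caps the re-run search at the prior limit, and `Derated` is computed against the factory limit rather than the seed. A tiny sketch of that arithmetic with invented wattages:

```go
package main

import "fmt"

func main() {
	// Illustrative values only; the real state lives in gpuCalibState.
	originalLimitW := 450 // factory limit reported by the driver
	seedW := 400          // carried over from the previous ramp step
	hi := seedW + 1       // exclusive bound: the search can hold or lose watts, never gain
	derated := seedW < originalLimitW
	fmt.Println(hi, derated) // 401 true
}
```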
```diff
@@ -3091,7 +3166,6 @@ func powerBenchDurationSec(profile string) int {
     }
 }
 
-
 func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo {
     out := make(map[int]benchmarkGPUInfo, len(src))
     for k, v := range src {
```
```diff
@@ -3107,7 +3181,42 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
     fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
     fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
     fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
-    fmt.Fprintf(&b, "**Platform max TDP:** %.0f W \n\n", result.PlatformMaxTDPW)
+    fmt.Fprintf(&b, "**Platform max TDP (GPU-reported):** %.0f W \n", result.PlatformMaxTDPW)
+    if sp := result.ServerPower; sp != nil && sp.Available {
+        fmt.Fprintf(&b, "**Server power delta (IPMI):** %.0f W \n", sp.DeltaW)
+        fmt.Fprintf(&b, "**Reporting ratio (IPMI Δ / GPU sum):** %.2f \n", sp.ReportingRatio)
+    }
+    b.WriteString("\n")
+
+    // Server power comparison table.
+    if sp := result.ServerPower; sp != nil {
+        b.WriteString("## Server vs GPU Power Comparison\n\n")
+        b.WriteString("| Metric | Value |\n")
+        b.WriteString("|--------|-------|\n")
+        fmt.Fprintf(&b, "| GPU stable limits sum (nvidia-smi) | %.0f W |\n", result.PlatformMaxTDPW)
+        if sp.Available {
+            fmt.Fprintf(&b, "| Server idle power (IPMI) | %.0f W |\n", sp.IdleW)
+            fmt.Fprintf(&b, "| Server loaded power (IPMI) | %.0f W |\n", sp.LoadedW)
+            fmt.Fprintf(&b, "| Server Δ power (loaded − idle) | %.0f W |\n", sp.DeltaW)
+            ratio := sp.ReportingRatio
+            ratioNote := ""
+            switch {
+            case ratio >= 0.9:
+                ratioNote = "✓ GPU telemetry matches server power"
+            case ratio >= 0.75:
+                ratioNote = "⚠ minor discrepancy — GPU may slightly over-report TDP"
+            default:
+                ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
+            }
+            fmt.Fprintf(&b, "| Reporting ratio (IPMI Δ / GPU sum) | %.2f — %s |\n", ratio, ratioNote)
+        } else {
+            b.WriteString("| IPMI availability | not available — IPMI not supported or ipmitool not found |\n")
+        }
+        for _, note := range sp.Notes {
+            fmt.Fprintf(&b, "\n> %s\n", note)
+        }
+        b.WriteString("\n")
+    }
+
     if len(result.Findings) > 0 {
         b.WriteString("## Summary\n\n")
         for _, finding := range result.Findings {
```
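Note: a worked example of the ratio bands, with invented readings: idle 480 W and loaded 3680 W give Δ = 3200 W, and against a GPU-reported sum of 4000 W that is a ratio of 0.80, landing in the middle band. The thresholds below are copied from the hunk; every wattage is illustrative:

```go
package main

import "fmt"

// classify re-derives the ratio bands used by the report renderer above.
func classify(ratio float64) string {
	switch {
	case ratio >= 0.9:
		return "✓ GPU telemetry matches server power"
	case ratio >= 0.75:
		return "⚠ minor discrepancy — GPU may slightly over-report TDP"
	default:
		return "✗ significant discrepancy — GPU over-reports TDP vs wall power"
	}
}

func main() {
	idleW, loadedW, gpuSumW := 480.0, 3680.0, 4000.0
	ratio := (loadedW - idleW) / gpuSumW
	fmt.Printf("ratio %.2f: %s\n", ratio, classify(ratio))
	// ratio 0.80: ⚠ minor discrepancy — GPU may slightly over-report TDP
}
```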
```diff
@@ -3181,6 +3290,12 @@ func renderPowerBenchSummary(result NvidiaPowerBenchResult) string {
             fmt.Fprintf(&b, "gpu_%d_stable_limit_w=%.0f\n", gpu.Index, gpu.StablePowerLimitW)
         }
     }
+    if sp := result.ServerPower; sp != nil && sp.Available {
+        fmt.Fprintf(&b, "server_idle_w=%.0f\n", sp.IdleW)
+        fmt.Fprintf(&b, "server_loaded_w=%.0f\n", sp.LoadedW)
+        fmt.Fprintf(&b, "server_delta_w=%.0f\n", sp.DeltaW)
+        fmt.Fprintf(&b, "server_reporting_ratio=%.2f\n", sp.ReportingRatio)
+    }
     return b.String()
 }
 
```
```diff
@@ -3224,6 +3339,16 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
     }
     durationSec := powerBenchDurationSec(opts.Profile)
     _ = durationSec
+
+    // Sample IPMI idle power before any GPU load.
+    var serverIdleW float64
+    var serverIdleOK bool
+    if w, ok := sampleIPMIPowerSeries(ctx, 10); ok {
+        serverIdleW = w
+        serverIdleOK = true
+        logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
+    }
+
     // Phase 1: calibrate each GPU individually (sequentially, one at a time) to
     // establish a true single-card power baseline unaffected by neighbour heat.
     calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
@@ -3320,20 +3445,35 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
     // stableLimits accumulates GPU index → fixed stable limit (W) across steps.
     stableLimits := make(map[int]int, len(result.RecommendedSlotOrder))
+
+    // Start an IPMI sampling goroutine that runs throughout Phase 2 to capture
+    // server-side loaded power while GPUs are under stress. The goroutine is
+    // cancelled as soon as Phase 2 finishes, and the average is used to compare
+    // against PlatformMaxTDPW (GPU-reported stable limits sum).
+    var serverLoadedW float64
+    var serverLoadedOK bool
+    ipmiPhase2Ctx, ipmiPhase2Cancel := context.WithCancel(ctx)
+    ipmiPhase2Done := make(chan float64, 1)
+    go func() {
+        defer close(ipmiPhase2Done)
+        if w, ok := sampleIPMIPowerSeries(ipmiPhase2Ctx, 3600); ok {
+            ipmiPhase2Done <- w
+        }
+    }()
+
     // Step 1: reuse single-card calibration result directly.
     if len(result.RecommendedSlotOrder) > 0 {
         firstIdx := result.RecommendedSlotOrder[0]
         firstCalib := calibByIndex[firstIdx]
         stableLimits[firstIdx] = int(math.Round(firstCalib.AppliedPowerLimitW))
         ramp := NvidiaPowerBenchStep{
             StepIndex:           1,
             GPUIndices:          []int{firstIdx},
             NewGPUIndex:         firstIdx,
             NewGPUStableLimitW:  firstCalib.AppliedPowerLimitW,
             TotalObservedPowerW: firstCalib.Summary.P95PowerW,
             AvgObservedPowerW:   firstCalib.Summary.P95PowerW,
             Derated:             firstCalib.Derated,
             Status:              "OK",
         }
         if !firstCalib.Completed {
             ramp.Status = "FAILED"
```
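Note: the goroutine-plus-buffered-channel shape is the load-bearing detail here. `defer close(ipmiPhase2Done)` guarantees the later receive never blocks even when sampling fails, and the buffer of 1 lets the sender finish after cancellation. A stripped-down, runnable sketch of the same pattern, where `samplePower` is only a stand-in for `sampleIPMIPowerSeries` (averaging until cancelled, with a compressed tick):

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// samplePower is a placeholder for sampleIPMIPowerSeries: it averages readings
// until the context is cancelled or the budget elapses, reporting ok=false
// when nothing was collected.
func samplePower(ctx context.Context, budgetSec int) (float64, bool) {
	var sum float64
	var n int
	for i := 0; i < budgetSec; i++ {
		select {
		case <-ctx.Done():
			if n == 0 {
				return 0, false
			}
			return sum / float64(n), true
		case <-time.After(10 * time.Millisecond): // compressed stand-in for a 1 s tick
			sum += 3200 // pretend IPMI reading, W
			n++
		}
	}
	return sum / float64(n), true
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	done := make(chan float64, 1) // buffered: the sender never blocks after cancel
	go func() {
		defer close(done) // closing makes the receive below non-blocking on failure
		if w, ok := samplePower(ctx, 3600); ok {
			done <- w
		}
	}()
	time.Sleep(50 * time.Millisecond) // Phase 2 workload would run here
	cancel()
	if w, ok := <-done; ok {
		fmt.Printf("loaded power: %.0f W\n", w)
	}
}
```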
```diff
@@ -3351,8 +3491,9 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
             len(result.RecommendedSlotOrder), firstIdx, firstCalib.AppliedPowerLimitW))
     }
 
-    // Steps 2..N: each step fixes previously calibrated GPUs and searches only
-    // the new GPU's stable limit in the combined thermal environment.
+    // Steps 2..N: each step revalidates every already-active GPU under the new
+    // cumulative thermal environment and also calibrates the newly introduced
+    // GPU. Previously found limits are used only as seeds for the search.
     for stepNum := 1; stepNum < len(result.RecommendedSlotOrder); stepNum++ {
         step := stepNum + 1
         subset := append([]int(nil), result.RecommendedSlotOrder[:step]...)
@@ -3360,17 +3501,18 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
         stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step))
         _ = os.MkdirAll(stepDir, 0755)
 
-        // All previously calibrated GPUs are fixed at their stable limits.
-        fixedForStep := make(map[int]int, len(stableLimits))
+        // Reuse the latest stable limits as starting points, but re-check every
+        // active GPU in this hotter configuration.
+        seedForStep := make(map[int]int, len(stableLimits))
         for k, v := range stableLimits {
-            fixedForStep[k] = v
+            seedForStep[k] = v
         }
 
-        logFunc(fmt.Sprintf("power ramp: step %d/%d — calibrating GPU %d with %d fixed GPU(s)",
-            step, len(result.RecommendedSlotOrder), newGPUIdx, len(fixedForStep)))
+        logFunc(fmt.Sprintf("power ramp: step %d/%d — revalidating %d active GPU(s) including new GPU %d",
+            step, len(result.RecommendedSlotOrder), len(subset), newGPUIdx))
 
         stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
-        stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, fixedForStep)
+        stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep)
         // Accumulate restore actions; they all run in the outer defer.
         allRestoreActions = append(allRestoreActions, stepRestore...)
 
```
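Note: how limits flow between steps, in miniature. Each step copies the accumulated `stableLimits` into that step's seeds, so a GPU derated in a hotter configuration starts the next search from the reduced value. The wattages below are invented:

```go
package main

import "fmt"

// Toy walk-through of the seed flow: the maps stand in for stableLimits and
// seedForStep from the diff.
func main() {
	stableLimits := map[int]int{0: 400} // after step 1: GPU 0 held 400 W alone

	// Step 2 introduces GPU 1; every active GPU is revalidated from its seed.
	seedForStep := make(map[int]int, len(stableLimits))
	for k, v := range stableLimits {
		seedForStep[k] = v
	}

	// Suppose revalidation under the hotter 2-GPU configuration derates GPU 0
	// to 375 W and finds 390 W for the new GPU 1 (illustrative numbers).
	stableLimits[0] = 375
	stableLimits[1] = 390

	fmt.Println(seedForStep, stableLimits) // map[0:400] map[0:375 1:390]
}
```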
```diff
@@ -3391,36 +3533,72 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
         ramp.AvgObservedPowerW = ramp.TotalObservedPowerW / float64(len(subset))
     }
 
-    // Determine stable limit for the new GPU.
-    if c, ok := stepCalib[newGPUIdx]; ok && c.Completed {
-        stableLimits[newGPUIdx] = int(math.Round(c.AppliedPowerLimitW))
-        ramp.NewGPUStableLimitW = c.AppliedPowerLimitW
-        ramp.Derated = c.Derated
+    for _, idx := range subset {
+        c, ok := stepCalib[idx]
+        if !ok || !c.Completed {
+            fallback := 0
+            if lim, ok := stableLimits[idx]; ok && lim > 0 {
+                fallback = lim
+            } else if fb, ok := calibByIndex[idx]; ok {
+                fallback = int(math.Round(fb.AppliedPowerLimitW))
+            }
+            if fallback > 0 {
+                stableLimits[idx] = fallback
+            }
+            ramp.Status = "FAILED"
+            ramp.Notes = append(ramp.Notes,
+                fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d; keeping previous stable limit %d W", idx, step, fallback))
+            result.OverallStatus = "PARTIAL"
+            continue
+        }
+
+        prevLimit, hadPrev := stableLimits[idx]
+        newLimit := int(math.Round(c.AppliedPowerLimitW))
+        stableLimits[idx] = newLimit
+        if idx == newGPUIdx {
+            ramp.NewGPUStableLimitW = c.AppliedPowerLimitW
+            ramp.Derated = c.Derated
+        }
         if c.Derated {
             ramp.Status = "PARTIAL"
             if result.OverallStatus == "OK" {
                 result.OverallStatus = "PARTIAL"
             }
-            result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
         }
-    } else {
-        // Calibration failed — fall back to single-card limit.
-        fb := calibByIndex[newGPUIdx]
-        stableLimits[newGPUIdx] = int(math.Round(fb.AppliedPowerLimitW))
-        ramp.NewGPUStableLimitW = fb.AppliedPowerLimitW
-        ramp.Status = "FAILED"
-        ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d; using single-card limit %.0f W", newGPUIdx, step, fb.AppliedPowerLimitW))
-        result.OverallStatus = "PARTIAL"
+        if hadPrev && newLimit < prevLimit {
+            ramp.Notes = append(ramp.Notes,
+                fmt.Sprintf("GPU %d was re-derated from %d W to %d W under combined thermal load.", idx, prevLimit, newLimit))
+        }
+    }
+
+    if c, ok := stepCalib[newGPUIdx]; ok && c.Completed && c.Derated {
+        result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
     }
 
     result.RampSteps = append(result.RampSteps, ramp)
 }
+
+// Stop IPMI Phase 2 sampling and collect result.
+ipmiPhase2Cancel()
+if w, ok := <-ipmiPhase2Done; ok {
+    serverLoadedW = w
+    serverLoadedOK = true
+    logFunc(fmt.Sprintf("server loaded power (IPMI, Phase 2 avg): %.0f W", w))
+}
+
 // Populate StablePowerLimitW on each GPU entry from the accumulated stable limits.
 for i := range result.GPUs {
     if lim, ok := stableLimits[result.GPUs[i].Index]; ok {
         result.GPUs[i].StablePowerLimitW = float64(lim)
     }
+    if result.GPUs[i].StablePowerLimitW > 0 && result.GPUs[i].AppliedPowerLimitW > 0 &&
+        result.GPUs[i].StablePowerLimitW < result.GPUs[i].AppliedPowerLimitW {
+        result.GPUs[i].Derated = true
+        result.Findings = append(result.Findings, fmt.Sprintf(
+            "GPU %d required additional derating from %.0f W (single-card) to %.0f W under full-system thermal load.",
+            result.GPUs[i].Index, result.GPUs[i].AppliedPowerLimitW, result.GPUs[i].StablePowerLimitW,
+        ))
+    }
 }
 
 // PlatformMaxTDPW = sum of all stable limits — the actual sustained power
@@ -3428,6 +3606,13 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
     for _, lim := range stableLimits {
         result.PlatformMaxTDPW += float64(lim)
     }
+
+    // Characterize server power from IPMI idle/loaded samples.
+    // GPUReportedSumW = PlatformMaxTDPW (sum of stable GPU limits, nvidia-smi).
+    // ReportingRatio = IPMI_delta / GPU_reported_sum:
+    // ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP.
+    _ = serverIdleOK // used implicitly via characterizeServerPower
+    result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK)
     resultJSON, err := json.MarshalIndent(result, "", " ")
     if err != nil {
         return "", fmt.Errorf("marshal power result: %w", err)
```
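Note: the failure path now cascades through two fallbacks before giving up: the limit from an earlier ramp step, then the single-card calibration result. A self-contained sketch of that cascade with illustrative maps:

```go
package main

import "fmt"

// fallbackLimit mirrors the cascade above: prefer the limit from an earlier
// step, else the single-card result, else leave it at zero. The maps stand in
// for stableLimits and calibByIndex.
func fallbackLimit(idx int, stableLimits, singleCard map[int]int) int {
	if lim, ok := stableLimits[idx]; ok && lim > 0 {
		return lim
	}
	if fb, ok := singleCard[idx]; ok {
		return fb
	}
	return 0
}

func main() {
	stable := map[int]int{0: 375}
	single := map[int]int{0: 400, 1: 410}
	fmt.Println(fallbackLimit(0, stable, single)) // 375 (kept from earlier step)
	fmt.Println(fallbackLimit(1, stable, single)) // 410 (single-card baseline)
	fmt.Println(fallbackLimit(2, stable, single)) // 0 (never calibrated)
}
```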
```diff
@@ -261,14 +261,18 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
     b.WriteString("\n")
 
     // Steady-state telemetry
-    fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
-    b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
-    fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
-    fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
-    fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
-    fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
-    fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
-    b.WriteString("\n")
+    if benchmarkTelemetryAvailable(gpu.Steady) {
+        fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
+        b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
+        fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
+        fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
+        fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
+        fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
+        fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
+        b.WriteString("\n")
+    } else {
+        b.WriteString("**Steady-state telemetry:** unavailable\n\n")
+    }
 
     // Per-precision stability phases.
     if len(gpu.PrecisionSteady) > 0 {
```
```diff
@@ -49,8 +49,8 @@ func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
         benchmarkPrecisionPhases,
         func(label string) string { return label },
     )
-    if len(labels) != 7 || len(phases) != 7 {
-        t.Fatalf("labels=%d phases=%d want 7", len(labels), len(phases))
+    if len(labels) != 5 || len(phases) != 5 {
+        t.Fatalf("labels=%d phases=%d want 5", len(labels), len(phases))
     }
     if basePhaseSec != 60 {
         t.Fatalf("basePhaseSec=%d want 60", basePhaseSec)
@@ -61,7 +61,7 @@ func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
     if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 {
         t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1])
     }
-    if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,60,60,300" {
+    if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,300" {
         t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
     }
 }
@@ -80,7 +80,7 @@ func TestBuildBenchmarkSteadyPlanStability(t *testing.T) {
     if mixedPhaseSec != 3600 {
         t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec)
     }
-    if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,300,300,3600" {
+    if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,3600" {
        t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
     }
 }
@@ -99,7 +99,7 @@ func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) {
     if mixedPhaseSec != 14400 {
         t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec)
     }
-    if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,3600,3600,14400" {
+    if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,14400" {
         t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
     }
 }
@@ -133,10 +133,10 @@ func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) {
 func TestBenchmarkSupportedPrecisionsSkipsFP4BeforeBlackwell(t *testing.T) {
     t.Parallel()
 
-    if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64" {
+    if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
         t.Fatalf("supported=%v", got)
     }
-    if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64,fp4" {
+    if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
         t.Fatalf("supported=%v", got)
     }
 }
@@ -314,6 +314,30 @@ func TestRenderBenchmarkReportListsUnifiedArtifacts(t *testing.T) {
     }
 }
 
+func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
+    t.Parallel()
+
+    score := scoreBenchmarkGPUResult(BenchmarkGPUResult{
+        PrecisionSteady: []BenchmarkPrecisionSteadyPhase{
+            {Precision: "fp16", WeightedTeraOpsPerSec: 100},
+            {Precision: "fp64", WeightedTeraOpsPerSec: 999},
+            {Precision: "fp4", WeightedTeraOpsPerSec: 999},
+        },
+        PrecisionResults: []BenchmarkPrecisionResult{
+            {Category: "fp32_tf32", Supported: true, WeightedTeraOpsPerSec: 50},
+            {Category: "fp64", Supported: true, WeightedTeraOpsPerSec: 999},
+            {Category: "fp4", Supported: true, WeightedTeraOpsPerSec: 999},
+        },
+    })
+
+    if score.SyntheticScore != 100 {
+        t.Fatalf("SyntheticScore=%f want 100", score.SyntheticScore)
+    }
+    if score.MixedScore != 50 {
+        t.Fatalf("MixedScore=%f want 50", score.MixedScore)
+    }
+}
+
 func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
     t.Parallel()
 
```
```diff
@@ -31,6 +31,7 @@ type BenchmarkCoolingSummary struct {
     Available             bool     `json:"available"`
     AvgFanRPM             float64  `json:"avg_fan_rpm,omitempty"`
     FanDutyCycleAvailable bool     `json:"fan_duty_cycle_available,omitempty"`
+    FanDutyCycleEstimated bool     `json:"fan_duty_cycle_estimated,omitempty"`
     AvgFanDutyCyclePct    float64  `json:"avg_fan_duty_cycle_pct,omitempty"`
     P95FanDutyCyclePct    float64  `json:"p95_fan_duty_cycle_pct,omitempty"`
     Notes                 []string `json:"notes,omitempty"`
@@ -55,32 +56,32 @@ type NvidiaBenchmarkOptions struct {
 }
 
 type NvidiaBenchmarkResult struct {
     BenchmarkVersion string    `json:"benchmark_version"`
     GeneratedAt      time.Time `json:"generated_at"`
     Hostname         string    `json:"hostname,omitempty"`
     ServerModel      string    `json:"server_model,omitempty"`
     BenchmarkProfile string    `json:"benchmark_profile"`
     ParallelGPUs     bool      `json:"parallel_gpus,omitempty"`
     RampStep         int       `json:"ramp_step,omitempty"`
     RampTotal        int       `json:"ramp_total,omitempty"`
     RampRunID        string    `json:"ramp_run_id,omitempty"`
     ScalabilityScore float64   `json:"scalability_score,omitempty"`
     // PlatformPowerScore is the mean compute scalability across ramp steps 2..N.
     // 100% = each added GPU contributes exactly its single-card throughput.
     // < 100% = throughput loss due to thermal throttle, power limits, or contention.
     PlatformPowerScore   float64                      `json:"platform_power_score,omitempty"`
     PerformanceRampSteps []NvidiaPerformanceRampStep  `json:"performance_ramp_steps,omitempty"`
     OverallStatus        string                       `json:"overall_status"`
     SelectedGPUIndices   []int                        `json:"selected_gpu_indices"`
     Findings             []string                     `json:"findings,omitempty"`
     Warnings             []string                     `json:"warnings,omitempty"`
     Normalization        BenchmarkNormalization       `json:"normalization"`
     HostConfig           *BenchmarkHostConfig         `json:"host_config,omitempty"`
     CPULoad              *BenchmarkCPULoad            `json:"cpu_load,omitempty"`
     Cooling              *BenchmarkCoolingSummary     `json:"cooling,omitempty"`
     GPUs                 []BenchmarkGPUResult         `json:"gpus"`
     Interconnect         *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
     ServerPower          *BenchmarkServerPower        `json:"server_power,omitempty"`
 }
 
 type BenchmarkNormalization struct {
@@ -223,8 +224,8 @@ type BenchmarkScorecard struct {
 
     // Throttle breakdown — percentage of steady-state time in each throttle type.
     // Used for diagnosis: tells WHY the GPU throttled, not just whether it did.
     ThermalThrottlePct   float64 `json:"thermal_throttle_pct"`   // HW+SW thermal slowdown
     PowerCapThrottlePct  float64 `json:"power_cap_throttle_pct"` // SW power cap
     SyncBoostThrottlePct float64 `json:"sync_boost_throttle_pct,omitempty"`
 
     // Temperature headroom: distance to the 100°C destruction threshold.
@@ -300,18 +301,22 @@ type NvidiaPowerBenchResult struct {
     // PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
     // cumulative thermal ramp. Represents the actual sustained power budget of
     // this server under full GPU load. Use for rack power planning.
     PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
-    Findings []string              `json:"findings,omitempty"`
-    GPUs     []NvidiaPowerBenchGPU `json:"gpus"`
+    // ServerPower captures IPMI server power delta (idle→loaded) measured in
+    // parallel with the thermal ramp. Use to compare GPU-reported TDP against
+    // actual wall-power draw as seen by the server's power supply.
+    ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
+    Findings    []string              `json:"findings,omitempty"`
+    GPUs        []NvidiaPowerBenchGPU `json:"gpus"`
 }
 
 type NvidiaPowerBenchGPU struct {
     Index int    `json:"index"`
     Name  string `json:"name,omitempty"`
     BusID string `json:"bus_id,omitempty"`
     DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
     // AppliedPowerLimitW is the stable limit found during single-card calibration.
     AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
     // StablePowerLimitW is the final fixed limit for this GPU after the
     // cumulative thermal ramp. This is the limit at which the GPU operated
     // stably with all other GPUs running simultaneously at their own limits.
@@ -329,10 +334,10 @@ type NvidiaPowerBenchGPU struct {
 }
 
 type NvidiaPowerBenchStep struct {
     StepIndex  int   `json:"step_index"`
     GPUIndices []int `json:"gpu_indices"`
     // NewGPUIndex is the GPU whose stable limit was searched in this step.
     NewGPUIndex int `json:"new_gpu_index"`
     // NewGPUStableLimitW is the stable power limit found for the new GPU.
     NewGPUStableLimitW  float64 `json:"new_gpu_stable_limit_w,omitempty"`
     TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"`
@@ -345,15 +350,15 @@ type NvidiaPowerBenchStep struct {
 // NvidiaPerformanceRampStep holds per-step performance data for the
 // scalability ramp-up phase of the performance benchmark.
 type NvidiaPerformanceRampStep struct {
     StepIndex  int   `json:"step_index"`
     GPUIndices []int `json:"gpu_indices"`
     // TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent
     // TOPS from dedicated single-precision phases) across all GPUs in this step.
     TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
     TotalMixedTOPS     float64 `json:"total_mixed_tops,omitempty"`
     // ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100.
     // 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss.
     ScalabilityPct float64  `json:"scalability_pct"`
     Status         string   `json:"status"`
     Notes          []string `json:"notes,omitempty"`
 }
```
```diff
@@ -27,6 +27,7 @@ type GPUMetricRow struct {
     FanAvgRPM             float64 `json:"fan_avg_rpm,omitempty"`
     FanDutyCyclePct       float64 `json:"fan_duty_cycle_pct,omitempty"`
     FanDutyCycleAvailable bool    `json:"fan_duty_cycle_available,omitempty"`
+    FanDutyCycleEstimated bool    `json:"fan_duty_cycle_estimated,omitempty"`
 }
 
 // sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
@@ -147,14 +148,18 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
 // WriteGPUMetricsCSV writes collected rows as a CSV file.
 func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
     var b bytes.Buffer
-    b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available\n")
+    b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available,fan_duty_cycle_estimated\n")
     for _, r := range rows {
         dutyAvail := 0
         if r.FanDutyCycleAvailable {
             dutyAvail = 1
         }
-        fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d\n",
-            strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail)
+        dutyEstimated := 0
+        if r.FanDutyCycleEstimated {
+            dutyEstimated = 1
+        }
+        fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d,%d\n",
+            strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail, dutyEstimated)
     }
     return os.WriteFile(path, b.Bytes(), 0644)
 }
```
```diff
@@ -4,6 +4,7 @@ import (
     "context"
     "encoding/json"
     "fmt"
+    "math"
     "os"
     "os/exec"
     "path/filepath"
@@ -56,13 +57,37 @@ type cachedPowerReading struct {
     UpdatedAt time.Time
 }
 
+type fanObservationState struct {
+    MaxRPM map[string]float64 `json:"max_rpm"`
+}
+
+type fanPeakCandidate struct {
+    FirstSeen time.Time
+    RPM       float64
+}
+
 var (
     systemPowerCacheMu sync.Mutex
     systemPowerCache   cachedPowerReading
+    fanObservationMu   sync.Mutex
+    fanObservation     fanObservationState
+    fanObservationInit bool
+    fanPeakCandidates  = make(map[string]fanPeakCandidate)
 )
 
 const systemPowerHoldTTL = 15 * time.Second
+
+var fanObservationStatePath = "/var/log/bee-sat/fan-observation.json"
+
+const fanObservationMinPeakHold = time.Second
+
+func normalizeObservedFanMaxRPM(rpm float64) float64 {
+    if rpm <= 0 {
+        return 0
+    }
+    return math.Ceil(rpm/1000.0) * 1000.0
+}
 
 // RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
 // temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
 // Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
```
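Note: rounding peaks up to the next 1,000 RPM keeps the persisted maximum stable across noisy samples and slightly conservative, which biases the duty-cycle estimate downward rather than letting it exceed 100 %. The same arithmetic, runnable:

```go
package main

import (
	"fmt"
	"math"
)

// normalize repeats normalizeObservedFanMaxRPM's rounding: peaks are rounded
// up to the next 1,000 RPM so the persisted maximum is a stable ceiling
// rather than a jittery instantaneous reading.
func normalize(rpm float64) float64 {
	if rpm <= 0 {
		return 0
	}
	return math.Ceil(rpm/1000.0) * 1000.0
}

func main() {
	fmt.Println(normalize(11280)) // 12000
	fmt.Println(normalize(12000)) // 12000 (already on a boundary)
	fmt.Println(normalize(-50))   // 0
}
```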
```diff
@@ -310,11 +335,13 @@ func sampleFanSpeeds() ([]FanReading, error) {
     out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
     if err == nil {
         if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
+            updateFanObservation(fans, time.Now())
             return fans, nil
         }
     }
     fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
     if len(fans) > 0 {
+        updateFanObservation(fans, time.Now())
         return fans, nil
     }
     if err != nil {
```
@@ -323,6 +350,119 @@ func sampleFanSpeeds() ([]FanReading, error) {
    return nil, sensorsErr
}

func loadFanObservationLocked() {
    if fanObservationInit {
        return
    }
    fanObservationInit = true
    fanObservation.MaxRPM = make(map[string]float64)
    raw, err := os.ReadFile(fanObservationStatePath)
    if err != nil || len(raw) == 0 {
        return
    }
    var persisted fanObservationState
    if json.Unmarshal(raw, &persisted) != nil {
        return
    }
    for name, rpm := range persisted.MaxRPM {
        name = strings.TrimSpace(name)
        if name == "" || rpm <= 0 {
            continue
        }
        fanObservation.MaxRPM[name] = rpm
    }
}

func saveFanObservationLocked() {
    if len(fanObservation.MaxRPM) == 0 {
        return
    }
    dir := filepath.Dir(fanObservationStatePath)
    if dir == "" || dir == "." {
        dir = "/var/log/bee-sat"
    }
    if err := os.MkdirAll(dir, 0755); err != nil {
        return
    }
    raw, err := json.MarshalIndent(fanObservation, "", " ")
    if err != nil {
        return
    }
    _ = os.WriteFile(fanObservationStatePath, raw, 0644)
}

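Illustrative only, assuming the shape implied by the json:"max_rpm" struct tag: the persisted state is a single map of fan name to observed max RPM, so a round-trip with the state file looks like this.

package main

import (
    "encoding/json"
    "fmt"
)

type fanObservationState struct {
    MaxRPM map[string]float64 `json:"max_rpm"`
}

func main() {
    // What a file like /var/log/bee-sat/fan-observation.json would hold.
    raw := []byte(`{"max_rpm":{"FAN1":6000,"FAN2":13000}}`)
    var st fanObservationState
    if err := json.Unmarshal(raw, &st); err != nil {
        panic(err)
    }
    fmt.Println(st.MaxRPM["FAN1"]) // 6000
}
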
func updateFanObservation(fans []FanReading, now time.Time) {
    if len(fans) == 0 {
        return
    }
    fanObservationMu.Lock()
    defer fanObservationMu.Unlock()
    loadFanObservationLocked()
    changed := false
    for _, fan := range fans {
        name := strings.TrimSpace(fan.Name)
        if name == "" || fan.RPM <= 0 {
            continue
        }
        currentMax := fanObservation.MaxRPM[name]
        if fan.RPM <= currentMax {
            delete(fanPeakCandidates, name)
            continue
        }
        if cand, ok := fanPeakCandidates[name]; ok {
            if now.Sub(cand.FirstSeen) >= fanObservationMinPeakHold {
                newMax := math.Max(cand.RPM, fan.RPM)
                if newMax > currentMax {
                    fanObservation.MaxRPM[name] = normalizeObservedFanMaxRPM(newMax)
                    changed = true
                }
                delete(fanPeakCandidates, name)
                continue
            }
            if fan.RPM > cand.RPM {
                fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: cand.FirstSeen, RPM: fan.RPM}
            }
            continue
        }
        fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: now, RPM: fan.RPM}
    }
    if changed {
        saveFanObservationLocked()
    }
}

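The interesting part of updateFanObservation is the peak-hold debounce: a reading above the recorded max only becomes a candidate, and is promoted only once readings have stayed above the max for fanObservationMinPeakHold (one second), so single-sample sensor spikes never establish a max. A condensed single-fan sketch of just that rule (normalization and persistence omitted):

package main

import (
    "fmt"
    "time"
)

type candidate struct {
    firstSeen time.Time
    rpm       float64
}

const minPeakHold = time.Second

func main() {
    var (
        max  float64
        cand *candidate
    )
    accept := func(rpm float64, now time.Time) {
        if rpm <= max {
            cand = nil // fell back below the max: discard the candidate
            return
        }
        if cand == nil {
            cand = &candidate{firstSeen: now, rpm: rpm} // start the hold window
            return
        }
        if now.Sub(cand.firstSeen) >= minPeakHold {
            if rpm > cand.rpm {
                cand.rpm = rpm
            }
            max = cand.rpm // held above the old max long enough: promote
            cand = nil
            return
        }
        if rpm > cand.rpm {
            cand.rpm = rpm // still inside the hold window: track the peak
        }
    }
    t := time.Unix(100, 0)
    accept(5000, t)                            // candidate only, no max yet
    accept(5200, t.Add(500*time.Millisecond))  // still inside the hold window
    accept(5100, t.Add(1500*time.Millisecond)) // >= 1s later: promoted
    fmt.Println(max) // 5200
}
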
func estimateFanDutyCyclePctFromObservation(fans []FanReading) (float64, bool) {
    if len(fans) == 0 {
        return 0, false
    }
    fanObservationMu.Lock()
    defer fanObservationMu.Unlock()
    loadFanObservationLocked()
    var samples []float64
    for _, fan := range fans {
        name := strings.TrimSpace(fan.Name)
        if name == "" || fan.RPM <= 0 {
            continue
        }
        maxRPM := fanObservation.MaxRPM[name]
        if maxRPM <= 0 {
            continue
        }
        pct := fan.RPM / maxRPM * 100.0
        if pct > 100 {
            pct = 100
        }
        if pct < 0 {
            pct = 0
        }
        samples = append(samples, pct)
    }
    if len(samples) == 0 {
        return 0, false
    }
    return benchmarkMean(samples), true
}

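Worked numbers for the estimate above, matching the test later in this diff: a peak of 5200 RPM that survives the hold window is normalized up to a 6000 RPM observed max, so a later reading of 2600 RPM maps to 2600/6000 * 100 ≈ 43.3%.

package main

import "fmt"

func main() {
    maxRPM := 6000.0 // normalizeObservedFanMaxRPM(5200)
    rpm := 2600.0
    pct := rpm / maxRPM * 100.0
    fmt.Printf("%.1f%%\n", pct) // 43.3%
}
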
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
// Handles two formats:
//
@@ -428,12 +568,27 @@ func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {

// sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors.
// Returns the average duty cycle across all exposed PWM controls.
func sampleFanDutyCyclePct() (float64, bool) {
func sampleFanDutyCyclePct() (float64, bool, bool) {
    out, err := exec.Command("sensors", "-j").Output()
    if err != nil || len(out) == 0 {
        return 0, false
        fans, fanErr := sampleFanSpeeds()
        if fanErr != nil {
            return 0, false, false
        }
        return sampleFanDutyCyclePctFromFans(fans)
    }
    return parseFanDutyCyclePctSensorsJSON(out)
    pct, ok := parseFanDutyCyclePctSensorsJSON(out)
    return pct, ok, false
}

func sampleFanDutyCyclePctFromFans(fans []FanReading) (float64, bool, bool) {
    if len(fans) == 0 {
        return 0, false, false
    }
    if pct, ok := estimateFanDutyCyclePctFromObservation(fans); ok {
        return pct, true, true
    }
    return 0, false, false
}

func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) {
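
The widened signature is the point of this hunk: callers now get (pct, ok, estimated), where estimated is true only when the value was back-computed from RPM against the observed maxima rather than read from real PWM controls. A hypothetical caller sketch; the stub's return values are illustrative, not from the diff:

package main

import "log"

// Stub standing in for the real sampleFanDutyCyclePct from the diff.
func sampleFanDutyCyclePct() (float64, bool, bool) { return 43.3, true, true }

func main() {
    pct, ok, estimated := sampleFanDutyCyclePct()
    switch {
    case !ok:
        log.Println("fan duty cycle unavailable")
    case estimated:
        log.Printf("fan duty ~%.1f%% (estimated from RPM)", pct)
    default:
        log.Printf("fan duty %.1f%% (measured PWM)", pct)
    }
}
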
@@ -1,6 +1,7 @@
package platform

import (
    "path/filepath"
    "testing"
    "time"
)
@@ -50,6 +51,53 @@ func TestParseFanDutyCyclePctSensorsJSON(t *testing.T) {
    }
}

func TestEstimateFanDutyCyclePctFromObservation(t *testing.T) {
    t.Parallel()

    oldPath := fanObservationStatePath
    oldState := fanObservation
    oldInit := fanObservationInit
    oldCandidates := fanPeakCandidates
    fanObservationStatePath = filepath.Join(t.TempDir(), "fan-observation.json")
    fanObservation = fanObservationState{}
    fanObservationInit = false
    fanPeakCandidates = make(map[string]fanPeakCandidate)
    t.Cleanup(func() {
        fanObservationStatePath = oldPath
        fanObservation = oldState
        fanObservationInit = oldInit
        fanPeakCandidates = oldCandidates
    })

    start := time.Unix(100, 0)
    updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5000}}, start)
    if _, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2500}}); ok {
        t.Fatalf("single-sample spike should not establish observed max")
    }

    updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5200}}, start.Add(500*time.Millisecond))
    updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5100}}, start.Add(1500*time.Millisecond))

    got, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
    if !ok {
        t.Fatalf("expected estimated duty cycle from persisted observed max")
    }
    if got < 43 || got > 44 {
        t.Fatalf("got=%v want ~43.3", got)
    }

    fanObservation = fanObservationState{}
    fanObservationInit = false
    fanPeakCandidates = make(map[string]fanPeakCandidate)
    got, ok = estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
    if !ok {
        t.Fatalf("expected persisted observed max to be reloaded from disk")
    }
    if got < 43 || got > 44 {
        t.Fatalf("reloaded got=%v want ~43.3", got)
    }
}

func TestParseDCMIPowerReading(t *testing.T) {
    raw := `
Instantaneous power reading: 512 Watts
@@ -713,6 +713,19 @@ static const struct profile_desc k_profiles[] = {

#define PROFILE_COUNT ((int)(sizeof(k_profiles) / sizeof(k_profiles[0])))

static int profile_allowed_for_run(const struct profile_desc *desc, int cc, const char *precision_filter) {
    if (!(desc->enabled && cc >= desc->min_cc)) {
        return 0;
    }
    if (precision_filter != NULL) {
        return strcmp(desc->block_label, precision_filter) == 0;
    }
    /* Mixed/all phases intentionally exclude fp64/fp4 for now: both paths are
     * unstable on the current benchmark fleet and can abort the whole mixed
     * pass after earlier phases already collected useful telemetry. */
    return strcmp(desc->block_label, "fp64") != 0 && strcmp(desc->block_label, "fp4") != 0;
}

static int load_cublaslt(struct cublaslt_api *api) {
    memset(api, 0, sizeof(*api));
    api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL);
@@ -1222,8 +1235,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,

    /* Count profiles matching the filter (for deciding what to run). */
    for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
        if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc &&
            (precision_filter == NULL || strcmp(k_profiles[i].block_label, precision_filter) == 0)) {
        if (profile_allowed_for_run(&k_profiles[i], cc, precision_filter)) {
            planned++;
        }
    }
@@ -1240,7 +1252,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
     * profiles matching precision_filter. */
    int planned_total = 0;
    for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
        if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
        if (profile_allowed_for_run(&k_profiles[i], cc, precision_filter)) {
            planned_total++;
        }
    }
@@ -1310,10 +1322,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
                          desc->min_cc);
            continue;
        }
        if (precision_filter != NULL && strcmp(desc->block_label, precision_filter) != 0) {
        if (!profile_allowed_for_run(desc, cc, precision_filter)) {
            append_detail(report->details,
                          sizeof(report->details),
                          "%s=SKIPPED precision_filter\n",
                          "%s=SKIPPED benchmark_disabled\n",
                          desc->name);
            continue;
        }
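
For readers cross-checking against the Go side of this change, here is a rough Go rendering of profile_allowed_for_run (a sketch, not code from the diff): an explicit precision filter can still select fp64 or fp4, while the unfiltered mixed pass now skips both.

package main

import "fmt"

type profile struct {
    blockLabel string
    enabled    bool
    minCC      int
}

// Mirrors the C gating predicate: enabled + compute-capability check first,
// then either the explicit filter or the fp64/fp4 exclusion for mixed runs.
func profileAllowedForRun(p profile, cc int, precisionFilter string) bool {
    if !p.enabled || cc < p.minCC {
        return false
    }
    if precisionFilter != "" {
        return p.blockLabel == precisionFilter
    }
    // Mixed/all phases intentionally exclude fp64 and fp4 for now.
    return p.blockLabel != "fp64" && p.blockLabel != "fp4"
}

func main() {
    fp64 := profile{blockLabel: "fp64", enabled: true, minCC: 60}
    fmt.Println(profileAllowedForRun(fp64, 90, ""))     // false: skipped in the mixed pass
    fmt.Println(profileAllowedForRun(fp64, 90, "fp64")) // true: explicit filter still runs it
}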