Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| f8cd9a7376 | |||
| d52ec67f8f | |||
| 61c7abaa80 | |||
| d60f7758ba | |||
| 52c3a24b76 | |||
| 028bb30333 | |||
| 7d64e5d215 |
@@ -304,7 +304,7 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
|
|||||||
}
|
}
|
||||||
filename := fmt.Sprintf("audit-%s-%s.json", sanitizeFilename(hostnameOr("unknown")), time.Now().UTC().Format("20060102-150405"))
|
filename := fmt.Sprintf("audit-%s-%s.json", sanitizeFilename(hostnameOr("unknown")), time.Now().UTC().Format("20060102-150405"))
|
||||||
tmpPath := filepath.Join(os.TempDir(), filename)
|
tmpPath := filepath.Join(os.TempDir(), filename)
|
||||||
data, err := os.ReadFile(DefaultAuditJSONPath)
|
data, err := readFileLimited(DefaultAuditJSONPath, 100<<20)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,10 +2,29 @@ package app
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// readFileLimited reads the file at path into memory, refusing files larger
// than maxBytes. It guards callers against exhausting memory on corrupted or
// unexpectedly large data files.
//
// It returns the file contents when the file holds at most maxBytes bytes.
// It returns the error from os.Open or from reading unchanged (so
// os.IsNotExist keeps working for callers), and a descriptive error when the
// file is larger than maxBytes.
func readFileLimited(path string, maxBytes int64) ([]byte, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	// Read one byte past the limit so a file of exactly maxBytes can be told
	// apart from an oversized one. Skip the extra byte when maxBytes is already
	// the largest int64: maxBytes+1 would wrap to a negative value, which makes
	// io.LimitReader report EOF immediately and would silently yield empty data.
	limit := maxBytes
	if limit < 1<<63-1 {
		limit++
	}

	data, err := io.ReadAll(io.LimitReader(f, limit))
	if err != nil {
		return nil, err
	}
	if int64(len(data)) > maxBytes {
		return nil, fmt.Errorf("file %s too large (exceeds %d bytes)", path, maxBytes)
	}
	return data, nil
}
|
||||||
|
|
||||||
func atomicWriteFile(path string, data []byte, perm os.FileMode) error {
|
func atomicWriteFile(path string, data []byte, perm os.FileMode) error {
|
||||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
return fmt.Errorf("mkdir %s: %w", filepath.Dir(path), err)
|
return fmt.Errorf("mkdir %s: %w", filepath.Dir(path), err)
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
|
|||||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
data, err := os.ReadFile(path)
|
data, err := readFileLimited(path, 10<<20)
|
||||||
if err != nil && !os.IsNotExist(err) {
|
if err != nil && !os.IsNotExist(err) {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -160,11 +160,54 @@ type psuSDR struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var psuSlotPatterns = []*regexp.Regexp{
|
var psuSlotPatterns = []*regexp.Regexp{
|
||||||
regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`),
|
regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`), // PSU1, PS1, ps 2
|
||||||
regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`),
|
regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`), // PS 6, PS6
|
||||||
regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`),
|
regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`), // PWS1
|
||||||
regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`),
|
regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`), // Power Supply 1, Power Supply Bay 3
|
||||||
regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`),
|
regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`), // Bay 1
|
||||||
|
// Fallback for xFusion-style generic numbered PSU sensors (Power1, Power2, …).
|
||||||
|
// Must be last: "power supply N" is already caught by the pattern above.
|
||||||
|
regexp.MustCompile(`(?i)\bpower([0-9]+)\b`),
|
||||||
|
}
|
||||||
|
|
||||||
|
// isPSUInputPower reports whether a sensor name looks like a PSU AC-input
// power reading. Matching is by substring and case-insensitive. Example sensor
// names per vendor:
//
//	MSI: PSU1_POWER_IN, PSU1_PIN
//	MLT: PSU1_PIN
//	HPE: PS1 Input Power, PS1 Input Watts
//
// xFusion input sensors carry no input keyword and are not matched here.
func isPSUInputPower(name string) bool {
	// Callers already pass a lower-cased name; lower-casing again is a no-op
	// for them and makes the helper safe to call with raw sensor names.
	name = strings.ToLower(name)

	// "power_in" also covers "_power_in", so only the shorter form is listed.
	for _, keyword := range [...]string{
		"input power",
		"input watts",
		"_pin",
		" pin",
		"power_in",
	} {
		if strings.Contains(name, keyword) {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
|
// isPSUOutputPower reports whether a sensor name looks like a PSU DC-output
// power reading. Matching is by substring and case-insensitive. Example sensor
// names per vendor:
//
//	MSI:     PSU1_POWER_OUT
//	MLT:     PSU1_POUT
//	xFusion: PS1 POut
//
// Names containing "power supply bay" or "psu bay" are also treated as output
// power readings.
func isPSUOutputPower(name string) bool {
	// Callers already pass a lower-cased name; lower-casing again is a no-op
	// for them and makes the helper safe to call with raw sensor names.
	name = strings.ToLower(name)

	// "power_out" also covers "_power_out", so only the shorter form is listed.
	for _, keyword := range [...]string{
		"output power",
		"output watts",
		"_pout",
		" pout",
		"power_out",
		"power supply bay",
		"psu bay",
	} {
		if strings.Contains(name, keyword) {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
|
// parseBoundedFloat parses a numeric value from an SDR value field and
|
||||||
|
// validates it is within (0, max]. Returns nil for zero, negative, or
|
||||||
|
// out-of-range values — these indicate missing/off/fault sensor readings.
|
||||||
|
func parseBoundedFloat(raw string, max float64) *float64 {
|
||||||
|
v := parseFloatPtr(raw)
|
||||||
|
if v == nil || *v <= 0 || *v > max {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return v
|
||||||
}
|
}
|
||||||
|
|
||||||
func parsePSUSDR(raw string) map[int]psuSDR {
|
func parsePSUSDR(raw string) map[int]psuSDR {
|
||||||
@@ -194,24 +237,59 @@ func parsePSUSDR(raw string) map[int]psuSDR {
|
|||||||
|
|
||||||
lowerName := strings.ToLower(name)
|
lowerName := strings.ToLower(name)
|
||||||
switch {
|
switch {
|
||||||
case strings.Contains(lowerName, "input power"):
|
case isPSUInputPower(lowerName):
|
||||||
entry.inputPowerW = parseFloatPtr(value)
|
entry.inputPowerW = parseBoundedFloat(value, 6000)
|
||||||
case strings.Contains(lowerName, "output power"):
|
case isPSUOutputPower(lowerName):
|
||||||
entry.outputPowerW = parseFloatPtr(value)
|
entry.outputPowerW = parseBoundedFloat(value, 6000)
|
||||||
case strings.Contains(lowerName, "power supply bay"), strings.Contains(lowerName, "psu bay"):
|
|
||||||
entry.outputPowerW = parseFloatPtr(value)
|
|
||||||
case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"):
|
case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"):
|
||||||
entry.inputVoltage = parseFloatPtr(value)
|
entry.inputVoltage = parseFloatPtr(value)
|
||||||
case strings.Contains(lowerName, "temp"):
|
case strings.Contains(lowerName, "temp"):
|
||||||
entry.temperatureC = parseFloatPtr(value)
|
entry.temperatureC = parseFloatPtr(value)
|
||||||
case strings.Contains(lowerName, "health"), strings.Contains(lowerName, "remaining life"), strings.Contains(lowerName, "life remaining"):
|
case strings.Contains(lowerName, "health"), strings.Contains(lowerName, "remaining life"), strings.Contains(lowerName, "life remaining"):
|
||||||
entry.healthPct = parsePercentPtr(value)
|
entry.healthPct = parsePercentPtr(value)
|
||||||
|
default:
|
||||||
|
// Generic PSU power reading: sensor matched a slot pattern but carries
|
||||||
|
// no input/output keyword (e.g. xFusion "Power1", "Power2"). Treat as
|
||||||
|
// AC input if the value looks like wattage and no better data is set yet.
|
||||||
|
if entry.inputPowerW == nil {
|
||||||
|
entry.inputPowerW = parseBoundedFloat(value, 6000)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
out[slot] = entry
|
out[slot] = entry
|
||||||
}
|
}
|
||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// PSUSlotPower carries the SDR-derived readings for a single PSU slot.
// PSUSlotsFromSDR returns these keyed by a 0-based slot index string, which
// is meant to line up with HardwarePowerSupply.Slot in the audit schema.
type PSUSlotPower struct {
	// InputW is the input power in watts; nil when no reading is available.
	InputW *float64 `json:"input_w,omitempty"`
	// OutputW is the output power in watts; nil when no reading is available.
	OutputW *float64 `json:"output_w,omitempty"`
	// Status is the status string taken from the parsed SDR entry.
	Status string `json:"status,omitempty"`
}
|
||||||
|
|
||||||
|
// PSUSlotsFromSDR parses `ipmitool sdr` output and returns per-slot PSU data
|
||||||
|
// using the same battle-tested slot patterns as the hardware audit collector.
|
||||||
|
// Works across MSI (PSU1_POWER_IN), xFusion (Power1, PS1 POut), MLT (PSU1_PIN).
|
||||||
|
// Slot keys are 0-based index strings matching HardwarePowerSupply.Slot.
|
||||||
|
func PSUSlotsFromSDR(sdrOutput string) map[string]PSUSlotPower {
|
||||||
|
sdr := parsePSUSDR(sdrOutput)
|
||||||
|
if len(sdr) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
out := make(map[string]PSUSlotPower, len(sdr))
|
||||||
|
for slot, entry := range sdr {
|
||||||
|
key := strconv.Itoa(slot - 1) // audit uses 0-based slot
|
||||||
|
out[key] = PSUSlotPower{
|
||||||
|
InputW: entry.inputPowerW,
|
||||||
|
OutputW: entry.outputPowerW,
|
||||||
|
Status: entry.status,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
func synthesizePSUsFromSDR(sdr map[int]psuSDR) []schema.HardwarePowerSupply {
|
func synthesizePSUsFromSDR(sdr map[int]psuSDR) []schema.HardwarePowerSupply {
|
||||||
if len(sdr) == 0 {
|
if len(sdr) == 0 {
|
||||||
return nil
|
return nil
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -89,136 +89,159 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
|
|
||||||
// Perspective 1: Compatibility — hard stops
|
// Perspective 1: Compatibility — hard stops
|
||||||
b.WriteString("### 1. Compatibility\n\n")
|
b.WriteString("### 1. Compatibility\n\n")
|
||||||
b.WriteString("| GPU | Thermal throttle | Fan duty at throttle | ECC uncorr | Status |\n")
|
{
|
||||||
b.WriteString("|-----|------------------|----------------------|------------|--------|\n")
|
var rows [][]string
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
thermalThrottle := "-"
|
thermalThrottle := "-"
|
||||||
if gpu.Scores.ThermalThrottlePct > 0 {
|
if gpu.Scores.ThermalThrottlePct > 0 {
|
||||||
thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
||||||
|
}
|
||||||
|
fanAtThrottle := "-"
|
||||||
|
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 {
|
||||||
|
fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
||||||
|
}
|
||||||
|
ecc := "-"
|
||||||
|
if gpu.ECC.Uncorrected > 0 {
|
||||||
|
ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected)
|
||||||
|
}
|
||||||
|
compatStatus := "✓ OK"
|
||||||
|
if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) {
|
||||||
|
compatStatus = "⛔ HARD STOP"
|
||||||
|
}
|
||||||
|
rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), thermalThrottle, fanAtThrottle, ecc, compatStatus})
|
||||||
}
|
}
|
||||||
fanAtThrottle := "-"
|
b.WriteString(fmtMDTable([]string{"GPU", "Thermal throttle", "Fan duty at throttle", "ECC uncorr", "Status"}, rows))
|
||||||
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 {
|
b.WriteString("\n")
|
||||||
fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
|
||||||
}
|
|
||||||
ecc := "-"
|
|
||||||
if gpu.ECC.Uncorrected > 0 {
|
|
||||||
ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected)
|
|
||||||
}
|
|
||||||
compatStatus := "✓ OK"
|
|
||||||
if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) {
|
|
||||||
compatStatus = "⛔ HARD STOP"
|
|
||||||
}
|
|
||||||
fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n",
|
|
||||||
gpu.Index, thermalThrottle, fanAtThrottle, ecc, compatStatus)
|
|
||||||
}
|
}
|
||||||
b.WriteString("\n")
|
|
||||||
|
|
||||||
// Perspective 2: Thermal headroom
|
// Perspective 2: Thermal headroom
|
||||||
b.WriteString("### 2. Thermal Headroom\n\n")
|
b.WriteString("### 2. Thermal Headroom\n\n")
|
||||||
b.WriteString("| GPU | p95 temp | Slowdown limit | Shutdown limit | Headroom | Thermal throttle | Status |\n")
|
{
|
||||||
b.WriteString("|-----|----------|----------------|----------------|----------|------------------|--------|\n")
|
var rows [][]string
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
shutdownTemp := gpu.ShutdownTempC
|
shutdownTemp := gpu.ShutdownTempC
|
||||||
if shutdownTemp <= 0 {
|
if shutdownTemp <= 0 {
|
||||||
shutdownTemp = 90
|
shutdownTemp = 90
|
||||||
|
}
|
||||||
|
slowdownTemp := gpu.SlowdownTempC
|
||||||
|
if slowdownTemp <= 0 {
|
||||||
|
slowdownTemp = 80
|
||||||
|
}
|
||||||
|
headroom := gpu.Scores.TempHeadroomC
|
||||||
|
thermalStatus := "✓ OK"
|
||||||
|
switch {
|
||||||
|
case headroom < 10:
|
||||||
|
thermalStatus = "⛔ CRITICAL"
|
||||||
|
case gpu.Steady.P95TempC >= slowdownTemp:
|
||||||
|
thermalStatus = "⚠ WARNING"
|
||||||
|
}
|
||||||
|
throttlePct := "-"
|
||||||
|
if gpu.Scores.ThermalThrottlePct > 0 {
|
||||||
|
throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
||||||
|
}
|
||||||
|
rows = append(rows, []string{
|
||||||
|
fmt.Sprintf("GPU %d", gpu.Index),
|
||||||
|
fmt.Sprintf("%.1f°C", gpu.Steady.P95TempC),
|
||||||
|
fmt.Sprintf("%.0f°C", slowdownTemp),
|
||||||
|
fmt.Sprintf("%.0f°C", shutdownTemp),
|
||||||
|
fmt.Sprintf("%.1f°C", headroom),
|
||||||
|
throttlePct,
|
||||||
|
thermalStatus,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
slowdownTemp := gpu.SlowdownTempC
|
b.WriteString(fmtMDTable([]string{"GPU", "p95 temp", "Slowdown limit", "Shutdown limit", "Headroom", "Thermal throttle", "Status"}, rows))
|
||||||
if slowdownTemp <= 0 {
|
b.WriteString("\n")
|
||||||
slowdownTemp = 80
|
|
||||||
}
|
|
||||||
headroom := gpu.Scores.TempHeadroomC
|
|
||||||
thermalStatus := "✓ OK"
|
|
||||||
switch {
|
|
||||||
case headroom < 10:
|
|
||||||
thermalStatus = "⛔ CRITICAL"
|
|
||||||
case gpu.Steady.P95TempC >= slowdownTemp:
|
|
||||||
thermalStatus = "⚠ WARNING"
|
|
||||||
}
|
|
||||||
throttlePct := "-"
|
|
||||||
if gpu.Scores.ThermalThrottlePct > 0 {
|
|
||||||
throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
|
||||||
}
|
|
||||||
fmt.Fprintf(&b, "| GPU %d | %.1f°C | %.0f°C | %.0f°C | %.1f°C | %s | %s |\n",
|
|
||||||
gpu.Index, gpu.Steady.P95TempC, slowdownTemp, shutdownTemp, headroom, throttlePct, thermalStatus)
|
|
||||||
}
|
}
|
||||||
b.WriteString("\n")
|
|
||||||
|
|
||||||
// Perspective 3: Power delivery
|
// Perspective 3: Power delivery
|
||||||
b.WriteString("### 3. Power Delivery\n\n")
|
b.WriteString("### 3. Power Delivery\n\n")
|
||||||
b.WriteString("| GPU | Power cap throttle | Power stability | Fan duty (p95) | Status |\n")
|
{
|
||||||
b.WriteString("|-----|-------------------|-----------------|----------------|--------|\n")
|
var rows [][]string
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
powerCap := "-"
|
powerCap := "-"
|
||||||
if gpu.Scores.PowerCapThrottlePct > 0 {
|
if gpu.Scores.PowerCapThrottlePct > 0 {
|
||||||
powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct)
|
powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct)
|
||||||
|
}
|
||||||
|
fanDuty := "-"
|
||||||
|
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable {
|
||||||
|
fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
||||||
|
}
|
||||||
|
powerStatus := "✓ OK"
|
||||||
|
if gpu.Scores.PowerCapThrottlePct > 5 {
|
||||||
|
powerStatus = "⚠ POWER LIMITED"
|
||||||
|
}
|
||||||
|
rows = append(rows, []string{
|
||||||
|
fmt.Sprintf("GPU %d", gpu.Index),
|
||||||
|
powerCap,
|
||||||
|
fmt.Sprintf("%.1f", gpu.Scores.PowerSustainScore),
|
||||||
|
fanDuty,
|
||||||
|
powerStatus,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
fanDuty := "-"
|
b.WriteString(fmtMDTable([]string{"GPU", "Power cap throttle", "Power stability", "Fan duty (p95)", "Status"}, rows))
|
||||||
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable {
|
b.WriteString("\n")
|
||||||
fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
|
||||||
}
|
|
||||||
powerStatus := "✓ OK"
|
|
||||||
if gpu.Scores.PowerCapThrottlePct > 5 {
|
|
||||||
powerStatus = "⚠ POWER LIMITED"
|
|
||||||
}
|
|
||||||
fmt.Fprintf(&b, "| GPU %d | %s | %.1f | %s | %s |\n",
|
|
||||||
gpu.Index, powerCap, gpu.Scores.PowerSustainScore, fanDuty, powerStatus)
|
|
||||||
}
|
}
|
||||||
b.WriteString("\n")
|
|
||||||
|
|
||||||
// Perspective 4: Performance
|
// Perspective 4: Performance
|
||||||
b.WriteString("### 4. Performance\n\n")
|
b.WriteString("### 4. Performance\n\n")
|
||||||
b.WriteString("| GPU | Compute TOPS | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz |\n")
|
{
|
||||||
b.WriteString("|-----|--------------|-----------|-------|------------|-------------|\n")
|
var rows [][]string
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
synthetic := "-"
|
synthetic := "-"
|
||||||
if gpu.Scores.SyntheticScore > 0 {
|
if gpu.Scores.SyntheticScore > 0 {
|
||||||
synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
|
synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
|
||||||
|
}
|
||||||
|
mixed := "-"
|
||||||
|
if gpu.Scores.MixedScore > 0 {
|
||||||
|
mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
|
||||||
|
}
|
||||||
|
mixedEff := "-"
|
||||||
|
if gpu.Scores.MixedEfficiency > 0 {
|
||||||
|
mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
|
||||||
|
}
|
||||||
|
topsPerSM := "-"
|
||||||
|
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||||||
|
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
||||||
|
}
|
||||||
|
rows = append(rows, []string{
|
||||||
|
fmt.Sprintf("GPU %d", gpu.Index),
|
||||||
|
fmt.Sprintf("**%.2f**", gpu.Scores.CompositeScore),
|
||||||
|
synthetic, mixed, mixedEff, topsPerSM,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
mixed := "-"
|
b.WriteString(fmtMDTable([]string{"GPU", "Compute TOPS", "Synthetic", "Mixed", "Mixed Eff.", "TOPS/SM/GHz"}, rows))
|
||||||
if gpu.Scores.MixedScore > 0 {
|
if len(result.PerformanceRampSteps) > 0 {
|
||||||
mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
|
fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
|
||||||
}
|
}
|
||||||
mixedEff := "-"
|
b.WriteString("\n")
|
||||||
if gpu.Scores.MixedEfficiency > 0 {
|
|
||||||
mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
|
|
||||||
}
|
|
||||||
topsPerSM := "-"
|
|
||||||
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
|
||||||
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
|
||||||
}
|
|
||||||
fmt.Fprintf(&b, "| GPU %d | **%.2f** | %s | %s | %s | %s |\n",
|
|
||||||
gpu.Index, gpu.Scores.CompositeScore, synthetic, mixed, mixedEff, topsPerSM)
|
|
||||||
}
|
}
|
||||||
if len(result.PerformanceRampSteps) > 0 {
|
|
||||||
fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
|
|
||||||
}
|
|
||||||
b.WriteString("\n")
|
|
||||||
|
|
||||||
// Perspective 5: Anomaly flags
|
// Perspective 5: Anomaly flags
|
||||||
b.WriteString("### 5. Anomalies\n\n")
|
b.WriteString("### 5. Anomalies\n\n")
|
||||||
b.WriteString("| GPU | ECC corrected | Sync boost throttle | Power instability | Thermal instability |\n")
|
{
|
||||||
b.WriteString("|-----|---------------|---------------------|-------------------|---------------------|\n")
|
var rows [][]string
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
eccCorr := "-"
|
eccCorr := "-"
|
||||||
if gpu.ECC.Corrected > 0 {
|
if gpu.ECC.Corrected > 0 {
|
||||||
eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected)
|
eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected)
|
||||||
|
}
|
||||||
|
syncBoost := "-"
|
||||||
|
if gpu.Scores.SyncBoostThrottlePct > 0 {
|
||||||
|
syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct)
|
||||||
|
}
|
||||||
|
powerVar := "OK"
|
||||||
|
if gpu.Scores.PowerSustainScore < 70 {
|
||||||
|
powerVar = "⚠ unstable"
|
||||||
|
}
|
||||||
|
thermalVar := "OK"
|
||||||
|
if gpu.Scores.ThermalSustainScore < 70 {
|
||||||
|
thermalVar = "⚠ unstable"
|
||||||
|
}
|
||||||
|
rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), eccCorr, syncBoost, powerVar, thermalVar})
|
||||||
}
|
}
|
||||||
syncBoost := "-"
|
b.WriteString(fmtMDTable([]string{"GPU", "ECC corrected", "Sync boost throttle", "Power instability", "Thermal instability"}, rows))
|
||||||
if gpu.Scores.SyncBoostThrottlePct > 0 {
|
b.WriteString("\n")
|
||||||
syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct)
|
|
||||||
}
|
|
||||||
powerVar := "OK"
|
|
||||||
if gpu.Scores.PowerSustainScore < 70 {
|
|
||||||
powerVar = "⚠ unstable"
|
|
||||||
}
|
|
||||||
thermalVar := "OK"
|
|
||||||
if gpu.Scores.ThermalSustainScore < 70 {
|
|
||||||
thermalVar = "⚠ unstable"
|
|
||||||
}
|
|
||||||
fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n",
|
|
||||||
gpu.Index, eccCorr, syncBoost, powerVar, thermalVar)
|
|
||||||
}
|
}
|
||||||
b.WriteString("\n")
|
|
||||||
|
|
||||||
// ── Per GPU detail ────────────────────────────────────────────────────────
|
// ── Per GPU detail ────────────────────────────────────────────────────────
|
||||||
b.WriteString("## Per-GPU Details\n\n")
|
b.WriteString("## Per-GPU Details\n\n")
|
||||||
@@ -263,12 +286,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
// Steady-state telemetry
|
// Steady-state telemetry
|
||||||
if benchmarkTelemetryAvailable(gpu.Steady) {
|
if benchmarkTelemetryAvailable(gpu.Steady) {
|
||||||
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
|
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
|
||||||
b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
|
b.WriteString(fmtMDTable(
|
||||||
fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
|
[]string{"", "Avg", "P95"},
|
||||||
fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
|
[][]string{
|
||||||
fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
|
{"Power", fmt.Sprintf("%.1f W", gpu.Steady.AvgPowerW), fmt.Sprintf("%.1f W", gpu.Steady.P95PowerW)},
|
||||||
fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
|
{"Temperature", fmt.Sprintf("%.1f °C", gpu.Steady.AvgTempC), fmt.Sprintf("%.1f °C", gpu.Steady.P95TempC)},
|
||||||
fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
|
{"GPU clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgGraphicsClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95GraphicsClockMHz)},
|
||||||
|
{"Memory clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgMemoryClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95MemoryClockMHz)},
|
||||||
|
{"GPU utilisation", fmt.Sprintf("%.1f %%", gpu.Steady.AvgUsagePct), "—"},
|
||||||
|
},
|
||||||
|
))
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
} else {
|
} else {
|
||||||
b.WriteString("**Steady-state telemetry:** unavailable\n\n")
|
b.WriteString("**Steady-state telemetry:** unavailable\n\n")
|
||||||
@@ -277,7 +304,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
// Per-precision stability phases.
|
// Per-precision stability phases.
|
||||||
if len(gpu.PrecisionSteady) > 0 {
|
if len(gpu.PrecisionSteady) > 0 {
|
||||||
b.WriteString("**Per-precision stability:**\n\n")
|
b.WriteString("**Per-precision stability:**\n\n")
|
||||||
b.WriteString("| Precision | Status | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|--------|----------|----------|-------------|----------|------------|\n")
|
var precRows [][]string
|
||||||
for _, p := range gpu.PrecisionSteady {
|
for _, p := range gpu.PrecisionSteady {
|
||||||
eccCorr := "—"
|
eccCorr := "—"
|
||||||
eccUncorr := "—"
|
eccUncorr := "—"
|
||||||
@@ -289,10 +316,15 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
if strings.TrimSpace(status) == "" {
|
if strings.TrimSpace(status) == "" {
|
||||||
status = "OK"
|
status = "OK"
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, "| %s | %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n",
|
precRows = append(precRows, []string{
|
||||||
p.Precision, status, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct,
|
p.Precision, status,
|
||||||
eccCorr, eccUncorr)
|
fmt.Sprintf("%.1f%%", p.Steady.ClockCVPct),
|
||||||
|
fmt.Sprintf("%.1f%%", p.Steady.PowerCVPct),
|
||||||
|
fmt.Sprintf("%.1f%%", p.Steady.ClockDriftPct),
|
||||||
|
eccCorr, eccUncorr,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
b.WriteString(fmtMDTable([]string{"Precision", "Status", "Clock CV", "Power CV", "Clock Drift", "ECC corr", "ECC uncorr"}, precRows))
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
} else {
|
} else {
|
||||||
// Legacy: show combined-window variance.
|
// Legacy: show combined-window variance.
|
||||||
@@ -315,16 +347,22 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
// Precision results
|
// Precision results
|
||||||
if len(gpu.PrecisionResults) > 0 {
|
if len(gpu.PrecisionResults) > 0 {
|
||||||
b.WriteString("**Precision results:**\n\n")
|
b.WriteString("**Precision results:**\n\n")
|
||||||
b.WriteString("| Precision | TOPS (raw) | Weight | TOPS (fp32-eq) | Lanes | Iterations |\n|-----------|------------|--------|----------------|-------|------------|\n")
|
var presRows [][]string
|
||||||
for _, p := range gpu.PrecisionResults {
|
for _, p := range gpu.PrecisionResults {
|
||||||
if p.Supported {
|
if p.Supported {
|
||||||
weightStr := fmt.Sprintf("×%.3g", p.Weight)
|
presRows = append(presRows, []string{
|
||||||
fmt.Fprintf(&b, "| %s | %.2f | %s | %.2f | %d | %d |\n",
|
p.Name,
|
||||||
p.Name, p.TeraOpsPerSec, weightStr, p.WeightedTeraOpsPerSec, p.Lanes, p.Iterations)
|
fmt.Sprintf("%.2f", p.TeraOpsPerSec),
|
||||||
|
fmt.Sprintf("×%.3g", p.Weight),
|
||||||
|
fmt.Sprintf("%.2f", p.WeightedTeraOpsPerSec),
|
||||||
|
fmt.Sprintf("%d", p.Lanes),
|
||||||
|
fmt.Sprintf("%d", p.Iterations),
|
||||||
|
})
|
||||||
} else {
|
} else {
|
||||||
fmt.Fprintf(&b, "| %s | — (unsupported) | — | — | — | — |\n", p.Name)
|
presRows = append(presRows, []string{p.Name, "— (unsupported)", "—", "—", "—", "—"})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
b.WriteString(fmtMDTable([]string{"Precision", "TOPS (raw)", "Weight", "TOPS (fp32-eq)", "Lanes", "Iterations"}, presRows))
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -346,9 +384,13 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
b.WriteString("## Interconnect (NCCL)\n\n")
|
b.WriteString("## Interconnect (NCCL)\n\n")
|
||||||
fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
|
fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
|
||||||
if result.Interconnect.Supported {
|
if result.Interconnect.Supported {
|
||||||
b.WriteString("| Metric | Avg | Max |\n|--------|-----|-----|\n")
|
b.WriteString(fmtMDTable(
|
||||||
fmt.Fprintf(&b, "| Alg BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.MaxAlgBWGBps)
|
[]string{"Metric", "Avg", "Max"},
|
||||||
fmt.Fprintf(&b, "| Bus BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgBusBWGBps, result.Interconnect.MaxBusBWGBps)
|
[][]string{
|
||||||
|
{"Alg BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgAlgBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxAlgBWGBps)},
|
||||||
|
{"Bus BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgBusBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxBusBWGBps)},
|
||||||
|
},
|
||||||
|
))
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
for _, note := range result.Interconnect.Notes {
|
for _, note := range result.Interconnect.Notes {
|
||||||
@@ -365,14 +407,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
if !sp.Available {
|
if !sp.Available {
|
||||||
b.WriteString("IPMI power measurement unavailable.\n\n")
|
b.WriteString("IPMI power measurement unavailable.\n\n")
|
||||||
} else {
|
} else {
|
||||||
b.WriteString("| | Value |\n|---|---|\n")
|
spRows := [][]string{
|
||||||
fmt.Fprintf(&b, "| Server idle | %.0f W |\n", sp.IdleW)
|
{"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)},
|
||||||
fmt.Fprintf(&b, "| Server under load | %.0f W |\n", sp.LoadedW)
|
{"Server under load", fmt.Sprintf("%.0f W", sp.LoadedW)},
|
||||||
fmt.Fprintf(&b, "| Server delta (load − idle) | %.0f W |\n", sp.DeltaW)
|
{"Server delta (load − idle)", fmt.Sprintf("%.0f W", sp.DeltaW)},
|
||||||
fmt.Fprintf(&b, "| GPU-reported sum | %.0f W |\n", sp.GPUReportedSumW)
|
{"GPU-reported sum", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)},
|
||||||
if sp.ReportingRatio > 0 {
|
|
||||||
fmt.Fprintf(&b, "| Reporting ratio | %.2f (1.0 = accurate, <0.75 = GPU over-reports) |\n", sp.ReportingRatio)
|
|
||||||
}
|
}
|
||||||
|
if sp.ReportingRatio > 0 {
|
||||||
|
spRows = append(spRows, []string{"Reporting ratio", fmt.Sprintf("%.2f (1.0 = accurate, <0.75 = GPU over-reports)", sp.ReportingRatio)})
|
||||||
|
}
|
||||||
|
b.WriteString(fmtMDTable([]string{"", "Value"}, spRows))
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
for _, note := range sp.Notes {
|
for _, note := range sp.Notes {
|
||||||
@@ -383,19 +427,33 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── PSU Issues ────────────────────────────────────────────────────────────
|
||||||
|
if len(result.PSUIssues) > 0 {
|
||||||
|
b.WriteString("## PSU Issues\n\n")
|
||||||
|
b.WriteString("The following power supply anomalies were detected during the benchmark:\n\n")
|
||||||
|
for _, issue := range result.PSUIssues {
|
||||||
|
fmt.Fprintf(&b, "- ⛔ %s\n", issue)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
// ── Cooling ───────────────────────────────────────────────────────────────
|
// ── Cooling ───────────────────────────────────────────────────────────────
|
||||||
if cooling := result.Cooling; cooling != nil {
|
if cooling := result.Cooling; cooling != nil {
|
||||||
b.WriteString("## Cooling\n\n")
|
b.WriteString("## Cooling\n\n")
|
||||||
if cooling.Available {
|
if cooling.Available {
|
||||||
b.WriteString("| Metric | Value |\n|--------|-------|\n")
|
dutyAvg, dutyP95 := "N/A", "N/A"
|
||||||
fmt.Fprintf(&b, "| Average fan speed | %.0f RPM |\n", cooling.AvgFanRPM)
|
|
||||||
if cooling.FanDutyCycleAvailable {
|
if cooling.FanDutyCycleAvailable {
|
||||||
fmt.Fprintf(&b, "| Average fan duty cycle | %.1f%% |\n", cooling.AvgFanDutyCyclePct)
|
dutyAvg = fmt.Sprintf("%.1f%%", cooling.AvgFanDutyCyclePct)
|
||||||
fmt.Fprintf(&b, "| P95 fan duty cycle | %.1f%% |\n", cooling.P95FanDutyCyclePct)
|
dutyP95 = fmt.Sprintf("%.1f%%", cooling.P95FanDutyCyclePct)
|
||||||
} else {
|
|
||||||
b.WriteString("| Average fan duty cycle | N/A |\n")
|
|
||||||
b.WriteString("| P95 fan duty cycle | N/A |\n")
|
|
||||||
}
|
}
|
||||||
|
b.WriteString(fmtMDTable(
|
||||||
|
[]string{"Metric", "Value"},
|
||||||
|
[][]string{
|
||||||
|
{"Average fan speed", fmt.Sprintf("%.0f RPM", cooling.AvgFanRPM)},
|
||||||
|
{"Average fan duty cycle", dutyAvg},
|
||||||
|
{"P95 fan duty cycle", dutyP95},
|
||||||
|
},
|
||||||
|
))
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
} else {
|
} else {
|
||||||
b.WriteString("Cooling telemetry unavailable.\n\n")
|
b.WriteString("Cooling telemetry unavailable.\n\n")
|
||||||
@@ -412,12 +470,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
if len(result.PerformanceRampSteps) > 0 {
|
if len(result.PerformanceRampSteps) > 0 {
|
||||||
b.WriteString("## Platform Scalability (Performance Ramp)\n\n")
|
b.WriteString("## Platform Scalability (Performance Ramp)\n\n")
|
||||||
fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n\n", result.PlatformPowerScore)
|
fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n\n", result.PlatformPowerScore)
|
||||||
b.WriteString("| k GPUs | GPU Indices | Total Synthetic TOPS | Scalability |\n")
|
var scalRows [][]string
|
||||||
b.WriteString("|--------|-------------|----------------------|-------------|\n")
|
|
||||||
for _, step := range result.PerformanceRampSteps {
|
for _, step := range result.PerformanceRampSteps {
|
||||||
fmt.Fprintf(&b, "| %d | %s | %.2f | %.1f%% |\n",
|
scalRows = append(scalRows, []string{
|
||||||
step.StepIndex, joinIndexList(step.GPUIndices), step.TotalSyntheticTOPS, step.ScalabilityPct)
|
fmt.Sprintf("%d", step.StepIndex),
|
||||||
|
joinIndexList(step.GPUIndices),
|
||||||
|
fmt.Sprintf("%.2f", step.TotalSyntheticTOPS),
|
||||||
|
fmt.Sprintf("%.1f%%", step.ScalabilityPct),
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
b.WriteString(fmtMDTable([]string{"k GPUs", "GPU Indices", "Total Synthetic TOPS", "Scalability"}, scalRows))
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
75
audit/internal/platform/benchmark_table.go
Normal file
75
audit/internal/platform/benchmark_table.go
Normal file
@@ -0,0 +1,75 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// fmtMDTable renders a markdown table whose columns are padded to a common
// width, so the table is also readable as plain text without a markdown
// renderer.
//
// headers holds the column titles and fixes the number of columns; an empty
// headers slice yields "". rows holds the data rows: a row with fewer cells
// than headers is completed with empty cells, and cells beyond len(headers)
// are ignored.
//
// Widths are measured in runes, not bytes, so non-ASCII cell text (for example
// "°C" or "—") does not push the column borders out of alignment. Runes are
// still only an approximation of display width: double-width glyphs and
// combining marks are each counted as one column.
func fmtMDTable(headers []string, rows [][]string) string {
	ncols := len(headers)
	if ncols == 0 {
		return ""
	}

	// cellWidth counts runes; written as len([]rune(s)) so no extra import is needed.
	cellWidth := func(s string) int { return len([]rune(s)) }

	// Compute the widest cell per column, starting from the header text.
	widths := make([]int, ncols)
	for i, h := range headers {
		widths[i] = cellWidth(h)
	}
	for _, row := range rows {
		for i := 0; i < ncols && i < len(row); i++ {
			if w := cellWidth(row[i]); w > widths[i] {
				widths[i] = w
			}
		}
	}

	var b strings.Builder

	// writeRow emits one "| a | b |" line, padding every cell to its column
	// width. Missing trailing cells are rendered as empty.
	writeRow := func(cells []string) {
		b.WriteByte('|')
		for i := 0; i < ncols; i++ {
			cell := ""
			if i < len(cells) {
				cell = cells[i]
			}
			b.WriteByte(' ')
			b.WriteString(cell)
			// Never negative: widths[i] is the maximum of cellWidth over this column.
			b.WriteString(strings.Repeat(" ", widths[i]-cellWidth(cell)))
			b.WriteString(" |")
		}
		b.WriteByte('\n')
	}

	// Header row.
	writeRow(headers)

	// Separator row: the column width plus one space of padding on each side.
	b.WriteByte('|')
	for _, w := range widths {
		b.WriteString(strings.Repeat("-", w+2))
		b.WriteByte('|')
	}
	b.WriteByte('\n')

	// Data rows.
	for _, row := range rows {
		writeRow(row)
	}

	return b.String()
}
|
||||||
@@ -52,7 +52,7 @@ const (
|
|||||||
// - BenchmarkEstimatedPerfStabilitySec: xFusion v8.22 ramp 1-8: 5532 s
|
// - BenchmarkEstimatedPerfStabilitySec: xFusion v8.22 ramp 1-8: 5532 s
|
||||||
// - BenchmarkEstimatedPerfOvernightSec: derived from profile phases (SteadySec=27000)
|
// - BenchmarkEstimatedPerfOvernightSec: derived from profile phases (SteadySec=27000)
|
||||||
// - BenchmarkEstimatedPowerStandardSec: MLT v8.22 ramp 1-4: 2663 s; MSI v8.22 ramp 1-8: 2375 s
|
// - BenchmarkEstimatedPowerStandardSec: MLT v8.22 ramp 1-4: 2663 s; MSI v8.22 ramp 1-8: 2375 s
|
||||||
// - BenchmarkEstimatedPowerStabilitySec: xFusion v8.17/v8.22 ramp 1-8: 1977-2002 s
|
// - BenchmarkEstimatedPowerStabilitySec: target ~90 min with calibDurationSec=300 (8 GPU × ~2-3 attempts)
|
||||||
const (
|
const (
|
||||||
// Performance Benchmark (bee-gpu-burn).
|
// Performance Benchmark (bee-gpu-burn).
|
||||||
// Duration is per full ramp-up run (ramp 1→N) or per single parallel run.
|
// Duration is per full ramp-up run (ramp 1→N) or per single parallel run.
|
||||||
@@ -64,7 +64,7 @@ const (
|
|||||||
// Power / Thermal Fit (dcgmi targeted_power binary-search calibration).
|
// Power / Thermal Fit (dcgmi targeted_power binary-search calibration).
|
||||||
// Duration is for the full ramp-up run; individual steps vary with convergence speed.
|
// Duration is for the full ramp-up run; individual steps vary with convergence speed.
|
||||||
BenchmarkEstimatedPowerStandardSec = 2600 // ~43 min; ramp 1-4: 2663 s, ramp 1-8: 2375 s
|
BenchmarkEstimatedPowerStandardSec = 2600 // ~43 min; ramp 1-4: 2663 s, ramp 1-8: 2375 s
|
||||||
BenchmarkEstimatedPowerStabilitySec = 2000 // ~33 min; stability profile converges faster (longer steady → faster convergence)
|
BenchmarkEstimatedPowerStabilitySec = 5400 // ~90 min; calibDurationSec=300 × 8 GPU × ~2-3 attempts
|
||||||
BenchmarkEstimatedPowerOvernightSec = 3 * 3600
|
BenchmarkEstimatedPowerOvernightSec = 3 * 3600
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -107,6 +107,10 @@ type NvidiaBenchmarkResult struct {
|
|||||||
GPUs []BenchmarkGPUResult `json:"gpus"`
|
GPUs []BenchmarkGPUResult `json:"gpus"`
|
||||||
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
||||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||||
|
// PSUIssues holds power supply fault events detected by comparing IPMI PSU
|
||||||
|
// sensor states before and after the benchmark run. Empty when IPMI is
|
||||||
|
// unavailable or no PSU faults occurred during the test.
|
||||||
|
PSUIssues []string `json:"psu_issues,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type BenchmarkNormalization struct {
|
type BenchmarkNormalization struct {
|
||||||
@@ -271,18 +275,55 @@ type BenchmarkScorecard struct {
|
|||||||
TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
|
TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// BenchmarkServerPower captures server-side power via IPMI alongside GPU-reported
|
// BenchmarkPSUSlotPower holds SDR power readings for one PSU slot sampled
|
||||||
// power. The reporting_ratio (delta / gpu_reported_sum) near 1.0 means GPU power
|
// during the benchmark. Slot keys match audit HardwarePowerSupply.Slot (0-based)
|
||||||
// telemetry is accurate; a ratio well below 1.0 (e.g. 0.5) means the GPU is
|
// so benchmark and audit data can be correlated by slot.
|
||||||
// over-reporting its power consumption.
|
type BenchmarkPSUSlotPower struct {
|
||||||
|
InputW *float64 `json:"input_w,omitempty"` // AC wall input (PSUx_POWER_IN)
|
||||||
|
OutputW *float64 `json:"output_w,omitempty"` // DC output (PSUx_POWER_OUT)
|
||||||
|
Status string `json:"status,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// BenchmarkServerPower captures server-side power from multiple independent
|
||||||
|
// sources: IPMI DCMI (high-level), IPMI SDR per-PSU sensors (granular), and
|
||||||
|
// GPU-reported power (nvidia-smi). Cross-comparing sources detects when DCMI
|
||||||
|
// covers only a subset of installed PSUs (partial coverage).
|
||||||
|
//
|
||||||
|
// Source legend:
|
||||||
|
// - DCMI — `ipmitool dcmi power reading`; fast but may miss PSUs
|
||||||
|
// - SDR — `ipmitool sdr` PSUx_POWER_IN/OUT; per-PSU, reliable
|
||||||
|
// - nvidia-smi — GPU self-reported via internal shunt; accurate for GPU load
|
||||||
type BenchmarkServerPower struct {
|
type BenchmarkServerPower struct {
|
||||||
Available bool `json:"available"`
|
Available bool `json:"available"`
|
||||||
IdleW float64 `json:"idle_w,omitempty"`
|
IdleW float64 `json:"idle_w,omitempty"` // DCMI at idle
|
||||||
LoadedW float64 `json:"loaded_w,omitempty"`
|
LoadedW float64 `json:"loaded_w,omitempty"` // DCMI at peak load
|
||||||
DeltaW float64 `json:"delta_w,omitempty"`
|
DeltaW float64 `json:"delta_w,omitempty"` // DCMI loaded − idle
|
||||||
GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"`
|
GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"`
|
||||||
ReportingRatio float64 `json:"reporting_ratio,omitempty"`
|
ReportingRatio float64 `json:"reporting_ratio,omitempty"`
|
||||||
Notes []string `json:"notes,omitempty"`
|
|
||||||
|
// PSU AC input sum — sampled at idle and at peak load using collector's
|
||||||
|
// slot patterns (PSU1_POWER_IN, PSU1_PIN, PS1 POut, Power1…).
|
||||||
|
PSUInputIdleW float64 `json:"psu_input_idle_w,omitempty"`
|
||||||
|
PSUInputLoadedW float64 `json:"psu_input_loaded_w,omitempty"`
|
||||||
|
|
||||||
|
// PSU DC output sum — power delivered to server internals after conversion.
|
||||||
|
PSUOutputIdleW float64 `json:"psu_output_idle_w,omitempty"`
|
||||||
|
PSUOutputLoadedW float64 `json:"psu_output_loaded_w,omitempty"`
|
||||||
|
|
||||||
|
// Per-slot PSU readings at idle and at peak load.
|
||||||
|
// Keys are 0-based slot strings matching audit HardwarePowerSupply.Slot.
|
||||||
|
PSUSlotReadingsIdle map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_idle,omitempty"`
|
||||||
|
PSUSlotReadingsLoaded map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_loaded,omitempty"`
|
||||||
|
|
||||||
|
// GPUSlotTotalW is the sum of GPU_POWER_SLOTx SDR sensors at peak load.
|
||||||
|
// PCIe slot delivery only (excludes 16-pin connector power).
|
||||||
|
GPUSlotTotalW float64 `json:"gpu_slot_total_w,omitempty"`
|
||||||
|
|
||||||
|
// DCMICoverageRatio = DCMI_idle / SDR_PSU_IN_idle.
|
||||||
|
// Near 1.0 → DCMI tracks all PSUs. Near 0.5 → DCMI tracks half the PSUs.
|
||||||
|
DCMICoverageRatio float64 `json:"dcmi_coverage_ratio,omitempty"`
|
||||||
|
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// BenchmarkPrecisionSteadyPhase holds per-precision-category telemetry collected
|
// BenchmarkPrecisionSteadyPhase holds per-precision-category telemetry collected
|
||||||
@@ -333,6 +374,10 @@ type NvidiaPowerBenchResult struct {
|
|||||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||||
Findings []string `json:"findings,omitempty"`
|
Findings []string `json:"findings,omitempty"`
|
||||||
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
|
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
|
||||||
|
// PSUIssues holds power supply fault events detected by comparing IPMI PSU
|
||||||
|
// sensor states before and after the power benchmark run. Empty when IPMI is
|
||||||
|
// unavailable or no PSU faults occurred during the test.
|
||||||
|
PSUIssues []string `json:"psu_issues,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type NvidiaPowerBenchGPU struct {
|
type NvidiaPowerBenchGPU struct {
|
||||||
@@ -363,6 +408,9 @@ type NvidiaPowerBenchGPU struct {
|
|||||||
// Telemetry holds the aggregated stats from the final converged calibration
|
// Telemetry holds the aggregated stats from the final converged calibration
|
||||||
// attempt for this GPU (temperature, power, fan, clock percentiles).
|
// attempt for this GPU (temperature, power, fan, clock percentiles).
|
||||||
Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"`
|
Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"`
|
||||||
|
// Fan state sampled at the end of single-card calibration.
|
||||||
|
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||||
|
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type NvidiaPowerBenchStep struct {
|
type NvidiaPowerBenchStep struct {
|
||||||
@@ -381,6 +429,13 @@ type NvidiaPowerBenchStep struct {
|
|||||||
// ramp step's calibration run. ServerDeltaW = ServerLoadedW − idle.
|
// ramp step's calibration run. ServerDeltaW = ServerLoadedW − idle.
|
||||||
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
|
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
|
||||||
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
|
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
|
||||||
|
// PSU slot readings sampled at end of this ramp step.
|
||||||
|
PSUSlotReadings map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings,omitempty"`
|
||||||
|
// Fan state at end of this ramp step.
|
||||||
|
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||||
|
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||||
|
// Per-GPU telemetry from this step's calibration, keyed by GPU index.
|
||||||
|
PerGPUTelemetry map[int]*BenchmarkTelemetrySummary `json:"per_gpu_telemetry,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// NvidiaPerformanceRampStep holds per-step performance data for the
|
// NvidiaPerformanceRampStep holds per-step performance data for the
|
||||||
|
|||||||
@@ -1,11 +1,14 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"log/slog"
|
||||||
"os"
|
"os"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"syscall"
|
"syscall"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
// workerPatterns are substrings matched against /proc/<pid>/cmdline to identify
|
// workerPatterns are substrings matched against /proc/<pid>/cmdline to identify
|
||||||
@@ -30,7 +33,12 @@ type KilledProcess struct {
|
|||||||
// KillTestWorkers scans /proc for running test worker processes and sends
|
// KillTestWorkers scans /proc for running test worker processes and sends
|
||||||
// SIGKILL to each one found. It returns a list of killed processes.
|
// SIGKILL to each one found. It returns a list of killed processes.
|
||||||
// Errors for individual processes (e.g. already exited) are silently ignored.
|
// Errors for individual processes (e.g. already exited) are silently ignored.
|
||||||
|
// The scan runs under a 5-second deadline to avoid blocking if the process
|
||||||
|
// table is very large (e.g. after a stress test with thousands of children).
|
||||||
func KillTestWorkers() []KilledProcess {
|
func KillTestWorkers() []KilledProcess {
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
entries, err := os.ReadDir("/proc")
|
entries, err := os.ReadDir("/proc")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil
|
return nil
|
||||||
@@ -38,6 +46,13 @@ func KillTestWorkers() []KilledProcess {
|
|||||||
|
|
||||||
var killed []KilledProcess
|
var killed []KilledProcess
|
||||||
for _, e := range entries {
|
for _, e := range entries {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
slog.Warn("KillTestWorkers scan timed out", "killed_so_far", len(killed))
|
||||||
|
return killed
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
|
||||||
if !e.IsDir() {
|
if !e.IsDir() {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -178,16 +178,20 @@ func TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks(t *testing.T
|
|||||||
}
|
}
|
||||||
globalQueue.mu.Lock()
|
globalQueue.mu.Lock()
|
||||||
defer globalQueue.mu.Unlock()
|
defer globalQueue.mu.Unlock()
|
||||||
if len(globalQueue.tasks) != 3 {
|
// Ramp-up mode creates a single task that handles the 1→N GPU ramp internally
|
||||||
t.Fatalf("tasks=%d want 3", len(globalQueue.tasks))
|
// (spawning N separate tasks would redundantly repeat all earlier ramp steps).
|
||||||
|
if len(globalQueue.tasks) != 1 {
|
||||||
|
t.Fatalf("tasks=%d want 1 (ramp-up uses single task)", len(globalQueue.tasks))
|
||||||
}
|
}
|
||||||
for i, task := range globalQueue.tasks {
|
task := globalQueue.tasks[0]
|
||||||
if task.Target != "nvidia-bench-power" {
|
if task.Target != "nvidia-bench-power" {
|
||||||
t.Fatalf("task[%d] target=%q", i, task.Target)
|
t.Fatalf("task target=%q want nvidia-bench-power", task.Target)
|
||||||
}
|
}
|
||||||
if task.Priority != taskPriorityBenchmark {
|
if task.Priority != taskPriorityBenchmark {
|
||||||
t.Fatalf("task[%d] priority=%d want %d", i, task.Priority, taskPriorityBenchmark)
|
t.Fatalf("task priority=%d want %d", task.Priority, taskPriorityBenchmark)
|
||||||
}
|
}
|
||||||
|
if task.params.RampTotal != 3 {
|
||||||
|
t.Fatalf("task RampTotal=%d want 3", task.params.RampTotal)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,9 @@
|
|||||||
package webui
|
package webui
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bufio"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
@@ -17,6 +20,25 @@ type jobState struct {
|
|||||||
cancel func() // optional cancel function; nil if job is not cancellable
|
cancel func() // optional cancel function; nil if job is not cancellable
|
||||||
logPath string
|
logPath string
|
||||||
serialPrefix string
|
serialPrefix string
|
||||||
|
logFile *os.File // kept open for the task lifetime to avoid per-line open/close
|
||||||
|
logBuf *bufio.Writer
|
||||||
|
}
|
||||||
|
|
||||||
|
// readTaskLogFile loads the task log at path into memory, accepting at most
// 50 MB. One byte beyond the cap is requested from the file so that an
// oversized log is detected and reported as an error instead of being
// silently truncated.
func readTaskLogFile(path string) ([]byte, error) {
	const maxTaskLogBytes = 50 << 20

	src, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer src.Close()

	content, err := io.ReadAll(io.LimitReader(src, maxTaskLogBytes+1))
	switch {
	case err != nil:
		return nil, err
	case int64(len(content)) > maxTaskLogBytes:
		return nil, fmt.Errorf("task log %s too large (exceeds 50 MB)", path)
	}
	return content, nil
}
|
||||||
|
|
||||||
// abort cancels the job if it has a cancel function and is not yet done.
|
// abort cancels the job if it has a cancel function and is not yet done.
|
||||||
@@ -35,7 +57,7 @@ func (j *jobState) append(line string) {
|
|||||||
defer j.mu.Unlock()
|
defer j.mu.Unlock()
|
||||||
j.lines = append(j.lines, line)
|
j.lines = append(j.lines, line)
|
||||||
if j.logPath != "" {
|
if j.logPath != "" {
|
||||||
appendJobLog(j.logPath, line)
|
j.writeLogLineLocked(line)
|
||||||
}
|
}
|
||||||
if j.serialPrefix != "" {
|
if j.serialPrefix != "" {
|
||||||
taskSerialWriteLine(j.serialPrefix + line)
|
taskSerialWriteLine(j.serialPrefix + line)
|
||||||
@@ -48,6 +70,35 @@ func (j *jobState) append(line string) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// writeLogLineLocked writes a line to the persistent log file, opening it lazily.
|
||||||
|
// Must be called with j.mu held. Uses a buffered writer kept open for the task
|
||||||
|
// lifetime — avoids thousands of open/close syscalls during high-frequency logs.
|
||||||
|
func (j *jobState) writeLogLineLocked(line string) {
|
||||||
|
if j.logFile == nil {
|
||||||
|
f, err := os.OpenFile(j.logPath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
j.logFile = f
|
||||||
|
j.logBuf = bufio.NewWriterSize(f, 64*1024)
|
||||||
|
}
|
||||||
|
_, _ = j.logBuf.WriteString(line + "\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
// closeLog flushes and closes the log file. Called after all task output is done.
|
||||||
|
func (j *jobState) closeLog() {
|
||||||
|
j.mu.Lock()
|
||||||
|
defer j.mu.Unlock()
|
||||||
|
if j.logBuf != nil {
|
||||||
|
_ = j.logBuf.Flush()
|
||||||
|
}
|
||||||
|
if j.logFile != nil {
|
||||||
|
_ = j.logFile.Close()
|
||||||
|
j.logFile = nil
|
||||||
|
j.logBuf = nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func (j *jobState) finish(errMsg string) {
|
func (j *jobState) finish(errMsg string) {
|
||||||
j.mu.Lock()
|
j.mu.Lock()
|
||||||
defer j.mu.Unlock()
|
defer j.mu.Unlock()
|
||||||
@@ -119,7 +170,7 @@ func newTaskJobState(logPath string, serialPrefix ...string) *jobState {
|
|||||||
if logPath == "" {
|
if logPath == "" {
|
||||||
return j
|
return j
|
||||||
}
|
}
|
||||||
data, err := os.ReadFile(logPath)
|
data, err := readTaskLogFile(logPath)
|
||||||
if err != nil || len(data) == 0 {
|
if err != nil || len(data) == 0 {
|
||||||
return j
|
return j
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -161,6 +161,56 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
|||||||
return tx.Commit()
|
return tx.Commit()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Downsample reduces density of old metrics rows to 1 sample per minute.
|
||||||
|
// Only rows in the half-open window [deleteOlderThan, downsampleBefore) are
|
||||||
|
// affected — rows newer than downsampleBefore keep full 5-second resolution.
|
||||||
|
// For each 60-second bucket the row with the smallest ts is kept; the rest
|
||||||
|
// are deleted. This trims ~92 % of rows in that window while preserving
|
||||||
|
// the overall shape of every chart.
|
||||||
|
//
|
||||||
|
// Called hourly by the metrics collector background goroutine.
|
||||||
|
func (m *MetricsDB) Downsample(downsampleBefore, deleteOlderThan time.Time) error {
|
||||||
|
if m == nil || m.db == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
start := deleteOlderThan.Unix()
|
||||||
|
end := downsampleBefore.Unix()
|
||||||
|
if end <= start {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
// For each table: delete rows in [start, end) whose ts is NOT the minimum
|
||||||
|
// ts in its 60-second bucket (ts/60 integer division = bucket ID).
|
||||||
|
for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
|
||||||
|
_, err := m.db.Exec(`
|
||||||
|
DELETE FROM `+table+` WHERE ts >= ? AND ts < ?
|
||||||
|
AND ts NOT IN (
|
||||||
|
SELECT MIN(ts) FROM `+table+`
|
||||||
|
WHERE ts >= ? AND ts < ?
|
||||||
|
GROUP BY ts / 60
|
||||||
|
)`, start, end, start, end)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prune deletes all rows older than the given cutoff from every metrics table.
|
||||||
|
// Called hourly by the metrics collector to keep the DB size bounded.
|
||||||
|
func (m *MetricsDB) Prune(before time.Time) error {
|
||||||
|
if m == nil || m.db == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
cutTS := before.Unix()
|
||||||
|
for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
|
||||||
|
if _, err := m.db.Exec("DELETE FROM "+table+" WHERE ts < ?", cutTS); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_, _ = m.db.Exec("PRAGMA wal_checkpoint(TRUNCATE)")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
// LoadRecent returns up to n samples in chronological order (oldest first).
|
// LoadRecent returns up to n samples in chronological order (oldest first).
|
||||||
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
||||||
|
|||||||
@@ -2385,7 +2385,7 @@ function benchmarkRefreshResults() {
|
|||||||
func renderBenchmarkResultsCard(exportDir string) string {
|
func renderBenchmarkResultsCard(exportDir string) string {
|
||||||
maxIdx, runs := loadBenchmarkHistory(exportDir)
|
maxIdx, runs := loadBenchmarkHistory(exportDir)
|
||||||
perf := renderBenchmarkResultsCardFromRuns(
|
perf := renderBenchmarkResultsCardFromRuns(
|
||||||
"Performance Results",
|
"Perf Results",
|
||||||
"Composite score by saved benchmark run and GPU.",
|
"Composite score by saved benchmark run and GPU.",
|
||||||
"No saved performance benchmark runs yet.",
|
"No saved performance benchmark runs yet.",
|
||||||
maxIdx,
|
maxIdx,
|
||||||
|
|||||||
@@ -135,6 +135,14 @@ type namedMetricsRing struct {
|
|||||||
// At metricsCollectInterval = 5 s this covers 30 minutes of live history.
|
// At metricsCollectInterval = 5 s this covers 30 minutes of live history.
|
||||||
const metricsChartWindow = 360
|
const metricsChartWindow = 360
|
||||||
|
|
||||||
|
// metricsDownsampleAge is the age after which old metrics rows are downsampled
|
||||||
|
// to 1 sample per minute. Data fresher than this is kept at full resolution.
|
||||||
|
const metricsDownsampleAge = 2 * time.Hour
|
||||||
|
|
||||||
|
// metricsRetainWindow is the total retention period for metrics rows.
|
||||||
|
// Rows older than this are deleted entirely by the background compactor.
|
||||||
|
const metricsRetainWindow = 48 * time.Hour
|
||||||
|
|
||||||
var metricsCollectInterval = 5 * time.Second
|
var metricsCollectInterval = 5 * time.Second
|
||||||
|
|
||||||
// pendingNetChange tracks a network state change awaiting confirmation.
|
// pendingNetChange tracks a network state change awaiting confirmation.
|
||||||
@@ -335,13 +343,24 @@ func (h *handler) startMetricsCollector() {
|
|||||||
goRecoverLoop("metrics collector", 2*time.Second, func() {
|
goRecoverLoop("metrics collector", 2*time.Second, func() {
|
||||||
ticker := time.NewTicker(metricsCollectInterval)
|
ticker := time.NewTicker(metricsCollectInterval)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
for range ticker.C {
|
pruneTicker := time.NewTicker(time.Hour)
|
||||||
sample := platform.SampleLiveMetrics()
|
defer pruneTicker.Stop()
|
||||||
if h.metricsDB != nil {
|
for {
|
||||||
_ = h.metricsDB.Write(sample)
|
select {
|
||||||
|
case <-ticker.C:
|
||||||
|
sample := platform.SampleLiveMetrics()
|
||||||
|
if h.metricsDB != nil {
|
||||||
|
_ = h.metricsDB.Write(sample)
|
||||||
|
}
|
||||||
|
h.feedRings(sample)
|
||||||
|
h.setLatestMetric(sample)
|
||||||
|
case <-pruneTicker.C:
|
||||||
|
if h.metricsDB != nil {
|
||||||
|
now := time.Now().UTC()
|
||||||
|
_ = h.metricsDB.Downsample(now.Add(-metricsDownsampleAge), now.Add(-metricsRetainWindow))
|
||||||
|
_ = h.metricsDB.Prune(now.Add(-metricsRetainWindow))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
h.feedRings(sample)
|
|
||||||
h.setLatestMetric(sample)
|
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -7,14 +7,43 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
recoverLoopMaxDelay = 60 * time.Second
|
||||||
|
recoverLoopResetAfter = 30 * time.Second
|
||||||
|
)
|
||||||
|
|
||||||
|
// goRecoverLoop starts fn in a goroutine, restarting after panics.
|
||||||
|
// restartDelay is the initial delay; successive panics double it up to
|
||||||
|
// recoverLoopMaxDelay. The delay resets to restartDelay once fn runs
|
||||||
|
// successfully for recoverLoopResetAfter without panicking.
|
||||||
func goRecoverLoop(name string, restartDelay time.Duration, fn func()) {
|
func goRecoverLoop(name string, restartDelay time.Duration, fn func()) {
|
||||||
go func() {
|
go func() {
|
||||||
|
delay := restartDelay
|
||||||
|
consecutive := 0
|
||||||
for {
|
for {
|
||||||
if !runRecoverable(name, fn) {
|
start := time.Now()
|
||||||
|
panicked := runRecoverable(name, fn)
|
||||||
|
if !panicked {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if restartDelay > 0 {
|
consecutive++
|
||||||
time.Sleep(restartDelay)
|
if time.Since(start) >= recoverLoopResetAfter {
|
||||||
|
delay = restartDelay
|
||||||
|
consecutive = 1
|
||||||
|
}
|
||||||
|
slog.Warn("goroutine restarting after panic",
|
||||||
|
"component", name,
|
||||||
|
"consecutive_panics", consecutive,
|
||||||
|
"next_delay", delay,
|
||||||
|
)
|
||||||
|
if delay > 0 {
|
||||||
|
time.Sleep(delay)
|
||||||
|
}
|
||||||
|
if delay < recoverLoopMaxDelay {
|
||||||
|
delay *= 2
|
||||||
|
if delay > recoverLoopMaxDelay {
|
||||||
|
delay = recoverLoopMaxDelay
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|||||||
@@ -585,6 +585,7 @@ func (q *taskQueue) finalizeTaskRun(t *Task, j *jobState) {
|
|||||||
if err := writeTaskReportArtifacts(t); err != nil {
|
if err := writeTaskReportArtifacts(t); err != nil {
|
||||||
appendJobLog(t.LogPath, "WARN: task report generation failed: "+err.Error())
|
appendJobLog(t.LogPath, "WARN: task report generation failed: "+err.Error())
|
||||||
}
|
}
|
||||||
|
j.closeLog()
|
||||||
if t.ErrMsg != "" {
|
if t.ErrMsg != "" {
|
||||||
taskSerialEvent(t, "finished with status="+t.Status+" error="+t.ErrMsg)
|
taskSerialEvent(t, "finished with status="+t.Status+" error="+t.ErrMsg)
|
||||||
return
|
return
|
||||||
|
|||||||
@@ -110,8 +110,12 @@ nvidia-smi / lspci (audit collection)
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## What Needs Fixing
|
## Fixed Issues
|
||||||
|
|
||||||
1. **NVIDIA PCIe Model** — `enrichPCIeWithNVIDIAData()` should set `dev.Model = &gpu.Name`
|
All previously open items are resolved:
|
||||||
2. **Fallback consistency** — `benchmark_report.go:93` should say `"Unknown GPU"` not `"Unknown"`; `sat.go:922` should say `"Unknown GPU"` not `"unknown"`
|
|
||||||
3. **Old benchmark JSONs** — no fix possible for already-saved results with missing names (display-only issue)
|
1. **NVIDIA PCIe Model** — `enrichPCIeWithNVIDIAData()` sets `dev.Model = &v` (`nvidia.go:78`).
|
||||||
|
2. **Fallback consistency** — `sat.go` and `benchmark_report.go` both use `"Unknown GPU"`.
|
||||||
|
3. **`tops_per_sm_per_ghz`** — computed in `benchmark.go` and stored in `BenchmarkGPUScore.TOPSPerSMPerGHz`.
|
||||||
|
4. **`MultiprocessorCount`, `PowerLimitW`, `DefaultPowerLimitW`** — present in `benchmark_types.go`.
|
||||||
|
5. **Old benchmark JSONs** — no fix possible for already-saved results with missing names (display-only issue).
|
||||||
|
|||||||
@@ -203,7 +203,7 @@ dump_memtest_debug() {
|
|||||||
|
|
||||||
echo "-- source bootloader templates --"
|
echo "-- source bootloader templates --"
|
||||||
for cfg in \
|
for cfg in \
|
||||||
"${BUILDER_DIR}/config/bootloaders/grub-pc/grub.cfg" \
|
"${BUILDER_DIR}/config/bootloaders/grub-efi/grub.cfg" \
|
||||||
"${BUILDER_DIR}/config/bootloaders/isolinux/live.cfg.in"; do
|
"${BUILDER_DIR}/config/bootloaders/isolinux/live.cfg.in"; do
|
||||||
if [ -f "$cfg" ]; then
|
if [ -f "$cfg" ]; then
|
||||||
echo " file: $cfg"
|
echo " file: $cfg"
|
||||||
@@ -954,86 +954,6 @@ elif [ -d "${LB_PKG_CACHE}" ] && [ "$(ls -A "${LB_PKG_CACHE}" 2>/dev/null)" ]; t
|
|||||||
rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/"
|
rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "$BEE_GPU_VENDOR" != "nvidia" ] || [ "$BEE_NVIDIA_MODULE_FLAVOR" != "proprietary" ]; then
|
|
||||||
cat > "${BUILD_WORK_DIR}/config/bootloaders/grub-pc/grub.cfg" <<'EOF'
|
|
||||||
source /boot/grub/config.cfg
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
echo " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗"
|
|
||||||
echo " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝"
|
|
||||||
echo " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗"
|
|
||||||
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
|
|
||||||
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
|
|
||||||
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
|
|
||||||
echo " Hardware Audit LiveCD"
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
menuentry "EASY-BEE" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
submenu "EASY-BEE (advanced options) -->" {
|
|
||||||
menuentry "EASY-BEE — KMS (no nomodeset)" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE — fail-safe" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if [ "${grub_platform}" = "efi" ]; then
|
|
||||||
menuentry "Memory Test (memtest86+)" {
|
|
||||||
chainloader /boot/memtest86+x64.efi
|
|
||||||
}
|
|
||||||
else
|
|
||||||
menuentry "Memory Test (memtest86+)" {
|
|
||||||
linux16 /boot/memtest86+x64.bin
|
|
||||||
}
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ "${grub_platform}" = "efi" ]; then
|
|
||||||
menuentry "UEFI Firmware Settings" {
|
|
||||||
fwsetup
|
|
||||||
}
|
|
||||||
fi
|
|
||||||
EOF
|
|
||||||
|
|
||||||
cat > "${BUILD_WORK_DIR}/config/bootloaders/isolinux/live.cfg.in" <<'EOF'
|
|
||||||
label live-@FLAVOUR@-normal
|
|
||||||
menu label ^EASY-BEE
|
|
||||||
menu default
|
|
||||||
linux @LINUX@
|
|
||||||
initrd @INITRD@
|
|
||||||
append @APPEND_LIVE@
|
|
||||||
|
|
||||||
label live-@FLAVOUR@-kms
|
|
||||||
menu label EASY-BEE (^graphics/KMS)
|
|
||||||
linux @LINUX@
|
|
||||||
initrd @INITRD@
|
|
||||||
append @APPEND_LIVE@ bee.display=kms
|
|
||||||
|
|
||||||
label live-@FLAVOUR@-toram
|
|
||||||
menu label EASY-BEE (^load to RAM)
|
|
||||||
linux @LINUX@
|
|
||||||
initrd @INITRD@
|
|
||||||
append @APPEND_LIVE@ toram
|
|
||||||
|
|
||||||
label live-@FLAVOUR@-failsafe
|
|
||||||
menu label EASY-BEE (^fail-safe)
|
|
||||||
linux @LINUX@
|
|
||||||
initrd @INITRD@
|
|
||||||
append @APPEND_LIVE@ memtest noapic noapm nodma nomce nolapic nosmp vga=normal
|
|
||||||
|
|
||||||
label memtest
|
|
||||||
menu label ^Memory Test (memtest86+)
|
|
||||||
linux /boot/memtest86+x64.bin
|
|
||||||
EOF
|
|
||||||
fi
|
|
||||||
|
|
||||||
rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/"
|
rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/"
|
||||||
rm -f \
|
rm -f \
|
||||||
"${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \
|
"${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \
|
||||||
|
|||||||
BIN
iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.png
Normal file
BIN
iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 70 KiB |
@@ -5,6 +5,15 @@ title-text: ""
|
|||||||
message-font: "Unifont Regular 16"
|
message-font: "Unifont Regular 16"
|
||||||
terminal-font: "Unifont Regular 16"
|
terminal-font: "Unifont Regular 16"
|
||||||
|
|
||||||
|
#bee logo — centered, upper third of screen
|
||||||
|
+ image {
|
||||||
|
top = 4%
|
||||||
|
left = 50%-200
|
||||||
|
width = 400
|
||||||
|
height = 400
|
||||||
|
file = "bee-logo.png"
|
||||||
|
}
|
||||||
|
|
||||||
#help bar at the bottom
|
#help bar at the bottom
|
||||||
+ label {
|
+ label {
|
||||||
top = 100%-50
|
top = 100%-50
|
||||||
@@ -21,8 +30,8 @@ terminal-font: "Unifont Regular 16"
|
|||||||
+ boot_menu {
|
+ boot_menu {
|
||||||
left = 20%
|
left = 20%
|
||||||
width = 60%
|
width = 60%
|
||||||
top = 62%
|
top = 65%
|
||||||
height = 38%-80
|
height = 35%-80
|
||||||
item_color = "#c88000"
|
item_color = "#c88000"
|
||||||
item_font = "Unifont Regular 16"
|
item_font = "Unifont Regular 16"
|
||||||
selected_item_color= "#f5a800"
|
selected_item_color= "#f5a800"
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ RestartSec=3
|
|||||||
StandardOutput=journal
|
StandardOutput=journal
|
||||||
StandardError=journal
|
StandardError=journal
|
||||||
LimitMEMLOCK=infinity
|
LimitMEMLOCK=infinity
|
||||||
|
MemoryMax=3G
|
||||||
# Keep the web server responsive during GPU/CPU stress (children inherit nice+10
|
# Keep the web server responsive during GPU/CPU stress (children inherit nice+10
|
||||||
# via Setpriority in runCmdJob, but the bee-web parent stays at 0).
|
# via Setpriority in runCmdJob, but the bee-web parent stays at 0).
|
||||||
Nice=0
|
Nice=0
|
||||||
|
|||||||
Reference in New Issue
Block a user