Compare commits
9 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| b2f8626fee | |||
| dd26e03b2d | |||
| 6937a4c6ec | |||
| b9be93c213 | |||
| d1a22d782d | |||
|
|
0a4bb596f6 | ||
|
|
531d1ca366 | ||
|
|
93cfa78e8c | ||
|
|
1358485f2b |
@@ -382,9 +382,9 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
||||
archive, err = application.RunNvidiaAcceptancePack("", logLine)
|
||||
}
|
||||
case "memory":
|
||||
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", logLine)
|
||||
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", 256, 1, logLine)
|
||||
case "storage":
|
||||
archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", logLine)
|
||||
archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", false, logLine)
|
||||
case "cpu":
|
||||
dur := *duration
|
||||
if dur <= 0 {
|
||||
|
||||
@@ -122,8 +122,10 @@ type satRunner interface {
|
||||
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
|
||||
RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error)
|
||||
ResetNvidiaGPU(index int) (string, error)
|
||||
RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error)
|
||||
RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error)
|
||||
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
||||
DetectGPUVendor() string
|
||||
@@ -521,6 +523,15 @@ func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||
return a.sat.ListNvidiaGPUs()
|
||||
}
|
||||
|
||||
func (a *App) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
|
||||
return a.sat.ListNvidiaGPUStatuses()
|
||||
}
|
||||
|
||||
func (a *App) ResetNvidiaGPU(index int) (ActionResult, error) {
|
||||
out, err := a.sat.ResetNvidiaGPU(index)
|
||||
return ActionResult{Title: fmt.Sprintf("Reset NVIDIA GPU %d", index), Body: strings.TrimSpace(out)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
@@ -591,14 +602,14 @@ func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts p
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, 256, 1, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, logFunc)
|
||||
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, sizeMB, passes, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
@@ -623,14 +634,14 @@ func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (Actio
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, false, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunStorageAcceptancePack(ctx, baseDir, logFunc)
|
||||
return a.sat.RunStorageAcceptancePack(ctx, baseDir, extended, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
|
||||
@@ -135,6 +135,8 @@ type fakeSAT struct {
|
||||
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
||||
runAMDPackFn func(string) (string, error)
|
||||
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
||||
listNvidiaGPUStatusesFn func() ([]platform.NvidiaGPUStatus, error)
|
||||
resetNvidiaGPUFn func(int) (string, error)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
||||
@@ -201,11 +203,25 @@ func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
||||
func (f fakeSAT) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
|
||||
if f.listNvidiaGPUStatusesFn != nil {
|
||||
return f.listNvidiaGPUStatusesFn()
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) ResetNvidiaGPU(index int) (string, error) {
|
||||
if f.resetNvidiaGPUFn != nil {
|
||||
return f.resetNvidiaGPUFn(index)
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _, _ int, _ func(string)) (string, error) {
|
||||
return f.runMemoryFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
||||
func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ bool, _ func(string)) (string, error) {
|
||||
return f.runStorageFn(baseDir)
|
||||
}
|
||||
|
||||
@@ -805,6 +821,9 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
for _, want := range []string{
|
||||
"/system/ip-link.txt",
|
||||
"/system/ip-link-stats.txt",
|
||||
"/system/kernel-aer-nvidia.txt",
|
||||
"/system/lspci-nvidia-bridges-vv.txt",
|
||||
"/system/pcie-aer-sysfs.txt",
|
||||
"/system/ethtool-info.txt",
|
||||
"/system/ethtool-link.txt",
|
||||
"/system/ethtool-module.txt",
|
||||
|
||||
@@ -3,6 +3,7 @@ package app
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
@@ -18,6 +19,7 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *C
|
||||
}
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "gpu-nvidia-"); ok {
|
||||
applyGPUVendorSAT(snap.PCIeDevices, "nvidia", summary)
|
||||
applyNvidiaPerGPUStatus(snap.PCIeDevices, baseDir)
|
||||
}
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "memory-"); ok {
|
||||
applyMemorySAT(snap.Memory, summary)
|
||||
@@ -32,6 +34,100 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *C
|
||||
applyComponentStatusDB(snap, db)
|
||||
}
|
||||
|
||||
type nvidiaPerGPUStatus struct {
|
||||
runStatus string
|
||||
reason string
|
||||
}
|
||||
|
||||
func applyNvidiaPerGPUStatus(devs []schema.HardwarePCIeDevice, baseDir string) {
|
||||
statusByIndex, ts, ok := loadLatestNvidiaPerGPUStatus(baseDir)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
for i := range devs {
|
||||
if devs[i].Telemetry == nil {
|
||||
continue
|
||||
}
|
||||
rawIdx, ok := devs[i].Telemetry["nvidia_gpu_index"]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
idx, ok := telemetryInt(rawIdx)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
st, ok := statusByIndex[idx]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
status, description, ok := satKeyStatus(st.runStatus, firstNonEmpty(strings.TrimSpace(st.reason), "nvidia GPU SAT"))
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
mergeComponentStatusPreferDetail(&devs[i].HardwareComponentStatus, ts, status, description)
|
||||
}
|
||||
}
|
||||
|
||||
func loadLatestNvidiaPerGPUStatus(baseDir string) (map[int]nvidiaPerGPUStatus, string, bool) {
|
||||
matches, err := filepath.Glob(filepath.Join(baseDir, "gpu-nvidia-*"))
|
||||
if err != nil || len(matches) == 0 {
|
||||
return nil, "", false
|
||||
}
|
||||
sort.Strings(matches)
|
||||
runDir := matches[len(matches)-1]
|
||||
summaryRaw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return nil, "", false
|
||||
}
|
||||
summaryKV := parseKeyValueSummary(string(summaryRaw))
|
||||
runAtUTC := strings.TrimSpace(summaryKV["run_at_utc"])
|
||||
files, err := filepath.Glob(filepath.Join(runDir, "gpu-*-status.txt"))
|
||||
if err != nil || len(files) == 0 {
|
||||
return nil, "", false
|
||||
}
|
||||
out := make(map[int]nvidiaPerGPUStatus, len(files))
|
||||
for _, file := range files {
|
||||
raw, err := os.ReadFile(file)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
kv := parseKeyValueSummary(string(raw))
|
||||
idx, err := strconv.Atoi(strings.TrimSpace(kv["gpu_index"]))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
out[idx] = nvidiaPerGPUStatus{
|
||||
runStatus: strings.ToUpper(strings.TrimSpace(kv["run_status"])),
|
||||
reason: strings.TrimSpace(kv["reason"]),
|
||||
}
|
||||
}
|
||||
if len(out) == 0 {
|
||||
return nil, "", false
|
||||
}
|
||||
return out, runAtUTC, true
|
||||
}
|
||||
|
||||
func telemetryInt(v any) (int, bool) {
|
||||
switch value := v.(type) {
|
||||
case int:
|
||||
return value, true
|
||||
case int32:
|
||||
return int(value), true
|
||||
case int64:
|
||||
return int(value), true
|
||||
case float64:
|
||||
return int(value), true
|
||||
case string:
|
||||
n, err := strconv.Atoi(strings.TrimSpace(value))
|
||||
if err != nil {
|
||||
return 0, false
|
||||
}
|
||||
return n, true
|
||||
default:
|
||||
return 0, false
|
||||
}
|
||||
}
|
||||
|
||||
type satSummary struct {
|
||||
runAtUTC string
|
||||
overall string
|
||||
@@ -176,6 +272,31 @@ func mergeComponentStatus(component *schema.HardwareComponentStatus, changedAt,
|
||||
}
|
||||
}
|
||||
|
||||
func mergeComponentStatusPreferDetail(component *schema.HardwareComponentStatus, changedAt, satStatus, description string) {
|
||||
if component == nil || satStatus == "" {
|
||||
return
|
||||
}
|
||||
current := strings.TrimSpace(ptrString(component.Status))
|
||||
newSeverity := statusSeverity(satStatus)
|
||||
currentSeverity := statusSeverity(current)
|
||||
if current == "" || current == "Unknown" || newSeverity > currentSeverity {
|
||||
mergeComponentStatus(component, changedAt, satStatus, description)
|
||||
return
|
||||
}
|
||||
if newSeverity == currentSeverity && strings.TrimSpace(description) != "" {
|
||||
component.Status = appStringPtr(satStatus)
|
||||
component.ErrorDescription = appStringPtr(description)
|
||||
if strings.TrimSpace(changedAt) != "" {
|
||||
component.StatusChangedAt = appStringPtr(changedAt)
|
||||
component.StatusHistory = append(component.StatusHistory, schema.HardwareStatusHistory{
|
||||
Status: satStatus,
|
||||
ChangedAt: changedAt,
|
||||
Details: appStringPtr(description),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func statusSeverity(status string) int {
|
||||
switch strings.TrimSpace(status) {
|
||||
case "Critical":
|
||||
|
||||
@@ -59,3 +59,51 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
|
||||
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyLatestSATStatusesMarksNvidiaGPUByPerGPUStatusFile(t *testing.T) {
|
||||
baseDir := t.TempDir()
|
||||
runDir := filepath.Join(baseDir, "gpu-nvidia-20260407-162123")
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte("run_at_utc=2026-04-07T16:21:23Z\noverall_status=FAILED\n"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(runDir, "gpu-1-status.txt"), []byte("gpu_index=1\ngpu_name=NVIDIA H100 PCIe\nrun_status=FAILED\nreason=GPU requires reset\n"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
class := "VideoController"
|
||||
manufacturer := "NVIDIA Corporation"
|
||||
bdf0 := "0000:4b:00.0"
|
||||
bdf1 := "0000:4f:00.0"
|
||||
snap := schema.HardwareSnapshot{
|
||||
PCIeDevices: []schema.HardwarePCIeDevice{
|
||||
{
|
||||
DeviceClass: &class,
|
||||
Manufacturer: &manufacturer,
|
||||
BDF: &bdf0,
|
||||
Telemetry: map[string]any{"nvidia_gpu_index": 0},
|
||||
},
|
||||
{
|
||||
DeviceClass: &class,
|
||||
Manufacturer: &manufacturer,
|
||||
BDF: &bdf1,
|
||||
Telemetry: map[string]any{"nvidia_gpu_index": 1},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||
|
||||
if snap.PCIeDevices[1].Status == nil || *snap.PCIeDevices[1].Status != "Critical" {
|
||||
t.Fatalf("gpu1 status=%v want Critical", snap.PCIeDevices[1].Status)
|
||||
}
|
||||
if snap.PCIeDevices[1].ErrorDescription == nil || *snap.PCIeDevices[1].ErrorDescription != "GPU requires reset failed" {
|
||||
got := "<nil>"
|
||||
if snap.PCIeDevices[1].ErrorDescription != nil {
|
||||
got = *snap.PCIeDevices[1].ErrorDescription
|
||||
}
|
||||
t.Fatalf("gpu1 error=%q want per-gpu reason", got)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -40,7 +40,36 @@ var supportBundleCommands = []struct {
|
||||
{name: "system/mount.txt", cmd: []string{"mount"}},
|
||||
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
||||
{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
|
||||
{name: "system/kernel-aer-nvidia.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v dmesg >/dev/null 2>&1; then
|
||||
dmesg | grep -iE 'AER|NVRM|Xid|pcieport|nvidia' || echo "no AER/NVRM/Xid kernel messages found"
|
||||
else
|
||||
echo "dmesg not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v lspci >/dev/null 2>&1; then
|
||||
echo "lspci not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for gpu in $(lspci -Dn | awk '$3 ~ /^10de:/ {print $1}'); do
|
||||
found=1
|
||||
echo "=== GPU $gpu ==="
|
||||
lspci -s "$gpu" -vv 2>&1 || true
|
||||
bridge=$(basename "$(readlink -f "/sys/bus/pci/devices/$gpu/.." 2>/dev/null)" 2>/dev/null)
|
||||
if [ -n "$bridge" ] && [ "$bridge" != "$gpu" ]; then
|
||||
echo
|
||||
echo "=== UPSTREAM $bridge for $gpu ==="
|
||||
lspci -s "$bridge" -vv 2>&1 || true
|
||||
fi
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no NVIDIA PCI devices found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
|
||||
for d in /sys/bus/pci/devices/*/; do
|
||||
vendor=$(cat "$d/vendor" 2>/dev/null)
|
||||
@@ -51,6 +80,30 @@ for d in /sys/bus/pci/devices/*/; do
|
||||
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
||||
done
|
||||
done
|
||||
`}},
|
||||
{name: "system/pcie-aer-sysfs.txt", cmd: []string{"sh", "-c", `
|
||||
found=0
|
||||
for dev in /sys/bus/pci/devices/*; do
|
||||
[ -e "$dev" ] || continue
|
||||
bdf=$(basename "$dev")
|
||||
block=""
|
||||
for f in aer_dev_correctable aer_dev_fatal aer_dev_nonfatal aer_rootport_total_err_cor aer_rootport_total_err_fatal aer_rootport_total_err_nonfatal; do
|
||||
if [ -r "$dev/$f" ]; then
|
||||
if [ -z "$block" ]; then
|
||||
block=1
|
||||
found=1
|
||||
echo "=== $bdf ==="
|
||||
fi
|
||||
printf " %-30s %s\n" "$f" "$(cat "$dev/$f" 2>/dev/null)"
|
||||
fi
|
||||
done
|
||||
if [ -n "$block" ]; then
|
||||
echo
|
||||
fi
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no PCIe AER sysfs counters found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v ethtool >/dev/null 2>&1; then
|
||||
|
||||
@@ -13,6 +13,7 @@ import (
|
||||
const nvidiaVendorID = 0x10de
|
||||
|
||||
type nvidiaGPUInfo struct {
|
||||
Index int
|
||||
BDF string
|
||||
Serial string
|
||||
VBIOS string
|
||||
@@ -132,6 +133,7 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
||||
}
|
||||
|
||||
info := nvidiaGPUInfo{
|
||||
Index: parseRequiredInt(rec[0]),
|
||||
BDF: bdf,
|
||||
Serial: strings.TrimSpace(rec[2]),
|
||||
VBIOS: strings.TrimSpace(rec[3]),
|
||||
@@ -187,6 +189,14 @@ func parseMaybeInt(v string) *int {
|
||||
return &n
|
||||
}
|
||||
|
||||
func parseRequiredInt(v string) int {
|
||||
n, err := strconv.Atoi(strings.TrimSpace(v))
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
func pcieLinkGenLabel(gen int) string {
|
||||
return fmt.Sprintf("Gen%d", gen)
|
||||
}
|
||||
@@ -240,6 +250,10 @@ func setPCIeFallback(dev *schema.HardwarePCIeDevice) {
|
||||
}
|
||||
|
||||
func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
||||
if dev.Telemetry == nil {
|
||||
dev.Telemetry = map[string]any{}
|
||||
}
|
||||
dev.Telemetry["nvidia_gpu_index"] = info.Index
|
||||
if info.TemperatureC != nil {
|
||||
dev.TemperatureC = info.TemperatureC
|
||||
}
|
||||
|
||||
@@ -86,6 +86,9 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
||||
if out[0].Firmware == nil || *out[0].Firmware != "96.00.1F.00.02" {
|
||||
t.Fatalf("firmware: got %v", out[0].Firmware)
|
||||
}
|
||||
if out[0].Telemetry == nil || out[0].Telemetry["nvidia_gpu_index"] != 0 {
|
||||
t.Fatalf("telemetry nvidia_gpu_index: got %#v", out[0].Telemetry)
|
||||
}
|
||||
if out[0].Status == nil || *out[0].Status != statusWarning {
|
||||
t.Fatalf("status: got %v", out[0].Status)
|
||||
}
|
||||
|
||||
@@ -105,7 +105,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
BenchmarkVersion: benchmarkVersion,
|
||||
GeneratedAt: time.Now().UTC(),
|
||||
Hostname: hostname,
|
||||
ServerModel: readServerModel(),
|
||||
BenchmarkProfile: spec.Name,
|
||||
ParallelGPUs: opts.ParallelGPUs,
|
||||
SelectedGPUIndices: append([]int(nil), selected...),
|
||||
Normalization: BenchmarkNormalization{
|
||||
Status: "full",
|
||||
@@ -143,6 +145,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
}
|
||||
}()
|
||||
|
||||
if opts.ParallelGPUs {
|
||||
runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
|
||||
} else {
|
||||
|
||||
for _, idx := range selected {
|
||||
gpuResult := BenchmarkGPUResult{
|
||||
Index: idx,
|
||||
@@ -285,6 +291,8 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
|
||||
}
|
||||
|
||||
} // end sequential path
|
||||
|
||||
if len(selected) > 1 && opts.RunNCCL {
|
||||
result.Interconnect = runBenchmarkInterconnect(ctx, verboseLog, runDir, selected, spec, logFunc)
|
||||
if result.Interconnect != nil && result.Interconnect.Supported {
|
||||
@@ -318,8 +326,8 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
}
|
||||
|
||||
report := renderBenchmarkReportWithCharts(result, loadBenchmarkReportCharts(runDir, selected))
|
||||
if err := os.WriteFile(filepath.Join(runDir, "report.txt"), []byte(report), 0644); err != nil {
|
||||
return "", fmt.Errorf("write report.txt: %w", err)
|
||||
if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(report), 0644); err != nil {
|
||||
return "", fmt.Errorf("write report.md: %w", err)
|
||||
}
|
||||
|
||||
summary := renderBenchmarkSummary(result)
|
||||
@@ -362,60 +370,87 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
|
||||
}
|
||||
}
|
||||
|
||||
func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
|
||||
args := []string{
|
||||
"--query-gpu=index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit",
|
||||
"--format=csv,noheader,nounits",
|
||||
}
|
||||
if len(gpuIndices) > 0 {
|
||||
args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...)
|
||||
}
|
||||
out, err := satExecCommand("nvidia-smi", args...).Output()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("nvidia-smi gpu info: %w", err)
|
||||
}
|
||||
|
||||
r := csv.NewReader(strings.NewReader(string(out)))
|
||||
r.TrimLeadingSpace = true
|
||||
r.FieldsPerRecord = -1
|
||||
rows, err := r.ReadAll()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("parse nvidia-smi gpu info: %w", err)
|
||||
}
|
||||
|
||||
infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
|
||||
for _, row := range rows {
|
||||
if len(row) < 9 {
|
||||
continue
|
||||
}
|
||||
idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
info := benchmarkGPUInfo{
|
||||
Index: idx,
|
||||
UUID: strings.TrimSpace(row[1]),
|
||||
Name: strings.TrimSpace(row[2]),
|
||||
BusID: strings.TrimSpace(row[3]),
|
||||
VBIOS: strings.TrimSpace(row[4]),
|
||||
PowerLimitW: parseBenchmarkFloat(row[5]),
|
||||
MaxGraphicsClockMHz: parseBenchmarkFloat(row[6]),
|
||||
MaxMemoryClockMHz: parseBenchmarkFloat(row[7]),
|
||||
}
|
||||
if len(row) >= 9 {
|
||||
info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8])
|
||||
}
|
||||
if len(row) >= 10 {
|
||||
info.MultiprocessorCount = int(parseBenchmarkFloat(row[9]))
|
||||
}
|
||||
if len(row) >= 11 {
|
||||
info.DefaultPowerLimitW = parseBenchmarkFloat(row[10])
|
||||
}
|
||||
infoByIndex[idx] = info
|
||||
}
|
||||
return infoByIndex, nil
|
||||
// benchmarkGPUInfoQuery describes a nvidia-smi --query-gpu field set to try.
|
||||
// Fields are tried in order; the first successful query wins. Extended fields
|
||||
// (attribute.multiprocessor_count, power.default_limit) are not supported on
|
||||
// all driver versions, so we fall back to the base set if the full query fails.
|
||||
var benchmarkGPUInfoQueries = []struct {
|
||||
fields string
|
||||
extended bool // whether this query includes optional extended fields
|
||||
}{
|
||||
{
|
||||
fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit",
|
||||
extended: true,
|
||||
},
|
||||
{
|
||||
fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics",
|
||||
extended: false,
|
||||
},
|
||||
}
|
||||
|
||||
func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
|
||||
var lastErr error
|
||||
for _, q := range benchmarkGPUInfoQueries {
|
||||
args := []string{
|
||||
"--query-gpu=" + q.fields,
|
||||
"--format=csv,noheader,nounits",
|
||||
}
|
||||
if len(gpuIndices) > 0 {
|
||||
args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...)
|
||||
}
|
||||
out, err := satExecCommand("nvidia-smi", args...).Output()
|
||||
if err != nil {
|
||||
lastErr = fmt.Errorf("nvidia-smi gpu info (%s): %w", q.fields[:min(len(q.fields), 40)], err)
|
||||
continue
|
||||
}
|
||||
|
||||
r := csv.NewReader(strings.NewReader(string(out)))
|
||||
r.TrimLeadingSpace = true
|
||||
r.FieldsPerRecord = -1
|
||||
rows, err := r.ReadAll()
|
||||
if err != nil {
|
||||
lastErr = fmt.Errorf("parse nvidia-smi gpu info: %w", err)
|
||||
continue
|
||||
}
|
||||
|
||||
infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
|
||||
for _, row := range rows {
|
||||
if len(row) < 9 {
|
||||
continue
|
||||
}
|
||||
idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
info := benchmarkGPUInfo{
|
||||
Index: idx,
|
||||
UUID: strings.TrimSpace(row[1]),
|
||||
Name: strings.TrimSpace(row[2]),
|
||||
BusID: strings.TrimSpace(row[3]),
|
||||
VBIOS: strings.TrimSpace(row[4]),
|
||||
PowerLimitW: parseBenchmarkFloat(row[5]),
|
||||
MaxGraphicsClockMHz: parseBenchmarkFloat(row[6]),
|
||||
MaxMemoryClockMHz: parseBenchmarkFloat(row[7]),
|
||||
}
|
||||
if len(row) >= 9 {
|
||||
info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8])
|
||||
}
|
||||
if q.extended {
|
||||
if len(row) >= 10 {
|
||||
info.MultiprocessorCount = int(parseBenchmarkFloat(row[9]))
|
||||
}
|
||||
if len(row) >= 11 {
|
||||
info.DefaultPowerLimitW = parseBenchmarkFloat(row[10])
|
||||
}
|
||||
}
|
||||
infoByIndex[idx] = info
|
||||
}
|
||||
return infoByIndex, nil
|
||||
}
|
||||
return nil, lastErr
|
||||
}
|
||||
|
||||
|
||||
func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction {
|
||||
if os.Geteuid() != 0 {
|
||||
result.Normalization.Status = "partial"
|
||||
@@ -454,6 +489,10 @@ func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndi
|
||||
_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rgc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rgc"}, nil, nil)
|
||||
}})
|
||||
}
|
||||
} else {
|
||||
rec.GPUClockLockStatus = "skipped"
|
||||
rec.Notes = append(rec.Notes, "graphics clock lock skipped: gpu inventory unavailable or MaxGraphicsClockMHz=0")
|
||||
result.Normalization.Status = "partial"
|
||||
}
|
||||
|
||||
if info, ok := infoByIndex[idx]; ok && info.MaxMemoryClockMHz > 0 {
|
||||
@@ -1144,18 +1183,8 @@ func queryIPMIServerPowerW() (float64, error) {
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("ipmitool dcmi power reading: %w", err)
|
||||
}
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
if strings.Contains(line, "Current Power") {
|
||||
parts := strings.SplitN(line, ":", 2)
|
||||
if len(parts) == 2 {
|
||||
val := strings.TrimSpace(strings.TrimSuffix(strings.TrimSpace(parts[1]), "Watts"))
|
||||
val = strings.TrimSpace(val)
|
||||
w, err := strconv.ParseFloat(val, 64)
|
||||
if err == nil && w > 0 {
|
||||
return w, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
if w := parseDCMIPowerReading(string(out)); w > 0 {
|
||||
return w, nil
|
||||
}
|
||||
return 0, fmt.Errorf("could not parse ipmitool dcmi power reading output")
|
||||
}
|
||||
@@ -1209,3 +1238,246 @@ func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, ipmiAvaila
|
||||
}
|
||||
return sp
|
||||
}
|
||||
|
||||
// readServerModel returns the DMI system product name (e.g. "SuperMicro SYS-421GE-TNRT").
|
||||
// Returns empty string if unavailable (non-Linux or missing DMI entry).
|
||||
func readServerModel() string {
|
||||
data, err := os.ReadFile("/sys/class/dmi/id/product_name")
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(string(data))
|
||||
}
|
||||
|
||||
// filterRowsByGPU returns only the metric rows for a specific GPU index.
|
||||
func filterRowsByGPU(rows []GPUMetricRow, gpuIndex int) []GPUMetricRow {
|
||||
var out []GPUMetricRow
|
||||
for _, r := range rows {
|
||||
if r.GPUIndex == gpuIndex {
|
||||
out = append(out, r)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// parseBenchmarkBurnLogByGPU splits a multi-GPU bee-gpu-burn output by [gpu N] prefix
|
||||
// and returns a per-GPU parse result map.
|
||||
func parseBenchmarkBurnLogByGPU(raw string) map[int]benchmarkBurnParseResult {
|
||||
gpuLines := make(map[int][]string)
|
||||
for _, line := range strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if !strings.HasPrefix(line, "[gpu ") {
|
||||
continue
|
||||
}
|
||||
end := strings.Index(line, "] ")
|
||||
if end < 0 {
|
||||
continue
|
||||
}
|
||||
gpuIdx, err := strconv.Atoi(strings.TrimSpace(line[5:end]))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
gpuLines[gpuIdx] = append(gpuLines[gpuIdx], line[end+2:])
|
||||
}
|
||||
results := make(map[int]benchmarkBurnParseResult, len(gpuLines))
|
||||
for gpuIdx, lines := range gpuLines {
|
||||
// Lines are already stripped of the [gpu N] prefix; parseBenchmarkBurnLog
|
||||
// calls stripBenchmarkPrefix which is a no-op on already-stripped lines.
|
||||
results[gpuIdx] = parseBenchmarkBurnLog(strings.Join(lines, "\n"))
|
||||
}
|
||||
return results
|
||||
}
|
||||
|
||||
// runNvidiaBenchmarkParallel runs warmup and steady compute on all selected GPUs
|
||||
// simultaneously using a single bee-gpu-burn invocation per phase.
|
||||
func runNvidiaBenchmarkParallel(
|
||||
ctx context.Context,
|
||||
verboseLog, runDir string,
|
||||
selected []int,
|
||||
infoByIndex map[int]benchmarkGPUInfo,
|
||||
opts NvidiaBenchmarkOptions,
|
||||
spec benchmarkProfileSpec,
|
||||
logFunc func(string),
|
||||
result *NvidiaBenchmarkResult,
|
||||
serverIdleW *float64, serverLoadedWSum *float64,
|
||||
serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int,
|
||||
) {
|
||||
allDevices := joinIndexList(selected)
|
||||
|
||||
// Build per-GPU result stubs.
|
||||
gpuResults := make(map[int]*BenchmarkGPUResult, len(selected))
|
||||
for _, idx := range selected {
|
||||
r := &BenchmarkGPUResult{Index: idx, Status: "FAILED"}
|
||||
if info, ok := infoByIndex[idx]; ok {
|
||||
r.UUID = info.UUID
|
||||
r.Name = info.Name
|
||||
r.BusID = info.BusID
|
||||
r.VBIOS = info.VBIOS
|
||||
r.PowerLimitW = info.PowerLimitW
|
||||
r.MultiprocessorCount = info.MultiprocessorCount
|
||||
r.DefaultPowerLimitW = info.DefaultPowerLimitW
|
||||
r.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
|
||||
r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
|
||||
r.MaxMemoryClockMHz = info.MaxMemoryClockMHz
|
||||
}
|
||||
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
|
||||
r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
|
||||
r.LockedMemoryClockMHz = norm.MemoryClockLockMHz
|
||||
}
|
||||
gpuResults[idx] = r
|
||||
}
|
||||
|
||||
// Baseline: sample all GPUs together.
|
||||
baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, selected)
|
||||
if err != nil && err != context.Canceled {
|
||||
for _, idx := range selected {
|
||||
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "baseline sampling failed: "+err.Error())
|
||||
}
|
||||
}
|
||||
for _, idx := range selected {
|
||||
perGPU := filterRowsByGPU(baselineRows, idx)
|
||||
gpuResults[idx].Baseline = summarizeBenchmarkTelemetry(perGPU)
|
||||
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), perGPU)
|
||||
}
|
||||
|
||||
// Sample server idle power once.
|
||||
if !*serverIdleOK {
|
||||
if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
|
||||
*serverIdleW = w
|
||||
*serverIdleOK = true
|
||||
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
|
||||
}
|
||||
}
|
||||
|
||||
// Warmup: all GPUs simultaneously.
|
||||
warmupCmd := []string{
|
||||
"bee-gpu-burn",
|
||||
"--seconds", strconv.Itoa(spec.WarmupSec),
|
||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||
"--devices", allDevices,
|
||||
}
|
||||
logFunc(fmt.Sprintf("GPUs %s: parallel warmup (%ds)", allDevices, spec.WarmupSec))
|
||||
warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, runDir, "gpu-all-warmup", logFunc)
|
||||
_ = os.WriteFile(filepath.Join(runDir, "gpu-all-warmup.log"), warmupOut, 0644)
|
||||
for _, idx := range selected {
|
||||
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-warmup", idx), filterRowsByGPU(warmupRows, idx))
|
||||
}
|
||||
if warmupErr != nil {
|
||||
for _, idx := range selected {
|
||||
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel warmup failed: "+warmupErr.Error())
|
||||
}
|
||||
}
|
||||
|
||||
// Snapshot throttle counters before steady.
|
||||
beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
|
||||
for _, idx := range selected {
|
||||
beforeThrottle[idx], _ = queryThrottleCounters(idx)
|
||||
}
|
||||
|
||||
// Steady: all GPUs simultaneously.
|
||||
steadyCmd := []string{
|
||||
"bee-gpu-burn",
|
||||
"--seconds", strconv.Itoa(spec.SteadySec),
|
||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||
"--devices", allDevices,
|
||||
}
|
||||
logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (%ds)", allDevices, spec.SteadySec))
|
||||
|
||||
// Sample server power via IPMI in parallel with steady phase.
|
||||
ipmiStopCh := make(chan struct{})
|
||||
ipmiResultCh := make(chan float64, 1)
|
||||
go func() {
|
||||
defer close(ipmiResultCh)
|
||||
var samples []float64
|
||||
ticker := time.NewTicker(5 * time.Second)
|
||||
defer ticker.Stop()
|
||||
select {
|
||||
case <-ipmiStopCh:
|
||||
return
|
||||
case <-time.After(15 * time.Second):
|
||||
}
|
||||
for {
|
||||
if w, err := queryIPMIServerPowerW(); err == nil {
|
||||
samples = append(samples, w)
|
||||
}
|
||||
select {
|
||||
case <-ipmiStopCh:
|
||||
if len(samples) > 0 {
|
||||
var sum float64
|
||||
for _, w := range samples {
|
||||
sum += w
|
||||
}
|
||||
ipmiResultCh <- sum / float64(len(samples))
|
||||
}
|
||||
return
|
||||
case <-ticker.C:
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-steady.log", steadyCmd, nil, selected, runDir, "gpu-all-steady", logFunc)
|
||||
close(ipmiStopCh)
|
||||
if loadedW, ok := <-ipmiResultCh; ok {
|
||||
*serverLoadedWSum += loadedW
|
||||
(*serverLoadedSamples)++
|
||||
*serverLoadedOK = true
|
||||
logFunc(fmt.Sprintf("GPUs %s: server loaded power (IPMI): %.0f W", allDevices, loadedW))
|
||||
}
|
||||
_ = os.WriteFile(filepath.Join(runDir, "gpu-all-steady.log"), steadyOut, 0644)
|
||||
|
||||
afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
|
||||
for _, idx := range selected {
|
||||
afterThrottle[idx], _ = queryThrottleCounters(idx)
|
||||
}
|
||||
|
||||
parseResults := parseBenchmarkBurnLogByGPU(string(steadyOut))
|
||||
|
||||
for _, idx := range selected {
|
||||
perGPU := filterRowsByGPU(steadyRows, idx)
|
||||
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-steady", idx), perGPU)
|
||||
gpuResults[idx].Steady = summarizeBenchmarkTelemetry(perGPU)
|
||||
gpuResults[idx].Throttle = diffThrottleCounters(beforeThrottle[idx], afterThrottle[idx])
|
||||
|
||||
if pr, ok := parseResults[idx]; ok {
|
||||
gpuResults[idx].ComputeCapability = pr.ComputeCapability
|
||||
gpuResults[idx].Backend = pr.Backend
|
||||
gpuResults[idx].PrecisionResults = pr.Profiles
|
||||
if pr.Fallback {
|
||||
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
|
||||
}
|
||||
}
|
||||
if steadyErr != nil {
|
||||
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel steady compute failed: "+steadyErr.Error())
|
||||
}
|
||||
}
|
||||
|
||||
// Cooldown: all GPUs together.
|
||||
cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, selected)
|
||||
if err != nil && err != context.Canceled {
|
||||
for _, idx := range selected {
|
||||
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "cooldown sampling failed: "+err.Error())
|
||||
}
|
||||
}
|
||||
for _, idx := range selected {
|
||||
perGPU := filterRowsByGPU(cooldownRows, idx)
|
||||
gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU)
|
||||
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-cooldown", idx), perGPU)
|
||||
}
|
||||
|
||||
// Score and finalize each GPU.
|
||||
for _, idx := range selected {
|
||||
r := gpuResults[idx]
|
||||
r.Scores = scoreBenchmarkGPUResult(*r)
|
||||
r.DegradationReasons = detectBenchmarkDegradationReasons(*r, result.Normalization.Status)
|
||||
pr := parseResults[idx]
|
||||
switch {
|
||||
case steadyErr != nil:
|
||||
r.Status = classifySATErrorStatus(steadyOut, steadyErr)
|
||||
case pr.Fallback:
|
||||
r.Status = "PARTIAL"
|
||||
default:
|
||||
r.Status = "OK"
|
||||
}
|
||||
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(*r))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -22,18 +22,53 @@ var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]*m`)
|
||||
|
||||
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benchmarkReportChart) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "Bee NVIDIA Benchmark Report\n")
|
||||
fmt.Fprintf(&b, "===========================\n\n")
|
||||
fmt.Fprintf(&b, "Generated: %s\n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
||||
fmt.Fprintf(&b, "Host: %s\n", result.Hostname)
|
||||
fmt.Fprintf(&b, "Profile: %s\n", result.BenchmarkProfile)
|
||||
fmt.Fprintf(&b, "Overall status: %s\n", result.OverallStatus)
|
||||
fmt.Fprintf(&b, "Selected GPUs: %s\n", joinIndexList(result.SelectedGPUIndices))
|
||||
fmt.Fprintf(&b, "Normalization: %s\n\n", result.Normalization.Status)
|
||||
|
||||
// ── Header ────────────────────────────────────────────────────────────────
|
||||
b.WriteString("# Bee NVIDIA Benchmark Report\n\n")
|
||||
|
||||
// System identity block
|
||||
if result.ServerModel != "" {
|
||||
fmt.Fprintf(&b, "**Server:** %s \n", result.ServerModel)
|
||||
}
|
||||
if result.Hostname != "" {
|
||||
fmt.Fprintf(&b, "**Host:** %s \n", result.Hostname)
|
||||
}
|
||||
// GPU models summary
|
||||
if len(result.GPUs) > 0 {
|
||||
modelCount := make(map[string]int)
|
||||
var modelOrder []string
|
||||
for _, g := range result.GPUs {
|
||||
m := strings.TrimSpace(g.Name)
|
||||
if m == "" {
|
||||
m = "Unknown GPU"
|
||||
}
|
||||
if modelCount[m] == 0 {
|
||||
modelOrder = append(modelOrder, m)
|
||||
}
|
||||
modelCount[m]++
|
||||
}
|
||||
var parts []string
|
||||
for _, m := range modelOrder {
|
||||
if modelCount[m] == 1 {
|
||||
parts = append(parts, m)
|
||||
} else {
|
||||
parts = append(parts, fmt.Sprintf("%d× %s", modelCount[m], m))
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(&b, "**GPU(s):** %s \n", strings.Join(parts, ", "))
|
||||
}
|
||||
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
|
||||
fmt.Fprintf(&b, "**App version:** %s \n", result.BenchmarkVersion)
|
||||
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
||||
if result.ParallelGPUs {
|
||||
fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously) \n")
|
||||
}
|
||||
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
||||
b.WriteString("\n")
|
||||
|
||||
// ── Executive Summary ─────────────────────────────────────────────────────
|
||||
if len(result.Findings) > 0 {
|
||||
fmt.Fprintf(&b, "Executive Summary\n")
|
||||
fmt.Fprintf(&b, "-----------------\n")
|
||||
b.WriteString("## Executive Summary\n\n")
|
||||
for _, finding := range result.Findings {
|
||||
fmt.Fprintf(&b, "- %s\n", finding)
|
||||
}
|
||||
@@ -41,149 +76,206 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
|
||||
}
|
||||
|
||||
if len(result.Warnings) > 0 {
|
||||
fmt.Fprintf(&b, "Warnings\n")
|
||||
fmt.Fprintf(&b, "--------\n")
|
||||
b.WriteString("## Warnings\n\n")
|
||||
for _, warning := range result.Warnings {
|
||||
fmt.Fprintf(&b, "- %s\n", warning)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
fmt.Fprintf(&b, "Per GPU Scorecard\n")
|
||||
fmt.Fprintf(&b, "-----------------\n")
|
||||
// ── Scorecard table ───────────────────────────────────────────────────────
|
||||
b.WriteString("## Scorecard\n\n")
|
||||
b.WriteString("| GPU | Status | Composite | Compute | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n")
|
||||
b.WriteString("|-----|--------|-----------|---------|-------------|---------------|-----------------|-----------|-------------|\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
fmt.Fprintf(&b, "GPU %d %s\n", gpu.Index, gpu.Name)
|
||||
fmt.Fprintf(&b, " Status: %s\n", gpu.Status)
|
||||
fmt.Fprintf(&b, " Composite score: %.2f\n", gpu.Scores.CompositeScore)
|
||||
fmt.Fprintf(&b, " Compute score: %.2f\n", gpu.Scores.ComputeScore)
|
||||
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||||
fmt.Fprintf(&b, " Compute efficiency: %.3f TOPS/SM/GHz\n", gpu.Scores.TOPSPerSMPerGHz)
|
||||
name := strings.TrimSpace(gpu.Name)
|
||||
if name == "" {
|
||||
name = "Unknown"
|
||||
}
|
||||
fmt.Fprintf(&b, " Power sustain: %.1f\n", gpu.Scores.PowerSustainScore)
|
||||
fmt.Fprintf(&b, " Thermal sustain: %.1f\n", gpu.Scores.ThermalSustainScore)
|
||||
fmt.Fprintf(&b, " Stability: %.1f\n", gpu.Scores.StabilityScore)
|
||||
interconnect := "-"
|
||||
if gpu.Scores.InterconnectScore > 0 {
|
||||
fmt.Fprintf(&b, " Interconnect: %.1f\n", gpu.Scores.InterconnectScore)
|
||||
interconnect = fmt.Sprintf("%.1f", gpu.Scores.InterconnectScore)
|
||||
}
|
||||
if len(gpu.DegradationReasons) > 0 {
|
||||
fmt.Fprintf(&b, " Degradation reasons: %s\n", strings.Join(gpu.DegradationReasons, ", "))
|
||||
topsPerSM := "-"
|
||||
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||||
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
||||
}
|
||||
fmt.Fprintf(&b, " Avg power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.AvgPowerW, gpu.Steady.AvgTempC, gpu.Steady.AvgGraphicsClockMHz)
|
||||
fmt.Fprintf(&b, " P95 power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.P95PowerW, gpu.Steady.P95TempC, gpu.Steady.P95GraphicsClockMHz)
|
||||
if len(gpu.PrecisionResults) > 0 {
|
||||
fmt.Fprintf(&b, " Precision results:\n")
|
||||
for _, precision := range gpu.PrecisionResults {
|
||||
if precision.Supported {
|
||||
fmt.Fprintf(&b, " - %s: %.2f TOPS lanes=%d iterations=%d\n", precision.Name, precision.TeraOpsPerSec, precision.Lanes, precision.Iterations)
|
||||
} else {
|
||||
fmt.Fprintf(&b, " - %s: unsupported (%s)\n", precision.Name, precision.Notes)
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %.1f | %.1f | %.1f | %s |\n",
|
||||
gpu.Index, name,
|
||||
gpu.Status,
|
||||
gpu.Scores.CompositeScore,
|
||||
gpu.Scores.ComputeScore,
|
||||
topsPerSM,
|
||||
gpu.Scores.PowerSustainScore,
|
||||
gpu.Scores.ThermalSustainScore,
|
||||
gpu.Scores.StabilityScore,
|
||||
interconnect,
|
||||
)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// ── Per GPU detail ────────────────────────────────────────────────────────
|
||||
b.WriteString("## Per-GPU Details\n\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
name := strings.TrimSpace(gpu.Name)
|
||||
if name == "" {
|
||||
name = "Unknown GPU"
|
||||
}
|
||||
fmt.Fprintf(&b, " Throttle: %s\n", formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec))
|
||||
if len(gpu.Notes) > 0 {
|
||||
fmt.Fprintf(&b, " Notes:\n")
|
||||
for _, note := range gpu.Notes {
|
||||
fmt.Fprintf(&b, " - %s\n", note)
|
||||
}
|
||||
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, name)
|
||||
|
||||
// Identity
|
||||
if gpu.BusID != "" {
|
||||
fmt.Fprintf(&b, "- **Bus ID:** %s\n", gpu.BusID)
|
||||
}
|
||||
if gpu.VBIOS != "" {
|
||||
fmt.Fprintf(&b, "- **vBIOS:** %s\n", gpu.VBIOS)
|
||||
}
|
||||
if gpu.ComputeCapability != "" {
|
||||
fmt.Fprintf(&b, "- **Compute capability:** %s\n", gpu.ComputeCapability)
|
||||
}
|
||||
if gpu.MultiprocessorCount > 0 {
|
||||
fmt.Fprintf(&b, "- **SMs:** %d\n", gpu.MultiprocessorCount)
|
||||
}
|
||||
if gpu.PowerLimitW > 0 {
|
||||
fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
|
||||
}
|
||||
if gpu.LockedGraphicsClockMHz > 0 {
|
||||
fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// Steady-state telemetry
|
||||
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
|
||||
b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
|
||||
fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
|
||||
fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
|
||||
fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
|
||||
fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
|
||||
fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
|
||||
b.WriteString("\n")
|
||||
|
||||
// Throttle
|
||||
throttle := formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec)
|
||||
if throttle != "none" {
|
||||
fmt.Fprintf(&b, "**Throttle:** %s\n\n", throttle)
|
||||
}
|
||||
|
||||
// Precision results
|
||||
if len(gpu.PrecisionResults) > 0 {
|
||||
b.WriteString("**Precision results:**\n\n")
|
||||
b.WriteString("| Precision | TOPS | Lanes | Iterations |\n|-----------|------|-------|------------|\n")
|
||||
for _, p := range gpu.PrecisionResults {
|
||||
if p.Supported {
|
||||
fmt.Fprintf(&b, "| %s | %.2f | %d | %d |\n", p.Name, p.TeraOpsPerSec, p.Lanes, p.Iterations)
|
||||
} else {
|
||||
fmt.Fprintf(&b, "| %s | — (unsupported) | — | — |\n", p.Name)
|
||||
}
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// Degradation / Notes
|
||||
if len(gpu.DegradationReasons) > 0 {
|
||||
fmt.Fprintf(&b, "**Degradation reasons:** %s\n\n", strings.Join(gpu.DegradationReasons, ", "))
|
||||
}
|
||||
if len(gpu.Notes) > 0 {
|
||||
b.WriteString("**Notes:**\n\n")
|
||||
for _, note := range gpu.Notes {
|
||||
fmt.Fprintf(&b, "- %s\n", note)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
|
||||
// ── Interconnect ──────────────────────────────────────────────────────────
|
||||
if result.Interconnect != nil {
|
||||
fmt.Fprintf(&b, "Interconnect\n")
|
||||
fmt.Fprintf(&b, "------------\n")
|
||||
fmt.Fprintf(&b, "Status: %s\n", result.Interconnect.Status)
|
||||
b.WriteString("## Interconnect (NCCL)\n\n")
|
||||
fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
|
||||
if result.Interconnect.Supported {
|
||||
fmt.Fprintf(&b, "Avg algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.AvgBusBWGBps)
|
||||
fmt.Fprintf(&b, "Max algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.MaxAlgBWGBps, result.Interconnect.MaxBusBWGBps)
|
||||
b.WriteString("| Metric | Avg | Max |\n|--------|-----|-----|\n")
|
||||
fmt.Fprintf(&b, "| Alg BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.MaxAlgBWGBps)
|
||||
fmt.Fprintf(&b, "| Bus BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgBusBWGBps, result.Interconnect.MaxBusBWGBps)
|
||||
b.WriteString("\n")
|
||||
}
|
||||
for _, note := range result.Interconnect.Notes {
|
||||
fmt.Fprintf(&b, "- %s\n", note)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
if len(result.Interconnect.Notes) > 0 {
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
|
||||
// ── Server Power (IPMI) ───────────────────────────────────────────────────
|
||||
if sp := result.ServerPower; sp != nil {
|
||||
b.WriteString("## Server Power (IPMI)\n\n")
|
||||
if !sp.Available {
|
||||
b.WriteString("IPMI power measurement unavailable.\n\n")
|
||||
} else {
|
||||
b.WriteString("| | Value |\n|---|---|\n")
|
||||
fmt.Fprintf(&b, "| Server idle | %.0f W |\n", sp.IdleW)
|
||||
fmt.Fprintf(&b, "| Server under load | %.0f W |\n", sp.LoadedW)
|
||||
fmt.Fprintf(&b, "| Server delta (load − idle) | %.0f W |\n", sp.DeltaW)
|
||||
fmt.Fprintf(&b, "| GPU-reported sum | %.0f W |\n", sp.GPUReportedSumW)
|
||||
if sp.ReportingRatio > 0 {
|
||||
fmt.Fprintf(&b, "| Reporting ratio | %.2f (1.0 = accurate, <0.75 = GPU over-reports) |\n", sp.ReportingRatio)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
for _, note := range sp.Notes {
|
||||
fmt.Fprintf(&b, "- %s\n", note)
|
||||
}
|
||||
if len(sp.Notes) > 0 {
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
|
||||
// ── Terminal charts (steady-state only) ───────────────────────────────────
|
||||
if len(charts) > 0 {
|
||||
fmt.Fprintf(&b, "Terminal Charts\n")
|
||||
fmt.Fprintf(&b, "---------------\n")
|
||||
b.WriteString("## Steady-State Charts\n\n")
|
||||
for _, chart := range charts {
|
||||
content := strings.TrimSpace(stripANSIEscapeSequences(chart.Content))
|
||||
if content == "" {
|
||||
continue
|
||||
}
|
||||
fmt.Fprintf(&b, "%s\n", chart.Title)
|
||||
fmt.Fprintf(&b, "%s\n", strings.Repeat("~", len(chart.Title)))
|
||||
fmt.Fprintf(&b, "%s\n\n", content)
|
||||
fmt.Fprintf(&b, "### %s\n\n```\n%s\n```\n\n", chart.Title, content)
|
||||
}
|
||||
}
|
||||
|
||||
if sp := result.ServerPower; sp != nil {
|
||||
fmt.Fprintf(&b, "Server Power (IPMI)\n")
|
||||
fmt.Fprintf(&b, "-------------------\n")
|
||||
if !sp.Available {
|
||||
fmt.Fprintf(&b, "Unavailable\n")
|
||||
} else {
|
||||
fmt.Fprintf(&b, " Server idle: %.0f W\n", sp.IdleW)
|
||||
fmt.Fprintf(&b, " Server under load: %.0f W\n", sp.LoadedW)
|
||||
fmt.Fprintf(&b, " Server delta: %.0f W\n", sp.DeltaW)
|
||||
fmt.Fprintf(&b, " GPU reported (sum): %.0f W\n", sp.GPUReportedSumW)
|
||||
if sp.ReportingRatio > 0 {
|
||||
fmt.Fprintf(&b, " Reporting ratio: %.2f (1.0 = accurate, <0.75 = GPU over-reports)\n", sp.ReportingRatio)
|
||||
}
|
||||
}
|
||||
for _, note := range sp.Notes {
|
||||
fmt.Fprintf(&b, " Note: %s\n", note)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
// ── Methodology ───────────────────────────────────────────────────────────
|
||||
b.WriteString("## Methodology\n\n")
|
||||
fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline → warmup → steady-state → interconnect → cooldown phases.\n", result.BenchmarkProfile)
|
||||
b.WriteString("- Single-GPU compute score from bee-gpu-burn cuBLASLt when available.\n")
|
||||
b.WriteString("- Thermal and power limitations inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
|
||||
b.WriteString("- `result.json` is the canonical machine-readable source for this benchmark run.\n\n")
|
||||
|
||||
fmt.Fprintf(&b, "Methodology\n")
|
||||
fmt.Fprintf(&b, "-----------\n")
|
||||
fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile)
|
||||
fmt.Fprintf(&b, "- Single-GPU compute score comes from bee-gpu-burn cuBLASLt output when available.\n")
|
||||
fmt.Fprintf(&b, "- Thermal and power limitations are inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
|
||||
fmt.Fprintf(&b, "- result.json is the canonical machine-readable source for this benchmark run.\n\n")
|
||||
|
||||
fmt.Fprintf(&b, "Raw Files\n")
|
||||
fmt.Fprintf(&b, "---------\n")
|
||||
fmt.Fprintf(&b, "- result.json\n")
|
||||
fmt.Fprintf(&b, "- report.txt\n")
|
||||
fmt.Fprintf(&b, "- summary.txt\n")
|
||||
fmt.Fprintf(&b, "- verbose.log\n")
|
||||
fmt.Fprintf(&b, "- gpu-*-baseline-metrics.csv/html/term.txt\n")
|
||||
fmt.Fprintf(&b, "- gpu-*-warmup.log\n")
|
||||
fmt.Fprintf(&b, "- gpu-*-steady.log\n")
|
||||
fmt.Fprintf(&b, "- gpu-*-steady-metrics.csv/html/term.txt\n")
|
||||
fmt.Fprintf(&b, "- gpu-*-cooldown-metrics.csv/html/term.txt\n")
|
||||
// ── Raw files ─────────────────────────────────────────────────────────────
|
||||
b.WriteString("## Raw Files\n\n")
|
||||
b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
|
||||
b.WriteString("- `gpu-*-baseline-metrics.csv/html/term.txt`\n")
|
||||
b.WriteString("- `gpu-*-warmup.log`\n")
|
||||
b.WriteString("- `gpu-*-steady.log`\n")
|
||||
b.WriteString("- `gpu-*-steady-metrics.csv/html/term.txt`\n")
|
||||
b.WriteString("- `gpu-*-cooldown-metrics.csv/html/term.txt`\n")
|
||||
if result.Interconnect != nil {
|
||||
fmt.Fprintf(&b, "- nccl-all-reduce.log\n")
|
||||
b.WriteString("- `nccl-all-reduce.log`\n")
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// loadBenchmarkReportCharts loads only steady-state terminal charts (baseline and
|
||||
// cooldown charts are not useful for human review).
|
||||
func loadBenchmarkReportCharts(runDir string, gpuIndices []int) []benchmarkReportChart {
|
||||
phases := []struct {
|
||||
name string
|
||||
label string
|
||||
}{
|
||||
{name: "baseline", label: "Baseline"},
|
||||
{name: "steady", label: "Steady State"},
|
||||
{name: "cooldown", label: "Cooldown"},
|
||||
}
|
||||
var charts []benchmarkReportChart
|
||||
for _, idx := range gpuIndices {
|
||||
for _, phase := range phases {
|
||||
path := filepath.Join(runDir, fmt.Sprintf("gpu-%d-%s-metrics-term.txt", idx, phase.name))
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil || len(raw) == 0 {
|
||||
continue
|
||||
}
|
||||
charts = append(charts, benchmarkReportChart{
|
||||
Title: fmt.Sprintf("GPU %d %s", idx, phase.label),
|
||||
Content: string(raw),
|
||||
})
|
||||
path := filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady-metrics-term.txt", idx))
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil || len(raw) == 0 {
|
||||
continue
|
||||
}
|
||||
charts = append(charts, benchmarkReportChart{
|
||||
Title: fmt.Sprintf("GPU %d — Steady State", idx),
|
||||
Content: string(raw),
|
||||
})
|
||||
}
|
||||
return charts
|
||||
}
|
||||
|
||||
@@ -137,8 +137,9 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
|
||||
for _, needle := range []string{
|
||||
"Executive Summary",
|
||||
"GPU 0 spent measurable time under SW power cap.",
|
||||
"Composite score: 1176.00",
|
||||
"fp16_tensor: 700.00 TOPS",
|
||||
"1176.00",
|
||||
"fp16_tensor",
|
||||
"700.00",
|
||||
} {
|
||||
if !strings.Contains(report, needle) {
|
||||
t.Fatalf("report missing %q\n%s", needle, report)
|
||||
@@ -164,7 +165,7 @@ func TestRenderBenchmarkReportIncludesTerminalChartsWithoutANSI(t *testing.T) {
|
||||
})
|
||||
|
||||
for _, needle := range []string{
|
||||
"Terminal Charts",
|
||||
"Steady-State Charts",
|
||||
"GPU 0 Steady State",
|
||||
"GPU 0 chart",
|
||||
"42┤───",
|
||||
|
||||
@@ -14,13 +14,17 @@ type NvidiaBenchmarkOptions struct {
|
||||
GPUIndices []int
|
||||
ExcludeGPUIndices []int
|
||||
RunNCCL bool
|
||||
ParallelGPUs bool // run all selected GPUs simultaneously instead of sequentially
|
||||
}
|
||||
|
||||
|
||||
type NvidiaBenchmarkResult struct {
|
||||
BenchmarkVersion string `json:"benchmark_version"`
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
Hostname string `json:"hostname,omitempty"`
|
||||
ServerModel string `json:"server_model,omitempty"`
|
||||
BenchmarkProfile string `json:"benchmark_profile"`
|
||||
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||
OverallStatus string `json:"overall_status"`
|
||||
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||
Findings []string `json:"findings,omitempty"`
|
||||
|
||||
@@ -116,25 +116,47 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := exec.Command("mount", "--bind", dstDir, "/run/live/medium").Run(); err != nil {
|
||||
log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err))
|
||||
|
||||
mediumRebound := false
|
||||
if err := bindMount(dstDir, "/run/live/medium"); err != nil {
|
||||
log(fmt.Sprintf("Warning: rebind /run/live/medium → %s failed: %v", dstDir, err))
|
||||
} else {
|
||||
mediumRebound = true
|
||||
}
|
||||
|
||||
log("Verifying live medium now served from RAM...")
|
||||
status := s.LiveBootSource()
|
||||
if err := verifyInstallToRAMStatus(status); err != nil {
|
||||
if err := verifyInstallToRAMStatus(status, dstDir, mediumRebound, log); err != nil {
|
||||
return err
|
||||
}
|
||||
log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
|
||||
log("Done. Installation media can be safely disconnected.")
|
||||
if status.InRAM {
|
||||
log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
|
||||
}
|
||||
log("Done. Squashfs files are in RAM. Installation media can be safely disconnected.")
|
||||
return nil
|
||||
}
|
||||
|
||||
func verifyInstallToRAMStatus(status LiveBootSource) error {
|
||||
func verifyInstallToRAMStatus(status LiveBootSource, dstDir string, mediumRebound bool, log func(string)) error {
|
||||
if status.InRAM {
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s", describeLiveBootSource(status))
|
||||
|
||||
// The live medium mount was not redirected to RAM. This is expected when
|
||||
// booting from an ISO/CD-ROM: the squashfs loop device has a non-zero
|
||||
// offset and LOOP_CHANGE_FD cannot be used; the bind mount also fails
|
||||
// because the CD-ROM mount is in use. Check whether files were at least
|
||||
// copied to the tmpfs directory — that is sufficient for safe disconnection
|
||||
// once the kernel has paged in all actively-used data.
|
||||
files, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
|
||||
if len(files) > 0 {
|
||||
if !mediumRebound {
|
||||
log(fmt.Sprintf("Note: squashfs copied to RAM (%s) but /run/live/medium still shows the original source.", dstDir))
|
||||
log("This is normal for CD-ROM boots. For a fully transparent RAM boot, add 'toram' to the kernel parameters.")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s and no squashfs found in %s", describeLiveBootSource(status), dstDir)
|
||||
}
|
||||
|
||||
func describeLiveBootSource(status LiveBootSource) string {
|
||||
@@ -247,7 +269,31 @@ func findLoopForFile(backingFile string) (string, error) {
|
||||
return "", fmt.Errorf("no loop device found for %s", backingFile)
|
||||
}
|
||||
|
||||
// loopDeviceOffset returns the byte offset configured for the loop device,
|
||||
// or -1 if it cannot be determined.
|
||||
func loopDeviceOffset(loopDev string) int64 {
|
||||
out, err := exec.Command("losetup", "--json", loopDev).Output()
|
||||
if err != nil {
|
||||
return -1
|
||||
}
|
||||
var result struct {
|
||||
Loopdevices []struct {
|
||||
Offset int64 `json:"offset"`
|
||||
} `json:"loopdevices"`
|
||||
}
|
||||
if err := json.Unmarshal(out, &result); err != nil || len(result.Loopdevices) == 0 {
|
||||
return -1
|
||||
}
|
||||
return result.Loopdevices[0].Offset
|
||||
}
|
||||
|
||||
func reassociateLoopDevice(loopDev, newFile string) error {
|
||||
// LOOP_CHANGE_FD requires lo_offset == 0. ISO/CD-ROM loop devices are
|
||||
// typically set up with a non-zero offset (squashfs lives inside the ISO),
|
||||
// so the ioctl returns EINVAL. Detect this early for a clear error message.
|
||||
if off := loopDeviceOffset(loopDev); off > 0 {
|
||||
return fmt.Errorf("loop device has non-zero offset (%d bytes, typical for ISO/CD-ROM) — LOOP_CHANGE_FD not supported; use 'toram' kernel parameter for RAM boot", off)
|
||||
}
|
||||
if err := exec.Command("losetup", "--replace", loopDev, newFile).Run(); err == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -26,3 +26,8 @@ func loopChangeFD(loopDev, newFile string) error {
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// bindMount binds src over dst using the syscall directly (avoids exec PATH issues).
|
||||
func bindMount(src, dst string) error {
|
||||
return syscall.Mount(src, dst, "", syscall.MS_BIND, "")
|
||||
}
|
||||
|
||||
@@ -7,3 +7,7 @@ import "errors"
|
||||
func loopChangeFD(loopDev, newFile string) error {
|
||||
return errors.New("LOOP_CHANGE_FD not available on this platform")
|
||||
}
|
||||
|
||||
func bindMount(src, dst string) error {
|
||||
return errors.New("bind mount not available on this platform")
|
||||
}
|
||||
|
||||
@@ -33,14 +33,17 @@ func TestInferLiveBootKind(t *testing.T) {
|
||||
func TestVerifyInstallToRAMStatus(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
if err := verifyInstallToRAMStatus(LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}); err != nil {
|
||||
dstDir := t.TempDir()
|
||||
|
||||
if err := verifyInstallToRAMStatus(LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}, dstDir, false, nil); err != nil {
|
||||
t.Fatalf("expected success for RAM-backed status, got %v", err)
|
||||
}
|
||||
err := verifyInstallToRAMStatus(LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"})
|
||||
|
||||
err := verifyInstallToRAMStatus(LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"}, dstDir, false, nil)
|
||||
if err == nil {
|
||||
t.Fatal("expected verification failure when media is still on USB")
|
||||
}
|
||||
if got := err.Error(); got != "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1)" {
|
||||
if got := err.Error(); got != "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1) and no squashfs found in "+dstDir {
|
||||
t.Fatalf("error=%q", got)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -88,6 +88,37 @@ type NvidiaGPU struct {
|
||||
MemoryMB int `json:"memory_mb"`
|
||||
}
|
||||
|
||||
type NvidiaGPUStatus struct {
|
||||
Index int `json:"index"`
|
||||
Name string `json:"name"`
|
||||
BDF string `json:"bdf,omitempty"`
|
||||
Serial string `json:"serial,omitempty"`
|
||||
Status string `json:"status"`
|
||||
RawLine string `json:"raw_line,omitempty"`
|
||||
NeedsReset bool `json:"needs_reset"`
|
||||
ParseFailure bool `json:"parse_failure,omitempty"`
|
||||
}
|
||||
|
||||
type nvidiaGPUHealth struct {
|
||||
Index int
|
||||
Name string
|
||||
NeedsReset bool
|
||||
RawLine string
|
||||
ParseFailure bool
|
||||
}
|
||||
|
||||
type nvidiaGPUStatusFile struct {
|
||||
Index int
|
||||
Name string
|
||||
RunStatus string
|
||||
Reason string
|
||||
Health string
|
||||
HealthRaw string
|
||||
Observed bool
|
||||
Selected bool
|
||||
FailingJob string
|
||||
}
|
||||
|
||||
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
|
||||
type AMDGPUInfo struct {
|
||||
Index int `json:"index"`
|
||||
@@ -269,6 +300,72 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
|
||||
return gpus, nil
|
||||
}
|
||||
|
||||
func (s *System) ListNvidiaGPUStatuses() ([]NvidiaGPUStatus, error) {
|
||||
out, err := satExecCommand(
|
||||
"nvidia-smi",
|
||||
"--query-gpu=index,name,pci.bus_id,serial,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total",
|
||||
"--format=csv,noheader,nounits",
|
||||
).Output()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
||||
}
|
||||
var gpus []NvidiaGPUStatus
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.Split(line, ",")
|
||||
if len(parts) < 4 {
|
||||
gpus = append(gpus, NvidiaGPUStatus{RawLine: line, Status: "UNKNOWN", ParseFailure: true})
|
||||
continue
|
||||
}
|
||||
idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||
if err != nil {
|
||||
gpus = append(gpus, NvidiaGPUStatus{RawLine: line, Status: "UNKNOWN", ParseFailure: true})
|
||||
continue
|
||||
}
|
||||
upper := strings.ToUpper(line)
|
||||
needsReset := strings.Contains(upper, "GPU REQUIRES RESET")
|
||||
status := "OK"
|
||||
if needsReset {
|
||||
status = "RESET_REQUIRED"
|
||||
}
|
||||
gpus = append(gpus, NvidiaGPUStatus{
|
||||
Index: idx,
|
||||
Name: strings.TrimSpace(parts[1]),
|
||||
BDF: normalizeNvidiaBusID(strings.TrimSpace(parts[2])),
|
||||
Serial: strings.TrimSpace(parts[3]),
|
||||
Status: status,
|
||||
RawLine: line,
|
||||
NeedsReset: needsReset,
|
||||
})
|
||||
}
|
||||
sort.Slice(gpus, func(i, j int) bool { return gpus[i].Index < gpus[j].Index })
|
||||
return gpus, nil
|
||||
}
|
||||
|
||||
func normalizeNvidiaBusID(v string) string {
|
||||
v = strings.TrimSpace(strings.ToLower(v))
|
||||
parts := strings.Split(v, ":")
|
||||
if len(parts) == 3 && len(parts[0]) > 4 {
|
||||
parts[0] = parts[0][len(parts[0])-4:]
|
||||
return strings.Join(parts, ":")
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
func (s *System) ResetNvidiaGPU(index int) (string, error) {
|
||||
if index < 0 {
|
||||
return "", fmt.Errorf("gpu index must be >= 0")
|
||||
}
|
||||
raw, err := satExecCommand("nvidia-smi", "-r", "-i", strconv.Itoa(index)).CombinedOutput()
|
||||
if len(raw) == 0 && err == nil {
|
||||
raw = []byte("GPU reset completed.\n")
|
||||
}
|
||||
return string(raw), err
|
||||
}
|
||||
|
||||
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
|
||||
// Measures collective communication bandwidth over NVLink/PCIe.
|
||||
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
@@ -434,9 +531,13 @@ func memoryStressSizeArg() string {
|
||||
return fmt.Sprintf("%dM", targetMB)
|
||||
}
|
||||
|
||||
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
|
||||
passes := envInt("BEE_MEMTESTER_PASSES", 1)
|
||||
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
||||
if sizeMB <= 0 {
|
||||
sizeMB = 256
|
||||
}
|
||||
if passes <= 0 {
|
||||
passes = 1
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
||||
@@ -493,7 +594,7 @@ func (s *System) RunCPUAcceptancePack(ctx context.Context, baseDir string, durat
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||
if baseDir == "" {
|
||||
baseDir = "/var/log/bee-sat"
|
||||
}
|
||||
@@ -525,7 +626,7 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, l
|
||||
break
|
||||
}
|
||||
prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
|
||||
commands := storageSATCommands(devPath)
|
||||
commands := storageSATCommands(devPath, extended)
|
||||
for cmdIndex, job := range commands {
|
||||
if ctx.Err() != nil {
|
||||
break
|
||||
@@ -604,7 +705,7 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
satJob{name: "04-dcgmi-diag.log", cmd: diagArgs},
|
||||
satJob{name: "04-dcgmi-diag.log", cmd: diagArgs, gpuIndices: gpuIndices},
|
||||
)
|
||||
}
|
||||
|
||||
@@ -652,11 +753,23 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
||||
|
||||
var summary strings.Builder
|
||||
stats := satStats{}
|
||||
nvidiaPack := strings.HasPrefix(prefix, "gpu-nvidia")
|
||||
perGPU := map[int]*nvidiaGPUStatusFile{}
|
||||
selectedGPUIndices := map[int]struct{}{}
|
||||
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||
for _, job := range jobs {
|
||||
if ctx.Err() != nil {
|
||||
break
|
||||
}
|
||||
for _, idx := range job.gpuIndices {
|
||||
selectedGPUIndices[idx] = struct{}{}
|
||||
status := perGPU[idx]
|
||||
if status == nil {
|
||||
status = &nvidiaGPUStatusFile{Index: idx}
|
||||
perGPU[idx] = status
|
||||
}
|
||||
status.Selected = true
|
||||
}
|
||||
cmd := make([]string, 0, len(job.cmd))
|
||||
for _, arg := range job.cmd {
|
||||
cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
|
||||
@@ -665,10 +778,37 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
||||
var out []byte
|
||||
var err error
|
||||
|
||||
if job.collectGPU {
|
||||
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
|
||||
} else {
|
||||
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
|
||||
if nvidiaPack && nvidiaJobNeedsHealthCheck(job) {
|
||||
if msg, healthErr := checkNvidiaJobHealth(job.gpuIndices); healthErr != nil {
|
||||
if logFunc != nil {
|
||||
logFunc(msg)
|
||||
}
|
||||
out = []byte(msg + "\n")
|
||||
err = healthErr
|
||||
}
|
||||
}
|
||||
|
||||
if err == nil {
|
||||
if job.collectGPU {
|
||||
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
|
||||
} else {
|
||||
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
|
||||
}
|
||||
}
|
||||
|
||||
if nvidiaPack && nvidiaJobNeedsHealthCheck(job) {
|
||||
if msg, healthErr := checkNvidiaJobHealth(job.gpuIndices); healthErr != nil {
|
||||
if logFunc != nil {
|
||||
logFunc(msg)
|
||||
}
|
||||
if len(out) > 0 && !bytes.HasSuffix(out, []byte("\n")) {
|
||||
out = append(out, '\n')
|
||||
}
|
||||
out = append(out, []byte(msg+"\n")...)
|
||||
if err == nil {
|
||||
err = healthErr
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
||||
@@ -679,6 +819,11 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
||||
}
|
||||
status, rc := classifySATResult(job.name, out, err)
|
||||
stats.Add(status)
|
||||
if nvidiaPack && len(job.gpuIndices) > 0 && nvidiaJobNeedsHealthCheck(job) {
|
||||
for _, idx := range job.gpuIndices {
|
||||
updateNvidiaGPUStatus(perGPU, idx, status, job.name, string(out))
|
||||
}
|
||||
}
|
||||
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
|
||||
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
|
||||
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
|
||||
@@ -687,6 +832,11 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
if nvidiaPack {
|
||||
if err := writeNvidiaGPUStatusFiles(runDir, stats.Overall(), perGPU, selectedGPUIndices); err != nil {
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
|
||||
archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
|
||||
if err := createTarGz(archive, runDir); err != nil {
|
||||
@@ -695,6 +845,197 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
||||
return archive, nil
|
||||
}
|
||||
|
||||
func updateNvidiaGPUStatus(perGPU map[int]*nvidiaGPUStatusFile, idx int, status, jobName, detail string) {
|
||||
entry := perGPU[idx]
|
||||
if entry == nil {
|
||||
entry = &nvidiaGPUStatusFile{Index: idx}
|
||||
perGPU[idx] = entry
|
||||
}
|
||||
if nvidiaSATStatusSeverity(status) >= nvidiaSATStatusSeverity(entry.RunStatus) {
|
||||
entry.RunStatus = status
|
||||
entry.FailingJob = jobName
|
||||
entry.Reason = firstLine(detail)
|
||||
}
|
||||
}
|
||||
|
||||
func writeNvidiaGPUStatusFiles(runDir, overall string, perGPU map[int]*nvidiaGPUStatusFile, selected map[int]struct{}) error {
|
||||
health, err := readNvidiaGPUHealth()
|
||||
if err == nil {
|
||||
for _, gpu := range health {
|
||||
entry := perGPU[gpu.Index]
|
||||
if entry == nil {
|
||||
entry = &nvidiaGPUStatusFile{Index: gpu.Index}
|
||||
perGPU[gpu.Index] = entry
|
||||
}
|
||||
entry.Name = gpu.Name
|
||||
entry.Observed = true
|
||||
entry.HealthRaw = gpu.RawLine
|
||||
if gpu.NeedsReset {
|
||||
entry.Health = "RESET_REQUIRED"
|
||||
if entry.RunStatus == "" || nvidiaSATStatusSeverity("FAILED") >= nvidiaSATStatusSeverity(entry.RunStatus) {
|
||||
entry.RunStatus = "FAILED"
|
||||
if strings.TrimSpace(entry.Reason) == "" {
|
||||
entry.Reason = "GPU requires reset"
|
||||
}
|
||||
}
|
||||
} else {
|
||||
entry.Health = "OK"
|
||||
}
|
||||
}
|
||||
}
|
||||
for idx := range selected {
|
||||
entry := perGPU[idx]
|
||||
if entry == nil {
|
||||
entry = &nvidiaGPUStatusFile{Index: idx}
|
||||
perGPU[idx] = entry
|
||||
}
|
||||
entry.Selected = true
|
||||
}
|
||||
var indices []int
|
||||
for idx := range perGPU {
|
||||
indices = append(indices, idx)
|
||||
}
|
||||
sort.Ints(indices)
|
||||
for _, idx := range indices {
|
||||
entry := perGPU[idx]
|
||||
if entry.RunStatus == "" {
|
||||
entry.RunStatus = overall
|
||||
}
|
||||
if entry.Health == "" {
|
||||
entry.Health = "UNKNOWN"
|
||||
}
|
||||
if entry.Name == "" {
|
||||
entry.Name = "unknown"
|
||||
}
|
||||
var body strings.Builder
|
||||
fmt.Fprintf(&body, "gpu_index=%d\n", entry.Index)
|
||||
fmt.Fprintf(&body, "gpu_name=%s\n", entry.Name)
|
||||
fmt.Fprintf(&body, "selected=%t\n", entry.Selected)
|
||||
fmt.Fprintf(&body, "observed=%t\n", entry.Observed)
|
||||
fmt.Fprintf(&body, "run_status=%s\n", entry.RunStatus)
|
||||
fmt.Fprintf(&body, "health_status=%s\n", entry.Health)
|
||||
if strings.TrimSpace(entry.FailingJob) != "" {
|
||||
fmt.Fprintf(&body, "failing_job=%s\n", entry.FailingJob)
|
||||
}
|
||||
if strings.TrimSpace(entry.Reason) != "" {
|
||||
fmt.Fprintf(&body, "reason=%s\n", entry.Reason)
|
||||
}
|
||||
if strings.TrimSpace(entry.HealthRaw) != "" {
|
||||
fmt.Fprintf(&body, "health_raw=%s\n", entry.HealthRaw)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-status.txt", idx)), []byte(body.String()), 0644); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func nvidiaSATStatusSeverity(status string) int {
|
||||
switch strings.ToUpper(strings.TrimSpace(status)) {
|
||||
case "FAILED":
|
||||
return 3
|
||||
case "PARTIAL", "UNSUPPORTED":
|
||||
return 2
|
||||
case "OK":
|
||||
return 1
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
func firstLine(s string) string {
|
||||
s = strings.TrimSpace(s)
|
||||
if s == "" {
|
||||
return ""
|
||||
}
|
||||
if idx := strings.IndexByte(s, '\n'); idx >= 0 {
|
||||
return strings.TrimSpace(s[:idx])
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
func nvidiaJobNeedsHealthCheck(job satJob) bool {
|
||||
if job.collectGPU {
|
||||
return true
|
||||
}
|
||||
name := strings.ToLower(strings.TrimSpace(job.name))
|
||||
return strings.Contains(name, "dcgmi") ||
|
||||
strings.Contains(name, "gpu-burn") ||
|
||||
strings.Contains(name, "gpu-stress") ||
|
||||
strings.Contains(name, "dcgmproftester")
|
||||
}
|
||||
|
||||
func checkNvidiaJobHealth(selected []int) (string, error) {
|
||||
health, err := readNvidiaGPUHealth()
|
||||
if err != nil {
|
||||
return "", nil
|
||||
}
|
||||
var bad []nvidiaGPUHealth
|
||||
selectedSet := make(map[int]struct{}, len(selected))
|
||||
for _, idx := range selected {
|
||||
selectedSet[idx] = struct{}{}
|
||||
}
|
||||
for _, gpu := range health {
|
||||
if len(selectedSet) > 0 {
|
||||
if _, ok := selectedSet[gpu.Index]; !ok {
|
||||
continue
|
||||
}
|
||||
}
|
||||
if gpu.NeedsReset {
|
||||
bad = append(bad, gpu)
|
||||
}
|
||||
}
|
||||
if len(bad) == 0 {
|
||||
return "", nil
|
||||
}
|
||||
lines := make([]string, 0, len(bad)+1)
|
||||
lines = append(lines, "NVIDIA GPU health check failed:")
|
||||
for _, gpu := range bad {
|
||||
lines = append(lines, fmt.Sprintf("gpu %d (%s) requires reset: %s", gpu.Index, gpu.Name, gpu.RawLine))
|
||||
}
|
||||
return strings.Join(lines, "\n"), errors.New("nvidia gpu requires reset")
|
||||
}
|
||||
|
||||
func readNvidiaGPUHealth() ([]nvidiaGPUHealth, error) {
|
||||
out, err := satExecCommand(
|
||||
"nvidia-smi",
|
||||
"--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total",
|
||||
"--format=csv,noheader,nounits",
|
||||
).Output()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
||||
}
|
||||
return parseNvidiaGPUHealth(string(out)), nil
|
||||
}
|
||||
|
||||
func parseNvidiaGPUHealth(raw string) []nvidiaGPUHealth {
|
||||
var gpus []nvidiaGPUHealth
|
||||
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.Split(line, ",")
|
||||
if len(parts) < 2 {
|
||||
gpus = append(gpus, nvidiaGPUHealth{RawLine: line, ParseFailure: true})
|
||||
continue
|
||||
}
|
||||
idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||
if err != nil {
|
||||
gpus = append(gpus, nvidiaGPUHealth{RawLine: line, ParseFailure: true})
|
||||
continue
|
||||
}
|
||||
upper := strings.ToUpper(line)
|
||||
gpus = append(gpus, nvidiaGPUHealth{
|
||||
Index: idx,
|
||||
Name: strings.TrimSpace(parts[1]),
|
||||
NeedsReset: strings.Contains(upper, "GPU REQUIRES RESET"),
|
||||
RawLine: line,
|
||||
})
|
||||
}
|
||||
return gpus
|
||||
}
|
||||
|
||||
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string, logFunc func(string)) ([]byte, error) {
|
||||
start := time.Now().UTC()
|
||||
resolvedCmd, err := resolveSATCommand(cmd)
|
||||
@@ -749,17 +1090,25 @@ func listStorageDevices() ([]string, error) {
|
||||
return parseStorageDevices(string(out)), nil
|
||||
}
|
||||
|
||||
func storageSATCommands(devPath string) []satJob {
|
||||
func storageSATCommands(devPath string, extended bool) []satJob {
|
||||
if strings.Contains(filepath.Base(devPath), "nvme") {
|
||||
selfTestLevel := "1"
|
||||
if extended {
|
||||
selfTestLevel = "2"
|
||||
}
|
||||
return []satJob{
|
||||
{name: "nvme-id-ctrl", cmd: []string{"nvme", "id-ctrl", devPath, "-o", "json"}},
|
||||
{name: "nvme-smart-log", cmd: []string{"nvme", "smart-log", devPath, "-o", "json"}},
|
||||
{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", "1", "--wait"}},
|
||||
{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", selfTestLevel, "--wait"}},
|
||||
}
|
||||
}
|
||||
smartTestType := "short"
|
||||
if extended {
|
||||
smartTestType = "long"
|
||||
}
|
||||
return []satJob{
|
||||
{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}},
|
||||
{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", "short", devPath}},
|
||||
{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", smartTestType, devPath}},
|
||||
}
|
||||
}
|
||||
|
||||
@@ -818,6 +1167,11 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
|
||||
// nvidia-smi on a machine with no NVIDIA GPU
|
||||
strings.Contains(text, "couldn't communicate with the nvidia driver") ||
|
||||
strings.Contains(text, "no nvidia gpu") ||
|
||||
// Some NVMe firmwares start self-test but never expose progress to nvme-cli
|
||||
// while waiting, so the CLI stops polling without proving device failure.
|
||||
(strings.Contains(name, "self-test") &&
|
||||
strings.Contains(text, "no progress for") &&
|
||||
strings.Contains(text, "stop waiting")) ||
|
||||
(strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) {
|
||||
return "UNSUPPORTED", rc
|
||||
}
|
||||
|
||||
@@ -20,7 +20,7 @@ type FanStressOptions struct {
|
||||
Phase1DurSec int // first load phase duration in seconds (default 300)
|
||||
PauseSec int // pause between the two load phases (default 60)
|
||||
Phase2DurSec int // second load phase duration in seconds (default 300)
|
||||
SizeMB int // GPU memory to allocate per GPU during stress (default 64)
|
||||
SizeMB int // GPU memory to allocate per GPU during stress (0 = auto: 95% of VRAM)
|
||||
GPUIndices []int // which GPU indices to stress (empty = all detected)
|
||||
}
|
||||
|
||||
@@ -243,9 +243,8 @@ func applyFanStressDefaults(opts *FanStressOptions) {
|
||||
if opts.Phase2DurSec <= 0 {
|
||||
opts.Phase2DurSec = 300
|
||||
}
|
||||
if opts.SizeMB <= 0 {
|
||||
opts.SizeMB = 64
|
||||
}
|
||||
// SizeMB == 0 means "auto" (worker picks 95% of GPU VRAM for maximum power draw).
|
||||
// Leave at 0 to avoid passing a too-small size that starves the tensor-core path.
|
||||
}
|
||||
|
||||
// sampleFanStressRow collects all metrics for one telemetry sample.
|
||||
|
||||
@@ -14,12 +14,12 @@ import (
|
||||
func TestStorageSATCommands(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
nvme := storageSATCommands("/dev/nvme0n1")
|
||||
nvme := storageSATCommands("/dev/nvme0n1", false)
|
||||
if len(nvme) != 3 || nvme[2].cmd[0] != "nvme" {
|
||||
t.Fatalf("unexpected nvme commands: %#v", nvme)
|
||||
}
|
||||
|
||||
sata := storageSATCommands("/dev/sda")
|
||||
sata := storageSATCommands("/dev/sda", false)
|
||||
if len(sata) != 2 || sata[0].cmd[0] != "smartctl" {
|
||||
t.Fatalf("unexpected sata commands: %#v", sata)
|
||||
}
|
||||
@@ -216,6 +216,74 @@ func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNvidiaGPUHealthDetectsResetRequired(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
got := parseNvidiaGPUHealth("0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n")
|
||||
if len(got) != 2 {
|
||||
t.Fatalf("len=%d want 2", len(got))
|
||||
}
|
||||
if got[0].NeedsReset {
|
||||
t.Fatalf("gpu0 unexpectedly marked reset-required")
|
||||
}
|
||||
if !got[1].NeedsReset {
|
||||
t.Fatalf("gpu1 should be marked reset-required: %#v", got[1])
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckNvidiaJobHealthReturnsErrorForSelectedResetRequiredGPU(t *testing.T) {
|
||||
oldExecCommand := satExecCommand
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
if name == "nvidia-smi" {
|
||||
return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'")
|
||||
}
|
||||
return exec.Command(name, args...)
|
||||
}
|
||||
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||
|
||||
msg, err := checkNvidiaJobHealth([]int{1})
|
||||
if err == nil {
|
||||
t.Fatal("expected health check error")
|
||||
}
|
||||
if !strings.Contains(msg, "gpu 1") || !strings.Contains(strings.ToLower(msg), "requires reset") {
|
||||
t.Fatalf("unexpected message: %q", msg)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWriteNvidiaGPUStatusFilesCreatesPerGPUFiles(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
oldExecCommand := satExecCommand
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
if name == "nvidia-smi" {
|
||||
return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'")
|
||||
}
|
||||
return exec.Command(name, args...)
|
||||
}
|
||||
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||
|
||||
perGPU := map[int]*nvidiaGPUStatusFile{
|
||||
0: {Index: 0, RunStatus: "OK"},
|
||||
1: {Index: 1, RunStatus: "FAILED", FailingJob: "02-dcgmi-targeted-stress.log", Reason: "NVIDIA GPU health check failed:"},
|
||||
}
|
||||
if err := writeNvidiaGPUStatusFiles(dir, "FAILED", perGPU, map[int]struct{}{0: {}, 1: {}}); err != nil {
|
||||
t.Fatalf("writeNvidiaGPUStatusFiles error: %v", err)
|
||||
}
|
||||
raw, err := os.ReadFile(filepath.Join(dir, "gpu-1-status.txt"))
|
||||
if err != nil {
|
||||
t.Fatalf("ReadFile gpu-1-status.txt: %v", err)
|
||||
}
|
||||
text := string(raw)
|
||||
if !strings.Contains(text, "run_status=FAILED") {
|
||||
t.Fatalf("missing run status:\n%s", text)
|
||||
}
|
||||
if !strings.Contains(text, "health_status=RESET_REQUIRED") {
|
||||
t.Fatalf("missing health status:\n%s", text)
|
||||
}
|
||||
if !strings.Contains(text, "failing_job=02-dcgmi-targeted-stress.log") {
|
||||
t.Fatalf("missing failing job:\n%s", text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveDCGMProfTesterCommandUsesVersionedBinary(t *testing.T) {
|
||||
oldLookPath := satLookPath
|
||||
satLookPath = func(file string) (string, error) {
|
||||
@@ -341,6 +409,7 @@ func TestClassifySATResult(t *testing.T) {
|
||||
}{
|
||||
{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
|
||||
{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||
{name: "nvme wait timeout without progress", job: "nvme-device-self-test", out: "Short Device self-test started\nWaiting for self test completion...\nno progress for 78 seconds, stop waiting", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||
{name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
||||
{name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||
}
|
||||
|
||||
@@ -28,6 +28,12 @@ var apiListNvidiaGPUs = func(a *app.App) ([]platform.NvidiaGPU, error) {
|
||||
}
|
||||
return a.ListNvidiaGPUs()
|
||||
}
|
||||
var apiListNvidiaGPUStatuses = func(a *app.App) ([]platform.NvidiaGPUStatus, error) {
|
||||
if a == nil {
|
||||
return nil, fmt.Errorf("app not configured")
|
||||
}
|
||||
return a.ListNvidiaGPUStatuses()
|
||||
}
|
||||
|
||||
// ── Job ID counter ────────────────────────────────────────────────────────────
|
||||
|
||||
@@ -216,7 +222,21 @@ func formatSplitTaskName(baseName, selectionLabel string) string {
|
||||
}
|
||||
|
||||
func buildNvidiaTaskSet(target string, priority int, createdAt time.Time, params taskParams, baseName string, appRef *app.App, idPrefix string) ([]*Task, error) {
|
||||
if !shouldSplitHomogeneousNvidiaTarget(target) {
|
||||
if !shouldSplitHomogeneousNvidiaTarget(target) || params.ParallelGPUs {
|
||||
// Parallel mode (or non-splittable target): one task for all selected GPUs.
|
||||
if params.ParallelGPUs && shouldSplitHomogeneousNvidiaTarget(target) {
|
||||
// Resolve the selected GPU indices so ExcludeGPUIndices is applied.
|
||||
gpus, err := apiListNvidiaGPUs(appRef)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
resolved, err := expandSelectedGPUIndices(gpus, params.GPUIndices, params.ExcludeGPUIndices)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
params.GPUIndices = resolved
|
||||
params.ExcludeGPUIndices = nil
|
||||
}
|
||||
t := &Task{
|
||||
ID: newJobID(idPrefix),
|
||||
Name: baseName,
|
||||
@@ -256,6 +276,53 @@ func buildNvidiaTaskSet(target string, priority int, createdAt time.Time, params
|
||||
return tasks, nil
|
||||
}
|
||||
|
||||
// expandSelectedGPUIndices returns the sorted list of selected GPU indices after
|
||||
// applying include/exclude filters, without splitting by model.
|
||||
func expandSelectedGPUIndices(gpus []platform.NvidiaGPU, include, exclude []int) ([]int, error) {
|
||||
indexed := make(map[int]struct{}, len(gpus))
|
||||
allIndices := make([]int, 0, len(gpus))
|
||||
for _, gpu := range gpus {
|
||||
indexed[gpu.Index] = struct{}{}
|
||||
allIndices = append(allIndices, gpu.Index)
|
||||
}
|
||||
sort.Ints(allIndices)
|
||||
|
||||
selected := allIndices
|
||||
if len(include) > 0 {
|
||||
selected = make([]int, 0, len(include))
|
||||
seen := make(map[int]struct{}, len(include))
|
||||
for _, idx := range include {
|
||||
if _, ok := indexed[idx]; !ok {
|
||||
continue
|
||||
}
|
||||
if _, dup := seen[idx]; dup {
|
||||
continue
|
||||
}
|
||||
seen[idx] = struct{}{}
|
||||
selected = append(selected, idx)
|
||||
}
|
||||
sort.Ints(selected)
|
||||
}
|
||||
if len(exclude) > 0 {
|
||||
skip := make(map[int]struct{}, len(exclude))
|
||||
for _, idx := range exclude {
|
||||
skip[idx] = struct{}{}
|
||||
}
|
||||
filtered := selected[:0]
|
||||
for _, idx := range selected {
|
||||
if _, ok := skip[idx]; ok {
|
||||
continue
|
||||
}
|
||||
filtered = append(filtered, idx)
|
||||
}
|
||||
selected = filtered
|
||||
}
|
||||
if len(selected) == 0 {
|
||||
return nil, fmt.Errorf("no NVIDIA GPUs selected")
|
||||
}
|
||||
return selected, nil
|
||||
}
|
||||
|
||||
// ── SSE helpers ───────────────────────────────────────────────────────────────
|
||||
|
||||
func sseWrite(w http.ResponseWriter, event, data string) bool {
|
||||
@@ -417,7 +484,7 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
||||
|
||||
var body struct {
|
||||
Duration int `json:"duration"`
|
||||
DiagLevel int `json:"diag_level"`
|
||||
StressMode bool `json:"stress_mode"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||
Loader string `json:"loader"`
|
||||
@@ -438,7 +505,7 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
||||
}
|
||||
params := taskParams{
|
||||
Duration: body.Duration,
|
||||
DiagLevel: body.DiagLevel,
|
||||
StressMode: body.StressMode,
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
Loader: body.Loader,
|
||||
@@ -470,6 +537,7 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||
RunNCCL *bool `json:"run_nccl"`
|
||||
ParallelGPUs *bool `json:"parallel_gpus"`
|
||||
DisplayName string `json:"display_name"`
|
||||
}
|
||||
if r.Body != nil {
|
||||
@@ -483,6 +551,10 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
|
||||
if body.RunNCCL != nil {
|
||||
runNCCL = *body.RunNCCL
|
||||
}
|
||||
parallelGPUs := false
|
||||
if body.ParallelGPUs != nil {
|
||||
parallelGPUs = *body.ParallelGPUs
|
||||
}
|
||||
name := taskDisplayName("nvidia-benchmark", "", "")
|
||||
if strings.TrimSpace(body.DisplayName) != "" {
|
||||
name = body.DisplayName
|
||||
@@ -493,6 +565,7 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
|
||||
SizeMB: body.SizeMB,
|
||||
BenchmarkProfile: body.Profile,
|
||||
RunNCCL: runNCCL,
|
||||
ParallelGPUs: parallelGPUs,
|
||||
DisplayName: body.DisplayName,
|
||||
}, name, h.opts.App, "benchmark-nvidia")
|
||||
if err != nil {
|
||||
@@ -782,6 +855,42 @@ func (h *handler) handleAPIGNVIDIAGPUs(w http.ResponseWriter, _ *http.Request) {
|
||||
writeJSON(w, gpus)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIGNVIDIAGPUStatuses(w http.ResponseWriter, _ *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
gpus, err := apiListNvidiaGPUStatuses(h.opts.App)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
if gpus == nil {
|
||||
gpus = []platform.NvidiaGPUStatus{}
|
||||
}
|
||||
writeJSON(w, gpus)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIGNVIDIAReset(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
var req struct {
|
||||
Index int `json:"index"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
return
|
||||
}
|
||||
result, err := h.opts.App.ResetNvidiaGPU(req.Index)
|
||||
status := "ok"
|
||||
if err != nil {
|
||||
status = "error"
|
||||
}
|
||||
writeJSON(w, map[string]string{"status": status, "output": result.Body})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
|
||||
@@ -1036,10 +1036,12 @@ func renderValidate(opts HandlerOptions) string {
|
||||
<div class="card-body validate-profile-body">
|
||||
<div class="validate-profile-col">
|
||||
<div class="form-row" style="margin:0"><label>Cycles</label><input type="number" id="sat-cycles" value="1" min="1" max="100" style="width:100%"></div>
|
||||
<div class="form-row" style="margin:12px 0 0"><label>Diag level</label><select id="sat-profile-nvidia-level" style="width:100%"><option value="1">Level 1 — Quick</option><option value="2">Level 2 — Standard</option><option value="3">Level 3 — Extended</option><option value="4">Level 4 — Full</option></select></div>
|
||||
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (~30–60 min)</span></label>
|
||||
</div>
|
||||
<div class="validate-profile-col validate-profile-action">
|
||||
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count. NVIDIA <code>dcgmi diag</code> uses the selected diag level from this profile.</p>
|
||||
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~5–15 min total); Stress is thorough (~30–60 min total).</p>
|
||||
<button class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
||||
</div>
|
||||
<div class="validate-profile-col"></div>
|
||||
@@ -1054,19 +1056,19 @@ func renderValidate(opts HandlerOptions) string {
|
||||
inv.CPU,
|
||||
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
|
||||
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
|
||||
`Duration is taken from Validate Profile diag level: Level 1 = 60s, Level 2 = 5m, Level 3 = 1h, Level 4 = 1h.`,
|
||||
`60s in Validate, 30 min in Stress.`,
|
||||
)) +
|
||||
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
|
||||
inv.Memory,
|
||||
`Runs a short RAM validation pass and records memory state around the test.`,
|
||||
`Runs a RAM validation pass and records memory state around the test.`,
|
||||
`<code>free</code>, <code>memtester</code>`,
|
||||
`No extra settings.`,
|
||||
`256 MB / 1 pass in Validate, 1 GB / 3 passes in Stress.`,
|
||||
)) +
|
||||
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||||
inv.Storage,
|
||||
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
|
||||
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
||||
`No extra settings.`,
|
||||
`Short self-test in Validate, extended self-test in Stress.`,
|
||||
)) +
|
||||
`</div>
|
||||
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||
@@ -1083,6 +1085,12 @@ func renderValidate(opts HandlerOptions) string {
|
||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||
</div>
|
||||
<p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA validate tasks.</p>
|
||||
<div style="margin-top:10px;padding-top:10px;border-top:1px solid var(--border)">
|
||||
<label class="sat-gpu-row" title="When checked, multi-GPU tests (PSU Pulse, NCCL, NVBandwidth) run on ALL GPUs in the system regardless of the selection above.">
|
||||
<input type="checkbox" id="sat-multi-gpu-all" checked onchange="satUpdateGPUSelectionNote()">
|
||||
<span><strong>Multi-GPU tests</strong> — use all GPUs <span style="font-size:11px;color:var(--muted)">(PSU Pulse, NCCL, NVBandwidth)</span></span>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -1091,14 +1099,48 @@ func renderValidate(opts HandlerOptions) string {
|
||||
inv.NVIDIA,
|
||||
`Runs NVIDIA diagnostics and board inventory checks.`,
|
||||
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||
`Runs one GPU at a time on the selected NVIDIA GPUs. Diag level is taken from Validate Profile.`,
|
||||
`Level 2 in Validate, Level 3 in Stress. Runs one GPU at a time on the selected NVIDIA GPUs.`,
|
||||
)) +
|
||||
`<div id="sat-card-nvidia-targeted-stress">` +
|
||||
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Runs a controlled NVIDIA DCGM load in Validate to check stability under moderate stress.`,
|
||||
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
||||
`<code>dcgmi diag targeted_stress</code>`,
|
||||
`Runs one GPU at a time on the selected NVIDIA GPUs with the fixed DCGM targeted stress recipe.`,
|
||||
`Skipped in Validate mode. Runs after dcgmi diag in Stress mode. Runs one GPU at a time on the selected NVIDIA GPUs.<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-targeted-power">` +
|
||||
renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
||||
`<code>dcgmi diag targeted_power</code>`,
|
||||
`Skipped in Validate mode. Runs in Stress mode only. Runs one GPU at a time.<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-pulse">` +
|
||||
renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
|
||||
`<code>dcgmi diag pulse_test</code>`,
|
||||
`Skipped in Validate mode. Runs in Stress mode only. Runs all selected GPUs simultaneously — synchronous pulsing is required to stress the PSU.<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-interconnect">` +
|
||||
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
|
||||
`<code>all_reduce_perf</code> (NCCL tests)`,
|
||||
`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously (requires ≥2).<p id="sat-ni-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-bandwidth">` +
|
||||
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
||||
`<code>nvbandwidth</code>`,
|
||||
`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously.<p id="sat-nb-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`</div>
|
||||
<div class="grid3" style="margin-top:16px">
|
||||
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
||||
@@ -1125,17 +1167,28 @@ func renderValidate(opts HandlerOptions) string {
|
||||
</style>
|
||||
<script>
|
||||
let satES = null;
|
||||
function satDiagLevel() {
|
||||
return parseInt(document.getElementById('sat-profile-nvidia-level').value) || 1;
|
||||
function satStressMode() {
|
||||
return document.querySelector('input[name="sat-mode"]:checked')?.value === 'stress';
|
||||
}
|
||||
function satCPUDurationFromDiagLevel() {
|
||||
const level = satDiagLevel();
|
||||
if (level === 1) return 60;
|
||||
if (level === 2) return 5 * 60;
|
||||
return 60 * 60;
|
||||
function satModeChanged() {
|
||||
const stress = satStressMode();
|
||||
[
|
||||
{card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
|
||||
{card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
|
||||
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
|
||||
{card: 'sat-card-nvidia-interconnect', hint: 'sat-ni-mode-hint'},
|
||||
{card: 'sat-card-nvidia-bandwidth', hint: 'sat-nb-mode-hint'},
|
||||
].forEach(function(item) {
|
||||
const card = document.getElementById(item.card);
|
||||
if (card) {
|
||||
card.style.opacity = stress ? '1' : '0.5';
|
||||
const hint = document.getElementById(item.hint);
|
||||
if (hint) hint.style.display = stress ? 'none' : '';
|
||||
}
|
||||
});
|
||||
}
|
||||
function satLabels() {
|
||||
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||
}
|
||||
let satNvidiaGPUsPromise = null;
|
||||
function loadSatNvidiaGPUs() {
|
||||
@@ -1156,6 +1209,10 @@ function satSelectedGPUIndices() {
|
||||
.filter(function(v) { return !Number.isNaN(v); })
|
||||
.sort(function(a, b) { return a - b; });
|
||||
}
|
||||
function satMultiGPUAll() {
|
||||
const cb = document.getElementById('sat-multi-gpu-all');
|
||||
return cb ? cb.checked : true;
|
||||
}
|
||||
function satUpdateGPUSelectionNote() {
|
||||
const note = document.getElementById('sat-gpu-selection-note');
|
||||
if (!note) return;
|
||||
@@ -1164,7 +1221,8 @@ function satUpdateGPUSelectionNote() {
|
||||
note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA validate tasks.';
|
||||
return;
|
||||
}
|
||||
note.textContent = 'Selected NVIDIA GPUs: ' + selected.join(', ') + '.';
|
||||
const multiAll = satMultiGPUAll();
|
||||
note.textContent = 'Selected GPUs: ' + selected.join(', ') + '. Multi-GPU tests: ' + (multiAll ? 'all GPUs in system' : 'selected GPUs only') + '.';
|
||||
}
|
||||
function satRenderGPUList(gpus) {
|
||||
const root = document.getElementById('sat-gpu-list');
|
||||
@@ -1211,9 +1269,8 @@ function satRequestBody(target, overrides) {
|
||||
const body = {};
|
||||
const labels = satLabels();
|
||||
body.display_name = labels[target] || ('Validate ' + target);
|
||||
if (target === 'nvidia') body.diag_level = satDiagLevel();
|
||||
if (target === 'nvidia-targeted-stress') body.duration = 300;
|
||||
if (target === 'cpu') body.duration = satCPUDurationFromDiagLevel();
|
||||
body.stress_mode = satStressMode();
|
||||
if (target === 'cpu') body.duration = satStressMode() ? 1800 : 60;
|
||||
if (overrides) {
|
||||
Object.keys(overrides).forEach(key => { body[key] = overrides[key]; });
|
||||
}
|
||||
@@ -1275,8 +1332,28 @@ function runSATWithOverrides(target, overrides) {
|
||||
return enqueueSATTarget(target, overrides)
|
||||
.then(d => streamSATTask(d.task_id, title, false));
|
||||
}
|
||||
const nvidiaPerGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power'];
|
||||
// pulse_test and fabric tests run on all selected GPUs simultaneously
|
||||
const nvidiaAllGPUTargets = ['nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||||
function satAllGPUIndicesForMulti() {
|
||||
// If "Multi-GPU tests — all GPUs" is checked, return all detected GPUs.
|
||||
// Otherwise fall back to the per-GPU selection.
|
||||
if (satMultiGPUAll()) {
|
||||
return loadSatNvidiaGPUs().then(function(gpus) {
|
||||
return gpus.map(function(g) { return Number(g.index); });
|
||||
});
|
||||
}
|
||||
const sel = satSelectedGPUIndices();
|
||||
return Promise.resolve(sel);
|
||||
}
|
||||
function expandSATTarget(target) {
|
||||
if (target !== 'nvidia' && target !== 'nvidia-targeted-stress') {
|
||||
if (nvidiaAllGPUTargets.indexOf(target) >= 0) {
|
||||
return satAllGPUIndicesForMulti().then(function(indices) {
|
||||
if (!indices.length) return Promise.reject(new Error('No NVIDIA GPUs available.'));
|
||||
return [{target: target, overrides: {gpu_indices: indices, display_name: satLabels()[target] || target}}];
|
||||
});
|
||||
}
|
||||
if (nvidiaPerGPUTargets.indexOf(target) < 0) {
|
||||
return Promise.resolve([{target: target}]);
|
||||
}
|
||||
const selected = satSelectedGPUIndices();
|
||||
@@ -1292,6 +1369,12 @@ function expandSATTarget(target) {
|
||||
label: satGPUDisplayName(gpu)
|
||||
})));
|
||||
}
|
||||
function runNvidiaFabricValidate(target) {
|
||||
satAllGPUIndicesForMulti().then(function(indices) {
|
||||
if (!indices.length) { alert('No NVIDIA GPUs available.'); return; }
|
||||
runSATWithOverrides(target, {gpu_indices: indices, display_name: satLabels()[target] || target});
|
||||
});
|
||||
}
|
||||
function runNvidiaValidateSet(target) {
|
||||
return loadSatNvidiaGPUs().then(gpus => {
|
||||
const selected = satSelectedGPUIndices();
|
||||
@@ -1354,8 +1437,10 @@ function runAllSAT() {
|
||||
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
|
||||
const status = document.getElementById('sat-all-status');
|
||||
status.textContent = 'Enqueuing...';
|
||||
const baseTargets = ['nvidia','nvidia-targeted-stress','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
||||
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||||
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
||||
const activeTargets = baseTargets.filter(target => {
|
||||
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
|
||||
const btn = document.getElementById('sat-btn-' + target);
|
||||
return !(btn && btn.disabled);
|
||||
});
|
||||
@@ -1390,6 +1475,10 @@ function runAllSAT() {
|
||||
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
||||
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-targeted-stress', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-targeted-power', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-pulse', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-interconnect', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-bandwidth', 'No NVIDIA GPU detected');
|
||||
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
|
||||
if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
|
||||
});
|
||||
@@ -1583,10 +1672,11 @@ func renderSATCard(id, label, runAction, headerActions, body string) string {
|
||||
// ── Benchmark ─────────────────────────────────────────────────────────────────
|
||||
|
||||
type benchmarkHistoryColumn struct {
|
||||
key string
|
||||
label string
|
||||
name string
|
||||
index int
|
||||
key string
|
||||
label string
|
||||
name string
|
||||
index int
|
||||
parallel bool
|
||||
}
|
||||
|
||||
type benchmarkHistoryCell struct {
|
||||
@@ -1625,6 +1715,10 @@ func renderBenchmark(opts HandlerOptions) string {
|
||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||
</div>
|
||||
</div>
|
||||
<label class="benchmark-cb-row">
|
||||
<input type="checkbox" id="benchmark-parallel-gpus">
|
||||
<span>Run all selected GPUs simultaneously (parallel mode)</span>
|
||||
</label>
|
||||
<label class="benchmark-cb-row">
|
||||
<input type="checkbox" id="benchmark-run-nccl" checked>
|
||||
<span>Run multi-GPU interconnect step (NCCL) only on the selected GPUs</span>
|
||||
@@ -1750,10 +1844,12 @@ function runNvidiaBenchmark() {
|
||||
return;
|
||||
}
|
||||
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
|
||||
const parallelGPUs = !!document.getElementById('benchmark-parallel-gpus').checked;
|
||||
const body = {
|
||||
profile: document.getElementById('benchmark-profile').value || 'standard',
|
||||
gpu_indices: selected,
|
||||
run_nccl: !!document.getElementById('benchmark-run-nccl').checked,
|
||||
parallel_gpus: parallelGPUs,
|
||||
display_name: 'NVIDIA Benchmark'
|
||||
};
|
||||
document.getElementById('benchmark-output').style.display = 'block';
|
||||
@@ -1887,17 +1983,43 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
|
||||
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||
cells: make(map[string]benchmarkHistoryCell),
|
||||
}
|
||||
for _, gpu := range result.GPUs {
|
||||
key := benchmarkHistoryColumnKey(gpu.Name, gpu.Index)
|
||||
columnByKey[key] = benchmarkHistoryColumn{
|
||||
key: key,
|
||||
label: benchmarkHistoryColumnLabel(gpu.Name, gpu.Index),
|
||||
name: strings.TrimSpace(gpu.Name),
|
||||
index: gpu.Index,
|
||||
|
||||
if result.ParallelGPUs {
|
||||
// All GPUs ran simultaneously — one column per server, score = avg composite.
|
||||
gpuModelCount := make(map[string]int)
|
||||
for _, gpu := range result.GPUs {
|
||||
gpuModelCount[strings.TrimSpace(gpu.Name)]++
|
||||
}
|
||||
run.cells[key] = benchmarkHistoryCell{
|
||||
score: gpu.Scores.CompositeScore,
|
||||
present: true,
|
||||
scoreSum := make(map[string]float64)
|
||||
scoreCnt := make(map[string]int)
|
||||
for _, gpu := range result.GPUs {
|
||||
key := "parallel|" + strings.TrimSpace(result.ServerModel) + "|" + strings.TrimSpace(gpu.Name)
|
||||
scoreSum[key] += gpu.Scores.CompositeScore
|
||||
scoreCnt[key]++
|
||||
count := gpuModelCount[strings.TrimSpace(gpu.Name)]
|
||||
columnByKey[key] = benchmarkHistoryColumn{
|
||||
key: key,
|
||||
label: benchmarkHistoryParallelLabel(result.ServerModel, gpu.Name, count),
|
||||
name: strings.TrimSpace(gpu.Name),
|
||||
index: -1,
|
||||
parallel: true,
|
||||
}
|
||||
}
|
||||
for key, sum := range scoreSum {
|
||||
run.cells[key] = benchmarkHistoryCell{score: sum / float64(scoreCnt[key]), present: true}
|
||||
}
|
||||
} else {
|
||||
// Each GPU ran independently — one column per GPU index.
|
||||
for _, gpu := range result.GPUs {
|
||||
key := "gpu|" + strings.TrimSpace(result.ServerModel) + "|" + strings.TrimSpace(gpu.Name) + "|" + strconv.Itoa(gpu.Index)
|
||||
columnByKey[key] = benchmarkHistoryColumn{
|
||||
key: key,
|
||||
label: benchmarkHistoryPerGPULabel(gpu.Name, gpu.Index),
|
||||
name: strings.TrimSpace(gpu.Name),
|
||||
index: gpu.Index,
|
||||
parallel: false,
|
||||
}
|
||||
run.cells[key] = benchmarkHistoryCell{score: gpu.Scores.CompositeScore, present: true}
|
||||
}
|
||||
}
|
||||
runs = append(runs, run)
|
||||
@@ -1907,16 +2029,24 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
|
||||
for _, col := range columnByKey {
|
||||
columns = append(columns, col)
|
||||
}
|
||||
// Sequential GPU columns first (sorted by GPU index), then parallel server columns.
|
||||
sort.Slice(columns, func(i, j int) bool {
|
||||
leftName := strings.ToLower(strings.TrimSpace(columns[i].name))
|
||||
rightName := strings.ToLower(strings.TrimSpace(columns[j].name))
|
||||
if leftName != rightName {
|
||||
return leftName < rightName
|
||||
if columns[i].parallel != columns[j].parallel {
|
||||
return !columns[i].parallel // sequential first
|
||||
}
|
||||
if columns[i].parallel {
|
||||
li := strings.ToLower(columns[i].label)
|
||||
lj := strings.ToLower(columns[j].label)
|
||||
if li != lj {
|
||||
return li < lj
|
||||
}
|
||||
return columns[i].key < columns[j].key
|
||||
}
|
||||
// Sequential: sort by GPU index, then name.
|
||||
if columns[i].index != columns[j].index {
|
||||
return columns[i].index < columns[j].index
|
||||
}
|
||||
return columns[i].key < columns[j].key
|
||||
return strings.ToLower(columns[i].name) < strings.ToLower(columns[j].name)
|
||||
})
|
||||
sort.Slice(runs, func(i, j int) bool {
|
||||
return runs[i].generatedAt.After(runs[j].generatedAt)
|
||||
@@ -1924,16 +2054,28 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
|
||||
return columns, runs
|
||||
}
|
||||
|
||||
func benchmarkHistoryColumnKey(name string, index int) string {
|
||||
return strings.TrimSpace(name) + "|" + strconv.Itoa(index)
|
||||
// benchmarkHistoryPerGPULabel formats a label for a single-GPU column: "GPU #N — ModelName".
|
||||
func benchmarkHistoryPerGPULabel(gpuName string, index int) string {
|
||||
gpuName = strings.TrimSpace(gpuName)
|
||||
if gpuName == "" {
|
||||
gpuName = "Unknown GPU"
|
||||
}
|
||||
return fmt.Sprintf("GPU #%d — %s", index, gpuName)
|
||||
}
|
||||
|
||||
func benchmarkHistoryColumnLabel(name string, index int) string {
|
||||
name = strings.TrimSpace(name)
|
||||
if name == "" {
|
||||
return fmt.Sprintf("GPU %d", index)
|
||||
// benchmarkHistoryParallelLabel formats a label for an all-GPU parallel column:
|
||||
// "ServerModel — N× ModelName (All GPUs)" or "N× ModelName (All GPUs)" if no server.
|
||||
func benchmarkHistoryParallelLabel(serverModel, gpuName string, count int) string {
|
||||
serverModel = strings.TrimSpace(serverModel)
|
||||
gpuName = strings.TrimSpace(gpuName)
|
||||
if gpuName == "" {
|
||||
gpuName = "Unknown GPU"
|
||||
}
|
||||
return fmt.Sprintf("%s / GPU %d", name, index)
|
||||
gpuPart := fmt.Sprintf("%d× %s (All GPUs)", count, gpuName)
|
||||
if serverModel == "" {
|
||||
return gpuPart
|
||||
}
|
||||
return fmt.Sprintf("%s — %s", serverModel, gpuPart)
|
||||
}
|
||||
|
||||
// ── Burn ──────────────────────────────────────────────────────────────────────
|
||||
@@ -2007,23 +2149,6 @@ func renderBurn() string {
|
||||
|
||||
<div class="burn-section">GPU-Specific Tests</div>
|
||||
<div class="grid2 burn-grid" style="margin-bottom:16px">
|
||||
<div class="card burn-card">
|
||||
<div class="card-head card-head-actions"><span>Power Delivery / Power Budget</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-power',target:'nvidia-targeted-power',label:'NVIDIA Targeted Power (dcgmi diag targeted_power)',nvidia:true},{id:'burn-nvidia-pulse',target:'nvidia-pulse',label:'NVIDIA Pulse Test (dcgmi diag pulse_test)',nvidia:true}])">Run</button></div>
|
||||
<div class="card-body burn-card-body">
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Official NVIDIA power-oriented recipes. ` + "targeted_power" + ` checks sustained delivery; ` + "pulse_test" + ` checks transient behavior.</p>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-nvidia-power" disabled><span>NVIDIA Targeted Power (dcgmi diag targeted_power) <span class="cb-note" id="note-nvidia-power"></span></span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-nvidia-pulse" disabled><span>NVIDIA Pulse Test (dcgmi diag pulse_test) <span class="cb-note" id="note-nvidia-pulse"></span></span></label>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card burn-card">
|
||||
<div class="card-head card-head-actions"><span>Interconnect / Bandwidth</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-interconnect',target:'nvidia-interconnect',label:'NVIDIA Interconnect Test (NCCL all_reduce_perf)',nvidia:true},{id:'burn-nvidia-bandwidth',target:'nvidia-bandwidth',label:'NVIDIA Bandwidth Test (NVBandwidth)',nvidia:true}])">Run</button></div>
|
||||
<div class="card-body burn-card-body">
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Official NVIDIA fabric paths. NCCL is interconnect-only and is not a compute burn. NVBandwidth validates copy and bandwidth paths.</p>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-nvidia-interconnect" disabled><span>NVIDIA Interconnect Test (NCCL all_reduce_perf) <span class="cb-note" id="note-nvidia-interconnect"></span></span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-nvidia-bandwidth" disabled><span>NVIDIA Bandwidth Test (NVBandwidth) <span class="cb-note" id="note-nvidia-bandwidth"></span></span></label>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="bi-output" style="display:none;margin-top:16px" class="card">
|
||||
@@ -2275,10 +2400,6 @@ function runAllBurnTasks() {
|
||||
const status = document.getElementById('burn-all-status');
|
||||
const all = [
|
||||
{id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},
|
||||
{id:'burn-nvidia-power',target:'nvidia-targeted-power',label:'NVIDIA Targeted Power (dcgmi diag targeted_power)',nvidia:true},
|
||||
{id:'burn-nvidia-pulse',target:'nvidia-pulse',label:'NVIDIA Pulse Test (dcgmi diag pulse_test)',nvidia:true},
|
||||
{id:'burn-nvidia-interconnect',target:'nvidia-interconnect',label:'NVIDIA Interconnect Test (NCCL all_reduce_perf)',nvidia:true},
|
||||
{id:'burn-nvidia-bandwidth',target:'nvidia-bandwidth',label:'NVIDIA Bandwidth Test (NVBandwidth)',nvidia:true},
|
||||
{id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},
|
||||
{id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},
|
||||
{id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'},
|
||||
@@ -2293,10 +2414,6 @@ function runAllBurnTasks() {
|
||||
fetch('/api/gpu/tools').then(function(r) { return r.json(); }).then(function(tools) {
|
||||
const map = {
|
||||
'nvidia-compute': {cb:'burn-nvidia-compute', note:'note-nvidia-compute', reason:'dcgmproftester not available or NVIDIA driver not running'},
|
||||
'nvidia-targeted-power': {cb:'burn-nvidia-power', note:'note-nvidia-power', reason:'dcgmi not available or NVIDIA driver not running'},
|
||||
'nvidia-pulse': {cb:'burn-nvidia-pulse', note:'note-nvidia-pulse', reason:'dcgmi not available or NVIDIA driver not running'},
|
||||
'nvidia-interconnect': {cb:'burn-nvidia-interconnect', note:'note-nvidia-interconnect', reason:'NCCL interconnect tool not available or NVIDIA driver not running'},
|
||||
'nvidia-bandwidth': {cb:'burn-nvidia-bandwidth', note:'note-nvidia-bandwidth', reason:'nvbandwidth or dcgmi not available or NVIDIA driver not running'},
|
||||
'bee-gpu-burn': {cb:'burn-gpu-bee', note:'note-bee', reason:'bee-gpu-burn not available or NVIDIA driver not running'},
|
||||
'john': {cb:'burn-gpu-john', note:'note-john', reason:'bee-john-gpu-stress not available or NVIDIA driver not running'},
|
||||
'rvs': {cb:'burn-gpu-rvs', note:'note-rvs', reason:'AMD driver not running'},
|
||||
@@ -2452,7 +2569,7 @@ func renderNetwork() string {
|
||||
|
||||
func renderServicesInline() string {
|
||||
return `<p style="font-size:13px;color:var(--muted);margin-bottom:10px">` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `</p>
|
||||
<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="restartGPUDrivers()">Restart GPU Drivers</button><button class="btn btn-sm btn-secondary" onclick="loadServices()">↻ Refresh</button></div>
|
||||
<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="loadServices()">↻ Refresh</button></div>
|
||||
<div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
||||
<div id="svc-out" style="display:none;margin-top:12px">
|
||||
<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
|
||||
@@ -2523,11 +2640,6 @@ function svcAction(btn, name, action) {
|
||||
btn.disabled = false;
|
||||
});
|
||||
}
|
||||
function restartGPUDrivers() {
|
||||
var btn = document.querySelector('[onclick*="restartGPUDrivers"]');
|
||||
if (!btn) { svcAction({textContent:'',disabled:false}, 'bee-nvidia', 'restart'); return; }
|
||||
svcAction(btn, 'bee-nvidia', 'restart');
|
||||
}
|
||||
loadServices();
|
||||
</script>`
|
||||
}
|
||||
@@ -2787,6 +2899,124 @@ loadDisplays();
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderNvidiaSelfHealInline() string {
|
||||
return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.</p>
|
||||
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:12px">
|
||||
<button id="nvidia-restart-btn" class="btn btn-secondary" onclick="nvidiaRestartDrivers()">Restart GPU Drivers</button>
|
||||
<button class="btn btn-sm btn-secondary" onclick="loadNvidiaSelfHeal()">↻ Refresh</button>
|
||||
</div>
|
||||
<div id="nvidia-self-heal-status" style="font-size:13px;color:var(--muted);margin-bottom:12px">Loading NVIDIA GPU status...</div>
|
||||
<div id="nvidia-self-heal-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
||||
<div id="nvidia-self-heal-out" style="display:none;margin-top:12px">
|
||||
<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
|
||||
<span id="nvidia-self-heal-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
|
||||
<span id="nvidia-self-heal-out-status" style="font-size:12px"></span>
|
||||
</div>
|
||||
<div id="nvidia-self-heal-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
|
||||
</div>
|
||||
<script>
|
||||
function nvidiaSelfHealShowResult(label, status, output) {
|
||||
var out = document.getElementById('nvidia-self-heal-out');
|
||||
var term = document.getElementById('nvidia-self-heal-terminal');
|
||||
var statusEl = document.getElementById('nvidia-self-heal-out-status');
|
||||
var labelEl = document.getElementById('nvidia-self-heal-out-label');
|
||||
out.style.display = 'block';
|
||||
labelEl.textContent = label;
|
||||
term.textContent = output || '(no output)';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
if (status === 'ok') {
|
||||
statusEl.textContent = '✓ done';
|
||||
statusEl.style.color = 'var(--ok-fg, #2c662d)';
|
||||
} else {
|
||||
statusEl.textContent = '✗ failed';
|
||||
statusEl.style.color = 'var(--crit-fg, #9f3a38)';
|
||||
}
|
||||
}
|
||||
function nvidiaRestartDrivers() {
|
||||
var btn = document.getElementById('nvidia-restart-btn');
|
||||
var original = btn.textContent;
|
||||
btn.disabled = true;
|
||||
btn.textContent = 'Restarting...';
|
||||
nvidiaSelfHealShowResult('restart bee-nvidia', 'ok', 'Running...');
|
||||
fetch('/api/services/action', {
|
||||
method:'POST',
|
||||
headers:{'Content-Type':'application/json'},
|
||||
body:JSON.stringify({name:'bee-nvidia', action:'restart'})
|
||||
}).then(r=>r.json()).then(d => {
|
||||
nvidiaSelfHealShowResult('restart bee-nvidia', d.status || 'error', d.output || d.error || '(no output)');
|
||||
setTimeout(function() {
|
||||
loadServices();
|
||||
loadNvidiaSelfHeal();
|
||||
}, 800);
|
||||
}).catch(e => {
|
||||
nvidiaSelfHealShowResult('restart bee-nvidia', 'error', 'Request failed: ' + e);
|
||||
}).finally(() => {
|
||||
btn.disabled = false;
|
||||
btn.textContent = original;
|
||||
});
|
||||
}
|
||||
function nvidiaResetGPU(index, btn) {
|
||||
var original = btn.textContent;
|
||||
btn.disabled = true;
|
||||
btn.textContent = 'Resetting...';
|
||||
nvidiaSelfHealShowResult('reset gpu ' + index, 'ok', 'Running...');
|
||||
fetch('/api/gpu/nvidia-reset', {
|
||||
method:'POST',
|
||||
headers:{'Content-Type':'application/json'},
|
||||
body:JSON.stringify({index:index})
|
||||
}).then(r=>r.json()).then(d => {
|
||||
nvidiaSelfHealShowResult('reset gpu ' + index, d.status || 'error', d.output || '(no output)');
|
||||
setTimeout(loadNvidiaSelfHeal, 1000);
|
||||
}).catch(e => {
|
||||
nvidiaSelfHealShowResult('reset gpu ' + index, 'error', 'Request failed: ' + e);
|
||||
}).finally(() => {
|
||||
btn.disabled = false;
|
||||
btn.textContent = original;
|
||||
});
|
||||
}
|
||||
function loadNvidiaSelfHeal() {
|
||||
var status = document.getElementById('nvidia-self-heal-status');
|
||||
var table = document.getElementById('nvidia-self-heal-table');
|
||||
status.textContent = 'Loading NVIDIA GPU status...';
|
||||
status.style.color = 'var(--muted)';
|
||||
table.innerHTML = '<p style="color:var(--muted);font-size:13px">Loading...</p>';
|
||||
fetch('/api/gpu/nvidia-status').then(r=>r.json()).then(gpus => {
|
||||
if (!Array.isArray(gpus) || gpus.length === 0) {
|
||||
status.textContent = 'No NVIDIA GPUs detected or nvidia-smi is unavailable.';
|
||||
table.innerHTML = '';
|
||||
return;
|
||||
}
|
||||
status.textContent = gpus.length + ' NVIDIA GPU(s) detected.';
|
||||
const rows = gpus.map(g => {
|
||||
const serial = g.serial || '';
|
||||
const bdf = g.bdf || '';
|
||||
const id = serial || bdf || ('gpu-' + g.index);
|
||||
const badge = g.status === 'OK' ? 'badge-ok' : g.status === 'RESET_REQUIRED' ? 'badge-err' : 'badge-warn';
|
||||
const details = [];
|
||||
if (serial) details.push('serial ' + serial);
|
||||
if (bdf) details.push('bdf ' + bdf);
|
||||
if (g.parse_failure && g.raw_line) details.push(g.raw_line);
|
||||
return '<tr>'
|
||||
+ '<td style="white-space:nowrap">' + g.index + '</td>'
|
||||
+ '<td>' + (g.name || 'unknown') + '</td>'
|
||||
+ '<td style="font-family:monospace">' + id + '</td>'
|
||||
+ '<td><span class="badge ' + badge + '">' + (g.status || 'UNKNOWN') + '</span>'
|
||||
+ (details.length ? '<div style="margin-top:4px;font-size:12px;color:var(--muted)">' + details.join(' | ') + '</div>' : '')
|
||||
+ '</td>'
|
||||
+ '<td style="white-space:nowrap"><button class="btn btn-sm btn-secondary" onclick="nvidiaResetGPU(' + g.index + ', this)">Reset GPU</button></td>'
|
||||
+ '</tr>';
|
||||
}).join('');
|
||||
table.innerHTML = '<table><tr><th>GPU</th><th>Model</th><th>ID</th><th>Status</th><th>Action</th></tr>' + rows + '</table>';
|
||||
}).catch(e => {
|
||||
status.textContent = 'Error loading NVIDIA GPU status: ' + e;
|
||||
status.style.color = 'var(--crit-fg, #9f3a38)';
|
||||
table.innerHTML = '';
|
||||
});
|
||||
}
|
||||
loadNvidiaSelfHeal();
|
||||
</script>`
|
||||
}
|
||||
|
||||
// ── Tools ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
func renderTools() string {
|
||||
@@ -2847,6 +3077,9 @@ function installToRAM() {
|
||||
<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">↻ Check</button></div>
|
||||
<div class="card-body"><div id="tools-table"><p style="color:var(--muted);font-size:13px">Checking...</p></div></div></div>
|
||||
|
||||
<div class="card"><div class="card-head">NVIDIA Self Heal</div><div class="card-body">` +
|
||||
renderNvidiaSelfHealInline() + `</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Network</div><div class="card-body">` +
|
||||
renderNetworkInline() + `</div></div>
|
||||
|
||||
|
||||
@@ -302,6 +302,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
// GPU presence / tools
|
||||
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
||||
mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs)
|
||||
mux.HandleFunc("GET /api/gpu/nvidia-status", h.handleAPIGNVIDIAGPUStatuses)
|
||||
mux.HandleFunc("POST /api/gpu/nvidia-reset", h.handleAPIGNVIDIAReset)
|
||||
mux.HandleFunc("GET /api/gpu/tools", h.handleAPIGPUTools)
|
||||
|
||||
// System
|
||||
|
||||
@@ -591,7 +591,7 @@ func TestTasksPageRendersOpenLinksAndPaginationControls(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
|
||||
func TestToolsPageRendersNvidiaSelfHealSection(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tools", nil))
|
||||
@@ -599,11 +599,20 @@ func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `NVIDIA Self Heal`) {
|
||||
t.Fatalf("tools page missing nvidia self heal section: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `Restart GPU Drivers`) {
|
||||
t.Fatalf("tools page missing restart gpu drivers button: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `restartGPUDrivers()`) {
|
||||
t.Fatalf("tools page missing restartGPUDrivers action: %s", body)
|
||||
if !strings.Contains(body, `nvidiaRestartDrivers()`) {
|
||||
t.Fatalf("tools page missing nvidiaRestartDrivers action: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `/api/gpu/nvidia-status`) {
|
||||
t.Fatalf("tools page missing nvidia status api usage: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `nvidiaResetGPU(`) {
|
||||
t.Fatalf("tools page missing nvidiaResetGPU action: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `id="boot-source-text"`) {
|
||||
t.Fatalf("tools page missing boot source field: %s", body)
|
||||
@@ -684,8 +693,8 @@ func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
|
||||
for _, needle := range []string{
|
||||
`Benchmark Results`,
|
||||
`Composite score by saved benchmark run and GPU.`,
|
||||
`NVIDIA H100 PCIe / GPU 0`,
|
||||
`NVIDIA H100 PCIe / GPU 1`,
|
||||
`GPU #0 — NVIDIA H100 PCIe`,
|
||||
`GPU #1 — NVIDIA H100 PCIe`,
|
||||
`#1`,
|
||||
wantTime,
|
||||
`1176.25`,
|
||||
|
||||
@@ -115,14 +115,16 @@ type Task struct {
|
||||
// taskParams holds optional parameters parsed from the run request.
|
||||
type taskParams struct {
|
||||
Duration int `json:"duration,omitempty"`
|
||||
DiagLevel int `json:"diag_level,omitempty"`
|
||||
StressMode bool `json:"stress_mode,omitempty"`
|
||||
GPUIndices []int `json:"gpu_indices,omitempty"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
|
||||
SizeMB int `json:"size_mb,omitempty"`
|
||||
Passes int `json:"passes,omitempty"`
|
||||
Loader string `json:"loader,omitempty"`
|
||||
BurnProfile string `json:"burn_profile,omitempty"`
|
||||
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
|
||||
RunNCCL bool `json:"run_nccl,omitempty"`
|
||||
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||
DisplayName string `json:"display_name,omitempty"`
|
||||
Device string `json:"device,omitempty"` // for install
|
||||
PlatformComponents []string `json:"platform_components,omitempty"`
|
||||
@@ -214,11 +216,11 @@ var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}
|
||||
const maxTaskHistory = 50
|
||||
|
||||
var (
|
||||
runMemoryAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryAcceptancePackCtx(ctx, baseDir, logFunc)
|
||||
runMemoryAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryAcceptancePackCtx(ctx, baseDir, sizeMB, passes, logFunc)
|
||||
}
|
||||
runStorageAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunStorageAcceptancePackCtx(ctx, baseDir, logFunc)
|
||||
runStorageAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||
return a.RunStorageAcceptancePackCtx(ctx, baseDir, extended, logFunc)
|
||||
}
|
||||
runCPUAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunCPUAcceptancePackCtx(ctx, baseDir, durationSec, logFunc)
|
||||
@@ -551,7 +553,10 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
diagLevel := t.params.DiagLevel
|
||||
diagLevel := 2
|
||||
if t.params.StressMode {
|
||||
diagLevel = 3
|
||||
}
|
||||
if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
|
||||
result, e := a.RunNvidiaAcceptancePackWithOptions(
|
||||
ctx, "", diagLevel, t.params.GPUIndices, j.append,
|
||||
@@ -585,6 +590,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
GPUIndices: t.params.GPUIndices,
|
||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||
RunNCCL: t.params.RunNCCL,
|
||||
ParallelGPUs: t.params.ParallelGPUs,
|
||||
}, j.append)
|
||||
case "nvidia-compute":
|
||||
if a == nil {
|
||||
@@ -656,13 +662,17 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", j.append)
|
||||
sizeMB, passes := 256, 1
|
||||
if t.params.StressMode {
|
||||
sizeMB, passes = 1024, 3
|
||||
}
|
||||
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
|
||||
case "storage":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = runStorageAcceptancePackCtx(a, ctx, "", j.append)
|
||||
archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
|
||||
case "cpu":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
@@ -673,7 +683,11 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
if dur <= 0 {
|
||||
dur = 60
|
||||
if t.params.StressMode {
|
||||
dur = 1800
|
||||
} else {
|
||||
dur = 60
|
||||
}
|
||||
}
|
||||
j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
|
||||
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
|
||||
|
||||
@@ -422,7 +422,7 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
|
||||
for _, needle := range []string{
|
||||
`Benchmark Results`,
|
||||
`Composite score for this benchmark task.`,
|
||||
`NVIDIA H100 PCIe / GPU 0`,
|
||||
`GPU #0 — NVIDIA H100 PCIe`,
|
||||
`1176.25`,
|
||||
} {
|
||||
if !strings.Contains(html, needle) {
|
||||
|
||||
2
bible
2
bible
Submodule bible updated: 1d89a4918e...688b87e98d
@@ -36,7 +36,6 @@ typedef void *CUstream;
|
||||
#define MAX_CUBLAS_PROFILES 5
|
||||
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
|
||||
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
|
||||
#define STRESS_LAUNCH_DEPTH 8
|
||||
|
||||
static const char *ptx_source =
|
||||
".version 6.0\n"
|
||||
@@ -344,7 +343,6 @@ static int run_ptx_fallback(struct cuda_api *api,
|
||||
unsigned long iterations = 0;
|
||||
int mp_count = 0;
|
||||
int stream_count = 1;
|
||||
int launches_per_wave = 0;
|
||||
|
||||
memset(report, 0, sizeof(*report));
|
||||
snprintf(report->backend, sizeof(report->backend), "driver-ptx");
|
||||
@@ -419,44 +417,42 @@ static int run_ptx_fallback(struct cuda_api *api,
|
||||
|
||||
unsigned int threads = 256;
|
||||
|
||||
double start = now_seconds();
|
||||
double deadline = start + (double)seconds;
|
||||
double deadline = now_seconds() + (double)seconds;
|
||||
double next_sync = now_seconds() + 1.0;
|
||||
while (now_seconds() < deadline) {
|
||||
launches_per_wave = 0;
|
||||
for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
|
||||
int launched_this_batch = 0;
|
||||
for (int lane = 0; lane < stream_count; lane++) {
|
||||
unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
|
||||
if (!check_rc(api,
|
||||
"cuLaunchKernel",
|
||||
api->cuLaunchKernel(kernel,
|
||||
blocks,
|
||||
1,
|
||||
1,
|
||||
threads,
|
||||
1,
|
||||
1,
|
||||
0,
|
||||
streams[lane],
|
||||
params[lane],
|
||||
NULL))) {
|
||||
goto fail;
|
||||
}
|
||||
launches_per_wave++;
|
||||
launched_this_batch++;
|
||||
}
|
||||
if (launched_this_batch <= 0) {
|
||||
break;
|
||||
int launched = 0;
|
||||
for (int lane = 0; lane < stream_count; lane++) {
|
||||
unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
|
||||
if (!check_rc(api,
|
||||
"cuLaunchKernel",
|
||||
api->cuLaunchKernel(kernel,
|
||||
blocks,
|
||||
1,
|
||||
1,
|
||||
threads,
|
||||
1,
|
||||
1,
|
||||
0,
|
||||
streams[lane],
|
||||
params[lane],
|
||||
NULL))) {
|
||||
goto fail;
|
||||
}
|
||||
launched++;
|
||||
iterations++;
|
||||
}
|
||||
if (launches_per_wave <= 0) {
|
||||
if (launched <= 0) {
|
||||
goto fail;
|
||||
}
|
||||
if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
|
||||
goto fail;
|
||||
double now = now_seconds();
|
||||
if (now >= next_sync || now >= deadline) {
|
||||
if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
|
||||
goto fail;
|
||||
}
|
||||
next_sync = now + 1.0;
|
||||
}
|
||||
iterations += (unsigned long)launches_per_wave;
|
||||
}
|
||||
api->cuCtxSynchronize();
|
||||
|
||||
if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem[0], sizeof(sample)))) {
|
||||
goto fail;
|
||||
@@ -468,11 +464,10 @@ static int run_ptx_fallback(struct cuda_api *api,
|
||||
report->iterations = iterations;
|
||||
snprintf(report->details,
|
||||
sizeof(report->details),
|
||||
"fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d queue_depth=%d per_stream_mb=%zu iterations=%lu\n",
|
||||
"fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d per_stream_mb=%zu iterations=%lu\n",
|
||||
size_mb,
|
||||
report->buffer_mb,
|
||||
report->stream_count,
|
||||
STRESS_LAUNCH_DEPTH,
|
||||
bytes_per_stream[0] / (1024u * 1024u),
|
||||
iterations);
|
||||
|
||||
@@ -1140,7 +1135,6 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
int stream_count = 1;
|
||||
int profile_count = (int)(sizeof(k_profiles) / sizeof(k_profiles[0]));
|
||||
int prepared_count = 0;
|
||||
int wave_launches = 0;
|
||||
size_t requested_budget = 0;
|
||||
size_t total_budget = 0;
|
||||
size_t per_profile_budget = 0;
|
||||
@@ -1207,11 +1201,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
report->buffer_mb = (int)(total_budget / (1024u * 1024u));
|
||||
append_detail(report->details,
|
||||
sizeof(report->details),
|
||||
"requested_mb=%d actual_mb=%d streams=%d queue_depth=%d mp_count=%d per_worker_mb=%zu\n",
|
||||
"requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
|
||||
size_mb,
|
||||
report->buffer_mb,
|
||||
report->stream_count,
|
||||
STRESS_LAUNCH_DEPTH,
|
||||
mp_count,
|
||||
per_profile_budget / (1024u * 1024u));
|
||||
|
||||
@@ -1260,50 +1253,55 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Keep the GPU queue continuously full by submitting kernels without
|
||||
* synchronizing after every wave. A sync barrier after each small batch
|
||||
* creates CPU↔GPU ping-pong gaps that prevent full TDP utilisation,
|
||||
* especially when individual kernels are short. Instead we sync at most
|
||||
* once per second (for error detection) and once at the very end. */
|
||||
double deadline = now_seconds() + (double)seconds;
|
||||
double next_sync = now_seconds() + 1.0;
|
||||
while (now_seconds() < deadline) {
|
||||
wave_launches = 0;
|
||||
for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
|
||||
int launched_this_batch = 0;
|
||||
for (int i = 0; i < prepared_count; i++) {
|
||||
if (!prepared[i].ready) {
|
||||
continue;
|
||||
}
|
||||
if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
|
||||
append_detail(report->details,
|
||||
sizeof(report->details),
|
||||
"%s=FAILED runtime\n",
|
||||
prepared[i].desc.name);
|
||||
for (int j = 0; j < prepared_count; j++) {
|
||||
destroy_profile(&cublas, cuda, &prepared[j]);
|
||||
}
|
||||
cublas.cublasLtDestroy(handle);
|
||||
destroy_streams(cuda, streams, stream_count);
|
||||
cuda->cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
}
|
||||
prepared[i].iterations++;
|
||||
report->iterations++;
|
||||
wave_launches++;
|
||||
launched_this_batch++;
|
||||
int launched = 0;
|
||||
for (int i = 0; i < prepared_count; i++) {
|
||||
if (!prepared[i].ready) {
|
||||
continue;
|
||||
}
|
||||
if (launched_this_batch <= 0) {
|
||||
break;
|
||||
if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
|
||||
append_detail(report->details,
|
||||
sizeof(report->details),
|
||||
"%s=FAILED runtime\n",
|
||||
prepared[i].desc.name);
|
||||
for (int j = 0; j < prepared_count; j++) {
|
||||
destroy_profile(&cublas, cuda, &prepared[j]);
|
||||
}
|
||||
cublas.cublasLtDestroy(handle);
|
||||
destroy_streams(cuda, streams, stream_count);
|
||||
cuda->cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
}
|
||||
prepared[i].iterations++;
|
||||
report->iterations++;
|
||||
launched++;
|
||||
}
|
||||
if (wave_launches <= 0) {
|
||||
if (launched <= 0) {
|
||||
break;
|
||||
}
|
||||
if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
|
||||
for (int i = 0; i < prepared_count; i++) {
|
||||
destroy_profile(&cublas, cuda, &prepared[i]);
|
||||
double now = now_seconds();
|
||||
if (now >= next_sync || now >= deadline) {
|
||||
if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
|
||||
for (int i = 0; i < prepared_count; i++) {
|
||||
destroy_profile(&cublas, cuda, &prepared[i]);
|
||||
}
|
||||
cublas.cublasLtDestroy(handle);
|
||||
destroy_streams(cuda, streams, stream_count);
|
||||
cuda->cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
}
|
||||
cublas.cublasLtDestroy(handle);
|
||||
destroy_streams(cuda, streams, stream_count);
|
||||
cuda->cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
next_sync = now + 1.0;
|
||||
}
|
||||
}
|
||||
/* Final drain — ensure all queued work finishes before we read results. */
|
||||
cuda->cuCtxSynchronize();
|
||||
|
||||
for (int i = 0; i < prepared_count; i++) {
|
||||
if (!prepared[i].ready) {
|
||||
|
||||
@@ -5,69 +5,110 @@ echo "=== generating bee wallpaper ==="
|
||||
mkdir -p /usr/share/bee
|
||||
|
||||
python3 - <<'PYEOF'
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
from PIL import Image, ImageDraw, ImageFont, ImageFilter
|
||||
import os
|
||||
|
||||
W, H = 1920, 1080
|
||||
|
||||
LOGO = """\
|
||||
\u2588\u2588\u2588\u2588\u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2557 \u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557
|
||||
\u2588\u2588\u2554\u2550\u2550\u2550\u2550\u255d\u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2557\u2588\u2588\u2554\u2550\u2550\u2550\u2550\u255d\u255a\u2588\u2588\u2557 \u2588\u2588\u2554\u255d \u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2557\u2588\u2588\u2554\u2550\u2550\u2550\u2550\u255d\u2588\u2588\u2554\u2550\u2550\u2550\u2550\u255d
|
||||
\u2588\u2588\u2588\u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2551\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557 \u255a\u2588\u2588\u2588\u2588\u2554\u255d \u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2588\u2588\u2588\u2588\u2554\u255d\u2588\u2588\u2588\u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2557
|
||||
\u2588\u2588\u2554\u2550\u2550\u255d \u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2551\u255a\u2550\u2550\u2550\u2550\u2588\u2588\u2551 \u255a\u2588\u2588\u2554\u255d \u255a\u2550\u2550\u2550\u2550\u255d\u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2557\u2588\u2588\u2554\u2550\u2550\u255d \u2588\u2588\u2554\u2550\u2550\u255d
|
||||
\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2551 \u2588\u2588\u2551\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2551 \u2588\u2588\u2551 \u2588\u2588\u2588\u2588\u2588\u2588\u2554\u255d\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557
|
||||
\u255a\u2550\u2550\u2550\u2550\u2550\u2550\u255d\u255a\u2550\u255d \u255a\u2550\u255d\u255a\u2550\u2550\u2550\u2550\u2550\u2550\u255d \u255a\u2550\u255d \u255a\u2550\u2550\u2550\u2550\u2550\u255d \u255a\u2550\u2550\u2550\u2550\u2550\u2550\u255d\u255a\u2550\u2550\u2550\u2550\u2550\u2550\u255d
|
||||
Hardware Audit LiveCD"""
|
||||
GLYPHS = {
|
||||
'E': ["11111", "10000", "11110", "10000", "10000", "10000", "11111"],
|
||||
'A': ["01110", "10001", "10001", "11111", "10001", "10001", "10001"],
|
||||
'S': ["01111", "10000", "10000", "01110", "00001", "00001", "11110"],
|
||||
'Y': ["10001", "10001", "01010", "00100", "00100", "00100", "00100"],
|
||||
'B': ["11110", "10001", "10001", "11110", "10001", "10001", "11110"],
|
||||
'-': ["00000", "00000", "11111", "00000", "00000", "00000", "00000"],
|
||||
}
|
||||
|
||||
# Find a monospace font that supports box-drawing characters
|
||||
FONT_CANDIDATES = [
|
||||
'/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf',
|
||||
'/usr/share/fonts/truetype/liberation/LiberationMono-Regular.ttf',
|
||||
'/usr/share/fonts/truetype/freefont/FreeMono.ttf',
|
||||
'/usr/share/fonts/truetype/noto/NotoMono-Regular.ttf',
|
||||
TITLE = "EASY-BEE"
|
||||
SUBTITLE = "Hardware Audit LiveCD"
|
||||
CELL = 30
|
||||
GLYPH_GAP = 18
|
||||
ROW_GAP = 6
|
||||
|
||||
FG = (0xF6, 0xD0, 0x47)
|
||||
FG_DIM = (0xD4, 0xA9, 0x1C)
|
||||
SHADOW = (0x5E, 0x47, 0x05)
|
||||
SUB = (0x96, 0x7A, 0x17)
|
||||
BG = (0x05, 0x05, 0x05)
|
||||
|
||||
SUB_FONT_CANDIDATES = [
|
||||
'/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/freefont/FreeSansBold.ttf',
|
||||
]
|
||||
|
||||
font_path = None
|
||||
for p in FONT_CANDIDATES:
|
||||
if os.path.exists(p):
|
||||
font_path = p
|
||||
break
|
||||
|
||||
SIZE = 22
|
||||
if font_path:
|
||||
font_logo = ImageFont.truetype(font_path, SIZE)
|
||||
font_sub = ImageFont.truetype(font_path, SIZE)
|
||||
else:
|
||||
font_logo = ImageFont.load_default()
|
||||
font_sub = font_logo
|
||||
def load_font(size):
|
||||
for path in SUB_FONT_CANDIDATES:
|
||||
if os.path.exists(path):
|
||||
return ImageFont.truetype(path, size)
|
||||
return ImageFont.load_default()
|
||||
|
||||
img = Image.new('RGB', (W, H), (0, 0, 0))
|
||||
|
||||
def glyph_width(ch):
|
||||
return len(GLYPHS[ch][0])
|
||||
|
||||
|
||||
def render_logo_mask():
|
||||
width_cells = 0
|
||||
for idx, ch in enumerate(TITLE):
|
||||
width_cells += glyph_width(ch)
|
||||
if idx != len(TITLE) - 1:
|
||||
width_cells += 1
|
||||
mask_w = width_cells * CELL + (len(TITLE) - 1) * GLYPH_GAP
|
||||
mask_h = 7 * CELL + 6 * ROW_GAP
|
||||
mask = Image.new('L', (mask_w, mask_h), 0)
|
||||
draw = ImageDraw.Draw(mask)
|
||||
|
||||
cx = 0
|
||||
for idx, ch in enumerate(TITLE):
|
||||
glyph = GLYPHS[ch]
|
||||
for row_idx, row in enumerate(glyph):
|
||||
for col_idx, cell in enumerate(row):
|
||||
if cell != '1':
|
||||
continue
|
||||
x0 = cx + col_idx * CELL
|
||||
y0 = row_idx * (CELL + ROW_GAP)
|
||||
x1 = x0 + CELL - 4
|
||||
y1 = y0 + CELL - 4
|
||||
draw.rounded_rectangle((x0, y0, x1, y1), radius=4, fill=255)
|
||||
cx += glyph_width(ch) * CELL
|
||||
if idx != len(TITLE) - 1:
|
||||
cx += CELL + GLYPH_GAP
|
||||
return mask
|
||||
|
||||
|
||||
img = Image.new('RGB', (W, H), BG)
|
||||
draw = ImageDraw.Draw(img)
|
||||
|
||||
# Measure logo block line by line to avoid font ascender offset
|
||||
lines = LOGO.split('\n')
|
||||
logo_lines = lines[:6]
|
||||
sub_line = lines[6] if len(lines) > 6 else ''
|
||||
# Soft amber glow under the logo without depending on font rendering.
|
||||
glow = Image.new('RGBA', (W, H), (0, 0, 0, 0))
|
||||
glow_draw = ImageDraw.Draw(glow)
|
||||
glow_draw.ellipse((360, 250, 1560, 840), fill=(180, 120, 10, 56))
|
||||
glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
|
||||
glow = glow.filter(ImageFilter.GaussianBlur(60))
|
||||
img = Image.alpha_composite(img.convert('RGBA'), glow)
|
||||
|
||||
line_h = SIZE + 2
|
||||
block_h = len(logo_lines) * line_h + 8 + (SIZE if sub_line else 0)
|
||||
logo_mask = render_logo_mask()
|
||||
logo_w, logo_h = logo_mask.size
|
||||
logo_x = (W - logo_w) // 2
|
||||
logo_y = 290
|
||||
|
||||
# Width: measure the widest logo line
|
||||
max_w = 0
|
||||
for line in logo_lines:
|
||||
bb = draw.textbbox((0, 0), line, font=font_logo)
|
||||
max_w = max(max_w, bb[2] - bb[0])
|
||||
shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(2))
|
||||
img.paste(SHADOW, (logo_x + 16, logo_y + 14), shadow_mask)
|
||||
img.paste(FG_DIM, (logo_x + 8, logo_y + 7), logo_mask)
|
||||
img.paste(FG, (logo_x, logo_y), logo_mask)
|
||||
|
||||
x = (W - max_w) // 2
|
||||
y = (H - block_h) // 2
|
||||
font_sub = load_font(30)
|
||||
sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
|
||||
sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
|
||||
sub_y = logo_y + logo_h + 54
|
||||
draw = ImageDraw.Draw(img)
|
||||
draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
|
||||
draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
|
||||
|
||||
cy = y
|
||||
for line in logo_lines:
|
||||
draw.text((x, cy), line, font=font_logo, fill=(0xf6, 0xc9, 0x0e))
|
||||
cy += line_h
|
||||
cy += 8
|
||||
if sub_line:
|
||||
draw.text((x, cy), sub_line, font=font_sub, fill=(0x80, 0x68, 0x18))
|
||||
img = img.convert('RGB')
|
||||
|
||||
img.save('/usr/share/bee/wallpaper.png', optimize=True)
|
||||
print('wallpaper written: /usr/share/bee/wallpaper.png')
|
||||
|
||||
Reference in New Issue
Block a user