Compare commits
13 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3f41a026ca | ||
|
|
0ee4f46537 | ||
| 8db40b098a | |||
| 16e7ae00e7 | |||
| b2f8626fee | |||
| dd26e03b2d | |||
| 6937a4c6ec | |||
| b9be93c213 | |||
| d1a22d782d | |||
|
|
0a4bb596f6 | ||
|
|
531d1ca366 | ||
|
|
93cfa78e8c | ||
|
|
1358485f2b |
@@ -382,9 +382,9 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
|||||||
archive, err = application.RunNvidiaAcceptancePack("", logLine)
|
archive, err = application.RunNvidiaAcceptancePack("", logLine)
|
||||||
}
|
}
|
||||||
case "memory":
|
case "memory":
|
||||||
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", logLine)
|
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", 256, 1, logLine)
|
||||||
case "storage":
|
case "storage":
|
||||||
archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", logLine)
|
archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", false, logLine)
|
||||||
case "cpu":
|
case "cpu":
|
||||||
dur := *duration
|
dur := *duration
|
||||||
if dur <= 0 {
|
if dur <= 0 {
|
||||||
|
|||||||
@@ -122,8 +122,10 @@ type satRunner interface {
|
|||||||
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
|
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
|
||||||
RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error)
|
||||||
RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
ResetNvidiaGPU(index int) (string, error)
|
||||||
|
RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error)
|
||||||
|
RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error)
|
||||||
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||||
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
||||||
DetectGPUVendor() string
|
DetectGPUVendor() string
|
||||||
@@ -137,6 +139,7 @@ type satRunner interface {
|
|||||||
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
||||||
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
|
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
|
||||||
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
|
RunHPL(ctx context.Context, baseDir string, opts platform.HPLOptions, logFunc func(string)) (string, *platform.HPLResult, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
type runtimeChecker interface {
|
type runtimeChecker interface {
|
||||||
@@ -521,6 +524,15 @@ func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
|||||||
return a.sat.ListNvidiaGPUs()
|
return a.sat.ListNvidiaGPUs()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
|
||||||
|
return a.sat.ListNvidiaGPUStatuses()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ResetNvidiaGPU(index int) (ActionResult, error) {
|
||||||
|
out, err := a.sat.ResetNvidiaGPU(index)
|
||||||
|
return ActionResult{Title: fmt.Sprintf("Reset NVIDIA GPU %d", index), Body: strings.TrimSpace(out)}, err
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
|
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
@@ -591,14 +603,14 @@ func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts p
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, 256, 1, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, logFunc)
|
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, sizeMB, passes, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||||
@@ -623,14 +635,14 @@ func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (Actio
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, false, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
return a.sat.RunStorageAcceptancePack(ctx, baseDir, logFunc)
|
return a.sat.RunStorageAcceptancePack(ctx, baseDir, extended, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||||
@@ -726,6 +738,13 @@ func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
|||||||
return ActionResult{Title: "NCCL bandwidth test", Body: body}, err
|
return ActionResult{Title: "NCCL bandwidth test", Body: body}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) RunHPL(ctx context.Context, baseDir string, opts platform.HPLOptions, logFunc func(string)) (string, *platform.HPLResult, error) {
|
||||||
|
if a == nil {
|
||||||
|
return "", nil, fmt.Errorf("app not configured")
|
||||||
|
}
|
||||||
|
return a.sat.RunHPL(ctx, baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
|
func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
|
||||||
path, err := a.RunFanStressTest(ctx, "", opts)
|
path, err := a.RunFanStressTest(ctx, "", opts)
|
||||||
body := formatFanStressResult(path)
|
body := formatFanStressResult(path)
|
||||||
|
|||||||
@@ -135,6 +135,8 @@ type fakeSAT struct {
|
|||||||
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
||||||
runAMDPackFn func(string) (string, error)
|
runAMDPackFn func(string) (string, error)
|
||||||
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
||||||
|
listNvidiaGPUStatusesFn func() ([]platform.NvidiaGPUStatus, error)
|
||||||
|
resetNvidiaGPUFn func(int) (string, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
||||||
@@ -201,11 +203,25 @@ func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
|||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
func (f fakeSAT) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
|
||||||
|
if f.listNvidiaGPUStatusesFn != nil {
|
||||||
|
return f.listNvidiaGPUStatusesFn()
|
||||||
|
}
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) ResetNvidiaGPU(index int) (string, error) {
|
||||||
|
if f.resetNvidiaGPUFn != nil {
|
||||||
|
return f.resetNvidiaGPUFn(index)
|
||||||
|
}
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _, _ int, _ func(string)) (string, error) {
|
||||||
return f.runMemoryFn(baseDir)
|
return f.runMemoryFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ bool, _ func(string)) (string, error) {
|
||||||
return f.runStorageFn(baseDir)
|
return f.runStorageFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -266,6 +282,9 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
|
|||||||
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
|
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||||
return "", nil
|
return "", nil
|
||||||
}
|
}
|
||||||
|
func (f fakeSAT) RunHPL(_ context.Context, _ string, _ platform.HPLOptions, _ func(string)) (string, *platform.HPLResult, error) {
|
||||||
|
return "", nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
@@ -805,6 +824,9 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
for _, want := range []string{
|
for _, want := range []string{
|
||||||
"/system/ip-link.txt",
|
"/system/ip-link.txt",
|
||||||
"/system/ip-link-stats.txt",
|
"/system/ip-link-stats.txt",
|
||||||
|
"/system/kernel-aer-nvidia.txt",
|
||||||
|
"/system/lspci-nvidia-bridges-vv.txt",
|
||||||
|
"/system/pcie-aer-sysfs.txt",
|
||||||
"/system/ethtool-info.txt",
|
"/system/ethtool-info.txt",
|
||||||
"/system/ethtool-link.txt",
|
"/system/ethtool-link.txt",
|
||||||
"/system/ethtool-module.txt",
|
"/system/ethtool-module.txt",
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ package app
|
|||||||
import (
|
import (
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"strconv"
|
||||||
"sort"
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
@@ -18,6 +19,7 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *C
|
|||||||
}
|
}
|
||||||
if summary, ok := loadLatestSATSummary(baseDir, "gpu-nvidia-"); ok {
|
if summary, ok := loadLatestSATSummary(baseDir, "gpu-nvidia-"); ok {
|
||||||
applyGPUVendorSAT(snap.PCIeDevices, "nvidia", summary)
|
applyGPUVendorSAT(snap.PCIeDevices, "nvidia", summary)
|
||||||
|
applyNvidiaPerGPUStatus(snap.PCIeDevices, baseDir)
|
||||||
}
|
}
|
||||||
if summary, ok := loadLatestSATSummary(baseDir, "memory-"); ok {
|
if summary, ok := loadLatestSATSummary(baseDir, "memory-"); ok {
|
||||||
applyMemorySAT(snap.Memory, summary)
|
applyMemorySAT(snap.Memory, summary)
|
||||||
@@ -32,6 +34,100 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *C
|
|||||||
applyComponentStatusDB(snap, db)
|
applyComponentStatusDB(snap, db)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// nvidiaPerGPUStatus is the outcome recorded for one GPU in a per-GPU status
// file of an NVIDIA SAT run.
type nvidiaPerGPUStatus struct {
	// runStatus is the "run_status" value of the status file, upper-cased and
	// trimmed; reason is the trimmed "reason" value (may be empty).
	runStatus, reason string
}
|
||||||
|
|
||||||
|
func applyNvidiaPerGPUStatus(devs []schema.HardwarePCIeDevice, baseDir string) {
|
||||||
|
statusByIndex, ts, ok := loadLatestNvidiaPerGPUStatus(baseDir)
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for i := range devs {
|
||||||
|
if devs[i].Telemetry == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
rawIdx, ok := devs[i].Telemetry["nvidia_gpu_index"]
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
idx, ok := telemetryInt(rawIdx)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
st, ok := statusByIndex[idx]
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
status, description, ok := satKeyStatus(st.runStatus, firstNonEmpty(strings.TrimSpace(st.reason), "nvidia GPU SAT"))
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
mergeComponentStatusPreferDetail(&devs[i].HardwareComponentStatus, ts, status, description)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadLatestNvidiaPerGPUStatus(baseDir string) (map[int]nvidiaPerGPUStatus, string, bool) {
|
||||||
|
matches, err := filepath.Glob(filepath.Join(baseDir, "gpu-nvidia-*"))
|
||||||
|
if err != nil || len(matches) == 0 {
|
||||||
|
return nil, "", false
|
||||||
|
}
|
||||||
|
sort.Strings(matches)
|
||||||
|
runDir := matches[len(matches)-1]
|
||||||
|
summaryRaw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||||
|
if err != nil {
|
||||||
|
return nil, "", false
|
||||||
|
}
|
||||||
|
summaryKV := parseKeyValueSummary(string(summaryRaw))
|
||||||
|
runAtUTC := strings.TrimSpace(summaryKV["run_at_utc"])
|
||||||
|
files, err := filepath.Glob(filepath.Join(runDir, "gpu-*-status.txt"))
|
||||||
|
if err != nil || len(files) == 0 {
|
||||||
|
return nil, "", false
|
||||||
|
}
|
||||||
|
out := make(map[int]nvidiaPerGPUStatus, len(files))
|
||||||
|
for _, file := range files {
|
||||||
|
raw, err := os.ReadFile(file)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
kv := parseKeyValueSummary(string(raw))
|
||||||
|
idx, err := strconv.Atoi(strings.TrimSpace(kv["gpu_index"]))
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out[idx] = nvidiaPerGPUStatus{
|
||||||
|
runStatus: strings.ToUpper(strings.TrimSpace(kv["run_status"])),
|
||||||
|
reason: strings.TrimSpace(kv["reason"]),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(out) == 0 {
|
||||||
|
return nil, "", false
|
||||||
|
}
|
||||||
|
return out, runAtUTC, true
|
||||||
|
}
|
||||||
|
|
||||||
|
// telemetryInt converts a loosely typed telemetry value to an int.
//
// Accepted inputs are int, int32, int64, a float64 holding an exact integer,
// and a string containing a base-10 integer (surrounding whitespace is
// ignored). The second result is false for every other type, for strings that
// do not parse, and for float64 values that are fractional, NaN, infinite or
// outside the int range. Such floats used to be silently truncated, which
// could turn a bad value into a plausible-looking index.
func telemetryInt(v any) (int, bool) {
	switch value := v.(type) {
	case int:
		return value, true
	case int32:
		return int(value), true
	case int64:
		return int(value), true
	case float64:
		n := int(value)
		// The round trip only succeeds for finite, integral, in-range values:
		// NaN never compares equal, and fractional or overflowing inputs come
		// back as a different number.
		if float64(n) != value {
			return 0, false
		}
		return n, true
	case string:
		n, err := strconv.Atoi(strings.TrimSpace(value))
		if err != nil {
			return 0, false
		}
		return n, true
	default:
		return 0, false
	}
}
|
||||||
|
|
||||||
type satSummary struct {
|
type satSummary struct {
|
||||||
runAtUTC string
|
runAtUTC string
|
||||||
overall string
|
overall string
|
||||||
@@ -176,6 +272,31 @@ func mergeComponentStatus(component *schema.HardwareComponentStatus, changedAt,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func mergeComponentStatusPreferDetail(component *schema.HardwareComponentStatus, changedAt, satStatus, description string) {
|
||||||
|
if component == nil || satStatus == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
current := strings.TrimSpace(ptrString(component.Status))
|
||||||
|
newSeverity := statusSeverity(satStatus)
|
||||||
|
currentSeverity := statusSeverity(current)
|
||||||
|
if current == "" || current == "Unknown" || newSeverity > currentSeverity {
|
||||||
|
mergeComponentStatus(component, changedAt, satStatus, description)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if newSeverity == currentSeverity && strings.TrimSpace(description) != "" {
|
||||||
|
component.Status = appStringPtr(satStatus)
|
||||||
|
component.ErrorDescription = appStringPtr(description)
|
||||||
|
if strings.TrimSpace(changedAt) != "" {
|
||||||
|
component.StatusChangedAt = appStringPtr(changedAt)
|
||||||
|
component.StatusHistory = append(component.StatusHistory, schema.HardwareStatusHistory{
|
||||||
|
Status: satStatus,
|
||||||
|
ChangedAt: changedAt,
|
||||||
|
Details: appStringPtr(description),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func statusSeverity(status string) int {
|
func statusSeverity(status string) int {
|
||||||
switch strings.TrimSpace(status) {
|
switch strings.TrimSpace(status) {
|
||||||
case "Critical":
|
case "Critical":
|
||||||
|
|||||||
@@ -59,3 +59,51 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
|
|||||||
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
|
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestApplyLatestSATStatusesMarksNvidiaGPUByPerGPUStatusFile(t *testing.T) {
|
||||||
|
baseDir := t.TempDir()
|
||||||
|
runDir := filepath.Join(baseDir, "gpu-nvidia-20260407-162123")
|
||||||
|
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte("run_at_utc=2026-04-07T16:21:23Z\noverall_status=FAILED\n"), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, "gpu-1-status.txt"), []byte("gpu_index=1\ngpu_name=NVIDIA H100 PCIe\nrun_status=FAILED\nreason=GPU requires reset\n"), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
class := "VideoController"
|
||||||
|
manufacturer := "NVIDIA Corporation"
|
||||||
|
bdf0 := "0000:4b:00.0"
|
||||||
|
bdf1 := "0000:4f:00.0"
|
||||||
|
snap := schema.HardwareSnapshot{
|
||||||
|
PCIeDevices: []schema.HardwarePCIeDevice{
|
||||||
|
{
|
||||||
|
DeviceClass: &class,
|
||||||
|
Manufacturer: &manufacturer,
|
||||||
|
BDF: &bdf0,
|
||||||
|
Telemetry: map[string]any{"nvidia_gpu_index": 0},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
DeviceClass: &class,
|
||||||
|
Manufacturer: &manufacturer,
|
||||||
|
BDF: &bdf1,
|
||||||
|
Telemetry: map[string]any{"nvidia_gpu_index": 1},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||||
|
|
||||||
|
if snap.PCIeDevices[1].Status == nil || *snap.PCIeDevices[1].Status != "Critical" {
|
||||||
|
t.Fatalf("gpu1 status=%v want Critical", snap.PCIeDevices[1].Status)
|
||||||
|
}
|
||||||
|
if snap.PCIeDevices[1].ErrorDescription == nil || *snap.PCIeDevices[1].ErrorDescription != "GPU requires reset failed" {
|
||||||
|
got := "<nil>"
|
||||||
|
if snap.PCIeDevices[1].ErrorDescription != nil {
|
||||||
|
got = *snap.PCIeDevices[1].ErrorDescription
|
||||||
|
}
|
||||||
|
t.Fatalf("gpu1 error=%q want per-gpu reason", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -40,7 +40,36 @@ var supportBundleCommands = []struct {
|
|||||||
{name: "system/mount.txt", cmd: []string{"mount"}},
|
{name: "system/mount.txt", cmd: []string{"mount"}},
|
||||||
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
||||||
{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
|
{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
|
||||||
|
{name: "system/kernel-aer-nvidia.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if command -v dmesg >/dev/null 2>&1; then
|
||||||
|
dmesg | grep -iE 'AER|NVRM|Xid|pcieport|nvidia' || echo "no AER/NVRM/Xid kernel messages found"
|
||||||
|
else
|
||||||
|
echo "dmesg not found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if ! command -v lspci >/dev/null 2>&1; then
|
||||||
|
echo "lspci not found"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
found=0
|
||||||
|
for gpu in $(lspci -Dn | awk '$3 ~ /^10de:/ {print $1}'); do
|
||||||
|
found=1
|
||||||
|
echo "=== GPU $gpu ==="
|
||||||
|
lspci -s "$gpu" -vv 2>&1 || true
|
||||||
|
bridge=$(basename "$(readlink -f "/sys/bus/pci/devices/$gpu/.." 2>/dev/null)" 2>/dev/null)
|
||||||
|
if [ -n "$bridge" ] && [ "$bridge" != "$gpu" ]; then
|
||||||
|
echo
|
||||||
|
echo "=== UPSTREAM $bridge for $gpu ==="
|
||||||
|
lspci -s "$bridge" -vv 2>&1 || true
|
||||||
|
fi
|
||||||
|
echo
|
||||||
|
done
|
||||||
|
if [ "$found" -eq 0 ]; then
|
||||||
|
echo "no NVIDIA PCI devices found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
|
{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
|
||||||
for d in /sys/bus/pci/devices/*/; do
|
for d in /sys/bus/pci/devices/*/; do
|
||||||
vendor=$(cat "$d/vendor" 2>/dev/null)
|
vendor=$(cat "$d/vendor" 2>/dev/null)
|
||||||
@@ -51,6 +80,30 @@ for d in /sys/bus/pci/devices/*/; do
|
|||||||
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
||||||
done
|
done
|
||||||
done
|
done
|
||||||
|
`}},
|
||||||
|
{name: "system/pcie-aer-sysfs.txt", cmd: []string{"sh", "-c", `
|
||||||
|
found=0
|
||||||
|
for dev in /sys/bus/pci/devices/*; do
|
||||||
|
[ -e "$dev" ] || continue
|
||||||
|
bdf=$(basename "$dev")
|
||||||
|
block=""
|
||||||
|
for f in aer_dev_correctable aer_dev_fatal aer_dev_nonfatal aer_rootport_total_err_cor aer_rootport_total_err_fatal aer_rootport_total_err_nonfatal; do
|
||||||
|
if [ -r "$dev/$f" ]; then
|
||||||
|
if [ -z "$block" ]; then
|
||||||
|
block=1
|
||||||
|
found=1
|
||||||
|
echo "=== $bdf ==="
|
||||||
|
fi
|
||||||
|
printf " %-30s %s\n" "$f" "$(cat "$dev/$f" 2>/dev/null)"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
if [ -n "$block" ]; then
|
||||||
|
echo
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
if [ "$found" -eq 0 ]; then
|
||||||
|
echo "no PCIe AER sysfs counters found"
|
||||||
|
fi
|
||||||
`}},
|
`}},
|
||||||
{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
|
{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
|
||||||
if ! command -v ethtool >/dev/null 2>&1; then
|
if ! command -v ethtool >/dev/null 2>&1; then
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ import (
|
|||||||
const nvidiaVendorID = 0x10de
|
const nvidiaVendorID = 0x10de
|
||||||
|
|
||||||
type nvidiaGPUInfo struct {
|
type nvidiaGPUInfo struct {
|
||||||
|
Index int
|
||||||
BDF string
|
BDF string
|
||||||
Serial string
|
Serial string
|
||||||
VBIOS string
|
VBIOS string
|
||||||
@@ -132,6 +133,7 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
info := nvidiaGPUInfo{
|
info := nvidiaGPUInfo{
|
||||||
|
Index: parseRequiredInt(rec[0]),
|
||||||
BDF: bdf,
|
BDF: bdf,
|
||||||
Serial: strings.TrimSpace(rec[2]),
|
Serial: strings.TrimSpace(rec[2]),
|
||||||
VBIOS: strings.TrimSpace(rec[3]),
|
VBIOS: strings.TrimSpace(rec[3]),
|
||||||
@@ -187,6 +189,14 @@ func parseMaybeInt(v string) *int {
|
|||||||
return &n
|
return &n
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// parseRequiredInt parses v as a base-10 integer after trimming surrounding
// whitespace. Input that does not parse yields 0, so callers cannot tell a
// malformed value apart from a genuine zero.
func parseRequiredInt(v string) int {
	if n, err := strconv.Atoi(strings.TrimSpace(v)); err == nil {
		return n
	}
	return 0
}
|
||||||
|
|
||||||
func pcieLinkGenLabel(gen int) string {
|
func pcieLinkGenLabel(gen int) string {
|
||||||
return fmt.Sprintf("Gen%d", gen)
|
return fmt.Sprintf("Gen%d", gen)
|
||||||
}
|
}
|
||||||
@@ -240,6 +250,10 @@ func setPCIeFallback(dev *schema.HardwarePCIeDevice) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
||||||
|
if dev.Telemetry == nil {
|
||||||
|
dev.Telemetry = map[string]any{}
|
||||||
|
}
|
||||||
|
dev.Telemetry["nvidia_gpu_index"] = info.Index
|
||||||
if info.TemperatureC != nil {
|
if info.TemperatureC != nil {
|
||||||
dev.TemperatureC = info.TemperatureC
|
dev.TemperatureC = info.TemperatureC
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -86,6 +86,9 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
|||||||
if out[0].Firmware == nil || *out[0].Firmware != "96.00.1F.00.02" {
|
if out[0].Firmware == nil || *out[0].Firmware != "96.00.1F.00.02" {
|
||||||
t.Fatalf("firmware: got %v", out[0].Firmware)
|
t.Fatalf("firmware: got %v", out[0].Firmware)
|
||||||
}
|
}
|
||||||
|
if out[0].Telemetry == nil || out[0].Telemetry["nvidia_gpu_index"] != 0 {
|
||||||
|
t.Fatalf("telemetry nvidia_gpu_index: got %#v", out[0].Telemetry)
|
||||||
|
}
|
||||||
if out[0].Status == nil || *out[0].Status != statusWarning {
|
if out[0].Status == nil || *out[0].Status != statusWarning {
|
||||||
t.Fatalf("status: got %v", out[0].Status)
|
t.Fatalf("status: got %v", out[0].Status)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -105,7 +105,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
BenchmarkVersion: benchmarkVersion,
|
BenchmarkVersion: benchmarkVersion,
|
||||||
GeneratedAt: time.Now().UTC(),
|
GeneratedAt: time.Now().UTC(),
|
||||||
Hostname: hostname,
|
Hostname: hostname,
|
||||||
|
ServerModel: readServerModel(),
|
||||||
BenchmarkProfile: spec.Name,
|
BenchmarkProfile: spec.Name,
|
||||||
|
ParallelGPUs: opts.ParallelGPUs,
|
||||||
SelectedGPUIndices: append([]int(nil), selected...),
|
SelectedGPUIndices: append([]int(nil), selected...),
|
||||||
Normalization: BenchmarkNormalization{
|
Normalization: BenchmarkNormalization{
|
||||||
Status: "full",
|
Status: "full",
|
||||||
@@ -143,6 +145,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
if opts.ParallelGPUs {
|
||||||
|
runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
|
||||||
|
} else {
|
||||||
|
|
||||||
for _, idx := range selected {
|
for _, idx := range selected {
|
||||||
gpuResult := BenchmarkGPUResult{
|
gpuResult := BenchmarkGPUResult{
|
||||||
Index: idx,
|
Index: idx,
|
||||||
@@ -285,6 +291,8 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
|
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} // end sequential path
|
||||||
|
|
||||||
if len(selected) > 1 && opts.RunNCCL {
|
if len(selected) > 1 && opts.RunNCCL {
|
||||||
result.Interconnect = runBenchmarkInterconnect(ctx, verboseLog, runDir, selected, spec, logFunc)
|
result.Interconnect = runBenchmarkInterconnect(ctx, verboseLog, runDir, selected, spec, logFunc)
|
||||||
if result.Interconnect != nil && result.Interconnect.Supported {
|
if result.Interconnect != nil && result.Interconnect.Supported {
|
||||||
@@ -318,8 +326,8 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
}
|
}
|
||||||
|
|
||||||
report := renderBenchmarkReportWithCharts(result, loadBenchmarkReportCharts(runDir, selected))
|
report := renderBenchmarkReportWithCharts(result, loadBenchmarkReportCharts(runDir, selected))
|
||||||
if err := os.WriteFile(filepath.Join(runDir, "report.txt"), []byte(report), 0644); err != nil {
|
if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(report), 0644); err != nil {
|
||||||
return "", fmt.Errorf("write report.txt: %w", err)
|
return "", fmt.Errorf("write report.md: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
summary := renderBenchmarkSummary(result)
|
summary := renderBenchmarkSummary(result)
|
||||||
@@ -362,9 +370,29 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// benchmarkGPUInfoQueries lists the nvidia-smi --query-gpu field sets to try,
// most complete first. The queries are attempted in order and the first one
// that succeeds wins. The extended fields (attribute.multiprocessor_count,
// power.default_limit) are not supported on all driver versions, so the second
// entry repeats the query with only the base fields as a fallback.
var benchmarkGPUInfoQueries = []struct {
	fields   string
	extended bool // true when fields includes the optional extended columns
}{
	// Full query: base fields followed by the two extended ones.
	{
		extended: true,
		fields:   "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit",
	},
	// Fallback query: base fields only.
	{
		extended: false,
		fields:   "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics",
	},
}
|
||||||
|
|
||||||
func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
|
func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
|
||||||
|
var lastErr error
|
||||||
|
for _, q := range benchmarkGPUInfoQueries {
|
||||||
args := []string{
|
args := []string{
|
||||||
"--query-gpu=index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit",
|
"--query-gpu=" + q.fields,
|
||||||
"--format=csv,noheader,nounits",
|
"--format=csv,noheader,nounits",
|
||||||
}
|
}
|
||||||
if len(gpuIndices) > 0 {
|
if len(gpuIndices) > 0 {
|
||||||
@@ -372,7 +400,8 @@ func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
|
|||||||
}
|
}
|
||||||
out, err := satExecCommand("nvidia-smi", args...).Output()
|
out, err := satExecCommand("nvidia-smi", args...).Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("nvidia-smi gpu info: %w", err)
|
lastErr = fmt.Errorf("nvidia-smi gpu info (%s): %w", q.fields[:min(len(q.fields), 40)], err)
|
||||||
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
r := csv.NewReader(strings.NewReader(string(out)))
|
r := csv.NewReader(strings.NewReader(string(out)))
|
||||||
@@ -380,7 +409,8 @@ func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
|
|||||||
r.FieldsPerRecord = -1
|
r.FieldsPerRecord = -1
|
||||||
rows, err := r.ReadAll()
|
rows, err := r.ReadAll()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("parse nvidia-smi gpu info: %w", err)
|
lastErr = fmt.Errorf("parse nvidia-smi gpu info: %w", err)
|
||||||
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
|
infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
|
||||||
@@ -405,17 +435,22 @@ func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
|
|||||||
if len(row) >= 9 {
|
if len(row) >= 9 {
|
||||||
info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8])
|
info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8])
|
||||||
}
|
}
|
||||||
|
if q.extended {
|
||||||
if len(row) >= 10 {
|
if len(row) >= 10 {
|
||||||
info.MultiprocessorCount = int(parseBenchmarkFloat(row[9]))
|
info.MultiprocessorCount = int(parseBenchmarkFloat(row[9]))
|
||||||
}
|
}
|
||||||
if len(row) >= 11 {
|
if len(row) >= 11 {
|
||||||
info.DefaultPowerLimitW = parseBenchmarkFloat(row[10])
|
info.DefaultPowerLimitW = parseBenchmarkFloat(row[10])
|
||||||
}
|
}
|
||||||
|
}
|
||||||
infoByIndex[idx] = info
|
infoByIndex[idx] = info
|
||||||
}
|
}
|
||||||
return infoByIndex, nil
|
return infoByIndex, nil
|
||||||
|
}
|
||||||
|
return nil, lastErr
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction {
|
func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction {
|
||||||
if os.Geteuid() != 0 {
|
if os.Geteuid() != 0 {
|
||||||
result.Normalization.Status = "partial"
|
result.Normalization.Status = "partial"
|
||||||
@@ -454,6 +489,10 @@ func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndi
|
|||||||
_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rgc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rgc"}, nil, nil)
|
_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rgc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rgc"}, nil, nil)
|
||||||
}})
|
}})
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
rec.GPUClockLockStatus = "skipped"
|
||||||
|
rec.Notes = append(rec.Notes, "graphics clock lock skipped: gpu inventory unavailable or MaxGraphicsClockMHz=0")
|
||||||
|
result.Normalization.Status = "partial"
|
||||||
}
|
}
|
||||||
|
|
||||||
if info, ok := infoByIndex[idx]; ok && info.MaxMemoryClockMHz > 0 {
|
if info, ok := infoByIndex[idx]; ok && info.MaxMemoryClockMHz > 0 {
|
||||||
@@ -1144,19 +1183,9 @@ func queryIPMIServerPowerW() (float64, error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return 0, fmt.Errorf("ipmitool dcmi power reading: %w", err)
|
return 0, fmt.Errorf("ipmitool dcmi power reading: %w", err)
|
||||||
}
|
}
|
||||||
for _, line := range strings.Split(string(out), "\n") {
|
if w := parseDCMIPowerReading(string(out)); w > 0 {
|
||||||
if strings.Contains(line, "Current Power") {
|
|
||||||
parts := strings.SplitN(line, ":", 2)
|
|
||||||
if len(parts) == 2 {
|
|
||||||
val := strings.TrimSpace(strings.TrimSuffix(strings.TrimSpace(parts[1]), "Watts"))
|
|
||||||
val = strings.TrimSpace(val)
|
|
||||||
w, err := strconv.ParseFloat(val, 64)
|
|
||||||
if err == nil && w > 0 {
|
|
||||||
return w, nil
|
return w, nil
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return 0, fmt.Errorf("could not parse ipmitool dcmi power reading output")
|
return 0, fmt.Errorf("could not parse ipmitool dcmi power reading output")
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1209,3 +1238,246 @@ func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, ipmiAvaila
|
|||||||
}
|
}
|
||||||
return sp
|
return sp
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// readServerModel reports the system product name that the kernel exposes via
// DMI at /sys/class/dmi/id/product_name, with surrounding whitespace removed.
// Any read error (for example the file not existing on this platform) yields
// an empty string rather than an error.
func readServerModel() string {
	raw, err := os.ReadFile("/sys/class/dmi/id/product_name")
	if err == nil {
		return strings.TrimSpace(string(raw))
	}
	return ""
}
|
||||||
|
|
||||||
|
// filterRowsByGPU returns only the metric rows for a specific GPU index.
|
||||||
|
func filterRowsByGPU(rows []GPUMetricRow, gpuIndex int) []GPUMetricRow {
|
||||||
|
var out []GPUMetricRow
|
||||||
|
for _, r := range rows {
|
||||||
|
if r.GPUIndex == gpuIndex {
|
||||||
|
out = append(out, r)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseBenchmarkBurnLogByGPU splits a multi-GPU bee-gpu-burn output by [gpu N] prefix
|
||||||
|
// and returns a per-GPU parse result map.
|
||||||
|
func parseBenchmarkBurnLogByGPU(raw string) map[int]benchmarkBurnParseResult {
|
||||||
|
gpuLines := make(map[int][]string)
|
||||||
|
for _, line := range strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if !strings.HasPrefix(line, "[gpu ") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
end := strings.Index(line, "] ")
|
||||||
|
if end < 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
gpuIdx, err := strconv.Atoi(strings.TrimSpace(line[5:end]))
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
gpuLines[gpuIdx] = append(gpuLines[gpuIdx], line[end+2:])
|
||||||
|
}
|
||||||
|
results := make(map[int]benchmarkBurnParseResult, len(gpuLines))
|
||||||
|
for gpuIdx, lines := range gpuLines {
|
||||||
|
// Lines are already stripped of the [gpu N] prefix; parseBenchmarkBurnLog
|
||||||
|
// calls stripBenchmarkPrefix which is a no-op on already-stripped lines.
|
||||||
|
results[gpuIdx] = parseBenchmarkBurnLog(strings.Join(lines, "\n"))
|
||||||
|
}
|
||||||
|
return results
|
||||||
|
}
|
||||||
|
|
||||||
|
// runNvidiaBenchmarkParallel runs warmup and steady compute on all selected GPUs
|
||||||
|
// simultaneously using a single bee-gpu-burn invocation per phase.
|
||||||
|
func runNvidiaBenchmarkParallel(
|
||||||
|
ctx context.Context,
|
||||||
|
verboseLog, runDir string,
|
||||||
|
selected []int,
|
||||||
|
infoByIndex map[int]benchmarkGPUInfo,
|
||||||
|
opts NvidiaBenchmarkOptions,
|
||||||
|
spec benchmarkProfileSpec,
|
||||||
|
logFunc func(string),
|
||||||
|
result *NvidiaBenchmarkResult,
|
||||||
|
serverIdleW *float64, serverLoadedWSum *float64,
|
||||||
|
serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int,
|
||||||
|
) {
|
||||||
|
allDevices := joinIndexList(selected)
|
||||||
|
|
||||||
|
// Build per-GPU result stubs.
|
||||||
|
gpuResults := make(map[int]*BenchmarkGPUResult, len(selected))
|
||||||
|
for _, idx := range selected {
|
||||||
|
r := &BenchmarkGPUResult{Index: idx, Status: "FAILED"}
|
||||||
|
if info, ok := infoByIndex[idx]; ok {
|
||||||
|
r.UUID = info.UUID
|
||||||
|
r.Name = info.Name
|
||||||
|
r.BusID = info.BusID
|
||||||
|
r.VBIOS = info.VBIOS
|
||||||
|
r.PowerLimitW = info.PowerLimitW
|
||||||
|
r.MultiprocessorCount = info.MultiprocessorCount
|
||||||
|
r.DefaultPowerLimitW = info.DefaultPowerLimitW
|
||||||
|
r.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
|
||||||
|
r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
|
||||||
|
r.MaxMemoryClockMHz = info.MaxMemoryClockMHz
|
||||||
|
}
|
||||||
|
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
|
||||||
|
r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
|
||||||
|
r.LockedMemoryClockMHz = norm.MemoryClockLockMHz
|
||||||
|
}
|
||||||
|
gpuResults[idx] = r
|
||||||
|
}
|
||||||
|
|
||||||
|
// Baseline: sample all GPUs together.
|
||||||
|
baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, selected)
|
||||||
|
if err != nil && err != context.Canceled {
|
||||||
|
for _, idx := range selected {
|
||||||
|
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "baseline sampling failed: "+err.Error())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, idx := range selected {
|
||||||
|
perGPU := filterRowsByGPU(baselineRows, idx)
|
||||||
|
gpuResults[idx].Baseline = summarizeBenchmarkTelemetry(perGPU)
|
||||||
|
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), perGPU)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sample server idle power once.
|
||||||
|
if !*serverIdleOK {
|
||||||
|
if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
|
||||||
|
*serverIdleW = w
|
||||||
|
*serverIdleOK = true
|
||||||
|
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Warmup: all GPUs simultaneously.
|
||||||
|
warmupCmd := []string{
|
||||||
|
"bee-gpu-burn",
|
||||||
|
"--seconds", strconv.Itoa(spec.WarmupSec),
|
||||||
|
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||||
|
"--devices", allDevices,
|
||||||
|
}
|
||||||
|
logFunc(fmt.Sprintf("GPUs %s: parallel warmup (%ds)", allDevices, spec.WarmupSec))
|
||||||
|
warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, runDir, "gpu-all-warmup", logFunc)
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, "gpu-all-warmup.log"), warmupOut, 0644)
|
||||||
|
for _, idx := range selected {
|
||||||
|
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-warmup", idx), filterRowsByGPU(warmupRows, idx))
|
||||||
|
}
|
||||||
|
if warmupErr != nil {
|
||||||
|
for _, idx := range selected {
|
||||||
|
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel warmup failed: "+warmupErr.Error())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Snapshot throttle counters before steady.
|
||||||
|
beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
|
||||||
|
for _, idx := range selected {
|
||||||
|
beforeThrottle[idx], _ = queryThrottleCounters(idx)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Steady: all GPUs simultaneously.
|
||||||
|
steadyCmd := []string{
|
||||||
|
"bee-gpu-burn",
|
||||||
|
"--seconds", strconv.Itoa(spec.SteadySec),
|
||||||
|
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||||
|
"--devices", allDevices,
|
||||||
|
}
|
||||||
|
logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (%ds)", allDevices, spec.SteadySec))
|
||||||
|
|
||||||
|
// Sample server power via IPMI in parallel with steady phase.
|
||||||
|
ipmiStopCh := make(chan struct{})
|
||||||
|
ipmiResultCh := make(chan float64, 1)
|
||||||
|
go func() {
|
||||||
|
defer close(ipmiResultCh)
|
||||||
|
var samples []float64
|
||||||
|
ticker := time.NewTicker(5 * time.Second)
|
||||||
|
defer ticker.Stop()
|
||||||
|
select {
|
||||||
|
case <-ipmiStopCh:
|
||||||
|
return
|
||||||
|
case <-time.After(15 * time.Second):
|
||||||
|
}
|
||||||
|
for {
|
||||||
|
if w, err := queryIPMIServerPowerW(); err == nil {
|
||||||
|
samples = append(samples, w)
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-ipmiStopCh:
|
||||||
|
if len(samples) > 0 {
|
||||||
|
var sum float64
|
||||||
|
for _, w := range samples {
|
||||||
|
sum += w
|
||||||
|
}
|
||||||
|
ipmiResultCh <- sum / float64(len(samples))
|
||||||
|
}
|
||||||
|
return
|
||||||
|
case <-ticker.C:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-steady.log", steadyCmd, nil, selected, runDir, "gpu-all-steady", logFunc)
|
||||||
|
close(ipmiStopCh)
|
||||||
|
if loadedW, ok := <-ipmiResultCh; ok {
|
||||||
|
*serverLoadedWSum += loadedW
|
||||||
|
(*serverLoadedSamples)++
|
||||||
|
*serverLoadedOK = true
|
||||||
|
logFunc(fmt.Sprintf("GPUs %s: server loaded power (IPMI): %.0f W", allDevices, loadedW))
|
||||||
|
}
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, "gpu-all-steady.log"), steadyOut, 0644)
|
||||||
|
|
||||||
|
afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
|
||||||
|
for _, idx := range selected {
|
||||||
|
afterThrottle[idx], _ = queryThrottleCounters(idx)
|
||||||
|
}
|
||||||
|
|
||||||
|
parseResults := parseBenchmarkBurnLogByGPU(string(steadyOut))
|
||||||
|
|
||||||
|
for _, idx := range selected {
|
||||||
|
perGPU := filterRowsByGPU(steadyRows, idx)
|
||||||
|
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-steady", idx), perGPU)
|
||||||
|
gpuResults[idx].Steady = summarizeBenchmarkTelemetry(perGPU)
|
||||||
|
gpuResults[idx].Throttle = diffThrottleCounters(beforeThrottle[idx], afterThrottle[idx])
|
||||||
|
|
||||||
|
if pr, ok := parseResults[idx]; ok {
|
||||||
|
gpuResults[idx].ComputeCapability = pr.ComputeCapability
|
||||||
|
gpuResults[idx].Backend = pr.Backend
|
||||||
|
gpuResults[idx].PrecisionResults = pr.Profiles
|
||||||
|
if pr.Fallback {
|
||||||
|
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if steadyErr != nil {
|
||||||
|
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel steady compute failed: "+steadyErr.Error())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cooldown: all GPUs together.
|
||||||
|
cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, selected)
|
||||||
|
if err != nil && err != context.Canceled {
|
||||||
|
for _, idx := range selected {
|
||||||
|
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "cooldown sampling failed: "+err.Error())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, idx := range selected {
|
||||||
|
perGPU := filterRowsByGPU(cooldownRows, idx)
|
||||||
|
gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU)
|
||||||
|
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-cooldown", idx), perGPU)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Score and finalize each GPU.
|
||||||
|
for _, idx := range selected {
|
||||||
|
r := gpuResults[idx]
|
||||||
|
r.Scores = scoreBenchmarkGPUResult(*r)
|
||||||
|
r.DegradationReasons = detectBenchmarkDegradationReasons(*r, result.Normalization.Status)
|
||||||
|
pr := parseResults[idx]
|
||||||
|
switch {
|
||||||
|
case steadyErr != nil:
|
||||||
|
r.Status = classifySATErrorStatus(steadyOut, steadyErr)
|
||||||
|
case pr.Fallback:
|
||||||
|
r.Status = "PARTIAL"
|
||||||
|
default:
|
||||||
|
r.Status = "OK"
|
||||||
|
}
|
||||||
|
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(*r))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -22,18 +22,53 @@ var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]*m`)
|
|||||||
|
|
||||||
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benchmarkReportChart) string {
|
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benchmarkReportChart) string {
|
||||||
var b strings.Builder
|
var b strings.Builder
|
||||||
fmt.Fprintf(&b, "Bee NVIDIA Benchmark Report\n")
|
|
||||||
fmt.Fprintf(&b, "===========================\n\n")
|
|
||||||
fmt.Fprintf(&b, "Generated: %s\n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
|
||||||
fmt.Fprintf(&b, "Host: %s\n", result.Hostname)
|
|
||||||
fmt.Fprintf(&b, "Profile: %s\n", result.BenchmarkProfile)
|
|
||||||
fmt.Fprintf(&b, "Overall status: %s\n", result.OverallStatus)
|
|
||||||
fmt.Fprintf(&b, "Selected GPUs: %s\n", joinIndexList(result.SelectedGPUIndices))
|
|
||||||
fmt.Fprintf(&b, "Normalization: %s\n\n", result.Normalization.Status)
|
|
||||||
|
|
||||||
|
// ── Header ────────────────────────────────────────────────────────────────
|
||||||
|
b.WriteString("# Bee NVIDIA Benchmark Report\n\n")
|
||||||
|
|
||||||
|
// System identity block
|
||||||
|
if result.ServerModel != "" {
|
||||||
|
fmt.Fprintf(&b, "**Server:** %s \n", result.ServerModel)
|
||||||
|
}
|
||||||
|
if result.Hostname != "" {
|
||||||
|
fmt.Fprintf(&b, "**Host:** %s \n", result.Hostname)
|
||||||
|
}
|
||||||
|
// GPU models summary
|
||||||
|
if len(result.GPUs) > 0 {
|
||||||
|
modelCount := make(map[string]int)
|
||||||
|
var modelOrder []string
|
||||||
|
for _, g := range result.GPUs {
|
||||||
|
m := strings.TrimSpace(g.Name)
|
||||||
|
if m == "" {
|
||||||
|
m = "Unknown GPU"
|
||||||
|
}
|
||||||
|
if modelCount[m] == 0 {
|
||||||
|
modelOrder = append(modelOrder, m)
|
||||||
|
}
|
||||||
|
modelCount[m]++
|
||||||
|
}
|
||||||
|
var parts []string
|
||||||
|
for _, m := range modelOrder {
|
||||||
|
if modelCount[m] == 1 {
|
||||||
|
parts = append(parts, m)
|
||||||
|
} else {
|
||||||
|
parts = append(parts, fmt.Sprintf("%d× %s", modelCount[m], m))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "**GPU(s):** %s \n", strings.Join(parts, ", "))
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
|
||||||
|
fmt.Fprintf(&b, "**App version:** %s \n", result.BenchmarkVersion)
|
||||||
|
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
||||||
|
if result.ParallelGPUs {
|
||||||
|
fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously) \n")
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
||||||
|
b.WriteString("\n")
|
||||||
|
|
||||||
|
// ── Executive Summary ─────────────────────────────────────────────────────
|
||||||
if len(result.Findings) > 0 {
|
if len(result.Findings) > 0 {
|
||||||
fmt.Fprintf(&b, "Executive Summary\n")
|
b.WriteString("## Executive Summary\n\n")
|
||||||
fmt.Fprintf(&b, "-----------------\n")
|
|
||||||
for _, finding := range result.Findings {
|
for _, finding := range result.Findings {
|
||||||
fmt.Fprintf(&b, "- %s\n", finding)
|
fmt.Fprintf(&b, "- %s\n", finding)
|
||||||
}
|
}
|
||||||
@@ -41,150 +76,207 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
|
|||||||
}
|
}
|
||||||
|
|
||||||
if len(result.Warnings) > 0 {
|
if len(result.Warnings) > 0 {
|
||||||
fmt.Fprintf(&b, "Warnings\n")
|
b.WriteString("## Warnings\n\n")
|
||||||
fmt.Fprintf(&b, "--------\n")
|
|
||||||
for _, warning := range result.Warnings {
|
for _, warning := range result.Warnings {
|
||||||
fmt.Fprintf(&b, "- %s\n", warning)
|
fmt.Fprintf(&b, "- %s\n", warning)
|
||||||
}
|
}
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
fmt.Fprintf(&b, "Per GPU Scorecard\n")
|
// ── Scorecard table ───────────────────────────────────────────────────────
|
||||||
fmt.Fprintf(&b, "-----------------\n")
|
b.WriteString("## Scorecard\n\n")
|
||||||
|
b.WriteString("| GPU | Status | Composite | Compute | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n")
|
||||||
|
b.WriteString("|-----|--------|-----------|---------|-------------|---------------|-----------------|-----------|-------------|\n")
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
fmt.Fprintf(&b, "GPU %d %s\n", gpu.Index, gpu.Name)
|
name := strings.TrimSpace(gpu.Name)
|
||||||
fmt.Fprintf(&b, " Status: %s\n", gpu.Status)
|
if name == "" {
|
||||||
fmt.Fprintf(&b, " Composite score: %.2f\n", gpu.Scores.CompositeScore)
|
name = "Unknown"
|
||||||
fmt.Fprintf(&b, " Compute score: %.2f\n", gpu.Scores.ComputeScore)
|
|
||||||
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
|
||||||
fmt.Fprintf(&b, " Compute efficiency: %.3f TOPS/SM/GHz\n", gpu.Scores.TOPSPerSMPerGHz)
|
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, " Power sustain: %.1f\n", gpu.Scores.PowerSustainScore)
|
interconnect := "-"
|
||||||
fmt.Fprintf(&b, " Thermal sustain: %.1f\n", gpu.Scores.ThermalSustainScore)
|
|
||||||
fmt.Fprintf(&b, " Stability: %.1f\n", gpu.Scores.StabilityScore)
|
|
||||||
if gpu.Scores.InterconnectScore > 0 {
|
if gpu.Scores.InterconnectScore > 0 {
|
||||||
fmt.Fprintf(&b, " Interconnect: %.1f\n", gpu.Scores.InterconnectScore)
|
interconnect = fmt.Sprintf("%.1f", gpu.Scores.InterconnectScore)
|
||||||
}
|
}
|
||||||
if len(gpu.DegradationReasons) > 0 {
|
topsPerSM := "-"
|
||||||
fmt.Fprintf(&b, " Degradation reasons: %s\n", strings.Join(gpu.DegradationReasons, ", "))
|
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||||||
|
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, " Avg power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.AvgPowerW, gpu.Steady.AvgTempC, gpu.Steady.AvgGraphicsClockMHz)
|
fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %.1f | %.1f | %.1f | %s |\n",
|
||||||
fmt.Fprintf(&b, " P95 power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.P95PowerW, gpu.Steady.P95TempC, gpu.Steady.P95GraphicsClockMHz)
|
gpu.Index, name,
|
||||||
|
gpu.Status,
|
||||||
|
gpu.Scores.CompositeScore,
|
||||||
|
gpu.Scores.ComputeScore,
|
||||||
|
topsPerSM,
|
||||||
|
gpu.Scores.PowerSustainScore,
|
||||||
|
gpu.Scores.ThermalSustainScore,
|
||||||
|
gpu.Scores.StabilityScore,
|
||||||
|
interconnect,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
|
||||||
|
// ── Per GPU detail ────────────────────────────────────────────────────────
|
||||||
|
b.WriteString("## Per-GPU Details\n\n")
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
name := strings.TrimSpace(gpu.Name)
|
||||||
|
if name == "" {
|
||||||
|
name = "Unknown GPU"
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, name)
|
||||||
|
|
||||||
|
// Identity
|
||||||
|
if gpu.BusID != "" {
|
||||||
|
fmt.Fprintf(&b, "- **Bus ID:** %s\n", gpu.BusID)
|
||||||
|
}
|
||||||
|
if gpu.VBIOS != "" {
|
||||||
|
fmt.Fprintf(&b, "- **vBIOS:** %s\n", gpu.VBIOS)
|
||||||
|
}
|
||||||
|
if gpu.ComputeCapability != "" {
|
||||||
|
fmt.Fprintf(&b, "- **Compute capability:** %s\n", gpu.ComputeCapability)
|
||||||
|
}
|
||||||
|
if gpu.MultiprocessorCount > 0 {
|
||||||
|
fmt.Fprintf(&b, "- **SMs:** %d\n", gpu.MultiprocessorCount)
|
||||||
|
}
|
||||||
|
if gpu.PowerLimitW > 0 {
|
||||||
|
fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
|
||||||
|
}
|
||||||
|
if gpu.LockedGraphicsClockMHz > 0 {
|
||||||
|
fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
|
||||||
|
// Steady-state telemetry
|
||||||
|
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
|
||||||
|
b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
|
||||||
|
fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
|
||||||
|
fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
|
||||||
|
fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
|
||||||
|
fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
|
||||||
|
fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
|
||||||
|
b.WriteString("\n")
|
||||||
|
|
||||||
|
// Throttle
|
||||||
|
throttle := formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec)
|
||||||
|
if throttle != "none" {
|
||||||
|
fmt.Fprintf(&b, "**Throttle:** %s\n\n", throttle)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Precision results
|
||||||
if len(gpu.PrecisionResults) > 0 {
|
if len(gpu.PrecisionResults) > 0 {
|
||||||
fmt.Fprintf(&b, " Precision results:\n")
|
b.WriteString("**Precision results:**\n\n")
|
||||||
for _, precision := range gpu.PrecisionResults {
|
b.WriteString("| Precision | TOPS | Lanes | Iterations |\n|-----------|------|-------|------------|\n")
|
||||||
if precision.Supported {
|
for _, p := range gpu.PrecisionResults {
|
||||||
fmt.Fprintf(&b, " - %s: %.2f TOPS lanes=%d iterations=%d\n", precision.Name, precision.TeraOpsPerSec, precision.Lanes, precision.Iterations)
|
if p.Supported {
|
||||||
|
fmt.Fprintf(&b, "| %s | %.2f | %d | %d |\n", p.Name, p.TeraOpsPerSec, p.Lanes, p.Iterations)
|
||||||
} else {
|
} else {
|
||||||
fmt.Fprintf(&b, " - %s: unsupported (%s)\n", precision.Name, precision.Notes)
|
fmt.Fprintf(&b, "| %s | — (unsupported) | — | — |\n", p.Name)
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fmt.Fprintf(&b, " Throttle: %s\n", formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec))
|
|
||||||
if len(gpu.Notes) > 0 {
|
|
||||||
fmt.Fprintf(&b, " Notes:\n")
|
|
||||||
for _, note := range gpu.Notes {
|
|
||||||
fmt.Fprintf(&b, " - %s\n", note)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
if result.Interconnect != nil {
|
// Degradation / Notes
|
||||||
fmt.Fprintf(&b, "Interconnect\n")
|
if len(gpu.DegradationReasons) > 0 {
|
||||||
fmt.Fprintf(&b, "------------\n")
|
fmt.Fprintf(&b, "**Degradation reasons:** %s\n\n", strings.Join(gpu.DegradationReasons, ", "))
|
||||||
fmt.Fprintf(&b, "Status: %s\n", result.Interconnect.Status)
|
|
||||||
if result.Interconnect.Supported {
|
|
||||||
fmt.Fprintf(&b, "Avg algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.AvgBusBWGBps)
|
|
||||||
fmt.Fprintf(&b, "Max algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.MaxAlgBWGBps, result.Interconnect.MaxBusBWGBps)
|
|
||||||
}
|
}
|
||||||
for _, note := range result.Interconnect.Notes {
|
if len(gpu.Notes) > 0 {
|
||||||
|
b.WriteString("**Notes:**\n\n")
|
||||||
|
for _, note := range gpu.Notes {
|
||||||
fmt.Fprintf(&b, "- %s\n", note)
|
fmt.Fprintf(&b, "- %s\n", note)
|
||||||
}
|
}
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Interconnect ──────────────────────────────────────────────────────────
|
||||||
|
if result.Interconnect != nil {
|
||||||
|
b.WriteString("## Interconnect (NCCL)\n\n")
|
||||||
|
fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
|
||||||
|
if result.Interconnect.Supported {
|
||||||
|
b.WriteString("| Metric | Avg | Max |\n|--------|-----|-----|\n")
|
||||||
|
fmt.Fprintf(&b, "| Alg BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.MaxAlgBWGBps)
|
||||||
|
fmt.Fprintf(&b, "| Bus BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgBusBWGBps, result.Interconnect.MaxBusBWGBps)
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
for _, note := range result.Interconnect.Notes {
|
||||||
|
fmt.Fprintf(&b, "- %s\n", note)
|
||||||
|
}
|
||||||
|
if len(result.Interconnect.Notes) > 0 {
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Server Power (IPMI) ───────────────────────────────────────────────────
|
||||||
|
if sp := result.ServerPower; sp != nil {
|
||||||
|
b.WriteString("## Server Power (IPMI)\n\n")
|
||||||
|
if !sp.Available {
|
||||||
|
b.WriteString("IPMI power measurement unavailable.\n\n")
|
||||||
|
} else {
|
||||||
|
b.WriteString("| | Value |\n|---|---|\n")
|
||||||
|
fmt.Fprintf(&b, "| Server idle | %.0f W |\n", sp.IdleW)
|
||||||
|
fmt.Fprintf(&b, "| Server under load | %.0f W |\n", sp.LoadedW)
|
||||||
|
fmt.Fprintf(&b, "| Server delta (load − idle) | %.0f W |\n", sp.DeltaW)
|
||||||
|
fmt.Fprintf(&b, "| GPU-reported sum | %.0f W |\n", sp.GPUReportedSumW)
|
||||||
|
if sp.ReportingRatio > 0 {
|
||||||
|
fmt.Fprintf(&b, "| Reporting ratio | %.2f (1.0 = accurate, <0.75 = GPU over-reports) |\n", sp.ReportingRatio)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
for _, note := range sp.Notes {
|
||||||
|
fmt.Fprintf(&b, "- %s\n", note)
|
||||||
|
}
|
||||||
|
if len(sp.Notes) > 0 {
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Terminal charts (steady-state only) ───────────────────────────────────
|
||||||
if len(charts) > 0 {
|
if len(charts) > 0 {
|
||||||
fmt.Fprintf(&b, "Terminal Charts\n")
|
b.WriteString("## Steady-State Charts\n\n")
|
||||||
fmt.Fprintf(&b, "---------------\n")
|
|
||||||
for _, chart := range charts {
|
for _, chart := range charts {
|
||||||
content := strings.TrimSpace(stripANSIEscapeSequences(chart.Content))
|
content := strings.TrimSpace(stripANSIEscapeSequences(chart.Content))
|
||||||
if content == "" {
|
if content == "" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, "%s\n", chart.Title)
|
fmt.Fprintf(&b, "### %s\n\n```\n%s\n```\n\n", chart.Title, content)
|
||||||
fmt.Fprintf(&b, "%s\n", strings.Repeat("~", len(chart.Title)))
|
|
||||||
fmt.Fprintf(&b, "%s\n\n", content)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if sp := result.ServerPower; sp != nil {
|
// ── Methodology ───────────────────────────────────────────────────────────
|
||||||
fmt.Fprintf(&b, "Server Power (IPMI)\n")
|
b.WriteString("## Methodology\n\n")
|
||||||
fmt.Fprintf(&b, "-------------------\n")
|
fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline → warmup → steady-state → interconnect → cooldown phases.\n", result.BenchmarkProfile)
|
||||||
if !sp.Available {
|
b.WriteString("- Single-GPU compute score from bee-gpu-burn cuBLASLt when available.\n")
|
||||||
fmt.Fprintf(&b, "Unavailable\n")
|
b.WriteString("- Thermal and power limitations inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
|
||||||
} else {
|
b.WriteString("- `result.json` is the canonical machine-readable source for this benchmark run.\n\n")
|
||||||
fmt.Fprintf(&b, " Server idle: %.0f W\n", sp.IdleW)
|
|
||||||
fmt.Fprintf(&b, " Server under load: %.0f W\n", sp.LoadedW)
|
|
||||||
fmt.Fprintf(&b, " Server delta: %.0f W\n", sp.DeltaW)
|
|
||||||
fmt.Fprintf(&b, " GPU reported (sum): %.0f W\n", sp.GPUReportedSumW)
|
|
||||||
if sp.ReportingRatio > 0 {
|
|
||||||
fmt.Fprintf(&b, " Reporting ratio: %.2f (1.0 = accurate, <0.75 = GPU over-reports)\n", sp.ReportingRatio)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for _, note := range sp.Notes {
|
|
||||||
fmt.Fprintf(&b, " Note: %s\n", note)
|
|
||||||
}
|
|
||||||
b.WriteString("\n")
|
|
||||||
}
|
|
||||||
|
|
||||||
fmt.Fprintf(&b, "Methodology\n")
|
// ── Raw files ─────────────────────────────────────────────────────────────
|
||||||
fmt.Fprintf(&b, "-----------\n")
|
b.WriteString("## Raw Files\n\n")
|
||||||
fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile)
|
b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
|
||||||
fmt.Fprintf(&b, "- Single-GPU compute score comes from bee-gpu-burn cuBLASLt output when available.\n")
|
b.WriteString("- `gpu-*-baseline-metrics.csv/html/term.txt`\n")
|
||||||
fmt.Fprintf(&b, "- Thermal and power limitations are inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
|
b.WriteString("- `gpu-*-warmup.log`\n")
|
||||||
fmt.Fprintf(&b, "- result.json is the canonical machine-readable source for this benchmark run.\n\n")
|
b.WriteString("- `gpu-*-steady.log`\n")
|
||||||
|
b.WriteString("- `gpu-*-steady-metrics.csv/html/term.txt`\n")
|
||||||
fmt.Fprintf(&b, "Raw Files\n")
|
b.WriteString("- `gpu-*-cooldown-metrics.csv/html/term.txt`\n")
|
||||||
fmt.Fprintf(&b, "---------\n")
|
|
||||||
fmt.Fprintf(&b, "- result.json\n")
|
|
||||||
fmt.Fprintf(&b, "- report.txt\n")
|
|
||||||
fmt.Fprintf(&b, "- summary.txt\n")
|
|
||||||
fmt.Fprintf(&b, "- verbose.log\n")
|
|
||||||
fmt.Fprintf(&b, "- gpu-*-baseline-metrics.csv/html/term.txt\n")
|
|
||||||
fmt.Fprintf(&b, "- gpu-*-warmup.log\n")
|
|
||||||
fmt.Fprintf(&b, "- gpu-*-steady.log\n")
|
|
||||||
fmt.Fprintf(&b, "- gpu-*-steady-metrics.csv/html/term.txt\n")
|
|
||||||
fmt.Fprintf(&b, "- gpu-*-cooldown-metrics.csv/html/term.txt\n")
|
|
||||||
if result.Interconnect != nil {
|
if result.Interconnect != nil {
|
||||||
fmt.Fprintf(&b, "- nccl-all-reduce.log\n")
|
b.WriteString("- `nccl-all-reduce.log`\n")
|
||||||
}
|
}
|
||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// loadBenchmarkReportCharts loads only steady-state terminal charts (baseline and
|
||||||
|
// cooldown charts are not useful for human review).
|
||||||
func loadBenchmarkReportCharts(runDir string, gpuIndices []int) []benchmarkReportChart {
|
func loadBenchmarkReportCharts(runDir string, gpuIndices []int) []benchmarkReportChart {
|
||||||
phases := []struct {
|
|
||||||
name string
|
|
||||||
label string
|
|
||||||
}{
|
|
||||||
{name: "baseline", label: "Baseline"},
|
|
||||||
{name: "steady", label: "Steady State"},
|
|
||||||
{name: "cooldown", label: "Cooldown"},
|
|
||||||
}
|
|
||||||
var charts []benchmarkReportChart
|
var charts []benchmarkReportChart
|
||||||
for _, idx := range gpuIndices {
|
for _, idx := range gpuIndices {
|
||||||
for _, phase := range phases {
|
path := filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady-metrics-term.txt", idx))
|
||||||
path := filepath.Join(runDir, fmt.Sprintf("gpu-%d-%s-metrics-term.txt", idx, phase.name))
|
|
||||||
raw, err := os.ReadFile(path)
|
raw, err := os.ReadFile(path)
|
||||||
if err != nil || len(raw) == 0 {
|
if err != nil || len(raw) == 0 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
charts = append(charts, benchmarkReportChart{
|
charts = append(charts, benchmarkReportChart{
|
||||||
Title: fmt.Sprintf("GPU %d %s", idx, phase.label),
|
Title: fmt.Sprintf("GPU %d — Steady State", idx),
|
||||||
Content: string(raw),
|
Content: string(raw),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
|
||||||
return charts
|
return charts
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -137,8 +137,9 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
|
|||||||
for _, needle := range []string{
|
for _, needle := range []string{
|
||||||
"Executive Summary",
|
"Executive Summary",
|
||||||
"GPU 0 spent measurable time under SW power cap.",
|
"GPU 0 spent measurable time under SW power cap.",
|
||||||
"Composite score: 1176.00",
|
"1176.00",
|
||||||
"fp16_tensor: 700.00 TOPS",
|
"fp16_tensor",
|
||||||
|
"700.00",
|
||||||
} {
|
} {
|
||||||
if !strings.Contains(report, needle) {
|
if !strings.Contains(report, needle) {
|
||||||
t.Fatalf("report missing %q\n%s", needle, report)
|
t.Fatalf("report missing %q\n%s", needle, report)
|
||||||
@@ -164,7 +165,7 @@ func TestRenderBenchmarkReportIncludesTerminalChartsWithoutANSI(t *testing.T) {
|
|||||||
})
|
})
|
||||||
|
|
||||||
for _, needle := range []string{
|
for _, needle := range []string{
|
||||||
"Terminal Charts",
|
"Steady-State Charts",
|
||||||
"GPU 0 Steady State",
|
"GPU 0 Steady State",
|
||||||
"GPU 0 chart",
|
"GPU 0 chart",
|
||||||
"42┤───",
|
"42┤───",
|
||||||
|
|||||||
@@ -14,13 +14,17 @@ type NvidiaBenchmarkOptions struct {
|
|||||||
GPUIndices []int
|
GPUIndices []int
|
||||||
ExcludeGPUIndices []int
|
ExcludeGPUIndices []int
|
||||||
RunNCCL bool
|
RunNCCL bool
|
||||||
|
ParallelGPUs bool // run all selected GPUs simultaneously instead of sequentially
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
type NvidiaBenchmarkResult struct {
|
type NvidiaBenchmarkResult struct {
|
||||||
BenchmarkVersion string `json:"benchmark_version"`
|
BenchmarkVersion string `json:"benchmark_version"`
|
||||||
GeneratedAt time.Time `json:"generated_at"`
|
GeneratedAt time.Time `json:"generated_at"`
|
||||||
Hostname string `json:"hostname,omitempty"`
|
Hostname string `json:"hostname,omitempty"`
|
||||||
|
ServerModel string `json:"server_model,omitempty"`
|
||||||
BenchmarkProfile string `json:"benchmark_profile"`
|
BenchmarkProfile string `json:"benchmark_profile"`
|
||||||
|
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||||
OverallStatus string `json:"overall_status"`
|
OverallStatus string `json:"overall_status"`
|
||||||
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||||
Findings []string `json:"findings,omitempty"`
|
Findings []string `json:"findings,omitempty"`
|
||||||
|
|||||||
142
audit/internal/platform/hpl.go
Normal file
142
audit/internal/platform/hpl.go
Normal file
@@ -0,0 +1,142 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
	"context"
	"errors"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"strconv"
	"strings"
	"time"
)
|
||||||
|
|
||||||
|
// HPLOptions holds the tunables for one HPL (LINPACK) benchmark run.
// Unusable values are replaced with the defaults noted on each field by
// applyHPLDefaults.
type HPLOptions struct {
	// MemFraction is the fraction of RAM to use (default 0.80).
	MemFraction float64
	// NB is the block size (default 256).
	NB int
}
|
||||||
|
|
||||||
|
// HPLResult is the outcome of one HPL run as extracted from the xhpl output.
type HPLResult struct {
	// N is the matrix dimension.
	N int
	// NB is the block size.
	NB int
	// P is the number of process grid rows.
	P int
	// Q is the number of process grid columns.
	Q int
	// TimeSec is the wall time in seconds.
	TimeSec float64
	// GFlops is the achieved performance.
	GFlops float64
	// Residual is the backward error residual taken from the HPL
	// verification line.
	Residual float64
	// Status is "PASSED" or "FAILED".
	Status string
	// RawOutput is the full xhpl output.
	RawOutput string
}
|
||||||
|
|
||||||
|
func applyHPLDefaults(opts *HPLOptions) {
|
||||||
|
if opts.MemFraction <= 0 || opts.MemFraction > 1 {
|
||||||
|
opts.MemFraction = 0.80
|
||||||
|
}
|
||||||
|
if opts.NB <= 0 {
|
||||||
|
opts.NB = 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// RunHPL runs bee-hpl and returns parsed results plus a tar.gz artifact path.
|
||||||
|
func (s *System) RunHPL(ctx context.Context, baseDir string, opts HPLOptions, logFunc func(string)) (string, *HPLResult, error) {
|
||||||
|
applyHPLDefaults(&opts)
|
||||||
|
|
||||||
|
if baseDir == "" {
|
||||||
|
baseDir = "/var/log/bee-sat"
|
||||||
|
}
|
||||||
|
ts := time.Now().UTC().Format("20060102-150405")
|
||||||
|
runDir := filepath.Join(baseDir, "hpl-"+ts)
|
||||||
|
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||||
|
return "", nil, fmt.Errorf("mkdir %s: %w", runDir, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
logPath := filepath.Join(runDir, "hpl.log")
|
||||||
|
|
||||||
|
cmd := []string{
|
||||||
|
"bee-hpl",
|
||||||
|
"--mem-fraction", strconv.FormatFloat(opts.MemFraction, 'f', 2, 64),
|
||||||
|
"--nb", strconv.Itoa(opts.NB),
|
||||||
|
}
|
||||||
|
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(fmt.Sprintf("HPL: N will be auto-sized to %.0f%% of RAM, NB=%d", opts.MemFraction*100, opts.NB))
|
||||||
|
}
|
||||||
|
|
||||||
|
out, err := runSATCommandCtx(ctx, "", "hpl", cmd, nil, logFunc)
|
||||||
|
_ = os.WriteFile(logPath, out, 0644)
|
||||||
|
|
||||||
|
result := parseHPLOutput(string(out))
|
||||||
|
result.RawOutput = string(out)
|
||||||
|
|
||||||
|
if err != nil && err != context.Canceled {
|
||||||
|
return "", result, fmt.Errorf("bee-hpl failed: %w", err)
|
||||||
|
}
|
||||||
|
if err == nil && result.GFlops <= 0 {
|
||||||
|
return "", result, fmt.Errorf("HPL completed but no Gflops result found in output")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write summary
|
||||||
|
summary := fmt.Sprintf("N=%d NB=%d time=%.2fs gflops=%.3f status=%s\n",
|
||||||
|
result.N, result.NB, result.TimeSec, result.GFlops, result.Status)
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
|
||||||
|
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(fmt.Sprintf("HPL result: N=%d NB=%d %.2fs %.3f Gflops %s",
|
||||||
|
result.N, result.NB, result.TimeSec, result.GFlops, result.Status))
|
||||||
|
}
|
||||||
|
|
||||||
|
ts2 := time.Now().UTC().Format("20060102-150405")
|
||||||
|
archive := filepath.Join(baseDir, "hpl-"+ts2+".tar.gz")
|
||||||
|
if archErr := createTarGz(archive, runDir); archErr != nil {
|
||||||
|
return runDir, result, err
|
||||||
|
}
|
||||||
|
return archive, result, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseHPLOutput extracts N, NB, time, and Gflops from standard HPL output.
|
||||||
|
//
|
||||||
|
// HPL prints a result line of the form:
|
||||||
|
//
|
||||||
|
// WR00L2L2 45312 256 1 1 1234.56 5.678e+01
|
||||||
|
// T/V N NB P Q Time Gflops
|
||||||
|
func parseHPLOutput(output string) *HPLResult {
|
||||||
|
result := &HPLResult{Status: "FAILED"}
|
||||||
|
for _, line := range strings.Split(output, "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
// Result line starts with WR
|
||||||
|
if strings.HasPrefix(line, "WR") {
|
||||||
|
fields := strings.Fields(line)
|
||||||
|
// WR00L2L2 N NB P Q Time Gflops
|
||||||
|
if len(fields) >= 7 {
|
||||||
|
result.N, _ = strconv.Atoi(fields[1])
|
||||||
|
result.NB, _ = strconv.Atoi(fields[2])
|
||||||
|
result.P, _ = strconv.Atoi(fields[3])
|
||||||
|
result.Q, _ = strconv.Atoi(fields[4])
|
||||||
|
result.TimeSec, _ = strconv.ParseFloat(fields[5], 64)
|
||||||
|
result.GFlops, _ = strconv.ParseFloat(fields[6], 64)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Verification line: "||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)= ... PASSED"
|
||||||
|
if strings.Contains(line, "PASSED") {
|
||||||
|
result.Status = "PASSED"
|
||||||
|
fields := strings.Fields(line)
|
||||||
|
for i, f := range fields {
|
||||||
|
if f == "PASSED" && i > 0 {
|
||||||
|
result.Residual, _ = strconv.ParseFloat(fields[i-1], 64)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
// hplAvailable reports whether the HPL benchmark can be launched: bee-hpl must
// be resolvable through PATH and /usr/local/lib/bee/xhpl must be a regular
// file with at least one execute permission bit set.
func hplAvailable() bool {
	if _, err := exec.LookPath("bee-hpl"); err != nil {
		return false
	}
	info, err := os.Stat("/usr/local/lib/bee/xhpl")
	if err != nil {
		return false
	}
	// Existence alone is not enough: a directory or a non-executable file at
	// that path would make the run fail later with a less obvious error.
	mode := info.Mode()
	return mode.IsRegular() && mode.Perm()&0111 != 0
}
|
||||||
@@ -116,25 +116,47 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
|
|||||||
if err := ctx.Err(); err != nil {
|
if err := ctx.Err(); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
if err := exec.Command("mount", "--bind", dstDir, "/run/live/medium").Run(); err != nil {
|
|
||||||
log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err))
|
mediumRebound := false
|
||||||
|
if err := bindMount(dstDir, "/run/live/medium"); err != nil {
|
||||||
|
log(fmt.Sprintf("Warning: rebind /run/live/medium → %s failed: %v", dstDir, err))
|
||||||
|
} else {
|
||||||
|
mediumRebound = true
|
||||||
}
|
}
|
||||||
|
|
||||||
log("Verifying live medium now served from RAM...")
|
log("Verifying live medium now served from RAM...")
|
||||||
status := s.LiveBootSource()
|
status := s.LiveBootSource()
|
||||||
if err := verifyInstallToRAMStatus(status); err != nil {
|
if err := verifyInstallToRAMStatus(status, dstDir, mediumRebound, log); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
if status.InRAM {
|
||||||
log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
|
log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
|
||||||
log("Done. Installation media can be safely disconnected.")
|
}
|
||||||
|
log("Done. Squashfs files are in RAM. Installation media can be safely disconnected.")
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func verifyInstallToRAMStatus(status LiveBootSource) error {
|
func verifyInstallToRAMStatus(status LiveBootSource, dstDir string, mediumRebound bool, log func(string)) error {
|
||||||
if status.InRAM {
|
if status.InRAM {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s", describeLiveBootSource(status))
|
|
||||||
|
// The live medium mount was not redirected to RAM. This is expected when
|
||||||
|
// booting from an ISO/CD-ROM: the squashfs loop device has a non-zero
|
||||||
|
// offset and LOOP_CHANGE_FD cannot be used; the bind mount also fails
|
||||||
|
// because the CD-ROM mount is in use. Check whether files were at least
|
||||||
|
// copied to the tmpfs directory — that is sufficient for safe disconnection
|
||||||
|
// once the kernel has paged in all actively-used data.
|
||||||
|
files, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
|
||||||
|
if len(files) > 0 {
|
||||||
|
if !mediumRebound {
|
||||||
|
log(fmt.Sprintf("Note: squashfs copied to RAM (%s) but /run/live/medium still shows the original source.", dstDir))
|
||||||
|
log("This is normal for CD-ROM boots. For a fully transparent RAM boot, add 'toram' to the kernel parameters.")
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s and no squashfs found in %s", describeLiveBootSource(status), dstDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
func describeLiveBootSource(status LiveBootSource) string {
|
func describeLiveBootSource(status LiveBootSource) string {
|
||||||
@@ -247,7 +269,31 @@ func findLoopForFile(backingFile string) (string, error) {
|
|||||||
return "", fmt.Errorf("no loop device found for %s", backingFile)
|
return "", fmt.Errorf("no loop device found for %s", backingFile)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// loopDeviceOffset returns the byte offset configured for the loop device,
// or -1 if it cannot be determined (losetup failed, or its JSON output could
// not be interpreted).
func loopDeviceOffset(loopDev string) int64 {
	out, err := exec.Command("losetup", "--json", loopDev).Output()
	if err != nil {
		return -1
	}
	return parseLoopDeviceOffset(out)
}

// parseLoopDeviceOffset extracts the offset of the first entry in the
// "loopdevices" array of losetup's JSON output, or -1 when that is impossible.
//
// The offset is decoded through json.Number so that both a bare number
// ("offset": 0) and a quoted one ("offset": "0") are understood.
// NOTE(review): quoted numeric columns are believed to occur with some
// util-linux releases — confirm against the versions shipped on the image.
func parseLoopDeviceOffset(raw []byte) int64 {
	var listing struct {
		Loopdevices []struct {
			Offset json.Number `json:"offset"`
		} `json:"loopdevices"`
	}
	if err := json.Unmarshal(raw, &listing); err != nil || len(listing.Loopdevices) == 0 {
		return -1
	}
	offset := listing.Loopdevices[0].Offset
	if offset == "" {
		// Field absent or null: treated as offset 0, as when decoding
		// straight into an integer.
		return 0
	}
	value, err := offset.Int64()
	if err != nil {
		return -1
	}
	return value
}
|
||||||
|
|
||||||
func reassociateLoopDevice(loopDev, newFile string) error {
|
func reassociateLoopDevice(loopDev, newFile string) error {
|
||||||
|
// LOOP_CHANGE_FD requires lo_offset == 0. ISO/CD-ROM loop devices are
|
||||||
|
// typically set up with a non-zero offset (squashfs lives inside the ISO),
|
||||||
|
// so the ioctl returns EINVAL. Detect this early for a clear error message.
|
||||||
|
if off := loopDeviceOffset(loopDev); off > 0 {
|
||||||
|
return fmt.Errorf("loop device has non-zero offset (%d bytes, typical for ISO/CD-ROM) — LOOP_CHANGE_FD not supported; use 'toram' kernel parameter for RAM boot", off)
|
||||||
|
}
|
||||||
if err := exec.Command("losetup", "--replace", loopDev, newFile).Run(); err == nil {
|
if err := exec.Command("losetup", "--replace", loopDev, newFile).Run(); err == nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -26,3 +26,8 @@ func loopChangeFD(loopDev, newFile string) error {
|
|||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// bindMount bind-mounts src over dst by issuing the mount syscall directly,
// which avoids depending on an external mount binary being found on PATH.
func bindMount(src, dst string) error {
	const (
		noFSType = "" // no filesystem type is passed for the bind mount
		noData   = "" // no mount options are passed
	)
	return syscall.Mount(src, dst, noFSType, syscall.MS_BIND, noData)
}
|
||||||
|
|||||||
@@ -7,3 +7,7 @@ import "errors"
|
|||||||
func loopChangeFD(loopDev, newFile string) error {
|
func loopChangeFD(loopDev, newFile string) error {
|
||||||
return errors.New("LOOP_CHANGE_FD not available on this platform")
|
return errors.New("LOOP_CHANGE_FD not available on this platform")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// bindMount is the fallback for platforms without bind-mount support. It
// ignores both arguments and always returns an error.
func bindMount(src, dst string) error {
	const reason = "bind mount not available on this platform"
	return errors.New(reason)
}
|
||||||
|
|||||||
@@ -33,14 +33,17 @@ func TestInferLiveBootKind(t *testing.T) {
|
|||||||
func TestVerifyInstallToRAMStatus(t *testing.T) {
|
func TestVerifyInstallToRAMStatus(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
if err := verifyInstallToRAMStatus(LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}); err != nil {
|
dstDir := t.TempDir()
|
||||||
|
|
||||||
|
if err := verifyInstallToRAMStatus(LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}, dstDir, false, nil); err != nil {
|
||||||
t.Fatalf("expected success for RAM-backed status, got %v", err)
|
t.Fatalf("expected success for RAM-backed status, got %v", err)
|
||||||
}
|
}
|
||||||
err := verifyInstallToRAMStatus(LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"})
|
|
||||||
|
err := verifyInstallToRAMStatus(LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"}, dstDir, false, nil)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
t.Fatal("expected verification failure when media is still on USB")
|
t.Fatal("expected verification failure when media is still on USB")
|
||||||
}
|
}
|
||||||
if got := err.Error(); got != "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1)" {
|
if got := err.Error(); got != "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1) and no squashfs found in "+dstDir {
|
||||||
t.Fatalf("error=%q", got)
|
t.Fatalf("error=%q", got)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -88,6 +88,37 @@ type NvidiaGPU struct {
|
|||||||
MemoryMB int `json:"memory_mb"`
|
MemoryMB int `json:"memory_mb"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NvidiaGPUStatus is the JSON-serialisable health record of one NVIDIA GPU as
// produced by ListNvidiaGPUStatuses from a single nvidia-smi CSV row.
type NvidiaGPUStatus struct {
	// Index is the GPU index parsed from the first CSV column.
	Index int `json:"index"`
	// Name is the GPU name from the second CSV column.
	Name string `json:"name"`
	// BDF is the PCI bus ID after normalizeNvidiaBusID.
	BDF string `json:"bdf,omitempty"`
	// Serial is the serial column as printed by nvidia-smi.
	Serial string `json:"serial,omitempty"`
	// Status is "OK", "RESET_REQUIRED", or "UNKNOWN" for unparsable rows.
	Status string `json:"status"`
	// RawLine is the trimmed nvidia-smi row the record was built from.
	RawLine string `json:"raw_line,omitempty"`
	// NeedsReset is true when the row contains "GPU requires reset"
	// (matched case-insensitively).
	NeedsReset bool `json:"needs_reset"`
	// ParseFailure marks rows that had too few columns or a non-numeric index.
	ParseFailure bool `json:"parse_failure,omitempty"`
}
|
||||||
|
|
||||||
|
// nvidiaGPUHealth is a package-internal health snapshot of one GPU.
// NOTE(review): presumably the element type returned by readNvidiaGPUHealth —
// confirm against that function.
type nvidiaGPUHealth struct {
	// Index is the GPU index.
	Index int
	// Name is the GPU name.
	Name string
	// NeedsReset reports whether the GPU was seen in a "requires reset" state.
	NeedsReset bool
	// RawLine is the raw text the snapshot was derived from.
	RawLine string
	// ParseFailure marks a snapshot whose source text could not be parsed.
	ParseFailure bool
}
|
||||||
|
|
||||||
|
// nvidiaGPUStatusFile accumulates everything that writeNvidiaGPUStatusFiles
// emits into a per-GPU gpu-<index>-status.txt file of an NVIDIA acceptance run.
type nvidiaGPUStatusFile struct {
	// Index is the GPU index (written as gpu_index).
	Index int
	// Name is the GPU name (written as gpu_name).
	Name string
	// RunStatus is the most severe job status recorded for this GPU.
	RunStatus string
	// Reason is a one-line explanation accompanying RunStatus.
	Reason string
	// Health is the health verdict (written as health_status).
	Health string
	// HealthRaw is the raw health line backing Health.
	HealthRaw string
	// Observed is set when the GPU appeared in the health read-out.
	Observed bool
	// Selected is set when the GPU was targeted by at least one job.
	Selected bool
	// FailingJob is the name of the job that determined RunStatus.
	FailingJob string
}
|
||||||
|
|
||||||
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
|
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
|
||||||
type AMDGPUInfo struct {
|
type AMDGPUInfo struct {
|
||||||
Index int `json:"index"`
|
Index int `json:"index"`
|
||||||
@@ -269,6 +300,72 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
|
|||||||
return gpus, nil
|
return gpus, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *System) ListNvidiaGPUStatuses() ([]NvidiaGPUStatus, error) {
|
||||||
|
out, err := satExecCommand(
|
||||||
|
"nvidia-smi",
|
||||||
|
"--query-gpu=index,name,pci.bus_id,serial,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total",
|
||||||
|
"--format=csv,noheader,nounits",
|
||||||
|
).Output()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
||||||
|
}
|
||||||
|
var gpus []NvidiaGPUStatus
|
||||||
|
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if line == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
parts := strings.Split(line, ",")
|
||||||
|
if len(parts) < 4 {
|
||||||
|
gpus = append(gpus, NvidiaGPUStatus{RawLine: line, Status: "UNKNOWN", ParseFailure: true})
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||||
|
if err != nil {
|
||||||
|
gpus = append(gpus, NvidiaGPUStatus{RawLine: line, Status: "UNKNOWN", ParseFailure: true})
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
upper := strings.ToUpper(line)
|
||||||
|
needsReset := strings.Contains(upper, "GPU REQUIRES RESET")
|
||||||
|
status := "OK"
|
||||||
|
if needsReset {
|
||||||
|
status = "RESET_REQUIRED"
|
||||||
|
}
|
||||||
|
gpus = append(gpus, NvidiaGPUStatus{
|
||||||
|
Index: idx,
|
||||||
|
Name: strings.TrimSpace(parts[1]),
|
||||||
|
BDF: normalizeNvidiaBusID(strings.TrimSpace(parts[2])),
|
||||||
|
Serial: strings.TrimSpace(parts[3]),
|
||||||
|
Status: status,
|
||||||
|
RawLine: line,
|
||||||
|
NeedsReset: needsReset,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
sort.Slice(gpus, func(i, j int) bool { return gpus[i].Index < gpus[j].Index })
|
||||||
|
return gpus, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalizeNvidiaBusID lower-cases and trims a PCI bus ID and, when it has
// exactly three colon-separated parts whose first part is longer than four
// characters, shortens that first part to its last four characters
// (for example "00000000:4E:00.0" becomes "0000:4e:00.0"). Any other input is
// returned lower-cased and trimmed but otherwise unchanged.
func normalizeNvidiaBusID(v string) string {
	busID := strings.TrimSpace(strings.ToLower(v))
	segments := strings.Split(busID, ":")
	if len(segments) != 3 {
		return busID
	}
	domain := segments[0]
	if len(domain) <= 4 {
		return busID
	}
	return domain[len(domain)-4:] + ":" + segments[1] + ":" + segments[2]
}
|
||||||
|
|
||||||
|
func (s *System) ResetNvidiaGPU(index int) (string, error) {
|
||||||
|
if index < 0 {
|
||||||
|
return "", fmt.Errorf("gpu index must be >= 0")
|
||||||
|
}
|
||||||
|
raw, err := satExecCommand("nvidia-smi", "-r", "-i", strconv.Itoa(index)).CombinedOutput()
|
||||||
|
if len(raw) == 0 && err == nil {
|
||||||
|
raw = []byte("GPU reset completed.\n")
|
||||||
|
}
|
||||||
|
return string(raw), err
|
||||||
|
}
|
||||||
|
|
||||||
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
|
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
|
||||||
// Measures collective communication bandwidth over NVLink/PCIe.
|
// Measures collective communication bandwidth over NVLink/PCIe.
|
||||||
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
@@ -434,9 +531,13 @@ func memoryStressSizeArg() string {
|
|||||||
return fmt.Sprintf("%dM", targetMB)
|
return fmt.Sprintf("%dM", targetMB)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
||||||
sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
|
if sizeMB <= 0 {
|
||||||
passes := envInt("BEE_MEMTESTER_PASSES", 1)
|
sizeMB = 256
|
||||||
|
}
|
||||||
|
if passes <= 0 {
|
||||||
|
passes = 1
|
||||||
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
||||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||||
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
||||||
@@ -493,7 +594,7 @@ func (s *System) RunCPUAcceptancePack(ctx context.Context, baseDir string, durat
|
|||||||
}, logFunc)
|
}, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||||
if baseDir == "" {
|
if baseDir == "" {
|
||||||
baseDir = "/var/log/bee-sat"
|
baseDir = "/var/log/bee-sat"
|
||||||
}
|
}
|
||||||
@@ -525,7 +626,7 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, l
|
|||||||
break
|
break
|
||||||
}
|
}
|
||||||
prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
|
prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
|
||||||
commands := storageSATCommands(devPath)
|
commands := storageSATCommands(devPath, extended)
|
||||||
for cmdIndex, job := range commands {
|
for cmdIndex, job := range commands {
|
||||||
if ctx.Err() != nil {
|
if ctx.Err() != nil {
|
||||||
break
|
break
|
||||||
@@ -604,7 +705,7 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
|||||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||||
satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||||
satJob{name: "04-dcgmi-diag.log", cmd: diagArgs},
|
satJob{name: "04-dcgmi-diag.log", cmd: diagArgs, gpuIndices: gpuIndices},
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -652,11 +753,23 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
|||||||
|
|
||||||
var summary strings.Builder
|
var summary strings.Builder
|
||||||
stats := satStats{}
|
stats := satStats{}
|
||||||
|
nvidiaPack := strings.HasPrefix(prefix, "gpu-nvidia")
|
||||||
|
perGPU := map[int]*nvidiaGPUStatusFile{}
|
||||||
|
selectedGPUIndices := map[int]struct{}{}
|
||||||
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||||
for _, job := range jobs {
|
for _, job := range jobs {
|
||||||
if ctx.Err() != nil {
|
if ctx.Err() != nil {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
for _, idx := range job.gpuIndices {
|
||||||
|
selectedGPUIndices[idx] = struct{}{}
|
||||||
|
status := perGPU[idx]
|
||||||
|
if status == nil {
|
||||||
|
status = &nvidiaGPUStatusFile{Index: idx}
|
||||||
|
perGPU[idx] = status
|
||||||
|
}
|
||||||
|
status.Selected = true
|
||||||
|
}
|
||||||
cmd := make([]string, 0, len(job.cmd))
|
cmd := make([]string, 0, len(job.cmd))
|
||||||
for _, arg := range job.cmd {
|
for _, arg := range job.cmd {
|
||||||
cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
|
cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
|
||||||
@@ -665,11 +778,38 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
|||||||
var out []byte
|
var out []byte
|
||||||
var err error
|
var err error
|
||||||
|
|
||||||
|
if nvidiaPack && nvidiaJobNeedsHealthCheck(job) {
|
||||||
|
if msg, healthErr := checkNvidiaJobHealth(job.gpuIndices); healthErr != nil {
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(msg)
|
||||||
|
}
|
||||||
|
out = []byte(msg + "\n")
|
||||||
|
err = healthErr
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if err == nil {
|
||||||
if job.collectGPU {
|
if job.collectGPU {
|
||||||
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
|
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
|
||||||
} else {
|
} else {
|
||||||
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
|
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if nvidiaPack && nvidiaJobNeedsHealthCheck(job) {
|
||||||
|
if msg, healthErr := checkNvidiaJobHealth(job.gpuIndices); healthErr != nil {
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(msg)
|
||||||
|
}
|
||||||
|
if len(out) > 0 && !bytes.HasSuffix(out, []byte("\n")) {
|
||||||
|
out = append(out, '\n')
|
||||||
|
}
|
||||||
|
out = append(out, []byte(msg+"\n")...)
|
||||||
|
if err == nil {
|
||||||
|
err = healthErr
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
||||||
return "", writeErr
|
return "", writeErr
|
||||||
@@ -679,6 +819,11 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
|||||||
}
|
}
|
||||||
status, rc := classifySATResult(job.name, out, err)
|
status, rc := classifySATResult(job.name, out, err)
|
||||||
stats.Add(status)
|
stats.Add(status)
|
||||||
|
if nvidiaPack && len(job.gpuIndices) > 0 && nvidiaJobNeedsHealthCheck(job) {
|
||||||
|
for _, idx := range job.gpuIndices {
|
||||||
|
updateNvidiaGPUStatus(perGPU, idx, status, job.name, string(out))
|
||||||
|
}
|
||||||
|
}
|
||||||
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
|
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
|
||||||
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
|
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
|
||||||
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
|
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
|
||||||
@@ -687,6 +832,11 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
|||||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
if nvidiaPack {
|
||||||
|
if err := writeNvidiaGPUStatusFiles(runDir, stats.Overall(), perGPU, selectedGPUIndices); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
|
archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
|
||||||
if err := createTarGz(archive, runDir); err != nil {
|
if err := createTarGz(archive, runDir); err != nil {
|
||||||
@@ -695,6 +845,197 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
|||||||
return archive, nil
|
return archive, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func updateNvidiaGPUStatus(perGPU map[int]*nvidiaGPUStatusFile, idx int, status, jobName, detail string) {
|
||||||
|
entry := perGPU[idx]
|
||||||
|
if entry == nil {
|
||||||
|
entry = &nvidiaGPUStatusFile{Index: idx}
|
||||||
|
perGPU[idx] = entry
|
||||||
|
}
|
||||||
|
if nvidiaSATStatusSeverity(status) >= nvidiaSATStatusSeverity(entry.RunStatus) {
|
||||||
|
entry.RunStatus = status
|
||||||
|
entry.FailingJob = jobName
|
||||||
|
entry.Reason = firstLine(detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeNvidiaGPUStatusFiles(runDir, overall string, perGPU map[int]*nvidiaGPUStatusFile, selected map[int]struct{}) error {
|
||||||
|
health, err := readNvidiaGPUHealth()
|
||||||
|
if err == nil {
|
||||||
|
for _, gpu := range health {
|
||||||
|
entry := perGPU[gpu.Index]
|
||||||
|
if entry == nil {
|
||||||
|
entry = &nvidiaGPUStatusFile{Index: gpu.Index}
|
||||||
|
perGPU[gpu.Index] = entry
|
||||||
|
}
|
||||||
|
entry.Name = gpu.Name
|
||||||
|
entry.Observed = true
|
||||||
|
entry.HealthRaw = gpu.RawLine
|
||||||
|
if gpu.NeedsReset {
|
||||||
|
entry.Health = "RESET_REQUIRED"
|
||||||
|
if entry.RunStatus == "" || nvidiaSATStatusSeverity("FAILED") >= nvidiaSATStatusSeverity(entry.RunStatus) {
|
||||||
|
entry.RunStatus = "FAILED"
|
||||||
|
if strings.TrimSpace(entry.Reason) == "" {
|
||||||
|
entry.Reason = "GPU requires reset"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
entry.Health = "OK"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for idx := range selected {
|
||||||
|
entry := perGPU[idx]
|
||||||
|
if entry == nil {
|
||||||
|
entry = &nvidiaGPUStatusFile{Index: idx}
|
||||||
|
perGPU[idx] = entry
|
||||||
|
}
|
||||||
|
entry.Selected = true
|
||||||
|
}
|
||||||
|
var indices []int
|
||||||
|
for idx := range perGPU {
|
||||||
|
indices = append(indices, idx)
|
||||||
|
}
|
||||||
|
sort.Ints(indices)
|
||||||
|
for _, idx := range indices {
|
||||||
|
entry := perGPU[idx]
|
||||||
|
if entry.RunStatus == "" {
|
||||||
|
entry.RunStatus = overall
|
||||||
|
}
|
||||||
|
if entry.Health == "" {
|
||||||
|
entry.Health = "UNKNOWN"
|
||||||
|
}
|
||||||
|
if entry.Name == "" {
|
||||||
|
entry.Name = "unknown"
|
||||||
|
}
|
||||||
|
var body strings.Builder
|
||||||
|
fmt.Fprintf(&body, "gpu_index=%d\n", entry.Index)
|
||||||
|
fmt.Fprintf(&body, "gpu_name=%s\n", entry.Name)
|
||||||
|
fmt.Fprintf(&body, "selected=%t\n", entry.Selected)
|
||||||
|
fmt.Fprintf(&body, "observed=%t\n", entry.Observed)
|
||||||
|
fmt.Fprintf(&body, "run_status=%s\n", entry.RunStatus)
|
||||||
|
fmt.Fprintf(&body, "health_status=%s\n", entry.Health)
|
||||||
|
if strings.TrimSpace(entry.FailingJob) != "" {
|
||||||
|
fmt.Fprintf(&body, "failing_job=%s\n", entry.FailingJob)
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(entry.Reason) != "" {
|
||||||
|
fmt.Fprintf(&body, "reason=%s\n", entry.Reason)
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(entry.HealthRaw) != "" {
|
||||||
|
fmt.Fprintf(&body, "health_raw=%s\n", entry.HealthRaw)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-status.txt", idx)), []byte(body.String()), 0644); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// nvidiaSATStatusSeverity ranks a SAT status string so that statuses can be
// compared: FAILED (3) > PARTIAL/UNSUPPORTED (2) > OK (1) > anything else (0).
// Matching ignores case and surrounding whitespace.
func nvidiaSATStatusSeverity(status string) int {
	normalized := strings.ToUpper(strings.TrimSpace(status))
	switch normalized {
	case "OK":
		return 1
	case "PARTIAL", "UNSUPPORTED":
		return 2
	case "FAILED":
		return 3
	}
	// Empty or unrecognised statuses rank below everything else.
	return 0
}
|
||||||
|
|
||||||
|
// firstLine returns the first line of s with surrounding whitespace removed.
// Leading blank lines are skipped because s is trimmed before it is split;
// an empty or whitespace-only input yields "".
func firstLine(s string) string {
	trimmed := strings.TrimSpace(s)
	// SplitN always returns at least one element, so indexing [0] is safe
	// even for an empty string.
	head := strings.SplitN(trimmed, "\n", 2)[0]
	return strings.TrimSpace(head)
}
|
||||||
|
|
||||||
|
func nvidiaJobNeedsHealthCheck(job satJob) bool {
|
||||||
|
if job.collectGPU {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
name := strings.ToLower(strings.TrimSpace(job.name))
|
||||||
|
return strings.Contains(name, "dcgmi") ||
|
||||||
|
strings.Contains(name, "gpu-burn") ||
|
||||||
|
strings.Contains(name, "gpu-stress") ||
|
||||||
|
strings.Contains(name, "dcgmproftester")
|
||||||
|
}
|
||||||
|
|
||||||
|
func checkNvidiaJobHealth(selected []int) (string, error) {
|
||||||
|
health, err := readNvidiaGPUHealth()
|
||||||
|
if err != nil {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
var bad []nvidiaGPUHealth
|
||||||
|
selectedSet := make(map[int]struct{}, len(selected))
|
||||||
|
for _, idx := range selected {
|
||||||
|
selectedSet[idx] = struct{}{}
|
||||||
|
}
|
||||||
|
for _, gpu := range health {
|
||||||
|
if len(selectedSet) > 0 {
|
||||||
|
if _, ok := selectedSet[gpu.Index]; !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if gpu.NeedsReset {
|
||||||
|
bad = append(bad, gpu)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(bad) == 0 {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
lines := make([]string, 0, len(bad)+1)
|
||||||
|
lines = append(lines, "NVIDIA GPU health check failed:")
|
||||||
|
for _, gpu := range bad {
|
||||||
|
lines = append(lines, fmt.Sprintf("gpu %d (%s) requires reset: %s", gpu.Index, gpu.Name, gpu.RawLine))
|
||||||
|
}
|
||||||
|
return strings.Join(lines, "\n"), errors.New("nvidia gpu requires reset")
|
||||||
|
}
|
||||||
|
|
||||||
|
func readNvidiaGPUHealth() ([]nvidiaGPUHealth, error) {
|
||||||
|
out, err := satExecCommand(
|
||||||
|
"nvidia-smi",
|
||||||
|
"--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total",
|
||||||
|
"--format=csv,noheader,nounits",
|
||||||
|
).Output()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
||||||
|
}
|
||||||
|
return parseNvidiaGPUHealth(string(out)), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseNvidiaGPUHealth(raw string) []nvidiaGPUHealth {
|
||||||
|
var gpus []nvidiaGPUHealth
|
||||||
|
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if line == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
parts := strings.Split(line, ",")
|
||||||
|
if len(parts) < 2 {
|
||||||
|
gpus = append(gpus, nvidiaGPUHealth{RawLine: line, ParseFailure: true})
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||||
|
if err != nil {
|
||||||
|
gpus = append(gpus, nvidiaGPUHealth{RawLine: line, ParseFailure: true})
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
upper := strings.ToUpper(line)
|
||||||
|
gpus = append(gpus, nvidiaGPUHealth{
|
||||||
|
Index: idx,
|
||||||
|
Name: strings.TrimSpace(parts[1]),
|
||||||
|
NeedsReset: strings.Contains(upper, "GPU REQUIRES RESET"),
|
||||||
|
RawLine: line,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return gpus
|
||||||
|
}
|
||||||
|
|
||||||
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string, logFunc func(string)) ([]byte, error) {
|
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string, logFunc func(string)) ([]byte, error) {
|
||||||
start := time.Now().UTC()
|
start := time.Now().UTC()
|
||||||
resolvedCmd, err := resolveSATCommand(cmd)
|
resolvedCmd, err := resolveSATCommand(cmd)
|
||||||
@@ -749,17 +1090,25 @@ func listStorageDevices() ([]string, error) {
|
|||||||
return parseStorageDevices(string(out)), nil
|
return parseStorageDevices(string(out)), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func storageSATCommands(devPath string) []satJob {
|
func storageSATCommands(devPath string, extended bool) []satJob {
|
||||||
if strings.Contains(filepath.Base(devPath), "nvme") {
|
if strings.Contains(filepath.Base(devPath), "nvme") {
|
||||||
|
selfTestLevel := "1"
|
||||||
|
if extended {
|
||||||
|
selfTestLevel = "2"
|
||||||
|
}
|
||||||
return []satJob{
|
return []satJob{
|
||||||
{name: "nvme-id-ctrl", cmd: []string{"nvme", "id-ctrl", devPath, "-o", "json"}},
|
{name: "nvme-id-ctrl", cmd: []string{"nvme", "id-ctrl", devPath, "-o", "json"}},
|
||||||
{name: "nvme-smart-log", cmd: []string{"nvme", "smart-log", devPath, "-o", "json"}},
|
{name: "nvme-smart-log", cmd: []string{"nvme", "smart-log", devPath, "-o", "json"}},
|
||||||
{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", "1", "--wait"}},
|
{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", selfTestLevel, "--wait"}},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
smartTestType := "short"
|
||||||
|
if extended {
|
||||||
|
smartTestType = "long"
|
||||||
|
}
|
||||||
return []satJob{
|
return []satJob{
|
||||||
{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}},
|
{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}},
|
||||||
{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", "short", devPath}},
|
{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", smartTestType, devPath}},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -818,6 +1167,11 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
|
|||||||
// nvidia-smi on a machine with no NVIDIA GPU
|
// nvidia-smi on a machine with no NVIDIA GPU
|
||||||
strings.Contains(text, "couldn't communicate with the nvidia driver") ||
|
strings.Contains(text, "couldn't communicate with the nvidia driver") ||
|
||||||
strings.Contains(text, "no nvidia gpu") ||
|
strings.Contains(text, "no nvidia gpu") ||
|
||||||
|
// Some NVMe firmwares start self-test but never expose progress to nvme-cli
|
||||||
|
// while waiting, so the CLI stops polling without proving device failure.
|
||||||
|
(strings.Contains(name, "self-test") &&
|
||||||
|
strings.Contains(text, "no progress for") &&
|
||||||
|
strings.Contains(text, "stop waiting")) ||
|
||||||
(strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) {
|
(strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) {
|
||||||
return "UNSUPPORTED", rc
|
return "UNSUPPORTED", rc
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ type FanStressOptions struct {
|
|||||||
Phase1DurSec int // first load phase duration in seconds (default 300)
|
Phase1DurSec int // first load phase duration in seconds (default 300)
|
||||||
PauseSec int // pause between the two load phases (default 60)
|
PauseSec int // pause between the two load phases (default 60)
|
||||||
Phase2DurSec int // second load phase duration in seconds (default 300)
|
Phase2DurSec int // second load phase duration in seconds (default 300)
|
||||||
SizeMB int // GPU memory to allocate per GPU during stress (default 64)
|
SizeMB int // GPU memory to allocate per GPU during stress (0 = auto: 95% of VRAM)
|
||||||
GPUIndices []int // which GPU indices to stress (empty = all detected)
|
GPUIndices []int // which GPU indices to stress (empty = all detected)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -243,9 +243,8 @@ func applyFanStressDefaults(opts *FanStressOptions) {
|
|||||||
if opts.Phase2DurSec <= 0 {
|
if opts.Phase2DurSec <= 0 {
|
||||||
opts.Phase2DurSec = 300
|
opts.Phase2DurSec = 300
|
||||||
}
|
}
|
||||||
if opts.SizeMB <= 0 {
|
// SizeMB == 0 means "auto" (worker picks 95% of GPU VRAM for maximum power draw).
|
||||||
opts.SizeMB = 64
|
// Leave at 0 to avoid passing a too-small size that starves the tensor-core path.
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// sampleFanStressRow collects all metrics for one telemetry sample.
|
// sampleFanStressRow collects all metrics for one telemetry sample.
|
||||||
|
|||||||
@@ -14,12 +14,12 @@ import (
|
|||||||
func TestStorageSATCommands(t *testing.T) {
|
func TestStorageSATCommands(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
nvme := storageSATCommands("/dev/nvme0n1")
|
nvme := storageSATCommands("/dev/nvme0n1", false)
|
||||||
if len(nvme) != 3 || nvme[2].cmd[0] != "nvme" {
|
if len(nvme) != 3 || nvme[2].cmd[0] != "nvme" {
|
||||||
t.Fatalf("unexpected nvme commands: %#v", nvme)
|
t.Fatalf("unexpected nvme commands: %#v", nvme)
|
||||||
}
|
}
|
||||||
|
|
||||||
sata := storageSATCommands("/dev/sda")
|
sata := storageSATCommands("/dev/sda", false)
|
||||||
if len(sata) != 2 || sata[0].cmd[0] != "smartctl" {
|
if len(sata) != 2 || sata[0].cmd[0] != "smartctl" {
|
||||||
t.Fatalf("unexpected sata commands: %#v", sata)
|
t.Fatalf("unexpected sata commands: %#v", sata)
|
||||||
}
|
}
|
||||||
@@ -216,6 +216,74 @@ func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParseNvidiaGPUHealthDetectsResetRequired(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
got := parseNvidiaGPUHealth("0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n")
|
||||||
|
if len(got) != 2 {
|
||||||
|
t.Fatalf("len=%d want 2", len(got))
|
||||||
|
}
|
||||||
|
if got[0].NeedsReset {
|
||||||
|
t.Fatalf("gpu0 unexpectedly marked reset-required")
|
||||||
|
}
|
||||||
|
if !got[1].NeedsReset {
|
||||||
|
t.Fatalf("gpu1 should be marked reset-required: %#v", got[1])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCheckNvidiaJobHealthReturnsErrorForSelectedResetRequiredGPU(t *testing.T) {
|
||||||
|
oldExecCommand := satExecCommand
|
||||||
|
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||||
|
if name == "nvidia-smi" {
|
||||||
|
return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'")
|
||||||
|
}
|
||||||
|
return exec.Command(name, args...)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||||
|
|
||||||
|
msg, err := checkNvidiaJobHealth([]int{1})
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected health check error")
|
||||||
|
}
|
||||||
|
if !strings.Contains(msg, "gpu 1") || !strings.Contains(strings.ToLower(msg), "requires reset") {
|
||||||
|
t.Fatalf("unexpected message: %q", msg)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestWriteNvidiaGPUStatusFilesCreatesPerGPUFiles(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
oldExecCommand := satExecCommand
|
||||||
|
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||||
|
if name == "nvidia-smi" {
|
||||||
|
return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'")
|
||||||
|
}
|
||||||
|
return exec.Command(name, args...)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||||
|
|
||||||
|
perGPU := map[int]*nvidiaGPUStatusFile{
|
||||||
|
0: {Index: 0, RunStatus: "OK"},
|
||||||
|
1: {Index: 1, RunStatus: "FAILED", FailingJob: "02-dcgmi-targeted-stress.log", Reason: "NVIDIA GPU health check failed:"},
|
||||||
|
}
|
||||||
|
if err := writeNvidiaGPUStatusFiles(dir, "FAILED", perGPU, map[int]struct{}{0: {}, 1: {}}); err != nil {
|
||||||
|
t.Fatalf("writeNvidiaGPUStatusFiles error: %v", err)
|
||||||
|
}
|
||||||
|
raw, err := os.ReadFile(filepath.Join(dir, "gpu-1-status.txt"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ReadFile gpu-1-status.txt: %v", err)
|
||||||
|
}
|
||||||
|
text := string(raw)
|
||||||
|
if !strings.Contains(text, "run_status=FAILED") {
|
||||||
|
t.Fatalf("missing run status:\n%s", text)
|
||||||
|
}
|
||||||
|
if !strings.Contains(text, "health_status=RESET_REQUIRED") {
|
||||||
|
t.Fatalf("missing health status:\n%s", text)
|
||||||
|
}
|
||||||
|
if !strings.Contains(text, "failing_job=02-dcgmi-targeted-stress.log") {
|
||||||
|
t.Fatalf("missing failing job:\n%s", text)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestResolveDCGMProfTesterCommandUsesVersionedBinary(t *testing.T) {
|
func TestResolveDCGMProfTesterCommandUsesVersionedBinary(t *testing.T) {
|
||||||
oldLookPath := satLookPath
|
oldLookPath := satLookPath
|
||||||
satLookPath = func(file string) (string, error) {
|
satLookPath = func(file string) (string, error) {
|
||||||
@@ -341,6 +409,7 @@ func TestClassifySATResult(t *testing.T) {
|
|||||||
}{
|
}{
|
||||||
{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
|
{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
|
||||||
{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||||
|
{name: "nvme wait timeout without progress", job: "nvme-device-self-test", out: "Short Device self-test started\nWaiting for self test completion...\nno progress for 78 seconds, stop waiting", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||||
{name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
{name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
||||||
{name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
{name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -28,6 +28,12 @@ var apiListNvidiaGPUs = func(a *app.App) ([]platform.NvidiaGPU, error) {
|
|||||||
}
|
}
|
||||||
return a.ListNvidiaGPUs()
|
return a.ListNvidiaGPUs()
|
||||||
}
|
}
|
||||||
|
var apiListNvidiaGPUStatuses = func(a *app.App) ([]platform.NvidiaGPUStatus, error) {
|
||||||
|
if a == nil {
|
||||||
|
return nil, fmt.Errorf("app not configured")
|
||||||
|
}
|
||||||
|
return a.ListNvidiaGPUStatuses()
|
||||||
|
}
|
||||||
|
|
||||||
// ── Job ID counter ────────────────────────────────────────────────────────────
|
// ── Job ID counter ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
@@ -216,7 +222,21 @@ func formatSplitTaskName(baseName, selectionLabel string) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func buildNvidiaTaskSet(target string, priority int, createdAt time.Time, params taskParams, baseName string, appRef *app.App, idPrefix string) ([]*Task, error) {
|
func buildNvidiaTaskSet(target string, priority int, createdAt time.Time, params taskParams, baseName string, appRef *app.App, idPrefix string) ([]*Task, error) {
|
||||||
if !shouldSplitHomogeneousNvidiaTarget(target) {
|
if !shouldSplitHomogeneousNvidiaTarget(target) || params.ParallelGPUs {
|
||||||
|
// Parallel mode (or non-splittable target): one task for all selected GPUs.
|
||||||
|
if params.ParallelGPUs && shouldSplitHomogeneousNvidiaTarget(target) {
|
||||||
|
// Resolve the selected GPU indices so ExcludeGPUIndices is applied.
|
||||||
|
gpus, err := apiListNvidiaGPUs(appRef)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
resolved, err := expandSelectedGPUIndices(gpus, params.GPUIndices, params.ExcludeGPUIndices)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
params.GPUIndices = resolved
|
||||||
|
params.ExcludeGPUIndices = nil
|
||||||
|
}
|
||||||
t := &Task{
|
t := &Task{
|
||||||
ID: newJobID(idPrefix),
|
ID: newJobID(idPrefix),
|
||||||
Name: baseName,
|
Name: baseName,
|
||||||
@@ -256,6 +276,53 @@ func buildNvidiaTaskSet(target string, priority int, createdAt time.Time, params
|
|||||||
return tasks, nil
|
return tasks, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// expandSelectedGPUIndices returns the sorted list of selected GPU indices after
|
||||||
|
// applying include/exclude filters, without splitting by model.
|
||||||
|
func expandSelectedGPUIndices(gpus []platform.NvidiaGPU, include, exclude []int) ([]int, error) {
|
||||||
|
indexed := make(map[int]struct{}, len(gpus))
|
||||||
|
allIndices := make([]int, 0, len(gpus))
|
||||||
|
for _, gpu := range gpus {
|
||||||
|
indexed[gpu.Index] = struct{}{}
|
||||||
|
allIndices = append(allIndices, gpu.Index)
|
||||||
|
}
|
||||||
|
sort.Ints(allIndices)
|
||||||
|
|
||||||
|
selected := allIndices
|
||||||
|
if len(include) > 0 {
|
||||||
|
selected = make([]int, 0, len(include))
|
||||||
|
seen := make(map[int]struct{}, len(include))
|
||||||
|
for _, idx := range include {
|
||||||
|
if _, ok := indexed[idx]; !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, dup := seen[idx]; dup {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[idx] = struct{}{}
|
||||||
|
selected = append(selected, idx)
|
||||||
|
}
|
||||||
|
sort.Ints(selected)
|
||||||
|
}
|
||||||
|
if len(exclude) > 0 {
|
||||||
|
skip := make(map[int]struct{}, len(exclude))
|
||||||
|
for _, idx := range exclude {
|
||||||
|
skip[idx] = struct{}{}
|
||||||
|
}
|
||||||
|
filtered := selected[:0]
|
||||||
|
for _, idx := range selected {
|
||||||
|
if _, ok := skip[idx]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
filtered = append(filtered, idx)
|
||||||
|
}
|
||||||
|
selected = filtered
|
||||||
|
}
|
||||||
|
if len(selected) == 0 {
|
||||||
|
return nil, fmt.Errorf("no NVIDIA GPUs selected")
|
||||||
|
}
|
||||||
|
return selected, nil
|
||||||
|
}
|
||||||
|
|
||||||
// ── SSE helpers ───────────────────────────────────────────────────────────────
|
// ── SSE helpers ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
func sseWrite(w http.ResponseWriter, event, data string) bool {
|
func sseWrite(w http.ResponseWriter, event, data string) bool {
|
||||||
@@ -417,7 +484,7 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
|||||||
|
|
||||||
var body struct {
|
var body struct {
|
||||||
Duration int `json:"duration"`
|
Duration int `json:"duration"`
|
||||||
DiagLevel int `json:"diag_level"`
|
StressMode bool `json:"stress_mode"`
|
||||||
GPUIndices []int `json:"gpu_indices"`
|
GPUIndices []int `json:"gpu_indices"`
|
||||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||||
Loader string `json:"loader"`
|
Loader string `json:"loader"`
|
||||||
@@ -438,7 +505,7 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
|||||||
}
|
}
|
||||||
params := taskParams{
|
params := taskParams{
|
||||||
Duration: body.Duration,
|
Duration: body.Duration,
|
||||||
DiagLevel: body.DiagLevel,
|
StressMode: body.StressMode,
|
||||||
GPUIndices: body.GPUIndices,
|
GPUIndices: body.GPUIndices,
|
||||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||||
Loader: body.Loader,
|
Loader: body.Loader,
|
||||||
@@ -470,6 +537,7 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
|
|||||||
GPUIndices []int `json:"gpu_indices"`
|
GPUIndices []int `json:"gpu_indices"`
|
||||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||||
RunNCCL *bool `json:"run_nccl"`
|
RunNCCL *bool `json:"run_nccl"`
|
||||||
|
ParallelGPUs *bool `json:"parallel_gpus"`
|
||||||
DisplayName string `json:"display_name"`
|
DisplayName string `json:"display_name"`
|
||||||
}
|
}
|
||||||
if r.Body != nil {
|
if r.Body != nil {
|
||||||
@@ -483,6 +551,10 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
|
|||||||
if body.RunNCCL != nil {
|
if body.RunNCCL != nil {
|
||||||
runNCCL = *body.RunNCCL
|
runNCCL = *body.RunNCCL
|
||||||
}
|
}
|
||||||
|
parallelGPUs := false
|
||||||
|
if body.ParallelGPUs != nil {
|
||||||
|
parallelGPUs = *body.ParallelGPUs
|
||||||
|
}
|
||||||
name := taskDisplayName("nvidia-benchmark", "", "")
|
name := taskDisplayName("nvidia-benchmark", "", "")
|
||||||
if strings.TrimSpace(body.DisplayName) != "" {
|
if strings.TrimSpace(body.DisplayName) != "" {
|
||||||
name = body.DisplayName
|
name = body.DisplayName
|
||||||
@@ -493,6 +565,7 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
|
|||||||
SizeMB: body.SizeMB,
|
SizeMB: body.SizeMB,
|
||||||
BenchmarkProfile: body.Profile,
|
BenchmarkProfile: body.Profile,
|
||||||
RunNCCL: runNCCL,
|
RunNCCL: runNCCL,
|
||||||
|
ParallelGPUs: parallelGPUs,
|
||||||
DisplayName: body.DisplayName,
|
DisplayName: body.DisplayName,
|
||||||
}, name, h.opts.App, "benchmark-nvidia")
|
}, name, h.opts.App, "benchmark-nvidia")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -782,6 +855,42 @@ func (h *handler) handleAPIGNVIDIAGPUs(w http.ResponseWriter, _ *http.Request) {
|
|||||||
writeJSON(w, gpus)
|
writeJSON(w, gpus)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIGNVIDIAGPUStatuses(w http.ResponseWriter, _ *http.Request) {
|
||||||
|
if h.opts.App == nil {
|
||||||
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
gpus, err := apiListNvidiaGPUStatuses(h.opts.App)
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if gpus == nil {
|
||||||
|
gpus = []platform.NvidiaGPUStatus{}
|
||||||
|
}
|
||||||
|
writeJSON(w, gpus)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIGNVIDIAReset(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if h.opts.App == nil {
|
||||||
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var req struct {
|
||||||
|
Index int `json:"index"`
|
||||||
|
}
|
||||||
|
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||||
|
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
result, err := h.opts.App.ResetNvidiaGPU(req.Index)
|
||||||
|
status := "ok"
|
||||||
|
if err != nil {
|
||||||
|
status = "error"
|
||||||
|
}
|
||||||
|
writeJSON(w, map[string]string{"status": status, "output": result.Body})
|
||||||
|
}
|
||||||
|
|
||||||
func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
|
||||||
if h.opts.App == nil {
|
if h.opts.App == nil {
|
||||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
|
|||||||
@@ -1036,10 +1036,12 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
<div class="card-body validate-profile-body">
|
<div class="card-body validate-profile-body">
|
||||||
<div class="validate-profile-col">
|
<div class="validate-profile-col">
|
||||||
<div class="form-row" style="margin:0"><label>Cycles</label><input type="number" id="sat-cycles" value="1" min="1" max="100" style="width:100%"></div>
|
<div class="form-row" style="margin:0"><label>Cycles</label><input type="number" id="sat-cycles" value="1" min="1" max="100" style="width:100%"></div>
|
||||||
<div class="form-row" style="margin:12px 0 0"><label>Diag level</label><select id="sat-profile-nvidia-level" style="width:100%"><option value="1">Level 1 — Quick</option><option value="2">Level 2 — Standard</option><option value="3">Level 3 — Extended</option><option value="4">Level 4 — Full</option></select></div>
|
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
||||||
|
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
||||||
|
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (~30–60 min)</span></label>
|
||||||
</div>
|
</div>
|
||||||
<div class="validate-profile-col validate-profile-action">
|
<div class="validate-profile-col validate-profile-action">
|
||||||
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count. NVIDIA <code>dcgmi diag</code> uses the selected diag level from this profile.</p>
|
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~5–15 min total); Stress is thorough (~30–60 min total).</p>
|
||||||
<button class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
<button class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
||||||
</div>
|
</div>
|
||||||
<div class="validate-profile-col"></div>
|
<div class="validate-profile-col"></div>
|
||||||
@@ -1054,19 +1056,19 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.CPU,
|
inv.CPU,
|
||||||
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
|
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
|
||||||
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
|
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
|
||||||
`Duration is taken from Validate Profile diag level: Level 1 = 60s, Level 2 = 5m, Level 3 = 1h, Level 4 = 1h.`,
|
`60s in Validate, 30 min in Stress.`,
|
||||||
)) +
|
)) +
|
||||||
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
|
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
|
||||||
inv.Memory,
|
inv.Memory,
|
||||||
`Runs a short RAM validation pass and records memory state around the test.`,
|
`Runs a RAM validation pass and records memory state around the test.`,
|
||||||
`<code>free</code>, <code>memtester</code>`,
|
`<code>free</code>, <code>memtester</code>`,
|
||||||
`No extra settings.`,
|
`256 MB / 1 pass in Validate, 1 GB / 3 passes in Stress.`,
|
||||||
)) +
|
)) +
|
||||||
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||||||
inv.Storage,
|
inv.Storage,
|
||||||
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
|
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
|
||||||
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
||||||
`No extra settings.`,
|
`Short self-test in Validate, extended self-test in Stress.`,
|
||||||
)) +
|
)) +
|
||||||
`</div>
|
`</div>
|
||||||
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||||
@@ -1083,6 +1085,12 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||||
</div>
|
</div>
|
||||||
<p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA validate tasks.</p>
|
<p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA validate tasks.</p>
|
||||||
|
<div style="margin-top:10px;padding-top:10px;border-top:1px solid var(--border)">
|
||||||
|
<label class="sat-gpu-row" title="When checked, multi-GPU tests (PSU Pulse, NCCL, NVBandwidth) run on ALL GPUs in the system regardless of the selection above.">
|
||||||
|
<input type="checkbox" id="sat-multi-gpu-all" checked onchange="satUpdateGPUSelectionNote()">
|
||||||
|
<span><strong>Multi-GPU tests</strong> — use all GPUs <span style="font-size:11px;color:var(--muted)">(PSU Pulse, NCCL, NVBandwidth)</span></span>
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
@@ -1091,14 +1099,58 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Runs NVIDIA diagnostics and board inventory checks.`,
|
`Runs NVIDIA diagnostics and board inventory checks.`,
|
||||||
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||||
`Runs one GPU at a time on the selected NVIDIA GPUs. Diag level is taken from Validate Profile.`,
|
`Level 2 in Validate, Level 3 in Stress. Runs one GPU at a time on the selected NVIDIA GPUs.`,
|
||||||
)) +
|
)) +
|
||||||
|
`<div id="sat-card-nvidia-targeted-stress">` +
|
||||||
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Runs a controlled NVIDIA DCGM load in Validate to check stability under moderate stress.`,
|
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
||||||
`<code>dcgmi diag targeted_stress</code>`,
|
`<code>dcgmi diag targeted_stress</code>`,
|
||||||
`Runs one GPU at a time on the selected NVIDIA GPUs with the fixed DCGM targeted stress recipe.`,
|
`Skipped in Validate mode. Runs after dcgmi diag in Stress mode. Runs one GPU at a time on the selected NVIDIA GPUs.<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||||
)) +
|
)) +
|
||||||
|
`</div>` +
|
||||||
|
`<div id="sat-card-nvidia-targeted-power">` +
|
||||||
|
renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
||||||
|
`<code>dcgmi diag targeted_power</code>`,
|
||||||
|
`Skipped in Validate mode. Runs in Stress mode only. Runs one GPU at a time.<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||||
|
)) +
|
||||||
|
`</div>` +
|
||||||
|
`<div id="sat-card-nvidia-pulse">` +
|
||||||
|
renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
|
||||||
|
`<code>dcgmi diag pulse_test</code>`,
|
||||||
|
`Skipped in Validate mode. Runs in Stress mode only. Runs all selected GPUs simultaneously — synchronous pulsing is required to stress the PSU.<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||||
|
)) +
|
||||||
|
`</div>` +
|
||||||
|
`<div id="sat-card-nvidia-interconnect">` +
|
||||||
|
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
|
||||||
|
`<code>all_reduce_perf</code> (NCCL tests)`,
|
||||||
|
`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously (requires ≥2).<p id="sat-ni-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||||
|
)) +
|
||||||
|
`</div>` +
|
||||||
|
`<div id="sat-card-nvidia-bandwidth">` +
|
||||||
|
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
||||||
|
`<code>nvbandwidth</code>`,
|
||||||
|
`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously.<p id="sat-nb-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||||
|
)) +
|
||||||
|
`</div>` +
|
||||||
|
`</div>
|
||||||
|
<div class="grid3" style="margin-top:16px">
|
||||||
|
` + `<div id="sat-card-hpl">` +
|
||||||
|
renderSATCard("hpl", "LINPACK (HPL)", "runSAT('hpl')", "", renderValidateCardBody(
|
||||||
|
``,
|
||||||
|
`Standard High Performance LINPACK benchmark. Measures sustained FP64 GFLOPS and memory bandwidth of the CPU subsystem. Uses 80% of available RAM. Pass/fail based on HPL residual check.`,
|
||||||
|
`<code>xhpl</code> (HPL 2.3, OpenBLAS)`,
|
||||||
|
`Skipped in Validate mode. Runs in Stress mode only. Runtime scales with RAM — expect 5–30 min.<p id="sat-hpl-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||||
|
)) +
|
||||||
|
`</div>` +
|
||||||
`</div>
|
`</div>
|
||||||
<div class="grid3" style="margin-top:16px">
|
<div class="grid3" style="margin-top:16px">
|
||||||
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
||||||
@@ -1125,17 +1177,29 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
</style>
|
</style>
|
||||||
<script>
|
<script>
|
||||||
let satES = null;
|
let satES = null;
|
||||||
function satDiagLevel() {
|
function satStressMode() {
|
||||||
return parseInt(document.getElementById('sat-profile-nvidia-level').value) || 1;
|
return document.querySelector('input[name="sat-mode"]:checked')?.value === 'stress';
|
||||||
}
|
}
|
||||||
function satCPUDurationFromDiagLevel() {
|
function satModeChanged() {
|
||||||
const level = satDiagLevel();
|
const stress = satStressMode();
|
||||||
if (level === 1) return 60;
|
[
|
||||||
if (level === 2) return 5 * 60;
|
{card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
|
||||||
return 60 * 60;
|
{card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
|
||||||
|
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
|
||||||
|
{card: 'sat-card-nvidia-interconnect', hint: 'sat-ni-mode-hint'},
|
||||||
|
{card: 'sat-card-nvidia-bandwidth', hint: 'sat-nb-mode-hint'},
|
||||||
|
{card: 'sat-card-hpl', hint: 'sat-hpl-mode-hint'},
|
||||||
|
].forEach(function(item) {
|
||||||
|
const card = document.getElementById(item.card);
|
||||||
|
if (card) {
|
||||||
|
card.style.opacity = stress ? '1' : '0.5';
|
||||||
|
const hint = document.getElementById(item.hint);
|
||||||
|
if (hint) hint.style.display = stress ? 'none' : '';
|
||||||
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
function satLabels() {
|
function satLabels() {
|
||||||
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', hpl:'LINPACK (HPL)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||||
}
|
}
|
||||||
let satNvidiaGPUsPromise = null;
|
let satNvidiaGPUsPromise = null;
|
||||||
function loadSatNvidiaGPUs() {
|
function loadSatNvidiaGPUs() {
|
||||||
@@ -1156,6 +1220,10 @@ function satSelectedGPUIndices() {
|
|||||||
.filter(function(v) { return !Number.isNaN(v); })
|
.filter(function(v) { return !Number.isNaN(v); })
|
||||||
.sort(function(a, b) { return a - b; });
|
.sort(function(a, b) { return a - b; });
|
||||||
}
|
}
|
||||||
|
function satMultiGPUAll() {
|
||||||
|
const cb = document.getElementById('sat-multi-gpu-all');
|
||||||
|
return cb ? cb.checked : true;
|
||||||
|
}
|
||||||
function satUpdateGPUSelectionNote() {
|
function satUpdateGPUSelectionNote() {
|
||||||
const note = document.getElementById('sat-gpu-selection-note');
|
const note = document.getElementById('sat-gpu-selection-note');
|
||||||
if (!note) return;
|
if (!note) return;
|
||||||
@@ -1164,7 +1232,8 @@ function satUpdateGPUSelectionNote() {
|
|||||||
note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA validate tasks.';
|
note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA validate tasks.';
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
note.textContent = 'Selected NVIDIA GPUs: ' + selected.join(', ') + '.';
|
const multiAll = satMultiGPUAll();
|
||||||
|
note.textContent = 'Selected GPUs: ' + selected.join(', ') + '. Multi-GPU tests: ' + (multiAll ? 'all GPUs in system' : 'selected GPUs only') + '.';
|
||||||
}
|
}
|
||||||
function satRenderGPUList(gpus) {
|
function satRenderGPUList(gpus) {
|
||||||
const root = document.getElementById('sat-gpu-list');
|
const root = document.getElementById('sat-gpu-list');
|
||||||
@@ -1211,9 +1280,8 @@ function satRequestBody(target, overrides) {
|
|||||||
const body = {};
|
const body = {};
|
||||||
const labels = satLabels();
|
const labels = satLabels();
|
||||||
body.display_name = labels[target] || ('Validate ' + target);
|
body.display_name = labels[target] || ('Validate ' + target);
|
||||||
if (target === 'nvidia') body.diag_level = satDiagLevel();
|
body.stress_mode = satStressMode();
|
||||||
if (target === 'nvidia-targeted-stress') body.duration = 300;
|
if (target === 'cpu') body.duration = satStressMode() ? 1800 : 60;
|
||||||
if (target === 'cpu') body.duration = satCPUDurationFromDiagLevel();
|
|
||||||
if (overrides) {
|
if (overrides) {
|
||||||
Object.keys(overrides).forEach(key => { body[key] = overrides[key]; });
|
Object.keys(overrides).forEach(key => { body[key] = overrides[key]; });
|
||||||
}
|
}
|
||||||
@@ -1275,8 +1343,28 @@ function runSATWithOverrides(target, overrides) {
|
|||||||
return enqueueSATTarget(target, overrides)
|
return enqueueSATTarget(target, overrides)
|
||||||
.then(d => streamSATTask(d.task_id, title, false));
|
.then(d => streamSATTask(d.task_id, title, false));
|
||||||
}
|
}
|
||||||
|
const nvidiaPerGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power'];
|
||||||
|
// pulse_test and fabric tests run on all selected GPUs simultaneously
|
||||||
|
const nvidiaAllGPUTargets = ['nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||||||
|
function satAllGPUIndicesForMulti() {
|
||||||
|
// If "Multi-GPU tests — all GPUs" is checked, return all detected GPUs.
|
||||||
|
// Otherwise fall back to the per-GPU selection.
|
||||||
|
if (satMultiGPUAll()) {
|
||||||
|
return loadSatNvidiaGPUs().then(function(gpus) {
|
||||||
|
return gpus.map(function(g) { return Number(g.index); });
|
||||||
|
});
|
||||||
|
}
|
||||||
|
const sel = satSelectedGPUIndices();
|
||||||
|
return Promise.resolve(sel);
|
||||||
|
}
|
||||||
function expandSATTarget(target) {
|
function expandSATTarget(target) {
|
||||||
if (target !== 'nvidia' && target !== 'nvidia-targeted-stress') {
|
if (nvidiaAllGPUTargets.indexOf(target) >= 0) {
|
||||||
|
return satAllGPUIndicesForMulti().then(function(indices) {
|
||||||
|
if (!indices.length) return Promise.reject(new Error('No NVIDIA GPUs available.'));
|
||||||
|
return [{target: target, overrides: {gpu_indices: indices, display_name: satLabels()[target] || target}}];
|
||||||
|
});
|
||||||
|
}
|
||||||
|
if (nvidiaPerGPUTargets.indexOf(target) < 0) {
|
||||||
return Promise.resolve([{target: target}]);
|
return Promise.resolve([{target: target}]);
|
||||||
}
|
}
|
||||||
const selected = satSelectedGPUIndices();
|
const selected = satSelectedGPUIndices();
|
||||||
@@ -1292,6 +1380,12 @@ function expandSATTarget(target) {
|
|||||||
label: satGPUDisplayName(gpu)
|
label: satGPUDisplayName(gpu)
|
||||||
})));
|
})));
|
||||||
}
|
}
|
||||||
|
function runNvidiaFabricValidate(target) {
|
||||||
|
satAllGPUIndicesForMulti().then(function(indices) {
|
||||||
|
if (!indices.length) { alert('No NVIDIA GPUs available.'); return; }
|
||||||
|
runSATWithOverrides(target, {gpu_indices: indices, display_name: satLabels()[target] || target});
|
||||||
|
});
|
||||||
|
}
|
||||||
function runNvidiaValidateSet(target) {
|
function runNvidiaValidateSet(target) {
|
||||||
return loadSatNvidiaGPUs().then(gpus => {
|
return loadSatNvidiaGPUs().then(gpus => {
|
||||||
const selected = satSelectedGPUIndices();
|
const selected = satSelectedGPUIndices();
|
||||||
@@ -1354,8 +1448,10 @@ function runAllSAT() {
|
|||||||
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
|
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
|
||||||
const status = document.getElementById('sat-all-status');
|
const status = document.getElementById('sat-all-status');
|
||||||
status.textContent = 'Enqueuing...';
|
status.textContent = 'Enqueuing...';
|
||||||
const baseTargets = ['nvidia','nvidia-targeted-stress','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth', 'hpl'];
|
||||||
|
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','hpl','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
||||||
const activeTargets = baseTargets.filter(target => {
|
const activeTargets = baseTargets.filter(target => {
|
||||||
|
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
|
||||||
const btn = document.getElementById('sat-btn-' + target);
|
const btn = document.getElementById('sat-btn-' + target);
|
||||||
return !(btn && btn.disabled);
|
return !(btn && btn.disabled);
|
||||||
});
|
});
|
||||||
@@ -1390,6 +1486,10 @@ function runAllSAT() {
|
|||||||
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
||||||
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
|
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
|
||||||
if (!gp.nvidia) disableSATCard('nvidia-targeted-stress', 'No NVIDIA GPU detected');
|
if (!gp.nvidia) disableSATCard('nvidia-targeted-stress', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia-targeted-power', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia-pulse', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia-interconnect', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia-bandwidth', 'No NVIDIA GPU detected');
|
||||||
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
|
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
|
||||||
if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
|
if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
|
||||||
});
|
});
|
||||||
@@ -1587,6 +1687,7 @@ type benchmarkHistoryColumn struct {
|
|||||||
label string
|
label string
|
||||||
name string
|
name string
|
||||||
index int
|
index int
|
||||||
|
parallel bool
|
||||||
}
|
}
|
||||||
|
|
||||||
type benchmarkHistoryCell struct {
|
type benchmarkHistoryCell struct {
|
||||||
@@ -1625,6 +1726,10 @@ func renderBenchmark(opts HandlerOptions) string {
|
|||||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
<label class="benchmark-cb-row">
|
||||||
|
<input type="checkbox" id="benchmark-parallel-gpus">
|
||||||
|
<span>Run all selected GPUs simultaneously (parallel mode)</span>
|
||||||
|
</label>
|
||||||
<label class="benchmark-cb-row">
|
<label class="benchmark-cb-row">
|
||||||
<input type="checkbox" id="benchmark-run-nccl" checked>
|
<input type="checkbox" id="benchmark-run-nccl" checked>
|
||||||
<span>Run multi-GPU interconnect step (NCCL) only on the selected GPUs</span>
|
<span>Run multi-GPU interconnect step (NCCL) only on the selected GPUs</span>
|
||||||
@@ -1750,10 +1855,12 @@ function runNvidiaBenchmark() {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
|
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
|
||||||
|
const parallelGPUs = !!document.getElementById('benchmark-parallel-gpus').checked;
|
||||||
const body = {
|
const body = {
|
||||||
profile: document.getElementById('benchmark-profile').value || 'standard',
|
profile: document.getElementById('benchmark-profile').value || 'standard',
|
||||||
gpu_indices: selected,
|
gpu_indices: selected,
|
||||||
run_nccl: !!document.getElementById('benchmark-run-nccl').checked,
|
run_nccl: !!document.getElementById('benchmark-run-nccl').checked,
|
||||||
|
parallel_gpus: parallelGPUs,
|
||||||
display_name: 'NVIDIA Benchmark'
|
display_name: 'NVIDIA Benchmark'
|
||||||
};
|
};
|
||||||
document.getElementById('benchmark-output').style.display = 'block';
|
document.getElementById('benchmark-output').style.display = 'block';
|
||||||
@@ -1887,17 +1994,43 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
|
|||||||
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||||
cells: make(map[string]benchmarkHistoryCell),
|
cells: make(map[string]benchmarkHistoryCell),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if result.ParallelGPUs {
|
||||||
|
// All GPUs ran simultaneously — one column per server, score = avg composite.
|
||||||
|
gpuModelCount := make(map[string]int)
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
key := benchmarkHistoryColumnKey(gpu.Name, gpu.Index)
|
gpuModelCount[strings.TrimSpace(gpu.Name)]++
|
||||||
|
}
|
||||||
|
scoreSum := make(map[string]float64)
|
||||||
|
scoreCnt := make(map[string]int)
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
key := "parallel|" + strings.TrimSpace(result.ServerModel) + "|" + strings.TrimSpace(gpu.Name)
|
||||||
|
scoreSum[key] += gpu.Scores.CompositeScore
|
||||||
|
scoreCnt[key]++
|
||||||
|
count := gpuModelCount[strings.TrimSpace(gpu.Name)]
|
||||||
columnByKey[key] = benchmarkHistoryColumn{
|
columnByKey[key] = benchmarkHistoryColumn{
|
||||||
key: key,
|
key: key,
|
||||||
label: benchmarkHistoryColumnLabel(gpu.Name, gpu.Index),
|
label: benchmarkHistoryParallelLabel(result.ServerModel, gpu.Name, count),
|
||||||
|
name: strings.TrimSpace(gpu.Name),
|
||||||
|
index: -1,
|
||||||
|
parallel: true,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for key, sum := range scoreSum {
|
||||||
|
run.cells[key] = benchmarkHistoryCell{score: sum / float64(scoreCnt[key]), present: true}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Each GPU ran independently — one column per GPU index.
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
key := "gpu|" + strings.TrimSpace(result.ServerModel) + "|" + strings.TrimSpace(gpu.Name) + "|" + strconv.Itoa(gpu.Index)
|
||||||
|
columnByKey[key] = benchmarkHistoryColumn{
|
||||||
|
key: key,
|
||||||
|
label: benchmarkHistoryPerGPULabel(gpu.Name, gpu.Index),
|
||||||
name: strings.TrimSpace(gpu.Name),
|
name: strings.TrimSpace(gpu.Name),
|
||||||
index: gpu.Index,
|
index: gpu.Index,
|
||||||
|
parallel: false,
|
||||||
}
|
}
|
||||||
run.cells[key] = benchmarkHistoryCell{
|
run.cells[key] = benchmarkHistoryCell{score: gpu.Scores.CompositeScore, present: true}
|
||||||
score: gpu.Scores.CompositeScore,
|
|
||||||
present: true,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
runs = append(runs, run)
|
runs = append(runs, run)
|
||||||
@@ -1907,16 +2040,24 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
|
|||||||
for _, col := range columnByKey {
|
for _, col := range columnByKey {
|
||||||
columns = append(columns, col)
|
columns = append(columns, col)
|
||||||
}
|
}
|
||||||
|
// Sequential GPU columns first (sorted by GPU index), then parallel server columns.
|
||||||
sort.Slice(columns, func(i, j int) bool {
|
sort.Slice(columns, func(i, j int) bool {
|
||||||
leftName := strings.ToLower(strings.TrimSpace(columns[i].name))
|
if columns[i].parallel != columns[j].parallel {
|
||||||
rightName := strings.ToLower(strings.TrimSpace(columns[j].name))
|
return !columns[i].parallel // sequential first
|
||||||
if leftName != rightName {
|
|
||||||
return leftName < rightName
|
|
||||||
}
|
}
|
||||||
|
if columns[i].parallel {
|
||||||
|
li := strings.ToLower(columns[i].label)
|
||||||
|
lj := strings.ToLower(columns[j].label)
|
||||||
|
if li != lj {
|
||||||
|
return li < lj
|
||||||
|
}
|
||||||
|
return columns[i].key < columns[j].key
|
||||||
|
}
|
||||||
|
// Sequential: sort by GPU index, then name.
|
||||||
if columns[i].index != columns[j].index {
|
if columns[i].index != columns[j].index {
|
||||||
return columns[i].index < columns[j].index
|
return columns[i].index < columns[j].index
|
||||||
}
|
}
|
||||||
return columns[i].key < columns[j].key
|
return strings.ToLower(columns[i].name) < strings.ToLower(columns[j].name)
|
||||||
})
|
})
|
||||||
sort.Slice(runs, func(i, j int) bool {
|
sort.Slice(runs, func(i, j int) bool {
|
||||||
return runs[i].generatedAt.After(runs[j].generatedAt)
|
return runs[i].generatedAt.After(runs[j].generatedAt)
|
||||||
@@ -1924,23 +2065,35 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
|
|||||||
return columns, runs
|
return columns, runs
|
||||||
}
|
}
|
||||||
|
|
||||||
func benchmarkHistoryColumnKey(name string, index int) string {
|
// benchmarkHistoryPerGPULabel formats a label for a single-GPU column: "GPU #N — ModelName".
|
||||||
return strings.TrimSpace(name) + "|" + strconv.Itoa(index)
|
func benchmarkHistoryPerGPULabel(gpuName string, index int) string {
|
||||||
|
gpuName = strings.TrimSpace(gpuName)
|
||||||
|
if gpuName == "" {
|
||||||
|
gpuName = "Unknown GPU"
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("GPU #%d — %s", index, gpuName)
|
||||||
}
|
}
|
||||||
|
|
||||||
func benchmarkHistoryColumnLabel(name string, index int) string {
|
// benchmarkHistoryParallelLabel formats a label for an all-GPU parallel column:
|
||||||
name = strings.TrimSpace(name)
|
// "ServerModel — N× ModelName (All GPUs)" or "N× ModelName (All GPUs)" if no server.
|
||||||
if name == "" {
|
func benchmarkHistoryParallelLabel(serverModel, gpuName string, count int) string {
|
||||||
return fmt.Sprintf("GPU %d", index)
|
serverModel = strings.TrimSpace(serverModel)
|
||||||
|
gpuName = strings.TrimSpace(gpuName)
|
||||||
|
if gpuName == "" {
|
||||||
|
gpuName = "Unknown GPU"
|
||||||
}
|
}
|
||||||
return fmt.Sprintf("%s / GPU %d", name, index)
|
gpuPart := fmt.Sprintf("%d× %s (All GPUs)", count, gpuName)
|
||||||
|
if serverModel == "" {
|
||||||
|
return gpuPart
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("%s — %s", serverModel, gpuPart)
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Burn ──────────────────────────────────────────────────────────────────────
|
// ── Burn ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
func renderBurn() string {
|
func renderBurn() string {
|
||||||
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
|
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
|
||||||
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics and ` + "targeted_stress" + ` remain in <a href="/validate">Validate</a>. Burn exposes official NVIDIA load recipes by test goal plus separate custom stress tools.</div>
|
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `), NCCL, NVBandwidth, and LINPACK remain in <a href="/validate">Validate → Stress mode</a>. Burn exposes sustained GPU compute load recipes.</div>
|
||||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||||
|
|
||||||
<div class="card" style="margin-bottom:16px">
|
<div class="card" style="margin-bottom:16px">
|
||||||
@@ -2007,23 +2160,6 @@ func renderBurn() string {
|
|||||||
|
|
||||||
<div class="burn-section">GPU-Specific Tests</div>
|
<div class="burn-section">GPU-Specific Tests</div>
|
||||||
<div class="grid2 burn-grid" style="margin-bottom:16px">
|
<div class="grid2 burn-grid" style="margin-bottom:16px">
|
||||||
<div class="card burn-card">
|
|
||||||
<div class="card-head card-head-actions"><span>Power Delivery / Power Budget</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-power',target:'nvidia-targeted-power',label:'NVIDIA Targeted Power (dcgmi diag targeted_power)',nvidia:true},{id:'burn-nvidia-pulse',target:'nvidia-pulse',label:'NVIDIA Pulse Test (dcgmi diag pulse_test)',nvidia:true}])">Run</button></div>
|
|
||||||
<div class="card-body burn-card-body">
|
|
||||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Official NVIDIA power-oriented recipes. ` + "targeted_power" + ` checks sustained delivery; ` + "pulse_test" + ` checks transient behavior.</p>
|
|
||||||
<label class="cb-row"><input type="checkbox" id="burn-nvidia-power" disabled><span>NVIDIA Targeted Power (dcgmi diag targeted_power) <span class="cb-note" id="note-nvidia-power"></span></span></label>
|
|
||||||
<label class="cb-row"><input type="checkbox" id="burn-nvidia-pulse" disabled><span>NVIDIA Pulse Test (dcgmi diag pulse_test) <span class="cb-note" id="note-nvidia-pulse"></span></span></label>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="card burn-card">
|
|
||||||
<div class="card-head card-head-actions"><span>Interconnect / Bandwidth</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-interconnect',target:'nvidia-interconnect',label:'NVIDIA Interconnect Test (NCCL all_reduce_perf)',nvidia:true},{id:'burn-nvidia-bandwidth',target:'nvidia-bandwidth',label:'NVIDIA Bandwidth Test (NVBandwidth)',nvidia:true}])">Run</button></div>
|
|
||||||
<div class="card-body burn-card-body">
|
|
||||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Official NVIDIA fabric paths. NCCL is interconnect-only and is not a compute burn. NVBandwidth validates copy and bandwidth paths.</p>
|
|
||||||
<label class="cb-row"><input type="checkbox" id="burn-nvidia-interconnect" disabled><span>NVIDIA Interconnect Test (NCCL all_reduce_perf) <span class="cb-note" id="note-nvidia-interconnect"></span></span></label>
|
|
||||||
<label class="cb-row"><input type="checkbox" id="burn-nvidia-bandwidth" disabled><span>NVIDIA Bandwidth Test (NVBandwidth) <span class="cb-note" id="note-nvidia-bandwidth"></span></span></label>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div id="bi-output" style="display:none;margin-top:16px" class="card">
|
<div id="bi-output" style="display:none;margin-top:16px" class="card">
|
||||||
@@ -2275,10 +2411,6 @@ function runAllBurnTasks() {
|
|||||||
const status = document.getElementById('burn-all-status');
|
const status = document.getElementById('burn-all-status');
|
||||||
const all = [
|
const all = [
|
||||||
{id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},
|
{id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},
|
||||||
{id:'burn-nvidia-power',target:'nvidia-targeted-power',label:'NVIDIA Targeted Power (dcgmi diag targeted_power)',nvidia:true},
|
|
||||||
{id:'burn-nvidia-pulse',target:'nvidia-pulse',label:'NVIDIA Pulse Test (dcgmi diag pulse_test)',nvidia:true},
|
|
||||||
{id:'burn-nvidia-interconnect',target:'nvidia-interconnect',label:'NVIDIA Interconnect Test (NCCL all_reduce_perf)',nvidia:true},
|
|
||||||
{id:'burn-nvidia-bandwidth',target:'nvidia-bandwidth',label:'NVIDIA Bandwidth Test (NVBandwidth)',nvidia:true},
|
|
||||||
{id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},
|
{id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},
|
||||||
{id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},
|
{id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},
|
||||||
{id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'},
|
{id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'},
|
||||||
@@ -2293,10 +2425,6 @@ function runAllBurnTasks() {
|
|||||||
fetch('/api/gpu/tools').then(function(r) { return r.json(); }).then(function(tools) {
|
fetch('/api/gpu/tools').then(function(r) { return r.json(); }).then(function(tools) {
|
||||||
const map = {
|
const map = {
|
||||||
'nvidia-compute': {cb:'burn-nvidia-compute', note:'note-nvidia-compute', reason:'dcgmproftester not available or NVIDIA driver not running'},
|
'nvidia-compute': {cb:'burn-nvidia-compute', note:'note-nvidia-compute', reason:'dcgmproftester not available or NVIDIA driver not running'},
|
||||||
'nvidia-targeted-power': {cb:'burn-nvidia-power', note:'note-nvidia-power', reason:'dcgmi not available or NVIDIA driver not running'},
|
|
||||||
'nvidia-pulse': {cb:'burn-nvidia-pulse', note:'note-nvidia-pulse', reason:'dcgmi not available or NVIDIA driver not running'},
|
|
||||||
'nvidia-interconnect': {cb:'burn-nvidia-interconnect', note:'note-nvidia-interconnect', reason:'NCCL interconnect tool not available or NVIDIA driver not running'},
|
|
||||||
'nvidia-bandwidth': {cb:'burn-nvidia-bandwidth', note:'note-nvidia-bandwidth', reason:'nvbandwidth or dcgmi not available or NVIDIA driver not running'},
|
|
||||||
'bee-gpu-burn': {cb:'burn-gpu-bee', note:'note-bee', reason:'bee-gpu-burn not available or NVIDIA driver not running'},
|
'bee-gpu-burn': {cb:'burn-gpu-bee', note:'note-bee', reason:'bee-gpu-burn not available or NVIDIA driver not running'},
|
||||||
'john': {cb:'burn-gpu-john', note:'note-john', reason:'bee-john-gpu-stress not available or NVIDIA driver not running'},
|
'john': {cb:'burn-gpu-john', note:'note-john', reason:'bee-john-gpu-stress not available or NVIDIA driver not running'},
|
||||||
'rvs': {cb:'burn-gpu-rvs', note:'note-rvs', reason:'AMD driver not running'},
|
'rvs': {cb:'burn-gpu-rvs', note:'note-rvs', reason:'AMD driver not running'},
|
||||||
@@ -2452,7 +2580,7 @@ func renderNetwork() string {
|
|||||||
|
|
||||||
func renderServicesInline() string {
|
func renderServicesInline() string {
|
||||||
return `<p style="font-size:13px;color:var(--muted);margin-bottom:10px">` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `</p>
|
return `<p style="font-size:13px;color:var(--muted);margin-bottom:10px">` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `</p>
|
||||||
<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="restartGPUDrivers()">Restart GPU Drivers</button><button class="btn btn-sm btn-secondary" onclick="loadServices()">↻ Refresh</button></div>
|
<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="loadServices()">↻ Refresh</button></div>
|
||||||
<div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
<div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
||||||
<div id="svc-out" style="display:none;margin-top:12px">
|
<div id="svc-out" style="display:none;margin-top:12px">
|
||||||
<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
|
<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
|
||||||
@@ -2523,11 +2651,6 @@ function svcAction(btn, name, action) {
|
|||||||
btn.disabled = false;
|
btn.disabled = false;
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
function restartGPUDrivers() {
|
|
||||||
var btn = document.querySelector('[onclick*="restartGPUDrivers"]');
|
|
||||||
if (!btn) { svcAction({textContent:'',disabled:false}, 'bee-nvidia', 'restart'); return; }
|
|
||||||
svcAction(btn, 'bee-nvidia', 'restart');
|
|
||||||
}
|
|
||||||
loadServices();
|
loadServices();
|
||||||
</script>`
|
</script>`
|
||||||
}
|
}
|
||||||
@@ -2787,6 +2910,124 @@ loadDisplays();
|
|||||||
</script>`
|
</script>`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func renderNvidiaSelfHealInline() string {
|
||||||
|
return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.</p>
|
||||||
|
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:12px">
|
||||||
|
<button id="nvidia-restart-btn" class="btn btn-secondary" onclick="nvidiaRestartDrivers()">Restart GPU Drivers</button>
|
||||||
|
<button class="btn btn-sm btn-secondary" onclick="loadNvidiaSelfHeal()">↻ Refresh</button>
|
||||||
|
</div>
|
||||||
|
<div id="nvidia-self-heal-status" style="font-size:13px;color:var(--muted);margin-bottom:12px">Loading NVIDIA GPU status...</div>
|
||||||
|
<div id="nvidia-self-heal-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
||||||
|
<div id="nvidia-self-heal-out" style="display:none;margin-top:12px">
|
||||||
|
<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
|
||||||
|
<span id="nvidia-self-heal-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
|
||||||
|
<span id="nvidia-self-heal-out-status" style="font-size:12px"></span>
|
||||||
|
</div>
|
||||||
|
<div id="nvidia-self-heal-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
|
||||||
|
</div>
|
||||||
|
<script>
|
||||||
|
function nvidiaSelfHealShowResult(label, status, output) {
|
||||||
|
var out = document.getElementById('nvidia-self-heal-out');
|
||||||
|
var term = document.getElementById('nvidia-self-heal-terminal');
|
||||||
|
var statusEl = document.getElementById('nvidia-self-heal-out-status');
|
||||||
|
var labelEl = document.getElementById('nvidia-self-heal-out-label');
|
||||||
|
out.style.display = 'block';
|
||||||
|
labelEl.textContent = label;
|
||||||
|
term.textContent = output || '(no output)';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
if (status === 'ok') {
|
||||||
|
statusEl.textContent = '✓ done';
|
||||||
|
statusEl.style.color = 'var(--ok-fg, #2c662d)';
|
||||||
|
} else {
|
||||||
|
statusEl.textContent = '✗ failed';
|
||||||
|
statusEl.style.color = 'var(--crit-fg, #9f3a38)';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
function nvidiaRestartDrivers() {
|
||||||
|
var btn = document.getElementById('nvidia-restart-btn');
|
||||||
|
var original = btn.textContent;
|
||||||
|
btn.disabled = true;
|
||||||
|
btn.textContent = 'Restarting...';
|
||||||
|
nvidiaSelfHealShowResult('restart bee-nvidia', 'ok', 'Running...');
|
||||||
|
fetch('/api/services/action', {
|
||||||
|
method:'POST',
|
||||||
|
headers:{'Content-Type':'application/json'},
|
||||||
|
body:JSON.stringify({name:'bee-nvidia', action:'restart'})
|
||||||
|
}).then(r=>r.json()).then(d => {
|
||||||
|
nvidiaSelfHealShowResult('restart bee-nvidia', d.status || 'error', d.output || d.error || '(no output)');
|
||||||
|
setTimeout(function() {
|
||||||
|
loadServices();
|
||||||
|
loadNvidiaSelfHeal();
|
||||||
|
}, 800);
|
||||||
|
}).catch(e => {
|
||||||
|
nvidiaSelfHealShowResult('restart bee-nvidia', 'error', 'Request failed: ' + e);
|
||||||
|
}).finally(() => {
|
||||||
|
btn.disabled = false;
|
||||||
|
btn.textContent = original;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function nvidiaResetGPU(index, btn) {
|
||||||
|
var original = btn.textContent;
|
||||||
|
btn.disabled = true;
|
||||||
|
btn.textContent = 'Resetting...';
|
||||||
|
nvidiaSelfHealShowResult('reset gpu ' + index, 'ok', 'Running...');
|
||||||
|
fetch('/api/gpu/nvidia-reset', {
|
||||||
|
method:'POST',
|
||||||
|
headers:{'Content-Type':'application/json'},
|
||||||
|
body:JSON.stringify({index:index})
|
||||||
|
}).then(r=>r.json()).then(d => {
|
||||||
|
nvidiaSelfHealShowResult('reset gpu ' + index, d.status || 'error', d.output || '(no output)');
|
||||||
|
setTimeout(loadNvidiaSelfHeal, 1000);
|
||||||
|
}).catch(e => {
|
||||||
|
nvidiaSelfHealShowResult('reset gpu ' + index, 'error', 'Request failed: ' + e);
|
||||||
|
}).finally(() => {
|
||||||
|
btn.disabled = false;
|
||||||
|
btn.textContent = original;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function loadNvidiaSelfHeal() {
|
||||||
|
var status = document.getElementById('nvidia-self-heal-status');
|
||||||
|
var table = document.getElementById('nvidia-self-heal-table');
|
||||||
|
status.textContent = 'Loading NVIDIA GPU status...';
|
||||||
|
status.style.color = 'var(--muted)';
|
||||||
|
table.innerHTML = '<p style="color:var(--muted);font-size:13px">Loading...</p>';
|
||||||
|
fetch('/api/gpu/nvidia-status').then(r=>r.json()).then(gpus => {
|
||||||
|
if (!Array.isArray(gpus) || gpus.length === 0) {
|
||||||
|
status.textContent = 'No NVIDIA GPUs detected or nvidia-smi is unavailable.';
|
||||||
|
table.innerHTML = '';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
status.textContent = gpus.length + ' NVIDIA GPU(s) detected.';
|
||||||
|
const rows = gpus.map(g => {
|
||||||
|
const serial = g.serial || '';
|
||||||
|
const bdf = g.bdf || '';
|
||||||
|
const id = serial || bdf || ('gpu-' + g.index);
|
||||||
|
const badge = g.status === 'OK' ? 'badge-ok' : g.status === 'RESET_REQUIRED' ? 'badge-err' : 'badge-warn';
|
||||||
|
const details = [];
|
||||||
|
if (serial) details.push('serial ' + serial);
|
||||||
|
if (bdf) details.push('bdf ' + bdf);
|
||||||
|
if (g.parse_failure && g.raw_line) details.push(g.raw_line);
|
||||||
|
return '<tr>'
|
||||||
|
+ '<td style="white-space:nowrap">' + g.index + '</td>'
|
||||||
|
+ '<td>' + (g.name || 'unknown') + '</td>'
|
||||||
|
+ '<td style="font-family:monospace">' + id + '</td>'
|
||||||
|
+ '<td><span class="badge ' + badge + '">' + (g.status || 'UNKNOWN') + '</span>'
|
||||||
|
+ (details.length ? '<div style="margin-top:4px;font-size:12px;color:var(--muted)">' + details.join(' | ') + '</div>' : '')
|
||||||
|
+ '</td>'
|
||||||
|
+ '<td style="white-space:nowrap"><button class="btn btn-sm btn-secondary" onclick="nvidiaResetGPU(' + g.index + ', this)">Reset GPU</button></td>'
|
||||||
|
+ '</tr>';
|
||||||
|
}).join('');
|
||||||
|
table.innerHTML = '<table><tr><th>GPU</th><th>Model</th><th>ID</th><th>Status</th><th>Action</th></tr>' + rows + '</table>';
|
||||||
|
}).catch(e => {
|
||||||
|
status.textContent = 'Error loading NVIDIA GPU status: ' + e;
|
||||||
|
status.style.color = 'var(--crit-fg, #9f3a38)';
|
||||||
|
table.innerHTML = '';
|
||||||
|
});
|
||||||
|
}
|
||||||
|
loadNvidiaSelfHeal();
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
|
|
||||||
// ── Tools ─────────────────────────────────────────────────────────────────────
|
// ── Tools ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
func renderTools() string {
|
func renderTools() string {
|
||||||
@@ -2847,6 +3088,9 @@ function installToRAM() {
|
|||||||
<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">↻ Check</button></div>
|
<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">↻ Check</button></div>
|
||||||
<div class="card-body"><div id="tools-table"><p style="color:var(--muted);font-size:13px">Checking...</p></div></div></div>
|
<div class="card-body"><div id="tools-table"><p style="color:var(--muted);font-size:13px">Checking...</p></div></div></div>
|
||||||
|
|
||||||
|
<div class="card"><div class="card-head">NVIDIA Self Heal</div><div class="card-body">` +
|
||||||
|
renderNvidiaSelfHealInline() + `</div></div>
|
||||||
|
|
||||||
<div class="card"><div class="card-head">Network</div><div class="card-body">` +
|
<div class="card"><div class="card-head">Network</div><div class="card-body">` +
|
||||||
renderNetworkInline() + `</div></div>
|
renderNetworkInline() + `</div></div>
|
||||||
|
|
||||||
|
|||||||
@@ -302,6 +302,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
// GPU presence / tools
|
// GPU presence / tools
|
||||||
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
||||||
mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs)
|
mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs)
|
||||||
|
mux.HandleFunc("GET /api/gpu/nvidia-status", h.handleAPIGNVIDIAGPUStatuses)
|
||||||
|
mux.HandleFunc("POST /api/gpu/nvidia-reset", h.handleAPIGNVIDIAReset)
|
||||||
mux.HandleFunc("GET /api/gpu/tools", h.handleAPIGPUTools)
|
mux.HandleFunc("GET /api/gpu/tools", h.handleAPIGPUTools)
|
||||||
|
|
||||||
// System
|
// System
|
||||||
|
|||||||
@@ -591,7 +591,7 @@ func TestTasksPageRendersOpenLinksAndPaginationControls(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
|
func TestToolsPageRendersNvidiaSelfHealSection(t *testing.T) {
|
||||||
handler := NewHandler(HandlerOptions{})
|
handler := NewHandler(HandlerOptions{})
|
||||||
rec := httptest.NewRecorder()
|
rec := httptest.NewRecorder()
|
||||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tools", nil))
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tools", nil))
|
||||||
@@ -599,11 +599,20 @@ func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
|
|||||||
t.Fatalf("status=%d", rec.Code)
|
t.Fatalf("status=%d", rec.Code)
|
||||||
}
|
}
|
||||||
body := rec.Body.String()
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, `NVIDIA Self Heal`) {
|
||||||
|
t.Fatalf("tools page missing nvidia self heal section: %s", body)
|
||||||
|
}
|
||||||
if !strings.Contains(body, `Restart GPU Drivers`) {
|
if !strings.Contains(body, `Restart GPU Drivers`) {
|
||||||
t.Fatalf("tools page missing restart gpu drivers button: %s", body)
|
t.Fatalf("tools page missing restart gpu drivers button: %s", body)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `restartGPUDrivers()`) {
|
if !strings.Contains(body, `nvidiaRestartDrivers()`) {
|
||||||
t.Fatalf("tools page missing restartGPUDrivers action: %s", body)
|
t.Fatalf("tools page missing nvidiaRestartDrivers action: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `/api/gpu/nvidia-status`) {
|
||||||
|
t.Fatalf("tools page missing nvidia status api usage: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `nvidiaResetGPU(`) {
|
||||||
|
t.Fatalf("tools page missing nvidiaResetGPU action: %s", body)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `id="boot-source-text"`) {
|
if !strings.Contains(body, `id="boot-source-text"`) {
|
||||||
t.Fatalf("tools page missing boot source field: %s", body)
|
t.Fatalf("tools page missing boot source field: %s", body)
|
||||||
@@ -684,8 +693,8 @@ func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
|
|||||||
for _, needle := range []string{
|
for _, needle := range []string{
|
||||||
`Benchmark Results`,
|
`Benchmark Results`,
|
||||||
`Composite score by saved benchmark run and GPU.`,
|
`Composite score by saved benchmark run and GPU.`,
|
||||||
`NVIDIA H100 PCIe / GPU 0`,
|
`GPU #0 — NVIDIA H100 PCIe`,
|
||||||
`NVIDIA H100 PCIe / GPU 1`,
|
`GPU #1 — NVIDIA H100 PCIe`,
|
||||||
`#1`,
|
`#1`,
|
||||||
wantTime,
|
wantTime,
|
||||||
`1176.25`,
|
`1176.25`,
|
||||||
@@ -732,8 +741,8 @@ func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
|
|||||||
for _, needle := range []string{
|
for _, needle := range []string{
|
||||||
`NVIDIA Max Compute Load`,
|
`NVIDIA Max Compute Load`,
|
||||||
`dcgmproftester`,
|
`dcgmproftester`,
|
||||||
`targeted_stress remain in <a href="/validate">Validate</a>`,
|
`NCCL`,
|
||||||
`NVIDIA Interconnect Test (NCCL all_reduce_perf)`,
|
`Validate → Stress mode`,
|
||||||
`id="burn-gpu-list"`,
|
`id="burn-gpu-list"`,
|
||||||
} {
|
} {
|
||||||
if !strings.Contains(body, needle) {
|
if !strings.Contains(body, needle) {
|
||||||
|
|||||||
@@ -39,6 +39,7 @@ var taskNames = map[string]string{
|
|||||||
"nvidia-interconnect": "NVIDIA Interconnect Test (NCCL all_reduce_perf)",
|
"nvidia-interconnect": "NVIDIA Interconnect Test (NCCL all_reduce_perf)",
|
||||||
"nvidia-bandwidth": "NVIDIA Bandwidth Test (NVBandwidth)",
|
"nvidia-bandwidth": "NVIDIA Bandwidth Test (NVBandwidth)",
|
||||||
"nvidia-stress": "NVIDIA GPU Stress",
|
"nvidia-stress": "NVIDIA GPU Stress",
|
||||||
|
"hpl": "LINPACK (HPL)",
|
||||||
"memory": "Memory SAT",
|
"memory": "Memory SAT",
|
||||||
"storage": "Storage SAT",
|
"storage": "Storage SAT",
|
||||||
"cpu": "CPU SAT",
|
"cpu": "CPU SAT",
|
||||||
@@ -115,14 +116,16 @@ type Task struct {
|
|||||||
// taskParams holds optional parameters parsed from the run request.
|
// taskParams holds optional parameters parsed from the run request.
|
||||||
type taskParams struct {
|
type taskParams struct {
|
||||||
Duration int `json:"duration,omitempty"`
|
Duration int `json:"duration,omitempty"`
|
||||||
DiagLevel int `json:"diag_level,omitempty"`
|
StressMode bool `json:"stress_mode,omitempty"`
|
||||||
GPUIndices []int `json:"gpu_indices,omitempty"`
|
GPUIndices []int `json:"gpu_indices,omitempty"`
|
||||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
|
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
|
||||||
SizeMB int `json:"size_mb,omitempty"`
|
SizeMB int `json:"size_mb,omitempty"`
|
||||||
|
Passes int `json:"passes,omitempty"`
|
||||||
Loader string `json:"loader,omitempty"`
|
Loader string `json:"loader,omitempty"`
|
||||||
BurnProfile string `json:"burn_profile,omitempty"`
|
BurnProfile string `json:"burn_profile,omitempty"`
|
||||||
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
|
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
|
||||||
RunNCCL bool `json:"run_nccl,omitempty"`
|
RunNCCL bool `json:"run_nccl,omitempty"`
|
||||||
|
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||||
DisplayName string `json:"display_name,omitempty"`
|
DisplayName string `json:"display_name,omitempty"`
|
||||||
Device string `json:"device,omitempty"` // for install
|
Device string `json:"device,omitempty"` // for install
|
||||||
PlatformComponents []string `json:"platform_components,omitempty"`
|
PlatformComponents []string `json:"platform_components,omitempty"`
|
||||||
@@ -214,11 +217,11 @@ var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}
|
|||||||
const maxTaskHistory = 50
|
const maxTaskHistory = 50
|
||||||
|
|
||||||
var (
|
var (
|
||||||
runMemoryAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
runMemoryAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
||||||
return a.RunMemoryAcceptancePackCtx(ctx, baseDir, logFunc)
|
return a.RunMemoryAcceptancePackCtx(ctx, baseDir, sizeMB, passes, logFunc)
|
||||||
}
|
}
|
||||||
runStorageAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
runStorageAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||||
return a.RunStorageAcceptancePackCtx(ctx, baseDir, logFunc)
|
return a.RunStorageAcceptancePackCtx(ctx, baseDir, extended, logFunc)
|
||||||
}
|
}
|
||||||
runCPUAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
runCPUAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
return a.RunCPUAcceptancePackCtx(ctx, baseDir, durationSec, logFunc)
|
return a.RunCPUAcceptancePackCtx(ctx, baseDir, durationSec, logFunc)
|
||||||
@@ -551,7 +554,10 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
diagLevel := t.params.DiagLevel
|
diagLevel := 2
|
||||||
|
if t.params.StressMode {
|
||||||
|
diagLevel = 3
|
||||||
|
}
|
||||||
if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
|
if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
|
||||||
result, e := a.RunNvidiaAcceptancePackWithOptions(
|
result, e := a.RunNvidiaAcceptancePackWithOptions(
|
||||||
ctx, "", diagLevel, t.params.GPUIndices, j.append,
|
ctx, "", diagLevel, t.params.GPUIndices, j.append,
|
||||||
@@ -585,6 +591,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
GPUIndices: t.params.GPUIndices,
|
GPUIndices: t.params.GPUIndices,
|
||||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||||
RunNCCL: t.params.RunNCCL,
|
RunNCCL: t.params.RunNCCL,
|
||||||
|
ParallelGPUs: t.params.ParallelGPUs,
|
||||||
}, j.append)
|
}, j.append)
|
||||||
case "nvidia-compute":
|
case "nvidia-compute":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
@@ -656,13 +663,17 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", j.append)
|
sizeMB, passes := 256, 1
|
||||||
|
if t.params.StressMode {
|
||||||
|
sizeMB, passes = 1024, 3
|
||||||
|
}
|
||||||
|
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
|
||||||
case "storage":
|
case "storage":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
archive, err = runStorageAcceptancePackCtx(a, ctx, "", j.append)
|
archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
|
||||||
case "cpu":
|
case "cpu":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
@@ -673,8 +684,12 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
}
|
}
|
||||||
if dur <= 0 {
|
if dur <= 0 {
|
||||||
|
if t.params.StressMode {
|
||||||
|
dur = 1800
|
||||||
|
} else {
|
||||||
dur = 60
|
dur = 60
|
||||||
}
|
}
|
||||||
|
}
|
||||||
j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
|
j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
|
||||||
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
|
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
|
||||||
case "amd":
|
case "amd":
|
||||||
@@ -725,6 +740,19 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
}
|
}
|
||||||
archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
|
archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
|
||||||
|
case "hpl":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
opts := platform.HPLOptions{
|
||||||
|
MemFraction: 0.80,
|
||||||
|
NB: 256,
|
||||||
|
}
|
||||||
|
archive, err = func() (string, error) {
|
||||||
|
path, _, runErr := a.RunHPL(ctx, "", opts, j.append)
|
||||||
|
return path, runErr
|
||||||
|
}()
|
||||||
case "platform-stress":
|
case "platform-stress":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
|
|||||||
@@ -422,7 +422,7 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
|
|||||||
for _, needle := range []string{
|
for _, needle := range []string{
|
||||||
`Benchmark Results`,
|
`Benchmark Results`,
|
||||||
`Composite score for this benchmark task.`,
|
`Composite score for this benchmark task.`,
|
||||||
`NVIDIA H100 PCIe / GPU 0`,
|
`GPU #0 — NVIDIA H100 PCIe`,
|
||||||
`1176.25`,
|
`1176.25`,
|
||||||
} {
|
} {
|
||||||
if !strings.Contains(html, needle) {
|
if !strings.Contains(html, needle) {
|
||||||
|
|||||||
@@ -19,5 +19,7 @@ ROCRAND_VERSION=3.2.0.60304-76~22.04
|
|||||||
HIP_RUNTIME_AMD_VERSION=6.3.42134.60304-76~22.04
|
HIP_RUNTIME_AMD_VERSION=6.3.42134.60304-76~22.04
|
||||||
HIPBLASLT_VERSION=0.10.0.60304-76~22.04
|
HIPBLASLT_VERSION=0.10.0.60304-76~22.04
|
||||||
COMGR_VERSION=2.8.0.60304-76~22.04
|
COMGR_VERSION=2.8.0.60304-76~22.04
|
||||||
|
HPL_VERSION=2.3
|
||||||
|
HPL_SHA256=32c5c17d22330e6f2337b681aded51637fb6008d3f0eb7c277b163fadd612830
|
||||||
GO_VERSION=1.24.0
|
GO_VERSION=1.24.0
|
||||||
AUDIT_VERSION=1.0.0
|
AUDIT_VERSION=1.0.0
|
||||||
|
|||||||
@@ -36,7 +36,6 @@ typedef void *CUstream;
|
|||||||
#define MAX_CUBLAS_PROFILES 5
|
#define MAX_CUBLAS_PROFILES 5
|
||||||
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
|
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
|
||||||
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
|
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
|
||||||
#define STRESS_LAUNCH_DEPTH 8
|
|
||||||
|
|
||||||
static const char *ptx_source =
|
static const char *ptx_source =
|
||||||
".version 6.0\n"
|
".version 6.0\n"
|
||||||
@@ -344,7 +343,6 @@ static int run_ptx_fallback(struct cuda_api *api,
|
|||||||
unsigned long iterations = 0;
|
unsigned long iterations = 0;
|
||||||
int mp_count = 0;
|
int mp_count = 0;
|
||||||
int stream_count = 1;
|
int stream_count = 1;
|
||||||
int launches_per_wave = 0;
|
|
||||||
|
|
||||||
memset(report, 0, sizeof(*report));
|
memset(report, 0, sizeof(*report));
|
||||||
snprintf(report->backend, sizeof(report->backend), "driver-ptx");
|
snprintf(report->backend, sizeof(report->backend), "driver-ptx");
|
||||||
@@ -419,12 +417,10 @@ static int run_ptx_fallback(struct cuda_api *api,
|
|||||||
|
|
||||||
unsigned int threads = 256;
|
unsigned int threads = 256;
|
||||||
|
|
||||||
double start = now_seconds();
|
double deadline = now_seconds() + (double)seconds;
|
||||||
double deadline = start + (double)seconds;
|
double next_sync = now_seconds() + 1.0;
|
||||||
while (now_seconds() < deadline) {
|
while (now_seconds() < deadline) {
|
||||||
launches_per_wave = 0;
|
int launched = 0;
|
||||||
for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
|
|
||||||
int launched_this_batch = 0;
|
|
||||||
for (int lane = 0; lane < stream_count; lane++) {
|
for (int lane = 0; lane < stream_count; lane++) {
|
||||||
unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
|
unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
|
||||||
if (!check_rc(api,
|
if (!check_rc(api,
|
||||||
@@ -442,21 +438,21 @@ static int run_ptx_fallback(struct cuda_api *api,
|
|||||||
NULL))) {
|
NULL))) {
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
launches_per_wave++;
|
launched++;
|
||||||
launched_this_batch++;
|
iterations++;
|
||||||
}
|
}
|
||||||
if (launched_this_batch <= 0) {
|
if (launched <= 0) {
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (launches_per_wave <= 0) {
|
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
|
double now = now_seconds();
|
||||||
|
if (now >= next_sync || now >= deadline) {
|
||||||
if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
|
if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
iterations += (unsigned long)launches_per_wave;
|
next_sync = now + 1.0;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
api->cuCtxSynchronize();
|
||||||
|
|
||||||
if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem[0], sizeof(sample)))) {
|
if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem[0], sizeof(sample)))) {
|
||||||
goto fail;
|
goto fail;
|
||||||
@@ -468,11 +464,10 @@ static int run_ptx_fallback(struct cuda_api *api,
|
|||||||
report->iterations = iterations;
|
report->iterations = iterations;
|
||||||
snprintf(report->details,
|
snprintf(report->details,
|
||||||
sizeof(report->details),
|
sizeof(report->details),
|
||||||
"fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d queue_depth=%d per_stream_mb=%zu iterations=%lu\n",
|
"fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d per_stream_mb=%zu iterations=%lu\n",
|
||||||
size_mb,
|
size_mb,
|
||||||
report->buffer_mb,
|
report->buffer_mb,
|
||||||
report->stream_count,
|
report->stream_count,
|
||||||
STRESS_LAUNCH_DEPTH,
|
|
||||||
bytes_per_stream[0] / (1024u * 1024u),
|
bytes_per_stream[0] / (1024u * 1024u),
|
||||||
iterations);
|
iterations);
|
||||||
|
|
||||||
@@ -1140,7 +1135,6 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
int stream_count = 1;
|
int stream_count = 1;
|
||||||
int profile_count = (int)(sizeof(k_profiles) / sizeof(k_profiles[0]));
|
int profile_count = (int)(sizeof(k_profiles) / sizeof(k_profiles[0]));
|
||||||
int prepared_count = 0;
|
int prepared_count = 0;
|
||||||
int wave_launches = 0;
|
|
||||||
size_t requested_budget = 0;
|
size_t requested_budget = 0;
|
||||||
size_t total_budget = 0;
|
size_t total_budget = 0;
|
||||||
size_t per_profile_budget = 0;
|
size_t per_profile_budget = 0;
|
||||||
@@ -1207,11 +1201,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
report->buffer_mb = (int)(total_budget / (1024u * 1024u));
|
report->buffer_mb = (int)(total_budget / (1024u * 1024u));
|
||||||
append_detail(report->details,
|
append_detail(report->details,
|
||||||
sizeof(report->details),
|
sizeof(report->details),
|
||||||
"requested_mb=%d actual_mb=%d streams=%d queue_depth=%d mp_count=%d per_worker_mb=%zu\n",
|
"requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
|
||||||
size_mb,
|
size_mb,
|
||||||
report->buffer_mb,
|
report->buffer_mb,
|
||||||
report->stream_count,
|
report->stream_count,
|
||||||
STRESS_LAUNCH_DEPTH,
|
|
||||||
mp_count,
|
mp_count,
|
||||||
per_profile_budget / (1024u * 1024u));
|
per_profile_budget / (1024u * 1024u));
|
||||||
|
|
||||||
@@ -1260,11 +1253,15 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Keep the GPU queue continuously full by submitting kernels without
|
||||||
|
* synchronizing after every wave. A sync barrier after each small batch
|
||||||
|
* creates CPU↔GPU ping-pong gaps that prevent full TDP utilisation,
|
||||||
|
* especially when individual kernels are short. Instead we sync at most
|
||||||
|
* once per second (for error detection) and once at the very end. */
|
||||||
double deadline = now_seconds() + (double)seconds;
|
double deadline = now_seconds() + (double)seconds;
|
||||||
|
double next_sync = now_seconds() + 1.0;
|
||||||
while (now_seconds() < deadline) {
|
while (now_seconds() < deadline) {
|
||||||
wave_launches = 0;
|
int launched = 0;
|
||||||
for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
|
|
||||||
int launched_this_batch = 0;
|
|
||||||
for (int i = 0; i < prepared_count; i++) {
|
for (int i = 0; i < prepared_count; i++) {
|
||||||
if (!prepared[i].ready) {
|
if (!prepared[i].ready) {
|
||||||
continue;
|
continue;
|
||||||
@@ -1284,16 +1281,13 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
}
|
}
|
||||||
prepared[i].iterations++;
|
prepared[i].iterations++;
|
||||||
report->iterations++;
|
report->iterations++;
|
||||||
wave_launches++;
|
launched++;
|
||||||
launched_this_batch++;
|
|
||||||
}
|
}
|
||||||
if (launched_this_batch <= 0) {
|
if (launched <= 0) {
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (wave_launches <= 0) {
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
double now = now_seconds();
|
||||||
|
if (now >= next_sync || now >= deadline) {
|
||||||
if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
|
if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
|
||||||
for (int i = 0; i < prepared_count; i++) {
|
for (int i = 0; i < prepared_count; i++) {
|
||||||
destroy_profile(&cublas, cuda, &prepared[i]);
|
destroy_profile(&cublas, cuda, &prepared[i]);
|
||||||
@@ -1303,7 +1297,11 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
cuda->cuCtxDestroy(ctx);
|
cuda->cuCtxDestroy(ctx);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
next_sync = now + 1.0;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
/* Final drain — ensure all queued work finishes before we read results. */
|
||||||
|
cuda->cuCtxSynchronize();
|
||||||
|
|
||||||
for (int i = 0; i < prepared_count; i++) {
|
for (int i = 0; i < prepared_count; i++) {
|
||||||
if (!prepared[i].ready) {
|
if (!prepared[i].ready) {
|
||||||
|
|||||||
331
iso/builder/build-hpl.sh
Executable file
331
iso/builder/build-hpl.sh
Executable file
@@ -0,0 +1,331 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# build-hpl.sh — build HPL (High Performance LINPACK) for the bee LiveCD.
|
||||||
|
#
|
||||||
|
# Downloads HPL 2.3 from netlib, downloads OpenBLAS runtime from the Debian 12
|
||||||
|
# apt repo, and compiles xhpl using a minimal single-process MPI stub so that
|
||||||
|
# no MPI package is required inside the ISO.
|
||||||
|
#
|
||||||
|
# The resulting xhpl binary is a standard HPL binary whose output is compatible
|
||||||
|
# with the accepted HPL format (WR... Gflops lines).
|
||||||
|
#
|
||||||
|
# Output:
|
||||||
|
# $CACHE_DIR/bin/xhpl
|
||||||
|
# $CACHE_DIR/lib/libopenblas.so* (runtime, injected into ISO /usr/lib/)
|
||||||
|
|
||||||
|
set -e

# Positional arguments: HPL release number, expected SHA-256 of the release
# tarball, and the directory that receives build outputs and caches.
HPL_VERSION="$1"
HPL_SHA256="$2"
DIST_DIR="$3"

# Every argument is mandatory; the first empty one prints the usage line and
# aborts with status 1.
for required_arg in "$HPL_VERSION" "$HPL_SHA256" "$DIST_DIR"; do
    if [ -z "$required_arg" ]; then
        echo "usage: $0 <hpl-version> <sha256> <dist-dir>"
        exit 1
    fi
done

echo "=== HPL ${HPL_VERSION} ==="

# Build outputs are keyed by HPL version; downloads go to a shared cache whose
# root can be moved with BEE_CACHE_DIR.
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/hpl-downloads"
CACHE_DIR="${DIST_DIR}/hpl-${HPL_VERSION}"

# An executable xhpl left by an earlier run short-circuits the whole build.
[ ! -x "${CACHE_DIR}/bin/xhpl" ] || {
    echo "=== HPL cached, skipping build ==="
    echo "binary: ${CACHE_DIR}/bin/xhpl"
    exit 0
}

mkdir -p "${DOWNLOAD_CACHE_DIR}" "${CACHE_DIR}/bin" "${CACHE_DIR}/lib"

# ── HPL source locations ──────────────────────────────────────────────────────
# Tarball mirrors are tried in order (HPL_URLS overrides the list); the git
# repository and refs are the fallback (HPL_GIT_URL / HPL_GIT_REFS override).
HPL_TAR="${DOWNLOAD_CACHE_DIR}/hpl-${HPL_VERSION}.tar.gz"
DEFAULT_HPL_URLS="
https://www.netlib.org/benchmark/hpl/hpl-${HPL_VERSION}.tar.gz
https://fossies.org/linux/privat/hpl-${HPL_VERSION}.tar.gz
"
HPL_GIT_URL="${HPL_GIT_URL:-https://github.com/icl-utk-edu/hpl.git}"
DEFAULT_HPL_GIT_REFS="v${HPL_VERSION} ${HPL_VERSION} main"
# Switched to "git" by the git fallback so the tarball checksum is skipped.
HPL_SOURCE_MODE="tarball"
|
||||||
|
|
||||||
|
# download_to_file URL DEST
#
# Fetch URL into DEST with curl when it is installed, otherwise with wget.
# Returns the exit status of whichever downloader ran.
#
# POSIX sh has no function-local variables, so the arguments are used as
# "$1"/"$2" directly: copying them into names such as `url`/`out` would
# overwrite a caller's variable of the same name.
download_to_file() {
    if command -v curl >/dev/null 2>&1; then
        # -f: fail on HTTP errors instead of saving the error page.
        # -L: follow redirects.
        curl -fL \
            --connect-timeout 15 \
            --max-time 180 \
            --retry 2 \
            --retry-delay 2 \
            --output "$2" \
            "$1"
        return $?
    fi

    wget \
        --show-progress \
        --tries=2 \
        --timeout=30 \
        -O "$2" \
        "$1"
}
|
||||||
|
|
||||||
|
# download_hpl_tarball DEST
#
# Try each mirror in HPL_URLS (default: DEFAULT_HPL_URLS) in order and store
# the first successful download at DEST. The download goes to DEST.part first
# and is renamed only when complete, so a broken transfer never leaves a
# half-written DEST behind. Returns 0 on success, 1 when every mirror failed.
#
# Variables carry a _hpl_tar_ prefix because sh functions share one global
# namespace: download_to_file must not be able to overwrite the destination
# path between the download and the rename.
download_hpl_tarball() {
    _hpl_tar_out="$1"
    _hpl_tar_tmp="${_hpl_tar_out}.part"
    _hpl_tar_urls="${HPL_URLS:-$DEFAULT_HPL_URLS}"

    rm -f "${_hpl_tar_tmp}"
    for _hpl_tar_url in ${_hpl_tar_urls}; do
        [ -n "${_hpl_tar_url}" ] || continue
        echo "=== trying HPL source: ${_hpl_tar_url} ==="
        # The rename is part of the success condition: this function may run
        # on the left side of `||`, where `set -e` would not catch a failed mv.
        if download_to_file "${_hpl_tar_url}" "${_hpl_tar_tmp}" &&
           mv "${_hpl_tar_tmp}" "${_hpl_tar_out}"; then
            return 0
        fi
        rm -f "${_hpl_tar_tmp}"
        echo "=== failed: ${_hpl_tar_url} ==="
    done

    echo "ERROR: failed to download HPL ${HPL_VERSION} from all configured URLs" >&2
    return 1
}
|
||||||
|
|
||||||
|
# download_hpl_from_git_archive DEST
#
# Fallback source: shallow-clone HPL_GIT_URL at each ref in HPL_GIT_REFS
# (default: DEFAULT_HPL_GIT_REFS) and pack the first ref that clones into a
# tarball at DEST whose top-level directory is hpl-${HPL_VERSION}.
# On success sets HPL_SOURCE_MODE="git" and returns 0; returns 1 when no ref
# could be obtained.
#
# Clone, rename and packing are one success condition, so a failure in any
# step moves on to the next ref and the temporary clone is always removed
# (a bare failing command would abort the script under `set -e` and leave
# the temp directory behind).
download_hpl_from_git_archive() {
    _hpl_git_out="$1"
    _hpl_git_refs="${HPL_GIT_REFS:-$DEFAULT_HPL_GIT_REFS}"
    _hpl_git_tmp="$(mktemp -d)"
    _hpl_git_repo="${_hpl_git_tmp}/repo"
    _hpl_git_tree="${_hpl_git_tmp}/hpl-${HPL_VERSION}"
    _hpl_git_part="${_hpl_git_out}.part"

    for _hpl_git_ref in ${_hpl_git_refs}; do
        [ -n "${_hpl_git_ref}" ] || continue
        echo "=== trying HPL git source: ${HPL_GIT_URL} ref ${_hpl_git_ref} ==="
        rm -rf "${_hpl_git_repo}" "${_hpl_git_tree}" "${_hpl_git_part}"
        if git clone --depth 1 --branch "${_hpl_git_ref}" "${HPL_GIT_URL}" "${_hpl_git_repo}" &&
           mv "${_hpl_git_repo}" "${_hpl_git_tree}" &&
           tar czf "${_hpl_git_part}" -C "${_hpl_git_tmp}" "hpl-${HPL_VERSION}" &&
           mv "${_hpl_git_part}" "${_hpl_git_out}"; then
            rm -rf "${_hpl_git_tmp}"
            HPL_SOURCE_MODE="git"
            return 0
        fi
        echo "=== failed git ref: ${_hpl_git_ref} ==="
    done

    rm -rf "${_hpl_git_tmp}" "${_hpl_git_part}"
    echo "ERROR: failed to obtain HPL ${HPL_VERSION} from all configured sources" >&2
    echo "  looked for cache: ${_hpl_git_out}" >&2
    echo "  tarball mirrors: ${HPL_URLS:-$DEFAULT_HPL_URLS}" >&2
    echo "  git fallback: ${HPL_GIT_URL} refs ${_hpl_git_refs}" >&2
    echo "  override mirrors with HPL_URLS=\"https://mirror1/...\"" >&2
    echo "  override git refs with HPL_GIT_REFS=\"v${HPL_VERSION} ${HPL_VERSION} main\"" >&2
    return 1
}
|
||||||
|
|
||||||
|
# Fetch the source archive unless an earlier run already cached it. The git
# fallback runs only after every tarball mirror has failed.
if [ ! -f "${HPL_TAR}" ]; then
    echo "=== downloading HPL ${HPL_VERSION} ==="
    download_hpl_tarball "${HPL_TAR}" || download_hpl_from_git_archive "${HPL_TAR}"
fi

# Only a release tarball can be compared with the pinned digest; an archive
# packed from a git checkout is accepted without a checksum.
case "${HPL_SOURCE_MODE}" in
    tarball)
        actual_sha="$(sha256sum "${HPL_TAR}" | awk '{print $1}')"
        if [ "${actual_sha}" = "${HPL_SHA256}" ]; then
            echo "sha256 OK: hpl-${HPL_VERSION}.tar.gz"
        else
            echo "ERROR: sha256 mismatch for hpl-${HPL_VERSION}.tar.gz" >&2
            echo "  expected: ${HPL_SHA256}" >&2
            echo "  actual: ${actual_sha}" >&2
            # Drop the bad file so the next run downloads it again.
            rm -f "${HPL_TAR}"
            exit 1
        fi
        ;;
    *)
        echo "=== HPL source obtained from git fallback; skipping tarball sha256 check ==="
        ;;
esac
|
||||||
|
|
||||||
|
# ── download OpenBLAS from Debian 12 apt repo ─────────────────────────────────
|
||||||
|
# Name of the Debian package that ships the OpenBLAS runtime, the pool
# directory it lives in, and where the package index is cached.
OPENBLAS_PKG="libopenblas0-openmp"
REPO_BASE="https://deb.debian.org/debian/pool/main/o/openblas"
PACKAGES_GZ="${DOWNLOAD_CACHE_DIR}/Packages.gz"

# The bookworm/amd64 index is fetched on every build (it is overwritten in
# place, not reused from the cache).
echo "=== fetching Debian 12 Packages.gz ==="
wget -q \
    -O "${PACKAGES_GZ}" \
    "https://deb.debian.org/debian/dists/bookworm/main/binary-amd64/Packages.gz"
|
||||||
|
|
||||||
|
# lookup_deb PACKAGE
#
# Scan the gzip-compressed Debian package index at ${PACKAGES_GZ} and print
# exactly one line, "<Filename> <SHA256>", for the stanza whose Package field
# equals PACKAGE. Prints nothing when the package is not listed.
lookup_deb() {
    pkg="$1"
    gzip -dc "${PACKAGES_GZ}" | awk -v pkg="$pkg" '
        /^Package: /  { cur = $2 }
        /^Filename: / { file = $2 }
        /^SHA256: /   { sha = $2 }
        # A blank line closes a stanza.
        /^$/ {
            if (cur == pkg) {
                print file " " sha
                # awk still runs END after exit, so record that the match
                # has already been printed.
                found = 1
                exit
            }
            cur = ""; file = ""; sha = ""
        }
        # Covers a final stanza that is not followed by a blank line.
        END {
            if (!found && cur == pkg) print file " " sha
        }'
}
|
||||||
|
|
||||||
|
# Resolve the pool path and SHA-256 of the OpenBLAS runtime package. Only the
# first line of the lookup result is parsed, so a repeated match cannot put a
# newline into the download URL or the expected digest.
meta="$(lookup_deb "${OPENBLAS_PKG}" | head -n 1)"
[ -n "$meta" ] || { echo "ERROR: ${OPENBLAS_PKG} not found in Packages.gz" >&2; exit 1; }
repo_file="$(printf '%s\n' "$meta" | awk '{print $1}')"
repo_sha="$(printf '%s\n' "$meta" | awk '{print $2}')"
if [ -z "${repo_file}" ] || [ -z "${repo_sha}" ]; then
    echo "ERROR: incomplete Packages.gz entry for ${OPENBLAS_PKG}" >&2
    exit 1
fi

# Reuse a cached .deb only when its digest still matches the index.
OPENBLAS_DEB="${DOWNLOAD_CACHE_DIR}/$(basename "${repo_file}")"
if [ -f "${OPENBLAS_DEB}" ]; then
    actual="$(sha256sum "${OPENBLAS_DEB}" | awk '{print $1}')"
    [ "$actual" = "$repo_sha" ] || rm -f "${OPENBLAS_DEB}"
fi
if [ ! -f "${OPENBLAS_DEB}" ]; then
    echo "=== downloading ${OPENBLAS_PKG} ==="
    wget --show-progress -O "${OPENBLAS_DEB}" "https://deb.debian.org/debian/${repo_file}"
    actual="$(sha256sum "${OPENBLAS_DEB}" | awk '{print $1}')"
    [ "$actual" = "$repo_sha" ] || {
        echo "ERROR: sha256 mismatch for ${OPENBLAS_PKG}" >&2
        rm -f "${OPENBLAS_DEB}"
        exit 1
    }
fi

# Unpack the .deb in a scratch directory and copy the libopenblas shared
# objects (regular files and symlinks) into the cache. The trap also removes
# the HPL build directory once BUILD_TMP has been set further down.
TMP_DEB=$(mktemp -d)
trap 'rm -rf "${TMP_DEB}" "${BUILD_TMP:-}"' EXIT INT TERM
(
    cd "${TMP_DEB}"
    ar x "${OPENBLAS_DEB}"
    tar xf data.tar.*
)
find "${TMP_DEB}" \( -name 'libopenblas*.so*' \) \( -type f -o -type l \) \
    -exec cp -a {} "${CACHE_DIR}/lib/" \;
echo "=== OpenBLAS libs: $(ls "${CACHE_DIR}/lib/" | wc -l) files ==="

# Linking with -lopenblas needs an unversioned libopenblas.so name, which the
# runtime package does not ship, so point one at the extracted library.
OPENBLAS_SO="$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libopenblas.so.*' -type f | sort | head -1)"
[ -n "${OPENBLAS_SO}" ] || { echo "ERROR: libopenblas.so not extracted" >&2; exit 1; }
SONAME="$(basename "${OPENBLAS_SO}")"
# Best effort: a failure here surfaces later as a link error.
ln -sf "${SONAME}" "${CACHE_DIR}/lib/libopenblas.so" 2>/dev/null || true
ln -sf "${SONAME}" "${CACHE_DIR}/lib/libblas.so" 2>/dev/null || true
|
||||||
|
|
||||||
|
# ── build HPL ─────────────────────────────────────────────────────────────────
|
||||||
|
# Scratch directory for the whole build; the EXIT trap installed earlier
# removes it.
BUILD_TMP=$(mktemp -d)

# Unpack the source archive and enter its top-level hpl-* directory; the rest
# of the build runs from there.
cd "${BUILD_TMP}"
tar xf "${HPL_TAR}"
SRC_DIR="$(find . -maxdepth 1 -type d -name 'hpl-*' | head -1)"
if [ -z "${SRC_DIR}" ]; then
    echo "ERROR: HPL source dir not found"
    exit 1
fi
cd "${SRC_DIR}"
|
||||||
|
|
||||||
|
# Generate a single-process MPI stand-in so the build needs no MPI package.
# Rank is always 0 and size always 1; point-to-point calls are no-ops and the
# reductions copy the send buffer to the receive buffer.
# NOTE(review): the stub's types and macros exist only inside mpi_stub.c and
# nothing in this script writes an mpi.h into ${BUILD_TMP} (the include
# directory handed to HPL) — confirm HPL's sources compile without one.
cat <<'MPISTUB' > "${BUILD_TMP}/mpi_stub.c"
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

typedef int MPI_Comm;
typedef int MPI_Datatype;
typedef int MPI_Op;
typedef int MPI_Status;
typedef int MPI_Request;

#define MPI_COMM_WORLD 0
#define MPI_SUCCESS 0
#define MPI_DOUBLE 6
#define MPI_INT 5
#define MPI_SUM 0
#define MPI_MAX 1
#define MPI_MIN 2
#define MPI_BYTE 1
#define MPI_ANY_SOURCE -1
#define MPI_ANY_TAG -1
#define MPI_STATUS_IGNORE ((MPI_Status*)0)

int MPI_Init(int *argc, char ***argv) { (void)argc; (void)argv; return MPI_SUCCESS; }
int MPI_Finalize(void) { return MPI_SUCCESS; }
int MPI_Comm_rank(MPI_Comm c, int *rank) { (void)c; *rank = 0; return MPI_SUCCESS; }
int MPI_Comm_size(MPI_Comm c, int *size) { (void)c; *size = 1; return MPI_SUCCESS; }
int MPI_Bcast(void *b, int n, MPI_Datatype t, int r, MPI_Comm c)
{ (void)b;(void)n;(void)t;(void)r;(void)c; return MPI_SUCCESS; }
int MPI_Reduce(const void *s, void *r, int n, MPI_Datatype t, MPI_Op op, int root, MPI_Comm c) {
    (void)op;(void)root;(void)c;
    size_t sz = (t==MPI_DOUBLE)?sizeof(double):(t==MPI_INT)?sizeof(int):1;
    memcpy(r, s, (size_t)n * sz);
    return MPI_SUCCESS;
}
int MPI_Allreduce(const void *s, void *r, int n, MPI_Datatype t, MPI_Op op, MPI_Comm c)
{ return MPI_Reduce(s,r,n,t,op,0,c); }
int MPI_Send(const void *b, int n, MPI_Datatype t, int d, int tag, MPI_Comm c)
{ (void)b;(void)n;(void)t;(void)d;(void)tag;(void)c; return MPI_SUCCESS; }
int MPI_Recv(void *b, int n, MPI_Datatype t, int s, int tag, MPI_Comm c, MPI_Status *st)
{ (void)b;(void)n;(void)t;(void)s;(void)tag;(void)c;(void)st; return MPI_SUCCESS; }
int MPI_Sendrecv(const void *sb, int sn, MPI_Datatype st2, int dest, int stag,
                 void *rb, int rn, MPI_Datatype rt, int src, int rtag,
                 MPI_Comm c, MPI_Status *status)
{ (void)sb;(void)sn;(void)st2;(void)dest;(void)stag;
  (void)rb;(void)rn;(void)rt;(void)src;(void)rtag;(void)c;(void)status;
  return MPI_SUCCESS; }
int MPI_Irecv(void *b, int n, MPI_Datatype t, int s, int tag, MPI_Comm c, MPI_Request *req)
{ (void)b;(void)n;(void)t;(void)s;(void)tag;(void)c;(void)req; return MPI_SUCCESS; }
int MPI_Wait(MPI_Request *req, MPI_Status *st)
{ (void)req;(void)st; return MPI_SUCCESS; }
int MPI_Abort(MPI_Comm c, int code) { (void)c; exit(code); }
double MPI_Wtime(void) {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (double)tv.tv_sec + (double)tv.tv_usec * 1e-6;
}
MPISTUB
|
||||||
|
|
||||||
|
# Write Make.bee, the HPL architecture file for "arch=bee".
#
# The here-document is unquoted: ${...} is expanded now by the shell, while
# \$(...) is written out literally for make to expand.
#
# TOPdir is resolved once, here, to the absolute path of the source root (the
# current directory), so every makefile that includes Make.bee sees the same
# root regardless of the directory make runs it from.
HPL_TOPDIR="$(pwd)"
# ISA baseline for the compiled objects. The binary is shipped on a LiveCD and
# runs on machines other than the build host, so it must not be tied to the
# build host's CPU; set HPL_MARCH to choose a different baseline.
HPL_MARCH="${HPL_MARCH:-x86-64}"
# NOTE(review): stock HPL arch files also define HPL_INCLUDES, HPL_LIBS,
# ARCHIVER, ARFLAGS and RANLIB — confirm whether the build needs them here.
cat > Make.bee <<MAKEFILE
SHELL        = /bin/sh
CD           = cd
CP           = cp
LN_S         = ln -s
MKDIR        = mkdir -p
RM           = /bin/rm -f
TOUCH        = touch
ARCH         = bee

# Directories
TOPdir       = ${HPL_TOPDIR}
INCdir       = \$(TOPdir)/include
BINdir       = \$(TOPdir)/bin/\$(ARCH)
LIBdir       = \$(TOPdir)/lib/\$(ARCH)
HPLlib       = \$(LIBdir)/libhpl.a

# Compiler
CC           = gcc
CCNOOPT      = \$(HPL_DEFS)
CCFLAGS      = \$(HPL_DEFS) -O3 -march=${HPL_MARCH} -funroll-loops -fomit-frame-pointer

# Linker
LINKER       = gcc
LINKFLAGS    = \$(CCFLAGS)

# MPI (single-process stub — no actual MPI needed)
MPdir        =
MPinc        = -I${BUILD_TMP}
MPlib        = ${BUILD_TMP}/mpi_stub.o

# BLAS (OpenBLAS)
LAdir        = ${CACHE_DIR}/lib
LAinc        =
LAlib        = -L\$(LAdir) -Wl,-rpath,/usr/lib -lopenblas

HPL_OPTS     =
HPL_DEFS     = \$(HPL_OPTS) -DHPL_CALL_CBLAS
MAKEFILE
echo "=== Make.bee written ==="
|
||||||
|
|
||||||
|
# compile MPI stub
|
||||||
|
# Compile the MPI stub into the object file that Make.bee links against.
gcc -O2 -c -o "${BUILD_TMP}/mpi_stub.o" "${BUILD_TMP}/mpi_stub.c"

# Build HPL. Output goes to a log file so that make's own exit status decides
# success; piping make into tail would report tail's status instead and cut
# the error output to the last few lines.
echo "=== building HPL ${HPL_VERSION} ==="
HPL_BUILD_LOG="${BUILD_TMP}/hpl-build.log"
if make -j"$(nproc)" arch=bee >"${HPL_BUILD_LOG}" 2>&1; then
    tail -20 "${HPL_BUILD_LOG}"
else
    tail -50 "${HPL_BUILD_LOG}" >&2
    echo "ERROR: HPL build failed (make arch=bee)" >&2
    exit 1
fi

XHPL_BIN="bin/bee/xhpl"
[ -x "${XHPL_BIN}" ] || { echo "ERROR: xhpl not found after build" >&2; exit 1; }

# Publish the binary into the versioned cache; its presence is what lets the
# next run skip the build.
cp "${XHPL_BIN}" "${CACHE_DIR}/bin/xhpl"
chmod +x "${CACHE_DIR}/bin/xhpl"
echo "=== HPL build complete ==="
echo "binary: ${CACHE_DIR}/bin/xhpl"
echo "libs: $(ls "${CACHE_DIR}/lib/")"
|
||||||
@@ -1148,6 +1148,19 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
|||||||
echo "=== john injected ==="
|
echo "=== john injected ==="
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# --- build HPL (CPU LINPACK) — runs on all variants ---
|
||||||
|
run_step "build HPL ${HPL_VERSION}" "80-hpl" \
|
||||||
|
sh "${BUILDER_DIR}/build-hpl.sh" "${HPL_VERSION}" "${HPL_SHA256}" "${DIST_DIR}"
|
||||||
|
|
||||||
|
HPL_CACHE="${DIST_DIR}/hpl-${HPL_VERSION}"
|
||||||
|
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/lib/bee"
|
||||||
|
cp "${HPL_CACHE}/bin/xhpl" "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/xhpl"
|
||||||
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/xhpl"
|
||||||
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-hpl" 2>/dev/null || true
|
||||||
|
# Inject OpenBLAS runtime libs needed by xhpl
|
||||||
|
cp "${HPL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
||||||
|
echo "=== HPL injected: xhpl + $(ls "${HPL_CACHE}/lib/" | wc -l) OpenBLAS libs ==="
|
||||||
|
|
||||||
# --- embed build metadata ---
|
# --- embed build metadata ---
|
||||||
mkdir -p "${OVERLAY_STAGE_DIR}/etc"
|
mkdir -p "${OVERLAY_STAGE_DIR}/etc"
|
||||||
BUILD_DATE="$(date +%Y-%m-%d)"
|
BUILD_DATE="$(date +%Y-%m-%d)"
|
||||||
@@ -1180,6 +1193,7 @@ BUILD_DATE=${BUILD_DATE}
|
|||||||
GIT_COMMIT=${GIT_COMMIT}
|
GIT_COMMIT=${GIT_COMMIT}
|
||||||
DEBIAN_VERSION=${DEBIAN_VERSION}
|
DEBIAN_VERSION=${DEBIAN_VERSION}
|
||||||
DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
|
DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
|
||||||
|
HPL_VERSION=${HPL_VERSION}
|
||||||
${GPU_VERSION_LINE}
|
${GPU_VERSION_LINE}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
|
|||||||
@@ -5,69 +5,104 @@ echo "=== generating bee wallpaper ==="
|
|||||||
mkdir -p /usr/share/bee
|
mkdir -p /usr/share/bee
|
||||||
|
|
||||||
python3 - <<'PYEOF'
|
python3 - <<'PYEOF'
|
||||||
from PIL import Image, ImageDraw, ImageFont
|
from PIL import Image, ImageDraw, ImageFont, ImageFilter
|
||||||
import os
|
import os
|
||||||
|
|
||||||
W, H = 1920, 1080
|
W, H = 1920, 1080
|
||||||
|
|
||||||
LOGO = """\
|
ASCII_ART = [
|
||||||
\u2588\u2588\u2588\u2588\u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2557 \u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557
|
" ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗",
|
||||||
\u2588\u2588\u2554\u2550\u2550\u2550\u2550\u255d\u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2557\u2588\u2588\u2554\u2550\u2550\u2550\u2550\u255d\u255a\u2588\u2588\u2557 \u2588\u2588\u2554\u255d \u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2557\u2588\u2588\u2554\u2550\u2550\u2550\u2550\u255d\u2588\u2588\u2554\u2550\u2550\u2550\u2550\u255d
|
" ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝",
|
||||||
\u2588\u2588\u2588\u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2551\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557 \u255a\u2588\u2588\u2588\u2588\u2554\u255d \u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2588\u2588\u2588\u2588\u2554\u255d\u2588\u2588\u2588\u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2557
|
" █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗",
|
||||||
\u2588\u2588\u2554\u2550\u2550\u255d \u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2551\u255a\u2550\u2550\u2550\u2550\u2588\u2588\u2551 \u255a\u2588\u2588\u2554\u255d \u255a\u2550\u2550\u2550\u2550\u255d\u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2557\u2588\u2588\u2554\u2550\u2550\u255d \u2588\u2588\u2554\u2550\u2550\u255d
|
" ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝",
|
||||||
\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2551 \u2588\u2588\u2551\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2551 \u2588\u2588\u2551 \u2588\u2588\u2588\u2588\u2588\u2588\u2554\u255d\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557
|
" ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗",
|
||||||
\u255a\u2550\u2550\u2550\u2550\u2550\u2550\u255d\u255a\u2550\u255d \u255a\u2550\u255d\u255a\u2550\u2550\u2550\u2550\u2550\u2550\u255d \u255a\u2550\u255d \u255a\u2550\u2550\u2550\u2550\u2550\u255d \u255a\u2550\u2550\u2550\u2550\u2550\u2550\u255d\u255a\u2550\u2550\u2550\u2550\u2550\u2550\u255d
|
" ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝",
|
||||||
Hardware Audit LiveCD"""
|
]
|
||||||
|
SUBTITLE = " Hardware Audit LiveCD"
|
||||||
|
|
||||||
# Find a monospace font that supports box-drawing characters
|
FG = (0xF6, 0xD0, 0x47)
|
||||||
FONT_CANDIDATES = [
|
FG_DIM = (0xD4, 0xA9, 0x1C)
|
||||||
'/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf',
|
SHADOW = (0x5E, 0x47, 0x05)
|
||||||
'/usr/share/fonts/truetype/liberation/LiberationMono-Regular.ttf',
|
SUB = (0x96, 0x7A, 0x17)
|
||||||
'/usr/share/fonts/truetype/freefont/FreeMono.ttf',
|
BG = (0x05, 0x05, 0x05)
|
||||||
'/usr/share/fonts/truetype/noto/NotoMono-Regular.ttf',
|
|
||||||
|
MONO_FONT_CANDIDATES = [
|
||||||
|
'/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf',
|
||||||
|
'/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf',
|
||||||
|
'/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf',
|
||||||
|
'/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf',
|
||||||
|
]
|
||||||
|
SUB_FONT_CANDIDATES = [
|
||||||
|
'/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
|
||||||
|
'/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
|
||||||
|
'/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
|
||||||
|
'/usr/share/fonts/truetype/freefont/FreeSansBold.ttf',
|
||||||
]
|
]
|
||||||
|
|
||||||
font_path = None
|
|
||||||
for p in FONT_CANDIDATES:
|
|
||||||
if os.path.exists(p):
|
|
||||||
font_path = p
|
|
||||||
break
|
|
||||||
|
|
||||||
SIZE = 22
|
def load_font(candidates, size):
|
||||||
if font_path:
|
for path in candidates:
|
||||||
font_logo = ImageFont.truetype(font_path, SIZE)
|
if os.path.exists(path):
|
||||||
font_sub = ImageFont.truetype(font_path, SIZE)
|
return ImageFont.truetype(path, size)
|
||||||
else:
|
return ImageFont.load_default()
|
||||||
font_logo = ImageFont.load_default()
|
|
||||||
font_sub = font_logo
|
|
||||||
|
|
||||||
img = Image.new('RGB', (W, H), (0, 0, 0))
|
|
||||||
|
def mono_metrics(font):
    """Return (char_w, char_h) for `font`.

    char_w is the rounded text length of "M"; char_h is the height of the
    bounding box of "Mg". Both are measured on a throwaway W x H canvas.
    """
    scratch = ImageDraw.Draw(Image.new('L', (W, H), 0))
    advance = int(round(scratch.textlength("M", font=font)))
    _, top, _, bottom = scratch.textbbox((0, 0), "Mg", font=font)
    return advance, bottom - top
|
||||||
|
|
||||||
|
|
||||||
|
def render_ascii_mask(font, lines, char_w, char_h, line_gap):
    """Draw `lines` on a fixed character grid and return the 'L' mode mask.

    Each non-space character is drawn at (column * char_w,
    row * (char_h + line_gap)) with value 255 on a background of 0. The mask
    is as wide as the longest line and as tall as all rows plus the gaps
    between them.
    """
    n_rows = len(lines)
    n_cols = max(map(len, lines))
    canvas = Image.new('L', (n_cols * char_w, n_rows * char_h + line_gap * (n_rows - 1)), 0)
    pen = ImageDraw.Draw(canvas)
    row_pitch = char_h + line_gap
    for r, text in enumerate(lines):
        for c, glyph in enumerate(text):
            # Spaces are skipped so nothing is drawn in empty cells.
            if glyph != ' ':
                pen.text((c * char_w, r * row_pitch), glyph, font=font, fill=255)
    return canvas
|
||||||
|
|
||||||
|
|
||||||
|
img = Image.new('RGB', (W, H), BG)
|
||||||
draw = ImageDraw.Draw(img)
|
draw = ImageDraw.Draw(img)
|
||||||
|
|
||||||
# Measure logo block line by line to avoid font ascender offset
|
# Soft amber glow under the logo without depending on font rendering.
|
||||||
lines = LOGO.split('\n')
|
glow = Image.new('RGBA', (W, H), (0, 0, 0, 0))
|
||||||
logo_lines = lines[:6]
|
glow_draw = ImageDraw.Draw(glow)
|
||||||
sub_line = lines[6] if len(lines) > 6 else ''
|
glow_draw.ellipse((360, 250, 1560, 840), fill=(180, 120, 10, 56))
|
||||||
|
glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
|
||||||
|
glow = glow.filter(ImageFilter.GaussianBlur(60))
|
||||||
|
img = Image.alpha_composite(img.convert('RGBA'), glow)
|
||||||
|
|
||||||
line_h = SIZE + 2
|
font_logo = load_font(MONO_FONT_CANDIDATES, 64)
|
||||||
block_h = len(logo_lines) * line_h + 8 + (SIZE if sub_line else 0)
|
char_w, char_h = mono_metrics(font_logo)
|
||||||
|
logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 8)
|
||||||
|
logo_w, logo_h = logo_mask.size
|
||||||
|
logo_x = (W - logo_w) // 2
|
||||||
|
logo_y = 270
|
||||||
|
|
||||||
# Width: measure the widest logo line
|
shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(2))
|
||||||
max_w = 0
|
img.paste(SHADOW, (logo_x + 16, logo_y + 14), shadow_mask)
|
||||||
for line in logo_lines:
|
img.paste(FG_DIM, (logo_x + 8, logo_y + 7), logo_mask)
|
||||||
bb = draw.textbbox((0, 0), line, font=font_logo)
|
img.paste(FG, (logo_x, logo_y), logo_mask)
|
||||||
max_w = max(max_w, bb[2] - bb[0])
|
|
||||||
|
|
||||||
x = (W - max_w) // 2
|
font_sub = load_font(SUB_FONT_CANDIDATES, 30)
|
||||||
y = (H - block_h) // 2
|
sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
|
||||||
|
sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
|
||||||
|
sub_y = logo_y + logo_h + 48
|
||||||
|
draw = ImageDraw.Draw(img)
|
||||||
|
draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
|
||||||
|
draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
|
||||||
|
|
||||||
cy = y
|
img = img.convert('RGB')
|
||||||
for line in logo_lines:
|
|
||||||
draw.text((x, cy), line, font=font_logo, fill=(0xf6, 0xc9, 0x0e))
|
|
||||||
cy += line_h
|
|
||||||
cy += 8
|
|
||||||
if sub_line:
|
|
||||||
draw.text((x, cy), sub_line, font=font_sub, fill=(0x80, 0x68, 0x18))
|
|
||||||
|
|
||||||
img.save('/usr/share/bee/wallpaper.png', optimize=True)
|
img.save('/usr/share/bee/wallpaper.png', optimize=True)
|
||||||
print('wallpaper written: /usr/share/bee/wallpaper.png')
|
print('wallpaper written: /usr/share/bee/wallpaper.png')
|
||||||
|
|||||||
97
iso/overlay/usr/local/bin/bee-hpl
Executable file
97
iso/overlay/usr/local/bin/bee-hpl
Executable file
@@ -0,0 +1,97 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# bee-hpl — run HPL (High Performance LINPACK) with auto-sized problem.
|
||||||
|
#
|
||||||
|
# Generates HPL.dat based on available RAM, runs xhpl, and prints standard
|
||||||
|
# HPL output. The WR... line with Gflops is parsed by the bee audit tool.
|
||||||
|
#
|
||||||
|
# Usage: bee-hpl [--mem-fraction 0.80] [--nb 256] [--seconds N]
|
||||||
|
#
|
||||||
|
# --mem-fraction fraction of total RAM to use for the matrix (default 0.80)
|
||||||
|
# --nb block size; 256 is good for modern CPUs (default 256)
|
||||||
|
# --seconds ignored — HPL runtime is determined by problem size; kept
|
||||||
|
# for interface compatibility with other bee stress tools
|
||||||
|
|
||||||
|
set -eu

XHPL="/usr/local/lib/bee/xhpl"
MEM_FRACTION="0.80"
NB=256

# Print the usage line to stderr and exit with status 2.
usage() {
    echo "usage: $0 [--mem-fraction 0.80] [--nb 256] [--seconds N]" >&2
    exit 2
}

while [ "$#" -gt 0 ]; do
    case "$1" in
        --mem-fraction) [ "$#" -ge 2 ] || usage; MEM_FRACTION="$2"; shift 2 ;;
        --nb)           [ "$#" -ge 2 ] || usage; NB="$2"; shift 2 ;;
        --seconds)      [ "$#" -ge 2 ] || usage; shift 2 ;;  # accepted, ignored
        *)              usage ;;
    esac
done

# Reject values the sizing formula cannot use: NB is a divisor and must be a
# positive integer; the fraction must lie in (0, 1] so the matrix cannot be
# sized larger than total RAM.
case "${NB}" in
    ''|*[!0-9]*) echo "ERROR: --nb must be a positive integer, got '${NB}'" >&2; usage ;;
esac
[ "${NB}" -gt 0 ] || { echo "ERROR: --nb must be greater than 0" >&2; usage; }
awk -v frac="${MEM_FRACTION}" 'BEGIN {
    ok = (frac ~ /^[0-9]*\.?[0-9]+$/ && frac + 0 > 0 && frac + 0 <= 1)
    exit (ok ? 0 : 1)
}' || { echo "ERROR: --mem-fraction must be a number in (0, 1], got '${MEM_FRACTION}'" >&2; usage; }

[ -x "${XHPL}" ] || { echo "ERROR: xhpl not found at ${XHPL}" >&2; exit 1; }

# Total RAM: /proc/meminfo reports MemTotal in kB.
TOTAL_KB=$(awk '/^MemTotal:/ { print $2; exit }' /proc/meminfo)
[ -n "${TOTAL_KB}" ] || { echo "ERROR: cannot read MemTotal from /proc/meminfo" >&2; exit 1; }
TOTAL_BYTES=$(( TOTAL_KB * 1024 ))

# N = floor(sqrt(fraction * total_bytes / 8)), rounded down to a multiple of
# NB and never below NB (8 bytes per matrix element). awk supplies the
# floating-point sqrt.
N=$(awk -v total="${TOTAL_BYTES}" -v frac="${MEM_FRACTION}" -v nb="${NB}" '
    BEGIN {
        raw = int(sqrt(total * frac / 8.0))
        n = int(raw / nb) * nb
        if (n < nb) n = nb
        print n
    }')

# key=value header describing the run, printed ahead of xhpl's own output.
echo "loader=bee-hpl"
echo "total_ram_mb=$(( TOTAL_KB / 1024 ))"
echo "matrix_n=${N}"
echo "block_nb=${NB}"
echo "mem_fraction=${MEM_FRACTION}"

# xhpl reads HPL.dat from its working directory, so generate it in a private
# temp directory and run from there; the directory is removed on exit.
RUNDIR=$(mktemp -d)
trap 'rm -rf "${RUNDIR}"' EXIT INT TERM

cat > "${RUNDIR}/HPL.dat" <<DAT
HPLinpack benchmark input file
Innovative Computing Laboratory, University of Tennessee
HPL.out output file name (if any)
6 device out (6=stdout, 7=stderr, file)
1 # of problems sizes (N)
${N} Ns
1 # of NBs
${NB} NBs
0 PMAP process mapping (0=Row-,1=Column-major)
1 # of process grids (P x Q)
1 Ps
1 Qs
16.0 threshold
1 # of panel fact
2 PFACTs (0=left, 1=Crout, 2=Right)
1 # of recursive stopping criterium
4 NBMINs (>= 1)
1 # of panels in recursion
2 NDIVs
1 # of recursive panel fact.
1 RFACTs (0=left, 1=Crout, 2=Right)
1 # of broadcast
1 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)
1 # of lookahead depth
1 DEPTHs (>=0)
2 SWAP (0=bin-exch,1=long,2=mix)
64 swapping threshold
0 L1 in (0=transposed,1=no-transposed) form
0 U in (0=transposed,1=no-transposed) form
1 Equilibration (0=no,1=yes)
8 memory alignment in double (> 0)
DAT

cd "${RUNDIR}"
echo "---"
"${XHPL}"
|
||||||
Reference in New Issue
Block a user