Compare commits
57 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 05241f2e0e | |||
|
|
c1690a084b | ||
|
|
9481ca2805 | ||
|
|
a78fdadd88 | ||
|
|
4ef403898f | ||
| 025548ab3c | |||
|
|
e0d94d7f47 | ||
|
|
13899aa864 | ||
|
|
f345d8a89d | ||
|
|
4715059ac0 | ||
|
|
0660a40287 | ||
|
|
67369d9b7b | ||
|
|
3f41a026ca | ||
|
|
0ee4f46537 | ||
| 8db40b098a | |||
| 16e7ae00e7 | |||
| b2f8626fee | |||
| dd26e03b2d | |||
| 6937a4c6ec | |||
| b9be93c213 | |||
| d1a22d782d | |||
|
|
0a4bb596f6 | ||
|
|
531d1ca366 | ||
|
|
93cfa78e8c | ||
|
|
1358485f2b | ||
| 8fe20ba678 | |||
| d973231f37 | |||
| f5d175f488 | |||
| fa00667750 | |||
|
|
c7d2816a7f | ||
|
|
d2eadedff2 | ||
|
|
a98c4d7461 | ||
|
|
2354ae367d | ||
|
|
0d0e1f55a7 | ||
|
|
35f4c53887 | ||
|
|
981315e6fd | ||
|
|
fc5c100a29 | ||
| 6e94216f3b | |||
| 53455063b9 | |||
| 4602f97836 | |||
| c65d3ae3b1 | |||
| 7a21c370e4 | |||
| a493e3ab5b | |||
| 19b4803ec7 | |||
| 1bdfb1e9ca | |||
| c5d6b30177 | |||
| 5b9015451e | |||
| d1a6863ceb | |||
| f9aa05de8e | |||
| a9ccea8cca | |||
| fc5c985fb5 | |||
| 5eb3baddb4 | |||
| a6ac13b5d3 | |||
| 4003cb7676 | |||
| 2875313ba0 | |||
| f1621efee4 | |||
| 4461249cc3 |
@@ -382,9 +382,9 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
|||||||
archive, err = application.RunNvidiaAcceptancePack("", logLine)
|
archive, err = application.RunNvidiaAcceptancePack("", logLine)
|
||||||
}
|
}
|
||||||
case "memory":
|
case "memory":
|
||||||
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", logLine)
|
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", 256, 1, logLine)
|
||||||
case "storage":
|
case "storage":
|
||||||
archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", logLine)
|
archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", false, logLine)
|
||||||
case "cpu":
|
case "cpu":
|
||||||
dur := *duration
|
dur := *duration
|
||||||
if dur <= 0 {
|
if dur <= 0 {
|
||||||
|
|||||||
@@ -117,13 +117,15 @@ type satRunner interface {
|
|||||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||||
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
|
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
|
||||||
RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error)
|
||||||
RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
ResetNvidiaGPU(index int) (string, error)
|
||||||
|
RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error)
|
||||||
|
RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error)
|
||||||
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||||
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
||||||
DetectGPUVendor() string
|
DetectGPUVendor() string
|
||||||
@@ -188,6 +190,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
|||||||
}
|
}
|
||||||
result := collector.Run(runtimeMode)
|
result := collector.Run(runtimeMode)
|
||||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
|
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
|
||||||
|
writePSUStatusesToDB(a.StatusDB, result.Hardware.PowerSupplies)
|
||||||
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
||||||
result.Runtime = &health
|
result.Runtime = &health
|
||||||
}
|
}
|
||||||
@@ -521,6 +524,15 @@ func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
|||||||
return a.sat.ListNvidiaGPUs()
|
return a.sat.ListNvidiaGPUs()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
|
||||||
|
return a.sat.ListNvidiaGPUStatuses()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ResetNvidiaGPU(index int) (ActionResult, error) {
|
||||||
|
out, err := a.sat.ResetNvidiaGPU(index)
|
||||||
|
return ActionResult{Title: fmt.Sprintf("Reset NVIDIA GPU %d", index), Body: strings.TrimSpace(out)}, err
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
|
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
@@ -555,11 +567,11 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
|
|||||||
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
@@ -591,14 +603,14 @@ func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts p
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, 256, 1, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, logFunc)
|
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, sizeMB, passes, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||||
@@ -623,14 +635,14 @@ func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (Actio
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, false, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
return a.sat.RunStorageAcceptancePack(ctx, baseDir, logFunc)
|
return a.sat.RunStorageAcceptancePack(ctx, baseDir, extended, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||||
@@ -915,6 +927,41 @@ func bodyOr(body, fallback string) string {
|
|||||||
return body
|
return body
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// writePSUStatusesToDB records PSU statuses collected during audit into the
|
||||||
|
// component-status DB so they are visible in the Hardware Summary card.
|
||||||
|
// PSU status is sourced from IPMI (ipmitool fru + sdr) during audit.
|
||||||
|
func writePSUStatusesToDB(db *ComponentStatusDB, psus []schema.HardwarePowerSupply) {
|
||||||
|
if db == nil || len(psus) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
const source = "audit:ipmi"
|
||||||
|
worstStatus := "OK"
|
||||||
|
for _, psu := range psus {
|
||||||
|
if psu.Status == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
slot := "?"
|
||||||
|
if psu.Slot != nil {
|
||||||
|
slot = *psu.Slot
|
||||||
|
}
|
||||||
|
st := *psu.Status
|
||||||
|
detail := ""
|
||||||
|
if psu.ErrorDescription != nil {
|
||||||
|
detail = *psu.ErrorDescription
|
||||||
|
}
|
||||||
|
db.Record("psu:"+slot, source, st, detail)
|
||||||
|
switch st {
|
||||||
|
case "Critical":
|
||||||
|
worstStatus = "Critical"
|
||||||
|
case "Warning":
|
||||||
|
if worstStatus != "Critical" {
|
||||||
|
worstStatus = "Warning"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
db.Record("psu:all", source, worstStatus, "")
|
||||||
|
}
|
||||||
|
|
||||||
func ReadRuntimeHealth(path string) (schema.RuntimeHealth, error) {
|
func ReadRuntimeHealth(path string) (schema.RuntimeHealth, error) {
|
||||||
raw, err := os.ReadFile(path)
|
raw, err := os.ReadFile(path)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@@ -135,6 +135,8 @@ type fakeSAT struct {
|
|||||||
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
||||||
runAMDPackFn func(string) (string, error)
|
runAMDPackFn func(string) (string, error)
|
||||||
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
||||||
|
listNvidiaGPUStatusesFn func() ([]platform.NvidiaGPUStatus, error)
|
||||||
|
resetNvidiaGPUFn func(int) (string, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
||||||
@@ -159,7 +161,7 @@ func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir
|
|||||||
return f.runNvidiaFn(baseDir)
|
return f.runNvidiaFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ int, _ func(string)) (string, error) {
|
||||||
if f.runNvidiaComputeFn != nil {
|
if f.runNvidiaComputeFn != nil {
|
||||||
return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
|
return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
|
||||||
}
|
}
|
||||||
@@ -201,11 +203,25 @@ func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
|||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
func (f fakeSAT) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
|
||||||
|
if f.listNvidiaGPUStatusesFn != nil {
|
||||||
|
return f.listNvidiaGPUStatusesFn()
|
||||||
|
}
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) ResetNvidiaGPU(index int) (string, error) {
|
||||||
|
if f.resetNvidiaGPUFn != nil {
|
||||||
|
return f.resetNvidiaGPUFn(index)
|
||||||
|
}
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _, _ int, _ func(string)) (string, error) {
|
||||||
return f.runMemoryFn(baseDir)
|
return f.runMemoryFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ bool, _ func(string)) (string, error) {
|
||||||
return f.runStorageFn(baseDir)
|
return f.runStorageFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -526,8 +542,6 @@ func TestActionResultsUseFallbackBody(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
tmp := t.TempDir()
|
tmp := t.TempDir()
|
||||||
oldExportDir := DefaultExportDir
|
oldExportDir := DefaultExportDir
|
||||||
DefaultExportDir = tmp
|
DefaultExportDir = tmp
|
||||||
@@ -564,8 +578,6 @@ func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
|
func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
tmp := t.TempDir()
|
tmp := t.TempDir()
|
||||||
oldExportDir := DefaultExportDir
|
oldExportDir := DefaultExportDir
|
||||||
DefaultExportDir = tmp
|
DefaultExportDir = tmp
|
||||||
@@ -627,8 +639,6 @@ func TestRunNvidiaAcceptancePackResult(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestRunSATDefaultsToExportDir(t *testing.T) {
|
func TestRunSATDefaultsToExportDir(t *testing.T) {
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
oldSATBaseDir := DefaultSATBaseDir
|
oldSATBaseDir := DefaultSATBaseDir
|
||||||
DefaultSATBaseDir = "/tmp/export/bee-sat"
|
DefaultSATBaseDir = "/tmp/export/bee-sat"
|
||||||
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
|
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
|
||||||
@@ -805,6 +815,9 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
for _, want := range []string{
|
for _, want := range []string{
|
||||||
"/system/ip-link.txt",
|
"/system/ip-link.txt",
|
||||||
"/system/ip-link-stats.txt",
|
"/system/ip-link-stats.txt",
|
||||||
|
"/system/kernel-aer-nvidia.txt",
|
||||||
|
"/system/lspci-nvidia-bridges-vv.txt",
|
||||||
|
"/system/pcie-aer-sysfs.txt",
|
||||||
"/system/ethtool-info.txt",
|
"/system/ethtool-info.txt",
|
||||||
"/system/ethtool-link.txt",
|
"/system/ethtool-link.txt",
|
||||||
"/system/ethtool-module.txt",
|
"/system/ethtool-module.txt",
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ package app
|
|||||||
import (
|
import (
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"strconv"
|
||||||
"sort"
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
@@ -18,6 +19,7 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *C
|
|||||||
}
|
}
|
||||||
if summary, ok := loadLatestSATSummary(baseDir, "gpu-nvidia-"); ok {
|
if summary, ok := loadLatestSATSummary(baseDir, "gpu-nvidia-"); ok {
|
||||||
applyGPUVendorSAT(snap.PCIeDevices, "nvidia", summary)
|
applyGPUVendorSAT(snap.PCIeDevices, "nvidia", summary)
|
||||||
|
applyNvidiaPerGPUStatus(snap.PCIeDevices, baseDir)
|
||||||
}
|
}
|
||||||
if summary, ok := loadLatestSATSummary(baseDir, "memory-"); ok {
|
if summary, ok := loadLatestSATSummary(baseDir, "memory-"); ok {
|
||||||
applyMemorySAT(snap.Memory, summary)
|
applyMemorySAT(snap.Memory, summary)
|
||||||
@@ -32,6 +34,100 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *C
|
|||||||
applyComponentStatusDB(snap, db)
|
applyComponentStatusDB(snap, db)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// nvidiaPerGPUStatus holds the outcome read from one per-GPU status file of an
// NVIDIA SAT run.
type nvidiaPerGPUStatus struct {
	// runStatus is the run_status value of the status file; reason is its
	// reason value.
	runStatus, reason string
}
|
||||||
|
|
||||||
|
func applyNvidiaPerGPUStatus(devs []schema.HardwarePCIeDevice, baseDir string) {
|
||||||
|
statusByIndex, ts, ok := loadLatestNvidiaPerGPUStatus(baseDir)
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for i := range devs {
|
||||||
|
if devs[i].Telemetry == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
rawIdx, ok := devs[i].Telemetry["nvidia_gpu_index"]
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
idx, ok := telemetryInt(rawIdx)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
st, ok := statusByIndex[idx]
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
status, description, ok := satKeyStatus(st.runStatus, firstNonEmpty(strings.TrimSpace(st.reason), "nvidia GPU SAT"))
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
mergeComponentStatusPreferDetail(&devs[i].HardwareComponentStatus, ts, status, description)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadLatestNvidiaPerGPUStatus(baseDir string) (map[int]nvidiaPerGPUStatus, string, bool) {
|
||||||
|
matches, err := filepath.Glob(filepath.Join(baseDir, "gpu-nvidia-*"))
|
||||||
|
if err != nil || len(matches) == 0 {
|
||||||
|
return nil, "", false
|
||||||
|
}
|
||||||
|
sort.Strings(matches)
|
||||||
|
runDir := matches[len(matches)-1]
|
||||||
|
summaryRaw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||||
|
if err != nil {
|
||||||
|
return nil, "", false
|
||||||
|
}
|
||||||
|
summaryKV := parseKeyValueSummary(string(summaryRaw))
|
||||||
|
runAtUTC := strings.TrimSpace(summaryKV["run_at_utc"])
|
||||||
|
files, err := filepath.Glob(filepath.Join(runDir, "gpu-*-status.txt"))
|
||||||
|
if err != nil || len(files) == 0 {
|
||||||
|
return nil, "", false
|
||||||
|
}
|
||||||
|
out := make(map[int]nvidiaPerGPUStatus, len(files))
|
||||||
|
for _, file := range files {
|
||||||
|
raw, err := os.ReadFile(file)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
kv := parseKeyValueSummary(string(raw))
|
||||||
|
idx, err := strconv.Atoi(strings.TrimSpace(kv["gpu_index"]))
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out[idx] = nvidiaPerGPUStatus{
|
||||||
|
runStatus: strings.ToUpper(strings.TrimSpace(kv["run_status"])),
|
||||||
|
reason: strings.TrimSpace(kv["reason"]),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(out) == 0 {
|
||||||
|
return nil, "", false
|
||||||
|
}
|
||||||
|
return out, runAtUTC, true
|
||||||
|
}
|
||||||
|
|
||||||
|
// telemetryInt converts a loosely typed telemetry value to an int.
//
// Accepted inputs are int, int32, int64, float64 and strings holding a decimal
// integer (surrounding whitespace is ignored). A float64 is accepted only when
// it holds an exact integer value: fractional numbers, NaN and infinities are
// rejected instead of being truncated, because a truncated value would name a
// different index than the one that was stored.
//
// The second result is false when v has an unsupported type or cannot be
// converted.
func telemetryInt(v any) (int, bool) {
	switch value := v.(type) {
	case int:
		return value, true
	case int32:
		return int(value), true
	case int64:
		return int(value), true
	case float64:
		n := int(value)
		// The round trip fails for fractions, NaN, infinities and values
		// outside the int range.
		if float64(n) != value {
			return 0, false
		}
		return n, true
	case string:
		n, err := strconv.Atoi(strings.TrimSpace(value))
		if err != nil {
			return 0, false
		}
		return n, true
	default:
		return 0, false
	}
}
|
||||||
|
|
||||||
type satSummary struct {
|
type satSummary struct {
|
||||||
runAtUTC string
|
runAtUTC string
|
||||||
overall string
|
overall string
|
||||||
@@ -176,6 +272,31 @@ func mergeComponentStatus(component *schema.HardwareComponentStatus, changedAt,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func mergeComponentStatusPreferDetail(component *schema.HardwareComponentStatus, changedAt, satStatus, description string) {
|
||||||
|
if component == nil || satStatus == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
current := strings.TrimSpace(ptrString(component.Status))
|
||||||
|
newSeverity := statusSeverity(satStatus)
|
||||||
|
currentSeverity := statusSeverity(current)
|
||||||
|
if current == "" || current == "Unknown" || newSeverity > currentSeverity {
|
||||||
|
mergeComponentStatus(component, changedAt, satStatus, description)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if newSeverity == currentSeverity && strings.TrimSpace(description) != "" {
|
||||||
|
component.Status = appStringPtr(satStatus)
|
||||||
|
component.ErrorDescription = appStringPtr(description)
|
||||||
|
if strings.TrimSpace(changedAt) != "" {
|
||||||
|
component.StatusChangedAt = appStringPtr(changedAt)
|
||||||
|
component.StatusHistory = append(component.StatusHistory, schema.HardwareStatusHistory{
|
||||||
|
Status: satStatus,
|
||||||
|
ChangedAt: changedAt,
|
||||||
|
Details: appStringPtr(description),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func statusSeverity(status string) int {
|
func statusSeverity(status string) int {
|
||||||
switch strings.TrimSpace(status) {
|
switch strings.TrimSpace(status) {
|
||||||
case "Critical":
|
case "Critical":
|
||||||
|
|||||||
@@ -59,3 +59,51 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
|
|||||||
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
|
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestApplyLatestSATStatusesMarksNvidiaGPUByPerGPUStatusFile(t *testing.T) {
|
||||||
|
baseDir := t.TempDir()
|
||||||
|
runDir := filepath.Join(baseDir, "gpu-nvidia-20260407-162123")
|
||||||
|
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte("run_at_utc=2026-04-07T16:21:23Z\noverall_status=FAILED\n"), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, "gpu-1-status.txt"), []byte("gpu_index=1\ngpu_name=NVIDIA H100 PCIe\nrun_status=FAILED\nreason=GPU requires reset\n"), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
class := "VideoController"
|
||||||
|
manufacturer := "NVIDIA Corporation"
|
||||||
|
bdf0 := "0000:4b:00.0"
|
||||||
|
bdf1 := "0000:4f:00.0"
|
||||||
|
snap := schema.HardwareSnapshot{
|
||||||
|
PCIeDevices: []schema.HardwarePCIeDevice{
|
||||||
|
{
|
||||||
|
DeviceClass: &class,
|
||||||
|
Manufacturer: &manufacturer,
|
||||||
|
BDF: &bdf0,
|
||||||
|
Telemetry: map[string]any{"nvidia_gpu_index": 0},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
DeviceClass: &class,
|
||||||
|
Manufacturer: &manufacturer,
|
||||||
|
BDF: &bdf1,
|
||||||
|
Telemetry: map[string]any{"nvidia_gpu_index": 1},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||||
|
|
||||||
|
if snap.PCIeDevices[1].Status == nil || *snap.PCIeDevices[1].Status != "Critical" {
|
||||||
|
t.Fatalf("gpu1 status=%v want Critical", snap.PCIeDevices[1].Status)
|
||||||
|
}
|
||||||
|
if snap.PCIeDevices[1].ErrorDescription == nil || *snap.PCIeDevices[1].ErrorDescription != "GPU requires reset failed" {
|
||||||
|
got := "<nil>"
|
||||||
|
if snap.PCIeDevices[1].ErrorDescription != nil {
|
||||||
|
got = *snap.PCIeDevices[1].ErrorDescription
|
||||||
|
}
|
||||||
|
t.Fatalf("gpu1 error=%q want per-gpu reason", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -40,17 +40,75 @@ var supportBundleCommands = []struct {
|
|||||||
{name: "system/mount.txt", cmd: []string{"mount"}},
|
{name: "system/mount.txt", cmd: []string{"mount"}},
|
||||||
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
||||||
{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
|
{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
|
||||||
|
{name: "system/kernel-aer-nvidia.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if command -v dmesg >/dev/null 2>&1; then
|
||||||
|
dmesg | grep -iE 'AER|NVRM|Xid|pcieport|nvidia' || echo "no AER/NVRM/Xid kernel messages found"
|
||||||
|
else
|
||||||
|
echo "dmesg not found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if ! command -v lspci >/dev/null 2>&1; then
|
||||||
|
echo "lspci not found"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
found=0
|
||||||
|
for gpu in $(lspci -Dn | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ {print $1}'); do
|
||||||
|
found=1
|
||||||
|
echo "=== GPU $gpu ==="
|
||||||
|
lspci -s "$gpu" -vv 2>&1 || true
|
||||||
|
bridge=$(basename "$(readlink -f "/sys/bus/pci/devices/$gpu/.." 2>/dev/null)" 2>/dev/null)
|
||||||
|
if [ -n "$bridge" ] && [ "$bridge" != "$gpu" ]; then
|
||||||
|
echo
|
||||||
|
echo "=== UPSTREAM $bridge for $gpu ==="
|
||||||
|
lspci -s "$bridge" -vv 2>&1 || true
|
||||||
|
fi
|
||||||
|
echo
|
||||||
|
done
|
||||||
|
if [ "$found" -eq 0 ]; then
|
||||||
|
echo "no NVIDIA PCI devices found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
|
{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
|
||||||
for d in /sys/bus/pci/devices/*/; do
|
for d in /sys/bus/pci/devices/*/; do
|
||||||
vendor=$(cat "$d/vendor" 2>/dev/null)
|
vendor=$(cat "$d/vendor" 2>/dev/null)
|
||||||
[ "$vendor" = "0x10de" ] || continue
|
[ "$vendor" = "0x10de" ] || continue
|
||||||
dev=$(basename "$d")
|
class=$(cat "$d/class" 2>/dev/null)
|
||||||
|
case "$class" in
|
||||||
|
0x030000|0x030200) ;;
|
||||||
|
*) continue ;;
|
||||||
|
esac
|
||||||
|
dev=$(basename "$d")
|
||||||
echo "=== $dev ==="
|
echo "=== $dev ==="
|
||||||
for f in current_link_speed current_link_width max_link_speed max_link_width; do
|
for f in current_link_speed current_link_width max_link_speed max_link_width; do
|
||||||
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
||||||
done
|
done
|
||||||
done
|
done
|
||||||
|
`}},
|
||||||
|
{name: "system/pcie-aer-sysfs.txt", cmd: []string{"sh", "-c", `
|
||||||
|
found=0
|
||||||
|
for dev in /sys/bus/pci/devices/*; do
|
||||||
|
[ -e "$dev" ] || continue
|
||||||
|
bdf=$(basename "$dev")
|
||||||
|
block=""
|
||||||
|
for f in aer_dev_correctable aer_dev_fatal aer_dev_nonfatal aer_rootport_total_err_cor aer_rootport_total_err_fatal aer_rootport_total_err_nonfatal; do
|
||||||
|
if [ -r "$dev/$f" ]; then
|
||||||
|
if [ -z "$block" ]; then
|
||||||
|
block=1
|
||||||
|
found=1
|
||||||
|
echo "=== $bdf ==="
|
||||||
|
fi
|
||||||
|
printf " %-30s %s\n" "$f" "$(cat "$dev/$f" 2>/dev/null)"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
if [ -n "$block" ]; then
|
||||||
|
echo
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
if [ "$found" -eq 0 ]; then
|
||||||
|
echo "no PCIe AER sysfs counters found"
|
||||||
|
fi
|
||||||
`}},
|
`}},
|
||||||
{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
|
{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
|
||||||
if ! command -v ethtool >/dev/null 2>&1; then
|
if ! command -v ethtool >/dev/null 2>&1; then
|
||||||
@@ -139,7 +197,7 @@ var supportBundleOptionalFiles = []struct {
|
|||||||
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
||||||
}
|
}
|
||||||
|
|
||||||
const supportBundleGlob = "bee-support-*.tar.gz"
|
const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
|
||||||
|
|
||||||
func BuildSupportBundle(exportDir string) (string, error) {
|
func BuildSupportBundle(exportDir string) (string, error) {
|
||||||
exportDir = strings.TrimSpace(exportDir)
|
exportDir = strings.TrimSpace(exportDir)
|
||||||
@@ -153,9 +211,14 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
|||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
host := sanitizeFilename(hostnameOr("unknown"))
|
now := time.Now().UTC()
|
||||||
ts := time.Now().UTC().Format("20060102-150405")
|
date := now.Format("2006-01-02")
|
||||||
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s", host, ts))
|
tod := now.Format("15:04:05")
|
||||||
|
ver := bundleVersion()
|
||||||
|
model := serverModelForBundle()
|
||||||
|
sn := serverSerialForBundle()
|
||||||
|
|
||||||
|
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-stage-%s-%s", sanitizeFilename(hostnameOr("unknown")), now.Format("20060102-150405")))
|
||||||
if err := os.MkdirAll(stageRoot, 0755); err != nil {
|
if err := os.MkdirAll(stageRoot, 0755); err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
@@ -187,7 +250,8 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
|||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
archivePath := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s.tar.gz", host, ts))
|
archiveName := fmt.Sprintf("%s (BEE-SP v%s) %s %s %s.tar.gz", date, ver, model, sn, tod)
|
||||||
|
archivePath := filepath.Join(os.TempDir(), archiveName)
|
||||||
if err := createSupportTarGz(archivePath, stageRoot); err != nil {
|
if err := createSupportTarGz(archivePath, stageRoot); err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
@@ -344,6 +408,60 @@ func writeManifest(dst, exportDir, stageRoot string) error {
|
|||||||
return os.WriteFile(dst, []byte(body.String()), 0644)
|
return os.WriteFile(dst, []byte(body.String()), 0644)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func bundleVersion() string {
|
||||||
|
v := buildVersion()
|
||||||
|
v = strings.TrimPrefix(v, "v")
|
||||||
|
v = strings.TrimPrefix(v, "V")
|
||||||
|
if v == "" || v == "unknown" {
|
||||||
|
return "0.0"
|
||||||
|
}
|
||||||
|
return v
|
||||||
|
}
|
||||||
|
|
||||||
|
// serverModelForBundle returns the "Product Name" reported by
// `dmidecode -t 1`, normalised so it can be used as a single field of the
// support-bundle archive file name.
//
// Every whitespace or control character and every path separator ('/' or
// '\\') in the reported name is replaced by '_'. Without this a product name
// such as "X11DPi-N/T" would turn the archive name into a nested path when it
// is joined onto the temp directory.
//
// It returns "unknown" when dmidecode cannot be executed, when no
// "Product Name" line is present, or when the reported value is empty.
func serverModelForBundle() string {
	raw, err := exec.Command("dmidecode", "-t", "1").Output()
	if err != nil {
		return "unknown"
	}
	for _, line := range strings.Split(string(raw), "\n") {
		key, val, ok := strings.Cut(strings.TrimSpace(line), ": ")
		if !ok || strings.TrimSpace(key) != "Product Name" {
			continue
		}
		val = strings.TrimSpace(val)
		if val == "" {
			return "unknown"
		}
		// Keep the name readable but make it safe as one file-name token.
		return strings.Map(func(r rune) rune {
			switch {
			case r <= ' ', r == 0x7f, r == '/', r == '\\':
				return '_'
			}
			return r
		}, val)
	}
	return "unknown"
}
|
||||||
|
|
||||||
|
// serverSerialForBundle returns the system "Serial Number" reported by
// `dmidecode -t 1`, normalised so it can be used as a single field of the
// support-bundle archive file name.
//
// Every whitespace or control character and every path separator ('/' or
// '\\') in the reported serial is replaced by '_'. This keeps the archive
// name a plain file name and keeps the serial a single space-delimited token,
// matching how the model field is treated.
//
// It returns "unknown" when dmidecode cannot be executed, when no
// "Serial Number" line is present, or when the reported value is empty.
func serverSerialForBundle() string {
	raw, err := exec.Command("dmidecode", "-t", "1").Output()
	if err != nil {
		return "unknown"
	}
	for _, line := range strings.Split(string(raw), "\n") {
		key, val, ok := strings.Cut(strings.TrimSpace(line), ": ")
		if !ok || strings.TrimSpace(key) != "Serial Number" {
			continue
		}
		val = strings.TrimSpace(val)
		if val == "" {
			return "unknown"
		}
		// Firmware strings are free-form; neutralise anything that would
		// split the file name or escape the target directory.
		return strings.Map(func(r rune) rune {
			switch {
			case r <= ' ', r == 0x7f, r == '/', r == '\\':
				return '_'
			}
			return r
		}, val)
	}
	return "unknown"
}
|
||||||
|
|
||||||
func buildVersion() string {
|
func buildVersion() string {
|
||||||
raw, err := exec.Command("bee", "version").CombinedOutput()
|
raw, err := exec.Command("bee", "version").CombinedOutput()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@@ -179,11 +179,3 @@ func commandOutputWithTimeout(timeout time.Duration, name string, args ...string
|
|||||||
defer cancel()
|
defer cancel()
|
||||||
return exec.CommandContext(ctx, name, args...).Output()
|
return exec.CommandContext(ctx, name, args...).Output()
|
||||||
}
|
}
|
||||||
|
|
||||||
func interfaceHasCarrier(iface string) bool {
|
|
||||||
raw, err := readNetCarrierFile(iface)
|
|
||||||
if err != nil {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
return strings.TrimSpace(raw) == "1"
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -58,12 +58,10 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if interfaceHasCarrier(iface) {
|
if out, err := ethtoolModuleQuery(iface); err == nil {
|
||||||
if out, err := ethtoolModuleQuery(iface); err == nil {
|
if injectSFPDOMTelemetry(&devs[i], out) {
|
||||||
if injectSFPDOMTelemetry(&devs[i], out) {
|
enriched++
|
||||||
enriched++
|
continue
|
||||||
continue
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
|
if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
|
||||||
@@ -115,8 +113,38 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
|
|||||||
}
|
}
|
||||||
key := strings.ToLower(strings.TrimSpace(trimmed[:idx]))
|
key := strings.ToLower(strings.TrimSpace(trimmed[:idx]))
|
||||||
val := strings.TrimSpace(trimmed[idx+1:])
|
val := strings.TrimSpace(trimmed[idx+1:])
|
||||||
|
if val == "" || strings.EqualFold(val, "not supported") || strings.EqualFold(val, "unknown") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
switch {
|
switch {
|
||||||
|
case key == "identifier":
|
||||||
|
s := parseSFPIdentifier(val)
|
||||||
|
dev.SFPIdentifier = &s
|
||||||
|
t := true
|
||||||
|
dev.SFPPresent = &t
|
||||||
|
changed = true
|
||||||
|
case key == "connector":
|
||||||
|
s := parseSFPConnector(val)
|
||||||
|
dev.SFPConnector = &s
|
||||||
|
changed = true
|
||||||
|
case key == "vendor name":
|
||||||
|
s := strings.TrimSpace(val)
|
||||||
|
dev.SFPVendor = &s
|
||||||
|
changed = true
|
||||||
|
case key == "vendor pn":
|
||||||
|
s := strings.TrimSpace(val)
|
||||||
|
dev.SFPPartNumber = &s
|
||||||
|
changed = true
|
||||||
|
case key == "vendor sn":
|
||||||
|
s := strings.TrimSpace(val)
|
||||||
|
dev.SFPSerialNumber = &s
|
||||||
|
changed = true
|
||||||
|
case strings.Contains(key, "laser wavelength"):
|
||||||
|
if f, ok := firstFloat(val); ok {
|
||||||
|
dev.SFPWavelengthNM = &f
|
||||||
|
changed = true
|
||||||
|
}
|
||||||
case strings.Contains(key, "module temperature"):
|
case strings.Contains(key, "module temperature"):
|
||||||
if f, ok := firstFloat(val); ok {
|
if f, ok := firstFloat(val); ok {
|
||||||
dev.SFPTemperatureC = &f
|
dev.SFPTemperatureC = &f
|
||||||
@@ -147,12 +175,61 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
|
|||||||
return changed
|
return changed
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// parseSFPIdentifier extracts the human-readable transceiver type from the
|
||||||
|
// raw ethtool identifier line, e.g. "0x03 (SFP)" → "SFP".
|
||||||
|
func parseSFPIdentifier(val string) string {
|
||||||
|
if s := extractParens(val); s != "" {
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
return val
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseSFPConnector extracts the connector type from the raw ethtool line,
|
||||||
|
// e.g. "0x07 (LC)" → "LC".
|
||||||
|
func parseSFPConnector(val string) string {
|
||||||
|
if s := extractParens(val); s != "" {
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
return val
|
||||||
|
}
|
||||||
|
|
||||||
|
// parenRe matches a parenthesised group with at least one character between
// the parentheses and captures that inner text.
var parenRe = regexp.MustCompile(`\(([^)]+)\)`)

// extractParens returns the whitespace-trimmed text of the first parenthesised
// group in s, or "" when s contains no such group.
func extractParens(s string) string {
	if m := parenRe.FindStringSubmatch(s); len(m) >= 2 {
		return strings.TrimSpace(m[1])
	}
	return ""
}
|
||||||
|
|
||||||
func parseSFPDOM(raw string) map[string]any {
|
func parseSFPDOM(raw string) map[string]any {
|
||||||
dev := schema.HardwarePCIeDevice{}
|
dev := schema.HardwarePCIeDevice{}
|
||||||
if !injectSFPDOMTelemetry(&dev, raw) {
|
if !injectSFPDOMTelemetry(&dev, raw) {
|
||||||
return map[string]any{}
|
return map[string]any{}
|
||||||
}
|
}
|
||||||
out := map[string]any{}
|
out := map[string]any{}
|
||||||
|
if dev.SFPPresent != nil {
|
||||||
|
out["sfp_present"] = *dev.SFPPresent
|
||||||
|
}
|
||||||
|
if dev.SFPIdentifier != nil {
|
||||||
|
out["sfp_identifier"] = *dev.SFPIdentifier
|
||||||
|
}
|
||||||
|
if dev.SFPConnector != nil {
|
||||||
|
out["sfp_connector"] = *dev.SFPConnector
|
||||||
|
}
|
||||||
|
if dev.SFPVendor != nil {
|
||||||
|
out["sfp_vendor"] = *dev.SFPVendor
|
||||||
|
}
|
||||||
|
if dev.SFPPartNumber != nil {
|
||||||
|
out["sfp_part_number"] = *dev.SFPPartNumber
|
||||||
|
}
|
||||||
|
if dev.SFPSerialNumber != nil {
|
||||||
|
out["sfp_serial_number"] = *dev.SFPSerialNumber
|
||||||
|
}
|
||||||
|
if dev.SFPWavelengthNM != nil {
|
||||||
|
out["sfp_wavelength_nm"] = *dev.SFPWavelengthNM
|
||||||
|
}
|
||||||
if dev.SFPTemperatureC != nil {
|
if dev.SFPTemperatureC != nil {
|
||||||
out["sfp_temperature_c"] = *dev.SFPTemperatureC
|
out["sfp_temperature_c"] = *dev.SFPTemperatureC
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -122,10 +122,7 @@ func TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier(t *testing.T)
|
|||||||
readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
|
readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
|
||||||
readNetCarrierFile = func(string) (string, error) { return "0", nil }
|
readNetCarrierFile = func(string) (string, error) { return "0", nil }
|
||||||
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
||||||
ethtoolModuleQuery = func(string) (string, error) {
|
ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("no module") }
|
||||||
t.Fatal("ethtool -m should not be called without carrier")
|
|
||||||
return "", nil
|
|
||||||
}
|
|
||||||
|
|
||||||
class := "EthernetController"
|
class := "EthernetController"
|
||||||
bdf := "0000:18:00.0"
|
bdf := "0000:18:00.0"
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ import (
|
|||||||
const nvidiaVendorID = 0x10de
|
const nvidiaVendorID = 0x10de
|
||||||
|
|
||||||
type nvidiaGPUInfo struct {
|
type nvidiaGPUInfo struct {
|
||||||
|
Index int
|
||||||
BDF string
|
BDF string
|
||||||
Serial string
|
Serial string
|
||||||
VBIOS string
|
VBIOS string
|
||||||
@@ -132,6 +133,7 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
info := nvidiaGPUInfo{
|
info := nvidiaGPUInfo{
|
||||||
|
Index: parseRequiredInt(rec[0]),
|
||||||
BDF: bdf,
|
BDF: bdf,
|
||||||
Serial: strings.TrimSpace(rec[2]),
|
Serial: strings.TrimSpace(rec[2]),
|
||||||
VBIOS: strings.TrimSpace(rec[3]),
|
VBIOS: strings.TrimSpace(rec[3]),
|
||||||
@@ -187,6 +189,14 @@ func parseMaybeInt(v string) *int {
|
|||||||
return &n
|
return &n
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// parseRequiredInt parses v as a base-10 integer after trimming surrounding
// whitespace. Any parse failure yields 0, so callers cannot tell a bad value
// apart from a genuine zero.
func parseRequiredInt(v string) int {
	if n, err := strconv.Atoi(strings.TrimSpace(v)); err == nil {
		return n
	}
	return 0
}
|
||||||
|
|
||||||
func pcieLinkGenLabel(gen int) string {
|
func pcieLinkGenLabel(gen int) string {
|
||||||
return fmt.Sprintf("Gen%d", gen)
|
return fmt.Sprintf("Gen%d", gen)
|
||||||
}
|
}
|
||||||
@@ -240,6 +250,10 @@ func setPCIeFallback(dev *schema.HardwarePCIeDevice) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
||||||
|
if dev.Telemetry == nil {
|
||||||
|
dev.Telemetry = map[string]any{}
|
||||||
|
}
|
||||||
|
dev.Telemetry["nvidia_gpu_index"] = info.Index
|
||||||
if info.TemperatureC != nil {
|
if info.TemperatureC != nil {
|
||||||
dev.TemperatureC = info.TemperatureC
|
dev.TemperatureC = info.TemperatureC
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -86,6 +86,9 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
|||||||
if out[0].Firmware == nil || *out[0].Firmware != "96.00.1F.00.02" {
|
if out[0].Firmware == nil || *out[0].Firmware != "96.00.1F.00.02" {
|
||||||
t.Fatalf("firmware: got %v", out[0].Firmware)
|
t.Fatalf("firmware: got %v", out[0].Firmware)
|
||||||
}
|
}
|
||||||
|
if out[0].Telemetry == nil || out[0].Telemetry["nvidia_gpu_index"] != 0 {
|
||||||
|
t.Fatalf("telemetry nvidia_gpu_index: got %#v", out[0].Telemetry)
|
||||||
|
}
|
||||||
if out[0].Status == nil || *out[0].Status != statusWarning {
|
if out[0].Status == nil || *out[0].Status != statusWarning {
|
||||||
t.Fatalf("status: got %v", out[0].Status)
|
t.Fatalf("status: got %v", out[0].Status)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -27,14 +27,17 @@ type benchmarkProfileSpec struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type benchmarkGPUInfo struct {
|
type benchmarkGPUInfo struct {
|
||||||
Index int
|
Index int
|
||||||
UUID string
|
UUID string
|
||||||
Name string
|
Name string
|
||||||
BusID string
|
BusID string
|
||||||
VBIOS string
|
VBIOS string
|
||||||
PowerLimitW float64
|
PowerLimitW float64
|
||||||
MaxGraphicsClockMHz float64
|
DefaultPowerLimitW float64
|
||||||
MaxMemoryClockMHz float64
|
MaxGraphicsClockMHz float64
|
||||||
|
MaxMemoryClockMHz float64
|
||||||
|
BaseGraphicsClockMHz float64
|
||||||
|
MultiprocessorCount int
|
||||||
}
|
}
|
||||||
|
|
||||||
type benchmarkBurnProfile struct {
|
type benchmarkBurnProfile struct {
|
||||||
@@ -102,7 +105,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
BenchmarkVersion: benchmarkVersion,
|
BenchmarkVersion: benchmarkVersion,
|
||||||
GeneratedAt: time.Now().UTC(),
|
GeneratedAt: time.Now().UTC(),
|
||||||
Hostname: hostname,
|
Hostname: hostname,
|
||||||
|
ServerModel: readServerModel(),
|
||||||
BenchmarkProfile: spec.Name,
|
BenchmarkProfile: spec.Name,
|
||||||
|
ParallelGPUs: opts.ParallelGPUs,
|
||||||
SelectedGPUIndices: append([]int(nil), selected...),
|
SelectedGPUIndices: append([]int(nil), selected...),
|
||||||
Normalization: BenchmarkNormalization{
|
Normalization: BenchmarkNormalization{
|
||||||
Status: "full",
|
Status: "full",
|
||||||
@@ -111,6 +116,11 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
|
|
||||||
logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))
|
logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))
|
||||||
|
|
||||||
|
// Server power characterization state — populated during per-GPU phases.
|
||||||
|
var serverIdleW, serverLoadedWSum float64
|
||||||
|
var serverIdleOK, serverLoadedOK bool
|
||||||
|
var serverLoadedSamples int
|
||||||
|
|
||||||
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
|
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
|
||||||
if infoErr != nil {
|
if infoErr != nil {
|
||||||
result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
|
result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
|
||||||
@@ -135,6 +145,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
if opts.ParallelGPUs {
|
||||||
|
runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
|
||||||
|
} else {
|
||||||
|
|
||||||
for _, idx := range selected {
|
for _, idx := range selected {
|
||||||
gpuResult := BenchmarkGPUResult{
|
gpuResult := BenchmarkGPUResult{
|
||||||
Index: idx,
|
Index: idx,
|
||||||
@@ -146,7 +160,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
gpuResult.BusID = info.BusID
|
gpuResult.BusID = info.BusID
|
||||||
gpuResult.VBIOS = info.VBIOS
|
gpuResult.VBIOS = info.VBIOS
|
||||||
gpuResult.PowerLimitW = info.PowerLimitW
|
gpuResult.PowerLimitW = info.PowerLimitW
|
||||||
|
gpuResult.MultiprocessorCount = info.MultiprocessorCount
|
||||||
|
gpuResult.DefaultPowerLimitW = info.DefaultPowerLimitW
|
||||||
gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
|
gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
|
||||||
|
gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
|
||||||
gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
|
gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
|
||||||
}
|
}
|
||||||
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
|
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
|
||||||
@@ -161,6 +178,15 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows)
|
gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows)
|
||||||
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), baselineRows)
|
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), baselineRows)
|
||||||
|
|
||||||
|
// Sample server idle power once (first GPU only — server state is global).
|
||||||
|
if !serverIdleOK {
|
||||||
|
if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
|
||||||
|
serverIdleW = w
|
||||||
|
serverIdleOK = true
|
||||||
|
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
warmupCmd := []string{
|
warmupCmd := []string{
|
||||||
"bee-gpu-burn",
|
"bee-gpu-burn",
|
||||||
"--seconds", strconv.Itoa(spec.WarmupSec),
|
"--seconds", strconv.Itoa(spec.WarmupSec),
|
||||||
@@ -184,7 +210,50 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
"--devices", strconv.Itoa(idx),
|
"--devices", strconv.Itoa(idx),
|
||||||
}
|
}
|
||||||
logFunc(fmt.Sprintf("GPU %d: steady compute (%ds)", idx, spec.SteadySec))
|
logFunc(fmt.Sprintf("GPU %d: steady compute (%ds)", idx, spec.SteadySec))
|
||||||
|
|
||||||
|
// Sample server power via IPMI in parallel with the steady phase.
|
||||||
|
// We collect readings every 5s and average them.
|
||||||
|
ipmiStopCh := make(chan struct{})
|
||||||
|
ipmiResultCh := make(chan float64, 1)
|
||||||
|
go func() {
|
||||||
|
defer close(ipmiResultCh)
|
||||||
|
var samples []float64
|
||||||
|
ticker := time.NewTicker(5 * time.Second)
|
||||||
|
defer ticker.Stop()
|
||||||
|
// First sample after a short warmup delay.
|
||||||
|
select {
|
||||||
|
case <-ipmiStopCh:
|
||||||
|
return
|
||||||
|
case <-time.After(15 * time.Second):
|
||||||
|
}
|
||||||
|
for {
|
||||||
|
if w, err := queryIPMIServerPowerW(); err == nil {
|
||||||
|
samples = append(samples, w)
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-ipmiStopCh:
|
||||||
|
if len(samples) > 0 {
|
||||||
|
var sum float64
|
||||||
|
for _, w := range samples {
|
||||||
|
sum += w
|
||||||
|
}
|
||||||
|
ipmiResultCh <- sum / float64(len(samples))
|
||||||
|
}
|
||||||
|
return
|
||||||
|
case <-ticker.C:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-steady", idx), logFunc)
|
steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-steady", idx), logFunc)
|
||||||
|
close(ipmiStopCh)
|
||||||
|
if loadedW, ok := <-ipmiResultCh; ok {
|
||||||
|
serverLoadedWSum += loadedW
|
||||||
|
serverLoadedSamples++
|
||||||
|
serverLoadedOK = true
|
||||||
|
logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
|
||||||
|
}
|
||||||
|
|
||||||
_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady.log", idx)), steadyOut, 0644)
|
_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady.log", idx)), steadyOut, 0644)
|
||||||
afterThrottle, _ := queryThrottleCounters(idx)
|
afterThrottle, _ := queryThrottleCounters(idx)
|
||||||
if steadyErr != nil {
|
if steadyErr != nil {
|
||||||
@@ -222,6 +291,8 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
|
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} // end sequential path
|
||||||
|
|
||||||
if len(selected) > 1 && opts.RunNCCL {
|
if len(selected) > 1 && opts.RunNCCL {
|
||||||
result.Interconnect = runBenchmarkInterconnect(ctx, verboseLog, runDir, selected, spec, logFunc)
|
result.Interconnect = runBenchmarkInterconnect(ctx, verboseLog, runDir, selected, spec, logFunc)
|
||||||
if result.Interconnect != nil && result.Interconnect.Supported {
|
if result.Interconnect != nil && result.Interconnect.Supported {
|
||||||
@@ -232,6 +303,17 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Compute server power characterization from accumulated IPMI samples.
|
||||||
|
var gpuReportedSumW float64
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
gpuReportedSumW += gpu.Steady.AvgPowerW
|
||||||
|
}
|
||||||
|
var serverLoadedW float64
|
||||||
|
if serverLoadedSamples > 0 {
|
||||||
|
serverLoadedW = serverLoadedWSum / float64(serverLoadedSamples)
|
||||||
|
}
|
||||||
|
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK)
|
||||||
|
|
||||||
result.Findings = buildBenchmarkFindings(result)
|
result.Findings = buildBenchmarkFindings(result)
|
||||||
result.OverallStatus = benchmarkOverallStatus(result)
|
result.OverallStatus = benchmarkOverallStatus(result)
|
||||||
|
|
||||||
@@ -243,9 +325,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
return "", fmt.Errorf("write result.json: %w", err)
|
return "", fmt.Errorf("write result.json: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
report := renderBenchmarkReport(result)
|
report := renderBenchmarkReportWithCharts(result, loadBenchmarkReportCharts(runDir, selected))
|
||||||
if err := os.WriteFile(filepath.Join(runDir, "report.txt"), []byte(report), 0644); err != nil {
|
if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(report), 0644); err != nil {
|
||||||
return "", fmt.Errorf("write report.txt: %w", err)
|
return "", fmt.Errorf("write report.md: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
summary := renderBenchmarkSummary(result)
|
summary := renderBenchmarkSummary(result)
|
||||||
@@ -288,50 +370,87 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
|
// benchmarkGPUInfoQuery describes a nvidia-smi --query-gpu field set to try.
|
||||||
args := []string{
|
// Fields are tried in order; the first successful query wins. Extended fields
|
||||||
"--query-gpu=index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory",
|
// (attribute.multiprocessor_count, power.default_limit) are not supported on
|
||||||
"--format=csv,noheader,nounits",
|
// all driver versions, so we fall back to the base set if the full query fails.
|
||||||
}
|
var benchmarkGPUInfoQueries = []struct {
|
||||||
if len(gpuIndices) > 0 {
|
fields string
|
||||||
args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...)
|
extended bool // whether this query includes optional extended fields
|
||||||
}
|
}{
|
||||||
out, err := satExecCommand("nvidia-smi", args...).Output()
|
{
|
||||||
if err != nil {
|
fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit",
|
||||||
return nil, fmt.Errorf("nvidia-smi gpu info: %w", err)
|
extended: true,
|
||||||
}
|
},
|
||||||
|
{
|
||||||
r := csv.NewReader(strings.NewReader(string(out)))
|
fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics",
|
||||||
r.TrimLeadingSpace = true
|
extended: false,
|
||||||
r.FieldsPerRecord = -1
|
},
|
||||||
rows, err := r.ReadAll()
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("parse nvidia-smi gpu info: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
|
|
||||||
for _, row := range rows {
|
|
||||||
if len(row) < 8 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
|
|
||||||
if err != nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
infoByIndex[idx] = benchmarkGPUInfo{
|
|
||||||
Index: idx,
|
|
||||||
UUID: strings.TrimSpace(row[1]),
|
|
||||||
Name: strings.TrimSpace(row[2]),
|
|
||||||
BusID: strings.TrimSpace(row[3]),
|
|
||||||
VBIOS: strings.TrimSpace(row[4]),
|
|
||||||
PowerLimitW: parseBenchmarkFloat(row[5]),
|
|
||||||
MaxGraphicsClockMHz: parseBenchmarkFloat(row[6]),
|
|
||||||
MaxMemoryClockMHz: parseBenchmarkFloat(row[7]),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return infoByIndex, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
|
||||||
|
var lastErr error
|
||||||
|
for _, q := range benchmarkGPUInfoQueries {
|
||||||
|
args := []string{
|
||||||
|
"--query-gpu=" + q.fields,
|
||||||
|
"--format=csv,noheader,nounits",
|
||||||
|
}
|
||||||
|
if len(gpuIndices) > 0 {
|
||||||
|
args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...)
|
||||||
|
}
|
||||||
|
out, err := satExecCommand("nvidia-smi", args...).Output()
|
||||||
|
if err != nil {
|
||||||
|
lastErr = fmt.Errorf("nvidia-smi gpu info (%s): %w", q.fields[:min(len(q.fields), 40)], err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
r := csv.NewReader(strings.NewReader(string(out)))
|
||||||
|
r.TrimLeadingSpace = true
|
||||||
|
r.FieldsPerRecord = -1
|
||||||
|
rows, err := r.ReadAll()
|
||||||
|
if err != nil {
|
||||||
|
lastErr = fmt.Errorf("parse nvidia-smi gpu info: %w", err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
|
||||||
|
for _, row := range rows {
|
||||||
|
if len(row) < 9 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
info := benchmarkGPUInfo{
|
||||||
|
Index: idx,
|
||||||
|
UUID: strings.TrimSpace(row[1]),
|
||||||
|
Name: strings.TrimSpace(row[2]),
|
||||||
|
BusID: strings.TrimSpace(row[3]),
|
||||||
|
VBIOS: strings.TrimSpace(row[4]),
|
||||||
|
PowerLimitW: parseBenchmarkFloat(row[5]),
|
||||||
|
MaxGraphicsClockMHz: parseBenchmarkFloat(row[6]),
|
||||||
|
MaxMemoryClockMHz: parseBenchmarkFloat(row[7]),
|
||||||
|
}
|
||||||
|
if len(row) >= 9 {
|
||||||
|
info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8])
|
||||||
|
}
|
||||||
|
if q.extended {
|
||||||
|
if len(row) >= 10 {
|
||||||
|
info.MultiprocessorCount = int(parseBenchmarkFloat(row[9]))
|
||||||
|
}
|
||||||
|
if len(row) >= 11 {
|
||||||
|
info.DefaultPowerLimitW = parseBenchmarkFloat(row[10])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
infoByIndex[idx] = info
|
||||||
|
}
|
||||||
|
return infoByIndex, nil
|
||||||
|
}
|
||||||
|
return nil, lastErr
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction {
|
func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction {
|
||||||
if os.Geteuid() != 0 {
|
if os.Geteuid() != 0 {
|
||||||
result.Normalization.Status = "partial"
|
result.Normalization.Status = "partial"
|
||||||
@@ -370,6 +489,10 @@ func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndi
|
|||||||
_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rgc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rgc"}, nil, nil)
|
_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rgc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rgc"}, nil, nil)
|
||||||
}})
|
}})
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
rec.GPUClockLockStatus = "skipped"
|
||||||
|
rec.Notes = append(rec.Notes, "graphics clock lock skipped: gpu inventory unavailable or MaxGraphicsClockMHz=0")
|
||||||
|
result.Normalization.Status = "partial"
|
||||||
}
|
}
|
||||||
|
|
||||||
if info, ok := infoByIndex[idx]; ok && info.MaxMemoryClockMHz > 0 {
|
if info, ok := infoByIndex[idx]; ok && info.MaxMemoryClockMHz > 0 {
|
||||||
@@ -551,6 +674,8 @@ func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name stri
|
|||||||
}
|
}
|
||||||
category := "other"
|
category := "other"
|
||||||
switch {
|
switch {
|
||||||
|
case strings.HasPrefix(name, "fp64"):
|
||||||
|
category = "fp64"
|
||||||
case strings.HasPrefix(name, "fp32"):
|
case strings.HasPrefix(name, "fp32"):
|
||||||
category = "fp32_tf32"
|
category = "fp32_tf32"
|
||||||
case strings.HasPrefix(name, "fp16"):
|
case strings.HasPrefix(name, "fp16"):
|
||||||
@@ -619,14 +744,23 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
|
|||||||
score.ComputeScore += precision.TeraOpsPerSec
|
score.ComputeScore += precision.TeraOpsPerSec
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if gpu.PowerLimitW > 0 {
|
// Use default power limit for sustain score so a manually reduced limit
|
||||||
score.PowerSustainScore = math.Min(100, (gpu.Steady.AvgPowerW/gpu.PowerLimitW)*100)
|
// does not inflate the score. Fall back to enforced limit if default unknown.
|
||||||
|
referencePowerW := gpu.DefaultPowerLimitW
|
||||||
|
if referencePowerW <= 0 {
|
||||||
|
referencePowerW = gpu.PowerLimitW
|
||||||
|
}
|
||||||
|
if referencePowerW > 0 {
|
||||||
|
score.PowerSustainScore = math.Min(100, (gpu.Steady.AvgPowerW/referencePowerW)*100)
|
||||||
}
|
}
|
||||||
runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
|
runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
|
||||||
thermalRatio := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) / runtimeUS
|
thermalRatio := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) / runtimeUS
|
||||||
score.ThermalSustainScore = clampScore(100 - thermalRatio*100)
|
score.ThermalSustainScore = clampScore(100 - thermalRatio*100)
|
||||||
score.StabilityScore = clampScore(100 - (gpu.Steady.ClockCVPct*4 + gpu.Steady.PowerCVPct*2 + gpu.Steady.ClockDriftPct*2))
|
score.StabilityScore = clampScore(100 - (gpu.Steady.ClockCVPct*4 + gpu.Steady.PowerCVPct*2 + gpu.Steady.ClockDriftPct*2))
|
||||||
score.CompositeScore = compositeBenchmarkScore(score)
|
score.CompositeScore = compositeBenchmarkScore(score)
|
||||||
|
if gpu.MultiprocessorCount > 0 && gpu.Steady.AvgGraphicsClockMHz > 0 && score.ComputeScore > 0 {
|
||||||
|
score.TOPSPerSMPerGHz = score.ComputeScore / float64(gpu.MultiprocessorCount) / (gpu.Steady.AvgGraphicsClockMHz / 1000.0)
|
||||||
|
}
|
||||||
return score
|
return score
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -679,7 +813,10 @@ func runBenchmarkInterconnect(ctx context.Context, verboseLog, runDir string, gp
|
|||||||
"-g", strconv.Itoa(len(gpuIndices)),
|
"-g", strconv.Itoa(len(gpuIndices)),
|
||||||
"--iters", strconv.Itoa(maxInt(20, spec.NCCLSec/10)),
|
"--iters", strconv.Itoa(maxInt(20, spec.NCCLSec/10)),
|
||||||
}
|
}
|
||||||
env := []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)}
|
env := []string{
|
||||||
|
"CUDA_DEVICE_ORDER=PCI_BUS_ID",
|
||||||
|
"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
|
||||||
|
}
|
||||||
logFunc(fmt.Sprintf("NCCL interconnect: gpus=%s", joinIndexList(gpuIndices)))
|
logFunc(fmt.Sprintf("NCCL interconnect: gpus=%s", joinIndexList(gpuIndices)))
|
||||||
out, err := runSATCommandCtx(ctx, verboseLog, "nccl-all-reduce.log", cmd, env, logFunc)
|
out, err := runSATCommandCtx(ctx, verboseLog, "nccl-all-reduce.log", cmd, env, logFunc)
|
||||||
_ = os.WriteFile(filepath.Join(runDir, "nccl-all-reduce.log"), out, 0644)
|
_ = os.WriteFile(filepath.Join(runDir, "nccl-all-reduce.log"), out, 0644)
|
||||||
@@ -795,10 +932,30 @@ func finalizeBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkGPUResult {
|
|||||||
|
|
||||||
func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
|
func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
|
||||||
var findings []string
|
var findings []string
|
||||||
|
|
||||||
|
passed := 0
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
if gpu.Status == "OK" {
|
||||||
|
passed++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
total := len(result.GPUs)
|
||||||
|
if total > 0 {
|
||||||
|
if passed == total {
|
||||||
|
findings = append(findings, fmt.Sprintf("All %d GPU(s) passed the benchmark.", total))
|
||||||
|
} else {
|
||||||
|
findings = append(findings, fmt.Sprintf("%d of %d GPU(s) passed the benchmark.", passed, total))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if result.Normalization.Status != "full" {
|
if result.Normalization.Status != "full" {
|
||||||
findings = append(findings, "Environment normalization was partial; compare results with caution.")
|
findings = append(findings, "Environment normalization was partial; compare results with caution.")
|
||||||
}
|
}
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
|
if gpu.Status == "FAILED" && len(gpu.DegradationReasons) == 0 {
|
||||||
|
findings = append(findings, fmt.Sprintf("GPU %d failed the benchmark (check verbose.log for details).", gpu.Index))
|
||||||
|
continue
|
||||||
|
}
|
||||||
if len(gpu.DegradationReasons) == 0 && gpu.Status == "OK" {
|
if len(gpu.DegradationReasons) == 0 && gpu.Status == "OK" {
|
||||||
findings = append(findings, fmt.Sprintf("GPU %d held clocks without observable throttle counters during steady state.", gpu.Index))
|
findings = append(findings, fmt.Sprintf("GPU %d held clocks without observable throttle counters during steady state.", gpu.Index))
|
||||||
continue
|
continue
|
||||||
@@ -822,10 +979,24 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
|
|||||||
if gpu.Backend == "driver-ptx" {
|
if gpu.Backend == "driver-ptx" {
|
||||||
findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index))
|
findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index))
|
||||||
}
|
}
|
||||||
|
if gpu.DefaultPowerLimitW > 0 && gpu.PowerLimitW > 0 && gpu.PowerLimitW < gpu.DefaultPowerLimitW*0.95 {
|
||||||
|
findings = append(findings, fmt.Sprintf(
|
||||||
|
"GPU %d power limit %.0f W is below default %.0f W (%.0f%%). Performance may be artificially reduced.",
|
||||||
|
gpu.Index, gpu.PowerLimitW, gpu.DefaultPowerLimitW, gpu.PowerLimitW/gpu.DefaultPowerLimitW*100,
|
||||||
|
))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if result.Interconnect != nil && result.Interconnect.Supported {
|
if result.Interconnect != nil && result.Interconnect.Supported {
|
||||||
findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
|
findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
|
||||||
}
|
}
|
||||||
|
if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
|
||||||
|
if sp.ReportingRatio < 0.75 {
|
||||||
|
findings = append(findings, fmt.Sprintf(
|
||||||
|
"GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption.",
|
||||||
|
sp.DeltaW, sp.GPUReportedSumW, sp.ReportingRatio,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
return dedupeStrings(findings)
|
return dedupeStrings(findings)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1004,3 +1175,309 @@ func maxInt(a, b int) int {
|
|||||||
}
|
}
|
||||||
return b
|
return b
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
|
||||||
|
// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
|
||||||
|
func queryIPMIServerPowerW() (float64, error) {
|
||||||
|
out, err := satExecCommand("ipmitool", "dcmi", "power", "reading").Output()
|
||||||
|
if err != nil {
|
||||||
|
return 0, fmt.Errorf("ipmitool dcmi power reading: %w", err)
|
||||||
|
}
|
||||||
|
if w := parseDCMIPowerReading(string(out)); w > 0 {
|
||||||
|
return w, nil
|
||||||
|
}
|
||||||
|
return 0, fmt.Errorf("could not parse ipmitool dcmi power reading output")
|
||||||
|
}
|
||||||
|
|
||||||
|
// sampleIPMIPowerSeries collects IPMI power readings every 2 seconds for
|
||||||
|
// durationSec seconds. Returns the mean of all successful samples.
|
||||||
|
// Returns 0, false if IPMI is unavailable.
|
||||||
|
func sampleIPMIPowerSeries(ctx context.Context, durationSec int) (meanW float64, ok bool) {
|
||||||
|
if durationSec <= 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
|
||||||
|
var samples []float64
|
||||||
|
for {
|
||||||
|
if w, err := queryIPMIServerPowerW(); err == nil {
|
||||||
|
samples = append(samples, w)
|
||||||
|
}
|
||||||
|
if time.Now().After(deadline) {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
break
|
||||||
|
case <-time.After(2 * time.Second):
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
var sum float64
|
||||||
|
for _, w := range samples {
|
||||||
|
sum += w
|
||||||
|
}
|
||||||
|
return sum / float64(len(samples)), true
|
||||||
|
}
|
||||||
|
|
||||||
|
// characterizeServerPower computes BenchmarkServerPower from idle and loaded
|
||||||
|
// IPMI samples plus the GPU-reported average power during steady state.
|
||||||
|
func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, ipmiAvailable bool) *BenchmarkServerPower {
|
||||||
|
sp := &BenchmarkServerPower{Available: ipmiAvailable}
|
||||||
|
if !ipmiAvailable {
|
||||||
|
sp.Notes = append(sp.Notes, "IPMI power reading unavailable; server-side power characterization skipped")
|
||||||
|
return sp
|
||||||
|
}
|
||||||
|
sp.IdleW = idleW
|
||||||
|
sp.LoadedW = loadedW
|
||||||
|
sp.DeltaW = loadedW - idleW
|
||||||
|
sp.GPUReportedSumW = gpuReportedSumW
|
||||||
|
if gpuReportedSumW > 0 && sp.DeltaW > 0 {
|
||||||
|
sp.ReportingRatio = sp.DeltaW / gpuReportedSumW
|
||||||
|
}
|
||||||
|
return sp
|
||||||
|
}
|
||||||
|
|
||||||
|
// readServerModel reads the DMI system product name from
// /sys/class/dmi/id/product_name and returns it with surrounding whitespace
// removed. It returns an empty string when the file cannot be read.
func readServerModel() string {
	model := ""
	if raw, err := os.ReadFile("/sys/class/dmi/id/product_name"); err == nil {
		model = strings.TrimSpace(string(raw))
	}
	return model
}
|
||||||
|
|
||||||
|
// filterRowsByGPU returns only the metric rows for a specific GPU index.
|
||||||
|
func filterRowsByGPU(rows []GPUMetricRow, gpuIndex int) []GPUMetricRow {
|
||||||
|
var out []GPUMetricRow
|
||||||
|
for _, r := range rows {
|
||||||
|
if r.GPUIndex == gpuIndex {
|
||||||
|
out = append(out, r)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseBenchmarkBurnLogByGPU splits a multi-GPU bee-gpu-burn output by [gpu N] prefix
|
||||||
|
// and returns a per-GPU parse result map.
|
||||||
|
func parseBenchmarkBurnLogByGPU(raw string) map[int]benchmarkBurnParseResult {
|
||||||
|
gpuLines := make(map[int][]string)
|
||||||
|
for _, line := range strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if !strings.HasPrefix(line, "[gpu ") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
end := strings.Index(line, "] ")
|
||||||
|
if end < 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
gpuIdx, err := strconv.Atoi(strings.TrimSpace(line[5:end]))
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
gpuLines[gpuIdx] = append(gpuLines[gpuIdx], line[end+2:])
|
||||||
|
}
|
||||||
|
results := make(map[int]benchmarkBurnParseResult, len(gpuLines))
|
||||||
|
for gpuIdx, lines := range gpuLines {
|
||||||
|
// Lines are already stripped of the [gpu N] prefix; parseBenchmarkBurnLog
|
||||||
|
// calls stripBenchmarkPrefix which is a no-op on already-stripped lines.
|
||||||
|
results[gpuIdx] = parseBenchmarkBurnLog(strings.Join(lines, "\n"))
|
||||||
|
}
|
||||||
|
return results
|
||||||
|
}
|
||||||
|
|
||||||
|
// runNvidiaBenchmarkParallel runs warmup and steady compute on all selected GPUs
|
||||||
|
// simultaneously using a single bee-gpu-burn invocation per phase.
|
||||||
|
func runNvidiaBenchmarkParallel(
|
||||||
|
ctx context.Context,
|
||||||
|
verboseLog, runDir string,
|
||||||
|
selected []int,
|
||||||
|
infoByIndex map[int]benchmarkGPUInfo,
|
||||||
|
opts NvidiaBenchmarkOptions,
|
||||||
|
spec benchmarkProfileSpec,
|
||||||
|
logFunc func(string),
|
||||||
|
result *NvidiaBenchmarkResult,
|
||||||
|
serverIdleW *float64, serverLoadedWSum *float64,
|
||||||
|
serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int,
|
||||||
|
) {
|
||||||
|
allDevices := joinIndexList(selected)
|
||||||
|
|
||||||
|
// Build per-GPU result stubs.
|
||||||
|
gpuResults := make(map[int]*BenchmarkGPUResult, len(selected))
|
||||||
|
for _, idx := range selected {
|
||||||
|
r := &BenchmarkGPUResult{Index: idx, Status: "FAILED"}
|
||||||
|
if info, ok := infoByIndex[idx]; ok {
|
||||||
|
r.UUID = info.UUID
|
||||||
|
r.Name = info.Name
|
||||||
|
r.BusID = info.BusID
|
||||||
|
r.VBIOS = info.VBIOS
|
||||||
|
r.PowerLimitW = info.PowerLimitW
|
||||||
|
r.MultiprocessorCount = info.MultiprocessorCount
|
||||||
|
r.DefaultPowerLimitW = info.DefaultPowerLimitW
|
||||||
|
r.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
|
||||||
|
r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
|
||||||
|
r.MaxMemoryClockMHz = info.MaxMemoryClockMHz
|
||||||
|
}
|
||||||
|
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
|
||||||
|
r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
|
||||||
|
r.LockedMemoryClockMHz = norm.MemoryClockLockMHz
|
||||||
|
}
|
||||||
|
gpuResults[idx] = r
|
||||||
|
}
|
||||||
|
|
||||||
|
// Baseline: sample all GPUs together.
|
||||||
|
baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, selected)
|
||||||
|
if err != nil && err != context.Canceled {
|
||||||
|
for _, idx := range selected {
|
||||||
|
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "baseline sampling failed: "+err.Error())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, idx := range selected {
|
||||||
|
perGPU := filterRowsByGPU(baselineRows, idx)
|
||||||
|
gpuResults[idx].Baseline = summarizeBenchmarkTelemetry(perGPU)
|
||||||
|
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), perGPU)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sample server idle power once.
|
||||||
|
if !*serverIdleOK {
|
||||||
|
if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
|
||||||
|
*serverIdleW = w
|
||||||
|
*serverIdleOK = true
|
||||||
|
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Warmup: all GPUs simultaneously.
|
||||||
|
warmupCmd := []string{
|
||||||
|
"bee-gpu-burn",
|
||||||
|
"--seconds", strconv.Itoa(spec.WarmupSec),
|
||||||
|
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||||
|
"--devices", allDevices,
|
||||||
|
}
|
||||||
|
logFunc(fmt.Sprintf("GPUs %s: parallel warmup (%ds)", allDevices, spec.WarmupSec))
|
||||||
|
warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, runDir, "gpu-all-warmup", logFunc)
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, "gpu-all-warmup.log"), warmupOut, 0644)
|
||||||
|
for _, idx := range selected {
|
||||||
|
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-warmup", idx), filterRowsByGPU(warmupRows, idx))
|
||||||
|
}
|
||||||
|
if warmupErr != nil {
|
||||||
|
for _, idx := range selected {
|
||||||
|
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel warmup failed: "+warmupErr.Error())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Snapshot throttle counters before steady.
|
||||||
|
beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
|
||||||
|
for _, idx := range selected {
|
||||||
|
beforeThrottle[idx], _ = queryThrottleCounters(idx)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Steady: all GPUs simultaneously.
|
||||||
|
steadyCmd := []string{
|
||||||
|
"bee-gpu-burn",
|
||||||
|
"--seconds", strconv.Itoa(spec.SteadySec),
|
||||||
|
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||||
|
"--devices", allDevices,
|
||||||
|
}
|
||||||
|
logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (%ds)", allDevices, spec.SteadySec))
|
||||||
|
|
||||||
|
// Sample server power via IPMI in parallel with steady phase.
|
||||||
|
ipmiStopCh := make(chan struct{})
|
||||||
|
ipmiResultCh := make(chan float64, 1)
|
||||||
|
go func() {
|
||||||
|
defer close(ipmiResultCh)
|
||||||
|
var samples []float64
|
||||||
|
ticker := time.NewTicker(5 * time.Second)
|
||||||
|
defer ticker.Stop()
|
||||||
|
select {
|
||||||
|
case <-ipmiStopCh:
|
||||||
|
return
|
||||||
|
case <-time.After(15 * time.Second):
|
||||||
|
}
|
||||||
|
for {
|
||||||
|
if w, err := queryIPMIServerPowerW(); err == nil {
|
||||||
|
samples = append(samples, w)
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-ipmiStopCh:
|
||||||
|
if len(samples) > 0 {
|
||||||
|
var sum float64
|
||||||
|
for _, w := range samples {
|
||||||
|
sum += w
|
||||||
|
}
|
||||||
|
ipmiResultCh <- sum / float64(len(samples))
|
||||||
|
}
|
||||||
|
return
|
||||||
|
case <-ticker.C:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-steady.log", steadyCmd, nil, selected, runDir, "gpu-all-steady", logFunc)
|
||||||
|
close(ipmiStopCh)
|
||||||
|
if loadedW, ok := <-ipmiResultCh; ok {
|
||||||
|
*serverLoadedWSum += loadedW
|
||||||
|
(*serverLoadedSamples)++
|
||||||
|
*serverLoadedOK = true
|
||||||
|
logFunc(fmt.Sprintf("GPUs %s: server loaded power (IPMI): %.0f W", allDevices, loadedW))
|
||||||
|
}
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, "gpu-all-steady.log"), steadyOut, 0644)
|
||||||
|
|
||||||
|
afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
|
||||||
|
for _, idx := range selected {
|
||||||
|
afterThrottle[idx], _ = queryThrottleCounters(idx)
|
||||||
|
}
|
||||||
|
|
||||||
|
parseResults := parseBenchmarkBurnLogByGPU(string(steadyOut))
|
||||||
|
|
||||||
|
for _, idx := range selected {
|
||||||
|
perGPU := filterRowsByGPU(steadyRows, idx)
|
||||||
|
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-steady", idx), perGPU)
|
||||||
|
gpuResults[idx].Steady = summarizeBenchmarkTelemetry(perGPU)
|
||||||
|
gpuResults[idx].Throttle = diffThrottleCounters(beforeThrottle[idx], afterThrottle[idx])
|
||||||
|
|
||||||
|
if pr, ok := parseResults[idx]; ok {
|
||||||
|
gpuResults[idx].ComputeCapability = pr.ComputeCapability
|
||||||
|
gpuResults[idx].Backend = pr.Backend
|
||||||
|
gpuResults[idx].PrecisionResults = pr.Profiles
|
||||||
|
if pr.Fallback {
|
||||||
|
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if steadyErr != nil {
|
||||||
|
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel steady compute failed: "+steadyErr.Error())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cooldown: all GPUs together.
|
||||||
|
cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, selected)
|
||||||
|
if err != nil && err != context.Canceled {
|
||||||
|
for _, idx := range selected {
|
||||||
|
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "cooldown sampling failed: "+err.Error())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, idx := range selected {
|
||||||
|
perGPU := filterRowsByGPU(cooldownRows, idx)
|
||||||
|
gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU)
|
||||||
|
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-cooldown", idx), perGPU)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Score and finalize each GPU.
|
||||||
|
for _, idx := range selected {
|
||||||
|
r := gpuResults[idx]
|
||||||
|
r.Scores = scoreBenchmarkGPUResult(*r)
|
||||||
|
r.DegradationReasons = detectBenchmarkDegradationReasons(*r, result.Normalization.Status)
|
||||||
|
pr := parseResults[idx]
|
||||||
|
switch {
|
||||||
|
case steadyErr != nil:
|
||||||
|
r.Status = classifySATErrorStatus(steadyOut, steadyErr)
|
||||||
|
case pr.Fallback:
|
||||||
|
r.Status = "PARTIAL"
|
||||||
|
default:
|
||||||
|
r.Status = "OK"
|
||||||
|
}
|
||||||
|
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(*r))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -2,24 +2,73 @@ package platform
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"regexp"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
|
func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
|
||||||
var b strings.Builder
|
return renderBenchmarkReportWithCharts(result, nil)
|
||||||
fmt.Fprintf(&b, "Bee NVIDIA Benchmark Report\n")
|
}
|
||||||
fmt.Fprintf(&b, "===========================\n\n")
|
|
||||||
fmt.Fprintf(&b, "Generated: %s\n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
|
||||||
fmt.Fprintf(&b, "Host: %s\n", result.Hostname)
|
|
||||||
fmt.Fprintf(&b, "Profile: %s\n", result.BenchmarkProfile)
|
|
||||||
fmt.Fprintf(&b, "Overall status: %s\n", result.OverallStatus)
|
|
||||||
fmt.Fprintf(&b, "Selected GPUs: %s\n", joinIndexList(result.SelectedGPUIndices))
|
|
||||||
fmt.Fprintf(&b, "Normalization: %s\n\n", result.Normalization.Status)
|
|
||||||
|
|
||||||
|
// benchmarkReportChart is one pre-rendered terminal chart to embed in the
// benchmark report.
type benchmarkReportChart struct {
	// Title is used as the chart's section heading in the report.
	Title string
	// Content is the raw chart text; the report renderer strips ANSI escape
	// sequences and surrounding whitespace before embedding it.
	Content string
}
|
||||||
|
|
||||||
|
// ansiEscapePattern matches ANSI escape sequences of the form ESC '[' followed
// by any run of digits and semicolons and a terminating 'm' (the colour/style
// codes emitted by terminal renderers). Compiled once at package scope.
var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]*m`)
|
||||||
|
|
||||||
|
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benchmarkReportChart) string {
|
||||||
|
var b strings.Builder
|
||||||
|
|
||||||
|
// ── Header ────────────────────────────────────────────────────────────────
|
||||||
|
b.WriteString("# Bee NVIDIA Benchmark Report\n\n")
|
||||||
|
|
||||||
|
// System identity block
|
||||||
|
if result.ServerModel != "" {
|
||||||
|
fmt.Fprintf(&b, "**Server:** %s \n", result.ServerModel)
|
||||||
|
}
|
||||||
|
if result.Hostname != "" {
|
||||||
|
fmt.Fprintf(&b, "**Host:** %s \n", result.Hostname)
|
||||||
|
}
|
||||||
|
// GPU models summary
|
||||||
|
if len(result.GPUs) > 0 {
|
||||||
|
modelCount := make(map[string]int)
|
||||||
|
var modelOrder []string
|
||||||
|
for _, g := range result.GPUs {
|
||||||
|
m := strings.TrimSpace(g.Name)
|
||||||
|
if m == "" {
|
||||||
|
m = "Unknown GPU"
|
||||||
|
}
|
||||||
|
if modelCount[m] == 0 {
|
||||||
|
modelOrder = append(modelOrder, m)
|
||||||
|
}
|
||||||
|
modelCount[m]++
|
||||||
|
}
|
||||||
|
var parts []string
|
||||||
|
for _, m := range modelOrder {
|
||||||
|
if modelCount[m] == 1 {
|
||||||
|
parts = append(parts, m)
|
||||||
|
} else {
|
||||||
|
parts = append(parts, fmt.Sprintf("%d× %s", modelCount[m], m))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "**GPU(s):** %s \n", strings.Join(parts, ", "))
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
|
||||||
|
fmt.Fprintf(&b, "**App version:** %s \n", result.BenchmarkVersion)
|
||||||
|
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
||||||
|
if result.ParallelGPUs {
|
||||||
|
fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously) \n")
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
||||||
|
b.WriteString("\n")
|
||||||
|
|
||||||
|
// ── Executive Summary ─────────────────────────────────────────────────────
|
||||||
if len(result.Findings) > 0 {
|
if len(result.Findings) > 0 {
|
||||||
fmt.Fprintf(&b, "Executive Summary\n")
|
b.WriteString("## Executive Summary\n\n")
|
||||||
fmt.Fprintf(&b, "-----------------\n")
|
|
||||||
for _, finding := range result.Findings {
|
for _, finding := range result.Findings {
|
||||||
fmt.Fprintf(&b, "- %s\n", finding)
|
fmt.Fprintf(&b, "- %s\n", finding)
|
||||||
}
|
}
|
||||||
@@ -27,96 +76,250 @@ func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if len(result.Warnings) > 0 {
|
if len(result.Warnings) > 0 {
|
||||||
fmt.Fprintf(&b, "Warnings\n")
|
b.WriteString("## Warnings\n\n")
|
||||||
fmt.Fprintf(&b, "--------\n")
|
|
||||||
for _, warning := range result.Warnings {
|
for _, warning := range result.Warnings {
|
||||||
fmt.Fprintf(&b, "- %s\n", warning)
|
fmt.Fprintf(&b, "- %s\n", warning)
|
||||||
}
|
}
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
fmt.Fprintf(&b, "Per GPU Scorecard\n")
|
// ── Scorecard table ───────────────────────────────────────────────────────
|
||||||
fmt.Fprintf(&b, "-----------------\n")
|
b.WriteString("## Scorecard\n\n")
|
||||||
|
b.WriteString("| GPU | Status | Composite | Compute | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n")
|
||||||
|
b.WriteString("|-----|--------|-----------|---------|-------------|---------------|-----------------|-----------|-------------|\n")
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
fmt.Fprintf(&b, "GPU %d %s\n", gpu.Index, gpu.Name)
|
name := strings.TrimSpace(gpu.Name)
|
||||||
fmt.Fprintf(&b, " Status: %s\n", gpu.Status)
|
if name == "" {
|
||||||
fmt.Fprintf(&b, " Composite score: %.2f\n", gpu.Scores.CompositeScore)
|
name = "Unknown"
|
||||||
fmt.Fprintf(&b, " Compute score: %.2f\n", gpu.Scores.ComputeScore)
|
}
|
||||||
fmt.Fprintf(&b, " Power sustain: %.1f\n", gpu.Scores.PowerSustainScore)
|
interconnect := "-"
|
||||||
fmt.Fprintf(&b, " Thermal sustain: %.1f\n", gpu.Scores.ThermalSustainScore)
|
|
||||||
fmt.Fprintf(&b, " Stability: %.1f\n", gpu.Scores.StabilityScore)
|
|
||||||
if gpu.Scores.InterconnectScore > 0 {
|
if gpu.Scores.InterconnectScore > 0 {
|
||||||
fmt.Fprintf(&b, " Interconnect: %.1f\n", gpu.Scores.InterconnectScore)
|
interconnect = fmt.Sprintf("%.1f", gpu.Scores.InterconnectScore)
|
||||||
}
|
}
|
||||||
if len(gpu.DegradationReasons) > 0 {
|
topsPerSM := "-"
|
||||||
fmt.Fprintf(&b, " Degradation reasons: %s\n", strings.Join(gpu.DegradationReasons, ", "))
|
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||||||
|
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, " Avg power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.AvgPowerW, gpu.Steady.AvgTempC, gpu.Steady.AvgGraphicsClockMHz)
|
fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %.1f | %.1f | %.1f | %s |\n",
|
||||||
fmt.Fprintf(&b, " P95 power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.P95PowerW, gpu.Steady.P95TempC, gpu.Steady.P95GraphicsClockMHz)
|
gpu.Index, name,
|
||||||
if len(gpu.PrecisionResults) > 0 {
|
gpu.Status,
|
||||||
fmt.Fprintf(&b, " Precision results:\n")
|
gpu.Scores.CompositeScore,
|
||||||
for _, precision := range gpu.PrecisionResults {
|
gpu.Scores.ComputeScore,
|
||||||
if precision.Supported {
|
topsPerSM,
|
||||||
fmt.Fprintf(&b, " - %s: %.2f TOPS lanes=%d iterations=%d\n", precision.Name, precision.TeraOpsPerSec, precision.Lanes, precision.Iterations)
|
gpu.Scores.PowerSustainScore,
|
||||||
} else {
|
gpu.Scores.ThermalSustainScore,
|
||||||
fmt.Fprintf(&b, " - %s: unsupported (%s)\n", precision.Name, precision.Notes)
|
gpu.Scores.StabilityScore,
|
||||||
}
|
interconnect,
|
||||||
}
|
|
||||||
}
|
|
||||||
fmt.Fprintf(&b, " Throttle counters (us): sw_power=%d sw_thermal=%d sync_boost=%d hw_thermal=%d hw_power_brake=%d\n",
|
|
||||||
gpu.Throttle.SWPowerCapUS,
|
|
||||||
gpu.Throttle.SWThermalSlowdownUS,
|
|
||||||
gpu.Throttle.SyncBoostUS,
|
|
||||||
gpu.Throttle.HWThermalSlowdownUS,
|
|
||||||
gpu.Throttle.HWPowerBrakeSlowdownUS,
|
|
||||||
)
|
)
|
||||||
if len(gpu.Notes) > 0 {
|
}
|
||||||
fmt.Fprintf(&b, " Notes:\n")
|
b.WriteString("\n")
|
||||||
for _, note := range gpu.Notes {
|
|
||||||
fmt.Fprintf(&b, " - %s\n", note)
|
// ── Per GPU detail ────────────────────────────────────────────────────────
|
||||||
}
|
b.WriteString("## Per-GPU Details\n\n")
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
name := strings.TrimSpace(gpu.Name)
|
||||||
|
if name == "" {
|
||||||
|
name = "Unknown GPU"
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, name)
|
||||||
|
|
||||||
|
// Identity
|
||||||
|
if gpu.BusID != "" {
|
||||||
|
fmt.Fprintf(&b, "- **Bus ID:** %s\n", gpu.BusID)
|
||||||
|
}
|
||||||
|
if gpu.VBIOS != "" {
|
||||||
|
fmt.Fprintf(&b, "- **vBIOS:** %s\n", gpu.VBIOS)
|
||||||
|
}
|
||||||
|
if gpu.ComputeCapability != "" {
|
||||||
|
fmt.Fprintf(&b, "- **Compute capability:** %s\n", gpu.ComputeCapability)
|
||||||
|
}
|
||||||
|
if gpu.MultiprocessorCount > 0 {
|
||||||
|
fmt.Fprintf(&b, "- **SMs:** %d\n", gpu.MultiprocessorCount)
|
||||||
|
}
|
||||||
|
if gpu.PowerLimitW > 0 {
|
||||||
|
fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
|
||||||
|
}
|
||||||
|
if gpu.LockedGraphicsClockMHz > 0 {
|
||||||
|
fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
|
||||||
}
|
}
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
|
|
||||||
|
// Steady-state telemetry
|
||||||
|
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
|
||||||
|
b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
|
||||||
|
fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
|
||||||
|
fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
|
||||||
|
fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
|
||||||
|
fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
|
||||||
|
fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
|
||||||
|
b.WriteString("\n")
|
||||||
|
|
||||||
|
// Throttle
|
||||||
|
throttle := formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec)
|
||||||
|
if throttle != "none" {
|
||||||
|
fmt.Fprintf(&b, "**Throttle:** %s\n\n", throttle)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Precision results
|
||||||
|
if len(gpu.PrecisionResults) > 0 {
|
||||||
|
b.WriteString("**Precision results:**\n\n")
|
||||||
|
b.WriteString("| Precision | TOPS | Lanes | Iterations |\n|-----------|------|-------|------------|\n")
|
||||||
|
for _, p := range gpu.PrecisionResults {
|
||||||
|
if p.Supported {
|
||||||
|
fmt.Fprintf(&b, "| %s | %.2f | %d | %d |\n", p.Name, p.TeraOpsPerSec, p.Lanes, p.Iterations)
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(&b, "| %s | — (unsupported) | — | — |\n", p.Name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Degradation / Notes
|
||||||
|
if len(gpu.DegradationReasons) > 0 {
|
||||||
|
fmt.Fprintf(&b, "**Degradation reasons:** %s\n\n", strings.Join(gpu.DegradationReasons, ", "))
|
||||||
|
}
|
||||||
|
if len(gpu.Notes) > 0 {
|
||||||
|
b.WriteString("**Notes:**\n\n")
|
||||||
|
for _, note := range gpu.Notes {
|
||||||
|
fmt.Fprintf(&b, "- %s\n", note)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── Interconnect ──────────────────────────────────────────────────────────
|
||||||
if result.Interconnect != nil {
|
if result.Interconnect != nil {
|
||||||
fmt.Fprintf(&b, "Interconnect\n")
|
b.WriteString("## Interconnect (NCCL)\n\n")
|
||||||
fmt.Fprintf(&b, "------------\n")
|
fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
|
||||||
fmt.Fprintf(&b, "Status: %s\n", result.Interconnect.Status)
|
|
||||||
if result.Interconnect.Supported {
|
if result.Interconnect.Supported {
|
||||||
fmt.Fprintf(&b, "Avg algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.AvgBusBWGBps)
|
b.WriteString("| Metric | Avg | Max |\n|--------|-----|-----|\n")
|
||||||
fmt.Fprintf(&b, "Max algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.MaxAlgBWGBps, result.Interconnect.MaxBusBWGBps)
|
fmt.Fprintf(&b, "| Alg BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.MaxAlgBWGBps)
|
||||||
|
fmt.Fprintf(&b, "| Bus BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgBusBWGBps, result.Interconnect.MaxBusBWGBps)
|
||||||
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
for _, note := range result.Interconnect.Notes {
|
for _, note := range result.Interconnect.Notes {
|
||||||
fmt.Fprintf(&b, "- %s\n", note)
|
fmt.Fprintf(&b, "- %s\n", note)
|
||||||
}
|
}
|
||||||
b.WriteString("\n")
|
if len(result.Interconnect.Notes) > 0 {
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fmt.Fprintf(&b, "Methodology\n")
|
// ── Server Power (IPMI) ───────────────────────────────────────────────────
|
||||||
fmt.Fprintf(&b, "-----------\n")
|
if sp := result.ServerPower; sp != nil {
|
||||||
fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile)
|
b.WriteString("## Server Power (IPMI)\n\n")
|
||||||
fmt.Fprintf(&b, "- Single-GPU compute score comes from bee-gpu-burn cuBLASLt output when available.\n")
|
if !sp.Available {
|
||||||
fmt.Fprintf(&b, "- Thermal and power limitations are inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
|
b.WriteString("IPMI power measurement unavailable.\n\n")
|
||||||
fmt.Fprintf(&b, "- result.json is the canonical machine-readable source for this benchmark run.\n\n")
|
} else {
|
||||||
|
b.WriteString("| | Value |\n|---|---|\n")
|
||||||
|
fmt.Fprintf(&b, "| Server idle | %.0f W |\n", sp.IdleW)
|
||||||
|
fmt.Fprintf(&b, "| Server under load | %.0f W |\n", sp.LoadedW)
|
||||||
|
fmt.Fprintf(&b, "| Server delta (load − idle) | %.0f W |\n", sp.DeltaW)
|
||||||
|
fmt.Fprintf(&b, "| GPU-reported sum | %.0f W |\n", sp.GPUReportedSumW)
|
||||||
|
if sp.ReportingRatio > 0 {
|
||||||
|
fmt.Fprintf(&b, "| Reporting ratio | %.2f (1.0 = accurate, <0.75 = GPU over-reports) |\n", sp.ReportingRatio)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
for _, note := range sp.Notes {
|
||||||
|
fmt.Fprintf(&b, "- %s\n", note)
|
||||||
|
}
|
||||||
|
if len(sp.Notes) > 0 {
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fmt.Fprintf(&b, "Raw Files\n")
|
// ── Terminal charts (steady-state only) ───────────────────────────────────
|
||||||
fmt.Fprintf(&b, "---------\n")
|
if len(charts) > 0 {
|
||||||
fmt.Fprintf(&b, "- result.json\n")
|
b.WriteString("## Steady-State Charts\n\n")
|
||||||
fmt.Fprintf(&b, "- report.txt\n")
|
for _, chart := range charts {
|
||||||
fmt.Fprintf(&b, "- summary.txt\n")
|
content := strings.TrimSpace(stripANSIEscapeSequences(chart.Content))
|
||||||
fmt.Fprintf(&b, "- verbose.log\n")
|
if content == "" {
|
||||||
fmt.Fprintf(&b, "- gpu-*-baseline-metrics.csv/html/term.txt\n")
|
continue
|
||||||
fmt.Fprintf(&b, "- gpu-*-warmup.log\n")
|
}
|
||||||
fmt.Fprintf(&b, "- gpu-*-steady.log\n")
|
fmt.Fprintf(&b, "### %s\n\n```\n%s\n```\n\n", chart.Title, content)
|
||||||
fmt.Fprintf(&b, "- gpu-*-steady-metrics.csv/html/term.txt\n")
|
}
|
||||||
fmt.Fprintf(&b, "- gpu-*-cooldown-metrics.csv/html/term.txt\n")
|
}
|
||||||
|
|
||||||
|
// ── Methodology ───────────────────────────────────────────────────────────
|
||||||
|
b.WriteString("## Methodology\n\n")
|
||||||
|
fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline → warmup → steady-state → interconnect → cooldown phases.\n", result.BenchmarkProfile)
|
||||||
|
b.WriteString("- Single-GPU compute score from bee-gpu-burn cuBLASLt when available.\n")
|
||||||
|
b.WriteString("- Thermal and power limitations inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
|
||||||
|
b.WriteString("- `result.json` is the canonical machine-readable source for this benchmark run.\n\n")
|
||||||
|
|
||||||
|
// ── Raw files ─────────────────────────────────────────────────────────────
|
||||||
|
b.WriteString("## Raw Files\n\n")
|
||||||
|
b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
|
||||||
|
b.WriteString("- `gpu-*-baseline-metrics.csv/html/term.txt`\n")
|
||||||
|
b.WriteString("- `gpu-*-warmup.log`\n")
|
||||||
|
b.WriteString("- `gpu-*-steady.log`\n")
|
||||||
|
b.WriteString("- `gpu-*-steady-metrics.csv/html/term.txt`\n")
|
||||||
|
b.WriteString("- `gpu-*-cooldown-metrics.csv/html/term.txt`\n")
|
||||||
if result.Interconnect != nil {
|
if result.Interconnect != nil {
|
||||||
fmt.Fprintf(&b, "- nccl-all-reduce.log\n")
|
b.WriteString("- `nccl-all-reduce.log`\n")
|
||||||
}
|
}
|
||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// loadBenchmarkReportCharts loads only steady-state terminal charts (baseline and
|
||||||
|
// cooldown charts are not useful for human review).
|
||||||
|
func loadBenchmarkReportCharts(runDir string, gpuIndices []int) []benchmarkReportChart {
|
||||||
|
var charts []benchmarkReportChart
|
||||||
|
for _, idx := range gpuIndices {
|
||||||
|
path := filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady-metrics-term.txt", idx))
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil || len(raw) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
charts = append(charts, benchmarkReportChart{
|
||||||
|
Title: fmt.Sprintf("GPU %d — Steady State", idx),
|
||||||
|
Content: string(raw),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return charts
|
||||||
|
}
|
||||||
|
|
||||||
|
func stripANSIEscapeSequences(raw string) string {
|
||||||
|
return ansiEscapePattern.ReplaceAllString(raw, "")
|
||||||
|
}
|
||||||
|
|
||||||
|
// formatThrottleLine renders throttle counters as human-readable percentages of
|
||||||
|
// the steady-state window. Only non-zero counters are shown. When the steady
|
||||||
|
// duration is unknown (0), raw seconds are shown instead.
|
||||||
|
func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64) string {
|
||||||
|
type counter struct {
|
||||||
|
label string
|
||||||
|
us uint64
|
||||||
|
}
|
||||||
|
counters := []counter{
|
||||||
|
{"sw_power", t.SWPowerCapUS},
|
||||||
|
{"sw_thermal", t.SWThermalSlowdownUS},
|
||||||
|
{"sync_boost", t.SyncBoostUS},
|
||||||
|
{"hw_thermal", t.HWThermalSlowdownUS},
|
||||||
|
{"hw_power_brake", t.HWPowerBrakeSlowdownUS},
|
||||||
|
}
|
||||||
|
var parts []string
|
||||||
|
for _, c := range counters {
|
||||||
|
if c.us == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
sec := float64(c.us) / 1e6
|
||||||
|
if steadyDurationSec > 0 {
|
||||||
|
pct := sec / steadyDurationSec * 100
|
||||||
|
parts = append(parts, fmt.Sprintf("%s=%.1f%% (%.0fs)", c.label, pct, sec))
|
||||||
|
} else if sec < 1 {
|
||||||
|
parts = append(parts, fmt.Sprintf("%s=%.0fms", c.label, sec*1000))
|
||||||
|
} else {
|
||||||
|
parts = append(parts, fmt.Sprintf("%s=%.1fs", c.label, sec))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(parts) == 0 {
|
||||||
|
return "none"
|
||||||
|
}
|
||||||
|
return strings.Join(parts, " ")
|
||||||
|
}
|
||||||
|
|
||||||
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
|
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
|
||||||
var b strings.Builder
|
var b strings.Builder
|
||||||
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
|
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
|
||||||
|
|||||||
@@ -137,11 +137,44 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
|
|||||||
for _, needle := range []string{
|
for _, needle := range []string{
|
||||||
"Executive Summary",
|
"Executive Summary",
|
||||||
"GPU 0 spent measurable time under SW power cap.",
|
"GPU 0 spent measurable time under SW power cap.",
|
||||||
"Composite score: 1176.00",
|
"1176.00",
|
||||||
"fp16_tensor: 700.00 TOPS",
|
"fp16_tensor",
|
||||||
|
"700.00",
|
||||||
} {
|
} {
|
||||||
if !strings.Contains(report, needle) {
|
if !strings.Contains(report, needle) {
|
||||||
t.Fatalf("report missing %q\n%s", needle, report)
|
t.Fatalf("report missing %q\n%s", needle, report)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRenderBenchmarkReportIncludesTerminalChartsWithoutANSI(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
report := renderBenchmarkReportWithCharts(NvidiaBenchmarkResult{
|
||||||
|
BenchmarkProfile: NvidiaBenchmarkProfileStandard,
|
||||||
|
OverallStatus: "OK",
|
||||||
|
SelectedGPUIndices: []int{0},
|
||||||
|
Normalization: BenchmarkNormalization{
|
||||||
|
Status: "full",
|
||||||
|
},
|
||||||
|
}, []benchmarkReportChart{
|
||||||
|
{
|
||||||
|
Title: "GPU 0 Steady State",
|
||||||
|
Content: "\x1b[31mGPU 0 chart\x1b[0m\n 42┤───",
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
|
for _, needle := range []string{
|
||||||
|
"Steady-State Charts",
|
||||||
|
"GPU 0 Steady State",
|
||||||
|
"GPU 0 chart",
|
||||||
|
"42┤───",
|
||||||
|
} {
|
||||||
|
if !strings.Contains(report, needle) {
|
||||||
|
t.Fatalf("report missing %q\n%s", needle, report)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if strings.Contains(report, "\x1b[31m") {
|
||||||
|
t.Fatalf("report should not contain ANSI escapes\n%s", report)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -14,13 +14,17 @@ type NvidiaBenchmarkOptions struct {
|
|||||||
GPUIndices []int
|
GPUIndices []int
|
||||||
ExcludeGPUIndices []int
|
ExcludeGPUIndices []int
|
||||||
RunNCCL bool
|
RunNCCL bool
|
||||||
|
ParallelGPUs bool // run all selected GPUs simultaneously instead of sequentially
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
type NvidiaBenchmarkResult struct {
|
type NvidiaBenchmarkResult struct {
|
||||||
BenchmarkVersion string `json:"benchmark_version"`
|
BenchmarkVersion string `json:"benchmark_version"`
|
||||||
GeneratedAt time.Time `json:"generated_at"`
|
GeneratedAt time.Time `json:"generated_at"`
|
||||||
Hostname string `json:"hostname,omitempty"`
|
Hostname string `json:"hostname,omitempty"`
|
||||||
|
ServerModel string `json:"server_model,omitempty"`
|
||||||
BenchmarkProfile string `json:"benchmark_profile"`
|
BenchmarkProfile string `json:"benchmark_profile"`
|
||||||
|
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||||
OverallStatus string `json:"overall_status"`
|
OverallStatus string `json:"overall_status"`
|
||||||
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||||
Findings []string `json:"findings,omitempty"`
|
Findings []string `json:"findings,omitempty"`
|
||||||
@@ -28,6 +32,7 @@ type NvidiaBenchmarkResult struct {
|
|||||||
Normalization BenchmarkNormalization `json:"normalization"`
|
Normalization BenchmarkNormalization `json:"normalization"`
|
||||||
GPUs []BenchmarkGPUResult `json:"gpus"`
|
GPUs []BenchmarkGPUResult `json:"gpus"`
|
||||||
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
||||||
|
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type BenchmarkNormalization struct {
|
type BenchmarkNormalization struct {
|
||||||
@@ -56,7 +61,10 @@ type BenchmarkGPUResult struct {
|
|||||||
Backend string `json:"backend,omitempty"`
|
Backend string `json:"backend,omitempty"`
|
||||||
Status string `json:"status"`
|
Status string `json:"status"`
|
||||||
PowerLimitW float64 `json:"power_limit_w,omitempty"`
|
PowerLimitW float64 `json:"power_limit_w,omitempty"`
|
||||||
|
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
|
||||||
|
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||||
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
|
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
|
||||||
|
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
|
||||||
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
|
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
|
||||||
LockedGraphicsClockMHz float64 `json:"locked_graphics_clock_mhz,omitempty"`
|
LockedGraphicsClockMHz float64 `json:"locked_graphics_clock_mhz,omitempty"`
|
||||||
LockedMemoryClockMHz float64 `json:"locked_memory_clock_mhz,omitempty"`
|
LockedMemoryClockMHz float64 `json:"locked_memory_clock_mhz,omitempty"`
|
||||||
@@ -117,6 +125,24 @@ type BenchmarkScorecard struct {
|
|||||||
StabilityScore float64 `json:"stability_score"`
|
StabilityScore float64 `json:"stability_score"`
|
||||||
InterconnectScore float64 `json:"interconnect_score"`
|
InterconnectScore float64 `json:"interconnect_score"`
|
||||||
CompositeScore float64 `json:"composite_score"`
|
CompositeScore float64 `json:"composite_score"`
|
||||||
|
// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
|
||||||
|
// Comparable across throttle levels and GPU generations. Low value at normal
|
||||||
|
// clocks indicates silicon degradation.
|
||||||
|
TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// BenchmarkServerPower records server-level power as measured through IPMI
// next to the power the GPUs report about themselves.
//
// ReportingRatio is delta / gpu_reported_sum. A value close to 1.0 means the
// GPU power telemetry is accurate; a value well below 1.0 (for example 0.5)
// means the GPUs are over-reporting their consumption.
type BenchmarkServerPower struct {
	// Available is always serialized (no omitempty), unlike the other fields.
	Available bool `json:"available"`

	// Power figures in watts, going by the W suffix.
	// NOTE(review): presumably idle/loaded are the IPMI readings and DeltaW
	// their difference — confirm against the code that fills this struct.
	IdleW           float64 `json:"idle_w,omitempty"`
	LoadedW         float64 `json:"loaded_w,omitempty"`
	DeltaW          float64 `json:"delta_w,omitempty"`
	GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"`

	ReportingRatio float64  `json:"reporting_ratio,omitempty"`
	Notes          []string `json:"notes,omitempty"`
}
|
||||||
|
|
||||||
type BenchmarkInterconnectResult struct {
|
type BenchmarkInterconnectResult struct {
|
||||||
|
|||||||
@@ -383,10 +383,7 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ANSI escape sequences used when rendering terminal charts.
const (
	// ansiAmber selects foreground colour 214 of the 256-colour palette.
	ansiAmber = "\033[38;5;214m"
	// ansiReset clears all previously set attributes.
	ansiReset = "\033[0m"
)
|
||||||
|
|
||||||
@@ -415,10 +412,10 @@ func RenderGPUTerminalChart(rows []GPUMetricRow) string {
|
|||||||
fn func(GPUMetricRow) float64
|
fn func(GPUMetricRow) float64
|
||||||
}
|
}
|
||||||
defs := []seriesDef{
|
defs := []seriesDef{
|
||||||
{"Temperature (°C)", ansiRed, func(r GPUMetricRow) float64 { return r.TempC }},
|
{"Temperature (°C)", ansiAmber, func(r GPUMetricRow) float64 { return r.TempC }},
|
||||||
{"GPU Usage (%)", ansiBlue, func(r GPUMetricRow) float64 { return r.UsagePct }},
|
{"GPU Usage (%)", ansiAmber, func(r GPUMetricRow) float64 { return r.UsagePct }},
|
||||||
{"Power (W)", ansiGreen, func(r GPUMetricRow) float64 { return r.PowerW }},
|
{"Power (W)", ansiAmber, func(r GPUMetricRow) float64 { return r.PowerW }},
|
||||||
{"Clock (MHz)", ansiYellow, func(r GPUMetricRow) float64 { return r.ClockMHz }},
|
{"Clock (MHz)", ansiAmber, func(r GPUMetricRow) float64 { return r.ClockMHz }},
|
||||||
}
|
}
|
||||||
|
|
||||||
var b strings.Builder
|
var b strings.Builder
|
||||||
|
|||||||
@@ -116,25 +116,47 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
|
|||||||
if err := ctx.Err(); err != nil {
|
if err := ctx.Err(); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
if err := exec.Command("mount", "--bind", dstDir, "/run/live/medium").Run(); err != nil {
|
|
||||||
log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err))
|
mediumRebound := false
|
||||||
|
if err := bindMount(dstDir, "/run/live/medium"); err != nil {
|
||||||
|
log(fmt.Sprintf("Warning: rebind /run/live/medium → %s failed: %v", dstDir, err))
|
||||||
|
} else {
|
||||||
|
mediumRebound = true
|
||||||
}
|
}
|
||||||
|
|
||||||
log("Verifying live medium now served from RAM...")
|
log("Verifying live medium now served from RAM...")
|
||||||
status := s.LiveBootSource()
|
status := s.LiveBootSource()
|
||||||
if err := verifyInstallToRAMStatus(status); err != nil {
|
if err := verifyInstallToRAMStatus(status, dstDir, mediumRebound, log); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
|
if status.InRAM {
|
||||||
log("Done. Installation media can be safely disconnected.")
|
log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
|
||||||
|
}
|
||||||
|
log("Done. Squashfs files are in RAM. Installation media can be safely disconnected.")
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func verifyInstallToRAMStatus(status LiveBootSource) error {
|
func verifyInstallToRAMStatus(status LiveBootSource, dstDir string, mediumRebound bool, log func(string)) error {
|
||||||
if status.InRAM {
|
if status.InRAM {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s", describeLiveBootSource(status))
|
|
||||||
|
// The live medium mount was not redirected to RAM. This is expected when
|
||||||
|
// booting from an ISO/CD-ROM: the squashfs loop device has a non-zero
|
||||||
|
// offset and LOOP_CHANGE_FD cannot be used; the bind mount also fails
|
||||||
|
// because the CD-ROM mount is in use. Check whether files were at least
|
||||||
|
// copied to the tmpfs directory — that is sufficient for safe disconnection
|
||||||
|
// once the kernel has paged in all actively-used data.
|
||||||
|
files, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
|
||||||
|
if len(files) > 0 {
|
||||||
|
if !mediumRebound {
|
||||||
|
log(fmt.Sprintf("Note: squashfs copied to RAM (%s) but /run/live/medium still shows the original source.", dstDir))
|
||||||
|
log("This is normal for CD-ROM boots. For a fully transparent RAM boot, add 'toram' to the kernel parameters.")
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s and no squashfs found in %s", describeLiveBootSource(status), dstDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
func describeLiveBootSource(status LiveBootSource) string {
|
func describeLiveBootSource(status LiveBootSource) string {
|
||||||
@@ -247,7 +269,31 @@ func findLoopForFile(backingFile string) (string, error) {
|
|||||||
return "", fmt.Errorf("no loop device found for %s", backingFile)
|
return "", fmt.Errorf("no loop device found for %s", backingFile)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// loopDeviceOffset returns the byte offset configured for the loop device,
// or -1 if it cannot be determined.
//
// The offset is read from `losetup --json <loopDev>`. -1 is returned when the
// command fails, its output is not valid JSON, no loop device is listed, or
// the offset is not an integer.
func loopDeviceOffset(loopDev string) int64 {
	out, err := exec.Command("losetup", "--json", loopDev).Output()
	if err != nil {
		return -1
	}

	// json.Number accepts both a bare JSON number and a quoted numeric string.
	// Some losetup versions emit every column as a string ("offset": "0");
	// decoding straight into int64 would reject that and hide a real offset.
	var report struct {
		Loopdevices []struct {
			Offset json.Number `json:"offset"`
		} `json:"loopdevices"`
	}
	if err := json.Unmarshal(out, &report); err != nil || len(report.Loopdevices) == 0 {
		return -1
	}

	raw := report.Loopdevices[0].Offset
	if raw == "" {
		// Field absent or null: keep the historical zero-value result.
		return 0
	}
	offset, err := raw.Int64()
	if err != nil {
		return -1
	}
	return offset
}
|
||||||
|
|
||||||
func reassociateLoopDevice(loopDev, newFile string) error {
|
func reassociateLoopDevice(loopDev, newFile string) error {
|
||||||
|
// LOOP_CHANGE_FD requires lo_offset == 0. ISO/CD-ROM loop devices are
|
||||||
|
// typically set up with a non-zero offset (squashfs lives inside the ISO),
|
||||||
|
// so the ioctl returns EINVAL. Detect this early for a clear error message.
|
||||||
|
if off := loopDeviceOffset(loopDev); off > 0 {
|
||||||
|
return fmt.Errorf("loop device has non-zero offset (%d bytes, typical for ISO/CD-ROM) — LOOP_CHANGE_FD not supported; use 'toram' kernel parameter for RAM boot", off)
|
||||||
|
}
|
||||||
if err := exec.Command("losetup", "--replace", loopDev, newFile).Run(); err == nil {
|
if err := exec.Command("losetup", "--replace", loopDev, newFile).Run(); err == nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -26,3 +26,8 @@ func loopChangeFD(loopDev, newFile string) error {
|
|||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// bindMount bind-mounts src onto dst by calling the mount syscall directly
// with MS_BIND, rather than running an external mount binary (which avoids
// exec PATH issues). Filesystem type and mount data are left empty.
func bindMount(src, dst string) error {
	const (
		noFSType = ""
		noData   = ""
	)
	return syscall.Mount(src, dst, noFSType, syscall.MS_BIND, noData)
}
|
||||||
|
|||||||
@@ -7,3 +7,7 @@ import "errors"
|
|||||||
// loopChangeFD is the fallback used where LOOP_CHANGE_FD is unavailable; it
// ignores its arguments and always returns an error.
func loopChangeFD(loopDev, newFile string) error {
	unsupported := errors.New("LOOP_CHANGE_FD not available on this platform")
	return unsupported
}
|
||||||
|
|
||||||
|
// bindMount is the fallback used where bind mounts are unavailable; it
// ignores its arguments and always returns an error.
func bindMount(src, dst string) error {
	unsupported := errors.New("bind mount not available on this platform")
	return unsupported
}
|
||||||
|
|||||||
@@ -33,14 +33,17 @@ func TestInferLiveBootKind(t *testing.T) {
|
|||||||
func TestVerifyInstallToRAMStatus(t *testing.T) {
|
func TestVerifyInstallToRAMStatus(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
if err := verifyInstallToRAMStatus(LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}); err != nil {
|
dstDir := t.TempDir()
|
||||||
|
|
||||||
|
if err := verifyInstallToRAMStatus(LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}, dstDir, false, nil); err != nil {
|
||||||
t.Fatalf("expected success for RAM-backed status, got %v", err)
|
t.Fatalf("expected success for RAM-backed status, got %v", err)
|
||||||
}
|
}
|
||||||
err := verifyInstallToRAMStatus(LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"})
|
|
||||||
|
err := verifyInstallToRAMStatus(LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"}, dstDir, false, nil)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
t.Fatal("expected verification failure when media is still on USB")
|
t.Fatal("expected verification failure when media is still on USB")
|
||||||
}
|
}
|
||||||
if got := err.Error(); got != "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1)" {
|
if got := err.Error(); got != "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1) and no squashfs found in "+dstDir {
|
||||||
t.Fatalf("error=%q", got)
|
t.Fatalf("error=%q", got)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -15,6 +15,10 @@ var workerPatterns = []string{
|
|||||||
"stress-ng",
|
"stress-ng",
|
||||||
"stressapptest",
|
"stressapptest",
|
||||||
"memtester",
|
"memtester",
|
||||||
|
// DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives
|
||||||
|
// if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE).
|
||||||
|
"nvvs",
|
||||||
|
"dcgmi",
|
||||||
}
|
}
|
||||||
|
|
||||||
// KilledProcess describes a process that was sent SIGKILL.
|
// KilledProcess describes a process that was sent SIGKILL.
|
||||||
|
|||||||
@@ -16,12 +16,12 @@ func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts N
|
|||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
satJob{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
||||||
job,
|
job,
|
||||||
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
}, logFunc)
|
), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func nvidiaStressArchivePrefix(loader string) string {
|
func nvidiaStressArchivePrefix(loader string) string {
|
||||||
@@ -49,6 +49,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
|||||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||||
}
|
}
|
||||||
|
if opts.StaggerSeconds > 0 && len(selected) > 1 {
|
||||||
|
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
|
||||||
|
}
|
||||||
if len(selected) > 0 {
|
if len(selected) > 0 {
|
||||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||||
}
|
}
|
||||||
@@ -63,6 +66,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
|||||||
"bee-john-gpu-stress",
|
"bee-john-gpu-stress",
|
||||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||||
}
|
}
|
||||||
|
if opts.StaggerSeconds > 0 && len(selected) > 1 {
|
||||||
|
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
|
||||||
|
}
|
||||||
if len(selected) > 0 {
|
if len(selected) > 0 {
|
||||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -110,7 +110,7 @@ func (s *System) RunPlatformStress(
|
|||||||
wg.Add(1)
|
wg.Add(1)
|
||||||
go func() {
|
go func() {
|
||||||
defer wg.Done()
|
defer wg.Done()
|
||||||
gpuCmd := buildGPUStressCmd(loadCtx, vendor)
|
gpuCmd := buildGPUStressCmd(loadCtx, vendor, cycle.LoadSec)
|
||||||
if gpuCmd == nil {
|
if gpuCmd == nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -392,6 +392,13 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
|||||||
cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
|
cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
|
||||||
}
|
}
|
||||||
cmd := exec.CommandContext(ctx, path, cmdArgs...)
|
cmd := exec.CommandContext(ctx, path, cmdArgs...)
|
||||||
|
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||||
|
cmd.Cancel = func() error {
|
||||||
|
if cmd.Process != nil {
|
||||||
|
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
cmd.Stdout = nil
|
cmd.Stdout = nil
|
||||||
cmd.Stderr = nil
|
cmd.Stderr = nil
|
||||||
if err := startLowPriorityCmd(cmd, 15); err != nil {
|
if err := startLowPriorityCmd(cmd, 15); err != nil {
|
||||||
@@ -402,28 +409,28 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
|||||||
|
|
||||||
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
|
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
|
||||||
// Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
|
// Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
|
||||||
func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
|
func buildGPUStressCmd(ctx context.Context, vendor string, durSec int) *exec.Cmd {
|
||||||
switch strings.ToLower(vendor) {
|
switch strings.ToLower(vendor) {
|
||||||
case "amd":
|
case "amd":
|
||||||
return buildAMDGPUStressCmd(ctx)
|
return buildAMDGPUStressCmd(ctx, durSec)
|
||||||
case "nvidia":
|
case "nvidia":
|
||||||
return buildNvidiaGPUStressCmd(ctx)
|
return buildNvidiaGPUStressCmd(ctx, durSec)
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
func buildAMDGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
|
||||||
rvsArgs, err := resolveRVSCommand()
|
rvsArgs, err := resolveRVSCommand()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
rvsPath := rvsArgs[0]
|
rvsPath := rvsArgs[0]
|
||||||
cfg := `actions:
|
cfg := fmt.Sprintf(`actions:
|
||||||
- name: gst_platform
|
- name: gst_platform
|
||||||
device: all
|
device: all
|
||||||
module: gst
|
module: gst
|
||||||
parallel: true
|
parallel: true
|
||||||
duration: 86400000
|
duration: %d`, durSec*1000) + `
|
||||||
copy_matrix: false
|
copy_matrix: false
|
||||||
target_stress: 90
|
target_stress: 90
|
||||||
matrix_size_a: 8640
|
matrix_size_a: 8640
|
||||||
@@ -433,13 +440,20 @@ func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
|||||||
cfgFile := "/tmp/bee-platform-gst.conf"
|
cfgFile := "/tmp/bee-platform-gst.conf"
|
||||||
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||||
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
|
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
|
||||||
|
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||||
|
cmd.Cancel = func() error {
|
||||||
|
if cmd.Process != nil {
|
||||||
|
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
cmd.Stdout = nil
|
cmd.Stdout = nil
|
||||||
cmd.Stderr = nil
|
cmd.Stderr = nil
|
||||||
_ = startLowPriorityCmd(cmd, 10)
|
_ = startLowPriorityCmd(cmd, 10)
|
||||||
return cmd
|
return cmd
|
||||||
}
|
}
|
||||||
|
|
||||||
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
func buildNvidiaGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
|
||||||
path, err := satLookPath("bee-gpu-burn")
|
path, err := satLookPath("bee-gpu-burn")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
path, err = satLookPath("bee-gpu-stress")
|
path, err = satLookPath("bee-gpu-stress")
|
||||||
@@ -447,7 +461,17 @@ func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
cmd := exec.CommandContext(ctx, path, "--seconds", "86400")
|
// Pass exact duration so bee-gpu-burn exits on its own when the cycle ends.
|
||||||
|
// Process group kill via Setpgid+Cancel is kept as a safety net for cases
|
||||||
|
// where the context is cancelled early (user stop, parent timeout).
|
||||||
|
cmd := exec.CommandContext(ctx, path, "--seconds", strconv.Itoa(durSec))
|
||||||
|
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||||
|
cmd.Cancel = func() error {
|
||||||
|
if cmd.Process != nil {
|
||||||
|
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
cmd.Stdout = nil
|
cmd.Stdout = nil
|
||||||
cmd.Stderr = nil
|
cmd.Stderr = nil
|
||||||
_ = startLowPriorityCmd(cmd, 10)
|
_ = startLowPriorityCmd(cmd, 10)
|
||||||
|
|||||||
@@ -173,6 +173,22 @@ func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHe
|
|||||||
|
|
||||||
switch vendor {
|
switch vendor {
|
||||||
case "nvidia":
|
case "nvidia":
|
||||||
|
if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
|
||||||
|
health.NvidiaGSPMode = strings.TrimSpace(string(raw))
|
||||||
|
if health.NvidiaGSPMode == "gsp-stuck" {
|
||||||
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
Code: "nvidia_gsp_stuck",
|
||||||
|
Severity: "critical",
|
||||||
|
Description: "NVIDIA GSP firmware init timed out and the kernel module is stuck. Reboot and select 'GSP=off' in the boot menu.",
|
||||||
|
})
|
||||||
|
} else if health.NvidiaGSPMode == "gsp-off" {
|
||||||
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
Code: "nvidia_gsp_disabled",
|
||||||
|
Severity: "warning",
|
||||||
|
Description: "NVIDIA GSP firmware disabled (fallback). Power management runs via CPU path — power draw readings may differ from reference hardware.",
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
|
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
|
||||||
if !health.DriverReady {
|
if !health.DriverReady {
|
||||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
|||||||
@@ -21,10 +21,11 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
satExecCommand = exec.Command
|
satExecCommand = exec.Command
|
||||||
satLookPath = exec.LookPath
|
satLookPath = exec.LookPath
|
||||||
satGlob = filepath.Glob
|
satGlob = filepath.Glob
|
||||||
satStat = os.Stat
|
satStat = os.Stat
|
||||||
|
satFreeMemBytes = freeMemBytes
|
||||||
|
|
||||||
rocmSMIExecutableGlobs = []string{
|
rocmSMIExecutableGlobs = []string{
|
||||||
"/opt/rocm/bin/rocm-smi",
|
"/opt/rocm/bin/rocm-smi",
|
||||||
@@ -87,6 +88,37 @@ type NvidiaGPU struct {
|
|||||||
MemoryMB int `json:"memory_mb"`
|
MemoryMB int `json:"memory_mb"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NvidiaGPUStatus is the JSON-serializable status of one NVIDIA GPU.
type NvidiaGPUStatus struct {
	Index  int    `json:"index"`
	Name   string `json:"name"`
	BDF    string `json:"bdf,omitempty"`    // PCI bus ID
	Serial string `json:"serial,omitempty"` // omitted from JSON when empty
	Status string `json:"status"`

	// RawLine keeps the unparsed text the entry was built from.
	RawLine string `json:"raw_line,omitempty"`

	// NeedsReset is always serialized; ParseFailure only when true.
	NeedsReset   bool `json:"needs_reset"`
	ParseFailure bool `json:"parse_failure,omitempty"`
}
|
||||||
|
|
||||||
|
// nvidiaGPUHealth is an internal per-GPU health record.
// NOTE(review): the code that fills it is outside this view; presumably it
// mirrors the reset/parse flags of NvidiaGPUStatus — confirm.
type nvidiaGPUHealth struct {
	Index int
	Name  string

	NeedsReset bool
	RawLine    string

	ParseFailure bool
}
|
||||||
|
|
||||||
|
// nvidiaGPUStatusFile is an internal per-GPU record combining a run status
// with health information.
// NOTE(review): producers and consumers are outside this view; field meanings
// beyond their names should be confirmed there.
type nvidiaGPUStatusFile struct {
	Index int
	Name  string

	RunStatus string
	Reason    string

	Health    string
	HealthRaw string

	Observed bool
	Selected bool

	FailingJob string
}
|
||||||
|
|
||||||
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
|
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
|
||||||
type AMDGPUInfo struct {
|
type AMDGPUInfo struct {
|
||||||
Index int `json:"index"`
|
Index int `json:"index"`
|
||||||
@@ -268,6 +300,72 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
|
|||||||
return gpus, nil
|
return gpus, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *System) ListNvidiaGPUStatuses() ([]NvidiaGPUStatus, error) {
|
||||||
|
out, err := satExecCommand(
|
||||||
|
"nvidia-smi",
|
||||||
|
"--query-gpu=index,name,pci.bus_id,serial,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total",
|
||||||
|
"--format=csv,noheader,nounits",
|
||||||
|
).Output()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
||||||
|
}
|
||||||
|
var gpus []NvidiaGPUStatus
|
||||||
|
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if line == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
parts := strings.Split(line, ",")
|
||||||
|
if len(parts) < 4 {
|
||||||
|
gpus = append(gpus, NvidiaGPUStatus{RawLine: line, Status: "UNKNOWN", ParseFailure: true})
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||||
|
if err != nil {
|
||||||
|
gpus = append(gpus, NvidiaGPUStatus{RawLine: line, Status: "UNKNOWN", ParseFailure: true})
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
upper := strings.ToUpper(line)
|
||||||
|
needsReset := strings.Contains(upper, "GPU REQUIRES RESET")
|
||||||
|
status := "OK"
|
||||||
|
if needsReset {
|
||||||
|
status = "RESET_REQUIRED"
|
||||||
|
}
|
||||||
|
gpus = append(gpus, NvidiaGPUStatus{
|
||||||
|
Index: idx,
|
||||||
|
Name: strings.TrimSpace(parts[1]),
|
||||||
|
BDF: normalizeNvidiaBusID(strings.TrimSpace(parts[2])),
|
||||||
|
Serial: strings.TrimSpace(parts[3]),
|
||||||
|
Status: status,
|
||||||
|
RawLine: line,
|
||||||
|
NeedsReset: needsReset,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
sort.Slice(gpus, func(i, j int) bool { return gpus[i].Index < gpus[j].Index })
|
||||||
|
return gpus, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalizeNvidiaBusID lower-cases and trims a PCI bus ID. When the ID has
// exactly three colon-separated parts and the first part is longer than four
// characters, that part is shortened to its last four characters (for example
// "00000000:3B:00.0" becomes "0000:3b:00.0"). Any other input is returned
// lower-cased and trimmed only.
func normalizeNvidiaBusID(v string) string {
	id := strings.ToLower(v)
	id = strings.TrimSpace(id)

	// Exactly two colons means exactly three parts.
	if strings.Count(id, ":") != 2 {
		return id
	}
	firstColon := strings.IndexByte(id, ':')
	if firstColon <= 4 {
		return id
	}
	// Keep only the last four characters of the first part.
	return id[firstColon-4:]
}
|
||||||
|
|
||||||
|
func (s *System) ResetNvidiaGPU(index int) (string, error) {
|
||||||
|
if index < 0 {
|
||||||
|
return "", fmt.Errorf("gpu index must be >= 0")
|
||||||
|
}
|
||||||
|
raw, err := satExecCommand("nvidia-smi", "-r", "-i", strconv.Itoa(index)).CombinedOutput()
|
||||||
|
if len(raw) == 0 && err == nil {
|
||||||
|
raw = []byte("GPU reset completed.\n")
|
||||||
|
}
|
||||||
|
return string(raw), err
|
||||||
|
}
|
||||||
|
|
||||||
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
|
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
|
||||||
// Measures collective communication bandwidth over NVLink/PCIe.
|
// Measures collective communication bandwidth over NVLink/PCIe.
|
||||||
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
@@ -277,36 +375,50 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
|
|||||||
if gpuCount < 1 {
|
if gpuCount < 1 {
|
||||||
gpuCount = 1
|
gpuCount = 1
|
||||||
}
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "02-all-reduce-perf.log", cmd: []string{
|
satJob{name: "02-all-reduce-perf.log", cmd: []string{
|
||||||
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
||||||
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
||||||
}},
|
}},
|
||||||
}, logFunc)
|
), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
profCmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
|
var (
|
||||||
if err != nil {
|
profCmd []string
|
||||||
return "", err
|
profEnv []string
|
||||||
|
)
|
||||||
|
if staggerSec > 0 && len(selected) > 1 {
|
||||||
|
profCmd = []string{
|
||||||
|
"bee-dcgmproftester-staggered",
|
||||||
|
"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
|
||||||
|
"--stagger-seconds", strconv.Itoa(staggerSec),
|
||||||
|
"--devices", joinIndexList(selected),
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
profCmd, err = resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
profEnv = nvidiaVisibleDevicesEnv(selected)
|
||||||
}
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
|
satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
|
||||||
{
|
satJob{
|
||||||
name: "03-dcgmproftester.log",
|
name: "03-dcgmproftester.log",
|
||||||
cmd: profCmd,
|
cmd: profCmd,
|
||||||
env: nvidiaVisibleDevicesEnv(selected),
|
env: profEnv,
|
||||||
collectGPU: true,
|
collectGPU: true,
|
||||||
gpuIndices: selected,
|
gpuIndices: selected,
|
||||||
},
|
},
|
||||||
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
}, logFunc)
|
), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
@@ -314,16 +426,16 @@ func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string,
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{
|
satJob{
|
||||||
name: "02-dcgmi-targeted-power.log",
|
name: "02-dcgmi-targeted-power.log",
|
||||||
cmd: nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), selected),
|
cmd: nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||||
collectGPU: true,
|
collectGPU: true,
|
||||||
gpuIndices: selected,
|
gpuIndices: selected,
|
||||||
},
|
},
|
||||||
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
}, logFunc)
|
), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
@@ -331,16 +443,16 @@ func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, dur
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{
|
satJob{
|
||||||
name: "02-dcgmi-pulse-test.log",
|
name: "02-dcgmi-pulse-test.log",
|
||||||
cmd: nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), selected),
|
cmd: nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||||
collectGPU: true,
|
collectGPU: true,
|
||||||
gpuIndices: selected,
|
gpuIndices: selected,
|
||||||
},
|
},
|
||||||
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
}, logFunc)
|
), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
@@ -348,16 +460,16 @@ func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpu
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{
|
satJob{
|
||||||
name: "02-dcgmi-nvbandwidth.log",
|
name: "02-dcgmi-nvbandwidth.log",
|
||||||
cmd: nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, selected),
|
cmd: nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, selected),
|
||||||
collectGPU: true,
|
collectGPU: true,
|
||||||
gpuIndices: selected,
|
gpuIndices: selected,
|
||||||
},
|
},
|
||||||
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
}, logFunc)
|
), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
@@ -381,16 +493,23 @@ func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDi
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", []satJob{
|
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||||
{
|
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||||
|
for _, p := range killed {
|
||||||
|
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", withNvidiaPersistenceMode(
|
||||||
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
satJob{
|
||||||
name: "02-dcgmi-targeted-stress.log",
|
name: "02-dcgmi-targeted-stress.log",
|
||||||
cmd: nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), selected),
|
cmd: nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||||
collectGPU: true,
|
collectGPU: true,
|
||||||
gpuIndices: selected,
|
gpuIndices: selected,
|
||||||
},
|
},
|
||||||
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
}, logFunc)
|
), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
|
func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
|
||||||
@@ -407,9 +526,32 @@ func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
|
|||||||
return all, nil
|
return all, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
func memoryStressSizeArg() string {
|
||||||
sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
|
if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
|
||||||
passes := envInt("BEE_MEMTESTER_PASSES", 1)
|
return fmt.Sprintf("%dM", mb)
|
||||||
|
}
|
||||||
|
availBytes := satFreeMemBytes()
|
||||||
|
if availBytes <= 0 {
|
||||||
|
return "80%"
|
||||||
|
}
|
||||||
|
availMB := availBytes / (1024 * 1024)
|
||||||
|
targetMB := (availMB * 2) / 3
|
||||||
|
if targetMB >= 256 {
|
||||||
|
targetMB = (targetMB / 256) * 256
|
||||||
|
}
|
||||||
|
if targetMB <= 0 {
|
||||||
|
return "80%"
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("%dM", targetMB)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
||||||
|
if sizeMB <= 0 {
|
||||||
|
sizeMB = 256
|
||||||
|
}
|
||||||
|
if passes <= 0 {
|
||||||
|
passes = 1
|
||||||
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
||||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||||
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
||||||
@@ -422,11 +564,9 @@ func (s *System) RunMemoryStressPack(ctx context.Context, baseDir string, durati
|
|||||||
if seconds <= 0 {
|
if seconds <= 0 {
|
||||||
seconds = envInt("BEE_VM_STRESS_SECONDS", 300)
|
seconds = envInt("BEE_VM_STRESS_SECONDS", 300)
|
||||||
}
|
}
|
||||||
// Use 80% of RAM by default; override with BEE_VM_STRESS_SIZE_MB.
|
// Base the default on current MemAvailable and keep headroom for the OS and
|
||||||
sizeArg := "80%"
|
// concurrent stressors so mixed burn runs do not trip the OOM killer.
|
||||||
if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
|
sizeArg := memoryStressSizeArg()
|
||||||
sizeArg = fmt.Sprintf("%dM", mb)
|
|
||||||
}
|
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "memory-stress", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "memory-stress", []satJob{
|
||||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||||
{name: "02-stress-ng-vm.log", cmd: []string{
|
{name: "02-stress-ng-vm.log", cmd: []string{
|
||||||
@@ -468,7 +608,7 @@ func (s *System) RunCPUAcceptancePack(ctx context.Context, baseDir string, durat
|
|||||||
}, logFunc)
|
}, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||||
if baseDir == "" {
|
if baseDir == "" {
|
||||||
baseDir = "/var/log/bee-sat"
|
baseDir = "/var/log/bee-sat"
|
||||||
}
|
}
|
||||||
@@ -500,7 +640,7 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, l
|
|||||||
break
|
break
|
||||||
}
|
}
|
||||||
prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
|
prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
|
||||||
commands := storageSATCommands(devPath)
|
commands := storageSATCommands(devPath, extended)
|
||||||
for cmdIndex, job := range commands {
|
for cmdIndex, job := range commands {
|
||||||
if ctx.Err() != nil {
|
if ctx.Err() != nil {
|
||||||
break
|
break
|
||||||
@@ -543,14 +683,24 @@ type satStats struct {
|
|||||||
Unsupported int
|
Unsupported int
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func withNvidiaPersistenceMode(jobs ...satJob) []satJob {
|
||||||
|
out := make([]satJob, 0, len(jobs)+1)
|
||||||
|
out = append(out, satJob{
|
||||||
|
name: "00-nvidia-smi-persistence-mode.log",
|
||||||
|
cmd: []string{"nvidia-smi", "-pm", "1"},
|
||||||
|
})
|
||||||
|
out = append(out, jobs...)
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
func nvidiaSATJobs() []satJob {
|
func nvidiaSATJobs() []satJob {
|
||||||
return []satJob{
|
return withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||||
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
satJob{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
||||||
{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
|
satJob{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
|
||||||
}
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
||||||
@@ -565,12 +715,12 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
|||||||
}
|
}
|
||||||
diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
|
diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
|
||||||
}
|
}
|
||||||
return []satJob{
|
return withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||||
{name: "04-dcgmi-diag.log", cmd: diagArgs},
|
satJob{name: "04-dcgmi-diag.log", cmd: diagArgs, gpuIndices: gpuIndices},
|
||||||
}
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string {
|
func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string {
|
||||||
@@ -595,7 +745,10 @@ func nvidiaVisibleDevicesEnv(gpuIndices []int) []string {
|
|||||||
if len(gpuIndices) == 0 {
|
if len(gpuIndices) == 0 {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
return []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)}
|
return []string{
|
||||||
|
"CUDA_DEVICE_ORDER=PCI_BUS_ID",
|
||||||
|
"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
|
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
|
||||||
@@ -614,11 +767,23 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
|||||||
|
|
||||||
var summary strings.Builder
|
var summary strings.Builder
|
||||||
stats := satStats{}
|
stats := satStats{}
|
||||||
|
nvidiaPack := strings.HasPrefix(prefix, "gpu-nvidia")
|
||||||
|
perGPU := map[int]*nvidiaGPUStatusFile{}
|
||||||
|
selectedGPUIndices := map[int]struct{}{}
|
||||||
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||||
for _, job := range jobs {
|
for _, job := range jobs {
|
||||||
if ctx.Err() != nil {
|
if ctx.Err() != nil {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
for _, idx := range job.gpuIndices {
|
||||||
|
selectedGPUIndices[idx] = struct{}{}
|
||||||
|
status := perGPU[idx]
|
||||||
|
if status == nil {
|
||||||
|
status = &nvidiaGPUStatusFile{Index: idx}
|
||||||
|
perGPU[idx] = status
|
||||||
|
}
|
||||||
|
status.Selected = true
|
||||||
|
}
|
||||||
cmd := make([]string, 0, len(job.cmd))
|
cmd := make([]string, 0, len(job.cmd))
|
||||||
for _, arg := range job.cmd {
|
for _, arg := range job.cmd {
|
||||||
cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
|
cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
|
||||||
@@ -627,17 +792,52 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
|||||||
var out []byte
|
var out []byte
|
||||||
var err error
|
var err error
|
||||||
|
|
||||||
if job.collectGPU {
|
if nvidiaPack && nvidiaJobNeedsHealthCheck(job) {
|
||||||
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
|
if msg, healthErr := checkNvidiaJobHealth(job.gpuIndices); healthErr != nil {
|
||||||
} else {
|
if logFunc != nil {
|
||||||
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
|
logFunc(msg)
|
||||||
|
}
|
||||||
|
out = []byte(msg + "\n")
|
||||||
|
err = healthErr
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if err == nil {
|
||||||
|
if job.collectGPU {
|
||||||
|
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
|
||||||
|
} else {
|
||||||
|
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if nvidiaPack && nvidiaJobNeedsHealthCheck(job) {
|
||||||
|
if msg, healthErr := checkNvidiaJobHealth(job.gpuIndices); healthErr != nil {
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(msg)
|
||||||
|
}
|
||||||
|
if len(out) > 0 && !bytes.HasSuffix(out, []byte("\n")) {
|
||||||
|
out = append(out, '\n')
|
||||||
|
}
|
||||||
|
out = append(out, []byte(msg+"\n")...)
|
||||||
|
if err == nil {
|
||||||
|
err = healthErr
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
||||||
return "", writeErr
|
return "", writeErr
|
||||||
}
|
}
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
return "", ctx.Err()
|
||||||
|
}
|
||||||
status, rc := classifySATResult(job.name, out, err)
|
status, rc := classifySATResult(job.name, out, err)
|
||||||
stats.Add(status)
|
stats.Add(status)
|
||||||
|
if nvidiaPack && len(job.gpuIndices) > 0 && nvidiaJobNeedsHealthCheck(job) {
|
||||||
|
for _, idx := range job.gpuIndices {
|
||||||
|
updateNvidiaGPUStatus(perGPU, idx, status, job.name, string(out))
|
||||||
|
}
|
||||||
|
}
|
||||||
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
|
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
|
||||||
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
|
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
|
||||||
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
|
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
|
||||||
@@ -646,6 +846,11 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
|||||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
if nvidiaPack {
|
||||||
|
if err := writeNvidiaGPUStatusFiles(runDir, stats.Overall(), perGPU, selectedGPUIndices); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
|
archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
|
||||||
if err := createTarGz(archive, runDir); err != nil {
|
if err := createTarGz(archive, runDir); err != nil {
|
||||||
@@ -654,6 +859,197 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
|||||||
return archive, nil
|
return archive, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func updateNvidiaGPUStatus(perGPU map[int]*nvidiaGPUStatusFile, idx int, status, jobName, detail string) {
|
||||||
|
entry := perGPU[idx]
|
||||||
|
if entry == nil {
|
||||||
|
entry = &nvidiaGPUStatusFile{Index: idx}
|
||||||
|
perGPU[idx] = entry
|
||||||
|
}
|
||||||
|
if nvidiaSATStatusSeverity(status) >= nvidiaSATStatusSeverity(entry.RunStatus) {
|
||||||
|
entry.RunStatus = status
|
||||||
|
entry.FailingJob = jobName
|
||||||
|
entry.Reason = firstLine(detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeNvidiaGPUStatusFiles(runDir, overall string, perGPU map[int]*nvidiaGPUStatusFile, selected map[int]struct{}) error {
|
||||||
|
health, err := readNvidiaGPUHealth()
|
||||||
|
if err == nil {
|
||||||
|
for _, gpu := range health {
|
||||||
|
entry := perGPU[gpu.Index]
|
||||||
|
if entry == nil {
|
||||||
|
entry = &nvidiaGPUStatusFile{Index: gpu.Index}
|
||||||
|
perGPU[gpu.Index] = entry
|
||||||
|
}
|
||||||
|
entry.Name = gpu.Name
|
||||||
|
entry.Observed = true
|
||||||
|
entry.HealthRaw = gpu.RawLine
|
||||||
|
if gpu.NeedsReset {
|
||||||
|
entry.Health = "RESET_REQUIRED"
|
||||||
|
if entry.RunStatus == "" || nvidiaSATStatusSeverity("FAILED") >= nvidiaSATStatusSeverity(entry.RunStatus) {
|
||||||
|
entry.RunStatus = "FAILED"
|
||||||
|
if strings.TrimSpace(entry.Reason) == "" {
|
||||||
|
entry.Reason = "GPU requires reset"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
entry.Health = "OK"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for idx := range selected {
|
||||||
|
entry := perGPU[idx]
|
||||||
|
if entry == nil {
|
||||||
|
entry = &nvidiaGPUStatusFile{Index: idx}
|
||||||
|
perGPU[idx] = entry
|
||||||
|
}
|
||||||
|
entry.Selected = true
|
||||||
|
}
|
||||||
|
var indices []int
|
||||||
|
for idx := range perGPU {
|
||||||
|
indices = append(indices, idx)
|
||||||
|
}
|
||||||
|
sort.Ints(indices)
|
||||||
|
for _, idx := range indices {
|
||||||
|
entry := perGPU[idx]
|
||||||
|
if entry.RunStatus == "" {
|
||||||
|
entry.RunStatus = overall
|
||||||
|
}
|
||||||
|
if entry.Health == "" {
|
||||||
|
entry.Health = "UNKNOWN"
|
||||||
|
}
|
||||||
|
if entry.Name == "" {
|
||||||
|
entry.Name = "unknown"
|
||||||
|
}
|
||||||
|
var body strings.Builder
|
||||||
|
fmt.Fprintf(&body, "gpu_index=%d\n", entry.Index)
|
||||||
|
fmt.Fprintf(&body, "gpu_name=%s\n", entry.Name)
|
||||||
|
fmt.Fprintf(&body, "selected=%t\n", entry.Selected)
|
||||||
|
fmt.Fprintf(&body, "observed=%t\n", entry.Observed)
|
||||||
|
fmt.Fprintf(&body, "run_status=%s\n", entry.RunStatus)
|
||||||
|
fmt.Fprintf(&body, "health_status=%s\n", entry.Health)
|
||||||
|
if strings.TrimSpace(entry.FailingJob) != "" {
|
||||||
|
fmt.Fprintf(&body, "failing_job=%s\n", entry.FailingJob)
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(entry.Reason) != "" {
|
||||||
|
fmt.Fprintf(&body, "reason=%s\n", entry.Reason)
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(entry.HealthRaw) != "" {
|
||||||
|
fmt.Fprintf(&body, "health_raw=%s\n", entry.HealthRaw)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-status.txt", idx)), []byte(body.String()), 0644); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// nvidiaSATStatusSeverity ranks a SAT status string so statuses can be
// compared. Matching ignores surrounding whitespace and letter case:
// FAILED is 3, PARTIAL and UNSUPPORTED are 2, OK is 1, and anything else
// (including the empty string) is 0.
func nvidiaSATStatusSeverity(status string) int {
	normalized := strings.ToUpper(strings.TrimSpace(status))
	if normalized == "FAILED" {
		return 3
	}
	if normalized == "PARTIAL" || normalized == "UNSUPPORTED" {
		return 2
	}
	if normalized == "OK" {
		return 1
	}
	return 0
}
|
||||||
|
|
||||||
|
// firstLine returns the first line of s with surrounding whitespace removed.
// Leading and trailing whitespace of the whole input (including blank lines)
// is dropped before the first line is taken; an all-whitespace or empty input
// yields "".
func firstLine(s string) string {
	trimmed := strings.TrimSpace(s)
	// SplitN always returns at least one element, so indexing [0] is safe.
	head := strings.SplitN(trimmed, "\n", 2)[0]
	return strings.TrimSpace(head)
}
|
||||||
|
|
||||||
|
func nvidiaJobNeedsHealthCheck(job satJob) bool {
|
||||||
|
if job.collectGPU {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
name := strings.ToLower(strings.TrimSpace(job.name))
|
||||||
|
return strings.Contains(name, "dcgmi") ||
|
||||||
|
strings.Contains(name, "gpu-burn") ||
|
||||||
|
strings.Contains(name, "gpu-stress") ||
|
||||||
|
strings.Contains(name, "dcgmproftester")
|
||||||
|
}
|
||||||
|
|
||||||
|
func checkNvidiaJobHealth(selected []int) (string, error) {
|
||||||
|
health, err := readNvidiaGPUHealth()
|
||||||
|
if err != nil {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
var bad []nvidiaGPUHealth
|
||||||
|
selectedSet := make(map[int]struct{}, len(selected))
|
||||||
|
for _, idx := range selected {
|
||||||
|
selectedSet[idx] = struct{}{}
|
||||||
|
}
|
||||||
|
for _, gpu := range health {
|
||||||
|
if len(selectedSet) > 0 {
|
||||||
|
if _, ok := selectedSet[gpu.Index]; !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if gpu.NeedsReset {
|
||||||
|
bad = append(bad, gpu)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(bad) == 0 {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
lines := make([]string, 0, len(bad)+1)
|
||||||
|
lines = append(lines, "NVIDIA GPU health check failed:")
|
||||||
|
for _, gpu := range bad {
|
||||||
|
lines = append(lines, fmt.Sprintf("gpu %d (%s) requires reset: %s", gpu.Index, gpu.Name, gpu.RawLine))
|
||||||
|
}
|
||||||
|
return strings.Join(lines, "\n"), errors.New("nvidia gpu requires reset")
|
||||||
|
}
|
||||||
|
|
||||||
|
func readNvidiaGPUHealth() ([]nvidiaGPUHealth, error) {
|
||||||
|
out, err := satExecCommand(
|
||||||
|
"nvidia-smi",
|
||||||
|
"--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total",
|
||||||
|
"--format=csv,noheader,nounits",
|
||||||
|
).Output()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
||||||
|
}
|
||||||
|
return parseNvidiaGPUHealth(string(out)), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseNvidiaGPUHealth(raw string) []nvidiaGPUHealth {
|
||||||
|
var gpus []nvidiaGPUHealth
|
||||||
|
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if line == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
parts := strings.Split(line, ",")
|
||||||
|
if len(parts) < 2 {
|
||||||
|
gpus = append(gpus, nvidiaGPUHealth{RawLine: line, ParseFailure: true})
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||||
|
if err != nil {
|
||||||
|
gpus = append(gpus, nvidiaGPUHealth{RawLine: line, ParseFailure: true})
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
upper := strings.ToUpper(line)
|
||||||
|
gpus = append(gpus, nvidiaGPUHealth{
|
||||||
|
Index: idx,
|
||||||
|
Name: strings.TrimSpace(parts[1]),
|
||||||
|
NeedsReset: strings.Contains(upper, "GPU REQUIRES RESET"),
|
||||||
|
RawLine: line,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return gpus
|
||||||
|
}
|
||||||
|
|
||||||
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string, logFunc func(string)) ([]byte, error) {
|
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string, logFunc func(string)) ([]byte, error) {
|
||||||
start := time.Now().UTC()
|
start := time.Now().UTC()
|
||||||
resolvedCmd, err := resolveSATCommand(cmd)
|
resolvedCmd, err := resolveSATCommand(cmd)
|
||||||
@@ -708,17 +1104,25 @@ func listStorageDevices() ([]string, error) {
|
|||||||
return parseStorageDevices(string(out)), nil
|
return parseStorageDevices(string(out)), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func storageSATCommands(devPath string) []satJob {
|
func storageSATCommands(devPath string, extended bool) []satJob {
|
||||||
if strings.Contains(filepath.Base(devPath), "nvme") {
|
if strings.Contains(filepath.Base(devPath), "nvme") {
|
||||||
|
selfTestLevel := "1"
|
||||||
|
if extended {
|
||||||
|
selfTestLevel = "2"
|
||||||
|
}
|
||||||
return []satJob{
|
return []satJob{
|
||||||
{name: "nvme-id-ctrl", cmd: []string{"nvme", "id-ctrl", devPath, "-o", "json"}},
|
{name: "nvme-id-ctrl", cmd: []string{"nvme", "id-ctrl", devPath, "-o", "json"}},
|
||||||
{name: "nvme-smart-log", cmd: []string{"nvme", "smart-log", devPath, "-o", "json"}},
|
{name: "nvme-smart-log", cmd: []string{"nvme", "smart-log", devPath, "-o", "json"}},
|
||||||
{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", "1", "--wait"}},
|
{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", selfTestLevel, "--wait"}},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
smartTestType := "short"
|
||||||
|
if extended {
|
||||||
|
smartTestType = "long"
|
||||||
|
}
|
||||||
return []satJob{
|
return []satJob{
|
||||||
{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}},
|
{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}},
|
||||||
{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", "short", devPath}},
|
{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", smartTestType, devPath}},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -777,6 +1181,11 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
|
|||||||
// nvidia-smi on a machine with no NVIDIA GPU
|
// nvidia-smi on a machine with no NVIDIA GPU
|
||||||
strings.Contains(text, "couldn't communicate with the nvidia driver") ||
|
strings.Contains(text, "couldn't communicate with the nvidia driver") ||
|
||||||
strings.Contains(text, "no nvidia gpu") ||
|
strings.Contains(text, "no nvidia gpu") ||
|
||||||
|
// Some NVMe firmwares start self-test but never expose progress to nvme-cli
|
||||||
|
// while waiting, so the CLI stops polling without proving device failure.
|
||||||
|
(strings.Contains(name, "self-test") &&
|
||||||
|
strings.Contains(text, "no progress for") &&
|
||||||
|
strings.Contains(text, "stop waiting")) ||
|
||||||
(strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) {
|
(strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) {
|
||||||
return "UNSUPPORTED", rc
|
return "UNSUPPORTED", rc
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ type FanStressOptions struct {
|
|||||||
Phase1DurSec int // first load phase duration in seconds (default 300)
|
Phase1DurSec int // first load phase duration in seconds (default 300)
|
||||||
PauseSec int // pause between the two load phases (default 60)
|
PauseSec int // pause between the two load phases (default 60)
|
||||||
Phase2DurSec int // second load phase duration in seconds (default 300)
|
Phase2DurSec int // second load phase duration in seconds (default 300)
|
||||||
SizeMB int // GPU memory to allocate per GPU during stress (default 64)
|
SizeMB int // GPU memory to allocate per GPU during stress (0 = auto: 95% of VRAM)
|
||||||
GPUIndices []int // which GPU indices to stress (empty = all detected)
|
GPUIndices []int // which GPU indices to stress (empty = all detected)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -243,9 +243,8 @@ func applyFanStressDefaults(opts *FanStressOptions) {
|
|||||||
if opts.Phase2DurSec <= 0 {
|
if opts.Phase2DurSec <= 0 {
|
||||||
opts.Phase2DurSec = 300
|
opts.Phase2DurSec = 300
|
||||||
}
|
}
|
||||||
if opts.SizeMB <= 0 {
|
// SizeMB == 0 means "auto" (worker picks 95% of GPU VRAM for maximum power draw).
|
||||||
opts.SizeMB = 64
|
// Leave at 0 to avoid passing a too-small size that starves the tensor-core path.
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// sampleFanStressRow collects all metrics for one telemetry sample.
|
// sampleFanStressRow collects all metrics for one telemetry sample.
|
||||||
|
|||||||
@@ -1,23 +1,25 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
"errors"
|
"errors"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestStorageSATCommands(t *testing.T) {
|
func TestStorageSATCommands(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
nvme := storageSATCommands("/dev/nvme0n1")
|
nvme := storageSATCommands("/dev/nvme0n1", false)
|
||||||
if len(nvme) != 3 || nvme[2].cmd[0] != "nvme" {
|
if len(nvme) != 3 || nvme[2].cmd[0] != "nvme" {
|
||||||
t.Fatalf("unexpected nvme commands: %#v", nvme)
|
t.Fatalf("unexpected nvme commands: %#v", nvme)
|
||||||
}
|
}
|
||||||
|
|
||||||
sata := storageSATCommands("/dev/sda")
|
sata := storageSATCommands("/dev/sda", false)
|
||||||
if len(sata) != 2 || sata[0].cmd[0] != "smartctl" {
|
if len(sata) != 2 || sata[0].cmd[0] != "smartctl" {
|
||||||
t.Fatalf("unexpected sata commands: %#v", sata)
|
t.Fatalf("unexpected sata commands: %#v", sata)
|
||||||
}
|
}
|
||||||
@@ -28,13 +30,19 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
|
|||||||
|
|
||||||
jobs := nvidiaSATJobs()
|
jobs := nvidiaSATJobs()
|
||||||
|
|
||||||
if len(jobs) != 5 {
|
if len(jobs) != 6 {
|
||||||
t.Fatalf("jobs=%d want 5", len(jobs))
|
t.Fatalf("jobs=%d want 6", len(jobs))
|
||||||
}
|
}
|
||||||
if got := jobs[4].cmd[0]; got != "bee-gpu-burn" {
|
if got := jobs[0].cmd[0]; got != "nvidia-smi" {
|
||||||
|
t.Fatalf("preflight command=%q want nvidia-smi", got)
|
||||||
|
}
|
||||||
|
if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
|
||||||
|
t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
|
||||||
|
}
|
||||||
|
if got := jobs[5].cmd[0]; got != "bee-gpu-burn" {
|
||||||
t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
|
t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
|
||||||
}
|
}
|
||||||
if got := jobs[3].cmd[1]; got != "--output-file" {
|
if got := jobs[4].cmd[1]; got != "--output-file" {
|
||||||
t.Fatalf("bug report flag=%q want --output-file", got)
|
t.Fatalf("bug report flag=%q want --output-file", got)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -82,7 +90,7 @@ func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
|
|||||||
|
|
||||||
func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
||||||
jobs := nvidiaSATJobs()
|
jobs := nvidiaSATJobs()
|
||||||
got := jobs[4].cmd
|
got := jobs[5].cmd
|
||||||
want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
|
want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
|
||||||
if len(got) != len(want) {
|
if len(got) != len(want) {
|
||||||
t.Fatalf("cmd len=%d want %d", len(got), len(want))
|
t.Fatalf("cmd len=%d want %d", len(got), len(want))
|
||||||
@@ -94,6 +102,19 @@ func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestNvidiaDCGMJobsEnablePersistenceModeBeforeDiag(t *testing.T) {
|
||||||
|
jobs := nvidiaDCGMJobs(3, []int{2, 0})
|
||||||
|
if len(jobs) != 5 {
|
||||||
|
t.Fatalf("jobs=%d want 5", len(jobs))
|
||||||
|
}
|
||||||
|
if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
|
||||||
|
t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
|
||||||
|
}
|
||||||
|
if got := strings.Join(jobs[4].cmd, " "); got != "dcgmi diag -r 3 -i 2,0" {
|
||||||
|
t.Fatalf("diag=%q want %q", got, "dcgmi diag -r 3 -i 2,0")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
|
func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
@@ -195,6 +216,74 @@ func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParseNvidiaGPUHealthDetectsResetRequired(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
got := parseNvidiaGPUHealth("0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n")
|
||||||
|
if len(got) != 2 {
|
||||||
|
t.Fatalf("len=%d want 2", len(got))
|
||||||
|
}
|
||||||
|
if got[0].NeedsReset {
|
||||||
|
t.Fatalf("gpu0 unexpectedly marked reset-required")
|
||||||
|
}
|
||||||
|
if !got[1].NeedsReset {
|
||||||
|
t.Fatalf("gpu1 should be marked reset-required: %#v", got[1])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCheckNvidiaJobHealthReturnsErrorForSelectedResetRequiredGPU(t *testing.T) {
|
||||||
|
oldExecCommand := satExecCommand
|
||||||
|
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||||
|
if name == "nvidia-smi" {
|
||||||
|
return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'")
|
||||||
|
}
|
||||||
|
return exec.Command(name, args...)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||||
|
|
||||||
|
msg, err := checkNvidiaJobHealth([]int{1})
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected health check error")
|
||||||
|
}
|
||||||
|
if !strings.Contains(msg, "gpu 1") || !strings.Contains(strings.ToLower(msg), "requires reset") {
|
||||||
|
t.Fatalf("unexpected message: %q", msg)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestWriteNvidiaGPUStatusFilesCreatesPerGPUFiles(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
oldExecCommand := satExecCommand
|
||||||
|
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||||
|
if name == "nvidia-smi" {
|
||||||
|
return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'")
|
||||||
|
}
|
||||||
|
return exec.Command(name, args...)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||||
|
|
||||||
|
perGPU := map[int]*nvidiaGPUStatusFile{
|
||||||
|
0: {Index: 0, RunStatus: "OK"},
|
||||||
|
1: {Index: 1, RunStatus: "FAILED", FailingJob: "02-dcgmi-targeted-stress.log", Reason: "NVIDIA GPU health check failed:"},
|
||||||
|
}
|
||||||
|
if err := writeNvidiaGPUStatusFiles(dir, "FAILED", perGPU, map[int]struct{}{0: {}, 1: {}}); err != nil {
|
||||||
|
t.Fatalf("writeNvidiaGPUStatusFiles error: %v", err)
|
||||||
|
}
|
||||||
|
raw, err := os.ReadFile(filepath.Join(dir, "gpu-1-status.txt"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ReadFile gpu-1-status.txt: %v", err)
|
||||||
|
}
|
||||||
|
text := string(raw)
|
||||||
|
if !strings.Contains(text, "run_status=FAILED") {
|
||||||
|
t.Fatalf("missing run status:\n%s", text)
|
||||||
|
}
|
||||||
|
if !strings.Contains(text, "health_status=RESET_REQUIRED") {
|
||||||
|
t.Fatalf("missing health status:\n%s", text)
|
||||||
|
}
|
||||||
|
if !strings.Contains(text, "failing_job=02-dcgmi-targeted-stress.log") {
|
||||||
|
t.Fatalf("missing failing job:\n%s", text)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestResolveDCGMProfTesterCommandUsesVersionedBinary(t *testing.T) {
|
func TestResolveDCGMProfTesterCommandUsesVersionedBinary(t *testing.T) {
|
||||||
oldLookPath := satLookPath
|
oldLookPath := satLookPath
|
||||||
satLookPath = func(file string) (string, error) {
|
satLookPath = func(file string) (string, error) {
|
||||||
@@ -234,11 +323,14 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
|
|||||||
|
|
||||||
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
|
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
|
||||||
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
|
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
|
||||||
if len(env) != 1 {
|
if len(env) != 2 {
|
||||||
t.Fatalf("env len=%d want 1 (%v)", len(env), env)
|
t.Fatalf("env len=%d want 2 (%v)", len(env), env)
|
||||||
}
|
}
|
||||||
if env[0] != "CUDA_VISIBLE_DEVICES=0,2,4" {
|
if env[0] != "CUDA_DEVICE_ORDER=PCI_BUS_ID" {
|
||||||
t.Fatalf("env[0]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[0])
|
t.Fatalf("env[0]=%q want CUDA_DEVICE_ORDER=PCI_BUS_ID", env[0])
|
||||||
|
}
|
||||||
|
if env[1] != "CUDA_VISIBLE_DEVICES=0,2,4" {
|
||||||
|
t.Fatalf("env[1]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[1])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -276,6 +368,37 @@ func TestEnvIntFallback(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestMemoryStressSizeArgUsesAvailableMemory(t *testing.T) {
|
||||||
|
oldFreeMemBytes := satFreeMemBytes
|
||||||
|
satFreeMemBytes = func() int64 { return 96 * 1024 * 1024 * 1024 }
|
||||||
|
t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
|
||||||
|
|
||||||
|
if got := memoryStressSizeArg(); got != "65536M" {
|
||||||
|
t.Fatalf("sizeArg=%q want 65536M", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMemoryStressSizeArgRespectsOverride(t *testing.T) {
|
||||||
|
oldFreeMemBytes := satFreeMemBytes
|
||||||
|
satFreeMemBytes = func() int64 { return 96 * 1024 * 1024 * 1024 }
|
||||||
|
t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
|
||||||
|
t.Setenv("BEE_VM_STRESS_SIZE_MB", "4096")
|
||||||
|
|
||||||
|
if got := memoryStressSizeArg(); got != "4096M" {
|
||||||
|
t.Fatalf("sizeArg=%q want 4096M", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMemoryStressSizeArgFallsBackWhenFreeMemoryUnknown(t *testing.T) {
|
||||||
|
oldFreeMemBytes := satFreeMemBytes
|
||||||
|
satFreeMemBytes = func() int64 { return 0 }
|
||||||
|
t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
|
||||||
|
|
||||||
|
if got := memoryStressSizeArg(); got != "80%" {
|
||||||
|
t.Fatalf("sizeArg=%q want 80%%", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestClassifySATResult(t *testing.T) {
|
func TestClassifySATResult(t *testing.T) {
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
name string
|
name string
|
||||||
@@ -286,6 +409,7 @@ func TestClassifySATResult(t *testing.T) {
|
|||||||
}{
|
}{
|
||||||
{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
|
{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
|
||||||
{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||||
|
{name: "nvme wait timeout without progress", job: "nvme-device-self-test", out: "Short Device self-test started\nWaiting for self test completion...\nno progress for 78 seconds, stop waiting", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||||
{name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
{name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
||||||
{name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
{name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||||
}
|
}
|
||||||
@@ -300,6 +424,38 @@ func TestClassifySATResult(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRunAcceptancePackCtxReturnsContextErrorWithoutArchive(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
t.Cleanup(cancel)
|
||||||
|
|
||||||
|
done := make(chan struct{})
|
||||||
|
go func() {
|
||||||
|
time.Sleep(100 * time.Millisecond)
|
||||||
|
cancel()
|
||||||
|
close(done)
|
||||||
|
}()
|
||||||
|
|
||||||
|
archive, err := runAcceptancePackCtx(ctx, dir, "cancelled-pack", []satJob{
|
||||||
|
{name: "01-sleep.log", cmd: []string{"sh", "-c", "sleep 5"}},
|
||||||
|
}, nil)
|
||||||
|
<-done
|
||||||
|
|
||||||
|
if !errors.Is(err, context.Canceled) {
|
||||||
|
t.Fatalf("err=%v want context.Canceled", err)
|
||||||
|
}
|
||||||
|
if archive != "" {
|
||||||
|
t.Fatalf("archive=%q want empty", archive)
|
||||||
|
}
|
||||||
|
matches, globErr := filepath.Glob(filepath.Join(dir, "cancelled-pack-*.tar.gz"))
|
||||||
|
if globErr != nil {
|
||||||
|
t.Fatalf("Glob error: %v", globErr)
|
||||||
|
}
|
||||||
|
if len(matches) != 0 {
|
||||||
|
t.Fatalf("archives=%v want none", matches)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
|
func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
|||||||
@@ -61,7 +61,9 @@ func (s *System) ServiceState(name string) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
||||||
raw, err := exec.Command("systemctl", string(action), name).CombinedOutput()
|
// bee-web runs as the bee user; sudo is required to control system services.
|
||||||
|
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
||||||
|
raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
|
||||||
return string(raw), err
|
return string(raw), err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -70,6 +70,7 @@ type NvidiaStressOptions struct {
|
|||||||
Loader string
|
Loader string
|
||||||
GPUIndices []int
|
GPUIndices []int
|
||||||
ExcludeGPUIndices []int
|
ExcludeGPUIndices []int
|
||||||
|
StaggerSeconds int
|
||||||
}
|
}
|
||||||
|
|
||||||
func New() *System {
|
func New() *System {
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ type RuntimeHealth struct {
|
|||||||
ExportDir string `json:"export_dir,omitempty"`
|
ExportDir string `json:"export_dir,omitempty"`
|
||||||
DriverReady bool `json:"driver_ready,omitempty"`
|
DriverReady bool `json:"driver_ready,omitempty"`
|
||||||
CUDAReady bool `json:"cuda_ready,omitempty"`
|
CUDAReady bool `json:"cuda_ready,omitempty"`
|
||||||
|
NvidiaGSPMode string `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
|
||||||
NetworkStatus string `json:"network_status,omitempty"`
|
NetworkStatus string `json:"network_status,omitempty"`
|
||||||
Issues []RuntimeIssue `json:"issues,omitempty"`
|
Issues []RuntimeIssue `json:"issues,omitempty"`
|
||||||
Tools []RuntimeToolStatus `json:"tools,omitempty"`
|
Tools []RuntimeToolStatus `json:"tools,omitempty"`
|
||||||
@@ -182,6 +183,13 @@ type HardwarePCIeDevice struct {
|
|||||||
BatteryTemperatureC *float64 `json:"battery_temperature_c,omitempty"`
|
BatteryTemperatureC *float64 `json:"battery_temperature_c,omitempty"`
|
||||||
BatteryVoltageV *float64 `json:"battery_voltage_v,omitempty"`
|
BatteryVoltageV *float64 `json:"battery_voltage_v,omitempty"`
|
||||||
BatteryReplaceRequired *bool `json:"battery_replace_required,omitempty"`
|
BatteryReplaceRequired *bool `json:"battery_replace_required,omitempty"`
|
||||||
|
SFPPresent *bool `json:"sfp_present,omitempty"`
|
||||||
|
SFPIdentifier *string `json:"sfp_identifier,omitempty"`
|
||||||
|
SFPConnector *string `json:"sfp_connector,omitempty"`
|
||||||
|
SFPVendor *string `json:"sfp_vendor,omitempty"`
|
||||||
|
SFPPartNumber *string `json:"sfp_part_number,omitempty"`
|
||||||
|
SFPSerialNumber *string `json:"sfp_serial_number,omitempty"`
|
||||||
|
SFPWavelengthNM *float64 `json:"sfp_wavelength_nm,omitempty"`
|
||||||
SFPTemperatureC *float64 `json:"sfp_temperature_c,omitempty"`
|
SFPTemperatureC *float64 `json:"sfp_temperature_c,omitempty"`
|
||||||
SFPTXPowerDBM *float64 `json:"sfp_tx_power_dbm,omitempty"`
|
SFPTXPowerDBM *float64 `json:"sfp_tx_power_dbm,omitempty"`
|
||||||
SFPRXPowerDBM *float64 `json:"sfp_rx_power_dbm,omitempty"`
|
SFPRXPowerDBM *float64 `json:"sfp_rx_power_dbm,omitempty"`
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ import (
|
|||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"regexp"
|
"regexp"
|
||||||
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
"syscall"
|
"syscall"
|
||||||
@@ -21,13 +22,305 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var ansiEscapeRE = regexp.MustCompile(`\x1b\[[0-9;]*[a-zA-Z]|\x1b[()][A-Z0-9]|\x1b[DABC]`)
|
var ansiEscapeRE = regexp.MustCompile(`\x1b\[[0-9;]*[a-zA-Z]|\x1b[()][A-Z0-9]|\x1b[DABC]`)
|
||||||
|
var apiListNvidiaGPUs = func(a *app.App) ([]platform.NvidiaGPU, error) {
|
||||||
|
if a == nil {
|
||||||
|
return nil, fmt.Errorf("app not configured")
|
||||||
|
}
|
||||||
|
return a.ListNvidiaGPUs()
|
||||||
|
}
|
||||||
|
var apiListNvidiaGPUStatuses = func(a *app.App) ([]platform.NvidiaGPUStatus, error) {
|
||||||
|
if a == nil {
|
||||||
|
return nil, fmt.Errorf("app not configured")
|
||||||
|
}
|
||||||
|
return a.ListNvidiaGPUStatuses()
|
||||||
|
}
|
||||||
|
|
||||||
// ── Job ID counter ────────────────────────────────────────────────────────────
|
// ── Job ID counter ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
var jobCounter atomic.Uint64
|
var jobCounter atomic.Uint64
|
||||||
|
|
||||||
func newJobID(prefix string) string {
|
func newJobID(_ string) string {
|
||||||
return fmt.Sprintf("%s-%d", prefix, jobCounter.Add(1))
|
start := int((jobCounter.Add(1) - 1) % 1000)
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
for offset := 0; offset < 1000; offset++ {
|
||||||
|
n := (start + offset) % 1000
|
||||||
|
id := fmt.Sprintf("TASK-%03d", n)
|
||||||
|
if !taskIDInUseLocked(id) {
|
||||||
|
return id
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("TASK-%03d", start)
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskIDInUseLocked(id string) bool {
|
||||||
|
for _, t := range globalQueue.tasks {
|
||||||
|
if t != nil && t.ID == id {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// taskRunResponse is the JSON body returned after one or more tasks have been
// created. Each identifier is exposed under both a "task" and a "job" key;
// all fields are omitted from the JSON when empty.
type taskRunResponse struct {
	// TaskID and JobID both carry the ID of the first created task.
	TaskID string `json:"task_id,omitempty"`
	JobID  string `json:"job_id,omitempty"`
	// TaskIDs and JobIDs both carry the IDs of all created tasks.
	TaskIDs []string `json:"task_ids,omitempty"`
	JobIDs  []string `json:"job_ids,omitempty"`
	// TaskCount is the number of IDs reported.
	TaskCount int `json:"task_count,omitempty"`
}
|
||||||
|
|
||||||
|
// nvidiaTaskSelection describes the GPU subset assigned to one task together
// with a human-readable label for that subset.
type nvidiaTaskSelection struct {
	// GPUIndices lists the GPU indices covered by the task.
	GPUIndices []int
	// Label is the text appended to the task's display name.
	Label string
}
|
||||||
|
|
||||||
|
func writeTaskRunResponse(w http.ResponseWriter, tasks []*Task) {
|
||||||
|
if len(tasks) == 0 {
|
||||||
|
writeJSON(w, taskRunResponse{})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
ids := make([]string, 0, len(tasks))
|
||||||
|
for _, t := range tasks {
|
||||||
|
if t == nil || strings.TrimSpace(t.ID) == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
ids = append(ids, t.ID)
|
||||||
|
}
|
||||||
|
resp := taskRunResponse{TaskCount: len(ids)}
|
||||||
|
if len(ids) > 0 {
|
||||||
|
resp.TaskID = ids[0]
|
||||||
|
resp.JobID = ids[0]
|
||||||
|
resp.TaskIDs = ids
|
||||||
|
resp.JobIDs = ids
|
||||||
|
}
|
||||||
|
writeJSON(w, resp)
|
||||||
|
}
|
||||||
|
|
||||||
|
// shouldSplitHomogeneousNvidiaTarget reports whether the target (compared after
// trimming surrounding whitespace) is one of the NVIDIA targets that are
// expanded into separate tasks per GPU selection.
func shouldSplitHomogeneousNvidiaTarget(target string) bool {
	splittable := [...]string{
		"nvidia",
		"nvidia-targeted-stress",
		"nvidia-benchmark",
		"nvidia-compute",
		"nvidia-targeted-power",
		"nvidia-pulse",
		"nvidia-interconnect",
		"nvidia-bandwidth",
		"nvidia-stress",
	}
	name := strings.TrimSpace(target)
	for _, candidate := range splittable {
		if name == candidate {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
|
func expandHomogeneousNvidiaSelections(gpus []platform.NvidiaGPU, include, exclude []int) ([]nvidiaTaskSelection, error) {
|
||||||
|
if len(gpus) == 0 {
|
||||||
|
return nil, fmt.Errorf("no NVIDIA GPUs detected")
|
||||||
|
}
|
||||||
|
indexed := make(map[int]platform.NvidiaGPU, len(gpus))
|
||||||
|
allIndices := make([]int, 0, len(gpus))
|
||||||
|
for _, gpu := range gpus {
|
||||||
|
indexed[gpu.Index] = gpu
|
||||||
|
allIndices = append(allIndices, gpu.Index)
|
||||||
|
}
|
||||||
|
sort.Ints(allIndices)
|
||||||
|
|
||||||
|
selected := allIndices
|
||||||
|
if len(include) > 0 {
|
||||||
|
selected = make([]int, 0, len(include))
|
||||||
|
seen := make(map[int]struct{}, len(include))
|
||||||
|
for _, idx := range include {
|
||||||
|
if _, ok := indexed[idx]; !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, dup := seen[idx]; dup {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[idx] = struct{}{}
|
||||||
|
selected = append(selected, idx)
|
||||||
|
}
|
||||||
|
sort.Ints(selected)
|
||||||
|
}
|
||||||
|
if len(exclude) > 0 {
|
||||||
|
skip := make(map[int]struct{}, len(exclude))
|
||||||
|
for _, idx := range exclude {
|
||||||
|
skip[idx] = struct{}{}
|
||||||
|
}
|
||||||
|
filtered := selected[:0]
|
||||||
|
for _, idx := range selected {
|
||||||
|
if _, ok := skip[idx]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
filtered = append(filtered, idx)
|
||||||
|
}
|
||||||
|
selected = filtered
|
||||||
|
}
|
||||||
|
if len(selected) == 0 {
|
||||||
|
return nil, fmt.Errorf("no NVIDIA GPUs selected")
|
||||||
|
}
|
||||||
|
|
||||||
|
modelGroups := make(map[string][]platform.NvidiaGPU)
|
||||||
|
modelOrder := make([]string, 0)
|
||||||
|
for _, idx := range selected {
|
||||||
|
gpu := indexed[idx]
|
||||||
|
model := strings.TrimSpace(gpu.Name)
|
||||||
|
if model == "" {
|
||||||
|
model = fmt.Sprintf("GPU %d", gpu.Index)
|
||||||
|
}
|
||||||
|
if _, ok := modelGroups[model]; !ok {
|
||||||
|
modelOrder = append(modelOrder, model)
|
||||||
|
}
|
||||||
|
modelGroups[model] = append(modelGroups[model], gpu)
|
||||||
|
}
|
||||||
|
sort.Slice(modelOrder, func(i, j int) bool {
|
||||||
|
left := modelGroups[modelOrder[i]]
|
||||||
|
right := modelGroups[modelOrder[j]]
|
||||||
|
if len(left) == 0 || len(right) == 0 {
|
||||||
|
return modelOrder[i] < modelOrder[j]
|
||||||
|
}
|
||||||
|
return left[0].Index < right[0].Index
|
||||||
|
})
|
||||||
|
|
||||||
|
var groups []nvidiaTaskSelection
|
||||||
|
var singles []nvidiaTaskSelection
|
||||||
|
for _, model := range modelOrder {
|
||||||
|
group := modelGroups[model]
|
||||||
|
sort.Slice(group, func(i, j int) bool { return group[i].Index < group[j].Index })
|
||||||
|
indices := make([]int, 0, len(group))
|
||||||
|
for _, gpu := range group {
|
||||||
|
indices = append(indices, gpu.Index)
|
||||||
|
}
|
||||||
|
if len(indices) >= 2 {
|
||||||
|
groups = append(groups, nvidiaTaskSelection{
|
||||||
|
GPUIndices: indices,
|
||||||
|
Label: fmt.Sprintf("%s; GPUs %s", model, joinTaskIndices(indices)),
|
||||||
|
})
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
gpu := group[0]
|
||||||
|
singles = append(singles, nvidiaTaskSelection{
|
||||||
|
GPUIndices: []int{gpu.Index},
|
||||||
|
Label: fmt.Sprintf("GPU %d — %s", gpu.Index, model),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return append(groups, singles...), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// joinTaskIndices renders the indices in decimal, separated by commas and
// without spaces (for example "0,2,4"). An empty or nil slice gives "".
func joinTaskIndices(indices []int) string {
	var b strings.Builder
	for pos, idx := range indices {
		if pos > 0 {
			b.WriteString(",")
		}
		fmt.Fprintf(&b, "%d", idx)
	}
	return b.String()
}
|
||||||
|
|
||||||
|
// formatSplitTaskName combines a base task name with a selection label as
// "<base> (<label>)". Both inputs are trimmed first; if either is blank the
// other one is returned on its own.
func formatSplitTaskName(baseName, selectionLabel string) string {
	base := strings.TrimSpace(baseName)
	label := strings.TrimSpace(selectionLabel)
	switch {
	case base == "":
		return label
	case label == "":
		return base
	default:
		return fmt.Sprintf("%s (%s)", base, label)
	}
}
|
||||||
|
|
||||||
|
func buildNvidiaTaskSet(target string, priority int, createdAt time.Time, params taskParams, baseName string, appRef *app.App, idPrefix string) ([]*Task, error) {
|
||||||
|
if !shouldSplitHomogeneousNvidiaTarget(target) || params.ParallelGPUs {
|
||||||
|
// Parallel mode (or non-splittable target): one task for all selected GPUs.
|
||||||
|
if params.ParallelGPUs && shouldSplitHomogeneousNvidiaTarget(target) {
|
||||||
|
// Resolve the selected GPU indices so ExcludeGPUIndices is applied.
|
||||||
|
gpus, err := apiListNvidiaGPUs(appRef)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
resolved, err := expandSelectedGPUIndices(gpus, params.GPUIndices, params.ExcludeGPUIndices)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
params.GPUIndices = resolved
|
||||||
|
params.ExcludeGPUIndices = nil
|
||||||
|
}
|
||||||
|
t := &Task{
|
||||||
|
ID: newJobID(idPrefix),
|
||||||
|
Name: baseName,
|
||||||
|
Target: target,
|
||||||
|
Priority: priority,
|
||||||
|
Status: TaskPending,
|
||||||
|
CreatedAt: createdAt,
|
||||||
|
params: params,
|
||||||
|
}
|
||||||
|
return []*Task{t}, nil
|
||||||
|
}
|
||||||
|
gpus, err := apiListNvidiaGPUs(appRef)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
selections, err := expandHomogeneousNvidiaSelections(gpus, params.GPUIndices, params.ExcludeGPUIndices)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
tasks := make([]*Task, 0, len(selections))
|
||||||
|
for _, selection := range selections {
|
||||||
|
taskParamsCopy := params
|
||||||
|
taskParamsCopy.GPUIndices = append([]int(nil), selection.GPUIndices...)
|
||||||
|
taskParamsCopy.ExcludeGPUIndices = nil
|
||||||
|
displayName := formatSplitTaskName(baseName, selection.Label)
|
||||||
|
taskParamsCopy.DisplayName = displayName
|
||||||
|
tasks = append(tasks, &Task{
|
||||||
|
ID: newJobID(idPrefix),
|
||||||
|
Name: displayName,
|
||||||
|
Target: target,
|
||||||
|
Priority: priority,
|
||||||
|
Status: TaskPending,
|
||||||
|
CreatedAt: createdAt,
|
||||||
|
params: taskParamsCopy,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return tasks, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// expandSelectedGPUIndices returns the sorted list of selected GPU indices after
|
||||||
|
// applying include/exclude filters, without splitting by model.
|
||||||
|
func expandSelectedGPUIndices(gpus []platform.NvidiaGPU, include, exclude []int) ([]int, error) {
|
||||||
|
indexed := make(map[int]struct{}, len(gpus))
|
||||||
|
allIndices := make([]int, 0, len(gpus))
|
||||||
|
for _, gpu := range gpus {
|
||||||
|
indexed[gpu.Index] = struct{}{}
|
||||||
|
allIndices = append(allIndices, gpu.Index)
|
||||||
|
}
|
||||||
|
sort.Ints(allIndices)
|
||||||
|
|
||||||
|
selected := allIndices
|
||||||
|
if len(include) > 0 {
|
||||||
|
selected = make([]int, 0, len(include))
|
||||||
|
seen := make(map[int]struct{}, len(include))
|
||||||
|
for _, idx := range include {
|
||||||
|
if _, ok := indexed[idx]; !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, dup := seen[idx]; dup {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[idx] = struct{}{}
|
||||||
|
selected = append(selected, idx)
|
||||||
|
}
|
||||||
|
sort.Ints(selected)
|
||||||
|
}
|
||||||
|
if len(exclude) > 0 {
|
||||||
|
skip := make(map[int]struct{}, len(exclude))
|
||||||
|
for _, idx := range exclude {
|
||||||
|
skip[idx] = struct{}{}
|
||||||
|
}
|
||||||
|
filtered := selected[:0]
|
||||||
|
for _, idx := range selected {
|
||||||
|
if _, ok := skip[idx]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
filtered = append(filtered, idx)
|
||||||
|
}
|
||||||
|
selected = filtered
|
||||||
|
}
|
||||||
|
if len(selected) == 0 {
|
||||||
|
return nil, fmt.Errorf("no NVIDIA GPUs selected")
|
||||||
|
}
|
||||||
|
return selected, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── SSE helpers ───────────────────────────────────────────────────────────────
|
// ── SSE helpers ───────────────────────────────────────────────────────────────
|
||||||
@@ -189,12 +482,13 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
var body struct {
|
var body struct {
|
||||||
Duration int `json:"duration"`
|
Duration int `json:"duration"`
|
||||||
DiagLevel int `json:"diag_level"`
|
StressMode bool `json:"stress_mode"`
|
||||||
GPUIndices []int `json:"gpu_indices"`
|
GPUIndices []int `json:"gpu_indices"`
|
||||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||||
Loader string `json:"loader"`
|
StaggerGPUStart bool `json:"stagger_gpu_start"`
|
||||||
|
Loader string `json:"loader"`
|
||||||
Profile string `json:"profile"`
|
Profile string `json:"profile"`
|
||||||
DisplayName string `json:"display_name"`
|
DisplayName string `json:"display_name"`
|
||||||
PlatformComponents []string `json:"platform_components"`
|
PlatformComponents []string `json:"platform_components"`
|
||||||
@@ -207,28 +501,29 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
|||||||
}
|
}
|
||||||
|
|
||||||
name := taskDisplayName(target, body.Profile, body.Loader)
|
name := taskDisplayName(target, body.Profile, body.Loader)
|
||||||
t := &Task{
|
if strings.TrimSpace(body.DisplayName) != "" {
|
||||||
ID: newJobID("sat-" + target),
|
name = body.DisplayName
|
||||||
Name: name,
|
}
|
||||||
Target: target,
|
params := taskParams{
|
||||||
Status: TaskPending,
|
|
||||||
CreatedAt: time.Now(),
|
|
||||||
params: taskParams{
|
|
||||||
Duration: body.Duration,
|
Duration: body.Duration,
|
||||||
DiagLevel: body.DiagLevel,
|
StressMode: body.StressMode,
|
||||||
GPUIndices: body.GPUIndices,
|
GPUIndices: body.GPUIndices,
|
||||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||||
|
StaggerGPUStart: body.StaggerGPUStart,
|
||||||
Loader: body.Loader,
|
Loader: body.Loader,
|
||||||
BurnProfile: body.Profile,
|
BurnProfile: body.Profile,
|
||||||
DisplayName: body.DisplayName,
|
DisplayName: body.DisplayName,
|
||||||
PlatformComponents: body.PlatformComponents,
|
PlatformComponents: body.PlatformComponents,
|
||||||
},
|
|
||||||
}
|
}
|
||||||
if strings.TrimSpace(body.DisplayName) != "" {
|
tasks, err := buildNvidiaTaskSet(target, 0, time.Now(), params, name, h.opts.App, "sat-"+target)
|
||||||
t.Name = body.DisplayName
|
if err != nil {
|
||||||
|
writeError(w, http.StatusBadRequest, err.Error())
|
||||||
|
return
|
||||||
}
|
}
|
||||||
globalQueue.enqueue(t)
|
for _, t := range tasks {
|
||||||
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
|
globalQueue.enqueue(t)
|
||||||
|
}
|
||||||
|
writeTaskRunResponse(w, tasks)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -244,6 +539,7 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
|
|||||||
GPUIndices []int `json:"gpu_indices"`
|
GPUIndices []int `json:"gpu_indices"`
|
||||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||||
RunNCCL *bool `json:"run_nccl"`
|
RunNCCL *bool `json:"run_nccl"`
|
||||||
|
ParallelGPUs *bool `json:"parallel_gpus"`
|
||||||
DisplayName string `json:"display_name"`
|
DisplayName string `json:"display_name"`
|
||||||
}
|
}
|
||||||
if r.Body != nil {
|
if r.Body != nil {
|
||||||
@@ -257,27 +553,31 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
|
|||||||
if body.RunNCCL != nil {
|
if body.RunNCCL != nil {
|
||||||
runNCCL = *body.RunNCCL
|
runNCCL = *body.RunNCCL
|
||||||
}
|
}
|
||||||
t := &Task{
|
parallelGPUs := false
|
||||||
ID: newJobID("benchmark-nvidia"),
|
if body.ParallelGPUs != nil {
|
||||||
Name: taskDisplayName("nvidia-benchmark", "", ""),
|
parallelGPUs = *body.ParallelGPUs
|
||||||
Target: "nvidia-benchmark",
|
|
||||||
Priority: 15,
|
|
||||||
Status: TaskPending,
|
|
||||||
CreatedAt: time.Now(),
|
|
||||||
params: taskParams{
|
|
||||||
GPUIndices: body.GPUIndices,
|
|
||||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
|
||||||
SizeMB: body.SizeMB,
|
|
||||||
BenchmarkProfile: body.Profile,
|
|
||||||
RunNCCL: runNCCL,
|
|
||||||
DisplayName: body.DisplayName,
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
name := taskDisplayName("nvidia-benchmark", "", "")
|
||||||
if strings.TrimSpace(body.DisplayName) != "" {
|
if strings.TrimSpace(body.DisplayName) != "" {
|
||||||
t.Name = body.DisplayName
|
name = body.DisplayName
|
||||||
}
|
}
|
||||||
globalQueue.enqueue(t)
|
tasks, err := buildNvidiaTaskSet("nvidia-benchmark", 15, time.Now(), taskParams{
|
||||||
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
|
GPUIndices: body.GPUIndices,
|
||||||
|
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||||
|
SizeMB: body.SizeMB,
|
||||||
|
BenchmarkProfile: body.Profile,
|
||||||
|
RunNCCL: runNCCL,
|
||||||
|
ParallelGPUs: parallelGPUs,
|
||||||
|
DisplayName: body.DisplayName,
|
||||||
|
}, name, h.opts.App, "benchmark-nvidia")
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusBadRequest, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, t := range tasks {
|
||||||
|
globalQueue.enqueue(t)
|
||||||
|
}
|
||||||
|
writeTaskRunResponse(w, tasks)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
|
||||||
@@ -383,11 +683,13 @@ func (h *handler) handleAPIServicesAction(w http.ResponseWriter, r *http.Request
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
result, err := h.opts.App.ServiceActionResult(req.Name, action)
|
result, err := h.opts.App.ServiceActionResult(req.Name, action)
|
||||||
|
status := "ok"
|
||||||
if err != nil {
|
if err != nil {
|
||||||
writeError(w, http.StatusInternalServerError, err.Error())
|
status = "error"
|
||||||
return
|
|
||||||
}
|
}
|
||||||
writeJSON(w, map[string]string{"status": "ok", "output": result.Body})
|
// Always return 200 with output so the frontend can display the actual
|
||||||
|
// systemctl error message instead of a generic "exit status 1".
|
||||||
|
writeJSON(w, map[string]string{"status": status, "output": result.Body})
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Network ───────────────────────────────────────────────────────────────────
|
// ── Network ───────────────────────────────────────────────────────────────────
|
||||||
@@ -555,6 +857,42 @@ func (h *handler) handleAPIGNVIDIAGPUs(w http.ResponseWriter, _ *http.Request) {
|
|||||||
writeJSON(w, gpus)
|
writeJSON(w, gpus)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIGNVIDIAGPUStatuses(w http.ResponseWriter, _ *http.Request) {
|
||||||
|
if h.opts.App == nil {
|
||||||
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
gpus, err := apiListNvidiaGPUStatuses(h.opts.App)
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if gpus == nil {
|
||||||
|
gpus = []platform.NvidiaGPUStatus{}
|
||||||
|
}
|
||||||
|
writeJSON(w, gpus)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIGNVIDIAReset(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if h.opts.App == nil {
|
||||||
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var req struct {
|
||||||
|
Index int `json:"index"`
|
||||||
|
}
|
||||||
|
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||||
|
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
result, err := h.opts.App.ResetNvidiaGPU(req.Index)
|
||||||
|
status := "ok"
|
||||||
|
if err != nil {
|
||||||
|
status = "error"
|
||||||
|
}
|
||||||
|
writeJSON(w, map[string]string{"status": status, "output": result.Body})
|
||||||
|
}
|
||||||
|
|
||||||
func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
|
||||||
if h.opts.App == nil {
|
if h.opts.App == nil {
|
||||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
@@ -1040,107 +1378,3 @@ func (h *handler) rollbackPendingNetworkChange() error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Display / Screen Resolution ───────────────────────────────────────────────
|
|
||||||
|
|
||||||
// displayMode is one resolution listed for an output.
type displayMode struct {
	Output  string `json:"output"`
	Mode    string `json:"mode"`
	Current bool   `json:"current"`
}

// displayInfo groups the modes parsed for a single connected output.
type displayInfo struct {
	Output  string        `json:"output"`
	Modes   []displayMode `json:"modes"`
	Current string        `json:"current"`
}

var (
	// xrandrOutputRE matches an output header line and captures its name.
	xrandrOutputRE = regexp.MustCompile(`^(\S+)\s+connected`)
	// xrandrModeRE matches a mode line (three leading whitespace characters)
	// and captures the WIDTHxHEIGHT token.
	xrandrModeRE = regexp.MustCompile(`^\s{3}(\d+x\d+)\s`)
	// xrandrCurrentRE matches the asterisk that marks a mode line as active.
	xrandrCurrentRE = regexp.MustCompile(`\*`)
)

// parseXrandrOutput turns the text printed by xrandr into one displayInfo per
// connected output, in the order they appear. Mode lines seen before the first
// connected-output header are ignored. It returns nil when no connected output
// is found.
func parseXrandrOutput(out string) []displayInfo {
	var parsed []displayInfo
	for _, row := range strings.Split(out, "\n") {
		if header := xrandrOutputRE.FindStringSubmatch(row); header != nil {
			parsed = append(parsed, displayInfo{Output: header[1]})
			continue
		}
		if len(parsed) == 0 {
			continue
		}
		res := xrandrModeRE.FindStringSubmatch(row)
		if res == nil {
			continue
		}
		// Mode lines belong to the most recently seen output.
		last := &parsed[len(parsed)-1]
		active := xrandrCurrentRE.MatchString(row)
		last.Modes = append(last.Modes, displayMode{
			Output:  last.Output,
			Mode:    res[1],
			Current: active,
		})
		if active {
			last.Current = res[1]
		}
	}
	return parsed
}
|
|
||||||
|
|
||||||
// xrandrCommand builds an xrandr invocation with the given arguments.
//
// The command gets a copy of the current process environment. If DISPLAY or
// XAUTHORITY is unset or empty there, "DISPLAY=:0" and
// "XAUTHORITY=/home/bee/.Xauthority" are appended respectively.
func xrandrCommand(args ...string) *exec.Cmd {
	inherited := os.Environ()

	displaySet, xauthSet := false, false
	for _, entry := range inherited {
		if v, ok := strings.CutPrefix(entry, "DISPLAY="); ok && v != "" {
			displaySet = true
		}
		if v, ok := strings.CutPrefix(entry, "XAUTHORITY="); ok && v != "" {
			xauthSet = true
		}
	}

	merged := make([]string, 0, len(inherited)+2)
	merged = append(merged, inherited...)
	if !displaySet {
		merged = append(merged, "DISPLAY=:0")
	}
	if !xauthSet {
		merged = append(merged, "XAUTHORITY=/home/bee/.Xauthority")
	}

	c := exec.Command("xrandr", args...)
	c.Env = merged
	return c
}
|
|
||||||
|
|
||||||
func (h *handler) handleAPIDisplayResolutions(w http.ResponseWriter, _ *http.Request) {
|
|
||||||
out, err := xrandrCommand().Output()
|
|
||||||
if err != nil {
|
|
||||||
writeError(w, http.StatusInternalServerError, "xrandr: "+err.Error())
|
|
||||||
return
|
|
||||||
}
|
|
||||||
writeJSON(w, parseXrandrOutput(string(out)))
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *handler) handleAPIDisplaySet(w http.ResponseWriter, r *http.Request) {
|
|
||||||
var req struct {
|
|
||||||
Output string `json:"output"`
|
|
||||||
Mode string `json:"mode"`
|
|
||||||
}
|
|
||||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil || req.Output == "" || req.Mode == "" {
|
|
||||||
writeError(w, http.StatusBadRequest, "output and mode are required")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
// Validate mode looks like WxH to prevent injection
|
|
||||||
if !regexp.MustCompile(`^\d+x\d+$`).MatchString(req.Mode) {
|
|
||||||
writeError(w, http.StatusBadRequest, "invalid mode format")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
// Validate output name (no special chars)
|
|
||||||
if !regexp.MustCompile(`^[A-Za-z0-9_\-]+$`).MatchString(req.Output) {
|
|
||||||
writeError(w, http.StatusBadRequest, "invalid output name")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if out, err := xrandrCommand("--output", req.Output, "--mode", req.Mode).CombinedOutput(); err != nil {
|
|
||||||
writeError(w, http.StatusInternalServerError, "xrandr: "+strings.TrimSpace(string(out)))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
writeJSON(w, map[string]string{"status": "ok", "output": req.Output, "mode": req.Mode})
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package webui
|
package webui
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/json"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
@@ -9,30 +10,6 @@ import (
|
|||||||
"bee/audit/internal/platform"
|
"bee/audit/internal/platform"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestXrandrCommandAddsDefaultX11Env(t *testing.T) {
|
|
||||||
t.Setenv("DISPLAY", "")
|
|
||||||
t.Setenv("XAUTHORITY", "")
|
|
||||||
|
|
||||||
cmd := xrandrCommand("--query")
|
|
||||||
|
|
||||||
var hasDisplay bool
|
|
||||||
var hasXAuthority bool
|
|
||||||
for _, kv := range cmd.Env {
|
|
||||||
if kv == "DISPLAY=:0" {
|
|
||||||
hasDisplay = true
|
|
||||||
}
|
|
||||||
if kv == "XAUTHORITY=/home/bee/.Xauthority" {
|
|
||||||
hasXAuthority = true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !hasDisplay {
|
|
||||||
t.Fatalf("DISPLAY not injected: %v", cmd.Env)
|
|
||||||
}
|
|
||||||
if !hasXAuthority {
|
|
||||||
t.Fatalf("XAUTHORITY not injected: %v", cmd.Env)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
||||||
globalQueue.mu.Lock()
|
globalQueue.mu.Lock()
|
||||||
originalTasks := globalQueue.tasks
|
originalTasks := globalQueue.tasks
|
||||||
@@ -74,6 +51,14 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
|||||||
globalQueue.tasks = originalTasks
|
globalQueue.tasks = originalTasks
|
||||||
globalQueue.mu.Unlock()
|
globalQueue.mu.Unlock()
|
||||||
})
|
})
|
||||||
|
prevList := apiListNvidiaGPUs
|
||||||
|
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||||
|
return []platform.NvidiaGPU{
|
||||||
|
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 3, Name: "NVIDIA H100 PCIe"},
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||||
|
|
||||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||||
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
|
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
|
||||||
@@ -101,6 +86,97 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
originalTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = nil
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = originalTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
prevList := apiListNvidiaGPUs
|
||||||
|
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||||
|
return []platform.NvidiaGPU{
|
||||||
|
{Index: 0, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 2, Name: "NVIDIA H200 NVL"},
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||||
|
|
||||||
|
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||||
|
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
|
h.handleAPIBenchmarkNvidiaRun(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
var resp taskRunResponse
|
||||||
|
if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
|
||||||
|
t.Fatalf("decode response: %v", err)
|
||||||
|
}
|
||||||
|
if len(resp.TaskIDs) != 2 {
|
||||||
|
t.Fatalf("task_ids=%v want 2 items", resp.TaskIDs)
|
||||||
|
}
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
if len(globalQueue.tasks) != 2 {
|
||||||
|
t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
|
||||||
|
t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
||||||
|
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
originalTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = nil
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = originalTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
prevList := apiListNvidiaGPUs
|
||||||
|
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||||
|
return []platform.NvidiaGPU{
|
||||||
|
{Index: 0, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 2, Name: "NVIDIA H200 NVL"},
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||||
|
|
||||||
|
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||||
|
req := httptest.NewRequest("POST", "/api/sat/nvidia-targeted-power/run", strings.NewReader(`{"profile":"acceptance","gpu_indices":[0,1,2]}`))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
|
h.handleAPISATRun("nvidia-targeted-power").ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
if len(globalQueue.tasks) != 2 {
|
||||||
|
t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
|
||||||
|
t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
||||||
|
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
||||||
h := &handler{}
|
h := &handler{}
|
||||||
h.pushFanRings([]platform.FanReading{
|
h.pushFanRings([]platform.FanReading{
|
||||||
|
|||||||
@@ -9,13 +9,14 @@ import (
|
|||||||
|
|
||||||
// jobState holds the output lines and completion status of an async job.
|
// jobState holds the output lines and completion status of an async job.
|
||||||
type jobState struct {
|
type jobState struct {
|
||||||
lines []string
|
lines []string
|
||||||
done bool
|
done bool
|
||||||
err string
|
err string
|
||||||
mu sync.Mutex
|
mu sync.Mutex
|
||||||
subs []chan string
|
subs []chan string
|
||||||
cancel func() // optional cancel function; nil if job is not cancellable
|
cancel func() // optional cancel function; nil if job is not cancellable
|
||||||
logPath string
|
logPath string
|
||||||
|
serialPrefix string
|
||||||
}
|
}
|
||||||
|
|
||||||
// abort cancels the job if it has a cancel function and is not yet done.
|
// abort cancels the job if it has a cancel function and is not yet done.
|
||||||
@@ -36,6 +37,9 @@ func (j *jobState) append(line string) {
|
|||||||
if j.logPath != "" {
|
if j.logPath != "" {
|
||||||
appendJobLog(j.logPath, line)
|
appendJobLog(j.logPath, line)
|
||||||
}
|
}
|
||||||
|
if j.serialPrefix != "" {
|
||||||
|
taskSerialWriteLine(j.serialPrefix + line)
|
||||||
|
}
|
||||||
for _, ch := range j.subs {
|
for _, ch := range j.subs {
|
||||||
select {
|
select {
|
||||||
case ch <- line:
|
case ch <- line:
|
||||||
@@ -107,8 +111,11 @@ func (m *jobManager) get(id string) (*jobState, bool) {
|
|||||||
return j, ok
|
return j, ok
|
||||||
}
|
}
|
||||||
|
|
||||||
func newTaskJobState(logPath string) *jobState {
|
func newTaskJobState(logPath string, serialPrefix ...string) *jobState {
|
||||||
j := &jobState{logPath: logPath}
|
j := &jobState{logPath: logPath}
|
||||||
|
if len(serialPrefix) > 0 {
|
||||||
|
j.serialPrefix = serialPrefix[0]
|
||||||
|
}
|
||||||
if logPath == "" {
|
if logPath == "" {
|
||||||
return j
|
return j
|
||||||
}
|
}
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
41
audit/internal/webui/serial_console.go
Normal file
41
audit/internal/webui/serial_console.go
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
var taskSerialWriteLine = writeTaskSerialLine
|
||||||
|
|
||||||
|
// writeTaskSerialLine emits one UTC-timestamped line on the first device of
// /dev/ttyS0, /dev/ttyS1, /dev/console that accepts it.
//
// Blank lines are dropped. The write is best-effort: devices that cannot be
// opened or written are skipped, and if none accepts the line it is silently
// discarded.
func writeTaskSerialLine(line string) {
	line = strings.TrimSpace(line)
	if line == "" {
		return
	}
	payload := fmt.Sprintf("%s %s\n", time.Now().UTC().Format("2006-01-02 15:04:05Z"), line)
	for _, path := range []string{"/dev/ttyS0", "/dev/ttyS1", "/dev/console"} {
		f, err := os.OpenFile(path, os.O_WRONLY|os.O_APPEND, 0)
		if err != nil {
			continue
		}
		_, writeErr := f.WriteString(payload)
		// Close errors are ignored on purpose: the payload has already been
		// handed to the device (or rejected) at this point.
		_ = f.Close()
		if writeErr == nil {
			return
		}
		// The device opened but refused the write; try the next candidate
		// instead of losing the line.
	}
}
|
||||||
|
|
||||||
|
func taskSerialPrefix(t *Task) string {
|
||||||
|
if t == nil {
|
||||||
|
return "[task] "
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("[task %s %s] ", t.ID, t.Name)
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskSerialEvent(t *Task, event string) {
|
||||||
|
if t == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
taskSerialWriteLine(fmt.Sprintf("%s%s", taskSerialPrefix(t), strings.TrimSpace(event)))
|
||||||
|
}
|
||||||
@@ -221,6 +221,11 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
// ── Infrastructure ──────────────────────────────────────────────────────
|
// ── Infrastructure ──────────────────────────────────────────────────────
|
||||||
mux.HandleFunc("GET /healthz", h.handleHealthz)
|
mux.HandleFunc("GET /healthz", h.handleHealthz)
|
||||||
mux.HandleFunc("GET /api/ready", h.handleReady)
|
mux.HandleFunc("GET /api/ready", h.handleReady)
|
||||||
|
mux.HandleFunc("GET /loading", func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||||
|
_, _ = w.Write([]byte(loadingPageHTML))
|
||||||
|
})
|
||||||
|
|
||||||
// ── Existing read-only endpoints (preserved for compatibility) ──────────
|
// ── Existing read-only endpoints (preserved for compatibility) ──────────
|
||||||
mux.HandleFunc("GET /audit.json", h.handleAuditJSON)
|
mux.HandleFunc("GET /audit.json", h.handleAuditJSON)
|
||||||
@@ -265,6 +270,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
mux.HandleFunc("POST /api/tasks/{id}/cancel", h.handleAPITasksCancel)
|
mux.HandleFunc("POST /api/tasks/{id}/cancel", h.handleAPITasksCancel)
|
||||||
mux.HandleFunc("POST /api/tasks/{id}/priority", h.handleAPITasksPriority)
|
mux.HandleFunc("POST /api/tasks/{id}/priority", h.handleAPITasksPriority)
|
||||||
mux.HandleFunc("GET /api/tasks/{id}/stream", h.handleAPITasksStream)
|
mux.HandleFunc("GET /api/tasks/{id}/stream", h.handleAPITasksStream)
|
||||||
|
mux.HandleFunc("GET /api/tasks/{id}/charts", h.handleAPITaskChartsIndex)
|
||||||
|
mux.HandleFunc("GET /api/tasks/{id}/chart/", h.handleAPITaskChartSVG)
|
||||||
mux.HandleFunc("GET /tasks/{id}", h.handleTaskPage)
|
mux.HandleFunc("GET /tasks/{id}", h.handleTaskPage)
|
||||||
|
|
||||||
// Services
|
// Services
|
||||||
@@ -288,13 +295,11 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
// Tools
|
// Tools
|
||||||
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
|
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
|
||||||
|
|
||||||
// Display
|
|
||||||
mux.HandleFunc("GET /api/display/resolutions", h.handleAPIDisplayResolutions)
|
|
||||||
mux.HandleFunc("POST /api/display/set", h.handleAPIDisplaySet)
|
|
||||||
|
|
||||||
// GPU presence / tools
|
// GPU presence / tools
|
||||||
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
||||||
mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs)
|
mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs)
|
||||||
|
mux.HandleFunc("GET /api/gpu/nvidia-status", h.handleAPIGNVIDIAGPUStatuses)
|
||||||
|
mux.HandleFunc("POST /api/gpu/nvidia-reset", h.handleAPIGNVIDIAReset)
|
||||||
mux.HandleFunc("GET /api/gpu/tools", h.handleAPIGPUTools)
|
mux.HandleFunc("GET /api/gpu/tools", h.handleAPIGPUTools)
|
||||||
|
|
||||||
// System
|
// System
|
||||||
@@ -1207,37 +1212,106 @@ const loadingPageHTML = `<!DOCTYPE html>
|
|||||||
<html lang="en">
|
<html lang="en">
|
||||||
<head>
|
<head>
|
||||||
<meta charset="UTF-8">
|
<meta charset="UTF-8">
|
||||||
<title>EASY-BEE</title>
|
<title>EASY-BEE — Starting</title>
|
||||||
<style>
|
<style>
|
||||||
*{margin:0;padding:0;box-sizing:border-box}
|
*{margin:0;padding:0;box-sizing:border-box}
|
||||||
html,body{height:100%;background:#0f1117;display:flex;align-items:center;justify-content:center;font-family:'Courier New',monospace;color:#e2e8f0}
|
html,body{height:100%;background:#0f1117;display:flex;align-items:center;justify-content:center;font-family:'Courier New',monospace;color:#e2e8f0}
|
||||||
.logo{font-size:13px;line-height:1.4;color:#f6c90e;margin-bottom:48px;white-space:pre}
|
.wrap{text-align:center;width:420px}
|
||||||
.spinner{width:48px;height:48px;border:4px solid #2d3748;border-top-color:#f6c90e;border-radius:50%;animation:spin .8s linear infinite;margin:0 auto 24px}
|
.logo{font-size:11px;line-height:1.4;color:#f6c90e;margin-bottom:6px;white-space:pre;text-align:left}
|
||||||
|
.subtitle{font-size:12px;color:#a0aec0;text-align:left;margin-bottom:24px;padding-left:2px}
|
||||||
|
.spinner{width:36px;height:36px;border:3px solid #2d3748;border-top-color:#f6c90e;border-radius:50%;animation:spin .8s linear infinite;margin:0 auto 14px}
|
||||||
|
.spinner.hidden{display:none}
|
||||||
@keyframes spin{to{transform:rotate(360deg)}}
|
@keyframes spin{to{transform:rotate(360deg)}}
|
||||||
.status{font-size:14px;color:#a0aec0;letter-spacing:.05em}
|
.status{font-size:13px;color:#a0aec0;margin-bottom:20px;min-height:18px}
|
||||||
|
table{width:100%;border-collapse:collapse;font-size:12px;margin-bottom:20px;display:none}
|
||||||
|
td{padding:3px 6px;text-align:left}
|
||||||
|
td:first-child{color:#718096;width:55%}
|
||||||
|
.ok{color:#68d391}
|
||||||
|
.run{color:#f6c90e}
|
||||||
|
.fail{color:#fc8181}
|
||||||
|
.dim{color:#4a5568}
|
||||||
|
.btn{background:#1a202c;color:#a0aec0;border:1px solid #2d3748;padding:7px 18px;font-size:12px;cursor:pointer;font-family:inherit;display:none}
|
||||||
|
.btn:hover{border-color:#718096;color:#e2e8f0}
|
||||||
</style>
|
</style>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<div style="text-align:center">
|
<div class="wrap">
|
||||||
<div class="logo"> ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗
|
<div class="logo"> ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗
|
||||||
██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝
|
██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝
|
||||||
█████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗
|
█████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗
|
||||||
██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝
|
██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝
|
||||||
███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗
|
███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗
|
||||||
╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝</div>
|
╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝</div>
|
||||||
<div class="spinner"></div>
|
<div class="subtitle">Hardware Audit LiveCD</div>
|
||||||
<div class="status" id="s">Starting up...</div>
|
<div class="spinner" id="spin"></div>
|
||||||
|
<div class="status" id="st">Connecting to bee-web...</div>
|
||||||
|
<table id="tbl"></table>
|
||||||
|
<button class="btn" id="btn" onclick="go()">Open app now</button>
|
||||||
</div>
|
</div>
|
||||||
<script>
|
<script>
|
||||||
function probe(){
|
(function(){
|
||||||
fetch('/api/ready',{cache:'no-store'})
|
var gone = false;
|
||||||
.then(function(r){
|
function go(){ if(!gone){gone=true;window.location.replace('/');} }
|
||||||
if(r.ok){window.location.replace('/');}
|
|
||||||
else{setTimeout(probe,1000);}
|
function icon(s){
|
||||||
|
if(s==='active') return '<span class="ok">● active</span>';
|
||||||
|
if(s==='failed') return '<span class="fail">✕ failed</span>';
|
||||||
|
if(s==='activating'||s==='reloading') return '<span class="run">○ starting</span>';
|
||||||
|
if(s==='inactive') return '<span class="dim">○ inactive</span>';
|
||||||
|
return '<span class="dim">'+s+'</span>';
|
||||||
|
}
|
||||||
|
|
||||||
|
function allSettled(svcs){
|
||||||
|
for(var i=0;i<svcs.length;i++){
|
||||||
|
var s=svcs[i].state;
|
||||||
|
if(s!=='active'&&s!=='failed'&&s!=='inactive') return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
var pollTimer=null;
|
||||||
|
|
||||||
|
function pollServices(){
|
||||||
|
fetch('/api/services',{cache:'no-store'})
|
||||||
|
.then(function(r){return r.json();})
|
||||||
|
.then(function(svcs){
|
||||||
|
if(!svcs||!svcs.length) return;
|
||||||
|
var tbl=document.getElementById('tbl');
|
||||||
|
tbl.style.display='';
|
||||||
|
var html='';
|
||||||
|
for(var i=0;i<svcs.length;i++)
|
||||||
|
html+='<tr><td>'+svcs[i].name+'</td><td>'+icon(svcs[i].state)+'</td></tr>';
|
||||||
|
tbl.innerHTML=html;
|
||||||
|
if(allSettled(svcs)){
|
||||||
|
clearInterval(pollTimer);
|
||||||
|
document.getElementById('spin').className='spinner hidden';
|
||||||
|
document.getElementById('st').textContent='Ready \u2014 opening...';
|
||||||
|
setTimeout(go,800);
|
||||||
|
}
|
||||||
})
|
})
|
||||||
.catch(function(){setTimeout(probe,1000);});
|
.catch(function(){});
|
||||||
|
}
|
||||||
|
|
||||||
|
function probe(){
|
||||||
|
fetch('/healthz',{cache:'no-store'})
|
||||||
|
.then(function(r){
|
||||||
|
if(r.ok){
|
||||||
|
document.getElementById('st').textContent='bee-web running \u2014 checking services...';
|
||||||
|
document.getElementById('btn').style.display='';
|
||||||
|
pollServices();
|
||||||
|
pollTimer=setInterval(pollServices,1500);
|
||||||
|
} else {
|
||||||
|
document.getElementById('st').textContent='bee-web starting (status '+r.status+')...';
|
||||||
|
setTimeout(probe,500);
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.catch(function(){
|
||||||
|
document.getElementById('st').textContent='Waiting for bee-web to start...';
|
||||||
|
setTimeout(probe,500);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
probe();
|
probe();
|
||||||
|
})();
|
||||||
</script>
|
</script>
|
||||||
</body>
|
</body>
|
||||||
</html>`
|
</html>`
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package webui
|
package webui
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/json"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
"os"
|
"os"
|
||||||
@@ -590,7 +591,7 @@ func TestTasksPageRendersOpenLinksAndPaginationControls(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
|
func TestToolsPageRendersNvidiaSelfHealSection(t *testing.T) {
|
||||||
handler := NewHandler(HandlerOptions{})
|
handler := NewHandler(HandlerOptions{})
|
||||||
rec := httptest.NewRecorder()
|
rec := httptest.NewRecorder()
|
||||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tools", nil))
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tools", nil))
|
||||||
@@ -598,11 +599,20 @@ func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
|
|||||||
t.Fatalf("status=%d", rec.Code)
|
t.Fatalf("status=%d", rec.Code)
|
||||||
}
|
}
|
||||||
body := rec.Body.String()
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, `NVIDIA Self Heal`) {
|
||||||
|
t.Fatalf("tools page missing nvidia self heal section: %s", body)
|
||||||
|
}
|
||||||
if !strings.Contains(body, `Restart GPU Drivers`) {
|
if !strings.Contains(body, `Restart GPU Drivers`) {
|
||||||
t.Fatalf("tools page missing restart gpu drivers button: %s", body)
|
t.Fatalf("tools page missing restart gpu drivers button: %s", body)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `svcAction('bee-nvidia', 'restart')`) {
|
if !strings.Contains(body, `nvidiaRestartDrivers()`) {
|
||||||
t.Fatalf("tools page missing bee-nvidia restart action: %s", body)
|
t.Fatalf("tools page missing nvidiaRestartDrivers action: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `/api/gpu/nvidia-status`) {
|
||||||
|
t.Fatalf("tools page missing nvidia status api usage: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `nvidiaResetGPU(`) {
|
||||||
|
t.Fatalf("tools page missing nvidiaResetGPU action: %s", body)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `id="boot-source-text"`) {
|
if !strings.Contains(body, `id="boot-source-text"`) {
|
||||||
t.Fatalf("tools page missing boot source field: %s", body)
|
t.Fatalf("tools page missing boot source field: %s", body)
|
||||||
@@ -636,6 +646,66 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
exportDir := filepath.Join(dir, "export")
|
||||||
|
runDir := filepath.Join(exportDir, "bee-benchmark", "gpu-benchmark-20260406-120000")
|
||||||
|
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
result := platform.NvidiaBenchmarkResult{
|
||||||
|
GeneratedAt: time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC),
|
||||||
|
BenchmarkProfile: "standard",
|
||||||
|
OverallStatus: "OK",
|
||||||
|
GPUs: []platform.BenchmarkGPUResult{
|
||||||
|
{
|
||||||
|
Index: 0,
|
||||||
|
Name: "NVIDIA H100 PCIe",
|
||||||
|
Scores: platform.BenchmarkScorecard{
|
||||||
|
CompositeScore: 1176.25,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Index: 1,
|
||||||
|
Name: "NVIDIA H100 PCIe",
|
||||||
|
Scores: platform.BenchmarkScorecard{
|
||||||
|
CompositeScore: 1168.50,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
raw, err := json.Marshal(result)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, "result.json"), raw, 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{ExportDir: exportDir})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
wantTime := result.GeneratedAt.Local().Format("2006-01-02 15:04:05")
|
||||||
|
for _, needle := range []string{
|
||||||
|
`Benchmark Results`,
|
||||||
|
`Composite score by saved benchmark run and GPU.`,
|
||||||
|
`GPU #0 — NVIDIA H100 PCIe`,
|
||||||
|
`GPU #1 — NVIDIA H100 PCIe`,
|
||||||
|
`#1`,
|
||||||
|
wantTime,
|
||||||
|
`1176.25`,
|
||||||
|
`1168.50`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(body, needle) {
|
||||||
|
t.Fatalf("benchmark page missing %q: %s", needle, body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
|
func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
|
||||||
handler := NewHandler(HandlerOptions{})
|
handler := NewHandler(HandlerOptions{})
|
||||||
rec := httptest.NewRecorder()
|
rec := httptest.NewRecorder()
|
||||||
@@ -649,6 +719,10 @@ func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
|
|||||||
`nvidia-targeted-stress`,
|
`nvidia-targeted-stress`,
|
||||||
`controlled NVIDIA DCGM load`,
|
`controlled NVIDIA DCGM load`,
|
||||||
`<code>dcgmi diag targeted_stress</code>`,
|
`<code>dcgmi diag targeted_stress</code>`,
|
||||||
|
`NVIDIA GPU Selection`,
|
||||||
|
`All NVIDIA validate tasks use only the GPUs selected here.`,
|
||||||
|
`Select All`,
|
||||||
|
`id="sat-gpu-list"`,
|
||||||
} {
|
} {
|
||||||
if !strings.Contains(body, needle) {
|
if !strings.Contains(body, needle) {
|
||||||
t.Fatalf("validate page missing %q: %s", needle, body)
|
t.Fatalf("validate page missing %q: %s", needle, body)
|
||||||
@@ -667,8 +741,8 @@ func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
|
|||||||
for _, needle := range []string{
|
for _, needle := range []string{
|
||||||
`NVIDIA Max Compute Load`,
|
`NVIDIA Max Compute Load`,
|
||||||
`dcgmproftester`,
|
`dcgmproftester`,
|
||||||
`targeted_stress remain in <a href="/validate">Validate</a>`,
|
`NCCL`,
|
||||||
`NVIDIA Interconnect Test (NCCL all_reduce_perf)`,
|
`Validate → Stress mode`,
|
||||||
`id="burn-gpu-list"`,
|
`id="burn-gpu-list"`,
|
||||||
} {
|
} {
|
||||||
if !strings.Contains(body, needle) {
|
if !strings.Contains(body, needle) {
|
||||||
@@ -723,6 +797,111 @@ func TestTaskDetailPageRendersSavedReport(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestTaskDetailPageRendersCancelForRunningTask(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
origTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = []*Task{{
|
||||||
|
ID: "task-live-1",
|
||||||
|
Name: "CPU SAT",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
}}
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = origTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{Title: "Bee Hardware Audit"})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks/task-live-1", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, `Cancel</button>`) {
|
||||||
|
t.Fatalf("task detail page missing cancel button: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `function cancelTaskDetail(id)`) {
|
||||||
|
t.Fatalf("task detail page missing cancel handler: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `/api/tasks/' + id + '/cancel`) {
|
||||||
|
t.Fatalf("task detail page missing cancel endpoint: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `id="task-live-charts"`) {
|
||||||
|
t.Fatalf("task detail page missing live charts container: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `/api/tasks/' + taskId + '/charts`) {
|
||||||
|
t.Fatalf("task detail page missing live charts index endpoint: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTaskChartSVGUsesTaskTimeWindow(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
metricsPath := filepath.Join(dir, "metrics.db")
|
||||||
|
prevMetricsPath := taskReportMetricsDBPath
|
||||||
|
taskReportMetricsDBPath = metricsPath
|
||||||
|
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
|
||||||
|
|
||||||
|
db, err := openMetricsDB(metricsPath)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("openMetricsDB: %v", err)
|
||||||
|
}
|
||||||
|
base := time.Now().UTC()
|
||||||
|
samples := []platform.LiveMetricSample{
|
||||||
|
{Timestamp: base.Add(-3 * time.Minute), PowerW: 100},
|
||||||
|
{Timestamp: base.Add(-2 * time.Minute), PowerW: 200},
|
||||||
|
{Timestamp: base.Add(-1 * time.Minute), PowerW: 300},
|
||||||
|
}
|
||||||
|
for _, sample := range samples {
|
||||||
|
if err := db.Write(sample); err != nil {
|
||||||
|
t.Fatalf("Write: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ = db.Close()
|
||||||
|
|
||||||
|
started := base.Add(-2*time.Minute - 5*time.Second)
|
||||||
|
done := base.Add(-1*time.Minute + 5*time.Second)
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
origTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = []*Task{{
|
||||||
|
ID: "task-chart-1",
|
||||||
|
Name: "Power Window",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskDone,
|
||||||
|
CreatedAt: started.Add(-10 * time.Second),
|
||||||
|
StartedAt: &started,
|
||||||
|
DoneAt: &done,
|
||||||
|
}}
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = origTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{Title: "Bee Hardware Audit"})
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/api/tasks/task-chart-1/chart/server-power.svg", nil)
|
||||||
|
req.SetPathValue("id", "task-chart-1")
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, req)
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, "System Power") {
|
||||||
|
t.Fatalf("task chart missing expected title: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, "min 200") {
|
||||||
|
t.Fatalf("task chart stats should start from in-window sample: %s", body)
|
||||||
|
}
|
||||||
|
if strings.Contains(body, "min 100") {
|
||||||
|
t.Fatalf("task chart should not include pre-task sample in stats: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestViewerRendersLatestSnapshot(t *testing.T) {
|
func TestViewerRendersLatestSnapshot(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
path := filepath.Join(dir, "audit.json")
|
path := filepath.Join(dir, "audit.json")
|
||||||
@@ -915,6 +1094,7 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
|
|||||||
}
|
}
|
||||||
body := rec.Body.String()
|
body := rec.Body.String()
|
||||||
for _, needle := range []string{
|
for _, needle := range []string{
|
||||||
|
// Runtime Health card — LiveCD checks only
|
||||||
`Runtime Health`,
|
`Runtime Health`,
|
||||||
`<th>Check</th><th>Status</th><th>Source</th><th>Issue</th>`,
|
`<th>Check</th><th>Status</th><th>Source</th><th>Issue</th>`,
|
||||||
`Export Directory`,
|
`Export Directory`,
|
||||||
@@ -923,16 +1103,18 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
|
|||||||
`CUDA / ROCm`,
|
`CUDA / ROCm`,
|
||||||
`Required Utilities`,
|
`Required Utilities`,
|
||||||
`Bee Services`,
|
`Bee Services`,
|
||||||
`<td>CPU</td>`,
|
|
||||||
`<td>Memory</td>`,
|
|
||||||
`<td>Storage</td>`,
|
|
||||||
`<td>GPU</td>`,
|
|
||||||
`CUDA runtime is not ready for GPU SAT.`,
|
`CUDA runtime is not ready for GPU SAT.`,
|
||||||
`Missing: nvidia-smi`,
|
`Missing: nvidia-smi`,
|
||||||
`bee-nvidia=inactive`,
|
`bee-nvidia=inactive`,
|
||||||
`cpu SAT: FAILED`,
|
// Hardware Summary card — component health badges
|
||||||
`storage SAT: FAILED`,
|
`Hardware Summary`,
|
||||||
`sat:nvidia`,
|
`>CPU<`,
|
||||||
|
`>Memory<`,
|
||||||
|
`>Storage<`,
|
||||||
|
`>GPU<`,
|
||||||
|
`>PSU<`,
|
||||||
|
`badge-warn`, // cpu Warning badge
|
||||||
|
`badge-err`, // storage Critical badge
|
||||||
} {
|
} {
|
||||||
if !strings.Contains(body, needle) {
|
if !strings.Contains(body, needle) {
|
||||||
t.Fatalf("dashboard missing %q: %s", needle, body)
|
t.Fatalf("dashboard missing %q: %s", needle, body)
|
||||||
|
|||||||
@@ -1,11 +1,15 @@
|
|||||||
package webui
|
package webui
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"html"
|
"html"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
"strings"
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
)
|
)
|
||||||
|
|
||||||
func (h *handler) handleTaskPage(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleTaskPage(w http.ResponseWriter, r *http.Request) {
|
||||||
@@ -22,6 +26,51 @@ func (h *handler) handleTaskPage(w http.ResponseWriter, r *http.Request) {
|
|||||||
_, _ = w.Write([]byte(body))
|
_, _ = w.Write([]byte(body))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPITaskChartsIndex(w http.ResponseWriter, r *http.Request) {
|
||||||
|
task, samples, _, _, ok := h.taskSamplesForRequest(r)
|
||||||
|
if !ok {
|
||||||
|
http.NotFound(w, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
type taskChartIndexEntry struct {
|
||||||
|
Title string `json:"title"`
|
||||||
|
File string `json:"file"`
|
||||||
|
}
|
||||||
|
entries := make([]taskChartIndexEntry, 0)
|
||||||
|
for _, spec := range taskChartSpecsForSamples(samples) {
|
||||||
|
title, _, ok := renderTaskChartSVG(spec.Path, samples, taskTimelineForTask(task))
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
entries = append(entries, taskChartIndexEntry{Title: title, File: spec.File})
|
||||||
|
}
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
w.Header().Set("Content-Type", "application/json; charset=utf-8")
|
||||||
|
_ = json.NewEncoder(w).Encode(entries)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPITaskChartSVG(w http.ResponseWriter, r *http.Request) {
|
||||||
|
task, samples, _, _, ok := h.taskSamplesForRequest(r)
|
||||||
|
if !ok {
|
||||||
|
http.NotFound(w, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
file := strings.TrimPrefix(r.URL.Path, "/api/tasks/"+task.ID+"/chart/")
|
||||||
|
path, ok := taskChartPathFromFile(file)
|
||||||
|
if !ok {
|
||||||
|
http.NotFound(w, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
title, buf, hasData := renderTaskChartSVG(path, samples, taskTimelineForTask(task))
|
||||||
|
if !hasData || len(buf) == 0 || strings.TrimSpace(title) == "" {
|
||||||
|
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
w.Header().Set("Content-Type", "image/svg+xml")
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
_, _ = w.Write(buf)
|
||||||
|
}
|
||||||
|
|
||||||
func renderTaskDetailPage(opts HandlerOptions, task Task) string {
|
func renderTaskDetailPage(opts HandlerOptions, task Task) string {
|
||||||
title := task.Name
|
title := task.Name
|
||||||
if strings.TrimSpace(title) == "" {
|
if strings.TrimSpace(title) == "" {
|
||||||
@@ -30,6 +79,9 @@ func renderTaskDetailPage(opts HandlerOptions, task Task) string {
|
|||||||
var body strings.Builder
|
var body strings.Builder
|
||||||
body.WriteString(`<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">`)
|
body.WriteString(`<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">`)
|
||||||
body.WriteString(`<a class="btn btn-secondary btn-sm" href="/tasks">Back to Tasks</a>`)
|
body.WriteString(`<a class="btn btn-secondary btn-sm" href="/tasks">Back to Tasks</a>`)
|
||||||
|
if task.Status == TaskRunning || task.Status == TaskPending {
|
||||||
|
body.WriteString(`<button class="btn btn-danger btn-sm" onclick="cancelTaskDetail('` + html.EscapeString(task.ID) + `')">Cancel</button>`)
|
||||||
|
}
|
||||||
body.WriteString(`<span style="font-size:12px;color:var(--muted)">Artifacts are saved in the task folder under <code>./tasks</code>.</span>`)
|
body.WriteString(`<span style="font-size:12px;color:var(--muted)">Artifacts are saved in the task folder under <code>./tasks</code>.</span>`)
|
||||||
body.WriteString(`</div>`)
|
body.WriteString(`</div>`)
|
||||||
|
|
||||||
@@ -45,17 +97,113 @@ func renderTaskDetailPage(opts HandlerOptions, task Task) string {
|
|||||||
body.WriteString(`</div></div>`)
|
body.WriteString(`</div></div>`)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if task.Status == TaskRunning {
|
||||||
|
body.WriteString(`<div class="card"><div class="card-head">Live Charts</div><div class="card-body">`)
|
||||||
|
body.WriteString(`<div id="task-live-charts" style="display:flex;flex-direction:column;gap:16px;color:var(--muted);font-size:13px">Loading charts...</div>`)
|
||||||
|
body.WriteString(`</div></div>`)
|
||||||
|
}
|
||||||
|
|
||||||
if task.Status == TaskRunning || task.Status == TaskPending {
|
if task.Status == TaskRunning || task.Status == TaskPending {
|
||||||
body.WriteString(`<div class="card"><div class="card-head">Live Logs</div><div class="card-body">`)
|
body.WriteString(`<div class="card"><div class="card-head">Live Logs</div><div class="card-body">`)
|
||||||
body.WriteString(`<div id="task-live-log" class="terminal" style="max-height:none;white-space:pre-wrap">Connecting...</div>`)
|
body.WriteString(`<div id="task-live-log" class="terminal" style="max-height:none;white-space:pre-wrap">Connecting...</div>`)
|
||||||
body.WriteString(`</div></div>`)
|
body.WriteString(`</div></div>`)
|
||||||
body.WriteString(`<script>
|
body.WriteString(`<script>
|
||||||
|
function cancelTaskDetail(id) {
|
||||||
|
fetch('/api/tasks/' + id + '/cancel', {method:'POST'}).then(function(){
|
||||||
|
var term = document.getElementById('task-live-log');
|
||||||
|
if (term) {
|
||||||
|
term.textContent += '\nCancel requested.\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function renderTaskLiveCharts(taskId, charts) {
|
||||||
|
const host = document.getElementById('task-live-charts');
|
||||||
|
if (!host) return;
|
||||||
|
if (!Array.isArray(charts) || charts.length === 0) {
|
||||||
|
host.innerHTML = 'Waiting for metric samples...';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const seen = {};
|
||||||
|
charts.forEach(function(chart) {
|
||||||
|
seen[chart.file] = true;
|
||||||
|
let img = host.querySelector('img[data-chart-file="' + chart.file + '"]');
|
||||||
|
if (img) {
|
||||||
|
const card = img.closest('.card');
|
||||||
|
if (card) {
|
||||||
|
const title = card.querySelector('.card-head');
|
||||||
|
if (title) title.textContent = chart.title;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const card = document.createElement('div');
|
||||||
|
card.className = 'card';
|
||||||
|
card.style.margin = '0';
|
||||||
|
card.innerHTML = '<div class="card-head"></div><div class="card-body" style="padding:12px"></div>';
|
||||||
|
card.querySelector('.card-head').textContent = chart.title;
|
||||||
|
const body = card.querySelector('.card-body');
|
||||||
|
img = document.createElement('img');
|
||||||
|
img.setAttribute('data-task-chart', '1');
|
||||||
|
img.setAttribute('data-chart-file', chart.file);
|
||||||
|
img.setAttribute('data-base-src', '/api/tasks/' + taskId + '/chart/' + chart.file);
|
||||||
|
img.src = '/api/tasks/' + taskId + '/chart/' + chart.file + '?t=' + Date.now();
|
||||||
|
img.style.width = '100%';
|
||||||
|
img.style.display = 'block';
|
||||||
|
img.style.borderRadius = '6px';
|
||||||
|
img.alt = chart.title;
|
||||||
|
body.appendChild(img);
|
||||||
|
host.appendChild(card);
|
||||||
|
});
|
||||||
|
Array.from(host.querySelectorAll('img[data-task-chart="1"]')).forEach(function(img) {
|
||||||
|
const file = img.getAttribute('data-chart-file') || '';
|
||||||
|
if (seen[file]) return;
|
||||||
|
const card = img.closest('.card');
|
||||||
|
if (card) card.remove();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function loadTaskLiveCharts(taskId) {
|
||||||
|
fetch('/api/tasks/' + taskId + '/charts').then(function(r){ return r.json(); }).then(function(charts){
|
||||||
|
renderTaskLiveCharts(taskId, charts);
|
||||||
|
}).catch(function(){
|
||||||
|
const host = document.getElementById('task-live-charts');
|
||||||
|
if (host) host.innerHTML = 'Task charts are unavailable.';
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function refreshTaskLiveCharts() {
|
||||||
|
document.querySelectorAll('img[data-task-chart="1"]').forEach(function(img){
|
||||||
|
const base = img.dataset.baseSrc;
|
||||||
|
if (!base) return;
|
||||||
|
img.src = base + '?t=' + Date.now();
|
||||||
|
});
|
||||||
|
}
|
||||||
var _taskDetailES = new EventSource('/api/tasks/` + html.EscapeString(task.ID) + `/stream');
|
var _taskDetailES = new EventSource('/api/tasks/` + html.EscapeString(task.ID) + `/stream');
|
||||||
var _taskDetailTerm = document.getElementById('task-live-log');
|
var _taskDetailTerm = document.getElementById('task-live-log');
|
||||||
|
var _taskChartTimer = null;
|
||||||
|
var _taskChartsFrozen = false;
|
||||||
_taskDetailES.onopen = function(){ _taskDetailTerm.textContent = ''; };
|
_taskDetailES.onopen = function(){ _taskDetailTerm.textContent = ''; };
|
||||||
_taskDetailES.onmessage = function(e){ _taskDetailTerm.textContent += e.data + "\n"; _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight; };
|
_taskDetailES.onmessage = function(e){ _taskDetailTerm.textContent += e.data + "\n"; _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight; };
|
||||||
_taskDetailES.addEventListener('done', function(){ _taskDetailES.close(); setTimeout(function(){ window.location.reload(); }, 1000); });
|
_taskDetailES.addEventListener('done', function(e){
|
||||||
_taskDetailES.onerror = function(){ _taskDetailES.close(); };
|
if (_taskChartTimer) clearInterval(_taskChartTimer);
|
||||||
|
_taskDetailES.close();
|
||||||
|
_taskDetailES = null;
|
||||||
|
_taskChartsFrozen = true;
|
||||||
|
_taskDetailTerm.textContent += (e.data ? '\nTask finished with error.\n' : '\nTask finished.\n');
|
||||||
|
_taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight;
|
||||||
|
refreshTaskLiveCharts();
|
||||||
|
});
|
||||||
|
_taskDetailES.onerror = function(){
|
||||||
|
if (_taskChartTimer) clearInterval(_taskChartTimer);
|
||||||
|
if (_taskDetailES) {
|
||||||
|
_taskDetailES.close();
|
||||||
|
_taskDetailES = null;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
|
||||||
|
_taskChartTimer = setInterval(function(){
|
||||||
|
if (_taskChartsFrozen) return;
|
||||||
|
loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
|
||||||
|
refreshTaskLiveCharts();
|
||||||
|
}, 2000);
|
||||||
</script>`)
|
</script>`)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -83,3 +231,37 @@ func taskArtifactDownloadLink(task Task, absPath string) string {
|
|||||||
}
|
}
|
||||||
return fmt.Sprintf(`/export/file?path=%s`, absPath)
|
return fmt.Sprintf(`/export/file?path=%s`, absPath)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (h *handler) taskSamplesForRequest(r *http.Request) (Task, []platform.LiveMetricSample, time.Time, time.Time, bool) {
|
||||||
|
id := r.PathValue("id")
|
||||||
|
taskPtr, ok := globalQueue.findByID(id)
|
||||||
|
if !ok {
|
||||||
|
return Task{}, nil, time.Time{}, time.Time{}, false
|
||||||
|
}
|
||||||
|
task := *taskPtr
|
||||||
|
start, end := taskTimeWindow(&task)
|
||||||
|
samples, err := loadTaskMetricSamples(start, end)
|
||||||
|
if err != nil {
|
||||||
|
return task, nil, start, end, true
|
||||||
|
}
|
||||||
|
return task, samples, start, end, true
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskTimelineForTask(task Task) []chartTimelineSegment {
|
||||||
|
start, end := taskTimeWindow(&task)
|
||||||
|
return []chartTimelineSegment{{Start: start, End: end, Active: true}}
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskChartPathFromFile(file string) (string, bool) {
|
||||||
|
file = strings.TrimSpace(file)
|
||||||
|
for _, spec := range taskDashboardChartSpecs {
|
||||||
|
if spec.File == file {
|
||||||
|
return spec.Path, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if strings.HasPrefix(file, "gpu-") && strings.HasSuffix(file, "-overview.svg") {
|
||||||
|
id := strings.TrimSuffix(strings.TrimPrefix(file, "gpu-"), "-overview.svg")
|
||||||
|
return "gpu/" + id + "-overview", true
|
||||||
|
}
|
||||||
|
return "", false
|
||||||
|
}
|
||||||
|
|||||||
@@ -53,6 +53,18 @@ var taskDashboardChartSpecs = []taskChartSpec{
|
|||||||
{Path: "gpu-all-temp", File: "gpu-all-temp.svg"},
|
{Path: "gpu-all-temp", File: "gpu-all-temp.svg"},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func taskChartSpecsForSamples(samples []platform.LiveMetricSample) []taskChartSpec {
|
||||||
|
specs := make([]taskChartSpec, 0, len(taskDashboardChartSpecs)+len(taskGPUIndices(samples)))
|
||||||
|
specs = append(specs, taskDashboardChartSpecs...)
|
||||||
|
for _, idx := range taskGPUIndices(samples) {
|
||||||
|
specs = append(specs, taskChartSpec{
|
||||||
|
Path: fmt.Sprintf("gpu/%d-overview", idx),
|
||||||
|
File: fmt.Sprintf("gpu-%d-overview.svg", idx),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return specs
|
||||||
|
}
|
||||||
|
|
||||||
func writeTaskReportArtifacts(t *Task) error {
|
func writeTaskReportArtifacts(t *Task) error {
|
||||||
if t == nil {
|
if t == nil {
|
||||||
return nil
|
return nil
|
||||||
@@ -136,7 +148,7 @@ func writeTaskCharts(dir string, start, end time.Time, samples []platform.LiveMe
|
|||||||
timeline := []chartTimelineSegment{{Start: start, End: end, Active: true}}
|
timeline := []chartTimelineSegment{{Start: start, End: end, Active: true}}
|
||||||
var charts []taskReportChart
|
var charts []taskReportChart
|
||||||
inline := make(map[string]string)
|
inline := make(map[string]string)
|
||||||
for _, spec := range taskDashboardChartSpecs {
|
for _, spec := range taskChartSpecsForSamples(samples) {
|
||||||
title, svg, ok := renderTaskChartSVG(spec.Path, samples, timeline)
|
title, svg, ok := renderTaskChartSVG(spec.Path, samples, timeline)
|
||||||
if !ok || len(svg) == 0 {
|
if !ok || len(svg) == 0 {
|
||||||
continue
|
continue
|
||||||
@@ -148,24 +160,17 @@ func writeTaskCharts(dir string, start, end time.Time, samples []platform.LiveMe
|
|||||||
charts = append(charts, taskReportChart{Title: title, File: spec.File})
|
charts = append(charts, taskReportChart{Title: title, File: spec.File})
|
||||||
inline[spec.File] = string(svg)
|
inline[spec.File] = string(svg)
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, idx := range taskGPUIndices(samples) {
|
|
||||||
file := fmt.Sprintf("gpu-%d-overview.svg", idx)
|
|
||||||
svg, ok, err := renderGPUOverviewChartSVG(idx, samples, timeline)
|
|
||||||
if err != nil || !ok || len(svg) == 0 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
path := filepath.Join(dir, file)
|
|
||||||
if err := os.WriteFile(path, svg, 0644); err != nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
charts = append(charts, taskReportChart{Title: gpuDisplayLabel(idx) + " Overview", File: file})
|
|
||||||
inline[file] = string(svg)
|
|
||||||
}
|
|
||||||
return charts, inline
|
return charts, inline
|
||||||
}
|
}
|
||||||
|
|
||||||
func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) (string, []byte, bool) {
|
func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) (string, []byte, bool) {
|
||||||
|
if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
|
||||||
|
buf, hasData, err := renderGPUOverviewChartSVG(idx, samples, timeline)
|
||||||
|
if err != nil || !hasData {
|
||||||
|
return "", nil, false
|
||||||
|
}
|
||||||
|
return gpuDisplayLabel(idx) + " Overview", buf, true
|
||||||
|
}
|
||||||
datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
|
datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
|
||||||
if !ok {
|
if !ok {
|
||||||
return "", nil, false
|
return "", nil, false
|
||||||
@@ -225,15 +230,16 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
|
|||||||
b.WriteString(`<div style="margin-top:14px;font-size:13px;color:var(--muted)">`)
|
b.WriteString(`<div style="margin-top:14px;font-size:13px;color:var(--muted)">`)
|
||||||
b.WriteString(`Started: ` + formatTaskTime(report.StartedAt, report.CreatedAt) + ` | Finished: ` + formatTaskTime(report.DoneAt, time.Time{}) + ` | Duration: ` + formatTaskDuration(report.DurationSec))
|
b.WriteString(`Started: ` + formatTaskTime(report.StartedAt, report.CreatedAt) + ` | Finished: ` + formatTaskTime(report.DoneAt, time.Time{}) + ` | Duration: ` + formatTaskDuration(report.DurationSec))
|
||||||
b.WriteString(`</div></div></div>`)
|
b.WriteString(`</div></div></div>`)
|
||||||
|
if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" {
|
||||||
|
b.WriteString(benchmarkCard)
|
||||||
|
}
|
||||||
|
|
||||||
if len(report.Charts) > 0 {
|
if len(report.Charts) > 0 {
|
||||||
b.WriteString(`<div class="grid2">`)
|
|
||||||
for _, chart := range report.Charts {
|
for _, chart := range report.Charts {
|
||||||
b.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(chart.Title) + `</div><div class="card-body" style="padding:12px">`)
|
b.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(chart.Title) + `</div><div class="card-body" style="padding:12px">`)
|
||||||
b.WriteString(charts[chart.File])
|
b.WriteString(charts[chart.File])
|
||||||
b.WriteString(`</div></div>`)
|
b.WriteString(`</div></div>`)
|
||||||
}
|
}
|
||||||
b.WriteString(`</div>`)
|
|
||||||
} else {
|
} else {
|
||||||
b.WriteString(`<div class="alert alert-info">No metric samples were captured during this task window.</div>`)
|
b.WriteString(`<div class="alert alert-info">No metric samples were captured during this task window.</div>`)
|
||||||
}
|
}
|
||||||
@@ -244,6 +250,57 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
|
|||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func renderTaskBenchmarkResultsCard(target, logText string) string {
|
||||||
|
if strings.TrimSpace(target) != "nvidia-benchmark" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
resultPath := taskBenchmarkResultPath(logText)
|
||||||
|
if strings.TrimSpace(resultPath) == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
columns, runs := loadBenchmarkHistoryFromPaths([]string{resultPath})
|
||||||
|
if len(runs) == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return renderBenchmarkResultsCardFromRuns(
|
||||||
|
"Benchmark Results",
|
||||||
|
"Composite score for this benchmark task.",
|
||||||
|
"No benchmark results were saved for this task.",
|
||||||
|
columns,
|
||||||
|
runs,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskBenchmarkResultPath(logText string) string {
|
||||||
|
archivePath := taskArchivePathFromLog(logText)
|
||||||
|
if archivePath == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||||
|
if runDir == archivePath {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return filepath.Join(runDir, "result.json")
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskArchivePathFromLog(logText string) string {
|
||||||
|
lines := strings.Split(logText, "\n")
|
||||||
|
for i := len(lines) - 1; i >= 0; i-- {
|
||||||
|
line := strings.TrimSpace(lines[i])
|
||||||
|
if line == "" || !strings.HasPrefix(line, "Archive:") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
path := strings.TrimSpace(strings.TrimPrefix(line, "Archive:"))
|
||||||
|
if strings.HasPrefix(path, "Archive written to ") {
|
||||||
|
path = strings.TrimSpace(strings.TrimPrefix(path, "Archive written to "))
|
||||||
|
}
|
||||||
|
if strings.HasSuffix(path, ".tar.gz") {
|
||||||
|
return path
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
func renderTaskStatusBadge(status string) string {
|
func renderTaskStatusBadge(status string) string {
|
||||||
className := map[string]string{
|
className := map[string]string{
|
||||||
TaskRunning: "badge-ok",
|
TaskRunning: "badge-ok",
|
||||||
|
|||||||
@@ -115,14 +115,17 @@ type Task struct {
|
|||||||
// taskParams holds optional parameters parsed from the run request.
|
// taskParams holds optional parameters parsed from the run request.
|
||||||
type taskParams struct {
|
type taskParams struct {
|
||||||
Duration int `json:"duration,omitempty"`
|
Duration int `json:"duration,omitempty"`
|
||||||
DiagLevel int `json:"diag_level,omitempty"`
|
StressMode bool `json:"stress_mode,omitempty"`
|
||||||
GPUIndices []int `json:"gpu_indices,omitempty"`
|
GPUIndices []int `json:"gpu_indices,omitempty"`
|
||||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
|
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
|
||||||
|
StaggerGPUStart bool `json:"stagger_gpu_start,omitempty"`
|
||||||
SizeMB int `json:"size_mb,omitempty"`
|
SizeMB int `json:"size_mb,omitempty"`
|
||||||
|
Passes int `json:"passes,omitempty"`
|
||||||
Loader string `json:"loader,omitempty"`
|
Loader string `json:"loader,omitempty"`
|
||||||
BurnProfile string `json:"burn_profile,omitempty"`
|
BurnProfile string `json:"burn_profile,omitempty"`
|
||||||
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
|
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
|
||||||
RunNCCL bool `json:"run_nccl,omitempty"`
|
RunNCCL bool `json:"run_nccl,omitempty"`
|
||||||
|
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||||
DisplayName string `json:"display_name,omitempty"`
|
DisplayName string `json:"display_name,omitempty"`
|
||||||
Device string `json:"device,omitempty"` // for install
|
Device string `json:"device,omitempty"` // for install
|
||||||
PlatformComponents []string `json:"platform_components,omitempty"`
|
PlatformComponents []string `json:"platform_components,omitempty"`
|
||||||
@@ -160,6 +163,13 @@ func resolveBurnPreset(profile string) burnPreset {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func boolToNvidiaStaggerSeconds(enabled bool, selected []int) int {
|
||||||
|
if enabled && len(selected) > 1 {
|
||||||
|
return 180
|
||||||
|
}
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
|
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
|
||||||
acceptanceCycles := []platform.PlatformStressCycle{
|
acceptanceCycles := []platform.PlatformStressCycle{
|
||||||
{LoadSec: 85, IdleSec: 5},
|
{LoadSec: 85, IdleSec: 5},
|
||||||
@@ -214,11 +224,11 @@ var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}
|
|||||||
const maxTaskHistory = 50
|
const maxTaskHistory = 50
|
||||||
|
|
||||||
var (
|
var (
|
||||||
runMemoryAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
runMemoryAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
||||||
return a.RunMemoryAcceptancePackCtx(ctx, baseDir, logFunc)
|
return a.RunMemoryAcceptancePackCtx(ctx, baseDir, sizeMB, passes, logFunc)
|
||||||
}
|
}
|
||||||
runStorageAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
runStorageAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||||
return a.RunStorageAcceptancePackCtx(ctx, baseDir, logFunc)
|
return a.RunStorageAcceptancePackCtx(ctx, baseDir, extended, logFunc)
|
||||||
}
|
}
|
||||||
runCPUAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
runCPUAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
return a.RunCPUAcceptancePackCtx(ctx, baseDir, durationSec, logFunc)
|
return a.RunCPUAcceptancePackCtx(ctx, baseDir, durationSec, logFunc)
|
||||||
@@ -258,6 +268,7 @@ func (q *taskQueue) enqueue(t *Task) {
|
|||||||
q.prune()
|
q.prune()
|
||||||
q.persistLocked()
|
q.persistLocked()
|
||||||
q.mu.Unlock()
|
q.mu.Unlock()
|
||||||
|
taskSerialEvent(t, "queued")
|
||||||
select {
|
select {
|
||||||
case q.trigger <- struct{}{}:
|
case q.trigger <- struct{}{}:
|
||||||
default:
|
default:
|
||||||
@@ -422,44 +433,30 @@ func (q *taskQueue) worker() {
|
|||||||
setCPUGovernor("performance")
|
setCPUGovernor("performance")
|
||||||
defer setCPUGovernor("powersave")
|
defer setCPUGovernor("powersave")
|
||||||
|
|
||||||
// Drain all pending tasks and start them in parallel.
|
|
||||||
q.mu.Lock()
|
|
||||||
var batch []*Task
|
|
||||||
for {
|
for {
|
||||||
|
q.mu.Lock()
|
||||||
t := q.nextPending()
|
t := q.nextPending()
|
||||||
if t == nil {
|
if t == nil {
|
||||||
break
|
q.prune()
|
||||||
|
q.persistLocked()
|
||||||
|
q.mu.Unlock()
|
||||||
|
return
|
||||||
}
|
}
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
t.Status = TaskRunning
|
t.Status = TaskRunning
|
||||||
t.StartedAt = &now
|
t.StartedAt = &now
|
||||||
t.DoneAt = nil
|
t.DoneAt = nil
|
||||||
t.ErrMsg = ""
|
t.ErrMsg = ""
|
||||||
j := newTaskJobState(t.LogPath)
|
j := newTaskJobState(t.LogPath, taskSerialPrefix(t))
|
||||||
t.job = j
|
t.job = j
|
||||||
batch = append(batch, t)
|
|
||||||
}
|
|
||||||
if len(batch) > 0 {
|
|
||||||
q.persistLocked()
|
q.persistLocked()
|
||||||
}
|
q.mu.Unlock()
|
||||||
q.mu.Unlock()
|
|
||||||
|
|
||||||
var wg sync.WaitGroup
|
|
||||||
for _, t := range batch {
|
|
||||||
t := t
|
|
||||||
j := t.job
|
|
||||||
taskCtx, taskCancel := context.WithCancel(context.Background())
|
taskCtx, taskCancel := context.WithCancel(context.Background())
|
||||||
j.cancel = taskCancel
|
j.cancel = taskCancel
|
||||||
wg.Add(1)
|
q.executeTask(t, j, taskCtx)
|
||||||
goRecoverOnce("task "+t.Target, func() {
|
taskCancel()
|
||||||
defer wg.Done()
|
|
||||||
defer taskCancel()
|
|
||||||
q.executeTask(t, j, taskCtx)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
wg.Wait()
|
|
||||||
|
|
||||||
if len(batch) > 0 {
|
|
||||||
q.mu.Lock()
|
q.mu.Lock()
|
||||||
q.prune()
|
q.prune()
|
||||||
q.persistLocked()
|
q.persistLocked()
|
||||||
@@ -520,6 +517,11 @@ func (q *taskQueue) finalizeTaskRun(t *Task, j *jobState) {
|
|||||||
if err := writeTaskReportArtifacts(t); err != nil {
|
if err := writeTaskReportArtifacts(t); err != nil {
|
||||||
appendJobLog(t.LogPath, "WARN: task report generation failed: "+err.Error())
|
appendJobLog(t.LogPath, "WARN: task report generation failed: "+err.Error())
|
||||||
}
|
}
|
||||||
|
if t.ErrMsg != "" {
|
||||||
|
taskSerialEvent(t, "finished with status="+t.Status+" error="+t.ErrMsg)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
taskSerialEvent(t, "finished with status="+t.Status)
|
||||||
}
|
}
|
||||||
|
|
||||||
// setCPUGovernor writes the given governor to all CPU scaling_governor sysfs files.
|
// setCPUGovernor writes the given governor to all CPU scaling_governor sysfs files.
|
||||||
@@ -559,7 +561,10 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
diagLevel := t.params.DiagLevel
|
diagLevel := 2
|
||||||
|
if t.params.StressMode {
|
||||||
|
diagLevel = 3
|
||||||
|
}
|
||||||
if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
|
if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
|
||||||
result, e := a.RunNvidiaAcceptancePackWithOptions(
|
result, e := a.RunNvidiaAcceptancePackWithOptions(
|
||||||
ctx, "", diagLevel, t.params.GPUIndices, j.append,
|
ctx, "", diagLevel, t.params.GPUIndices, j.append,
|
||||||
@@ -593,8 +598,9 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
GPUIndices: t.params.GPUIndices,
|
GPUIndices: t.params.GPUIndices,
|
||||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||||
RunNCCL: t.params.RunNCCL,
|
RunNCCL: t.params.RunNCCL,
|
||||||
|
ParallelGPUs: t.params.ParallelGPUs,
|
||||||
}, j.append)
|
}, j.append)
|
||||||
case "nvidia-compute":
|
case "nvidia-compute":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
break
|
break
|
||||||
@@ -603,7 +609,11 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
if t.params.BurnProfile != "" && dur <= 0 {
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
}
|
}
|
||||||
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, j.append)
|
staggerSec := boolToNvidiaStaggerSeconds(t.params.StaggerGPUStart, t.params.GPUIndices)
|
||||||
|
if staggerSec > 0 {
|
||||||
|
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU", staggerSec))
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, staggerSec, j.append)
|
||||||
case "nvidia-targeted-power":
|
case "nvidia-targeted-power":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
@@ -653,24 +663,29 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
if t.params.BurnProfile != "" && dur <= 0 {
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
}
|
}
|
||||||
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
||||||
DurationSec: dur,
|
DurationSec: dur,
|
||||||
Loader: t.params.Loader,
|
Loader: t.params.Loader,
|
||||||
GPUIndices: t.params.GPUIndices,
|
GPUIndices: t.params.GPUIndices,
|
||||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||||
}, j.append)
|
StaggerSeconds: boolToNvidiaStaggerSeconds(t.params.StaggerGPUStart, t.params.GPUIndices),
|
||||||
|
}, j.append)
|
||||||
case "memory":
|
case "memory":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", j.append)
|
sizeMB, passes := 256, 1
|
||||||
|
if t.params.StressMode {
|
||||||
|
sizeMB, passes = 1024, 3
|
||||||
|
}
|
||||||
|
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
|
||||||
case "storage":
|
case "storage":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
archive, err = runStorageAcceptancePackCtx(a, ctx, "", j.append)
|
archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
|
||||||
case "cpu":
|
case "cpu":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
@@ -681,7 +696,11 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
}
|
}
|
||||||
if dur <= 0 {
|
if dur <= 0 {
|
||||||
dur = 60
|
if t.params.StressMode {
|
||||||
|
dur = 1800
|
||||||
|
} else {
|
||||||
|
dur = 60
|
||||||
|
}
|
||||||
}
|
}
|
||||||
j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
|
j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
|
||||||
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
|
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
|
||||||
@@ -858,6 +877,7 @@ func (h *handler) handleAPITasksCancel(w http.ResponseWriter, r *http.Request) {
|
|||||||
now := time.Now()
|
now := time.Now()
|
||||||
t.DoneAt = &now
|
t.DoneAt = &now
|
||||||
globalQueue.persistLocked()
|
globalQueue.persistLocked()
|
||||||
|
taskSerialEvent(t, "finished with status="+t.Status)
|
||||||
writeJSON(w, map[string]string{"status": "cancelled"})
|
writeJSON(w, map[string]string{"status": "cancelled"})
|
||||||
case TaskRunning:
|
case TaskRunning:
|
||||||
if t.job != nil {
|
if t.job != nil {
|
||||||
@@ -867,6 +887,7 @@ func (h *handler) handleAPITasksCancel(w http.ResponseWriter, r *http.Request) {
|
|||||||
now := time.Now()
|
now := time.Now()
|
||||||
t.DoneAt = &now
|
t.DoneAt = &now
|
||||||
globalQueue.persistLocked()
|
globalQueue.persistLocked()
|
||||||
|
taskSerialEvent(t, "finished with status="+t.Status)
|
||||||
writeJSON(w, map[string]string{"status": "cancelled"})
|
writeJSON(w, map[string]string{"status": "cancelled"})
|
||||||
default:
|
default:
|
||||||
writeError(w, http.StatusConflict, "task is not running or pending")
|
writeError(w, http.StatusConflict, "task is not running or pending")
|
||||||
@@ -907,6 +928,7 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
|
|||||||
case TaskPending:
|
case TaskPending:
|
||||||
t.Status = TaskCancelled
|
t.Status = TaskCancelled
|
||||||
t.DoneAt = &now
|
t.DoneAt = &now
|
||||||
|
taskSerialEvent(t, "finished with status="+t.Status)
|
||||||
n++
|
n++
|
||||||
case TaskRunning:
|
case TaskRunning:
|
||||||
if t.job != nil {
|
if t.job != nil {
|
||||||
@@ -914,6 +936,7 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
|
|||||||
}
|
}
|
||||||
t.Status = TaskCancelled
|
t.Status = TaskCancelled
|
||||||
t.DoneAt = &now
|
t.DoneAt = &now
|
||||||
|
taskSerialEvent(t, "finished with status="+t.Status)
|
||||||
n++
|
n++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -932,6 +955,7 @@ func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Reque
|
|||||||
case TaskPending:
|
case TaskPending:
|
||||||
t.Status = TaskCancelled
|
t.Status = TaskCancelled
|
||||||
t.DoneAt = &now
|
t.DoneAt = &now
|
||||||
|
taskSerialEvent(t, "finished with status="+t.Status)
|
||||||
cancelled++
|
cancelled++
|
||||||
case TaskRunning:
|
case TaskRunning:
|
||||||
if t.job != nil {
|
if t.job != nil {
|
||||||
@@ -939,6 +963,7 @@ func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Reque
|
|||||||
}
|
}
|
||||||
t.Status = TaskCancelled
|
t.Status = TaskCancelled
|
||||||
t.DoneAt = &now
|
t.DoneAt = &now
|
||||||
|
taskSerialEvent(t, "finished with status="+t.Status)
|
||||||
cancelled++
|
cancelled++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1151,7 +1176,32 @@ func taskArtifactsDir(root string, t *Task, status string) string {
|
|||||||
if strings.TrimSpace(root) == "" || t == nil {
|
if strings.TrimSpace(root) == "" || t == nil {
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
return filepath.Join(root, fmt.Sprintf("%s_%s_%s", t.ID, sanitizeTaskFolderPart(t.Name), taskFolderStatus(status)))
|
prefix := taskFolderNumberPrefix(t.ID)
|
||||||
|
return filepath.Join(root, fmt.Sprintf("%s_%s_%s", prefix, sanitizeTaskFolderPart(t.Name), taskFolderStatus(status)))
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskFolderNumberPrefix(taskID string) string {
|
||||||
|
taskID = strings.TrimSpace(taskID)
|
||||||
|
if strings.HasPrefix(taskID, "TASK-") && len(taskID) >= len("TASK-000") {
|
||||||
|
num := strings.TrimSpace(strings.TrimPrefix(taskID, "TASK-"))
|
||||||
|
if len(num) == 3 {
|
||||||
|
allDigits := true
|
||||||
|
for _, r := range num {
|
||||||
|
if r < '0' || r > '9' {
|
||||||
|
allDigits = false
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if allDigits {
|
||||||
|
return num
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fallback := sanitizeTaskFolderPart(taskID)
|
||||||
|
if fallback == "" {
|
||||||
|
return "000"
|
||||||
|
}
|
||||||
|
return fallback
|
||||||
}
|
}
|
||||||
|
|
||||||
func ensureTaskReportPaths(t *Task) {
|
func ensureTaskReportPaths(t *Task) {
|
||||||
|
|||||||
@@ -163,6 +163,40 @@ func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestNewJobIDUsesTASKPrefixAndZeroPadding(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
origTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = nil
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
origCounter := jobCounter.Load()
|
||||||
|
jobCounter.Store(0)
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = origTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
jobCounter.Store(origCounter)
|
||||||
|
})
|
||||||
|
|
||||||
|
if got := newJobID("ignored"); got != "TASK-000" {
|
||||||
|
t.Fatalf("id=%q want TASK-000", got)
|
||||||
|
}
|
||||||
|
if got := newJobID("ignored"); got != "TASK-001" {
|
||||||
|
t.Fatalf("id=%q want TASK-001", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTaskArtifactsDirStartsWithTaskNumber(t *testing.T) {
|
||||||
|
root := t.TempDir()
|
||||||
|
task := &Task{
|
||||||
|
ID: "TASK-007",
|
||||||
|
Name: "NVIDIA Benchmark",
|
||||||
|
}
|
||||||
|
got := filepath.Base(taskArtifactsDir(root, task, TaskDone))
|
||||||
|
if !strings.HasPrefix(got, "007_") {
|
||||||
|
t.Fatalf("artifacts dir=%q want prefix 007_", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
|
func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
logPath := filepath.Join(dir, "task.log")
|
logPath := filepath.Join(dir, "task.log")
|
||||||
@@ -325,6 +359,121 @@ func TestFinalizeTaskRunCreatesReportFolderAndArtifacts(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
metricsPath := filepath.Join(dir, "metrics.db")
|
||||||
|
prevMetricsPath := taskReportMetricsDBPath
|
||||||
|
taskReportMetricsDBPath = metricsPath
|
||||||
|
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
|
||||||
|
|
||||||
|
benchmarkDir := filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000")
|
||||||
|
if err := os.MkdirAll(benchmarkDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
result := platform.NvidiaBenchmarkResult{
|
||||||
|
GeneratedAt: time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC),
|
||||||
|
BenchmarkProfile: "standard",
|
||||||
|
OverallStatus: "OK",
|
||||||
|
GPUs: []platform.BenchmarkGPUResult{
|
||||||
|
{
|
||||||
|
Index: 0,
|
||||||
|
Name: "NVIDIA H100 PCIe",
|
||||||
|
Scores: platform.BenchmarkScorecard{
|
||||||
|
CompositeScore: 1176.25,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
raw, err := json.Marshal(result)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(benchmarkDir, "result.json"), raw, 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
artifactsDir := filepath.Join(dir, "tasks", "task-bench_done")
|
||||||
|
if err := os.MkdirAll(artifactsDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
task := &Task{
|
||||||
|
ID: "task-bench",
|
||||||
|
Name: "NVIDIA Benchmark",
|
||||||
|
Target: "nvidia-benchmark",
|
||||||
|
Status: TaskDone,
|
||||||
|
CreatedAt: time.Now().UTC().Add(-time.Minute),
|
||||||
|
ArtifactsDir: artifactsDir,
|
||||||
|
}
|
||||||
|
ensureTaskReportPaths(task)
|
||||||
|
logText := "line-1\nArchive: " + filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000.tar.gz") + "\n"
|
||||||
|
if err := os.WriteFile(task.LogPath, []byte(logText), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := writeTaskReportArtifacts(task); err != nil {
|
||||||
|
t.Fatalf("writeTaskReportArtifacts: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
body, err := os.ReadFile(task.ReportHTMLPath)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ReadFile(report.html): %v", err)
|
||||||
|
}
|
||||||
|
html := string(body)
|
||||||
|
for _, needle := range []string{
|
||||||
|
`Benchmark Results`,
|
||||||
|
`Composite score for this benchmark task.`,
|
||||||
|
`GPU #0 — NVIDIA H100 PCIe`,
|
||||||
|
`1176.25`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(html, needle) {
|
||||||
|
t.Fatalf("report missing %q: %s", needle, html)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTaskLifecycleMirrorsToSerialConsole(t *testing.T) {
|
||||||
|
var lines []string
|
||||||
|
prev := taskSerialWriteLine
|
||||||
|
taskSerialWriteLine = func(line string) { lines = append(lines, line) }
|
||||||
|
t.Cleanup(func() { taskSerialWriteLine = prev })
|
||||||
|
|
||||||
|
dir := t.TempDir()
|
||||||
|
q := &taskQueue{
|
||||||
|
statePath: filepath.Join(dir, "tasks-state.json"),
|
||||||
|
logsDir: filepath.Join(dir, "tasks"),
|
||||||
|
trigger: make(chan struct{}, 1),
|
||||||
|
}
|
||||||
|
task := &Task{
|
||||||
|
ID: "task-serial-1",
|
||||||
|
Name: "CPU SAT",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskPending,
|
||||||
|
CreatedAt: time.Now().UTC(),
|
||||||
|
}
|
||||||
|
|
||||||
|
q.enqueue(task)
|
||||||
|
started := time.Now().UTC()
|
||||||
|
task.Status = TaskRunning
|
||||||
|
task.StartedAt = &started
|
||||||
|
job := newTaskJobState(task.LogPath, taskSerialPrefix(task))
|
||||||
|
job.append("Starting CPU SAT...")
|
||||||
|
job.append("CPU stress duration: 60s")
|
||||||
|
job.finish("")
|
||||||
|
q.finalizeTaskRun(task, job)
|
||||||
|
|
||||||
|
joined := strings.Join(lines, "\n")
|
||||||
|
for _, needle := range []string{
|
||||||
|
"queued",
|
||||||
|
"Starting CPU SAT...",
|
||||||
|
"CPU stress duration: 60s",
|
||||||
|
"finished with status=done",
|
||||||
|
} {
|
||||||
|
if !strings.Contains(joined, needle) {
|
||||||
|
t.Fatalf("serial mirror missing %q in %q", needle, joined)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestResolveBurnPreset(t *testing.T) {
|
func TestResolveBurnPreset(t *testing.T) {
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
profile string
|
profile string
|
||||||
|
|||||||
248
bible-local/docs/benchmark-clock-calibration.md
Normal file
248
bible-local/docs/benchmark-clock-calibration.md
Normal file
@@ -0,0 +1,248 @@
|
|||||||
|
# Benchmark clock calibration research
|
||||||
|
|
||||||
|
## Status
|
||||||
|
In progress. Baseline data from production servers pending.
|
||||||
|
|
||||||
|
## Background
|
||||||
|
|
||||||
|
The benchmark locks GPU clocks to `MaxGraphicsClockMHz` (boost) via `nvidia-smi -lgc`
|
||||||
|
before the steady-state phase. The metric `low_sm_clock_vs_target` fires when
|
||||||
|
`avg_steady_clock < locked_target * 0.90`.
|
||||||
|
|
||||||
|
Problem: boost clock is the theoretical maximum under ideal cooling. In practice,
|
||||||
|
even a healthy GPU in a non-ideal server will sustain clocks well below boost.
|
||||||
|
The 90% threshold has no empirical basis.
|
||||||
|
|
||||||
|
## Key observations (2026-04-06)
|
||||||
|
|
||||||
|
### H100 PCIe — new card, server not designed for it
|
||||||
|
- avg clock 1384 MHz, P95 1560 MHz (unstable; boost probably 1755 MHz)
|
||||||
|
- Thermal sustain: 0.0 (sw_thermal covers entire steady window)
|
||||||
|
- Stability: 70.0 — clocks erratic, no equilibrium found
|
||||||
|
- Degradation: power_capped, thermal_limited, low_sm_clock_vs_target, variance_too_high
|
||||||
|
|
||||||
|
### H200 NVL — new card, server not designed for it
|
||||||
|
- avg clock = P95 = 1635 MHz (perfectly stable)
|
||||||
|
- Thermal sustain: 0.0 (sw_thermal + sw_power cover entire steady window)
|
||||||
|
- Stability: 92.0 — found stable thermal equilibrium at 1635 MHz
|
||||||
|
- Degradation: power_capped, thermal_limited
|
||||||
|
- Compute: 989 TOPS — card is computing correctly for its frequency
|
||||||
|
|
||||||
|
### Key insight
|
||||||
|
The meaningful distinction is not *whether* the card throttles but *how stably*
|
||||||
|
it throttles. H200 found a thermal equilibrium (avg == P95, Stability 92),
|
||||||
|
H100 did not (avg << P95, Stability 70). Both are new cards; the H100's
|
||||||
|
instability may reflect a more severe thermal mismatch or a card issue.
|
||||||
|
|
||||||
|
`sw_power ≈ sw_thermal` pattern = server cooling constraint, card likely OK.
|
||||||
|
`hw_thermal >> sw_thermal` pattern = card itself overheating, investigate.
|
||||||
|
|
||||||
|
## Hypothesis for baseline
|
||||||
|
|
||||||
|
After testing on servers designed for their GPUs (proper cooling):
|
||||||
|
- Healthy GPU under sustained load will run at a stable fraction of boost
|
||||||
|
- Expected: avg_steady ≈ 80–95% of boost depending on model and TDP class
|
||||||
|
- Base clock (`clocks.base.gr`) may be a better reference than boost:
|
||||||
|
a healthy card under real workload should comfortably exceed base clock
|
||||||
|
|
||||||
|
## Baseline: H100 PCIe HBM2e — designed server (2026-04-06, 10 samples)
|
||||||
|
|
||||||
|
Source: external stress test tool, ~90s runs, designed server, adequate power.
|
||||||
|
|
||||||
|
### Healthy fingerprint
|
||||||
|
|
||||||
|
- **Power**: hits cap ~340–360W immediately, stays flat throughout — HEALTHY
|
||||||
|
- **Clock**: starts ~1750 MHz, oscillates and declines to ~1540–1600 MHz by 90s
|
||||||
|
- Avg steady (visual): **~1580–1620 MHz**
|
||||||
|
- vs boost 1755 MHz: **~91–92%**
|
||||||
|
- Oscillation is NORMAL — this is the boost algorithm balancing under power cap
|
||||||
|
- Stable power + oscillating clocks = healthy power-cap behavior
|
||||||
|
- **Temperature**: linear rise ~38°C → 75–80°C over 90s (no runaway)
|
||||||
|
- **Consistency**: all 10 samples within ±20 MHz — very repeatable
|
||||||
|
|
||||||
|
### Characteristic pattern
|
||||||
|
Flat power line + oscillating/declining clock line = GPU correctly managed by
|
||||||
|
power cap algorithm. Do NOT flag this as instability.
|
||||||
|
|
||||||
|
### Clock CV implication
|
||||||
|
The healthy oscillation WILL produce moderate ClockCVPct (~5–10%).
|
||||||
|
The current `variance_too_high` threshold (StabilityScore < 85) may fire on
|
||||||
|
healthy HBM2e PCIe cards. Needs recalibration.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Baseline: H100 HBM3 OEM SXM Custom (restored) — 2 confirmed samples
|
||||||
|
|
||||||
|
Source: pytorch_training_loop stress test, 120s (90s stress + 30s cooldown).
|
||||||
|
Confirmed GPU: NVIDIA H100 80GB HBM3, GH100 rev a1.
|
||||||
|
|
||||||
|
### GPU clock reference (from nvidia-smi, idle):
|
||||||
|
- base_clock_mhz: **1095**
|
||||||
|
- boost_clock_mhz: **1755** (nvidia-smi `clocks.max.graphics` at idle)
|
||||||
|
- achieved_max_clock_mhz: **1980** (actual burst max observed by tool)
|
||||||
|
- Our benchmark locks to `clocks.max.graphics` = likely 1980 MHz for this chip
|
||||||
|
|
||||||
|
### Observed under 700W sustained load (both samples nearly identical):
|
||||||
|
- Power: ~700W flat — SXM slot, adequate power confirmed
|
||||||
|
- Clock steady range: **~1380–1480 MHz**, avg **~1420–1460 MHz**
|
||||||
|
- vs 1980 MHz (lock target): **72–74%** — severely below
|
||||||
|
- vs 1755 MHz (nvidia-smi boost): **81–83%**
|
||||||
|
- vs 1095 MHz (base): 130% — above base but far below expected for SXM
|
||||||
|
- Clock/Watt: ~2.1 MHz/W vs HBM2e ~4.6 MHz/W — 2× worse efficiency
|
||||||
|
- Temperature: 38°C → 79–80°C (same rate as HBM2e)
|
||||||
|
- Oscillation: present, similar character to HBM2e but at much lower frequency
|
||||||
|
|
||||||
|
### Diagnosis
|
||||||
|
These restored cards are degraded. A healthy H100 SXM in a designed server
|
||||||
|
(DGX H100, HGX H100) should sustain ~1800–1900 MHz at 700W (~91–96% of 1980).
|
||||||
|
The 72–74% result is a clear signal of silicon or VRM degradation from the
|
||||||
|
refurbishment process.
|
||||||
|
|
||||||
|
### Clock pattern note
|
||||||
|
Images 8/9 (previously marked as "HBM3 restored") are now confirmed identical
|
||||||
|
to images 19/20. Both sample sets show same degraded pattern — same batch.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Baseline matrix (filled where data available)
|
||||||
|
|
||||||
|
| GPU model | Config | Avg clock steady | vs boost | Clock/Watt | Notes |
|
||||||
|
|---|---|---|---|---|---|
|
||||||
|
| H100 PCIe HBM2e | designed server | 1580–1620 MHz | 91–92% | ~4.6 MHz/W | 10 samples, healthy |
|
||||||
|
| H100 SXM HBM3 restored | 700W full | 1420–1460 MHz | 72–74% of 1980 | ~2.1 MHz/W | 4 samples confirmed, degraded |
|
||||||
|
| H100 SXM HBM3 healthy | designed | ~1800–1900 MHz est. | ~91–96% est. | ~2.7 MHz/W est. | need real baseline |
|
||||||
|
| H200 NVL | designed | TBD | TBD | TBD | need baseline |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## H100 official spec (from NVIDIA datasheet)
|
||||||
|
|
||||||
|
Source: NVIDIA H100 Tensor Core GPU Datasheet (image 23, 2026-04-06).
|
||||||
|
All TOPS marked * are with structural sparsity enabled. Divide by 2 for dense.
|
||||||
|
|
||||||
|
| Model | FP16 Tensor (dense) | TF32 (dense) | FP8 (dense) | TDP | Memory |
|
||||||
|
|---|---|---|---|---|---|
|
||||||
|
| H100 80GB PCIe | 756 TFLOPS | 378 TFLOPS | 1,513 TFLOPS | 350W | HBM2e |
|
||||||
|
| H100 NVL 94GB PCIe | 990 TFLOPS | 495 TFLOPS | 1,980 TFLOPS | 400W | HBM3 |
|
||||||
|
| H100 80GB SXM (BQQV) | 989 TFLOPS | 494 TFLOPS | — | 700W | HBM3 |
|
||||||
|
| H100 94GB SXM (BUBB) | 989 TFLOPS | 494 TFLOPS | — | 700W | HBM2e |
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- SXM boards do NOT list FP8 peak in this table (field empty)
|
||||||
|
- fp8_e5m2 is unsupported on H100 PCIe HBM2e — confirmed in our tests
|
||||||
|
- Tensor Cores: PCIe = 456, SXM = 528 (16% more on SXM)
|
||||||
|
|
||||||
|
## Observed efficiency (H100 80GB PCIe, throttled server)
|
||||||
|
|
||||||
|
From the report in this session (power+thermal throttle throughout steady):
|
||||||
|
|
||||||
|
| Precision | Measured | Spec (dense) | % of spec |
|
||||||
|
|---|---|---|---|
|
||||||
|
| fp16_tensor | 329 TOPS | 756 TFLOPS | 44% |
|
||||||
|
| fp32_tf32 | 115 TOPS | 378 TFLOPS | 30% |
|
||||||
|
| fp8_e4m3 | 505 TOPS | 1,513 TFLOPS | 33% |
|
||||||
|
|
||||||
|
30–44% of spec is expected given sustained power+thermal throttle (avg clock
|
||||||
|
1384 MHz vs boost 1755 MHz = 79%). The GPU is computing correctly for its
|
||||||
|
actual frequency — the low TOPS comes from throttle, not silicon defect.
|
||||||
|
|
||||||
|
## H200 official spec (from NVIDIA datasheet, image 24, 2026-04-06)
|
||||||
|
|
||||||
|
Format: without sparsity / with sparsity.
|
||||||
|
|
||||||
|
| Model | FP16 Tensor (dense) | TF32 (dense) | FP8 (dense) | TDP | Memory |
|
||||||
|
|---|---|---|---|---|---|
|
||||||
|
| H200 NVL PCIe | 836 TFLOPS | 418 TFLOPS | 1,570 TFLOPS | 600W | HBM3e 141GB |
|
||||||
|
| H200 SXM | 990 TFLOPS | 495 TFLOPS | 1,979 TFLOPS | 700W | HBM3e 141GB |
|
||||||
|
|
||||||
|
## Observed efficiency (H200 NVL PCIe, throttled non-designed server)
|
||||||
|
|
||||||
|
Avg clock 1635 MHz (62% of boost ~2619 MHz). Entire steady in thermal throttle.
|
||||||
|
|
||||||
|
| Precision | Measured | Spec (dense) | % of spec |
|
||||||
|
|---|---|---|---|
|
||||||
|
| fp16_tensor | 340 TOPS | 836 TFLOPS | 41% |
|
||||||
|
| fp32_tf32 | 120 TOPS | 418 TFLOPS | 29% |
|
||||||
|
| fp8_e4m3 | 529 TOPS | 1,570 TFLOPS | 34% |
|
||||||
|
|
||||||
|
Comparable to H100 PCIe efficiency (30–44%) despite different architecture —
|
||||||
|
both are throttle-limited. Confirms that % of spec is not a quality signal,
|
||||||
|
it reflects the thermal environment. tops_per_sm_per_ghz is the right metric.
|
||||||
|
|
||||||
|
## Real-world GEMM efficiency reference (2026-04-06, web research)
|
||||||
|
|
||||||
|
Sources: SemiAnalysis MI300X vs H100 vs H200 training benchmark; cuBLAS optimization
|
||||||
|
worklog (hamzaelshafie.bearblog.dev); Lambda AI H100 performance analysis.
|
||||||
|
|
||||||
|
### What healthy systems actually achieve:
|
||||||
|
- H100 SXM in designed server: **~720 TFLOPS FP16 = ~73% of spec**
|
||||||
|
- cuBLAS large square GEMM (8192³): up to **~83% flop utilization**
|
||||||
|
- H200 NVL PCIe: no public data, extrapolating ~73% → ~610 TFLOPS FP16
|
||||||
|
|
||||||
|
### Our results vs expectation:
|
||||||
|
| GPU | Our FP16 | Expected (73%) | Our % of spec | Gap |
|
||||||
|
|---|---|---|---|---|
|
||||||
|
| H100 PCIe HBM2e | 329 TOPS | ~552 TFLOPS | 44% | ~1.7× below |
|
||||||
|
| H200 NVL PCIe | 340 TOPS | ~610 TFLOPS | 41% | ~1.8× below |
|
||||||
|
|
||||||
|
Our results are roughly **half** of what a healthy system achieves even under throttle.
|
||||||
|
This is NOT normal — 30-44% is not the industry baseline.
|
||||||
|
|
||||||
|
### Likely causes of the gap (in order of probability):
|
||||||
|
1. **Thermal throttle** — confirmed, sw_thermal covers entire steady window
|
||||||
|
2. **Power limit below TDP** — GPU may be software-limited below 350W/600W.
|
||||||
|
Previous user may have set a lower limit via nvidia-smi -pl and it was not
|
||||||
|
reset. Our normalization sets clock locks but does NOT reset power limit.
|
||||||
|
Key check: `nvidia-smi -q | grep "Power Limit"` — default vs enforced.
|
||||||
|
3. **Matrix size** — ruled out. bee-gpu-burn uses 4096×4096×4096 for fp16,
|
||||||
|
8192×8192×4096 for fp8. These are large enough for peak tensor utilization.
|
||||||
|
|
||||||
|
### Power limit gap analysis (H100 PCIe):
|
||||||
|
- Avg clock 1384 MHz = 79% of boost 1755 MHz
|
||||||
|
- Expected TOPS at 79% clock: 756 × 0.79 ≈ 597 TFLOPS
|
||||||
|
- Actually measured: 329 TOPS = 55% of that estimate
|
||||||
|
- Remaining gap after accounting for clock throttle: ~45%
|
||||||
|
- Most likely explanation: enforced power limit < 350W TDP, further reducing
|
||||||
|
sustainable clock beyond what sw_thermal alone would cause.
|
||||||
|
|
||||||
|
### Action item:
|
||||||
|
Add `power.limit` (enforced) AND `power.default_limit` to queryBenchmarkGPUInfo
|
||||||
|
so result.json shows if the card was pre-configured with a non-default limit.
|
||||||
|
If enforced < default × 0.95 → add finding "GPU power limit is below default TDP".
|
||||||
|
|
||||||
|
### CPU/RAM impact on GPU FLOPS:
|
||||||
|
None. Pure on-GPU GEMM is fully compute-bound once data is in VRAM.
|
||||||
|
CPU core count and host RAM are irrelevant.
|
||||||
|
|
||||||
|
## Compute efficiency metric (proposed, no hardcode)
|
||||||
|
|
||||||
|
Instead of comparing TOPS to a hardcoded spec, compute:
|
||||||
|
tops_per_sm_per_ghz = measured_tops / (sm_count × avg_clock_ghz)
|
||||||
|
|
||||||
|
This is model-agnostic. A GPU computing correctly at its actual frequency
|
||||||
|
will show a consistent tops_per_sm_per_ghz regardless of throttle level.
|
||||||
|
A GPU with degraded silicon will show low tops_per_sm_per_ghz even at
|
||||||
|
normal clocks.
|
||||||
|
|
||||||
|
SM count is queryable: nvidia-smi --query-gpu=attribute.multiprocessor_count
|
||||||
|
(needs to be added to queryBenchmarkGPUInfo).
|
||||||
|
|
||||||
|
Reference values to establish after baseline runs:
|
||||||
|
- H100 PCIe fp16_tensor: TBD tops/SM/GHz
|
||||||
|
- H100 SXM fp16_tensor: TBD tops/SM/GHz
|
||||||
|
|
||||||
|
## Proposed threshold changes (pending more data)
|
||||||
|
|
||||||
|
1. **`low_sm_clock_vs_target`**: lower threshold from 90% to 85% based on observed
|
||||||
|
91–92% on healthy HBM2e. Or remove entirely — sw_power/sw_thermal already
|
||||||
|
capture the root cause.
|
||||||
|
|
||||||
|
2. **`variance_too_high`** (StabilityScore < 85): healthy HBM2e WILL oscillate
|
||||||
|
under power cap. Consider suppressing this flag when power is flat and usage
|
||||||
|
is 100% (oscillation is expected). Or lower threshold to 70.
|
||||||
|
|
||||||
|
3. **New signal: MHz/Watt efficiency**: if base_graphics_clock_mhz is available,
|
||||||
|
ratio avg_clock / power_w could identify degraded silicon (HBM3 restored S1
|
||||||
|
would have been caught by this).
|
||||||
|
|
||||||
|
Decision deferred until baseline on SXM designed servers collected.
|
||||||
@@ -32,7 +32,7 @@ lb config noauto \
|
|||||||
--memtest memtest86+ \
|
--memtest memtest86+ \
|
||||||
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=6 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||||
--apt-recommends false \
|
--apt-recommends false \
|
||||||
--chroot-squashfs-compression-type zstd \
|
--chroot-squashfs-compression-type zstd \
|
||||||
"${@}"
|
"${@}"
|
||||||
|
|||||||
@@ -36,7 +36,6 @@ typedef void *CUstream;
|
|||||||
#define MAX_CUBLAS_PROFILES 5
|
#define MAX_CUBLAS_PROFILES 5
|
||||||
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
|
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
|
||||||
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
|
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
|
||||||
#define STRESS_LAUNCH_DEPTH 8
|
|
||||||
|
|
||||||
static const char *ptx_source =
|
static const char *ptx_source =
|
||||||
".version 6.0\n"
|
".version 6.0\n"
|
||||||
@@ -344,7 +343,6 @@ static int run_ptx_fallback(struct cuda_api *api,
|
|||||||
unsigned long iterations = 0;
|
unsigned long iterations = 0;
|
||||||
int mp_count = 0;
|
int mp_count = 0;
|
||||||
int stream_count = 1;
|
int stream_count = 1;
|
||||||
int launches_per_wave = 0;
|
|
||||||
|
|
||||||
memset(report, 0, sizeof(*report));
|
memset(report, 0, sizeof(*report));
|
||||||
snprintf(report->backend, sizeof(report->backend), "driver-ptx");
|
snprintf(report->backend, sizeof(report->backend), "driver-ptx");
|
||||||
@@ -419,44 +417,42 @@ static int run_ptx_fallback(struct cuda_api *api,
|
|||||||
|
|
||||||
unsigned int threads = 256;
|
unsigned int threads = 256;
|
||||||
|
|
||||||
double start = now_seconds();
|
double deadline = now_seconds() + (double)seconds;
|
||||||
double deadline = start + (double)seconds;
|
double next_sync = now_seconds() + 1.0;
|
||||||
while (now_seconds() < deadline) {
|
while (now_seconds() < deadline) {
|
||||||
launches_per_wave = 0;
|
int launched = 0;
|
||||||
for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
|
for (int lane = 0; lane < stream_count; lane++) {
|
||||||
int launched_this_batch = 0;
|
unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
|
||||||
for (int lane = 0; lane < stream_count; lane++) {
|
if (!check_rc(api,
|
||||||
unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
|
"cuLaunchKernel",
|
||||||
if (!check_rc(api,
|
api->cuLaunchKernel(kernel,
|
||||||
"cuLaunchKernel",
|
blocks,
|
||||||
api->cuLaunchKernel(kernel,
|
1,
|
||||||
blocks,
|
1,
|
||||||
1,
|
threads,
|
||||||
1,
|
1,
|
||||||
threads,
|
1,
|
||||||
1,
|
0,
|
||||||
1,
|
streams[lane],
|
||||||
0,
|
params[lane],
|
||||||
streams[lane],
|
NULL))) {
|
||||||
params[lane],
|
goto fail;
|
||||||
NULL))) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
launches_per_wave++;
|
|
||||||
launched_this_batch++;
|
|
||||||
}
|
|
||||||
if (launched_this_batch <= 0) {
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
launched++;
|
||||||
|
iterations++;
|
||||||
}
|
}
|
||||||
if (launches_per_wave <= 0) {
|
if (launched <= 0) {
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
|
double now = now_seconds();
|
||||||
goto fail;
|
if (now >= next_sync || now >= deadline) {
|
||||||
|
if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
next_sync = now + 1.0;
|
||||||
}
|
}
|
||||||
iterations += (unsigned long)launches_per_wave;
|
|
||||||
}
|
}
|
||||||
|
api->cuCtxSynchronize();
|
||||||
|
|
||||||
if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem[0], sizeof(sample)))) {
|
if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem[0], sizeof(sample)))) {
|
||||||
goto fail;
|
goto fail;
|
||||||
@@ -468,11 +464,10 @@ static int run_ptx_fallback(struct cuda_api *api,
|
|||||||
report->iterations = iterations;
|
report->iterations = iterations;
|
||||||
snprintf(report->details,
|
snprintf(report->details,
|
||||||
sizeof(report->details),
|
sizeof(report->details),
|
||||||
"fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d queue_depth=%d per_stream_mb=%zu iterations=%lu\n",
|
"fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d per_stream_mb=%zu iterations=%lu\n",
|
||||||
size_mb,
|
size_mb,
|
||||||
report->buffer_mb,
|
report->buffer_mb,
|
||||||
report->stream_count,
|
report->stream_count,
|
||||||
STRESS_LAUNCH_DEPTH,
|
|
||||||
bytes_per_stream[0] / (1024u * 1024u),
|
bytes_per_stream[0] / (1024u * 1024u),
|
||||||
iterations);
|
iterations);
|
||||||
|
|
||||||
@@ -606,6 +601,20 @@ struct prepared_profile {
|
|||||||
};
|
};
|
||||||
|
|
||||||
static const struct profile_desc k_profiles[] = {
|
static const struct profile_desc k_profiles[] = {
|
||||||
|
{
|
||||||
|
"fp64",
|
||||||
|
"fp64",
|
||||||
|
80,
|
||||||
|
1,
|
||||||
|
0,
|
||||||
|
0,
|
||||||
|
8,
|
||||||
|
CUDA_R_64F,
|
||||||
|
CUDA_R_64F,
|
||||||
|
CUDA_R_64F,
|
||||||
|
CUDA_R_64F,
|
||||||
|
CUBLAS_COMPUTE_64F,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"fp32_tf32",
|
"fp32_tf32",
|
||||||
"fp32",
|
"fp32",
|
||||||
@@ -1126,7 +1135,6 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
int stream_count = 1;
|
int stream_count = 1;
|
||||||
int profile_count = (int)(sizeof(k_profiles) / sizeof(k_profiles[0]));
|
int profile_count = (int)(sizeof(k_profiles) / sizeof(k_profiles[0]));
|
||||||
int prepared_count = 0;
|
int prepared_count = 0;
|
||||||
int wave_launches = 0;
|
|
||||||
size_t requested_budget = 0;
|
size_t requested_budget = 0;
|
||||||
size_t total_budget = 0;
|
size_t total_budget = 0;
|
||||||
size_t per_profile_budget = 0;
|
size_t per_profile_budget = 0;
|
||||||
@@ -1193,11 +1201,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
report->buffer_mb = (int)(total_budget / (1024u * 1024u));
|
report->buffer_mb = (int)(total_budget / (1024u * 1024u));
|
||||||
append_detail(report->details,
|
append_detail(report->details,
|
||||||
sizeof(report->details),
|
sizeof(report->details),
|
||||||
"requested_mb=%d actual_mb=%d streams=%d queue_depth=%d mp_count=%d per_worker_mb=%zu\n",
|
"requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
|
||||||
size_mb,
|
size_mb,
|
||||||
report->buffer_mb,
|
report->buffer_mb,
|
||||||
report->stream_count,
|
report->stream_count,
|
||||||
STRESS_LAUNCH_DEPTH,
|
|
||||||
mp_count,
|
mp_count,
|
||||||
per_profile_budget / (1024u * 1024u));
|
per_profile_budget / (1024u * 1024u));
|
||||||
|
|
||||||
@@ -1246,50 +1253,55 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Keep the GPU queue continuously full by submitting kernels without
|
||||||
|
* synchronizing after every wave. A sync barrier after each small batch
|
||||||
|
* creates CPU↔GPU ping-pong gaps that prevent full TDP utilisation,
|
||||||
|
* especially when individual kernels are short. Instead we sync at most
|
||||||
|
* once per second (for error detection) and once at the very end. */
|
||||||
double deadline = now_seconds() + (double)seconds;
|
double deadline = now_seconds() + (double)seconds;
|
||||||
|
double next_sync = now_seconds() + 1.0;
|
||||||
while (now_seconds() < deadline) {
|
while (now_seconds() < deadline) {
|
||||||
wave_launches = 0;
|
int launched = 0;
|
||||||
for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
|
for (int i = 0; i < prepared_count; i++) {
|
||||||
int launched_this_batch = 0;
|
if (!prepared[i].ready) {
|
||||||
for (int i = 0; i < prepared_count; i++) {
|
continue;
|
||||||
if (!prepared[i].ready) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
|
|
||||||
append_detail(report->details,
|
|
||||||
sizeof(report->details),
|
|
||||||
"%s=FAILED runtime\n",
|
|
||||||
prepared[i].desc.name);
|
|
||||||
for (int j = 0; j < prepared_count; j++) {
|
|
||||||
destroy_profile(&cublas, cuda, &prepared[j]);
|
|
||||||
}
|
|
||||||
cublas.cublasLtDestroy(handle);
|
|
||||||
destroy_streams(cuda, streams, stream_count);
|
|
||||||
cuda->cuCtxDestroy(ctx);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
prepared[i].iterations++;
|
|
||||||
report->iterations++;
|
|
||||||
wave_launches++;
|
|
||||||
launched_this_batch++;
|
|
||||||
}
|
}
|
||||||
if (launched_this_batch <= 0) {
|
if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
|
||||||
break;
|
append_detail(report->details,
|
||||||
|
sizeof(report->details),
|
||||||
|
"%s=FAILED runtime\n",
|
||||||
|
prepared[i].desc.name);
|
||||||
|
for (int j = 0; j < prepared_count; j++) {
|
||||||
|
destroy_profile(&cublas, cuda, &prepared[j]);
|
||||||
|
}
|
||||||
|
cublas.cublasLtDestroy(handle);
|
||||||
|
destroy_streams(cuda, streams, stream_count);
|
||||||
|
cuda->cuCtxDestroy(ctx);
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
prepared[i].iterations++;
|
||||||
|
report->iterations++;
|
||||||
|
launched++;
|
||||||
}
|
}
|
||||||
if (wave_launches <= 0) {
|
if (launched <= 0) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
|
double now = now_seconds();
|
||||||
for (int i = 0; i < prepared_count; i++) {
|
if (now >= next_sync || now >= deadline) {
|
||||||
destroy_profile(&cublas, cuda, &prepared[i]);
|
if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
|
||||||
|
for (int i = 0; i < prepared_count; i++) {
|
||||||
|
destroy_profile(&cublas, cuda, &prepared[i]);
|
||||||
|
}
|
||||||
|
cublas.cublasLtDestroy(handle);
|
||||||
|
destroy_streams(cuda, streams, stream_count);
|
||||||
|
cuda->cuCtxDestroy(ctx);
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
cublas.cublasLtDestroy(handle);
|
next_sync = now + 1.0;
|
||||||
destroy_streams(cuda, streams, stream_count);
|
|
||||||
cuda->cuCtxDestroy(ctx);
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
/* Final drain — ensure all queued work finishes before we read results. */
|
||||||
|
cuda->cuCtxSynchronize();
|
||||||
|
|
||||||
for (int i = 0; i < prepared_count; i++) {
|
for (int i = 0; i < prepared_count; i++) {
|
||||||
if (!prepared[i].ready) {
|
if (!prepared[i].ready) {
|
||||||
|
|||||||
@@ -41,15 +41,15 @@ while [ $# -gt 0 ]; do
|
|||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
echo "unknown arg: $1" >&2
|
echo "unknown arg: $1" >&2
|
||||||
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|amd|all]" >&2
|
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|nvidia-legacy|amd|nogpu|all]" >&2
|
||||||
exit 1
|
exit 1
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
|
||||||
case "$VARIANT" in
|
case "$VARIANT" in
|
||||||
nvidia|amd|nogpu|all) ;;
|
nvidia|nvidia-legacy|amd|nogpu|all) ;;
|
||||||
*) echo "unknown variant: $VARIANT (expected nvidia, amd, nogpu, or all)" >&2; exit 1 ;;
|
*) echo "unknown variant: $VARIANT (expected nvidia, nvidia-legacy, amd, nogpu, or all)" >&2; exit 1 ;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
if [ "$CLEAN_CACHE" = "1" ]; then
|
if [ "$CLEAN_CACHE" = "1" ]; then
|
||||||
@@ -61,8 +61,13 @@ if [ "$CLEAN_CACHE" = "1" ]; then
|
|||||||
"${CACHE_DIR:?}/lb-packages"
|
"${CACHE_DIR:?}/lb-packages"
|
||||||
echo "=== cleaning live-build work dirs ==="
|
echo "=== cleaning live-build work dirs ==="
|
||||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
|
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia-legacy"
|
||||||
rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
|
rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
|
||||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nogpu"
|
rm -rf "${REPO_ROOT}/dist/live-build-work-nogpu"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia-legacy"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/overlay-stage-amd"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/overlay-stage-nogpu"
|
||||||
echo "=== caches cleared, proceeding with build ==="
|
echo "=== caches cleared, proceeding with build ==="
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -180,6 +185,9 @@ case "$VARIANT" in
|
|||||||
nvidia)
|
nvidia)
|
||||||
run_variant nvidia
|
run_variant nvidia
|
||||||
;;
|
;;
|
||||||
|
nvidia-legacy)
|
||||||
|
run_variant nvidia-legacy
|
||||||
|
;;
|
||||||
amd)
|
amd)
|
||||||
run_variant amd
|
run_variant amd
|
||||||
;;
|
;;
|
||||||
@@ -188,6 +196,7 @@ case "$VARIANT" in
|
|||||||
;;
|
;;
|
||||||
all)
|
all)
|
||||||
run_variant nvidia
|
run_variant nvidia
|
||||||
|
run_variant nvidia-legacy
|
||||||
run_variant amd
|
run_variant amd
|
||||||
run_variant nogpu
|
run_variant nogpu
|
||||||
;;
|
;;
|
||||||
|
|||||||
@@ -1,8 +1,10 @@
|
|||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
# build-nvidia-module.sh — compile NVIDIA proprietary driver modules for Debian 12
|
# build-nvidia-module.sh — compile NVIDIA kernel modules for Debian 12
|
||||||
#
|
#
|
||||||
# Downloads the official NVIDIA .run installer, extracts kernel modules and
|
# Downloads the official NVIDIA .run installer, extracts kernel modules and
|
||||||
# userspace tools (nvidia-smi, libnvidia-ml). Everything is proprietary NVIDIA.
|
# userspace tools (nvidia-smi, libnvidia-ml). Supports both:
|
||||||
|
# - open -> kernel-open/ sources from the .run installer
|
||||||
|
# - proprietary -> traditional proprietary kernel sources from the .run installer
|
||||||
#
|
#
|
||||||
# Output is cached in DIST_DIR/nvidia-<version>-<kver>/ so subsequent builds
|
# Output is cached in DIST_DIR/nvidia-<version>-<kver>/ so subsequent builds
|
||||||
# are instant unless NVIDIA_DRIVER_VERSION or kernel version changes.
|
# are instant unless NVIDIA_DRIVER_VERSION or kernel version changes.
|
||||||
@@ -17,10 +19,19 @@ set -e
|
|||||||
NVIDIA_VERSION="$1"
|
NVIDIA_VERSION="$1"
|
||||||
DIST_DIR="$2"
|
DIST_DIR="$2"
|
||||||
DEBIAN_KERNEL_ABI="$3"
|
DEBIAN_KERNEL_ABI="$3"
|
||||||
|
NVIDIA_FLAVOR="${4:-open}"
|
||||||
|
|
||||||
[ -n "$NVIDIA_VERSION" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
|
[ -n "$NVIDIA_VERSION" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
|
||||||
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
|
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
|
||||||
[ -n "$DEBIAN_KERNEL_ABI" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
|
[ -n "$DEBIAN_KERNEL_ABI" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
|
||||||
|
|
||||||
|
case "$NVIDIA_FLAVOR" in
|
||||||
|
open|proprietary) ;;
|
||||||
|
*)
|
||||||
|
echo "unsupported NVIDIA flavor: $NVIDIA_FLAVOR (expected open or proprietary)" >&2
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||||
# On Debian, kernel headers are split into two packages:
|
# On Debian, kernel headers are split into two packages:
|
||||||
@@ -31,22 +42,13 @@ KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
|||||||
KDIR_ARCH="/usr/src/linux-headers-${KVER}"
|
KDIR_ARCH="/usr/src/linux-headers-${KVER}"
|
||||||
KDIR_COMMON="/usr/src/linux-headers-${DEBIAN_KERNEL_ABI}-common"
|
KDIR_COMMON="/usr/src/linux-headers-${DEBIAN_KERNEL_ABI}-common"
|
||||||
|
|
||||||
echo "=== NVIDIA ${NVIDIA_VERSION} (proprietary) for kernel ${KVER} ==="
|
echo "=== NVIDIA ${NVIDIA_VERSION} (${NVIDIA_FLAVOR}) for kernel ${KVER} ==="
|
||||||
|
|
||||||
if [ ! -d "$KDIR_ARCH" ] || [ ! -d "$KDIR_COMMON" ]; then
|
CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_FLAVOR}-${NVIDIA_VERSION}-${KVER}"
|
||||||
echo "=== installing linux-headers-${KVER} ==="
|
|
||||||
DEBIAN_FRONTEND=noninteractive apt-get install -y \
|
|
||||||
"linux-headers-${KVER}" \
|
|
||||||
gcc make perl
|
|
||||||
fi
|
|
||||||
echo "kernel headers (arch): $KDIR_ARCH"
|
|
||||||
echo "kernel headers (common): $KDIR_COMMON"
|
|
||||||
|
|
||||||
CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"
|
|
||||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
|
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
|
||||||
EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
|
EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
|
||||||
CACHE_LAYOUT_VERSION="2"
|
CACHE_LAYOUT_VERSION="3"
|
||||||
CACHE_LAYOUT_MARKER="${CACHE_DIR}/.cache-layout-v${CACHE_LAYOUT_VERSION}"
|
CACHE_LAYOUT_MARKER="${CACHE_DIR}/.cache-layout-v${CACHE_LAYOUT_VERSION}"
|
||||||
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
||||||
&& [ -f "$CACHE_LAYOUT_MARKER" ] \
|
&& [ -f "$CACHE_LAYOUT_MARKER" ] \
|
||||||
@@ -57,6 +59,15 @@ if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
|||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [ ! -d "$KDIR_ARCH" ] || [ ! -d "$KDIR_COMMON" ]; then
|
||||||
|
echo "=== installing linux-headers-${KVER} ==="
|
||||||
|
DEBIAN_FRONTEND=noninteractive apt-get install -y \
|
||||||
|
"linux-headers-${KVER}" \
|
||||||
|
gcc make perl
|
||||||
|
fi
|
||||||
|
echo "kernel headers (arch): $KDIR_ARCH"
|
||||||
|
echo "kernel headers (common): $KDIR_COMMON"
|
||||||
|
|
||||||
# Download official NVIDIA .run installer with sha256 verification
|
# Download official NVIDIA .run installer with sha256 verification
|
||||||
BASE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_VERSION}"
|
BASE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_VERSION}"
|
||||||
mkdir -p "$DOWNLOAD_CACHE_DIR" "$EXTRACT_CACHE_DIR"
|
mkdir -p "$DOWNLOAD_CACHE_DIR" "$EXTRACT_CACHE_DIR"
|
||||||
@@ -90,12 +101,18 @@ EXTRACT_DIR="${EXTRACT_CACHE_DIR}/nvidia-extract-${NVIDIA_VERSION}"
|
|||||||
rm -rf "$EXTRACT_DIR"
|
rm -rf "$EXTRACT_DIR"
|
||||||
"$RUN_FILE" --extract-only --target "$EXTRACT_DIR"
|
"$RUN_FILE" --extract-only --target "$EXTRACT_DIR"
|
||||||
|
|
||||||
# Find kernel source directory (proprietary: kernel/, open: kernel-open/)
|
# Find kernel source directory for the selected flavor.
|
||||||
KERNEL_SRC=""
|
KERNEL_SRC=""
|
||||||
for d in "$EXTRACT_DIR/kernel" "$EXTRACT_DIR/kernel-modules-sources" "$EXTRACT_DIR/kernel-source"; do
|
if [ "$NVIDIA_FLAVOR" = "open" ]; then
|
||||||
[ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
|
for d in "$EXTRACT_DIR/kernel-open" "$EXTRACT_DIR/kernel-open/"*; do
|
||||||
done
|
[ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
|
||||||
[ -n "$KERNEL_SRC" ] || { echo "ERROR: kernel source dir not found in:"; ls "$EXTRACT_DIR/"; exit 1; }
|
done
|
||||||
|
else
|
||||||
|
for d in "$EXTRACT_DIR/kernel" "$EXTRACT_DIR/kernel-modules-sources" "$EXTRACT_DIR/kernel-source"; do
|
||||||
|
[ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
[ -n "$KERNEL_SRC" ] || { echo "ERROR: kernel source dir not found for flavor ${NVIDIA_FLAVOR} in:"; ls "$EXTRACT_DIR/"; exit 1; }
|
||||||
echo "kernel source: $KERNEL_SRC"
|
echo "kernel source: $KERNEL_SRC"
|
||||||
|
|
||||||
# Build kernel modules
|
# Build kernel modules
|
||||||
|
|||||||
@@ -15,26 +15,46 @@ DIST_DIR="${REPO_ROOT}/dist"
|
|||||||
VENDOR_DIR="${REPO_ROOT}/iso/vendor"
|
VENDOR_DIR="${REPO_ROOT}/iso/vendor"
|
||||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||||
AUTH_KEYS=""
|
AUTH_KEYS=""
|
||||||
|
BUILD_VARIANT="nvidia"
|
||||||
BEE_GPU_VENDOR="nvidia"
|
BEE_GPU_VENDOR="nvidia"
|
||||||
|
BEE_NVIDIA_MODULE_FLAVOR="open"
|
||||||
|
|
||||||
# parse args
|
# parse args
|
||||||
while [ $# -gt 0 ]; do
|
while [ $# -gt 0 ]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
--authorized-keys) AUTH_KEYS="$2"; shift 2 ;;
|
--authorized-keys) AUTH_KEYS="$2"; shift 2 ;;
|
||||||
--variant) BEE_GPU_VENDOR="$2"; shift 2 ;;
|
--variant) BUILD_VARIANT="$2"; shift 2 ;;
|
||||||
*) echo "unknown arg: $1"; exit 1 ;;
|
*) echo "unknown arg: $1"; exit 1 ;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
|
||||||
case "$BEE_GPU_VENDOR" in
|
case "$BUILD_VARIANT" in
|
||||||
nvidia|amd|nogpu) ;;
|
nvidia)
|
||||||
*) echo "unknown variant: $BEE_GPU_VENDOR (expected nvidia, amd, or nogpu)" >&2; exit 1 ;;
|
BEE_GPU_VENDOR="nvidia"
|
||||||
|
BEE_NVIDIA_MODULE_FLAVOR="open"
|
||||||
|
;;
|
||||||
|
nvidia-legacy)
|
||||||
|
BEE_GPU_VENDOR="nvidia"
|
||||||
|
BEE_NVIDIA_MODULE_FLAVOR="proprietary"
|
||||||
|
;;
|
||||||
|
amd)
|
||||||
|
BEE_GPU_VENDOR="amd"
|
||||||
|
BEE_NVIDIA_MODULE_FLAVOR=""
|
||||||
|
;;
|
||||||
|
nogpu)
|
||||||
|
BEE_GPU_VENDOR="nogpu"
|
||||||
|
BEE_NVIDIA_MODULE_FLAVOR=""
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
echo "unknown variant: $BUILD_VARIANT (expected nvidia, nvidia-legacy, amd, or nogpu)" >&2
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BEE_GPU_VENDOR}"
|
BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BUILD_VARIANT}"
|
||||||
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BEE_GPU_VENDOR}"
|
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BUILD_VARIANT}"
|
||||||
|
|
||||||
export BEE_GPU_VENDOR
|
export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT
|
||||||
|
|
||||||
. "${BUILDER_DIR}/VERSIONS"
|
. "${BUILDER_DIR}/VERSIONS"
|
||||||
export PATH="$PATH:/usr/local/go/bin"
|
export PATH="$PATH:/usr/local/go/bin"
|
||||||
@@ -627,7 +647,7 @@ recover_iso_memtest() {
|
|||||||
|
|
||||||
AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
|
AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
|
||||||
ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
|
ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
|
||||||
ISO_BASENAME="easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64"
|
ISO_BASENAME="easy-bee-${BUILD_VARIANT}-v${ISO_VERSION_EFFECTIVE}-amd64"
|
||||||
# Versioned output directory: dist/easy-bee-v4.1/ — all final artefacts live here.
|
# Versioned output directory: dist/easy-bee-v4.1/ — all final artefacts live here.
|
||||||
OUT_DIR="${DIST_DIR}/easy-bee-v${ISO_VERSION_EFFECTIVE}"
|
OUT_DIR="${DIST_DIR}/easy-bee-v${ISO_VERSION_EFFECTIVE}"
|
||||||
mkdir -p "${OUT_DIR}"
|
mkdir -p "${OUT_DIR}"
|
||||||
@@ -801,7 +821,7 @@ if [ ! -d "/usr/src/linux-headers-${KVER}" ]; then
|
|||||||
apt-get install -y "linux-headers-${KVER}"
|
apt-get install -y "linux-headers-${KVER}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "=== bee ISO build (variant: ${BEE_GPU_VENDOR}) ==="
|
echo "=== bee ISO build (variant: ${BUILD_VARIANT}) ==="
|
||||||
echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
|
echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
|
||||||
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
|
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
|
||||||
echo ""
|
echo ""
|
||||||
@@ -871,7 +891,7 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "=== preparing staged overlay (${BEE_GPU_VENDOR}) ==="
|
echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="
|
||||||
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
||||||
|
|
||||||
# Sync builder config into variant work dir, preserving lb cache.
|
# Sync builder config into variant work dir, preserving lb cache.
|
||||||
@@ -897,6 +917,86 @@ elif [ -d "${LB_PKG_CACHE}" ] && [ "$(ls -A "${LB_PKG_CACHE}" 2>/dev/null)" ]; t
|
|||||||
rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/"
|
rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [ "$BEE_GPU_VENDOR" != "nvidia" ] || [ "$BEE_NVIDIA_MODULE_FLAVOR" != "proprietary" ]; then
|
||||||
|
cat > "${BUILD_WORK_DIR}/config/bootloaders/grub-pc/grub.cfg" <<'EOF'
|
||||||
|
source /boot/grub/config.cfg
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗"
|
||||||
|
echo " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝"
|
||||||
|
echo " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗"
|
||||||
|
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
|
||||||
|
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
|
||||||
|
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
|
||||||
|
echo " Hardware Audit LiveCD"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
menuentry "EASY-BEE" {
|
||||||
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||||
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
|
||||||
|
submenu "EASY-BEE (advanced options) -->" {
|
||||||
|
menuentry "EASY-BEE — KMS (no nomodeset)" {
|
||||||
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||||
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
|
||||||
|
menuentry "EASY-BEE — fail-safe" {
|
||||||
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||||
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if [ "${grub_platform}" = "efi" ]; then
|
||||||
|
menuentry "Memory Test (memtest86+)" {
|
||||||
|
chainloader /boot/memtest86+x64.efi
|
||||||
|
}
|
||||||
|
else
|
||||||
|
menuentry "Memory Test (memtest86+)" {
|
||||||
|
linux16 /boot/memtest86+x64.bin
|
||||||
|
}
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ "${grub_platform}" = "efi" ]; then
|
||||||
|
menuentry "UEFI Firmware Settings" {
|
||||||
|
fwsetup
|
||||||
|
}
|
||||||
|
fi
|
||||||
|
EOF
|
||||||
|
|
||||||
|
cat > "${BUILD_WORK_DIR}/config/bootloaders/isolinux/live.cfg.in" <<'EOF'
|
||||||
|
label live-@FLAVOUR@-normal
|
||||||
|
menu label ^EASY-BEE
|
||||||
|
menu default
|
||||||
|
linux @LINUX@
|
||||||
|
initrd @INITRD@
|
||||||
|
append @APPEND_LIVE@
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-kms
|
||||||
|
menu label EASY-BEE (^graphics/KMS)
|
||||||
|
linux @LINUX@
|
||||||
|
initrd @INITRD@
|
||||||
|
append @APPEND_LIVE@ bee.display=kms
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-toram
|
||||||
|
menu label EASY-BEE (^load to RAM)
|
||||||
|
linux @LINUX@
|
||||||
|
initrd @INITRD@
|
||||||
|
append @APPEND_LIVE@ toram
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-failsafe
|
||||||
|
menu label EASY-BEE (^fail-safe)
|
||||||
|
linux @LINUX@
|
||||||
|
initrd @INITRD@
|
||||||
|
append @APPEND_LIVE@ memtest noapic noapm nodma nomce nolapic nosmp vga=normal
|
||||||
|
|
||||||
|
label memtest
|
||||||
|
menu label ^Memory Test (memtest86+)
|
||||||
|
linux /boot/memtest86+x64.bin
|
||||||
|
EOF
|
||||||
|
fi
|
||||||
|
|
||||||
rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/"
|
rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/"
|
||||||
rm -f \
|
rm -f \
|
||||||
"${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \
|
"${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \
|
||||||
@@ -981,10 +1081,10 @@ done
|
|||||||
# --- NVIDIA kernel modules and userspace libs ---
|
# --- NVIDIA kernel modules and userspace libs ---
|
||||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
run_step "build NVIDIA ${NVIDIA_DRIVER_VERSION} modules" "40-nvidia-module" \
|
run_step "build NVIDIA ${NVIDIA_DRIVER_VERSION} modules" "40-nvidia-module" \
|
||||||
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
|
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}" "${BEE_NVIDIA_MODULE_FLAVOR}"
|
||||||
|
|
||||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||||
NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
NVIDIA_CACHE="${DIST_DIR}/nvidia-${BEE_NVIDIA_MODULE_FLAVOR}-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
||||||
|
|
||||||
# Inject .ko files into overlay at /usr/local/lib/nvidia/
|
# Inject .ko files into overlay at /usr/local/lib/nvidia/
|
||||||
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
|
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
|
||||||
@@ -1055,13 +1155,14 @@ GIT_COMMIT="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo u
|
|||||||
|
|
||||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
GPU_VERSION_LINE="NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
|
GPU_VERSION_LINE="NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
|
||||||
|
NVIDIA_KERNEL_MODULES_FLAVOR=${BEE_NVIDIA_MODULE_FLAVOR}
|
||||||
NCCL_VERSION=${NCCL_VERSION}
|
NCCL_VERSION=${NCCL_VERSION}
|
||||||
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
||||||
CUBLAS_VERSION=${CUBLAS_VERSION}
|
CUBLAS_VERSION=${CUBLAS_VERSION}
|
||||||
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
||||||
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
|
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
|
||||||
JOHN_JUMBO_COMMIT=${JOHN_JUMBO_COMMIT}"
|
JOHN_JUMBO_COMMIT=${JOHN_JUMBO_COMMIT}"
|
||||||
GPU_BUILD_INFO="nvidia:${NVIDIA_DRIVER_VERSION}"
|
GPU_BUILD_INFO="nvidia-${BEE_NVIDIA_MODULE_FLAVOR}:${NVIDIA_DRIVER_VERSION}"
|
||||||
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||||
GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
|
GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
|
||||||
GPU_BUILD_INFO="rocm:${ROCM_VERSION}"
|
GPU_BUILD_INFO="rocm:${ROCM_VERSION}"
|
||||||
@@ -1073,6 +1174,7 @@ fi
|
|||||||
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
|
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
|
||||||
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
|
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
|
||||||
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
|
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
|
||||||
|
BEE_BUILD_VARIANT=${BUILD_VARIANT}
|
||||||
BEE_GPU_VENDOR=${BEE_GPU_VENDOR}
|
BEE_GPU_VENDOR=${BEE_GPU_VENDOR}
|
||||||
BUILD_DATE=${BUILD_DATE}
|
BUILD_DATE=${BUILD_DATE}
|
||||||
GIT_COMMIT=${GIT_COMMIT}
|
GIT_COMMIT=${GIT_COMMIT}
|
||||||
@@ -1083,6 +1185,11 @@ EOF
|
|||||||
|
|
||||||
# Write GPU vendor marker for hooks
|
# Write GPU vendor marker for hooks
|
||||||
echo "${BEE_GPU_VENDOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-gpu-vendor"
|
echo "${BEE_GPU_VENDOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-gpu-vendor"
|
||||||
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
|
echo "${BEE_NVIDIA_MODULE_FLAVOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-nvidia-modules-flavor"
|
||||||
|
else
|
||||||
|
rm -f "${OVERLAY_STAGE_DIR}/etc/bee-nvidia-modules-flavor"
|
||||||
|
fi
|
||||||
|
|
||||||
# Patch motd with build info
|
# Patch motd with build info
|
||||||
BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} ${GPU_BUILD_INFO}"
|
BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} ${GPU_BUILD_INFO}"
|
||||||
@@ -1153,10 +1260,10 @@ fi
|
|||||||
|
|
||||||
# --- build ISO using live-build ---
|
# --- build ISO using live-build ---
|
||||||
echo ""
|
echo ""
|
||||||
echo "=== building ISO (live-build, variant: ${BEE_GPU_VENDOR}) ==="
|
echo "=== building ISO (variant: ${BUILD_VARIANT}) ==="
|
||||||
|
|
||||||
# Export for auto/config
|
# Export for auto/config
|
||||||
BEE_GPU_VENDOR_UPPER="$(echo "${BEE_GPU_VENDOR}" | tr 'a-z' 'A-Z')"
|
BEE_GPU_VENDOR_UPPER="$(echo "${BUILD_VARIANT}" | tr 'a-z-' 'A-Z_')"
|
||||||
export BEE_GPU_VENDOR_UPPER
|
export BEE_GPU_VENDOR_UPPER
|
||||||
|
|
||||||
cd "${LB_DIR}"
|
cd "${LB_DIR}"
|
||||||
@@ -1191,7 +1298,7 @@ if [ -f "$ISO_RAW" ]; then
|
|||||||
validate_iso_nvidia_runtime "$ISO_RAW"
|
validate_iso_nvidia_runtime "$ISO_RAW"
|
||||||
cp "$ISO_RAW" "$ISO_OUT"
|
cp "$ISO_RAW" "$ISO_OUT"
|
||||||
echo ""
|
echo ""
|
||||||
echo "=== done (${BEE_GPU_VENDOR}) ==="
|
echo "=== done (${BUILD_VARIANT}) ==="
|
||||||
echo "ISO: $ISO_OUT"
|
echo "ISO: $ISO_OUT"
|
||||||
if command -v stat >/dev/null 2>&1; then
|
if command -v stat >/dev/null 2>&1; then
|
||||||
ISO_SIZE_BYTES="$(stat -c '%s' "$ISO_OUT" 2>/dev/null || stat -f '%z' "$ISO_OUT")"
|
ISO_SIZE_BYTES="$(stat -c '%s' "$ISO_OUT" 2>/dev/null || stat -f '%z' "$ISO_OUT")"
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ echo " █████╗ ███████║███████╗ ╚
|
|||||||
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
|
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
|
||||||
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
|
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
|
||||||
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
|
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
|
||||||
|
echo " Hardware Audit LiveCD"
|
||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
menuentry "EASY-BEE" {
|
menuentry "EASY-BEE" {
|
||||||
@@ -14,29 +15,21 @@ menuentry "EASY-BEE" {
|
|||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE (graphics/KMS)" {
|
submenu "EASY-BEE (advanced options) -->" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
menuentry "EASY-BEE — GSP=off" {
|
||||||
initrd @INITRD_LIVE@
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||||
}
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE (load to RAM)" {
|
menuentry "EASY-BEE — KMS (no nomodeset)" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE (NVIDIA GSP=off)" {
|
menuentry "EASY-BEE — fail-safe" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE (graphics/KMS, GSP=off)" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE (fail-safe)" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if [ "${grub_platform}" = "efi" ]; then
|
if [ "${grub_platform}" = "efi" ]; then
|
||||||
|
|||||||
@@ -1,9 +1,9 @@
|
|||||||
set color_normal=light-gray/black
|
set color_normal=light-gray/black
|
||||||
set color_highlight=white/dark-gray
|
set color_highlight=yellow/black
|
||||||
|
|
||||||
if [ -e /boot/grub/splash.png ]; then
|
if [ -e /boot/grub/splash.png ]; then
|
||||||
set theme=/boot/grub/live-theme/theme.txt
|
set theme=/boot/grub/live-theme/theme.txt
|
||||||
else
|
else
|
||||||
set menu_color_normal=cyan/black
|
set menu_color_normal=yellow/black
|
||||||
set menu_color_highlight=white/dark-gray
|
set menu_color_highlight=white/brown
|
||||||
fi
|
fi
|
||||||
|
|||||||
@@ -31,6 +31,7 @@ systemctl enable bee-audit.service
|
|||||||
systemctl enable bee-web.service
|
systemctl enable bee-web.service
|
||||||
systemctl enable bee-sshsetup.service
|
systemctl enable bee-sshsetup.service
|
||||||
systemctl enable bee-selfheal.timer
|
systemctl enable bee-selfheal.timer
|
||||||
|
systemctl enable bee-boot-status.service
|
||||||
systemctl enable ssh.service
|
systemctl enable ssh.service
|
||||||
systemctl enable lightdm.service 2>/dev/null || true
|
systemctl enable lightdm.service 2>/dev/null || true
|
||||||
systemctl enable qemu-guest-agent.service 2>/dev/null || true
|
systemctl enable qemu-guest-agent.service 2>/dev/null || true
|
||||||
@@ -59,7 +60,8 @@ chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
|
|||||||
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-boot-status 2>/dev/null || true
|
||||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
||||||
|
|||||||
117
iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
Executable file
117
iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
Executable file
@@ -0,0 +1,117 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# 9001-wallpaper.hook.chroot — generate /usr/share/bee/wallpaper.png inside chroot
|
||||||
|
set -e
|
||||||
|
echo "=== generating bee wallpaper ==="
|
||||||
|
mkdir -p /usr/share/bee
|
||||||
|
|
||||||
|
python3 - <<'PYEOF'
|
||||||
|
from PIL import Image, ImageDraw, ImageFont, ImageFilter
|
||||||
|
import os
|
||||||
|
|
||||||
|
W, H = 1920, 1080
|
||||||
|
|
||||||
|
ASCII_ART = [
|
||||||
|
" ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗",
|
||||||
|
" ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝",
|
||||||
|
" █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗",
|
||||||
|
" ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝",
|
||||||
|
" ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗",
|
||||||
|
" ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝",
|
||||||
|
]
|
||||||
|
SUBTITLE = " Hardware Audit LiveCD"
|
||||||
|
|
||||||
|
FG = (0xF6, 0xD0, 0x47)
|
||||||
|
FG_DIM = (0xD4, 0xA9, 0x1C)
|
||||||
|
SHADOW = (0x5E, 0x47, 0x05)
|
||||||
|
SUB = (0x96, 0x7A, 0x17)
|
||||||
|
BG = (0x05, 0x05, 0x05)
|
||||||
|
|
||||||
|
MONO_FONT_CANDIDATES = [
|
||||||
|
'/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf',
|
||||||
|
'/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf',
|
||||||
|
'/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf',
|
||||||
|
'/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf',
|
||||||
|
]
|
||||||
|
SUB_FONT_CANDIDATES = [
|
||||||
|
'/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
|
||||||
|
'/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
|
||||||
|
'/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
|
||||||
|
'/usr/share/fonts/truetype/freefont/FreeSansBold.ttf',
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def load_font(candidates, size):
|
||||||
|
for path in candidates:
|
||||||
|
if os.path.exists(path):
|
||||||
|
return ImageFont.truetype(path, size)
|
||||||
|
return ImageFont.load_default()
|
||||||
|
|
||||||
|
|
||||||
|
def mono_metrics(font):
|
||||||
|
probe = Image.new('L', (W, H), 0)
|
||||||
|
draw = ImageDraw.Draw(probe)
|
||||||
|
char_w = int(round(draw.textlength("M", font=font)))
|
||||||
|
bb = draw.textbbox((0, 0), "Mg", font=font)
|
||||||
|
char_h = bb[3] - bb[1]
|
||||||
|
return char_w, char_h
|
||||||
|
|
||||||
|
|
||||||
|
def render_ascii_mask(font, lines, char_w, char_h, line_gap):
|
||||||
|
width = max(len(line) for line in lines) * char_w
|
||||||
|
height = len(lines) * char_h + line_gap * (len(lines) - 1)
|
||||||
|
mask = Image.new('L', (width, height), 0)
|
||||||
|
draw = ImageDraw.Draw(mask)
|
||||||
|
for row, line in enumerate(lines):
|
||||||
|
y = row * (char_h + line_gap)
|
||||||
|
for col, ch in enumerate(line):
|
||||||
|
if ch == ' ':
|
||||||
|
continue
|
||||||
|
x = col * char_w
|
||||||
|
draw.text((x, y), ch, font=font, fill=255)
|
||||||
|
return mask
|
||||||
|
|
||||||
|
|
||||||
|
img = Image.new('RGB', (W, H), BG)
|
||||||
|
draw = ImageDraw.Draw(img)
|
||||||
|
|
||||||
|
# Soft amber glow under the logo without depending on font rendering.
|
||||||
|
glow = Image.new('RGBA', (W, H), (0, 0, 0, 0))
|
||||||
|
glow_draw = ImageDraw.Draw(glow)
|
||||||
|
glow_draw.ellipse((360, 250, 1560, 840), fill=(180, 120, 10, 56))
|
||||||
|
glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
|
||||||
|
glow = glow.filter(ImageFilter.GaussianBlur(60))
|
||||||
|
img = Image.alpha_composite(img.convert('RGBA'), glow)
|
||||||
|
|
||||||
|
TARGET_LOGO_W = 400
|
||||||
|
max_chars = max(len(line) for line in ASCII_ART)
|
||||||
|
_probe_font = load_font(MONO_FONT_CANDIDATES, 64)
|
||||||
|
_probe_cw, _ = mono_metrics(_probe_font)
|
||||||
|
font_size_logo = max(6, int(64 * TARGET_LOGO_W / (_probe_cw * max_chars)))
|
||||||
|
font_logo = load_font(MONO_FONT_CANDIDATES, font_size_logo)
|
||||||
|
char_w, char_h = mono_metrics(font_logo)
|
||||||
|
logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 2)
|
||||||
|
logo_w, logo_h = logo_mask.size
|
||||||
|
logo_x = (W - logo_w) // 2
|
||||||
|
logo_y = 380
|
||||||
|
|
||||||
|
sh_off = max(1, font_size_logo // 6)
|
||||||
|
shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(1))
|
||||||
|
img.paste(SHADOW, (logo_x + sh_off * 2, logo_y + sh_off * 2), shadow_mask)
|
||||||
|
img.paste(FG_DIM, (logo_x + sh_off, logo_y + sh_off), logo_mask)
|
||||||
|
img.paste(FG, (logo_x, logo_y), logo_mask)
|
||||||
|
|
||||||
|
font_sub = load_font(SUB_FONT_CANDIDATES, 30)
|
||||||
|
sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
|
||||||
|
sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
|
||||||
|
sub_y = logo_y + logo_h + 48
|
||||||
|
draw = ImageDraw.Draw(img)
|
||||||
|
draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
|
||||||
|
draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
|
||||||
|
|
||||||
|
img = img.convert('RGB')
|
||||||
|
|
||||||
|
img.save('/usr/share/bee/wallpaper.png', optimize=True)
|
||||||
|
print('wallpaper written: /usr/share/bee/wallpaper.png')
|
||||||
|
PYEOF
|
||||||
|
|
||||||
|
echo "=== wallpaper done ==="
|
||||||
41
iso/builder/config/hooks/normal/9010-fix-toram.hook.chroot
Executable file
41
iso/builder/config/hooks/normal/9010-fix-toram.hook.chroot
Executable file
@@ -0,0 +1,41 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# 9010-fix-toram.hook.chroot — patch live-boot toram to work with tmpfs (no O_DIRECT)
|
||||||
|
#
|
||||||
|
# live-boot tries "losetup --replace --direct-io=on" when re-associating the
|
||||||
|
# loop device to the RAM copy in /dev/shm. tmpfs does not support O_DIRECT,
|
||||||
|
# so the ioctl returns EINVAL and the verification step fails.
|
||||||
|
#
|
||||||
|
# The patch strips the --direct-io option from the replace call, so losetup always
|
||||||
|
# does a plain replace without direct-io, and also relaxes the verification
|
||||||
|
# to a warning so the boot continues even when re-association is imperfect.
|
||||||
|
set -e
|
||||||
|
|
||||||
|
TORAM_SCRIPT="/usr/lib/live/boot/9990-toram-todisk.sh"
|
||||||
|
|
||||||
|
if [ ! -f "${TORAM_SCRIPT}" ]; then
|
||||||
|
echo "9010-fix-toram: ${TORAM_SCRIPT} not found, skipping"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "9010-fix-toram: patching ${TORAM_SCRIPT}"
|
||||||
|
|
||||||
|
# Replace any losetup --replace call that includes --direct-io=on with a
|
||||||
|
# version that omits direct-io altogether (the flag is stripped; nothing is retried).
|
||||||
|
#
|
||||||
|
# The sed expression turns:
|
||||||
|
# losetup --replace ... --direct-io=on LOOP FILE
|
||||||
|
# into the same losetup --replace call with the --direct-io option removed.
|
||||||
|
#
|
||||||
|
# We also downgrade the fatal "Task finished with error." block to a warning
|
||||||
|
# so the boot continues if re-association fails (squashfs still accessible).
|
||||||
|
|
||||||
|
# 1. Strip --direct-io=on from the losetup --replace call so it works on tmpfs.
|
||||||
|
sed -i 's/losetup --replace --direct-io=on/losetup --replace/g' "${TORAM_SCRIPT}"
|
||||||
|
sed -i 's/losetup --replace --direct-io/losetup --replace/g' "${TORAM_SCRIPT}"
|
||||||
|
|
||||||
|
# 2. Turn the hard error into a warning so boot continues.
|
||||||
|
# live-boot prints this exact string when verification fails.
|
||||||
|
sed -i 's/echo "Task finished with error\."/echo "Warning: toram re-association failed, continuing boot (squashfs still in RAM)"/' "${TORAM_SCRIPT}"
|
||||||
|
|
||||||
|
echo "9010-fix-toram: patch applied"
|
||||||
|
grep -n "losetup" "${TORAM_SCRIPT}" | head -20 || true
|
||||||
@@ -60,9 +60,15 @@ qrencode
|
|||||||
# Local desktop (openbox + chromium kiosk)
|
# Local desktop (openbox + chromium kiosk)
|
||||||
openbox
|
openbox
|
||||||
tint2
|
tint2
|
||||||
|
feh
|
||||||
|
python3-pil
|
||||||
xorg
|
xorg
|
||||||
xterm
|
xterm
|
||||||
chromium
|
chromium
|
||||||
|
mousepad
|
||||||
|
pcmanfm
|
||||||
|
ristretto
|
||||||
|
mupdf
|
||||||
xserver-xorg-video-fbdev
|
xserver-xorg-video-fbdev
|
||||||
xserver-xorg-video-vesa
|
xserver-xorg-video-vesa
|
||||||
lightdm
|
lightdm
|
||||||
|
|||||||
@@ -27,6 +27,7 @@ echo ""
|
|||||||
KVER=$(uname -r)
|
KVER=$(uname -r)
|
||||||
info "kernel: $KVER"
|
info "kernel: $KVER"
|
||||||
NVIDIA_BOOT_MODE="normal"
|
NVIDIA_BOOT_MODE="normal"
|
||||||
|
NVIDIA_MODULES_FLAVOR="proprietary"
|
||||||
for arg in $(cat /proc/cmdline 2>/dev/null); do
|
for arg in $(cat /proc/cmdline 2>/dev/null); do
|
||||||
case "$arg" in
|
case "$arg" in
|
||||||
bee.nvidia.mode=*)
|
bee.nvidia.mode=*)
|
||||||
@@ -34,7 +35,11 @@ for arg in $(cat /proc/cmdline 2>/dev/null); do
|
|||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
if [ -f /etc/bee-nvidia-modules-flavor ]; then
|
||||||
|
NVIDIA_MODULES_FLAVOR="$(tr -d '[:space:]' </etc/bee-nvidia-modules-flavor 2>/dev/null || echo proprietary)"
|
||||||
|
fi
|
||||||
info "nvidia boot mode: ${NVIDIA_BOOT_MODE}"
|
info "nvidia boot mode: ${NVIDIA_BOOT_MODE}"
|
||||||
|
info "nvidia modules flavor: ${NVIDIA_MODULES_FLAVOR}"
|
||||||
|
|
||||||
# --- PATH & binaries ---
|
# --- PATH & binaries ---
|
||||||
echo "-- PATH & binaries --"
|
echo "-- PATH & binaries --"
|
||||||
@@ -110,10 +115,12 @@ fi
|
|||||||
for mod in nvidia_modeset nvidia_uvm; do
|
for mod in nvidia_modeset nvidia_uvm; do
|
||||||
if /sbin/lsmod 2>/dev/null | grep -q "^$mod "; then
|
if /sbin/lsmod 2>/dev/null | grep -q "^$mod "; then
|
||||||
ok "module loaded: $mod"
|
ok "module loaded: $mod"
|
||||||
elif [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; then
|
elif [ "${NVIDIA_MODULES_FLAVOR}" = "proprietary" ] && { [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; }; then
|
||||||
fail "module NOT loaded in normal mode: $mod"
|
fail "module NOT loaded in normal mode: $mod"
|
||||||
else
|
elif [ "${NVIDIA_MODULES_FLAVOR}" = "proprietary" ]; then
|
||||||
warn "module not loaded in GSP-off mode: $mod"
|
warn "module not loaded in GSP-off mode: $mod"
|
||||||
|
else
|
||||||
|
fail "module NOT loaded: $mod"
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
@@ -129,10 +136,12 @@ done
|
|||||||
|
|
||||||
if [ -e /dev/nvidia-uvm ]; then
|
if [ -e /dev/nvidia-uvm ]; then
|
||||||
ok "/dev/nvidia-uvm exists"
|
ok "/dev/nvidia-uvm exists"
|
||||||
elif [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; then
|
elif [ "${NVIDIA_MODULES_FLAVOR}" = "proprietary" ] && { [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; }; then
|
||||||
fail "/dev/nvidia-uvm missing in normal mode"
|
fail "/dev/nvidia-uvm missing in normal mode"
|
||||||
else
|
elif [ "${NVIDIA_MODULES_FLAVOR}" = "proprietary" ]; then
|
||||||
warn "/dev/nvidia-uvm missing — CUDA stress path may be unavailable until loaded on demand"
|
warn "/dev/nvidia-uvm missing — CUDA stress path may be unavailable until loaded on demand"
|
||||||
|
else
|
||||||
|
fail "/dev/nvidia-uvm missing"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
|
|||||||
18
iso/overlay/etc/systemd/system/bee-boot-status.service
Normal file
18
iso/overlay/etc/systemd/system/bee-boot-status.service
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=Bee: boot status display
|
||||||
|
After=systemd-user-sessions.service
|
||||||
|
Before=getty@tty1.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=oneshot
|
||||||
|
RemainAfterExit=no
|
||||||
|
ExecStart=/usr/local/bin/bee-boot-status
|
||||||
|
TTYPath=/dev/tty1
|
||||||
|
StandardInput=tty
|
||||||
|
StandardOutput=tty
|
||||||
|
StandardError=tty
|
||||||
|
TTYReset=yes
|
||||||
|
TTYVHangup=yes
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
[Unit]
|
||||||
|
After=bee-boot-status.service
|
||||||
@@ -1,6 +1,4 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Wants=bee-preflight.service
|
|
||||||
After=bee-preflight.service
|
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
ExecStartPre=/usr/local/bin/bee-display-mode
|
ExecStartPre=/usr/local/bin/bee-display-mode
|
||||||
|
|||||||
89
iso/overlay/usr/local/bin/bee-boot-status
Normal file
89
iso/overlay/usr/local/bin/bee-boot-status
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# bee-boot-status — boot progress display on tty1.
|
||||||
|
# Shows live service status until every service in CRITICAL is active, failed or inactive,
|
||||||
|
# then exits so getty can show the login prompt.
|
||||||
|
|
||||||
|
CRITICAL="bee-preflight bee-nvidia bee-audit"
|
||||||
|
ALL="bee-sshsetup ssh bee-network bee-nvidia bee-preflight bee-audit bee-web"
|
||||||
|
|
||||||
|
svc_state() { systemctl is-active "$1.service" 2>/dev/null || echo "inactive"; }
|
||||||
|
|
||||||
|
svc_icon() {
|
||||||
|
case "$(svc_state "$1")" in
|
||||||
|
active) printf '\033[32m[ OK ]\033[0m' ;;
|
||||||
|
failed) printf '\033[31m[ FAIL ]\033[0m' ;;
|
||||||
|
activating) printf '\033[33m[ .. ]\033[0m' ;;
|
||||||
|
deactivating) printf '\033[33m[ stop ]\033[0m' ;;
|
||||||
|
inactive) printf '\033[90m[ ]\033[0m' ;;
|
||||||
|
*) printf '\033[90m[ ? ]\033[0m' ;;
|
||||||
|
esac
|
||||||
|
}
|
||||||
|
|
||||||
|
svc_detail() {
|
||||||
|
local svc="$1" state
|
||||||
|
state="$(svc_state "$svc")"
|
||||||
|
case "$state" in
|
||||||
|
failed)
|
||||||
|
local res
|
||||||
|
res="$(systemctl show -p Result "$svc.service" 2>/dev/null | cut -d= -f2)"
|
||||||
|
[ -n "$res" ] && [ "$res" != "success" ] && printf ' \033[31m(%s)\033[0m' "$res"
|
||||||
|
;;
|
||||||
|
activating)
|
||||||
|
local line
|
||||||
|
line="$(journalctl -u "$svc.service" -n 1 --no-pager --output=cat 2>/dev/null | cut -c1-55)"
|
||||||
|
[ -n "$line" ] && printf ' \033[90m%s\033[0m' "$line"
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
}
|
||||||
|
|
||||||
|
all_critical_done() {
|
||||||
|
for svc in $CRITICAL; do
|
||||||
|
case "$(svc_state "$svc")" in
|
||||||
|
active|failed|inactive) ;;
|
||||||
|
*) return 1 ;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
while true; do
|
||||||
|
# move to top-left and clear screen
|
||||||
|
printf '\033[H\033[2J'
|
||||||
|
|
||||||
|
printf '\n'
|
||||||
|
printf ' \033[33m███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗\033[0m\n'
|
||||||
|
printf ' \033[33m██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝\033[0m\n'
|
||||||
|
printf ' \033[33m█████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗\033[0m\n'
|
||||||
|
printf ' \033[33m██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝\033[0m\n'
|
||||||
|
printf ' \033[33m███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗\033[0m\n'
|
||||||
|
printf ' \033[33m╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝\033[0m\n'
|
||||||
|
printf ' Hardware Audit LiveCD\n'
|
||||||
|
printf '\n'
|
||||||
|
|
||||||
|
for svc in $ALL; do
|
||||||
|
printf ' %s %-20s%s\n' "$(svc_icon "$svc")" "$svc" "$(svc_detail "$svc")"
|
||||||
|
done
|
||||||
|
printf '\n'
|
||||||
|
|
||||||
|
# Network
|
||||||
|
ips="$(ip -4 addr show scope global 2>/dev/null | awk '/inet /{printf " %-16s %s\n", $NF, $2}')"
|
||||||
|
if [ -n "$ips" ]; then
|
||||||
|
printf ' \033[1mNetwork:\033[0m\n'
|
||||||
|
printf '%s\n' "$ips"
|
||||||
|
printf '\n'
|
||||||
|
fi
|
||||||
|
|
||||||
|
if all_critical_done; then
|
||||||
|
printf ' \033[1;32mSystem ready.\033[0m Audit is running in the background.\n'
|
||||||
|
first_ip="$(ip -4 addr show scope global 2>/dev/null | awk '/inet /{print $2}' | cut -d/ -f1 | head -1)"
|
||||||
|
if [ -n "$first_ip" ]; then
|
||||||
|
printf ' Web UI: \033[1mhttp://%s/\033[0m\n' "$first_ip"
|
||||||
|
fi
|
||||||
|
printf '\n'
|
||||||
|
sleep 3
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
|
||||||
|
printf ' \033[90mStarting up...\033[0m\n'
|
||||||
|
sleep 3
|
||||||
|
done
|
||||||
110
iso/overlay/usr/local/bin/bee-dcgmproftester-staggered
Executable file
110
iso/overlay/usr/local/bin/bee-dcgmproftester-staggered
Executable file
@@ -0,0 +1,110 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
set -eu
|
||||||
|
|
||||||
|
SECONDS=300
|
||||||
|
STAGGER_SECONDS=180
|
||||||
|
DEVICES=""
|
||||||
|
EXCLUDE=""
|
||||||
|
|
||||||
|
# Print the command-line synopsis on stderr and terminate with status 2.
usage() {
    echo "usage: $0 [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3]" 1>&2
    exit 2
}
|
||||||
|
|
||||||
|
# Canonicalise a comma-separated list given as $1 (may be empty or absent):
# strip all whitespace, drop empty entries, sort numerically, remove adjacent
# duplicates and print the result re-joined with commas.
normalize_list() {
    echo "${1:-}" \
        | tr ',' '\n' \
        | sed -e 's/[[:space:]]//g' -e '/^$/d' \
        | sort -n \
        | uniq \
        | paste -sd, -
}
|
||||||
|
|
||||||
|
# Report whether a value is an element of a comma-separated list.
#   $1  needle   - value to look for (compared literally)
#   $2  haystack - comma-separated list; may be empty or omitted
# Returns 0 when the needle is an exact element of the list, 1 otherwise.
contains_csv() {
    needle="$1"
    haystack="${2:-}"
    # Wrapping both sides in commas turns element membership into a substring
    # test. The quoted part of the pattern is matched literally, so characters
    # such as "." or "*" in the needle are not treated as wildcards.
    case ",${haystack}," in
        *",${needle},"*) return 0 ;;
    esac
    return 1
}
|
||||||
|
|
||||||
|
# Print the resolved location of the first dcgmproftester variant found on PATH,
# trying the unversioned name first and then versions 13, 12 and 11.
# Returns 0 after printing a match, 1 when none of the names resolve.
resolve_dcgmproftester() {
    for tool in dcgmproftester dcgmproftester13 dcgmproftester12 dcgmproftester11; do
        # Not available under this name: try the next one.
        command -v "${tool}" >/dev/null 2>&1 || continue
        command -v "${tool}"
        return 0
    done
    return 1
}
|
||||||
|
|
||||||
|
# Parse command-line options. Every option takes exactly one value; a missing
# value or an unrecognised option prints the usage text and exits with status 2.
while [ "$#" -gt 0 ]; do
    case "$1" in
        --seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
        --stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
        *) usage ;;
    esac
done

# Both durations are used in shell arithmetic and in numeric `[ ... -gt ... ]`
# tests, so reject anything that is not a plain non-negative integer here
# instead of failing later with an arithmetic error or a negative run time.
for value in "${SECONDS}" "${STAGGER_SECONDS}"; do
    case "${value}" in
        ''|*[!0-9]*) usage ;;
    esac
done
|
||||||
|
|
||||||
|
# Locate the dcgmproftester binary; abort when no known variant is on PATH.
if ! PROF=$(resolve_dcgmproftester); then
    echo "dcgmproftester not found in PATH" >&2
    exit 1
fi

# Ask nvidia-smi for every GPU index and join them into a comma-separated list.
ALL_DEVICES=$(
    nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null \
        | sed 's/[[:space:]]//g' \
        | awk 'NF' \
        | paste -sd, -
)
if [ -z "${ALL_DEVICES}" ]; then
    echo "nvidia-smi found no NVIDIA GPUs" >&2
    exit 1
fi

# Canonicalise the user-supplied lists; an empty --devices means "all GPUs".
DEVICES=$(normalize_list "${DEVICES}")
EXCLUDE=$(normalize_list "${EXCLUDE}")
SELECTED="${DEVICES:-${ALL_DEVICES}}"

# FINAL = SELECTED minus everything listed in EXCLUDE, order preserved.
FINAL=""
for id in $(echo "${SELECTED}" | tr ',' ' '); do
    [ -n "${id}" ] || continue
    contains_csv "${id}" "${EXCLUDE}" && continue
    FINAL="${FINAL:+${FINAL},}${id}"
done

if [ -z "${FINAL}" ]; then
    echo "no NVIDIA GPUs selected after filters" >&2
    exit 1
fi
|
||||||
|
|
||||||
|
# Report the effective configuration as key=value lines on stdout.
echo "loader=dcgmproftester-staggered"
echo "selected_gpus=${FINAL}"
echo "stagger_seconds=${STAGGER_SECONDS}"

# Per-GPU worker logs are collected in a private temporary directory that is
# removed when the script exits.
# NOTE(review): the INT/TERM trap only removes the directory; it neither stops
# the background workers nor exits the script — confirm this is intended.
TMP_DIR=$(mktemp -d)
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM

GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')

# Take the requested duration once, before any stagger sleep. In shells where
# SECONDS is a live timer (bash, ksh) the variable keeps counting up after it
# has been assigned, so reading it inside the loop would add the time already
# spent sleeping to every later GPU's duration.
BASE_SECONDS="${SECONDS}"

# Start one worker per GPU, STAGGER_SECONDS apart. A GPU started earlier gets
# STAGGER_SECONDS of extra run time for every GPU that starts after it, so the
# requested durations all end at the same moment.
gpu_pos=0
WORKERS=""
for id in $(echo "${FINAL}" | tr ',' ' '); do
    gpu_pos=$((gpu_pos + 1))
    log="${TMP_DIR}/gpu-${id}.log"
    extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
    gpu_seconds=$(( BASE_SECONDS + extra_sec ))
    echo "starting gpu ${id} seconds=${gpu_seconds}"
    # Restrict the worker to this GPU; its stdout and stderr go to the GPU's log.
    # NOTE(review): "-t 1004" is passed through unchanged — confirm its meaning
    # against the dcgmproftester documentation.
    CUDA_VISIBLE_DEVICES="${id}" "${PROF}" --no-dcgm-validation -t 1004 -d "${gpu_seconds}" >"${log}" 2>&1 &
    pid=$!
    # Remember "<pid>:<gpu id>:<log path>" so the worker can be reaped later.
    WORKERS="${WORKERS} ${pid}:${id}:${log}"
    # No delay is needed after the last GPU has been started.
    if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
        sleep "${STAGGER_SECONDS}"
    fi
done
|
||||||
|
|
||||||
|
# Reap every worker in start order, report its result, replay its log with a
# "[gpu N] " prefix on each line, and exit 1 if any worker failed.
status=0
for spec in ${WORKERS}; do
    # Each entry has the form "<pid>:<gpu id>:<log path>".
    pid=${spec%%:*}
    rest=${spec#*:}
    id=${rest%%:*}
    log=${rest#*:}

    rc=0
    wait "${pid}" || rc=$?
    if [ "${rc}" -eq 0 ]; then
        echo "gpu ${id} finished: OK"
    else
        echo "gpu ${id} finished: FAILED rc=${rc}"
        status=1
    fi

    # Best effort: an unreadable log must not abort the remaining reports.
    sed "s/^/[gpu ${id}] /" "${log}" || true
done

exit "${status}"
|
||||||
20
iso/overlay/usr/local/bin/bee-gpu-burn
Normal file → Executable file
20
iso/overlay/usr/local/bin/bee-gpu-burn
Normal file → Executable file
@@ -2,13 +2,14 @@
|
|||||||
set -eu
|
set -eu
|
||||||
|
|
||||||
SECONDS=5
|
SECONDS=5
|
||||||
|
STAGGER_SECONDS=0
|
||||||
SIZE_MB=0
|
SIZE_MB=0
|
||||||
DEVICES=""
|
DEVICES=""
|
||||||
EXCLUDE=""
|
EXCLUDE=""
|
||||||
WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
|
WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
echo "usage: $0 [--seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
|
echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
|
||||||
exit 2
|
exit 2
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -25,6 +26,7 @@ contains_csv() {
|
|||||||
while [ "$#" -gt 0 ]; do
|
while [ "$#" -gt 0 ]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
|
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
|
||||||
|
--stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
|
||||||
--size-mb|-m) [ "$#" -ge 2 ] || usage; SIZE_MB="$2"; shift 2 ;;
|
--size-mb|-m) [ "$#" -ge 2 ] || usage; SIZE_MB="$2"; shift 2 ;;
|
||||||
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
||||||
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
||||||
@@ -61,12 +63,18 @@ done
|
|||||||
|
|
||||||
echo "loader=bee-gpu-burn"
|
echo "loader=bee-gpu-burn"
|
||||||
echo "selected_gpus=${FINAL}"
|
echo "selected_gpus=${FINAL}"
|
||||||
|
echo "stagger_seconds=${STAGGER_SECONDS}"
|
||||||
|
|
||||||
|
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
||||||
|
|
||||||
TMP_DIR=$(mktemp -d)
|
TMP_DIR=$(mktemp -d)
|
||||||
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
|
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
|
||||||
|
|
||||||
|
GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
|
||||||
|
gpu_pos=0
|
||||||
WORKERS=""
|
WORKERS=""
|
||||||
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
||||||
|
gpu_pos=$((gpu_pos + 1))
|
||||||
log="${TMP_DIR}/gpu-${id}.log"
|
log="${TMP_DIR}/gpu-${id}.log"
|
||||||
gpu_size_mb="${SIZE_MB}"
|
gpu_size_mb="${SIZE_MB}"
|
||||||
if [ "${gpu_size_mb}" -le 0 ] 2>/dev/null; then
|
if [ "${gpu_size_mb}" -le 0 ] 2>/dev/null; then
|
||||||
@@ -77,10 +85,16 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
|
|||||||
gpu_size_mb=512
|
gpu_size_mb=512
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
echo "starting gpu ${id} size=${gpu_size_mb}MB"
|
extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
|
||||||
"${WORKER}" --device "${id}" --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
|
gpu_seconds=$(( SECONDS + extra_sec ))
|
||||||
|
echo "starting gpu ${id} size=${gpu_size_mb}MB seconds=${gpu_seconds}"
|
||||||
|
CUDA_VISIBLE_DEVICES="${id}" \
|
||||||
|
"${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
|
||||||
pid=$!
|
pid=$!
|
||||||
WORKERS="${WORKERS} ${pid}:${id}:${log}"
|
WORKERS="${WORKERS} ${pid}:${id}:${log}"
|
||||||
|
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
|
||||||
|
sleep "${STAGGER_SECONDS}"
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
status=0
|
status=0
|
||||||
|
|||||||
23
iso/overlay/usr/local/bin/bee-john-gpu-stress
Normal file → Executable file
23
iso/overlay/usr/local/bin/bee-john-gpu-stress
Normal file → Executable file
@@ -2,6 +2,7 @@
|
|||||||
set -eu
|
set -eu
|
||||||
|
|
||||||
DURATION_SEC=300
|
DURATION_SEC=300
|
||||||
|
STAGGER_SECONDS=0
|
||||||
DEVICES=""
|
DEVICES=""
|
||||||
EXCLUDE=""
|
EXCLUDE=""
|
||||||
FORMAT=""
|
FORMAT=""
|
||||||
@@ -12,7 +13,7 @@ export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
|
|||||||
export LD_LIBRARY_PATH="/usr/lib:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
|
export LD_LIBRARY_PATH="/usr/lib:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
echo "usage: $0 [--seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
|
echo "usage: $0 [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
|
||||||
exit 2
|
exit 2
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -118,6 +119,7 @@ ensure_opencl_ready() {
|
|||||||
while [ "$#" -gt 0 ]; do
|
while [ "$#" -gt 0 ]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
--seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
|
--seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
|
||||||
|
--stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
|
||||||
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
||||||
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
||||||
--format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
|
--format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
|
||||||
@@ -152,19 +154,25 @@ done
|
|||||||
|
|
||||||
[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }
|
[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }
|
||||||
|
|
||||||
|
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
||||||
|
export CUDA_VISIBLE_DEVICES="${FINAL}"
|
||||||
|
|
||||||
JOHN_DEVICES=""
|
JOHN_DEVICES=""
|
||||||
|
local_id=1
|
||||||
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
||||||
opencl_id=$((id + 1))
|
opencl_id="${local_id}"
|
||||||
if [ -z "${JOHN_DEVICES}" ]; then
|
if [ -z "${JOHN_DEVICES}" ]; then
|
||||||
JOHN_DEVICES="${opencl_id}"
|
JOHN_DEVICES="${opencl_id}"
|
||||||
else
|
else
|
||||||
JOHN_DEVICES="${JOHN_DEVICES},${opencl_id}"
|
JOHN_DEVICES="${JOHN_DEVICES},${opencl_id}"
|
||||||
fi
|
fi
|
||||||
|
local_id=$((local_id + 1))
|
||||||
done
|
done
|
||||||
|
|
||||||
echo "loader=john"
|
echo "loader=john"
|
||||||
echo "selected_gpus=${FINAL}"
|
echo "selected_gpus=${FINAL}"
|
||||||
echo "john_devices=${JOHN_DEVICES}"
|
echo "john_devices=${JOHN_DEVICES}"
|
||||||
|
echo "stagger_seconds=${STAGGER_SECONDS}"
|
||||||
|
|
||||||
cd "${JOHN_DIR}"
|
cd "${JOHN_DIR}"
|
||||||
|
|
||||||
@@ -227,14 +235,21 @@ trap cleanup EXIT INT TERM
|
|||||||
echo "format=${CHOSEN_FORMAT}"
|
echo "format=${CHOSEN_FORMAT}"
|
||||||
echo "target_seconds=${DURATION_SEC}"
|
echo "target_seconds=${DURATION_SEC}"
|
||||||
echo "slice_seconds=${TEST_SLICE_SECONDS}"
|
echo "slice_seconds=${TEST_SLICE_SECONDS}"
|
||||||
DEADLINE=$(( $(date +%s) + DURATION_SEC ))
|
TOTAL_DEVICES=$(echo "${JOHN_DEVICES}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
|
||||||
_first=1
|
_first=1
|
||||||
|
pos=0
|
||||||
for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
|
for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
|
||||||
|
pos=$((pos + 1))
|
||||||
[ "${_first}" = "1" ] || sleep 3
|
[ "${_first}" = "1" ] || sleep 3
|
||||||
_first=0
|
_first=0
|
||||||
run_john_loop "${opencl_id}" "${DEADLINE}" &
|
extra_sec=$(( STAGGER_SECONDS * (TOTAL_DEVICES - pos) ))
|
||||||
|
deadline=$(( $(date +%s) + DURATION_SEC + extra_sec ))
|
||||||
|
run_john_loop "${opencl_id}" "${deadline}" &
|
||||||
pid=$!
|
pid=$!
|
||||||
PIDS="${PIDS} ${pid}"
|
PIDS="${PIDS} ${pid}"
|
||||||
|
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${pos}" -lt "${TOTAL_DEVICES}" ]; then
|
||||||
|
sleep "${STAGGER_SECONDS}"
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
FAIL=0
|
FAIL=0
|
||||||
for pid in ${PIDS}; do
|
for pid in ${PIDS}; do
|
||||||
|
|||||||
@@ -70,6 +70,8 @@ echo "gpu_count=${GPU_COUNT}"
|
|||||||
echo "range=${MIN_BYTES}..${MAX_BYTES}"
|
echo "range=${MIN_BYTES}..${MAX_BYTES}"
|
||||||
echo "iters=${ITERS}"
|
echo "iters=${ITERS}"
|
||||||
|
|
||||||
|
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
||||||
|
|
||||||
deadline=$(( $(date +%s) + SECONDS ))
|
deadline=$(( $(date +%s) + SECONDS ))
|
||||||
round=0
|
round=0
|
||||||
|
|
||||||
|
|||||||
@@ -6,10 +6,28 @@ NVIDIA_KO_DIR="/usr/local/lib/nvidia"
|
|||||||
|
|
||||||
log() { echo "[bee-nvidia] $*"; }
|
log() { echo "[bee-nvidia] $*"; }
|
||||||
|
|
||||||
|
# Print which NVIDIA kernel module flavor is installed: "open" or "proprietary".
# The value is read from /etc/bee-nvidia-modules-flavor with all whitespace
# removed; a missing file or any other content yields "proprietary".
read_nvidia_modules_flavor() {
    if [ ! -f /etc/bee-nvidia-modules-flavor ]; then
        echo "proprietary"
        return
    fi
    flavor="$(tr -d '[:space:]' </etc/bee-nvidia-modules-flavor 2>/dev/null)"
    if [ "$flavor" = "open" ] || [ "$flavor" = "proprietary" ]; then
        echo "$flavor"
        return 0
    fi
    echo "proprietary"
}
|
||||||
|
|
||||||
log "kernel: $(uname -r)"
|
log "kernel: $(uname -r)"
|
||||||
|
|
||||||
# Skip if no NVIDIA GPU present (PCI vendor 10de)
|
# Skip if no NVIDIA display/compute GPU is present.
|
||||||
if ! lspci -nn 2>/dev/null | grep -qi '10de:'; then
|
# Match only display-class PCI functions (0300 VGA, 0302 3D controller) from vendor 10de.
|
||||||
|
# Succeed when lspci lists at least one PCI function with vendor id 10de whose
# class code is 0300 or 0302; other functions from the same vendor are ignored.
have_nvidia_gpu() {
    lspci -Dn 2>/dev/null \
        | awk '$3 ~ /^10de:/ && ($2 == "0300:" || $2 == "0302:") { hit = 1; exit } END { exit !hit }'
}
|
||||||
|
|
||||||
|
if ! have_nvidia_gpu; then
|
||||||
log "no NVIDIA GPU detected — skipping module load"
|
log "no NVIDIA GPU detected — skipping module load"
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
@@ -40,6 +58,8 @@ if [ -z "$nvidia_mode" ]; then
|
|||||||
nvidia_mode="normal"
|
nvidia_mode="normal"
|
||||||
fi
|
fi
|
||||||
log "boot mode: $nvidia_mode"
|
log "boot mode: $nvidia_mode"
|
||||||
|
nvidia_modules_flavor="$(read_nvidia_modules_flavor)"
|
||||||
|
log "modules flavor: $nvidia_modules_flavor"
|
||||||
|
|
||||||
load_module() {
|
load_module() {
|
||||||
mod="$1"
|
mod="$1"
|
||||||
@@ -50,11 +70,93 @@ load_module() {
|
|||||||
log "WARN: not found: $ko"
|
log "WARN: not found: $ko"
|
||||||
return 1
|
return 1
|
||||||
fi
|
fi
|
||||||
if insmod "$ko" "$@"; then
|
if timeout 90 insmod "$ko" "$@"; then
|
||||||
log "loaded: $mod $*"
|
log "loaded: $mod $*"
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
log "WARN: failed to load: $mod"
|
log "WARN: failed to load: $mod (exit $?)"
|
||||||
|
dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
# Succeed when /proc/devices contains a line ending in " nvidiactl", i.e. the
# nvidiactl device is registered with the kernel.
nvidia_is_functional() {
    grep -q -e ' nvidiactl$' /proc/devices 2>/dev/null
}
|
||||||
|
|
||||||
|
# Load nvidia.ko with GSP firmware enabled, falling back to GSP-off on timeout.
#
# insmod is started in the background and success is detected by polling
# nvidia_is_functional once per second for up to 90 iterations. When that
# times out, the module is unloaded and loaded again with
# NVreg_EnableGpuFirmware=0.
#
# The outcome is recorded in /run/bee-nvidia-mode:
#   gsp-on    - module initialised with GSP enabled
#   gsp-off   - GSP init timed out; module loaded with GSP disabled
#   gsp-stuck - GSP init timed out and rmmod failed; a reboot is required
# Other failures leave the file untouched.
#
# Returns 0 when the module is loaded and nvidiactl is registered, 1 otherwise.
load_module_with_gsp_fallback() {
    ko="$NVIDIA_KO_DIR/nvidia.ko"
    if [ ! -f "$ko" ]; then
        log "ERROR: not found: $ko"
        return 1
    fi

    # Run insmod in background — on some converted SXM→PCIe cards GSP enters an
    # infinite crash/reload loop and insmod never returns. We check for successful
    # initialization by polling /proc/devices for nvidiactl instead of waiting for
    # insmod to exit.
    log "loading nvidia (GSP enabled, timeout 90s)"
    insmod "$ko" &
    _insmod_pid=$!

    _waited=0
    while [ $_waited -lt 90 ]; do
        if nvidia_is_functional; then
            log "loaded: nvidia (GSP enabled, ${_waited}s)"
            echo "gsp-on" > /run/bee-nvidia-mode
            return 0
        fi
        # Check if insmod exited with an error before timeout
        if ! kill -0 "$_insmod_pid" 2>/dev/null; then
            # Collect the exit status without letting a non-zero `wait` abort
            # the script should errexit be enabled by the caller.
            _rc=0
            wait "$_insmod_pid" || _rc=$?
            if [ $_rc -ne 0 ]; then
                log "nvidia load failed (exit $_rc)"
                dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
                return 1
            fi
            # insmod exited 0 but nvidiactl not yet in /proc/devices — give it a moment
            sleep 2
            if nvidia_is_functional; then
                log "loaded: nvidia (GSP enabled, ${_waited}s)"
                # Record the mode here as well so that every successful
                # GSP-enabled load leaves the same marker behind.
                echo "gsp-on" > /run/bee-nvidia-mode
                return 0
            fi
            log "insmod exited 0 but nvidiactl missing — treating as failure"
            return 1
        fi
        sleep 1
        _waited=$((_waited + 1))
    done

    # GSP init timed out — kill the hanging insmod and attempt gsp-off fallback
    log "nvidia GSP init timed out after 90s"
    kill "$_insmod_pid" 2>/dev/null || true
    wait "$_insmod_pid" 2>/dev/null || true

    # Attempt to unload the partially-initialized module
    if ! rmmod nvidia 2>/dev/null; then
        # Module is stuck in the kernel — cannot reload with different params.
        # User must reboot and select bee.nvidia.mode=gsp-off at boot menu.
        log "ERROR: rmmod nvidia failed (EBUSY) — module stuck in kernel"
        log "ERROR: reboot and select 'EASY-BEE (advanced) -> GSP=off' in boot menu"
        echo "gsp-stuck" > /run/bee-nvidia-mode
        return 1
    fi

    # Short pause after the unload before loading the module again.
    sleep 2
    log "retrying with NVreg_EnableGpuFirmware=0"
    log "WARNING: GSP disabled — power management will run via CPU path, not GPU firmware"

    if insmod "$ko" NVreg_EnableGpuFirmware=0; then
        if nvidia_is_functional; then
            log "loaded: nvidia (GSP disabled)"
            echo "gsp-off" > /run/bee-nvidia-mode
            return 0
        fi
        log "insmod gsp-off exited 0 but nvidiactl missing"
        return 1
    fi

    log "nvidia load failed (GSP=off)"
    dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
    return 1
}
|
||||||
@@ -68,37 +170,54 @@ load_host_module() {
|
|||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
|
|
||||||
case "$nvidia_mode" in
|
if [ "$nvidia_modules_flavor" = "open" ]; then
|
||||||
normal|full)
|
case "$nvidia_mode" in
|
||||||
if ! load_module nvidia; then
|
gsp-off|safe|nomsi)
|
||||||
exit 1
|
log "ignoring boot mode ${nvidia_mode} for open NVIDIA modules"
|
||||||
fi
|
;;
|
||||||
# nvidia-modeset on some server kernels needs ACPI video helper symbols
|
esac
|
||||||
# exported by the generic "video" module. Best-effort only; compute paths
|
if ! load_module nvidia; then
|
||||||
# remain functional even if display-related modules stay absent.
|
exit 1
|
||||||
load_host_module video || true
|
fi
|
||||||
load_module nvidia-modeset || true
|
# nvidia-modeset on some server kernels needs ACPI video helper symbols
|
||||||
load_module nvidia-uvm || true
|
# exported by the generic "video" module. Best-effort only; compute paths
|
||||||
;;
|
# remain functional even if display-related modules stay absent.
|
||||||
gsp-off|safe)
|
load_host_module video || true
|
||||||
# NVIDIA documents that GSP firmware is enabled by default on newer GPUs and can
|
load_module nvidia-modeset || true
|
||||||
# be disabled via NVreg_EnableGpuFirmware=0. Safe mode keeps the live ISO on the
|
load_module nvidia-uvm || true
|
||||||
# conservative path for platforms where full boot-time GSP init is unstable.
|
else
|
||||||
if ! load_module nvidia NVreg_EnableGpuFirmware=0; then
|
case "$nvidia_mode" in
|
||||||
exit 1
|
normal|full)
|
||||||
fi
|
if ! load_module_with_gsp_fallback; then
|
||||||
log "GSP-off mode: skipping nvidia-modeset and nvidia-uvm during boot"
|
exit 1
|
||||||
;;
|
fi
|
||||||
nomsi|*)
|
# nvidia-modeset on some server kernels needs ACPI video helper symbols
|
||||||
# nomsi: disable MSI-X/MSI interrupts — use when RmInitAdapter fails with
|
# exported by the generic "video" module. Best-effort only; compute paths
|
||||||
# "Failed to enable MSI-X" on one or more GPUs (IOMMU group interrupt limits).
|
# remain functional even if display-related modules stay absent.
|
||||||
# NVreg_EnableMSI=0 forces legacy INTx interrupts for all GPUs.
|
load_host_module video || true
|
||||||
if ! load_module nvidia NVreg_EnableGpuFirmware=0 NVreg_EnableMSI=0; then
|
load_module nvidia-modeset || true
|
||||||
exit 1
|
load_module nvidia-uvm || true
|
||||||
fi
|
;;
|
||||||
log "nomsi mode: MSI-X disabled (NVreg_EnableMSI=0), skipping nvidia-modeset and nvidia-uvm"
|
gsp-off|safe)
|
||||||
;;
|
# NVIDIA documents that GSP firmware is enabled by default on newer GPUs and can
|
||||||
esac
|
# be disabled via NVreg_EnableGpuFirmware=0. Safe mode keeps the live ISO on the
|
||||||
|
# conservative path for platforms where full boot-time GSP init is unstable.
|
||||||
|
if ! load_module nvidia NVreg_EnableGpuFirmware=0; then
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
log "GSP-off mode: skipping nvidia-modeset and nvidia-uvm during boot"
|
||||||
|
;;
|
||||||
|
nomsi|*)
|
||||||
|
# nomsi: disable MSI-X/MSI interrupts — use when RmInitAdapter fails with
|
||||||
|
# "Failed to enable MSI-X" on one or more GPUs (IOMMU group interrupt limits).
|
||||||
|
# NVreg_EnableMSI=0 forces legacy INTx interrupts for all GPUs.
|
||||||
|
if ! load_module nvidia NVreg_EnableGpuFirmware=0 NVreg_EnableMSI=0; then
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
log "nomsi mode: MSI-X disabled (NVreg_EnableMSI=0), skipping nvidia-modeset and nvidia-uvm"
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
fi
|
||||||
|
|
||||||
# Create /dev/nvidia* device nodes (udev rules absent since we use .run installer)
|
# Create /dev/nvidia* device nodes (udev rules absent since we use .run installer)
|
||||||
nvidia_major=$(grep -m1 ' nvidiactl$' /proc/devices | awk '{print $1}')
|
nvidia_major=$(grep -m1 ' nvidiactl$' /proc/devices | awk '{print $1}')
|
||||||
@@ -127,6 +246,18 @@ fi
|
|||||||
ldconfig 2>/dev/null || true
|
ldconfig 2>/dev/null || true
|
||||||
log "ldconfig refreshed"
|
log "ldconfig refreshed"
|
||||||
|
|
||||||
|
# Turn on NVIDIA persistence mode for the whole session so that dcgmi and the
# stress tools do not report deployment warnings on otherwise healthy GPUs.
# Failure is logged as a warning only; it never aborts the script.
if ! command -v nvidia-smi >/dev/null 2>&1; then
    log "WARN: nvidia-smi not found — cannot enable persistence mode"
elif nvidia-smi -pm 1 >/dev/null 2>&1; then
    log "enabled NVIDIA persistence mode"
else
    log "WARN: failed to enable NVIDIA persistence mode"
fi
|
||||||
|
|
||||||
# Start DCGM host engine so dcgmi can discover GPUs.
|
# Start DCGM host engine so dcgmi can discover GPUs.
|
||||||
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
|
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
|
||||||
# If it started too early (for example via systemd before bee-nvidia-load), it can
|
# If it started too early (for example via systemd before bee-nvidia-load), it can
|
||||||
|
|||||||
@@ -7,16 +7,24 @@ xset s off
|
|||||||
xset -dpms
|
xset -dpms
|
||||||
xset s noblank
|
xset s noblank
|
||||||
|
|
||||||
|
# Desktop background: use the bundled wallpaper when it exists, otherwise fall
# back to a solid colour on the root window.
if [ ! -f /usr/share/bee/wallpaper.png ]; then
    xsetroot -solid '#f6c90e'
else
    feh --bg-fill /usr/share/bee/wallpaper.png
fi
|
||||||
|
|
||||||
tint2 &
|
tint2 &
|
||||||
|
|
||||||
# Wait up to 120s for bee-web to bind. The web server starts immediately now
|
# Wait up to 60s for bee-web before opening Chromium.
|
||||||
# (audit is deferred), so this should succeed in a few seconds on most hardware.
|
# Without this Chromium gets connection-refused and shows a blank page.
|
||||||
i=0
|
_i=0
|
||||||
while [ $i -lt 120 ]; do
|
while [ $_i -lt 60 ]; do
|
||||||
if curl -sf http://localhost/healthz >/dev/null 2>&1; then break; fi
|
curl -sf http://localhost/healthz >/dev/null 2>&1 && break
|
||||||
sleep 1
|
sleep 1
|
||||||
i=$((i+1))
|
_i=$((_i+1))
|
||||||
done
|
done
|
||||||
|
unset _i
|
||||||
|
|
||||||
chromium \
|
chromium \
|
||||||
--disable-infobars \
|
--disable-infobars \
|
||||||
@@ -24,7 +32,8 @@ chromium \
|
|||||||
--no-first-run \
|
--no-first-run \
|
||||||
--disable-session-crashed-bubble \
|
--disable-session-crashed-bubble \
|
||||||
--disable-features=TranslateUI \
|
--disable-features=TranslateUI \
|
||||||
|
--user-data-dir=/tmp/bee-chrome \
|
||||||
--start-maximized \
|
--start-maximized \
|
||||||
http://localhost/ &
|
http://localhost/loading &
|
||||||
|
|
||||||
exec openbox
|
exec openbox
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ log() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
have_nvidia_gpu() {
|
have_nvidia_gpu() {
|
||||||
lspci -nn 2>/dev/null | grep -qi '10de:'
|
lspci -Dn 2>/dev/null | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ { found=1; exit } END { exit(found ? 0 : 1) }'
|
||||||
}
|
}
|
||||||
|
|
||||||
service_active() {
|
service_active() {
|
||||||
|
|||||||
Reference in New Issue
Block a user