Compare commits
75 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| dbab43db90 | |||
| bcb7fe5fe9 | |||
| d21d9d191b | |||
| ef45246ea0 | |||
| 348db35119 | |||
| 1dd7f243f5 | |||
| 938e499ac2 | |||
| 964ab39656 | |||
| c2aecc6ce9 | |||
| 439b86ce59 | |||
| eb60100297 | |||
|
|
2baf3be640 | ||
|
|
d92f8f41d0 | ||
|
|
76a9100779 | ||
|
|
1b6d592bf3 | ||
|
|
c95bbff23b | ||
|
|
4e4debd4da | ||
|
|
5839f870b7 | ||
|
|
b447717a5a | ||
|
|
f6f4923ac9 | ||
|
|
c394845b34 | ||
|
|
3472afea32 | ||
|
|
942f11937f | ||
|
|
b5b34983f1 | ||
| 45221d1e9a | |||
| 3869788bac | |||
| 3dbc2184ef | |||
| 60cb8f889a | |||
| c9ee078622 | |||
| ea660500c9 | |||
| d43a9aeec7 | |||
|
|
f5622e351e | ||
|
|
a20806afc8 | ||
|
|
4f9b6b3bcd | ||
|
|
c850b39b01 | ||
|
|
6dee8f3509 | ||
|
|
20f834aa96 | ||
| 105d92df8b | |||
| f96b149875 | |||
| 5ee120158e | |||
| 09fe0e2e9e | |||
| ace1a9dba6 | |||
| 905c581ece | |||
| 7c2a0135d2 | |||
| 407c1cd1c4 | |||
| e15bcc91c5 | |||
| 98f0cf0d52 | |||
| 4db89e9773 | |||
| 3fda18f708 | |||
| ea518abf30 | |||
| 744de588bb | |||
| a3ed9473a3 | |||
| a714c45f10 | |||
| 349e026cfa | |||
| 889fe1dc2f | |||
| befdbf3768 | |||
| ec6a0b292d | |||
| a03312c286 | |||
| e69e9109da | |||
| 413869809d | |||
| f9bd38572a | |||
| 662e3d2cdd | |||
| 126af96780 | |||
| ada15ac777 | |||
| dfb94f9ca6 | |||
| 5857805518 | |||
| 59a1d4b209 | |||
| 0dbfaf6121 | |||
| 5d72d48714 | |||
| 096b4a09ca | |||
| 5d42a92e4c | |||
| 3e54763367 | |||
| f91bce8661 | |||
| 585e6d7311 | |||
| 0a98ed8ae9 |
4
PLAN.md
4
PLAN.md
@@ -343,9 +343,9 @@ Planned code shape:
|
|||||||
- `bee tui` can rerun the audit manually
|
- `bee tui` can rerun the audit manually
|
||||||
- `bee tui` can export the latest audit JSON to removable media
|
- `bee tui` can export the latest audit JSON to removable media
|
||||||
- `bee tui` can show health summary and run NVIDIA/memory/storage acceptance tests
|
- `bee tui` can show health summary and run NVIDIA/memory/storage acceptance tests
|
||||||
- NVIDIA SAT now includes a lightweight in-image GPU stress step via `bee-gpu-stress`
|
- NVIDIA SAT now includes a lightweight in-image GPU stress step via `bee-gpu-burn`
|
||||||
- SAT summaries now expose `overall_status` plus per-job `OK/FAILED/UNSUPPORTED`
|
- SAT summaries now expose `overall_status` plus per-job `OK/FAILED/UNSUPPORTED`
|
||||||
- Memory/GPU SAT runtime defaults can be overridden via `BEE_MEMTESTER_*` and `BEE_GPU_STRESS_*`
|
- Memory SAT runtime defaults can be overridden via `BEE_MEMTESTER_*`
|
||||||
- removable export requires explicit target selection, mount, confirmation, copy, and cleanup
|
- removable export requires explicit target selection, mount, confirmation, copy, and cleanup
|
||||||
|
|
||||||
### 2.6 — Vendor utilities and optional assets
|
### 2.6 — Vendor utilities and optional assets
|
||||||
|
|||||||
@@ -1,11 +1,13 @@
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
"flag"
|
"flag"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os"
|
"os"
|
||||||
|
"runtime/debug"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"bee/audit/internal/app"
|
"bee/audit/internal/app"
|
||||||
@@ -16,6 +18,37 @@ import (
|
|||||||
|
|
||||||
var Version = "dev"
|
var Version = "dev"
|
||||||
|
|
||||||
|
func buildLabel() string {
|
||||||
|
label := strings.TrimSpace(Version)
|
||||||
|
if label == "" {
|
||||||
|
label = "dev"
|
||||||
|
}
|
||||||
|
if info, ok := debug.ReadBuildInfo(); ok {
|
||||||
|
var revision string
|
||||||
|
var modified bool
|
||||||
|
for _, setting := range info.Settings {
|
||||||
|
switch setting.Key {
|
||||||
|
case "vcs.revision":
|
||||||
|
revision = setting.Value
|
||||||
|
case "vcs.modified":
|
||||||
|
modified = setting.Value == "true"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if revision != "" {
|
||||||
|
short := revision
|
||||||
|
if len(short) > 12 {
|
||||||
|
short = short[:12]
|
||||||
|
}
|
||||||
|
label += " (" + short
|
||||||
|
if modified {
|
||||||
|
label += "+"
|
||||||
|
}
|
||||||
|
label += ")"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return label
|
||||||
|
}
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
|
os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
|
||||||
}
|
}
|
||||||
@@ -139,7 +172,6 @@ func runAudit(args []string, stdout, stderr io.Writer) int {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
func runExport(args []string, stdout, stderr io.Writer) int {
|
func runExport(args []string, stdout, stderr io.Writer) int {
|
||||||
fs := flag.NewFlagSet("export", flag.ContinueOnError)
|
fs := flag.NewFlagSet("export", flag.ContinueOnError)
|
||||||
fs.SetOutput(stderr)
|
fs.SetOutput(stderr)
|
||||||
@@ -299,6 +331,7 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
|
|||||||
|
|
||||||
if err := webui.ListenAndServe(*listenAddr, webui.HandlerOptions{
|
if err := webui.ListenAndServe(*listenAddr, webui.HandlerOptions{
|
||||||
Title: *title,
|
Title: *title,
|
||||||
|
BuildLabel: buildLabel(),
|
||||||
AuditPath: *auditPath,
|
AuditPath: *auditPath,
|
||||||
ExportDir: *exportDir,
|
ExportDir: *exportDir,
|
||||||
App: app.New(platform.New()),
|
App: app.New(platform.New()),
|
||||||
@@ -323,6 +356,7 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
|||||||
fs := flag.NewFlagSet("sat", flag.ContinueOnError)
|
fs := flag.NewFlagSet("sat", flag.ContinueOnError)
|
||||||
fs.SetOutput(stderr)
|
fs.SetOutput(stderr)
|
||||||
duration := fs.Int("duration", 0, "stress-ng duration in seconds (cpu only; default: 60)")
|
duration := fs.Int("duration", 0, "stress-ng duration in seconds (cpu only; default: 60)")
|
||||||
|
diagLevel := fs.Int("diag-level", 0, "DCGM diagnostic level for nvidia (1=quick, 2=medium, 3=targeted stress, 4=extended stress; default: 1)")
|
||||||
if err := fs.Parse(args[1:]); err != nil {
|
if err := fs.Parse(args[1:]); err != nil {
|
||||||
if err == flag.ErrHelp {
|
if err == flag.ErrHelp {
|
||||||
return 0
|
return 0
|
||||||
@@ -337,7 +371,7 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
|||||||
target := args[0]
|
target := args[0]
|
||||||
if target != "nvidia" && target != "memory" && target != "storage" && target != "cpu" {
|
if target != "nvidia" && target != "memory" && target != "storage" && target != "cpu" {
|
||||||
fmt.Fprintf(stderr, "bee sat: unknown target %q\n", target)
|
fmt.Fprintf(stderr, "bee sat: unknown target %q\n", target)
|
||||||
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
|
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>] [--diag-level <1-4>]")
|
||||||
return 2
|
return 2
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -346,19 +380,25 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
|||||||
archive string
|
archive string
|
||||||
err error
|
err error
|
||||||
)
|
)
|
||||||
|
logLine := func(s string) { fmt.Fprintln(os.Stderr, s) }
|
||||||
switch target {
|
switch target {
|
||||||
case "nvidia":
|
case "nvidia":
|
||||||
archive, err = application.RunNvidiaAcceptancePack("")
|
level := *diagLevel
|
||||||
|
if level > 0 {
|
||||||
|
_, err = application.RunNvidiaAcceptancePackWithOptions(context.Background(), "", level, nil, logLine)
|
||||||
|
} else {
|
||||||
|
archive, err = application.RunNvidiaAcceptancePack("", logLine)
|
||||||
|
}
|
||||||
case "memory":
|
case "memory":
|
||||||
archive, err = application.RunMemoryAcceptancePack("")
|
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", logLine)
|
||||||
case "storage":
|
case "storage":
|
||||||
archive, err = application.RunStorageAcceptancePack("")
|
archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", logLine)
|
||||||
case "cpu":
|
case "cpu":
|
||||||
dur := *duration
|
dur := *duration
|
||||||
if dur <= 0 {
|
if dur <= 0 {
|
||||||
dur = 60
|
dur = 60
|
||||||
}
|
}
|
||||||
archive, err = application.RunCPUAcceptancePack("", dur)
|
archive, err = application.RunCPUAcceptancePackCtx(context.Background(), "", dur, logLine)
|
||||||
}
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Error("run sat", "target", target, "err", err)
|
slog.Error("run sat", "target", target, "err", err)
|
||||||
|
|||||||
11
audit/go.mod
11
audit/go.mod
@@ -1,6 +1,6 @@
|
|||||||
module bee/audit
|
module bee/audit
|
||||||
|
|
||||||
go 1.24.0
|
go 1.25.0
|
||||||
|
|
||||||
replace reanimator/chart => ../internal/chart
|
replace reanimator/chart => ../internal/chart
|
||||||
|
|
||||||
@@ -13,5 +13,14 @@ require (
|
|||||||
github.com/dustin/go-humanize v1.0.1 // indirect
|
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||||
github.com/go-analyze/bulk v0.1.3 // indirect
|
github.com/go-analyze/bulk v0.1.3 // indirect
|
||||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
|
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
|
||||||
|
github.com/google/uuid v1.6.0 // indirect
|
||||||
|
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||||
|
github.com/ncruces/go-strftime v1.0.0 // indirect
|
||||||
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
||||||
golang.org/x/image v0.24.0 // indirect
|
golang.org/x/image v0.24.0 // indirect
|
||||||
|
golang.org/x/sys v0.42.0 // indirect
|
||||||
|
modernc.org/libc v1.70.0 // indirect
|
||||||
|
modernc.org/mathutil v1.7.1 // indirect
|
||||||
|
modernc.org/memory v1.11.0 // indirect
|
||||||
|
modernc.org/sqlite v1.48.0 // indirect
|
||||||
)
|
)
|
||||||
|
|||||||
19
audit/go.sum
19
audit/go.sum
@@ -8,11 +8,30 @@ github.com/go-analyze/charts v0.5.26 h1:rSwZikLQuFX6cJzwI8OAgaWZneG1kDYxD857ms00
|
|||||||
github.com/go-analyze/charts v0.5.26/go.mod h1:s1YvQhjiSwtLx1f2dOKfiV9x2TT49nVSL6v2rlRpTbY=
|
github.com/go-analyze/charts v0.5.26/go.mod h1:s1YvQhjiSwtLx1f2dOKfiV9x2TT49nVSL6v2rlRpTbY=
|
||||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
|
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
|
||||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
|
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
|
||||||
|
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||||
|
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||||
|
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||||
|
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||||
|
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
||||||
|
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
||||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||||
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
||||||
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
||||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
||||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
||||||
golang.org/x/image v0.24.0 h1:AN7zRgVsbvmTfNyqIbbOraYL8mSwcKncEj8ofjgzcMQ=
|
golang.org/x/image v0.24.0 h1:AN7zRgVsbvmTfNyqIbbOraYL8mSwcKncEj8ofjgzcMQ=
|
||||||
golang.org/x/image v0.24.0/go.mod h1:4b/ITuLfqYq1hqZcjofwctIhi7sZh2WaCjvsBNjjya8=
|
golang.org/x/image v0.24.0/go.mod h1:4b/ITuLfqYq1hqZcjofwctIhi7sZh2WaCjvsBNjjya8=
|
||||||
|
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
|
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
|
||||||
|
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||||
|
modernc.org/libc v1.70.0 h1:U58NawXqXbgpZ/dcdS9kMshu08aiA6b7gusEusqzNkw=
|
||||||
|
modernc.org/libc v1.70.0/go.mod h1:OVmxFGP1CI/Z4L3E0Q3Mf1PDE0BucwMkcXjjLntvHJo=
|
||||||
|
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
|
||||||
|
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
|
||||||
|
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
|
||||||
|
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
|
||||||
|
modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
|
||||||
|
modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
|
||||||
|
|||||||
@@ -53,6 +53,10 @@ type networkManager interface {
|
|||||||
DHCPOne(iface string) (string, error)
|
DHCPOne(iface string) (string, error)
|
||||||
DHCPAll() (string, error)
|
DHCPAll() (string, error)
|
||||||
SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error)
|
SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error)
|
||||||
|
SetInterfaceState(iface string, up bool) error
|
||||||
|
GetInterfaceState(iface string) (bool, error)
|
||||||
|
CaptureNetworkSnapshot() (platform.NetworkSnapshot, error)
|
||||||
|
RestoreNetworkSnapshot(snapshot platform.NetworkSnapshot) error
|
||||||
}
|
}
|
||||||
|
|
||||||
type serviceManager interface {
|
type serviceManager interface {
|
||||||
@@ -75,20 +79,50 @@ type toolManager interface {
|
|||||||
type installer interface {
|
type installer interface {
|
||||||
ListInstallDisks() ([]platform.InstallDisk, error)
|
ListInstallDisks() ([]platform.InstallDisk, error)
|
||||||
InstallToDisk(ctx context.Context, device string, logFile string) error
|
InstallToDisk(ctx context.Context, device string, logFile string) error
|
||||||
|
IsLiveMediaInRAM() bool
|
||||||
|
RunInstallToRAM(ctx context.Context, logFunc func(string)) error
|
||||||
|
}
|
||||||
|
|
||||||
|
type GPUPresenceResult struct {
|
||||||
|
Nvidia bool
|
||||||
|
AMD bool
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) DetectGPUPresence() GPUPresenceResult {
|
||||||
|
vendor := a.sat.DetectGPUVendor()
|
||||||
|
return GPUPresenceResult{
|
||||||
|
Nvidia: vendor == "nvidia",
|
||||||
|
AMD: vendor == "amd",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) IsLiveMediaInRAM() bool {
|
||||||
|
return a.installer.IsLiveMediaInRAM()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||||
|
return a.installer.RunInstallToRAM(ctx, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
type satRunner interface {
|
type satRunner interface {
|
||||||
RunNvidiaAcceptancePack(baseDir string) (string, error)
|
RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
|
||||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int) (string, error)
|
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunMemoryAcceptancePack(baseDir string) (string, error)
|
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
|
||||||
RunStorageAcceptancePack(baseDir string) (string, error)
|
RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
RunCPUAcceptancePack(baseDir string, durationSec int) (string, error)
|
RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
|
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||||
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
||||||
DetectGPUVendor() string
|
DetectGPUVendor() string
|
||||||
ListAMDGPUs() ([]platform.AMDGPUInfo, error)
|
ListAMDGPUs() ([]platform.AMDGPUInfo, error)
|
||||||
RunAMDAcceptancePack(baseDir string) (string, error)
|
RunAMDAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
|
RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
|
RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
|
RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||||
|
RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||||
|
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||||
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
||||||
RunNCCLTests(ctx context.Context, baseDir string) (string, error)
|
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
|
||||||
|
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
type runtimeChecker interface {
|
type runtimeChecker interface {
|
||||||
@@ -108,6 +142,17 @@ func New(platform *platform.System) *App {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results,
|
||||||
|
// and returns the updated JSON. Used by the web UI to serve always-fresh status.
|
||||||
|
func ApplySATOverlay(auditJSON []byte) ([]byte, error) {
|
||||||
|
var snap schema.HardwareIngestRequest
|
||||||
|
if err := json.Unmarshal(auditJSON, &snap); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir)
|
||||||
|
return json.MarshalIndent(snap, "", " ")
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, error) {
|
func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, error) {
|
||||||
if runtimeMode == runtimeenv.ModeLiveCD {
|
if runtimeMode == runtimeenv.ModeLiveCD {
|
||||||
if err := a.runtime.CaptureTechnicalDump(DefaultTechDumpDir); err != nil {
|
if err := a.runtime.CaptureTechnicalDump(DefaultTechDumpDir); err != nil {
|
||||||
@@ -301,6 +346,22 @@ func (a *App) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error) {
|
|||||||
return a.network.SetStaticIPv4(cfg)
|
return a.network.SetStaticIPv4(cfg)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) SetInterfaceState(iface string, up bool) error {
|
||||||
|
return a.network.SetInterfaceState(iface, up)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) GetInterfaceState(iface string) (bool, error) {
|
||||||
|
return a.network.GetInterfaceState(iface)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) CaptureNetworkSnapshot() (platform.NetworkSnapshot, error) {
|
||||||
|
return a.network.CaptureNetworkSnapshot()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RestoreNetworkSnapshot(snapshot platform.NetworkSnapshot) error {
|
||||||
|
return a.network.RestoreNetworkSnapshot(snapshot)
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) SetStaticIPv4Result(cfg platform.StaticIPv4Config) (ActionResult, error) {
|
func (a *App) SetStaticIPv4Result(cfg platform.StaticIPv4Config) (ActionResult, error) {
|
||||||
body, err := a.network.SetStaticIPv4(cfg)
|
body, err := a.network.SetStaticIPv4(cfg)
|
||||||
return ActionResult{Title: "Static IPv4: " + cfg.Interface, Body: bodyOr(body, "Static IPv4 updated.")}, err
|
return ActionResult{Title: "Static IPv4: " + cfg.Interface, Body: bodyOr(body, "Static IPv4 updated.")}, err
|
||||||
@@ -416,15 +477,15 @@ func (a *App) AuditLogTailResult() ActionResult {
|
|||||||
return ActionResult{Title: "Audit log tail", Body: body}
|
return ActionResult{Title: "Audit log tail", Body: body}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
func (a *App) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
return a.sat.RunNvidiaAcceptancePack(baseDir)
|
return a.sat.RunNvidiaAcceptancePack(baseDir, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaAcceptancePackResult(baseDir string) (ActionResult, error) {
|
func (a *App) RunNvidiaAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||||
path, err := a.RunNvidiaAcceptancePack(baseDir)
|
path, err := a.RunNvidiaAcceptancePack(baseDir, nil)
|
||||||
body := "Archive written."
|
body := "Archive written."
|
||||||
if path != "" {
|
if path != "" {
|
||||||
body = "Archive written to " + path
|
body = "Archive written to " + path
|
||||||
@@ -436,11 +497,11 @@ func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
|||||||
return a.sat.ListNvidiaGPUs()
|
return a.sat.ListNvidiaGPUs()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int) (ActionResult, error) {
|
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, diagLevel, gpuIndices)
|
path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, diagLevel, gpuIndices, logFunc)
|
||||||
body := "Archive written."
|
body := "Archive written."
|
||||||
if path != "" {
|
if path != "" {
|
||||||
body = "Archive written to " + path
|
body = "Archive written to " + path
|
||||||
@@ -448,39 +509,62 @@ func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir st
|
|||||||
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
|
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunMemoryAcceptancePack(baseDir string) (string, error) {
|
func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
return a.sat.RunMemoryAcceptancePack(baseDir)
|
return a.sat.RunNvidiaStressPack(ctx, baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||||
path, err := a.RunMemoryAcceptancePack(baseDir)
|
path, err := a.RunMemoryAcceptancePack(baseDir, nil)
|
||||||
return ActionResult{Title: "Memory SAT", Body: satResultBody(path)}, err
|
return ActionResult{Title: "Memory SAT", Body: satResultBody(path)}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) {
|
func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunCPUAcceptancePackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunCPUAcceptancePackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
return a.sat.RunCPUAcceptancePack(baseDir, durationSec)
|
return a.sat.RunCPUAcceptancePack(ctx, baseDir, durationSec, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (ActionResult, error) {
|
func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (ActionResult, error) {
|
||||||
path, err := a.RunCPUAcceptancePack(baseDir, durationSec)
|
path, err := a.RunCPUAcceptancePack(baseDir, durationSec, nil)
|
||||||
return ActionResult{Title: "CPU SAT", Body: satResultBody(path)}, err
|
return ActionResult{Title: "CPU SAT", Body: satResultBody(path)}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunStorageAcceptancePack(baseDir string) (string, error) {
|
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
return a.sat.RunStorageAcceptancePack(baseDir)
|
return a.sat.RunStorageAcceptancePack(ctx, baseDir, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||||
path, err := a.RunStorageAcceptancePack(baseDir)
|
path, err := a.RunStorageAcceptancePack(baseDir, nil)
|
||||||
return ActionResult{Title: "Storage SAT", Body: satResultBody(path)}, err
|
return ActionResult{Title: "Storage SAT", Body: satResultBody(path)}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -492,18 +576,63 @@ func (a *App) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
|
|||||||
return a.sat.ListAMDGPUs()
|
return a.sat.ListAMDGPUs()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunAMDAcceptancePack(baseDir string) (string, error) {
|
func (a *App) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunAMDAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
return a.sat.RunAMDAcceptancePack(baseDir)
|
return a.sat.RunAMDAcceptancePack(ctx, baseDir, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
|
func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||||
path, err := a.RunAMDAcceptancePack(baseDir)
|
path, err := a.RunAMDAcceptancePack(baseDir, nil)
|
||||||
return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
|
return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDMemIntegrityPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunAMDMemIntegrityPack(ctx, baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDMemBandwidthPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunAMDMemBandwidthPack(ctx, baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunMemoryStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunMemoryStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunSATStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunSATStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunAMDStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunMemoryStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.sat.RunMemoryStressPack(ctx, baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunSATStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.sat.RunSATStressPack(ctx, baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunAMDStressPack(ctx, baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) {
|
func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
@@ -511,8 +640,15 @@ func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platfor
|
|||||||
return a.sat.RunFanStressTest(ctx, baseDir, opts)
|
return a.sat.RunFanStressTest(ctx, baseDir, opts)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
||||||
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir)
|
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
|
||||||
body := "Results: " + path
|
body := "Results: " + path
|
||||||
if err != nil && err != context.Canceled {
|
if err != nil && err != context.Canceled {
|
||||||
body += "\nERROR: " + err.Error()
|
body += "\nERROR: " + err.Error()
|
||||||
|
|||||||
@@ -43,6 +43,13 @@ func (f fakeNetwork) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error
|
|||||||
return f.setStaticIPv4Fn(cfg)
|
return f.setStaticIPv4Fn(cfg)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (f fakeNetwork) SetInterfaceState(_ string, _ bool) error { return nil }
|
||||||
|
func (f fakeNetwork) GetInterfaceState(_ string) (bool, error) { return true, nil }
|
||||||
|
func (f fakeNetwork) CaptureNetworkSnapshot() (platform.NetworkSnapshot, error) {
|
||||||
|
return platform.NetworkSnapshot{}, nil
|
||||||
|
}
|
||||||
|
func (f fakeNetwork) RestoreNetworkSnapshot(platform.NetworkSnapshot) error { return nil }
|
||||||
|
|
||||||
type fakeServices struct {
|
type fakeServices struct {
|
||||||
serviceStatusFn func(string) (string, error)
|
serviceStatusFn func(string) (string, error)
|
||||||
serviceDoFn func(string, platform.ServiceAction) (string, error)
|
serviceDoFn func(string, platform.ServiceAction) (string, error)
|
||||||
@@ -113,21 +120,29 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type fakeSAT struct {
|
type fakeSAT struct {
|
||||||
runNvidiaFn func(string) (string, error)
|
runNvidiaFn func(string) (string, error)
|
||||||
runMemoryFn func(string) (string, error)
|
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
||||||
runStorageFn func(string) (string, error)
|
runMemoryFn func(string) (string, error)
|
||||||
runCPUFn func(string, int) (string, error)
|
runStorageFn func(string) (string, error)
|
||||||
detectVendorFn func() string
|
runCPUFn func(string, int) (string, error)
|
||||||
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
detectVendorFn func() string
|
||||||
runAMDPackFn func(string) (string, error)
|
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
||||||
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
runAMDPackFn func(string) (string, error)
|
||||||
|
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
||||||
return f.runNvidiaFn(baseDir)
|
return f.runNvidiaFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir string, _ int, _ []int) (string, error) {
|
func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir string, _ int, _ []int, _ func(string)) (string, error) {
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaStressFn != nil {
|
||||||
|
return f.runNvidiaStressFn(baseDir, opts)
|
||||||
|
}
|
||||||
return f.runNvidiaFn(baseDir)
|
return f.runNvidiaFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -138,15 +153,15 @@ func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
|||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunMemoryAcceptancePack(baseDir string) (string, error) {
|
func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
||||||
return f.runMemoryFn(baseDir)
|
return f.runMemoryFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunStorageAcceptancePack(baseDir string) (string, error) {
|
func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
||||||
return f.runStorageFn(baseDir)
|
return f.runStorageFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) {
|
func (f fakeSAT) RunCPUAcceptancePack(_ context.Context, baseDir string, durationSec int, _ func(string)) (string, error) {
|
||||||
if f.runCPUFn != nil {
|
if f.runCPUFn != nil {
|
||||||
return f.runCPUFn(baseDir, durationSec)
|
return f.runCPUFn(baseDir, durationSec)
|
||||||
}
|
}
|
||||||
@@ -167,18 +182,40 @@ func (f fakeSAT) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
|
|||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunAMDAcceptancePack(baseDir string) (string, error) {
|
func (f fakeSAT) RunAMDAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
||||||
if f.runAMDPackFn != nil {
|
if f.runAMDPackFn != nil {
|
||||||
return f.runAMDPackFn(baseDir)
|
return f.runAMDPackFn(baseDir)
|
||||||
}
|
}
|
||||||
return "", nil
|
return "", nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunAMDMemIntegrityPack(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunAMDMemBandwidthPack(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunAMDStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
func (f fakeSAT) RunMemoryStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
func (f fakeSAT) RunSATStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStressOptions) (string, error) {
|
func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStressOptions) (string, error) {
|
||||||
return "", nil
|
return "", nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string) (string, error) {
|
func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.PlatformStressOptions, _ func(string)) (string, error) {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||||
return "", nil
|
return "", nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -574,13 +611,13 @@ func TestRunSATDefaultsToExportDir(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
if _, err := a.RunNvidiaAcceptancePack(""); err != nil {
|
if _, err := a.RunNvidiaAcceptancePack("", nil); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
if _, err := a.RunMemoryAcceptancePack(""); err != nil {
|
if _, err := a.RunMemoryAcceptancePack("", nil); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
if _, err := a.RunStorageAcceptancePack(""); err != nil {
|
if _, err := a.RunStorageAcceptancePack("", nil); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -141,9 +141,11 @@ func satSummaryStatus(summary satSummary, label string) (string, string, bool) {
|
|||||||
func satKeyStatus(rawStatus, label string) (string, string, bool) {
|
func satKeyStatus(rawStatus, label string) (string, string, bool) {
|
||||||
switch strings.ToUpper(strings.TrimSpace(rawStatus)) {
|
switch strings.ToUpper(strings.TrimSpace(rawStatus)) {
|
||||||
case "OK":
|
case "OK":
|
||||||
return "OK", label + " passed", true
|
// No error description on success — error_description is for problems only.
|
||||||
|
return "OK", "", true
|
||||||
case "PARTIAL", "UNSUPPORTED", "CANCELED", "CANCELLED":
|
case "PARTIAL", "UNSUPPORTED", "CANCELED", "CANCELLED":
|
||||||
return "Warning", label + " incomplete", true
|
// Tool couldn't run or test was incomplete — we can't assert hardware health.
|
||||||
|
return "Unknown", "", true
|
||||||
case "FAILED":
|
case "FAILED":
|
||||||
return "Critical", label + " failed", true
|
return "Critical", label + " failed", true
|
||||||
default:
|
default:
|
||||||
@@ -180,6 +182,8 @@ func statusSeverity(status string) int {
|
|||||||
return 2
|
return 2
|
||||||
case "OK":
|
case "OK":
|
||||||
return 1
|
return 1
|
||||||
|
case "Unknown":
|
||||||
|
return 1 // same as OK — does not override OK from another source
|
||||||
default:
|
default:
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -36,6 +36,8 @@ var supportBundleCommands = []struct {
|
|||||||
{name: "system/dmesg-tail.txt", cmd: []string{"sh", "-c", "dmesg | tail -n 200"}},
|
{name: "system/dmesg-tail.txt", cmd: []string{"sh", "-c", "dmesg | tail -n 200"}},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const supportBundleGlob = "bee-support-*.tar.gz"
|
||||||
|
|
||||||
func BuildSupportBundle(exportDir string) (string, error) {
|
func BuildSupportBundle(exportDir string) (string, error) {
|
||||||
exportDir = strings.TrimSpace(exportDir)
|
exportDir = strings.TrimSpace(exportDir)
|
||||||
if exportDir == "" {
|
if exportDir == "" {
|
||||||
@@ -86,34 +88,64 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
|||||||
return archivePath, nil
|
return archivePath, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func LatestSupportBundlePath() (string, error) {
|
||||||
|
return latestSupportBundlePath(os.TempDir())
|
||||||
|
}
|
||||||
|
|
||||||
func cleanupOldSupportBundles(dir string) error {
|
func cleanupOldSupportBundles(dir string) error {
|
||||||
matches, err := filepath.Glob(filepath.Join(dir, "bee-support-*.tar.gz"))
|
matches, err := filepath.Glob(filepath.Join(dir, supportBundleGlob))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
type entry struct {
|
entries := supportBundleEntries(matches)
|
||||||
path string
|
for path, mod := range entries {
|
||||||
mod time.Time
|
if time.Since(mod) > 24*time.Hour {
|
||||||
|
_ = os.Remove(path)
|
||||||
|
delete(entries, path)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
list := make([]entry, 0, len(matches))
|
ordered := orderSupportBundles(entries)
|
||||||
|
if len(ordered) > 3 {
|
||||||
|
for _, old := range ordered[3:] {
|
||||||
|
_ = os.Remove(old)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func latestSupportBundlePath(dir string) (string, error) {
|
||||||
|
matches, err := filepath.Glob(filepath.Join(dir, supportBundleGlob))
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
ordered := orderSupportBundles(supportBundleEntries(matches))
|
||||||
|
if len(ordered) == 0 {
|
||||||
|
return "", os.ErrNotExist
|
||||||
|
}
|
||||||
|
return ordered[0], nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func supportBundleEntries(matches []string) map[string]time.Time {
|
||||||
|
entries := make(map[string]time.Time, len(matches))
|
||||||
for _, match := range matches {
|
for _, match := range matches {
|
||||||
info, err := os.Stat(match)
|
info, err := os.Stat(match)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if time.Since(info.ModTime()) > 24*time.Hour {
|
entries[match] = info.ModTime()
|
||||||
_ = os.Remove(match)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
list = append(list, entry{path: match, mod: info.ModTime()})
|
|
||||||
}
|
}
|
||||||
sort.Slice(list, func(i, j int) bool { return list[i].mod.After(list[j].mod) })
|
return entries
|
||||||
if len(list) > 3 {
|
}
|
||||||
for _, old := range list[3:] {
|
|
||||||
_ = os.Remove(old.path)
|
func orderSupportBundles(entries map[string]time.Time) []string {
|
||||||
}
|
ordered := make([]string, 0, len(entries))
|
||||||
|
for path := range entries {
|
||||||
|
ordered = append(ordered, path)
|
||||||
}
|
}
|
||||||
return nil
|
sort.Slice(ordered, func(i, j int) bool {
|
||||||
|
return entries[ordered[i]].After(entries[ordered[j]])
|
||||||
|
})
|
||||||
|
return ordered
|
||||||
}
|
}
|
||||||
|
|
||||||
func writeJournalDump(dst string) error {
|
func writeJournalDump(dst string) error {
|
||||||
|
|||||||
@@ -13,14 +13,18 @@ import (
|
|||||||
const nvidiaVendorID = 0x10de
|
const nvidiaVendorID = 0x10de
|
||||||
|
|
||||||
type nvidiaGPUInfo struct {
|
type nvidiaGPUInfo struct {
|
||||||
BDF string
|
BDF string
|
||||||
Serial string
|
Serial string
|
||||||
VBIOS string
|
VBIOS string
|
||||||
TemperatureC *float64
|
TemperatureC *float64
|
||||||
PowerW *float64
|
PowerW *float64
|
||||||
ECCUncorrected *int64
|
ECCUncorrected *int64
|
||||||
ECCCorrected *int64
|
ECCCorrected *int64
|
||||||
HWSlowdown *bool
|
HWSlowdown *bool
|
||||||
|
PCIeLinkGenCurrent *int
|
||||||
|
PCIeLinkGenMax *int
|
||||||
|
PCIeLinkWidthCur *int
|
||||||
|
PCIeLinkWidthMax *int
|
||||||
}
|
}
|
||||||
|
|
||||||
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
|
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
|
||||||
@@ -94,7 +98,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
|||||||
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
|
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
|
||||||
out, err := exec.Command(
|
out, err := exec.Command(
|
||||||
"nvidia-smi",
|
"nvidia-smi",
|
||||||
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown",
|
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
|
||||||
"--format=csv,noheader,nounits",
|
"--format=csv,noheader,nounits",
|
||||||
).Output()
|
).Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -118,8 +122,8 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
|||||||
if len(rec) == 0 {
|
if len(rec) == 0 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if len(rec) < 9 {
|
if len(rec) < 13 {
|
||||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 9", len(rec))
|
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 13", len(rec))
|
||||||
}
|
}
|
||||||
|
|
||||||
bdf := normalizePCIeBDF(rec[1])
|
bdf := normalizePCIeBDF(rec[1])
|
||||||
@@ -128,14 +132,18 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
info := nvidiaGPUInfo{
|
info := nvidiaGPUInfo{
|
||||||
BDF: bdf,
|
BDF: bdf,
|
||||||
Serial: strings.TrimSpace(rec[2]),
|
Serial: strings.TrimSpace(rec[2]),
|
||||||
VBIOS: strings.TrimSpace(rec[3]),
|
VBIOS: strings.TrimSpace(rec[3]),
|
||||||
TemperatureC: parseMaybeFloat(rec[4]),
|
TemperatureC: parseMaybeFloat(rec[4]),
|
||||||
PowerW: parseMaybeFloat(rec[5]),
|
PowerW: parseMaybeFloat(rec[5]),
|
||||||
ECCUncorrected: parseMaybeInt64(rec[6]),
|
ECCUncorrected: parseMaybeInt64(rec[6]),
|
||||||
ECCCorrected: parseMaybeInt64(rec[7]),
|
ECCCorrected: parseMaybeInt64(rec[7]),
|
||||||
HWSlowdown: parseMaybeBool(rec[8]),
|
HWSlowdown: parseMaybeBool(rec[8]),
|
||||||
|
PCIeLinkGenCurrent: parseMaybeInt(rec[9]),
|
||||||
|
PCIeLinkGenMax: parseMaybeInt(rec[10]),
|
||||||
|
PCIeLinkWidthCur: parseMaybeInt(rec[11]),
|
||||||
|
PCIeLinkWidthMax: parseMaybeInt(rec[12]),
|
||||||
}
|
}
|
||||||
result[bdf] = info
|
result[bdf] = info
|
||||||
}
|
}
|
||||||
@@ -167,6 +175,22 @@ func parseMaybeInt64(v string) *int64 {
|
|||||||
return &n
|
return &n
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// parseMaybeInt converts an nvidia-smi CSV cell to an int pointer.
//
// Surrounding whitespace is ignored. It returns nil for an empty cell, for the
// placeholders "n/a", "not supported" and "[not supported]" (compared
// case-insensitively), and for anything strconv.Atoi rejects.
func parseMaybeInt(v string) *int {
	cell := strings.TrimSpace(v)
	if cell == "" {
		return nil
	}
	for _, placeholder := range []string{"n/a", "not supported", "[not supported]"} {
		if strings.EqualFold(cell, placeholder) {
			return nil
		}
	}
	if parsed, convErr := strconv.Atoi(cell); convErr == nil {
		return &parsed
	}
	return nil
}
|
||||||
|
|
||||||
|
// pcieLinkGenLabel renders a PCIe generation number as "Gen<N>", e.g. 4
// becomes "Gen4". The number is formatted as-is; no range check is applied.
func pcieLinkGenLabel(gen int) string {
	const prefix = "Gen"
	// fmt.Sprint inserts no space here because one operand is a string.
	return fmt.Sprint(prefix, gen)
}
|
||||||
|
|
||||||
func parseMaybeBool(v string) *bool {
|
func parseMaybeBool(v string) *bool {
|
||||||
v = strings.TrimSpace(strings.ToLower(v))
|
v = strings.TrimSpace(strings.ToLower(v))
|
||||||
switch v {
|
switch v {
|
||||||
@@ -231,4 +255,22 @@ func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
|||||||
if info.HWSlowdown != nil {
|
if info.HWSlowdown != nil {
|
||||||
dev.HWSlowdown = info.HWSlowdown
|
dev.HWSlowdown = info.HWSlowdown
|
||||||
}
|
}
|
||||||
|
// Override PCIe link speed/width with nvidia-smi driver values.
|
||||||
|
// sysfs current_link_speed reflects the instantaneous physical link state and
|
||||||
|
// can show Gen1 when the GPU is idle due to ASPM power management. The driver
|
||||||
|
// knows the negotiated speed regardless of the current power state.
|
||||||
|
if info.PCIeLinkGenCurrent != nil {
|
||||||
|
s := pcieLinkGenLabel(*info.PCIeLinkGenCurrent)
|
||||||
|
dev.LinkSpeed = &s
|
||||||
|
}
|
||||||
|
if info.PCIeLinkGenMax != nil {
|
||||||
|
s := pcieLinkGenLabel(*info.PCIeLinkGenMax)
|
||||||
|
dev.MaxLinkSpeed = &s
|
||||||
|
}
|
||||||
|
if info.PCIeLinkWidthCur != nil {
|
||||||
|
dev.LinkWidth = info.PCIeLinkWidthCur
|
||||||
|
}
|
||||||
|
if info.PCIeLinkWidthMax != nil {
|
||||||
|
dev.MaxLinkWidth = info.PCIeLinkWidthMax
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func TestParseNVIDIASMIQuery(t *testing.T) {
|
func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||||
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active\n"
|
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
|
||||||
byBDF, err := parseNVIDIASMIQuery(raw)
|
byBDF, err := parseNVIDIASMIQuery(raw)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("parse failed: %v", err)
|
t.Fatalf("parse failed: %v", err)
|
||||||
@@ -28,6 +28,12 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
|
|||||||
if gpu.HWSlowdown == nil || *gpu.HWSlowdown {
|
if gpu.HWSlowdown == nil || *gpu.HWSlowdown {
|
||||||
t.Fatalf("hw slowdown: got %v, want false", gpu.HWSlowdown)
|
t.Fatalf("hw slowdown: got %v, want false", gpu.HWSlowdown)
|
||||||
}
|
}
|
||||||
|
if gpu.PCIeLinkGenCurrent == nil || *gpu.PCIeLinkGenCurrent != 4 {
|
||||||
|
t.Fatalf("pcie link gen current: got %v, want 4", gpu.PCIeLinkGenCurrent)
|
||||||
|
}
|
||||||
|
if gpu.PCIeLinkGenMax == nil || *gpu.PCIeLinkGenMax != 4 {
|
||||||
|
t.Fatalf("pcie link gen max: got %v, want 4", gpu.PCIeLinkGenMax)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestNormalizePCIeBDF(t *testing.T) {
|
func TestNormalizePCIeBDF(t *testing.T) {
|
||||||
|
|||||||
@@ -77,11 +77,24 @@ func discoverStorageDevices() []lsblkDevice {
|
|||||||
if dev.Type != "disk" {
|
if dev.Type != "disk" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
if isVirtualBMCDisk(dev) {
|
||||||
|
slog.Debug("storage: skipping BMC virtual disk", "name", dev.Name, "model", dev.Model)
|
||||||
|
continue
|
||||||
|
}
|
||||||
disks = append(disks, dev)
|
disks = append(disks, dev)
|
||||||
}
|
}
|
||||||
return disks
|
return disks
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// isVirtualBMCDisk returns true for BMC/IPMI virtual USB mass storage devices
|
||||||
|
// that appear as disks but are not real hardware (e.g. iDRAC Virtual HDisk*).
|
||||||
|
// These have zero reported size, a generic fake serial, and a model name that
|
||||||
|
// starts with "Virtual HDisk".
|
||||||
|
func isVirtualBMCDisk(dev lsblkDevice) bool {
|
||||||
|
model := strings.ToLower(strings.TrimSpace(dev.Model))
|
||||||
|
return strings.HasPrefix(model, "virtual hdisk")
|
||||||
|
}
|
||||||
|
|
||||||
func lsblkDevices() []lsblkDevice {
|
func lsblkDevices() []lsblkDevice {
|
||||||
out, err := exec.Command("lsblk", "-J", "-d",
|
out, err := exec.Command("lsblk", "-J", "-d",
|
||||||
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
|
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
|
||||||
|
|||||||
@@ -76,6 +76,66 @@ func SampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
|||||||
return sampleGPUMetrics(gpuIndices)
|
return sampleGPUMetrics(gpuIndices)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// sampleAMDGPUMetrics queries rocm-smi for live GPU metrics.
|
||||||
|
func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
|
||||||
|
out, err := runROCmSMI("--showtemp", "--showuse", "--showpower", "--showmemuse", "--csv")
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
lines := strings.Split(strings.TrimSpace(string(out)), "\n")
|
||||||
|
if len(lines) < 2 {
|
||||||
|
return nil, fmt.Errorf("rocm-smi: insufficient output")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse header to find column indices by name.
|
||||||
|
headers := strings.Split(lines[0], ",")
|
||||||
|
colIdx := func(keywords ...string) int {
|
||||||
|
for i, h := range headers {
|
||||||
|
hl := strings.ToLower(strings.TrimSpace(h))
|
||||||
|
for _, kw := range keywords {
|
||||||
|
if strings.Contains(hl, kw) {
|
||||||
|
return i
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
idxTemp := colIdx("sensor edge", "temperature (c)", "temp")
|
||||||
|
idxUse := colIdx("gpu use (%)")
|
||||||
|
idxMem := colIdx("vram%", "memory allocated")
|
||||||
|
idxPow := colIdx("average graphics package power", "power (w)")
|
||||||
|
|
||||||
|
var rows []GPUMetricRow
|
||||||
|
for _, line := range lines[1:] {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if line == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
parts := strings.Split(line, ",")
|
||||||
|
idx := len(rows)
|
||||||
|
row := GPUMetricRow{GPUIndex: idx}
|
||||||
|
get := func(i int) float64 {
|
||||||
|
if i < 0 || i >= len(parts) {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
v := strings.TrimSpace(parts[i])
|
||||||
|
if strings.EqualFold(v, "n/a") {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return parseGPUFloat(v)
|
||||||
|
}
|
||||||
|
row.TempC = get(idxTemp)
|
||||||
|
row.UsagePct = get(idxUse)
|
||||||
|
row.MemUsagePct = get(idxMem)
|
||||||
|
row.PowerW = get(idxPow)
|
||||||
|
rows = append(rows, row)
|
||||||
|
}
|
||||||
|
if len(rows) == 0 {
|
||||||
|
return nil, fmt.Errorf("rocm-smi: no GPU rows parsed")
|
||||||
|
}
|
||||||
|
return rows, nil
|
||||||
|
}
|
||||||
|
|
||||||
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
||||||
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
||||||
var b bytes.Buffer
|
var b bytes.Buffer
|
||||||
|
|||||||
191
audit/internal/platform/install_to_ram.go
Normal file
191
audit/internal/platform/install_to_ram.go
Normal file
@@ -0,0 +1,191 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (s *System) IsLiveMediaInRAM() bool {
|
||||||
|
out, err := exec.Command("findmnt", "-n", "-o", "FSTYPE", "/run/live/medium").Output()
|
||||||
|
if err != nil {
|
||||||
|
return toramActive()
|
||||||
|
}
|
||||||
|
return strings.TrimSpace(string(out)) == "tmpfs"
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||||
|
log := func(msg string) {
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(msg)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if s.IsLiveMediaInRAM() {
|
||||||
|
log("Already running from RAM — installation media can be safely disconnected.")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
|
||||||
|
if err != nil || len(squashfsFiles) == 0 {
|
||||||
|
return fmt.Errorf("no squashfs files found in /run/live/medium/live/")
|
||||||
|
}
|
||||||
|
|
||||||
|
free := freeMemBytes()
|
||||||
|
var needed int64
|
||||||
|
for _, sf := range squashfsFiles {
|
||||||
|
fi, err2 := os.Stat(sf)
|
||||||
|
if err2 != nil {
|
||||||
|
return fmt.Errorf("stat %s: %v", sf, err2)
|
||||||
|
}
|
||||||
|
needed += fi.Size()
|
||||||
|
}
|
||||||
|
const headroom = 256 * 1024 * 1024
|
||||||
|
if free > 0 && needed+headroom > free {
|
||||||
|
return fmt.Errorf("insufficient RAM: need %s, available %s",
|
||||||
|
humanBytes(needed+headroom), humanBytes(free))
|
||||||
|
}
|
||||||
|
|
||||||
|
dstDir := "/dev/shm/bee-live"
|
||||||
|
if err := os.MkdirAll(dstDir, 0755); err != nil {
|
||||||
|
return fmt.Errorf("create tmpfs dir: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, sf := range squashfsFiles {
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
base := filepath.Base(sf)
|
||||||
|
dst := filepath.Join(dstDir, base)
|
||||||
|
log(fmt.Sprintf("Copying %s to RAM...", base))
|
||||||
|
if err := copyFileLarge(ctx, sf, dst, log); err != nil {
|
||||||
|
return fmt.Errorf("copy %s: %v", base, err)
|
||||||
|
}
|
||||||
|
log(fmt.Sprintf("Copied %s.", base))
|
||||||
|
|
||||||
|
loopDev, err := findLoopForFile(sf)
|
||||||
|
if err != nil {
|
||||||
|
log(fmt.Sprintf("Loop device for %s not found (%v) — skipping re-association.", base, err))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if err := reassociateLoopDevice(loopDev, dst); err != nil {
|
||||||
|
log(fmt.Sprintf("Warning: could not re-associate %s → %s: %v", loopDev, dst, err))
|
||||||
|
} else {
|
||||||
|
log(fmt.Sprintf("Loop device %s now backed by RAM copy.", loopDev))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
log("Copying remaining medium files...")
|
||||||
|
if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
|
||||||
|
log(fmt.Sprintf("Warning: partial copy: %v", err))
|
||||||
|
}
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := exec.Command("mount", "--bind", dstDir, "/run/live/medium").Run(); err != nil {
|
||||||
|
log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err))
|
||||||
|
}
|
||||||
|
|
||||||
|
log("Done. Installation media can be safely disconnected.")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) error {
|
||||||
|
in, err := os.Open(src)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer in.Close()
|
||||||
|
fi, err := in.Stat()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
out, err := os.Create(dst)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer out.Close()
|
||||||
|
total := fi.Size()
|
||||||
|
var copied int64
|
||||||
|
buf := make([]byte, 4*1024*1024)
|
||||||
|
for {
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
n, err := in.Read(buf)
|
||||||
|
if n > 0 {
|
||||||
|
if _, werr := out.Write(buf[:n]); werr != nil {
|
||||||
|
return werr
|
||||||
|
}
|
||||||
|
copied += int64(n)
|
||||||
|
if logFunc != nil && total > 0 {
|
||||||
|
pct := int(float64(copied) / float64(total) * 100)
|
||||||
|
logFunc(fmt.Sprintf(" %s / %s (%d%%)", humanBytes(copied), humanBytes(total), pct))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err == io.EOF {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out.Sync()
|
||||||
|
}
|
||||||
|
|
||||||
|
func cpDir(ctx context.Context, src, dst string, logFunc func(string)) error {
|
||||||
|
return filepath.Walk(src, func(path string, fi os.FileInfo, err error) error {
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
return ctx.Err()
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
rel, _ := filepath.Rel(src, path)
|
||||||
|
target := filepath.Join(dst, rel)
|
||||||
|
if fi.IsDir() {
|
||||||
|
return os.MkdirAll(target, fi.Mode())
|
||||||
|
}
|
||||||
|
if strings.HasSuffix(path, ".squashfs") {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(target); err == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return copyFileLarge(ctx, path, target, nil)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// findLoopForFile returns the name of the loop device whose backing file is
// exactly backingFile.
//
// It runs `losetup --list --json` and compares each entry's "back-file" field
// to backingFile as a plain string. It returns the command or JSON decoding
// error if either fails, and a descriptive error when no entry matches.
func findLoopForFile(backingFile string) (string, error) {
	type loopEntry struct {
		Name     string `json:"name"`
		BackFile string `json:"back-file"`
	}
	var listing struct {
		Loopdevices []loopEntry `json:"loopdevices"`
	}

	raw, err := exec.Command("losetup", "--list", "--json").Output()
	if err != nil {
		return "", err
	}
	if err = json.Unmarshal(raw, &listing); err != nil {
		return "", err
	}

	for i := range listing.Loopdevices {
		if entry := listing.Loopdevices[i]; entry.BackFile == backingFile {
			return entry.Name, nil
		}
	}
	return "", fmt.Errorf("no loop device found for %s", backingFile)
}
|
||||||
|
|
||||||
|
func reassociateLoopDevice(loopDev, newFile string) error {
|
||||||
|
if err := exec.Command("losetup", "--replace", loopDev, newFile).Run(); err == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return loopChangeFD(loopDev, newFile)
|
||||||
|
}
|
||||||
28
audit/internal/platform/install_to_ram_linux.go
Normal file
28
audit/internal/platform/install_to_ram_linux.go
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
//go:build linux
|
||||||
|
|
||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"syscall"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ioctlLoopChangeFD is the LOOP_CHANGE_FD request number from <linux/loop.h>.
// (0x4C08 is LOOP_SET_DIRECT_IO, which would toggle direct I/O and report
// success without ever switching the backing file.)
const ioctlLoopChangeFD = 0x4C06

// loopChangeFD asks the kernel to swap the backing file of loopDev for
// newFile using the LOOP_CHANGE_FD ioctl.
//
// loopDev is opened read-write and newFile read-only; both are closed before
// returning. It returns the open error if either file cannot be opened, or the
// errno reported by the ioctl.
func loopChangeFD(loopDev, newFile string) error {
	lf, err := os.OpenFile(loopDev, os.O_RDWR, 0)
	if err != nil {
		return err
	}
	defer lf.Close()

	nf, err := os.OpenFile(newFile, os.O_RDONLY, 0)
	if err != nil {
		return err
	}
	defer nf.Close()

	// The ioctl argument is the descriptor of the new backing file.
	if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, lf.Fd(), ioctlLoopChangeFD, nf.Fd()); errno != 0 {
		return errno
	}
	return nil
}
|
||||||
9
audit/internal/platform/install_to_ram_other.go
Normal file
9
audit/internal/platform/install_to_ram_other.go
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
//go:build !linux
|
||||||
|
|
||||||
|
package platform
|
||||||
|
|
||||||
|
import "errors"
|
||||||
|
|
||||||
|
// loopChangeFD is the non-Linux placeholder for the LOOP_CHANGE_FD ioctl. It
// ignores both arguments and always returns an error.
func loopChangeFD(loopDev, newFile string) error {
	const msg = "LOOP_CHANGE_FD not available on this platform"
	return errors.New(msg)
}
|
||||||
@@ -2,7 +2,10 @@ package platform
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
|
"encoding/json"
|
||||||
"os"
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
@@ -23,6 +26,7 @@ type LiveMetricSample struct {
|
|||||||
// TempReading is a named temperature sensor value.
|
// TempReading is a named temperature sensor value.
|
||||||
type TempReading struct {
|
type TempReading struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
|
Group string `json:"group,omitempty"`
|
||||||
Celsius float64 `json:"celsius"`
|
Celsius float64 `json:"celsius"`
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -32,18 +36,22 @@ type TempReading struct {
|
|||||||
func SampleLiveMetrics() LiveMetricSample {
|
func SampleLiveMetrics() LiveMetricSample {
|
||||||
s := LiveMetricSample{Timestamp: time.Now().UTC()}
|
s := LiveMetricSample{Timestamp: time.Now().UTC()}
|
||||||
|
|
||||||
// GPU metrics — skipped silently if nvidia-smi unavailable
|
// GPU metrics — try NVIDIA first, fall back to AMD
|
||||||
gpus, _ := SampleGPUMetrics(nil)
|
if gpus, err := SampleGPUMetrics(nil); err == nil && len(gpus) > 0 {
|
||||||
s.GPUs = gpus
|
s.GPUs = gpus
|
||||||
|
} else if amdGPUs, err := sampleAMDGPUMetrics(); err == nil && len(amdGPUs) > 0 {
|
||||||
|
s.GPUs = amdGPUs
|
||||||
|
}
|
||||||
|
|
||||||
// Fan speeds — skipped silently if ipmitool unavailable
|
// Fan speeds — skipped silently if ipmitool unavailable
|
||||||
fans, _ := sampleFanSpeeds()
|
fans, _ := sampleFanSpeeds()
|
||||||
s.Fans = fans
|
s.Fans = fans
|
||||||
|
|
||||||
// CPU/system temperature — returns 0 if unavailable
|
s.Temps = append(s.Temps, sampleLiveTemperatureReadings()...)
|
||||||
cpuTemp := sampleCPUMaxTemp()
|
if !hasTempGroup(s.Temps, "cpu") {
|
||||||
if cpuTemp > 0 {
|
if cpuTemp := sampleCPUMaxTemp(); cpuTemp > 0 {
|
||||||
s.Temps = append(s.Temps, TempReading{Name: "CPU", Celsius: cpuTemp})
|
s.Temps = append(s.Temps, TempReading{Name: "CPU Max", Group: "cpu", Celsius: cpuTemp})
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// System power — returns 0 if unavailable
|
// System power — returns 0 if unavailable
|
||||||
@@ -60,18 +68,20 @@ func SampleLiveMetrics() LiveMetricSample {
|
|||||||
|
|
||||||
// sampleCPULoadPct reads two /proc/stat snapshots 200ms apart and returns
|
// sampleCPULoadPct reads two /proc/stat snapshots 200ms apart and returns
|
||||||
// the overall CPU utilisation percentage.
|
// the overall CPU utilisation percentage.
|
||||||
var cpuStatPrev [2]uint64 // [total, idle]
|
|
||||||
|
|
||||||
func sampleCPULoadPct() float64 {
|
func sampleCPULoadPct() float64 {
|
||||||
total, idle := readCPUStat()
|
total0, idle0 := readCPUStat()
|
||||||
if total == 0 {
|
if total0 == 0 {
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
prevTotal, prevIdle := cpuStatPrev[0], cpuStatPrev[1]
|
time.Sleep(200 * time.Millisecond)
|
||||||
cpuStatPrev = [2]uint64{total, idle}
|
total1, idle1 := readCPUStat()
|
||||||
if prevTotal == 0 {
|
if total1 == 0 {
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
return cpuLoadPctBetween(total0, idle0, total1, idle1)
|
||||||
|
}
|
||||||
|
|
||||||
|
func cpuLoadPctBetween(prevTotal, prevIdle, total, idle uint64) float64 {
|
||||||
dt := float64(total - prevTotal)
|
dt := float64(total - prevTotal)
|
||||||
di := float64(idle - prevIdle)
|
di := float64(idle - prevIdle)
|
||||||
if dt <= 0 {
|
if dt <= 0 {
|
||||||
@@ -137,3 +147,182 @@ func sampleMemLoadPct() float64 {
|
|||||||
used := total - avail
|
used := total - avail
|
||||||
return float64(used) / float64(total) * 100
|
return float64(used) / float64(total) * 100
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func hasTempGroup(temps []TempReading, group string) bool {
|
||||||
|
for _, t := range temps {
|
||||||
|
if t.Group == group {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleLiveTemperatureReadings() []TempReading {
|
||||||
|
if temps := sampleLiveTempsViaSensorsJSON(); len(temps) > 0 {
|
||||||
|
return temps
|
||||||
|
}
|
||||||
|
return sampleLiveTempsViaIPMI()
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleLiveTempsViaSensorsJSON() []TempReading {
|
||||||
|
out, err := exec.Command("sensors", "-j").Output()
|
||||||
|
if err != nil || len(out) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var doc map[string]map[string]any
|
||||||
|
if err := json.Unmarshal(out, &doc); err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
chips := make([]string, 0, len(doc))
|
||||||
|
for chip := range doc {
|
||||||
|
chips = append(chips, chip)
|
||||||
|
}
|
||||||
|
sort.Strings(chips)
|
||||||
|
|
||||||
|
temps := make([]TempReading, 0, len(chips))
|
||||||
|
seen := map[string]struct{}{}
|
||||||
|
for _, chip := range chips {
|
||||||
|
features := doc[chip]
|
||||||
|
featureNames := make([]string, 0, len(features))
|
||||||
|
for name := range features {
|
||||||
|
featureNames = append(featureNames, name)
|
||||||
|
}
|
||||||
|
sort.Strings(featureNames)
|
||||||
|
for _, name := range featureNames {
|
||||||
|
if strings.EqualFold(name, "Adapter") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
feature, ok := features[name].(map[string]any)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
value, ok := firstTempInputValue(feature)
|
||||||
|
if !ok || value <= 0 || value > 150 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
group := classifyLiveTempGroup(chip, name)
|
||||||
|
if group == "gpu" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
label := strings.TrimSpace(name)
|
||||||
|
if label == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if group == "ambient" {
|
||||||
|
label = compactAmbientTempName(chip, label)
|
||||||
|
}
|
||||||
|
key := group + "\x00" + label
|
||||||
|
if _, ok := seen[key]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[key] = struct{}{}
|
||||||
|
temps = append(temps, TempReading{Name: label, Group: group, Celsius: value})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return temps
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleLiveTempsViaIPMI() []TempReading {
|
||||||
|
out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()
|
||||||
|
if err != nil || len(out) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
var temps []TempReading
|
||||||
|
seen := map[string]struct{}{}
|
||||||
|
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||||
|
parts := strings.Split(line, "|")
|
||||||
|
if len(parts) < 3 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
name := strings.TrimSpace(parts[0])
|
||||||
|
if name == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
unit := strings.ToLower(strings.TrimSpace(parts[2]))
|
||||||
|
if !strings.Contains(unit, "degrees") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
raw := strings.TrimSpace(parts[1])
|
||||||
|
if raw == "" || strings.EqualFold(raw, "na") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
value, err := strconv.ParseFloat(raw, 64)
|
||||||
|
if err != nil || value <= 0 || value > 150 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
group := classifyLiveTempGroup("", name)
|
||||||
|
if group == "gpu" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
label := name
|
||||||
|
if group == "ambient" {
|
||||||
|
label = compactAmbientTempName("", label)
|
||||||
|
}
|
||||||
|
key := group + "\x00" + label
|
||||||
|
if _, ok := seen[key]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[key] = struct{}{}
|
||||||
|
temps = append(temps, TempReading{Name: label, Group: group, Celsius: value})
|
||||||
|
}
|
||||||
|
return temps
|
||||||
|
}
|
||||||
|
|
||||||
|
// firstTempInputValue picks the reading out of one `sensors -j` feature
// object. Candidate keys are those whose lower-cased name contains "temp" and
// ends in "_input"; they are tried in sorted order and the first one holding
// a float64, or a string that parses as a float, wins. The boolean is false
// when no candidate yields a number.
func firstTempInputValue(feature map[string]any) (float64, bool) {
	candidates := make([]string, 0, len(feature))
	for name := range feature {
		folded := strings.ToLower(name)
		if strings.Contains(folded, "temp") && strings.HasSuffix(folded, "_input") {
			candidates = append(candidates, name)
		}
	}
	sort.Strings(candidates)

	for _, name := range candidates {
		if num, isNum := feature[name].(float64); isNum {
			return num, true
		}
		if text, isText := feature[name].(string); isText {
			if parsed, err := strconv.ParseFloat(text, 64); err == nil {
				return parsed, true
			}
		}
	}
	return 0, false
}
|
||||||
|
|
||||||
|
// classifyLiveTempGroup buckets a sensor into "gpu", "cpu" or "ambient" by
// looking for known substrings in the lower-cased "chip name" text. GPU
// keywords are checked first, then CPU keywords; anything else is "ambient".
func classifyLiveTempGroup(chip, name string) string {
	haystack := strings.ToLower(strings.TrimSpace(chip + " " + name))
	mentionsAny := func(needles ...string) bool {
		for _, needle := range needles {
			if strings.Contains(haystack, needle) {
				return true
			}
		}
		return false
	}

	if mentionsAny("gpu", "amdgpu", "nvidia", "adeon") {
		return "gpu"
	}
	if mentionsAny(
		"coretemp",
		"k10temp",
		"zenpower",
		"package id",
		"x86_pkg_temp",
		"tctl",
		"tdie",
		"tccd",
		"cpu",
		"peci",
	) {
		return "cpu"
	}
	return "ambient"
}
|
||||||
|
|
||||||
|
// compactAmbientTempName builds the display label for an ambient sensor.
// Both inputs are trimmed. The bare name is returned when the chip is empty,
// equals the name (ignoring case), or is already contained in the name
// (ignoring case); otherwise the result is "chip / name".
func compactAmbientTempName(chip, name string) string {
	chip, name = strings.TrimSpace(chip), strings.TrimSpace(name)
	switch {
	case chip == "":
		return name
	case strings.EqualFold(chip, name):
		return name
	case strings.Contains(strings.ToLower(name), strings.ToLower(chip)):
		return name
	}
	return chip + " / " + name
}
|
||||||
|
|||||||
94
audit/internal/platform/live_metrics_test.go
Normal file
94
audit/internal/platform/live_metrics_test.go
Normal file
@@ -0,0 +1,94 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestFirstTempInputValue(t *testing.T) {
|
||||||
|
feature := map[string]any{
|
||||||
|
"temp1_input": 61.5,
|
||||||
|
"temp1_max": 80.0,
|
||||||
|
}
|
||||||
|
got, ok := firstTempInputValue(feature)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("expected value")
|
||||||
|
}
|
||||||
|
if got != 61.5 {
|
||||||
|
t.Fatalf("got %v want 61.5", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestClassifyLiveTempGroup(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
chip string
|
||||||
|
name string
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{chip: "coretemp-isa-0000", name: "Package id 0", want: "cpu"},
|
||||||
|
{chip: "amdgpu-pci-4300", name: "edge", want: "gpu"},
|
||||||
|
{chip: "nvme-pci-0100", name: "Composite", want: "ambient"},
|
||||||
|
{chip: "acpitz-acpi-0", name: "temp1", want: "ambient"},
|
||||||
|
}
|
||||||
|
for _, tc := range tests {
|
||||||
|
if got := classifyLiveTempGroup(tc.chip, tc.name); got != tc.want {
|
||||||
|
t.Fatalf("classifyLiveTempGroup(%q,%q)=%q want %q", tc.chip, tc.name, got, tc.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCompactAmbientTempName(t *testing.T) {
|
||||||
|
if got := compactAmbientTempName("nvme-pci-0100", "Composite"); got != "nvme-pci-0100 / Composite" {
|
||||||
|
t.Fatalf("got %q", got)
|
||||||
|
}
|
||||||
|
if got := compactAmbientTempName("", "Inlet Temp"); got != "Inlet Temp" {
|
||||||
|
t.Fatalf("got %q", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCPULoadPctBetween(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
prevTotal uint64
|
||||||
|
prevIdle uint64
|
||||||
|
total uint64
|
||||||
|
idle uint64
|
||||||
|
want float64
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "busy half",
|
||||||
|
prevTotal: 100,
|
||||||
|
prevIdle: 40,
|
||||||
|
total: 200,
|
||||||
|
idle: 90,
|
||||||
|
want: 50,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "fully busy",
|
||||||
|
prevTotal: 100,
|
||||||
|
prevIdle: 40,
|
||||||
|
total: 200,
|
||||||
|
idle: 40,
|
||||||
|
want: 100,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "no progress",
|
||||||
|
prevTotal: 100,
|
||||||
|
prevIdle: 40,
|
||||||
|
total: 100,
|
||||||
|
idle: 40,
|
||||||
|
want: 0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "idle delta larger than total clamps to zero",
|
||||||
|
prevTotal: 100,
|
||||||
|
prevIdle: 40,
|
||||||
|
total: 200,
|
||||||
|
idle: 150,
|
||||||
|
want: 0,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range tests {
|
||||||
|
if got := cpuLoadPctBetween(tc.prevTotal, tc.prevIdle, tc.total, tc.idle); got != tc.want {
|
||||||
|
t.Fatalf("%s: cpuLoadPctBetween(...)=%v want %v", tc.name, got, tc.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -2,6 +2,7 @@ package platform
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
@@ -18,21 +19,17 @@ func (s *System) ListInterfaces() ([]InterfaceInfo, error) {
|
|||||||
out := make([]InterfaceInfo, 0, len(names))
|
out := make([]InterfaceInfo, 0, len(names))
|
||||||
for _, name := range names {
|
for _, name := range names {
|
||||||
state := "unknown"
|
state := "unknown"
|
||||||
if raw, err := exec.Command("ip", "-o", "link", "show", name).Output(); err == nil {
|
if up, err := interfaceAdminState(name); err == nil {
|
||||||
fields := strings.Fields(string(raw))
|
if up {
|
||||||
if len(fields) >= 9 {
|
state = "up"
|
||||||
state = fields[8]
|
} else {
|
||||||
|
state = "down"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var ipv4 []string
|
ipv4, err := interfaceIPv4Addrs(name)
|
||||||
if raw, err := exec.Command("ip", "-o", "-4", "addr", "show", "dev", name).Output(); err == nil {
|
if err != nil {
|
||||||
for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
|
ipv4 = nil
|
||||||
fields := strings.Fields(line)
|
|
||||||
if len(fields) >= 4 {
|
|
||||||
ipv4 = append(ipv4, fields[3])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
out = append(out, InterfaceInfo{Name: name, State: state, IPv4: ipv4})
|
out = append(out, InterfaceInfo{Name: name, State: state, IPv4: ipv4})
|
||||||
@@ -55,6 +52,119 @@ func (s *System) DefaultRoute() string {
|
|||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *System) CaptureNetworkSnapshot() (NetworkSnapshot, error) {
|
||||||
|
names, err := listInterfaceNames()
|
||||||
|
if err != nil {
|
||||||
|
return NetworkSnapshot{}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
snapshot := NetworkSnapshot{
|
||||||
|
Interfaces: make([]NetworkInterfaceSnapshot, 0, len(names)),
|
||||||
|
}
|
||||||
|
for _, name := range names {
|
||||||
|
up, err := interfaceAdminState(name)
|
||||||
|
if err != nil {
|
||||||
|
return NetworkSnapshot{}, err
|
||||||
|
}
|
||||||
|
ipv4, err := interfaceIPv4Addrs(name)
|
||||||
|
if err != nil {
|
||||||
|
return NetworkSnapshot{}, err
|
||||||
|
}
|
||||||
|
snapshot.Interfaces = append(snapshot.Interfaces, NetworkInterfaceSnapshot{
|
||||||
|
Name: name,
|
||||||
|
Up: up,
|
||||||
|
IPv4: ipv4,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
if raw, err := exec.Command("ip", "route", "show", "default").Output(); err == nil {
|
||||||
|
for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if line != "" {
|
||||||
|
snapshot.DefaultRoutes = append(snapshot.DefaultRoutes, line)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if raw, err := os.ReadFile("/etc/resolv.conf"); err == nil {
|
||||||
|
snapshot.ResolvConf = string(raw)
|
||||||
|
}
|
||||||
|
|
||||||
|
return snapshot, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RestoreNetworkSnapshot(snapshot NetworkSnapshot) error {
|
||||||
|
var errs []string
|
||||||
|
|
||||||
|
for _, iface := range snapshot.Interfaces {
|
||||||
|
if err := exec.Command("ip", "link", "set", "dev", iface.Name, "up").Run(); err != nil {
|
||||||
|
errs = append(errs, fmt.Sprintf("%s: bring up before restore: %v", iface.Name, err))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if err := exec.Command("ip", "addr", "flush", "dev", iface.Name).Run(); err != nil {
|
||||||
|
errs = append(errs, fmt.Sprintf("%s: flush addresses: %v", iface.Name, err))
|
||||||
|
}
|
||||||
|
for _, cidr := range iface.IPv4 {
|
||||||
|
if raw, err := exec.Command("ip", "addr", "add", cidr, "dev", iface.Name).CombinedOutput(); err != nil {
|
||||||
|
detail := strings.TrimSpace(string(raw))
|
||||||
|
if detail != "" {
|
||||||
|
errs = append(errs, fmt.Sprintf("%s: restore address %s: %v: %s", iface.Name, cidr, err, detail))
|
||||||
|
} else {
|
||||||
|
errs = append(errs, fmt.Sprintf("%s: restore address %s: %v", iface.Name, cidr, err))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
state := "down"
|
||||||
|
if iface.Up {
|
||||||
|
state = "up"
|
||||||
|
}
|
||||||
|
if err := exec.Command("ip", "link", "set", "dev", iface.Name, state).Run(); err != nil {
|
||||||
|
errs = append(errs, fmt.Sprintf("%s: restore state %s: %v", iface.Name, state, err))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := exec.Command("ip", "route", "del", "default").Run(); err != nil {
|
||||||
|
var exitErr *exec.ExitError
|
||||||
|
if !errors.As(err, &exitErr) {
|
||||||
|
errs = append(errs, fmt.Sprintf("clear default route: %v", err))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, route := range snapshot.DefaultRoutes {
|
||||||
|
fields := strings.Fields(route)
|
||||||
|
if len(fields) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Strip state flags that ip-route(8) does not accept as add arguments.
|
||||||
|
filtered := fields[:0]
|
||||||
|
for _, f := range fields {
|
||||||
|
switch f {
|
||||||
|
case "linkdown", "dead", "onlink", "pervasive":
|
||||||
|
// skip
|
||||||
|
default:
|
||||||
|
filtered = append(filtered, f)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
args := append([]string{"route", "add"}, filtered...)
|
||||||
|
if raw, err := exec.Command("ip", args...).CombinedOutput(); err != nil {
|
||||||
|
detail := strings.TrimSpace(string(raw))
|
||||||
|
if detail != "" {
|
||||||
|
errs = append(errs, fmt.Sprintf("restore route %q: %v: %s", route, err, detail))
|
||||||
|
} else {
|
||||||
|
errs = append(errs, fmt.Sprintf("restore route %q: %v", route, err))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := os.WriteFile("/etc/resolv.conf", []byte(snapshot.ResolvConf), 0644); err != nil {
|
||||||
|
errs = append(errs, fmt.Sprintf("restore resolv.conf: %v", err))
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(errs) > 0 {
|
||||||
|
return errors.New(strings.Join(errs, "; "))
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func (s *System) DHCPOne(iface string) (string, error) {
|
func (s *System) DHCPOne(iface string) (string, error) {
|
||||||
var out bytes.Buffer
|
var out bytes.Buffer
|
||||||
if err := exec.Command("ip", "link", "set", iface, "up").Run(); err != nil {
|
if err := exec.Command("ip", "link", "set", iface, "up").Run(); err != nil {
|
||||||
@@ -131,6 +241,65 @@ func (s *System) SetStaticIPv4(cfg StaticIPv4Config) (string, error) {
|
|||||||
return out.String(), nil
|
return out.String(), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// SetInterfaceState brings a network interface up or down.
|
||||||
|
func (s *System) SetInterfaceState(iface string, up bool) error {
|
||||||
|
state := "down"
|
||||||
|
if up {
|
||||||
|
state = "up"
|
||||||
|
}
|
||||||
|
return exec.Command("ip", "link", "set", "dev", iface, state).Run()
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetInterfaceState returns true if the interface is UP.
|
||||||
|
func (s *System) GetInterfaceState(iface string) (bool, error) {
|
||||||
|
return interfaceAdminState(iface)
|
||||||
|
}
|
||||||
|
|
||||||
|
func interfaceAdminState(iface string) (bool, error) {
|
||||||
|
raw, err := exec.Command("ip", "-o", "link", "show", "dev", iface).Output()
|
||||||
|
if err != nil {
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
return parseInterfaceAdminState(string(raw))
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseInterfaceAdminState looks at the first <...> flag list in `ip link`
// output and reports whether it contains the exact flag "UP" (so "LOWER_UP"
// alone does not count). An error is returned when the opening '<' or the
// following '>' is missing.
func parseInterfaceAdminState(raw string) (bool, error) {
	_, afterOpen, opened := strings.Cut(raw, "<")
	if !opened {
		return false, fmt.Errorf("ip link output missing flags")
	}
	flagList, _, closed := strings.Cut(afterOpen, ">")
	if !closed {
		return false, fmt.Errorf("ip link output missing flag terminator")
	}
	for _, candidate := range strings.Split(flagList, ",") {
		if strings.TrimSpace(candidate) == "UP" {
			return true, nil
		}
	}
	return false, nil
}
|
||||||
|
|
||||||
|
// interfaceIPv4Addrs runs `ip -o -4 addr show dev <iface>` and returns the
// fourth whitespace-separated field of every output line that has at least
// four fields. A non-zero exit status from ip is treated as "no addresses"
// and returns (nil, nil); any other failure to run the command is returned.
func interfaceIPv4Addrs(iface string) ([]string, error) {
	out, err := exec.Command("ip", "-o", "-4", "addr", "show", "dev", iface).Output()
	if err != nil {
		if exitErr := (*exec.ExitError)(nil); errors.As(err, &exitErr) {
			return nil, nil
		}
		return nil, err
	}

	var addrs []string
	for _, entry := range strings.Split(strings.TrimSpace(string(out)), "\n") {
		if cols := strings.Fields(entry); len(cols) >= 4 {
			addrs = append(addrs, cols[3])
		}
	}
	return addrs, nil
}
|
||||||
|
|
||||||
func listInterfaceNames() ([]string, error) {
|
func listInterfaceNames() ([]string, error) {
|
||||||
raw, err := exec.Command("ip", "-o", "link", "show").Output()
|
raw, err := exec.Command("ip", "-o", "link", "show").Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
46
audit/internal/platform/network_test.go
Normal file
46
audit/internal/platform/network_test.go
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestParseInterfaceAdminState(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
raw string
|
||||||
|
want bool
|
||||||
|
wantErr bool
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "admin up with no carrier",
|
||||||
|
raw: "2: enp1s0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc mq state DOWN mode DEFAULT group default qlen 1000\n",
|
||||||
|
want: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "admin down",
|
||||||
|
raw: "2: enp1s0: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000\n",
|
||||||
|
want: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "malformed output",
|
||||||
|
raw: "2: enp1s0: mtu 1500 state DOWN\n",
|
||||||
|
wantErr: true,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
got, err := parseInterfaceAdminState(tt.raw)
|
||||||
|
if tt.wantErr {
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected error")
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("unexpected error: %v", err)
|
||||||
|
}
|
||||||
|
if got != tt.want {
|
||||||
|
t.Fatalf("got %v want %v", got, tt.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
203
audit/internal/platform/nvidia_stress.go
Normal file
203
audit/internal/platform/nvidia_stress.go
Normal file
@@ -0,0 +1,203 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||||
|
normalizeNvidiaStressOptions(&opts)
|
||||||
|
|
||||||
|
job, err := buildNvidiaStressJob(opts)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), []satJob{
|
||||||
|
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
||||||
|
job,
|
||||||
|
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
|
}, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func nvidiaStressArchivePrefix(loader string) string {
|
||||||
|
switch strings.TrimSpace(strings.ToLower(loader)) {
|
||||||
|
case NvidiaStressLoaderJohn:
|
||||||
|
return "gpu-nvidia-john"
|
||||||
|
case NvidiaStressLoaderNCCL:
|
||||||
|
return "gpu-nvidia-nccl"
|
||||||
|
default:
|
||||||
|
return "gpu-nvidia-burn"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
||||||
|
selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
|
||||||
|
if err != nil {
|
||||||
|
return satJob{}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
loader := strings.TrimSpace(strings.ToLower(opts.Loader))
|
||||||
|
switch loader {
|
||||||
|
case "", NvidiaStressLoaderBuiltin:
|
||||||
|
cmd := []string{
|
||||||
|
"bee-gpu-burn",
|
||||||
|
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||||
|
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||||
|
}
|
||||||
|
if len(selected) > 0 {
|
||||||
|
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||||
|
}
|
||||||
|
return satJob{
|
||||||
|
name: "03-bee-gpu-burn.log",
|
||||||
|
cmd: cmd,
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
}, nil
|
||||||
|
case NvidiaStressLoaderJohn:
|
||||||
|
cmd := []string{
|
||||||
|
"bee-john-gpu-stress",
|
||||||
|
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||||
|
}
|
||||||
|
if len(selected) > 0 {
|
||||||
|
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||||
|
}
|
||||||
|
return satJob{
|
||||||
|
name: "03-john-gpu-stress.log",
|
||||||
|
cmd: cmd,
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
}, nil
|
||||||
|
case NvidiaStressLoaderNCCL:
|
||||||
|
cmd := []string{
|
||||||
|
"bee-nccl-gpu-stress",
|
||||||
|
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||||
|
}
|
||||||
|
if len(selected) > 0 {
|
||||||
|
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||||
|
}
|
||||||
|
return satJob{
|
||||||
|
name: "03-bee-nccl-gpu-stress.log",
|
||||||
|
cmd: cmd,
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
}, nil
|
||||||
|
default:
|
||||||
|
return satJob{}, fmt.Errorf("unknown NVIDIA stress loader %q", opts.Loader)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalizeNvidiaStressOptions(opts *NvidiaStressOptions) {
|
||||||
|
if opts.DurationSec <= 0 {
|
||||||
|
opts.DurationSec = 300
|
||||||
|
}
|
||||||
|
// SizeMB=0 means "auto" — bee-gpu-burn will query per-GPU memory at runtime.
|
||||||
|
switch strings.TrimSpace(strings.ToLower(opts.Loader)) {
|
||||||
|
case "", NvidiaStressLoaderBuiltin:
|
||||||
|
opts.Loader = NvidiaStressLoaderBuiltin
|
||||||
|
case NvidiaStressLoaderJohn:
|
||||||
|
opts.Loader = NvidiaStressLoaderJohn
|
||||||
|
case NvidiaStressLoaderNCCL:
|
||||||
|
opts.Loader = NvidiaStressLoaderNCCL
|
||||||
|
default:
|
||||||
|
opts.Loader = NvidiaStressLoaderBuiltin
|
||||||
|
}
|
||||||
|
opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
|
||||||
|
opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
|
||||||
|
}
|
||||||
|
|
||||||
|
func resolveNvidiaGPUSelection(include, exclude []int) ([]int, error) {
|
||||||
|
all, err := listNvidiaGPUIndices()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if len(all) == 0 {
|
||||||
|
return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
|
||||||
|
}
|
||||||
|
|
||||||
|
selected := all
|
||||||
|
if len(include) > 0 {
|
||||||
|
want := make(map[int]struct{}, len(include))
|
||||||
|
for _, idx := range include {
|
||||||
|
want[idx] = struct{}{}
|
||||||
|
}
|
||||||
|
selected = selected[:0]
|
||||||
|
for _, idx := range all {
|
||||||
|
if _, ok := want[idx]; ok {
|
||||||
|
selected = append(selected, idx)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(exclude) > 0 {
|
||||||
|
skip := make(map[int]struct{}, len(exclude))
|
||||||
|
for _, idx := range exclude {
|
||||||
|
skip[idx] = struct{}{}
|
||||||
|
}
|
||||||
|
filtered := selected[:0]
|
||||||
|
for _, idx := range selected {
|
||||||
|
if _, ok := skip[idx]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
filtered = append(filtered, idx)
|
||||||
|
}
|
||||||
|
selected = filtered
|
||||||
|
}
|
||||||
|
if len(selected) == 0 {
|
||||||
|
return nil, fmt.Errorf("no NVIDIA GPUs selected after applying filters")
|
||||||
|
}
|
||||||
|
out := append([]int(nil), selected...)
|
||||||
|
sort.Ints(out)
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func listNvidiaGPUIndices() ([]int, error) {
|
||||||
|
out, err := satExecCommand("nvidia-smi", "--query-gpu=index", "--format=csv,noheader,nounits").Output()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
||||||
|
}
|
||||||
|
var indices []int
|
||||||
|
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if line == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
idx, err := strconv.Atoi(line)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
indices = append(indices, idx)
|
||||||
|
}
|
||||||
|
return dedupeSortedIndices(indices), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// dedupeSortedIndices returns the non-negative values of the input in
// ascending order with duplicates removed. An empty input yields nil; an
// input holding only negative values yields an empty, non-nil slice. The
// input slice is not modified.
func dedupeSortedIndices(values []int) []int {
	if len(values) == 0 {
		return nil
	}

	kept := make([]int, 0, len(values))
	for _, v := range values {
		if v >= 0 {
			kept = append(kept, v)
		}
	}
	sort.Ints(kept)

	// After sorting, duplicates are adjacent: compact them in place.
	unique := kept[:0]
	for _, v := range kept {
		if n := len(unique); n > 0 && unique[n-1] == v {
			continue
		}
		unique = append(unique, v)
	}
	return unique
}
|
||||||
|
|
||||||
|
// joinIndexList renders the values in decimal, separated by commas and
// without spaces. An empty input yields the empty string.
func joinIndexList(values []int) string {
	var b strings.Builder
	for i, v := range values {
		if i > 0 {
			b.WriteByte(',')
		}
		b.WriteString(strconv.Itoa(v))
	}
	return b.String()
}
|
||||||
545
audit/internal/platform/platform_stress.go
Normal file
545
audit/internal/platform/platform_stress.go
Normal file
@@ -0,0 +1,545 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"archive/tar"
|
||||||
|
"bytes"
|
||||||
|
"compress/gzip"
|
||||||
|
"context"
|
||||||
|
"encoding/csv"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"runtime"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"syscall"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// PlatformStressCycle describes a single load-then-idle cycle of the
// platform stress test.
type PlatformStressCycle struct {
	// LoadSec is how long, in seconds, CPU and GPU stress run simultaneously.
	LoadSec int
	// IdleSec is how long, in seconds, monitoring continues after the load is cut.
	IdleSec int
}
|
||||||
|
|
||||||
|
// PlatformStressOptions controls the thermal cycling test.
|
||||||
|
type PlatformStressOptions struct {
|
||||||
|
Cycles []PlatformStressCycle
|
||||||
|
Components []string // if empty: run all; values: "cpu", "gpu"
|
||||||
|
}
|
||||||
|
|
||||||
|
// platformStressRow holds one second of telemetry gathered during a platform
// stress run.
type platformStressRow struct {
	ElapsedSec   float64
	Cycle        int
	Phase        string // either "load" or "idle"
	CPULoadPct   float64
	MaxCPUTempC  float64
	MaxGPUTempC  float64
	SysPowerW    float64
	FanMinRPM    float64
	FanMaxRPM    float64
	GPUThrottled bool
}
|
||||||
|
|
||||||
|
// RunPlatformStress runs repeated load+idle thermal cycling.
|
||||||
|
// Each cycle starts CPU (stressapptest) and GPU stress simultaneously,
|
||||||
|
// runs for LoadSec, then cuts load abruptly and monitors for IdleSec.
|
||||||
|
func (s *System) RunPlatformStress(
|
||||||
|
ctx context.Context,
|
||||||
|
baseDir string,
|
||||||
|
opts PlatformStressOptions,
|
||||||
|
logFunc func(string),
|
||||||
|
) (string, error) {
|
||||||
|
if logFunc == nil {
|
||||||
|
logFunc = func(string) {}
|
||||||
|
}
|
||||||
|
if len(opts.Cycles) == 0 {
|
||||||
|
return "", fmt.Errorf("no cycles defined")
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(baseDir, 0755); err != nil {
|
||||||
|
return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
stamp := time.Now().UTC().Format("20060102-150405")
|
||||||
|
runDir := filepath.Join(baseDir, "platform-stress-"+stamp)
|
||||||
|
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||||
|
return "", fmt.Errorf("mkdir run dir: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
hasCPU := len(opts.Components) == 0 || containsComponent(opts.Components, "cpu")
|
||||||
|
hasGPU := len(opts.Components) == 0 || containsComponent(opts.Components, "gpu")
|
||||||
|
|
||||||
|
vendor := s.DetectGPUVendor()
|
||||||
|
logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s, cpu=%v gpu=%v", len(opts.Cycles), vendor, hasCPU, hasGPU))
|
||||||
|
|
||||||
|
var rows []platformStressRow
|
||||||
|
start := time.Now()
|
||||||
|
|
||||||
|
var analyses []cycleAnalysis
|
||||||
|
|
||||||
|
for i, cycle := range opts.Cycles {
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
cycleNum := i + 1
|
||||||
|
logFunc(fmt.Sprintf("--- Cycle %d/%d: load=%ds, idle=%ds ---", cycleNum, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec))
|
||||||
|
|
||||||
|
// ── LOAD PHASE ───────────────────────────────────────────────────────
|
||||||
|
loadCtx, loadCancel := context.WithTimeout(ctx, time.Duration(cycle.LoadSec)*time.Second)
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
|
||||||
|
// CPU stress
|
||||||
|
if hasCPU {
|
||||||
|
wg.Add(1)
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
cpuCmd, err := buildCPUStressCmd(loadCtx)
|
||||||
|
if err != nil {
|
||||||
|
logFunc("CPU stress: " + err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
_ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL)
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
// GPU stress
|
||||||
|
if hasGPU {
|
||||||
|
wg.Add(1)
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
gpuCmd := buildGPUStressCmd(loadCtx, vendor)
|
||||||
|
if gpuCmd == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
_ = gpuCmd.Wait()
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Monitoring goroutine for load phase
|
||||||
|
loadRows := collectPhase(loadCtx, cycleNum, "load", start)
|
||||||
|
for _, r := range loadRows {
|
||||||
|
logFunc(formatPlatformRow(r))
|
||||||
|
}
|
||||||
|
rows = append(rows, loadRows...)
|
||||||
|
loadCancel()
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
if len(loadRows) > 0 {
|
||||||
|
logFunc(fmt.Sprintf("Cycle %d load ended (%.0fs)", cycleNum, loadRows[len(loadRows)-1].ElapsedSec))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── IDLE PHASE ───────────────────────────────────────────────────────
|
||||||
|
idleCtx, idleCancel := context.WithTimeout(ctx, time.Duration(cycle.IdleSec)*time.Second)
|
||||||
|
idleRows := collectPhase(idleCtx, cycleNum, "idle", start)
|
||||||
|
for _, r := range idleRows {
|
||||||
|
logFunc(formatPlatformRow(r))
|
||||||
|
}
|
||||||
|
rows = append(rows, idleRows...)
|
||||||
|
idleCancel()
|
||||||
|
|
||||||
|
// Per-cycle analysis
|
||||||
|
an := analyzePlatformCycle(loadRows, idleRows)
|
||||||
|
analyses = append(analyses, an)
|
||||||
|
logFunc(fmt.Sprintf("Cycle %d: maxCPU=%.1f°C maxGPU=%.1f°C power=%.0fW throttled=%v fanDrop=%.0f%%",
|
||||||
|
cycleNum, an.maxCPUTemp, an.maxGPUTemp, an.maxPower, an.throttled, an.fanDropPct))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write CSV
|
||||||
|
csvData := writePlatformCSV(rows)
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, "metrics.csv"), csvData, 0644)
|
||||||
|
|
||||||
|
// Write summary
|
||||||
|
summary := writePlatformSummary(opts, analyses)
|
||||||
|
logFunc("--- Summary ---")
|
||||||
|
for _, line := range strings.Split(summary, "\n") {
|
||||||
|
if line != "" {
|
||||||
|
logFunc(line)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
|
||||||
|
|
||||||
|
// Pack tar.gz
|
||||||
|
archivePath := filepath.Join(baseDir, "platform-stress-"+stamp+".tar.gz")
|
||||||
|
if err := packPlatformDir(runDir, archivePath); err != nil {
|
||||||
|
return "", fmt.Errorf("pack archive: %w", err)
|
||||||
|
}
|
||||||
|
_ = os.RemoveAll(runDir)
|
||||||
|
return archivePath, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// collectPhase samples live metrics every second until ctx is done.
|
||||||
|
func collectPhase(ctx context.Context, cycle int, phase string, testStart time.Time) []platformStressRow {
|
||||||
|
var rows []platformStressRow
|
||||||
|
ticker := time.NewTicker(time.Second)
|
||||||
|
defer ticker.Stop()
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return rows
|
||||||
|
case <-ticker.C:
|
||||||
|
sample := SampleLiveMetrics()
|
||||||
|
rows = append(rows, sampleToPlatformRow(sample, cycle, phase, testStart))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleToPlatformRow(s LiveMetricSample, cycle int, phase string, testStart time.Time) platformStressRow {
|
||||||
|
r := platformStressRow{
|
||||||
|
ElapsedSec: time.Since(testStart).Seconds(),
|
||||||
|
Cycle: cycle,
|
||||||
|
Phase: phase,
|
||||||
|
CPULoadPct: s.CPULoadPct,
|
||||||
|
SysPowerW: s.PowerW,
|
||||||
|
}
|
||||||
|
for _, t := range s.Temps {
|
||||||
|
switch t.Group {
|
||||||
|
case "cpu":
|
||||||
|
if t.Celsius > r.MaxCPUTempC {
|
||||||
|
r.MaxCPUTempC = t.Celsius
|
||||||
|
}
|
||||||
|
case "gpu":
|
||||||
|
if t.Celsius > r.MaxGPUTempC {
|
||||||
|
r.MaxGPUTempC = t.Celsius
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, g := range s.GPUs {
|
||||||
|
if g.TempC > r.MaxGPUTempC {
|
||||||
|
r.MaxGPUTempC = g.TempC
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(s.Fans) > 0 {
|
||||||
|
r.FanMinRPM = s.Fans[0].RPM
|
||||||
|
r.FanMaxRPM = s.Fans[0].RPM
|
||||||
|
for _, f := range s.Fans[1:] {
|
||||||
|
if f.RPM < r.FanMinRPM {
|
||||||
|
r.FanMinRPM = f.RPM
|
||||||
|
}
|
||||||
|
if f.RPM > r.FanMaxRPM {
|
||||||
|
r.FanMaxRPM = f.RPM
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return r
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatPlatformRow(r platformStressRow) string {
|
||||||
|
throttle := ""
|
||||||
|
if r.GPUThrottled {
|
||||||
|
throttle = " THROTTLE"
|
||||||
|
}
|
||||||
|
fans := ""
|
||||||
|
if r.FanMinRPM > 0 {
|
||||||
|
fans = fmt.Sprintf(" fans=%.0f-%.0fRPM", r.FanMinRPM, r.FanMaxRPM)
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("[%5.0fs] cycle=%d phase=%-4s cpu=%.0f%% cpuT=%.1f°C gpuT=%.1f°C pwr=%.0fW%s%s",
|
||||||
|
r.ElapsedSec, r.Cycle, r.Phase, r.CPULoadPct, r.MaxCPUTempC, r.MaxGPUTempC, r.SysPowerW, fans, throttle)
|
||||||
|
}
|
||||||
|
|
||||||
|
func analyzePlatformCycle(loadRows, idleRows []platformStressRow) cycleAnalysis {
|
||||||
|
var an cycleAnalysis
|
||||||
|
for _, r := range loadRows {
|
||||||
|
if r.MaxCPUTempC > an.maxCPUTemp {
|
||||||
|
an.maxCPUTemp = r.MaxCPUTempC
|
||||||
|
}
|
||||||
|
if r.MaxGPUTempC > an.maxGPUTemp {
|
||||||
|
an.maxGPUTemp = r.MaxGPUTempC
|
||||||
|
}
|
||||||
|
if r.SysPowerW > an.maxPower {
|
||||||
|
an.maxPower = r.SysPowerW
|
||||||
|
}
|
||||||
|
if r.GPUThrottled {
|
||||||
|
an.throttled = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Fan RPM at cut = avg of last 5 load rows
|
||||||
|
if n := len(loadRows); n > 0 {
|
||||||
|
window := loadRows
|
||||||
|
if n > 5 {
|
||||||
|
window = loadRows[n-5:]
|
||||||
|
}
|
||||||
|
var sum float64
|
||||||
|
var cnt int
|
||||||
|
for _, r := range window {
|
||||||
|
if r.FanMinRPM > 0 {
|
||||||
|
sum += (r.FanMinRPM + r.FanMaxRPM) / 2
|
||||||
|
cnt++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if cnt > 0 {
|
||||||
|
an.fanAtCutAvg = sum / float64(cnt)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Fan RPM min in first 15s of idle
|
||||||
|
an.fanMin15s = an.fanAtCutAvg
|
||||||
|
var cutElapsed float64
|
||||||
|
if len(loadRows) > 0 {
|
||||||
|
cutElapsed = loadRows[len(loadRows)-1].ElapsedSec
|
||||||
|
}
|
||||||
|
for _, r := range idleRows {
|
||||||
|
if r.ElapsedSec > cutElapsed+15 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
avg := (r.FanMinRPM + r.FanMaxRPM) / 2
|
||||||
|
if avg > 0 && (an.fanMin15s == 0 || avg < an.fanMin15s) {
|
||||||
|
an.fanMin15s = avg
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if an.fanAtCutAvg > 0 {
|
||||||
|
an.fanDropPct = (an.fanAtCutAvg - an.fanMin15s) / an.fanAtCutAvg * 100
|
||||||
|
}
|
||||||
|
return an
|
||||||
|
}
|
||||||
|
|
||||||
|
// cycleAnalysis is the per-cycle result of a platform stress run.
type cycleAnalysis struct {
	// Peak CPU temperature (°C), GPU temperature (°C) and system power (W)
	// observed during the load phase.
	maxCPUTemp, maxGPUTemp, maxPower float64

	// throttled is true when any load row was flagged as GPU-throttled.
	throttled bool

	// Fan behaviour around the load cut: average RPM just before the cut,
	// lowest RPM within the first 15 s of idle, and the relative drop in
	// percent between the two.
	fanAtCutAvg, fanMin15s, fanDropPct float64
}
|
||||||
|
|
||||||
|
func writePlatformSummary(opts PlatformStressOptions, analyses []cycleAnalysis) string {
|
||||||
|
var b strings.Builder
|
||||||
|
fmt.Fprintf(&b, "Platform Thermal Cycling — %d cycle(s)\n", len(opts.Cycles))
|
||||||
|
fmt.Fprintf(&b, "%s\n\n", strings.Repeat("=", 48))
|
||||||
|
|
||||||
|
totalThrottle := 0
|
||||||
|
totalFanWarn := 0
|
||||||
|
for i, an := range analyses {
|
||||||
|
cycle := opts.Cycles[i]
|
||||||
|
fmt.Fprintf(&b, "Cycle %d/%d (load=%ds, idle=%ds)\n", i+1, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec)
|
||||||
|
fmt.Fprintf(&b, " Max CPU temp: %.1f°C\n", an.maxCPUTemp)
|
||||||
|
fmt.Fprintf(&b, " Max GPU temp: %.1f°C\n", an.maxGPUTemp)
|
||||||
|
fmt.Fprintf(&b, " Max sys power: %.0f W\n", an.maxPower)
|
||||||
|
if an.throttled {
|
||||||
|
fmt.Fprintf(&b, " Throttle: DETECTED\n")
|
||||||
|
totalThrottle++
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(&b, " Throttle: none\n")
|
||||||
|
}
|
||||||
|
if an.fanAtCutAvg > 0 {
|
||||||
|
fmt.Fprintf(&b, " Fan at load cut: %.0f RPM avg\n", an.fanAtCutAvg)
|
||||||
|
fmt.Fprintf(&b, " Fan min (first 15s idle): %.0f RPM (drop %.0f%%)\n", an.fanMin15s, an.fanDropPct)
|
||||||
|
if an.fanDropPct > 20 {
|
||||||
|
fmt.Fprintf(&b, " Fan response: WARN — fast spindown (>20%% drop in 15s)\n")
|
||||||
|
totalFanWarn++
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(&b, " Fan response: OK\n")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Fprintf(&b, "%s\n", strings.Repeat("=", 48))
|
||||||
|
if totalThrottle > 0 {
|
||||||
|
fmt.Fprintf(&b, "Overall: FAIL — throttle detected in %d/%d cycles\n", totalThrottle, len(analyses))
|
||||||
|
} else if totalFanWarn > 0 {
|
||||||
|
fmt.Fprintf(&b, "Overall: WARN — fast fan spindown in %d/%d cycles (cooling recovery risk)\n", totalFanWarn, len(analyses))
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(&b, "Overall: PASS\n")
|
||||||
|
}
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func writePlatformCSV(rows []platformStressRow) []byte {
|
||||||
|
var buf bytes.Buffer
|
||||||
|
w := csv.NewWriter(&buf)
|
||||||
|
_ = w.Write([]string{
|
||||||
|
"elapsed_sec", "cycle", "phase",
|
||||||
|
"cpu_load_pct", "max_cpu_temp_c", "max_gpu_temp_c",
|
||||||
|
"sys_power_w", "fan_min_rpm", "fan_max_rpm", "gpu_throttled",
|
||||||
|
})
|
||||||
|
for _, r := range rows {
|
||||||
|
throttled := "0"
|
||||||
|
if r.GPUThrottled {
|
||||||
|
throttled = "1"
|
||||||
|
}
|
||||||
|
_ = w.Write([]string{
|
||||||
|
strconv.FormatFloat(r.ElapsedSec, 'f', 1, 64),
|
||||||
|
strconv.Itoa(r.Cycle),
|
||||||
|
r.Phase,
|
||||||
|
strconv.FormatFloat(r.CPULoadPct, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(r.MaxCPUTempC, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(r.MaxGPUTempC, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(r.SysPowerW, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(r.FanMinRPM, 'f', 0, 64),
|
||||||
|
strconv.FormatFloat(r.FanMaxRPM, 'f', 0, 64),
|
||||||
|
throttled,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
w.Flush()
|
||||||
|
return buf.Bytes()
|
||||||
|
}
|
||||||
|
|
||||||
|
// buildCPUStressCmd creates a stressapptest command that runs until ctx is cancelled.
|
||||||
|
func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
||||||
|
path, err := satLookPath("stressapptest")
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("stressapptest not found: %w", err)
|
||||||
|
}
|
||||||
|
// Use a very long duration; the context timeout will kill it at the right time.
|
||||||
|
cmdArgs := []string{"-s", "86400", "-W", "--cc_test"}
|
||||||
|
if threads := platformStressCPUThreads(); threads > 0 {
|
||||||
|
cmdArgs = append(cmdArgs, "-m", strconv.Itoa(threads))
|
||||||
|
}
|
||||||
|
if mb := platformStressMemoryMB(); mb > 0 {
|
||||||
|
cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
|
||||||
|
}
|
||||||
|
cmd := exec.CommandContext(ctx, path, cmdArgs...)
|
||||||
|
cmd.Stdout = nil
|
||||||
|
cmd.Stderr = nil
|
||||||
|
if err := startLowPriorityCmd(cmd, 15); err != nil {
|
||||||
|
return nil, fmt.Errorf("stressapptest start: %w", err)
|
||||||
|
}
|
||||||
|
return cmd, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
|
||||||
|
// Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
|
||||||
|
func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
|
||||||
|
switch strings.ToLower(vendor) {
|
||||||
|
case "amd":
|
||||||
|
return buildAMDGPUStressCmd(ctx)
|
||||||
|
case "nvidia":
|
||||||
|
return buildNvidiaGPUStressCmd(ctx)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||||
|
rvsArgs, err := resolveRVSCommand()
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
rvsPath := rvsArgs[0]
|
||||||
|
cfg := `actions:
|
||||||
|
- name: gst_platform
|
||||||
|
device: all
|
||||||
|
module: gst
|
||||||
|
parallel: true
|
||||||
|
duration: 86400000
|
||||||
|
copy_matrix: false
|
||||||
|
target_stress: 90
|
||||||
|
matrix_size_a: 8640
|
||||||
|
matrix_size_b: 8640
|
||||||
|
matrix_size_c: 8640
|
||||||
|
`
|
||||||
|
cfgFile := "/tmp/bee-platform-gst.conf"
|
||||||
|
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||||
|
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
|
||||||
|
cmd.Stdout = nil
|
||||||
|
cmd.Stderr = nil
|
||||||
|
_ = startLowPriorityCmd(cmd, 10)
|
||||||
|
return cmd
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||||
|
path, err := satLookPath("bee-gpu-burn")
|
||||||
|
if err != nil {
|
||||||
|
path, err = satLookPath("bee-gpu-stress")
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
cmd := exec.CommandContext(ctx, path, "--seconds", "86400")
|
||||||
|
cmd.Stdout = nil
|
||||||
|
cmd.Stderr = nil
|
||||||
|
_ = startLowPriorityCmd(cmd, 10)
|
||||||
|
return cmd
|
||||||
|
}
|
||||||
|
|
||||||
|
// startLowPriorityCmd starts cmd and then sets the scheduling priority of
// the new process to the given nice value. It returns the error from Start,
// if any. A failure to change the priority is ignored: the command still
// runs, just at its inherited priority.
func startLowPriorityCmd(cmd *exec.Cmd, nice int) error {
	startErr := cmd.Start()
	if startErr == nil && cmd.Process != nil {
		_ = syscall.Setpriority(syscall.PRIO_PROCESS, cmd.Process.Pid, nice)
	}
	return startErr
}
|
||||||
|
|
||||||
|
func platformStressCPUThreads() int {
|
||||||
|
if n := envInt("BEE_PLATFORM_STRESS_THREADS", 0); n > 0 {
|
||||||
|
return n
|
||||||
|
}
|
||||||
|
cpus := runtime.NumCPU()
|
||||||
|
switch {
|
||||||
|
case cpus <= 2:
|
||||||
|
return 1
|
||||||
|
case cpus <= 8:
|
||||||
|
return cpus - 1
|
||||||
|
default:
|
||||||
|
return cpus - 2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func platformStressMemoryMB() int {
|
||||||
|
if mb := envInt("BEE_PLATFORM_STRESS_MB", 0); mb > 0 {
|
||||||
|
return mb
|
||||||
|
}
|
||||||
|
free := freeMemBytes()
|
||||||
|
if free <= 0 {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
mb := int((free * 60) / 100 / (1024 * 1024))
|
||||||
|
if mb < 1024 {
|
||||||
|
return 1024
|
||||||
|
}
|
||||||
|
return mb
|
||||||
|
}
|
||||||
|
|
||||||
|
// containsComponent reports whether name appears in components. The match is
// exact and case-sensitive; a nil or empty slice never matches.
func containsComponent(components []string, name string) bool {
	for i := 0; i < len(components); i++ {
		if components[i] == name {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
|
// packPlatformDir writes the regular files directly inside dir into a
// gzip-compressed tar archive at dest. Entries are stored as
// "<base name of dir>/<file name>" with mode 0644; subdirectories are not
// descended into.
//
// The directory is listed before dest is created, so a missing or unreadable
// dir does not leave an empty archive behind. Errors from finishing the tar
// stream, the gzip stream or the file are returned, because a failed close
// means the archive is truncated. On any failure the partially written
// archive is removed.
//
// Files that cannot be read are skipped on a best-effort basis, so the
// archive contains whatever could be collected.
func packPlatformDir(dir, dest string) (err error) {
	entries, err := os.ReadDir(dir)
	if err != nil {
		return err
	}

	f, err := os.Create(dest)
	if err != nil {
		return err
	}
	gz := gzip.NewWriter(f)
	tw := tar.NewWriter(gz)
	defer func() {
		// Close innermost first so each layer can flush into the next one.
		// The first error wins, but every layer is still closed.
		if cerr := tw.Close(); err == nil {
			err = cerr
		}
		if cerr := gz.Close(); err == nil {
			err = cerr
		}
		if cerr := f.Close(); err == nil {
			err = cerr
		}
		if err != nil {
			_ = os.Remove(dest) // do not leave a corrupt archive around
		}
	}()

	base := filepath.Base(dir)
	for _, e := range entries {
		if e.IsDir() {
			continue
		}
		data, readErr := os.ReadFile(filepath.Join(dir, e.Name()))
		if readErr != nil {
			continue // best effort: skip files that cannot be read
		}
		hdr := &tar.Header{
			// Tar entry names always use forward slashes.
			Name:    filepath.ToSlash(filepath.Join(base, e.Name())),
			Size:    int64(len(data)),
			Mode:    0644,
			ModTime: time.Now(),
		}
		if err := tw.WriteHeader(hdr); err != nil {
			return err
		}
		if _, err := tw.Write(data); err != nil {
			return err
		}
	}
	return nil
}
|
||||||
34
audit/internal/platform/platform_stress_test.go
Normal file
34
audit/internal/platform/platform_stress_test.go
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"runtime"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestPlatformStressCPUThreadsOverride(t *testing.T) {
|
||||||
|
t.Setenv("BEE_PLATFORM_STRESS_THREADS", "7")
|
||||||
|
if got := platformStressCPUThreads(); got != 7 {
|
||||||
|
t.Fatalf("platformStressCPUThreads=%d want 7", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestPlatformStressCPUThreadsDefaultLeavesHeadroom(t *testing.T) {
|
||||||
|
t.Setenv("BEE_PLATFORM_STRESS_THREADS", "")
|
||||||
|
got := platformStressCPUThreads()
|
||||||
|
if got < 1 {
|
||||||
|
t.Fatalf("platformStressCPUThreads=%d want >= 1", got)
|
||||||
|
}
|
||||||
|
if got > runtime.NumCPU() {
|
||||||
|
t.Fatalf("platformStressCPUThreads=%d want <= NumCPU=%d", got, runtime.NumCPU())
|
||||||
|
}
|
||||||
|
if runtime.NumCPU() > 2 && got >= runtime.NumCPU() {
|
||||||
|
t.Fatalf("platformStressCPUThreads=%d want headroom below NumCPU=%d", got, runtime.NumCPU())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestPlatformStressMemoryMBOverride(t *testing.T) {
|
||||||
|
t.Setenv("BEE_PLATFORM_STRESS_MB", "8192")
|
||||||
|
if got := platformStressMemoryMB(); got != 8192 {
|
||||||
|
t.Fatalf("platformStressMemoryMB=%d want 8192", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -136,7 +136,10 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
|
|||||||
tools = append(tools, s.CheckTools([]string{
|
tools = append(tools, s.CheckTools([]string{
|
||||||
"nvidia-smi",
|
"nvidia-smi",
|
||||||
"nvidia-bug-report.sh",
|
"nvidia-bug-report.sh",
|
||||||
"bee-gpu-stress",
|
"bee-gpu-burn",
|
||||||
|
"bee-john-gpu-stress",
|
||||||
|
"bee-nccl-gpu-stress",
|
||||||
|
"all_reduce_perf",
|
||||||
})...)
|
})...)
|
||||||
case "amd":
|
case "amd":
|
||||||
tool := ToolStatus{Name: "rocm-smi"}
|
tool := ToolStatus{Name: "rocm-smi"}
|
||||||
@@ -176,8 +179,8 @@ func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHe
|
|||||||
health.DriverReady = true
|
health.DriverReady = true
|
||||||
}
|
}
|
||||||
|
|
||||||
if lookErr := exec.Command("sh", "-c", "command -v bee-gpu-stress >/dev/null 2>&1").Run(); lookErr == nil {
|
if _, lookErr := exec.LookPath("bee-gpu-burn"); lookErr == nil {
|
||||||
out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
|
out, err := exec.Command("bee-gpu-burn", "--seconds", "1", "--size-mb", "1").CombinedOutput()
|
||||||
if err == nil {
|
if err == nil {
|
||||||
health.CUDAReady = true
|
health.CUDAReady = true
|
||||||
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
|
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
|
||||||
|
|||||||
@@ -2,6 +2,8 @@ package platform
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"archive/tar"
|
"archive/tar"
|
||||||
|
"bufio"
|
||||||
|
"bytes"
|
||||||
"compress/gzip"
|
"compress/gzip"
|
||||||
"context"
|
"context"
|
||||||
"errors"
|
"errors"
|
||||||
@@ -10,9 +12,11 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"syscall"
|
||||||
"sort"
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -30,8 +34,46 @@ var (
|
|||||||
"/opt/rocm/libexec/rocm_smi/rocm_smi.py",
|
"/opt/rocm/libexec/rocm_smi/rocm_smi.py",
|
||||||
"/opt/rocm-*/libexec/rocm_smi/rocm_smi.py",
|
"/opt/rocm-*/libexec/rocm_smi/rocm_smi.py",
|
||||||
}
|
}
|
||||||
|
rvsExecutableGlobs = []string{
|
||||||
|
"/opt/rocm/bin/rvs",
|
||||||
|
"/opt/rocm-*/bin/rvs",
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// streamExecOutput runs cmd to completion, forwarding every line of its
// combined stdout+stderr to logFunc (if non-nil) as it arrives.
//
// It returns the captured output and the error from starting or waiting on
// the command. In the captured output each line is terminated by a single
// "\n": a trailing "\r" is stripped and a final unterminated line gets a
// newline appended. If the command cannot be started the returned slice is
// nil.
//
// Lines are read without a length limit. A line-length cap here would be
// dangerous: once the reader stops consuming, the child's writes into the
// pipe block and Wait never returns.
func streamExecOutput(cmd *exec.Cmd, logFunc func(string)) ([]byte, error) {
	pr, pw := io.Pipe()
	cmd.Stdout = pw
	cmd.Stderr = pw

	var buf bytes.Buffer
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		// If the reader ever stops early, closing the read side makes
		// pending writes fail instead of blocking forever.
		defer pr.Close()

		reader := bufio.NewReader(pr)
		for {
			line, readErr := reader.ReadString('\n')
			if len(line) > 0 {
				line = strings.TrimSuffix(line, "\n")
				line = strings.TrimSuffix(line, "\r")
				buf.WriteString(line + "\n")
				if logFunc != nil {
					logFunc(line)
				}
			}
			if readErr != nil {
				// io.EOF once the write side is closed; nothing more to read.
				return
			}
		}
	}()

	err := cmd.Start()
	if err != nil {
		_ = pw.Close()
		wg.Wait()
		return nil, err
	}
	waitErr := cmd.Wait()
	// Closing the write side signals EOF to the reader goroutine; wait for
	// it so buf is complete and no longer being written to.
	_ = pw.Close()
	wg.Wait()
	return buf.Bytes(), waitErr
}
|
||||||
|
|
||||||
// NvidiaGPU holds basic GPU info from nvidia-smi.
|
// NvidiaGPU holds basic GPU info from nvidia-smi.
|
||||||
type NvidiaGPU struct {
|
type NvidiaGPU struct {
|
||||||
Index int
|
Index int
|
||||||
@@ -53,6 +95,12 @@ func (s *System) DetectGPUVendor() string {
|
|||||||
if _, err := os.Stat("/dev/kfd"); err == nil {
|
if _, err := os.Stat("/dev/kfd"); err == nil {
|
||||||
return "amd"
|
return "amd"
|
||||||
}
|
}
|
||||||
|
if raw, err := exec.Command("lspci", "-nn").Output(); err == nil {
|
||||||
|
text := strings.ToLower(string(raw))
|
||||||
|
if strings.Contains(text, "advanced micro devices") || strings.Contains(text, "amd/ati") {
|
||||||
|
return "amd"
|
||||||
|
}
|
||||||
|
}
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -80,13 +128,103 @@ func (s *System) ListAMDGPUs() ([]AMDGPUInfo, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// RunAMDAcceptancePack runs an AMD GPU diagnostic pack using rocm-smi.
|
// RunAMDAcceptancePack runs an AMD GPU diagnostic pack using rocm-smi.
|
||||||
func (s *System) RunAMDAcceptancePack(baseDir string) (string, error) {
|
func (s *System) RunAMDAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
return runAcceptancePack(baseDir, "gpu-amd", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd", []satJob{
|
||||||
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||||
{name: "02-rocm-smi-showallinfo.log", cmd: []string{"rocm-smi", "--showallinfo"}},
|
{name: "02-rocm-smi-showallinfo.log", cmd: []string{"rocm-smi", "--showallinfo"}},
|
||||||
{name: "03-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
{name: "03-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||||
{name: "04-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
{name: "04-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||||
})
|
}, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
// RunAMDMemIntegrityPack runs the official RVS MEM module as a validate-style memory integrity test.
|
||||||
|
func (s *System) RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
if err := ensureAMDRuntimeReady(); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
cfgFile := "/tmp/bee-amd-mem.conf"
|
||||||
|
cfg := `actions:
|
||||||
|
- name: mem_integrity
|
||||||
|
device: all
|
||||||
|
module: mem
|
||||||
|
parallel: true
|
||||||
|
duration: 60000
|
||||||
|
copy_matrix: false
|
||||||
|
target_stress: 90
|
||||||
|
matrix_size: 8640
|
||||||
|
`
|
||||||
|
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-mem", []satJob{
|
||||||
|
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||||
|
{name: "02-rvs-mem.log", cmd: []string{"rvs", "-c", cfgFile}},
|
||||||
|
{name: "03-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}},
|
||||||
|
}, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
// RunAMDMemBandwidthPack runs AMD's memory/interconnect bandwidth-oriented tools.
|
||||||
|
func (s *System) RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
if err := ensureAMDRuntimeReady(); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
cfgFile := "/tmp/bee-amd-babel.conf"
|
||||||
|
cfg := `actions:
|
||||||
|
- name: babel_mem_bw
|
||||||
|
device: all
|
||||||
|
module: babel
|
||||||
|
parallel: true
|
||||||
|
copy_matrix: true
|
||||||
|
target_stress: 90
|
||||||
|
matrix_size: 134217728
|
||||||
|
`
|
||||||
|
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-bandwidth", []satJob{
|
||||||
|
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||||
|
{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
|
||||||
|
{name: "03-rvs-babel.log", cmd: []string{"rvs", "-c", cfgFile}},
|
||||||
|
{name: "04-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}},
|
||||||
|
}, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
// RunAMDStressPack runs an AMD GPU burn-in pack.
|
||||||
|
// Missing tools are reported as UNSUPPORTED, consistent with the existing SAT pattern.
|
||||||
|
func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
seconds := durationSec
|
||||||
|
if seconds <= 0 {
|
||||||
|
seconds = envInt("BEE_AMD_STRESS_SECONDS", 300)
|
||||||
|
}
|
||||||
|
if err := ensureAMDRuntimeReady(); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
// Enable copy_matrix so the same GST run drives VRAM traffic in addition to compute.
|
||||||
|
rvsCfg := amdStressRVSConfig(seconds)
|
||||||
|
cfgFile := "/tmp/bee-amd-gst.conf"
|
||||||
|
_ = os.WriteFile(cfgFile, []byte(rvsCfg), 0644)
|
||||||
|
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-stress", amdStressJobs(seconds, cfgFile), logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
// amdStressRVSConfig returns the RVS configuration text for a GST stress
// action that runs for the given number of seconds. The duration is written
// to the config in milliseconds.
//
// NOTE(review): confirm that the nesting/indentation of the YAML keys below
// matches what rvs expects for an action entry.
func amdStressRVSConfig(seconds int) string {
	durationMS := seconds * 1000
	const tmpl = "actions:\n" +
		"- name: gst_stress\n" +
		"device: all\n" +
		"module: gst\n" +
		"parallel: true\n" +
		"duration: %d\n" +
		"copy_matrix: false\n" +
		"target_stress: 90\n" +
		"matrix_size_a: 8640\n" +
		"matrix_size_b: 8640\n" +
		"matrix_size_c: 8640\n"
	return fmt.Sprintf(tmpl, durationMS)
}
|
||||||
|
|
||||||
|
func amdStressJobs(seconds int, cfgFile string) []satJob {
|
||||||
|
return []satJob{
|
||||||
|
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||||
|
{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
|
||||||
|
{name: fmt.Sprintf("03-rvs-gst-%ds.log", seconds), cmd: []string{"rvs", "-c", cfgFile}},
|
||||||
|
{name: fmt.Sprintf("04-rocm-smi-after.log"), cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--csv"}},
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ListNvidiaGPUs returns GPUs visible to nvidia-smi.
|
// ListNvidiaGPUs returns GPUs visible to nvidia-smi.
|
||||||
@@ -123,7 +261,7 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
|
|||||||
|
|
||||||
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
|
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
|
||||||
// Measures collective communication bandwidth over NVLink/PCIe.
|
// Measures collective communication bandwidth over NVLink/PCIe.
|
||||||
func (s *System) RunNCCLTests(ctx context.Context, baseDir string) (string, error) {
|
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
// detect GPU count
|
// detect GPU count
|
||||||
out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
|
out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
|
||||||
gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
|
gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
|
||||||
@@ -136,44 +274,83 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string) (string, erro
|
|||||||
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
||||||
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
||||||
}},
|
}},
|
||||||
})
|
}, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs())
|
return runAcceptancePackCtx(context.Background(), baseDir, "gpu-nvidia", nvidiaSATJobs(), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
// RunNvidiaAcceptancePackWithOptions runs the NVIDIA diagnostics via DCGM.
|
// RunNvidiaAcceptancePackWithOptions runs the NVIDIA diagnostics via DCGM.
|
||||||
// diagLevel: 1=quick, 2=medium, 3=targeted stress, 4=extended stress.
|
// diagLevel: 1=quick, 2=medium, 3=targeted stress, 4=extended stress.
|
||||||
// gpuIndices: specific GPU indices to test (empty = all GPUs).
|
// gpuIndices: specific GPU indices to test (empty = all GPUs).
|
||||||
// ctx cancellation kills the running job.
|
// ctx cancellation kills the running job.
|
||||||
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int) (string, error) {
|
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices))
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunMemoryAcceptancePack(baseDir string) (string, error) {
|
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
|
sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
|
||||||
passes := envInt("BEE_MEMTESTER_PASSES", 1)
|
passes := envInt("BEE_MEMTESTER_PASSES", 1)
|
||||||
return runAcceptancePack(baseDir, "memory", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
||||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||||
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
||||||
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||||||
})
|
}, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) {
|
func (s *System) RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
seconds := durationSec
|
||||||
|
if seconds <= 0 {
|
||||||
|
seconds = envInt("BEE_VM_STRESS_SECONDS", 300)
|
||||||
|
}
|
||||||
|
// Use 80% of RAM by default; override with BEE_VM_STRESS_SIZE_MB.
|
||||||
|
sizeArg := "80%"
|
||||||
|
if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
|
||||||
|
sizeArg = fmt.Sprintf("%dM", mb)
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "memory-stress", []satJob{
|
||||||
|
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||||
|
{name: "02-stress-ng-vm.log", cmd: []string{
|
||||||
|
"stress-ng", "--vm", "1",
|
||||||
|
"--vm-bytes", sizeArg,
|
||||||
|
"--vm-method", "all",
|
||||||
|
"--timeout", fmt.Sprintf("%d", seconds),
|
||||||
|
"--metrics-brief",
|
||||||
|
}},
|
||||||
|
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||||||
|
}, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
seconds := durationSec
|
||||||
|
if seconds <= 0 {
|
||||||
|
seconds = envInt("BEE_SAT_STRESS_SECONDS", 300)
|
||||||
|
}
|
||||||
|
cmd := []string{"stressapptest", "-s", fmt.Sprintf("%d", seconds), "-W", "--cc_test"}
|
||||||
|
if mb := envInt("BEE_SAT_STRESS_MB", 0); mb > 0 {
|
||||||
|
cmd = append(cmd, "-M", fmt.Sprintf("%d", mb))
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "sat-stress", []satJob{
|
||||||
|
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||||
|
{name: "02-stressapptest.log", cmd: cmd},
|
||||||
|
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||||||
|
}, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
if durationSec <= 0 {
|
if durationSec <= 0 {
|
||||||
durationSec = 60
|
durationSec = 60
|
||||||
}
|
}
|
||||||
return runAcceptancePack(baseDir, "cpu", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "cpu", []satJob{
|
||||||
{name: "01-lscpu.log", cmd: []string{"lscpu"}},
|
{name: "01-lscpu.log", cmd: []string{"lscpu"}},
|
||||||
{name: "02-sensors-before.log", cmd: []string{"sensors"}},
|
{name: "02-sensors-before.log", cmd: []string{"sensors"}},
|
||||||
{name: "03-stress-ng.log", cmd: []string{"stress-ng", "--cpu", "0", "--cpu-method", "all", "--timeout", fmt.Sprintf("%d", durationSec)}},
|
{name: "03-stress-ng.log", cmd: []string{"stress-ng", "--cpu", "0", "--cpu-method", "all", "--timeout", fmt.Sprintf("%d", durationSec)}},
|
||||||
{name: "04-sensors-after.log", cmd: []string{"sensors"}},
|
{name: "04-sensors-after.log", cmd: []string{"sensors"}},
|
||||||
})
|
}, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunStorageAcceptancePack(baseDir string) (string, error) {
|
func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
if baseDir == "" {
|
if baseDir == "" {
|
||||||
baseDir = "/var/log/bee-sat"
|
baseDir = "/var/log/bee-sat"
|
||||||
}
|
}
|
||||||
@@ -201,11 +378,17 @@ func (s *System) RunStorageAcceptancePack(baseDir string) (string, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for index, devPath := range devices {
|
for index, devPath := range devices {
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
|
prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
|
||||||
commands := storageSATCommands(devPath)
|
commands := storageSATCommands(devPath)
|
||||||
for cmdIndex, job := range commands {
|
for cmdIndex, job := range commands {
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
name := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+1, job.name)
|
name := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+1, job.name)
|
||||||
out, err := runSATCommand(verboseLog, job.name, job.cmd)
|
out, err := runSATCommandCtx(ctx, verboseLog, job.name, job.cmd, nil, logFunc)
|
||||||
if writeErr := os.WriteFile(filepath.Join(runDir, name), out, 0644); writeErr != nil {
|
if writeErr := os.WriteFile(filepath.Join(runDir, name), out, 0644); writeErr != nil {
|
||||||
return "", writeErr
|
return "", writeErr
|
||||||
}
|
}
|
||||||
@@ -243,58 +426,15 @@ type satStats struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func nvidiaSATJobs() []satJob {
|
func nvidiaSATJobs() []satJob {
|
||||||
seconds := envInt("BEE_GPU_STRESS_SECONDS", 5)
|
|
||||||
sizeMB := envInt("BEE_GPU_STRESS_SIZE_MB", 64)
|
|
||||||
return []satJob{
|
return []satJob{
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||||
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
||||||
{name: "05-bee-gpu-stress.log", cmd: []string{"bee-gpu-stress", "--seconds", fmt.Sprintf("%d", seconds), "--size-mb", fmt.Sprintf("%d", sizeMB)}},
|
{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func runAcceptancePack(baseDir, prefix string, jobs []satJob) (string, error) {
|
|
||||||
if baseDir == "" {
|
|
||||||
baseDir = "/var/log/bee-sat"
|
|
||||||
}
|
|
||||||
ts := time.Now().UTC().Format("20060102-150405")
|
|
||||||
runDir := filepath.Join(baseDir, prefix+"-"+ts)
|
|
||||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
verboseLog := filepath.Join(runDir, "verbose.log")
|
|
||||||
|
|
||||||
var summary strings.Builder
|
|
||||||
stats := satStats{}
|
|
||||||
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
|
||||||
for _, job := range jobs {
|
|
||||||
cmd := make([]string, 0, len(job.cmd))
|
|
||||||
for _, arg := range job.cmd {
|
|
||||||
cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
|
|
||||||
}
|
|
||||||
out, err := runSATCommand(verboseLog, job.name, cmd)
|
|
||||||
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
|
||||||
return "", writeErr
|
|
||||||
}
|
|
||||||
status, rc := classifySATResult(job.name, out, err)
|
|
||||||
stats.Add(status)
|
|
||||||
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
|
|
||||||
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
|
|
||||||
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
|
|
||||||
}
|
|
||||||
writeSATStats(&summary, stats)
|
|
||||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
|
|
||||||
archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
|
|
||||||
if err := createTarGz(archive, runDir); err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
return archive, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
||||||
if diagLevel < 1 || diagLevel > 4 {
|
if diagLevel < 1 || diagLevel > 4 {
|
||||||
diagLevel = 3
|
diagLevel = 3
|
||||||
@@ -315,7 +455,10 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob) (string, error) {
|
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
|
||||||
|
if ctx == nil {
|
||||||
|
ctx = context.Background()
|
||||||
|
}
|
||||||
if baseDir == "" {
|
if baseDir == "" {
|
||||||
baseDir = "/var/log/bee-sat"
|
baseDir = "/var/log/bee-sat"
|
||||||
}
|
}
|
||||||
@@ -342,9 +485,9 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
|||||||
var err error
|
var err error
|
||||||
|
|
||||||
if job.collectGPU {
|
if job.collectGPU {
|
||||||
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir)
|
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
|
||||||
} else {
|
} else {
|
||||||
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env)
|
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
||||||
@@ -368,13 +511,16 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
|||||||
return archive, nil
|
return archive, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string) ([]byte, error) {
|
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string, logFunc func(string)) ([]byte, error) {
|
||||||
start := time.Now().UTC()
|
start := time.Now().UTC()
|
||||||
resolvedCmd, err := resolveSATCommand(cmd)
|
resolvedCmd, err := resolveSATCommand(cmd)
|
||||||
appendSATVerboseLog(verboseLog,
|
appendSATVerboseLog(verboseLog,
|
||||||
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
|
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
|
||||||
"cmd: "+strings.Join(resolvedCmd, " "),
|
"cmd: "+strings.Join(resolvedCmd, " "),
|
||||||
)
|
)
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(fmt.Sprintf("=== %s ===", name))
|
||||||
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
appendSATVerboseLog(verboseLog,
|
appendSATVerboseLog(verboseLog,
|
||||||
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
||||||
@@ -386,10 +532,17 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string
|
|||||||
}
|
}
|
||||||
|
|
||||||
c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...)
|
c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...)
|
||||||
|
c.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||||
|
c.Cancel = func() error {
|
||||||
|
if c.Process != nil {
|
||||||
|
_ = syscall.Kill(-c.Process.Pid, syscall.SIGKILL)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
if len(env) > 0 {
|
if len(env) > 0 {
|
||||||
c.Env = append(os.Environ(), env...)
|
c.Env = append(os.Environ(), env...)
|
||||||
}
|
}
|
||||||
out, err := c.CombinedOutput()
|
out, err := streamExecOutput(c, logFunc)
|
||||||
|
|
||||||
rc := 0
|
rc := 0
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -464,6 +617,11 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
text := strings.ToLower(string(out))
|
text := strings.ToLower(string(out))
|
||||||
|
// No output at all means the tool failed to start (mlock limit, binary missing,
|
||||||
|
// etc.) — we cannot say anything about hardware health → UNSUPPORTED.
|
||||||
|
if len(strings.TrimSpace(text)) == 0 {
|
||||||
|
return "UNSUPPORTED", rc
|
||||||
|
}
|
||||||
if strings.Contains(text, "unsupported") ||
|
if strings.Contains(text, "unsupported") ||
|
||||||
strings.Contains(text, "not supported") ||
|
strings.Contains(text, "not supported") ||
|
||||||
strings.Contains(text, "invalid opcode") ||
|
strings.Contains(text, "invalid opcode") ||
|
||||||
@@ -472,19 +630,25 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
|
|||||||
strings.Contains(text, "not available") ||
|
strings.Contains(text, "not available") ||
|
||||||
strings.Contains(text, "cuda_error_system_not_ready") ||
|
strings.Contains(text, "cuda_error_system_not_ready") ||
|
||||||
strings.Contains(text, "no such device") ||
|
strings.Contains(text, "no such device") ||
|
||||||
|
// nvidia-smi on a machine with no NVIDIA GPU
|
||||||
|
strings.Contains(text, "couldn't communicate with the nvidia driver") ||
|
||||||
|
strings.Contains(text, "no nvidia gpu") ||
|
||||||
(strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) {
|
(strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) {
|
||||||
return "UNSUPPORTED", rc
|
return "UNSUPPORTED", rc
|
||||||
}
|
}
|
||||||
return "FAILED", rc
|
return "FAILED", rc
|
||||||
}
|
}
|
||||||
|
|
||||||
func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
|
func runSATCommand(verboseLog, name string, cmd []string, logFunc func(string)) ([]byte, error) {
|
||||||
start := time.Now().UTC()
|
start := time.Now().UTC()
|
||||||
resolvedCmd, err := resolveSATCommand(cmd)
|
resolvedCmd, err := resolveSATCommand(cmd)
|
||||||
appendSATVerboseLog(verboseLog,
|
appendSATVerboseLog(verboseLog,
|
||||||
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
|
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
|
||||||
"cmd: "+strings.Join(resolvedCmd, " "),
|
"cmd: "+strings.Join(resolvedCmd, " "),
|
||||||
)
|
)
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(fmt.Sprintf("=== %s ===", name))
|
||||||
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
appendSATVerboseLog(verboseLog,
|
appendSATVerboseLog(verboseLog,
|
||||||
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
||||||
@@ -495,7 +659,7 @@ func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
|
|||||||
return []byte(err.Error() + "\n"), err
|
return []byte(err.Error() + "\n"), err
|
||||||
}
|
}
|
||||||
|
|
||||||
out, err := satExecCommand(resolvedCmd[0], resolvedCmd[1:]...).CombinedOutput()
|
out, err := streamExecOutput(satExecCommand(resolvedCmd[0], resolvedCmd[1:]...), logFunc)
|
||||||
|
|
||||||
rc := 0
|
rc := 0
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -522,10 +686,27 @@ func resolveSATCommand(cmd []string) ([]string, error) {
|
|||||||
if len(cmd) == 0 {
|
if len(cmd) == 0 {
|
||||||
return nil, errors.New("empty SAT command")
|
return nil, errors.New("empty SAT command")
|
||||||
}
|
}
|
||||||
if cmd[0] != "rocm-smi" {
|
switch cmd[0] {
|
||||||
return cmd, nil
|
case "rocm-smi":
|
||||||
|
return resolveROCmSMICommand(cmd[1:]...)
|
||||||
|
case "rvs":
|
||||||
|
return resolveRVSCommand(cmd[1:]...)
|
||||||
}
|
}
|
||||||
return resolveROCmSMICommand(cmd[1:]...)
|
path, err := satLookPath(cmd[0])
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("%s not found in PATH: %w", cmd[0], err)
|
||||||
|
}
|
||||||
|
return append([]string{path}, cmd[1:]...), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func resolveRVSCommand(args ...string) ([]string, error) {
|
||||||
|
if path, err := satLookPath("rvs"); err == nil {
|
||||||
|
return append([]string{path}, args...), nil
|
||||||
|
}
|
||||||
|
for _, path := range expandExistingPaths(rvsExecutableGlobs) {
|
||||||
|
return append([]string{path}, args...), nil
|
||||||
|
}
|
||||||
|
return nil, errors.New("rvs not found in PATH or under /opt/rocm")
|
||||||
}
|
}
|
||||||
|
|
||||||
func resolveROCmSMICommand(args ...string) ([]string, error) {
|
func resolveROCmSMICommand(args ...string) ([]string, error) {
|
||||||
@@ -549,6 +730,20 @@ func resolveROCmSMICommand(args ...string) ([]string, error) {
|
|||||||
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
|
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ensureAMDRuntimeReady reports whether the AMD GPU runtime looks usable.
//
// It returns nil when /dev/kfd can be stat'ed, or when
// /sys/module/amdgpu/initstate can be read and contains "live"
// (case-insensitive, surrounding whitespace ignored). Otherwise it returns an
// error: one quoting the observed initstate when the file was readable, and a
// generic "not initialized" error when it was not.
func ensureAMDRuntimeReady() error {
	_, statErr := os.Stat("/dev/kfd")
	if statErr == nil {
		return nil
	}

	raw, readErr := os.ReadFile("/sys/module/amdgpu/initstate")
	if readErr != nil {
		return errors.New("AMD GPUs are present but the runtime is not initialized: /dev/kfd is missing and amdgpu is not loaded")
	}

	state := strings.TrimSpace(string(raw))
	if !strings.EqualFold(state, "live") {
		return fmt.Errorf("AMD driver is present but not initialized: amdgpu initstate=%q", state)
	}
	return nil
}
|
||||||
|
|
||||||
func rocmSMIExecutableCandidates() []string {
|
func rocmSMIExecutableCandidates() []string {
|
||||||
return expandExistingPaths(rocmSMIExecutableGlobs)
|
return expandExistingPaths(rocmSMIExecutableGlobs)
|
||||||
}
|
}
|
||||||
@@ -597,7 +792,7 @@ func parseStorageDevices(raw string) []string {
|
|||||||
|
|
||||||
// runSATCommandWithMetrics runs a command while collecting GPU metrics in the background.
|
// runSATCommandWithMetrics runs a command while collecting GPU metrics in the background.
|
||||||
// On completion it writes gpu-metrics.csv and gpu-metrics.html into runDir.
|
// On completion it writes gpu-metrics.csv and gpu-metrics.html into runDir.
|
||||||
func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string) ([]byte, error) {
|
func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string, logFunc func(string)) ([]byte, error) {
|
||||||
stopCh := make(chan struct{})
|
stopCh := make(chan struct{})
|
||||||
doneCh := make(chan struct{})
|
doneCh := make(chan struct{})
|
||||||
var metricRows []GPUMetricRow
|
var metricRows []GPUMetricRow
|
||||||
@@ -625,7 +820,7 @@ func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd
|
|||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env)
|
out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env, logFunc)
|
||||||
|
|
||||||
close(stopCh)
|
close(stopCh)
|
||||||
<-doneCh
|
<-doneCh
|
||||||
|
|||||||
@@ -2,10 +2,12 @@ package platform
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
@@ -49,6 +51,18 @@ type FanStressRow struct {
|
|||||||
SysPowerW float64 // DCMI system power reading
|
SysPowerW float64 // DCMI system power reading
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// systemPowerHoldTTL is the maximum age at which a cached system power reading
// may still be returned in place of a missing one.
const systemPowerHoldTTL = 15 * time.Second

// cachedPowerReading pairs a system power value with the time it was recorded.
type cachedPowerReading struct {
	Value     float64   // last reading; values <= 0 are treated as "no reading"
	UpdatedAt time.Time // when Value was recorded; zero means never
}

// systemPowerCache is the last reading kept by sampleSystemPower;
// systemPowerCacheMu must be held while reading or replacing it.
var (
	systemPowerCacheMu sync.Mutex
	systemPowerCache   cachedPowerReading
)
|
||||||
|
|
||||||
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
||||||
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
||||||
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
||||||
@@ -128,26 +142,21 @@ func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanS
|
|||||||
stats.OK++
|
stats.OK++
|
||||||
}
|
}
|
||||||
|
|
||||||
// loadPhase runs bee-gpu-stress for durSec; sampler stamps phaseName on each row.
|
// loadPhase runs bee-gpu-burn for durSec; sampler stamps phaseName on each row.
|
||||||
loadPhase := func(phaseName, stepName string, durSec int) {
|
loadPhase := func(phaseName, stepName string, durSec int) {
|
||||||
if ctx.Err() != nil {
|
if ctx.Err() != nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
setPhase(phaseName)
|
setPhase(phaseName)
|
||||||
var env []string
|
|
||||||
if len(opts.GPUIndices) > 0 {
|
|
||||||
ids := make([]string, len(opts.GPUIndices))
|
|
||||||
for i, idx := range opts.GPUIndices {
|
|
||||||
ids[i] = strconv.Itoa(idx)
|
|
||||||
}
|
|
||||||
env = []string{"CUDA_VISIBLE_DEVICES=" + strings.Join(ids, ",")}
|
|
||||||
}
|
|
||||||
cmd := []string{
|
cmd := []string{
|
||||||
"bee-gpu-stress",
|
"bee-gpu-burn",
|
||||||
"--seconds", strconv.Itoa(durSec),
|
"--seconds", strconv.Itoa(durSec),
|
||||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||||
}
|
}
|
||||||
out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, env)
|
if len(opts.GPUIndices) > 0 {
|
||||||
|
cmd = append(cmd, "--devices", joinIndexList(dedupeSortedIndices(opts.GPUIndices)))
|
||||||
|
}
|
||||||
|
out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, nil, nil)
|
||||||
_ = os.WriteFile(filepath.Join(runDir, stepName+".log"), out, 0644)
|
_ = os.WriteFile(filepath.Join(runDir, stepName+".log"), out, 0644)
|
||||||
if err != nil && err != context.Canceled && err.Error() != "signal: killed" {
|
if err != nil && err != context.Canceled && err.Error() != "signal: killed" {
|
||||||
fmt.Fprintf(&summary, "%s_status=FAILED\n", stepName)
|
fmt.Fprintf(&summary, "%s_status=FAILED\n", stepName)
|
||||||
@@ -304,41 +313,148 @@ func sampleGPUStressMetrics(gpuIndices []int) []GPUStressMetric {
|
|||||||
// sampleFanSpeeds reads fan RPM values from ipmitool sdr.
|
// sampleFanSpeeds reads fan RPM values from ipmitool sdr.
|
||||||
func sampleFanSpeeds() ([]FanReading, error) {
|
func sampleFanSpeeds() ([]FanReading, error) {
|
||||||
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
|
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
|
||||||
|
if err == nil {
|
||||||
|
if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
|
||||||
|
return fans, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
|
||||||
|
if len(fans) > 0 {
|
||||||
|
return fans, nil
|
||||||
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
return parseFanSpeeds(string(out)), nil
|
return nil, sensorsErr
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
||||||
// Line format: "FAN1 | 2400.000 | RPM | ok"
|
// Handles two formats:
|
||||||
|
//
|
||||||
|
// Old: "FAN1 | 2400.000 | RPM | ok" (value in col[1], unit in col[2])
|
||||||
|
// New: "FAN1 | 41h | ok | 29.1 | 4340 RPM" (value+unit combined in last col)
|
||||||
func parseFanSpeeds(raw string) []FanReading {
|
func parseFanSpeeds(raw string) []FanReading {
|
||||||
var fans []FanReading
|
var fans []FanReading
|
||||||
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
||||||
parts := strings.Split(line, "|")
|
parts := strings.Split(line, "|")
|
||||||
if len(parts) < 3 {
|
if len(parts) < 2 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
unit := strings.TrimSpace(parts[2])
|
name := strings.TrimSpace(parts[0])
|
||||||
if !strings.EqualFold(unit, "RPM") {
|
// Find the first field that contains "RPM" (either as a standalone unit or inline)
|
||||||
|
rpmVal := 0.0
|
||||||
|
found := false
|
||||||
|
for _, p := range parts[1:] {
|
||||||
|
p = strings.TrimSpace(p)
|
||||||
|
if !strings.Contains(strings.ToUpper(p), "RPM") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if strings.EqualFold(p, "RPM") {
|
||||||
|
continue // unit-only column in old format; value is in previous field
|
||||||
|
}
|
||||||
|
val, err := parseFanRPMValue(p)
|
||||||
|
if err == nil {
|
||||||
|
rpmVal = val
|
||||||
|
found = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Old format: unit "RPM" is in col[2], value is in col[1]
|
||||||
|
if !found && len(parts) >= 3 && strings.EqualFold(strings.TrimSpace(parts[2]), "RPM") {
|
||||||
|
valStr := strings.TrimSpace(parts[1])
|
||||||
|
if !strings.EqualFold(valStr, "na") && !strings.EqualFold(valStr, "disabled") && valStr != "" {
|
||||||
|
if val, err := parseFanRPMValue(valStr); err == nil {
|
||||||
|
rpmVal = val
|
||||||
|
found = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !found {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
valStr := strings.TrimSpace(parts[1])
|
fans = append(fans, FanReading{Name: name, RPM: rpmVal})
|
||||||
if strings.EqualFold(valStr, "na") || strings.EqualFold(valStr, "disabled") || valStr == "" {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
val, err := strconv.ParseFloat(valStr, 64)
|
|
||||||
if err != nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
fans = append(fans, FanReading{
|
|
||||||
Name: strings.TrimSpace(parts[0]),
|
|
||||||
RPM: val,
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
return fans
|
return fans
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// parseFanRPMValue extracts a fan speed from a single ipmitool column such as
// "2400.000", "4340 RPM" or "4,340 RPM".
//
// Commas are removed, then the first whitespace-separated token is parsed as a
// float; anything after it (for example a trailing "RPM") is ignored.
//
// It returns strconv.ErrSyntax for blank input, the strconv.ParseFloat error
// for a token that is not a number, and strconv.ErrRange for a token that
// parses to a negative, NaN or infinite value. ParseFloat accepts spellings
// like "NaN" and "Inf", and those must not be reported as a fan speed. The
// returned value is always 0 when the error is non-nil.
func parseFanRPMValue(raw string) (float64, error) {
	// Spelled out instead of math.MaxFloat64 so this helper needs no extra import.
	const maxFiniteFloat64 = 1.7976931348623157e308

	fields := strings.Fields(strings.ReplaceAll(raw, ",", ""))
	if len(fields) == 0 {
		return 0, strconv.ErrSyntax
	}
	rpm, err := strconv.ParseFloat(fields[0], 64)
	if err != nil {
		return 0, err
	}
	// rpm != rpm is true only for NaN; the upper bound catches +Inf and the
	// sign check catches both negative readings and -Inf.
	if rpm != rpm || rpm < 0 || rpm > maxFiniteFloat64 {
		return 0, strconv.ErrRange
	}
	return rpm, nil
}
|
||||||
|
|
||||||
|
func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
|
||||||
|
out, err := exec.Command("sensors", "-j").Output()
|
||||||
|
if err != nil || len(out) == 0 {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
var doc map[string]map[string]any
|
||||||
|
if err := json.Unmarshal(out, &doc); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
chips := make([]string, 0, len(doc))
|
||||||
|
for chip := range doc {
|
||||||
|
chips = append(chips, chip)
|
||||||
|
}
|
||||||
|
sort.Strings(chips)
|
||||||
|
var fans []FanReading
|
||||||
|
seen := map[string]struct{}{}
|
||||||
|
for _, chip := range chips {
|
||||||
|
features := doc[chip]
|
||||||
|
names := make([]string, 0, len(features))
|
||||||
|
for name := range features {
|
||||||
|
names = append(names, name)
|
||||||
|
}
|
||||||
|
sort.Strings(names)
|
||||||
|
for _, name := range names {
|
||||||
|
feature, ok := features[name].(map[string]any)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
rpm, ok := firstFanInputValue(feature)
|
||||||
|
if !ok || rpm <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
label := strings.TrimSpace(name)
|
||||||
|
if chip != "" && !strings.Contains(strings.ToLower(label), strings.ToLower(chip)) {
|
||||||
|
label = chip + " / " + label
|
||||||
|
}
|
||||||
|
if _, ok := seen[label]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[label] = struct{}{}
|
||||||
|
fans = append(fans, FanReading{Name: label, RPM: rpm})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return fans, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// firstFanInputValue returns the numeric value of the first usable fan input
// field in feature.
//
// A key qualifies when, compared case-insensitively, it contains "fan" and
// ends in "_input". Qualifying keys are tried in sorted order. A float64 value
// is returned directly and a string value is returned if it parses as a float.
// Values of any other type, and strings that do not parse, are skipped. The
// second result is false when no qualifying key yields a number.
func firstFanInputValue(feature map[string]any) (float64, bool) {
	var candidates []string
	for key := range feature {
		lower := strings.ToLower(key)
		if strings.Contains(lower, "fan") && strings.HasSuffix(lower, "_input") {
			candidates = append(candidates, key)
		}
	}
	// Map iteration order is random; sorting makes the choice deterministic.
	sort.Strings(candidates)

	for _, key := range candidates {
		switch v := feature[key].(type) {
		case float64:
			return v, true
		case string:
			if parsed, err := strconv.ParseFloat(v, 64); err == nil {
				return parsed, true
			}
		}
	}
	return 0, false
}
|
||||||
|
|
||||||
// sampleCPUMaxTemp returns the highest CPU/inlet temperature from ipmitool or sensors.
|
// sampleCPUMaxTemp returns the highest CPU/inlet temperature from ipmitool or sensors.
|
||||||
func sampleCPUMaxTemp() float64 {
|
func sampleCPUMaxTemp() float64 {
|
||||||
out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()
|
out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()
|
||||||
@@ -404,11 +520,17 @@ func sampleCPUTempViaSensors() float64 {
|
|||||||
|
|
||||||
// sampleSystemPower reads system power draw via DCMI.
|
// sampleSystemPower reads system power draw via DCMI.
|
||||||
func sampleSystemPower() float64 {
|
func sampleSystemPower() float64 {
|
||||||
|
now := time.Now()
|
||||||
|
current := 0.0
|
||||||
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
|
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
|
||||||
if err != nil {
|
if err == nil {
|
||||||
return 0
|
current = parseDCMIPowerReading(string(out))
|
||||||
}
|
}
|
||||||
return parseDCMIPowerReading(string(out))
|
systemPowerCacheMu.Lock()
|
||||||
|
defer systemPowerCacheMu.Unlock()
|
||||||
|
value, updated := effectiveSystemPowerReading(systemPowerCache, current, now)
|
||||||
|
systemPowerCache = updated
|
||||||
|
return value
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
|
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
|
||||||
@@ -431,6 +553,17 @@ func parseDCMIPowerReading(raw string) float64 {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) {
|
||||||
|
if current > 0 {
|
||||||
|
cache = cachedPowerReading{Value: current, UpdatedAt: now}
|
||||||
|
return current, cache
|
||||||
|
}
|
||||||
|
if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {
|
||||||
|
return cache.Value, cache
|
||||||
|
}
|
||||||
|
return 0, cache
|
||||||
|
}
|
||||||
|
|
||||||
// analyzeThrottling returns true if any GPU reported an active throttle reason
|
// analyzeThrottling returns true if any GPU reported an active throttle reason
|
||||||
// during either load phase.
|
// during either load phase.
|
||||||
func analyzeThrottling(rows []FanStressRow) bool {
|
func analyzeThrottling(rows []FanStressRow) bool {
|
||||||
|
|||||||
67
audit/internal/platform/sat_fan_stress_test.go
Normal file
67
audit/internal/platform/sat_fan_stress_test.go
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestParseFanSpeeds(t *testing.T) {
|
||||||
|
raw := "FAN1 | 2400.000 | RPM | ok\nFAN2 | 1800 RPM | ok | ok\nFAN3 | na | RPM | ns\n"
|
||||||
|
got := parseFanSpeeds(raw)
|
||||||
|
if len(got) != 2 {
|
||||||
|
t.Fatalf("fans=%d want 2 (%v)", len(got), got)
|
||||||
|
}
|
||||||
|
if got[0].Name != "FAN1" || got[0].RPM != 2400 {
|
||||||
|
t.Fatalf("fan0=%+v", got[0])
|
||||||
|
}
|
||||||
|
if got[1].Name != "FAN2" || got[1].RPM != 1800 {
|
||||||
|
t.Fatalf("fan1=%+v", got[1])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFirstFanInputValue(t *testing.T) {
|
||||||
|
feature := map[string]any{
|
||||||
|
"fan1_input": 9200.0,
|
||||||
|
}
|
||||||
|
got, ok := firstFanInputValue(feature)
|
||||||
|
if !ok || got != 9200 {
|
||||||
|
t.Fatalf("got=%v ok=%v", got, ok)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestParseDCMIPowerReading checks that parseDCMIPowerReading returns the
// "Instantaneous power reading" figure (512) and not the minimum that follows
// it in the same output.
func TestParseDCMIPowerReading(t *testing.T) {
	// Two-line fixture; the second line must not be picked up.
	raw := `
Instantaneous power reading: 512 Watts
Minimum during sampling period: 498 Watts
`
	if got := parseDCMIPowerReading(raw); got != 512 {
		t.Fatalf("parseDCMIPowerReading()=%v want 512", got)
	}
}
|
||||||
|
|
||||||
|
func TestEffectiveSystemPowerReading(t *testing.T) {
|
||||||
|
now := time.Now()
|
||||||
|
cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}
|
||||||
|
|
||||||
|
got, updated := effectiveSystemPowerReading(cache, 0, now)
|
||||||
|
if got != 480 {
|
||||||
|
t.Fatalf("got=%v want cached 480", got)
|
||||||
|
}
|
||||||
|
if updated.Value != 480 {
|
||||||
|
t.Fatalf("updated=%+v", updated)
|
||||||
|
}
|
||||||
|
|
||||||
|
got, updated = effectiveSystemPowerReading(cache, 530, now)
|
||||||
|
if got != 530 {
|
||||||
|
t.Fatalf("got=%v want 530", got)
|
||||||
|
}
|
||||||
|
if updated.Value != 530 {
|
||||||
|
t.Fatalf("updated=%+v", updated)
|
||||||
|
}
|
||||||
|
|
||||||
|
expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
|
||||||
|
got, _ = effectiveSystemPowerReading(expired, 0, now)
|
||||||
|
if got != 0 {
|
||||||
|
t.Fatalf("expired cache returned %v want 0", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -5,6 +5,7 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -30,21 +31,59 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
|
|||||||
if len(jobs) != 5 {
|
if len(jobs) != 5 {
|
||||||
t.Fatalf("jobs=%d want 5", len(jobs))
|
t.Fatalf("jobs=%d want 5", len(jobs))
|
||||||
}
|
}
|
||||||
if got := jobs[4].cmd[0]; got != "bee-gpu-stress" {
|
if got := jobs[4].cmd[0]; got != "bee-gpu-burn" {
|
||||||
t.Fatalf("gpu stress command=%q want bee-gpu-stress", got)
|
t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
|
||||||
}
|
}
|
||||||
if got := jobs[3].cmd[1]; got != "--output-file" {
|
if got := jobs[3].cmd[1]; got != "--output-file" {
|
||||||
t.Fatalf("bug report flag=%q want --output-file", got)
|
t.Fatalf("bug report flag=%q want --output-file", got)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
|
func TestAMDStressConfigUsesSingleGSTAction(t *testing.T) {
|
||||||
t.Setenv("BEE_GPU_STRESS_SECONDS", "9")
|
t.Parallel()
|
||||||
t.Setenv("BEE_GPU_STRESS_SIZE_MB", "96")
|
|
||||||
|
|
||||||
|
cfg := amdStressRVSConfig(123)
|
||||||
|
if !strings.Contains(cfg, "module: gst") {
|
||||||
|
t.Fatalf("config missing gst module:\n%s", cfg)
|
||||||
|
}
|
||||||
|
if strings.Contains(cfg, "module: mem") {
|
||||||
|
t.Fatalf("config should not include mem module:\n%s", cfg)
|
||||||
|
}
|
||||||
|
if !strings.Contains(cfg, "copy_matrix: false") {
|
||||||
|
t.Fatalf("config should use copy_matrix=false:\n%s", cfg)
|
||||||
|
}
|
||||||
|
if strings.Count(cfg, "duration: 123000") != 1 {
|
||||||
|
t.Fatalf("config should apply duration once:\n%s", cfg)
|
||||||
|
}
|
||||||
|
for _, field := range []string{"matrix_size_a: 8640", "matrix_size_b: 8640", "matrix_size_c: 8640"} {
|
||||||
|
if !strings.Contains(cfg, field) {
|
||||||
|
t.Fatalf("config missing %s:\n%s", field, cfg)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
jobs := amdStressJobs(300, "/tmp/test-amd-gst.conf")
|
||||||
|
if len(jobs) != 4 {
|
||||||
|
t.Fatalf("jobs=%d want 4", len(jobs))
|
||||||
|
}
|
||||||
|
if got := jobs[1].cmd[0]; got != "rocm-bandwidth-test" {
|
||||||
|
t.Fatalf("jobs[1]=%q want rocm-bandwidth-test", got)
|
||||||
|
}
|
||||||
|
if got := jobs[2].cmd[0]; got != "rvs" {
|
||||||
|
t.Fatalf("jobs[2]=%q want rvs", got)
|
||||||
|
}
|
||||||
|
if got := jobs[2].cmd[2]; got != "/tmp/test-amd-gst.conf" {
|
||||||
|
t.Fatalf("jobs[2] cfg=%q want /tmp/test-amd-gst.conf", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
||||||
jobs := nvidiaSATJobs()
|
jobs := nvidiaSATJobs()
|
||||||
got := jobs[4].cmd
|
got := jobs[4].cmd
|
||||||
want := []string{"bee-gpu-stress", "--seconds", "9", "--size-mb", "96"}
|
want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
|
||||||
if len(got) != len(want) {
|
if len(got) != len(want) {
|
||||||
t.Fatalf("cmd len=%d want %d", len(got), len(want))
|
t.Fatalf("cmd len=%d want %d", len(got), len(want))
|
||||||
}
|
}
|
||||||
@@ -55,6 +94,93 @@ func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
oldExecCommand := satExecCommand
|
||||||
|
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||||
|
if name == "nvidia-smi" {
|
||||||
|
return exec.Command("sh", "-c", "printf '0\n1\n2\n'")
|
||||||
|
}
|
||||||
|
return exec.Command(name, args...)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||||
|
|
||||||
|
job, err := buildNvidiaStressJob(NvidiaStressOptions{
|
||||||
|
DurationSec: 600,
|
||||||
|
Loader: NvidiaStressLoaderJohn,
|
||||||
|
ExcludeGPUIndices: []int{1},
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("buildNvidiaStressJob error: %v", err)
|
||||||
|
}
|
||||||
|
wantCmd := []string{"bee-john-gpu-stress", "--seconds", "600", "--devices", "0,2"}
|
||||||
|
if len(job.cmd) != len(wantCmd) {
|
||||||
|
t.Fatalf("cmd len=%d want %d (%v)", len(job.cmd), len(wantCmd), job.cmd)
|
||||||
|
}
|
||||||
|
for i := range wantCmd {
|
||||||
|
if job.cmd[i] != wantCmd[i] {
|
||||||
|
t.Fatalf("cmd[%d]=%q want %q", i, job.cmd[i], wantCmd[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if got := joinIndexList(job.gpuIndices); got != "0,2" {
|
||||||
|
t.Fatalf("gpuIndices=%q want 0,2", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBuildNvidiaStressJobUsesNCCLLoader(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
oldExecCommand := satExecCommand
|
||||||
|
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||||
|
if name == "nvidia-smi" {
|
||||||
|
return exec.Command("sh", "-c", "printf '0\n1\n2\n'")
|
||||||
|
}
|
||||||
|
return exec.Command(name, args...)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||||
|
|
||||||
|
job, err := buildNvidiaStressJob(NvidiaStressOptions{
|
||||||
|
DurationSec: 120,
|
||||||
|
Loader: NvidiaStressLoaderNCCL,
|
||||||
|
GPUIndices: []int{2, 0},
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("buildNvidiaStressJob error: %v", err)
|
||||||
|
}
|
||||||
|
wantCmd := []string{"bee-nccl-gpu-stress", "--seconds", "120", "--devices", "0,2"}
|
||||||
|
if len(job.cmd) != len(wantCmd) {
|
||||||
|
t.Fatalf("cmd len=%d want %d (%v)", len(job.cmd), len(wantCmd), job.cmd)
|
||||||
|
}
|
||||||
|
for i := range wantCmd {
|
||||||
|
if job.cmd[i] != wantCmd[i] {
|
||||||
|
t.Fatalf("cmd[%d]=%q want %q", i, job.cmd[i], wantCmd[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if got := joinIndexList(job.gpuIndices); got != "0,2" {
|
||||||
|
t.Fatalf("gpuIndices=%q want 0,2", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
loader string
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{loader: NvidiaStressLoaderBuiltin, want: "gpu-nvidia-burn"},
|
||||||
|
{loader: NvidiaStressLoaderJohn, want: "gpu-nvidia-john"},
|
||||||
|
{loader: NvidiaStressLoaderNCCL, want: "gpu-nvidia-nccl"},
|
||||||
|
{loader: "", want: "gpu-nvidia-burn"},
|
||||||
|
}
|
||||||
|
for _, tt := range tests {
|
||||||
|
if got := nvidiaStressArchivePrefix(tt.loader); got != tt.want {
|
||||||
|
t.Fatalf("loader=%q prefix=%q want %q", tt.loader, got, tt.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestEnvIntFallback(t *testing.T) {
|
func TestEnvIntFallback(t *testing.T) {
|
||||||
os.Unsetenv("BEE_MEMTESTER_SIZE_MB")
|
os.Unsetenv("BEE_MEMTESTER_SIZE_MB")
|
||||||
if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 {
|
if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 {
|
||||||
@@ -80,8 +206,8 @@ func TestClassifySATResult(t *testing.T) {
|
|||||||
}{
|
}{
|
||||||
{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
|
{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
|
||||||
{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||||
{name: "failed", job: "bee-gpu-stress", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
{name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
||||||
{name: "cuda not ready", job: "bee-gpu-stress", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
{name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tt := range tests {
|
for _, tt := range tests {
|
||||||
@@ -130,6 +256,44 @@ func TestResolveROCmSMICommandFromPATH(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestResolveSATCommandUsesLookPathForGenericTools(t *testing.T) {
|
||||||
|
oldLookPath := satLookPath
|
||||||
|
satLookPath = func(file string) (string, error) {
|
||||||
|
if file == "stress-ng" {
|
||||||
|
return "/usr/bin/stress-ng", nil
|
||||||
|
}
|
||||||
|
return "", exec.ErrNotFound
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { satLookPath = oldLookPath })
|
||||||
|
|
||||||
|
cmd, err := resolveSATCommand([]string{"stress-ng", "--cpu", "0"})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("resolveSATCommand error: %v", err)
|
||||||
|
}
|
||||||
|
if len(cmd) != 3 {
|
||||||
|
t.Fatalf("cmd len=%d want 3 (%v)", len(cmd), cmd)
|
||||||
|
}
|
||||||
|
if cmd[0] != "/usr/bin/stress-ng" {
|
||||||
|
t.Fatalf("cmd[0]=%q want /usr/bin/stress-ng", cmd[0])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResolveSATCommandFailsForMissingGenericTool(t *testing.T) {
|
||||||
|
oldLookPath := satLookPath
|
||||||
|
satLookPath = func(file string) (string, error) {
|
||||||
|
return "", exec.ErrNotFound
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { satLookPath = oldLookPath })
|
||||||
|
|
||||||
|
_, err := resolveSATCommand([]string{"stress-ng", "--cpu", "0"})
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected error")
|
||||||
|
}
|
||||||
|
if !strings.Contains(err.Error(), "stress-ng not found in PATH") {
|
||||||
|
t.Fatalf("error=%q", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestResolveROCmSMICommandFallsBackToROCmTree(t *testing.T) {
|
func TestResolveROCmSMICommandFallsBackToROCmTree(t *testing.T) {
|
||||||
tmp := t.TempDir()
|
tmp := t.TempDir()
|
||||||
execPath := filepath.Join(tmp, "opt", "rocm", "bin", "rocm-smi")
|
execPath := filepath.Join(tmp, "opt", "rocm", "bin", "rocm-smi")
|
||||||
|
|||||||
@@ -17,6 +17,10 @@ func (s *System) ListBeeServices() ([]string, error) {
|
|||||||
}
|
}
|
||||||
for _, match := range matches {
|
for _, match := range matches {
|
||||||
name := strings.TrimSuffix(filepath.Base(match), ".service")
|
name := strings.TrimSuffix(filepath.Base(match), ".service")
|
||||||
|
// Skip template units (e.g. bee-journal-mirror@) — they have no instances to query.
|
||||||
|
if strings.HasSuffix(name, "@") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
if !seen[name] {
|
if !seen[name] {
|
||||||
seen[name] = true
|
seen[name] = true
|
||||||
out = append(out, name)
|
out = append(out, name)
|
||||||
|
|||||||
@@ -8,6 +8,18 @@ type InterfaceInfo struct {
|
|||||||
IPv4 []string
|
IPv4 []string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NetworkInterfaceSnapshot records the state of one network interface.
// NOTE(review): field meanings below are inferred from the names — confirm
// against the code that populates this type.
type NetworkInterfaceSnapshot struct {
	Name string   // interface name
	Up   bool     // presumably whether the link was up when captured
	IPv4 []string // presumably the IPv4 addresses assigned to the interface
}
|
||||||
|
|
||||||
|
// NetworkSnapshot groups per-interface snapshots with routing and resolver data.
// NOTE(review): field meanings below are inferred from the names — confirm
// against the code that populates this type.
type NetworkSnapshot struct {
	Interfaces    []NetworkInterfaceSnapshot // one entry per captured interface
	DefaultRoutes []string                   // presumably the default route entries
	ResolvConf    string                     // presumably the resolv.conf contents
}
|
||||||
|
|
||||||
type ServiceAction string
|
type ServiceAction string
|
||||||
|
|
||||||
const (
|
const (
|
||||||
@@ -39,6 +51,20 @@ type ToolStatus struct {
|
|||||||
OK bool
|
OK bool
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Loader names accepted in NvidiaStressOptions.Loader.
const (
	NvidiaStressLoaderBuiltin = "builtin"
	NvidiaStressLoaderJohn    = "john"
	NvidiaStressLoaderNCCL    = "nccl"
)
|
||||||
|
|
||||||
|
// NvidiaStressOptions configures an NVIDIA GPU stress job.
type NvidiaStressOptions struct {
	DurationSec       int    // run time in seconds
	SizeMB            int    // NOTE(review): presumably a per-GPU buffer size in MiB — confirm
	Loader            string // one of the NvidiaStressLoader* constants
	GPUIndices        []int  // GPU indices to include
	ExcludeGPUIndices []int  // GPU indices to leave out of the run
}
|
||||||
|
|
||||||
func New() *System {
|
func New() *System {
|
||||||
return &System{}
|
return &System{}
|
||||||
}
|
}
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
129
audit/internal/webui/api_test.go
Normal file
129
audit/internal/webui/api_test.go
Normal file
@@ -0,0 +1,129 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"net/http/httptest"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestXrandrCommandAddsDefaultX11Env(t *testing.T) {
|
||||||
|
t.Setenv("DISPLAY", "")
|
||||||
|
t.Setenv("XAUTHORITY", "")
|
||||||
|
|
||||||
|
cmd := xrandrCommand("--query")
|
||||||
|
|
||||||
|
var hasDisplay bool
|
||||||
|
var hasXAuthority bool
|
||||||
|
for _, kv := range cmd.Env {
|
||||||
|
if kv == "DISPLAY=:0" {
|
||||||
|
hasDisplay = true
|
||||||
|
}
|
||||||
|
if kv == "XAUTHORITY=/home/bee/.Xauthority" {
|
||||||
|
hasXAuthority = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !hasDisplay {
|
||||||
|
t.Fatalf("DISPLAY not injected: %v", cmd.Env)
|
||||||
|
}
|
||||||
|
if !hasXAuthority {
|
||||||
|
t.Fatalf("XAUTHORITY not injected: %v", cmd.Env)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
originalTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = nil
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = originalTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
|
||||||
|
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||||
|
req := httptest.NewRequest("POST", "/api/sat/cpu/run", strings.NewReader(`{"profile":"smoke"}`))
|
||||||
|
req.ContentLength = -1
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
|
h.handleAPISATRun("cpu").ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
if len(globalQueue.tasks) != 1 {
|
||||||
|
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[0].params.BurnProfile; got != "smoke" {
|
||||||
|
t.Fatalf("burn profile=%q want smoke", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleAPIExportBundleQueuesTask(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
originalTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = nil
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = originalTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
|
||||||
|
h := &handler{opts: HandlerOptions{ExportDir: t.TempDir()}}
|
||||||
|
req := httptest.NewRequest("POST", "/api/export/bundle", nil)
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
|
h.handleAPIExportBundle(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
var body map[string]string
|
||||||
|
if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil {
|
||||||
|
t.Fatalf("decode response: %v", err)
|
||||||
|
}
|
||||||
|
if body["task_id"] == "" {
|
||||||
|
t.Fatalf("missing task_id in response: %v", body)
|
||||||
|
}
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
if len(globalQueue.tasks) != 1 {
|
||||||
|
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[0].Target; got != "support-bundle" {
|
||||||
|
t.Fatalf("target=%q want support-bundle", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
||||||
|
h := &handler{}
|
||||||
|
h.pushFanRings([]platform.FanReading{
|
||||||
|
{Name: "FAN_A", RPM: 4200},
|
||||||
|
{Name: "FAN_B", RPM: 5100},
|
||||||
|
})
|
||||||
|
h.pushFanRings([]platform.FanReading{
|
||||||
|
{Name: "FAN_B", RPM: 5200},
|
||||||
|
})
|
||||||
|
|
||||||
|
if len(h.fanNames) != 2 || h.fanNames[0] != "FAN_A" || h.fanNames[1] != "FAN_B" {
|
||||||
|
t.Fatalf("fanNames=%v", h.fanNames)
|
||||||
|
}
|
||||||
|
aVals, _ := h.ringFans[0].snapshot()
|
||||||
|
bVals, _ := h.ringFans[1].snapshot()
|
||||||
|
if len(aVals) != 2 || len(bVals) != 2 {
|
||||||
|
t.Fatalf("fan ring lengths: A=%d B=%d", len(aVals), len(bVals))
|
||||||
|
}
|
||||||
|
if aVals[1] != 4200 {
|
||||||
|
t.Fatalf("FAN_A should carry forward last value, got %v", aVals)
|
||||||
|
}
|
||||||
|
if bVals[1] != 5200 {
|
||||||
|
t.Fatalf("FAN_B should use latest sampled value, got %v", bVals)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,18 +1,21 @@
|
|||||||
package webui
|
package webui
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
// jobState holds the output lines and completion status of an async job.
|
// jobState holds the output lines and completion status of an async job.
|
||||||
type jobState struct {
|
type jobState struct {
|
||||||
lines []string
|
lines []string
|
||||||
done bool
|
done bool
|
||||||
err string
|
err string
|
||||||
mu sync.Mutex
|
mu sync.Mutex
|
||||||
subs []chan string
|
subs []chan string
|
||||||
cancel func() // optional cancel function; nil if job is not cancellable
|
cancel func() // optional cancel function; nil if job is not cancellable
|
||||||
|
logPath string
|
||||||
}
|
}
|
||||||
|
|
||||||
// abort cancels the job if it has a cancel function and is not yet done.
|
// abort cancels the job if it has a cancel function and is not yet done.
|
||||||
@@ -30,6 +33,9 @@ func (j *jobState) append(line string) {
|
|||||||
j.mu.Lock()
|
j.mu.Lock()
|
||||||
defer j.mu.Unlock()
|
defer j.mu.Unlock()
|
||||||
j.lines = append(j.lines, line)
|
j.lines = append(j.lines, line)
|
||||||
|
if j.logPath != "" {
|
||||||
|
appendJobLog(j.logPath, line)
|
||||||
|
}
|
||||||
for _, ch := range j.subs {
|
for _, ch := range j.subs {
|
||||||
select {
|
select {
|
||||||
case ch <- line:
|
case ch <- line:
|
||||||
@@ -100,3 +106,32 @@ func (m *jobManager) get(id string) (*jobState, bool) {
|
|||||||
j, ok := m.jobs[id]
|
j, ok := m.jobs[id]
|
||||||
return j, ok
|
return j, ok
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func newTaskJobState(logPath string) *jobState {
|
||||||
|
j := &jobState{logPath: logPath}
|
||||||
|
if logPath == "" {
|
||||||
|
return j
|
||||||
|
}
|
||||||
|
data, err := os.ReadFile(logPath)
|
||||||
|
if err != nil || len(data) == 0 {
|
||||||
|
return j
|
||||||
|
}
|
||||||
|
lines := strings.Split(strings.ReplaceAll(string(data), "\r\n", "\n"), "\n")
|
||||||
|
if len(lines) > 0 && lines[len(lines)-1] == "" {
|
||||||
|
lines = lines[:len(lines)-1]
|
||||||
|
}
|
||||||
|
j.lines = append(j.lines, lines...)
|
||||||
|
return j
|
||||||
|
}
|
||||||
|
|
||||||
|
// appendJobLog appends line plus a trailing newline to the file at path,
// creating the file (mode 0644) if needed. Logging is best-effort: an empty
// path is a no-op and open/write failures are silently ignored.
func appendJobLog(path, line string) {
	if len(path) == 0 {
		return
	}
	const openFlags = os.O_CREATE | os.O_APPEND | os.O_WRONLY
	logFile, openErr := os.OpenFile(path, openFlags, 0644)
	if openErr != nil {
		return
	}
	defer logFile.Close()
	entry := line + "\n"
	_, _ = logFile.WriteString(entry)
}
|
||||||
|
|||||||
326
audit/internal/webui/metricsdb.go
Normal file
326
audit/internal/webui/metricsdb.go
Normal file
@@ -0,0 +1,326 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"database/sql"
|
||||||
|
"encoding/csv"
|
||||||
|
"io"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strconv"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
_ "modernc.org/sqlite"
|
||||||
|
)
|
||||||
|
|
||||||
|
// metricsDBPath is the filesystem path of the SQLite metrics database.
// NOTE(review): presumably passed to openMetricsDB by the caller — confirm.
const metricsDBPath = "/appdata/bee/metrics.db"
|
||||||
|
|
||||||
|
// MetricsDB persists live metric samples to SQLite.
// Create one with openMetricsDB; the zero value has a nil handle.
type MetricsDB struct {
	db *sql.DB // handle opened and schema-initialised by openMetricsDB
}
|
||||||
|
|
||||||
|
// openMetricsDB opens (or creates) the metrics database at the given path.
|
||||||
|
func openMetricsDB(path string) (*MetricsDB, error) {
|
||||||
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
db, err := sql.Open("sqlite", path+"?_journal=WAL&_busy_timeout=5000")
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
db.SetMaxOpenConns(1)
|
||||||
|
if err := initMetricsSchema(db); err != nil {
|
||||||
|
_ = db.Close()
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return &MetricsDB{db: db}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// initMetricsSchema creates the four metric tables if they do not exist yet:
// sys_metrics (keyed by ts), gpu_metrics (ts, gpu_index), fan_metrics
// (ts, name) and temp_metrics (ts, name). It returns the error from the
// single Exec call that runs all four statements.
func initMetricsSchema(db *sql.DB) error {
	_, err := db.Exec(`
CREATE TABLE IF NOT EXISTS sys_metrics (
ts INTEGER NOT NULL,
cpu_load_pct REAL,
mem_load_pct REAL,
power_w REAL,
PRIMARY KEY (ts)
);
CREATE TABLE IF NOT EXISTS gpu_metrics (
ts INTEGER NOT NULL,
gpu_index INTEGER NOT NULL,
temp_c REAL,
usage_pct REAL,
mem_usage_pct REAL,
power_w REAL,
PRIMARY KEY (ts, gpu_index)
);
CREATE TABLE IF NOT EXISTS fan_metrics (
ts INTEGER NOT NULL,
name TEXT NOT NULL,
rpm REAL,
PRIMARY KEY (ts, name)
);
CREATE TABLE IF NOT EXISTS temp_metrics (
ts INTEGER NOT NULL,
name TEXT NOT NULL,
grp TEXT NOT NULL,
celsius REAL,
PRIMARY KEY (ts, name)
);
`)
	return err
}
|
||||||
|
|
||||||
|
// Write inserts one sample into all relevant tables.
|
||||||
|
func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
||||||
|
ts := s.Timestamp.Unix()
|
||||||
|
tx, err := m.db.Begin()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer func() { _ = tx.Rollback() }()
|
||||||
|
|
||||||
|
_, err = tx.Exec(
|
||||||
|
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w) VALUES(?,?,?,?)`,
|
||||||
|
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for _, g := range s.GPUs {
|
||||||
|
_, err = tx.Exec(
|
||||||
|
`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w) VALUES(?,?,?,?,?,?)`,
|
||||||
|
ts, g.GPUIndex, g.TempC, g.UsagePct, g.MemUsagePct, g.PowerW,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, f := range s.Fans {
|
||||||
|
_, err = tx.Exec(
|
||||||
|
`INSERT OR REPLACE INTO fan_metrics(ts,name,rpm) VALUES(?,?,?)`,
|
||||||
|
ts, f.Name, f.RPM,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, t := range s.Temps {
|
||||||
|
_, err = tx.Exec(
|
||||||
|
`INSERT OR REPLACE INTO temp_metrics(ts,name,grp,celsius) VALUES(?,?,?,?)`,
|
||||||
|
ts, t.Name, t.Group, t.Celsius,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return tx.Commit()
|
||||||
|
}
|
||||||
|
|
||||||
|
// LoadRecent returns up to n samples in chronological order (oldest first).
|
||||||
|
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||||
|
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
||||||
|
}
|
||||||
|
|
||||||
|
// LoadAll returns all persisted samples in chronological order (oldest first).
|
||||||
|
func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
|
||||||
|
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
|
||||||
|
}
|
||||||
|
|
||||||
|
// loadSamples reconstructs LiveMetricSample rows from the normalized tables.
|
||||||
|
func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetricSample, error) {
|
||||||
|
rows, err := m.db.Query(query, args...)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
|
type sysRow struct {
|
||||||
|
ts int64
|
||||||
|
cpu, mem, pwr float64
|
||||||
|
}
|
||||||
|
var sysRows []sysRow
|
||||||
|
for rows.Next() {
|
||||||
|
var r sysRow
|
||||||
|
if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
sysRows = append(sysRows, r)
|
||||||
|
}
|
||||||
|
if len(sysRows) == 0 {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
// Collect min/max ts for range query
|
||||||
|
minTS := sysRows[0].ts
|
||||||
|
maxTS := sysRows[len(sysRows)-1].ts
|
||||||
|
|
||||||
|
// Load GPU rows in range
|
||||||
|
type gpuKey struct {
|
||||||
|
ts int64
|
||||||
|
idx int
|
||||||
|
}
|
||||||
|
gpuData := map[gpuKey]platform.GPUMetricRow{}
|
||||||
|
gRows, err := m.db.Query(
|
||||||
|
`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
|
||||||
|
minTS, maxTS,
|
||||||
|
)
|
||||||
|
if err == nil {
|
||||||
|
defer gRows.Close()
|
||||||
|
for gRows.Next() {
|
||||||
|
var ts int64
|
||||||
|
var g platform.GPUMetricRow
|
||||||
|
if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW); err == nil {
|
||||||
|
gpuData[gpuKey{ts, g.GPUIndex}] = g
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Load fan rows in range
|
||||||
|
type fanKey struct {
|
||||||
|
ts int64
|
||||||
|
name string
|
||||||
|
}
|
||||||
|
fanData := map[fanKey]float64{}
|
||||||
|
fRows, err := m.db.Query(
|
||||||
|
`SELECT ts,name,rpm FROM fan_metrics WHERE ts>=? AND ts<=?`, minTS, maxTS,
|
||||||
|
)
|
||||||
|
if err == nil {
|
||||||
|
defer fRows.Close()
|
||||||
|
for fRows.Next() {
|
||||||
|
var ts int64
|
||||||
|
var name string
|
||||||
|
var rpm float64
|
||||||
|
if err := fRows.Scan(&ts, &name, &rpm); err == nil {
|
||||||
|
fanData[fanKey{ts, name}] = rpm
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Load temp rows in range
|
||||||
|
type tempKey struct {
|
||||||
|
ts int64
|
||||||
|
name string
|
||||||
|
}
|
||||||
|
tempData := map[tempKey]platform.TempReading{}
|
||||||
|
tRows, err := m.db.Query(
|
||||||
|
`SELECT ts,name,grp,celsius FROM temp_metrics WHERE ts>=? AND ts<=?`, minTS, maxTS,
|
||||||
|
)
|
||||||
|
if err == nil {
|
||||||
|
defer tRows.Close()
|
||||||
|
for tRows.Next() {
|
||||||
|
var ts int64
|
||||||
|
var t platform.TempReading
|
||||||
|
if err := tRows.Scan(&ts, &t.Name, &t.Group, &t.Celsius); err == nil {
|
||||||
|
tempData[tempKey{ts, t.Name}] = t
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collect unique GPU indices and fan names from loaded data (preserve order)
|
||||||
|
seenGPU := map[int]bool{}
|
||||||
|
var gpuIndices []int
|
||||||
|
for k := range gpuData {
|
||||||
|
if !seenGPU[k.idx] {
|
||||||
|
seenGPU[k.idx] = true
|
||||||
|
gpuIndices = append(gpuIndices, k.idx)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
seenFan := map[string]bool{}
|
||||||
|
var fanNames []string
|
||||||
|
for k := range fanData {
|
||||||
|
if !seenFan[k.name] {
|
||||||
|
seenFan[k.name] = true
|
||||||
|
fanNames = append(fanNames, k.name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
seenTemp := map[string]bool{}
|
||||||
|
var tempNames []string
|
||||||
|
for k := range tempData {
|
||||||
|
if !seenTemp[k.name] {
|
||||||
|
seenTemp[k.name] = true
|
||||||
|
tempNames = append(tempNames, k.name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
samples := make([]platform.LiveMetricSample, len(sysRows))
|
||||||
|
for i, r := range sysRows {
|
||||||
|
s := platform.LiveMetricSample{
|
||||||
|
Timestamp: time.Unix(r.ts, 0).UTC(),
|
||||||
|
CPULoadPct: r.cpu,
|
||||||
|
MemLoadPct: r.mem,
|
||||||
|
PowerW: r.pwr,
|
||||||
|
}
|
||||||
|
for _, idx := range gpuIndices {
|
||||||
|
if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {
|
||||||
|
s.GPUs = append(s.GPUs, g)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, name := range fanNames {
|
||||||
|
if rpm, ok := fanData[fanKey{r.ts, name}]; ok {
|
||||||
|
s.Fans = append(s.Fans, platform.FanReading{Name: name, RPM: rpm})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, name := range tempNames {
|
||||||
|
if t, ok := tempData[tempKey{r.ts, name}]; ok {
|
||||||
|
s.Temps = append(s.Temps, t)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
samples[i] = s
|
||||||
|
}
|
||||||
|
return samples, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ExportCSV writes all sys+gpu data as CSV to w.
|
||||||
|
func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
||||||
|
rows, err := m.db.Query(`
|
||||||
|
SELECT s.ts, s.cpu_load_pct, s.mem_load_pct, s.power_w,
|
||||||
|
g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w
|
||||||
|
FROM sys_metrics s
|
||||||
|
LEFT JOIN gpu_metrics g ON g.ts = s.ts
|
||||||
|
ORDER BY s.ts, g.gpu_index
|
||||||
|
`)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
|
cw := csv.NewWriter(w)
|
||||||
|
_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w"})
|
||||||
|
for rows.Next() {
|
||||||
|
var ts int64
|
||||||
|
var cpu, mem, pwr float64
|
||||||
|
var gpuIdx sql.NullInt64
|
||||||
|
var gpuTemp, gpuUse, gpuMem, gpuPow sql.NullFloat64
|
||||||
|
if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
row := []string{
|
||||||
|
strconv.FormatInt(ts, 10),
|
||||||
|
strconv.FormatFloat(cpu, 'f', 2, 64),
|
||||||
|
strconv.FormatFloat(mem, 'f', 2, 64),
|
||||||
|
strconv.FormatFloat(pwr, 'f', 1, 64),
|
||||||
|
}
|
||||||
|
if gpuIdx.Valid {
|
||||||
|
row = append(row,
|
||||||
|
strconv.FormatInt(gpuIdx.Int64, 10),
|
||||||
|
strconv.FormatFloat(gpuTemp.Float64, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(gpuUse.Float64, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(gpuMem.Float64, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(gpuPow.Float64, 'f', 1, 64),
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
row = append(row, "", "", "", "", "")
|
||||||
|
}
|
||||||
|
_ = cw.Write(row)
|
||||||
|
}
|
||||||
|
cw.Flush()
|
||||||
|
return cw.Error()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Close closes the database.
|
||||||
|
func (m *MetricsDB) Close() { _ = m.db.Close() }
|
||||||
|
|
||||||
|
// nullFloat wraps v in a sql.NullFloat64 that is marked as valid, i.e. the
// value is treated as present rather than SQL NULL.
func nullFloat(v float64) sql.NullFloat64 {
	var wrapped sql.NullFloat64
	wrapped.Float64 = v
	wrapped.Valid = true
	return wrapped
}
|
||||||
69
audit/internal/webui/metricsdb_test.go
Normal file
69
audit/internal/webui/metricsdb_test.go
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
|
||||||
|
db, err := openMetricsDB(filepath.Join(t.TempDir(), "metrics.db"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("openMetricsDB: %v", err)
|
||||||
|
}
|
||||||
|
defer db.Close()
|
||||||
|
|
||||||
|
base := time.Unix(1_700_000_000, 0).UTC()
|
||||||
|
for i := 0; i < 3; i++ {
|
||||||
|
err := db.Write(platform.LiveMetricSample{
|
||||||
|
Timestamp: base.Add(time.Duration(i) * time.Second),
|
||||||
|
CPULoadPct: float64(10 + i),
|
||||||
|
MemLoadPct: float64(20 + i),
|
||||||
|
PowerW: float64(300 + i),
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, PowerW: float64(100 + i)},
|
||||||
|
{GPUIndex: 2, PowerW: float64(200 + i)},
|
||||||
|
},
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Write(%d): %v", i, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
all, err := db.LoadAll()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("LoadAll: %v", err)
|
||||||
|
}
|
||||||
|
if len(all) != 3 {
|
||||||
|
t.Fatalf("LoadAll len=%d want 3", len(all))
|
||||||
|
}
|
||||||
|
for i, sample := range all {
|
||||||
|
if len(sample.GPUs) != 2 {
|
||||||
|
t.Fatalf("LoadAll sample %d GPUs=%v want 2 rows", i, sample.GPUs)
|
||||||
|
}
|
||||||
|
if sample.GPUs[0].GPUIndex != 0 || sample.GPUs[0].PowerW != float64(100+i) {
|
||||||
|
t.Fatalf("LoadAll sample %d GPU0=%+v", i, sample.GPUs[0])
|
||||||
|
}
|
||||||
|
if sample.GPUs[1].GPUIndex != 2 || sample.GPUs[1].PowerW != float64(200+i) {
|
||||||
|
t.Fatalf("LoadAll sample %d GPU1=%+v", i, sample.GPUs[1])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
recent, err := db.LoadRecent(2)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("LoadRecent: %v", err)
|
||||||
|
}
|
||||||
|
if len(recent) != 2 {
|
||||||
|
t.Fatalf("LoadRecent len=%d want 2", len(recent))
|
||||||
|
}
|
||||||
|
if !recent[0].Timestamp.Before(recent[1].Timestamp) {
|
||||||
|
t.Fatalf("LoadRecent timestamps not ascending: %v >= %v", recent[0].Timestamp, recent[1].Timestamp)
|
||||||
|
}
|
||||||
|
for i, sample := range recent {
|
||||||
|
if len(sample.GPUs) != 2 {
|
||||||
|
t.Fatalf("LoadRecent sample %d GPUs=%v want 2 rows", i, sample.GPUs)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -7,9 +7,260 @@ import (
|
|||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestRootRendersShellWithIframe(t *testing.T) {
|
func TestChartLegendNumber(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
in float64
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{in: 0.4, want: "0"},
|
||||||
|
{in: 61.5, want: "62"},
|
||||||
|
{in: 999.4, want: "999"},
|
||||||
|
{in: 1200, want: "1,2k"},
|
||||||
|
{in: 1250, want: "1,25k"},
|
||||||
|
{in: 1310, want: "1,31k"},
|
||||||
|
{in: 1500, want: "1,5k"},
|
||||||
|
{in: 2600, want: "2,6k"},
|
||||||
|
{in: 10200, want: "10k"},
|
||||||
|
}
|
||||||
|
for _, tc := range tests {
|
||||||
|
if got := chartLegendNumber(tc.in); got != tc.want {
|
||||||
|
t.Fatalf("chartLegendNumber(%v)=%q want %q", tc.in, got, tc.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
||||||
|
samples := []platform.LiveMetricSample{
|
||||||
|
{
|
||||||
|
Timestamp: time.Now().Add(-3 * time.Minute),
|
||||||
|
CPULoadPct: 10,
|
||||||
|
MemLoadPct: 20,
|
||||||
|
PowerW: 300,
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, UsagePct: 90, MemUsagePct: 5, PowerW: 120, TempC: 50},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Timestamp: time.Now().Add(-2 * time.Minute),
|
||||||
|
CPULoadPct: 30,
|
||||||
|
MemLoadPct: 40,
|
||||||
|
PowerW: 320,
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, UsagePct: 95, MemUsagePct: 7, PowerW: 125, TempC: 51},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Timestamp: time.Now().Add(-1 * time.Minute),
|
||||||
|
CPULoadPct: 50,
|
||||||
|
MemLoadPct: 60,
|
||||||
|
PowerW: 340,
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, UsagePct: 97, MemUsagePct: 9, PowerW: 130, TempC: 52},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
datasets, names, labels, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("chartDataFromSamples returned ok=false")
|
||||||
|
}
|
||||||
|
if title != "GPU Power" {
|
||||||
|
t.Fatalf("title=%q", title)
|
||||||
|
}
|
||||||
|
if len(names) != 1 || names[0] != "GPU 0" {
|
||||||
|
t.Fatalf("names=%v", names)
|
||||||
|
}
|
||||||
|
if len(labels) != len(samples) {
|
||||||
|
t.Fatalf("labels len=%d want %d", len(labels), len(samples))
|
||||||
|
}
|
||||||
|
if len(datasets) != 1 || len(datasets[0]) != len(samples) {
|
||||||
|
t.Fatalf("datasets shape=%v", datasets)
|
||||||
|
}
|
||||||
|
if got := datasets[0][0]; got != 120 {
|
||||||
|
t.Fatalf("datasets[0][0]=%v want 120", got)
|
||||||
|
}
|
||||||
|
if got := datasets[0][2]; got != 130 {
|
||||||
|
t.Fatalf("datasets[0][2]=%v want 130", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
|
||||||
|
samples := []platform.LiveMetricSample{
|
||||||
|
{
|
||||||
|
Timestamp: time.Now().Add(-2 * time.Minute),
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 7, PowerW: 170},
|
||||||
|
{GPUIndex: 2, PowerW: 120},
|
||||||
|
{GPUIndex: 0, PowerW: 100},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Timestamp: time.Now().Add(-1 * time.Minute),
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, PowerW: 101},
|
||||||
|
{GPUIndex: 7, PowerW: 171},
|
||||||
|
{GPUIndex: 2, PowerW: 121},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("chartDataFromSamples returned ok=false")
|
||||||
|
}
|
||||||
|
if title != "GPU Power" {
|
||||||
|
t.Fatalf("title=%q", title)
|
||||||
|
}
|
||||||
|
wantNames := []string{"GPU 0", "GPU 2", "GPU 7"}
|
||||||
|
if len(names) != len(wantNames) {
|
||||||
|
t.Fatalf("names len=%d want %d: %v", len(names), len(wantNames), names)
|
||||||
|
}
|
||||||
|
for i := range wantNames {
|
||||||
|
if names[i] != wantNames[i] {
|
||||||
|
t.Fatalf("names[%d]=%q want %q; full=%v", i, names[i], wantNames[i], names)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if got := datasets[0]; len(got) != 2 || got[0] != 100 || got[1] != 101 {
|
||||||
|
t.Fatalf("GPU 0 dataset=%v want [100 101]", got)
|
||||||
|
}
|
||||||
|
if got := datasets[1]; len(got) != 2 || got[0] != 120 || got[1] != 121 {
|
||||||
|
t.Fatalf("GPU 2 dataset=%v want [120 121]", got)
|
||||||
|
}
|
||||||
|
if got := datasets[2]; len(got) != 2 || got[0] != 170 || got[1] != 171 {
|
||||||
|
t.Fatalf("GPU 7 dataset=%v want [170 171]", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
|
||||||
|
got := normalizePowerSeries([]float64{0, 480, 0, 0, 510, 0})
|
||||||
|
want := []float64{0, 480, 480, 480, 510, 510}
|
||||||
|
if len(got) != len(want) {
|
||||||
|
t.Fatalf("len=%d want %d", len(got), len(want))
|
||||||
|
}
|
||||||
|
for i := range want {
|
||||||
|
if got[i] != want[i] {
|
||||||
|
t.Fatalf("got[%d]=%v want %v", i, got[i], want[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRenderMetricsUsesBufferedChartRefresh(t *testing.T) {
|
||||||
|
body := renderMetrics()
|
||||||
|
if !strings.Contains(body, "const probe = new Image();") {
|
||||||
|
t.Fatalf("metrics page should preload chart images before swap: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, "el.dataset.loading === '1'") {
|
||||||
|
t.Fatalf("metrics page should avoid overlapping chart reloads: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChartLegendVisible(t *testing.T) {
|
||||||
|
if !chartLegendVisible(8) {
|
||||||
|
t.Fatal("legend should stay visible for charts with up to 8 series")
|
||||||
|
}
|
||||||
|
if chartLegendVisible(9) {
|
||||||
|
t.Fatal("legend should be hidden for charts with more than 8 series")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChartYAxisNumber(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
in float64
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{in: 999, want: "999"},
|
||||||
|
{in: 1000, want: "1к"},
|
||||||
|
{in: 1370, want: "1к"},
|
||||||
|
{in: 1500, want: "2к"},
|
||||||
|
{in: 10200, want: "10к"},
|
||||||
|
{in: -1499, want: "-1к"},
|
||||||
|
}
|
||||||
|
for _, tc := range tests {
|
||||||
|
if got := chartYAxisNumber(tc.in); got != tc.want {
|
||||||
|
t.Fatalf("chartYAxisNumber(%v)=%q want %q", tc.in, got, tc.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChartCanvasHeight(t *testing.T) {
|
||||||
|
if got := chartCanvasHeight(4); got != 360 {
|
||||||
|
t.Fatalf("chartCanvasHeight(4)=%d want 360", got)
|
||||||
|
}
|
||||||
|
if got := chartCanvasHeight(12); got != 288 {
|
||||||
|
t.Fatalf("chartCanvasHeight(12)=%d want 288", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
|
||||||
|
got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
|
||||||
|
want := []float64{4200, 4200, 4200, 4300, 4300}
|
||||||
|
if len(got) != len(want) {
|
||||||
|
t.Fatalf("len=%d want %d", len(got), len(want))
|
||||||
|
}
|
||||||
|
for i := range want {
|
||||||
|
if got[i] != want[i] {
|
||||||
|
t.Fatalf("got[%d]=%v want %v", i, got[i], want[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChartYAxisOption(t *testing.T) {
|
||||||
|
min := floatPtr(0)
|
||||||
|
max := floatPtr(100)
|
||||||
|
opt := chartYAxisOption(min, max)
|
||||||
|
if opt.Min != min || opt.Max != max {
|
||||||
|
t.Fatalf("chartYAxisOption min/max mismatch: %#v", opt)
|
||||||
|
}
|
||||||
|
if opt.LabelCount != 11 {
|
||||||
|
t.Fatalf("chartYAxisOption labelCount=%d want 11", opt.LabelCount)
|
||||||
|
}
|
||||||
|
if got := opt.ValueFormatter(1000); got != "1к" {
|
||||||
|
t.Fatalf("chartYAxisOption formatter(1000)=%q want 1к", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSnapshotFanRingsUsesTimelineLabels(t *testing.T) {
|
||||||
|
r1 := newMetricsRing(4)
|
||||||
|
r2 := newMetricsRing(4)
|
||||||
|
r1.push(1000)
|
||||||
|
r1.push(1100)
|
||||||
|
r2.push(1200)
|
||||||
|
r2.push(1300)
|
||||||
|
|
||||||
|
datasets, names, labels := snapshotFanRings([]*metricsRing{r1, r2}, []string{"FAN_A", "FAN_B"})
|
||||||
|
if len(datasets) != 2 {
|
||||||
|
t.Fatalf("datasets=%d want 2", len(datasets))
|
||||||
|
}
|
||||||
|
if len(names) != 2 || names[0] != "FAN_A RPM" || names[1] != "FAN_B RPM" {
|
||||||
|
t.Fatalf("names=%v", names)
|
||||||
|
}
|
||||||
|
if len(labels) != 2 {
|
||||||
|
t.Fatalf("labels=%v want 2 entries", labels)
|
||||||
|
}
|
||||||
|
if labels[0] == "" || labels[1] == "" {
|
||||||
|
t.Fatalf("labels should contain timeline values, got %v", labels)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRenderNetworkInlineSyncsPendingState(t *testing.T) {
|
||||||
|
body := renderNetworkInline()
|
||||||
|
if !strings.Contains(body, "d.pending_change") {
|
||||||
|
t.Fatalf("network UI should read pending network state from API: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, "setInterval(loadNetwork, 5000)") {
|
||||||
|
t.Fatalf("network UI should periodically refresh network state: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, "showNetPending(NET_ROLLBACK_SECS)") {
|
||||||
|
t.Fatalf("network UI should show pending confirmation immediately on apply: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRootRendersDashboard(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
path := filepath.Join(dir, "audit.json")
|
path := filepath.Join(dir, "audit.json")
|
||||||
exportDir := filepath.Join(dir, "export")
|
exportDir := filepath.Join(dir, "export")
|
||||||
@@ -31,11 +282,12 @@ func TestRootRendersShellWithIframe(t *testing.T) {
|
|||||||
if first.Code != http.StatusOK {
|
if first.Code != http.StatusOK {
|
||||||
t.Fatalf("first status=%d", first.Code)
|
t.Fatalf("first status=%d", first.Code)
|
||||||
}
|
}
|
||||||
if !strings.Contains(first.Body.String(), `iframe`) || !strings.Contains(first.Body.String(), `src="/viewer"`) {
|
// Dashboard should contain the audit nav link and hardware summary
|
||||||
t.Fatalf("first body missing iframe viewer: %s", first.Body.String())
|
if !strings.Contains(first.Body.String(), `href="/audit"`) {
|
||||||
|
t.Fatalf("first body missing audit nav link: %s", first.Body.String())
|
||||||
}
|
}
|
||||||
if !strings.Contains(first.Body.String(), "/export/support.tar.gz") {
|
if !strings.Contains(first.Body.String(), `/viewer`) {
|
||||||
t.Fatalf("first body missing support bundle link: %s", first.Body.String())
|
t.Fatalf("first body missing viewer link: %s", first.Body.String())
|
||||||
}
|
}
|
||||||
if got := first.Header().Get("Cache-Control"); got != "no-store" {
|
if got := first.Header().Get("Cache-Control"); got != "no-store" {
|
||||||
t.Fatalf("first cache-control=%q", got)
|
t.Fatalf("first cache-control=%q", got)
|
||||||
@@ -50,8 +302,57 @@ func TestRootRendersShellWithIframe(t *testing.T) {
|
|||||||
if second.Code != http.StatusOK {
|
if second.Code != http.StatusOK {
|
||||||
t.Fatalf("second status=%d", second.Code)
|
t.Fatalf("second status=%d", second.Code)
|
||||||
}
|
}
|
||||||
if !strings.Contains(second.Body.String(), `src="/viewer"`) {
|
if !strings.Contains(second.Body.String(), `Hardware Summary`) {
|
||||||
t.Fatalf("second body missing iframe viewer: %s", second.Body.String())
|
t.Fatalf("second body missing hardware summary: %s", second.Body.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
exportDir := filepath.Join(dir, "export")
|
||||||
|
if err := os.MkdirAll(exportDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{
|
||||||
|
Title: "Bee Hardware Audit",
|
||||||
|
AuditPath: filepath.Join(dir, "missing-audit.json"),
|
||||||
|
ExportDir: exportDir,
|
||||||
|
})
|
||||||
|
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, `Run Audit`) {
|
||||||
|
t.Fatalf("dashboard missing run audit button: %s", body)
|
||||||
|
}
|
||||||
|
if strings.Contains(body, `No audit data`) {
|
||||||
|
t.Fatalf("dashboard still shows empty audit badge: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "audit.json")
|
||||||
|
if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z"}`), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{AuditPath: path})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/audit", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, `iframe class="viewer-frame" src="/viewer"`) {
|
||||||
|
t.Fatalf("audit page missing viewer frame: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `openAuditModal()`) {
|
||||||
|
t.Fatalf("audit page missing action modal trigger: %s", body)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -103,8 +404,8 @@ func TestAuditJSONServesLatestSnapshot(t *testing.T) {
|
|||||||
if rec.Code != http.StatusOK {
|
if rec.Code != http.StatusOK {
|
||||||
t.Fatalf("status=%d", rec.Code)
|
t.Fatalf("status=%d", rec.Code)
|
||||||
}
|
}
|
||||||
if got := strings.TrimSpace(rec.Body.String()); got != body {
|
if !strings.Contains(rec.Body.String(), "SERIAL-API") {
|
||||||
t.Fatalf("body=%q want %q", got, body)
|
t.Fatalf("body missing expected serial: %s", rec.Body.String())
|
||||||
}
|
}
|
||||||
if got := rec.Header().Get("Content-Type"); !strings.Contains(got, "application/json") {
|
if got := rec.Header().Get("Content-Type"); !strings.Contains(got, "application/json") {
|
||||||
t.Fatalf("content-type=%q", got)
|
t.Fatalf("content-type=%q", got)
|
||||||
@@ -129,6 +430,17 @@ func TestSupportBundleEndpointReturnsArchive(t *testing.T) {
|
|||||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.log"), []byte("audit log"), 0644); err != nil {
|
if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.log"), []byte("audit log"), 0644); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
archive, err := os.CreateTemp(os.TempDir(), "bee-support-server-test-*.tar.gz")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { _ = os.Remove(archive.Name()) })
|
||||||
|
if _, err := archive.WriteString("support-bundle"); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := archive.Close(); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
handler := NewHandler(HandlerOptions{ExportDir: exportDir})
|
handler := NewHandler(HandlerOptions{ExportDir: exportDir})
|
||||||
rec := httptest.NewRecorder()
|
rec := httptest.NewRecorder()
|
||||||
|
|||||||
831
audit/internal/webui/tasks.go
Normal file
831
audit/internal/webui/tasks.go
Normal file
@@ -0,0 +1,831 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"net/http"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Task statuses.
//
// These are the string values stored in Task.Status. The queue treats
// TaskPending and TaskRunning as "active" (see hasActiveTarget) and TaskDone,
// TaskFailed and TaskCancelled as completed (see prune).
const (
	TaskPending   = "pending"   // eligible to be picked by nextPending
	TaskRunning   = "running"   // set by the worker when it starts a task
	TaskDone      = "done"      // completed; counted as history by prune
	TaskFailed    = "failed"    // completed; counted as history by prune
	TaskCancelled = "cancelled" // completed; counted as history by prune
)
|
||||||
|
|
||||||
|
// taskNames maps target → human-readable name for validate (SAT) runs.
//
// taskDisplayName uses it as the default lookup; a target without an entry is
// displayed under its raw target string. The "nvidia-stress" entry is always
// overridden by nvidiaStressTaskName in taskDisplayName.
var taskNames = map[string]string{
	"nvidia":          "NVIDIA SAT",
	"nvidia-stress":   "NVIDIA GPU Stress",
	"memory":          "Memory SAT",
	"storage":         "Storage SAT",
	"cpu":             "CPU SAT",
	"amd":             "AMD GPU SAT",
	"amd-mem":         "AMD GPU MEM Integrity",
	"amd-bandwidth":   "AMD GPU MEM Bandwidth",
	"amd-stress":      "AMD GPU Burn-in",
	"memory-stress":   "Memory Burn-in",
	"sat-stress":      "SAT Stress (stressapptest)",
	"platform-stress": "Platform Thermal Cycling",
	"audit":           "Audit",
	"support-bundle":  "Support Bundle",
	"install":         "Install to Disk",
	"install-to-ram":  "Install to RAM",
}
|
||||||
|
|
||||||
|
// burnNames maps target → human-readable name when a burn profile is set.
//
// taskDisplayName consults it only when the profile argument is non-empty;
// targets without an entry keep their taskNames label.
var burnNames = map[string]string{
	"nvidia": "NVIDIA Burn-in",
	"memory": "Memory Burn-in",
	"cpu":    "CPU Burn-in",
	"amd":    "AMD GPU Burn-in",
}
|
||||||
|
|
||||||
|
func nvidiaStressTaskName(loader string) string {
|
||||||
|
switch strings.TrimSpace(strings.ToLower(loader)) {
|
||||||
|
case platform.NvidiaStressLoaderJohn:
|
||||||
|
return "NVIDIA GPU Stress (John/OpenCL)"
|
||||||
|
case platform.NvidiaStressLoaderNCCL:
|
||||||
|
return "NVIDIA GPU Stress (NCCL)"
|
||||||
|
default:
|
||||||
|
return "NVIDIA GPU Stress (bee-gpu-burn)"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskDisplayName(target, profile, loader string) string {
|
||||||
|
name := taskNames[target]
|
||||||
|
if profile != "" {
|
||||||
|
if n, ok := burnNames[target]; ok {
|
||||||
|
name = n
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if target == "nvidia-stress" {
|
||||||
|
name = nvidiaStressTaskName(loader)
|
||||||
|
}
|
||||||
|
if name == "" {
|
||||||
|
name = target
|
||||||
|
}
|
||||||
|
return name
|
||||||
|
}
|
||||||
|
|
||||||
|
// Task represents one unit of work in the queue.
//
// The exported fields are serialised to JSON under the tag names shown.
// Priority and CreatedAt drive scheduling: nextPending prefers the highest
// Priority and, on a tie, the earliest CreatedAt. Status holds one of the
// Task* status constants. ElapsedSec is filled in by snapshot on the copies it
// returns.
type Task struct {
	ID         string     `json:"id"`
	Name       string     `json:"name"`
	Target     string     `json:"target"`
	Priority   int        `json:"priority"`
	Status     string     `json:"status"`
	CreatedAt  time.Time  `json:"created_at"`
	StartedAt  *time.Time `json:"started_at,omitempty"`
	DoneAt     *time.Time `json:"done_at,omitempty"`
	ElapsedSec int        `json:"elapsed_sec,omitempty"`
	ErrMsg     string     `json:"error,omitempty"`
	LogPath    string     `json:"log_path,omitempty"`

	// runtime fields (not serialised)
	job    *jobState  // exposed through taskQueue.findJob; nil when the task has no job
	params taskParams // request parameters the task was created with
}
|
||||||
|
|
||||||
|
// taskParams holds optional parameters parsed from the run request.
//
// Every field is omitted from JSON when it has its zero value.
// NOTE(review): units are not visible here; Duration is presumably in
// seconds, matching the durationSec arguments of the run hooks — confirm.
type taskParams struct {
	Duration           int      `json:"duration,omitempty"`
	DiagLevel          int      `json:"diag_level,omitempty"`
	GPUIndices         []int    `json:"gpu_indices,omitempty"`
	ExcludeGPUIndices  []int    `json:"exclude_gpu_indices,omitempty"`
	Loader             string   `json:"loader,omitempty"`
	BurnProfile        string   `json:"burn_profile,omitempty"`
	DisplayName        string   `json:"display_name,omitempty"`
	Device             string   `json:"device,omitempty"` // for install
	PlatformComponents []string `json:"platform_components,omitempty"`
}
|
||||||
|
|
||||||
|
// persistedTask mirrors the exported fields of Task, minus ElapsedSec, and
// additionally carries the task's parameters as an exported field so they can
// be encoded as JSON.
// NOTE(review): presumably this is the record format of the queue state file
// (taskQueue.statePath) — confirm against persistLocked/loadLocked.
type persistedTask struct {
	ID        string     `json:"id"`
	Name      string     `json:"name"`
	Target    string     `json:"target"`
	Priority  int        `json:"priority"`
	Status    string     `json:"status"`
	CreatedAt time.Time  `json:"created_at"`
	StartedAt *time.Time `json:"started_at,omitempty"`
	DoneAt    *time.Time `json:"done_at,omitempty"`
	ErrMsg    string     `json:"error,omitempty"`
	LogPath   string     `json:"log_path,omitempty"`
	Params    taskParams `json:"params,omitempty"`
}
|
||||||
|
|
||||||
|
// burnPreset bundles the settings selected by a burn profile name; see
// resolveBurnPreset for the concrete values per profile.
type burnPreset struct {
	NvidiaDiag  int // NOTE(review): presumably the NVIDIA diagnostic level — confirm
	DurationSec int // run length in seconds
}
|
||||||
|
|
||||||
|
func resolveBurnPreset(profile string) burnPreset {
|
||||||
|
switch profile {
|
||||||
|
case "overnight":
|
||||||
|
return burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60}
|
||||||
|
case "acceptance":
|
||||||
|
return burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60}
|
||||||
|
default:
|
||||||
|
return burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
|
||||||
|
switch profile {
|
||||||
|
case "overnight":
|
||||||
|
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
|
||||||
|
{LoadSec: 600, IdleSec: 120},
|
||||||
|
{LoadSec: 600, IdleSec: 60},
|
||||||
|
{LoadSec: 600, IdleSec: 30},
|
||||||
|
{LoadSec: 600, IdleSec: 120},
|
||||||
|
{LoadSec: 600, IdleSec: 60},
|
||||||
|
{LoadSec: 600, IdleSec: 30},
|
||||||
|
{LoadSec: 600, IdleSec: 120},
|
||||||
|
{LoadSec: 600, IdleSec: 60},
|
||||||
|
}}
|
||||||
|
case "acceptance":
|
||||||
|
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
|
||||||
|
{LoadSec: 300, IdleSec: 60},
|
||||||
|
{LoadSec: 300, IdleSec: 30},
|
||||||
|
{LoadSec: 300, IdleSec: 60},
|
||||||
|
{LoadSec: 300, IdleSec: 30},
|
||||||
|
}}
|
||||||
|
default: // smoke
|
||||||
|
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
|
||||||
|
{LoadSec: 90, IdleSec: 60},
|
||||||
|
{LoadSec: 90, IdleSec: 30},
|
||||||
|
}}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// taskQueue manages a priority-ordered list of tasks and runs them one at a time.
//
// mu guards the fields below it. trigger is the wake-up channel for the worker
// goroutine: enqueue and startWorker send on it without blocking, and the
// worker receives from it before draining pending tasks.
type taskQueue struct {
	mu        sync.Mutex
	tasks     []*Task
	trigger   chan struct{}
	opts      *HandlerOptions // set by startWorker
	statePath string          // <ExportDir>/tasks-state.json, set by startWorker
	logsDir   string          // <ExportDir>/tasks, created by startWorker
	started   bool            // true once the worker goroutine has been launched
}
|
||||||
|
|
||||||
|
// globalQueue is the package-wide task queue. Its trigger channel has a buffer
// of one, so the non-blocking sends in enqueue and startWorker leave at most
// one wake-up pending and drop the rest.
var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}

// maxTaskHistory is the maximum number of completed (done, failed or
// cancelled) tasks that prune keeps in the queue.
const maxTaskHistory = 50
|
||||||
|
|
||||||
|
// Package-level function variables through which the task runner reaches the
// app layer and external commands. Each run*PackCtx variable forwards to the
// *app.App method of the same name.
// NOTE(review): presumably these are variables rather than direct calls so
// tests can swap in fakes — confirm against the test files.
var (
	runMemoryAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
		return a.RunMemoryAcceptancePackCtx(ctx, baseDir, logFunc)
	}
	runStorageAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
		return a.RunStorageAcceptancePackCtx(ctx, baseDir, logFunc)
	}
	runCPUAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
		return a.RunCPUAcceptancePackCtx(ctx, baseDir, durationSec, logFunc)
	}
	runAMDAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
		return a.RunAMDAcceptancePackCtx(ctx, baseDir, logFunc)
	}
	runAMDMemIntegrityPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
		return a.RunAMDMemIntegrityPackCtx(ctx, baseDir, logFunc)
	}
	runAMDMemBandwidthPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
		return a.RunAMDMemBandwidthPackCtx(ctx, baseDir, logFunc)
	}
	runNvidiaStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
		return a.RunNvidiaStressPackCtx(ctx, baseDir, opts, logFunc)
	}
	runAMDStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
		return a.RunAMDStressPackCtx(ctx, baseDir, durationSec, logFunc)
	}
	runMemoryStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
		return a.RunMemoryStressPackCtx(ctx, baseDir, durationSec, logFunc)
	}
	runSATStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
		return a.RunSATStressPackCtx(ctx, baseDir, durationSec, logFunc)
	}
	// buildSupportBundle is bound directly to the app-layer bundle builder.
	buildSupportBundle = app.BuildSupportBundle
	// installCommand builds the "bee-install <device> <logPath>" command; the
	// command is tied to ctx via exec.CommandContext.
	installCommand = func(ctx context.Context, device string, logPath string) *exec.Cmd {
		return exec.CommandContext(ctx, "bee-install", device, logPath)
	}
)
|
||||||
|
|
||||||
|
// enqueue adds a task to the queue and notifies the worker.
|
||||||
|
func (q *taskQueue) enqueue(t *Task) {
|
||||||
|
q.mu.Lock()
|
||||||
|
q.assignTaskLogPathLocked(t)
|
||||||
|
q.tasks = append(q.tasks, t)
|
||||||
|
q.prune()
|
||||||
|
q.persistLocked()
|
||||||
|
q.mu.Unlock()
|
||||||
|
select {
|
||||||
|
case q.trigger <- struct{}{}:
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// prune removes oldest completed tasks beyond maxTaskHistory.
|
||||||
|
func (q *taskQueue) prune() {
|
||||||
|
var done []*Task
|
||||||
|
var active []*Task
|
||||||
|
for _, t := range q.tasks {
|
||||||
|
switch t.Status {
|
||||||
|
case TaskDone, TaskFailed, TaskCancelled:
|
||||||
|
done = append(done, t)
|
||||||
|
default:
|
||||||
|
active = append(active, t)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(done) > maxTaskHistory {
|
||||||
|
done = done[len(done)-maxTaskHistory:]
|
||||||
|
}
|
||||||
|
q.tasks = append(active, done...)
|
||||||
|
}
|
||||||
|
|
||||||
|
// nextPending returns the highest-priority pending task (nil if none).
|
||||||
|
func (q *taskQueue) nextPending() *Task {
|
||||||
|
var best *Task
|
||||||
|
for _, t := range q.tasks {
|
||||||
|
if t.Status != TaskPending {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if best == nil || t.Priority > best.Priority ||
|
||||||
|
(t.Priority == best.Priority && t.CreatedAt.Before(best.CreatedAt)) {
|
||||||
|
best = t
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return best
|
||||||
|
}
|
||||||
|
|
||||||
|
// findByID looks up a task by ID.
|
||||||
|
func (q *taskQueue) findByID(id string) (*Task, bool) {
|
||||||
|
q.mu.Lock()
|
||||||
|
defer q.mu.Unlock()
|
||||||
|
for _, t := range q.tasks {
|
||||||
|
if t.ID == id {
|
||||||
|
return t, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// findJob returns the jobState for a task ID (for SSE streaming compatibility).
|
||||||
|
func (q *taskQueue) findJob(id string) (*jobState, bool) {
|
||||||
|
t, ok := q.findByID(id)
|
||||||
|
if !ok || t.job == nil {
|
||||||
|
return nil, false
|
||||||
|
}
|
||||||
|
return t.job, true
|
||||||
|
}
|
||||||
|
|
||||||
|
func (q *taskQueue) hasActiveTarget(target string) bool {
|
||||||
|
q.mu.Lock()
|
||||||
|
defer q.mu.Unlock()
|
||||||
|
for _, t := range q.tasks {
|
||||||
|
if t.Target != target {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if t.Status == TaskPending || t.Status == TaskRunning {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// snapshot returns a copy of all tasks sorted for display (running first, then pending by priority, then done by doneAt desc).
|
||||||
|
func (q *taskQueue) snapshot() []Task {
|
||||||
|
q.mu.Lock()
|
||||||
|
defer q.mu.Unlock()
|
||||||
|
out := make([]Task, len(q.tasks))
|
||||||
|
for i, t := range q.tasks {
|
||||||
|
out[i] = *t
|
||||||
|
out[i].ElapsedSec = taskElapsedSec(&out[i], time.Now())
|
||||||
|
}
|
||||||
|
sort.SliceStable(out, func(i, j int) bool {
|
||||||
|
si := statusOrder(out[i].Status)
|
||||||
|
sj := statusOrder(out[j].Status)
|
||||||
|
if si != sj {
|
||||||
|
return si < sj
|
||||||
|
}
|
||||||
|
if out[i].Priority != out[j].Priority {
|
||||||
|
return out[i].Priority > out[j].Priority
|
||||||
|
}
|
||||||
|
return out[i].CreatedAt.Before(out[j].CreatedAt)
|
||||||
|
})
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func statusOrder(s string) int {
|
||||||
|
switch s {
|
||||||
|
case TaskRunning:
|
||||||
|
return 0
|
||||||
|
case TaskPending:
|
||||||
|
return 1
|
||||||
|
default:
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// startWorker launches the queue runner goroutine.
|
||||||
|
func (q *taskQueue) startWorker(opts *HandlerOptions) {
|
||||||
|
q.mu.Lock()
|
||||||
|
q.opts = opts
|
||||||
|
q.statePath = filepath.Join(opts.ExportDir, "tasks-state.json")
|
||||||
|
q.logsDir = filepath.Join(opts.ExportDir, "tasks")
|
||||||
|
_ = os.MkdirAll(q.logsDir, 0755)
|
||||||
|
if !q.started {
|
||||||
|
q.loadLocked()
|
||||||
|
q.started = true
|
||||||
|
go q.worker()
|
||||||
|
}
|
||||||
|
hasPending := q.nextPending() != nil
|
||||||
|
q.mu.Unlock()
|
||||||
|
if hasPending {
|
||||||
|
select {
|
||||||
|
case q.trigger <- struct{}{}:
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (q *taskQueue) worker() {
|
||||||
|
for {
|
||||||
|
<-q.trigger
|
||||||
|
setCPUGovernor("performance")
|
||||||
|
for {
|
||||||
|
q.mu.Lock()
|
||||||
|
t := q.nextPending()
|
||||||
|
if t == nil {
|
||||||
|
q.mu.Unlock()
|
||||||
|
break
|
||||||
|
}
|
||||||
|
now := time.Now()
|
||||||
|
t.Status = TaskRunning
|
||||||
|
t.StartedAt = &now
|
||||||
|
t.DoneAt = nil
|
||||||
|
t.ErrMsg = ""
|
||||||
|
j := newTaskJobState(t.LogPath)
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
j.cancel = cancel
|
||||||
|
t.job = j
|
||||||
|
q.persistLocked()
|
||||||
|
q.mu.Unlock()
|
||||||
|
|
||||||
|
q.runTask(t, j, ctx)
|
||||||
|
|
||||||
|
q.mu.Lock()
|
||||||
|
now2 := time.Now()
|
||||||
|
t.DoneAt = &now2
|
||||||
|
if t.Status == TaskRunning { // not cancelled externally
|
||||||
|
if j.err != "" {
|
||||||
|
t.Status = TaskFailed
|
||||||
|
t.ErrMsg = j.err
|
||||||
|
} else {
|
||||||
|
t.Status = TaskDone
|
||||||
|
}
|
||||||
|
}
|
||||||
|
q.prune()
|
||||||
|
q.persistLocked()
|
||||||
|
q.mu.Unlock()
|
||||||
|
}
|
||||||
|
setCPUGovernor("powersave")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// setCPUGovernor writes governor into every per-CPU scaling_governor sysfs
// file it can find. It is best-effort: a glob failure, an absent cpufreq tree,
// or a failed write are all ignored.
func setCPUGovernor(governor string) {
	const pattern = "/sys/devices/system/cpu/cpu*/cpufreq/scaling_governor"

	files, globErr := filepath.Glob(pattern)
	if globErr != nil {
		return
	}
	payload := []byte(governor)
	for _, f := range files {
		_ = os.WriteFile(f, payload, 0644)
	}
}
|
||||||
|
|
||||||
|
// runTask executes the work for a task, writing output to j.
|
||||||
|
func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||||
|
if q.opts == nil {
|
||||||
|
j.append("ERROR: handler options not configured")
|
||||||
|
j.finish("handler options not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
a := q.opts.App
|
||||||
|
|
||||||
|
j.append(fmt.Sprintf("Starting %s...", t.Name))
|
||||||
|
if len(j.lines) > 0 {
|
||||||
|
j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
|
||||||
|
}
|
||||||
|
|
||||||
|
var (
|
||||||
|
archive string
|
||||||
|
err error
|
||||||
|
)
|
||||||
|
|
||||||
|
switch t.Target {
|
||||||
|
case "nvidia":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
diagLevel := t.params.DiagLevel
|
||||||
|
if t.params.BurnProfile != "" && diagLevel <= 0 {
|
||||||
|
diagLevel = resolveBurnPreset(t.params.BurnProfile).NvidiaDiag
|
||||||
|
}
|
||||||
|
if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
|
||||||
|
result, e := a.RunNvidiaAcceptancePackWithOptions(
|
||||||
|
ctx, "", diagLevel, t.params.GPUIndices, j.append,
|
||||||
|
)
|
||||||
|
if e != nil {
|
||||||
|
err = e
|
||||||
|
} else {
|
||||||
|
archive = result.Body
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
archive, err = a.RunNvidiaAcceptancePack("", j.append)
|
||||||
|
}
|
||||||
|
case "nvidia-stress":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
||||||
|
DurationSec: dur,
|
||||||
|
Loader: t.params.Loader,
|
||||||
|
GPUIndices: t.params.GPUIndices,
|
||||||
|
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||||
|
}, j.append)
|
||||||
|
case "memory":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", j.append)
|
||||||
|
case "storage":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = runStorageAcceptancePackCtx(a, ctx, "", j.append)
|
||||||
|
case "cpu":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
if dur <= 0 {
|
||||||
|
dur = 60
|
||||||
|
}
|
||||||
|
j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
|
||||||
|
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
|
||||||
|
case "amd":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append)
|
||||||
|
case "amd-mem":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = runAMDMemIntegrityPackCtx(a, ctx, "", j.append)
|
||||||
|
case "amd-bandwidth":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = runAMDMemBandwidthPackCtx(a, ctx, "", j.append)
|
||||||
|
case "amd-stress":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
archive, err = runAMDStressPackCtx(a, ctx, "", dur, j.append)
|
||||||
|
case "memory-stress":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
archive, err = runMemoryStressPackCtx(a, ctx, "", dur, j.append)
|
||||||
|
case "sat-stress":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
|
||||||
|
case "platform-stress":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
opts := resolvePlatformStressPreset(t.params.BurnProfile)
|
||||||
|
opts.Components = t.params.PlatformComponents
|
||||||
|
archive, err = a.RunPlatformStress(ctx, "", opts, j.append)
|
||||||
|
case "audit":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
result, e := a.RunAuditNow(q.opts.RuntimeMode)
|
||||||
|
if e != nil {
|
||||||
|
err = e
|
||||||
|
} else {
|
||||||
|
for _, line := range splitLines(result.Body) {
|
||||||
|
j.append(line)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case "support-bundle":
|
||||||
|
j.append("Building support bundle...")
|
||||||
|
archive, err = buildSupportBundle(q.opts.ExportDir)
|
||||||
|
case "install":
|
||||||
|
if strings.TrimSpace(t.params.Device) == "" {
|
||||||
|
err = fmt.Errorf("device is required")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
installLogPath := platform.InstallLogPath(t.params.Device)
|
||||||
|
j.append("Install log: " + installLogPath)
|
||||||
|
err = streamCmdJob(j, installCommand(ctx, t.params.Device, installLogPath))
|
||||||
|
case "install-to-ram":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
err = a.RunInstallToRAM(ctx, j.append)
|
||||||
|
default:
|
||||||
|
j.append("ERROR: unknown target: " + t.Target)
|
||||||
|
j.finish("unknown target")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
j.append("Aborted.")
|
||||||
|
j.finish("aborted")
|
||||||
|
} else {
|
||||||
|
j.append("ERROR: " + err.Error())
|
||||||
|
j.finish(err.Error())
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if archive != "" {
|
||||||
|
j.append("Archive: " + archive)
|
||||||
|
}
|
||||||
|
j.finish("")
|
||||||
|
}
|
||||||
|
|
||||||
|
// splitLines breaks s on '\n' and returns only the non-empty pieces, in
// order. The result is nil when s contains no non-empty line.
func splitLines(s string) []string {
	var kept []string
	for _, piece := range strings.Split(s, "\n") {
		if piece == "" {
			continue
		}
		kept = append(kept, piece)
	}
	return kept
}
|
||||||
|
|
||||||
|
// splitNL splits s at every '\n' and returns all pieces, including empty
// ones. The result always has at least one element: an empty input yields a
// single empty string, and a trailing newline yields a trailing empty string.
func splitNL(s string) []string {
	// strings.Split has exactly these semantics for a non-empty separator.
	return strings.Split(s, "\n")
}
|
||||||
|
|
||||||
|
// ── HTTP handlers ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func (h *handler) handleAPITasksList(w http.ResponseWriter, _ *http.Request) {
|
||||||
|
tasks := globalQueue.snapshot()
|
||||||
|
writeJSON(w, tasks)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPITasksCancel(w http.ResponseWriter, r *http.Request) {
|
||||||
|
id := r.PathValue("id")
|
||||||
|
t, ok := globalQueue.findByID(id)
|
||||||
|
if !ok {
|
||||||
|
writeError(w, http.StatusNotFound, "task not found")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
switch t.Status {
|
||||||
|
case TaskPending:
|
||||||
|
t.Status = TaskCancelled
|
||||||
|
now := time.Now()
|
||||||
|
t.DoneAt = &now
|
||||||
|
globalQueue.persistLocked()
|
||||||
|
writeJSON(w, map[string]string{"status": "cancelled"})
|
||||||
|
case TaskRunning:
|
||||||
|
if t.job != nil {
|
||||||
|
t.job.abort()
|
||||||
|
}
|
||||||
|
t.Status = TaskCancelled
|
||||||
|
now := time.Now()
|
||||||
|
t.DoneAt = &now
|
||||||
|
globalQueue.persistLocked()
|
||||||
|
writeJSON(w, map[string]string{"status": "cancelled"})
|
||||||
|
default:
|
||||||
|
writeError(w, http.StatusConflict, "task is not running or pending")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPITasksPriority(w http.ResponseWriter, r *http.Request) {
|
||||||
|
id := r.PathValue("id")
|
||||||
|
t, ok := globalQueue.findByID(id)
|
||||||
|
if !ok {
|
||||||
|
writeError(w, http.StatusNotFound, "task not found")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var req struct {
|
||||||
|
Delta int `json:"delta"`
|
||||||
|
}
|
||||||
|
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||||
|
writeError(w, http.StatusBadRequest, "invalid body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
if t.Status != TaskPending {
|
||||||
|
writeError(w, http.StatusConflict, "only pending tasks can be reprioritised")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
t.Priority += req.Delta
|
||||||
|
globalQueue.persistLocked()
|
||||||
|
writeJSON(w, map[string]int{"priority": t.Priority})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
now := time.Now()
|
||||||
|
n := 0
|
||||||
|
for _, t := range globalQueue.tasks {
|
||||||
|
switch t.Status {
|
||||||
|
case TaskPending:
|
||||||
|
t.Status = TaskCancelled
|
||||||
|
t.DoneAt = &now
|
||||||
|
n++
|
||||||
|
case TaskRunning:
|
||||||
|
if t.job != nil {
|
||||||
|
t.job.abort()
|
||||||
|
}
|
||||||
|
t.Status = TaskCancelled
|
||||||
|
t.DoneAt = &now
|
||||||
|
n++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
globalQueue.persistLocked()
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
writeJSON(w, map[string]int{"cancelled": n})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPITasksStream(w http.ResponseWriter, r *http.Request) {
|
||||||
|
id := r.PathValue("id")
|
||||||
|
// Wait up to 5s for the task to get a job (it may be pending)
|
||||||
|
deadline := time.Now().Add(5 * time.Second)
|
||||||
|
var j *jobState
|
||||||
|
for time.Now().Before(deadline) {
|
||||||
|
if jj, ok := globalQueue.findJob(id); ok {
|
||||||
|
j = jj
|
||||||
|
break
|
||||||
|
}
|
||||||
|
time.Sleep(200 * time.Millisecond)
|
||||||
|
}
|
||||||
|
if j == nil {
|
||||||
|
http.Error(w, "task not found or not yet started", http.StatusNotFound)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
streamJob(w, r, j)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (q *taskQueue) assignTaskLogPathLocked(t *Task) {
|
||||||
|
if t.LogPath != "" || q.logsDir == "" || t.ID == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
t.LogPath = filepath.Join(q.logsDir, t.ID+".log")
|
||||||
|
}
|
||||||
|
|
||||||
|
func (q *taskQueue) loadLocked() {
|
||||||
|
if q.statePath == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
data, err := os.ReadFile(q.statePath)
|
||||||
|
if err != nil || len(data) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var persisted []persistedTask
|
||||||
|
if err := json.Unmarshal(data, &persisted); err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, pt := range persisted {
|
||||||
|
t := &Task{
|
||||||
|
ID: pt.ID,
|
||||||
|
Name: pt.Name,
|
||||||
|
Target: pt.Target,
|
||||||
|
Priority: pt.Priority,
|
||||||
|
Status: pt.Status,
|
||||||
|
CreatedAt: pt.CreatedAt,
|
||||||
|
StartedAt: pt.StartedAt,
|
||||||
|
DoneAt: pt.DoneAt,
|
||||||
|
ErrMsg: pt.ErrMsg,
|
||||||
|
LogPath: pt.LogPath,
|
||||||
|
params: pt.Params,
|
||||||
|
}
|
||||||
|
q.assignTaskLogPathLocked(t)
|
||||||
|
if t.Status == TaskPending || t.Status == TaskRunning {
|
||||||
|
t.Status = TaskPending
|
||||||
|
t.StartedAt = nil
|
||||||
|
t.DoneAt = nil
|
||||||
|
t.ErrMsg = ""
|
||||||
|
}
|
||||||
|
q.tasks = append(q.tasks, t)
|
||||||
|
}
|
||||||
|
q.prune()
|
||||||
|
q.persistLocked()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (q *taskQueue) persistLocked() {
|
||||||
|
if q.statePath == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
state := make([]persistedTask, 0, len(q.tasks))
|
||||||
|
for _, t := range q.tasks {
|
||||||
|
state = append(state, persistedTask{
|
||||||
|
ID: t.ID,
|
||||||
|
Name: t.Name,
|
||||||
|
Target: t.Target,
|
||||||
|
Priority: t.Priority,
|
||||||
|
Status: t.Status,
|
||||||
|
CreatedAt: t.CreatedAt,
|
||||||
|
StartedAt: t.StartedAt,
|
||||||
|
DoneAt: t.DoneAt,
|
||||||
|
ErrMsg: t.ErrMsg,
|
||||||
|
LogPath: t.LogPath,
|
||||||
|
Params: t.params,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
data, err := json.MarshalIndent(state, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
tmp := q.statePath + ".tmp"
|
||||||
|
if err := os.WriteFile(tmp, data, 0644); err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
_ = os.Rename(tmp, q.statePath)
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskElapsedSec(t *Task, now time.Time) int {
|
||||||
|
if t == nil || t.StartedAt == nil || t.StartedAt.IsZero() {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
start := *t.StartedAt
|
||||||
|
if !t.CreatedAt.IsZero() && start.Before(t.CreatedAt) {
|
||||||
|
start = t.CreatedAt
|
||||||
|
}
|
||||||
|
end := now
|
||||||
|
if t.DoneAt != nil && !t.DoneAt.IsZero() {
|
||||||
|
end = *t.DoneAt
|
||||||
|
}
|
||||||
|
if end.Before(start) {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return int(end.Sub(start).Round(time.Second) / time.Second)
|
||||||
|
}
|
||||||
304
audit/internal/webui/tasks_test.go
Normal file
304
audit/internal/webui/tasks_test.go
Normal file
@@ -0,0 +1,304 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/app"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
q := &taskQueue{
|
||||||
|
statePath: filepath.Join(dir, "tasks-state.json"),
|
||||||
|
logsDir: filepath.Join(dir, "tasks"),
|
||||||
|
trigger: make(chan struct{}, 1),
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(q.logsDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
started := time.Now().Add(-time.Minute)
|
||||||
|
task := &Task{
|
||||||
|
ID: "task-1",
|
||||||
|
Name: "Memory Burn-in",
|
||||||
|
Target: "memory-stress",
|
||||||
|
Priority: 2,
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: time.Now().Add(-2 * time.Minute),
|
||||||
|
StartedAt: &started,
|
||||||
|
params: taskParams{
|
||||||
|
Duration: 300,
|
||||||
|
BurnProfile: "smoke",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
q.tasks = append(q.tasks, task)
|
||||||
|
q.assignTaskLogPathLocked(task)
|
||||||
|
q.persistLocked()
|
||||||
|
|
||||||
|
recovered := &taskQueue{
|
||||||
|
statePath: q.statePath,
|
||||||
|
logsDir: q.logsDir,
|
||||||
|
trigger: make(chan struct{}, 1),
|
||||||
|
}
|
||||||
|
recovered.loadLocked()
|
||||||
|
|
||||||
|
if len(recovered.tasks) != 1 {
|
||||||
|
t.Fatalf("tasks=%d want 1", len(recovered.tasks))
|
||||||
|
}
|
||||||
|
got := recovered.tasks[0]
|
||||||
|
if got.Status != TaskPending {
|
||||||
|
t.Fatalf("status=%q want %q", got.Status, TaskPending)
|
||||||
|
}
|
||||||
|
if got.StartedAt != nil {
|
||||||
|
t.Fatalf("started_at=%v want nil for recovered pending task", got.StartedAt)
|
||||||
|
}
|
||||||
|
if got.params.Duration != 300 || got.params.BurnProfile != "smoke" {
|
||||||
|
t.Fatalf("params=%+v", got.params)
|
||||||
|
}
|
||||||
|
if got.LogPath == "" {
|
||||||
|
t.Fatal("expected log path")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNewTaskJobStateLoadsExistingLog(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "task.log")
|
||||||
|
if err := os.WriteFile(path, []byte("line1\nline2\n"), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
j := newTaskJobState(path)
|
||||||
|
existing, ch := j.subscribe()
|
||||||
|
if ch == nil {
|
||||||
|
t.Fatal("expected live subscription channel")
|
||||||
|
}
|
||||||
|
if len(existing) != 2 || existing[0] != "line1" || existing[1] != "line2" {
|
||||||
|
t.Fatalf("existing=%v", existing)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResolveBurnPreset(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
profile string
|
||||||
|
want burnPreset
|
||||||
|
}{
|
||||||
|
{profile: "smoke", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}},
|
||||||
|
{profile: "acceptance", want: burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60}},
|
||||||
|
{profile: "overnight", want: burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60}},
|
||||||
|
{profile: "", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}},
|
||||||
|
}
|
||||||
|
for _, tc := range tests {
|
||||||
|
if got := resolveBurnPreset(tc.profile); got != tc.want {
|
||||||
|
t.Fatalf("resolveBurnPreset(%q)=%+v want %+v", tc.profile, got, tc.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTaskDisplayNameUsesNvidiaStressLoader(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
loader string
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{loader: "", want: "NVIDIA GPU Stress (bee-gpu-burn)"},
|
||||||
|
{loader: "builtin", want: "NVIDIA GPU Stress (bee-gpu-burn)"},
|
||||||
|
{loader: "john", want: "NVIDIA GPU Stress (John/OpenCL)"},
|
||||||
|
{loader: "nccl", want: "NVIDIA GPU Stress (NCCL)"},
|
||||||
|
}
|
||||||
|
for _, tc := range tests {
|
||||||
|
if got := taskDisplayName("nvidia-stress", "acceptance", tc.loader); got != tc.want {
|
||||||
|
t.Fatalf("taskDisplayName(loader=%q)=%q want %q", tc.loader, got, tc.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunTaskHonorsCancel(t *testing.T) {
|
||||||
|
blocked := make(chan struct{})
|
||||||
|
released := make(chan struct{})
|
||||||
|
aRun := func(_ any, ctx context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||||
|
close(blocked)
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
close(released)
|
||||||
|
return "", ctx.Err()
|
||||||
|
case <-time.After(5 * time.Second):
|
||||||
|
close(released)
|
||||||
|
return "unexpected", nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
q := &taskQueue{
|
||||||
|
opts: &HandlerOptions{App: &app.App{}},
|
||||||
|
}
|
||||||
|
tk := &Task{
|
||||||
|
ID: "cpu-1",
|
||||||
|
Name: "CPU SAT",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
params: taskParams{Duration: 60},
|
||||||
|
}
|
||||||
|
j := &jobState{}
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
j.cancel = cancel
|
||||||
|
tk.job = j
|
||||||
|
|
||||||
|
orig := runCPUAcceptancePackCtx
|
||||||
|
runCPUAcceptancePackCtx = func(_ *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return aRun(nil, ctx, baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
defer func() { runCPUAcceptancePackCtx = orig }()
|
||||||
|
|
||||||
|
done := make(chan struct{})
|
||||||
|
go func() {
|
||||||
|
q.runTask(tk, j, ctx)
|
||||||
|
close(done)
|
||||||
|
}()
|
||||||
|
|
||||||
|
<-blocked
|
||||||
|
j.abort()
|
||||||
|
|
||||||
|
select {
|
||||||
|
case <-released:
|
||||||
|
case <-time.After(2 * time.Second):
|
||||||
|
t.Fatal("task did not observe cancel")
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-done:
|
||||||
|
case <-time.After(2 * time.Second):
|
||||||
|
t.Fatal("runTask did not return after cancel")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunTaskUsesBurnProfileDurationForCPU(t *testing.T) {
|
||||||
|
var gotDuration int
|
||||||
|
q := &taskQueue{
|
||||||
|
opts: &HandlerOptions{App: &app.App{}},
|
||||||
|
}
|
||||||
|
tk := &Task{
|
||||||
|
ID: "cpu-burn-1",
|
||||||
|
Name: "CPU Burn-in",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
params: taskParams{BurnProfile: "smoke"},
|
||||||
|
}
|
||||||
|
j := &jobState{}
|
||||||
|
|
||||||
|
orig := runCPUAcceptancePackCtx
|
||||||
|
runCPUAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, durationSec int, _ func(string)) (string, error) {
|
||||||
|
gotDuration = durationSec
|
||||||
|
return "/tmp/cpu-burn.tar.gz", nil
|
||||||
|
}
|
||||||
|
defer func() { runCPUAcceptancePackCtx = orig }()
|
||||||
|
|
||||||
|
q.runTask(tk, j, context.Background())
|
||||||
|
|
||||||
|
if gotDuration != 5*60 {
|
||||||
|
t.Fatalf("duration=%d want %d", gotDuration, 5*60)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
q := &taskQueue{
|
||||||
|
opts: &HandlerOptions{ExportDir: dir},
|
||||||
|
}
|
||||||
|
tk := &Task{
|
||||||
|
ID: "support-bundle-1",
|
||||||
|
Name: "Support Bundle",
|
||||||
|
Target: "support-bundle",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
}
|
||||||
|
j := &jobState{}
|
||||||
|
|
||||||
|
var gotExportDir string
|
||||||
|
orig := buildSupportBundle
|
||||||
|
buildSupportBundle = func(exportDir string) (string, error) {
|
||||||
|
gotExportDir = exportDir
|
||||||
|
return filepath.Join(exportDir, "bundle.tar.gz"), nil
|
||||||
|
}
|
||||||
|
defer func() { buildSupportBundle = orig }()
|
||||||
|
|
||||||
|
q.runTask(tk, j, context.Background())
|
||||||
|
|
||||||
|
if gotExportDir != dir {
|
||||||
|
t.Fatalf("exportDir=%q want %q", gotExportDir, dir)
|
||||||
|
}
|
||||||
|
if j.err != "" {
|
||||||
|
t.Fatalf("unexpected error: %q", j.err)
|
||||||
|
}
|
||||||
|
if !strings.Contains(strings.Join(j.lines, "\n"), "Archive: "+filepath.Join(dir, "bundle.tar.gz")) {
|
||||||
|
t.Fatalf("lines=%v", j.lines)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTaskElapsedSecClampsInvalidStartedAt(t *testing.T) {
|
||||||
|
now := time.Date(2026, 4, 1, 19, 10, 0, 0, time.UTC)
|
||||||
|
created := time.Date(2026, 4, 1, 19, 4, 5, 0, time.UTC)
|
||||||
|
started := time.Time{}
|
||||||
|
task := &Task{
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: created,
|
||||||
|
StartedAt: &started,
|
||||||
|
}
|
||||||
|
if got := taskElapsedSec(task, now); got != 0 {
|
||||||
|
t.Fatalf("taskElapsedSec(zero start)=%d want 0", got)
|
||||||
|
}
|
||||||
|
|
||||||
|
stale := created.Add(-24 * time.Hour)
|
||||||
|
task.StartedAt = &stale
|
||||||
|
if got := taskElapsedSec(task, now); got != int(now.Sub(created).Seconds()) {
|
||||||
|
t.Fatalf("taskElapsedSec(stale start)=%d want %d", got, int(now.Sub(created).Seconds()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunTaskInstallUsesSharedCommandStreaming(t *testing.T) {
|
||||||
|
q := &taskQueue{
|
||||||
|
opts: &HandlerOptions{},
|
||||||
|
}
|
||||||
|
tk := &Task{
|
||||||
|
ID: "install-1",
|
||||||
|
Name: "Install to Disk",
|
||||||
|
Target: "install",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
params: taskParams{Device: "/dev/sda"},
|
||||||
|
}
|
||||||
|
j := &jobState{}
|
||||||
|
|
||||||
|
var gotDevice string
|
||||||
|
var gotLogPath string
|
||||||
|
orig := installCommand
|
||||||
|
installCommand = func(ctx context.Context, device string, logPath string) *exec.Cmd {
|
||||||
|
gotDevice = device
|
||||||
|
gotLogPath = logPath
|
||||||
|
return exec.CommandContext(ctx, "sh", "-c", "printf 'line1\nline2\n'")
|
||||||
|
}
|
||||||
|
defer func() { installCommand = orig }()
|
||||||
|
|
||||||
|
q.runTask(tk, j, context.Background())
|
||||||
|
|
||||||
|
if gotDevice != "/dev/sda" {
|
||||||
|
t.Fatalf("device=%q want /dev/sda", gotDevice)
|
||||||
|
}
|
||||||
|
if gotLogPath == "" {
|
||||||
|
t.Fatal("expected install log path")
|
||||||
|
}
|
||||||
|
logs := strings.Join(j.lines, "\n")
|
||||||
|
if !strings.Contains(logs, "Install log: ") {
|
||||||
|
t.Fatalf("missing install log line: %v", j.lines)
|
||||||
|
}
|
||||||
|
if !strings.Contains(logs, "line1") || !strings.Contains(logs, "line2") {
|
||||||
|
t.Fatalf("missing streamed output: %v", j.lines)
|
||||||
|
}
|
||||||
|
if j.err != "" {
|
||||||
|
t.Fatalf("unexpected error: %q", j.err)
|
||||||
|
}
|
||||||
|
}
|
||||||
2
bible
2
bible
Submodule bible updated: 456c1f022c...688b87e98d
@@ -9,6 +9,34 @@ All live metrics charts in the web UI are server-side SVG images served by Go
|
|||||||
and polled by the browser every 2 seconds via `<img src="...?t=now">`.
|
and polled by the browser every 2 seconds via `<img src="...?t=now">`.
|
||||||
There is no client-side canvas or JS chart library.
|
There is no client-side canvas or JS chart library.
|
||||||
|
|
||||||
|
## Rule: live charts must be visually uniform
|
||||||
|
|
||||||
|
Live charts are a single UI family, not a set of one-off widgets. New charts and
|
||||||
|
changes to existing charts must keep the same rendering model and presentation
|
||||||
|
rules unless there is an explicit architectural decision to diverge.
|
||||||
|
|
||||||
|
Default expectations:
|
||||||
|
|
||||||
|
- same server-side SVG pipeline for all live metrics charts
|
||||||
|
- same refresh behaviour and failure handling in the browser
|
||||||
|
- same canvas size class and card layout
|
||||||
|
- same legend placement policy across charts
|
||||||
|
- same axis, title, and summary conventions
|
||||||
|
- no chart-specific visual exceptions added as a quick fix
|
||||||
|
|
||||||
|
Current default for live charts:
|
||||||
|
|
||||||
|
- legend below the plot area when a chart has 8 series or fewer
|
||||||
|
- legend hidden when a chart has more than 8 series
|
||||||
|
- 10 equal Y-axis steps across the chart height
|
||||||
|
- 1400 x 360 SVG canvas with legend
|
||||||
|
- 1400 x 288 SVG canvas without legend
|
||||||
|
- full-width card rendering in a single-column stack
|
||||||
|
|
||||||
|
If one chart needs a different layout or legend behaviour, treat that as a
|
||||||
|
design-level decision affecting the whole chart family, not as a local tweak to
|
||||||
|
just one endpoint.
|
||||||
|
|
||||||
### Why go-analyze/charts
|
### Why go-analyze/charts
|
||||||
|
|
||||||
- Pure Go, no CGO — builds cleanly inside the live-build container
|
- Pure Go, no CGO — builds cleanly inside the live-build container
|
||||||
@@ -29,7 +57,8 @@ self-contained SVG renderer used **only** for completed SAT run reports
|
|||||||
| `GET /api/metrics/chart/server.svg` | CPU temp, CPU load %, mem load %, power W, fan RPMs |
|
| `GET /api/metrics/chart/server.svg` | CPU temp, CPU load %, mem load %, power W, fan RPMs |
|
||||||
| `GET /api/metrics/chart/gpu/{idx}.svg` | GPU temp °C, load %, mem %, power W |
|
| `GET /api/metrics/chart/gpu/{idx}.svg` | GPU temp °C, load %, mem %, power W |
|
||||||
|
|
||||||
Charts are 1400 × 280 px SVG. The page renders them at `width: 100%` in a
|
Charts are 1400 × 360 px SVG when the legend is shown, and 1400 × 288 px when
|
||||||
|
the legend is hidden. The page renders them at `width: 100%` in a
|
||||||
single-column layout so they always fill the viewport width.
|
single-column layout so they always fill the viewport width.
|
||||||
|
|
||||||
### Ring buffers
|
### Ring buffers
|
||||||
|
|||||||
@@ -60,6 +60,8 @@ Rules:
|
|||||||
- Chromium opens `http://localhost/` — the full interactive web UI
|
- Chromium opens `http://localhost/` — the full interactive web UI
|
||||||
- SSH is independent from the desktop path
|
- SSH is independent from the desktop path
|
||||||
- serial console support is enabled for VM boot debugging
|
- serial console support is enabled for VM boot debugging
|
||||||
|
- Default boot keeps the server-safe graphics path (`nomodeset` + forced `fbdev`) for IPMI/BMC consoles
|
||||||
|
- Higher-resolution mode selection is expected only when booting through an explicit `bee.display=kms` menu entry, which disables the forced `fbdev` Xorg config before `lightdm`
|
||||||
|
|
||||||
## ISO build sequence
|
## ISO build sequence
|
||||||
|
|
||||||
@@ -81,9 +83,9 @@ build-in-container.sh [--authorized-keys /path/to/keys]
|
|||||||
7. `build-cublas.sh`:
|
7. `build-cublas.sh`:
|
||||||
a. download `libcublas`, `libcublasLt`, `libcudart` runtime + dev packages from the NVIDIA CUDA Debian repo
|
a. download `libcublas`, `libcublasLt`, `libcudart` runtime + dev packages from the NVIDIA CUDA Debian repo
|
||||||
b. verify packages against repo `Packages.gz`
|
b. verify packages against repo `Packages.gz`
|
||||||
c. extract headers for `bee-gpu-stress` build
|
c. extract headers for `bee-gpu-burn` worker build
|
||||||
d. cache userspace libs in `dist/cublas-<version>+cuda<series>/`
|
d. cache userspace libs in `dist/cublas-<version>+cuda<series>/`
|
||||||
8. build `bee-gpu-stress` against extracted cuBLASLt/cudart headers
|
8. build `bee-gpu-burn` worker against extracted cuBLASLt/cudart headers
|
||||||
9. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/`
|
9. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/`
|
||||||
10. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi`
|
10. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi`
|
||||||
11. inject `libnvidia-ml` + `libcuda` + `libcublas` + `libcublasLt` + `libcudart` → staged `/usr/lib/`
|
11. inject `libnvidia-ml` + `libcuda` + `libcublas` + `libcublasLt` + `libcudart` → staged `/usr/lib/`
|
||||||
@@ -104,7 +106,7 @@ Build host notes:
|
|||||||
1. `build-in-container.sh` / `build-nvidia-module.sh` — Debian kernel headers for module build
|
1. `build-in-container.sh` / `build-nvidia-module.sh` — Debian kernel headers for module build
|
||||||
2. `auto/config` — `linux-image-${DEBIAN_KERNEL_ABI}` in the ISO
|
2. `auto/config` — `linux-image-${DEBIAN_KERNEL_ABI}` in the ISO
|
||||||
- NVIDIA modules go to staged `usr/local/lib/nvidia/` — NOT to `/lib/modules/<kver>/extra/`.
|
- NVIDIA modules go to staged `usr/local/lib/nvidia/` — NOT to `/lib/modules/<kver>/extra/`.
|
||||||
- `bee-gpu-stress` must be built against cached CUDA userspace headers from `build-cublas.sh`, not against random host-installed CUDA headers.
|
- `bee-gpu-burn` worker must be built against cached CUDA userspace headers from `build-cublas.sh`, not against random host-installed CUDA headers.
|
||||||
- The live ISO must ship `libcublas`, `libcublasLt`, and `libcudart` together with `libcuda` so tensor-core stress works without internet or package installs at boot.
|
- The live ISO must ship `libcublas`, `libcublasLt`, and `libcudart` together with `libcuda` so tensor-core stress works without internet or package installs at boot.
|
||||||
- The source overlay in `iso/overlay/` is treated as immutable source. Build-time files are injected only into the staged overlay.
|
- The source overlay in `iso/overlay/` is treated as immutable source. Build-time files are injected only into the staged overlay.
|
||||||
- The live-build workdir under `dist/` is disposable; source files under `iso/builder/` stay clean.
|
- The live-build workdir under `dist/` is disposable; source files under `iso/builder/` stay clean.
|
||||||
@@ -153,18 +155,17 @@ Current validation state:
|
|||||||
Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.
|
Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.
|
||||||
|
|
||||||
Acceptance flows:
|
Acceptance flows:
|
||||||
- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + mixed-precision `bee-gpu-stress`
|
- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + lightweight `bee-gpu-burn`
|
||||||
|
- NVIDIA GPU burn-in can use either `bee-gpu-burn` or `bee-john-gpu-stress` (John the Ripper jumbo via OpenCL)
|
||||||
- `bee sat memory` → `memtester` archive
|
- `bee sat memory` → `memtester` archive
|
||||||
- `bee sat storage` → SMART/NVMe diagnostic archive and short self-test trigger where supported
|
- `bee sat storage` → SMART/NVMe diagnostic archive and short self-test trigger where supported
|
||||||
- SAT `summary.txt` now includes `overall_status` and per-job `*_status` values (`OK`, `FAILED`, `UNSUPPORTED`)
|
- SAT `summary.txt` now includes `overall_status` and per-job `*_status` values (`OK`, `FAILED`, `UNSUPPORTED`)
|
||||||
- `bee-gpu-stress` should prefer cuBLASLt GEMM load over the old integer/PTX burn path:
|
- `bee-gpu-burn` should prefer cuBLASLt GEMM load over the old integer/PTX burn path:
|
||||||
- Ampere: `fp16` + `fp32`/TF32 tensor-core load
|
- Ampere: `fp16` + `fp32`/TF32 tensor-core load
|
||||||
- Ada / Hopper: add `fp8`
|
- Ada / Hopper: add `fp8`
|
||||||
- Blackwell+: add `fp4`
|
- Blackwell+: add `fp4`
|
||||||
- PTX fallback is only for missing cuBLASLt/userspace or unsupported narrow datatypes
|
- PTX fallback is only for missing cuBLASLt/userspace or unsupported narrow datatypes
|
||||||
- Runtime overrides:
|
- Runtime overrides:
|
||||||
- `BEE_GPU_STRESS_SECONDS`
|
|
||||||
- `BEE_GPU_STRESS_SIZE_MB`
|
|
||||||
- `BEE_MEMTESTER_SIZE_MB`
|
- `BEE_MEMTESTER_SIZE_MB`
|
||||||
- `BEE_MEMTESTER_PASSES`
|
- `BEE_MEMTESTER_PASSES`
|
||||||
|
|
||||||
@@ -179,6 +180,6 @@ Web UI: Acceptance Tests page → Run Test button
|
|||||||
```
|
```
|
||||||
|
|
||||||
**Critical invariants:**
|
**Critical invariants:**
|
||||||
- `bee-gpu-stress` uses `exec.CommandContext` — killed on job context cancel.
|
- `bee-gpu-burn` / `bee-john-gpu-stress` use `exec.CommandContext` — killed on job context cancel.
|
||||||
- Metric goroutine uses stopCh/doneCh pattern; main goroutine waits `<-doneCh` before reading rows (no mutex needed).
|
- Metric goroutine uses stopCh/doneCh pattern; main goroutine waits `<-doneCh` before reading rows (no mutex needed).
|
||||||
- SVG chart is fully offline: no JS, no external CSS, pure inline SVG.
|
- SVG chart is fully offline: no JS, no external CSS, pure inline SVG.
|
||||||
|
|||||||
@@ -21,8 +21,8 @@ Fills gaps where Redfish/logpile is blind:
|
|||||||
- Read-only hardware inventory: board, CPU, memory, storage, PCIe, PSU, GPU, NIC, RAID
|
- Read-only hardware inventory: board, CPU, memory, storage, PCIe, PSU, GPU, NIC, RAID
|
||||||
- Machine-readable health summary derived from collector verdicts
|
- Machine-readable health summary derived from collector verdicts
|
||||||
- Operator-triggered acceptance tests for NVIDIA, memory, and storage
|
- Operator-triggered acceptance tests for NVIDIA, memory, and storage
|
||||||
- NVIDIA SAT includes both diagnostic collection and mixed-precision GPU stress via `bee-gpu-stress`
|
- NVIDIA SAT includes diagnostic collection plus a lightweight in-image GPU stress step via `bee-gpu-burn`
|
||||||
- `bee-gpu-stress` should exercise tensor/inference paths (`fp16`, `fp32`/TF32, `fp8`, `fp4` when supported by the GPU/userspace stack) and fall back to Driver API PTX burn only if cuBLASLt is unavailable
|
- `bee-gpu-burn` should exercise tensor/inference paths (`fp16`, `fp32`/TF32, `fp8`, `fp4` when supported by the GPU/userspace stack) and fall back to Driver API PTX burn only if cuBLASLt is unavailable
|
||||||
- Automatic boot audit with operator-facing local console and SSH access
|
- Automatic boot audit with operator-facing local console and SSH access
|
||||||
- NVIDIA proprietary driver loaded at boot for GPU enrichment via `nvidia-smi`
|
- NVIDIA proprietary driver loaded at boot for GPU enrichment via `nvidia-smi`
|
||||||
- SSH access (OpenSSH) always available for inspection and debugging
|
- SSH access (OpenSSH) always available for inspection and debugging
|
||||||
@@ -70,7 +70,7 @@ Fills gaps where Redfish/logpile is blind:
|
|||||||
| SSH | OpenSSH server |
|
| SSH | OpenSSH server |
|
||||||
| NVIDIA driver | Proprietary `.run` installer, built against Debian kernel headers |
|
| NVIDIA driver | Proprietary `.run` installer, built against Debian kernel headers |
|
||||||
| NVIDIA modules | Loaded via `insmod` from `/usr/local/lib/nvidia/` |
|
| NVIDIA modules | Loaded via `insmod` from `/usr/local/lib/nvidia/` |
|
||||||
| GPU stress backend | `bee-gpu-stress` + cuBLASLt/cuBLAS/cudart mixed-precision GEMM, with Driver API PTX fallback |
|
| GPU stress backend | `bee-gpu-burn` + cuBLASLt/cuBLAS/cudart mixed-precision GEMM, with Driver API PTX fallback |
|
||||||
| Builder | Debian 12 host/VM or Debian 12 container image |
|
| Builder | Debian 12 host/VM or Debian 12 container image |
|
||||||
|
|
||||||
## Operator UX
|
## Operator UX
|
||||||
|
|||||||
@@ -18,6 +18,8 @@ Use the official proprietary NVIDIA `.run` installer for both kernel modules and
|
|||||||
- Kernel modules and nvidia-smi come from a single verified source.
|
- Kernel modules and nvidia-smi come from a single verified source.
|
||||||
- NVIDIA publishes `.sha256sum` alongside each installer — download and verify before use.
|
- NVIDIA publishes `.sha256sum` alongside each installer — download and verify before use.
|
||||||
- Driver version pinned in `iso/builder/VERSIONS` as `NVIDIA_DRIVER_VERSION`.
|
- Driver version pinned in `iso/builder/VERSIONS` as `NVIDIA_DRIVER_VERSION`.
|
||||||
|
- DCGM must track the CUDA user-mode driver major version exposed by `nvidia-smi`.
|
||||||
|
- For NVIDIA driver branch `590` with CUDA `13.x`, use DCGM 4 package family `datacenter-gpu-manager-4-cuda13`; legacy `datacenter-gpu-manager` 3.x does not provide a working path for this stack.
|
||||||
- Build process: download `.run`, extract, compile `kernel/` sources against `linux-lts-dev`.
|
- Build process: download `.run`, extract, compile `kernel/` sources against `linux-lts-dev`.
|
||||||
- Modules cached in `dist/nvidia-<version>-<kver>/` — rebuild only on version or kernel change.
|
- Modules cached in `dist/nvidia-<version>-<kver>/` — rebuild only on version or kernel change.
|
||||||
- ISO size increases by ~50MB for .ko files + nvidia-smi.
|
- ISO size increases by ~50MB for .ko files + nvidia-smi.
|
||||||
|
|||||||
224
bible-local/decisions/2026-04-01-memtest-build-strategy.md
Normal file
224
bible-local/decisions/2026-04-01-memtest-build-strategy.md
Normal file
@@ -0,0 +1,224 @@
|
|||||||
|
# Decision: Treat memtest as explicit ISO content, not as trusted live-build magic
|
||||||
|
|
||||||
|
**Date:** 2026-04-01
|
||||||
|
**Status:** resolved
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
We have already iterated on `memtest` multiple times and kept cycling between the same ideas.
|
||||||
|
The commit history shows several distinct attempts:
|
||||||
|
|
||||||
|
- `f91bce8` — fixed Bookworm memtest file names to `memtest86+x64.bin` / `memtest86+x64.efi`
|
||||||
|
- `5857805` — added a binary hook to copy memtest files from the build tree into the ISO root
|
||||||
|
- `f96b149` — added fallback extraction from the cached `.deb` when `chroot/boot/` stayed empty
|
||||||
|
- `d43a9ae` — removed the custom hook and switched back to live-build built-in memtest integration
|
||||||
|
- `60cb8f8` — restored explicit memtest menu entries and added ISO validation
|
||||||
|
- `3dbc218` / `3869788` — added archived build logs and better memtest diagnostics
|
||||||
|
|
||||||
|
Current evidence from the archived `easy-bee-nvidia-v3.14-amd64` logs dated 2026-04-01:
|
||||||
|
|
||||||
|
- `lb binary_memtest` does run and installs `memtest86+`
|
||||||
|
- but the final ISO still does **not** contain `boot/memtest86+x64.bin`
|
||||||
|
- the final ISO also does **not** contain memtest menu entries in `boot/grub/grub.cfg` or `isolinux/live.cfg`
|
||||||
|
|
||||||
|
So the assumption "live-build built-in memtest integration is enough on this stack" is currently false for this project until proven otherwise by a real built ISO.
|
||||||
|
|
||||||
|
Additional evidence from the archived `easy-bee-nvidia-v3.17-dirty-amd64` logs dated 2026-04-01:
|
||||||
|
|
||||||
|
- the build now completes successfully because memtest is non-blocking by default
|
||||||
|
- `lb binary_memtest` still runs and installs `memtest86+`
|
||||||
|
- the project-owned hook `config/hooks/normal/9100-memtest.hook.binary` does execute
|
||||||
|
- but it executes too early for its current target paths:
|
||||||
|
- `binary/boot/grub/grub.cfg` is still missing at hook time
|
||||||
|
- `binary/isolinux/live.cfg` is still missing at hook time
|
||||||
|
- memtest binaries are also still absent in `binary/boot/`
|
||||||
|
- later in the build, live-build does create intermediate bootloader configs with memtest lines in the workdir
|
||||||
|
- but the final ISO still lacks memtest binaries and still lacks memtest lines in extracted ISO `boot/grub/grub.cfg` and `isolinux/live.cfg`
|
||||||
|
|
||||||
|
So the assumption "the current normal binary hook path is late enough to patch final memtest artifacts" is also false.
|
||||||
|
|
||||||
|
Correction after inspecting the real `easy-bee-nvidia-v3.20-5-g76a9100-amd64.iso`
|
||||||
|
artifact dated 2026-04-01:
|
||||||
|
|
||||||
|
- the final ISO does contain `boot/memtest86+x64.bin`
|
||||||
|
- the final ISO does contain `boot/memtest86+x64.efi`
|
||||||
|
- the final ISO does contain memtest menu entries in both `boot/grub/grub.cfg`
|
||||||
|
and `isolinux/live.cfg`
|
||||||
|
- so `v3.20-5-g76a9100` was **not** another real memtest regression in the
|
||||||
|
shipped ISO
|
||||||
|
- the regression was in the build-time validator/debug path in `build.sh`
|
||||||
|
|
||||||
|
Root cause of the false alarm:
|
||||||
|
|
||||||
|
- `build.sh` treated "ISO reader command exists" as equivalent to "ISO reader
|
||||||
|
successfully listed/extracted members"
|
||||||
|
- `iso_list_files` / `iso_extract_file` failures were collapsed into the same
|
||||||
|
observable output as "memtest content missing"
|
||||||
|
- this made a reader failure look identical to a missing memtest payload
|
||||||
|
- as a result, we re-entered the same memtest investigation loop even though
|
||||||
|
the real ISO was already correct
|
||||||
|
|
||||||
|
Additional correction from the subsequent `v3.21` build logs dated 2026-04-01:
|
||||||
|
|
||||||
|
- once ISO reading was fixed, the post-build debug correctly showed the raw ISO
|
||||||
|
still carried live-build's default memtest layout (`live/memtest.bin`,
|
||||||
|
`live/memtest.efi`, `boot/grub/memtest.cfg`, `isolinux/memtest.cfg`)
|
||||||
|
- that mismatch is expected to trigger project recovery, because `bee` requires
|
||||||
|
`boot/memtest86+x64.bin` / `boot/memtest86+x64.efi` plus matching menu paths
|
||||||
|
- however, `build.sh` exited before recovery because `set -e` treated a direct
|
||||||
|
`iso_memtest_present` return code of `1` as fatal
|
||||||
|
- so the next repeated loop was caused by shell control flow, not by proof that
|
||||||
|
the recovery design itself was wrong
|
||||||
|
|
||||||
|
## Known Failed Attempts
|
||||||
|
|
||||||
|
These approaches were already tried and should not be repeated blindly:
|
||||||
|
|
||||||
|
1. Built-in live-build memtest only.
|
||||||
|
Reason it failed:
|
||||||
|
- `lb binary_memtest` runs, but the final ISO still misses memtest binaries and menu entries.
|
||||||
|
|
||||||
|
2. Fixing only the memtest file names for Debian Bookworm.
|
||||||
|
Reason it failed:
|
||||||
|
- correct file names alone do not make the files appear in the final ISO.
|
||||||
|
|
||||||
|
3. Copying memtest from `chroot/boot/` into `binary/boot/` via a binary hook.
|
||||||
|
Reason it failed:
|
||||||
|
- in this stack `chroot/boot/` is often empty for memtest payloads at the relevant time.
|
||||||
|
|
||||||
|
4. Fallback extraction from cached `memtest86+` `.deb`.
|
||||||
|
Reason it failed:
|
||||||
|
- this was explored already and was not enough to stabilize the final ISO path end-to-end.
|
||||||
|
|
||||||
|
5. Restoring explicit memtest menu entries in source bootloader templates only.
|
||||||
|
Reason it failed:
|
||||||
|
- memtest lines in source templates or intermediate workdir configs do not guarantee the final ISO contains them.
|
||||||
|
|
||||||
|
6. Patching `binary/boot/grub/grub.cfg` and `binary/isolinux/live.cfg` from the current `config/hooks/normal/9100-memtest.hook.binary`.
|
||||||
|
Reason it failed:
|
||||||
|
- the hook runs before those files exist, so the hook cannot patch them there.
|
||||||
|
|
||||||
|
## What This Means
|
||||||
|
|
||||||
|
When revisiting memtest later, start from the constraints above rather than retrying the same patterns:
|
||||||
|
|
||||||
|
- do not assume the built-in memtest stage is sufficient
|
||||||
|
- do not assume `chroot/boot/` will contain memtest payloads
|
||||||
|
- do not assume source bootloader templates are the last writer of final ISO configs
|
||||||
|
- do not assume the current normal binary hook timing is late enough for final patching
|
||||||
|
|
||||||
|
Any future memtest fix must explicitly identify:
|
||||||
|
|
||||||
|
- where the memtest binaries are reliably available at build time
|
||||||
|
- which exact build stage writes the final bootloader configs that land in the ISO
|
||||||
|
- and a post-build proof from a real ISO, not only from intermediate workdir files
|
||||||
|
- whether the ISO inspection step itself succeeded, rather than merely whether
|
||||||
|
the validator printed a memtest warning
|
||||||
|
- whether a non-zero probe is intentionally handled inside an `if` / `case`
|
||||||
|
context rather than accidentally tripping `set -e`
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
For `bee`, memtest must be treated as an explicit ISO artifact with explicit post-build validation.
|
||||||
|
|
||||||
|
Project rules from now on:
|
||||||
|
|
||||||
|
- Do **not** trust `--memtest memtest86+` by itself.
|
||||||
|
- A memtest implementation is considered valid only if the produced ISO actually contains:
|
||||||
|
- `boot/memtest86+x64.bin`
|
||||||
|
- `boot/memtest86+x64.efi`
|
||||||
|
- a GRUB menu entry
|
||||||
|
- an isolinux menu entry
|
||||||
|
- If live-build built-in integration does not produce those artifacts, use an explicit project-owned mechanism such as:
|
||||||
|
- a binary hook copying files into `binary/boot/`
|
||||||
|
- extraction from the cached `memtest86+` `.deb`
|
||||||
|
- another deterministic build-time copy step
|
||||||
|
- Do **not** remove such explicit logic later unless a fresh real ISO build proves that built-in integration alone produces all required files and menu entries.
|
||||||
|
|
||||||
|
Current implementation direction:
|
||||||
|
|
||||||
|
- keep the live-build memtest stage enabled if it helps package acquisition
|
||||||
|
- do not rely on the current early `binary_hooks` timing for final patching
|
||||||
|
- prefer a post-`lb build` recovery step in `build.sh` that:
|
||||||
|
- patches the fully materialized `LB_DIR/binary` tree
|
||||||
|
- injects memtest binaries there
|
||||||
|
- ensures final bootloader entries there
|
||||||
|
- reruns late binary stages (`binary_checksums`, `binary_iso`, `binary_zsync`) after the patch
|
||||||
|
- also treat ISO validation tooling as part of the critical path:
|
||||||
|
- install a stable ISO reader in the builder image
|
||||||
|
- fail with an explicit reader error if ISO listing/extraction fails
|
||||||
|
- do not treat reader failure as evidence that memtest is missing
|
||||||
|
- do not call a probe that may return "needs recovery" as a bare command under
|
||||||
|
`set -e`; wrap it in explicit control flow
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
- Future memtest changes must begin by reading this ADR and the commits listed above.
|
||||||
|
- Future memtest changes must also begin by reading the failed-attempt list above.
|
||||||
|
- We should stop re-introducing "prefer built-in live-build memtest" as a default assumption without new evidence.
|
||||||
|
- Memtest validation in `build.sh` is not optional; it is the acceptance gate that prevents another silent regression.
|
||||||
|
- But validation output is only trustworthy if ISO reading itself succeeded. A
|
||||||
|
"missing memtest" warning without a successful ISO read is not evidence.
|
||||||
|
- If we change memtest strategy again, we must update this ADR with the exact build evidence that justified the change.
|
||||||
|
|
||||||
|
## Working Solution (confirmed 2026-04-01, commits 76a9100 → 2baf3be)
|
||||||
|
|
||||||
|
This approach was confirmed working in ISO `easy-bee-nvidia-v3.20-5-g76a9100-amd64.iso`
|
||||||
|
and validated again in subsequent builds. The final ISO contains all required memtest artifacts.
|
||||||
|
|
||||||
|
### Components
|
||||||
|
|
||||||
|
**1. Binary hook `config/hooks/normal/9100-memtest.hook.binary`**
|
||||||
|
|
||||||
|
Runs inside the live-build binary phase. Does not rely on patching bootloader files at hook time —
|
||||||
|
those files may not exist yet. Instead:
|
||||||
|
|
||||||
|
- Tries to copy `memtest86+x64.bin` / `memtest86+x64.efi` from `chroot/boot/` first.
|
||||||
|
- Falls back to extracting from the cached `.deb` (via `dpkg-deb -x`) if `chroot/boot/` is empty.
|
||||||
|
- Appends GRUB and isolinux menu entries only if the respective cfg files already exist at hook time.
|
||||||
|
If they do not exist, the hook warns and continues (does not fail).
|
||||||
|
|
||||||
|
Setting the `BEE_REQUIRE_MEMTEST=1` environment variable turns these warnings into hard errors when needed.
|
||||||
|
|
||||||
|
**2. Post-`lb build` recovery step in `build.sh`**
|
||||||
|
|
||||||
|
After `lb build` completes, `build.sh` checks whether the fully materialized `binary/` tree
|
||||||
|
contains all required memtest artifacts. If not:
|
||||||
|
|
||||||
|
- Copies/extracts memtest binaries into `binary/boot/`.
|
||||||
|
- Patches `binary/boot/grub/grub.cfg` and `binary/isolinux/live.cfg` directly.
|
||||||
|
- Reruns the late binary stages (`binary_checksums`, `binary_iso`, `binary_zsync`) to rebuild
|
||||||
|
the ISO with the patched tree.
|
||||||
|
|
||||||
|
This is the deterministic safety net: even if the hook runs at the wrong time, the recovery
|
||||||
|
step handles the final `binary/` tree after live-build has written all bootloader configs.
|
||||||
|
|
||||||
|
**3. ISO validation hardening**
|
||||||
|
|
||||||
|
The memtest probe in `build.sh` is wrapped in explicit `if` / `case` control flow, not called
|
||||||
|
as a bare command under `set -e`. A non-zero probe return (needs recovery) is intentional and
|
||||||
|
handled — it does not abort the build prematurely.
|
||||||
|
|
||||||
|
ISO reading (`xorriso -indev -ls` / extraction) is treated as a separate prerequisite.
|
||||||
|
If the reader fails, the validator reports a reader error explicitly, not a memtest warning.
|
||||||
|
This prevents the false-negative loop that cost us builds v3.14–v3.19 on 2026-04-01.
|
||||||
|
|
||||||
|
### Why this works when earlier attempts did not
|
||||||
|
|
||||||
|
The earlier patterns all shared a single flaw: they assumed a single build-time point
|
||||||
|
(hook or source template) would be the last writer of bootloader configs and memtest payloads.
|
||||||
|
In live-build on Debian Bookworm that assumption is false — live-build continues writing
|
||||||
|
bootloader files after custom hooks run, and `chroot/boot/` does not reliably hold memtest payloads.
|
||||||
|
|
||||||
|
The recovery step sidesteps the ordering problem entirely: it acts on the fully materialized
|
||||||
|
`binary/` tree after `lb build` finishes, then rebuilds the ISO from that patched tree.
|
||||||
|
There is no ordering dependency to get wrong.
|
||||||
|
|
||||||
|
### Do not revert
|
||||||
|
|
||||||
|
Do not remove the recovery step or the hook without a fresh real ISO build proving
|
||||||
|
live-build alone produces all four required artifacts:
|
||||||
|
- `boot/memtest86+x64.bin`
|
||||||
|
- `boot/memtest86+x64.efi`
|
||||||
|
- memtest entry in `boot/grub/grub.cfg`
|
||||||
|
- memtest entry in `isolinux/live.cfg`
|
||||||
@@ -5,3 +5,4 @@ One file per decision, named `YYYY-MM-DD-short-topic.md`.
|
|||||||
| Date | Decision | Status |
|
| Date | Decision | Status |
|
||||||
|---|---|---|
|
|---|---|---|
|
||||||
| 2026-03-05 | Use NVIDIA proprietary driver | active |
|
| 2026-03-05 | Use NVIDIA proprietary driver | active |
|
||||||
|
| 2026-04-01 | Treat memtest as explicit ISO content | active |
|
||||||
|
|||||||
62
bible-local/docs/iso-build-rules.md
Normal file
62
bible-local/docs/iso-build-rules.md
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
# ISO Build Rules
|
||||||
|
|
||||||
|
## Verify package names before use
|
||||||
|
|
||||||
|
ISO builds take 30–60 minutes. A wrong package name wastes an entire build cycle.
|
||||||
|
|
||||||
|
**Rule: before adding any Debian package name to the ISO config, verify it exists and check its file list.**
|
||||||
|
|
||||||
|
Use one of:
|
||||||
|
- `https://packages.debian.org/bookworm/<package-name>` — existence + description
|
||||||
|
- `https://packages.debian.org/bookworm/amd64/<package-name>/filelist` — exact files installed
|
||||||
|
- `apt-cache show <package>` inside a Debian bookworm container
|
||||||
|
|
||||||
|
This applies to:
|
||||||
|
- `iso/builder/config/package-lists/*.list.chroot`
|
||||||
|
- Any package referenced in bootloader configs, hooks, or overlay scripts
|
||||||
|
|
||||||
|
## Memtest rule
|
||||||
|
|
||||||
|
Do not assume live-build's built-in memtest integration is sufficient for `bee`.
|
||||||
|
We already tried that path and regressed again on 2026-04-01: `lb binary_memtest`
|
||||||
|
ran, but the final ISO still lacked memtest binaries and menu entries.
|
||||||
|
|
||||||
|
For this project, memtest is accepted only when the produced ISO actually
|
||||||
|
contains all of the following:
|
||||||
|
|
||||||
|
- `boot/memtest86+x64.bin`
|
||||||
|
- `boot/memtest86+x64.efi`
|
||||||
|
- a memtest entry in `boot/grub/grub.cfg`
|
||||||
|
- a memtest entry in `isolinux/live.cfg`
|
||||||
|
|
||||||
|
Rules:
|
||||||
|
|
||||||
|
- Keep explicit post-build memtest validation in `build.sh`.
|
||||||
|
- Treat ISO reader success as a separate prerequisite from memtest content.
|
||||||
|
If the reader cannot list or extract from the ISO, that is a validator
|
||||||
|
failure, not proof that memtest is missing.
|
||||||
|
- If built-in integration does not produce the artifacts above, use a
|
||||||
|
deterministic project-owned copy/extract step instead of hoping live-build
|
||||||
|
will "start working".
|
||||||
|
- Do not switch back to built-in-only memtest without fresh build evidence from
|
||||||
|
a real ISO.
|
||||||
|
- If you reference memtest files manually, verify the exact package file list
|
||||||
|
first for the target Debian release.
|
||||||
|
|
||||||
|
Known bad loops for this repository:
|
||||||
|
|
||||||
|
- Do not retry built-in-only memtest without new evidence. We already proved
|
||||||
|
that `lb binary_memtest` can run while the final ISO still has no memtest.
|
||||||
|
- Do not assume fixing memtest file names is enough. Correct names did not fix
|
||||||
|
the final artifact path.
|
||||||
|
- Do not assume `chroot/boot/` contains memtest payloads at the time hooks run.
|
||||||
|
- Do not assume source `grub.cfg` / `live.cfg.in` are the final writers of ISO
|
||||||
|
bootloader configs.
|
||||||
|
- Do not assume the current `config/hooks/normal/9100-memtest.hook.binary`
|
||||||
|
timing is late enough to patch final `binary/boot/grub/grub.cfg` or
|
||||||
|
`binary/isolinux/live.cfg`; logs from 2026-04-01 showed those files were not
|
||||||
|
present yet when the hook executed.
|
||||||
|
- Do not treat a validator warning as ground truth until you have confirmed the
|
||||||
|
ISO reader actually succeeded. On 2026-04-01 we misdiagnosed another memtest
|
||||||
|
regression because the final ISO was correct but the validator produced a
|
||||||
|
false negative.
|
||||||
35
bible-local/docs/validate-vs-burn.md
Normal file
35
bible-local/docs/validate-vs-burn.md
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
# Validate vs Burn: Hardware Impact Policy
|
||||||
|
|
||||||
|
## Validate Tests (non-destructive)
|
||||||
|
|
||||||
|
Tests on the **Validate** page are purely diagnostic. They:
|
||||||
|
|
||||||
|
- **Do not write to disks** — no data is written to storage devices, so the tests themselves do not advance wear-related SMART counters (load cycle count, reallocated sectors); power-on hours advance only with normal uptime, not because of the test.
|
||||||
|
- **Do not run sustained high load** — commands complete quickly (seconds to minutes) and do not push hardware to thermal or electrical limits.
|
||||||
|
- **Do not increment hardware wear counters** — GPU memory ECC counters, NVMe wear leveling counters, and similar endurance metrics are unaffected.
|
||||||
|
- **Are safe to run repeatedly** — on new, production-bound, or already-deployed hardware without concern for reducing lifespan.
|
||||||
|
|
||||||
|
### What Validate tests actually do
|
||||||
|
|
||||||
|
| Test | What it runs |
|
||||||
|
|---|---|
|
||||||
|
| NVIDIA GPU | `nvidia-smi`, `dcgmi diag` (levels 1–4 read-only diagnostics) |
|
||||||
|
| Memory | `memtester` on a limited allocation; reads/writes to RAM only |
|
||||||
|
| Storage | `smartctl -a`, `nvme smart-log` — reads SMART data only |
|
||||||
|
| CPU | `stress-ng` for a bounded duration; CPU-only, no I/O |
|
||||||
|
| AMD GPU | `rocm-smi --showallinfo`, `dmidecode` — read-only queries |
|
||||||
|
|
||||||
|
## Burn Tests (hardware wear)
|
||||||
|
|
||||||
|
Tests on the **Burn** page run hardware at maximum or near-maximum load for extended durations. They:
|
||||||
|
|
||||||
|
- **Wear storage**: write-intensive patterns can reduce SSD endurance (P/E cycles).
|
||||||
|
- **Stress GPU memory**: extended ECC stress tests may surface latent defects but also exercise memory cells.
|
||||||
|
- **Accelerate thermal cycling**: repeated heat/cool cycles degrade solder joints and capacitors over time.
|
||||||
|
- **May increment wear counters**: GPU power-on hours, NVMe media wear indicator, and similar metrics can advance during a Burn run.
|
||||||
|
|
||||||
|
### Rule
|
||||||
|
|
||||||
|
> Run **Validate** freely on any server, at any time, before or after deployment.
|
||||||
|
> Run **Burn** only when explicitly required (e.g., initial acceptance after repair, or per customer SLA).
|
||||||
|
> Document when and why Burn tests were run.
|
||||||
@@ -48,6 +48,7 @@ sh iso/builder/build-in-container.sh --cache-dir /path/to/cache
|
|||||||
- The builder image is automatically rebuilt if the local tag exists for the wrong architecture.
|
- The builder image is automatically rebuilt if the local tag exists for the wrong architecture.
|
||||||
- The live ISO boots with Debian `live-boot` `toram`, so the read-only medium is copied into RAM during boot and the runtime no longer depends on the original USB/BMC virtual media staying present.
|
- The live ISO boots with Debian `live-boot` `toram`, so the read-only medium is copied into RAM during boot and the runtime no longer depends on the original USB/BMC virtual media staying present.
|
||||||
- Target systems need enough RAM for the full compressed live medium plus normal runtime overhead, or boot may fail before reaching the TUI.
|
- Target systems need enough RAM for the full compressed live medium plus normal runtime overhead, or boot may fail before reaching the TUI.
|
||||||
|
- The NVIDIA variant installs DCGM 4 packages matched to the CUDA user-mode driver major version. For driver branch `590` / CUDA `13.x`, the package family is `datacenter-gpu-manager-4-cuda13` rather than legacy `datacenter-gpu-manager`.
|
||||||
- Override the container platform only if you know why:
|
- Override the container platform only if you know why:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
|
|||||||
@@ -17,12 +17,23 @@ RUN apt-get update -qq && apt-get install -y \
|
|||||||
wget \
|
wget \
|
||||||
curl \
|
curl \
|
||||||
tar \
|
tar \
|
||||||
|
libarchive-tools \
|
||||||
xz-utils \
|
xz-utils \
|
||||||
rsync \
|
rsync \
|
||||||
build-essential \
|
build-essential \
|
||||||
gcc \
|
gcc \
|
||||||
make \
|
make \
|
||||||
perl \
|
perl \
|
||||||
|
pkg-config \
|
||||||
|
yasm \
|
||||||
|
libssl-dev \
|
||||||
|
zlib1g-dev \
|
||||||
|
libbz2-dev \
|
||||||
|
libgmp-dev \
|
||||||
|
libpcap-dev \
|
||||||
|
libsqlite3-dev \
|
||||||
|
libcurl4-openssl-dev \
|
||||||
|
ocl-icd-opencl-dev \
|
||||||
linux-headers-amd64 \
|
linux-headers-amd64 \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
|||||||
@@ -8,8 +8,16 @@ NCCL_TESTS_VERSION=2.13.10
|
|||||||
NVCC_VERSION=12.8
|
NVCC_VERSION=12.8
|
||||||
CUBLAS_VERSION=13.0.2.14-1
|
CUBLAS_VERSION=13.0.2.14-1
|
||||||
CUDA_USERSPACE_VERSION=13.0.96-1
|
CUDA_USERSPACE_VERSION=13.0.96-1
|
||||||
DCGM_VERSION=3.3.9
|
DCGM_VERSION=4.5.3-1
|
||||||
|
JOHN_JUMBO_COMMIT=67fcf9fe5a
|
||||||
ROCM_VERSION=6.3.4
|
ROCM_VERSION=6.3.4
|
||||||
ROCM_SMI_VERSION=7.4.0.60304-76~22.04
|
ROCM_SMI_VERSION=7.4.0.60304-76~22.04
|
||||||
|
ROCM_BANDWIDTH_TEST_VERSION=1.4.0.60304-76~22.04
|
||||||
|
ROCM_VALIDATION_SUITE_VERSION=1.1.0.60304-76~22.04
|
||||||
|
ROCBLAS_VERSION=4.3.0.60304-76~22.04
|
||||||
|
ROCRAND_VERSION=3.2.0.60304-76~22.04
|
||||||
|
HIP_RUNTIME_AMD_VERSION=6.3.42134.60304-76~22.04
|
||||||
|
HIPBLASLT_VERSION=0.10.0.60304-76~22.04
|
||||||
|
COMGR_VERSION=2.8.0.60304-76~22.04
|
||||||
GO_VERSION=1.24.0
|
GO_VERSION=1.24.0
|
||||||
AUDIT_VERSION=1.0.0
|
AUDIT_VERSION=1.0.0
|
||||||
|
|||||||
@@ -29,10 +29,10 @@ lb config noauto \
|
|||||||
--security true \
|
--security true \
|
||||||
--linux-flavours "amd64" \
|
--linux-flavours "amd64" \
|
||||||
--linux-packages "${LB_LINUX_PACKAGES}" \
|
--linux-packages "${LB_LINUX_PACKAGES}" \
|
||||||
--memtest none \
|
--memtest memtest86+ \
|
||||||
--iso-volume "EASY-BEE" \
|
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--iso-application "EASY-BEE" \
|
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--bootappend-live "boot=live components nomodeset video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||||
--apt-recommends false \
|
--apt-recommends false \
|
||||||
--chroot-squashfs-compression-type zstd \
|
--chroot-squashfs-compression-type zstd \
|
||||||
"${@}"
|
"${@}"
|
||||||
|
|||||||
@@ -29,8 +29,14 @@ typedef void *CUfunction;
|
|||||||
typedef void *CUstream;
|
typedef void *CUstream;
|
||||||
|
|
||||||
#define CU_SUCCESS 0
|
#define CU_SUCCESS 0
|
||||||
|
#define CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT 16
|
||||||
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75
|
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75
|
||||||
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76
|
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76
|
||||||
|
#define MAX_STRESS_STREAMS 16
|
||||||
|
#define MAX_CUBLAS_PROFILES 5
|
||||||
|
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
|
||||||
|
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
|
||||||
|
#define STRESS_LAUNCH_DEPTH 8
|
||||||
|
|
||||||
static const char *ptx_source =
|
static const char *ptx_source =
|
||||||
".version 6.0\n"
|
".version 6.0\n"
|
||||||
@@ -97,6 +103,9 @@ typedef CUresult (*cuLaunchKernel_fn)(CUfunction,
|
|||||||
CUstream,
|
CUstream,
|
||||||
void **,
|
void **,
|
||||||
void **);
|
void **);
|
||||||
|
typedef CUresult (*cuMemGetInfo_fn)(size_t *, size_t *);
|
||||||
|
typedef CUresult (*cuStreamCreate_fn)(CUstream *, unsigned int);
|
||||||
|
typedef CUresult (*cuStreamDestroy_fn)(CUstream);
|
||||||
typedef CUresult (*cuGetErrorName_fn)(CUresult, const char **);
|
typedef CUresult (*cuGetErrorName_fn)(CUresult, const char **);
|
||||||
typedef CUresult (*cuGetErrorString_fn)(CUresult, const char **);
|
typedef CUresult (*cuGetErrorString_fn)(CUresult, const char **);
|
||||||
|
|
||||||
@@ -118,6 +127,9 @@ struct cuda_api {
|
|||||||
cuModuleLoadDataEx_fn cuModuleLoadDataEx;
|
cuModuleLoadDataEx_fn cuModuleLoadDataEx;
|
||||||
cuModuleGetFunction_fn cuModuleGetFunction;
|
cuModuleGetFunction_fn cuModuleGetFunction;
|
||||||
cuLaunchKernel_fn cuLaunchKernel;
|
cuLaunchKernel_fn cuLaunchKernel;
|
||||||
|
cuMemGetInfo_fn cuMemGetInfo;
|
||||||
|
cuStreamCreate_fn cuStreamCreate;
|
||||||
|
cuStreamDestroy_fn cuStreamDestroy;
|
||||||
cuGetErrorName_fn cuGetErrorName;
|
cuGetErrorName_fn cuGetErrorName;
|
||||||
cuGetErrorString_fn cuGetErrorString;
|
cuGetErrorString_fn cuGetErrorString;
|
||||||
};
|
};
|
||||||
@@ -128,9 +140,10 @@ struct stress_report {
|
|||||||
int cc_major;
|
int cc_major;
|
||||||
int cc_minor;
|
int cc_minor;
|
||||||
int buffer_mb;
|
int buffer_mb;
|
||||||
|
int stream_count;
|
||||||
unsigned long iterations;
|
unsigned long iterations;
|
||||||
uint64_t checksum;
|
uint64_t checksum;
|
||||||
char details[1024];
|
char details[16384];
|
||||||
};
|
};
|
||||||
|
|
||||||
static int load_symbol(void *lib, const char *name, void **out) {
|
static int load_symbol(void *lib, const char *name, void **out) {
|
||||||
@@ -144,7 +157,7 @@ static int load_cuda(struct cuda_api *api) {
|
|||||||
if (!api->lib) {
|
if (!api->lib) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
return
|
if (!(
|
||||||
load_symbol(api->lib, "cuInit", (void **)&api->cuInit) &&
|
load_symbol(api->lib, "cuInit", (void **)&api->cuInit) &&
|
||||||
load_symbol(api->lib, "cuDeviceGetCount", (void **)&api->cuDeviceGetCount) &&
|
load_symbol(api->lib, "cuDeviceGetCount", (void **)&api->cuDeviceGetCount) &&
|
||||||
load_symbol(api->lib, "cuDeviceGet", (void **)&api->cuDeviceGet) &&
|
load_symbol(api->lib, "cuDeviceGet", (void **)&api->cuDeviceGet) &&
|
||||||
@@ -160,7 +173,17 @@ static int load_cuda(struct cuda_api *api) {
|
|||||||
load_symbol(api->lib, "cuMemcpyDtoH_v2", (void **)&api->cuMemcpyDtoH) &&
|
load_symbol(api->lib, "cuMemcpyDtoH_v2", (void **)&api->cuMemcpyDtoH) &&
|
||||||
load_symbol(api->lib, "cuModuleLoadDataEx", (void **)&api->cuModuleLoadDataEx) &&
|
load_symbol(api->lib, "cuModuleLoadDataEx", (void **)&api->cuModuleLoadDataEx) &&
|
||||||
load_symbol(api->lib, "cuModuleGetFunction", (void **)&api->cuModuleGetFunction) &&
|
load_symbol(api->lib, "cuModuleGetFunction", (void **)&api->cuModuleGetFunction) &&
|
||||||
load_symbol(api->lib, "cuLaunchKernel", (void **)&api->cuLaunchKernel);
|
load_symbol(api->lib, "cuLaunchKernel", (void **)&api->cuLaunchKernel))) {
|
||||||
|
dlclose(api->lib);
|
||||||
|
memset(api, 0, sizeof(*api));
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
load_symbol(api->lib, "cuMemGetInfo_v2", (void **)&api->cuMemGetInfo);
|
||||||
|
load_symbol(api->lib, "cuStreamCreate", (void **)&api->cuStreamCreate);
|
||||||
|
if (!load_symbol(api->lib, "cuStreamDestroy_v2", (void **)&api->cuStreamDestroy)) {
|
||||||
|
load_symbol(api->lib, "cuStreamDestroy", (void **)&api->cuStreamDestroy);
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
static const char *cu_error_name(struct cuda_api *api, CUresult rc) {
|
static const char *cu_error_name(struct cuda_api *api, CUresult rc) {
|
||||||
@@ -193,14 +216,12 @@ static double now_seconds(void) {
|
|||||||
return (double)ts.tv_sec + ((double)ts.tv_nsec / 1000000000.0);
|
return (double)ts.tv_sec + ((double)ts.tv_nsec / 1000000000.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if HAVE_CUBLASLT_HEADERS
|
|
||||||
static size_t round_down_size(size_t value, size_t multiple) {
|
static size_t round_down_size(size_t value, size_t multiple) {
|
||||||
if (multiple == 0 || value < multiple) {
|
if (multiple == 0 || value < multiple) {
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
return value - (value % multiple);
|
return value - (value % multiple);
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
static int query_compute_capability(struct cuda_api *api, CUdevice dev, int *major, int *minor) {
|
static int query_compute_capability(struct cuda_api *api, CUdevice dev, int *major, int *minor) {
|
||||||
int cc_major = 0;
|
int cc_major = 0;
|
||||||
@@ -220,6 +241,75 @@ static int query_compute_capability(struct cuda_api *api, CUdevice dev, int *maj
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int query_multiprocessor_count(struct cuda_api *api, CUdevice dev, int *count) {
|
||||||
|
int mp_count = 0;
|
||||||
|
if (!check_rc(api,
|
||||||
|
"cuDeviceGetAttribute(multiprocessors)",
|
||||||
|
api->cuDeviceGetAttribute(&mp_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev))) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
*count = mp_count;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static size_t clamp_budget_to_free_memory(struct cuda_api *api, size_t requested_bytes) {
|
||||||
|
size_t free_bytes = 0;
|
||||||
|
size_t total_bytes = 0;
|
||||||
|
size_t max_bytes = requested_bytes;
|
||||||
|
|
||||||
|
if (!api->cuMemGetInfo) {
|
||||||
|
return requested_bytes;
|
||||||
|
}
|
||||||
|
if (api->cuMemGetInfo(&free_bytes, &total_bytes) != CU_SUCCESS || free_bytes == 0) {
|
||||||
|
return requested_bytes;
|
||||||
|
}
|
||||||
|
|
||||||
|
max_bytes = (free_bytes * 9u) / 10u;
|
||||||
|
if (max_bytes < (size_t)4u * 1024u * 1024u) {
|
||||||
|
max_bytes = (size_t)4u * 1024u * 1024u;
|
||||||
|
}
|
||||||
|
if (requested_bytes > max_bytes) {
|
||||||
|
return max_bytes;
|
||||||
|
}
|
||||||
|
return requested_bytes;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int choose_stream_count(int mp_count, int planned_profiles, size_t total_budget, int have_streams) {
|
||||||
|
int stream_count = 1;
|
||||||
|
if (!have_streams || mp_count <= 0 || planned_profiles <= 0) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
stream_count = mp_count / 8;
|
||||||
|
if (stream_count < 2) {
|
||||||
|
stream_count = 2;
|
||||||
|
}
|
||||||
|
if (stream_count > MAX_STRESS_STREAMS) {
|
||||||
|
stream_count = MAX_STRESS_STREAMS;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (stream_count > 1) {
|
||||||
|
size_t per_stream_budget = total_budget / ((size_t)planned_profiles * (size_t)stream_count);
|
||||||
|
if (per_stream_budget >= MIN_STREAM_BUDGET_BYTES) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
stream_count--;
|
||||||
|
}
|
||||||
|
return stream_count;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) {
|
||||||
|
if (!api->cuStreamDestroy) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
for (int i = 0; i < count; i++) {
|
||||||
|
if (streams[i]) {
|
||||||
|
api->cuStreamDestroy(streams[i]);
|
||||||
|
streams[i] = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#if HAVE_CUBLASLT_HEADERS
|
#if HAVE_CUBLASLT_HEADERS
|
||||||
static void append_detail(char *buf, size_t cap, const char *fmt, ...) {
|
static void append_detail(char *buf, size_t cap, const char *fmt, ...) {
|
||||||
size_t len = strlen(buf);
|
size_t len = strlen(buf);
|
||||||
@@ -242,12 +332,19 @@ static int run_ptx_fallback(struct cuda_api *api,
|
|||||||
int size_mb,
|
int size_mb,
|
||||||
struct stress_report *report) {
|
struct stress_report *report) {
|
||||||
CUcontext ctx = NULL;
|
CUcontext ctx = NULL;
|
||||||
CUdeviceptr device_mem = 0;
|
|
||||||
CUmodule module = NULL;
|
CUmodule module = NULL;
|
||||||
CUfunction kernel = NULL;
|
CUfunction kernel = NULL;
|
||||||
uint32_t sample[256];
|
uint32_t sample[256];
|
||||||
uint32_t words = 0;
|
CUdeviceptr device_mem[MAX_STRESS_STREAMS] = {0};
|
||||||
|
CUstream streams[MAX_STRESS_STREAMS] = {0};
|
||||||
|
uint32_t words[MAX_STRESS_STREAMS] = {0};
|
||||||
|
uint32_t rounds[MAX_STRESS_STREAMS] = {0};
|
||||||
|
void *params[MAX_STRESS_STREAMS][3];
|
||||||
|
size_t bytes_per_stream[MAX_STRESS_STREAMS] = {0};
|
||||||
unsigned long iterations = 0;
|
unsigned long iterations = 0;
|
||||||
|
int mp_count = 0;
|
||||||
|
int stream_count = 1;
|
||||||
|
int launches_per_wave = 0;
|
||||||
|
|
||||||
memset(report, 0, sizeof(*report));
|
memset(report, 0, sizeof(*report));
|
||||||
snprintf(report->backend, sizeof(report->backend), "driver-ptx");
|
snprintf(report->backend, sizeof(report->backend), "driver-ptx");
|
||||||
@@ -260,64 +357,109 @@ static int run_ptx_fallback(struct cuda_api *api,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t bytes = (size_t)size_mb * 1024u * 1024u;
|
size_t requested_bytes = (size_t)size_mb * 1024u * 1024u;
|
||||||
if (bytes < 4u * 1024u * 1024u) {
|
if (requested_bytes < MIN_PROFILE_BUDGET_BYTES) {
|
||||||
bytes = 4u * 1024u * 1024u;
|
requested_bytes = MIN_PROFILE_BUDGET_BYTES;
|
||||||
}
|
}
|
||||||
if (bytes > (size_t)1024u * 1024u * 1024u) {
|
size_t total_bytes = clamp_budget_to_free_memory(api, requested_bytes);
|
||||||
bytes = (size_t)1024u * 1024u * 1024u;
|
if (total_bytes < MIN_PROFILE_BUDGET_BYTES) {
|
||||||
|
total_bytes = MIN_PROFILE_BUDGET_BYTES;
|
||||||
}
|
}
|
||||||
words = (uint32_t)(bytes / sizeof(uint32_t));
|
report->buffer_mb = (int)(total_bytes / (1024u * 1024u));
|
||||||
|
|
||||||
if (!check_rc(api, "cuMemAlloc", api->cuMemAlloc(&device_mem, bytes))) {
|
if (query_multiprocessor_count(api, dev, &mp_count) &&
|
||||||
api->cuCtxDestroy(ctx);
|
api->cuStreamCreate &&
|
||||||
return 0;
|
api->cuStreamDestroy) {
|
||||||
|
stream_count = choose_stream_count(mp_count, 1, total_bytes, 1);
|
||||||
}
|
}
|
||||||
if (!check_rc(api, "cuMemsetD8", api->cuMemsetD8(device_mem, 0, bytes))) {
|
if (stream_count > 1) {
|
||||||
api->cuMemFree(device_mem);
|
int created = 0;
|
||||||
api->cuCtxDestroy(ctx);
|
for (; created < stream_count; created++) {
|
||||||
return 0;
|
if (!check_rc(api, "cuStreamCreate", api->cuStreamCreate(&streams[created], 0))) {
|
||||||
|
destroy_streams(api, streams, created);
|
||||||
|
stream_count = 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
report->stream_count = stream_count;
|
||||||
|
|
||||||
|
for (int lane = 0; lane < stream_count; lane++) {
|
||||||
|
size_t slice = total_bytes / (size_t)stream_count;
|
||||||
|
if (lane == stream_count - 1) {
|
||||||
|
slice = total_bytes - ((size_t)lane * (total_bytes / (size_t)stream_count));
|
||||||
|
}
|
||||||
|
slice = round_down_size(slice, sizeof(uint32_t));
|
||||||
|
if (slice < MIN_PROFILE_BUDGET_BYTES) {
|
||||||
|
slice = MIN_PROFILE_BUDGET_BYTES;
|
||||||
|
}
|
||||||
|
bytes_per_stream[lane] = slice;
|
||||||
|
words[lane] = (uint32_t)(slice / sizeof(uint32_t));
|
||||||
|
|
||||||
|
if (!check_rc(api, "cuMemAlloc", api->cuMemAlloc(&device_mem[lane], slice))) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
if (!check_rc(api, "cuMemsetD8", api->cuMemsetD8(device_mem[lane], 0, slice))) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
rounds[lane] = 2048;
|
||||||
|
params[lane][0] = &device_mem[lane];
|
||||||
|
params[lane][1] = &words[lane];
|
||||||
|
params[lane][2] = &rounds[lane];
|
||||||
|
}
|
||||||
|
|
||||||
if (!check_rc(api,
|
if (!check_rc(api,
|
||||||
"cuModuleLoadDataEx",
|
"cuModuleLoadDataEx",
|
||||||
api->cuModuleLoadDataEx(&module, ptx_source, 0, NULL, NULL))) {
|
api->cuModuleLoadDataEx(&module, ptx_source, 0, NULL, NULL))) {
|
||||||
api->cuMemFree(device_mem);
|
goto fail;
|
||||||
api->cuCtxDestroy(ctx);
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
if (!check_rc(api, "cuModuleGetFunction", api->cuModuleGetFunction(&kernel, module, "burn"))) {
|
if (!check_rc(api, "cuModuleGetFunction", api->cuModuleGetFunction(&kernel, module, "burn"))) {
|
||||||
api->cuMemFree(device_mem);
|
goto fail;
|
||||||
api->cuCtxDestroy(ctx);
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned int threads = 256;
|
unsigned int threads = 256;
|
||||||
unsigned int blocks = (unsigned int)((words + threads - 1) / threads);
|
|
||||||
uint32_t rounds = 1024;
|
|
||||||
void *params[] = {&device_mem, &words, &rounds};
|
|
||||||
|
|
||||||
double start = now_seconds();
|
double start = now_seconds();
|
||||||
double deadline = start + (double)seconds;
|
double deadline = start + (double)seconds;
|
||||||
while (now_seconds() < deadline) {
|
while (now_seconds() < deadline) {
|
||||||
if (!check_rc(api,
|
launches_per_wave = 0;
|
||||||
"cuLaunchKernel",
|
for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
|
||||||
api->cuLaunchKernel(kernel, blocks, 1, 1, threads, 1, 1, 0, NULL, params, NULL))) {
|
int launched_this_batch = 0;
|
||||||
api->cuMemFree(device_mem);
|
for (int lane = 0; lane < stream_count; lane++) {
|
||||||
api->cuCtxDestroy(ctx);
|
unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
|
||||||
return 0;
|
if (!check_rc(api,
|
||||||
|
"cuLaunchKernel",
|
||||||
|
api->cuLaunchKernel(kernel,
|
||||||
|
blocks,
|
||||||
|
1,
|
||||||
|
1,
|
||||||
|
threads,
|
||||||
|
1,
|
||||||
|
1,
|
||||||
|
0,
|
||||||
|
streams[lane],
|
||||||
|
params[lane],
|
||||||
|
NULL))) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
launches_per_wave++;
|
||||||
|
launched_this_batch++;
|
||||||
|
}
|
||||||
|
if (launched_this_batch <= 0) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
iterations++;
|
if (launches_per_wave <= 0) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
iterations += (unsigned long)launches_per_wave;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
|
if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem[0], sizeof(sample)))) {
|
||||||
api->cuMemFree(device_mem);
|
goto fail;
|
||||||
api->cuCtxDestroy(ctx);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem, sizeof(sample)))) {
|
|
||||||
api->cuMemFree(device_mem);
|
|
||||||
api->cuCtxDestroy(ctx);
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < sizeof(sample) / sizeof(sample[0]); i++) {
|
for (size_t i = 0; i < sizeof(sample) / sizeof(sample[0]); i++) {
|
||||||
@@ -326,12 +468,34 @@ static int run_ptx_fallback(struct cuda_api *api,
|
|||||||
report->iterations = iterations;
|
report->iterations = iterations;
|
||||||
snprintf(report->details,
|
snprintf(report->details,
|
||||||
sizeof(report->details),
|
sizeof(report->details),
|
||||||
"profile_int32_fallback=OK iterations=%lu\n",
|
"fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d queue_depth=%d per_stream_mb=%zu iterations=%lu\n",
|
||||||
|
size_mb,
|
||||||
|
report->buffer_mb,
|
||||||
|
report->stream_count,
|
||||||
|
STRESS_LAUNCH_DEPTH,
|
||||||
|
bytes_per_stream[0] / (1024u * 1024u),
|
||||||
iterations);
|
iterations);
|
||||||
|
|
||||||
api->cuMemFree(device_mem);
|
for (int lane = 0; lane < stream_count; lane++) {
|
||||||
|
if (device_mem[lane]) {
|
||||||
|
api->cuMemFree(device_mem[lane]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
destroy_streams(api, streams, stream_count);
|
||||||
api->cuCtxDestroy(ctx);
|
api->cuCtxDestroy(ctx);
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
|
fail:
|
||||||
|
for (int lane = 0; lane < MAX_STRESS_STREAMS; lane++) {
|
||||||
|
if (device_mem[lane]) {
|
||||||
|
api->cuMemFree(device_mem[lane]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
destroy_streams(api, streams, MAX_STRESS_STREAMS);
|
||||||
|
if (ctx) {
|
||||||
|
api->cuCtxDestroy(ctx);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if HAVE_CUBLASLT_HEADERS
|
#if HAVE_CUBLASLT_HEADERS
|
||||||
@@ -418,6 +582,7 @@ struct profile_desc {
|
|||||||
|
|
||||||
struct prepared_profile {
|
struct prepared_profile {
|
||||||
struct profile_desc desc;
|
struct profile_desc desc;
|
||||||
|
CUstream stream;
|
||||||
cublasLtMatmulDesc_t op_desc;
|
cublasLtMatmulDesc_t op_desc;
|
||||||
cublasLtMatrixLayout_t a_layout;
|
cublasLtMatrixLayout_t a_layout;
|
||||||
cublasLtMatrixLayout_t b_layout;
|
cublasLtMatrixLayout_t b_layout;
|
||||||
@@ -617,8 +782,8 @@ static uint64_t choose_square_dim(size_t budget_bytes, size_t bytes_per_cell, in
|
|||||||
if (dim < (uint64_t)multiple) {
|
if (dim < (uint64_t)multiple) {
|
||||||
dim = (uint64_t)multiple;
|
dim = (uint64_t)multiple;
|
||||||
}
|
}
|
||||||
if (dim > 8192u) {
|
if (dim > 65536u) {
|
||||||
dim = 8192u;
|
dim = 65536u;
|
||||||
}
|
}
|
||||||
return dim;
|
return dim;
|
||||||
}
|
}
|
||||||
@@ -704,10 +869,12 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
|||||||
cublasLtHandle_t handle,
|
cublasLtHandle_t handle,
|
||||||
struct cuda_api *cuda,
|
struct cuda_api *cuda,
|
||||||
const struct profile_desc *desc,
|
const struct profile_desc *desc,
|
||||||
|
CUstream stream,
|
||||||
size_t profile_budget_bytes,
|
size_t profile_budget_bytes,
|
||||||
struct prepared_profile *out) {
|
struct prepared_profile *out) {
|
||||||
memset(out, 0, sizeof(*out));
|
memset(out, 0, sizeof(*out));
|
||||||
out->desc = *desc;
|
out->desc = *desc;
|
||||||
|
out->stream = stream;
|
||||||
|
|
||||||
size_t bytes_per_cell = 0;
|
size_t bytes_per_cell = 0;
|
||||||
bytes_per_cell += bytes_for_elements(desc->a_type, 1);
|
bytes_per_cell += bytes_for_elements(desc->a_type, 1);
|
||||||
@@ -935,7 +1102,7 @@ static int run_cublas_profile(cublasLtHandle_t handle,
|
|||||||
&profile->heuristic.algo,
|
&profile->heuristic.algo,
|
||||||
(void *)(uintptr_t)profile->workspace_dev,
|
(void *)(uintptr_t)profile->workspace_dev,
|
||||||
profile->workspace_size,
|
profile->workspace_size,
|
||||||
(cudaStream_t)0));
|
profile->stream));
|
||||||
}
|
}
|
||||||
|
|
||||||
static int run_cublaslt_stress(struct cuda_api *cuda,
|
static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||||
@@ -947,13 +1114,22 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
int size_mb,
|
int size_mb,
|
||||||
struct stress_report *report) {
|
struct stress_report *report) {
|
||||||
struct cublaslt_api cublas;
|
struct cublaslt_api cublas;
|
||||||
struct prepared_profile prepared[sizeof(k_profiles) / sizeof(k_profiles[0])];
|
struct prepared_profile prepared[MAX_STRESS_STREAMS * MAX_CUBLAS_PROFILES];
|
||||||
cublasLtHandle_t handle = NULL;
|
cublasLtHandle_t handle = NULL;
|
||||||
CUcontext ctx = NULL;
|
CUcontext ctx = NULL;
|
||||||
|
CUstream streams[MAX_STRESS_STREAMS] = {0};
|
||||||
uint16_t sample[256];
|
uint16_t sample[256];
|
||||||
int cc = cc_major * 10 + cc_minor;
|
int cc = cc_major * 10 + cc_minor;
|
||||||
int planned = 0;
|
int planned = 0;
|
||||||
int active = 0;
|
int active = 0;
|
||||||
|
int mp_count = 0;
|
||||||
|
int stream_count = 1;
|
||||||
|
int profile_count = (int)(sizeof(k_profiles) / sizeof(k_profiles[0]));
|
||||||
|
int prepared_count = 0;
|
||||||
|
int wave_launches = 0;
|
||||||
|
size_t requested_budget = 0;
|
||||||
|
size_t total_budget = 0;
|
||||||
|
size_t per_profile_budget = 0;
|
||||||
|
|
||||||
memset(report, 0, sizeof(*report));
|
memset(report, 0, sizeof(*report));
|
||||||
snprintf(report->backend, sizeof(report->backend), "cublasLt");
|
snprintf(report->backend, sizeof(report->backend), "cublasLt");
|
||||||
@@ -986,16 +1162,46 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t total_budget = (size_t)size_mb * 1024u * 1024u;
|
requested_budget = (size_t)size_mb * 1024u * 1024u;
|
||||||
if (total_budget < (size_t)planned * 4u * 1024u * 1024u) {
|
if (requested_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
|
||||||
total_budget = (size_t)planned * 4u * 1024u * 1024u;
|
requested_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
|
||||||
}
|
}
|
||||||
size_t per_profile_budget = total_budget / (size_t)planned;
|
total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
|
||||||
if (per_profile_budget < 4u * 1024u * 1024u) {
|
if (total_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
|
||||||
per_profile_budget = 4u * 1024u * 1024u;
|
total_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
|
||||||
}
|
}
|
||||||
|
if (query_multiprocessor_count(cuda, dev, &mp_count) &&
|
||||||
|
cuda->cuStreamCreate &&
|
||||||
|
cuda->cuStreamDestroy) {
|
||||||
|
stream_count = choose_stream_count(mp_count, planned, total_budget, 1);
|
||||||
|
}
|
||||||
|
if (stream_count > 1) {
|
||||||
|
int created = 0;
|
||||||
|
for (; created < stream_count; created++) {
|
||||||
|
if (!check_rc(cuda, "cuStreamCreate", cuda->cuStreamCreate(&streams[created], 0))) {
|
||||||
|
destroy_streams(cuda, streams, created);
|
||||||
|
stream_count = 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
report->stream_count = stream_count;
|
||||||
|
per_profile_budget = total_budget / ((size_t)planned * (size_t)stream_count);
|
||||||
|
if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
|
||||||
|
per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
|
||||||
|
}
|
||||||
|
report->buffer_mb = (int)(total_budget / (1024u * 1024u));
|
||||||
|
append_detail(report->details,
|
||||||
|
sizeof(report->details),
|
||||||
|
"requested_mb=%d actual_mb=%d streams=%d queue_depth=%d mp_count=%d per_worker_mb=%zu\n",
|
||||||
|
size_mb,
|
||||||
|
report->buffer_mb,
|
||||||
|
report->stream_count,
|
||||||
|
STRESS_LAUNCH_DEPTH,
|
||||||
|
mp_count,
|
||||||
|
per_profile_budget / (1024u * 1024u));
|
||||||
|
|
||||||
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
for (int i = 0; i < profile_count; i++) {
|
||||||
const struct profile_desc *desc = &k_profiles[i];
|
const struct profile_desc *desc = &k_profiles[i];
|
||||||
if (!(desc->enabled && cc >= desc->min_cc)) {
|
if (!(desc->enabled && cc >= desc->min_cc)) {
|
||||||
append_detail(report->details,
|
append_detail(report->details,
|
||||||
@@ -1005,63 +1211,87 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
desc->min_cc);
|
desc->min_cc);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (prepare_profile(&cublas, handle, cuda, desc, per_profile_budget, &prepared[i])) {
|
for (int lane = 0; lane < stream_count; lane++) {
|
||||||
active++;
|
CUstream stream = streams[lane];
|
||||||
append_detail(report->details,
|
if (prepared_count >= (int)(sizeof(prepared) / sizeof(prepared[0]))) {
|
||||||
sizeof(report->details),
|
break;
|
||||||
"%s=READY dim=%llux%llux%llu block=%s\n",
|
}
|
||||||
desc->name,
|
if (prepare_profile(&cublas, handle, cuda, desc, stream, per_profile_budget, &prepared[prepared_count])) {
|
||||||
(unsigned long long)prepared[i].m,
|
active++;
|
||||||
(unsigned long long)prepared[i].n,
|
append_detail(report->details,
|
||||||
(unsigned long long)prepared[i].k,
|
sizeof(report->details),
|
||||||
desc->block_label);
|
"%s[%d]=READY dim=%llux%llux%llu block=%s stream=%d\n",
|
||||||
} else {
|
desc->name,
|
||||||
append_detail(report->details, sizeof(report->details), "%s=SKIPPED unsupported\n", desc->name);
|
lane,
|
||||||
|
(unsigned long long)prepared[prepared_count].m,
|
||||||
|
(unsigned long long)prepared[prepared_count].n,
|
||||||
|
(unsigned long long)prepared[prepared_count].k,
|
||||||
|
desc->block_label,
|
||||||
|
lane);
|
||||||
|
prepared_count++;
|
||||||
|
} else {
|
||||||
|
append_detail(report->details,
|
||||||
|
sizeof(report->details),
|
||||||
|
"%s[%d]=SKIPPED unsupported\n",
|
||||||
|
desc->name,
|
||||||
|
lane);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (active <= 0) {
|
if (active <= 0) {
|
||||||
cublas.cublasLtDestroy(handle);
|
cublas.cublasLtDestroy(handle);
|
||||||
|
destroy_streams(cuda, streams, stream_count);
|
||||||
cuda->cuCtxDestroy(ctx);
|
cuda->cuCtxDestroy(ctx);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
double deadline = now_seconds() + (double)seconds;
|
double deadline = now_seconds() + (double)seconds;
|
||||||
while (now_seconds() < deadline) {
|
while (now_seconds() < deadline) {
|
||||||
for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
|
wave_launches = 0;
|
||||||
if (!prepared[i].ready) {
|
for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
|
||||||
continue;
|
int launched_this_batch = 0;
|
||||||
}
|
for (int i = 0; i < prepared_count; i++) {
|
||||||
if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
|
if (!prepared[i].ready) {
|
||||||
append_detail(report->details,
|
continue;
|
||||||
sizeof(report->details),
|
|
||||||
"%s=FAILED runtime\n",
|
|
||||||
prepared[i].desc.name);
|
|
||||||
for (size_t j = 0; j < sizeof(prepared) / sizeof(prepared[0]); j++) {
|
|
||||||
destroy_profile(&cublas, cuda, &prepared[j]);
|
|
||||||
}
|
}
|
||||||
cublas.cublasLtDestroy(handle);
|
if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
|
||||||
cuda->cuCtxDestroy(ctx);
|
append_detail(report->details,
|
||||||
return 0;
|
sizeof(report->details),
|
||||||
|
"%s=FAILED runtime\n",
|
||||||
|
prepared[i].desc.name);
|
||||||
|
for (int j = 0; j < prepared_count; j++) {
|
||||||
|
destroy_profile(&cublas, cuda, &prepared[j]);
|
||||||
|
}
|
||||||
|
cublas.cublasLtDestroy(handle);
|
||||||
|
destroy_streams(cuda, streams, stream_count);
|
||||||
|
cuda->cuCtxDestroy(ctx);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
prepared[i].iterations++;
|
||||||
|
report->iterations++;
|
||||||
|
wave_launches++;
|
||||||
|
launched_this_batch++;
|
||||||
}
|
}
|
||||||
prepared[i].iterations++;
|
if (launched_this_batch <= 0) {
|
||||||
report->iterations++;
|
|
||||||
if (now_seconds() >= deadline) {
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
if (wave_launches <= 0) {
|
||||||
|
break;
|
||||||
if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
|
}
|
||||||
for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
|
if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
|
||||||
destroy_profile(&cublas, cuda, &prepared[i]);
|
for (int i = 0; i < prepared_count; i++) {
|
||||||
|
destroy_profile(&cublas, cuda, &prepared[i]);
|
||||||
|
}
|
||||||
|
cublas.cublasLtDestroy(handle);
|
||||||
|
destroy_streams(cuda, streams, stream_count);
|
||||||
|
cuda->cuCtxDestroy(ctx);
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
cublas.cublasLtDestroy(handle);
|
|
||||||
cuda->cuCtxDestroy(ctx);
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
|
for (int i = 0; i < prepared_count; i++) {
|
||||||
if (!prepared[i].ready) {
|
if (!prepared[i].ready) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -1072,7 +1302,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
prepared[i].iterations);
|
prepared[i].iterations);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
|
for (int i = 0; i < prepared_count; i++) {
|
||||||
if (prepared[i].ready) {
|
if (prepared[i].ready) {
|
||||||
if (check_rc(cuda, "cuMemcpyDtoH", cuda->cuMemcpyDtoH(sample, prepared[i].d_dev, sizeof(sample)))) {
|
if (check_rc(cuda, "cuMemcpyDtoH", cuda->cuMemcpyDtoH(sample, prepared[i].d_dev, sizeof(sample)))) {
|
||||||
for (size_t j = 0; j < sizeof(sample) / sizeof(sample[0]); j++) {
|
for (size_t j = 0; j < sizeof(sample) / sizeof(sample[0]); j++) {
|
||||||
@@ -1083,10 +1313,11 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
|
for (int i = 0; i < prepared_count; i++) {
|
||||||
destroy_profile(&cublas, cuda, &prepared[i]);
|
destroy_profile(&cublas, cuda, &prepared[i]);
|
||||||
}
|
}
|
||||||
cublas.cublasLtDestroy(handle);
|
cublas.cublasLtDestroy(handle);
|
||||||
|
destroy_streams(cuda, streams, stream_count);
|
||||||
cuda->cuCtxDestroy(ctx);
|
cuda->cuCtxDestroy(ctx);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
@@ -1095,13 +1326,16 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
int seconds = 5;
|
int seconds = 5;
|
||||||
int size_mb = 64;
|
int size_mb = 64;
|
||||||
|
int device_index = 0;
|
||||||
for (int i = 1; i < argc; i++) {
|
for (int i = 1; i < argc; i++) {
|
||||||
if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
|
if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
|
||||||
seconds = atoi(argv[++i]);
|
seconds = atoi(argv[++i]);
|
||||||
} else if ((strcmp(argv[i], "--size-mb") == 0 || strcmp(argv[i], "-m") == 0) && i + 1 < argc) {
|
} else if ((strcmp(argv[i], "--size-mb") == 0 || strcmp(argv[i], "-m") == 0) && i + 1 < argc) {
|
||||||
size_mb = atoi(argv[++i]);
|
size_mb = atoi(argv[++i]);
|
||||||
|
} else if ((strcmp(argv[i], "--device") == 0 || strcmp(argv[i], "-d") == 0) && i + 1 < argc) {
|
||||||
|
device_index = atoi(argv[++i]);
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr, "usage: %s [--seconds N] [--size-mb N]\n", argv[0]);
|
fprintf(stderr, "usage: %s [--seconds N] [--size-mb N] [--device N]\n", argv[0]);
|
||||||
return 2;
|
return 2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1111,6 +1345,9 @@ int main(int argc, char **argv) {
|
|||||||
if (size_mb <= 0) {
|
if (size_mb <= 0) {
|
||||||
size_mb = 64;
|
size_mb = 64;
|
||||||
}
|
}
|
||||||
|
if (device_index < 0) {
|
||||||
|
device_index = 0;
|
||||||
|
}
|
||||||
|
|
||||||
struct cuda_api cuda;
|
struct cuda_api cuda;
|
||||||
if (!load_cuda(&cuda)) {
|
if (!load_cuda(&cuda)) {
|
||||||
@@ -1133,8 +1370,13 @@ int main(int argc, char **argv) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (device_index >= count) {
|
||||||
|
fprintf(stderr, "device index %d out of range (found %d CUDA device(s))\n", device_index, count);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
CUdevice dev = 0;
|
CUdevice dev = 0;
|
||||||
if (!check_rc(&cuda, "cuDeviceGet", cuda.cuDeviceGet(&dev, 0))) {
|
if (!check_rc(&cuda, "cuDeviceGet", cuda.cuDeviceGet(&dev, device_index))) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1162,10 +1404,12 @@ int main(int argc, char **argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
printf("device=%s\n", report.device);
|
printf("device=%s\n", report.device);
|
||||||
|
printf("device_index=%d\n", device_index);
|
||||||
printf("compute_capability=%d.%d\n", report.cc_major, report.cc_minor);
|
printf("compute_capability=%d.%d\n", report.cc_major, report.cc_minor);
|
||||||
printf("backend=%s\n", report.backend);
|
printf("backend=%s\n", report.backend);
|
||||||
printf("duration_s=%d\n", seconds);
|
printf("duration_s=%d\n", seconds);
|
||||||
printf("buffer_mb=%d\n", report.buffer_mb);
|
printf("buffer_mb=%d\n", report.buffer_mb);
|
||||||
|
printf("streams=%d\n", report.stream_count);
|
||||||
printf("iterations=%lu\n", report.iterations);
|
printf("iterations=%lu\n", report.iterations);
|
||||||
printf("checksum=%llu\n", (unsigned long long)report.checksum);
|
printf("checksum=%llu\n", (unsigned long long)report.checksum);
|
||||||
if (report.details[0] != '\0') {
|
if (report.details[0] != '\0') {
|
||||||
|
|||||||
@@ -1,9 +1,9 @@
|
|||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
# build-cublas.sh — download cuBLASLt/cuBLAS/cudart runtime + headers for bee-gpu-stress.
|
# build-cublas.sh — download cuBLASLt/cuBLAS/cudart runtime + headers for bee-gpu-burn worker.
|
||||||
#
|
#
|
||||||
# Downloads .deb packages from NVIDIA's CUDA apt repository (Debian 12, x86_64),
|
# Downloads .deb packages from NVIDIA's CUDA apt repository (Debian 12, x86_64),
|
||||||
# verifies them against Packages.gz, and extracts the small subset we need:
|
# verifies them against Packages.gz, and extracts the small subset we need:
|
||||||
# - headers for compiling bee-gpu-stress against cuBLASLt
|
# - headers for compiling bee-gpu-burn worker against cuBLASLt
|
||||||
# - runtime libs for libcublas, libcublasLt, libcudart inside the ISO
|
# - runtime libs for libcublas, libcublasLt, libcudart inside the ISO
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}"
|
|||||||
AUTH_KEYS=""
|
AUTH_KEYS=""
|
||||||
REBUILD_IMAGE=0
|
REBUILD_IMAGE=0
|
||||||
CLEAN_CACHE=0
|
CLEAN_CACHE=0
|
||||||
|
VARIANT="all"
|
||||||
|
|
||||||
. "${BUILDER_DIR}/VERSIONS"
|
. "${BUILDER_DIR}/VERSIONS"
|
||||||
|
|
||||||
@@ -34,14 +35,23 @@ while [ $# -gt 0 ]; do
|
|||||||
REBUILD_IMAGE=1
|
REBUILD_IMAGE=1
|
||||||
shift
|
shift
|
||||||
;;
|
;;
|
||||||
|
--variant)
|
||||||
|
VARIANT="$2"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
*)
|
*)
|
||||||
echo "unknown arg: $1" >&2
|
echo "unknown arg: $1" >&2
|
||||||
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys]" >&2
|
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|amd|all]" >&2
|
||||||
exit 1
|
exit 1
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
|
||||||
|
case "$VARIANT" in
|
||||||
|
nvidia|amd|nogpu|all) ;;
|
||||||
|
*) echo "unknown variant: $VARIANT (expected nvidia, amd, nogpu, or all)" >&2; exit 1 ;;
|
||||||
|
esac
|
||||||
|
|
||||||
if [ "$CLEAN_CACHE" = "1" ]; then
|
if [ "$CLEAN_CACHE" = "1" ]; then
|
||||||
echo "=== cleaning build cache: ${CACHE_DIR} ==="
|
echo "=== cleaning build cache: ${CACHE_DIR} ==="
|
||||||
rm -rf "${CACHE_DIR:?}/go-build" \
|
rm -rf "${CACHE_DIR:?}/go-build" \
|
||||||
@@ -49,8 +59,10 @@ if [ "$CLEAN_CACHE" = "1" ]; then
|
|||||||
"${CACHE_DIR:?}/tmp" \
|
"${CACHE_DIR:?}/tmp" \
|
||||||
"${CACHE_DIR:?}/bee" \
|
"${CACHE_DIR:?}/bee" \
|
||||||
"${CACHE_DIR:?}/lb-packages"
|
"${CACHE_DIR:?}/lb-packages"
|
||||||
echo "=== cleaning live-build work dir: ${REPO_ROOT}/dist/live-build-work ==="
|
echo "=== cleaning live-build work dirs ==="
|
||||||
rm -rf "${REPO_ROOT}/dist/live-build-work"
|
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/live-build-work-nogpu"
|
||||||
echo "=== caches cleared, proceeding with build ==="
|
echo "=== caches cleared, proceeding with build ==="
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -108,34 +120,75 @@ else
|
|||||||
echo "=== using existing builder image ${IMAGE_REF} (${BUILDER_PLATFORM}) ==="
|
echo "=== using existing builder image ${IMAGE_REF} (${BUILDER_PLATFORM}) ==="
|
||||||
fi
|
fi
|
||||||
|
|
||||||
set -- \
|
# Build base docker run args (without --authorized-keys)
|
||||||
run --rm --privileged \
|
build_run_args() {
|
||||||
--platform "${BUILDER_PLATFORM}" \
|
_variant="$1"
|
||||||
-v "${REPO_ROOT}:/work" \
|
_auth_arg=""
|
||||||
-v "${CACHE_DIR}:/cache" \
|
if [ -n "$AUTH_KEYS" ]; then
|
||||||
-e BEE_CONTAINER_BUILD=1 \
|
_auth_arg="--authorized-keys /tmp/bee-authkeys/${AUTH_KEYS_BASE}"
|
||||||
-e GOCACHE=/cache/go-build \
|
fi
|
||||||
-e GOMODCACHE=/cache/go-mod \
|
echo "run --rm --privileged \
|
||||||
-e TMPDIR=/cache/tmp \
|
--platform ${BUILDER_PLATFORM} \
|
||||||
-e BEE_CACHE_DIR=/cache/bee \
|
-v ${REPO_ROOT}:/work \
|
||||||
-w /work \
|
-v ${CACHE_DIR}:/cache \
|
||||||
"${IMAGE_REF}" \
|
${AUTH_KEYS:+-v ${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro} \
|
||||||
sh /work/iso/builder/build.sh
|
|
||||||
|
|
||||||
if [ -n "$AUTH_KEYS" ]; then
|
|
||||||
set -- run --rm --privileged \
|
|
||||||
--platform "${BUILDER_PLATFORM}" \
|
|
||||||
-v "${REPO_ROOT}:/work" \
|
|
||||||
-v "${CACHE_DIR}:/cache" \
|
|
||||||
-v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \
|
|
||||||
-e BEE_CONTAINER_BUILD=1 \
|
-e BEE_CONTAINER_BUILD=1 \
|
||||||
-e GOCACHE=/cache/go-build \
|
-e GOCACHE=/cache/go-build \
|
||||||
-e GOMODCACHE=/cache/go-mod \
|
-e GOMODCACHE=/cache/go-mod \
|
||||||
-e TMPDIR=/cache/tmp \
|
-e TMPDIR=/cache/tmp \
|
||||||
-e BEE_CACHE_DIR=/cache/bee \
|
-e BEE_CACHE_DIR=/cache/bee \
|
||||||
-w /work \
|
-w /work \
|
||||||
"${IMAGE_REF}" \
|
${IMAGE_REF} \
|
||||||
sh /work/iso/builder/build.sh --authorized-keys "/tmp/bee-authkeys/${AUTH_KEYS_BASE}"
|
sh /work/iso/builder/build.sh --variant ${_variant} ${_auth_arg}"
|
||||||
fi
|
}
|
||||||
|
|
||||||
"$CONTAINER_TOOL" "$@"
|
# run_variant runs the containerised build for a single ISO variant.
#   $1  variant name, forwarded to build.sh as --variant
# When AUTH_KEYS is non-empty, AUTH_KEYS_DIR is additionally bind-mounted
# read-only at /tmp/bee-authkeys and build.sh receives --authorized-keys
# pointing at AUTH_KEYS_BASE inside that mount.
# The function's status is the status of the container tool invocation.
run_variant() {
    _v="$1"
    echo "=== building variant: ${_v} ==="

    # Assemble the container command line once. The two AUTH_KEYS-dependent
    # pieces are spliced in at the positions they occupy in the full command.
    # Positional parameters set here are scoped to this function call.
    set -- run --rm --privileged \
        --platform "${BUILDER_PLATFORM}" \
        -v "${REPO_ROOT}:/work" \
        -v "${CACHE_DIR}:/cache"

    if [ -n "$AUTH_KEYS" ]; then
        set -- "$@" -v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro"
    fi

    set -- "$@" \
        -e BEE_CONTAINER_BUILD=1 \
        -e GOCACHE=/cache/go-build \
        -e GOMODCACHE=/cache/go-mod \
        -e TMPDIR=/cache/tmp \
        -e BEE_CACHE_DIR=/cache/bee \
        -w /work \
        "${IMAGE_REF}" \
        sh /work/iso/builder/build.sh --variant "${_v}"

    if [ -n "$AUTH_KEYS" ]; then
        set -- "$@" --authorized-keys "/tmp/bee-authkeys/${AUTH_KEYS_BASE}"
    fi

    "$CONTAINER_TOOL" "$@"
}
|
||||||
|
|
||||||
|
# Build the requested variant; "all" builds every variant in the fixed
# order nvidia, amd, nogpu. Any other value runs nothing here.
case "$VARIANT" in
    nvidia|amd|nogpu)
        run_variant "$VARIANT"
        ;;
    all)
        for _v in nvidia amd nogpu; do
            run_variant "$_v"
        done
        ;;
esac
|
||||||
|
|||||||
55
iso/builder/build-john.sh
Normal file
55
iso/builder/build-john.sh
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
#!/bin/sh
# build-john.sh — build John the Ripper jumbo with OpenCL support for the LiveCD.
#
# Usage: build-john.sh <john-commit> <dist-dir>
#
# Downloads a pinned source snapshot from the official openwall/john repository,
# builds it inside the builder container, and caches the resulting run/ tree
# under <dist-dir>/john-<john-commit>/run.
#
# Environment:
#   BEE_CACHE_DIR  optional cache root for downloaded tarballs
#                  (default: <dist-dir>/cache)

set -e

JOHN_COMMIT="$1"
DIST_DIR="$2"

if [ -z "$JOHN_COMMIT" ] || [ -z "$DIST_DIR" ]; then
    echo "usage: $0 <john-commit> <dist-dir>" >&2
    exit 1
fi

echo "=== John the Ripper jumbo ${JOHN_COMMIT} ==="

# The build below changes directory into a temp dir, so every path derived
# from the arguments must be absolute or it would resolve inside that temp dir.
mkdir -p "${DIST_DIR}"
DIST_DIR=$(cd "${DIST_DIR}" && pwd)

CACHE_DIR="${DIST_DIR}/john-${JOHN_COMMIT}"
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
case "${CACHE_ROOT}" in
    /*) ;;
    *) CACHE_ROOT="$(pwd)/${CACHE_ROOT}" ;;
esac
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/john-downloads"
SRC_TAR="${DOWNLOAD_CACHE_DIR}/john-${JOHN_COMMIT}.tar.gz"
SRC_URL="https://github.com/openwall/john/archive/${JOHN_COMMIT}.tar.gz"

if [ -x "${CACHE_DIR}/run/john" ] && [ -f "${CACHE_DIR}/run/john.conf" ]; then
    echo "=== john cached, skipping build ==="
    echo "run dir: ${CACHE_DIR}/run"
    exit 0
fi

mkdir -p "${DOWNLOAD_CACHE_DIR}"
# -s (not -f): an empty file left behind by an earlier failed download must
# not be mistaken for a cached tarball.
if [ ! -s "${SRC_TAR}" ]; then
    echo "=== downloading john source snapshot ==="
    # Download to a side file and rename only on success, so an interrupted
    # or failed transfer never poisons the download cache.
    rm -f "${SRC_TAR}.part"
    if ! wget --show-progress -O "${SRC_TAR}.part" "${SRC_URL}"; then
        rm -f "${SRC_TAR}.part"
        echo "ERROR: failed to download ${SRC_URL}" >&2
        exit 1
    fi
    mv "${SRC_TAR}.part" "${SRC_TAR}"
fi

BUILD_TMP=$(mktemp -d)
# Clean up on any exit; make INT/TERM actually terminate the script (a trap
# action that does not exit would let execution continue after cleanup).
trap 'rm -rf "${BUILD_TMP}"' EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

cd "${BUILD_TMP}"
if ! tar xf "${SRC_TAR}"; then
    # Drop the unreadable tarball so the next run downloads it again.
    rm -f "${SRC_TAR}"
    echo "ERROR: failed to extract ${SRC_TAR} (removed from cache)" >&2
    exit 1
fi
SRC_DIR=$(find . -maxdepth 1 -type d -name 'john-*' | head -1)
[ -n "${SRC_DIR}" ] || { echo "ERROR: john source directory not found" >&2; exit 1; }

cd "${SRC_DIR}/src"
echo "=== configuring john ==="
./configure
echo "=== building john ==="
make clean >/dev/null 2>&1 || true
make -j"$(nproc)"

mkdir -p "${CACHE_DIR}"
# Remove any stale/partial run/ tree first: copying onto an existing
# directory would nest the new tree as run/run instead of replacing it.
rm -rf "${CACHE_DIR}/run"
cp -a "../run" "${CACHE_DIR}/run"
chmod +x "${CACHE_DIR}/run/john"

echo "=== john build complete ==="
echo "run dir: ${CACHE_DIR}/run"
|
||||||
@@ -9,6 +9,7 @@
|
|||||||
#
|
#
|
||||||
# Output layout:
|
# Output layout:
|
||||||
# $CACHE_DIR/bin/all_reduce_perf
|
# $CACHE_DIR/bin/all_reduce_perf
|
||||||
|
# $CACHE_DIR/lib/libcudart.so* copied from the nvcc toolchain used to build nccl-tests
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
@@ -30,7 +31,7 @@ CACHE_DIR="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
|
|||||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nccl-tests-downloads"
|
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nccl-tests-downloads"
|
||||||
|
|
||||||
if [ -f "${CACHE_DIR}/bin/all_reduce_perf" ]; then
|
if [ -f "${CACHE_DIR}/bin/all_reduce_perf" ] && [ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcudart.so*' 2>/dev/null | wc -l)" -gt 0 ]; then
|
||||||
echo "=== nccl-tests cached, skipping build ==="
|
echo "=== nccl-tests cached, skipping build ==="
|
||||||
echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
|
echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
|
||||||
exit 0
|
exit 0
|
||||||
@@ -52,6 +53,23 @@ echo "nvcc: $NVCC"
|
|||||||
CUDA_HOME="$(dirname "$(dirname "$NVCC")")"
|
CUDA_HOME="$(dirname "$(dirname "$NVCC")")"
|
||||||
echo "CUDA_HOME: $CUDA_HOME"
|
echo "CUDA_HOME: $CUDA_HOME"
|
||||||
|
|
||||||
|
# find_cudart_dir prints the first candidate directory under CUDA_HOME that
# directly contains a regular file named libcudart.so*, then returns 0.
# Prints nothing and returns 1 when no candidate directory qualifies.
find_cudart_dir() {
    # Candidates are checked in this order; the first hit wins.
    set -- \
        "${CUDA_HOME}/targets/x86_64-linux/lib" \
        "${CUDA_HOME}/targets/x86_64-linux/lib/stubs" \
        "${CUDA_HOME}/lib64" \
        "${CUDA_HOME}/lib"

    for dir do
        [ -d "$dir" ] || continue
        if [ -n "$(find "$dir" -maxdepth 1 -name 'libcudart.so*' -type f)" ]; then
            printf '%s\n' "$dir"
            return 0
        fi
    done

    return 1
}
|
||||||
|
|
||||||
|
CUDART_DIR="$(find_cudart_dir)" || { echo "ERROR: libcudart.so* not found under ${CUDA_HOME}"; exit 1; }
|
||||||
|
echo "cudart dir: $CUDART_DIR"
|
||||||
|
|
||||||
# Download libnccl-dev for nccl.h
|
# Download libnccl-dev for nccl.h
|
||||||
REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian${DEBIAN_VERSION}/x86_64"
|
REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian${DEBIAN_VERSION}/x86_64"
|
||||||
DEV_PKG="libnccl-dev_${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}_amd64.deb"
|
DEV_PKG="libnccl-dev_${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}_amd64.deb"
|
||||||
@@ -136,6 +154,11 @@ mkdir -p "${CACHE_DIR}/bin"
|
|||||||
cp "./build/all_reduce_perf" "${CACHE_DIR}/bin/all_reduce_perf"
|
cp "./build/all_reduce_perf" "${CACHE_DIR}/bin/all_reduce_perf"
|
||||||
chmod +x "${CACHE_DIR}/bin/all_reduce_perf"
|
chmod +x "${CACHE_DIR}/bin/all_reduce_perf"
|
||||||
|
|
||||||
|
mkdir -p "${CACHE_DIR}/lib"
|
||||||
|
find "${CUDART_DIR}" -maxdepth 1 -name 'libcudart.so*' -type f -exec cp -a {} "${CACHE_DIR}/lib/" \;
|
||||||
|
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcudart.so*' -type f | wc -l)" -gt 0 ] || { echo "ERROR: libcudart runtime copy failed"; exit 1; }
|
||||||
|
|
||||||
echo "=== nccl-tests build complete ==="
|
echo "=== nccl-tests build complete ==="
|
||||||
echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
|
echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
|
||||||
ls -lh "${CACHE_DIR}/bin/all_reduce_perf"
|
ls -lh "${CACHE_DIR}/bin/all_reduce_perf"
|
||||||
|
ls -lh "${CACHE_DIR}/lib/"libcudart.so* 2>/dev/null || true
|
||||||
|
|||||||
@@ -10,7 +10,7 @@
|
|||||||
# Output layout:
|
# Output layout:
|
||||||
# $CACHE_DIR/modules/ — nvidia*.ko files
|
# $CACHE_DIR/modules/ — nvidia*.ko files
|
||||||
# $CACHE_DIR/bin/ — nvidia-smi, nvidia-debugdump
|
# $CACHE_DIR/bin/ — nvidia-smi, nvidia-debugdump
|
||||||
# $CACHE_DIR/lib/ — libnvidia-ml.so*, libcuda.so* (for nvidia-smi)
|
# $CACHE_DIR/lib/ — libnvidia-ml.so*, libcuda.so*, OpenCL-related libs
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
@@ -46,7 +46,10 @@ CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"
|
|||||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
|
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
|
||||||
EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
|
EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
|
||||||
|
CACHE_LAYOUT_VERSION="2"
|
||||||
|
CACHE_LAYOUT_MARKER="${CACHE_DIR}/.cache-layout-v${CACHE_LAYOUT_VERSION}"
|
||||||
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
||||||
|
&& [ -f "$CACHE_LAYOUT_MARKER" ] \
|
||||||
&& [ "$(ls "$CACHE_DIR/lib/libnvidia-ptxjitcompiler.so."* 2>/dev/null | wc -l)" -gt 0 ]; then
|
&& [ "$(ls "$CACHE_DIR/lib/libnvidia-ptxjitcompiler.so."* 2>/dev/null | wc -l)" -gt 0 ]; then
|
||||||
echo "=== NVIDIA cached, skipping build ==="
|
echo "=== NVIDIA cached, skipping build ==="
|
||||||
echo "cache: $CACHE_DIR"
|
echo "cache: $CACHE_DIR"
|
||||||
@@ -130,17 +133,30 @@ else
|
|||||||
echo "WARNING: no firmware/ dir found in installer (may be needed for Hopper GPUs)"
|
echo "WARNING: no firmware/ dir found in installer (may be needed for Hopper GPUs)"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Copy ALL userspace library files.
|
# Copy NVIDIA userspace libraries broadly instead of whitelisting a few names.
|
||||||
# libnvidia-ptxjitcompiler is required by libcuda for PTX JIT compilation
|
# Newer driver branches add extra runtime deps (for example OpenCL/compiler side
|
||||||
# (cuModuleLoadDataEx with PTX source) — without it CUDA_ERROR_JIT_COMPILER_NOT_FOUND.
|
# libraries). If we only copy a narrow allowlist, clinfo/John can see nvidia.icd
|
||||||
for lib in libnvidia-ml libcuda libnvidia-ptxjitcompiler; do
|
# but still fail with "no OpenCL platforms" because one dependent .so is absent.
|
||||||
count=0
|
copied_libs=0
|
||||||
for f in $(find "$EXTRACT_DIR" -maxdepth 1 -name "${lib}.so.*" 2>/dev/null); do
|
for f in $(find "$EXTRACT_DIR" -maxdepth 1 \( -name 'libnvidia*.so.*' -o -name 'libcuda.so.*' \) -type f 2>/dev/null | sort); do
|
||||||
cp "$f" "$CACHE_DIR/lib/" && count=$((count+1))
|
cp "$f" "$CACHE_DIR/lib/"
|
||||||
done
|
copied_libs=$((copied_libs+1))
|
||||||
if [ "$count" -eq 0 ]; then
|
done
|
||||||
echo "ERROR: ${lib}.so.* not found in $EXTRACT_DIR"
|
|
||||||
ls "$EXTRACT_DIR/"*.so* 2>/dev/null | head -20 || true
|
if [ "$copied_libs" -eq 0 ]; then
|
||||||
|
echo "ERROR: no NVIDIA userspace libraries found in $EXTRACT_DIR"
|
||||||
|
ls "$EXTRACT_DIR/"*.so* 2>/dev/null | head -40 || true
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
for lib in \
|
||||||
|
libnvidia-ml \
|
||||||
|
libcuda \
|
||||||
|
libnvidia-ptxjitcompiler \
|
||||||
|
libnvidia-opencl; do
|
||||||
|
if ! ls "$CACHE_DIR/lib/${lib}.so."* >/dev/null 2>&1; then
|
||||||
|
echo "ERROR: required ${lib}.so.* not found in extracted userspace libs"
|
||||||
|
ls "$CACHE_DIR/lib/" | sort >&2 || true
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
@@ -149,16 +165,17 @@ done
|
|||||||
ko_count=$(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l)
|
ko_count=$(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l)
|
||||||
[ "$ko_count" -gt 0 ] || { echo "ERROR: no .ko files built in $CACHE_DIR/modules/"; exit 1; }
|
[ "$ko_count" -gt 0 ] || { echo "ERROR: no .ko files built in $CACHE_DIR/modules/"; exit 1; }
|
||||||
|
|
||||||
# Create soname symlinks: use [0-9][0-9]* to avoid circular symlink (.so.1 has single digit)
|
# Create soname symlinks for every copied versioned library.
|
||||||
for lib in libnvidia-ml libcuda libnvidia-ptxjitcompiler; do
|
for versioned in "$CACHE_DIR"/lib/*.so.*; do
|
||||||
versioned=$(ls "$CACHE_DIR/lib/${lib}.so."[0-9][0-9]* 2>/dev/null | head -1)
|
[ -f "$versioned" ] || continue
|
||||||
[ -n "$versioned" ] || continue
|
|
||||||
base=$(basename "$versioned")
|
base=$(basename "$versioned")
|
||||||
ln -sf "$base" "$CACHE_DIR/lib/${lib}.so.1"
|
stem=${base%%.so.*}
|
||||||
ln -sf "${lib}.so.1" "$CACHE_DIR/lib/${lib}.so" 2>/dev/null || true
|
ln -sf "$base" "$CACHE_DIR/lib/${stem}.so.1"
|
||||||
echo "${lib}: .so.1 -> $base"
|
ln -sf "${stem}.so.1" "$CACHE_DIR/lib/${stem}.so" 2>/dev/null || true
|
||||||
done
|
done
|
||||||
|
|
||||||
|
touch "$CACHE_LAYOUT_MARKER"
|
||||||
|
|
||||||
echo "=== NVIDIA build complete ==="
|
echo "=== NVIDIA build complete ==="
|
||||||
echo "cache: $CACHE_DIR"
|
echo "cache: $CACHE_DIR"
|
||||||
echo "modules: $ko_count .ko files"
|
echo "modules: $ko_count .ko files"
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -10,25 +10,45 @@ echo " ╚══════╝╚═╝ ╚═╝╚══════╝
|
|||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
menuentry "EASY-BEE" {
|
menuentry "EASY-BEE" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||||
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
|
||||||
|
menuentry "EASY-BEE (graphics/KMS)" {
|
||||||
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE (load to RAM)" {
|
menuentry "EASY-BEE (load to RAM)" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram bee.nvidia.mode=normal
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE (NVIDIA GSP=off)" {
|
menuentry "EASY-BEE (NVIDIA GSP=off)" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||||
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
|
||||||
|
menuentry "EASY-BEE (graphics/KMS, GSP=off)" {
|
||||||
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE (fail-safe)" {
|
menuentry "EASY-BEE (fail-safe)" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if [ "${grub_platform}" = "efi" ]; then
|
||||||
|
menuentry "Memory Test (memtest86+)" {
|
||||||
|
chainloader /boot/memtest86+x64.efi
|
||||||
|
}
|
||||||
|
else
|
||||||
|
menuentry "Memory Test (memtest86+)" {
|
||||||
|
linux16 /boot/memtest86+x64.bin
|
||||||
|
}
|
||||||
|
fi
|
||||||
|
|
||||||
if [ "${grub_platform}" = "efi" ]; then
|
if [ "${grub_platform}" = "efi" ]; then
|
||||||
menuentry "UEFI Firmware Settings" {
|
menuentry "UEFI Firmware Settings" {
|
||||||
fwsetup
|
fwsetup
|
||||||
|
|||||||
@@ -5,6 +5,12 @@ label live-@FLAVOUR@-normal
|
|||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ bee.nvidia.mode=normal
|
append @APPEND_LIVE@ bee.nvidia.mode=normal
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-kms
|
||||||
|
menu label EASY-BEE (^graphics/KMS)
|
||||||
|
linux @LINUX@
|
||||||
|
initrd @INITRD@
|
||||||
|
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal
|
||||||
|
|
||||||
label live-@FLAVOUR@-toram
|
label live-@FLAVOUR@-toram
|
||||||
menu label EASY-BEE (^load to RAM)
|
menu label EASY-BEE (^load to RAM)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
@@ -15,10 +21,20 @@ label live-@FLAVOUR@-gsp-off
|
|||||||
menu label EASY-BEE (^NVIDIA GSP=off)
|
menu label EASY-BEE (^NVIDIA GSP=off)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off
|
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-kms-gsp-off
|
||||||
|
menu label EASY-BEE (g^raphics/KMS, GSP=off)
|
||||||
|
linux @LINUX@
|
||||||
|
initrd @INITRD@
|
||||||
|
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off
|
||||||
|
|
||||||
label live-@FLAVOUR@-failsafe
|
label live-@FLAVOUR@-failsafe
|
||||||
menu label EASY-BEE (^fail-safe)
|
menu label EASY-BEE (^fail-safe)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal
|
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal
|
||||||
|
|
||||||
|
label memtest
|
||||||
|
menu label ^Memory Test (memtest86+)
|
||||||
|
linux /boot/memtest86+x64.bin
|
||||||
|
|||||||
@@ -5,25 +5,27 @@ set -e
|
|||||||
|
|
||||||
echo "=== bee chroot setup ==="
|
echo "=== bee chroot setup ==="
|
||||||
|
|
||||||
|
GPU_VENDOR=$(cat /etc/bee-gpu-vendor 2>/dev/null || echo nvidia)
|
||||||
|
echo "=== GPU vendor: ${GPU_VENDOR} ==="
|
||||||
|
|
||||||
ensure_bee_console_user() {
|
ensure_bee_console_user() {
|
||||||
if id bee >/dev/null 2>&1; then
|
if id bee >/dev/null 2>&1; then
|
||||||
usermod -d /home/bee -s /bin/sh bee 2>/dev/null || true
|
usermod -d /home/bee -s /bin/bash bee 2>/dev/null || true
|
||||||
else
|
else
|
||||||
useradd -d /home/bee -m -s /bin/sh -U bee
|
useradd -d /home/bee -m -s /bin/bash -U bee
|
||||||
fi
|
fi
|
||||||
|
|
||||||
mkdir -p /home/bee
|
mkdir -p /home/bee
|
||||||
chown -R bee:bee /home/bee
|
chown -R bee:bee /home/bee
|
||||||
echo "bee:eeb" | chpasswd
|
echo "bee:eeb" | chpasswd
|
||||||
usermod -aG sudo,video,input bee 2>/dev/null || true
|
groupadd -f ipmi 2>/dev/null || true
|
||||||
|
usermod -aG sudo,video,input,render,ipmi bee 2>/dev/null || true
|
||||||
}
|
}
|
||||||
|
|
||||||
ensure_bee_console_user
|
ensure_bee_console_user
|
||||||
|
|
||||||
# Enable bee services
|
# Enable common bee services
|
||||||
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
|
||||||
systemctl enable bee-network.service
|
systemctl enable bee-network.service
|
||||||
systemctl enable bee-nvidia.service
|
|
||||||
systemctl enable bee-preflight.service
|
systemctl enable bee-preflight.service
|
||||||
systemctl enable bee-audit.service
|
systemctl enable bee-audit.service
|
||||||
systemctl enable bee-web.service
|
systemctl enable bee-web.service
|
||||||
@@ -35,23 +37,37 @@ systemctl enable serial-getty@ttyS0.service 2>/dev/null || true
|
|||||||
systemctl enable serial-getty@ttyS1.service 2>/dev/null || true
|
systemctl enable serial-getty@ttyS1.service 2>/dev/null || true
|
||||||
systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
|
systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
|
||||||
|
|
||||||
|
# Enable GPU-vendor specific services
|
||||||
|
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||||
|
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
||||||
|
systemctl enable bee-nvidia.service
|
||||||
|
elif [ "$GPU_VENDOR" = "amd" ]; then
|
||||||
|
# ROCm symlinks (packages install to /opt/rocm-*/bin/)
|
||||||
|
for tool in rocm-smi rocm-bandwidth-test rvs; do
|
||||||
|
if [ ! -e /usr/local/bin/${tool} ]; then
|
||||||
|
bin_path="$(find /opt -path "*/bin/${tool}" -type f 2>/dev/null | sort | tail -1)"
|
||||||
|
[ -n "${bin_path}" ] && ln -sf "${bin_path}" /usr/local/bin/${tool}
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
# nogpu: no GPU services needed
|
||||||
|
|
||||||
# Ensure scripts are executable
|
# Ensure scripts are executable
|
||||||
chmod +x /usr/local/bin/bee-network.sh 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-network.sh 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
|
||||||
chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
||||||
|
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||||
|
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-john-gpu-stress 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-nccl-gpu-stress 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
|
||||||
# Reload udev rules
|
# Reload udev rules
|
||||||
udevadm control --reload-rules 2>/dev/null || true
|
udevadm control --reload-rules 2>/dev/null || true
|
||||||
|
|
||||||
# rocm-smi symlink (package installs to /opt/rocm-*/bin/rocm-smi)
|
|
||||||
if [ ! -e /usr/local/bin/rocm-smi ]; then
|
|
||||||
smi_path="$(find /opt -path '*/bin/rocm-smi' -type f 2>/dev/null | sort | tail -1)"
|
|
||||||
[ -n "${smi_path}" ] && ln -sf "${smi_path}" /usr/local/bin/rocm-smi
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Create export directory
|
# Create export directory
|
||||||
mkdir -p /appdata/bee/export
|
mkdir -p /appdata/bee/export
|
||||||
|
|
||||||
@@ -59,4 +75,4 @@ if [ -f /etc/sudoers.d/bee ]; then
|
|||||||
chmod 0440 /etc/sudoers.d/bee
|
chmod 0440 /etc/sudoers.d/bee
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "=== bee chroot setup complete ==="
|
echo "=== bee chroot setup complete (${GPU_VENDOR}) ==="
|
||||||
|
|||||||
139
iso/builder/config/hooks/normal/9100-memtest.hook.binary
Executable file
139
iso/builder/config/hooks/normal/9100-memtest.hook.binary
Executable file
@@ -0,0 +1,139 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# Ensure memtest is present in the final ISO even if live-build's built-in
|
||||||
|
# memtest stage does not copy the binaries or expose menu entries.
|
||||||
|
set -e
|
||||||
|
|
||||||
|
: "${BEE_REQUIRE_MEMTEST:=0}"
|
||||||
|
|
||||||
|
MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi"
|
||||||
|
BINARY_BOOT_DIR="binary/boot"
|
||||||
|
GRUB_CFG="binary/boot/grub/grub.cfg"
|
||||||
|
ISOLINUX_CFG="binary/isolinux/live.cfg"
|
||||||
|
|
||||||
|
# Print one build-log line prefixed with the hook name.
log() { echo "memtest hook: $*"; }
|
||||||
|
|
||||||
|
# Report a memtest problem.
# $1: message text.
# With BEE_REQUIRE_MEMTEST=1 the message is logged as an ERROR and the hook
# exits 1; with any other value it is logged as a WARNING and 0 is returned.
fail_or_warn() {
    msg="$1"
    case "${BEE_REQUIRE_MEMTEST}" in
        1)
            log "ERROR: ${msg}"
            exit 1
            ;;
    esac
    log "WARNING: ${msg}"
    return 0
}
|
||||||
|
|
||||||
|
# Copy one memtest payload into BINARY_BOOT_DIR under its own file name.
# $1: path of the candidate source file.
# Returns 1, without creating or copying anything, when $1 is not a regular file.
copy_memtest_file() {
    src="$1"
    base="$(basename "$src")"
    dst="${BINARY_BOOT_DIR}/${base}"

    if [ ! -f "$src" ]; then
        return 1
    fi

    mkdir -p "${BINARY_BOOT_DIR}"
    cp "$src" "$dst"
    log "copied ${base} from ${src}"
}
|
||||||
|
|
||||||
|
# Unpack a memtest86+ .deb into a scratch directory and copy every file named
# in MEMTEST_FILES that it ships under boot/ into BINARY_BOOT_DIR.
# $1: path to the .deb archive.
#
# Extraction problems are logged as warnings instead of aborting the hook:
# the script runs under `set -e`, so an unguarded dpkg-deb failure would kill
# the build even when BEE_REQUIRE_MEMTEST=0 and would leave the scratch
# directory behind. Whether missing binaries are fatal is decided later by the
# caller's final check.
extract_memtest_from_deb() {
    deb="$1"
    tmpdir="$(mktemp -d)" || {
        log "WARNING: mktemp failed; cannot extract ${deb}"
        return 0
    }

    log "extracting memtest payload from ${deb}"
    if dpkg-deb -x "$deb" "$tmpdir"; then
        for f in ${MEMTEST_FILES}; do
            if [ -f "${tmpdir}/boot/${f}" ]; then
                copy_memtest_file "${tmpdir}/boot/${f}" \
                    || log "WARNING: could not copy ${f} from ${deb}"
            fi
        done
    else
        log "WARNING: dpkg-deb could not unpack ${deb}"
    fi

    # Always remove the scratch directory, on success and on failure alike.
    rm -rf "$tmpdir"
}
|
||||||
|
|
||||||
|
# Return 0 when at least one file listed in MEMTEST_FILES is absent from
# BINARY_BOOT_DIR, 1 when all of them are present.
memtest_binaries_missing() {
    for f in ${MEMTEST_FILES}; do
        [ -f "${BINARY_BOOT_DIR}/${f}" ] || return 0
    done
    return 1
}

# Make sure every file in MEMTEST_FILES exists in BINARY_BOOT_DIR.
# Sources are tried in order and the search stops as soon as nothing is missing:
#   1. files already present in the binary tree;
#   2. chroot/boot and the build host's /boot;
#   3. a memtest86+ .deb found in one of the package cache directories.
# Anything still missing afterwards is reported through fail_or_warn, which
# aborts the hook only when BEE_REQUIRE_MEMTEST=1. Always returns 0 otherwise.
ensure_memtest_binaries() {
    memtest_binaries_missing || return 0

    for root in chroot/boot /boot; do
        for f in ${MEMTEST_FILES}; do
            [ -f "${BINARY_BOOT_DIR}/${f}" ] || copy_memtest_file "${root}/${f}" || true
        done
    done
    memtest_binaries_missing || return 0

    for root in cache chroot/var/cache/apt/archives /var/cache/apt/archives; do
        [ -d "$root" ] || continue
        # sort makes the pick deterministic when several archives are cached.
        deb="$(find "$root" -type f -name 'memtest86+*.deb' 2>/dev/null | sort | head -1)"
        [ -n "$deb" ] || continue
        extract_memtest_from_deb "$deb"
        # Keep looking in the remaining cache roots if this archive did not
        # provide every binary.
        memtest_binaries_missing || return 0
    done

    for f in ${MEMTEST_FILES}; do
        if [ ! -f "${BINARY_BOOT_DIR}/${f}" ]; then
            fail_or_warn "missing ${BINARY_BOOT_DIR}/${f}"
        fi
    done
    return 0
}
|
||||||
|
|
||||||
|
# Append a memtest86+ menu entry to GRUB_CFG unless the marker line
# "### BEE MEMTEST ###" is already there (keeps the hook idempotent).
# A missing GRUB_CFG is reported through fail_or_warn; the function then
# returns 0 without writing anything.
ensure_grub_entry() {
    if [ ! -f "$GRUB_CFG" ]; then
        fail_or_warn "missing ${GRUB_CFG}"
        return 0
    fi

    if grep -q '### BEE MEMTEST ###' "$GRUB_CFG"; then
        return 0
    fi

    # Quoted delimiter: ${grub_platform} must reach grub.cfg unexpanded.
    cat >> "$GRUB_CFG" <<'EOF'

### BEE MEMTEST ###
if [ "${grub_platform}" = "efi" ]; then
menuentry "Memory Test (memtest86+)" {
chainloader /boot/memtest86+x64.efi
}
else
menuentry "Memory Test (memtest86+)" {
linux16 /boot/memtest86+x64.bin
}
fi
### /BEE MEMTEST ###
EOF

    log "appended memtest entry to ${GRUB_CFG}"
}
|
||||||
|
|
||||||
|
# Append a memtest86+ label to ISOLINUX_CFG unless the marker line
# "### BEE MEMTEST ###" is already there (keeps the hook idempotent).
# A missing ISOLINUX_CFG is reported through fail_or_warn; the function then
# returns 0 without writing anything.
ensure_isolinux_entry() {
    if [ ! -f "$ISOLINUX_CFG" ]; then
        fail_or_warn "missing ${ISOLINUX_CFG}"
        return 0
    fi

    if grep -q '### BEE MEMTEST ###' "$ISOLINUX_CFG"; then
        return 0
    fi

    cat >> "$ISOLINUX_CFG" <<'EOF'

# ### BEE MEMTEST ###
label memtest
menu label ^Memory Test (memtest86+)
linux /boot/memtest86+x64.bin
# ### /BEE MEMTEST ###
EOF

    log "appended memtest entry to ${ISOLINUX_CFG}"
}
|
||||||
|
|
||||||
|
log "ensuring memtest binaries and menu entries in binary image"
|
||||||
|
ensure_memtest_binaries
|
||||||
|
ensure_grub_entry
|
||||||
|
ensure_isolinux_entry
|
||||||
|
log "memtest assets ready"
|
||||||
12
iso/builder/config/package-lists/bee-amd.list.chroot
Normal file
12
iso/builder/config/package-lists/bee-amd.list.chroot
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
# AMD GPU firmware
|
||||||
|
firmware-amd-graphics
|
||||||
|
|
||||||
|
# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
|
||||||
|
rocm-smi-lib=%%ROCM_SMI_VERSION%%
|
||||||
|
rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%%
|
||||||
|
rocm-validation-suite=%%ROCM_VALIDATION_SUITE_VERSION%%
|
||||||
|
rocblas=%%ROCBLAS_VERSION%%
|
||||||
|
rocrand=%%ROCRAND_VERSION%%
|
||||||
|
hip-runtime-amd=%%HIP_RUNTIME_AMD_VERSION%%
|
||||||
|
hipblaslt=%%HIPBLASLT_VERSION%%
|
||||||
|
comgr=%%COMGR_VERSION%%
|
||||||
1
iso/builder/config/package-lists/bee-nogpu.list.chroot
Normal file
1
iso/builder/config/package-lists/bee-nogpu.list.chroot
Normal file
@@ -0,0 +1 @@
|
|||||||
|
# No GPU variant — no NVIDIA, no AMD/ROCm packages
|
||||||
8
iso/builder/config/package-lists/bee-nvidia.list.chroot
Normal file
8
iso/builder/config/package-lists/bee-nvidia.list.chroot
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing.
|
||||||
|
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with CUDA 13 userspace,
|
||||||
|
# so install the CUDA 13 build plus proprietary diagnostic components explicitly.
|
||||||
|
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
|
||||||
|
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
|
||||||
|
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
|
||||||
|
ocl-icd-libopencl1
|
||||||
|
clinfo
|
||||||
@@ -21,8 +21,15 @@ openssh-server
|
|||||||
# Disk installer
|
# Disk installer
|
||||||
squashfs-tools
|
squashfs-tools
|
||||||
parted
|
parted
|
||||||
|
# Keep GRUB install tools without selecting a single active platform package.
|
||||||
|
# grub-pc and grub-efi-amd64 conflict with each other, but grub2-common
|
||||||
|
# provides grub-install/update-grub and the *-bin packages provide BIOS/UEFI modules.
|
||||||
|
grub2-common
|
||||||
grub-pc-bin
|
grub-pc-bin
|
||||||
grub-efi-amd64-bin
|
grub-efi-amd64-bin
|
||||||
|
grub-efi-amd64-signed
|
||||||
|
shim-signed
|
||||||
|
efibootmgr
|
||||||
|
|
||||||
# Filesystem support for USB export targets
|
# Filesystem support for USB export targets
|
||||||
exfatprogs
|
exfatprogs
|
||||||
@@ -39,11 +46,13 @@ vim-tiny
|
|||||||
mc
|
mc
|
||||||
htop
|
htop
|
||||||
nvtop
|
nvtop
|
||||||
|
btop
|
||||||
sudo
|
sudo
|
||||||
zstd
|
zstd
|
||||||
mstflint
|
mstflint
|
||||||
memtester
|
memtester
|
||||||
stress-ng
|
stress-ng
|
||||||
|
stressapptest
|
||||||
|
|
||||||
# QR codes (for displaying audit results)
|
# QR codes (for displaying audit results)
|
||||||
qrencode
|
qrencode
|
||||||
@@ -62,19 +71,11 @@ lightdm
|
|||||||
firmware-linux-free
|
firmware-linux-free
|
||||||
firmware-linux-nonfree
|
firmware-linux-nonfree
|
||||||
firmware-misc-nonfree
|
firmware-misc-nonfree
|
||||||
firmware-amd-graphics
|
|
||||||
firmware-realtek
|
firmware-realtek
|
||||||
firmware-intel-sound
|
|
||||||
firmware-bnx2
|
firmware-bnx2
|
||||||
firmware-bnx2x
|
firmware-bnx2x
|
||||||
firmware-cavium
|
firmware-cavium
|
||||||
firmware-qlogic
|
firmware-qlogic
|
||||||
|
|
||||||
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing
|
|
||||||
datacenter-gpu-manager=1:%%DCGM_VERSION%%
|
|
||||||
|
|
||||||
# AMD ROCm SMI — GPU monitoring for Instinct cards (repo: rocm/apt/6.3.4 jammy)
|
|
||||||
rocm-smi-lib=%%ROCM_SMI_VERSION%%
|
|
||||||
|
|
||||||
# glibc compat helpers (for any external binaries that need it)
|
# glibc compat helpers (for any external binaries that need it)
|
||||||
libc6
|
libc6
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ info "nvidia boot mode: ${NVIDIA_BOOT_MODE}"
|
|||||||
# --- PATH & binaries ---
|
# --- PATH & binaries ---
|
||||||
echo "-- PATH & binaries --"
|
echo "-- PATH & binaries --"
|
||||||
for tool in dmidecode smartctl nvme ipmitool lspci bee; do
|
for tool in dmidecode smartctl nvme ipmitool lspci bee; do
|
||||||
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
|
if p=$(PATH="/usr/local/bin:/usr/sbin:/sbin:$PATH" command -v "$tool" 2>/dev/null); then
|
||||||
ok "$tool found: $p"
|
ok "$tool found: $p"
|
||||||
else
|
else
|
||||||
fail "$tool: NOT FOUND"
|
fail "$tool: NOT FOUND"
|
||||||
@@ -52,6 +52,14 @@ else
|
|||||||
fail "nvidia-smi: NOT FOUND"
|
fail "nvidia-smi: NOT FOUND"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf; do
|
||||||
|
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
|
||||||
|
ok "$tool found: $p"
|
||||||
|
else
|
||||||
|
fail "$tool: NOT FOUND"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
echo "-- NVIDIA modules --"
|
echo "-- NVIDIA modules --"
|
||||||
KO_DIR="/usr/local/lib/nvidia"
|
KO_DIR="/usr/local/lib/nvidia"
|
||||||
@@ -109,6 +117,40 @@ else
|
|||||||
fail "nvidia-smi: not found in PATH"
|
fail "nvidia-smi: not found in PATH"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "-- OpenCL / John --"
|
||||||
|
if [ -f /etc/OpenCL/vendors/nvidia.icd ]; then
|
||||||
|
ok "OpenCL ICD present: /etc/OpenCL/vendors/nvidia.icd"
|
||||||
|
else
|
||||||
|
fail "OpenCL ICD missing: /etc/OpenCL/vendors/nvidia.icd"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if ldconfig -p 2>/dev/null | grep -q "libnvidia-opencl.so.1"; then
|
||||||
|
ok "libnvidia-opencl.so.1 present in linker cache"
|
||||||
|
else
|
||||||
|
fail "libnvidia-opencl.so.1 missing from linker cache"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if command -v clinfo >/dev/null 2>&1; then
|
||||||
|
if clinfo -l 2>/dev/null | grep -q "Platform"; then
|
||||||
|
ok "clinfo: OpenCL platform detected"
|
||||||
|
else
|
||||||
|
fail "clinfo: no OpenCL platform detected"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
fail "clinfo: not found in PATH"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if command -v john >/dev/null 2>&1; then
|
||||||
|
if john --list=opencl-devices 2>/dev/null | grep -q "Device #"; then
|
||||||
|
ok "john: OpenCL devices detected"
|
||||||
|
else
|
||||||
|
fail "john: no OpenCL devices detected"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
fail "john: not found in PATH"
|
||||||
|
fi
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
echo "-- lib symlinks --"
|
echo "-- lib symlinks --"
|
||||||
for lib in libnvidia-ml libcuda; do
|
for lib in libnvidia-ml libcuda; do
|
||||||
|
|||||||
3
iso/overlay/etc/modules-load.d/bee-ipmi.conf
Normal file
3
iso/overlay/etc/modules-load.d/bee-ipmi.conf
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
# Load IPMI modules for fan/sensor/power monitoring via ipmitool
|
||||||
|
ipmi_si
|
||||||
|
ipmi_devintf
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
export PATH="$PATH:/usr/local/bin:/opt/rocm/bin:/opt/rocm/sbin"
|
export PATH="$PATH:/usr/local/bin:/usr/sbin:/sbin:/opt/rocm/bin:/opt/rocm/sbin"
|
||||||
|
|
||||||
# Print web UI URLs on the local console at login.
|
# Print web UI URLs on the local console at login.
|
||||||
if [ -z "${SSH_CONNECTION:-}" ] \
|
if [ -z "${SSH_CONNECTION:-}" ] \
|
||||||
|
|||||||
@@ -1,14 +1,14 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Bee: run hardware audit
|
Description=Bee: hardware audit
|
||||||
After=bee-network.service bee-nvidia.service bee-preflight.service
|
After=bee-preflight.service bee-network.service bee-nvidia.service
|
||||||
Before=bee-web.service
|
Before=bee-web.service
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=oneshot
|
Type=oneshot
|
||||||
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-audit.log /bin/sh -c '/usr/local/bin/bee audit --runtime livecd --output file:/appdata/bee/export/bee-audit.json; rc=$?; if [ "$rc" -ne 0 ]; then echo "[bee-audit] WARN: audit exited with rc=$rc"; fi; exit 0'
|
RemainAfterExit=yes
|
||||||
|
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-audit.log /usr/local/bin/bee audit --runtime auto --output file:/appdata/bee/export/bee-audit.json
|
||||||
StandardOutput=journal
|
StandardOutput=journal
|
||||||
StandardError=journal
|
StandardError=journal
|
||||||
RemainAfterExit=yes
|
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
WantedBy=multi-user.target
|
WantedBy=multi-user.target
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Bee: hardware audit web viewer
|
Description=Bee: hardware audit web viewer
|
||||||
After=bee-network.service bee-audit.service
|
After=bee-audit.service
|
||||||
Wants=bee-audit.service
|
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=simple
|
Type=simple
|
||||||
@@ -10,6 +9,10 @@ Restart=always
|
|||||||
RestartSec=2
|
RestartSec=2
|
||||||
StandardOutput=journal
|
StandardOutput=journal
|
||||||
StandardError=journal
|
StandardError=journal
|
||||||
|
LimitMEMLOCK=infinity
|
||||||
|
# Keep the web server responsive during GPU/CPU stress (children inherit nice+10
|
||||||
|
# via Setpriority in runCmdJob, but the bee-web parent stays at 0).
|
||||||
|
Nice=0
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
WantedBy=multi-user.target
|
WantedBy=multi-user.target
|
||||||
|
|||||||
@@ -0,0 +1,6 @@
|
|||||||
|
[Unit]
|
||||||
|
Wants=bee-preflight.service
|
||||||
|
After=bee-preflight.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
ExecStartPre=/usr/local/bin/bee-display-mode
|
||||||
@@ -4,3 +4,6 @@
|
|||||||
RestartSec=10
|
RestartSec=10
|
||||||
StartLimitIntervalSec=60
|
StartLimitIntervalSec=60
|
||||||
StartLimitBurst=3
|
StartLimitBurst=3
|
||||||
|
# Raise scheduling priority of the X server so the graphical console (KVM/IPMI)
|
||||||
|
# stays responsive during GPU/CPU stress tests running at nice+10.
|
||||||
|
Nice=-5
|
||||||
|
|||||||
2
iso/overlay/etc/udev/rules.d/99-ipmi.rules
Normal file
2
iso/overlay/etc/udev/rules.d/99-ipmi.rules
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
# Allow ipmi group to access IPMI device without root
|
||||||
|
KERNEL=="ipmi[0-9]*", GROUP="ipmi", MODE="0660"
|
||||||
54
iso/overlay/usr/local/bin/bee-display-mode
Executable file
54
iso/overlay/usr/local/bin/bee-display-mode
Executable file
@@ -0,0 +1,54 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# Select Xorg display mode based on kernel cmdline.
|
||||||
|
# Default is the current server-safe path: keep forced fbdev.
|
||||||
|
set -eu
|
||||||
|
|
||||||
|
# Look up a key=value parameter on the kernel command line.
# $1: parameter name (the part before '=').
# Prints the text after the first '=' of the first matching token and
# returns 0; returns 1 when no token starts with "<name>=".
cmdline_param() {
    key="$1"
    for token in $(cat /proc/cmdline 2>/dev/null); do
        # Stripping the "<key>=" prefix leaves the token unchanged when the
        # token does not start with it.
        if [ "${token#"${key}"=}" = "${token}" ]; then
            continue
        fi
        echo "${token#*=}"
        return 0
    done
    return 1
}
|
||||||
|
|
||||||
|
log() {
|
||||||
|
echo "bee-display-mode: $*"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Requested display mode from bee.display=...; absent or empty means "safe".
mode="$(cmdline_param bee.display || true)"
: "${mode:=safe}"

xorg_dir="/etc/X11/xorg.conf.d"
fbdev_conf="${xorg_dir}/10-fbdev.conf"
fbdev_park="${xorg_dir}/10-fbdev.conf.disabled"

mkdir -p "$xorg_dir"

# kms/auto park the forced-fbdev snippet; safe/fbdev put it back if it was
# parked; anything else is logged and the configuration is left alone.
case "$mode" in
    kms|auto)
        if [ ! -f "$fbdev_conf" ]; then
            log "mode=${mode}; fbdev config already disabled"
        else
            mv "$fbdev_conf" "$fbdev_park"
            log "mode=${mode}; disabled forced fbdev config"
        fi
        ;;
    safe|fbdev|"")
        if [ ! -f "$fbdev_park" ] || [ -f "$fbdev_conf" ]; then
            log "mode=${mode}; keeping forced fbdev config"
        else
            mv "$fbdev_park" "$fbdev_conf"
            log "mode=${mode}; restored forced fbdev config"
        fi
        ;;
    *)
        log "unknown bee.display=${mode}; keeping forced fbdev config"
        ;;
esac
|
||||||
102
iso/overlay/usr/local/bin/bee-gpu-burn
Normal file
102
iso/overlay/usr/local/bin/bee-gpu-burn
Normal file
@@ -0,0 +1,102 @@
|
|||||||
|
#!/bin/sh
# bee-gpu-burn: run the bee-gpu-burn worker on every selected NVIDIA GPU in
# parallel and print a per-GPU result followed by the worker's log.
#
# usage: bee-gpu-burn [--seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]
#   --seconds|-t N   value passed to each worker as --seconds (default 5)
#   --size-mb|-m N   value passed to each worker as --size-mb; 0 or less means
#                    95% of the GPU's total memory as reported by nvidia-smi,
#                    or 512 when that cannot be read
#   --devices LIST   comma-separated GPU indices to use (default: all)
#   --exclude LIST   comma-separated GPU indices to skip
#
# Exit status: 0 when every worker succeeded, 1 on any failure, 2 on usage
# errors, 130/143 when interrupted by INT/TERM.
set -eu

# Deliberately not called SECONDS: bash keeps SECONDS as a running timer even
# when it is invoked as sh, so the configured value would drift before use.
DURATION_SEC=5
SIZE_MB=0
DEVICES=""
EXCLUDE=""
WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"

# Print the usage line to stderr and exit with status 2.
usage() {
    echo "usage: $0 [--seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
    exit 2
}

# Turn a comma-separated list into a whitespace-free, numerically sorted,
# de-duplicated comma-separated list.
normalize_list() {
    echo "${1:-}" | tr ',' '\n' | sed 's/[[:space:]]//g' | awk 'NF' | sort -n | uniq | paste -sd, -
}

# Return 0 when $1 is an element of the comma-separated list $2.
# Literal comparison: the needle is never interpreted as a regular expression.
contains_csv() {
    case ",${2:-}," in
        *",$1,"*) return 0 ;;
    esac
    return 1
}

while [ "$#" -gt 0 ]; do
    case "$1" in
        --seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
        --size-mb|-m) [ "$#" -ge 2 ] || usage; SIZE_MB="$2"; shift 2 ;;
        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
        *) usage ;;
    esac
done

[ -x "${WORKER}" ] || { echo "bee-gpu-burn worker not found: ${WORKER}" >&2; exit 1; }

ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
[ -n "${ALL_DEVICES}" ] || { echo "nvidia-smi found no NVIDIA GPUs" >&2; exit 1; }

DEVICES=$(normalize_list "${DEVICES}")
EXCLUDE=$(normalize_list "${EXCLUDE}")
SELECTED="${DEVICES}"
if [ -z "${SELECTED}" ]; then
    SELECTED="${ALL_DEVICES}"
fi

# Apply the exclude list to the selection.
FINAL=""
for id in $(echo "${SELECTED}" | tr ',' ' '); do
    [ -n "${id}" ] || continue
    if contains_csv "${id}" "${EXCLUDE}"; then
        continue
    fi
    if [ -z "${FINAL}" ]; then
        FINAL="${id}"
    else
        FINAL="${FINAL},${id}"
    fi
done

[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }

echo "loader=bee-gpu-burn"
echo "selected_gpus=${FINAL}"

TMP_DIR=$(mktemp -d)
WORKER_PIDS=""

# Terminate every worker started so far. Needed because background jobs of a
# non-interactive shell ignore SIGINT and are not signalled when only this
# script receives TERM, so they would otherwise keep loading the GPUs.
stop_workers() {
    for worker_pid in ${WORKER_PIDS}; do
        kill "${worker_pid}" 2>/dev/null || true
    done
}

trap 'rm -rf "${TMP_DIR}"' EXIT
trap 'stop_workers; exit 130' INT
trap 'stop_workers; exit 143' TERM

# Start one worker per GPU; each entry of WORKERS is "pid:gpu:logfile".
WORKERS=""
for id in $(echo "${FINAL}" | tr ',' ' '); do
    log="${TMP_DIR}/gpu-${id}.log"
    gpu_size_mb="${SIZE_MB}"
    if [ "${gpu_size_mb}" -le 0 ] 2>/dev/null; then
        total_mb=$(nvidia-smi --id="${id}" --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null | tr -d '[:space:]')
        if [ -n "${total_mb}" ] && [ "${total_mb}" -gt 0 ] 2>/dev/null; then
            gpu_size_mb=$(( total_mb * 95 / 100 ))
        else
            gpu_size_mb=512
        fi
    fi
    echo "starting gpu ${id} size=${gpu_size_mb}MB"
    "${WORKER}" --device "${id}" --seconds "${DURATION_SEC}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
    pid=$!
    WORKER_PIDS="${WORKER_PIDS} ${pid}"
    WORKERS="${WORKERS} ${pid}:${id}:${log}"
done

# Collect results in start order and replay each worker's log with a prefix.
status=0
for spec in ${WORKERS}; do
    pid=${spec%%:*}
    rest=${spec#*:}
    id=${rest%%:*}
    log=${rest#*:}
    if wait "${pid}"; then
        echo "gpu ${id} finished: OK"
    else
        rc=$?
        echo "gpu ${id} finished: FAILED rc=${rc}"
        status=1
    fi
    sed "s/^/[gpu ${id}] /" "${log}" || true
done

exit "${status}"
|
||||||
@@ -12,17 +12,55 @@
|
|||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
|
usage() {
|
||||||
|
cat >&2 <<'EOF'
|
||||||
|
Usage: bee-install <device> [logfile]
|
||||||
|
|
||||||
|
Installs the live system to a local disk (WIPES the target).
|
||||||
|
|
||||||
|
device Target block device, e.g. /dev/sda or /dev/nvme0n1
|
||||||
|
Must be a hard disk or NVMe — NOT a CD-ROM (/dev/sr*)
|
||||||
|
logfile Optional path for progress log (default: /tmp/bee-install.log)
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
bee-install /dev/sda
|
||||||
|
bee-install /dev/nvme0n1
|
||||||
|
bee-install /dev/sdb /tmp/my-install.log
|
||||||
|
|
||||||
|
WARNING: ALL DATA ON <device> WILL BE ERASED.
|
||||||
|
|
||||||
|
Layout (UEFI): GPT — partition 1: EFI 512MB vfat, partition 2: root ext4
|
||||||
|
Layout (BIOS): MBR — partition 1: root ext4
|
||||||
|
EOF
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
DEVICE="${1:-}"
|
DEVICE="${1:-}"
|
||||||
LOGFILE="${2:-/tmp/bee-install.log}"
|
LOGFILE="${2:-/tmp/bee-install.log}"
|
||||||
|
|
||||||
if [ -z "$DEVICE" ]; then
|
if [ -z "$DEVICE" ] || [ "$DEVICE" = "--help" ] || [ "$DEVICE" = "-h" ]; then
|
||||||
echo "Usage: bee-install <device> [logfile]" >&2
|
usage
|
||||||
exit 1
|
|
||||||
fi
|
fi
|
||||||
if [ ! -b "$DEVICE" ]; then
|
if [ ! -b "$DEVICE" ]; then
|
||||||
echo "ERROR: $DEVICE is not a block device" >&2
|
echo "ERROR: $DEVICE is not a block device" >&2
|
||||||
|
echo "Run 'lsblk' to list available disks." >&2
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
# Block CD-ROM devices
|
||||||
|
case "$DEVICE" in
|
||||||
|
/dev/sr*|/dev/scd*)
|
||||||
|
echo "ERROR: $DEVICE is a CD-ROM/optical device — cannot install to it." >&2
|
||||||
|
echo "Run 'lsblk' to find the target disk (e.g. /dev/sda, /dev/nvme0n1)." >&2
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
# Check required tools
|
||||||
|
for tool in parted mkfs.vfat mkfs.ext4 unsquashfs grub-install update-grub; do
|
||||||
|
if ! command -v "$tool" >/dev/null 2>&1; then
|
||||||
|
echo "ERROR: required tool not found: $tool" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
SQUASHFS="/run/live/medium/live/filesystem.squashfs"
|
SQUASHFS="/run/live/medium/live/filesystem.squashfs"
|
||||||
if [ ! -f "$SQUASHFS" ]; then
|
if [ ! -f "$SQUASHFS" ]; then
|
||||||
@@ -158,20 +196,56 @@ mount --bind /sys "${MOUNT_ROOT}/sys"
|
|||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
log "--- Step 7/7: Installing GRUB bootloader ---"
|
log "--- Step 7/7: Installing GRUB bootloader ---"
|
||||||
|
|
||||||
|
# Helper: run a chroot command, log all output, return its exit code.
|
||||||
|
# Needed because "cmd | while" pipelines hide the exit code of cmd.
|
||||||
|
# Run a command inside the chroot at MOUNT_ROOT, send its combined
# stdout/stderr line by line to log, and return the command's exit status.
# Arguments: the command and its arguments, passed to chroot unchanged.
chroot_log() {
    local rc=0
    local out
    out=$(chroot "$MOUNT_ROOT" "$@" 2>&1) || rc=$?
    # printf instead of echo: output such as "-n" or "-e" must be logged
    # verbatim, and a silent command must not produce an empty log line.
    if [ -n "$out" ]; then
        printf '%s\n' "$out" | while IFS= read -r line; do log " $line"; done
    fi
    return "$rc"
}
|
||||||
|
|
||||||
if [ "$UEFI" = "1" ]; then
|
if [ "$UEFI" = "1" ]; then
|
||||||
chroot "$MOUNT_ROOT" grub-install \
|
# Primary attempt: write EFI NVRAM entry (requires writable efivars)
|
||||||
--target=x86_64-efi \
|
if ! chroot_log grub-install \
|
||||||
--efi-directory=/boot/efi \
|
--target=x86_64-efi \
|
||||||
--bootloader-id=bee \
|
--efi-directory=/boot/efi \
|
||||||
--recheck 2>&1 | while read -r line; do log " $line"; done || true
|
--bootloader-id=bee \
|
||||||
|
--recheck; then
|
||||||
|
log " WARNING: grub-install (with NVRAM) failed — retrying with --no-nvram"
|
||||||
|
# --no-nvram: write grubx64.efi but skip EFI variable update.
|
||||||
|
# Needed on headless servers where efivars is read-only or unavailable.
|
||||||
|
chroot_log grub-install \
|
||||||
|
--target=x86_64-efi \
|
||||||
|
--efi-directory=/boot/efi \
|
||||||
|
--bootloader-id=bee \
|
||||||
|
--no-nvram \
|
||||||
|
--recheck || log " WARNING: grub-install --no-nvram also failed — check logs"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Always install the UEFI fallback path EFI/BOOT/BOOTX64.EFI.
|
||||||
|
# Many UEFI implementations (especially server BMCs and some firmware)
|
||||||
|
# ignore the NVRAM boot entry and only look for this path.
|
||||||
|
GRUB_EFI="${MOUNT_ROOT}/boot/efi/EFI/bee/grubx64.efi"
|
||||||
|
FALLBACK_DIR="${MOUNT_ROOT}/boot/efi/EFI/BOOT"
|
||||||
|
if [ -f "$GRUB_EFI" ]; then
|
||||||
|
mkdir -p "$FALLBACK_DIR"
|
||||||
|
cp "$GRUB_EFI" "${FALLBACK_DIR}/BOOTX64.EFI"
|
||||||
|
log " Fallback EFI binary installed: EFI/BOOT/BOOTX64.EFI"
|
||||||
|
else
|
||||||
|
log " WARNING: grubx64.efi not found at $GRUB_EFI — UEFI fallback path not set"
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
chroot "$MOUNT_ROOT" grub-install \
|
chroot_log grub-install \
|
||||||
--target=i386-pc \
|
--target=i386-pc \
|
||||||
--recheck \
|
--recheck \
|
||||||
"$DEVICE" 2>&1 | while read -r line; do log " $line"; done || true
|
"$DEVICE" || log " WARNING: grub-install (BIOS) failed — check logs"
|
||||||
fi
|
fi
|
||||||
chroot "$MOUNT_ROOT" update-grub 2>&1 | while read -r line; do log " $line"; done || true
|
|
||||||
log " GRUB installed."
|
chroot_log update-grub || log " WARNING: update-grub failed — check logs"
|
||||||
|
log " GRUB step complete."
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
# Cleanup
|
# Cleanup
|
||||||
|
|||||||
205
iso/overlay/usr/local/bin/bee-john-gpu-stress
Normal file
205
iso/overlay/usr/local/bin/bee-john-gpu-stress
Normal file
@@ -0,0 +1,205 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
set -eu
|
||||||
|
|
||||||
|
SECONDS=300
|
||||||
|
DEVICES=""
|
||||||
|
EXCLUDE=""
|
||||||
|
FORMAT=""
|
||||||
|
JOHN_DIR="/usr/local/lib/bee/john/run"
|
||||||
|
JOHN_BIN="${JOHN_DIR}/john"
|
||||||
|
export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
|
||||||
|
export LD_LIBRARY_PATH="/usr/lib:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
|
||||||
|
|
||||||
|
usage() {
|
||||||
|
echo "usage: $0 [--seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
|
||||||
|
exit 2
|
||||||
|
}
|
||||||
|
|
||||||
|
normalize_list() {
|
||||||
|
echo "${1:-}" | tr ',' '\n' | sed 's/[[:space:]]//g' | awk 'NF' | sort -n | uniq | paste -sd, -
|
||||||
|
}
|
||||||
|
|
||||||
|
# Return 0 when $1 is an element of the comma-separated list $2, 1 otherwise.
# The comparison is literal: unlike a grep-based check, characters such as
# '.' or '*' in the needle are not treated as regular-expression syntax, and
# no subprocess is spawned.
contains_csv() {
    needle="$1"
    haystack="${2:-}"
    case ",${haystack}," in
        *",${needle},"*) return 0 ;;
    esac
    return 1
}
|
||||||
|
|
||||||
|
# Dump OpenCL/NVIDIA troubleshooting details to stderr: the ICD vendor files
# and their contents, the /dev/nvidia* nodes, matching linker-cache entries,
# `clinfo -l` when clinfo is installed, and John's OpenCL device list.
# Every probe is best-effort; a failing probe never aborts the dump.
# Runs ./john, so the current directory must contain the john binary.
show_opencl_diagnostics() {
    {
        echo "-- OpenCL ICD vendors --"
        if [ ! -d /etc/OpenCL/vendors ]; then
            echo " /etc/OpenCL/vendors is missing"
        else
            ls -l /etc/OpenCL/vendors || true
            for icd in /etc/OpenCL/vendors/*.icd; do
                [ -f "${icd}" ] || continue
                echo " file: ${icd}"
                sed 's/^/ /' "${icd}" || true
            done
        fi

        echo "-- NVIDIA device nodes --"
        ls -l /dev/nvidia* || true

        echo "-- ldconfig OpenCL/NVIDIA --"
        ldconfig -p 2>/dev/null | grep 'libOpenCL\|libcuda\|libnvidia-opencl' || true

        if command -v clinfo >/dev/null 2>&1; then
            echo "-- clinfo -l --"
            clinfo -l || true
        fi

        echo "-- john --list=opencl-devices --"
        ./john --list=opencl-devices || true
    } >&2
}
|
||||||
|
|
||||||
|
# Re-run bee-nvidia-load (when it is on PATH) and rebuild the linker cache.
# Returns 1 immediately when not running as root; otherwise returns 0 even if
# one of the two steps fails, since both are best-effort.
refresh_nvidia_runtime() {
    [ "$(id -u)" = "0" ] || return 1

    if command -v bee-nvidia-load >/dev/null 2>&1; then
        bee-nvidia-load >/dev/null 2>&1 || true
    fi

    ldconfig >/dev/null 2>&1 || true
    return 0
}
|
||||||
|
|
||||||
|
# Make sure the nvidia_uvm kernel module is loaded.
# Returns 0 when lsmod already lists it, or after a successful insmod of
# /usr/local/lib/nvidia/nvidia-uvm.ko; in the latter case the
# /dev/nvidia-uvm and /dev/nvidia-uvm-tools nodes are created (best-effort)
# using the major number listed in /proc/devices.
# Returns 1 when the module is absent and the caller is not root, the .ko
# file does not exist, or insmod fails.
ensure_nvidia_uvm() {
    if lsmod 2>/dev/null | grep -q '^nvidia_uvm '; then
        return 0
    fi
    [ "$(id -u)" = "0" ] || return 1

    ko="/usr/local/lib/nvidia/nvidia-uvm.ko"
    if [ ! -f "${ko}" ]; then
        return 1
    fi
    insmod "${ko}" >/dev/null 2>&1 || return 1

    uvm_major=$(grep -m1 ' nvidia-uvm$' /proc/devices | awk '{print $1}')
    if [ -z "${uvm_major}" ]; then
        return 0
    fi
    mknod -m 666 /dev/nvidia-uvm c "${uvm_major}" 0 2>/dev/null || true
    mknod -m 666 /dev/nvidia-uvm-tools c "${uvm_major}" 1 2>/dev/null || true
    return 0
}
|
||||||
|
|
||||||
|
# Check that John can see at least one OpenCL device, repairing the runtime
# step by step if it cannot.
# John is probed up to three times: as-is, after refresh_nvidia_runtime, and
# after ensure_nvidia_uvm. A repair step that returns non-zero skips its probe.
# Returns 0 as soon as a probe prints "Device #"; otherwise explains what is
# missing on stderr, dumps diagnostics and returns 1.
# Runs ./john, so the current directory must contain the john binary.
ensure_opencl_ready() {
    for step in probe refresh_nvidia_runtime ensure_nvidia_uvm; do
        if [ "${step}" != "probe" ] && ! "${step}"; then
            continue
        fi
        out=$(./john --list=opencl-devices 2>&1 || true)
        if echo "${out}" | grep -q "Device #"; then
            return 0
        fi
    done

    echo "OpenCL devices are not available for John." >&2
    lsmod 2>/dev/null | grep -q '^nvidia_uvm ' || echo "nvidia_uvm is not loaded." >&2
    [ -e /dev/nvidia-uvm ] || echo "/dev/nvidia-uvm is missing." >&2
    show_opencl_diagnostics
    return 1
}
|
||||||
|
|
||||||
|
while [ "$#" -gt 0 ]; do
|
||||||
|
case "$1" in
|
||||||
|
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
|
||||||
|
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
||||||
|
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
||||||
|
--format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
|
||||||
|
*) usage ;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
[ -x "${JOHN_BIN}" ] || { echo "john binary not found: ${JOHN_BIN}" >&2; exit 1; }
|
||||||
|
|
||||||
|
ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
|
||||||
|
[ -n "${ALL_DEVICES}" ] || { echo "nvidia-smi found no NVIDIA GPUs" >&2; exit 1; }
|
||||||
|
|
||||||
|
DEVICES=$(normalize_list "${DEVICES}")
|
||||||
|
EXCLUDE=$(normalize_list "${EXCLUDE}")
|
||||||
|
SELECTED="${DEVICES}"
|
||||||
|
if [ -z "${SELECTED}" ]; then
|
||||||
|
SELECTED="${ALL_DEVICES}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
FINAL=""
|
||||||
|
for id in $(echo "${SELECTED}" | tr ',' ' '); do
|
||||||
|
[ -n "${id}" ] || continue
|
||||||
|
if contains_csv "${id}" "${EXCLUDE}"; then
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
if [ -z "${FINAL}" ]; then
|
||||||
|
FINAL="${id}"
|
||||||
|
else
|
||||||
|
FINAL="${FINAL},${id}"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }
|
||||||
|
|
||||||
|
JOHN_DEVICES=""
|
||||||
|
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
||||||
|
opencl_id=$((id + 1))
|
||||||
|
if [ -z "${JOHN_DEVICES}" ]; then
|
||||||
|
JOHN_DEVICES="${opencl_id}"
|
||||||
|
else
|
||||||
|
JOHN_DEVICES="${JOHN_DEVICES},${opencl_id}"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "loader=john"
|
||||||
|
echo "selected_gpus=${FINAL}"
|
||||||
|
echo "john_devices=${JOHN_DEVICES}"
|
||||||
|
|
||||||
|
cd "${JOHN_DIR}"
|
||||||
|
|
||||||
|
ensure_opencl_ready || exit 1
|
||||||
|
|
||||||
|
# Print the John format to benchmark with.
# An explicit FORMAT is printed as-is. Otherwise each built-in candidate is
# tried in order with a one-second `john --test` on JOHN_DEVICES, and the
# first one that succeeds is printed. Returns 1 when no candidate works.
# Runs ./john, so the current directory must contain the john binary.
choose_format() {
    if [ -n "${FORMAT}" ]; then
        echo "${FORMAT}"
        return 0
    fi

    for candidate in \
        sha512crypt-opencl \
        pbkdf2-hmac-sha512-opencl \
        7z-opencl \
        sha256crypt-opencl \
        md5crypt-opencl
    do
        ./john --test=1 --format="${candidate}" --devices="${JOHN_DEVICES}" >/dev/null 2>&1 || continue
        echo "${candidate}"
        return 0
    done
    return 1
}
|
||||||
|
|
||||||
|
# Pick the benchmark format; without one there is nothing to run.
CHOSEN_FORMAT=$(choose_format) || {
    echo "no suitable john OpenCL format found" >&2
    ./john --list=opencl-devices >&2 || true
    exit 1
}

echo "format=${CHOSEN_FORMAT}"

PIDS=""

# Terminate every john process started so far. Background jobs of a
# non-interactive shell ignore SIGINT and are not signalled when only this
# script receives TERM, so without this they would keep loading the GPUs
# for the full test duration after the script is stopped.
stop_john_workers() {
    for john_pid in ${PIDS}; do
        kill "${john_pid}" 2>/dev/null || true
    done
}
trap 'stop_john_workers; exit 130' INT
trap 'stop_john_workers; exit 143' TERM

# One john process per OpenCL device, started 3 seconds apart.
_first=1
for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
    [ "${_first}" = "1" ] || sleep 3
    _first=0
    ./john --test="${SECONDS}" --format="${CHOSEN_FORMAT}" --devices="${opencl_id}" &
    PIDS="${PIDS} $!"
done

# Wait for all of them and count the ones that exited non-zero.
FAIL=0
for pid in ${PIDS}; do
    wait "${pid}" || FAIL=$((FAIL+1))
done
[ "${FAIL}" -eq 0 ] || { echo "john: ${FAIL} device(s) failed" >&2; exit 1; }
|
||||||
@@ -17,7 +17,7 @@ mkdir -p "$(dirname "$log_file")"
|
|||||||
serial_sink() {
|
serial_sink() {
|
||||||
local tty="$1"
|
local tty="$1"
|
||||||
if [ -w "$tty" ]; then
|
if [ -w "$tty" ]; then
|
||||||
cat > "$tty"
|
cat > "$tty" 2>/dev/null || true
|
||||||
else
|
else
|
||||||
cat > /dev/null
|
cat > /dev/null
|
||||||
fi
|
fi
|
||||||
|
|||||||
91
iso/overlay/usr/local/bin/bee-nccl-gpu-stress
Normal file
91
iso/overlay/usr/local/bin/bee-nccl-gpu-stress
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
#!/bin/sh
# bee-nccl-gpu-stress: run all_reduce_perf repeatedly on the selected NVIDIA
# GPUs until a time budget is used up.
set -eu

# Time budget in seconds; overridden by --seconds / -t.
# NOTE(review): SECONDS is a special self-incrementing variable in bash and
# ksh; confirm /bin/sh on the image is a shell (e.g. dash) that treats it as a
# plain variable.
SECONDS=300

# Comma-separated GPU index lists; overridden by --devices / --exclude.
DEVICES=
EXCLUDE=

# Passed to all_reduce_perf as -b, -e, -f and --iters respectively.
MIN_BYTES=512M
MAX_BYTES=4G
FACTOR=2
ITERS=20

ALL_REDUCE_BIN=/usr/local/bin/all_reduce_perf
|
||||||
|
|
||||||
|
# usage prints the command synopsis to stderr and exits with status 2.
usage() {
    printf 'usage: %s [--seconds N] [--devices 0,1] [--exclude 2,3]\n' "$0" >&2
    exit 2
}
|
||||||
|
|
||||||
|
# normalize_list canonicalises a comma-separated list given as $1 (optional).
# All whitespace is stripped, empty entries are dropped, the remaining entries
# are sorted numerically and de-duplicated, and the result is printed as a
# single comma-joined line.
normalize_list() {
    raw="${1:-}"
    echo "${raw}" \
        | tr ',' '\n' \
        | sed -e 's/[[:space:]]//g' -e '/^$/d' \
        | sort -n \
        | uniq \
        | paste -sd, -
}
|
||||||
|
|
||||||
|
# contains_csv reports whether $1 is an exact entry of the comma-separated
# list $2 (optional, defaults to empty).
# Returns 0 when the entry is present, 1 otherwise.
#
# The comparison is a literal string match: both sides are wrapped in commas
# so that "1" does not match "10", and the needle is quoted inside the case
# pattern so characters such as "." or "*" are not treated as wildcards.
contains_csv() {
    needle="$1"
    haystack="${2:-}"
    case ",${haystack}," in
        *",${needle},"*) return 0 ;;
    esac
    return 1
}
|
||||||
|
|
||||||
|
# Parse command-line options. Every option takes exactly one value; a missing
# value or an unknown option prints the usage text and exits.
while [ "$#" -gt 0 ]; do
    case "$1" in
        --seconds|-t)
            [ "$#" -ge 2 ] || usage
            # The value is used later in shell arithmetic, so reject anything
            # that is not a plain run of digits now instead of failing there
            # with a cryptic arithmetic error.
            case "$2" in
                ''|*[!0-9]*)
                    echo "invalid --seconds value: $2" >&2
                    usage
                    ;;
            esac
            SECONDS="$2"
            shift 2
            ;;
        --devices)
            [ "$#" -ge 2 ] || usage
            DEVICES="$2"
            shift 2
            ;;
        --exclude)
            [ "$#" -ge 2 ] || usage
            EXCLUDE="$2"
            shift 2
            ;;
        *)
            usage
            ;;
    esac
done
|
||||||
|
|
||||||
|
# The benchmark binary must exist and be executable.
if [ ! -x "${ALL_REDUCE_BIN}" ]; then
    echo "all_reduce_perf not found: ${ALL_REDUCE_BIN}" >&2
    exit 1
fi

# Ask nvidia-smi for every GPU index and join them with commas. Errors from
# nvidia-smi are hidden; an empty result is treated as "no GPUs".
ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
if [ -z "${ALL_DEVICES}" ]; then
    echo "nvidia-smi found no NVIDIA GPUs" >&2
    exit 1
fi

DEVICES=$(normalize_list "${DEVICES}")
EXCLUDE=$(normalize_list "${EXCLUDE}")

# Without an explicit --devices list every detected GPU is a candidate.
SELECTED="${DEVICES:-${ALL_DEVICES}}"

# Keep the candidates that are not excluded, preserving their order.
FINAL=""
for id in $(echo "${SELECTED}" | tr ',' ' '); do
    if [ -z "${id}" ]; then
        continue
    fi
    if contains_csv "${id}" "${EXCLUDE}"; then
        continue
    fi
    FINAL="${FINAL:+${FINAL},}${id}"
done

if [ -z "${FINAL}" ]; then
    echo "no NVIDIA GPUs selected after filters" >&2
    exit 1
fi
|
||||||
|
|
||||||
|
# Number of non-empty entries in FINAL; passed to all_reduce_perf as -g.
GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF { n++ } END { print n + 0 }')
if [ "${GPU_COUNT}" -le 0 ]; then
    echo "selected GPU count is zero" >&2
    exit 1
fi

# Report the effective configuration as key=value lines.
echo "loader=nccl"
echo "selected_gpus=${FINAL}"
echo "gpu_count=${GPU_COUNT}"
echo "range=${MIN_BYTES}..${MAX_BYTES}"
echo "iters=${ITERS}"

# Repeat all_reduce_perf until the wall-clock deadline has passed. The
# deadline is only checked between rounds, so a round that has started is
# allowed to finish. Because of `set -e`, a failing round ends the script.
deadline=$(( $(date +%s) + SECONDS ))
round=0

while true; do
    now=$(date +%s)
    [ "${now}" -lt "${deadline}" ] || break

    round=$((round + 1))
    remaining=$((deadline - now))
    echo "round=${round} remaining_sec=${remaining}"

    # Restrict the run to the selected GPUs via CUDA_VISIBLE_DEVICES.
    CUDA_VISIBLE_DEVICES="${FINAL}" "${ALL_REDUCE_BIN}" \
        -b "${MIN_BYTES}" -e "${MAX_BYTES}" -f "${FACTOR}" \
        -g "${GPU_COUNT}" --iters "${ITERS}"
done
|
||||||
@@ -6,25 +6,66 @@ LOG_PREFIX="bee-network"
|
|||||||
|
|
||||||
log() { echo "[$LOG_PREFIX] $*"; }
|
log() { echo "[$LOG_PREFIX] $*"; }
|
||||||
|
|
||||||
# find physical interfaces: exclude lo and virtual (docker/virbr/veth/tun/tap)
|
list_interfaces() {
|
||||||
interfaces=$(ip -o link show \
|
ip -o link show \
|
||||||
| awk -F': ' '{print $2}' \
|
| awk -F': ' '{print $2}' \
|
||||||
| grep -v '^lo$' \
|
| grep -v '^lo$' \
|
||||||
| grep -vE '^(docker|virbr|veth|tun|tap|br-|bond|dummy)' \
|
| grep -vE '^(docker|virbr|veth|tun|tap|br-|bond|dummy)' \
|
||||||
| sort)
|
| sort
|
||||||
|
}
|
||||||
|
|
||||||
if [ -z "$interfaces" ]; then
|
# Give udev a short chance to expose late NICs before the first scan.
|
||||||
|
# A failed or timed-out settle is only logged; it never aborts the script.
if command -v udevadm >/dev/null 2>&1; then
    udevadm settle --timeout=5 >/dev/null 2>&1 || log "WARN: udevadm settle timed out"
fi

# Space-separated names of interfaces already handled, and how many there are.
started_ifaces=""
started_count=0
scan_pass=1

# Scan for interfaces up to three times, two seconds apart, so NICs that show
# up late are still picked up while the total added delay stays bounded.
# Each interface is brought up and handed to dhclient at most once.
while [ "$scan_pass" -le 3 ]; do
    interfaces=$(list_interfaces)

    for iface in $interfaces; do
        # Skip interfaces handled in an earlier pass.
        case " $started_ifaces " in
            *" $iface "*) continue ;;
        esac

        log "bringing up $iface"
        ip link set "$iface" up || { log "WARN: could not bring up $iface"; continue; }

        # Carrier state is informational only; DHCP is started either way.
        carrier=$(cat "/sys/class/net/$iface/carrier" 2>/dev/null || true)
        case "$carrier" in
            1) log "carrier detected on $iface" ;;
            *) log "carrier not detected yet on $iface" ;;
        esac

        # dhclient runs in the background so boot is not blocked; its verbose
        # output stays in the service log.
        dhclient -4 -v -nw "$iface" &
        log "DHCP started for $iface (pid $!)"

        started_ifaces="$started_ifaces $iface"
        started_count=$((started_count + 1))
    done

    # No sleep after the final pass.
    [ "$scan_pass" -lt 3 ] || break
    scan_pass=$((scan_pass + 1))
    sleep 2
done
|
||||||
|
|
||||||
|
if [ "$started_count" -eq 0 ]; then
|
||||||
log "no physical interfaces found"
|
log "no physical interfaces found"
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
for iface in $interfaces; do
|
log "done (interfaces started: $started_count)"
|
||||||
log "bringing up $iface"
|
|
||||||
ip link set "$iface" up || { log "WARN: could not bring up $iface"; continue; }
|
|
||||||
|
|
||||||
# DHCP in background — non-blocking, keep dhclient verbose output in the service log.
|
|
||||||
dhclient -4 -v -nw "$iface" &
|
|
||||||
log "DHCP started for $iface (pid $!)"
|
|
||||||
done
|
|
||||||
|
|
||||||
log "done"
|
|
||||||
|
|||||||
@@ -59,15 +59,28 @@ load_module() {
|
|||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# load_host_module loads kernel module $1 with modprobe, discarding modprobe's
# output. Logs and returns 0 on success; returns 1 silently on failure.
load_host_module() {
    mod="$1"
    modprobe "$mod" >/dev/null 2>&1 || return 1
    log "host module loaded: $mod"
    return 0
}
|
||||||
|
|
||||||
case "$nvidia_mode" in
|
case "$nvidia_mode" in
|
||||||
normal|full)
|
normal|full)
|
||||||
if ! load_module nvidia; then
|
if ! load_module nvidia; then
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
# nvidia-modeset on some server kernels needs ACPI video helper symbols
|
||||||
|
# exported by the generic "video" module. Best-effort only; compute paths
|
||||||
|
# remain functional even if display-related modules stay absent.
|
||||||
|
load_host_module video || true
|
||||||
load_module nvidia-modeset || true
|
load_module nvidia-modeset || true
|
||||||
load_module nvidia-uvm || true
|
load_module nvidia-uvm || true
|
||||||
;;
|
;;
|
||||||
gsp-off|safe|*)
|
gsp-off|safe)
|
||||||
# NVIDIA documents that GSP firmware is enabled by default on newer GPUs and can
|
# NVIDIA documents that GSP firmware is enabled by default on newer GPUs and can
|
||||||
# be disabled via NVreg_EnableGpuFirmware=0. Safe mode keeps the live ISO on the
|
# be disabled via NVreg_EnableGpuFirmware=0. Safe mode keeps the live ISO on the
|
||||||
# conservative path for platforms where full boot-time GSP init is unstable.
|
# conservative path for platforms where full boot-time GSP init is unstable.
|
||||||
@@ -76,6 +89,15 @@ case "$nvidia_mode" in
|
|||||||
fi
|
fi
|
||||||
log "GSP-off mode: skipping nvidia-modeset and nvidia-uvm during boot"
|
log "GSP-off mode: skipping nvidia-modeset and nvidia-uvm during boot"
|
||||||
;;
|
;;
|
||||||
|
nomsi|*)
|
||||||
|
# nomsi: disable MSI-X/MSI interrupts — use when RmInitAdapter fails with
|
||||||
|
# "Failed to enable MSI-X" on one or more GPUs (IOMMU group interrupt limits).
|
||||||
|
# NVreg_EnableMSI=0 forces legacy INTx interrupts for all GPUs.
|
||||||
|
if ! load_module nvidia NVreg_EnableGpuFirmware=0 NVreg_EnableMSI=0; then
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
log "nomsi mode: MSI-X disabled (NVreg_EnableMSI=0), skipping nvidia-modeset and nvidia-uvm"
|
||||||
|
;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
# Create /dev/nvidia* device nodes (udev rules absent since we use .run installer)
|
# Create /dev/nvidia* device nodes (udev rules absent since we use .run installer)
|
||||||
@@ -105,4 +127,19 @@ fi
|
|||||||
ldconfig 2>/dev/null || true
|
ldconfig 2>/dev/null || true
|
||||||
log "ldconfig refreshed"
|
log "ldconfig refreshed"
|
||||||
|
|
||||||
|
# Bring up the DCGM host engine so that dcgmi can discover GPUs: without a
# running nv-hostengine, dcgmi reports "group is empty" even when GPUs and
# modules are present. An instance that is already running (for example one
# started by a dcgm systemd service) is left alone.
if ! command -v nv-hostengine >/dev/null 2>&1; then
    log "WARN: nv-hostengine not found — dcgmi diagnostics will not work"
elif pgrep -x nv-hostengine >/dev/null 2>&1; then
    log "nv-hostengine already running — skipping"
else
    nv-hostengine
    log "nv-hostengine started"
fi
|
||||||
|
|
||||||
log "done"
|
log "done"
|
||||||
|
|||||||
@@ -2,23 +2,29 @@
|
|||||||
# openbox session: launch tint2 taskbar + chromium, then openbox as WM.
|
# openbox session: launch tint2 taskbar + chromium, then openbox as WM.
|
||||||
# This file is used as an xinitrc by bee-desktop.
|
# This file is used as an xinitrc by bee-desktop.
|
||||||
|
|
||||||
# Wait for bee-web to be accepting connections (up to 15 seconds)
|
# Disable screensaver and DPMS
|
||||||
|
xset s off
|
||||||
|
xset -dpms
|
||||||
|
xset s noblank
|
||||||
|
|
||||||
|
tint2 &
|
||||||
|
|
||||||
|
# Wait up to 120s for bee-web to bind. The web server starts immediately now
|
||||||
|
# (audit is deferred), so this should succeed in a few seconds on most hardware.
|
||||||
i=0
|
i=0
|
||||||
while [ $i -lt 15 ]; do
|
while [ $i -lt 120 ]; do
|
||||||
if curl -sf http://localhost/healthz >/dev/null 2>&1; then
|
if curl -sf http://localhost/healthz >/dev/null 2>&1; then break; fi
|
||||||
break
|
|
||||||
fi
|
|
||||||
sleep 1
|
sleep 1
|
||||||
i=$((i+1))
|
i=$((i+1))
|
||||||
done
|
done
|
||||||
|
|
||||||
tint2 &
|
|
||||||
chromium \
|
chromium \
|
||||||
--disable-infobars \
|
--disable-infobars \
|
||||||
--disable-translate \
|
--disable-translate \
|
||||||
--no-first-run \
|
--no-first-run \
|
||||||
--disable-session-crashed-bubble \
|
--disable-session-crashed-bubble \
|
||||||
--disable-features=TranslateUI \
|
--disable-features=TranslateUI \
|
||||||
|
--start-maximized \
|
||||||
http://localhost/ &
|
http://localhost/ &
|
||||||
|
|
||||||
exec openbox
|
exec openbox
|
||||||
|
|||||||
@@ -3,6 +3,11 @@
|
|||||||
# Type 'a' at any prompt to abort, 'b' to go back.
|
# Type 'a' at any prompt to abort, 'b' to go back.
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
|
# Requires root for ip/dhclient/resolv.conf — re-exec via sudo if needed.
|
||||||
|
if [ "$(id -u)" -ne 0 ]; then
|
||||||
|
exec sudo "$0" "$@"
|
||||||
|
fi
|
||||||
|
|
||||||
abort() { echo "Aborted."; exit 0; }
|
abort() { echo "Aborted."; exit 0; }
|
||||||
|
|
||||||
ask() {
|
ask() {
|
||||||
|
|||||||
Reference in New Issue
Block a user