Compare commits
35 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 5ee120158e | |||
| 09fe0e2e9e | |||
| ace1a9dba6 | |||
| 905c581ece | |||
| 7c2a0135d2 | |||
| 407c1cd1c4 | |||
| e15bcc91c5 | |||
| 98f0cf0d52 | |||
| 4db89e9773 | |||
| 3fda18f708 | |||
| ea518abf30 | |||
| 744de588bb | |||
| a3ed9473a3 | |||
| a714c45f10 | |||
| 349e026cfa | |||
| 889fe1dc2f | |||
| befdbf3768 | |||
| ec6a0b292d | |||
| a03312c286 | |||
| e69e9109da | |||
| 413869809d | |||
| f9bd38572a | |||
| 662e3d2cdd | |||
| 126af96780 | |||
| ada15ac777 | |||
| dfb94f9ca6 | |||
| 5857805518 | |||
| 59a1d4b209 | |||
| 0dbfaf6121 | |||
| 5d72d48714 | |||
| 096b4a09ca | |||
| 5d42a92e4c | |||
| 3e54763367 | |||
| f91bce8661 | |||
| 585e6d7311 |
@@ -1,11 +1,13 @@
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
"flag"
|
"flag"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os"
|
"os"
|
||||||
|
"runtime/debug"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"bee/audit/internal/app"
|
"bee/audit/internal/app"
|
||||||
@@ -16,6 +18,37 @@ import (
|
|||||||
|
|
||||||
var Version = "dev"
|
var Version = "dev"
|
||||||
|
|
||||||
|
func buildLabel() string {
|
||||||
|
label := strings.TrimSpace(Version)
|
||||||
|
if label == "" {
|
||||||
|
label = "dev"
|
||||||
|
}
|
||||||
|
if info, ok := debug.ReadBuildInfo(); ok {
|
||||||
|
var revision string
|
||||||
|
var modified bool
|
||||||
|
for _, setting := range info.Settings {
|
||||||
|
switch setting.Key {
|
||||||
|
case "vcs.revision":
|
||||||
|
revision = setting.Value
|
||||||
|
case "vcs.modified":
|
||||||
|
modified = setting.Value == "true"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if revision != "" {
|
||||||
|
short := revision
|
||||||
|
if len(short) > 12 {
|
||||||
|
short = short[:12]
|
||||||
|
}
|
||||||
|
label += " (" + short
|
||||||
|
if modified {
|
||||||
|
label += "+"
|
||||||
|
}
|
||||||
|
label += ")"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return label
|
||||||
|
}
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
|
os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
|
||||||
}
|
}
|
||||||
@@ -139,7 +172,6 @@ func runAudit(args []string, stdout, stderr io.Writer) int {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
func runExport(args []string, stdout, stderr io.Writer) int {
|
func runExport(args []string, stdout, stderr io.Writer) int {
|
||||||
fs := flag.NewFlagSet("export", flag.ContinueOnError)
|
fs := flag.NewFlagSet("export", flag.ContinueOnError)
|
||||||
fs.SetOutput(stderr)
|
fs.SetOutput(stderr)
|
||||||
@@ -299,6 +331,7 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
|
|||||||
|
|
||||||
if err := webui.ListenAndServe(*listenAddr, webui.HandlerOptions{
|
if err := webui.ListenAndServe(*listenAddr, webui.HandlerOptions{
|
||||||
Title: *title,
|
Title: *title,
|
||||||
|
BuildLabel: buildLabel(),
|
||||||
AuditPath: *auditPath,
|
AuditPath: *auditPath,
|
||||||
ExportDir: *exportDir,
|
ExportDir: *exportDir,
|
||||||
App: app.New(platform.New()),
|
App: app.New(platform.New()),
|
||||||
@@ -351,15 +384,15 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
|||||||
case "nvidia":
|
case "nvidia":
|
||||||
archive, err = application.RunNvidiaAcceptancePack("", logLine)
|
archive, err = application.RunNvidiaAcceptancePack("", logLine)
|
||||||
case "memory":
|
case "memory":
|
||||||
archive, err = application.RunMemoryAcceptancePack("", logLine)
|
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", logLine)
|
||||||
case "storage":
|
case "storage":
|
||||||
archive, err = application.RunStorageAcceptancePack("", logLine)
|
archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", logLine)
|
||||||
case "cpu":
|
case "cpu":
|
||||||
dur := *duration
|
dur := *duration
|
||||||
if dur <= 0 {
|
if dur <= 0 {
|
||||||
dur = 60
|
dur = 60
|
||||||
}
|
}
|
||||||
archive, err = application.RunCPUAcceptancePack("", dur, logLine)
|
archive, err = application.RunCPUAcceptancePackCtx(context.Background(), "", dur, logLine)
|
||||||
}
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Error("run sat", "target", target, "err", err)
|
slog.Error("run sat", "target", target, "err", err)
|
||||||
|
|||||||
11
audit/go.mod
11
audit/go.mod
@@ -1,6 +1,6 @@
|
|||||||
module bee/audit
|
module bee/audit
|
||||||
|
|
||||||
go 1.24.0
|
go 1.25.0
|
||||||
|
|
||||||
replace reanimator/chart => ../internal/chart
|
replace reanimator/chart => ../internal/chart
|
||||||
|
|
||||||
@@ -13,5 +13,14 @@ require (
|
|||||||
github.com/dustin/go-humanize v1.0.1 // indirect
|
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||||
github.com/go-analyze/bulk v0.1.3 // indirect
|
github.com/go-analyze/bulk v0.1.3 // indirect
|
||||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
|
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
|
||||||
|
github.com/google/uuid v1.6.0 // indirect
|
||||||
|
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||||
|
github.com/ncruces/go-strftime v1.0.0 // indirect
|
||||||
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
||||||
golang.org/x/image v0.24.0 // indirect
|
golang.org/x/image v0.24.0 // indirect
|
||||||
|
golang.org/x/sys v0.42.0 // indirect
|
||||||
|
modernc.org/libc v1.70.0 // indirect
|
||||||
|
modernc.org/mathutil v1.7.1 // indirect
|
||||||
|
modernc.org/memory v1.11.0 // indirect
|
||||||
|
modernc.org/sqlite v1.48.0 // indirect
|
||||||
)
|
)
|
||||||
|
|||||||
19
audit/go.sum
19
audit/go.sum
@@ -8,11 +8,30 @@ github.com/go-analyze/charts v0.5.26 h1:rSwZikLQuFX6cJzwI8OAgaWZneG1kDYxD857ms00
|
|||||||
github.com/go-analyze/charts v0.5.26/go.mod h1:s1YvQhjiSwtLx1f2dOKfiV9x2TT49nVSL6v2rlRpTbY=
|
github.com/go-analyze/charts v0.5.26/go.mod h1:s1YvQhjiSwtLx1f2dOKfiV9x2TT49nVSL6v2rlRpTbY=
|
||||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
|
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
|
||||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
|
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
|
||||||
|
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||||
|
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||||
|
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||||
|
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||||
|
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
||||||
|
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
||||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||||
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
||||||
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
||||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
||||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
||||||
golang.org/x/image v0.24.0 h1:AN7zRgVsbvmTfNyqIbbOraYL8mSwcKncEj8ofjgzcMQ=
|
golang.org/x/image v0.24.0 h1:AN7zRgVsbvmTfNyqIbbOraYL8mSwcKncEj8ofjgzcMQ=
|
||||||
golang.org/x/image v0.24.0/go.mod h1:4b/ITuLfqYq1hqZcjofwctIhi7sZh2WaCjvsBNjjya8=
|
golang.org/x/image v0.24.0/go.mod h1:4b/ITuLfqYq1hqZcjofwctIhi7sZh2WaCjvsBNjjya8=
|
||||||
|
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
|
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
|
||||||
|
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||||
|
modernc.org/libc v1.70.0 h1:U58NawXqXbgpZ/dcdS9kMshu08aiA6b7gusEusqzNkw=
|
||||||
|
modernc.org/libc v1.70.0/go.mod h1:OVmxFGP1CI/Z4L3E0Q3Mf1PDE0BucwMkcXjjLntvHJo=
|
||||||
|
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
|
||||||
|
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
|
||||||
|
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
|
||||||
|
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
|
||||||
|
modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
|
||||||
|
modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
|
||||||
|
|||||||
@@ -55,6 +55,8 @@ type networkManager interface {
|
|||||||
SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error)
|
SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error)
|
||||||
SetInterfaceState(iface string, up bool) error
|
SetInterfaceState(iface string, up bool) error
|
||||||
GetInterfaceState(iface string) (bool, error)
|
GetInterfaceState(iface string) (bool, error)
|
||||||
|
CaptureNetworkSnapshot() (platform.NetworkSnapshot, error)
|
||||||
|
RestoreNetworkSnapshot(snapshot platform.NetworkSnapshot) error
|
||||||
}
|
}
|
||||||
|
|
||||||
type serviceManager interface {
|
type serviceManager interface {
|
||||||
@@ -78,7 +80,7 @@ type installer interface {
|
|||||||
ListInstallDisks() ([]platform.InstallDisk, error)
|
ListInstallDisks() ([]platform.InstallDisk, error)
|
||||||
InstallToDisk(ctx context.Context, device string, logFile string) error
|
InstallToDisk(ctx context.Context, device string, logFile string) error
|
||||||
IsLiveMediaInRAM() bool
|
IsLiveMediaInRAM() bool
|
||||||
RunInstallToRAM(logFunc func(string)) error
|
RunInstallToRAM(ctx context.Context, logFunc func(string)) error
|
||||||
}
|
}
|
||||||
|
|
||||||
type GPUPresenceResult struct {
|
type GPUPresenceResult struct {
|
||||||
@@ -98,24 +100,27 @@ func (a *App) IsLiveMediaInRAM() bool {
|
|||||||
return a.installer.IsLiveMediaInRAM()
|
return a.installer.IsLiveMediaInRAM()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunInstallToRAM(logFunc func(string)) error {
|
func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||||
return a.installer.RunInstallToRAM(logFunc)
|
return a.installer.RunInstallToRAM(ctx, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
type satRunner interface {
|
type satRunner interface {
|
||||||
RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
|
RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
|
||||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error)
|
RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error)
|
RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error)
|
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||||
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
||||||
DetectGPUVendor() string
|
DetectGPUVendor() string
|
||||||
ListAMDGPUs() ([]platform.AMDGPUInfo, error)
|
ListAMDGPUs() ([]platform.AMDGPUInfo, error)
|
||||||
RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error)
|
RunAMDAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
RunAMDStressPack(baseDir string, logFunc func(string)) (string, error)
|
RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
RunMemoryStressPack(baseDir string, logFunc func(string)) (string, error)
|
RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
RunSATStressPack(baseDir string, logFunc func(string)) (string, error)
|
RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||||
|
RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||||
|
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||||
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
||||||
|
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
|
||||||
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -348,6 +353,14 @@ func (a *App) GetInterfaceState(iface string) (bool, error) {
|
|||||||
return a.network.GetInterfaceState(iface)
|
return a.network.GetInterfaceState(iface)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) CaptureNetworkSnapshot() (platform.NetworkSnapshot, error) {
|
||||||
|
return a.network.CaptureNetworkSnapshot()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RestoreNetworkSnapshot(snapshot platform.NetworkSnapshot) error {
|
||||||
|
return a.network.RestoreNetworkSnapshot(snapshot)
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) SetStaticIPv4Result(cfg platform.StaticIPv4Config) (ActionResult, error) {
|
func (a *App) SetStaticIPv4Result(cfg platform.StaticIPv4Config) (ActionResult, error) {
|
||||||
body, err := a.network.SetStaticIPv4(cfg)
|
body, err := a.network.SetStaticIPv4(cfg)
|
||||||
return ActionResult{Title: "Static IPv4: " + cfg.Interface, Body: bodyOr(body, "Static IPv4 updated.")}, err
|
return ActionResult{Title: "Static IPv4: " + cfg.Interface, Body: bodyOr(body, "Static IPv4 updated.")}, err
|
||||||
@@ -496,10 +509,14 @@ func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir st
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
return a.sat.RunMemoryAcceptancePack(baseDir, logFunc)
|
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||||
@@ -508,10 +525,14 @@ func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunCPUAcceptancePackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunCPUAcceptancePackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
return a.sat.RunCPUAcceptancePack(baseDir, durationSec, logFunc)
|
return a.sat.RunCPUAcceptancePack(ctx, baseDir, durationSec, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (ActionResult, error) {
|
func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (ActionResult, error) {
|
||||||
@@ -520,10 +541,14 @@ func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (Actio
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
return a.sat.RunStorageAcceptancePack(baseDir, logFunc)
|
return a.sat.RunStorageAcceptancePack(ctx, baseDir, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||||
@@ -540,10 +565,14 @@ func (a *App) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
func (a *App) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunAMDAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
return a.sat.RunAMDAcceptancePack(baseDir, logFunc)
|
return a.sat.RunAMDAcceptancePack(ctx, baseDir, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
|
func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||||
@@ -551,19 +580,45 @@ func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
|
|||||||
return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
|
return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunMemoryStressPack(baseDir string, logFunc func(string)) (string, error) {
|
func (a *App) RunAMDMemIntegrityPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
return a.sat.RunMemoryStressPack(baseDir, logFunc)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (a *App) RunSATStressPack(baseDir string, logFunc func(string)) (string, error) {
|
|
||||||
return a.sat.RunSATStressPack(baseDir, logFunc)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (a *App) RunAMDStressPack(baseDir string, logFunc func(string)) (string, error) {
|
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
return a.sat.RunAMDStressPack(baseDir, logFunc)
|
return a.sat.RunAMDMemIntegrityPack(ctx, baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDMemBandwidthPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunAMDMemBandwidthPack(ctx, baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunMemoryStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunMemoryStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunSATStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunSATStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunAMDStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunMemoryStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.sat.RunMemoryStressPack(ctx, baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunSATStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.sat.RunSATStressPack(ctx, baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunAMDStressPack(ctx, baseDir, durationSec, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) {
|
func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) {
|
||||||
@@ -573,6 +628,13 @@ func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platfor
|
|||||||
return a.sat.RunFanStressTest(ctx, baseDir, opts)
|
return a.sat.RunFanStressTest(ctx, baseDir, opts)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
||||||
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
|
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
|
||||||
body := "Results: " + path
|
body := "Results: " + path
|
||||||
|
|||||||
@@ -43,8 +43,12 @@ func (f fakeNetwork) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error
|
|||||||
return f.setStaticIPv4Fn(cfg)
|
return f.setStaticIPv4Fn(cfg)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeNetwork) SetInterfaceState(_ string, _ bool) error { return nil }
|
func (f fakeNetwork) SetInterfaceState(_ string, _ bool) error { return nil }
|
||||||
func (f fakeNetwork) GetInterfaceState(_ string) (bool, error) { return true, nil }
|
func (f fakeNetwork) GetInterfaceState(_ string) (bool, error) { return true, nil }
|
||||||
|
func (f fakeNetwork) CaptureNetworkSnapshot() (platform.NetworkSnapshot, error) {
|
||||||
|
return platform.NetworkSnapshot{}, nil
|
||||||
|
}
|
||||||
|
func (f fakeNetwork) RestoreNetworkSnapshot(platform.NetworkSnapshot) error { return nil }
|
||||||
|
|
||||||
type fakeServices struct {
|
type fakeServices struct {
|
||||||
serviceStatusFn func(string) (string, error)
|
serviceStatusFn func(string) (string, error)
|
||||||
@@ -141,15 +145,15 @@ func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
|||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunMemoryAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
||||||
return f.runMemoryFn(baseDir)
|
return f.runMemoryFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunStorageAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
||||||
return f.runStorageFn(baseDir)
|
return f.runStorageFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunCPUAcceptancePack(baseDir string, durationSec int, _ func(string)) (string, error) {
|
func (f fakeSAT) RunCPUAcceptancePack(_ context.Context, baseDir string, durationSec int, _ func(string)) (string, error) {
|
||||||
if f.runCPUFn != nil {
|
if f.runCPUFn != nil {
|
||||||
return f.runCPUFn(baseDir, durationSec)
|
return f.runCPUFn(baseDir, durationSec)
|
||||||
}
|
}
|
||||||
@@ -170,21 +174,39 @@ func (f fakeSAT) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
|
|||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunAMDAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
func (f fakeSAT) RunAMDAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
||||||
if f.runAMDPackFn != nil {
|
if f.runAMDPackFn != nil {
|
||||||
return f.runAMDPackFn(baseDir)
|
return f.runAMDPackFn(baseDir)
|
||||||
}
|
}
|
||||||
return "", nil
|
return "", nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunAMDStressPack(_ string, _ func(string)) (string, error) { return "", nil }
|
func (f fakeSAT) RunAMDMemIntegrityPack(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||||
func (f fakeSAT) RunMemoryStressPack(_ string, _ func(string)) (string, error) { return "", nil }
|
return "", nil
|
||||||
func (f fakeSAT) RunSATStressPack(_ string, _ func(string)) (string, error) { return "", nil }
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunAMDMemBandwidthPack(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunAMDStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
func (f fakeSAT) RunMemoryStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
func (f fakeSAT) RunSATStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStressOptions) (string, error) {
|
func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStressOptions) (string, error) {
|
||||||
return "", nil
|
return "", nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.PlatformStressOptions, _ func(string)) (string, error) {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
|
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||||
return "", nil
|
return "", nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -78,48 +78,56 @@ func SampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
|||||||
|
|
||||||
// sampleAMDGPUMetrics queries rocm-smi for live GPU metrics.
|
// sampleAMDGPUMetrics queries rocm-smi for live GPU metrics.
|
||||||
func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
|
func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
|
||||||
// --showtemp --showuse --showpower --csv — one row per GPU
|
|
||||||
out, err := runROCmSMI("--showtemp", "--showuse", "--showpower", "--showmemuse", "--csv")
|
out, err := runROCmSMI("--showtemp", "--showuse", "--showpower", "--showmemuse", "--csv")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
var rows []GPUMetricRow
|
lines := strings.Split(strings.TrimSpace(string(out)), "\n")
|
||||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
if len(lines) < 2 {
|
||||||
line = strings.TrimSpace(line)
|
return nil, fmt.Errorf("rocm-smi: insufficient output")
|
||||||
if line == "" || strings.HasPrefix(strings.ToLower(line), "device") {
|
}
|
||||||
continue
|
|
||||||
}
|
// Parse header to find column indices by name.
|
||||||
// CSV format: device,temp_c,gpu_use%,mem_use%,power_w (order may vary by rocm-smi version)
|
headers := strings.Split(lines[0], ",")
|
||||||
// We parse by column header from the first line.
|
colIdx := func(keywords ...string) int {
|
||||||
parts := strings.Split(line, ",")
|
for i, h := range headers {
|
||||||
if len(parts) < 2 {
|
hl := strings.ToLower(strings.TrimSpace(h))
|
||||||
continue
|
for _, kw := range keywords {
|
||||||
}
|
if strings.Contains(hl, kw) {
|
||||||
idx := len(rows)
|
return i
|
||||||
row := GPUMetricRow{GPUIndex: idx}
|
|
||||||
// rocm-smi CSV columns vary; extract what we can
|
|
||||||
for i, p := range parts {
|
|
||||||
p = strings.TrimSpace(p)
|
|
||||||
switch {
|
|
||||||
case i == 0:
|
|
||||||
// device index like "card0" or "0"
|
|
||||||
case strings.Contains(strings.ToLower(p), "n/a"):
|
|
||||||
// skip N/A
|
|
||||||
default:
|
|
||||||
// Try to match by position heuristic: temp, use%, memuse%, power
|
|
||||||
v := parseGPUFloat(p)
|
|
||||||
switch {
|
|
||||||
case i == 1 && row.TempC == 0:
|
|
||||||
row.TempC = v
|
|
||||||
case i == 2 && row.UsagePct == 0:
|
|
||||||
row.UsagePct = v
|
|
||||||
case i == 3 && row.MemUsagePct == 0:
|
|
||||||
row.MemUsagePct = v
|
|
||||||
case i == 4 && row.PowerW == 0:
|
|
||||||
row.PowerW = v
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
idxTemp := colIdx("sensor edge", "temperature (c)", "temp")
|
||||||
|
idxUse := colIdx("gpu use (%)")
|
||||||
|
idxMem := colIdx("vram%", "memory allocated")
|
||||||
|
idxPow := colIdx("average graphics package power", "power (w)")
|
||||||
|
|
||||||
|
var rows []GPUMetricRow
|
||||||
|
for _, line := range lines[1:] {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if line == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
parts := strings.Split(line, ",")
|
||||||
|
idx := len(rows)
|
||||||
|
row := GPUMetricRow{GPUIndex: idx}
|
||||||
|
get := func(i int) float64 {
|
||||||
|
if i < 0 || i >= len(parts) {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
v := strings.TrimSpace(parts[i])
|
||||||
|
if strings.EqualFold(v, "n/a") {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return parseGPUFloat(v)
|
||||||
|
}
|
||||||
|
row.TempC = get(idxTemp)
|
||||||
|
row.UsagePct = get(idxUse)
|
||||||
|
row.MemUsagePct = get(idxMem)
|
||||||
|
row.PowerW = get(idxPow)
|
||||||
rows = append(rows, row)
|
rows = append(rows, row)
|
||||||
}
|
}
|
||||||
if len(rows) == 0 {
|
if len(rows) == 0 {
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
@@ -18,7 +19,7 @@ func (s *System) IsLiveMediaInRAM() bool {
|
|||||||
return strings.TrimSpace(string(out)) == "tmpfs"
|
return strings.TrimSpace(string(out)) == "tmpfs"
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunInstallToRAM(logFunc func(string)) error {
|
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||||
log := func(msg string) {
|
log := func(msg string) {
|
||||||
if logFunc != nil {
|
if logFunc != nil {
|
||||||
logFunc(msg)
|
logFunc(msg)
|
||||||
@@ -56,10 +57,13 @@ func (s *System) RunInstallToRAM(logFunc func(string)) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for _, sf := range squashfsFiles {
|
for _, sf := range squashfsFiles {
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
base := filepath.Base(sf)
|
base := filepath.Base(sf)
|
||||||
dst := filepath.Join(dstDir, base)
|
dst := filepath.Join(dstDir, base)
|
||||||
log(fmt.Sprintf("Copying %s to RAM...", base))
|
log(fmt.Sprintf("Copying %s to RAM...", base))
|
||||||
if err := copyFileLarge(sf, dst, log); err != nil {
|
if err := copyFileLarge(ctx, sf, dst, log); err != nil {
|
||||||
return fmt.Errorf("copy %s: %v", base, err)
|
return fmt.Errorf("copy %s: %v", base, err)
|
||||||
}
|
}
|
||||||
log(fmt.Sprintf("Copied %s.", base))
|
log(fmt.Sprintf("Copied %s.", base))
|
||||||
@@ -77,9 +81,12 @@ func (s *System) RunInstallToRAM(logFunc func(string)) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
log("Copying remaining medium files...")
|
log("Copying remaining medium files...")
|
||||||
if err := cpDir("/run/live/medium", dstDir, log); err != nil {
|
if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
|
||||||
log(fmt.Sprintf("Warning: partial copy: %v", err))
|
log(fmt.Sprintf("Warning: partial copy: %v", err))
|
||||||
}
|
}
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
if err := exec.Command("mount", "--bind", dstDir, "/run/live/medium").Run(); err != nil {
|
if err := exec.Command("mount", "--bind", dstDir, "/run/live/medium").Run(); err != nil {
|
||||||
log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err))
|
log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err))
|
||||||
}
|
}
|
||||||
@@ -88,7 +95,7 @@ func (s *System) RunInstallToRAM(logFunc func(string)) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func copyFileLarge(src, dst string, logFunc func(string)) error {
|
func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) error {
|
||||||
in, err := os.Open(src)
|
in, err := os.Open(src)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
@@ -107,6 +114,9 @@ func copyFileLarge(src, dst string, logFunc func(string)) error {
|
|||||||
var copied int64
|
var copied int64
|
||||||
buf := make([]byte, 4*1024*1024)
|
buf := make([]byte, 4*1024*1024)
|
||||||
for {
|
for {
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
n, err := in.Read(buf)
|
n, err := in.Read(buf)
|
||||||
if n > 0 {
|
if n > 0 {
|
||||||
if _, werr := out.Write(buf[:n]); werr != nil {
|
if _, werr := out.Write(buf[:n]); werr != nil {
|
||||||
@@ -128,8 +138,11 @@ func copyFileLarge(src, dst string, logFunc func(string)) error {
|
|||||||
return out.Sync()
|
return out.Sync()
|
||||||
}
|
}
|
||||||
|
|
||||||
func cpDir(src, dst string, logFunc func(string)) error {
|
func cpDir(ctx context.Context, src, dst string, logFunc func(string)) error {
|
||||||
return filepath.Walk(src, func(path string, fi os.FileInfo, err error) error {
|
return filepath.Walk(src, func(path string, fi os.FileInfo, err error) error {
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
return ctx.Err()
|
||||||
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -144,7 +157,7 @@ func cpDir(src, dst string, logFunc func(string)) error {
|
|||||||
if _, err := os.Stat(target); err == nil {
|
if _, err := os.Stat(target); err == nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
return copyFileLarge(path, target, nil)
|
return copyFileLarge(ctx, path, target, nil)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -2,7 +2,10 @@ package platform
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
|
"encoding/json"
|
||||||
"os"
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
@@ -23,6 +26,7 @@ type LiveMetricSample struct {
|
|||||||
// TempReading is a named temperature sensor value.
|
// TempReading is a named temperature sensor value.
|
||||||
type TempReading struct {
|
type TempReading struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
|
Group string `json:"group,omitempty"`
|
||||||
Celsius float64 `json:"celsius"`
|
Celsius float64 `json:"celsius"`
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -43,10 +47,11 @@ func SampleLiveMetrics() LiveMetricSample {
|
|||||||
fans, _ := sampleFanSpeeds()
|
fans, _ := sampleFanSpeeds()
|
||||||
s.Fans = fans
|
s.Fans = fans
|
||||||
|
|
||||||
// CPU/system temperature — returns 0 if unavailable
|
s.Temps = append(s.Temps, sampleLiveTemperatureReadings()...)
|
||||||
cpuTemp := sampleCPUMaxTemp()
|
if !hasTempGroup(s.Temps, "cpu") {
|
||||||
if cpuTemp > 0 {
|
if cpuTemp := sampleCPUMaxTemp(); cpuTemp > 0 {
|
||||||
s.Temps = append(s.Temps, TempReading{Name: "CPU", Celsius: cpuTemp})
|
s.Temps = append(s.Temps, TempReading{Name: "CPU Max", Group: "cpu", Celsius: cpuTemp})
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// System power — returns 0 if unavailable
|
// System power — returns 0 if unavailable
|
||||||
@@ -140,3 +145,182 @@ func sampleMemLoadPct() float64 {
|
|||||||
used := total - avail
|
used := total - avail
|
||||||
return float64(used) / float64(total) * 100
|
return float64(used) / float64(total) * 100
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func hasTempGroup(temps []TempReading, group string) bool {
|
||||||
|
for _, t := range temps {
|
||||||
|
if t.Group == group {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleLiveTemperatureReadings() []TempReading {
|
||||||
|
if temps := sampleLiveTempsViaSensorsJSON(); len(temps) > 0 {
|
||||||
|
return temps
|
||||||
|
}
|
||||||
|
return sampleLiveTempsViaIPMI()
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleLiveTempsViaSensorsJSON() []TempReading {
|
||||||
|
out, err := exec.Command("sensors", "-j").Output()
|
||||||
|
if err != nil || len(out) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var doc map[string]map[string]any
|
||||||
|
if err := json.Unmarshal(out, &doc); err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
chips := make([]string, 0, len(doc))
|
||||||
|
for chip := range doc {
|
||||||
|
chips = append(chips, chip)
|
||||||
|
}
|
||||||
|
sort.Strings(chips)
|
||||||
|
|
||||||
|
temps := make([]TempReading, 0, len(chips))
|
||||||
|
seen := map[string]struct{}{}
|
||||||
|
for _, chip := range chips {
|
||||||
|
features := doc[chip]
|
||||||
|
featureNames := make([]string, 0, len(features))
|
||||||
|
for name := range features {
|
||||||
|
featureNames = append(featureNames, name)
|
||||||
|
}
|
||||||
|
sort.Strings(featureNames)
|
||||||
|
for _, name := range featureNames {
|
||||||
|
if strings.EqualFold(name, "Adapter") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
feature, ok := features[name].(map[string]any)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
value, ok := firstTempInputValue(feature)
|
||||||
|
if !ok || value <= 0 || value > 150 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
group := classifyLiveTempGroup(chip, name)
|
||||||
|
if group == "gpu" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
label := strings.TrimSpace(name)
|
||||||
|
if label == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if group == "ambient" {
|
||||||
|
label = compactAmbientTempName(chip, label)
|
||||||
|
}
|
||||||
|
key := group + "\x00" + label
|
||||||
|
if _, ok := seen[key]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[key] = struct{}{}
|
||||||
|
temps = append(temps, TempReading{Name: label, Group: group, Celsius: value})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return temps
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleLiveTempsViaIPMI() []TempReading {
|
||||||
|
out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()
|
||||||
|
if err != nil || len(out) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
var temps []TempReading
|
||||||
|
seen := map[string]struct{}{}
|
||||||
|
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||||
|
parts := strings.Split(line, "|")
|
||||||
|
if len(parts) < 3 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
name := strings.TrimSpace(parts[0])
|
||||||
|
if name == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
unit := strings.ToLower(strings.TrimSpace(parts[2]))
|
||||||
|
if !strings.Contains(unit, "degrees") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
raw := strings.TrimSpace(parts[1])
|
||||||
|
if raw == "" || strings.EqualFold(raw, "na") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
value, err := strconv.ParseFloat(raw, 64)
|
||||||
|
if err != nil || value <= 0 || value > 150 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
group := classifyLiveTempGroup("", name)
|
||||||
|
if group == "gpu" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
label := name
|
||||||
|
if group == "ambient" {
|
||||||
|
label = compactAmbientTempName("", label)
|
||||||
|
}
|
||||||
|
key := group + "\x00" + label
|
||||||
|
if _, ok := seen[key]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[key] = struct{}{}
|
||||||
|
temps = append(temps, TempReading{Name: label, Group: group, Celsius: value})
|
||||||
|
}
|
||||||
|
return temps
|
||||||
|
}
|
||||||
|
|
||||||
|
func firstTempInputValue(feature map[string]any) (float64, bool) {
|
||||||
|
keys := make([]string, 0, len(feature))
|
||||||
|
for key := range feature {
|
||||||
|
keys = append(keys, key)
|
||||||
|
}
|
||||||
|
sort.Strings(keys)
|
||||||
|
for _, key := range keys {
|
||||||
|
lower := strings.ToLower(key)
|
||||||
|
if !strings.Contains(lower, "temp") || !strings.HasSuffix(lower, "_input") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
switch value := feature[key].(type) {
|
||||||
|
case float64:
|
||||||
|
return value, true
|
||||||
|
case string:
|
||||||
|
f, err := strconv.ParseFloat(value, 64)
|
||||||
|
if err == nil {
|
||||||
|
return f, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
func classifyLiveTempGroup(chip, name string) string {
|
||||||
|
text := strings.ToLower(strings.TrimSpace(chip + " " + name))
|
||||||
|
switch {
|
||||||
|
case strings.Contains(text, "gpu"), strings.Contains(text, "amdgpu"), strings.Contains(text, "nvidia"), strings.Contains(text, "adeon"):
|
||||||
|
return "gpu"
|
||||||
|
case strings.Contains(text, "coretemp"),
|
||||||
|
strings.Contains(text, "k10temp"),
|
||||||
|
strings.Contains(text, "zenpower"),
|
||||||
|
strings.Contains(text, "package id"),
|
||||||
|
strings.Contains(text, "x86_pkg_temp"),
|
||||||
|
strings.Contains(text, "tctl"),
|
||||||
|
strings.Contains(text, "tdie"),
|
||||||
|
strings.Contains(text, "tccd"),
|
||||||
|
strings.Contains(text, "cpu"),
|
||||||
|
strings.Contains(text, "peci"):
|
||||||
|
return "cpu"
|
||||||
|
default:
|
||||||
|
return "ambient"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func compactAmbientTempName(chip, name string) string {
|
||||||
|
chip = strings.TrimSpace(chip)
|
||||||
|
name = strings.TrimSpace(name)
|
||||||
|
if chip == "" || strings.EqualFold(chip, name) {
|
||||||
|
return name
|
||||||
|
}
|
||||||
|
if strings.Contains(strings.ToLower(name), strings.ToLower(chip)) {
|
||||||
|
return name
|
||||||
|
}
|
||||||
|
return chip + " / " + name
|
||||||
|
}
|
||||||
|
|||||||
44
audit/internal/platform/live_metrics_test.go
Normal file
44
audit/internal/platform/live_metrics_test.go
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestFirstTempInputValue(t *testing.T) {
|
||||||
|
feature := map[string]any{
|
||||||
|
"temp1_input": 61.5,
|
||||||
|
"temp1_max": 80.0,
|
||||||
|
}
|
||||||
|
got, ok := firstTempInputValue(feature)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("expected value")
|
||||||
|
}
|
||||||
|
if got != 61.5 {
|
||||||
|
t.Fatalf("got %v want 61.5", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestClassifyLiveTempGroup(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
chip string
|
||||||
|
name string
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{chip: "coretemp-isa-0000", name: "Package id 0", want: "cpu"},
|
||||||
|
{chip: "amdgpu-pci-4300", name: "edge", want: "gpu"},
|
||||||
|
{chip: "nvme-pci-0100", name: "Composite", want: "ambient"},
|
||||||
|
{chip: "acpitz-acpi-0", name: "temp1", want: "ambient"},
|
||||||
|
}
|
||||||
|
for _, tc := range tests {
|
||||||
|
if got := classifyLiveTempGroup(tc.chip, tc.name); got != tc.want {
|
||||||
|
t.Fatalf("classifyLiveTempGroup(%q,%q)=%q want %q", tc.chip, tc.name, got, tc.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCompactAmbientTempName(t *testing.T) {
|
||||||
|
if got := compactAmbientTempName("nvme-pci-0100", "Composite"); got != "nvme-pci-0100 / Composite" {
|
||||||
|
t.Fatalf("got %q", got)
|
||||||
|
}
|
||||||
|
if got := compactAmbientTempName("", "Inlet Temp"); got != "Inlet Temp" {
|
||||||
|
t.Fatalf("got %q", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -2,6 +2,7 @@ package platform
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
@@ -18,21 +19,17 @@ func (s *System) ListInterfaces() ([]InterfaceInfo, error) {
|
|||||||
out := make([]InterfaceInfo, 0, len(names))
|
out := make([]InterfaceInfo, 0, len(names))
|
||||||
for _, name := range names {
|
for _, name := range names {
|
||||||
state := "unknown"
|
state := "unknown"
|
||||||
if raw, err := exec.Command("ip", "-o", "link", "show", name).Output(); err == nil {
|
if up, err := interfaceAdminState(name); err == nil {
|
||||||
fields := strings.Fields(string(raw))
|
if up {
|
||||||
if len(fields) >= 9 {
|
state = "up"
|
||||||
state = fields[8]
|
} else {
|
||||||
|
state = "down"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var ipv4 []string
|
ipv4, err := interfaceIPv4Addrs(name)
|
||||||
if raw, err := exec.Command("ip", "-o", "-4", "addr", "show", "dev", name).Output(); err == nil {
|
if err != nil {
|
||||||
for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
|
ipv4 = nil
|
||||||
fields := strings.Fields(line)
|
|
||||||
if len(fields) >= 4 {
|
|
||||||
ipv4 = append(ipv4, fields[3])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
out = append(out, InterfaceInfo{Name: name, State: state, IPv4: ipv4})
|
out = append(out, InterfaceInfo{Name: name, State: state, IPv4: ipv4})
|
||||||
@@ -55,6 +52,119 @@ func (s *System) DefaultRoute() string {
|
|||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *System) CaptureNetworkSnapshot() (NetworkSnapshot, error) {
|
||||||
|
names, err := listInterfaceNames()
|
||||||
|
if err != nil {
|
||||||
|
return NetworkSnapshot{}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
snapshot := NetworkSnapshot{
|
||||||
|
Interfaces: make([]NetworkInterfaceSnapshot, 0, len(names)),
|
||||||
|
}
|
||||||
|
for _, name := range names {
|
||||||
|
up, err := interfaceAdminState(name)
|
||||||
|
if err != nil {
|
||||||
|
return NetworkSnapshot{}, err
|
||||||
|
}
|
||||||
|
ipv4, err := interfaceIPv4Addrs(name)
|
||||||
|
if err != nil {
|
||||||
|
return NetworkSnapshot{}, err
|
||||||
|
}
|
||||||
|
snapshot.Interfaces = append(snapshot.Interfaces, NetworkInterfaceSnapshot{
|
||||||
|
Name: name,
|
||||||
|
Up: up,
|
||||||
|
IPv4: ipv4,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
if raw, err := exec.Command("ip", "route", "show", "default").Output(); err == nil {
|
||||||
|
for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if line != "" {
|
||||||
|
snapshot.DefaultRoutes = append(snapshot.DefaultRoutes, line)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if raw, err := os.ReadFile("/etc/resolv.conf"); err == nil {
|
||||||
|
snapshot.ResolvConf = string(raw)
|
||||||
|
}
|
||||||
|
|
||||||
|
return snapshot, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RestoreNetworkSnapshot(snapshot NetworkSnapshot) error {
|
||||||
|
var errs []string
|
||||||
|
|
||||||
|
for _, iface := range snapshot.Interfaces {
|
||||||
|
if err := exec.Command("ip", "link", "set", "dev", iface.Name, "up").Run(); err != nil {
|
||||||
|
errs = append(errs, fmt.Sprintf("%s: bring up before restore: %v", iface.Name, err))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if err := exec.Command("ip", "addr", "flush", "dev", iface.Name).Run(); err != nil {
|
||||||
|
errs = append(errs, fmt.Sprintf("%s: flush addresses: %v", iface.Name, err))
|
||||||
|
}
|
||||||
|
for _, cidr := range iface.IPv4 {
|
||||||
|
if raw, err := exec.Command("ip", "addr", "add", cidr, "dev", iface.Name).CombinedOutput(); err != nil {
|
||||||
|
detail := strings.TrimSpace(string(raw))
|
||||||
|
if detail != "" {
|
||||||
|
errs = append(errs, fmt.Sprintf("%s: restore address %s: %v: %s", iface.Name, cidr, err, detail))
|
||||||
|
} else {
|
||||||
|
errs = append(errs, fmt.Sprintf("%s: restore address %s: %v", iface.Name, cidr, err))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
state := "down"
|
||||||
|
if iface.Up {
|
||||||
|
state = "up"
|
||||||
|
}
|
||||||
|
if err := exec.Command("ip", "link", "set", "dev", iface.Name, state).Run(); err != nil {
|
||||||
|
errs = append(errs, fmt.Sprintf("%s: restore state %s: %v", iface.Name, state, err))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := exec.Command("ip", "route", "del", "default").Run(); err != nil {
|
||||||
|
var exitErr *exec.ExitError
|
||||||
|
if !errors.As(err, &exitErr) {
|
||||||
|
errs = append(errs, fmt.Sprintf("clear default route: %v", err))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, route := range snapshot.DefaultRoutes {
|
||||||
|
fields := strings.Fields(route)
|
||||||
|
if len(fields) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Strip state flags that ip-route(8) does not accept as add arguments.
|
||||||
|
filtered := fields[:0]
|
||||||
|
for _, f := range fields {
|
||||||
|
switch f {
|
||||||
|
case "linkdown", "dead", "onlink", "pervasive":
|
||||||
|
// skip
|
||||||
|
default:
|
||||||
|
filtered = append(filtered, f)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
args := append([]string{"route", "add"}, filtered...)
|
||||||
|
if raw, err := exec.Command("ip", args...).CombinedOutput(); err != nil {
|
||||||
|
detail := strings.TrimSpace(string(raw))
|
||||||
|
if detail != "" {
|
||||||
|
errs = append(errs, fmt.Sprintf("restore route %q: %v: %s", route, err, detail))
|
||||||
|
} else {
|
||||||
|
errs = append(errs, fmt.Sprintf("restore route %q: %v", route, err))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := os.WriteFile("/etc/resolv.conf", []byte(snapshot.ResolvConf), 0644); err != nil {
|
||||||
|
errs = append(errs, fmt.Sprintf("restore resolv.conf: %v", err))
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(errs) > 0 {
|
||||||
|
return errors.New(strings.Join(errs, "; "))
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func (s *System) DHCPOne(iface string) (string, error) {
|
func (s *System) DHCPOne(iface string) (string, error) {
|
||||||
var out bytes.Buffer
|
var out bytes.Buffer
|
||||||
if err := exec.Command("ip", "link", "set", iface, "up").Run(); err != nil {
|
if err := exec.Command("ip", "link", "set", iface, "up").Run(); err != nil {
|
||||||
@@ -142,12 +252,52 @@ func (s *System) SetInterfaceState(iface string, up bool) error {
|
|||||||
|
|
||||||
// GetInterfaceState returns true if the interface is UP.
|
// GetInterfaceState returns true if the interface is UP.
|
||||||
func (s *System) GetInterfaceState(iface string) (bool, error) {
|
func (s *System) GetInterfaceState(iface string) (bool, error) {
|
||||||
raw, err := os.ReadFile(fmt.Sprintf("/sys/class/net/%s/operstate", iface))
|
return interfaceAdminState(iface)
|
||||||
|
}
|
||||||
|
|
||||||
|
func interfaceAdminState(iface string) (bool, error) {
|
||||||
|
raw, err := exec.Command("ip", "-o", "link", "show", "dev", iface).Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return false, err
|
return false, err
|
||||||
}
|
}
|
||||||
state := strings.TrimSpace(string(raw))
|
return parseInterfaceAdminState(string(raw))
|
||||||
return state == "up", nil
|
}
|
||||||
|
|
||||||
|
func parseInterfaceAdminState(raw string) (bool, error) {
|
||||||
|
start := strings.IndexByte(raw, '<')
|
||||||
|
if start == -1 {
|
||||||
|
return false, fmt.Errorf("ip link output missing flags")
|
||||||
|
}
|
||||||
|
end := strings.IndexByte(raw[start+1:], '>')
|
||||||
|
if end == -1 {
|
||||||
|
return false, fmt.Errorf("ip link output missing flag terminator")
|
||||||
|
}
|
||||||
|
flags := strings.Split(raw[start+1:start+1+end], ",")
|
||||||
|
for _, flag := range flags {
|
||||||
|
if strings.TrimSpace(flag) == "UP" {
|
||||||
|
return true, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func interfaceIPv4Addrs(iface string) ([]string, error) {
|
||||||
|
raw, err := exec.Command("ip", "-o", "-4", "addr", "show", "dev", iface).Output()
|
||||||
|
if err != nil {
|
||||||
|
var exitErr *exec.ExitError
|
||||||
|
if errors.As(err, &exitErr) {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
var ipv4 []string
|
||||||
|
for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
|
||||||
|
fields := strings.Fields(line)
|
||||||
|
if len(fields) >= 4 {
|
||||||
|
ipv4 = append(ipv4, fields[3])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ipv4, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func listInterfaceNames() ([]string, error) {
|
func listInterfaceNames() ([]string, error) {
|
||||||
|
|||||||
46
audit/internal/platform/network_test.go
Normal file
46
audit/internal/platform/network_test.go
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestParseInterfaceAdminState(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
raw string
|
||||||
|
want bool
|
||||||
|
wantErr bool
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "admin up with no carrier",
|
||||||
|
raw: "2: enp1s0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc mq state DOWN mode DEFAULT group default qlen 1000\n",
|
||||||
|
want: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "admin down",
|
||||||
|
raw: "2: enp1s0: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000\n",
|
||||||
|
want: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "malformed output",
|
||||||
|
raw: "2: enp1s0: mtu 1500 state DOWN\n",
|
||||||
|
wantErr: true,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
got, err := parseInterfaceAdminState(tt.raw)
|
||||||
|
if tt.wantErr {
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected error")
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("unexpected error: %v", err)
|
||||||
|
}
|
||||||
|
if got != tt.want {
|
||||||
|
t.Fatalf("got %v want %v", got, tt.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
476
audit/internal/platform/platform_stress.go
Normal file
476
audit/internal/platform/platform_stress.go
Normal file
@@ -0,0 +1,476 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"archive/tar"
|
||||||
|
"bytes"
|
||||||
|
"compress/gzip"
|
||||||
|
"context"
|
||||||
|
"encoding/csv"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// PlatformStressCycle defines one load+idle cycle.
|
||||||
|
type PlatformStressCycle struct {
|
||||||
|
LoadSec int // seconds of simultaneous CPU+GPU stress
|
||||||
|
IdleSec int // seconds of idle monitoring after load cut
|
||||||
|
}
|
||||||
|
|
||||||
|
// PlatformStressOptions controls the thermal cycling test.
|
||||||
|
type PlatformStressOptions struct {
|
||||||
|
Cycles []PlatformStressCycle
|
||||||
|
}
|
||||||
|
|
||||||
|
// platformStressRow is one second of telemetry.
|
||||||
|
type platformStressRow struct {
|
||||||
|
ElapsedSec float64
|
||||||
|
Cycle int
|
||||||
|
Phase string // "load" | "idle"
|
||||||
|
CPULoadPct float64
|
||||||
|
MaxCPUTempC float64
|
||||||
|
MaxGPUTempC float64
|
||||||
|
SysPowerW float64
|
||||||
|
FanMinRPM float64
|
||||||
|
FanMaxRPM float64
|
||||||
|
GPUThrottled bool
|
||||||
|
}
|
||||||
|
|
||||||
|
// RunPlatformStress runs repeated load+idle thermal cycling.
|
||||||
|
// Each cycle starts CPU (stressapptest) and GPU stress simultaneously,
|
||||||
|
// runs for LoadSec, then cuts load abruptly and monitors for IdleSec.
|
||||||
|
func (s *System) RunPlatformStress(
|
||||||
|
ctx context.Context,
|
||||||
|
baseDir string,
|
||||||
|
opts PlatformStressOptions,
|
||||||
|
logFunc func(string),
|
||||||
|
) (string, error) {
|
||||||
|
if logFunc == nil {
|
||||||
|
logFunc = func(string) {}
|
||||||
|
}
|
||||||
|
if len(opts.Cycles) == 0 {
|
||||||
|
return "", fmt.Errorf("no cycles defined")
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(baseDir, 0755); err != nil {
|
||||||
|
return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
stamp := time.Now().UTC().Format("20060102-150405")
|
||||||
|
runDir := filepath.Join(baseDir, "platform-stress-"+stamp)
|
||||||
|
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||||
|
return "", fmt.Errorf("mkdir run dir: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
vendor := s.DetectGPUVendor()
|
||||||
|
logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s", len(opts.Cycles), vendor))
|
||||||
|
|
||||||
|
var rows []platformStressRow
|
||||||
|
start := time.Now()
|
||||||
|
|
||||||
|
var analyses []cycleAnalysis
|
||||||
|
|
||||||
|
for i, cycle := range opts.Cycles {
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
cycleNum := i + 1
|
||||||
|
logFunc(fmt.Sprintf("--- Cycle %d/%d: load=%ds, idle=%ds ---", cycleNum, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec))
|
||||||
|
|
||||||
|
// ── LOAD PHASE ───────────────────────────────────────────────────────
|
||||||
|
loadCtx, loadCancel := context.WithTimeout(ctx, time.Duration(cycle.LoadSec)*time.Second)
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
|
||||||
|
// CPU stress
|
||||||
|
wg.Add(1)
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
cpuCmd, err := buildCPUStressCmd(loadCtx)
|
||||||
|
if err != nil {
|
||||||
|
logFunc("CPU stress: " + err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
_ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL)
|
||||||
|
}()
|
||||||
|
|
||||||
|
// GPU stress
|
||||||
|
wg.Add(1)
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
gpuCmd := buildGPUStressCmd(loadCtx, vendor)
|
||||||
|
if gpuCmd == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
_ = gpuCmd.Wait()
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Monitoring goroutine for load phase
|
||||||
|
loadRows := collectPhase(loadCtx, cycleNum, "load", start)
|
||||||
|
for _, r := range loadRows {
|
||||||
|
logFunc(formatPlatformRow(r))
|
||||||
|
}
|
||||||
|
rows = append(rows, loadRows...)
|
||||||
|
loadCancel()
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
if len(loadRows) > 0 {
|
||||||
|
logFunc(fmt.Sprintf("Cycle %d load ended (%.0fs)", cycleNum, loadRows[len(loadRows)-1].ElapsedSec))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── IDLE PHASE ───────────────────────────────────────────────────────
|
||||||
|
idleCtx, idleCancel := context.WithTimeout(ctx, time.Duration(cycle.IdleSec)*time.Second)
|
||||||
|
idleRows := collectPhase(idleCtx, cycleNum, "idle", start)
|
||||||
|
for _, r := range idleRows {
|
||||||
|
logFunc(formatPlatformRow(r))
|
||||||
|
}
|
||||||
|
rows = append(rows, idleRows...)
|
||||||
|
idleCancel()
|
||||||
|
|
||||||
|
// Per-cycle analysis
|
||||||
|
an := analyzePlatformCycle(loadRows, idleRows)
|
||||||
|
analyses = append(analyses, an)
|
||||||
|
logFunc(fmt.Sprintf("Cycle %d: maxCPU=%.1f°C maxGPU=%.1f°C power=%.0fW throttled=%v fanDrop=%.0f%%",
|
||||||
|
cycleNum, an.maxCPUTemp, an.maxGPUTemp, an.maxPower, an.throttled, an.fanDropPct))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write CSV
|
||||||
|
csvData := writePlatformCSV(rows)
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, "metrics.csv"), csvData, 0644)
|
||||||
|
|
||||||
|
// Write summary
|
||||||
|
summary := writePlatformSummary(opts, analyses)
|
||||||
|
logFunc("--- Summary ---")
|
||||||
|
for _, line := range strings.Split(summary, "\n") {
|
||||||
|
if line != "" {
|
||||||
|
logFunc(line)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
|
||||||
|
|
||||||
|
// Pack tar.gz
|
||||||
|
archivePath := filepath.Join(baseDir, "platform-stress-"+stamp+".tar.gz")
|
||||||
|
if err := packPlatformDir(runDir, archivePath); err != nil {
|
||||||
|
return "", fmt.Errorf("pack archive: %w", err)
|
||||||
|
}
|
||||||
|
_ = os.RemoveAll(runDir)
|
||||||
|
return archivePath, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// collectPhase samples live metrics every second until ctx is done.
|
||||||
|
func collectPhase(ctx context.Context, cycle int, phase string, testStart time.Time) []platformStressRow {
|
||||||
|
var rows []platformStressRow
|
||||||
|
ticker := time.NewTicker(time.Second)
|
||||||
|
defer ticker.Stop()
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return rows
|
||||||
|
case <-ticker.C:
|
||||||
|
sample := SampleLiveMetrics()
|
||||||
|
rows = append(rows, sampleToPlatformRow(sample, cycle, phase, testStart))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleToPlatformRow(s LiveMetricSample, cycle int, phase string, testStart time.Time) platformStressRow {
|
||||||
|
r := platformStressRow{
|
||||||
|
ElapsedSec: time.Since(testStart).Seconds(),
|
||||||
|
Cycle: cycle,
|
||||||
|
Phase: phase,
|
||||||
|
CPULoadPct: s.CPULoadPct,
|
||||||
|
SysPowerW: s.PowerW,
|
||||||
|
}
|
||||||
|
for _, t := range s.Temps {
|
||||||
|
switch t.Group {
|
||||||
|
case "cpu":
|
||||||
|
if t.Celsius > r.MaxCPUTempC {
|
||||||
|
r.MaxCPUTempC = t.Celsius
|
||||||
|
}
|
||||||
|
case "gpu":
|
||||||
|
if t.Celsius > r.MaxGPUTempC {
|
||||||
|
r.MaxGPUTempC = t.Celsius
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, g := range s.GPUs {
|
||||||
|
if g.TempC > r.MaxGPUTempC {
|
||||||
|
r.MaxGPUTempC = g.TempC
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(s.Fans) > 0 {
|
||||||
|
r.FanMinRPM = s.Fans[0].RPM
|
||||||
|
r.FanMaxRPM = s.Fans[0].RPM
|
||||||
|
for _, f := range s.Fans[1:] {
|
||||||
|
if f.RPM < r.FanMinRPM {
|
||||||
|
r.FanMinRPM = f.RPM
|
||||||
|
}
|
||||||
|
if f.RPM > r.FanMaxRPM {
|
||||||
|
r.FanMaxRPM = f.RPM
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return r
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatPlatformRow(r platformStressRow) string {
|
||||||
|
throttle := ""
|
||||||
|
if r.GPUThrottled {
|
||||||
|
throttle = " THROTTLE"
|
||||||
|
}
|
||||||
|
fans := ""
|
||||||
|
if r.FanMinRPM > 0 {
|
||||||
|
fans = fmt.Sprintf(" fans=%.0f-%.0fRPM", r.FanMinRPM, r.FanMaxRPM)
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("[%5.0fs] cycle=%d phase=%-4s cpu=%.0f%% cpuT=%.1f°C gpuT=%.1f°C pwr=%.0fW%s%s",
|
||||||
|
r.ElapsedSec, r.Cycle, r.Phase, r.CPULoadPct, r.MaxCPUTempC, r.MaxGPUTempC, r.SysPowerW, fans, throttle)
|
||||||
|
}
|
||||||
|
|
||||||
|
func analyzePlatformCycle(loadRows, idleRows []platformStressRow) cycleAnalysis {
|
||||||
|
var an cycleAnalysis
|
||||||
|
for _, r := range loadRows {
|
||||||
|
if r.MaxCPUTempC > an.maxCPUTemp {
|
||||||
|
an.maxCPUTemp = r.MaxCPUTempC
|
||||||
|
}
|
||||||
|
if r.MaxGPUTempC > an.maxGPUTemp {
|
||||||
|
an.maxGPUTemp = r.MaxGPUTempC
|
||||||
|
}
|
||||||
|
if r.SysPowerW > an.maxPower {
|
||||||
|
an.maxPower = r.SysPowerW
|
||||||
|
}
|
||||||
|
if r.GPUThrottled {
|
||||||
|
an.throttled = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Fan RPM at cut = avg of last 5 load rows
|
||||||
|
if n := len(loadRows); n > 0 {
|
||||||
|
window := loadRows
|
||||||
|
if n > 5 {
|
||||||
|
window = loadRows[n-5:]
|
||||||
|
}
|
||||||
|
var sum float64
|
||||||
|
var cnt int
|
||||||
|
for _, r := range window {
|
||||||
|
if r.FanMinRPM > 0 {
|
||||||
|
sum += (r.FanMinRPM + r.FanMaxRPM) / 2
|
||||||
|
cnt++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if cnt > 0 {
|
||||||
|
an.fanAtCutAvg = sum / float64(cnt)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Fan RPM min in first 15s of idle
|
||||||
|
an.fanMin15s = an.fanAtCutAvg
|
||||||
|
var cutElapsed float64
|
||||||
|
if len(loadRows) > 0 {
|
||||||
|
cutElapsed = loadRows[len(loadRows)-1].ElapsedSec
|
||||||
|
}
|
||||||
|
for _, r := range idleRows {
|
||||||
|
if r.ElapsedSec > cutElapsed+15 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
avg := (r.FanMinRPM + r.FanMaxRPM) / 2
|
||||||
|
if avg > 0 && (an.fanMin15s == 0 || avg < an.fanMin15s) {
|
||||||
|
an.fanMin15s = avg
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if an.fanAtCutAvg > 0 {
|
||||||
|
an.fanDropPct = (an.fanAtCutAvg - an.fanMin15s) / an.fanAtCutAvg * 100
|
||||||
|
}
|
||||||
|
return an
|
||||||
|
}
|
||||||
|
|
||||||
|
// cycleAnalysis holds the per-cycle reduction produced by analyzePlatformCycle
// and consumed by writePlatformSummary.
type cycleAnalysis struct {
	maxCPUTemp  float64 // hottest CPU-group temperature seen during the load phase (°C)
	maxGPUTemp  float64 // hottest GPU temperature seen during the load phase (°C)
	maxPower    float64 // peak system power draw during the load phase (W)
	throttled   bool    // true if any load-phase sample reported GPU throttling
	fanAtCutAvg float64 // average fan RPM over the last load samples (at load cut); 0 if no fan data
	fanMin15s   float64 // minimum fan RPM observed in the first 15s of idle after the cut
	fanDropPct  float64 // percentage drop from fanAtCutAvg down to fanMin15s
}
|
||||||
|
|
||||||
|
func writePlatformSummary(opts PlatformStressOptions, analyses []cycleAnalysis) string {
|
||||||
|
var b strings.Builder
|
||||||
|
fmt.Fprintf(&b, "Platform Thermal Cycling — %d cycle(s)\n", len(opts.Cycles))
|
||||||
|
fmt.Fprintf(&b, "%s\n\n", strings.Repeat("=", 48))
|
||||||
|
|
||||||
|
totalThrottle := 0
|
||||||
|
totalFanWarn := 0
|
||||||
|
for i, an := range analyses {
|
||||||
|
cycle := opts.Cycles[i]
|
||||||
|
fmt.Fprintf(&b, "Cycle %d/%d (load=%ds, idle=%ds)\n", i+1, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec)
|
||||||
|
fmt.Fprintf(&b, " Max CPU temp: %.1f°C\n", an.maxCPUTemp)
|
||||||
|
fmt.Fprintf(&b, " Max GPU temp: %.1f°C\n", an.maxGPUTemp)
|
||||||
|
fmt.Fprintf(&b, " Max sys power: %.0f W\n", an.maxPower)
|
||||||
|
if an.throttled {
|
||||||
|
fmt.Fprintf(&b, " Throttle: DETECTED\n")
|
||||||
|
totalThrottle++
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(&b, " Throttle: none\n")
|
||||||
|
}
|
||||||
|
if an.fanAtCutAvg > 0 {
|
||||||
|
fmt.Fprintf(&b, " Fan at load cut: %.0f RPM avg\n", an.fanAtCutAvg)
|
||||||
|
fmt.Fprintf(&b, " Fan min (first 15s idle): %.0f RPM (drop %.0f%%)\n", an.fanMin15s, an.fanDropPct)
|
||||||
|
if an.fanDropPct > 20 {
|
||||||
|
fmt.Fprintf(&b, " Fan response: WARN — fast spindown (>20%% drop in 15s)\n")
|
||||||
|
totalFanWarn++
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(&b, " Fan response: OK\n")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Fprintf(&b, "%s\n", strings.Repeat("=", 48))
|
||||||
|
if totalThrottle > 0 {
|
||||||
|
fmt.Fprintf(&b, "Overall: FAIL — throttle detected in %d/%d cycles\n", totalThrottle, len(analyses))
|
||||||
|
} else if totalFanWarn > 0 {
|
||||||
|
fmt.Fprintf(&b, "Overall: WARN — fast fan spindown in %d/%d cycles (cooling recovery risk)\n", totalFanWarn, len(analyses))
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(&b, "Overall: PASS\n")
|
||||||
|
}
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func writePlatformCSV(rows []platformStressRow) []byte {
|
||||||
|
var buf bytes.Buffer
|
||||||
|
w := csv.NewWriter(&buf)
|
||||||
|
_ = w.Write([]string{
|
||||||
|
"elapsed_sec", "cycle", "phase",
|
||||||
|
"cpu_load_pct", "max_cpu_temp_c", "max_gpu_temp_c",
|
||||||
|
"sys_power_w", "fan_min_rpm", "fan_max_rpm", "gpu_throttled",
|
||||||
|
})
|
||||||
|
for _, r := range rows {
|
||||||
|
throttled := "0"
|
||||||
|
if r.GPUThrottled {
|
||||||
|
throttled = "1"
|
||||||
|
}
|
||||||
|
_ = w.Write([]string{
|
||||||
|
strconv.FormatFloat(r.ElapsedSec, 'f', 1, 64),
|
||||||
|
strconv.Itoa(r.Cycle),
|
||||||
|
r.Phase,
|
||||||
|
strconv.FormatFloat(r.CPULoadPct, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(r.MaxCPUTempC, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(r.MaxGPUTempC, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(r.SysPowerW, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(r.FanMinRPM, 'f', 0, 64),
|
||||||
|
strconv.FormatFloat(r.FanMaxRPM, 'f', 0, 64),
|
||||||
|
throttled,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
w.Flush()
|
||||||
|
return buf.Bytes()
|
||||||
|
}
|
||||||
|
|
||||||
|
// buildCPUStressCmd creates a stressapptest command that runs until ctx is cancelled.
|
||||||
|
func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
||||||
|
path, err := satLookPath("stressapptest")
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("stressapptest not found: %w", err)
|
||||||
|
}
|
||||||
|
// Use a very long duration; the context timeout will kill it at the right time.
|
||||||
|
cmd := exec.CommandContext(ctx, path, "-s", "86400", "-W", "--cc_test")
|
||||||
|
cmd.Stdout = nil
|
||||||
|
cmd.Stderr = nil
|
||||||
|
if err := cmd.Start(); err != nil {
|
||||||
|
return nil, fmt.Errorf("stressapptest start: %w", err)
|
||||||
|
}
|
||||||
|
return cmd, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
|
||||||
|
// Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
|
||||||
|
func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
|
||||||
|
switch strings.ToLower(vendor) {
|
||||||
|
case "amd":
|
||||||
|
return buildAMDGPUStressCmd(ctx)
|
||||||
|
case "nvidia":
|
||||||
|
return buildNvidiaGPUStressCmd(ctx)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||||
|
rvsArgs, err := resolveRVSCommand()
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
rvsPath := rvsArgs[0]
|
||||||
|
cfg := `actions:
|
||||||
|
- name: gst_platform
|
||||||
|
device: all
|
||||||
|
module: gst
|
||||||
|
parallel: true
|
||||||
|
duration: 86400000
|
||||||
|
copy_matrix: false
|
||||||
|
target_stress: 90
|
||||||
|
matrix_size_a: 8640
|
||||||
|
matrix_size_b: 8640
|
||||||
|
matrix_size_c: 8640
|
||||||
|
`
|
||||||
|
cfgFile := "/tmp/bee-platform-gst.conf"
|
||||||
|
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||||
|
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
|
||||||
|
cmd.Stdout = nil
|
||||||
|
cmd.Stderr = nil
|
||||||
|
_ = cmd.Start()
|
||||||
|
return cmd
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||||
|
path, err := satLookPath("bee-gpu-stress")
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
cmd := exec.CommandContext(ctx, path, "--seconds", "86400", "--size-mb", "64")
|
||||||
|
cmd.Stdout = nil
|
||||||
|
cmd.Stderr = nil
|
||||||
|
_ = cmd.Start()
|
||||||
|
return cmd
|
||||||
|
}
|
||||||
|
|
||||||
|
func packPlatformDir(dir, dest string) error {
|
||||||
|
f, err := os.Create(dest)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
gz := gzip.NewWriter(f)
|
||||||
|
defer gz.Close()
|
||||||
|
tw := tar.NewWriter(gz)
|
||||||
|
defer tw.Close()
|
||||||
|
|
||||||
|
entries, err := os.ReadDir(dir)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
base := filepath.Base(dir)
|
||||||
|
for _, e := range entries {
|
||||||
|
if e.IsDir() {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
fpath := filepath.Join(dir, e.Name())
|
||||||
|
data, err := os.ReadFile(fpath)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
hdr := &tar.Header{
|
||||||
|
Name: filepath.Join(base, e.Name()),
|
||||||
|
Size: int64(len(data)),
|
||||||
|
Mode: 0644,
|
||||||
|
ModTime: time.Now(),
|
||||||
|
}
|
||||||
|
if err := tw.WriteHeader(hdr); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if _, err := tw.Write(data); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
@@ -33,6 +33,10 @@ var (
|
|||||||
"/opt/rocm/libexec/rocm_smi/rocm_smi.py",
|
"/opt/rocm/libexec/rocm_smi/rocm_smi.py",
|
||||||
"/opt/rocm-*/libexec/rocm_smi/rocm_smi.py",
|
"/opt/rocm-*/libexec/rocm_smi/rocm_smi.py",
|
||||||
}
|
}
|
||||||
|
rvsExecutableGlobs = []string{
|
||||||
|
"/opt/rocm/bin/rvs",
|
||||||
|
"/opt/rocm-*/bin/rvs",
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
// streamExecOutput runs cmd and streams each output line to logFunc (if non-nil).
|
// streamExecOutput runs cmd and streams each output line to logFunc (if non-nil).
|
||||||
@@ -90,6 +94,12 @@ func (s *System) DetectGPUVendor() string {
|
|||||||
if _, err := os.Stat("/dev/kfd"); err == nil {
|
if _, err := os.Stat("/dev/kfd"); err == nil {
|
||||||
return "amd"
|
return "amd"
|
||||||
}
|
}
|
||||||
|
if raw, err := exec.Command("lspci", "-nn").Output(); err == nil {
|
||||||
|
text := strings.ToLower(string(raw))
|
||||||
|
if strings.Contains(text, "advanced micro devices") || strings.Contains(text, "amd/ati") {
|
||||||
|
return "amd"
|
||||||
|
}
|
||||||
|
}
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -117,8 +127,8 @@ func (s *System) ListAMDGPUs() ([]AMDGPUInfo, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// RunAMDAcceptancePack runs an AMD GPU diagnostic pack using rocm-smi.
|
// RunAMDAcceptancePack runs an AMD GPU diagnostic pack using rocm-smi.
|
||||||
func (s *System) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
func (s *System) RunAMDAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
return runAcceptancePack(baseDir, "gpu-amd", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd", []satJob{
|
||||||
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||||
{name: "02-rocm-smi-showallinfo.log", cmd: []string{"rocm-smi", "--showallinfo"}},
|
{name: "02-rocm-smi-showallinfo.log", cmd: []string{"rocm-smi", "--showallinfo"}},
|
||||||
{name: "03-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
{name: "03-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||||
@@ -126,20 +136,96 @@ func (s *System) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (str
|
|||||||
}, logFunc)
|
}, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
// RunAMDStressPack runs an AMD GPU burn-in pack.
|
// RunAMDMemIntegrityPack runs the official RVS MEM module as a validate-style memory integrity test.
|
||||||
// Missing tools are reported as UNSUPPORTED, consistent with the existing SAT pattern.
|
func (s *System) RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
func (s *System) RunAMDStressPack(baseDir string, logFunc func(string)) (string, error) {
|
if err := ensureAMDRuntimeReady(); err != nil {
|
||||||
seconds := envInt("BEE_AMD_STRESS_SECONDS", 300)
|
return "", err
|
||||||
return runAcceptancePack(baseDir, "gpu-amd-stress", []satJob{
|
}
|
||||||
|
cfgFile := "/tmp/bee-amd-mem.conf"
|
||||||
|
cfg := `actions:
|
||||||
|
- name: mem_integrity
|
||||||
|
device: all
|
||||||
|
module: mem
|
||||||
|
parallel: true
|
||||||
|
duration: 60000
|
||||||
|
copy_matrix: false
|
||||||
|
target_stress: 90
|
||||||
|
matrix_size: 8640
|
||||||
|
`
|
||||||
|
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-mem", []satJob{
|
||||||
|
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||||
|
{name: "02-rvs-mem.log", cmd: []string{"rvs", "-c", cfgFile}},
|
||||||
|
{name: "03-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}},
|
||||||
|
}, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
// RunAMDMemBandwidthPack runs AMD's memory/interconnect bandwidth-oriented tools.
|
||||||
|
func (s *System) RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
if err := ensureAMDRuntimeReady(); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
cfgFile := "/tmp/bee-amd-babel.conf"
|
||||||
|
cfg := `actions:
|
||||||
|
- name: babel_mem_bw
|
||||||
|
device: all
|
||||||
|
module: babel
|
||||||
|
parallel: true
|
||||||
|
copy_matrix: true
|
||||||
|
target_stress: 90
|
||||||
|
matrix_size: 134217728
|
||||||
|
`
|
||||||
|
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-bandwidth", []satJob{
|
||||||
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||||
{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
|
{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
|
||||||
{name: fmt.Sprintf("03-rocm-smi-monitor-%ds.log", seconds), cmd: []string{
|
{name: "03-rvs-babel.log", cmd: []string{"rvs", "-c", cfgFile}},
|
||||||
"rocm-smi", "--showtemp", "--showpower",
|
{name: "04-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}},
|
||||||
fmt.Sprintf("--duration=%d", seconds),
|
|
||||||
}},
|
|
||||||
}, logFunc)
|
}, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RunAMDStressPack runs an AMD GPU burn-in pack.
|
||||||
|
// Missing tools are reported as UNSUPPORTED, consistent with the existing SAT pattern.
|
||||||
|
func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
seconds := durationSec
|
||||||
|
if seconds <= 0 {
|
||||||
|
seconds = envInt("BEE_AMD_STRESS_SECONDS", 300)
|
||||||
|
}
|
||||||
|
if err := ensureAMDRuntimeReady(); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
// Enable copy_matrix so the same GST run drives VRAM traffic in addition to compute.
|
||||||
|
rvsCfg := amdStressRVSConfig(seconds)
|
||||||
|
cfgFile := "/tmp/bee-amd-gst.conf"
|
||||||
|
_ = os.WriteFile(cfgFile, []byte(rvsCfg), 0644)
|
||||||
|
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-stress", amdStressJobs(seconds, cfgFile), logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
// amdStressRVSConfig renders an RVS GST action config for a stress run of the
// given length. RVS takes the duration in milliseconds, hence seconds*1000.
// NOTE(review): the YAML body is reproduced exactly as found; confirm the
// nested action keys carry the indentation rvs expects.
func amdStressRVSConfig(seconds int) string {
	return fmt.Sprintf(`actions:
- name: gst_stress
device: all
module: gst
parallel: true
duration: %d
copy_matrix: false
target_stress: 90
matrix_size_a: 8640
matrix_size_b: 8640
matrix_size_c: 8640
`, seconds*1000)
}
|
||||||
|
|
||||||
|
func amdStressJobs(seconds int, cfgFile string) []satJob {
|
||||||
|
return []satJob{
|
||||||
|
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||||
|
{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
|
||||||
|
{name: fmt.Sprintf("03-rvs-gst-%ds.log", seconds), cmd: []string{"rvs", "-c", cfgFile}},
|
||||||
|
{name: fmt.Sprintf("04-rocm-smi-after.log"), cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--csv"}},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ListNvidiaGPUs returns GPUs visible to nvidia-smi.
|
// ListNvidiaGPUs returns GPUs visible to nvidia-smi.
|
||||||
func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
|
func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
|
||||||
out, err := exec.Command("nvidia-smi",
|
out, err := exec.Command("nvidia-smi",
|
||||||
@@ -191,7 +277,7 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs(), logFunc)
|
return runAcceptancePackCtx(context.Background(), baseDir, "gpu-nvidia", nvidiaSATJobs(), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
// RunNvidiaAcceptancePackWithOptions runs the NVIDIA diagnostics via DCGM.
|
// RunNvidiaAcceptancePackWithOptions runs the NVIDIA diagnostics via DCGM.
|
||||||
@@ -202,24 +288,27 @@ func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir
|
|||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices), logFunc)
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
|
sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
|
||||||
passes := envInt("BEE_MEMTESTER_PASSES", 1)
|
passes := envInt("BEE_MEMTESTER_PASSES", 1)
|
||||||
return runAcceptancePack(baseDir, "memory", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
||||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||||
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
||||||
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||||||
}, logFunc)
|
}, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunMemoryStressPack(baseDir string, logFunc func(string)) (string, error) {
|
func (s *System) RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
seconds := envInt("BEE_VM_STRESS_SECONDS", 300)
|
seconds := durationSec
|
||||||
|
if seconds <= 0 {
|
||||||
|
seconds = envInt("BEE_VM_STRESS_SECONDS", 300)
|
||||||
|
}
|
||||||
// Use 80% of RAM by default; override with BEE_VM_STRESS_SIZE_MB.
|
// Use 80% of RAM by default; override with BEE_VM_STRESS_SIZE_MB.
|
||||||
sizeArg := "80%"
|
sizeArg := "80%"
|
||||||
if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
|
if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
|
||||||
sizeArg = fmt.Sprintf("%dM", mb)
|
sizeArg = fmt.Sprintf("%dM", mb)
|
||||||
}
|
}
|
||||||
return runAcceptancePack(baseDir, "memory-stress", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "memory-stress", []satJob{
|
||||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||||
{name: "02-stress-ng-vm.log", cmd: []string{
|
{name: "02-stress-ng-vm.log", cmd: []string{
|
||||||
"stress-ng", "--vm", "1",
|
"stress-ng", "--vm", "1",
|
||||||
@@ -232,24 +321,27 @@ func (s *System) RunMemoryStressPack(baseDir string, logFunc func(string)) (stri
|
|||||||
}, logFunc)
|
}, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunSATStressPack(baseDir string, logFunc func(string)) (string, error) {
|
func (s *System) RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
seconds := envInt("BEE_SAT_STRESS_SECONDS", 300)
|
seconds := durationSec
|
||||||
|
if seconds <= 0 {
|
||||||
|
seconds = envInt("BEE_SAT_STRESS_SECONDS", 300)
|
||||||
|
}
|
||||||
cmd := []string{"stressapptest", "-s", fmt.Sprintf("%d", seconds), "-W", "--cc_test"}
|
cmd := []string{"stressapptest", "-s", fmt.Sprintf("%d", seconds), "-W", "--cc_test"}
|
||||||
if mb := envInt("BEE_SAT_STRESS_MB", 0); mb > 0 {
|
if mb := envInt("BEE_SAT_STRESS_MB", 0); mb > 0 {
|
||||||
cmd = append(cmd, "-M", fmt.Sprintf("%d", mb))
|
cmd = append(cmd, "-M", fmt.Sprintf("%d", mb))
|
||||||
}
|
}
|
||||||
return runAcceptancePack(baseDir, "sat-stress", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "sat-stress", []satJob{
|
||||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||||
{name: "02-stressapptest.log", cmd: cmd},
|
{name: "02-stressapptest.log", cmd: cmd},
|
||||||
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||||||
}, logFunc)
|
}, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
func (s *System) RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
if durationSec <= 0 {
|
if durationSec <= 0 {
|
||||||
durationSec = 60
|
durationSec = 60
|
||||||
}
|
}
|
||||||
return runAcceptancePack(baseDir, "cpu", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "cpu", []satJob{
|
||||||
{name: "01-lscpu.log", cmd: []string{"lscpu"}},
|
{name: "01-lscpu.log", cmd: []string{"lscpu"}},
|
||||||
{name: "02-sensors-before.log", cmd: []string{"sensors"}},
|
{name: "02-sensors-before.log", cmd: []string{"sensors"}},
|
||||||
{name: "03-stress-ng.log", cmd: []string{"stress-ng", "--cpu", "0", "--cpu-method", "all", "--timeout", fmt.Sprintf("%d", durationSec)}},
|
{name: "03-stress-ng.log", cmd: []string{"stress-ng", "--cpu", "0", "--cpu-method", "all", "--timeout", fmt.Sprintf("%d", durationSec)}},
|
||||||
@@ -257,7 +349,7 @@ func (s *System) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc f
|
|||||||
}, logFunc)
|
}, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
if baseDir == "" {
|
if baseDir == "" {
|
||||||
baseDir = "/var/log/bee-sat"
|
baseDir = "/var/log/bee-sat"
|
||||||
}
|
}
|
||||||
@@ -285,11 +377,17 @@ func (s *System) RunStorageAcceptancePack(baseDir string, logFunc func(string))
|
|||||||
}
|
}
|
||||||
|
|
||||||
for index, devPath := range devices {
|
for index, devPath := range devices {
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
|
prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
|
||||||
commands := storageSATCommands(devPath)
|
commands := storageSATCommands(devPath)
|
||||||
for cmdIndex, job := range commands {
|
for cmdIndex, job := range commands {
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
name := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+1, job.name)
|
name := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+1, job.name)
|
||||||
out, err := runSATCommand(verboseLog, job.name, job.cmd, logFunc)
|
out, err := runSATCommandCtx(ctx, verboseLog, job.name, job.cmd, nil, logFunc)
|
||||||
if writeErr := os.WriteFile(filepath.Join(runDir, name), out, 0644); writeErr != nil {
|
if writeErr := os.WriteFile(filepath.Join(runDir, name), out, 0644); writeErr != nil {
|
||||||
return "", writeErr
|
return "", writeErr
|
||||||
}
|
}
|
||||||
@@ -338,49 +436,6 @@ func nvidiaSATJobs() []satJob {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func runAcceptancePack(baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
|
|
||||||
if baseDir == "" {
|
|
||||||
baseDir = "/var/log/bee-sat"
|
|
||||||
}
|
|
||||||
ts := time.Now().UTC().Format("20060102-150405")
|
|
||||||
runDir := filepath.Join(baseDir, prefix+"-"+ts)
|
|
||||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
verboseLog := filepath.Join(runDir, "verbose.log")
|
|
||||||
|
|
||||||
var summary strings.Builder
|
|
||||||
stats := satStats{}
|
|
||||||
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
|
||||||
for _, job := range jobs {
|
|
||||||
var out []byte
|
|
||||||
var err error
|
|
||||||
cmd := make([]string, 0, len(job.cmd))
|
|
||||||
for _, arg := range job.cmd {
|
|
||||||
cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
|
|
||||||
}
|
|
||||||
out, err = runSATCommand(verboseLog, job.name, cmd, logFunc)
|
|
||||||
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
|
||||||
return "", writeErr
|
|
||||||
}
|
|
||||||
status, rc := classifySATResult(job.name, out, err)
|
|
||||||
stats.Add(status)
|
|
||||||
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
|
|
||||||
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
|
|
||||||
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
|
|
||||||
}
|
|
||||||
writeSATStats(&summary, stats)
|
|
||||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
|
|
||||||
archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
|
|
||||||
if err := createTarGz(archive, runDir); err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
return archive, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
||||||
if diagLevel < 1 || diagLevel > 4 {
|
if diagLevel < 1 || diagLevel > 4 {
|
||||||
diagLevel = 3
|
diagLevel = 3
|
||||||
@@ -402,6 +457,9 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
|
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
|
||||||
|
if ctx == nil {
|
||||||
|
ctx = context.Background()
|
||||||
|
}
|
||||||
if baseDir == "" {
|
if baseDir == "" {
|
||||||
baseDir = "/var/log/bee-sat"
|
baseDir = "/var/log/bee-sat"
|
||||||
}
|
}
|
||||||
@@ -622,10 +680,23 @@ func resolveSATCommand(cmd []string) ([]string, error) {
|
|||||||
if len(cmd) == 0 {
|
if len(cmd) == 0 {
|
||||||
return nil, errors.New("empty SAT command")
|
return nil, errors.New("empty SAT command")
|
||||||
}
|
}
|
||||||
if cmd[0] != "rocm-smi" {
|
switch cmd[0] {
|
||||||
return cmd, nil
|
case "rocm-smi":
|
||||||
|
return resolveROCmSMICommand(cmd[1:]...)
|
||||||
|
case "rvs":
|
||||||
|
return resolveRVSCommand(cmd[1:]...)
|
||||||
}
|
}
|
||||||
return resolveROCmSMICommand(cmd[1:]...)
|
return cmd, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func resolveRVSCommand(args ...string) ([]string, error) {
|
||||||
|
if path, err := satLookPath("rvs"); err == nil {
|
||||||
|
return append([]string{path}, args...), nil
|
||||||
|
}
|
||||||
|
for _, path := range expandExistingPaths(rvsExecutableGlobs) {
|
||||||
|
return append([]string{path}, args...), nil
|
||||||
|
}
|
||||||
|
return nil, errors.New("rvs not found in PATH or under /opt/rocm")
|
||||||
}
|
}
|
||||||
|
|
||||||
func resolveROCmSMICommand(args ...string) ([]string, error) {
|
func resolveROCmSMICommand(args ...string) ([]string, error) {
|
||||||
@@ -649,6 +720,20 @@ func resolveROCmSMICommand(args ...string) ([]string, error) {
|
|||||||
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
|
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func ensureAMDRuntimeReady() error {
|
||||||
|
if _, err := os.Stat("/dev/kfd"); err == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if raw, err := os.ReadFile("/sys/module/amdgpu/initstate"); err == nil {
|
||||||
|
state := strings.TrimSpace(string(raw))
|
||||||
|
if strings.EqualFold(state, "live") {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return fmt.Errorf("AMD driver is present but not initialized: amdgpu initstate=%q", state)
|
||||||
|
}
|
||||||
|
return errors.New("AMD GPUs are present but the runtime is not initialized: /dev/kfd is missing and amdgpu is not loaded")
|
||||||
|
}
|
||||||
|
|
||||||
func rocmSMIExecutableCandidates() []string {
|
func rocmSMIExecutableCandidates() []string {
|
||||||
return expandExistingPaths(rocmSMIExecutableGlobs)
|
return expandExistingPaths(rocmSMIExecutableGlobs)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,10 +2,12 @@ package platform
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
@@ -304,41 +306,147 @@ func sampleGPUStressMetrics(gpuIndices []int) []GPUStressMetric {
|
|||||||
// sampleFanSpeeds reads fan RPM values from ipmitool sdr.
|
// sampleFanSpeeds reads fan RPM values from ipmitool sdr.
|
||||||
func sampleFanSpeeds() ([]FanReading, error) {
|
func sampleFanSpeeds() ([]FanReading, error) {
|
||||||
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
|
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
|
||||||
|
if err == nil {
|
||||||
|
if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
|
||||||
|
return fans, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
|
||||||
|
if len(fans) > 0 {
|
||||||
|
return fans, nil
|
||||||
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
return parseFanSpeeds(string(out)), nil
|
return nil, sensorsErr
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
||||||
// Line format: "FAN1 | 2400.000 | RPM | ok"
|
// Handles two formats:
|
||||||
|
// Old: "FAN1 | 2400.000 | RPM | ok" (value in col[1], unit in col[2])
|
||||||
|
// New: "FAN1 | 41h | ok | 29.1 | 4340 RPM" (value+unit combined in last col)
|
||||||
func parseFanSpeeds(raw string) []FanReading {
|
func parseFanSpeeds(raw string) []FanReading {
|
||||||
var fans []FanReading
|
var fans []FanReading
|
||||||
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
||||||
parts := strings.Split(line, "|")
|
parts := strings.Split(line, "|")
|
||||||
if len(parts) < 3 {
|
if len(parts) < 2 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
unit := strings.TrimSpace(parts[2])
|
name := strings.TrimSpace(parts[0])
|
||||||
if !strings.EqualFold(unit, "RPM") {
|
// Find the first field that contains "RPM" (either as a standalone unit or inline)
|
||||||
|
rpmVal := 0.0
|
||||||
|
found := false
|
||||||
|
for _, p := range parts[1:] {
|
||||||
|
p = strings.TrimSpace(p)
|
||||||
|
if !strings.Contains(strings.ToUpper(p), "RPM") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if strings.EqualFold(p, "RPM") {
|
||||||
|
continue // unit-only column in old format; value is in previous field
|
||||||
|
}
|
||||||
|
val, err := parseFanRPMValue(p)
|
||||||
|
if err == nil {
|
||||||
|
rpmVal = val
|
||||||
|
found = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Old format: unit "RPM" is in col[2], value is in col[1]
|
||||||
|
if !found && len(parts) >= 3 && strings.EqualFold(strings.TrimSpace(parts[2]), "RPM") {
|
||||||
|
valStr := strings.TrimSpace(parts[1])
|
||||||
|
if !strings.EqualFold(valStr, "na") && !strings.EqualFold(valStr, "disabled") && valStr != "" {
|
||||||
|
if val, err := parseFanRPMValue(valStr); err == nil {
|
||||||
|
rpmVal = val
|
||||||
|
found = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !found {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
valStr := strings.TrimSpace(parts[1])
|
fans = append(fans, FanReading{Name: name, RPM: rpmVal})
|
||||||
if strings.EqualFold(valStr, "na") || strings.EqualFold(valStr, "disabled") || valStr == "" {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
val, err := strconv.ParseFloat(valStr, 64)
|
|
||||||
if err != nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
fans = append(fans, FanReading{
|
|
||||||
Name: strings.TrimSpace(parts[0]),
|
|
||||||
RPM: val,
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
return fans
|
return fans
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func parseFanRPMValue(raw string) (float64, error) {
|
||||||
|
fields := strings.Fields(strings.TrimSpace(strings.ReplaceAll(raw, ",", "")))
|
||||||
|
if len(fields) == 0 {
|
||||||
|
return 0, strconv.ErrSyntax
|
||||||
|
}
|
||||||
|
return strconv.ParseFloat(fields[0], 64)
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
|
||||||
|
out, err := exec.Command("sensors", "-j").Output()
|
||||||
|
if err != nil || len(out) == 0 {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
var doc map[string]map[string]any
|
||||||
|
if err := json.Unmarshal(out, &doc); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
chips := make([]string, 0, len(doc))
|
||||||
|
for chip := range doc {
|
||||||
|
chips = append(chips, chip)
|
||||||
|
}
|
||||||
|
sort.Strings(chips)
|
||||||
|
var fans []FanReading
|
||||||
|
seen := map[string]struct{}{}
|
||||||
|
for _, chip := range chips {
|
||||||
|
features := doc[chip]
|
||||||
|
names := make([]string, 0, len(features))
|
||||||
|
for name := range features {
|
||||||
|
names = append(names, name)
|
||||||
|
}
|
||||||
|
sort.Strings(names)
|
||||||
|
for _, name := range names {
|
||||||
|
feature, ok := features[name].(map[string]any)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
rpm, ok := firstFanInputValue(feature)
|
||||||
|
if !ok || rpm <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
label := strings.TrimSpace(name)
|
||||||
|
if chip != "" && !strings.Contains(strings.ToLower(label), strings.ToLower(chip)) {
|
||||||
|
label = chip + " / " + label
|
||||||
|
}
|
||||||
|
if _, ok := seen[label]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[label] = struct{}{}
|
||||||
|
fans = append(fans, FanReading{Name: label, RPM: rpm})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return fans, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func firstFanInputValue(feature map[string]any) (float64, bool) {
|
||||||
|
keys := make([]string, 0, len(feature))
|
||||||
|
for key := range feature {
|
||||||
|
keys = append(keys, key)
|
||||||
|
}
|
||||||
|
sort.Strings(keys)
|
||||||
|
for _, key := range keys {
|
||||||
|
lower := strings.ToLower(key)
|
||||||
|
if !strings.Contains(lower, "fan") || !strings.HasSuffix(lower, "_input") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
switch value := feature[key].(type) {
|
||||||
|
case float64:
|
||||||
|
return value, true
|
||||||
|
case string:
|
||||||
|
f, err := strconv.ParseFloat(value, 64)
|
||||||
|
if err == nil {
|
||||||
|
return f, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
// sampleCPUMaxTemp returns the highest CPU/inlet temperature from ipmitool or sensors.
|
// sampleCPUMaxTemp returns the highest CPU/inlet temperature from ipmitool or sensors.
|
||||||
func sampleCPUMaxTemp() float64 {
|
func sampleCPUMaxTemp() float64 {
|
||||||
out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()
|
out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()
|
||||||
|
|||||||
27
audit/internal/platform/sat_fan_stress_test.go
Normal file
27
audit/internal/platform/sat_fan_stress_test.go
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestParseFanSpeeds(t *testing.T) {
|
||||||
|
raw := "FAN1 | 2400.000 | RPM | ok\nFAN2 | 1800 RPM | ok | ok\nFAN3 | na | RPM | ns\n"
|
||||||
|
got := parseFanSpeeds(raw)
|
||||||
|
if len(got) != 2 {
|
||||||
|
t.Fatalf("fans=%d want 2 (%v)", len(got), got)
|
||||||
|
}
|
||||||
|
if got[0].Name != "FAN1" || got[0].RPM != 2400 {
|
||||||
|
t.Fatalf("fan0=%+v", got[0])
|
||||||
|
}
|
||||||
|
if got[1].Name != "FAN2" || got[1].RPM != 1800 {
|
||||||
|
t.Fatalf("fan1=%+v", got[1])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFirstFanInputValue(t *testing.T) {
|
||||||
|
feature := map[string]any{
|
||||||
|
"fan1_input": 9200.0,
|
||||||
|
}
|
||||||
|
got, ok := firstFanInputValue(feature)
|
||||||
|
if !ok || got != 9200 {
|
||||||
|
t.Fatalf("got=%v ok=%v", got, ok)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -5,6 +5,7 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -38,6 +39,47 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestAMDStressConfigUsesSingleGSTAction(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
cfg := amdStressRVSConfig(123)
|
||||||
|
if !strings.Contains(cfg, "module: gst") {
|
||||||
|
t.Fatalf("config missing gst module:\n%s", cfg)
|
||||||
|
}
|
||||||
|
if strings.Contains(cfg, "module: mem") {
|
||||||
|
t.Fatalf("config should not include mem module:\n%s", cfg)
|
||||||
|
}
|
||||||
|
if !strings.Contains(cfg, "copy_matrix: false") {
|
||||||
|
t.Fatalf("config should use copy_matrix=false:\n%s", cfg)
|
||||||
|
}
|
||||||
|
if strings.Count(cfg, "duration: 123000") != 1 {
|
||||||
|
t.Fatalf("config should apply duration once:\n%s", cfg)
|
||||||
|
}
|
||||||
|
for _, field := range []string{"matrix_size_a: 8640", "matrix_size_b: 8640", "matrix_size_c: 8640"} {
|
||||||
|
if !strings.Contains(cfg, field) {
|
||||||
|
t.Fatalf("config missing %s:\n%s", field, cfg)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
jobs := amdStressJobs(300, "/tmp/test-amd-gst.conf")
|
||||||
|
if len(jobs) != 4 {
|
||||||
|
t.Fatalf("jobs=%d want 4", len(jobs))
|
||||||
|
}
|
||||||
|
if got := jobs[1].cmd[0]; got != "rocm-bandwidth-test" {
|
||||||
|
t.Fatalf("jobs[1]=%q want rocm-bandwidth-test", got)
|
||||||
|
}
|
||||||
|
if got := jobs[2].cmd[0]; got != "rvs" {
|
||||||
|
t.Fatalf("jobs[2]=%q want rvs", got)
|
||||||
|
}
|
||||||
|
if got := jobs[2].cmd[2]; got != "/tmp/test-amd-gst.conf" {
|
||||||
|
t.Fatalf("jobs[2] cfg=%q want /tmp/test-amd-gst.conf", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
|
func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
|
||||||
t.Setenv("BEE_GPU_STRESS_SECONDS", "9")
|
t.Setenv("BEE_GPU_STRESS_SECONDS", "9")
|
||||||
t.Setenv("BEE_GPU_STRESS_SIZE_MB", "96")
|
t.Setenv("BEE_GPU_STRESS_SIZE_MB", "96")
|
||||||
|
|||||||
@@ -8,6 +8,18 @@ type InterfaceInfo struct {
|
|||||||
IPv4 []string
|
IPv4 []string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type NetworkInterfaceSnapshot struct {
|
||||||
|
Name string
|
||||||
|
Up bool
|
||||||
|
IPv4 []string
|
||||||
|
}
|
||||||
|
|
||||||
|
type NetworkSnapshot struct {
|
||||||
|
Interfaces []NetworkInterfaceSnapshot
|
||||||
|
DefaultRoutes []string
|
||||||
|
ResolvConf string
|
||||||
|
}
|
||||||
|
|
||||||
type ServiceAction string
|
type ServiceAction string
|
||||||
|
|
||||||
const (
|
const (
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ import (
|
|||||||
"net/http"
|
"net/http"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@@ -152,11 +153,12 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var body struct {
|
var body struct {
|
||||||
Duration int `json:"duration"`
|
Duration int `json:"duration"`
|
||||||
DiagLevel int `json:"diag_level"`
|
DiagLevel int `json:"diag_level"`
|
||||||
GPUIndices []int `json:"gpu_indices"`
|
GPUIndices []int `json:"gpu_indices"`
|
||||||
|
Profile string `json:"profile"`
|
||||||
|
DisplayName string `json:"display_name"`
|
||||||
}
|
}
|
||||||
body.DiagLevel = 1
|
|
||||||
if r.ContentLength > 0 {
|
if r.ContentLength > 0 {
|
||||||
_ = json.NewDecoder(r.Body).Decode(&body)
|
_ = json.NewDecoder(r.Body).Decode(&body)
|
||||||
}
|
}
|
||||||
@@ -172,11 +174,16 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
|||||||
Status: TaskPending,
|
Status: TaskPending,
|
||||||
CreatedAt: time.Now(),
|
CreatedAt: time.Now(),
|
||||||
params: taskParams{
|
params: taskParams{
|
||||||
Duration: body.Duration,
|
Duration: body.Duration,
|
||||||
DiagLevel: body.DiagLevel,
|
DiagLevel: body.DiagLevel,
|
||||||
GPUIndices: body.GPUIndices,
|
GPUIndices: body.GPUIndices,
|
||||||
|
BurnProfile: body.Profile,
|
||||||
|
DisplayName: body.DisplayName,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
if strings.TrimSpace(body.DisplayName) != "" {
|
||||||
|
t.Name = body.DisplayName
|
||||||
|
}
|
||||||
globalQueue.enqueue(t)
|
globalQueue.enqueue(t)
|
||||||
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
|
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
|
||||||
}
|
}
|
||||||
@@ -320,18 +327,21 @@ func (h *handler) handleAPINetworkDHCP(w http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
_ = json.NewDecoder(r.Body).Decode(&req)
|
_ = json.NewDecoder(r.Body).Decode(&req)
|
||||||
|
|
||||||
var result app.ActionResult
|
result, err := h.applyPendingNetworkChange(func() (app.ActionResult, error) {
|
||||||
var err error
|
if req.Interface == "" || req.Interface == "all" {
|
||||||
if req.Interface == "" || req.Interface == "all" {
|
return h.opts.App.DHCPAllResult()
|
||||||
result, err = h.opts.App.DHCPAllResult()
|
}
|
||||||
} else {
|
return h.opts.App.DHCPOneResult(req.Interface)
|
||||||
result, err = h.opts.App.DHCPOneResult(req.Interface)
|
})
|
||||||
}
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
writeError(w, http.StatusInternalServerError, err.Error())
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
writeJSON(w, map[string]string{"status": "ok", "output": result.Body})
|
writeJSON(w, map[string]any{
|
||||||
|
"status": "ok",
|
||||||
|
"output": result.Body,
|
||||||
|
"rollback_in": int(netRollbackTimeout.Seconds()),
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *handler) handleAPINetworkStatic(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleAPINetworkStatic(w http.ResponseWriter, r *http.Request) {
|
||||||
@@ -357,12 +367,18 @@ func (h *handler) handleAPINetworkStatic(w http.ResponseWriter, r *http.Request)
|
|||||||
Gateway: req.Gateway,
|
Gateway: req.Gateway,
|
||||||
DNS: req.DNS,
|
DNS: req.DNS,
|
||||||
}
|
}
|
||||||
result, err := h.opts.App.SetStaticIPv4Result(cfg)
|
result, err := h.applyPendingNetworkChange(func() (app.ActionResult, error) {
|
||||||
|
return h.opts.App.SetStaticIPv4Result(cfg)
|
||||||
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
writeError(w, http.StatusInternalServerError, err.Error())
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
writeJSON(w, map[string]string{"status": "ok", "output": result.Body})
|
writeJSON(w, map[string]any{
|
||||||
|
"status": "ok",
|
||||||
|
"output": result.Body,
|
||||||
|
"rollback_in": int(netRollbackTimeout.Seconds()),
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Export ────────────────────────────────────────────────────────────────────
|
// ── Export ────────────────────────────────────────────────────────────────────
|
||||||
@@ -421,6 +437,13 @@ func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request)
|
|||||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
h.installMu.Lock()
|
||||||
|
installRunning := h.installJob != nil && !h.installJob.isDone()
|
||||||
|
h.installMu.Unlock()
|
||||||
|
if installRunning {
|
||||||
|
writeError(w, http.StatusConflict, "install to disk is already running")
|
||||||
|
return
|
||||||
|
}
|
||||||
t := &Task{
|
t := &Task{
|
||||||
ID: newJobID("install-to-ram"),
|
ID: newJobID("install-to-ram"),
|
||||||
Name: "Install to RAM",
|
Name: "Install to RAM",
|
||||||
@@ -528,6 +551,10 @@ func (h *handler) handleAPIInstallRun(w http.ResponseWriter, r *http.Request) {
|
|||||||
writeError(w, http.StatusBadRequest, "device not in install candidate list")
|
writeError(w, http.StatusBadRequest, "device not in install candidate list")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
if globalQueue.hasActiveTarget("install-to-ram") {
|
||||||
|
writeError(w, http.StatusConflict, "install to RAM task is already pending or running")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
h.installMu.Lock()
|
h.installMu.Lock()
|
||||||
if h.installJob != nil && !h.installJob.isDone() {
|
if h.installJob != nil && !h.installJob.isDone() {
|
||||||
@@ -565,53 +592,17 @@ func (h *handler) handleAPIMetricsStream(w http.ResponseWriter, r *http.Request)
|
|||||||
if !sseStart(w) {
|
if !sseStart(w) {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
ticker := time.NewTicker(time.Second)
|
ticker := time.NewTicker(1 * time.Second)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case <-r.Context().Done():
|
case <-r.Context().Done():
|
||||||
return
|
return
|
||||||
case <-ticker.C:
|
case <-ticker.C:
|
||||||
sample := platform.SampleLiveMetrics()
|
sample, ok := h.latestMetric()
|
||||||
|
if !ok {
|
||||||
// Feed server ring buffers
|
continue
|
||||||
for _, t := range sample.Temps {
|
|
||||||
if t.Name == "CPU" {
|
|
||||||
h.ringCPUTemp.push(t.Celsius)
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
h.ringPower.push(sample.PowerW)
|
|
||||||
h.ringCPULoad.push(sample.CPULoadPct)
|
|
||||||
h.ringMemLoad.push(sample.MemLoadPct)
|
|
||||||
|
|
||||||
// Feed fan ring buffers (grow on first sight)
|
|
||||||
h.ringsMu.Lock()
|
|
||||||
for i, fan := range sample.Fans {
|
|
||||||
for len(h.ringFans) <= i {
|
|
||||||
h.ringFans = append(h.ringFans, newMetricsRing(120))
|
|
||||||
h.fanNames = append(h.fanNames, fan.Name)
|
|
||||||
}
|
|
||||||
h.ringFans[i].push(float64(fan.RPM))
|
|
||||||
}
|
|
||||||
// Feed per-GPU ring buffers (grow on first sight)
|
|
||||||
for _, gpu := range sample.GPUs {
|
|
||||||
idx := gpu.GPUIndex
|
|
||||||
for len(h.gpuRings) <= idx {
|
|
||||||
h.gpuRings = append(h.gpuRings, &gpuRings{
|
|
||||||
Temp: newMetricsRing(120),
|
|
||||||
Util: newMetricsRing(120),
|
|
||||||
MemUtil: newMetricsRing(120),
|
|
||||||
Power: newMetricsRing(120),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
h.gpuRings[idx].Temp.push(gpu.TempC)
|
|
||||||
h.gpuRings[idx].Util.push(gpu.UsagePct)
|
|
||||||
h.gpuRings[idx].MemUtil.push(gpu.MemUsagePct)
|
|
||||||
h.gpuRings[idx].Power.push(gpu.PowerW)
|
|
||||||
}
|
|
||||||
h.ringsMu.Unlock()
|
|
||||||
|
|
||||||
b, err := json.Marshal(sample)
|
b, err := json.Marshal(sample)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
continue
|
continue
|
||||||
@@ -623,6 +614,63 @@ func (h *handler) handleAPIMetricsStream(w http.ResponseWriter, r *http.Request)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// feedRings pushes one sample into all in-memory ring buffers.
|
||||||
|
func (h *handler) feedRings(sample platform.LiveMetricSample) {
|
||||||
|
for _, t := range sample.Temps {
|
||||||
|
switch t.Group {
|
||||||
|
case "cpu":
|
||||||
|
h.pushNamedMetricRing(&h.cpuTempRings, t.Name, t.Celsius)
|
||||||
|
case "ambient":
|
||||||
|
h.pushNamedMetricRing(&h.ambientTempRings, t.Name, t.Celsius)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
h.ringPower.push(sample.PowerW)
|
||||||
|
h.ringCPULoad.push(sample.CPULoadPct)
|
||||||
|
h.ringMemLoad.push(sample.MemLoadPct)
|
||||||
|
|
||||||
|
h.ringsMu.Lock()
|
||||||
|
for i, fan := range sample.Fans {
|
||||||
|
for len(h.ringFans) <= i {
|
||||||
|
h.ringFans = append(h.ringFans, newMetricsRing(120))
|
||||||
|
h.fanNames = append(h.fanNames, fan.Name)
|
||||||
|
}
|
||||||
|
h.ringFans[i].push(float64(fan.RPM))
|
||||||
|
}
|
||||||
|
for _, gpu := range sample.GPUs {
|
||||||
|
idx := gpu.GPUIndex
|
||||||
|
for len(h.gpuRings) <= idx {
|
||||||
|
h.gpuRings = append(h.gpuRings, &gpuRings{
|
||||||
|
Temp: newMetricsRing(120),
|
||||||
|
Util: newMetricsRing(120),
|
||||||
|
MemUtil: newMetricsRing(120),
|
||||||
|
Power: newMetricsRing(120),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
h.gpuRings[idx].Temp.push(gpu.TempC)
|
||||||
|
h.gpuRings[idx].Util.push(gpu.UsagePct)
|
||||||
|
h.gpuRings[idx].MemUtil.push(gpu.MemUsagePct)
|
||||||
|
h.gpuRings[idx].Power.push(gpu.PowerW)
|
||||||
|
}
|
||||||
|
h.ringsMu.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) pushNamedMetricRing(dst *[]*namedMetricsRing, name string, value float64) {
|
||||||
|
if name == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, item := range *dst {
|
||||||
|
if item != nil && item.Name == name && item.Ring != nil {
|
||||||
|
item.Ring.push(value)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*dst = append(*dst, &namedMetricsRing{
|
||||||
|
Name: name,
|
||||||
|
Ring: newMetricsRing(120),
|
||||||
|
})
|
||||||
|
(*dst)[len(*dst)-1].Ring.push(value)
|
||||||
|
}
|
||||||
|
|
||||||
// ── Network toggle ────────────────────────────────────────────────────────────
|
// ── Network toggle ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
const netRollbackTimeout = 60 * time.Second
|
const netRollbackTimeout = 60 * time.Second
|
||||||
@@ -646,33 +694,14 @@ func (h *handler) handleAPINetworkToggle(w http.ResponseWriter, r *http.Request)
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := h.opts.App.SetInterfaceState(req.Iface, !wasUp); err != nil {
|
if _, err := h.applyPendingNetworkChange(func() (app.ActionResult, error) {
|
||||||
|
err := h.opts.App.SetInterfaceState(req.Iface, !wasUp)
|
||||||
|
return app.ActionResult{}, err
|
||||||
|
}); err != nil {
|
||||||
writeError(w, http.StatusInternalServerError, err.Error())
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// Cancel any existing pending change (rollback it first).
|
|
||||||
h.pendingNetMu.Lock()
|
|
||||||
if h.pendingNet != nil {
|
|
||||||
prev := h.pendingNet
|
|
||||||
prev.mu.Lock()
|
|
||||||
prev.timer.Stop()
|
|
||||||
_ = h.opts.App.SetInterfaceState(prev.iface, prev.wasUp)
|
|
||||||
prev.mu.Unlock()
|
|
||||||
}
|
|
||||||
|
|
||||||
pnc := &pendingNetChange{iface: req.Iface, wasUp: wasUp}
|
|
||||||
pnc.timer = time.AfterFunc(netRollbackTimeout, func() {
|
|
||||||
_ = h.opts.App.SetInterfaceState(req.Iface, wasUp)
|
|
||||||
h.pendingNetMu.Lock()
|
|
||||||
if h.pendingNet == pnc {
|
|
||||||
h.pendingNet = nil
|
|
||||||
}
|
|
||||||
h.pendingNetMu.Unlock()
|
|
||||||
})
|
|
||||||
h.pendingNet = pnc
|
|
||||||
h.pendingNetMu.Unlock()
|
|
||||||
|
|
||||||
newState := "up"
|
newState := "up"
|
||||||
if wasUp {
|
if wasUp {
|
||||||
newState = "down"
|
newState = "down"
|
||||||
@@ -684,6 +713,42 @@ func (h *handler) handleAPINetworkToggle(w http.ResponseWriter, r *http.Request)
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (h *handler) applyPendingNetworkChange(apply func() (app.ActionResult, error)) (app.ActionResult, error) {
|
||||||
|
if h.opts.App == nil {
|
||||||
|
return app.ActionResult{}, fmt.Errorf("app not configured")
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := h.rollbackPendingNetworkChange(); err != nil && err.Error() != "no pending network change" {
|
||||||
|
return app.ActionResult{}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
snapshot, err := h.opts.App.CaptureNetworkSnapshot()
|
||||||
|
if err != nil {
|
||||||
|
return app.ActionResult{}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
result, err := apply()
|
||||||
|
if err != nil {
|
||||||
|
return result, err
|
||||||
|
}
|
||||||
|
|
||||||
|
pnc := &pendingNetChange{snapshot: snapshot}
|
||||||
|
pnc.timer = time.AfterFunc(netRollbackTimeout, func() {
|
||||||
|
_ = h.opts.App.RestoreNetworkSnapshot(snapshot)
|
||||||
|
h.pendingNetMu.Lock()
|
||||||
|
if h.pendingNet == pnc {
|
||||||
|
h.pendingNet = nil
|
||||||
|
}
|
||||||
|
h.pendingNetMu.Unlock()
|
||||||
|
})
|
||||||
|
|
||||||
|
h.pendingNetMu.Lock()
|
||||||
|
h.pendingNet = pnc
|
||||||
|
h.pendingNetMu.Unlock()
|
||||||
|
|
||||||
|
return result, nil
|
||||||
|
}
|
||||||
|
|
||||||
func (h *handler) handleAPINetworkConfirm(w http.ResponseWriter, _ *http.Request) {
|
func (h *handler) handleAPINetworkConfirm(w http.ResponseWriter, _ *http.Request) {
|
||||||
h.pendingNetMu.Lock()
|
h.pendingNetMu.Lock()
|
||||||
pnc := h.pendingNet
|
pnc := h.pendingNet
|
||||||
@@ -698,19 +763,30 @@ func (h *handler) handleAPINetworkConfirm(w http.ResponseWriter, _ *http.Request
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (h *handler) handleAPINetworkRollback(w http.ResponseWriter, _ *http.Request) {
|
func (h *handler) handleAPINetworkRollback(w http.ResponseWriter, _ *http.Request) {
|
||||||
|
if err := h.rollbackPendingNetworkChange(); err != nil {
|
||||||
|
if err.Error() == "no pending network change" {
|
||||||
|
writeError(w, http.StatusConflict, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeJSON(w, map[string]string{"status": "rolled back"})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) rollbackPendingNetworkChange() error {
|
||||||
h.pendingNetMu.Lock()
|
h.pendingNetMu.Lock()
|
||||||
pnc := h.pendingNet
|
pnc := h.pendingNet
|
||||||
h.pendingNet = nil
|
h.pendingNet = nil
|
||||||
h.pendingNetMu.Unlock()
|
h.pendingNetMu.Unlock()
|
||||||
if pnc == nil {
|
if pnc == nil {
|
||||||
writeError(w, http.StatusConflict, "no pending network change")
|
return fmt.Errorf("no pending network change")
|
||||||
return
|
|
||||||
}
|
}
|
||||||
pnc.mu.Lock()
|
pnc.mu.Lock()
|
||||||
pnc.timer.Stop()
|
pnc.timer.Stop()
|
||||||
pnc.mu.Unlock()
|
pnc.mu.Unlock()
|
||||||
if h.opts.App != nil {
|
if h.opts.App != nil {
|
||||||
_ = h.opts.App.SetInterfaceState(pnc.iface, pnc.wasUp)
|
return h.opts.App.RestoreNetworkSnapshot(pnc.snapshot)
|
||||||
}
|
}
|
||||||
writeJSON(w, map[string]string{"status": "rolled back"})
|
return nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,18 +1,21 @@
|
|||||||
package webui
|
package webui
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
// jobState holds the output lines and completion status of an async job.
|
// jobState holds the output lines and completion status of an async job.
|
||||||
type jobState struct {
|
type jobState struct {
|
||||||
lines []string
|
lines []string
|
||||||
done bool
|
done bool
|
||||||
err string
|
err string
|
||||||
mu sync.Mutex
|
mu sync.Mutex
|
||||||
subs []chan string
|
subs []chan string
|
||||||
cancel func() // optional cancel function; nil if job is not cancellable
|
cancel func() // optional cancel function; nil if job is not cancellable
|
||||||
|
logPath string
|
||||||
}
|
}
|
||||||
|
|
||||||
// abort cancels the job if it has a cancel function and is not yet done.
|
// abort cancels the job if it has a cancel function and is not yet done.
|
||||||
@@ -30,6 +33,9 @@ func (j *jobState) append(line string) {
|
|||||||
j.mu.Lock()
|
j.mu.Lock()
|
||||||
defer j.mu.Unlock()
|
defer j.mu.Unlock()
|
||||||
j.lines = append(j.lines, line)
|
j.lines = append(j.lines, line)
|
||||||
|
if j.logPath != "" {
|
||||||
|
appendJobLog(j.logPath, line)
|
||||||
|
}
|
||||||
for _, ch := range j.subs {
|
for _, ch := range j.subs {
|
||||||
select {
|
select {
|
||||||
case ch <- line:
|
case ch <- line:
|
||||||
@@ -100,3 +106,32 @@ func (m *jobManager) get(id string) (*jobState, bool) {
|
|||||||
j, ok := m.jobs[id]
|
j, ok := m.jobs[id]
|
||||||
return j, ok
|
return j, ok
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func newTaskJobState(logPath string) *jobState {
|
||||||
|
j := &jobState{logPath: logPath}
|
||||||
|
if logPath == "" {
|
||||||
|
return j
|
||||||
|
}
|
||||||
|
data, err := os.ReadFile(logPath)
|
||||||
|
if err != nil || len(data) == 0 {
|
||||||
|
return j
|
||||||
|
}
|
||||||
|
lines := strings.Split(strings.ReplaceAll(string(data), "\r\n", "\n"), "\n")
|
||||||
|
if len(lines) > 0 && lines[len(lines)-1] == "" {
|
||||||
|
lines = lines[:len(lines)-1]
|
||||||
|
}
|
||||||
|
j.lines = append(j.lines, lines...)
|
||||||
|
return j
|
||||||
|
}
|
||||||
|
|
||||||
|
func appendJobLog(path, line string) {
|
||||||
|
if path == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
_, _ = f.WriteString(line + "\n")
|
||||||
|
}
|
||||||
|
|||||||
317
audit/internal/webui/metricsdb.go
Normal file
317
audit/internal/webui/metricsdb.go
Normal file
@@ -0,0 +1,317 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"database/sql"
|
||||||
|
"encoding/csv"
|
||||||
|
"io"
|
||||||
|
"strconv"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
_ "modernc.org/sqlite"
|
||||||
|
)
|
||||||
|
|
||||||
|
const metricsDBPath = "/appdata/bee/metrics.db"
|
||||||
|
|
||||||
|
// MetricsDB persists live metric samples to SQLite.
|
||||||
|
type MetricsDB struct {
|
||||||
|
db *sql.DB
|
||||||
|
}
|
||||||
|
|
||||||
|
// openMetricsDB opens (or creates) the metrics database at the given path.
|
||||||
|
func openMetricsDB(path string) (*MetricsDB, error) {
|
||||||
|
db, err := sql.Open("sqlite", path+"?_journal=WAL&_busy_timeout=5000")
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
db.SetMaxOpenConns(1)
|
||||||
|
if err := initMetricsSchema(db); err != nil {
|
||||||
|
_ = db.Close()
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return &MetricsDB{db: db}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func initMetricsSchema(db *sql.DB) error {
|
||||||
|
_, err := db.Exec(`
|
||||||
|
CREATE TABLE IF NOT EXISTS sys_metrics (
|
||||||
|
ts INTEGER NOT NULL,
|
||||||
|
cpu_load_pct REAL,
|
||||||
|
mem_load_pct REAL,
|
||||||
|
power_w REAL,
|
||||||
|
PRIMARY KEY (ts)
|
||||||
|
);
|
||||||
|
CREATE TABLE IF NOT EXISTS gpu_metrics (
|
||||||
|
ts INTEGER NOT NULL,
|
||||||
|
gpu_index INTEGER NOT NULL,
|
||||||
|
temp_c REAL,
|
||||||
|
usage_pct REAL,
|
||||||
|
mem_usage_pct REAL,
|
||||||
|
power_w REAL,
|
||||||
|
PRIMARY KEY (ts, gpu_index)
|
||||||
|
);
|
||||||
|
CREATE TABLE IF NOT EXISTS fan_metrics (
|
||||||
|
ts INTEGER NOT NULL,
|
||||||
|
name TEXT NOT NULL,
|
||||||
|
rpm REAL,
|
||||||
|
PRIMARY KEY (ts, name)
|
||||||
|
);
|
||||||
|
CREATE TABLE IF NOT EXISTS temp_metrics (
|
||||||
|
ts INTEGER NOT NULL,
|
||||||
|
name TEXT NOT NULL,
|
||||||
|
grp TEXT NOT NULL,
|
||||||
|
celsius REAL,
|
||||||
|
PRIMARY KEY (ts, name)
|
||||||
|
);
|
||||||
|
`)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write inserts one sample into all relevant tables.
|
||||||
|
func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
||||||
|
ts := s.Timestamp.Unix()
|
||||||
|
tx, err := m.db.Begin()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer func() { _ = tx.Rollback() }()
|
||||||
|
|
||||||
|
_, err = tx.Exec(
|
||||||
|
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w) VALUES(?,?,?,?)`,
|
||||||
|
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for _, g := range s.GPUs {
|
||||||
|
_, err = tx.Exec(
|
||||||
|
`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w) VALUES(?,?,?,?,?,?)`,
|
||||||
|
ts, g.GPUIndex, g.TempC, g.UsagePct, g.MemUsagePct, g.PowerW,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, f := range s.Fans {
|
||||||
|
_, err = tx.Exec(
|
||||||
|
`INSERT OR REPLACE INTO fan_metrics(ts,name,rpm) VALUES(?,?,?)`,
|
||||||
|
ts, f.Name, f.RPM,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, t := range s.Temps {
|
||||||
|
_, err = tx.Exec(
|
||||||
|
`INSERT OR REPLACE INTO temp_metrics(ts,name,grp,celsius) VALUES(?,?,?,?)`,
|
||||||
|
ts, t.Name, t.Group, t.Celsius,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return tx.Commit()
|
||||||
|
}
|
||||||
|
|
||||||
|
// LoadRecent returns up to n samples in chronological order (oldest first).
|
||||||
|
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||||
|
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?`, n)
|
||||||
|
}
|
||||||
|
|
||||||
|
// LoadAll returns all persisted samples in chronological order (oldest first).
|
||||||
|
func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
|
||||||
|
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
|
||||||
|
}
|
||||||
|
|
||||||
|
// loadSamples reconstructs LiveMetricSample rows from the normalized tables.
|
||||||
|
func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetricSample, error) {
|
||||||
|
rows, err := m.db.Query(query, args...)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
|
type sysRow struct {
|
||||||
|
ts int64
|
||||||
|
cpu, mem, pwr float64
|
||||||
|
}
|
||||||
|
var sysRows []sysRow
|
||||||
|
for rows.Next() {
|
||||||
|
var r sysRow
|
||||||
|
if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
sysRows = append(sysRows, r)
|
||||||
|
}
|
||||||
|
if len(sysRows) == 0 {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
// Reverse to chronological order
|
||||||
|
for i, j := 0, len(sysRows)-1; i < j; i, j = i+1, j-1 {
|
||||||
|
sysRows[i], sysRows[j] = sysRows[j], sysRows[i]
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collect min/max ts for range query
|
||||||
|
minTS := sysRows[0].ts
|
||||||
|
maxTS := sysRows[len(sysRows)-1].ts
|
||||||
|
|
||||||
|
// Load GPU rows in range
|
||||||
|
type gpuKey struct{ ts int64; idx int }
|
||||||
|
gpuData := map[gpuKey]platform.GPUMetricRow{}
|
||||||
|
gRows, err := m.db.Query(
|
||||||
|
`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
|
||||||
|
minTS, maxTS,
|
||||||
|
)
|
||||||
|
if err == nil {
|
||||||
|
defer gRows.Close()
|
||||||
|
for gRows.Next() {
|
||||||
|
var ts int64
|
||||||
|
var g platform.GPUMetricRow
|
||||||
|
if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW); err == nil {
|
||||||
|
gpuData[gpuKey{ts, g.GPUIndex}] = g
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Load fan rows in range
|
||||||
|
type fanKey struct{ ts int64; name string }
|
||||||
|
fanData := map[fanKey]float64{}
|
||||||
|
fRows, err := m.db.Query(
|
||||||
|
`SELECT ts,name,rpm FROM fan_metrics WHERE ts>=? AND ts<=?`, minTS, maxTS,
|
||||||
|
)
|
||||||
|
if err == nil {
|
||||||
|
defer fRows.Close()
|
||||||
|
for fRows.Next() {
|
||||||
|
var ts int64
|
||||||
|
var name string
|
||||||
|
var rpm float64
|
||||||
|
if err := fRows.Scan(&ts, &name, &rpm); err == nil {
|
||||||
|
fanData[fanKey{ts, name}] = rpm
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Load temp rows in range
|
||||||
|
type tempKey struct{ ts int64; name string }
|
||||||
|
tempData := map[tempKey]platform.TempReading{}
|
||||||
|
tRows, err := m.db.Query(
|
||||||
|
`SELECT ts,name,grp,celsius FROM temp_metrics WHERE ts>=? AND ts<=?`, minTS, maxTS,
|
||||||
|
)
|
||||||
|
if err == nil {
|
||||||
|
defer tRows.Close()
|
||||||
|
for tRows.Next() {
|
||||||
|
var ts int64
|
||||||
|
var t platform.TempReading
|
||||||
|
if err := tRows.Scan(&ts, &t.Name, &t.Group, &t.Celsius); err == nil {
|
||||||
|
tempData[tempKey{ts, t.Name}] = t
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collect unique GPU indices and fan names from loaded data (preserve order)
|
||||||
|
seenGPU := map[int]bool{}
|
||||||
|
var gpuIndices []int
|
||||||
|
for k := range gpuData {
|
||||||
|
if !seenGPU[k.idx] {
|
||||||
|
seenGPU[k.idx] = true
|
||||||
|
gpuIndices = append(gpuIndices, k.idx)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
seenFan := map[string]bool{}
|
||||||
|
var fanNames []string
|
||||||
|
for k := range fanData {
|
||||||
|
if !seenFan[k.name] {
|
||||||
|
seenFan[k.name] = true
|
||||||
|
fanNames = append(fanNames, k.name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
seenTemp := map[string]bool{}
|
||||||
|
var tempNames []string
|
||||||
|
for k := range tempData {
|
||||||
|
if !seenTemp[k.name] {
|
||||||
|
seenTemp[k.name] = true
|
||||||
|
tempNames = append(tempNames, k.name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
samples := make([]platform.LiveMetricSample, len(sysRows))
|
||||||
|
for i, r := range sysRows {
|
||||||
|
s := platform.LiveMetricSample{
|
||||||
|
Timestamp: time.Unix(r.ts, 0).UTC(),
|
||||||
|
CPULoadPct: r.cpu,
|
||||||
|
MemLoadPct: r.mem,
|
||||||
|
PowerW: r.pwr,
|
||||||
|
}
|
||||||
|
for _, idx := range gpuIndices {
|
||||||
|
if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {
|
||||||
|
s.GPUs = append(s.GPUs, g)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, name := range fanNames {
|
||||||
|
if rpm, ok := fanData[fanKey{r.ts, name}]; ok {
|
||||||
|
s.Fans = append(s.Fans, platform.FanReading{Name: name, RPM: rpm})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, name := range tempNames {
|
||||||
|
if t, ok := tempData[tempKey{r.ts, name}]; ok {
|
||||||
|
s.Temps = append(s.Temps, t)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
samples[i] = s
|
||||||
|
}
|
||||||
|
return samples, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ExportCSV writes all sys+gpu data as CSV to w.
|
||||||
|
func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
||||||
|
rows, err := m.db.Query(`
|
||||||
|
SELECT s.ts, s.cpu_load_pct, s.mem_load_pct, s.power_w,
|
||||||
|
g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w
|
||||||
|
FROM sys_metrics s
|
||||||
|
LEFT JOIN gpu_metrics g ON g.ts = s.ts
|
||||||
|
ORDER BY s.ts, g.gpu_index
|
||||||
|
`)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
|
cw := csv.NewWriter(w)
|
||||||
|
_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w"})
|
||||||
|
for rows.Next() {
|
||||||
|
var ts int64
|
||||||
|
var cpu, mem, pwr float64
|
||||||
|
var gpuIdx sql.NullInt64
|
||||||
|
var gpuTemp, gpuUse, gpuMem, gpuPow sql.NullFloat64
|
||||||
|
if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
row := []string{
|
||||||
|
strconv.FormatInt(ts, 10),
|
||||||
|
strconv.FormatFloat(cpu, 'f', 2, 64),
|
||||||
|
strconv.FormatFloat(mem, 'f', 2, 64),
|
||||||
|
strconv.FormatFloat(pwr, 'f', 1, 64),
|
||||||
|
}
|
||||||
|
if gpuIdx.Valid {
|
||||||
|
row = append(row,
|
||||||
|
strconv.FormatInt(gpuIdx.Int64, 10),
|
||||||
|
strconv.FormatFloat(gpuTemp.Float64, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(gpuUse.Float64, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(gpuMem.Float64, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(gpuPow.Float64, 'f', 1, 64),
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
row = append(row, "", "", "", "", "")
|
||||||
|
}
|
||||||
|
_ = cw.Write(row)
|
||||||
|
}
|
||||||
|
cw.Flush()
|
||||||
|
return cw.Error()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Close closes the database.
|
||||||
|
func (m *MetricsDB) Close() { _ = m.db.Close() }
|
||||||
|
|
||||||
|
// nullFloat wraps v in a sql.NullFloat64 that is always marked valid.
func nullFloat(v float64) sql.NullFloat64 {
	var nf sql.NullFloat64
	nf.Float64 = v
	nf.Valid = true
	return nf
}
|
||||||
@@ -61,7 +61,8 @@ tbody tr:hover td{background:rgba(0,0,0,.03)}
|
|||||||
.badge-err{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
|
.badge-err{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
|
||||||
.badge-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
|
.badge-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
|
||||||
/* Output terminal */
|
/* Output terminal */
|
||||||
.terminal{background:#1b1c1d;border:1px solid rgba(0,0,0,.2);border-radius:4px;padding:14px;font-family:monospace;font-size:12px;color:#b5cea8;max-height:400px;overflow-y:auto;white-space:pre-wrap;word-break:break-all}
|
.terminal{background:#1b1c1d;border:1px solid rgba(0,0,0,.2);border-radius:4px;padding:14px;font-family:monospace;font-size:12px;color:#b5cea8;max-height:400px;overflow-y:auto;white-space:pre-wrap;word-break:break-all;user-select:text;-webkit-user-select:text}
|
||||||
|
.terminal-wrap{position:relative}.terminal-copy{position:absolute;top:6px;right:6px;background:#2d2f30;border:1px solid #444;color:#aaa;font-size:11px;padding:2px 8px;border-radius:3px;cursor:pointer;opacity:.7}.terminal-copy:hover{opacity:1}
|
||||||
/* Forms */
|
/* Forms */
|
||||||
.form-row{margin-bottom:14px}
|
.form-row{margin-bottom:14px}
|
||||||
.form-row label{display:block;font-size:12px;color:var(--muted);margin-bottom:5px;font-weight:700}
|
.form-row label{display:block;font-size:12px;color:var(--muted);margin-bottom:5px;font-weight:700}
|
||||||
@@ -83,10 +84,10 @@ tbody tr:hover td{background:rgba(0,0,0,.03)}
|
|||||||
`
|
`
|
||||||
}
|
}
|
||||||
|
|
||||||
func layoutNav(active string) string {
|
func layoutNav(active string, buildLabel string) string {
|
||||||
items := []struct{ id, label, href, onclick string }{
|
items := []struct{ id, label, href, onclick string }{
|
||||||
{"dashboard", "Dashboard", "/", ""},
|
{"dashboard", "Dashboard", "/", ""},
|
||||||
{"audit", "Audit", "#", "openAuditModal();return false;"},
|
{"audit", "Audit", "/audit", ""},
|
||||||
{"validate", "Validate", "/validate", ""},
|
{"validate", "Validate", "/validate", ""},
|
||||||
{"burn", "Burn", "/burn", ""},
|
{"burn", "Burn", "/burn", ""},
|
||||||
{"tasks", "Tasks", "/tasks", ""},
|
{"tasks", "Tasks", "/tasks", ""},
|
||||||
@@ -109,7 +110,12 @@ func layoutNav(active string) string {
|
|||||||
cls, item.href, item.label))
|
cls, item.href, item.label))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
b.WriteString(`</nav></aside>`)
|
if strings.TrimSpace(buildLabel) == "" {
|
||||||
|
buildLabel = "dev"
|
||||||
|
}
|
||||||
|
b.WriteString(`</nav>`)
|
||||||
|
b.WriteString(`<div style="padding:12px 16px;border-top:1px solid rgba(255,255,255,.08);font-size:11px;color:rgba(255,255,255,.45)">Build ` + html.EscapeString(buildLabel) + `</div>`)
|
||||||
|
b.WriteString(`</aside>`)
|
||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -121,6 +127,10 @@ func renderPage(page string, opts HandlerOptions) string {
|
|||||||
pageID = "dashboard"
|
pageID = "dashboard"
|
||||||
title = "Dashboard"
|
title = "Dashboard"
|
||||||
body = renderDashboard(opts)
|
body = renderDashboard(opts)
|
||||||
|
case "audit":
|
||||||
|
pageID = "audit"
|
||||||
|
title = "Audit"
|
||||||
|
body = renderAudit()
|
||||||
case "validate":
|
case "validate":
|
||||||
pageID = "validate"
|
pageID = "validate"
|
||||||
title = "Validate"
|
title = "Validate"
|
||||||
@@ -173,11 +183,21 @@ func renderPage(page string, opts HandlerOptions) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
return layoutHead(opts.Title+" — "+title) +
|
return layoutHead(opts.Title+" — "+title) +
|
||||||
layoutNav(pageID) +
|
layoutNav(pageID, opts.BuildLabel) +
|
||||||
`<div class="main"><div class="topbar"><h1>` + html.EscapeString(title) + `</h1></div><div class="content">` +
|
`<div class="main"><div class="topbar"><h1>` + html.EscapeString(title) + `</h1></div><div class="content">` +
|
||||||
body +
|
body +
|
||||||
`</div></div>` +
|
`</div></div>` +
|
||||||
renderAuditModal() +
|
renderAuditModal() +
|
||||||
|
`<script>
|
||||||
|
// Add copy button to every .terminal on the page
|
||||||
|
document.querySelectorAll('.terminal').forEach(function(t){
|
||||||
|
var w=document.createElement('div');w.className='terminal-wrap';
|
||||||
|
t.parentNode.insertBefore(w,t);w.appendChild(t);
|
||||||
|
var btn=document.createElement('button');btn.className='terminal-copy';btn.textContent='Copy';
|
||||||
|
btn.onclick=function(){navigator.clipboard.writeText(t.textContent).then(function(){btn.textContent='Copied!';setTimeout(function(){btn.textContent='Copy';},1500);});};
|
||||||
|
w.appendChild(btn);
|
||||||
|
});
|
||||||
|
</script>` +
|
||||||
`</body></html>`
|
`</body></html>`
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -191,6 +211,10 @@ func renderDashboard(opts HandlerOptions) string {
|
|||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func renderAudit() string {
|
||||||
|
return `<div class="card"><div class="card-head">Audit Viewer <button class="btn btn-sm btn-secondary" style="margin-left:auto" onclick="openAuditModal()">Actions</button></div><div class="card-body" style="padding:0"><iframe class="viewer-frame" src="/viewer" title="Audit viewer"></iframe></div></div>`
|
||||||
|
}
|
||||||
|
|
||||||
func renderHardwareSummaryCard(opts HandlerOptions) string {
|
func renderHardwareSummaryCard(opts HandlerOptions) string {
|
||||||
data, err := loadSnapshot(opts.AuditPath)
|
data, err := loadSnapshot(opts.AuditPath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -298,14 +322,14 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
|
|||||||
|
|
||||||
func renderAuditModal() string {
|
func renderAuditModal() string {
|
||||||
return `<div id="audit-modal-overlay" style="display:none;position:fixed;inset:0;background:rgba(0,0,0,.5);z-index:100;align-items:center;justify-content:center">
|
return `<div id="audit-modal-overlay" style="display:none;position:fixed;inset:0;background:rgba(0,0,0,.5);z-index:100;align-items:center;justify-content:center">
|
||||||
<div style="background:#fff;border-radius:6px;padding:24px;min-width:480px;max-width:700px;position:relative">
|
<div style="background:#fff;border-radius:6px;padding:24px;min-width:480px;max-width:1100px;width:min(1100px,92vw);max-height:92vh;overflow:auto;position:relative">
|
||||||
<div style="font-weight:700;font-size:16px;margin-bottom:16px">Audit</div>
|
<div style="font-weight:700;font-size:16px;margin-bottom:16px">Audit</div>
|
||||||
<div style="margin-bottom:12px;display:flex;gap:8px">
|
<div style="margin-bottom:12px;display:flex;gap:8px">
|
||||||
<button class="btn btn-primary" onclick="auditModalRun()">▶ Re-run Audit</button>
|
<button class="btn btn-primary" onclick="auditModalRun()">▶ Re-run Audit</button>
|
||||||
<a class="btn btn-secondary" href="/audit.json" download>↓ Download</a>
|
<a class="btn btn-secondary" href="/audit.json" download>↓ Download</a>
|
||||||
<a class="btn btn-secondary" href="/viewer" target="_blank">Open Viewer</a>
|
|
||||||
</div>
|
</div>
|
||||||
<div id="audit-modal-terminal" class="terminal" style="display:none;max-height:300px"></div>
|
<div id="audit-modal-terminal" class="terminal" style="display:none;max-height:220px;margin-bottom:12px"></div>
|
||||||
|
<iframe class="viewer-frame" src="/viewer" title="Audit viewer in modal" style="height:min(70vh,720px)"></iframe>
|
||||||
<button class="btn btn-secondary btn-sm" onclick="closeAuditModal()" style="position:absolute;top:12px;right:12px">✕</button>
|
<button class="btn btn-secondary btn-sm" onclick="closeAuditModal()" style="position:absolute;top:12px;right:12px">✕</button>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@@ -373,9 +397,17 @@ func renderMetrics() string {
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="card" style="margin-bottom:16px">
|
<div class="card" style="margin-bottom:16px">
|
||||||
<div class="card-head">Server — Temperature</div>
|
<div class="card-head">Temperature — CPU</div>
|
||||||
<div class="card-body" style="padding:8px">
|
<div class="card-body" style="padding:8px">
|
||||||
<img id="chart-server-temp" src="/api/metrics/chart/server-temp.svg" style="width:100%;display:block;border-radius:6px" alt="CPU temperature">
|
<img id="chart-server-temp-cpu" src="/api/metrics/chart/server-temp-cpu.svg" style="width:100%;display:block;border-radius:6px" alt="CPU temperature">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">Temperature — Ambient Sensors</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-server-temp-ambient" src="/api/metrics/chart/server-temp-ambient.svg" style="width:100%;display:block;border-radius:6px" alt="Ambient temperature sensors">
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
@@ -383,78 +415,60 @@ func renderMetrics() string {
|
|||||||
<div class="card-head">Server — Power</div>
|
<div class="card-head">Server — Power</div>
|
||||||
<div class="card-body" style="padding:8px">
|
<div class="card-body" style="padding:8px">
|
||||||
<img id="chart-server-power" src="/api/metrics/chart/server-power.svg" style="width:100%;display:block;border-radius:6px" alt="System power">
|
<img id="chart-server-power" src="/api/metrics/chart/server-power.svg" style="width:100%;display:block;border-radius:6px" alt="System power">
|
||||||
<div id="sys-table" style="margin-top:8px;font-size:12px"></div>
|
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div id="gpu-charts"></div>
|
<div id="card-server-fans" class="card" style="margin-bottom:16px;display:none">
|
||||||
|
<div class="card-head">Server — Fan RPM</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-server-fans" src="/api/metrics/chart/server-fans.svg" style="width:100%;display:block;border-radius:6px" alt="Fan RPM">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">GPU — Compute Load</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-gpu-all-load" src="/api/metrics/chart/gpu-all-load.svg" style="width:100%;display:block;border-radius:6px" alt="GPU compute load">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">GPU — Memory Load</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-gpu-all-memload" src="/api/metrics/chart/gpu-all-memload.svg" style="width:100%;display:block;border-radius:6px" alt="GPU memory load">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">GPU — Power</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-gpu-all-power" src="/api/metrics/chart/gpu-all-power.svg" style="width:100%;display:block;border-radius:6px" alt="GPU power">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">GPU — Temperature</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-gpu-all-temp" src="/api/metrics/chart/gpu-all-temp.svg" style="width:100%;display:block;border-radius:6px" alt="GPU temperature">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
let knownGPUs = [];
|
|
||||||
|
|
||||||
function refreshCharts() {
|
function refreshCharts() {
|
||||||
const t = '?t=' + Date.now();
|
const t = '?t=' + Date.now();
|
||||||
['chart-server-load','chart-server-temp','chart-server-power'].forEach(id => {
|
['chart-server-load','chart-server-temp-cpu','chart-server-temp-gpu','chart-server-temp-ambient','chart-server-power','chart-server-fans',
|
||||||
|
'chart-gpu-all-load','chart-gpu-all-memload','chart-gpu-all-power','chart-gpu-all-temp'].forEach(id => {
|
||||||
const el = document.getElementById(id);
|
const el = document.getElementById(id);
|
||||||
if (el) el.src = el.src.split('?')[0] + t;
|
if (el) el.src = el.src.split('?')[0] + t;
|
||||||
});
|
});
|
||||||
knownGPUs.forEach(idx => {
|
|
||||||
['load','temp','power'].forEach(kind => {
|
|
||||||
const el = document.getElementById('chart-gpu-' + idx + '-' + kind);
|
|
||||||
if (el) el.src = el.src.split('?')[0] + t;
|
|
||||||
});
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
setInterval(refreshCharts, 2000);
|
setInterval(refreshCharts, 3000);
|
||||||
|
|
||||||
const es = new EventSource('/api/metrics/stream');
|
const es = new EventSource('/api/metrics/stream');
|
||||||
es.addEventListener('metrics', e => {
|
es.addEventListener('metrics', e => {
|
||||||
const d = JSON.parse(e.data);
|
const d = JSON.parse(e.data);
|
||||||
|
|
||||||
// Add GPU chart cards as GPUs appear
|
// Show/hide Fan RPM card based on data availability
|
||||||
(d.gpus||[]).forEach(g => {
|
const fanCard = document.getElementById('card-server-fans');
|
||||||
if (knownGPUs.includes(g.index)) return;
|
if (fanCard) fanCard.style.display = (d.fans && d.fans.length > 0) ? '' : 'none';
|
||||||
knownGPUs.push(g.index);
|
|
||||||
const div = document.createElement('div');
|
|
||||||
div.className = 'card';
|
|
||||||
div.style.marginBottom = '16px';
|
|
||||||
div.innerHTML =
|
|
||||||
'<div class="card-head">GPU ' + g.index + ' — Load</div>' +
|
|
||||||
'<div class="card-body" style="padding:8px">' +
|
|
||||||
'<img id="chart-gpu-' + g.index + '-load" src="/api/metrics/chart/gpu/' + g.index + '-load.svg" style="width:100%;display:block;border-radius:6px" alt="GPU ' + g.index + ' load">' +
|
|
||||||
'</div>' +
|
|
||||||
'<div class="card-head">GPU ' + g.index + ' — Temperature</div>' +
|
|
||||||
'<div class="card-body" style="padding:8px">' +
|
|
||||||
'<img id="chart-gpu-' + g.index + '-temp" src="/api/metrics/chart/gpu/' + g.index + '-temp.svg" style="width:100%;display:block;border-radius:6px" alt="GPU ' + g.index + ' temp">' +
|
|
||||||
'</div>' +
|
|
||||||
'<div class="card-head">GPU ' + g.index + ' — Power</div>' +
|
|
||||||
'<div class="card-body" style="padding:8px">' +
|
|
||||||
'<img id="chart-gpu-' + g.index + '-power" src="/api/metrics/chart/gpu/' + g.index + '-power.svg" style="width:100%;display:block;border-radius:6px" alt="GPU ' + g.index + ' power">' +
|
|
||||||
'<div id="gpu-table-' + g.index + '" style="margin-top:8px;font-size:12px"></div>' +
|
|
||||||
'</div>';
|
|
||||||
document.getElementById('gpu-charts').appendChild(div);
|
|
||||||
});
|
|
||||||
|
|
||||||
// Update numeric tables
|
|
||||||
let sysHTML = '';
|
|
||||||
const cpuTemp = (d.temps||[]).find(t => t.name==='CPU');
|
|
||||||
if (cpuTemp) sysHTML += '<tr><td>CPU Temp</td><td>'+cpuTemp.celsius.toFixed(1)+'°C</td></tr>';
|
|
||||||
if (d.cpu_load_pct) sysHTML += '<tr><td>CPU Load</td><td>'+d.cpu_load_pct.toFixed(1)+'%</td></tr>';
|
|
||||||
if (d.mem_load_pct) sysHTML += '<tr><td>Mem Load</td><td>'+d.mem_load_pct.toFixed(1)+'%</td></tr>';
|
|
||||||
(d.fans||[]).forEach(f => sysHTML += '<tr><td>'+f.name+'</td><td>'+f.rpm+' RPM</td></tr>');
|
|
||||||
if (d.power_w) sysHTML += '<tr><td>Power</td><td>'+d.power_w.toFixed(0)+' W</td></tr>';
|
|
||||||
const st = document.getElementById('sys-table');
|
|
||||||
if (st) st.innerHTML = sysHTML ? '<table>'+sysHTML+'</table>' : '<p style="color:var(--muted)">No sensor data (ipmitool/sensors required)</p>';
|
|
||||||
|
|
||||||
(d.gpus||[]).forEach(g => {
|
|
||||||
const t = document.getElementById('gpu-table-' + g.index);
|
|
||||||
if (!t) return;
|
|
||||||
t.innerHTML = '<table>' +
|
|
||||||
'<tr><td>Temp</td><td>'+g.temp_c+'°C</td>' +
|
|
||||||
'<td>Load</td><td>'+g.usage_pct+'%</td>' +
|
|
||||||
'<td>Mem</td><td>'+g.mem_usage_pct+'%</td>' +
|
|
||||||
'<td>Power</td><td>'+g.power_w+' W</td></tr></table>';
|
|
||||||
});
|
|
||||||
});
|
});
|
||||||
es.onerror = () => {};
|
es.onerror = () => {};
|
||||||
</script>`
|
</script>`
|
||||||
@@ -480,7 +494,11 @@ func renderValidate() string {
|
|||||||
renderSATCard("memory", "Memory", "") +
|
renderSATCard("memory", "Memory", "") +
|
||||||
renderSATCard("storage", "Storage", "") +
|
renderSATCard("storage", "Storage", "") +
|
||||||
renderSATCard("cpu", "CPU", `<div class="form-row"><label>Duration (seconds)</label><input type="number" id="sat-cpu-dur" value="60" min="10"></div>`) +
|
renderSATCard("cpu", "CPU", `<div class="form-row"><label>Duration (seconds)</label><input type="number" id="sat-cpu-dur" value="60" min="10"></div>`) +
|
||||||
renderSATCard("amd", "AMD GPU", "") +
|
renderSATCard("amd", "AMD GPU", `<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||||
|
<button id="sat-btn-amd-mem" class="btn" type="button" onclick="runSAT('amd-mem')">MEM Integrity</button>
|
||||||
|
<button id="sat-btn-amd-bandwidth" class="btn" type="button" onclick="runSAT('amd-bandwidth')">MEM Bandwidth</button>
|
||||||
|
</div>
|
||||||
|
<p style="color:var(--muted);font-size:12px;margin:0">Additional AMD memory diagnostics: RVS MEM for integrity and BABEL + rocm-bandwidth-test for memory/interconnect bandwidth.</p>`) +
|
||||||
`</div>
|
`</div>
|
||||||
<div id="sat-output" style="display:none;margin-top:16px" class="card">
|
<div id="sat-output" style="display:none;margin-top:16px" class="card">
|
||||||
<div class="card-head">Test Output <span id="sat-title"></span></div>
|
<div class="card-head">Test Output <span id="sat-title"></span></div>
|
||||||
@@ -491,6 +509,8 @@ let satES = null;
|
|||||||
function runSAT(target) {
|
function runSAT(target) {
|
||||||
if (satES) { satES.close(); satES = null; }
|
if (satES) { satES.close(); satES = null; }
|
||||||
const body = {};
|
const body = {};
|
||||||
|
const labels = {nvidia:'Validate GPU', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||||
|
body.display_name = labels[target] || ('Validate ' + target);
|
||||||
if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1;
|
if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1;
|
||||||
if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60;
|
if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60;
|
||||||
document.getElementById('sat-output').style.display='block';
|
document.getElementById('sat-output').style.display='block';
|
||||||
@@ -508,7 +528,7 @@ function runSAT(target) {
|
|||||||
}
|
}
|
||||||
function runAllSAT() {
|
function runAllSAT() {
|
||||||
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
|
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
|
||||||
const targets = ['nvidia','memory','storage','cpu','amd'];
|
const targets = ['nvidia','memory','storage','cpu','amd','amd-mem','amd-bandwidth'];
|
||||||
const total = targets.length * cycles;
|
const total = targets.length * cycles;
|
||||||
let enqueued = 0;
|
let enqueued = 0;
|
||||||
const status = document.getElementById('sat-all-status');
|
const status = document.getElementById('sat-all-status');
|
||||||
@@ -520,6 +540,8 @@ function runAllSAT() {
|
|||||||
const btn = document.getElementById('sat-btn-' + target);
|
const btn = document.getElementById('sat-btn-' + target);
|
||||||
if (btn && btn.disabled) { enqueueNext(cycle, idx+1); return; }
|
if (btn && btn.disabled) { enqueueNext(cycle, idx+1); return; }
|
||||||
const body = {};
|
const body = {};
|
||||||
|
const labels = {nvidia:'Validate GPU', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||||
|
body.display_name = labels[target] || ('Validate ' + target);
|
||||||
if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1;
|
if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1;
|
||||||
if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60;
|
if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60;
|
||||||
fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(body)})
|
fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(body)})
|
||||||
@@ -536,6 +558,8 @@ function runAllSAT() {
|
|||||||
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
||||||
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
|
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
|
||||||
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
|
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
|
||||||
|
if (!gp.amd) disableSATCard('amd-mem', 'No AMD GPU detected');
|
||||||
|
if (!gp.amd) disableSATCard('amd-bandwidth', 'No AMD GPU detected');
|
||||||
});
|
});
|
||||||
function disableSATCard(id, reason) {
|
function disableSATCard(id, reason) {
|
||||||
const btn = document.getElementById('sat-btn-' + id);
|
const btn = document.getElementById('sat-btn-' + id);
|
||||||
@@ -568,17 +592,19 @@ func renderSATCard(id, label, extra string) string {
|
|||||||
func renderBurn() string {
|
func renderBurn() string {
|
||||||
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at maximum load. Repeated or prolonged use may reduce hardware lifespan (storage endurance, GPU wear). Use only when necessary.</div>
|
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at maximum load. Repeated or prolonged use may reduce hardware lifespan (storage endurance, GPU wear). Use only when necessary.</div>
|
||||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||||
|
<div class="card"><div class="card-head">Burn Profile</div><div class="card-body">
|
||||||
|
<div class="form-row" style="max-width:320px"><label>Preset</label><select id="burn-profile"><option value="smoke">Smoke: 5 minutes</option><option value="acceptance">Acceptance: 1 hour</option><option value="overnight">Overnight: 8 hours</option></select></div>
|
||||||
|
<p style="color:var(--muted);font-size:12px">Applied to all tests on this page. NVIDIA uses mapped DCGM levels: smoke=quick, acceptance=targeted stress, overnight=extended stress.</p>
|
||||||
|
</div></div>
|
||||||
<div class="grid3">
|
<div class="grid3">
|
||||||
<div class="card"><div class="card-head">NVIDIA GPU Stress</div><div class="card-body">
|
<div class="card"><div class="card-head">NVIDIA GPU Stress</div><div class="card-body">
|
||||||
<div class="form-row"><label>Duration</label><select id="bi-dur"><option value="600">10 minutes</option><option value="3600">1 hour</option><option value="28800">8 hours</option><option value="86400">24 hours</option></select></div>
|
|
||||||
<button id="sat-btn-nvidia" class="btn btn-primary" onclick="runBurnIn('nvidia')">▶ Start NVIDIA Stress</button>
|
<button id="sat-btn-nvidia" class="btn btn-primary" onclick="runBurnIn('nvidia')">▶ Start NVIDIA Stress</button>
|
||||||
</div></div>
|
</div></div>
|
||||||
<div class="card"><div class="card-head">CPU Stress</div><div class="card-body">
|
<div class="card"><div class="card-head">CPU Stress</div><div class="card-body">
|
||||||
<div class="form-row"><label>Duration (seconds)</label><input type="number" id="bi-cpu-dur" value="300" min="60"></div>
|
|
||||||
<button class="btn btn-primary" onclick="runBurnIn('cpu')">▶ Start CPU Stress</button>
|
<button class="btn btn-primary" onclick="runBurnIn('cpu')">▶ Start CPU Stress</button>
|
||||||
</div></div>
|
</div></div>
|
||||||
<div class="card"><div class="card-head">AMD GPU Stress</div><div class="card-body">
|
<div class="card"><div class="card-head">AMD GPU Stress</div><div class="card-body">
|
||||||
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Requires ROCm tools (rocm-bandwidth-test). Missing tools reported as UNSUPPORTED.</p>
|
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Runs ROCm compute stress together with VRAM copy/load activity via RVS GST and records a separate <code>rocm-bandwidth-test</code> snapshot. Missing tools reported as UNSUPPORTED.</p>
|
||||||
<button id="sat-btn-amd-stress" class="btn btn-primary" onclick="runBurnIn('amd-stress')">▶ Start AMD Stress</button>
|
<button id="sat-btn-amd-stress" class="btn btn-primary" onclick="runBurnIn('amd-stress')">▶ Start AMD Stress</button>
|
||||||
</div></div>
|
</div></div>
|
||||||
<div class="card"><div class="card-head">Memory Stress</div><div class="card-body">
|
<div class="card"><div class="card-head">Memory Stress</div><div class="card-body">
|
||||||
@@ -589,6 +615,10 @@ func renderBurn() string {
|
|||||||
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Google stressapptest saturates CPU, memory and cache buses simultaneously. Env: <code>BEE_SAT_STRESS_SECONDS</code> (default 300), <code>BEE_SAT_STRESS_MB</code> (default auto).</p>
|
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Google stressapptest saturates CPU, memory and cache buses simultaneously. Env: <code>BEE_SAT_STRESS_SECONDS</code> (default 300), <code>BEE_SAT_STRESS_MB</code> (default auto).</p>
|
||||||
<button class="btn btn-primary" onclick="runBurnIn('sat-stress')">▶ Start SAT Stress</button>
|
<button class="btn btn-primary" onclick="runBurnIn('sat-stress')">▶ Start SAT Stress</button>
|
||||||
</div></div>
|
</div></div>
|
||||||
|
<div class="card"><div class="card-head">Platform Thermal Cycling</div><div class="card-body">
|
||||||
|
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Runs CPU + GPU stress simultaneously across multiple load/idle cycles with varying durations. Detects cooling systems that fail to recover under repeated load cycles. Smoke: 2 cycles ~5 min. Acceptance: 4 cycles ~25 min.</p>
|
||||||
|
<button class="btn btn-primary" onclick="runBurnIn('platform-stress')">▶ Start Thermal Cycling</button>
|
||||||
|
</div></div>
|
||||||
</div>
|
</div>
|
||||||
<div id="bi-output" style="display:none;margin-top:16px" class="card">
|
<div id="bi-output" style="display:none;margin-top:16px" class="card">
|
||||||
<div class="card-head">Output <span id="bi-title"></span></div>
|
<div class="card-head">Output <span id="bi-title"></span></div>
|
||||||
@@ -598,11 +628,9 @@ func renderBurn() string {
|
|||||||
let biES = null;
|
let biES = null;
|
||||||
function runBurnIn(target) {
|
function runBurnIn(target) {
|
||||||
if (biES) { biES.close(); biES = null; }
|
if (biES) { biES.close(); biES = null; }
|
||||||
const body = {};
|
const body = { profile: document.getElementById('burn-profile').value || 'smoke' };
|
||||||
if (target === 'nvidia') body.duration = parseInt(document.getElementById('bi-dur').value)||600;
|
|
||||||
if (target === 'cpu') body.duration = parseInt(document.getElementById('bi-cpu-dur').value)||300;
|
|
||||||
document.getElementById('bi-output').style.display='block';
|
document.getElementById('bi-output').style.display='block';
|
||||||
document.getElementById('bi-title').textContent = '— ' + target;
|
document.getElementById('bi-title').textContent = '— ' + target + ' [' + body.profile + ']';
|
||||||
const term = document.getElementById('bi-terminal');
|
const term = document.getElementById('bi-terminal');
|
||||||
term.textContent = 'Enqueuing ' + target + ' stress...\n';
|
term.textContent = 'Enqueuing ' + target + ' stress...\n';
|
||||||
fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(body)})
|
fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(body)})
|
||||||
@@ -672,7 +700,7 @@ var _netCountdownTimer = null;
|
|||||||
function loadNetwork() {
|
function loadNetwork() {
|
||||||
fetch('/api/network').then(r=>r.json()).then(d => {
|
fetch('/api/network').then(r=>r.json()).then(d => {
|
||||||
const rows = (d.interfaces||[]).map(i =>
|
const rows = (d.interfaces||[]).map(i =>
|
||||||
'<tr><td>'+i.Name+'</td>' +
|
'<tr><td style="cursor:pointer" onclick="selectIface(\''+i.Name+'\')" title="Use this interface in the forms below"><span style="text-decoration:underline">'+i.Name+'</span></td>' +
|
||||||
'<td style="cursor:pointer" onclick="toggleIface(\''+i.Name+'\',\''+i.State+'\')" title="Click to toggle"><span class="badge '+(i.State==='up'?'badge-ok':'badge-warn')+'">'+i.State+'</span></td>' +
|
'<td style="cursor:pointer" onclick="toggleIface(\''+i.Name+'\',\''+i.State+'\')" title="Click to toggle"><span class="badge '+(i.State==='up'?'badge-ok':'badge-warn')+'">'+i.State+'</span></td>' +
|
||||||
'<td>'+(i.IPv4||[]).join(', ')+'</td></tr>'
|
'<td>'+(i.IPv4||[]).join(', ')+'</td></tr>'
|
||||||
).join('');
|
).join('');
|
||||||
@@ -681,6 +709,10 @@ function loadNetwork() {
|
|||||||
(d.default_route ? '<p style="font-size:12px;color:var(--muted);margin-top:8px">Default route: '+d.default_route+'</p>' : '');
|
(d.default_route ? '<p style="font-size:12px;color:var(--muted);margin-top:8px">Default route: '+d.default_route+'</p>' : '');
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
function selectIface(iface) {
|
||||||
|
document.getElementById('dhcp-iface').value = iface;
|
||||||
|
document.getElementById('st-iface').value = iface;
|
||||||
|
}
|
||||||
function toggleIface(iface, currentState) {
|
function toggleIface(iface, currentState) {
|
||||||
fetch('/api/network/toggle',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({iface:iface})})
|
fetch('/api/network/toggle',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({iface:iface})})
|
||||||
.then(r=>r.json()).then(d => {
|
.then(r=>r.json()).then(d => {
|
||||||
@@ -716,6 +748,7 @@ function runDHCP() {
|
|||||||
fetch('/api/network/dhcp',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({interface:iface||'all'})})
|
fetch('/api/network/dhcp',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({interface:iface||'all'})})
|
||||||
.then(r=>r.json()).then(d => {
|
.then(r=>r.json()).then(d => {
|
||||||
document.getElementById('dhcp-out').textContent = d.output || d.error || 'Done.';
|
document.getElementById('dhcp-out').textContent = d.output || d.error || 'Done.';
|
||||||
|
if (!d.error) showNetPending(d.rollback_in || 60);
|
||||||
loadNetwork();
|
loadNetwork();
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@@ -729,6 +762,7 @@ function setStatic() {
|
|||||||
dns: dns,
|
dns: dns,
|
||||||
})}).then(r=>r.json()).then(d => {
|
})}).then(r=>r.json()).then(d => {
|
||||||
document.getElementById('static-out').textContent = d.output || d.error || 'Done.';
|
document.getElementById('static-out').textContent = d.output || d.error || 'Done.';
|
||||||
|
if (!d.error) showNetPending(d.rollback_in || 60);
|
||||||
loadNetwork();
|
loadNetwork();
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@@ -846,10 +880,17 @@ func listExportFiles(exportDir string) ([]string, error) {
|
|||||||
|
|
||||||
func renderTools() string {
|
func renderTools() string {
|
||||||
return `<div class="card" style="margin-bottom:16px">
|
return `<div class="card" style="margin-bottom:16px">
|
||||||
<div class="card-head">Install to RAM</div>
|
<div class="card-head">System Install</div>
|
||||||
<div class="card-body">
|
<div class="card-body">
|
||||||
|
<div style="margin-bottom:20px">
|
||||||
|
<div style="font-weight:600;margin-bottom:8px">Install to RAM</div>
|
||||||
<p id="ram-status-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Checking...</p>
|
<p id="ram-status-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Checking...</p>
|
||||||
<button id="ram-install-btn" class="btn btn-primary" onclick="installToRAM()" style="display:none">▶ Copy to RAM</button>
|
<button id="ram-install-btn" class="btn btn-primary" onclick="installToRAM()" style="display:none">▶ Copy to RAM</button>
|
||||||
|
</div>
|
||||||
|
<div style="border-top:1px solid var(--line);padding-top:20px">
|
||||||
|
<div style="font-weight:600;margin-bottom:8px">Install to Disk</div>` +
|
||||||
|
renderInstallInline() + `
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<script>
|
<script>
|
||||||
@@ -886,9 +927,6 @@ function installToRAM() {
|
|||||||
<div class="card"><div class="card-head">Services</div><div class="card-body">` +
|
<div class="card"><div class="card-head">Services</div><div class="card-body">` +
|
||||||
renderServicesInline() + `</div></div>
|
renderServicesInline() + `</div></div>
|
||||||
|
|
||||||
<div class="card"><div class="card-head">Install to Disk</div><div class="card-body">` +
|
|
||||||
renderInstallInline() + `</div></div>
|
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
function checkTools() {
|
function checkTools() {
|
||||||
document.getElementById('tools-table').innerHTML = '<p style="color:var(--muted);font-size:13px">Checking...</p>';
|
document.getElementById('tools-table').innerHTML = '<p style="color:var(--muted);font-size:13px">Checking...</p>';
|
||||||
@@ -939,8 +977,6 @@ func renderInstallInline() string {
|
|||||||
<div id="install-terminal" class="terminal" style="max-height:500px"></div>
|
<div id="install-terminal" class="terminal" style="max-height:500px"></div>
|
||||||
<div id="install-status" style="margin-top:12px;font-size:13px"></div>
|
<div id="install-status" style="margin-top:12px;font-size:13px"></div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<style>
|
<style>
|
||||||
#install-disk-tbody tr{cursor:pointer}
|
#install-disk-tbody tr{cursor:pointer}
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import (
|
|||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"html"
|
||||||
"mime"
|
"mime"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
@@ -13,6 +14,7 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
"bee/audit/internal/app"
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
"bee/audit/internal/runtimeenv"
|
"bee/audit/internal/runtimeenv"
|
||||||
gocharts "github.com/go-analyze/charts"
|
gocharts "github.com/go-analyze/charts"
|
||||||
"reanimator/chart/viewer"
|
"reanimator/chart/viewer"
|
||||||
@@ -35,6 +37,7 @@ func init() {
|
|||||||
// HandlerOptions configures the web UI handler.
|
// HandlerOptions configures the web UI handler.
|
||||||
type HandlerOptions struct {
|
type HandlerOptions struct {
|
||||||
Title string
|
Title string
|
||||||
|
BuildLabel string
|
||||||
AuditPath string
|
AuditPath string
|
||||||
ExportDir string
|
ExportDir string
|
||||||
App *app.App
|
App *app.App
|
||||||
@@ -69,29 +72,36 @@ func (r *metricsRing) snapshot() ([]float64, []string) {
|
|||||||
defer r.mu.Unlock()
|
defer r.mu.Unlock()
|
||||||
v := make([]float64, len(r.vals))
|
v := make([]float64, len(r.vals))
|
||||||
copy(v, r.vals)
|
copy(v, r.vals)
|
||||||
now := time.Now()
|
|
||||||
labels := make([]string, len(r.times))
|
labels := make([]string, len(r.times))
|
||||||
|
if len(r.times) == 0 {
|
||||||
|
return v, labels
|
||||||
|
}
|
||||||
|
sameDay := timestampsSameLocalDay(r.times)
|
||||||
for i, t := range r.times {
|
for i, t := range r.times {
|
||||||
labels[i] = relAgeLabel(now.Sub(t))
|
labels[i] = formatTimelineLabel(t.Local(), sameDay)
|
||||||
}
|
}
|
||||||
return v, labels
|
return v, labels
|
||||||
}
|
}
|
||||||
|
|
||||||
func relAgeLabel(age time.Duration) string {
|
func timestampsSameLocalDay(times []time.Time) bool {
|
||||||
if age <= 0 {
|
if len(times) == 0 {
|
||||||
return "0"
|
return true
|
||||||
}
|
}
|
||||||
if age < time.Hour {
|
first := times[0].Local()
|
||||||
m := int(age.Minutes())
|
for _, t := range times[1:] {
|
||||||
if m == 0 {
|
local := t.Local()
|
||||||
return "-<1m"
|
if local.Year() != first.Year() || local.YearDay() != first.YearDay() {
|
||||||
|
return false
|
||||||
}
|
}
|
||||||
return fmt.Sprintf("-%dm", m)
|
|
||||||
}
|
}
|
||||||
if age < 24*time.Hour {
|
return true
|
||||||
return fmt.Sprintf("-%dh", int(age.Hours()))
|
}
|
||||||
|
|
||||||
|
func formatTimelineLabel(ts time.Time, sameDay bool) string {
|
||||||
|
if sameDay {
|
||||||
|
return ts.Format("15:04")
|
||||||
}
|
}
|
||||||
return fmt.Sprintf("-%dd", int(age.Hours()/24))
|
return ts.Format("01-02 15:04")
|
||||||
}
|
}
|
||||||
|
|
||||||
// gpuRings holds per-GPU ring buffers.
|
// gpuRings holds per-GPU ring buffers.
|
||||||
@@ -102,31 +112,40 @@ type gpuRings struct {
|
|||||||
Power *metricsRing
|
Power *metricsRing
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type namedMetricsRing struct {
|
||||||
|
Name string
|
||||||
|
Ring *metricsRing
|
||||||
|
}
|
||||||
|
|
||||||
// pendingNetChange tracks a network state change awaiting confirmation.
|
// pendingNetChange tracks a network state change awaiting confirmation.
|
||||||
type pendingNetChange struct {
|
type pendingNetChange struct {
|
||||||
iface string
|
snapshot platform.NetworkSnapshot
|
||||||
wasUp bool
|
timer *time.Timer
|
||||||
timer *time.Timer
|
mu sync.Mutex
|
||||||
mu sync.Mutex
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// handler is the HTTP handler for the web UI.
|
// handler is the HTTP handler for the web UI.
|
||||||
type handler struct {
|
type handler struct {
|
||||||
opts HandlerOptions
|
opts HandlerOptions
|
||||||
mux *http.ServeMux
|
mux *http.ServeMux
|
||||||
// server rings
|
// server rings
|
||||||
ringCPUTemp *metricsRing
|
ringCPULoad *metricsRing
|
||||||
ringCPULoad *metricsRing
|
ringMemLoad *metricsRing
|
||||||
ringMemLoad *metricsRing
|
ringPower *metricsRing
|
||||||
ringPower *metricsRing
|
ringFans []*metricsRing
|
||||||
ringFans []*metricsRing
|
fanNames []string
|
||||||
fanNames []string
|
cpuTempRings []*namedMetricsRing
|
||||||
|
ambientTempRings []*namedMetricsRing
|
||||||
// per-GPU rings (index = GPU index)
|
// per-GPU rings (index = GPU index)
|
||||||
gpuRings []*gpuRings
|
gpuRings []*gpuRings
|
||||||
ringsMu sync.Mutex
|
ringsMu sync.Mutex
|
||||||
|
latestMu sync.RWMutex
|
||||||
|
latest *platform.LiveMetricSample
|
||||||
|
// metrics persistence (nil if DB unavailable)
|
||||||
|
metricsDB *MetricsDB
|
||||||
// install job (at most one at a time)
|
// install job (at most one at a time)
|
||||||
installJob *jobState
|
installJob *jobState
|
||||||
installMu sync.Mutex
|
installMu sync.Mutex
|
||||||
// pending network change (rollback on timeout)
|
// pending network change (rollback on timeout)
|
||||||
pendingNet *pendingNetChange
|
pendingNet *pendingNetChange
|
||||||
pendingNetMu sync.Mutex
|
pendingNetMu sync.Mutex
|
||||||
@@ -146,16 +165,31 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
|
|
||||||
h := &handler{
|
h := &handler{
|
||||||
opts: opts,
|
opts: opts,
|
||||||
ringCPUTemp: newMetricsRing(120),
|
|
||||||
ringCPULoad: newMetricsRing(120),
|
ringCPULoad: newMetricsRing(120),
|
||||||
ringMemLoad: newMetricsRing(120),
|
ringMemLoad: newMetricsRing(120),
|
||||||
ringPower: newMetricsRing(120),
|
ringPower: newMetricsRing(120),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Open metrics DB and pre-fill ring buffers from history.
|
||||||
|
if db, err := openMetricsDB(metricsDBPath); err == nil {
|
||||||
|
h.metricsDB = db
|
||||||
|
if samples, err := db.LoadRecent(120); err == nil {
|
||||||
|
for _, s := range samples {
|
||||||
|
h.feedRings(s)
|
||||||
|
}
|
||||||
|
if len(samples) > 0 {
|
||||||
|
h.setLatestMetric(samples[len(samples)-1])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
h.startMetricsCollector()
|
||||||
|
|
||||||
globalQueue.startWorker(&opts)
|
globalQueue.startWorker(&opts)
|
||||||
mux := http.NewServeMux()
|
mux := http.NewServeMux()
|
||||||
|
|
||||||
// ── Infrastructure ──────────────────────────────────────────────────────
|
// ── Infrastructure ──────────────────────────────────────────────────────
|
||||||
mux.HandleFunc("GET /healthz", h.handleHealthz)
|
mux.HandleFunc("GET /healthz", h.handleHealthz)
|
||||||
|
mux.HandleFunc("GET /api/ready", h.handleReady)
|
||||||
|
|
||||||
// ── Existing read-only endpoints (preserved for compatibility) ──────────
|
// ── Existing read-only endpoints (preserved for compatibility) ──────────
|
||||||
mux.HandleFunc("GET /audit.json", h.handleAuditJSON)
|
mux.HandleFunc("GET /audit.json", h.handleAuditJSON)
|
||||||
@@ -176,9 +210,12 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
|
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
|
||||||
mux.HandleFunc("POST /api/sat/cpu/run", h.handleAPISATRun("cpu"))
|
mux.HandleFunc("POST /api/sat/cpu/run", h.handleAPISATRun("cpu"))
|
||||||
mux.HandleFunc("POST /api/sat/amd/run", h.handleAPISATRun("amd"))
|
mux.HandleFunc("POST /api/sat/amd/run", h.handleAPISATRun("amd"))
|
||||||
mux.HandleFunc("POST /api/sat/amd-stress/run", h.handleAPISATRun("amd-stress"))
|
mux.HandleFunc("POST /api/sat/amd-mem/run", h.handleAPISATRun("amd-mem"))
|
||||||
|
mux.HandleFunc("POST /api/sat/amd-bandwidth/run", h.handleAPISATRun("amd-bandwidth"))
|
||||||
|
mux.HandleFunc("POST /api/sat/amd-stress/run", h.handleAPISATRun("amd-stress"))
|
||||||
mux.HandleFunc("POST /api/sat/memory-stress/run", h.handleAPISATRun("memory-stress"))
|
mux.HandleFunc("POST /api/sat/memory-stress/run", h.handleAPISATRun("memory-stress"))
|
||||||
mux.HandleFunc("POST /api/sat/sat-stress/run", h.handleAPISATRun("sat-stress"))
|
mux.HandleFunc("POST /api/sat/sat-stress/run", h.handleAPISATRun("sat-stress"))
|
||||||
|
mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress"))
|
||||||
mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
|
mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
|
||||||
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
||||||
|
|
||||||
@@ -223,9 +260,10 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
mux.HandleFunc("POST /api/install/run", h.handleAPIInstallRun)
|
mux.HandleFunc("POST /api/install/run", h.handleAPIInstallRun)
|
||||||
mux.HandleFunc("GET /api/install/stream", h.handleAPIInstallStream)
|
mux.HandleFunc("GET /api/install/stream", h.handleAPIInstallStream)
|
||||||
|
|
||||||
// Metrics — SSE stream of live sensor data + server-side SVG charts
|
// Metrics — SSE stream of live sensor data + server-side SVG charts + CSV export
|
||||||
mux.HandleFunc("GET /api/metrics/stream", h.handleAPIMetricsStream)
|
mux.HandleFunc("GET /api/metrics/stream", h.handleAPIMetricsStream)
|
||||||
mux.HandleFunc("GET /api/metrics/chart/", h.handleMetricsChartSVG)
|
mux.HandleFunc("GET /api/metrics/chart/", h.handleMetricsChartSVG)
|
||||||
|
mux.HandleFunc("GET /api/metrics/export.csv", h.handleAPIMetricsExportCSV)
|
||||||
|
|
||||||
// Reanimator chart static assets (viewer template expects /static/*)
|
// Reanimator chart static assets (viewer template expects /static/*)
|
||||||
mux.Handle("GET /static/", http.StripPrefix("/static/", web.Static()))
|
mux.Handle("GET /static/", http.StripPrefix("/static/", web.Static()))
|
||||||
@@ -237,6 +275,37 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
return mux
|
return mux
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (h *handler) startMetricsCollector() {
|
||||||
|
go func() {
|
||||||
|
ticker := time.NewTicker(1 * time.Second)
|
||||||
|
defer ticker.Stop()
|
||||||
|
for range ticker.C {
|
||||||
|
sample := platform.SampleLiveMetrics()
|
||||||
|
h.feedRings(sample)
|
||||||
|
h.setLatestMetric(sample)
|
||||||
|
if h.metricsDB != nil {
|
||||||
|
_ = h.metricsDB.Write(sample)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) setLatestMetric(sample platform.LiveMetricSample) {
|
||||||
|
h.latestMu.Lock()
|
||||||
|
defer h.latestMu.Unlock()
|
||||||
|
cp := sample
|
||||||
|
h.latest = &cp
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) latestMetric() (platform.LiveMetricSample, bool) {
|
||||||
|
h.latestMu.RLock()
|
||||||
|
defer h.latestMu.RUnlock()
|
||||||
|
if h.latest == nil {
|
||||||
|
return platform.LiveMetricSample{}, false
|
||||||
|
}
|
||||||
|
return *h.latest, true
|
||||||
|
}
|
||||||
|
|
||||||
// ListenAndServe starts the HTTP server.
|
// ListenAndServe starts the HTTP server.
|
||||||
func ListenAndServe(addr string, opts HandlerOptions) error {
|
func ListenAndServe(addr string, opts HandlerOptions) error {
|
||||||
return http.ListenAndServe(addr, NewHandler(opts))
|
return http.ListenAndServe(addr, NewHandler(opts))
|
||||||
@@ -364,6 +433,20 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
|||||||
path := strings.TrimPrefix(r.URL.Path, "/api/metrics/chart/")
|
path := strings.TrimPrefix(r.URL.Path, "/api/metrics/chart/")
|
||||||
path = strings.TrimSuffix(path, ".svg")
|
path = strings.TrimSuffix(path, ".svg")
|
||||||
|
|
||||||
|
if h.metricsDB != nil {
|
||||||
|
if datasets, names, labels, title, yMin, yMax, ok := h.chartDataFromDB(path); ok {
|
||||||
|
buf, err := renderChartSVG(title, datasets, names, labels, yMin, yMax)
|
||||||
|
if err != nil {
|
||||||
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
w.Header().Set("Content-Type", "image/svg+xml")
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
_, _ = w.Write(buf)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
var datasets [][]float64
|
var datasets [][]float64
|
||||||
var names []string
|
var names []string
|
||||||
var labels []string
|
var labels []string
|
||||||
@@ -382,21 +465,51 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
|||||||
yMin = floatPtr(0)
|
yMin = floatPtr(0)
|
||||||
yMax = floatPtr(100)
|
yMax = floatPtr(100)
|
||||||
|
|
||||||
case path == "server-temp":
|
case path == "server-temp", path == "server-temp-cpu":
|
||||||
title = "CPU Temperature"
|
title = "CPU Temperature"
|
||||||
vCPUTemp, l := h.ringCPUTemp.snapshot()
|
h.ringsMu.Lock()
|
||||||
labels = l
|
datasets, names, labels = snapshotNamedRings(h.cpuTempRings)
|
||||||
datasets = [][]float64{vCPUTemp}
|
h.ringsMu.Unlock()
|
||||||
names = []string{"CPU Temp °C"}
|
|
||||||
yMin = floatPtr(0)
|
yMin = floatPtr(0)
|
||||||
yMax = autoMax120(vCPUTemp)
|
yMax = autoMax120(datasets...)
|
||||||
|
|
||||||
|
case path == "server-temp-gpu":
|
||||||
|
title = "GPU Temperature"
|
||||||
|
h.ringsMu.Lock()
|
||||||
|
for idx, gr := range h.gpuRings {
|
||||||
|
if gr == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
vTemp, l := gr.Temp.snapshot()
|
||||||
|
datasets = append(datasets, vTemp)
|
||||||
|
names = append(names, fmt.Sprintf("GPU %d", idx))
|
||||||
|
if len(labels) == 0 {
|
||||||
|
labels = l
|
||||||
|
}
|
||||||
|
}
|
||||||
|
h.ringsMu.Unlock()
|
||||||
|
yMin = floatPtr(0)
|
||||||
|
yMax = autoMax120(datasets...)
|
||||||
|
|
||||||
|
case path == "server-temp-ambient":
|
||||||
|
title = "Ambient / Other Sensors"
|
||||||
|
h.ringsMu.Lock()
|
||||||
|
datasets, names, labels = snapshotNamedRings(h.ambientTempRings)
|
||||||
|
h.ringsMu.Unlock()
|
||||||
|
yMin = floatPtr(0)
|
||||||
|
yMax = autoMax120(datasets...)
|
||||||
|
|
||||||
case path == "server-power":
|
case path == "server-power":
|
||||||
title = "Power & Fans"
|
title = "System Power"
|
||||||
vPower, l := h.ringPower.snapshot()
|
vPower, l := h.ringPower.snapshot()
|
||||||
labels = l
|
labels = l
|
||||||
datasets = [][]float64{vPower}
|
datasets = [][]float64{vPower}
|
||||||
names = []string{"Power W"}
|
names = []string{"Power W"}
|
||||||
|
yMin = floatPtr(0)
|
||||||
|
yMax = autoMax120(vPower)
|
||||||
|
|
||||||
|
case path == "server-fans":
|
||||||
|
title = "Fan RPM"
|
||||||
h.ringsMu.Lock()
|
h.ringsMu.Lock()
|
||||||
for i, fr := range h.ringFans {
|
for i, fr := range h.ringFans {
|
||||||
fv, _ := fr.snapshot()
|
fv, _ := fr.snapshot()
|
||||||
@@ -411,7 +524,80 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
|||||||
yMin = floatPtr(0)
|
yMin = floatPtr(0)
|
||||||
yMax = autoMax120(datasets...)
|
yMax = autoMax120(datasets...)
|
||||||
|
|
||||||
// ── GPU sub-charts ────────────────────────────────────────────────────
|
// ── Combined GPU charts (all GPUs on one chart) ───────────────────────
|
||||||
|
case path == "gpu-all-load":
|
||||||
|
title = "GPU Compute Load"
|
||||||
|
h.ringsMu.Lock()
|
||||||
|
for idx, gr := range h.gpuRings {
|
||||||
|
if gr == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
vUtil, l := gr.Util.snapshot()
|
||||||
|
datasets = append(datasets, vUtil)
|
||||||
|
names = append(names, fmt.Sprintf("GPU %d", idx))
|
||||||
|
if len(labels) == 0 {
|
||||||
|
labels = l
|
||||||
|
}
|
||||||
|
}
|
||||||
|
h.ringsMu.Unlock()
|
||||||
|
yMin = floatPtr(0)
|
||||||
|
yMax = floatPtr(100)
|
||||||
|
|
||||||
|
case path == "gpu-all-memload":
|
||||||
|
title = "GPU Memory Load"
|
||||||
|
h.ringsMu.Lock()
|
||||||
|
for idx, gr := range h.gpuRings {
|
||||||
|
if gr == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
vMem, l := gr.MemUtil.snapshot()
|
||||||
|
datasets = append(datasets, vMem)
|
||||||
|
names = append(names, fmt.Sprintf("GPU %d", idx))
|
||||||
|
if len(labels) == 0 {
|
||||||
|
labels = l
|
||||||
|
}
|
||||||
|
}
|
||||||
|
h.ringsMu.Unlock()
|
||||||
|
yMin = floatPtr(0)
|
||||||
|
yMax = floatPtr(100)
|
||||||
|
|
||||||
|
case path == "gpu-all-power":
|
||||||
|
title = "GPU Power"
|
||||||
|
h.ringsMu.Lock()
|
||||||
|
for idx, gr := range h.gpuRings {
|
||||||
|
if gr == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
vPow, l := gr.Power.snapshot()
|
||||||
|
datasets = append(datasets, vPow)
|
||||||
|
names = append(names, fmt.Sprintf("GPU %d", idx))
|
||||||
|
if len(labels) == 0 {
|
||||||
|
labels = l
|
||||||
|
}
|
||||||
|
}
|
||||||
|
h.ringsMu.Unlock()
|
||||||
|
yMin = floatPtr(0)
|
||||||
|
yMax = autoMax120(datasets...)
|
||||||
|
|
||||||
|
case path == "gpu-all-temp":
|
||||||
|
title = "GPU Temperature"
|
||||||
|
h.ringsMu.Lock()
|
||||||
|
for idx, gr := range h.gpuRings {
|
||||||
|
if gr == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
vTemp, l := gr.Temp.snapshot()
|
||||||
|
datasets = append(datasets, vTemp)
|
||||||
|
names = append(names, fmt.Sprintf("GPU %d", idx))
|
||||||
|
if len(labels) == 0 {
|
||||||
|
labels = l
|
||||||
|
}
|
||||||
|
}
|
||||||
|
h.ringsMu.Unlock()
|
||||||
|
yMin = floatPtr(0)
|
||||||
|
yMax = autoMax120(datasets...)
|
||||||
|
|
||||||
|
// ── Per-GPU sub-charts ────────────────────────────────────────────────
|
||||||
case strings.HasPrefix(path, "gpu/"):
|
case strings.HasPrefix(path, "gpu/"):
|
||||||
rest := strings.TrimPrefix(path, "gpu/")
|
rest := strings.TrimPrefix(path, "gpu/")
|
||||||
// rest is either "{idx}-load", "{idx}-temp", "{idx}-power", or legacy "{idx}"
|
// rest is either "{idx}-load", "{idx}-temp", "{idx}-power", or legacy "{idx}"
|
||||||
@@ -475,6 +661,259 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
|||||||
_, _ = w.Write(buf)
|
_, _ = w.Write(buf)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (h *handler) chartDataFromDB(path string) ([][]float64, []string, []string, string, *float64, *float64, bool) {
|
||||||
|
samples, err := h.metricsDB.LoadAll()
|
||||||
|
if err != nil || len(samples) == 0 {
|
||||||
|
return nil, nil, nil, "", nil, nil, false
|
||||||
|
}
|
||||||
|
return chartDataFromSamples(path, samples)
|
||||||
|
}
|
||||||
|
|
||||||
|
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
|
||||||
|
var datasets [][]float64
|
||||||
|
var names []string
|
||||||
|
var title string
|
||||||
|
var yMin, yMax *float64
|
||||||
|
labels := sampleTimeLabels(samples)
|
||||||
|
|
||||||
|
switch {
|
||||||
|
case path == "server-load":
|
||||||
|
title = "CPU / Memory Load"
|
||||||
|
cpu := make([]float64, len(samples))
|
||||||
|
mem := make([]float64, len(samples))
|
||||||
|
for i, s := range samples {
|
||||||
|
cpu[i] = s.CPULoadPct
|
||||||
|
mem[i] = s.MemLoadPct
|
||||||
|
}
|
||||||
|
datasets = [][]float64{cpu, mem}
|
||||||
|
names = []string{"CPU Load %", "Mem Load %"}
|
||||||
|
yMin = floatPtr(0)
|
||||||
|
yMax = floatPtr(100)
|
||||||
|
|
||||||
|
case path == "server-temp", path == "server-temp-cpu":
|
||||||
|
title = "CPU Temperature"
|
||||||
|
datasets, names = namedTempDatasets(samples, "cpu")
|
||||||
|
yMin = floatPtr(0)
|
||||||
|
yMax = autoMax120(datasets...)
|
||||||
|
|
||||||
|
case path == "server-temp-gpu":
|
||||||
|
title = "GPU Temperature"
|
||||||
|
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||||
|
yMin = floatPtr(0)
|
||||||
|
yMax = autoMax120(datasets...)
|
||||||
|
|
||||||
|
case path == "server-temp-ambient":
|
||||||
|
title = "Ambient / Other Sensors"
|
||||||
|
datasets, names = namedTempDatasets(samples, "ambient")
|
||||||
|
yMin = floatPtr(0)
|
||||||
|
yMax = autoMax120(datasets...)
|
||||||
|
|
||||||
|
case path == "server-power":
|
||||||
|
title = "System Power"
|
||||||
|
power := make([]float64, len(samples))
|
||||||
|
for i, s := range samples {
|
||||||
|
power[i] = s.PowerW
|
||||||
|
}
|
||||||
|
datasets = [][]float64{power}
|
||||||
|
names = []string{"Power W"}
|
||||||
|
yMin, yMax = autoBounds120(power)
|
||||||
|
|
||||||
|
case path == "server-fans":
|
||||||
|
title = "Fan RPM"
|
||||||
|
datasets, names = namedFanDatasets(samples)
|
||||||
|
yMin, yMax = autoBounds120(datasets...)
|
||||||
|
|
||||||
|
case path == "gpu-all-load":
|
||||||
|
title = "GPU Compute Load"
|
||||||
|
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
||||||
|
yMin = floatPtr(0)
|
||||||
|
yMax = floatPtr(100)
|
||||||
|
|
||||||
|
case path == "gpu-all-memload":
|
||||||
|
title = "GPU Memory Load"
|
||||||
|
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
||||||
|
yMin = floatPtr(0)
|
||||||
|
yMax = floatPtr(100)
|
||||||
|
|
||||||
|
case path == "gpu-all-power":
|
||||||
|
title = "GPU Power"
|
||||||
|
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
||||||
|
yMin, yMax = autoBounds120(datasets...)
|
||||||
|
|
||||||
|
case path == "gpu-all-temp":
|
||||||
|
title = "GPU Temperature"
|
||||||
|
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||||
|
yMin = floatPtr(0)
|
||||||
|
yMax = autoMax120(datasets...)
|
||||||
|
|
||||||
|
case strings.HasPrefix(path, "gpu/"):
|
||||||
|
rest := strings.TrimPrefix(path, "gpu/")
|
||||||
|
sub := ""
|
||||||
|
if i := strings.LastIndex(rest, "-"); i > 0 {
|
||||||
|
sub = rest[i+1:]
|
||||||
|
rest = rest[:i]
|
||||||
|
}
|
||||||
|
idx := 0
|
||||||
|
fmt.Sscanf(rest, "%d", &idx)
|
||||||
|
switch sub {
|
||||||
|
case "load":
|
||||||
|
title = fmt.Sprintf("GPU %d Load", idx)
|
||||||
|
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
||||||
|
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
||||||
|
if util == nil && mem == nil {
|
||||||
|
return nil, nil, nil, "", nil, nil, false
|
||||||
|
}
|
||||||
|
datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
|
||||||
|
names = []string{"Load %", "Mem %"}
|
||||||
|
yMin = floatPtr(0)
|
||||||
|
yMax = floatPtr(100)
|
||||||
|
case "temp":
|
||||||
|
title = fmt.Sprintf("GPU %d Temperature", idx)
|
||||||
|
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||||
|
if temp == nil {
|
||||||
|
return nil, nil, nil, "", nil, nil, false
|
||||||
|
}
|
||||||
|
datasets = [][]float64{temp}
|
||||||
|
names = []string{"Temp °C"}
|
||||||
|
yMin = floatPtr(0)
|
||||||
|
yMax = autoMax120(temp)
|
||||||
|
default:
|
||||||
|
title = fmt.Sprintf("GPU %d Power", idx)
|
||||||
|
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
||||||
|
if power == nil {
|
||||||
|
return nil, nil, nil, "", nil, nil, false
|
||||||
|
}
|
||||||
|
datasets = [][]float64{power}
|
||||||
|
names = []string{"Power W"}
|
||||||
|
yMin, yMax = autoBounds120(power)
|
||||||
|
}
|
||||||
|
|
||||||
|
default:
|
||||||
|
return nil, nil, nil, "", nil, nil, false
|
||||||
|
}
|
||||||
|
|
||||||
|
return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleTimeLabels(samples []platform.LiveMetricSample) []string {
|
||||||
|
labels := make([]string, len(samples))
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return labels
|
||||||
|
}
|
||||||
|
times := make([]time.Time, len(samples))
|
||||||
|
for i, s := range samples {
|
||||||
|
times[i] = s.Timestamp
|
||||||
|
}
|
||||||
|
sameDay := timestampsSameLocalDay(times)
|
||||||
|
for i, s := range samples {
|
||||||
|
labels[i] = formatTimelineLabel(s.Timestamp.Local(), sameDay)
|
||||||
|
}
|
||||||
|
return labels
|
||||||
|
}
|
||||||
|
|
||||||
|
func namedTempDatasets(samples []platform.LiveMetricSample, group string) ([][]float64, []string) {
|
||||||
|
seen := map[string]bool{}
|
||||||
|
var names []string
|
||||||
|
for _, s := range samples {
|
||||||
|
for _, t := range s.Temps {
|
||||||
|
if t.Group == group && !seen[t.Name] {
|
||||||
|
seen[t.Name] = true
|
||||||
|
names = append(names, t.Name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
datasets := make([][]float64, 0, len(names))
|
||||||
|
for _, name := range names {
|
||||||
|
ds := make([]float64, len(samples))
|
||||||
|
for i, s := range samples {
|
||||||
|
for _, t := range s.Temps {
|
||||||
|
if t.Group == group && t.Name == name {
|
||||||
|
ds[i] = t.Celsius
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
datasets = append(datasets, ds)
|
||||||
|
}
|
||||||
|
return datasets, names
|
||||||
|
}
|
||||||
|
|
||||||
|
func namedFanDatasets(samples []platform.LiveMetricSample) ([][]float64, []string) {
|
||||||
|
seen := map[string]bool{}
|
||||||
|
var names []string
|
||||||
|
for _, s := range samples {
|
||||||
|
for _, f := range s.Fans {
|
||||||
|
if !seen[f.Name] {
|
||||||
|
seen[f.Name] = true
|
||||||
|
names = append(names, f.Name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
datasets := make([][]float64, 0, len(names))
|
||||||
|
for _, name := range names {
|
||||||
|
ds := make([]float64, len(samples))
|
||||||
|
for i, s := range samples {
|
||||||
|
for _, f := range s.Fans {
|
||||||
|
if f.Name == name {
|
||||||
|
ds[i] = f.RPM
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
datasets = append(datasets, ds)
|
||||||
|
}
|
||||||
|
return datasets, names
|
||||||
|
}
|
||||||
|
|
||||||
|
func gpuDatasets(samples []platform.LiveMetricSample, pick func(platform.GPUMetricRow) float64) ([][]float64, []string) {
|
||||||
|
seen := map[int]bool{}
|
||||||
|
var indices []int
|
||||||
|
for _, s := range samples {
|
||||||
|
for _, g := range s.GPUs {
|
||||||
|
if !seen[g.GPUIndex] {
|
||||||
|
seen[g.GPUIndex] = true
|
||||||
|
indices = append(indices, g.GPUIndex)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
datasets := make([][]float64, 0, len(indices))
|
||||||
|
names := make([]string, 0, len(indices))
|
||||||
|
for _, idx := range indices {
|
||||||
|
ds := gpuDatasetByIndex(samples, idx, pick)
|
||||||
|
if ds == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
datasets = append(datasets, ds)
|
||||||
|
names = append(names, fmt.Sprintf("GPU %d", idx))
|
||||||
|
}
|
||||||
|
return datasets, names
|
||||||
|
}
|
||||||
|
|
||||||
|
func gpuDatasetByIndex(samples []platform.LiveMetricSample, idx int, pick func(platform.GPUMetricRow) float64) []float64 {
|
||||||
|
found := false
|
||||||
|
ds := make([]float64, len(samples))
|
||||||
|
for i, s := range samples {
|
||||||
|
for _, g := range s.GPUs {
|
||||||
|
if g.GPUIndex == idx {
|
||||||
|
ds[i] = pick(g)
|
||||||
|
found = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !found {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return ds
|
||||||
|
}
|
||||||
|
|
||||||
|
func coalesceDataset(ds []float64, n int) []float64 {
|
||||||
|
if ds != nil {
|
||||||
|
return ds
|
||||||
|
}
|
||||||
|
return make([]float64, n)
|
||||||
|
}
|
||||||
|
|
||||||
// floatPtr returns a pointer to a float64 value.
|
// floatPtr returns a pointer to a float64 value.
|
||||||
func floatPtr(v float64) *float64 { return &v }
|
func floatPtr(v float64) *float64 { return &v }
|
||||||
|
|
||||||
@@ -495,6 +934,47 @@ func autoMax120(datasets ...[]float64) *float64 {
|
|||||||
return &v
|
return &v
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func autoBounds120(datasets ...[]float64) (*float64, *float64) {
|
||||||
|
min := 0.0
|
||||||
|
max := 0.0
|
||||||
|
first := true
|
||||||
|
for _, ds := range datasets {
|
||||||
|
for _, v := range ds {
|
||||||
|
if first {
|
||||||
|
min, max = v, v
|
||||||
|
first = false
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if v < min {
|
||||||
|
min = v
|
||||||
|
}
|
||||||
|
if v > max {
|
||||||
|
max = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if first {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
if max <= 0 {
|
||||||
|
return floatPtr(0), nil
|
||||||
|
}
|
||||||
|
span := max - min
|
||||||
|
if span <= 0 {
|
||||||
|
span = max * 0.1
|
||||||
|
if span <= 0 {
|
||||||
|
span = 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pad := span * 0.2
|
||||||
|
low := min - pad
|
||||||
|
if low < 0 {
|
||||||
|
low = 0
|
||||||
|
}
|
||||||
|
high := max + pad
|
||||||
|
return floatPtr(low), floatPtr(high)
|
||||||
|
}
|
||||||
|
|
||||||
// renderChartSVG renders a line chart SVG with a fixed Y-axis range.
|
// renderChartSVG renders a line chart SVG with a fixed Y-axis range.
|
||||||
func renderChartSVG(title string, datasets [][]float64, names []string, labels []string, yMin, yMax *float64) ([]byte, error) {
|
func renderChartSVG(title string, datasets [][]float64, names []string, labels []string, yMin, yMax *float64) ([]byte, error) {
|
||||||
n := len(labels)
|
n := len(labels)
|
||||||
@@ -507,14 +987,39 @@ func renderChartSVG(title string, datasets [][]float64, names []string, labels [
|
|||||||
datasets[i] = make([]float64, n)
|
datasets[i] = make([]float64, n)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
sparse := sparseLabels(labels, 6)
|
// Append global min/avg/max to title.
|
||||||
|
mn, avg, mx := globalStats(datasets)
|
||||||
|
if mx > 0 {
|
||||||
|
title = fmt.Sprintf("%s ↓%s ~%s ↑%s",
|
||||||
|
title,
|
||||||
|
chartLegendNumber(mn),
|
||||||
|
chartLegendNumber(avg),
|
||||||
|
chartLegendNumber(mx),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
title = sanitizeChartText(title)
|
||||||
|
names = sanitizeChartTexts(names)
|
||||||
|
sparse := sanitizeChartTexts(sparseLabels(labels, 6))
|
||||||
|
|
||||||
opt := gocharts.NewLineChartOptionWithData(datasets)
|
opt := gocharts.NewLineChartOptionWithData(datasets)
|
||||||
opt.Title = gocharts.TitleOption{Text: title}
|
opt.Title = gocharts.TitleOption{Text: title}
|
||||||
opt.XAxis.Labels = sparse
|
opt.XAxis.Labels = sparse
|
||||||
opt.Legend = gocharts.LegendOption{SeriesNames: names}
|
opt.Legend = gocharts.LegendOption{SeriesNames: names}
|
||||||
|
opt.Symbol = gocharts.SymbolNone
|
||||||
|
// Right padding: reserve space for the MarkLine label (library recommendation).
|
||||||
|
opt.Padding = gocharts.NewBox(20, 20, 80, 20)
|
||||||
if yMin != nil || yMax != nil {
|
if yMin != nil || yMax != nil {
|
||||||
opt.YAxis = []gocharts.YAxisOption{{Min: yMin, Max: yMax}}
|
opt.YAxis = []gocharts.YAxisOption{{
|
||||||
|
Min: yMin,
|
||||||
|
Max: yMax,
|
||||||
|
ValueFormatter: chartLegendNumber,
|
||||||
|
}}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add a single peak mark line on the series that holds the global maximum.
|
||||||
|
peakIdx, _ := globalPeakSeries(datasets)
|
||||||
|
if peakIdx >= 0 && peakIdx < len(opt.SeriesList) {
|
||||||
|
opt.SeriesList[peakIdx].MarkLine = gocharts.NewMarkLine(gocharts.SeriesMarkTypeMax)
|
||||||
}
|
}
|
||||||
|
|
||||||
p := gocharts.NewPainter(gocharts.PainterOptions{
|
p := gocharts.NewPainter(gocharts.PainterOptions{
|
||||||
@@ -528,6 +1033,68 @@ func renderChartSVG(title string, datasets [][]float64, names []string, labels [
|
|||||||
return p.Bytes()
|
return p.Bytes()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// globalPeakSeries returns the index of the series containing the global maximum
|
||||||
|
// value across all datasets, and that maximum value.
|
||||||
|
func globalPeakSeries(datasets [][]float64) (idx int, peak float64) {
|
||||||
|
idx = -1
|
||||||
|
for i, ds := range datasets {
|
||||||
|
for _, v := range ds {
|
||||||
|
if v > peak {
|
||||||
|
peak = v
|
||||||
|
idx = i
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return idx, peak
|
||||||
|
}
|
||||||
|
|
||||||
|
// globalStats returns min, average, and max across all values in all datasets.
|
||||||
|
func globalStats(datasets [][]float64) (mn, avg, mx float64) {
|
||||||
|
var sum float64
|
||||||
|
var count int
|
||||||
|
first := true
|
||||||
|
for _, ds := range datasets {
|
||||||
|
for _, v := range ds {
|
||||||
|
if first {
|
||||||
|
mn, mx = v, v
|
||||||
|
first = false
|
||||||
|
}
|
||||||
|
if v < mn {
|
||||||
|
mn = v
|
||||||
|
}
|
||||||
|
if v > mx {
|
||||||
|
mx = v
|
||||||
|
}
|
||||||
|
sum += v
|
||||||
|
count++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if count > 0 {
|
||||||
|
avg = sum / float64(count)
|
||||||
|
}
|
||||||
|
return mn, avg, mx
|
||||||
|
}
|
||||||
|
|
||||||
|
func sanitizeChartText(s string) string {
|
||||||
|
if s == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return html.EscapeString(strings.Map(func(r rune) rune {
|
||||||
|
if r < 0x20 && r != '\t' && r != '\n' && r != '\r' {
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
return r
|
||||||
|
}, s))
|
||||||
|
}
|
||||||
|
|
||||||
|
func sanitizeChartTexts(in []string) []string {
|
||||||
|
out := make([]string, len(in))
|
||||||
|
for i, s := range in {
|
||||||
|
out[i] = sanitizeChartText(s)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
func safeIdx(s []float64, i int) float64 {
|
func safeIdx(s []float64, i int) float64 {
|
||||||
if i < len(s) {
|
if i < len(s) {
|
||||||
return s[i]
|
return s[i]
|
||||||
@@ -535,6 +1102,46 @@ func safeIdx(s []float64, i int) float64 {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func snapshotNamedRings(rings []*namedMetricsRing) ([][]float64, []string, []string) {
|
||||||
|
var datasets [][]float64
|
||||||
|
var names []string
|
||||||
|
var labels []string
|
||||||
|
for _, item := range rings {
|
||||||
|
if item == nil || item.Ring == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
vals, l := item.Ring.snapshot()
|
||||||
|
datasets = append(datasets, vals)
|
||||||
|
names = append(names, item.Name)
|
||||||
|
if len(labels) == 0 {
|
||||||
|
labels = l
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return datasets, names, labels
|
||||||
|
}
|
||||||
|
|
||||||
|
func chartLegendNumber(v float64) string {
|
||||||
|
neg := v < 0
|
||||||
|
if v < 0 {
|
||||||
|
v = -v
|
||||||
|
}
|
||||||
|
var out string
|
||||||
|
switch {
|
||||||
|
case v >= 10000:
|
||||||
|
out = fmt.Sprintf("%dk", int((v+500)/1000))
|
||||||
|
case v >= 1000:
|
||||||
|
s := fmt.Sprintf("%.2f", v/1000)
|
||||||
|
s = strings.TrimRight(strings.TrimRight(s, "0"), ".")
|
||||||
|
out = strings.ReplaceAll(s, ".", ",") + "k"
|
||||||
|
default:
|
||||||
|
out = fmt.Sprintf("%.0f", v)
|
||||||
|
}
|
||||||
|
if neg {
|
||||||
|
return "-" + out
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
func sparseLabels(labels []string, n int) []string {
|
func sparseLabels(labels []string, n int) []string {
|
||||||
out := make([]string, len(labels))
|
out := make([]string, len(labels))
|
||||||
step := len(labels) / n
|
step := len(labels) / n
|
||||||
@@ -549,11 +1156,79 @@ func sparseLabels(labels []string, n int) []string {
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIMetricsExportCSV(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if h.metricsDB == nil {
|
||||||
|
http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
w.Header().Set("Content-Type", "text/csv; charset=utf-8")
|
||||||
|
w.Header().Set("Content-Disposition", `attachment; filename="bee-metrics.csv"`)
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
_ = h.metricsDB.ExportCSV(w)
|
||||||
|
}
|
||||||
|
|
||||||
// ── Page handler ─────────────────────────────────────────────────────────────
|
// ── Page handler ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func (h *handler) handleReady(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
if _, err := os.Stat(h.opts.AuditPath); err != nil {
|
||||||
|
w.WriteHeader(http.StatusServiceUnavailable)
|
||||||
|
_, _ = w.Write([]byte("starting"))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
_, _ = w.Write([]byte("ready"))
|
||||||
|
}
|
||||||
|
|
||||||
|
const loadingPageHTML = `<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<title>EASY-BEE</title>
|
||||||
|
<style>
|
||||||
|
*{margin:0;padding:0;box-sizing:border-box}
|
||||||
|
html,body{height:100%;background:#0f1117;display:flex;align-items:center;justify-content:center;font-family:'Courier New',monospace;color:#e2e8f0}
|
||||||
|
.logo{font-size:13px;line-height:1.4;color:#f6c90e;margin-bottom:48px;white-space:pre}
|
||||||
|
.spinner{width:48px;height:48px;border:4px solid #2d3748;border-top-color:#f6c90e;border-radius:50%;animation:spin .8s linear infinite;margin:0 auto 24px}
|
||||||
|
@keyframes spin{to{transform:rotate(360deg)}}
|
||||||
|
.status{font-size:14px;color:#a0aec0;letter-spacing:.05em}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div style="text-align:center">
|
||||||
|
<div class="logo"> ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗
|
||||||
|
██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝
|
||||||
|
█████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗
|
||||||
|
██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝
|
||||||
|
███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗
|
||||||
|
╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝</div>
|
||||||
|
<div class="spinner"></div>
|
||||||
|
<div class="status" id="s">Starting up...</div>
|
||||||
|
</div>
|
||||||
|
<script>
|
||||||
|
function probe(){
|
||||||
|
fetch('/api/ready',{cache:'no-store'})
|
||||||
|
.then(function(r){
|
||||||
|
if(r.ok){window.location.replace('/');}
|
||||||
|
else{setTimeout(probe,1000);}
|
||||||
|
})
|
||||||
|
.catch(function(){setTimeout(probe,1000);});
|
||||||
|
}
|
||||||
|
probe();
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>`
|
||||||
|
|
||||||
func (h *handler) handlePage(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handlePage(w http.ResponseWriter, r *http.Request) {
|
||||||
page := strings.TrimPrefix(r.URL.Path, "/")
|
page := strings.TrimPrefix(r.URL.Path, "/")
|
||||||
if page == "" {
|
if page == "" {
|
||||||
|
// Serve loading page until audit snapshot exists
|
||||||
|
if _, err := os.Stat(h.opts.AuditPath); err != nil {
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||||
|
_, _ = w.Write([]byte(loadingPageHTML))
|
||||||
|
return
|
||||||
|
}
|
||||||
page = "dashboard"
|
page = "dashboard"
|
||||||
}
|
}
|
||||||
// Redirect old routes to new names
|
// Redirect old routes to new names
|
||||||
|
|||||||
@@ -7,8 +7,88 @@ import (
|
|||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
func TestChartLegendNumber(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
in float64
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{in: 0.4, want: "0"},
|
||||||
|
{in: 61.5, want: "62"},
|
||||||
|
{in: 999.4, want: "999"},
|
||||||
|
{in: 1200, want: "1,2k"},
|
||||||
|
{in: 1250, want: "1,25k"},
|
||||||
|
{in: 1310, want: "1,31k"},
|
||||||
|
{in: 1500, want: "1,5k"},
|
||||||
|
{in: 2600, want: "2,6k"},
|
||||||
|
{in: 10200, want: "10k"},
|
||||||
|
}
|
||||||
|
for _, tc := range tests {
|
||||||
|
if got := chartLegendNumber(tc.in); got != tc.want {
|
||||||
|
t.Fatalf("chartLegendNumber(%v)=%q want %q", tc.in, got, tc.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
||||||
|
samples := []platform.LiveMetricSample{
|
||||||
|
{
|
||||||
|
Timestamp: time.Now().Add(-3 * time.Minute),
|
||||||
|
CPULoadPct: 10,
|
||||||
|
MemLoadPct: 20,
|
||||||
|
PowerW: 300,
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, UsagePct: 90, MemUsagePct: 5, PowerW: 120, TempC: 50},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Timestamp: time.Now().Add(-2 * time.Minute),
|
||||||
|
CPULoadPct: 30,
|
||||||
|
MemLoadPct: 40,
|
||||||
|
PowerW: 320,
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, UsagePct: 95, MemUsagePct: 7, PowerW: 125, TempC: 51},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Timestamp: time.Now().Add(-1 * time.Minute),
|
||||||
|
CPULoadPct: 50,
|
||||||
|
MemLoadPct: 60,
|
||||||
|
PowerW: 340,
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, UsagePct: 97, MemUsagePct: 9, PowerW: 130, TempC: 52},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
datasets, names, labels, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("chartDataFromSamples returned ok=false")
|
||||||
|
}
|
||||||
|
if title != "GPU Power" {
|
||||||
|
t.Fatalf("title=%q", title)
|
||||||
|
}
|
||||||
|
if len(names) != 1 || names[0] != "GPU 0" {
|
||||||
|
t.Fatalf("names=%v", names)
|
||||||
|
}
|
||||||
|
if len(labels) != len(samples) {
|
||||||
|
t.Fatalf("labels len=%d want %d", len(labels), len(samples))
|
||||||
|
}
|
||||||
|
if len(datasets) != 1 || len(datasets[0]) != len(samples) {
|
||||||
|
t.Fatalf("datasets shape=%v", datasets)
|
||||||
|
}
|
||||||
|
if got := datasets[0][0]; got != 120 {
|
||||||
|
t.Fatalf("datasets[0][0]=%v want 120", got)
|
||||||
|
}
|
||||||
|
if got := datasets[0][2]; got != 130 {
|
||||||
|
t.Fatalf("datasets[0][2]=%v want 130", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestRootRendersDashboard(t *testing.T) {
|
func TestRootRendersDashboard(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
path := filepath.Join(dir, "audit.json")
|
path := filepath.Join(dir, "audit.json")
|
||||||
@@ -31,9 +111,9 @@ func TestRootRendersDashboard(t *testing.T) {
|
|||||||
if first.Code != http.StatusOK {
|
if first.Code != http.StatusOK {
|
||||||
t.Fatalf("first status=%d", first.Code)
|
t.Fatalf("first status=%d", first.Code)
|
||||||
}
|
}
|
||||||
// Dashboard should contain the audit modal (with viewer link) and hardware summary
|
// Dashboard should contain the audit nav link and hardware summary
|
||||||
if !strings.Contains(first.Body.String(), `openAuditModal`) {
|
if !strings.Contains(first.Body.String(), `href="/audit"`) {
|
||||||
t.Fatalf("first body missing audit modal trigger: %s", first.Body.String())
|
t.Fatalf("first body missing audit nav link: %s", first.Body.String())
|
||||||
}
|
}
|
||||||
if !strings.Contains(first.Body.String(), `/viewer`) {
|
if !strings.Contains(first.Body.String(), `/viewer`) {
|
||||||
t.Fatalf("first body missing viewer link: %s", first.Body.String())
|
t.Fatalf("first body missing viewer link: %s", first.Body.String())
|
||||||
@@ -56,6 +136,28 @@ func TestRootRendersDashboard(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "audit.json")
|
||||||
|
if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z"}`), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{AuditPath: path})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/audit", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, `iframe class="viewer-frame" src="/viewer"`) {
|
||||||
|
t.Fatalf("audit page missing viewer frame: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `openAuditModal()`) {
|
||||||
|
t.Fatalf("audit page missing action modal trigger: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestViewerRendersLatestSnapshot(t *testing.T) {
|
func TestViewerRendersLatestSnapshot(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
path := filepath.Join(dir, "audit.json")
|
path := filepath.Join(dir, "audit.json")
|
||||||
|
|||||||
@@ -5,9 +5,14 @@ import (
|
|||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"net/http"
|
"net/http"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
"sort"
|
"sort"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Task statuses.
|
// Task statuses.
|
||||||
@@ -21,14 +26,17 @@ const (
|
|||||||
|
|
||||||
// taskNames maps target → human-readable name.
|
// taskNames maps target → human-readable name.
|
||||||
var taskNames = map[string]string{
|
var taskNames = map[string]string{
|
||||||
"nvidia": "NVIDIA SAT",
|
"nvidia": "NVIDIA SAT",
|
||||||
"memory": "Memory SAT",
|
"memory": "Memory SAT",
|
||||||
"storage": "Storage SAT",
|
"storage": "Storage SAT",
|
||||||
"cpu": "CPU SAT",
|
"cpu": "CPU SAT",
|
||||||
"amd": "AMD GPU SAT",
|
"amd": "AMD GPU SAT",
|
||||||
|
"amd-mem": "AMD GPU MEM Integrity",
|
||||||
|
"amd-bandwidth": "AMD GPU MEM Bandwidth",
|
||||||
"amd-stress": "AMD GPU Burn-in",
|
"amd-stress": "AMD GPU Burn-in",
|
||||||
"memory-stress": "Memory Burn-in",
|
"memory-stress": "Memory Burn-in",
|
||||||
"sat-stress": "SAT Stress (stressapptest)",
|
"sat-stress": "SAT Stress (stressapptest)",
|
||||||
|
"platform-stress": "Platform Thermal Cycling",
|
||||||
"audit": "Audit",
|
"audit": "Audit",
|
||||||
"install": "Install to Disk",
|
"install": "Install to Disk",
|
||||||
"install-to-ram": "Install to RAM",
|
"install-to-ram": "Install to RAM",
|
||||||
@@ -45,6 +53,7 @@ type Task struct {
|
|||||||
StartedAt *time.Time `json:"started_at,omitempty"`
|
StartedAt *time.Time `json:"started_at,omitempty"`
|
||||||
DoneAt *time.Time `json:"done_at,omitempty"`
|
DoneAt *time.Time `json:"done_at,omitempty"`
|
||||||
ErrMsg string `json:"error,omitempty"`
|
ErrMsg string `json:"error,omitempty"`
|
||||||
|
LogPath string `json:"log_path,omitempty"`
|
||||||
|
|
||||||
// runtime fields (not serialised)
|
// runtime fields (not serialised)
|
||||||
job *jobState
|
job *jobState
|
||||||
@@ -53,29 +62,124 @@ type Task struct {
|
|||||||
|
|
||||||
// taskParams holds optional parameters parsed from the run request.
|
// taskParams holds optional parameters parsed from the run request.
|
||||||
type taskParams struct {
|
type taskParams struct {
|
||||||
Duration int
|
Duration int `json:"duration,omitempty"`
|
||||||
DiagLevel int
|
DiagLevel int `json:"diag_level,omitempty"`
|
||||||
GPUIndices []int
|
GPUIndices []int `json:"gpu_indices,omitempty"`
|
||||||
Device string // for install
|
BurnProfile string `json:"burn_profile,omitempty"`
|
||||||
|
DisplayName string `json:"display_name,omitempty"`
|
||||||
|
Device string `json:"device,omitempty"` // for install
|
||||||
|
}
|
||||||
|
|
||||||
|
type persistedTask struct {
|
||||||
|
ID string `json:"id"`
|
||||||
|
Name string `json:"name"`
|
||||||
|
Target string `json:"target"`
|
||||||
|
Priority int `json:"priority"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
CreatedAt time.Time `json:"created_at"`
|
||||||
|
StartedAt *time.Time `json:"started_at,omitempty"`
|
||||||
|
DoneAt *time.Time `json:"done_at,omitempty"`
|
||||||
|
ErrMsg string `json:"error,omitempty"`
|
||||||
|
LogPath string `json:"log_path,omitempty"`
|
||||||
|
Params taskParams `json:"params,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type burnPreset struct {
|
||||||
|
NvidiaDiag int
|
||||||
|
DurationSec int
|
||||||
|
}
|
||||||
|
|
||||||
|
func resolveBurnPreset(profile string) burnPreset {
|
||||||
|
switch profile {
|
||||||
|
case "overnight":
|
||||||
|
return burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60}
|
||||||
|
case "acceptance":
|
||||||
|
return burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60}
|
||||||
|
default:
|
||||||
|
return burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
|
||||||
|
switch profile {
|
||||||
|
case "overnight":
|
||||||
|
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
|
||||||
|
{LoadSec: 600, IdleSec: 120},
|
||||||
|
{LoadSec: 600, IdleSec: 60},
|
||||||
|
{LoadSec: 600, IdleSec: 30},
|
||||||
|
{LoadSec: 600, IdleSec: 120},
|
||||||
|
{LoadSec: 600, IdleSec: 60},
|
||||||
|
{LoadSec: 600, IdleSec: 30},
|
||||||
|
{LoadSec: 600, IdleSec: 120},
|
||||||
|
{LoadSec: 600, IdleSec: 60},
|
||||||
|
}}
|
||||||
|
case "acceptance":
|
||||||
|
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
|
||||||
|
{LoadSec: 300, IdleSec: 60},
|
||||||
|
{LoadSec: 300, IdleSec: 30},
|
||||||
|
{LoadSec: 300, IdleSec: 60},
|
||||||
|
{LoadSec: 300, IdleSec: 30},
|
||||||
|
}}
|
||||||
|
default: // smoke
|
||||||
|
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
|
||||||
|
{LoadSec: 90, IdleSec: 60},
|
||||||
|
{LoadSec: 90, IdleSec: 30},
|
||||||
|
}}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// taskQueue manages a priority-ordered list of tasks and runs them one at a time.
|
// taskQueue manages a priority-ordered list of tasks and runs them one at a time.
|
||||||
type taskQueue struct {
|
type taskQueue struct {
|
||||||
mu sync.Mutex
|
mu sync.Mutex
|
||||||
tasks []*Task
|
tasks []*Task
|
||||||
trigger chan struct{}
|
trigger chan struct{}
|
||||||
opts *HandlerOptions // set by startWorker
|
opts *HandlerOptions // set by startWorker
|
||||||
|
statePath string
|
||||||
|
logsDir string
|
||||||
|
started bool
|
||||||
}
|
}
|
||||||
|
|
||||||
var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}
|
var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}
|
||||||
|
|
||||||
const maxTaskHistory = 50
|
const maxTaskHistory = 50
|
||||||
|
|
||||||
|
var (
|
||||||
|
runMemoryAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunMemoryAcceptancePackCtx(ctx, baseDir, logFunc)
|
||||||
|
}
|
||||||
|
runStorageAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunStorageAcceptancePackCtx(ctx, baseDir, logFunc)
|
||||||
|
}
|
||||||
|
runCPUAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunCPUAcceptancePackCtx(ctx, baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
runAMDAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunAMDAcceptancePackCtx(ctx, baseDir, logFunc)
|
||||||
|
}
|
||||||
|
runAMDMemIntegrityPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunAMDMemIntegrityPackCtx(ctx, baseDir, logFunc)
|
||||||
|
}
|
||||||
|
runAMDMemBandwidthPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunAMDMemBandwidthPackCtx(ctx, baseDir, logFunc)
|
||||||
|
}
|
||||||
|
runAMDStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunAMDStressPackCtx(ctx, baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
runMemoryStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunMemoryStressPackCtx(ctx, baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
runSATStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunSATStressPackCtx(ctx, baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
// enqueue adds a task to the queue and notifies the worker.
|
// enqueue adds a task to the queue and notifies the worker.
|
||||||
func (q *taskQueue) enqueue(t *Task) {
|
func (q *taskQueue) enqueue(t *Task) {
|
||||||
q.mu.Lock()
|
q.mu.Lock()
|
||||||
|
q.assignTaskLogPathLocked(t)
|
||||||
q.tasks = append(q.tasks, t)
|
q.tasks = append(q.tasks, t)
|
||||||
q.prune()
|
q.prune()
|
||||||
|
q.persistLocked()
|
||||||
q.mu.Unlock()
|
q.mu.Unlock()
|
||||||
select {
|
select {
|
||||||
case q.trigger <- struct{}{}:
|
case q.trigger <- struct{}{}:
|
||||||
@@ -137,6 +241,20 @@ func (q *taskQueue) findJob(id string) (*jobState, bool) {
|
|||||||
return t.job, true
|
return t.job, true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (q *taskQueue) hasActiveTarget(target string) bool {
|
||||||
|
q.mu.Lock()
|
||||||
|
defer q.mu.Unlock()
|
||||||
|
for _, t := range q.tasks {
|
||||||
|
if t.Target != target {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if t.Status == TaskPending || t.Status == TaskRunning {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
// snapshot returns a copy of all tasks sorted for display (running first, then pending by priority, then done by doneAt desc).
|
// snapshot returns a copy of all tasks sorted for display (running first, then pending by priority, then done by doneAt desc).
|
||||||
func (q *taskQueue) snapshot() []Task {
|
func (q *taskQueue) snapshot() []Task {
|
||||||
q.mu.Lock()
|
q.mu.Lock()
|
||||||
@@ -172,13 +290,30 @@ func statusOrder(s string) int {
|
|||||||
|
|
||||||
// startWorker launches the queue runner goroutine.
|
// startWorker launches the queue runner goroutine.
|
||||||
func (q *taskQueue) startWorker(opts *HandlerOptions) {
|
func (q *taskQueue) startWorker(opts *HandlerOptions) {
|
||||||
|
q.mu.Lock()
|
||||||
q.opts = opts
|
q.opts = opts
|
||||||
go q.worker()
|
q.statePath = filepath.Join(opts.ExportDir, "tasks-state.json")
|
||||||
|
q.logsDir = filepath.Join(opts.ExportDir, "tasks")
|
||||||
|
_ = os.MkdirAll(q.logsDir, 0755)
|
||||||
|
if !q.started {
|
||||||
|
q.loadLocked()
|
||||||
|
q.started = true
|
||||||
|
go q.worker()
|
||||||
|
}
|
||||||
|
hasPending := q.nextPending() != nil
|
||||||
|
q.mu.Unlock()
|
||||||
|
if hasPending {
|
||||||
|
select {
|
||||||
|
case q.trigger <- struct{}{}:
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (q *taskQueue) worker() {
|
func (q *taskQueue) worker() {
|
||||||
for {
|
for {
|
||||||
<-q.trigger
|
<-q.trigger
|
||||||
|
setCPUGovernor("performance")
|
||||||
for {
|
for {
|
||||||
q.mu.Lock()
|
q.mu.Lock()
|
||||||
t := q.nextPending()
|
t := q.nextPending()
|
||||||
@@ -189,10 +324,13 @@ func (q *taskQueue) worker() {
|
|||||||
now := time.Now()
|
now := time.Now()
|
||||||
t.Status = TaskRunning
|
t.Status = TaskRunning
|
||||||
t.StartedAt = &now
|
t.StartedAt = &now
|
||||||
j := &jobState{}
|
t.DoneAt = nil
|
||||||
|
t.ErrMsg = ""
|
||||||
|
j := newTaskJobState(t.LogPath)
|
||||||
ctx, cancel := context.WithCancel(context.Background())
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
j.cancel = cancel
|
j.cancel = cancel
|
||||||
t.job = j
|
t.job = j
|
||||||
|
q.persistLocked()
|
||||||
q.mu.Unlock()
|
q.mu.Unlock()
|
||||||
|
|
||||||
q.runTask(t, j, ctx)
|
q.runTask(t, j, ctx)
|
||||||
@@ -209,8 +347,22 @@ func (q *taskQueue) worker() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
q.prune()
|
q.prune()
|
||||||
|
q.persistLocked()
|
||||||
q.mu.Unlock()
|
q.mu.Unlock()
|
||||||
}
|
}
|
||||||
|
setCPUGovernor("powersave")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// setCPUGovernor writes the given governor to all CPU scaling_governor sysfs files.
|
||||||
|
// Silently ignores errors (e.g. when cpufreq is not available).
|
||||||
|
func setCPUGovernor(governor string) {
|
||||||
|
matches, err := filepath.Glob("/sys/devices/system/cpu/cpu*/cpufreq/scaling_governor")
|
||||||
|
if err != nil || len(matches) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, path := range matches {
|
||||||
|
_ = os.WriteFile(path, []byte(governor), 0644)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -224,6 +376,9 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
a := q.opts.App
|
a := q.opts.App
|
||||||
|
|
||||||
j.append(fmt.Sprintf("Starting %s...", t.Name))
|
j.append(fmt.Sprintf("Starting %s...", t.Name))
|
||||||
|
if len(j.lines) > 0 {
|
||||||
|
j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
|
||||||
|
}
|
||||||
|
|
||||||
var (
|
var (
|
||||||
archive string
|
archive string
|
||||||
@@ -232,9 +387,13 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
|
|
||||||
switch t.Target {
|
switch t.Target {
|
||||||
case "nvidia":
|
case "nvidia":
|
||||||
if len(t.params.GPUIndices) > 0 || t.params.DiagLevel > 0 {
|
diagLevel := t.params.DiagLevel
|
||||||
|
if t.params.BurnProfile != "" && diagLevel <= 0 {
|
||||||
|
diagLevel = resolveBurnPreset(t.params.BurnProfile).NvidiaDiag
|
||||||
|
}
|
||||||
|
if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
|
||||||
result, e := a.RunNvidiaAcceptancePackWithOptions(
|
result, e := a.RunNvidiaAcceptancePackWithOptions(
|
||||||
ctx, "", t.params.DiagLevel, t.params.GPUIndices, j.append,
|
ctx, "", diagLevel, t.params.GPUIndices, j.append,
|
||||||
)
|
)
|
||||||
if e != nil {
|
if e != nil {
|
||||||
err = e
|
err = e
|
||||||
@@ -245,23 +404,45 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
archive, err = a.RunNvidiaAcceptancePack("", j.append)
|
archive, err = a.RunNvidiaAcceptancePack("", j.append)
|
||||||
}
|
}
|
||||||
case "memory":
|
case "memory":
|
||||||
archive, err = a.RunMemoryAcceptancePack("", j.append)
|
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", j.append)
|
||||||
case "storage":
|
case "storage":
|
||||||
archive, err = a.RunStorageAcceptancePack("", j.append)
|
archive, err = runStorageAcceptancePackCtx(a, ctx, "", j.append)
|
||||||
case "cpu":
|
case "cpu":
|
||||||
dur := t.params.Duration
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
if dur <= 0 {
|
if dur <= 0 {
|
||||||
dur = 60
|
dur = 60
|
||||||
}
|
}
|
||||||
archive, err = a.RunCPUAcceptancePack("", dur, j.append)
|
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
|
||||||
case "amd":
|
case "amd":
|
||||||
archive, err = a.RunAMDAcceptancePack("", j.append)
|
archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append)
|
||||||
|
case "amd-mem":
|
||||||
|
archive, err = runAMDMemIntegrityPackCtx(a, ctx, "", j.append)
|
||||||
|
case "amd-bandwidth":
|
||||||
|
archive, err = runAMDMemBandwidthPackCtx(a, ctx, "", j.append)
|
||||||
case "amd-stress":
|
case "amd-stress":
|
||||||
archive, err = a.RunAMDStressPack("", j.append)
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
archive, err = runAMDStressPackCtx(a, ctx, "", dur, j.append)
|
||||||
case "memory-stress":
|
case "memory-stress":
|
||||||
archive, err = a.RunMemoryStressPack("", j.append)
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
archive, err = runMemoryStressPackCtx(a, ctx, "", dur, j.append)
|
||||||
case "sat-stress":
|
case "sat-stress":
|
||||||
archive, err = a.RunSATStressPack("", j.append)
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
|
||||||
|
case "platform-stress":
|
||||||
|
opts := resolvePlatformStressPreset(t.params.BurnProfile)
|
||||||
|
archive, err = a.RunPlatformStress(ctx, "", opts, j.append)
|
||||||
case "audit":
|
case "audit":
|
||||||
result, e := a.RunAuditNow(q.opts.RuntimeMode)
|
result, e := a.RunAuditNow(q.opts.RuntimeMode)
|
||||||
if e != nil {
|
if e != nil {
|
||||||
@@ -272,7 +453,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
case "install-to-ram":
|
case "install-to-ram":
|
||||||
err = a.RunInstallToRAM(j.append)
|
err = a.RunInstallToRAM(ctx, j.append)
|
||||||
default:
|
default:
|
||||||
j.append("ERROR: unknown target: " + t.Target)
|
j.append("ERROR: unknown target: " + t.Target)
|
||||||
j.finish("unknown target")
|
j.finish("unknown target")
|
||||||
@@ -339,6 +520,7 @@ func (h *handler) handleAPITasksCancel(w http.ResponseWriter, r *http.Request) {
|
|||||||
t.Status = TaskCancelled
|
t.Status = TaskCancelled
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
t.DoneAt = &now
|
t.DoneAt = &now
|
||||||
|
globalQueue.persistLocked()
|
||||||
writeJSON(w, map[string]string{"status": "cancelled"})
|
writeJSON(w, map[string]string{"status": "cancelled"})
|
||||||
case TaskRunning:
|
case TaskRunning:
|
||||||
if t.job != nil {
|
if t.job != nil {
|
||||||
@@ -347,6 +529,7 @@ func (h *handler) handleAPITasksCancel(w http.ResponseWriter, r *http.Request) {
|
|||||||
t.Status = TaskCancelled
|
t.Status = TaskCancelled
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
t.DoneAt = &now
|
t.DoneAt = &now
|
||||||
|
globalQueue.persistLocked()
|
||||||
writeJSON(w, map[string]string{"status": "cancelled"})
|
writeJSON(w, map[string]string{"status": "cancelled"})
|
||||||
default:
|
default:
|
||||||
writeError(w, http.StatusConflict, "task is not running or pending")
|
writeError(w, http.StatusConflict, "task is not running or pending")
|
||||||
@@ -374,6 +557,7 @@ func (h *handler) handleAPITasksPriority(w http.ResponseWriter, r *http.Request)
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
t.Priority += req.Delta
|
t.Priority += req.Delta
|
||||||
|
globalQueue.persistLocked()
|
||||||
writeJSON(w, map[string]int{"priority": t.Priority})
|
writeJSON(w, map[string]int{"priority": t.Priority})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -396,6 +580,7 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
|
|||||||
n++
|
n++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
globalQueue.persistLocked()
|
||||||
globalQueue.mu.Unlock()
|
globalQueue.mu.Unlock()
|
||||||
writeJSON(w, map[string]int{"cancelled": n})
|
writeJSON(w, map[string]int{"cancelled": n})
|
||||||
}
|
}
|
||||||
@@ -418,3 +603,79 @@ func (h *handler) handleAPITasksStream(w http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
streamJob(w, r, j)
|
streamJob(w, r, j)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (q *taskQueue) assignTaskLogPathLocked(t *Task) {
|
||||||
|
if t.LogPath != "" || q.logsDir == "" || t.ID == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
t.LogPath = filepath.Join(q.logsDir, t.ID+".log")
|
||||||
|
}
|
||||||
|
|
||||||
|
func (q *taskQueue) loadLocked() {
|
||||||
|
if q.statePath == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
data, err := os.ReadFile(q.statePath)
|
||||||
|
if err != nil || len(data) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var persisted []persistedTask
|
||||||
|
if err := json.Unmarshal(data, &persisted); err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, pt := range persisted {
|
||||||
|
t := &Task{
|
||||||
|
ID: pt.ID,
|
||||||
|
Name: pt.Name,
|
||||||
|
Target: pt.Target,
|
||||||
|
Priority: pt.Priority,
|
||||||
|
Status: pt.Status,
|
||||||
|
CreatedAt: pt.CreatedAt,
|
||||||
|
StartedAt: pt.StartedAt,
|
||||||
|
DoneAt: pt.DoneAt,
|
||||||
|
ErrMsg: pt.ErrMsg,
|
||||||
|
LogPath: pt.LogPath,
|
||||||
|
params: pt.Params,
|
||||||
|
}
|
||||||
|
q.assignTaskLogPathLocked(t)
|
||||||
|
if t.Status == TaskPending || t.Status == TaskRunning {
|
||||||
|
t.Status = TaskPending
|
||||||
|
t.DoneAt = nil
|
||||||
|
t.ErrMsg = ""
|
||||||
|
}
|
||||||
|
q.tasks = append(q.tasks, t)
|
||||||
|
}
|
||||||
|
q.prune()
|
||||||
|
q.persistLocked()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (q *taskQueue) persistLocked() {
|
||||||
|
if q.statePath == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
state := make([]persistedTask, 0, len(q.tasks))
|
||||||
|
for _, t := range q.tasks {
|
||||||
|
state = append(state, persistedTask{
|
||||||
|
ID: t.ID,
|
||||||
|
Name: t.Name,
|
||||||
|
Target: t.Target,
|
||||||
|
Priority: t.Priority,
|
||||||
|
Status: t.Status,
|
||||||
|
CreatedAt: t.CreatedAt,
|
||||||
|
StartedAt: t.StartedAt,
|
||||||
|
DoneAt: t.DoneAt,
|
||||||
|
ErrMsg: t.ErrMsg,
|
||||||
|
LogPath: t.LogPath,
|
||||||
|
Params: t.params,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
data, err := json.MarshalIndent(state, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
tmp := q.statePath + ".tmp"
|
||||||
|
if err := os.WriteFile(tmp, data, 0644); err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
_ = os.Rename(tmp, q.statePath)
|
||||||
|
}
|
||||||
|
|||||||
156
audit/internal/webui/tasks_test.go
Normal file
156
audit/internal/webui/tasks_test.go
Normal file
@@ -0,0 +1,156 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/app"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
q := &taskQueue{
|
||||||
|
statePath: filepath.Join(dir, "tasks-state.json"),
|
||||||
|
logsDir: filepath.Join(dir, "tasks"),
|
||||||
|
trigger: make(chan struct{}, 1),
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(q.logsDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
started := time.Now().Add(-time.Minute)
|
||||||
|
task := &Task{
|
||||||
|
ID: "task-1",
|
||||||
|
Name: "Memory Burn-in",
|
||||||
|
Target: "memory-stress",
|
||||||
|
Priority: 2,
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: time.Now().Add(-2 * time.Minute),
|
||||||
|
StartedAt: &started,
|
||||||
|
params: taskParams{
|
||||||
|
Duration: 300,
|
||||||
|
BurnProfile: "smoke",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
q.tasks = append(q.tasks, task)
|
||||||
|
q.assignTaskLogPathLocked(task)
|
||||||
|
q.persistLocked()
|
||||||
|
|
||||||
|
recovered := &taskQueue{
|
||||||
|
statePath: q.statePath,
|
||||||
|
logsDir: q.logsDir,
|
||||||
|
trigger: make(chan struct{}, 1),
|
||||||
|
}
|
||||||
|
recovered.loadLocked()
|
||||||
|
|
||||||
|
if len(recovered.tasks) != 1 {
|
||||||
|
t.Fatalf("tasks=%d want 1", len(recovered.tasks))
|
||||||
|
}
|
||||||
|
got := recovered.tasks[0]
|
||||||
|
if got.Status != TaskPending {
|
||||||
|
t.Fatalf("status=%q want %q", got.Status, TaskPending)
|
||||||
|
}
|
||||||
|
if got.params.Duration != 300 || got.params.BurnProfile != "smoke" {
|
||||||
|
t.Fatalf("params=%+v", got.params)
|
||||||
|
}
|
||||||
|
if got.LogPath == "" {
|
||||||
|
t.Fatal("expected log path")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNewTaskJobStateLoadsExistingLog(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "task.log")
|
||||||
|
if err := os.WriteFile(path, []byte("line1\nline2\n"), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
j := newTaskJobState(path)
|
||||||
|
existing, ch := j.subscribe()
|
||||||
|
if ch == nil {
|
||||||
|
t.Fatal("expected live subscription channel")
|
||||||
|
}
|
||||||
|
if len(existing) != 2 || existing[0] != "line1" || existing[1] != "line2" {
|
||||||
|
t.Fatalf("existing=%v", existing)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResolveBurnPreset(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
profile string
|
||||||
|
want burnPreset
|
||||||
|
}{
|
||||||
|
{profile: "smoke", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}},
|
||||||
|
{profile: "acceptance", want: burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60}},
|
||||||
|
{profile: "overnight", want: burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60}},
|
||||||
|
{profile: "", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}},
|
||||||
|
}
|
||||||
|
for _, tc := range tests {
|
||||||
|
if got := resolveBurnPreset(tc.profile); got != tc.want {
|
||||||
|
t.Fatalf("resolveBurnPreset(%q)=%+v want %+v", tc.profile, got, tc.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunTaskHonorsCancel(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
blocked := make(chan struct{})
|
||||||
|
released := make(chan struct{})
|
||||||
|
aRun := func(_ any, ctx context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||||
|
close(blocked)
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
close(released)
|
||||||
|
return "", ctx.Err()
|
||||||
|
case <-time.After(5 * time.Second):
|
||||||
|
close(released)
|
||||||
|
return "unexpected", nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
q := &taskQueue{
|
||||||
|
opts: &HandlerOptions{App: &app.App{}},
|
||||||
|
}
|
||||||
|
tk := &Task{
|
||||||
|
ID: "cpu-1",
|
||||||
|
Name: "CPU SAT",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
params: taskParams{Duration: 60},
|
||||||
|
}
|
||||||
|
j := &jobState{}
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
j.cancel = cancel
|
||||||
|
tk.job = j
|
||||||
|
|
||||||
|
orig := runCPUAcceptancePackCtx
|
||||||
|
runCPUAcceptancePackCtx = func(_ *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return aRun(nil, ctx, baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
defer func() { runCPUAcceptancePackCtx = orig }()
|
||||||
|
|
||||||
|
done := make(chan struct{})
|
||||||
|
go func() {
|
||||||
|
q.runTask(tk, j, ctx)
|
||||||
|
close(done)
|
||||||
|
}()
|
||||||
|
|
||||||
|
<-blocked
|
||||||
|
j.abort()
|
||||||
|
|
||||||
|
select {
|
||||||
|
case <-released:
|
||||||
|
case <-time.After(2 * time.Second):
|
||||||
|
t.Fatal("task did not observe cancel")
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-done:
|
||||||
|
case <-time.After(2 * time.Second):
|
||||||
|
t.Fatal("runTask did not return after cancel")
|
||||||
|
}
|
||||||
|
}
|
||||||
21
bible-local/docs/iso-build-rules.md
Normal file
21
bible-local/docs/iso-build-rules.md
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
# ISO Build Rules
|
||||||
|
|
||||||
|
## Verify package names before use
|
||||||
|
|
||||||
|
ISO builds take 30–60 minutes. A wrong package name wastes an entire build cycle.
|
||||||
|
|
||||||
|
**Rule: before adding any Debian package name to the ISO config, verify it exists and check its file list.**
|
||||||
|
|
||||||
|
Use one of:
|
||||||
|
- `https://packages.debian.org/bookworm/<package-name>` — existence + description
|
||||||
|
- `https://packages.debian.org/bookworm/amd64/<package-name>/filelist` — exact files installed
|
||||||
|
- `apt-cache show <package>` inside a Debian bookworm container
|
||||||
|
|
||||||
|
This applies to:
|
||||||
|
- `iso/builder/config/package-lists/*.list.chroot`
|
||||||
|
- Any package referenced in `grub.cfg`, hooks, or overlay scripts (e.g. file paths like `/boot/memtest86+x64.bin`)
|
||||||
|
|
||||||
|
## Example of what goes wrong without this
|
||||||
|
|
||||||
|
`memtest86+` in Debian bookworm installs `/boot/memtest86+x64.bin`, not `/boot/memtest86+.bin`.
|
||||||
|
Guessing the filename caused a broken GRUB entry that only surfaced at boot time, after a full rebuild.
|
||||||
35
bible-local/docs/validate-vs-burn.md
Normal file
35
bible-local/docs/validate-vs-burn.md
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
# Validate vs Burn: Hardware Impact Policy
|
||||||
|
|
||||||
|
## Validate Tests (non-destructive)
|
||||||
|
|
||||||
|
Tests on the **Validate** page are purely diagnostic. They:
|
||||||
|
|
||||||
|
- **Do not write to disks** — no data is written to storage devices; SMART counters (power-on hours, load cycle count, reallocated sectors) are not incremented.
|
||||||
|
- **Do not run sustained high load** — commands complete quickly (seconds to minutes) and do not push hardware to thermal or electrical limits.
|
||||||
|
- **Do not increment hardware wear counters** — GPU memory ECC counters, NVMe wear leveling counters, and similar endurance metrics are unaffected.
|
||||||
|
- **Are safe to run repeatedly** — on new, production-bound, or already-deployed hardware without concern for reducing lifespan.
|
||||||
|
|
||||||
|
### What Validate tests actually do
|
||||||
|
|
||||||
|
| Test | What it runs |
|
||||||
|
|---|---|
|
||||||
|
| NVIDIA GPU | `nvidia-smi`, `dcgmi diag` (levels 1–4 read-only diagnostics) |
|
||||||
|
| Memory | `memtester` on a limited allocation; reads/writes to RAM only |
|
||||||
|
| Storage | `smartctl -a`, `nvme smart-log` — reads SMART data only |
|
||||||
|
| CPU | `stress-ng` for a bounded duration; CPU-only, no I/O |
|
||||||
|
| AMD GPU | `rocm-smi --showallinfo`, `dmidecode` — read-only queries |
|
||||||
|
|
||||||
|
## Burn Tests (hardware wear)
|
||||||
|
|
||||||
|
Tests on the **Burn** page run hardware at maximum or near-maximum load for extended durations. They:
|
||||||
|
|
||||||
|
- **Wear storage**: write-intensive patterns can reduce SSD endurance (P/E cycles).
|
||||||
|
- **Stress GPU memory**: extended ECC stress tests may surface latent defects but also exercise memory cells.
|
||||||
|
- **Accelerate thermal cycling**: repeated heat/cool cycles degrade solder joints and capacitors over time.
|
||||||
|
- **May increment wear counters**: GPU power-on hours, NVMe media wear indicator, and similar metrics will advance.
|
||||||
|
|
||||||
|
### Rule
|
||||||
|
|
||||||
|
> Run **Validate** freely on any server, at any time, before or after deployment.
|
||||||
|
> Run **Burn** only when explicitly required (e.g., initial acceptance after repair, or per customer SLA).
|
||||||
|
> Document when and why Burn tests were run.
|
||||||
@@ -11,5 +11,12 @@ CUDA_USERSPACE_VERSION=13.0.96-1
|
|||||||
DCGM_VERSION=3.3.9
|
DCGM_VERSION=3.3.9
|
||||||
ROCM_VERSION=6.3.4
|
ROCM_VERSION=6.3.4
|
||||||
ROCM_SMI_VERSION=7.4.0.60304-76~22.04
|
ROCM_SMI_VERSION=7.4.0.60304-76~22.04
|
||||||
|
ROCM_BANDWIDTH_TEST_VERSION=1.4.0.60304-76~22.04
|
||||||
|
ROCM_VALIDATION_SUITE_VERSION=1.1.0.60304-76~22.04
|
||||||
|
ROCBLAS_VERSION=4.3.0.60304-76~22.04
|
||||||
|
ROCRAND_VERSION=3.2.0.60304-76~22.04
|
||||||
|
HIP_RUNTIME_AMD_VERSION=6.3.42134.60304-76~22.04
|
||||||
|
HIPBLASLT_VERSION=0.10.0.60304-76~22.04
|
||||||
|
COMGR_VERSION=2.8.0.60304-76~22.04
|
||||||
GO_VERSION=1.24.0
|
GO_VERSION=1.24.0
|
||||||
AUDIT_VERSION=1.0.0
|
AUDIT_VERSION=1.0.0
|
||||||
|
|||||||
@@ -30,9 +30,9 @@ lb config noauto \
|
|||||||
--linux-flavours "amd64" \
|
--linux-flavours "amd64" \
|
||||||
--linux-packages "${LB_LINUX_PACKAGES}" \
|
--linux-packages "${LB_LINUX_PACKAGES}" \
|
||||||
--memtest none \
|
--memtest none \
|
||||||
--iso-volume "EASY-BEE" \
|
--iso-volume "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--iso-application "EASY-BEE" \
|
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--bootappend-live "boot=live components nomodeset video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
||||||
--apt-recommends false \
|
--apt-recommends false \
|
||||||
--chroot-squashfs-compression-type zstd \
|
--chroot-squashfs-compression-type zstd \
|
||||||
"${@}"
|
"${@}"
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}"
|
|||||||
AUTH_KEYS=""
|
AUTH_KEYS=""
|
||||||
REBUILD_IMAGE=0
|
REBUILD_IMAGE=0
|
||||||
CLEAN_CACHE=0
|
CLEAN_CACHE=0
|
||||||
|
VARIANT="all"
|
||||||
|
|
||||||
. "${BUILDER_DIR}/VERSIONS"
|
. "${BUILDER_DIR}/VERSIONS"
|
||||||
|
|
||||||
@@ -34,14 +35,23 @@ while [ $# -gt 0 ]; do
|
|||||||
REBUILD_IMAGE=1
|
REBUILD_IMAGE=1
|
||||||
shift
|
shift
|
||||||
;;
|
;;
|
||||||
|
--variant)
|
||||||
|
VARIANT="$2"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
*)
|
*)
|
||||||
echo "unknown arg: $1" >&2
|
echo "unknown arg: $1" >&2
|
||||||
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys]" >&2
|
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|amd|all]" >&2
|
||||||
exit 1
|
exit 1
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
|
||||||
|
case "$VARIANT" in
|
||||||
|
nvidia|amd|nogpu|all) ;;
|
||||||
|
*) echo "unknown variant: $VARIANT (expected nvidia, amd, nogpu, or all)" >&2; exit 1 ;;
|
||||||
|
esac
|
||||||
|
|
||||||
if [ "$CLEAN_CACHE" = "1" ]; then
|
if [ "$CLEAN_CACHE" = "1" ]; then
|
||||||
echo "=== cleaning build cache: ${CACHE_DIR} ==="
|
echo "=== cleaning build cache: ${CACHE_DIR} ==="
|
||||||
rm -rf "${CACHE_DIR:?}/go-build" \
|
rm -rf "${CACHE_DIR:?}/go-build" \
|
||||||
@@ -49,8 +59,10 @@ if [ "$CLEAN_CACHE" = "1" ]; then
|
|||||||
"${CACHE_DIR:?}/tmp" \
|
"${CACHE_DIR:?}/tmp" \
|
||||||
"${CACHE_DIR:?}/bee" \
|
"${CACHE_DIR:?}/bee" \
|
||||||
"${CACHE_DIR:?}/lb-packages"
|
"${CACHE_DIR:?}/lb-packages"
|
||||||
echo "=== cleaning live-build work dir: ${REPO_ROOT}/dist/live-build-work ==="
|
echo "=== cleaning live-build work dirs ==="
|
||||||
rm -rf "${REPO_ROOT}/dist/live-build-work"
|
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/live-build-work-nogpu"
|
||||||
echo "=== caches cleared, proceeding with build ==="
|
echo "=== caches cleared, proceeding with build ==="
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -108,34 +120,75 @@ else
|
|||||||
echo "=== using existing builder image ${IMAGE_REF} (${BUILDER_PLATFORM}) ==="
|
echo "=== using existing builder image ${IMAGE_REF} (${BUILDER_PLATFORM}) ==="
|
||||||
fi
|
fi
|
||||||
|
|
||||||
set -- \
|
# Build base docker run args (without --authorized-keys)
|
||||||
run --rm --privileged \
|
build_run_args() {
|
||||||
--platform "${BUILDER_PLATFORM}" \
|
_variant="$1"
|
||||||
-v "${REPO_ROOT}:/work" \
|
_auth_arg=""
|
||||||
-v "${CACHE_DIR}:/cache" \
|
if [ -n "$AUTH_KEYS" ]; then
|
||||||
-e BEE_CONTAINER_BUILD=1 \
|
_auth_arg="--authorized-keys /tmp/bee-authkeys/${AUTH_KEYS_BASE}"
|
||||||
-e GOCACHE=/cache/go-build \
|
fi
|
||||||
-e GOMODCACHE=/cache/go-mod \
|
echo "run --rm --privileged \
|
||||||
-e TMPDIR=/cache/tmp \
|
--platform ${BUILDER_PLATFORM} \
|
||||||
-e BEE_CACHE_DIR=/cache/bee \
|
-v ${REPO_ROOT}:/work \
|
||||||
-w /work \
|
-v ${CACHE_DIR}:/cache \
|
||||||
"${IMAGE_REF}" \
|
${AUTH_KEYS:+-v ${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro} \
|
||||||
sh /work/iso/builder/build.sh
|
|
||||||
|
|
||||||
if [ -n "$AUTH_KEYS" ]; then
|
|
||||||
set -- run --rm --privileged \
|
|
||||||
--platform "${BUILDER_PLATFORM}" \
|
|
||||||
-v "${REPO_ROOT}:/work" \
|
|
||||||
-v "${CACHE_DIR}:/cache" \
|
|
||||||
-v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \
|
|
||||||
-e BEE_CONTAINER_BUILD=1 \
|
-e BEE_CONTAINER_BUILD=1 \
|
||||||
-e GOCACHE=/cache/go-build \
|
-e GOCACHE=/cache/go-build \
|
||||||
-e GOMODCACHE=/cache/go-mod \
|
-e GOMODCACHE=/cache/go-mod \
|
||||||
-e TMPDIR=/cache/tmp \
|
-e TMPDIR=/cache/tmp \
|
||||||
-e BEE_CACHE_DIR=/cache/bee \
|
-e BEE_CACHE_DIR=/cache/bee \
|
||||||
-w /work \
|
-w /work \
|
||||||
"${IMAGE_REF}" \
|
${IMAGE_REF} \
|
||||||
sh /work/iso/builder/build.sh --authorized-keys "/tmp/bee-authkeys/${AUTH_KEYS_BASE}"
|
sh /work/iso/builder/build.sh --variant ${_variant} ${_auth_arg}"
|
||||||
fi
|
}
|
||||||
|
|
||||||
"$CONTAINER_TOOL" "$@"
|
run_variant() {
|
||||||
|
_v="$1"
|
||||||
|
echo "=== building variant: ${_v} ==="
|
||||||
|
if [ -n "$AUTH_KEYS" ]; then
|
||||||
|
"$CONTAINER_TOOL" run --rm --privileged \
|
||||||
|
--platform "${BUILDER_PLATFORM}" \
|
||||||
|
-v "${REPO_ROOT}:/work" \
|
||||||
|
-v "${CACHE_DIR}:/cache" \
|
||||||
|
-v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \
|
||||||
|
-e BEE_CONTAINER_BUILD=1 \
|
||||||
|
-e GOCACHE=/cache/go-build \
|
||||||
|
-e GOMODCACHE=/cache/go-mod \
|
||||||
|
-e TMPDIR=/cache/tmp \
|
||||||
|
-e BEE_CACHE_DIR=/cache/bee \
|
||||||
|
-w /work \
|
||||||
|
"${IMAGE_REF}" \
|
||||||
|
sh /work/iso/builder/build.sh --variant "${_v}" \
|
||||||
|
--authorized-keys "/tmp/bee-authkeys/${AUTH_KEYS_BASE}"
|
||||||
|
else
|
||||||
|
"$CONTAINER_TOOL" run --rm --privileged \
|
||||||
|
--platform "${BUILDER_PLATFORM}" \
|
||||||
|
-v "${REPO_ROOT}:/work" \
|
||||||
|
-v "${CACHE_DIR}:/cache" \
|
||||||
|
-e BEE_CONTAINER_BUILD=1 \
|
||||||
|
-e GOCACHE=/cache/go-build \
|
||||||
|
-e GOMODCACHE=/cache/go-mod \
|
||||||
|
-e TMPDIR=/cache/tmp \
|
||||||
|
-e BEE_CACHE_DIR=/cache/bee \
|
||||||
|
-w /work \
|
||||||
|
"${IMAGE_REF}" \
|
||||||
|
sh /work/iso/builder/build.sh --variant "${_v}"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
case "$VARIANT" in
|
||||||
|
nvidia)
|
||||||
|
run_variant nvidia
|
||||||
|
;;
|
||||||
|
amd)
|
||||||
|
run_variant amd
|
||||||
|
;;
|
||||||
|
nogpu)
|
||||||
|
run_variant nogpu
|
||||||
|
;;
|
||||||
|
all)
|
||||||
|
run_variant nvidia
|
||||||
|
run_variant amd
|
||||||
|
run_variant nogpu
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|||||||
@@ -13,19 +13,29 @@ BUILDER_DIR="${REPO_ROOT}/iso/builder"
|
|||||||
OVERLAY_DIR="${REPO_ROOT}/iso/overlay"
|
OVERLAY_DIR="${REPO_ROOT}/iso/overlay"
|
||||||
DIST_DIR="${REPO_ROOT}/dist"
|
DIST_DIR="${REPO_ROOT}/dist"
|
||||||
VENDOR_DIR="${REPO_ROOT}/iso/vendor"
|
VENDOR_DIR="${REPO_ROOT}/iso/vendor"
|
||||||
BUILD_WORK_DIR="${DIST_DIR}/live-build-work"
|
|
||||||
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage"
|
|
||||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||||
AUTH_KEYS=""
|
AUTH_KEYS=""
|
||||||
|
BEE_GPU_VENDOR="nvidia"
|
||||||
|
|
||||||
# parse args
|
# parse args
|
||||||
while [ $# -gt 0 ]; do
|
while [ $# -gt 0 ]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
--authorized-keys) AUTH_KEYS="$2"; shift 2 ;;
|
--authorized-keys) AUTH_KEYS="$2"; shift 2 ;;
|
||||||
|
--variant) BEE_GPU_VENDOR="$2"; shift 2 ;;
|
||||||
*) echo "unknown arg: $1"; exit 1 ;;
|
*) echo "unknown arg: $1"; exit 1 ;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
|
||||||
|
case "$BEE_GPU_VENDOR" in
|
||||||
|
nvidia|amd|nogpu) ;;
|
||||||
|
*) echo "unknown variant: $BEE_GPU_VENDOR (expected nvidia, amd, or nogpu)" >&2; exit 1 ;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BEE_GPU_VENDOR}"
|
||||||
|
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BEE_GPU_VENDOR}"
|
||||||
|
|
||||||
|
export BEE_GPU_VENDOR
|
||||||
|
|
||||||
. "${BUILDER_DIR}/VERSIONS"
|
. "${BUILDER_DIR}/VERSIONS"
|
||||||
export PATH="$PATH:/usr/local/go/bin"
|
export PATH="$PATH:/usr/local/go/bin"
|
||||||
|
|
||||||
@@ -132,7 +142,7 @@ if [ ! -d "/usr/src/linux-headers-${KVER}" ]; then
|
|||||||
apt-get install -y "linux-headers-${KVER}"
|
apt-get install -y "linux-headers-${KVER}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "=== bee ISO build ==="
|
echo "=== bee ISO build (variant: ${BEE_GPU_VENDOR}) ==="
|
||||||
echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
|
echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
|
||||||
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
|
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
|
||||||
echo ""
|
echo ""
|
||||||
@@ -141,8 +151,8 @@ echo "=== syncing git submodules ==="
|
|||||||
git -C "${REPO_ROOT}" submodule update --init --recursive
|
git -C "${REPO_ROOT}" submodule update --init --recursive
|
||||||
|
|
||||||
# --- compile bee binary (static, Linux amd64) ---
|
# --- compile bee binary (static, Linux amd64) ---
|
||||||
|
# Shared between variants — built once, reused on second pass.
|
||||||
BEE_BIN="${DIST_DIR}/bee-linux-amd64"
|
BEE_BIN="${DIST_DIR}/bee-linux-amd64"
|
||||||
GPU_STRESS_BIN="${DIST_DIR}/bee-gpu-stress-linux-amd64"
|
|
||||||
NEED_BUILD=1
|
NEED_BUILD=1
|
||||||
if [ -f "$BEE_BIN" ]; then
|
if [ -f "$BEE_BIN" ]; then
|
||||||
NEWEST_SRC=$(find "${REPO_ROOT}/audit" -name '*.go' -newer "$BEE_BIN" | head -1)
|
NEWEST_SRC=$(find "${REPO_ROOT}/audit" -name '*.go' -newer "$BEE_BIN" | head -1)
|
||||||
@@ -172,37 +182,41 @@ else
|
|||||||
echo "=== bee binary up to date, skipping build ==="
|
echo "=== bee binary up to date, skipping build ==="
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo ""
|
# --- NVIDIA-only build steps ---
|
||||||
echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
|
GPU_STRESS_BIN="${DIST_DIR}/bee-gpu-stress-linux-amd64"
|
||||||
sh "${BUILDER_DIR}/build-cublas.sh" \
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
"${CUBLAS_VERSION}" \
|
echo ""
|
||||||
"${CUDA_USERSPACE_VERSION}" \
|
echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
|
||||||
"${NCCL_CUDA_VERSION}" \
|
sh "${BUILDER_DIR}/build-cublas.sh" \
|
||||||
"${DIST_DIR}"
|
"${CUBLAS_VERSION}" \
|
||||||
|
"${CUDA_USERSPACE_VERSION}" \
|
||||||
|
"${NCCL_CUDA_VERSION}" \
|
||||||
|
"${DIST_DIR}"
|
||||||
|
|
||||||
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||||
|
|
||||||
GPU_STRESS_NEED_BUILD=1
|
GPU_STRESS_NEED_BUILD=1
|
||||||
if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then
|
if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then
|
||||||
GPU_STRESS_NEED_BUILD=0
|
GPU_STRESS_NEED_BUILD=0
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
||||||
|
echo "=== building bee-gpu-stress ==="
|
||||||
|
gcc -O2 -s -Wall -Wextra \
|
||||||
|
-I"${CUBLAS_CACHE}/include" \
|
||||||
|
-o "$GPU_STRESS_BIN" \
|
||||||
|
"${BUILDER_DIR}/bee-gpu-stress.c" \
|
||||||
|
-ldl -lm
|
||||||
|
echo "binary: $GPU_STRESS_BIN"
|
||||||
|
else
|
||||||
|
echo "=== bee-gpu-stress up to date, skipping build ==="
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
echo "=== preparing staged overlay (${BEE_GPU_VENDOR}) ==="
|
||||||
echo "=== building bee-gpu-stress ==="
|
|
||||||
gcc -O2 -s -Wall -Wextra \
|
|
||||||
-I"${CUBLAS_CACHE}/include" \
|
|
||||||
-o "$GPU_STRESS_BIN" \
|
|
||||||
"${BUILDER_DIR}/bee-gpu-stress.c" \
|
|
||||||
-ldl -lm
|
|
||||||
echo "binary: $GPU_STRESS_BIN"
|
|
||||||
else
|
|
||||||
echo "=== bee-gpu-stress up to date, skipping build ==="
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "=== preparing staged overlay ==="
|
|
||||||
# Sync builder config into work dir, preserving lb cache (chroot + packages).
|
|
||||||
# We do NOT rm -rf BUILD_WORK_DIR so lb can reuse its chroot on repeat builds.
|
|
||||||
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
||||||
|
|
||||||
|
# Sync builder config into variant work dir, preserving lb cache.
|
||||||
rsync -a --delete \
|
rsync -a --delete \
|
||||||
--exclude='cache/' \
|
--exclude='cache/' \
|
||||||
--exclude='chroot/' \
|
--exclude='chroot/' \
|
||||||
@@ -212,7 +226,10 @@ rsync -a --delete \
|
|||||||
--exclude='*.contents' \
|
--exclude='*.contents' \
|
||||||
--exclude='*.files' \
|
--exclude='*.files' \
|
||||||
"${BUILDER_DIR}/" "${BUILD_WORK_DIR}/"
|
"${BUILDER_DIR}/" "${BUILD_WORK_DIR}/"
|
||||||
# Also persist package cache to CACHE_ROOT so it survives a manual wipe of BUILD_WORK_DIR.
|
|
||||||
|
# Share deb package cache across variants.
|
||||||
|
# Restore: populate work dir cache from shared cache before build.
|
||||||
|
# Persist: sync back after build (done after lb build below).
|
||||||
LB_PKG_CACHE="${CACHE_ROOT}/lb-packages"
|
LB_PKG_CACHE="${CACHE_ROOT}/lb-packages"
|
||||||
mkdir -p "${LB_PKG_CACHE}"
|
mkdir -p "${LB_PKG_CACHE}"
|
||||||
if [ -d "${BUILD_WORK_DIR}/cache/packages.chroot" ]; then
|
if [ -d "${BUILD_WORK_DIR}/cache/packages.chroot" ]; then
|
||||||
@@ -221,6 +238,7 @@ elif [ -d "${LB_PKG_CACHE}" ] && [ "$(ls -A "${LB_PKG_CACHE}" 2>/dev/null)" ]; t
|
|||||||
mkdir -p "${BUILD_WORK_DIR}/cache/packages.chroot"
|
mkdir -p "${BUILD_WORK_DIR}/cache/packages.chroot"
|
||||||
rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/"
|
rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/"
|
rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/"
|
||||||
rm -f \
|
rm -f \
|
||||||
"${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \
|
"${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \
|
||||||
@@ -231,6 +249,12 @@ rm -f \
|
|||||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
|
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
|
||||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
"${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||||
|
|
||||||
|
# Remove NVIDIA-specific overlay files for non-nvidia variants
|
||||||
|
if [ "$BEE_GPU_VENDOR" != "nvidia" ]; then
|
||||||
|
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nvidia-load"
|
||||||
|
rm -f "${OVERLAY_STAGE_DIR}/etc/systemd/system/bee-nvidia.service"
|
||||||
|
fi
|
||||||
|
|
||||||
# --- inject authorized_keys for SSH access ---
|
# --- inject authorized_keys for SSH access ---
|
||||||
AUTHORIZED_KEYS_FILE="${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys"
|
AUTHORIZED_KEYS_FILE="${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys"
|
||||||
mkdir -p "${OVERLAY_STAGE_DIR}/root/.ssh"
|
mkdir -p "${OVERLAY_STAGE_DIR}/root/.ssh"
|
||||||
@@ -268,8 +292,11 @@ fi
|
|||||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin"
|
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin"
|
||||||
cp "${DIST_DIR}/bee-linux-amd64" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
cp "${DIST_DIR}/bee-linux-amd64" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||||
cp "${GPU_STRESS_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
|
||||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_STRESS_BIN" ]; then
|
||||||
|
cp "${GPU_STRESS_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
||||||
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
||||||
|
fi
|
||||||
|
|
||||||
# --- inject smoketest into overlay so it runs directly on the live CD ---
|
# --- inject smoketest into overlay so it runs directly on the live CD ---
|
||||||
cp "${BUILDER_DIR}/smoketest.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest"
|
cp "${BUILDER_DIR}/smoketest.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest"
|
||||||
@@ -286,100 +313,156 @@ for tool in storcli64 sas2ircu sas3ircu arcconf ssacli; do
|
|||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
# --- build NVIDIA kernel modules ---
|
# --- NVIDIA kernel modules and userspace libs ---
|
||||||
echo ""
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
echo "=== building NVIDIA ${NVIDIA_DRIVER_VERSION} modules ==="
|
echo ""
|
||||||
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
|
echo "=== building NVIDIA ${NVIDIA_DRIVER_VERSION} modules ==="
|
||||||
|
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
|
||||||
|
|
||||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||||
NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
||||||
|
|
||||||
# Inject .ko files into overlay at /usr/local/lib/nvidia/
|
# Inject .ko files into overlay at /usr/local/lib/nvidia/
|
||||||
OVERLAY_KMOD_DIR="${OVERLAY_DIR}/usr/local/lib/nvidia"
|
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
|
||||||
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
|
mkdir -p "${OVERLAY_KMOD_DIR}"
|
||||||
mkdir -p "${OVERLAY_KMOD_DIR}"
|
cp "${NVIDIA_CACHE}/modules/"*.ko "${OVERLAY_KMOD_DIR}/"
|
||||||
cp "${NVIDIA_CACHE}/modules/"*.ko "${OVERLAY_KMOD_DIR}/"
|
|
||||||
|
|
||||||
# Inject nvidia-smi and libnvidia-ml
|
# Inject nvidia-smi and libnvidia-ml
|
||||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin" "${OVERLAY_STAGE_DIR}/usr/lib"
|
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin" "${OVERLAY_STAGE_DIR}/usr/lib"
|
||||||
cp "${NVIDIA_CACHE}/bin/nvidia-smi" "${OVERLAY_STAGE_DIR}/usr/local/bin/"
|
cp "${NVIDIA_CACHE}/bin/nvidia-smi" "${OVERLAY_STAGE_DIR}/usr/local/bin/"
|
||||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-smi"
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-smi"
|
||||||
cp "${NVIDIA_CACHE}/bin/nvidia-bug-report.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/" 2>/dev/null || true
|
cp "${NVIDIA_CACHE}/bin/nvidia-bug-report.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/" 2>/dev/null || true
|
||||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-bug-report.sh" 2>/dev/null || true
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-bug-report.sh" 2>/dev/null || true
|
||||||
cp "${NVIDIA_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
cp "${NVIDIA_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
||||||
|
|
||||||
# Inject GSP firmware into /lib/firmware/nvidia/<version>/
|
# Inject GSP firmware into /lib/firmware/nvidia/<version>/
|
||||||
if [ -d "${NVIDIA_CACHE}/firmware" ] && [ "$(ls -A "${NVIDIA_CACHE}/firmware" 2>/dev/null)" ]; then
|
if [ -d "${NVIDIA_CACHE}/firmware" ] && [ "$(ls -A "${NVIDIA_CACHE}/firmware" 2>/dev/null)" ]; then
|
||||||
mkdir -p "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}"
|
mkdir -p "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}"
|
||||||
cp "${NVIDIA_CACHE}/firmware/"* "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/"
|
cp "${NVIDIA_CACHE}/firmware/"* "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/"
|
||||||
echo "=== firmware: $(ls "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/" | wc -l) files injected ==="
|
echo "=== firmware: $(ls "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/" | wc -l) files injected ==="
|
||||||
|
fi
|
||||||
|
|
||||||
|
# --- build / download NCCL ---
|
||||||
|
echo ""
|
||||||
|
echo "=== downloading NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION} ==="
|
||||||
|
sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}" "${NCCL_SHA256:-}"
|
||||||
|
|
||||||
|
NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||||
|
|
||||||
|
# Inject libnccl.so.* into overlay alongside other NVIDIA userspace libs
|
||||||
|
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||||
|
echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||||
|
|
||||||
|
# Inject cuBLAS/cuBLASLt/cudart runtime libs used by bee-gpu-stress tensor-core GEMM path
|
||||||
|
cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||||
|
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||||
|
|
||||||
|
# --- build nccl-tests ---
|
||||||
|
echo ""
|
||||||
|
echo "=== building nccl-tests ${NCCL_TESTS_VERSION} ==="
|
||||||
|
sh "${BUILDER_DIR}/build-nccl-tests.sh" \
|
||||||
|
"${NCCL_TESTS_VERSION}" \
|
||||||
|
"${NCCL_VERSION}" \
|
||||||
|
"${NCCL_CUDA_VERSION}" \
|
||||||
|
"${DIST_DIR}" \
|
||||||
|
"${NVCC_VERSION}" \
|
||||||
|
"${DEBIAN_VERSION}"
|
||||||
|
|
||||||
|
NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
|
||||||
|
cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||||
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||||
|
echo "=== all_reduce_perf injected ==="
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# --- build / download NCCL ---
|
|
||||||
echo ""
|
|
||||||
echo "=== downloading NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION} ==="
|
|
||||||
sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}" "${NCCL_SHA256:-}"
|
|
||||||
|
|
||||||
NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
|
||||||
|
|
||||||
# Inject libnccl.so.* into overlay alongside other NVIDIA userspace libs
|
|
||||||
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
|
||||||
echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
|
||||||
|
|
||||||
# Inject cuBLAS/cuBLASLt/cudart runtime libs used by bee-gpu-stress tensor-core GEMM path
|
|
||||||
cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
|
||||||
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
|
||||||
|
|
||||||
# --- build nccl-tests ---
|
|
||||||
echo ""
|
|
||||||
echo "=== building nccl-tests ${NCCL_TESTS_VERSION} ==="
|
|
||||||
sh "${BUILDER_DIR}/build-nccl-tests.sh" \
|
|
||||||
"${NCCL_TESTS_VERSION}" \
|
|
||||||
"${NCCL_VERSION}" \
|
|
||||||
"${NCCL_CUDA_VERSION}" \
|
|
||||||
"${DIST_DIR}" \
|
|
||||||
"${NVCC_VERSION}" \
|
|
||||||
"${DEBIAN_VERSION}"
|
|
||||||
|
|
||||||
NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
|
|
||||||
cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
|
||||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
|
||||||
echo "=== all_reduce_perf injected ==="
|
|
||||||
|
|
||||||
# --- embed build metadata ---
|
# --- embed build metadata ---
|
||||||
mkdir -p "${OVERLAY_STAGE_DIR}/etc"
|
mkdir -p "${OVERLAY_STAGE_DIR}/etc"
|
||||||
BUILD_DATE="$(date +%Y-%m-%d)"
|
BUILD_DATE="$(date +%Y-%m-%d)"
|
||||||
GIT_COMMIT="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo unknown)"
|
GIT_COMMIT="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo unknown)"
|
||||||
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
|
|
||||||
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
|
GPU_VERSION_LINE="NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
|
||||||
BUILD_DATE=${BUILD_DATE}
|
|
||||||
GIT_COMMIT=${GIT_COMMIT}
|
|
||||||
DEBIAN_VERSION=${DEBIAN_VERSION}
|
|
||||||
DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
|
|
||||||
NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
|
|
||||||
NCCL_VERSION=${NCCL_VERSION}
|
NCCL_VERSION=${NCCL_VERSION}
|
||||||
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
||||||
CUBLAS_VERSION=${CUBLAS_VERSION}
|
CUBLAS_VERSION=${CUBLAS_VERSION}
|
||||||
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
||||||
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
|
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}"
|
||||||
|
GPU_BUILD_INFO="nvidia:${NVIDIA_DRIVER_VERSION}"
|
||||||
|
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||||
|
GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
|
||||||
|
GPU_BUILD_INFO="rocm:${ROCM_VERSION}"
|
||||||
|
else
|
||||||
|
GPU_VERSION_LINE=""
|
||||||
|
GPU_BUILD_INFO="nogpu"
|
||||||
|
fi
|
||||||
|
|
||||||
|
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
|
||||||
|
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
|
||||||
|
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
|
||||||
|
BEE_GPU_VENDOR=${BEE_GPU_VENDOR}
|
||||||
|
BUILD_DATE=${BUILD_DATE}
|
||||||
|
GIT_COMMIT=${GIT_COMMIT}
|
||||||
|
DEBIAN_VERSION=${DEBIAN_VERSION}
|
||||||
|
DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
|
||||||
|
${GPU_VERSION_LINE}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
|
# Write GPU vendor marker for hooks
|
||||||
|
echo "${BEE_GPU_VENDOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-gpu-vendor"
|
||||||
|
|
||||||
# Patch motd with build info
|
# Patch motd with build info
|
||||||
BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} nvidia:${NVIDIA_DRIVER_VERSION}"
|
BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} ${GPU_BUILD_INFO}"
|
||||||
if [ -f "${OVERLAY_STAGE_DIR}/etc/motd" ]; then
|
if [ -f "${OVERLAY_STAGE_DIR}/etc/motd" ]; then
|
||||||
sed "s/%%BUILD_INFO%%/${BEE_BUILD_INFO}/" "${OVERLAY_STAGE_DIR}/etc/motd" \
|
sed "s/%%BUILD_INFO%%/${BEE_BUILD_INFO}/" "${OVERLAY_STAGE_DIR}/etc/motd" \
|
||||||
> "${OVERLAY_STAGE_DIR}/etc/motd.patched"
|
> "${OVERLAY_STAGE_DIR}/etc/motd.patched"
|
||||||
mv "${OVERLAY_STAGE_DIR}/etc/motd.patched" "${OVERLAY_STAGE_DIR}/etc/motd"
|
mv "${OVERLAY_STAGE_DIR}/etc/motd.patched" "${OVERLAY_STAGE_DIR}/etc/motd"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# --- substitute version placeholders in package list ---
|
# --- copy variant-specific package list, remove all other variant lists ---
|
||||||
sed -i \
|
# live-build picks up ALL .list.chroot files — delete other variants to avoid conflicts.
|
||||||
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
|
cp "${BUILD_WORK_DIR}/config/package-lists/bee-${BEE_GPU_VENDOR}.list.chroot" \
|
||||||
-e "s/%%ROCM_VERSION%%/${ROCM_VERSION}/g" \
|
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
||||||
-e "s/%%ROCM_SMI_VERSION%%/${ROCM_SMI_VERSION}/g" \
|
rm -f "${BUILD_WORK_DIR}/config/package-lists/bee-nvidia.list.chroot" \
|
||||||
"${BUILD_WORK_DIR}/config/package-lists/bee.list.chroot" \
|
"${BUILD_WORK_DIR}/config/package-lists/bee-amd.list.chroot" \
|
||||||
"${BUILD_WORK_DIR}/config/archives/rocm.list.chroot"
|
"${BUILD_WORK_DIR}/config/package-lists/bee-nogpu.list.chroot"
|
||||||
|
|
||||||
|
# --- remove archives for the other vendor(s) ---
|
||||||
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
|
rm -f "${BUILD_WORK_DIR}/config/archives/rocm.list.chroot" \
|
||||||
|
"${BUILD_WORK_DIR}/config/archives/rocm.key.chroot"
|
||||||
|
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||||
|
rm -f "${BUILD_WORK_DIR}/config/archives/nvidia-cuda.list.chroot" \
|
||||||
|
"${BUILD_WORK_DIR}/config/archives/nvidia-cuda.key.chroot"
|
||||||
|
else
|
||||||
|
# nogpu: remove both
|
||||||
|
rm -f "${BUILD_WORK_DIR}/config/archives/rocm.list.chroot" \
|
||||||
|
"${BUILD_WORK_DIR}/config/archives/rocm.key.chroot" \
|
||||||
|
"${BUILD_WORK_DIR}/config/archives/nvidia-cuda.list.chroot" \
|
||||||
|
"${BUILD_WORK_DIR}/config/archives/nvidia-cuda.key.chroot"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# --- substitute version placeholders in package list and archive ---
|
||||||
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
|
sed -i \
|
||||||
|
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
|
||||||
|
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
||||||
|
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||||
|
sed -i \
|
||||||
|
-e "s/%%ROCM_VERSION%%/${ROCM_VERSION}/g" \
|
||||||
|
-e "s/%%ROCM_SMI_VERSION%%/${ROCM_SMI_VERSION}/g" \
|
||||||
|
-e "s/%%ROCM_BANDWIDTH_TEST_VERSION%%/${ROCM_BANDWIDTH_TEST_VERSION}/g" \
|
||||||
|
-e "s/%%ROCM_VALIDATION_SUITE_VERSION%%/${ROCM_VALIDATION_SUITE_VERSION}/g" \
|
||||||
|
-e "s/%%ROCBLAS_VERSION%%/${ROCBLAS_VERSION}/g" \
|
||||||
|
-e "s/%%ROCRAND_VERSION%%/${ROCRAND_VERSION}/g" \
|
||||||
|
-e "s/%%HIP_RUNTIME_AMD_VERSION%%/${HIP_RUNTIME_AMD_VERSION}/g" \
|
||||||
|
-e "s/%%HIPBLASLT_VERSION%%/${HIPBLASLT_VERSION}/g" \
|
||||||
|
-e "s/%%COMGR_VERSION%%/${COMGR_VERSION}/g" \
|
||||||
|
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
||||||
|
if [ -f "${BUILD_WORK_DIR}/config/archives/rocm.list.chroot" ]; then
|
||||||
|
sed -i \
|
||||||
|
-e "s/%%ROCM_VERSION%%/${ROCM_VERSION}/g" \
|
||||||
|
"${BUILD_WORK_DIR}/config/archives/rocm.list.chroot"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
# --- sync overlay into live-build includes.chroot ---
|
# --- sync overlay into live-build includes.chroot ---
|
||||||
LB_DIR="${BUILD_WORK_DIR}"
|
LB_DIR="${BUILD_WORK_DIR}"
|
||||||
@@ -395,20 +478,31 @@ fi
|
|||||||
|
|
||||||
# --- build ISO using live-build ---
|
# --- build ISO using live-build ---
|
||||||
echo ""
|
echo ""
|
||||||
echo "=== building ISO (live-build) ==="
|
echo "=== building ISO (live-build, variant: ${BEE_GPU_VENDOR}) ==="
|
||||||
|
|
||||||
|
# Export for auto/config
|
||||||
|
BEE_GPU_VENDOR_UPPER="$(echo "${BEE_GPU_VENDOR}" | tr 'a-z' 'A-Z')"
|
||||||
|
export BEE_GPU_VENDOR_UPPER
|
||||||
|
|
||||||
cd "${LB_DIR}"
|
cd "${LB_DIR}"
|
||||||
lb clean 2>&1 | tail -3
|
lb clean 2>&1 | tail -3
|
||||||
lb config 2>&1 | tail -5
|
lb config 2>&1 | tail -5
|
||||||
lb build 2>&1
|
lb build 2>&1
|
||||||
|
|
||||||
|
# --- persist deb package cache back to shared location ---
|
||||||
|
# This allows the second variant to reuse all downloaded packages.
|
||||||
|
if [ -d "${BUILD_WORK_DIR}/cache/packages.chroot" ]; then
|
||||||
|
rsync -a "${BUILD_WORK_DIR}/cache/packages.chroot/" "${LB_PKG_CACHE}/"
|
||||||
|
echo "=== package cache synced to ${LB_PKG_CACHE} ==="
|
||||||
|
fi
|
||||||
|
|
||||||
# live-build outputs live-image-amd64.hybrid.iso in LB_DIR
|
# live-build outputs live-image-amd64.hybrid.iso in LB_DIR
|
||||||
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
|
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
|
||||||
ISO_OUT="${DIST_DIR}/bee-debian${DEBIAN_VERSION}-v${ISO_VERSION_EFFECTIVE}-amd64.iso"
|
ISO_OUT="${DIST_DIR}/easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64.iso"
|
||||||
if [ -f "$ISO_RAW" ]; then
|
if [ -f "$ISO_RAW" ]; then
|
||||||
cp "$ISO_RAW" "$ISO_OUT"
|
cp "$ISO_RAW" "$ISO_OUT"
|
||||||
echo ""
|
echo ""
|
||||||
echo "=== done ==="
|
echo "=== done (${BEE_GPU_VENDOR}) ==="
|
||||||
echo "ISO: $ISO_OUT"
|
echo "ISO: $ISO_OUT"
|
||||||
if command -v stat >/dev/null 2>&1; then
|
if command -v stat >/dev/null 2>&1; then
|
||||||
ISO_SIZE_BYTES="$(stat -c '%s' "$ISO_OUT" 2>/dev/null || stat -f '%z' "$ISO_OUT")"
|
ISO_SIZE_BYTES="$(stat -c '%s' "$ISO_OUT" 2>/dev/null || stat -f '%z' "$ISO_OUT")"
|
||||||
|
|||||||
@@ -10,28 +10,39 @@ echo " ╚══════╝╚═╝ ╚═╝╚══════╝
|
|||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
menuentry "EASY-BEE" {
|
menuentry "EASY-BEE" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE (load to RAM)" {
|
menuentry "EASY-BEE (load to RAM)" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram bee.nvidia.mode=normal
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE (NVIDIA GSP=off)" {
|
menuentry "EASY-BEE (NVIDIA GSP=off)" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||||
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
|
||||||
|
menuentry "EASY-BEE (NVIDIA no MSI-X)" {
|
||||||
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=nomsi net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE (fail-safe)" {
|
menuentry "EASY-BEE (fail-safe)" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "Memory Test (memtest86+)" {
|
if [ "${grub_platform}" = "efi" ]; then
|
||||||
linux16 /boot/memtest86+.bin
|
menuentry "Memory Test (memtest86+)" {
|
||||||
}
|
chainloader /boot/memtest86+x64.efi
|
||||||
|
}
|
||||||
|
else
|
||||||
|
menuentry "Memory Test (memtest86+)" {
|
||||||
|
linux16 /boot/memtest86+x64.bin
|
||||||
|
}
|
||||||
|
fi
|
||||||
|
|
||||||
if [ "${grub_platform}" = "efi" ]; then
|
if [ "${grub_platform}" = "efi" ]; then
|
||||||
menuentry "UEFI Firmware Settings" {
|
menuentry "UEFI Firmware Settings" {
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ label live-@FLAVOUR@-gsp-off
|
|||||||
menu label EASY-BEE (^NVIDIA GSP=off)
|
menu label EASY-BEE (^NVIDIA GSP=off)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off
|
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off
|
||||||
|
|
||||||
label live-@FLAVOUR@-failsafe
|
label live-@FLAVOUR@-failsafe
|
||||||
menu label EASY-BEE (^fail-safe)
|
menu label EASY-BEE (^fail-safe)
|
||||||
|
|||||||
@@ -5,25 +5,27 @@ set -e
|
|||||||
|
|
||||||
echo "=== bee chroot setup ==="
|
echo "=== bee chroot setup ==="
|
||||||
|
|
||||||
|
GPU_VENDOR=$(cat /etc/bee-gpu-vendor 2>/dev/null || echo nvidia)
|
||||||
|
echo "=== GPU vendor: ${GPU_VENDOR} ==="
|
||||||
|
|
||||||
ensure_bee_console_user() {
|
ensure_bee_console_user() {
|
||||||
if id bee >/dev/null 2>&1; then
|
if id bee >/dev/null 2>&1; then
|
||||||
usermod -d /home/bee -s /bin/sh bee 2>/dev/null || true
|
usermod -d /home/bee -s /bin/bash bee 2>/dev/null || true
|
||||||
else
|
else
|
||||||
useradd -d /home/bee -m -s /bin/sh -U bee
|
useradd -d /home/bee -m -s /bin/bash -U bee
|
||||||
fi
|
fi
|
||||||
|
|
||||||
mkdir -p /home/bee
|
mkdir -p /home/bee
|
||||||
chown -R bee:bee /home/bee
|
chown -R bee:bee /home/bee
|
||||||
echo "bee:eeb" | chpasswd
|
echo "bee:eeb" | chpasswd
|
||||||
usermod -aG sudo,video,input bee 2>/dev/null || true
|
groupadd -f ipmi 2>/dev/null || true
|
||||||
|
usermod -aG sudo,video,input,render,ipmi bee 2>/dev/null || true
|
||||||
}
|
}
|
||||||
|
|
||||||
ensure_bee_console_user
|
ensure_bee_console_user
|
||||||
|
|
||||||
# Enable bee services
|
# Enable common bee services
|
||||||
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
|
||||||
systemctl enable bee-network.service
|
systemctl enable bee-network.service
|
||||||
systemctl enable bee-nvidia.service
|
|
||||||
systemctl enable bee-preflight.service
|
systemctl enable bee-preflight.service
|
||||||
systemctl enable bee-audit.service
|
systemctl enable bee-audit.service
|
||||||
systemctl enable bee-web.service
|
systemctl enable bee-web.service
|
||||||
@@ -35,23 +37,34 @@ systemctl enable serial-getty@ttyS0.service 2>/dev/null || true
|
|||||||
systemctl enable serial-getty@ttyS1.service 2>/dev/null || true
|
systemctl enable serial-getty@ttyS1.service 2>/dev/null || true
|
||||||
systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
|
systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
|
||||||
|
|
||||||
|
# Enable GPU-vendor specific services
|
||||||
|
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||||
|
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
||||||
|
systemctl enable bee-nvidia.service
|
||||||
|
elif [ "$GPU_VENDOR" = "amd" ]; then
|
||||||
|
# ROCm symlinks (packages install to /opt/rocm-*/bin/)
|
||||||
|
for tool in rocm-smi rocm-bandwidth-test rvs; do
|
||||||
|
if [ ! -e /usr/local/bin/${tool} ]; then
|
||||||
|
bin_path="$(find /opt -path "*/bin/${tool}" -type f 2>/dev/null | sort | tail -1)"
|
||||||
|
[ -n "${bin_path}" ] && ln -sf "${bin_path}" /usr/local/bin/${tool}
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
# nogpu: no GPU services needed
|
||||||
|
|
||||||
# Ensure scripts are executable
|
# Ensure scripts are executable
|
||||||
chmod +x /usr/local/bin/bee-network.sh 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-network.sh 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
|
||||||
chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
||||||
|
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||||
|
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
|
||||||
# Reload udev rules
|
# Reload udev rules
|
||||||
udevadm control --reload-rules 2>/dev/null || true
|
udevadm control --reload-rules 2>/dev/null || true
|
||||||
|
|
||||||
# rocm-smi symlink (package installs to /opt/rocm-*/bin/rocm-smi)
|
|
||||||
if [ ! -e /usr/local/bin/rocm-smi ]; then
|
|
||||||
smi_path="$(find /opt -path '*/bin/rocm-smi' -type f 2>/dev/null | sort | tail -1)"
|
|
||||||
[ -n "${smi_path}" ] && ln -sf "${smi_path}" /usr/local/bin/rocm-smi
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Create export directory
|
# Create export directory
|
||||||
mkdir -p /appdata/bee/export
|
mkdir -p /appdata/bee/export
|
||||||
|
|
||||||
@@ -59,4 +72,4 @@ if [ -f /etc/sudoers.d/bee ]; then
|
|||||||
chmod 0440 /etc/sudoers.d/bee
|
chmod 0440 /etc/sudoers.d/bee
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "=== bee chroot setup complete ==="
|
echo "=== bee chroot setup complete (${GPU_VENDOR}) ==="
|
||||||
|
|||||||
16
iso/builder/config/hooks/normal/9100-memtest.hook.binary
Executable file
16
iso/builder/config/hooks/normal/9100-memtest.hook.binary
Executable file
@@ -0,0 +1,16 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# Copy memtest86+ binaries from chroot /boot into the ISO boot directory
|
||||||
|
# so GRUB can chainload them directly (they must be on the ISO filesystem,
|
||||||
|
# not inside the squashfs).
|
||||||
|
set -e
|
||||||
|
|
||||||
|
echo "memtest: scanning chroot/boot/ for memtest files:"
|
||||||
|
ls chroot/boot/memtest* 2>/dev/null || echo "memtest: WARNING: no memtest files found in chroot/boot/"
|
||||||
|
|
||||||
|
for f in memtest86+x64.bin memtest86+x64.efi memtest86+ia32.bin memtest86+ia32.efi; do
|
||||||
|
src="chroot/boot/${f}"
|
||||||
|
if [ -f "${src}" ]; then
|
||||||
|
cp "${src}" "binary/boot/${f}"
|
||||||
|
echo "memtest: copied ${f} to binary/boot/"
|
||||||
|
fi
|
||||||
|
done
|
||||||
9
iso/builder/config/package-lists/bee-amd.list.chroot
Normal file
9
iso/builder/config/package-lists/bee-amd.list.chroot
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
|
||||||
|
rocm-smi-lib=%%ROCM_SMI_VERSION%%
|
||||||
|
rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%%
|
||||||
|
rocm-validation-suite=%%ROCM_VALIDATION_SUITE_VERSION%%
|
||||||
|
rocblas=%%ROCBLAS_VERSION%%
|
||||||
|
rocrand=%%ROCRAND_VERSION%%
|
||||||
|
hip-runtime-amd=%%HIP_RUNTIME_AMD_VERSION%%
|
||||||
|
hipblaslt=%%HIPBLASLT_VERSION%%
|
||||||
|
comgr=%%COMGR_VERSION%%
|
||||||
1
iso/builder/config/package-lists/bee-nogpu.list.chroot
Normal file
1
iso/builder/config/package-lists/bee-nogpu.list.chroot
Normal file
@@ -0,0 +1 @@
|
|||||||
|
# No GPU variant — no NVIDIA, no AMD/ROCm packages
|
||||||
2
iso/builder/config/package-lists/bee-nvidia.list.chroot
Normal file
2
iso/builder/config/package-lists/bee-nvidia.list.chroot
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing
|
||||||
|
datacenter-gpu-manager=1:%%DCGM_VERSION%%
|
||||||
@@ -72,11 +72,5 @@ firmware-bnx2x
|
|||||||
firmware-cavium
|
firmware-cavium
|
||||||
firmware-qlogic
|
firmware-qlogic
|
||||||
|
|
||||||
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing
|
|
||||||
datacenter-gpu-manager=1:%%DCGM_VERSION%%
|
|
||||||
|
|
||||||
# AMD ROCm SMI — GPU monitoring for Instinct cards (repo: rocm/apt/6.3.4 jammy)
|
|
||||||
rocm-smi-lib=%%ROCM_SMI_VERSION%%
|
|
||||||
|
|
||||||
# glibc compat helpers (for any external binaries that need it)
|
# glibc compat helpers (for any external binaries that need it)
|
||||||
libc6
|
libc6
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ info "nvidia boot mode: ${NVIDIA_BOOT_MODE}"
|
|||||||
# --- PATH & binaries ---
|
# --- PATH & binaries ---
|
||||||
echo "-- PATH & binaries --"
|
echo "-- PATH & binaries --"
|
||||||
for tool in dmidecode smartctl nvme ipmitool lspci bee; do
|
for tool in dmidecode smartctl nvme ipmitool lspci bee; do
|
||||||
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
|
if p=$(PATH="/usr/local/bin:/usr/sbin:/sbin:$PATH" command -v "$tool" 2>/dev/null); then
|
||||||
ok "$tool found: $p"
|
ok "$tool found: $p"
|
||||||
else
|
else
|
||||||
fail "$tool: NOT FOUND"
|
fail "$tool: NOT FOUND"
|
||||||
|
|||||||
3
iso/overlay/etc/modules-load.d/bee-ipmi.conf
Normal file
3
iso/overlay/etc/modules-load.d/bee-ipmi.conf
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
# Load IPMI modules for fan/sensor/power monitoring via ipmitool
|
||||||
|
ipmi_si
|
||||||
|
ipmi_devintf
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
export PATH="$PATH:/usr/local/bin:/opt/rocm/bin:/opt/rocm/sbin"
|
export PATH="$PATH:/usr/local/bin:/usr/sbin:/sbin:/opt/rocm/bin:/opt/rocm/sbin"
|
||||||
|
|
||||||
# Print web UI URLs on the local console at login.
|
# Print web UI URLs on the local console at login.
|
||||||
if [ -z "${SSH_CONNECTION:-}" ] \
|
if [ -z "${SSH_CONNECTION:-}" ] \
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Bee: hardware audit web viewer
|
Description=Bee: hardware audit web viewer
|
||||||
After=bee-network.service bee-audit.service
|
After=bee-network.service
|
||||||
Wants=bee-audit.service
|
Wants=bee-audit.service
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
|
|||||||
2
iso/overlay/etc/udev/rules.d/99-ipmi.rules
Normal file
2
iso/overlay/etc/udev/rules.d/99-ipmi.rules
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
# Allow ipmi group to access IPMI device without root
|
||||||
|
KERNEL=="ipmi[0-9]*", GROUP="ipmi", MODE="0660"
|
||||||
@@ -67,7 +67,7 @@ case "$nvidia_mode" in
|
|||||||
load_module nvidia-modeset || true
|
load_module nvidia-modeset || true
|
||||||
load_module nvidia-uvm || true
|
load_module nvidia-uvm || true
|
||||||
;;
|
;;
|
||||||
gsp-off|safe|*)
|
gsp-off|safe)
|
||||||
# NVIDIA documents that GSP firmware is enabled by default on newer GPUs and can
|
# NVIDIA documents that GSP firmware is enabled by default on newer GPUs and can
|
||||||
# be disabled via NVreg_EnableGpuFirmware=0. Safe mode keeps the live ISO on the
|
# be disabled via NVreg_EnableGpuFirmware=0. Safe mode keeps the live ISO on the
|
||||||
# conservative path for platforms where full boot-time GSP init is unstable.
|
# conservative path for platforms where full boot-time GSP init is unstable.
|
||||||
@@ -76,6 +76,15 @@ case "$nvidia_mode" in
|
|||||||
fi
|
fi
|
||||||
log "GSP-off mode: skipping nvidia-modeset and nvidia-uvm during boot"
|
log "GSP-off mode: skipping nvidia-modeset and nvidia-uvm during boot"
|
||||||
;;
|
;;
|
||||||
|
nomsi|*)
|
||||||
|
# nomsi: disable MSI-X/MSI interrupts — use when RmInitAdapter fails with
|
||||||
|
# "Failed to enable MSI-X" on one or more GPUs (IOMMU group interrupt limits).
|
||||||
|
# NVreg_EnableMSI=0 forces legacy INTx interrupts for all GPUs.
|
||||||
|
if ! load_module nvidia NVreg_EnableGpuFirmware=0 NVreg_EnableMSI=0; then
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
log "nomsi mode: MSI-X disabled (NVreg_EnableMSI=0), skipping nvidia-modeset and nvidia-uvm"
|
||||||
|
;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
# Create /dev/nvidia* device nodes (udev rules absent since we use .run installer)
|
# Create /dev/nvidia* device nodes (udev rules absent since we use .run installer)
|
||||||
|
|||||||
@@ -2,22 +2,19 @@
|
|||||||
# openbox session: launch tint2 taskbar + chromium, then openbox as WM.
|
# openbox session: launch tint2 taskbar + chromium, then openbox as WM.
|
||||||
# This file is used as an xinitrc by bee-desktop.
|
# This file is used as an xinitrc by bee-desktop.
|
||||||
|
|
||||||
# Wait for bee-web to be accepting connections (up to 15 seconds)
|
|
||||||
i=0
|
|
||||||
while [ $i -lt 15 ]; do
|
|
||||||
if curl -sf http://localhost/healthz >/dev/null 2>&1; then
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
sleep 1
|
|
||||||
i=$((i+1))
|
|
||||||
done
|
|
||||||
|
|
||||||
# Disable screensaver and DPMS
|
# Disable screensaver and DPMS
|
||||||
xset s off
|
xset s off
|
||||||
xset -dpms
|
xset -dpms
|
||||||
xset s noblank
|
xset s noblank
|
||||||
|
|
||||||
tint2 &
|
tint2 &
|
||||||
|
# Wait for bee-web to bind (Go starts fast, usually <2s)
|
||||||
|
i=0
|
||||||
|
while [ $i -lt 30 ]; do
|
||||||
|
if curl -sf http://localhost/healthz >/dev/null 2>&1; then break; fi
|
||||||
|
sleep 1
|
||||||
|
i=$((i+1))
|
||||||
|
done
|
||||||
chromium \
|
chromium \
|
||||||
--disable-infobars \
|
--disable-infobars \
|
||||||
--disable-translate \
|
--disable-translate \
|
||||||
|
|||||||
Reference in New Issue
Block a user