feat: task queue, UI overhaul, burn tests, install-to-RAM
- Task queue: all SAT/audit jobs enqueue and run one-at-a-time; tasks persist past page navigation; new Tasks page with cancel/priority/log stream - UI: consolidate nav (Validate, Burn, Tasks, Tools); Audit becomes modal; Dashboard hardware summary badges + split metrics charts (load/temp/power); Tools page consolidates network, services, install, support bundle - AMD GPU: acceptance test and stress burn cards; GPU presence API greys out irrelevant SAT cards automatically - Burn tests: Memory Stress (stress-ng --vm), SAT Stress (stressapptest) - Install to RAM: copies squashfs to /dev/shm, re-associates loop devices via LOOP_CHANGE_FD ioctl so live media can be ejected - Charts: relative time axis (0 = now, negative left) - memtester: LimitMEMLOCK=infinity in bee-web.service; empty output → UNSUPPORTED - SAT overlay applied dynamically on every /audit.json serve - MIME panic guard for LiveCD ramdisk I/O errors - ISO: add memtest86+, stressapptest packages; memtest86+ GRUB entry; disable screensaver/DPMS in bee-openbox-session - Unknown SAT status severity = 1 (does not override OK) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -346,19 +346,20 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
||||
archive string
|
||||
err error
|
||||
)
|
||||
logLine := func(s string) { fmt.Fprintln(os.Stderr, s) }
|
||||
switch target {
|
||||
case "nvidia":
|
||||
archive, err = application.RunNvidiaAcceptancePack("")
|
||||
archive, err = application.RunNvidiaAcceptancePack("", logLine)
|
||||
case "memory":
|
||||
archive, err = application.RunMemoryAcceptancePack("")
|
||||
archive, err = application.RunMemoryAcceptancePack("", logLine)
|
||||
case "storage":
|
||||
archive, err = application.RunStorageAcceptancePack("")
|
||||
archive, err = application.RunStorageAcceptancePack("", logLine)
|
||||
case "cpu":
|
||||
dur := *duration
|
||||
if dur <= 0 {
|
||||
dur = 60
|
||||
}
|
||||
archive, err = application.RunCPUAcceptancePack("", dur)
|
||||
archive, err = application.RunCPUAcceptancePack("", dur, logLine)
|
||||
}
|
||||
if err != nil {
|
||||
slog.Error("run sat", "target", target, "err", err)
|
||||
|
||||
@@ -53,6 +53,8 @@ type networkManager interface {
|
||||
DHCPOne(iface string) (string, error)
|
||||
DHCPAll() (string, error)
|
||||
SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error)
|
||||
SetInterfaceState(iface string, up bool) error
|
||||
GetInterfaceState(iface string) (bool, error)
|
||||
}
|
||||
|
||||
type serviceManager interface {
|
||||
@@ -75,20 +77,46 @@ type toolManager interface {
|
||||
type installer interface {
|
||||
ListInstallDisks() ([]platform.InstallDisk, error)
|
||||
InstallToDisk(ctx context.Context, device string, logFile string) error
|
||||
IsLiveMediaInRAM() bool
|
||||
RunInstallToRAM(logFunc func(string)) error
|
||||
}
|
||||
|
||||
// GPUPresenceResult carries one flag per GPU vendor; a flag is true when that
// vendor was the one detected.
type GPUPresenceResult struct {
	Nvidia, AMD bool
}
|
||||
|
||||
func (a *App) DetectGPUPresence() GPUPresenceResult {
|
||||
vendor := a.sat.DetectGPUVendor()
|
||||
return GPUPresenceResult{
|
||||
Nvidia: vendor == "nvidia",
|
||||
AMD: vendor == "amd",
|
||||
}
|
||||
}
|
||||
|
||||
func (a *App) IsLiveMediaInRAM() bool {
|
||||
return a.installer.IsLiveMediaInRAM()
|
||||
}
|
||||
|
||||
func (a *App) RunInstallToRAM(logFunc func(string)) error {
|
||||
return a.installer.RunInstallToRAM(logFunc)
|
||||
}
|
||||
|
||||
type satRunner interface {
|
||||
RunNvidiaAcceptancePack(baseDir string) (string, error)
|
||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int) (string, error)
|
||||
RunMemoryAcceptancePack(baseDir string) (string, error)
|
||||
RunStorageAcceptancePack(baseDir string) (string, error)
|
||||
RunCPUAcceptancePack(baseDir string, durationSec int) (string, error)
|
||||
RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
|
||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error)
|
||||
RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error)
|
||||
RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
||||
DetectGPUVendor() string
|
||||
ListAMDGPUs() ([]platform.AMDGPUInfo, error)
|
||||
RunAMDAcceptancePack(baseDir string) (string, error)
|
||||
RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error)
|
||||
RunAMDStressPack(baseDir string, logFunc func(string)) (string, error)
|
||||
RunMemoryStressPack(baseDir string, logFunc func(string)) (string, error)
|
||||
RunSATStressPack(baseDir string, logFunc func(string)) (string, error)
|
||||
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
||||
RunNCCLTests(ctx context.Context, baseDir string) (string, error)
|
||||
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
}
|
||||
|
||||
type runtimeChecker interface {
|
||||
@@ -108,6 +136,17 @@ func New(platform *platform.System) *App {
|
||||
}
|
||||
}
|
||||
|
||||
// ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results,
|
||||
// and returns the updated JSON. Used by the web UI to serve always-fresh status.
|
||||
func ApplySATOverlay(auditJSON []byte) ([]byte, error) {
|
||||
var snap schema.HardwareIngestRequest
|
||||
if err := json.Unmarshal(auditJSON, &snap); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir)
|
||||
return json.MarshalIndent(snap, "", " ")
|
||||
}
|
||||
|
||||
func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, error) {
|
||||
if runtimeMode == runtimeenv.ModeLiveCD {
|
||||
if err := a.runtime.CaptureTechnicalDump(DefaultTechDumpDir); err != nil {
|
||||
@@ -301,6 +340,14 @@ func (a *App) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error) {
|
||||
return a.network.SetStaticIPv4(cfg)
|
||||
}
|
||||
|
||||
func (a *App) SetInterfaceState(iface string, up bool) error {
|
||||
return a.network.SetInterfaceState(iface, up)
|
||||
}
|
||||
|
||||
func (a *App) GetInterfaceState(iface string) (bool, error) {
|
||||
return a.network.GetInterfaceState(iface)
|
||||
}
|
||||
|
||||
func (a *App) SetStaticIPv4Result(cfg platform.StaticIPv4Config) (ActionResult, error) {
|
||||
body, err := a.network.SetStaticIPv4(cfg)
|
||||
return ActionResult{Title: "Static IPv4: " + cfg.Interface, Body: bodyOr(body, "Static IPv4 updated.")}, err
|
||||
@@ -416,15 +463,15 @@ func (a *App) AuditLogTailResult() ActionResult {
|
||||
return ActionResult{Title: "Audit log tail", Body: body}
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
||||
func (a *App) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaAcceptancePack(baseDir)
|
||||
return a.sat.RunNvidiaAcceptancePack(baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.RunNvidiaAcceptancePack(baseDir)
|
||||
path, err := a.RunNvidiaAcceptancePack(baseDir, nil)
|
||||
body := "Archive written."
|
||||
if path != "" {
|
||||
body = "Archive written to " + path
|
||||
@@ -436,11 +483,11 @@ func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||
return a.sat.ListNvidiaGPUs()
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int) (ActionResult, error) {
|
||||
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, diagLevel, gpuIndices)
|
||||
path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, diagLevel, gpuIndices, logFunc)
|
||||
body := "Archive written."
|
||||
if path != "" {
|
||||
body = "Archive written to " + path
|
||||
@@ -448,39 +495,39 @@ func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir st
|
||||
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePack(baseDir string) (string, error) {
|
||||
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunMemoryAcceptancePack(baseDir)
|
||||
return a.sat.RunMemoryAcceptancePack(baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.RunMemoryAcceptancePack(baseDir)
|
||||
path, err := a.RunMemoryAcceptancePack(baseDir, nil)
|
||||
return ActionResult{Title: "Memory SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) {
|
||||
func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunCPUAcceptancePack(baseDir, durationSec)
|
||||
return a.sat.RunCPUAcceptancePack(baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (ActionResult, error) {
|
||||
path, err := a.RunCPUAcceptancePack(baseDir, durationSec)
|
||||
path, err := a.RunCPUAcceptancePack(baseDir, durationSec, nil)
|
||||
return ActionResult{Title: "CPU SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePack(baseDir string) (string, error) {
|
||||
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunStorageAcceptancePack(baseDir)
|
||||
return a.sat.RunStorageAcceptancePack(baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.RunStorageAcceptancePack(baseDir)
|
||||
path, err := a.RunStorageAcceptancePack(baseDir, nil)
|
||||
return ActionResult{Title: "Storage SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
@@ -492,18 +539,33 @@ func (a *App) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
|
||||
return a.sat.ListAMDGPUs()
|
||||
}
|
||||
|
||||
func (a *App) RunAMDAcceptancePack(baseDir string) (string, error) {
|
||||
func (a *App) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDAcceptancePack(baseDir)
|
||||
return a.sat.RunAMDAcceptancePack(baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.RunAMDAcceptancePack(baseDir)
|
||||
path, err := a.RunAMDAcceptancePack(baseDir, nil)
|
||||
return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryStressPack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.sat.RunMemoryStressPack(baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunSATStressPack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.sat.RunSATStressPack(baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDStressPack(baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDStressPack(baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
@@ -512,7 +574,7 @@ func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platfor
|
||||
}
|
||||
|
||||
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
||||
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir)
|
||||
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
|
||||
body := "Results: " + path
|
||||
if err != nil && err != context.Canceled {
|
||||
body += "\nERROR: " + err.Error()
|
||||
|
||||
@@ -43,6 +43,9 @@ func (f fakeNetwork) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error
|
||||
return f.setStaticIPv4Fn(cfg)
|
||||
}
|
||||
|
||||
func (f fakeNetwork) SetInterfaceState(_ string, _ bool) error { return nil }
|
||||
func (f fakeNetwork) GetInterfaceState(_ string) (bool, error) { return true, nil }
|
||||
|
||||
type fakeServices struct {
|
||||
serviceStatusFn func(string) (string, error)
|
||||
serviceDoFn func(string, platform.ServiceAction) (string, error)
|
||||
@@ -123,11 +126,11 @@ type fakeSAT struct {
|
||||
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
||||
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir string, _ int, _ []int) (string, error) {
|
||||
func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir string, _ int, _ []int, _ func(string)) (string, error) {
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
@@ -138,15 +141,15 @@ func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunMemoryAcceptancePack(baseDir string) (string, error) {
|
||||
func (f fakeSAT) RunMemoryAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
||||
return f.runMemoryFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunStorageAcceptancePack(baseDir string) (string, error) {
|
||||
func (f fakeSAT) RunStorageAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
||||
return f.runStorageFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) {
|
||||
func (f fakeSAT) RunCPUAcceptancePack(baseDir string, durationSec int, _ func(string)) (string, error) {
|
||||
if f.runCPUFn != nil {
|
||||
return f.runCPUFn(baseDir, durationSec)
|
||||
}
|
||||
@@ -167,18 +170,22 @@ func (f fakeSAT) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunAMDAcceptancePack(baseDir string) (string, error) {
|
||||
func (f fakeSAT) RunAMDAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
||||
if f.runAMDPackFn != nil {
|
||||
return f.runAMDPackFn(baseDir)
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunAMDStressPack(_ string, _ func(string)) (string, error) { return "", nil }
|
||||
func (f fakeSAT) RunMemoryStressPack(_ string, _ func(string)) (string, error) { return "", nil }
|
||||
func (f fakeSAT) RunSATStressPack(_ string, _ func(string)) (string, error) { return "", nil }
|
||||
|
||||
func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStressOptions) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string) (string, error) {
|
||||
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
@@ -574,13 +581,13 @@ func TestRunSATDefaultsToExportDir(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
if _, err := a.RunNvidiaAcceptancePack(""); err != nil {
|
||||
if _, err := a.RunNvidiaAcceptancePack("", nil); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if _, err := a.RunMemoryAcceptancePack(""); err != nil {
|
||||
if _, err := a.RunMemoryAcceptancePack("", nil); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if _, err := a.RunStorageAcceptancePack(""); err != nil {
|
||||
if _, err := a.RunStorageAcceptancePack("", nil); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -141,9 +141,11 @@ func satSummaryStatus(summary satSummary, label string) (string, string, bool) {
|
||||
func satKeyStatus(rawStatus, label string) (string, string, bool) {
|
||||
switch strings.ToUpper(strings.TrimSpace(rawStatus)) {
|
||||
case "OK":
|
||||
return "OK", label + " passed", true
|
||||
// No error description on success — error_description is for problems only.
|
||||
return "OK", "", true
|
||||
case "PARTIAL", "UNSUPPORTED", "CANCELED", "CANCELLED":
|
||||
return "Warning", label + " incomplete", true
|
||||
// Tool couldn't run or test was incomplete — we can't assert hardware health.
|
||||
return "Unknown", "", true
|
||||
case "FAILED":
|
||||
return "Critical", label + " failed", true
|
||||
default:
|
||||
@@ -180,6 +182,8 @@ func statusSeverity(status string) int {
|
||||
return 2
|
||||
case "OK":
|
||||
return 1
|
||||
case "Unknown":
|
||||
return 1 // same as OK — does not override OK from another source
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
|
||||
@@ -76,6 +76,58 @@ func SampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||
return sampleGPUMetrics(gpuIndices)
|
||||
}
|
||||
|
||||
// sampleAMDGPUMetrics queries rocm-smi for live GPU metrics.
|
||||
func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
|
||||
// --showtemp --showuse --showpower --csv — one row per GPU
|
||||
out, err := runROCmSMI("--showtemp", "--showuse", "--showpower", "--showmemuse", "--csv")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var rows []GPUMetricRow
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" || strings.HasPrefix(strings.ToLower(line), "device") {
|
||||
continue
|
||||
}
|
||||
// CSV format: device,temp_c,gpu_use%,mem_use%,power_w (order may vary by rocm-smi version)
|
||||
// We parse by column header from the first line.
|
||||
parts := strings.Split(line, ",")
|
||||
if len(parts) < 2 {
|
||||
continue
|
||||
}
|
||||
idx := len(rows)
|
||||
row := GPUMetricRow{GPUIndex: idx}
|
||||
// rocm-smi CSV columns vary; extract what we can
|
||||
for i, p := range parts {
|
||||
p = strings.TrimSpace(p)
|
||||
switch {
|
||||
case i == 0:
|
||||
// device index like "card0" or "0"
|
||||
case strings.Contains(strings.ToLower(p), "n/a"):
|
||||
// skip N/A
|
||||
default:
|
||||
// Try to match by position heuristic: temp, use%, memuse%, power
|
||||
v := parseGPUFloat(p)
|
||||
switch {
|
||||
case i == 1 && row.TempC == 0:
|
||||
row.TempC = v
|
||||
case i == 2 && row.UsagePct == 0:
|
||||
row.UsagePct = v
|
||||
case i == 3 && row.MemUsagePct == 0:
|
||||
row.MemUsagePct = v
|
||||
case i == 4 && row.PowerW == 0:
|
||||
row.PowerW = v
|
||||
}
|
||||
}
|
||||
}
|
||||
rows = append(rows, row)
|
||||
}
|
||||
if len(rows) == 0 {
|
||||
return nil, fmt.Errorf("rocm-smi: no GPU rows parsed")
|
||||
}
|
||||
return rows, nil
|
||||
}
|
||||
|
||||
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
||||
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
||||
var b bytes.Buffer
|
||||
|
||||
178
audit/internal/platform/install_to_ram.go
Normal file
178
audit/internal/platform/install_to_ram.go
Normal file
@@ -0,0 +1,178 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func (s *System) IsLiveMediaInRAM() bool {
|
||||
out, err := exec.Command("findmnt", "-n", "-o", "FSTYPE", "/run/live/medium").Output()
|
||||
if err != nil {
|
||||
return toramActive()
|
||||
}
|
||||
return strings.TrimSpace(string(out)) == "tmpfs"
|
||||
}
|
||||
|
||||
func (s *System) RunInstallToRAM(logFunc func(string)) error {
|
||||
log := func(msg string) {
|
||||
if logFunc != nil {
|
||||
logFunc(msg)
|
||||
}
|
||||
}
|
||||
|
||||
if s.IsLiveMediaInRAM() {
|
||||
log("Already running from RAM — installation media can be safely disconnected.")
|
||||
return nil
|
||||
}
|
||||
|
||||
squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
|
||||
if err != nil || len(squashfsFiles) == 0 {
|
||||
return fmt.Errorf("no squashfs files found in /run/live/medium/live/")
|
||||
}
|
||||
|
||||
free := freeMemBytes()
|
||||
var needed int64
|
||||
for _, sf := range squashfsFiles {
|
||||
fi, err2 := os.Stat(sf)
|
||||
if err2 != nil {
|
||||
return fmt.Errorf("stat %s: %v", sf, err2)
|
||||
}
|
||||
needed += fi.Size()
|
||||
}
|
||||
const headroom = 256 * 1024 * 1024
|
||||
if free > 0 && needed+headroom > free {
|
||||
return fmt.Errorf("insufficient RAM: need %s, available %s",
|
||||
humanBytes(needed+headroom), humanBytes(free))
|
||||
}
|
||||
|
||||
dstDir := "/dev/shm/bee-live"
|
||||
if err := os.MkdirAll(dstDir, 0755); err != nil {
|
||||
return fmt.Errorf("create tmpfs dir: %v", err)
|
||||
}
|
||||
|
||||
for _, sf := range squashfsFiles {
|
||||
base := filepath.Base(sf)
|
||||
dst := filepath.Join(dstDir, base)
|
||||
log(fmt.Sprintf("Copying %s to RAM...", base))
|
||||
if err := copyFileLarge(sf, dst, log); err != nil {
|
||||
return fmt.Errorf("copy %s: %v", base, err)
|
||||
}
|
||||
log(fmt.Sprintf("Copied %s.", base))
|
||||
|
||||
loopDev, err := findLoopForFile(sf)
|
||||
if err != nil {
|
||||
log(fmt.Sprintf("Loop device for %s not found (%v) — skipping re-association.", base, err))
|
||||
continue
|
||||
}
|
||||
if err := reassociateLoopDevice(loopDev, dst); err != nil {
|
||||
log(fmt.Sprintf("Warning: could not re-associate %s → %s: %v", loopDev, dst, err))
|
||||
} else {
|
||||
log(fmt.Sprintf("Loop device %s now backed by RAM copy.", loopDev))
|
||||
}
|
||||
}
|
||||
|
||||
log("Copying remaining medium files...")
|
||||
if err := cpDir("/run/live/medium", dstDir, log); err != nil {
|
||||
log(fmt.Sprintf("Warning: partial copy: %v", err))
|
||||
}
|
||||
if err := exec.Command("mount", "--bind", dstDir, "/run/live/medium").Run(); err != nil {
|
||||
log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err))
|
||||
}
|
||||
|
||||
log("Done. Installation media can be safely disconnected.")
|
||||
return nil
|
||||
}
|
||||
|
||||
func copyFileLarge(src, dst string, logFunc func(string)) error {
|
||||
in, err := os.Open(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer in.Close()
|
||||
fi, err := in.Stat()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
out, err := os.Create(dst)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer out.Close()
|
||||
total := fi.Size()
|
||||
var copied int64
|
||||
buf := make([]byte, 4*1024*1024)
|
||||
for {
|
||||
n, err := in.Read(buf)
|
||||
if n > 0 {
|
||||
if _, werr := out.Write(buf[:n]); werr != nil {
|
||||
return werr
|
||||
}
|
||||
copied += int64(n)
|
||||
if logFunc != nil && total > 0 {
|
||||
pct := int(float64(copied) / float64(total) * 100)
|
||||
logFunc(fmt.Sprintf(" %s / %s (%d%%)", humanBytes(copied), humanBytes(total), pct))
|
||||
}
|
||||
}
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return out.Sync()
|
||||
}
|
||||
|
||||
func cpDir(src, dst string, logFunc func(string)) error {
|
||||
return filepath.Walk(src, func(path string, fi os.FileInfo, err error) error {
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
rel, _ := filepath.Rel(src, path)
|
||||
target := filepath.Join(dst, rel)
|
||||
if fi.IsDir() {
|
||||
return os.MkdirAll(target, fi.Mode())
|
||||
}
|
||||
if strings.HasSuffix(path, ".squashfs") {
|
||||
return nil
|
||||
}
|
||||
if _, err := os.Stat(target); err == nil {
|
||||
return nil
|
||||
}
|
||||
return copyFileLarge(path, target, nil)
|
||||
})
|
||||
}
|
||||
|
||||
// findLoopForFile returns the name of the loop device whose backing file, as
// reported by `losetup --list --json`, equals backingFile exactly.
//
// An error is returned when losetup cannot be run, its output is not valid
// JSON, or no listed device matches. Empty losetup output is treated as "no
// loop devices" rather than as a JSON parse failure.
// NOTE(review): losetup appears to print nothing when no loop device exists —
// confirm against the util-linux version on the ISO.
func findLoopForFile(backingFile string) (string, error) {
	out, err := exec.Command("losetup", "--list", "--json").Output()
	if err != nil {
		return "", fmt.Errorf("losetup --list: %w", err)
	}

	var result struct {
		Loopdevices []struct {
			Name     string `json:"name"`
			BackFile string `json:"back-file"`
		} `json:"loopdevices"`
	}
	if strings.TrimSpace(string(out)) != "" {
		if err := json.Unmarshal(out, &result); err != nil {
			return "", fmt.Errorf("parse losetup output: %w", err)
		}
	}

	for _, dev := range result.Loopdevices {
		if dev.BackFile == backingFile {
			return dev.Name, nil
		}
	}
	return "", fmt.Errorf("no loop device found for %s", backingFile)
}
|
||||
|
||||
func reassociateLoopDevice(loopDev, newFile string) error {
|
||||
if err := exec.Command("losetup", "--replace", loopDev, newFile).Run(); err == nil {
|
||||
return nil
|
||||
}
|
||||
return loopChangeFD(loopDev, newFile)
|
||||
}
|
||||
28
audit/internal/platform/install_to_ram_linux.go
Normal file
28
audit/internal/platform/install_to_ram_linux.go
Normal file
@@ -0,0 +1,28 @@
|
||||
//go:build linux
|
||||
|
||||
package platform
|
||||
|
||||
import (
|
||||
"os"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
// ioctlLoopChangeFD is the LOOP_CHANGE_FD request number from <linux/loop.h>.
// The previous value 0x4C08 is LOOP_SET_DIRECT_IO, which leaves the backing
// file untouched even when the ioctl reports success.
const ioctlLoopChangeFD = 0x4C06

// loopChangeFD swaps the backing file of loopDev for newFile using the
// LOOP_CHANGE_FD ioctl. It returns the open error for either path, or the
// errno reported by the ioctl.
func loopChangeFD(loopDev, newFile string) error {
	lf, err := os.OpenFile(loopDev, os.O_RDWR, 0)
	if err != nil {
		// NOTE(review): a read-only loop device may refuse O_RDWR, and
		// LOOP_CHANGE_FD is documented for read-only devices — retry read-only.
		var roErr error
		lf, roErr = os.OpenFile(loopDev, os.O_RDONLY, 0)
		if roErr != nil {
			return err
		}
	}
	defer lf.Close()

	nf, err := os.OpenFile(newFile, os.O_RDONLY, 0)
	if err != nil {
		return err
	}
	defer nf.Close()

	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, lf.Fd(), ioctlLoopChangeFD, nf.Fd())
	if errno != 0 {
		return errno
	}
	return nil
}
|
||||
9
audit/internal/platform/install_to_ram_other.go
Normal file
9
audit/internal/platform/install_to_ram_other.go
Normal file
@@ -0,0 +1,9 @@
|
||||
//go:build !linux
|
||||
|
||||
package platform
|
||||
|
||||
import "errors"
|
||||
|
||||
// loopChangeFD is the stub used on non-Linux builds; it ignores both arguments
// and always fails because the ioctl does not exist there.
func loopChangeFD(loopDev, newFile string) error {
	err := errors.New("LOOP_CHANGE_FD not available on this platform")
	return err
}
|
||||
@@ -32,9 +32,12 @@ type TempReading struct {
|
||||
func SampleLiveMetrics() LiveMetricSample {
|
||||
s := LiveMetricSample{Timestamp: time.Now().UTC()}
|
||||
|
||||
// GPU metrics — skipped silently if nvidia-smi unavailable
|
||||
gpus, _ := SampleGPUMetrics(nil)
|
||||
s.GPUs = gpus
|
||||
// GPU metrics — try NVIDIA first, fall back to AMD
|
||||
if gpus, err := SampleGPUMetrics(nil); err == nil && len(gpus) > 0 {
|
||||
s.GPUs = gpus
|
||||
} else if amdGPUs, err := sampleAMDGPUMetrics(); err == nil && len(amdGPUs) > 0 {
|
||||
s.GPUs = amdGPUs
|
||||
}
|
||||
|
||||
// Fan speeds — skipped silently if ipmitool unavailable
|
||||
fans, _ := sampleFanSpeeds()
|
||||
|
||||
@@ -131,6 +131,25 @@ func (s *System) SetStaticIPv4(cfg StaticIPv4Config) (string, error) {
|
||||
return out.String(), nil
|
||||
}
|
||||
|
||||
// SetInterfaceState brings a network interface up or down.
|
||||
func (s *System) SetInterfaceState(iface string, up bool) error {
|
||||
state := "down"
|
||||
if up {
|
||||
state = "up"
|
||||
}
|
||||
return exec.Command("ip", "link", "set", "dev", iface, state).Run()
|
||||
}
|
||||
|
||||
// GetInterfaceState returns true if the interface is UP.
|
||||
func (s *System) GetInterfaceState(iface string) (bool, error) {
|
||||
raw, err := os.ReadFile(fmt.Sprintf("/sys/class/net/%s/operstate", iface))
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
state := strings.TrimSpace(string(raw))
|
||||
return state == "up", nil
|
||||
}
|
||||
|
||||
func listInterfaceNames() ([]string, error) {
|
||||
raw, err := exec.Command("ip", "-o", "link", "show").Output()
|
||||
if err != nil {
|
||||
|
||||
@@ -2,6 +2,8 @@ package platform
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"bufio"
|
||||
"bytes"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"errors"
|
||||
@@ -13,6 +15,7 @@ import (
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
@@ -32,6 +35,40 @@ var (
|
||||
}
|
||||
)
|
||||
|
||||
// streamExecOutput runs cmd and streams each output line to logFunc (if non-nil).
// Returns combined stdout+stderr as a byte slice.
//
// Stdout and stderr share one pipe. Each line has its trailing "\n" (and a
// preceding "\r", if any) removed before it is passed to logFunc, and is
// stored in the result with a single "\n" terminator.
//
// Lines are read with bufio.Reader rather than bufio.Scanner: a Scanner stops
// at its token-size limit, after which nothing drains the pipe and cmd.Wait
// blocks forever on a child that prints one very long line.
//
// If the command cannot be started, the start error is returned with a nil
// slice; otherwise the error from cmd.Wait is returned with the output.
func streamExecOutput(cmd *exec.Cmd, logFunc func(string)) ([]byte, error) {
	pr, pw := io.Pipe()
	cmd.Stdout = pw
	cmd.Stderr = pw

	var buf bytes.Buffer
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		reader := bufio.NewReader(pr)
		for {
			line, rerr := reader.ReadString('\n')
			if line != "" {
				line = strings.TrimSuffix(line, "\n")
				line = strings.TrimSuffix(line, "\r")
				buf.WriteString(line + "\n")
				if logFunc != nil {
					logFunc(line)
				}
			}
			if rerr != nil {
				// The pipe only ends with io.EOF, once the writer is closed.
				return
			}
		}
	}()

	if err := cmd.Start(); err != nil {
		_ = pw.Close()
		wg.Wait()
		return nil, err
	}
	waitErr := cmd.Wait()
	// Closing the writer ends the reader goroutine; wait for it before
	// touching buf.
	_ = pw.Close()
	wg.Wait()
	return buf.Bytes(), waitErr
}
|
||||
|
||||
// NvidiaGPU holds basic GPU info from nvidia-smi.
|
||||
type NvidiaGPU struct {
|
||||
Index int
|
||||
@@ -80,13 +117,27 @@ func (s *System) ListAMDGPUs() ([]AMDGPUInfo, error) {
|
||||
}
|
||||
|
||||
// RunAMDAcceptancePack runs an AMD GPU diagnostic pack using rocm-smi.
|
||||
func (s *System) RunAMDAcceptancePack(baseDir string) (string, error) {
|
||||
func (s *System) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return runAcceptancePack(baseDir, "gpu-amd", []satJob{
|
||||
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||
{name: "02-rocm-smi-showallinfo.log", cmd: []string{"rocm-smi", "--showallinfo"}},
|
||||
{name: "03-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
{name: "04-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
})
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
// RunAMDStressPack runs an AMD GPU burn-in pack.
|
||||
// Missing tools are reported as UNSUPPORTED, consistent with the existing SAT pattern.
|
||||
func (s *System) RunAMDStressPack(baseDir string, logFunc func(string)) (string, error) {
|
||||
seconds := envInt("BEE_AMD_STRESS_SECONDS", 300)
|
||||
return runAcceptancePack(baseDir, "gpu-amd-stress", []satJob{
|
||||
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||
{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
|
||||
{name: fmt.Sprintf("03-rocm-smi-monitor-%ds.log", seconds), cmd: []string{
|
||||
"rocm-smi", "--showtemp", "--showpower",
|
||||
fmt.Sprintf("--duration=%d", seconds),
|
||||
}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
// ListNvidiaGPUs returns GPUs visible to nvidia-smi.
|
||||
@@ -123,7 +174,7 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
|
||||
|
||||
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
|
||||
// Measures collective communication bandwidth over NVLink/PCIe.
|
||||
func (s *System) RunNCCLTests(ctx context.Context, baseDir string) (string, error) {
|
||||
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
// detect GPU count
|
||||
out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
|
||||
gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
|
||||
@@ -136,32 +187,65 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string) (string, erro
|
||||
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
||||
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
||||
}},
|
||||
})
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
||||
return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs())
|
||||
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs(), logFunc)
|
||||
}
|
||||
|
||||
// RunNvidiaAcceptancePackWithOptions runs the NVIDIA diagnostics via DCGM.
|
||||
// diagLevel: 1=quick, 2=medium, 3=targeted stress, 4=extended stress.
|
||||
// gpuIndices: specific GPU indices to test (empty = all GPUs).
|
||||
// ctx cancellation kills the running job.
|
||||
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int) (string, error) {
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices))
|
||||
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunMemoryAcceptancePack(baseDir string) (string, error) {
|
||||
func (s *System) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
|
||||
passes := envInt("BEE_MEMTESTER_PASSES", 1)
|
||||
return runAcceptancePack(baseDir, "memory", []satJob{
|
||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
||||
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||||
})
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) {
|
||||
func (s *System) RunMemoryStressPack(baseDir string, logFunc func(string)) (string, error) {
|
||||
seconds := envInt("BEE_VM_STRESS_SECONDS", 300)
|
||||
// Use 80% of RAM by default; override with BEE_VM_STRESS_SIZE_MB.
|
||||
sizeArg := "80%"
|
||||
if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
|
||||
sizeArg = fmt.Sprintf("%dM", mb)
|
||||
}
|
||||
return runAcceptancePack(baseDir, "memory-stress", []satJob{
|
||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||
{name: "02-stress-ng-vm.log", cmd: []string{
|
||||
"stress-ng", "--vm", "1",
|
||||
"--vm-bytes", sizeArg,
|
||||
"--vm-method", "all",
|
||||
"--timeout", fmt.Sprintf("%d", seconds),
|
||||
"--metrics-brief",
|
||||
}},
|
||||
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunSATStressPack(baseDir string, logFunc func(string)) (string, error) {
|
||||
seconds := envInt("BEE_SAT_STRESS_SECONDS", 300)
|
||||
cmd := []string{"stressapptest", "-s", fmt.Sprintf("%d", seconds), "-W", "--cc_test"}
|
||||
if mb := envInt("BEE_SAT_STRESS_MB", 0); mb > 0 {
|
||||
cmd = append(cmd, "-M", fmt.Sprintf("%d", mb))
|
||||
}
|
||||
return runAcceptancePack(baseDir, "sat-stress", []satJob{
|
||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||
{name: "02-stressapptest.log", cmd: cmd},
|
||||
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
if durationSec <= 0 {
|
||||
durationSec = 60
|
||||
}
|
||||
@@ -170,10 +254,10 @@ func (s *System) RunCPUAcceptancePack(baseDir string, durationSec int) (string,
|
||||
{name: "02-sensors-before.log", cmd: []string{"sensors"}},
|
||||
{name: "03-stress-ng.log", cmd: []string{"stress-ng", "--cpu", "0", "--cpu-method", "all", "--timeout", fmt.Sprintf("%d", durationSec)}},
|
||||
{name: "04-sensors-after.log", cmd: []string{"sensors"}},
|
||||
})
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunStorageAcceptancePack(baseDir string) (string, error) {
|
||||
func (s *System) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
if baseDir == "" {
|
||||
baseDir = "/var/log/bee-sat"
|
||||
}
|
||||
@@ -205,7 +289,7 @@ func (s *System) RunStorageAcceptancePack(baseDir string) (string, error) {
|
||||
commands := storageSATCommands(devPath)
|
||||
for cmdIndex, job := range commands {
|
||||
name := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+1, job.name)
|
||||
out, err := runSATCommand(verboseLog, job.name, job.cmd)
|
||||
out, err := runSATCommand(verboseLog, job.name, job.cmd, logFunc)
|
||||
if writeErr := os.WriteFile(filepath.Join(runDir, name), out, 0644); writeErr != nil {
|
||||
return "", writeErr
|
||||
}
|
||||
@@ -254,7 +338,7 @@ func nvidiaSATJobs() []satJob {
|
||||
}
|
||||
}
|
||||
|
||||
func runAcceptancePack(baseDir, prefix string, jobs []satJob) (string, error) {
|
||||
func runAcceptancePack(baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
|
||||
if baseDir == "" {
|
||||
baseDir = "/var/log/bee-sat"
|
||||
}
|
||||
@@ -269,11 +353,13 @@ func runAcceptancePack(baseDir, prefix string, jobs []satJob) (string, error) {
|
||||
stats := satStats{}
|
||||
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||
for _, job := range jobs {
|
||||
var out []byte
|
||||
var err error
|
||||
cmd := make([]string, 0, len(job.cmd))
|
||||
for _, arg := range job.cmd {
|
||||
cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
|
||||
}
|
||||
out, err := runSATCommand(verboseLog, job.name, cmd)
|
||||
out, err = runSATCommand(verboseLog, job.name, cmd, logFunc)
|
||||
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
||||
return "", writeErr
|
||||
}
|
||||
@@ -315,7 +401,7 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
||||
}
|
||||
}
|
||||
|
||||
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob) (string, error) {
|
||||
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
|
||||
if baseDir == "" {
|
||||
baseDir = "/var/log/bee-sat"
|
||||
}
|
||||
@@ -342,9 +428,9 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
||||
var err error
|
||||
|
||||
if job.collectGPU {
|
||||
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir)
|
||||
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
|
||||
} else {
|
||||
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env)
|
||||
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
|
||||
}
|
||||
|
||||
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
||||
@@ -368,13 +454,16 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
||||
return archive, nil
|
||||
}
|
||||
|
||||
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string) ([]byte, error) {
|
||||
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string, logFunc func(string)) ([]byte, error) {
|
||||
start := time.Now().UTC()
|
||||
resolvedCmd, err := resolveSATCommand(cmd)
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
|
||||
"cmd: "+strings.Join(resolvedCmd, " "),
|
||||
)
|
||||
if logFunc != nil {
|
||||
logFunc(fmt.Sprintf("=== %s ===", name))
|
||||
}
|
||||
if err != nil {
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
||||
@@ -389,7 +478,7 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string
|
||||
if len(env) > 0 {
|
||||
c.Env = append(os.Environ(), env...)
|
||||
}
|
||||
out, err := c.CombinedOutput()
|
||||
out, err := streamExecOutput(c, logFunc)
|
||||
|
||||
rc := 0
|
||||
if err != nil {
|
||||
@@ -464,6 +553,11 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
|
||||
}
|
||||
|
||||
text := strings.ToLower(string(out))
|
||||
// No output at all means the tool failed to start (mlock limit, binary missing,
|
||||
// etc.) — we cannot say anything about hardware health → UNSUPPORTED.
|
||||
if len(strings.TrimSpace(text)) == 0 {
|
||||
return "UNSUPPORTED", rc
|
||||
}
|
||||
if strings.Contains(text, "unsupported") ||
|
||||
strings.Contains(text, "not supported") ||
|
||||
strings.Contains(text, "invalid opcode") ||
|
||||
@@ -472,19 +566,25 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
|
||||
strings.Contains(text, "not available") ||
|
||||
strings.Contains(text, "cuda_error_system_not_ready") ||
|
||||
strings.Contains(text, "no such device") ||
|
||||
// nvidia-smi on a machine with no NVIDIA GPU
|
||||
strings.Contains(text, "couldn't communicate with the nvidia driver") ||
|
||||
strings.Contains(text, "no nvidia gpu") ||
|
||||
(strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) {
|
||||
return "UNSUPPORTED", rc
|
||||
}
|
||||
return "FAILED", rc
|
||||
}
|
||||
|
||||
func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
|
||||
func runSATCommand(verboseLog, name string, cmd []string, logFunc func(string)) ([]byte, error) {
|
||||
start := time.Now().UTC()
|
||||
resolvedCmd, err := resolveSATCommand(cmd)
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
|
||||
"cmd: "+strings.Join(resolvedCmd, " "),
|
||||
)
|
||||
if logFunc != nil {
|
||||
logFunc(fmt.Sprintf("=== %s ===", name))
|
||||
}
|
||||
if err != nil {
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
||||
@@ -495,7 +595,7 @@ func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
|
||||
return []byte(err.Error() + "\n"), err
|
||||
}
|
||||
|
||||
out, err := satExecCommand(resolvedCmd[0], resolvedCmd[1:]...).CombinedOutput()
|
||||
out, err := streamExecOutput(satExecCommand(resolvedCmd[0], resolvedCmd[1:]...), logFunc)
|
||||
|
||||
rc := 0
|
||||
if err != nil {
|
||||
@@ -597,7 +697,7 @@ func parseStorageDevices(raw string) []string {
|
||||
|
||||
// runSATCommandWithMetrics runs a command while collecting GPU metrics in the background.
|
||||
// On completion it writes gpu-metrics.csv and gpu-metrics.html into runDir.
|
||||
func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string) ([]byte, error) {
|
||||
func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string, logFunc func(string)) ([]byte, error) {
|
||||
stopCh := make(chan struct{})
|
||||
doneCh := make(chan struct{})
|
||||
var metricRows []GPUMetricRow
|
||||
@@ -625,7 +725,7 @@ func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd
|
||||
}
|
||||
}()
|
||||
|
||||
out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env)
|
||||
out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env, logFunc)
|
||||
|
||||
close(stopCh)
|
||||
<-doneCh
|
||||
|
||||
@@ -147,7 +147,7 @@ func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanS
|
||||
"--seconds", strconv.Itoa(durSec),
|
||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||
}
|
||||
out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, env)
|
||||
out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, env, nil)
|
||||
_ = os.WriteFile(filepath.Join(runDir, stepName+".log"), out, 0644)
|
||||
if err != nil && err != context.Canceled && err.Error() != "signal: killed" {
|
||||
fmt.Fprintf(&summary, "%s_status=FAILED\n", stepName)
|
||||
|
||||
@@ -17,6 +17,10 @@ func (s *System) ListBeeServices() ([]string, error) {
|
||||
}
|
||||
for _, match := range matches {
|
||||
name := strings.TrimSuffix(filepath.Base(match), ".service")
|
||||
// Skip template units (e.g. bee-journal-mirror@) — they have no instances to query.
|
||||
if strings.HasSuffix(name, "@") {
|
||||
continue
|
||||
}
|
||||
if !seen[name] {
|
||||
seen[name] = true
|
||||
out = append(out, name)
|
||||
|
||||
@@ -9,7 +9,6 @@ import (
|
||||
"net/http"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
@@ -110,39 +109,37 @@ func runCmdJob(j *jobState, cmd *exec.Cmd) {
|
||||
|
||||
// ── Audit ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
func (h *handler) handleAPIAuditRun(w http.ResponseWriter, r *http.Request) {
|
||||
func (h *handler) handleAPIAuditRun(w http.ResponseWriter, _ *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
id := newJobID("audit")
|
||||
j := globalJobs.create(id)
|
||||
go func() {
|
||||
j.append("Running audit...")
|
||||
result, err := h.opts.App.RunAuditNow(h.opts.RuntimeMode)
|
||||
if err != nil {
|
||||
j.append("ERROR: " + err.Error())
|
||||
j.finish(err.Error())
|
||||
return
|
||||
}
|
||||
for _, line := range strings.Split(result.Body, "\n") {
|
||||
if line != "" {
|
||||
j.append(line)
|
||||
}
|
||||
}
|
||||
j.finish("")
|
||||
}()
|
||||
writeJSON(w, map[string]string{"job_id": id})
|
||||
t := &Task{
|
||||
ID: newJobID("audit"),
|
||||
Name: "Audit",
|
||||
Target: "audit",
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIAuditStream(w http.ResponseWriter, r *http.Request) {
|
||||
id := r.URL.Query().Get("job_id")
|
||||
j, ok := globalJobs.get(id)
|
||||
if !ok {
|
||||
http.Error(w, "job not found", http.StatusNotFound)
|
||||
if id == "" {
|
||||
id = r.URL.Query().Get("task_id")
|
||||
}
|
||||
// Try task queue first, then legacy job manager
|
||||
if j, ok := globalQueue.findJob(id); ok {
|
||||
streamJob(w, r, j)
|
||||
return
|
||||
}
|
||||
streamJob(w, r, j)
|
||||
if j, ok := globalJobs.get(id); ok {
|
||||
streamJob(w, r, j)
|
||||
return
|
||||
}
|
||||
http.Error(w, "job not found", http.StatusNotFound)
|
||||
}
|
||||
|
||||
// ── SAT ───────────────────────────────────────────────────────────────────────
|
||||
@@ -153,96 +150,87 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
id := newJobID("sat-" + target)
|
||||
j := globalJobs.create(id)
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
j.cancel = cancel
|
||||
|
||||
go func() {
|
||||
defer cancel()
|
||||
j.append(fmt.Sprintf("Starting %s acceptance test...", target))
|
||||
var (
|
||||
archive string
|
||||
err error
|
||||
)
|
||||
var body struct {
|
||||
Duration int `json:"duration"`
|
||||
DiagLevel int `json:"diag_level"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
}
|
||||
body.DiagLevel = 1
|
||||
if r.ContentLength > 0 {
|
||||
_ = json.NewDecoder(r.Body).Decode(&body)
|
||||
}
|
||||
|
||||
// Parse optional parameters
|
||||
var body struct {
|
||||
Duration int `json:"duration"`
|
||||
DiagLevel int `json:"diag_level"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
}
|
||||
body.DiagLevel = 1
|
||||
if r.ContentLength > 0 {
|
||||
_ = json.NewDecoder(r.Body).Decode(&body)
|
||||
}
|
||||
|
||||
switch target {
|
||||
case "nvidia":
|
||||
if len(body.GPUIndices) > 0 || body.DiagLevel > 0 {
|
||||
result, e := h.opts.App.RunNvidiaAcceptancePackWithOptions(
|
||||
ctx, "", body.DiagLevel, body.GPUIndices,
|
||||
)
|
||||
if e != nil {
|
||||
err = e
|
||||
} else {
|
||||
archive = result.Body
|
||||
}
|
||||
} else {
|
||||
archive, err = h.opts.App.RunNvidiaAcceptancePack("")
|
||||
}
|
||||
case "memory":
|
||||
archive, err = h.opts.App.RunMemoryAcceptancePack("")
|
||||
case "storage":
|
||||
archive, err = h.opts.App.RunStorageAcceptancePack("")
|
||||
case "cpu":
|
||||
dur := body.Duration
|
||||
if dur <= 0 {
|
||||
dur = 60
|
||||
}
|
||||
archive, err = h.opts.App.RunCPUAcceptancePack("", dur)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
if ctx.Err() != nil {
|
||||
j.append("Aborted.")
|
||||
j.finish("aborted")
|
||||
} else {
|
||||
j.append("ERROR: " + err.Error())
|
||||
j.finish(err.Error())
|
||||
}
|
||||
return
|
||||
}
|
||||
j.append(fmt.Sprintf("Archive written: %s", archive))
|
||||
j.finish("")
|
||||
}()
|
||||
|
||||
writeJSON(w, map[string]string{"job_id": id})
|
||||
name := taskNames[target]
|
||||
if name == "" {
|
||||
name = target
|
||||
}
|
||||
t := &Task{
|
||||
ID: newJobID("sat-" + target),
|
||||
Name: name,
|
||||
Target: target,
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{
|
||||
Duration: body.Duration,
|
||||
DiagLevel: body.DiagLevel,
|
||||
GPUIndices: body.GPUIndices,
|
||||
},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
|
||||
}
|
||||
}
|
||||
|
||||
func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
|
||||
id := r.URL.Query().Get("job_id")
|
||||
j, ok := globalJobs.get(id)
|
||||
if !ok {
|
||||
http.Error(w, "job not found", http.StatusNotFound)
|
||||
if id == "" {
|
||||
id = r.URL.Query().Get("task_id")
|
||||
}
|
||||
if j, ok := globalQueue.findJob(id); ok {
|
||||
streamJob(w, r, j)
|
||||
return
|
||||
}
|
||||
streamJob(w, r, j)
|
||||
if j, ok := globalJobs.get(id); ok {
|
||||
streamJob(w, r, j)
|
||||
return
|
||||
}
|
||||
http.Error(w, "job not found", http.StatusNotFound)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPISATAbort(w http.ResponseWriter, r *http.Request) {
|
||||
id := r.URL.Query().Get("job_id")
|
||||
j, ok := globalJobs.get(id)
|
||||
if !ok {
|
||||
http.Error(w, "job not found", http.StatusNotFound)
|
||||
if id == "" {
|
||||
id = r.URL.Query().Get("task_id")
|
||||
}
|
||||
if t, ok := globalQueue.findByID(id); ok {
|
||||
globalQueue.mu.Lock()
|
||||
switch t.Status {
|
||||
case TaskPending:
|
||||
t.Status = TaskCancelled
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
case TaskRunning:
|
||||
if t.job != nil {
|
||||
t.job.abort()
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
}
|
||||
globalQueue.mu.Unlock()
|
||||
writeJSON(w, map[string]string{"status": "aborted"})
|
||||
return
|
||||
}
|
||||
if j.abort() {
|
||||
writeJSON(w, map[string]string{"status": "aborted"})
|
||||
} else {
|
||||
writeJSON(w, map[string]string{"status": "not_running"})
|
||||
if j, ok := globalJobs.get(id); ok {
|
||||
if j.abort() {
|
||||
writeJSON(w, map[string]string{"status": "aborted"})
|
||||
} else {
|
||||
writeJSON(w, map[string]string{"status": "not_running"})
|
||||
}
|
||||
return
|
||||
}
|
||||
http.Error(w, "job not found", http.StatusNotFound)
|
||||
}
|
||||
|
||||
// ── Services ──────────────────────────────────────────────────────────────────
|
||||
@@ -401,6 +389,51 @@ func (h *handler) handleAPIExportBundle(w http.ResponseWriter, r *http.Request)
|
||||
})
|
||||
}
|
||||
|
||||
// ── GPU presence ──────────────────────────────────────────────────────────────
|
||||
|
||||
func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
gp := h.opts.App.DetectGPUPresence()
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_ = json.NewEncoder(w).Encode(map[string]bool{
|
||||
"nvidia": gp.Nvidia,
|
||||
"amd": gp.AMD,
|
||||
})
|
||||
}
|
||||
|
||||
// ── System ────────────────────────────────────────────────────────────────────
|
||||
|
||||
func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
inRAM := h.opts.App.IsLiveMediaInRAM()
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_ = json.NewEncoder(w).Encode(map[string]bool{"in_ram": inRAM})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
t := &Task{
|
||||
ID: newJobID("install-to-ram"),
|
||||
Name: "Install to RAM",
|
||||
Target: "install-to-ram",
|
||||
Priority: 10,
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_ = json.NewEncoder(w).Encode(map[string]string{"task_id": t.ID})
|
||||
}
|
||||
|
||||
// ── Tools ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
var standardTools = []string{
|
||||
@@ -507,7 +540,7 @@ func (h *handler) handleAPIInstallRun(w http.ResponseWriter, r *http.Request) {
|
||||
h.installMu.Unlock()
|
||||
|
||||
logFile := platform.InstallLogPath(req.Device)
|
||||
go runCmdJob(j, exec.CommandContext(r.Context(), "bee-install", req.Device, logFile))
|
||||
go runCmdJob(j, exec.CommandContext(context.Background(), "bee-install", req.Device, logFile))
|
||||
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
}
|
||||
@@ -589,3 +622,95 @@ func (h *handler) handleAPIMetricsStream(w http.ResponseWriter, r *http.Request)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── Network toggle ────────────────────────────────────────────────────────────
|
||||
|
||||
const netRollbackTimeout = 60 * time.Second
|
||||
|
||||
func (h *handler) handleAPINetworkToggle(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
var req struct {
|
||||
Iface string `json:"iface"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil || req.Iface == "" {
|
||||
writeError(w, http.StatusBadRequest, "iface is required")
|
||||
return
|
||||
}
|
||||
|
||||
wasUp, err := h.opts.App.GetInterfaceState(req.Iface)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
if err := h.opts.App.SetInterfaceState(req.Iface, !wasUp); err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
// Cancel any existing pending change (rollback it first).
|
||||
h.pendingNetMu.Lock()
|
||||
if h.pendingNet != nil {
|
||||
prev := h.pendingNet
|
||||
prev.mu.Lock()
|
||||
prev.timer.Stop()
|
||||
_ = h.opts.App.SetInterfaceState(prev.iface, prev.wasUp)
|
||||
prev.mu.Unlock()
|
||||
}
|
||||
|
||||
pnc := &pendingNetChange{iface: req.Iface, wasUp: wasUp}
|
||||
pnc.timer = time.AfterFunc(netRollbackTimeout, func() {
|
||||
_ = h.opts.App.SetInterfaceState(req.Iface, wasUp)
|
||||
h.pendingNetMu.Lock()
|
||||
if h.pendingNet == pnc {
|
||||
h.pendingNet = nil
|
||||
}
|
||||
h.pendingNetMu.Unlock()
|
||||
})
|
||||
h.pendingNet = pnc
|
||||
h.pendingNetMu.Unlock()
|
||||
|
||||
newState := "up"
|
||||
if wasUp {
|
||||
newState = "down"
|
||||
}
|
||||
writeJSON(w, map[string]any{
|
||||
"iface": req.Iface,
|
||||
"new_state": newState,
|
||||
"rollback_in": int(netRollbackTimeout.Seconds()),
|
||||
})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPINetworkConfirm(w http.ResponseWriter, _ *http.Request) {
|
||||
h.pendingNetMu.Lock()
|
||||
pnc := h.pendingNet
|
||||
h.pendingNet = nil
|
||||
h.pendingNetMu.Unlock()
|
||||
if pnc != nil {
|
||||
pnc.mu.Lock()
|
||||
pnc.timer.Stop()
|
||||
pnc.mu.Unlock()
|
||||
}
|
||||
writeJSON(w, map[string]string{"status": "confirmed"})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPINetworkRollback(w http.ResponseWriter, _ *http.Request) {
|
||||
h.pendingNetMu.Lock()
|
||||
pnc := h.pendingNet
|
||||
h.pendingNet = nil
|
||||
h.pendingNetMu.Unlock()
|
||||
if pnc == nil {
|
||||
writeError(w, http.StatusConflict, "no pending network change")
|
||||
return
|
||||
}
|
||||
pnc.mu.Lock()
|
||||
pnc.timer.Stop()
|
||||
pnc.mu.Unlock()
|
||||
if h.opts.App != nil {
|
||||
_ = h.opts.App.SetInterfaceState(pnc.iface, pnc.wasUp)
|
||||
}
|
||||
writeJSON(w, map[string]string{"status": "rolled back"})
|
||||
}
|
||||
|
||||
@@ -84,17 +84,13 @@ tbody tr:hover td{background:rgba(0,0,0,.03)}
|
||||
}
|
||||
|
||||
func layoutNav(active string) string {
|
||||
items := []struct{ id, label, href string }{
|
||||
{"dashboard", "Dashboard", "/"},
|
||||
{"viewer", "Audit Snapshot", "/viewer"},
|
||||
{"metrics", "Metrics", "/metrics"},
|
||||
{"tests", "Acceptance Tests", "/tests"},
|
||||
{"burn-in", "Burn-in", "/burn-in"},
|
||||
{"network", "Network", "/network"},
|
||||
{"services", "Services", "/services"},
|
||||
{"export", "Export", "/export"},
|
||||
{"tools", "Tools", "/tools"},
|
||||
{"install", "Install to Disk", "/install"},
|
||||
items := []struct{ id, label, href, onclick string }{
|
||||
{"dashboard", "Dashboard", "/", ""},
|
||||
{"audit", "Audit", "#", "openAuditModal();return false;"},
|
||||
{"validate", "Validate", "/validate", ""},
|
||||
{"burn", "Burn", "/burn", ""},
|
||||
{"tasks", "Tasks", "/tasks", ""},
|
||||
{"tools", "Tools", "/tools", ""},
|
||||
}
|
||||
var b strings.Builder
|
||||
b.WriteString(`<aside class="sidebar">`)
|
||||
@@ -105,8 +101,13 @@ func layoutNav(active string) string {
|
||||
if item.id == active {
|
||||
cls += " active"
|
||||
}
|
||||
b.WriteString(fmt.Sprintf(`<a class="%s" href="%s">%s</a>`,
|
||||
cls, item.href, item.label))
|
||||
if item.onclick != "" {
|
||||
b.WriteString(fmt.Sprintf(`<a class="%s" href="%s" onclick="%s">%s</a>`,
|
||||
cls, item.href, item.onclick, item.label))
|
||||
} else {
|
||||
b.WriteString(fmt.Sprintf(`<a class="%s" href="%s">%s</a>`,
|
||||
cls, item.href, item.label))
|
||||
}
|
||||
}
|
||||
b.WriteString(`</nav></aside>`)
|
||||
return b.String()
|
||||
@@ -120,18 +121,35 @@ func renderPage(page string, opts HandlerOptions) string {
|
||||
pageID = "dashboard"
|
||||
title = "Dashboard"
|
||||
body = renderDashboard(opts)
|
||||
case "validate":
|
||||
pageID = "validate"
|
||||
title = "Validate"
|
||||
body = renderValidate()
|
||||
case "burn":
|
||||
pageID = "burn"
|
||||
title = "Burn"
|
||||
body = renderBurn()
|
||||
case "tasks":
|
||||
pageID = "tasks"
|
||||
title = "Tasks"
|
||||
body = renderTasks()
|
||||
case "tools":
|
||||
pageID = "tools"
|
||||
title = "Tools"
|
||||
body = renderTools()
|
||||
// Legacy routes kept accessible but not in nav
|
||||
case "metrics":
|
||||
pageID = "metrics"
|
||||
title = "Live Metrics"
|
||||
body = renderMetrics()
|
||||
case "tests":
|
||||
pageID = "tests"
|
||||
pageID = "validate"
|
||||
title = "Acceptance Tests"
|
||||
body = renderTests()
|
||||
body = renderValidate()
|
||||
case "burn-in":
|
||||
pageID = "burn-in"
|
||||
pageID = "burn"
|
||||
title = "Burn-in Tests"
|
||||
body = renderBurnIn()
|
||||
body = renderBurn()
|
||||
case "network":
|
||||
pageID = "network"
|
||||
title = "Network"
|
||||
@@ -144,10 +162,6 @@ func renderPage(page string, opts HandlerOptions) string {
|
||||
pageID = "export"
|
||||
title = "Export"
|
||||
body = renderExport(opts.ExportDir)
|
||||
case "tools":
|
||||
pageID = "tools"
|
||||
title = "Tools"
|
||||
body = renderTools()
|
||||
case "install":
|
||||
pageID = "install"
|
||||
title = "Install to Disk"
|
||||
@@ -162,48 +176,158 @@ func renderPage(page string, opts HandlerOptions) string {
|
||||
layoutNav(pageID) +
|
||||
`<div class="main"><div class="topbar"><h1>` + html.EscapeString(title) + `</h1></div><div class="content">` +
|
||||
body +
|
||||
`</div></div></body></html>`
|
||||
`</div></div>` +
|
||||
renderAuditModal() +
|
||||
`</body></html>`
|
||||
}
|
||||
|
||||
// ── Dashboard ─────────────────────────────────────────────────────────────────
|
||||
|
||||
func renderDashboard(opts HandlerOptions) string {
|
||||
var b strings.Builder
|
||||
b.WriteString(`<div class="grid2">`)
|
||||
// Left: health summary
|
||||
b.WriteString(`<div>`)
|
||||
b.WriteString(renderHardwareSummaryCard(opts))
|
||||
b.WriteString(renderHealthCard(opts))
|
||||
b.WriteString(`</div>`)
|
||||
// Right: quick actions
|
||||
b.WriteString(`<div>`)
|
||||
b.WriteString(`<div class="card"><div class="card-head">Quick Actions</div><div class="card-body">`)
|
||||
b.WriteString(`<a class="btn btn-primary" href="/export/support.tar.gz" style="display:block;margin-bottom:10px">⬇ Download Support Bundle</a>`)
|
||||
b.WriteString(`<a class="btn btn-secondary" href="/audit.json" style="display:block;margin-bottom:10px" target="_blank">📄 Open audit.json</a>`)
|
||||
b.WriteString(`<a class="btn btn-secondary" href="/export/" style="display:block">📁 Browse Export Files</a>`)
|
||||
b.WriteString(`<div style="margin-top:14px"><button class="btn btn-secondary" onclick="runAudit()">▶ Re-run Audit</button></div>`)
|
||||
b.WriteString(`</div></div>`)
|
||||
b.WriteString(`</div>`)
|
||||
b.WriteString(`</div>`)
|
||||
// Audit run output div
|
||||
b.WriteString(`<div id="audit-output" style="display:none" class="card"><div class="card-head">Audit Output</div><div class="card-body"><div id="audit-terminal" class="terminal"></div></div></div>`)
|
||||
|
||||
b.WriteString(`<script>
|
||||
function runAudit() {
|
||||
document.getElementById('audit-output').style.display='block';
|
||||
const term = document.getElementById('audit-terminal');
|
||||
term.textContent = 'Starting audit...\n';
|
||||
fetch('/api/audit/run', {method:'POST'})
|
||||
.then(r => r.json())
|
||||
.then(d => {
|
||||
const es = new EventSource('/api/audit/stream?job_id=' + d.job_id);
|
||||
es.onmessage = e => { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||
es.addEventListener('done', e => { es.close(); term.textContent += (e.data ? '\\nERROR: ' + e.data : '\\nDone.') + '\n'; location.reload(); });
|
||||
});
|
||||
}
|
||||
</script>`)
|
||||
b.WriteString(renderMetrics())
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func renderHardwareSummaryCard(opts HandlerOptions) string {
|
||||
data, err := loadSnapshot(opts.AuditPath)
|
||||
if err != nil {
|
||||
return `<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body"><span class="badge badge-unknown">No audit data</span></div></div>`
|
||||
}
|
||||
// Parse just enough fields for the summary banner
|
||||
var snap struct {
|
||||
Summary struct {
|
||||
CPU struct{ Model string }
|
||||
Memory struct{ TotalGB float64 }
|
||||
Storage []struct{ Device, Model, Size string }
|
||||
GPUs []struct{ Model string }
|
||||
PSUs []struct{ Model string }
|
||||
}
|
||||
Network struct {
|
||||
Interfaces []struct {
|
||||
Name string
|
||||
IPv4 []string
|
||||
State string
|
||||
}
|
||||
}
|
||||
}
|
||||
// Try to extract top-level fields loosely
|
||||
var raw map[string]json.RawMessage
|
||||
if err := json.Unmarshal(data, &raw); err != nil {
|
||||
return `<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body"><span class="badge badge-err">Parse error</span></div></div>`
|
||||
}
|
||||
_ = snap
|
||||
|
||||
// Also load runtime-health for badges
|
||||
type componentHealth struct {
|
||||
FailCount int `json:"fail_count"`
|
||||
WarnCount int `json:"warn_count"`
|
||||
}
|
||||
type healthSummary struct {
|
||||
CPU componentHealth `json:"cpu"`
|
||||
Memory componentHealth `json:"memory"`
|
||||
Storage componentHealth `json:"storage"`
|
||||
GPU componentHealth `json:"gpu"`
|
||||
PSU componentHealth `json:"psu"`
|
||||
Network componentHealth `json:"network"`
|
||||
}
|
||||
var health struct {
|
||||
HardwareHealth healthSummary `json:"hardware_health"`
|
||||
}
|
||||
if hdata, herr := loadSnapshot(filepath.Join(opts.ExportDir, "runtime-health.json")); herr == nil {
|
||||
_ = json.Unmarshal(hdata, &health)
|
||||
}
|
||||
|
||||
badge := func(h componentHealth) string {
|
||||
if h.FailCount > 0 {
|
||||
return `<span class="badge badge-err">FAIL</span>`
|
||||
}
|
||||
if h.WarnCount > 0 {
|
||||
return `<span class="badge badge-warn">WARN</span>`
|
||||
}
|
||||
return `<span class="badge badge-ok">OK</span>`
|
||||
}
|
||||
|
||||
// Extract readable strings from raw JSON
|
||||
getString := func(key string) string {
|
||||
v, ok := raw[key]
|
||||
if !ok {
|
||||
return ""
|
||||
}
|
||||
var s string
|
||||
if err := json.Unmarshal(v, &s); err == nil {
|
||||
return s
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
cpuModel := getString("cpu_model")
|
||||
memStr := getString("memory_summary")
|
||||
gpuSummary := getString("gpu_summary")
|
||||
|
||||
var b strings.Builder
|
||||
b.WriteString(`<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body">`)
|
||||
b.WriteString(`<table style="width:auto">`)
|
||||
writeRow := func(label, value, badgeHTML string) {
|
||||
b.WriteString(fmt.Sprintf(`<tr><td style="padding:6px 14px 6px 0;font-weight:700;white-space:nowrap">%s</td><td style="padding:6px 0">%s</td><td style="padding:6px 0 6px 12px">%s</td></tr>`,
|
||||
html.EscapeString(label), html.EscapeString(value), badgeHTML))
|
||||
}
|
||||
if cpuModel != "" {
|
||||
writeRow("CPU", cpuModel, badge(health.HardwareHealth.CPU))
|
||||
} else {
|
||||
writeRow("CPU", "—", badge(health.HardwareHealth.CPU))
|
||||
}
|
||||
if memStr != "" {
|
||||
writeRow("Memory", memStr, badge(health.HardwareHealth.Memory))
|
||||
} else {
|
||||
writeRow("Memory", "—", badge(health.HardwareHealth.Memory))
|
||||
}
|
||||
if gpuSummary != "" {
|
||||
writeRow("GPU", gpuSummary, badge(health.HardwareHealth.GPU))
|
||||
} else {
|
||||
writeRow("GPU", "—", badge(health.HardwareHealth.GPU))
|
||||
}
|
||||
writeRow("Storage", "—", badge(health.HardwareHealth.Storage))
|
||||
writeRow("PSU", "—", badge(health.HardwareHealth.PSU))
|
||||
b.WriteString(`</table>`)
|
||||
b.WriteString(`</div></div>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// renderAuditModal returns the markup and client script for the Audit modal:
// a hidden overlay with "Re-run Audit", "Download" and "Open Viewer" actions
// and a terminal area that streams the log of the enqueued audit task.
//
// The script exposes openAuditModal, closeAuditModal and auditModalRun.
// auditModalRun POSTs to /api/audit/run and follows the returned task via
// /api/tasks/{task_id}/stream. A failed request or an error reply is written
// to the terminal, and a previous stream is closed before a new run starts.
func renderAuditModal() string {
	return `<div id="audit-modal-overlay" style="display:none;position:fixed;inset:0;background:rgba(0,0,0,.5);z-index:100;align-items:center;justify-content:center">
<div style="background:#fff;border-radius:6px;padding:24px;min-width:480px;max-width:700px;position:relative">
<div style="font-weight:700;font-size:16px;margin-bottom:16px">Audit</div>
<div style="margin-bottom:12px;display:flex;gap:8px">
<button class="btn btn-primary" onclick="auditModalRun()">▶ Re-run Audit</button>
<a class="btn btn-secondary" href="/audit.json" download>↓ Download</a>
<a class="btn btn-secondary" href="/viewer" target="_blank">Open Viewer</a>
</div>
<div id="audit-modal-terminal" class="terminal" style="display:none;max-height:300px"></div>
<button class="btn btn-secondary btn-sm" onclick="closeAuditModal()" style="position:absolute;top:12px;right:12px">✕</button>
</div>
</div>
<script>
var auditModalES = null;
function openAuditModal() {
document.getElementById('audit-modal-overlay').style.display='flex';
}
function closeAuditModal() {
document.getElementById('audit-modal-overlay').style.display='none';
}
function auditModalRun() {
const term = document.getElementById('audit-modal-terminal');
if (auditModalES) { auditModalES.close(); auditModalES = null; }
term.style.display='block'; term.textContent='Starting...\n';
fetch('/api/audit/run',{method:'POST'}).then(r=>r.json()).then(d=>{
if (d.error || !d.task_id) { term.textContent+='ERROR: '+(d.error||'no task id returned')+'\n'; return; }
const es=new EventSource('/api/tasks/'+d.task_id+'/stream');
auditModalES=es;
es.onmessage=e=>{term.textContent+=e.data+'\n';term.scrollTop=term.scrollHeight;};
es.addEventListener('done',e=>{es.close();if(auditModalES===es)auditModalES=null;term.textContent+=(e.data?'\nERROR: '+e.data:'\nDone.')+'\n';});
}).catch(err=>{term.textContent+='ERROR: '+err+'\n';});
}
</script>`
}
|
||||
|
||||
func renderHealthCard(opts HandlerOptions) string {
|
||||
data, err := loadSnapshot(filepath.Join(opts.ExportDir, "runtime-health.json"))
|
||||
if err != nil {
|
||||
@@ -239,12 +363,26 @@ func renderHealthCard(opts HandlerOptions) string {
|
||||
// ── Metrics ───────────────────────────────────────────────────────────────────
|
||||
|
||||
func renderMetrics() string {
|
||||
return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Live metrics — updated every 2 seconds. Charts use go-analyze/charts (grafana theme).</p>
|
||||
return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Live metrics — updated every 2 seconds.</p>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Server</div>
|
||||
<div class="card-head">Server — Load</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-server" src="/api/metrics/chart/server.svg" style="width:100%;display:block;border-radius:6px" alt="Server metrics">
|
||||
<img id="chart-server-load" src="/api/metrics/chart/server-load.svg" style="width:100%;display:block;border-radius:6px" alt="CPU/Mem load">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Server — Temperature</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-server-temp" src="/api/metrics/chart/server-temp.svg" style="width:100%;display:block;border-radius:6px" alt="CPU temperature">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Server — Power</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-server-power" src="/api/metrics/chart/server-power.svg" style="width:100%;display:block;border-radius:6px" alt="System power">
|
||||
<div id="sys-table" style="margin-top:8px;font-size:12px"></div>
|
||||
</div>
|
||||
</div>
|
||||
@@ -256,12 +394,16 @@ let knownGPUs = [];
|
||||
|
||||
function refreshCharts() {
|
||||
const t = '?t=' + Date.now();
|
||||
const srv = document.getElementById('chart-server');
|
||||
if (srv) srv.src = srv.src.split('?')[0] + t;
|
||||
knownGPUs.forEach(idx => {
|
||||
const el = document.getElementById('chart-gpu-' + idx);
|
||||
['chart-server-load','chart-server-temp','chart-server-power'].forEach(id => {
|
||||
const el = document.getElementById(id);
|
||||
if (el) el.src = el.src.split('?')[0] + t;
|
||||
});
|
||||
knownGPUs.forEach(idx => {
|
||||
['load','temp','power'].forEach(kind => {
|
||||
const el = document.getElementById('chart-gpu-' + idx + '-' + kind);
|
||||
if (el) el.src = el.src.split('?')[0] + t;
|
||||
});
|
||||
});
|
||||
}
|
||||
setInterval(refreshCharts, 2000);
|
||||
|
||||
@@ -276,10 +418,19 @@ es.addEventListener('metrics', e => {
|
||||
const div = document.createElement('div');
|
||||
div.className = 'card';
|
||||
div.style.marginBottom = '16px';
|
||||
div.innerHTML = '<div class="card-head">GPU ' + g.index + '</div>' +
|
||||
div.innerHTML =
|
||||
'<div class="card-head">GPU ' + g.index + ' — Load</div>' +
|
||||
'<div class="card-body" style="padding:8px">' +
|
||||
'<img id="chart-gpu-' + g.index + '" src="/api/metrics/chart/gpu/' + g.index + '.svg" style="width:100%;display:block;border-radius:6px" alt="GPU ' + g.index + '">' +
|
||||
'<div id="gpu-table-' + g.index + '" style="margin-top:8px;font-size:12px"></div>' +
|
||||
'<img id="chart-gpu-' + g.index + '-load" src="/api/metrics/chart/gpu/' + g.index + '-load.svg" style="width:100%;display:block;border-radius:6px" alt="GPU ' + g.index + ' load">' +
|
||||
'</div>' +
|
||||
'<div class="card-head">GPU ' + g.index + ' — Temperature</div>' +
|
||||
'<div class="card-body" style="padding:8px">' +
|
||||
'<img id="chart-gpu-' + g.index + '-temp" src="/api/metrics/chart/gpu/' + g.index + '-temp.svg" style="width:100%;display:block;border-radius:6px" alt="GPU ' + g.index + ' temp">' +
|
||||
'</div>' +
|
||||
'<div class="card-head">GPU ' + g.index + ' — Power</div>' +
|
||||
'<div class="card-body" style="padding:8px">' +
|
||||
'<img id="chart-gpu-' + g.index + '-power" src="/api/metrics/chart/gpu/' + g.index + '-power.svg" style="width:100%;display:block;border-radius:6px" alt="GPU ' + g.index + ' power">' +
|
||||
'<div id="gpu-table-' + g.index + '" style="margin-top:8px;font-size:12px"></div>' +
|
||||
'</div>';
|
||||
document.getElementById('gpu-charts').appendChild(div);
|
||||
});
|
||||
@@ -309,15 +460,27 @@ es.onerror = () => {};
|
||||
</script>`
|
||||
}
|
||||
|
||||
// ── Acceptance Tests ──────────────────────────────────────────────────────────
|
||||
// ── Validate (Acceptance Tests) ───────────────────────────────────────────────
|
||||
|
||||
func renderTests() string {
|
||||
return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Run hardware acceptance tests and view results.</p>
|
||||
<div class="grid2">
|
||||
func renderValidate() string {
|
||||
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
|
||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Run All Tests</div>
|
||||
<div class="card-body" style="display:flex;align-items:center;gap:12px;flex-wrap:wrap">
|
||||
<div class="form-row" style="margin:0"><label style="margin-right:6px">Cycles</label><input type="number" id="sat-cycles" value="1" min="1" max="100" style="width:70px;display:inline-block"></div>
|
||||
<button class="btn btn-primary" onclick="runAllSAT()">▶ Run All</button>
|
||||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="grid3">
|
||||
` + renderSATCard("nvidia", "NVIDIA GPU", `<div class="form-row"><label>Diag Level</label><select id="sat-nvidia-level"><option value="1">Level 1 — Quick</option><option value="2">Level 2 — Standard</option><option value="3">Level 3 — Extended</option><option value="4">Level 4 — Full</option></select></div>`) +
|
||||
renderSATCard("memory", "Memory", "") +
|
||||
renderSATCard("storage", "Storage", "") +
|
||||
renderSATCard("cpu", "CPU", `<div class="form-row"><label>Duration (seconds)</label><input type="number" id="sat-cpu-dur" value="60" min="10"></div>`) +
|
||||
renderSATCard("amd", "AMD GPU", "") +
|
||||
`</div>
|
||||
<div id="sat-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Test Output <span id="sat-title"></span></div>
|
||||
@@ -326,82 +489,175 @@ func renderTests() string {
|
||||
<script>
|
||||
let satES = null;
|
||||
function runSAT(target) {
|
||||
if (satES) satES.close();
|
||||
if (satES) { satES.close(); satES = null; }
|
||||
const body = {};
|
||||
if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1;
|
||||
if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60;
|
||||
document.getElementById('sat-output').style.display='block';
|
||||
document.getElementById('sat-title').textContent = '— ' + target;
|
||||
const term = document.getElementById('sat-terminal');
|
||||
term.textContent = 'Starting ' + target + ' test...\n';
|
||||
fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(body)})
|
||||
term.textContent = 'Enqueuing ' + target + ' test...\n';
|
||||
return fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(body)})
|
||||
.then(r => r.json())
|
||||
.then(d => {
|
||||
satES = new EventSource('/api/sat/stream?job_id='+d.job_id);
|
||||
term.textContent += 'Task ' + d.task_id + ' queued. Streaming log...\n';
|
||||
satES = new EventSource('/api/tasks/'+d.task_id+'/stream');
|
||||
satES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
|
||||
satES.addEventListener('done', e => { satES.close(); term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
|
||||
satES.addEventListener('done', e => { satES.close(); satES=null; term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
|
||||
});
|
||||
}
|
||||
function runAllSAT() {
|
||||
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
|
||||
const targets = ['nvidia','memory','storage','cpu','amd'];
|
||||
const total = targets.length * cycles;
|
||||
let enqueued = 0;
|
||||
const status = document.getElementById('sat-all-status');
|
||||
status.textContent = 'Enqueuing...';
|
||||
const enqueueNext = (cycle, idx) => {
|
||||
if (cycle >= cycles) { status.textContent = 'Enqueued '+total+' tasks.'; return; }
|
||||
if (idx >= targets.length) { enqueueNext(cycle+1, 0); return; }
|
||||
const target = targets[idx];
|
||||
const btn = document.getElementById('sat-btn-' + target);
|
||||
if (btn && btn.disabled) { enqueueNext(cycle, idx+1); return; }
|
||||
const body = {};
|
||||
if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1;
|
||||
if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60;
|
||||
fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(body)})
|
||||
.then(r=>r.json()).then(()=>{
|
||||
enqueued++;
|
||||
status.textContent = 'Enqueued '+enqueued+'/'+total+'...';
|
||||
enqueueNext(cycle, idx+1);
|
||||
});
|
||||
};
|
||||
enqueueNext(0, 0);
|
||||
}
|
||||
</script>
|
||||
<script>
|
||||
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
||||
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
|
||||
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
|
||||
});
|
||||
function disableSATCard(id, reason) {
|
||||
const btn = document.getElementById('sat-btn-' + id);
|
||||
if (!btn) return;
|
||||
btn.disabled = true;
|
||||
btn.title = reason;
|
||||
btn.style.opacity = '0.4';
|
||||
const card = btn.closest('.card');
|
||||
if (card) {
|
||||
let note = card.querySelector('.sat-unavail');
|
||||
if (!note) {
|
||||
note = document.createElement('p');
|
||||
note.className = 'sat-unavail';
|
||||
note.style.cssText = 'color:var(--muted);font-size:12px;margin-top:6px';
|
||||
btn.parentNode.insertBefore(note, btn.nextSibling);
|
||||
}
|
||||
note.textContent = reason;
|
||||
}
|
||||
}
|
||||
</script>`
|
||||
}
|
||||
|
||||
// renderSATCard renders one acceptance-test card.
//
// id is the SAT target name; it is used both as the argument to the page's
// runSAT() handler and as the suffix of the button id ("sat-btn-<id>"), which
// the page script looks up to disable cards or skip disabled targets.
// label is the card heading. extra is raw HTML inserted above the button
// (for example a form row); it is emitted verbatim and must already be safe.
func renderSATCard(id, label, extra string) string {
	return fmt.Sprintf(`<div class="card"><div class="card-head">%s</div><div class="card-body">%s<button id="sat-btn-%s" class="btn btn-primary" onclick="runSAT('%s')">▶ Run Test</button></div></div>`,
		label, extra, id, id)
}
|
||||
|
||||
// ── Burn-in ───────────────────────────────────────────────────────────────────
|
||||
// ── Burn ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
// renderBurn returns the body of the Burn page: a wear warning, one card per
// stress target (NVIDIA GPU, CPU, AMD GPU, memory, stressapptest), an output
// terminal, and the client script.
//
// The script's runBurnIn(target) POSTs to /api/sat/<target>/run and streams
// the enqueued task's log from /api/tasks/{task_id}/stream into the terminal.
// A second script queries /api/gpu/presence and disables the NVIDIA and AMD
// buttons (ids "sat-btn-nvidia" and "sat-btn-amd-stress") when the matching
// GPU is reported absent.
func renderBurn() string {
	return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at maximum load. Repeated or prolonged use may reduce hardware lifespan (storage endurance, GPU wear). Use only when necessary.</div>
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
<div class="grid3">
<div class="card"><div class="card-head">NVIDIA GPU Stress</div><div class="card-body">
<div class="form-row"><label>Duration</label><select id="bi-dur"><option value="600">10 minutes</option><option value="3600">1 hour</option><option value="28800">8 hours</option><option value="86400">24 hours</option></select></div>
<button id="sat-btn-nvidia" class="btn btn-primary" onclick="runBurnIn('nvidia')">▶ Start NVIDIA Stress</button>
</div></div>
<div class="card"><div class="card-head">CPU Stress</div><div class="card-body">
<div class="form-row"><label>Duration (seconds)</label><input type="number" id="bi-cpu-dur" value="300" min="60"></div>
<button class="btn btn-primary" onclick="runBurnIn('cpu')">▶ Start CPU Stress</button>
</div></div>
<div class="card"><div class="card-head">AMD GPU Stress</div><div class="card-body">
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Requires ROCm tools (rocm-bandwidth-test). Missing tools reported as UNSUPPORTED.</p>
<button id="sat-btn-amd-stress" class="btn btn-primary" onclick="runBurnIn('amd-stress')">▶ Start AMD Stress</button>
</div></div>
<div class="card"><div class="card-head">Memory Stress</div><div class="card-body">
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">stress-ng --vm writes and verifies memory patterns across all of RAM. Env: <code>BEE_VM_STRESS_SECONDS</code> (default 300), <code>BEE_VM_STRESS_SIZE_MB</code> (default 80%).</p>
<button class="btn btn-primary" onclick="runBurnIn('memory-stress')">▶ Start Memory Stress</button>
</div></div>
<div class="card"><div class="card-head">SAT Stress (stressapptest)</div><div class="card-body">
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Google stressapptest saturates CPU, memory and cache buses simultaneously. Env: <code>BEE_SAT_STRESS_SECONDS</code> (default 300), <code>BEE_SAT_STRESS_MB</code> (default auto).</p>
<button class="btn btn-primary" onclick="runBurnIn('sat-stress')">▶ Start SAT Stress</button>
</div></div>
</div>
<div id="bi-output" style="display:none;margin-top:16px" class="card">
<div class="card-head">Output <span id="bi-title"></span></div>
<div class="card-body"><div id="bi-terminal" class="terminal"></div></div>
</div>
<script>
let biES = null;
function runBurnIn(target) {
if (biES) { biES.close(); biES = null; }
const body = {};
if (target === 'nvidia') body.duration = parseInt(document.getElementById('bi-dur').value)||600;
if (target === 'cpu') body.duration = parseInt(document.getElementById('bi-cpu-dur').value)||300;
document.getElementById('bi-output').style.display='block';
document.getElementById('bi-title').textContent = '— ' + target;
const term = document.getElementById('bi-terminal');
term.textContent = 'Enqueuing ' + target + ' stress...\n';
fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(body)})
.then(r => r.json())
.then(d => {
term.textContent += 'Task ' + d.task_id + ' queued.\n';
biES = new EventSource('/api/tasks/'+d.task_id+'/stream');
biES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
biES.addEventListener('done', e => { biES.close(); biES=null; term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
});
}
</script>
<script>
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
if (!gp.amd) disableSATCard('amd-stress', 'No AMD GPU detected');
});
function disableSATCard(id, reason) {
const btn = document.getElementById('sat-btn-' + id);
if (!btn) return;
btn.disabled = true;
btn.title = reason;
btn.style.opacity = '0.4';
const card = btn.closest('.card');
if (card) {
let note = card.querySelector('.sat-unavail');
if (!note) {
note = document.createElement('p');
note.className = 'sat-unavail';
note.style.cssText = 'color:var(--muted);font-size:12px;margin-top:6px';
btn.parentNode.insertBefore(note, btn.nextSibling);
}
note.textContent = reason;
}
}
</script>`
}
|
||||
|
||||
// ── Network ───────────────────────────────────────────────────────────────────
|
||||
|
||||
func renderNetwork() string {
|
||||
return `<div class="card"><div class="card-head">Network Interfaces</div><div class="card-body">
|
||||
// renderNetworkInline returns the network UI without a wrapping card (for embedding in Tools).
|
||||
func renderNetworkInline() string {
|
||||
return `<div id="net-pending" style="display:none" class="alert alert-warn">
|
||||
<strong>⚠ Network change applied.</strong> Reverting in <span id="net-countdown">60</span>s unless confirmed.
|
||||
<button class="btn btn-primary btn-sm" style="margin-left:8px" onclick="confirmNetChange()">Confirm</button>
|
||||
<button class="btn btn-secondary btn-sm" style="margin-left:4px" onclick="rollbackNetChange()">Rollback</button>
|
||||
</div>
|
||||
<div id="iface-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
||||
</div></div>
|
||||
<div class="grid2">
|
||||
<div class="card"><div class="card-head">DHCP</div><div class="card-body">
|
||||
<div class="grid2" style="margin-top:16px">
|
||||
<div><div style="font-weight:700;font-size:13px;margin-bottom:8px">DHCP</div>
|
||||
<div class="form-row"><label>Interface (leave empty for all)</label><input type="text" id="dhcp-iface" placeholder="eth0"></div>
|
||||
<button class="btn btn-primary" onclick="runDHCP()">▶ Run DHCP</button>
|
||||
<button class="btn btn-primary" onclick="runDHCP()">▶ Run DHCP</button>
|
||||
<div id="dhcp-out" style="margin-top:10px;font-size:12px;color:var(--ok-fg)"></div>
|
||||
</div></div>
|
||||
<div class="card"><div class="card-head">Static IPv4</div><div class="card-body">
|
||||
</div>
|
||||
<div><div style="font-weight:700;font-size:13px;margin-bottom:8px">Static IPv4</div>
|
||||
<div class="form-row"><label>Interface</label><input type="text" id="st-iface" placeholder="eth0"></div>
|
||||
<div class="form-row"><label>Address</label><input type="text" id="st-addr" placeholder="192.168.1.100"></div>
|
||||
<div class="form-row"><label>Prefix length</label><input type="text" id="st-prefix" placeholder="24"></div>
|
||||
@@ -409,19 +665,52 @@ func renderNetwork() string {
|
||||
<div class="form-row"><label>DNS (comma-separated)</label><input type="text" id="st-dns" placeholder="8.8.8.8,8.8.4.4"></div>
|
||||
<button class="btn btn-primary" onclick="setStatic()">Apply Static IP</button>
|
||||
<div id="static-out" style="margin-top:10px;font-size:12px;color:var(--ok-fg)"></div>
|
||||
</div></div>
|
||||
</div>
|
||||
</div>
|
||||
<script>
|
||||
var _netCountdownTimer = null;
|
||||
function loadNetwork() {
|
||||
fetch('/api/network').then(r=>r.json()).then(d => {
|
||||
const rows = (d.interfaces||[]).map(i =>
|
||||
'<tr><td>'+i.Name+'</td><td><span class="badge '+(i.State==='up'?'badge-ok':'badge-warn')+'">'+i.State+'</span></td><td>'+(i.IPv4||[]).join(', ')+'</td></tr>'
|
||||
'<tr><td>'+i.Name+'</td>' +
|
||||
'<td style="cursor:pointer" onclick="toggleIface(\''+i.Name+'\',\''+i.State+'\')" title="Click to toggle"><span class="badge '+(i.State==='up'?'badge-ok':'badge-warn')+'">'+i.State+'</span></td>' +
|
||||
'<td>'+(i.IPv4||[]).join(', ')+'</td></tr>'
|
||||
).join('');
|
||||
document.getElementById('iface-table').innerHTML =
|
||||
'<table><tr><th>Interface</th><th>State</th><th>Addresses</th></tr>'+rows+'</table>' +
|
||||
'<table><tr><th>Interface</th><th>State (click to toggle)</th><th>Addresses</th></tr>'+rows+'</table>' +
|
||||
(d.default_route ? '<p style="font-size:12px;color:var(--muted);margin-top:8px">Default route: '+d.default_route+'</p>' : '');
|
||||
});
|
||||
}
|
||||
function toggleIface(iface, currentState) {
|
||||
fetch('/api/network/toggle',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({iface:iface})})
|
||||
.then(r=>r.json()).then(d => {
|
||||
if (d.error) { alert('Error: '+d.error); return; }
|
||||
loadNetwork();
|
||||
showNetPending(d.rollback_in || 60);
|
||||
});
|
||||
}
|
||||
function showNetPending(secs) {
|
||||
const el = document.getElementById('net-pending');
|
||||
el.style.display = 'block';
|
||||
if (_netCountdownTimer) clearInterval(_netCountdownTimer);
|
||||
let remaining = secs;
|
||||
document.getElementById('net-countdown').textContent = remaining;
|
||||
_netCountdownTimer = setInterval(function() {
|
||||
remaining--;
|
||||
document.getElementById('net-countdown').textContent = remaining;
|
||||
if (remaining <= 0) { clearInterval(_netCountdownTimer); _netCountdownTimer=null; el.style.display='none'; loadNetwork(); }
|
||||
}, 1000);
|
||||
}
|
||||
function confirmNetChange() {
|
||||
if (_netCountdownTimer) { clearInterval(_netCountdownTimer); _netCountdownTimer=null; }
|
||||
document.getElementById('net-pending').style.display='none';
|
||||
fetch('/api/network/confirm',{method:'POST'});
|
||||
}
|
||||
function rollbackNetChange() {
|
||||
if (_netCountdownTimer) { clearInterval(_netCountdownTimer); _netCountdownTimer=null; }
|
||||
document.getElementById('net-pending').style.display='none';
|
||||
fetch('/api/network/rollback',{method:'POST'}).then(()=>loadNetwork());
|
||||
}
|
||||
function runDHCP() {
|
||||
const iface = document.getElementById('dhcp-iface').value.trim();
|
||||
fetch('/api/network/dhcp',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({interface:iface||'all'})})
|
||||
@@ -447,13 +736,17 @@ loadNetwork();
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderNetwork() string {
|
||||
return `<div class="card"><div class="card-head">Network Interfaces</div><div class="card-body">` +
|
||||
renderNetworkInline() +
|
||||
`</div></div>`
|
||||
}
|
||||
|
||||
// ── Services ──────────────────────────────────────────────────────────────────
|
||||
|
||||
func renderServices() string {
|
||||
return `<div class="card"><div class="card-head">Bee Services <button class="btn btn-sm btn-secondary" onclick="loadServices()" style="margin-left:auto">↻ Refresh</button></div>
|
||||
<div class="card-body">
|
||||
func renderServicesInline() string {
|
||||
return `<div style="display:flex;justify-content:flex-end;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="loadServices()">↻ Refresh</button></div>
|
||||
<div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
||||
</div></div>
|
||||
<div id="svc-out" style="display:none;margin-top:8px" class="card">
|
||||
<div class="card-head">Output</div>
|
||||
<div class="card-body" style="padding:10px"><div id="svc-terminal" class="terminal" style="max-height:150px"></div></div>
|
||||
@@ -497,6 +790,12 @@ loadServices();
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderServices() string {
|
||||
return `<div class="card"><div class="card-head">Bee Services</div><div class="card-body">` +
|
||||
renderServicesInline() +
|
||||
`</div></div>`
|
||||
}
|
||||
|
||||
// ── Export ────────────────────────────────────────────────────────────────────
|
||||
|
||||
func renderExport(exportDir string) string {
|
||||
@@ -546,14 +845,56 @@ func listExportFiles(exportDir string) ([]string, error) {
|
||||
// ── Tools ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
func renderTools() string {
|
||||
return `<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">↻ Check</button></div>
|
||||
<div class="card-body"><div id="tools-table"><p style="color:var(--muted);font-size:13px">Click Check to verify installed tools.</p></div></div></div>
|
||||
return `<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Install to RAM</div>
|
||||
<div class="card-body">
|
||||
<p id="ram-status-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Checking...</p>
|
||||
<button id="ram-install-btn" class="btn btn-primary" onclick="installToRAM()" style="display:none">▶ Copy to RAM</button>
|
||||
</div>
|
||||
</div>
|
||||
<script>
|
||||
fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
|
||||
const txt = document.getElementById('ram-status-text');
|
||||
const btn = document.getElementById('ram-install-btn');
|
||||
if (d.in_ram) {
|
||||
txt.textContent = '✓ Running from RAM — installation media can be safely disconnected.';
|
||||
txt.style.color = 'var(--ok, green)';
|
||||
} else {
|
||||
txt.textContent = 'Live media is mounted from installation device. Copy to RAM to allow media removal.';
|
||||
btn.style.display = '';
|
||||
}
|
||||
});
|
||||
function installToRAM() {
|
||||
document.getElementById('ram-install-btn').disabled = true;
|
||||
fetch('/api/system/install-to-ram', {method:'POST'}).then(r=>r.json()).then(d=>{
|
||||
window.location.href = '/tasks#' + d.task_id;
|
||||
});
|
||||
}
|
||||
</script>
|
||||
|
||||
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||
<a class="btn btn-primary" href="/export/support.tar.gz">↓ Download Support Bundle</a>
|
||||
</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">↻ Check</button></div>
|
||||
<div class="card-body"><div id="tools-table"><p style="color:var(--muted);font-size:13px">Checking...</p></div></div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Network</div><div class="card-body">` +
|
||||
renderNetworkInline() + `</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Services</div><div class="card-body">` +
|
||||
renderServicesInline() + `</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Install to Disk</div><div class="card-body">` +
|
||||
renderInstallInline() + `</div></div>
|
||||
|
||||
<script>
|
||||
function checkTools() {
|
||||
document.getElementById('tools-table').innerHTML = '<p style="color:var(--muted);font-size:13px">Checking...</p>';
|
||||
fetch('/api/tools/check').then(r=>r.json()).then(tools => {
|
||||
const rows = tools.map(t =>
|
||||
'<tr><td>'+t.Name+'</td><td><span class="badge '+(t.OK ? 'badge-ok' : 'badge-err')+'">'+(t.OK ? '✓ '+t.Path : '✗ missing')+'</span></td></tr>'
|
||||
'<tr><td>'+t.Name+'</td><td><span class="badge '+(t.OK ? 'badge-ok' : 'badge-err')+'">'+(t.OK ? '✓ '+t.Path : '✗ missing')+'</span></td></tr>'
|
||||
).join('');
|
||||
document.getElementById('tools-table').innerHTML =
|
||||
'<table><tr><th>Tool</th><th>Status</th></tr>'+rows+'</table>';
|
||||
@@ -565,11 +906,8 @@ checkTools();
|
||||
|
||||
// ── Install to Disk ──────────────────────────────────────────────────────────
|
||||
|
||||
func renderInstall() string {
|
||||
func renderInstallInline() string {
|
||||
return `
|
||||
<div class="card">
|
||||
<div class="card-head">Install Live System to Disk</div>
|
||||
<div class="card-body">
|
||||
<div class="alert alert-warn" style="margin-bottom:16px">
|
||||
<strong>Warning:</strong> Installing will <strong>completely erase</strong> the selected
|
||||
disk and write the live system onto it. All existing data on the target disk will be lost.
|
||||
@@ -767,6 +1105,107 @@ installRefreshDisks();
|
||||
`
|
||||
}
|
||||
|
||||
func renderInstall() string {
|
||||
return `<div class="card"><div class="card-head">Install Live System to Disk</div><div class="card-body">` +
|
||||
renderInstallInline() +
|
||||
`</div></div>`
|
||||
}
|
||||
|
||||
// ── Tasks ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
// renderTasks returns the body of the Tasks page.
//
// The markup contains a "Cancel All" button, a table container filled by
// the embedded script from GET /api/tasks (reloaded every 2 seconds), and a
// hidden log panel that streams a task's output from
// /api/tasks/{id}/stream through an EventSource.
//
// Task names are interpolated into innerHTML, so the script's escHtml helper
// must turn &, <, > and " into their HTML entities.
func renderTasks() string {
	return `<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px">
  <button class="btn btn-danger btn-sm" onclick="cancelAll()">Cancel All</button>
  <span style="font-size:12px;color:var(--muted)">Tasks run one at a time. Logs persist after navigation.</span>
</div>
<div class="card">
  <div id="tasks-table"><p style="color:var(--muted);font-size:13px;padding:16px">Loading...</p></div>
</div>
<div id="task-log-section" style="display:none;margin-top:16px" class="card">
  <div class="card-head">Logs — <span id="task-log-title"></span>
    <button class="btn btn-sm btn-secondary" onclick="closeTaskLog()" style="margin-left:auto">✕</button>
  </div>
  <div class="card-body"><div id="task-log-terminal" class="terminal" style="max-height:500px"></div></div>
</div>
<script>
var _taskLogES = null;
var _taskRefreshTimer = null;

function loadTasks() {
  fetch('/api/tasks').then(r=>r.json()).then(tasks => {
    if (!tasks || tasks.length === 0) {
      document.getElementById('tasks-table').innerHTML = '<p style="color:var(--muted);font-size:13px;padding:16px">No tasks.</p>';
      return;
    }
    const rows = tasks.map(t => {
      const dur = t.started_at ? formatDur(t.started_at, t.done_at) : '';
      const statusClass = {running:'badge-ok',pending:'badge-unknown',done:'badge-ok',failed:'badge-err',cancelled:'badge-unknown'}[t.status]||'badge-unknown';
      const statusLabel = {running:'▶ running',pending:'pending',done:'✓ done',failed:'✗ failed',cancelled:'cancelled'}[t.status]||t.status;
      let actions = '<button class="btn btn-sm btn-secondary" onclick="viewLog(\''+t.id+'\',\''+escHtml(t.name)+'\')">Logs</button>';
      if (t.status === 'running' || t.status === 'pending') {
        actions += ' <button class="btn btn-sm btn-danger" onclick="cancelTask(\''+t.id+'\')">Cancel</button>';
      }
      if (t.status === 'pending') {
        actions += ' <button class="btn btn-sm btn-secondary" onclick="setPriority(\''+t.id+'\',1)" title="Increase priority">⇧</button>';
        actions += ' <button class="btn btn-sm btn-secondary" onclick="setPriority(\''+t.id+'\',-1)" title="Decrease priority">⇩</button>';
      }
      return '<tr><td>'+escHtml(t.name)+'</td>' +
        '<td><span class="badge '+statusClass+'">'+statusLabel+'</span></td>' +
        '<td style="font-size:12px;color:var(--muted)">'+fmtTime(t.created_at)+'</td>' +
        '<td style="font-size:12px;color:var(--muted)">'+dur+'</td>' +
        '<td>'+t.priority+'</td>' +
        '<td>'+actions+'</td></tr>';
    }).join('');
    document.getElementById('tasks-table').innerHTML =
      '<table><tr><th>Name</th><th>Status</th><th>Created</th><th>Duration</th><th>Priority</th><th>Actions</th></tr>'+rows+'</table>';
  });
}

function escHtml(s) { return (s||'').replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;'); }
function fmtTime(s) { if (!s) return ''; try { return new Date(s).toLocaleTimeString(); } catch(e){ return s; } }
function formatDur(start, end) {
  try {
    const s = new Date(start), e = end ? new Date(end) : new Date();
    const sec = Math.round((e-s)/1000);
    if (sec < 60) return sec+'s';
    const m = Math.floor(sec/60), ss = sec%60;
    return m+'m '+ss+'s';
  } catch(e){ return ''; }
}

function cancelTask(id) {
  fetch('/api/tasks/'+id+'/cancel',{method:'POST'}).then(()=>loadTasks());
}
function cancelAll() {
  fetch('/api/tasks/cancel-all',{method:'POST'}).then(()=>loadTasks());
}
function setPriority(id, delta) {
  fetch('/api/tasks/'+id+'/priority',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({delta:delta})})
    .then(()=>loadTasks());
}
function viewLog(id, name) {
  if (_taskLogES) { _taskLogES.close(); _taskLogES = null; }
  document.getElementById('task-log-section').style.display = '';
  document.getElementById('task-log-title').textContent = name;
  const term = document.getElementById('task-log-terminal');
  term.textContent = 'Connecting...\n';
  _taskLogES = new EventSource('/api/tasks/'+id+'/stream');
  _taskLogES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
  _taskLogES.addEventListener('done', e => {
    _taskLogES.close(); _taskLogES=null;
    term.textContent += (e.data ? '\nERROR: '+e.data : '\nDone.')+'\n';
  });
}
function closeTaskLog() {
  if (_taskLogES) { _taskLogES.close(); _taskLogES=null; }
  document.getElementById('task-log-section').style.display='none';
}

loadTasks();
_taskRefreshTimer = setInterval(loadTasks, 2000);
</script>`
}
|
||||
|
||||
func renderExportIndex(exportDir string) (string, error) {
|
||||
entries, err := listExportFiles(exportDir)
|
||||
if err != nil {
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"mime"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
@@ -20,6 +21,17 @@ import (
|
||||
|
||||
const defaultTitle = "Bee Hardware Audit"
|
||||
|
||||
func init() {
	// In some LiveCD ramdisk environments /usr/share/mime/globs2 is present
	// but fails with an I/O error part-way through the read. The mime package
	// panics in that situation instead of returning an error, which would
	// take down the first HTTP goroutine that serves a static file.
	// Force the package to initialise here, under recover, so later calls
	// are safe.
	warmMIMETable := func() {
		defer func() { _ = recover() }()
		_ = mime.TypeByExtension(".gz")
	}
	warmMIMETable()
}
|
||||
|
||||
// HandlerOptions configures the web UI handler.
|
||||
type HandlerOptions struct {
|
||||
Title string
|
||||
@@ -31,14 +43,14 @@ type HandlerOptions struct {
|
||||
|
||||
// metricsRing holds a rolling window of live metric samples.
|
||||
type metricsRing struct {
|
||||
mu sync.Mutex
|
||||
vals []float64
|
||||
labels []string
|
||||
size int
|
||||
mu sync.Mutex
|
||||
vals []float64
|
||||
times []time.Time
|
||||
size int
|
||||
}
|
||||
|
||||
func newMetricsRing(size int) *metricsRing {
|
||||
return &metricsRing{size: size, vals: make([]float64, 0, size), labels: make([]string, 0, size)}
|
||||
return &metricsRing{size: size, vals: make([]float64, 0, size), times: make([]time.Time, 0, size)}
|
||||
}
|
||||
|
||||
func (r *metricsRing) push(v float64) {
|
||||
@@ -46,20 +58,40 @@ func (r *metricsRing) push(v float64) {
|
||||
defer r.mu.Unlock()
|
||||
if len(r.vals) >= r.size {
|
||||
r.vals = r.vals[1:]
|
||||
r.labels = r.labels[1:]
|
||||
r.times = r.times[1:]
|
||||
}
|
||||
r.vals = append(r.vals, v)
|
||||
r.labels = append(r.labels, time.Now().Format("15:04"))
|
||||
r.times = append(r.times, time.Now())
|
||||
}
|
||||
|
||||
func (r *metricsRing) snapshot() ([]float64, []string) {
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
v := make([]float64, len(r.vals))
|
||||
l := make([]string, len(r.labels))
|
||||
copy(v, r.vals)
|
||||
copy(l, r.labels)
|
||||
return v, l
|
||||
now := time.Now()
|
||||
labels := make([]string, len(r.times))
|
||||
for i, t := range r.times {
|
||||
labels[i] = relAgeLabel(now.Sub(t))
|
||||
}
|
||||
return v, labels
|
||||
}
|
||||
|
||||
// relAgeLabel formats how long ago a sample was taken as a short negative
// offset for a chart axis: "0" for now (or a non-positive age), "-<1m" below
// one minute, then whole minutes ("-5m"), whole hours ("-2h") and whole days
// ("-3d"). Partial units are truncated, not rounded.
func relAgeLabel(age time.Duration) string {
	const day = 24 * time.Hour
	switch {
	case age <= 0:
		return "0"
	case age >= day:
		return fmt.Sprintf("-%dd", int(age.Hours()/24))
	case age >= time.Hour:
		return fmt.Sprintf("-%dh", int(age.Hours()))
	}
	if minutes := int(age.Minutes()); minutes > 0 {
		return fmt.Sprintf("-%dm", minutes)
	}
	return "-<1m"
}
|
||||
|
||||
// gpuRings holds per-GPU ring buffers.
|
||||
@@ -70,6 +102,14 @@ type gpuRings struct {
|
||||
Power *metricsRing
|
||||
}
|
||||
|
||||
// pendingNetChange describes a network state change that has been applied
// but not yet confirmed.
type pendingNetChange struct {
	// iface identifies the affected interface (presumably its name — confirm
	// against the network handlers).
	iface string
	// wasUp presumably records whether the interface was up before the
	// change, so a rollback can restore it — TODO confirm.
	wasUp bool
	// timer is the timer associated with the pending change.
	// NOTE(review): likely fires the rollback on timeout; verify in the handlers.
	timer *time.Timer
	mu    sync.Mutex
}
|
||||
|
||||
// handler is the HTTP handler for the web UI.
|
||||
type handler struct {
|
||||
opts HandlerOptions
|
||||
@@ -87,6 +127,9 @@ type handler struct {
|
||||
// install job (at most one at a time)
|
||||
installJob *jobState
|
||||
installMu sync.Mutex
|
||||
// pending network change (rollback on timeout)
|
||||
pendingNet *pendingNetChange
|
||||
pendingNetMu sync.Mutex
|
||||
}
|
||||
|
||||
// NewHandler creates the HTTP mux with all routes.
|
||||
@@ -108,6 +151,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
ringMemLoad: newMetricsRing(120),
|
||||
ringPower: newMetricsRing(120),
|
||||
}
|
||||
globalQueue.startWorker(&opts)
|
||||
mux := http.NewServeMux()
|
||||
|
||||
// ── Infrastructure ──────────────────────────────────────────────────────
|
||||
@@ -131,9 +175,20 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
mux.HandleFunc("POST /api/sat/memory/run", h.handleAPISATRun("memory"))
|
||||
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
|
||||
mux.HandleFunc("POST /api/sat/cpu/run", h.handleAPISATRun("cpu"))
|
||||
mux.HandleFunc("POST /api/sat/amd/run", h.handleAPISATRun("amd"))
|
||||
mux.HandleFunc("POST /api/sat/amd-stress/run", h.handleAPISATRun("amd-stress"))
|
||||
mux.HandleFunc("POST /api/sat/memory-stress/run", h.handleAPISATRun("memory-stress"))
|
||||
mux.HandleFunc("POST /api/sat/sat-stress/run", h.handleAPISATRun("sat-stress"))
|
||||
mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
|
||||
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
||||
|
||||
// Tasks
|
||||
mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
|
||||
mux.HandleFunc("POST /api/tasks/cancel-all", h.handleAPITasksCancelAll)
|
||||
mux.HandleFunc("POST /api/tasks/{id}/cancel", h.handleAPITasksCancel)
|
||||
mux.HandleFunc("POST /api/tasks/{id}/priority", h.handleAPITasksPriority)
|
||||
mux.HandleFunc("GET /api/tasks/{id}/stream", h.handleAPITasksStream)
|
||||
|
||||
// Services
|
||||
mux.HandleFunc("GET /api/services", h.handleAPIServicesList)
|
||||
mux.HandleFunc("POST /api/services/action", h.handleAPIServicesAction)
|
||||
@@ -142,6 +197,9 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
mux.HandleFunc("GET /api/network", h.handleAPINetworkStatus)
|
||||
mux.HandleFunc("POST /api/network/dhcp", h.handleAPINetworkDHCP)
|
||||
mux.HandleFunc("POST /api/network/static", h.handleAPINetworkStatic)
|
||||
mux.HandleFunc("POST /api/network/toggle", h.handleAPINetworkToggle)
|
||||
mux.HandleFunc("POST /api/network/confirm", h.handleAPINetworkConfirm)
|
||||
mux.HandleFunc("POST /api/network/rollback", h.handleAPINetworkRollback)
|
||||
|
||||
// Export
|
||||
mux.HandleFunc("GET /api/export/list", h.handleAPIExportList)
|
||||
@@ -150,6 +208,13 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
// Tools
|
||||
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
|
||||
|
||||
// GPU presence
|
||||
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
||||
|
||||
// System
|
||||
mux.HandleFunc("GET /api/system/ram-status", h.handleAPIRAMStatus)
|
||||
mux.HandleFunc("POST /api/system/install-to-ram", h.handleAPIInstallToRAM)
|
||||
|
||||
// Preflight
|
||||
mux.HandleFunc("GET /api/preflight", h.handleAPIPreflight)
|
||||
|
||||
@@ -197,6 +262,11 @@ func (h *handler) handleAuditJSON(w http.ResponseWriter, r *http.Request) {
|
||||
http.Error(w, fmt.Sprintf("read audit snapshot: %v", err), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
// Re-apply SAT overlay on every request so that SAT results run after the
|
||||
// last audit always appear in the downloaded JSON without needing a re-audit.
|
||||
if overlaid, err := app.ApplySATOverlay(data); err == nil {
|
||||
data = overlaid
|
||||
}
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
w.Header().Set("Content-Type", "application/json; charset=utf-8")
|
||||
_, _ = w.Write(data)
|
||||
@@ -240,9 +310,33 @@ func (h *handler) handleExportFile(w http.ResponseWriter, r *http.Request) {
|
||||
http.Error(w, "invalid path", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
// Set Content-Type explicitly to avoid mime.TypeByExtension which panics on
|
||||
// LiveCD environments where /usr/share/mime/globs2 has an I/O read error.
|
||||
w.Header().Set("Content-Type", mimeByExt(filepath.Ext(clean)))
|
||||
http.ServeFile(w, r, filepath.Join(h.opts.ExportDir, clean))
|
||||
}
|
||||
|
||||
// mimeByExt maps a file extension (including the leading dot, matched
// case-insensitively) to a Content-Type value. Extensions it does not know
// yield application/octet-stream. It is a fixed table so that callers never
// have to go through mime.TypeByExtension.
func mimeByExt(ext string) string {
	contentType := "application/octet-stream"
	switch normalized := strings.ToLower(ext); normalized {
	case ".html":
		contentType = "text/html; charset=utf-8"
	case ".txt", ".log":
		contentType = "text/plain; charset=utf-8"
	case ".svg":
		contentType = "image/svg+xml"
	case ".json":
		contentType = "application/json"
	case ".tar":
		contentType = "application/x-tar"
	case ".gz":
		contentType = "application/gzip"
	}
	return contentType
}
|
||||
|
||||
func (h *handler) handleExportIndex(w http.ResponseWriter, r *http.Request) {
|
||||
body, err := renderExportIndex(h.opts.ExportDir)
|
||||
if err != nil {
|
||||
@@ -274,18 +368,35 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
||||
var names []string
|
||||
var labels []string
|
||||
var title string
|
||||
var yMin, yMax *float64 // nil = auto; for load charts fixed 0-100
|
||||
|
||||
switch {
|
||||
case path == "server":
|
||||
title = "Server"
|
||||
vCPUTemp, l := h.ringCPUTemp.snapshot()
|
||||
vCPULoad, _ := h.ringCPULoad.snapshot()
|
||||
// ── Server sub-charts ─────────────────────────────────────────────────
|
||||
case path == "server-load":
|
||||
title = "CPU / Memory Load"
|
||||
vCPULoad, l := h.ringCPULoad.snapshot()
|
||||
vMemLoad, _ := h.ringMemLoad.snapshot()
|
||||
vPower, _ := h.ringPower.snapshot()
|
||||
labels = l
|
||||
datasets = [][]float64{vCPUTemp, vCPULoad, vMemLoad, vPower}
|
||||
names = []string{"CPU Temp °C", "CPU Load %", "Mem Load %", "Power W"}
|
||||
datasets = [][]float64{vCPULoad, vMemLoad}
|
||||
names = []string{"CPU Load %", "Mem Load %"}
|
||||
yMin = floatPtr(0)
|
||||
yMax = floatPtr(100)
|
||||
|
||||
case path == "server-temp":
|
||||
title = "CPU Temperature"
|
||||
vCPUTemp, l := h.ringCPUTemp.snapshot()
|
||||
labels = l
|
||||
datasets = [][]float64{vCPUTemp}
|
||||
names = []string{"CPU Temp °C"}
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(vCPUTemp)
|
||||
|
||||
case path == "server-power":
|
||||
title = "Power & Fans"
|
||||
vPower, l := h.ringPower.snapshot()
|
||||
labels = l
|
||||
datasets = [][]float64{vPower}
|
||||
names = []string{"Power W"}
|
||||
h.ringsMu.Lock()
|
||||
for i, fr := range h.ringFans {
|
||||
fv, _ := fr.snapshot()
|
||||
@@ -297,11 +408,20 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
||||
names = append(names, name+" RPM")
|
||||
}
|
||||
h.ringsMu.Unlock()
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(datasets...)
|
||||
|
||||
// ── GPU sub-charts ────────────────────────────────────────────────────
|
||||
case strings.HasPrefix(path, "gpu/"):
|
||||
idxStr := strings.TrimPrefix(path, "gpu/")
|
||||
rest := strings.TrimPrefix(path, "gpu/")
|
||||
// rest is either "{idx}-load", "{idx}-temp", "{idx}-power", or legacy "{idx}"
|
||||
sub := ""
|
||||
if i := strings.LastIndex(rest, "-"); i > 0 {
|
||||
sub = rest[i+1:]
|
||||
rest = rest[:i]
|
||||
}
|
||||
idx := 0
|
||||
fmt.Sscanf(idxStr, "%d", &idx)
|
||||
fmt.Sscanf(rest, "%d", &idx)
|
||||
h.ringsMu.Lock()
|
||||
var gr *gpuRings
|
||||
if idx < len(h.gpuRings) {
|
||||
@@ -312,21 +432,71 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
vTemp, l := gr.Temp.snapshot()
|
||||
vUtil, _ := gr.Util.snapshot()
|
||||
vMemUtil, _ := gr.MemUtil.snapshot()
|
||||
vPower, _ := gr.Power.snapshot()
|
||||
labels = l
|
||||
title = fmt.Sprintf("GPU %d", idx)
|
||||
datasets = [][]float64{vTemp, vUtil, vMemUtil, vPower}
|
||||
names = []string{"Temp °C", "Load %", "Mem %", "Power W"}
|
||||
switch sub {
|
||||
case "load":
|
||||
vUtil, l := gr.Util.snapshot()
|
||||
vMemUtil, _ := gr.MemUtil.snapshot()
|
||||
labels = l
|
||||
title = fmt.Sprintf("GPU %d Load", idx)
|
||||
datasets = [][]float64{vUtil, vMemUtil}
|
||||
names = []string{"Load %", "Mem %"}
|
||||
yMin = floatPtr(0)
|
||||
yMax = floatPtr(100)
|
||||
case "temp":
|
||||
vTemp, l := gr.Temp.snapshot()
|
||||
labels = l
|
||||
title = fmt.Sprintf("GPU %d Temperature", idx)
|
||||
datasets = [][]float64{vTemp}
|
||||
names = []string{"Temp °C"}
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(vTemp)
|
||||
default: // "power" or legacy (no sub)
|
||||
vPower, l := gr.Power.snapshot()
|
||||
labels = l
|
||||
title = fmt.Sprintf("GPU %d Power", idx)
|
||||
datasets = [][]float64{vPower}
|
||||
names = []string{"Power W"}
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(vPower)
|
||||
}
|
||||
|
||||
default:
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
|
||||
// Ensure all datasets same length as labels
|
||||
buf, err := renderChartSVG(title, datasets, names, labels, yMin, yMax)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
w.Header().Set("Content-Type", "image/svg+xml")
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
_, _ = w.Write(buf)
|
||||
}
|
||||
|
||||
// floatPtr stores v in a fresh float64 and returns its address.
func floatPtr(v float64) *float64 {
	p := new(float64)
	*p = v
	return p
}
|
||||
|
||||
// autoMax120 computes a Y-axis maximum that leaves 20% headroom above the
// largest sample found in any of the datasets. It returns nil when no sample
// is greater than zero, which lets the chart library pick its own scale.
func autoMax120(datasets ...[]float64) *float64 {
	var peak float64
	for _, series := range datasets {
		for _, sample := range series {
			if sample > peak {
				peak = sample
			}
		}
	}
	if peak == 0 {
		return nil // let library auto-scale
	}
	limit := peak * 1.2
	return &limit
}
|
||||
|
||||
// renderChartSVG renders a line chart SVG with a fixed Y-axis range.
|
||||
func renderChartSVG(title string, datasets [][]float64, names []string, labels []string, yMin, yMax *float64) ([]byte, error) {
|
||||
n := len(labels)
|
||||
if n == 0 {
|
||||
n = 1
|
||||
@@ -337,31 +507,25 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
||||
datasets[i] = make([]float64, n)
|
||||
}
|
||||
}
|
||||
|
||||
sparse := sparseLabels(labels, 6)
|
||||
|
||||
opt := gocharts.NewLineChartOptionWithData(datasets)
|
||||
opt.Title = gocharts.TitleOption{Text: title}
|
||||
opt.XAxis.Labels = sparse
|
||||
opt.Legend = gocharts.LegendOption{SeriesNames: names}
|
||||
if yMin != nil || yMax != nil {
|
||||
opt.YAxis = []gocharts.YAxisOption{{Min: yMin, Max: yMax}}
|
||||
}
|
||||
|
||||
p := gocharts.NewPainter(gocharts.PainterOptions{
|
||||
OutputFormat: gocharts.ChartOutputSVG,
|
||||
Width: 1400,
|
||||
Height: 280,
|
||||
Height: 240,
|
||||
}, gocharts.PainterThemeOption(gocharts.GetTheme("grafana")))
|
||||
if err := p.LineChart(opt); err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
return nil, err
|
||||
}
|
||||
buf, err := p.Bytes()
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
w.Header().Set("Content-Type", "image/svg+xml")
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
_, _ = w.Write(buf)
|
||||
return p.Bytes()
|
||||
}
|
||||
|
||||
func safeIdx(s []float64, i int) float64 {
|
||||
@@ -392,6 +556,15 @@ func (h *handler) handlePage(w http.ResponseWriter, r *http.Request) {
|
||||
if page == "" {
|
||||
page = "dashboard"
|
||||
}
|
||||
// Redirect old routes to new names
|
||||
switch page {
|
||||
case "tests":
|
||||
http.Redirect(w, r, "/validate", http.StatusMovedPermanently)
|
||||
return
|
||||
case "burn-in":
|
||||
http.Redirect(w, r, "/burn", http.StatusMovedPermanently)
|
||||
return
|
||||
}
|
||||
body := renderPage(page, h.opts)
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||
|
||||
@@ -9,7 +9,7 @@ import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestRootRendersShellWithIframe(t *testing.T) {
|
||||
func TestRootRendersDashboard(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
exportDir := filepath.Join(dir, "export")
|
||||
@@ -31,11 +31,12 @@ func TestRootRendersShellWithIframe(t *testing.T) {
|
||||
if first.Code != http.StatusOK {
|
||||
t.Fatalf("first status=%d", first.Code)
|
||||
}
|
||||
if !strings.Contains(first.Body.String(), `iframe`) || !strings.Contains(first.Body.String(), `src="/viewer"`) {
|
||||
t.Fatalf("first body missing iframe viewer: %s", first.Body.String())
|
||||
// Dashboard should contain the audit modal (with viewer link) and hardware summary
|
||||
if !strings.Contains(first.Body.String(), `openAuditModal`) {
|
||||
t.Fatalf("first body missing audit modal trigger: %s", first.Body.String())
|
||||
}
|
||||
if !strings.Contains(first.Body.String(), "/export/support.tar.gz") {
|
||||
t.Fatalf("first body missing support bundle link: %s", first.Body.String())
|
||||
if !strings.Contains(first.Body.String(), `/viewer`) {
|
||||
t.Fatalf("first body missing viewer link: %s", first.Body.String())
|
||||
}
|
||||
if got := first.Header().Get("Cache-Control"); got != "no-store" {
|
||||
t.Fatalf("first cache-control=%q", got)
|
||||
@@ -50,8 +51,8 @@ func TestRootRendersShellWithIframe(t *testing.T) {
|
||||
if second.Code != http.StatusOK {
|
||||
t.Fatalf("second status=%d", second.Code)
|
||||
}
|
||||
if !strings.Contains(second.Body.String(), `src="/viewer"`) {
|
||||
t.Fatalf("second body missing iframe viewer: %s", second.Body.String())
|
||||
if !strings.Contains(second.Body.String(), `Hardware Summary`) {
|
||||
t.Fatalf("second body missing hardware summary: %s", second.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -103,8 +104,8 @@ func TestAuditJSONServesLatestSnapshot(t *testing.T) {
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
if got := strings.TrimSpace(rec.Body.String()); got != body {
|
||||
t.Fatalf("body=%q want %q", got, body)
|
||||
if !strings.Contains(rec.Body.String(), "SERIAL-API") {
|
||||
t.Fatalf("body missing expected serial: %s", rec.Body.String())
|
||||
}
|
||||
if got := rec.Header().Get("Content-Type"); !strings.Contains(got, "application/json") {
|
||||
t.Fatalf("content-type=%q", got)
|
||||
|
||||
420
audit/internal/webui/tasks.go
Normal file
420
audit/internal/webui/tasks.go
Normal file
@@ -0,0 +1,420 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"sort"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Task statuses.
|
||||
const (
|
||||
TaskPending = "pending"
|
||||
TaskRunning = "running"
|
||||
TaskDone = "done"
|
||||
TaskFailed = "failed"
|
||||
TaskCancelled = "cancelled"
|
||||
)
|
||||
|
||||
// taskNames maps target → human-readable name.
|
||||
var taskNames = map[string]string{
|
||||
"nvidia": "NVIDIA SAT",
|
||||
"memory": "Memory SAT",
|
||||
"storage": "Storage SAT",
|
||||
"cpu": "CPU SAT",
|
||||
"amd": "AMD GPU SAT",
|
||||
"amd-stress": "AMD GPU Burn-in",
|
||||
"memory-stress": "Memory Burn-in",
|
||||
"sat-stress": "SAT Stress (stressapptest)",
|
||||
"audit": "Audit",
|
||||
"install": "Install to Disk",
|
||||
"install-to-ram": "Install to RAM",
|
||||
}
|
||||
|
||||
// Task represents one unit of work in the queue.
|
||||
type Task struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Target string `json:"target"`
|
||||
Priority int `json:"priority"`
|
||||
Status string `json:"status"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
StartedAt *time.Time `json:"started_at,omitempty"`
|
||||
DoneAt *time.Time `json:"done_at,omitempty"`
|
||||
ErrMsg string `json:"error,omitempty"`
|
||||
|
||||
// runtime fields (not serialised)
|
||||
job *jobState
|
||||
params taskParams
|
||||
}
|
||||
|
||||
// taskParams carries the optional settings parsed from a run request.
// Zero values mean "not specified".
type taskParams struct {
	// Duration is the requested run length; the CPU target falls back to 60
	// when it is not positive. (Unit presumably seconds — confirm in the app.)
	Duration int
	// DiagLevel and GPUIndices select the NVIDIA run "with options" when
	// either one is set.
	DiagLevel  int
	GPUIndices []int
	Device     string // for install
}
|
||||
|
||||
// taskQueue manages a priority-ordered list of tasks and runs them one at a time.
|
||||
type taskQueue struct {
|
||||
mu sync.Mutex
|
||||
tasks []*Task
|
||||
trigger chan struct{}
|
||||
opts *HandlerOptions // set by startWorker
|
||||
}
|
||||
|
||||
var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}
|
||||
|
||||
const maxTaskHistory = 50
|
||||
|
||||
// enqueue adds a task to the queue and notifies the worker.
|
||||
func (q *taskQueue) enqueue(t *Task) {
|
||||
q.mu.Lock()
|
||||
q.tasks = append(q.tasks, t)
|
||||
q.prune()
|
||||
q.mu.Unlock()
|
||||
select {
|
||||
case q.trigger <- struct{}{}:
|
||||
default:
|
||||
}
|
||||
}
|
||||
|
||||
// prune removes oldest completed tasks beyond maxTaskHistory.
|
||||
func (q *taskQueue) prune() {
|
||||
var done []*Task
|
||||
var active []*Task
|
||||
for _, t := range q.tasks {
|
||||
switch t.Status {
|
||||
case TaskDone, TaskFailed, TaskCancelled:
|
||||
done = append(done, t)
|
||||
default:
|
||||
active = append(active, t)
|
||||
}
|
||||
}
|
||||
if len(done) > maxTaskHistory {
|
||||
done = done[len(done)-maxTaskHistory:]
|
||||
}
|
||||
q.tasks = append(active, done...)
|
||||
}
|
||||
|
||||
// nextPending returns the highest-priority pending task (nil if none).
|
||||
func (q *taskQueue) nextPending() *Task {
|
||||
var best *Task
|
||||
for _, t := range q.tasks {
|
||||
if t.Status != TaskPending {
|
||||
continue
|
||||
}
|
||||
if best == nil || t.Priority > best.Priority ||
|
||||
(t.Priority == best.Priority && t.CreatedAt.Before(best.CreatedAt)) {
|
||||
best = t
|
||||
}
|
||||
}
|
||||
return best
|
||||
}
|
||||
|
||||
// findByID looks up a task by ID.
|
||||
func (q *taskQueue) findByID(id string) (*Task, bool) {
|
||||
q.mu.Lock()
|
||||
defer q.mu.Unlock()
|
||||
for _, t := range q.tasks {
|
||||
if t.ID == id {
|
||||
return t, true
|
||||
}
|
||||
}
|
||||
return nil, false
|
||||
}
|
||||
|
||||
// findJob returns the jobState for a task ID (for SSE streaming compatibility).
|
||||
func (q *taskQueue) findJob(id string) (*jobState, bool) {
|
||||
t, ok := q.findByID(id)
|
||||
if !ok || t.job == nil {
|
||||
return nil, false
|
||||
}
|
||||
return t.job, true
|
||||
}
|
||||
|
||||
// snapshot returns a copy of all tasks sorted for display (running first, then pending by priority, then done by doneAt desc).
|
||||
func (q *taskQueue) snapshot() []Task {
|
||||
q.mu.Lock()
|
||||
defer q.mu.Unlock()
|
||||
out := make([]Task, len(q.tasks))
|
||||
for i, t := range q.tasks {
|
||||
out[i] = *t
|
||||
}
|
||||
sort.SliceStable(out, func(i, j int) bool {
|
||||
si := statusOrder(out[i].Status)
|
||||
sj := statusOrder(out[j].Status)
|
||||
if si != sj {
|
||||
return si < sj
|
||||
}
|
||||
if out[i].Priority != out[j].Priority {
|
||||
return out[i].Priority > out[j].Priority
|
||||
}
|
||||
return out[i].CreatedAt.Before(out[j].CreatedAt)
|
||||
})
|
||||
return out
|
||||
}
|
||||
|
||||
func statusOrder(s string) int {
|
||||
switch s {
|
||||
case TaskRunning:
|
||||
return 0
|
||||
case TaskPending:
|
||||
return 1
|
||||
default:
|
||||
return 2
|
||||
}
|
||||
}
|
||||
|
||||
// startWorker launches the queue runner goroutine.
|
||||
func (q *taskQueue) startWorker(opts *HandlerOptions) {
|
||||
q.opts = opts
|
||||
go q.worker()
|
||||
}
|
||||
|
||||
func (q *taskQueue) worker() {
|
||||
for {
|
||||
<-q.trigger
|
||||
for {
|
||||
q.mu.Lock()
|
||||
t := q.nextPending()
|
||||
if t == nil {
|
||||
q.mu.Unlock()
|
||||
break
|
||||
}
|
||||
now := time.Now()
|
||||
t.Status = TaskRunning
|
||||
t.StartedAt = &now
|
||||
j := &jobState{}
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
j.cancel = cancel
|
||||
t.job = j
|
||||
q.mu.Unlock()
|
||||
|
||||
q.runTask(t, j, ctx)
|
||||
|
||||
q.mu.Lock()
|
||||
now2 := time.Now()
|
||||
t.DoneAt = &now2
|
||||
if t.Status == TaskRunning { // not cancelled externally
|
||||
if j.err != "" {
|
||||
t.Status = TaskFailed
|
||||
t.ErrMsg = j.err
|
||||
} else {
|
||||
t.Status = TaskDone
|
||||
}
|
||||
}
|
||||
q.prune()
|
||||
q.mu.Unlock()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// runTask executes the work for a task, writing output to j.
|
||||
func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
if q.opts == nil || q.opts.App == nil {
|
||||
j.append("ERROR: app not configured")
|
||||
j.finish("app not configured")
|
||||
return
|
||||
}
|
||||
a := q.opts.App
|
||||
|
||||
j.append(fmt.Sprintf("Starting %s...", t.Name))
|
||||
|
||||
var (
|
||||
archive string
|
||||
err error
|
||||
)
|
||||
|
||||
switch t.Target {
|
||||
case "nvidia":
|
||||
if len(t.params.GPUIndices) > 0 || t.params.DiagLevel > 0 {
|
||||
result, e := a.RunNvidiaAcceptancePackWithOptions(
|
||||
ctx, "", t.params.DiagLevel, t.params.GPUIndices, j.append,
|
||||
)
|
||||
if e != nil {
|
||||
err = e
|
||||
} else {
|
||||
archive = result.Body
|
||||
}
|
||||
} else {
|
||||
archive, err = a.RunNvidiaAcceptancePack("", j.append)
|
||||
}
|
||||
case "memory":
|
||||
archive, err = a.RunMemoryAcceptancePack("", j.append)
|
||||
case "storage":
|
||||
archive, err = a.RunStorageAcceptancePack("", j.append)
|
||||
case "cpu":
|
||||
dur := t.params.Duration
|
||||
if dur <= 0 {
|
||||
dur = 60
|
||||
}
|
||||
archive, err = a.RunCPUAcceptancePack("", dur, j.append)
|
||||
case "amd":
|
||||
archive, err = a.RunAMDAcceptancePack("", j.append)
|
||||
case "amd-stress":
|
||||
archive, err = a.RunAMDStressPack("", j.append)
|
||||
case "memory-stress":
|
||||
archive, err = a.RunMemoryStressPack("", j.append)
|
||||
case "sat-stress":
|
||||
archive, err = a.RunSATStressPack("", j.append)
|
||||
case "audit":
|
||||
result, e := a.RunAuditNow(q.opts.RuntimeMode)
|
||||
if e != nil {
|
||||
err = e
|
||||
} else {
|
||||
for _, line := range splitLines(result.Body) {
|
||||
j.append(line)
|
||||
}
|
||||
}
|
||||
case "install-to-ram":
|
||||
err = a.RunInstallToRAM(j.append)
|
||||
default:
|
||||
j.append("ERROR: unknown target: " + t.Target)
|
||||
j.finish("unknown target")
|
||||
return
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
if ctx.Err() != nil {
|
||||
j.append("Aborted.")
|
||||
j.finish("aborted")
|
||||
} else {
|
||||
j.append("ERROR: " + err.Error())
|
||||
j.finish(err.Error())
|
||||
}
|
||||
return
|
||||
}
|
||||
if archive != "" {
|
||||
j.append("Archive: " + archive)
|
||||
}
|
||||
j.finish("")
|
||||
}
|
||||
|
||||
func splitLines(s string) []string {
|
||||
var out []string
|
||||
for _, l := range splitNL(s) {
|
||||
if l != "" {
|
||||
out = append(out, l)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// splitNL cuts s at every '\n' and returns the pieces in order, without the
// newline characters. Empty pieces are kept, and the text after the last
// newline (possibly empty) is always the final element, so the result has at
// least one element.
func splitNL(s string) []string {
	var parts []string
	begin := 0
	for pos := 0; pos < len(s); pos++ {
		if s[pos] != '\n' {
			continue
		}
		parts = append(parts, s[begin:pos])
		begin = pos + 1
	}
	return append(parts, s[begin:])
}
|
||||
|
||||
// ── HTTP handlers ─────────────────────────────────────────────────────────────
|
||||
|
||||
func (h *handler) handleAPITasksList(w http.ResponseWriter, _ *http.Request) {
|
||||
tasks := globalQueue.snapshot()
|
||||
writeJSON(w, tasks)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPITasksCancel(w http.ResponseWriter, r *http.Request) {
|
||||
id := r.PathValue("id")
|
||||
t, ok := globalQueue.findByID(id)
|
||||
if !ok {
|
||||
writeError(w, http.StatusNotFound, "task not found")
|
||||
return
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
switch t.Status {
|
||||
case TaskPending:
|
||||
t.Status = TaskCancelled
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
writeJSON(w, map[string]string{"status": "cancelled"})
|
||||
case TaskRunning:
|
||||
if t.job != nil {
|
||||
t.job.abort()
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
writeJSON(w, map[string]string{"status": "cancelled"})
|
||||
default:
|
||||
writeError(w, http.StatusConflict, "task is not running or pending")
|
||||
}
|
||||
}
|
||||
|
||||
func (h *handler) handleAPITasksPriority(w http.ResponseWriter, r *http.Request) {
|
||||
id := r.PathValue("id")
|
||||
t, ok := globalQueue.findByID(id)
|
||||
if !ok {
|
||||
writeError(w, http.StatusNotFound, "task not found")
|
||||
return
|
||||
}
|
||||
var req struct {
|
||||
Delta int `json:"delta"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid body")
|
||||
return
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
if t.Status != TaskPending {
|
||||
writeError(w, http.StatusConflict, "only pending tasks can be reprioritised")
|
||||
return
|
||||
}
|
||||
t.Priority += req.Delta
|
||||
writeJSON(w, map[string]int{"priority": t.Priority})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request) {
|
||||
globalQueue.mu.Lock()
|
||||
now := time.Now()
|
||||
n := 0
|
||||
for _, t := range globalQueue.tasks {
|
||||
switch t.Status {
|
||||
case TaskPending:
|
||||
t.Status = TaskCancelled
|
||||
t.DoneAt = &now
|
||||
n++
|
||||
case TaskRunning:
|
||||
if t.job != nil {
|
||||
t.job.abort()
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
t.DoneAt = &now
|
||||
n++
|
||||
}
|
||||
}
|
||||
globalQueue.mu.Unlock()
|
||||
writeJSON(w, map[string]int{"cancelled": n})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPITasksStream(w http.ResponseWriter, r *http.Request) {
|
||||
id := r.PathValue("id")
|
||||
// Wait up to 5s for the task to get a job (it may be pending)
|
||||
deadline := time.Now().Add(5 * time.Second)
|
||||
var j *jobState
|
||||
for time.Now().Before(deadline) {
|
||||
if jj, ok := globalQueue.findJob(id); ok {
|
||||
j = jj
|
||||
break
|
||||
}
|
||||
time.Sleep(200 * time.Millisecond)
|
||||
}
|
||||
if j == nil {
|
||||
http.Error(w, "task not found or not yet started", http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
streamJob(w, r, j)
|
||||
}
|
||||
Reference in New Issue
Block a user