From 0a98ed8ae9b8b0364a0035280afd6ecb68fcbbf3 Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Sat, 28 Mar 2026 21:15:11 +0300 Subject: [PATCH] feat: task queue, UI overhaul, burn tests, install-to-RAM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Task queue: all SAT/audit jobs enqueue and run one-at-a-time; tasks persist past page navigation; new Tasks page with cancel/priority/log stream - UI: consolidate nav (Validate, Burn, Tasks, Tools); Audit becomes modal; Dashboard hardware summary badges + split metrics charts (load/temp/power); Tools page consolidates network, services, install, support bundle - AMD GPU: acceptance test and stress burn cards; GPU presence API greys out irrelevant SAT cards automatically - Burn tests: Memory Stress (stress-ng --vm), SAT Stress (stressapptest) - Install to RAM: copies squashfs to /dev/shm, re-associates loop devices via LOOP_CHANGE_FD ioctl so live media can be ejected - Charts: relative time axis (0 = now, negative left) - memtester: LimitMEMLOCK=infinity in bee-web.service; empty output → UNSUPPORTED - SAT overlay applied dynamically on every /audit.json serve - MIME panic guard for LiveCD ramdisk I/O errors - ISO: add memtest86+, stressapptest packages; memtest86+ GRUB entry; disable screensaver/DPMS in bee-openbox-session - Unknown SAT status severity = 1 (does not override OK) Co-Authored-By: Claude Sonnet 4.6 --- audit/cmd/bee/main.go | 9 +- audit/internal/app/app.go | 112 ++- audit/internal/app/app_test.go | 27 +- audit/internal/app/sat_overlay.go | 8 +- audit/internal/platform/gpu_metrics.go | 52 ++ audit/internal/platform/install_to_ram.go | 178 +++++ .../internal/platform/install_to_ram_linux.go | 28 + .../internal/platform/install_to_ram_other.go | 9 + audit/internal/platform/live_metrics.go | 9 +- audit/internal/platform/network.go | 19 + audit/internal/platform/sat.go | 150 +++- audit/internal/platform/sat_fan_stress.go | 2 +- audit/internal/platform/services.go | 4 + audit/internal/webui/api.go | 323 ++++++--- audit/internal/webui/pages.go | 655 +++++++++++++++--- audit/internal/webui/server.go | 253 +++++-- audit/internal/webui/server_test.go | 19 +- audit/internal/webui/tasks.go | 420 +++++++++++ .../config/bootloaders/grub-pc/grub.cfg | 4 + .../config/package-lists/bee.list.chroot | 2 + .../etc/systemd/system/bee-web.service | 1 + iso/overlay/usr/local/bin/bee-openbox-session | 6 + 22 files changed, 1964 insertions(+), 326 deletions(-) create mode 100644 audit/internal/platform/install_to_ram.go create mode 100644 audit/internal/platform/install_to_ram_linux.go create mode 100644 audit/internal/platform/install_to_ram_other.go create mode 100644 audit/internal/webui/tasks.go diff --git a/audit/cmd/bee/main.go b/audit/cmd/bee/main.go index 2cd6599..ee592e7 100644 --- a/audit/cmd/bee/main.go +++ b/audit/cmd/bee/main.go @@ -346,19 +346,20 @@ func runSAT(args []string, stdout, stderr io.Writer) int { archive string err error ) + logLine := func(s string) { fmt.Fprintln(os.Stderr, s) } switch target { case "nvidia": - archive, err = application.RunNvidiaAcceptancePack("") + archive, err = application.RunNvidiaAcceptancePack("", logLine) case "memory": - archive, err = application.RunMemoryAcceptancePack("") + archive, err = application.RunMemoryAcceptancePack("", logLine) case "storage": - archive, err = application.RunStorageAcceptancePack("") + archive, err = application.RunStorageAcceptancePack("", logLine) case "cpu": dur := *duration if dur <= 0 { dur = 60 } - archive, err = application.RunCPUAcceptancePack("", dur) + archive, err = application.RunCPUAcceptancePack("", dur, logLine) } if err != nil { slog.Error("run sat", "target", target, "err", err) diff --git a/audit/internal/app/app.go b/audit/internal/app/app.go index 836108d..910b549 100644 --- a/audit/internal/app/app.go +++ b/audit/internal/app/app.go @@ -53,6 +53,8 @@ type networkManager interface { DHCPOne(iface string) (string, error) DHCPAll() (string, error) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error) + SetInterfaceState(iface string, up bool) error + GetInterfaceState(iface string) (bool, error) } type serviceManager interface { @@ -75,20 +77,46 @@ type toolManager interface { type installer interface { ListInstallDisks() ([]platform.InstallDisk, error) InstallToDisk(ctx context.Context, device string, logFile string) error + IsLiveMediaInRAM() bool + RunInstallToRAM(logFunc func(string)) error +} + +type GPUPresenceResult struct { + Nvidia bool + AMD bool +} + +func (a *App) DetectGPUPresence() GPUPresenceResult { + vendor := a.sat.DetectGPUVendor() + return GPUPresenceResult{ + Nvidia: vendor == "nvidia", + AMD: vendor == "amd", + } +} + +func (a *App) IsLiveMediaInRAM() bool { + return a.installer.IsLiveMediaInRAM() +} + +func (a *App) RunInstallToRAM(logFunc func(string)) error { + return a.installer.RunInstallToRAM(logFunc) } type satRunner interface { - RunNvidiaAcceptancePack(baseDir string) (string, error) - RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int) (string, error) - RunMemoryAcceptancePack(baseDir string) (string, error) - RunStorageAcceptancePack(baseDir string) (string, error) - RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) + RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) + RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) + RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) + RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) + RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) DetectGPUVendor() string ListAMDGPUs() ([]platform.AMDGPUInfo, error) - RunAMDAcceptancePack(baseDir string) (string, error) + RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) + RunAMDStressPack(baseDir string, logFunc func(string)) (string, error) + RunMemoryStressPack(baseDir string, logFunc func(string)) (string, error) + RunSATStressPack(baseDir string, logFunc func(string)) (string, error) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) - RunNCCLTests(ctx context.Context, baseDir string) (string, error) + RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) } type runtimeChecker interface { @@ -108,6 +136,17 @@ func New(platform *platform.System) *App { } } +// ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results, +// and returns the updated JSON. Used by the web UI to serve always-fresh status. +func ApplySATOverlay(auditJSON []byte) ([]byte, error) { + var snap schema.HardwareIngestRequest + if err := json.Unmarshal(auditJSON, &snap); err != nil { + return nil, err + } + applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir) + return json.MarshalIndent(snap, "", " ") +} + func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, error) { if runtimeMode == runtimeenv.ModeLiveCD { if err := a.runtime.CaptureTechnicalDump(DefaultTechDumpDir); err != nil { @@ -301,6 +340,14 @@ func (a *App) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error) { return a.network.SetStaticIPv4(cfg) } +func (a *App) SetInterfaceState(iface string, up bool) error { + return a.network.SetInterfaceState(iface, up) +} + +func (a *App) GetInterfaceState(iface string) (bool, error) { + return a.network.GetInterfaceState(iface) +} + func (a *App) SetStaticIPv4Result(cfg platform.StaticIPv4Config) (ActionResult, error) { body, err := a.network.SetStaticIPv4(cfg) return ActionResult{Title: "Static IPv4: " + cfg.Interface, Body: bodyOr(body, "Static IPv4 updated.")}, err @@ -416,15 +463,15 @@ func (a *App) AuditLogTailResult() ActionResult { return ActionResult{Title: "Audit log tail", Body: body} } -func (a *App) RunNvidiaAcceptancePack(baseDir string) (string, error) { +func (a *App) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) { if strings.TrimSpace(baseDir) == "" { baseDir = DefaultSATBaseDir } - return a.sat.RunNvidiaAcceptancePack(baseDir) + return a.sat.RunNvidiaAcceptancePack(baseDir, logFunc) } func (a *App) RunNvidiaAcceptancePackResult(baseDir string) (ActionResult, error) { - path, err := a.RunNvidiaAcceptancePack(baseDir) + path, err := a.RunNvidiaAcceptancePack(baseDir, nil) body := "Archive written." if path != "" { body = "Archive written to " + path @@ -436,11 +483,11 @@ func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) { return a.sat.ListNvidiaGPUs() } -func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int) (ActionResult, error) { +func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) { if strings.TrimSpace(baseDir) == "" { baseDir = DefaultSATBaseDir } - path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, diagLevel, gpuIndices) + path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, diagLevel, gpuIndices, logFunc) body := "Archive written." if path != "" { body = "Archive written to " + path @@ -448,39 +495,39 @@ func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir st return ActionResult{Title: "NVIDIA DCGM", Body: body}, err } -func (a *App) RunMemoryAcceptancePack(baseDir string) (string, error) { +func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) { if strings.TrimSpace(baseDir) == "" { baseDir = DefaultSATBaseDir } - return a.sat.RunMemoryAcceptancePack(baseDir) + return a.sat.RunMemoryAcceptancePack(baseDir, logFunc) } func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) { - path, err := a.RunMemoryAcceptancePack(baseDir) + path, err := a.RunMemoryAcceptancePack(baseDir, nil) return ActionResult{Title: "Memory SAT", Body: satResultBody(path)}, err } -func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) { +func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) { if strings.TrimSpace(baseDir) == "" { baseDir = DefaultSATBaseDir } - return a.sat.RunCPUAcceptancePack(baseDir, durationSec) + return a.sat.RunCPUAcceptancePack(baseDir, durationSec, logFunc) } func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (ActionResult, error) { - path, err := a.RunCPUAcceptancePack(baseDir, durationSec) + path, err := a.RunCPUAcceptancePack(baseDir, durationSec, nil) return ActionResult{Title: "CPU SAT", Body: satResultBody(path)}, err } -func (a *App) RunStorageAcceptancePack(baseDir string) (string, error) { +func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) { if strings.TrimSpace(baseDir) == "" { baseDir = DefaultSATBaseDir } - return a.sat.RunStorageAcceptancePack(baseDir) + return a.sat.RunStorageAcceptancePack(baseDir, logFunc) } func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) { - path, err := a.RunStorageAcceptancePack(baseDir) + path, err := a.RunStorageAcceptancePack(baseDir, nil) return ActionResult{Title: "Storage SAT", Body: satResultBody(path)}, err } @@ -492,18 +539,33 @@ func (a *App) ListAMDGPUs() ([]platform.AMDGPUInfo, error) { return a.sat.ListAMDGPUs() } -func (a *App) RunAMDAcceptancePack(baseDir string) (string, error) { +func (a *App) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) { if strings.TrimSpace(baseDir) == "" { baseDir = DefaultSATBaseDir } - return a.sat.RunAMDAcceptancePack(baseDir) + return a.sat.RunAMDAcceptancePack(baseDir, logFunc) } func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) { - path, err := a.RunAMDAcceptancePack(baseDir) + path, err := a.RunAMDAcceptancePack(baseDir, nil) return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err } +func (a *App) RunMemoryStressPack(baseDir string, logFunc func(string)) (string, error) { + return a.sat.RunMemoryStressPack(baseDir, logFunc) +} + +func (a *App) RunSATStressPack(baseDir string, logFunc func(string)) (string, error) { + return a.sat.RunSATStressPack(baseDir, logFunc) +} + +func (a *App) RunAMDStressPack(baseDir string, logFunc func(string)) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultSATBaseDir + } + return a.sat.RunAMDStressPack(baseDir, logFunc) +} + func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) { if strings.TrimSpace(baseDir) == "" { baseDir = DefaultSATBaseDir @@ -512,7 +574,7 @@ func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platfor } func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) { - path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir) + path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil) body := "Results: " + path if err != nil && err != context.Canceled { body += "\nERROR: " + err.Error() diff --git a/audit/internal/app/app_test.go b/audit/internal/app/app_test.go index bd1d79e..f6a4d87 100644 --- a/audit/internal/app/app_test.go +++ b/audit/internal/app/app_test.go @@ -43,6 +43,9 @@ func (f fakeNetwork) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error return f.setStaticIPv4Fn(cfg) } +func (f fakeNetwork) SetInterfaceState(_ string, _ bool) error { return nil } +func (f fakeNetwork) GetInterfaceState(_ string) (bool, error) { return true, nil } + type fakeServices struct { serviceStatusFn func(string) (string, error) serviceDoFn func(string, platform.ServiceAction) (string, error) @@ -123,11 +126,11 @@ type fakeSAT struct { listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error) } -func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string) (string, error) { +func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) { return f.runNvidiaFn(baseDir) } -func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir string, _ int, _ []int) (string, error) { +func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir string, _ int, _ []int, _ func(string)) (string, error) { return f.runNvidiaFn(baseDir) } @@ -138,15 +141,15 @@ func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) { return nil, nil } -func (f fakeSAT) RunMemoryAcceptancePack(baseDir string) (string, error) { +func (f fakeSAT) RunMemoryAcceptancePack(baseDir string, _ func(string)) (string, error) { return f.runMemoryFn(baseDir) } -func (f fakeSAT) RunStorageAcceptancePack(baseDir string) (string, error) { +func (f fakeSAT) RunStorageAcceptancePack(baseDir string, _ func(string)) (string, error) { return f.runStorageFn(baseDir) } -func (f fakeSAT) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) { +func (f fakeSAT) RunCPUAcceptancePack(baseDir string, durationSec int, _ func(string)) (string, error) { if f.runCPUFn != nil { return f.runCPUFn(baseDir, durationSec) } @@ -167,18 +170,22 @@ func (f fakeSAT) ListAMDGPUs() ([]platform.AMDGPUInfo, error) { return nil, nil } -func (f fakeSAT) RunAMDAcceptancePack(baseDir string) (string, error) { +func (f fakeSAT) RunAMDAcceptancePack(baseDir string, _ func(string)) (string, error) { if f.runAMDPackFn != nil { return f.runAMDPackFn(baseDir) } return "", nil } +func (f fakeSAT) RunAMDStressPack(_ string, _ func(string)) (string, error) { return "", nil } +func (f fakeSAT) RunMemoryStressPack(_ string, _ func(string)) (string, error) { return "", nil } +func (f fakeSAT) RunSATStressPack(_ string, _ func(string)) (string, error) { return "", nil } + func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStressOptions) (string, error) { return "", nil } -func (f fakeSAT) RunNCCLTests(_ context.Context, _ string) (string, error) { +func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) { return "", nil } @@ -574,13 +581,13 @@ func TestRunSATDefaultsToExportDir(t *testing.T) { }, } - if _, err := a.RunNvidiaAcceptancePack(""); err != nil { + if _, err := a.RunNvidiaAcceptancePack("", nil); err != nil { t.Fatal(err) } - if _, err := a.RunMemoryAcceptancePack(""); err != nil { + if _, err := a.RunMemoryAcceptancePack("", nil); err != nil { t.Fatal(err) } - if _, err := a.RunStorageAcceptancePack(""); err != nil { + if _, err := a.RunStorageAcceptancePack("", nil); err != nil { t.Fatal(err) } } diff --git a/audit/internal/app/sat_overlay.go b/audit/internal/app/sat_overlay.go index 6b88b80..79abadc 100644 --- a/audit/internal/app/sat_overlay.go +++ b/audit/internal/app/sat_overlay.go @@ -141,9 +141,11 @@ func satSummaryStatus(summary satSummary, label string) (string, string, bool) { func satKeyStatus(rawStatus, label string) (string, string, bool) { switch strings.ToUpper(strings.TrimSpace(rawStatus)) { case "OK": - return "OK", label + " passed", true + // No error description on success — error_description is for problems only. + return "OK", "", true case "PARTIAL", "UNSUPPORTED", "CANCELED", "CANCELLED": - return "Warning", label + " incomplete", true + // Tool couldn't run or test was incomplete — we can't assert hardware health. + return "Unknown", "", true case "FAILED": return "Critical", label + " failed", true default: @@ -180,6 +182,8 @@ func statusSeverity(status string) int { return 2 case "OK": return 1 + case "Unknown": + return 1 // same as OK — does not override OK from another source default: return 0 } diff --git a/audit/internal/platform/gpu_metrics.go b/audit/internal/platform/gpu_metrics.go index 03eef28..c10af5e 100644 --- a/audit/internal/platform/gpu_metrics.go +++ b/audit/internal/platform/gpu_metrics.go @@ -76,6 +76,58 @@ func SampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) { return sampleGPUMetrics(gpuIndices) } +// sampleAMDGPUMetrics queries rocm-smi for live GPU metrics. +func sampleAMDGPUMetrics() ([]GPUMetricRow, error) { + // --showtemp --showuse --showpower --csv — one row per GPU + out, err := runROCmSMI("--showtemp", "--showuse", "--showpower", "--showmemuse", "--csv") + if err != nil { + return nil, err + } + var rows []GPUMetricRow + for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") { + line = strings.TrimSpace(line) + if line == "" || strings.HasPrefix(strings.ToLower(line), "device") { + continue + } + // CSV format: device,temp_c,gpu_use%,mem_use%,power_w (order may vary by rocm-smi version) + // We parse by column header from the first line. + parts := strings.Split(line, ",") + if len(parts) < 2 { + continue + } + idx := len(rows) + row := GPUMetricRow{GPUIndex: idx} + // rocm-smi CSV columns vary; extract what we can + for i, p := range parts { + p = strings.TrimSpace(p) + switch { + case i == 0: + // device index like "card0" or "0" + case strings.Contains(strings.ToLower(p), "n/a"): + // skip N/A + default: + // Try to match by position heuristic: temp, use%, memuse%, power + v := parseGPUFloat(p) + switch { + case i == 1 && row.TempC == 0: + row.TempC = v + case i == 2 && row.UsagePct == 0: + row.UsagePct = v + case i == 3 && row.MemUsagePct == 0: + row.MemUsagePct = v + case i == 4 && row.PowerW == 0: + row.PowerW = v + } + } + } + rows = append(rows, row) + } + if len(rows) == 0 { + return nil, fmt.Errorf("rocm-smi: no GPU rows parsed") + } + return rows, nil +} + // WriteGPUMetricsCSV writes collected rows as a CSV file. func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error { var b bytes.Buffer diff --git a/audit/internal/platform/install_to_ram.go b/audit/internal/platform/install_to_ram.go new file mode 100644 index 0000000..426263d --- /dev/null +++ b/audit/internal/platform/install_to_ram.go @@ -0,0 +1,178 @@ +package platform + +import ( + "encoding/json" + "fmt" + "io" + "os" + "os/exec" + "path/filepath" + "strings" +) + +func (s *System) IsLiveMediaInRAM() bool { + out, err := exec.Command("findmnt", "-n", "-o", "FSTYPE", "/run/live/medium").Output() + if err != nil { + return toramActive() + } + return strings.TrimSpace(string(out)) == "tmpfs" +} + +func (s *System) RunInstallToRAM(logFunc func(string)) error { + log := func(msg string) { + if logFunc != nil { + logFunc(msg) + } + } + + if s.IsLiveMediaInRAM() { + log("Already running from RAM — installation media can be safely disconnected.") + return nil + } + + squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs") + if err != nil || len(squashfsFiles) == 0 { + return fmt.Errorf("no squashfs files found in /run/live/medium/live/") + } + + free := freeMemBytes() + var needed int64 + for _, sf := range squashfsFiles { + fi, err2 := os.Stat(sf) + if err2 != nil { + return fmt.Errorf("stat %s: %v", sf, err2) + } + needed += fi.Size() + } + const headroom = 256 * 1024 * 1024 + if free > 0 && needed+headroom > free { + return fmt.Errorf("insufficient RAM: need %s, available %s", + humanBytes(needed+headroom), humanBytes(free)) + } + + dstDir := "/dev/shm/bee-live" + if err := os.MkdirAll(dstDir, 0755); err != nil { + return fmt.Errorf("create tmpfs dir: %v", err) + } + + for _, sf := range squashfsFiles { + base := filepath.Base(sf) + dst := filepath.Join(dstDir, base) + log(fmt.Sprintf("Copying %s to RAM...", base)) + if err := copyFileLarge(sf, dst, log); err != nil { + return fmt.Errorf("copy %s: %v", base, err) + } + log(fmt.Sprintf("Copied %s.", base)) + + loopDev, err := findLoopForFile(sf) + if err != nil { + log(fmt.Sprintf("Loop device for %s not found (%v) — skipping re-association.", base, err)) + continue + } + if err := reassociateLoopDevice(loopDev, dst); err != nil { + log(fmt.Sprintf("Warning: could not re-associate %s → %s: %v", loopDev, dst, err)) + } else { + log(fmt.Sprintf("Loop device %s now backed by RAM copy.", loopDev)) + } + } + + log("Copying remaining medium files...") + if err := cpDir("/run/live/medium", dstDir, log); err != nil { + log(fmt.Sprintf("Warning: partial copy: %v", err)) + } + if err := exec.Command("mount", "--bind", dstDir, "/run/live/medium").Run(); err != nil { + log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err)) + } + + log("Done. Installation media can be safely disconnected.") + return nil +} + +func copyFileLarge(src, dst string, logFunc func(string)) error { + in, err := os.Open(src) + if err != nil { + return err + } + defer in.Close() + fi, err := in.Stat() + if err != nil { + return err + } + out, err := os.Create(dst) + if err != nil { + return err + } + defer out.Close() + total := fi.Size() + var copied int64 + buf := make([]byte, 4*1024*1024) + for { + n, err := in.Read(buf) + if n > 0 { + if _, werr := out.Write(buf[:n]); werr != nil { + return werr + } + copied += int64(n) + if logFunc != nil && total > 0 { + pct := int(float64(copied) / float64(total) * 100) + logFunc(fmt.Sprintf(" %s / %s (%d%%)", humanBytes(copied), humanBytes(total), pct)) + } + } + if err == io.EOF { + break + } + if err != nil { + return err + } + } + return out.Sync() +} + +func cpDir(src, dst string, logFunc func(string)) error { + return filepath.Walk(src, func(path string, fi os.FileInfo, err error) error { + if err != nil { + return nil + } + rel, _ := filepath.Rel(src, path) + target := filepath.Join(dst, rel) + if fi.IsDir() { + return os.MkdirAll(target, fi.Mode()) + } + if strings.HasSuffix(path, ".squashfs") { + return nil + } + if _, err := os.Stat(target); err == nil { + return nil + } + return copyFileLarge(path, target, nil) + }) +} + +func findLoopForFile(backingFile string) (string, error) { + out, err := exec.Command("losetup", "--list", "--json").Output() + if err != nil { + return "", err + } + var result struct { + Loopdevices []struct { + Name string `json:"name"` + BackFile string `json:"back-file"` + } `json:"loopdevices"` + } + if err := json.Unmarshal(out, &result); err != nil { + return "", err + } + for _, dev := range result.Loopdevices { + if dev.BackFile == backingFile { + return dev.Name, nil + } + } + return "", fmt.Errorf("no loop device found for %s", backingFile) +} + +func reassociateLoopDevice(loopDev, newFile string) error { + if err := exec.Command("losetup", "--replace", loopDev, newFile).Run(); err == nil { + return nil + } + return loopChangeFD(loopDev, newFile) +} diff --git a/audit/internal/platform/install_to_ram_linux.go b/audit/internal/platform/install_to_ram_linux.go new file mode 100644 index 0000000..c87cc82 --- /dev/null +++ b/audit/internal/platform/install_to_ram_linux.go @@ -0,0 +1,28 @@ +//go:build linux + +package platform + +import ( + "os" + "syscall" +) + +const ioctlLoopChangeFD = 0x4C08 + +func loopChangeFD(loopDev, newFile string) error { + lf, err := os.OpenFile(loopDev, os.O_RDWR, 0) + if err != nil { + return err + } + defer lf.Close() + nf, err := os.OpenFile(newFile, os.O_RDONLY, 0) + if err != nil { + return err + } + defer nf.Close() + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, lf.Fd(), ioctlLoopChangeFD, nf.Fd()) + if errno != 0 { + return errno + } + return nil +} diff --git a/audit/internal/platform/install_to_ram_other.go b/audit/internal/platform/install_to_ram_other.go new file mode 100644 index 0000000..adfe4c7 --- /dev/null +++ b/audit/internal/platform/install_to_ram_other.go @@ -0,0 +1,9 @@ +//go:build !linux + +package platform + +import "errors" + +func loopChangeFD(loopDev, newFile string) error { + return errors.New("LOOP_CHANGE_FD not available on this platform") +} diff --git a/audit/internal/platform/live_metrics.go b/audit/internal/platform/live_metrics.go index 0b516a9..968f9b8 100644 --- a/audit/internal/platform/live_metrics.go +++ b/audit/internal/platform/live_metrics.go @@ -32,9 +32,12 @@ type TempReading struct { func SampleLiveMetrics() LiveMetricSample { s := LiveMetricSample{Timestamp: time.Now().UTC()} - // GPU metrics — skipped silently if nvidia-smi unavailable - gpus, _ := SampleGPUMetrics(nil) - s.GPUs = gpus + // GPU metrics — try NVIDIA first, fall back to AMD + if gpus, err := SampleGPUMetrics(nil); err == nil && len(gpus) > 0 { + s.GPUs = gpus + } else if amdGPUs, err := sampleAMDGPUMetrics(); err == nil && len(amdGPUs) > 0 { + s.GPUs = amdGPUs + } // Fan speeds — skipped silently if ipmitool unavailable fans, _ := sampleFanSpeeds() diff --git a/audit/internal/platform/network.go b/audit/internal/platform/network.go index 2b7a897..03aa652 100644 --- a/audit/internal/platform/network.go +++ b/audit/internal/platform/network.go @@ -131,6 +131,25 @@ func (s *System) SetStaticIPv4(cfg StaticIPv4Config) (string, error) { return out.String(), nil } +// SetInterfaceState brings a network interface up or down. +func (s *System) SetInterfaceState(iface string, up bool) error { + state := "down" + if up { + state = "up" + } + return exec.Command("ip", "link", "set", "dev", iface, state).Run() +} + +// GetInterfaceState returns true if the interface is UP. +func (s *System) GetInterfaceState(iface string) (bool, error) { + raw, err := os.ReadFile(fmt.Sprintf("/sys/class/net/%s/operstate", iface)) + if err != nil { + return false, err + } + state := strings.TrimSpace(string(raw)) + return state == "up", nil +} + func listInterfaceNames() ([]string, error) { raw, err := exec.Command("ip", "-o", "link", "show").Output() if err != nil { diff --git a/audit/internal/platform/sat.go b/audit/internal/platform/sat.go index e16b490..8b9988f 100644 --- a/audit/internal/platform/sat.go +++ b/audit/internal/platform/sat.go @@ -2,6 +2,8 @@ package platform import ( "archive/tar" + "bufio" + "bytes" "compress/gzip" "context" "errors" @@ -13,6 +15,7 @@ import ( "sort" "strconv" "strings" + "sync" "time" ) @@ -32,6 +35,40 @@ var ( } ) +// streamExecOutput runs cmd and streams each output line to logFunc (if non-nil). +// Returns combined stdout+stderr as a byte slice. +func streamExecOutput(cmd *exec.Cmd, logFunc func(string)) ([]byte, error) { + pr, pw := io.Pipe() + cmd.Stdout = pw + cmd.Stderr = pw + + var buf bytes.Buffer + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + scanner := bufio.NewScanner(pr) + for scanner.Scan() { + line := scanner.Text() + buf.WriteString(line + "\n") + if logFunc != nil { + logFunc(line) + } + } + }() + + err := cmd.Start() + if err != nil { + _ = pw.Close() + wg.Wait() + return nil, err + } + waitErr := cmd.Wait() + _ = pw.Close() + wg.Wait() + return buf.Bytes(), waitErr +} + // NvidiaGPU holds basic GPU info from nvidia-smi. type NvidiaGPU struct { Index int @@ -80,13 +117,27 @@ func (s *System) ListAMDGPUs() ([]AMDGPUInfo, error) { } // RunAMDAcceptancePack runs an AMD GPU diagnostic pack using rocm-smi. -func (s *System) RunAMDAcceptancePack(baseDir string) (string, error) { +func (s *System) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) { return runAcceptancePack(baseDir, "gpu-amd", []satJob{ {name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}}, {name: "02-rocm-smi-showallinfo.log", cmd: []string{"rocm-smi", "--showallinfo"}}, {name: "03-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}}, {name: "04-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}}, - }) + }, logFunc) +} + +// RunAMDStressPack runs an AMD GPU burn-in pack. +// Missing tools are reported as UNSUPPORTED, consistent with the existing SAT pattern. +func (s *System) RunAMDStressPack(baseDir string, logFunc func(string)) (string, error) { + seconds := envInt("BEE_AMD_STRESS_SECONDS", 300) + return runAcceptancePack(baseDir, "gpu-amd-stress", []satJob{ + {name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}}, + {name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}}, + {name: fmt.Sprintf("03-rocm-smi-monitor-%ds.log", seconds), cmd: []string{ + "rocm-smi", "--showtemp", "--showpower", + fmt.Sprintf("--duration=%d", seconds), + }}, + }, logFunc) } // ListNvidiaGPUs returns GPUs visible to nvidia-smi. @@ -123,7 +174,7 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) { // RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs. // Measures collective communication bandwidth over NVLink/PCIe. -func (s *System) RunNCCLTests(ctx context.Context, baseDir string) (string, error) { +func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { // detect GPU count out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output() gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n")) @@ -136,32 +187,65 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string) (string, erro "all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2", "-g", strconv.Itoa(gpuCount), "--iters", "20", }}, - }) + }, logFunc) } -func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) { - return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs()) +func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) { + return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs(), logFunc) } // RunNvidiaAcceptancePackWithOptions runs the NVIDIA diagnostics via DCGM. // diagLevel: 1=quick, 2=medium, 3=targeted stress, 4=extended stress. // gpuIndices: specific GPU indices to test (empty = all GPUs). // ctx cancellation kills the running job. -func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int) (string, error) { - return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices)) +func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) { + return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices), logFunc) } -func (s *System) RunMemoryAcceptancePack(baseDir string) (string, error) { +func (s *System) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) { sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128) passes := envInt("BEE_MEMTESTER_PASSES", 1) return runAcceptancePack(baseDir, "memory", []satJob{ {name: "01-free-before.log", cmd: []string{"free", "-h"}}, {name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}}, {name: "03-free-after.log", cmd: []string{"free", "-h"}}, - }) + }, logFunc) } -func (s *System) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) { +func (s *System) RunMemoryStressPack(baseDir string, logFunc func(string)) (string, error) { + seconds := envInt("BEE_VM_STRESS_SECONDS", 300) + // Use 80% of RAM by default; override with BEE_VM_STRESS_SIZE_MB. + sizeArg := "80%" + if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 { + sizeArg = fmt.Sprintf("%dM", mb) + } + return runAcceptancePack(baseDir, "memory-stress", []satJob{ + {name: "01-free-before.log", cmd: []string{"free", "-h"}}, + {name: "02-stress-ng-vm.log", cmd: []string{ + "stress-ng", "--vm", "1", + "--vm-bytes", sizeArg, + "--vm-method", "all", + "--timeout", fmt.Sprintf("%d", seconds), + "--metrics-brief", + }}, + {name: "03-free-after.log", cmd: []string{"free", "-h"}}, + }, logFunc) +} + +func (s *System) RunSATStressPack(baseDir string, logFunc func(string)) (string, error) { + seconds := envInt("BEE_SAT_STRESS_SECONDS", 300) + cmd := []string{"stressapptest", "-s", fmt.Sprintf("%d", seconds), "-W", "--cc_test"} + if mb := envInt("BEE_SAT_STRESS_MB", 0); mb > 0 { + cmd = append(cmd, "-M", fmt.Sprintf("%d", mb)) + } + return runAcceptancePack(baseDir, "sat-stress", []satJob{ + {name: "01-free-before.log", cmd: []string{"free", "-h"}}, + {name: "02-stressapptest.log", cmd: cmd}, + {name: "03-free-after.log", cmd: []string{"free", "-h"}}, + }, logFunc) +} + +func (s *System) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) { if durationSec <= 0 { durationSec = 60 } @@ -170,10 +254,10 @@ func (s *System) RunCPUAcceptancePack(baseDir string, durationSec int) (string, {name: "02-sensors-before.log", cmd: []string{"sensors"}}, {name: "03-stress-ng.log", cmd: []string{"stress-ng", "--cpu", "0", "--cpu-method", "all", "--timeout", fmt.Sprintf("%d", durationSec)}}, {name: "04-sensors-after.log", cmd: []string{"sensors"}}, - }) + }, logFunc) } -func (s *System) RunStorageAcceptancePack(baseDir string) (string, error) { +func (s *System) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) { if baseDir == "" { baseDir = "/var/log/bee-sat" } @@ -205,7 +289,7 @@ func (s *System) RunStorageAcceptancePack(baseDir string) (string, error) { commands := storageSATCommands(devPath) for cmdIndex, job := range commands { name := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+1, job.name) - out, err := runSATCommand(verboseLog, job.name, job.cmd) + out, err := runSATCommand(verboseLog, job.name, job.cmd, logFunc) if writeErr := os.WriteFile(filepath.Join(runDir, name), out, 0644); writeErr != nil { return "", writeErr } @@ -254,7 +338,7 @@ func nvidiaSATJobs() []satJob { } } -func runAcceptancePack(baseDir, prefix string, jobs []satJob) (string, error) { +func runAcceptancePack(baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) { if baseDir == "" { baseDir = "/var/log/bee-sat" } @@ -269,11 +353,13 @@ func runAcceptancePack(baseDir, prefix string, jobs []satJob) (string, error) { stats := satStats{} fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339)) for _, job := range jobs { + var out []byte + var err error cmd := make([]string, 0, len(job.cmd)) for _, arg := range job.cmd { cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir)) } - out, err := runSATCommand(verboseLog, job.name, cmd) + out, err = runSATCommand(verboseLog, job.name, cmd, logFunc) if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil { return "", writeErr } @@ -315,7 +401,7 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob { } } -func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob) (string, error) { +func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) { if baseDir == "" { baseDir = "/var/log/bee-sat" } @@ -342,9 +428,9 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa var err error if job.collectGPU { - out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir) + out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc) } else { - out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env) + out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc) } if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil { @@ -368,13 +454,16 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa return archive, nil } -func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string) ([]byte, error) { +func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string, logFunc func(string)) ([]byte, error) { start := time.Now().UTC() resolvedCmd, err := resolveSATCommand(cmd) appendSATVerboseLog(verboseLog, fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name), "cmd: "+strings.Join(resolvedCmd, " "), ) + if logFunc != nil { + logFunc(fmt.Sprintf("=== %s ===", name)) + } if err != nil { appendSATVerboseLog(verboseLog, fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name), @@ -389,7 +478,7 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string if len(env) > 0 { c.Env = append(os.Environ(), env...) } - out, err := c.CombinedOutput() + out, err := streamExecOutput(c, logFunc) rc := 0 if err != nil { @@ -464,6 +553,11 @@ func classifySATResult(name string, out []byte, err error) (string, int) { } text := strings.ToLower(string(out)) + // No output at all means the tool failed to start (mlock limit, binary missing, + // etc.) — we cannot say anything about hardware health → UNSUPPORTED. + if len(strings.TrimSpace(text)) == 0 { + return "UNSUPPORTED", rc + } if strings.Contains(text, "unsupported") || strings.Contains(text, "not supported") || strings.Contains(text, "invalid opcode") || @@ -472,19 +566,25 @@ func classifySATResult(name string, out []byte, err error) (string, int) { strings.Contains(text, "not available") || strings.Contains(text, "cuda_error_system_not_ready") || strings.Contains(text, "no such device") || + // nvidia-smi on a machine with no NVIDIA GPU + strings.Contains(text, "couldn't communicate with the nvidia driver") || + strings.Contains(text, "no nvidia gpu") || (strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) { return "UNSUPPORTED", rc } return "FAILED", rc } -func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) { +func runSATCommand(verboseLog, name string, cmd []string, logFunc func(string)) ([]byte, error) { start := time.Now().UTC() resolvedCmd, err := resolveSATCommand(cmd) appendSATVerboseLog(verboseLog, fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name), "cmd: "+strings.Join(resolvedCmd, " "), ) + if logFunc != nil { + logFunc(fmt.Sprintf("=== %s ===", name)) + } if err != nil { appendSATVerboseLog(verboseLog, fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name), @@ -495,7 +595,7 @@ func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) { return []byte(err.Error() + "\n"), err } - out, err := satExecCommand(resolvedCmd[0], resolvedCmd[1:]...).CombinedOutput() + out, err := streamExecOutput(satExecCommand(resolvedCmd[0], resolvedCmd[1:]...), logFunc) rc := 0 if err != nil { @@ -597,7 +697,7 @@ func parseStorageDevices(raw string) []string { // runSATCommandWithMetrics runs a command while collecting GPU metrics in the background. // On completion it writes gpu-metrics.csv and gpu-metrics.html into runDir. -func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string) ([]byte, error) { +func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string, logFunc func(string)) ([]byte, error) { stopCh := make(chan struct{}) doneCh := make(chan struct{}) var metricRows []GPUMetricRow @@ -625,7 +725,7 @@ func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd } }() - out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env) + out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env, logFunc) close(stopCh) <-doneCh diff --git a/audit/internal/platform/sat_fan_stress.go b/audit/internal/platform/sat_fan_stress.go index c64920d..5c916cd 100644 --- a/audit/internal/platform/sat_fan_stress.go +++ b/audit/internal/platform/sat_fan_stress.go @@ -147,7 +147,7 @@ func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanS "--seconds", strconv.Itoa(durSec), "--size-mb", strconv.Itoa(opts.SizeMB), } - out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, env) + out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, env, nil) _ = os.WriteFile(filepath.Join(runDir, stepName+".log"), out, 0644) if err != nil && err != context.Canceled && err.Error() != "signal: killed" { fmt.Fprintf(&summary, "%s_status=FAILED\n", stepName) diff --git a/audit/internal/platform/services.go b/audit/internal/platform/services.go index f1fbd03..cbeb910 100644 --- a/audit/internal/platform/services.go +++ b/audit/internal/platform/services.go @@ -17,6 +17,10 @@ func (s *System) ListBeeServices() ([]string, error) { } for _, match := range matches { name := strings.TrimSuffix(filepath.Base(match), ".service") + // Skip template units (e.g. bee-journal-mirror@) — they have no instances to query. + if strings.HasSuffix(name, "@") { + continue + } if !seen[name] { seen[name] = true out = append(out, name) diff --git a/audit/internal/webui/api.go b/audit/internal/webui/api.go index 518e92d..521c0dd 100644 --- a/audit/internal/webui/api.go +++ b/audit/internal/webui/api.go @@ -9,7 +9,6 @@ import ( "net/http" "os/exec" "path/filepath" - "strings" "sync/atomic" "time" @@ -110,39 +109,37 @@ func runCmdJob(j *jobState, cmd *exec.Cmd) { // ── Audit ───────────────────────────────────────────────────────────────────── -func (h *handler) handleAPIAuditRun(w http.ResponseWriter, r *http.Request) { +func (h *handler) handleAPIAuditRun(w http.ResponseWriter, _ *http.Request) { if h.opts.App == nil { writeError(w, http.StatusServiceUnavailable, "app not configured") return } - id := newJobID("audit") - j := globalJobs.create(id) - go func() { - j.append("Running audit...") - result, err := h.opts.App.RunAuditNow(h.opts.RuntimeMode) - if err != nil { - j.append("ERROR: " + err.Error()) - j.finish(err.Error()) - return - } - for _, line := range strings.Split(result.Body, "\n") { - if line != "" { - j.append(line) - } - } - j.finish("") - }() - writeJSON(w, map[string]string{"job_id": id}) + t := &Task{ + ID: newJobID("audit"), + Name: "Audit", + Target: "audit", + Status: TaskPending, + CreatedAt: time.Now(), + } + globalQueue.enqueue(t) + writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID}) } func (h *handler) handleAPIAuditStream(w http.ResponseWriter, r *http.Request) { id := r.URL.Query().Get("job_id") - j, ok := globalJobs.get(id) - if !ok { - http.Error(w, "job not found", http.StatusNotFound) + if id == "" { + id = r.URL.Query().Get("task_id") + } + // Try task queue first, then legacy job manager + if j, ok := globalQueue.findJob(id); ok { + streamJob(w, r, j) return } - streamJob(w, r, j) + if j, ok := globalJobs.get(id); ok { + streamJob(w, r, j) + return + } + http.Error(w, "job not found", http.StatusNotFound) } // ── SAT ─────────────────────────────────────────────────────────────────────── @@ -153,96 +150,87 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc { writeError(w, http.StatusServiceUnavailable, "app not configured") return } - id := newJobID("sat-" + target) - j := globalJobs.create(id) - ctx, cancel := context.WithCancel(context.Background()) - j.cancel = cancel - go func() { - defer cancel() - j.append(fmt.Sprintf("Starting %s acceptance test...", target)) - var ( - archive string - err error - ) + var body struct { + Duration int `json:"duration"` + DiagLevel int `json:"diag_level"` + GPUIndices []int `json:"gpu_indices"` + } + body.DiagLevel = 1 + if r.ContentLength > 0 { + _ = json.NewDecoder(r.Body).Decode(&body) + } - // Parse optional parameters - var body struct { - Duration int `json:"duration"` - DiagLevel int `json:"diag_level"` - GPUIndices []int `json:"gpu_indices"` - } - body.DiagLevel = 1 - if r.ContentLength > 0 { - _ = json.NewDecoder(r.Body).Decode(&body) - } - - switch target { - case "nvidia": - if len(body.GPUIndices) > 0 || body.DiagLevel > 0 { - result, e := h.opts.App.RunNvidiaAcceptancePackWithOptions( - ctx, "", body.DiagLevel, body.GPUIndices, - ) - if e != nil { - err = e - } else { - archive = result.Body - } - } else { - archive, err = h.opts.App.RunNvidiaAcceptancePack("") - } - case "memory": - archive, err = h.opts.App.RunMemoryAcceptancePack("") - case "storage": - archive, err = h.opts.App.RunStorageAcceptancePack("") - case "cpu": - dur := body.Duration - if dur <= 0 { - dur = 60 - } - archive, err = h.opts.App.RunCPUAcceptancePack("", dur) - } - - if err != nil { - if ctx.Err() != nil { - j.append("Aborted.") - j.finish("aborted") - } else { - j.append("ERROR: " + err.Error()) - j.finish(err.Error()) - } - return - } - j.append(fmt.Sprintf("Archive written: %s", archive)) - j.finish("") - }() - - writeJSON(w, map[string]string{"job_id": id}) + name := taskNames[target] + if name == "" { + name = target + } + t := &Task{ + ID: newJobID("sat-" + target), + Name: name, + Target: target, + Status: TaskPending, + CreatedAt: time.Now(), + params: taskParams{ + Duration: body.Duration, + DiagLevel: body.DiagLevel, + GPUIndices: body.GPUIndices, + }, + } + globalQueue.enqueue(t) + writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID}) } } func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) { id := r.URL.Query().Get("job_id") - j, ok := globalJobs.get(id) - if !ok { - http.Error(w, "job not found", http.StatusNotFound) + if id == "" { + id = r.URL.Query().Get("task_id") + } + if j, ok := globalQueue.findJob(id); ok { + streamJob(w, r, j) return } - streamJob(w, r, j) + if j, ok := globalJobs.get(id); ok { + streamJob(w, r, j) + return + } + http.Error(w, "job not found", http.StatusNotFound) } func (h *handler) handleAPISATAbort(w http.ResponseWriter, r *http.Request) { id := r.URL.Query().Get("job_id") - j, ok := globalJobs.get(id) - if !ok { - http.Error(w, "job not found", http.StatusNotFound) + if id == "" { + id = r.URL.Query().Get("task_id") + } + if t, ok := globalQueue.findByID(id); ok { + globalQueue.mu.Lock() + switch t.Status { + case TaskPending: + t.Status = TaskCancelled + now := time.Now() + t.DoneAt = &now + case TaskRunning: + if t.job != nil { + t.job.abort() + } + t.Status = TaskCancelled + now := time.Now() + t.DoneAt = &now + } + globalQueue.mu.Unlock() + writeJSON(w, map[string]string{"status": "aborted"}) return } - if j.abort() { - writeJSON(w, map[string]string{"status": "aborted"}) - } else { - writeJSON(w, map[string]string{"status": "not_running"}) + if j, ok := globalJobs.get(id); ok { + if j.abort() { + writeJSON(w, map[string]string{"status": "aborted"}) + } else { + writeJSON(w, map[string]string{"status": "not_running"}) + } + return } + http.Error(w, "job not found", http.StatusNotFound) } // ── Services ────────────────────────────────────────────────────────────────── @@ -401,6 +389,51 @@ func (h *handler) handleAPIExportBundle(w http.ResponseWriter, r *http.Request) }) } +// ── GPU presence ────────────────────────────────────────────────────────────── + +func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) { + if h.opts.App == nil { + writeError(w, http.StatusServiceUnavailable, "app not configured") + return + } + gp := h.opts.App.DetectGPUPresence() + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(map[string]bool{ + "nvidia": gp.Nvidia, + "amd": gp.AMD, + }) +} + +// ── System ──────────────────────────────────────────────────────────────────── + +func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) { + if h.opts.App == nil { + writeError(w, http.StatusServiceUnavailable, "app not configured") + return + } + inRAM := h.opts.App.IsLiveMediaInRAM() + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(map[string]bool{"in_ram": inRAM}) +} + +func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request) { + if h.opts.App == nil { + writeError(w, http.StatusServiceUnavailable, "app not configured") + return + } + t := &Task{ + ID: newJobID("install-to-ram"), + Name: "Install to RAM", + Target: "install-to-ram", + Priority: 10, + Status: TaskPending, + CreatedAt: time.Now(), + } + globalQueue.enqueue(t) + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(map[string]string{"task_id": t.ID}) +} + // ── Tools ───────────────────────────────────────────────────────────────────── var standardTools = []string{ @@ -507,7 +540,7 @@ func (h *handler) handleAPIInstallRun(w http.ResponseWriter, r *http.Request) { h.installMu.Unlock() logFile := platform.InstallLogPath(req.Device) - go runCmdJob(j, exec.CommandContext(r.Context(), "bee-install", req.Device, logFile)) + go runCmdJob(j, exec.CommandContext(context.Background(), "bee-install", req.Device, logFile)) w.WriteHeader(http.StatusNoContent) } @@ -589,3 +622,95 @@ func (h *handler) handleAPIMetricsStream(w http.ResponseWriter, r *http.Request) } } } + +// ── Network toggle ──────────────────────────────────────────────────────────── + +const netRollbackTimeout = 60 * time.Second + +func (h *handler) handleAPINetworkToggle(w http.ResponseWriter, r *http.Request) { + if h.opts.App == nil { + writeError(w, http.StatusServiceUnavailable, "app not configured") + return + } + var req struct { + Iface string `json:"iface"` + } + if err := json.NewDecoder(r.Body).Decode(&req); err != nil || req.Iface == "" { + writeError(w, http.StatusBadRequest, "iface is required") + return + } + + wasUp, err := h.opts.App.GetInterfaceState(req.Iface) + if err != nil { + writeError(w, http.StatusInternalServerError, err.Error()) + return + } + + if err := h.opts.App.SetInterfaceState(req.Iface, !wasUp); err != nil { + writeError(w, http.StatusInternalServerError, err.Error()) + return + } + + // Cancel any existing pending change (rollback it first). + h.pendingNetMu.Lock() + if h.pendingNet != nil { + prev := h.pendingNet + prev.mu.Lock() + prev.timer.Stop() + _ = h.opts.App.SetInterfaceState(prev.iface, prev.wasUp) + prev.mu.Unlock() + } + + pnc := &pendingNetChange{iface: req.Iface, wasUp: wasUp} + pnc.timer = time.AfterFunc(netRollbackTimeout, func() { + _ = h.opts.App.SetInterfaceState(req.Iface, wasUp) + h.pendingNetMu.Lock() + if h.pendingNet == pnc { + h.pendingNet = nil + } + h.pendingNetMu.Unlock() + }) + h.pendingNet = pnc + h.pendingNetMu.Unlock() + + newState := "up" + if wasUp { + newState = "down" + } + writeJSON(w, map[string]any{ + "iface": req.Iface, + "new_state": newState, + "rollback_in": int(netRollbackTimeout.Seconds()), + }) +} + +func (h *handler) handleAPINetworkConfirm(w http.ResponseWriter, _ *http.Request) { + h.pendingNetMu.Lock() + pnc := h.pendingNet + h.pendingNet = nil + h.pendingNetMu.Unlock() + if pnc != nil { + pnc.mu.Lock() + pnc.timer.Stop() + pnc.mu.Unlock() + } + writeJSON(w, map[string]string{"status": "confirmed"}) +} + +func (h *handler) handleAPINetworkRollback(w http.ResponseWriter, _ *http.Request) { + h.pendingNetMu.Lock() + pnc := h.pendingNet + h.pendingNet = nil + h.pendingNetMu.Unlock() + if pnc == nil { + writeError(w, http.StatusConflict, "no pending network change") + return + } + pnc.mu.Lock() + pnc.timer.Stop() + pnc.mu.Unlock() + if h.opts.App != nil { + _ = h.opts.App.SetInterfaceState(pnc.iface, pnc.wasUp) + } + writeJSON(w, map[string]string{"status": "rolled back"}) +} diff --git a/audit/internal/webui/pages.go b/audit/internal/webui/pages.go index 7e4043f..bf5c195 100644 --- a/audit/internal/webui/pages.go +++ b/audit/internal/webui/pages.go @@ -84,17 +84,13 @@ tbody tr:hover td{background:rgba(0,0,0,.03)} } func layoutNav(active string) string { - items := []struct{ id, label, href string }{ - {"dashboard", "Dashboard", "/"}, - {"viewer", "Audit Snapshot", "/viewer"}, - {"metrics", "Metrics", "/metrics"}, - {"tests", "Acceptance Tests", "/tests"}, - {"burn-in", "Burn-in", "/burn-in"}, - {"network", "Network", "/network"}, - {"services", "Services", "/services"}, - {"export", "Export", "/export"}, - {"tools", "Tools", "/tools"}, - {"install", "Install to Disk", "/install"}, + items := []struct{ id, label, href, onclick string }{ + {"dashboard", "Dashboard", "/", ""}, + {"audit", "Audit", "#", "openAuditModal();return false;"}, + {"validate", "Validate", "/validate", ""}, + {"burn", "Burn", "/burn", ""}, + {"tasks", "Tasks", "/tasks", ""}, + {"tools", "Tools", "/tools", ""}, } var b strings.Builder b.WriteString(``) return b.String() @@ -120,18 +121,35 @@ func renderPage(page string, opts HandlerOptions) string { pageID = "dashboard" title = "Dashboard" body = renderDashboard(opts) + case "validate": + pageID = "validate" + title = "Validate" + body = renderValidate() + case "burn": + pageID = "burn" + title = "Burn" + body = renderBurn() + case "tasks": + pageID = "tasks" + title = "Tasks" + body = renderTasks() + case "tools": + pageID = "tools" + title = "Tools" + body = renderTools() + // Legacy routes kept accessible but not in nav case "metrics": pageID = "metrics" title = "Live Metrics" body = renderMetrics() case "tests": - pageID = "tests" + pageID = "validate" title = "Acceptance Tests" - body = renderTests() + body = renderValidate() case "burn-in": - pageID = "burn-in" + pageID = "burn" title = "Burn-in Tests" - body = renderBurnIn() + body = renderBurn() case "network": pageID = "network" title = "Network" @@ -144,10 +162,6 @@ func renderPage(page string, opts HandlerOptions) string { pageID = "export" title = "Export" body = renderExport(opts.ExportDir) - case "tools": - pageID = "tools" - title = "Tools" - body = renderTools() case "install": pageID = "install" title = "Install to Disk" @@ -162,48 +176,158 @@ func renderPage(page string, opts HandlerOptions) string { layoutNav(pageID) + `

` + html.EscapeString(title) + `

` + body + - `
` + `` + + renderAuditModal() + + `` } // ── Dashboard ───────────────────────────────────────────────────────────────── func renderDashboard(opts HandlerOptions) string { var b strings.Builder - b.WriteString(`
`) - // Left: health summary - b.WriteString(`
`) + b.WriteString(renderHardwareSummaryCard(opts)) b.WriteString(renderHealthCard(opts)) - b.WriteString(`
`) - // Right: quick actions - b.WriteString(`
`) - b.WriteString(`
Quick Actions
`) - b.WriteString(`⬇ Download Support Bundle`) - b.WriteString(`📄 Open audit.json`) - b.WriteString(`📁 Browse Export Files`) - b.WriteString(`
`) - b.WriteString(`
`) - b.WriteString(`
`) - b.WriteString(`
`) - // Audit run output div - b.WriteString(``) - - b.WriteString(``) + b.WriteString(renderMetrics()) return b.String() } +func renderHardwareSummaryCard(opts HandlerOptions) string { + data, err := loadSnapshot(opts.AuditPath) + if err != nil { + return `
Hardware Summary
No audit data
` + } + // Parse just enough fields for the summary banner + var snap struct { + Summary struct { + CPU struct{ Model string } + Memory struct{ TotalGB float64 } + Storage []struct{ Device, Model, Size string } + GPUs []struct{ Model string } + PSUs []struct{ Model string } + } + Network struct { + Interfaces []struct { + Name string + IPv4 []string + State string + } + } + } + // Try to extract top-level fields loosely + var raw map[string]json.RawMessage + if err := json.Unmarshal(data, &raw); err != nil { + return `
Hardware Summary
Parse error
` + } + _ = snap + + // Also load runtime-health for badges + type componentHealth struct { + FailCount int `json:"fail_count"` + WarnCount int `json:"warn_count"` + } + type healthSummary struct { + CPU componentHealth `json:"cpu"` + Memory componentHealth `json:"memory"` + Storage componentHealth `json:"storage"` + GPU componentHealth `json:"gpu"` + PSU componentHealth `json:"psu"` + Network componentHealth `json:"network"` + } + var health struct { + HardwareHealth healthSummary `json:"hardware_health"` + } + if hdata, herr := loadSnapshot(filepath.Join(opts.ExportDir, "runtime-health.json")); herr == nil { + _ = json.Unmarshal(hdata, &health) + } + + badge := func(h componentHealth) string { + if h.FailCount > 0 { + return `FAIL` + } + if h.WarnCount > 0 { + return `WARN` + } + return `OK` + } + + // Extract readable strings from raw JSON + getString := func(key string) string { + v, ok := raw[key] + if !ok { + return "" + } + var s string + if err := json.Unmarshal(v, &s); err == nil { + return s + } + return "" + } + + cpuModel := getString("cpu_model") + memStr := getString("memory_summary") + gpuSummary := getString("gpu_summary") + + var b strings.Builder + b.WriteString(`
Hardware Summary
`) + b.WriteString(``) + writeRow := func(label, value, badgeHTML string) { + b.WriteString(fmt.Sprintf(``, + html.EscapeString(label), html.EscapeString(value), badgeHTML)) + } + if cpuModel != "" { + writeRow("CPU", cpuModel, badge(health.HardwareHealth.CPU)) + } else { + writeRow("CPU", "—", badge(health.HardwareHealth.CPU)) + } + if memStr != "" { + writeRow("Memory", memStr, badge(health.HardwareHealth.Memory)) + } else { + writeRow("Memory", "—", badge(health.HardwareHealth.Memory)) + } + if gpuSummary != "" { + writeRow("GPU", gpuSummary, badge(health.HardwareHealth.GPU)) + } else { + writeRow("GPU", "—", badge(health.HardwareHealth.GPU)) + } + writeRow("Storage", "—", badge(health.HardwareHealth.Storage)) + writeRow("PSU", "—", badge(health.HardwareHealth.PSU)) + b.WriteString(`
%s%s%s
`) + b.WriteString(`
`) + return b.String() +} + +func renderAuditModal() string { + return ` +` +} + func renderHealthCard(opts HandlerOptions) string { data, err := loadSnapshot(filepath.Join(opts.ExportDir, "runtime-health.json")) if err != nil { @@ -239,12 +363,26 @@ func renderHealthCard(opts HandlerOptions) string { // ── Metrics ─────────────────────────────────────────────────────────────────── func renderMetrics() string { - return `

Live metrics — updated every 2 seconds. Charts use go-analyze/charts (grafana theme).

+ return `

Live metrics — updated every 2 seconds.

-
Server
+
Server — Load
- Server metrics + CPU/Mem load +
+
+ +
+
Server — Temperature
+
+ CPU temperature +
+
+ +
+
Server — Power
+
+ System power
@@ -256,12 +394,16 @@ let knownGPUs = []; function refreshCharts() { const t = '?t=' + Date.now(); - const srv = document.getElementById('chart-server'); - if (srv) srv.src = srv.src.split('?')[0] + t; - knownGPUs.forEach(idx => { - const el = document.getElementById('chart-gpu-' + idx); + ['chart-server-load','chart-server-temp','chart-server-power'].forEach(id => { + const el = document.getElementById(id); if (el) el.src = el.src.split('?')[0] + t; }); + knownGPUs.forEach(idx => { + ['load','temp','power'].forEach(kind => { + const el = document.getElementById('chart-gpu-' + idx + '-' + kind); + if (el) el.src = el.src.split('?')[0] + t; + }); + }); } setInterval(refreshCharts, 2000); @@ -276,10 +418,19 @@ es.addEventListener('metrics', e => { const div = document.createElement('div'); div.className = 'card'; div.style.marginBottom = '16px'; - div.innerHTML = '
GPU ' + g.index + '
' + + div.innerHTML = + '
GPU ' + g.index + ' — Load
' + '
' + - 'GPU ' + g.index + '' + - '
' + + 'GPU ' + g.index + ' load' + + '
' + + '
GPU ' + g.index + ' — Temperature
' + + '
' + + 'GPU ' + g.index + ' temp' + + '
' + + '
GPU ' + g.index + ' — Power
' + + '
' + + 'GPU ' + g.index + ' power' + + '
' + '
'; document.getElementById('gpu-charts').appendChild(div); }); @@ -309,15 +460,27 @@ es.onerror = () => {}; ` } -// ── Acceptance Tests ────────────────────────────────────────────────────────── +// ── Validate (Acceptance Tests) ─────────────────────────────────────────────── -func renderTests() string { - return `

Run hardware acceptance tests and view results.

-
+func renderValidate() string { + return `
Non-destructive: Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.
+

Tasks continue in the background — view progress in Tasks.

+ +
+
Run All Tests
+
+
+ + +
+
+ +
` + renderSATCard("nvidia", "NVIDIA GPU", `
`) + renderSATCard("memory", "Memory", "") + renderSATCard("storage", "Storage", "") + renderSATCard("cpu", "CPU", `
`) + + renderSATCard("amd", "AMD GPU", "") + `