Compare commits
26 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| dbab43db90 | |||
| bcb7fe5fe9 | |||
| d21d9d191b | |||
| ef45246ea0 | |||
| 348db35119 | |||
| 1dd7f243f5 | |||
| 938e499ac2 | |||
| 964ab39656 | |||
| c2aecc6ce9 | |||
| 439b86ce59 | |||
| eb60100297 | |||
|
|
2baf3be640 | ||
|
|
d92f8f41d0 | ||
|
|
76a9100779 | ||
|
|
1b6d592bf3 | ||
|
|
c95bbff23b | ||
|
|
4e4debd4da | ||
|
|
5839f870b7 | ||
|
|
b447717a5a | ||
|
|
f6f4923ac9 | ||
|
|
c394845b34 | ||
|
|
3472afea32 | ||
|
|
942f11937f | ||
|
|
b5b34983f1 | ||
| 45221d1e9a | |||
| 3869788bac |
@@ -36,6 +36,8 @@ var supportBundleCommands = []struct {
|
|||||||
{name: "system/dmesg-tail.txt", cmd: []string{"sh", "-c", "dmesg | tail -n 200"}},
|
{name: "system/dmesg-tail.txt", cmd: []string{"sh", "-c", "dmesg | tail -n 200"}},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const supportBundleGlob = "bee-support-*.tar.gz"
|
||||||
|
|
||||||
func BuildSupportBundle(exportDir string) (string, error) {
|
func BuildSupportBundle(exportDir string) (string, error) {
|
||||||
exportDir = strings.TrimSpace(exportDir)
|
exportDir = strings.TrimSpace(exportDir)
|
||||||
if exportDir == "" {
|
if exportDir == "" {
|
||||||
@@ -86,34 +88,64 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
|||||||
return archivePath, nil
|
return archivePath, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func LatestSupportBundlePath() (string, error) {
|
||||||
|
return latestSupportBundlePath(os.TempDir())
|
||||||
|
}
|
||||||
|
|
||||||
func cleanupOldSupportBundles(dir string) error {
|
func cleanupOldSupportBundles(dir string) error {
|
||||||
matches, err := filepath.Glob(filepath.Join(dir, "bee-support-*.tar.gz"))
|
matches, err := filepath.Glob(filepath.Join(dir, supportBundleGlob))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
type entry struct {
|
entries := supportBundleEntries(matches)
|
||||||
path string
|
for path, mod := range entries {
|
||||||
mod time.Time
|
if time.Since(mod) > 24*time.Hour {
|
||||||
|
_ = os.Remove(path)
|
||||||
|
delete(entries, path)
|
||||||
}
|
}
|
||||||
list := make([]entry, 0, len(matches))
|
}
|
||||||
|
ordered := orderSupportBundles(entries)
|
||||||
|
if len(ordered) > 3 {
|
||||||
|
for _, old := range ordered[3:] {
|
||||||
|
_ = os.Remove(old)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func latestSupportBundlePath(dir string) (string, error) {
|
||||||
|
matches, err := filepath.Glob(filepath.Join(dir, supportBundleGlob))
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
ordered := orderSupportBundles(supportBundleEntries(matches))
|
||||||
|
if len(ordered) == 0 {
|
||||||
|
return "", os.ErrNotExist
|
||||||
|
}
|
||||||
|
return ordered[0], nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func supportBundleEntries(matches []string) map[string]time.Time {
|
||||||
|
entries := make(map[string]time.Time, len(matches))
|
||||||
for _, match := range matches {
|
for _, match := range matches {
|
||||||
info, err := os.Stat(match)
|
info, err := os.Stat(match)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if time.Since(info.ModTime()) > 24*time.Hour {
|
entries[match] = info.ModTime()
|
||||||
_ = os.Remove(match)
|
|
||||||
continue
|
|
||||||
}
|
}
|
||||||
list = append(list, entry{path: match, mod: info.ModTime()})
|
return entries
|
||||||
|
}
|
||||||
|
|
||||||
|
func orderSupportBundles(entries map[string]time.Time) []string {
|
||||||
|
ordered := make([]string, 0, len(entries))
|
||||||
|
for path := range entries {
|
||||||
|
ordered = append(ordered, path)
|
||||||
}
|
}
|
||||||
sort.Slice(list, func(i, j int) bool { return list[i].mod.After(list[j].mod) })
|
sort.Slice(ordered, func(i, j int) bool {
|
||||||
if len(list) > 3 {
|
return entries[ordered[i]].After(entries[ordered[j]])
|
||||||
for _, old := range list[3:] {
|
})
|
||||||
_ = os.Remove(old.path)
|
return ordered
|
||||||
}
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func writeJournalDump(dst string) error {
|
func writeJournalDump(dst string) error {
|
||||||
|
|||||||
@@ -21,6 +21,10 @@ type nvidiaGPUInfo struct {
|
|||||||
ECCUncorrected *int64
|
ECCUncorrected *int64
|
||||||
ECCCorrected *int64
|
ECCCorrected *int64
|
||||||
HWSlowdown *bool
|
HWSlowdown *bool
|
||||||
|
PCIeLinkGenCurrent *int
|
||||||
|
PCIeLinkGenMax *int
|
||||||
|
PCIeLinkWidthCur *int
|
||||||
|
PCIeLinkWidthMax *int
|
||||||
}
|
}
|
||||||
|
|
||||||
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
|
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
|
||||||
@@ -94,7 +98,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
|||||||
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
|
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
|
||||||
out, err := exec.Command(
|
out, err := exec.Command(
|
||||||
"nvidia-smi",
|
"nvidia-smi",
|
||||||
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown",
|
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
|
||||||
"--format=csv,noheader,nounits",
|
"--format=csv,noheader,nounits",
|
||||||
).Output()
|
).Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -118,8 +122,8 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
|||||||
if len(rec) == 0 {
|
if len(rec) == 0 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if len(rec) < 9 {
|
if len(rec) < 13 {
|
||||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 9", len(rec))
|
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 13", len(rec))
|
||||||
}
|
}
|
||||||
|
|
||||||
bdf := normalizePCIeBDF(rec[1])
|
bdf := normalizePCIeBDF(rec[1])
|
||||||
@@ -136,6 +140,10 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
|||||||
ECCUncorrected: parseMaybeInt64(rec[6]),
|
ECCUncorrected: parseMaybeInt64(rec[6]),
|
||||||
ECCCorrected: parseMaybeInt64(rec[7]),
|
ECCCorrected: parseMaybeInt64(rec[7]),
|
||||||
HWSlowdown: parseMaybeBool(rec[8]),
|
HWSlowdown: parseMaybeBool(rec[8]),
|
||||||
|
PCIeLinkGenCurrent: parseMaybeInt(rec[9]),
|
||||||
|
PCIeLinkGenMax: parseMaybeInt(rec[10]),
|
||||||
|
PCIeLinkWidthCur: parseMaybeInt(rec[11]),
|
||||||
|
PCIeLinkWidthMax: parseMaybeInt(rec[12]),
|
||||||
}
|
}
|
||||||
result[bdf] = info
|
result[bdf] = info
|
||||||
}
|
}
|
||||||
@@ -167,6 +175,22 @@ func parseMaybeInt64(v string) *int64 {
|
|||||||
return &n
|
return &n
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// parseMaybeInt converts a single textual field into an int.
//
// Surrounding whitespace is ignored. It returns nil when the field is empty,
// is one of the "value unavailable" placeholders ("N/A", "[N/A]",
// "Not Supported", "[Not Supported]", matched case-insensitively), or is not a
// valid base-10 integer. Otherwise it returns a pointer to the parsed value.
func parseMaybeInt(v string) *int {
	v = strings.TrimSpace(v)
	// Placeholders are listed explicitly to document the expected inputs,
	// even though strconv.Atoi below would reject them as well.
	switch strings.ToLower(v) {
	case "", "n/a", "[n/a]", "not supported", "[not supported]":
		return nil
	}
	n, err := strconv.Atoi(v)
	if err != nil {
		return nil
	}
	return &n
}
|
||||||
|
|
||||||
|
// pcieLinkGenLabel formats a PCIe generation number as a "Gen<N>" label,
// for example 4 becomes "Gen4". The number is not range-checked.
func pcieLinkGenLabel(gen int) string {
	return "Gen" + strconv.Itoa(gen)
}
|
||||||
|
|
||||||
func parseMaybeBool(v string) *bool {
|
func parseMaybeBool(v string) *bool {
|
||||||
v = strings.TrimSpace(strings.ToLower(v))
|
v = strings.TrimSpace(strings.ToLower(v))
|
||||||
switch v {
|
switch v {
|
||||||
@@ -231,4 +255,22 @@ func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
|||||||
if info.HWSlowdown != nil {
|
if info.HWSlowdown != nil {
|
||||||
dev.HWSlowdown = info.HWSlowdown
|
dev.HWSlowdown = info.HWSlowdown
|
||||||
}
|
}
|
||||||
|
// Override PCIe link speed/width with nvidia-smi driver values.
|
||||||
|
// sysfs current_link_speed reflects the instantaneous physical link state and
|
||||||
|
// can show Gen1 when the GPU is idle due to ASPM power management. The driver
|
||||||
|
// knows the negotiated speed regardless of the current power state.
|
||||||
|
if info.PCIeLinkGenCurrent != nil {
|
||||||
|
s := pcieLinkGenLabel(*info.PCIeLinkGenCurrent)
|
||||||
|
dev.LinkSpeed = &s
|
||||||
|
}
|
||||||
|
if info.PCIeLinkGenMax != nil {
|
||||||
|
s := pcieLinkGenLabel(*info.PCIeLinkGenMax)
|
||||||
|
dev.MaxLinkSpeed = &s
|
||||||
|
}
|
||||||
|
if info.PCIeLinkWidthCur != nil {
|
||||||
|
dev.LinkWidth = info.PCIeLinkWidthCur
|
||||||
|
}
|
||||||
|
if info.PCIeLinkWidthMax != nil {
|
||||||
|
dev.MaxLinkWidth = info.PCIeLinkWidthMax
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func TestParseNVIDIASMIQuery(t *testing.T) {
|
func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||||
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active\n"
|
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
|
||||||
byBDF, err := parseNVIDIASMIQuery(raw)
|
byBDF, err := parseNVIDIASMIQuery(raw)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("parse failed: %v", err)
|
t.Fatalf("parse failed: %v", err)
|
||||||
@@ -28,6 +28,12 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
|
|||||||
if gpu.HWSlowdown == nil || *gpu.HWSlowdown {
|
if gpu.HWSlowdown == nil || *gpu.HWSlowdown {
|
||||||
t.Fatalf("hw slowdown: got %v, want false", gpu.HWSlowdown)
|
t.Fatalf("hw slowdown: got %v, want false", gpu.HWSlowdown)
|
||||||
}
|
}
|
||||||
|
if gpu.PCIeLinkGenCurrent == nil || *gpu.PCIeLinkGenCurrent != 4 {
|
||||||
|
t.Fatalf("pcie link gen current: got %v, want 4", gpu.PCIeLinkGenCurrent)
|
||||||
|
}
|
||||||
|
if gpu.PCIeLinkGenMax == nil || *gpu.PCIeLinkGenMax != 4 {
|
||||||
|
t.Fatalf("pcie link gen max: got %v, want 4", gpu.PCIeLinkGenMax)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestNormalizePCIeBDF(t *testing.T) {
|
func TestNormalizePCIeBDF(t *testing.T) {
|
||||||
|
|||||||
@@ -77,11 +77,24 @@ func discoverStorageDevices() []lsblkDevice {
|
|||||||
if dev.Type != "disk" {
|
if dev.Type != "disk" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
if isVirtualBMCDisk(dev) {
|
||||||
|
slog.Debug("storage: skipping BMC virtual disk", "name", dev.Name, "model", dev.Model)
|
||||||
|
continue
|
||||||
|
}
|
||||||
disks = append(disks, dev)
|
disks = append(disks, dev)
|
||||||
}
|
}
|
||||||
return disks
|
return disks
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// isVirtualBMCDisk returns true for BMC/IPMI virtual USB mass storage devices
|
||||||
|
// that appear as disks but are not real hardware (e.g. iDRAC Virtual HDisk*).
|
||||||
|
// These have zero reported size, a generic fake serial, and a model name that
|
||||||
|
// starts with "Virtual HDisk".
|
||||||
|
func isVirtualBMCDisk(dev lsblkDevice) bool {
|
||||||
|
model := strings.ToLower(strings.TrimSpace(dev.Model))
|
||||||
|
return strings.HasPrefix(model, "virtual hdisk")
|
||||||
|
}
|
||||||
|
|
||||||
func lsblkDevices() []lsblkDevice {
|
func lsblkDevices() []lsblkDevice {
|
||||||
out, err := exec.Command("lsblk", "-J", "-d",
|
out, err := exec.Command("lsblk", "-J", "-d",
|
||||||
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
|
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
|
||||||
|
|||||||
@@ -68,18 +68,20 @@ func SampleLiveMetrics() LiveMetricSample {
|
|||||||
|
|
||||||
// sampleCPULoadPct reads two /proc/stat snapshots 200ms apart and returns
|
// sampleCPULoadPct reads two /proc/stat snapshots 200ms apart and returns
|
||||||
// the overall CPU utilisation percentage.
|
// the overall CPU utilisation percentage.
|
||||||
var cpuStatPrev [2]uint64 // [total, idle]
|
|
||||||
|
|
||||||
func sampleCPULoadPct() float64 {
|
func sampleCPULoadPct() float64 {
|
||||||
total, idle := readCPUStat()
|
total0, idle0 := readCPUStat()
|
||||||
if total == 0 {
|
if total0 == 0 {
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
prevTotal, prevIdle := cpuStatPrev[0], cpuStatPrev[1]
|
time.Sleep(200 * time.Millisecond)
|
||||||
cpuStatPrev = [2]uint64{total, idle}
|
total1, idle1 := readCPUStat()
|
||||||
if prevTotal == 0 {
|
if total1 == 0 {
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
return cpuLoadPctBetween(total0, idle0, total1, idle1)
|
||||||
|
}
|
||||||
|
|
||||||
|
func cpuLoadPctBetween(prevTotal, prevIdle, total, idle uint64) float64 {
|
||||||
dt := float64(total - prevTotal)
|
dt := float64(total - prevTotal)
|
||||||
di := float64(idle - prevIdle)
|
di := float64(idle - prevIdle)
|
||||||
if dt <= 0 {
|
if dt <= 0 {
|
||||||
|
|||||||
@@ -42,3 +42,53 @@ func TestCompactAmbientTempName(t *testing.T) {
|
|||||||
t.Fatalf("got %q", got)
|
t.Fatalf("got %q", got)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestCPULoadPctBetween(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
prevTotal uint64
|
||||||
|
prevIdle uint64
|
||||||
|
total uint64
|
||||||
|
idle uint64
|
||||||
|
want float64
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "busy half",
|
||||||
|
prevTotal: 100,
|
||||||
|
prevIdle: 40,
|
||||||
|
total: 200,
|
||||||
|
idle: 90,
|
||||||
|
want: 50,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "fully busy",
|
||||||
|
prevTotal: 100,
|
||||||
|
prevIdle: 40,
|
||||||
|
total: 200,
|
||||||
|
idle: 40,
|
||||||
|
want: 100,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "no progress",
|
||||||
|
prevTotal: 100,
|
||||||
|
prevIdle: 40,
|
||||||
|
total: 100,
|
||||||
|
idle: 40,
|
||||||
|
want: 0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "idle delta larger than total clamps to zero",
|
||||||
|
prevTotal: 100,
|
||||||
|
prevIdle: 40,
|
||||||
|
total: 200,
|
||||||
|
idle: 150,
|
||||||
|
want: 0,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range tests {
|
||||||
|
if got := cpuLoadPctBetween(tc.prevTotal, tc.prevIdle, tc.total, tc.idle); got != tc.want {
|
||||||
|
t.Fatalf("%s: cpuLoadPctBetween(...)=%v want %v", tc.name, got, tc.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts N
|
|||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-stress", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), []satJob{
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
||||||
job,
|
job,
|
||||||
@@ -24,6 +24,17 @@ func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts N
|
|||||||
}, logFunc)
|
}, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func nvidiaStressArchivePrefix(loader string) string {
|
||||||
|
switch strings.TrimSpace(strings.ToLower(loader)) {
|
||||||
|
case NvidiaStressLoaderJohn:
|
||||||
|
return "gpu-nvidia-john"
|
||||||
|
case NvidiaStressLoaderNCCL:
|
||||||
|
return "gpu-nvidia-nccl"
|
||||||
|
default:
|
||||||
|
return "gpu-nvidia-burn"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
||||||
selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
|
selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -84,9 +95,7 @@ func normalizeNvidiaStressOptions(opts *NvidiaStressOptions) {
|
|||||||
if opts.DurationSec <= 0 {
|
if opts.DurationSec <= 0 {
|
||||||
opts.DurationSec = 300
|
opts.DurationSec = 300
|
||||||
}
|
}
|
||||||
if opts.SizeMB <= 0 {
|
// SizeMB=0 means "auto" — bee-gpu-burn will query per-GPU memory at runtime.
|
||||||
opts.SizeMB = 64
|
|
||||||
}
|
|
||||||
switch strings.TrimSpace(strings.ToLower(opts.Loader)) {
|
switch strings.TrimSpace(strings.ToLower(opts.Loader)) {
|
||||||
case "", NvidiaStressLoaderBuiltin:
|
case "", NvidiaStressLoaderBuiltin:
|
||||||
opts.Loader = NvidiaStressLoaderBuiltin
|
opts.Loader = NvidiaStressLoaderBuiltin
|
||||||
|
|||||||
@@ -27,6 +27,7 @@ type PlatformStressCycle struct {
|
|||||||
// PlatformStressOptions controls the thermal cycling test.
|
// PlatformStressOptions controls the thermal cycling test.
|
||||||
type PlatformStressOptions struct {
|
type PlatformStressOptions struct {
|
||||||
Cycles []PlatformStressCycle
|
Cycles []PlatformStressCycle
|
||||||
|
Components []string // if empty: run all; values: "cpu", "gpu"
|
||||||
}
|
}
|
||||||
|
|
||||||
// platformStressRow is one second of telemetry.
|
// platformStressRow is one second of telemetry.
|
||||||
@@ -68,8 +69,11 @@ func (s *System) RunPlatformStress(
|
|||||||
return "", fmt.Errorf("mkdir run dir: %w", err)
|
return "", fmt.Errorf("mkdir run dir: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
hasCPU := len(opts.Components) == 0 || containsComponent(opts.Components, "cpu")
|
||||||
|
hasGPU := len(opts.Components) == 0 || containsComponent(opts.Components, "gpu")
|
||||||
|
|
||||||
vendor := s.DetectGPUVendor()
|
vendor := s.DetectGPUVendor()
|
||||||
logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s", len(opts.Cycles), vendor))
|
logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s, cpu=%v gpu=%v", len(opts.Cycles), vendor, hasCPU, hasGPU))
|
||||||
|
|
||||||
var rows []platformStressRow
|
var rows []platformStressRow
|
||||||
start := time.Now()
|
start := time.Now()
|
||||||
@@ -88,6 +92,7 @@ func (s *System) RunPlatformStress(
|
|||||||
var wg sync.WaitGroup
|
var wg sync.WaitGroup
|
||||||
|
|
||||||
// CPU stress
|
// CPU stress
|
||||||
|
if hasCPU {
|
||||||
wg.Add(1)
|
wg.Add(1)
|
||||||
go func() {
|
go func() {
|
||||||
defer wg.Done()
|
defer wg.Done()
|
||||||
@@ -98,8 +103,10 @@ func (s *System) RunPlatformStress(
|
|||||||
}
|
}
|
||||||
_ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL)
|
_ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL)
|
||||||
}()
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
// GPU stress
|
// GPU stress
|
||||||
|
if hasGPU {
|
||||||
wg.Add(1)
|
wg.Add(1)
|
||||||
go func() {
|
go func() {
|
||||||
defer wg.Done()
|
defer wg.Done()
|
||||||
@@ -109,6 +116,7 @@ func (s *System) RunPlatformStress(
|
|||||||
}
|
}
|
||||||
_ = gpuCmd.Wait()
|
_ = gpuCmd.Wait()
|
||||||
}()
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
// Monitoring goroutine for load phase
|
// Monitoring goroutine for load phase
|
||||||
loadRows := collectPhase(loadCtx, cycleNum, "load", start)
|
loadRows := collectPhase(loadCtx, cycleNum, "load", start)
|
||||||
@@ -439,7 +447,7 @@ func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
cmd := exec.CommandContext(ctx, path, "--seconds", "86400", "--size-mb", "64")
|
cmd := exec.CommandContext(ctx, path, "--seconds", "86400")
|
||||||
cmd.Stdout = nil
|
cmd.Stdout = nil
|
||||||
cmd.Stderr = nil
|
cmd.Stderr = nil
|
||||||
_ = startLowPriorityCmd(cmd, 10)
|
_ = startLowPriorityCmd(cmd, 10)
|
||||||
@@ -486,6 +494,15 @@ func platformStressMemoryMB() int {
|
|||||||
return mb
|
return mb
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// containsComponent reports whether name occurs in components. The comparison
// is an exact, case-sensitive string match; a nil or empty slice never matches.
func containsComponent(components []string, name string) bool {
	for i := 0; i < len(components); i++ {
		if components[i] == name {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
func packPlatformDir(dir, dest string) error {
|
func packPlatformDir(dir, dest string) error {
|
||||||
f, err := os.Create(dest)
|
f, err := os.Create(dest)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"syscall"
|
||||||
"sort"
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
@@ -531,6 +532,13 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string
|
|||||||
}
|
}
|
||||||
|
|
||||||
c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...)
|
c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...)
|
||||||
|
c.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||||
|
c.Cancel = func() error {
|
||||||
|
if c.Process != nil {
|
||||||
|
_ = syscall.Kill(-c.Process.Pid, syscall.SIGKILL)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
if len(env) > 0 {
|
if len(env) > 0 {
|
||||||
c.Env = append(os.Environ(), env...)
|
c.Env = append(os.Environ(), env...)
|
||||||
}
|
}
|
||||||
@@ -684,7 +692,11 @@ func resolveSATCommand(cmd []string) ([]string, error) {
|
|||||||
case "rvs":
|
case "rvs":
|
||||||
return resolveRVSCommand(cmd[1:]...)
|
return resolveRVSCommand(cmd[1:]...)
|
||||||
}
|
}
|
||||||
return cmd, nil
|
path, err := satLookPath(cmd[0])
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("%s not found in PATH: %w", cmd[0], err)
|
||||||
|
}
|
||||||
|
return append([]string{path}, cmd[1:]...), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func resolveRVSCommand(args ...string) ([]string, error) {
|
func resolveRVSCommand(args ...string) ([]string, error) {
|
||||||
|
|||||||
@@ -51,6 +51,18 @@ type FanStressRow struct {
|
|||||||
SysPowerW float64 // DCMI system power reading
|
SysPowerW float64 // DCMI system power reading
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type cachedPowerReading struct {
|
||||||
|
Value float64
|
||||||
|
UpdatedAt time.Time
|
||||||
|
}
|
||||||
|
|
||||||
|
var (
|
||||||
|
systemPowerCacheMu sync.Mutex
|
||||||
|
systemPowerCache cachedPowerReading
|
||||||
|
)
|
||||||
|
|
||||||
|
const systemPowerHoldTTL = 15 * time.Second
|
||||||
|
|
||||||
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
||||||
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
||||||
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
||||||
@@ -508,11 +520,17 @@ func sampleCPUTempViaSensors() float64 {
|
|||||||
|
|
||||||
// sampleSystemPower reads system power draw via DCMI.
|
// sampleSystemPower reads system power draw via DCMI.
|
||||||
func sampleSystemPower() float64 {
|
func sampleSystemPower() float64 {
|
||||||
|
now := time.Now()
|
||||||
|
current := 0.0
|
||||||
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
|
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
|
||||||
if err != nil {
|
if err == nil {
|
||||||
return 0
|
current = parseDCMIPowerReading(string(out))
|
||||||
}
|
}
|
||||||
return parseDCMIPowerReading(string(out))
|
systemPowerCacheMu.Lock()
|
||||||
|
defer systemPowerCacheMu.Unlock()
|
||||||
|
value, updated := effectiveSystemPowerReading(systemPowerCache, current, now)
|
||||||
|
systemPowerCache = updated
|
||||||
|
return value
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
|
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
|
||||||
@@ -535,6 +553,17 @@ func parseDCMIPowerReading(raw string) float64 {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) {
|
||||||
|
if current > 0 {
|
||||||
|
cache = cachedPowerReading{Value: current, UpdatedAt: now}
|
||||||
|
return current, cache
|
||||||
|
}
|
||||||
|
if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {
|
||||||
|
return cache.Value, cache
|
||||||
|
}
|
||||||
|
return 0, cache
|
||||||
|
}
|
||||||
|
|
||||||
// analyzeThrottling returns true if any GPU reported an active throttle reason
|
// analyzeThrottling returns true if any GPU reported an active throttle reason
|
||||||
// during either load phase.
|
// during either load phase.
|
||||||
func analyzeThrottling(rows []FanStressRow) bool {
|
func analyzeThrottling(rows []FanStressRow) bool {
|
||||||
|
|||||||
@@ -1,6 +1,9 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import "testing"
|
import (
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
func TestParseFanSpeeds(t *testing.T) {
|
func TestParseFanSpeeds(t *testing.T) {
|
||||||
raw := "FAN1 | 2400.000 | RPM | ok\nFAN2 | 1800 RPM | ok | ok\nFAN3 | na | RPM | ns\n"
|
raw := "FAN1 | 2400.000 | RPM | ok\nFAN2 | 1800 RPM | ok | ok\nFAN3 | na | RPM | ns\n"
|
||||||
@@ -25,3 +28,40 @@ func TestFirstFanInputValue(t *testing.T) {
|
|||||||
t.Fatalf("got=%v ok=%v", got, ok)
|
t.Fatalf("got=%v ok=%v", got, ok)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParseDCMIPowerReading(t *testing.T) {
|
||||||
|
raw := `
|
||||||
|
Instantaneous power reading: 512 Watts
|
||||||
|
Minimum during sampling period: 498 Watts
|
||||||
|
`
|
||||||
|
if got := parseDCMIPowerReading(raw); got != 512 {
|
||||||
|
t.Fatalf("parseDCMIPowerReading()=%v want 512", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEffectiveSystemPowerReading(t *testing.T) {
|
||||||
|
now := time.Now()
|
||||||
|
cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}
|
||||||
|
|
||||||
|
got, updated := effectiveSystemPowerReading(cache, 0, now)
|
||||||
|
if got != 480 {
|
||||||
|
t.Fatalf("got=%v want cached 480", got)
|
||||||
|
}
|
||||||
|
if updated.Value != 480 {
|
||||||
|
t.Fatalf("updated=%+v", updated)
|
||||||
|
}
|
||||||
|
|
||||||
|
got, updated = effectiveSystemPowerReading(cache, 530, now)
|
||||||
|
if got != 530 {
|
||||||
|
t.Fatalf("got=%v want 530", got)
|
||||||
|
}
|
||||||
|
if updated.Value != 530 {
|
||||||
|
t.Fatalf("updated=%+v", updated)
|
||||||
|
}
|
||||||
|
|
||||||
|
expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
|
||||||
|
got, _ = effectiveSystemPowerReading(expired, 0, now)
|
||||||
|
if got != 0 {
|
||||||
|
t.Fatalf("expired cache returned %v want 0", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -162,6 +162,25 @@ func TestBuildNvidiaStressJobUsesNCCLLoader(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
loader string
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{loader: NvidiaStressLoaderBuiltin, want: "gpu-nvidia-burn"},
|
||||||
|
{loader: NvidiaStressLoaderJohn, want: "gpu-nvidia-john"},
|
||||||
|
{loader: NvidiaStressLoaderNCCL, want: "gpu-nvidia-nccl"},
|
||||||
|
{loader: "", want: "gpu-nvidia-burn"},
|
||||||
|
}
|
||||||
|
for _, tt := range tests {
|
||||||
|
if got := nvidiaStressArchivePrefix(tt.loader); got != tt.want {
|
||||||
|
t.Fatalf("loader=%q prefix=%q want %q", tt.loader, got, tt.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestEnvIntFallback(t *testing.T) {
|
func TestEnvIntFallback(t *testing.T) {
|
||||||
os.Unsetenv("BEE_MEMTESTER_SIZE_MB")
|
os.Unsetenv("BEE_MEMTESTER_SIZE_MB")
|
||||||
if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 {
|
if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 {
|
||||||
@@ -237,6 +256,44 @@ func TestResolveROCmSMICommandFromPATH(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestResolveSATCommandUsesLookPathForGenericTools(t *testing.T) {
|
||||||
|
oldLookPath := satLookPath
|
||||||
|
satLookPath = func(file string) (string, error) {
|
||||||
|
if file == "stress-ng" {
|
||||||
|
return "/usr/bin/stress-ng", nil
|
||||||
|
}
|
||||||
|
return "", exec.ErrNotFound
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { satLookPath = oldLookPath })
|
||||||
|
|
||||||
|
cmd, err := resolveSATCommand([]string{"stress-ng", "--cpu", "0"})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("resolveSATCommand error: %v", err)
|
||||||
|
}
|
||||||
|
if len(cmd) != 3 {
|
||||||
|
t.Fatalf("cmd len=%d want 3 (%v)", len(cmd), cmd)
|
||||||
|
}
|
||||||
|
if cmd[0] != "/usr/bin/stress-ng" {
|
||||||
|
t.Fatalf("cmd[0]=%q want /usr/bin/stress-ng", cmd[0])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResolveSATCommandFailsForMissingGenericTool(t *testing.T) {
|
||||||
|
oldLookPath := satLookPath
|
||||||
|
satLookPath = func(file string) (string, error) {
|
||||||
|
return "", exec.ErrNotFound
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { satLookPath = oldLookPath })
|
||||||
|
|
||||||
|
_, err := resolveSATCommand([]string{"stress-ng", "--cpu", "0"})
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected error")
|
||||||
|
}
|
||||||
|
if !strings.Contains(err.Error(), "stress-ng not found in PATH") {
|
||||||
|
t.Fatalf("error=%q", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestResolveROCmSMICommandFallsBackToROCmTree(t *testing.T) {
|
func TestResolveROCmSMICommandFallsBackToROCmTree(t *testing.T) {
|
||||||
tmp := t.TempDir()
|
tmp := t.TempDir()
|
||||||
execPath := filepath.Join(tmp, "opt", "rocm", "bin", "rocm-smi")
|
execPath := filepath.Join(tmp, "opt", "rocm", "bin", "rocm-smi")
|
||||||
|
|||||||
@@ -2,11 +2,12 @@ package webui
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
"context"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"net/http"
|
"net/http"
|
||||||
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"regexp"
|
"regexp"
|
||||||
@@ -85,15 +86,16 @@ func streamJob(w http.ResponseWriter, r *http.Request, j *jobState) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// runCmdJob runs an exec.Cmd as a background job, streaming stdout+stderr lines.
|
// streamCmdJob runs an exec.Cmd and streams stdout+stderr lines into j.
|
||||||
func runCmdJob(j *jobState, cmd *exec.Cmd) {
|
func streamCmdJob(j *jobState, cmd *exec.Cmd) error {
|
||||||
pr, pw := io.Pipe()
|
pr, pw := io.Pipe()
|
||||||
cmd.Stdout = pw
|
cmd.Stdout = pw
|
||||||
cmd.Stderr = pw
|
cmd.Stderr = pw
|
||||||
|
|
||||||
if err := cmd.Start(); err != nil {
|
if err := cmd.Start(); err != nil {
|
||||||
j.finish(err.Error())
|
_ = pw.Close()
|
||||||
return
|
_ = pr.Close()
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
// Lower the CPU scheduling priority of stress/audit subprocesses to nice+10
|
// Lower the CPU scheduling priority of stress/audit subprocesses to nice+10
|
||||||
// so the X server and kernel interrupt handling remain responsive under load
|
// so the X server and kernel interrupt handling remain responsive under load
|
||||||
@@ -102,8 +104,10 @@ func runCmdJob(j *jobState, cmd *exec.Cmd) {
|
|||||||
_ = syscall.Setpriority(syscall.PRIO_PROCESS, cmd.Process.Pid, 10)
|
_ = syscall.Setpriority(syscall.PRIO_PROCESS, cmd.Process.Pid, 10)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
scanDone := make(chan error, 1)
|
||||||
go func() {
|
go func() {
|
||||||
scanner := bufio.NewScanner(pr)
|
scanner := bufio.NewScanner(pr)
|
||||||
|
scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
// Split on \r to handle progress-bar style output (e.g. \r overwrites)
|
// Split on \r to handle progress-bar style output (e.g. \r overwrites)
|
||||||
// and strip ANSI escape codes so logs are readable in the browser.
|
// and strip ANSI escape codes so logs are readable in the browser.
|
||||||
@@ -115,15 +119,21 @@ func runCmdJob(j *jobState, cmd *exec.Cmd) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if err := scanner.Err(); err != nil && !errors.Is(err, io.ErrClosedPipe) {
|
||||||
|
scanDone <- err
|
||||||
|
return
|
||||||
|
}
|
||||||
|
scanDone <- nil
|
||||||
}()
|
}()
|
||||||
|
|
||||||
err := cmd.Wait()
|
err := cmd.Wait()
|
||||||
_ = pw.Close()
|
_ = pw.Close()
|
||||||
|
scanErr := <-scanDone
|
||||||
|
_ = pr.Close()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
j.finish(err.Error())
|
return err
|
||||||
} else {
|
|
||||||
j.finish("")
|
|
||||||
}
|
}
|
||||||
|
return scanErr
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Audit ─────────────────────────────────────────────────────────────────────
|
// ── Audit ─────────────────────────────────────────────────────────────────────
|
||||||
@@ -178,20 +188,16 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
|||||||
Loader string `json:"loader"`
|
Loader string `json:"loader"`
|
||||||
Profile string `json:"profile"`
|
Profile string `json:"profile"`
|
||||||
DisplayName string `json:"display_name"`
|
DisplayName string `json:"display_name"`
|
||||||
|
PlatformComponents []string `json:"platform_components"`
|
||||||
|
}
|
||||||
|
if r.Body != nil {
|
||||||
|
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
|
||||||
|
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||||
|
return
|
||||||
}
|
}
|
||||||
if r.ContentLength > 0 {
|
|
||||||
_ = json.NewDecoder(r.Body).Decode(&body)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
name := taskNames[target]
|
name := taskDisplayName(target, body.Profile, body.Loader)
|
||||||
if body.Profile != "" {
|
|
||||||
if n, ok := burnNames[target]; ok {
|
|
||||||
name = n
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if name == "" {
|
|
||||||
name = target
|
|
||||||
}
|
|
||||||
t := &Task{
|
t := &Task{
|
||||||
ID: newJobID("sat-" + target),
|
ID: newJobID("sat-" + target),
|
||||||
Name: name,
|
Name: name,
|
||||||
@@ -206,6 +212,7 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
|||||||
Loader: body.Loader,
|
Loader: body.Loader,
|
||||||
BurnProfile: body.Profile,
|
BurnProfile: body.Profile,
|
||||||
DisplayName: body.DisplayName,
|
DisplayName: body.DisplayName,
|
||||||
|
PlatformComponents: body.PlatformComponents,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
if strings.TrimSpace(body.DisplayName) != "" {
|
if strings.TrimSpace(body.DisplayName) != "" {
|
||||||
@@ -341,6 +348,8 @@ func (h *handler) handleAPINetworkStatus(w http.ResponseWriter, r *http.Request)
|
|||||||
writeJSON(w, map[string]any{
|
writeJSON(w, map[string]any{
|
||||||
"interfaces": ifaces,
|
"interfaces": ifaces,
|
||||||
"default_route": h.opts.App.DefaultRoute(),
|
"default_route": h.opts.App.DefaultRoute(),
|
||||||
|
"pending_change": h.hasPendingNetworkChange(),
|
||||||
|
"rollback_in": h.pendingNetworkRollbackIn(),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -420,14 +429,22 @@ func (h *handler) handleAPIExportList(w http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (h *handler) handleAPIExportBundle(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleAPIExportBundle(w http.ResponseWriter, r *http.Request) {
|
||||||
archive, err := app.BuildSupportBundle(h.opts.ExportDir)
|
if globalQueue.hasActiveTarget("support-bundle") {
|
||||||
if err != nil {
|
writeError(w, http.StatusConflict, "support bundle task is already pending or running")
|
||||||
writeError(w, http.StatusInternalServerError, err.Error())
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
t := &Task{
|
||||||
|
ID: newJobID("support-bundle"),
|
||||||
|
Name: "Support Bundle",
|
||||||
|
Target: "support-bundle",
|
||||||
|
Status: TaskPending,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
}
|
||||||
|
globalQueue.enqueue(t)
|
||||||
writeJSON(w, map[string]string{
|
writeJSON(w, map[string]string{
|
||||||
"status": "ok",
|
"status": "queued",
|
||||||
"path": archive,
|
"task_id": t.ID,
|
||||||
|
"job_id": t.ID,
|
||||||
"url": "/export/support.tar.gz",
|
"url": "/export/support.tar.gz",
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -499,6 +516,26 @@ func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── GPU tools ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func (h *handler) handleAPIGPUTools(w http.ResponseWriter, _ *http.Request) {
|
||||||
|
type toolEntry struct {
|
||||||
|
ID string `json:"id"`
|
||||||
|
Available bool `json:"available"`
|
||||||
|
Vendor string `json:"vendor"` // "nvidia" | "amd"
|
||||||
|
}
|
||||||
|
_, nvidiaErr := os.Stat("/dev/nvidia0")
|
||||||
|
_, amdErr := os.Stat("/dev/kfd")
|
||||||
|
nvidiaUp := nvidiaErr == nil
|
||||||
|
amdUp := amdErr == nil
|
||||||
|
writeJSON(w, []toolEntry{
|
||||||
|
{ID: "bee-gpu-burn", Available: nvidiaUp, Vendor: "nvidia"},
|
||||||
|
{ID: "john", Available: nvidiaUp, Vendor: "nvidia"},
|
||||||
|
{ID: "nccl", Available: nvidiaUp, Vendor: "nvidia"},
|
||||||
|
{ID: "rvs", Available: amdUp, Vendor: "amd"},
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
// ── System ────────────────────────────────────────────────────────────────────
|
// ── System ────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
|
||||||
@@ -516,10 +553,7 @@ func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request)
|
|||||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
h.installMu.Lock()
|
if globalQueue.hasActiveTarget("install") {
|
||||||
installRunning := h.installJob != nil && !h.installJob.isDone()
|
|
||||||
h.installMu.Unlock()
|
|
||||||
if installRunning {
|
|
||||||
writeError(w, http.StatusConflict, "install to disk is already running")
|
writeError(w, http.StatusConflict, "install to disk is already running")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -634,35 +668,23 @@ func (h *handler) handleAPIInstallRun(w http.ResponseWriter, r *http.Request) {
|
|||||||
writeError(w, http.StatusConflict, "install to RAM task is already pending or running")
|
writeError(w, http.StatusConflict, "install to RAM task is already pending or running")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
if globalQueue.hasActiveTarget("install") {
|
||||||
h.installMu.Lock()
|
writeError(w, http.StatusConflict, "install task is already pending or running")
|
||||||
if h.installJob != nil && !h.installJob.isDone() {
|
|
||||||
h.installMu.Unlock()
|
|
||||||
writeError(w, http.StatusConflict, "install already running")
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
j := &jobState{}
|
t := &Task{
|
||||||
h.installJob = j
|
ID: newJobID("install"),
|
||||||
h.installMu.Unlock()
|
Name: "Install to Disk",
|
||||||
|
Target: "install",
|
||||||
logFile := platform.InstallLogPath(req.Device)
|
Priority: 20,
|
||||||
go runCmdJob(j, exec.CommandContext(context.Background(), "bee-install", req.Device, logFile))
|
Status: TaskPending,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
w.WriteHeader(http.StatusNoContent)
|
params: taskParams{
|
||||||
}
|
Device: req.Device,
|
||||||
|
},
|
||||||
func (h *handler) handleAPIInstallStream(w http.ResponseWriter, r *http.Request) {
|
|
||||||
h.installMu.Lock()
|
|
||||||
j := h.installJob
|
|
||||||
h.installMu.Unlock()
|
|
||||||
if j == nil {
|
|
||||||
if !sseStart(w) {
|
|
||||||
return
|
|
||||||
}
|
}
|
||||||
sseWrite(w, "done", "")
|
globalQueue.enqueue(t)
|
||||||
return
|
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
|
||||||
}
|
|
||||||
streamJob(w, r, j)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Metrics SSE ───────────────────────────────────────────────────────────────
|
// ── Metrics SSE ───────────────────────────────────────────────────────────────
|
||||||
@@ -724,13 +746,7 @@ func (h *handler) feedRings(sample platform.LiveMetricSample) {
|
|||||||
h.ringMemLoad.push(sample.MemLoadPct)
|
h.ringMemLoad.push(sample.MemLoadPct)
|
||||||
|
|
||||||
h.ringsMu.Lock()
|
h.ringsMu.Lock()
|
||||||
for i, fan := range sample.Fans {
|
h.pushFanRings(sample.Fans)
|
||||||
for len(h.ringFans) <= i {
|
|
||||||
h.ringFans = append(h.ringFans, newMetricsRing(120))
|
|
||||||
h.fanNames = append(h.fanNames, fan.Name)
|
|
||||||
}
|
|
||||||
h.ringFans[i].push(float64(fan.RPM))
|
|
||||||
}
|
|
||||||
for _, gpu := range sample.GPUs {
|
for _, gpu := range sample.GPUs {
|
||||||
idx := gpu.GPUIndex
|
idx := gpu.GPUIndex
|
||||||
for len(h.gpuRings) <= idx {
|
for len(h.gpuRings) <= idx {
|
||||||
@@ -749,6 +765,51 @@ func (h *handler) feedRings(sample platform.LiveMetricSample) {
|
|||||||
h.ringsMu.Unlock()
|
h.ringsMu.Unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (h *handler) pushFanRings(fans []platform.FanReading) {
|
||||||
|
if len(fans) == 0 && len(h.ringFans) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
fanValues := make(map[string]float64, len(fans))
|
||||||
|
for _, fan := range fans {
|
||||||
|
if fan.Name == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
fanValues[fan.Name] = fan.RPM
|
||||||
|
found := false
|
||||||
|
for i, name := range h.fanNames {
|
||||||
|
if name == fan.Name {
|
||||||
|
found = true
|
||||||
|
if i >= len(h.ringFans) {
|
||||||
|
h.ringFans = append(h.ringFans, newMetricsRing(120))
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !found {
|
||||||
|
h.fanNames = append(h.fanNames, fan.Name)
|
||||||
|
h.ringFans = append(h.ringFans, newMetricsRing(120))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for i, ring := range h.ringFans {
|
||||||
|
if ring == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
name := ""
|
||||||
|
if i < len(h.fanNames) {
|
||||||
|
name = h.fanNames[i]
|
||||||
|
}
|
||||||
|
if rpm, ok := fanValues[name]; ok {
|
||||||
|
ring.push(rpm)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if last, ok := ring.latest(); ok {
|
||||||
|
ring.push(last)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
ring.push(0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func (h *handler) pushNamedMetricRing(dst *[]*namedMetricsRing, name string, value float64) {
|
func (h *handler) pushNamedMetricRing(dst *[]*namedMetricsRing, name string, value float64) {
|
||||||
if name == "" {
|
if name == "" {
|
||||||
return
|
return
|
||||||
@@ -827,7 +888,10 @@ func (h *handler) applyPendingNetworkChange(apply func() (app.ActionResult, erro
|
|||||||
return result, err
|
return result, err
|
||||||
}
|
}
|
||||||
|
|
||||||
pnc := &pendingNetChange{snapshot: snapshot}
|
pnc := &pendingNetChange{
|
||||||
|
snapshot: snapshot,
|
||||||
|
deadline: time.Now().Add(netRollbackTimeout),
|
||||||
|
}
|
||||||
pnc.timer = time.AfterFunc(netRollbackTimeout, func() {
|
pnc.timer = time.AfterFunc(netRollbackTimeout, func() {
|
||||||
_ = h.opts.App.RestoreNetworkSnapshot(snapshot)
|
_ = h.opts.App.RestoreNetworkSnapshot(snapshot)
|
||||||
h.pendingNetMu.Lock()
|
h.pendingNetMu.Lock()
|
||||||
@@ -844,6 +908,25 @@ func (h *handler) applyPendingNetworkChange(apply func() (app.ActionResult, erro
|
|||||||
return result, nil
|
return result, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (h *handler) hasPendingNetworkChange() bool {
|
||||||
|
h.pendingNetMu.Lock()
|
||||||
|
defer h.pendingNetMu.Unlock()
|
||||||
|
return h.pendingNet != nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) pendingNetworkRollbackIn() int {
|
||||||
|
h.pendingNetMu.Lock()
|
||||||
|
defer h.pendingNetMu.Unlock()
|
||||||
|
if h.pendingNet == nil {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
remaining := int(time.Until(h.pendingNet.deadline).Seconds())
|
||||||
|
if remaining < 1 {
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
return remaining
|
||||||
|
}
|
||||||
|
|
||||||
func (h *handler) handleAPINetworkConfirm(w http.ResponseWriter, _ *http.Request) {
|
func (h *handler) handleAPINetworkConfirm(w http.ResponseWriter, _ *http.Request) {
|
||||||
h.pendingNetMu.Lock()
|
h.pendingNetMu.Lock()
|
||||||
pnc := h.pendingNet
|
pnc := h.pendingNet
|
||||||
@@ -933,8 +1016,31 @@ func parseXrandrOutput(out string) []displayInfo {
|
|||||||
return infos
|
return infos
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func xrandrCommand(args ...string) *exec.Cmd {
|
||||||
|
cmd := exec.Command("xrandr", args...)
|
||||||
|
env := append([]string{}, os.Environ()...)
|
||||||
|
hasDisplay := false
|
||||||
|
hasXAuthority := false
|
||||||
|
for _, kv := range env {
|
||||||
|
if strings.HasPrefix(kv, "DISPLAY=") && strings.TrimPrefix(kv, "DISPLAY=") != "" {
|
||||||
|
hasDisplay = true
|
||||||
|
}
|
||||||
|
if strings.HasPrefix(kv, "XAUTHORITY=") && strings.TrimPrefix(kv, "XAUTHORITY=") != "" {
|
||||||
|
hasXAuthority = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !hasDisplay {
|
||||||
|
env = append(env, "DISPLAY=:0")
|
||||||
|
}
|
||||||
|
if !hasXAuthority {
|
||||||
|
env = append(env, "XAUTHORITY=/home/bee/.Xauthority")
|
||||||
|
}
|
||||||
|
cmd.Env = env
|
||||||
|
return cmd
|
||||||
|
}
|
||||||
|
|
||||||
func (h *handler) handleAPIDisplayResolutions(w http.ResponseWriter, _ *http.Request) {
|
func (h *handler) handleAPIDisplayResolutions(w http.ResponseWriter, _ *http.Request) {
|
||||||
out, err := exec.Command("xrandr").Output()
|
out, err := xrandrCommand().Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
writeError(w, http.StatusInternalServerError, "xrandr: "+err.Error())
|
writeError(w, http.StatusInternalServerError, "xrandr: "+err.Error())
|
||||||
return
|
return
|
||||||
@@ -961,7 +1067,7 @@ func (h *handler) handleAPIDisplaySet(w http.ResponseWriter, r *http.Request) {
|
|||||||
writeError(w, http.StatusBadRequest, "invalid output name")
|
writeError(w, http.StatusBadRequest, "invalid output name")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if out, err := exec.Command("xrandr", "--output", req.Output, "--mode", req.Mode).CombinedOutput(); err != nil {
|
if out, err := xrandrCommand("--output", req.Output, "--mode", req.Mode).CombinedOutput(); err != nil {
|
||||||
writeError(w, http.StatusInternalServerError, "xrandr: "+strings.TrimSpace(string(out)))
|
writeError(w, http.StatusInternalServerError, "xrandr: "+strings.TrimSpace(string(out)))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|||||||
129
audit/internal/webui/api_test.go
Normal file
129
audit/internal/webui/api_test.go
Normal file
@@ -0,0 +1,129 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"net/http/httptest"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestXrandrCommandAddsDefaultX11Env(t *testing.T) {
|
||||||
|
t.Setenv("DISPLAY", "")
|
||||||
|
t.Setenv("XAUTHORITY", "")
|
||||||
|
|
||||||
|
cmd := xrandrCommand("--query")
|
||||||
|
|
||||||
|
var hasDisplay bool
|
||||||
|
var hasXAuthority bool
|
||||||
|
for _, kv := range cmd.Env {
|
||||||
|
if kv == "DISPLAY=:0" {
|
||||||
|
hasDisplay = true
|
||||||
|
}
|
||||||
|
if kv == "XAUTHORITY=/home/bee/.Xauthority" {
|
||||||
|
hasXAuthority = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !hasDisplay {
|
||||||
|
t.Fatalf("DISPLAY not injected: %v", cmd.Env)
|
||||||
|
}
|
||||||
|
if !hasXAuthority {
|
||||||
|
t.Fatalf("XAUTHORITY not injected: %v", cmd.Env)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
originalTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = nil
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = originalTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
|
||||||
|
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||||
|
req := httptest.NewRequest("POST", "/api/sat/cpu/run", strings.NewReader(`{"profile":"smoke"}`))
|
||||||
|
req.ContentLength = -1
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
|
h.handleAPISATRun("cpu").ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
if len(globalQueue.tasks) != 1 {
|
||||||
|
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[0].params.BurnProfile; got != "smoke" {
|
||||||
|
t.Fatalf("burn profile=%q want smoke", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleAPIExportBundleQueuesTask(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
originalTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = nil
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = originalTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
|
||||||
|
h := &handler{opts: HandlerOptions{ExportDir: t.TempDir()}}
|
||||||
|
req := httptest.NewRequest("POST", "/api/export/bundle", nil)
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
|
h.handleAPIExportBundle(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
var body map[string]string
|
||||||
|
if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil {
|
||||||
|
t.Fatalf("decode response: %v", err)
|
||||||
|
}
|
||||||
|
if body["task_id"] == "" {
|
||||||
|
t.Fatalf("missing task_id in response: %v", body)
|
||||||
|
}
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
if len(globalQueue.tasks) != 1 {
|
||||||
|
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[0].Target; got != "support-bundle" {
|
||||||
|
t.Fatalf("target=%q want support-bundle", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
||||||
|
h := &handler{}
|
||||||
|
h.pushFanRings([]platform.FanReading{
|
||||||
|
{Name: "FAN_A", RPM: 4200},
|
||||||
|
{Name: "FAN_B", RPM: 5100},
|
||||||
|
})
|
||||||
|
h.pushFanRings([]platform.FanReading{
|
||||||
|
{Name: "FAN_B", RPM: 5200},
|
||||||
|
})
|
||||||
|
|
||||||
|
if len(h.fanNames) != 2 || h.fanNames[0] != "FAN_A" || h.fanNames[1] != "FAN_B" {
|
||||||
|
t.Fatalf("fanNames=%v", h.fanNames)
|
||||||
|
}
|
||||||
|
aVals, _ := h.ringFans[0].snapshot()
|
||||||
|
bVals, _ := h.ringFans[1].snapshot()
|
||||||
|
if len(aVals) != 2 || len(bVals) != 2 {
|
||||||
|
t.Fatalf("fan ring lengths: A=%d B=%d", len(aVals), len(bVals))
|
||||||
|
}
|
||||||
|
if aVals[1] != 4200 {
|
||||||
|
t.Fatalf("FAN_A should carry forward last value, got %v", aVals)
|
||||||
|
}
|
||||||
|
if bVals[1] != 5200 {
|
||||||
|
t.Fatalf("FAN_B should use latest sampled value, got %v", bVals)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -4,6 +4,8 @@ import (
|
|||||||
"database/sql"
|
"database/sql"
|
||||||
"encoding/csv"
|
"encoding/csv"
|
||||||
"io"
|
"io"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
"strconv"
|
"strconv"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@@ -20,6 +22,9 @@ type MetricsDB struct {
|
|||||||
|
|
||||||
// openMetricsDB opens (or creates) the metrics database at the given path.
|
// openMetricsDB opens (or creates) the metrics database at the given path.
|
||||||
func openMetricsDB(path string) (*MetricsDB, error) {
|
func openMetricsDB(path string) (*MetricsDB, error) {
|
||||||
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
db, err := sql.Open("sqlite", path+"?_journal=WAL&_busy_timeout=5000")
|
db, err := sql.Open("sqlite", path+"?_journal=WAL&_busy_timeout=5000")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
@@ -115,7 +120,7 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
|||||||
|
|
||||||
// LoadRecent returns up to n samples in chronological order (oldest first).
|
// LoadRecent returns up to n samples in chronological order (oldest first).
|
||||||
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?`, n)
|
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
||||||
}
|
}
|
||||||
|
|
||||||
// LoadAll returns all persisted samples in chronological order (oldest first).
|
// LoadAll returns all persisted samples in chronological order (oldest first).
|
||||||
@@ -146,17 +151,15 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
|||||||
if len(sysRows) == 0 {
|
if len(sysRows) == 0 {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
// Reverse to chronological order
|
|
||||||
for i, j := 0, len(sysRows)-1; i < j; i, j = i+1, j-1 {
|
|
||||||
sysRows[i], sysRows[j] = sysRows[j], sysRows[i]
|
|
||||||
}
|
|
||||||
|
|
||||||
// Collect min/max ts for range query
|
// Collect min/max ts for range query
|
||||||
minTS := sysRows[0].ts
|
minTS := sysRows[0].ts
|
||||||
maxTS := sysRows[len(sysRows)-1].ts
|
maxTS := sysRows[len(sysRows)-1].ts
|
||||||
|
|
||||||
// Load GPU rows in range
|
// Load GPU rows in range
|
||||||
type gpuKey struct{ ts int64; idx int }
|
type gpuKey struct {
|
||||||
|
ts int64
|
||||||
|
idx int
|
||||||
|
}
|
||||||
gpuData := map[gpuKey]platform.GPUMetricRow{}
|
gpuData := map[gpuKey]platform.GPUMetricRow{}
|
||||||
gRows, err := m.db.Query(
|
gRows, err := m.db.Query(
|
||||||
`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
|
`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
|
||||||
@@ -174,7 +177,10 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Load fan rows in range
|
// Load fan rows in range
|
||||||
type fanKey struct{ ts int64; name string }
|
type fanKey struct {
|
||||||
|
ts int64
|
||||||
|
name string
|
||||||
|
}
|
||||||
fanData := map[fanKey]float64{}
|
fanData := map[fanKey]float64{}
|
||||||
fRows, err := m.db.Query(
|
fRows, err := m.db.Query(
|
||||||
`SELECT ts,name,rpm FROM fan_metrics WHERE ts>=? AND ts<=?`, minTS, maxTS,
|
`SELECT ts,name,rpm FROM fan_metrics WHERE ts>=? AND ts<=?`, minTS, maxTS,
|
||||||
@@ -192,7 +198,10 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Load temp rows in range
|
// Load temp rows in range
|
||||||
type tempKey struct{ ts int64; name string }
|
type tempKey struct {
|
||||||
|
ts int64
|
||||||
|
name string
|
||||||
|
}
|
||||||
tempData := map[tempKey]platform.TempReading{}
|
tempData := map[tempKey]platform.TempReading{}
|
||||||
tRows, err := m.db.Query(
|
tRows, err := m.db.Query(
|
||||||
`SELECT ts,name,grp,celsius FROM temp_metrics WHERE ts>=? AND ts<=?`, minTS, maxTS,
|
`SELECT ts,name,grp,celsius FROM temp_metrics WHERE ts>=? AND ts<=?`, minTS, maxTS,
|
||||||
|
|||||||
69
audit/internal/webui/metricsdb_test.go
Normal file
69
audit/internal/webui/metricsdb_test.go
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
|
||||||
|
db, err := openMetricsDB(filepath.Join(t.TempDir(), "metrics.db"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("openMetricsDB: %v", err)
|
||||||
|
}
|
||||||
|
defer db.Close()
|
||||||
|
|
||||||
|
base := time.Unix(1_700_000_000, 0).UTC()
|
||||||
|
for i := 0; i < 3; i++ {
|
||||||
|
err := db.Write(platform.LiveMetricSample{
|
||||||
|
Timestamp: base.Add(time.Duration(i) * time.Second),
|
||||||
|
CPULoadPct: float64(10 + i),
|
||||||
|
MemLoadPct: float64(20 + i),
|
||||||
|
PowerW: float64(300 + i),
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, PowerW: float64(100 + i)},
|
||||||
|
{GPUIndex: 2, PowerW: float64(200 + i)},
|
||||||
|
},
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Write(%d): %v", i, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
all, err := db.LoadAll()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("LoadAll: %v", err)
|
||||||
|
}
|
||||||
|
if len(all) != 3 {
|
||||||
|
t.Fatalf("LoadAll len=%d want 3", len(all))
|
||||||
|
}
|
||||||
|
for i, sample := range all {
|
||||||
|
if len(sample.GPUs) != 2 {
|
||||||
|
t.Fatalf("LoadAll sample %d GPUs=%v want 2 rows", i, sample.GPUs)
|
||||||
|
}
|
||||||
|
if sample.GPUs[0].GPUIndex != 0 || sample.GPUs[0].PowerW != float64(100+i) {
|
||||||
|
t.Fatalf("LoadAll sample %d GPU0=%+v", i, sample.GPUs[0])
|
||||||
|
}
|
||||||
|
if sample.GPUs[1].GPUIndex != 2 || sample.GPUs[1].PowerW != float64(200+i) {
|
||||||
|
t.Fatalf("LoadAll sample %d GPU1=%+v", i, sample.GPUs[1])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
recent, err := db.LoadRecent(2)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("LoadRecent: %v", err)
|
||||||
|
}
|
||||||
|
if len(recent) != 2 {
|
||||||
|
t.Fatalf("LoadRecent len=%d want 2", len(recent))
|
||||||
|
}
|
||||||
|
if !recent[0].Timestamp.Before(recent[1].Timestamp) {
|
||||||
|
t.Fatalf("LoadRecent timestamps not ascending: %v >= %v", recent[0].Timestamp, recent[1].Timestamp)
|
||||||
|
}
|
||||||
|
for i, sample := range recent {
|
||||||
|
if len(sample.GPUs) != 2 {
|
||||||
|
t.Fatalf("LoadRecent sample %d GPUs=%v want 2 rows", i, sample.GPUs)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -289,7 +289,7 @@ func renderAudit() string {
|
|||||||
func renderHardwareSummaryCard(opts HandlerOptions) string {
|
func renderHardwareSummaryCard(opts HandlerOptions) string {
|
||||||
data, err := loadSnapshot(opts.AuditPath)
|
data, err := loadSnapshot(opts.AuditPath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return `<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body"><span class="badge badge-unknown">No audit data</span></div></div>`
|
return `<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body"><button class="btn btn-primary" onclick="auditModalRun()">▶ Run Audit</button></div></div>`
|
||||||
}
|
}
|
||||||
// Parse just enough fields for the summary banner
|
// Parse just enough fields for the summary banner
|
||||||
var snap struct {
|
var snap struct {
|
||||||
@@ -522,13 +522,30 @@ func renderMetrics() string {
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
|
const chartIds = [
|
||||||
|
'chart-server-load','chart-server-temp-cpu','chart-server-temp-gpu','chart-server-temp-ambient','chart-server-power','chart-server-fans',
|
||||||
|
'chart-gpu-all-load','chart-gpu-all-memload','chart-gpu-all-power','chart-gpu-all-temp'
|
||||||
|
];
|
||||||
|
|
||||||
|
function refreshChartImage(el) {
|
||||||
|
if (!el || el.dataset.loading === '1') return;
|
||||||
|
const baseSrc = el.dataset.baseSrc || el.src.split('?')[0];
|
||||||
|
const nextSrc = baseSrc + '?t=' + Date.now();
|
||||||
|
const probe = new Image();
|
||||||
|
el.dataset.baseSrc = baseSrc;
|
||||||
|
el.dataset.loading = '1';
|
||||||
|
probe.onload = function() {
|
||||||
|
el.src = nextSrc;
|
||||||
|
el.dataset.loading = '0';
|
||||||
|
};
|
||||||
|
probe.onerror = function() {
|
||||||
|
el.dataset.loading = '0';
|
||||||
|
};
|
||||||
|
probe.src = nextSrc;
|
||||||
|
}
|
||||||
|
|
||||||
function refreshCharts() {
|
function refreshCharts() {
|
||||||
const t = '?t=' + Date.now();
|
chartIds.forEach(id => refreshChartImage(document.getElementById(id)));
|
||||||
['chart-server-load','chart-server-temp-cpu','chart-server-temp-gpu','chart-server-temp-ambient','chart-server-power','chart-server-fans',
|
|
||||||
'chart-gpu-all-load','chart-gpu-all-memload','chart-gpu-all-power','chart-gpu-all-temp'].forEach(id => {
|
|
||||||
const el = document.getElementById(id);
|
|
||||||
if (el) el.src = el.src.split('?')[0] + t;
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
setInterval(refreshCharts, 3000);
|
setInterval(refreshCharts, 3000);
|
||||||
|
|
||||||
@@ -657,96 +674,210 @@ func renderSATCard(id, label, extra string) string {
|
|||||||
func renderBurn() string {
|
func renderBurn() string {
|
||||||
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at maximum load. Repeated or prolonged use may reduce hardware lifespan (storage endurance, GPU wear). Use only when necessary.</div>
|
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at maximum load. Repeated or prolonged use may reduce hardware lifespan (storage endurance, GPU wear). Use only when necessary.</div>
|
||||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||||
<div class="card"><div class="card-head">Burn Profile</div><div class="card-body">
|
|
||||||
<div class="form-row" style="max-width:320px"><label>Preset</label><select id="burn-profile"><option value="smoke" selected>Smoke: quick check (~5 min CPU / DCGM level 1)</option><option value="acceptance">Acceptance: 1 hour (DCGM level 3)</option><option value="overnight">Overnight: 8 hours (DCGM level 4)</option></select></div>
|
<div class="card" style="margin-bottom:16px">
|
||||||
<p style="color:var(--muted);font-size:12px">Applied to all tests on this page. NVIDIA SAT on the Validate page still uses DCGM. NVIDIA GPU Stress on this page uses the selected stress loader for the preset duration.</p>
|
<div class="card-head">Burn Profile</div>
|
||||||
</div></div>
|
<div class="card-body" style="display:flex;align-items:center;gap:16px;flex-wrap:wrap">
|
||||||
<div class="grid3">
|
<div class="form-row" style="margin:0;max-width:380px"><label>Preset</label><select id="burn-profile">
|
||||||
<div class="card"><div class="card-head">NVIDIA GPU Stress</div><div class="card-body">
|
<option value="smoke" selected>Smoke — quick check (~5 min)</option>
|
||||||
<div class="form-row"><label>Load Tool</label><select id="nvidia-stress-loader"><option value="builtin" selected>bee-gpu-burn</option><option value="nccl">NCCL all_reduce_perf</option><option value="john">John the Ripper jumbo (OpenCL)</option></select></div>
|
<option value="acceptance">Acceptance — 1 hour</option>
|
||||||
<div class="form-row"><label>Exclude GPU indices</label><input type="text" id="nvidia-stress-exclude" placeholder="e.g. 1,3"></div>
|
<option value="overnight">Overnight — 8 hours</option>
|
||||||
<p style="color:var(--muted);font-size:12px;margin-bottom:8px"><code>bee-gpu-burn</code> runs on all detected NVIDIA GPUs by default. <code>NCCL all_reduce_perf</code> is useful for multi-GPU / interconnect load. Use exclusions only when one or more cards must be skipped.</p>
|
</select></div>
|
||||||
<button id="sat-btn-nvidia-stress" class="btn btn-primary" onclick="runBurnIn('nvidia-stress')">▶ Start NVIDIA Stress</button>
|
<button class="btn btn-primary" onclick="runAll()">▶ Run All</button>
|
||||||
</div></div>
|
<span id="burn-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||||
<div class="card"><div class="card-head">CPU Stress</div><div class="card-body">
|
</div>
|
||||||
<button class="btn btn-primary" onclick="runBurnIn('cpu')">▶ Start CPU Stress</button>
|
|
||||||
</div></div>
|
|
||||||
<div class="card"><div class="card-head">AMD GPU Stress</div><div class="card-body">
|
|
||||||
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Runs ROCm compute stress together with VRAM copy/load activity via RVS GST and records a separate <code>rocm-bandwidth-test</code> snapshot. Missing tools reported as UNSUPPORTED.</p>
|
|
||||||
<button id="sat-btn-amd-stress" class="btn btn-primary" onclick="runBurnIn('amd-stress')">▶ Start AMD Stress</button>
|
|
||||||
</div></div>
|
|
||||||
<div class="card"><div class="card-head">Memory Stress</div><div class="card-body">
|
|
||||||
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">stress-ng --vm writes and verifies memory patterns across all of RAM. Env: <code>BEE_VM_STRESS_SECONDS</code> (default 300), <code>BEE_VM_STRESS_SIZE_MB</code> (default 80%).</p>
|
|
||||||
<button class="btn btn-primary" onclick="runBurnIn('memory-stress')">▶ Start Memory Stress</button>
|
|
||||||
</div></div>
|
|
||||||
<div class="card"><div class="card-head">SAT Stress (stressapptest)</div><div class="card-body">
|
|
||||||
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Google stressapptest saturates CPU, memory and cache buses simultaneously. Env: <code>BEE_SAT_STRESS_SECONDS</code> (default 300), <code>BEE_SAT_STRESS_MB</code> (default auto).</p>
|
|
||||||
<button class="btn btn-primary" onclick="runBurnIn('sat-stress')">▶ Start SAT Stress</button>
|
|
||||||
</div></div>
|
|
||||||
<div class="card"><div class="card-head">Platform Thermal Cycling</div><div class="card-body">
|
|
||||||
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Runs CPU + GPU stress simultaneously across multiple load/idle cycles with varying durations. Detects cooling systems that fail to recover under repeated load cycles. Smoke: 2 cycles ~5 min. Acceptance: 4 cycles ~25 min.</p>
|
|
||||||
<button class="btn btn-primary" onclick="runBurnIn('platform-stress')">▶ Start Thermal Cycling</button>
|
|
||||||
</div></div>
|
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<div class="grid3" style="margin-bottom:16px">
|
||||||
|
|
||||||
|
<div class="card">
|
||||||
|
<div class="card-head">GPU Stress</div>
|
||||||
|
<div class="card-body">
|
||||||
|
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Tests run on all GPUs in the system. Availability determined by driver status.</p>
|
||||||
|
<div id="gpu-tools-list">
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-gpu-bee" value="bee-gpu-burn" disabled><span>bee-gpu-burn <span class="cb-note" id="note-bee"></span></span></label>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-gpu-john" value="john" disabled><span>John the Ripper (OpenCL) <span class="cb-note" id="note-john"></span></span></label>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-gpu-nccl" value="nccl" disabled><span>NCCL all_reduce_perf <span class="cb-note" id="note-nccl"></span></span></label>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-gpu-rvs" value="rvs" disabled><span>RVS GST (AMD) <span class="cb-note" id="note-rvs"></span></span></label>
|
||||||
|
</div>
|
||||||
|
<button class="btn btn-primary" style="margin-top:10px" onclick="runGPUStress()">▶ Run GPU Stress</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="card">
|
||||||
|
<div class="card-head">Compute Stress</div>
|
||||||
|
<div class="card-body">
|
||||||
|
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Select which subsystems to stress. Each checked item runs as a separate task.</p>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-cpu" checked><span>CPU stress (stress-ng)</span></label>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-mem-stress" checked><span>Memory stress (stress-ng --vm)</span></label>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-sat-stress"><span>stressapptest (CPU + memory bus)</span></label>
|
||||||
|
<button class="btn btn-primary" style="margin-top:10px" onclick="runComputeStress()">▶ Run Compute Stress</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="card">
|
||||||
|
<div class="card-head">Platform Thermal Cycling</div>
|
||||||
|
<div class="card-body">
|
||||||
|
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Repeated load+idle cycles. Detects cooling recovery failures and GPU throttle. Smoke: 2×90s. Acceptance: 4×300s.</p>
|
||||||
|
<p style="font-size:12px;font-weight:600;margin:0 0 6px">Load components:</p>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-pt-cpu" checked><span>CPU (stressapptest)</span></label>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-pt-nvidia" disabled><span>NVIDIA GPU <span class="cb-note" id="note-pt-nvidia"></span></span></label>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-pt-amd" disabled><span>AMD GPU <span class="cb-note" id="note-pt-amd"></span></span></label>
|
||||||
|
<button class="btn btn-primary" style="margin-top:10px" onclick="runPlatformStress()">▶ Run Thermal Cycling</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
<div id="bi-output" style="display:none;margin-top:16px" class="card">
|
<div id="bi-output" style="display:none;margin-top:16px" class="card">
|
||||||
<div class="card-head">Output <span id="bi-title"></span></div>
|
<div class="card-head">Output <span id="bi-title"></span></div>
|
||||||
<div class="card-body"><div id="bi-terminal" class="terminal"></div></div>
|
<div class="card-body"><div id="bi-terminal" class="terminal"></div></div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<style>
|
||||||
|
.cb-row { display:flex; align-items:center; gap:8px; padding:4px 0; cursor:pointer; font-size:13px; }
|
||||||
|
.cb-row input[type=checkbox] { width:16px; height:16px; flex-shrink:0; }
|
||||||
|
.cb-row input[type=checkbox]:disabled { opacity:0.4; cursor:not-allowed; }
|
||||||
|
.cb-row input[type=checkbox]:disabled ~ span { opacity:0.45; cursor:not-allowed; }
|
||||||
|
.cb-note { font-size:11px; color:var(--muted); font-style:italic; }
|
||||||
|
</style>
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
let biES = null;
|
let biES = null;
|
||||||
function parseGPUIndexList(raw) {
|
|
||||||
return (raw || '')
|
function profile() { return document.getElementById('burn-profile').value || 'smoke'; }
|
||||||
.split(',')
|
|
||||||
.map(v => v.trim())
|
function enqueueTask(target, extra) {
|
||||||
.filter(v => v !== '')
|
const body = Object.assign({ profile: profile() }, extra || {});
|
||||||
.map(v => Number(v))
|
return fetch('/api/sat/'+target+'/run', {
|
||||||
.filter(v => Number.isInteger(v) && v >= 0);
|
method: 'POST', headers: {'Content-Type':'application/json'}, body: JSON.stringify(body)
|
||||||
|
}).then(r => r.json());
|
||||||
}
|
}
|
||||||
function runBurnIn(target) {
|
|
||||||
|
function streamTask(taskId, label) {
|
||||||
if (biES) { biES.close(); biES = null; }
|
if (biES) { biES.close(); biES = null; }
|
||||||
const body = { profile: document.getElementById('burn-profile').value || 'smoke' };
|
document.getElementById('bi-output').style.display = 'block';
|
||||||
if (target === 'nvidia-stress') {
|
document.getElementById('bi-title').textContent = '— ' + label + ' [' + profile() + ']';
|
||||||
body.loader = document.getElementById('nvidia-stress-loader').value || 'builtin';
|
|
||||||
body.exclude_gpu_indices = parseGPUIndexList(document.getElementById('nvidia-stress-exclude').value);
|
|
||||||
}
|
|
||||||
document.getElementById('bi-output').style.display='block';
|
|
||||||
const loaderLabel = body.loader ? ' / ' + body.loader : '';
|
|
||||||
document.getElementById('bi-title').textContent = '— ' + target + loaderLabel + ' [' + body.profile + ']';
|
|
||||||
const term = document.getElementById('bi-terminal');
|
const term = document.getElementById('bi-terminal');
|
||||||
term.textContent = 'Enqueuing ' + target + ' stress...\n';
|
term.textContent = 'Task ' + taskId + ' queued. Streaming...\n';
|
||||||
fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(body)})
|
biES = new EventSource('/api/tasks/'+taskId+'/stream');
|
||||||
.then(r => r.json())
|
biES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop = term.scrollHeight; };
|
||||||
.then(d => {
|
biES.addEventListener('done', e => {
|
||||||
term.textContent += 'Task ' + d.task_id + ' queued.\n';
|
biES.close(); biES = null;
|
||||||
biES = new EventSource('/api/tasks/'+d.task_id+'/stream');
|
term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n';
|
||||||
biES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
|
|
||||||
biES.addEventListener('done', e => { biES.close(); biES=null; term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
</script>
|
|
||||||
<script>
|
function runGPUStress() {
|
||||||
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
const ids = ['burn-gpu-bee','burn-gpu-john','burn-gpu-nccl','burn-gpu-rvs'];
|
||||||
if (!gp.nvidia) disableSATCard('nvidia-stress', 'No NVIDIA GPU detected');
|
const loaderMap = {'burn-gpu-bee':'builtin','burn-gpu-john':'john','burn-gpu-nccl':'nccl','burn-gpu-rvs':'rvs'};
|
||||||
if (!gp.amd) disableSATCard('amd-stress', 'No AMD GPU detected');
|
const targetMap = {'burn-gpu-bee':'nvidia-stress','burn-gpu-john':'nvidia-stress','burn-gpu-nccl':'nvidia-stress','burn-gpu-rvs':'amd-stress'};
|
||||||
});
|
let last = null;
|
||||||
function disableSATCard(id, reason) {
|
ids.filter(id => {
|
||||||
const btn = document.getElementById('sat-btn-' + id);
|
const el = document.getElementById(id);
|
||||||
if (!btn) return;
|
return el && el.checked && !el.disabled;
|
||||||
btn.disabled = true;
|
}).forEach(id => {
|
||||||
btn.title = reason;
|
const target = targetMap[id];
|
||||||
btn.style.opacity = '0.4';
|
const extra = target === 'nvidia-stress' ? {loader: loaderMap[id]} : {};
|
||||||
const card = btn.closest('.card');
|
enqueueTask(target, extra).then(d => { last = d; streamTask(d.task_id, target + ' / ' + loaderMap[id]); });
|
||||||
if (card) {
|
});
|
||||||
let note = card.querySelector('.sat-unavail');
|
|
||||||
if (!note) {
|
|
||||||
note = document.createElement('p');
|
|
||||||
note.className = 'sat-unavail';
|
|
||||||
note.style.cssText = 'color:var(--muted);font-size:12px;margin-top:6px';
|
|
||||||
btn.parentNode.insertBefore(note, btn.nextSibling);
|
|
||||||
}
|
|
||||||
note.textContent = reason;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function runComputeStress() {
|
||||||
|
const tasks = [
|
||||||
|
{id:'burn-cpu', target:'cpu'},
|
||||||
|
{id:'burn-mem-stress', target:'memory-stress'},
|
||||||
|
{id:'burn-sat-stress', target:'sat-stress'},
|
||||||
|
];
|
||||||
|
let last = null;
|
||||||
|
tasks.filter(t => {
|
||||||
|
const el = document.getElementById(t.id);
|
||||||
|
return el && el.checked;
|
||||||
|
}).forEach(t => {
|
||||||
|
enqueueTask(t.target).then(d => { last = d; streamTask(d.task_id, t.target); });
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
function runPlatformStress() {
|
||||||
|
const comps = [];
|
||||||
|
if (document.getElementById('burn-pt-cpu').checked) comps.push('cpu');
|
||||||
|
const nv = document.getElementById('burn-pt-nvidia');
|
||||||
|
if (nv && nv.checked && !nv.disabled) comps.push('gpu');
|
||||||
|
const am = document.getElementById('burn-pt-amd');
|
||||||
|
if (am && am.checked && !am.disabled) comps.push('gpu');
|
||||||
|
const extra = comps.length > 0 ? {platform_components: comps} : {};
|
||||||
|
enqueueTask('platform-stress', extra).then(d => streamTask(d.task_id, 'platform-stress'));
|
||||||
|
}
|
||||||
|
|
||||||
|
function runAll() {
|
||||||
|
const status = document.getElementById('burn-all-status');
|
||||||
|
status.textContent = 'Enqueuing...';
|
||||||
|
let count = 0;
|
||||||
|
const done = () => { count++; status.textContent = count + ' tasks queued.'; };
|
||||||
|
|
||||||
|
// GPU tests
|
||||||
|
const gpuIds = ['burn-gpu-bee','burn-gpu-john','burn-gpu-nccl','burn-gpu-rvs'];
|
||||||
|
const loaderMap = {'burn-gpu-bee':'builtin','burn-gpu-john':'john','burn-gpu-nccl':'nccl','burn-gpu-rvs':'rvs'};
|
||||||
|
const gpuTargetMap = {'burn-gpu-bee':'nvidia-stress','burn-gpu-john':'nvidia-stress','burn-gpu-nccl':'nvidia-stress','burn-gpu-rvs':'amd-stress'};
|
||||||
|
gpuIds.filter(id => { const el = document.getElementById(id); return el && el.checked && !el.disabled; }).forEach(id => {
|
||||||
|
const target = gpuTargetMap[id];
|
||||||
|
const extra = target === 'nvidia-stress' ? {loader: loaderMap[id]} : {};
|
||||||
|
enqueueTask(target, extra).then(d => { streamTask(d.task_id, target); done(); });
|
||||||
|
});
|
||||||
|
|
||||||
|
// Compute tests
|
||||||
|
[{id:'burn-cpu',target:'cpu'},{id:'burn-mem-stress',target:'memory-stress'},{id:'burn-sat-stress',target:'sat-stress'}]
|
||||||
|
.filter(t => { const el = document.getElementById(t.id); return el && el.checked; })
|
||||||
|
.forEach(t => enqueueTask(t.target).then(d => { streamTask(d.task_id, t.target); done(); }));
|
||||||
|
|
||||||
|
// Platform
|
||||||
|
const comps = [];
|
||||||
|
if (document.getElementById('burn-pt-cpu').checked) comps.push('cpu');
|
||||||
|
const nv = document.getElementById('burn-pt-nvidia');
|
||||||
|
if (nv && nv.checked && !nv.disabled) comps.push('gpu');
|
||||||
|
const am = document.getElementById('burn-pt-amd');
|
||||||
|
if (am && am.checked && !am.disabled) comps.push('gpu');
|
||||||
|
const ptExtra = comps.length > 0 ? {platform_components: comps} : {};
|
||||||
|
enqueueTask('platform-stress', ptExtra).then(d => { streamTask(d.task_id, 'platform-stress'); done(); });
|
||||||
|
}
|
||||||
|
|
||||||
|
// Load GPU tool availability
|
||||||
|
fetch('/api/gpu/tools').then(r => r.json()).then(tools => {
|
||||||
|
const nvidiaMap = {'bee-gpu-burn':'burn-gpu-bee','john':'burn-gpu-john','nccl':'burn-gpu-nccl','rvs':'burn-gpu-rvs'};
|
||||||
|
const noteMap = {'bee-gpu-burn':'note-bee','john':'note-john','nccl':'note-nccl','rvs':'note-rvs'};
|
||||||
|
tools.forEach(t => {
|
||||||
|
const cb = document.getElementById(nvidiaMap[t.id]);
|
||||||
|
const note = document.getElementById(noteMap[t.id]);
|
||||||
|
if (!cb) return;
|
||||||
|
if (t.available) {
|
||||||
|
cb.disabled = false;
|
||||||
|
if (t.id === 'bee-gpu-burn') cb.checked = true;
|
||||||
|
} else {
|
||||||
|
const reason = t.vendor === 'nvidia' ? 'NVIDIA driver not running' : 'AMD driver not running';
|
||||||
|
if (note) note.textContent = '— ' + reason;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}).catch(() => {});
|
||||||
|
|
||||||
|
// Load GPU presence for platform thermal cycling
|
||||||
|
fetch('/api/gpu/presence').then(r => r.json()).then(gp => {
|
||||||
|
const nvCb = document.getElementById('burn-pt-nvidia');
|
||||||
|
const amCb = document.getElementById('burn-pt-amd');
|
||||||
|
const nvNote = document.getElementById('note-pt-nvidia');
|
||||||
|
const amNote = document.getElementById('note-pt-amd');
|
||||||
|
if (gp.nvidia) {
|
||||||
|
nvCb.disabled = false;
|
||||||
|
nvCb.checked = true;
|
||||||
|
} else {
|
||||||
|
if (nvNote) nvNote.textContent = '— NVIDIA driver not running';
|
||||||
|
}
|
||||||
|
if (gp.amd) {
|
||||||
|
amCb.disabled = false;
|
||||||
|
amCb.checked = true;
|
||||||
|
} else {
|
||||||
|
if (amNote) amNote.textContent = '— AMD driver not running';
|
||||||
|
}
|
||||||
|
}).catch(() => {});
|
||||||
</script>`
|
</script>`
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -778,6 +909,8 @@ func renderNetworkInline() string {
|
|||||||
</div>
|
</div>
|
||||||
<script>
|
<script>
|
||||||
var _netCountdownTimer = null;
|
var _netCountdownTimer = null;
|
||||||
|
var _netRefreshTimer = null;
|
||||||
|
const NET_ROLLBACK_SECS = 60;
|
||||||
function loadNetwork() {
|
function loadNetwork() {
|
||||||
fetch('/api/network').then(r=>r.json()).then(d => {
|
fetch('/api/network').then(r=>r.json()).then(d => {
|
||||||
const rows = (d.interfaces||[]).map(i =>
|
const rows = (d.interfaces||[]).map(i =>
|
||||||
@@ -788,21 +921,33 @@ function loadNetwork() {
|
|||||||
document.getElementById('iface-table').innerHTML =
|
document.getElementById('iface-table').innerHTML =
|
||||||
'<table><tr><th>Interface</th><th>State (click to toggle)</th><th>Addresses</th></tr>'+rows+'</table>' +
|
'<table><tr><th>Interface</th><th>State (click to toggle)</th><th>Addresses</th></tr>'+rows+'</table>' +
|
||||||
(d.default_route ? '<p style="font-size:12px;color:var(--muted);margin-top:8px">Default route: '+d.default_route+'</p>' : '');
|
(d.default_route ? '<p style="font-size:12px;color:var(--muted);margin-top:8px">Default route: '+d.default_route+'</p>' : '');
|
||||||
});
|
if (d.pending_change) showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
|
||||||
|
else hideNetPending();
|
||||||
|
}).catch(function() {});
|
||||||
}
|
}
|
||||||
function selectIface(iface) {
|
function selectIface(iface) {
|
||||||
document.getElementById('dhcp-iface').value = iface;
|
document.getElementById('dhcp-iface').value = iface;
|
||||||
document.getElementById('st-iface').value = iface;
|
document.getElementById('st-iface').value = iface;
|
||||||
}
|
}
|
||||||
function toggleIface(iface, currentState) {
|
function toggleIface(iface, currentState) {
|
||||||
|
showNetPending(NET_ROLLBACK_SECS);
|
||||||
fetch('/api/network/toggle',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({iface:iface})})
|
fetch('/api/network/toggle',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({iface:iface})})
|
||||||
.then(r=>r.json()).then(d => {
|
.then(r=>r.json()).then(d => {
|
||||||
if (d.error) { alert('Error: '+d.error); return; }
|
if (d.error) { hideNetPending(); alert('Error: '+d.error); return; }
|
||||||
loadNetwork();
|
loadNetwork();
|
||||||
showNetPending(d.rollback_in || 60);
|
showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
|
||||||
|
}).catch(function() {
|
||||||
|
setTimeout(loadNetwork, 1500);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
function hideNetPending() {
|
||||||
|
const el = document.getElementById('net-pending');
|
||||||
|
if (_netCountdownTimer) clearInterval(_netCountdownTimer);
|
||||||
|
_netCountdownTimer = null;
|
||||||
|
el.style.display = 'none';
|
||||||
|
}
|
||||||
function showNetPending(secs) {
|
function showNetPending(secs) {
|
||||||
|
if (!secs || secs < 1) { hideNetPending(); return; }
|
||||||
const el = document.getElementById('net-pending');
|
const el = document.getElementById('net-pending');
|
||||||
el.style.display = 'block';
|
el.style.display = 'block';
|
||||||
if (_netCountdownTimer) clearInterval(_netCountdownTimer);
|
if (_netCountdownTimer) clearInterval(_netCountdownTimer);
|
||||||
@@ -811,30 +956,33 @@ function showNetPending(secs) {
|
|||||||
_netCountdownTimer = setInterval(function() {
|
_netCountdownTimer = setInterval(function() {
|
||||||
remaining--;
|
remaining--;
|
||||||
document.getElementById('net-countdown').textContent = remaining;
|
document.getElementById('net-countdown').textContent = remaining;
|
||||||
if (remaining <= 0) { clearInterval(_netCountdownTimer); _netCountdownTimer=null; el.style.display='none'; loadNetwork(); }
|
if (remaining <= 0) { hideNetPending(); loadNetwork(); }
|
||||||
}, 1000);
|
}, 1000);
|
||||||
}
|
}
|
||||||
function confirmNetChange() {
|
function confirmNetChange() {
|
||||||
if (_netCountdownTimer) { clearInterval(_netCountdownTimer); _netCountdownTimer=null; }
|
hideNetPending();
|
||||||
document.getElementById('net-pending').style.display='none';
|
fetch('/api/network/confirm',{method:'POST'}).then(()=>loadNetwork()).catch(()=>{});
|
||||||
fetch('/api/network/confirm',{method:'POST'});
|
|
||||||
}
|
}
|
||||||
function rollbackNetChange() {
|
function rollbackNetChange() {
|
||||||
if (_netCountdownTimer) { clearInterval(_netCountdownTimer); _netCountdownTimer=null; }
|
hideNetPending();
|
||||||
document.getElementById('net-pending').style.display='none';
|
fetch('/api/network/rollback',{method:'POST'}).then(()=>loadNetwork()).catch(()=>{});
|
||||||
fetch('/api/network/rollback',{method:'POST'}).then(()=>loadNetwork());
|
|
||||||
}
|
}
|
||||||
function runDHCP() {
|
function runDHCP() {
|
||||||
const iface = document.getElementById('dhcp-iface').value.trim();
|
const iface = document.getElementById('dhcp-iface').value.trim();
|
||||||
|
showNetPending(NET_ROLLBACK_SECS);
|
||||||
fetch('/api/network/dhcp',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({interface:iface||'all'})})
|
fetch('/api/network/dhcp',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({interface:iface||'all'})})
|
||||||
.then(r=>r.json()).then(d => {
|
.then(r=>r.json()).then(d => {
|
||||||
document.getElementById('dhcp-out').textContent = d.output || d.error || 'Done.';
|
document.getElementById('dhcp-out').textContent = d.output || d.error || 'Done.';
|
||||||
if (!d.error) showNetPending(d.rollback_in || 60);
|
if (d.error) { hideNetPending(); return; }
|
||||||
|
showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
|
||||||
loadNetwork();
|
loadNetwork();
|
||||||
|
}).catch(function() {
|
||||||
|
setTimeout(loadNetwork, 1500);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
function setStatic() {
|
function setStatic() {
|
||||||
const dns = document.getElementById('st-dns').value.split(',').map(s=>s.trim()).filter(Boolean);
|
const dns = document.getElementById('st-dns').value.split(',').map(s=>s.trim()).filter(Boolean);
|
||||||
|
showNetPending(NET_ROLLBACK_SECS);
|
||||||
fetch('/api/network/static',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({
|
fetch('/api/network/static',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({
|
||||||
interface: document.getElementById('st-iface').value,
|
interface: document.getElementById('st-iface').value,
|
||||||
address: document.getElementById('st-addr').value,
|
address: document.getElementById('st-addr').value,
|
||||||
@@ -843,11 +991,16 @@ function setStatic() {
|
|||||||
dns: dns,
|
dns: dns,
|
||||||
})}).then(r=>r.json()).then(d => {
|
})}).then(r=>r.json()).then(d => {
|
||||||
document.getElementById('static-out').textContent = d.output || d.error || 'Done.';
|
document.getElementById('static-out').textContent = d.output || d.error || 'Done.';
|
||||||
if (!d.error) showNetPending(d.rollback_in || 60);
|
if (d.error) { hideNetPending(); return; }
|
||||||
|
showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
|
||||||
loadNetwork();
|
loadNetwork();
|
||||||
|
}).catch(function() {
|
||||||
|
setTimeout(loadNetwork, 1500);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
loadNetwork();
|
loadNetwork();
|
||||||
|
if (_netRefreshTimer) clearInterval(_netRefreshTimer);
|
||||||
|
_netRefreshTimer = setInterval(loadNetwork, 5000);
|
||||||
</script>`
|
</script>`
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -926,7 +1079,7 @@ func renderExport(exportDir string) string {
|
|||||||
return `<div class="grid2">
|
return `<div class="grid2">
|
||||||
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Creates a tar.gz archive of all audit files, SAT results, and logs.</p>
|
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Creates a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||||
<a class="btn btn-primary" href="/export/support.tar.gz">↓ Download Support Bundle</a>
|
` + renderSupportBundleInline() + `
|
||||||
</div></div>
|
</div></div>
|
||||||
<div class="card"><div class="card-head">Export Files</div><div class="card-body">
|
<div class="card"><div class="card-head">Export Files</div><div class="card-body">
|
||||||
<table><tr><th>File</th></tr>` + rows.String() + `</table>
|
<table><tr><th>File</th></tr>` + rows.String() + `</table>
|
||||||
@@ -1024,6 +1177,77 @@ func listExportFiles(exportDir string) ([]string, error) {
|
|||||||
return entries, nil
|
return entries, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func renderSupportBundleInline() string {
|
||||||
|
return `<button id="support-bundle-btn" class="btn btn-primary" onclick="supportBundleBuild()">Build Support Bundle</button>
|
||||||
|
<a id="support-bundle-download" class="btn btn-secondary" href="/export/support.tar.gz" style="display:none">↓ Download Support Bundle</a>
|
||||||
|
<div id="support-bundle-status" style="margin-top:12px;font-size:13px;color:var(--muted)">No support bundle built in this session.</div>
|
||||||
|
<div id="support-bundle-log" class="terminal" style="display:none;margin-top:12px;max-height:260px"></div>
|
||||||
|
<script>
|
||||||
|
(function(){
|
||||||
|
var _supportBundleES = null;
|
||||||
|
window.supportBundleBuild = function() {
|
||||||
|
var btn = document.getElementById('support-bundle-btn');
|
||||||
|
var status = document.getElementById('support-bundle-status');
|
||||||
|
var log = document.getElementById('support-bundle-log');
|
||||||
|
var download = document.getElementById('support-bundle-download');
|
||||||
|
if (_supportBundleES) {
|
||||||
|
_supportBundleES.close();
|
||||||
|
_supportBundleES = null;
|
||||||
|
}
|
||||||
|
btn.disabled = true;
|
||||||
|
btn.textContent = 'Building...';
|
||||||
|
status.textContent = 'Queueing support bundle task...';
|
||||||
|
status.style.color = 'var(--muted)';
|
||||||
|
log.style.display = '';
|
||||||
|
log.textContent = '';
|
||||||
|
download.style.display = 'none';
|
||||||
|
|
||||||
|
fetch('/api/export/bundle', {method:'POST'}).then(function(r){
|
||||||
|
return r.json().then(function(j){
|
||||||
|
if (!r.ok) throw new Error(j.error || r.statusText);
|
||||||
|
return j;
|
||||||
|
});
|
||||||
|
}).then(function(data){
|
||||||
|
if (!data.task_id) throw new Error('missing task id');
|
||||||
|
status.textContent = 'Building support bundle...';
|
||||||
|
_supportBundleES = new EventSource('/api/tasks/' + data.task_id + '/stream');
|
||||||
|
_supportBundleES.onmessage = function(e) {
|
||||||
|
log.textContent += e.data + '\n';
|
||||||
|
log.scrollTop = log.scrollHeight;
|
||||||
|
};
|
||||||
|
_supportBundleES.addEventListener('done', function(e) {
|
||||||
|
_supportBundleES.close();
|
||||||
|
_supportBundleES = null;
|
||||||
|
btn.disabled = false;
|
||||||
|
btn.textContent = 'Build Support Bundle';
|
||||||
|
if (e.data) {
|
||||||
|
status.textContent = 'Error: ' + e.data;
|
||||||
|
status.style.color = 'var(--crit-fg)';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
status.textContent = 'Support bundle ready.';
|
||||||
|
status.style.color = 'var(--ok-fg)';
|
||||||
|
download.style.display = '';
|
||||||
|
});
|
||||||
|
_supportBundleES.onerror = function() {
|
||||||
|
if (_supportBundleES) _supportBundleES.close();
|
||||||
|
_supportBundleES = null;
|
||||||
|
btn.disabled = false;
|
||||||
|
btn.textContent = 'Build Support Bundle';
|
||||||
|
status.textContent = 'Support bundle stream disconnected.';
|
||||||
|
status.style.color = 'var(--crit-fg)';
|
||||||
|
};
|
||||||
|
}).catch(function(e){
|
||||||
|
btn.disabled = false;
|
||||||
|
btn.textContent = 'Build Support Bundle';
|
||||||
|
status.textContent = 'Error: ' + e;
|
||||||
|
status.style.color = 'var(--crit-fg)';
|
||||||
|
});
|
||||||
|
};
|
||||||
|
})();
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
|
|
||||||
// ── Display Resolution ────────────────────────────────────────────────────────
|
// ── Display Resolution ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
func renderDisplayInline() string {
|
func renderDisplayInline() string {
|
||||||
@@ -1113,7 +1337,7 @@ function installToRAM() {
|
|||||||
|
|
||||||
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
|
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||||
<a class="btn btn-primary" href="/export/support.tar.gz">↓ Download Support Bundle</a>
|
` + renderSupportBundleInline() + `
|
||||||
</div></div>
|
</div></div>
|
||||||
|
|
||||||
<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">↻ Check</button></div>
|
<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">↻ Check</button></div>
|
||||||
@@ -1292,21 +1516,23 @@ function installStart() {
|
|||||||
headers: {'Content-Type': 'application/json'},
|
headers: {'Content-Type': 'application/json'},
|
||||||
body: JSON.stringify({device: _installSelected.device})
|
body: JSON.stringify({device: _installSelected.device})
|
||||||
}).then(function(r){
|
}).then(function(r){
|
||||||
if (r.status === 204) {
|
return r.json().then(function(j){
|
||||||
installStreamLog();
|
if (!r.ok) throw new Error(j.error || r.statusText);
|
||||||
} else {
|
return j;
|
||||||
return r.json().then(function(j){ throw new Error(j.error || r.statusText); });
|
});
|
||||||
}
|
}).then(function(j){
|
||||||
|
if (!j.task_id) throw new Error('missing task id');
|
||||||
|
installStreamLog(j.task_id);
|
||||||
}).catch(function(e){
|
}).catch(function(e){
|
||||||
status.textContent = 'Error: ' + e;
|
status.textContent = 'Error: ' + e;
|
||||||
status.style.color = 'var(--crit-fg)';
|
status.style.color = 'var(--crit-fg)';
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
function installStreamLog() {
|
function installStreamLog(taskId) {
|
||||||
var term = document.getElementById('install-terminal');
|
var term = document.getElementById('install-terminal');
|
||||||
var status = document.getElementById('install-status');
|
var status = document.getElementById('install-status');
|
||||||
var es = new EventSource('/api/install/stream');
|
var es = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||||
es.onmessage = function(e) {
|
es.onmessage = function(e) {
|
||||||
term.textContent += e.data + '\n';
|
term.textContent += e.data + '\n';
|
||||||
term.scrollTop = term.scrollHeight;
|
term.scrollTop = term.scrollHeight;
|
||||||
@@ -1375,7 +1601,7 @@ function loadTasks() {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
const rows = tasks.map(t => {
|
const rows = tasks.map(t => {
|
||||||
const dur = t.started_at ? formatDur(t.started_at, t.done_at) : '';
|
const dur = t.elapsed_sec ? formatDurSec(t.elapsed_sec) : '';
|
||||||
const statusClass = {running:'badge-ok',pending:'badge-unknown',done:'badge-ok',failed:'badge-err',cancelled:'badge-unknown'}[t.status]||'badge-unknown';
|
const statusClass = {running:'badge-ok',pending:'badge-unknown',done:'badge-ok',failed:'badge-err',cancelled:'badge-unknown'}[t.status]||'badge-unknown';
|
||||||
const statusLabel = {running:'▶ running',pending:'pending',done:'✓ done',failed:'✗ failed',cancelled:'cancelled'}[t.status]||t.status;
|
const statusLabel = {running:'▶ running',pending:'pending',done:'✓ done',failed:'✗ failed',cancelled:'cancelled'}[t.status]||t.status;
|
||||||
let actions = '<button class="btn btn-sm btn-secondary" onclick="viewLog(\''+t.id+'\',\''+escHtml(t.name)+'\')">Logs</button>';
|
let actions = '<button class="btn btn-sm btn-secondary" onclick="viewLog(\''+t.id+'\',\''+escHtml(t.name)+'\')">Logs</button>';
|
||||||
@@ -1400,14 +1626,11 @@ function loadTasks() {
|
|||||||
|
|
||||||
function escHtml(s) { return (s||'').replace(/&/g,'&').replace(/</g,'<').replace(/>/g,'>').replace(/"/g,'"'); }
|
function escHtml(s) { return (s||'').replace(/&/g,'&').replace(/</g,'<').replace(/>/g,'>').replace(/"/g,'"'); }
|
||||||
function fmtTime(s) { if (!s) return ''; try { return new Date(s).toLocaleTimeString(); } catch(e){ return s; } }
|
function fmtTime(s) { if (!s) return ''; try { return new Date(s).toLocaleTimeString(); } catch(e){ return s; } }
|
||||||
function formatDur(start, end) {
|
function formatDurSec(sec) {
|
||||||
try {
|
sec = Math.max(0, Math.round(sec||0));
|
||||||
const s = new Date(start), e = end ? new Date(end) : new Date();
|
|
||||||
const sec = Math.round((e-s)/1000);
|
|
||||||
if (sec < 60) return sec+'s';
|
if (sec < 60) return sec+'s';
|
||||||
const m = Math.floor(sec/60), ss = sec%60;
|
const m = Math.floor(sec/60), ss = sec%60;
|
||||||
return m+'m '+ss+'s';
|
return m+'m '+ss+'s';
|
||||||
} catch(e){ return ''; }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function cancelTask(id) {
|
function cancelTask(id) {
|
||||||
|
|||||||
@@ -5,10 +5,12 @@ import (
|
|||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"html"
|
"html"
|
||||||
|
"log/slog"
|
||||||
"mime"
|
"mime"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
@@ -83,6 +85,15 @@ func (r *metricsRing) snapshot() ([]float64, []string) {
|
|||||||
return v, labels
|
return v, labels
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (r *metricsRing) latest() (float64, bool) {
|
||||||
|
r.mu.Lock()
|
||||||
|
defer r.mu.Unlock()
|
||||||
|
if len(r.vals) == 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
return r.vals[len(r.vals)-1], true
|
||||||
|
}
|
||||||
|
|
||||||
func timestampsSameLocalDay(times []time.Time) bool {
|
func timestampsSameLocalDay(times []time.Time) bool {
|
||||||
if len(times) == 0 {
|
if len(times) == 0 {
|
||||||
return true
|
return true
|
||||||
@@ -117,9 +128,12 @@ type namedMetricsRing struct {
|
|||||||
Ring *metricsRing
|
Ring *metricsRing
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const metricsChartWindow = 120
|
||||||
|
|
||||||
// pendingNetChange tracks a network state change awaiting confirmation.
|
// pendingNetChange tracks a network state change awaiting confirmation.
|
||||||
type pendingNetChange struct {
|
type pendingNetChange struct {
|
||||||
snapshot platform.NetworkSnapshot
|
snapshot platform.NetworkSnapshot
|
||||||
|
deadline time.Time
|
||||||
timer *time.Timer
|
timer *time.Timer
|
||||||
mu sync.Mutex
|
mu sync.Mutex
|
||||||
}
|
}
|
||||||
@@ -143,9 +157,6 @@ type handler struct {
|
|||||||
latest *platform.LiveMetricSample
|
latest *platform.LiveMetricSample
|
||||||
// metrics persistence (nil if DB unavailable)
|
// metrics persistence (nil if DB unavailable)
|
||||||
metricsDB *MetricsDB
|
metricsDB *MetricsDB
|
||||||
// install job (at most one at a time)
|
|
||||||
installJob *jobState
|
|
||||||
installMu sync.Mutex
|
|
||||||
// pending network change (rollback on timeout)
|
// pending network change (rollback on timeout)
|
||||||
pendingNet *pendingNetChange
|
pendingNet *pendingNetChange
|
||||||
pendingNetMu sync.Mutex
|
pendingNetMu sync.Mutex
|
||||||
@@ -173,14 +184,18 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
// Open metrics DB and pre-fill ring buffers from history.
|
// Open metrics DB and pre-fill ring buffers from history.
|
||||||
if db, err := openMetricsDB(metricsDBPath); err == nil {
|
if db, err := openMetricsDB(metricsDBPath); err == nil {
|
||||||
h.metricsDB = db
|
h.metricsDB = db
|
||||||
if samples, err := db.LoadRecent(120); err == nil {
|
if samples, err := db.LoadRecent(metricsChartWindow); err == nil {
|
||||||
for _, s := range samples {
|
for _, s := range samples {
|
||||||
h.feedRings(s)
|
h.feedRings(s)
|
||||||
}
|
}
|
||||||
if len(samples) > 0 {
|
if len(samples) > 0 {
|
||||||
h.setLatestMetric(samples[len(samples)-1])
|
h.setLatestMetric(samples[len(samples)-1])
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
slog.Warn("metrics history unavailable", "path", metricsDBPath, "err", err)
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
slog.Warn("metrics db disabled", "path", metricsDBPath, "err", err)
|
||||||
}
|
}
|
||||||
h.startMetricsCollector()
|
h.startMetricsCollector()
|
||||||
|
|
||||||
@@ -253,8 +268,9 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
mux.HandleFunc("GET /api/display/resolutions", h.handleAPIDisplayResolutions)
|
mux.HandleFunc("GET /api/display/resolutions", h.handleAPIDisplayResolutions)
|
||||||
mux.HandleFunc("POST /api/display/set", h.handleAPIDisplaySet)
|
mux.HandleFunc("POST /api/display/set", h.handleAPIDisplaySet)
|
||||||
|
|
||||||
// GPU presence
|
// GPU presence / tools
|
||||||
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
||||||
|
mux.HandleFunc("GET /api/gpu/tools", h.handleAPIGPUTools)
|
||||||
|
|
||||||
// System
|
// System
|
||||||
mux.HandleFunc("GET /api/system/ram-status", h.handleAPIRAMStatus)
|
mux.HandleFunc("GET /api/system/ram-status", h.handleAPIRAMStatus)
|
||||||
@@ -266,7 +282,6 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
// Install
|
// Install
|
||||||
mux.HandleFunc("GET /api/install/disks", h.handleAPIInstallDisks)
|
mux.HandleFunc("GET /api/install/disks", h.handleAPIInstallDisks)
|
||||||
mux.HandleFunc("POST /api/install/run", h.handleAPIInstallRun)
|
mux.HandleFunc("POST /api/install/run", h.handleAPIInstallRun)
|
||||||
mux.HandleFunc("GET /api/install/stream", h.handleAPIInstallStream)
|
|
||||||
|
|
||||||
// Metrics — SSE stream of live sensor data + server-side SVG charts + CSV export
|
// Metrics — SSE stream of live sensor data + server-side SVG charts + CSV export
|
||||||
mux.HandleFunc("GET /api/metrics/stream", h.handleAPIMetricsStream)
|
mux.HandleFunc("GET /api/metrics/stream", h.handleAPIMetricsStream)
|
||||||
@@ -290,11 +305,11 @@ func (h *handler) startMetricsCollector() {
|
|||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
for range ticker.C {
|
for range ticker.C {
|
||||||
sample := platform.SampleLiveMetrics()
|
sample := platform.SampleLiveMetrics()
|
||||||
h.feedRings(sample)
|
|
||||||
h.setLatestMetric(sample)
|
|
||||||
if h.metricsDB != nil {
|
if h.metricsDB != nil {
|
||||||
_ = h.metricsDB.Write(sample)
|
_ = h.metricsDB.Write(sample)
|
||||||
}
|
}
|
||||||
|
h.feedRings(sample)
|
||||||
|
h.setLatestMetric(sample)
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
}
|
}
|
||||||
@@ -366,9 +381,13 @@ func (h *handler) handleRuntimeHealthJSON(w http.ResponseWriter, r *http.Request
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (h *handler) handleSupportBundleDownload(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleSupportBundleDownload(w http.ResponseWriter, r *http.Request) {
|
||||||
archive, err := app.BuildSupportBundle(h.opts.ExportDir)
|
archive, err := app.LatestSupportBundlePath()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
http.Error(w, fmt.Sprintf("build support bundle: %v", err), http.StatusInternalServerError)
|
if errors.Is(err, os.ErrNotExist) {
|
||||||
|
http.Error(w, "support bundle not built yet", http.StatusNotFound)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
http.Error(w, fmt.Sprintf("locate support bundle: %v", err), http.StatusInternalServerError)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
w.Header().Set("Cache-Control", "no-store")
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
@@ -442,221 +461,13 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
|||||||
path := strings.TrimPrefix(r.URL.Path, "/api/metrics/chart/")
|
path := strings.TrimPrefix(r.URL.Path, "/api/metrics/chart/")
|
||||||
path = strings.TrimSuffix(path, ".svg")
|
path = strings.TrimSuffix(path, ".svg")
|
||||||
|
|
||||||
if h.metricsDB != nil {
|
if h.metricsDB == nil {
|
||||||
if datasets, names, labels, title, yMin, yMax, ok := h.chartDataFromDB(path); ok {
|
http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
|
||||||
buf, err := renderChartSVG(title, datasets, names, labels, yMin, yMax)
|
|
||||||
if err != nil {
|
|
||||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
w.Header().Set("Content-Type", "image/svg+xml")
|
datasets, names, labels, title, yMin, yMax, ok := h.chartDataFromDB(path)
|
||||||
w.Header().Set("Cache-Control", "no-store")
|
if !ok {
|
||||||
_, _ = w.Write(buf)
|
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
var datasets [][]float64
|
|
||||||
var names []string
|
|
||||||
var labels []string
|
|
||||||
var title string
|
|
||||||
var yMin, yMax *float64 // nil = auto; for load charts fixed 0-100
|
|
||||||
|
|
||||||
switch {
|
|
||||||
// ── Server sub-charts ─────────────────────────────────────────────────
|
|
||||||
case path == "server-load":
|
|
||||||
title = "CPU / Memory Load"
|
|
||||||
vCPULoad, l := h.ringCPULoad.snapshot()
|
|
||||||
vMemLoad, _ := h.ringMemLoad.snapshot()
|
|
||||||
labels = l
|
|
||||||
datasets = [][]float64{vCPULoad, vMemLoad}
|
|
||||||
names = []string{"CPU Load %", "Mem Load %"}
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = floatPtr(100)
|
|
||||||
|
|
||||||
case path == "server-temp", path == "server-temp-cpu":
|
|
||||||
title = "CPU Temperature"
|
|
||||||
h.ringsMu.Lock()
|
|
||||||
datasets, names, labels = snapshotNamedRings(h.cpuTempRings)
|
|
||||||
h.ringsMu.Unlock()
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = autoMax120(datasets...)
|
|
||||||
|
|
||||||
case path == "server-temp-gpu":
|
|
||||||
title = "GPU Temperature"
|
|
||||||
h.ringsMu.Lock()
|
|
||||||
for idx, gr := range h.gpuRings {
|
|
||||||
if gr == nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
vTemp, l := gr.Temp.snapshot()
|
|
||||||
datasets = append(datasets, vTemp)
|
|
||||||
names = append(names, fmt.Sprintf("GPU %d", idx))
|
|
||||||
if len(labels) == 0 {
|
|
||||||
labels = l
|
|
||||||
}
|
|
||||||
}
|
|
||||||
h.ringsMu.Unlock()
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = autoMax120(datasets...)
|
|
||||||
|
|
||||||
case path == "server-temp-ambient":
|
|
||||||
title = "Ambient / Other Sensors"
|
|
||||||
h.ringsMu.Lock()
|
|
||||||
datasets, names, labels = snapshotNamedRings(h.ambientTempRings)
|
|
||||||
h.ringsMu.Unlock()
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = autoMax120(datasets...)
|
|
||||||
|
|
||||||
case path == "server-power":
|
|
||||||
title = "System Power"
|
|
||||||
vPower, l := h.ringPower.snapshot()
|
|
||||||
labels = l
|
|
||||||
datasets = [][]float64{vPower}
|
|
||||||
names = []string{"Power W"}
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = autoMax120(vPower)
|
|
||||||
|
|
||||||
case path == "server-fans":
|
|
||||||
title = "Fan RPM"
|
|
||||||
h.ringsMu.Lock()
|
|
||||||
for i, fr := range h.ringFans {
|
|
||||||
fv, _ := fr.snapshot()
|
|
||||||
datasets = append(datasets, fv)
|
|
||||||
name := "Fan"
|
|
||||||
if i < len(h.fanNames) {
|
|
||||||
name = h.fanNames[i]
|
|
||||||
}
|
|
||||||
names = append(names, name+" RPM")
|
|
||||||
}
|
|
||||||
h.ringsMu.Unlock()
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = autoMax120(datasets...)
|
|
||||||
|
|
||||||
// ── Combined GPU charts (all GPUs on one chart) ───────────────────────
|
|
||||||
case path == "gpu-all-load":
|
|
||||||
title = "GPU Compute Load"
|
|
||||||
h.ringsMu.Lock()
|
|
||||||
for idx, gr := range h.gpuRings {
|
|
||||||
if gr == nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
vUtil, l := gr.Util.snapshot()
|
|
||||||
datasets = append(datasets, vUtil)
|
|
||||||
names = append(names, fmt.Sprintf("GPU %d", idx))
|
|
||||||
if len(labels) == 0 {
|
|
||||||
labels = l
|
|
||||||
}
|
|
||||||
}
|
|
||||||
h.ringsMu.Unlock()
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = floatPtr(100)
|
|
||||||
|
|
||||||
case path == "gpu-all-memload":
|
|
||||||
title = "GPU Memory Load"
|
|
||||||
h.ringsMu.Lock()
|
|
||||||
for idx, gr := range h.gpuRings {
|
|
||||||
if gr == nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
vMem, l := gr.MemUtil.snapshot()
|
|
||||||
datasets = append(datasets, vMem)
|
|
||||||
names = append(names, fmt.Sprintf("GPU %d", idx))
|
|
||||||
if len(labels) == 0 {
|
|
||||||
labels = l
|
|
||||||
}
|
|
||||||
}
|
|
||||||
h.ringsMu.Unlock()
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = floatPtr(100)
|
|
||||||
|
|
||||||
case path == "gpu-all-power":
|
|
||||||
title = "GPU Power"
|
|
||||||
h.ringsMu.Lock()
|
|
||||||
for idx, gr := range h.gpuRings {
|
|
||||||
if gr == nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
vPow, l := gr.Power.snapshot()
|
|
||||||
datasets = append(datasets, vPow)
|
|
||||||
names = append(names, fmt.Sprintf("GPU %d", idx))
|
|
||||||
if len(labels) == 0 {
|
|
||||||
labels = l
|
|
||||||
}
|
|
||||||
}
|
|
||||||
h.ringsMu.Unlock()
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = autoMax120(datasets...)
|
|
||||||
|
|
||||||
case path == "gpu-all-temp":
|
|
||||||
title = "GPU Temperature"
|
|
||||||
h.ringsMu.Lock()
|
|
||||||
for idx, gr := range h.gpuRings {
|
|
||||||
if gr == nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
vTemp, l := gr.Temp.snapshot()
|
|
||||||
datasets = append(datasets, vTemp)
|
|
||||||
names = append(names, fmt.Sprintf("GPU %d", idx))
|
|
||||||
if len(labels) == 0 {
|
|
||||||
labels = l
|
|
||||||
}
|
|
||||||
}
|
|
||||||
h.ringsMu.Unlock()
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = autoMax120(datasets...)
|
|
||||||
|
|
||||||
// ── Per-GPU sub-charts ────────────────────────────────────────────────
|
|
||||||
case strings.HasPrefix(path, "gpu/"):
|
|
||||||
rest := strings.TrimPrefix(path, "gpu/")
|
|
||||||
// rest is either "{idx}-load", "{idx}-temp", "{idx}-power", or legacy "{idx}"
|
|
||||||
sub := ""
|
|
||||||
if i := strings.LastIndex(rest, "-"); i > 0 {
|
|
||||||
sub = rest[i+1:]
|
|
||||||
rest = rest[:i]
|
|
||||||
}
|
|
||||||
idx := 0
|
|
||||||
fmt.Sscanf(rest, "%d", &idx)
|
|
||||||
h.ringsMu.Lock()
|
|
||||||
var gr *gpuRings
|
|
||||||
if idx < len(h.gpuRings) {
|
|
||||||
gr = h.gpuRings[idx]
|
|
||||||
}
|
|
||||||
h.ringsMu.Unlock()
|
|
||||||
if gr == nil {
|
|
||||||
http.NotFound(w, r)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
switch sub {
|
|
||||||
case "load":
|
|
||||||
vUtil, l := gr.Util.snapshot()
|
|
||||||
vMemUtil, _ := gr.MemUtil.snapshot()
|
|
||||||
labels = l
|
|
||||||
title = fmt.Sprintf("GPU %d Load", idx)
|
|
||||||
datasets = [][]float64{vUtil, vMemUtil}
|
|
||||||
names = []string{"Load %", "Mem %"}
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = floatPtr(100)
|
|
||||||
case "temp":
|
|
||||||
vTemp, l := gr.Temp.snapshot()
|
|
||||||
labels = l
|
|
||||||
title = fmt.Sprintf("GPU %d Temperature", idx)
|
|
||||||
datasets = [][]float64{vTemp}
|
|
||||||
names = []string{"Temp °C"}
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = autoMax120(vTemp)
|
|
||||||
default: // "power" or legacy (no sub)
|
|
||||||
vPower, l := gr.Power.snapshot()
|
|
||||||
labels = l
|
|
||||||
title = fmt.Sprintf("GPU %d Power", idx)
|
|
||||||
datasets = [][]float64{vPower}
|
|
||||||
names = []string{"Power W"}
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = autoMax120(vPower)
|
|
||||||
}
|
|
||||||
|
|
||||||
default:
|
|
||||||
http.NotFound(w, r)
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -723,9 +534,11 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
for i, s := range samples {
|
for i, s := range samples {
|
||||||
power[i] = s.PowerW
|
power[i] = s.PowerW
|
||||||
}
|
}
|
||||||
|
power = normalizePowerSeries(power)
|
||||||
datasets = [][]float64{power}
|
datasets = [][]float64{power}
|
||||||
names = []string{"Power W"}
|
names = []string{"Power W"}
|
||||||
yMin, yMax = autoBounds120(power)
|
yMin = floatPtr(0)
|
||||||
|
yMax = autoMax120(power)
|
||||||
|
|
||||||
case path == "server-fans":
|
case path == "server-fans":
|
||||||
title = "Fan RPM"
|
title = "Fan RPM"
|
||||||
@@ -831,6 +644,7 @@ func namedTempDatasets(samples []platform.LiveMetricSample, group string) ([][]f
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
sort.Strings(names)
|
||||||
datasets := make([][]float64, 0, len(names))
|
datasets := make([][]float64, 0, len(names))
|
||||||
for _, name := range names {
|
for _, name := range names {
|
||||||
ds := make([]float64, len(samples))
|
ds := make([]float64, len(samples))
|
||||||
@@ -858,6 +672,7 @@ func namedFanDatasets(samples []platform.LiveMetricSample) ([][]float64, []strin
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
sort.Strings(names)
|
||||||
datasets := make([][]float64, 0, len(names))
|
datasets := make([][]float64, 0, len(names))
|
||||||
for _, name := range names {
|
for _, name := range names {
|
||||||
ds := make([]float64, len(samples))
|
ds := make([]float64, len(samples))
|
||||||
@@ -869,7 +684,7 @@ func namedFanDatasets(samples []platform.LiveMetricSample) ([][]float64, []strin
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
datasets = append(datasets, ds)
|
datasets = append(datasets, normalizeFanSeries(ds))
|
||||||
}
|
}
|
||||||
return datasets, names
|
return datasets, names
|
||||||
}
|
}
|
||||||
@@ -885,6 +700,7 @@ func gpuDatasets(samples []platform.LiveMetricSample, pick func(platform.GPUMetr
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
sort.Ints(indices)
|
||||||
datasets := make([][]float64, 0, len(indices))
|
datasets := make([][]float64, 0, len(indices))
|
||||||
names := make([]string, 0, len(indices))
|
names := make([]string, 0, len(indices))
|
||||||
for _, idx := range indices {
|
for _, idx := range indices {
|
||||||
@@ -923,6 +739,48 @@ func coalesceDataset(ds []float64, n int) []float64 {
|
|||||||
return make([]float64, n)
|
return make([]float64, n)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func normalizePowerSeries(ds []float64) []float64 {
|
||||||
|
if len(ds) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
out := make([]float64, len(ds))
|
||||||
|
copy(out, ds)
|
||||||
|
last := 0.0
|
||||||
|
haveLast := false
|
||||||
|
for i, v := range out {
|
||||||
|
if v > 0 {
|
||||||
|
last = v
|
||||||
|
haveLast = true
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if haveLast {
|
||||||
|
out[i] = last
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalizeFanSeries(ds []float64) []float64 {
|
||||||
|
if len(ds) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
out := make([]float64, len(ds))
|
||||||
|
var lastPositive float64
|
||||||
|
for i, v := range ds {
|
||||||
|
if v > 0 {
|
||||||
|
lastPositive = v
|
||||||
|
out[i] = v
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if lastPositive > 0 {
|
||||||
|
out[i] = lastPositive
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out[i] = 0
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
// floatPtr returns a pointer to a float64 value.
|
// floatPtr returns a pointer to a float64 value.
|
||||||
func floatPtr(v float64) *float64 { return &v }
|
func floatPtr(v float64) *float64 { return &v }
|
||||||
|
|
||||||
@@ -1014,15 +872,17 @@ func renderChartSVG(title string, datasets [][]float64, names []string, labels [
|
|||||||
opt.Title = gocharts.TitleOption{Text: title}
|
opt.Title = gocharts.TitleOption{Text: title}
|
||||||
opt.XAxis.Labels = sparse
|
opt.XAxis.Labels = sparse
|
||||||
opt.Legend = gocharts.LegendOption{SeriesNames: names}
|
opt.Legend = gocharts.LegendOption{SeriesNames: names}
|
||||||
|
if chartLegendVisible(len(names)) {
|
||||||
|
opt.Legend.Offset = gocharts.OffsetStr{Top: gocharts.PositionBottom}
|
||||||
|
opt.Legend.OverlayChart = gocharts.Ptr(false)
|
||||||
|
} else {
|
||||||
|
opt.Legend.Show = gocharts.Ptr(false)
|
||||||
|
}
|
||||||
opt.Symbol = gocharts.SymbolNone
|
opt.Symbol = gocharts.SymbolNone
|
||||||
// Right padding: reserve space for the MarkLine label (library recommendation).
|
// Right padding: reserve space for the MarkLine label (library recommendation).
|
||||||
opt.Padding = gocharts.NewBox(20, 20, 80, 20)
|
opt.Padding = gocharts.NewBox(20, 20, 80, 20)
|
||||||
if yMin != nil || yMax != nil {
|
if yMin != nil || yMax != nil {
|
||||||
opt.YAxis = []gocharts.YAxisOption{{
|
opt.YAxis = []gocharts.YAxisOption{chartYAxisOption(yMin, yMax)}
|
||||||
Min: yMin,
|
|
||||||
Max: yMax,
|
|
||||||
ValueFormatter: chartLegendNumber,
|
|
||||||
}}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add a single peak mark line on the series that holds the global maximum.
|
// Add a single peak mark line on the series that holds the global maximum.
|
||||||
@@ -1034,7 +894,7 @@ func renderChartSVG(title string, datasets [][]float64, names []string, labels [
|
|||||||
p := gocharts.NewPainter(gocharts.PainterOptions{
|
p := gocharts.NewPainter(gocharts.PainterOptions{
|
||||||
OutputFormat: gocharts.ChartOutputSVG,
|
OutputFormat: gocharts.ChartOutputSVG,
|
||||||
Width: 1400,
|
Width: 1400,
|
||||||
Height: 240,
|
Height: chartCanvasHeight(len(names)),
|
||||||
}, gocharts.PainterThemeOption(gocharts.GetTheme("grafana")))
|
}, gocharts.PainterThemeOption(gocharts.GetTheme("grafana")))
|
||||||
if err := p.LineChart(opt); err != nil {
|
if err := p.LineChart(opt); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
@@ -1042,6 +902,26 @@ func renderChartSVG(title string, datasets [][]float64, names []string, labels [
|
|||||||
return p.Bytes()
|
return p.Bytes()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func chartLegendVisible(seriesCount int) bool {
|
||||||
|
return seriesCount <= 8
|
||||||
|
}
|
||||||
|
|
||||||
|
func chartCanvasHeight(seriesCount int) int {
|
||||||
|
if chartLegendVisible(seriesCount) {
|
||||||
|
return 360
|
||||||
|
}
|
||||||
|
return 288
|
||||||
|
}
|
||||||
|
|
||||||
|
func chartYAxisOption(yMin, yMax *float64) gocharts.YAxisOption {
|
||||||
|
return gocharts.YAxisOption{
|
||||||
|
Min: yMin,
|
||||||
|
Max: yMax,
|
||||||
|
LabelCount: 11,
|
||||||
|
ValueFormatter: chartYAxisNumber,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// globalPeakSeries returns the index of the series containing the global maximum
|
// globalPeakSeries returns the index of the series containing the global maximum
|
||||||
// value across all datasets, and that maximum value.
|
// value across all datasets, and that maximum value.
|
||||||
func globalPeakSeries(datasets [][]float64) (idx int, peak float64) {
|
func globalPeakSeries(datasets [][]float64) (idx int, peak float64) {
|
||||||
@@ -1129,6 +1009,28 @@ func snapshotNamedRings(rings []*namedMetricsRing) ([][]float64, []string, []str
|
|||||||
return datasets, names, labels
|
return datasets, names, labels
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func snapshotFanRings(rings []*metricsRing, fanNames []string) ([][]float64, []string, []string) {
|
||||||
|
var datasets [][]float64
|
||||||
|
var names []string
|
||||||
|
var labels []string
|
||||||
|
for i, ring := range rings {
|
||||||
|
if ring == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
vals, l := ring.snapshot()
|
||||||
|
datasets = append(datasets, normalizeFanSeries(vals))
|
||||||
|
name := "Fan"
|
||||||
|
if i < len(fanNames) {
|
||||||
|
name = fanNames[i]
|
||||||
|
}
|
||||||
|
names = append(names, name+" RPM")
|
||||||
|
if len(labels) == 0 {
|
||||||
|
labels = l
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return datasets, names, labels
|
||||||
|
}
|
||||||
|
|
||||||
func chartLegendNumber(v float64) string {
|
func chartLegendNumber(v float64) string {
|
||||||
neg := v < 0
|
neg := v < 0
|
||||||
if v < 0 {
|
if v < 0 {
|
||||||
@@ -1151,6 +1053,23 @@ func chartLegendNumber(v float64) string {
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func chartYAxisNumber(v float64) string {
|
||||||
|
neg := v < 0
|
||||||
|
if neg {
|
||||||
|
v = -v
|
||||||
|
}
|
||||||
|
var out string
|
||||||
|
if v >= 1000 {
|
||||||
|
out = fmt.Sprintf("%dк", int((v+500)/1000))
|
||||||
|
} else {
|
||||||
|
out = fmt.Sprintf("%.0f", v)
|
||||||
|
}
|
||||||
|
if neg {
|
||||||
|
return "-" + out
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
func sparseLabels(labels []string, n int) []string {
|
func sparseLabels(labels []string, n int) []string {
|
||||||
out := make([]string, len(labels))
|
out := make([]string, len(labels))
|
||||||
step := len(labels) / n
|
step := len(labels) / n
|
||||||
|
|||||||
@@ -89,6 +89,177 @@ func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
|
||||||
|
samples := []platform.LiveMetricSample{
|
||||||
|
{
|
||||||
|
Timestamp: time.Now().Add(-2 * time.Minute),
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 7, PowerW: 170},
|
||||||
|
{GPUIndex: 2, PowerW: 120},
|
||||||
|
{GPUIndex: 0, PowerW: 100},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Timestamp: time.Now().Add(-1 * time.Minute),
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, PowerW: 101},
|
||||||
|
{GPUIndex: 7, PowerW: 171},
|
||||||
|
{GPUIndex: 2, PowerW: 121},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("chartDataFromSamples returned ok=false")
|
||||||
|
}
|
||||||
|
if title != "GPU Power" {
|
||||||
|
t.Fatalf("title=%q", title)
|
||||||
|
}
|
||||||
|
wantNames := []string{"GPU 0", "GPU 2", "GPU 7"}
|
||||||
|
if len(names) != len(wantNames) {
|
||||||
|
t.Fatalf("names len=%d want %d: %v", len(names), len(wantNames), names)
|
||||||
|
}
|
||||||
|
for i := range wantNames {
|
||||||
|
if names[i] != wantNames[i] {
|
||||||
|
t.Fatalf("names[%d]=%q want %q; full=%v", i, names[i], wantNames[i], names)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if got := datasets[0]; len(got) != 2 || got[0] != 100 || got[1] != 101 {
|
||||||
|
t.Fatalf("GPU 0 dataset=%v want [100 101]", got)
|
||||||
|
}
|
||||||
|
if got := datasets[1]; len(got) != 2 || got[0] != 120 || got[1] != 121 {
|
||||||
|
t.Fatalf("GPU 2 dataset=%v want [120 121]", got)
|
||||||
|
}
|
||||||
|
if got := datasets[2]; len(got) != 2 || got[0] != 170 || got[1] != 171 {
|
||||||
|
t.Fatalf("GPU 7 dataset=%v want [170 171]", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
|
||||||
|
got := normalizePowerSeries([]float64{0, 480, 0, 0, 510, 0})
|
||||||
|
want := []float64{0, 480, 480, 480, 510, 510}
|
||||||
|
if len(got) != len(want) {
|
||||||
|
t.Fatalf("len=%d want %d", len(got), len(want))
|
||||||
|
}
|
||||||
|
for i := range want {
|
||||||
|
if got[i] != want[i] {
|
||||||
|
t.Fatalf("got[%d]=%v want %v", i, got[i], want[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRenderMetricsUsesBufferedChartRefresh(t *testing.T) {
|
||||||
|
body := renderMetrics()
|
||||||
|
if !strings.Contains(body, "const probe = new Image();") {
|
||||||
|
t.Fatalf("metrics page should preload chart images before swap: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, "el.dataset.loading === '1'") {
|
||||||
|
t.Fatalf("metrics page should avoid overlapping chart reloads: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChartLegendVisible(t *testing.T) {
|
||||||
|
if !chartLegendVisible(8) {
|
||||||
|
t.Fatal("legend should stay visible for charts with up to 8 series")
|
||||||
|
}
|
||||||
|
if chartLegendVisible(9) {
|
||||||
|
t.Fatal("legend should be hidden for charts with more than 8 series")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChartYAxisNumber(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
in float64
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{in: 999, want: "999"},
|
||||||
|
{in: 1000, want: "1к"},
|
||||||
|
{in: 1370, want: "1к"},
|
||||||
|
{in: 1500, want: "2к"},
|
||||||
|
{in: 10200, want: "10к"},
|
||||||
|
{in: -1499, want: "-1к"},
|
||||||
|
}
|
||||||
|
for _, tc := range tests {
|
||||||
|
if got := chartYAxisNumber(tc.in); got != tc.want {
|
||||||
|
t.Fatalf("chartYAxisNumber(%v)=%q want %q", tc.in, got, tc.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChartCanvasHeight(t *testing.T) {
|
||||||
|
if got := chartCanvasHeight(4); got != 360 {
|
||||||
|
t.Fatalf("chartCanvasHeight(4)=%d want 360", got)
|
||||||
|
}
|
||||||
|
if got := chartCanvasHeight(12); got != 288 {
|
||||||
|
t.Fatalf("chartCanvasHeight(12)=%d want 288", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
|
||||||
|
got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
|
||||||
|
want := []float64{4200, 4200, 4200, 4300, 4300}
|
||||||
|
if len(got) != len(want) {
|
||||||
|
t.Fatalf("len=%d want %d", len(got), len(want))
|
||||||
|
}
|
||||||
|
for i := range want {
|
||||||
|
if got[i] != want[i] {
|
||||||
|
t.Fatalf("got[%d]=%v want %v", i, got[i], want[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChartYAxisOption(t *testing.T) {
|
||||||
|
min := floatPtr(0)
|
||||||
|
max := floatPtr(100)
|
||||||
|
opt := chartYAxisOption(min, max)
|
||||||
|
if opt.Min != min || opt.Max != max {
|
||||||
|
t.Fatalf("chartYAxisOption min/max mismatch: %#v", opt)
|
||||||
|
}
|
||||||
|
if opt.LabelCount != 11 {
|
||||||
|
t.Fatalf("chartYAxisOption labelCount=%d want 11", opt.LabelCount)
|
||||||
|
}
|
||||||
|
if got := opt.ValueFormatter(1000); got != "1к" {
|
||||||
|
t.Fatalf("chartYAxisOption formatter(1000)=%q want 1к", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSnapshotFanRingsUsesTimelineLabels(t *testing.T) {
|
||||||
|
r1 := newMetricsRing(4)
|
||||||
|
r2 := newMetricsRing(4)
|
||||||
|
r1.push(1000)
|
||||||
|
r1.push(1100)
|
||||||
|
r2.push(1200)
|
||||||
|
r2.push(1300)
|
||||||
|
|
||||||
|
datasets, names, labels := snapshotFanRings([]*metricsRing{r1, r2}, []string{"FAN_A", "FAN_B"})
|
||||||
|
if len(datasets) != 2 {
|
||||||
|
t.Fatalf("datasets=%d want 2", len(datasets))
|
||||||
|
}
|
||||||
|
if len(names) != 2 || names[0] != "FAN_A RPM" || names[1] != "FAN_B RPM" {
|
||||||
|
t.Fatalf("names=%v", names)
|
||||||
|
}
|
||||||
|
if len(labels) != 2 {
|
||||||
|
t.Fatalf("labels=%v want 2 entries", labels)
|
||||||
|
}
|
||||||
|
if labels[0] == "" || labels[1] == "" {
|
||||||
|
t.Fatalf("labels should contain timeline values, got %v", labels)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRenderNetworkInlineSyncsPendingState(t *testing.T) {
|
||||||
|
body := renderNetworkInline()
|
||||||
|
if !strings.Contains(body, "d.pending_change") {
|
||||||
|
t.Fatalf("network UI should read pending network state from API: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, "setInterval(loadNetwork, 5000)") {
|
||||||
|
t.Fatalf("network UI should periodically refresh network state: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, "showNetPending(NET_ROLLBACK_SECS)") {
|
||||||
|
t.Fatalf("network UI should show pending confirmation immediately on apply: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestRootRendersDashboard(t *testing.T) {
|
func TestRootRendersDashboard(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
path := filepath.Join(dir, "audit.json")
|
path := filepath.Join(dir, "audit.json")
|
||||||
@@ -136,6 +307,33 @@ func TestRootRendersDashboard(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
exportDir := filepath.Join(dir, "export")
|
||||||
|
if err := os.MkdirAll(exportDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{
|
||||||
|
Title: "Bee Hardware Audit",
|
||||||
|
AuditPath: filepath.Join(dir, "missing-audit.json"),
|
||||||
|
ExportDir: exportDir,
|
||||||
|
})
|
||||||
|
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, `Run Audit`) {
|
||||||
|
t.Fatalf("dashboard missing run audit button: %s", body)
|
||||||
|
}
|
||||||
|
if strings.Contains(body, `No audit data`) {
|
||||||
|
t.Fatalf("dashboard still shows empty audit badge: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
|
func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
path := filepath.Join(dir, "audit.json")
|
path := filepath.Join(dir, "audit.json")
|
||||||
@@ -232,6 +430,17 @@ func TestSupportBundleEndpointReturnsArchive(t *testing.T) {
|
|||||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.log"), []byte("audit log"), 0644); err != nil {
|
if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.log"), []byte("audit log"), 0644); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
archive, err := os.CreateTemp(os.TempDir(), "bee-support-server-test-*.tar.gz")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { _ = os.Remove(archive.Name()) })
|
||||||
|
if _, err := archive.WriteString("support-bundle"); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := archive.Close(); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
handler := NewHandler(HandlerOptions{ExportDir: exportDir})
|
handler := NewHandler(HandlerOptions{ExportDir: exportDir})
|
||||||
rec := httptest.NewRecorder()
|
rec := httptest.NewRecorder()
|
||||||
|
|||||||
@@ -6,8 +6,10 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"sort"
|
"sort"
|
||||||
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@@ -39,6 +41,7 @@ var taskNames = map[string]string{
|
|||||||
"sat-stress": "SAT Stress (stressapptest)",
|
"sat-stress": "SAT Stress (stressapptest)",
|
||||||
"platform-stress": "Platform Thermal Cycling",
|
"platform-stress": "Platform Thermal Cycling",
|
||||||
"audit": "Audit",
|
"audit": "Audit",
|
||||||
|
"support-bundle": "Support Bundle",
|
||||||
"install": "Install to Disk",
|
"install": "Install to Disk",
|
||||||
"install-to-ram": "Install to RAM",
|
"install-to-ram": "Install to RAM",
|
||||||
}
|
}
|
||||||
@@ -51,6 +54,33 @@ var burnNames = map[string]string{
|
|||||||
"amd": "AMD GPU Burn-in",
|
"amd": "AMD GPU Burn-in",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func nvidiaStressTaskName(loader string) string {
|
||||||
|
switch strings.TrimSpace(strings.ToLower(loader)) {
|
||||||
|
case platform.NvidiaStressLoaderJohn:
|
||||||
|
return "NVIDIA GPU Stress (John/OpenCL)"
|
||||||
|
case platform.NvidiaStressLoaderNCCL:
|
||||||
|
return "NVIDIA GPU Stress (NCCL)"
|
||||||
|
default:
|
||||||
|
return "NVIDIA GPU Stress (bee-gpu-burn)"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskDisplayName(target, profile, loader string) string {
|
||||||
|
name := taskNames[target]
|
||||||
|
if profile != "" {
|
||||||
|
if n, ok := burnNames[target]; ok {
|
||||||
|
name = n
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if target == "nvidia-stress" {
|
||||||
|
name = nvidiaStressTaskName(loader)
|
||||||
|
}
|
||||||
|
if name == "" {
|
||||||
|
name = target
|
||||||
|
}
|
||||||
|
return name
|
||||||
|
}
|
||||||
|
|
||||||
// Task represents one unit of work in the queue.
|
// Task represents one unit of work in the queue.
|
||||||
type Task struct {
|
type Task struct {
|
||||||
ID string `json:"id"`
|
ID string `json:"id"`
|
||||||
@@ -61,6 +91,7 @@ type Task struct {
|
|||||||
CreatedAt time.Time `json:"created_at"`
|
CreatedAt time.Time `json:"created_at"`
|
||||||
StartedAt *time.Time `json:"started_at,omitempty"`
|
StartedAt *time.Time `json:"started_at,omitempty"`
|
||||||
DoneAt *time.Time `json:"done_at,omitempty"`
|
DoneAt *time.Time `json:"done_at,omitempty"`
|
||||||
|
ElapsedSec int `json:"elapsed_sec,omitempty"`
|
||||||
ErrMsg string `json:"error,omitempty"`
|
ErrMsg string `json:"error,omitempty"`
|
||||||
LogPath string `json:"log_path,omitempty"`
|
LogPath string `json:"log_path,omitempty"`
|
||||||
|
|
||||||
@@ -79,6 +110,7 @@ type taskParams struct {
|
|||||||
BurnProfile string `json:"burn_profile,omitempty"`
|
BurnProfile string `json:"burn_profile,omitempty"`
|
||||||
DisplayName string `json:"display_name,omitempty"`
|
DisplayName string `json:"display_name,omitempty"`
|
||||||
Device string `json:"device,omitempty"` // for install
|
Device string `json:"device,omitempty"` // for install
|
||||||
|
PlatformComponents []string `json:"platform_components,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type persistedTask struct {
|
type persistedTask struct {
|
||||||
@@ -185,6 +217,10 @@ var (
|
|||||||
runSATStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
runSATStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
return a.RunSATStressPackCtx(ctx, baseDir, durationSec, logFunc)
|
return a.RunSATStressPackCtx(ctx, baseDir, durationSec, logFunc)
|
||||||
}
|
}
|
||||||
|
buildSupportBundle = app.BuildSupportBundle
|
||||||
|
installCommand = func(ctx context.Context, device string, logPath string) *exec.Cmd {
|
||||||
|
return exec.CommandContext(ctx, "bee-install", device, logPath)
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
// enqueue adds a task to the queue and notifies the worker.
|
// enqueue adds a task to the queue and notifies the worker.
|
||||||
@@ -276,6 +312,7 @@ func (q *taskQueue) snapshot() []Task {
|
|||||||
out := make([]Task, len(q.tasks))
|
out := make([]Task, len(q.tasks))
|
||||||
for i, t := range q.tasks {
|
for i, t := range q.tasks {
|
||||||
out[i] = *t
|
out[i] = *t
|
||||||
|
out[i].ElapsedSec = taskElapsedSec(&out[i], time.Now())
|
||||||
}
|
}
|
||||||
sort.SliceStable(out, func(i, j int) bool {
|
sort.SliceStable(out, func(i, j int) bool {
|
||||||
si := statusOrder(out[i].Status)
|
si := statusOrder(out[i].Status)
|
||||||
@@ -382,9 +419,9 @@ func setCPUGovernor(governor string) {
|
|||||||
|
|
||||||
// runTask executes the work for a task, writing output to j.
|
// runTask executes the work for a task, writing output to j.
|
||||||
func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||||
if q.opts == nil || q.opts.App == nil {
|
if q.opts == nil {
|
||||||
j.append("ERROR: app not configured")
|
j.append("ERROR: handler options not configured")
|
||||||
j.finish("app not configured")
|
j.finish("handler options not configured")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
a := q.opts.App
|
a := q.opts.App
|
||||||
@@ -401,6 +438,10 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
|
|
||||||
switch t.Target {
|
switch t.Target {
|
||||||
case "nvidia":
|
case "nvidia":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
diagLevel := t.params.DiagLevel
|
diagLevel := t.params.DiagLevel
|
||||||
if t.params.BurnProfile != "" && diagLevel <= 0 {
|
if t.params.BurnProfile != "" && diagLevel <= 0 {
|
||||||
diagLevel = resolveBurnPreset(t.params.BurnProfile).NvidiaDiag
|
diagLevel = resolveBurnPreset(t.params.BurnProfile).NvidiaDiag
|
||||||
@@ -418,6 +459,10 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
archive, err = a.RunNvidiaAcceptancePack("", j.append)
|
archive, err = a.RunNvidiaAcceptancePack("", j.append)
|
||||||
}
|
}
|
||||||
case "nvidia-stress":
|
case "nvidia-stress":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
dur := t.params.Duration
|
dur := t.params.Duration
|
||||||
if t.params.BurnProfile != "" && dur <= 0 {
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
@@ -429,10 +474,22 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||||
}, j.append)
|
}, j.append)
|
||||||
case "memory":
|
case "memory":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", j.append)
|
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", j.append)
|
||||||
case "storage":
|
case "storage":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
archive, err = runStorageAcceptancePackCtx(a, ctx, "", j.append)
|
archive, err = runStorageAcceptancePackCtx(a, ctx, "", j.append)
|
||||||
case "cpu":
|
case "cpu":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
dur := t.params.Duration
|
dur := t.params.Duration
|
||||||
if t.params.BurnProfile != "" && dur <= 0 {
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
@@ -440,35 +497,69 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
if dur <= 0 {
|
if dur <= 0 {
|
||||||
dur = 60
|
dur = 60
|
||||||
}
|
}
|
||||||
|
j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
|
||||||
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
|
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
|
||||||
case "amd":
|
case "amd":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append)
|
archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append)
|
||||||
case "amd-mem":
|
case "amd-mem":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
archive, err = runAMDMemIntegrityPackCtx(a, ctx, "", j.append)
|
archive, err = runAMDMemIntegrityPackCtx(a, ctx, "", j.append)
|
||||||
case "amd-bandwidth":
|
case "amd-bandwidth":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
archive, err = runAMDMemBandwidthPackCtx(a, ctx, "", j.append)
|
archive, err = runAMDMemBandwidthPackCtx(a, ctx, "", j.append)
|
||||||
case "amd-stress":
|
case "amd-stress":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
dur := t.params.Duration
|
dur := t.params.Duration
|
||||||
if t.params.BurnProfile != "" && dur <= 0 {
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
}
|
}
|
||||||
archive, err = runAMDStressPackCtx(a, ctx, "", dur, j.append)
|
archive, err = runAMDStressPackCtx(a, ctx, "", dur, j.append)
|
||||||
case "memory-stress":
|
case "memory-stress":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
dur := t.params.Duration
|
dur := t.params.Duration
|
||||||
if t.params.BurnProfile != "" && dur <= 0 {
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
}
|
}
|
||||||
archive, err = runMemoryStressPackCtx(a, ctx, "", dur, j.append)
|
archive, err = runMemoryStressPackCtx(a, ctx, "", dur, j.append)
|
||||||
case "sat-stress":
|
case "sat-stress":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
dur := t.params.Duration
|
dur := t.params.Duration
|
||||||
if t.params.BurnProfile != "" && dur <= 0 {
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
}
|
}
|
||||||
archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
|
archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
|
||||||
case "platform-stress":
|
case "platform-stress":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
opts := resolvePlatformStressPreset(t.params.BurnProfile)
|
opts := resolvePlatformStressPreset(t.params.BurnProfile)
|
||||||
|
opts.Components = t.params.PlatformComponents
|
||||||
archive, err = a.RunPlatformStress(ctx, "", opts, j.append)
|
archive, err = a.RunPlatformStress(ctx, "", opts, j.append)
|
||||||
case "audit":
|
case "audit":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
result, e := a.RunAuditNow(q.opts.RuntimeMode)
|
result, e := a.RunAuditNow(q.opts.RuntimeMode)
|
||||||
if e != nil {
|
if e != nil {
|
||||||
err = e
|
err = e
|
||||||
@@ -477,7 +568,22 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
j.append(line)
|
j.append(line)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
case "support-bundle":
|
||||||
|
j.append("Building support bundle...")
|
||||||
|
archive, err = buildSupportBundle(q.opts.ExportDir)
|
||||||
|
case "install":
|
||||||
|
if strings.TrimSpace(t.params.Device) == "" {
|
||||||
|
err = fmt.Errorf("device is required")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
installLogPath := platform.InstallLogPath(t.params.Device)
|
||||||
|
j.append("Install log: " + installLogPath)
|
||||||
|
err = streamCmdJob(j, installCommand(ctx, t.params.Device, installLogPath))
|
||||||
case "install-to-ram":
|
case "install-to-ram":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
err = a.RunInstallToRAM(ctx, j.append)
|
err = a.RunInstallToRAM(ctx, j.append)
|
||||||
default:
|
default:
|
||||||
j.append("ERROR: unknown target: " + t.Target)
|
j.append("ERROR: unknown target: " + t.Target)
|
||||||
@@ -665,6 +771,7 @@ func (q *taskQueue) loadLocked() {
|
|||||||
q.assignTaskLogPathLocked(t)
|
q.assignTaskLogPathLocked(t)
|
||||||
if t.Status == TaskPending || t.Status == TaskRunning {
|
if t.Status == TaskPending || t.Status == TaskRunning {
|
||||||
t.Status = TaskPending
|
t.Status = TaskPending
|
||||||
|
t.StartedAt = nil
|
||||||
t.DoneAt = nil
|
t.DoneAt = nil
|
||||||
t.ErrMsg = ""
|
t.ErrMsg = ""
|
||||||
}
|
}
|
||||||
@@ -704,3 +811,21 @@ func (q *taskQueue) persistLocked() {
|
|||||||
}
|
}
|
||||||
_ = os.Rename(tmp, q.statePath)
|
_ = os.Rename(tmp, q.statePath)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func taskElapsedSec(t *Task, now time.Time) int {
|
||||||
|
if t == nil || t.StartedAt == nil || t.StartedAt.IsZero() {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
start := *t.StartedAt
|
||||||
|
if !t.CreatedAt.IsZero() && start.Before(t.CreatedAt) {
|
||||||
|
start = t.CreatedAt
|
||||||
|
}
|
||||||
|
end := now
|
||||||
|
if t.DoneAt != nil && !t.DoneAt.IsZero() {
|
||||||
|
end = *t.DoneAt
|
||||||
|
}
|
||||||
|
if end.Before(start) {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return int(end.Sub(start).Round(time.Second) / time.Second)
|
||||||
|
}
|
||||||
|
|||||||
@@ -3,7 +3,9 @@ package webui
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"os"
|
"os"
|
||||||
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@@ -53,6 +55,9 @@ func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
|
|||||||
if got.Status != TaskPending {
|
if got.Status != TaskPending {
|
||||||
t.Fatalf("status=%q want %q", got.Status, TaskPending)
|
t.Fatalf("status=%q want %q", got.Status, TaskPending)
|
||||||
}
|
}
|
||||||
|
if got.StartedAt != nil {
|
||||||
|
t.Fatalf("started_at=%v want nil for recovered pending task", got.StartedAt)
|
||||||
|
}
|
||||||
if got.params.Duration != 300 || got.params.BurnProfile != "smoke" {
|
if got.params.Duration != 300 || got.params.BurnProfile != "smoke" {
|
||||||
t.Fatalf("params=%+v", got.params)
|
t.Fatalf("params=%+v", got.params)
|
||||||
}
|
}
|
||||||
@@ -95,9 +100,24 @@ func TestResolveBurnPreset(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestRunTaskHonorsCancel(t *testing.T) {
|
func TestTaskDisplayNameUsesNvidiaStressLoader(t *testing.T) {
|
||||||
t.Parallel()
|
tests := []struct {
|
||||||
|
loader string
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{loader: "", want: "NVIDIA GPU Stress (bee-gpu-burn)"},
|
||||||
|
{loader: "builtin", want: "NVIDIA GPU Stress (bee-gpu-burn)"},
|
||||||
|
{loader: "john", want: "NVIDIA GPU Stress (John/OpenCL)"},
|
||||||
|
{loader: "nccl", want: "NVIDIA GPU Stress (NCCL)"},
|
||||||
|
}
|
||||||
|
for _, tc := range tests {
|
||||||
|
if got := taskDisplayName("nvidia-stress", "acceptance", tc.loader); got != tc.want {
|
||||||
|
t.Fatalf("taskDisplayName(loader=%q)=%q want %q", tc.loader, got, tc.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunTaskHonorsCancel(t *testing.T) {
|
||||||
blocked := make(chan struct{})
|
blocked := make(chan struct{})
|
||||||
released := make(chan struct{})
|
released := make(chan struct{})
|
||||||
aRun := func(_ any, ctx context.Context, _ string, _ int, _ func(string)) (string, error) {
|
aRun := func(_ any, ctx context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||||
@@ -154,3 +174,131 @@ func TestRunTaskHonorsCancel(t *testing.T) {
|
|||||||
t.Fatal("runTask did not return after cancel")
|
t.Fatal("runTask did not return after cancel")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRunTaskUsesBurnProfileDurationForCPU(t *testing.T) {
|
||||||
|
var gotDuration int
|
||||||
|
q := &taskQueue{
|
||||||
|
opts: &HandlerOptions{App: &app.App{}},
|
||||||
|
}
|
||||||
|
tk := &Task{
|
||||||
|
ID: "cpu-burn-1",
|
||||||
|
Name: "CPU Burn-in",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
params: taskParams{BurnProfile: "smoke"},
|
||||||
|
}
|
||||||
|
j := &jobState{}
|
||||||
|
|
||||||
|
orig := runCPUAcceptancePackCtx
|
||||||
|
runCPUAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, durationSec int, _ func(string)) (string, error) {
|
||||||
|
gotDuration = durationSec
|
||||||
|
return "/tmp/cpu-burn.tar.gz", nil
|
||||||
|
}
|
||||||
|
defer func() { runCPUAcceptancePackCtx = orig }()
|
||||||
|
|
||||||
|
q.runTask(tk, j, context.Background())
|
||||||
|
|
||||||
|
if gotDuration != 5*60 {
|
||||||
|
t.Fatalf("duration=%d want %d", gotDuration, 5*60)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
q := &taskQueue{
|
||||||
|
opts: &HandlerOptions{ExportDir: dir},
|
||||||
|
}
|
||||||
|
tk := &Task{
|
||||||
|
ID: "support-bundle-1",
|
||||||
|
Name: "Support Bundle",
|
||||||
|
Target: "support-bundle",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
}
|
||||||
|
j := &jobState{}
|
||||||
|
|
||||||
|
var gotExportDir string
|
||||||
|
orig := buildSupportBundle
|
||||||
|
buildSupportBundle = func(exportDir string) (string, error) {
|
||||||
|
gotExportDir = exportDir
|
||||||
|
return filepath.Join(exportDir, "bundle.tar.gz"), nil
|
||||||
|
}
|
||||||
|
defer func() { buildSupportBundle = orig }()
|
||||||
|
|
||||||
|
q.runTask(tk, j, context.Background())
|
||||||
|
|
||||||
|
if gotExportDir != dir {
|
||||||
|
t.Fatalf("exportDir=%q want %q", gotExportDir, dir)
|
||||||
|
}
|
||||||
|
if j.err != "" {
|
||||||
|
t.Fatalf("unexpected error: %q", j.err)
|
||||||
|
}
|
||||||
|
if !strings.Contains(strings.Join(j.lines, "\n"), "Archive: "+filepath.Join(dir, "bundle.tar.gz")) {
|
||||||
|
t.Fatalf("lines=%v", j.lines)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTaskElapsedSecClampsInvalidStartedAt(t *testing.T) {
|
||||||
|
now := time.Date(2026, 4, 1, 19, 10, 0, 0, time.UTC)
|
||||||
|
created := time.Date(2026, 4, 1, 19, 4, 5, 0, time.UTC)
|
||||||
|
started := time.Time{}
|
||||||
|
task := &Task{
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: created,
|
||||||
|
StartedAt: &started,
|
||||||
|
}
|
||||||
|
if got := taskElapsedSec(task, now); got != 0 {
|
||||||
|
t.Fatalf("taskElapsedSec(zero start)=%d want 0", got)
|
||||||
|
}
|
||||||
|
|
||||||
|
stale := created.Add(-24 * time.Hour)
|
||||||
|
task.StartedAt = &stale
|
||||||
|
if got := taskElapsedSec(task, now); got != int(now.Sub(created).Seconds()) {
|
||||||
|
t.Fatalf("taskElapsedSec(stale start)=%d want %d", got, int(now.Sub(created).Seconds()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunTaskInstallUsesSharedCommandStreaming(t *testing.T) {
|
||||||
|
q := &taskQueue{
|
||||||
|
opts: &HandlerOptions{},
|
||||||
|
}
|
||||||
|
tk := &Task{
|
||||||
|
ID: "install-1",
|
||||||
|
Name: "Install to Disk",
|
||||||
|
Target: "install",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
params: taskParams{Device: "/dev/sda"},
|
||||||
|
}
|
||||||
|
j := &jobState{}
|
||||||
|
|
||||||
|
var gotDevice string
|
||||||
|
var gotLogPath string
|
||||||
|
orig := installCommand
|
||||||
|
installCommand = func(ctx context.Context, device string, logPath string) *exec.Cmd {
|
||||||
|
gotDevice = device
|
||||||
|
gotLogPath = logPath
|
||||||
|
return exec.CommandContext(ctx, "sh", "-c", "printf 'line1\nline2\n'")
|
||||||
|
}
|
||||||
|
defer func() { installCommand = orig }()
|
||||||
|
|
||||||
|
q.runTask(tk, j, context.Background())
|
||||||
|
|
||||||
|
if gotDevice != "/dev/sda" {
|
||||||
|
t.Fatalf("device=%q want /dev/sda", gotDevice)
|
||||||
|
}
|
||||||
|
if gotLogPath == "" {
|
||||||
|
t.Fatal("expected install log path")
|
||||||
|
}
|
||||||
|
logs := strings.Join(j.lines, "\n")
|
||||||
|
if !strings.Contains(logs, "Install log: ") {
|
||||||
|
t.Fatalf("missing install log line: %v", j.lines)
|
||||||
|
}
|
||||||
|
if !strings.Contains(logs, "line1") || !strings.Contains(logs, "line2") {
|
||||||
|
t.Fatalf("missing streamed output: %v", j.lines)
|
||||||
|
}
|
||||||
|
if j.err != "" {
|
||||||
|
t.Fatalf("unexpected error: %q", j.err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
2
bible
2
bible
Submodule bible updated: 456c1f022c...688b87e98d
@@ -9,6 +9,34 @@ All live metrics charts in the web UI are server-side SVG images served by Go
|
|||||||
and polled by the browser every 2 seconds via `<img src="...?t=now">`.
|
and polled by the browser every 2 seconds via `<img src="...?t=now">`.
|
||||||
There is no client-side canvas or JS chart library.
|
There is no client-side canvas or JS chart library.
|
||||||
|
|
||||||
|
## Rule: live charts must be visually uniform
|
||||||
|
|
||||||
|
Live charts are a single UI family, not a set of one-off widgets. New charts and
|
||||||
|
changes to existing charts must keep the same rendering model and presentation
|
||||||
|
rules unless there is an explicit architectural decision to diverge.
|
||||||
|
|
||||||
|
Default expectations:
|
||||||
|
|
||||||
|
- same server-side SVG pipeline for all live metrics charts
|
||||||
|
- same refresh behaviour and failure handling in the browser
|
||||||
|
- same canvas size class and card layout
|
||||||
|
- same legend placement policy across charts
|
||||||
|
- same axis, title, and summary conventions
|
||||||
|
- no chart-specific visual exceptions added as a quick fix
|
||||||
|
|
||||||
|
Current default for live charts:
|
||||||
|
|
||||||
|
- legend below the plot area when a chart has 8 series or fewer
|
||||||
|
- legend hidden when a chart has more than 8 series
|
||||||
|
- 10 equal Y-axis steps across the chart height
|
||||||
|
- 1400 x 360 SVG canvas with legend
|
||||||
|
- 1400 x 288 SVG canvas without legend
|
||||||
|
- full-width card rendering in a single-column stack
|
||||||
|
|
||||||
|
If one chart needs a different layout or legend behaviour, treat that as a
|
||||||
|
design-level decision affecting the whole chart family, not as a local tweak to
|
||||||
|
just one endpoint.
|
||||||
|
|
||||||
### Why go-analyze/charts
|
### Why go-analyze/charts
|
||||||
|
|
||||||
- Pure Go, no CGO — builds cleanly inside the live-build container
|
- Pure Go, no CGO — builds cleanly inside the live-build container
|
||||||
@@ -29,7 +57,8 @@ self-contained SVG renderer used **only** for completed SAT run reports
|
|||||||
| `GET /api/metrics/chart/server.svg` | CPU temp, CPU load %, mem load %, power W, fan RPMs |
|
| `GET /api/metrics/chart/server.svg` | CPU temp, CPU load %, mem load %, power W, fan RPMs |
|
||||||
| `GET /api/metrics/chart/gpu/{idx}.svg` | GPU temp °C, load %, mem %, power W |
|
| `GET /api/metrics/chart/gpu/{idx}.svg` | GPU temp °C, load %, mem %, power W |
|
||||||
|
|
||||||
Charts are 1400 × 280 px SVG. The page renders them at `width: 100%` in a
|
Charts are 1400 × 360 px SVG when the legend is shown, and 1400 × 288 px when
|
||||||
|
the legend is hidden. The page renders them at `width: 100%` in a
|
||||||
single-column layout so they always fill the viewport width.
|
single-column layout so they always fill the viewport width.
|
||||||
|
|
||||||
### Ring buffers
|
### Ring buffers
|
||||||
|
|||||||
@@ -60,6 +60,8 @@ Rules:
|
|||||||
- Chromium opens `http://localhost/` — the full interactive web UI
|
- Chromium opens `http://localhost/` — the full interactive web UI
|
||||||
- SSH is independent from the desktop path
|
- SSH is independent from the desktop path
|
||||||
- serial console support is enabled for VM boot debugging
|
- serial console support is enabled for VM boot debugging
|
||||||
|
- Default boot keeps the server-safe graphics path (`nomodeset` + forced `fbdev`) for IPMI/BMC consoles
|
||||||
|
- Higher-resolution mode selection is expected only when booting through an explicit `bee.display=kms` menu entry, which disables the forced `fbdev` Xorg config before `lightdm`
|
||||||
|
|
||||||
## ISO build sequence
|
## ISO build sequence
|
||||||
|
|
||||||
|
|||||||
224
bible-local/decisions/2026-04-01-memtest-build-strategy.md
Normal file
224
bible-local/decisions/2026-04-01-memtest-build-strategy.md
Normal file
@@ -0,0 +1,224 @@
|
|||||||
|
# Decision: Treat memtest as explicit ISO content, not as trusted live-build magic
|
||||||
|
|
||||||
|
**Date:** 2026-04-01
|
||||||
|
**Status:** resolved
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
We have already iterated on `memtest` multiple times and kept cycling between the same ideas.
|
||||||
|
The commit history shows several distinct attempts:
|
||||||
|
|
||||||
|
- `f91bce8` — fixed Bookworm memtest file names to `memtest86+x64.bin` / `memtest86+x64.efi`
|
||||||
|
- `5857805` — added a binary hook to copy memtest files from the build tree into the ISO root
|
||||||
|
- `f96b149` — added fallback extraction from the cached `.deb` when `chroot/boot/` stayed empty
|
||||||
|
- `d43a9ae` — removed the custom hook and switched back to live-build built-in memtest integration
|
||||||
|
- `60cb8f8` — restored explicit memtest menu entries and added ISO validation
|
||||||
|
- `3dbc218` / `3869788` — added archived build logs and better memtest diagnostics
|
||||||
|
|
||||||
|
Current evidence from the archived `easy-bee-nvidia-v3.14-amd64` logs dated 2026-04-01:
|
||||||
|
|
||||||
|
- `lb binary_memtest` does run and installs `memtest86+`
|
||||||
|
- but the final ISO still does **not** contain `boot/memtest86+x64.bin`
|
||||||
|
- the final ISO also does **not** contain memtest menu entries in `boot/grub/grub.cfg` or `isolinux/live.cfg`
|
||||||
|
|
||||||
|
So the assumption "live-build built-in memtest integration is enough on this stack" is currently false for this project until proven otherwise by a real built ISO.
|
||||||
|
|
||||||
|
Additional evidence from the archived `easy-bee-nvidia-v3.17-dirty-amd64` logs dated 2026-04-01:
|
||||||
|
|
||||||
|
- the build now completes successfully because memtest is non-blocking by default
|
||||||
|
- `lb binary_memtest` still runs and installs `memtest86+`
|
||||||
|
- the project-owned hook `config/hooks/normal/9100-memtest.hook.binary` does execute
|
||||||
|
- but it executes too early for its current target paths:
|
||||||
|
- `binary/boot/grub/grub.cfg` is still missing at hook time
|
||||||
|
- `binary/isolinux/live.cfg` is still missing at hook time
|
||||||
|
- memtest binaries are also still absent in `binary/boot/`
|
||||||
|
- later in the build, live-build does create intermediate bootloader configs with memtest lines in the workdir
|
||||||
|
- but the final ISO still lacks memtest binaries and still lacks memtest lines in extracted ISO `boot/grub/grub.cfg` and `isolinux/live.cfg`
|
||||||
|
|
||||||
|
So the assumption "the current normal binary hook path is late enough to patch final memtest artifacts" is also false.
|
||||||
|
|
||||||
|
Correction after inspecting the real `easy-bee-nvidia-v3.20-5-g76a9100-amd64.iso`
|
||||||
|
artifact dated 2026-04-01:
|
||||||
|
|
||||||
|
- the final ISO does contain `boot/memtest86+x64.bin`
|
||||||
|
- the final ISO does contain `boot/memtest86+x64.efi`
|
||||||
|
- the final ISO does contain memtest menu entries in both `boot/grub/grub.cfg`
|
||||||
|
and `isolinux/live.cfg`
|
||||||
|
- so `v3.20-5-g76a9100` was **not** another real memtest regression in the
|
||||||
|
shipped ISO
|
||||||
|
- the regression was in the build-time validator/debug path in `build.sh`
|
||||||
|
|
||||||
|
Root cause of the false alarm:
|
||||||
|
|
||||||
|
- `build.sh` treated "ISO reader command exists" as equivalent to "ISO reader
|
||||||
|
successfully listed/extracted members"
|
||||||
|
- `iso_list_files` / `iso_extract_file` failures were collapsed into the same
|
||||||
|
observable output as "memtest content missing"
|
||||||
|
- this made a reader failure look identical to a missing memtest payload
|
||||||
|
- as a result, we re-entered the same memtest investigation loop even though
|
||||||
|
the real ISO was already correct
|
||||||
|
|
||||||
|
Additional correction from the subsequent `v3.21` build logs dated 2026-04-01:
|
||||||
|
|
||||||
|
- once ISO reading was fixed, the post-build debug correctly showed the raw ISO
|
||||||
|
still carried live-build's default memtest layout (`live/memtest.bin`,
|
||||||
|
`live/memtest.efi`, `boot/grub/memtest.cfg`, `isolinux/memtest.cfg`)
|
||||||
|
- that mismatch is expected to trigger project recovery, because `bee` requires
|
||||||
|
`boot/memtest86+x64.bin` / `boot/memtest86+x64.efi` plus matching menu paths
|
||||||
|
- however, `build.sh` exited before recovery because `set -e` treated a direct
|
||||||
|
`iso_memtest_present` return code of `1` as fatal
|
||||||
|
- so the next repeated loop was caused by shell control flow, not by proof that
|
||||||
|
the recovery design itself was wrong
|
||||||
|
|
||||||
|
## Known Failed Attempts
|
||||||
|
|
||||||
|
These approaches were already tried and should not be repeated blindly:
|
||||||
|
|
||||||
|
1. Built-in live-build memtest only.
|
||||||
|
Reason it failed:
|
||||||
|
- `lb binary_memtest` runs, but the final ISO still misses memtest binaries and menu entries.
|
||||||
|
|
||||||
|
2. Fixing only the memtest file names for Debian Bookworm.
|
||||||
|
Reason it failed:
|
||||||
|
- correct file names alone do not make the files appear in the final ISO.
|
||||||
|
|
||||||
|
3. Copying memtest from `chroot/boot/` into `binary/boot/` via a binary hook.
|
||||||
|
Reason it failed:
|
||||||
|
- in this stack `chroot/boot/` is often empty for memtest payloads at the relevant time.
|
||||||
|
|
||||||
|
4. Fallback extraction from cached `memtest86+` `.deb`.
|
||||||
|
Reason it failed:
|
||||||
|
- this was explored already and was not enough to stabilize the final ISO path end-to-end.
|
||||||
|
|
||||||
|
5. Restoring explicit memtest menu entries in source bootloader templates only.
|
||||||
|
Reason it failed:
|
||||||
|
- memtest lines in source templates or intermediate workdir configs do not guarantee the final ISO contains them.
|
||||||
|
|
||||||
|
6. Patching `binary/boot/grub/grub.cfg` and `binary/isolinux/live.cfg` from the current `config/hooks/normal/9100-memtest.hook.binary`.
|
||||||
|
Reason it failed:
|
||||||
|
- the hook runs before those files exist, so the hook cannot patch them there.
|
||||||
|
|
||||||
|
## What This Means
|
||||||
|
|
||||||
|
When revisiting memtest later, start from the constraints above rather than retrying the same patterns:
|
||||||
|
|
||||||
|
- do not assume the built-in memtest stage is sufficient
|
||||||
|
- do not assume `chroot/boot/` will contain memtest payloads
|
||||||
|
- do not assume source bootloader templates are the last writer of final ISO configs
|
||||||
|
- do not assume the current normal binary hook timing is late enough for final patching
|
||||||
|
|
||||||
|
Any future memtest fix must explicitly identify:
|
||||||
|
|
||||||
|
- where the memtest binaries are reliably available at build time
|
||||||
|
- which exact build stage writes the final bootloader configs that land in the ISO
|
||||||
|
- and a post-build proof from a real ISO, not only from intermediate workdir files
|
||||||
|
- whether the ISO inspection step itself succeeded, rather than merely whether
|
||||||
|
the validator printed a memtest warning
|
||||||
|
- whether a non-zero probe is intentionally handled inside an `if` / `case`
|
||||||
|
context rather than accidentally tripping `set -e`
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
For `bee`, memtest must be treated as an explicit ISO artifact with explicit post-build validation.
|
||||||
|
|
||||||
|
Project rules from now on:
|
||||||
|
|
||||||
|
- Do **not** trust `--memtest memtest86+` by itself.
|
||||||
|
- A memtest implementation is considered valid only if the produced ISO actually contains:
|
||||||
|
- `boot/memtest86+x64.bin`
|
||||||
|
- `boot/memtest86+x64.efi`
|
||||||
|
- a GRUB menu entry
|
||||||
|
- an isolinux menu entry
|
||||||
|
- If live-build built-in integration does not produce those artifacts, use an explicit project-owned mechanism such as:
|
||||||
|
- a binary hook copying files into `binary/boot/`
|
||||||
|
- extraction from the cached `memtest86+` `.deb`
|
||||||
|
- another deterministic build-time copy step
|
||||||
|
- Do **not** remove such explicit logic later unless a fresh real ISO build proves that built-in integration alone produces all required files and menu entries.
|
||||||
|
|
||||||
|
Current implementation direction:
|
||||||
|
|
||||||
|
- keep the live-build memtest stage enabled if it helps package acquisition
|
||||||
|
- do not rely on the current early `binary_hooks` timing for final patching
|
||||||
|
- prefer a post-`lb build` recovery step in `build.sh` that:
|
||||||
|
- patches the fully materialized `LB_DIR/binary` tree
|
||||||
|
- injects memtest binaries there
|
||||||
|
- ensures final bootloader entries there
|
||||||
|
- reruns late binary stages (`binary_checksums`, `binary_iso`, `binary_zsync`) after the patch
|
||||||
|
- also treat ISO validation tooling as part of the critical path:
|
||||||
|
- install a stable ISO reader in the builder image
|
||||||
|
- fail with an explicit reader error if ISO listing/extraction fails
|
||||||
|
- do not treat reader failure as evidence that memtest is missing
|
||||||
|
- do not call a probe that may return "needs recovery" as a bare command under
|
||||||
|
`set -e`; wrap it in explicit control flow
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
- Future memtest changes must begin by reading this ADR and the commits listed above.
|
||||||
|
- Future memtest changes must also begin by reading the failed-attempt list above.
|
||||||
|
- We should stop re-introducing "prefer built-in live-build memtest" as a default assumption without new evidence.
|
||||||
|
- Memtest validation in `build.sh` is not optional; it is the acceptance gate that prevents another silent regression.
|
||||||
|
- But validation output is only trustworthy if ISO reading itself succeeded. A
|
||||||
|
"missing memtest" warning without a successful ISO read is not evidence.
|
||||||
|
- If we change memtest strategy again, we must update this ADR with the exact build evidence that justified the change.
|
||||||
|
|
||||||
|
## Working Solution (confirmed 2026-04-01, commits 76a9100 → 2baf3be)
|
||||||
|
|
||||||
|
This approach was confirmed working in ISO `easy-bee-nvidia-v3.20-5-g76a9100-amd64.iso`
|
||||||
|
and validated again in subsequent builds. The final ISO contains all required memtest artifacts.
|
||||||
|
|
||||||
|
### Components
|
||||||
|
|
||||||
|
**1. Binary hook `config/hooks/normal/9100-memtest.hook.binary`**
|
||||||
|
|
||||||
|
Runs inside the live-build binary phase. Does not patch bootloader files at hook time —
|
||||||
|
those files may not exist yet. Instead:
|
||||||
|
|
||||||
|
- Tries to copy `memtest86+x64.bin` / `memtest86+x64.efi` from `chroot/boot/` first.
|
||||||
|
- Falls back to extracting from the cached `.deb` (via `dpkg-deb -x`) if `chroot/boot/` is empty.
|
||||||
|
- Appends GRUB and isolinux menu entries only if the respective cfg files already exist at hook time.
|
||||||
|
If they do not exist, the hook warns and continues (does not fail).
|
||||||
|
|
||||||
|
Set the `BEE_REQUIRE_MEMTEST=1` environment variable to turn these warnings into hard errors when needed.
|
||||||
|
|
||||||
|
**2. Post-`lb build` recovery step in `build.sh`**
|
||||||
|
|
||||||
|
After `lb build` completes, `build.sh` checks whether the fully materialized `binary/` tree
|
||||||
|
contains all required memtest artifacts. If not:
|
||||||
|
|
||||||
|
- Copies/extracts memtest binaries into `binary/boot/`.
|
||||||
|
- Patches `binary/boot/grub/grub.cfg` and `binary/isolinux/live.cfg` directly.
|
||||||
|
- Reruns the late binary stages (`binary_checksums`, `binary_iso`, `binary_zsync`) to rebuild
|
||||||
|
the ISO with the patched tree.
|
||||||
|
|
||||||
|
This is the deterministic safety net: even if the hook runs at the wrong time, the recovery
|
||||||
|
step handles the final `binary/` tree after live-build has written all bootloader configs.
|
||||||
|
|
||||||
|
**3. ISO validation hardening**
|
||||||
|
|
||||||
|
The memtest probe in `build.sh` is wrapped in explicit `if` / `case` control flow, not called
|
||||||
|
as a bare command under `set -e`. A non-zero probe return (needs recovery) is intentional and
|
||||||
|
handled — it does not abort the build prematurely.
|
||||||
|
|
||||||
|
ISO reading (listing and extraction via `bsdtar`, with `xorriso` as a fallback) is treated as a separate prerequisite.
|
||||||
|
If the reader fails, the validator reports a reader error explicitly, not a memtest warning.
|
||||||
|
This prevents the false-negative loop that cost us builds v3.14–v3.19 on 2026-04-01.
|
||||||
|
|
||||||
|
### Why this works when earlier attempts did not
|
||||||
|
|
||||||
|
The earlier patterns all shared a single flaw: they assumed a single build-time point
|
||||||
|
(hook or source template) would be the last writer of bootloader configs and memtest payloads.
|
||||||
|
In live-build on Debian Bookworm that assumption is false — live-build continues writing
|
||||||
|
bootloader files after custom hooks run, and `chroot/boot/` does not reliably hold memtest payloads.
|
||||||
|
|
||||||
|
The recovery step sidesteps the ordering problem entirely: it acts on the fully materialized
|
||||||
|
`binary/` tree after `lb build` finishes, then rebuilds the ISO from that patched tree.
|
||||||
|
There is no ordering dependency to get wrong.
|
||||||
|
|
||||||
|
### Do not revert
|
||||||
|
|
||||||
|
Do not remove the recovery step or the hook without a fresh real ISO build proving
|
||||||
|
live-build alone produces all four required artifacts:
|
||||||
|
- `boot/memtest86+x64.bin`
|
||||||
|
- `boot/memtest86+x64.efi`
|
||||||
|
- memtest entry in `boot/grub/grub.cfg`
|
||||||
|
- memtest entry in `isolinux/live.cfg`
|
||||||
@@ -5,3 +5,4 @@ One file per decision, named `YYYY-MM-DD-short-topic.md`.
|
|||||||
| Date | Decision | Status |
|
| Date | Decision | Status |
|
||||||
|---|---|---|
|
|---|---|---|
|
||||||
| 2026-03-05 | Use NVIDIA proprietary driver | active |
|
| 2026-03-05 | Use NVIDIA proprietary driver | active |
|
||||||
|
| 2026-04-01 | Treat memtest as explicit ISO content | active |
|
||||||
|
|||||||
@@ -17,6 +17,46 @@ This applies to:
|
|||||||
|
|
||||||
## Memtest rule
|
## Memtest rule
|
||||||
|
|
||||||
Prefer live-build's built-in memtest integration over custom hooks or hardcoded
|
Do not assume live-build's built-in memtest integration is sufficient for `bee`.
|
||||||
bootloader paths. If you ever need to reference memtest files manually, verify
|
We already tried that path and regressed again on 2026-04-01: `lb binary_memtest`
|
||||||
the exact package file list first for the target Debian release.
|
ran, but the final ISO still lacked memtest binaries and menu entries.
|
||||||
|
|
||||||
|
For this project, memtest is accepted only when the produced ISO actually
|
||||||
|
contains all of the following:
|
||||||
|
|
||||||
|
- `boot/memtest86+x64.bin`
|
||||||
|
- `boot/memtest86+x64.efi`
|
||||||
|
- a memtest entry in `boot/grub/grub.cfg`
|
||||||
|
- a memtest entry in `isolinux/live.cfg`
|
||||||
|
|
||||||
|
Rules:
|
||||||
|
|
||||||
|
- Keep explicit post-build memtest validation in `build.sh`.
|
||||||
|
- Treat ISO reader success as a separate prerequisite from memtest content.
|
||||||
|
If the reader cannot list or extract from the ISO, that is a validator
|
||||||
|
failure, not proof that memtest is missing.
|
||||||
|
- If built-in integration does not produce the artifacts above, use a
|
||||||
|
deterministic project-owned copy/extract step instead of hoping live-build
|
||||||
|
will "start working".
|
||||||
|
- Do not switch back to built-in-only memtest without fresh build evidence from
|
||||||
|
a real ISO.
|
||||||
|
- If you reference memtest files manually, verify the exact package file list
|
||||||
|
first for the target Debian release.
|
||||||
|
|
||||||
|
Known bad loops for this repository:
|
||||||
|
|
||||||
|
- Do not retry built-in-only memtest without new evidence. We already proved
|
||||||
|
that `lb binary_memtest` can run while the final ISO still has no memtest.
|
||||||
|
- Do not assume fixing memtest file names is enough. Correct names did not fix
|
||||||
|
the final artifact path.
|
||||||
|
- Do not assume `chroot/boot/` contains memtest payloads at the time hooks run.
|
||||||
|
- Do not assume source `grub.cfg` / `live.cfg.in` are the final writers of ISO
|
||||||
|
bootloader configs.
|
||||||
|
- Do not assume the current `config/hooks/normal/9100-memtest.hook.binary`
|
||||||
|
timing is late enough to patch final `binary/boot/grub/grub.cfg` or
|
||||||
|
`binary/isolinux/live.cfg`; logs from 2026-04-01 showed those files were not
|
||||||
|
present yet when the hook executed.
|
||||||
|
- Do not treat a validator warning as ground truth until you have confirmed the
|
||||||
|
ISO reader actually succeeded. On 2026-04-01 we misdiagnosed another memtest
|
||||||
|
regression because the final ISO was correct but the validator produced a
|
||||||
|
false negative.
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ RUN apt-get update -qq && apt-get install -y \
|
|||||||
wget \
|
wget \
|
||||||
curl \
|
curl \
|
||||||
tar \
|
tar \
|
||||||
|
libarchive-tools \
|
||||||
xz-utils \
|
xz-utils \
|
||||||
rsync \
|
rsync \
|
||||||
build-essential \
|
build-essential \
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ NCCL_TESTS_VERSION=2.13.10
|
|||||||
NVCC_VERSION=12.8
|
NVCC_VERSION=12.8
|
||||||
CUBLAS_VERSION=13.0.2.14-1
|
CUBLAS_VERSION=13.0.2.14-1
|
||||||
CUDA_USERSPACE_VERSION=13.0.96-1
|
CUDA_USERSPACE_VERSION=13.0.96-1
|
||||||
DCGM_VERSION=4.5.2-1
|
DCGM_VERSION=4.5.3-1
|
||||||
JOHN_JUMBO_COMMIT=67fcf9fe5a
|
JOHN_JUMBO_COMMIT=67fcf9fe5a
|
||||||
ROCM_VERSION=6.3.4
|
ROCM_VERSION=6.3.4
|
||||||
ROCM_SMI_VERSION=7.4.0.60304-76~22.04
|
ROCM_SMI_VERSION=7.4.0.60304-76~22.04
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ lb config noauto \
|
|||||||
--memtest memtest86+ \
|
--memtest memtest86+ \
|
||||||
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||||
--apt-recommends false \
|
--apt-recommends false \
|
||||||
--chroot-squashfs-compression-type zstd \
|
--chroot-squashfs-compression-type zstd \
|
||||||
"${@}"
|
"${@}"
|
||||||
|
|||||||
@@ -46,7 +46,10 @@ CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"
|
|||||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
|
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
|
||||||
EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
|
EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
|
||||||
|
CACHE_LAYOUT_VERSION="2"
|
||||||
|
CACHE_LAYOUT_MARKER="${CACHE_DIR}/.cache-layout-v${CACHE_LAYOUT_VERSION}"
|
||||||
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
||||||
|
&& [ -f "$CACHE_LAYOUT_MARKER" ] \
|
||||||
&& [ "$(ls "$CACHE_DIR/lib/libnvidia-ptxjitcompiler.so."* 2>/dev/null | wc -l)" -gt 0 ]; then
|
&& [ "$(ls "$CACHE_DIR/lib/libnvidia-ptxjitcompiler.so."* 2>/dev/null | wc -l)" -gt 0 ]; then
|
||||||
echo "=== NVIDIA cached, skipping build ==="
|
echo "=== NVIDIA cached, skipping build ==="
|
||||||
echo "cache: $CACHE_DIR"
|
echo "cache: $CACHE_DIR"
|
||||||
@@ -130,24 +133,30 @@ else
|
|||||||
echo "WARNING: no firmware/ dir found in installer (may be needed for Hopper GPUs)"
|
echo "WARNING: no firmware/ dir found in installer (may be needed for Hopper GPUs)"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Copy ALL userspace library files.
|
# Copy NVIDIA userspace libraries broadly instead of whitelisting a few names.
|
||||||
# libnvidia-ptxjitcompiler is required by libcuda for PTX JIT compilation
|
# Newer driver branches add extra runtime deps (for example OpenCL/compiler side
|
||||||
# (cuModuleLoadDataEx with PTX source) — without it CUDA_ERROR_JIT_COMPILER_NOT_FOUND.
|
# libraries). If we only copy a narrow allowlist, clinfo/John can see nvidia.icd
|
||||||
|
# but still fail with "no OpenCL platforms" because one dependent .so is absent.
|
||||||
|
copied_libs=0
|
||||||
|
for f in $(find "$EXTRACT_DIR" -maxdepth 1 \( -name 'libnvidia*.so.*' -o -name 'libcuda.so.*' \) -type f 2>/dev/null | sort); do
|
||||||
|
cp "$f" "$CACHE_DIR/lib/"
|
||||||
|
copied_libs=$((copied_libs+1))
|
||||||
|
done
|
||||||
|
|
||||||
|
if [ "$copied_libs" -eq 0 ]; then
|
||||||
|
echo "ERROR: no NVIDIA userspace libraries found in $EXTRACT_DIR"
|
||||||
|
ls "$EXTRACT_DIR/"*.so* 2>/dev/null | head -40 || true
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
for lib in \
|
for lib in \
|
||||||
libnvidia-ml \
|
libnvidia-ml \
|
||||||
libcuda \
|
libcuda \
|
||||||
libnvidia-ptxjitcompiler \
|
libnvidia-ptxjitcompiler \
|
||||||
libnvidia-opencl \
|
libnvidia-opencl; do
|
||||||
libnvidia-compiler \
|
if ! ls "$CACHE_DIR/lib/${lib}.so."* >/dev/null 2>&1; then
|
||||||
libnvidia-nvvm \
|
echo "ERROR: required ${lib}.so.* not found in extracted userspace libs"
|
||||||
libnvidia-fatbinaryloader; do
|
ls "$CACHE_DIR/lib/" | sort >&2 || true
|
||||||
count=0
|
|
||||||
for f in $(find "$EXTRACT_DIR" -maxdepth 1 -name "${lib}.so.*" 2>/dev/null); do
|
|
||||||
cp "$f" "$CACHE_DIR/lib/" && count=$((count+1))
|
|
||||||
done
|
|
||||||
if [ "$count" -eq 0 ]; then
|
|
||||||
echo "ERROR: ${lib}.so.* not found in $EXTRACT_DIR"
|
|
||||||
ls "$EXTRACT_DIR/"*.so* 2>/dev/null | head -20 || true
|
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
@@ -156,23 +165,17 @@ done
|
|||||||
ko_count=$(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l)
|
ko_count=$(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l)
|
||||||
[ "$ko_count" -gt 0 ] || { echo "ERROR: no .ko files built in $CACHE_DIR/modules/"; exit 1; }
|
[ "$ko_count" -gt 0 ] || { echo "ERROR: no .ko files built in $CACHE_DIR/modules/"; exit 1; }
|
||||||
|
|
||||||
# Create soname symlinks: use [0-9][0-9]* to avoid circular symlink (.so.1 has single digit)
|
# Create soname symlinks for every copied versioned library.
|
||||||
for lib in \
|
for versioned in "$CACHE_DIR"/lib/*.so.*; do
|
||||||
libnvidia-ml \
|
[ -f "$versioned" ] || continue
|
||||||
libcuda \
|
|
||||||
libnvidia-ptxjitcompiler \
|
|
||||||
libnvidia-opencl \
|
|
||||||
libnvidia-compiler \
|
|
||||||
libnvidia-nvvm \
|
|
||||||
libnvidia-fatbinaryloader; do
|
|
||||||
versioned=$(ls "$CACHE_DIR/lib/${lib}.so."[0-9][0-9]* 2>/dev/null | head -1)
|
|
||||||
[ -n "$versioned" ] || continue
|
|
||||||
base=$(basename "$versioned")
|
base=$(basename "$versioned")
|
||||||
ln -sf "$base" "$CACHE_DIR/lib/${lib}.so.1"
|
stem=${base%%.so.*}
|
||||||
ln -sf "${lib}.so.1" "$CACHE_DIR/lib/${lib}.so" 2>/dev/null || true
|
ln -sf "$base" "$CACHE_DIR/lib/${stem}.so.1"
|
||||||
echo "${lib}: .so.1 -> $base"
|
ln -sf "${stem}.so.1" "$CACHE_DIR/lib/${stem}.so" 2>/dev/null || true
|
||||||
done
|
done
|
||||||
|
|
||||||
|
touch "$CACHE_LAYOUT_MARKER"
|
||||||
|
|
||||||
echo "=== NVIDIA build complete ==="
|
echo "=== NVIDIA build complete ==="
|
||||||
echo "cache: $CACHE_DIR"
|
echo "cache: $CACHE_DIR"
|
||||||
echo "modules: $ko_count .ko files"
|
echo "modules: $ko_count .ko files"
|
||||||
|
|||||||
@@ -38,6 +38,7 @@ export BEE_GPU_VENDOR
|
|||||||
|
|
||||||
. "${BUILDER_DIR}/VERSIONS"
|
. "${BUILDER_DIR}/VERSIONS"
|
||||||
export PATH="$PATH:/usr/local/go/bin"
|
export PATH="$PATH:/usr/local/go/bin"
|
||||||
|
: "${BEE_REQUIRE_MEMTEST:=0}"
|
||||||
|
|
||||||
# Allow git to read the bind-mounted repo (different UID inside container).
|
# Allow git to read the bind-mounted repo (different UID inside container).
|
||||||
git config --global safe.directory "${REPO_ROOT}"
|
git config --global safe.directory "${REPO_ROOT}"
|
||||||
@@ -111,6 +112,64 @@ resolve_iso_version() {
|
|||||||
resolve_audit_version
|
resolve_audit_version
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# iso_list_files ISO_PATH
#
# Write the member listing of the ISO image to stdout, one path per line,
# without a leading "/".
#
# Reader selection: bsdtar is used when available, otherwise xorriso.
#
# Returns:
#   the reader's own exit status when a reader was run,
#   127 when neither bsdtar nor xorriso is installed.
#
# Callers must treat a non-zero status as "the ISO could not be read",
# which is different from "the ISO was read and a file is absent".
iso_list_files() {
    iso_path="$1"

    if command -v bsdtar >/dev/null 2>&1; then
        bsdtar -tf "$iso_path"
        return $?
    fi

    if command -v xorriso >/dev/null 2>&1; then
        # Capture the listing before post-processing it. In a plain
        # "xorriso ... | sed ..." pipeline the shell reports sed's exit
        # status, so a failed ISO read would be returned as success.
        iso_list_raw="$(xorriso -indev "$iso_path" -find / -type f -print 2>/dev/null)" || return $?
        # Nothing listed: emit nothing so callers that check for an empty
        # result still see an empty result.
        [ -n "$iso_list_raw" ] || return 0
        printf '%s\n' "$iso_list_raw" | sed 's#^/##'
        return $?
    fi

    return 127
}
|
||||||
|
|
||||||
|
# iso_extract_file ISO_PATH MEMBER
#
# Stream the contents of one ISO member to stdout. MEMBER is given relative
# to the ISO root (no leading "/"). bsdtar is tried first, then xorriso.
# Returns the reader's exit status, or 127 when no reader is installed.
iso_extract_file() {
    iso_path="$1"
    iso_member="$2"

    if command -v bsdtar >/dev/null 2>&1; then
        bsdtar -xOf "$iso_path" "$iso_member"
        return $?
    elif command -v xorriso >/dev/null 2>&1; then
        # xorriso addresses members by absolute path inside the image.
        xorriso -osirrox on -indev "$iso_path" -cat "/$iso_member" 2>/dev/null
        return $?
    fi

    return 127
}
|
||||||
|
|
||||||
|
# iso_read_file_list ISO_PATH OUT_PATH
#
# Save the ISO member listing into OUT_PATH.
# Returns 0 only when the listing command succeeded and OUT_PATH is
# non-empty; returns 1 otherwise.
iso_read_file_list() {
    iso_path="$1"
    out_path="$2"

    if ! iso_list_files "$iso_path" > "$out_path"; then
        return 1
    fi
    # An empty listing is treated as a failed read.
    if [ ! -s "$out_path" ]; then
        return 1
    fi
    return 0
}
|
||||||
|
|
||||||
|
# iso_read_member ISO_PATH MEMBER OUT_PATH
#
# Extract a single ISO member into OUT_PATH.
# Returns 0 only when extraction succeeded and OUT_PATH is non-empty;
# returns 1 otherwise.
iso_read_member() {
    iso_path="$1"
    iso_member="$2"
    out_path="$3"

    if ! iso_extract_file "$iso_path" "$iso_member" > "$out_path"; then
        return 1
    fi
    # An empty extraction result is treated as a failed read.
    if [ ! -s "$out_path" ]; then
        return 1
    fi
    return 0
}
|
||||||
|
|
||||||
|
# require_iso_reader [ISO_PATH]
#
# Succeed when at least one supported ISO reader (bsdtar or xorriso) is
# installed. Otherwise hand off to memtest_fail with an explanatory message
# and the optional ISO path, and return whatever memtest_fail returns.
require_iso_reader() {
    if command -v bsdtar >/dev/null 2>&1 || command -v xorriso >/dev/null 2>&1; then
        return 0
    fi
    memtest_fail "ISO reader is required for validation/debug (expected bsdtar or xorriso)" "${1:-}"
}
|
||||||
|
|
||||||
dump_memtest_debug() {
|
dump_memtest_debug() {
|
||||||
phase="$1"
|
phase="$1"
|
||||||
lb_dir="${2:-}"
|
lb_dir="${2:-}"
|
||||||
@@ -138,6 +197,16 @@ dump_memtest_debug() {
|
|||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
|
echo "-- source binary hooks --"
|
||||||
|
for hook in \
|
||||||
|
"${BUILDER_DIR}/config/hooks/normal/9100-memtest.hook.binary"; do
|
||||||
|
if [ -f "$hook" ]; then
|
||||||
|
echo " hook: $hook"
|
||||||
|
else
|
||||||
|
echo " (missing $hook)"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
if [ -n "$lb_dir" ] && [ -d "$lb_dir" ]; then
|
if [ -n "$lb_dir" ] && [ -d "$lb_dir" ]; then
|
||||||
echo "-- live-build workdir package lists --"
|
echo "-- live-build workdir package lists --"
|
||||||
for pkg in \
|
for pkg in \
|
||||||
@@ -164,6 +233,20 @@ dump_memtest_debug() {
|
|||||||
echo " (missing $lb_dir/binary/boot)"
|
echo " (missing $lb_dir/binary/boot)"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
echo "-- live-build binary grub cfg --"
|
||||||
|
if [ -f "$lb_dir/binary/boot/grub/grub.cfg" ]; then
|
||||||
|
grep -n 'Memory Test\|memtest' "$lb_dir/binary/boot/grub/grub.cfg" || echo " (no memtest lines)"
|
||||||
|
else
|
||||||
|
echo " (missing $lb_dir/binary/boot/grub/grub.cfg)"
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "-- live-build binary isolinux cfg --"
|
||||||
|
if [ -f "$lb_dir/binary/isolinux/live.cfg" ]; then
|
||||||
|
grep -n 'Memory Test\|memtest' "$lb_dir/binary/isolinux/live.cfg" || echo " (no memtest lines)"
|
||||||
|
else
|
||||||
|
echo " (missing $lb_dir/binary/isolinux/live.cfg)"
|
||||||
|
fi
|
||||||
|
|
||||||
echo "-- live-build package cache --"
|
echo "-- live-build package cache --"
|
||||||
if [ -d "$lb_dir/cache/packages.chroot" ]; then
|
if [ -d "$lb_dir/cache/packages.chroot" ]; then
|
||||||
find "$lb_dir/cache/packages.chroot" -maxdepth 1 -name 'memtest86+*.deb' -print | sed 's/^/ /' || true
|
find "$lb_dir/cache/packages.chroot" -maxdepth 1 -name 'memtest86+*.deb' -print | sed 's/^/ /' || true
|
||||||
@@ -173,14 +256,32 @@ dump_memtest_debug() {
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
if [ -n "$iso_path" ] && [ -f "$iso_path" ]; then
|
if [ -n "$iso_path" ] && [ -f "$iso_path" ]; then
|
||||||
|
iso_files="$(mktemp)"
|
||||||
|
iso_grub_cfg="$(mktemp)"
|
||||||
|
iso_isolinux_cfg="$(mktemp)"
|
||||||
|
|
||||||
echo "-- ISO memtest files --"
|
echo "-- ISO memtest files --"
|
||||||
bsdtar -tf "$iso_path" | grep 'memtest' | sed 's/^/ /' || echo " (no memtest files in ISO)"
|
if iso_read_file_list "$iso_path" "$iso_files"; then
|
||||||
|
grep 'memtest' "$iso_files" | sed 's/^/ /' || echo " (no memtest files in ISO)"
|
||||||
|
else
|
||||||
|
echo " (failed to list ISO contents)"
|
||||||
|
fi
|
||||||
|
|
||||||
echo "-- ISO GRUB memtest lines --"
|
echo "-- ISO GRUB memtest lines --"
|
||||||
bsdtar -xOf "$iso_path" boot/grub/grub.cfg 2>/dev/null | grep -n 'Memory Test\|memtest' || echo " (no memtest lines in boot/grub/grub.cfg)"
|
if iso_read_member "$iso_path" boot/grub/grub.cfg "$iso_grub_cfg"; then
|
||||||
|
grep -n 'Memory Test\|memtest' "$iso_grub_cfg" || echo " (no memtest lines in boot/grub/grub.cfg)"
|
||||||
|
else
|
||||||
|
echo " (failed to read boot/grub/grub.cfg from ISO)"
|
||||||
|
fi
|
||||||
|
|
||||||
echo "-- ISO isolinux memtest lines --"
|
echo "-- ISO isolinux memtest lines --"
|
||||||
bsdtar -xOf "$iso_path" isolinux/live.cfg 2>/dev/null | grep -n 'Memory Test\|memtest' || echo " (no memtest lines in isolinux/live.cfg)"
|
if iso_read_member "$iso_path" isolinux/live.cfg "$iso_isolinux_cfg"; then
|
||||||
|
grep -n 'Memory Test\|memtest' "$iso_isolinux_cfg" || echo " (no memtest lines in isolinux/live.cfg)"
|
||||||
|
else
|
||||||
|
echo " (failed to read isolinux/live.cfg from ISO)"
|
||||||
|
fi
|
||||||
|
|
||||||
|
rm -f "$iso_files" "$iso_grub_cfg" "$iso_isolinux_cfg"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "=== end memtest debug: ${phase} ==="
|
echo "=== end memtest debug: ${phase} ==="
|
||||||
@@ -196,51 +297,297 @@ dump_memtest_debug() {
|
|||||||
memtest_fail() {
|
memtest_fail() {
|
||||||
msg="$1"
|
msg="$1"
|
||||||
iso_path="${2:-}"
|
iso_path="${2:-}"
|
||||||
echo "ERROR: ${msg}" >&2
|
level="WARNING"
|
||||||
|
if [ "${BEE_REQUIRE_MEMTEST:-0}" = "1" ]; then
|
||||||
|
level="ERROR"
|
||||||
|
fi
|
||||||
|
echo "${level}: ${msg}" >&2
|
||||||
dump_memtest_debug "failure" "${LB_DIR:-}" "$iso_path" >&2
|
dump_memtest_debug "failure" "${LB_DIR:-}" "$iso_path" >&2
|
||||||
|
if [ "${BEE_REQUIRE_MEMTEST:-0}" = "1" ]; then
|
||||||
exit 1
|
exit 1
|
||||||
|
fi
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
iso_memtest_present() {
|
||||||
|
iso_path="$1"
|
||||||
|
iso_files="$(mktemp)"
|
||||||
|
|
||||||
|
[ -f "$iso_path" ] || return 1
|
||||||
|
|
||||||
|
if command -v bsdtar >/dev/null 2>&1; then
|
||||||
|
:
|
||||||
|
elif command -v xorriso >/dev/null 2>&1; then
|
||||||
|
:
|
||||||
|
else
|
||||||
|
return 2
|
||||||
|
fi
|
||||||
|
|
||||||
|
iso_read_file_list "$iso_path" "$iso_files" || {
|
||||||
|
rm -f "$iso_files"
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
|
||||||
|
grep -q '^boot/memtest86+x64\.bin$' "$iso_files" || {
|
||||||
|
rm -f "$iso_files"
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
grep -q '^boot/memtest86+x64\.efi$' "$iso_files" || {
|
||||||
|
rm -f "$iso_files"
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
grub_cfg="$(mktemp)"
|
||||||
|
isolinux_cfg="$(mktemp)"
|
||||||
|
|
||||||
|
iso_read_member "$iso_path" boot/grub/grub.cfg "$grub_cfg" || {
|
||||||
|
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
iso_read_member "$iso_path" isolinux/live.cfg "$isolinux_cfg" || {
|
||||||
|
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
|
||||||
|
grep -q 'Memory Test (memtest86+)' "$grub_cfg" || {
|
||||||
|
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
grep -q '/boot/memtest86+x64\.efi' "$grub_cfg" || {
|
||||||
|
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
grep -q '/boot/memtest86+x64\.bin' "$grub_cfg" || {
|
||||||
|
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
grep -q 'Memory Test (memtest86+)' "$isolinux_cfg" || {
|
||||||
|
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
grep -q '/boot/memtest86+x64\.bin' "$isolinux_cfg" || {
|
||||||
|
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||||
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
validate_iso_memtest() {
|
validate_iso_memtest() {
|
||||||
iso_path="$1"
|
iso_path="$1"
|
||||||
echo "=== validating memtest in ISO ==="
|
echo "=== validating memtest in ISO ==="
|
||||||
|
|
||||||
[ -f "$iso_path" ] || memtest_fail "ISO not found for validation: $iso_path" "$iso_path"
|
[ -f "$iso_path" ] || {
|
||||||
command -v bsdtar >/dev/null 2>&1 || memtest_fail "bsdtar is required for ISO validation" "$iso_path"
|
memtest_fail "ISO not found for validation: $iso_path" "$iso_path"
|
||||||
|
return 0
|
||||||
bsdtar -tf "$iso_path" | grep -q '^boot/memtest86+x64\.bin$' || {
|
|
||||||
memtest_fail "memtest BIOS binary missing in ISO: boot/memtest86+x64.bin" "$iso_path"
|
|
||||||
}
|
}
|
||||||
bsdtar -tf "$iso_path" | grep -q '^boot/memtest86+x64\.efi$' || {
|
require_iso_reader "$iso_path" || return 0
|
||||||
|
|
||||||
|
iso_files="$(mktemp)"
|
||||||
|
iso_read_file_list "$iso_path" "$iso_files" || {
|
||||||
|
memtest_fail "failed to list ISO contents while validating memtest" "$iso_path"
|
||||||
|
rm -f "$iso_files"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
grep -q '^boot/memtest86+x64\.bin$' "$iso_files" || {
|
||||||
|
memtest_fail "memtest BIOS binary missing in ISO: boot/memtest86+x64.bin" "$iso_path"
|
||||||
|
rm -f "$iso_files"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
grep -q '^boot/memtest86+x64\.efi$' "$iso_files" || {
|
||||||
memtest_fail "memtest EFI binary missing in ISO: boot/memtest86+x64.efi" "$iso_path"
|
memtest_fail "memtest EFI binary missing in ISO: boot/memtest86+x64.efi" "$iso_path"
|
||||||
|
rm -f "$iso_files"
|
||||||
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
grub_cfg="$(mktemp)"
|
grub_cfg="$(mktemp)"
|
||||||
isolinux_cfg="$(mktemp)"
|
isolinux_cfg="$(mktemp)"
|
||||||
|
|
||||||
bsdtar -xOf "$iso_path" boot/grub/grub.cfg > "$grub_cfg" || memtest_fail "failed to extract boot/grub/grub.cfg from ISO" "$iso_path"
|
iso_read_member "$iso_path" boot/grub/grub.cfg "$grub_cfg" || {
|
||||||
bsdtar -xOf "$iso_path" isolinux/live.cfg > "$isolinux_cfg" || memtest_fail "failed to extract isolinux/live.cfg from ISO" "$iso_path"
|
memtest_fail "failed to read boot/grub/grub.cfg from ISO" "$iso_path"
|
||||||
|
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
iso_read_member "$iso_path" isolinux/live.cfg "$isolinux_cfg" || {
|
||||||
|
memtest_fail "failed to read isolinux/live.cfg from ISO" "$iso_path"
|
||||||
|
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
grep -q 'Memory Test (memtest86+)' "$grub_cfg" || {
|
grep -q 'Memory Test (memtest86+)' "$grub_cfg" || {
|
||||||
memtest_fail "GRUB menu entry for memtest is missing" "$iso_path"
|
memtest_fail "GRUB menu entry for memtest is missing" "$iso_path"
|
||||||
|
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||||
|
return 0
|
||||||
}
|
}
|
||||||
grep -q '/boot/memtest86+x64\.efi' "$grub_cfg" || {
|
grep -q '/boot/memtest86+x64\.efi' "$grub_cfg" || {
|
||||||
memtest_fail "GRUB memtest EFI path is missing" "$iso_path"
|
memtest_fail "GRUB memtest EFI path is missing" "$iso_path"
|
||||||
|
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||||
|
return 0
|
||||||
}
|
}
|
||||||
grep -q '/boot/memtest86+x64\.bin' "$grub_cfg" || {
|
grep -q '/boot/memtest86+x64\.bin' "$grub_cfg" || {
|
||||||
memtest_fail "GRUB memtest BIOS path is missing" "$iso_path"
|
memtest_fail "GRUB memtest BIOS path is missing" "$iso_path"
|
||||||
|
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||||
|
return 0
|
||||||
}
|
}
|
||||||
grep -q 'Memory Test (memtest86+)' "$isolinux_cfg" || {
|
grep -q 'Memory Test (memtest86+)' "$isolinux_cfg" || {
|
||||||
memtest_fail "isolinux menu entry for memtest is missing" "$iso_path"
|
memtest_fail "isolinux menu entry for memtest is missing" "$iso_path"
|
||||||
|
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||||
|
return 0
|
||||||
}
|
}
|
||||||
grep -q '/boot/memtest86+x64\.bin' "$isolinux_cfg" || {
|
grep -q '/boot/memtest86+x64\.bin' "$isolinux_cfg" || {
|
||||||
memtest_fail "isolinux memtest path is missing" "$iso_path"
|
memtest_fail "isolinux memtest path is missing" "$iso_path"
|
||||||
|
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||||
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||||
echo "=== memtest validation OK ==="
|
echo "=== memtest validation OK ==="
|
||||||
}
|
}
|
||||||
|
|
||||||
|
append_memtest_grub_entry() {
|
||||||
|
grub_cfg="$1"
|
||||||
|
[ -f "$grub_cfg" ] || return 1
|
||||||
|
grep -q 'Memory Test (memtest86+)' "$grub_cfg" && return 0
|
||||||
|
grep -q '### BEE MEMTEST ###' "$grub_cfg" && return 0
|
||||||
|
|
||||||
|
cat >> "$grub_cfg" <<'EOF'
|
||||||
|
|
||||||
|
### BEE MEMTEST ###
|
||||||
|
if [ "${grub_platform}" = "efi" ]; then
|
||||||
|
menuentry "Memory Test (memtest86+)" {
|
||||||
|
chainloader /boot/memtest86+x64.efi
|
||||||
|
}
|
||||||
|
else
|
||||||
|
menuentry "Memory Test (memtest86+)" {
|
||||||
|
linux16 /boot/memtest86+x64.bin
|
||||||
|
}
|
||||||
|
fi
|
||||||
|
### /BEE MEMTEST ###
|
||||||
|
EOF
|
||||||
|
}
|
||||||
|
|
||||||
|
append_memtest_isolinux_entry() {
|
||||||
|
isolinux_cfg="$1"
|
||||||
|
[ -f "$isolinux_cfg" ] || return 1
|
||||||
|
grep -q 'Memory Test (memtest86+)' "$isolinux_cfg" && return 0
|
||||||
|
grep -q '### BEE MEMTEST ###' "$isolinux_cfg" && return 0
|
||||||
|
|
||||||
|
cat >> "$isolinux_cfg" <<'EOF'
|
||||||
|
|
||||||
|
# ### BEE MEMTEST ###
|
||||||
|
label memtest
|
||||||
|
menu label ^Memory Test (memtest86+)
|
||||||
|
linux /boot/memtest86+x64.bin
|
||||||
|
# ### /BEE MEMTEST ###
|
||||||
|
EOF
|
||||||
|
}
|
||||||
|
|
||||||
|
copy_memtest_from_deb() {
|
||||||
|
deb="$1"
|
||||||
|
dst_boot="$2"
|
||||||
|
tmpdir="$(mktemp -d)"
|
||||||
|
|
||||||
|
dpkg-deb -x "$deb" "$tmpdir"
|
||||||
|
for f in memtest86+x64.bin memtest86+x64.efi; do
|
||||||
|
if [ -f "$tmpdir/boot/$f" ]; then
|
||||||
|
cp "$tmpdir/boot/$f" "$dst_boot/$f"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
rm -rf "$tmpdir"
|
||||||
|
}
|
||||||
|
|
||||||
|
reset_live_build_stage() {
|
||||||
|
lb_dir="$1"
|
||||||
|
stage="$2"
|
||||||
|
|
||||||
|
for root in \
|
||||||
|
"$lb_dir/.build" \
|
||||||
|
"$lb_dir/.stage" \
|
||||||
|
"$lb_dir/auto"; do
|
||||||
|
[ -d "$root" ] || continue
|
||||||
|
find "$root" -maxdepth 1 \( -name "${stage}" -o -name "${stage}.*" -o -name "*${stage}*" \) -exec rm -rf {} + 2>/dev/null || true
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
recover_iso_memtest() {
|
||||||
|
lb_dir="$1"
|
||||||
|
iso_path="$2"
|
||||||
|
binary_boot="$lb_dir/binary/boot"
|
||||||
|
grub_cfg="$lb_dir/binary/boot/grub/grub.cfg"
|
||||||
|
isolinux_cfg="$lb_dir/binary/isolinux/live.cfg"
|
||||||
|
|
||||||
|
echo "=== attempting memtest recovery in binary tree ==="
|
||||||
|
|
||||||
|
mkdir -p "$binary_boot"
|
||||||
|
|
||||||
|
for root in \
|
||||||
|
"$lb_dir/chroot/boot" \
|
||||||
|
"/boot"; do
|
||||||
|
for f in memtest86+x64.bin memtest86+x64.efi; do
|
||||||
|
if [ ! -f "$binary_boot/$f" ] && [ -f "$root/$f" ]; then
|
||||||
|
cp "$root/$f" "$binary_boot/$f"
|
||||||
|
echo "memtest recovery: copied $f from $root"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
done
|
||||||
|
|
||||||
|
if [ ! -f "$binary_boot/memtest86+x64.bin" ] || [ ! -f "$binary_boot/memtest86+x64.efi" ]; then
|
||||||
|
for dir in \
|
||||||
|
"$lb_dir/cache/packages.binary" \
|
||||||
|
"$lb_dir/cache/packages.chroot" \
|
||||||
|
"$lb_dir/chroot/var/cache/apt/archives" \
|
||||||
|
"${BEE_CACHE_DIR:-${DIST_DIR}/cache}/lb-packages" \
|
||||||
|
"/var/cache/apt/archives"; do
|
||||||
|
[ -d "$dir" ] || continue
|
||||||
|
deb="$(find "$dir" -maxdepth 1 -type f -name 'memtest86+*.deb' 2>/dev/null | head -1)"
|
||||||
|
[ -n "$deb" ] || continue
|
||||||
|
echo "memtest recovery: extracting payload from $deb"
|
||||||
|
copy_memtest_from_deb "$deb" "$binary_boot"
|
||||||
|
break
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ ! -f "$binary_boot/memtest86+x64.bin" ] || [ ! -f "$binary_boot/memtest86+x64.efi" ]; then
|
||||||
|
tmpdl="$(mktemp -d)"
|
||||||
|
if (
|
||||||
|
cd "$tmpdl" && apt-get download memtest86+ >/dev/null 2>&1
|
||||||
|
); then
|
||||||
|
deb="$(find "$tmpdl" -maxdepth 1 -type f -name 'memtest86+*.deb' 2>/dev/null | head -1)"
|
||||||
|
if [ -n "$deb" ]; then
|
||||||
|
echo "memtest recovery: downloaded $deb"
|
||||||
|
copy_memtest_from_deb "$deb" "$binary_boot"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
rm -rf "$tmpdl"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -f "$grub_cfg" ]; then
|
||||||
|
append_memtest_grub_entry "$grub_cfg" && echo "memtest recovery: ensured GRUB entry"
|
||||||
|
else
|
||||||
|
echo "memtest recovery: WARNING: missing $grub_cfg"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -f "$isolinux_cfg" ]; then
|
||||||
|
append_memtest_isolinux_entry "$isolinux_cfg" && echo "memtest recovery: ensured isolinux entry"
|
||||||
|
else
|
||||||
|
echo "memtest recovery: WARNING: missing $isolinux_cfg"
|
||||||
|
fi
|
||||||
|
|
||||||
|
reset_live_build_stage "$lb_dir" "binary_checksums"
|
||||||
|
reset_live_build_stage "$lb_dir" "binary_iso"
|
||||||
|
reset_live_build_stage "$lb_dir" "binary_zsync"
|
||||||
|
|
||||||
|
run_optional_step_sh "rebuild live-build checksums after memtest recovery" "91-lb-checksums" "lb binary_checksums 2>&1"
|
||||||
|
run_optional_step_sh "rebuild ISO after memtest recovery" "92-lb-binary-iso" "rm -f '$iso_path' && lb binary_iso 2>&1"
|
||||||
|
run_optional_step_sh "rebuild zsync after memtest recovery" "93-lb-zsync" "lb binary_zsync 2>&1"
|
||||||
|
|
||||||
|
if [ ! -f "$iso_path" ]; then
|
||||||
|
memtest_fail "ISO rebuild was skipped or failed after memtest recovery: $iso_path" "$iso_path"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
|
AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
|
||||||
ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
|
ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
|
||||||
ISO_BASENAME="easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64"
|
ISO_BASENAME="easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64"
|
||||||
@@ -253,6 +600,10 @@ cleanup_build_log() {
|
|||||||
status="${1:-$?}"
|
status="${1:-$?}"
|
||||||
trap - EXIT INT TERM HUP
|
trap - EXIT INT TERM HUP
|
||||||
|
|
||||||
|
if [ "${STEP_LOG_ACTIVE:-0}" = "1" ]; then
|
||||||
|
cleanup_step_log "${status}" || true
|
||||||
|
fi
|
||||||
|
|
||||||
if [ "${BUILD_LOG_ACTIVE:-0}" = "1" ]; then
|
if [ "${BUILD_LOG_ACTIVE:-0}" = "1" ]; then
|
||||||
BUILD_LOG_ACTIVE=0
|
BUILD_LOG_ACTIVE=0
|
||||||
exec 1>&3 2>&4
|
exec 1>&3 2>&4
|
||||||
@@ -296,6 +647,89 @@ start_build_log() {
|
|||||||
echo "=== build log archive: ${LOG_ARCHIVE} ==="
|
echo "=== build log archive: ${LOG_ARCHIVE} ==="
|
||||||
}
|
}
|
||||||
|
|
||||||
|
cleanup_step_log() {
|
||||||
|
status="${1:-$?}"
|
||||||
|
|
||||||
|
if [ "${STEP_LOG_ACTIVE:-0}" = "1" ]; then
|
||||||
|
STEP_LOG_ACTIVE=0
|
||||||
|
exec 1>&5 2>&6
|
||||||
|
exec 5>&- 6>&-
|
||||||
|
if [ -n "${STEP_TEE_PID:-}" ]; then
|
||||||
|
wait "${STEP_TEE_PID}" 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
rm -f "${STEP_LOG_PIPE}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
return "${status}"
|
||||||
|
}
|
||||||
|
|
||||||
|
run_step() {
|
||||||
|
step_name="$1"
|
||||||
|
step_slug="$2"
|
||||||
|
shift 2
|
||||||
|
|
||||||
|
step_log="${LOG_DIR}/${step_slug}.log"
|
||||||
|
echo ""
|
||||||
|
echo "=== step: ${step_name} ==="
|
||||||
|
echo "=== step log: ${step_log} ==="
|
||||||
|
|
||||||
|
STEP_LOG_PIPE="$(mktemp -u "${TMPDIR:-/tmp}/bee-step-log.XXXXXX")"
|
||||||
|
mkfifo "${STEP_LOG_PIPE}"
|
||||||
|
|
||||||
|
exec 5>&1 6>&2
|
||||||
|
tee "${step_log}" < "${STEP_LOG_PIPE}" >&5 &
|
||||||
|
STEP_TEE_PID=$!
|
||||||
|
exec > "${STEP_LOG_PIPE}" 2>&1
|
||||||
|
STEP_LOG_ACTIVE=1
|
||||||
|
|
||||||
|
set +e
|
||||||
|
"$@"
|
||||||
|
step_status=$?
|
||||||
|
set -e
|
||||||
|
|
||||||
|
cleanup_step_log "${step_status}"
|
||||||
|
if [ "${step_status}" -ne 0 ]; then
|
||||||
|
echo "ERROR: step failed: ${step_name} (see ${step_log})" >&2
|
||||||
|
exit "${step_status}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "=== step OK: ${step_name} ==="
|
||||||
|
}
|
||||||
|
|
||||||
|
run_step_sh() {
|
||||||
|
step_name="$1"
|
||||||
|
step_slug="$2"
|
||||||
|
step_script="$3"
|
||||||
|
|
||||||
|
run_step "${step_name}" "${step_slug}" sh -c "${step_script}"
|
||||||
|
}
|
||||||
|
|
||||||
|
run_optional_step_sh() {
|
||||||
|
step_name="$1"
|
||||||
|
step_slug="$2"
|
||||||
|
step_script="$3"
|
||||||
|
|
||||||
|
if [ "${BEE_REQUIRE_MEMTEST:-0}" = "1" ]; then
|
||||||
|
run_step_sh "${step_name}" "${step_slug}" "${step_script}"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
step_log="${LOG_DIR}/${step_slug}.log"
|
||||||
|
echo ""
|
||||||
|
echo "=== optional step: ${step_name} ==="
|
||||||
|
echo "=== optional step log: ${step_log} ==="
|
||||||
|
set +e
|
||||||
|
sh -c "${step_script}" > "${step_log}" 2>&1
|
||||||
|
step_status=$?
|
||||||
|
set -e
|
||||||
|
cat "${step_log}"
|
||||||
|
if [ "${step_status}" -ne 0 ]; then
|
||||||
|
echo "WARNING: optional step failed: ${step_name} (see ${step_log})" >&2
|
||||||
|
else
|
||||||
|
echo "=== optional step OK: ${step_name} ==="
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
start_build_log
|
start_build_log
|
||||||
|
|
||||||
# Auto-detect kernel ABI: refresh apt index, then query current linux-image-amd64 dependency.
|
# Auto-detect kernel ABI: refresh apt index, then query current linux-image-amd64 dependency.
|
||||||
@@ -331,8 +765,8 @@ echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERS
|
|||||||
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
|
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
|
||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
echo "=== syncing git submodules ==="
|
run_step "sync git submodules" "05-git-submodules" \
|
||||||
git -C "${REPO_ROOT}" submodule update --init --recursive
|
git -C "${REPO_ROOT}" submodule update --init --recursive
|
||||||
|
|
||||||
# --- compile bee binary (static, Linux amd64) ---
|
# --- compile bee binary (static, Linux amd64) ---
|
||||||
# Shared between variants — built once, reused on second pass.
|
# Shared between variants — built once, reused on second pass.
|
||||||
@@ -344,13 +778,13 @@ if [ -f "$BEE_BIN" ]; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "$NEED_BUILD" = "1" ]; then
|
if [ "$NEED_BUILD" = "1" ]; then
|
||||||
echo "=== building bee binary ==="
|
run_step_sh "build bee binary" "10-build-bee" \
|
||||||
cd "${REPO_ROOT}/audit"
|
"cd '${REPO_ROOT}/audit' && \
|
||||||
GOOS=linux GOARCH=amd64 CGO_ENABLED=0 \
|
env GOOS=linux GOARCH=amd64 CGO_ENABLED=0 \
|
||||||
go build \
|
go build \
|
||||||
-ldflags "-s -w -X main.Version=${AUDIT_VERSION_EFFECTIVE}" \
|
-ldflags '-s -w -X main.Version=${AUDIT_VERSION_EFFECTIVE}' \
|
||||||
-o "$BEE_BIN" \
|
-o '${BEE_BIN}' \
|
||||||
./cmd/bee
|
./cmd/bee"
|
||||||
echo "binary: $BEE_BIN"
|
echo "binary: $BEE_BIN"
|
||||||
if command -v stat >/dev/null 2>&1; then
|
if command -v stat >/dev/null 2>&1; then
|
||||||
BEE_SIZE_BYTES="$(stat -c '%s' "$BEE_BIN" 2>/dev/null || stat -f '%z' "$BEE_BIN")"
|
BEE_SIZE_BYTES="$(stat -c '%s' "$BEE_BIN" 2>/dev/null || stat -f '%z' "$BEE_BIN")"
|
||||||
@@ -369,8 +803,7 @@ fi
|
|||||||
# --- NVIDIA-only build steps ---
|
# --- NVIDIA-only build steps ---
|
||||||
GPU_BURN_WORKER_BIN="${DIST_DIR}/bee-gpu-burn-worker-linux-amd64"
|
GPU_BURN_WORKER_BIN="${DIST_DIR}/bee-gpu-burn-worker-linux-amd64"
|
||||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
echo ""
|
run_step "download cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace" "20-cublas" \
|
||||||
echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
|
|
||||||
sh "${BUILDER_DIR}/build-cublas.sh" \
|
sh "${BUILDER_DIR}/build-cublas.sh" \
|
||||||
"${CUBLAS_VERSION}" \
|
"${CUBLAS_VERSION}" \
|
||||||
"${CUDA_USERSPACE_VERSION}" \
|
"${CUDA_USERSPACE_VERSION}" \
|
||||||
@@ -385,7 +818,7 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
||||||
echo "=== building bee-gpu-burn worker ==="
|
run_step "build bee-gpu-burn worker" "21-gpu-burn-worker" \
|
||||||
gcc -O2 -s -Wall -Wextra \
|
gcc -O2 -s -Wall -Wextra \
|
||||||
-I"${CUBLAS_CACHE}/include" \
|
-I"${CUBLAS_CACHE}/include" \
|
||||||
-o "$GPU_BURN_WORKER_BIN" \
|
-o "$GPU_BURN_WORKER_BIN" \
|
||||||
@@ -429,7 +862,6 @@ rm -f \
|
|||||||
"${OVERLAY_STAGE_DIR}/etc/bee-release" \
|
"${OVERLAY_STAGE_DIR}/etc/bee-release" \
|
||||||
"${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
|
"${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
|
||||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
|
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
|
||||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" \
|
|
||||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/john" \
|
"${OVERLAY_STAGE_DIR}/usr/local/bin/john" \
|
||||||
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker" \
|
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker" \
|
||||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
|
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
|
||||||
@@ -507,8 +939,7 @@ done
|
|||||||
|
|
||||||
# --- NVIDIA kernel modules and userspace libs ---
|
# --- NVIDIA kernel modules and userspace libs ---
|
||||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
echo ""
|
run_step "build NVIDIA ${NVIDIA_DRIVER_VERSION} modules" "40-nvidia-module" \
|
||||||
echo "=== building NVIDIA ${NVIDIA_DRIVER_VERSION} modules ==="
|
|
||||||
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
|
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
|
||||||
|
|
||||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||||
@@ -537,8 +968,7 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# --- build / download NCCL ---
|
# --- build / download NCCL ---
|
||||||
echo ""
|
run_step "download NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}" "50-nccl" \
|
||||||
echo "=== downloading NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION} ==="
|
|
||||||
sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}" "${NCCL_SHA256:-}"
|
sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}" "${NCCL_SHA256:-}"
|
||||||
|
|
||||||
NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||||
@@ -552,8 +982,7 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
|||||||
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||||
|
|
||||||
# --- build nccl-tests ---
|
# --- build nccl-tests ---
|
||||||
echo ""
|
run_step "build nccl-tests ${NCCL_TESTS_VERSION}" "60-nccl-tests" \
|
||||||
echo "=== building nccl-tests ${NCCL_TESTS_VERSION} ==="
|
|
||||||
sh "${BUILDER_DIR}/build-nccl-tests.sh" \
|
sh "${BUILDER_DIR}/build-nccl-tests.sh" \
|
||||||
"${NCCL_TESTS_VERSION}" \
|
"${NCCL_TESTS_VERSION}" \
|
||||||
"${NCCL_VERSION}" \
|
"${NCCL_VERSION}" \
|
||||||
@@ -568,8 +997,7 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
|||||||
cp "${NCCL_TESTS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
cp "${NCCL_TESTS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
||||||
echo "=== all_reduce_perf injected ==="
|
echo "=== all_reduce_perf injected ==="
|
||||||
|
|
||||||
echo ""
|
run_step "build john jumbo ${JOHN_JUMBO_COMMIT}" "70-john" \
|
||||||
echo "=== building john jumbo ${JOHN_JUMBO_COMMIT} ==="
|
|
||||||
sh "${BUILDER_DIR}/build-john.sh" "${JOHN_JUMBO_COMMIT}" "${DIST_DIR}"
|
sh "${BUILDER_DIR}/build-john.sh" "${JOHN_JUMBO_COMMIT}" "${DIST_DIR}"
|
||||||
JOHN_CACHE="${DIST_DIR}/john-${JOHN_JUMBO_COMMIT}"
|
JOHN_CACHE="${DIST_DIR}/john-${JOHN_JUMBO_COMMIT}"
|
||||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john"
|
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john"
|
||||||
@@ -691,10 +1119,10 @@ BEE_GPU_VENDOR_UPPER="$(echo "${BEE_GPU_VENDOR}" | tr 'a-z' 'A-Z')"
|
|||||||
export BEE_GPU_VENDOR_UPPER
|
export BEE_GPU_VENDOR_UPPER
|
||||||
|
|
||||||
cd "${LB_DIR}"
|
cd "${LB_DIR}"
|
||||||
lb clean 2>&1 | tail -3
|
run_step_sh "live-build clean" "80-lb-clean" "lb clean 2>&1 | tail -3"
|
||||||
lb config 2>&1 | tail -5
|
run_step_sh "live-build config" "81-lb-config" "lb config 2>&1 | tail -5"
|
||||||
dump_memtest_debug "pre-build" "${LB_DIR}"
|
dump_memtest_debug "pre-build" "${LB_DIR}"
|
||||||
lb build 2>&1
|
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
|
||||||
|
|
||||||
# --- persist deb package cache back to shared location ---
|
# --- persist deb package cache back to shared location ---
|
||||||
# This allows the second variant to reuse all downloaded packages.
|
# This allows the second variant to reuse all downloaded packages.
|
||||||
@@ -707,6 +1135,17 @@ fi
|
|||||||
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
|
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
|
||||||
if [ -f "$ISO_RAW" ]; then
|
if [ -f "$ISO_RAW" ]; then
|
||||||
dump_memtest_debug "post-build" "${LB_DIR}" "$ISO_RAW"
|
dump_memtest_debug "post-build" "${LB_DIR}" "$ISO_RAW"
|
||||||
|
if iso_memtest_present "$ISO_RAW"; then
|
||||||
|
:
|
||||||
|
else
|
||||||
|
memtest_status=$?
|
||||||
|
if [ "$memtest_status" -eq 1 ]; then
|
||||||
|
recover_iso_memtest "${LB_DIR}" "$ISO_RAW"
|
||||||
|
dump_memtest_debug "post-recovery" "${LB_DIR}" "$ISO_RAW"
|
||||||
|
elif [ "$memtest_status" -eq 2 ]; then
|
||||||
|
memtest_fail "failed to inspect ISO for memtest before recovery" "$ISO_RAW"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
validate_iso_memtest "$ISO_RAW"
|
validate_iso_memtest "$ISO_RAW"
|
||||||
cp "$ISO_RAW" "$ISO_OUT"
|
cp "$ISO_RAW" "$ISO_OUT"
|
||||||
echo ""
|
echo ""
|
||||||
|
|||||||
@@ -14,6 +14,11 @@ menuentry "EASY-BEE" {
|
|||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
|
menuentry "EASY-BEE (graphics/KMS)" {
|
||||||
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||||
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE (load to RAM)" {
|
menuentry "EASY-BEE (load to RAM)" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
@@ -24,6 +29,11 @@ menuentry "EASY-BEE (NVIDIA GSP=off)" {
|
|||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
|
menuentry "EASY-BEE (graphics/KMS, GSP=off)" {
|
||||||
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||||
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE (fail-safe)" {
|
menuentry "EASY-BEE (fail-safe)" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
|
|||||||
@@ -5,6 +5,12 @@ label live-@FLAVOUR@-normal
|
|||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ bee.nvidia.mode=normal
|
append @APPEND_LIVE@ bee.nvidia.mode=normal
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-kms
|
||||||
|
menu label EASY-BEE (^graphics/KMS)
|
||||||
|
linux @LINUX@
|
||||||
|
initrd @INITRD@
|
||||||
|
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal
|
||||||
|
|
||||||
label live-@FLAVOUR@-toram
|
label live-@FLAVOUR@-toram
|
||||||
menu label EASY-BEE (^load to RAM)
|
menu label EASY-BEE (^load to RAM)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
@@ -17,6 +23,12 @@ label live-@FLAVOUR@-gsp-off
|
|||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off
|
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-kms-gsp-off
|
||||||
|
menu label EASY-BEE (g^raphics/KMS, GSP=off)
|
||||||
|
linux @LINUX@
|
||||||
|
initrd @INITRD@
|
||||||
|
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off
|
||||||
|
|
||||||
label live-@FLAVOUR@-failsafe
|
label live-@FLAVOUR@-failsafe
|
||||||
menu label EASY-BEE (^fail-safe)
|
menu label EASY-BEE (^fail-safe)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
|
|||||||
139
iso/builder/config/hooks/normal/9100-memtest.hook.binary
Executable file
139
iso/builder/config/hooks/normal/9100-memtest.hook.binary
Executable file
@@ -0,0 +1,139 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# Ensure memtest is present in the final ISO even if live-build's built-in
|
||||||
|
# memtest stage does not copy the binaries or expose menu entries.
|
||||||
|
set -e
|
||||||
|
|
||||||
|
: "${BEE_REQUIRE_MEMTEST:=0}"
|
||||||
|
|
||||||
|
MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi"
|
||||||
|
BINARY_BOOT_DIR="binary/boot"
|
||||||
|
GRUB_CFG="binary/boot/grub/grub.cfg"
|
||||||
|
ISOLINUX_CFG="binary/isolinux/live.cfg"
|
||||||
|
|
||||||
|
log() {
|
||||||
|
echo "memtest hook: $*"
|
||||||
|
}
|
||||||
|
|
||||||
|
fail_or_warn() {
|
||||||
|
msg="$1"
|
||||||
|
if [ "${BEE_REQUIRE_MEMTEST}" = "1" ]; then
|
||||||
|
log "ERROR: ${msg}"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
log "WARNING: ${msg}"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
copy_memtest_file() {
|
||||||
|
src="$1"
|
||||||
|
base="$(basename "$src")"
|
||||||
|
dst="${BINARY_BOOT_DIR}/${base}"
|
||||||
|
|
||||||
|
[ -f "$src" ] || return 1
|
||||||
|
mkdir -p "${BINARY_BOOT_DIR}"
|
||||||
|
cp "$src" "$dst"
|
||||||
|
log "copied ${base} from ${src}"
|
||||||
|
}
|
||||||
|
|
||||||
|
extract_memtest_from_deb() {
|
||||||
|
deb="$1"
|
||||||
|
tmpdir="$(mktemp -d)"
|
||||||
|
|
||||||
|
log "extracting memtest payload from ${deb}"
|
||||||
|
dpkg-deb -x "$deb" "$tmpdir"
|
||||||
|
for f in ${MEMTEST_FILES}; do
|
||||||
|
if [ -f "${tmpdir}/boot/${f}" ]; then
|
||||||
|
copy_memtest_file "${tmpdir}/boot/${f}"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
rm -rf "$tmpdir"
|
||||||
|
}
|
||||||
|
|
||||||
|
ensure_memtest_binaries() {
|
||||||
|
missing=0
|
||||||
|
for f in ${MEMTEST_FILES}; do
|
||||||
|
[ -f "${BINARY_BOOT_DIR}/${f}" ] || missing=1
|
||||||
|
done
|
||||||
|
[ "$missing" -eq 1 ] || return 0
|
||||||
|
|
||||||
|
for root in chroot/boot /boot; do
|
||||||
|
for f in ${MEMTEST_FILES}; do
|
||||||
|
[ -f "${BINARY_BOOT_DIR}/${f}" ] || copy_memtest_file "${root}/${f}" || true
|
||||||
|
done
|
||||||
|
done
|
||||||
|
|
||||||
|
missing=0
|
||||||
|
for f in ${MEMTEST_FILES}; do
|
||||||
|
[ -f "${BINARY_BOOT_DIR}/${f}" ] || missing=1
|
||||||
|
done
|
||||||
|
[ "$missing" -eq 1 ] || return 0
|
||||||
|
|
||||||
|
for root in cache chroot/var/cache/apt/archives /var/cache/apt/archives; do
|
||||||
|
[ -d "$root" ] || continue
|
||||||
|
deb="$(find "$root" -type f \( -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' \) 2>/dev/null | head -1)"
|
||||||
|
[ -n "$deb" ] || continue
|
||||||
|
extract_memtest_from_deb "$deb"
|
||||||
|
break
|
||||||
|
done
|
||||||
|
|
||||||
|
missing=0
|
||||||
|
for f in ${MEMTEST_FILES}; do
|
||||||
|
if [ ! -f "${BINARY_BOOT_DIR}/${f}" ]; then
|
||||||
|
fail_or_warn "missing ${BINARY_BOOT_DIR}/${f}"
|
||||||
|
missing=1
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
[ "$missing" -eq 0 ] || return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
ensure_grub_entry() {
|
||||||
|
[ -f "$GRUB_CFG" ] || {
|
||||||
|
fail_or_warn "missing ${GRUB_CFG}"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
grep -q '### BEE MEMTEST ###' "$GRUB_CFG" && return 0
|
||||||
|
|
||||||
|
cat >> "$GRUB_CFG" <<'EOF'
|
||||||
|
|
||||||
|
### BEE MEMTEST ###
|
||||||
|
if [ "${grub_platform}" = "efi" ]; then
|
||||||
|
menuentry "Memory Test (memtest86+)" {
|
||||||
|
chainloader /boot/memtest86+x64.efi
|
||||||
|
}
|
||||||
|
else
|
||||||
|
menuentry "Memory Test (memtest86+)" {
|
||||||
|
linux16 /boot/memtest86+x64.bin
|
||||||
|
}
|
||||||
|
fi
|
||||||
|
### /BEE MEMTEST ###
|
||||||
|
EOF
|
||||||
|
|
||||||
|
log "appended memtest entry to ${GRUB_CFG}"
|
||||||
|
}
|
||||||
|
|
||||||
|
ensure_isolinux_entry() {
|
||||||
|
[ -f "$ISOLINUX_CFG" ] || {
|
||||||
|
fail_or_warn "missing ${ISOLINUX_CFG}"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
grep -q '### BEE MEMTEST ###' "$ISOLINUX_CFG" && return 0
|
||||||
|
|
||||||
|
cat >> "$ISOLINUX_CFG" <<'EOF'
|
||||||
|
|
||||||
|
# ### BEE MEMTEST ###
|
||||||
|
label memtest
|
||||||
|
menu label ^Memory Test (memtest86+)
|
||||||
|
linux /boot/memtest86+x64.bin
|
||||||
|
# ### /BEE MEMTEST ###
|
||||||
|
EOF
|
||||||
|
|
||||||
|
log "appended memtest entry to ${ISOLINUX_CFG}"
|
||||||
|
}
|
||||||
|
|
||||||
|
log "ensuring memtest binaries and menu entries in binary image"
|
||||||
|
ensure_memtest_binaries
|
||||||
|
ensure_grub_entry
|
||||||
|
ensure_isolinux_entry
|
||||||
|
log "memtest assets ready"
|
||||||
@@ -1,3 +1,6 @@
|
|||||||
|
# AMD GPU firmware
|
||||||
|
firmware-amd-graphics
|
||||||
|
|
||||||
# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
|
# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
|
||||||
rocm-smi-lib=%%ROCM_SMI_VERSION%%
|
rocm-smi-lib=%%ROCM_SMI_VERSION%%
|
||||||
rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%%
|
rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%%
|
||||||
|
|||||||
@@ -71,9 +71,7 @@ lightdm
|
|||||||
firmware-linux-free
|
firmware-linux-free
|
||||||
firmware-linux-nonfree
|
firmware-linux-nonfree
|
||||||
firmware-misc-nonfree
|
firmware-misc-nonfree
|
||||||
firmware-amd-graphics
|
|
||||||
firmware-realtek
|
firmware-realtek
|
||||||
firmware-intel-sound
|
|
||||||
firmware-bnx2
|
firmware-bnx2
|
||||||
firmware-bnx2x
|
firmware-bnx2x
|
||||||
firmware-cavium
|
firmware-cavium
|
||||||
|
|||||||
@@ -52,6 +52,14 @@ else
|
|||||||
fail "nvidia-smi: NOT FOUND"
|
fail "nvidia-smi: NOT FOUND"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Check that every GPU stress helper can be resolved. /usr/local/bin is
# prepended to PATH for the lookup only; the script's own PATH is unchanged.
for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf; do
    if ! p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
        fail "$tool: NOT FOUND"
        continue
    fi
    ok "$tool found: $p"
done
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
echo "-- NVIDIA modules --"
|
echo "-- NVIDIA modules --"
|
||||||
KO_DIR="/usr/local/lib/nvidia"
|
KO_DIR="/usr/local/lib/nvidia"
|
||||||
@@ -109,6 +117,40 @@ else
|
|||||||
fail "nvidia-smi: not found in PATH"
|
fail "nvidia-smi: not found in PATH"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "-- OpenCL / John --"
|
||||||
|
# The NVIDIA ICD file must exist under /etc/OpenCL/vendors.
if [ ! -f /etc/OpenCL/vendors/nvidia.icd ]; then
    fail "OpenCL ICD missing: /etc/OpenCL/vendors/nvidia.icd"
else
    ok "OpenCL ICD present: /etc/OpenCL/vendors/nvidia.icd"
fi

# The NVIDIA OpenCL runtime library must be listed in the linker cache.
if ! ldconfig -p 2>/dev/null | grep -q "libnvidia-opencl.so.1"; then
    fail "libnvidia-opencl.so.1 missing from linker cache"
else
    ok "libnvidia-opencl.so.1 present in linker cache"
fi

# clinfo must be installed and list at least one platform.
if ! command -v clinfo >/dev/null 2>&1; then
    fail "clinfo: not found in PATH"
elif clinfo -l 2>/dev/null | grep -q "Platform"; then
    ok "clinfo: OpenCL platform detected"
else
    fail "clinfo: no OpenCL platform detected"
fi

# john must be installed and list at least one OpenCL device.
if ! command -v john >/dev/null 2>&1; then
    fail "john: not found in PATH"
elif john --list=opencl-devices 2>/dev/null | grep -q "Device #"; then
    ok "john: OpenCL devices detected"
else
    fail "john: no OpenCL devices detected"
fi
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
echo "-- lib symlinks --"
|
echo "-- lib symlinks --"
|
||||||
for lib in libnvidia-ml libcuda; do
|
for lib in libnvidia-ml libcuda; do
|
||||||
|
|||||||
@@ -1,9 +1,14 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Bee: on-demand hardware audit (not started automatically)
|
Description=Bee: hardware audit
|
||||||
|
After=bee-preflight.service bee-network.service bee-nvidia.service
|
||||||
|
Before=bee-web.service
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=oneshot
|
Type=oneshot
|
||||||
RemainAfterExit=yes
|
RemainAfterExit=yes
|
||||||
ExecStart=/bin/sh -c 'curl -sf -X POST http://localhost/api/audit/run >/dev/null'
|
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-audit.log /usr/local/bin/bee audit --runtime auto --output file:/appdata/bee/export/bee-audit.json
|
||||||
StandardOutput=journal
|
StandardOutput=journal
|
||||||
StandardError=journal
|
StandardError=journal
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Bee: hardware audit web viewer
|
Description=Bee: hardware audit web viewer
|
||||||
|
After=bee-audit.service
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=simple
|
Type=simple
|
||||||
|
|||||||
@@ -0,0 +1,6 @@
|
|||||||
|
[Unit]
|
||||||
|
Wants=bee-preflight.service
|
||||||
|
After=bee-preflight.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
ExecStartPre=/usr/local/bin/bee-display-mode
|
||||||
54
iso/overlay/usr/local/bin/bee-display-mode
Executable file
54
iso/overlay/usr/local/bin/bee-display-mode
Executable file
@@ -0,0 +1,54 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# Select Xorg display mode based on kernel cmdline.
|
||||||
|
# Default is the current server-safe path: keep forced fbdev.
|
||||||
|
set -eu
|
||||||
|
|
||||||
|
# cmdline_param KEY
# Print the value of the first KEY=value token found in /proc/cmdline and
# return 0. Return 1 when no token starts with "KEY=" (this includes the case
# where /proc/cmdline cannot be read, since the token list is then empty).
cmdline_param() {
    key="$1"
    # Deliberately unquoted: split the command line into whitespace-separated
    # tokens. Positional parameters are local to the function call.
    set -- $(cat /proc/cmdline 2>/dev/null)
    while [ "$#" -gt 0 ]; do
        token="$1"
        shift
        case "$token" in
            "$key"=*)
                # Strip everything up to and including the first '='.
                echo "${token#*=}"
                return 0
                ;;
        esac
    done
    return 1
}
|
||||||
|
|
||||||
|
# log MESSAGE...
# Write MESSAGE to stdout prefixed with the script name.
# printf is used instead of echo because some shells' echo interprets
# backslash escapes, and messages embed the bee.display value taken verbatim
# from the kernel command line.
log() {
    printf 'bee-display-mode: %s\n' "$*"
}
|
||||||
|
|
||||||
|
# Requested display mode comes from bee.display=<mode> on the kernel command
# line; when the parameter is absent or empty, fall back to "safe".
mode="$(cmdline_param bee.display || true)"
if [ -z "$mode" ]; then
    mode="safe"
fi

# The forced-fbdev Xorg snippet is toggled by renaming it to and from a
# ".disabled" name inside xorg.conf.d.
xorg_dir="/etc/X11/xorg.conf.d"
fbdev_conf="${xorg_dir}/10-fbdev.conf"
fbdev_park="${xorg_dir}/10-fbdev.conf.disabled"

mkdir -p "$xorg_dir"

case "$mode" in
    kms|auto)
        # Park the forced fbdev snippet so it no longer applies.
        if [ -f "$fbdev_conf" ]; then
            mv "$fbdev_conf" "$fbdev_park"
            log "mode=${mode}; disabled forced fbdev config"
        else
            log "mode=${mode}; fbdev config already disabled"
        fi
        ;;
    *)
        # safe, fbdev and any unrecognised value all take the fbdev path.
        # Unknown values must also restore a snippet parked by an earlier
        # kms/auto run; otherwise the "keeping forced fbdev config" message
        # would be logged while the snippet is still disabled.
        case "$mode" in
            safe|fbdev) label="mode=${mode}" ;;
            *)          label="unknown bee.display=${mode}" ;;
        esac
        if [ -f "$fbdev_park" ] && [ ! -f "$fbdev_conf" ]; then
            mv "$fbdev_park" "$fbdev_conf"
            log "${label}; restored forced fbdev config"
        else
            log "${label}; keeping forced fbdev config"
        fi
        ;;
esac
|
||||||
@@ -2,7 +2,7 @@
|
|||||||
set -eu
|
set -eu
|
||||||
|
|
||||||
SECONDS=5
|
SECONDS=5
|
||||||
SIZE_MB=64
|
SIZE_MB=0
|
||||||
DEVICES=""
|
DEVICES=""
|
||||||
EXCLUDE=""
|
EXCLUDE=""
|
||||||
WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
|
WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
|
||||||
@@ -68,8 +68,17 @@ trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
|
|||||||
WORKERS=""
|
WORKERS=""
|
||||||
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
||||||
log="${TMP_DIR}/gpu-${id}.log"
|
log="${TMP_DIR}/gpu-${id}.log"
|
||||||
echo "starting gpu ${id}"
|
gpu_size_mb="${SIZE_MB}"
|
||||||
"${WORKER}" --device "${id}" --seconds "${SECONDS}" --size-mb "${SIZE_MB}" >"${log}" 2>&1 &
|
if [ "${gpu_size_mb}" -le 0 ] 2>/dev/null; then
|
||||||
|
total_mb=$(nvidia-smi --id="${id}" --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null | tr -d '[:space:]')
|
||||||
|
if [ -n "${total_mb}" ] && [ "${total_mb}" -gt 0 ] 2>/dev/null; then
|
||||||
|
gpu_size_mb=$(( total_mb * 95 / 100 ))
|
||||||
|
else
|
||||||
|
gpu_size_mb=512
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
echo "starting gpu ${id} size=${gpu_size_mb}MB"
|
||||||
|
"${WORKER}" --device "${id}" --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
|
||||||
pid=$!
|
pid=$!
|
||||||
WORKERS="${WORKERS} ${pid}:${id}:${log}"
|
WORKERS="${WORKERS} ${pid}:${id}:${log}"
|
||||||
done
|
done
|
||||||
|
|||||||
@@ -7,6 +7,8 @@ EXCLUDE=""
|
|||||||
FORMAT=""
|
FORMAT=""
|
||||||
JOHN_DIR="/usr/local/lib/bee/john/run"
|
JOHN_DIR="/usr/local/lib/bee/john/run"
|
||||||
JOHN_BIN="${JOHN_DIR}/john"
|
JOHN_BIN="${JOHN_DIR}/john"
|
||||||
|
export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
|
||||||
|
export LD_LIBRARY_PATH="/usr/lib:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
echo "usage: $0 [--seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
|
echo "usage: $0 [--seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
|
||||||
@@ -24,6 +26,21 @@ contains_csv() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
show_opencl_diagnostics() {
|
show_opencl_diagnostics() {
|
||||||
|
echo "-- OpenCL ICD vendors --" >&2
|
||||||
|
if [ -d /etc/OpenCL/vendors ]; then
|
||||||
|
ls -l /etc/OpenCL/vendors >&2 || true
|
||||||
|
for icd in /etc/OpenCL/vendors/*.icd; do
|
||||||
|
[ -f "${icd}" ] || continue
|
||||||
|
echo " file: ${icd}" >&2
|
||||||
|
sed 's/^/ /' "${icd}" >&2 || true
|
||||||
|
done
|
||||||
|
else
|
||||||
|
echo " /etc/OpenCL/vendors is missing" >&2
|
||||||
|
fi
|
||||||
|
echo "-- NVIDIA device nodes --" >&2
|
||||||
|
ls -l /dev/nvidia* >&2 || true
|
||||||
|
echo "-- ldconfig OpenCL/NVIDIA --" >&2
|
||||||
|
ldconfig -p 2>/dev/null | grep 'libOpenCL\|libcuda\|libnvidia-opencl' >&2 || true
|
||||||
if command -v clinfo >/dev/null 2>&1; then
|
if command -v clinfo >/dev/null 2>&1; then
|
||||||
echo "-- clinfo -l --" >&2
|
echo "-- clinfo -l --" >&2
|
||||||
clinfo -l >&2 || true
|
clinfo -l >&2 || true
|
||||||
@@ -32,6 +49,17 @@ show_opencl_diagnostics() {
|
|||||||
./john --list=opencl-devices >&2 || true
|
./john --list=opencl-devices >&2 || true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# refresh_nvidia_runtime
# Best-effort refresh of the NVIDIA runtime state. Returns 1 without doing
# anything when not running as uid 0. Otherwise runs bee-nvidia-load (if it is
# on PATH) and ldconfig, ignoring their failures and output, and returns 0.
refresh_nvidia_runtime() {
    [ "$(id -u)" = "0" ] || return 1

    if command -v bee-nvidia-load >/dev/null 2>&1; then
        bee-nvidia-load >/dev/null 2>&1 || true
    fi

    # Rebuild the dynamic linker cache; a failure here is not fatal.
    ldconfig >/dev/null 2>&1 || true
    return 0
}
|
||||||
|
|
||||||
ensure_nvidia_uvm() {
|
ensure_nvidia_uvm() {
|
||||||
if lsmod 2>/dev/null | grep -q '^nvidia_uvm '; then
|
if lsmod 2>/dev/null | grep -q '^nvidia_uvm '; then
|
||||||
return 0
|
return 0
|
||||||
@@ -61,6 +89,13 @@ ensure_opencl_ready() {
|
|||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if refresh_nvidia_runtime; then
|
||||||
|
out=$(./john --list=opencl-devices 2>&1 || true)
|
||||||
|
if echo "${out}" | grep -q "Device #"; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
if ensure_nvidia_uvm; then
|
if ensure_nvidia_uvm; then
|
||||||
out=$(./john --list=opencl-devices 2>&1 || true)
|
out=$(./john --list=opencl-devices 2>&1 || true)
|
||||||
if echo "${out}" | grep -q "Device #"; then
|
if echo "${out}" | grep -q "Device #"; then
|
||||||
@@ -155,4 +190,16 @@ CHOSEN_FORMAT=$(choose_format) || {
|
|||||||
}
|
}
|
||||||
|
|
||||||
echo "format=${CHOSEN_FORMAT}"
|
echo "format=${CHOSEN_FORMAT}"
|
||||||
exec ./john --test="${SECONDS}" --format="${CHOSEN_FORMAT}" --devices="${JOHN_DEVICES}"
|
PIDS=""
|
||||||
|
_first=1
|
||||||
|
for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
|
||||||
|
[ "${_first}" = "1" ] || sleep 3
|
||||||
|
_first=0
|
||||||
|
./john --test="${SECONDS}" --format="${CHOSEN_FORMAT}" --devices="${opencl_id}" &
|
||||||
|
PIDS="${PIDS} $!"
|
||||||
|
done
|
||||||
|
FAIL=0
|
||||||
|
for pid in ${PIDS}; do
|
||||||
|
wait "${pid}" || FAIL=$((FAIL+1))
|
||||||
|
done
|
||||||
|
[ "${FAIL}" -eq 0 ] || { echo "john: ${FAIL} device(s) failed" >&2; exit 1; }
|
||||||
|
|||||||
@@ -6,25 +6,66 @@ LOG_PREFIX="bee-network"
|
|||||||
|
|
||||||
log() { echo "[$LOG_PREFIX] $*"; }
|
log() { echo "[$LOG_PREFIX] $*"; }
|
||||||
|
|
||||||
# find physical interfaces: exclude lo and virtual (docker/virbr/veth/tun/tap)
|
list_interfaces() {
|
||||||
interfaces=$(ip -o link show \
|
ip -o link show \
|
||||||
| awk -F': ' '{print $2}' \
|
| awk -F': ' '{print $2}' \
|
||||||
| grep -v '^lo$' \
|
| grep -v '^lo$' \
|
||||||
| grep -vE '^(docker|virbr|veth|tun|tap|br-|bond|dummy)' \
|
| grep -vE '^(docker|virbr|veth|tun|tap|br-|bond|dummy)' \
|
||||||
| sort)
|
| sort
|
||||||
|
}
|
||||||
|
|
||||||
if [ -z "$interfaces" ]; then
|
# Give udev a short chance to expose late NICs before the first scan.
|
||||||
log "no physical interfaces found"
|
if command -v udevadm >/dev/null 2>&1; then
|
||||||
exit 0
|
udevadm settle --timeout=5 >/dev/null 2>&1 || log "WARN: udevadm settle timed out"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
for iface in $interfaces; do
|
started_ifaces=""
|
||||||
|
started_count=0
|
||||||
|
scan_pass=1
|
||||||
|
|
||||||
|
# Some server NICs appear a bit later after module/firmware init. Do a small
|
||||||
|
# bounded rescan window without turning network bring-up into a boot blocker.
|
||||||
|
while [ "$scan_pass" -le 3 ]; do
|
||||||
|
interfaces=$(list_interfaces)
|
||||||
|
|
||||||
|
if [ -n "$interfaces" ]; then
|
||||||
|
for iface in $interfaces; do
|
||||||
|
case " $started_ifaces " in
|
||||||
|
*" $iface "*) continue ;;
|
||||||
|
esac
|
||||||
|
|
||||||
log "bringing up $iface"
|
log "bringing up $iface"
|
||||||
ip link set "$iface" up || { log "WARN: could not bring up $iface"; continue; }
|
if ! ip link set "$iface" up; then
|
||||||
|
log "WARN: could not bring up $iface"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
carrier=$(cat "/sys/class/net/$iface/carrier" 2>/dev/null || true)
|
||||||
|
if [ "$carrier" = "1" ]; then
|
||||||
|
log "carrier detected on $iface"
|
||||||
|
else
|
||||||
|
log "carrier not detected yet on $iface"
|
||||||
|
fi
|
||||||
|
|
||||||
# DHCP in background — non-blocking, keep dhclient verbose output in the service log.
|
# DHCP in background — non-blocking, keep dhclient verbose output in the service log.
|
||||||
dhclient -4 -v -nw "$iface" &
|
dhclient -4 -v -nw "$iface" &
|
||||||
log "DHCP started for $iface (pid $!)"
|
log "DHCP started for $iface (pid $!)"
|
||||||
|
|
||||||
|
started_ifaces="$started_ifaces $iface"
|
||||||
|
started_count=$((started_count + 1))
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ "$scan_pass" -ge 3 ]; then
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
scan_pass=$((scan_pass + 1))
|
||||||
|
sleep 2
|
||||||
done
|
done
|
||||||
|
|
||||||
log "done"
|
if [ "$started_count" -eq 0 ]; then
|
||||||
|
log "no physical interfaces found"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
log "done (interfaces started: $started_count)"
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ chromium \
|
|||||||
--no-first-run \
|
--no-first-run \
|
||||||
--disable-session-crashed-bubble \
|
--disable-session-crashed-bubble \
|
||||||
--disable-features=TranslateUI \
|
--disable-features=TranslateUI \
|
||||||
--start-fullscreen \
|
--start-maximized \
|
||||||
http://localhost/ &
|
http://localhost/ &
|
||||||
|
|
||||||
exec openbox
|
exec openbox
|
||||||
|
|||||||
@@ -3,6 +3,11 @@
|
|||||||
# Type 'a' at any prompt to abort, 'b' to go back.
|
# Type 'a' at any prompt to abort, 'b' to go back.
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
|
# Requires root for ip/dhclient/resolv.conf — re-exec via sudo if needed.
|
||||||
|
if [ "$(id -u)" -ne 0 ]; then
|
||||||
|
exec sudo "$0" "$@"
|
||||||
|
fi
|
||||||
|
|
||||||
abort() { echo "Aborted."; exit 0; }
|
abort() { echo "Aborted."; exit 0; }
|
||||||
|
|
||||||
ask() {
|
ask() {
|
||||||
|
|||||||
Reference in New Issue
Block a user