Files
bee/audit/internal/platform/live_metrics.go
Michael Chus 0a98ed8ae9 feat: task queue, UI overhaul, burn tests, install-to-RAM
- Task queue: all SAT/audit jobs enqueue and run one-at-a-time;
  tasks persist past page navigation; new Tasks page with cancel/priority/log stream
- UI: consolidate nav (Validate, Burn, Tasks, Tools); Audit becomes modal;
  Dashboard hardware summary badges + split metrics charts (load/temp/power);
  Tools page consolidates network, services, install, support bundle
- AMD GPU: acceptance test and stress burn cards; GPU presence API greys
  out irrelevant SAT cards automatically
- Burn tests: Memory Stress (stress-ng --vm), SAT Stress (stressapptest)
- Install to RAM: copies squashfs to /dev/shm, re-associates loop devices
  via LOOP_CHANGE_FD ioctl so live media can be ejected
- Charts: relative time axis (0 = now, negative left)
- memtester: LimitMEMLOCK=infinity in bee-web.service; empty output → UNSUPPORTED
- SAT overlay applied dynamically on every /audit.json serve
- MIME panic guard for LiveCD ramdisk I/O errors
- ISO: add memtest86+, stressapptest packages; memtest86+ GRUB entry;
  disable screensaver/DPMS in bee-openbox-session
- Unknown SAT status severity = 1 (does not override OK)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-28 21:15:11 +03:00

143 lines
3.4 KiB
Go

package platform
import (
"bufio"
"os"
"strconv"
"strings"
"time"
)
// LiveMetricSample is a single point-in-time snapshot of server metrics
// collected for the web UI metrics page.
type LiveMetricSample struct {
Timestamp time.Time `json:"ts"`
Fans []FanReading `json:"fans"`
Temps []TempReading `json:"temps"`
PowerW float64 `json:"power_w"`
CPULoadPct float64 `json:"cpu_load_pct"`
MemLoadPct float64 `json:"mem_load_pct"`
GPUs []GPUMetricRow `json:"gpus"`
}
// TempReading is a named temperature sensor value.
type TempReading struct {
Name string `json:"name"`
Celsius float64 `json:"celsius"`
}
// SampleLiveMetrics collects a single metrics snapshot from all available
// sources: GPU (via nvidia-smi), fans and temperatures (via ipmitool/sensors),
// and system power (via ipmitool dcmi). Missing sources are silently skipped.
func SampleLiveMetrics() LiveMetricSample {
s := LiveMetricSample{Timestamp: time.Now().UTC()}
// GPU metrics — try NVIDIA first, fall back to AMD
if gpus, err := SampleGPUMetrics(nil); err == nil && len(gpus) > 0 {
s.GPUs = gpus
} else if amdGPUs, err := sampleAMDGPUMetrics(); err == nil && len(amdGPUs) > 0 {
s.GPUs = amdGPUs
}
// Fan speeds — skipped silently if ipmitool unavailable
fans, _ := sampleFanSpeeds()
s.Fans = fans
// CPU/system temperature — returns 0 if unavailable
cpuTemp := sampleCPUMaxTemp()
if cpuTemp > 0 {
s.Temps = append(s.Temps, TempReading{Name: "CPU", Celsius: cpuTemp})
}
// System power — returns 0 if unavailable
s.PowerW = sampleSystemPower()
// CPU load — from /proc/stat
s.CPULoadPct = sampleCPULoadPct()
// Memory load — from /proc/meminfo
s.MemLoadPct = sampleMemLoadPct()
return s
}
// sampleCPULoadPct reads two /proc/stat snapshots 200ms apart and returns
// the overall CPU utilisation percentage.
var cpuStatPrev [2]uint64 // [total, idle]
func sampleCPULoadPct() float64 {
total, idle := readCPUStat()
if total == 0 {
return 0
}
prevTotal, prevIdle := cpuStatPrev[0], cpuStatPrev[1]
cpuStatPrev = [2]uint64{total, idle}
if prevTotal == 0 {
return 0
}
dt := float64(total - prevTotal)
di := float64(idle - prevIdle)
if dt <= 0 {
return 0
}
pct := (1 - di/dt) * 100
if pct < 0 {
return 0
}
if pct > 100 {
return 100
}
return pct
}
func readCPUStat() (total, idle uint64) {
f, err := os.Open("/proc/stat")
if err != nil {
return 0, 0
}
defer f.Close()
sc := bufio.NewScanner(f)
for sc.Scan() {
line := sc.Text()
if !strings.HasPrefix(line, "cpu ") {
continue
}
fields := strings.Fields(line)[1:] // skip "cpu"
var vals [10]uint64
for i := 0; i < len(fields) && i < 10; i++ {
vals[i], _ = strconv.ParseUint(fields[i], 10, 64)
}
// idle = idle + iowait
idle = vals[3] + vals[4]
for _, v := range vals {
total += v
}
return total, idle
}
return 0, 0
}
func sampleMemLoadPct() float64 {
f, err := os.Open("/proc/meminfo")
if err != nil {
return 0
}
defer f.Close()
vals := map[string]uint64{}
sc := bufio.NewScanner(f)
for sc.Scan() {
fields := strings.Fields(sc.Text())
if len(fields) >= 2 {
v, _ := strconv.ParseUint(fields[1], 10, 64)
vals[strings.TrimSuffix(fields[0], ":")] = v
}
}
total := vals["MemTotal"]
avail := vals["MemAvailable"]
if total == 0 {
return 0
}
used := total - avail
return float64(used) / float64(total) * 100
}