release: v3.1

This commit is contained in:
2026-03-28 22:51:36 +03:00
parent 0dbfaf6121
commit 59a1d4b209
22 changed files with 1560 additions and 269 deletions

View File

@@ -1,6 +1,7 @@
package platform
import (
"context"
"encoding/json"
"fmt"
"io"
@@ -18,7 +19,7 @@ func (s *System) IsLiveMediaInRAM() bool {
return strings.TrimSpace(string(out)) == "tmpfs"
}
func (s *System) RunInstallToRAM(logFunc func(string)) error {
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
log := func(msg string) {
if logFunc != nil {
logFunc(msg)
@@ -56,10 +57,13 @@ func (s *System) RunInstallToRAM(logFunc func(string)) error {
}
for _, sf := range squashfsFiles {
if err := ctx.Err(); err != nil {
return err
}
base := filepath.Base(sf)
dst := filepath.Join(dstDir, base)
log(fmt.Sprintf("Copying %s to RAM...", base))
if err := copyFileLarge(sf, dst, log); err != nil {
if err := copyFileLarge(ctx, sf, dst, log); err != nil {
return fmt.Errorf("copy %s: %v", base, err)
}
log(fmt.Sprintf("Copied %s.", base))
@@ -77,9 +81,12 @@ func (s *System) RunInstallToRAM(logFunc func(string)) error {
}
log("Copying remaining medium files...")
if err := cpDir("/run/live/medium", dstDir, log); err != nil {
if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
log(fmt.Sprintf("Warning: partial copy: %v", err))
}
if err := ctx.Err(); err != nil {
return err
}
if err := exec.Command("mount", "--bind", dstDir, "/run/live/medium").Run(); err != nil {
log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err))
}
@@ -88,7 +95,7 @@ func (s *System) RunInstallToRAM(logFunc func(string)) error {
return nil
}
func copyFileLarge(src, dst string, logFunc func(string)) error {
func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) error {
in, err := os.Open(src)
if err != nil {
return err
@@ -107,6 +114,9 @@ func copyFileLarge(src, dst string, logFunc func(string)) error {
var copied int64
buf := make([]byte, 4*1024*1024)
for {
if err := ctx.Err(); err != nil {
return err
}
n, err := in.Read(buf)
if n > 0 {
if _, werr := out.Write(buf[:n]); werr != nil {
@@ -128,8 +138,11 @@ func copyFileLarge(src, dst string, logFunc func(string)) error {
return out.Sync()
}
func cpDir(src, dst string, logFunc func(string)) error {
func cpDir(ctx context.Context, src, dst string, logFunc func(string)) error {
return filepath.Walk(src, func(path string, fi os.FileInfo, err error) error {
if ctx.Err() != nil {
return ctx.Err()
}
if err != nil {
return nil
}
@@ -144,7 +157,7 @@ func cpDir(src, dst string, logFunc func(string)) error {
if _, err := os.Stat(target); err == nil {
return nil
}
return copyFileLarge(path, target, nil)
return copyFileLarge(ctx, path, target, nil)
})
}

View File

@@ -2,7 +2,10 @@ package platform
import (
"bufio"
"encoding/json"
"os"
"os/exec"
"sort"
"strconv"
"strings"
"time"
@@ -23,6 +26,7 @@ type LiveMetricSample struct {
// TempReading is a named temperature sensor value.
type TempReading struct {
Name string `json:"name"`
Group string `json:"group,omitempty"`
Celsius float64 `json:"celsius"`
}
@@ -43,10 +47,11 @@ func SampleLiveMetrics() LiveMetricSample {
fans, _ := sampleFanSpeeds()
s.Fans = fans
// CPU/system temperature — returns 0 if unavailable
cpuTemp := sampleCPUMaxTemp()
if cpuTemp > 0 {
s.Temps = append(s.Temps, TempReading{Name: "CPU", Celsius: cpuTemp})
s.Temps = append(s.Temps, sampleLiveTemperatureReadings()...)
if !hasTempGroup(s.Temps, "cpu") {
if cpuTemp := sampleCPUMaxTemp(); cpuTemp > 0 {
s.Temps = append(s.Temps, TempReading{Name: "CPU Max", Group: "cpu", Celsius: cpuTemp})
}
}
// System power — returns 0 if unavailable
@@ -140,3 +145,181 @@ func sampleMemLoadPct() float64 {
used := total - avail
return float64(used) / float64(total) * 100
}
func hasTempGroup(temps []TempReading, group string) bool {
for _, t := range temps {
if t.Group == group {
return true
}
}
return false
}
func sampleLiveTemperatureReadings() []TempReading {
if temps := sampleLiveTempsViaSensorsJSON(); len(temps) > 0 {
return temps
}
return sampleLiveTempsViaIPMI()
}
func sampleLiveTempsViaSensorsJSON() []TempReading {
out, err := exec.Command("sensors", "-j").Output()
if err != nil || len(out) == 0 {
return nil
}
var doc map[string]map[string]any
if err := json.Unmarshal(out, &doc); err != nil {
return nil
}
chips := make([]string, 0, len(doc))
for chip := range doc {
chips = append(chips, chip)
}
sort.Strings(chips)
temps := make([]TempReading, 0, len(chips))
seen := map[string]struct{}{}
for _, chip := range chips {
features := doc[chip]
featureNames := make([]string, 0, len(features))
for name := range features {
featureNames = append(featureNames, name)
}
sort.Strings(featureNames)
for _, name := range featureNames {
if strings.EqualFold(name, "Adapter") {
continue
}
feature, ok := features[name].(map[string]any)
if !ok {
continue
}
value, ok := firstTempInputValue(feature)
if !ok || value <= 0 || value > 150 {
continue
}
group := classifyLiveTempGroup(chip, name)
if group == "gpu" {
continue
}
label := strings.TrimSpace(name)
if label == "" {
continue
}
if group == "ambient" {
label = compactAmbientTempName(chip, label)
}
key := group + "\x00" + label
if _, ok := seen[key]; ok {
continue
}
seen[key] = struct{}{}
temps = append(temps, TempReading{Name: label, Group: group, Celsius: value})
}
}
return temps
}
func sampleLiveTempsViaIPMI() []TempReading {
out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()
if err != nil || len(out) == 0 {
return nil
}
var temps []TempReading
seen := map[string]struct{}{}
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
parts := strings.Split(line, "|")
if len(parts) < 3 {
continue
}
name := strings.TrimSpace(parts[0])
if name == "" {
continue
}
unit := strings.ToLower(strings.TrimSpace(parts[2]))
if !strings.Contains(unit, "degrees") {
continue
}
raw := strings.TrimSpace(parts[1])
if raw == "" || strings.EqualFold(raw, "na") {
continue
}
value, err := strconv.ParseFloat(raw, 64)
if err != nil || value <= 0 || value > 150 {
continue
}
group := classifyLiveTempGroup("", name)
if group == "gpu" {
continue
}
label := name
if group == "ambient" {
label = compactAmbientTempName("", label)
}
key := group + "\x00" + label
if _, ok := seen[key]; ok {
continue
}
seen[key] = struct{}{}
temps = append(temps, TempReading{Name: label, Group: group, Celsius: value})
}
return temps
}
func firstTempInputValue(feature map[string]any) (float64, bool) {
keys := make([]string, 0, len(feature))
for key := range feature {
keys = append(keys, key)
}
sort.Strings(keys)
for _, key := range keys {
lower := strings.ToLower(key)
if !strings.Contains(lower, "temp") || !strings.HasSuffix(lower, "_input") {
continue
}
switch value := feature[key].(type) {
case float64:
return value, true
case string:
f, err := strconv.ParseFloat(value, 64)
if err == nil {
return f, true
}
}
}
return 0, false
}
func classifyLiveTempGroup(chip, name string) string {
text := strings.ToLower(strings.TrimSpace(chip + " " + name))
switch {
case strings.Contains(text, "gpu"), strings.Contains(text, "amdgpu"), strings.Contains(text, "nvidia"), strings.Contains(text, "adeon"):
return "gpu"
case strings.Contains(text, "coretemp"),
strings.Contains(text, "k10temp"),
strings.Contains(text, "zenpower"),
strings.Contains(text, "package id"),
strings.Contains(text, "x86_pkg_temp"),
strings.Contains(text, "tctl"),
strings.Contains(text, "tdie"),
strings.Contains(text, "cpu"),
strings.Contains(text, "peci"):
return "cpu"
default:
return "ambient"
}
}
func compactAmbientTempName(chip, name string) string {
chip = strings.TrimSpace(chip)
name = strings.TrimSpace(name)
if chip == "" || strings.EqualFold(chip, name) {
return name
}
if strings.Contains(strings.ToLower(name), strings.ToLower(chip)) {
return name
}
return chip + " / " + name
}

View File

@@ -0,0 +1,44 @@
package platform
import "testing"
func TestFirstTempInputValue(t *testing.T) {
feature := map[string]any{
"temp1_input": 61.5,
"temp1_max": 80.0,
}
got, ok := firstTempInputValue(feature)
if !ok {
t.Fatal("expected value")
}
if got != 61.5 {
t.Fatalf("got %v want 61.5", got)
}
}
func TestClassifyLiveTempGroup(t *testing.T) {
tests := []struct {
chip string
name string
want string
}{
{chip: "coretemp-isa-0000", name: "Package id 0", want: "cpu"},
{chip: "amdgpu-pci-4300", name: "edge", want: "gpu"},
{chip: "nvme-pci-0100", name: "Composite", want: "ambient"},
{chip: "acpitz-acpi-0", name: "temp1", want: "ambient"},
}
for _, tc := range tests {
if got := classifyLiveTempGroup(tc.chip, tc.name); got != tc.want {
t.Fatalf("classifyLiveTempGroup(%q,%q)=%q want %q", tc.chip, tc.name, got, tc.want)
}
}
}
func TestCompactAmbientTempName(t *testing.T) {
if got := compactAmbientTempName("nvme-pci-0100", "Composite"); got != "nvme-pci-0100 / Composite" {
t.Fatalf("got %q", got)
}
if got := compactAmbientTempName("", "Inlet Temp"); got != "Inlet Temp" {
t.Fatalf("got %q", got)
}
}

View File

@@ -2,6 +2,7 @@ package platform
import (
"bytes"
"errors"
"fmt"
"os"
"os/exec"
@@ -18,21 +19,17 @@ func (s *System) ListInterfaces() ([]InterfaceInfo, error) {
out := make([]InterfaceInfo, 0, len(names))
for _, name := range names {
state := "unknown"
if raw, err := exec.Command("ip", "-o", "link", "show", name).Output(); err == nil {
fields := strings.Fields(string(raw))
if len(fields) >= 9 {
state = fields[8]
if up, err := interfaceAdminState(name); err == nil {
if up {
state = "up"
} else {
state = "down"
}
}
var ipv4 []string
if raw, err := exec.Command("ip", "-o", "-4", "addr", "show", "dev", name).Output(); err == nil {
for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
fields := strings.Fields(line)
if len(fields) >= 4 {
ipv4 = append(ipv4, fields[3])
}
}
ipv4, err := interfaceIPv4Addrs(name)
if err != nil {
ipv4 = nil
}
out = append(out, InterfaceInfo{Name: name, State: state, IPv4: ipv4})
@@ -55,6 +52,109 @@ func (s *System) DefaultRoute() string {
return ""
}
func (s *System) CaptureNetworkSnapshot() (NetworkSnapshot, error) {
names, err := listInterfaceNames()
if err != nil {
return NetworkSnapshot{}, err
}
snapshot := NetworkSnapshot{
Interfaces: make([]NetworkInterfaceSnapshot, 0, len(names)),
}
for _, name := range names {
up, err := interfaceAdminState(name)
if err != nil {
return NetworkSnapshot{}, err
}
ipv4, err := interfaceIPv4Addrs(name)
if err != nil {
return NetworkSnapshot{}, err
}
snapshot.Interfaces = append(snapshot.Interfaces, NetworkInterfaceSnapshot{
Name: name,
Up: up,
IPv4: ipv4,
})
}
if raw, err := exec.Command("ip", "route", "show", "default").Output(); err == nil {
for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
line = strings.TrimSpace(line)
if line != "" {
snapshot.DefaultRoutes = append(snapshot.DefaultRoutes, line)
}
}
}
if raw, err := os.ReadFile("/etc/resolv.conf"); err == nil {
snapshot.ResolvConf = string(raw)
}
return snapshot, nil
}
func (s *System) RestoreNetworkSnapshot(snapshot NetworkSnapshot) error {
var errs []string
for _, iface := range snapshot.Interfaces {
if err := exec.Command("ip", "link", "set", "dev", iface.Name, "up").Run(); err != nil {
errs = append(errs, fmt.Sprintf("%s: bring up before restore: %v", iface.Name, err))
continue
}
if err := exec.Command("ip", "addr", "flush", "dev", iface.Name).Run(); err != nil {
errs = append(errs, fmt.Sprintf("%s: flush addresses: %v", iface.Name, err))
}
for _, cidr := range iface.IPv4 {
if raw, err := exec.Command("ip", "addr", "add", cidr, "dev", iface.Name).CombinedOutput(); err != nil {
detail := strings.TrimSpace(string(raw))
if detail != "" {
errs = append(errs, fmt.Sprintf("%s: restore address %s: %v: %s", iface.Name, cidr, err, detail))
} else {
errs = append(errs, fmt.Sprintf("%s: restore address %s: %v", iface.Name, cidr, err))
}
}
}
state := "down"
if iface.Up {
state = "up"
}
if err := exec.Command("ip", "link", "set", "dev", iface.Name, state).Run(); err != nil {
errs = append(errs, fmt.Sprintf("%s: restore state %s: %v", iface.Name, state, err))
}
}
if err := exec.Command("ip", "route", "del", "default").Run(); err != nil {
var exitErr *exec.ExitError
if !errors.As(err, &exitErr) {
errs = append(errs, fmt.Sprintf("clear default route: %v", err))
}
}
for _, route := range snapshot.DefaultRoutes {
fields := strings.Fields(route)
if len(fields) == 0 {
continue
}
args := append([]string{"route", "add"}, fields...)
if raw, err := exec.Command("ip", args...).CombinedOutput(); err != nil {
detail := strings.TrimSpace(string(raw))
if detail != "" {
errs = append(errs, fmt.Sprintf("restore route %q: %v: %s", route, err, detail))
} else {
errs = append(errs, fmt.Sprintf("restore route %q: %v", route, err))
}
}
}
if err := os.WriteFile("/etc/resolv.conf", []byte(snapshot.ResolvConf), 0644); err != nil {
errs = append(errs, fmt.Sprintf("restore resolv.conf: %v", err))
}
if len(errs) > 0 {
return errors.New(strings.Join(errs, "; "))
}
return nil
}
func (s *System) DHCPOne(iface string) (string, error) {
var out bytes.Buffer
if err := exec.Command("ip", "link", "set", iface, "up").Run(); err != nil {
@@ -142,12 +242,52 @@ func (s *System) SetInterfaceState(iface string, up bool) error {
// GetInterfaceState returns true if the interface is UP.
func (s *System) GetInterfaceState(iface string) (bool, error) {
raw, err := os.ReadFile(fmt.Sprintf("/sys/class/net/%s/operstate", iface))
return interfaceAdminState(iface)
}
func interfaceAdminState(iface string) (bool, error) {
raw, err := exec.Command("ip", "-o", "link", "show", "dev", iface).Output()
if err != nil {
return false, err
}
state := strings.TrimSpace(string(raw))
return state == "up", nil
return parseInterfaceAdminState(string(raw))
}
func parseInterfaceAdminState(raw string) (bool, error) {
start := strings.IndexByte(raw, '<')
if start == -1 {
return false, fmt.Errorf("ip link output missing flags")
}
end := strings.IndexByte(raw[start+1:], '>')
if end == -1 {
return false, fmt.Errorf("ip link output missing flag terminator")
}
flags := strings.Split(raw[start+1:start+1+end], ",")
for _, flag := range flags {
if strings.TrimSpace(flag) == "UP" {
return true, nil
}
}
return false, nil
}
func interfaceIPv4Addrs(iface string) ([]string, error) {
raw, err := exec.Command("ip", "-o", "-4", "addr", "show", "dev", iface).Output()
if err != nil {
var exitErr *exec.ExitError
if errors.As(err, &exitErr) {
return nil, nil
}
return nil, err
}
var ipv4 []string
for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
fields := strings.Fields(line)
if len(fields) >= 4 {
ipv4 = append(ipv4, fields[3])
}
}
return ipv4, nil
}
func listInterfaceNames() ([]string, error) {

View File

@@ -0,0 +1,46 @@
package platform
import "testing"
func TestParseInterfaceAdminState(t *testing.T) {
tests := []struct {
name string
raw string
want bool
wantErr bool
}{
{
name: "admin up with no carrier",
raw: "2: enp1s0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc mq state DOWN mode DEFAULT group default qlen 1000\n",
want: true,
},
{
name: "admin down",
raw: "2: enp1s0: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000\n",
want: false,
},
{
name: "malformed output",
raw: "2: enp1s0: mtu 1500 state DOWN\n",
wantErr: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := parseInterfaceAdminState(tt.raw)
if tt.wantErr {
if err == nil {
t.Fatal("expected error")
}
return
}
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if got != tt.want {
t.Fatalf("got %v want %v", got, tt.want)
}
})
}
}

View File

@@ -90,6 +90,12 @@ func (s *System) DetectGPUVendor() string {
if _, err := os.Stat("/dev/kfd"); err == nil {
return "amd"
}
if raw, err := exec.Command("lspci", "-nn").Output(); err == nil {
text := strings.ToLower(string(raw))
if strings.Contains(text, "advanced micro devices") || strings.Contains(text, "amd/ati") {
return "amd"
}
}
return ""
}
@@ -117,8 +123,8 @@ func (s *System) ListAMDGPUs() ([]AMDGPUInfo, error) {
}
// RunAMDAcceptancePack runs an AMD GPU diagnostic pack using rocm-smi.
func (s *System) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
return runAcceptancePack(baseDir, "gpu-amd", []satJob{
func (s *System) RunAMDAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd", []satJob{
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
{name: "02-rocm-smi-showallinfo.log", cmd: []string{"rocm-smi", "--showallinfo"}},
{name: "03-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
@@ -128,14 +134,20 @@ func (s *System) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (str
// RunAMDStressPack runs an AMD GPU burn-in pack.
// Missing tools are reported as UNSUPPORTED, consistent with the existing SAT pattern.
func (s *System) RunAMDStressPack(baseDir string, logFunc func(string)) (string, error) {
seconds := envInt("BEE_AMD_STRESS_SECONDS", 300)
return runAcceptancePack(baseDir, "gpu-amd-stress", []satJob{
func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
seconds := durationSec
if seconds <= 0 {
seconds = envInt("BEE_AMD_STRESS_SECONDS", 300)
}
if err := ensureAMDRuntimeReady(); err != nil {
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-stress", []satJob{
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
{name: fmt.Sprintf("03-rocm-smi-monitor-%ds.log", seconds), cmd: []string{
"rocm-smi", "--showtemp", "--showpower",
fmt.Sprintf("--duration=%d", seconds),
"bash", "-lc",
fmt.Sprintf("end=$((SECONDS+%d)); while [ \"$SECONDS\" -lt \"$end\" ]; do rocm-smi --showtemp --showpower --csv; sleep 1; done", seconds),
}},
}, logFunc)
}
@@ -191,7 +203,7 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
}
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs(), logFunc)
return runAcceptancePackCtx(context.Background(), baseDir, "gpu-nvidia", nvidiaSATJobs(), logFunc)
}
// RunNvidiaAcceptancePackWithOptions runs the NVIDIA diagnostics via DCGM.
@@ -202,24 +214,27 @@ func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices), logFunc)
}
func (s *System) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
passes := envInt("BEE_MEMTESTER_PASSES", 1)
return runAcceptancePack(baseDir, "memory", []satJob{
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
}, logFunc)
}
func (s *System) RunMemoryStressPack(baseDir string, logFunc func(string)) (string, error) {
seconds := envInt("BEE_VM_STRESS_SECONDS", 300)
func (s *System) RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
seconds := durationSec
if seconds <= 0 {
seconds = envInt("BEE_VM_STRESS_SECONDS", 300)
}
// Use 80% of RAM by default; override with BEE_VM_STRESS_SIZE_MB.
sizeArg := "80%"
if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
sizeArg = fmt.Sprintf("%dM", mb)
}
return runAcceptancePack(baseDir, "memory-stress", []satJob{
return runAcceptancePackCtx(ctx, baseDir, "memory-stress", []satJob{
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
{name: "02-stress-ng-vm.log", cmd: []string{
"stress-ng", "--vm", "1",
@@ -232,24 +247,27 @@ func (s *System) RunMemoryStressPack(baseDir string, logFunc func(string)) (stri
}, logFunc)
}
func (s *System) RunSATStressPack(baseDir string, logFunc func(string)) (string, error) {
seconds := envInt("BEE_SAT_STRESS_SECONDS", 300)
func (s *System) RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
seconds := durationSec
if seconds <= 0 {
seconds = envInt("BEE_SAT_STRESS_SECONDS", 300)
}
cmd := []string{"stressapptest", "-s", fmt.Sprintf("%d", seconds), "-W", "--cc_test"}
if mb := envInt("BEE_SAT_STRESS_MB", 0); mb > 0 {
cmd = append(cmd, "-M", fmt.Sprintf("%d", mb))
}
return runAcceptancePack(baseDir, "sat-stress", []satJob{
return runAcceptancePackCtx(ctx, baseDir, "sat-stress", []satJob{
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
{name: "02-stressapptest.log", cmd: cmd},
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
}, logFunc)
}
func (s *System) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
func (s *System) RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
if durationSec <= 0 {
durationSec = 60
}
return runAcceptancePack(baseDir, "cpu", []satJob{
return runAcceptancePackCtx(ctx, baseDir, "cpu", []satJob{
{name: "01-lscpu.log", cmd: []string{"lscpu"}},
{name: "02-sensors-before.log", cmd: []string{"sensors"}},
{name: "03-stress-ng.log", cmd: []string{"stress-ng", "--cpu", "0", "--cpu-method", "all", "--timeout", fmt.Sprintf("%d", durationSec)}},
@@ -257,7 +275,7 @@ func (s *System) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc f
}, logFunc)
}
func (s *System) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
if baseDir == "" {
baseDir = "/var/log/bee-sat"
}
@@ -285,11 +303,17 @@ func (s *System) RunStorageAcceptancePack(baseDir string, logFunc func(string))
}
for index, devPath := range devices {
if ctx.Err() != nil {
break
}
prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
commands := storageSATCommands(devPath)
for cmdIndex, job := range commands {
if ctx.Err() != nil {
break
}
name := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+1, job.name)
out, err := runSATCommand(verboseLog, job.name, job.cmd, logFunc)
out, err := runSATCommandCtx(ctx, verboseLog, job.name, job.cmd, nil, logFunc)
if writeErr := os.WriteFile(filepath.Join(runDir, name), out, 0644); writeErr != nil {
return "", writeErr
}
@@ -338,49 +362,6 @@ func nvidiaSATJobs() []satJob {
}
}
func runAcceptancePack(baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
if baseDir == "" {
baseDir = "/var/log/bee-sat"
}
ts := time.Now().UTC().Format("20060102-150405")
runDir := filepath.Join(baseDir, prefix+"-"+ts)
if err := os.MkdirAll(runDir, 0755); err != nil {
return "", err
}
verboseLog := filepath.Join(runDir, "verbose.log")
var summary strings.Builder
stats := satStats{}
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
for _, job := range jobs {
var out []byte
var err error
cmd := make([]string, 0, len(job.cmd))
for _, arg := range job.cmd {
cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
}
out, err = runSATCommand(verboseLog, job.name, cmd, logFunc)
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
return "", writeErr
}
status, rc := classifySATResult(job.name, out, err)
stats.Add(status)
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
}
writeSATStats(&summary, stats)
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
return "", err
}
archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
if err := createTarGz(archive, runDir); err != nil {
return "", err
}
return archive, nil
}
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
if diagLevel < 1 || diagLevel > 4 {
diagLevel = 3
@@ -402,6 +383,9 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
}
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
if ctx == nil {
ctx = context.Background()
}
if baseDir == "" {
baseDir = "/var/log/bee-sat"
}
@@ -649,6 +633,20 @@ func resolveROCmSMICommand(args ...string) ([]string, error) {
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
}
func ensureAMDRuntimeReady() error {
if _, err := os.Stat("/dev/kfd"); err == nil {
return nil
}
if raw, err := os.ReadFile("/sys/module/amdgpu/initstate"); err == nil {
state := strings.TrimSpace(string(raw))
if strings.EqualFold(state, "live") {
return nil
}
return fmt.Errorf("AMD driver is present but not initialized: amdgpu initstate=%q", state)
}
return errors.New("AMD GPUs are present but the runtime is not initialized: /dev/kfd is missing and amdgpu is not loaded")
}
func rocmSMIExecutableCandidates() []string {
return expandExistingPaths(rocmSMIExecutableGlobs)
}

View File

@@ -2,10 +2,12 @@ package platform
import (
"context"
"encoding/json"
"fmt"
"os"
"os/exec"
"path/filepath"
"sort"
"strconv"
"strings"
"sync"
@@ -304,10 +306,19 @@ func sampleGPUStressMetrics(gpuIndices []int) []GPUStressMetric {
// sampleFanSpeeds reads fan RPM values from ipmitool sdr.
func sampleFanSpeeds() ([]FanReading, error) {
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
if err == nil {
if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
return fans, nil
}
}
fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
if len(fans) > 0 {
return fans, nil
}
if err != nil {
return nil, err
}
return parseFanSpeeds(string(out)), nil
return nil, sensorsErr
}
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
@@ -316,18 +327,21 @@ func parseFanSpeeds(raw string) []FanReading {
var fans []FanReading
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
parts := strings.Split(line, "|")
if len(parts) < 3 {
if len(parts) < 2 {
continue
}
unit := strings.TrimSpace(parts[2])
if !strings.EqualFold(unit, "RPM") {
continue
unit := ""
if len(parts) >= 3 {
unit = strings.TrimSpace(parts[2])
}
valStr := strings.TrimSpace(parts[1])
if !strings.EqualFold(unit, "RPM") && !strings.Contains(strings.ToUpper(valStr), "RPM") {
continue
}
if strings.EqualFold(valStr, "na") || strings.EqualFold(valStr, "disabled") || valStr == "" {
continue
}
val, err := strconv.ParseFloat(valStr, 64)
val, err := parseFanRPMValue(valStr)
if err != nil {
continue
}
@@ -339,6 +353,84 @@ func parseFanSpeeds(raw string) []FanReading {
return fans
}
func parseFanRPMValue(raw string) (float64, error) {
fields := strings.Fields(strings.TrimSpace(strings.ReplaceAll(raw, ",", "")))
if len(fields) == 0 {
return 0, strconv.ErrSyntax
}
return strconv.ParseFloat(fields[0], 64)
}
func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
out, err := exec.Command("sensors", "-j").Output()
if err != nil || len(out) == 0 {
return nil, err
}
var doc map[string]map[string]any
if err := json.Unmarshal(out, &doc); err != nil {
return nil, err
}
chips := make([]string, 0, len(doc))
for chip := range doc {
chips = append(chips, chip)
}
sort.Strings(chips)
var fans []FanReading
seen := map[string]struct{}{}
for _, chip := range chips {
features := doc[chip]
names := make([]string, 0, len(features))
for name := range features {
names = append(names, name)
}
sort.Strings(names)
for _, name := range names {
feature, ok := features[name].(map[string]any)
if !ok {
continue
}
rpm, ok := firstFanInputValue(feature)
if !ok || rpm <= 0 {
continue
}
label := strings.TrimSpace(name)
if chip != "" && !strings.Contains(strings.ToLower(label), strings.ToLower(chip)) {
label = chip + " / " + label
}
if _, ok := seen[label]; ok {
continue
}
seen[label] = struct{}{}
fans = append(fans, FanReading{Name: label, RPM: rpm})
}
}
return fans, nil
}
func firstFanInputValue(feature map[string]any) (float64, bool) {
keys := make([]string, 0, len(feature))
for key := range feature {
keys = append(keys, key)
}
sort.Strings(keys)
for _, key := range keys {
lower := strings.ToLower(key)
if !strings.Contains(lower, "fan") || !strings.HasSuffix(lower, "_input") {
continue
}
switch value := feature[key].(type) {
case float64:
return value, true
case string:
f, err := strconv.ParseFloat(value, 64)
if err == nil {
return f, true
}
}
}
return 0, false
}
// sampleCPUMaxTemp returns the highest CPU/inlet temperature from ipmitool or sensors.
func sampleCPUMaxTemp() float64 {
out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()

View File

@@ -0,0 +1,27 @@
package platform
import "testing"
func TestParseFanSpeeds(t *testing.T) {
raw := "FAN1 | 2400.000 | RPM | ok\nFAN2 | 1800 RPM | ok | ok\nFAN3 | na | RPM | ns\n"
got := parseFanSpeeds(raw)
if len(got) != 2 {
t.Fatalf("fans=%d want 2 (%v)", len(got), got)
}
if got[0].Name != "FAN1" || got[0].RPM != 2400 {
t.Fatalf("fan0=%+v", got[0])
}
if got[1].Name != "FAN2" || got[1].RPM != 1800 {
t.Fatalf("fan1=%+v", got[1])
}
}
func TestFirstFanInputValue(t *testing.T) {
feature := map[string]any{
"fan1_input": 9200.0,
}
got, ok := firstFanInputValue(feature)
if !ok || got != 9200 {
t.Fatalf("got=%v ok=%v", got, ok)
}
}

View File

@@ -8,6 +8,18 @@ type InterfaceInfo struct {
IPv4 []string
}
type NetworkInterfaceSnapshot struct {
Name string
Up bool
IPv4 []string
}
type NetworkSnapshot struct {
Interfaces []NetworkInterfaceSnapshot
DefaultRoutes []string
ResolvConf string
}
type ServiceAction string
const (