Compare commits
18 Commits
1e62f828c6
...
iso/v1.0.1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b132f7973a | ||
|
|
bd94b6c792 | ||
|
|
06017eddfd | ||
|
|
0ac7b6a963 | ||
|
|
3d2ae4cdcb | ||
|
|
4669f14f4f | ||
|
|
540a9e39b8 | ||
|
|
58510207fa | ||
|
|
4cd7c9ab4e | ||
|
|
cfe255f6e4 | ||
|
|
8b9d3447d7 | ||
|
|
614b7cad61 | ||
|
|
9a1df9b1ba | ||
|
|
30cf014d58 | ||
|
|
27d478aed6 | ||
|
|
d36e8442a9 | ||
|
|
b345b0d14d | ||
|
|
0a1ac2ab9f |
@@ -80,6 +80,7 @@ type satRunner interface {
|
||||
DetectGPUVendor() string
|
||||
ListAMDGPUs() ([]platform.AMDGPUInfo, error)
|
||||
RunAMDAcceptancePack(baseDir string) (string, error)
|
||||
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
||||
}
|
||||
|
||||
type runtimeChecker interface {
|
||||
@@ -105,6 +106,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
||||
}
|
||||
}
|
||||
result := collector.Run(runtimeMode)
|
||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir)
|
||||
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
||||
result.Runtime = &health
|
||||
}
|
||||
@@ -173,11 +175,20 @@ func (a *App) RuntimeHealthResult() ActionResult {
|
||||
if err != nil {
|
||||
return ActionResult{Title: "Runtime issues", Body: "No runtime health found."}
|
||||
}
|
||||
driverLabel := "Driver ready"
|
||||
accelLabel := "CUDA ready"
|
||||
switch a.sat.DetectGPUVendor() {
|
||||
case "amd":
|
||||
driverLabel = "AMDGPU ready"
|
||||
accelLabel = "ROCm SMI ready"
|
||||
case "nvidia":
|
||||
driverLabel = "NVIDIA ready"
|
||||
}
|
||||
var body strings.Builder
|
||||
fmt.Fprintf(&body, "Status: %s\n", firstNonEmpty(health.Status, "UNKNOWN"))
|
||||
fmt.Fprintf(&body, "Export dir: %s\n", firstNonEmpty(health.ExportDir, DefaultExportDir))
|
||||
fmt.Fprintf(&body, "Driver ready: %t\n", health.DriverReady)
|
||||
fmt.Fprintf(&body, "CUDA ready: %t\n", health.CUDAReady)
|
||||
fmt.Fprintf(&body, "%s: %t\n", driverLabel, health.DriverReady)
|
||||
fmt.Fprintf(&body, "%s: %t\n", accelLabel, health.CUDAReady)
|
||||
fmt.Fprintf(&body, "Network: %s", firstNonEmpty(health.NetworkStatus, "UNKNOWN"))
|
||||
if len(health.Issues) > 0 {
|
||||
body.WriteString("\n\nIssues:\n")
|
||||
@@ -238,9 +249,9 @@ func (a *App) ExportSupportBundle(target platform.RemovableTarget) (string, erro
|
||||
|
||||
func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) {
|
||||
path, err := a.ExportSupportBundle(target)
|
||||
body := "Support bundle exported."
|
||||
body := "Support bundle exported. USB target unmounted and safe to remove."
|
||||
if path != "" {
|
||||
body = "Support bundle exported to " + path
|
||||
body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove."
|
||||
}
|
||||
return ActionResult{Title: "Export support bundle", Body: body}, err
|
||||
}
|
||||
@@ -481,6 +492,67 @@ func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunFanStressTest(ctx, baseDir, opts)
|
||||
}
|
||||
|
||||
func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
|
||||
path, err := a.RunFanStressTest(ctx, "", opts)
|
||||
body := formatFanStressResult(path)
|
||||
if err != nil && err != context.Canceled {
|
||||
body += "\nERROR: " + err.Error()
|
||||
}
|
||||
return ActionResult{Title: "GPU Platform Stress Test", Body: body}, err
|
||||
}
|
||||
|
||||
// formatFanStressResult formats the summary.txt from a fan-stress run, including
|
||||
// the per-step pass/fail display and the analysis section (throttling, max temps, fan response).
|
||||
func formatFanStressResult(archivePath string) string {
|
||||
if archivePath == "" {
|
||||
return "No output produced."
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return "Archive written to " + archivePath
|
||||
}
|
||||
content := strings.TrimSpace(string(raw))
|
||||
kv := parseKeyValueSummary(content)
|
||||
|
||||
var b strings.Builder
|
||||
b.WriteString(formatSATDetail(content))
|
||||
|
||||
// Append analysis section.
|
||||
var analysis []string
|
||||
if v, ok := kv["throttling_detected"]; ok {
|
||||
label := "NO"
|
||||
if v == "true" {
|
||||
label = "YES ← throttling detected during load"
|
||||
}
|
||||
analysis = append(analysis, "Throttling: "+label)
|
||||
}
|
||||
if v, ok := kv["max_gpu_temp_c"]; ok && v != "0.0" {
|
||||
analysis = append(analysis, "Max GPU temp: "+v+"°C")
|
||||
}
|
||||
if v, ok := kv["max_cpu_temp_c"]; ok && v != "0.0" {
|
||||
analysis = append(analysis, "Max CPU temp: "+v+"°C")
|
||||
}
|
||||
if v, ok := kv["fan_response_sec"]; ok && v != "N/A" && v != "-1.0" {
|
||||
analysis = append(analysis, "Fan response: "+v+"s")
|
||||
}
|
||||
|
||||
if len(analysis) > 0 {
|
||||
b.WriteString("\n\n=== Analysis ===\n")
|
||||
for _, line := range analysis {
|
||||
b.WriteString(line + "\n")
|
||||
}
|
||||
}
|
||||
return strings.TrimSpace(b.String())
|
||||
}
|
||||
|
||||
// satResultBody reads summary.txt from the SAT run directory (archive path without .tar.gz)
|
||||
// and returns a formatted human-readable result. Falls back to a plain message if unreadable.
|
||||
func satResultBody(archivePath string) string {
|
||||
|
||||
@@ -1,9 +1,12 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
@@ -57,13 +60,22 @@ func (f fakeServices) ServiceDo(name string, action platform.ServiceAction) (str
|
||||
return f.serviceDoFn(name, action)
|
||||
}
|
||||
|
||||
type fakeExports struct{}
|
||||
type fakeExports struct {
|
||||
listTargetsFn func() ([]platform.RemovableTarget, error)
|
||||
exportToTargetFn func(string, platform.RemovableTarget) (string, error)
|
||||
}
|
||||
|
||||
func (f fakeExports) ListRemovableTargets() ([]platform.RemovableTarget, error) {
|
||||
if f.listTargetsFn != nil {
|
||||
return f.listTargetsFn()
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f fakeExports) ExportFileToTarget(src string, target platform.RemovableTarget) (string, error) {
|
||||
if f.exportToTargetFn != nil {
|
||||
return f.exportToTargetFn(src, target)
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
@@ -97,10 +109,14 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
|
||||
}
|
||||
|
||||
type fakeSAT struct {
|
||||
runNvidiaFn func(string) (string, error)
|
||||
runMemoryFn func(string) (string, error)
|
||||
runStorageFn func(string) (string, error)
|
||||
runCPUFn func(string, int) (string, error)
|
||||
runNvidiaFn func(string) (string, error)
|
||||
runMemoryFn func(string) (string, error)
|
||||
runStorageFn func(string) (string, error)
|
||||
runCPUFn func(string, int) (string, error)
|
||||
detectVendorFn func() string
|
||||
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
||||
runAMDPackFn func(string) (string, error)
|
||||
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
||||
@@ -112,6 +128,9 @@ func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir s
|
||||
}
|
||||
|
||||
func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||
if f.listNvidiaGPUsFn != nil {
|
||||
return f.listNvidiaGPUsFn()
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
@@ -130,11 +149,30 @@ func (f fakeSAT) RunCPUAcceptancePack(baseDir string, durationSec int) (string,
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) DetectGPUVendor() string { return "" }
|
||||
func (f fakeSAT) DetectGPUVendor() string {
|
||||
if f.detectVendorFn != nil {
|
||||
return f.detectVendorFn()
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (f fakeSAT) ListAMDGPUs() ([]platform.AMDGPUInfo, error) { return nil, nil }
|
||||
func (f fakeSAT) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
|
||||
if f.listAMDGPUsFn != nil {
|
||||
return f.listAMDGPUsFn()
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunAMDAcceptancePack(baseDir string) (string, error) { return "", nil }
|
||||
func (f fakeSAT) RunAMDAcceptancePack(baseDir string) (string, error) {
|
||||
if f.runAMDPackFn != nil {
|
||||
return f.runAMDPackFn(baseDir)
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStressOptions) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
||||
t.Parallel()
|
||||
@@ -394,6 +432,44 @@ func TestActionResultsUseFallbackBody(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tmp := t.TempDir()
|
||||
oldExportDir := DefaultExportDir
|
||||
DefaultExportDir = tmp
|
||||
t.Cleanup(func() { DefaultExportDir = oldExportDir })
|
||||
|
||||
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.json"), []byte("{}\n"), 0644); err != nil {
|
||||
t.Fatalf("write bee-audit.json: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.log"), []byte("audit ok\n"), 0644); err != nil {
|
||||
t.Fatalf("write bee-audit.log: %v", err)
|
||||
}
|
||||
|
||||
a := &App{
|
||||
exports: fakeExports{
|
||||
exportToTargetFn: func(src string, target platform.RemovableTarget) (string, error) {
|
||||
if filepath.Base(src) == "" {
|
||||
t.Fatalf("expected non-empty source path")
|
||||
}
|
||||
return "/media/bee/" + filepath.Base(src), nil
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := a.ExportSupportBundleResult(platform.RemovableTarget{Device: "/dev/sdb1"})
|
||||
if err != nil {
|
||||
t.Fatalf("ExportSupportBundleResult error: %v", err)
|
||||
}
|
||||
if result.Title != "Export support bundle" {
|
||||
t.Fatalf("title=%q want %q", result.Title, "Export support bundle")
|
||||
}
|
||||
if want := "USB target unmounted and safe to remove."; !contains(result.Body, want) {
|
||||
t.Fatalf("body missing %q\nbody=%s", want, result.Body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunNvidiaAcceptancePackResult(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -516,6 +592,9 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
archive, err := BuildSupportBundle(exportDir)
|
||||
if err != nil {
|
||||
@@ -524,6 +603,44 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
if _, err := os.Stat(archive); err != nil {
|
||||
t.Fatalf("archive stat: %v", err)
|
||||
}
|
||||
|
||||
file, err := os.Open(archive)
|
||||
if err != nil {
|
||||
t.Fatalf("open archive: %v", err)
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
gzr, err := gzip.NewReader(file)
|
||||
if err != nil {
|
||||
t.Fatalf("gzip reader: %v", err)
|
||||
}
|
||||
defer gzr.Close()
|
||||
|
||||
tr := tar.NewReader(gzr)
|
||||
var names []string
|
||||
for {
|
||||
hdr, err := tr.Next()
|
||||
if errors.Is(err, io.EOF) {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatalf("read tar entry: %v", err)
|
||||
}
|
||||
names = append(names, hdr.Name)
|
||||
}
|
||||
|
||||
var foundRaw bool
|
||||
for _, name := range names {
|
||||
if contains(name, "/export/bee-sat/memory-run/verbose.log") {
|
||||
foundRaw = true
|
||||
}
|
||||
if contains(name, "/export/bee-sat/memory-run.tar.gz") {
|
||||
t.Fatalf("support bundle should not contain nested SAT archive: %s", name)
|
||||
}
|
||||
}
|
||||
if !foundRaw {
|
||||
t.Fatalf("support bundle missing raw SAT log, names=%v", names)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMainBanner(t *testing.T) {
|
||||
@@ -600,6 +717,44 @@ func TestMainBanner(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestRuntimeHealthResultUsesAMDLabels(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
oldRuntimePath := DefaultRuntimeJSONPath
|
||||
DefaultRuntimeJSONPath = filepath.Join(tmp, "runtime-health.json")
|
||||
t.Cleanup(func() { DefaultRuntimeJSONPath = oldRuntimePath })
|
||||
|
||||
raw, err := json.Marshal(schema.RuntimeHealth{
|
||||
Status: "OK",
|
||||
ExportDir: "/appdata/bee/export",
|
||||
DriverReady: true,
|
||||
CUDAReady: true,
|
||||
NetworkStatus: "OK",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("marshal runtime health: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(DefaultRuntimeJSONPath, raw, 0644); err != nil {
|
||||
t.Fatalf("write runtime health: %v", err)
|
||||
}
|
||||
|
||||
a := &App{
|
||||
sat: fakeSAT{
|
||||
detectVendorFn: func() string { return "amd" },
|
||||
},
|
||||
}
|
||||
|
||||
result := a.RuntimeHealthResult()
|
||||
if !contains(result.Body, "AMDGPU ready: true") {
|
||||
t.Fatalf("body missing AMD driver label:\n%s", result.Body)
|
||||
}
|
||||
if !contains(result.Body, "ROCm SMI ready: true") {
|
||||
t.Fatalf("body missing ROCm label:\n%s", result.Body)
|
||||
}
|
||||
if contains(result.Body, "CUDA ready") {
|
||||
t.Fatalf("body should not mention CUDA on AMD:\n%s", result.Body)
|
||||
}
|
||||
}
|
||||
|
||||
func intPtr(v int) *int { return &v }
|
||||
|
||||
func contains(haystack, needle string) bool {
|
||||
|
||||
214
audit/internal/app/sat_overlay.go
Normal file
214
audit/internal/app/sat_overlay.go
Normal file
@@ -0,0 +1,214 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
|
||||
if snap == nil || strings.TrimSpace(baseDir) == "" {
|
||||
return
|
||||
}
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "gpu-amd-"); ok {
|
||||
applyGPUVendorSAT(snap.PCIeDevices, "amd", summary)
|
||||
}
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "gpu-nvidia-"); ok {
|
||||
applyGPUVendorSAT(snap.PCIeDevices, "nvidia", summary)
|
||||
}
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "memory-"); ok {
|
||||
applyMemorySAT(snap.Memory, summary)
|
||||
}
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "cpu-"); ok {
|
||||
applyCPUSAT(snap.CPUs, summary)
|
||||
}
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
|
||||
applyStorageSAT(snap.Storage, summary)
|
||||
}
|
||||
}
|
||||
|
||||
type satSummary struct {
|
||||
runAtUTC string
|
||||
overall string
|
||||
kv map[string]string
|
||||
}
|
||||
|
||||
func loadLatestSATSummary(baseDir, prefix string) (satSummary, bool) {
|
||||
matches, err := filepath.Glob(filepath.Join(baseDir, prefix+"*/summary.txt"))
|
||||
if err != nil || len(matches) == 0 {
|
||||
return satSummary{}, false
|
||||
}
|
||||
sort.Strings(matches)
|
||||
raw, err := os.ReadFile(matches[len(matches)-1])
|
||||
if err != nil {
|
||||
return satSummary{}, false
|
||||
}
|
||||
kv := parseKeyValueSummary(string(raw))
|
||||
return satSummary{
|
||||
runAtUTC: strings.TrimSpace(kv["run_at_utc"]),
|
||||
overall: strings.ToUpper(strings.TrimSpace(kv["overall_status"])),
|
||||
kv: kv,
|
||||
}, true
|
||||
}
|
||||
|
||||
func applyGPUVendorSAT(devs []schema.HardwarePCIeDevice, vendor string, summary satSummary) {
|
||||
status, description, ok := satSummaryStatus(summary, vendor+" GPU SAT")
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
for i := range devs {
|
||||
if !matchesGPUVendor(devs[i], vendor) {
|
||||
continue
|
||||
}
|
||||
mergeComponentStatus(&devs[i].HardwareComponentStatus, summary.runAtUTC, status, description)
|
||||
}
|
||||
}
|
||||
|
||||
func applyMemorySAT(dimms []schema.HardwareMemory, summary satSummary) {
|
||||
status, description, ok := satSummaryStatus(summary, "memory SAT")
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
for i := range dimms {
|
||||
mergeComponentStatus(&dimms[i].HardwareComponentStatus, summary.runAtUTC, status, description)
|
||||
}
|
||||
}
|
||||
|
||||
func applyCPUSAT(cpus []schema.HardwareCPU, summary satSummary) {
|
||||
status, description, ok := satSummaryStatus(summary, "CPU SAT")
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
for i := range cpus {
|
||||
mergeComponentStatus(&cpus[i].HardwareComponentStatus, summary.runAtUTC, status, description)
|
||||
}
|
||||
}
|
||||
|
||||
func applyStorageSAT(disks []schema.HardwareStorage, summary satSummary) {
|
||||
byDevice := parseStorageSATStatus(summary)
|
||||
for i := range disks {
|
||||
devPath, _ := disks[i].Telemetry["linux_device"].(string)
|
||||
devName := filepath.Base(strings.TrimSpace(devPath))
|
||||
if devName == "" {
|
||||
continue
|
||||
}
|
||||
result, ok := byDevice[devName]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
mergeComponentStatus(&disks[i].HardwareComponentStatus, summary.runAtUTC, result.status, result.description)
|
||||
}
|
||||
}
|
||||
|
||||
type satStatusResult struct {
|
||||
status string
|
||||
description string
|
||||
ok bool
|
||||
}
|
||||
|
||||
func parseStorageSATStatus(summary satSummary) map[string]satStatusResult {
|
||||
result := map[string]satStatusResult{}
|
||||
for key, value := range summary.kv {
|
||||
if !strings.HasSuffix(key, "_status") || key == "overall_status" {
|
||||
continue
|
||||
}
|
||||
base := strings.TrimSuffix(key, "_status")
|
||||
idx := strings.Index(base, "_")
|
||||
if idx <= 0 {
|
||||
continue
|
||||
}
|
||||
devName := base[:idx]
|
||||
step := strings.ReplaceAll(base[idx+1:], "_", "-")
|
||||
stepStatus, desc, ok := satKeyStatus(strings.ToUpper(strings.TrimSpace(value)), "storage "+step)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
current := result[devName]
|
||||
if !current.ok || statusSeverity(stepStatus) > statusSeverity(current.status) {
|
||||
result[devName] = satStatusResult{status: stepStatus, description: desc, ok: true}
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func satSummaryStatus(summary satSummary, label string) (string, string, bool) {
|
||||
return satKeyStatus(summary.overall, label)
|
||||
}
|
||||
|
||||
func satKeyStatus(rawStatus, label string) (string, string, bool) {
|
||||
switch strings.ToUpper(strings.TrimSpace(rawStatus)) {
|
||||
case "OK":
|
||||
return "OK", label + " passed", true
|
||||
case "PARTIAL", "UNSUPPORTED", "CANCELED", "CANCELLED":
|
||||
return "Warning", label + " incomplete", true
|
||||
case "FAILED":
|
||||
return "Critical", label + " failed", true
|
||||
default:
|
||||
return "", "", false
|
||||
}
|
||||
}
|
||||
|
||||
func mergeComponentStatus(component *schema.HardwareComponentStatus, changedAt, satStatus, description string) {
|
||||
if component == nil || satStatus == "" {
|
||||
return
|
||||
}
|
||||
current := strings.TrimSpace(ptrString(component.Status))
|
||||
if current == "" || current == "Unknown" || statusSeverity(satStatus) > statusSeverity(current) {
|
||||
component.Status = appStringPtr(satStatus)
|
||||
if strings.TrimSpace(description) != "" {
|
||||
component.ErrorDescription = appStringPtr(description)
|
||||
}
|
||||
if strings.TrimSpace(changedAt) != "" {
|
||||
component.StatusChangedAt = appStringPtr(changedAt)
|
||||
component.StatusHistory = append(component.StatusHistory, schema.HardwareStatusHistory{
|
||||
Status: satStatus,
|
||||
ChangedAt: changedAt,
|
||||
Details: appStringPtr(description),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func statusSeverity(status string) int {
|
||||
switch strings.TrimSpace(status) {
|
||||
case "Critical":
|
||||
return 3
|
||||
case "Warning":
|
||||
return 2
|
||||
case "OK":
|
||||
return 1
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||
if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Controller") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Accelerator") {
|
||||
if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Display") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Video") {
|
||||
return false
|
||||
}
|
||||
}
|
||||
manufacturer := strings.ToLower(strings.TrimSpace(ptrString(dev.Manufacturer)))
|
||||
switch vendor {
|
||||
case "amd":
|
||||
return strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd/ati")
|
||||
case "nvidia":
|
||||
return strings.Contains(manufacturer, "nvidia")
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func ptrString(v *string) string {
|
||||
if v == nil {
|
||||
return ""
|
||||
}
|
||||
return *v
|
||||
}
|
||||
|
||||
func appStringPtr(value string) *string {
|
||||
return &value
|
||||
}
|
||||
61
audit/internal/app/sat_overlay_test.go
Normal file
61
audit/internal/app/sat_overlay_test.go
Normal file
@@ -0,0 +1,61 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
func TestApplyLatestSATStatusesMarksStorageByDevice(t *testing.T) {
|
||||
baseDir := t.TempDir()
|
||||
runDir := filepath.Join(baseDir, "storage-20260325-161151")
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
raw := "run_at_utc=2026-03-25T16:11:51Z\nnvme0n1_nvme_smart_log_status=OK\nsda_smartctl_health_status=FAILED\noverall_status=FAILED\n"
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(raw), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
nvme := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/nvme0n1"}}
|
||||
usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}}
|
||||
snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}}
|
||||
|
||||
applyLatestSATStatuses(&snap, baseDir)
|
||||
|
||||
if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" {
|
||||
t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status)
|
||||
}
|
||||
if snap.Storage[1].Status == nil || *snap.Storage[1].Status != "Critical" {
|
||||
t.Fatalf("sda status=%v want Critical", snap.Storage[1].Status)
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
|
||||
baseDir := t.TempDir()
|
||||
runDir := filepath.Join(baseDir, "gpu-amd-20260325-161436")
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
raw := "run_at_utc=2026-03-25T16:14:36Z\noverall_status=FAILED\n"
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(raw), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
class := "DisplayController"
|
||||
manufacturer := "Advanced Micro Devices, Inc. [AMD/ATI]"
|
||||
snap := schema.HardwareSnapshot{
|
||||
PCIeDevices: []schema.HardwarePCIeDevice{{
|
||||
DeviceClass: &class,
|
||||
Manufacturer: &manufacturer,
|
||||
}},
|
||||
}
|
||||
|
||||
applyLatestSATStatuses(&snap, baseDir)
|
||||
|
||||
if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" {
|
||||
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
|
||||
}
|
||||
}
|
||||
@@ -56,7 +56,7 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
}
|
||||
defer os.RemoveAll(stageRoot)
|
||||
|
||||
if err := copyDirContents(exportDir, filepath.Join(stageRoot, "export")); err != nil {
|
||||
if err := copyExportDirForSupportBundle(exportDir, filepath.Join(stageRoot, "export")); err != nil {
|
||||
return "", err
|
||||
}
|
||||
if err := writeJournalDump(filepath.Join(stageRoot, "systemd", "combined.journal.log")); err != nil {
|
||||
@@ -214,6 +214,40 @@ func copyDirContents(srcDir, dstDir string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func copyExportDirForSupportBundle(srcDir, dstDir string) error {
|
||||
return copyDirContentsFiltered(srcDir, dstDir, func(rel string, info os.FileInfo) bool {
|
||||
cleanRel := filepath.ToSlash(strings.TrimPrefix(filepath.Clean(rel), "./"))
|
||||
if cleanRel == "" {
|
||||
return true
|
||||
}
|
||||
if strings.HasPrefix(cleanRel, "bee-sat/") && strings.HasSuffix(cleanRel, ".tar.gz") {
|
||||
return false
|
||||
}
|
||||
if strings.HasPrefix(filepath.Base(cleanRel), "bee-support-") && strings.HasSuffix(cleanRel, ".tar.gz") {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
})
|
||||
}
|
||||
|
||||
func copyDirContentsFiltered(srcDir, dstDir string, keep func(rel string, info os.FileInfo) bool) error {
|
||||
entries, err := os.ReadDir(srcDir)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
for _, entry := range entries {
|
||||
src := filepath.Join(srcDir, entry.Name())
|
||||
dst := filepath.Join(dstDir, entry.Name())
|
||||
if err := copyPathFiltered(srcDir, src, dst, keep); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func copyPath(src, dst string) error {
|
||||
info, err := os.Stat(src)
|
||||
if err != nil {
|
||||
@@ -254,6 +288,36 @@ func copyPath(src, dst string) error {
|
||||
return err
|
||||
}
|
||||
|
||||
func copyPathFiltered(rootSrc, src, dst string, keep func(rel string, info os.FileInfo) bool) error {
|
||||
info, err := os.Stat(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
rel, err := filepath.Rel(rootSrc, src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if keep != nil && !keep(rel, info) {
|
||||
return nil
|
||||
}
|
||||
if info.IsDir() {
|
||||
if err := os.MkdirAll(dst, info.Mode().Perm()); err != nil {
|
||||
return err
|
||||
}
|
||||
entries, err := os.ReadDir(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, entry := range entries {
|
||||
if err := copyPathFiltered(rootSrc, filepath.Join(src, entry.Name()), filepath.Join(dst, entry.Name()), keep); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
return copyPath(src, dst)
|
||||
}
|
||||
|
||||
func createSupportTarGz(dst, srcDir string) error {
|
||||
file, err := os.Create(dst)
|
||||
if err != nil {
|
||||
|
||||
252
audit/internal/collector/amdgpu.go
Normal file
252
audit/internal/collector/amdgpu.go
Normal file
@@ -0,0 +1,252 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"encoding/csv"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
var (
|
||||
amdSMIExecCommand = exec.Command
|
||||
amdSMILookPath = exec.LookPath
|
||||
amdSMIGlob = filepath.Glob
|
||||
)
|
||||
|
||||
var amdSMIExecutableGlobs = []string{
|
||||
"/opt/rocm/bin/rocm-smi",
|
||||
"/opt/rocm-*/bin/rocm-smi",
|
||||
"/usr/local/bin/rocm-smi",
|
||||
}
|
||||
|
||||
type amdGPUInfo struct {
|
||||
BDF string
|
||||
Serial string
|
||||
Product string
|
||||
Firmware string
|
||||
PowerW *float64
|
||||
TempC *float64
|
||||
}
|
||||
|
||||
func enrichPCIeWithAMD(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||
if !hasAMDGPUDevices(devs) {
|
||||
return devs
|
||||
}
|
||||
infoByBDF, err := queryAMDGPUs()
|
||||
if err != nil {
|
||||
slog.Info("amdgpu: enrichment skipped", "err", err)
|
||||
return devs
|
||||
}
|
||||
enriched := 0
|
||||
for i := range devs {
|
||||
if !isAMDGPUDevice(devs[i]) || devs[i].BDF == nil {
|
||||
continue
|
||||
}
|
||||
info, ok := infoByBDF[normalizePCIeBDF(*devs[i].BDF)]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if strings.TrimSpace(info.Serial) != "" {
|
||||
devs[i].SerialNumber = &info.Serial
|
||||
}
|
||||
if strings.TrimSpace(info.Firmware) != "" {
|
||||
devs[i].Firmware = &info.Firmware
|
||||
}
|
||||
if strings.TrimSpace(info.Product) != "" && devs[i].Model == nil {
|
||||
devs[i].Model = &info.Product
|
||||
}
|
||||
if info.PowerW != nil {
|
||||
devs[i].PowerW = info.PowerW
|
||||
}
|
||||
if info.TempC != nil {
|
||||
devs[i].TemperatureC = info.TempC
|
||||
}
|
||||
enriched++
|
||||
}
|
||||
if enriched > 0 {
|
||||
slog.Info("amdgpu: enriched", "count", enriched)
|
||||
}
|
||||
return devs
|
||||
}
|
||||
|
||||
func hasAMDGPUDevices(devs []schema.HardwarePCIeDevice) bool {
|
||||
for _, dev := range devs {
|
||||
if isAMDGPUDevice(dev) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func isAMDGPUDevice(dev schema.HardwarePCIeDevice) bool {
|
||||
if dev.Manufacturer == nil || dev.DeviceClass == nil {
|
||||
return false
|
||||
}
|
||||
manufacturer := strings.ToLower(strings.TrimSpace(*dev.Manufacturer))
|
||||
return strings.Contains(manufacturer, "advanced micro devices") && isGPUClass(strings.TrimSpace(*dev.DeviceClass))
|
||||
}
|
||||
|
||||
func queryAMDGPUs() (map[string]amdGPUInfo, error) {
|
||||
busByCard, err := queryAMDField("--showbus")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
infoByCard := map[string]amdGPUInfo{}
|
||||
for card, bus := range busByCard {
|
||||
bdf := normalizePCIeBDF(bus)
|
||||
if bdf == "" {
|
||||
continue
|
||||
}
|
||||
infoByCard[card] = amdGPUInfo{BDF: bdf}
|
||||
}
|
||||
if len(infoByCard) == 0 {
|
||||
return map[string]amdGPUInfo{}, nil
|
||||
}
|
||||
mergeAMDField(infoByCard, "--showserial", func(info *amdGPUInfo, value string) { info.Serial = value })
|
||||
mergeAMDField(infoByCard, "--showproductname", func(info *amdGPUInfo, value string) { info.Product = value })
|
||||
mergeAMDField(infoByCard, "--showvbios", func(info *amdGPUInfo, value string) { info.Firmware = value })
|
||||
mergeAMDNumericField(infoByCard, "--showpower", func(info *amdGPUInfo, value float64) { info.PowerW = &value })
|
||||
mergeAMDNumericField(infoByCard, "--showtemp", func(info *amdGPUInfo, value float64) { info.TempC = &value })
|
||||
|
||||
result := make(map[string]amdGPUInfo, len(infoByCard))
|
||||
for _, info := range infoByCard {
|
||||
if info.BDF == "" {
|
||||
continue
|
||||
}
|
||||
result[info.BDF] = info
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func mergeAMDField(infoByCard map[string]amdGPUInfo, flag string, apply func(*amdGPUInfo, string)) {
|
||||
values, err := queryAMDField(flag)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
for card, value := range values {
|
||||
info, ok := infoByCard[card]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
value = strings.TrimSpace(value)
|
||||
if value == "" {
|
||||
continue
|
||||
}
|
||||
apply(&info, value)
|
||||
infoByCard[card] = info
|
||||
}
|
||||
}
|
||||
|
||||
func mergeAMDNumericField(infoByCard map[string]amdGPUInfo, flag string, apply func(*amdGPUInfo, float64)) {
|
||||
values, err := queryAMDNumericField(flag)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
for card, value := range values {
|
||||
info, ok := infoByCard[card]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
apply(&info, value)
|
||||
infoByCard[card] = info
|
||||
}
|
||||
}
|
||||
|
||||
func queryAMDField(flag string) (map[string]string, error) {
|
||||
cmd, err := resolveAMDSMICmd(flag, "--csv")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out, err := amdSMIExecCommand(cmd[0], cmd[1:]...).CombinedOutput()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return parseROCmSingleValueCSV(string(out)), nil
|
||||
}
|
||||
|
||||
func queryAMDNumericField(flag string) (map[string]float64, error) {
|
||||
values, err := queryAMDField(flag)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out := map[string]float64{}
|
||||
for card, raw := range values {
|
||||
if value, ok := firstFloat(raw); ok {
|
||||
out[card] = value
|
||||
}
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func resolveAMDSMICmd(args ...string) ([]string, error) {
|
||||
if path, err := amdSMILookPath("rocm-smi"); err == nil {
|
||||
return append([]string{path}, args...), nil
|
||||
}
|
||||
for _, pattern := range amdSMIExecutableGlobs {
|
||||
matches, err := amdSMIGlob(pattern)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
sort.Strings(matches)
|
||||
for _, match := range matches {
|
||||
return append([]string{match}, args...), nil
|
||||
}
|
||||
}
|
||||
return nil, exec.ErrNotFound
|
||||
}
|
||||
|
||||
func parseROCmSingleValueCSV(raw string) map[string]string {
|
||||
rows := map[string]string{}
|
||||
reader := csv.NewReader(strings.NewReader(raw))
|
||||
reader.FieldsPerRecord = -1
|
||||
records, err := reader.ReadAll()
|
||||
if err != nil {
|
||||
return rows
|
||||
}
|
||||
for _, rec := range records {
|
||||
if len(rec) < 2 {
|
||||
continue
|
||||
}
|
||||
card := normalizeROCmCardKey(rec[0])
|
||||
if card == "" {
|
||||
continue
|
||||
}
|
||||
value := strings.TrimSpace(strings.Join(rec[1:], ","))
|
||||
if value == "" || looksLikeCSVHeaderValue(value) {
|
||||
continue
|
||||
}
|
||||
rows[card] = value
|
||||
}
|
||||
return rows
|
||||
}
|
||||
|
||||
func normalizeROCmCardKey(raw string) string {
|
||||
raw = strings.ToLower(strings.TrimSpace(raw))
|
||||
raw = strings.Trim(raw, "\"")
|
||||
if raw == "" {
|
||||
return ""
|
||||
}
|
||||
if raw == "device" || raw == "gpu" || raw == "card" {
|
||||
return ""
|
||||
}
|
||||
if strings.HasPrefix(raw, "card") {
|
||||
return raw
|
||||
}
|
||||
if _, err := strconv.Atoi(raw); err == nil {
|
||||
return "card" + raw
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func looksLikeCSVHeaderValue(value string) bool {
|
||||
value = strings.ToLower(strings.TrimSpace(value))
|
||||
return strings.Contains(value, "product") ||
|
||||
strings.Contains(value, "serial") ||
|
||||
strings.Contains(value, "vbios") ||
|
||||
strings.Contains(value, "bus")
|
||||
}
|
||||
56
audit/internal/collector/amdgpu_test.go
Normal file
56
audit/internal/collector/amdgpu_test.go
Normal file
@@ -0,0 +1,56 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"os/exec"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestParseROCmSingleValueCSV(t *testing.T) {
|
||||
raw := "device,Serial Number\ncard0,ABC123\ncard1,XYZ789\n"
|
||||
got := parseROCmSingleValueCSV(raw)
|
||||
if got["card0"] != "ABC123" {
|
||||
t.Fatalf("card0=%q want ABC123", got["card0"])
|
||||
}
|
||||
if got["card1"] != "XYZ789" {
|
||||
t.Fatalf("card1=%q want XYZ789", got["card1"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestQueryAMDNumericFieldParsesUnits(t *testing.T) {
|
||||
origExec := amdSMIExecCommand
|
||||
origLookPath := amdSMILookPath
|
||||
t.Cleanup(func() {
|
||||
amdSMIExecCommand = origExec
|
||||
amdSMILookPath = origLookPath
|
||||
})
|
||||
|
||||
amdSMILookPath = func(string) (string, error) { return "/usr/bin/rocm-smi", nil }
|
||||
amdSMIExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
return exec.Command("sh", "-c", "printf 'device,Temperature\\ncard0,45.5c\\ncard1,67.0c\\n'")
|
||||
}
|
||||
|
||||
got, err := queryAMDNumericField("--showtemp")
|
||||
if err != nil {
|
||||
t.Fatalf("queryAMDNumericField: %v", err)
|
||||
}
|
||||
if got["card0"] != 45.5 {
|
||||
t.Fatalf("card0=%v want 45.5", got["card0"])
|
||||
}
|
||||
if got["card1"] != 67.0 {
|
||||
t.Fatalf("card1=%v want 67.0", got["card1"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeROCmCardKey(t *testing.T) {
|
||||
tests := map[string]string{
|
||||
"0": "card0",
|
||||
"card1": "card1",
|
||||
"Device": "",
|
||||
"": "",
|
||||
}
|
||||
for input, want := range tests {
|
||||
if got := normalizeROCmCardKey(input); got != want {
|
||||
t.Fatalf("normalizeROCmCardKey(%q)=%q want %q", input, got, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -36,6 +36,8 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||
snap.Memory = enrichMemoryWithTelemetry(snap.Memory, sensorDoc)
|
||||
snap.Storage = collectStorage()
|
||||
snap.PCIeDevices = collectPCIe()
|
||||
snap.PCIeDevices = enrichPCIeWithAMD(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithPCISerials(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices)
|
||||
|
||||
@@ -41,7 +41,18 @@ func filterStorage(disks []schema.HardwareStorage) []schema.HardwareStorage {
|
||||
func filterPSUs(psus []schema.HardwarePowerSupply) []schema.HardwarePowerSupply {
|
||||
out := make([]schema.HardwarePowerSupply, 0, len(psus))
|
||||
for _, psu := range psus {
|
||||
if psu.SerialNumber == nil || *psu.SerialNumber == "" {
|
||||
hasIdentity := false
|
||||
switch {
|
||||
case psu.SerialNumber != nil && *psu.SerialNumber != "":
|
||||
hasIdentity = true
|
||||
case psu.Slot != nil && *psu.Slot != "":
|
||||
hasIdentity = true
|
||||
case psu.Model != nil && *psu.Model != "":
|
||||
hasIdentity = true
|
||||
case psu.Vendor != nil && *psu.Vendor != "":
|
||||
hasIdentity = true
|
||||
}
|
||||
if !hasIdentity {
|
||||
continue
|
||||
}
|
||||
out = append(out, psu)
|
||||
|
||||
@@ -61,3 +61,20 @@ func TestFinalizeSnapshotPreservesDuplicateSerials(t *testing.T) {
|
||||
t.Fatalf("duplicate serial should stay unchanged: %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFilterPSUsKeepsSlotOnlyEntries(t *testing.T) {
|
||||
slot := "0"
|
||||
status := statusOK
|
||||
|
||||
got := filterPSUs([]schema.HardwarePowerSupply{
|
||||
{Slot: &slot, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
})
|
||||
|
||||
if len(got) != 1 {
|
||||
t.Fatalf("len(got)=%d want 1", len(got))
|
||||
}
|
||||
if got[0].Slot == nil || *got[0].Slot != "0" {
|
||||
t.Fatalf("unexpected kept PSU: %+v", got[0])
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,6 +44,11 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
|
||||
}
|
||||
iface := ifaces[0]
|
||||
devs[i].MacAddresses = collectInterfaceMACs(ifaces)
|
||||
if devs[i].SerialNumber == nil {
|
||||
if serial := queryPCIDeviceSerial(bdf); serial != "" {
|
||||
devs[i].SerialNumber = &serial
|
||||
}
|
||||
}
|
||||
|
||||
if devs[i].Firmware == nil {
|
||||
if out, err := ethtoolInfoQuery(iface); err == nil {
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
package collector
|
||||
|
||||
import "testing"
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"fmt"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestParseSFPDOM(t *testing.T) {
|
||||
raw := `
|
||||
@@ -29,6 +33,74 @@ func TestParseSFPDOM(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLSPCIDetailSerial(t *testing.T) {
|
||||
raw := `
|
||||
05:00.0 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6]
|
||||
Serial number: NIC-SN-12345
|
||||
`
|
||||
if got := parseLSPCIDetailSerial(raw); got != "NIC-SN-12345" {
|
||||
t.Fatalf("serial=%q want %q", got, "NIC-SN-12345")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParsePCIVPDSerial(t *testing.T) {
|
||||
raw := []byte{0x82, 0x05, 0x00, 'M', 'L', 'X', '5', 0x90, 0x08, 0x00, 'S', 'N', 0x08, 'M', 'T', '1', '2', '3', '4', '5', '6'}
|
||||
if got := parsePCIVPDSerial(raw); got != "MT123456" {
|
||||
t.Fatalf("serial=%q want %q", got, "MT123456")
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
||||
origDetail := queryPCILSPCIDetail
|
||||
origVPD := readPCIVPDFile
|
||||
origIfaces := netIfacesByBDF
|
||||
origReadMAC := readNetAddressFile
|
||||
origEth := ethtoolInfoQuery
|
||||
origModule := ethtoolModuleQuery
|
||||
t.Cleanup(func() {
|
||||
queryPCILSPCIDetail = origDetail
|
||||
readPCIVPDFile = origVPD
|
||||
netIfacesByBDF = origIfaces
|
||||
readNetAddressFile = origReadMAC
|
||||
ethtoolInfoQuery = origEth
|
||||
ethtoolModuleQuery = origModule
|
||||
})
|
||||
|
||||
queryPCILSPCIDetail = func(bdf string) (string, error) {
|
||||
if bdf != "0000:18:00.0" {
|
||||
t.Fatalf("unexpected bdf: %s", bdf)
|
||||
}
|
||||
return "Serial number: NIC-SN-98765\n", nil
|
||||
}
|
||||
readPCIVPDFile = func(string) ([]byte, error) {
|
||||
return nil, fmt.Errorf("no vpd needed")
|
||||
}
|
||||
netIfacesByBDF = func(string) []string { return []string{"eth0"} }
|
||||
readNetAddressFile = func(iface string) (string, error) {
|
||||
if iface != "eth0" {
|
||||
t.Fatalf("unexpected iface: %s", iface)
|
||||
}
|
||||
return "aa:bb:cc:dd:ee:ff", nil
|
||||
}
|
||||
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
||||
ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("skip optics") }
|
||||
|
||||
class := "EthernetController"
|
||||
bdf := "0000:18:00.0"
|
||||
devs := []schema.HardwarePCIeDevice{{
|
||||
DeviceClass: &class,
|
||||
BDF: &bdf,
|
||||
}}
|
||||
|
||||
out := enrichPCIeWithNICTelemetry(devs)
|
||||
if out[0].SerialNumber == nil || *out[0].SerialNumber != "NIC-SN-98765" {
|
||||
t.Fatalf("serial=%v want NIC-SN-98765", out[0].SerialNumber)
|
||||
}
|
||||
if len(out[0].MacAddresses) != 1 || out[0].MacAddresses[0] != "aa:bb:cc:dd:ee:ff" {
|
||||
t.Fatalf("mac_addresses=%v", out[0].MacAddresses)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDBMValue(t *testing.T) {
|
||||
tests := []struct {
|
||||
in string
|
||||
|
||||
@@ -37,7 +37,7 @@ func parseLspci(output string) []schema.HardwarePCIeDevice {
|
||||
val := strings.TrimSpace(line[idx+2:])
|
||||
fields[key] = val
|
||||
}
|
||||
if !shouldIncludePCIeDevice(fields["Class"]) {
|
||||
if !shouldIncludePCIeDevice(fields["Class"], fields["Vendor"], fields["Device"]) {
|
||||
continue
|
||||
}
|
||||
dev := parseLspciDevice(fields)
|
||||
@@ -46,8 +46,10 @@ func parseLspci(output string) []schema.HardwarePCIeDevice {
|
||||
return devs
|
||||
}
|
||||
|
||||
func shouldIncludePCIeDevice(class string) bool {
|
||||
func shouldIncludePCIeDevice(class, vendor, device string) bool {
|
||||
c := strings.ToLower(strings.TrimSpace(class))
|
||||
v := strings.ToLower(strings.TrimSpace(vendor))
|
||||
d := strings.ToLower(strings.TrimSpace(device))
|
||||
if c == "" {
|
||||
return true
|
||||
}
|
||||
@@ -68,12 +70,28 @@ func shouldIncludePCIeDevice(class string) bool {
|
||||
"audio device",
|
||||
"serial bus controller",
|
||||
"unassigned class",
|
||||
"non-essential instrumentation",
|
||||
}
|
||||
for _, bad := range excluded {
|
||||
if strings.Contains(c, bad) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
if strings.Contains(v, "advanced micro devices") || strings.Contains(v, "[amd]") {
|
||||
internalAMDPatterns := []string{
|
||||
"dummy function",
|
||||
"reserved spp",
|
||||
"ptdma",
|
||||
"cryptographic coprocessor pspcpp",
|
||||
"pspcpp",
|
||||
}
|
||||
for _, bad := range internalAMDPatterns {
|
||||
if strings.Contains(d, bad) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
@@ -98,6 +116,8 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
||||
}
|
||||
if numaNode, ok := readPCINumaNode(bdf); ok {
|
||||
dev.NUMANode = &numaNode
|
||||
} else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok {
|
||||
dev.NUMANode = &numaNode
|
||||
}
|
||||
if width, ok := readPCIIntAttribute(bdf, "current_link_width"); ok {
|
||||
dev.LinkWidth = &width
|
||||
@@ -165,6 +185,18 @@ func readPCINumaNode(bdf string) (int, bool) {
|
||||
return value, true
|
||||
}
|
||||
|
||||
func parsePCINumaNode(raw string) (int, bool) {
|
||||
raw = strings.TrimSpace(raw)
|
||||
if raw == "" {
|
||||
return 0, false
|
||||
}
|
||||
value, err := strconv.Atoi(raw)
|
||||
if err != nil || value < 0 {
|
||||
return 0, false
|
||||
}
|
||||
return value, true
|
||||
}
|
||||
|
||||
func readPCIIntAttribute(bdf, attribute string) (int, bool) {
|
||||
out, err := exec.Command("cat", "/sys/bus/pci/devices/"+bdf+"/"+attribute).Output()
|
||||
if err != nil {
|
||||
|
||||
@@ -8,32 +8,42 @@ import (
|
||||
|
||||
func TestShouldIncludePCIeDevice(t *testing.T) {
|
||||
tests := []struct {
|
||||
class string
|
||||
want bool
|
||||
name string
|
||||
class string
|
||||
vendor string
|
||||
device string
|
||||
want bool
|
||||
}{
|
||||
{"USB controller", false},
|
||||
{"System peripheral", false},
|
||||
{"Audio device", false},
|
||||
{"Host bridge", false},
|
||||
{"PCI bridge", false},
|
||||
{"SMBus", false},
|
||||
{"Performance counters", false},
|
||||
{"Ethernet controller", true},
|
||||
{"RAID bus controller", true},
|
||||
{"Non-Volatile memory controller", true},
|
||||
{"VGA compatible controller", true},
|
||||
{name: "usb", class: "USB controller", want: false},
|
||||
{name: "system peripheral", class: "System peripheral", want: false},
|
||||
{name: "audio", class: "Audio device", want: false},
|
||||
{name: "host bridge", class: "Host bridge", want: false},
|
||||
{name: "pci bridge", class: "PCI bridge", want: false},
|
||||
{name: "smbus", class: "SMBus", want: false},
|
||||
{name: "perf", class: "Performance counters", want: false},
|
||||
{name: "non essential instrumentation", class: "Non-Essential Instrumentation", want: false},
|
||||
{name: "amd dummy function", class: "Encryption controller", vendor: "Advanced Micro Devices, Inc. [AMD]", device: "Starship/Matisse PTDMA", want: false},
|
||||
{name: "amd pspcpp", class: "Encryption controller", vendor: "Advanced Micro Devices, Inc. [AMD]", device: "Starship/Matisse Cryptographic Coprocessor PSPCPP", want: false},
|
||||
{name: "ethernet", class: "Ethernet controller", want: true},
|
||||
{name: "raid", class: "RAID bus controller", want: true},
|
||||
{name: "nvme", class: "Non-Volatile memory controller", want: true},
|
||||
{name: "vga", class: "VGA compatible controller", want: true},
|
||||
{name: "other encryption controller", class: "Encryption controller", vendor: "Intel Corporation", device: "QuickAssist", want: true},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
got := shouldIncludePCIeDevice(tt.class)
|
||||
if got != tt.want {
|
||||
t.Fatalf("class %q include=%v want %v", tt.class, got, tt.want)
|
||||
}
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got := shouldIncludePCIeDevice(tt.class, tt.vendor, tt.device)
|
||||
if got != tt.want {
|
||||
t.Fatalf("class=%q vendor=%q device=%q include=%v want %v", tt.class, tt.vendor, tt.device, got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLspci_filtersExcludedClasses(t *testing.T) {
|
||||
input := "Slot:\t0000:00:14.0\nClass:\tUSB controller\nVendor:\tIntel Corporation\nDevice:\tUSB 3.0\n\n" +
|
||||
"Slot:\t0000:00:18.0\nClass:\tNon-Essential Instrumentation\nVendor:\tAdvanced Micro Devices, Inc. [AMD]\nDevice:\tStarship/Matisse PCIe Dummy Function\n\n" +
|
||||
"Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
|
||||
|
||||
devs := parseLspci(input)
|
||||
@@ -51,6 +61,21 @@ func TestParseLspci_filtersExcludedClasses(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLspci_filtersAMDChipsetNoise(t *testing.T) {
|
||||
input := "" +
|
||||
"Slot:\t0000:1a:00.0\nClass:\tNon-Essential Instrumentation\nVendor:\tAdvanced Micro Devices, Inc. [AMD]\nDevice:\tStarship/Matisse PCIe Dummy Function\n\n" +
|
||||
"Slot:\t0000:1a:00.2\nClass:\tEncryption controller\nVendor:\tAdvanced Micro Devices, Inc. [AMD]\nDevice:\tStarship/Matisse PTDMA\n\n" +
|
||||
"Slot:\t0000:05:00.0\nClass:\tEthernet controller\nVendor:\tMellanox Technologies\nDevice:\tMT28908 Family [ConnectX-6]\n\n"
|
||||
|
||||
devs := parseLspci(input)
|
||||
if len(devs) != 1 {
|
||||
t.Fatalf("expected 1 remaining device, got %d", len(devs))
|
||||
}
|
||||
if devs[0].Model == nil || *devs[0].Model != "MT28908 Family [ConnectX-6]" {
|
||||
t.Fatalf("unexpected remaining device: %+v", devs[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestPCIeJSONUsesSlotNotBDF(t *testing.T) {
|
||||
input := "Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
|
||||
|
||||
@@ -68,6 +93,18 @@ func TestPCIeJSONUsesSlotNotBDF(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLspciUsesNUMANodeFieldWhenSysfsUnavailable(t *testing.T) {
|
||||
input := "Slot:\t0000:65:00.0\nClass:\tEthernet controller\nVendor:\tIntel Corporation\nDevice:\tX710\nNUMANode:\t1\n\n"
|
||||
|
||||
devs := parseLspci(input)
|
||||
if len(devs) != 1 {
|
||||
t.Fatalf("expected 1 device, got %d", len(devs))
|
||||
}
|
||||
if devs[0].NUMANode == nil || *devs[0].NUMANode != 1 {
|
||||
t.Fatalf("numa_node=%v want 1", devs[0].NUMANode)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizePCILinkSpeed(t *testing.T) {
|
||||
tests := []struct {
|
||||
raw string
|
||||
|
||||
123
audit/internal/collector/pcie_identity.go
Normal file
123
audit/internal/collector/pcie_identity.go
Normal file
@@ -0,0 +1,123 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
var (
|
||||
queryPCILSPCIDetail = func(bdf string) (string, error) {
|
||||
out, err := exec.Command("lspci", "-vv", "-s", bdf).Output()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(out), nil
|
||||
}
|
||||
readPCIVPDFile = func(bdf string) ([]byte, error) {
|
||||
return os.ReadFile(filepath.Join("/sys/bus/pci/devices", bdf, "vpd"))
|
||||
}
|
||||
)
|
||||
|
||||
func enrichPCIeWithPCISerials(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||
enriched := 0
|
||||
for i := range devs {
|
||||
if !shouldProbePCIeSerial(devs[i]) {
|
||||
continue
|
||||
}
|
||||
bdf := normalizePCIeBDF(*devs[i].BDF)
|
||||
if bdf == "" {
|
||||
continue
|
||||
}
|
||||
if serial := queryPCIDeviceSerial(bdf); serial != "" {
|
||||
devs[i].SerialNumber = &serial
|
||||
enriched++
|
||||
}
|
||||
}
|
||||
if enriched > 0 {
|
||||
slog.Info("pcie: serials enriched", "count", enriched)
|
||||
}
|
||||
return devs
|
||||
}
|
||||
|
||||
func shouldProbePCIeSerial(dev schema.HardwarePCIeDevice) bool {
|
||||
if dev.BDF == nil || dev.SerialNumber != nil {
|
||||
return false
|
||||
}
|
||||
if dev.DeviceClass == nil {
|
||||
return false
|
||||
}
|
||||
class := strings.TrimSpace(*dev.DeviceClass)
|
||||
return isNICClass(class) || isGPUClass(class)
|
||||
}
|
||||
|
||||
func queryPCIDeviceSerial(bdf string) string {
|
||||
if out, err := queryPCILSPCIDetail(bdf); err == nil {
|
||||
if serial := parseLSPCIDetailSerial(out); serial != "" {
|
||||
return serial
|
||||
}
|
||||
}
|
||||
if raw, err := readPCIVPDFile(bdf); err == nil {
|
||||
return parsePCIVPDSerial(raw)
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func parseLSPCIDetailSerial(raw string) string {
|
||||
for _, line := range strings.Split(raw, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
lower := strings.ToLower(line)
|
||||
if !strings.Contains(lower, "serial number:") {
|
||||
continue
|
||||
}
|
||||
idx := strings.Index(line, ":")
|
||||
if idx < 0 {
|
||||
continue
|
||||
}
|
||||
if serial := strings.TrimSpace(line[idx+1:]); serial != "" {
|
||||
return serial
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func parsePCIVPDSerial(raw []byte) string {
|
||||
for i := 0; i+3 < len(raw); i++ {
|
||||
if raw[i] != 'S' || raw[i+1] != 'N' {
|
||||
continue
|
||||
}
|
||||
length := int(raw[i+2])
|
||||
if length <= 0 || length > 64 || i+3+length > len(raw) {
|
||||
continue
|
||||
}
|
||||
value := strings.TrimSpace(strings.Trim(string(raw[i+3:i+3+length]), "\x00"))
|
||||
if !looksLikeSerial(value) {
|
||||
continue
|
||||
}
|
||||
return value
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func looksLikeSerial(value string) bool {
|
||||
if len(value) < 4 {
|
||||
return false
|
||||
}
|
||||
hasAlphaNum := false
|
||||
for _, r := range value {
|
||||
switch {
|
||||
case r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z', r >= '0' && r <= '9':
|
||||
hasAlphaNum = true
|
||||
case strings.ContainsRune(" -_./:", r):
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
return hasAlphaNum
|
||||
}
|
||||
47
audit/internal/collector/pcie_identity_test.go
Normal file
47
audit/internal/collector/pcie_identity_test.go
Normal file
@@ -0,0 +1,47 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"fmt"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestEnrichPCIeWithPCISerialsAddsGPUFallback(t *testing.T) {
|
||||
origDetail := queryPCILSPCIDetail
|
||||
origVPD := readPCIVPDFile
|
||||
t.Cleanup(func() {
|
||||
queryPCILSPCIDetail = origDetail
|
||||
readPCIVPDFile = origVPD
|
||||
})
|
||||
|
||||
queryPCILSPCIDetail = func(bdf string) (string, error) {
|
||||
if bdf != "0000:11:00.0" {
|
||||
t.Fatalf("unexpected bdf: %s", bdf)
|
||||
}
|
||||
return "Serial number: GPU-SN-12345\n", nil
|
||||
}
|
||||
readPCIVPDFile = func(string) ([]byte, error) {
|
||||
return nil, fmt.Errorf("no vpd needed")
|
||||
}
|
||||
|
||||
class := "DisplayController"
|
||||
bdf := "0000:11:00.0"
|
||||
devs := []schema.HardwarePCIeDevice{{
|
||||
DeviceClass: &class,
|
||||
BDF: &bdf,
|
||||
}}
|
||||
|
||||
out := enrichPCIeWithPCISerials(devs)
|
||||
if out[0].SerialNumber == nil || *out[0].SerialNumber != "GPU-SN-12345" {
|
||||
t.Fatalf("serial=%v want GPU-SN-12345", out[0].SerialNumber)
|
||||
}
|
||||
}
|
||||
|
||||
func TestShouldProbePCIeSerialSkipsNonGPUOrNIC(t *testing.T) {
|
||||
class := "StorageController"
|
||||
bdf := "0000:19:00.0"
|
||||
dev := schema.HardwarePCIeDevice{DeviceClass: &class, BDF: &bdf}
|
||||
if shouldProbePCIeSerial(dev) {
|
||||
t.Fatal("unexpected probe for storage controller")
|
||||
}
|
||||
}
|
||||
@@ -190,6 +190,7 @@ type smartctlInfo struct {
|
||||
func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
present := true
|
||||
s := schema.HardwareStorage{Present: &present}
|
||||
s.Telemetry = map[string]any{"linux_device": "/dev/" + dev.Name}
|
||||
|
||||
tran := strings.ToLower(dev.Tran)
|
||||
devPath := "/dev/" + dev.Name
|
||||
@@ -348,6 +349,7 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
Present: &present,
|
||||
Type: &devType,
|
||||
Interface: &iface,
|
||||
Telemetry: map[string]any{"linux_device": "/dev/" + dev.Name},
|
||||
}
|
||||
|
||||
devPath := "/dev/" + dev.Name
|
||||
|
||||
@@ -9,8 +9,10 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
var exportExecCommand = exec.Command
|
||||
|
||||
func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
|
||||
raw, err := exec.Command("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
|
||||
raw, err := exportExecCommand("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -52,7 +54,7 @@ func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string, error) {
|
||||
func (s *System) ExportFileToTarget(src string, target RemovableTarget) (dst string, retErr error) {
|
||||
if src == "" || target.Device == "" {
|
||||
return "", fmt.Errorf("source and target are required")
|
||||
}
|
||||
@@ -62,20 +64,39 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string,
|
||||
|
||||
mountpoint := strings.TrimSpace(target.Mountpoint)
|
||||
mountedHere := false
|
||||
mounted := mountpoint != ""
|
||||
if mountpoint == "" {
|
||||
mountpoint = filepath.Join("/tmp", "bee-export-"+filepath.Base(target.Device))
|
||||
if err := os.MkdirAll(mountpoint, 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
if raw, err := exec.Command("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
|
||||
if raw, err := exportExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
|
||||
_ = os.Remove(mountpoint)
|
||||
return string(raw), err
|
||||
}
|
||||
mountedHere = true
|
||||
mounted = true
|
||||
}
|
||||
defer func() {
|
||||
if !mounted {
|
||||
return
|
||||
}
|
||||
_ = exportExecCommand("sync").Run()
|
||||
if raw, err := exportExecCommand("umount", mountpoint).CombinedOutput(); err != nil && retErr == nil {
|
||||
msg := strings.TrimSpace(string(raw))
|
||||
if msg == "" {
|
||||
retErr = err
|
||||
} else {
|
||||
retErr = fmt.Errorf("%s: %w", msg, err)
|
||||
}
|
||||
}
|
||||
if mountedHere {
|
||||
_ = os.Remove(mountpoint)
|
||||
}
|
||||
}()
|
||||
|
||||
filename := filepath.Base(src)
|
||||
dst := filepath.Join(mountpoint, filename)
|
||||
dst = filepath.Join(mountpoint, filename)
|
||||
data, err := os.ReadFile(src)
|
||||
if err != nil {
|
||||
return "", err
|
||||
@@ -83,12 +104,6 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string,
|
||||
if err := os.WriteFile(dst, data, 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
_ = exec.Command("sync").Run()
|
||||
|
||||
if mountedHere {
|
||||
_ = exec.Command("umount", mountpoint).Run()
|
||||
_ = os.Remove(mountpoint)
|
||||
}
|
||||
|
||||
return dst, nil
|
||||
}
|
||||
|
||||
56
audit/internal/platform/export_test.go
Normal file
56
audit/internal/platform/export_test.go
Normal file
@@ -0,0 +1,56 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tmp := t.TempDir()
|
||||
src := filepath.Join(tmp, "bundle.tar.gz")
|
||||
mountpoint := filepath.Join(tmp, "mnt")
|
||||
if err := os.MkdirAll(mountpoint, 0755); err != nil {
|
||||
t.Fatalf("mkdir mountpoint: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(src, []byte("bundle"), 0644); err != nil {
|
||||
t.Fatalf("write src: %v", err)
|
||||
}
|
||||
|
||||
var calls [][]string
|
||||
oldExec := exportExecCommand
|
||||
exportExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
calls = append(calls, append([]string{name}, args...))
|
||||
return exec.Command("sh", "-c", "exit 0")
|
||||
}
|
||||
t.Cleanup(func() { exportExecCommand = oldExec })
|
||||
|
||||
s := &System{}
|
||||
dst, err := s.ExportFileToTarget(src, RemovableTarget{
|
||||
Device: "/dev/sdb1",
|
||||
Mountpoint: mountpoint,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("ExportFileToTarget error: %v", err)
|
||||
}
|
||||
if got, want := dst, filepath.Join(mountpoint, "bundle.tar.gz"); got != want {
|
||||
t.Fatalf("dst=%q want %q", got, want)
|
||||
}
|
||||
if _, err := os.Stat(filepath.Join(mountpoint, "bundle.tar.gz")); err != nil {
|
||||
t.Fatalf("exported file missing: %v", err)
|
||||
}
|
||||
|
||||
foundUmount := false
|
||||
for _, call := range calls {
|
||||
if len(call) == 2 && call[0] == "umount" && call[1] == mountpoint {
|
||||
foundUmount = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !foundUmount {
|
||||
t.Fatalf("expected umount %q call, got %#v", mountpoint, calls)
|
||||
}
|
||||
}
|
||||
@@ -16,9 +16,6 @@ var runtimeRequiredTools = []string{
|
||||
"smartctl",
|
||||
"nvme",
|
||||
"ipmitool",
|
||||
"nvidia-smi",
|
||||
"nvidia-bug-report.sh",
|
||||
"bee-gpu-stress",
|
||||
"dhclient",
|
||||
"mount",
|
||||
}
|
||||
@@ -93,7 +90,8 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
||||
}
|
||||
}
|
||||
|
||||
for _, tool := range s.CheckTools(runtimeRequiredTools) {
|
||||
vendor := s.DetectGPUVendor()
|
||||
for _, tool := range s.runtimeToolStatuses(vendor) {
|
||||
health.Tools = append(health.Tools, schema.RuntimeToolStatus{
|
||||
Name: tool.Name,
|
||||
Path: tool.Path,
|
||||
@@ -115,39 +113,7 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
||||
})
|
||||
}
|
||||
|
||||
lsmodText := commandText("lsmod")
|
||||
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
|
||||
if !health.DriverReady {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "nvidia_kernel_module_missing",
|
||||
Severity: "warning",
|
||||
Description: "NVIDIA kernel module is not loaded.",
|
||||
})
|
||||
}
|
||||
if health.DriverReady && !strings.Contains(lsmodText, "nvidia_modeset") {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "nvidia_modeset_failed",
|
||||
Severity: "warning",
|
||||
Description: "nvidia-modeset is not loaded; display/CUDA stack may be partial.",
|
||||
})
|
||||
}
|
||||
if out, err := exec.Command("nvidia-smi", "-L").CombinedOutput(); err == nil && strings.TrimSpace(string(out)) != "" {
|
||||
health.DriverReady = true
|
||||
}
|
||||
|
||||
health.CUDAReady = false
|
||||
if lookErr := exec.Command("sh", "-c", "command -v bee-gpu-stress >/dev/null 2>&1").Run(); lookErr == nil {
|
||||
out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
|
||||
if err == nil {
|
||||
health.CUDAReady = true
|
||||
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "cuda_runtime_not_ready",
|
||||
Severity: "warning",
|
||||
Description: "CUDA runtime is not ready for GPU SAT.",
|
||||
})
|
||||
}
|
||||
}
|
||||
s.collectGPURuntimeHealth(vendor, &health)
|
||||
|
||||
if health.Status != "FAILED" && len(health.Issues) > 0 {
|
||||
health.Status = "PARTIAL"
|
||||
@@ -162,3 +128,87 @@ func commandText(name string, args ...string) string {
|
||||
}
|
||||
return string(raw)
|
||||
}
|
||||
|
||||
func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
|
||||
tools := s.CheckTools(runtimeRequiredTools)
|
||||
switch vendor {
|
||||
case "nvidia":
|
||||
tools = append(tools, s.CheckTools([]string{
|
||||
"nvidia-smi",
|
||||
"nvidia-bug-report.sh",
|
||||
"bee-gpu-stress",
|
||||
})...)
|
||||
case "amd":
|
||||
tool := ToolStatus{Name: "rocm-smi"}
|
||||
if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
|
||||
tool.Path = cmd[0]
|
||||
if len(cmd) > 1 && strings.HasSuffix(cmd[1], "rocm_smi.py") {
|
||||
tool.Path = cmd[1]
|
||||
}
|
||||
tool.OK = true
|
||||
}
|
||||
tools = append(tools, tool)
|
||||
}
|
||||
return tools
|
||||
}
|
||||
|
||||
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
|
||||
lsmodText := commandText("lsmod")
|
||||
|
||||
switch vendor {
|
||||
case "nvidia":
|
||||
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
|
||||
if !health.DriverReady {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "nvidia_kernel_module_missing",
|
||||
Severity: "warning",
|
||||
Description: "NVIDIA kernel module is not loaded.",
|
||||
})
|
||||
}
|
||||
if health.DriverReady && !strings.Contains(lsmodText, "nvidia_modeset") {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "nvidia_modeset_failed",
|
||||
Severity: "warning",
|
||||
Description: "nvidia-modeset is not loaded; display/CUDA stack may be partial.",
|
||||
})
|
||||
}
|
||||
if out, err := exec.Command("nvidia-smi", "-L").CombinedOutput(); err == nil && strings.TrimSpace(string(out)) != "" {
|
||||
health.DriverReady = true
|
||||
}
|
||||
|
||||
if lookErr := exec.Command("sh", "-c", "command -v bee-gpu-stress >/dev/null 2>&1").Run(); lookErr == nil {
|
||||
out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
|
||||
if err == nil {
|
||||
health.CUDAReady = true
|
||||
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "cuda_runtime_not_ready",
|
||||
Severity: "warning",
|
||||
Description: "CUDA runtime is not ready for GPU SAT.",
|
||||
})
|
||||
}
|
||||
}
|
||||
case "amd":
|
||||
health.DriverReady = strings.Contains(lsmodText, "amdgpu ") || strings.Contains(lsmodText, "amdkfd")
|
||||
if !health.DriverReady {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "amdgpu_kernel_module_missing",
|
||||
Severity: "warning",
|
||||
Description: "AMD GPU driver is not loaded.",
|
||||
})
|
||||
}
|
||||
|
||||
out, err := runROCmSMI("--showproductname", "--csv")
|
||||
if err == nil && strings.TrimSpace(string(out)) != "" {
|
||||
health.CUDAReady = true
|
||||
health.DriverReady = true
|
||||
return
|
||||
}
|
||||
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "rocm_smi_unavailable",
|
||||
Severity: "warning",
|
||||
Description: "ROCm SMI is not available for AMD GPU SAT.",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"archive/tar"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
@@ -15,6 +16,22 @@ import (
|
||||
"time"
|
||||
)
|
||||
|
||||
var (
|
||||
satExecCommand = exec.Command
|
||||
satLookPath = exec.LookPath
|
||||
satGlob = filepath.Glob
|
||||
satStat = os.Stat
|
||||
|
||||
rocmSMIExecutableGlobs = []string{
|
||||
"/opt/rocm/bin/rocm-smi",
|
||||
"/opt/rocm-*/bin/rocm-smi",
|
||||
}
|
||||
rocmSMIScriptGlobs = []string{
|
||||
"/opt/rocm/libexec/rocm_smi/rocm_smi.py",
|
||||
"/opt/rocm-*/libexec/rocm_smi/rocm_smi.py",
|
||||
}
|
||||
)
|
||||
|
||||
// NvidiaGPU holds basic GPU info from nvidia-smi.
|
||||
type NvidiaGPU struct {
|
||||
Index int
|
||||
@@ -41,7 +58,7 @@ func (s *System) DetectGPUVendor() string {
|
||||
|
||||
// ListAMDGPUs returns AMD GPUs visible to rocm-smi.
|
||||
func (s *System) ListAMDGPUs() ([]AMDGPUInfo, error) {
|
||||
out, err := exec.Command("rocm-smi", "--showproductname", "--csv").Output()
|
||||
out, err := runROCmSMI("--showproductname", "--csv")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("rocm-smi: %w", err)
|
||||
}
|
||||
@@ -337,12 +354,22 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
||||
|
||||
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string) ([]byte, error) {
|
||||
start := time.Now().UTC()
|
||||
resolvedCmd, err := resolveSATCommand(cmd)
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
|
||||
"cmd: "+strings.Join(cmd, " "),
|
||||
"cmd: "+strings.Join(resolvedCmd, " "),
|
||||
)
|
||||
if err != nil {
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
||||
"rc: 1",
|
||||
fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
|
||||
"",
|
||||
)
|
||||
return []byte(err.Error() + "\n"), err
|
||||
}
|
||||
|
||||
c := exec.CommandContext(ctx, cmd[0], cmd[1:]...)
|
||||
c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...)
|
||||
if len(env) > 0 {
|
||||
c.Env = append(os.Environ(), env...)
|
||||
}
|
||||
@@ -362,19 +389,11 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string
|
||||
}
|
||||
|
||||
func listStorageDevices() ([]string, error) {
|
||||
out, err := exec.Command("lsblk", "-dn", "-o", "NAME,TYPE").Output()
|
||||
out, err := satExecCommand("lsblk", "-dn", "-o", "NAME,TYPE,TRAN").Output()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var devices []string
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
fields := strings.Fields(strings.TrimSpace(line))
|
||||
if len(fields) != 2 || fields[1] != "disk" {
|
||||
continue
|
||||
}
|
||||
devices = append(devices, "/dev/"+fields[0])
|
||||
}
|
||||
return devices, nil
|
||||
return parseStorageDevices(string(out)), nil
|
||||
}
|
||||
|
||||
func storageSATCommands(devPath string) []satJob {
|
||||
@@ -445,12 +464,22 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
|
||||
|
||||
func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
|
||||
start := time.Now().UTC()
|
||||
resolvedCmd, err := resolveSATCommand(cmd)
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
|
||||
"cmd: "+strings.Join(cmd, " "),
|
||||
"cmd: "+strings.Join(resolvedCmd, " "),
|
||||
)
|
||||
if err != nil {
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
||||
"rc: 1",
|
||||
fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
|
||||
"",
|
||||
)
|
||||
return []byte(err.Error() + "\n"), err
|
||||
}
|
||||
|
||||
out, err := exec.Command(cmd[0], cmd[1:]...).CombinedOutput()
|
||||
out, err := satExecCommand(resolvedCmd[0], resolvedCmd[1:]...).CombinedOutput()
|
||||
|
||||
rc := 0
|
||||
if err != nil {
|
||||
@@ -465,6 +494,91 @@ func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
|
||||
return out, err
|
||||
}
|
||||
|
||||
func runROCmSMI(args ...string) ([]byte, error) {
|
||||
cmd, err := resolveROCmSMICommand(args...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return satExecCommand(cmd[0], cmd[1:]...).CombinedOutput()
|
||||
}
|
||||
|
||||
func resolveSATCommand(cmd []string) ([]string, error) {
|
||||
if len(cmd) == 0 {
|
||||
return nil, errors.New("empty SAT command")
|
||||
}
|
||||
if cmd[0] != "rocm-smi" {
|
||||
return cmd, nil
|
||||
}
|
||||
return resolveROCmSMICommand(cmd[1:]...)
|
||||
}
|
||||
|
||||
func resolveROCmSMICommand(args ...string) ([]string, error) {
|
||||
if path, err := satLookPath("rocm-smi"); err == nil {
|
||||
return append([]string{path}, args...), nil
|
||||
}
|
||||
|
||||
for _, path := range rocmSMIExecutableCandidates() {
|
||||
return append([]string{path}, args...), nil
|
||||
}
|
||||
|
||||
pythonPath, pyErr := satLookPath("python3")
|
||||
if pyErr == nil {
|
||||
for _, script := range rocmSMIScriptCandidates() {
|
||||
cmd := []string{pythonPath, script}
|
||||
cmd = append(cmd, args...)
|
||||
return cmd, nil
|
||||
}
|
||||
}
|
||||
|
||||
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
|
||||
}
|
||||
|
||||
func rocmSMIExecutableCandidates() []string {
|
||||
return expandExistingPaths(rocmSMIExecutableGlobs)
|
||||
}
|
||||
|
||||
func rocmSMIScriptCandidates() []string {
|
||||
return expandExistingPaths(rocmSMIScriptGlobs)
|
||||
}
|
||||
|
||||
func expandExistingPaths(patterns []string) []string {
|
||||
seen := make(map[string]struct{})
|
||||
var paths []string
|
||||
for _, pattern := range patterns {
|
||||
matches, err := satGlob(pattern)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
sort.Strings(matches)
|
||||
for _, match := range matches {
|
||||
if _, err := satStat(match); err != nil {
|
||||
continue
|
||||
}
|
||||
if _, ok := seen[match]; ok {
|
||||
continue
|
||||
}
|
||||
seen[match] = struct{}{}
|
||||
paths = append(paths, match)
|
||||
}
|
||||
}
|
||||
return paths
|
||||
}
|
||||
|
||||
func parseStorageDevices(raw string) []string {
|
||||
var devices []string
|
||||
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
||||
fields := strings.Fields(strings.TrimSpace(line))
|
||||
if len(fields) < 2 || fields[1] != "disk" {
|
||||
continue
|
||||
}
|
||||
if len(fields) >= 3 && strings.EqualFold(fields[2], "usb") {
|
||||
continue
|
||||
}
|
||||
devices = append(devices, "/dev/"+fields[0])
|
||||
}
|
||||
return devices
|
||||
}
|
||||
|
||||
// runSATCommandWithMetrics runs a command while collecting GPU metrics in the background.
|
||||
// On completion it writes gpu-metrics.csv and gpu-metrics.html into runDir.
|
||||
func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string) ([]byte, error) {
|
||||
|
||||
587
audit/internal/platform/sat_fan_stress.go
Normal file
587
audit/internal/platform/sat_fan_stress.go
Normal file
@@ -0,0 +1,587 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// FanStressOptions configures the fan-stress / thermal cycling test.
|
||||
type FanStressOptions struct {
|
||||
BaselineSec int // idle monitoring before and after load (default 30)
|
||||
Phase1DurSec int // first load phase duration in seconds (default 300)
|
||||
PauseSec int // pause between the two load phases (default 60)
|
||||
Phase2DurSec int // second load phase duration in seconds (default 300)
|
||||
SizeMB int // GPU memory to allocate per GPU during stress (default 64)
|
||||
GPUIndices []int // which GPU indices to stress (empty = all detected)
|
||||
}
|
||||
|
||||
// FanReading holds one fan sensor reading.
|
||||
type FanReading struct {
|
||||
Name string
|
||||
RPM float64
|
||||
}
|
||||
|
||||
// GPUStressMetric holds per-GPU metrics during the stress test.
|
||||
type GPUStressMetric struct {
|
||||
Index int
|
||||
TempC float64
|
||||
UsagePct float64
|
||||
PowerW float64
|
||||
ClockMHz float64
|
||||
Throttled bool // true if any throttle reason is active
|
||||
}
|
||||
|
||||
// FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
|
||||
type FanStressRow struct {
|
||||
TimestampUTC string
|
||||
ElapsedSec float64
|
||||
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
|
||||
GPUs []GPUStressMetric
|
||||
Fans []FanReading
|
||||
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
|
||||
SysPowerW float64 // DCMI system power reading
|
||||
}
|
||||
|
||||
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
||||
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
||||
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
||||
func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanStressOptions) (string, error) {
|
||||
if baseDir == "" {
|
||||
baseDir = "/var/log/bee-sat"
|
||||
}
|
||||
applyFanStressDefaults(&opts)
|
||||
|
||||
ts := time.Now().UTC().Format("20060102-150405")
|
||||
runDir := filepath.Join(baseDir, "fan-stress-"+ts)
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
verboseLog := filepath.Join(runDir, "verbose.log")
|
||||
|
||||
// Phase name shared between sampler goroutine and main goroutine.
|
||||
var phaseMu sync.Mutex
|
||||
currentPhase := "init"
|
||||
setPhase := func(name string) {
|
||||
phaseMu.Lock()
|
||||
currentPhase = name
|
||||
phaseMu.Unlock()
|
||||
}
|
||||
getPhase := func() string {
|
||||
phaseMu.Lock()
|
||||
defer phaseMu.Unlock()
|
||||
return currentPhase
|
||||
}
|
||||
|
||||
start := time.Now()
|
||||
var rowsMu sync.Mutex
|
||||
var allRows []FanStressRow
|
||||
|
||||
// Start background sampler (every second).
|
||||
stopCh := make(chan struct{})
|
||||
doneCh := make(chan struct{})
|
||||
go func() {
|
||||
defer close(doneCh)
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-stopCh:
|
||||
return
|
||||
case <-ticker.C:
|
||||
row := sampleFanStressRow(opts.GPUIndices, getPhase(), time.Since(start).Seconds())
|
||||
rowsMu.Lock()
|
||||
allRows = append(allRows, row)
|
||||
rowsMu.Unlock()
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
var summary strings.Builder
|
||||
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||
|
||||
stats := satStats{}
|
||||
|
||||
// idlePhase sleeps for durSec while the sampler stamps phaseName on each row.
|
||||
idlePhase := func(phaseName, stepName string, durSec int) {
|
||||
if ctx.Err() != nil {
|
||||
return
|
||||
}
|
||||
setPhase(phaseName)
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] start %s (idle %ds)", time.Now().UTC().Format(time.RFC3339), stepName, durSec),
|
||||
)
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
case <-time.After(time.Duration(durSec) * time.Second):
|
||||
}
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), stepName),
|
||||
)
|
||||
fmt.Fprintf(&summary, "%s_status=OK\n", stepName)
|
||||
stats.OK++
|
||||
}
|
||||
|
||||
// loadPhase runs bee-gpu-stress for durSec; sampler stamps phaseName on each row.
|
||||
loadPhase := func(phaseName, stepName string, durSec int) {
|
||||
if ctx.Err() != nil {
|
||||
return
|
||||
}
|
||||
setPhase(phaseName)
|
||||
var env []string
|
||||
if len(opts.GPUIndices) > 0 {
|
||||
ids := make([]string, len(opts.GPUIndices))
|
||||
for i, idx := range opts.GPUIndices {
|
||||
ids[i] = strconv.Itoa(idx)
|
||||
}
|
||||
env = []string{"CUDA_VISIBLE_DEVICES=" + strings.Join(ids, ",")}
|
||||
}
|
||||
cmd := []string{
|
||||
"bee-gpu-stress",
|
||||
"--seconds", strconv.Itoa(durSec),
|
||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||
}
|
||||
out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, env)
|
||||
_ = os.WriteFile(filepath.Join(runDir, stepName+".log"), out, 0644)
|
||||
if err != nil && err != context.Canceled && err.Error() != "signal: killed" {
|
||||
fmt.Fprintf(&summary, "%s_status=FAILED\n", stepName)
|
||||
stats.Failed++
|
||||
} else {
|
||||
fmt.Fprintf(&summary, "%s_status=OK\n", stepName)
|
||||
stats.OK++
|
||||
}
|
||||
}
|
||||
|
||||
// Execute test phases.
|
||||
idlePhase("baseline", "01-baseline", opts.BaselineSec)
|
||||
loadPhase("load1", "02-load1", opts.Phase1DurSec)
|
||||
idlePhase("pause", "03-pause", opts.PauseSec)
|
||||
loadPhase("load2", "04-load2", opts.Phase2DurSec)
|
||||
idlePhase("cooldown", "05-cooldown", opts.BaselineSec)
|
||||
|
||||
// Stop sampler and collect rows.
|
||||
close(stopCh)
|
||||
<-doneCh
|
||||
|
||||
rowsMu.Lock()
|
||||
rows := allRows
|
||||
rowsMu.Unlock()
|
||||
|
||||
// Analysis.
|
||||
throttled := analyzeThrottling(rows)
|
||||
maxGPUTemp := analyzeMaxTemp(rows, func(r FanStressRow) float64 {
|
||||
var m float64
|
||||
for _, g := range r.GPUs {
|
||||
if g.TempC > m {
|
||||
m = g.TempC
|
||||
}
|
||||
}
|
||||
return m
|
||||
})
|
||||
maxCPUTemp := analyzeMaxTemp(rows, func(r FanStressRow) float64 {
|
||||
return r.CPUMaxTempC
|
||||
})
|
||||
fanResponseSec := analyzeFanResponse(rows)
|
||||
|
||||
fmt.Fprintf(&summary, "throttling_detected=%v\n", throttled)
|
||||
fmt.Fprintf(&summary, "max_gpu_temp_c=%.1f\n", maxGPUTemp)
|
||||
fmt.Fprintf(&summary, "max_cpu_temp_c=%.1f\n", maxCPUTemp)
|
||||
if fanResponseSec >= 0 {
|
||||
fmt.Fprintf(&summary, "fan_response_sec=%.1f\n", fanResponseSec)
|
||||
} else {
|
||||
fmt.Fprintf(&summary, "fan_response_sec=N/A\n")
|
||||
}
|
||||
|
||||
// Throttling failure counts against overall result.
|
||||
if throttled {
|
||||
stats.Failed++
|
||||
}
|
||||
writeSATStats(&summary, stats)
|
||||
|
||||
// Write CSV outputs.
|
||||
if err := WriteFanStressCSV(filepath.Join(runDir, "metrics.csv"), rows, opts.GPUIndices); err != nil {
|
||||
return "", err
|
||||
}
|
||||
_ = WriteFanSensorsCSV(filepath.Join(runDir, "fan-sensors.csv"), rows)
|
||||
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
archive := filepath.Join(baseDir, "fan-stress-"+ts+".tar.gz")
|
||||
if err := createTarGz(archive, runDir); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return archive, nil
|
||||
}
|
||||
|
||||
func applyFanStressDefaults(opts *FanStressOptions) {
|
||||
if opts.BaselineSec <= 0 {
|
||||
opts.BaselineSec = 30
|
||||
}
|
||||
if opts.Phase1DurSec <= 0 {
|
||||
opts.Phase1DurSec = 300
|
||||
}
|
||||
if opts.PauseSec <= 0 {
|
||||
opts.PauseSec = 60
|
||||
}
|
||||
if opts.Phase2DurSec <= 0 {
|
||||
opts.Phase2DurSec = 300
|
||||
}
|
||||
if opts.SizeMB <= 0 {
|
||||
opts.SizeMB = 64
|
||||
}
|
||||
}
|
||||
|
||||
// sampleFanStressRow collects all metrics for one telemetry sample.
|
||||
func sampleFanStressRow(gpuIndices []int, phase string, elapsed float64) FanStressRow {
|
||||
row := FanStressRow{
|
||||
TimestampUTC: time.Now().UTC().Format(time.RFC3339),
|
||||
ElapsedSec: elapsed,
|
||||
Phase: phase,
|
||||
}
|
||||
row.GPUs = sampleGPUStressMetrics(gpuIndices)
|
||||
row.Fans, _ = sampleFanSpeeds()
|
||||
row.CPUMaxTempC = sampleCPUMaxTemp()
|
||||
row.SysPowerW = sampleSystemPower()
|
||||
return row
|
||||
}
|
||||
|
||||
// sampleGPUStressMetrics queries nvidia-smi for temperature, utilization, power,
|
||||
// clock frequency, and active throttle reasons for each GPU.
|
||||
func sampleGPUStressMetrics(gpuIndices []int) []GPUStressMetric {
|
||||
args := []string{
|
||||
"--query-gpu=index,temperature.gpu,utilization.gpu,power.draw,clocks.current.graphics,clocks_throttle_reasons.active",
|
||||
"--format=csv,noheader,nounits",
|
||||
}
|
||||
if len(gpuIndices) > 0 {
|
||||
ids := make([]string, len(gpuIndices))
|
||||
for i, idx := range gpuIndices {
|
||||
ids[i] = strconv.Itoa(idx)
|
||||
}
|
||||
args = append([]string{"--id=" + strings.Join(ids, ",")}, args...)
|
||||
}
|
||||
out, err := exec.Command("nvidia-smi", args...).Output()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
var metrics []GPUStressMetric
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.Split(line, ", ")
|
||||
if len(parts) < 6 {
|
||||
continue
|
||||
}
|
||||
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||
throttleVal := strings.TrimSpace(parts[5])
|
||||
// Throttled if active reasons bitmask is non-zero.
|
||||
throttled := throttleVal != "0x0000000000000000" &&
|
||||
throttleVal != "0x0" &&
|
||||
throttleVal != "0" &&
|
||||
throttleVal != "" &&
|
||||
throttleVal != "N/A"
|
||||
metrics = append(metrics, GPUStressMetric{
|
||||
Index: idx,
|
||||
TempC: parseGPUFloat(parts[1]),
|
||||
UsagePct: parseGPUFloat(parts[2]),
|
||||
PowerW: parseGPUFloat(parts[3]),
|
||||
ClockMHz: parseGPUFloat(parts[4]),
|
||||
Throttled: throttled,
|
||||
})
|
||||
}
|
||||
return metrics
|
||||
}
|
||||
|
||||
// sampleFanSpeeds reads fan RPM values from ipmitool sdr.
|
||||
func sampleFanSpeeds() ([]FanReading, error) {
|
||||
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return parseFanSpeeds(string(out)), nil
|
||||
}
|
||||
|
||||
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
||||
// Line format: "FAN1 | 2400.000 | RPM | ok"
|
||||
func parseFanSpeeds(raw string) []FanReading {
|
||||
var fans []FanReading
|
||||
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
||||
parts := strings.Split(line, "|")
|
||||
if len(parts) < 3 {
|
||||
continue
|
||||
}
|
||||
unit := strings.TrimSpace(parts[2])
|
||||
if !strings.EqualFold(unit, "RPM") {
|
||||
continue
|
||||
}
|
||||
valStr := strings.TrimSpace(parts[1])
|
||||
if strings.EqualFold(valStr, "na") || strings.EqualFold(valStr, "disabled") || valStr == "" {
|
||||
continue
|
||||
}
|
||||
val, err := strconv.ParseFloat(valStr, 64)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
fans = append(fans, FanReading{
|
||||
Name: strings.TrimSpace(parts[0]),
|
||||
RPM: val,
|
||||
})
|
||||
}
|
||||
return fans
|
||||
}
|
||||
|
||||
// sampleCPUMaxTemp returns the highest CPU/inlet temperature from ipmitool or sensors.
|
||||
func sampleCPUMaxTemp() float64 {
|
||||
out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()
|
||||
if err != nil {
|
||||
return sampleCPUTempViaSensors()
|
||||
}
|
||||
return parseIPMIMaxTemp(string(out))
|
||||
}
|
||||
|
||||
// parseIPMIMaxTemp extracts the maximum temperature from "ipmitool sdr type Temperature".
|
||||
func parseIPMIMaxTemp(raw string) float64 {
|
||||
var max float64
|
||||
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
||||
parts := strings.Split(line, "|")
|
||||
if len(parts) < 3 {
|
||||
continue
|
||||
}
|
||||
unit := strings.TrimSpace(parts[2])
|
||||
if !strings.Contains(strings.ToLower(unit), "degrees") {
|
||||
continue
|
||||
}
|
||||
valStr := strings.TrimSpace(parts[1])
|
||||
if strings.EqualFold(valStr, "na") || valStr == "" {
|
||||
continue
|
||||
}
|
||||
val, err := strconv.ParseFloat(valStr, 64)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if val > max {
|
||||
max = val
|
||||
}
|
||||
}
|
||||
return max
|
||||
}
|
||||
|
||||
// sampleCPUTempViaSensors falls back to lm-sensors when ipmitool is unavailable.
|
||||
func sampleCPUTempViaSensors() float64 {
|
||||
out, err := exec.Command("sensors", "-u").Output()
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
var max float64
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) < 2 {
|
||||
continue
|
||||
}
|
||||
if !strings.HasSuffix(fields[0], "_input:") {
|
||||
continue
|
||||
}
|
||||
val, err := strconv.ParseFloat(fields[1], 64)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if val > 0 && val < 150 && val > max {
|
||||
max = val
|
||||
}
|
||||
}
|
||||
return max
|
||||
}
|
||||
|
||||
// sampleSystemPower reads system power draw via DCMI.
|
||||
func sampleSystemPower() float64 {
|
||||
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
return parseDCMIPowerReading(string(out))
|
||||
}
|
||||
|
||||
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
|
||||
// Sample: " Instantaneous power reading: 500 Watts"
|
||||
func parseDCMIPowerReading(raw string) float64 {
|
||||
for _, line := range strings.Split(raw, "\n") {
|
||||
if !strings.Contains(strings.ToLower(line), "instantaneous") {
|
||||
continue
|
||||
}
|
||||
parts := strings.Fields(line)
|
||||
for i, p := range parts {
|
||||
if strings.EqualFold(p, "Watts") && i > 0 {
|
||||
val, err := strconv.ParseFloat(parts[i-1], 64)
|
||||
if err == nil {
|
||||
return val
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// analyzeThrottling returns true if any GPU reported an active throttle reason
|
||||
// during either load phase.
|
||||
func analyzeThrottling(rows []FanStressRow) bool {
|
||||
for _, row := range rows {
|
||||
if row.Phase != "load1" && row.Phase != "load2" {
|
||||
continue
|
||||
}
|
||||
for _, gpu := range row.GPUs {
|
||||
if gpu.Throttled {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// analyzeMaxTemp returns the maximum value of the given extractor across all rows.
|
||||
func analyzeMaxTemp(rows []FanStressRow, extract func(FanStressRow) float64) float64 {
|
||||
var max float64
|
||||
for _, row := range rows {
|
||||
if v := extract(row); v > max {
|
||||
max = v
|
||||
}
|
||||
}
|
||||
return max
|
||||
}
|
||||
|
||||
// analyzeFanResponse returns the seconds from load1 start until fan RPM first
|
||||
// increased by more than 5% above the baseline average. Returns -1 if undetermined.
|
||||
func analyzeFanResponse(rows []FanStressRow) float64 {
|
||||
// Compute baseline average fan RPM.
|
||||
var baseTotal, baseCount float64
|
||||
for _, row := range rows {
|
||||
if row.Phase != "baseline" {
|
||||
continue
|
||||
}
|
||||
for _, f := range row.Fans {
|
||||
baseTotal += f.RPM
|
||||
baseCount++
|
||||
}
|
||||
}
|
||||
if baseCount == 0 || baseTotal == 0 {
|
||||
return -1
|
||||
}
|
||||
baseAvg := baseTotal / baseCount
|
||||
threshold := baseAvg * 1.05 // 5% increase signals fan ramp-up
|
||||
|
||||
// Find elapsed time when load1 started.
|
||||
var load1Start float64 = -1
|
||||
for _, row := range rows {
|
||||
if row.Phase == "load1" {
|
||||
load1Start = row.ElapsedSec
|
||||
break
|
||||
}
|
||||
}
|
||||
if load1Start < 0 {
|
||||
return -1
|
||||
}
|
||||
|
||||
// Find first load1 row where average RPM crosses the threshold.
|
||||
for _, row := range rows {
|
||||
if row.Phase != "load1" {
|
||||
continue
|
||||
}
|
||||
var total, count float64
|
||||
for _, f := range row.Fans {
|
||||
total += f.RPM
|
||||
count++
|
||||
}
|
||||
if count > 0 && total/count >= threshold {
|
||||
return row.ElapsedSec - load1Start
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
// WriteFanStressCSV writes the wide-format metrics CSV with one row per second.
|
||||
// GPU columns are generated per index in gpuIndices order.
|
||||
func WriteFanStressCSV(path string, rows []FanStressRow, gpuIndices []int) error {
|
||||
if len(rows) == 0 {
|
||||
return os.WriteFile(path, []byte("no data\n"), 0644)
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
|
||||
// Header: fixed system columns + per-GPU columns.
|
||||
b.WriteString("timestamp_utc,elapsed_sec,phase,fan_avg_rpm,fan_min_rpm,fan_max_rpm,cpu_max_temp_c,sys_power_w")
|
||||
for _, idx := range gpuIndices {
|
||||
fmt.Fprintf(&b, ",gpu%d_temp_c,gpu%d_usage_pct,gpu%d_power_w,gpu%d_clock_mhz,gpu%d_throttled",
|
||||
idx, idx, idx, idx, idx)
|
||||
}
|
||||
b.WriteRune('\n')
|
||||
|
||||
for _, row := range rows {
|
||||
favg, fmin, fmax := fanRPMStats(row.Fans)
|
||||
fmt.Fprintf(&b, "%s,%.1f,%s,%.0f,%.0f,%.0f,%.1f,%.1f",
|
||||
row.TimestampUTC,
|
||||
row.ElapsedSec,
|
||||
row.Phase,
|
||||
favg, fmin, fmax,
|
||||
row.CPUMaxTempC,
|
||||
row.SysPowerW,
|
||||
)
|
||||
gpuByIdx := make(map[int]GPUStressMetric, len(row.GPUs))
|
||||
for _, g := range row.GPUs {
|
||||
gpuByIdx[g.Index] = g
|
||||
}
|
||||
for _, idx := range gpuIndices {
|
||||
g := gpuByIdx[idx]
|
||||
throttled := 0
|
||||
if g.Throttled {
|
||||
throttled = 1
|
||||
}
|
||||
fmt.Fprintf(&b, ",%.1f,%.1f,%.1f,%.0f,%d",
|
||||
g.TempC, g.UsagePct, g.PowerW, g.ClockMHz, throttled)
|
||||
}
|
||||
b.WriteRune('\n')
|
||||
}
|
||||
|
||||
return os.WriteFile(path, []byte(b.String()), 0644)
|
||||
}
|
||||
|
||||
// WriteFanSensorsCSV writes individual fan sensor readings in long (tidy) format.
|
||||
func WriteFanSensorsCSV(path string, rows []FanStressRow) error {
|
||||
var b strings.Builder
|
||||
b.WriteString("timestamp_utc,elapsed_sec,phase,fan_name,rpm\n")
|
||||
for _, row := range rows {
|
||||
for _, f := range row.Fans {
|
||||
fmt.Fprintf(&b, "%s,%.1f,%s,%s,%.0f\n",
|
||||
row.TimestampUTC, row.ElapsedSec, row.Phase, f.Name, f.RPM)
|
||||
}
|
||||
}
|
||||
return os.WriteFile(path, []byte(b.String()), 0644)
|
||||
}
|
||||
|
||||
// fanRPMStats computes average, min, max RPM across all fans in a sample row.
|
||||
func fanRPMStats(fans []FanReading) (avg, min, max float64) {
|
||||
if len(fans) == 0 {
|
||||
return 0, 0, 0
|
||||
}
|
||||
min = fans[0].RPM
|
||||
max = fans[0].RPM
|
||||
var total float64
|
||||
for _, f := range fans {
|
||||
total += f.RPM
|
||||
if f.RPM < min {
|
||||
min = f.RPM
|
||||
}
|
||||
if f.RPM > max {
|
||||
max = f.RPM
|
||||
}
|
||||
}
|
||||
return total / float64(len(fans)), min, max
|
||||
}
|
||||
@@ -3,6 +3,8 @@ package platform
|
||||
import (
|
||||
"errors"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
@@ -91,3 +93,90 @@ func TestClassifySATResult(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
raw := "nvme0n1 disk nvme\nsda disk usb\nloop0 loop\nsdb disk sata\n"
|
||||
got := parseStorageDevices(raw)
|
||||
want := []string{"/dev/nvme0n1", "/dev/sdb"}
|
||||
if len(got) != len(want) {
|
||||
t.Fatalf("len(devices)=%d want %d (%v)", len(got), len(want), got)
|
||||
}
|
||||
for i := range want {
|
||||
if got[i] != want[i] {
|
||||
t.Fatalf("devices[%d]=%q want %q", i, got[i], want[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveROCmSMICommandFromPATH(t *testing.T) {
|
||||
t.Setenv("PATH", t.TempDir())
|
||||
|
||||
toolPath := filepath.Join(os.Getenv("PATH"), "rocm-smi")
|
||||
if err := os.WriteFile(toolPath, []byte("#!/bin/sh\nexit 0\n"), 0755); err != nil {
|
||||
t.Fatalf("write rocm-smi: %v", err)
|
||||
}
|
||||
|
||||
cmd, err := resolveROCmSMICommand("--showproductname")
|
||||
if err != nil {
|
||||
t.Fatalf("resolveROCmSMICommand error: %v", err)
|
||||
}
|
||||
if len(cmd) != 2 {
|
||||
t.Fatalf("cmd len=%d want 2 (%v)", len(cmd), cmd)
|
||||
}
|
||||
if cmd[0] != toolPath {
|
||||
t.Fatalf("cmd[0]=%q want %q", cmd[0], toolPath)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveROCmSMICommandFallsBackToROCmTree(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
execPath := filepath.Join(tmp, "opt", "rocm", "bin", "rocm-smi")
|
||||
if err := os.MkdirAll(filepath.Dir(execPath), 0755); err != nil {
|
||||
t.Fatalf("mkdir: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(execPath, []byte("#!/bin/sh\nexit 0\n"), 0755); err != nil {
|
||||
t.Fatalf("write rocm-smi: %v", err)
|
||||
}
|
||||
|
||||
oldGlob := rocmSMIExecutableGlobs
|
||||
oldScriptGlobs := rocmSMIScriptGlobs
|
||||
rocmSMIExecutableGlobs = []string{execPath}
|
||||
rocmSMIScriptGlobs = nil
|
||||
t.Cleanup(func() {
|
||||
rocmSMIExecutableGlobs = oldGlob
|
||||
rocmSMIScriptGlobs = oldScriptGlobs
|
||||
})
|
||||
|
||||
t.Setenv("PATH", "")
|
||||
|
||||
cmd, err := resolveROCmSMICommand("--showallinfo")
|
||||
if err != nil {
|
||||
t.Fatalf("resolveROCmSMICommand error: %v", err)
|
||||
}
|
||||
if len(cmd) != 2 {
|
||||
t.Fatalf("cmd len=%d want 2 (%v)", len(cmd), cmd)
|
||||
}
|
||||
if cmd[0] != execPath {
|
||||
t.Fatalf("cmd[0]=%q want %q", cmd[0], execPath)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunROCmSMIReportsMissingCommand(t *testing.T) {
|
||||
oldLookPath := satLookPath
|
||||
oldExecGlobs := rocmSMIExecutableGlobs
|
||||
oldScriptGlobs := rocmSMIScriptGlobs
|
||||
satLookPath = func(string) (string, error) { return "", exec.ErrNotFound }
|
||||
rocmSMIExecutableGlobs = nil
|
||||
rocmSMIScriptGlobs = nil
|
||||
t.Cleanup(func() {
|
||||
satLookPath = oldLookPath
|
||||
rocmSMIExecutableGlobs = oldExecGlobs
|
||||
rocmSMIScriptGlobs = oldScriptGlobs
|
||||
})
|
||||
|
||||
if _, err := runROCmSMI("--showproductname"); err == nil {
|
||||
t.Fatal("expected missing rocm-smi error")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,15 +24,23 @@ var techDumpFixedCommands = []struct {
|
||||
{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
|
||||
{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
|
||||
{Name: "ipmitool", Args: []string{"sdr"}, File: "ipmitool-sdr.txt"},
|
||||
{Name: "nvme", Args: []string{"list", "-o", "json"}, File: "nvme-list.json"},
|
||||
}
|
||||
|
||||
var techDumpNvidiaCommands = []struct {
|
||||
Name string
|
||||
Args []string
|
||||
File string
|
||||
}{
|
||||
{Name: "nvidia-smi", Args: []string{"-q"}, File: "nvidia-smi-q.txt"},
|
||||
{Name: "nvidia-smi", Args: []string{"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown", "--format=csv,noheader,nounits"}, File: "nvidia-smi-query.csv"},
|
||||
{Name: "nvme", Args: []string{"list", "-o", "json"}, File: "nvme-list.json"},
|
||||
}
|
||||
|
||||
type lsblkDumpRoot struct {
|
||||
Blockdevices []struct {
|
||||
Name string `json:"name"`
|
||||
Type string `json:"type"`
|
||||
Tran string `json:"tran"`
|
||||
} `json:"blockdevices"`
|
||||
}
|
||||
|
||||
@@ -50,6 +58,15 @@ func (s *System) CaptureTechnicalDump(baseDir string) error {
|
||||
for _, cmd := range techDumpFixedCommands {
|
||||
writeCommandDump(filepath.Join(baseDir, cmd.File), cmd.Name, cmd.Args...)
|
||||
}
|
||||
switch s.DetectGPUVendor() {
|
||||
case "nvidia":
|
||||
for _, cmd := range techDumpNvidiaCommands {
|
||||
writeCommandDump(filepath.Join(baseDir, cmd.File), cmd.Name, cmd.Args...)
|
||||
}
|
||||
case "amd":
|
||||
writeROCmSMIDump(filepath.Join(baseDir, "rocm-smi.txt"))
|
||||
writeROCmSMIDump(filepath.Join(baseDir, "rocm-smi-showallinfo.txt"), "--showallinfo")
|
||||
}
|
||||
|
||||
for _, dev := range lsblkDumpDevices(filepath.Join(baseDir, "lsblk.json")) {
|
||||
writeCommandDump(filepath.Join(baseDir, "smartctl-"+sanitizeDumpName(dev)+".json"), "smartctl", "-j", "-a", "/dev/"+dev)
|
||||
@@ -69,6 +86,14 @@ func writeCommandDump(path, name string, args ...string) {
|
||||
_ = os.WriteFile(path, out, 0644)
|
||||
}
|
||||
|
||||
func writeROCmSMIDump(path string, args ...string) {
|
||||
out, err := runROCmSMI(args...)
|
||||
if err != nil && len(out) == 0 {
|
||||
return
|
||||
}
|
||||
_ = os.WriteFile(path, out, 0644)
|
||||
}
|
||||
|
||||
func lsblkDumpDevices(path string) []string {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
@@ -80,6 +105,9 @@ func lsblkDumpDevices(path string) []string {
|
||||
}
|
||||
var devices []string
|
||||
for _, dev := range root.Blockdevices {
|
||||
if strings.EqualFold(strings.TrimSpace(dev.Tran), "usb") {
|
||||
continue
|
||||
}
|
||||
if dev.Type == "disk" && strings.TrimSpace(dev.Name) != "" {
|
||||
devices = append(devices, strings.TrimSpace(dev.Name))
|
||||
}
|
||||
|
||||
@@ -12,12 +12,12 @@ func TestLSBLKDumpDevices(t *testing.T) {
|
||||
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "lsblk.json")
|
||||
if err := os.WriteFile(path, []byte(`{"blockdevices":[{"name":"sda","type":"disk"},{"name":"sda1","type":"part"},{"name":"nvme0n1","type":"disk"}]}`), 0644); err != nil {
|
||||
if err := os.WriteFile(path, []byte(`{"blockdevices":[{"name":"sda","type":"disk","tran":"usb"},{"name":"sda1","type":"part"},{"name":"nvme0n1","type":"disk","tran":"nvme"},{"name":"sdb","type":"disk","tran":"sata"}]}`), 0644); err != nil {
|
||||
t.Fatalf("write lsblk fixture: %v", err)
|
||||
}
|
||||
|
||||
got := lsblkDumpDevices(path)
|
||||
want := []string{"nvme0n1", "sda"}
|
||||
want := []string{"nvme0n1", "sdb"}
|
||||
if !reflect.DeepEqual(got, want) {
|
||||
t.Fatalf("lsblkDumpDevices=%v want %v", got, want)
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@ package tui
|
||||
import (
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
@@ -137,6 +138,8 @@ func (m model) updateConfirm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
},
|
||||
pollSATProgress("gpu-amd", since),
|
||||
)
|
||||
case actionRunFanStress:
|
||||
return m.startGPUStressTest()
|
||||
}
|
||||
case "ctrl+c":
|
||||
return m, tea.Quit
|
||||
@@ -148,9 +151,53 @@ func (m model) confirmCancelTarget() screen {
|
||||
switch m.pendingAction {
|
||||
case actionExportBundle:
|
||||
return screenExportTargets
|
||||
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT:
|
||||
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT, actionRunFanStress:
|
||||
return screenHealthCheck
|
||||
default:
|
||||
return screenMain
|
||||
}
|
||||
}
|
||||
|
||||
// hcFanStressOpts builds FanStressOptions for the selected mode, auto-detecting all GPUs.
|
||||
func hcFanStressOpts(hcMode int, application interface {
|
||||
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
||||
}) platform.FanStressOptions {
|
||||
// Phase durations per mode: [baseline, load1, pause, load2]
|
||||
type durations struct{ baseline, load1, pause, load2 int }
|
||||
modes := [3]durations{
|
||||
{30, 120, 30, 120}, // Quick: ~5 min total
|
||||
{60, 300, 60, 300}, // Standard: ~12 min total
|
||||
{60, 600, 120, 600}, // Express: ~24 min total
|
||||
}
|
||||
if hcMode < 0 || hcMode >= len(modes) {
|
||||
hcMode = 0
|
||||
}
|
||||
d := modes[hcMode]
|
||||
|
||||
// Use all detected NVIDIA GPUs.
|
||||
var indices []int
|
||||
if gpus, err := application.ListNvidiaGPUs(); err == nil {
|
||||
for _, g := range gpus {
|
||||
indices = append(indices, g.Index)
|
||||
}
|
||||
}
|
||||
|
||||
// Use minimum GPU memory size to fit all GPUs.
|
||||
sizeMB := 64
|
||||
if gpus, err := application.ListNvidiaGPUs(); err == nil {
|
||||
for _, g := range gpus {
|
||||
if g.MemoryMB > 0 && (sizeMB == 64 || g.MemoryMB < sizeMB) {
|
||||
sizeMB = g.MemoryMB / 16 // allocate 1/16 of VRAM per GPU
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return platform.FanStressOptions{
|
||||
BaselineSec: d.baseline,
|
||||
Phase1DurSec: d.load1,
|
||||
PauseSec: d.pause,
|
||||
Phase2DurSec: d.load2,
|
||||
SizeMB: sizeMB,
|
||||
GPUIndices: indices,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,3 +44,9 @@ type nvidiaSATDoneMsg struct {
|
||||
body string
|
||||
err error
|
||||
}
|
||||
|
||||
type gpuStressDoneMsg struct {
|
||||
title string
|
||||
body string
|
||||
err error
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@ package tui
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strings"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
@@ -18,16 +19,17 @@ const (
|
||||
|
||||
// Cursor positions in Health Check screen.
|
||||
const (
|
||||
hcCurGPU = 0
|
||||
hcCurMemory = 1
|
||||
hcCurStorage = 2
|
||||
hcCurCPU = 3
|
||||
hcCurSelectAll = 4
|
||||
hcCurModeQuick = 5
|
||||
hcCurModeStd = 6
|
||||
hcCurModeExpr = 7
|
||||
hcCurRunAll = 8
|
||||
hcCurTotal = 9
|
||||
hcCurGPU = 0
|
||||
hcCurMemory = 1
|
||||
hcCurStorage = 2
|
||||
hcCurCPU = 3
|
||||
hcCurSelectAll = 4
|
||||
hcCurModeQuick = 5
|
||||
hcCurModeStd = 6
|
||||
hcCurModeExpr = 7
|
||||
hcCurRunAll = 8
|
||||
hcCurFanStress = 9
|
||||
hcCurTotal = 10
|
||||
)
|
||||
|
||||
// hcModeDurations maps mode index (0=Quick,1=Standard,2=Express) to GPU stress seconds.
|
||||
@@ -82,6 +84,8 @@ func (m model) updateHealthCheck(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
m.hcMode = m.hcCursor - hcCurModeQuick
|
||||
case hcCurRunAll:
|
||||
return m.hcRunAll()
|
||||
case hcCurFanStress:
|
||||
return m.hcRunFanStress()
|
||||
}
|
||||
case "g", "G":
|
||||
return m.hcRunSingle(hcGPU)
|
||||
@@ -93,6 +97,8 @@ func (m model) updateHealthCheck(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
return m.hcRunSingle(hcCPU)
|
||||
case "r", "R":
|
||||
return m.hcRunAll()
|
||||
case "f", "F":
|
||||
return m.hcRunFanStress()
|
||||
case "a", "A":
|
||||
allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3]
|
||||
for i := range m.hcSel {
|
||||
@@ -143,6 +149,71 @@ func (m model) hcRunSingle(idx int) (tea.Model, tea.Cmd) {
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m model) hcRunFanStress() (tea.Model, tea.Cmd) {
|
||||
m.pendingAction = actionRunFanStress
|
||||
m.screen = screenConfirm
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
}
|
||||
|
||||
// startGPUStressTest launches the GPU Platform Stress Test and nvtop concurrently.
|
||||
// nvtop occupies the full terminal as a live chart; the stress test runs in background.
|
||||
func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
|
||||
opts := hcFanStressOpts(m.hcMode, m.app)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
m.gpuStressCancel = cancel
|
||||
m.gpuStressAborted = false
|
||||
m.screen = screenGPUStressRunning
|
||||
m.nvidiaSATCursor = 0
|
||||
|
||||
stressCmd := func() tea.Msg {
|
||||
result, err := m.app.RunFanStressTestResult(ctx, opts)
|
||||
return gpuStressDoneMsg{title: result.Title, body: result.Body, err: err}
|
||||
}
|
||||
|
||||
nvtopPath, lookErr := exec.LookPath("nvtop")
|
||||
if lookErr != nil {
|
||||
return m, stressCmd
|
||||
}
|
||||
|
||||
return m, tea.Batch(
|
||||
stressCmd,
|
||||
tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
|
||||
return nvtopClosedMsg{}
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
// updateGPUStressRunning handles keys on the GPU stress running screen.
|
||||
func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
switch msg.String() {
|
||||
case "o", "O":
|
||||
nvtopPath, err := exec.LookPath("nvtop")
|
||||
if err != nil {
|
||||
return m, nil
|
||||
}
|
||||
return m, tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
|
||||
return nvtopClosedMsg{}
|
||||
})
|
||||
case "a", "A":
|
||||
if m.gpuStressCancel != nil {
|
||||
m.gpuStressCancel()
|
||||
m.gpuStressCancel = nil
|
||||
}
|
||||
m.gpuStressAborted = true
|
||||
m.screen = screenHealthCheck
|
||||
m.cursor = 0
|
||||
case "ctrl+c":
|
||||
return m, tea.Quit
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func renderGPUStressRunning() string {
|
||||
return "GPU PLATFORM STRESS TEST\n\nTest is running...\n\n[o] Open nvtop [a] Abort test [ctrl+c] quit\n"
|
||||
}
|
||||
|
||||
func (m model) hcRunAll() (tea.Model, tea.Cmd) {
|
||||
for _, sel := range m.hcSel {
|
||||
if sel {
|
||||
@@ -300,8 +371,16 @@ func renderHealthCheck(m model) string {
|
||||
fmt.Fprintf(&b, "%s[ RUN ALL [R] ]\n", pfx)
|
||||
}
|
||||
|
||||
{
|
||||
pfx := " "
|
||||
if m.hcCursor == hcCurFanStress {
|
||||
pfx = "> "
|
||||
}
|
||||
fmt.Fprintf(&b, "%s[ GPU PLATFORM STRESS TEST [F] ] (thermal cycling, fan lag, throttle check)\n", pfx)
|
||||
}
|
||||
|
||||
fmt.Fprintln(&b)
|
||||
fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
|
||||
fmt.Fprint(&b, "[↑↓] move [space/enter] toggle [letter] single test [R] run all [Esc] back")
|
||||
fmt.Fprint(&b, "[↑↓] move [space/enter] toggle [letter] single test [R] run all [F] gpu stress [Esc] back")
|
||||
return b.String()
|
||||
}
|
||||
|
||||
@@ -3,7 +3,6 @@ package tui
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
@@ -102,7 +101,7 @@ func (m model) updateNvidiaSATSetup(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
return m, nil
|
||||
}
|
||||
|
||||
// startNvidiaSAT launches the SAT and nvtop.
|
||||
// startNvidiaSAT launches the NVIDIA acceptance pack.
|
||||
func (m model) startNvidiaSAT() (tea.Model, tea.Cmd) {
|
||||
var selectedGPUs []platform.NvidiaGPU
|
||||
for i, sel := range m.nvidiaGPUSel {
|
||||
@@ -142,31 +141,12 @@ func (m model) startNvidiaSAT() (tea.Model, tea.Cmd) {
|
||||
return nvidiaSATDoneMsg{title: result.Title, body: result.Body, err: err}
|
||||
}
|
||||
|
||||
nvtopPath, lookErr := exec.LookPath("nvtop")
|
||||
if lookErr != nil {
|
||||
// nvtop not available: just run the SAT, show running screen
|
||||
return m, satCmd
|
||||
}
|
||||
|
||||
return m, tea.Batch(
|
||||
satCmd,
|
||||
tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
|
||||
return nvtopClosedMsg{}
|
||||
}),
|
||||
)
|
||||
return m, satCmd
|
||||
}
|
||||
|
||||
// updateNvidiaSATRunning handles keys on the running screen.
|
||||
func (m model) updateNvidiaSATRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
switch msg.String() {
|
||||
case "o", "O":
|
||||
nvtopPath, err := exec.LookPath("nvtop")
|
||||
if err != nil {
|
||||
return m, nil
|
||||
}
|
||||
return m, tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
|
||||
return nvtopClosedMsg{}
|
||||
})
|
||||
case "a", "A":
|
||||
if m.nvidiaSATCancel != nil {
|
||||
m.nvidiaSATCancel()
|
||||
@@ -234,5 +214,5 @@ func renderNvidiaSATSetup(m model) string {
|
||||
|
||||
// renderNvidiaSATRunning renders the running screen.
|
||||
func renderNvidiaSATRunning() string {
|
||||
return "NVIDIA SAT\n\nTest is running...\n\n[o] Open nvtop [a] Abort test [ctrl+c] quit\n"
|
||||
return "NVIDIA SAT\n\nTest is running...\n\n[a] Abort test [ctrl+c] quit\n"
|
||||
}
|
||||
|
||||
@@ -27,6 +27,7 @@ const (
|
||||
screenConfirm screen = "confirm"
|
||||
screenNvidiaSATSetup screen = "nvidia_sat_setup"
|
||||
screenNvidiaSATRunning screen = "nvidia_sat_running"
|
||||
screenGPUStressRunning screen = "gpu_stress_running"
|
||||
)
|
||||
|
||||
type actionKind string
|
||||
@@ -40,7 +41,8 @@ const (
|
||||
actionRunMemorySAT actionKind = "run_memory_sat"
|
||||
actionRunStorageSAT actionKind = "run_storage_sat"
|
||||
actionRunCPUSAT actionKind = "run_cpu_sat"
|
||||
actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat"
|
||||
actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat"
|
||||
actionRunFanStress actionKind = "run_fan_stress"
|
||||
)
|
||||
|
||||
type model struct {
|
||||
@@ -92,6 +94,10 @@ type model struct {
|
||||
nvidiaSATCancel func()
|
||||
nvidiaSATAborted bool
|
||||
|
||||
// GPU Platform Stress Test running
|
||||
gpuStressCancel func()
|
||||
gpuStressAborted bool
|
||||
|
||||
// SAT verbose progress (CPU / Memory / Storage / AMD GPU)
|
||||
progressLines []string
|
||||
progressPrefix string
|
||||
@@ -188,6 +194,11 @@ func (m model) confirmBody() (string, string) {
|
||||
return "CPU test", "Run stress-ng? Mode: " + modes[m.hcMode]
|
||||
case actionRunAMDGPUSAT:
|
||||
return "AMD GPU test", "Run AMD GPU diagnostic pack (rocm-smi)?"
|
||||
case actionRunFanStress:
|
||||
modes := []string{"Quick (2×2min)", "Standard (2×5min)", "Express (2×10min)"}
|
||||
return "GPU Platform Stress Test", "Two-phase GPU thermal cycling test.\n" +
|
||||
"Monitors fans, temps, power — detects throttling.\n" +
|
||||
"Mode: " + modes[m.hcMode] + "\n\nAll NVIDIA GPUs will be stressed."
|
||||
default:
|
||||
return "Confirm", "Proceed?"
|
||||
}
|
||||
|
||||
@@ -108,6 +108,28 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
|
||||
return m.handleNvidiaGPUsMsg(msg)
|
||||
case nvtopClosedMsg:
|
||||
return m, nil
|
||||
case gpuStressDoneMsg:
|
||||
if m.gpuStressAborted {
|
||||
return m, nil
|
||||
}
|
||||
if m.gpuStressCancel != nil {
|
||||
m.gpuStressCancel()
|
||||
m.gpuStressCancel = nil
|
||||
}
|
||||
m.prevScreen = screenHealthCheck
|
||||
m.screen = screenOutput
|
||||
m.title = msg.title
|
||||
if msg.err != nil {
|
||||
body := strings.TrimSpace(msg.body)
|
||||
if body == "" {
|
||||
m.body = fmt.Sprintf("ERROR: %v", msg.err)
|
||||
} else {
|
||||
m.body = fmt.Sprintf("%s\n\nERROR: %v", body, msg.err)
|
||||
}
|
||||
} else {
|
||||
m.body = msg.body
|
||||
}
|
||||
return m, m.refreshSnapshotCmd()
|
||||
case nvidiaSATDoneMsg:
|
||||
if m.nvidiaSATAborted {
|
||||
return m, nil
|
||||
@@ -152,6 +174,8 @@ func (m model) updateKey(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
return m.updateNvidiaSATSetup(msg)
|
||||
case screenNvidiaSATRunning:
|
||||
return m.updateNvidiaSATRunning(msg)
|
||||
case screenGPUStressRunning:
|
||||
return m.updateGPUStressRunning(msg)
|
||||
case screenExportTargets:
|
||||
return m.updateMenu(msg, len(m.targets), m.handleExportTargetsMenu)
|
||||
case screenInterfacePick:
|
||||
|
||||
@@ -78,6 +78,8 @@ func (m model) View() string {
|
||||
body = renderNvidiaSATSetup(m)
|
||||
case screenNvidiaSATRunning:
|
||||
body = renderNvidiaSATRunning()
|
||||
case screenGPUStressRunning:
|
||||
body = renderGPUStressRunning()
|
||||
case screenOutput:
|
||||
body = fmt.Sprintf("%s\n\n%s\n\n[enter/esc] back [ctrl+c] quit\n", m.title, strings.TrimSpace(m.body))
|
||||
default:
|
||||
|
||||
@@ -32,6 +32,6 @@ lb config noauto \
|
||||
--memtest none \
|
||||
--iso-volume "EASY-BEE" \
|
||||
--iso-application "EASY-BEE" \
|
||||
--bootappend-live "boot=live components console=tty0 console=ttyS0,115200n8 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
||||
--bootappend-live "boot=live components console=tty0 console=ttyS0,115200n8 loglevel=3 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
||||
--apt-recommends false \
|
||||
"${@}"
|
||||
|
||||
@@ -46,7 +46,8 @@ CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"
|
||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
|
||||
EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
|
||||
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ]; then
|
||||
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
||||
&& [ "$(ls "$CACHE_DIR/lib/libnvidia-ptxjitcompiler.so."* 2>/dev/null | wc -l)" -gt 0 ]; then
|
||||
echo "=== NVIDIA cached, skipping build ==="
|
||||
echo "cache: $CACHE_DIR"
|
||||
echo "modules: $(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l) .ko files"
|
||||
@@ -129,8 +130,10 @@ else
|
||||
echo "WARNING: no firmware/ dir found in installer (may be needed for Hopper GPUs)"
|
||||
fi
|
||||
|
||||
# Copy ALL userspace library files
|
||||
for lib in libnvidia-ml libcuda; do
|
||||
# Copy ALL userspace library files.
|
||||
# libnvidia-ptxjitcompiler is required by libcuda for PTX JIT compilation
|
||||
# (cuModuleLoadDataEx with PTX source) — without it CUDA_ERROR_JIT_COMPILER_NOT_FOUND.
|
||||
for lib in libnvidia-ml libcuda libnvidia-ptxjitcompiler; do
|
||||
count=0
|
||||
for f in $(find "$EXTRACT_DIR" -maxdepth 1 -name "${lib}.so.*" 2>/dev/null); do
|
||||
cp "$f" "$CACHE_DIR/lib/" && count=$((count+1))
|
||||
@@ -147,7 +150,7 @@ ko_count=$(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l)
|
||||
[ "$ko_count" -gt 0 ] || { echo "ERROR: no .ko files built in $CACHE_DIR/modules/"; exit 1; }
|
||||
|
||||
# Create soname symlinks: use [0-9][0-9]* to avoid circular symlink (.so.1 has single digit)
|
||||
for lib in libnvidia-ml libcuda; do
|
||||
for lib in libnvidia-ml libcuda libnvidia-ptxjitcompiler; do
|
||||
versioned=$(ls "$CACHE_DIR/lib/${lib}.so."[0-9][0-9]* 2>/dev/null | head -1)
|
||||
[ -n "$versioned" ] || continue
|
||||
base=$(basename "$versioned")
|
||||
|
||||
@@ -34,6 +34,63 @@ mkdir -p "${CACHE_ROOT}"
|
||||
: "${GOMODCACHE:=${CACHE_ROOT}/go-mod}"
|
||||
export GOCACHE GOMODCACHE
|
||||
|
||||
resolve_audit_version() {
|
||||
if [ -n "${BEE_AUDIT_VERSION:-}" ]; then
|
||||
echo "${BEE_AUDIT_VERSION}"
|
||||
return 0
|
||||
fi
|
||||
|
||||
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'audit/v*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||
if [ -z "${tag}" ]; then
|
||||
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'v*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||
fi
|
||||
case "${tag}" in
|
||||
audit/v*)
|
||||
echo "${tag#audit/v}"
|
||||
return 0
|
||||
;;
|
||||
v*)
|
||||
echo "${tag#v}"
|
||||
return 0
|
||||
;;
|
||||
"")
|
||||
;;
|
||||
*)
|
||||
echo "${tag}"
|
||||
return 0
|
||||
;;
|
||||
esac
|
||||
|
||||
if [ -n "${AUDIT_VERSION:-}" ]; then
|
||||
echo "${AUDIT_VERSION}"
|
||||
return 0
|
||||
fi
|
||||
|
||||
date +%Y%m%d
|
||||
}
|
||||
|
||||
# ISO image versioned separately from the audit binary (iso/v* tags).
|
||||
resolve_iso_version() {
|
||||
if [ -n "${BEE_ISO_VERSION:-}" ]; then
|
||||
echo "${BEE_ISO_VERSION}"
|
||||
return 0
|
||||
fi
|
||||
|
||||
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'iso/v*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||
case "${tag}" in
|
||||
iso/v*)
|
||||
echo "${tag#iso/v}"
|
||||
return 0
|
||||
;;
|
||||
esac
|
||||
|
||||
# Fall back to audit version so the name is still meaningful
|
||||
resolve_audit_version
|
||||
}
|
||||
|
||||
AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
|
||||
ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
|
||||
|
||||
# Auto-detect kernel ABI: refresh apt index, then query current linux-image-amd64 dependency.
|
||||
# If headers for the detected ABI are not yet installed (kernel updated since image build),
|
||||
# install them on the fly so NVIDIA modules and ISO kernel always match.
|
||||
@@ -64,6 +121,7 @@ fi
|
||||
|
||||
echo "=== bee ISO build ==="
|
||||
echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
|
||||
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
|
||||
echo ""
|
||||
|
||||
echo "=== syncing git submodules ==="
|
||||
@@ -83,7 +141,7 @@ if [ "$NEED_BUILD" = "1" ]; then
|
||||
cd "${REPO_ROOT}/audit"
|
||||
GOOS=linux GOARCH=amd64 CGO_ENABLED=0 \
|
||||
go build \
|
||||
-ldflags "-s -w -X main.Version=${AUDIT_VERSION:-$(date +%Y%m%d)}" \
|
||||
-ldflags "-s -w -X main.Version=${AUDIT_VERSION_EFFECTIVE}" \
|
||||
-o "$BEE_BIN" \
|
||||
./cmd/bee
|
||||
echo "binary: $BEE_BIN"
|
||||
@@ -230,8 +288,8 @@ mkdir -p "${OVERLAY_STAGE_DIR}/etc"
|
||||
BUILD_DATE="$(date +%Y-%m-%d)"
|
||||
GIT_COMMIT="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo unknown)"
|
||||
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
|
||||
BEE_ISO_VERSION=${AUDIT_VERSION}
|
||||
BEE_AUDIT_VERSION=${AUDIT_VERSION}
|
||||
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
|
||||
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
|
||||
BUILD_DATE=${BUILD_DATE}
|
||||
GIT_COMMIT=${GIT_COMMIT}
|
||||
DEBIAN_VERSION=${DEBIAN_VERSION}
|
||||
@@ -272,7 +330,7 @@ lb build 2>&1
|
||||
|
||||
# live-build outputs live-image-amd64.hybrid.iso in LB_DIR
|
||||
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
|
||||
ISO_OUT="${DIST_DIR}/bee-debian${DEBIAN_VERSION}-v${AUDIT_VERSION}-amd64.iso"
|
||||
ISO_OUT="${DIST_DIR}/bee-debian${DEBIAN_VERSION}-v${ISO_VERSION_EFFECTIVE}-amd64.iso"
|
||||
if [ -f "$ISO_RAW" ]; then
|
||||
cp "$ISO_RAW" "$ISO_OUT"
|
||||
echo ""
|
||||
|
||||
@@ -10,12 +10,17 @@ echo " ╚══════╝╚═╝ ╚═╝╚══════╝
|
||||
echo ""
|
||||
|
||||
menuentry "EASY-BEE" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE (NVIDIA GSP=off)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE (fail-safe)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ memtest noapic noapm nodma nomce nolapic nosmp vga=normal
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
|
||||
18
iso/builder/config/bootloaders/isolinux/live.cfg.in
Normal file
18
iso/builder/config/bootloaders/isolinux/live.cfg.in
Normal file
@@ -0,0 +1,18 @@
|
||||
label live-@FLAVOUR@-normal
|
||||
menu label ^EASY-BEE
|
||||
menu default
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.nvidia.mode=normal
|
||||
|
||||
label live-@FLAVOUR@-gsp-off
|
||||
menu label EASY-BEE (^NVIDIA GSP=off)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off
|
||||
|
||||
label live-@FLAVOUR@-failsafe
|
||||
menu label EASY-BEE (^fail-safe)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal
|
||||
@@ -5,6 +5,21 @@ set -e
|
||||
|
||||
echo "=== bee chroot setup ==="
|
||||
|
||||
ensure_bee_console_user() {
|
||||
if id bee >/dev/null 2>&1; then
|
||||
usermod -d /home/bee -s /bin/sh bee 2>/dev/null || true
|
||||
else
|
||||
useradd -d /home/bee -m -s /bin/sh -U bee
|
||||
fi
|
||||
|
||||
mkdir -p /home/bee
|
||||
chown -R bee:bee /home/bee
|
||||
echo "bee:eeb" | chpasswd
|
||||
usermod -aG sudo bee 2>/dev/null || true
|
||||
}
|
||||
|
||||
ensure_bee_console_user
|
||||
|
||||
# Enable bee services
|
||||
systemctl enable bee-network.service
|
||||
systemctl enable bee-nvidia.service
|
||||
@@ -15,6 +30,8 @@ systemctl enable bee-sshsetup.service
|
||||
systemctl enable ssh.service
|
||||
systemctl enable qemu-guest-agent.service 2>/dev/null || true
|
||||
systemctl enable serial-getty@ttyS0.service 2>/dev/null || true
|
||||
systemctl enable serial-getty@ttyS1.service 2>/dev/null || true
|
||||
systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
|
||||
|
||||
# Ensure scripts are executable
|
||||
chmod +x /usr/local/bin/bee-network.sh 2>/dev/null || true
|
||||
@@ -23,6 +40,7 @@ chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-tui 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
||||
|
||||
# Reload udev rules
|
||||
udevadm control --reload-rules 2>/dev/null || true
|
||||
|
||||
@@ -2,36 +2,100 @@
|
||||
# 9001-amd-rocm.hook.chroot — install AMD ROCm SMI tool for Instinct GPU monitoring.
|
||||
# Runs inside the live-build chroot. Adds AMD's apt repository and installs
|
||||
# rocm-smi-lib which provides the `rocm-smi` CLI (analogous to nvidia-smi).
|
||||
#
|
||||
# AMD does NOT publish Debian Bookworm packages. The repo uses Ubuntu codenames
|
||||
# (jammy/noble). We use jammy (Ubuntu 22.04) — its packages install cleanly on
|
||||
# Debian 12 (Bookworm) due to compatible glibc/libstdc++.
|
||||
# Tried versions newest-first; falls back if a point release is missing.
|
||||
|
||||
set -e
|
||||
|
||||
ROCM_VERSION="6.4"
|
||||
# Ubuntu codename to use for the AMD repo (Debian has no AMD packages).
|
||||
ROCM_UBUNTU_DIST="jammy"
|
||||
|
||||
# ROCm point-releases to try newest-first. AMD drops old point releases
|
||||
# from the repo, so we walk backwards until one responds 200.
|
||||
ROCM_CANDIDATES="6.3.4 6.3.3 6.3.2 6.3.1 6.3 6.2.4 6.2.3 6.2.2 6.2.1 6.2"
|
||||
|
||||
ROCM_KEYRING="/etc/apt/keyrings/rocm.gpg"
|
||||
ROCM_LIST="/etc/apt/sources.list.d/rocm.list"
|
||||
|
||||
echo "=== AMD ROCm ${ROCM_VERSION}: adding repository ==="
|
||||
APT_UPDATED=0
|
||||
|
||||
mkdir -p /etc/apt/keyrings
|
||||
|
||||
ensure_tool() {
|
||||
tool="$1"
|
||||
pkg="$2"
|
||||
if command -v "${tool}" >/dev/null 2>&1; then
|
||||
return 0
|
||||
fi
|
||||
if [ "${APT_UPDATED}" -eq 0 ]; then
|
||||
apt-get update -qq
|
||||
APT_UPDATED=1
|
||||
fi
|
||||
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends "${pkg}"
|
||||
}
|
||||
|
||||
ensure_cert_bundle() {
|
||||
if [ -s /etc/ssl/certs/ca-certificates.crt ]; then
|
||||
return 0
|
||||
fi
|
||||
if [ "${APT_UPDATED}" -eq 0 ]; then
|
||||
apt-get update -qq
|
||||
APT_UPDATED=1
|
||||
fi
|
||||
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends ca-certificates
|
||||
}
|
||||
|
||||
# live-build chroot may not include fetch/signing tools yet
|
||||
if ! ensure_cert_bundle || ! ensure_tool wget wget || ! ensure_tool gpg gpg; then
|
||||
echo "WARN: failed to install wget/gpg/ca-certificates prerequisites — skipping ROCm install"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Download and import AMD GPG key
|
||||
if ! wget -qO- "https://repo.radeon.com/rocm/rocm.gpg.key" \
|
||||
| gpg --dearmor > "${ROCM_KEYRING}"; then
|
||||
| gpg --dearmor --yes --output "${ROCM_KEYRING}"; then
|
||||
echo "WARN: failed to fetch AMD ROCm GPG key — skipping ROCm install"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
cat > "${ROCM_LIST}" <<EOF
|
||||
deb [arch=amd64 signed-by=${ROCM_KEYRING}] https://repo.radeon.com/rocm/apt/${ROCM_VERSION} bookworm main
|
||||
# Try each ROCm version until apt-get update succeeds.
|
||||
# AMD repo uses Ubuntu codenames; bookworm is not published — use jammy.
|
||||
ROCM_VERSION=""
|
||||
for candidate in ${ROCM_CANDIDATES}; do
|
||||
cat > "${ROCM_LIST}" <<EOF
|
||||
deb [arch=amd64 signed-by=${ROCM_KEYRING}] https://repo.radeon.com/rocm/apt/${candidate} ${ROCM_UBUNTU_DIST} main
|
||||
EOF
|
||||
if apt-get update -qq 2>/dev/null; then
|
||||
ROCM_VERSION="${candidate}"
|
||||
echo "=== AMD ROCm ${ROCM_VERSION} (${ROCM_UBUNTU_DIST}): repository available ==="
|
||||
break
|
||||
fi
|
||||
echo "WARN: ROCm ${candidate} not available, trying next..."
|
||||
rm -f "${ROCM_LIST}"
|
||||
done
|
||||
|
||||
apt-get update -qq
|
||||
if [ -z "${ROCM_VERSION}" ]; then
|
||||
echo "WARN: no ROCm apt repository available — skipping ROCm install"
|
||||
rm -f "${ROCM_KEYRING}"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# rocm-smi-lib provides the rocm-smi CLI tool for GPU monitoring
|
||||
if apt-get install -y --no-install-recommends rocm-smi-lib 2>/dev/null; then
|
||||
echo "=== AMD ROCm: rocm-smi installed ==="
|
||||
if DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends rocm-smi-lib; then
|
||||
echo "=== AMD ROCm: rocm-smi-lib installed ==="
|
||||
if [ -x /opt/rocm/bin/rocm-smi ]; then
|
||||
ln -sf /opt/rocm/bin/rocm-smi /usr/local/bin/rocm-smi
|
||||
else
|
||||
smi_path="$(find /opt -path '*/bin/rocm-smi' -type f 2>/dev/null | sort | tail -1)"
|
||||
if [ -n "${smi_path}" ]; then
|
||||
ln -sf "${smi_path}" /usr/local/bin/rocm-smi
|
||||
fi
|
||||
fi
|
||||
rocm-smi --version 2>/dev/null || true
|
||||
else
|
||||
echo "WARN: rocm-smi-lib install failed — GPU monitoring unavailable"
|
||||
echo "WARN: rocm-smi-lib install failed — AMD GPU monitoring unavailable"
|
||||
fi
|
||||
|
||||
# Clean up apt lists to keep ISO size down
|
||||
|
||||
@@ -26,6 +26,15 @@ echo ""
|
||||
|
||||
KVER=$(uname -r)
|
||||
info "kernel: $KVER"
|
||||
NVIDIA_BOOT_MODE="normal"
|
||||
for arg in $(cat /proc/cmdline 2>/dev/null); do
|
||||
case "$arg" in
|
||||
bee.nvidia.mode=*)
|
||||
NVIDIA_BOOT_MODE="${arg#*=}"
|
||||
;;
|
||||
esac
|
||||
done
|
||||
info "nvidia boot mode: ${NVIDIA_BOOT_MODE}"
|
||||
|
||||
# --- PATH & binaries ---
|
||||
echo "-- PATH & binaries --"
|
||||
@@ -53,17 +62,25 @@ else
|
||||
fail "NVIDIA ko dir missing: $KO_DIR"
|
||||
fi
|
||||
|
||||
for mod in nvidia nvidia_modeset nvidia_uvm; do
|
||||
if /sbin/lsmod 2>/dev/null | grep -q "^nvidia "; then
|
||||
ok "module loaded: nvidia"
|
||||
else
|
||||
fail "module NOT loaded: nvidia"
|
||||
fi
|
||||
|
||||
for mod in nvidia_modeset nvidia_uvm; do
|
||||
if /sbin/lsmod 2>/dev/null | grep -q "^$mod "; then
|
||||
ok "module loaded: $mod"
|
||||
elif [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; then
|
||||
fail "module NOT loaded in normal mode: $mod"
|
||||
else
|
||||
fail "module NOT loaded: $mod"
|
||||
warn "module not loaded in GSP-off mode: $mod"
|
||||
fi
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo "-- NVIDIA device nodes --"
|
||||
for dev in nvidiactl nvidia0 nvidia-uvm; do
|
||||
for dev in nvidiactl nvidia0; do
|
||||
if [ -e "/dev/$dev" ]; then
|
||||
ok "/dev/$dev exists"
|
||||
else
|
||||
@@ -71,6 +88,14 @@ for dev in nvidiactl nvidia0 nvidia-uvm; do
|
||||
fi
|
||||
done
|
||||
|
||||
if [ -e /dev/nvidia-uvm ]; then
|
||||
ok "/dev/nvidia-uvm exists"
|
||||
elif [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; then
|
||||
fail "/dev/nvidia-uvm missing in normal mode"
|
||||
else
|
||||
warn "/dev/nvidia-uvm missing — CUDA stress path may be unavailable until loaded on demand"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "-- nvidia-smi --"
|
||||
if PATH="/usr/local/bin:$PATH" command -v nvidia-smi >/dev/null 2>&1; then
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
export PATH="$PATH:/usr/local/bin"
|
||||
export PATH="$PATH:/usr/local/bin:/opt/rocm/bin:/opt/rocm/sbin"
|
||||
|
||||
menu() {
|
||||
if [ -x /usr/local/bin/bee-tui ]; then
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
[Journal]
|
||||
ForwardToConsole=yes
|
||||
TTYPath=/dev/ttyS0
|
||||
MaxLevelConsole=debug
|
||||
@@ -5,9 +5,9 @@ Before=bee-web.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/bin/sh -c '/usr/local/bin/bee audit --runtime livecd --output file:/appdata/bee/export/bee-audit.json; rc=$?; if [ "$rc" -ne 0 ]; then echo "[bee-audit] WARN: audit exited with rc=$rc"; fi; exit 0'
|
||||
StandardOutput=append:/appdata/bee/export/bee-audit.log
|
||||
StandardError=append:/appdata/bee/export/bee-audit.log
|
||||
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-audit.log /bin/sh -c '/usr/local/bin/bee audit --runtime livecd --output file:/appdata/bee/export/bee-audit.json; rc=$?; if [ "$rc" -ne 0 ]; then echo "[bee-audit] WARN: audit exited with rc=$rc"; fi; exit 0'
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
RemainAfterExit=yes
|
||||
|
||||
[Install]
|
||||
|
||||
16
iso/overlay/etc/systemd/system/bee-journal-mirror@.service
Normal file
16
iso/overlay/etc/systemd/system/bee-journal-mirror@.service
Normal file
@@ -0,0 +1,16 @@
|
||||
[Unit]
|
||||
Description=Bee: mirror system journal to %I
|
||||
After=systemd-journald.service
|
||||
Requires=systemd-journald.service
|
||||
ConditionPathExists=/dev/%I
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
ExecStart=/bin/sh -c 'exec journalctl -f -n 200 -o short-monotonic > /dev/%I'
|
||||
Restart=always
|
||||
RestartSec=1
|
||||
StandardOutput=null
|
||||
StandardError=journal
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
@@ -5,9 +5,9 @@ Before=network-online.target bee-audit.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/usr/local/bin/bee-network.sh
|
||||
StandardOutput=append:/appdata/bee/export/bee-network.log
|
||||
StandardError=append:/appdata/bee/export/bee-network.log
|
||||
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-network.log /usr/local/bin/bee-network.sh
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
RemainAfterExit=yes
|
||||
|
||||
[Install]
|
||||
|
||||
@@ -5,9 +5,9 @@ Before=bee-audit.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/usr/local/bin/bee-nvidia-load
|
||||
StandardOutput=append:/appdata/bee/export/bee-nvidia.log
|
||||
StandardError=append:/appdata/bee/export/bee-nvidia.log
|
||||
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-nvidia.log /usr/local/bin/bee-nvidia-load
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
RemainAfterExit=yes
|
||||
|
||||
[Install]
|
||||
|
||||
@@ -5,9 +5,9 @@ Before=bee-audit.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/bin/sh -c '/usr/local/bin/bee preflight --output file:/appdata/bee/export/runtime-health.json; rc=$?; if [ "$rc" -ne 0 ]; then echo "[bee-preflight] WARN: preflight exited with rc=$rc"; fi; exit 0'
|
||||
StandardOutput=append:/appdata/bee/export/runtime-health.log
|
||||
StandardError=append:/appdata/bee/export/runtime-health.log
|
||||
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/runtime-health.log /bin/sh -c '/usr/local/bin/bee preflight --output file:/appdata/bee/export/runtime-health.json; rc=$?; if [ "$rc" -ne 0 ]; then echo "[bee-preflight] WARN: preflight exited with rc=$rc"; fi; exit 0'
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
RemainAfterExit=yes
|
||||
|
||||
[Install]
|
||||
|
||||
@@ -5,9 +5,9 @@ Before=ssh.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/usr/local/bin/bee-sshsetup
|
||||
StandardOutput=append:/appdata/bee/export/bee-sshsetup.log
|
||||
StandardError=append:/appdata/bee/export/bee-sshsetup.log
|
||||
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-sshsetup.log /usr/local/bin/bee-sshsetup
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
RemainAfterExit=yes
|
||||
|
||||
[Install]
|
||||
|
||||
@@ -5,11 +5,11 @@ Wants=bee-audit.service
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
ExecStart=/usr/local/bin/bee web --listen :80 --audit-path /appdata/bee/export/bee-audit.json --export-dir /appdata/bee/export --title "Bee Hardware Audit"
|
||||
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-web.log /usr/local/bin/bee web --listen :80 --audit-path /appdata/bee/export/bee-audit.json --export-dir /appdata/bee/export --title "Bee Hardware Audit"
|
||||
Restart=always
|
||||
RestartSec=2
|
||||
StandardOutput=append:/appdata/bee/export/bee-web.log
|
||||
StandardError=append:/appdata/bee/export/bee-web.log
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
||||
29
iso/overlay/usr/local/bin/bee-log-run
Normal file
29
iso/overlay/usr/local/bin/bee-log-run
Normal file
@@ -0,0 +1,29 @@
|
||||
#!/bin/bash
|
||||
# bee-log-run — run a command, append its output to a file, and keep stdout/stderr
|
||||
# connected to systemd so journald and the serial console also receive the logs.
|
||||
|
||||
set -o pipefail
|
||||
|
||||
log_file="$1"
|
||||
shift
|
||||
|
||||
if [ -z "$log_file" ] || [ "$#" -eq 0 ]; then
|
||||
echo "usage: $0 <log-file> <command> [args...]" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
mkdir -p "$(dirname "$log_file")"
|
||||
|
||||
serial_sink() {
|
||||
local tty="$1"
|
||||
if [ -w "$tty" ]; then
|
||||
cat > "$tty"
|
||||
else
|
||||
cat > /dev/null
|
||||
fi
|
||||
}
|
||||
|
||||
"$@" 2>&1 | tee -a "$log_file" \
|
||||
>(serial_sink /dev/ttyS0) \
|
||||
>(serial_sink /dev/ttyS1)
|
||||
exit "${PIPESTATUS[0]}"
|
||||
@@ -22,24 +22,61 @@ fi
|
||||
log "module dir: $NVIDIA_KO_DIR"
|
||||
ls "$NVIDIA_KO_DIR"/*.ko 2>/dev/null | sed 's/^/ /' || true
|
||||
|
||||
# Some kernels expose backlight helper symbols only after loading `video`.
|
||||
modprobe video >/dev/null 2>&1 && log "loaded helper module: video" || log "helper module unavailable: video"
|
||||
cmdline_param() {
|
||||
key="$1"
|
||||
for token in $(cat /proc/cmdline 2>/dev/null); do
|
||||
case "$token" in
|
||||
"$key"=*)
|
||||
echo "${token#*=}"
|
||||
return 0
|
||||
;;
|
||||
esac
|
||||
done
|
||||
return 1
|
||||
}
|
||||
|
||||
# Load modules via insmod (direct load — no depmod needed)
|
||||
for mod in nvidia nvidia-modeset nvidia-uvm; do
|
||||
nvidia_mode="$(cmdline_param bee.nvidia.mode || true)"
|
||||
if [ -z "$nvidia_mode" ]; then
|
||||
nvidia_mode="normal"
|
||||
fi
|
||||
log "boot mode: $nvidia_mode"
|
||||
|
||||
load_module() {
|
||||
mod="$1"
|
||||
shift
|
||||
ko="$NVIDIA_KO_DIR/${mod}.ko"
|
||||
[ -f "$ko" ] || ko="$NVIDIA_KO_DIR/${mod//-/_}.ko"
|
||||
if [ -f "$ko" ]; then
|
||||
if insmod "$ko"; then
|
||||
log "loaded: $mod"
|
||||
else
|
||||
log "WARN: failed to load: $mod"
|
||||
dmesg | tail -n 5 | sed 's/^/ dmesg: /' || true
|
||||
fi
|
||||
else
|
||||
if [ ! -f "$ko" ]; then
|
||||
log "WARN: not found: $ko"
|
||||
return 1
|
||||
fi
|
||||
done
|
||||
if insmod "$ko" "$@"; then
|
||||
log "loaded: $mod $*"
|
||||
return 0
|
||||
fi
|
||||
log "WARN: failed to load: $mod"
|
||||
dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
|
||||
return 1
|
||||
}
|
||||
|
||||
case "$nvidia_mode" in
|
||||
normal|full)
|
||||
if ! load_module nvidia; then
|
||||
exit 1
|
||||
fi
|
||||
load_module nvidia-modeset || true
|
||||
load_module nvidia-uvm || true
|
||||
;;
|
||||
gsp-off|safe|*)
|
||||
# NVIDIA documents that GSP firmware is enabled by default on newer GPUs and can
|
||||
# be disabled via NVreg_EnableGpuFirmware=0. Safe mode keeps the live ISO on the
|
||||
# conservative path for platforms where full boot-time GSP init is unstable.
|
||||
if ! load_module nvidia NVreg_EnableGpuFirmware=0; then
|
||||
exit 1
|
||||
fi
|
||||
log "GSP-off mode: skipping nvidia-modeset and nvidia-uvm during boot"
|
||||
;;
|
||||
esac
|
||||
|
||||
# Create /dev/nvidia* device nodes (udev rules absent since we use .run installer)
|
||||
nvidia_major=$(grep -m1 ' nvidiactl$' /proc/devices | awk '{print $1}')
|
||||
@@ -61,8 +98,11 @@ if [ -n "$uvm_major" ]; then
|
||||
&& log "created /dev/nvidia-uvm (major $uvm_major)" \
|
||||
|| log "WARN: /dev/nvidia-uvm already exists"
|
||||
mknod -m 666 /dev/nvidia-uvm-tools c "$uvm_major" 1 || true
|
||||
else
|
||||
log "WARN: nvidia-uvm not in /proc/devices"
|
||||
fi
|
||||
|
||||
# Refresh dynamic linker cache so that NVIDIA/NCCL libs injected into /usr/lib/
|
||||
# are visible to dlopen() calls (libcuda, libnvidia-ptxjitcompiler, libnccl, etc.)
|
||||
ldconfig 2>/dev/null || true
|
||||
log "ldconfig refreshed"
|
||||
|
||||
log "done"
|
||||
|
||||
Reference in New Issue
Block a user