Compare commits

..

3 Commits
v7.7 ... v7.10

Author SHA1 Message Date
Mikhail Chusavitin
525ed8b8fc Fix GPU clock lock normalization for Blackwell (clocks.max.* unsupported)
clocks.max.graphics / clocks.max.memory CSV fields return exit status 2 on
RTX PRO 6000 Blackwell (driver 98.x), causing the entire gpu inventory query
to fail and clock lock to be skipped → normalization: partial.

Fix:
- Add minimal fallback query (index,uuid,name,pci.bus_id,vbios_version,
  power.limit) that succeeds even without clock fields
- Add enrichGPUInfoWithMaxClocks: parses "Max Clocks" section of
  nvidia-smi -q verbose output to fill MaxGraphicsClockMHz /
  MaxMemoryClockMHz when CSV fields fail
- Move nvidia-smi -q execution before queryBenchmarkGPUInfo so its output
  is available for clock enrichment immediately after
- Tests: cover enrichment and skip-if-populated cases

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-12 13:33:54 +03:00
Mikhail Chusavitin
4f94ebcb2c Add HPC tuning: PCIe ASPM off, C-states, performance CPU governor
- grub.cfg + isolinux/live.cfg.in: add pcie_aspm=off,
  intel_idle.max_cstate=1 and processor.max_cstate=1 to all
  non-failsafe boot entries
- bee-hpc-tuning: new script that sets all CPU cores to performance
  governor via sysfs and logs THP state at boot
- bee-hpc-tuning.service: runs before bee-nvidia and bee-audit
- 9000-bee-setup.hook.chroot: enable service and mark script executable

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-12 13:07:32 +03:00
Mikhail Chusavitin
05c1fde233 Warn on PCIe link speed degradation and collect lspci -vvv in techdump
- collector/pcie: add applyPCIeLinkSpeedWarning that sets status=Warning
  and ErrorDescription when current link speed is below maximum negotiated
  speed (e.g. Gen1 running on a Gen5 slot)
- collector/pcie: add pcieLinkSpeedRank helper for Gen string comparison
- collector/pcie_filter_test: cover degraded and healthy link speed cases
- platform/techdump: collect lspci -vvv → lspci-vvv.txt for LnkCap/LnkSta

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-12 12:42:17 +03:00
10 changed files with 359 additions and 29 deletions

View File

@@ -2,6 +2,7 @@ package collector
import ( import (
"bee/audit/internal/schema" "bee/audit/internal/schema"
"fmt"
"log/slog" "log/slog"
"os/exec" "os/exec"
"strconv" "strconv"
@@ -172,6 +173,9 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
// SVendor/SDevice available but not in schema — skip // SVendor/SDevice available but not in schema — skip
// Warn if PCIe link is running below its maximum negotiated speed.
applyPCIeLinkSpeedWarning(&dev)
return dev return dev
} }
@@ -241,6 +245,41 @@ func readPCIStringAttribute(bdf, attribute string) (string, bool) {
return value, true return value, true
} }
// applyPCIeLinkSpeedWarning sets the device status to Warning if the current PCIe link
// speed is below the maximum negotiated speed supported by both ends.
func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil {
return
}
if pcieLinkSpeedRank(*dev.LinkSpeed) < pcieLinkSpeedRank(*dev.MaxLinkSpeed) {
warn := statusWarning
dev.Status = &warn
desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
dev.ErrorDescription = &desc
}
}
// pcieLinkSpeedRank returns a numeric rank for a normalized Gen string (e.g. "Gen4" → 4).
// Returns 0 for unrecognised values so comparisons fail safe.
func pcieLinkSpeedRank(gen string) int {
switch gen {
case "Gen1":
return 1
case "Gen2":
return 2
case "Gen3":
return 3
case "Gen4":
return 4
case "Gen5":
return 5
case "Gen6":
return 6
default:
return 0
}
}
func normalizePCILinkSpeed(raw string) string { func normalizePCILinkSpeed(raw string) string {
raw = strings.TrimSpace(strings.ToLower(raw)) raw = strings.TrimSpace(strings.ToLower(raw))
switch { switch {

View File

@@ -1,6 +1,7 @@
package collector package collector
import ( import (
"bee/audit/internal/schema"
"encoding/json" "encoding/json"
"strings" "strings"
"testing" "testing"
@@ -141,3 +142,77 @@ func TestNormalizePCILinkSpeed(t *testing.T) {
} }
} }
} }
func TestApplyPCIeLinkSpeedWarning(t *testing.T) {
ptr := func(s string) *string { return &s }
tests := []struct {
name string
linkSpeed *string
maxSpeed *string
wantWarning bool
wantGenIn string // substring expected in ErrorDescription when warning
}{
{
name: "degraded Gen1 vs Gen5",
linkSpeed: ptr("Gen1"),
maxSpeed: ptr("Gen5"),
wantWarning: true,
wantGenIn: "Gen1",
},
{
name: "at max Gen5",
linkSpeed: ptr("Gen5"),
maxSpeed: ptr("Gen5"),
wantWarning: false,
},
{
name: "degraded Gen4 vs Gen5",
linkSpeed: ptr("Gen4"),
maxSpeed: ptr("Gen5"),
wantWarning: true,
wantGenIn: "Gen4",
},
{
name: "missing current speed — no warning",
linkSpeed: nil,
maxSpeed: ptr("Gen5"),
wantWarning: false,
},
{
name: "missing max speed — no warning",
linkSpeed: ptr("Gen1"),
maxSpeed: nil,
wantWarning: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
dev := schema.HardwarePCIeDevice{}
ok := statusOK
dev.Status = &ok
dev.LinkSpeed = tt.linkSpeed
dev.MaxLinkSpeed = tt.maxSpeed
applyPCIeLinkSpeedWarning(&dev)
gotWarn := dev.Status != nil && *dev.Status == statusWarning
if gotWarn != tt.wantWarning {
t.Fatalf("wantWarning=%v gotWarning=%v (status=%v)", tt.wantWarning, gotWarn, dev.Status)
}
if tt.wantWarning {
if dev.ErrorDescription == nil {
t.Fatal("expected ErrorDescription to be set")
}
if !strings.Contains(*dev.ErrorDescription, tt.wantGenIn) {
t.Fatalf("ErrorDescription %q does not contain %q", *dev.ErrorDescription, tt.wantGenIn)
}
} else {
if dev.ErrorDescription != nil {
t.Fatalf("unexpected ErrorDescription: %s", *dev.ErrorDescription)
}
}
})
}
}

View File

@@ -121,15 +121,22 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
var serverIdleOK, serverLoadedOK bool var serverIdleOK, serverLoadedOK bool
var serverLoadedSamples int var serverLoadedSamples int
// Run nvidia-smi -q first: used both for the log file and as a fallback
// source of max clock values when CSV clock fields are unsupported.
var nvsmiQOut []byte
if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
nvsmiQOut = out
_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
}
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected) infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
if infoErr != nil { if infoErr != nil {
result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error()) result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
result.Normalization.Status = "partial" result.Normalization.Status = "partial"
} }
// Enrich with max clocks from verbose output — covers GPUs where
if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil { // clocks.max.* CSV fields are unsupported (e.g. Blackwell / driver 98.x).
_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644) enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQOut)
}
activeApps, err := queryActiveComputeApps(selected) activeApps, err := queryActiveComputeApps(selected)
if err == nil && len(activeApps) > 0 { if err == nil && len(activeApps) > 0 {
@@ -370,9 +377,13 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
// Fields are tried in order; the first successful query wins. Extended fields // Fields are tried in order; the first successful query wins. Extended fields
// (attribute.multiprocessor_count, power.default_limit) are not supported on // (attribute.multiprocessor_count, power.default_limit) are not supported on
// all driver versions, so we fall back to the base set if the full query fails. // all driver versions, so we fall back to the base set if the full query fails.
// The minimal fallback omits clock fields entirely — clocks.max.* returns
// exit status 2 on some GPU generations (e.g. Blackwell); max clocks are
// then recovered from nvidia-smi -q via enrichGPUInfoWithMaxClocks.
var benchmarkGPUInfoQueries = []struct { var benchmarkGPUInfoQueries = []struct {
fields string fields string
extended bool // whether this query includes optional extended fields extended bool // whether this query includes optional extended fields
minimal bool // clock fields omitted; max clocks must be filled separately
}{ }{
{ {
fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit", fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit",
@@ -382,6 +393,83 @@ var benchmarkGPUInfoQueries = []struct {
fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics", fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics",
extended: false, extended: false,
}, },
{
fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit",
minimal: true,
},
}
// enrichGPUInfoWithMaxClocks fills MaxGraphicsClockMHz / MaxMemoryClockMHz for
// any GPU in infoByIndex where those values are still zero. It parses the
// "Max Clocks" section of nvidia-smi -q output (already available as nvsmiQ).
// This is the fallback for GPUs (e.g. Blackwell) where clocks.max.* CSV fields
// return exit status 2 but the verbose query works fine.
func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
if len(infoByIndex) == 0 || len(nvsmiQ) == 0 {
return
}
// Build bus_id → index map for matching verbose sections to GPU indices.
busToBenchIdx := make(map[string]int, len(infoByIndex))
for idx, info := range infoByIndex {
if info.BusID != "" {
// nvidia-smi -q uses "GPU 00000000:4E:00.0" (8-digit domain),
// while --query-gpu returns the same format; normalise to lower.
busToBenchIdx[strings.ToLower(strings.TrimSpace(info.BusID))] = idx
}
}
// Split the verbose output into per-GPU sections on "^GPU " lines.
gpuSectionRe := regexp.MustCompile(`(?m)^GPU\s+([\dA-Fa-f:\.]+)`)
maxGfxRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Graphics\s*:\s*(\d+)\s*MHz`)
maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
sectionStarts := gpuSectionRe.FindAllSubmatchIndex(nvsmiQ, -1)
for i, loc := range sectionStarts {
busID := strings.ToLower(string(nvsmiQ[loc[2]:loc[3]]))
benchIdx, ok := busToBenchIdx[busID]
if !ok {
// Bus IDs from verbose output may have a different domain prefix;
// try suffix match on the slot portion (XX:XX.X).
for k, v := range busToBenchIdx {
if strings.HasSuffix(k, busID) || strings.HasSuffix(busID, k) {
benchIdx = v
ok = true
break
}
}
}
if !ok {
continue
}
info := infoByIndex[benchIdx]
if info.MaxGraphicsClockMHz > 0 && info.MaxMemoryClockMHz > 0 {
continue // already populated
}
end := len(nvsmiQ)
if i+1 < len(sectionStarts) {
end = sectionStarts[i+1][0]
}
section := nvsmiQ[loc[0]:end]
if info.MaxGraphicsClockMHz == 0 {
if m := maxGfxRe.FindSubmatch(section); m != nil {
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil {
info.MaxGraphicsClockMHz = v
}
}
}
if info.MaxMemoryClockMHz == 0 {
if m := maxMemRe.FindSubmatch(section); m != nil {
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil {
info.MaxMemoryClockMHz = v
}
}
}
infoByIndex[benchIdx] = info
}
} }
func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) { func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
@@ -409,9 +497,13 @@ func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
continue continue
} }
minFields := 6
if !q.minimal {
minFields = 9
}
infoByIndex := make(map[int]benchmarkGPUInfo, len(rows)) infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
for _, row := range rows { for _, row := range rows {
if len(row) < 9 { if len(row) < minFields {
continue continue
} }
idx, err := strconv.Atoi(strings.TrimSpace(row[0])) idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
@@ -419,24 +511,26 @@ func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
continue continue
} }
info := benchmarkGPUInfo{ info := benchmarkGPUInfo{
Index: idx, Index: idx,
UUID: strings.TrimSpace(row[1]), UUID: strings.TrimSpace(row[1]),
Name: strings.TrimSpace(row[2]), Name: strings.TrimSpace(row[2]),
BusID: strings.TrimSpace(row[3]), BusID: strings.TrimSpace(row[3]),
VBIOS: strings.TrimSpace(row[4]), VBIOS: strings.TrimSpace(row[4]),
PowerLimitW: parseBenchmarkFloat(row[5]), PowerLimitW: parseBenchmarkFloat(row[5]),
MaxGraphicsClockMHz: parseBenchmarkFloat(row[6]),
MaxMemoryClockMHz: parseBenchmarkFloat(row[7]),
} }
if len(row) >= 9 { if !q.minimal {
info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8]) info.MaxGraphicsClockMHz = parseBenchmarkFloat(row[6])
} info.MaxMemoryClockMHz = parseBenchmarkFloat(row[7])
if q.extended { if len(row) >= 9 {
if len(row) >= 10 { info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8])
info.MultiprocessorCount = int(parseBenchmarkFloat(row[9]))
} }
if len(row) >= 11 { if q.extended {
info.DefaultPowerLimitW = parseBenchmarkFloat(row[10]) if len(row) >= 10 {
info.MultiprocessorCount = int(parseBenchmarkFloat(row[9]))
}
if len(row) >= 11 {
info.DefaultPowerLimitW = parseBenchmarkFloat(row[10])
}
} }
} }
infoByIndex[idx] = info infoByIndex[idx] = info

View File

@@ -178,3 +178,67 @@ func TestRenderBenchmarkReportIncludesTerminalChartsWithoutANSI(t *testing.T) {
t.Fatalf("report should not contain ANSI escapes\n%s", report) t.Fatalf("report should not contain ANSI escapes\n%s", report)
} }
} }
func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
t.Parallel()
nvsmiQ := []byte(`
GPU 00000000:4E:00.0
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
Clocks
Graphics : 2422 MHz
Memory : 12481 MHz
Max Clocks
Graphics : 2430 MHz
SM : 2430 MHz
Memory : 12481 MHz
Video : 2107 MHz
GPU 00000000:4F:00.0
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
Max Clocks
Graphics : 2430 MHz
Memory : 12481 MHz
`)
infoByIndex := map[int]benchmarkGPUInfo{
0: {Index: 0, BusID: "00000000:4E:00.0"},
1: {Index: 1, BusID: "00000000:4F:00.0"},
}
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
}
if infoByIndex[0].MaxMemoryClockMHz != 12481 {
t.Errorf("GPU 0 MaxMemoryClockMHz = %v, want 12481", infoByIndex[0].MaxMemoryClockMHz)
}
if infoByIndex[1].MaxGraphicsClockMHz != 2430 {
t.Errorf("GPU 1 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[1].MaxGraphicsClockMHz)
}
if infoByIndex[1].MaxMemoryClockMHz != 12481 {
t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
}
}
func TestEnrichGPUInfoWithMaxClocksSkipsPopulated(t *testing.T) {
t.Parallel()
nvsmiQ := []byte(`
GPU 00000000:4E:00.0
Max Clocks
Graphics : 9999 MHz
Memory : 9999 MHz
`)
// Already populated — must not be overwritten.
infoByIndex := map[int]benchmarkGPUInfo{
0: {Index: 0, BusID: "00000000:4E:00.0", MaxGraphicsClockMHz: 2430, MaxMemoryClockMHz: 12481},
}
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
}
}

View File

@@ -20,6 +20,7 @@ var techDumpFixedCommands = []struct {
{Name: "dmidecode", Args: []string{"-t", "4"}, File: "dmidecode-type4.txt"}, {Name: "dmidecode", Args: []string{"-t", "4"}, File: "dmidecode-type4.txt"},
{Name: "dmidecode", Args: []string{"-t", "17"}, File: "dmidecode-type17.txt"}, {Name: "dmidecode", Args: []string{"-t", "17"}, File: "dmidecode-type17.txt"},
{Name: "lspci", Args: []string{"-vmm", "-D"}, File: "lspci-vmm.txt"}, {Name: "lspci", Args: []string{"-vmm", "-D"}, File: "lspci-vmm.txt"},
{Name: "lspci", Args: []string{"-vvv"}, File: "lspci-vvv.txt"},
{Name: "lsblk", Args: []string{"-J", "-d", "-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL"}, File: "lsblk.json"}, {Name: "lsblk", Args: []string{"-J", "-d", "-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL"}, File: "lsblk.json"},
{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"}, {Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"}, {Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},

View File

@@ -11,18 +11,18 @@ echo " Hardware Audit LiveCD"
echo "" echo ""
menuentry "EASY-BEE" { menuentry "EASY-BEE" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd @INITRD_LIVE@ initrd @INITRD_LIVE@
} }
submenu "EASY-BEE (advanced options) -->" { submenu "EASY-BEE (advanced options) -->" {
menuentry "EASY-BEE — GSP=off" { menuentry "EASY-BEE — GSP=off" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd @INITRD_LIVE@ initrd @INITRD_LIVE@
} }
menuentry "EASY-BEE — KMS (no nomodeset)" { menuentry "EASY-BEE — KMS (no nomodeset)" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd @INITRD_LIVE@ initrd @INITRD_LIVE@
} }

View File

@@ -3,31 +3,31 @@ label live-@FLAVOUR@-normal
menu default menu default
linux @LINUX@ linux @LINUX@
initrd @INITRD@ initrd @INITRD@
append @APPEND_LIVE@ bee.nvidia.mode=normal append @APPEND_LIVE@ bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
label live-@FLAVOUR@-kms label live-@FLAVOUR@-kms
menu label EASY-BEE (^graphics/KMS) menu label EASY-BEE (^graphics/KMS)
linux @LINUX@ linux @LINUX@
initrd @INITRD@ initrd @INITRD@
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
label live-@FLAVOUR@-toram label live-@FLAVOUR@-toram
menu label EASY-BEE (^load to RAM) menu label EASY-BEE (^load to RAM)
linux @LINUX@ linux @LINUX@
initrd @INITRD@ initrd @INITRD@
append @APPEND_LIVE@ toram bee.nvidia.mode=normal append @APPEND_LIVE@ toram bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
label live-@FLAVOUR@-gsp-off label live-@FLAVOUR@-gsp-off
menu label EASY-BEE (^NVIDIA GSP=off) menu label EASY-BEE (^NVIDIA GSP=off)
linux @LINUX@ linux @LINUX@
initrd @INITRD@ initrd @INITRD@
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
label live-@FLAVOUR@-kms-gsp-off label live-@FLAVOUR@-kms-gsp-off
menu label EASY-BEE (g^raphics/KMS, GSP=off) menu label EASY-BEE (g^raphics/KMS, GSP=off)
linux @LINUX@ linux @LINUX@
initrd @INITRD@ initrd @INITRD@
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
label live-@FLAVOUR@-failsafe label live-@FLAVOUR@-failsafe
menu label EASY-BEE (^fail-safe) menu label EASY-BEE (^fail-safe)

View File

@@ -25,6 +25,7 @@ ensure_bee_console_user() {
ensure_bee_console_user ensure_bee_console_user
# Enable common bee services # Enable common bee services
systemctl enable bee-hpc-tuning.service
systemctl enable bee-network.service systemctl enable bee-network.service
systemctl enable bee-preflight.service systemctl enable bee-preflight.service
systemctl enable bee-audit.service systemctl enable bee-audit.service
@@ -55,6 +56,7 @@ fi
# nogpu: no GPU services needed # nogpu: no GPU services needed
# Ensure scripts are executable # Ensure scripts are executable
chmod +x /usr/local/bin/bee-hpc-tuning 2>/dev/null || true
chmod +x /usr/local/bin/bee-network.sh 2>/dev/null || true chmod +x /usr/local/bin/bee-network.sh 2>/dev/null || true
chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true

View File

@@ -0,0 +1,14 @@
[Unit]
Description=Bee: HPC tuning (CPU governor, C-states)
After=local-fs.target
Before=bee-nvidia.service bee-audit.service
[Service]
Type=oneshot
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-hpc-tuning.log /usr/local/bin/bee-hpc-tuning
StandardOutput=journal
StandardError=journal
RemainAfterExit=yes
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,41 @@
#!/bin/sh
# bee-hpc-tuning — apply HPC tuning for deterministic benchmarking
# Called by bee-hpc-tuning.service at boot.
log() { echo "[bee-hpc-tuning] $*"; }
# ── CPU governor ────────────────────────────────────────────────────────────
# Set all CPU cores to performance governor via sysfs.
# cpupower is not available; write directly to scaling_governor.
governor_ok=0
governor_fail=0
for gov_path in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do
[ -f "$gov_path" ] || continue
if echo performance > "$gov_path" 2>/dev/null; then
governor_ok=$((governor_ok + 1))
else
governor_fail=$((governor_fail + 1))
fi
done
if [ "$governor_ok" -gt 0 ] && [ "$governor_fail" -eq 0 ]; then
log "CPU governor set to performance on ${governor_ok} core(s)"
elif [ "$governor_ok" -gt 0 ]; then
log "WARN: CPU governor: ${governor_ok} OK, ${governor_fail} failed"
elif [ "$governor_fail" -gt 0 ]; then
log "WARN: failed to set CPU governor on ${governor_fail} core(s)"
else
log "WARN: no cpufreq scaling_governor paths found (C-state governor or HW-controlled)"
fi
# ── Transparent Huge Pages ───────────────────────────────────────────────────
# Kernel cmdline sets transparent_hugepage=always at boot, but confirm and log.
thp_path=/sys/kernel/mm/transparent_hugepage/enabled
if [ -f "$thp_path" ]; then
current=$(cat "$thp_path" 2>/dev/null)
log "transparent_hugepage: ${current}"
else
log "WARN: transparent_hugepage sysfs path not found"
fi
log "done"