Compare commits
5 Commits
ec89616585
...
v9.9
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7ce73e34a4 | ||
|
|
8a21809ade | ||
|
|
626763e31d | ||
|
|
0b8a2ff83f | ||
|
|
2c22b01fe3 |
@@ -3,6 +3,7 @@ package collector
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"bufio"
|
||||
"context"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
@@ -17,14 +18,6 @@ var execDmidecode = func(typeNum string) (string, error) {
|
||||
return string(out), nil
|
||||
}
|
||||
|
||||
var execIpmitool = func(args ...string) (string, error) {
|
||||
out, err := exec.Command("ipmitool", args...).Output()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(out), nil
|
||||
}
|
||||
|
||||
// collectBoard runs dmidecode for types 0, 1, 2 and returns the board record
|
||||
// plus the BIOS firmware entry. Any failure is logged and returns zero values.
|
||||
func collectBoard() (schema.HardwareBoard, []schema.HardwareFirmwareRecord) {
|
||||
@@ -80,19 +73,23 @@ func parseBoard(type1, type2 string) schema.HardwareBoard {
|
||||
|
||||
// collectBMCFirmware collects BMC firmware version via ipmitool mc info.
|
||||
// Returns nil if ipmitool is missing, /dev/ipmi0 is absent, or any error occurs.
|
||||
func collectBMCFirmware() []schema.HardwareFirmwareRecord {
|
||||
func collectBMCFirmware(manufacturer string) []schema.HardwareFirmwareRecord {
|
||||
if _, err := exec.LookPath("ipmitool"); err != nil {
|
||||
return nil
|
||||
}
|
||||
if _, err := os.Stat("/dev/ipmi0"); err != nil {
|
||||
return nil
|
||||
}
|
||||
out, err := execIpmitool("mc", "info")
|
||||
profile := selectIPMIProfile(manufacturer)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), profile.mcInfoTimeout)
|
||||
defer cancel()
|
||||
cmd := exec.CommandContext(ctx, "ipmitool", "mc", "info")
|
||||
raw, err := cmd.Output()
|
||||
if err != nil {
|
||||
slog.Info("bmc: ipmitool mc info unavailable", "err", err)
|
||||
return nil
|
||||
}
|
||||
version := parseBMCFirmwareRevision(out)
|
||||
version := parseBMCFirmwareRevision(string(raw))
|
||||
if version == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -23,7 +23,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||
board, biosFW := collectBoard()
|
||||
snap.Board = board
|
||||
snap.Firmware = append(snap.Firmware, biosFW...)
|
||||
snap.Firmware = append(snap.Firmware, collectBMCFirmware()...)
|
||||
snap.Firmware = append(snap.Firmware, collectBMCFirmware(derefString(snap.Board.Manufacturer))...)
|
||||
|
||||
snap.CPUs = collectCPUs()
|
||||
|
||||
@@ -45,7 +45,8 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||
snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
|
||||
snap.Storage = enrichStorageWithVROC(snap.Storage, snap.PCIeDevices)
|
||||
snap.Storage = appendUniqueStorage(snap.Storage, collectRAIDStorage(snap.PCIeDevices))
|
||||
snap.PowerSupplies = collectPSUs()
|
||||
snap.VROCLicense = collectVROCLicense(snap.PCIeDevices)
|
||||
snap.PowerSupplies = collectPSUs(derefString(snap.Board.Manufacturer))
|
||||
snap.PowerSupplies = enrichPSUsWithTelemetry(snap.PowerSupplies, sensorDoc)
|
||||
snap.Sensors = buildSensorsFromDoc(sensorDoc)
|
||||
finalizeSnapshot(&snap, collectedAt)
|
||||
|
||||
92
audit/internal/collector/ipmi_profile.go
Normal file
92
audit/internal/collector/ipmi_profile.go
Normal file
@@ -0,0 +1,92 @@
|
||||
package collector
|
||||
|
||||
// Package-level IPMI tuning profiles.
|
||||
//
|
||||
// Each profile is matched by board manufacturer (already known before PSU
|
||||
// collection runs). The profile drives two things:
|
||||
// - Per-command timeouts — prevents infinite hangs on slow BMCs.
|
||||
// - FRU early-exit — streaming parser stops reading once all PSU entries
|
||||
// are found, avoiding the tail of non-PSU FRU records.
|
||||
//
|
||||
// To add a new vendor: append to ipmiProfiles. The first matching entry wins.
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ipmiProfile holds tuning parameters for one or more board manufacturers.
|
||||
type ipmiProfile struct {
|
||||
// name is shown in log messages.
|
||||
name string
|
||||
// manufacturers is a list of lowercase substrings matched against the
|
||||
// board manufacturer string from dmidecode type 1.
|
||||
manufacturers []string
|
||||
// fruTimeout is the hard deadline for the entire `ipmitool fru print`
|
||||
// command. Zero means no timeout (not recommended).
|
||||
fruTimeout time.Duration
|
||||
// sdrTimeout is the hard deadline for `ipmitool sdr`.
|
||||
sdrTimeout time.Duration
|
||||
// mcInfoTimeout is the hard deadline for `ipmitool mc info`.
|
||||
mcInfoTimeout time.Duration
|
||||
// fruEarlyExit instructs the streaming FRU parser to stop reading
|
||||
// after it has found at least one PSU entry and the current block is
|
||||
// complete. Useful on servers with many non-PSU FRU devices.
|
||||
fruEarlyExit bool
|
||||
}
|
||||
|
||||
// ipmiProfiles is the ordered list of profiles. First match wins.
|
||||
var ipmiProfiles = []ipmiProfile{
|
||||
{
|
||||
// Lenovo XCC-based servers (ThinkSystem SR6xx / SR8xx / ST series).
|
||||
// SR650 V3 has 54 FRU devices; each IPMI read takes ~2 s, so the
|
||||
// full `fru print` scan takes ~108 s on a loaded BMC. Enable early
|
||||
// exit so collection stops once PSU records are found.
|
||||
name: "lenovo",
|
||||
manufacturers: []string{"lenovo"},
|
||||
fruTimeout: 90 * time.Second,
|
||||
sdrTimeout: 45 * time.Second,
|
||||
mcInfoTimeout: 15 * time.Second,
|
||||
fruEarlyExit: true,
|
||||
},
|
||||
{
|
||||
// HPE iLO-based servers (ProLiant DL/ML/BL).
|
||||
name: "hpe",
|
||||
manufacturers: []string{"hp", "hewlett packard"},
|
||||
fruTimeout: 60 * time.Second,
|
||||
sdrTimeout: 30 * time.Second,
|
||||
mcInfoTimeout: 10 * time.Second,
|
||||
fruEarlyExit: false,
|
||||
},
|
||||
{
|
||||
// Dell iDRAC-based servers.
|
||||
name: "dell",
|
||||
manufacturers: []string{"dell"},
|
||||
fruTimeout: 60 * time.Second,
|
||||
sdrTimeout: 30 * time.Second,
|
||||
mcInfoTimeout: 10 * time.Second,
|
||||
fruEarlyExit: false,
|
||||
},
|
||||
}
|
||||
|
||||
// defaultIPMIProfile is used when no vendor profile matches.
|
||||
var defaultIPMIProfile = ipmiProfile{
|
||||
name: "default",
|
||||
fruTimeout: 60 * time.Second,
|
||||
sdrTimeout: 30 * time.Second,
|
||||
mcInfoTimeout: 10 * time.Second,
|
||||
fruEarlyExit: false,
|
||||
}
|
||||
|
||||
// selectIPMIProfile returns the profile for the given board manufacturer.
|
||||
func selectIPMIProfile(manufacturer string) ipmiProfile {
|
||||
mfgLower := strings.ToLower(strings.TrimSpace(manufacturer))
|
||||
for _, p := range ipmiProfiles {
|
||||
for _, m := range p.manufacturers {
|
||||
if strings.Contains(mfgLower, m) {
|
||||
return p
|
||||
}
|
||||
}
|
||||
}
|
||||
return defaultIPMIProfile
|
||||
}
|
||||
@@ -2,6 +2,8 @@ package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"bufio"
|
||||
"context"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
@@ -10,16 +12,29 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
func collectPSUs() []schema.HardwarePowerSupply {
|
||||
func collectPSUs(manufacturer string) []schema.HardwarePowerSupply {
|
||||
profile := selectIPMIProfile(manufacturer)
|
||||
|
||||
var psus []schema.HardwarePowerSupply
|
||||
if out, err := exec.Command("ipmitool", "fru", "print").Output(); err == nil {
|
||||
psus = parseFRU(string(out))
|
||||
fruCtx, fruCancel := context.WithTimeout(context.Background(), profile.fruTimeout)
|
||||
defer fruCancel()
|
||||
|
||||
if profile.fruEarlyExit {
|
||||
psus = collectFRUEarlyExit(fruCtx)
|
||||
} else {
|
||||
slog.Info("psu: fru unavailable", "err", err)
|
||||
cmd := exec.CommandContext(fruCtx, "ipmitool", "fru", "print")
|
||||
if out, err := cmd.Output(); err == nil {
|
||||
psus = parseFRU(string(out))
|
||||
} else {
|
||||
slog.Info("psu: fru unavailable", "err", err)
|
||||
}
|
||||
}
|
||||
|
||||
sdrData := map[int]psuSDR{}
|
||||
if sdrOut, err := exec.Command("ipmitool", "sdr").Output(); err == nil {
|
||||
sdrCtx, sdrCancel := context.WithTimeout(context.Background(), profile.sdrTimeout)
|
||||
defer sdrCancel()
|
||||
cmd := exec.CommandContext(sdrCtx, "ipmitool", "sdr")
|
||||
if sdrOut, err := cmd.Output(); err == nil {
|
||||
sdrData = parsePSUSDR(string(sdrOut))
|
||||
if len(psus) == 0 {
|
||||
psus = synthesizePSUsFromSDR(sdrData)
|
||||
@@ -30,7 +45,66 @@ func collectPSUs() []schema.HardwarePowerSupply {
|
||||
slog.Info("psu: ipmitool unavailable, skipping", "err", err)
|
||||
return nil
|
||||
}
|
||||
slog.Info("psu: collected", "count", len(psus))
|
||||
slog.Info("psu: collected", "count", len(psus), "profile", profile.name)
|
||||
return psus
|
||||
}
|
||||
|
||||
// collectFRUEarlyExit streams ipmitool fru print line-by-line and stops reading
|
||||
// as soon as it has found all PSU blocks and the next block is not a PSU.
|
||||
// This avoids scanning all 50+ non-PSU FRU devices on Lenovo XCC servers.
|
||||
func collectFRUEarlyExit(ctx context.Context) []schema.HardwarePowerSupply {
|
||||
cmd := exec.CommandContext(ctx, "ipmitool", "fru", "print")
|
||||
pipe, err := cmd.StdoutPipe()
|
||||
if err != nil {
|
||||
slog.Info("psu: fru pipe unavailable", "err", err)
|
||||
return nil
|
||||
}
|
||||
if err := cmd.Start(); err != nil {
|
||||
slog.Info("psu: fru start failed", "err", err)
|
||||
return nil
|
||||
}
|
||||
|
||||
var psus []schema.HardwarePowerSupply
|
||||
var currentBlock strings.Builder
|
||||
slot := 0
|
||||
psuFound := false
|
||||
stoppedEarly := false
|
||||
|
||||
scanner := bufio.NewScanner(pipe)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
|
||||
if strings.HasPrefix(line, "FRU Device Description") {
|
||||
if currentBlock.Len() > 0 {
|
||||
if psu, ok := parseFRUBlock(currentBlock.String(), slot); ok {
|
||||
psus = append(psus, psu)
|
||||
psuFound = true
|
||||
slot++
|
||||
}
|
||||
currentBlock.Reset()
|
||||
}
|
||||
// Stop once we've collected PSUs and hit a non-PSU block header.
|
||||
if psuFound && !isPSUHeader(strings.ToLower(line)) {
|
||||
stoppedEarly = true
|
||||
break
|
||||
}
|
||||
}
|
||||
currentBlock.WriteString(line)
|
||||
currentBlock.WriteByte('\n')
|
||||
}
|
||||
|
||||
if !stoppedEarly && currentBlock.Len() > 0 {
|
||||
if psu, ok := parseFRUBlock(currentBlock.String(), slot); ok {
|
||||
psus = append(psus, psu)
|
||||
}
|
||||
}
|
||||
|
||||
// Kill the process immediately on early exit rather than waiting for context timeout.
|
||||
if cmd.Process != nil {
|
||||
cmd.Process.Kill() //nolint:errcheck
|
||||
}
|
||||
cmd.Wait() //nolint:errcheck
|
||||
slog.Info("psu: fru early-exit complete", "psus_found", len(psus), "stopped_early", stoppedEarly)
|
||||
return psus
|
||||
}
|
||||
|
||||
|
||||
@@ -733,6 +733,37 @@ func parseMDStatArrays(raw string) []mdArray {
|
||||
return arrays
|
||||
}
|
||||
|
||||
// collectVROCLicense runs mdadm --detail-platform and extracts the License field.
|
||||
// Returns nil when VROC is absent or the platform does not report a license.
|
||||
func collectVROCLicense(pcie []schema.HardwarePCIeDevice) *string {
|
||||
if !hasVROCController(pcie) {
|
||||
return nil
|
||||
}
|
||||
out, err := raidToolQuery("mdadm", "--detail-platform")
|
||||
if err != nil {
|
||||
slog.Info("vroc: mdadm --detail-platform unavailable", "err", err)
|
||||
return nil
|
||||
}
|
||||
return parseMDAdmPlatformLicense(string(out))
|
||||
}
|
||||
|
||||
func parseMDAdmPlatformLicense(raw string) *string {
|
||||
for _, line := range strings.Split(raw, "\n") {
|
||||
trimmed := strings.TrimSpace(line)
|
||||
if !strings.HasPrefix(strings.ToLower(trimmed), "license") {
|
||||
continue
|
||||
}
|
||||
if idx := strings.Index(trimmed, ":"); idx >= 0 {
|
||||
val := strings.TrimSpace(trimmed[idx+1:])
|
||||
if val != "" {
|
||||
v := strings.ToLower(val)
|
||||
return &v
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func queryDeviceSerial(devPath string) string {
|
||||
if out, err := exec.Command("nvme", "id-ctrl", devPath, "-o", "json").Output(); err == nil {
|
||||
var ctrl nvmeIDCtrl
|
||||
|
||||
@@ -28,6 +28,35 @@ md125 : active raid1 nvme2n1[0] nvme3n1[1]
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseMDAdmPlatformLicense(t *testing.T) {
|
||||
premium := `Platform : Intel(R) Virtual RAID on CPU
|
||||
Version : 1.3.0.1138
|
||||
RAID Levels : raid0 raid1 raid5 raid10
|
||||
Total Disks : 4
|
||||
License : Premium
|
||||
`
|
||||
got := parseMDAdmPlatformLicense(premium)
|
||||
if got == nil || *got != "premium" {
|
||||
t.Fatalf("expected 'premium', got %v", got)
|
||||
}
|
||||
|
||||
standard := `Platform : Intel(R) Virtual RAID on CPU
|
||||
License : Standard
|
||||
`
|
||||
got = parseMDAdmPlatformLicense(standard)
|
||||
if got == nil || *got != "standard" {
|
||||
t.Fatalf("expected 'standard', got %v", got)
|
||||
}
|
||||
|
||||
noLicense := `Platform : Intel(R) Virtual RAID on CPU
|
||||
Version : 1.0.0
|
||||
`
|
||||
got = parseMDAdmPlatformLicense(noLicense)
|
||||
if got != nil {
|
||||
t.Fatalf("expected nil, got %v", *got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHasVROCController(t *testing.T) {
|
||||
intel := vendorIntel
|
||||
model := "Volume Management Device NVMe RAID Controller"
|
||||
|
||||
@@ -66,6 +66,7 @@ type HardwareSnapshot struct {
|
||||
PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"`
|
||||
Sensors *HardwareSensors `json:"sensors,omitempty"`
|
||||
EventLogs []HardwareEventLog `json:"event_logs,omitempty"`
|
||||
VROCLicense *string `json:"vroc_license,omitempty"`
|
||||
}
|
||||
|
||||
type HardwareHealthSummary struct {
|
||||
|
||||
@@ -125,6 +125,8 @@ func defaultTaskPriority(target string, params taskParams) int {
|
||||
return taskPriorityInstall
|
||||
case "install-to-ram":
|
||||
return taskPriorityInstallToRAM
|
||||
case "nvme-format":
|
||||
return taskPriorityInstall
|
||||
case "audit":
|
||||
return taskPriorityAudit
|
||||
case "nvidia-bench-perf", "nvidia-bench-power", "nvidia-bench-autotune":
|
||||
@@ -1295,7 +1297,7 @@ func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request)
|
||||
var standardTools = []string{
|
||||
"dmidecode", "smartctl", "nvme", "lspci", "ipmitool",
|
||||
"nvidia-smi", "dcgmi", "nv-hostengine", "memtester", "stress-ng", "nvtop",
|
||||
"mstflint", "qrencode",
|
||||
"mstflint",
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIToolsCheck(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
@@ -85,6 +85,27 @@ func TestHandleAPIBlackboxStatusReturnsPersistedState(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNVMeFormatModes(t *testing.T) {
|
||||
raw := `
|
||||
lbaf 0 : ms:0 lbads:9 rp:0x2 (in use)
|
||||
lbaf 1 : ms:8 lbads:9 rp:0x1
|
||||
lbaf 2 : ms:0 lbads:12 rp:0
|
||||
`
|
||||
modes := parseNVMeFormatModes(raw)
|
||||
if len(modes) != 3 {
|
||||
t.Fatalf("modes=%#v want 3 modes", modes)
|
||||
}
|
||||
if modes[0].Mode != 0 || modes[0].DataBytes != 512 || modes[0].MetadataBytes != 0 || !modes[0].InUse {
|
||||
t.Fatalf("mode 0=%#v", modes[0])
|
||||
}
|
||||
if modes[1].Label != "MODE 1 (512+8)" {
|
||||
t.Fatalf("mode 1 label=%q", modes[1].Label)
|
||||
}
|
||||
if modes[2].DataBytes != 4096 || modes[2].MetadataBytes != 0 {
|
||||
t.Fatalf("mode 2=%#v", modes[2])
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
|
||||
368
audit/internal/webui/nvme_format.go
Normal file
368
audit/internal/webui/nvme_format.go
Normal file
@@ -0,0 +1,368 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type nvmeFormatMode struct {
|
||||
Mode int `json:"mode"`
|
||||
DataBytes int64 `json:"data_bytes"`
|
||||
MetadataBytes int64 `json:"metadata_bytes"`
|
||||
InUse bool `json:"in_use"`
|
||||
Label string `json:"label"`
|
||||
}
|
||||
|
||||
type nvmeFormatDisk struct {
|
||||
Device string `json:"device"`
|
||||
Model string `json:"model,omitempty"`
|
||||
Serial string `json:"serial,omitempty"`
|
||||
Size string `json:"size,omitempty"`
|
||||
CurrentMode int `json:"current_mode"`
|
||||
CurrentFormat string `json:"current_format"`
|
||||
Modes []nvmeFormatMode `json:"modes"`
|
||||
Error string `json:"error,omitempty"`
|
||||
}
|
||||
|
||||
type nvmeListJSON struct {
|
||||
Devices []struct {
|
||||
DevicePath string `json:"DevicePath"`
|
||||
ModelNumber string `json:"ModelNumber"`
|
||||
SerialNumber string `json:"SerialNumber"`
|
||||
PhysicalSize int64 `json:"PhysicalSize"`
|
||||
} `json:"Devices"`
|
||||
}
|
||||
|
||||
var (
|
||||
nvmeFormatDeviceRE = regexp.MustCompile(`^/dev/nvme[0-9]+n[0-9]+$`)
|
||||
nvmeLBAFCompactLineRE = regexp.MustCompile(`(?im)^\s*lbaf\s+(\d+)\s*:\s*ms:(\d+)\s+lbads:(\d+).*$`)
|
||||
nvmeLBAFVerboseLineRE = regexp.MustCompile(`(?im)^\s*LBA Format\s+(\d+)\s*:\s*Metadata Size:\s*(\d+)\s+bytes\s*-\s*Data Size:\s*(\d+)\s+bytes.*$`)
|
||||
nvmeCommandContext = exec.CommandContext
|
||||
nvmeListFormatsTimeout = 20 * time.Second
|
||||
)
|
||||
|
||||
func listNVMeFormatDisks(ctx context.Context) ([]nvmeFormatDisk, error) {
|
||||
ctx, cancel := context.WithTimeout(ctx, nvmeListFormatsTimeout)
|
||||
defer cancel()
|
||||
out, err := nvmeCommandContext(ctx, "nvme", "list", "-o", "json").Output()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var root nvmeListJSON
|
||||
if err := json.Unmarshal(out, &root); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
disks := make([]nvmeFormatDisk, 0, len(root.Devices))
|
||||
seen := map[string]struct{}{}
|
||||
for _, dev := range root.Devices {
|
||||
path := strings.TrimSpace(dev.DevicePath)
|
||||
if !nvmeFormatDeviceRE.MatchString(path) {
|
||||
continue
|
||||
}
|
||||
if _, ok := seen[path]; ok {
|
||||
continue
|
||||
}
|
||||
seen[path] = struct{}{}
|
||||
disk := nvmeFormatDisk{
|
||||
Device: path,
|
||||
Model: strings.TrimSpace(dev.ModelNumber),
|
||||
Serial: strings.TrimSpace(dev.SerialNumber),
|
||||
Size: formatNVMeBytes(dev.PhysicalSize),
|
||||
CurrentMode: -1,
|
||||
}
|
||||
modes, parseErr := readNVMeFormatModes(ctx, path)
|
||||
if parseErr != nil {
|
||||
disk.Error = parseErr.Error()
|
||||
}
|
||||
disk.Modes = modes
|
||||
for _, mode := range modes {
|
||||
if mode.InUse {
|
||||
disk.CurrentMode = mode.Mode
|
||||
disk.CurrentFormat = formatNVMeBlock(mode.DataBytes, mode.MetadataBytes)
|
||||
break
|
||||
}
|
||||
}
|
||||
disks = append(disks, disk)
|
||||
}
|
||||
sort.Slice(disks, func(i, j int) bool { return disks[i].Device < disks[j].Device })
|
||||
return disks, nil
|
||||
}
|
||||
|
||||
func readNVMeFormatModes(ctx context.Context, device string) ([]nvmeFormatMode, error) {
|
||||
if !nvmeFormatDeviceRE.MatchString(device) {
|
||||
return nil, fmt.Errorf("invalid NVMe device")
|
||||
}
|
||||
out, err := nvmeCommandContext(ctx, "nvme", "id-ns", device, "-H").CombinedOutput()
|
||||
if err != nil {
|
||||
msg := strings.TrimSpace(string(out))
|
||||
if msg == "" {
|
||||
msg = err.Error()
|
||||
}
|
||||
return nil, fmt.Errorf("%s", msg)
|
||||
}
|
||||
modes := parseNVMeFormatModes(string(out))
|
||||
if len(modes) == 0 {
|
||||
return nil, fmt.Errorf("no LBA format modes found")
|
||||
}
|
||||
return modes, nil
|
||||
}
|
||||
|
||||
func parseNVMeFormatModes(raw string) []nvmeFormatMode {
|
||||
byMode := map[int]nvmeFormatMode{}
|
||||
for _, m := range nvmeLBAFCompactLineRE.FindAllStringSubmatch(raw, -1) {
|
||||
mode, errMode := strconv.Atoi(m[1])
|
||||
metadata, errMS := strconv.ParseInt(m[2], 10, 64)
|
||||
lbads, errLBADS := strconv.Atoi(m[3])
|
||||
if errMode != nil || errMS != nil || errLBADS != nil || lbads < 0 || lbads >= 63 {
|
||||
continue
|
||||
}
|
||||
data := int64(1) << lbads
|
||||
line := m[0]
|
||||
byMode[mode] = nvmeFormatMode{
|
||||
Mode: mode,
|
||||
DataBytes: data,
|
||||
MetadataBytes: metadata,
|
||||
InUse: strings.Contains(strings.ToLower(line), "in use"),
|
||||
Label: fmt.Sprintf("MODE %d (%s)", mode, formatNVMeBlock(data, metadata)),
|
||||
}
|
||||
}
|
||||
for _, m := range nvmeLBAFVerboseLineRE.FindAllStringSubmatch(raw, -1) {
|
||||
mode, errMode := strconv.Atoi(m[1])
|
||||
metadata, errMS := strconv.ParseInt(m[2], 10, 64)
|
||||
data, errData := strconv.ParseInt(m[3], 10, 64)
|
||||
if errMode != nil || errMS != nil || errData != nil || data <= 0 {
|
||||
continue
|
||||
}
|
||||
line := m[0]
|
||||
byMode[mode] = nvmeFormatMode{
|
||||
Mode: mode,
|
||||
DataBytes: data,
|
||||
MetadataBytes: metadata,
|
||||
InUse: strings.Contains(strings.ToLower(line), "in use"),
|
||||
Label: fmt.Sprintf("MODE %d (%s)", mode, formatNVMeBlock(data, metadata)),
|
||||
}
|
||||
}
|
||||
modes := make([]nvmeFormatMode, 0, len(byMode))
|
||||
for _, mode := range byMode {
|
||||
modes = append(modes, mode)
|
||||
}
|
||||
sort.Slice(modes, func(i, j int) bool { return modes[i].Mode < modes[j].Mode })
|
||||
return modes
|
||||
}
|
||||
|
||||
func runNVMeFormatTask(ctx context.Context, j *jobState, device string, lbaf int) error {
|
||||
if !nvmeFormatDeviceRE.MatchString(device) {
|
||||
return fmt.Errorf("invalid NVMe device")
|
||||
}
|
||||
modes, err := readNVMeFormatModes(ctx, device)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
var selected nvmeFormatMode
|
||||
found := false
|
||||
for _, mode := range modes {
|
||||
if mode.Mode == lbaf {
|
||||
selected = mode
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
return fmt.Errorf("MODE %d is not available on %s", lbaf, device)
|
||||
}
|
||||
ms := 0
|
||||
if selected.MetadataBytes > 0 {
|
||||
ms = 1
|
||||
}
|
||||
j.append(fmt.Sprintf("Formatting %s to %s with --lbaf=%d --ms=%d --force", device, formatNVMeBlock(selected.DataBytes, selected.MetadataBytes), selected.Mode, ms))
|
||||
cmd := nvmeCommandContext(ctx, "nvme", "format", device, fmt.Sprintf("--lbaf=%d", selected.Mode), fmt.Sprintf("--ms=%d", ms), "--force")
|
||||
return streamCmdJob(j, cmd)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPINVMeFormats(w http.ResponseWriter, r *http.Request) {
|
||||
disks, err := listNVMeFormatDisks(r.Context())
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, disks)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPINVMeFormatRun(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Device string `json:"device"`
|
||||
LBAF int `json:"lbaf"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
return
|
||||
}
|
||||
if !nvmeFormatDeviceRE.MatchString(req.Device) {
|
||||
writeError(w, http.StatusBadRequest, "invalid NVMe device")
|
||||
return
|
||||
}
|
||||
disks, err := listNVMeFormatDisks(r.Context())
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
var label string
|
||||
allowed := false
|
||||
for _, disk := range disks {
|
||||
if disk.Device != req.Device {
|
||||
continue
|
||||
}
|
||||
for _, mode := range disk.Modes {
|
||||
if mode.Mode == req.LBAF {
|
||||
allowed = true
|
||||
label = mode.Label
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
if !allowed {
|
||||
writeError(w, http.StatusBadRequest, "LBA format mode is not available for this device")
|
||||
return
|
||||
}
|
||||
name := fmt.Sprintf("NVMe Format %s to %s", filepath.Base(req.Device), label)
|
||||
t := &Task{
|
||||
ID: newJobID("nvme-format"),
|
||||
Name: name,
|
||||
Target: "nvme-format",
|
||||
Priority: defaultTaskPriority("nvme-format", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{
|
||||
Device: req.Device,
|
||||
LBAF: req.LBAF,
|
||||
},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
|
||||
}
|
||||
|
||||
func formatNVMeBlock(dataBytes, metadataBytes int64) string {
|
||||
return strconv.FormatInt(dataBytes, 10) + "+" + strconv.FormatInt(metadataBytes, 10)
|
||||
}
|
||||
|
||||
func formatNVMeBytes(n int64) string {
|
||||
if n <= 0 {
|
||||
return ""
|
||||
}
|
||||
units := []string{"B", "KB", "MB", "GB", "TB", "PB"}
|
||||
v := float64(n)
|
||||
unit := 0
|
||||
for v >= 1000 && unit < len(units)-1 {
|
||||
v /= 1000
|
||||
unit++
|
||||
}
|
||||
if unit == 0 {
|
||||
return fmt.Sprintf("%d B", n)
|
||||
}
|
||||
return fmt.Sprintf("%.1f %s", v, units[unit])
|
||||
}
|
||||
|
||||
func renderNVMeFormatInline() string {
|
||||
return `<div id="nvme-format-status" style="font-size:13px;color:var(--muted);margin-bottom:12px">Loading NVMe disks...</div>
|
||||
<div id="nvme-format-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
||||
<script>
|
||||
function nvmeFormatEsc(s) {
|
||||
return String(s == null ? '' : s).replace(/[&<>"']/g, function(c) {
|
||||
return {'&':'&','<':'<','>':'>','"':'"',"'":'''}[c];
|
||||
});
|
||||
}
|
||||
function loadNVMeFormats() {
|
||||
var status = document.getElementById('nvme-format-status');
|
||||
var table = document.getElementById('nvme-format-table');
|
||||
status.textContent = 'Loading NVMe disks...';
|
||||
status.style.color = 'var(--muted)';
|
||||
table.innerHTML = '<p style="color:var(--muted);font-size:13px">Loading...</p>';
|
||||
fetch('/api/tools/nvme-formats').then(function(r) { return r.json().then(function(d) { if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status)); return d; }); }).then(function(disks) {
|
||||
window._nvmeFormatDisks = Array.isArray(disks) ? disks : [];
|
||||
if (!window._nvmeFormatDisks.length) {
|
||||
status.textContent = 'No NVMe disks found.';
|
||||
table.innerHTML = '';
|
||||
return;
|
||||
}
|
||||
status.textContent = window._nvmeFormatDisks.length + ' NVMe disk(s) found.';
|
||||
var rows = window._nvmeFormatDisks.map(function(d, idx) {
|
||||
var current = d.current_format ? (d.current_format + ' / MODE ' + d.current_mode) : 'unknown';
|
||||
var detail = [d.model || '', d.serial || '', d.size || ''].filter(Boolean).join(' | ');
|
||||
var options = (d.modes || []).map(function(m) {
|
||||
return '<option value="' + m.mode + '"' + (m.in_use ? ' selected' : '') + '>' + nvmeFormatEsc(m.label) + '</option>';
|
||||
}).join('');
|
||||
var disabled = options ? '' : ' disabled';
|
||||
var err = d.error ? '<div style="font-size:12px;color:var(--crit-fg,#9f3a38);margin-top:4px">' + nvmeFormatEsc(d.error) + '</div>' : '';
|
||||
return '<tr>'
|
||||
+ '<td style="font-family:monospace;white-space:nowrap">' + nvmeFormatEsc(d.device) + (detail ? '<div style="font-family:inherit;font-size:12px;color:var(--muted)">' + nvmeFormatEsc(detail) + '</div>' : '') + '</td>'
|
||||
+ '<td style="white-space:nowrap">' + nvmeFormatEsc(current) + err + '</td>'
|
||||
+ '<td style="white-space:nowrap"><select id="nvme-format-select-' + idx + '"' + disabled + '>' + options + '</select></td>'
|
||||
+ '<td style="white-space:nowrap"><button class="btn btn-sm btn-primary" onclick="nvmeFormatRun(' + idx + ', this)"' + disabled + '>Apply</button><div class="nvme-format-row-msg" style="margin-top:6px;font-size:12px;color:var(--muted)"></div></td>'
|
||||
+ '</tr>';
|
||||
}).join('');
|
||||
table.innerHTML = '<table><tr><th>Disk</th><th>Current block / mode</th><th>New mode</th><th>Action</th></tr>' + rows + '</table>';
|
||||
}).catch(function(e) {
|
||||
status.textContent = 'Error loading NVMe disks: ' + e.message;
|
||||
status.style.color = 'var(--crit-fg,#9f3a38)';
|
||||
table.innerHTML = '';
|
||||
});
|
||||
}
|
||||
function nvmeWaitTaskDone(taskID, rowMsg) {
|
||||
var timer = setInterval(function() {
|
||||
fetch('/api/tasks').then(function(r) { return r.json(); }).then(function(tasks) {
|
||||
var task = (tasks || []).find(function(t) { return t.id === taskID; });
|
||||
if (!task) return;
|
||||
if (task.status === 'done' || task.status === 'failed' || task.status === 'cancelled') {
|
||||
clearInterval(timer);
|
||||
rowMsg.textContent = 'Task ' + taskID + ': ' + task.status + (task.error ? ' - ' + task.error : '');
|
||||
rowMsg.style.color = task.status === 'done' ? 'var(--ok,green)' : 'var(--crit-fg,#9f3a38)';
|
||||
loadNVMeFormats();
|
||||
}
|
||||
}).catch(function(){});
|
||||
}, 1500);
|
||||
}
|
||||
function nvmeFormatRun(idx, btn) {
|
||||
var disk = (window._nvmeFormatDisks || [])[idx];
|
||||
var select = document.getElementById('nvme-format-select-' + idx);
|
||||
var row = btn.closest('td');
|
||||
var rowMsg = row.querySelector('.nvme-format-row-msg');
|
||||
if (!disk || !select) return;
|
||||
var lbaf = parseInt(select.value, 10);
|
||||
var mode = (disk.modes || []).find(function(m) { return m.mode === lbaf; });
|
||||
if (!mode) return;
|
||||
if (!window.confirm('Format ' + disk.device + ' to ' + mode.label + '? This erases data on the namespace.')) return;
|
||||
btn.disabled = true;
|
||||
rowMsg.style.color = 'var(--muted)';
|
||||
rowMsg.textContent = 'Queued...';
|
||||
fetch('/api/tools/nvme-format/run', {
|
||||
method:'POST',
|
||||
headers:{'Content-Type':'application/json'},
|
||||
body:JSON.stringify({device: disk.device, lbaf: lbaf})
|
||||
}).then(function(r) { return r.json().then(function(d) { if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status)); return d; }); }).then(function(d) {
|
||||
rowMsg.textContent = 'Task ' + d.task_id + ' queued.';
|
||||
nvmeWaitTaskDone(d.task_id, rowMsg);
|
||||
}).catch(function(e) {
|
||||
rowMsg.style.color = 'var(--crit-fg,#9f3a38)';
|
||||
rowMsg.textContent = 'Error: ' + e.message;
|
||||
}).finally(function() {
|
||||
btn.disabled = false;
|
||||
});
|
||||
}
|
||||
loadNVMeFormats();
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderNVMeFormatCard() string {
|
||||
return `<div class="card"><div class="card-head">NVMe Block Format <button class="btn btn-sm btn-secondary" onclick="loadNVMeFormats()" style="margin-left:auto">↻ Refresh</button></div><div class="card-body">` +
|
||||
`<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Lists NVMe namespaces and changes their LBA format through a queued task.</p>` +
|
||||
renderNVMeFormatInline() + `</div></div>`
|
||||
}
|
||||
@@ -475,6 +475,7 @@ function installToRAM() {
|
||||
<div class="card"><div class="card-head">Services</div><div class="card-body">` +
|
||||
renderServicesInline() + `</div></div>
|
||||
|
||||
` + renderNVMeFormatCard() + `
|
||||
|
||||
<script>
|
||||
function checkTools() {
|
||||
|
||||
@@ -307,6 +307,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
|
||||
// Tools
|
||||
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
|
||||
mux.HandleFunc("GET /api/tools/nvme-formats", h.handleAPINVMeFormats)
|
||||
mux.HandleFunc("POST /api/tools/nvme-format/run", h.handleAPINVMeFormatRun)
|
||||
|
||||
// GPU presence / tools
|
||||
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
||||
|
||||
@@ -677,6 +677,12 @@ func TestToolsPageRendersNvidiaSelfHealSection(t *testing.T) {
|
||||
if !strings.Contains(body, `/api/blackbox/status`) {
|
||||
t.Fatalf("tools page missing black-box status api usage: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `NVMe Block Format`) {
|
||||
t.Fatalf("tools page missing nvme block format section: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `/api/tools/nvme-formats`) || !strings.Contains(body, `/api/tools/nvme-format/run`) {
|
||||
t.Fatalf("tools page missing nvme format api usage: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
|
||||
|
||||
@@ -376,6 +376,12 @@ func executeTaskWithOptions(opts *HandlerOptions, t *Task, j *jobState, ctx cont
|
||||
break
|
||||
}
|
||||
err = a.RunInstallToRAM(ctx, j.append)
|
||||
case "nvme-format":
|
||||
if strings.TrimSpace(t.params.Device) == "" {
|
||||
err = fmt.Errorf("device is required")
|
||||
break
|
||||
}
|
||||
err = runNVMeFormatTask(ctx, j, t.params.Device, t.params.LBAF)
|
||||
default:
|
||||
j.append("ERROR: unknown target: " + t.Target)
|
||||
j.finish("unknown target")
|
||||
|
||||
@@ -57,6 +57,7 @@ var taskNames = map[string]string{
|
||||
"support-bundle": "Support Bundle",
|
||||
"install": "Install to Disk",
|
||||
"install-to-ram": "Install to RAM",
|
||||
"nvme-format": "NVMe Block Format Change",
|
||||
}
|
||||
|
||||
// burnNames maps target → human-readable name when a burn profile is set.
|
||||
@@ -137,6 +138,7 @@ type taskParams struct {
|
||||
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||
DisplayName string `json:"display_name,omitempty"`
|
||||
Device string `json:"device,omitempty"` // for install
|
||||
LBAF int `json:"lbaf,omitempty"`
|
||||
PlatformComponents []string `json:"platform_components,omitempty"`
|
||||
}
|
||||
|
||||
|
||||
@@ -9,5 +9,62 @@ Generic engineering rules live in `bible/rules/patterns/`.
|
||||
|---|---|
|
||||
| `architecture/system-overview.md` | What bee does, scope, tech stack |
|
||||
| `architecture/runtime-flows.md` | Boot sequence, audit flow, service order |
|
||||
| `docs/customer-gpu-test-methodology.md` | Customer-facing GPU PCIe Validate / Validate -> Stress test list |
|
||||
| `docs/hardware-ingest-contract.md` | Current Reanimator hardware ingest JSON contract |
|
||||
| `docs/validate-vs-burn.md` | Validate and Validate -> Stress hardware test policy |
|
||||
| `decisions/` | Architectural decision log, including read-only submodule policy |
|
||||
|
||||
## Validate Test Matrix
|
||||
|
||||
### Validate
|
||||
|
||||
- CPU check
|
||||
- `lscpu`
|
||||
- `sensors`
|
||||
- `stress-ng`
|
||||
- Memory check
|
||||
- `free`
|
||||
- `timeout <timeout_sec> memtester`
|
||||
- `free`
|
||||
- NVMe storage check
|
||||
- `nvme id-ctrl`
|
||||
- `nvme smart-log`
|
||||
- `nvme device-self-test`
|
||||
- SATA/SAS storage check
|
||||
- `smartctl -H -A`
|
||||
- `smartctl -t short`
|
||||
- Basic NVIDIA GPU check
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dmidecode -t baseboard`
|
||||
- `dmidecode -t system`
|
||||
- `dcgmi diag -r 2`
|
||||
- Inter-GPU communication check
|
||||
- `all_reduce_perf`
|
||||
- GPU bandwidth check
|
||||
- `dcgmi diag -r nvbandwidth`
|
||||
|
||||
### Validate -> Stress
|
||||
|
||||
- Extended NVIDIA GPU check
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dmidecode -t baseboard`
|
||||
- `dmidecode -t system`
|
||||
- `dcgmi diag -r 3`
|
||||
- NVIDIA targeted stress
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r targeted_stress`
|
||||
- NVIDIA targeted power
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r targeted_power`
|
||||
- NVIDIA pulse test
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r pulse_test`
|
||||
- Inter-GPU communication check
|
||||
- `all_reduce_perf`
|
||||
- GPU bandwidth check
|
||||
- `dcgmi diag -r nvbandwidth`
|
||||
|
||||
@@ -149,7 +149,6 @@ Current validation state:
|
||||
6. psu collector (ipmitool fru + sdr — silent if no /dev/ipmi0)
|
||||
7. nvidia enrichment (nvidia-smi — skipped if binary absent or driver not loaded)
|
||||
8. output JSON → /var/log/bee-audit.json
|
||||
9. QR summary to stdout (qrencode if available)
|
||||
```
|
||||
|
||||
Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.
|
||||
|
||||
54
bible-local/docs/customer-gpu-test-methodology.md
Normal file
54
bible-local/docs/customer-gpu-test-methodology.md
Normal file
@@ -0,0 +1,54 @@
|
||||
# GPU PCIe Test Methodology
|
||||
|
||||
## Validate
|
||||
|
||||
- CPU check
|
||||
- `lscpu`
|
||||
- `sensors`
|
||||
- `stress-ng`
|
||||
- Memory check
|
||||
- `free`
|
||||
- `timeout <timeout_sec> memtester`
|
||||
- `free`
|
||||
- NVMe storage check
|
||||
- `nvme id-ctrl`
|
||||
- `nvme smart-log`
|
||||
- `nvme device-self-test`
|
||||
- SATA/SAS storage check
|
||||
- `smartctl -H -A`
|
||||
- `smartctl -t short`
|
||||
- Basic NVIDIA GPU check
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dmidecode -t baseboard`
|
||||
- `dmidecode -t system`
|
||||
- `dcgmi diag -r 2`
|
||||
- Inter-GPU communication check
|
||||
- `all_reduce_perf`
|
||||
- GPU bandwidth check
|
||||
- `dcgmi diag -r nvbandwidth`
|
||||
|
||||
## Validate -> Stress
|
||||
|
||||
- Extended NVIDIA GPU check
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dmidecode -t baseboard`
|
||||
- `dmidecode -t system`
|
||||
- `dcgmi diag -r 3`
|
||||
- NVIDIA targeted stress
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r targeted_stress`
|
||||
- NVIDIA targeted power
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r targeted_power`
|
||||
- NVIDIA pulse test
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r pulse_test`
|
||||
- Inter-GPU communication check
|
||||
- `all_reduce_perf`
|
||||
- GPU bandwidth check
|
||||
- `dcgmi diag -r nvbandwidth`
|
||||
134
git-bible/grub-bitmap-error.md
Normal file
134
git-bible/grub-bitmap-error.md
Normal file
@@ -0,0 +1,134 @@
|
||||
# GRUB bitmap error: null src bitmap in grub_video_bitmap_create_scaled
|
||||
|
||||
## Symptom
|
||||
|
||||
```
|
||||
error: null src bitmap in grub_video_bitmap_create_scaled.
|
||||
Press any key to continue...
|
||||
```
|
||||
|
||||
Appears on boot before the GRUB menu renders. The menu still appears after pressing a key,
|
||||
but without the bee logo. Reproduced on real hardware (Lenovo SR650 V3, ASUS GPU servers).
|
||||
|
||||
## Root cause model
|
||||
|
||||
`grub_video_bitmap_create_scaled` receives a null `src` pointer, meaning the PNG loader
|
||||
returned null for `bee-logo.png`. GRUB calls this function even when no explicit
|
||||
`width`/`height` are set in `theme.txt` — it is invoked any time an image component is
|
||||
rendered, passing the image's natural dimensions as the target size.
|
||||
|
||||
The PNG file is referenced as `file = "bee-logo.png"` (relative to theme dir).
|
||||
GRUB resolves this to `/boot/grub/live-theme/bee-logo.png`.
|
||||
|
||||
## Attempts that did NOT fix the error
|
||||
|
||||
### Attempt 1 — add explicit `width`/`height` to image block (d52ec67)
|
||||
|
||||
**What was done:** First introduction of bee-logo.png with:
|
||||
```
|
||||
+ image {
|
||||
top = 4%
|
||||
left = 50%-200
|
||||
width = 400
|
||||
height = 400
|
||||
file = "bee-logo.png"
|
||||
}
|
||||
```
|
||||
PNG at this point was RGBA (color_type=6).
|
||||
|
||||
**Result:** Error appeared immediately on first ISO build.
|
||||
|
||||
---
|
||||
|
||||
### Attempt 2 — remove `width`/`height` from image block (aa284ae)
|
||||
|
||||
**Hypothesis:** Explicit scaling dimensions trigger the scale path; removing them avoids it.
|
||||
|
||||
**What was done:** Removed `width = 400` and `height = 400` from the image block.
|
||||
```
|
||||
+ image {
|
||||
top = 4%
|
||||
left = 50%-200
|
||||
file = "bee-logo.png"
|
||||
}
|
||||
```
|
||||
|
||||
**Result:** Error persists. GRUB calls `grub_video_bitmap_create_scaled` regardless of whether
|
||||
`width`/`height` are specified — if the bitmap is null (loading failed), the error fires either way.
|
||||
|
||||
---
|
||||
|
||||
### Attempt 3 — convert PNG to RGBA + strip metadata chunks (6112094)
|
||||
|
||||
**Hypothesis:** GRUB's minimal PNG parser is confused by metadata chunks (cHRM, bKGD, tIME, tEXt).
|
||||
Also re-ordered `terminal_output gfxterm` before `insmod png` / theme load.
|
||||
|
||||
**What was done:**
|
||||
- Converted PNG to RGBA color_type=6, stripped all ancillary chunks
|
||||
- Moved `terminal_output gfxterm` earlier in config.cfg
|
||||
- Removed echo ASCII art banner from grub.cfg
|
||||
|
||||
**Result:** Error persists — and this change actually confirmed RGBA does not work:
|
||||
GRUB's PNG loader does not render RGBA PNGs correctly on this platform.
|
||||
|
||||
---
|
||||
|
||||
### Attempt 4 — convert PNG from RGBA back to RGB (333c44f, most recent)
|
||||
|
||||
**Hypothesis:** GRUB does not support RGBA (color_type=6); RGB (color_type=2) is the correct format.
|
||||
Alpha channel composited onto black background (#000000) to match `desktop-color`.
|
||||
|
||||
**What was done:** Converted bee-logo.png from RGBA to RGB via ImageMagick.
|
||||
|
||||
**Current file state:**
|
||||
- 400×400 px, 8-bit/color RGB, non-interlaced
|
||||
- Only IHDR + IDAT + IEND chunks (no metadata)
|
||||
- `insmod png` is present in config.cfg
|
||||
- `terminal_output gfxterm` runs before theme is sourced
|
||||
- No explicit `width`/`height` in image block
|
||||
|
||||
**Result:** Error still occurs on real hardware. Despite the PNG being nominally correct
|
||||
(RGB, non-interlaced, minimal chunks), the bitmap load returns null.
|
||||
|
||||
## Confirmed root cause (verified on 172.16.41.94, 2026-04-30)
|
||||
|
||||
The EFI partition (`sda2`, vfat, 5 MB) contains only:
|
||||
```
|
||||
/EFI/boot/bootia32.efi
|
||||
/EFI/boot/bootx64.efi
|
||||
/EFI/boot/grubx64.efi
|
||||
/boot/grub/grub.cfg
|
||||
```
|
||||
|
||||
`config.cfg`, `theme.cfg`, and the entire `live-theme/` directory (including `bee-logo.png`)
|
||||
are **absent from the EFI image**. `live-build`'s `lb binary_grub-efi` stage is not
|
||||
copying these files. GRUB boots, sources only `grub.cfg`, then fails to load the theme
|
||||
because the file does not exist — returning a null bitmap regardless of PNG format.
|
||||
|
||||
All four fix attempts were targeting the wrong layer (PNG format/content).
|
||||
|
||||
## Fix (applied 2026-04-30)
|
||||
|
||||
Switched from PNG to TGA format:
|
||||
|
||||
1. Converted `bee-logo.png` → `bee-logo.tga` (24-bit uncompressed BGR, top-left origin,
|
||||
480018 bytes). Conversion done via Python stdlib (no external tools needed).
|
||||
2. `config.cfg`: `insmod png` → `insmod tga`
|
||||
3. `theme.txt`: `file = "bee-logo.png"` → `file = "bee-logo.tga"`
|
||||
|
||||
**Why TGA works:** GRUB's TGA reader (`tga.mod`) handles uncompressed 24-bit images
|
||||
trivially — no decompression, no complex chunk parsing. The module is present on-disk
|
||||
(`x86_64-efi/tga.mod`). PNG was failing despite a valid file; the exact GRUB bug is
|
||||
unknown but the PNG reader in Debian bookworm's grub2 is known to be fragile.
|
||||
|
||||
The old `bee-logo.png` is kept in the tree (may be useful for other tools) but is no
|
||||
longer referenced by the theme.
|
||||
|
||||
## Relevant files
|
||||
|
||||
| File | Purpose |
|
||||
|------|---------|
|
||||
| `iso/builder/config/bootloaders/grub-efi/config.cfg` | insmod png, gfxterm init, theme source |
|
||||
| `iso/builder/config/bootloaders/grub-efi/theme.cfg` | sets `theme=` path |
|
||||
| `iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt` | image component definition |
|
||||
| `iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.png` | the logo PNG |
|
||||
Submodule internal/chart updated: ac8120c8ab...2a15bc87f1
@@ -27,5 +27,5 @@ insmod gfxterm
|
||||
terminal_input console serial
|
||||
terminal_output gfxterm serial
|
||||
|
||||
insmod png
|
||||
insmod tga
|
||||
source /boot/grub/theme.cfg
|
||||
|
||||
BIN
iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.tga
Normal file
BIN
iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.tga
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 469 KiB |
@@ -9,7 +9,7 @@ terminal-font: "Unifont Regular 16"
|
||||
+ image {
|
||||
top = 4%
|
||||
left = 50%-200
|
||||
file = "bee-logo.png"
|
||||
file = "bee-logo.tga"
|
||||
}
|
||||
|
||||
#help bar at the bottom
|
||||
|
||||
@@ -31,6 +31,7 @@ systemctl enable bee-preflight.service
|
||||
systemctl enable bee-audit.service
|
||||
systemctl enable bee-web.service
|
||||
systemctl enable bee-sshsetup.service
|
||||
systemctl enable bee-blackbox.service
|
||||
systemctl enable bee-selfheal.timer
|
||||
systemctl enable bee-boot-status.service
|
||||
systemctl enable ssh.service
|
||||
|
||||
@@ -66,9 +66,6 @@ jq
|
||||
curl
|
||||
net-tools
|
||||
|
||||
# QR codes (for displaying audit results)
|
||||
qrencode
|
||||
|
||||
# Local desktop (openbox + chromium kiosk)
|
||||
gparted
|
||||
openbox
|
||||
|
||||
Reference in New Issue
Block a user