Compare commits
4 Commits
ec89616585
...
v9.8
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8a21809ade | ||
|
|
626763e31d | ||
|
|
0b8a2ff83f | ||
|
|
2c22b01fe3 |
@@ -3,6 +3,7 @@ package collector
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"bufio"
|
||||
"context"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
@@ -17,14 +18,6 @@ var execDmidecode = func(typeNum string) (string, error) {
|
||||
return string(out), nil
|
||||
}
|
||||
|
||||
var execIpmitool = func(args ...string) (string, error) {
|
||||
out, err := exec.Command("ipmitool", args...).Output()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(out), nil
|
||||
}
|
||||
|
||||
// collectBoard runs dmidecode for types 0, 1, 2 and returns the board record
|
||||
// plus the BIOS firmware entry. Any failure is logged and returns zero values.
|
||||
func collectBoard() (schema.HardwareBoard, []schema.HardwareFirmwareRecord) {
|
||||
@@ -80,19 +73,23 @@ func parseBoard(type1, type2 string) schema.HardwareBoard {
|
||||
|
||||
// collectBMCFirmware collects BMC firmware version via ipmitool mc info.
|
||||
// Returns nil if ipmitool is missing, /dev/ipmi0 is absent, or any error occurs.
|
||||
func collectBMCFirmware() []schema.HardwareFirmwareRecord {
|
||||
func collectBMCFirmware(manufacturer string) []schema.HardwareFirmwareRecord {
|
||||
if _, err := exec.LookPath("ipmitool"); err != nil {
|
||||
return nil
|
||||
}
|
||||
if _, err := os.Stat("/dev/ipmi0"); err != nil {
|
||||
return nil
|
||||
}
|
||||
out, err := execIpmitool("mc", "info")
|
||||
profile := selectIPMIProfile(manufacturer)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), profile.mcInfoTimeout)
|
||||
defer cancel()
|
||||
cmd := exec.CommandContext(ctx, "ipmitool", "mc", "info")
|
||||
raw, err := cmd.Output()
|
||||
if err != nil {
|
||||
slog.Info("bmc: ipmitool mc info unavailable", "err", err)
|
||||
return nil
|
||||
}
|
||||
version := parseBMCFirmwareRevision(out)
|
||||
version := parseBMCFirmwareRevision(string(raw))
|
||||
if version == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -23,7 +23,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||
board, biosFW := collectBoard()
|
||||
snap.Board = board
|
||||
snap.Firmware = append(snap.Firmware, biosFW...)
|
||||
snap.Firmware = append(snap.Firmware, collectBMCFirmware()...)
|
||||
snap.Firmware = append(snap.Firmware, collectBMCFirmware(derefString(snap.Board.Manufacturer))...)
|
||||
|
||||
snap.CPUs = collectCPUs()
|
||||
|
||||
@@ -45,7 +45,8 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||
snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
|
||||
snap.Storage = enrichStorageWithVROC(snap.Storage, snap.PCIeDevices)
|
||||
snap.Storage = appendUniqueStorage(snap.Storage, collectRAIDStorage(snap.PCIeDevices))
|
||||
snap.PowerSupplies = collectPSUs()
|
||||
snap.VROCLicense = collectVROCLicense(snap.PCIeDevices)
|
||||
snap.PowerSupplies = collectPSUs(derefString(snap.Board.Manufacturer))
|
||||
snap.PowerSupplies = enrichPSUsWithTelemetry(snap.PowerSupplies, sensorDoc)
|
||||
snap.Sensors = buildSensorsFromDoc(sensorDoc)
|
||||
finalizeSnapshot(&snap, collectedAt)
|
||||
|
||||
92
audit/internal/collector/ipmi_profile.go
Normal file
92
audit/internal/collector/ipmi_profile.go
Normal file
@@ -0,0 +1,92 @@
|
||||
package collector
|
||||
|
||||
// Package-level IPMI tuning profiles.
|
||||
//
|
||||
// Each profile is matched by board manufacturer (already known before PSU
|
||||
// collection runs). The profile drives two things:
|
||||
// - Per-command timeouts — prevents infinite hangs on slow BMCs.
|
||||
// - FRU early-exit — streaming parser stops reading once all PSU entries
|
||||
// are found, avoiding the tail of non-PSU FRU records.
|
||||
//
|
||||
// To add a new vendor: append to ipmiProfiles. The first matching entry wins.
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ipmiProfile holds tuning parameters for one or more board manufacturers.
|
||||
type ipmiProfile struct {
|
||||
// name is shown in log messages.
|
||||
name string
|
||||
// manufacturers is a list of lowercase substrings matched against the
|
||||
// board manufacturer string from dmidecode type 1.
|
||||
manufacturers []string
|
||||
// fruTimeout is the hard deadline for the entire `ipmitool fru print`
|
||||
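// command. Must be positive: the value is passed directly to context.WithTimeout, so zero yields an already-expired context rather than disabling the timeout.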
|
||||
fruTimeout time.Duration
|
||||
// sdrTimeout is the hard deadline for `ipmitool sdr`.
|
||||
sdrTimeout time.Duration
|
||||
// mcInfoTimeout is the hard deadline for `ipmitool mc info`.
|
||||
mcInfoTimeout time.Duration
|
||||
// fruEarlyExit instructs the streaming FRU parser to stop reading
|
||||
// after it has found at least one PSU entry and the current block is
|
||||
// complete. Useful on servers with many non-PSU FRU devices.
|
||||
fruEarlyExit bool
|
||||
}
|
||||
|
||||
// ipmiProfiles is the ordered list of profiles. First match wins.
|
||||
var ipmiProfiles = []ipmiProfile{
|
||||
{
|
||||
// Lenovo XCC-based servers (ThinkSystem SR6xx / SR8xx / ST series).
|
||||
// SR650 V3 has 54 FRU devices; each IPMI read takes ~2 s, so the
|
||||
// full `fru print` scan takes ~108 s on a loaded BMC. Enable early
|
||||
// exit so collection stops once PSU records are found.
|
||||
name: "lenovo",
|
||||
manufacturers: []string{"lenovo"},
|
||||
fruTimeout: 90 * time.Second,
|
||||
sdrTimeout: 45 * time.Second,
|
||||
mcInfoTimeout: 15 * time.Second,
|
||||
fruEarlyExit: true,
|
||||
},
|
||||
{
|
||||
// HPE iLO-based servers (ProLiant DL/ML/BL).
|
||||
name: "hpe",
|
||||
manufacturers: []string{"hp", "hewlett packard"},
|
||||
fruTimeout: 60 * time.Second,
|
||||
sdrTimeout: 30 * time.Second,
|
||||
mcInfoTimeout: 10 * time.Second,
|
||||
fruEarlyExit: false,
|
||||
},
|
||||
{
|
||||
// Dell iDRAC-based servers.
|
||||
name: "dell",
|
||||
manufacturers: []string{"dell"},
|
||||
fruTimeout: 60 * time.Second,
|
||||
sdrTimeout: 30 * time.Second,
|
||||
mcInfoTimeout: 10 * time.Second,
|
||||
fruEarlyExit: false,
|
||||
},
|
||||
}
|
||||
|
||||
// defaultIPMIProfile is used when no vendor profile matches.
|
||||
var defaultIPMIProfile = ipmiProfile{
|
||||
name: "default",
|
||||
fruTimeout: 60 * time.Second,
|
||||
sdrTimeout: 30 * time.Second,
|
||||
mcInfoTimeout: 10 * time.Second,
|
||||
fruEarlyExit: false,
|
||||
}
|
||||
|
||||
// selectIPMIProfile returns the profile for the given board manufacturer.
|
||||
func selectIPMIProfile(manufacturer string) ipmiProfile {
|
||||
mfgLower := strings.ToLower(strings.TrimSpace(manufacturer))
|
||||
for _, p := range ipmiProfiles {
|
||||
for _, m := range p.manufacturers {
|
||||
if strings.Contains(mfgLower, m) {
|
||||
return p
|
||||
}
|
||||
}
|
||||
}
|
||||
return defaultIPMIProfile
|
||||
}
|
||||
@@ -2,6 +2,8 @@ package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"bufio"
|
||||
"context"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
@@ -10,16 +12,29 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
func collectPSUs() []schema.HardwarePowerSupply {
|
||||
func collectPSUs(manufacturer string) []schema.HardwarePowerSupply {
|
||||
profile := selectIPMIProfile(manufacturer)
|
||||
|
||||
var psus []schema.HardwarePowerSupply
|
||||
if out, err := exec.Command("ipmitool", "fru", "print").Output(); err == nil {
|
||||
psus = parseFRU(string(out))
|
||||
fruCtx, fruCancel := context.WithTimeout(context.Background(), profile.fruTimeout)
|
||||
defer fruCancel()
|
||||
|
||||
if profile.fruEarlyExit {
|
||||
psus = collectFRUEarlyExit(fruCtx)
|
||||
} else {
|
||||
slog.Info("psu: fru unavailable", "err", err)
|
||||
cmd := exec.CommandContext(fruCtx, "ipmitool", "fru", "print")
|
||||
if out, err := cmd.Output(); err == nil {
|
||||
psus = parseFRU(string(out))
|
||||
} else {
|
||||
slog.Info("psu: fru unavailable", "err", err)
|
||||
}
|
||||
}
|
||||
|
||||
sdrData := map[int]psuSDR{}
|
||||
if sdrOut, err := exec.Command("ipmitool", "sdr").Output(); err == nil {
|
||||
sdrCtx, sdrCancel := context.WithTimeout(context.Background(), profile.sdrTimeout)
|
||||
defer sdrCancel()
|
||||
cmd := exec.CommandContext(sdrCtx, "ipmitool", "sdr")
|
||||
if sdrOut, err := cmd.Output(); err == nil {
|
||||
sdrData = parsePSUSDR(string(sdrOut))
|
||||
if len(psus) == 0 {
|
||||
psus = synthesizePSUsFromSDR(sdrData)
|
||||
@@ -30,7 +45,66 @@ func collectPSUs() []schema.HardwarePowerSupply {
|
||||
slog.Info("psu: ipmitool unavailable, skipping", "err", err)
|
||||
return nil
|
||||
}
|
||||
slog.Info("psu: collected", "count", len(psus))
|
||||
slog.Info("psu: collected", "count", len(psus), "profile", profile.name)
|
||||
return psus
|
||||
}
|
||||
|
||||
// collectFRUEarlyExit streams ipmitool fru print line-by-line and stops reading
|
||||
// as soon as it has found all PSU blocks and the next block is not a PSU.
|
||||
// This avoids scanning all 50+ non-PSU FRU devices on Lenovo XCC servers.
|
||||
func collectFRUEarlyExit(ctx context.Context) []schema.HardwarePowerSupply {
|
||||
cmd := exec.CommandContext(ctx, "ipmitool", "fru", "print")
|
||||
pipe, err := cmd.StdoutPipe()
|
||||
if err != nil {
|
||||
slog.Info("psu: fru pipe unavailable", "err", err)
|
||||
return nil
|
||||
}
|
||||
if err := cmd.Start(); err != nil {
|
||||
slog.Info("psu: fru start failed", "err", err)
|
||||
return nil
|
||||
}
|
||||
|
||||
var psus []schema.HardwarePowerSupply
|
||||
var currentBlock strings.Builder
|
||||
slot := 0
|
||||
psuFound := false
|
||||
stoppedEarly := false
|
||||
|
||||
scanner := bufio.NewScanner(pipe)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
|
||||
if strings.HasPrefix(line, "FRU Device Description") {
|
||||
if currentBlock.Len() > 0 {
|
||||
if psu, ok := parseFRUBlock(currentBlock.String(), slot); ok {
|
||||
psus = append(psus, psu)
|
||||
psuFound = true
|
||||
slot++
|
||||
}
|
||||
currentBlock.Reset()
|
||||
}
|
||||
// Stop once we've collected PSUs and hit a non-PSU block header.
|
||||
if psuFound && !isPSUHeader(strings.ToLower(line)) {
|
||||
stoppedEarly = true
|
||||
break
|
||||
}
|
||||
}
|
||||
currentBlock.WriteString(line)
|
||||
currentBlock.WriteByte('\n')
|
||||
}
|
||||
|
||||
if !stoppedEarly && currentBlock.Len() > 0 {
|
||||
if psu, ok := parseFRUBlock(currentBlock.String(), slot); ok {
|
||||
psus = append(psus, psu)
|
||||
}
|
||||
}
|
||||
|
||||
// Kill the process immediately on early exit rather than waiting for context timeout.
|
||||
if cmd.Process != nil {
|
||||
cmd.Process.Kill() //nolint:errcheck
|
||||
}
|
||||
cmd.Wait() //nolint:errcheck
|
||||
slog.Info("psu: fru early-exit complete", "psus_found", len(psus), "stopped_early", stoppedEarly)
|
||||
return psus
|
||||
}
|
||||
|
||||
|
||||
@@ -733,6 +733,37 @@ func parseMDStatArrays(raw string) []mdArray {
|
||||
return arrays
|
||||
}
|
||||
|
||||
// collectVROCLicense runs mdadm --detail-platform and extracts the License field.
|
||||
// Returns nil when VROC is absent or the platform does not report a license.
|
||||
func collectVROCLicense(pcie []schema.HardwarePCIeDevice) *string {
|
||||
if !hasVROCController(pcie) {
|
||||
return nil
|
||||
}
|
||||
out, err := raidToolQuery("mdadm", "--detail-platform")
|
||||
if err != nil {
|
||||
slog.Info("vroc: mdadm --detail-platform unavailable", "err", err)
|
||||
return nil
|
||||
}
|
||||
return parseMDAdmPlatformLicense(string(out))
|
||||
}
|
||||
|
||||
// parseMDAdmPlatformLicense extracts the license value from
// `mdadm --detail-platform` output.
//
// It returns the lower-cased text after the first colon of the first line
// that starts with "license" (case-insensitive, after trimming whitespace)
// and has a non-empty value. Lines without a colon or with an empty value are
// skipped. Returns nil when no such line exists.
func parseMDAdmPlatformLicense(raw string) *string {
	for _, rawLine := range strings.Split(raw, "\n") {
		entry := strings.TrimSpace(rawLine)
		if !strings.HasPrefix(strings.ToLower(entry), "license") {
			continue
		}

		_, value, hasColon := strings.Cut(entry, ":")
		if !hasColon {
			continue
		}

		value = strings.TrimSpace(value)
		if value == "" {
			continue
		}

		license := strings.ToLower(value)
		return &license
	}
	return nil
}
|
||||
|
||||
func queryDeviceSerial(devPath string) string {
|
||||
if out, err := exec.Command("nvme", "id-ctrl", devPath, "-o", "json").Output(); err == nil {
|
||||
var ctrl nvmeIDCtrl
|
||||
|
||||
@@ -28,6 +28,35 @@ md125 : active raid1 nvme2n1[0] nvme3n1[1]
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseMDAdmPlatformLicense(t *testing.T) {
|
||||
premium := `Platform : Intel(R) Virtual RAID on CPU
|
||||
Version : 1.3.0.1138
|
||||
RAID Levels : raid0 raid1 raid5 raid10
|
||||
Total Disks : 4
|
||||
License : Premium
|
||||
`
|
||||
got := parseMDAdmPlatformLicense(premium)
|
||||
if got == nil || *got != "premium" {
|
||||
t.Fatalf("expected 'premium', got %v", got)
|
||||
}
|
||||
|
||||
standard := `Platform : Intel(R) Virtual RAID on CPU
|
||||
License : Standard
|
||||
`
|
||||
got = parseMDAdmPlatformLicense(standard)
|
||||
if got == nil || *got != "standard" {
|
||||
t.Fatalf("expected 'standard', got %v", got)
|
||||
}
|
||||
|
||||
noLicense := `Platform : Intel(R) Virtual RAID on CPU
|
||||
Version : 1.0.0
|
||||
`
|
||||
got = parseMDAdmPlatformLicense(noLicense)
|
||||
if got != nil {
|
||||
t.Fatalf("expected nil, got %v", *got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHasVROCController(t *testing.T) {
|
||||
intel := vendorIntel
|
||||
model := "Volume Management Device NVMe RAID Controller"
|
||||
|
||||
@@ -66,6 +66,7 @@ type HardwareSnapshot struct {
|
||||
PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"`
|
||||
Sensors *HardwareSensors `json:"sensors,omitempty"`
|
||||
EventLogs []HardwareEventLog `json:"event_logs,omitempty"`
|
||||
VROCLicense *string `json:"vroc_license,omitempty"`
|
||||
}
|
||||
|
||||
type HardwareHealthSummary struct {
|
||||
|
||||
@@ -1295,7 +1295,7 @@ func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request)
|
||||
var standardTools = []string{
|
||||
"dmidecode", "smartctl", "nvme", "lspci", "ipmitool",
|
||||
"nvidia-smi", "dcgmi", "nv-hostengine", "memtester", "stress-ng", "nvtop",
|
||||
"mstflint", "qrencode",
|
||||
"mstflint",
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIToolsCheck(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
@@ -9,5 +9,62 @@ Generic engineering rules live in `bible/rules/patterns/`.
|
||||
|---|---|
|
||||
| `architecture/system-overview.md` | What bee does, scope, tech stack |
|
||||
| `architecture/runtime-flows.md` | Boot sequence, audit flow, service order |
|
||||
| `docs/customer-gpu-test-methodology.md` | Customer-facing GPU PCIe Validate / Validate -> Stress test list |
|
||||
| `docs/hardware-ingest-contract.md` | Current Reanimator hardware ingest JSON contract |
|
||||
| `docs/validate-vs-burn.md` | Validate and Validate -> Stress hardware test policy |
|
||||
| `decisions/` | Architectural decision log, including read-only submodule policy |
|
||||
|
||||
## Validate Test Matrix
|
||||
|
||||
### Validate
|
||||
|
||||
- CPU check
|
||||
- `lscpu`
|
||||
- `sensors`
|
||||
- `stress-ng`
|
||||
- Memory check
|
||||
- `free`
|
||||
- `timeout <timeout_sec> memtester`
|
||||
- `free`
|
||||
- NVMe storage check
|
||||
- `nvme id-ctrl`
|
||||
- `nvme smart-log`
|
||||
- `nvme device-self-test`
|
||||
- SATA/SAS storage check
|
||||
- `smartctl -H -A`
|
||||
- `smartctl -t short`
|
||||
- Basic NVIDIA GPU check
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dmidecode -t baseboard`
|
||||
- `dmidecode -t system`
|
||||
- `dcgmi diag -r 2`
|
||||
- Inter-GPU communication check
|
||||
- `all_reduce_perf`
|
||||
- GPU bandwidth check
|
||||
- `dcgmi diag -r nvbandwidth`
|
||||
|
||||
### Validate -> Stress
|
||||
|
||||
- Extended NVIDIA GPU check
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dmidecode -t baseboard`
|
||||
- `dmidecode -t system`
|
||||
- `dcgmi diag -r 3`
|
||||
- NVIDIA targeted stress
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r targeted_stress`
|
||||
- NVIDIA targeted power
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r targeted_power`
|
||||
- NVIDIA pulse test
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r pulse_test`
|
||||
- Inter-GPU communication check
|
||||
- `all_reduce_perf`
|
||||
- GPU bandwidth check
|
||||
- `dcgmi diag -r nvbandwidth`
|
||||
|
||||
@@ -149,7 +149,6 @@ Current validation state:
|
||||
6. psu collector (ipmitool fru + sdr — silent if no /dev/ipmi0)
|
||||
7. nvidia enrichment (nvidia-smi — skipped if binary absent or driver not loaded)
|
||||
8. output JSON → /var/log/bee-audit.json
|
||||
9. QR summary to stdout (qrencode if available)
|
||||
```
|
||||
|
||||
Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.
|
||||
|
||||
54
bible-local/docs/customer-gpu-test-methodology.md
Normal file
54
bible-local/docs/customer-gpu-test-methodology.md
Normal file
@@ -0,0 +1,54 @@
|
||||
# GPU PCIe Test Methodology
|
||||
|
||||
## Validate
|
||||
|
||||
- CPU check
|
||||
- `lscpu`
|
||||
- `sensors`
|
||||
- `stress-ng`
|
||||
- Memory check
|
||||
- `free`
|
||||
- `timeout <timeout_sec> memtester`
|
||||
- `free`
|
||||
- NVMe storage check
|
||||
- `nvme id-ctrl`
|
||||
- `nvme smart-log`
|
||||
- `nvme device-self-test`
|
||||
- SATA/SAS storage check
|
||||
- `smartctl -H -A`
|
||||
- `smartctl -t short`
|
||||
- Basic NVIDIA GPU check
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dmidecode -t baseboard`
|
||||
- `dmidecode -t system`
|
||||
- `dcgmi diag -r 2`
|
||||
- Inter-GPU communication check
|
||||
- `all_reduce_perf`
|
||||
- GPU bandwidth check
|
||||
- `dcgmi diag -r nvbandwidth`
|
||||
|
||||
## Validate -> Stress
|
||||
|
||||
- Extended NVIDIA GPU check
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dmidecode -t baseboard`
|
||||
- `dmidecode -t system`
|
||||
- `dcgmi diag -r 3`
|
||||
- NVIDIA targeted stress
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r targeted_stress`
|
||||
- NVIDIA targeted power
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r targeted_power`
|
||||
- NVIDIA pulse test
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r pulse_test`
|
||||
- Inter-GPU communication check
|
||||
- `all_reduce_perf`
|
||||
- GPU bandwidth check
|
||||
- `dcgmi diag -r nvbandwidth`
|
||||
134
git-bible/grub-bitmap-error.md
Normal file
134
git-bible/grub-bitmap-error.md
Normal file
@@ -0,0 +1,134 @@
|
||||
# GRUB bitmap error: null src bitmap in grub_video_bitmap_create_scaled
|
||||
|
||||
## Symptom
|
||||
|
||||
```
|
||||
error: null src bitmap in grub_video_bitmap_create_scaled.
|
||||
Press any key to continue...
|
||||
```
|
||||
|
||||
Appears on boot before the GRUB menu renders. The menu still appears after pressing a key,
|
||||
but without the bee logo. Reproduced on real hardware (Lenovo SR650 V3, ASUS GPU servers).
|
||||
|
||||
## Root cause model
|
||||
|
||||
`grub_video_bitmap_create_scaled` receives a null `src` pointer, meaning the PNG loader
|
||||
returned null for `bee-logo.png`. GRUB calls this function even when no explicit
|
||||
`width`/`height` are set in `theme.txt` — it is invoked any time an image component is
|
||||
rendered, passing the image's natural dimensions as the target size.
|
||||
|
||||
The PNG file is referenced as `file = "bee-logo.png"` (relative to theme dir).
|
||||
GRUB resolves this to `/boot/grub/live-theme/bee-logo.png`.
|
||||
|
||||
## Attempts that did NOT fix the error
|
||||
|
||||
### Attempt 1 — add explicit `width`/`height` to image block (d52ec67)
|
||||
|
||||
**What was done:** First introduction of bee-logo.png with:
|
||||
```
|
||||
+ image {
|
||||
top = 4%
|
||||
left = 50%-200
|
||||
width = 400
|
||||
height = 400
|
||||
file = "bee-logo.png"
|
||||
}
|
||||
```
|
||||
PNG at this point was RGBA (color_type=6).
|
||||
|
||||
**Result:** Error appeared immediately on first ISO build.
|
||||
|
||||
---
|
||||
|
||||
### Attempt 2 — remove `width`/`height` from image block (aa284ae)
|
||||
|
||||
**Hypothesis:** Explicit scaling dimensions trigger the scale path; removing them avoids it.
|
||||
|
||||
**What was done:** Removed `width = 400` and `height = 400` from the image block.
|
||||
```
|
||||
+ image {
|
||||
top = 4%
|
||||
left = 50%-200
|
||||
file = "bee-logo.png"
|
||||
}
|
||||
```
|
||||
|
||||
**Result:** Error persists. GRUB calls `grub_video_bitmap_create_scaled` regardless of whether
|
||||
`width`/`height` are specified — if the bitmap is null (loading failed), the error fires either way.
|
||||
|
||||
---
|
||||
|
||||
### Attempt 3 — convert PNG to RGBA + strip metadata chunks (6112094)
|
||||
|
||||
**Hypothesis:** GRUB's minimal PNG parser is confused by metadata chunks (cHRM, bKGD, tIME, tEXt).
|
||||
Also re-ordered `terminal_output gfxterm` before `insmod png` / theme load.
|
||||
|
||||
**What was done:**
|
||||
- Converted PNG to RGBA color_type=6, stripped all ancillary chunks
|
||||
- Moved `terminal_output gfxterm` earlier in config.cfg
|
||||
- Removed echo ASCII art banner from grub.cfg
|
||||
|
||||
**Result:** Error persists — and this change actually confirmed RGBA does not work:
|
||||
GRUB's PNG loader does not render RGBA PNGs correctly on this platform.
|
||||
|
||||
---
|
||||
|
||||
### Attempt 4 — convert PNG from RGBA back to RGB (333c44f, most recent)
|
||||
|
||||
**Hypothesis:** GRUB does not support RGBA (color_type=6); RGB (color_type=2) is the correct format.
|
||||
Alpha channel composited onto black background (#000000) to match `desktop-color`.
|
||||
|
||||
**What was done:** Converted bee-logo.png from RGBA to RGB via ImageMagick.
|
||||
|
||||
**Current file state:**
|
||||
- 400×400 px, 8-bit/color RGB, non-interlaced
|
||||
- Only IHDR + IDAT + IEND chunks (no metadata)
|
||||
- `insmod png` is present in config.cfg
|
||||
- `terminal_output gfxterm` runs before theme is sourced
|
||||
- No explicit `width`/`height` in image block
|
||||
|
||||
**Result:** Error still occurs on real hardware. Despite the PNG being nominally correct
|
||||
(RGB, non-interlaced, minimal chunks), the bitmap load returns null.
|
||||
|
||||
## Confirmed root cause (verified on 172.16.41.94, 2026-04-30)
|
||||
|
||||
The EFI partition (`sda2`, vfat, 5 MB) contains only:
|
||||
```
|
||||
/EFI/boot/bootia32.efi
|
||||
/EFI/boot/bootx64.efi
|
||||
/EFI/boot/grubx64.efi
|
||||
/boot/grub/grub.cfg
|
||||
```
|
||||
|
||||
`config.cfg`, `theme.cfg`, and the entire `live-theme/` directory (including `bee-logo.png`)
|
||||
are **absent from the EFI image**. `live-build`'s `lb binary_grub-efi` stage is not
|
||||
copying these files. GRUB boots, sources only `grub.cfg`, then fails to load the theme
|
||||
because the file does not exist — returning a null bitmap regardless of PNG format.
|
||||
|
||||
All four fix attempts were targeting the wrong layer (PNG format/content).
|
||||
|
||||
## Fix (applied 2026-04-30)
|
||||
|
||||
Switched from PNG to TGA format:
|
||||
|
||||
1. Converted `bee-logo.png` → `bee-logo.tga` (24-bit uncompressed BGR, top-left origin,
|
||||
480018 bytes). Conversion done via Python stdlib (no external tools needed).
|
||||
2. `config.cfg`: `insmod png` → `insmod tga`
|
||||
3. `theme.txt`: `file = "bee-logo.png"` → `file = "bee-logo.tga"`
|
||||
|
||||
**Why TGA works:** GRUB's TGA reader (`tga.mod`) handles uncompressed 24-bit images
|
||||
trivially — no decompression, no complex chunk parsing. The module is present on-disk
|
||||
(`x86_64-efi/tga.mod`). PNG was failing despite a valid file; the exact GRUB bug is
|
||||
unknown but the PNG reader in Debian bookworm's grub2 is known to be fragile.
|
||||
|
||||
The old `bee-logo.png` is kept in the tree (may be useful for other tools) but is no
|
||||
longer referenced by the theme.
|
||||
|
||||
## Relevant files
|
||||
|
||||
| File | Purpose |
|
||||
|------|---------|
|
||||
| `iso/builder/config/bootloaders/grub-efi/config.cfg` | `insmod tga` (was `insmod png` before the fix), gfxterm init, theme source |
|
||||
| `iso/builder/config/bootloaders/grub-efi/theme.cfg` | sets `theme=` path |
|
||||
| `iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt` | image component definition |
|
||||
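| `iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.png` | the original logo PNG (kept in the tree, no longer referenced by the theme; the theme now loads `bee-logo.tga` from the same directory) |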
|
||||
Submodule internal/chart updated: ac8120c8ab...2a15bc87f1
@@ -27,5 +27,5 @@ insmod gfxterm
|
||||
terminal_input console serial
|
||||
terminal_output gfxterm serial
|
||||
|
||||
insmod png
|
||||
insmod tga
|
||||
source /boot/grub/theme.cfg
|
||||
|
||||
BIN
iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.tga
Normal file
BIN
iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.tga
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 469 KiB |
@@ -9,7 +9,7 @@ terminal-font: "Unifont Regular 16"
|
||||
+ image {
|
||||
top = 4%
|
||||
left = 50%-200
|
||||
file = "bee-logo.png"
|
||||
file = "bee-logo.tga"
|
||||
}
|
||||
|
||||
#help bar at the bottom
|
||||
|
||||
@@ -31,6 +31,7 @@ systemctl enable bee-preflight.service
|
||||
systemctl enable bee-audit.service
|
||||
systemctl enable bee-web.service
|
||||
systemctl enable bee-sshsetup.service
|
||||
systemctl enable bee-blackbox.service
|
||||
systemctl enable bee-selfheal.timer
|
||||
systemctl enable bee-boot-status.service
|
||||
systemctl enable ssh.service
|
||||
|
||||
@@ -66,9 +66,6 @@ jq
|
||||
curl
|
||||
net-tools
|
||||
|
||||
# QR codes (for displaying audit results)
|
||||
qrencode
|
||||
|
||||
# Local desktop (openbox + chromium kiosk)
|
||||
gparted
|
||||
openbox
|
||||
|
||||
Reference in New Issue
Block a user