Compare commits
26 Commits
bbd6d009f8
...
v10.22
| Author | SHA1 | Date | |
|---|---|---|---|
| 74a3c65f64 | |||
| 884988cb2a | |||
| 963bc960ca | |||
| 4f6579e040 | |||
| dc07580adc | |||
| 87e78e230e | |||
| 805a3b277d | |||
| 5bc9bd7fb3 | |||
| 0939a647ea | |||
| 7640f20714 | |||
| 1593bf3e76 | |||
| ae80d7711e | |||
| ca78b9df65 | |||
| 5cafe63f33 | |||
| b75e65bcb1 | |||
| 8d173175eb | |||
| 5cbde0448e | |||
| 49a09fde05 | |||
| f3962422c8 | |||
| ee36e3c711 | |||
| cca3b21d35 | |||
| 75c33e073e | |||
| 7b4bcc745a | |||
| 42774d44a6 | |||
| 5dc022ddf8 | |||
| 6623e159f5 |
@@ -1,6 +1,5 @@
|
|||||||
.env
|
.env
|
||||||
.DS_Store
|
.DS_Store
|
||||||
dist/
|
dist/
|
||||||
iso/out/
|
|
||||||
build-cache/
|
build-cache/
|
||||||
audit/bee
|
audit/bee
|
||||||
|
|||||||
@@ -40,6 +40,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
|||||||
snap.PCIeDevices = enrichPCIeWithAMD(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithAMD(snap.PCIeDevices)
|
||||||
snap.PCIeDevices = enrichPCIeWithPCISerials(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithPCISerials(snap.PCIeDevices)
|
||||||
snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices)
|
||||||
|
snap.PCIeDevices = enrichNVLinkBridgesWithGPUTopo(snap.PCIeDevices)
|
||||||
snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
|
||||||
snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices)
|
||||||
snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
|
||||||
|
|||||||
@@ -126,38 +126,39 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
|||||||
dev.Status = &status
|
dev.Status = &status
|
||||||
|
|
||||||
// Slot is the BDF: "0000:00:02.0"
|
// Slot is the BDF: "0000:00:02.0"
|
||||||
if bdf := fields["Slot"]; bdf != "" {
|
bdfStr := fields["Slot"]
|
||||||
dev.Slot = &bdf
|
if bdfStr != "" {
|
||||||
dev.BDF = &bdf
|
dev.Slot = &bdfStr
|
||||||
|
dev.BDF = &bdfStr
|
||||||
// parse vendor_id and device_id from sysfs
|
// parse vendor_id and device_id from sysfs
|
||||||
vendorID, deviceID := readPCIIDs(bdf)
|
vendorID, deviceID := readPCIIDs(bdfStr)
|
||||||
if vendorID != 0 {
|
if vendorID != 0 {
|
||||||
dev.VendorID = &vendorID
|
dev.VendorID = &vendorID
|
||||||
}
|
}
|
||||||
if deviceID != 0 {
|
if deviceID != 0 {
|
||||||
dev.DeviceID = &deviceID
|
dev.DeviceID = &deviceID
|
||||||
}
|
}
|
||||||
if numaNode, ok := readPCINumaNode(bdf); ok {
|
if numaNode, ok := readPCINumaNode(bdfStr); ok {
|
||||||
dev.NUMANode = &numaNode
|
dev.NUMANode = &numaNode
|
||||||
} else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok {
|
} else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok {
|
||||||
dev.NUMANode = &numaNode
|
dev.NUMANode = &numaNode
|
||||||
}
|
}
|
||||||
if group, ok := readPCIIOMMUGroup(bdf); ok {
|
if group, ok := readPCIIOMMUGroup(bdfStr); ok {
|
||||||
dev.IOMMUGroup = &group
|
dev.IOMMUGroup = &group
|
||||||
}
|
}
|
||||||
if width, ok := readPCIIntAttribute(bdf, "current_link_width"); ok {
|
if width, ok := readPCIIntAttribute(bdfStr, "current_link_width"); ok {
|
||||||
dev.LinkWidth = &width
|
dev.LinkWidth = &width
|
||||||
}
|
}
|
||||||
if width, ok := readPCIIntAttribute(bdf, "max_link_width"); ok {
|
if width, ok := readPCIIntAttribute(bdfStr, "max_link_width"); ok {
|
||||||
dev.MaxLinkWidth = &width
|
dev.MaxLinkWidth = &width
|
||||||
}
|
}
|
||||||
if speed, ok := readPCIStringAttribute(bdf, "current_link_speed"); ok {
|
if speed, ok := readPCIStringAttribute(bdfStr, "current_link_speed"); ok {
|
||||||
linkSpeed := normalizePCILinkSpeed(speed)
|
linkSpeed := normalizePCILinkSpeed(speed)
|
||||||
if linkSpeed != "" {
|
if linkSpeed != "" {
|
||||||
dev.LinkSpeed = &linkSpeed
|
dev.LinkSpeed = &linkSpeed
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if speed, ok := readPCIStringAttribute(bdf, "max_link_speed"); ok {
|
if speed, ok := readPCIStringAttribute(bdfStr, "max_link_speed"); ok {
|
||||||
linkSpeed := normalizePCILinkSpeed(speed)
|
linkSpeed := normalizePCILinkSpeed(speed)
|
||||||
if linkSpeed != "" {
|
if linkSpeed != "" {
|
||||||
dev.MaxLinkSpeed = &linkSpeed
|
dev.MaxLinkSpeed = &linkSpeed
|
||||||
@@ -178,7 +179,15 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
|||||||
|
|
||||||
// SVendor/SDevice available but not in schema — skip
|
// SVendor/SDevice available but not in schema — skip
|
||||||
|
|
||||||
// Warn if PCIe link is running below its maximum negotiated speed.
|
// Detect NVLink bridge mezzanine cards (CPU→HGX internal link).
|
||||||
|
// These are Mellanox x2 devices with no host net interfaces and a DeviceName
|
||||||
|
// containing "NVLINK". The targeted lspci call is only executed for the small
|
||||||
|
// number of narrow-link Mellanox cards that pass the cheap pre-filter.
|
||||||
|
if bdfStr != "" && isNVLinkBridgeCandidate(bdfStr, dev) && confirmNVLinkBridgeDeviceName(bdfStr) {
|
||||||
|
markNVLinkBridge(&dev)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Warn (or Critical for NVLink bridges) if PCIe link is running below max.
|
||||||
applyPCIeLinkSpeedWarning(&dev)
|
applyPCIeLinkSpeedWarning(&dev)
|
||||||
|
|
||||||
return dev
|
return dev
|
||||||
@@ -265,17 +274,27 @@ func readPCIStringAttribute(bdf, attribute string) (string, bool) {
|
|||||||
return value, true
|
return value, true
|
||||||
}
|
}
|
||||||
|
|
||||||
// applyPCIeLinkSpeedWarning sets the device status to Warning if the current PCIe link
|
// applyPCIeLinkSpeedWarning sets device status when the current PCIe link speed is
|
||||||
// speed is below the maximum negotiated speed supported by both ends.
|
// below the device maximum. Regular PCIe slots get Warning; NVLink bridge cards
|
||||||
|
// get Critical because they are fixed internal connectors that must always train
|
||||||
|
// to max speed — any downgrade signals a hardware fault.
|
||||||
func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
|
func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
|
||||||
if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil {
|
if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if pcieLinkSpeedRank(*dev.LinkSpeed) < pcieLinkSpeedRank(*dev.MaxLinkSpeed) {
|
if pcieLinkSpeedRank(*dev.LinkSpeed) >= pcieLinkSpeedRank(*dev.MaxLinkSpeed) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
|
||||||
|
dev.ErrorDescription = &desc
|
||||||
|
|
||||||
|
isNVLinkBridge := dev.DeviceClass != nil && *dev.DeviceClass == "NVLinkBridge"
|
||||||
|
if isNVLinkBridge {
|
||||||
|
crit := statusCritical
|
||||||
|
dev.Status = &crit
|
||||||
|
} else {
|
||||||
warn := statusWarning
|
warn := statusWarning
|
||||||
dev.Status = &warn
|
dev.Status = &warn
|
||||||
desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
|
|
||||||
dev.ErrorDescription = &desc
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,206 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
"log/slog"
|
||||||
|
"os/exec"
|
||||||
|
"regexp"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
var nv5re = regexp.MustCompile(`(?i)^NV(\d+)$`)
|
||||||
|
|
||||||
|
// isNVLinkBridgeCandidate returns true for Mellanox PCIe devices that look like
|
||||||
|
// NVLink bridge mezzanine cards: narrow link (x2), no host net interfaces.
|
||||||
|
// These are the CPU-side PCIe control plane of the NVSwitch fabric on HGX/DGX systems.
|
||||||
|
func isNVLinkBridgeCandidate(bdf string, dev schema.HardwarePCIeDevice) bool {
|
||||||
|
if !isMellanoxDevice(dev) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if dev.LinkWidth == nil || *dev.LinkWidth > 2 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if len(netIfacesByBDF(bdf)) > 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
// confirmNVLinkBridgeDeviceName runs `lspci -s <bdf> -v` and reports whether
// the output mentions "NVLINK" (case-insensitive). The match is applied to the
// whole verbose output, so any line containing the token counts, not only the
// DeviceName field. A failing lspci invocation yields false.
//
// The call targets a single device and is meant to be used only for devices
// already accepted by isNVLinkBridgeCandidate.
func confirmNVLinkBridgeDeviceName(bdf string) bool {
	verbose, err := exec.Command("lspci", "-s", bdf, "-v").Output()
	if err != nil {
		return false
	}
	// "NVLINK" contains neither whitespace nor a newline, so searching the
	// upper-cased output as a whole is equivalent to searching line by line.
	return strings.Contains(strings.ToUpper(string(verbose)), "NVLINK")
}
|
||||||
|
|
||||||
|
// markNVLinkBridge overwrites device_class and adds telemetry flags on a detected
|
||||||
|
// NVLink bridge card. Must be called before applyPCIeLinkSpeedWarning so that the
|
||||||
|
// correct severity (Critical) is applied.
|
||||||
|
func markNVLinkBridge(dev *schema.HardwarePCIeDevice) {
|
||||||
|
class := "NVLinkBridge"
|
||||||
|
dev.DeviceClass = &class
|
||||||
|
if dev.Telemetry == nil {
|
||||||
|
dev.Telemetry = map[string]any{}
|
||||||
|
}
|
||||||
|
dev.Telemetry["nvlink_bridge"] = true
|
||||||
|
}
|
||||||
|
|
||||||
|
// enrichNVLinkBridgesWithGPUTopo cross-references NVLink bridge PCIe status with
|
||||||
|
// the GPU-side NVLink topology reported by nvidia-smi. For each bridge device it
|
||||||
|
// adds nvlink_topo_all_active and nvlink_topo_min_links to the telemetry, and
|
||||||
|
// upgrades a degraded-link Warning to Critical when the fabric is also affected.
|
||||||
|
func enrichNVLinkBridgesWithGPUTopo(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||||
|
hasBridge := false
|
||||||
|
for _, d := range devs {
|
||||||
|
if d.DeviceClass != nil && *d.DeviceClass == "NVLinkBridge" {
|
||||||
|
hasBridge = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !hasBridge {
|
||||||
|
return devs
|
||||||
|
}
|
||||||
|
|
||||||
|
topo, err := queryNVIDIANVLinkTopo()
|
||||||
|
if err != nil {
|
||||||
|
slog.Info("nvlink-bridge: nvidia-smi topo unavailable, skipping cross-reference", "err", err)
|
||||||
|
return devs
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := range devs {
|
||||||
|
if devs[i].DeviceClass == nil || *devs[i].DeviceClass != "NVLinkBridge" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if devs[i].Telemetry == nil {
|
||||||
|
devs[i].Telemetry = map[string]any{}
|
||||||
|
}
|
||||||
|
devs[i].Telemetry["nvlink_topo_all_active"] = topo.AllActive
|
||||||
|
devs[i].Telemetry["nvlink_topo_min_links"] = topo.MinNVLinks
|
||||||
|
devs[i].Telemetry["nvlink_topo_gpu_count"] = topo.GPUCount
|
||||||
|
|
||||||
|
// If the bridge PCIe is already degraded AND the fabric is also degraded
|
||||||
|
// (missing NVLink connections), escalate to Critical.
|
||||||
|
if devs[i].Status != nil && *devs[i].Status == statusCritical && !topo.AllActive {
|
||||||
|
devs[i].Telemetry["nvlink_fabric_affected"] = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
slog.Info("nvlink-bridge: topo cross-reference applied",
|
||||||
|
"gpu_count", topo.GPUCount,
|
||||||
|
"all_active", topo.AllActive,
|
||||||
|
"min_links", topo.MinNVLinks,
|
||||||
|
)
|
||||||
|
return devs
|
||||||
|
}
|
||||||
|
|
||||||
|
// nvlinkTopoResult summarises the GPU NVLink connectivity matrix.
type nvlinkTopoResult struct {
	GPUCount   int
	AllActive  bool // true if every GPU pair has at least one NVLink bond
	MinNVLinks int  // minimum NVLink bonds seen across any GPU pair (0 = some pair disconnected)
}

// queryNVIDIANVLinkTopo runs `nvidia-smi topo -m` and parses the NVLink matrix.
// An error from running the command is returned unchanged together with a zero
// result.
func queryNVIDIANVLinkTopo() (nvlinkTopoResult, error) {
	out, err := exec.Command("nvidia-smi", "topo", "-m").Output()
	if err != nil {
		return nvlinkTopoResult{}, err
	}
	return parseNVIDIATopologyMatrix(string(out)), nil
}

// parseNVIDIATopologyMatrix extracts GPU count and the minimum NVLink bond
// count from the `nvidia-smi topo -m` matrix.
//
// Format (abbreviated):
//
//	        GPU0  GPU1  ...  NIC0  NIC1
//	GPU0     X    NV18  ...  NODE  NODE
//	GPU1    NV18   X    ...  NODE  NODE
//	NIC0    NODE  NODE  ...   X    PIX
//
// The header is the first line that starts with "GPU0". Only header tokens of
// the form GPU<digits> are treated as GPU columns, so NIC/CPU columns and
// multi-word columns such as "GPU NUMA ID" are ignored. At least two GPU
// columns are required; otherwise the zero result is returned.
//
// Only GPU×GPU cells are evaluated. "X" (self) is skipped. An NV<n> cell
// contributes n bonds. Any other token in a GPU×GPU cell (NODE, SYS, PHB,
// PIX, ...) means the pair has no NVLink bond and is counted as 0, which makes
// AllActive false and MinNVLinks 0 as documented on nvlinkTopoResult.
//
// ANSI escape sequences are removed before parsing.
// NOTE(review): nvidia-smi appears to underline the header row with SGR codes
// on some driver versions even when piped — confirm on the target driver.
func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult {
	lines := strings.Split(stripTopoANSIEscapes(raw), "\n")

	// Locate the header line and record which field positions are GPU columns.
	headerIdx := -1
	var gpuCols []int // 0-based positions within the header fields
	for i, line := range lines {
		trimmed := strings.TrimSpace(line)
		if !strings.HasPrefix(trimmed, "GPU0") {
			continue
		}
		for j, col := range strings.Fields(trimmed) {
			if isTopoGPULabel(col) {
				gpuCols = append(gpuCols, j)
			}
		}
		if len(gpuCols) >= 2 {
			headerIdx = i
		}
		break // only the first GPU0 line can be the header
	}
	if headerIdx < 0 {
		return nvlinkTopoResult{}
	}

	minLinks := -1 // -1 = no GPU pair seen yet
	for _, line := range lines[headerIdx+1:] {
		cells := strings.Fields(line)
		if len(cells) == 0 || !isTopoGPULabel(cells[0]) {
			continue
		}
		// cells[0] is the row label; header position p maps to cells[p+1].
		for _, col := range gpuCols {
			idx := col + 1
			if idx >= len(cells) {
				continue // truncated row: nothing to evaluate for this pair
			}
			cell := cells[idx]
			if strings.EqualFold(cell, "X") {
				continue // self
			}
			n, ok := parseTopoNVLinkCount(cell)
			if !ok {
				n = 0 // pair is reachable only over PCIe/system interconnect
			}
			if minLinks < 0 || n < minLinks {
				minLinks = n
			}
		}
	}
	if minLinks < 0 {
		minLinks = 0
	}

	return nvlinkTopoResult{
		GPUCount:   len(gpuCols),
		AllActive:  minLinks > 0,
		MinNVLinks: minLinks,
	}
}

// stripTopoANSIEscapes removes CSI escape sequences (ESC '[' ... final byte)
// from s and returns s unchanged when it contains none.
func stripTopoANSIEscapes(s string) string {
	if !strings.Contains(s, "\x1b[") {
		return s
	}
	var b strings.Builder
	b.Grow(len(s))
	for i := 0; i < len(s); i++ {
		if s[i] == 0x1b && i+1 < len(s) && s[i+1] == '[' {
			i += 2
			// Skip parameter/intermediate bytes up to the final byte (0x40-0x7E);
			// the loop increment then steps over the final byte itself.
			for i < len(s) && (s[i] < 0x40 || s[i] > 0x7e) {
				i++
			}
			continue
		}
		b.WriteByte(s[i])
	}
	return b.String()
}

// isTopoGPULabel reports whether s has the form GPU<digits>, e.g. "GPU0".
func isTopoGPULabel(s string) bool {
	digits, ok := strings.CutPrefix(s, "GPU")
	return ok && topoAllDigits(digits)
}

// parseTopoNVLinkCount parses an NV<n> cell (case-insensitive prefix) and
// returns n. ok is false for every other token.
func parseTopoNVLinkCount(cell string) (n int, ok bool) {
	if len(cell) < 3 || !strings.EqualFold(cell[:2], "NV") || !topoAllDigits(cell[2:]) {
		return 0, false
	}
	n, err := strconv.Atoi(cell[2:])
	if err != nil {
		return 0, false
	}
	return n, true
}

// topoAllDigits reports whether s is non-empty and consists of ASCII digits only.
func topoAllDigits(s string) bool {
	if s == "" {
		return false
	}
	for i := 0; i < len(s); i++ {
		if s[i] < '0' || s[i] > '9' {
			return false
		}
	}
	return true
}
|
||||||
@@ -0,0 +1,124 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestParseNVIDIATopologyMatrix(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
// Real-world B200 HGX output: 8 GPUs, all pairs connected via NV18.
|
||||||
|
input := ` GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 NIC0 NIC1
|
||||||
|
GPU0 X NV18 NV18 NV18 NV18 NV18 NV18 NV18 NODE NODE
|
||||||
|
GPU1 NV18 X NV18 NV18 NV18 NV18 NV18 NV18 NODE NODE
|
||||||
|
GPU2 NV18 NV18 X NV18 NV18 NV18 NV18 NV18 NODE NODE
|
||||||
|
GPU3 NV18 NV18 NV18 X NV18 NV18 NV18 NV18 NODE NODE
|
||||||
|
GPU4 NV18 NV18 NV18 NV18 X NV18 NV18 NV18 SYS SYS
|
||||||
|
GPU5 NV18 NV18 NV18 NV18 NV18 X NV18 NV18 SYS SYS
|
||||||
|
GPU6 NV18 NV18 NV18 NV18 NV18 NV18 X NV18 SYS SYS
|
||||||
|
GPU7 NV18 NV18 NV18 NV18 NV18 NV18 NV18 X SYS SYS
|
||||||
|
NIC0 NODE NODE NODE NODE SYS SYS SYS SYS X PIX
|
||||||
|
`
|
||||||
|
got := parseNVIDIATopologyMatrix(input)
|
||||||
|
|
||||||
|
if got.GPUCount != 8 {
|
||||||
|
t.Fatalf("GPUCount=%d want 8", got.GPUCount)
|
||||||
|
}
|
||||||
|
if !got.AllActive {
|
||||||
|
t.Fatalf("AllActive=false want true")
|
||||||
|
}
|
||||||
|
if got.MinNVLinks != 18 {
|
||||||
|
t.Fatalf("MinNVLinks=%d want 18", got.MinNVLinks)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseNVIDIATopologyMatrixPartialDegradation(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
// GPU1-GPU3 pair shows NV12 (reduced) instead of NV18.
|
||||||
|
input := ` GPU0 GPU1 GPU2 GPU3
|
||||||
|
GPU0 X NV18 NV18 NV18
|
||||||
|
GPU1 NV18 X NV18 NV12
|
||||||
|
GPU2 NV18 NV18 X NV18
|
||||||
|
GPU3 NV18 NV12 NV18 X
|
||||||
|
`
|
||||||
|
got := parseNVIDIATopologyMatrix(input)
|
||||||
|
|
||||||
|
if got.MinNVLinks != 12 {
|
||||||
|
t.Fatalf("MinNVLinks=%d want 12", got.MinNVLinks)
|
||||||
|
}
|
||||||
|
if !got.AllActive {
|
||||||
|
t.Fatalf("AllActive=false want true (12 links is still active)")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseNVIDIATopologyMatrixDisconnected(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
// GPU0-GPU1 pair fully disconnected (NV0).
|
||||||
|
input := ` GPU0 GPU1
|
||||||
|
GPU0 X NV0
|
||||||
|
GPU1 NV0 X
|
||||||
|
`
|
||||||
|
got := parseNVIDIATopologyMatrix(input)
|
||||||
|
|
||||||
|
if got.AllActive {
|
||||||
|
t.Fatalf("AllActive=true want false (NV0 means no links)")
|
||||||
|
}
|
||||||
|
if got.MinNVLinks != 0 {
|
||||||
|
t.Fatalf("MinNVLinks=%d want 0", got.MinNVLinks)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseNVIDIATopologyMatrixEmpty(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
got := parseNVIDIATopologyMatrix("no gpus here")
|
||||||
|
if got.GPUCount != 0 {
|
||||||
|
t.Fatalf("GPUCount=%d want 0", got.GPUCount)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestApplyPCIeLinkSpeedWarningNVLinkBridgeEscalates(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
bridgeClass := "NVLinkBridge"
|
||||||
|
linkSpeed := "Gen3"
|
||||||
|
maxLinkSpeed := "Gen4"
|
||||||
|
dev := schema.HardwarePCIeDevice{}
|
||||||
|
dev.DeviceClass = &bridgeClass
|
||||||
|
dev.LinkSpeed = &linkSpeed
|
||||||
|
dev.MaxLinkSpeed = &maxLinkSpeed
|
||||||
|
s := statusOK
|
||||||
|
dev.Status = &s
|
||||||
|
|
||||||
|
applyPCIeLinkSpeedWarning(&dev)
|
||||||
|
|
||||||
|
if dev.Status == nil || *dev.Status != statusCritical {
|
||||||
|
t.Fatalf("status=%v want Critical for NVLink bridge degradation", dev.Status)
|
||||||
|
}
|
||||||
|
if dev.ErrorDescription == nil {
|
||||||
|
t.Fatal("ErrorDescription nil, want degradation message")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestApplyPCIeLinkSpeedWarningRegularCardIsWarning(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
regularClass := "NetworkController"
|
||||||
|
linkSpeed := "Gen3"
|
||||||
|
maxLinkSpeed := "Gen4"
|
||||||
|
dev := schema.HardwarePCIeDevice{}
|
||||||
|
dev.DeviceClass = ®ularClass
|
||||||
|
dev.LinkSpeed = &linkSpeed
|
||||||
|
dev.MaxLinkSpeed = &maxLinkSpeed
|
||||||
|
s := statusOK
|
||||||
|
dev.Status = &s
|
||||||
|
|
||||||
|
applyPCIeLinkSpeedWarning(&dev)
|
||||||
|
|
||||||
|
if dev.Status == nil || *dev.Status != statusWarning {
|
||||||
|
t.Fatalf("status=%v want Warning for regular card degradation", dev.Status)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -36,6 +36,13 @@ func bestEffortRescanHotplugStorage() {
|
|||||||
slog.Info("storage: scsi host scan skipped", "pattern", scsiHostScanGlob, "err", err)
|
slog.Info("storage: scsi host scan skipped", "pattern", scsiHostScanGlob, "err", err)
|
||||||
} else {
|
} else {
|
||||||
for _, path := range hostPaths {
|
for _, path := range hostPaths {
|
||||||
|
// SAS HBAs (e.g. smartpqi) block indefinitely in sas_user_scan when
|
||||||
|
// written to — SAS topology is discovered by the driver itself.
|
||||||
|
host := filepath.Base(filepath.Dir(path))
|
||||||
|
if _, err := os.Stat("/sys/class/sas_host/" + host); err == nil {
|
||||||
|
slog.Info("storage: scsi host scan skipped (SAS host)", "path", path)
|
||||||
|
continue
|
||||||
|
}
|
||||||
if err := hotplugWriteFile(path, []byte("- - -\n"), 0644); err != nil {
|
if err := hotplugWriteFile(path, []byte("- - -\n"), 0644); err != nil {
|
||||||
slog.Info("storage: scsi host scan write failed", "path", path, "err", err)
|
slog.Info("storage: scsi host scan write failed", "path", path, "err", err)
|
||||||
continue
|
continue
|
||||||
@@ -66,17 +73,41 @@ func collectStorage() []schema.HardwareStorage {
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// jsonInt64 is an int64 that decodes from either a bare JSON number or a
// JSON string holding a decimal number.
// lsblk -J emits LOG-SEC / PHY-SEC as integers on util-linux ≥ 2.37 (Debian 12)
// but older versions emit them as strings. This type handles both.
type jsonInt64 int64

// UnmarshalJSON decodes data leniently and never returns an error:
//   - a bare integer (or null) is stored as-is (null stores 0);
//   - a quoted string is trimmed and parsed as base-10; on parse failure the
//     receiver is left unchanged;
//   - any other JSON value leaves the receiver unchanged.
func (j *jsonInt64) UnmarshalJSON(data []byte) error {
	// Bare number, e.g. 512.
	var num int64
	if json.Unmarshal(data, &num) == nil {
		*j = jsonInt64(num)
		return nil
	}

	// Quoted number, e.g. "512".
	var quoted string
	if json.Unmarshal(data, &quoted) != nil {
		return nil // unexpected type — keep current value
	}
	if parsed, err := strconv.ParseInt(strings.TrimSpace(quoted), 10, 64); err == nil {
		*j = jsonInt64(parsed)
	}
	return nil
}
|
||||||
|
|
||||||
// lsblkDevice is a minimal lsblk JSON record.
|
// lsblkDevice is a minimal lsblk JSON record.
|
||||||
type lsblkDevice struct {
|
type lsblkDevice struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Type string `json:"type"`
|
Type string `json:"type"`
|
||||||
Size string `json:"size"`
|
Size string `json:"size"`
|
||||||
Serial string `json:"serial"`
|
Serial string `json:"serial"`
|
||||||
Model string `json:"model"`
|
Model string `json:"model"`
|
||||||
Tran string `json:"tran"`
|
Tran string `json:"tran"`
|
||||||
Hctl string `json:"hctl"`
|
Hctl string `json:"hctl"`
|
||||||
LogSec string `json:"log-sec"`
|
LogSec jsonInt64 `json:"log-sec"`
|
||||||
PhySec string `json:"phy-sec"`
|
PhySec jsonInt64 `json:"phy-sec"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type lsblkRoot struct {
|
type lsblkRoot struct {
|
||||||
@@ -620,8 +651,8 @@ func applyStorageBlockGeometry(s *schema.HardwareStorage, dev lsblkDevice) {
|
|||||||
if s == nil {
|
if s == nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
logical := parseStorageBytes(dev.LogSec)
|
logical := int64(dev.LogSec)
|
||||||
physical := parseStorageBytes(dev.PhySec)
|
physical := int64(dev.PhySec)
|
||||||
if logical <= 0 && physical <= 0 {
|
if logical <= 0 && physical <= 0 {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package collector
|
package collector
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/json"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
@@ -38,6 +39,54 @@ func TestParseStorageBytes(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestJsonInt64UnmarshalBothFormats(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
// util-linux ≥ 2.37 emits LOG-SEC / PHY-SEC as bare JSON numbers.
|
||||||
|
// Older versions emit quoted strings. Both must parse without error
|
||||||
|
// so that the entire lsblkDevices() call does not return nil on Debian 12.
|
||||||
|
cases := []struct {
|
||||||
|
json string
|
||||||
|
want int64
|
||||||
|
}{
|
||||||
|
{`512`, 512},
|
||||||
|
{`4096`, 4096},
|
||||||
|
{`"512"`, 512},
|
||||||
|
{`"4096"`, 4096},
|
||||||
|
{`null`, 0},
|
||||||
|
}
|
||||||
|
for _, tc := range cases {
|
||||||
|
var v jsonInt64
|
||||||
|
if err := v.UnmarshalJSON([]byte(tc.json)); err != nil {
|
||||||
|
t.Fatalf("UnmarshalJSON(%s): unexpected error %v", tc.json, err)
|
||||||
|
}
|
||||||
|
if int64(v) != tc.want {
|
||||||
|
t.Fatalf("UnmarshalJSON(%s)=%d want %d", tc.json, int64(v), tc.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Simulate the exact JSON shape that triggered the bug on Debian 12.
|
||||||
|
input := []byte(`{
|
||||||
|
"blockdevices": [
|
||||||
|
{"name":"sda","type":"disk","size":"3.6T","serial":"S1234","model":"SEAGATE","tran":"sata","hctl":"0:0:0:0","log-sec":512,"phy-sec":4096},
|
||||||
|
{"name":"sdb","type":"disk","size":"3.6T","serial":"S5678","model":"SEAGATE","tran":"sata","hctl":"0:0:1:0","log-sec":512,"phy-sec":4096}
|
||||||
|
]
|
||||||
|
}`)
|
||||||
|
var root lsblkRoot
|
||||||
|
if err := json.Unmarshal(input, &root); err != nil {
|
||||||
|
t.Fatalf("lsblkRoot unmarshal with integer log-sec/phy-sec: %v", err)
|
||||||
|
}
|
||||||
|
if len(root.Blockdevices) != 2 {
|
||||||
|
t.Fatalf("got %d blockdevices want 2", len(root.Blockdevices))
|
||||||
|
}
|
||||||
|
if int64(root.Blockdevices[0].LogSec) != 512 {
|
||||||
|
t.Fatalf("LogSec=%d want 512", root.Blockdevices[0].LogSec)
|
||||||
|
}
|
||||||
|
if int64(root.Blockdevices[0].PhySec) != 4096 {
|
||||||
|
t.Fatalf("PhySec=%d want 4096", root.Blockdevices[0].PhySec)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestBestEffortRescanHotplugStorage(t *testing.T) {
|
func TestBestEffortRescanHotplugStorage(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
|||||||
@@ -38,6 +38,15 @@ var HardwareErrorPatterns = []ErrorPattern{
|
|||||||
Category: "gpu",
|
Category: "gpu",
|
||||||
Severity: "warning",
|
Severity: "warning",
|
||||||
},
|
},
|
||||||
|
// PCIe AER correctable from the NVIDIA driver — "bus correctable error" in SEL.
|
||||||
|
// Severity is warning (not critical): correctable errors are hardware-recovered.
|
||||||
|
{
|
||||||
|
Name: "nvidia-aer-correctable",
|
||||||
|
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER.*[Cc]orrect`),
|
||||||
|
Category: "gpu",
|
||||||
|
Severity: "warning",
|
||||||
|
BDFGroup: 1,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
Name: "nvidia-aer",
|
Name: "nvidia-aer",
|
||||||
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||||
@@ -54,6 +63,15 @@ var HardwareErrorPatterns = []ErrorPattern{
|
|||||||
},
|
},
|
||||||
|
|
||||||
// ── PCIe AER (generic) ──────────────────────────────────────────────────────
|
// ── PCIe AER (generic) ──────────────────────────────────────────────────────
|
||||||
|
// PCIe AER correctable from the root port — captures the reported device BDF
|
||||||
|
// (second BDF in "pcieport X: AER: Correctable error received: Y").
|
||||||
|
{
|
||||||
|
Name: "pcie-aer-correctable",
|
||||||
|
Re: mustPat(`(?i)pcieport.*AER:.*[Cc]orrect.*:\s*([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
|
||||||
|
Category: "pcie",
|
||||||
|
Severity: "warning",
|
||||||
|
BDFGroup: 1,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
Name: "pcie-aer",
|
Name: "pcie-aer",
|
||||||
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
@@ -18,7 +19,7 @@ type InstallDisk struct {
|
|||||||
MountedParts []string // partition mount points currently active
|
MountedParts []string // partition mount points currently active
|
||||||
}
|
}
|
||||||
|
|
||||||
const squashfsPath = "/run/live/medium/live/filesystem.squashfs"
|
const squashfsGlob = "/run/live/medium/live/*.squashfs"
|
||||||
|
|
||||||
// ListInstallDisks returns block devices suitable for installation.
|
// ListInstallDisks returns block devices suitable for installation.
|
||||||
// Excludes the current live boot medium but includes USB drives.
|
// Excludes the current live boot medium but includes USB drives.
|
||||||
@@ -176,11 +177,22 @@ func inferLiveBootKind(fsType, source, deviceType, transport string) string {
|
|||||||
// squashfs size × 1.5 to allow for extracted filesystem and bootloader.
|
// squashfs size × 1.5 to allow for extracted filesystem and bootloader.
|
||||||
// Returns 0 if the squashfs is not available (non-live environment).
|
// Returns 0 if the squashfs is not available (non-live environment).
|
||||||
func MinInstallBytes() int64 {
|
func MinInstallBytes() int64 {
|
||||||
fi, err := os.Stat(squashfsPath)
|
files, err := filepath.Glob(squashfsGlob)
|
||||||
if err != nil {
|
if err != nil || len(files) == 0 {
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
return fi.Size() * 3 / 2
|
var total int64
|
||||||
|
for _, path := range files {
|
||||||
|
fi, statErr := os.Stat(path)
|
||||||
|
if statErr != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
total += fi.Size()
|
||||||
|
}
|
||||||
|
if total == 0 {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return total * 3 / 2
|
||||||
}
|
}
|
||||||
|
|
||||||
// toramActive returns true when the live system was booted with toram.
|
// toramActive returns true when the live system was booted with toram.
|
||||||
@@ -222,12 +234,10 @@ func DiskWarnings(d InstallDisk) []string {
|
|||||||
humanBytes(min), humanBytes(d.SizeBytes)))
|
humanBytes(min), humanBytes(d.SizeBytes)))
|
||||||
}
|
}
|
||||||
if toramActive() {
|
if toramActive() {
|
||||||
sqFi, err := os.Stat(squashfsPath)
|
free := freeMemBytes()
|
||||||
if err == nil {
|
min := MinInstallBytes()
|
||||||
free := freeMemBytes()
|
if free > 0 && min > 0 && free < (min*4/3) {
|
||||||
if free > 0 && free < sqFi.Size()*2 {
|
w = append(w, "toram mode — low RAM, extraction may be slow or fail")
|
||||||
w = append(w, "toram mode — low RAM, extraction may be slow or fail")
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return w
|
return w
|
||||||
|
|||||||
@@ -55,7 +55,6 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
health.Interfaces = make([]schema.RuntimeInterface, 0, len(interfaces))
|
health.Interfaces = make([]schema.RuntimeInterface, 0, len(interfaces))
|
||||||
hasIPv4 := false
|
hasIPv4 := false
|
||||||
missingIPv4 := false
|
|
||||||
for _, iface := range interfaces {
|
for _, iface := range interfaces {
|
||||||
outcome := "no_offer"
|
outcome := "no_offer"
|
||||||
if len(iface.IPv4) > 0 {
|
if len(iface.IPv4) > 0 {
|
||||||
@@ -63,8 +62,6 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
|||||||
hasIPv4 = true
|
hasIPv4 = true
|
||||||
} else if strings.EqualFold(iface.State, "DOWN") {
|
} else if strings.EqualFold(iface.State, "DOWN") {
|
||||||
outcome = "link_down"
|
outcome = "link_down"
|
||||||
} else {
|
|
||||||
missingIPv4 = true
|
|
||||||
}
|
}
|
||||||
health.Interfaces = append(health.Interfaces, schema.RuntimeInterface{
|
health.Interfaces = append(health.Interfaces, schema.RuntimeInterface{
|
||||||
Name: iface.Name,
|
Name: iface.Name,
|
||||||
@@ -73,17 +70,9 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
|||||||
Outcome: outcome,
|
Outcome: outcome,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
switch {
|
if hasIPv4 {
|
||||||
case hasIPv4 && !missingIPv4:
|
|
||||||
health.NetworkStatus = "OK"
|
health.NetworkStatus = "OK"
|
||||||
case hasIPv4:
|
} else {
|
||||||
health.NetworkStatus = "PARTIAL"
|
|
||||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
|
||||||
Code: "dhcp_partial",
|
|
||||||
Severity: "warning",
|
|
||||||
Description: "At least one interface did not obtain IPv4 connectivity.",
|
|
||||||
})
|
|
||||||
default:
|
|
||||||
health.NetworkStatus = "FAILED"
|
health.NetworkStatus = "FAILED"
|
||||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
Code: "dhcp_failed",
|
Code: "dhcp_failed",
|
||||||
|
|||||||
@@ -1679,6 +1679,56 @@ func (h *handler) handleAPIBenchmarkResults(w http.ResponseWriter, r *http.Reque
|
|||||||
fmt.Fprint(w, renderBenchmarkResultsCard(h.opts.ExportDir))
|
fmt.Fprint(w, renderBenchmarkResultsCard(h.opts.ExportDir))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── Hardware summary / component detail ──────────────────────────────────────
|
||||||
|
|
||||||
|
// handleAPIHardwareSummary returns the hardware summary card HTML fragment for
|
||||||
|
// htmx polling (hx-get="/api/hardware-summary" hx-swap="outerHTML").
|
||||||
|
func (h *handler) handleAPIHardwareSummary(w http.ResponseWriter, _ *http.Request) {
|
||||||
|
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
fmt.Fprint(w, renderHardwareSummaryCard(h.opts))
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleAPIComponentDetail returns an HTML fragment describing the current and
|
||||||
|
// historical status for one component type (cpu, memory, storage, gpu, psu).
|
||||||
|
func (h *handler) handleAPIComponentDetail(w http.ResponseWriter, r *http.Request) {
|
||||||
|
compType := r.PathValue("type")
|
||||||
|
var exact, prefixes []string
|
||||||
|
var title string
|
||||||
|
switch compType {
|
||||||
|
case "cpu":
|
||||||
|
title = "CPU"
|
||||||
|
exact = []string{"cpu:all"}
|
||||||
|
case "memory":
|
||||||
|
title = "Memory"
|
||||||
|
exact = []string{"memory:all"}
|
||||||
|
prefixes = []string{"memory:"}
|
||||||
|
case "storage":
|
||||||
|
title = "Storage"
|
||||||
|
exact = []string{"storage:all"}
|
||||||
|
prefixes = []string{"storage:"}
|
||||||
|
case "gpu":
|
||||||
|
title = "GPU"
|
||||||
|
prefixes = []string{"pcie:gpu:"}
|
||||||
|
case "psu":
|
||||||
|
title = "PSU"
|
||||||
|
prefixes = []string{"psu:"}
|
||||||
|
default:
|
||||||
|
http.NotFound(w, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var records []app.ComponentStatusRecord
|
||||||
|
if h.opts.App != nil && h.opts.App.StatusDB != nil {
|
||||||
|
all := h.opts.App.StatusDB.All()
|
||||||
|
records = matchedRecords(all, exact, prefixes)
|
||||||
|
}
|
||||||
|
|
||||||
|
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
fmt.Fprint(w, renderComponentDetail(title, records))
|
||||||
|
}
|
||||||
|
|
||||||
func (h *handler) rollbackPendingNetworkChange() error {
|
func (h *handler) rollbackPendingNetworkChange() error {
|
||||||
h.pendingNetMu.Lock()
|
h.pendingNetMu.Lock()
|
||||||
pnc := h.pendingNet
|
pnc := h.pendingNet
|
||||||
|
|||||||
@@ -0,0 +1,76 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"context"
|
||||||
|
"log/slog"
|
||||||
|
"os/exec"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/collector"
|
||||||
|
)
|
||||||
|
|
||||||
|
const healthPollInterval = 60 * time.Second
|
||||||
|
const psuIPMITimeout = 15 * time.Second
|
||||||
|
|
||||||
|
// healthPoller runs periodic health checks for hardware components that do not
|
||||||
|
// emit kernel log events (e.g. PSU). Results are written to ComponentStatusDB.
|
||||||
|
type healthPoller struct {
|
||||||
|
statusDB *app.ComponentStatusDB
|
||||||
|
}
|
||||||
|
|
||||||
|
func newHealthPoller(statusDB *app.ComponentStatusDB) *healthPoller {
|
||||||
|
return &healthPoller{statusDB: statusDB}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *healthPoller) start() {
|
||||||
|
goRecoverLoop("health poller", 5*time.Second, p.run)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *healthPoller) run() {
|
||||||
|
ticker := time.NewTicker(healthPollInterval)
|
||||||
|
defer ticker.Stop()
|
||||||
|
for range ticker.C {
|
||||||
|
p.pollPSU()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *healthPoller) pollPSU() {
|
||||||
|
if p.statusDB == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), psuIPMITimeout)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
cmd := exec.CommandContext(ctx, "ipmitool", "sdr")
|
||||||
|
var out bytes.Buffer
|
||||||
|
cmd.Stdout = &out
|
||||||
|
if err := cmd.Run(); err != nil {
|
||||||
|
// IPMI not available or not a server — skip silently.
|
||||||
|
slog.Debug("health poller: ipmitool sdr unavailable", "err", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
slots := collector.PSUSlotsFromSDR(out.String())
|
||||||
|
if len(slots) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
const source = "watchdog:psu"
|
||||||
|
for slot, psu := range slots {
|
||||||
|
key := "psu:" + slot
|
||||||
|
status := psu.Status
|
||||||
|
if status == "" {
|
||||||
|
status = "Unknown"
|
||||||
|
}
|
||||||
|
detail := ""
|
||||||
|
switch status {
|
||||||
|
case "Critical":
|
||||||
|
detail = "PSU sensor reported non-OK state"
|
||||||
|
case "Warning":
|
||||||
|
detail = "PSU sensor in warning state"
|
||||||
|
}
|
||||||
|
p.statusDB.Record(key, source, status, detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -73,6 +73,9 @@ func (w *kmsgWatcher) run() {
|
|||||||
w.mu.Lock()
|
w.mu.Lock()
|
||||||
if w.window != nil {
|
if w.window != nil {
|
||||||
w.recordEvent(evt)
|
w.recordEvent(evt)
|
||||||
|
} else {
|
||||||
|
evtCopy := evt
|
||||||
|
goRecoverOnce("kmsg flush immediate", func() { w.flushImmediate(evtCopy) })
|
||||||
}
|
}
|
||||||
w.mu.Unlock()
|
w.mu.Unlock()
|
||||||
}
|
}
|
||||||
@@ -162,7 +165,9 @@ func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
|||||||
for _, id := range evt.ids {
|
for _, id := range evt.ids {
|
||||||
var key string
|
var key string
|
||||||
switch evt.category {
|
switch evt.category {
|
||||||
case "gpu", "pcie":
|
case "gpu":
|
||||||
|
key = "pcie:gpu:" + normalizeBDF(id)
|
||||||
|
case "pcie":
|
||||||
key = "pcie:" + normalizeBDF(id)
|
key = "pcie:" + normalizeBDF(id)
|
||||||
case "storage":
|
case "storage":
|
||||||
key = "storage:" + id
|
key = "storage:" + id
|
||||||
@@ -180,6 +185,54 @@ func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// flushImmediate writes a single kmsg event directly to the status DB without a SAT window.
|
||||||
|
// Called when an error is detected outside of any SAT task (always-on watching).
|
||||||
|
func (w *kmsgWatcher) flushImmediate(evt kmsgEvent) {
|
||||||
|
if w.statusDB == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
const source = "watchdog:kmsg"
|
||||||
|
detail := "kernel: " + truncate(evt.raw, 120)
|
||||||
|
|
||||||
|
var severity string
|
||||||
|
for _, p := range platform.HardwareErrorPatterns {
|
||||||
|
if p.Re.MatchString(evt.raw) {
|
||||||
|
if p.Severity == "critical" {
|
||||||
|
severity = "Critical"
|
||||||
|
} else {
|
||||||
|
severity = "Warning"
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if severity == "" {
|
||||||
|
severity = "Warning"
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(evt.ids) == 0 {
|
||||||
|
key := "cpu:all"
|
||||||
|
if evt.category == "memory" {
|
||||||
|
key = "memory:all"
|
||||||
|
}
|
||||||
|
w.statusDB.Record(key, source, severity, detail)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, id := range evt.ids {
|
||||||
|
var key string
|
||||||
|
switch evt.category {
|
||||||
|
case "gpu":
|
||||||
|
key = "pcie:gpu:" + normalizeBDF(id)
|
||||||
|
case "pcie":
|
||||||
|
key = "pcie:" + normalizeBDF(id)
|
||||||
|
case "storage":
|
||||||
|
key = "storage:" + id
|
||||||
|
default:
|
||||||
|
key = "pcie:" + normalizeBDF(id)
|
||||||
|
}
|
||||||
|
w.statusDB.Record(key, source, severity, detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// parseKmsgLine parses a single /dev/kmsg line and returns an event if it matches
|
// parseKmsgLine parses a single /dev/kmsg line and returns an event if it matches
|
||||||
// any pattern in platform.HardwareErrorPatterns.
|
// any pattern in platform.HardwareErrorPatterns.
|
||||||
// kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"
|
// kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ func layoutHead(title string) string {
|
|||||||
<style>
|
<style>
|
||||||
:root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6);--accent:#2185d0;--accent-dark:#1678c2;--crit-bg:#fff6f6;--crit-fg:#9f3a38;--crit-border:#e0b4b4;--ok-bg:#fcfff5;--ok-fg:#2c662d;--warn-bg:#fffaf3;--warn-fg:#573a08}
|
:root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6);--accent:#2185d0;--accent-dark:#1678c2;--crit-bg:#fff6f6;--crit-fg:#9f3a38;--crit-border:#e0b4b4;--ok-bg:#fcfff5;--ok-fg:#2c662d;--warn-bg:#fffaf3;--warn-fg:#573a08}
|
||||||
*{box-sizing:border-box;margin:0;padding:0}
|
*{box-sizing:border-box;margin:0;padding:0}
|
||||||
|
dialog{margin:auto}
|
||||||
body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);display:flex;min-height:100vh}
|
body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);display:flex;min-height:100vh}
|
||||||
a{color:var(--accent);text-decoration:none}
|
a{color:var(--accent);text-decoration:none}
|
||||||
/* Sidebar */
|
/* Sidebar */
|
||||||
|
|||||||
+248
-18
@@ -5,7 +5,9 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"html"
|
"html"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"regexp"
|
||||||
"sort"
|
"sort"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"bee/audit/internal/app"
|
"bee/audit/internal/app"
|
||||||
@@ -85,6 +87,7 @@ func renderPage(page string, opts HandlerOptions) string {
|
|||||||
body +
|
body +
|
||||||
`</div></div>` +
|
`</div></div>` +
|
||||||
renderAuditModal() +
|
renderAuditModal() +
|
||||||
|
`<dialog id="component-detail-dialog" style="min-width:600px;max-width:900px;width:90vw;padding:0;border:1px solid var(--border);border-radius:8px;background:var(--surface)"><div id="component-detail-body" style="padding-bottom:20px"></div></dialog>` +
|
||||||
`<script>
|
`<script>
|
||||||
// Add copy button to every .terminal on the page
|
// Add copy button to every .terminal on the page
|
||||||
document.querySelectorAll('.terminal').forEach(function(t){
|
document.querySelectorAll('.terminal').forEach(function(t){
|
||||||
@@ -94,6 +97,17 @@ document.querySelectorAll('.terminal').forEach(function(t){
|
|||||||
btn.onclick=function(){navigator.clipboard.writeText(t.textContent).then(function(){btn.textContent='Copied!';setTimeout(function(){btn.textContent='Copy';},1500);});};
|
btn.onclick=function(){navigator.clipboard.writeText(t.textContent).then(function(){btn.textContent='Copied!';setTimeout(function(){btn.textContent='Copy';},1500);});};
|
||||||
w.appendChild(btn);
|
w.appendChild(btn);
|
||||||
});
|
});
|
||||||
|
function openComponentDetail(type) {
|
||||||
|
var dlg = document.getElementById('component-detail-dialog');
|
||||||
|
var body = document.getElementById('component-detail-body');
|
||||||
|
body.innerHTML = '<div style="padding:20px;color:var(--muted)">Loading…</div>';
|
||||||
|
dlg.showModal();
|
||||||
|
fetch('/api/components/' + type).then(function(r){ return r.text(); }).then(function(html){
|
||||||
|
body.innerHTML = html;
|
||||||
|
}).catch(function(){
|
||||||
|
body.innerHTML = '<div style="padding:20px;color:var(--crit-fg)">Error loading details.</div>';
|
||||||
|
});
|
||||||
|
}
|
||||||
</script>` +
|
</script>` +
|
||||||
`</body></html>`
|
`</body></html>`
|
||||||
}
|
}
|
||||||
@@ -106,6 +120,14 @@ func renderDashboard(opts HandlerOptions) string {
|
|||||||
b.WriteString(renderHardwareSummaryCard(opts))
|
b.WriteString(renderHardwareSummaryCard(opts))
|
||||||
b.WriteString(renderHealthCard(opts))
|
b.WriteString(renderHealthCard(opts))
|
||||||
b.WriteString(renderMetrics())
|
b.WriteString(renderMetrics())
|
||||||
|
b.WriteString(`<script>
|
||||||
|
setInterval(function(){
|
||||||
|
fetch('/api/hardware-summary').then(function(r){return r.text();}).then(function(html){
|
||||||
|
var el=document.getElementById('hw-summary-card');
|
||||||
|
if(el){el.outerHTML=html;}
|
||||||
|
}).catch(function(){});
|
||||||
|
},30000);
|
||||||
|
</script>`)
|
||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -184,13 +206,14 @@ func renderAudit() string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func renderHardwareSummaryCard(opts HandlerOptions) string {
|
func renderHardwareSummaryCard(opts HandlerOptions) string {
|
||||||
|
const cardID = ` id="hw-summary-card"`
|
||||||
data, err := loadSnapshot(opts.AuditPath)
|
data, err := loadSnapshot(opts.AuditPath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return `<div class="card"><div class="card-head card-head-actions"><span>Hardware Summary</span><div class="card-head-buttons"><button class="btn btn-primary btn-sm" onclick="auditModalRun()">Run audit</button></div></div><div class="card-body"></div></div>`
|
return `<div class="card"` + cardID + `><div class="card-head card-head-actions"><span>Hardware Summary</span><div class="card-head-buttons"><button class="btn btn-primary btn-sm" onclick="auditModalRun()">Run audit</button></div></div><div class="card-body"></div></div>`
|
||||||
}
|
}
|
||||||
var ingest schema.HardwareIngestRequest
|
var ingest schema.HardwareIngestRequest
|
||||||
if err := json.Unmarshal(data, &ingest); err != nil {
|
if err := json.Unmarshal(data, &ingest); err != nil {
|
||||||
return `<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body"><span class="badge badge-err">Parse error</span></div></div>`
|
return `<div class="card"` + cardID + `><div class="card-head">Hardware Summary</div><div class="card-body"><span class="badge badge-err">Parse error</span></div></div>`
|
||||||
}
|
}
|
||||||
hw := ingest.Hardware
|
hw := ingest.Hardware
|
||||||
|
|
||||||
@@ -200,7 +223,7 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var b strings.Builder
|
var b strings.Builder
|
||||||
b.WriteString(`<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body">`)
|
b.WriteString(`<div class="card"` + cardID + `><div class="card-head">Hardware Summary</div><div class="card-body">`)
|
||||||
|
|
||||||
// Server identity block above the component table.
|
// Server identity block above the component table.
|
||||||
{
|
{
|
||||||
@@ -229,22 +252,32 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
b.WriteString(`<table style="width:auto">`)
|
b.WriteString(`<table style="width:auto">`)
|
||||||
writeRow := func(label, value, badgeHTML string) {
|
// writeRow renders one component row. compType is the URL path segment for the detail
|
||||||
b.WriteString(fmt.Sprintf(`<tr><td style="padding:6px 14px 6px 0;font-weight:700;white-space:nowrap">%s</td><td style="padding:6px 0;color:var(--muted);font-size:13px">%s</td><td style="padding:6px 0 6px 12px">%s</td></tr>`,
|
// endpoint (e.g. "cpu"). Pass "" for rows that have no detail view.
|
||||||
html.EscapeString(label), html.EscapeString(value), badgeHTML))
|
writeRow := func(label, value, badgeHTML, compType string) {
|
||||||
|
var labelHTML string
|
||||||
|
if compType != "" {
|
||||||
|
labelHTML = fmt.Sprintf(
|
||||||
|
`<span style="cursor:pointer;text-decoration:underline dotted;text-underline-offset:3px" onclick="openComponentDetail('%s')">%s</span>`,
|
||||||
|
compType, html.EscapeString(label))
|
||||||
|
} else {
|
||||||
|
labelHTML = html.EscapeString(label)
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, `<tr><td style="padding:6px 14px 6px 0;font-weight:700;white-space:nowrap">%s</td><td style="padding:6px 0;color:var(--muted);font-size:13px">%s</td><td style="padding:6px 0 6px 12px">%s</td></tr>`,
|
||||||
|
labelHTML, html.EscapeString(value), badgeHTML)
|
||||||
}
|
}
|
||||||
|
|
||||||
writeRow("CPU", hwDescribeCPU(hw),
|
writeRow("CPU", hwDescribeCPU(hw),
|
||||||
renderComponentChips(matchedRecords(records, []string{"cpu:all"}, nil)))
|
renderComponentChips(matchedRecords(records, []string{"cpu:all"}, nil)), "cpu")
|
||||||
|
|
||||||
writeRow("Memory", hwDescribeMemory(hw),
|
writeRow("Memory", hwDescribeMemory(hw),
|
||||||
renderComponentChips(matchedRecords(records, []string{"memory:all"}, []string{"memory:"})))
|
renderComponentChips(matchedRecords(records, []string{"memory:all"}, []string{"memory:"})), "memory")
|
||||||
|
|
||||||
writeRow("Storage", hwDescribeStorage(hw),
|
writeRow("Storage", hwDescribeStorage(hw),
|
||||||
renderComponentChips(matchedRecords(records, []string{"storage:all"}, []string{"storage:"})))
|
renderComponentChips(matchedRecords(records, []string{"storage:all"}, []string{"storage:"})), "storage")
|
||||||
|
|
||||||
writeRow("GPU", hwDescribeGPU(hw),
|
writeRow("GPU", hwDescribeGPU(hw),
|
||||||
renderComponentChips(matchedRecords(records, nil, []string{"pcie:gpu:"})))
|
renderComponentChips(matchedRecords(records, nil, []string{"pcie:gpu:"})), "gpu")
|
||||||
|
|
||||||
psuMatched := matchedRecords(records, nil, []string{"psu:"})
|
psuMatched := matchedRecords(records, nil, []string{"psu:"})
|
||||||
if len(psuMatched) == 0 && len(hw.PowerSupplies) > 0 {
|
if len(psuMatched) == 0 && len(hw.PowerSupplies) > 0 {
|
||||||
@@ -252,10 +285,10 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
|
|||||||
psuStatus := hwPSUStatus(hw.PowerSupplies)
|
psuStatus := hwPSUStatus(hw.PowerSupplies)
|
||||||
psuMatched = []app.ComponentStatusRecord{{ComponentKey: "psu:ipmi", Status: psuStatus}}
|
psuMatched = []app.ComponentStatusRecord{{ComponentKey: "psu:ipmi", Status: psuStatus}}
|
||||||
}
|
}
|
||||||
writeRow("PSU", hwDescribePSU(hw), renderComponentChips(psuMatched))
|
writeRow("PSU", hwDescribePSU(hw), renderComponentChips(psuMatched), "psu")
|
||||||
|
|
||||||
if nicDesc := hwDescribeNIC(hw); nicDesc != "" {
|
if nicDesc := hwDescribeNIC(hw); nicDesc != "" {
|
||||||
writeRow("Network", nicDesc, "")
|
writeRow("Network", nicDesc, "", "")
|
||||||
}
|
}
|
||||||
|
|
||||||
b.WriteString(`</table>`)
|
b.WriteString(`</table>`)
|
||||||
@@ -614,7 +647,7 @@ func buildRuntimeNetworkRow(health schema.RuntimeHealth) runtimeHealthRow {
|
|||||||
if status == "" {
|
if status == "" {
|
||||||
status = "UNKNOWN"
|
status = "UNKNOWN"
|
||||||
}
|
}
|
||||||
issue := runtimeIssueDescriptions(health.Issues, "dhcp_partial", "dhcp_failed")
|
issue := runtimeIssueDescriptions(health.Issues, "dhcp_failed")
|
||||||
return runtimeHealthRow{Title: "Network", Status: status, Source: "ListInterfaces / DHCP", Issue: issue}
|
return runtimeHealthRow{Title: "Network", Status: status, Source: "ListInterfaces / DHCP", Issue: issue}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -672,12 +705,12 @@ func buildRuntimeServicesRow(health schema.RuntimeHealth) runtimeHealthRow {
|
|||||||
nonActive := make([]string, 0)
|
nonActive := make([]string, 0)
|
||||||
for _, svc := range health.Services {
|
for _, svc := range health.Services {
|
||||||
state := strings.TrimSpace(strings.ToLower(svc.Status))
|
state := strings.TrimSpace(strings.ToLower(svc.Status))
|
||||||
// "activating" and "deactivating" are transient states for oneshot services
|
// "inactive" is OK for oneshot services that have completed successfully
|
||||||
// (RemainAfterExit=yes) — the service is running normally, not failed.
|
// (bee-sshsetup, bee-preflight, bee-audit, bee-network, etc.).
|
||||||
// Only "failed" and "inactive" (after services should be running) are problems.
|
// Only "failed" is a genuine problem.
|
||||||
switch state {
|
switch state {
|
||||||
case "active", "activating", "deactivating", "reloading":
|
case "active", "activating", "deactivating", "reloading", "inactive":
|
||||||
// OK — service is running or transitioning normally
|
// OK — service is running, transitioning normally, or completed successfully
|
||||||
default:
|
default:
|
||||||
nonActive = append(nonActive, svc.Name+"="+svc.Status)
|
nonActive = append(nonActive, svc.Name+"="+svc.Status)
|
||||||
}
|
}
|
||||||
@@ -999,3 +1032,200 @@ func rowIssueHTML(issue string) string {
|
|||||||
}
|
}
|
||||||
return html.EscapeString(issue)
|
return html.EscapeString(issue)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// aerStatusRe captures the hexadecimal value (1-8 digits) that follows
// "aer_status:" in a kernel error detail string.
var aerStatusRe = regexp.MustCompile(`aer_status:\s*0x([0-9a-fA-F]{1,8})`)

// decodeAERStatus parses an AER status hex value from a kernel error detail
// string and returns a human-readable, comma-separated list of the set bit
// names followed by " (correctable)" or " (uncorrectable)".
//
// It returns "" when detail contains no "aer_status: 0x…" value, and
// "unknown bits: 0x…" when the value has no bit known to either register.
//
// The detail string does not say which AER register the value came from, and
// bits 12-15 are defined in both the Correctable and the Uncorrectable Error
// Status registers. The value is therefore decoded against both layouts and the
// interpretation that names at least as many bits wins, with ties going to
// "correctable".
func decodeAERStatus(detail string) string {
	m := aerStatusRe.FindStringSubmatch(detail)
	if m == nil {
		return ""
	}
	v64, err := strconv.ParseUint(m[1], 16, 32)
	if err != nil {
		return ""
	}
	val := uint32(v64)

	type bitDef struct {
		bit  uint32
		name string
	}
	// Correctable Error Status register layout (PCI_ERR_COR_* in the Linux
	// pci_regs.h header): bit 0, bits 6-8 and bits 12-15.
	corrBits := []bitDef{
		{0, "Receiver Error"}, {6, "Bad TLP"}, {7, "Bad DLLP"},
		{8, "Replay Num Rollover"}, {12, "Replay Timer Timeout"},
		{13, "Advisory Non-Fatal"}, {14, "Corrected Internal Error"}, {15, "Header Log Overflow"},
	}
	// Uncorrectable Error Status register layout (PCI_ERR_UNC_*).
	uncorrBits := []bitDef{
		{4, "Data Link Protocol Error"}, {5, "Surprise Down Error"},
		{12, "Poisoned TLP Received"}, {13, "Flow Control Protocol Error"},
		{14, "Completion Timeout"}, {15, "Completer Abort"}, {16, "Unexpected Completion"},
		{17, "Receiver Overflow"}, {18, "Malformed TLP"}, {19, "ECRC Error"},
		{20, "Unsupported Request Error"}, {21, "ACS Violation"}, {22, "Uncorrectable Internal Error"},
	}

	// setNames lists, in table order, the names of the table bits set in val.
	setNames := func(table []bitDef) []string {
		var names []string
		for _, b := range table {
			if val&(uint32(1)<<b.bit) != 0 {
				names = append(names, b.name)
			}
		}
		return names
	}
	corrNames := setNames(corrBits)
	uncorrNames := setNames(uncorrBits)

	if len(corrNames) >= len(uncorrNames) && len(corrNames) > 0 {
		return strings.Join(corrNames, ", ") + " (correctable)"
	}
	if len(uncorrNames) > 0 {
		return strings.Join(uncorrNames, ", ") + " (uncorrectable)"
	}
	return fmt.Sprintf("unknown bits: 0x%08x", val)
}
|
||||||
|
|
||||||
|
// renderSparkline returns a small inline SVG showing non-OK events over time.
|
||||||
|
// Events are positioned proportionally along the time axis; if all share the same
|
||||||
|
// timestamp they are spaced evenly. Width is always 100px.
|
||||||
|
func renderSparkline(history []app.ComponentStatusEntry) string {
|
||||||
|
const (
|
||||||
|
svgW = 100
|
||||||
|
svgH = 20
|
||||||
|
barW = 3
|
||||||
|
barH = 14
|
||||||
|
)
|
||||||
|
var events []app.ComponentStatusEntry
|
||||||
|
for _, e := range history {
|
||||||
|
if e.Status != "OK" {
|
||||||
|
events = append(events, e)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(events) == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
n := len(events)
|
||||||
|
barColor := func(status string) string {
|
||||||
|
if status == "Critical" {
|
||||||
|
return "#c0392b"
|
||||||
|
}
|
||||||
|
return "#d97706"
|
||||||
|
}
|
||||||
|
yTop := (svgH - barH) / 2
|
||||||
|
|
||||||
|
var bars strings.Builder
|
||||||
|
if n == 1 {
|
||||||
|
x := (svgW - barW) / 2
|
||||||
|
fmt.Fprintf(&bars, `<rect x="%d" y="%d" width="%d" height="%d" fill="%s" rx="1"/>`,
|
||||||
|
x, yTop, barW, barH, barColor(events[0].Status))
|
||||||
|
} else {
|
||||||
|
minT := events[0].At
|
||||||
|
maxT := events[n-1].At
|
||||||
|
dur := maxT.Sub(minT).Seconds()
|
||||||
|
for i, e := range events {
|
||||||
|
var x int
|
||||||
|
if dur <= 0 {
|
||||||
|
step := svgW / n
|
||||||
|
x = i*step + (step-barW)/2
|
||||||
|
} else {
|
||||||
|
frac := e.At.Sub(minT).Seconds() / dur
|
||||||
|
x = int(frac * float64(svgW-barW))
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&bars, `<rect x="%d" y="%d" width="%d" height="%d" fill="%s" rx="1"/>`,
|
||||||
|
x, yTop, barW, barH, barColor(e.Status))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return fmt.Sprintf(
|
||||||
|
`<svg width="%d" height="%d" style="display:inline-block;vertical-align:middle;margin-left:6px;flex-shrink:0" xmlns="http://www.w3.org/2000/svg">`+
|
||||||
|
`<rect x="0" y="0" width="%d" height="%d" fill="var(--surface-alt,#ebebeb)" rx="3"/>%s</svg>`,
|
||||||
|
svgW, svgH, svgW, svgH, bars.String())
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderComponentDetail renders a modal content fragment for one component type.
|
||||||
|
// Called by handleAPIComponentDetail and displayed inside #component-detail-dialog.
|
||||||
|
func renderComponentDetail(title string, records []app.ComponentStatusRecord) string {
|
||||||
|
var b strings.Builder
|
||||||
|
fmt.Fprintf(&b, `<div style="padding:20px 24px 0">`)
|
||||||
|
fmt.Fprintf(&b, `<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:16px">`)
|
||||||
|
fmt.Fprintf(&b, `<span style="font-size:16px;font-weight:700">%s — Status Detail</span>`, html.EscapeString(title))
|
||||||
|
b.WriteString(`<button class="btn btn-sm btn-secondary" onclick="document.getElementById('component-detail-dialog').close()">Close</button>`)
|
||||||
|
b.WriteString(`</div>`)
|
||||||
|
|
||||||
|
if len(records) == 0 {
|
||||||
|
b.WriteString(`<p style="color:var(--muted)">No status data recorded yet for this component type.</p>`)
|
||||||
|
b.WriteString(`</div>`)
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
sort.Slice(records, func(i, j int) bool {
|
||||||
|
return records[i].ComponentKey < records[j].ComponentKey
|
||||||
|
})
|
||||||
|
|
||||||
|
for _, rec := range records {
|
||||||
|
letter, cls := chipLetterClass(rec.Status)
|
||||||
|
|
||||||
|
// Count non-OK events across the full history for the badge + sparkline.
|
||||||
|
warnCount := 0
|
||||||
|
for _, e := range rec.History {
|
||||||
|
if e.Status != "OK" {
|
||||||
|
warnCount++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Fprintf(&b, `<div style="margin-bottom:20px">`)
|
||||||
|
fmt.Fprintf(&b, `<div style="display:flex;align-items:center;gap:8px;margin-bottom:8px;flex-wrap:wrap">`)
|
||||||
|
fmt.Fprintf(&b, `<span class="chip %s">%s</span>`, cls, letter)
|
||||||
|
fmt.Fprintf(&b, `<span style="font-weight:700;font-size:13px">%s</span>`, html.EscapeString(rec.ComponentKey))
|
||||||
|
if !rec.LastCheckedAt.IsZero() {
|
||||||
|
fmt.Fprintf(&b, `<span style="color:var(--muted);font-size:12px">checked %s</span>`, rec.LastCheckedAt.Format("2006-01-02 15:04:05"))
|
||||||
|
}
|
||||||
|
if warnCount > 0 {
|
||||||
|
noun := "events"
|
||||||
|
if warnCount == 1 {
|
||||||
|
noun = "event"
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b,
|
||||||
|
`<span style="font-size:11px;background:var(--warn-bg,#fffbeb);color:var(--warn-fg,#92400e);border:1px solid var(--warn-border,#fde68a);border-radius:10px;padding:1px 7px;white-space:nowrap">%d %s</span>`,
|
||||||
|
warnCount, noun)
|
||||||
|
b.WriteString(renderSparkline(rec.History))
|
||||||
|
}
|
||||||
|
b.WriteString(`</div>`)
|
||||||
|
|
||||||
|
if rec.ErrorSummary != "" {
|
||||||
|
fmt.Fprintf(&b, `<div style="font-size:12px;margin-bottom:4px;color:var(--muted)">%s</div>`, html.EscapeString(rec.ErrorSummary))
|
||||||
|
if decoded := decodeAERStatus(rec.ErrorSummary); decoded != "" {
|
||||||
|
fmt.Fprintf(&b,
|
||||||
|
`<div style="font-size:12px;margin-bottom:8px;color:var(--muted)"><span style="background:var(--surface-alt,#f5f5f5);border-radius:4px;padding:1px 6px;font-family:monospace">AER: %s</span></div>`,
|
||||||
|
html.EscapeString(decoded))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// History table — newest first, cap at 20 entries.
|
||||||
|
history := rec.History
|
||||||
|
if len(history) > 20 {
|
||||||
|
history = history[len(history)-20:]
|
||||||
|
}
|
||||||
|
b.WriteString(`<table style="width:100%;font-size:12px;border-collapse:collapse">`)
|
||||||
|
b.WriteString(`<tr style="color:var(--muted)"><th style="text-align:left;padding:2px 10px 2px 0;white-space:nowrap">Time</th><th style="text-align:left;padding:2px 10px 2px 0">Status</th><th style="text-align:left;padding:2px 10px 2px 0">Source</th><th style="text-align:left;padding:2px 0">Detail</th></tr>`)
|
||||||
|
for i := len(history) - 1; i >= 0; i-- {
|
||||||
|
e := history[i]
|
||||||
|
eLetter, eCls := chipLetterClass(e.Status)
|
||||||
|
detail := e.Detail
|
||||||
|
if detail == "" {
|
||||||
|
detail = "—"
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b,
|
||||||
|
`<tr><td style="padding:3px 10px 3px 0;white-space:nowrap;color:var(--muted)">%s</td><td style="padding:3px 10px 3px 0"><span class="chip %s" style="font-size:10px;width:16px;height:16px">%s</span></td><td style="padding:3px 10px 3px 0;white-space:nowrap">%s</td><td style="padding:3px 0;color:var(--muted)">%s</td></tr>`,
|
||||||
|
html.EscapeString(e.At.Format("2006-01-02 15:04:05")),
|
||||||
|
eCls, eLetter,
|
||||||
|
html.EscapeString(e.Source),
|
||||||
|
html.EscapeString(detail),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
b.WriteString(`</table>`)
|
||||||
|
b.WriteString(`</div>`)
|
||||||
|
}
|
||||||
|
|
||||||
|
b.WriteString(`</div>`)
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|||||||
@@ -221,6 +221,11 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
h.kmsg = newKmsgWatcher(opts.App.StatusDB)
|
h.kmsg = newKmsgWatcher(opts.App.StatusDB)
|
||||||
h.kmsg.start()
|
h.kmsg.start()
|
||||||
globalQueue.kmsgWatcher = h.kmsg
|
globalQueue.kmsgWatcher = h.kmsg
|
||||||
|
|
||||||
|
// Start periodic health poller for components that don't emit kernel log events (e.g. PSU).
|
||||||
|
if opts.App.StatusDB != nil {
|
||||||
|
newHealthPoller(opts.App.StatusDB).start()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
globalQueue.startWorker(&opts)
|
globalQueue.startWorker(&opts)
|
||||||
@@ -328,6 +333,10 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
mux.HandleFunc("GET /api/install/disks", h.handleAPIInstallDisks)
|
mux.HandleFunc("GET /api/install/disks", h.handleAPIInstallDisks)
|
||||||
mux.HandleFunc("POST /api/install/run", h.handleAPIInstallRun)
|
mux.HandleFunc("POST /api/install/run", h.handleAPIInstallRun)
|
||||||
|
|
||||||
|
// Hardware component detail (fragment for modal in Hardware Summary card)
|
||||||
|
mux.HandleFunc("GET /api/hardware-summary", h.handleAPIHardwareSummary)
|
||||||
|
mux.HandleFunc("GET /api/components/{type}", h.handleAPIComponentDetail)
|
||||||
|
|
||||||
// Metrics — SSE stream of live sensor data + server-side SVG charts + CSV export
|
// Metrics — SSE stream of live sensor data + server-side SVG charts + CSV export
|
||||||
mux.HandleFunc("GET /api/metrics/stream", h.handleAPIMetricsStream)
|
mux.HandleFunc("GET /api/metrics/stream", h.handleAPIMetricsStream)
|
||||||
mux.HandleFunc("GET /api/metrics/latest", h.handleAPIMetricsLatest)
|
mux.HandleFunc("GET /api/metrics/latest", h.handleAPIMetricsLatest)
|
||||||
|
|||||||
+1
-1
Submodule bible updated: 1d89a4918e...d2600f1279
@@ -0,0 +1,312 @@
|
|||||||
|
# GRUB Bitmap Error History
|
||||||
|
|
||||||
|
## Symptom
|
||||||
|
|
||||||
|
On some servers GRUB prints:
|
||||||
|
|
||||||
|
```text
|
||||||
|
error: null src bitmap in grub_video_bitmap_create_scaled.
|
||||||
|
Press any key to continue...
|
||||||
|
```
|
||||||
|
|
||||||
|
The important new observation as of `v10.7` is:
|
||||||
|
|
||||||
|
- the error still appears even when the logo image block is removed from
|
||||||
|
`iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt`
|
||||||
|
- therefore the current error can no longer be explained only by
|
||||||
|
`bee-logo.png` / `bee-logo.tga`
|
||||||
|
|
||||||
|
That does not prove the theme system is healthy. It proves only that the
|
||||||
|
currently remaining failure is deeper than "bad logo file".
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
Current source files:
|
||||||
|
|
||||||
|
- `iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt`
|
||||||
|
has no `image` block anymore
|
||||||
|
- `iso/builder/config/bootloaders/grub-efi/config.cfg`
|
||||||
|
still does `insmod tga` and then `source /boot/grub/theme.cfg`
|
||||||
|
|
||||||
|
Implication:
|
||||||
|
|
||||||
|
- if the error still fires, the trigger is likely elsewhere in GRUB theme
|
||||||
|
rendering or in the assets/config GRUB resolves while sourcing `theme.cfg`
|
||||||
|
- the old "PNG parser fragility" story is no longer a sufficient explanation
|
||||||
|
for the current failure mode
|
||||||
|
|
||||||
|
Current artifact facts:
|
||||||
|
|
||||||
|
- the provided `easy-bee-nvidia-v10.7-amd64.logs` build logs reference
|
||||||
|
`linux-image-6.1.0-45`
|
||||||
|
- the provided `easy-bee-nvidia-v10.7-amd64.iso` contains
|
||||||
|
`live/initrd.img-6.1.0-45-amd64` and `live/vmlinuz-6.1.0-45-amd64`
|
||||||
|
- a later `BOOT FAILED!` screenshot showed `live/initrd.img-6.1.0-44-amd64`
|
||||||
|
and `live/vmlinuz-6.1.0-44-amd64`
|
||||||
|
|
||||||
|
Implication:
|
||||||
|
|
||||||
|
- the `BOOT FAILED!` screenshot is not from the same artifact as the provided
|
||||||
|
`v10.7` ISO/log set
|
||||||
|
- until the exact ISO filename and checksum are tied to that failure, the
|
||||||
|
GRUB bitmap issue and the live-boot failure must be treated as separate
|
||||||
|
problems
|
||||||
|
|
||||||
|
## Chronology
|
||||||
|
|
||||||
|
### 1. Initial bee GRUB theme introduction
|
||||||
|
|
||||||
|
Relevant commit:
|
||||||
|
|
||||||
|
- `d52ec67` `Stability hardening, build script fixes, GRUB bee logo`
|
||||||
|
|
||||||
|
What changed:
|
||||||
|
|
||||||
|
- bee-branded GRUB theme introduced
|
||||||
|
- image block with explicit `width` / `height`
|
||||||
|
|
||||||
|
Observed result:
|
||||||
|
|
||||||
|
- bitmap error appeared
|
||||||
|
|
||||||
|
### 2. Remove explicit scaling dimensions
|
||||||
|
|
||||||
|
Relevant commit:
|
||||||
|
|
||||||
|
- `aa284ae` `fix(iso): avoid grub logo scaling error`
|
||||||
|
|
||||||
|
What changed:
|
||||||
|
|
||||||
|
- removed `width = 400`
|
||||||
|
- removed `height = 400`
|
||||||
|
|
||||||
|
Reason stated by the change:
|
||||||
|
|
||||||
|
- try to avoid the scaling path
|
||||||
|
|
||||||
|
Observed result:
|
||||||
|
|
||||||
|
- error persisted
|
||||||
|
|
||||||
|
Conclusion:
|
||||||
|
|
||||||
|
- explicit width/height were not the sole trigger
|
||||||
|
|
||||||
|
### 3. Rework PNG handling and menu rendering
|
||||||
|
|
||||||
|
Relevant commit:
|
||||||
|
|
||||||
|
- `6112094` `fix(grub): fix bitmap error and menu rendering`
|
||||||
|
|
||||||
|
Commit message says the change was intended to:
|
||||||
|
|
||||||
|
- convert `bee-logo.png` to RGBA and strip metadata
|
||||||
|
- move `terminal_output gfxterm` before `insmod png` / theme load
|
||||||
|
- remove ASCII banner from GRUB menu area
|
||||||
|
- fix theme typography/layout fields
|
||||||
|
|
||||||
|
Observed result:
|
||||||
|
|
||||||
|
- error persisted
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
|
||||||
|
- this was still operating under the assumption that the issue was the PNG
|
||||||
|
payload or the order of gfxterm/theme init
|
||||||
|
|
||||||
|
### 4. Convert logo PNG back to RGB
|
||||||
|
|
||||||
|
Relevant commit:
|
||||||
|
|
||||||
|
- `333c44f` `Fix GRUB splash: convert bee-logo.png from RGBA to RGB`
|
||||||
|
|
||||||
|
Intended reason:
|
||||||
|
|
||||||
|
- GRUB might dislike RGBA PNG and want RGB PNG
|
||||||
|
|
||||||
|
Observed result:
|
||||||
|
|
||||||
|
- error still persisted according to later project notes
|
||||||
|
|
||||||
|
### 5. Add post-build canonical GRUB/isolinux sync
|
||||||
|
|
||||||
|
Relevant commit:
|
||||||
|
|
||||||
|
- `0cdfbc5` `fix(iso): restore boot UX and boot logs`
|
||||||
|
|
||||||
|
What this introduced:
|
||||||
|
|
||||||
|
- post-`lb build` rewriting of `binary/boot/grub/grub.cfg`
|
||||||
|
- post-`lb build` rewriting of `binary/isolinux/live.cfg`
|
||||||
|
- forced rebuild of `binary_checksums`, `binary_iso`, `binary_zsync`
|
||||||
|
|
||||||
|
Why it was added:
|
||||||
|
|
||||||
|
- restore canonical EASY-BEE boot UX after live-build wrote its own bootloader
|
||||||
|
outputs
|
||||||
|
- restore expected boot menu and logs
|
||||||
|
|
||||||
|
Important note:
|
||||||
|
|
||||||
|
- this commit did not directly solve the bitmap issue
|
||||||
|
- it added a second layer of bootloader mutation after live-build
|
||||||
|
|
||||||
|
### 6. Switch from PNG to TGA
|
||||||
|
|
||||||
|
Relevant commit:
|
||||||
|
|
||||||
|
- `626763e` `Fix GRUB bitmap error: switch from PNG to TGA for splash logo`
|
||||||
|
|
||||||
|
Commit message says:
|
||||||
|
|
||||||
|
- GRUB PNG reader was considered fragile
|
||||||
|
- switch to uncompressed 24-bit TGA
|
||||||
|
- `config.cfg`: `insmod png` -> `insmod tga`
|
||||||
|
- `theme.txt`: `bee-logo.png` -> `bee-logo.tga`
|
||||||
|
|
||||||
|
Observed result:
|
||||||
|
|
||||||
|
- this did not eliminate the problem in the current lineage
|
||||||
|
- today the system still errors even after the entire image block was removed
|
||||||
|
|
||||||
|
Conclusion:
|
||||||
|
|
||||||
|
- switching PNG -> TGA was not a durable root-cause fix
|
||||||
|
|
||||||
|
### 7. Patch EFI image after build
|
||||||
|
|
||||||
|
Relevant commit:
|
||||||
|
|
||||||
|
- `4f20c92` `Make UEFI boot safe and remove GRUB logo`
|
||||||
|
|
||||||
|
What this introduced:
|
||||||
|
|
||||||
|
- `sync_efi_grub_theme_assets`
|
||||||
|
- direct `mtools` patching of `efi.img`
|
||||||
|
- copying `config.cfg`, `theme.cfg`, and `live-theme/*` into the EFI FAT image
|
||||||
|
- removal of the logo image block from `theme.txt`
|
||||||
|
|
||||||
|
Why it was added:
|
||||||
|
|
||||||
|
- make UEFI path "safe"
|
||||||
|
- keep EFI GRUB image aligned with canonical bootloader assets
|
||||||
|
|
||||||
|
Observed result:
|
||||||
|
|
||||||
|
- later this became the direct cause of `Disk full` during build once
|
||||||
|
`bee-logo.tga` was large enough
|
||||||
|
- and even with the logo removed from `theme.txt`, the bitmap error still
|
||||||
|
remained
|
||||||
|
|
||||||
|
Conclusion:
|
||||||
|
|
||||||
|
- EFI post-build patching increased build complexity
|
||||||
|
- removing the logo alone did not remove the runtime GRUB error
|
||||||
|
|
||||||
|
### 8. Remove ASCII logo banners
|
||||||
|
|
||||||
|
Relevant commit:
|
||||||
|
|
||||||
|
- `14505ef` `Remove easy bee ASCII logo banners`
|
||||||
|
|
||||||
|
What changed:
|
||||||
|
|
||||||
|
- web loading page ASCII cleanup only
|
||||||
|
|
||||||
|
Relevance here:
|
||||||
|
|
||||||
|
- none for GRUB bitmap error
|
||||||
|
- included here only to avoid confusion with other "logo removal" work
|
||||||
|
|
||||||
|
### 9. Remove EFI post-build patching
|
||||||
|
|
||||||
|
Relevant commit:
|
||||||
|
|
||||||
|
- `5dc022d` `Drop post-build EFI bootloader patching`
|
||||||
|
|
||||||
|
Why it was done:
|
||||||
|
|
||||||
|
- stop mutating `efi.img` post-build
|
||||||
|
- remove dependence on `mtools` for EFI patching
|
||||||
|
- remove the `Disk full` failure mode
|
||||||
|
|
||||||
|
Impact:
|
||||||
|
|
||||||
|
- this did not target the GRUB bitmap error directly
|
||||||
|
- it targeted build-system complexity and EFI image overflow
|
||||||
|
|
||||||
|
### 10. Restore only GRUB/isolinux post-build sync
|
||||||
|
|
||||||
|
Relevant commit:
|
||||||
|
|
||||||
|
- `42774d4` `Restore post-build GRUB and isolinux sync`
|
||||||
|
|
||||||
|
Why it was needed:
|
||||||
|
|
||||||
|
- removing all post-build sync caused final ISO validation to fail with
|
||||||
|
missing canonical EASY-BEE boot entries
|
||||||
|
- memtest was still fine, but final GRUB menu was no longer canonical
|
||||||
|
|
||||||
|
What it restored:
|
||||||
|
|
||||||
|
- only `binary/boot/grub/grub.cfg`
|
||||||
|
- only `binary/isolinux/live.cfg`
|
||||||
|
|
||||||
|
What it did not restore:
|
||||||
|
|
||||||
|
- no EFI FAT image patching
|
||||||
|
- no `mtools` path
|
||||||
|
|
||||||
|
## What Is Proven False
|
||||||
|
|
||||||
|
The current evidence rules out several simplistic explanations:
|
||||||
|
|
||||||
|
- "the error is only caused by explicit image scaling"
|
||||||
|
- "the error is only caused by PNG vs TGA"
|
||||||
|
- "the error is only caused by the logo file itself"
|
||||||
|
|
||||||
|
Why:
|
||||||
|
|
||||||
|
- scaling dimensions were removed and error persisted
|
||||||
|
- PNG was replaced with TGA and the error still persisted in later builds
|
||||||
|
- the image block itself is now absent, and the error still occurs
|
||||||
|
|
||||||
|
## Working Hypotheses Left
|
||||||
|
|
||||||
|
The remaining plausible layers are:
|
||||||
|
|
||||||
|
- GRUB theme engine still tries to render some bitmap-related element even
|
||||||
|
without the logo image block
|
||||||
|
- GRUB is resolving stale theme assets from the built EFI/ISO path rather than
|
||||||
|
what we think the source tree says
|
||||||
|
- `theme.cfg` / `theme.txt` / GRUB module loading order still triggers a bitmap
|
||||||
|
code path elsewhere
|
||||||
|
- live-build may still package a stale `theme.txt` or stale `live-theme`
|
||||||
|
directory into the final image
|
||||||
|
- the GRUB environment on the failing hardware may behave differently from the
|
||||||
|
assumptions in our source tree
|
||||||
|
|
||||||
|
## Decision Boundary
|
||||||
|
|
||||||
|
Before making another change, the next step should be evidence gathering from
|
||||||
|
the real built artifact, not another speculative edit.
|
||||||
|
|
||||||
|
That means checking on the actual built ISO or EFI image:
|
||||||
|
|
||||||
|
- exact `boot/grub/theme.cfg`
|
||||||
|
- exact `boot/grub/live-theme/theme.txt`
|
||||||
|
- exact contents of `boot/grub/live-theme/`
|
||||||
|
- whether the final image still contains a stale logo reference
|
||||||
|
- whether the EFI path and non-EFI path differ
|
||||||
|
|
||||||
|
## Relevant Commits
|
||||||
|
|
||||||
|
- `d52ec67` `Stability hardening, build script fixes, GRUB bee logo`
|
||||||
|
- `aa284ae` `fix(iso): avoid grub logo scaling error`
|
||||||
|
- `6112094` `fix(grub): fix bitmap error and menu rendering`
|
||||||
|
- `333c44f` `Fix GRUB splash: convert bee-logo.png from RGBA to RGB`
|
||||||
|
- `0cdfbc5` `fix(iso): restore boot UX and boot logs`
|
||||||
|
- `626763e` `Fix GRUB bitmap error: switch from PNG to TGA for splash logo`
|
||||||
|
- `4f20c92` `Make UEFI boot safe and remove GRUB logo`
|
||||||
|
- `5dc022d` `Drop post-build EFI bootloader patching`
|
||||||
|
- `42774d4` `Restore post-build GRUB and isolinux sync`
|
||||||
+1
-1
Submodule internal/chart updated: ac8120c8ab...2a15bc87f1
@@ -38,7 +38,7 @@ lb config noauto \
|
|||||||
--memtest memtest86+ \
|
--memtest memtest86+ \
|
||||||
--iso-volume "${LB_ISO_VOLUME}" \
|
--iso-volume "${LB_ISO_VOLUME}" \
|
||||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--bootappend-live "boot=live live-media-label=${LB_ISO_VOLUME} components video=1920x1080 console=ttyS0,115200n8 console=tty0 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
--bootappend-live "boot=live live-media=/dev/disk/by-label/${LB_ISO_VOLUME} live-media-label=${LB_ISO_VOLUME} components video=1920x1080 console=ttyS0,115200n8 console=tty0 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||||
--debootstrap-options "--include=ca-certificates" \
|
--debootstrap-options "--include=ca-certificates" \
|
||||||
--apt-recommends false \
|
--apt-recommends false \
|
||||||
--chroot-squashfs-compression-type zstd \
|
--chroot-squashfs-compression-type zstd \
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ BUILDER_DIR="${REPO_ROOT}/iso/builder"
|
|||||||
CONTAINER_TOOL="${CONTAINER_TOOL:-docker}"
|
CONTAINER_TOOL="${CONTAINER_TOOL:-docker}"
|
||||||
IMAGE_TAG="${BEE_BUILDER_IMAGE:-bee-iso-builder}"
|
IMAGE_TAG="${BEE_BUILDER_IMAGE:-bee-iso-builder}"
|
||||||
BUILDER_PLATFORM="${BEE_BUILDER_PLATFORM:-linux/amd64}"
|
BUILDER_PLATFORM="${BEE_BUILDER_PLATFORM:-linux/amd64}"
|
||||||
CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}"
|
CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/cache}"
|
||||||
AUTH_KEYS=""
|
AUTH_KEYS=""
|
||||||
CLEAN_CACHE=0
|
CLEAN_CACHE=0
|
||||||
VARIANT="all"
|
VARIANT="all"
|
||||||
@@ -54,14 +54,14 @@ if [ "$CLEAN_CACHE" = "1" ]; then
|
|||||||
"${CACHE_DIR:?}/bee" \
|
"${CACHE_DIR:?}/bee" \
|
||||||
"${CACHE_DIR:?}/lb-packages"
|
"${CACHE_DIR:?}/lb-packages"
|
||||||
echo "=== cleaning live-build work dirs ==="
|
echo "=== cleaning live-build work dirs ==="
|
||||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
|
rm -rf "${REPO_ROOT}/dist/cache/live-build-work-nvidia"
|
||||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia-legacy"
|
rm -rf "${REPO_ROOT}/dist/cache/live-build-work-nvidia-legacy"
|
||||||
rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
|
rm -rf "${REPO_ROOT}/dist/cache/live-build-work-amd"
|
||||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nogpu"
|
rm -rf "${REPO_ROOT}/dist/cache/live-build-work-nogpu"
|
||||||
rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia"
|
rm -rf "${REPO_ROOT}/dist/cache/overlay-stage-nvidia"
|
||||||
rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia-legacy"
|
rm -rf "${REPO_ROOT}/dist/cache/overlay-stage-nvidia-legacy"
|
||||||
rm -rf "${REPO_ROOT}/dist/overlay-stage-amd"
|
rm -rf "${REPO_ROOT}/dist/cache/overlay-stage-amd"
|
||||||
rm -rf "${REPO_ROOT}/dist/overlay-stage-nogpu"
|
rm -rf "${REPO_ROOT}/dist/cache/overlay-stage-nogpu"
|
||||||
echo "=== caches cleared, proceeding with build ==="
|
echo "=== caches cleared, proceeding with build ==="
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|||||||
+153
-116
@@ -51,8 +51,8 @@ case "$BUILD_VARIANT" in
|
|||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BUILD_VARIANT}"
|
BUILD_WORK_DIR="${DIST_DIR}/cache/live-build-work-${BUILD_VARIANT}"
|
||||||
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BUILD_VARIANT}"
|
OVERLAY_STAGE_DIR="${DIST_DIR}/cache/overlay-stage-${BUILD_VARIANT}"
|
||||||
|
|
||||||
export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT
|
export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT
|
||||||
|
|
||||||
@@ -63,7 +63,7 @@ export PATH="$PATH:/usr/local/go/bin"
|
|||||||
|
|
||||||
# Allow git to read the bind-mounted repo (different UID inside container).
|
# Allow git to read the bind-mounted repo (different UID inside container).
|
||||||
git config --global safe.directory "${REPO_ROOT}"
|
git config --global safe.directory "${REPO_ROOT}"
|
||||||
mkdir -p "${DIST_DIR}"
|
mkdir -p "${DIST_DIR}/cache" "${DIST_DIR}/release"
|
||||||
mkdir -p "${CACHE_ROOT}"
|
mkdir -p "${CACHE_ROOT}"
|
||||||
: "${GOCACHE:=${CACHE_ROOT}/go-build}"
|
: "${GOCACHE:=${CACHE_ROOT}/go-build}"
|
||||||
: "${GOMODCACHE:=${CACHE_ROOT}/go-mod}"
|
: "${GOMODCACHE:=${CACHE_ROOT}/go-mod}"
|
||||||
@@ -516,12 +516,12 @@ validate_iso_live_boot_entries() {
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
grep -q 'menuentry "EASY-BEE"' "$grub_cfg" || {
|
grep -q 'menuentry "EASY-BEE v' "$grub_cfg" || {
|
||||||
echo "ERROR: GRUB default EASY-BEE entry is missing" >&2
|
echo "ERROR: GRUB default EASY-BEE entry is missing" >&2
|
||||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
grep -q 'menuentry "EASY-BEE -- load to RAM (toram)"' "$grub_cfg" || {
|
grep -q 'menuentry "EASY-BEE v.* -- load to RAM (toram)"' "$grub_cfg" || {
|
||||||
echo "ERROR: GRUB toram entry is missing" >&2
|
echo "ERROR: GRUB toram entry is missing" >&2
|
||||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
exit 1
|
exit 1
|
||||||
@@ -562,40 +562,38 @@ validate_iso_live_boot_entries() {
|
|||||||
echo "=== live boot validation OK ==="
|
echo "=== live boot validation OK ==="
|
||||||
}
|
}
|
||||||
|
|
||||||
validate_iso_grub_theme_assets() {
|
validate_iso_grub_assets() {
|
||||||
iso_path="$1"
|
iso_path="$1"
|
||||||
echo "=== validating GRUB theme assets in ISO ==="
|
echo "=== validating GRUB assets in ISO ==="
|
||||||
|
|
||||||
[ -f "$iso_path" ] || {
|
[ -f "$iso_path" ] || {
|
||||||
echo "ERROR: ISO not found for GRUB theme validation: $iso_path" >&2
|
echo "ERROR: ISO not found for GRUB asset validation: $iso_path" >&2
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
require_iso_reader "$iso_path" >/dev/null 2>&1 || {
|
require_iso_reader "$iso_path" >/dev/null 2>&1 || {
|
||||||
echo "ERROR: ISO reader unavailable for GRUB theme validation" >&2
|
echo "ERROR: ISO reader unavailable for GRUB asset validation" >&2
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
iso_files="$(mktemp)"
|
iso_files="$(mktemp)"
|
||||||
iso_list_files "$iso_path" > "$iso_files" || {
|
iso_list_files "$iso_path" > "$iso_files" || {
|
||||||
echo "ERROR: failed to list ISO files for GRUB theme validation" >&2
|
echo "ERROR: failed to list ISO files for GRUB asset validation" >&2
|
||||||
rm -f "$iso_files"
|
rm -f "$iso_files"
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
for required in \
|
for required in \
|
||||||
boot/grub/config.cfg \
|
boot/grub/config.cfg \
|
||||||
boot/grub/theme.cfg \
|
boot/grub/grub.cfg; do
|
||||||
boot/grub/live-theme/theme.txt \
|
|
||||||
boot/grub/live-theme/bee-logo.tga; do
|
|
||||||
grep -q "^${required}$" "$iso_files" || {
|
grep -q "^${required}$" "$iso_files" || {
|
||||||
echo "ERROR: missing GRUB theme asset in ISO: ${required}" >&2
|
echo "ERROR: missing GRUB asset in ISO: ${required}" >&2
|
||||||
rm -f "$iso_files"
|
rm -f "$iso_files"
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
done
|
done
|
||||||
|
|
||||||
rm -f "$iso_files"
|
rm -f "$iso_files"
|
||||||
echo "=== GRUB theme validation OK ==="
|
echo "=== GRUB asset validation OK ==="
|
||||||
}
|
}
|
||||||
|
|
||||||
validate_iso_nvidia_runtime() {
|
validate_iso_nvidia_runtime() {
|
||||||
@@ -610,29 +608,37 @@ validate_iso_nvidia_runtime() {
|
|||||||
|
|
||||||
squashfs_tmp="$(mktemp)"
|
squashfs_tmp="$(mktemp)"
|
||||||
squashfs_list="$(mktemp)"
|
squashfs_list="$(mktemp)"
|
||||||
iso_read_member "$iso_path" live/filesystem.squashfs "$squashfs_tmp" || {
|
iso_files="$(mktemp)"
|
||||||
rm -f "$squashfs_tmp" "$squashfs_list"
|
iso_list_files "$iso_path" > "$iso_files" || {
|
||||||
nvidia_runtime_fail "failed to extract live/filesystem.squashfs from ISO"
|
rm -f "$squashfs_tmp" "$squashfs_list" "$iso_files"
|
||||||
}
|
nvidia_runtime_fail "failed to list ISO files for NVIDIA runtime validation"
|
||||||
unsquashfs -ll "$squashfs_tmp" > "$squashfs_list" 2>/dev/null || {
|
|
||||||
rm -f "$squashfs_tmp" "$squashfs_list"
|
|
||||||
nvidia_runtime_fail "failed to inspect filesystem.squashfs from ISO"
|
|
||||||
}
|
}
|
||||||
|
grep '^live/.*\.squashfs$' "$iso_files" | while IFS= read -r squashfs_member; do
|
||||||
|
iso_read_member "$iso_path" "$squashfs_member" "$squashfs_tmp" || {
|
||||||
|
rm -f "$squashfs_tmp" "$squashfs_list" "$iso_files"
|
||||||
|
nvidia_runtime_fail "failed to extract $squashfs_member from ISO"
|
||||||
|
}
|
||||||
|
unsquashfs -ll "$squashfs_tmp" >> "$squashfs_list" 2>/dev/null || {
|
||||||
|
rm -f "$squashfs_tmp" "$squashfs_list" "$iso_files"
|
||||||
|
nvidia_runtime_fail "failed to inspect $squashfs_member from ISO"
|
||||||
|
}
|
||||||
|
: > "$squashfs_tmp"
|
||||||
|
done
|
||||||
|
|
||||||
grep -Eq 'usr/bin/dcgmi$' "$squashfs_list" || {
|
grep -Eq 'usr/bin/dcgmi$' "$squashfs_list" || {
|
||||||
rm -f "$squashfs_tmp" "$squashfs_list"
|
rm -f "$squashfs_tmp" "$squashfs_list" "$iso_files"
|
||||||
nvidia_runtime_fail "dcgmi missing from final NVIDIA ISO"
|
nvidia_runtime_fail "dcgmi missing from final NVIDIA ISO"
|
||||||
}
|
}
|
||||||
grep -Eq 'usr/bin/nv-hostengine$' "$squashfs_list" || {
|
grep -Eq 'usr/bin/nv-hostengine$' "$squashfs_list" || {
|
||||||
rm -f "$squashfs_tmp" "$squashfs_list"
|
rm -f "$squashfs_tmp" "$squashfs_list" "$iso_files"
|
||||||
nvidia_runtime_fail "nv-hostengine missing from final NVIDIA ISO"
|
nvidia_runtime_fail "nv-hostengine missing from final NVIDIA ISO"
|
||||||
}
|
}
|
||||||
grep -Eq 'usr/bin/dcgmproftester([0-9]+)?$' "$squashfs_list" || {
|
grep -Eq 'usr/bin/dcgmproftester([0-9]+)?$' "$squashfs_list" || {
|
||||||
rm -f "$squashfs_tmp" "$squashfs_list"
|
rm -f "$squashfs_tmp" "$squashfs_list" "$iso_files"
|
||||||
nvidia_runtime_fail "dcgmproftester missing from final NVIDIA ISO"
|
nvidia_runtime_fail "dcgmproftester missing from final NVIDIA ISO"
|
||||||
}
|
}
|
||||||
|
|
||||||
rm -f "$squashfs_tmp" "$squashfs_list"
|
rm -f "$squashfs_tmp" "$squashfs_list" "$iso_files"
|
||||||
echo "=== NVIDIA runtime validation OK ==="
|
echo "=== NVIDIA runtime validation OK ==="
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -726,26 +732,26 @@ write_canonical_grub_cfg() {
|
|||||||
kernel="$2"
|
kernel="$2"
|
||||||
append_live="$3"
|
append_live="$3"
|
||||||
initrd="$4"
|
initrd="$4"
|
||||||
|
version_label="${PROJECT_VERSION_EFFECTIVE}"
|
||||||
|
|
||||||
cat > "$cfg" <<EOF
|
cat > "$cfg" <<EOF
|
||||||
source /boot/grub/config.cfg
|
source /boot/grub/config.cfg
|
||||||
|
|
||||||
menuentry "EASY-BEE" {
|
menuentry "EASY-BEE v${version_label}" {
|
||||||
linux ${kernel} ${append_live} nomodeset bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
linux ${kernel} ${append_live} nomodeset bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
initrd ${initrd}
|
initrd ${initrd}
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE -- load to RAM (toram)" {
|
menuentry "EASY-BEE v${version_label} -- load to RAM (toram)" {
|
||||||
linux ${kernel} ${append_live} toram nomodeset bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
linux ${kernel} ${append_live} toram nomodeset bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
initrd ${initrd}
|
initrd ${initrd}
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE -- no GUI / no X11" {
|
menuentry "EASY-BEE v${version_label} -- no GUI / no X11" {
|
||||||
linux ${kernel} ${append_live} nomodeset bee.gui=off bee.nvidia.mode=gsp-off pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
linux ${kernel} ${append_live} nomodeset bee.gui=off bee.nvidia.mode=gsp-off pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
initrd ${initrd}
|
initrd ${initrd}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if [ "\${grub_platform}" = "efi" ]; then
|
if [ "\${grub_platform}" = "efi" ]; then
|
||||||
menuentry "Memory Test (memtest86+)" {
|
menuentry "Memory Test (memtest86+)" {
|
||||||
chainloader /boot/memtest86+x64.efi
|
chainloader /boot/memtest86+x64.efi
|
||||||
@@ -769,23 +775,24 @@ write_canonical_isolinux_cfg() {
|
|||||||
kernel="$2"
|
kernel="$2"
|
||||||
initrd="$3"
|
initrd="$3"
|
||||||
append_live="$4"
|
append_live="$4"
|
||||||
|
version_label="${PROJECT_VERSION_EFFECTIVE}"
|
||||||
|
|
||||||
cat > "$cfg" <<EOF
|
cat > "$cfg" <<EOF
|
||||||
label live-@FLAVOUR@-normal
|
label live-@FLAVOUR@-normal
|
||||||
menu label ^EASY-BEE
|
menu label ^EASY-BEE v${version_label}
|
||||||
linux ${kernel}
|
linux ${kernel}
|
||||||
initrd ${initrd}
|
initrd ${initrd}
|
||||||
append ${append_live} nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
append ${append_live} nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
label live-@FLAVOUR@-toram
|
label live-@FLAVOUR@-toram
|
||||||
menu label EASY-BEE (^load to RAM)
|
menu label EASY-BEE v${version_label} (^load to RAM)
|
||||||
menu default
|
menu default
|
||||||
linux ${kernel}
|
linux ${kernel}
|
||||||
initrd ${initrd}
|
initrd ${initrd}
|
||||||
append ${append_live} toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
append ${append_live} toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
label live-@FLAVOUR@-console
|
label live-@FLAVOUR@-console
|
||||||
menu label EASY-BEE (^no GUI / no X11)
|
menu label EASY-BEE v${version_label} (^no GUI / no X11)
|
||||||
linux ${kernel}
|
linux ${kernel}
|
||||||
initrd ${initrd}
|
initrd ${initrd}
|
||||||
append ${append_live} nomodeset bee.gui=off bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
append ${append_live} nomodeset bee.gui=off bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
@@ -833,10 +840,7 @@ enforce_live_build_bootloader_assets() {
|
|||||||
|
|
||||||
if [ -f "$grub_cfg" ]; then
|
if [ -f "$grub_cfg" ]; then
|
||||||
if extract_live_grub_entry "$grub_cfg"; then
|
if extract_live_grub_entry "$grub_cfg"; then
|
||||||
mkdir -p "$grub_dir/live-theme"
|
|
||||||
cp "${BUILDER_DIR}/config/bootloaders/grub-efi/config.cfg" "$grub_dir/config.cfg"
|
cp "${BUILDER_DIR}/config/bootloaders/grub-efi/config.cfg" "$grub_dir/config.cfg"
|
||||||
cp "${BUILDER_DIR}/config/bootloaders/grub-efi/theme.cfg" "$grub_dir/theme.cfg"
|
|
||||||
cp -R "${BUILDER_DIR}/config/bootloaders/grub-efi/live-theme/." "$grub_dir/live-theme/"
|
|
||||||
write_canonical_grub_cfg "$grub_cfg" "$grub_kernel" "${live_build_append:-$grub_append}" "$grub_initrd"
|
write_canonical_grub_cfg "$grub_cfg" "$grub_kernel" "${live_build_append:-$grub_append}" "$grub_initrd"
|
||||||
echo "bootloader sync: rewrote binary/boot/grub/grub.cfg with canonical EASY-BEE menu"
|
echo "bootloader sync: rewrote binary/boot/grub/grub.cfg with canonical EASY-BEE menu"
|
||||||
else
|
else
|
||||||
@@ -852,60 +856,6 @@ enforce_live_build_bootloader_assets() {
|
|||||||
echo "bootloader sync: WARNING: could not extract live entry from $isolinux_cfg" >&2
|
echo "bootloader sync: WARNING: could not extract live entry from $isolinux_cfg" >&2
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
sync_efi_grub_theme_assets "$lb_dir"
|
|
||||||
}
|
|
||||||
|
|
||||||
fat_image_has_file() {
|
|
||||||
img="$1"
|
|
||||||
path="$2"
|
|
||||||
mtype -i "$img" "$path" >/dev/null 2>&1
|
|
||||||
}
|
|
||||||
|
|
||||||
is_efi_grub_fat_image() {
|
|
||||||
img="$1"
|
|
||||||
fat_image_has_file "$img" "::/EFI/boot/grubx64.efi" || return 1
|
|
||||||
fat_image_has_file "$img" "::/boot/grub/grub.cfg" || return 1
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|
||||||
copy_file_to_fat_image() {
|
|
||||||
img="$1"
|
|
||||||
src="$2"
|
|
||||||
dst="$3"
|
|
||||||
mcopy -o -i "$img" "$src" "$dst" >/dev/null
|
|
||||||
}
|
|
||||||
|
|
||||||
sync_efi_grub_theme_assets() {
|
|
||||||
lb_dir="$1"
|
|
||||||
found=0
|
|
||||||
|
|
||||||
for img in $(find "$lb_dir" -type f \( -name '*.img' -o -name '*.ima' -o -name '*.fat' \) 2>/dev/null); do
|
|
||||||
if ! is_efi_grub_fat_image "$img"; then
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
found=1
|
|
||||||
echo "bootloader sync: patching EFI GRUB image $img"
|
|
||||||
mmd -i "$img" "::/boot" >/dev/null 2>&1 || true
|
|
||||||
mmd -i "$img" "::/boot/grub" >/dev/null 2>&1 || true
|
|
||||||
|
|
||||||
copy_file_to_fat_image "$img" "${BUILDER_DIR}/config/bootloaders/grub-efi/config.cfg" "::/boot/grub/config.cfg"
|
|
||||||
copy_file_to_fat_image "$img" "${BUILDER_DIR}/config/bootloaders/grub-efi/theme.cfg" "::/boot/grub/theme.cfg"
|
|
||||||
|
|
||||||
fat_image_has_file "$img" "::/boot/grub/config.cfg" || {
|
|
||||||
echo "ERROR: EFI GRUB image missing /boot/grub/config.cfg after sync: $img" >&2
|
|
||||||
exit 1
|
|
||||||
}
|
|
||||||
fat_image_has_file "$img" "::/boot/grub/theme.cfg" || {
|
|
||||||
echo "ERROR: EFI GRUB image missing /boot/grub/theme.cfg after sync: $img" >&2
|
|
||||||
exit 1
|
|
||||||
}
|
|
||||||
done
|
|
||||||
|
|
||||||
if [ "$found" != "1" ]; then
|
|
||||||
echo "ERROR: no EFI GRUB FAT image found in live-build workdir; cannot sync theme assets" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
}
|
}
|
||||||
|
|
||||||
copy_memtest_from_deb() {
|
copy_memtest_from_deb() {
|
||||||
@@ -944,8 +894,11 @@ FULL_BUILD_MARKER="${BUILD_WORK_DIR}/.bee-full-build-marker"
|
|||||||
# hooks, archives, Dockerfile, auto/config) require a full lb build.
|
# hooks, archives, Dockerfile, auto/config) require a full lb build.
|
||||||
needs_full_build() {
|
needs_full_build() {
|
||||||
[ -f "${FULL_BUILD_MARKER}" ] || return 0
|
[ -f "${FULL_BUILD_MARKER}" ] || return 0
|
||||||
[ -f "${BUILD_WORK_DIR}/binary/live/filesystem.squashfs" ] || return 0
|
|
||||||
[ -f "${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso" ] || return 0
|
[ -f "${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso" ] || return 0
|
||||||
|
# Accept any versioned squashfs (filesystem-v*.squashfs or legacy filesystem.squashfs)
|
||||||
|
_any_sq=$(find "${BUILD_WORK_DIR}/binary/live" -maxdepth 1 \
|
||||||
|
-name 'filesystem*.squashfs' 2>/dev/null | head -1)
|
||||||
|
[ -n "$_any_sq" ] || return 0
|
||||||
|
|
||||||
_heavy=$(find \
|
_heavy=$(find \
|
||||||
"${BUILDER_DIR}/VERSIONS" \
|
"${BUILDER_DIR}/VERSIONS" \
|
||||||
@@ -968,40 +921,109 @@ needs_full_build() {
|
|||||||
# Fast-path: unsquash existing filesystem, rsync overlay on top, repack.
|
# Fast-path: unsquash existing filesystem, rsync overlay on top, repack.
|
||||||
# Requires ~10 GB free in BEE_CACHE_DIR for the unpacked squashfs.
|
# Requires ~10 GB free in BEE_CACHE_DIR for the unpacked squashfs.
|
||||||
fast_path_repack_squashfs() {
|
fast_path_repack_squashfs() {
|
||||||
_sq="${BUILD_WORK_DIR}/binary/live/filesystem.squashfs"
|
_old_sq=$(find "${BUILD_WORK_DIR}/binary/live" -maxdepth 1 \
|
||||||
|
-name 'filesystem*.squashfs' | sort | head -1)
|
||||||
|
_sq="${BUILD_WORK_DIR}/binary/live/${SQUASHFS_FILENAME}"
|
||||||
_tmp="${BEE_CACHE_DIR}/fast-unsquash-${BUILD_VARIANT}"
|
_tmp="${BEE_CACHE_DIR}/fast-unsquash-${BUILD_VARIANT}"
|
||||||
echo "=== fast-path: unsquash ($(du -sh "$_sq" | cut -f1) compressed) ==="
|
echo "=== fast-path: unsquash $(basename "$_old_sq") ($(du -sh "$_old_sq" | cut -f1) compressed) ==="
|
||||||
rm -rf "$_tmp"
|
rm -rf "$_tmp"
|
||||||
unsquashfs -d "$_tmp" "$_sq"
|
unsquashfs -d "$_tmp" "$_old_sq"
|
||||||
echo "=== fast-path: syncing overlay stage ==="
|
echo "=== fast-path: syncing overlay stage ==="
|
||||||
rsync -a --checksum "${OVERLAY_STAGE_DIR}/" "$_tmp/"
|
rsync -a --checksum "${OVERLAY_STAGE_DIR}/" "$_tmp/"
|
||||||
echo "=== fast-path: repacking squashfs ==="
|
echo "=== fast-path: repacking as ${SQUASHFS_FILENAME} ==="
|
||||||
_sq_new="${_sq}.new"
|
_sq_new="${_sq}.new"
|
||||||
rm -f "$_sq_new"
|
rm -f "$_sq_new"
|
||||||
mksquashfs "$_tmp" "$_sq_new" -comp zstd -b 1048576 -noappend -no-progress
|
mksquashfs "$_tmp" "$_sq_new" -comp zstd -b 1048576 -noappend -no-progress -no-xattrs
|
||||||
mv "$_sq_new" "$_sq"
|
mv "$_sq_new" "$_sq"
|
||||||
rm -rf "$_tmp"
|
rm -rf "$_tmp"
|
||||||
|
[ "$_old_sq" != "$_sq" ] && rm -f "$_old_sq"
|
||||||
echo "=== fast-path: squashfs repacked ($(du -sh "$_sq" | cut -f1)) ==="
|
echo "=== fast-path: squashfs repacked ($(du -sh "$_sq" | cut -f1)) ==="
|
||||||
}
|
}
|
||||||
|
|
||||||
# Fast-path: rebuild ISO by replacing only live/filesystem.squashfs via xorriso.
|
# Fast-path: rebuild ISO replacing the squashfs via xorriso.
|
||||||
# Boot structure (El Torito, EFI, MBR hybrid) is replayed from the prior ISO.
|
# Boot structure (El Torito, EFI, MBR hybrid) is replayed from the prior ISO.
|
||||||
fast_path_rebuild_iso() {
|
fast_path_rebuild_iso() {
|
||||||
_sq="${BUILD_WORK_DIR}/binary/live/filesystem.squashfs"
|
_sq="${BUILD_WORK_DIR}/binary/live/${SQUASHFS_FILENAME}"
|
||||||
_prior="${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso"
|
_prior="${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso"
|
||||||
_new="${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso.new"
|
_new="${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso.new"
|
||||||
echo "=== fast-path: rebuilding ISO with xorriso ==="
|
echo "=== fast-path: rebuilding ISO with xorriso ==="
|
||||||
rm -f "$_new"
|
rm -f "$_new"
|
||||||
|
# Remove any old squashfs entries from the prior ISO before adding the new one
|
||||||
|
_old_entries=$(xorriso -indev "$_prior" -find /live -name 'filesystem*.squashfs' -- 2>/dev/null \
|
||||||
|
| grep -E '^/live/filesystem.*\.squashfs$' || true)
|
||||||
|
_rm_args=""
|
||||||
|
for _e in $_old_entries; do
|
||||||
|
_rm_args="$_rm_args -rm $_e --"
|
||||||
|
done
|
||||||
|
# shellcheck disable=SC2086
|
||||||
xorriso \
|
xorriso \
|
||||||
-indev "$_prior" \
|
-indev "$_prior" \
|
||||||
-outdev "$_new" \
|
-outdev "$_new" \
|
||||||
-map "$_sq" /live/filesystem.squashfs \
|
${_rm_args} \
|
||||||
|
-map "$_sq" /live/${SQUASHFS_FILENAME} \
|
||||||
-boot_image any replay \
|
-boot_image any replay \
|
||||||
-commit
|
-commit
|
||||||
mv "$_new" "$_prior"
|
mv "$_new" "$_prior"
|
||||||
echo "=== fast-path: ISO rebuilt ==="
|
echo "=== fast-path: ISO rebuilt ==="
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# dir_has_entries DIR
# Returns 0 when DIR is a directory containing at least one entry, and 1 when
# DIR is not a directory or is empty. find stops after the first entry it
# prints, and its error output is discarded.
dir_has_entries() {
    _dir="$1"
    if [ ! -d "$_dir" ]; then
        return 1
    fi
    [ -n "$(find "$_dir" -mindepth 1 -print -quit 2>/dev/null)" ]
}
|
||||||
|
|
||||||
|
# move_tree_to_layer SRC_ROOT REL DST_ROOT
# Moves SRC_ROOT/REL to DST_ROOT/REL, creating the destination's parent
# directory first. Returns 0 without touching anything when SRC_ROOT/REL does
# not exist.
move_tree_to_layer() {
    _src_root="$1"
    _rel="$2"
    _dst_root="$3"
    if [ ! -e "${_src_root}/${_rel}" ]; then
        return 0
    fi
    mkdir -p "$(dirname "${_dst_root}/${_rel}")"
    mv "${_src_root}/${_rel}" "${_dst_root}/${_rel}"
}
|
||||||
|
|
||||||
|
# split_live_squashfs_layers LB_DIR
# Splits LB_DIR/binary/live/filesystem.squashfs into up to three images in the
# same directory:
#   filesystem.squashfs    everything that is not moved out below
#   10-usr.squashfs        the /usr tree (written only when non-empty)
#   20-firmware.squashfs   usr/lib/firmware, lib/firmware and boot/firmware
#                          (written only when non-empty)
# Returns 0 without doing anything when the base squashfs is missing or when
# unsquashfs/mksquashfs are not installed.
split_live_squashfs_layers() {
    lb_dir="$1"
    live_dir="${lb_dir}/binary/live"
    base_sq="${live_dir}/filesystem.squashfs"
    usr_sq="${live_dir}/10-usr.squashfs"
    fw_sq="${live_dir}/20-firmware.squashfs"

    [ -f "$base_sq" ] || return 0
    command -v unsquashfs >/dev/null 2>&1 || return 0
    command -v mksquashfs >/dev/null 2>&1 || return 0

    tmp_root="$(mktemp -d)"
    tmp_usr="$(mktemp -d)"
    tmp_fw="$(mktemp -d)"

    echo "=== splitting live squashfs into smaller layers ==="
    unsquashfs -d "$tmp_root/root" "$base_sq" >/dev/null
    mkdir -p "$tmp_usr/root" "$tmp_fw/root"

    # Firmware trees must be moved before usr: once usr has left the base
    # tree, usr/lib/firmware no longer exists there and its move is a no-op.
    # usr/lib/firmware goes first; presumably /lib is a symlink into usr/lib
    # on merged-/usr images, in which case lib/firmware is then already gone
    # and is skipped (TODO confirm against the target image layout).
    move_tree_to_layer "$tmp_root/root" "usr/lib/firmware" "$tmp_fw/root"
    move_tree_to_layer "$tmp_root/root" "lib/firmware" "$tmp_fw/root"
    move_tree_to_layer "$tmp_root/root" "boot/firmware" "$tmp_fw/root"
    move_tree_to_layer "$tmp_root/root" "usr" "$tmp_usr/root"

    # Drop stale layer images so an empty layer does not leave an old file.
    rm -f "$usr_sq" "$fw_sq"

    # Each image is written to *.new and renamed into place afterwards.
    mksquashfs "$tmp_root/root" "${base_sq}.new" -comp zstd -b 1048576 -noappend -no-progress -no-xattrs >/dev/null
    mv "${base_sq}.new" "$base_sq"

    if dir_has_entries "$tmp_usr/root"; then
        mksquashfs "$tmp_usr/root" "${usr_sq}.new" -comp zstd -b 1048576 -noappend -no-progress -no-xattrs >/dev/null
        mv "${usr_sq}.new" "$usr_sq"
    fi
    if dir_has_entries "$tmp_fw/root"; then
        mksquashfs "$tmp_fw/root" "${fw_sq}.new" -comp zstd -b 1048576 -noappend -no-progress -no-xattrs >/dev/null
        mv "${fw_sq}.new" "$fw_sq"
    fi

    echo "=== live squashfs layers ==="
    find "$live_dir" -maxdepth 1 -type f -name '*.squashfs' -exec du -sh {} \; | sort
    rm -rf "$tmp_root" "$tmp_usr" "$tmp_fw"
}
|
||||||
|
|
||||||
recover_iso_memtest() {
|
recover_iso_memtest() {
|
||||||
lb_dir="$1"
|
lb_dir="$1"
|
||||||
iso_path="$2"
|
iso_path="$2"
|
||||||
@@ -1080,9 +1102,10 @@ recover_iso_memtest() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
PROJECT_VERSION_EFFECTIVE="$(resolve_project_version)"
|
PROJECT_VERSION_EFFECTIVE="$(resolve_project_version)"
|
||||||
|
SQUASHFS_FILENAME="filesystem-v${PROJECT_VERSION_EFFECTIVE}.squashfs"
|
||||||
ISO_BASENAME="easy-bee-${BUILD_VARIANT}-v${PROJECT_VERSION_EFFECTIVE}-amd64"
|
ISO_BASENAME="easy-bee-${BUILD_VARIANT}-v${PROJECT_VERSION_EFFECTIVE}-amd64"
|
||||||
# Versioned output directory: dist/easy-bee-v4.1/ — all final artefacts live here.
|
# Versioned output directory: dist/release/easy-bee-v4.1/ — all final artefacts live here.
|
||||||
OUT_DIR="${DIST_DIR}/easy-bee-v${PROJECT_VERSION_EFFECTIVE}"
|
OUT_DIR="${DIST_DIR}/release/easy-bee-v${PROJECT_VERSION_EFFECTIVE}"
|
||||||
ISO_VERSION_LABEL_TOKEN="$(printf '%s' "${PROJECT_VERSION_EFFECTIVE}" | tr '[:lower:].-' '[:upper:]__')"
|
ISO_VERSION_LABEL_TOKEN="$(printf '%s' "${PROJECT_VERSION_EFFECTIVE}" | tr '[:lower:].-' '[:upper:]__')"
|
||||||
mkdir -p "${OUT_DIR}"
|
mkdir -p "${OUT_DIR}"
|
||||||
LOG_DIR="${OUT_DIR}/${ISO_BASENAME}.logs"
|
LOG_DIR="${OUT_DIR}/${ISO_BASENAME}.logs"
|
||||||
@@ -1267,7 +1290,7 @@ run_step "sync git submodules" "05-git-submodules" \
|
|||||||
|
|
||||||
# --- compile bee binary (static, Linux amd64) ---
|
# --- compile bee binary (static, Linux amd64) ---
|
||||||
# Shared between variants — built once, reused on second pass.
|
# Shared between variants — built once, reused on second pass.
|
||||||
BEE_BIN="${DIST_DIR}/bee-linux-amd64"
|
BEE_BIN="${DIST_DIR}/cache/bee-linux-amd64"
|
||||||
NEED_BUILD=1
|
NEED_BUILD=1
|
||||||
if [ -f "$BEE_BIN" ]; then
|
if [ -f "$BEE_BIN" ]; then
|
||||||
NEWEST_SRC=$(find "${REPO_ROOT}/audit" -name '*.go' -newer "$BEE_BIN" | head -1)
|
NEWEST_SRC=$(find "${REPO_ROOT}/audit" -name '*.go' -newer "$BEE_BIN" | head -1)
|
||||||
@@ -1298,16 +1321,16 @@ else
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# --- NVIDIA-only build steps ---
|
# --- NVIDIA-only build steps ---
|
||||||
GPU_BURN_WORKER_BIN="${DIST_DIR}/bee-gpu-burn-worker-linux-amd64"
|
GPU_BURN_WORKER_BIN="${DIST_DIR}/cache/bee-gpu-burn-worker-linux-amd64"
|
||||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
run_step "download cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace" "20-cublas" \
|
run_step "download cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace" "20-cublas" \
|
||||||
sh "${BUILDER_DIR}/build-cublas.sh" \
|
sh "${BUILDER_DIR}/build-cublas.sh" \
|
||||||
"${CUBLAS_VERSION}" \
|
"${CUBLAS_VERSION}" \
|
||||||
"${CUDA_USERSPACE_VERSION}" \
|
"${CUDA_USERSPACE_VERSION}" \
|
||||||
"${NCCL_CUDA_VERSION}" \
|
"${NCCL_CUDA_VERSION}" \
|
||||||
"${DIST_DIR}"
|
"${DIST_DIR}/cache"
|
||||||
|
|
||||||
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
CUBLAS_CACHE="${DIST_DIR}/cache/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||||
|
|
||||||
echo "=== bee-gpu-burn FP4 header probe ==="
|
echo "=== bee-gpu-burn FP4 header probe ==="
|
||||||
fp4_type_match="$(grep -Rsnm 1 'CUDA_R_4F_E2M1' "${CUBLAS_CACHE}/include" 2>/dev/null || true)"
|
fp4_type_match="$(grep -Rsnm 1 'CUDA_R_4F_E2M1' "${CUBLAS_CACHE}/include" 2>/dev/null || true)"
|
||||||
@@ -1433,7 +1456,7 @@ fi
|
|||||||
|
|
||||||
# --- copy bee binary into overlay ---
|
# --- copy bee binary into overlay ---
|
||||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin"
|
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin"
|
||||||
cp "${DIST_DIR}/bee-linux-amd64" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
cp "$BEE_BIN" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||||
|
|
||||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_BURN_WORKER_BIN" ]; then
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_BURN_WORKER_BIN" ]; then
|
||||||
@@ -1463,10 +1486,10 @@ done
|
|||||||
# --- NVIDIA kernel modules and userspace libs ---
|
# --- NVIDIA kernel modules and userspace libs ---
|
||||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
run_step "build NVIDIA ${NVIDIA_DRIVER_VERSION} modules" "40-nvidia-module" \
|
run_step "build NVIDIA ${NVIDIA_DRIVER_VERSION} modules" "40-nvidia-module" \
|
||||||
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}" "${BEE_NVIDIA_MODULE_FLAVOR}"
|
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}/cache" "${DEBIAN_KERNEL_ABI}" "${BEE_NVIDIA_MODULE_FLAVOR}"
|
||||||
|
|
||||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||||
NVIDIA_CACHE="${DIST_DIR}/nvidia-${BEE_NVIDIA_MODULE_FLAVOR}-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
NVIDIA_CACHE="${DIST_DIR}/cache/nvidia-${BEE_NVIDIA_MODULE_FLAVOR}-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
||||||
|
|
||||||
# Inject .ko files into overlay at /usr/local/lib/nvidia/
|
# Inject .ko files into overlay at /usr/local/lib/nvidia/
|
||||||
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
|
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
|
||||||
@@ -1492,9 +1515,9 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
|||||||
|
|
||||||
# --- build / download NCCL ---
|
# --- build / download NCCL ---
|
||||||
run_step "download NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}" "50-nccl" \
|
run_step "download NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}" "50-nccl" \
|
||||||
sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}" "${NCCL_SHA256:-}"
|
sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}/cache" "${NCCL_SHA256:-}"
|
||||||
|
|
||||||
NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
NCCL_CACHE="${DIST_DIR}/cache/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||||
|
|
||||||
# Inject libnccl.so.* into overlay alongside other NVIDIA userspace libs
|
# Inject libnccl.so.* into overlay alongside other NVIDIA userspace libs
|
||||||
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||||
@@ -1510,19 +1533,19 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
|||||||
"${NCCL_TESTS_VERSION}" \
|
"${NCCL_TESTS_VERSION}" \
|
||||||
"${NCCL_VERSION}" \
|
"${NCCL_VERSION}" \
|
||||||
"${NCCL_CUDA_VERSION}" \
|
"${NCCL_CUDA_VERSION}" \
|
||||||
"${DIST_DIR}" \
|
"${DIST_DIR}/cache" \
|
||||||
"${NVCC_VERSION}" \
|
"${NVCC_VERSION}" \
|
||||||
"${DEBIAN_VERSION}"
|
"${DEBIAN_VERSION}"
|
||||||
|
|
||||||
NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
|
NCCL_TESTS_CACHE="${DIST_DIR}/cache/nccl-tests-${NCCL_TESTS_VERSION}"
|
||||||
cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||||
cp "${NCCL_TESTS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
cp "${NCCL_TESTS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
||||||
echo "=== all_reduce_perf injected ==="
|
echo "=== all_reduce_perf injected ==="
|
||||||
|
|
||||||
run_step "build john jumbo ${JOHN_JUMBO_COMMIT}" "70-john" \
|
run_step "build john jumbo ${JOHN_JUMBO_COMMIT}" "70-john" \
|
||||||
sh "${BUILDER_DIR}/build-john.sh" "${JOHN_JUMBO_COMMIT}" "${DIST_DIR}"
|
sh "${BUILDER_DIR}/build-john.sh" "${JOHN_JUMBO_COMMIT}" "${DIST_DIR}/cache"
|
||||||
JOHN_CACHE="${DIST_DIR}/john-${JOHN_JUMBO_COMMIT}"
|
JOHN_CACHE="${DIST_DIR}/cache/john-${JOHN_JUMBO_COMMIT}"
|
||||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john"
|
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john"
|
||||||
rsync -a --delete "${JOHN_CACHE}/run/" "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john/run/"
|
rsync -a --delete "${JOHN_CACHE}/run/" "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john/run/"
|
||||||
ln -sfn ../lib/bee/john/run/john "${OVERLAY_STAGE_DIR}/usr/local/bin/john"
|
ln -sfn ../lib/bee/john/run/john "${OVERLAY_STAGE_DIR}/usr/local/bin/john"
|
||||||
@@ -1650,7 +1673,7 @@ if ! needs_full_build; then
|
|||||||
fast_path_rebuild_iso
|
fast_path_rebuild_iso
|
||||||
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
|
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
|
||||||
validate_iso_live_boot_entries "$ISO_RAW"
|
validate_iso_live_boot_entries "$ISO_RAW"
|
||||||
validate_iso_grub_theme_assets "$ISO_RAW"
|
validate_iso_grub_assets "$ISO_RAW"
|
||||||
validate_iso_nvidia_runtime "$ISO_RAW"
|
validate_iso_nvidia_runtime "$ISO_RAW"
|
||||||
cp "$ISO_RAW" "$ISO_OUT"
|
cp "$ISO_RAW" "$ISO_OUT"
|
||||||
echo ""
|
echo ""
|
||||||
@@ -1665,16 +1688,30 @@ echo "=== building ISO (variant: ${BUILD_VARIANT}) ==="
|
|||||||
|
|
||||||
# Export for auto/config
|
# Export for auto/config
|
||||||
BEE_GPU_VENDOR_UPPER="$(echo "${BUILD_VARIANT}" | tr 'a-z-' 'A-Z_')"
|
BEE_GPU_VENDOR_UPPER="$(echo "${BUILD_VARIANT}" | tr 'a-z-' 'A-Z_')"
|
||||||
BEE_ISO_VOLUME="EASY_BEE_${BEE_GPU_VENDOR_UPPER}_V${ISO_VERSION_LABEL_TOKEN}"
|
# ISO 9660 volume ID is limited to 32 characters; truncate the version token to fit.
|
||||||
|
_vol_prefix="EASY_BEE_${BEE_GPU_VENDOR_UPPER}_V"
|
||||||
|
_max_token=$(( 32 - ${#_vol_prefix} ))
|
||||||
|
_vol_token="$(printf '%s' "${ISO_VERSION_LABEL_TOKEN}" | cut -c1-${_max_token})"
|
||||||
|
BEE_ISO_VOLUME="${_vol_prefix}${_vol_token}"
|
||||||
|
unset _vol_prefix _max_token _vol_token
|
||||||
export BEE_GPU_VENDOR_UPPER BEE_ISO_VOLUME
|
export BEE_GPU_VENDOR_UPPER BEE_ISO_VOLUME
|
||||||
|
|
||||||
cd "${LB_DIR}"
|
cd "${LB_DIR}"
|
||||||
run_step_sh "live-build clean" "80-lb-clean" "lb clean --all 2>&1 | tail -3"
|
run_step_sh "live-build clean" "80-lb-clean" "lb clean --all 2>&1 | tail -3"
|
||||||
run_step_sh "live-build config" "81-lb-config" "lb config 2>&1 | tail -5"
|
run_step_sh "live-build config" "81-lb-config" "lb config 2>&1 | tail -5"
|
||||||
dump_memtest_debug "pre-build" "${LB_DIR}"
|
dump_memtest_debug "pre-build" "${LB_DIR}"
|
||||||
|
export MKSQUASHFS_OPTIONS="-no-xattrs"
|
||||||
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
|
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
|
||||||
echo "=== enforcing canonical bootloader assets ==="
|
echo "=== enforcing canonical bootloader assets ==="
|
||||||
enforce_live_build_bootloader_assets "${LB_DIR}"
|
enforce_live_build_bootloader_assets "${LB_DIR}"
|
||||||
|
# Rename lb's default filesystem.squashfs to the versioned filename so the
|
||||||
|
# ISO contains a version-stamped squashfs (e.g. filesystem-v10.15.squashfs).
|
||||||
|
_std_sq="${LB_DIR}/binary/live/filesystem.squashfs"
|
||||||
|
_ver_sq="${LB_DIR}/binary/live/${SQUASHFS_FILENAME}"
|
||||||
|
if [ -f "${_std_sq}" ] && [ "${_std_sq}" != "${_ver_sq}" ]; then
|
||||||
|
mv "${_std_sq}" "${_ver_sq}"
|
||||||
|
echo "=== squashfs renamed: filesystem.squashfs → ${SQUASHFS_FILENAME} ==="
|
||||||
|
fi
|
||||||
reset_live_build_stage "${LB_DIR}" "binary_checksums"
|
reset_live_build_stage "${LB_DIR}" "binary_checksums"
|
||||||
reset_live_build_stage "${LB_DIR}" "binary_iso"
|
reset_live_build_stage "${LB_DIR}" "binary_iso"
|
||||||
reset_live_build_stage "${LB_DIR}" "binary_zsync"
|
reset_live_build_stage "${LB_DIR}" "binary_zsync"
|
||||||
@@ -1706,7 +1743,7 @@ if [ -f "$ISO_RAW" ]; then
|
|||||||
fi
|
fi
|
||||||
validate_iso_memtest "$ISO_RAW"
|
validate_iso_memtest "$ISO_RAW"
|
||||||
validate_iso_live_boot_entries "$ISO_RAW"
|
validate_iso_live_boot_entries "$ISO_RAW"
|
||||||
validate_iso_grub_theme_assets "$ISO_RAW"
|
validate_iso_grub_assets "$ISO_RAW"
|
||||||
validate_iso_nvidia_runtime "$ISO_RAW"
|
validate_iso_nvidia_runtime "$ISO_RAW"
|
||||||
cp "$ISO_RAW" "$ISO_OUT"
|
cp "$ISO_RAW" "$ISO_OUT"
|
||||||
touch "${FULL_BUILD_MARKER}"
|
touch "${FULL_BUILD_MARKER}"
|
||||||
|
|||||||
@@ -1,5 +1,7 @@
|
|||||||
set default=1
|
set default=1
|
||||||
set timeout=10
|
set timeout=10
|
||||||
|
set color_normal=yellow/black
|
||||||
|
set color_highlight=white/brown
|
||||||
|
|
||||||
if [ x$feature_default_font_path = xy ] ; then
|
if [ x$feature_default_font_path = xy ] ; then
|
||||||
font=unicode
|
font=unicode
|
||||||
@@ -26,6 +28,3 @@ insmod gfxterm
|
|||||||
|
|
||||||
terminal_input console serial
|
terminal_input console serial
|
||||||
terminal_output gfxterm serial
|
terminal_output gfxterm serial
|
||||||
|
|
||||||
insmod tga
|
|
||||||
source /boot/grub/theme.cfg
|
|
||||||
|
|||||||
@@ -1,21 +1,26 @@
|
|||||||
source /boot/grub/config.cfg
|
source /boot/grub/config.cfg
|
||||||
|
|
||||||
menuentry "EASY-BEE" {
|
menuentry "EASY-BEE v@VERSION@" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE -- load to RAM (toram)" {
|
menuentry "EASY-BEE v@VERSION@ -- load to RAM (toram)" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE -- no GUI / no X11" {
|
menuentry "EASY-BEE v@VERSION@ -- no GUI / no X11" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.gui=off bee.nvidia.mode=gsp-off pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.gui=off bee.nvidia.mode=gsp-off pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
menuentry "*** WIPE ALL DISKS (irreversible!) ***" {
|
||||||
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.gui=off bee.wipe=all net.ifnames=0 biosdevname=0
|
||||||
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
|
||||||
if [ "${grub_platform}" = "efi" ]; then
|
if [ "${grub_platform}" = "efi" ]; then
|
||||||
menuentry "Memory Test (memtest86+)" {
|
menuentry "Memory Test (memtest86+)" {
|
||||||
chainloader /boot/memtest86+x64.efi
|
chainloader /boot/memtest86+x64.efi
|
||||||
|
|||||||
@@ -1,18 +1,18 @@
|
|||||||
label live-@FLAVOUR@-normal
|
label live-@FLAVOUR@-normal
|
||||||
menu label ^EASY-BEE
|
menu label ^EASY-BEE v@VERSION@
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
label live-@FLAVOUR@-toram
|
label live-@FLAVOUR@-toram
|
||||||
menu label EASY-BEE (^load to RAM)
|
menu label EASY-BEE v@VERSION@ (^load to RAM)
|
||||||
menu default
|
menu default
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
append @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
label live-@FLAVOUR@-console
|
label live-@FLAVOUR@-console
|
||||||
menu label EASY-BEE (^no GUI / no X11)
|
menu label EASY-BEE v@VERSION@ (^no GUI / no X11)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ nomodeset bee.gui=off bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
append @APPEND_LIVE@ nomodeset bee.gui=off bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
@@ -41,6 +41,12 @@ label live-@FLAVOUR@-failsafe
|
|||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||||
|
|
||||||
|
label wipe-disks
|
||||||
|
menu label *** WIPE ALL DISKS (irreversible!) ***
|
||||||
|
linux @LINUX@
|
||||||
|
initrd @INITRD@
|
||||||
|
append @APPEND_LIVE@ toram nomodeset bee.gui=off bee.wipe=all net.ifnames=0 biosdevname=0
|
||||||
|
|
||||||
label memtest
|
label memtest
|
||||||
menu label ^Memory Test (memtest86+)
|
menu label ^Memory Test (memtest86+)
|
||||||
linux /boot/memtest86+x64.bin
|
linux /boot/memtest86+x64.bin
|
||||||
|
|||||||
@@ -69,6 +69,7 @@ chmod +x /usr/local/bin/bee-boot-status 2>/dev/null || true
|
|||||||
chmod +x /usr/local/bin/bee-install 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-install 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-gui-gate 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-gui-gate 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-remount-medium 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-remount-medium 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-check-nvswitch 2>/dev/null || true
|
||||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
||||||
|
|||||||
+57
@@ -0,0 +1,57 @@
|
|||||||
|
#!/bin/sh
# 9012-wipe.hook.chroot
#
# Adds bee-initramfs-wipe to the initramfs so that selecting the
# "WIPE ALL DISKS" boot menu entry runs the wipe tool before squashfs
# is mounted — i.e. it works even when live boot fails.
#
# Two files are installed inside the chroot:
#   /etc/initramfs-tools/hooks/bee-wipe                  — copies binaries into initrd
#   /etc/initramfs-tools/scripts/local-premount/bee-wipe — runs at boot
#
# Afterwards the initramfs of the newest kernel under /lib/modules is rebuilt
# so that both files take effect.

set -e

HOOK_DIR="/etc/initramfs-tools/hooks"
SCRIPT_DIR="/etc/initramfs-tools/scripts/local-premount"

mkdir -p "${HOOK_DIR}" "${SCRIPT_DIR}"

# ── initramfs hook: copy binaries ────────────────────────────────────────────
# Quoted heredoc delimiter: the text is written verbatim, nothing is expanded
# while this hook runs.
cat > "${HOOK_DIR}/bee-wipe" << 'EOF'
#!/bin/sh
PREREQ=""
prereqs() { echo "$PREREQ"; }
case "$1" in prereqs) prereqs; exit 0 ;; esac

. /usr/share/initramfs-tools/hook-functions

# Block-device tools are copied only when they are present in the chroot.
for bin in lsblk blkid blkdiscard blockdev; do
    b=$(command -v "$bin" 2>/dev/null) && copy_exec "$b" /bin
done

[ -x /usr/sbin/nvme ] && copy_exec /usr/sbin/nvme /sbin

copy_exec /usr/local/bin/bee-initramfs-wipe /bin/bee-wipe
EOF

chmod +x "${HOOK_DIR}/bee-wipe"

# ── initramfs premount script: trigger on bee.wipe=all ───────────────────────
cat > "${SCRIPT_DIR}/bee-wipe" << 'EOF'
#!/bin/sh
PREREQ=""
prereqs() { echo "$PREREQ"; }
case "$1" in prereqs) prereqs; exit 0 ;; esac

# -F: match the parameter literally. As a regex the "." would match any
# character, and this switch starts an irreversible wipe.
grep -qwF 'bee.wipe=all' /proc/cmdline 2>/dev/null || exit 0
exec /bin/bee-wipe
EOF

chmod +x "${SCRIPT_DIR}/bee-wipe"

echo "9012-wipe: installed initramfs hook and premount script"

# Newest kernel by version sort; only this kernel's initramfs is rebuilt.
KVER=$(ls /lib/modules | sort -V | tail -1)
if [ -z "${KVER}" ]; then
    # Fail with a clear message instead of passing an empty -k argument.
    echo "9012-wipe: ERROR: no kernel found under /lib/modules" >&2
    exit 1
fi
echo "9012-wipe: rebuilding initramfs for kernel ${KVER}"
update-initramfs -u -k "${KVER}"
echo "9012-wipe: done"
|
||||||
@@ -0,0 +1,37 @@
|
|||||||
|
#!/usr/bin/env python3
# 9998-strip-xattrs.hook.chroot
#
# mksquashfs 4.5.1 (Debian bookworm) writes a non-INVALID xattr_id_table_start
# even with -no-xattrs when the source tree contains POSIX ACL xattrs set by
# dpkg/install-time.  Linux 6.1 squashfs driver then fails with
# "unable to read xattr id index table" and aborts the mount.
#
# Strip all xattrs from the live chroot before mksquashfs sees the tree so the
# resulting squashfs has SQUASHFS_INVALID_BLK in xattr_id_table_start.

import os


def strip(path):
    """Remove every extended attribute from ``path`` itself.

    Symlinks are not followed, so a link's own attributes are removed rather
    than its target's. Any OSError while listing or removing is ignored so a
    single unreadable entry does not abort the hook.

    Returns the number of attributes that were actually removed.
    """
    try:
        names = os.listxattr(path, follow_symlinks=False)
    except OSError:
        return 0
    count = 0
    for attr in names:
        try:
            os.removexattr(path, attr, follow_symlinks=False)
        except OSError:
            continue
        count += 1
    return count


def main(top='/'):
    """Strip xattrs from ``top`` and everything below it, then print a summary.

    Returns the number of entries that had at least one attribute removed,
    which is also the number reported in the summary line.
    """
    # The top directory is never yielded as a child name, so handle it here.
    removed = 1 if strip(top) else 0
    for root, dirs, files in os.walk(top, topdown=True, followlinks=False):
        # Every directory shows up once in its parent's `dirs`, so visiting
        # dirs + files covers each entry exactly once.
        for name in dirs + files:
            if strip(os.path.join(root, name)):
                removed += 1
    print(f"9998-strip-xattrs: removed xattrs from {removed} entries")
    return removed


# Guarded so the walk runs only when the hook is executed, not on import.
if __name__ == "__main__":
    main()
|
||||||
@@ -1,5 +1,6 @@
|
|||||||
# AMD GPU firmware
|
# AMD GPU firmware
|
||||||
firmware-amd-graphics
|
firmware-amd-graphics
|
||||||
|
nvtop
|
||||||
|
|
||||||
# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
|
# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
|
||||||
rocm-smi-lib=%%ROCM_SMI_VERSION%%
|
rocm-smi-lib=%%ROCM_SMI_VERSION%%
|
||||||
|
|||||||
@@ -5,6 +5,7 @@
|
|||||||
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
|
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
|
||||||
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
|
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
|
||||||
# explicitly.
|
# explicitly.
|
||||||
|
nvtop
|
||||||
nvidia-fabricmanager=%%NVIDIA_FABRICMANAGER_VERSION%%
|
nvidia-fabricmanager=%%NVIDIA_FABRICMANAGER_VERSION%%
|
||||||
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
|
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
|
||||||
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
|
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
|
||||||
|
|||||||
@@ -38,6 +38,7 @@ exfat-fuse
|
|||||||
ntfs-3g
|
ntfs-3g
|
||||||
|
|
||||||
# Utilities
|
# Utilities
|
||||||
|
infiniband-diags
|
||||||
bash
|
bash
|
||||||
procps
|
procps
|
||||||
lsof
|
lsof
|
||||||
@@ -46,7 +47,6 @@ less
|
|||||||
vim-tiny
|
vim-tiny
|
||||||
mc
|
mc
|
||||||
htop
|
htop
|
||||||
nvtop
|
|
||||||
sudo
|
sudo
|
||||||
zstd
|
zstd
|
||||||
mstflint
|
mstflint
|
||||||
|
|||||||
@@ -2,6 +2,8 @@
|
|||||||
Description=Bee: load NVIDIA kernel modules and create device nodes
|
Description=Bee: load NVIDIA kernel modules and create device nodes
|
||||||
After=local-fs.target udev.service bee-blackbox.service
|
After=local-fs.target udev.service bee-blackbox.service
|
||||||
Before=bee-audit.service
|
Before=bee-audit.service
|
||||||
|
# Skip silently if bee-nvidia-load is absent (non-nvidia builds).
|
||||||
|
ConditionPathExists=/usr/local/bin/bee-nvidia-load
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=oneshot
|
Type=oneshot
|
||||||
|
|||||||
@@ -0,0 +1,4 @@
|
|||||||
|
[Service]
|
||||||
|
# Skip fabricmanager on systems without NVSwitch hardware.
|
||||||
|
# ExecCondition exits 1-254 → unit is silently skipped (inactive, not failed).
|
||||||
|
ExecCondition=/usr/local/bin/bee-check-nvswitch
|
||||||
@@ -3,8 +3,14 @@
|
|||||||
# Shows live service status until all bee services are done or failed,
|
# Shows live service status until all bee services are done or failed,
|
||||||
# then exits so getty can show the login prompt.
|
# then exits so getty can show the login prompt.
|
||||||
|
|
||||||
CRITICAL="bee-preflight bee-nvidia bee-audit"
|
GPU_VENDOR="$(cat /etc/bee-gpu-vendor 2>/dev/null || echo nvidia)"
|
||||||
ALL="bee-sshsetup ssh bee-network bee-nvidia bee-preflight bee-audit bee-web"
|
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||||
|
CRITICAL="bee-preflight bee-nvidia bee-audit"
|
||||||
|
ALL="bee-sshsetup ssh bee-network bee-nvidia bee-preflight bee-audit bee-web"
|
||||||
|
else
|
||||||
|
CRITICAL="bee-preflight bee-audit"
|
||||||
|
ALL="bee-sshsetup ssh bee-network bee-preflight bee-audit bee-web"
|
||||||
|
fi
|
||||||
|
|
||||||
svc_state() { systemctl is-active "$1.service" 2>/dev/null || echo "inactive"; }
|
svc_state() { systemctl is-active "$1.service" 2>/dev/null || echo "inactive"; }
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,4 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# Exit 0 if NVSwitch hardware is detected; exit 1 to skip fabricmanager on non-NVSwitch systems.
|
||||||
|
# NVSwitch appears in lspci as vendor 10de, class 0680 (Bridge, Other).
|
||||||
|
lspci -Dn 2>/dev/null | awk '$2 == "0680:" && $3 ~ /^10de:/ { found=1; exit } END { exit(found ? 0 : 1) }'
|
||||||
Executable
+166
@@ -0,0 +1,166 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# bee-initramfs-wipe — interactive disk wipe running entirely in the initramfs.
|
||||||
|
# Triggered by bee.wipe=all on the kernel cmdline (via local-premount hook).
|
||||||
|
# Works before squashfs is mounted, so it runs even when live boot fails.
|
||||||
|
|
||||||
|
RED='\033[1;31m'
|
||||||
|
YEL='\033[1;33m'
|
||||||
|
GRN='\033[1;32m'
|
||||||
|
CYN='\033[1;36m'
|
||||||
|
NC='\033[0m'
|
||||||
|
|
||||||
|
p() { printf '%b\n' "$*"; }
|
||||||
|
pp() { printf '%b' "$*"; }
|
||||||
|
|
||||||
|
banner() {
|
||||||
|
p ""
|
||||||
|
p "${RED}╔══════════════════════════════════════════════════════════╗${NC}"
|
||||||
|
p "${RED}║ BEE DRIVE WIPE — initramfs stage ║${NC}"
|
||||||
|
p "${RED}╚══════════════════════════════════════════════════════════╝${NC}"
|
||||||
|
p ""
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── find boot device ─────────────────────────────────────────────────────────
|
||||||
|
# boot_dev — print the whole-disk device node that holds the live medium.
#
# The filesystem label is taken from the live-media-label=<label> token on
# the kernel command line and resolved to a device with `blkid -L`.
# Prints nothing when the token is absent or the label cannot be resolved.
boot_dev() {
    local label token dev
    for token in $(cat /proc/cmdline 2>/dev/null); do
        case "$token" in
            live-media-label=*) label="${token#*=}" ;;
        esac
    done
    [ -z "$label" ] && return

    dev=$(blkid -L "$label" 2>/dev/null) || return

    # Reduce a partition node to its parent disk WITHOUT touching names that
    # are already whole disks:
    #   /dev/nvme0n1p1 -> /dev/nvme0n1   ("p<N>" suffix after a digit)
    #   /dev/mmcblk0p2 -> /dev/mmcblk0
    #   /dev/sdb1      -> /dev/sdb       (sd/hd/vd/xvd: letters then digits)
    #   /dev/sdp1      -> /dev/sdp
    #   /dev/nvme0n1, /dev/mmcblk0, /dev/sdb -> unchanged
    # A blanket "strip trailing digits" would turn /dev/mmcblk0 into
    # /dev/mmcblk, so the boot medium would no longer be excluded from the
    # wipe list.
    echo "$dev" | sed -E \
        -e 's#^(.*[0-9])p[0-9]+$#\1#' \
        -e 's#^(/dev/(s|h|v|xv)d[a-z]+)[0-9]+$#\1#'
}
|
||||||
|
|
||||||
|
# ── enumerate candidate disks ─────────────────────────────────────────────────
|
||||||
|
list_disks() {
|
||||||
|
local boot
|
||||||
|
boot=$(boot_dev)
|
||||||
|
|
||||||
|
lsblk -d -n -o NAME,TYPE,SIZE,MODEL 2>/dev/null | while read -r name type size model; do
|
||||||
|
[ "$type" = "disk" ] || continue
|
||||||
|
[ "$size" = "0B" ] && continue
|
||||||
|
local dev="/dev/$name"
|
||||||
|
[ "$dev" = "$boot" ] && continue
|
||||||
|
printf '%s\t%s\t%s\n' "$dev" "$size" "${model:-}"
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── wipe one disk ─────────────────────────────────────────────────────────────
|
||||||
|
# wipe_one DEV — erase one whole-disk block device.
#
# Methods are tried in order; the first one that succeeds ends the function:
#   1. /dev/nvme* only: `nvme format --ses=1`.
#   2. `blkdiscard -f` on the whole device.
#   3. Fallback: zero the first 32 MiB and the last 32 MiB of the device
#      (primary and backup partition tables, filesystem superblocks).
# Buffers are flushed with `blockdev --flushbufs` after every method.
# dd and flush failures in the fallback path are tolerated (best effort).
wipe_one() {
    local dev="$1"
    p ""
    p "=== ${YEL}${dev}${NC} ==="

    if echo "$dev" | grep -q '^/dev/nvme'; then
        if nvme format --ses=1 "$dev" 2>&1; then
            p " ${GRN}nvme format OK${NC}"
            blockdev --flushbufs "$dev" 2>/dev/null || true
            return
        fi
        p " nvme format failed — falling back to blkdiscard"
    fi

    if blkdiscard -f "$dev" 2>&1; then
        p " ${GRN}blkdiscard OK${NC}"
        blockdev --flushbufs "$dev" 2>/dev/null || true
        return
    fi

    p " blkdiscard not supported — zeroing partition tables (HDD fallback)"
    local size_bytes total_sectors tail_sectors
    size_bytes=$(blockdev --getsize64 "$dev" 2>/dev/null || echo 0)
    total_sectors=$(( size_bytes / 512 ))
    tail_sectors=$(( 32 * 1024 * 1024 / 512 ))

    # First 32 MiB: MBR, primary GPT, filesystem superblocks.
    dd if=/dev/zero of="$dev" bs=4M count=8 conv=fsync status=progress 2>&1 || true

    # Last 32 MiB: backup GPT. The offset is computed in 512-byte sectors so
    # the write ends exactly at the end of the device; seeking in 4 MiB
    # blocks rounds the start down and leaves the final sectors (where the
    # backup GPT header lives) untouched on disks whose size is not a
    # multiple of 4 MiB.
    if [ "$total_sectors" -gt $(( tail_sectors * 2 )) ]; then
        dd if=/dev/zero of="$dev" bs=512 count="$tail_sectors" \
            seek=$(( total_sectors - tail_sectors )) conv=fsync status=progress 2>&1 || true
    fi

    blockdev --flushbufs "$dev" 2>/dev/null || true
    p " ${GRN}done (partition tables zeroed)${NC}"
}
|
||||||
|
|
||||||
|
# ── main ──────────────────────────────────────────────────────────────────────
|
||||||
|
banner
|
||||||
|
|
||||||
|
BOOT=$(boot_dev)
|
||||||
|
[ -n "$BOOT" ] && p "Boot device (excluded): ${CYN}${BOOT}${NC}\n"
|
||||||
|
|
||||||
|
# build indexed list
|
||||||
|
i=0
|
||||||
|
DEVS=""
|
||||||
|
IFS='
|
||||||
|
'
|
||||||
|
for line in $(list_disks); do
|
||||||
|
i=$(( i + 1 ))
|
||||||
|
dev=$(echo "$line" | cut -f1)
|
||||||
|
size=$(echo "$line" | cut -f2)
|
||||||
|
model=$(echo "$line" | cut -f3)
|
||||||
|
DEVS="${DEVS}${i}:${dev}:${size}:${model}
|
||||||
|
"
|
||||||
|
printf " ${CYN}[%d]${NC} %-16s %8s %s\n" "$i" "$dev" "$size" "$model"
|
||||||
|
done
|
||||||
|
IFS='
|
||||||
|
'
|
||||||
|
|
||||||
|
if [ "$i" -eq 0 ]; then
|
||||||
|
p "\nNo physical disks found (boot device excluded)."
|
||||||
|
p "Dropping to shell — type 'exit' to continue boot."
|
||||||
|
exec /bin/sh
|
||||||
|
fi
|
||||||
|
|
||||||
|
p ""
|
||||||
|
pp "Enter numbers to wipe (space-separated), ${YEL}all${NC} for all, ${YEL}q${NC} to abort: "
|
||||||
|
read -r SELECTION
|
||||||
|
|
||||||
|
case "$SELECTION" in
|
||||||
|
q|Q|'') p "\nAborted."; exec /bin/sh ;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
# resolve selection → list of devs
|
||||||
|
SELECTED=""
|
||||||
|
if [ "$SELECTION" = "all" ] || [ "$SELECTION" = "ALL" ]; then
|
||||||
|
SELECTED=$(echo "$DEVS" | grep -v '^$' | cut -d: -f2 | tr '\n' ' ')
|
||||||
|
else
|
||||||
|
for num in $SELECTION; do
|
||||||
|
match=$(echo "$DEVS" | grep "^${num}:" | cut -d: -f2)
|
||||||
|
if [ -z "$match" ]; then
|
||||||
|
p "${RED}Unknown index: ${num}${NC}"; exec /bin/sh
|
||||||
|
fi
|
||||||
|
SELECTED="${SELECTED}${match} "
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
|
||||||
|
SELECTED=$(echo "$SELECTED" | tr -s ' ' | sed 's/ $//')
|
||||||
|
|
||||||
|
p ""
|
||||||
|
p "Selected for wipe: ${YEL}${SELECTED}${NC}"
|
||||||
|
p "${RED}WARNING: This is IRREVERSIBLE. All data on the selected disks will be lost.${NC}"
|
||||||
|
p ""
|
||||||
|
pp "Type YES to confirm, anything else to abort: "
|
||||||
|
read -r CONFIRM
|
||||||
|
|
||||||
|
if [ "$CONFIRM" != "YES" ]; then
|
||||||
|
p "\nAborted — no disks were touched."
|
||||||
|
exec /bin/sh
|
||||||
|
fi
|
||||||
|
|
||||||
|
p "\nStarting wipe..."
|
||||||
|
for dev in $SELECTED; do
|
||||||
|
wipe_one "$dev"
|
||||||
|
done
|
||||||
|
|
||||||
|
sync
|
||||||
|
p ""
|
||||||
|
p "${GRN}=== All selected disks wiped and flushed. ===${NC}"
|
||||||
|
p ""
|
||||||
|
pp "Press Enter to reboot..."
|
||||||
|
read -r _
|
||||||
|
reboot
|
||||||
@@ -8,7 +8,7 @@
|
|||||||
# Layout (UEFI): GPT, /dev/sdX1=EFI 512MB vfat, /dev/sdX2=root ext4
|
# Layout (UEFI): GPT, /dev/sdX1=EFI 512MB vfat, /dev/sdX2=root ext4
|
||||||
# Layout (BIOS): MBR, /dev/sdX1=root ext4
|
# Layout (BIOS): MBR, /dev/sdX1=root ext4
|
||||||
#
|
#
|
||||||
# Squashfs source: /run/live/medium/live/filesystem.squashfs
|
# Squashfs sources: /run/live/medium/live/*.squashfs
|
||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
@@ -62,9 +62,9 @@ for tool in parted mkfs.vfat mkfs.ext4 unsquashfs grub-install update-grub; do
|
|||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
SQUASHFS="/run/live/medium/live/filesystem.squashfs"
|
mapfile -t SQUASHFS_FILES < <(find /run/live/medium/live -maxdepth 1 -type f -name '*.squashfs' | sort)
|
||||||
if [ ! -f "$SQUASHFS" ]; then
|
if [ "${#SQUASHFS_FILES[@]}" -eq 0 ]; then
|
||||||
echo "ERROR: squashfs not found at $SQUASHFS" >&2
|
echo "ERROR: no squashfs files found under /run/live/medium/live" >&2
|
||||||
echo " The live medium may have been disconnected." >&2
|
echo " The live medium may have been disconnected." >&2
|
||||||
echo " Reconnect the disc and run: bee-remount-medium --wait" >&2
|
echo " Reconnect the disc and run: bee-remount-medium --wait" >&2
|
||||||
echo " Then re-run bee-install." >&2
|
echo " Then re-run bee-install." >&2
|
||||||
@@ -106,7 +106,10 @@ log "=== BEE DISK INSTALLER ==="
|
|||||||
log "Target device : $DEVICE"
|
log "Target device : $DEVICE"
|
||||||
log "Root partition: $PART_ROOT"
|
log "Root partition: $PART_ROOT"
|
||||||
[ "$UEFI" = "1" ] && log "EFI partition : $PART_EFI"
|
[ "$UEFI" = "1" ] && log "EFI partition : $PART_EFI"
|
||||||
log "Squashfs : $SQUASHFS ($(du -sh "$SQUASHFS" | cut -f1))"
|
log "Squashfs : ${#SQUASHFS_FILES[@]} layer(s)"
|
||||||
|
for sf in "${SQUASHFS_FILES[@]}"; do
|
||||||
|
log " - $sf ($(du -sh "$sf" | cut -f1))"
|
||||||
|
done
|
||||||
log "Log : $LOGFILE"
|
log "Log : $LOGFILE"
|
||||||
log ""
|
log ""
|
||||||
|
|
||||||
@@ -163,7 +166,9 @@ log " Mounted."
|
|||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
log "--- Step 5/7: Unpacking filesystem (this takes 10-20 minutes) ---"
|
log "--- Step 5/7: Unpacking filesystem (this takes 10-20 minutes) ---"
|
||||||
log " Source: $SQUASHFS"
|
for sf in "${SQUASHFS_FILES[@]}"; do
|
||||||
|
log " Source: $sf"
|
||||||
|
done
|
||||||
log " Target: $MOUNT_ROOT"
|
log " Target: $MOUNT_ROOT"
|
||||||
|
|
||||||
# unsquashfs does not support resume, so retry the entire unpack step if the
|
# unsquashfs does not support resume, so retry the entire unpack step if the
|
||||||
@@ -177,9 +182,9 @@ while true; do
|
|||||||
fi
|
fi
|
||||||
[ "$UNPACK_ATTEMPTS" -gt 1 ] && log " Retry attempt $UNPACK_ATTEMPTS / $UNPACK_MAX ..."
|
[ "$UNPACK_ATTEMPTS" -gt 1 ] && log " Retry attempt $UNPACK_ATTEMPTS / $UNPACK_MAX ..."
|
||||||
|
|
||||||
# Re-check squashfs is reachable before each attempt
|
mapfile -t SQUASHFS_FILES < <(find /run/live/medium/live -maxdepth 1 -type f -name '*.squashfs' | sort)
|
||||||
if [ ! -f "$SQUASHFS" ]; then
|
if [ "${#SQUASHFS_FILES[@]}" -eq 0 ]; then
|
||||||
log " SOURCE LOST: $SQUASHFS not found."
|
log " SOURCE LOST: no squashfs files found under /run/live/medium/live."
|
||||||
log " Reconnect the disc and run 'bee-remount-medium --wait' in another terminal,"
|
log " Reconnect the disc and run 'bee-remount-medium --wait' in another terminal,"
|
||||||
log " then press Enter here to retry."
|
log " then press Enter here to retry."
|
||||||
read -r _
|
read -r _
|
||||||
@@ -194,12 +199,17 @@ while true; do
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
UNPACK_OK=0
|
UNPACK_OK=0
|
||||||
unsquashfs -f -d "$MOUNT_ROOT" "$SQUASHFS" 2>&1 | \
|
for sf in "${SQUASHFS_FILES[@]}"; do
|
||||||
grep -E '^\[|^inod|^created|^extract|^ERROR|failed' | \
|
log " Unpacking $(basename "$sf") ..."
|
||||||
while IFS= read -r line; do log " $line"; done || UNPACK_OK=$?
|
unsquashfs -f -d "$MOUNT_ROOT" "$sf" 2>&1 | \
|
||||||
|
grep -E '^\[|^inod|^created|^extract|^ERROR|failed' | \
|
||||||
|
while IFS= read -r line; do log " $line"; done || UNPACK_OK=$?
|
||||||
|
[ "$UNPACK_OK" -eq 0 ] || break
|
||||||
|
done
|
||||||
|
|
||||||
# Check squashfs is still reachable (gone = disc pulled during copy)
|
# Check squashfs is still reachable (gone = disc pulled during copy)
|
||||||
if [ ! -f "$SQUASHFS" ]; then
|
mapfile -t SQUASHFS_FILES < <(find /run/live/medium/live -maxdepth 1 -type f -name '*.squashfs' | sort)
|
||||||
|
if [ "${#SQUASHFS_FILES[@]}" -eq 0 ]; then
|
||||||
log " WARNING: source medium lost during unpack — will retry after remount."
|
log " WARNING: source medium lost during unpack — will retry after remount."
|
||||||
log " Run 'bee-remount-medium --wait' in another terminal, then press Enter."
|
log " Run 'bee-remount-medium --wait' in another terminal, then press Enter."
|
||||||
read -r _
|
read -r _
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
# bee-remount-medium — find and remount the live ISO medium to /run/live/medium
|
# bee-remount-medium — find and remount the live ISO medium to /run/live/medium
|
||||||
#
|
#
|
||||||
# Run this after reconnecting the ISO source disc (USB/CD) if the live medium
|
# Run this after reconnecting the ISO source disc (USB/CD) if the live medium
|
||||||
# was lost and /run/live/medium/live/filesystem.squashfs is missing.
|
# was lost and /run/live/medium/live/*.squashfs are missing.
|
||||||
#
|
#
|
||||||
# Usage: bee-remount-medium [--wait]
|
# Usage: bee-remount-medium [--wait]
|
||||||
# --wait keep retrying every 5 seconds until the medium is found (useful
|
# --wait keep retrying every 5 seconds until the medium is found (useful
|
||||||
@@ -11,7 +11,7 @@
|
|||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
MEDIUM_DIR="/run/live/medium"
|
MEDIUM_DIR="/run/live/medium"
|
||||||
SQUASHFS_REL="live/filesystem.squashfs"
|
SQUASHFS_GLOB="live/*.squashfs"
|
||||||
WAIT_MODE=0
|
WAIT_MODE=0
|
||||||
|
|
||||||
for arg in "$@"; do
|
for arg in "$@"; do
|
||||||
@@ -56,7 +56,7 @@ try_mount() {
|
|||||||
local tmpdir
|
local tmpdir
|
||||||
tmpdir=$(mktemp -d /tmp/bee-probe-XXXXXX)
|
tmpdir=$(mktemp -d /tmp/bee-probe-XXXXXX)
|
||||||
if mount -o ro "$dev" "$tmpdir" 2>/dev/null; then
|
if mount -o ro "$dev" "$tmpdir" 2>/dev/null; then
|
||||||
if [ -f "${tmpdir}/${SQUASHFS_REL}" ]; then
|
if find "${tmpdir}/live" -maxdepth 1 -type f -name '*.squashfs' 2>/dev/null | grep -q .; then
|
||||||
# Unmount probe mount and mount properly onto live path
|
# Unmount probe mount and mount properly onto live path
|
||||||
umount "$tmpdir" 2>/dev/null || true
|
umount "$tmpdir" 2>/dev/null || true
|
||||||
rmdir "$tmpdir" 2>/dev/null || true
|
rmdir "$tmpdir" 2>/dev/null || true
|
||||||
@@ -82,8 +82,9 @@ attempt() {
|
|||||||
for dev in $(find_candidates); do
|
for dev in $(find_candidates); do
|
||||||
log " Trying $dev ..."
|
log " Trying $dev ..."
|
||||||
if try_mount "$dev"; then
|
if try_mount "$dev"; then
|
||||||
local sq="${MEDIUM_DIR}/${SQUASHFS_REL}"
|
local count
|
||||||
log "SUCCESS: squashfs available at $sq ($(du -sh "$sq" | cut -f1))"
|
count=$(find "${MEDIUM_DIR}/live" -maxdepth 1 -type f -name '*.squashfs' 2>/dev/null | wc -l | tr -d ' ')
|
||||||
|
log "SUCCESS: ${count} squashfs layer(s) available under ${MEDIUM_DIR}/live"
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
@@ -100,5 +101,5 @@ if [ "$WAIT_MODE" = "1" ]; then
|
|||||||
sleep 5
|
sleep 5
|
||||||
done
|
done
|
||||||
else
|
else
|
||||||
attempt || die "No ISO medium with ${SQUASHFS_REL} found. Reconnect the disc and re-run, or use --wait."
|
attempt || die "No ISO medium with ${SQUASHFS_GLOB} found. Reconnect the disc and re-run, or use --wait."
|
||||||
fi
|
fi
|
||||||
|
|||||||
Executable
+132
@@ -0,0 +1,132 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# bee-wipe-disks — erase all physical disks (interactive, confirmation required)
|
||||||
|
#
|
||||||
|
# Triggered automatically when the kernel cmdline contains bee.wipe=all.
|
||||||
|
# Can also be run manually from a root shell.
|
||||||
|
#
|
||||||
|
# Wipe strategy:
|
||||||
|
# NVMe — nvme format --ses=1 (NVMe Format with user-data erase, fast)
|
||||||
|
# Other — blkdiscard -f (TRIM/UNMAP, fast on SSDs)
|
||||||
|
# dd if=/dev/zero (fallback for HDDs, zeros first+last 32 MB)
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
RED=$'\033[1;31m'
|
||||||
|
YEL=$'\033[1;33m'
|
||||||
|
GRN=$'\033[1;32m'
|
||||||
|
NC=$'\033[0m'
|
||||||
|
|
||||||
|
banner() {
|
||||||
|
echo ""
|
||||||
|
echo "${RED}╔══════════════════════════════════════════════════════════╗${NC}"
|
||||||
|
echo "${RED}║ BEE DISK WIPE — ALL DATA WILL BE DESTROYED ║${NC}"
|
||||||
|
echo "${RED}╚══════════════════════════════════════════════════════════╝${NC}"
|
||||||
|
echo ""
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── find boot device to skip ──────────────────────────────────────────────────
|
||||||
|
# live_dev — print the whole-disk device backing the live medium mounted at
# /run/live/medium, or nothing when that mount point has no source.
live_dev() {
    local src
    src=$(findmnt -n -o SOURCE /run/live/medium 2>/dev/null || true)
    [ -z "$src" ] && return

    # Reduce a partition node to its parent disk WITHOUT touching names that
    # are already whole disks:
    #   /dev/nvme0n1p1 -> /dev/nvme0n1   ("p<N>" suffix after a digit)
    #   /dev/mmcblk0p2 -> /dev/mmcblk0
    #   /dev/sdb1      -> /dev/sdb       (sd/hd/vd/xvd: letters then digits)
    #   /dev/sdp1      -> /dev/sdp
    #   /dev/nvme0n1, /dev/mmcblk0, /dev/sdb -> unchanged
    # A blanket "strip trailing digits" would turn /dev/mmcblk0 into
    # /dev/mmcblk, and the boot medium would then be listed for wiping.
    echo "$src" | sed -E \
        -e 's#^(.*[0-9])p[0-9]+$#\1#' \
        -e 's#^(/dev/(s|h|v|xv)d[a-z]+)[0-9]+$#\1#'
}
|
||||||
|
|
||||||
|
# ── enumerate target disks ────────────────────────────────────────────────────
|
||||||
|
find_disks() {
|
||||||
|
local boot_dev
|
||||||
|
boot_dev=$(live_dev)
|
||||||
|
|
||||||
|
lsblk -d -n -o NAME,TYPE,SIZE,MODEL | while read -r name type size model; do
|
||||||
|
[ "$type" = "disk" ] || continue
|
||||||
|
[ "$size" = "0B" ] && continue # empty virtual media
|
||||||
|
|
||||||
|
local dev="/dev/$name"
|
||||||
|
[ "$dev" = "$boot_dev" ] && continue # skip boot device
|
||||||
|
|
||||||
|
printf '%s\t%s\t%s\n' "$dev" "$size" "$model"
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── wipe one disk ─────────────────────────────────────────────────────────────
|
||||||
|
# wipe_disk DEV — erase one whole-disk block device.
#
# Methods are tried in order; the first one that succeeds ends the function:
#   1. /dev/nvme* only: `nvme format --ses=1` (user data erase).
#   2. `blkdiscard -f` on the whole device.
#   3. Fallback: zero the first 32 MiB and the last 32 MiB of the device
#      (primary and backup partition tables, filesystem superblocks).
# Buffers are flushed with `blockdev --flushbufs` after every method.
# The script runs under `set -euo pipefail`, so every best-effort command
# here is guarded: a problem with one disk must not abort the wipe of the
# remaining disks.
wipe_disk() {
    local dev="$1"
    echo ""
    echo "=== ${YEL}${dev}${NC} ==="

    if echo "$dev" | grep -q '^/dev/nvme'; then
        # NVMe format (ses=1 = user data erase)
        if nvme format --ses=1 "$dev" 2>&1; then
            echo " ${GRN}nvme format OK${NC}"
            blockdev --flushbufs "$dev" 2>/dev/null || true
            return
        fi
        echo " nvme format failed, falling back to blkdiscard"
    fi

    if blkdiscard -f "$dev" 2>&1; then
        echo " ${GRN}blkdiscard OK${NC}"
        blockdev --flushbufs "$dev" 2>/dev/null || true
        return
    fi

    echo " blkdiscard not supported — zeroing partition tables (HDD fallback)"
    local size_bytes total_sectors tail_sectors
    # "|| echo 0": an unreadable size only skips the tail pass instead of
    # terminating the whole script via errexit.
    size_bytes=$(blockdev --getsize64 "$dev" 2>/dev/null || echo 0)
    total_sectors=$(( size_bytes / 512 ))
    tail_sectors=$(( 32 * 1024 * 1024 / 512 ))

    # Zero first 32 MB (MBR, GPT, filesystem superblocks)
    dd if=/dev/zero of="$dev" bs=4M count=8 conv=fsync status=progress 2>&1 || true

    # Zero last 32 MB (backup GPT). The offset is computed in 512-byte
    # sectors so the write ends exactly at the end of the device; seeking in
    # 4 MiB blocks rounds the start down and leaves the final sectors (where
    # the backup GPT header lives) untouched on disks whose size is not a
    # multiple of 4 MiB.
    if [ "$total_sectors" -gt $(( tail_sectors * 2 )) ]; then
        dd if=/dev/zero of="$dev" bs=512 count="$tail_sectors" \
            seek=$(( total_sectors - tail_sectors )) conv=fsync status=progress 2>&1 || true
    fi

    blockdev --flushbufs "$dev" 2>/dev/null || true
    echo " ${GRN}done (partition tables zeroed)${NC}"
}
|
||||||
|
|
||||||
|
# ── main ──────────────────────────────────────────────────────────────────────
|
||||||
|
banner
|
||||||
|
|
||||||
|
mapfile -t DISKS < <(find_disks | awk '{print $1}')
|
||||||
|
|
||||||
|
if [ ${#DISKS[@]} -eq 0 ]; then
|
||||||
|
echo "No physical disks found (boot device excluded)."
|
||||||
|
echo "Nothing to wipe."
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Disks to be ${RED}COMPLETELY ERASED${NC}:"
|
||||||
|
echo ""
|
||||||
|
find_disks | while IFS=$'\t' read -r dev size model; do
|
||||||
|
printf " ${YEL}%-16s${NC} %8s %s\n" "$dev" "$size" "$model"
|
||||||
|
done
|
||||||
|
echo ""
|
||||||
|
echo "${RED}WARNING: This is IRREVERSIBLE. All data on the listed disks will be lost.${NC}"
|
||||||
|
echo ""
|
||||||
|
printf "Type YES to confirm wipe, anything else to abort: "
|
||||||
|
read -r CONFIRM
|
||||||
|
|
||||||
|
if [ "$CONFIRM" != "YES" ]; then
|
||||||
|
echo ""
|
||||||
|
echo "Aborted — no disks were touched."
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "Starting wipe..."
|
||||||
|
|
||||||
|
for dev in "${DISKS[@]}"; do
|
||||||
|
wipe_disk "$dev"
|
||||||
|
done
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "${GRN}=== All disks wiped. ===${NC}"
|
||||||
|
echo ""
|
||||||
|
printf "Reboot now to return to the boot menu? [Y/n] "
|
||||||
|
read -r REBOOT
|
||||||
|
case "${REBOOT:-Y}" in
|
||||||
|
[Nn]*) echo "You can reboot manually when ready." ;;
|
||||||
|
*) echo "Rebooting..."; sleep 2; reboot ;;
|
||||||
|
esac
|
||||||
Executable
+125
@@ -0,0 +1,125 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# build.sh -- single entry point for ISO builds.
|
||||||
|
#
|
||||||
|
# Local build (default):
|
||||||
|
# sh scripts/build.sh
|
||||||
|
# sh scripts/build.sh --variant nvidia
|
||||||
|
# sh scripts/build.sh --clean-build
|
||||||
|
#
|
||||||
|
# Remote build (set BUILDER_HOST + BUILDER_USER in .env):
|
||||||
|
# sh scripts/build.sh
|
||||||
|
# sh scripts/build.sh --authorized-keys ~/.ssh/authorized_keys
|
||||||
|
#
|
||||||
|
# All flags are forwarded to build-in-container.sh.
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||||
|
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
|
||||||
|
|
||||||
|
ENV_FILE="${REPO_ROOT}/.env"
|
||||||
|
if [ -f "$ENV_FILE" ]; then
|
||||||
|
# shellcheck disable=SC1090
|
||||||
|
. "$ENV_FILE"
|
||||||
|
fi
|
||||||
|
|
||||||
|
BUILDER_HOST="${BUILDER_HOST:-}"
|
||||||
|
BUILDER_USER="${BUILDER_USER:-}"
|
||||||
|
|
||||||
|
# Cache lives inside the repo under dist/ (gitignored).
|
||||||
|
CACHE_DIR="${REPO_ROOT}/dist/cache"
|
||||||
|
|
||||||
|
# Forward all arguments as-is to the underlying build script.
|
||||||
|
EXTRA_ARGS="$*"
|
||||||
|
|
||||||
|
# ── Remote build ────────────────────────────────────────────────────────────
|
||||||
|
if [ -n "$BUILDER_HOST" ]; then
|
||||||
|
if [ -z "$BUILDER_USER" ]; then
|
||||||
|
echo "ERROR: BUILDER_USER not set. Set it in .env."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "=== bee builder (remote: ${BUILDER_USER}@${BUILDER_HOST}) ==="
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
cd "${REPO_ROOT}"
|
||||||
|
git fetch --quiet origin main
|
||||||
|
LOCAL=$(git rev-parse HEAD)
|
||||||
|
REMOTE=$(git rev-parse origin/main)
|
||||||
|
if [ "$LOCAL" != "$REMOTE" ]; then
|
||||||
|
echo "ERROR: local repo is not in sync with remote."
|
||||||
|
echo " local: $LOCAL"
|
||||||
|
echo " remote: $REMOTE"
|
||||||
|
echo ""
|
||||||
|
echo "Push or pull before building:"
|
||||||
|
echo " git push -- if you have unpushed commits"
|
||||||
|
echo " git pull -- if remote is ahead"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "repo: in sync with remote ($LOCAL)"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
ssh -o StrictHostKeyChecking=no "${BUILDER_USER}@${BUILDER_HOST}" /bin/sh <<ENDSSH
|
||||||
|
set -e
|
||||||
|
REPO="/home/${BUILDER_USER}/bee"
|
||||||
|
LOG=/tmp/bee-build.log
|
||||||
|
|
||||||
|
if [ ! -d "\$REPO/.git" ]; then
|
||||||
|
echo "--- cloning bee repo ---"
|
||||||
|
git clone https://git.mchus.pro/reanimator/bee.git "\$REPO"
|
||||||
|
fi
|
||||||
|
|
||||||
|
cd "\$REPO"
|
||||||
|
echo "--- pulling latest ---"
|
||||||
|
sudo git checkout -- .
|
||||||
|
git pull --ff-only
|
||||||
|
chmod +x iso/overlay/usr/local/bin/* 2>/dev/null || true
|
||||||
|
|
||||||
|
screen -S bee-build -X quit 2>/dev/null || true
|
||||||
|
|
||||||
|
echo "--- starting build in screen session (survives SSH disconnect) ---"
|
||||||
|
echo "--- log: \$LOG ---"
|
||||||
|
screen -dmS bee-build sh -c "sh iso/builder/build-in-container.sh --cache-dir \$REPO/dist/cache ${EXTRA_ARGS} > \$LOG 2>&1; echo \$? > /tmp/bee-build-exit"
|
||||||
|
|
||||||
|
echo "--- streaming build log (Ctrl+C safe -- build continues on VM) ---"
|
||||||
|
tail -n +1 -f "\$LOG" 2>/dev/null &
|
||||||
|
TAIL_PID=\$!
|
||||||
|
while screen -list 2>/dev/null | grep -q bee-build; do
|
||||||
|
sleep 2
|
||||||
|
done
|
||||||
|
sleep 1
|
||||||
|
kill \$TAIL_PID 2>/dev/null || true
|
||||||
|
|
||||||
|
tail -n 20 "\$LOG" 2>/dev/null || true
|
||||||
|
|
||||||
|
EXIT_CODE=\$(cat /tmp/bee-build-exit 2>/dev/null || echo 1)
|
||||||
|
exit \$EXIT_CODE
|
||||||
|
ENDSSH
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "=== downloading ISO ==="
|
||||||
|
LOCAL_ISO_DIR="${REPO_ROOT}/dist/release"
|
||||||
|
mkdir -p "${LOCAL_ISO_DIR}"
|
||||||
|
if command -v rsync >/dev/null 2>&1 && ssh -o StrictHostKeyChecking=no "${BUILDER_USER}@${BUILDER_HOST}" command -v rsync >/dev/null 2>&1; then
|
||||||
|
rsync -az --progress \
|
||||||
|
-e "ssh -o StrictHostKeyChecking=no" \
|
||||||
|
"${BUILDER_USER}@${BUILDER_HOST}:/home/${BUILDER_USER}/bee/dist/*.iso" \
|
||||||
|
"${LOCAL_ISO_DIR}/"
|
||||||
|
else
|
||||||
|
scp -o StrictHostKeyChecking=no \
|
||||||
|
"${BUILDER_USER}@${BUILDER_HOST}:/home/${BUILDER_USER}/bee/dist/*.iso" \
|
||||||
|
"${LOCAL_ISO_DIR}/"
|
||||||
|
fi
|
||||||
|
echo ""
|
||||||
|
echo "=== build complete ==="
|
||||||
|
echo "ISO saved to: ${LOCAL_ISO_DIR}/"
|
||||||
|
ls -lh "${LOCAL_ISO_DIR}/"*.iso 2>/dev/null || true
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ── Local build ─────────────────────────────────────────────────────────────
|
||||||
|
echo "=== bee builder (local) ==="
|
||||||
|
echo "cache: ${CACHE_DIR}"
|
||||||
|
echo ""
|
||||||
|
# shellcheck disable=SC2086
|
||||||
|
exec sh "${REPO_ROOT}/iso/builder/build-in-container.sh" --cache-dir "${CACHE_DIR}" $EXTRA_ARGS
|
||||||
+1
-1
@@ -48,7 +48,7 @@ echo "==> Сборка бинарника..."
|
|||||||
echo " OK: $(ls -lh "${LOCAL_BIN}" | awk '{print $5, $9}')"
|
echo " OK: $(ls -lh "${LOCAL_BIN}" | awk '{print $5, $9}')"
|
||||||
|
|
||||||
LOCAL_SHA="$(shasum -a 256 "${LOCAL_BIN}" | awk '{print $1}')"
|
LOCAL_SHA="$(shasum -a 256 "${LOCAL_BIN}" | awk '{print $1}')"
|
||||||
REMOTE_SHA="$("${SSH_CMD[@]}" "$REMOTE" "if [ -f '${REMOTE_BIN}' ] && command -v sha256sum >/dev/null 2>&1; then sha256sum '${REMOTE_BIN}' | awk '{print \\$1}'; fi" 2>/dev/null || true)"
|
REMOTE_SHA="$("${SSH_CMD[@]}" "$REMOTE" "if [ -f '${REMOTE_BIN}' ] && command -v sha256sum >/dev/null 2>&1; then sha256sum '${REMOTE_BIN}' | awk '{print \$1}'; fi" 2>/dev/null || true)"
|
||||||
if [[ -n "${REMOTE_SHA}" && "${LOCAL_SHA}" == "${REMOTE_SHA}" ]]; then
|
if [[ -n "${REMOTE_SHA}" && "${LOCAL_SHA}" == "${REMOTE_SHA}" ]]; then
|
||||||
echo "==> Бинарник не изменился (${LOCAL_SHA}); копирование и перезапуск сервисов пропущены."
|
echo "==> Бинарник не изменился (${LOCAL_SHA}); копирование и перезапуск сервисов пропущены."
|
||||||
exit 0
|
exit 0
|
||||||
|
|||||||
Reference in New Issue
Block a user