Compare commits

..

3 Commits
v9.7 ... main

Author SHA1 Message Date
Mikhail Chusavitin
58d6da0e4f Fix live task logs and SAT windows 2026-04-30 17:26:45 +03:00
Mikhail Chusavitin
7ce73e34a4 Add NVMe block format tool 2026-04-30 16:27:25 +03:00
Mikhail Chusavitin
8a21809ade Update chart submodule to v2.0 (hardware contract 2.10)
New in chart:
- event_logs and platform_config sections in viewer
- Storage columns: logical_block_size_bytes, physical_block_size_bytes,
  metadata_bytes_per_block
- Compact status/severity icons, severity filtering for event logs
- Fixed JS MIME type and base stylesheet

bee audit schema already has all required fields; no schema changes needed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-30 15:52:30 +03:00
11 changed files with 517 additions and 1 deletions

View File

@@ -125,6 +125,8 @@ func defaultTaskPriority(target string, params taskParams) int {
return taskPriorityInstall
case "install-to-ram":
return taskPriorityInstallToRAM
case "nvme-format":
return taskPriorityInstall
case "audit":
return taskPriorityAudit
case "nvidia-bench-perf", "nvidia-bench-power", "nvidia-bench-autotune":

View File

@@ -85,6 +85,27 @@ func TestHandleAPIBlackboxStatusReturnsPersistedState(t *testing.T) {
}
}
func TestParseNVMeFormatModes(t *testing.T) {
raw := `
lbaf 0 : ms:0 lbads:9 rp:0x2 (in use)
lbaf 1 : ms:8 lbads:9 rp:0x1
lbaf 2 : ms:0 lbads:12 rp:0
`
modes := parseNVMeFormatModes(raw)
if len(modes) != 3 {
t.Fatalf("modes=%#v want 3 modes", modes)
}
if modes[0].Mode != 0 || modes[0].DataBytes != 512 || modes[0].MetadataBytes != 0 || !modes[0].InUse {
t.Fatalf("mode 0=%#v", modes[0])
}
if modes[1].Label != "MODE 1 (512+8)" {
t.Fatalf("mode 1 label=%q", modes[1].Label)
}
if modes[2].DataBytes != 4096 || modes[2].MetadataBytes != 0 {
t.Fatalf("mode 2=%#v", modes[2])
}
}
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
globalQueue.mu.Lock()
originalTasks := globalQueue.tasks

View File

@@ -91,6 +91,7 @@ func (j *jobState) writeLogLineLocked(line string) {
j.logBuf = bufio.NewWriterSize(f, 64*1024)
}
_, _ = j.logBuf.WriteString(line + "\n")
_ = j.logBuf.Flush()
}
// closeLog flushes and closes the log file. Called after all task output is done.

View File

@@ -0,0 +1,368 @@
package webui
import (
"context"
"encoding/json"
"fmt"
"net/http"
"os/exec"
"path/filepath"
"regexp"
"sort"
"strconv"
"strings"
"time"
)
type nvmeFormatMode struct {
Mode int `json:"mode"`
DataBytes int64 `json:"data_bytes"`
MetadataBytes int64 `json:"metadata_bytes"`
InUse bool `json:"in_use"`
Label string `json:"label"`
}
type nvmeFormatDisk struct {
Device string `json:"device"`
Model string `json:"model,omitempty"`
Serial string `json:"serial,omitempty"`
Size string `json:"size,omitempty"`
CurrentMode int `json:"current_mode"`
CurrentFormat string `json:"current_format"`
Modes []nvmeFormatMode `json:"modes"`
Error string `json:"error,omitempty"`
}
type nvmeListJSON struct {
Devices []struct {
DevicePath string `json:"DevicePath"`
ModelNumber string `json:"ModelNumber"`
SerialNumber string `json:"SerialNumber"`
PhysicalSize int64 `json:"PhysicalSize"`
} `json:"Devices"`
}
var (
nvmeFormatDeviceRE = regexp.MustCompile(`^/dev/nvme[0-9]+n[0-9]+$`)
nvmeLBAFCompactLineRE = regexp.MustCompile(`(?im)^\s*lbaf\s+(\d+)\s*:\s*ms:(\d+)\s+lbads:(\d+).*$`)
nvmeLBAFVerboseLineRE = regexp.MustCompile(`(?im)^\s*LBA Format\s+(\d+)\s*:\s*Metadata Size:\s*(\d+)\s+bytes\s*-\s*Data Size:\s*(\d+)\s+bytes.*$`)
nvmeCommandContext = exec.CommandContext
nvmeListFormatsTimeout = 20 * time.Second
)
func listNVMeFormatDisks(ctx context.Context) ([]nvmeFormatDisk, error) {
ctx, cancel := context.WithTimeout(ctx, nvmeListFormatsTimeout)
defer cancel()
out, err := nvmeCommandContext(ctx, "nvme", "list", "-o", "json").Output()
if err != nil {
return nil, err
}
var root nvmeListJSON
if err := json.Unmarshal(out, &root); err != nil {
return nil, err
}
disks := make([]nvmeFormatDisk, 0, len(root.Devices))
seen := map[string]struct{}{}
for _, dev := range root.Devices {
path := strings.TrimSpace(dev.DevicePath)
if !nvmeFormatDeviceRE.MatchString(path) {
continue
}
if _, ok := seen[path]; ok {
continue
}
seen[path] = struct{}{}
disk := nvmeFormatDisk{
Device: path,
Model: strings.TrimSpace(dev.ModelNumber),
Serial: strings.TrimSpace(dev.SerialNumber),
Size: formatNVMeBytes(dev.PhysicalSize),
CurrentMode: -1,
}
modes, parseErr := readNVMeFormatModes(ctx, path)
if parseErr != nil {
disk.Error = parseErr.Error()
}
disk.Modes = modes
for _, mode := range modes {
if mode.InUse {
disk.CurrentMode = mode.Mode
disk.CurrentFormat = formatNVMeBlock(mode.DataBytes, mode.MetadataBytes)
break
}
}
disks = append(disks, disk)
}
sort.Slice(disks, func(i, j int) bool { return disks[i].Device < disks[j].Device })
return disks, nil
}
func readNVMeFormatModes(ctx context.Context, device string) ([]nvmeFormatMode, error) {
if !nvmeFormatDeviceRE.MatchString(device) {
return nil, fmt.Errorf("invalid NVMe device")
}
out, err := nvmeCommandContext(ctx, "nvme", "id-ns", device, "-H").CombinedOutput()
if err != nil {
msg := strings.TrimSpace(string(out))
if msg == "" {
msg = err.Error()
}
return nil, fmt.Errorf("%s", msg)
}
modes := parseNVMeFormatModes(string(out))
if len(modes) == 0 {
return nil, fmt.Errorf("no LBA format modes found")
}
return modes, nil
}
func parseNVMeFormatModes(raw string) []nvmeFormatMode {
byMode := map[int]nvmeFormatMode{}
for _, m := range nvmeLBAFCompactLineRE.FindAllStringSubmatch(raw, -1) {
mode, errMode := strconv.Atoi(m[1])
metadata, errMS := strconv.ParseInt(m[2], 10, 64)
lbads, errLBADS := strconv.Atoi(m[3])
if errMode != nil || errMS != nil || errLBADS != nil || lbads < 0 || lbads >= 63 {
continue
}
data := int64(1) << lbads
line := m[0]
byMode[mode] = nvmeFormatMode{
Mode: mode,
DataBytes: data,
MetadataBytes: metadata,
InUse: strings.Contains(strings.ToLower(line), "in use"),
Label: fmt.Sprintf("MODE %d (%s)", mode, formatNVMeBlock(data, metadata)),
}
}
for _, m := range nvmeLBAFVerboseLineRE.FindAllStringSubmatch(raw, -1) {
mode, errMode := strconv.Atoi(m[1])
metadata, errMS := strconv.ParseInt(m[2], 10, 64)
data, errData := strconv.ParseInt(m[3], 10, 64)
if errMode != nil || errMS != nil || errData != nil || data <= 0 {
continue
}
line := m[0]
byMode[mode] = nvmeFormatMode{
Mode: mode,
DataBytes: data,
MetadataBytes: metadata,
InUse: strings.Contains(strings.ToLower(line), "in use"),
Label: fmt.Sprintf("MODE %d (%s)", mode, formatNVMeBlock(data, metadata)),
}
}
modes := make([]nvmeFormatMode, 0, len(byMode))
for _, mode := range byMode {
modes = append(modes, mode)
}
sort.Slice(modes, func(i, j int) bool { return modes[i].Mode < modes[j].Mode })
return modes
}
func runNVMeFormatTask(ctx context.Context, j *jobState, device string, lbaf int) error {
if !nvmeFormatDeviceRE.MatchString(device) {
return fmt.Errorf("invalid NVMe device")
}
modes, err := readNVMeFormatModes(ctx, device)
if err != nil {
return err
}
var selected nvmeFormatMode
found := false
for _, mode := range modes {
if mode.Mode == lbaf {
selected = mode
found = true
break
}
}
if !found {
return fmt.Errorf("MODE %d is not available on %s", lbaf, device)
}
ms := 0
if selected.MetadataBytes > 0 {
ms = 1
}
j.append(fmt.Sprintf("Formatting %s to %s with --lbaf=%d --ms=%d --force", device, formatNVMeBlock(selected.DataBytes, selected.MetadataBytes), selected.Mode, ms))
cmd := nvmeCommandContext(ctx, "nvme", "format", device, fmt.Sprintf("--lbaf=%d", selected.Mode), fmt.Sprintf("--ms=%d", ms), "--force")
return streamCmdJob(j, cmd)
}
func (h *handler) handleAPINVMeFormats(w http.ResponseWriter, r *http.Request) {
disks, err := listNVMeFormatDisks(r.Context())
if err != nil {
writeError(w, http.StatusInternalServerError, err.Error())
return
}
writeJSON(w, disks)
}
func (h *handler) handleAPINVMeFormatRun(w http.ResponseWriter, r *http.Request) {
var req struct {
Device string `json:"device"`
LBAF int `json:"lbaf"`
}
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
writeError(w, http.StatusBadRequest, "invalid request body")
return
}
if !nvmeFormatDeviceRE.MatchString(req.Device) {
writeError(w, http.StatusBadRequest, "invalid NVMe device")
return
}
disks, err := listNVMeFormatDisks(r.Context())
if err != nil {
writeError(w, http.StatusInternalServerError, err.Error())
return
}
var label string
allowed := false
for _, disk := range disks {
if disk.Device != req.Device {
continue
}
for _, mode := range disk.Modes {
if mode.Mode == req.LBAF {
allowed = true
label = mode.Label
break
}
}
}
if !allowed {
writeError(w, http.StatusBadRequest, "LBA format mode is not available for this device")
return
}
name := fmt.Sprintf("NVMe Format %s to %s", filepath.Base(req.Device), label)
t := &Task{
ID: newJobID("nvme-format"),
Name: name,
Target: "nvme-format",
Priority: defaultTaskPriority("nvme-format", taskParams{}),
Status: TaskPending,
CreatedAt: time.Now(),
params: taskParams{
Device: req.Device,
LBAF: req.LBAF,
},
}
globalQueue.enqueue(t)
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
}
func formatNVMeBlock(dataBytes, metadataBytes int64) string {
return strconv.FormatInt(dataBytes, 10) + "+" + strconv.FormatInt(metadataBytes, 10)
}
func formatNVMeBytes(n int64) string {
if n <= 0 {
return ""
}
units := []string{"B", "KB", "MB", "GB", "TB", "PB"}
v := float64(n)
unit := 0
for v >= 1000 && unit < len(units)-1 {
v /= 1000
unit++
}
if unit == 0 {
return fmt.Sprintf("%d B", n)
}
return fmt.Sprintf("%.1f %s", v, units[unit])
}
func renderNVMeFormatInline() string {
return `<div id="nvme-format-status" style="font-size:13px;color:var(--muted);margin-bottom:12px">Loading NVMe disks...</div>
<div id="nvme-format-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
<script>
function nvmeFormatEsc(s) {
return String(s == null ? '' : s).replace(/[&<>"']/g, function(c) {
return {'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[c];
});
}
function loadNVMeFormats() {
var status = document.getElementById('nvme-format-status');
var table = document.getElementById('nvme-format-table');
status.textContent = 'Loading NVMe disks...';
status.style.color = 'var(--muted)';
table.innerHTML = '<p style="color:var(--muted);font-size:13px">Loading...</p>';
fetch('/api/tools/nvme-formats').then(function(r) { return r.json().then(function(d) { if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status)); return d; }); }).then(function(disks) {
window._nvmeFormatDisks = Array.isArray(disks) ? disks : [];
if (!window._nvmeFormatDisks.length) {
status.textContent = 'No NVMe disks found.';
table.innerHTML = '';
return;
}
status.textContent = window._nvmeFormatDisks.length + ' NVMe disk(s) found.';
var rows = window._nvmeFormatDisks.map(function(d, idx) {
var current = d.current_format ? (d.current_format + ' / MODE ' + d.current_mode) : 'unknown';
var detail = [d.model || '', d.serial || '', d.size || ''].filter(Boolean).join(' | ');
var options = (d.modes || []).map(function(m) {
return '<option value="' + m.mode + '"' + (m.in_use ? ' selected' : '') + '>' + nvmeFormatEsc(m.label) + '</option>';
}).join('');
var disabled = options ? '' : ' disabled';
var err = d.error ? '<div style="font-size:12px;color:var(--crit-fg,#9f3a38);margin-top:4px">' + nvmeFormatEsc(d.error) + '</div>' : '';
return '<tr>'
+ '<td style="font-family:monospace;white-space:nowrap">' + nvmeFormatEsc(d.device) + (detail ? '<div style="font-family:inherit;font-size:12px;color:var(--muted)">' + nvmeFormatEsc(detail) + '</div>' : '') + '</td>'
+ '<td style="white-space:nowrap">' + nvmeFormatEsc(current) + err + '</td>'
+ '<td style="white-space:nowrap"><select id="nvme-format-select-' + idx + '"' + disabled + '>' + options + '</select></td>'
+ '<td style="white-space:nowrap"><button class="btn btn-sm btn-primary" onclick="nvmeFormatRun(' + idx + ', this)"' + disabled + '>Apply</button><div class="nvme-format-row-msg" style="margin-top:6px;font-size:12px;color:var(--muted)"></div></td>'
+ '</tr>';
}).join('');
table.innerHTML = '<table><tr><th>Disk</th><th>Current block / mode</th><th>New mode</th><th>Action</th></tr>' + rows + '</table>';
}).catch(function(e) {
status.textContent = 'Error loading NVMe disks: ' + e.message;
status.style.color = 'var(--crit-fg,#9f3a38)';
table.innerHTML = '';
});
}
function nvmeWaitTaskDone(taskID, rowMsg) {
var timer = setInterval(function() {
fetch('/api/tasks').then(function(r) { return r.json(); }).then(function(tasks) {
var task = (tasks || []).find(function(t) { return t.id === taskID; });
if (!task) return;
if (task.status === 'done' || task.status === 'failed' || task.status === 'cancelled') {
clearInterval(timer);
rowMsg.textContent = 'Task ' + taskID + ': ' + task.status + (task.error ? ' - ' + task.error : '');
rowMsg.style.color = task.status === 'done' ? 'var(--ok,green)' : 'var(--crit-fg,#9f3a38)';
loadNVMeFormats();
}
}).catch(function(){});
}, 1500);
}
function nvmeFormatRun(idx, btn) {
var disk = (window._nvmeFormatDisks || [])[idx];
var select = document.getElementById('nvme-format-select-' + idx);
var row = btn.closest('td');
var rowMsg = row.querySelector('.nvme-format-row-msg');
if (!disk || !select) return;
var lbaf = parseInt(select.value, 10);
var mode = (disk.modes || []).find(function(m) { return m.mode === lbaf; });
if (!mode) return;
if (!window.confirm('Format ' + disk.device + ' to ' + mode.label + '? This erases data on the namespace.')) return;
btn.disabled = true;
rowMsg.style.color = 'var(--muted)';
rowMsg.textContent = 'Queued...';
fetch('/api/tools/nvme-format/run', {
method:'POST',
headers:{'Content-Type':'application/json'},
body:JSON.stringify({device: disk.device, lbaf: lbaf})
}).then(function(r) { return r.json().then(function(d) { if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status)); return d; }); }).then(function(d) {
rowMsg.textContent = 'Task ' + d.task_id + ' queued.';
nvmeWaitTaskDone(d.task_id, rowMsg);
}).catch(function(e) {
rowMsg.style.color = 'var(--crit-fg,#9f3a38)';
rowMsg.textContent = 'Error: ' + e.message;
}).finally(function() {
btn.disabled = false;
});
}
loadNVMeFormats();
</script>`
}
func renderNVMeFormatCard() string {
return `<div class="card"><div class="card-head">NVMe Block Format <button class="btn btn-sm btn-secondary" onclick="loadNVMeFormats()" style="margin-left:auto">&#8635; Refresh</button></div><div class="card-body">` +
`<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Lists NVMe namespaces and changes their LBA format through a queued task.</p>` +
renderNVMeFormatInline() + `</div></div>`
}

View File

@@ -475,6 +475,7 @@ function installToRAM() {
<div class="card"><div class="card-head">Services</div><div class="card-body">` +
renderServicesInline() + `</div></div>
` + renderNVMeFormatCard() + `
<script>
function checkTools() {

View File

@@ -307,6 +307,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
// Tools
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
mux.HandleFunc("GET /api/tools/nvme-formats", h.handleAPINVMeFormats)
mux.HandleFunc("POST /api/tools/nvme-format/run", h.handleAPINVMeFormatRun)
// GPU presence / tools
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)

View File

@@ -677,6 +677,12 @@ func TestToolsPageRendersNvidiaSelfHealSection(t *testing.T) {
if !strings.Contains(body, `/api/blackbox/status`) {
t.Fatalf("tools page missing black-box status api usage: %s", body)
}
if !strings.Contains(body, `NVMe Block Format`) {
t.Fatalf("tools page missing nvme block format section: %s", body)
}
if !strings.Contains(body, `/api/tools/nvme-formats`) || !strings.Contains(body, `/api/tools/nvme-format/run`) {
t.Fatalf("tools page missing nvme format api usage: %s", body)
}
}
func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {

View File

@@ -376,6 +376,12 @@ func executeTaskWithOptions(opts *HandlerOptions, t *Task, j *jobState, ctx cont
break
}
err = a.RunInstallToRAM(ctx, j.append)
case "nvme-format":
if strings.TrimSpace(t.params.Device) == "" {
err = fmt.Errorf("device is required")
break
}
err = runNVMeFormatTask(ctx, j, t.params.Device, t.params.LBAF)
default:
j.append("ERROR: unknown target: " + t.Target)
j.finish("unknown target")

View File

@@ -57,6 +57,7 @@ var taskNames = map[string]string{
"support-bundle": "Support Bundle",
"install": "Install to Disk",
"install-to-ram": "Install to RAM",
"nvme-format": "NVMe Block Format Change",
}
// burnNames maps target → human-readable name when a burn profile is set.
@@ -137,6 +138,7 @@ type taskParams struct {
RampRunID string `json:"ramp_run_id,omitempty"`
DisplayName string `json:"display_name,omitempty"`
Device string `json:"device,omitempty"` // for install
LBAF int `json:"lbaf,omitempty"`
PlatformComponents []string `json:"platform_components,omitempty"`
}
@@ -598,6 +600,17 @@ func (q *taskQueue) startRecoveredTaskMonitorLocked(t *Task, j *jobState) {
}
func (q *taskQueue) runTaskExternal(t *Task, j *jobState) {
startedKmsgWatch := false
if q.kmsgWatcher != nil && isSATTarget(t.Target) {
q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
startedKmsgWatch = true
}
defer func() {
if startedKmsgWatch && q.kmsgWatcher != nil {
q.kmsgWatcher.NotifyTaskFinished(t.ID)
}
}()
stopTail := make(chan struct{})
doneTail := make(chan struct{})
defer func() {

View File

@@ -126,6 +126,23 @@ func TestNewTaskJobStateLoadsExistingLog(t *testing.T) {
}
}
func TestJobAppendFlushesTaskLogImmediately(t *testing.T) {
dir := t.TempDir()
path := filepath.Join(dir, "task.log")
j := newTaskJobState(path)
j.append("live-line")
data, err := os.ReadFile(path)
if err != nil {
t.Fatal(err)
}
if string(data) != "live-line\n" {
t.Fatalf("log=%q want live-line newline", string(data))
}
j.closeLog()
}
func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
now := time.Date(2026, 4, 2, 12, 0, 0, 0, time.UTC)
q := &taskQueue{
@@ -849,3 +866,82 @@ func TestExecuteTaskMarksPanicsAsFailedAndClosesKmsgWindow(t *testing.T) {
t.Fatalf("expected kmsg window to be cleared, got %+v", window)
}
}
func TestRunTaskExternalOpensAndClosesKmsgWindow(t *testing.T) {
dir := t.TempDir()
releasePath := filepath.Join(dir, "release")
readyPath := filepath.Join(dir, "ready")
q := &taskQueue{
opts: &HandlerOptions{ExportDir: dir},
logsDir: filepath.Join(dir, "tasks"),
kmsgWatcher: newKmsgWatcher(nil),
trigger: make(chan struct{}, 1),
}
if err := os.MkdirAll(q.logsDir, 0755); err != nil {
t.Fatal(err)
}
tk := &Task{
ID: "cpu-external-1",
Name: "CPU SAT",
Target: "cpu",
Status: TaskRunning,
CreatedAt: time.Now(),
}
q.assignTaskLogPathLocked(tk)
j := newTaskJobState(tk.LogPath)
orig := externalTaskRunnerCommand
externalTaskRunnerCommand = func(exportDir, taskID string) (*exec.Cmd, error) {
script := "printf ready > \"$1\"; while [ ! -f \"$2\" ]; do sleep 0.05; done"
return exec.Command("sh", "-c", script, "sh", readyPath, releasePath), nil
}
defer func() { externalTaskRunnerCommand = orig }()
done := make(chan struct{})
go func() {
q.runTaskExternal(tk, j)
close(done)
}()
deadline := time.Now().Add(2 * time.Second)
for time.Now().Before(deadline) {
if _, err := os.Stat(readyPath); err == nil {
break
}
time.Sleep(20 * time.Millisecond)
}
if _, err := os.Stat(readyPath); err != nil {
t.Fatalf("external runner did not start: %v", err)
}
q.kmsgWatcher.mu.Lock()
activeCount := q.kmsgWatcher.activeCount
window := q.kmsgWatcher.window
q.kmsgWatcher.mu.Unlock()
if activeCount != 1 {
t.Fatalf("activeCount while running=%d want 1", activeCount)
}
if window == nil || len(window.targets) != 1 || window.targets[0] != "cpu" {
t.Fatalf("window while running=%+v", window)
}
if err := os.WriteFile(releasePath, []byte("1\n"), 0644); err != nil {
t.Fatal(err)
}
select {
case <-done:
case <-time.After(2 * time.Second):
t.Fatal("runTaskExternal did not return")
}
q.kmsgWatcher.mu.Lock()
activeCount = q.kmsgWatcher.activeCount
window = q.kmsgWatcher.window
q.kmsgWatcher.mu.Unlock()
if activeCount != 0 {
t.Fatalf("activeCount after finish=%d want 0", activeCount)
}
if window != nil {
t.Fatalf("expected kmsg window to be cleared, got %+v", window)
}
}