feat: redesign collection UI + add StopHostAfterCollect + TCP ping pre-probe

- Single "Подключиться" ("Connect") button flow: probe first, then show collect options
- Power management checkboxes: power on before / stop after collect
- Modal confirmation when enabling shutdown on already-powered-on host
- StopHostAfterCollect flag: host shuts down only when explicitly requested
- TCP ping (10 attempts, min 3 successes) before Redfish probe
- Debug payloads checkbox (Oem/Ami/Inventory/Crc, off by default) — see the example request below
- Remove platform_config BIOS settings collection (unreliable on AMI)
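
Example request body exercising the new flags (a sketch only: the host/port tag names and the tls_mode value are assumptions, since those parts of CollectRequest sit outside this diff; password, power_on_if_host_off, stop_host_after_collect, and debug_payloads match the JSON tags added below):

{
  "host": "192.0.2.10",
  "port": 443,
  "password": "<bmc-password>",
  "tls_mode": "insecure",
  "power_on_if_host_off": true,
  "stop_host_after_collect": true,
  "debug_payloads": false
}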

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Mikhail Chusavitin
2026-03-19 18:50:01 +03:00
parent e3ff1745fc
commit 063b08d5fb
9 changed files with 325 additions and 100 deletions

View File

@@ -159,14 +159,11 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre
 	systemPaths := c.discoverMemberPaths(discoveryCtx, snapshotClient, req, baseURL, "/redfish/v1/Systems", "/redfish/v1/Systems/1")
 	primarySystem := firstNonEmptyPath(systemPaths, "/redfish/v1/Systems/1")
-	poweredOnByCollector := false
 	if primarySystem != "" {
-		if on, changed := c.ensureHostPowerForCollection(ctx, snapshotClient, req, baseURL, primarySystem, emit); on {
-			poweredOnByCollector = changed
-		}
+		c.ensureHostPowerForCollection(ctx, snapshotClient, req, baseURL, primarySystem, emit)
 	}
 	defer func() {
-		if !poweredOnByCollector || primarySystem == "" {
+		if primarySystem == "" || !req.StopHostAfterCollect {
 			return
 		}
 		shutdownCtx, cancel := context.WithTimeout(context.Background(), 45*time.Second)
@@ -313,6 +310,10 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre
 	}
 	// Collect hardware event logs separately (not part of tree-walk to avoid bloat).
 	rawLogEntries := c.collectRedfishLogEntries(withRedfishTelemetryPhase(ctx, "log_entries"), snapshotClient, req, baseURL, systemPaths, managerPaths)
+	var debugPayloads map[string]any
+	if req.DebugPayloads {
+		debugPayloads = c.collectDebugPayloads(ctx, snapshotClient, req, baseURL, systemPaths)
+	}
 	rawPayloads := map[string]any{
 		"redfish_tree": rawTree,
 		"redfish_profiles": map[string]any{
@@ -418,6 +419,9 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre
 	if len(rawLogEntries) > 0 {
 		rawPayloads["redfish_log_entries"] = rawLogEntries
 	}
+	if len(debugPayloads) > 0 {
+		rawPayloads["redfish_debug_payloads"] = debugPayloads
+	}
 	// Unified tunnel: live collection and raw import go through the same analyzer over redfish_tree.
 	result, err := ReplayRedfishFromRawPayloads(rawPayloads, nil)
 	if err != nil {
@@ -618,6 +622,20 @@ func (c *RedfishConnector) restoreHostPowerAfterCollection(ctx context.Context,
 	}
 }
 
+// collectDebugPayloads fetches vendor-specific diagnostic endpoints on a best-effort basis.
+// Results are stored in rawPayloads["redfish_debug_payloads"] and exported with the bundle.
+// Enabled only when Request.DebugPayloads is true.
+func (c *RedfishConnector) collectDebugPayloads(ctx context.Context, client *http.Client, req Request, baseURL string, systemPaths []string) map[string]any {
+	out := map[string]any{}
+	for _, systemPath := range systemPaths {
+		// AMI/MSI: inventory CRC groups — reveals which groups are supported by this BMC.
+		if doc, err := c.getJSON(ctx, client, req, baseURL, joinPath(systemPath, "/Oem/Ami/Inventory/Crc")); err == nil {
+			out[joinPath(systemPath, "/Oem/Ami/Inventory/Crc")] = doc
+		}
+	}
+	return out
+}
+
 // invalidateRedfishInventory POSTs to the AMI/MSI InventoryCrc endpoint to zero out
 // all known CRC groups before a host power-on. This causes the BMC to accept fresh
 // inventory from the host after boot, preventing stale inventory (ghost GPUs, wrong
@@ -630,8 +648,6 @@ func (c *RedfishConnector) invalidateRedfishInventory(ctx context.Context, clien
{"CPU": 0},
{"DIMM": 0},
{"PCIE": 0},
{"CERTIFICATES": 0},
{"SECUREBOOT": 0},
},
}
if err := c.postJSON(ctx, client, req, baseURL, crcPath, body); err != nil {
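
After this change, the zeroed group list POSTed to the InventoryCrc endpoint covers only the three hardware groups; the CERTIFICATES and SECUREBOOT groups are no longer reset. A sketch of the remaining list fragment (the wrapper object around it sits outside this hunk):

[
  {"CPU": 0},
  {"DIMM": 0},
  {"PCIE": 0}
]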
@@ -5609,6 +5625,7 @@ func parseFirmware(system, bios, manager, networkProtocol map[string]interface{}
 	return out
 }
 
 func mapStatus(statusAny interface{}) string {
 	if statusAny == nil {
 		return ""

View File

@@ -123,7 +123,7 @@ func ReplayRedfishFromRawPayloads(rawPayloads map[string]any, emit ProgressFn) (
 			PowerSupply:     psus,
 			NetworkAdapters: nics,
 			Firmware:        firmware,
-			},
+		},
 	}
 	match := profileMatch
 	for _, profile := range match.Profiles {
@@ -277,6 +277,7 @@ func redfishFetchErrorsFromRawPayloads(rawPayloads map[string]any) map[string]st
 	}
 }
 
 func buildDriveFetchWarningEvents(rawPayloads map[string]any) []models.Event {
 	errs := redfishFetchErrorsFromRawPayloads(rawPayloads)
 	if len(errs) == 0 {

View File

@@ -15,7 +15,9 @@ type Request struct {
 	Password string
 	Token    string
 	TLSMode  string
-	PowerOnIfHostOff bool
+	PowerOnIfHostOff     bool
+	StopHostAfterCollect bool
+	DebugPayloads        bool
 }
 
 type Progress struct {

View File

@@ -43,13 +43,13 @@ func ConvertToReanimator(result *models.AnalysisResult) (*ReanimatorExport, erro
 		TargetHost:  targetHost,
 		CollectedAt: collectedAt,
 		Hardware: ReanimatorHardware{
-			Board: convertBoard(result.Hardware.BoardInfo),
-			Firmware: dedupeFirmware(convertFirmware(result.Hardware.Firmware)),
-			CPUs: dedupeCPUs(convertCPUsFromDevices(devices, collectedAt, result.Hardware.BoardInfo.SerialNumber, buildCPUMicrocodeBySocket(result.Hardware.Firmware))),
-			Memory: dedupeMemory(convertMemoryFromDevices(devices, collectedAt)),
-			Storage: dedupeStorage(convertStorageFromDevices(devices, collectedAt)),
-			PCIeDevices: dedupePCIe(convertPCIeFromDevices(devices, collectedAt)),
-			PowerSupplies: dedupePSUs(convertPSUsFromDevices(devices, collectedAt)),
+			Board:         convertBoard(result.Hardware.BoardInfo),
+			Firmware:      dedupeFirmware(convertFirmware(result.Hardware.Firmware)),
+			CPUs:          dedupeCPUs(convertCPUsFromDevices(devices, collectedAt, result.Hardware.BoardInfo.SerialNumber, buildCPUMicrocodeBySocket(result.Hardware.Firmware))),
+			Memory:        dedupeMemory(convertMemoryFromDevices(devices, collectedAt)),
+			Storage:       dedupeStorage(convertStorageFromDevices(devices, collectedAt)),
+			PCIeDevices:   dedupePCIe(convertPCIeFromDevices(devices, collectedAt)),
+			PowerSupplies: dedupePSUs(convertPSUsFromDevices(devices, collectedAt)),
 			Sensors:   convertSensors(result.Sensors),
 			EventLogs: convertEventLogs(result.Events, collectedAt),
 		},

View File

@@ -19,7 +19,9 @@ type CollectRequest struct {
 	Password string `json:"password,omitempty"`
 	Token    string `json:"token,omitempty"`
 	TLSMode  string `json:"tls_mode"`
-	PowerOnIfHostOff bool `json:"power_on_if_host_off,omitempty"`
+	PowerOnIfHostOff     bool `json:"power_on_if_host_off,omitempty"`
+	StopHostAfterCollect bool `json:"stop_host_after_collect,omitempty"`
+	DebugPayloads        bool `json:"debug_payloads,omitempty"`
 }
 
 type CollectProbeResponse struct {

View File

@@ -10,8 +10,10 @@ import (
"fmt"
"html/template"
"io"
"net"
"net/http"
"os"
"sync/atomic"
"path/filepath"
"regexp"
"sort"
@@ -1574,6 +1576,32 @@ func (s *Server) handleCollectStart(w http.ResponseWriter, r *http.Request) {
 	_ = json.NewEncoder(w).Encode(job.toJobResponse("Collection job accepted"))
 }
+
+// pingHost dials host:port total times concurrently, each with a 2s timeout, and
+// returns true if at least need attempts succeeded.
+func pingHost(host string, port int, total, need int) (bool, string) {
+	addr := fmt.Sprintf("%s:%d", host, port)
+	var successes atomic.Int32
+	done := make(chan struct{}, total)
+	for i := 0; i < total; i++ {
+		go func() {
+			defer func() { done <- struct{}{} }()
+			conn, err := net.DialTimeout("tcp", addr, 2*time.Second)
+			if err == nil {
+				conn.Close()
+				successes.Add(1)
+			}
+		}()
+	}
+	for i := 0; i < total; i++ {
+		<-done
+	}
+	n := int(successes.Load())
+	if n < need {
+		return false, fmt.Sprintf("Хост недоступен: только %d из %d попыток подключения к %s прошли успешно (требуется минимум %d)", n, total, addr, need)
+	}
+	return true, ""
+}
 func (s *Server) handleCollectProbe(w http.ResponseWriter, r *http.Request) {
 	var req CollectRequest
 	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
@@ -1595,6 +1623,11 @@ func (s *Server) handleCollectProbe(w http.ResponseWriter, r *http.Request) {
 		return
 	}
+	if ok, msg := pingHost(req.Host, req.Port, 10, 3); !ok {
+		jsonError(w, msg, http.StatusBadRequest)
+		return
+	}
 	ctx, cancel := context.WithTimeout(r.Context(), 20*time.Second)
 	defer cancel()
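
Because all dials run concurrently, the pre-probe costs at most roughly one dial timeout (~2 s) rather than 10 × 2 s sequentially, and the 3-of-10 threshold tolerates a BMC that drops some connections while still rejecting a dead host quickly. A minimal sketch of how pingHost could be exercised against a local listener (a hypothetical test, not part of this commit; assumes the standard net and testing imports in the same package):

func TestPingHostReachable(t *testing.T) {
	// Listen on an ephemeral port and accept-then-close every incoming dial.
	ln, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		t.Fatal(err)
	}
	defer ln.Close()
	go func() {
		for {
			conn, err := ln.Accept()
			if err != nil {
				return
			}
			conn.Close()
		}
	}()
	port := ln.Addr().(*net.TCPAddr).Port
	if ok, msg := pingHost("127.0.0.1", port, 10, 3); !ok {
		t.Fatalf("expected host to be reachable: %s", msg)
	}
}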
@@ -1967,7 +2000,9 @@ func toCollectorRequest(req CollectRequest) collector.Request {
 		Password: req.Password,
 		Token:    req.Token,
 		TLSMode:  req.TLSMode,
-		PowerOnIfHostOff: req.PowerOnIfHostOff,
+		PowerOnIfHostOff:     req.PowerOnIfHostOff,
+		StopHostAfterCollect: req.StopHostAfterCollect,
+		DebugPayloads:        req.DebugPayloads,
 	}
 }