diff --git a/bible-local/05-collectors.md b/bible-local/05-collectors.md index ae87301..43ef298 100644 --- a/bible-local/05-collectors.md +++ b/bible-local/05-collectors.md @@ -35,18 +35,27 @@ If the collector adds a fallback, probe, or normalization rule, replay must mirr ### Preflight and host power -- `Probe()` may be used before collection to verify API connectivity and current host `PowerState` -- if the host is off and the user chose power-on, the collector may issue `ComputerSystem.Reset` - with `ResetType=On` -- power-on attempts are bounded and logged -- after a successful power-on, the collector waits an extra stabilization window, then checks - `PowerState` again and only starts collection if the host is still on -- if the collector powered on the host itself for collection, it must attempt to power it back off - after collection completes -- if the host was already on before collection, the collector must not power it off afterward -- if power-on fails, collection still continues against the powered-off host -- all power-control decisions and attempts must be visible in the collection log so they are - preserved in raw-export bundles +- `Probe()` is used before collection to verify API connectivity and report current host `PowerState` +- if the host is off, the collector logs a warning and proceeds with collection; inventory data may + be incomplete when the host is powered off +- power-on and power-off are not performed by the collector + +### Skip hung requests + +Redfish collection uses a two-level context model: + +- `ctx` — job lifetime context, cancelled only on explicit job cancel +- `collectCtx` — collection phase context, derived from `ctx`; covers snapshot, prefetch, and plan-B + +`collectCtx` is cancelled when the user presses "Пропустить зависшие" (skip hung). +On skip, all in-flight HTTP requests in the current phase are aborted immediately via context +cancellation, the crawler and plan-B loops exit, and execution proceeds to the replay phase using +whatever was collected in `rawTree`. The result is partial but valid. + +The skip signal travels: UI button → `POST /api/collect/{id}/skip` → `JobManager.SkipJob()` → +closes `skipCh` → goroutine in `Collect()` → `cancelCollect()`. + +The skip button is visible during `running` state and hidden once the job reaches a terminal state. ### Discovery model diff --git a/internal/collector/redfish.go b/internal/collector/redfish.go index df75046..647f99c 100644 --- a/internal/collector/redfish.go +++ b/internal/collector/redfish.go @@ -112,12 +112,11 @@ func (c *RedfishConnector) Probe(ctx context.Context, req Request) (*ProbeResult } powerState := redfishSystemPowerState(systemDoc) return &ProbeResult{ - Reachable: true, - Protocol: "redfish", - HostPowerState: powerState, - HostPoweredOn: isRedfishHostPoweredOn(powerState), - PowerControlAvailable: redfishResetActionTarget(systemDoc) != "", - SystemPath: primarySystem, + Reachable: true, + Protocol: "redfish", + HostPowerState: powerState, + HostPoweredOn: isRedfishHostPoweredOn(powerState), + SystemPath: primarySystem, }, nil } @@ -160,17 +159,6 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre systemPaths := c.discoverMemberPaths(discoveryCtx, snapshotClient, req, baseURL, "/redfish/v1/Systems", "/redfish/v1/Systems/1") primarySystem := firstNonEmptyPath(systemPaths, "/redfish/v1/Systems/1") - if primarySystem != "" { - c.ensureHostPowerForCollection(ctx, snapshotClient, req, baseURL, primarySystem, emit) - } - defer func() { - if primarySystem == "" || !req.StopHostAfterCollect { - return - } - shutdownCtx, cancel := context.WithTimeout(context.Background(), 45*time.Second) - defer cancel() - c.restoreHostPowerAfterCollection(shutdownCtx, snapshotClient, req, baseURL, primarySystem, emit) - }() chassisPaths := c.discoverMemberPaths(discoveryCtx, snapshotClient, req, baseURL, "/redfish/v1/Chassis", "/redfish/v1/Chassis/1") managerPaths := c.discoverMemberPaths(discoveryCtx, snapshotClient, req, baseURL, "/redfish/v1/Managers", "/redfish/v1/Managers/1") primaryChassis := firstNonEmptyPath(chassisPaths, "/redfish/v1/Chassis/1") @@ -269,12 +257,35 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre emit(Progress{Status: "running", Progress: 80, Message: "Redfish: подготовка расширенного snapshot...", CurrentPhase: "snapshot", ETASeconds: acquisitionPlan.Tuning.ETABaseline.SnapshotSeconds}) emit(Progress{Status: "running", Progress: 90, Message: "Redfish: сбор расширенного snapshot...", CurrentPhase: "snapshot", ETASeconds: acquisitionPlan.Tuning.ETABaseline.SnapshotSeconds}) } + // collectCtx covers all data-fetching phases (snapshot, prefetch, plan-B). + // Cancelling it via the skip signal aborts only the collection phases while + // leaving the replay phase intact so results from already-fetched data are preserved. + collectCtx, cancelCollect := context.WithCancel(ctx) + defer cancelCollect() + if req.SkipHungCh != nil { + go func() { + select { + case <-req.SkipHungCh: + if emit != nil { + emit(Progress{ + Status: "running", + Progress: 97, + Message: "Redfish: пропуск зависших запросов, анализ уже собранных данных...", + }) + } + log.Printf("redfish: skip-hung triggered, cancelling collection phases") + cancelCollect() + case <-ctx.Done(): + } + }() + } + c.debugSnapshotf("snapshot crawl start host=%s port=%d", req.Host, req.Port) - rawTree, fetchErrors, postProbeMetrics, snapshotTimingSummary := c.collectRawRedfishTree(withRedfishTelemetryPhase(ctx, "snapshot"), snapshotClient, req, baseURL, seedPaths, acquisitionPlan.Tuning, emit) + rawTree, fetchErrors, postProbeMetrics, snapshotTimingSummary := c.collectRawRedfishTree(withRedfishTelemetryPhase(collectCtx, "snapshot"), snapshotClient, req, baseURL, seedPaths, acquisitionPlan.Tuning, emit) c.debugSnapshotf("snapshot crawl done docs=%d", len(rawTree)) fetchErrMap := redfishFetchErrorListToMap(fetchErrors) - prefetchedCritical, prefetchMetrics := c.prefetchCriticalRedfishDocs(withRedfishTelemetryPhase(ctx, "prefetch"), prefetchClient, req, baseURL, criticalPaths, rawTree, fetchErrMap, acquisitionPlan.Tuning, emit) + prefetchedCritical, prefetchMetrics := c.prefetchCriticalRedfishDocs(withRedfishTelemetryPhase(collectCtx, "prefetch"), prefetchClient, req, baseURL, criticalPaths, rawTree, fetchErrMap, acquisitionPlan.Tuning, emit) for p, doc := range prefetchedCritical { if _, exists := rawTree[p]; exists { continue @@ -295,10 +306,10 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre prefetchMetrics.Duration.Round(time.Millisecond), firstNonEmpty(prefetchMetrics.SkipReason, "-"), ) - if recoveredN := c.recoverCriticalRedfishDocsPlanB(withRedfishTelemetryPhase(ctx, "critical_plan_b"), criticalClient, req, baseURL, criticalPaths, rawTree, fetchErrMap, acquisitionPlan.Tuning, emit); recoveredN > 0 { + if recoveredN := c.recoverCriticalRedfishDocsPlanB(withRedfishTelemetryPhase(collectCtx, "critical_plan_b"), criticalClient, req, baseURL, criticalPaths, rawTree, fetchErrMap, acquisitionPlan.Tuning, emit); recoveredN > 0 { c.debugSnapshotf("critical plan-b recovered docs=%d", recoveredN) } - if recoveredN := c.recoverProfilePlanBDocs(withRedfishTelemetryPhase(ctx, "profile_plan_b"), criticalClient, req, baseURL, acquisitionPlan, rawTree, emit); recoveredN > 0 { + if recoveredN := c.recoverProfilePlanBDocs(withRedfishTelemetryPhase(collectCtx, "profile_plan_b"), criticalClient, req, baseURL, acquisitionPlan, rawTree, emit); recoveredN > 0 { c.debugSnapshotf("profile plan-b recovered docs=%d", recoveredN) } // Hide transient fetch errors for endpoints that were eventually recovered into rawTree. @@ -485,230 +496,6 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre return result, nil } -func (c *RedfishConnector) ensureHostPowerForCollection(ctx context.Context, client *http.Client, req Request, baseURL, systemPath string, emit ProgressFn) (hostOn bool, poweredOnByCollector bool) { - systemDoc, err := c.getJSON(ctx, client, req, baseURL, systemPath) - if err != nil { - if emit != nil { - emit(Progress{Status: "running", Progress: 18, Message: "Redfish: не удалось проверить PowerState host, сбор продолжается без power-control"}) - } - return false, false - } - - powerState := redfishSystemPowerState(systemDoc) - if isRedfishHostPoweredOn(powerState) { - if emit != nil { - emit(Progress{Status: "running", Progress: 18, Message: fmt.Sprintf("Redfish: host включен (%s)", firstNonEmpty(powerState, "On"))}) - } - return true, false - } - - if emit != nil { - emit(Progress{Status: "running", Progress: 18, Message: fmt.Sprintf("Redfish: host выключен (%s)", firstNonEmpty(powerState, "Off"))}) - } - if !req.PowerOnIfHostOff { - if emit != nil { - emit(Progress{Status: "running", Progress: 19, Message: "Redfish: включение host не запрошено, сбор продолжается на выключенном host"}) - } - return false, false - } - - // Invalidate all inventory CRC groups before powering on so the BMC accepts - // fresh inventory from the host after boot. Best-effort: failure is logged but - // does not block power-on. - c.invalidateRedfishInventory(ctx, client, req, baseURL, systemPath, emit) - - resetTarget := redfishResetActionTarget(systemDoc) - resetType := redfishPickResetType(systemDoc, "On", "ForceOn") - if resetTarget == "" || resetType == "" { - if emit != nil { - emit(Progress{Status: "running", Progress: 19, Message: "Redfish: action ComputerSystem.Reset недоступен, сбор продолжается на выключенном host"}) - } - return false, false - } - - waitWindows := []time.Duration{5 * time.Second, 10 * time.Second, 30 * time.Second} - for i, waitFor := range waitWindows { - if emit != nil { - emit(Progress{Status: "running", Progress: 19, Message: fmt.Sprintf("Redfish: попытка включения host (%d/%d), ожидание %s", i+1, len(waitWindows), waitFor)}) - } - if err := c.postJSON(ctx, client, req, baseURL, resetTarget, map[string]any{"ResetType": resetType}); err != nil { - if emit != nil { - emit(Progress{Status: "running", Progress: 19, Message: fmt.Sprintf("Redfish: включение host не удалось (%v)", err)}) - } - continue - } - if c.waitForHostPowerState(ctx, client, req, baseURL, systemPath, true, waitFor) { - if !c.waitForStablePoweredOnHost(ctx, client, req, baseURL, systemPath, emit) { - if emit != nil { - emit(Progress{Status: "running", Progress: 20, Message: "Redfish: host включился, но не подтвердил стабильное состояние; сбор продолжается на выключенном host"}) - } - return false, false - } - if emit != nil { - emit(Progress{Status: "running", Progress: 20, Message: "Redfish: host успешно включен и стабилен перед сбором"}) - } - return true, true - } - if emit != nil { - emit(Progress{Status: "running", Progress: 20, Message: fmt.Sprintf("Redfish: host не включился за %s", waitFor)}) - } - } - - if emit != nil { - emit(Progress{Status: "running", Progress: 20, Message: "Redfish: host не удалось включить, сбор продолжается на выключенном host"}) - } - return false, false -} - -func (c *RedfishConnector) waitForStablePoweredOnHost(ctx context.Context, client *http.Client, req Request, baseURL, systemPath string, emit ProgressFn) bool { - stabilizationDelay := redfishPowerOnStabilizationDelay() - if stabilizationDelay > 0 { - if emit != nil { - emit(Progress{ - Status: "running", - Progress: 20, - Message: fmt.Sprintf("Redfish: host включен, ожидание стабилизации %s перед началом сбора", stabilizationDelay), - }) - } - timer := time.NewTimer(stabilizationDelay) - select { - case <-ctx.Done(): - timer.Stop() - return false - case <-timer.C: - timer.Stop() - } - } - if emit != nil { - emit(Progress{ - Status: "running", - Progress: 20, - Message: "Redfish: повторная проверка PowerState после стабилизации host", - }) - } - if !c.waitForHostPowerState(ctx, client, req, baseURL, systemPath, true, 5*time.Second) { - return false - } - - // After the initial stabilization wait, the BMC may still be populating its - // hardware inventory (PCIeDevices, memory summary). Poll readiness with - // increasing back-off (default: +60s, +120s), then warn and proceed. - readinessWaits := redfishBMCReadinessWaits() - for attempt, extraWait := range readinessWaits { - ready, reason := c.isBMCInventoryReady(ctx, client, req, baseURL, systemPath) - if ready { - if emit != nil { - emit(Progress{ - Status: "running", - Progress: 20, - Message: fmt.Sprintf("Redfish: BMC готов (%s)", reason), - }) - } - return true - } - if emit != nil { - emit(Progress{ - Status: "running", - Progress: 20, - Message: fmt.Sprintf("Redfish: BMC не готов (%s), ожидание %s (попытка %d/%d)", reason, extraWait, attempt+1, len(readinessWaits)), - }) - } - timer := time.NewTimer(extraWait) - select { - case <-ctx.Done(): - timer.Stop() - return false - case <-timer.C: - timer.Stop() - } - if emit != nil { - emit(Progress{ - Status: "running", - Progress: 20, - Message: fmt.Sprintf("Redfish: повторная проверка готовности BMC (%d/%d)...", attempt+1, len(readinessWaits)), - }) - } - } - ready, reason := c.isBMCInventoryReady(ctx, client, req, baseURL, systemPath) - if !ready { - if emit != nil { - emit(Progress{ - Status: "running", - Progress: 20, - Message: fmt.Sprintf("Redfish: WARNING — BMC не подтвердил готовность (%s), сбор может быть неполным", reason), - }) - } - } else if emit != nil { - emit(Progress{ - Status: "running", - Progress: 20, - Message: fmt.Sprintf("Redfish: BMC готов (%s)", reason), - }) - } - return true -} - -// isBMCInventoryReady checks whether the BMC has finished populating its -// hardware inventory after a power-on. Returns (ready, reason). -// It considers the BMC ready if either the system memory summary reports -// a non-zero total or the PCIeDevices collection is non-empty. -func (c *RedfishConnector) isBMCInventoryReady(ctx context.Context, client *http.Client, req Request, baseURL, systemPath string) (bool, string) { - systemDoc, err := c.getJSON(ctx, client, req, baseURL, systemPath) - if err != nil { - return false, "не удалось прочитать System" - } - if summary, ok := systemDoc["MemorySummary"].(map[string]interface{}); ok { - if asFloat(summary["TotalSystemMemoryGiB"]) > 0 { - return true, "MemorySummary заполнен" - } - } - pcieDoc, err := c.getJSON(ctx, client, req, baseURL, joinPath(systemPath, "/PCIeDevices")) - if err == nil { - if asInt(pcieDoc["Members@odata.count"]) > 0 { - return true, "PCIeDevices не пуст" - } - if members, ok := pcieDoc["Members"].([]interface{}); ok && len(members) > 0 { - return true, "PCIeDevices не пуст" - } - } - return false, "MemorySummary=0, PCIeDevices пуст" -} - -func (c *RedfishConnector) restoreHostPowerAfterCollection(ctx context.Context, client *http.Client, req Request, baseURL, systemPath string, emit ProgressFn) { - systemDoc, err := c.getJSON(ctx, client, req, baseURL, systemPath) - if err != nil { - if emit != nil { - emit(Progress{Status: "running", Progress: 100, Message: "Redfish: не удалось повторно прочитать system перед выключением host"}) - } - return - } - resetTarget := redfishResetActionTarget(systemDoc) - resetType := redfishPickResetType(systemDoc, "GracefulShutdown", "ForceOff", "PushPowerButton") - if resetTarget == "" || resetType == "" { - if emit != nil { - emit(Progress{Status: "running", Progress: 100, Message: "Redfish: выключение host после сбора недоступно"}) - } - return - } - if emit != nil { - emit(Progress{Status: "running", Progress: 100, Message: "Redfish: выключаем host после завершения сбора"}) - } - if err := c.postJSON(ctx, client, req, baseURL, resetTarget, map[string]any{"ResetType": resetType}); err != nil { - if emit != nil { - emit(Progress{Status: "running", Progress: 100, Message: fmt.Sprintf("Redfish: не удалось выключить host после сбора (%v)", err)}) - } - return - } - if c.waitForHostPowerState(ctx, client, req, baseURL, systemPath, false, 20*time.Second) { - if emit != nil { - emit(Progress{Status: "running", Progress: 100, Message: "Redfish: host выключен после завершения сбора"}) - } - return - } - if emit != nil { - emit(Progress{Status: "running", Progress: 100, Message: "Redfish: не удалось подтвердить выключение host после сбора"}) - } -} // collectDebugPayloads fetches vendor-specific diagnostic endpoints on a best-effort basis. // Results are stored in rawPayloads["redfish_debug_payloads"] and exported with the bundle. @@ -724,49 +511,6 @@ func (c *RedfishConnector) collectDebugPayloads(ctx context.Context, client *htt return out } -// invalidateRedfishInventory POSTs to the AMI/MSI InventoryCrc endpoint to zero out -// all known CRC groups before a host power-on. This causes the BMC to accept fresh -// inventory from the host after boot, preventing stale inventory (ghost GPUs, wrong -// BIOS version, etc.) from persisting across hardware changes. -// Best-effort: any error is logged and the call silently returns. -func (c *RedfishConnector) invalidateRedfishInventory(ctx context.Context, client *http.Client, req Request, baseURL, systemPath string, emit ProgressFn) { - crcPath := joinPath(systemPath, "/Oem/Ami/Inventory/Crc") - body := map[string]any{ - "GroupCrcList": []map[string]any{ - {"CPU": 0}, - {"DIMM": 0}, - {"PCIE": 0}, - }, - } - if err := c.postJSON(ctx, client, req, baseURL, crcPath, body); err != nil { - log.Printf("redfish: inventory invalidation skipped (not AMI/MSI or endpoint unavailable): %v", err) - return - } - log.Printf("redfish: inventory CRC groups invalidated at %s before host power-on", crcPath) - if emit != nil { - emit(Progress{Status: "running", Progress: 19, Message: "Redfish: инвентарь BMC инвалидирован перед включением host (все CRC группы сброшены)"}) - } -} - -func (c *RedfishConnector) waitForHostPowerState(ctx context.Context, client *http.Client, req Request, baseURL, systemPath string, wantOn bool, timeout time.Duration) bool { - deadline := time.Now().Add(timeout) - for { - systemDoc, err := c.getJSON(ctx, client, req, baseURL, systemPath) - if err == nil { - if isRedfishHostPoweredOn(redfishSystemPowerState(systemDoc)) == wantOn { - return true - } - } - if time.Now().After(deadline) { - return false - } - select { - case <-ctx.Done(): - return false - case <-time.After(1 * time.Second): - } - } -} func firstNonEmptyPath(paths []string, fallback string) string { for _, p := range paths { @@ -799,48 +543,6 @@ func redfishSystemPowerState(systemDoc map[string]interface{}) string { return "" } -func redfishResetActionTarget(systemDoc map[string]interface{}) string { - if systemDoc == nil { - return "" - } - actions, _ := systemDoc["Actions"].(map[string]interface{}) - reset, _ := actions["#ComputerSystem.Reset"].(map[string]interface{}) - target := strings.TrimSpace(asString(reset["target"])) - if target != "" { - return target - } - odataID := strings.TrimSpace(asString(systemDoc["@odata.id"])) - if odataID == "" { - return "" - } - return joinPath(odataID, "/Actions/ComputerSystem.Reset") -} - -func redfishPickResetType(systemDoc map[string]interface{}, preferred ...string) string { - actions, _ := systemDoc["Actions"].(map[string]interface{}) - reset, _ := actions["#ComputerSystem.Reset"].(map[string]interface{}) - allowedRaw, _ := reset["ResetType@Redfish.AllowableValues"].([]interface{}) - if len(allowedRaw) == 0 { - if len(preferred) > 0 { - return preferred[0] - } - return "" - } - allowed := make([]string, 0, len(allowedRaw)) - for _, item := range allowedRaw { - if v := strings.TrimSpace(asString(item)); v != "" { - allowed = append(allowed, v) - } - } - for _, want := range preferred { - for _, have := range allowed { - if strings.EqualFold(want, have) { - return have - } - } - } - return "" -} func (c *RedfishConnector) postJSON(ctx context.Context, client *http.Client, req Request, baseURL, resourcePath string, payload map[string]any) error { body, err := json.Marshal(payload) @@ -2597,33 +2299,6 @@ func redfishCriticalSlowGap() time.Duration { return 1200 * time.Millisecond } -func redfishPowerOnStabilizationDelay() time.Duration { - if v := strings.TrimSpace(os.Getenv("LOGPILE_REDFISH_POWERON_STABILIZATION")); v != "" { - if d, err := time.ParseDuration(v); err == nil && d >= 0 { - return d - } - } - return 60 * time.Second -} - -// redfishBMCReadinessWaits returns the extra wait durations used when polling -// BMC inventory readiness after power-on. Defaults: [60s, 120s]. -// Override with LOGPILE_REDFISH_BMC_READY_WAITS (comma-separated durations, -// e.g. "60s,120s"). -func redfishBMCReadinessWaits() []time.Duration { - if v := strings.TrimSpace(os.Getenv("LOGPILE_REDFISH_BMC_READY_WAITS")); v != "" { - var out []time.Duration - for _, part := range strings.Split(v, ",") { - if d, err := time.ParseDuration(strings.TrimSpace(part)); err == nil && d >= 0 { - out = append(out, d) - } - } - if len(out) > 0 { - return out - } - } - return []time.Duration{60 * time.Second, 120 * time.Second} -} func redfishSnapshotMemoryRequestTimeout() time.Duration { if v := strings.TrimSpace(os.Getenv("LOGPILE_REDFISH_MEMORY_TIMEOUT")); v != "" { diff --git a/internal/collector/redfish_test.go b/internal/collector/redfish_test.go index aa844d8..0e51920 100644 --- a/internal/collector/redfish_test.go +++ b/internal/collector/redfish_test.go @@ -265,9 +265,6 @@ func TestRedfishConnectorProbe(t *testing.T) { if got.HostPowerState != "Off" { t.Fatalf("expected power state Off, got %q", got.HostPowerState) } - if !got.PowerControlAvailable { - t.Fatalf("expected power control available") - } } func TestRedfishConnectorProbe_FallsBackToPowerSummary(t *testing.T) { @@ -330,225 +327,6 @@ func TestRedfishConnectorProbe_FallsBackToPowerSummary(t *testing.T) { if got.HostPowerState != "On" { t.Fatalf("expected power state On, got %q", got.HostPowerState) } - if !got.PowerControlAvailable { - t.Fatalf("expected power control available") - } -} - -func TestEnsureHostPowerForCollection_WaitsForStablePowerOn(t *testing.T) { - t.Setenv("LOGPILE_REDFISH_POWERON_STABILIZATION", "1ms") - t.Setenv("LOGPILE_REDFISH_BMC_READY_WAITS", "1ms,1ms") - - powerState := "Off" - resetCalls := 0 - - mux := http.NewServeMux() - mux.HandleFunc("/redfish/v1/Systems/1", func(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "application/json") - _ = json.NewEncoder(w).Encode(map[string]interface{}{ - "@odata.id": "/redfish/v1/Systems/1", - "PowerState": powerState, - "MemorySummary": map[string]interface{}{ - "TotalSystemMemoryGiB": 128, - }, - "Actions": map[string]interface{}{ - "#ComputerSystem.Reset": map[string]interface{}{ - "target": "/redfish/v1/Systems/1/Actions/ComputerSystem.Reset", - "ResetType@Redfish.AllowableValues": []interface{}{"On"}, - }, - }, - }) - }) - mux.HandleFunc("/redfish/v1/Systems/1/Actions/ComputerSystem.Reset", func(w http.ResponseWriter, r *http.Request) { - resetCalls++ - powerState = "On" - w.WriteHeader(http.StatusOK) - }) - - ts := httptest.NewTLSServer(mux) - defer ts.Close() - - u, err := url.Parse(ts.URL) - if err != nil { - t.Fatalf("parse server url: %v", err) - } - port := 443 - if u.Port() != "" { - fmt.Sscanf(u.Port(), "%d", &port) - } - - c := NewRedfishConnector() - hostOn, changed := c.ensureHostPowerForCollection(context.Background(), c.httpClientWithTimeout(Request{TLSMode: "insecure"}, 5*time.Second), Request{ - Host: u.Hostname(), - Protocol: "redfish", - Port: port, - Username: "admin", - AuthType: "password", - Password: "secret", - TLSMode: "insecure", - PowerOnIfHostOff: true, - }, ts.URL, "/redfish/v1/Systems/1", nil) - if !hostOn || !changed { - t.Fatalf("expected stable power-on result, got hostOn=%v changed=%v", hostOn, changed) - } - if resetCalls != 1 { - t.Fatalf("expected one reset call, got %d", resetCalls) - } -} - -func TestEnsureHostPowerForCollection_FailsIfHostDoesNotStayOnAfterStabilization(t *testing.T) { - t.Setenv("LOGPILE_REDFISH_POWERON_STABILIZATION", "1ms") - t.Setenv("LOGPILE_REDFISH_BMC_READY_WAITS", "1ms,1ms") - - powerState := "Off" - - mux := http.NewServeMux() - mux.HandleFunc("/redfish/v1/Systems/1", func(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "application/json") - current := powerState - if powerState == "On" { - powerState = "Off" - } - _ = json.NewEncoder(w).Encode(map[string]interface{}{ - "@odata.id": "/redfish/v1/Systems/1", - "PowerState": current, - "Actions": map[string]interface{}{ - "#ComputerSystem.Reset": map[string]interface{}{ - "target": "/redfish/v1/Systems/1/Actions/ComputerSystem.Reset", - "ResetType@Redfish.AllowableValues": []interface{}{"On"}, - }, - }, - }) - }) - mux.HandleFunc("/redfish/v1/Systems/1/Actions/ComputerSystem.Reset", func(w http.ResponseWriter, r *http.Request) { - powerState = "On" - w.WriteHeader(http.StatusOK) - }) - - ts := httptest.NewTLSServer(mux) - defer ts.Close() - - u, err := url.Parse(ts.URL) - if err != nil { - t.Fatalf("parse server url: %v", err) - } - port := 443 - if u.Port() != "" { - fmt.Sscanf(u.Port(), "%d", &port) - } - - c := NewRedfishConnector() - hostOn, changed := c.ensureHostPowerForCollection(context.Background(), c.httpClientWithTimeout(Request{TLSMode: "insecure"}, 5*time.Second), Request{ - Host: u.Hostname(), - Protocol: "redfish", - Port: port, - Username: "admin", - AuthType: "password", - Password: "secret", - TLSMode: "insecure", - PowerOnIfHostOff: true, - }, ts.URL, "/redfish/v1/Systems/1", nil) - if hostOn || changed { - t.Fatalf("expected unstable power-on result to fail, got hostOn=%v changed=%v", hostOn, changed) - } -} - -func TestEnsureHostPowerForCollection_UsesPowerSummaryState(t *testing.T) { - t.Setenv("LOGPILE_REDFISH_POWERON_STABILIZATION", "1ms") - t.Setenv("LOGPILE_REDFISH_BMC_READY_WAITS", "1ms,1ms") - - powerState := "On" - - mux := http.NewServeMux() - mux.HandleFunc("/redfish/v1/Systems/1", func(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "application/json") - _ = json.NewEncoder(w).Encode(map[string]interface{}{ - "@odata.id": "/redfish/v1/Systems/1", - "PowerSummary": map[string]interface{}{ - "PowerState": powerState, - }, - "MemorySummary": map[string]interface{}{ - "TotalSystemMemoryGiB": 128, - }, - "Actions": map[string]interface{}{ - "#ComputerSystem.Reset": map[string]interface{}{ - "target": "/redfish/v1/Systems/1/Actions/ComputerSystem.Reset", - "ResetType@Redfish.AllowableValues": []interface{}{"On"}, - }, - }, - }) - }) - - ts := httptest.NewTLSServer(mux) - defer ts.Close() - - u, err := url.Parse(ts.URL) - if err != nil { - t.Fatalf("parse server url: %v", err) - } - port := 443 - if u.Port() != "" { - fmt.Sscanf(u.Port(), "%d", &port) - } - - c := NewRedfishConnector() - hostOn, changed := c.ensureHostPowerForCollection(context.Background(), c.httpClientWithTimeout(Request{TLSMode: "insecure"}, 5*time.Second), Request{ - Host: u.Hostname(), - Protocol: "redfish", - Port: port, - Username: "admin", - AuthType: "password", - Password: "secret", - TLSMode: "insecure", - PowerOnIfHostOff: true, - }, ts.URL, "/redfish/v1/Systems/1", nil) - if !hostOn || changed { - t.Fatalf("expected already-on host from PowerSummary, got hostOn=%v changed=%v", hostOn, changed) - } -} - -func TestWaitForHostPowerState_UsesPowerSummaryState(t *testing.T) { - powerState := "Off" - mux := http.NewServeMux() - mux.HandleFunc("/redfish/v1/Systems/1", func(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "application/json") - current := powerState - if powerState == "Off" { - powerState = "On" - } - _ = json.NewEncoder(w).Encode(map[string]interface{}{ - "@odata.id": "/redfish/v1/Systems/1", - "PowerSummary": map[string]interface{}{ - "PowerState": current, - }, - }) - }) - - ts := httptest.NewTLSServer(mux) - defer ts.Close() - - u, err := url.Parse(ts.URL) - if err != nil { - t.Fatalf("parse server url: %v", err) - } - port := 443 - if u.Port() != "" { - fmt.Sscanf(u.Port(), "%d", &port) - } - - c := NewRedfishConnector() - ok := c.waitForHostPowerState(context.Background(), c.httpClientWithTimeout(Request{TLSMode: "insecure"}, 5*time.Second), Request{ - Host: u.Hostname(), - Protocol: "redfish", - Port: port, - Username: "admin", - AuthType: "password", - Password: "secret", - TLSMode: "insecure", - }, ts.URL, "/redfish/v1/Systems/1", true, 3*time.Second) - if !ok { - t.Fatalf("expected waitForHostPowerState to use PowerSummary") - } } func TestParsePCIeDeviceSlot_FromNestedRedfishSlotLocation(t *testing.T) { diff --git a/internal/collector/types.go b/internal/collector/types.go index 390b195..f62f578 100644 --- a/internal/collector/types.go +++ b/internal/collector/types.go @@ -15,9 +15,8 @@ type Request struct { Password string Token string TLSMode string - PowerOnIfHostOff bool - StopHostAfterCollect bool - DebugPayloads bool + DebugPayloads bool + SkipHungCh <-chan struct{} } type Progress struct { @@ -65,10 +64,9 @@ type PhaseTelemetry struct { type ProbeResult struct { Reachable bool Protocol string - HostPowerState string - HostPoweredOn bool - PowerControlAvailable bool - SystemPath string + HostPowerState string + HostPoweredOn bool + SystemPath string } type Connector interface { diff --git a/internal/server/collect_handlers_test.go b/internal/server/collect_handlers_test.go index c7fa9f5..4f566a9 100644 --- a/internal/server/collect_handlers_test.go +++ b/internal/server/collect_handlers_test.go @@ -24,6 +24,7 @@ func newCollectTestServer() (*Server, *httptest.Server) { mux.HandleFunc("POST /api/collect", s.handleCollectStart) mux.HandleFunc("GET /api/collect/{id}", s.handleCollectStatus) mux.HandleFunc("POST /api/collect/{id}/cancel", s.handleCollectCancel) + mux.HandleFunc("POST /api/collect/{id}/skip", s.handleCollectSkip) return s, httptest.NewServer(mux) } @@ -65,9 +66,6 @@ func TestCollectProbe(t *testing.T) { if payload.HostPowerState != "Off" { t.Fatalf("expected host power state Off, got %q", payload.HostPowerState) } - if !payload.PowerControlAvailable { - t.Fatalf("expected power control to be available") - } } func TestCollectLifecycleToTerminal(t *testing.T) { diff --git a/internal/server/collect_test_helpers_test.go b/internal/server/collect_test_helpers_test.go index 2baf319..599430a 100644 --- a/internal/server/collect_test_helpers_test.go +++ b/internal/server/collect_test_helpers_test.go @@ -26,12 +26,11 @@ func (c *mockConnector) Probe(ctx context.Context, req collector.Request) (*coll hostPoweredOn = false } return &collector.ProbeResult{ - Reachable: true, - Protocol: c.protocol, - HostPowerState: map[bool]string{true: "On", false: "Off"}[hostPoweredOn], - HostPoweredOn: hostPoweredOn, - PowerControlAvailable: true, - SystemPath: "/redfish/v1/Systems/1", + Reachable: true, + Protocol: c.protocol, + HostPowerState: map[bool]string{true: "On", false: "Off"}[hostPoweredOn], + HostPoweredOn: hostPoweredOn, + SystemPath: "/redfish/v1/Systems/1", }, nil } diff --git a/internal/server/collect_types.go b/internal/server/collect_types.go index 7b60ae7..f1cd8f8 100644 --- a/internal/server/collect_types.go +++ b/internal/server/collect_types.go @@ -19,18 +19,15 @@ type CollectRequest struct { Password string `json:"password,omitempty"` Token string `json:"token,omitempty"` TLSMode string `json:"tls_mode"` - PowerOnIfHostOff bool `json:"power_on_if_host_off,omitempty"` - StopHostAfterCollect bool `json:"stop_host_after_collect,omitempty"` - DebugPayloads bool `json:"debug_payloads,omitempty"` + DebugPayloads bool `json:"debug_payloads,omitempty"` } type CollectProbeResponse struct { Reachable bool `json:"reachable"` Protocol string `json:"protocol,omitempty"` - HostPowerState string `json:"host_power_state,omitempty"` - HostPoweredOn bool `json:"host_powered_on"` - PowerControlAvailable bool `json:"power_control_available"` - Message string `json:"message,omitempty"` + HostPowerState string `json:"host_power_state,omitempty"` + HostPoweredOn bool `json:"host_powered_on"` + Message string `json:"message,omitempty"` } type CollectJobResponse struct { @@ -78,7 +75,8 @@ type Job struct { CreatedAt time.Time UpdatedAt time.Time RequestMeta CollectRequestMeta - cancel func() + cancel func() + skipFn func() } type CollectModuleStatus struct { diff --git a/internal/server/handlers.go b/internal/server/handlers.go index ae6b01a..4b6cc71 100644 --- a/internal/server/handlers.go +++ b/internal/server/handlers.go @@ -18,6 +18,7 @@ import ( "sort" "strconv" "strings" + "sync" "sync/atomic" "time" @@ -1674,34 +1675,28 @@ func (s *Server) handleCollectProbe(w http.ResponseWriter, r *http.Request) { message := "Связь с BMC установлена" if result != nil { - switch { - case !result.HostPoweredOn && result.PowerControlAvailable: - message = "Связь с BMC установлена, host выключен. Можно включить перед сбором." - case !result.HostPoweredOn: - message = "Связь с BMC установлена, host выключен." - default: - message = "Связь с BMC установлена, host включен." + if result.HostPoweredOn { + message = "Связь с BMC установлена, host включён." + } else { + message = "Связь с BMC установлена, host выключен. Данные инвентаря могут быть неполными." } } hostPowerState := "" hostPoweredOn := false - powerControlAvailable := false reachable := false if result != nil { reachable = result.Reachable hostPowerState = strings.TrimSpace(result.HostPowerState) hostPoweredOn = result.HostPoweredOn - powerControlAvailable = result.PowerControlAvailable } jsonResponse(w, CollectProbeResponse{ - Reachable: reachable, - Protocol: req.Protocol, - HostPowerState: hostPowerState, - HostPoweredOn: hostPoweredOn, - PowerControlAvailable: powerControlAvailable, - Message: message, + Reachable: reachable, + Protocol: req.Protocol, + HostPowerState: hostPowerState, + HostPoweredOn: hostPoweredOn, + Message: message, }) } @@ -1737,6 +1732,22 @@ func (s *Server) handleCollectCancel(w http.ResponseWriter, r *http.Request) { jsonResponse(w, job.toStatusResponse()) } +func (s *Server) handleCollectSkip(w http.ResponseWriter, r *http.Request) { + jobID := strings.TrimSpace(r.PathValue("id")) + if !isValidCollectJobID(jobID) { + jsonError(w, "Invalid collect job id", http.StatusBadRequest) + return + } + + job, ok := s.jobManager.SkipJob(jobID) + if !ok { + jsonError(w, "Collect job not found", http.StatusNotFound) + return + } + + jsonResponse(w, job.toStatusResponse()) +} + func (s *Server) startCollectionJob(jobID string, req CollectRequest) { ctx, cancel := context.WithCancel(context.Background()) if attached := s.jobManager.AttachJobCancel(jobID, cancel); !attached { @@ -1744,6 +1755,11 @@ func (s *Server) startCollectionJob(jobID string, req CollectRequest) { return } + skipCh := make(chan struct{}) + var skipOnce sync.Once + skipFn := func() { skipOnce.Do(func() { close(skipCh) }) } + s.jobManager.AttachJobSkip(jobID, skipFn) + go func() { connector, ok := s.getCollector(req.Protocol) if !ok { @@ -1811,7 +1827,9 @@ func (s *Server) startCollectionJob(jobID string, req CollectRequest) { } } - result, err := connector.Collect(ctx, toCollectorRequest(req), emitProgress) + collectorReq := toCollectorRequest(req) + collectorReq.SkipHungCh = skipCh + result, err := connector.Collect(ctx, collectorReq, emitProgress) if err != nil { if ctx.Err() != nil { return @@ -2035,9 +2053,7 @@ func toCollectorRequest(req CollectRequest) collector.Request { Password: req.Password, Token: req.Token, TLSMode: req.TLSMode, - PowerOnIfHostOff: req.PowerOnIfHostOff, - StopHostAfterCollect: req.StopHostAfterCollect, - DebugPayloads: req.DebugPayloads, + DebugPayloads: req.DebugPayloads, } } diff --git a/internal/server/job_manager.go b/internal/server/job_manager.go index 8a402ab..54fdb95 100644 --- a/internal/server/job_manager.go +++ b/internal/server/job_manager.go @@ -175,6 +175,43 @@ func (m *JobManager) UpdateJobDebugInfo(id string, info *CollectDebugInfo) (*Job return cloned, true } +func (m *JobManager) AttachJobSkip(id string, skipFn func()) bool { + m.mu.Lock() + defer m.mu.Unlock() + + job, ok := m.jobs[id] + if !ok || job == nil || isTerminalCollectStatus(job.Status) { + return false + } + job.skipFn = skipFn + return true +} + +func (m *JobManager) SkipJob(id string) (*Job, bool) { + m.mu.Lock() + job, ok := m.jobs[id] + if !ok || job == nil { + m.mu.Unlock() + return nil, false + } + if isTerminalCollectStatus(job.Status) { + cloned := cloneJob(job) + m.mu.Unlock() + return cloned, true + } + skipFn := job.skipFn + job.skipFn = nil + job.UpdatedAt = time.Now().UTC() + job.Logs = append(job.Logs, formatCollectLogLine(job.UpdatedAt, "Пропуск зависших запросов по команде пользователя")) + cloned := cloneJob(job) + m.mu.Unlock() + + if skipFn != nil { + skipFn() + } + return cloned, true +} + func (m *JobManager) AttachJobCancel(id string, cancelFn context.CancelFunc) bool { m.mu.Lock() defer m.mu.Unlock() @@ -229,5 +266,6 @@ func cloneJob(job *Job) *Job { cloned.CurrentPhase = job.CurrentPhase cloned.ETASeconds = job.ETASeconds cloned.cancel = nil + cloned.skipFn = nil return &cloned } diff --git a/internal/server/server.go b/internal/server/server.go index 6e035c4..2fb4ebf 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -99,6 +99,7 @@ func (s *Server) setupRoutes() { s.mux.HandleFunc("POST /api/collect/probe", s.handleCollectProbe) s.mux.HandleFunc("GET /api/collect/{id}", s.handleCollectStatus) s.mux.HandleFunc("POST /api/collect/{id}/cancel", s.handleCollectCancel) + s.mux.HandleFunc("POST /api/collect/{id}/skip", s.handleCollectSkip) } func (s *Server) Run() error { diff --git a/internal/server/upload_live_smoke_test.go b/internal/server/upload_live_smoke_test.go index 36cdf80..62bb71b 100644 --- a/internal/server/upload_live_smoke_test.go +++ b/internal/server/upload_live_smoke_test.go @@ -24,6 +24,7 @@ func newFlowTestServer() (*Server, *httptest.Server) { mux.HandleFunc("POST /api/collect", s.handleCollectStart) mux.HandleFunc("GET /api/collect/{id}", s.handleCollectStatus) mux.HandleFunc("POST /api/collect/{id}/cancel", s.handleCollectCancel) + mux.HandleFunc("POST /api/collect/{id}/skip", s.handleCollectSkip) return s, httptest.NewServer(mux) } diff --git a/web/static/css/style.css b/web/static/css/style.css index ab99ab4..917fd4a 100644 --- a/web/static/css/style.css +++ b/web/static/css/style.css @@ -211,8 +211,6 @@ main { } #api-connect-btn, -#api-power-on-collect-btn, -#api-collect-off-btn, #convert-folder-btn, #convert-run-btn, #cancel-job-btn, @@ -229,8 +227,6 @@ main { } #api-connect-btn:hover, -#api-power-on-collect-btn:hover, -#api-collect-off-btn:hover, #convert-folder-btn:hover, #convert-run-btn:hover, #cancel-job-btn:hover, @@ -241,8 +237,6 @@ main { #convert-run-btn:disabled, #convert-folder-btn:disabled, #api-connect-btn:disabled, -#api-power-on-collect-btn:disabled, -#api-collect-off-btn:disabled, #cancel-job-btn:disabled, .upload-area button:disabled { opacity: 0.6; @@ -311,64 +305,19 @@ main { border-top: 1px solid #e2e8f0; } -.api-confirm-modal-backdrop { - position: fixed; - inset: 0; - background: rgba(0, 0, 0, 0.45); +.api-host-off-warning { display: flex; align-items: center; - justify-content: center; - z-index: 1000; -} - -.api-confirm-modal { - background: #fff; - border-radius: 10px; - padding: 1.5rem 1.75rem; - max-width: 380px; - width: 90%; - box-shadow: 0 8px 32px rgba(0,0,0,0.18); -} - -.api-confirm-modal p { - margin-bottom: 1.1rem; - font-size: 0.95rem; - color: #333; - line-height: 1.5; -} - -.api-confirm-modal-actions { - display: flex; - gap: 0.6rem; - justify-content: flex-end; -} - -.api-confirm-modal-actions button { - border: none; + gap: 0.4rem; + padding: 0.5rem 0.75rem; + background: #fef3c7; + border: 1px solid #f59e0b; border-radius: 6px; - padding: 0.5rem 1rem; - font-size: 0.9rem; - font-weight: 600; - cursor: pointer; + font-size: 0.875rem; + color: #92400e; + font-weight: 500; } -.api-confirm-modal-actions .btn-cancel { - background: #e2e8f0; - color: #333; -} - -.api-confirm-modal-actions .btn-cancel:hover { - background: #cbd5e1; -} - -.api-confirm-modal-actions .btn-confirm { - background: #dc3545; - color: #fff; -} - -.api-confirm-modal-actions .btn-confirm:hover { - background: #b02a37; -} .api-connect-status { margin-top: 0.75rem; @@ -445,6 +394,33 @@ main { cursor: default; } +.job-status-actions { + display: flex; + gap: 0.5rem; + align-items: center; +} + +#skip-hung-btn { + background: #f59e0b; + color: #fff; + border: none; + border-radius: 6px; + padding: 0.5rem 0.9rem; + font-size: 0.875rem; + font-weight: 600; + cursor: pointer; + transition: background-color 0.2s ease, opacity 0.2s ease; +} + +#skip-hung-btn:hover { + background: #d97706; +} + +#skip-hung-btn:disabled { + opacity: 0.6; + cursor: not-allowed; +} + .job-status-meta { display: grid; grid-template-columns: repeat(auto-fit, minmax(230px, 1fr)); diff --git a/web/static/js/app.js b/web/static/js/app.js index b6474a8..03cf905 100644 --- a/web/static/js/app.js +++ b/web/static/js/app.js @@ -91,9 +91,9 @@ function initApiSource() { } const cancelJobButton = document.getElementById('cancel-job-btn'); + const skipHungButton = document.getElementById('skip-hung-btn'); const connectButton = document.getElementById('api-connect-btn'); const collectButton = document.getElementById('api-collect-btn'); - const powerOffCheckbox = document.getElementById('api-power-off'); const fieldNames = ['host', 'port', 'username', 'password']; apiForm.addEventListener('submit', (event) => { @@ -110,6 +110,11 @@ function initApiSource() { cancelCollectionJob(); }); } + if (skipHungButton) { + skipHungButton.addEventListener('click', () => { + skipHungCollectionJob(); + }); + } if (connectButton) { connectButton.addEventListener('click', () => { startApiProbe(); @@ -120,22 +125,6 @@ function initApiSource() { startCollectionWithOptions(); }); } - if (powerOffCheckbox) { - powerOffCheckbox.addEventListener('change', () => { - if (!powerOffCheckbox.checked) { - return; - } - // If host was already on when probed, warn before enabling shutdown - if (apiProbeResult && apiProbeResult.host_powered_on) { - showConfirmModal( - 'Хост был включён до начала сбора. Вы уверены, что хотите выключить его после завершения сбора?', - () => { /* confirmed — leave checked */ }, - () => { powerOffCheckbox.checked = false; } - ); - } - }); - } - fieldNames.forEach((fieldName) => { const field = apiForm.elements.namedItem(fieldName); if (!field) { @@ -163,36 +152,6 @@ function initApiSource() { renderCollectionJob(); } -function showConfirmModal(message, onConfirm, onCancel) { - const backdrop = document.createElement('div'); - backdrop.className = 'api-confirm-modal-backdrop'; - backdrop.innerHTML = ` -
${escapeHtml(message)}
-