From 5bfaecd417ff1daaf444120e982f489f1008a698 Mon Sep 17 00:00:00 2001 From: Mikhail Chusavitin Date: Wed, 1 Jul 2026 14:54:48 +0300 Subject: [PATCH] storage SAT: wait for smartctl self-test completion, add human-readable resource summary smartctl -t short only launches the self-test and returns immediately ("Testing has begun"); unlike nvme device-self-test --wait, it has no blocking mode. Validate/Load runs closed the task and produced reports before the drive actually finished the test. Now poll smartctl -a until the test completes (or times out) and report the real result. Also add a per-disk "Resource" section with pseudographic progress bars for uptime (vs 5y design life), bytes written (vs 1 DWPD x 5y budget), and bytes read (percent from SMART attribute 242), all rendered in human-scaled units (days/years, TB/PB) instead of raw hour/byte counts. Co-Authored-By: Claude Sonnet 5 --- audit/internal/platform/sat.go | 55 +++++++ audit/internal/platform/storage_report.go | 181 +++++++++++++++++++++- 2 files changed, 229 insertions(+), 7 deletions(-) diff --git a/audit/internal/platform/sat.go b/audit/internal/platform/sat.go index 924a315..38fd515 100644 --- a/audit/internal/platform/sat.go +++ b/audit/internal/platform/sat.go @@ -746,6 +746,25 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, e key := filepath.Base(devPath) + "_" + strings.ReplaceAll(job.name, "-", "_") fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc) fmt.Fprintf(&summary, "%s_status=%s\n", key, status) + + // smartctl -t short only launches the self-test on the drive firmware and + // returns immediately ("Testing has begun"); unlike `nvme device-self-test + // --wait`, smartctl has no blocking mode, so we must poll the drive + // ourselves until the self-test actually finishes. 
+ if job.name == "smartctl-self-test-short" && err == nil { + statusName := "smartctl-self-test-status" + statusOut := waitForSmartctlSelfTest(ctx, verboseLog, devPath, logFunc) + deviceOutputs[statusName] = statusOut + statusFile := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+2, statusName) + if writeErr := os.WriteFile(filepath.Join(runDir, statusFile), statusOut, 0644); writeErr != nil { + return "", writeErr + } + sStatus, sRC := classifySATResult(statusName, statusOut, nil) + stats.Add(sStatus) + sKey := filepath.Base(devPath) + "_" + strings.ReplaceAll(statusName, "-", "_") + fmt.Fprintf(&summary, "%s_rc=%d\n", sKey, sRC) + fmt.Fprintf(&summary, "%s_status=%s\n", sKey, sStatus) + } } reportText := GenerateDiskReportText(index+1, devPath, deviceOutputs, time.Now().UTC()) _ = os.WriteFile(filepath.Join(runDir, "disk-"+prefix+"-report.txt"), []byte(reportText), 0644) @@ -1181,6 +1200,42 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string return out, err } +// smartctlSelfTestPollInterval/Timeout bound how long we poll the drive after +// launching `smartctl -t short`, which SMART/ATA specs put at ~2 minutes. +const ( + smartctlSelfTestPollInterval = 5 * time.Second + smartctlSelfTestTimeout = 4 * time.Minute +) + +// waitForSmartctlSelfTest polls `smartctl -a` until the short self-test +// started on devPath finishes (or the timeout/context elapses) and returns +// the final output, which reflects the actual test result rather than the +// "Testing has begun" launch acknowledgement. 
+func waitForSmartctlSelfTest(ctx context.Context, verboseLog, devPath string, logFunc func(string)) []byte { + deadline := time.Now().Add(smartctlSelfTestTimeout) + var last []byte + for { + out, _ := runSATCommandCtx(ctx, verboseLog, "smartctl-self-test-status", []string{"smartctl", "-a", devPath}, nil, nil) + last = out + if ctx.Err() != nil { + return last + } + lower := bytes.ToLower(out) + if !bytes.Contains(lower, []byte("self-test routine in progress")) && + !bytes.Contains(lower, []byte("% of test remaining")) { + return last + } + if time.Now().After(deadline) { + return last + } + select { + case <-ctx.Done(): + return last + case <-time.After(smartctlSelfTestPollInterval): + } + } +} + func listStorageDevices() ([]string, error) { out, err := satExecCommand("lsblk", "-dn", "-o", "NAME,TYPE,TRAN").Output() if err != nil { diff --git a/audit/internal/platform/storage_report.go b/audit/internal/platform/storage_report.go index 28b942b..807ef1c 100644 --- a/audit/internal/platform/storage_report.go +++ b/audit/internal/platform/storage_report.go @@ -158,6 +158,17 @@ func writeNVMeReport(b *strings.Builder, outputs map[string][]byte) { writeField(b, "Media Errors", formatUint(me)) writeField(b, "Error Log Entries", formatUint(nel)) + capacityBytes := ctrl.TotalCap + if capacityBytes == 0 { + capacityBytes = ctrl.NVMCap + } + writeResourceSection(b, resourceInfo{ + powerOnHours: poh, + writtenBytes: uint64(nvmeU64(sl.DataUnitsWritten)) * 512000, + readBytes: uint64(nvmeU64(sl.DataUnitsRead)) * 512000, + capacityBytes: capacityBytes, + }) + if selfTest := outputs["nvme-device-self-test"]; len(selfTest) > 0 { writeSectionHeader(b, "Self-Test") result := parseSelfTestResult(string(selfTest)) @@ -168,7 +179,7 @@ func writeNVMeReport(b *strings.Builder, outputs map[string][]byte) { // ── SATA / SAS (smartctl) ──────────────────────────────────────────────────── var ( - smartHealthRE = regexp.MustCompile(`(?i)SMART overall-health self-assessment test 
result:\s*(\S+)`) + smartHealthRE = regexp.MustCompile(`(?i)SMART overall-health self-assessment test result:\s*(\S+)`) smartAttrLineRE = regexp.MustCompile( `^\s*(\d{1,3})\s+(\S+)\s+0x[0-9a-fA-F]+\s+(\d{1,3})\s+(\d{1,3})\s+(\d{1,3})\s+\S+\s+\S+\s+\S+\s+(.+?)\s*$`, ) @@ -205,8 +216,10 @@ func writeSATAReport(b *strings.Builder, outputs map[string][]byte) { if m := smartFirmwareRE.FindStringSubmatch(text); m != nil { writeField(b, "Firmware", strings.TrimSpace(m[1])) } + var capacityBytes uint64 if m := smartCapacityRE.FindStringSubmatch(text); m != nil { cap := strings.TrimSpace(m[1]) + capacityBytes = parseLeadingUint(cap) // trim everything after "[" if present (e.g. "500,107,862,016 bytes [500 GB]") if idx := strings.Index(cap, "["); idx > 0 { cap = strings.TrimSpace(cap[idx+1:]) @@ -233,7 +246,36 @@ func writeSATAReport(b *strings.Builder, outputs map[string][]byte) { } } - if selfTest := outputs["smartctl-self-test-short"]; len(selfTest) > 0 { + var poh, writtenLBAs, readLBAs uint64 + var readValue int + hasReadValue := false + for _, a := range attrs { + switch a.ID { + case 9: // Power_On_Hours + poh = parseLeadingUint(a.Raw) + case 241: // Total_LBAs_Written + writtenLBAs = parseLeadingUint(a.Raw) + case 242: // Total_LBAs_Read + readLBAs = parseLeadingUint(a.Raw) + readValue = a.Value + hasReadValue = true + } + } + const sataSectorBytes = 512 + writeResourceSection(b, resourceInfo{ + powerOnHours: poh, + writtenBytes: writtenLBAs * sataSectorBytes, + readBytes: readLBAs * sataSectorBytes, + capacityBytes: capacityBytes, + readPercent: 100 - readValue, + hasReadPercent: hasReadValue, + }) + + selfTest := outputs["smartctl-self-test-status"] + if len(selfTest) == 0 { + selfTest = outputs["smartctl-self-test-short"] + } + if len(selfTest) > 0 { writeSectionHeader(b, "Self-Test") result := parseSelfTestResult(string(selfTest)) writeField(b, "Result", result) @@ -274,29 +316,45 @@ func parseSMARTAttrs(text string) []smartAttr { return attrs } -// 
// parseSelfTestResult extracts a one-line summary from nvme device-self-test,
// smartctl -a (post-completion status), or smartctl -t short (launch ack) output.
//
// Matching is attempted in this order and the first hit wins: the smartctl -a
// "Self-test execution status" entry (joined with its wrapped description),
// then the first line mentioning "self-test status" or "self test status",
// then the first line containing "testing has begun", "started" or
// "complete". When nothing matches, the last non-empty line is returned.
// Blank input yields "no output".
func parseSelfTestResult(text string) string {
	text = strings.TrimSpace(text)
	if text == "" {
		return "no output"
	}
	lines := strings.Split(text, "\n")

	// smartctl -a wraps the status description onto the following lines; at
	// most this many of them are folded into the summary.
	const maxContinuationLines = 3
	for i, line := range lines {
		if !strings.Contains(strings.ToLower(line), "self-test execution status") {
			continue
		}
		// strings.Fields drops the column-alignment padding (runs of spaces
		// and tabs) so the summary reads as a single tidy line.
		parts := strings.Fields(line)
		for j := i + 1; j < len(lines) && j <= i+maxContinuationLines; j++ {
			next := lines[j]
			// A continuation line must be indented. The entry that typically
			// follows the status block in smartctl -a output ("Total time to
			// complete Offline") starts at column 0 and has no colon, so a
			// colon check alone would append it to the result.
			if next == "" || (next[0] != ' ' && next[0] != '\t') {
				break
			}
			// An indented line with a colon is a new "key: value" entry.
			if strings.Contains(next, ":") {
				break
			}
			words := strings.Fields(next)
			if len(words) == 0 {
				break
			}
			parts = append(parts, words...)
		}
		return strings.Join(parts, " ")
	}

	// nvme device-self-test: look for "Short Device Self-Test Status : 0x0" or similar
	for _, line := range lines {
		l := strings.ToLower(line)
		if strings.Contains(l, "self-test status") || strings.Contains(l, "self test status") {
			return strings.TrimSpace(line)
		}
	}

	// smartctl -t short: "Testing has begun" or "Short BGST started"
	for _, line := range lines {
		l := strings.ToLower(line)
		if strings.Contains(l, "testing has begun") || strings.Contains(l, "started") || strings.Contains(l, "complete") {
			return strings.TrimSpace(line)
		}
	}

	// fallback: last non-empty line
	for i := len(lines) - 1; i >= 0; i-- {
		if s := strings.TrimSpace(lines[i]); s != "" {
			return s
		}
	}
	return "done"
}

// ── Resource (pseudographic usage bars) ────────────────────────────────────────
──────────────────────────────────────── + +// designLifeYears/dwpd model the drive's rated endurance: 1 drive-write-per-day +// for 5 years, the baseline enterprise endurance spec used when the vendor's +// own TBW/DWPD rating isn't available from SMART/NVMe data. +const ( + designLifeYears = 5 + dwpd = 1.0 +) + +type resourceInfo struct { + powerOnHours uint64 + writtenBytes uint64 + readBytes uint64 + capacityBytes uint64 + readPercent int // only meaningful when hasReadPercent + hasReadPercent bool // true when the source SMART attribute exposes a normalized read-wear value +} + +func writeResourceSection(b *strings.Builder, r resourceInfo) { + writeSectionHeader(b, "Resource") + + const maxLifeHours = designLifeYears * 365 * 24 + upFrac := float64(r.powerOnHours) / float64(maxLifeHours) + fmt.Fprintf(b, " %-9s %s %s / %s (%s)\n", + "Uptime", progressBar(upFrac, 24), formatHoursHuman(r.powerOnHours), formatHoursHuman(maxLifeHours), formatPercent(upFrac*100)) + + if r.capacityBytes > 0 { + maxWritten := float64(r.capacityBytes) * dwpd * designLifeYears * 365 + wFrac := float64(r.writtenBytes) / maxWritten + fmt.Fprintf(b, " %-9s %s %s / %s (%s, %g DWPD×%dy)\n", + "Written", progressBar(wFrac, 24), formatBytesHuman(float64(r.writtenBytes)), formatBytesHuman(maxWritten), formatPercent(wFrac*100), dwpd, designLifeYears) + } else { + fmt.Fprintf(b, " %-9s %s\n", "Written", formatBytesHuman(float64(r.writtenBytes))) + } + + if r.hasReadPercent { + fmt.Fprintf(b, " %-9s %s %s (%d%%)\n", + "Read", progressBar(float64(r.readPercent)/100, 24), formatBytesHuman(float64(r.readBytes)), r.readPercent) + } else { + fmt.Fprintf(b, " %-9s %s\n", "Read", formatBytesHuman(float64(r.readBytes))) + } +} + +// progressBar renders a fixed-width pseudographic bar, e.g. "[######------]". 
func progressBar(frac float64, width int) string {
	// frac is the filled share of the bar; NaN and out-of-range values are
	// clamped into [0, 1] so the bar is always exactly width cells wide.
	if math.IsNaN(frac) || frac < 0 {
		frac = 0
	}
	if frac > 1 {
		frac = 1
	}
	// strings.Repeat panics on a negative count, so treat a negative width as
	// an empty bar instead of crashing the report.
	if width < 0 {
		width = 0
	}
	filled := int(math.Round(frac * float64(width)))
	return "[" + strings.Repeat("#", filled) + strings.Repeat("-", width-filled) + "]"
}

// formatBytesHuman renders a decimal (SI) human-readable byte size, e.g. "1.23 TB".
// Plain bytes are printed without decimals; larger units use two. Values
// beyond the largest unit stay in PB.
func formatBytesHuman(n float64) string {
	units := []string{"B", "KB", "MB", "GB", "TB", "PB"}
	i := 0
	for n >= 1000 && i < len(units)-1 {
		n /= 1000
		i++
	}
	if i == 0 {
		return fmt.Sprintf("%.0f %s", n, units[i])
	}
	return fmt.Sprintf("%.2f %s", n, units[i])
}

// formatHoursHuman renders an hour count as a human-scaled duration (hours,
// days, or years) so uptimes don't show as raw four/five-digit hour counts.
// Under 48 hours it prints hours, under 365 days it prints whole days, and
// beyond that years (365-day years, one decimal unless the count is exact).
func formatHoursHuman(hours uint64) string {
	if hours < 48 {
		return fmt.Sprintf("%d h", hours)
	}
	days := float64(hours) / 24
	if days < 365 {
		return fmt.Sprintf("%.0f d", days)
	}
	years := days / 365
	if years == math.Trunc(years) {
		return fmt.Sprintf("%.0f y", years)
	}
	return fmt.Sprintf("%.1f y", years)
}

// formatPercent renders a percentage with extra precision below 1% (e.g.
// "0.03%"), where a rounded "0%" would hide any usage at all. Non-zero values
// too small for two decimals are shown as "<0.01%" rather than "0.00%", which
// would hide the usage just the same.
func formatPercent(pct float64) string {
	if pct > 0 && pct < 1 {
		// Below 0.005 the two-decimal form rounds down to "0.00%".
		if pct < 0.005 {
			return "<0.01%"
		}
		return fmt.Sprintf("%.2f%%", pct)
	}
	return fmt.Sprintf("%.0f%%", pct)
}

// parseLeadingUint parses the leading run of digits/commas in s (e.g. from a
// SMART raw value or "500,107,862,016 bytes") into a uint64, ignoring the rest.
// It returns 0 when s has no leading digits or the number does not fit in a
// uint64.
func parseLeadingUint(s string) uint64 {
	s = strings.TrimSpace(s)
	end := 0
	for end < len(s) && (s[end] >= '0' && s[end] <= '9' || s[end] == ',') {
		end++
	}
	digits := strings.ReplaceAll(s[:end], ",", "")
	n, err := strconv.ParseUint(digits, 10, 64)
	if err != nil {
		// On overflow ParseUint returns the maximum uint64 alongside the
		// error; report 0 instead of passing that bogus value on.
		return 0
	}
	return n
}

// ── Formatting helpers ────────────────────────────────────────────────────────

// writeSectionHeader is defined next; only its opening line falls inside this excerpt.