storage SAT: wait for smartctl self-test completion, add human-readable resource summary

smartctl -t short only launches the self-test and returns immediately
("Testing has begun"); unlike nvme device-self-test --wait, it has no
blocking mode. Validate/Load runs closed the task and produced reports
before the drive actually finished the test. Now poll smartctl -a until
the test completes (or times out) and report the real result.

Also add a per-disk "Resource" section with pseudographic progress bars
for uptime (vs 5y design life), bytes written (vs 1 DWPD x 5y budget),
and bytes read (percent from SMART attribute 242), all rendered in
human-scaled units (days/years, TB/PB) instead of raw hour/byte counts.

Co-Authored-By: Claude Sonnet 5 <noreply@anthropic.com>
This commit is contained in:
Mikhail Chusavitin
2026-07-01 14:54:48 +03:00
parent 8575cf06f8
commit 5bfaecd417
2 changed files with 229 additions and 7 deletions

View File

@@ -746,6 +746,25 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, e
key := filepath.Base(devPath) + "_" + strings.ReplaceAll(job.name, "-", "_")
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
// smartctl -t short only launches the self-test on the drive firmware and
// returns immediately ("Testing has begun"); unlike `nvme device-self-test
// --wait`, smartctl has no blocking mode, so we must poll the drive
// ourselves until the self-test actually finishes.
if job.name == "smartctl-self-test-short" && err == nil {
statusName := "smartctl-self-test-status"
statusOut := waitForSmartctlSelfTest(ctx, verboseLog, devPath, logFunc)
deviceOutputs[statusName] = statusOut
statusFile := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+2, statusName)
if writeErr := os.WriteFile(filepath.Join(runDir, statusFile), statusOut, 0644); writeErr != nil {
return "", writeErr
}
sStatus, sRC := classifySATResult(statusName, statusOut, nil)
stats.Add(sStatus)
sKey := filepath.Base(devPath) + "_" + strings.ReplaceAll(statusName, "-", "_")
fmt.Fprintf(&summary, "%s_rc=%d\n", sKey, sRC)
fmt.Fprintf(&summary, "%s_status=%s\n", sKey, sStatus)
}
}
reportText := GenerateDiskReportText(index+1, devPath, deviceOutputs, time.Now().UTC())
_ = os.WriteFile(filepath.Join(runDir, "disk-"+prefix+"-report.txt"), []byte(reportText), 0644)
@@ -1181,6 +1200,42 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string
return out, err
}
// smartctlSelfTestPollInterval and smartctlSelfTestTimeout bound the polling
// that follows `smartctl -t short`. The SMART/ATA specs put a short self-test
// at roughly 2 minutes, so the timeout leaves about twice that as headroom
// before polling is abandoned.
const (
// Pause between two consecutive `smartctl -a` status queries.
smartctlSelfTestPollInterval = 5 * time.Second
// Upper bound on the total time spent polling for self-test completion.
smartctlSelfTestTimeout = 4 * time.Minute
)
// waitForSmartctlSelfTest blocks until the short self-test running on devPath
// is no longer reported as in progress, then returns the most recent
// `smartctl -a` output so callers see the real test outcome instead of the
// "Testing has begun" launch acknowledgement.
//
// The drive is queried every smartctlSelfTestPollInterval. Polling stops, and
// the latest output is returned as-is, when any of the following happens:
//   - the output no longer contains an in-progress marker,
//   - smartctlSelfTestTimeout has elapsed since the call started,
//   - ctx is cancelled.
//
// logFunc is accepted as part of the signature but is not used by this
// function.
func waitForSmartctlSelfTest(ctx context.Context, verboseLog, devPath string, logFunc func(string)) []byte {
	giveUpAt := time.Now().Add(smartctlSelfTestTimeout)
	for {
		// The command error is discarded on purpose: only the captured output
		// is inspected here and handed back to the caller.
		snapshot, _ := runSATCommandCtx(ctx, verboseLog, "smartctl-self-test-status", []string{"smartctl", "-a", devPath}, nil, nil)
		if ctx.Err() != nil {
			return snapshot
		}

		// smartctl reports a running test with either of these phrases; the
		// match is case-insensitive.
		folded := bytes.ToLower(snapshot)
		stillRunning := bytes.Contains(folded, []byte("self-test routine in progress")) ||
			bytes.Contains(folded, []byte("% of test remaining"))
		if !stillRunning || time.Now().After(giveUpAt) {
			return snapshot
		}

		// Sleep until the next poll, but wake up immediately on cancellation.
		select {
		case <-time.After(smartctlSelfTestPollInterval):
		case <-ctx.Done():
			return snapshot
		}
	}
}
func listStorageDevices() ([]string, error) {
out, err := satExecCommand("lsblk", "-dn", "-o", "NAME,TYPE,TRAN").Output()
if err != nil {

View File

@@ -158,6 +158,17 @@ func writeNVMeReport(b *strings.Builder, outputs map[string][]byte) {
writeField(b, "Media Errors", formatUint(me))
writeField(b, "Error Log Entries", formatUint(nel))
capacityBytes := ctrl.TotalCap
if capacityBytes == 0 {
capacityBytes = ctrl.NVMCap
}
writeResourceSection(b, resourceInfo{
powerOnHours: poh,
writtenBytes: uint64(nvmeU64(sl.DataUnitsWritten)) * 512000,
readBytes: uint64(nvmeU64(sl.DataUnitsRead)) * 512000,
capacityBytes: capacityBytes,
})
if selfTest := outputs["nvme-device-self-test"]; len(selfTest) > 0 {
writeSectionHeader(b, "Self-Test")
result := parseSelfTestResult(string(selfTest))
@@ -168,7 +179,7 @@ func writeNVMeReport(b *strings.Builder, outputs map[string][]byte) {
// ── SATA / SAS (smartctl) ────────────────────────────────────────────────────
var (
smartHealthRE = regexp.MustCompile(`(?i)SMART overall-health self-assessment test result:\s*(\S+)`)
smartHealthRE = regexp.MustCompile(`(?i)SMART overall-health self-assessment test result:\s*(\S+)`)
smartAttrLineRE = regexp.MustCompile(
`^\s*(\d{1,3})\s+(\S+)\s+0x[0-9a-fA-F]+\s+(\d{1,3})\s+(\d{1,3})\s+(\d{1,3})\s+\S+\s+\S+\s+\S+\s+(.+?)\s*$`,
)
@@ -205,8 +216,10 @@ func writeSATAReport(b *strings.Builder, outputs map[string][]byte) {
if m := smartFirmwareRE.FindStringSubmatch(text); m != nil {
writeField(b, "Firmware", strings.TrimSpace(m[1]))
}
var capacityBytes uint64
if m := smartCapacityRE.FindStringSubmatch(text); m != nil {
cap := strings.TrimSpace(m[1])
capacityBytes = parseLeadingUint(cap)
// trim everything after "[" if present (e.g. "500,107,862,016 bytes [500 GB]")
if idx := strings.Index(cap, "["); idx > 0 {
cap = strings.TrimSpace(cap[idx+1:])
@@ -233,7 +246,36 @@ func writeSATAReport(b *strings.Builder, outputs map[string][]byte) {
}
}
if selfTest := outputs["smartctl-self-test-short"]; len(selfTest) > 0 {
var poh, writtenLBAs, readLBAs uint64
var readValue int
hasReadValue := false
for _, a := range attrs {
switch a.ID {
case 9: // Power_On_Hours
poh = parseLeadingUint(a.Raw)
case 241: // Total_LBAs_Written
writtenLBAs = parseLeadingUint(a.Raw)
case 242: // Total_LBAs_Read
readLBAs = parseLeadingUint(a.Raw)
readValue = a.Value
hasReadValue = true
}
}
const sataSectorBytes = 512
writeResourceSection(b, resourceInfo{
powerOnHours: poh,
writtenBytes: writtenLBAs * sataSectorBytes,
readBytes: readLBAs * sataSectorBytes,
capacityBytes: capacityBytes,
readPercent: 100 - readValue,
hasReadPercent: hasReadValue,
})
selfTest := outputs["smartctl-self-test-status"]
if len(selfTest) == 0 {
selfTest = outputs["smartctl-self-test-short"]
}
if len(selfTest) > 0 {
writeSectionHeader(b, "Self-Test")
result := parseSelfTestResult(string(selfTest))
writeField(b, "Result", result)
@@ -274,29 +316,45 @@ func parseSMARTAttrs(text string) []smartAttr {
return attrs
}
// parseSelfTestResult extracts a one-line summary from nvme device-self-test
// or smartctl -t short output.
// parseSelfTestResult extracts a one-line summary from nvme device-self-test,
// smartctl -a (post-completion status), or smartctl -t short (launch ack) output.
func parseSelfTestResult(text string) string {
text = strings.TrimSpace(text)
if text == "" {
return "no output"
}
lines := strings.Split(text, "\n")
// smartctl -a: "Self-test execution status: ( 0)\n\tThe previous
// self-test routine completed\n\twithout error ..." — the description
// wraps onto following indented, colon-free continuation lines.
for i, line := range lines {
if strings.Contains(strings.ToLower(line), "self-test execution status") {
parts := []string{strings.TrimSpace(line)}
for j := i + 1; j < len(lines) && j < i+4; j++ {
cont := strings.TrimSpace(lines[j])
if cont == "" || strings.Contains(cont, ":") {
break
}
parts = append(parts, cont)
}
return strings.Join(parts, " ")
}
}
// nvme device-self-test: look for "Short Device Self-Test Status : 0x0" or similar
for _, line := range strings.Split(text, "\n") {
for _, line := range lines {
l := strings.ToLower(line)
if strings.Contains(l, "self-test status") || strings.Contains(l, "self test status") {
return strings.TrimSpace(line)
}
}
// smartctl -t short: "Testing has begun" or "Short BGST started"
for _, line := range strings.Split(text, "\n") {
for _, line := range lines {
l := strings.ToLower(line)
if strings.Contains(l, "testing has begun") || strings.Contains(l, "started") || strings.Contains(l, "complete") {
return strings.TrimSpace(line)
}
}
// fallback: last non-empty line
lines := strings.Split(strings.TrimSpace(text), "\n")
for i := len(lines) - 1; i >= 0; i-- {
if s := strings.TrimSpace(lines[i]); s != "" {
return s
@@ -305,6 +363,115 @@ func parseSelfTestResult(text string) string {
return "done"
}
// ── Resource (pseudographic usage bars) ────────────────────────────────────────
// designLifeYears and dwpd model the drive's rated endurance: 1
// drive-write-per-day for 5 years, the baseline enterprise endurance spec used
// when the vendor's own TBW/DWPD rating isn't available from SMART/NVMe data.
// Together with the drive capacity they define the write budget and the
// uptime ceiling that the "Resource" report section measures usage against.
const (
// Design life in years; also the denominator of the uptime bar.
designLifeYears = 5
// Assumed full-capacity drive writes per day over the design life.
dwpd = 1.0
)
// resourceInfo holds the per-disk usage counters that writeResourceSection
// renders as the "Resource" section of a disk report.
type resourceInfo struct {
// Accumulated power-on time in hours; shown against the design life.
powerOnHours uint64
// Total bytes written to the drive.
writtenBytes uint64
// Total bytes read from the drive.
readBytes uint64
// Drive capacity in bytes; when zero, no write budget is computed and
// only the raw written amount is printed.
capacityBytes uint64
readPercent int // percentage shown next to the read bar; only meaningful when hasReadPercent
hasReadPercent bool // true when the source SMART attribute exposes a normalized read-wear value
}
// writeResourceSection appends the "Resource" section for one disk to b.
//
// It emits three rows:
//   - Uptime: power-on hours as a bar against the designLifeYears ceiling.
//   - Written: bytes written as a bar against the dwpd × designLifeYears
//     budget derived from the capacity, or just the raw amount when the
//     capacity is unknown (zero).
//   - Read: bytes read, with a bar and percentage only when r.hasReadPercent
//     is set.
func writeResourceSection(b *strings.Builder, r resourceInfo) {
	const (
		barWidth     = 24
		maxLifeHours = designLifeYears * 365 * 24
	)

	writeSectionHeader(b, "Resource")

	// Uptime row.
	uptimeShare := float64(r.powerOnHours) / float64(maxLifeHours)
	uptimeBar := progressBar(uptimeShare, barWidth)
	fmt.Fprintf(b, " %-9s %s %s / %s (%s)\n",
		"Uptime", uptimeBar, formatHoursHuman(r.powerOnHours), formatHoursHuman(maxLifeHours), formatPercent(uptimeShare*100))

	// Written row: a budget can only be derived when the capacity is known.
	writtenText := formatBytesHuman(float64(r.writtenBytes))
	if r.capacityBytes == 0 {
		fmt.Fprintf(b, " %-9s %s\n", "Written", writtenText)
	} else {
		writeBudget := float64(r.capacityBytes) * dwpd * designLifeYears * 365
		writtenShare := float64(r.writtenBytes) / writeBudget
		fmt.Fprintf(b, " %-9s %s %s / %s (%s, %g DWPD×%dy)\n",
			"Written", progressBar(writtenShare, barWidth), writtenText, formatBytesHuman(writeBudget), formatPercent(writtenShare*100), dwpd, designLifeYears)
	}

	// Read row: the bar is drawn only when a read percentage was supplied.
	readText := formatBytesHuman(float64(r.readBytes))
	if !r.hasReadPercent {
		fmt.Fprintf(b, " %-9s %s\n", "Read", readText)
		return
	}
	fmt.Fprintf(b, " %-9s %s %s (%d%%)\n",
		"Read", progressBar(float64(r.readPercent)/100, barWidth), readText, r.readPercent)
}
// progressBar renders a fixed-width pseudographic bar, e.g. "[######------]".
//
// frac is the filled share of the bar: NaN and negative values are treated as
// 0, values above 1 as 1. width is the number of cells between the brackets;
// a negative width is treated as 0 (yielding "[]") instead of panicking in
// strings.Repeat.
func progressBar(frac float64, width int) string {
	if width < 0 {
		width = 0
	}
	switch {
	case math.IsNaN(frac) || frac < 0:
		frac = 0
	case frac > 1:
		frac = 1
	}
	// frac is now within [0, 1], so filled is guaranteed to be within [0, width].
	filled := int(math.Round(frac * float64(width)))
	return "[" + strings.Repeat("#", filled) + strings.Repeat("-", width-filled) + "]"
}
// formatBytesHuman renders a decimal (SI) human-readable byte size, e.g. "1.23 TB".
//
// Plain bytes are printed without decimals, larger units with two. The largest
// unit is PB, so anything beyond that is expressed as a large PB count. When
// rounding for display would produce "1000" of a unit (e.g. 999999 bytes as
// "1000.00 KB"), the value is promoted to the next unit so it reads "1.00 MB".
func formatBytesHuman(n float64) string {
	units := []string{"B", "KB", "MB", "GB", "TB", "PB"}
	last := len(units) - 1

	i := 0
	for n >= 1000 && i < last {
		n /= 1000
		i++
	}

	// render formats n with the precision that belongs to the current unit.
	render := func() string {
		if i == 0 {
			return fmt.Sprintf("%.0f", n)
		}
		return fmt.Sprintf("%.2f", n)
	}

	num := render()
	// Checking the rendered text (rather than re-deriving the rounding)
	// guarantees the promotion fires exactly when the output would otherwise
	// show a four-digit mantissa.
	if i < last && (num == "1000" || num == "1000.00") {
		n /= 1000
		i++
		num = render()
	}
	return num + " " + units[i]
}
// formatHoursHuman turns an hour count into a compact duration string so that
// uptimes are not shown as raw four- or five-digit hour counts:
//   - under 48 hours: whole hours, e.g. "47 h";
//   - under 365 days: days rounded to a whole number, e.g. "4 d";
//   - otherwise: years, without decimals when the value is a whole number of
//     365-day years ("5 y") and with one decimal when it is not ("1.5 y").
func formatHoursHuman(hours uint64) string {
	const hoursPerDay, daysPerYear = 24, 365

	if hours < 2*hoursPerDay {
		return fmt.Sprintf("%d h", hours)
	}

	days := float64(hours) / hoursPerDay
	switch years := days / daysPerYear; {
	case days < daysPerYear:
		return fmt.Sprintf("%.0f d", days)
	case years == math.Trunc(years):
		return fmt.Sprintf("%.0f y", years)
	default:
		return fmt.Sprintf("%.1f y", years)
	}
}
// formatPercent formats pct (already expressed in percent, not as a fraction)
// for display. Values strictly between 0 and 1 keep two decimals (e.g.
// "0.03%") so that small but non-zero usage is not rounded away to "0%";
// everything else is rounded to a whole number.
func formatPercent(pct float64) string {
	subPercent := pct > 0 && pct < 1
	if !subPercent {
		return fmt.Sprintf("%.0f%%", pct)
	}
	return fmt.Sprintf("%.2f%%", pct)
}
// parseLeadingUint parses the leading run of digits/commas in s (e.g. from a
// SMART raw value or "500,107,862,016 bytes") into a uint64, ignoring the rest.
//
// Surrounding whitespace is trimmed first and commas are treated as thousands
// separators. It returns 0 when s does not start with a digit run or when the
// number does not fit in a uint64; the latter avoids handing callers the
// saturated math.MaxUint64 that strconv.ParseUint returns on overflow.
func parseLeadingUint(s string) uint64 {
	s = strings.TrimSpace(s)

	// Find the end of the leading "digits and commas" prefix.
	end := 0
	for end < len(s) && (s[end] >= '0' && s[end] <= '9' || s[end] == ',') {
		end++
	}

	digits := strings.ReplaceAll(s[:end], ",", "")
	n, err := strconv.ParseUint(digits, 10, 64)
	if err != nil {
		// Either there were no digits at all or the value overflowed uint64;
		// both are reported as "no usable value".
		return 0
	}
	return n
}
// ── Formatting helpers ────────────────────────────────────────────────────────
func writeSectionHeader(b *strings.Builder, title string) {