storage SAT: wait for smartctl self-test completion, add human-readable resource summary
smartctl -t short only launches the self-test and returns immediately
("Testing has begun"); unlike nvme device-self-test --wait, it has no
blocking mode. Validate/Load runs closed the task and produced reports
before the drive actually finished the test. Now poll smartctl -a until
the test completes (or times out) and report the real result.
Also add a per-disk "Resource" section with pseudographic progress bars
for uptime (vs 5y design life), bytes written (vs 1 DWPD x 5y budget),
and bytes read (percent from SMART attribute 242), all rendered in
human-scaled units (days/years, TB/PB) instead of raw hour/byte counts.
Co-Authored-By: Claude Sonnet 5 <noreply@anthropic.com>
This commit is contained in:
@@ -746,6 +746,25 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, e
|
||||
key := filepath.Base(devPath) + "_" + strings.ReplaceAll(job.name, "-", "_")
|
||||
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
|
||||
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
|
||||
|
||||
// smartctl -t short only launches the self-test on the drive firmware and
|
||||
// returns immediately ("Testing has begun"); unlike `nvme device-self-test
|
||||
// --wait`, smartctl has no blocking mode, so we must poll the drive
|
||||
// ourselves until the self-test actually finishes.
|
||||
if job.name == "smartctl-self-test-short" && err == nil {
|
||||
statusName := "smartctl-self-test-status"
|
||||
statusOut := waitForSmartctlSelfTest(ctx, verboseLog, devPath, logFunc)
|
||||
deviceOutputs[statusName] = statusOut
|
||||
statusFile := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+2, statusName)
|
||||
if writeErr := os.WriteFile(filepath.Join(runDir, statusFile), statusOut, 0644); writeErr != nil {
|
||||
return "", writeErr
|
||||
}
|
||||
sStatus, sRC := classifySATResult(statusName, statusOut, nil)
|
||||
stats.Add(sStatus)
|
||||
sKey := filepath.Base(devPath) + "_" + strings.ReplaceAll(statusName, "-", "_")
|
||||
fmt.Fprintf(&summary, "%s_rc=%d\n", sKey, sRC)
|
||||
fmt.Fprintf(&summary, "%s_status=%s\n", sKey, sStatus)
|
||||
}
|
||||
}
|
||||
reportText := GenerateDiskReportText(index+1, devPath, deviceOutputs, time.Now().UTC())
|
||||
_ = os.WriteFile(filepath.Join(runDir, "disk-"+prefix+"-report.txt"), []byte(reportText), 0644)
|
||||
@@ -1181,6 +1200,42 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string
|
||||
return out, err
|
||||
}
|
||||
|
||||
// smartctlSelfTestPollInterval is the pause between two consecutive
// `smartctl -a` status polls issued after `smartctl -t short` has been launched.
const smartctlSelfTestPollInterval = 5 * time.Second

// smartctlSelfTestTimeout caps the total time spent polling for completion.
// SMART/ATA specs put the short self-test itself at ~2 minutes, so this leaves
// about twice that before we give up and report whatever status we last saw.
const smartctlSelfTestTimeout = 4 * time.Minute
|
||||
|
||||
// waitForSmartctlSelfTest polls `smartctl -a` until the short self-test
|
||||
// started on devPath finishes (or the timeout/context elapses) and returns
|
||||
// the final output, which reflects the actual test result rather than the
|
||||
// "Testing has begun" launch acknowledgement.
|
||||
func waitForSmartctlSelfTest(ctx context.Context, verboseLog, devPath string, logFunc func(string)) []byte {
|
||||
deadline := time.Now().Add(smartctlSelfTestTimeout)
|
||||
var last []byte
|
||||
for {
|
||||
out, _ := runSATCommandCtx(ctx, verboseLog, "smartctl-self-test-status", []string{"smartctl", "-a", devPath}, nil, nil)
|
||||
last = out
|
||||
if ctx.Err() != nil {
|
||||
return last
|
||||
}
|
||||
lower := bytes.ToLower(out)
|
||||
if !bytes.Contains(lower, []byte("self-test routine in progress")) &&
|
||||
!bytes.Contains(lower, []byte("% of test remaining")) {
|
||||
return last
|
||||
}
|
||||
if time.Now().After(deadline) {
|
||||
return last
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return last
|
||||
case <-time.After(smartctlSelfTestPollInterval):
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func listStorageDevices() ([]string, error) {
|
||||
out, err := satExecCommand("lsblk", "-dn", "-o", "NAME,TYPE,TRAN").Output()
|
||||
if err != nil {
|
||||
|
||||
@@ -158,6 +158,17 @@ func writeNVMeReport(b *strings.Builder, outputs map[string][]byte) {
|
||||
writeField(b, "Media Errors", formatUint(me))
|
||||
writeField(b, "Error Log Entries", formatUint(nel))
|
||||
|
||||
capacityBytes := ctrl.TotalCap
|
||||
if capacityBytes == 0 {
|
||||
capacityBytes = ctrl.NVMCap
|
||||
}
|
||||
writeResourceSection(b, resourceInfo{
|
||||
powerOnHours: poh,
|
||||
writtenBytes: uint64(nvmeU64(sl.DataUnitsWritten)) * 512000,
|
||||
readBytes: uint64(nvmeU64(sl.DataUnitsRead)) * 512000,
|
||||
capacityBytes: capacityBytes,
|
||||
})
|
||||
|
||||
if selfTest := outputs["nvme-device-self-test"]; len(selfTest) > 0 {
|
||||
writeSectionHeader(b, "Self-Test")
|
||||
result := parseSelfTestResult(string(selfTest))
|
||||
@@ -168,7 +179,7 @@ func writeNVMeReport(b *strings.Builder, outputs map[string][]byte) {
|
||||
// ── SATA / SAS (smartctl) ────────────────────────────────────────────────────
|
||||
|
||||
var (
|
||||
smartHealthRE = regexp.MustCompile(`(?i)SMART overall-health self-assessment test result:\s*(\S+)`)
|
||||
smartHealthRE = regexp.MustCompile(`(?i)SMART overall-health self-assessment test result:\s*(\S+)`)
|
||||
smartAttrLineRE = regexp.MustCompile(
|
||||
`^\s*(\d{1,3})\s+(\S+)\s+0x[0-9a-fA-F]+\s+(\d{1,3})\s+(\d{1,3})\s+(\d{1,3})\s+\S+\s+\S+\s+\S+\s+(.+?)\s*$`,
|
||||
)
|
||||
@@ -205,8 +216,10 @@ func writeSATAReport(b *strings.Builder, outputs map[string][]byte) {
|
||||
if m := smartFirmwareRE.FindStringSubmatch(text); m != nil {
|
||||
writeField(b, "Firmware", strings.TrimSpace(m[1]))
|
||||
}
|
||||
var capacityBytes uint64
|
||||
if m := smartCapacityRE.FindStringSubmatch(text); m != nil {
|
||||
cap := strings.TrimSpace(m[1])
|
||||
capacityBytes = parseLeadingUint(cap)
|
||||
// trim everything after "[" if present (e.g. "500,107,862,016 bytes [500 GB]")
|
||||
if idx := strings.Index(cap, "["); idx > 0 {
|
||||
cap = strings.TrimSpace(cap[idx+1:])
|
||||
@@ -233,7 +246,36 @@ func writeSATAReport(b *strings.Builder, outputs map[string][]byte) {
|
||||
}
|
||||
}
|
||||
|
||||
if selfTest := outputs["smartctl-self-test-short"]; len(selfTest) > 0 {
|
||||
var poh, writtenLBAs, readLBAs uint64
|
||||
var readValue int
|
||||
hasReadValue := false
|
||||
for _, a := range attrs {
|
||||
switch a.ID {
|
||||
case 9: // Power_On_Hours
|
||||
poh = parseLeadingUint(a.Raw)
|
||||
case 241: // Total_LBAs_Written
|
||||
writtenLBAs = parseLeadingUint(a.Raw)
|
||||
case 242: // Total_LBAs_Read
|
||||
readLBAs = parseLeadingUint(a.Raw)
|
||||
readValue = a.Value
|
||||
hasReadValue = true
|
||||
}
|
||||
}
|
||||
const sataSectorBytes = 512
|
||||
writeResourceSection(b, resourceInfo{
|
||||
powerOnHours: poh,
|
||||
writtenBytes: writtenLBAs * sataSectorBytes,
|
||||
readBytes: readLBAs * sataSectorBytes,
|
||||
capacityBytes: capacityBytes,
|
||||
readPercent: 100 - readValue,
|
||||
hasReadPercent: hasReadValue,
|
||||
})
|
||||
|
||||
selfTest := outputs["smartctl-self-test-status"]
|
||||
if len(selfTest) == 0 {
|
||||
selfTest = outputs["smartctl-self-test-short"]
|
||||
}
|
||||
if len(selfTest) > 0 {
|
||||
writeSectionHeader(b, "Self-Test")
|
||||
result := parseSelfTestResult(string(selfTest))
|
||||
writeField(b, "Result", result)
|
||||
@@ -274,29 +316,45 @@ func parseSMARTAttrs(text string) []smartAttr {
|
||||
return attrs
|
||||
}
|
||||
|
||||
// parseSelfTestResult extracts a one-line summary from nvme device-self-test
|
||||
// or smartctl -t short output.
|
||||
// parseSelfTestResult extracts a one-line summary from nvme device-self-test,
|
||||
// smartctl -a (post-completion status), or smartctl -t short (launch ack) output.
|
||||
func parseSelfTestResult(text string) string {
|
||||
text = strings.TrimSpace(text)
|
||||
if text == "" {
|
||||
return "no output"
|
||||
}
|
||||
lines := strings.Split(text, "\n")
|
||||
// smartctl -a: "Self-test execution status: ( 0)\n\tThe previous
|
||||
// self-test routine completed\n\twithout error ..." — the description
|
||||
// wraps onto following indented, colon-free continuation lines.
|
||||
for i, line := range lines {
|
||||
if strings.Contains(strings.ToLower(line), "self-test execution status") {
|
||||
parts := []string{strings.TrimSpace(line)}
|
||||
for j := i + 1; j < len(lines) && j < i+4; j++ {
|
||||
cont := strings.TrimSpace(lines[j])
|
||||
if cont == "" || strings.Contains(cont, ":") {
|
||||
break
|
||||
}
|
||||
parts = append(parts, cont)
|
||||
}
|
||||
return strings.Join(parts, " ")
|
||||
}
|
||||
}
|
||||
// nvme device-self-test: look for "Short Device Self-Test Status : 0x0" or similar
|
||||
for _, line := range strings.Split(text, "\n") {
|
||||
for _, line := range lines {
|
||||
l := strings.ToLower(line)
|
||||
if strings.Contains(l, "self-test status") || strings.Contains(l, "self test status") {
|
||||
return strings.TrimSpace(line)
|
||||
}
|
||||
}
|
||||
// smartctl -t short: "Testing has begun" or "Short BGST started"
|
||||
for _, line := range strings.Split(text, "\n") {
|
||||
for _, line := range lines {
|
||||
l := strings.ToLower(line)
|
||||
if strings.Contains(l, "testing has begun") || strings.Contains(l, "started") || strings.Contains(l, "complete") {
|
||||
return strings.TrimSpace(line)
|
||||
}
|
||||
}
|
||||
// fallback: last non-empty line
|
||||
lines := strings.Split(strings.TrimSpace(text), "\n")
|
||||
for i := len(lines) - 1; i >= 0; i-- {
|
||||
if s := strings.TrimSpace(lines[i]); s != "" {
|
||||
return s
|
||||
@@ -305,6 +363,115 @@ func parseSelfTestResult(text string) string {
|
||||
return "done"
|
||||
}
|
||||
|
||||
// ── Resource (pseudographic usage bars) ────────────────────────────────────────
|
||||
|
||||
// designLifeYears is the assumed service life, in years, against which uptime
// and write volume are scaled. Together with dwpd it forms the baseline
// enterprise endurance spec used when the vendor's own TBW/DWPD rating isn't
// available from SMART/NVMe data.
const designLifeYears = 5

// dwpd is the assumed rated endurance in full drive writes per day over the
// design life.
const dwpd = 1.0
|
||||
|
||||
// resourceInfo carries the per-drive usage counters that writeResourceSection
// renders as the "Resource" block of a disk report.
type resourceInfo struct {
	// Lifetime counters: hours powered on, bytes written, bytes read, and the
	// drive capacity in bytes (0 when the capacity could not be determined).
	powerOnHours, writtenBytes, readBytes, capacityBytes uint64

	// readPercent is a 0-100 style read-usage figure; it is only meaningful
	// when hasReadPercent is set.
	readPercent int
	// hasReadPercent is true when the source SMART attribute exposes a
	// normalized read-wear value.
	hasReadPercent bool
}
|
||||
|
||||
func writeResourceSection(b *strings.Builder, r resourceInfo) {
|
||||
writeSectionHeader(b, "Resource")
|
||||
|
||||
const maxLifeHours = designLifeYears * 365 * 24
|
||||
upFrac := float64(r.powerOnHours) / float64(maxLifeHours)
|
||||
fmt.Fprintf(b, " %-9s %s %s / %s (%s)\n",
|
||||
"Uptime", progressBar(upFrac, 24), formatHoursHuman(r.powerOnHours), formatHoursHuman(maxLifeHours), formatPercent(upFrac*100))
|
||||
|
||||
if r.capacityBytes > 0 {
|
||||
maxWritten := float64(r.capacityBytes) * dwpd * designLifeYears * 365
|
||||
wFrac := float64(r.writtenBytes) / maxWritten
|
||||
fmt.Fprintf(b, " %-9s %s %s / %s (%s, %g DWPD×%dy)\n",
|
||||
"Written", progressBar(wFrac, 24), formatBytesHuman(float64(r.writtenBytes)), formatBytesHuman(maxWritten), formatPercent(wFrac*100), dwpd, designLifeYears)
|
||||
} else {
|
||||
fmt.Fprintf(b, " %-9s %s\n", "Written", formatBytesHuman(float64(r.writtenBytes)))
|
||||
}
|
||||
|
||||
if r.hasReadPercent {
|
||||
fmt.Fprintf(b, " %-9s %s %s (%d%%)\n",
|
||||
"Read", progressBar(float64(r.readPercent)/100, 24), formatBytesHuman(float64(r.readBytes)), r.readPercent)
|
||||
} else {
|
||||
fmt.Fprintf(b, " %-9s %s\n", "Read", formatBytesHuman(float64(r.readBytes)))
|
||||
}
|
||||
}
|
||||
|
||||
// progressBar renders a fixed-width pseudographic bar, e.g. "[######------]".
//
// frac is the filled fraction; NaN and negative values are treated as 0 and
// values above 1 as 1, so the bar never under- or overflows. width is the
// number of cells between the brackets; the filled cell count is
// frac*width rounded to the nearest integer. A non-positive width yields an
// empty bar "[]" instead of panicking in strings.Repeat on a negative count.
func progressBar(frac float64, width int) string {
	if width <= 0 {
		return "[]"
	}
	switch {
	case math.IsNaN(frac) || frac < 0:
		frac = 0
	case frac > 1:
		frac = 1
	}
	filled := int(math.Round(frac * float64(width)))
	return "[" + strings.Repeat("#", filled) + strings.Repeat("-", width-filled) + "]"
}
|
||||
|
||||
// formatBytesHuman renders n bytes as a decimal (SI, powers of 1000)
// human-readable size such as "1.23 TB". Plain byte counts are printed
// without decimals; every larger unit gets two. PB is the largest unit, so
// bigger values are shown as a four-or-more digit PB figure.
func formatBytesHuman(n float64) string {
	suffixes := [...]string{"B", "KB", "MB", "GB", "TB", "PB"}

	step := 0
	for ; step < len(suffixes)-1 && n >= 1000; step++ {
		n /= 1000
	}

	switch step {
	case 0:
		return fmt.Sprintf("%.0f %s", n, suffixes[step])
	default:
		return fmt.Sprintf("%.2f %s", n, suffixes[step])
	}
}
|
||||
|
||||
// formatHoursHuman renders an hour count at a human scale so uptimes don't
// appear as raw four- or five-digit hour figures:
//
//   - below 48 hours: whole hours ("47 h");
//   - below 365 days: days rounded to a whole number ("120 d");
//   - otherwise: years, without decimals when the value is a whole number
//     of 365-day years ("5 y") and with one decimal otherwise ("1.5 y").
func formatHoursHuman(hours uint64) string {
	const (
		hoursPerDay = 24
		daysPerYear = 365
	)

	if hours < 2*hoursPerDay {
		return fmt.Sprintf("%d h", hours)
	}

	days := float64(hours) / hoursPerDay
	years := days / daysPerYear
	switch {
	case days < daysPerYear:
		return fmt.Sprintf("%.0f d", days)
	case years == math.Trunc(years):
		return fmt.Sprintf("%.0f y", years)
	default:
		return fmt.Sprintf("%.1f y", years)
	}
}
|
||||
|
||||
// formatPercent renders pct (already expressed in percent) followed by "%".
// Values strictly between 0 and 1 keep two decimals (e.g. "0.03%") because a
// rounded "0%" would hide that there was any usage at all; everything else is
// rounded to a whole number.
func formatPercent(pct float64) string {
	decimals := 0
	if 0 < pct && pct < 1 {
		decimals = 2
	}
	return fmt.Sprintf("%.*f%%", decimals, pct)
}
|
||||
|
||||
// parseLeadingUint reads the number at the start of s: after trimming
// surrounding whitespace it takes the longest prefix made of ASCII digits and
// commas, drops the commas (thousands separators, as in
// "500,107,862,016 bytes") and parses the rest as a base-10 uint64.
// Everything after that prefix is ignored. The parse error is discarded on
// purpose: with no leading digits the result is 0, and a value too large for
// uint64 comes back as whatever strconv.ParseUint returns for a range error.
func parseLeadingUint(s string) uint64 {
	trimmed := strings.TrimSpace(s)

	// Cut at the first character that is neither a digit nor a comma.
	notNumeric := func(r rune) bool {
		return (r < '0' || r > '9') && r != ','
	}
	if stop := strings.IndexFunc(trimmed, notNumeric); stop >= 0 {
		trimmed = trimmed[:stop]
	}

	value, _ := strconv.ParseUint(strings.ReplaceAll(trimmed, ",", ""), 10, 64)
	return value
}
|
||||
|
||||
// ── Formatting helpers ────────────────────────────────────────────────────────
|
||||
|
||||
func writeSectionHeader(b *strings.Builder, title string) {
|
||||
|
||||
Reference in New Issue
Block a user