storage SAT: wait for smartctl self-test completion, add human-readable resource summary
smartctl -t short only launches the self-test and returns immediately
("Testing has begun"); unlike nvme device-self-test --wait, it has no
blocking mode. Validate/Load runs closed the task and produced reports
before the drive actually finished the test. Now poll smartctl -a until
the test completes (or times out) and report the real result.
Also add a per-disk "Resource" section with pseudographic progress bars
for uptime (vs 5y design life), bytes written (vs 1 DWPD x 5y budget),
and bytes read (percent from SMART attribute 242), all rendered in
human-scaled units (days/years, TB/PB) instead of raw hour/byte counts.
Co-Authored-By: Claude Sonnet 5 <noreply@anthropic.com>
This commit is contained in:
@@ -746,6 +746,25 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, e
|
|||||||
key := filepath.Base(devPath) + "_" + strings.ReplaceAll(job.name, "-", "_")
|
key := filepath.Base(devPath) + "_" + strings.ReplaceAll(job.name, "-", "_")
|
||||||
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
|
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
|
||||||
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
|
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
|
||||||
|
|
||||||
|
// smartctl -t short only launches the self-test on the drive firmware and
|
||||||
|
// returns immediately ("Testing has begun"); unlike `nvme device-self-test
|
||||||
|
// --wait`, smartctl has no blocking mode, so we must poll the drive
|
||||||
|
// ourselves until the self-test actually finishes.
|
||||||
|
if job.name == "smartctl-self-test-short" && err == nil {
|
||||||
|
statusName := "smartctl-self-test-status"
|
||||||
|
statusOut := waitForSmartctlSelfTest(ctx, verboseLog, devPath, logFunc)
|
||||||
|
deviceOutputs[statusName] = statusOut
|
||||||
|
statusFile := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+2, statusName)
|
||||||
|
if writeErr := os.WriteFile(filepath.Join(runDir, statusFile), statusOut, 0644); writeErr != nil {
|
||||||
|
return "", writeErr
|
||||||
|
}
|
||||||
|
sStatus, sRC := classifySATResult(statusName, statusOut, nil)
|
||||||
|
stats.Add(sStatus)
|
||||||
|
sKey := filepath.Base(devPath) + "_" + strings.ReplaceAll(statusName, "-", "_")
|
||||||
|
fmt.Fprintf(&summary, "%s_rc=%d\n", sKey, sRC)
|
||||||
|
fmt.Fprintf(&summary, "%s_status=%s\n", sKey, sStatus)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
reportText := GenerateDiskReportText(index+1, devPath, deviceOutputs, time.Now().UTC())
|
reportText := GenerateDiskReportText(index+1, devPath, deviceOutputs, time.Now().UTC())
|
||||||
_ = os.WriteFile(filepath.Join(runDir, "disk-"+prefix+"-report.txt"), []byte(reportText), 0644)
|
_ = os.WriteFile(filepath.Join(runDir, "disk-"+prefix+"-report.txt"), []byte(reportText), 0644)
|
||||||
@@ -1181,6 +1200,42 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string
|
|||||||
return out, err
|
return out, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// smartctlSelfTestPollInterval/Timeout bound how long we poll the drive after
|
||||||
|
// launching `smartctl -t short`, which SMART/ATA specs put at ~2 minutes.
|
||||||
|
const (
|
||||||
|
smartctlSelfTestPollInterval = 5 * time.Second
|
||||||
|
smartctlSelfTestTimeout = 4 * time.Minute
|
||||||
|
)
|
||||||
|
|
||||||
|
// waitForSmartctlSelfTest polls `smartctl -a` until the short self-test
|
||||||
|
// started on devPath finishes (or the timeout/context elapses) and returns
|
||||||
|
// the final output, which reflects the actual test result rather than the
|
||||||
|
// "Testing has begun" launch acknowledgement.
|
||||||
|
func waitForSmartctlSelfTest(ctx context.Context, verboseLog, devPath string, logFunc func(string)) []byte {
|
||||||
|
deadline := time.Now().Add(smartctlSelfTestTimeout)
|
||||||
|
var last []byte
|
||||||
|
for {
|
||||||
|
out, _ := runSATCommandCtx(ctx, verboseLog, "smartctl-self-test-status", []string{"smartctl", "-a", devPath}, nil, nil)
|
||||||
|
last = out
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
return last
|
||||||
|
}
|
||||||
|
lower := bytes.ToLower(out)
|
||||||
|
if !bytes.Contains(lower, []byte("self-test routine in progress")) &&
|
||||||
|
!bytes.Contains(lower, []byte("% of test remaining")) {
|
||||||
|
return last
|
||||||
|
}
|
||||||
|
if time.Now().After(deadline) {
|
||||||
|
return last
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return last
|
||||||
|
case <-time.After(smartctlSelfTestPollInterval):
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func listStorageDevices() ([]string, error) {
|
func listStorageDevices() ([]string, error) {
|
||||||
out, err := satExecCommand("lsblk", "-dn", "-o", "NAME,TYPE,TRAN").Output()
|
out, err := satExecCommand("lsblk", "-dn", "-o", "NAME,TYPE,TRAN").Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@@ -158,6 +158,17 @@ func writeNVMeReport(b *strings.Builder, outputs map[string][]byte) {
|
|||||||
writeField(b, "Media Errors", formatUint(me))
|
writeField(b, "Media Errors", formatUint(me))
|
||||||
writeField(b, "Error Log Entries", formatUint(nel))
|
writeField(b, "Error Log Entries", formatUint(nel))
|
||||||
|
|
||||||
|
capacityBytes := ctrl.TotalCap
|
||||||
|
if capacityBytes == 0 {
|
||||||
|
capacityBytes = ctrl.NVMCap
|
||||||
|
}
|
||||||
|
writeResourceSection(b, resourceInfo{
|
||||||
|
powerOnHours: poh,
|
||||||
|
writtenBytes: uint64(nvmeU64(sl.DataUnitsWritten)) * 512000,
|
||||||
|
readBytes: uint64(nvmeU64(sl.DataUnitsRead)) * 512000,
|
||||||
|
capacityBytes: capacityBytes,
|
||||||
|
})
|
||||||
|
|
||||||
if selfTest := outputs["nvme-device-self-test"]; len(selfTest) > 0 {
|
if selfTest := outputs["nvme-device-self-test"]; len(selfTest) > 0 {
|
||||||
writeSectionHeader(b, "Self-Test")
|
writeSectionHeader(b, "Self-Test")
|
||||||
result := parseSelfTestResult(string(selfTest))
|
result := parseSelfTestResult(string(selfTest))
|
||||||
@@ -168,7 +179,7 @@ func writeNVMeReport(b *strings.Builder, outputs map[string][]byte) {
|
|||||||
// ── SATA / SAS (smartctl) ────────────────────────────────────────────────────
|
// ── SATA / SAS (smartctl) ────────────────────────────────────────────────────
|
||||||
|
|
||||||
var (
|
var (
|
||||||
smartHealthRE = regexp.MustCompile(`(?i)SMART overall-health self-assessment test result:\s*(\S+)`)
|
smartHealthRE = regexp.MustCompile(`(?i)SMART overall-health self-assessment test result:\s*(\S+)`)
|
||||||
smartAttrLineRE = regexp.MustCompile(
|
smartAttrLineRE = regexp.MustCompile(
|
||||||
`^\s*(\d{1,3})\s+(\S+)\s+0x[0-9a-fA-F]+\s+(\d{1,3})\s+(\d{1,3})\s+(\d{1,3})\s+\S+\s+\S+\s+\S+\s+(.+?)\s*$`,
|
`^\s*(\d{1,3})\s+(\S+)\s+0x[0-9a-fA-F]+\s+(\d{1,3})\s+(\d{1,3})\s+(\d{1,3})\s+\S+\s+\S+\s+\S+\s+(.+?)\s*$`,
|
||||||
)
|
)
|
||||||
@@ -205,8 +216,10 @@ func writeSATAReport(b *strings.Builder, outputs map[string][]byte) {
|
|||||||
if m := smartFirmwareRE.FindStringSubmatch(text); m != nil {
|
if m := smartFirmwareRE.FindStringSubmatch(text); m != nil {
|
||||||
writeField(b, "Firmware", strings.TrimSpace(m[1]))
|
writeField(b, "Firmware", strings.TrimSpace(m[1]))
|
||||||
}
|
}
|
||||||
|
var capacityBytes uint64
|
||||||
if m := smartCapacityRE.FindStringSubmatch(text); m != nil {
|
if m := smartCapacityRE.FindStringSubmatch(text); m != nil {
|
||||||
cap := strings.TrimSpace(m[1])
|
cap := strings.TrimSpace(m[1])
|
||||||
|
capacityBytes = parseLeadingUint(cap)
|
||||||
// trim everything after "[" if present (e.g. "500,107,862,016 bytes [500 GB]")
|
// trim everything after "[" if present (e.g. "500,107,862,016 bytes [500 GB]")
|
||||||
if idx := strings.Index(cap, "["); idx > 0 {
|
if idx := strings.Index(cap, "["); idx > 0 {
|
||||||
cap = strings.TrimSpace(cap[idx+1:])
|
cap = strings.TrimSpace(cap[idx+1:])
|
||||||
@@ -233,7 +246,36 @@ func writeSATAReport(b *strings.Builder, outputs map[string][]byte) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if selfTest := outputs["smartctl-self-test-short"]; len(selfTest) > 0 {
|
var poh, writtenLBAs, readLBAs uint64
|
||||||
|
var readValue int
|
||||||
|
hasReadValue := false
|
||||||
|
for _, a := range attrs {
|
||||||
|
switch a.ID {
|
||||||
|
case 9: // Power_On_Hours
|
||||||
|
poh = parseLeadingUint(a.Raw)
|
||||||
|
case 241: // Total_LBAs_Written
|
||||||
|
writtenLBAs = parseLeadingUint(a.Raw)
|
||||||
|
case 242: // Total_LBAs_Read
|
||||||
|
readLBAs = parseLeadingUint(a.Raw)
|
||||||
|
readValue = a.Value
|
||||||
|
hasReadValue = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const sataSectorBytes = 512
|
||||||
|
writeResourceSection(b, resourceInfo{
|
||||||
|
powerOnHours: poh,
|
||||||
|
writtenBytes: writtenLBAs * sataSectorBytes,
|
||||||
|
readBytes: readLBAs * sataSectorBytes,
|
||||||
|
capacityBytes: capacityBytes,
|
||||||
|
readPercent: 100 - readValue,
|
||||||
|
hasReadPercent: hasReadValue,
|
||||||
|
})
|
||||||
|
|
||||||
|
selfTest := outputs["smartctl-self-test-status"]
|
||||||
|
if len(selfTest) == 0 {
|
||||||
|
selfTest = outputs["smartctl-self-test-short"]
|
||||||
|
}
|
||||||
|
if len(selfTest) > 0 {
|
||||||
writeSectionHeader(b, "Self-Test")
|
writeSectionHeader(b, "Self-Test")
|
||||||
result := parseSelfTestResult(string(selfTest))
|
result := parseSelfTestResult(string(selfTest))
|
||||||
writeField(b, "Result", result)
|
writeField(b, "Result", result)
|
||||||
@@ -274,29 +316,45 @@ func parseSMARTAttrs(text string) []smartAttr {
|
|||||||
return attrs
|
return attrs
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseSelfTestResult extracts a one-line summary from nvme device-self-test
|
// parseSelfTestResult extracts a one-line summary from nvme device-self-test,
|
||||||
// or smartctl -t short output.
|
// smartctl -a (post-completion status), or smartctl -t short (launch ack) output.
|
||||||
func parseSelfTestResult(text string) string {
|
func parseSelfTestResult(text string) string {
|
||||||
text = strings.TrimSpace(text)
|
text = strings.TrimSpace(text)
|
||||||
if text == "" {
|
if text == "" {
|
||||||
return "no output"
|
return "no output"
|
||||||
}
|
}
|
||||||
|
lines := strings.Split(text, "\n")
|
||||||
|
// smartctl -a: "Self-test execution status: ( 0)\n\tThe previous
|
||||||
|
// self-test routine completed\n\twithout error ..." — the description
|
||||||
|
// wraps onto following indented, colon-free continuation lines.
|
||||||
|
for i, line := range lines {
|
||||||
|
if strings.Contains(strings.ToLower(line), "self-test execution status") {
|
||||||
|
parts := []string{strings.TrimSpace(line)}
|
||||||
|
for j := i + 1; j < len(lines) && j < i+4; j++ {
|
||||||
|
cont := strings.TrimSpace(lines[j])
|
||||||
|
if cont == "" || strings.Contains(cont, ":") {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
parts = append(parts, cont)
|
||||||
|
}
|
||||||
|
return strings.Join(parts, " ")
|
||||||
|
}
|
||||||
|
}
|
||||||
// nvme device-self-test: look for "Short Device Self-Test Status : 0x0" or similar
|
// nvme device-self-test: look for "Short Device Self-Test Status : 0x0" or similar
|
||||||
for _, line := range strings.Split(text, "\n") {
|
for _, line := range lines {
|
||||||
l := strings.ToLower(line)
|
l := strings.ToLower(line)
|
||||||
if strings.Contains(l, "self-test status") || strings.Contains(l, "self test status") {
|
if strings.Contains(l, "self-test status") || strings.Contains(l, "self test status") {
|
||||||
return strings.TrimSpace(line)
|
return strings.TrimSpace(line)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// smartctl -t short: "Testing has begun" or "Short BGST started"
|
// smartctl -t short: "Testing has begun" or "Short BGST started"
|
||||||
for _, line := range strings.Split(text, "\n") {
|
for _, line := range lines {
|
||||||
l := strings.ToLower(line)
|
l := strings.ToLower(line)
|
||||||
if strings.Contains(l, "testing has begun") || strings.Contains(l, "started") || strings.Contains(l, "complete") {
|
if strings.Contains(l, "testing has begun") || strings.Contains(l, "started") || strings.Contains(l, "complete") {
|
||||||
return strings.TrimSpace(line)
|
return strings.TrimSpace(line)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// fallback: last non-empty line
|
// fallback: last non-empty line
|
||||||
lines := strings.Split(strings.TrimSpace(text), "\n")
|
|
||||||
for i := len(lines) - 1; i >= 0; i-- {
|
for i := len(lines) - 1; i >= 0; i-- {
|
||||||
if s := strings.TrimSpace(lines[i]); s != "" {
|
if s := strings.TrimSpace(lines[i]); s != "" {
|
||||||
return s
|
return s
|
||||||
@@ -305,6 +363,115 @@ func parseSelfTestResult(text string) string {
|
|||||||
return "done"
|
return "done"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── Resource (pseudographic usage bars) ────────────────────────────────────────
|
||||||
|
|
||||||
|
// designLifeYears/dwpd model the drive's rated endurance: 1 drive-write-per-day
|
||||||
|
// for 5 years, the baseline enterprise endurance spec used when the vendor's
|
||||||
|
// own TBW/DWPD rating isn't available from SMART/NVMe data.
|
||||||
|
const (
|
||||||
|
designLifeYears = 5
|
||||||
|
dwpd = 1.0
|
||||||
|
)
|
||||||
|
|
||||||
|
type resourceInfo struct {
|
||||||
|
powerOnHours uint64
|
||||||
|
writtenBytes uint64
|
||||||
|
readBytes uint64
|
||||||
|
capacityBytes uint64
|
||||||
|
readPercent int // only meaningful when hasReadPercent
|
||||||
|
hasReadPercent bool // true when the source SMART attribute exposes a normalized read-wear value
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeResourceSection(b *strings.Builder, r resourceInfo) {
|
||||||
|
writeSectionHeader(b, "Resource")
|
||||||
|
|
||||||
|
const maxLifeHours = designLifeYears * 365 * 24
|
||||||
|
upFrac := float64(r.powerOnHours) / float64(maxLifeHours)
|
||||||
|
fmt.Fprintf(b, " %-9s %s %s / %s (%s)\n",
|
||||||
|
"Uptime", progressBar(upFrac, 24), formatHoursHuman(r.powerOnHours), formatHoursHuman(maxLifeHours), formatPercent(upFrac*100))
|
||||||
|
|
||||||
|
if r.capacityBytes > 0 {
|
||||||
|
maxWritten := float64(r.capacityBytes) * dwpd * designLifeYears * 365
|
||||||
|
wFrac := float64(r.writtenBytes) / maxWritten
|
||||||
|
fmt.Fprintf(b, " %-9s %s %s / %s (%s, %g DWPD×%dy)\n",
|
||||||
|
"Written", progressBar(wFrac, 24), formatBytesHuman(float64(r.writtenBytes)), formatBytesHuman(maxWritten), formatPercent(wFrac*100), dwpd, designLifeYears)
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(b, " %-9s %s\n", "Written", formatBytesHuman(float64(r.writtenBytes)))
|
||||||
|
}
|
||||||
|
|
||||||
|
if r.hasReadPercent {
|
||||||
|
fmt.Fprintf(b, " %-9s %s %s (%d%%)\n",
|
||||||
|
"Read", progressBar(float64(r.readPercent)/100, 24), formatBytesHuman(float64(r.readBytes)), r.readPercent)
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(b, " %-9s %s\n", "Read", formatBytesHuman(float64(r.readBytes)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// progressBar renders a fixed-width pseudographic bar, e.g. "[######------]".
//
// frac is the filled fraction; NaN and negative values are treated as 0 and
// values above 1 as 1. width is the number of cells between the brackets; a
// negative width is treated as 0 so the function never panics.
func progressBar(frac float64, width int) string {
	if width < 0 {
		// strings.Repeat panics on a negative count.
		width = 0
	}
	switch {
	case math.IsNaN(frac) || frac < 0:
		frac = 0
	case frac > 1:
		frac = 1
	}
	filled := int(math.Round(frac * float64(width)))
	return "[" + strings.Repeat("#", filled) + strings.Repeat("-", width-filled) + "]"
}
|
||||||
|
|
||||||
|
// formatBytesHuman renders a decimal (SI) human-readable byte size, e.g. "1.23 TB".
//
// Plain bytes are printed without decimals, larger units with two. If rounding
// to the printed precision would show 1000 of a unit (e.g. 999,999 bytes →
// "1000.00 KB"), the value is promoted to the next unit instead ("1.00 MB").
// PB is the largest unit, so values beyond it are printed as a large PB count.
func formatBytesHuman(n float64) string {
	units := []string{"B", "KB", "MB", "GB", "TB", "PB"}
	last := len(units) - 1

	i := 0
	for n >= 1000 && i < last {
		n /= 1000
		i++
	}

	// Rounding rollover: check the value as it will be printed, not as stored.
	scale := 100.0
	if i == 0 {
		scale = 1
	}
	if i < last && math.Round(n*scale)/scale >= 1000 {
		n /= 1000
		i++
	}

	if i == 0 {
		return fmt.Sprintf("%.0f %s", n, units[i])
	}
	return fmt.Sprintf("%.2f %s", n, units[i])
}
|
||||||
|
|
||||||
|
// formatHoursHuman renders an hour count as a human-scaled duration (hours,
// days, or years) so uptimes don't show as raw four/five-digit hour counts.
//
// Below 48 hours the value is printed in hours ("47 h"), below 365 days in
// whole days ("364 d"), and otherwise in years with at most one decimal.
// Years are rounded to that one decimal before deciding whether the value is
// whole, so 1.998 years prints "2 y" rather than "2.0 y".
func formatHoursHuman(hours uint64) string {
	if hours < 48 {
		return fmt.Sprintf("%d h", hours)
	}
	days := float64(hours) / 24
	if days < 365 {
		return fmt.Sprintf("%.0f d", days)
	}
	years := math.Round(days/365*10) / 10
	if years == math.Trunc(years) {
		return fmt.Sprintf("%.0f y", years)
	}
	return fmt.Sprintf("%.1f y", years)
}
|
||||||
|
|
||||||
|
// formatPercent renders a percentage with extra precision below 1% (e.g.
// "0.03%"), where a rounded "0%" would hide any usage at all. Non-zero values
// too small for two decimals are shown as "<0.01%" for the same reason.
// Values of 1% and above, and exactly 0, are printed as whole percents.
func formatPercent(pct float64) string {
	if pct > 0 && pct < 1 {
		s := fmt.Sprintf("%.2f%%", pct)
		if s == "0.00%" {
			// Still non-zero usage; don't let rounding erase it.
			return "<0.01%"
		}
		return s
	}
	return fmt.Sprintf("%.0f%%", pct)
}
|
||||||
|
|
||||||
|
// parseLeadingUint parses the leading run of digits/commas in s (e.g. from a
// SMART raw value or "500,107,862,016 bytes") into a uint64, ignoring the rest.
//
// Surrounding whitespace is trimmed and commas are treated as thousands
// separators. It returns 0 when s has no leading digits or when the number
// does not fit in a uint64.
func parseLeadingUint(s string) uint64 {
	s = strings.TrimSpace(s)
	end := 0
	for end < len(s) && (s[end] >= '0' && s[end] <= '9' || s[end] == ',') {
		end++
	}
	digits := strings.ReplaceAll(s[:end], ",", "")
	n, err := strconv.ParseUint(digits, 10, 64)
	if err != nil {
		// ParseUint returns MaxUint64 on overflow; callers multiply the result
		// by a sector size, so report "unknown" (0) instead of a bogus maximum.
		return 0
	}
	return n
}
|
||||||
|
|
||||||
// ── Formatting helpers ────────────────────────────────────────────────────────
|
// ── Formatting helpers ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
func writeSectionHeader(b *strings.Builder, title string) {
|
func writeSectionHeader(b *strings.Builder, title string) {
|
||||||
|
|||||||
Reference in New Issue
Block a user