Tighten support bundles and fix AMD runtime checks
This commit is contained in:
@@ -4,6 +4,7 @@ import (
|
||||
"archive/tar"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
@@ -15,6 +16,22 @@ import (
|
||||
"time"
|
||||
)
|
||||
|
||||
// Indirections over the process and filesystem helpers used in this file.
// Code here calls these variables instead of the stdlib functions directly,
// presumably so tests can substitute fakes — confirm against the test suite.
var (
	satExecCommand = exec.Command
	satLookPath    = exec.LookPath
	satGlob        = filepath.Glob
	satStat        = os.Stat
)

// Glob patterns probed for a rocm-smi installation, listed in lookup order.
var (
	// rocmSMIExecutableGlobs matches the rocm-smi executable under /opt.
	rocmSMIExecutableGlobs = []string{
		"/opt/rocm/bin/rocm-smi",
		"/opt/rocm-*/bin/rocm-smi",
	}
	// rocmSMIScriptGlobs matches the rocm_smi.py script under /opt.
	rocmSMIScriptGlobs = []string{
		"/opt/rocm/libexec/rocm_smi/rocm_smi.py",
		"/opt/rocm-*/libexec/rocm_smi/rocm_smi.py",
	}
)
|
||||
|
||||
// NvidiaGPU holds basic GPU info from nvidia-smi.
|
||||
type NvidiaGPU struct {
|
||||
Index int
|
||||
@@ -41,7 +58,7 @@ func (s *System) DetectGPUVendor() string {
|
||||
|
||||
// ListAMDGPUs returns AMD GPUs visible to rocm-smi.
|
||||
func (s *System) ListAMDGPUs() ([]AMDGPUInfo, error) {
|
||||
out, err := exec.Command("rocm-smi", "--showproductname", "--csv").Output()
|
||||
out, err := runROCmSMI("--showproductname", "--csv")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("rocm-smi: %w", err)
|
||||
}
|
||||
@@ -337,12 +354,22 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
||||
|
||||
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string) ([]byte, error) {
|
||||
start := time.Now().UTC()
|
||||
resolvedCmd, err := resolveSATCommand(cmd)
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
|
||||
"cmd: "+strings.Join(cmd, " "),
|
||||
"cmd: "+strings.Join(resolvedCmd, " "),
|
||||
)
|
||||
if err != nil {
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
||||
"rc: 1",
|
||||
fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
|
||||
"",
|
||||
)
|
||||
return []byte(err.Error() + "\n"), err
|
||||
}
|
||||
|
||||
c := exec.CommandContext(ctx, cmd[0], cmd[1:]...)
|
||||
c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...)
|
||||
if len(env) > 0 {
|
||||
c.Env = append(os.Environ(), env...)
|
||||
}
|
||||
@@ -362,19 +389,11 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string
|
||||
}
|
||||
|
||||
func listStorageDevices() ([]string, error) {
|
||||
out, err := exec.Command("lsblk", "-dn", "-o", "NAME,TYPE").Output()
|
||||
out, err := satExecCommand("lsblk", "-dn", "-o", "NAME,TYPE,TRAN").Output()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var devices []string
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
fields := strings.Fields(strings.TrimSpace(line))
|
||||
if len(fields) != 2 || fields[1] != "disk" {
|
||||
continue
|
||||
}
|
||||
devices = append(devices, "/dev/"+fields[0])
|
||||
}
|
||||
return devices, nil
|
||||
return parseStorageDevices(string(out)), nil
|
||||
}
|
||||
|
||||
func storageSATCommands(devPath string) []satJob {
|
||||
@@ -445,12 +464,22 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
|
||||
|
||||
func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
|
||||
start := time.Now().UTC()
|
||||
resolvedCmd, err := resolveSATCommand(cmd)
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
|
||||
"cmd: "+strings.Join(cmd, " "),
|
||||
"cmd: "+strings.Join(resolvedCmd, " "),
|
||||
)
|
||||
if err != nil {
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
||||
"rc: 1",
|
||||
fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
|
||||
"",
|
||||
)
|
||||
return []byte(err.Error() + "\n"), err
|
||||
}
|
||||
|
||||
out, err := exec.Command(cmd[0], cmd[1:]...).CombinedOutput()
|
||||
out, err := satExecCommand(resolvedCmd[0], resolvedCmd[1:]...).CombinedOutput()
|
||||
|
||||
rc := 0
|
||||
if err != nil {
|
||||
@@ -465,6 +494,91 @@ func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
|
||||
return out, err
|
||||
}
|
||||
|
||||
func runROCmSMI(args ...string) ([]byte, error) {
|
||||
cmd, err := resolveROCmSMICommand(args...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return satExecCommand(cmd[0], cmd[1:]...).CombinedOutput()
|
||||
}
|
||||
|
||||
func resolveSATCommand(cmd []string) ([]string, error) {
|
||||
if len(cmd) == 0 {
|
||||
return nil, errors.New("empty SAT command")
|
||||
}
|
||||
if cmd[0] != "rocm-smi" {
|
||||
return cmd, nil
|
||||
}
|
||||
return resolveROCmSMICommand(cmd[1:]...)
|
||||
}
|
||||
|
||||
func resolveROCmSMICommand(args ...string) ([]string, error) {
|
||||
if path, err := satLookPath("rocm-smi"); err == nil {
|
||||
return append([]string{path}, args...), nil
|
||||
}
|
||||
|
||||
for _, path := range rocmSMIExecutableCandidates() {
|
||||
return append([]string{path}, args...), nil
|
||||
}
|
||||
|
||||
pythonPath, pyErr := satLookPath("python3")
|
||||
if pyErr == nil {
|
||||
for _, script := range rocmSMIScriptCandidates() {
|
||||
cmd := []string{pythonPath, script}
|
||||
cmd = append(cmd, args...)
|
||||
return cmd, nil
|
||||
}
|
||||
}
|
||||
|
||||
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
|
||||
}
|
||||
|
||||
func rocmSMIExecutableCandidates() []string {
|
||||
return expandExistingPaths(rocmSMIExecutableGlobs)
|
||||
}
|
||||
|
||||
func rocmSMIScriptCandidates() []string {
|
||||
return expandExistingPaths(rocmSMIScriptGlobs)
|
||||
}
|
||||
|
||||
func expandExistingPaths(patterns []string) []string {
|
||||
seen := make(map[string]struct{})
|
||||
var paths []string
|
||||
for _, pattern := range patterns {
|
||||
matches, err := satGlob(pattern)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
sort.Strings(matches)
|
||||
for _, match := range matches {
|
||||
if _, err := satStat(match); err != nil {
|
||||
continue
|
||||
}
|
||||
if _, ok := seen[match]; ok {
|
||||
continue
|
||||
}
|
||||
seen[match] = struct{}{}
|
||||
paths = append(paths, match)
|
||||
}
|
||||
}
|
||||
return paths
|
||||
}
|
||||
|
||||
// parseStorageDevices extracts device paths from whitespace-separated rows of
// the form "NAME TYPE [TRAN]" (the column layout listStorageDevices requests
// from lsblk).
//
// A row is kept only when its second column is exactly "disk" and its third
// column, if present, is not "usb" (compared case-insensitively). Each kept
// row yields "/dev/" + NAME. The result is nil when no row qualifies.
func parseStorageDevices(raw string) []string {
	var disks []string

	rows := strings.Split(strings.TrimSpace(raw), "\n")
	for i := range rows {
		cols := strings.Fields(rows[i])
		switch {
		case len(cols) < 2:
			// Blank or truncated row.
		case cols[1] != "disk":
			// Not a whole-disk entry.
		case len(cols) >= 3 && strings.EqualFold(cols[2], "usb"):
			// USB-attached disks are excluded.
		default:
			disks = append(disks, "/dev/"+cols[0])
		}
	}

	return disks
}
|
||||
|
||||
// runSATCommandWithMetrics runs a command while collecting GPU metrics in the background.
|
||||
// On completion it writes gpu-metrics.csv and gpu-metrics.html into runDir.
|
||||
func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string) ([]byte, error) {
|
||||
|
||||
Reference in New Issue
Block a user