Compare commits
2 Commits
audit/v1.0
...
audit/v1.0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
540a9e39b8 | ||
|
|
58510207fa |
@@ -505,7 +505,7 @@ func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStres
|
|||||||
if err != nil && err != context.Canceled {
|
if err != nil && err != context.Canceled {
|
||||||
body += "\nERROR: " + err.Error()
|
body += "\nERROR: " + err.Error()
|
||||||
}
|
}
|
||||||
return ActionResult{Title: "Fan Stress Test", Body: body}, err
|
return ActionResult{Title: "GPU Platform Stress Test", Body: body}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// formatFanStressResult formats the summary.txt from a fan-stress run, including
|
// formatFanStressResult formats the summary.txt from a fan-stress run, including
|
||||||
|
|||||||
@@ -140,7 +140,7 @@ func (m model) updateConfirm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
|||||||
pollSATProgress("gpu-amd", since),
|
pollSATProgress("gpu-amd", since),
|
||||||
)
|
)
|
||||||
case actionRunFanStress:
|
case actionRunFanStress:
|
||||||
m.busyTitle = "Fan Stress Test"
|
m.busyTitle = "GPU Platform Stress Test"
|
||||||
m.progressPrefix = "fan-stress"
|
m.progressPrefix = "fan-stress"
|
||||||
m.progressSince = time.Now()
|
m.progressSince = time.Now()
|
||||||
m.progressLines = nil
|
m.progressLines = nil
|
||||||
|
|||||||
@@ -317,11 +317,11 @@ func renderHealthCheck(m model) string {
|
|||||||
if m.hcCursor == hcCurFanStress {
|
if m.hcCursor == hcCurFanStress {
|
||||||
pfx = "> "
|
pfx = "> "
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, "%s[ FAN STRESS TEST [F] ] (thermal cycling, fan lag, throttle check)\n", pfx)
|
fmt.Fprintf(&b, "%s[ GPU PLATFORM STRESS TEST [F] ] (thermal cycling, fan lag, throttle check)\n", pfx)
|
||||||
}
|
}
|
||||||
|
|
||||||
fmt.Fprintln(&b)
|
fmt.Fprintln(&b)
|
||||||
fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
|
fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
|
||||||
fmt.Fprint(&b, "[↑↓] move [space/enter] toggle [letter] single test [R] run all [F] fan stress [Esc] back")
|
fmt.Fprint(&b, "[↑↓] move [space/enter] toggle [letter] single test [R] run all [F] gpu stress [Esc] back")
|
||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -191,7 +191,7 @@ func (m model) confirmBody() (string, string) {
|
|||||||
return "AMD GPU test", "Run AMD GPU diagnostic pack (rocm-smi)?"
|
return "AMD GPU test", "Run AMD GPU diagnostic pack (rocm-smi)?"
|
||||||
case actionRunFanStress:
|
case actionRunFanStress:
|
||||||
modes := []string{"Quick (2×2min)", "Standard (2×5min)", "Express (2×10min)"}
|
modes := []string{"Quick (2×2min)", "Standard (2×5min)", "Express (2×10min)"}
|
||||||
return "Fan Stress Test", "Two-phase GPU thermal cycling test.\n" +
|
return "GPU Platform Stress Test", "Two-phase GPU thermal cycling test.\n" +
|
||||||
"Monitors fans, temps, power — detects throttling.\n" +
|
"Monitors fans, temps, power — detects throttling.\n" +
|
||||||
"Mode: " + modes[m.hcMode] + "\n\nAll NVIDIA GPUs will be stressed."
|
"Mode: " + modes[m.hcMode] + "\n\nAll NVIDIA GPUs will be stressed."
|
||||||
default:
|
default:
|
||||||
|
|||||||
@@ -5,13 +5,13 @@
|
|||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
ROCM_VERSION="6.4"
|
# ROCm versions to try in order (newest first). Fall back if a version's
|
||||||
|
# Release file is missing from the repo (happens with brand-new releases).
|
||||||
|
ROCM_CANDIDATES="6.4 6.3 6.2"
|
||||||
ROCM_KEYRING="/etc/apt/keyrings/rocm.gpg"
|
ROCM_KEYRING="/etc/apt/keyrings/rocm.gpg"
|
||||||
ROCM_LIST="/etc/apt/sources.list.d/rocm.list"
|
ROCM_LIST="/etc/apt/sources.list.d/rocm.list"
|
||||||
APT_UPDATED=0
|
APT_UPDATED=0
|
||||||
|
|
||||||
echo "=== AMD ROCm ${ROCM_VERSION}: adding repository ==="
|
|
||||||
|
|
||||||
mkdir -p /etc/apt/keyrings
|
mkdir -p /etc/apt/keyrings
|
||||||
|
|
||||||
ensure_tool() {
|
ensure_tool() {
|
||||||
@@ -51,11 +51,26 @@ if ! wget -qO- "https://repo.radeon.com/rocm/rocm.gpg.key" \
|
|||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
cat > "${ROCM_LIST}" <<EOF
|
# Try each ROCm version until apt-get update succeeds (repo has a Release file).
|
||||||
deb [arch=amd64 signed-by=${ROCM_KEYRING}] https://repo.radeon.com/rocm/apt/${ROCM_VERSION} bookworm main
|
ROCM_VERSION=""
|
||||||
|
for candidate in ${ROCM_CANDIDATES}; do
|
||||||
|
cat > "${ROCM_LIST}" <<EOF
|
||||||
|
deb [arch=amd64 signed-by=${ROCM_KEYRING}] https://repo.radeon.com/rocm/apt/${candidate} bookworm main
|
||||||
EOF
|
EOF
|
||||||
|
if apt-get update -qq 2>/dev/null; then
|
||||||
|
ROCM_VERSION="${candidate}"
|
||||||
|
echo "=== AMD ROCm ${ROCM_VERSION}: repository available ==="
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
echo "WARN: ROCm ${candidate} repository not available for bookworm, trying next..."
|
||||||
|
rm -f "${ROCM_LIST}"
|
||||||
|
done
|
||||||
|
|
||||||
apt-get update -qq
|
if [ -z "${ROCM_VERSION}" ]; then
|
||||||
|
echo "WARN: no ROCm apt repository available for bookworm — skipping ROCm install"
|
||||||
|
rm -f "${ROCM_KEYRING}"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
# rocm-smi-lib provides the rocm-smi CLI tool for GPU monitoring
|
# rocm-smi-lib provides the rocm-smi CLI tool for GPU monitoring
|
||||||
if apt-get install -y --no-install-recommends rocm-smi-lib 2>/dev/null; then
|
if apt-get install -y --no-install-recommends rocm-smi-lib 2>/dev/null; then
|
||||||
@@ -63,9 +78,9 @@ if apt-get install -y --no-install-recommends rocm-smi-lib 2>/dev/null; then
|
|||||||
if [ -x /opt/rocm/bin/rocm-smi ]; then
|
if [ -x /opt/rocm/bin/rocm-smi ]; then
|
||||||
ln -sf /opt/rocm/bin/rocm-smi /usr/local/bin/rocm-smi
|
ln -sf /opt/rocm/bin/rocm-smi /usr/local/bin/rocm-smi
|
||||||
else
|
else
|
||||||
candidate="$(find /opt -path '*/bin/rocm-smi' -type f 2>/dev/null | sort | tail -1)"
|
smi_path="$(find /opt -path '*/bin/rocm-smi' -type f 2>/dev/null | sort | tail -1)"
|
||||||
if [ -n "${candidate}" ]; then
|
if [ -n "${smi_path}" ]; then
|
||||||
ln -sf "${candidate}" /usr/local/bin/rocm-smi
|
ln -sf "${smi_path}" /usr/local/bin/rocm-smi
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
rocm-smi --version 2>/dev/null || true
|
rocm-smi --version 2>/dev/null || true
|
||||||
|
|||||||
Reference in New Issue
Block a user