Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 05241f2e0e | |||
|
|
c1690a084b | ||
|
|
9481ca2805 | ||
|
|
a78fdadd88 | ||
|
|
4ef403898f |
@@ -117,7 +117,7 @@ type satRunner interface {
|
|||||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||||
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
@@ -190,6 +190,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
|||||||
}
|
}
|
||||||
result := collector.Run(runtimeMode)
|
result := collector.Run(runtimeMode)
|
||||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
|
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
|
||||||
|
writePSUStatusesToDB(a.StatusDB, result.Hardware.PowerSupplies)
|
||||||
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
||||||
result.Runtime = &health
|
result.Runtime = &health
|
||||||
}
|
}
|
||||||
@@ -566,11 +567,11 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
|
|||||||
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
@@ -926,6 +927,41 @@ func bodyOr(body, fallback string) string {
|
|||||||
return body
|
return body
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// writePSUStatusesToDB records PSU statuses collected during audit into the
|
||||||
|
// component-status DB so they are visible in the Hardware Summary card.
|
||||||
|
// PSU status is sourced from IPMI (ipmitool fru + sdr) during audit.
|
||||||
|
func writePSUStatusesToDB(db *ComponentStatusDB, psus []schema.HardwarePowerSupply) {
|
||||||
|
if db == nil || len(psus) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
const source = "audit:ipmi"
|
||||||
|
worstStatus := "OK"
|
||||||
|
for _, psu := range psus {
|
||||||
|
if psu.Status == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
slot := "?"
|
||||||
|
if psu.Slot != nil {
|
||||||
|
slot = *psu.Slot
|
||||||
|
}
|
||||||
|
st := *psu.Status
|
||||||
|
detail := ""
|
||||||
|
if psu.ErrorDescription != nil {
|
||||||
|
detail = *psu.ErrorDescription
|
||||||
|
}
|
||||||
|
db.Record("psu:"+slot, source, st, detail)
|
||||||
|
switch st {
|
||||||
|
case "Critical":
|
||||||
|
worstStatus = "Critical"
|
||||||
|
case "Warning":
|
||||||
|
if worstStatus != "Critical" {
|
||||||
|
worstStatus = "Warning"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
db.Record("psu:all", source, worstStatus, "")
|
||||||
|
}
|
||||||
|
|
||||||
func ReadRuntimeHealth(path string) (schema.RuntimeHealth, error) {
|
func ReadRuntimeHealth(path string) (schema.RuntimeHealth, error) {
|
||||||
raw, err := os.ReadFile(path)
|
raw, err := os.ReadFile(path)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@@ -161,7 +161,7 @@ func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir
|
|||||||
return f.runNvidiaFn(baseDir)
|
return f.runNvidiaFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ int, _ func(string)) (string, error) {
|
||||||
if f.runNvidiaComputeFn != nil {
|
if f.runNvidiaComputeFn != nil {
|
||||||
return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
|
return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
|
||||||
}
|
}
|
||||||
@@ -542,8 +542,6 @@ func TestActionResultsUseFallbackBody(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
tmp := t.TempDir()
|
tmp := t.TempDir()
|
||||||
oldExportDir := DefaultExportDir
|
oldExportDir := DefaultExportDir
|
||||||
DefaultExportDir = tmp
|
DefaultExportDir = tmp
|
||||||
@@ -580,8 +578,6 @@ func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
|
func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
tmp := t.TempDir()
|
tmp := t.TempDir()
|
||||||
oldExportDir := DefaultExportDir
|
oldExportDir := DefaultExportDir
|
||||||
DefaultExportDir = tmp
|
DefaultExportDir = tmp
|
||||||
@@ -643,8 +639,6 @@ func TestRunNvidiaAcceptancePackResult(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestRunSATDefaultsToExportDir(t *testing.T) {
|
func TestRunSATDefaultsToExportDir(t *testing.T) {
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
oldSATBaseDir := DefaultSATBaseDir
|
oldSATBaseDir := DefaultSATBaseDir
|
||||||
DefaultSATBaseDir = "/tmp/export/bee-sat"
|
DefaultSATBaseDir = "/tmp/export/bee-sat"
|
||||||
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
|
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ if ! command -v lspci >/dev/null 2>&1; then
|
|||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
found=0
|
found=0
|
||||||
for gpu in $(lspci -Dn | awk '$3 ~ /^10de:/ {print $1}'); do
|
for gpu in $(lspci -Dn | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ {print $1}'); do
|
||||||
found=1
|
found=1
|
||||||
echo "=== GPU $gpu ==="
|
echo "=== GPU $gpu ==="
|
||||||
lspci -s "$gpu" -vv 2>&1 || true
|
lspci -s "$gpu" -vv 2>&1 || true
|
||||||
@@ -73,8 +73,13 @@ fi
|
|||||||
{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
|
{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
|
||||||
for d in /sys/bus/pci/devices/*/; do
|
for d in /sys/bus/pci/devices/*/; do
|
||||||
vendor=$(cat "$d/vendor" 2>/dev/null)
|
vendor=$(cat "$d/vendor" 2>/dev/null)
|
||||||
[ "$vendor" = "0x10de" ] || continue
|
[ "$vendor" = "0x10de" ] || continue
|
||||||
dev=$(basename "$d")
|
class=$(cat "$d/class" 2>/dev/null)
|
||||||
|
case "$class" in
|
||||||
|
0x030000|0x030200) ;;
|
||||||
|
*) continue ;;
|
||||||
|
esac
|
||||||
|
dev=$(basename "$d")
|
||||||
echo "=== $dev ==="
|
echo "=== $dev ==="
|
||||||
for f in current_link_speed current_link_width max_link_speed max_link_width; do
|
for f in current_link_speed current_link_width max_link_speed max_link_width; do
|
||||||
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
||||||
|
|||||||
@@ -179,11 +179,3 @@ func commandOutputWithTimeout(timeout time.Duration, name string, args ...string
|
|||||||
defer cancel()
|
defer cancel()
|
||||||
return exec.CommandContext(ctx, name, args...).Output()
|
return exec.CommandContext(ctx, name, args...).Output()
|
||||||
}
|
}
|
||||||
|
|
||||||
func interfaceHasCarrier(iface string) bool {
|
|
||||||
raw, err := readNetCarrierFile(iface)
|
|
||||||
if err != nil {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
return strings.TrimSpace(raw) == "1"
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -58,12 +58,10 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if interfaceHasCarrier(iface) {
|
if out, err := ethtoolModuleQuery(iface); err == nil {
|
||||||
if out, err := ethtoolModuleQuery(iface); err == nil {
|
if injectSFPDOMTelemetry(&devs[i], out) {
|
||||||
if injectSFPDOMTelemetry(&devs[i], out) {
|
enriched++
|
||||||
enriched++
|
continue
|
||||||
continue
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
|
if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
|
||||||
@@ -115,8 +113,38 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
|
|||||||
}
|
}
|
||||||
key := strings.ToLower(strings.TrimSpace(trimmed[:idx]))
|
key := strings.ToLower(strings.TrimSpace(trimmed[:idx]))
|
||||||
val := strings.TrimSpace(trimmed[idx+1:])
|
val := strings.TrimSpace(trimmed[idx+1:])
|
||||||
|
if val == "" || strings.EqualFold(val, "not supported") || strings.EqualFold(val, "unknown") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
switch {
|
switch {
|
||||||
|
case key == "identifier":
|
||||||
|
s := parseSFPIdentifier(val)
|
||||||
|
dev.SFPIdentifier = &s
|
||||||
|
t := true
|
||||||
|
dev.SFPPresent = &t
|
||||||
|
changed = true
|
||||||
|
case key == "connector":
|
||||||
|
s := parseSFPConnector(val)
|
||||||
|
dev.SFPConnector = &s
|
||||||
|
changed = true
|
||||||
|
case key == "vendor name":
|
||||||
|
s := strings.TrimSpace(val)
|
||||||
|
dev.SFPVendor = &s
|
||||||
|
changed = true
|
||||||
|
case key == "vendor pn":
|
||||||
|
s := strings.TrimSpace(val)
|
||||||
|
dev.SFPPartNumber = &s
|
||||||
|
changed = true
|
||||||
|
case key == "vendor sn":
|
||||||
|
s := strings.TrimSpace(val)
|
||||||
|
dev.SFPSerialNumber = &s
|
||||||
|
changed = true
|
||||||
|
case strings.Contains(key, "laser wavelength"):
|
||||||
|
if f, ok := firstFloat(val); ok {
|
||||||
|
dev.SFPWavelengthNM = &f
|
||||||
|
changed = true
|
||||||
|
}
|
||||||
case strings.Contains(key, "module temperature"):
|
case strings.Contains(key, "module temperature"):
|
||||||
if f, ok := firstFloat(val); ok {
|
if f, ok := firstFloat(val); ok {
|
||||||
dev.SFPTemperatureC = &f
|
dev.SFPTemperatureC = &f
|
||||||
@@ -147,12 +175,61 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
|
|||||||
return changed
|
return changed
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// parseSFPIdentifier extracts the human-readable transceiver type from the
|
||||||
|
// raw ethtool identifier line, e.g. "0x03 (SFP)" → "SFP".
|
||||||
|
func parseSFPIdentifier(val string) string {
|
||||||
|
if s := extractParens(val); s != "" {
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
return val
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseSFPConnector extracts the connector type from the raw ethtool line,
|
||||||
|
// e.g. "0x07 (LC)" → "LC".
|
||||||
|
func parseSFPConnector(val string) string {
|
||||||
|
if s := extractParens(val); s != "" {
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
return val
|
||||||
|
}
|
||||||
|
|
||||||
|
var parenRe = regexp.MustCompile(`\(([^)]+)\)`)
|
||||||
|
|
||||||
|
func extractParens(s string) string {
|
||||||
|
m := parenRe.FindStringSubmatch(s)
|
||||||
|
if len(m) < 2 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return strings.TrimSpace(m[1])
|
||||||
|
}
|
||||||
|
|
||||||
func parseSFPDOM(raw string) map[string]any {
|
func parseSFPDOM(raw string) map[string]any {
|
||||||
dev := schema.HardwarePCIeDevice{}
|
dev := schema.HardwarePCIeDevice{}
|
||||||
if !injectSFPDOMTelemetry(&dev, raw) {
|
if !injectSFPDOMTelemetry(&dev, raw) {
|
||||||
return map[string]any{}
|
return map[string]any{}
|
||||||
}
|
}
|
||||||
out := map[string]any{}
|
out := map[string]any{}
|
||||||
|
if dev.SFPPresent != nil {
|
||||||
|
out["sfp_present"] = *dev.SFPPresent
|
||||||
|
}
|
||||||
|
if dev.SFPIdentifier != nil {
|
||||||
|
out["sfp_identifier"] = *dev.SFPIdentifier
|
||||||
|
}
|
||||||
|
if dev.SFPConnector != nil {
|
||||||
|
out["sfp_connector"] = *dev.SFPConnector
|
||||||
|
}
|
||||||
|
if dev.SFPVendor != nil {
|
||||||
|
out["sfp_vendor"] = *dev.SFPVendor
|
||||||
|
}
|
||||||
|
if dev.SFPPartNumber != nil {
|
||||||
|
out["sfp_part_number"] = *dev.SFPPartNumber
|
||||||
|
}
|
||||||
|
if dev.SFPSerialNumber != nil {
|
||||||
|
out["sfp_serial_number"] = *dev.SFPSerialNumber
|
||||||
|
}
|
||||||
|
if dev.SFPWavelengthNM != nil {
|
||||||
|
out["sfp_wavelength_nm"] = *dev.SFPWavelengthNM
|
||||||
|
}
|
||||||
if dev.SFPTemperatureC != nil {
|
if dev.SFPTemperatureC != nil {
|
||||||
out["sfp_temperature_c"] = *dev.SFPTemperatureC
|
out["sfp_temperature_c"] = *dev.SFPTemperatureC
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -122,10 +122,7 @@ func TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier(t *testing.T)
|
|||||||
readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
|
readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
|
||||||
readNetCarrierFile = func(string) (string, error) { return "0", nil }
|
readNetCarrierFile = func(string) (string, error) { return "0", nil }
|
||||||
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
||||||
ethtoolModuleQuery = func(string) (string, error) {
|
ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("no module") }
|
||||||
t.Fatal("ethtool -m should not be called without carrier")
|
|
||||||
return "", nil
|
|
||||||
}
|
|
||||||
|
|
||||||
class := "EthernetController"
|
class := "EthernetController"
|
||||||
bdf := "0000:18:00.0"
|
bdf := "0000:18:00.0"
|
||||||
|
|||||||
@@ -49,6 +49,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
|||||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||||
}
|
}
|
||||||
|
if opts.StaggerSeconds > 0 && len(selected) > 1 {
|
||||||
|
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
|
||||||
|
}
|
||||||
if len(selected) > 0 {
|
if len(selected) > 0 {
|
||||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||||
}
|
}
|
||||||
@@ -63,6 +66,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
|||||||
"bee-john-gpu-stress",
|
"bee-john-gpu-stress",
|
||||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||||
}
|
}
|
||||||
|
if opts.StaggerSeconds > 0 && len(selected) > 1 {
|
||||||
|
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
|
||||||
|
}
|
||||||
if len(selected) > 0 {
|
if len(selected) > 0 {
|
||||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -384,25 +384,39 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
|
|||||||
), logFunc)
|
), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
profCmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
|
var (
|
||||||
if err != nil {
|
profCmd []string
|
||||||
return "", err
|
profEnv []string
|
||||||
|
)
|
||||||
|
if staggerSec > 0 && len(selected) > 1 {
|
||||||
|
profCmd = []string{
|
||||||
|
"bee-dcgmproftester-staggered",
|
||||||
|
"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
|
||||||
|
"--stagger-seconds", strconv.Itoa(staggerSec),
|
||||||
|
"--devices", joinIndexList(selected),
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
profCmd, err = resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
profEnv = nvidiaVisibleDevicesEnv(selected)
|
||||||
}
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
|
||||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
|
satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
|
||||||
satJob{
|
satJob{
|
||||||
name: "03-dcgmproftester.log",
|
name: "03-dcgmproftester.log",
|
||||||
cmd: profCmd,
|
cmd: profCmd,
|
||||||
env: nvidiaVisibleDevicesEnv(selected),
|
env: profEnv,
|
||||||
collectGPU: true,
|
collectGPU: true,
|
||||||
gpuIndices: selected,
|
gpuIndices: selected,
|
||||||
},
|
},
|
||||||
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
), logFunc)
|
), logFunc)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -70,6 +70,7 @@ type NvidiaStressOptions struct {
|
|||||||
Loader string
|
Loader string
|
||||||
GPUIndices []int
|
GPUIndices []int
|
||||||
ExcludeGPUIndices []int
|
ExcludeGPUIndices []int
|
||||||
|
StaggerSeconds int
|
||||||
}
|
}
|
||||||
|
|
||||||
func New() *System {
|
func New() *System {
|
||||||
|
|||||||
@@ -183,6 +183,13 @@ type HardwarePCIeDevice struct {
|
|||||||
BatteryTemperatureC *float64 `json:"battery_temperature_c,omitempty"`
|
BatteryTemperatureC *float64 `json:"battery_temperature_c,omitempty"`
|
||||||
BatteryVoltageV *float64 `json:"battery_voltage_v,omitempty"`
|
BatteryVoltageV *float64 `json:"battery_voltage_v,omitempty"`
|
||||||
BatteryReplaceRequired *bool `json:"battery_replace_required,omitempty"`
|
BatteryReplaceRequired *bool `json:"battery_replace_required,omitempty"`
|
||||||
|
SFPPresent *bool `json:"sfp_present,omitempty"`
|
||||||
|
SFPIdentifier *string `json:"sfp_identifier,omitempty"`
|
||||||
|
SFPConnector *string `json:"sfp_connector,omitempty"`
|
||||||
|
SFPVendor *string `json:"sfp_vendor,omitempty"`
|
||||||
|
SFPPartNumber *string `json:"sfp_part_number,omitempty"`
|
||||||
|
SFPSerialNumber *string `json:"sfp_serial_number,omitempty"`
|
||||||
|
SFPWavelengthNM *float64 `json:"sfp_wavelength_nm,omitempty"`
|
||||||
SFPTemperatureC *float64 `json:"sfp_temperature_c,omitempty"`
|
SFPTemperatureC *float64 `json:"sfp_temperature_c,omitempty"`
|
||||||
SFPTXPowerDBM *float64 `json:"sfp_tx_power_dbm,omitempty"`
|
SFPTXPowerDBM *float64 `json:"sfp_tx_power_dbm,omitempty"`
|
||||||
SFPRXPowerDBM *float64 `json:"sfp_rx_power_dbm,omitempty"`
|
SFPRXPowerDBM *float64 `json:"sfp_rx_power_dbm,omitempty"`
|
||||||
|
|||||||
@@ -482,12 +482,13 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
var body struct {
|
var body struct {
|
||||||
Duration int `json:"duration"`
|
Duration int `json:"duration"`
|
||||||
StressMode bool `json:"stress_mode"`
|
StressMode bool `json:"stress_mode"`
|
||||||
GPUIndices []int `json:"gpu_indices"`
|
GPUIndices []int `json:"gpu_indices"`
|
||||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||||
Loader string `json:"loader"`
|
StaggerGPUStart bool `json:"stagger_gpu_start"`
|
||||||
|
Loader string `json:"loader"`
|
||||||
Profile string `json:"profile"`
|
Profile string `json:"profile"`
|
||||||
DisplayName string `json:"display_name"`
|
DisplayName string `json:"display_name"`
|
||||||
PlatformComponents []string `json:"platform_components"`
|
PlatformComponents []string `json:"platform_components"`
|
||||||
@@ -503,12 +504,13 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
|||||||
if strings.TrimSpace(body.DisplayName) != "" {
|
if strings.TrimSpace(body.DisplayName) != "" {
|
||||||
name = body.DisplayName
|
name = body.DisplayName
|
||||||
}
|
}
|
||||||
params := taskParams{
|
params := taskParams{
|
||||||
Duration: body.Duration,
|
Duration: body.Duration,
|
||||||
StressMode: body.StressMode,
|
StressMode: body.StressMode,
|
||||||
GPUIndices: body.GPUIndices,
|
GPUIndices: body.GPUIndices,
|
||||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||||
Loader: body.Loader,
|
StaggerGPUStart: body.StaggerGPUStart,
|
||||||
|
Loader: body.Loader,
|
||||||
BurnProfile: body.Profile,
|
BurnProfile: body.Profile,
|
||||||
DisplayName: body.DisplayName,
|
DisplayName: body.DisplayName,
|
||||||
PlatformComponents: body.PlatformComponents,
|
PlatformComponents: body.PlatformComponents,
|
||||||
|
|||||||
@@ -317,106 +317,271 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return `<div class="card"><div class="card-head card-head-actions"><span>Hardware Summary</span><div class="card-head-buttons"><button class="btn btn-primary btn-sm" onclick="auditModalRun()">Run audit</button></div></div><div class="card-body"></div></div>`
|
return `<div class="card"><div class="card-head card-head-actions"><span>Hardware Summary</span><div class="card-head-buttons"><button class="btn btn-primary btn-sm" onclick="auditModalRun()">Run audit</button></div></div><div class="card-body"></div></div>`
|
||||||
}
|
}
|
||||||
// Parse just enough fields for the summary banner
|
var ingest schema.HardwareIngestRequest
|
||||||
var snap struct {
|
if err := json.Unmarshal(data, &ingest); err != nil {
|
||||||
Summary struct {
|
|
||||||
CPU struct{ Model string }
|
|
||||||
Memory struct{ TotalGB float64 }
|
|
||||||
Storage []struct{ Device, Model, Size string }
|
|
||||||
GPUs []struct{ Model string }
|
|
||||||
PSUs []struct{ Model string }
|
|
||||||
}
|
|
||||||
Network struct {
|
|
||||||
Interfaces []struct {
|
|
||||||
Name string
|
|
||||||
IPv4 []string
|
|
||||||
State string
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Try to extract top-level fields loosely
|
|
||||||
var raw map[string]json.RawMessage
|
|
||||||
if err := json.Unmarshal(data, &raw); err != nil {
|
|
||||||
return `<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body"><span class="badge badge-err">Parse error</span></div></div>`
|
return `<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body"><span class="badge badge-err">Parse error</span></div></div>`
|
||||||
}
|
}
|
||||||
_ = snap
|
hw := ingest.Hardware
|
||||||
|
|
||||||
// Also load runtime-health for badges
|
var records []app.ComponentStatusRecord
|
||||||
type componentHealth struct {
|
if db, err := app.OpenComponentStatusDB(filepath.Join(opts.ExportDir, "component-status.json")); err == nil {
|
||||||
FailCount int `json:"fail_count"`
|
records = db.All()
|
||||||
WarnCount int `json:"warn_count"`
|
|
||||||
}
|
}
|
||||||
type healthSummary struct {
|
|
||||||
CPU componentHealth `json:"cpu"`
|
|
||||||
Memory componentHealth `json:"memory"`
|
|
||||||
Storage componentHealth `json:"storage"`
|
|
||||||
GPU componentHealth `json:"gpu"`
|
|
||||||
PSU componentHealth `json:"psu"`
|
|
||||||
Network componentHealth `json:"network"`
|
|
||||||
}
|
|
||||||
var health struct {
|
|
||||||
HardwareHealth healthSummary `json:"hardware_health"`
|
|
||||||
}
|
|
||||||
if hdata, herr := loadSnapshot(filepath.Join(opts.ExportDir, "runtime-health.json")); herr == nil {
|
|
||||||
_ = json.Unmarshal(hdata, &health)
|
|
||||||
}
|
|
||||||
|
|
||||||
badge := func(h componentHealth) string {
|
|
||||||
if h.FailCount > 0 {
|
|
||||||
return `<span class="badge badge-err">FAIL</span>`
|
|
||||||
}
|
|
||||||
if h.WarnCount > 0 {
|
|
||||||
return `<span class="badge badge-warn">WARN</span>`
|
|
||||||
}
|
|
||||||
return `<span class="badge badge-ok">OK</span>`
|
|
||||||
}
|
|
||||||
|
|
||||||
// Extract readable strings from raw JSON
|
|
||||||
getString := func(key string) string {
|
|
||||||
v, ok := raw[key]
|
|
||||||
if !ok {
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
var s string
|
|
||||||
if err := json.Unmarshal(v, &s); err == nil {
|
|
||||||
return s
|
|
||||||
}
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
|
|
||||||
cpuModel := getString("cpu_model")
|
|
||||||
memStr := getString("memory_summary")
|
|
||||||
gpuSummary := getString("gpu_summary")
|
|
||||||
|
|
||||||
var b strings.Builder
|
var b strings.Builder
|
||||||
b.WriteString(`<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body">`)
|
b.WriteString(`<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body">`)
|
||||||
b.WriteString(`<table style="width:auto">`)
|
b.WriteString(`<table style="width:auto">`)
|
||||||
writeRow := func(label, value, badgeHTML string) {
|
writeRow := func(label, value, badgeHTML string) {
|
||||||
b.WriteString(fmt.Sprintf(`<tr><td style="padding:6px 14px 6px 0;font-weight:700;white-space:nowrap">%s</td><td style="padding:6px 0">%s</td><td style="padding:6px 0 6px 12px">%s</td></tr>`,
|
b.WriteString(fmt.Sprintf(`<tr><td style="padding:6px 14px 6px 0;font-weight:700;white-space:nowrap">%s</td><td style="padding:6px 0;color:var(--muted);font-size:13px">%s</td><td style="padding:6px 0 6px 12px">%s</td></tr>`,
|
||||||
html.EscapeString(label), html.EscapeString(value), badgeHTML))
|
html.EscapeString(label), html.EscapeString(value), badgeHTML))
|
||||||
}
|
}
|
||||||
if cpuModel != "" {
|
|
||||||
writeRow("CPU", cpuModel, badge(health.HardwareHealth.CPU))
|
cpuRow := aggregateComponentStatus("CPU", records, []string{"cpu:all"}, nil)
|
||||||
} else {
|
writeRow("CPU", hwDescribeCPU(hw), runtimeStatusBadge(cpuRow.Status))
|
||||||
writeRow("CPU", "—", badge(health.HardwareHealth.CPU))
|
|
||||||
|
memRow := aggregateComponentStatus("Memory", records, []string{"memory:all"}, []string{"memory:"})
|
||||||
|
writeRow("Memory", hwDescribeMemory(hw), runtimeStatusBadge(memRow.Status))
|
||||||
|
|
||||||
|
storageRow := aggregateComponentStatus("Storage", records, []string{"storage:all"}, []string{"storage:"})
|
||||||
|
writeRow("Storage", hwDescribeStorage(hw), runtimeStatusBadge(storageRow.Status))
|
||||||
|
|
||||||
|
gpuRow := aggregateComponentStatus("GPU", records, nil, []string{"pcie:gpu:"})
|
||||||
|
writeRow("GPU", hwDescribeGPU(hw), runtimeStatusBadge(gpuRow.Status))
|
||||||
|
|
||||||
|
psuRow := aggregateComponentStatus("PSU", records, nil, []string{"psu:"})
|
||||||
|
writeRow("PSU", hwDescribePSU(hw), runtimeStatusBadge(psuRow.Status))
|
||||||
|
|
||||||
|
if nicDesc := hwDescribeNIC(hw); nicDesc != "" {
|
||||||
|
writeRow("Network", nicDesc, "")
|
||||||
}
|
}
|
||||||
if memStr != "" {
|
|
||||||
writeRow("Memory", memStr, badge(health.HardwareHealth.Memory))
|
|
||||||
} else {
|
|
||||||
writeRow("Memory", "—", badge(health.HardwareHealth.Memory))
|
|
||||||
}
|
|
||||||
if gpuSummary != "" {
|
|
||||||
writeRow("GPU", gpuSummary, badge(health.HardwareHealth.GPU))
|
|
||||||
} else {
|
|
||||||
writeRow("GPU", "—", badge(health.HardwareHealth.GPU))
|
|
||||||
}
|
|
||||||
writeRow("Storage", "—", badge(health.HardwareHealth.Storage))
|
|
||||||
writeRow("PSU", "—", badge(health.HardwareHealth.PSU))
|
|
||||||
b.WriteString(`</table>`)
|
b.WriteString(`</table>`)
|
||||||
b.WriteString(`</div></div>`)
|
b.WriteString(`</div></div>`)
|
||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// hwDescribeCPU returns a human-readable CPU summary, e.g. "2× Intel Xeon Gold 6338".
|
||||||
|
func hwDescribeCPU(hw schema.HardwareSnapshot) string {
|
||||||
|
counts := map[string]int{}
|
||||||
|
order := []string{}
|
||||||
|
for _, cpu := range hw.CPUs {
|
||||||
|
model := "Unknown CPU"
|
||||||
|
if cpu.Model != nil && *cpu.Model != "" {
|
||||||
|
model = *cpu.Model
|
||||||
|
}
|
||||||
|
if counts[model] == 0 {
|
||||||
|
order = append(order, model)
|
||||||
|
}
|
||||||
|
counts[model]++
|
||||||
|
}
|
||||||
|
if len(order) == 0 {
|
||||||
|
return "—"
|
||||||
|
}
|
||||||
|
parts := make([]string, 0, len(order))
|
||||||
|
for _, m := range order {
|
||||||
|
if counts[m] > 1 {
|
||||||
|
parts = append(parts, fmt.Sprintf("%d× %s", counts[m], m))
|
||||||
|
} else {
|
||||||
|
parts = append(parts, m)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return strings.Join(parts, ", ")
|
||||||
|
}
|
||||||
|
|
||||||
|
// hwDescribeMemory returns a summary like "16× 32 GB DDR4".
|
||||||
|
func hwDescribeMemory(hw schema.HardwareSnapshot) string {
|
||||||
|
type key struct {
|
||||||
|
sizeMB int
|
||||||
|
typ string
|
||||||
|
}
|
||||||
|
counts := map[key]int{}
|
||||||
|
order := []key{}
|
||||||
|
for _, dimm := range hw.Memory {
|
||||||
|
if dimm.SizeMB == nil || *dimm.SizeMB == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
t := ""
|
||||||
|
if dimm.Type != nil {
|
||||||
|
t = *dimm.Type
|
||||||
|
}
|
||||||
|
k := key{*dimm.SizeMB, t}
|
||||||
|
if counts[k] == 0 {
|
||||||
|
order = append(order, k)
|
||||||
|
}
|
||||||
|
counts[k]++
|
||||||
|
}
|
||||||
|
if len(order) == 0 {
|
||||||
|
return "—"
|
||||||
|
}
|
||||||
|
parts := make([]string, 0, len(order))
|
||||||
|
for _, k := range order {
|
||||||
|
gb := k.sizeMB / 1024
|
||||||
|
desc := fmt.Sprintf("%d× %d GB", counts[k], gb)
|
||||||
|
if k.typ != "" {
|
||||||
|
desc += " " + k.typ
|
||||||
|
}
|
||||||
|
parts = append(parts, desc)
|
||||||
|
}
|
||||||
|
return strings.Join(parts, ", ")
|
||||||
|
}
|
||||||
|
|
||||||
|
// hwDescribeStorage returns a summary like "4× 3.84 TB NVMe, 2× 1.92 TB SATA".
|
||||||
|
func hwDescribeStorage(hw schema.HardwareSnapshot) string {
|
||||||
|
type key struct {
|
||||||
|
sizeGB int
|
||||||
|
iface string
|
||||||
|
}
|
||||||
|
counts := map[key]int{}
|
||||||
|
order := []key{}
|
||||||
|
for _, disk := range hw.Storage {
|
||||||
|
sz := 0
|
||||||
|
if disk.SizeGB != nil {
|
||||||
|
sz = *disk.SizeGB
|
||||||
|
}
|
||||||
|
iface := ""
|
||||||
|
if disk.Interface != nil {
|
||||||
|
iface = *disk.Interface
|
||||||
|
} else if disk.Type != nil {
|
||||||
|
iface = *disk.Type
|
||||||
|
}
|
||||||
|
k := key{sz, iface}
|
||||||
|
if counts[k] == 0 {
|
||||||
|
order = append(order, k)
|
||||||
|
}
|
||||||
|
counts[k]++
|
||||||
|
}
|
||||||
|
if len(order) == 0 {
|
||||||
|
return "—"
|
||||||
|
}
|
||||||
|
parts := make([]string, 0, len(order))
|
||||||
|
for _, k := range order {
|
||||||
|
var sizeStr string
|
||||||
|
if k.sizeGB >= 1000 {
|
||||||
|
sizeStr = fmt.Sprintf("%.2g TB", float64(k.sizeGB)/1000)
|
||||||
|
} else if k.sizeGB > 0 {
|
||||||
|
sizeStr = fmt.Sprintf("%d GB", k.sizeGB)
|
||||||
|
} else {
|
||||||
|
sizeStr = "?"
|
||||||
|
}
|
||||||
|
desc := fmt.Sprintf("%d× %s", counts[k], sizeStr)
|
||||||
|
if k.iface != "" {
|
||||||
|
desc += " " + k.iface
|
||||||
|
}
|
||||||
|
parts = append(parts, desc)
|
||||||
|
}
|
||||||
|
return strings.Join(parts, ", ")
|
||||||
|
}
|
||||||
|
|
||||||
|
// hwDescribeGPU returns a summary like "8× NVIDIA H100 80GB".
|
||||||
|
func hwDescribeGPU(hw schema.HardwareSnapshot) string {
|
||||||
|
counts := map[string]int{}
|
||||||
|
order := []string{}
|
||||||
|
for _, dev := range hw.PCIeDevices {
|
||||||
|
if dev.DeviceClass == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if !isGPUDeviceClass(*dev.DeviceClass) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
model := "Unknown GPU"
|
||||||
|
if dev.Model != nil && *dev.Model != "" {
|
||||||
|
model = *dev.Model
|
||||||
|
}
|
||||||
|
if counts[model] == 0 {
|
||||||
|
order = append(order, model)
|
||||||
|
}
|
||||||
|
counts[model]++
|
||||||
|
}
|
||||||
|
if len(order) == 0 {
|
||||||
|
return "—"
|
||||||
|
}
|
||||||
|
parts := make([]string, 0, len(order))
|
||||||
|
for _, m := range order {
|
||||||
|
if counts[m] > 1 {
|
||||||
|
parts = append(parts, fmt.Sprintf("%d× %s", counts[m], m))
|
||||||
|
} else {
|
||||||
|
parts = append(parts, m)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return strings.Join(parts, ", ")
|
||||||
|
}
|
||||||
|
|
||||||
|
// hwDescribePSU returns a summary like "2× 1600 W" or "2× PSU".
|
||||||
|
func hwDescribePSU(hw schema.HardwareSnapshot) string {
|
||||||
|
n := len(hw.PowerSupplies)
|
||||||
|
if n == 0 {
|
||||||
|
return "—"
|
||||||
|
}
|
||||||
|
// Try to get a consistent wattage
|
||||||
|
watt := 0
|
||||||
|
consistent := true
|
||||||
|
for _, psu := range hw.PowerSupplies {
|
||||||
|
if psu.WattageW == nil {
|
||||||
|
consistent = false
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if watt == 0 {
|
||||||
|
watt = *psu.WattageW
|
||||||
|
} else if *psu.WattageW != watt {
|
||||||
|
consistent = false
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if consistent && watt > 0 {
|
||||||
|
return fmt.Sprintf("%d× %d W", n, watt)
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("%d× PSU", n)
|
||||||
|
}
|
||||||
|
|
||||||
|
// hwDescribeNIC returns a summary like "2× Mellanox ConnectX-6".
|
||||||
|
func hwDescribeNIC(hw schema.HardwareSnapshot) string {
|
||||||
|
counts := map[string]int{}
|
||||||
|
order := []string{}
|
||||||
|
for _, dev := range hw.PCIeDevices {
|
||||||
|
isNIC := false
|
||||||
|
if dev.DeviceClass != nil {
|
||||||
|
c := strings.ToLower(strings.TrimSpace(*dev.DeviceClass))
|
||||||
|
isNIC = c == "ethernetcontroller" || c == "networkcontroller" || strings.Contains(c, "fibrechannel")
|
||||||
|
}
|
||||||
|
if !isNIC && len(dev.MacAddresses) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
model := ""
|
||||||
|
if dev.Model != nil && *dev.Model != "" {
|
||||||
|
model = *dev.Model
|
||||||
|
} else if dev.Manufacturer != nil && *dev.Manufacturer != "" {
|
||||||
|
model = *dev.Manufacturer + " NIC"
|
||||||
|
} else {
|
||||||
|
model = "NIC"
|
||||||
|
}
|
||||||
|
if counts[model] == 0 {
|
||||||
|
order = append(order, model)
|
||||||
|
}
|
||||||
|
counts[model]++
|
||||||
|
}
|
||||||
|
if len(order) == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
parts := make([]string, 0, len(order))
|
||||||
|
for _, m := range order {
|
||||||
|
if counts[m] > 1 {
|
||||||
|
parts = append(parts, fmt.Sprintf("%d× %s", counts[m], m))
|
||||||
|
} else {
|
||||||
|
parts = append(parts, m)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return strings.Join(parts, ", ")
|
||||||
|
}
|
||||||
|
|
||||||
|
func isGPUDeviceClass(class string) bool {
|
||||||
|
switch strings.TrimSpace(class) {
|
||||||
|
case "VideoController", "DisplayController", "ProcessingAccelerator":
|
||||||
|
return true
|
||||||
|
default:
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func renderAuditModal() string {
|
func renderAuditModal() string {
|
||||||
return `<div id="audit-modal-overlay" style="display:none;position:fixed;inset:0;background:rgba(0,0,0,.5);z-index:100;align-items:center;justify-content:center">
|
return `<div id="audit-modal-overlay" style="display:none;position:fixed;inset:0;background:rgba(0,0,0,.5);z-index:100;align-items:center;justify-content:center">
|
||||||
<div style="background:#fff;border-radius:6px;padding:24px;min-width:480px;max-width:1100px;width:min(1100px,92vw);max-height:92vh;overflow:auto;position:relative">
|
<div style="background:#fff;border-radius:6px;padding:24px;min-width:480px;max-width:1100px;width:min(1100px,92vw);max-height:92vh;overflow:auto;position:relative">
|
||||||
@@ -482,7 +647,6 @@ func renderHealthCard(opts HandlerOptions) string {
|
|||||||
buildRuntimeToolsRow(health),
|
buildRuntimeToolsRow(health),
|
||||||
buildRuntimeServicesRow(health),
|
buildRuntimeServicesRow(health),
|
||||||
}
|
}
|
||||||
rows = append(rows, buildHardwareComponentRows(opts.ExportDir)...)
|
|
||||||
b.WriteString(`<table><thead><tr><th>Check</th><th>Status</th><th>Source</th><th>Issue</th></tr></thead><tbody>`)
|
b.WriteString(`<table><thead><tr><th>Check</th><th>Status</th><th>Source</th><th>Issue</th></tr></thead><tbody>`)
|
||||||
for _, row := range rows {
|
for _, row := range rows {
|
||||||
b.WriteString(`<tr><td>` + html.EscapeString(row.Title) + `</td><td>` + runtimeStatusBadge(row.Status) + `</td><td>` + html.EscapeString(row.Source) + `</td><td>` + rowIssueHTML(row.Issue) + `</td></tr>`)
|
b.WriteString(`<tr><td>` + html.EscapeString(row.Title) + `</td><td>` + runtimeStatusBadge(row.Status) + `</td><td>` + html.EscapeString(row.Source) + `</td><td>` + rowIssueHTML(row.Issue) + `</td></tr>`)
|
||||||
@@ -1031,25 +1195,26 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
|
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
|
||||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||||
|
|
||||||
<div class="card" style="margin-bottom:16px">
|
<div class="card" style="margin-bottom:16px">
|
||||||
<div class="card-head">Validate Profile</div>
|
<div class="card-head">Validate Profile</div>
|
||||||
<div class="card-body validate-profile-body">
|
<div class="card-body validate-profile-body">
|
||||||
<div class="validate-profile-col">
|
<div class="validate-profile-col">
|
||||||
<div class="form-row" style="margin:0"><label>Cycles</label><input type="number" id="sat-cycles" value="1" min="1" max="100" style="width:100%"></div>
|
<div class="form-row" style="margin:0"><label>Cycles</label><input type="number" id="sat-cycles" value="1" min="1" max="100" style="width:100%"></div>
|
||||||
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
</div>
|
||||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
<div class="validate-profile-col">
|
||||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (~30–60 min)</span></label>
|
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
||||||
</div>
|
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
||||||
<div class="validate-profile-col validate-profile-action">
|
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (~30–60 min)</span></label>
|
||||||
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~5–15 min total); Stress is thorough (~30–60 min total).</p>
|
</div>
|
||||||
<button class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
<div class="validate-profile-col validate-profile-action">
|
||||||
</div>
|
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~5–15 min total); Stress is thorough (~30–60 min total).</p>
|
||||||
<div class="validate-profile-col"></div>
|
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
||||||
</div>
|
<div style="margin-top:12px">
|
||||||
<div class="card-body" style="padding-top:0;display:flex;justify-content:center">
|
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
<div class="grid3">
|
<div class="grid3">
|
||||||
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
|
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
|
||||||
@@ -1156,7 +1321,7 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
</div>
|
</div>
|
||||||
<style>
|
<style>
|
||||||
.validate-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
|
.validate-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
|
||||||
.validate-profile-col { min-width:0; }
|
.validate-profile-col { min-width:0; display:flex; flex-direction:column; }
|
||||||
.validate-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:center; }
|
.validate-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:center; }
|
||||||
.validate-card-body { padding:0; }
|
.validate-card-body { padding:0; }
|
||||||
.validate-card-section { padding:12px 16px 0; }
|
.validate-card-section { padding:12px 16px 0; }
|
||||||
@@ -2095,11 +2260,11 @@ func renderBurn() string {
|
|||||||
<label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 hours</span></label>
|
<label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 hours</span></label>
|
||||||
</div>
|
</div>
|
||||||
<div class="burn-profile-col burn-profile-action">
|
<div class="burn-profile-col burn-profile-action">
|
||||||
<button class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
|
<button type="button" class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
|
||||||
<p>Run checked tests one by one. Tests run without cooldown. Each test duration is determined by the Burn Profile. Total test duration is the sum of all selected tests multiplied by the Burn Profile duration.</p>
|
<p>Run checked tests one by one. Tests run without cooldown. Each test duration is determined by the Burn Profile. Total test duration is the sum of all selected tests multiplied by the Burn Profile duration.</p>
|
||||||
</div>
|
</div>
|
||||||
<div class="burn-profile-col burn-profile-action">
|
<div class="burn-profile-col burn-profile-action">
|
||||||
<button class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
|
<button type="button" class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
|
||||||
<p>Run checked core test modules (CPU, MEM, GPU). Tests start at the same time and run for a period with short cooldown phases to stress the server cooling system.</p>
|
<p>Run checked core test modules (CPU, MEM, GPU). Tests start at the same time and run for a period with short cooldown phases to stress the server cooling system.</p>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@@ -2116,12 +2281,16 @@ func renderBurn() string {
|
|||||||
<button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectAll()">Select All</button>
|
<button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectAll()">Select All</button>
|
||||||
<button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectNone()">Clear</button>
|
<button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectNone()">Clear</button>
|
||||||
</div>
|
</div>
|
||||||
<div id="burn-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
<div id="burn-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||||
</div>
|
</div>
|
||||||
<p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
|
<p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
|
||||||
</div>
|
<label class="cb-row" style="margin-top:10px">
|
||||||
</div>
|
<input type="checkbox" id="burn-stagger-nvidia">
|
||||||
|
<span>Ramp selected NVIDIA GPUs one by one before full-load hold. Uses a 3-minute stabilization window per GPU, then keeps all selected GPUs under load for the chosen Burn Profile duration.</span>
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
<div class="burn-section">Core Burn Paths</div>
|
<div class="burn-section">Core Burn Paths</div>
|
||||||
<div class="grid2 burn-grid" style="margin-bottom:16px">
|
<div class="grid2 burn-grid" style="margin-bottom:16px">
|
||||||
@@ -2147,10 +2316,6 @@ func renderBurn() string {
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="burn-section">GPU-Specific Tests</div>
|
|
||||||
<div class="grid2 burn-grid" style="margin-bottom:16px">
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div id="bi-output" style="display:none;margin-top:16px" class="card">
|
<div id="bi-output" style="display:none;margin-top:16px" class="card">
|
||||||
<div class="card-head">Output <span id="bi-title"></span></div>
|
<div class="card-head">Output <span id="bi-title"></span></div>
|
||||||
<div class="card-body"><div id="bi-terminal" class="terminal"></div></div>
|
<div class="card-body"><div id="bi-terminal" class="terminal"></div></div>
|
||||||
@@ -2199,6 +2364,11 @@ function burnSelectedGPUIndices() {
|
|||||||
.sort(function(a, b) { return a - b; });
|
.sort(function(a, b) { return a - b; });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function burnUseNvidiaRampUp() {
|
||||||
|
const el = document.getElementById('burn-stagger-nvidia');
|
||||||
|
return !!(el && el.checked);
|
||||||
|
}
|
||||||
|
|
||||||
function burnUpdateSelectionNote() {
|
function burnUpdateSelectionNote() {
|
||||||
const note = document.getElementById('burn-selection-note');
|
const note = document.getElementById('burn-selection-note');
|
||||||
const selected = burnSelectedGPUIndices();
|
const selected = burnSelectedGPUIndices();
|
||||||
@@ -2258,6 +2428,9 @@ function enqueueBurnTask(target, label, extra, useSelectedNvidia) {
|
|||||||
return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
|
return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
|
||||||
}
|
}
|
||||||
body.gpu_indices = selected;
|
body.gpu_indices = selected;
|
||||||
|
if (burnUseNvidiaRampUp() && selected.length > 1) {
|
||||||
|
body.stagger_gpu_start = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return fetch('/api/sat/' + target + '/run', {
|
return fetch('/api/sat/' + target + '/run', {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
|
|||||||
@@ -1094,6 +1094,7 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
|
|||||||
}
|
}
|
||||||
body := rec.Body.String()
|
body := rec.Body.String()
|
||||||
for _, needle := range []string{
|
for _, needle := range []string{
|
||||||
|
// Runtime Health card — LiveCD checks only
|
||||||
`Runtime Health`,
|
`Runtime Health`,
|
||||||
`<th>Check</th><th>Status</th><th>Source</th><th>Issue</th>`,
|
`<th>Check</th><th>Status</th><th>Source</th><th>Issue</th>`,
|
||||||
`Export Directory`,
|
`Export Directory`,
|
||||||
@@ -1102,16 +1103,18 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
|
|||||||
`CUDA / ROCm`,
|
`CUDA / ROCm`,
|
||||||
`Required Utilities`,
|
`Required Utilities`,
|
||||||
`Bee Services`,
|
`Bee Services`,
|
||||||
`<td>CPU</td>`,
|
|
||||||
`<td>Memory</td>`,
|
|
||||||
`<td>Storage</td>`,
|
|
||||||
`<td>GPU</td>`,
|
|
||||||
`CUDA runtime is not ready for GPU SAT.`,
|
`CUDA runtime is not ready for GPU SAT.`,
|
||||||
`Missing: nvidia-smi`,
|
`Missing: nvidia-smi`,
|
||||||
`bee-nvidia=inactive`,
|
`bee-nvidia=inactive`,
|
||||||
`cpu SAT: FAILED`,
|
// Hardware Summary card — component health badges
|
||||||
`storage SAT: FAILED`,
|
`Hardware Summary`,
|
||||||
`sat:nvidia`,
|
`>CPU<`,
|
||||||
|
`>Memory<`,
|
||||||
|
`>Storage<`,
|
||||||
|
`>GPU<`,
|
||||||
|
`>PSU<`,
|
||||||
|
`badge-warn`, // cpu Warning badge
|
||||||
|
`badge-err`, // storage Critical badge
|
||||||
} {
|
} {
|
||||||
if !strings.Contains(body, needle) {
|
if !strings.Contains(body, needle) {
|
||||||
t.Fatalf("dashboard missing %q: %s", needle, body)
|
t.Fatalf("dashboard missing %q: %s", needle, body)
|
||||||
|
|||||||
@@ -118,6 +118,7 @@ type taskParams struct {
|
|||||||
StressMode bool `json:"stress_mode,omitempty"`
|
StressMode bool `json:"stress_mode,omitempty"`
|
||||||
GPUIndices []int `json:"gpu_indices,omitempty"`
|
GPUIndices []int `json:"gpu_indices,omitempty"`
|
||||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
|
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
|
||||||
|
StaggerGPUStart bool `json:"stagger_gpu_start,omitempty"`
|
||||||
SizeMB int `json:"size_mb,omitempty"`
|
SizeMB int `json:"size_mb,omitempty"`
|
||||||
Passes int `json:"passes,omitempty"`
|
Passes int `json:"passes,omitempty"`
|
||||||
Loader string `json:"loader,omitempty"`
|
Loader string `json:"loader,omitempty"`
|
||||||
@@ -162,6 +163,13 @@ func resolveBurnPreset(profile string) burnPreset {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func boolToNvidiaStaggerSeconds(enabled bool, selected []int) int {
|
||||||
|
if enabled && len(selected) > 1 {
|
||||||
|
return 180
|
||||||
|
}
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
|
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
|
||||||
acceptanceCycles := []platform.PlatformStressCycle{
|
acceptanceCycles := []platform.PlatformStressCycle{
|
||||||
{LoadSec: 85, IdleSec: 5},
|
{LoadSec: 85, IdleSec: 5},
|
||||||
@@ -592,7 +600,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
RunNCCL: t.params.RunNCCL,
|
RunNCCL: t.params.RunNCCL,
|
||||||
ParallelGPUs: t.params.ParallelGPUs,
|
ParallelGPUs: t.params.ParallelGPUs,
|
||||||
}, j.append)
|
}, j.append)
|
||||||
case "nvidia-compute":
|
case "nvidia-compute":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
break
|
break
|
||||||
@@ -601,7 +609,11 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
if t.params.BurnProfile != "" && dur <= 0 {
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
}
|
}
|
||||||
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, j.append)
|
staggerSec := boolToNvidiaStaggerSeconds(t.params.StaggerGPUStart, t.params.GPUIndices)
|
||||||
|
if staggerSec > 0 {
|
||||||
|
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU", staggerSec))
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, staggerSec, j.append)
|
||||||
case "nvidia-targeted-power":
|
case "nvidia-targeted-power":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
@@ -651,12 +663,13 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
if t.params.BurnProfile != "" && dur <= 0 {
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
}
|
}
|
||||||
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
||||||
DurationSec: dur,
|
DurationSec: dur,
|
||||||
Loader: t.params.Loader,
|
Loader: t.params.Loader,
|
||||||
GPUIndices: t.params.GPUIndices,
|
GPUIndices: t.params.GPUIndices,
|
||||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||||
}, j.append)
|
StaggerSeconds: boolToNvidiaStaggerSeconds(t.params.StaggerGPUStart, t.params.GPUIndices),
|
||||||
|
}, j.append)
|
||||||
case "memory":
|
case "memory":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
|
|||||||
110
iso/overlay/usr/local/bin/bee-dcgmproftester-staggered
Executable file
110
iso/overlay/usr/local/bin/bee-dcgmproftester-staggered
Executable file
@@ -0,0 +1,110 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
set -eu
|
||||||
|
|
||||||
|
SECONDS=300
|
||||||
|
STAGGER_SECONDS=180
|
||||||
|
DEVICES=""
|
||||||
|
EXCLUDE=""
|
||||||
|
|
||||||
|
usage() {
|
||||||
|
echo "usage: $0 [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3]" >&2
|
||||||
|
exit 2
|
||||||
|
}
|
||||||
|
|
||||||
|
normalize_list() {
|
||||||
|
echo "${1:-}" | tr ',' '\n' | sed 's/[[:space:]]//g' | awk 'NF' | sort -n | uniq | paste -sd, -
|
||||||
|
}
|
||||||
|
|
||||||
|
contains_csv() {
|
||||||
|
needle="$1"
|
||||||
|
haystack="${2:-}"
|
||||||
|
echo ",${haystack}," | grep -q ",${needle},"
|
||||||
|
}
|
||||||
|
|
||||||
|
resolve_dcgmproftester() {
|
||||||
|
for candidate in dcgmproftester dcgmproftester13 dcgmproftester12 dcgmproftester11; do
|
||||||
|
if command -v "${candidate}" >/dev/null 2>&1; then
|
||||||
|
command -v "${candidate}"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
while [ "$#" -gt 0 ]; do
|
||||||
|
case "$1" in
|
||||||
|
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
|
||||||
|
--stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
|
||||||
|
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
||||||
|
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
||||||
|
*) usage ;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
PROF=$(resolve_dcgmproftester) || { echo "dcgmproftester not found in PATH" >&2; exit 1; }
|
||||||
|
ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
|
||||||
|
[ -n "${ALL_DEVICES}" ] || { echo "nvidia-smi found no NVIDIA GPUs" >&2; exit 1; }
|
||||||
|
|
||||||
|
DEVICES=$(normalize_list "${DEVICES}")
|
||||||
|
EXCLUDE=$(normalize_list "${EXCLUDE}")
|
||||||
|
SELECTED="${DEVICES}"
|
||||||
|
if [ -z "${SELECTED}" ]; then
|
||||||
|
SELECTED="${ALL_DEVICES}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
FINAL=""
|
||||||
|
for id in $(echo "${SELECTED}" | tr ',' ' '); do
|
||||||
|
[ -n "${id}" ] || continue
|
||||||
|
if contains_csv "${id}" "${EXCLUDE}"; then
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
if [ -z "${FINAL}" ]; then
|
||||||
|
FINAL="${id}"
|
||||||
|
else
|
||||||
|
FINAL="${FINAL},${id}"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }
|
||||||
|
|
||||||
|
echo "loader=dcgmproftester-staggered"
|
||||||
|
echo "selected_gpus=${FINAL}"
|
||||||
|
echo "stagger_seconds=${STAGGER_SECONDS}"
|
||||||
|
|
||||||
|
TMP_DIR=$(mktemp -d)
|
||||||
|
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
|
||||||
|
|
||||||
|
GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
|
||||||
|
gpu_pos=0
|
||||||
|
WORKERS=""
|
||||||
|
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
||||||
|
gpu_pos=$((gpu_pos + 1))
|
||||||
|
log="${TMP_DIR}/gpu-${id}.log"
|
||||||
|
extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
|
||||||
|
gpu_seconds=$(( SECONDS + extra_sec ))
|
||||||
|
echo "starting gpu ${id} seconds=${gpu_seconds}"
|
||||||
|
CUDA_VISIBLE_DEVICES="${id}" "${PROF}" --no-dcgm-validation -t 1004 -d "${gpu_seconds}" >"${log}" 2>&1 &
|
||||||
|
pid=$!
|
||||||
|
WORKERS="${WORKERS} ${pid}:${id}:${log}"
|
||||||
|
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
|
||||||
|
sleep "${STAGGER_SECONDS}"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
status=0
|
||||||
|
for spec in ${WORKERS}; do
|
||||||
|
pid=${spec%%:*}
|
||||||
|
rest=${spec#*:}
|
||||||
|
id=${rest%%:*}
|
||||||
|
log=${rest#*:}
|
||||||
|
if wait "${pid}"; then
|
||||||
|
echo "gpu ${id} finished: OK"
|
||||||
|
else
|
||||||
|
rc=$?
|
||||||
|
echo "gpu ${id} finished: FAILED rc=${rc}"
|
||||||
|
status=1
|
||||||
|
fi
|
||||||
|
sed "s/^/[gpu ${id}] /" "${log}" || true
|
||||||
|
done
|
||||||
|
|
||||||
|
exit "${status}"
|
||||||
17
iso/overlay/usr/local/bin/bee-gpu-burn
Normal file → Executable file
17
iso/overlay/usr/local/bin/bee-gpu-burn
Normal file → Executable file
@@ -2,13 +2,14 @@
|
|||||||
set -eu
|
set -eu
|
||||||
|
|
||||||
SECONDS=5
|
SECONDS=5
|
||||||
|
STAGGER_SECONDS=0
|
||||||
SIZE_MB=0
|
SIZE_MB=0
|
||||||
DEVICES=""
|
DEVICES=""
|
||||||
EXCLUDE=""
|
EXCLUDE=""
|
||||||
WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
|
WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
echo "usage: $0 [--seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
|
echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
|
||||||
exit 2
|
exit 2
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -25,6 +26,7 @@ contains_csv() {
|
|||||||
while [ "$#" -gt 0 ]; do
|
while [ "$#" -gt 0 ]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
|
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
|
||||||
|
--stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
|
||||||
--size-mb|-m) [ "$#" -ge 2 ] || usage; SIZE_MB="$2"; shift 2 ;;
|
--size-mb|-m) [ "$#" -ge 2 ] || usage; SIZE_MB="$2"; shift 2 ;;
|
||||||
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
||||||
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
||||||
@@ -61,14 +63,18 @@ done
|
|||||||
|
|
||||||
echo "loader=bee-gpu-burn"
|
echo "loader=bee-gpu-burn"
|
||||||
echo "selected_gpus=${FINAL}"
|
echo "selected_gpus=${FINAL}"
|
||||||
|
echo "stagger_seconds=${STAGGER_SECONDS}"
|
||||||
|
|
||||||
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
||||||
|
|
||||||
TMP_DIR=$(mktemp -d)
|
TMP_DIR=$(mktemp -d)
|
||||||
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
|
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
|
||||||
|
|
||||||
|
GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
|
||||||
|
gpu_pos=0
|
||||||
WORKERS=""
|
WORKERS=""
|
||||||
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
||||||
|
gpu_pos=$((gpu_pos + 1))
|
||||||
log="${TMP_DIR}/gpu-${id}.log"
|
log="${TMP_DIR}/gpu-${id}.log"
|
||||||
gpu_size_mb="${SIZE_MB}"
|
gpu_size_mb="${SIZE_MB}"
|
||||||
if [ "${gpu_size_mb}" -le 0 ] 2>/dev/null; then
|
if [ "${gpu_size_mb}" -le 0 ] 2>/dev/null; then
|
||||||
@@ -79,11 +85,16 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
|
|||||||
gpu_size_mb=512
|
gpu_size_mb=512
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
echo "starting gpu ${id} size=${gpu_size_mb}MB"
|
extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
|
||||||
|
gpu_seconds=$(( SECONDS + extra_sec ))
|
||||||
|
echo "starting gpu ${id} size=${gpu_size_mb}MB seconds=${gpu_seconds}"
|
||||||
CUDA_VISIBLE_DEVICES="${id}" \
|
CUDA_VISIBLE_DEVICES="${id}" \
|
||||||
"${WORKER}" --device 0 --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
|
"${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
|
||||||
pid=$!
|
pid=$!
|
||||||
WORKERS="${WORKERS} ${pid}:${id}:${log}"
|
WORKERS="${WORKERS} ${pid}:${id}:${log}"
|
||||||
|
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
|
||||||
|
sleep "${STAGGER_SECONDS}"
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
status=0
|
status=0
|
||||||
|
|||||||
16
iso/overlay/usr/local/bin/bee-john-gpu-stress
Normal file → Executable file
16
iso/overlay/usr/local/bin/bee-john-gpu-stress
Normal file → Executable file
@@ -2,6 +2,7 @@
|
|||||||
set -eu
|
set -eu
|
||||||
|
|
||||||
DURATION_SEC=300
|
DURATION_SEC=300
|
||||||
|
STAGGER_SECONDS=0
|
||||||
DEVICES=""
|
DEVICES=""
|
||||||
EXCLUDE=""
|
EXCLUDE=""
|
||||||
FORMAT=""
|
FORMAT=""
|
||||||
@@ -12,7 +13,7 @@ export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
|
|||||||
export LD_LIBRARY_PATH="/usr/lib:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
|
export LD_LIBRARY_PATH="/usr/lib:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
echo "usage: $0 [--seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
|
echo "usage: $0 [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
|
||||||
exit 2
|
exit 2
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -118,6 +119,7 @@ ensure_opencl_ready() {
|
|||||||
while [ "$#" -gt 0 ]; do
|
while [ "$#" -gt 0 ]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
--seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
|
--seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
|
||||||
|
--stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
|
||||||
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
||||||
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
||||||
--format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
|
--format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
|
||||||
@@ -170,6 +172,7 @@ done
|
|||||||
echo "loader=john"
|
echo "loader=john"
|
||||||
echo "selected_gpus=${FINAL}"
|
echo "selected_gpus=${FINAL}"
|
||||||
echo "john_devices=${JOHN_DEVICES}"
|
echo "john_devices=${JOHN_DEVICES}"
|
||||||
|
echo "stagger_seconds=${STAGGER_SECONDS}"
|
||||||
|
|
||||||
cd "${JOHN_DIR}"
|
cd "${JOHN_DIR}"
|
||||||
|
|
||||||
@@ -232,14 +235,21 @@ trap cleanup EXIT INT TERM
|
|||||||
echo "format=${CHOSEN_FORMAT}"
|
echo "format=${CHOSEN_FORMAT}"
|
||||||
echo "target_seconds=${DURATION_SEC}"
|
echo "target_seconds=${DURATION_SEC}"
|
||||||
echo "slice_seconds=${TEST_SLICE_SECONDS}"
|
echo "slice_seconds=${TEST_SLICE_SECONDS}"
|
||||||
DEADLINE=$(( $(date +%s) + DURATION_SEC ))
|
TOTAL_DEVICES=$(echo "${JOHN_DEVICES}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
|
||||||
_first=1
|
_first=1
|
||||||
|
pos=0
|
||||||
for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
|
for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
|
||||||
|
pos=$((pos + 1))
|
||||||
[ "${_first}" = "1" ] || sleep 3
|
[ "${_first}" = "1" ] || sleep 3
|
||||||
_first=0
|
_first=0
|
||||||
run_john_loop "${opencl_id}" "${DEADLINE}" &
|
extra_sec=$(( STAGGER_SECONDS * (TOTAL_DEVICES - pos) ))
|
||||||
|
deadline=$(( $(date +%s) + DURATION_SEC + extra_sec ))
|
||||||
|
run_john_loop "${opencl_id}" "${deadline}" &
|
||||||
pid=$!
|
pid=$!
|
||||||
PIDS="${PIDS} ${pid}"
|
PIDS="${PIDS} ${pid}"
|
||||||
|
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${pos}" -lt "${TOTAL_DEVICES}" ]; then
|
||||||
|
sleep "${STAGGER_SECONDS}"
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
FAIL=0
|
FAIL=0
|
||||||
for pid in ${PIDS}; do
|
for pid in ${PIDS}; do
|
||||||
|
|||||||
@@ -21,8 +21,13 @@ read_nvidia_modules_flavor() {
|
|||||||
|
|
||||||
log "kernel: $(uname -r)"
|
log "kernel: $(uname -r)"
|
||||||
|
|
||||||
# Skip if no NVIDIA GPU present (PCI vendor 10de)
|
# Skip if no NVIDIA display/compute GPU is present.
|
||||||
if ! lspci -nn 2>/dev/null | grep -qi '10de:'; then
|
# Match only display-class PCI functions (0300 VGA, 0302 3D controller) from vendor 10de.
|
||||||
|
have_nvidia_gpu() {
|
||||||
|
lspci -Dn 2>/dev/null | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ { found=1; exit } END { exit(found ? 0 : 1) }'
|
||||||
|
}
|
||||||
|
|
||||||
|
if ! have_nvidia_gpu; then
|
||||||
log "no NVIDIA GPU detected — skipping module load"
|
log "no NVIDIA GPU detected — skipping module load"
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ log() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
have_nvidia_gpu() {
|
have_nvidia_gpu() {
|
||||||
lspci -nn 2>/dev/null | grep -qi '10de:'
|
lspci -Dn 2>/dev/null | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ { found=1; exit } END { exit(found ? 0 : 1) }'
|
||||||
}
|
}
|
||||||
|
|
||||||
service_active() {
|
service_active() {
|
||||||
|
|||||||
Reference in New Issue
Block a user