Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
bb1218ddd4 | ||
|
|
65faae8ede |
@@ -15,6 +15,7 @@ const nvidiaVendorID = 0x10de
|
|||||||
type nvidiaGPUInfo struct {
|
type nvidiaGPUInfo struct {
|
||||||
Index int
|
Index int
|
||||||
BDF string
|
BDF string
|
||||||
|
Name string
|
||||||
Serial string
|
Serial string
|
||||||
VBIOS string
|
VBIOS string
|
||||||
TemperatureC *float64
|
TemperatureC *float64
|
||||||
@@ -73,6 +74,9 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if v := strings.TrimSpace(info.Name); v != "" {
|
||||||
|
devs[i].Model = &v
|
||||||
|
}
|
||||||
if v := strings.TrimSpace(info.Serial); v != "" {
|
if v := strings.TrimSpace(info.Serial); v != "" {
|
||||||
devs[i].SerialNumber = &v
|
devs[i].SerialNumber = &v
|
||||||
}
|
}
|
||||||
@@ -99,7 +103,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
|||||||
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
|
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
|
||||||
out, err := exec.Command(
|
out, err := exec.Command(
|
||||||
"nvidia-smi",
|
"nvidia-smi",
|
||||||
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
|
"--query-gpu=index,pci.bus_id,name,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
|
||||||
"--format=csv,noheader,nounits",
|
"--format=csv,noheader,nounits",
|
||||||
).Output()
|
).Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -123,8 +127,8 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
|||||||
if len(rec) == 0 {
|
if len(rec) == 0 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if len(rec) < 13 {
|
if len(rec) < 14 {
|
||||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 13", len(rec))
|
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 14", len(rec))
|
||||||
}
|
}
|
||||||
|
|
||||||
bdf := normalizePCIeBDF(rec[1])
|
bdf := normalizePCIeBDF(rec[1])
|
||||||
@@ -135,17 +139,18 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
|||||||
info := nvidiaGPUInfo{
|
info := nvidiaGPUInfo{
|
||||||
Index: parseRequiredInt(rec[0]),
|
Index: parseRequiredInt(rec[0]),
|
||||||
BDF: bdf,
|
BDF: bdf,
|
||||||
Serial: strings.TrimSpace(rec[2]),
|
Name: strings.TrimSpace(rec[2]),
|
||||||
VBIOS: strings.TrimSpace(rec[3]),
|
Serial: strings.TrimSpace(rec[3]),
|
||||||
TemperatureC: parseMaybeFloat(rec[4]),
|
VBIOS: strings.TrimSpace(rec[4]),
|
||||||
PowerW: parseMaybeFloat(rec[5]),
|
TemperatureC: parseMaybeFloat(rec[5]),
|
||||||
ECCUncorrected: parseMaybeInt64(rec[6]),
|
PowerW: parseMaybeFloat(rec[6]),
|
||||||
ECCCorrected: parseMaybeInt64(rec[7]),
|
ECCUncorrected: parseMaybeInt64(rec[7]),
|
||||||
HWSlowdown: parseMaybeBool(rec[8]),
|
ECCCorrected: parseMaybeInt64(rec[8]),
|
||||||
PCIeLinkGenCurrent: parseMaybeInt(rec[9]),
|
HWSlowdown: parseMaybeBool(rec[9]),
|
||||||
PCIeLinkGenMax: parseMaybeInt(rec[10]),
|
PCIeLinkGenCurrent: parseMaybeInt(rec[10]),
|
||||||
PCIeLinkWidthCur: parseMaybeInt(rec[11]),
|
PCIeLinkGenMax: parseMaybeInt(rec[11]),
|
||||||
PCIeLinkWidthMax: parseMaybeInt(rec[12]),
|
PCIeLinkWidthCur: parseMaybeInt(rec[12]),
|
||||||
|
PCIeLinkWidthMax: parseMaybeInt(rec[13]),
|
||||||
}
|
}
|
||||||
result[bdf] = info
|
result[bdf] = info
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func TestParseNVIDIASMIQuery(t *testing.T) {
|
func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||||
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
|
raw := "0, 00000000:65:00.0, NVIDIA H100 80GB HBM3, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
|
||||||
byBDF, err := parseNVIDIASMIQuery(raw)
|
byBDF, err := parseNVIDIASMIQuery(raw)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("parse failed: %v", err)
|
t.Fatalf("parse failed: %v", err)
|
||||||
@@ -16,6 +16,9 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
|
|||||||
if !ok {
|
if !ok {
|
||||||
t.Fatalf("gpu by normalized bdf not found")
|
t.Fatalf("gpu by normalized bdf not found")
|
||||||
}
|
}
|
||||||
|
if gpu.Name != "NVIDIA H100 80GB HBM3" {
|
||||||
|
t.Fatalf("name: got %q", gpu.Name)
|
||||||
|
}
|
||||||
if gpu.Serial != "GPU-SERIAL-1" {
|
if gpu.Serial != "GPU-SERIAL-1" {
|
||||||
t.Fatalf("serial: got %q", gpu.Serial)
|
t.Fatalf("serial: got %q", gpu.Serial)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -79,6 +79,25 @@ func shouldIncludePCIeDevice(class, vendor, device string) bool {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Exclude BMC/management virtual VGA adapters — these are firmware video chips,
|
||||||
|
// not real GPUs, and pollute the GPU inventory (e.g. iBMC, iDRAC, iLO VGA).
|
||||||
|
if strings.Contains(c, "vga") || strings.Contains(c, "display") || strings.Contains(c, "3d") {
|
||||||
|
bmcPatterns := []string{
|
||||||
|
"management system chip",
|
||||||
|
"management controller",
|
||||||
|
"ibmc",
|
||||||
|
"idrac",
|
||||||
|
"ilo vga",
|
||||||
|
"aspeed",
|
||||||
|
"matrox",
|
||||||
|
}
|
||||||
|
for _, bad := range bmcPatterns {
|
||||||
|
if strings.Contains(d, bad) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if strings.Contains(v, "advanced micro devices") || strings.Contains(v, "[amd]") {
|
if strings.Contains(v, "advanced micro devices") || strings.Contains(v, "[amd]") {
|
||||||
internalAMDPatterns := []string{
|
internalAMDPatterns := []string{
|
||||||
"dummy function",
|
"dummy function",
|
||||||
|
|||||||
@@ -29,6 +29,8 @@ func TestShouldIncludePCIeDevice(t *testing.T) {
|
|||||||
{name: "raid", class: "RAID bus controller", want: true},
|
{name: "raid", class: "RAID bus controller", want: true},
|
||||||
{name: "nvme", class: "Non-Volatile memory controller", want: true},
|
{name: "nvme", class: "Non-Volatile memory controller", want: true},
|
||||||
{name: "vga", class: "VGA compatible controller", want: true},
|
{name: "vga", class: "VGA compatible controller", want: true},
|
||||||
|
{name: "ibmc vga", class: "VGA compatible controller", vendor: "Huawei Technologies Co., Ltd.", device: "Hi171x Series [iBMC Intelligent Management system chip w/VGA support]", want: false},
|
||||||
|
{name: "aspeed vga", class: "VGA compatible controller", vendor: "ASPEED Technology, Inc.", device: "ASPEED Graphics Family", want: false},
|
||||||
{name: "other encryption controller", class: "Encryption controller", vendor: "Intel Corporation", device: "QuickAssist", want: true},
|
{name: "other encryption controller", class: "Encryption controller", vendor: "Intel Corporation", device: "QuickAssist", want: true},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1602,8 +1602,8 @@ function runAllSAT() {
|
|||||||
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
|
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
|
||||||
const status = document.getElementById('sat-all-status');
|
const status = document.getElementById('sat-all-status');
|
||||||
status.textContent = 'Enqueuing...';
|
status.textContent = 'Enqueuing...';
|
||||||
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth', 'hpl'];
|
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||||||
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','hpl','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
||||||
const activeTargets = baseTargets.filter(target => {
|
const activeTargets = baseTargets.filter(target => {
|
||||||
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
|
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
|
||||||
const btn = document.getElementById('sat-btn-' + target);
|
const btn = document.getElementById('sat-btn-' + target);
|
||||||
|
|||||||
Reference in New Issue
Block a user