Improve validate GPU runs and web UI feedback
This commit is contained in:
@@ -6,6 +6,7 @@ import (
|
|||||||
"sort"
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"bee/audit/internal/platform"
|
"bee/audit/internal/platform"
|
||||||
@@ -52,6 +53,12 @@ var metricChartPalette = []string{
|
|||||||
"#ffbe5c",
|
"#ffbe5c",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// gpuLabelCache memoizes GPU model names for chart titles and legends so
// that rendering a chart does not have to query the platform layer each time.
// gpuModelNameByIndex reads and refreshes it.
var gpuLabelCache struct {
	// mu guards loadedAt and byIndex.
	mu sync.Mutex
	// loadedAt records when byIndex was last (re)loaded.
	loadedAt time.Time
	// byIndex maps a GPU index to its model name.
	byIndex map[int]string
}
|
||||||
|
|
||||||
func renderMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMin, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
|
func renderMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMin, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
|
||||||
pointCount := len(labels)
|
pointCount := len(labels)
|
||||||
if len(times) > pointCount {
|
if len(times) > pointCount {
|
||||||
@@ -76,15 +83,7 @@ func renderMetricChartSVG(title string, labels []string, times []time.Time, data
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
mn, avg, mx := globalStats(datasets)
|
statsLabel := chartStatsLabel(datasets)
|
||||||
if mx > 0 {
|
|
||||||
title = fmt.Sprintf("%s ↓%s ~%s ↑%s",
|
|
||||||
title,
|
|
||||||
chartLegendNumber(mn),
|
|
||||||
chartLegendNumber(avg),
|
|
||||||
chartLegendNumber(mx),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
legendItems := []metricChartSeries{}
|
legendItems := []metricChartSeries{}
|
||||||
for i, name := range names {
|
for i, name := range names {
|
||||||
@@ -106,7 +105,7 @@ func renderMetricChartSVG(title string, labels []string, times []time.Time, data
|
|||||||
|
|
||||||
var b strings.Builder
|
var b strings.Builder
|
||||||
writeSVGOpen(&b, layout.Width, layout.Height)
|
writeSVGOpen(&b, layout.Width, layout.Height)
|
||||||
writeChartFrame(&b, title, layout.Width, layout.Height)
|
writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
|
||||||
writeTimelineIdleSpans(&b, layout, start, end, timeline)
|
writeTimelineIdleSpans(&b, layout, start, end, timeline)
|
||||||
writeVerticalGrid(&b, layout, times, pointCount, 8)
|
writeVerticalGrid(&b, layout, times, pointCount, 8)
|
||||||
writeHorizontalGrid(&b, layout, scale)
|
writeHorizontalGrid(&b, layout, scale)
|
||||||
@@ -133,7 +132,7 @@ func renderGPUOverviewChartSVG(idx int, samples []platform.LiveMetricSample, tim
|
|||||||
labels := sampleTimeLabels(samples)
|
labels := sampleTimeLabels(samples)
|
||||||
times := sampleTimes(samples)
|
times := sampleTimes(samples)
|
||||||
svg, err := drawGPUOverviewChartSVG(
|
svg, err := drawGPUOverviewChartSVG(
|
||||||
fmt.Sprintf("GPU %d Overview", idx),
|
gpuDisplayLabel(idx)+" Overview",
|
||||||
labels,
|
labels,
|
||||||
times,
|
times,
|
||||||
[]metricChartSeries{
|
[]metricChartSeries{
|
||||||
@@ -214,7 +213,7 @@ func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, s
|
|||||||
|
|
||||||
var b strings.Builder
|
var b strings.Builder
|
||||||
writeSVGOpen(&b, width, height)
|
writeSVGOpen(&b, width, height)
|
||||||
writeChartFrame(&b, title, width, height)
|
writeChartFrame(&b, title, "", width, height)
|
||||||
writeTimelineIdleSpans(&b, layout, start, end, timeline)
|
writeTimelineIdleSpans(&b, layout, start, end, timeline)
|
||||||
writeVerticalGrid(&b, layout, times, pointCount, 8)
|
writeVerticalGrid(&b, layout, times, pointCount, 8)
|
||||||
writeHorizontalGrid(&b, layout, scales[0])
|
writeHorizontalGrid(&b, layout, scales[0])
|
||||||
@@ -457,10 +456,14 @@ func writeSVGClose(b *strings.Builder) {
|
|||||||
b.WriteString("</svg>\n")
|
b.WriteString("</svg>\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
func writeChartFrame(b *strings.Builder, title string, width, height int) {
|
func writeChartFrame(b *strings.Builder, title, subtitle string, width, height int) {
|
||||||
fmt.Fprintf(b, `<rect width="%d" height="%d" rx="10" ry="10" fill="#ffffff" stroke="#d7e0ea"/>`+"\n", width, height)
|
fmt.Fprintf(b, `<rect width="%d" height="%d" rx="10" ry="10" fill="#ffffff" stroke="#d7e0ea"/>`+"\n", width, height)
|
||||||
fmt.Fprintf(b, `<text x="%d" y="30" text-anchor="middle" font-family="sans-serif" font-size="16" font-weight="700" fill="#1f2937">%s</text>`+"\n",
|
fmt.Fprintf(b, `<text x="%d" y="30" text-anchor="middle" font-family="sans-serif" font-size="16" font-weight="700" fill="#1f2937">%s</text>`+"\n",
|
||||||
width/2, sanitizeChartText(title))
|
width/2, sanitizeChartText(title))
|
||||||
|
if strings.TrimSpace(subtitle) != "" {
|
||||||
|
fmt.Fprintf(b, `<text x="%d" y="50" text-anchor="middle" font-family="sans-serif" font-size="12" font-weight="600" fill="#64748b">%s</text>`+"\n",
|
||||||
|
width/2, sanitizeChartText(subtitle))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func writePlotBorder(b *strings.Builder, layout chartLayout) {
|
func writePlotBorder(b *strings.Builder, layout chartLayout) {
|
||||||
@@ -545,7 +548,21 @@ func writeSeriesPolyline(b *strings.Builder, layout chartLayout, times []time.Ti
|
|||||||
x := chartXForTime(chartPointTime(times, 0), start, end, layout.PlotLeft, layout.PlotRight)
|
x := chartXForTime(chartPointTime(times, 0), start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
y := chartYForValue(values[0], scale, layout.PlotTop, layout.PlotBottom)
|
y := chartYForValue(values[0], scale, layout.PlotTop, layout.PlotBottom)
|
||||||
fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="3.5" fill="%s"/>`+"\n", x, y, color)
|
fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="3.5" fill="%s"/>`+"\n", x, y, color)
|
||||||
|
return
|
||||||
}
|
}
|
||||||
|
peakIdx := 0
|
||||||
|
peakValue := values[0]
|
||||||
|
for idx, value := range values[1:] {
|
||||||
|
if value >= peakValue {
|
||||||
|
peakIdx = idx + 1
|
||||||
|
peakValue = value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
x := chartXForTime(chartPointTime(times, peakIdx), start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
y := chartYForValue(peakValue, scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="4.2" fill="%s" stroke="#ffffff" stroke-width="1.6"/>`+"\n", x, y, color)
|
||||||
|
fmt.Fprintf(b, `<path d="M %.1f %.1f L %.1f %.1f L %.1f %.1f Z" fill="%s" opacity="0.9"/>`+"\n",
|
||||||
|
x, y-10, x-5, y-18, x+5, y-18, color)
|
||||||
}
|
}
|
||||||
|
|
||||||
func writeLegend(b *strings.Builder, layout chartLayout, series []metricChartSeries) {
|
func writeLegend(b *strings.Builder, layout chartLayout, series []metricChartSeries) {
|
||||||
@@ -711,3 +728,49 @@ func valueClamp(value float64, scale chartScale) float64 {
|
|||||||
}
|
}
|
||||||
return value
|
return value
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func chartStatsLabel(datasets [][]float64) string {
|
||||||
|
mn, avg, mx := globalStats(datasets)
|
||||||
|
if mx <= 0 && avg <= 0 && mn <= 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("min %s avg %s max %s",
|
||||||
|
chartLegendNumber(mn),
|
||||||
|
chartLegendNumber(avg),
|
||||||
|
chartLegendNumber(mx),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
func gpuDisplayLabel(idx int) string {
|
||||||
|
if name := gpuModelNameByIndex(idx); name != "" {
|
||||||
|
return fmt.Sprintf("GPU %d — %s", idx, name)
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("GPU %d", idx)
|
||||||
|
}
|
||||||
|
|
||||||
|
func gpuModelNameByIndex(idx int) string {
|
||||||
|
now := time.Now()
|
||||||
|
gpuLabelCache.mu.Lock()
|
||||||
|
if now.Sub(gpuLabelCache.loadedAt) > 30*time.Second || gpuLabelCache.byIndex == nil {
|
||||||
|
gpuLabelCache.loadedAt = now
|
||||||
|
gpuLabelCache.byIndex = loadGPUModelNames()
|
||||||
|
}
|
||||||
|
name := strings.TrimSpace(gpuLabelCache.byIndex[idx])
|
||||||
|
gpuLabelCache.mu.Unlock()
|
||||||
|
return name
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadGPUModelNames() map[int]string {
|
||||||
|
out := map[int]string{}
|
||||||
|
gpus, err := platform.New().ListNvidiaGPUs()
|
||||||
|
if err != nil {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
for _, gpu := range gpus {
|
||||||
|
name := strings.TrimSpace(gpu.Name)
|
||||||
|
if name != "" {
|
||||||
|
out[gpu.Index] = name
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|||||||
@@ -860,6 +860,35 @@ func renderMetrics() string {
|
|||||||
<script>
|
<script>
|
||||||
let gpuChartKey = '';
|
let gpuChartKey = '';
|
||||||
const gpuChartModeStorageKey = 'bee.metrics.gpuChartMode';
|
const gpuChartModeStorageKey = 'bee.metrics.gpuChartMode';
|
||||||
|
let metricsNvidiaGPUsPromise = null;
|
||||||
|
|
||||||
|
// Fetches the NVIDIA GPU list from /api/gpu/nvidia and memoizes the promise
// in metricsNvidiaGPUsPromise so repeated layout syncs share one request.
// Always resolves to an array; on any failure it resolves to [].
function loadMetricsNvidiaGPUs() {
  if (!metricsNvidiaGPUsPromise) {
    metricsNvidiaGPUsPromise = fetch('/api/gpu/nvidia')
      .then(function(r) {
        if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
        return r.json();
      })
      .then(function(list) { return Array.isArray(list) ? list : []; })
      .catch(function() {
        // Do not memoize a failure: drop the cached promise so the next
        // call retries instead of showing unnamed GPUs until page reload.
        metricsNvidiaGPUsPromise = null;
        return [];
      });
  }
  return metricsNvidiaGPUsPromise;
}
|
||||||
|
|
||||||
|
// Builds an object mapping numeric GPU index -> GPU name from a list of
// {index, name} entries. Entries whose index is not a finite number or whose
// name is empty are skipped. A missing list yields an empty object.
function metricsGPUNameMap(list) {
  return (list || []).reduce(function(names, gpu) {
    const gpuIndex = Number(gpu.index);
    const usable = Number.isFinite(gpuIndex) && Boolean(gpu.name);
    if (usable) {
      names[gpuIndex] = gpu.name;
    }
    return names;
  }, {});
}
|
||||||
|
|
||||||
|
// Returns the label for a GPU on the metrics page: "GPU <idx> — <name>" when
// names has an entry for idx, otherwise just "GPU <idx>".
function metricsGPUDisplayLabel(idx, names) {
  const base = 'GPU ' + idx;
  const model = names && names[idx];
  if (!model) {
    return base;
  }
  return base + ' — ' + model;
}
|
||||||
|
|
||||||
function loadGPUChartModePreference() {
|
function loadGPUChartModePreference() {
|
||||||
try {
|
try {
|
||||||
@@ -909,14 +938,15 @@ function gpuIndices(rows) {
|
|||||||
return out.sort(function(a, b) { return a - b; });
|
return out.sort(function(a, b) { return a - b; });
|
||||||
}
|
}
|
||||||
|
|
||||||
function renderGPUOverviewCards(indices) {
|
function renderGPUOverviewCards(indices, names) {
|
||||||
const host = document.getElementById('gpu-metrics-by-gpu');
|
const host = document.getElementById('gpu-metrics-by-gpu');
|
||||||
if (!host) return;
|
if (!host) return;
|
||||||
host.innerHTML = indices.map(function(idx) {
|
host.innerHTML = indices.map(function(idx) {
|
||||||
|
const label = metricsGPUDisplayLabel(idx, names);
|
||||||
return '<div class="card" style="margin-bottom:16px">' +
|
return '<div class="card" style="margin-bottom:16px">' +
|
||||||
'<div class="card-head">GPU ' + idx + ' — Overview</div>' +
|
'<div class="card-head">' + label + ' — Overview</div>' +
|
||||||
'<div class="card-body" style="padding:8px">' +
|
'<div class="card-body" style="padding:8px">' +
|
||||||
'<img id="chart-gpu-' + idx + '-overview" data-chart-refresh="1" src="/api/metrics/chart/gpu/' + idx + '-overview.svg" style="width:100%;display:block;border-radius:6px" alt="GPU ' + idx + ' overview">' +
|
'<img id="chart-gpu-' + idx + '-overview" data-chart-refresh="1" src="/api/metrics/chart/gpu/' + idx + '-overview.svg" style="width:100%;display:block;border-radius:6px" alt="' + label + ' overview">' +
|
||||||
'</div></div>';
|
'</div></div>';
|
||||||
}).join('');
|
}).join('');
|
||||||
}
|
}
|
||||||
@@ -936,18 +966,21 @@ function syncMetricsLayout(d) {
|
|||||||
const section = document.getElementById('gpu-metrics-section');
|
const section = document.getElementById('gpu-metrics-section');
|
||||||
const summary = document.getElementById('gpu-metrics-summary');
|
const summary = document.getElementById('gpu-metrics-summary');
|
||||||
const indices = gpuIndices(d.gpus);
|
const indices = gpuIndices(d.gpus);
|
||||||
if (section) section.style.display = indices.length > 0 ? '' : 'none';
|
loadMetricsNvidiaGPUs().then(function(gpus) {
|
||||||
if (summary) {
|
const names = metricsGPUNameMap(gpus);
|
||||||
summary.textContent = indices.length > 0
|
if (section) section.style.display = indices.length > 0 ? '' : 'none';
|
||||||
? ('Detected GPUs: ' + indices.map(function(idx) { return 'GPU ' + idx; }).join(', '))
|
if (summary) {
|
||||||
: 'No GPUs detected in live metrics.';
|
summary.textContent = indices.length > 0
|
||||||
}
|
? ('Detected GPUs: ' + indices.map(function(idx) { return metricsGPUDisplayLabel(idx, names); }).join(', '))
|
||||||
const nextKey = indices.join(',');
|
: 'No GPUs detected in live metrics.';
|
||||||
if (nextKey !== gpuChartKey) {
|
}
|
||||||
renderGPUOverviewCards(indices);
|
const nextKey = indices.join(',') + '|' + indices.map(function(idx) { return names[idx] || ''; }).join(',');
|
||||||
gpuChartKey = nextKey;
|
if (nextKey !== gpuChartKey) {
|
||||||
}
|
renderGPUOverviewCards(indices, names);
|
||||||
applyGPUChartMode();
|
gpuChartKey = nextKey;
|
||||||
|
}
|
||||||
|
applyGPUChartMode();
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
function loadMetricsLayout() {
|
function loadMetricsLayout() {
|
||||||
@@ -1029,17 +1062,17 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
`</div>
|
`</div>
|
||||||
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||||
<div class="grid3">
|
<div class="grid3">
|
||||||
` + renderSATCard("nvidia", "NVIDIA GPU", "runSAT('nvidia')", "", renderValidateCardBody(
|
` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
|
||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Runs NVIDIA diagnostics and board inventory checks.`,
|
`Runs NVIDIA diagnostics and board inventory checks.`,
|
||||||
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||||
`Diag level is taken from Validate Profile.`,
|
`Runs one GPU at a time. Diag level is taken from Validate Profile.`,
|
||||||
)) +
|
)) +
|
||||||
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runSAT('nvidia-targeted-stress')", "", renderValidateCardBody(
|
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Runs a controlled NVIDIA DCGM load in Validate to check stability under moderate stress.`,
|
`Runs a controlled NVIDIA DCGM load in Validate to check stability under moderate stress.`,
|
||||||
`<code>dcgmi diag targeted_stress</code>`,
|
`<code>dcgmi diag targeted_stress</code>`,
|
||||||
`Uses the fixed DCGM targeted stress recipe.`,
|
`Runs one GPU at a time with the fixed DCGM targeted stress recipe.`,
|
||||||
)) +
|
)) +
|
||||||
`</div>
|
`</div>
|
||||||
<div class="grid3" style="margin-top:16px">
|
<div class="grid3" style="margin-top:16px">
|
||||||
@@ -1077,17 +1110,37 @@ function satCPUDurationFromDiagLevel() {
|
|||||||
function satLabels() {
|
function satLabels() {
|
||||||
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||||
}
|
}
|
||||||
function satRequestBody(target) {
|
let satNvidiaGPUsPromise = null;
|
||||||
|
// Fetches the NVIDIA GPU list from /api/gpu/nvidia for the Validate page and
// memoizes the promise in satNvidiaGPUsPromise. Resolves to an array; rejects
// when the request fails so callers can report the error.
function loadSatNvidiaGPUs() {
  if (!satNvidiaGPUsPromise) {
    satNvidiaGPUsPromise = fetch('/api/gpu/nvidia')
      .then(r => {
        if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
        return r.json();
      })
      .then(list => Array.isArray(list) ? list : [])
      .catch(err => {
        // Do not memoize a rejection: otherwise one failed request makes
        // every later NVIDIA validate run fail until the page is reloaded.
        satNvidiaGPUsPromise = null;
        throw err;
      });
  }
  return satNvidiaGPUsPromise;
}
|
||||||
|
// Returns the label for a GPU entry on the Validate page:
// "GPU <index> — <name>" when the entry has a name, otherwise "GPU <index>".
// A missing entry or a non-numeric index falls back to index 0.
function satGPUDisplayName(gpu) {
  const idx = (gpu && Number.isFinite(Number(gpu.index))) ? Number(gpu.index) : 0;
  const base = 'GPU ' + idx;
  // Without a name, return the bare label instead of repeating it
  // ("GPU 0 — GPU 0").
  if (!gpu || !gpu.name) {
    return base;
  }
  return base + ' — ' + gpu.name;
}
|
||||||
|
function satRequestBody(target, overrides) {
|
||||||
const body = {};
|
const body = {};
|
||||||
const labels = satLabels();
|
const labels = satLabels();
|
||||||
body.display_name = labels[target] || ('Validate ' + target);
|
body.display_name = labels[target] || ('Validate ' + target);
|
||||||
if (target === 'nvidia') body.diag_level = satDiagLevel();
|
if (target === 'nvidia') body.diag_level = satDiagLevel();
|
||||||
if (target === 'nvidia-targeted-stress') body.duration = 300;
|
if (target === 'nvidia-targeted-stress') body.duration = 300;
|
||||||
if (target === 'cpu') body.duration = satCPUDurationFromDiagLevel();
|
if (target === 'cpu') body.duration = satCPUDurationFromDiagLevel();
|
||||||
|
if (overrides) {
|
||||||
|
Object.keys(overrides).forEach(key => { body[key] = overrides[key]; });
|
||||||
|
}
|
||||||
return body;
|
return body;
|
||||||
}
|
}
|
||||||
function enqueueSATTarget(target) {
|
function enqueueSATTarget(target, overrides) {
|
||||||
return fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(satRequestBody(target))})
|
return fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(satRequestBody(target, overrides))})
|
||||||
.then(r => r.json());
|
.then(r => r.json());
|
||||||
}
|
}
|
||||||
function selectedAMDValidateTargets() {
|
function selectedAMDValidateTargets() {
|
||||||
@@ -1101,12 +1154,15 @@ function selectedAMDValidateTargets() {
|
|||||||
return targets;
|
return targets;
|
||||||
}
|
}
|
||||||
function runSAT(target) {
|
function runSAT(target) {
|
||||||
|
return runSATWithOverrides(target, null);
|
||||||
|
}
|
||||||
|
function runSATWithOverrides(target, overrides) {
|
||||||
if (satES) { satES.close(); satES = null; }
|
if (satES) { satES.close(); satES = null; }
|
||||||
document.getElementById('sat-output').style.display='block';
|
document.getElementById('sat-output').style.display='block';
|
||||||
document.getElementById('sat-title').textContent = '— ' + target;
|
document.getElementById('sat-title').textContent = '— ' + target;
|
||||||
const term = document.getElementById('sat-terminal');
|
const term = document.getElementById('sat-terminal');
|
||||||
term.textContent = 'Enqueuing ' + target + ' test...\n';
|
term.textContent = 'Enqueuing ' + target + ' test...\n';
|
||||||
return enqueueSATTarget(target)
|
return enqueueSATTarget(target, overrides)
|
||||||
.then(d => {
|
.then(d => {
|
||||||
term.textContent += 'Task ' + d.task_id + ' queued. Streaming log...\n';
|
term.textContent += 'Task ' + d.task_id + ' queued. Streaming log...\n';
|
||||||
satES = new EventSource('/api/tasks/'+d.task_id+'/stream');
|
satES = new EventSource('/api/tasks/'+d.task_id+'/stream');
|
||||||
@@ -1114,6 +1170,55 @@ function runSAT(target) {
|
|||||||
satES.addEventListener('done', e => { satES.close(); satES=null; term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
|
satES.addEventListener('done', e => { satES.close(); satES=null; term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
// Expands one validate target into the list of tasks to enqueue.
// Non-NVIDIA targets resolve to a single {target} item. The two NVIDIA
// targets resolve to one item per detected GPU, each carrying request
// overrides that pin the run to that GPU and give it a per-GPU display name.
function expandSATTarget(target) {
  const perGPU = target === 'nvidia' || target === 'nvidia-targeted-stress';
  if (!perGPU) {
    return Promise.resolve([{target: target}]);
  }
  return loadSatNvidiaGPUs().then(gpus => gpus.map(gpu => {
    const baseLabel = satLabels()[target] || ('Validate ' + target);
    const gpuLabel = satGPUDisplayName(gpu);
    return {
      target: target,
      overrides: {
        gpu_indices: [Number(gpu.index)],
        display_name: baseLabel + ' (' + gpuLabel + ')'
      },
      label: gpuLabel
    };
  }));
}
|
||||||
|
// Runs an NVIDIA validate target one GPU at a time.
//
// With a single GPU this delegates to runSATWithOverrides, which enqueues
// and streams that one task. With several GPUs it enqueues one task per GPU
// sequentially and streams only the log of the last task queued.
// When no GPU is detected, or when loading the GPU list or enqueuing fails,
// a message is appended to the SAT terminal instead of failing silently.
// Returns a promise that settles once all tasks have been enqueued.
function runNvidiaValidateSet(target) {
  // Appends a line to the SAT terminal and makes sure it is visible.
  const report = message => {
    document.getElementById('sat-output').style.display = 'block';
    const out = document.getElementById('sat-terminal');
    out.textContent += message + '\n';
    out.scrollTop = out.scrollHeight;
  };

  return loadSatNvidiaGPUs().then(gpus => {
    if (!gpus.length) {
      report('No NVIDIA GPUs detected; ' + target + ' was not queued.');
      return;
    }

    const labelBase = satLabels()[target] || ('Validate ' + target);

    if (gpus.length === 1) {
      const gpu = gpus[0];
      return runSATWithOverrides(target, {
        gpu_indices: [Number(gpu.index)],
        display_name: labelBase + ' (' + satGPUDisplayName(gpu) + ')'
      });
    }

    if (satES) { satES.close(); satES = null; }
    document.getElementById('sat-output').style.display = 'block';
    document.getElementById('sat-title').textContent = '— ' + target;
    const term = document.getElementById('sat-terminal');
    term.textContent = 'Enqueuing ' + target + ' tests one GPU at a time...\n';

    const enqueueNext = (idx) => {
      if (idx >= gpus.length) return Promise.resolve();
      const gpu = gpus[idx];
      const gpuLabel = satGPUDisplayName(gpu);
      // Return the chain so a failure anywhere reaches the catch below.
      return enqueueSATTarget(target, {
        gpu_indices: [Number(gpu.index)],
        display_name: labelBase + ' (' + gpuLabel + ')'
      }).then(d => {
        term.textContent += 'Task ' + d.task_id + ' queued for ' + gpuLabel + '.\n';
        if (idx === gpus.length - 1) {
          satES = new EventSource('/api/tasks/' + d.task_id + '/stream');
          satES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
          satES.addEventListener('done', e => { satES.close(); satES=null; term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
        }
        return enqueueNext(idx + 1);
      });
    };
    return enqueueNext(0);
  }).catch(err => {
    report('ERROR: ' + (err && err.message ? err.message : err));
  });
}
|
||||||
function runAMDValidateSet() {
|
function runAMDValidateSet() {
|
||||||
const targets = selectedAMDValidateTargets();
|
const targets = selectedAMDValidateTargets();
|
||||||
if (!targets.length) return;
|
if (!targets.length) return;
|
||||||
@@ -1142,25 +1247,38 @@ function runAMDValidateSet() {
|
|||||||
}
|
}
|
||||||
function runAllSAT() {
|
function runAllSAT() {
|
||||||
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
|
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
|
||||||
const targets = ['nvidia','nvidia-targeted-stress','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
|
||||||
const total = targets.length * cycles;
|
|
||||||
let enqueued = 0;
|
|
||||||
const status = document.getElementById('sat-all-status');
|
const status = document.getElementById('sat-all-status');
|
||||||
status.textContent = 'Enqueuing...';
|
status.textContent = 'Enqueuing...';
|
||||||
const enqueueNext = (cycle, idx) => {
|
const baseTargets = ['nvidia','nvidia-targeted-stress','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
||||||
if (cycle >= cycles) { status.textContent = 'Enqueued '+total+' tasks.'; return; }
|
const activeTargets = baseTargets.filter(target => {
|
||||||
if (idx >= targets.length) { enqueueNext(cycle+1, 0); return; }
|
|
||||||
const target = targets[idx];
|
|
||||||
const btn = document.getElementById('sat-btn-' + target);
|
const btn = document.getElementById('sat-btn-' + target);
|
||||||
if (btn && btn.disabled) { enqueueNext(cycle, idx+1); return; }
|
return !(btn && btn.disabled);
|
||||||
enqueueSATTarget(target)
|
});
|
||||||
.then(()=>{
|
Promise.all(activeTargets.map(expandSATTarget)).then(groups => {
|
||||||
enqueued++;
|
const expanded = [];
|
||||||
status.textContent = 'Enqueued '+enqueued+'/'+total+'...';
|
for (let cycle = 0; cycle < cycles; cycle++) {
|
||||||
enqueueNext(cycle, idx+1);
|
groups.forEach(group => group.forEach(item => expanded.push(item)));
|
||||||
});
|
}
|
||||||
};
|
const total = expanded.length;
|
||||||
enqueueNext(0, 0);
|
let enqueued = 0;
|
||||||
|
if (!total) {
|
||||||
|
status.textContent = 'No tasks selected.';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const enqueueNext = (idx) => {
|
||||||
|
if (idx >= expanded.length) { status.textContent = 'Enqueued ' + total + ' tasks.'; return; }
|
||||||
|
const item = expanded[idx];
|
||||||
|
enqueueSATTarget(item.target, item.overrides)
|
||||||
|
.then(() => {
|
||||||
|
enqueued++;
|
||||||
|
status.textContent = 'Enqueued ' + enqueued + '/' + total + '...';
|
||||||
|
enqueueNext(idx + 1);
|
||||||
|
});
|
||||||
|
};
|
||||||
|
enqueueNext(0);
|
||||||
|
}).catch(err => {
|
||||||
|
status.textContent = 'Error: ' + err.message;
|
||||||
|
});
|
||||||
}
|
}
|
||||||
</script>
|
</script>
|
||||||
<script>
|
<script>
|
||||||
@@ -2179,29 +2297,57 @@ function usbRefresh() {
|
|||||||
'<td>'+label+'</td>' +
|
'<td>'+label+'</td>' +
|
||||||
'<td style="font-size:12px;color:var(--muted)">'+model+'</td>' +
|
'<td style="font-size:12px;color:var(--muted)">'+model+'</td>' +
|
||||||
'<td style="white-space:nowrap">' +
|
'<td style="white-space:nowrap">' +
|
||||||
'<button class="btn btn-sm btn-primary" onclick="usbExport(\'audit\','+JSON.stringify(t)+')">Audit JSON</button> ' +
|
'<button class="btn btn-sm btn-primary" onclick="usbExport(\'audit\','+JSON.stringify(t)+',this)">Audit JSON</button> ' +
|
||||||
'<button class="btn btn-sm btn-secondary" onclick="usbExport(\'bundle\','+JSON.stringify(t)+')">Support Bundle</button>' +
|
'<button class="btn btn-sm btn-secondary" onclick="usbExport(\'bundle\','+JSON.stringify(t)+',this)">Support Bundle</button>' +
|
||||||
|
'<div class="usb-row-msg" style="margin-top:6px;font-size:12px;color:var(--muted)"></div>' +
|
||||||
'</td></tr>';
|
'</td></tr>';
|
||||||
}).join('') + '</table>';
|
}).join('') + '</table>';
|
||||||
}).catch(e => {
|
}).catch(e => {
|
||||||
document.getElementById('usb-status').textContent = 'Error: ' + e;
|
document.getElementById('usb-status').textContent = 'Error: ' + e;
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
window.usbExport = function(type, target) {
|
window.usbExport = function(type, target, btn) {
|
||||||
const msg = document.getElementById('usb-msg');
|
const msg = document.getElementById('usb-msg');
|
||||||
|
const row = btn ? btn.closest('td') : null;
|
||||||
|
const rowMsg = row ? row.querySelector('.usb-row-msg') : null;
|
||||||
|
const originalText = btn ? btn.textContent : '';
|
||||||
|
if (btn) {
|
||||||
|
btn.disabled = true;
|
||||||
|
btn.textContent = 'Exporting...';
|
||||||
|
}
|
||||||
|
if (rowMsg) {
|
||||||
|
rowMsg.style.color = 'var(--muted)';
|
||||||
|
rowMsg.textContent = 'Working...';
|
||||||
|
}
|
||||||
msg.style.color = 'var(--muted)';
|
msg.style.color = 'var(--muted)';
|
||||||
msg.textContent = 'Exporting to ' + (target.device||'') + '...';
|
msg.textContent = 'Exporting ' + (type === 'bundle' ? 'support bundle' : 'audit JSON') + ' to ' + (target.device||'') + '...';
|
||||||
fetch('/api/export/usb/'+type, {
|
fetch('/api/export/usb/'+type, {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
headers: {'Content-Type':'application/json'},
|
headers: {'Content-Type':'application/json'},
|
||||||
body: JSON.stringify(target)
|
body: JSON.stringify(target)
|
||||||
}).then(r=>r.json()).then(d => {
|
}).then(async r => {
|
||||||
if (d.error) { msg.style.color='var(--err,red)'; msg.textContent = 'Error: '+d.error; return; }
|
const d = await r.json();
|
||||||
|
if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status));
|
||||||
|
return d;
|
||||||
|
}).then(d => {
|
||||||
msg.style.color = 'var(--ok,green)';
|
msg.style.color = 'var(--ok,green)';
|
||||||
msg.textContent = d.message || 'Done.';
|
msg.textContent = d.message || 'Done.';
|
||||||
|
if (rowMsg) {
|
||||||
|
rowMsg.style.color = 'var(--ok,green)';
|
||||||
|
rowMsg.textContent = d.message || 'Done.';
|
||||||
|
}
|
||||||
}).catch(e => {
|
}).catch(e => {
|
||||||
msg.style.color = 'var(--err,red)';
|
msg.style.color = 'var(--err,red)';
|
||||||
msg.textContent = 'Error: '+e;
|
msg.textContent = 'Error: '+e;
|
||||||
|
if (rowMsg) {
|
||||||
|
rowMsg.style.color = 'var(--err,red)';
|
||||||
|
rowMsg.textContent = 'Error: ' + e;
|
||||||
|
}
|
||||||
|
}).finally(() => {
|
||||||
|
if (btn) {
|
||||||
|
btn.disabled = false;
|
||||||
|
btn.textContent = originalText;
|
||||||
|
}
|
||||||
});
|
});
|
||||||
};
|
};
|
||||||
window.usbRefresh = usbRefresh;
|
window.usbRefresh = usbRefresh;
|
||||||
|
|||||||
@@ -703,7 +703,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
}
|
}
|
||||||
switch sub {
|
switch sub {
|
||||||
case "load":
|
case "load":
|
||||||
title = fmt.Sprintf("GPU %d Load", idx)
|
title = gpuDisplayLabel(idx) + " Load"
|
||||||
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
||||||
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
||||||
if util == nil && mem == nil {
|
if util == nil && mem == nil {
|
||||||
@@ -714,7 +714,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
yMin = floatPtr(0)
|
yMin = floatPtr(0)
|
||||||
yMax = floatPtr(100)
|
yMax = floatPtr(100)
|
||||||
case "temp":
|
case "temp":
|
||||||
title = fmt.Sprintf("GPU %d Temperature", idx)
|
title = gpuDisplayLabel(idx) + " Temperature"
|
||||||
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||||
if temp == nil {
|
if temp == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false
|
||||||
@@ -724,7 +724,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
yMin = floatPtr(0)
|
yMin = floatPtr(0)
|
||||||
yMax = autoMax120(temp)
|
yMax = autoMax120(temp)
|
||||||
case "clock":
|
case "clock":
|
||||||
title = fmt.Sprintf("GPU %d Core Clock", idx)
|
title = gpuDisplayLabel(idx) + " Core Clock"
|
||||||
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
|
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
|
||||||
if clock == nil {
|
if clock == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false
|
||||||
@@ -733,7 +733,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
names = []string{"Core Clock MHz"}
|
names = []string{"Core Clock MHz"}
|
||||||
yMin, yMax = autoBounds120(clock)
|
yMin, yMax = autoBounds120(clock)
|
||||||
case "memclock":
|
case "memclock":
|
||||||
title = fmt.Sprintf("GPU %d Memory Clock", idx)
|
title = gpuDisplayLabel(idx) + " Memory Clock"
|
||||||
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
|
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
|
||||||
if clock == nil {
|
if clock == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false
|
||||||
@@ -742,7 +742,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
names = []string{"Memory Clock MHz"}
|
names = []string{"Memory Clock MHz"}
|
||||||
yMin, yMax = autoBounds120(clock)
|
yMin, yMax = autoBounds120(clock)
|
||||||
default:
|
default:
|
||||||
title = fmt.Sprintf("GPU %d Power", idx)
|
title = gpuDisplayLabel(idx) + " Power"
|
||||||
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
||||||
if power == nil {
|
if power == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false
|
||||||
@@ -871,7 +871,7 @@ func gpuDatasets(samples []platform.LiveMetricSample, pick func(platform.GPUMetr
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
datasets = append(datasets, ds)
|
datasets = append(datasets, ds)
|
||||||
names = append(names, fmt.Sprintf("GPU %d", idx))
|
names = append(names, gpuDisplayLabel(idx))
|
||||||
}
|
}
|
||||||
return datasets, names
|
return datasets, names
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -259,7 +259,7 @@ func TestRenderMetricsUsesBufferedChartRefresh(t *testing.T) {
|
|||||||
if !strings.Contains(body, `/api/metrics/chart/gpu-all-memclock.svg`) {
|
if !strings.Contains(body, `/api/metrics/chart/gpu-all-memclock.svg`) {
|
||||||
t.Fatalf("metrics page should include GPU memory clock chart: %s", body)
|
t.Fatalf("metrics page should include GPU memory clock chart: %s", body)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `renderGPUOverviewCards(indices)`) {
|
if !strings.Contains(body, `renderGPUOverviewCards(indices, names)`) {
|
||||||
t.Fatalf("metrics page should build per-GPU chart cards dynamically: %s", body)
|
t.Fatalf("metrics page should build per-GPU chart cards dynamically: %s", body)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user