Files
ProjectAGiPrompt/32-WDD-AI服务器/4-nvidia-监控.md
2026-06-08 09:28:55 +08:00

2.2 KiB

#!/bin/bash

# Print the table header: one line of column titles followed by a
# 100-character dashed rule. Column widths mirror the per-sample row format.
print_header() {
  printf "%-26s | %6s | %6s | %12s | %7s | %6s | %6s | %14s\n" \
    "时间戳" "GPU%" "显存%" "显存用/总MiB" "功耗W" "温度°C" "CPU%" "RAM用/总GiB"
  # '%.0s-' prints zero characters of each argument followed by a dash, so the
  # 100 brace-expanded arguments yield exactly 100 dashes.
  printf '%s\n' "$(printf '%.0s-' {1..100})"
}

print_header

# Previous CPU counter sample (total and non-idle jiffies). Both start at 0 so
# the first loop iteration can tell that no earlier sample exists yet.
prev_total=0
prev_used=0

# Main sampling loop: once per second, read GPU 0, CPU and RAM figures and
# print them as one table row. Runs until the script is interrupted.
while true; do

  # ── GPU metrics ──────────────────────────────────────────
  # Ask nvidia-smi for a single header-less, unit-less CSV line for GPU 0.
  # Its stderr is discarded on purpose; if the query fails, read leaves every
  # field empty and the placeholders below are shown instead.
  IFS=',' read -r ts gpu_util mem_used mem_total pwr temp < <(
    nvidia-smi \
      --query-gpu=timestamp,utilization.gpu,memory.used,memory.total,power.draw,temperature.gpu \
      --format=csv,noheader,nounits \
      --id=0 2>/dev/null | head -1
  )

  # Strip leading/trailing whitespace from every CSV field (the values follow
  # ", " separators) without forking, and substitute N/A for empty fields.
  for field in ts gpu_util mem_used mem_total pwr temp; do
    value=${!field}
    value=${value#"${value%%[![:space:]]*}"}
    value=${value%"${value##*[![:space:]]}"}
    printf -v "$field" '%s' "${value:-N/A}"
  done

  # GPU memory utilisation in percent. Values are passed with -v rather than
  # spliced into the awk program; a missing or zero total yields N/A instead of
  # a division-by-zero error.
  mem_util=$(awk -v used="$mem_used" -v total="$mem_total" \
    'BEGIN { if (total + 0 > 0) { printf "%.1f", used / total * 100 } else { printf "N/A" } }')

  # ── CPU utilisation (delta between two samples) ──────────
  # Fields 1-7 of the aggregate "cpu" line are summed as the total; fields 4
  # and 5 (idle and iowait per proc(5)) are excluded from the busy count.
  read -ra cpu < <(grep '^cpu ' /proc/stat)
  total=$(( cpu[1] + cpu[2] + cpu[3] + cpu[4] + cpu[5] + cpu[6] + cpu[7] ))
  used=$(( total - cpu[4] - cpu[5] ))
  delta_total=$(( total - prev_total ))
  # Report 0 on the first pass (no previous sample) and whenever the counters
  # did not advance, which would otherwise divide by zero.
  if (( prev_total > 0 && delta_total > 0 )); then
    cpu_pct=$(( (used - prev_used) * 100 / delta_total ))
  else
    cpu_pct=0
  fi
  prev_total=$total
  prev_used=$used

  # ── RAM usage ────────────────────────────────────────────
  # One awk pass over /proc/meminfo gives used (MemTotal - MemAvailable) and
  # total memory, converted from KiB to GiB.
  read -r mem_used_gib mem_total_gib < <(
    awk '/^MemTotal/     { total = $2 }
         /^MemAvailable/ { avail = $2 }
         END { printf "%.2f %.2f\n", (total - avail) / 1024 / 1024, total / 1024 / 1024 }' \
      /proc/meminfo
  )

  # ── Emit one row ─────────────────────────────────────────
  printf "%-26s | %5s%% | %5s%% | %5s / %-5s | %6s | %5s | %5d%% | %5s / %s\n" \
    "$ts" "$gpu_util" "$mem_util" \
    "$mem_used" "$mem_total" \
    "$pwr" "$temp" "$cpu_pct" \
    "$mem_used_gib" "$mem_total_gib"

  sleep 1
done