A simple script for benchmarking DeepSeek throughput, here using DeepSeek-R1-Distill-Qwen-32B. It can run on a single 24 GB RTX 4090: the 32B fp16 weights do not fit entirely in VRAM, so device_map="auto" offloads part of the model to CPU memory and throughput drops accordingly; adjust the batch sizes and sequence lengths to your hardware. In principle any model is supported, just change the model name as needed. The key points are that pip uses a domestic (mainland China) mirror and the model is downloaded from Alibaba's ModelScope, so downloads work without obstruction.
At the end it generates a txt and an html report.
The prerequisites are Python and python-venv; if you prefer not to use a virtual environment, a small edit to the script will remove it.
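If python3-venv is not installed yet, install it first; a minimal sketch, assuming a Debian/Ubuntu system (package names differ on other distributions):

sudo apt update
sudo apt install -y python3 python3-venv python3-pip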
#!/bin/bash
set -e
# Parameter configuration
MODEL_REPO="deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
BATCH_SIZES=(1 2 4 10 20 30 50) # the 32B model needs a lot of VRAM; reduce batch_size if necessary
SEQ_LENGTHS=(256 512)
WARMUP_STEPS=3
MEASURE_STEPS=10
VENV_NAME="deepseek_test"
REPORT_FILE="benchmark_report.txt"
HTML_REPORT_FILE="benchmark_report.html"

# Create the virtual environment
echo "Creating Python virtual environment..."
python3 -m venv $VENV_NAME
source $VENV_NAME/bin/activate

# Configure a domestic pip mirror
echo "Configuring domestic pip mirror..."
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
pip config set global.trusted-host pypi.tuna.tsinghua.edu.cn

# Install dependencies
echo "Installing dependencies..."
pip install torch transformers modelscope accelerate

# Benchmark function
run_benchmark() {
    local batch_size=$1
    local seq_length=$2
    echo -e "\nTest configuration: batch_size=${batch_size}, seq_length=${seq_length}"
    python3 - <<EOF
import torch
from modelscope import AutoModelForCausalLM, AutoTokenizer
import time

model_name = "${MODEL_REPO}"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model and tokenizer
print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

# Explicitly set the pad token
if tokenizer.eos_token is None:
    eos_token = tokenizer.convert_ids_to_tokens(tokenizer.eos_token_id)
    tokenizer.add_special_tokens({'pad_token': eos_token})
else:
    tokenizer.pad_token = tokenizer.eos_token

# Prepare the input
input_text = "測試" * ${seq_length}  # filler prompt: a short string repeated seq_length times
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
input_ids = inputs.input_ids.repeat(${batch_size}, 1).to(device)
attention_mask = inputs.attention_mask.repeat(${batch_size}, 1).to(device)

# Warm-up runs
print("Warming up...")
for _ in range(${WARMUP_STEPS}):
    _ = model.generate(input_ids, attention_mask=attention_mask, max_length=input_ids.shape[1] + 10)

# Timed runs
print("Running benchmark...")
start_time = time.time()
for _ in range(${MEASURE_STEPS}):
    _ = model.generate(input_ids, attention_mask=attention_mask, max_length=input_ids.shape[1] + 10)
elapsed = time.time() - start_time

# Compute metrics (10 new tokens are generated per sequence)
avg_latency = elapsed / ${MEASURE_STEPS}
tokens_per_sec = (${batch_size} * 10) / avg_latency

print(f"Average latency: {avg_latency:.3f}s")
print(f"Throughput: {tokens_per_sec:.2f} tokens/s")
print(f"Peak GPU memory: {torch.cuda.max_memory_allocated()/1024**3:.2f} GB")
EOF
}

# Main flow
echo "$MODEL_REPO performance test started" > $REPORT_FILE
echo "GPU info:" >> $REPORT_FILE
nvidia-smi --query-gpu=name,memory.total --format=csv >> $REPORT_FILE

echo "<html><body><h1>DeepSeek-R1-Distill-Qwen-32B Performance Test Report</h1>" > $HTML_REPORT_FILE
echo "<p>GPU info:</p>" >> $HTML_REPORT_FILE
nvidia-smi --query-gpu=name,memory.total --format=csv | sed 's/^/<p>/' | sed 's/$/<\/p>/' >> $HTML_REPORT_FILE

for bs in "${BATCH_SIZES[@]}"; do
    for seq in "${SEQ_LENGTHS[@]}"; do
        echo -e "\nTest configuration: batch_size=${bs}, seq_length=${seq}" >> $REPORT_FILE
        echo "<h2>Test configuration: batch_size=${bs}, seq_length=${seq}</h2>" >> $HTML_REPORT_FILE
        run_benchmark $bs $seq | tee -a $REPORT_FILE | sed 's/^/<p>/' | sed 's/$/<\/p>/' >> $HTML_REPORT_FILE
    done
done

echo "</body></html>" >> $HTML_REPORT_FILE

deactivate
echo "Test complete"