# Dataset used: ceval-exam (C-Eval benchmark)
import requests
from datasets import load_dataset, concatenate_datasets
import re
from tqdm import tqdm
import re, time, tiktoken, ollama
from ollama import ChatResponse
from ollama import Options


def llm(model, query, temperature=0.6, stream=False, encoding=None, max_tokens=None):
    """Ask a local Ollama model one multiple-choice question and return its answer.

    Args:
        model: Ollama model name, e.g. "qwen2.5:3b-instruct-q8_0".
        query: fully formatted question text (stem plus the A-D options).
        temperature: sampling temperature forwarded to Ollama.
        stream: when True, consume the reply chunk by chunk so generation can
            be cut off once ``max_tokens`` is exceeded.
        encoding: tiktoken encoding used for token counting; defaults to the
            "gpt-4" encoding.  Resolved lazily instead of at import time so
            merely importing this module does not touch tiktoken's cache.
        max_tokens: soft cap on generated tokens; only honoured when
            ``stream`` is True.

    Returns:
        The model's reply with any ``<think>...</think>`` block removed and
        surrounding whitespace stripped.
    """
    if encoding is None:
        encoding = tiktoken.encoding_for_model("gpt-4")

    options = Options(
        temperature=temperature,
        num_gpu=0,  # num_gpu=0 forces CPU-only inference
    )
    response = ollama.chat(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "你是一個做題專家。請完成下列單項選擇題。\n\n## output format\n只能輸出一個選項編號字母,不要有解析等其他任何內容。",
            },
            {
                "role": "user",
                "content": query,
            },
        ],
        options=options,
        stream=stream,
        keep_alive=0,
    )

    if stream:
        # Accumulate chunks; abort generation once the token budget is spent.
        chunks = ""
        for chunk in response:
            chunks += chunk["message"]["content"]
            if max_tokens is not None and len(encoding.encode(chunks)) > max_tokens:
                break
        response = chunks
    else:
        response = response["message"]["content"]

    # Reasoning models (e.g. deepseek-r1) wrap chain-of-thought in <think> tags;
    # strip it so only the final answer letter remains.
    if '<think>' in response and '</think>' in response:
        response = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)
    return response.strip()


# C-Eval subject identifiers, aligned index-for-index with
# task_chinese_name_list defined below.
task_list = [
    "computer_network", "operating_system", "computer_architecture",
    "college_programming", "college_physics", "college_chemistry",
    "advanced_mathematics", "probability_and_statistics", "discrete_mathematics",
    "electrical_engineer", "metrology_engineer", "high_school_mathematics",
    "high_school_physics", "high_school_chemistry", "high_school_biology",
    "middle_school_mathematics", "middle_school_biology", "middle_school_physics",
    "middle_school_chemistry", "veterinary_medicine", "college_economics",
    "business_administration", "marxism", "mao_zedong_thought",
    "education_science", "teacher_qualification", "high_school_politics",
    "high_school_geography", "middle_school_politics", "middle_school_geography",
    "modern_chinese_history", "ideological_and_moral_cultivation", "logic",
    "law", "chinese_language_and_literature", "art_studies",
    "professional_tour_guide", "legal_professional", "high_school_chinese",
    "high_school_history", "middle_school_history", "civil_servant",
    "sports_science", "plant_protection", "basic_medicine", "clinical_medicine",
    "urban_and_rural_planner", "accountant", "fire_engineer",
    "environmental_impact_assessment_engineer", "tax_accountant", "physician",
]
# Chinese display names for each subject, aligned index-for-index with task_list.
task_chinese_name_list = [
    "計算機網絡", "操作系統", "計算機架構", "大學編程", "大學物理", "大學化學",
    "高等數學", "概率與統計", "離散數學", "電氣工程師", "計量工程師", "高中數學",
    "高中物理", "高中化學", "高中生物學", "中學數學", "中學生物學", "中學物理",
    "中學化學", "獸醫學", "大學經濟學", "工商管理", "馬克思主義", "毛澤東思想",
    "教育科學", "教師資格", "高中政治", "高中地理", "中學政治", "中學地理",
    "現代中國史", "思想道德修養", "邏輯", "法律", "漢語與文學", "藝術研究",
    "專業旅游指南", "法律專業", "高中漢語", "高中歷史", "中學歷史", "公務員",
    "體育科學", "植物保護", "基礎醫學", "臨床醫學", "城市與農村規劃", "會計",
    "消防工程師", "環境影響評估工程師", "稅務會計", "醫生",
]


def test_split(model_name, start_index=26):
    """Benchmark *model_name* on C-Eval (dev+val splits) and log accuracy.

    Iterates the subjects in ``task_list`` starting at ``start_index``, asks
    the model every question via :func:`llm`, and records per-subject and
    overall accuracy.

    Args:
        model_name: Ollama model to evaluate.
        start_index: index in ``task_list`` to resume from.  The default of
            26 preserves the original behaviour of skipping the first 26
            (presumably already-evaluated) subjects — pass 0 for a full run.

    Side effects:
        Appends accuracy lines to ``<sanitized model name>.txt``.
    """
    encoding = tiktoken.encoding_for_model("gpt-4")
    # ":" and "/" are not filename-safe on all platforms.
    model_name_write = model_name.replace(":", "_").replace("/", "_")

    sum_total = 0
    sum_correct = 0
    for i in range(start_index, len(task_list)):
        try:
            dataset_tmp = load_dataset(r"ceval/data", name=task_list[i])
            dataset = concatenate_datasets([dataset_tmp["dev"], dataset_tmp["val"]])
            print(f"\nNo.{i}: {task_list[i]}({task_chinese_name_list[i]})數據集加載完成, len(dataset)={len(dataset)}")
        except Exception:
            # Narrowed from a bare ``except:`` so Ctrl-C still aborts the run.
            print(f"\nNo.{i}: {task_list[i]}({task_chinese_name_list[i]})數據集加載失敗")
            continue

        correct = 0
        total = len(dataset)
        for item in tqdm(dataset, desc=f"No.{i}: Processing"):
            try:
                # Build the full prompt: question stem plus the four options.
                user_prompt = f"{item['question']}\nA. {item['A']}\nB. {item['B']}\nC. {item['C']}\nD. {item['D']}\n答案:"
                model_answer = llm(model_name, user_prompt, stream=True, encoding=encoding, max_tokens=4096)
                # Take the first A-D letter in the reply as the chosen option.
                match = re.search(r"[A-D]", model_answer.upper())
                extracted = match.group(0) if match else None
                if extracted and extracted == item["answer"]:
                    correct += 1
            except Exception:
                print("\nerror.")

        sum_total += total
        sum_correct += correct
        # Guard against an empty split to avoid ZeroDivisionError.
        accuracy = correct / total if total else 0.0
        print(f"No.{i}: {task_list[i]}({task_chinese_name_list[i]})數據集準確率: {correct}/{total} = {accuracy:.2%}")
        with open(f"{model_name_write}.txt", "a", encoding="utf-8") as f:
            f.write(f"No.{i}: {task_list[i]}({task_chinese_name_list[i]})數據集準確率: {correct}/{total} = {accuracy:.2%}\n\n")

    overall = sum_correct / sum_total if sum_total else 0.0
    with open(f"{model_name_write}.txt", "a", encoding="utf-8") as f:
        f.write(f"總準確率: {sum_correct}/{sum_total} = {overall:.2%}\n\n")
    print(f"總準確率: {sum_correct}/{sum_total} = {overall:.2%}")


# huihui_ai/qwen2.5-abliterate:7b-instruct-q4_K_M
# Previously evaluated models, kept for easy re-runs:
# qwen2.5:3b-instruct-q8_0
# qwen2.5:7b-instruct-q5_K_M
# deepseek-r1-7b:latest
# test_split(model_name="qwen2.5:3b-instruct-q8_0")
# test_split(model_name="qwen2.5:7b-instruct-q5_K_M")
# test_split(model_name="huihui_ai/qwen2.5-abliterate:7b-instruct-q4_K_M")
# test_split(model_name="qwen2.5:1.5b")
# test_split(model_name="qwen2.5:1.5b-instruct-fp16")
# test_split(model_name="qwen2.5:3b")
# test_split(model_name="gemma3:4b")
# test_split(model_name="qwen2.5:7b")
# test_split(model_name="gemma3:4b-it-q8_0")
# test_split(model_name="qwen2.5:0.5b-instruct-fp16")
# test_split(model_name="qwen2.5:0.5b")
# test_split(model_name="deepseek-r1:1.5b-qwen-distill-fp16")
# test_split(model_name="deepseek-r1:7b")

if __name__ == "__main__":
    # Guarded so importing this module for its helpers does not start a run.
    test_split(model_name="deepseek-r1:1.5b")