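# NOTE: stray caption left over from the original paste ("我沒招了jpg"); kept as a
# comment so the module no longer fails with NameError when it is executed.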
import pandas as pd
import requests
import re
import json
from tqdm import tqdm
from datetime import datetime, timedelta

# Placeholder used throughout the table for missing values.
_NO_DATA = '無數據'
_CLOCK_FORMAT = '%H:%M:%S'

_LLM_URL = "https://api.siliconflow.cn/v1/chat/completions"
_LLM_MODEL = "Qwen/Qwen3-8B"
# (connect, read) timeouts in seconds; generation can be slow, so the read
# timeout is generous, but a stalled connection must not hang the whole run.
_LLM_TIMEOUT = (10, 120)
# Matches a reply that consists solely of one fenced ```json ... ``` block.
_JSON_FENCE_RE = re.compile(r'^```json\s*([\s\S]*?)```$', re.IGNORECASE)


def _parse_clock_time(value):
    """Parse an ``HH:MM`` or ``HH:MM:SS`` string into a ``datetime``."""
    value = value.strip()
    # Normalise mixed "23:41" / "23:41:00" inputs to a single format.
    if len(value.split(':')) != 3:
        value += ':00'
    return datetime.strptime(value, _CLOCK_FORMAT)


def calculate_stop_duration(arrival_time_str, departure_time_str):
    """Return how long a train stops, as a human-readable string.

    Args:
        arrival_time_str: Arrival time as ``HH:MM`` or ``HH:MM:SS``, or the
            missing-value placeholder.
        departure_time_str: Departure time in the same formats.

    Returns:
        The placeholder itself when either input is the placeholder;
        ``"<m>分鐘"`` for stops shorter than one hour; ``"<h>小時<m>分鐘"``
        otherwise; or ``"計算錯誤: <reason>"`` when an input cannot be parsed.
    """
    if arrival_time_str == _NO_DATA or departure_time_str == _NO_DATA:
        return _NO_DATA

    try:
        arrival = _parse_clock_time(arrival_time_str)
        departure = _parse_clock_time(departure_time_str)
    except (ValueError, AttributeError, TypeError) as e:
        # Malformed or non-string input: report instead of aborting the batch.
        return f"計算錯誤: {str(e)}"

    # A departure earlier than the arrival means the stop crosses midnight
    # (e.g. arrive 23:40, leave 00:12).
    if departure < arrival:
        departure += timedelta(days=1)

    hours, remainder = divmod((departure - arrival).seconds, 3600)
    minutes = remainder // 60
    # Stops shorter than one hour are reported in minutes only.
    if hours == 0:
        return f"{minutes}分鐘"
    return f"{hours}小時{minutes}分鐘"


def call_llm(content: str):
    """Send ``content`` as a single user message to the chat-completions API.

    Anything up to and including the last ``</think>`` marker in the reply is
    discarded. If the remaining text is exactly one fenced ```` ```json ````
    block, its parsed JSON value is returned; otherwise the remaining text is
    returned unchanged as a string.

    The bearer token is read from the ``SILICONFLOW_API_KEY`` environment
    variable, falling back to the placeholder that was hard-coded originally.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
        json.JSONDecodeError: If the fenced block is not valid JSON.
        KeyError, IndexError: If the response body lacks the expected
            ``choices[0].message.content`` structure.
    """
    import os  # local import: keeps this block self-contained

    api_key = os.environ.get("SILICONFLOW_API_KEY", "sk-xxx")
    payload = {
        "model": _LLM_MODEL,
        "messages": [{"role": "user", "content": content}],
    }
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    response = requests.post(
        _LLM_URL, json=payload, headers=headers, timeout=_LLM_TIMEOUT
    )
    # Surface HTTP errors directly rather than as an opaque KeyError below.
    response.raise_for_status()

    reply = response.json()['choices'][0]['message']['content']
    answer = reply.split('</think>')[-1]

    match = _JSON_FENCE_RE.match(answer.strip())
    if match:
        return json.loads(match.group(1).strip())
    return answer


def create_question_list(row: dict):
    """Build the list of questions to ask about one timetable row.

    Args:
        row: Mapping for one train; the keys ``車次``, ``始發站``, ``到點``
            and ``開點`` are read.

    Returns:
        Four basic lookup questions, followed by two time-related questions
        when both the arrival and departure times are present.
    """
    train_number = row["車次"]
    departure_station = row["始發站"]

    # 1. Basic information lookups.
    question_list = [
        f'{train_number}號車次應該從哪個檢票口檢票?',
        f'{train_number}號車次應該從哪個站臺上車?',
        f'{train_number}次列車的終到站是哪里?',
        f'{train_number}次列車的候車區域是哪里?',
    ]

    # 2. Time-reasoning questions, only when both times are known.
    if row['到點'] != _NO_DATA and row['開點'] != _NO_DATA:
        # Stop duration.
        question_list.append(
            f'{train_number}次列車在{departure_station}站的停留時長是多久?'
        )
        # Departure time (checks that the model handles the time format).
        question_list.append(
            f'{train_number}次列車什么時候從{departure_station}站發車?'
        )

    # 3. TODO: comparison questions (e.g. which train departing from a given
    #    station stops the longest) need data from several rows and are not
    #    generated yet.
    return question_list


# Main flow
# Prompt sent for every row: slot 1 receives the row dict, slot 2 the question
# list. Blank lines keep the interpolated data from running into the next
# section heading.
PROMPT_TEMPLATE = '''你是專業的列車乘務員,請嚴格根據以下列車信息回答問題:
# 列車信息
{}

# 用戶問題列表
{}

注意:
1. 所有時間計算已考慮跨天情況
2. 對于"無數據"的問題,如實回答"暫無該信息"
'''

# Appended after formatting, so its literal braces never pass through
# str.format.
OUTPUT_FORMAT = '''# 輸出格式
```json
[{
    "q": "問題文本",
    "a": "答案文本(直接引用或計算得出)"
}]
```'''


def main():
    """Generate question/answer training data from the train info table.

    Reads ``data/info_table.xlsx``, adds a computed stop-duration column,
    asks the LLM to answer a generated question list for every row, then
    writes the collected pairs to ``train_data_v2.json`` and, if any row
    failed, the failures to ``error_log.json``.
    """
    # 1. Load the table; missing cells become the placeholder string.
    data = pd.read_excel('data/info_table.xlsx').fillna('無數據')

    # 2. Pre-compute the stop duration so it is part of each row's prompt.
    data['停留時長'] = data.apply(
        lambda row: calculate_stop_duration(str(row['到點']), str(row['開點'])),
        axis=1,
    )

    # 3. Query the LLM row by row.
    train_data = []
    error_log = []
    for _, row in tqdm(data.iterrows(), total=len(data)):
        # Built outside the try so the handler always logs the current row.
        row_dict = row.to_dict()
        try:
            questions = create_question_list(row_dict)
            llm_response = call_llm(
                PROMPT_TEMPLATE.format(row_dict, questions) + OUTPUT_FORMAT
            )
            if isinstance(llm_response, list):
                # Build the full list first: a malformed item then rejects
                # the whole row instead of leaving a partial set behind.
                pairs = [
                    {"instruction": qa["q"], "output": qa["a"]}
                    for qa in llm_response
                ]
                train_data.extend(pairs)
            else:
                error_log.append({"row": row_dict, "error": "LLM返回格式錯誤"})
        except Exception as e:
            # Per-row boundary: one failed request or malformed reply must
            # not abort the batch, so record it and continue.
            error_log.append({"row": row_dict, "error": str(e)})

    # 4. Persist the results.
    with open('train_data_v2.json', 'w', encoding='utf-8') as f:
        json.dump(train_data, f, ensure_ascii=False, indent=2)

    if error_log:
        with open('error_log.json', 'w', encoding='utf-8') as f:
            # Rows can hold values json cannot encode (e.g. time or timestamp
            # cells read from Excel); stringify them rather than losing the
            # log to a TypeError at the very end of the run.
            json.dump(error_log, f, ensure_ascii=False, indent=2, default=str)

    print(f"生成完成!有效數據{len(train_data)}條,錯誤{len(error_log)}條")


if __name__ == "__main__":
    main()