報錯大致如下:直接 run 沒有問題,但一 debug 就停住不動。
Traceback (most recent call last):
  File "/home/mapengsen/.pycharm_helpers/pydev/_pydevd_bundle/pydevd_comm.py", line 467, in start_client
    s.connect((host, port))
TimeoutError: timed out
Traceback (most recent call last):
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
[14:30:48.928250] [14:30:48.928492] [14:30:48.928599] [14:30:48.950877] [14:30:48.951222] [14:30:48.951351]   File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 883, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
Could not connect to 127.0.0.1: 56945
Traceback (most recent call last):
  File "/home/mapengsen/.pycharm_helpers/pydev/_pydevd_bundle/pydevd_comm.py", line 467, in start_client
    s.connect((host, port))
TimeoutError: timed out
  File "/home/mapengsen/anaconda3/envs/MDT2/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 26, in <module>
Traceback (most recent call last):
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 883, in exec_module
    from torch._inductor.codecache import code_hash, CompiledFxGraph
  File "/home/mapengsen/anaconda3/envs/MDT2/lib/python3.10/site-packages/torch/_inductor/codecache.py", line 1424, in <module>
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "/home/mapengsen/anaconda3/envs/MDT2/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 26, in <module>
    from torch._inductor.codecache import code_hash, CompiledFxGraph
  File "/home/mapengsen/anaconda3/envs/MDT2/lib/python3.10/site-packages/torch/_inductor/codecache.py", line 1424, in <module>
        AsyncCompile.warm_pool()AsyncCompile.warm_pool()
  File "/home/mapengsen/anaconda3/envs/MDT2/lib/python3.10/site-packages/torch/_inductor/codecache.py", line 1363, in warm_pool
  File "/home/mapengsen/anaconda3/envs/MDT2/lib/python3.10/site-packages/torch/_inductor/codecache.py", line 1363, in warm_pool
    pool._adjust_process_count()
  File "/home/mapengsen/anaconda3/envs/MDT2/lib/python3.10/concurrent/futures/process.py", line 697, in _adjust_process_count
    pool._adjust_process_count()
  File "/home/mapengsen/anaconda3/envs/MDT2/lib/python3.10/concurrent/futures/process.py", line 697, in _adjust_process_count
Could not connect to 127.0.0.1: 56945
後來才發現,是 import 自己定義的 datasets 時出了錯:我在自己定義的 datasets 文件裡寫了測試代碼,而這段測試代碼本身有錯誤;主程序又 import 了這個 datasets,導入時文件中的測試代碼會被一併執行,所以 debug 一直停住不動。把 datasets 中報錯的測試部分刪除就行,只留方法部分:
def collate_fn_paired_skip_invalid(batch):
    """Collate a batch of paired samples, dropping samples with missing inputs.

    The layout is decided from the first sample in ``batch``:

    * 5 elements (single-task case, task_id included): a sample is kept
      only if elements 0 and 2 are not ``None``.
    * otherwise (multi-task case, 7 elements with task_id): a sample is
      kept only if elements 0, 2 and 4 are not ``None``.

    Args:
        batch: Non-empty sequence of indexable samples, as handed to a
            ``DataLoader`` ``collate_fn``. ``batch[0]`` is indexed
            unconditionally, so an empty batch raises ``IndexError``.

    Returns:
        The result of ``default_collate`` over the surviving samples. If no
        sample survives, a tuple of empty placeholder tensors with the same
        number of elements as the layout (5 or 7): alternating
        ``torch.empty(0)`` / ``torch.empty(0, 0)`` entries followed by one
        empty ``torch.long`` tensor.
    """
    # The first sample decides the layout for the whole batch.
    if len(batch[0]) == 5:  # single-task case (task_id added)
        required_indices = (0, 2)
    else:  # multi-task case (7 elements, task_id added)
        required_indices = (0, 2, 4)

    valid_batch_items = [
        item
        for item in batch
        if all(item[idx] is not None for idx in required_indices)
    ]

    if not valid_batch_items:
        # One (1-D empty, 2-D empty) placeholder pair per required index,
        # then the empty long tensor that occupies the last slot.
        placeholders = []
        for _ in required_indices:
            placeholders.append(torch.empty(0))
            placeholders.append(torch.empty(0, 0))
        placeholders.append(torch.empty(0, dtype=torch.long))
        return tuple(placeholders)

    return torch.utils.data.dataloader.default_collate(valid_batch_items)


# Delete everything below this line to avoid errors (original note: 刪除下面的,以免有錯誤).
#
# # --- 主訓練循環 ---
# trained_models_per_task = {}
#
# # 假設您在這里定義了 all_task_names
# all_task_names = [['A_bioavailability_ma'], ['A_hia_hou'], ['A_bioavailability_ma', 'A_hia_hou']]
#
# for current_task_names in all_task_names:
#     task_key = '+'.join(current_task_names)  # 創建任務組合的鍵名
#     print(f"\n--- 開始為任務組合: {task_key} 準備數據和模型 (Paired Data) ---")
?