基于 Python 讀取目錄下所有“jsonl”格式文件，逐行抽取文件內的某個字段并合并輸出到一個文本文件。
import os
import json
import time
try:
    from tqdm import tqdm  # optional progress bar: pip install tqdm
except ImportError:
    tqdm = None  # the script still works, just without a progress bar

# Directory scanned when process_files() is called without arguments.
_DEFAULT_DIR = r"D:\daku\關鍵詞識別\1623-0000001\zh"


class _NullProgress:
    """No-op stand-in for ``tqdm`` used when the package is not installed."""

    def __init__(self, *args, **kwargs):
        pass

    def __enter__(self):
        return self

    def __exit__(self, *exc_info):
        return False

    def update(self, n=1):
        pass

    def set_postfix(self, *args, **kwargs):
        pass


def _count_lines(path):
    """Return the number of lines in the UTF-8 text file at *path*.

    The file is always closed. If it cannot be opened or decoded, the lines
    counted so far are returned; the main pass reports the failure.
    """
    count = 0
    try:
        with open(path, "r", encoding="utf-8") as handle:
            for _ in handle:
                count += 1
    except (OSError, UnicodeError):
        pass  # deliberate: counting only sizes the progress bar
    return count


def process_files(dir_path=_DEFAULT_DIR, field="content",
                  output_name="combined_contents.txt"):
    """Merge one field from every ``.jsonl`` file in a directory into a text file.

    Files are processed largest first. Each non-blank line is parsed as JSON;
    the value of *field* (empty string when the key is absent) has its
    newlines replaced by spaces and is written to the output followed by a
    blank line. Lines that fail to parse, and records that cannot be
    processed, are reported on stdout and skipped. A summary is printed at
    the end.

    Args:
        dir_path: Directory containing the ``.jsonl`` files; the output file
            is created in the same directory.
        field: JSON key whose value is extracted from each record.
        output_name: File name of the merged output inside *dir_path*.

    Raises:
        OSError: If *dir_path* cannot be listed or the output file cannot be
            opened for writing.
    """
    # Largest files first. The output file is skipped in case it was given a
    # ``.jsonl`` name and is left over from a previous run.
    file_list = sorted(
        [
            f for f in os.listdir(dir_path)
            if f.lower().endswith(".jsonl")
            and os.path.normcase(f) != os.path.normcase(output_name)
        ],
        key=lambda name: os.path.getsize(os.path.join(dir_path, name)),
        reverse=True,
    )

    total_files = len(file_list)
    processed_files = 0
    total_lines = sum(_count_lines(os.path.join(dir_path, f)) for f in file_list)
    processed_lines = 0  # records successfully written
    seen_lines = 0       # physical lines read, including blank and bad ones
    start_time = time.time()

    output_file = os.path.join(dir_path, output_name)
    progress_bar = tqdm if tqdm is not None else _NullProgress

    with open(output_file, "w", encoding="utf-8") as outfile:
        with progress_bar(total=total_lines, desc="合并進度", unit="line") as pbar:
            for filename in file_list:
                file_path = os.path.join(dir_path, filename)
                try:
                    with open(file_path, "r", encoding="utf-8") as infile:
                        # line_num is the 1-based line number within this
                        # file, so error messages point at the real line.
                        for line_num, line in enumerate(infile, 1):
                            seen_lines += 1
                            # total_lines counts every physical line, so the
                            # bar advances for blank lines as well.
                            pbar.update(1)

                            line = line.strip()
                            if line:
                                try:
                                    data = json.loads(line)
                                    # Flatten embedded newlines so a blank line
                                    # can serve as the record separator.
                                    content = data.get(field, "").replace("\n", " ")
                                    outfile.write(content + "\n\n")
                                    processed_lines += 1
                                except json.JSONDecodeError:
                                    print(f"\nJSON解析失敗: {filename} 第{line_num}行")
                                except Exception as e:
                                    # Per-record best effort: a malformed record
                                    # (non-dict JSON, non-string field) must not
                                    # abort the whole merge.
                                    print(f"\n處理異常: {filename} 第{line_num}行 - {str(e)}")

                            if seen_lines % 1000 == 0:
                                elapsed = time.time() - start_time
                                speed = seen_lines / (elapsed + 1e-5)
                                remaining = max(0.0, (total_lines - seen_lines) / (speed + 1e-5))
                                pbar.set_postfix({
                                    '速度': f"{speed:.1f} lines/s",
                                    '剩余時間': f"{remaining // 3600:.0f}h {remaining % 3600 // 60:.0f}m",
                                })
                    processed_files += 1
                except Exception as e:
                    # Per-file best effort: report and move on to the next file.
                    print(f"\n無法讀取文件 {filename}: {str(e)}")

    # Summary report.
    end_time = time.time()
    print(f"\n合并完成!共處理 {processed_files}/{total_files} 個文件")
    print(f"總記錄數: {processed_lines:,} 條")
    print(f"耗時: {end_time - start_time:.2f} 秒")
    print(f"輸出文件路徑: {output_file}")


if __name__ == "__main__":
    process_files()