根據你的需求,這裡提供一套完整的 Python 解決方案,支持遞歸遍歷子文件夾、提取 Word 文檔內容(段落+表格),並整合到 Excel 中。以下是代碼實現及詳細說明:
每個單元格存放一個 Word 文檔的全部內容。
完整代碼
# -*- coding: utf-8 -*-
import os
from docx import Document
import pandas as pd
try:
    from tqdm import tqdm  # optional progress bar: pip install tqdm
except ImportError:
    def tqdm(iterable, **_kwargs):
        """Stand-in used when tqdm is not installed: iterate without a progress bar."""
        return iterable


def extract_word_content(doc_path):
    """Extract paragraph and table text from a Word document as one string.

    All non-blank paragraphs are emitted first, followed by every table row
    (cell texts stripped and joined with ' | '), so paragraphs and tables are
    not interleaved in document order. Each output line is tagged with its
    origin: ``[paragraph]`` or ``[table_row]``.

    Args:
        doc_path: Path of the document passed to ``Document``.

    Returns:
        The tagged lines joined by newlines, or a string starting with
        ``"Error reading file: "`` if anything goes wrong while reading.
        Errors are returned instead of raised so that one unreadable file
        does not abort a whole batch.
    """
    try:
        doc = Document(doc_path)
        content = []
        # Paragraph text (blank paragraphs are skipped).
        for para in doc.paragraphs:
            if para.text.strip():
                content.append(('paragraph', para.text))
        # Table content, one output line per table row.
        for table in doc.tables:
            for row in table.rows:
                row_text = ' | '.join(cell.text.strip() for cell in row.cells)
                content.append(('table_row', row_text))
        return '\n'.join(f"[{kind}] {text}" for kind, text in content)
    except Exception as e:
        # Deliberate best-effort: the message becomes the cell content.
        return f"Error reading file: {str(e)}"


def process_folder(root_folder):
    """Walk a folder tree and collect the text of every Word document found.

    Args:
        root_folder: Directory to scan; all sub-folders are visited too.

    Returns:
        A ``pandas.DataFrame`` with one row per Word file and the columns
        file name, full path and extracted content. The columns are present
        even when no Word file is found, so an exported sheet still gets its
        header row.
    """
    columns = ["文件名", "完整路徑", "文件內容"]
    data = []
    for root, _, files in os.walk(root_folder):
        # Files whose names start with '~$' are skipped: Word creates such
        # owner/lock files next to an open document and they are not
        # readable documents.
        # NOTE(review): legacy .doc files match this filter, but python-docx
        # is documented for .docx only, so they will most likely end up with
        # an "Error reading file" message as their content -- confirm.
        word_files = [
            f for f in files
            if f.lower().endswith(('.doc', '.docx')) and not f.startswith('~$')
        ]
        if not word_files:
            # Avoids printing an empty progress bar for every folder.
            continue
        for file in tqdm(word_files, desc=f"Processing {os.path.basename(root)}"):
            file_path = os.path.join(root, file)
            content = extract_word_content(file_path)
            data.append({
                "文件名": file,
                "完整路徑": file_path,
                "文件內容": content,
            })
    return pd.DataFrame(data, columns=columns)


if __name__ == "__main__":
    # Configuration
    folder_path = r"C:\Users\esensoft\Desktop\報名推薦"  # change to the actual path
    output_excel = "匯總結果.xlsx"

    # Run the extraction
    df = process_folder(folder_path)

    # Export to Excel
    with pd.ExcelWriter(output_excel, engine='openpyxl') as writer:
        df.to_excel(writer, index=False, sheet_name='文檔匯總')
        # Auto-adjust column widths
        for column in df.columns:
            cell_lengths = df[column].astype(str).map(len)
            # An empty column would make .max() return NaN; fall back to 0.
            longest_cell = 0 if cell_lengths.empty else int(cell_lengths.max())
            max_len = max(longest_cell, len(column))
            # NOTE(review): the statement that applies max_len as the column
            # width is cut off in the source at this point; the visible
            # fragment is kept verbatim below and must be completed by hand.
            # writer.sheets['文檔匯總'].column_dimensions[writer.sheets['文檔匯總'].cell(row=