現有大量的Word文檔,每個文檔中有大量的表格,需要將其轉換為Excel。
Python處理源碼
# Requires: pip install pandas python-docx xlsxwriter
import pandas as pd
from docx import Document
from pathlib import Path
from datetime import datetime


def process_docx(filepath, *, round_marker="第12次:",
                 last_monitor_time="2025/6/17 03:00"):
    """Extract monitoring records from the result tables of a Word document.

    The document is opened with python-docx. The first paragraph containing
    ``round_marker`` supplies the monitoring time; every table that has at
    least 7 columns and whose top-left cell contains "成果表" is treated as
    a data table.

    Args:
        filepath: Path of the ``.docx`` file to read.
        round_marker: Text that identifies the paragraph holding the
            monitoring time. Defaults to the value that was hard-coded
            before.
        last_monitor_time: Value stored in every record under
            '上次監測時間'. Defaults to the value that was hard-coded before.

    Returns:
        A list of dicts, one per data row, with the keys '點號', '初始值',
        '檢測值', '累計值', '監測時間' and '上次監測時間'.
    """
    doc = Document(filepath)
    print(f"成功處理文檔: {filepath}")

    # The monitoring time stays empty when no paragraph carries the marker.
    monitor_time = ""
    for paragraph in doc.paragraphs:
        if round_marker in paragraph.text:
            # NOTE(review): parse_monitor_time is not defined in the visible
            # source; it must be provided elsewhere or this raises NameError.
            monitor_time = parse_monitor_time(paragraph.text)
            print(f"提取監測時間: {monitor_time}")
            break

    data = []
    for table in doc.tables:
        # Only result tables are of interest; skip everything else.
        if len(table.columns) < 7 or "成果表" not in table.cell(0, 0).text:
            continue

        # The first two rows are header rows.
        for row in table.rows[2:]:
            # Remark rows carry no measurements.
            if "備注" in row.cells[0].text.strip():
                continue

            cells = [
                cell.text.replace("\n", "").replace("\r", "").strip()
                for cell in row.cells
            ]
            if len(cells) < 7:
                # Incomplete row: not enough cells to build a record.
                continue

            # Computed outside the f-string: reusing the enclosing quote
            # character inside an f-string is a SyntaxError before 3.12.
            point_prefix = cells[0].replace(" ", "")
            data.append({
                '點號': f"{point_prefix}-{cells[1]}",
                '初始值': cells[2],
                '檢測值': cells[3],
                '累計值': cells[4],
                '監測時間': monitor_time,
                '上次監測時間': last_monitor_time,
            })
    return data


def generate_excel(data, output_path):
    """Write the extracted records to an Excel workbook.

    Args:
        data: List of record dicts as returned by ``process_docx``.
        output_path: Destination ``.xlsx`` path; written with the
            xlsxwriter engine into a sheet named 'Sheet1'.
    """
    columns_order = [
        '點號', '初始值', '檢測值', '累計值',
        '監測時間', '上次監測時間', '備注',
    ]

    df = pd.DataFrame(data)
    # Fixed remark column, identical for every row.
    df['備注'] = '無'
    # reindex enforces the column order and creates any missing column.
    df = df.reindex(columns=columns_order)
    # Fill only the missing values; assigning the whole column would discard
    # the previous-monitoring time already present in the records.
    df['上次監測時間'] = df['上次監測時間'].fillna('2025-06-01')

    with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name='Sheet1')
        worksheet = writer.sheets['Sheet1']
        # Column widths are given in character units.
        worksheet.set_column('A:A', 38)  # column A: 38 characters wide
        worksheet.set_column('B:B', 12)  # column B: 12 characters wide
    print(f"Excel文件已生成: {output_path}")


print("Excel開始生成")
# Driver: convert one Word document on the desktop into an Excel workbook.
# Input and output locations are fixed paths on the author's machine.
filepath, output_path = (
    r"C:\Users\admin\Desktop\test.docx",
    r"C:\Users\admin\Desktop\test.xlsx",
)

# Extract the table records first, then write them out.
data = process_docx(filepath)
generate_excel(data, output_path)

print("Excel生成結束")
輸入Word文檔
Word文檔格式如下所示: