對上一節進行優化:
1、識別多個excel
2、將表格中的nan替換成空字符串
一、示例中的pdf內容
二、完整代碼參考:
import numpy as np
import tabula
from docx import Document
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
def _is_blank(value):
    """Return True when *value* is an empty table cell (None, "" or NaN)."""
    if value is None:
        return True
    if isinstance(value, str):
        return value == ""
    # A NaN read back from a DataFrame is usually a fresh float object: it is
    # neither identical nor equal to ``np.nan``, so a membership test such as
    # ``value in [None, np.nan, ""]`` misses it. Test for NaN explicitly.
    return isinstance(value, float) and bool(np.isnan(value))


def _blank_to_empty(value):
    """Return "" for an empty cell and the value unchanged otherwise."""
    return "" if _is_blank(value) else value


def get_table_data(df_list):
    """Flatten the extracted tables into one list of rows.

    For every table the column names are emitted as a header row, followed by
    one list per data row. Missing values (None / NaN) are replaced with the
    empty string so that they never show up as the text "nan" later on.

    :param df_list: iterable of DataFrame-like tables (must provide ``shape``,
        ``columns`` and ``iterrows()``)
    :return: list of rows, each row being a list of cell values
    """
    table_data = []
    for table_index, table in enumerate(df_list):
        rows, cols = table.shape
        print(f"表格 {table_index + 1} 的行數: {rows}, 列數: {cols}")
        # Header row first, then the data rows of this table.
        table_data.append([_blank_to_empty(name) for name in table.columns])
        for _, row in table.iterrows():
            table_data.append([_blank_to_empty(value) for value in row.tolist()])
    return table_data


def handle_table(table_data):
    """Merge continuation rows into the row above them, in place.

    A row whose first or second cell is empty is treated as the overflow of
    the previous row: each of its non-empty cells is appended to the cell
    directly above, and the row itself is removed. Rows are visited bottom-up
    so that deleting a row never shifts the rows still to be visited. The
    first row (index 0) is never removed.

    :param table_data: list of rows (lists of cell values); modified in place
    :return: None
    """
    for i in range(len(table_data) - 1, 0, -1):
        row = table_data[i]
        # ``row[:2]`` instead of ``row[0]`` / ``row[1]`` keeps one-column and
        # empty rows from raising IndexError.
        if not any(_is_blank(cell) for cell in row[:2]):
            continue
        previous = table_data[i - 1]
        for j, cell in enumerate(row):
            if _is_blank(cell):
                continue  # only non-empty cells are merged upwards
            if j >= len(previous):
                # The row above is narrower (e.g. it came from another table).
                previous.extend([""] * (j + 1 - len(previous)))
            previous[j] = f"{_blank_to_empty(previous[j])}{cell}".strip()
        del table_data[i]


def set_cell_borders(cell, border_color="000000", row_height=None):
    """Draw single-line borders around a cell and centre its content.

    :param cell: python-docx table cell object
    :param border_color: border colour as a hex RGB string, black by default
    :param row_height: optional height written to the ``w:trHeight`` element of
        the row that contains the cell; ``None`` leaves the row untouched
    """
    tc = cell._element
    tc_pr = tc.get_or_add_tcPr()

    # Drop borders left by an earlier call so the cell never carries two
    # ``w:tcBorders`` elements.
    old_borders = tc_pr.find(qn("w:tcBorders"))
    if old_borders is not None:
        tc_pr.remove(old_borders)

    tc_borders = OxmlElement("w:tcBorders")
    for border_name in ("top", "left", "bottom", "right"):
        border = OxmlElement(f"w:{border_name}")
        border.set(qn("w:val"), "single")
        border.set(qn("w:sz"), "4")  # border size
        border.set(qn("w:space"), "0")
        border.set(qn("w:color"), border_color)
        tc_borders.append(border)
    tc_pr.append(tc_borders)

    # Centre the content of every paragraph in the cell.
    for paragraph in cell.paragraphs:
        for run in paragraph.runs:
            run.font.size = paragraph.style.font.size  # keep the font size consistent
        paragraph.alignment = 1  # 1 means centred

    if row_height is not None:
        tr = tc.getparent()  # the row element that owns this cell
        tr_pr = tr.get_or_add_trPr()
        # This function is called once per cell, so reuse the row's existing
        # height element instead of appending one per column.
        tr_height = tr_pr.find(qn("w:trHeight"))
        if tr_height is None:
            tr_height = OxmlElement("w:trHeight")
            tr_pr.append(tr_height)
        tr_height.set(qn("w:val"), str(row_height))


def create_table_and_fill_data(data, output_file):
    """Write *data* as a bordered table into a new Word document.

    The table is as wide as the widest row; shorter rows are padded with empty
    cells, and empty values (None / NaN) are written as empty text. When
    *data* holds no cells at all, the document is saved with the heading only.

    :param data: table data, a list of rows (lists of cell values)
    :param output_file: path of the Word file to write
    """
    doc = Document()
    doc.add_heading("測試XX信息表", level=1)

    # Size the table from the widest row rather than ``data[0]`` so that rows
    # coming from a wider table still fit.
    col_count = max((len(row) for row in data), default=0)
    if col_count:
        table = doc.add_table(rows=len(data), cols=col_count)
        for row_index, row_data in enumerate(data):
            for col_index in range(col_count):
                value = row_data[col_index] if col_index < len(row_data) else ""
                cell = table.cell(row_index, col_index)
                cell.text = "" if _is_blank(value) else str(value)
                set_cell_borders(cell, border_color="FF0000", row_height=300)

    doc.save(output_file)


pdf_file = "excelv2.pdf"  # PDF file the tables are extracted from
output_file = "order0429.docx"  # path of the Word file to write

# Extract the tables from every page of the PDF with tabula.
# NOTE(review): the original call passed stream="lattice". tabula's `stream`
# option is a boolean flag, so that string was merely truthy and selected
# stream mode. It is spelled out as stream=True here to keep the extraction
# result unchanged; if the PDF tables have ruling lines and lattice mode was
# intended, use lattice=True instead -- confirm against the input file.
df_list = tabula.read_pdf(pdf_file, pages="all", multiple_tables=True, stream=True)
table_data = get_table_data(df_list)
handle_table(table_data)  # modifies table_data in place
create_table_and_fill_data(table_data, output_file)