Python 將 PDF 轉為 TXT，並按大小切割成供 AI 處理的文件

step1:pdf轉換

from PIL import Image
import pytesseract
import os
import tempfile
from pdf2image import convert_from_path# 設置 Tesseract 路徑
# Location of the Tesseract OCR executable used by pytesseract.
pytesseract.pytesseract.tesseract_cmd = r'C:\Users\wangrusheng\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'

# Directory containing the Poppler binaries; passed to pdf2image as poppler_path.
POPPLER_PATH = r'C:\Users\wangrusheng\AppData\Local\Programs\poppler-24.08.0\Library\bin'


def pdf_to_txt(input_pdf, output_txt):
    """Convert a PDF file to a text file by running OCR on every page.

    Each page is rendered to a JPEG at 300 DPI inside a temporary directory,
    recognised with Tesseract, and appended to ``output_txt`` under a per-page
    header. A page whose OCR fails does not abort the run: the error message
    is printed and written to the output file in place of the page text.

    Args:
        input_pdf: Path of the PDF file to read.
        output_txt: Path of the UTF-8 text file to create (overwritten).
    """
    # Rendered page images live in a temporary directory that is removed
    # automatically when the block exits.
    with tempfile.TemporaryDirectory() as temp_dir:
        images = convert_from_path(
            input_pdf,
            poppler_path=POPPLER_PATH,  # explicit Poppler location
            output_folder=temp_dir,
            dpi=300,
            fmt='jpeg',
            thread_count=4,
        )

        with open(output_txt, 'w', encoding='utf-8') as f:
            for i, image in enumerate(images):
                try:
                    # Multi-language OCR; the language packs listed here must
                    # be installed alongside Tesseract.
                    text = pytesseract.image_to_string(
                        image,
                        lang='chi_sim+eng+jpn+rus+tha+kor+ara',
                    )
                    f.write(f"--- 第 {i + 1} 頁內容 ---\n")
                    f.write(text.strip())
                    f.write("\n\n")
                    print(f"已處理第 {i + 1} 頁")
                except Exception as e:
                    # Deliberate best-effort: record the failure and move on
                    # to the next page.
                    error_msg = f"第 {i + 1} 頁識別失敗: {str(e)}"
                    print(error_msg)
                    f.write(error_msg + "\n")
                finally:
                    # The page images are backed by files in temp_dir; close
                    # each one so its file handle is released before the
                    # temporary directory is deleted (an open handle can block
                    # deletion, notably on Windows).
                    image.close()

    print(f"\n轉換完成！結果已保存至: {output_txt}")


if __name__ == "__main__":
    # Input PDF path.
    input_pdf = r"C:\Users\wangrusheng\Downloads\sdf.pdf"
    # Output TXT path: same directory and base name as the PDF.
    output_txt = os.path.join(
        os.path.dirname(input_pdf),
        os.path.splitext(os.path.basename(input_pdf))[0] + ".txt",
    )
    pdf_to_txt(input_pdf, output_txt)

step2:按大小切割txt

import itertools
import os


def read_txt_file(txt_path):
    """Return the full contents of the UTF-8 text file at ``txt_path``."""
    with open(txt_path, 'r', encoding='utf-8') as f:
        return f.read()


def _generate_suffixes():
    """Yield file-name suffixes 'aa', 'ab', ..., 'zz', then 'aaa', 'aab', ...

    The two-letter sequence is unchanged; longer suffixes are generated only
    once all 676 two-letter names are used, so the supply never runs out.
    """
    letters = "abcdefghijklmnopqrstuvwxyz"
    for width in itertools.count(2):
        for combo in itertools.product(letters, repeat=width):
            yield "".join(combo)


def split_text_by_size(text, max_bytes, output_dir):
    """Split text into files of at most ``max_bytes`` bytes each.

    Every output file consists of one piece of ``text`` followed by a fixed
    prompt note; ``max_bytes`` covers both. Pieces are cut after the last
    newline that fits, or, when the window contains no newline, at the last
    complete UTF-8 character that fits. Files are named ``aa.txt``,
    ``ab.txt``, ... inside ``output_dir``.

    Args:
        text: Complete text to split.
        max_bytes: Maximum size of each output file in bytes (note included).
        output_dir: Directory receiving the files; created if missing.

    Returns:
        List of the paths of the files written, in order. Empty when ``text``
        is empty.

    Raises:
        ValueError: If ``max_bytes`` leaves no room for any text after the
            note is accounted for.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Fixed note appended to the end of every output file.
    note_text = """請ai保持批判性思維，模仿袁的口吻，講述材料中的歷史故事。具體要求如下：1.
風格： 采用經典的評書口吻和結構。例如，開頭要有上一回和下一回的標題，內容部分必須拆分小標題，小段落，能拆分的全部拆分2.
內容：?
嚴格遵循歷史： 只講述真實發生的事件、人物和時間，不能有任何虛構或戲說。?
邏輯連貫： 清晰解釋事件發生的原因、過程和結果。?
自動補全背景： 在故事中自然地融入必要的背景信息，讓邏輯連貫，轉場絲滑。3.
語言： 絕對簡單易懂，使用口語化的詞匯和短句，讓小學生能毫無障礙地聽懂故事里發生了什么事。可以用夸張，比喻，隱喻，諷刺等手法，但需基于史實。請從‘第一回：xxx’開始講起。
"""
    note_bytes = note_text.encode('utf-8')
    note_size = len(note_bytes)

    # Reserve room for the note; what is left is the budget for text.
    max_bytes = max_bytes - note_size
    if max_bytes <= 0:
        raise ValueError("文件大小設置過小，無法容納備注信息")

    suffix_gen = _generate_suffixes()
    files_created = []
    encoded_text = text.encode('utf-8')
    total = len(encoded_text)

    # Walk the buffer with an offset instead of re-slicing the remainder on
    # every iteration.
    start = 0
    while start < total:
        if total - start <= max_bytes:
            # Everything left fits in one file: keep it whole rather than
            # splitting off its last line into an extra file.
            end = total
        else:
            end = start + max_bytes
            newline_at = encoded_text.rfind(b'\n', start, end)
            if newline_at != -1:
                # Preferred cut: right after the last newline in the window.
                end = newline_at + 1
            else:
                # No newline: back up until the cut does not land on a UTF-8
                # continuation byte (0b10xxxxxx), i.e. inside a character.
                while end > start and (encoded_text[end] & 0xC0) == 0x80:
                    end -= 1
                if end == start:
                    # The budget is smaller than one character. Emit that
                    # single character whole (slightly over budget) so the
                    # loop always makes progress.
                    end = start + 1
                    while end < total and (encoded_text[end] & 0xC0) == 0x80:
                        end += 1

        chunk = encoded_text[start:end]
        start = end

        suffix = next(suffix_gen)
        output_file = os.path.join(output_dir, f"{suffix}.txt")
        with open(output_file, 'wb') as f:
            f.write(chunk)
            f.write(note_bytes)  # note goes at the bottom of the file

        files_created.append(output_file)
        print(f"已創建: {output_file} (大小: {len(chunk) + note_size:,} 字節)")

    return files_created


def process_txt(input_txt, output_dir, max_size_kb=20):
    """Read a TXT file and split it into size-limited files.

    Args:
        input_txt: Path of the TXT file to read.
        output_dir: Directory receiving the output files.
        max_size_kb: Maximum size of each output file in KB.

    Returns:
        List of the paths of the files written.

    Raises:
        FileNotFoundError: If ``input_txt`` does not exist.
    """
    if not os.path.exists(input_txt):
        raise FileNotFoundError(f"文件不存在: {input_txt}")

    text_content = read_txt_file(input_txt)
    if not text_content.strip():
        # Only a warning: the split below still runs on the content.
        print("警告: 文件內容為空")

    max_bytes = max_size_kb * 1024  # KB -> bytes
    return split_text_by_size(text_content, max_bytes, output_dir)


# Usage example
if __name__ == "__main__":
    input_file = r"C:\Users\wangrusheng\Downloads\ust.txt"  # TXT file to split
    output_dir = r"C:\Users\wangrusheng\Downloads\accc"  # directory for the output files
    max_size_kb = 15  # maximum size of each output file, in KB
    created_files = process_txt(input_file, output_dir, max_size_kb)
    print(f"切割完成! 共生成 {len(created_files)} 個文件")

step3:查詢頁數

from pdf2image import convert_from_path
import os# 設置 Poppler 路徑
# Directory containing the Poppler binaries; passed to pdf2image as poppler_path.
POPPLER_PATH = r'C:\Users\wangrusheng\AppData\Local\Programs\poppler-24.08.0\Library\bin'


def get_pdf_page_count(input_pdf):
    """Return the number of pages in a PDF file.

    The count is read from the PDF's metadata via Poppler's ``pdfinfo``
    instead of rasterising every page and counting the resulting images,
    which costs time and memory proportional to the document size.

    Args:
        input_pdf: Path of the PDF file to inspect.

    Returns:
        The page count as an int.
    """
    # Local import so this block does not depend on edits to the file's
    # import section; pdf2image is already a dependency of this script.
    from pdf2image import pdfinfo_from_path

    info = pdfinfo_from_path(input_pdf, poppler_path=POPPLER_PATH)
    return int(info["Pages"])


if __name__ == "__main__":
    # Input PDF path.
    input_pdf = r"D:\Users\wangrusheng\Downloads\pe.pdf"
    try:
        page_count = get_pdf_page_count(input_pdf)
        print(f"PDF文件頁數: {page_count}")
    except Exception as e:
        # Top-level boundary of the script: report the failure and exit.
        print(f"處理PDF時出錯: {str(e)}")

end

本文來自互聯網用戶投稿，該文觀點僅代表作者本人，不代表本站立場。本站僅提供信息存儲空間服務，不擁有所有權，不承擔相關法律責任。
如若轉載，請注明出處：http://www.pswp.cn/pingmian/96664.shtml
繁體地址，請注明出處：http://hk.pswp.cn/pingmian/96664.shtml
英文地址，請注明出處：http://en.pswp.cn/pingmian/96664.shtml

如若內容造成侵權/違法違規/事實不符，請聯系多彩編程網進行投訴反饋email:809451989@qq.com，一經查實，立即刪除！