（原創）用python語言基于paddleocr構建批量識別實現紙質和電子的增值稅專用發票程序

文章目錄

1. 說明
2. 準備工作
3. 代碼
- 3.1 導入庫：
- 3.2 遍歷發票指定處理方式
- 3.3 發票識別相關函數
- 3.4 發票字段定位函數
- 3.5 識別記錄相關函數
- 3.6 識別結果校驗
- 3.7 文件預處理等其他函數
- 3.8 main主函數

1. 說明

1.1 以paddle識別引擎為基礎的增值稅發票識別程序，可批量識別和累積紙質發票和電子發票數據。已在生產環境中測試。
1.2 識別的源發票數據：- 文件夾中存放的用高速連續發票掃描儀批量掃描的JPG格式圖片- 文件夾中匯集的電子發票PDF格式文件
1.3 可選擇識別引擎：快速-mb、平衡-sv、精細-pp（總體上，預識別用mb、精細識別用pp，速度和精確度都比較好）。
1.4 適配斷續工作，跳過已掃描的重復發票，邊識別邊存儲。
1.5 可裝在閑置低配置的win7老臺式，資源利用，識別速度視電腦配置差異大概2-3秒一張。
1.6 在實際生產環境中測試，如果紙質發票不清晰，綜合識別準確率大概85%-95%左右。如果數電發票比較多，識別準確率大概達到97%以上。
1.7 對于識別有誤或缺失的數據，在結果中提示錯誤并鏈接原發票文件，以便人工直接對照修改。
1.8 其他： - 公司名稱稅號可在代碼中預置設定好，位置在發票字段定位函數Loc_range_content_pandas。- 可自行預置對方公司名稱錯誤的更正，詳細可在Check_result函數中此處文字內容"字段修正：公司名錯別字"所在位置的字典修改。

2. 準備工作

2.1 準備工作發票電子文件夾：已用高速連續發票掃描儀掃描完紙質發票的圖片文件夾，和已匯集的電子發票PDF格式文件夾。
2.2 安裝好輔助程序 acrobat pro dc
2.3 語言環境 anaconda，python3.7(虛擬環境)
2.4 環境中安裝好所需要的庫。其中 imghdr, shutil, glob, pathlib, tkinter, zipfile(ZipFile) 屬于python標準庫，無需另行安裝；其余第三方庫請自行在虛擬環境中安裝：cv2(opencv-python), numpy, paddlehub, pandas, psutil, openpyxl, paddleocr, pillow, pyzbar, pymupdf，以及PDF轉換函數所需的 pywin32(win32com, winerror)

3. 代碼

3.1 導入庫：

# -*- coding: utf-8 -*-
# 程序名： final_inv_ocr
# Author: ddxn417
# email:allenzhang0182@qq.com
import imghdr
import math
import os
import re
import shutil
from collections import OrderedDict
from datetime import datetime
from glob import glob
from pathlib import Path
from tkinter import filedialog
from tkinter import Tk
import cv2
import numpy as np
import paddlehub as hub
import pandas as pd
import psutil
from openpyxl import cell, load_workbook
from openpyxl.styles import Font, colors
from paddleocr import PaddleOCR, draw_ocr
from PIL import Image, ImageDraw, ImageEnhance, ImageFont
from pyzbar import pyzbar
from zipfile import ZipFile
import fitz #pip install pymupdf

3.2 遍歷發票指定處理方式

# 遍歷文件夾內的發票文件，識別。
def _index_known_invoices(inv_dict, frame, code_column):
    """Add every (invoice number -> file path) pair found in ``frame`` to ``inv_dict``."""
    for _, row in frame.iterrows():
        code = row[code_column]
        # pd.isna recognises any missing value. The former ``is np.NAN``
        # identity test could miss NaN objects that are not the numpy
        # singleton, and the ``np.NAN`` alias no longer exists in NumPy 2.0.
        if pd.isna(code):
            continue
        inv_dict.setdefault(code, []).append(row['file_path'])


def _already_recorded(frame, file_path):
    """Return True when ``file_path`` is listed in the ``file_path`` column of ``frame``."""
    if 'file_path' not in frame.columns:
        return False
    return bool((frame['file_path'] == file_path).any())


def _register_invoice(inv_dict, inv_code, file_path):
    """Record ``file_path`` under ``inv_code``; return True if that number now has several files."""
    paths = inv_dict.setdefault(inv_code, [])
    if file_path not in paths:
        paths.append(file_path)
    return len(paths) > 1


def _log_duplicate(duplicate_pandas, inv_dict, inv_code, file_path):
    """Append one duplicate record, persist the duplicate sheet and return the new frame."""
    row = pd.DataFrame(data={'重復票號':[inv_code],'file_path':[file_path]})
    if duplicate_pandas.empty:
        duplicate_pandas = row
    else:
        duplicate_pandas = pd.concat([duplicate_pandas, row], ignore_index = True, axis = 0)
    Log_result_file(duplicate_pandas,result_file_path,duplicate_sheet_name)
    print(datetime.now().strftime("%H:%M:%S"),file_path, 'Skip. ','\n\t\tDuplicate:', inv_code,inv_dict[inv_code][0])
    return duplicate_pandas


def walk_folder_ocr(origin_pandas,duplicate_pandas,origin_folder_path,**walk_folder_args):
    """Recognise every .jpg/.pdf invoice directly inside ``origin_folder_path``.

    Files whose path is already listed in ``origin_pandas`` or
    ``duplicate_pandas`` are skipped. After each recognised file the result
    sheet is rewritten so an interrupted run can resume.

    Args:
        origin_pandas: results of earlier runs (may be empty).
        duplicate_pandas: duplicate records of earlier runs (may be empty).
        origin_folder_path: folder to scan (sub-folders are not visited).
        **walk_folder_args: must contain ``ocr_engines``,
            ``temp_folder_path`` and ``engine_switch`` (the pre-recognition
            engine).

    Returns:
        ``(result_pandas, duplicate_pandas)`` including the new records.

    NOTE(review): ``precise_engine``, ``result_file_path``,
    ``result_sheet_name``, ``duplicate_sheet_name``, ``ocr_excel_out``,
    ``draw_result_out`` and ``enhance`` are not defined in this function;
    they are presumably module-level globals set by the main section —
    verify.
    """
    ocr_engines = walk_folder_args['ocr_engines']
    temp_folder_path = walk_folder_args['temp_folder_path']
    prepare_engine = walk_folder_args['engine_switch']
    result_pandas = origin_pandas
    wanted_suffixes = ('.jpg', '.pdf')

    # Number of jpg/pdf files in the folder (use glob("**/*") to include sub-folders).
    cnt_file = len({p.resolve() for p in Path(origin_folder_path).glob("*") if p.suffix.lower() in wanted_suffixes})

    # Invoice number -> list of file paths, rebuilt from the earlier records.
    inv_dict = {}
    if not result_pandas.empty:
        _index_known_invoices(inv_dict, result_pandas, '01票號')
    if not duplicate_pandas.empty:
        _index_known_invoices(inv_dict, duplicate_pandas, '重復票號')

    cnt_done = 0
    cnt_duplicate = 0
    if not origin_pandas.empty:
        cnt_done = len(origin_pandas.loc[origin_pandas['file_path'].notnull(),:])
    if not duplicate_pandas.empty:
        cnt_duplicate = len(duplicate_pandas.loc[duplicate_pandas['file_path'].notnull(),:])

    for file_name in os.listdir(origin_folder_path):  # this folder level only
        file_path = os.path.join(origin_folder_path, file_name)
        if not os.path.isfile(file_path):  # skip directories
            continue
        pr,nm,fr,ex = pathsplit(file_path)
        # Compare extensions case-insensitively so the filter agrees with the
        # branch selection below.
        ex = ex.lower()
        if ex not in wanted_suffixes:
            continue

        # Skip files that were recognised (or flagged as duplicates) before.
        if _already_recorded(result_pandas, file_path) or _already_recorded(duplicate_pandas, file_path):
            continue

        result_series_orderdic = OrderedDict()  # field title -> Series (or [Series, ...])
        err_info = ''
        inv_code = ''

        if ex == '.pdf':
            pdf_trans_file_fr = fr
            pdf_trans_file_nm = pdf_trans_file_fr + '.xlsx'
            pdf_trans_folder_path = os.path.join(temp_folder_path, 'temp_pdf_trans_excel')
            if not os.path.exists(pdf_trans_folder_path):
                os.mkdir(pdf_trans_folder_path)
            pdf_trans_file_path = os.path.join(pdf_trans_folder_path, pdf_trans_file_nm)
            if not os.path.exists(pdf_trans_file_path):
                pdf_trans_file_path = Pdf_tans_to(file_path, pdf_trans_file_path, trans_type = '.xlsx', temp_pdf_trans_excel_out = True)
            # Pdf_tans_to returns None on failure; os.path.exists(None) would raise.
            if pdf_trans_file_path and os.path.exists(pdf_trans_file_path):
                result_series_orderdic, err_info, inv_dict = Tele_inv_ocr(ocr_engines, result_series_orderdic, inv_dict, file_path, pdf_trans_file_path, err_info, engine_switch = precise_engine)

            if len(result_series_orderdic) != 0:
                if '01票號' in result_series_orderdic:
                    code_sr = result_series_orderdic['01票號'][0]
                    if len(code_sr) > 0:
                        inv_code = code_sr.values[0]
                        if _register_invoice(inv_dict, inv_code, file_path):
                            # Same invoice number seen in another file: log and skip.
                            duplicate_pandas = _log_duplicate(duplicate_pandas, inv_dict, inv_code, file_path)
                            cnt_duplicate = cnt_duplicate + 1
                            continue
            else:
                # No text result: render the PDF to an image instead.
                pdf_trans_file_nm = pdf_trans_file_fr + '.jpg'
                pdf_trans_folder_path = os.path.join(temp_folder_path, 'temp_pdf_trans_jpg')
                pdf_trans_jpg_file_path = os.path.join(pdf_trans_folder_path, pdf_trans_file_nm)
                pdf_trans_jpg_file_path = Pdf_tans_jpg(file_path, pdf_trans_jpg_file_path, temp_pdf_trans_jpg_out = True)
                # Pdf_tans_jpg returns None on failure; len(None) would raise.
                if pdf_trans_jpg_file_path and os.path.exists(pdf_trans_jpg_file_path):
                    print('\n\nPDF轉成圖片識別：',pdf_trans_jpg_file_path,'【此模塊待添加。】\n\n')

        elif ex == '.jpg':
            known_dict = {}
            temp_img_trans_excel_folder = os.path.join(temp_folder_path,'temp_img_trans_excel')
            # NOTE(review): Ocr_func names its output 'result(<engine>)_<name>.xlsx',
            # so this cache path looks like it can never exist — confirm before
            # changing, because the read below also assumes an index column.
            img_trans_xls_name = 'result_' + fr + '.xlsx'
            img_trans_xls_path = os.path.join(temp_img_trans_excel_folder, img_trans_xls_name)
            if os.path.exists(img_trans_xls_path):
                origin_df = pd.read_excel(img_trans_xls_path, sheet_name=0,header=0,index_col=0,na_values=None, keep_default_na=False, dtype=object)
            else:
                known_dict = Crop_known_from_qrcode(file_path)
                if len(known_dict)>0:
                    inv_code = known_dict['01票號'].values[0]
                    if _register_invoice(inv_dict, inv_code, file_path):
                        duplicate_pandas = _log_duplicate(duplicate_pandas, inv_dict, inv_code, file_path)
                        cnt_duplicate = cnt_duplicate + 1
                        continue
                # Pre-recognition: whole image to a raw text DataFrame.
                origin_df = Ocr_func(ocr_engines, img_path = file_path, temp_folder_path = temp_folder_path, range_title = '', known_dict=known_dict, ocr_excel_out = ocr_excel_out, draw_result_out = draw_result_out, engine_switch=prepare_engine)

            if not origin_df.empty:
                # Locate each invoice field and build the per-field Series dict.
                result_series_orderdic, err_info = Loc_range_content_pandas(ocr_engines, origin_df, result_series_orderdic, err_info, known_dict, file_path, temp_folder_path, enhance = enhance, engine_switch=precise_engine)
                if '01票號' in result_series_orderdic and len(result_series_orderdic['01票號']) > 0:
                    inv_code = result_series_orderdic['01票號'].values[0]
                    _register_invoice(inv_dict, inv_code, file_path)
                if len(inv_code)>0 and len(inv_dict.get(inv_code, [])) > 1:
                    duplicate_pandas = _log_duplicate(duplicate_pandas, inv_dict, inv_code, file_path)
                    cnt_duplicate = cnt_duplicate + 1
                    continue

        # Combine the per-field Series into one DataFrame (one column per field).
        bind_df = pd.DataFrame([result_series_orderdic[series_title][0] if isinstance(result_series_orderdic[series_title], list) else result_series_orderdic[series_title] for series_title in result_series_orderdic]).T
        columns_list =  ['01票號','02代碼','03日期','04購方','05購方稅號','06品名','07單位','08數量','09單價','10稅前','11稅率','12稅額','13合計稅前','14合計稅額','15總額','16大寫','17銷方','18銷方稅號']
        if len(bind_df) == 0:
            bind_df = pd.DataFrame(columns = columns_list)
        result_df = bind_df.copy()
        result_df['file_path'] = ''
        if len(result_df) == 0:
            # DataFrame.append was removed in pandas 2.0; concat works everywhere.
            result_df = pd.concat([result_df, pd.DataFrame([{'file_path': file_path}])], ignore_index = True)
        else:
            result_df.loc[result_df.index[0], 'file_path'] = file_path  # first row only
        result_df['err_info'] = ''
        result_df.loc[result_df.index[0],'err_info'] = err_info  # first row only

        # Missing values must be filled before any further processing.
        result_df = Fill_na_result(result_df)
        if result_pandas.empty:
            result_pandas = result_df
        else:
            result_pandas = pd.concat([result_pandas, result_df], ignore_index = True, axis = 0)
        # Re-check all invoices collected so far after every new file.
        result_pandas = Check_result(result_pandas)
        # Persist after every file so an interrupted run can resume.
        Log_result_file(result_pandas,result_file_path,result_sheet_name)
        # Turn the file paths into hyperlinks in the result sheet.
        Add_hyperlink(result_file_path,result_sheet_name)
        cnt_done = cnt_done + 1
        print(datetime.now().strftime("%H:%M:%S"),file_name, inv_code,'done: ' + str(cnt_done) + ' / ' + str(cnt_file))

    return result_pandas,duplicate_pandas

3.3 發票識別相關函數

# ocr image to origin_DataFrame. 
def Ocr_func(ocr_engines, img_path, temp_folder_path,  range_title='', known_dict = None, ocr_excel_out = True, draw_result_out = False, engine_switch = 0) ->object: #DataFrame
    """Run OCR on one image and return the recognised text boxes as a DataFrame.

    Args:
        ocr_engines: container indexed by ``engine_switch`` holding the engines.
        img_path: image to recognise.
        temp_folder_path: parent of the 'temp_img_trans_excel' and
            'temp_draw_result' output folders.
        range_title: optional field title appended to the output file names.
        known_dict: accepted for interface compatibility; not used here.
        ocr_excel_out: write the resulting table to an .xlsx file.
        draw_result_out: (engine 1 only) save an annotated copy of the image.
        engine_switch: 0 -> 'mb', 1 -> 'pp', 2 -> 'sv'. Engine 1 is called
            through ``.ocr``; engines 0 and 2 through ``.recognize_text``.

    Returns:
        DataFrame with a 'content' column, a confidence column ('trust' for
        engine 1, 'confdence' for engines 0/2) and the corner coordinates
        luw, luh, ruw, ruh, rdw, rdh, ldw, ldh.

    Raises:
        ValueError: ``engine_switch`` is not 0, 1 or 2.
    """
    engine_names = {0: 'mb', 1: 'pp', 2: 'sv'}
    if engine_switch not in engine_names:
        # Previously this fell through with ``engine``/``df`` unbound.
        raise ValueError('unsupported engine_switch: ' + repr(engine_switch))
    engine = engine_names[engine_switch]

    p,n,fr,ex = pathsplit(img_path)
    temp_img_trans_excel_folder = os.path.join(temp_folder_path,'temp_img_trans_excel')
    temp_draw_result_folder = os.path.join(temp_folder_path,'temp_draw_result')

    if range_title =='':
        img_trans_xls_name = 'result(' + engine + ')_' + fr + '.xlsx'
    else:
        img_trans_xls_name = 'result(' + engine + ')_' + fr + '_' + range_title + '.xlsx'
    img_trans_xls_path = os.path.join(temp_img_trans_excel_folder, img_trans_xls_name)

    if not os.path.exists(temp_img_trans_excel_folder):
        Create_clear_dir(temp_img_trans_excel_folder)
    if not os.path.exists(temp_draw_result_folder):
        Create_clear_dir(temp_draw_result_folder)

    if engine_switch == 1:
        paddleOcr = ocr_engines[engine_switch]
        results = paddleOcr.ocr(img_path, cls=True)
        # Each result row is [four corner points, (text, confidence)].
        df0 = pd.DataFrame(data=results,columns=['pix','result'])
        df1 = pd.concat([pd.DataFrame(df0['pix'].values.tolist(),columns=['lu','ru','rd','ld']), pd.DataFrame(df0['result'].values.tolist(),columns=['content','trust'])], axis=1)
        df = df1[['content','trust']]
        # Split every corner point into separate width/height columns.
        for title in ['lu', 'ru', 'rd', 'ld']:
            df = pd.concat([df, pd.DataFrame(df1[title].values.tolist(), columns=[title + 'w', title + 'h'])], axis=1)
        if ocr_excel_out:
            df.to_excel(img_trans_xls_path, index=False)
        if draw_result_out:
            image = Image.open(img_path).convert('RGB')
            # Draw from ``results``; the old code iterated an empty placeholder
            # string, so no boxes were ever drawn.
            boxes = [line[0] for line in results]
            txts = [line[1][0] for line in results]
            scores = [line[1][1] for line in results]
            im_show = draw_ocr(image, boxes, txts, scores, font_path='./fonts/simfang.ttf')
            im_show = Image.fromarray(im_show)
            if range_title =='':
                draw_result_name = 'draw_result_' + fr + ex
            else:
                draw_result_name = 'draw_result_' + fr + '_' + range_title + ex
            draw_result_path = os.path.join(temp_draw_result_folder, draw_result_name)
            im_show.save(draw_result_path)
    else:  # engine_switch is 0 or 2
        hubOcr = ocr_engines[engine_switch]
        img = cv_imread(img_path)
        np_images = [img]
        hub_result = hubOcr.recognize_text(images=np_images,  # ndarray images, [H, W, C], BGR
                                           use_gpu=False,  # set CUDA_VISIBLE_DEVICES first if True
                                           output_dir=temp_draw_result_folder,  # where visualisations go
                                           visualization=True,  # save the annotated image
                                           box_thresh=0.5,  # detection box confidence threshold
                                           text_thresh=0.5)  # text recognition confidence threshold
        results = hub_result[0]['data']
        column_list = ['content','confdence','luw','luh','ruw','ruh','rdw','rdh','ldw','ldh']
        rows = []
        for infomation in results:
            box = infomation['text_box_position']
            rows.append([infomation['text'], infomation['confidence'],
                         box[0][0], box[0][1], box[1][0], box[1][1],
                         box[2][0], box[2][1], box[3][0], box[3][1]])
        # One construction instead of a concat per row; object dtype matches
        # what the row-by-row build produced.
        df = pd.DataFrame(rows, columns=column_list, dtype=object)
        if ocr_excel_out:
            df.to_excel(img_trans_xls_path, index = False)
    return df


# Read the invoice QR-code information
def Crop_known_from_qrcode(file_path) ->dict:
    """Read the invoice QR code of a scanned image and return the known fields.

    The QR area is cropped to 'temp_crop_qrcode/qrcode_<file name>' under the
    module-level ``temp_folder_path`` and then decoded.

    Returns:
        dict mapping '02代碼', '01票號', '13合計稅前' and '03日期' to
        one-element Series, taken from comma-separated payload fields 2-5;
        empty when no usable QR code was found.
    """
    known_dict = {}
    pr,nm,fr,ex = pathsplit(file_path)
    qrcode_folder_path = os.path.join(temp_folder_path, 'temp_crop_qrcode')
    if not os.path.exists(qrcode_folder_path):
        Create_clear_dir(qrcode_folder_path)
    qrcode_file_path = os.path.join(qrcode_folder_path, 'qrcode_' + nm)

    qrcode_result = ''
    if Crop_qrcode_image(file_path, qrcode_file_path) == True:
        qrcode_result = qrcode_recongnize(qrcode_file_path)

    # Only payloads longer than 20 characters are treated as invoice data.
    if qrcode_result and len(qrcode_result) > 20:
        qrcode_list = qrcode_result.split(',')
        # The date key is '03日期', as in the result column list; it used to be
        # stored under '04日期'.
        field_titles = ['02代碼','01票號','13合計稅前','03日期']
        # Fields are read from index 2 onwards, so six fields are required.
        if len(qrcode_list) >= len(field_titles) + 2:
            for index, range_title in enumerate(field_titles):
                known_dict[range_title] = pd.Series(data=qrcode_list[index+2],name = range_title)
    return known_dict


# Crop the QR-code area and enlarge it
def Crop_qrcode_image(origin_file_path,crop_file_path):
    """Crop the fixed QR-code area of an invoice image and save it enlarged.

    The region rows 100-400 / columns 50-350 is resized to 1200x1200 pixels
    and written to ``crop_file_path`` as JPEG.

    Returns:
        True when the crop file exists after writing, otherwise False
        (also when the image cannot be read or the region is empty).
    """
    img_inv = cv_imread(origin_file_path)
    if img_inv is None:
        return False
    img_crop = img_inv[100:400, 50:350]  # h, w
    if img_crop.size == 0:
        # Image smaller than the crop origin: nothing to resize.
        return False
    img_magnify = cv2.resize(img_crop, (1200, 1200))
    # imencode + tofile instead of imwrite, as in the original code.
    encoded_ok, buffer = cv2.imencode('.jpg', img_magnify)
    if not encoded_ok:
        return False
    buffer.tofile(crop_file_path)
    return os.path.exists(crop_file_path)


# QR-code recognition:
def qrcode_recongnize(file_path, method = 'cv2', drawframe = False, enhance=False): #method: pil or cv2
    """Decode the QR/bar code contained in an image file.

    Args:
        file_path: image to decode.
        method: 'cv2' (read through cv_imread, decode the grey image) or
            'pil' (read through PIL).
        drawframe: 'cv2' only — save a copy with the payload drawn on it as
            'draw_qrcode_<file name>' next to the input.
        enhance: 'pil' only — sharpen and raise contrast before decoding.

    Returns:
        The decoded payload as str, or '' when nothing was decoded. With
        'cv2' the first payload longer than 20 characters wins; otherwise
        the payload of the last code found is returned.

    Raises:
        ValueError: ``method`` is neither 'cv2' nor 'pil'.
    """
    pr, nm = os.path.split(file_path)
    output_img_path = os.path.join(pr, 'draw_qrcode_' + nm)

    if method =='cv2':
        img = cv_imread(file_path)
        if img is None:
            return ''
        gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        barcodeData = ''
        for barcode in pyzbar.decode(gray_img):
            # Outline the code on the colour image.
            (x, y, w, h) = barcode.rect
            cv2.rectangle(img, (x, y), (x + w, y + h), (255, 255, 0), 2)
            # The payload is bytes; convert it to text.
            barcodeData = barcode.data.decode("utf-8")
            if len(barcodeData) > 20:
                if drawframe == True:
                    # PIL is used to draw the text on the image.
                    img_PIL = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
                    font = ImageFont.truetype('STFANGSO.TTF', 25)
                    draw = ImageDraw.Draw(img_PIL)
                    draw.text((x, y-25), barcodeData, font=font,fill=(0,255,0))
                    img_PIL.save(output_img_path, 'jpeg')
                break
        return barcodeData

    if method == 'pil':
        img = Image.open(file_path).convert('RGB')
        if enhance == True:
            img = ImageEnhance.Brightness(img).enhance(1.0)  # brightness (factor 1.0)
            img = ImageEnhance.Sharpness(img).enhance(1.5)   # sharpen
            img = ImageEnhance.Contrast(img).enhance(2.0)    # contrast
        img = img.convert('L')  # greyscale
        decoded = pyzbar.decode(img)
        if not decoded:
            # Previously decoded[0] raised IndexError when nothing was found.
            return ''
        return decoded[0][0].decode('utf-8')

    # An unknown method used to fall through and return None.
    raise ValueError("method must be 'cv2' or 'pil', got " + repr(method))


# OCR on a cropped part of the image
def _crop_ocr_center_mask(df, center_limit):
    """Return a boolean mask of the rows whose box centre lies inside ``center_limit``."""
    center_w_min, center_w_max, center_h_min, center_h_max = center_limit
    center_w = (df['luw'] + df['ruw']) // 2
    center_h = (df['luh'] + df['ldh']) // 2
    return ((center_w_min <= center_w) & (center_w <= center_w_max)
            & (center_h_min <= center_h) & (center_h <= center_h_max))


def _crop_ocr_result_list(text_sr, range_title):
    """Turn the matched texts into the result list for ``range_title``."""
    if range_title == '07單位':
        # Remove the header characters, then split the first text into single
        # characters (e.g. a two-character string becomes two items).
        return list(text_sr.replace(to_replace = '[單|位|數|量]',value='',regex=True).values[0])
    return text_sr.to_list()


def Crop_ocr(ocr_engines, result_series_orderdic, known_dict,img_inv, file_path, crop_folder_path, set_h_adjust, cond_list, enhance = False, engine_switch = 0):
    """Crop one field region from the invoice image, OCR it and store the match.

    Args:
        ocr_engines: engines passed through to ``Ocr_func``.
        result_series_orderdic: dict updated in place with ``range_title`` ->
            Series of matched texts.
        known_dict: passed through to ``Ocr_func``.
        img_inv: the invoice image array.
        file_path: path of the invoice file; its file name is used for the
            crop file name.
        crop_folder_path: folder receiving the cropped image.
        set_h_adjust: vertical offset in pixels; applied, scaled by a
            per-field ratio, to the fields listed in the ratio table.
        cond_list: ``[range_title, loc_method, reg_type, reg, count_limit,
            loc_dict]``; ``loc_dict['crop']`` is
            ``[min_w, max_w, min_h, max_h]`` and the optional
            ``loc_dict['center_limit']`` restricts the box centres.
        enhance: enhance the crop of text-heavy fields before OCR.
        engine_switch: engine selector passed to ``Ocr_func``.

    Returns:
        ``(result_series_orderdic, get_h_adjust)``; ``get_h_adjust`` is
        non-zero only for '01票號' and is the distance between the crop
        centre height and the matched box centre height.
    """
    nm = pathsplit(file_path)[1]
    range_title = cond_list[0]
    reg_type = cond_list[2]
    reg = cond_list[3]
    loc_dict = cond_list[5]
    [min_w,max_w,min_h,max_h] = loc_dict['crop']

    adjust_ratio_dict = {'02代碼':1, '03日期':1,'10稅前':0.6,'11稅率':0.7,'12稅額':0.8}
    if range_title in adjust_ratio_dict:
        # Shift the crop window vertically by the scaled fine-tuning offset.
        shift = int(set_h_adjust * adjust_ratio_dict[range_title])
        min_h = min_h - shift
        max_h = max_h - shift
    crop_center_h = (max_h - min_h)//2  # half of the crop height
    img_crop = img_inv[min_h:max_h, min_w:max_w]

    enhance_title = ['04購方','05購方稅號','06品名','07單位','16大寫','17銷方','18銷方稅號']
    if enhance == True and range_title in enhance_title:
        img_crop = pil_cv2(pil_enhance(cv2_pil(img_crop)))

    crop_file_path = os.path.join(crop_folder_path, 'crop_'+ range_title + '_' +nm)
    cv2.imencode('.jpg', img_crop)[1].tofile(crop_file_path)
    df = Ocr_func(ocr_engines, img_path = crop_file_path, temp_folder_path = crop_folder_path, range_title = range_title, known_dict=known_dict,ocr_excel_out = True, draw_result_out = True, engine_switch = engine_switch)

    get_h_adjust = 0
    result_sr = pd.Series(name = range_title, dtype = object)

    matched = None
    if not df.empty:
        if reg_type == 'extract':
            matched = pd.notna(df['content'].str.extract(reg)[0])
        elif reg_type == 'contains':
            matched = df['content'].str.contains(reg, na=False)
            if range_title == '07單位':
                # Exclude the column header texts themselves.
                matched = matched & ~df['content'].str.contains(r'單\s*位|數\s*量', na=False)

    if matched is not None:
        if 'center_limit' in loc_dict:
            matched = matched & _crop_ocr_center_mask(df, loc_dict['center_limit'])
        hits = df.loc[matched, :]
        # Only use a match when one exists: the 'contains' branch used to index
        # values[0] on an empty frame, and the 'extract' branch never assigned
        # its '07單位' result.
        if not hits.empty:
            text_sr = hits.iloc[:, 0] if reg_type == 'extract' else hits['content']
            result_sr = pd.Series(data = _crop_ocr_result_list(text_sr, range_title), name = range_title)
            if range_title == '01票號':
                data_center_h = (hits['luh'].values[0] + hits['ldh'].values[0]) //2
                get_h_adjust = int(crop_center_h - data_center_h)  # must be an int

    result_series_orderdic[range_title] = result_sr
    return result_series_orderdic, get_h_adjust


# Look up text directly in the invoice pre-recognition table by condition
def _rows_in_box(df, hit, box):
    """Return the rows of ``df`` flagged by ``hit`` whose centre lies inside ``box``."""
    w_min, w_max, h_min, h_max = box
    inside = ((w_min <= df['center_w']) & (df['center_w'] <= w_max)
              & (h_min <= df['center_h']) & (df['center_h'] <= h_max))
    return df.loc[hit & inside, :]


def Loc_jpg_content(df, cond_list, order_dict):
    """Find one invoice field in the pre-recognition table by regex and position.

    Args:
        df: OCR table; the code reads the columns 'content', 'center_w',
            'center_h', 'luw' and 'luh'.
        cond_list: ``[range_title, loc_method, reg_type, reg, count_limit,
            loc_dict]``. ``loc_dict['direct']`` holds
            ``[w_min, w_max, h_min, h_max]`` and optionally four more values
            describing a fallback region.
        order_dict: dict updated in place.

    Returns:
        ``order_dict`` with ``order_dict[range_title] = [Series, loc]``.
        ``loc`` is the (luw, luh) pair of the first matching box, or ``[]``
        when nothing matched. With 'extract' the Series holds the first regex
        group (only the first match when ``count_limit == '1'``); with
        'contains' it holds the first matching row's first column. Fallback
        matches are returned without a count limit.
    """
    range_title = cond_list[0]
    reg_type = cond_list[2]
    reg = cond_list[3]
    count_limit = cond_list[4]
    boxes = cond_list[5]['direct']

    result_sr = pd.Series(name = range_title, dtype = object)
    loc_tuple = []

    if reg_type in ('extract', 'contains') and len(df) > 0:
        if reg_type == 'extract':
            text = df['content'].str.extract(reg).iloc[:, 0]
            hit = text.str.len() > 0
        else:
            text = df.iloc[:, 0]
            hit = df['content'].str.contains(reg, na=False)

        primary = _rows_in_box(df, hit, boxes[0:4])
        if not primary.empty:
            picked = text.loc[primary.index]
            if reg_type == 'contains' or count_limit == '1':
                picked = picked.head(1)
            result_sr = picked
            loc_tuple = primary.loc[:, ['luw','luh']].values[0]
        elif len(boxes) >= 8:
            # Nothing in the primary region: try the fallback region. The old
            # 'extract' path looked up luw/luh on a frame that never received
            # those columns, and both paths indexed [0] on empty results.
            fallback = _rows_in_box(df, hit, boxes[4:8])
            if not fallback.empty:
                result_sr = text.loc[fallback.index]
                loc_tuple = fallback.loc[:, ['luw','luh']].values[0]

    # Store the texts with a fresh index together with the top-left corner
    # of the first matching box.
    order_dict[range_title] = [pd.Series(result_sr.to_list(), name=range_title, dtype=object), loc_tuple]
    return order_dict


# ------------------------------func:Loc_tele_content-----------------------
# 功能：根據條件在發票圖片預識別表中直接查找文本
def Loc_tele_content(df, known_dict, cond_list, order_dict):
    """Extract one field from the text table of an electronic (PDF) invoice.

    Args:
        df: table with a 'content' column.
        known_dict: values already known for some fields (e.g. from the QR code).
        cond_list: ``[range_title, loc_method, reg_type, reg, count_limit]``;
            ``reg`` is a sequence of patterns tried in order until one matches.
            ``count_limit`` '1' keeps the first match, '-1' the last one,
            anything else keeps all matches.
        order_dict: dict updated in place.

    Returns:
        ``(order_dict, err_info)`` with
        ``order_dict[range_title] = [Series]``; ``err_info`` is always ''.
    """
    range_title = cond_list[0]
    reg_type = cond_list[2]
    reg = cond_list[3]
    count_limit = cond_list[4]
    known_sr = pd.Series(name = range_title, dtype = object)
    result_sr = pd.Series(name = range_title, dtype = object)
    err_info = ''

    if range_title in known_dict:
        known_sr = pd.Series(data= known_dict[range_title], name = range_title)

    if reg_type == 'extract':
        matches = pd.DataFrame()
        # Try the patterns in order and stop at the first one that matches.
        for pattern in reg:
            extracted = df['content'].str.extract(pattern)
            matches = extracted.loc[extracted.iloc[:,0].str.len() > 0,:]
            if len(matches)>0:
                break
        if len(matches)>0:
            first_group = matches.iloc[:,0]
            if count_limit == '1':
                result_sr = first_group.head(1)
            elif count_limit == '-1':
                # Keep the last match. tail(-1) returned all rows except the first.
                result_sr = first_group.tail(1)
            else:
                result_sr = first_group

    # Remove one leading and one trailing whitespace character.
    result_sr = result_sr.replace(to_replace=r'^\s|\s$',value='',regex=True)

    if range_title =='13合計稅前':
        if len(known_sr) > 0:
            if len(result_sr) == 0:
                result_sr = known_sr
            elif result_sr.values[0] == known_sr.values[0]:
                # Some QR codes carry the total instead of the pre-tax amount;
                # the known value is only used when both agree.
                result_sr = known_sr.copy()
    elif range_title =='15總額':
        if '13合計稅前' in known_dict:
            # Same QR-code peculiarity: the known "pre-tax" value may be the total.
            known_sr = pd.Series(data= known_dict['13合計稅前'], name = range_title)
            if len(known_sr) > 0 and len(result_sr) > 0:
                if result_sr.values[0] == known_sr.values[0]:
                    result_sr = known_sr.copy()
    elif range_title == '06品名':
        if len(result_sr) > 0:
            # Remove whitespace between two runs of Chinese characters (one
            # name); remaining spaces separate different item names. The old
            # loop indexed with a tuple from enumerate() and failed as soon as
            # a second occurrence existed.
            result_sr = result_sr.str.replace(r'([\u4e00-\u9fa5]+)\s+([\u4e00-\u9fa5]+)', r'\1\2', regex=True)
            # Several item names in the first row: split on spaces into rows.
            data = result_sr.iloc[0]
            if isinstance(data, str) and data.count(' ')>0:
                result_sr = pd.Series(data = data.split(' '),name=range_title)
    else:
        # Other fields: fall back to the known value when nothing was found.
        if len(result_sr) == 0 and len(known_sr) > 0:
            result_sr = known_sr.copy()

    result_sr.name = range_title
    result_sr.index = list(range(len(result_sr)))  # fresh 0..n-1 index
    order_dict[range_title] = [result_sr]
    return order_dict, err_info


def Get_known_from_from_xls_image(origin_pdf_xls_path, paddle_ocr):
    """Recognise the images embedded in a PDF-converted xlsx and build known_dict.

    The xlsx is opened as a zip archive and every 'xl/media' image is
    extracted into a folder named after the workbook. Square images with a
    side of 80-200 pixels are decoded as QR code; images less than 50 pixels
    high are enlarged four times, padded and passed to ``paddle_ocr``. The
    recognised texts are searched for amounts and tax numbers.

    Returns:
        dict that may contain '01票號', '02代碼', '03日期', '13合計稅前',
        '14合計稅額', '15總額', '05購方稅號' and '18銷方稅號'.
    """
    xls_file_path = origin_pdf_xls_path
    pr, nm = os.path.split(xls_file_path)
    fr = os.path.splitext(nm)[0]
    unzip_path = os.path.join(pr, fr)  # extraction folder
    result_title=['content']
    result_df = pd.DataFrame(columns = result_title)
    known_dict = {}
    draw_result_out = True
    last_img_fr = None  # stem of the last media file processed

    if not os.path.exists(unzip_path):
        os.mkdir(unzip_path)
    draw_result_folder = os.path.join(unzip_path, 'draw_result')
    if draw_result_out and not os.path.exists(draw_result_folder):
        os.mkdir(draw_result_folder)

    with ZipFile(xls_file_path) as f:
        for file in f.namelist():
            if not file.startswith("xl/media"):
                continue
            f.extract(file, path=unzip_path)
            temp_img_name = os.path.split(file)[1]
            last_img_fr = os.path.splitext(temp_img_name)[0]
            tempimg_path = os.path.join(unzip_path, file)
            img = cv_imread(tempimg_path)
            if img is None:
                continue
            if img.ndim == 2:
                # The code below expects three channels.
                img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
            h, w = img.shape[:2]

            # A square image with a side of 80-200 pixels may be the QR code.
            if 80 <= max(h, w) <= 200 and h == w:
                codedata = pyzbar.decode(img)
                if len(codedata) > 0:
                    data_str = codedata[0].data.decode()
                    if len(data_str) > 20:
                        data_list = data_str.split(',')
                        # Index 5 is read below, so six fields are required.
                        if len(data_list) > 5:
                            # NOTE(review): the original statements ended with a
                            # trailing comma, so these three values are 1-tuples
                            # while '13合計稅前' is a plain string. Kept as is
                            # because consumers outside this section may rely on
                            # it — confirm and unify.
                            known_dict['01票號'] = (data_list[3],)
                            known_dict['02代碼'] = (data_list[2],)
                            known_dict['03日期'] = (data_list[5],)
                            known_dict['13合計稅前'] = data_list[4]

            # Only images less than 50 pixels high are sent to OCR.
            if h < 50:
                enlarge = 4
                img_new = new(img, enlarge)
                edge = 20
                color = (255,255,255)  # white border
                img_large = cv2.copyMakeBorder(img_new,edge,edge,edge,edge, cv2.BORDER_CONSTANT,value=color)
                enlarge_img_folder = os.path.join(unzip_path, 'img_enlarge')
                if not os.path.exists(enlarge_img_folder):
                    os.mkdir(enlarge_img_folder)
                enlarge_img_path = os.path.join(enlarge_img_folder, 'enlarge_' + temp_img_name)
                cv2.imencode(".jpg", img_large)[1].tofile(enlarge_img_path)
                result = paddle_ocr.ocr(img_large, cls=True)
                if result:
                    df = pd.DataFrame(data=[line[1][0] for line in result],columns = result_title)
                    result_df = Collect_df(result_df, df)
                    if draw_result_out:
                        image = Image.open(enlarge_img_path).convert('RGB')
                        boxes = [line[0] for line in result]
                        txts = [line[1][0] for line in result]
                        scores = [line[1][1] for line in result]
                        im_show = draw_ocr(image, boxes, txts, scores, font_path='./fonts/simfang.ttf')
                        im_show = Image.fromarray(im_show)
                        draw_result_path = os.path.join(draw_result_folder, 'draw_' + temp_img_name)
                        im_show.save(draw_result_path)

    # Amounts: texts starting with a yen sign. The class held the full-width
    # sign twice (apparently a garbled half-width sign); both are accepted now.
    temp_df = result_df.loc[:,'content'].str.extract('[¥￥]([.0-9]+)')
    temp_df.columns=['content']
    amount_df = temp_df.loc[temp_df['content'].notna(),:]
    if len(amount_df) >= 3 and '13合計稅前' in known_dict:
        sqhj = float(known_dict['13合計稅前'])
        amounts = pd.to_numeric(amount_df['content'], errors='coerce').dropna()
        if sqhj > 1:
            # Of the amounts other than the pre-tax sum, the largest is the
            # total and the smallest the tax.
            values = amounts.loc[amounts != sqhj].values
            if len(values) > 0:
                known_dict['15總額'] = max(values)
                known_dict['14合計稅額'] = min(values)

    # Tax numbers: 18 characters starting with '91'.
    temp_df = result_df.loc[:,'content'].str.extract(r'^(91\S{16})$')
    temp_df.columns=['content']
    tax_numbers_df = temp_df.loc[temp_df['content'].notna(),:]
    if len(tax_numbers_df) > 0:
        our_number = '你公司的稅號'
        known_dict['05購方稅號'] = our_number
        values = tax_numbers_df.loc[tax_numbers_df['content']!=our_number,'content'].values
        if len(values)>0:
            known_dict['18銷方稅號'] = values[0]

    # Save the collected texts, named after the last image processed; skipped
    # when the archive had no media (the name used to be unbound then).
    if last_img_fr is not None:
        img_ocr_result_folder = os.path.join(unzip_path, 'result')
        if not os.path.exists(img_ocr_result_folder):
            os.mkdir(img_ocr_result_folder)
        img_ocr_result_path = os.path.join(img_ocr_result_folder, last_img_fr + '.xlsx')
        result_df.to_excel(img_ocr_result_path)
    return known_dict


# PDF file format conversion
def Pdf_tans_to(file_path, pdf_trans_to_file_path, trans_type = '.xlsx', temp_pdf_trans_excel_out = True):
    """Convert a PDF to xlsx or plain text through the Adobe Acrobat COM interface.

    Args:
        file_path: path of the source PDF.
        pdf_trans_to_file_path: path of the converted file to create.
        trans_type: '.xlsx' selects the Excel converter; any other value
            selects the plain-text converter.
        temp_pdf_trans_excel_out: not used by this function (kept for callers).

    Returns:
        pdf_trans_to_file_path if that file exists afterwards, otherwise None.
    """
    # pywin32 modules are imported at call time, as in the original code.
    import winerror
    from win32com.client.dynamic import ERRORS_BAD_CONTEXT, Dispatch
    # Register the error code only once instead of growing the list on every call.
    if winerror.E_NOTIMPL not in ERRORS_BAD_CONTEXT:
        ERRORS_BAD_CONTEXT.append(winerror.E_NOTIMPL)

    output_folder_path = os.path.split(pdf_trans_to_file_path)[0]
    if not os.path.exists(output_folder_path):
        Create_clear_dir(output_folder_path)

    if trans_type == '.xlsx':
        trans_engion = 'com.adobe.acrobat.xlsx'
    else:
        # '.txt' and every unrecognised type use the plain-text converter.
        trans_engion = 'com.adobe.acrobat.plain-text'

    # Bound before the try so the cleanup below cannot hit an unbound name
    # when Dispatch itself fails (e.g. Acrobat is not installed).
    AvDoc = None
    try:
        AvDoc = Dispatch("AcroExch.AVDoc")
        if AvDoc.Open(file_path, ""):
            pdDoc = AvDoc.GetPDDoc()
            jsObject = pdDoc.GetJSObject()
            jsObject.SaveAs(pdf_trans_to_file_path, trans_engion)
    except Exception as e:
        # Best effort: report the problem and let the existence check decide.
        print(str(e))
    finally:
        if AvDoc is not None:
            AvDoc.Close(True)
        jsObject = None
        pdDoc = None
        AvDoc = None

    if os.path.exists(pdf_trans_to_file_path):
        return pdf_trans_to_file_path
    return None


def Pdf_tans_jpg(file_path, pdf_trans_jpg_file_path, temp_pdf_trans_jpg_out = True):
    """Render a PDF to an image file with PyMuPDF (fitz).

    Every page is written to the same output path, so for a multi-page PDF
    the file ends up holding the last page.
    NOTE(review): the data is written with writePNG whatever extension the
    output path carries - confirm that downstream readers accept this.

    Args:
        file_path: path of the source PDF.
        pdf_trans_jpg_file_path: path of the image file to create.
        temp_pdf_trans_jpg_out: not used by this function (kept for callers).

    Returns:
        pdf_trans_jpg_file_path if that file exists afterwards, otherwise None.
    """
    output_folder_path = os.path.split(pdf_trans_jpg_file_path)[0]
    if not os.path.exists(output_folder_path):
        Create_clear_dir(output_folder_path)

    # A zoom factor of 2 on each axis yields four times as many pixels.
    zoom_x = 2.0
    zoom_y = 2.0
    rotate = int(0)
    trans = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)

    doc = fitz.open(file_path)
    try:
        for pg in range(doc.pageCount):
            page = doc[pg]
            pm = page.getPixmap(matrix=trans, alpha=False)
            pm.writePNG(pdf_trans_jpg_file_path)
    finally:
        # Release the document handle even if rendering fails.
        doc.close()

    if os.path.exists(pdf_trans_jpg_file_path):
        return pdf_trans_jpg_file_path
    return None


def pil_enhance(img):
    """Pre-process a PIL image: brightness, sharpness, contrast, then grayscale.

    Returns the enhanced image converted to mode 'L'.
    """
    # Brightness factor 1.0 leaves the brightness unchanged.
    img = ImageEnhance.Brightness(img).enhance(1.0)
    # Sharpen.
    img = ImageEnhance.Sharpness(img).enhance(1.5)
    # Raise the contrast.
    img = ImageEnhance.Contrast(img).enhance(2.0)
    # Convert to grayscale.
    img_result = img.convert('L')
    return img_result


def new(img, enlarge):
    """Enlarge an image array `enlarge` times by pixel replication.

    Each source pixel becomes an enlarge x enlarge block. The previous
    implementation hard-coded the factor 4 in its loops, so any other
    `enlarge` value failed or left blank pixels.

    Args:
        img: numpy image array (rows, columns, ...).
        enlarge: positive integer scale factor.

    Returns:
        A float64 array (the dtype the original implementation produced)
        with the first two dimensions multiplied by `enlarge`.
    """
    repeated_rows = np.repeat(img, enlarge, axis=0)
    img_new = np.repeat(repeated_rows, enlarge, axis=1)
    return img_new.astype(np.float64)


def Pil_make_border(image, edge = 20):
    """Centre a PIL image on a white canvas that is `edge` pixels wider and taller.

    The border on each side is therefore edge // 2 pixels.
    """
    iw, ih = image.size  # original size
    w, h = iw + edge, ih + edge  # canvas size
    target_size = (w, h)
    # The picture itself keeps its size; only the canvas grows.
    nw = iw
    nh = ih
    image = image.resize((nw, nh), Image.BICUBIC)
    color = (255, 255, 255)  # white
    new_image = Image.new('RGB', target_size, color)
    # Integer division centres the picture on the canvas.
    new_image.paste(image, ((w - nw) // 2, (h - nh) // 2))
    return new_image

3.4 發票字段定位函數

# 根據紙質發票數據預識別表定位數據
def Loc_range_content_pandas(ocr_engines, df, result_series_orderdic, err_info, known_dict, file_path, temp_folder_path, enhance=False, engine_switch=0):
    """Locate every field of a scanned paper invoice.

    The uppercase total amount ('16大寫') is searched first and its position
    is used as the origin; every other field region is an offset from it.
    Fields are then resolved in order by the methods listed for each field:
    'known' (value already in known_dict), 'crop' (Crop_ocr on a cropped
    region of the image) and 'direct' (Loc_jpg_content on the pre-OCR table).

    Args:
        ocr_engines: passed through to Crop_ocr.
        df: pre-OCR table; the columns 'content', 'luw', 'rdw', 'luh' and
            'rdh' are read, and 'center_w' / 'center_h' are added in place.
        result_series_orderdic: mapping that receives one entry per field.
        err_info: incoming value is discarded; a fresh message is returned.
        known_dict: already known field values (e.g. from the QR code).
        file_path: path of the invoice image.
        temp_folder_path: folder in which the 'crop' sub-folder is created.
        enhance: passed through to Crop_ocr.
        engine_switch: passed through to Crop_ocr.

    Returns:
        (result_series_orderdic, err_info)
    """
    # NOTE: these two values are not used inside this function.
    user_name, user_code = '你的公司名稱', '你公司的稅號'

    # Start this invoice's error text empty. The reset used to happen after
    # the origin lookup, which erased the "not found" message below.
    err_info = ''

    # The conversion result used to be thrown away; assign it back.
    df['content'] = df['content'].fillna('').astype(str)

    # Centre point of every recognised region.
    df['center_w'] = (df.loc[:, 'luw'] + df.loc[:, 'rdw']) / 2
    df['center_h'] = (df.loc[:, 'luh'] + df.loc[:, 'rdh']) / 2

    w_ratio = 1
    h_ratio = 1
    w_this_loc_tradtitle = 240
    h_this_loc_tradtitle = 1170

    # Distances from the price-and-tax-total title to the edges of the
    # region expected to contain the uppercase amount.
    min_w_zero_distance, max_w_zero_distance, min_h_zero_distance, max_h_zero_distance \
        = 521, 1550, -33, 98
    # Search window for the origin (the uppercase amount) on this invoice.
    min_w_zero = w_this_loc_tradtitle + w_ratio * min_w_zero_distance
    max_w_zero = w_this_loc_tradtitle + w_ratio * max_w_zero_distance
    min_h_zero = h_this_loc_tradtitle + h_ratio * min_h_zero_distance
    max_h_zero = h_this_loc_tradtitle + h_ratio * max_h_zero_distance
    loc_trad_range = [min_w_zero, max_w_zero, min_h_zero, max_h_zero]

    # Query for the origin field.
    cond_trad = [
        '16大寫',
        'direct',
        'contains',
        '[圓角分整零壹貳叁肆伍陸柒捌玖拾佰仟萬億]{2,}',
        '1',  # extraction count limit
        {'direct': loc_trad_range},
    ]
    known_dict = Loc_jpg_content(df, cond_trad, order_dict=known_dict)
    if len(known_dict['16大寫'][1]) > 0:
        (w_zero, h_zero) = known_dict['16大寫'][1]
    else:
        err_info = err_info + '識別失敗！未找到大寫金額內容。'
        # Fall back to a default origin.
        w_zero = 750
        h_zero = 1180

    def _box(w_from, w_to, h_from, h_to):
        """Return [w_min, w_max, h_min, h_max] for offsets relative to the origin."""
        return [int(w_zero + w_ratio * w_from), int(w_zero + w_ratio * w_to),
                int(h_zero + h_ratio * h_from), int(h_zero + h_ratio * h_to)]

    # Item layout: 0 field title, 1 locating methods in priority order,
    # 2 match mode, 3 regular expression, 4 extraction count limit,
    # 5 regions per method.
    range_list = [
        [   # invoice number
            '01票號', ['known', 'crop'], 'extract', '^\D*(\d{8})$', '1',
            {'crop': _box(1430, 1685, -990, -900), 'known': known_dict},
        ],
        [   # invoice code
            '02代碼', ['known', 'crop'], 'extract', '([a-zA-Z0-9]{10})$', '1',
            {'crop': _box(-475, 80, -1100, -920)},
        ],
        [   # issue date
            '03日期', ['known', 'crop'], 'extract',
            '(\d{4}\s*年\s*\d{2}\s*月\s*\d{2}\s*日)$', '1',
            {'direct': _box(1100, 1637, -925, -840),
             'crop': _box(1300, 1637, -925, -840)},
        ],
        [   # buyer name
            '04購方', ['crop'], 'extract', '([\(\)（）\u4e00-\u9fa5]{8,30})', '1',
            {'crop': _box(-320, 600, -800, -680)},
        ],
        [   # buyer tax number
            '05購方稅號', ['direct'], 'extract', '([a-zA-Z0-9]{18})$', '1',
            {'direct': _box(-240, 540, -800, -680),
             'crop': _box(-320, 600, -800, -680)},
        ],
        [   # product name
            '06品名', ['crop'], 'contains',
            '^[\*冰水米\+]?(\S*[制品]\S*[\*冰水米\+]?\S+)$', 'n',
            {'crop': _box(-670, 640, -560, -100),
             # position limits inside the cropped picture
             'center_limit': [10, 500, 10, 450]},
        ],
        [   # unit (same crop region as the product name)
            '07單位', ['crop'], 'contains', '^\D{1,8}$', 'n',
            {'crop': _box(-670, 640, -560, -100),
             # position limits inside the cropped picture
             'center_limit': [820, 1100, 10, 450]},
        ],
        [   # quantity
            '08數量', ['crop'], 'contains', '^\d+$|^\d+\.\d+$', 'n',
            {'crop': _box(440, 640, -510, -100)},
        ],
        [   # unit price
            '09單價', ['crop'], 'contains', '^[\.:：]?\d+[\.:：]?\s*\d*\s*$', 'n',
            {'crop': _box(635, 890, -510, -100)},
        ],
        [   # line amount before tax
            '10稅前', ['crop'], 'contains',
            '^\s*[+-]?(?:\d+|\d{1,3}(?:,\d{3})*)[\.:：]\s*\d{2}\s*$', 'n',
            {'crop': _box(980, 1240, -510, -100)},
        ],
        [   # tax rate
            '11稅率', ['crop'], 'contains', '^\d{1,2}\s*%$', '1',
            {'crop': _box(1240, 1350, -510, -100)},
        ],
        [   # line tax amount
            '12稅額', ['crop'], 'contains',
            '^\s*[+-]?(?:\d+|\d{1,3}(?:,\d{3}))[\.:：]?\s*\d{0,2}\s*\D*', 'n',
            {'crop': _box(1380, 1700, -510, -100)},
        ],
        [   # total before tax ('s*' in the three total patterns was a typo for '\s*')
            '13合計稅前', ['known', 'crop'], 'contains',
            '[￥￥]?\s*[+-]?(?:\d+|\d{1,3}(?:,\d{3})*)[\.:：]\s*\d{2}\s*$', '1',
            {'crop': _box(880, 1235, -100, -10), 'known': known_dict},
        ],
        [   # total tax
            '14合計稅額', ['crop'], 'contains',
            '[￥￥]?\s*[+-]?(?:\d+|\d{1,3}(?:,\d{3})*)[\.:：]?\s*\d{0,2}\s*$', '1',
            {'crop': _box(1300, 1710, -110, 0)},
        ],
        [   # grand total in figures
            '15總額', ['crop'], 'contains',
            '[￥￥]?\s*[+-]?(?:\d+|\d{1,3}(?:,\d{3})*)[\.:：]\s*\d{2}\s*$', '1',
            {'crop': _box(1220, 1700, -20, 70)},
        ],
        [   # grand total in uppercase characters (already located above)
            '16大寫', ['known'], known_dict,
        ],
        [   # seller name
            '17銷方', ['crop'], 'extract',
            '([\(\)（）\u4e00-\u9fa5]{8,30}[辦|處|公|司|廠|社|部])$', '1',
            {'crop': _box(-280, 540, 60, 165)},
        ],
        [   # seller tax number
            '18銷方稅號', ['direct'], 'extract', '([a-zA-Z0-9]{18})$', '1',
            {'direct': _box(-260, 600, 100, 220),
             'crop': _box(-320, 600, 100, 220)},
        ],
    ]

    # Whole invoice picture, cropped per field by Crop_ocr.
    img_inv = cv_imread(file_path)
    set_h_adjust = 0  # vertical crop adjustment in pixels

    for cond_list in range_list:
        range_title = cond_list[0]
        loc_method = cond_list[1]
        result_series_orderdic[range_title] = pd.Series()  # initial empty value

        if 'known' in loc_method:
            if range_title in known_dict:
                result_series_orderdic[range_title] = known_dict[range_title]
                if len(result_series_orderdic[range_title]) > 0:
                    continue  # already resolved

        if 'crop' in loc_method:
            crop_folder_name = 'crop'
            crop_folder_path = os.path.join(temp_folder_path, crop_folder_name)
            if not os.path.exists(crop_folder_path):
                Create_clear_dir(crop_folder_path)
            result_series_orderdic, get_h_adjust = Crop_ocr(
                ocr_engines, result_series_orderdic, known_dict, img_inv,
                file_path, crop_folder_path, set_h_adjust, cond_list, enhance,
                engine_switch = engine_switch)
            if range_title == '01票號':
                # Only the adjustment found for the invoice number is reused
                # for the other regions, and only when it exceeds 5 pixels.
                if get_h_adjust > 5:
                    set_h_adjust = get_h_adjust
            if len(result_series_orderdic[range_title]) > 0:
                continue  # already resolved

        if 'direct' in loc_method:
            result_series_orderdic = Loc_jpg_content(df, cond_list, order_dict=result_series_orderdic)

    return result_series_orderdic, err_info
# Electronic invoice field locating
def Tele_inv_ocr(ocr_engines, result_series_orderdic, inv_dict, file_path, excel_file_path, err_info, engine_switch = 0):
    """Extract the fields of an electronic invoice from its converted Excel file.

    Every row of the first sheet is joined into one '|'-separated string
    (column 'content'); each field is then extracted by Loc_tele_content
    with the list of regular expressions configured below.

    Args:
        ocr_engines: sequence of engines; ocr_engines[engine_switch] is
            handed to Get_known_from_from_xls_image.
        result_series_orderdic: returned unchanged when the sheet does not
            contain '發票'; otherwise replaced by a new OrderedDict.
        inv_dict: returned unchanged.
        file_path: not used by this function.
        excel_file_path: Excel file produced from the PDF invoice.
        err_info: replaced by the message produced here.
        engine_switch: index into ocr_engines.

    Returns:
        (result_series_orderdic, err_info, inv_dict)
    """
    df_org = pd.read_excel(excel_file_path, sheet_name=0, header=None, index_col=None,
                           na_values='', keep_default_na=True, dtype=object)
    df_org = df_org.fillna('')
    df_org = df_org.astype(str)
    # Collapse line breaks and runs of whitespace, then trim leading blanks.
    df_org = df_org.replace(to_replace = '\\n|\s+', value=' ', regex=True)
    df_org = df_org.replace(to_replace = '^\s+', value='', regex=True)

    # Join all columns of each row into one '|'-separated string.
    df_new = pd.DataFrame(data='', index = df_org.index, columns=['content'])
    for i in df_org.columns:
        df_new['content'] = df_new['content'] + '|' + df_org[i]
    # Collapse repeated separators.
    df_new = df_new.replace(to_replace = '\|+', value='|', regex=True)
    # Strip separators at the start and the end.
    df_new = df_new.replace(to_replace = '^\||\|+$', value='', regex=True)

    fp_mark = False
    if len(df_new.loc[df_new['content'].str.contains('發票'), :]) > 0:
        fp_mark = True
    if fp_mark == False:
        # No '發票' text anywhere: not treated as an invoice.
        err_info = 'inv character not found.'
        return result_series_orderdic, err_info, inv_dict

    known_dict = Get_known_from_from_xls_image(excel_file_path, ocr_engines[engine_switch])

    # Item layout: 0 field title, 1 locating methods, 2 match mode,
    # 3 list of regular expressions, 4 extraction count limit.
    range_list = [
        [   # invoice number
            '01票號', ['direct'], 'extract',
            ['發票號碼[：|:]?\s*(\d+)'],
            '1',
        ],
        [   # invoice code
            '02代碼', ['direct'], 'extract',
            ['發票代碼[：|:]?\s*(\d+)'],
            '1',
        ],
        [   # issue date
            '03日期', ['direct'], 'extract',
            ['(\d{4}\s*年\s*\d{2}\s*月\s*\d{2}\s*日)'],
            '1',
        ],
        [   # buyer name
            '04購方', ['direct'], 'extract',
            ['^購買方信息\|名稱：(.+?) 統一社會信用代碼/納稅人識別號：',
             '名\s*稱：\s*(.+?)\s*納稅人識別號'],
            '1',
        ],
        [   # buyer tax number; '{18?}' was not a quantifier, so the first
            # pattern could never match - '{18}' mirrors the seller pattern.
            '05購方稅號', ['direct'], 'extract',
            ['購買[\D]+納稅人識別號：[\|\s]*([0-9A-Z]{18})',
             '納稅人識別號：([a-zA-Z0-9]{18})',
             ],
            '1',
        ],
        [   # product name
            # NOTE(review): the second pattern ends with an unescaped '|'
            # (an empty alternative); '\|' may have been intended - confirm.
            '06品名', ['direct'], 'extract',
            ['^項目名稱\s*(.+)合\s*計\|',
             '^項目名稱\s*(.+)合|',
             ],
            '1',
        ],
        [   # unit
            '07單位', ['direct'], 'extract',
            ['^([\u4e00-\u9fa5]+)[\|\s]*[.0-9]+[\|\s]*[.0-9]+[\|\s]*[.0-9]+[\|\s]*[.0-9]+[\|\s]*[.0-9]+$',
             '\|單\s*([\u4e00-\u9fa5]+)\|位\|',
             '\|?單\s*\|?\s*價\s*\|?\s*([\u4e00-\u9fa5]{1,3})\s*[.0-9]+',
             '\|?單[\s\|]*價[\|\s]*([\u4e00-\u9fa5]{1,3})\s*[.0-9]+',
             '\|?單[\s\|]*位[\|\s]*([\u4e00-\u9fa5]{1,3})[\|\s]*數[\|\s]*量[\|\s]*[.0-9]+[\|\s]*單[\|\s]*價[\|\s]*[.0-9]+',
             ],
            'n',
        ],
        [   # quantity
            '08數量', ['direct'], 'extract',
            ['^[\u4e00-\u9fa5]+[\|\s]*([.0-9]+)[\|\s]*[.0-9]+[\|\s]*[.0-9]+[\|\s]*[.0-9]+[\|\s]*[.0-9]+$',
             '量\s*([.0-9]+)\s*\|單',
             '\|?單[\s\|]*價[\|\s]*[\u4e00-\u9fa5]{1,3}\s*([.0-9]+)',
             '量[\s\|]*單[\s\|]*價[\|\s]*([.0-9]+)\s+[.0-9]+',
             '([.0-9]+)[\s\|]+[.0-9]+[\s\|]+[.0-9]+[\s\|]+[.0-9]+[\s\|]+[.0-9]+'],
            'n',
        ],
        [   # unit price
            '09單價', ['direct'], 'extract',
            ['^[\u4e00-\u9fa5]+[\|\s]*[.0-9]+[\|\s]*([.0-9]+)[\|\s]*[.0-9]+[\|\s]*[.0-9]+[\|\s]*[.0-9]+$',
             '價\s*([.0-9]+)\s*\|金',
             '\|?單[\s\|]*價[\|\s]*[\u4e00-\u9fa5]{1,3}\s*[.0-9]+[\|\s]+([.0-9]+)',
             '量[\s\|]*單[\s\|]*價[\|\s]*[.0-9]+\s+([.0-9]+)',
             '[.0-9]+[\s\|]+([.0-9]+)[\s\|]+[.0-9]+[\s\|]+[.0-9]+[\s\|]+[.0-9]+'],
            'n',
        ],
        [   # line amount before tax
            '10稅前', ['direct'], 'extract',
            ['^[\u4e00-\u9fa5]+[\|\s]*[.0-9]+[\|\s]*[.0-9]+[\|\s]*([.0-9]+)[\|\s]*[.0-9]+[\|\s]*[.0-9]+$',
             '[率|\|]\s*([.0-9]+)\s+[0-9]{1,2}%[\||稅]',
             '金\s*額\s*([.0-9]+)[\|\s]*稅率\s*[.0-9]+%[\|\s]*稅\s*額',
             '[.0-9]+[\s\|]+[.0-9]+[\s\|]+([.0-9]+)[\s\|]+[.0-9]+[\s\|]+[.0-9]+'],
            'n',
        ],
        [   # tax rate
            '11稅率', ['direct'], 'extract',
            ['^[\u4e00-\u9fa5]+[\|\s]*[.0-9]+[\|\s]*[.0-9]+[\|\s]*[.0-9]+[\|\s]*([.0-9]+)[\|\s]*[.0-9]+$',
             '[率|\|]\s*[.0-9]+\s+([0-9]{1,2}%)[\||稅]',
             '金\s*額\s*[.0-9]+[\|\s]*稅\s*率\s*([.0-9]+%)[\|\s]*稅\s*額',
             '[.0-9]+[\s\|]+[.0-9]+[\s\|]+[.0-9]+[\s\|]+([.0-9]+)[\s\|]+[.0-9]+'],
            '1',
        ],
        [   # line tax amount
            '12稅額', ['direct'], 'extract',
            ['^[\u4e00-\u9fa5]+[\|\s]*[.0-9]+[\|\s]*[.0-9]+[\|\s]*[.0-9]+[\|\s]*[.0-9]+[\|\s]*([.0-9]+)$',
             '稅\s*[\|]?\s*額\s*[\|]?\s*([.0-9]+)',
             '[.0-9]+[\s\|]+[.0-9]+[\s\|]+[.0-9]+[\s\|]+[.0-9]+[\s\|]+([.0-9]+)'],
            'n',
        ],
        [   # total before tax
            '13合計稅前', ['direct'], 'extract',
            ['[￥￥](-?\d+\.\d{0,2})[\|\s][￥￥]',
             '^([.0-9]+)\|[.0-9]+$'],
            '1',
        ],
        [   # total tax
            '14合計稅額', ['direct'], 'extract',
            ['[￥￥]-?\d+\.\d+[\|\s][￥￥](-?\d+\.\d+)',
             '^[.0-9]+\|([.0-9]+)$'],
            '1',
        ],
        [   # grand total in figures
            '15總額', ['direct'], 'extract',
            ['（小寫）[￥￥](.+)',
             '價稅合計[\|\s]*[零壹貳叁肆伍陸柒捌玖拾佰仟億角分圓整]{2,}[\|\s]*[￥￥]?([.0-9]+)$'],
            '1',
        ],
        [   # grand total in uppercase characters
            '16大寫', ['direct'], 'extract',
            ['^價稅合計（大寫）\|(.+)\|（小寫）',
             '價稅合計[\|\s]*([零壹貳叁肆伍陸柒捌玖拾佰仟億角分圓整]{2,})'],
            '1',
        ],
        [   # seller name
            '17銷方', ['direct'], 'extract',
            ['銷售方信息\|名稱：(.+?) 統一社會信用代碼',
             '銷售方\s*\|\s*名\s*稱：\s*([\u4e00-\u9fa5]+)\s*納稅人識別號'],
            '1',
        ],
        [   # seller tax number
            '18銷方稅號', ['direct'], 'extract',
            ['銷售[\D]+納稅人識別號：[\|\s]*([0-9A-Z]{18})',
             '納稅人識別號：([a-zA-Z0-9]{18})'],
            '-1',
        ],
    ]

    # The incoming result mapping is replaced by a fresh one (as before).
    result_series_orderdic = OrderedDict()
    for cond_list in range_list:
        result_series_orderdic, err_info = Loc_tele_content(df_new, known_dict, cond_list, result_series_orderdic)
    return result_series_orderdic, err_info, inv_dict

3.5 識別記錄相關函數


# 結果輸出
def Log_result_file(result_pandas, result_file_path, result_sheet_name):
    """Write the result table to one sheet of the result workbook.

    An existing workbook is opened in append mode and the sheet is replaced.
    When the workbook does not exist yet it is created instead (append mode
    would raise in that case).

    Args:
        result_pandas: DataFrame to write (written without its index).
        result_file_path: path of the xlsx workbook.
        result_sheet_name: name of the sheet to (re)write.

    Returns:
        True once the workbook has been written.
    """
    if os.path.exists(result_file_path):
        writer_options = {'mode': 'a', 'if_sheet_exists': 'replace'}
    else:
        # if_sheet_exists is only accepted in append mode.
        writer_options = {'mode': 'w'}
    # The context manager saves and closes the workbook even when
    # to_excel raises, so no file handle is left open.
    with pd.ExcelWriter(result_file_path, engine='openpyxl', **writer_options) as writer:
        result_pandas.to_excel(writer, sheet_name=result_sheet_name, index=False)
    return True
# Add hyperlinks to the result
def Add_hyperlink(result_file_path,result_sheet_name):
    """Style columns 19 and 20 of the result sheet and link column 19 to the invoice files.

    Non-empty cells of column 19 get a relative hyperlink ('..\\' + file
    name) and an italic font; non-empty cells of column 20 get a
    non-italic coloured font. The workbook is saved in place.

    Returns:
        True after the workbook has been saved.
    """
    workbook = load_workbook(result_file_path)
    workbook.move_sheet(result_sheet_name, offset=-1)
    sheet = workbook[result_sheet_name]
    workbook._active_sheet_index = 0  # activate the first worksheet

    columns = sheet.iter_cols(min_row=2, max_row=sheet.max_row, min_col=19, max_col=20)
    for column_cells in columns:
        for entry in column_cells:
            text = entry.value
            if text is None:
                continue
            if not len(text) > 0:
                continue
            if entry.column != 19:
                entry.font = Font(color=colors.Color(index = 2), italic=False)
                continue
            # Relative link: the invoice file sits one folder above the workbook.
            _, file_name, _, _ = pathsplit(text)
            entry.hyperlink = '..\\' + file_name
            entry.font = Font(color=colors.Color(index = 4), italic=True)

    workbook.save(result_file_path)
    sheet = None
    workbook = None
    return True
# Collect DataFrames
def Collect_df(collect_df, item_df):
    """Append item_df below collect_df and renumber the index from 1.

    When collect_df has column names that item_df lacks, both tables are
    copied, their columns are renumbered 0..n-1 and the narrower table is
    padded with empty-string columns, so the rows are stacked by position.
    The padding previously ran one column too far (range(..., max_col + 1))
    and added a spurious blank column.

    Args:
        collect_df: accumulated DataFrame (may be empty).
        item_df: DataFrame to append.

    Returns:
        collect_df unchanged when item_df has no rows; otherwise the combined
        table. When collect_df is empty, item_df itself is returned after its
        index has been renumbered in place.
    """
    if len(item_df) == 0:
        return collect_df

    if collect_df.empty:
        collect_df = item_df
    else:
        if len(set(collect_df.columns) - set(item_df.columns)) > 0:
            # Column names differ: stack by position instead of by name.
            collect_df = collect_df.copy()
            item_df = item_df.copy()
            collect_col_num = len(collect_df.columns)
            item_df_col_num = len(item_df.columns)
            collect_df.columns = list(range(collect_col_num))
            item_df.columns = list(range(item_df_col_num))
            max_col = max(collect_col_num, item_df_col_num)
            # Pad the narrower table up to exactly max_col columns.
            for i in range(collect_col_num, max_col):
                collect_df[i] = ''
            for i in range(item_df_col_num, max_col):
                item_df[i] = ''
        collect_df = pd.concat([collect_df, item_df], ignore_index = True, axis = 0)

    # Renumber the index starting from 1.
    collect_df = reset_nature_index(collect_df)
    return collect_df
# Write the collected DataFrame to an Excel sheet
def Log_df_to_file(df, save_path, sheet_name, keep_exists = True):
    """Write df to a sheet of an existing workbook, accumulating by default.

    Args:
        df: DataFrame to store. When keep_exists is True its columns are
            renumbered 0..n-1 in place before it is merged.
        save_path: existing xlsx workbook (opened in append mode).
        sheet_name: sheet to read from and replace.
        keep_exists: True (default) merges df below the rows already stored
            in the sheet via Collect_df; otherwise only df is written.

    Returns:
        True once the writer has been closed.
    """
    writer = pd.ExcelWriter(save_path, engine='openpyxl', mode='a', if_sheet_exists='replace')
    try:
        pandas_write = pd.DataFrame()
        if not df.empty:
            if keep_exists == True:
                # Accumulate: number the columns, then append below the stored rows.
                df.columns = list(range(len(df.columns)))
                pandas_write = pd.read_excel(save_path, sheet_name=sheet_name, index_col=0,
                                             header = 0, keep_default_na=True, dtype=object)
                pandas_write = Collect_df(pandas_write, df)
            else:
                # Do not accumulate: write df alone.
                pandas_write = df
        if not pandas_write.empty:
            pandas_write.to_excel(writer, sheet_name=sheet_name)
    finally:
        # Close the writer even when reading or merging fails, so the
        # workbook handle is not left open.
        writer.close()
    return True


def reset_nature_index(df):
    """Renumber the index of df as 1..len(df) in place and return df."""
    df.index = list(range(1, len(df) + 1))
    return df

3.6 識別結果校驗


# 校驗發票識別數據，檢查數量單價金額和稅額的邏輯關系是否正確，以及字段是否有缺失等并標記
def Check_result(result_pandas): #->Dataframe
    """Validate recognised invoice data and record problems in 'err_info'.

    The table is copied and converted to strings, numeric columns are
    cleaned, optional replacement rules are applied, and then each invoice
    (a run of rows starting where 'file_path' is non-empty) is checked for
    blank fields, buyer tax number, product-name format and the arithmetic
    relations between quantity, price, amounts and tax.

    NOTE(review): `user_code` is not defined in this function; it is read
    as a module-level name - confirm it is set before this is called.
    NOTE(review): row slicing uses index labels as positions (iloc), which
    presumably relies on a default 0..n-1 index - verify against callers.

    Args:
        result_pandas: DataFrame of recognition results.

    Returns:
        The input unchanged when it has no rows, otherwise the edited copy.
    """
    if len(result_pandas) == 0:
        return result_pandas
    # Convert the whole table to strings:
    edit_pandas = result_pandas.copy()
    edit_pandas = edit_pandas.fillna('')
    edit_pandas = edit_pandas.astype(str)
    temp_title_list = edit_pandas.columns.tolist()
    edit_pandas['err_info'] = ''  # clear the err_info column
    pandas_title_list = edit_pandas.columns.tolist()
    inv_title_list = pandas_title_list[0:-2]
    detail_title_list = ['06品名','07單位' ,'08數量','09單價','10稅前','12稅額']
    num_title_list = ['08數量','09單價','10稅前','11稅率','12稅額',\
        '13合計稅前','14合計稅額','15總額']
    one_row_title_list = ['01票號','02代碼','03日期','04購方','05購方稅號','13合計稅前','14合計稅額','15總額','16大寫','17銷方','18銷方稅號']
    one_row_title_list.sort()  # keep the title list in sorted order
    # Strip currency signs, percent signs, whitespace, brackets and Chinese
    # characters from the numeric text columns:
    edit_pandas.loc[:,num_title_list] = \
        edit_pandas.loc[:,num_title_list].replace(to_replace = '[￥￥%\s（）\(\)\u4e00-\u9fa5]',value='',regex=True)
    # A colon inside a number is treated as a decimal point.
    edit_pandas.loc[:,num_title_list] = \
        edit_pandas.loc[:,num_title_list].replace(to_replace = '[:：]',value='.',regex=True)
    edit_pandas.loc[:,'05購方稅號'] = \
        edit_pandas.loc[:,'05購方稅號'].replace(to_replace = '[:：]',value='',regex=True)
    # Replace punctuation characters in the product name:
    edit_pandas.loc[:,'06品名'] = \
        edit_pandas.loc[:,'06品名'].replace(to_replace = '^[米水冰]|[\+\*#]',value=' ',regex=True)
    edit_pandas.loc[:,'06品名'] = \
        edit_pandas.loc[:,'06品名'].replace(to_replace = '^\s',value='',regex=True)
    # Field correction: misrecognised company names:
    comp_dict = {
        'A有限公司' : 'AA有限公司',
    }
    edit_pandas = edit_pandas.replace({'17銷方':comp_dict})
    # Field correction driven by an optional replacement workbook:
    replace_file = 'D:\\pyscripts\\發票修正.xlsx'
    if os.path.exists(replace_file):
        replace_df = pd.read_excel(replace_file, sheet_name=0,header=0, keep_default_na=True, dtype=object) # read the sheet
        if not replace_df.empty:
            replace_df = replace_df.fillna('')
            edit_df_title_list = edit_pandas.columns.to_list()
            replace_df_title_list = replace_df.columns.to_list()
            for _, title in enumerate(replace_df_title_list):
                if title in edit_df_title_list:
                    if not replace_df.loc[replace_df[title]!='',:].empty:
                        # The workbook has non-empty values for this field.
                        replace_title = title + '修正'
                        if replace_title in replace_df_title_list:
                            # A matching correction column exists: apply every
                            # (original, replacement) pair as a regex replace.
                            for _, row in enumerate(replace_df[[title,replace_title]].iterrows()):
                                str_origin = row[1].values[0]
                                str_replace = row[1].values[1]
                                edit_pandas[title] = edit_pandas[title].replace(to_replace = str_origin, value=str_replace, regex=True)
    # Start and end rows of every invoice (an invoice starts where
    # file_path is non-empty and ends just before the next start):
    row_start_index = edit_pandas.loc[edit_pandas['file_path'].str.len()>0,'file_path'].index
    row_start_list = row_start_index.to_list()
    temp_index = row_start_index - 1
    temp_list = temp_index.to_list()
    row_end_list = temp_list[1:]
    row_pandas_last = edit_pandas.index[-1]
    row_end_list.append(row_pandas_last)
    rows_tuple = zip(row_start_list,row_end_list)
    for i, (row_start, row_end) in enumerate(rows_tuple):
        err_info = ''
        err_blank = ''
        err_code = ''
        err_product = ''
        err_num = ''
        this_inv_pandas = edit_pandas.iloc[row_start:row_end+1, :] # rows of this single invoice
        # Numeric checks
        num_extract_reg = '((?:\d+|\d{0,3}(?:,\d{3})*)\.?\d{0,})\s*$'
        # Keep only the trailing numeric part of every numeric column
        for _, num_title in enumerate(num_title_list):
            this_inv_pandas.loc[:,num_title] = this_inv_pandas.loc[:,num_title].str.extract(num_extract_reg)
        this_inv_pandas.loc[:,num_title_list]=this_inv_pandas.loc[:,num_title_list].replace('^$','0',regex=True)
        this_inv_pandas.loc[:,num_title_list] = this_inv_pandas.loc[:,num_title_list].astype(float)
        # 1. Convert a percentage tax rate to a fraction
        if this_inv_pandas.loc[:,'11稅率'].values[0] >1:
            this_inv_pandas.loc[:,'11稅率'] = this_inv_pandas.loc[:,'11稅率']/100
        # Sum of line amounts before tax, and the stated total before tax
        num_sum_pretax_amount = round(sum(this_inv_pandas['10稅前'].values),2)
        num_total_pretax_amount = this_inv_pandas['13合計稅前'].values[0]
        # Stated total tax, and sum of line tax amounts
        num_total_tax = this_inv_pandas['14合計稅額'].values[0]
        num_sum_detail_tax = round(sum(this_inv_pandas['12稅額'].values), 2)
        # Stated grand total, and total before tax plus total tax
        num_total_amount= this_inv_pandas['15總額'].values[0]
        sum_total = num_total_pretax_amount + num_total_tax
        # Look for blank cells:
        title_blank_list = []
        err_inv_list = []
        for _, title in enumerate(detail_title_list):
            cond1 = this_inv_pandas.loc[:, title] == ''
            cond2 = this_inv_pandas.loc[:, title] == 0
            cond = cond1 | cond2
            count_blank = len(this_inv_pandas.loc[cond,:])
            if count_blank > 0:
                # At least one blank value in this column
                title_blank_list.append(title)
            if title == '06品名':
                cond = this_inv_pandas.loc[:, title].str.contains('品[\u4e00-\u9fa5]')
                product_wrong_df = this_inv_pandas.loc[cond,'06品名']
                count_product_err = len(product_wrong_df)
                if count_product_err > 0:
                    err_product = err_product + 'Check product name:' + ','.join(product_wrong_df.to_list()) + '.'
                # err_blank is still '' at this point, so this test always passes.
                if '品名' not in err_blank:
                    if len(this_inv_pandas.loc[~this_inv_pandas['06品名'].str.contains('[\u4e00-\u9fa5]\s[\u4e00-\u9fa5]'),:]) > 0:
                        # Product name is not "Chinese text + space + Chinese text"
                        err_product = err_product + '品名格式不符“類品+空格+品名”.'
        for _, title in enumerate(one_row_title_list):
            # NOTE(review): '發票號碼' is not in one_row_title_list, so this
            # branch never runs as written; it also reads a '發票號' column
            # that is not created here - confirm the intended field name.
            if title == '發票號碼':
                temp_df = this_inv_pandas.loc[this_inv_pandas['file_path']!='', '發票號碼']
                temp_df['發票號長度'] = temp_df['發票號'].apply(lambda x:len(x))
                temp_check_df = temp_df.loc[~((temp_df['發票號長度']==8) |(temp_df['發票號長度']==20)),: ]
                if len(temp_check_df) > 0:
                    err_inv_list.append('Inv number lenth illegal')
                temp_check_df= temp_df.loc[temp_df['發票號'].str.contains('\D'), :]
                if len(temp_df) > 0:
                    err_inv_list.append('Inv number character illegal')
            cond1 = this_inv_pandas.loc[this_inv_pandas.index[0], title] == ''
            cond2 = this_inv_pandas.loc[this_inv_pandas.index[0], title] == 0
            cond = cond1 | cond2
            if cond == True:  # the field is blank
                if title == '02代碼':
                    if len(this_inv_pandas.loc[this_inv_pandas.index[0], '01票號']) == 20: # 20-digit number: electronic invoice, the code may be missing
                        continue
                if title == '15總額':
                    # Grand total is 0: derive it from the uppercase amount when present
                    txt = this_inv_pandas.loc[this_inv_pandas.index[0], '16大寫']
                    if not txt == '':
                        trad = txt.split('|')[0]
                        repl_dict = {
                            '參' : '叁',
                            '柴' : '柒',
                            '什' : '仟'
                        }
                        trad = repl_by_dict(trad, repl_dict)
                        money = trad_to_int(trad)
                        if not money == trad:
                            money = float(money)
                            if money > 0:
                                this_inv_pandas.loc[this_inv_pandas.index[0], title] = money
                                continue
                    else: # no uppercase amount: use total before tax + total tax when both are present
                        if num_total_pretax_amount >0 and num_total_tax > 0:
                            this_inv_pandas.loc[this_inv_pandas.index[0], title] = sum_total
                            continue
                if title == '16大寫':
                    continue
                title_blank_list.append(title)
        # Check the buyer tax number when it is not blank:
        if '05購方稅號' not in title_blank_list:
            if this_inv_pandas['05購方稅號'].values[0] != user_code:
                err_code = '購方稅號['+ this_inv_pandas['05購方稅號'].values[0] + ']不是“' + user_code + '”。'
        if len(title_blank_list) > 0:
            title_blank_list.sort()
            err_blank = 'Null:[' + ','.join(title_blank_list) + ']。' # record the blank fields
        diff_pretax_amount = round(num_total_pretax_amount - num_sum_pretax_amount, 2)
        if diff_pretax_amount != 0:
            err_num = err_num + '稅前之和≠合計稅前[' + str(num_total_pretax_amount) + ' - ' + str(num_sum_pretax_amount) + ' = ' + str(diff_pretax_amount) + ']。'
        # 2. Total before tax + total tax vs grand total; line tax sum vs total tax
        sum_total_pretax_tax = round(num_total_pretax_amount + num_total_tax, 2)
        diff_total = round(num_total_amount - sum_total_pretax_tax, 2)
        diff_tax = round(num_total_tax - num_sum_detail_tax, 2)
        if diff_total != 0:
            err_num = err_num + '稅前合計與稅額合計之和≠發票總額[' + str(sum_total_pretax_tax) + '≠' + str(num_total_amount) + ']。'
        if diff_tax != 0:
            err_num = err_num + '明細稅額之和≠14合計稅額:[' + str(num_sum_detail_tax) + ' ≠ ' + str(num_total_tax) +']。'
        # 3. quantity * unit price = amount before tax
        quantity_price_df = this_inv_pandas.loc[:,['06品名','08數量','09單價','10稅前']]
        quantity_price_df['diff_quantity_price'] = quantity_price_df['08數量'] * quantity_price_df['09單價'] - quantity_price_df['10稅前']
        round_quantity_price_df = quantity_price_df.loc[:,'diff_quantity_price'].astype(float).round(2)
        quantity_price_df['diff_quantity_price'] = round_quantity_price_df
        diff_quantity_price_df = quantity_price_df.loc[quantity_price_df['diff_quantity_price'] != 0,:]
        if not diff_quantity_price_df.empty:
            str_temp_quantity      = '，'.join(diff_quantity_price_df['08數量'].astype(str).tolist())
            str_temp_price         = '，'.join(diff_quantity_price_df['09單價'].astype(str).tolist())
            str_temp_pretax_amount = '，'.join(diff_quantity_price_df['10稅前'].astype(str).tolist())
            str_temp_diff          = '，'.join(diff_quantity_price_df['diff_quantity_price'].astype(str).tolist())
            err_num = err_num + '量*價≠稅前,差異明細：['+ str_temp_quantity + ']×['+ str_temp_price + ']-['+  str_temp_pretax_amount + ']=[' + str_temp_diff + ']。'
        err_inv = '票號格式錯誤['+','.join(err_inv_list) + ']。' if len(err_inv_list)>0 else ''
        err_info = err_inv + err_blank + err_code + err_product + err_num
        err_before = this_inv_pandas.loc[:,'err_info'].values[0]
        err_info = err_before + err_info
        this_inv_pandas.loc[this_inv_pandas.index[0],'err_info'] = err_info
        edit_pandas.iloc[row_start:row_end + 1,:] = this_inv_pandas # write this invoice's rows back
    result_pandas = edit_pandas # hand the edited copy back as the result
    return result_pandas

3.7 文件預處理等其他函數


# cv2 轉 pil
def cv2_pil(img_cv):
    """Convert an OpenCV BGR image array into a PIL image."""
    rgb_array = cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB)
    return Image.fromarray(rgb_array)
# Read an image with cv2
def cv_imread(file_path):
    """Load an image as a colour array by decoding the file's raw bytes.

    The bytes are read with numpy and decoded with cv2.imdecode instead of
    passing the path to cv2 directly.
    """
    raw_bytes = np.fromfile(file_path, dtype=np.uint8)
    return cv2.imdecode(raw_bytes, cv2.IMREAD_COLOR)
# Convert an uppercase (Chinese numeral) amount to a number
def trad_to_int(money):
    """Convert an uppercase Chinese amount into a numeric string.

    The amount is parsed in sections: digits combine with 拾/佰/仟 inside a
    section, 萬 closes a section worth ten thousands and 億 scales everything
    read so far. The previous version multiplied the whole running sum at
    萬 (wrong once an 億 part was present) and read a bare leading 拾 as 0.
    Fractions are accumulated as integer cents to avoid float artefacts.

    Args:
        money: text such as '壹佰貳拾叁圓整'.

    Returns:
        str of an int when no 角/分 occurs, str of a float otherwise;
        `money` unchanged when it contains none of the recognised characters.
    """
    digit_values = {'零': 0, '壹': 1, '貳': 2, '叁': 3, '肆': 4,
                    '伍': 5, '陸': 6, '柒': 7, '捌': 8, '玖': 9}
    place_values = {'拾': 10, '佰': 100, '仟': 1000}

    if re.search(r"[零壹貳叁肆伍陸柒捌玖拾佰仟億角分]+", money) is None:
        return money

    total = 0            # closed 萬/億 sections, in whole yuan
    section = 0          # value of the section being read (below 10000)
    pending = None       # last digit not yet attached to a unit
    cents = 0
    has_fraction = False
    for ch in money:
        if ch in digit_values:
            if pending is not None:
                # Two digits in a row: the earlier one counts as units.
                section += pending
            pending = digit_values[ch]
        elif ch in place_values:
            if pending is None:
                # A bare 拾 means 10; a bare 佰/仟 contributes nothing.
                pending = 1 if ch == '拾' else 0
            section += pending * place_values[ch]
            pending = None
        elif ch == '萬':
            total += (section + (pending or 0)) * 10000
            section, pending = 0, None
        elif ch == '億':
            total = (total + section + (pending or 0)) * 100000000
            section, pending = 0, None
        elif ch == '角' or ch == '分':
            cents += (pending or 0) * (10 if ch == '角' else 1)
            pending = None
            has_fraction = True
        # Any other character (圓, 整, ...) is ignored.

    yuan = total + section + (pending or 0)
    if has_fraction:
        return str((yuan * 100 + cents) / 100)
    return str(yuan)


def Fill_na_result(result_df):
    """Fill missing values of a previously saved result table, in place.

    The date column is normalised with delta_date, the tax rate is
    forward-filled, columns 7..14 get '0' and all other gaps get ''.
    NA values must be handled here before any later processing.
    """
    result_df.loc[:,'03日期'] = result_df.loc[:,'03日期'].apply(delta_date)
    # .ffill() replaces the deprecated fillna(method='ffill').
    result_df.loc[:,'11稅率'] = result_df.loc[:,'11稅率'].ffill()
    result_df.iloc[:,0:7] = result_df.iloc[:,0:7].fillna('')
    result_df.iloc[:,7:15] = result_df.iloc[:,7:15].fillna('0')
    result_df.iloc[:,15:] = result_df.iloc[:,15:].fillna('')
    result_df = result_df.fillna('')
    return result_df


def delta_date(para):
    """Turn a day serial counted from 1899-12-30 into 'YYYY-MM-DD' text.

    An int is converted to a date string, a float yields '', and any other
    value is returned unchanged.
    """
    if isinstance(para, int):
        stamp = pd.to_datetime('1899-12-30') + pd.Timedelta(days=int(para))
        return stamp.strftime("%Y-%m-%d")
    if isinstance(para, float):
        return ''
    return para
# Replace substrings using a dictionary
def repl_by_dict(my_str,repl_dict):
    """Apply every key -> value replacement of repl_dict to my_str, in dict order."""
    for old_text in repl_dict:
        new_text = repl_dict[old_text]
        my_str = my_str.replace(old_text, new_text)
    return my_str
# Split a path
def pathsplit(f) ->tuple:
    """Split a path into (parent folder, file name, name without extension, lower-cased extension)."""
    parent, fullname = os.path.split(f)
    frontname, _ = os.path.splitext(fullname)
    _, raw_ext = os.path.splitext(f)
    extname = str.lower(raw_ext)
    return (parent, fullname, frontname, extname)
# Create an empty directory
def Create_clear_dir(folder_path):
    """Make sure folder_path exists and contains no files.

    An existing folder has every file below it removed (sub-folders are
    kept). A missing folder is created together with any missing parent
    folders; the previous os.mkdir call failed when a parent was absent.

    Returns:
        True if folder_path exists afterwards, otherwise False.
    """
    if os.path.exists(folder_path):
        for dirpath, _dirnames, filenames in os.walk(folder_path):
            for filename in filenames:
                try:
                    os.remove(os.path.join(dirpath, filename))
                except FileNotFoundError:
                    # Already gone: nothing left to clear.
                    pass
    else:
        os.makedirs(folder_path)
    return os.path.exists(folder_path)
# Delete a folder or a file
def delFolderorFile(folder_path):
    """Delete a file, or a folder together with everything inside it.

    Returns False when the path does not exist; otherwise returns None
    after the deletion.
    """
    if not os.path.exists(folder_path):
        return False
    if os.path.isfile(folder_path):
        os.remove(folder_path)
        return
    for entry_name in os.listdir(folder_path):
        entry_path = os.path.join(folder_path, entry_name)
        if not os.path.isdir(entry_path):
            os.unlink(entry_path)
            continue
        # Sub-folders are emptied recursively before being removed.
        delFolderorFile(entry_path)
    os.rmdir(folder_path)  # the folder is empty now
# Angle (in degrees) of the line through two points
def cal_angle(p1, p2):
    """Return the direction from ``p1`` to ``p2`` in degrees.

    Args:
        p1: Start point as (x, y).
        p2: End point as (x, y).

    Returns:
        ``atan2`` of the coordinate differences, converted from radians to
        degrees.
    """
    delta_x = p2[0] - p1[0]
    delta_y = p2[1] - p1[1]
    radians = math.atan2(delta_y, delta_x)
    return radians * (180 / math.pi)

3.8 main主函數

#----------------------------------------------------------------#
#                    --- MAIN PROGRAM ---                        #
#----------------------------------------------------------------#
# Script entry point: asks for the invoice folder, optionally re-checks or
# continues a previous result workbook, runs the OCR engines over the folder
# and writes/validates the result workbook.
# The names below stay at module level on purpose: other functions of this
# file may read them as globals, so none of them is removed or made local.
if __name__ == '__main__':
    print('\n',datetime.now().strftime("%H:%M:%S"),'Program start running...\n')
    killexcel()  # defined elsewhere in this file
    user_name = ''
    user_code = ''
    # True (default): keep intermediate files of earlier runs.
    # False: clear them and start from scratch.
    reserve_template_before = True
    ocr_excel_out = True    # True: write the temporary excel files
    draw_result_out = True  # draw the recognition result
    enhance = False
    acumulate_input = 'y'   # incremental run is the default
    # Engine used for the fast / precise pass: 0-fast, 1-slow, 2-balanced
    prepare_engine = 1
    precise_engine = 1

    root = Tk()
    print('Please choose the images folder:')
    origin_folder_path = filedialog.askdirectory()
    # The dialog is finished; release the Tk root on every path, including quit.
    root.destroy()
    if len(origin_folder_path) > 0:
        origin_folder_path = origin_folder_path.replace('/','\\')
        print(datetime.now().strftime("%H:%M:%S"),'The images folder you chose：', origin_folder_path)
    else:
        print(datetime.now().strftime("%H:%M:%S"),'No file chosen. \nQuit.')
        raise SystemExit

    result_folder_name = 'result'  # result folder
    result_sheet_name ='result'    # result worksheet name
    result_folder_path = os.path.join(origin_folder_path, result_folder_name)
    if not os.path.exists(result_folder_path):
        Create_clear_dir(result_folder_path)
    result_file_name = 'result' + '.xlsx'
    result_file_path = os.path.join(result_folder_path, result_file_name)
    run_renew = True

    # Time-stamped backup name next to the result file.
    pr,nm,fr,ex = pathsplit(result_file_path)
    now = datetime.now()
    back_str = now.strftime("%Y%m%d_%H%M%S")
    back_file_name = fr + '_' + back_str + ex
    back_file_path = os.path.join(result_folder_path, back_file_name)

    origin_pandas = pd.DataFrame()
    t0 = datetime.now()
    if os.path.exists(result_file_path):
        print(datetime.now().strftime("%H:%M:%S"), f'Found previous result: {result_file_path} .')
        # Recognise, or only check the existing result. Recognising is the default.
        print('\nChoose please: \n"y" - run the orgnize engine.   "n" - only check the result, do not run engine.\n')
        ocr_input = input('Input(y/n):\n').strip().lower()
        origin_pandas = pd.DataFrame()
        if ocr_input == 'n':
            # Check-only mode: back up, re-check, re-link, then quit.
            shutil.copy(result_file_path, back_file_path)
            try:
                origin_pandas = pd.read_excel(result_file_path, sheet_name=result_sheet_name,header=0, keep_default_na=True, dtype=object)
            except ValueError:
                # Named sheet not available: fall back to the first sheet.
                origin_pandas = pd.read_excel(result_file_path, sheet_name=0,header=0, keep_default_na=True, dtype=object)
            result_pandas = Check_result(origin_pandas)
            Log_result_file(result_pandas,result_file_path,result_sheet_name)
            Add_hyperlink(result_file_path,result_sheet_name)
            print('\n')
            print(datetime.now().strftime("%H:%M:%S"), 'Done.《', result_file_path, '》checked over.')
            raise SystemExit

        # Anything other than an explicit "n" means "run the engine" (the
        # documented default); previously such input skipped backup and load.
        ocr_input = 'y'
        print('\nChoose run method: \n"y" - Run acumulated to the existed result.   \n"n" - Run fresh and delete all existed results and template folders.\n')
        acumulate_input = input('Input(y/n):\n').strip().lower()
        # Only an explicit "n" may wipe earlier results; any other answer
        # (empty line, typo) falls back to the incremental default.
        if acumulate_input != 'n':
            acumulate_input = 'y'

        if acumulate_input == 'y':
            # Incremental run: keep intermediate files, back up and load the result.
            reserve_template_before = True
            shutil.copy(result_file_path, back_file_path)
            try:
                origin_pandas = pd.read_excel(result_file_path, sheet_name=result_sheet_name,header=0, keep_default_na=True, dtype=object)
            except ValueError:
                origin_pandas = pd.read_excel(result_file_path, sheet_name=0,header=0, keep_default_na=True, dtype=object)
        else:
            # Fresh run: remove the result files and, as the prompt promises,
            # the intermediate files as well.
            reserve_template_before = False
            Create_clear_dir(result_folder_path)

    if not origin_pandas.empty:
        origin_pandas = Fill_na_result(origin_pandas)

    temp_folder_name = 'temp'  # intermediate working folder
    temp_folder_path = os.path.join(origin_folder_path, temp_folder_name)
    if not(reserve_template_before) or not(os.path.exists(temp_folder_path)):
        # Not keeping earlier intermediates, or the folder is missing:
        # create it / empty it.
        Create_clear_dir(temp_folder_path)

    print(datetime.now().strftime("%H:%M:%S"),'Start the engine...')
    # Engine list; the list index is the engine number.
    mobile_ocr = hub.Module(name="chinese_ocr_db_crnn_mobile")                # engine 0
    paddle_ocr = PaddleOCR(enable_mkldnn=True,use_angle_cls=True, lang='ch')  # engine 1
    ocr_engines = [mobile_ocr, paddle_ocr]
    print(datetime.now().strftime("%H:%M:%S"),'Engine start running...')

    result_pandas_orderdic = OrderedDict()
    duplicate_sheet_name = 'duplicate'
    duplicate_pandas = pd.DataFrame()
    try:
        duplicate_pandas = pd.read_excel(back_file_path, sheet_name=duplicate_sheet_name,header=0, keep_default_na=True, dtype=object)
    except Exception:
        # Best effort: no backup file or no duplicate sheet simply means
        # there are no known duplicates yet.
        pass
    if duplicate_pandas.empty:
        # No duplicate sheet yet: create one.
        # NOTE(review): this writes a workbook holding only the duplicate
        # sheet to result_file_path — confirm the result sheet is rewritten
        # later (Log_result_file / walk_folder_ocr) in incremental runs.
        duplicate_pandas.to_excel(result_file_path,sheet_name=duplicate_sheet_name,index=False)

    # Suffix compared case-insensitively, consistent with pathsplit().
    cnt_file = len({p.resolve() for p in Path(origin_folder_path).glob("*") if p.suffix.lower() in [".jpg", ".pdf"]})
    cnt_done_pre = 0
    cnt_duplicate_pre = 0
    # Incremental run: count what earlier runs already handled.
    if acumulate_input.lower() =='y':
        if not origin_pandas.empty:
            cnt_done_pre = len(origin_pandas.loc[origin_pandas['file_path'].notnull(),:])
        if not duplicate_pandas.empty:
            cnt_duplicate_pre = len(duplicate_pandas.loc[duplicate_pandas['file_path'].notnull(),:])
    inv_dict = {}  # invoice number -> [file names]
    walk_folder_args = {'ocr_engines':ocr_engines, 'temp_folder_path': temp_folder_path, 'engine_switch':prepare_engine}

    #------------------ core recognition call ---------------------
    result_pandas,duplicate_pandas = walk_folder_ocr(origin_pandas, duplicate_pandas, origin_folder_path,**walk_folder_args)
    #--------------------------------------------------------------

    print('\n')
    print(datetime.now().strftime("%H:%M:%S"),'Get the result.')
    cnt_done = 0
    cnt_duplicate = 0
    if not result_pandas.empty:
        cnt_done = len(result_pandas.loc[(result_pandas['file_path']!='') & (result_pandas['file_path'].notnull()),:]) - cnt_done_pre
    if not duplicate_pandas.empty:
        cnt_duplicate = len(duplicate_pandas.loc[(duplicate_pandas['file_path']!='') & (duplicate_pandas['file_path'].notnull()),:]) - cnt_duplicate_pre

    if not result_pandas.empty:
        print(datetime.now().strftime("%H:%M:%S"),'Checking result data...')
        # Final check of the whole result, then rewrite the result file.
        result_pandas = Check_result(result_pandas)
        Log_result_file(result_pandas,result_file_path,result_sheet_name)
        print(datetime.now().strftime("%H:%M:%S"),'Result data check over.')
        # Add hyperlinks to the result sheet.
        Add_hyperlink(result_file_path,result_sheet_name)

    # Drop the engine references.
    paddle_ocr = None
    server_ocr = None
    mobile_ocr = None
    ocr_engines = None

    print('\toriginal image path:  ' + origin_folder_path)
    print('\toutput file path:  ' + result_file_path)
    t1 = datetime.now()
    tx = t1 - t0
    v = 0
    try:
        v = round(tx.total_seconds() / (cnt_done + cnt_duplicate) , 2)
    except ZeroDivisionError:
        # Nothing was recognised in this run; keep the average at 0.
        pass
    print('\n')
    print(
        t1,
        '\n Done. Time spent: ',
        str(tx).split('.')[0],
        '. Files total: ' + str(cnt_file)
        + '. Already done before start: ' + str(cnt_done_pre)
        + '. Already find duplicate before start: ' + str(cnt_duplicate_pre)
        + '. \n Files recognized this time total: ' + str(cnt_done + cnt_duplicate)
        + ', valid: ' + str(cnt_done) + ', duplicate: ' + str(cnt_duplicate)
        + ', Everage: ' + str(v) + ' s.\n',
    )
    cnt_done_total = cnt_done_pre + cnt_done
    cnt_duplicate_total = cnt_duplicate_pre + cnt_duplicate
    # NOTE(review): this compares the valid count with the duplicate count,
    # which will differ in almost every run; possibly meant to compare
    # cnt_file with cnt_done_total + cnt_duplicate_total — confirm.
    if cnt_done_total != cnt_duplicate_total:
        print('Warning: 有效發票數:' + str(cnt_done_total) + ' 重復發票數：' + str(cnt_duplicate_total) + ', 檢查是否有發票號碼錯誤。')