1.準備:
1)Python開發環境, 筆者用的是3.7; 工具用的是Pycharm
2)在百度雲後臺創建文字識別的應用, 獲取 AppID、API Key、Secret Key
百度云后臺創建文字識別的應用
3) 百度模塊
pip install baidu-aip
安裝百度模塊
4) 要保存成csv需要用到pandas模塊
pip install pandas
安裝pandas成功
2.上路:
1)初始化百度客戶端, 用來發送圖片信息
初始化百度客戶端
2)調用通用文字接口
調用通用文字接口并返回數據
這邊我們斷點查看一下返回來的數據:
返回結果
3)保存成CSV
這里不顯示返回數據的進一步處理過程,我把處理后的數據保存到全局變量Company_Data中。具體數據處理過程可以參考本文源碼或者在《Python使用騰訊Ocr識別文字》中的方法.下面把Company_Data中的數據保存成CSV.
保存成CSV
3.結果:
讀取圖片內容
讀取6張圖片
輸出結果:
輸出識別結果
4.完整代碼:
# -*- coding: utf-8 -*-
# Batch OCR script: sends images to Baidu's general text-recognition API and
# collects the recognised fields so they can be written to a CSV file.
from aip import AipOcr
import os
import pandas

# Credentials of the Baidu Cloud OCR application (AppID, API key, secret key).
# NOTE(review): these secrets are hard-coded; consider loading them from the
# environment or a private config file before sharing this script.
APP_ID = "16921559"
API_KEY = "HfpMM13vAnDlTRWabQVDKnk8"
SECRET_KEY = "EQpdKCeICwfHLWazx0vsIpRqoRkVX6pM"
# File extensions that saveData accepts as images.
IMG_EXT = ['.png', '.jpg', '.jpeg', '.bmp']
# Accumulates one dict of recognised fields per processed image.
Company_Data = []
# Shared client used for every OCR request.
client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
#1 get text content
def imageReader(file_path):
    """Run the general-text OCR request on one image and return its lines.

    Args:
        file_path: Path of the image file to recognise.

    Returns:
        A list holding the ``words`` string of every entry in the response's
        ``words_result``. An empty list is returned (and the raw response is
        printed) when the response has no ``words_result`` key.
    """
    with open(file_path, 'rb') as f:
        content = f.read()
    # general text API
    api_result = client.basicGeneral(content)
    if 'words_result' not in api_result:
        # A response without 'words_result' (e.g. an API error payload) would
        # otherwise raise a bare KeyError and abort the whole batch.
        print(" OCR failed for %s: %s" % (file_path, api_result))
        return []
    # text content
    return [item['words'] for item in api_result['words_result']]
# Field labels searched for in the part of a line before the colon, in match
# priority order (the first label found wins, as in an if/elif chain).
_FIELD_LABELS = ('展位號', '地址', '郵編', '電話', '傳真', '聯系人', '職務',
                 '電郵', '網址', '業務性質', '產品類型')
# Phone and fax values get a leading '\t' so that Excel does not try to
# evaluate them as numbers/formulas when it opens the CSV.
_TAB_PREFIXED = ('電話', '傳真')


def _parse_fields(lines):
    """Build the field dict for one image from its recognised text lines."""
    data = {}
    for text in lines:
        if ':' not in text:
            continue
        # Split on the first colon only: values such as URLs contain colons
        # themselves, which made an unbounded split fail to unpack.
        itemname, value = text.split(':', 1)
        for label in _FIELD_LABELS:
            if label in itemname:
                data[label] = '\t' + value if label in _TAB_PREFIXED else value
                break
    return data


#2 write the content into file
def saveData(file_path):
    """Recognise one image and append its parsed fields to Company_Data.

    Files whose extension (compared case-insensitively) is not listed in
    IMG_EXT are ignored. For an accepted file the text lines returned by
    imageReader are parsed into a ``{label: value}`` dict, which is printed
    and appended to the module-level Company_Data list.

    Args:
        file_path: Path of the candidate image file.
    """
    # separate the file name and extend type
    ext = os.path.splitext(file_path)[1]
    if ext.lower() not in IMG_EXT:
        return
    print(" reading the following image %s" % file_path)
    data = _parse_fields(imageReader(file_path))
    print(" Saving the data of %s" % file_path)
    print(data)
    Company_Data.append(data)
#3 get all .jpg under the path
def each_path(dir_path):
    """Call saveData on every ``.jpg`` file directly inside *dir_path*.

    The extension is compared exactly (case-insensitively) instead of with a
    substring test, so names such as ``photo.jpg.bak`` are skipped. Entries
    are visited in sorted order so the resulting rows are deterministic.

    Args:
        dir_path: Directory whose entries are scanned (not recursive).
    """
    # get file names of current directory
    for name in sorted(os.listdir(dir_path)):
        if os.path.splitext(name)[1].lower() == '.jpg':
            # os.path.join picks the right separator for the platform.
            saveData(os.path.join(dir_path, name))
def save2csv():
    """Append the rows collected in Company_Data to ``test1.csv``.

    The rows are loaded into an object-dtype DataFrame and written with every
    field quoted. The file is opened in append mode, so repeated runs add to
    the existing file. Unicode encoding/decoding failures are reported and
    the write is skipped.
    """
    file_name = 'test1.csv'
    save = pandas.DataFrame(data=Company_Data, dtype='object')
    try:
        # quoting=1 is csv.QUOTE_ALL. The former float_format='str' was not a
        # valid format string and had no effect on object-dtype data.
        save.to_csv(file_name, quoting=1, mode='a+')
    except UnicodeError:
        # UnicodeError covers both encode and decode failures; writing can
        # only raise the encode variant, which was previously not caught.
        print("編碼錯誤,該數據無法寫到文件中,直接忽略該數據")
if __name__ == '__main__':
    import sys

    # Folder of images to recognise: the first command-line argument when one
    # is given, otherwise the original hard-coded location.
    default_dir = r"F:\我的堅果云\其他\Study\Python\Demo\BaiduOcr\picture"
    file_path = sys.argv[1] if len(sys.argv) > 1 else default_dir
    each_path(file_path)
    save2csv()