廣西南寧政府門戶網站
"""Scrape departmental budget disclosures from the Nanning (Guangxi) portal.

For every yearly listing page the script follows each article, downloads the
attachments it links to (PDF / Word / Excel / ZIP), extracts their text and
finally writes one row per attachment to an Excel workbook.

The script is Windows-only: it writes to ``D:/`` and drives Microsoft Word
through COM to read ``.doc`` / ``.docx`` files.
"""
import io
import os
import time
import zipfile
from concurrent.futures import ThreadPoolExecutor

import docx
import numpy as np
import pandas as pd
import pdfplumber
import requests
import win32com.client as win32
import xlrd
from bs4 import BeautifulSoup
from docx import Document

# User-Agent headers; one is picked at random for every HTML page request.
headers = [
    {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'},
    {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.76'},
]

# Placeholder text recorded when an attachment yields no usable text
# (see get_url and the zip handling in get_file).
temp = ''
with open('D:/t.txt', 'r', encoding='utf-8') as f:
    temp = f.read()

# Accumulated output rows; each row matches the column list used in main().
contents = []

# Files that are downloaded but deliberately not parsed.
# NOTE(review): presumably these crashed the respective parser - confirm.
_SKIP_PDF_PATHS = frozenset({
    'D:/308 南寧市衛生和計劃生育委員會2018年部門預算及“三公”經費預算/308南寧市衛生和計劃生育委員會及所屬單位2018年部門預算及“三公”經費預算公開.pdf',
})
_SKIP_WORD_PATHS = frozenset({
    'D:/市工信委2017年部門預算及“三公”經費公開.docx',
    'D:/2017年部門預算公開-政府辦公廳.docx',
    'D:/政務辦2017年部門預算及“三公”經費公開 (1).doc',
    'D:/南寧市茅橋地區人民檢察院2018年部門預算及“三公”經費預算.doc',
    'D:/017 南寧市審計局2018年部門預算及“三公”經費預算.doc',
    'D:/南寧市人大常委會辦公廳2018年部門預算及“三公”經費預算 (1).doc',
    'D:/2017年市民宗委預算公開 (1).doc',
    'D:/預算 名詞解釋.docx',
    'D:/南寧市編辦2017年部門預算公開名詞解釋.docx',
    'D:/南寧市編辦2016年決算收支增減變化情況說明.docx',
})
_SKIP_EXCEL_PATHS = frozenset({
    'D:/附件—市發展改革委2018年預算公開附件.xlsx',
    'D:/2017預算公開附件-市人大.xls',
    'D:/南寧市科協2020年部門預算公開附件.xlsx',
    'D:/宣傳部2017年預算公開附件0309.xlsx',
})
_SKIP_ZIP_PATHS = frozenset({
    'D:/17年預算信息公開.zip',
    'D:/衛計委2017年部門預算公開.zip',
})

# Archive members of these types are not parsed; the placeholder text is
# recorded for them instead (behaviour kept from the original script).
_ZIP_PLACEHOLDER_TYPES = ('xls', 'xlsx', 'doc', 'XLS')


def _download(url, title):
    """Download *url* to ``D:/<title>``, print the path and return it."""
    response = requests.get(url)
    path = f'D:/{title}'
    with open(path, 'wb') as out:
        out.write(response.content)
    print(path)
    return path


def _fetch_html(url):
    """Return the UTF-8 decoded body of *url*, or ``None`` after two failures.

    Sleeps 10 seconds before the first request, then tries at most twice
    with a randomly chosen User-Agent.
    """
    time.sleep(10)
    for attempt in range(2):
        # Connect to the URL with requests.get.
        r = requests.get(url, headers=np.random.choice(headers))
        if r.status_code == 200:
            r.encoding = 'utf-8'
            return r.text
        if attempt == 0:
            print('connect error!')
    return None


def _make_row(year, date, title, text, link):
    """Build one output row in the column order used by main()."""
    return [year, date, '廣西', '南寧市', title, text, '預算公開', link]


def _read_pdf_text(path):
    """Return the text of every page of the PDF at *path*, space-joined."""
    if path in _SKIP_PDF_PATHS:
        return ''
    with pdfplumber.open(path) as pdf:
        # extract_text() may return None for a page without a text layer;
        # treat that as empty text instead of letting str.join raise.
        return ' '.join(page.extract_text() or '' for page in pdf.pages)


def _read_word_text(path):
    """Return the text of the Word document at *path* via Word COM."""
    if path in _SKIP_WORD_PATHS:
        return ''
    app = win32.DispatchEx("Word.Application")
    try:
        doc = app.Documents.Open(path)
        return doc.Content.Text
    finally:
        # Always shut Word down, otherwise a failed open leaves a WINWORD
        # process running for every bad file.
        app.Quit()


def _read_excel_text(path):
    """Return the workbook at *path* rendered with ``DataFrame.to_string``.

    Returns ``''`` when the file cannot be opened, is on the skip list, or
    does not have an ``xls`` / ``xlsx`` extension (case-insensitive).
    """
    try:
        # Try to open the file.
        with open(path, 'rb'):
            pass
    except OSError:
        return ''
    if path in _SKIP_EXCEL_PATHS:
        return ''
    # Lower-cased so that '.XLS' files (which solve_file routes here) are
    # actually read instead of falling through to ''.
    extension = path.split('.')[-1].lower()
    if extension == 'xls':
        engine = 'xlrd'
    elif extension == 'xlsx':
        engine = 'openpyxl'
    else:
        return ''
    return pd.read_excel(path, engine=engine).to_string(index=False)


def _extract_text(file_type, path):
    """Extract text from the local file *path* according to *file_type*."""
    if file_type == 'pdf':
        return _read_pdf_text(path)
    if file_type in ('doc', 'docx'):
        return _read_word_text(path)
    if file_type in ('xls', 'xlsx', 'XLS'):
        return _read_excel_text(path)
    return ''


def get_pdf_content(url, title):
    """Download the PDF at *url* to ``D:/<title>`` and return its text."""
    return _read_pdf_text(_download(url, title))


def doc_to_docx(title):
    """Re-save ``D:\\<title>`` as ``D:\\<title>x`` using Word.

    NOTE(review): SaveAs is called without a FileFormat argument, so the
    on-disk format may stay .doc despite the new extension - confirm.
    """
    word = win32.Dispatch("Word.Application")
    try:
        doc = word.Documents.Open('D:\\' + title)
        doc.SaveAs('D:\\' + title + 'x')
        doc.Close()
    finally:
        word.Quit()


def get_doc_content(url, title):
    """Download the Word file at *url* to ``D:/<title>`` and return its text."""
    return _read_word_text(_download(url, title))


def get_xls_content(url, title):
    """Download the Excel file at *url* to ``D:/<title>`` and return its text."""
    return _read_excel_text(_download(url, title))


def solve_file(file_type, file_url, title):
    """Download *file_url* as ``D:/<title>`` and return its extracted text.

    Returns ``''`` for file types that are not PDF, Word or Excel.
    """
    if file_type == 'pdf':
        return get_pdf_content(file_url, title)
    if file_type in ('doc', 'docx'):
        return get_doc_content(file_url, title)
    if file_type in ('xls', 'xlsx', 'XLS'):
        return get_xls_content(file_url, title)
    return ''


def unzip_file(save_file, unzip_dir_path):
    """Extract *save_file* into *unzip_dir_path* and return the member names.

    Member names are returned after repairing their encoding, in archive
    order (directory entries included).
    """
    file_name = []
    # Open the archive.
    with zipfile.ZipFile(save_file, 'r') as zip_file:
        # Walk the list of all members.
        for zip_info in zip_file.infolist():
            # Names without the UTF-8 flag (bit 11) were decoded by zipfile
            # as cp437; these archives actually store them as GBK, so undo
            # the cp437 decoding and decode as GBK.  Flagged names are
            # already correct and re-encoding them would raise.
            if not zip_info.flag_bits & 0x800:
                try:
                    zip_info.filename = (
                        zip_info.filename.encode('cp437').decode('gbk')
                    )
                except (UnicodeEncodeError, UnicodeDecodeError):
                    pass  # not GBK after all: keep the name zipfile produced
            # Extract the member.
            zip_file.extract(zip_info, unzip_dir_path)
            file_name.append(zip_info.filename)
    return file_name


def _collect_zip_rows(file_url, title, year, date):
    """Download the zip at *file_url*, unpack it and record one row per file."""
    archive_path = _download(file_url, title)
    if archive_path in _SKIP_ZIP_PATHS:
        return
    for name in unzip_file(archive_path, 'D:/'):
        if name.endswith('/'):
            continue  # directory entry, nothing to parse
        path = 'D:/' + name
        member_type = path.split('.')[-1]
        if member_type in _ZIP_PLACEHOLDER_TYPES:
            text = temp
        else:
            # Parse the file that was just extracted.  The previous code
            # re-downloaded a fixed, unrelated .doc URL over it, which broke
            # PDF members and gave every .docx member the same wrong text.
            text = _extract_text(member_type, path)
        row = _make_row(year, date, path.split('/')[-1], text, file_url)
        contents.append(row)
        print(row)


def get_file(url, base, year, date):
    """Process every attachment linked from the article page *url*.

    Args:
        url: Article page to fetch.
        base: Prefix prepended to each attachment's ``href``.
        year: Value stored in the row's year column.
        date: Value stored in the row's publication-date column.

    Appends rows to the module-level ``contents`` list; returns ``None``.
    """
    html = _fetch_html(url)
    if html is None:
        return
    soup = BeautifulSoup(html, 'html.parser')
    attachments = soup.find('div', class_='downfile')
    if attachments is None:
        return  # article without an attachment box
    for link in attachments.find_all('a', href=True):
        file_url = base + link['href']
        file_type = file_url.split('.')[-1]  # file type from the extension
        print(file_type)
        title = link.text
        if '.' not in title:
            title += '.' + file_type
        text = solve_file(file_type, file_url, title)
        if text != '':
            row = _make_row(year, date, title, text, file_url)
            contents.append(row)
            print(row)
        elif file_type == 'zip':
            _collect_zip_rows(file_url, title, year, date)
        # Anything else (including 'rar') is ignored.


def get_url(urls, year):
    """Process every article listed on the yearly listing page *urls*.

    Args:
        urls: Listing page URL (the year folder or one of its index pages).
        year: Value stored in the row's year column.

    Entries that link straight to a PDF are recorded directly; all other
    entries are handed to get_file.
    """
    html = _fetch_html(urls)
    if html is None:
        return
    soup = BeautifulSoup(html, 'html.parser')
    listing = soup.find('div', class_='nav1Cont')
    if listing is None:
        return
    # Folder containing the listing page; hrefs are appended to it.  This is
    # the same value the script previously read from a global set by the
    # main loop, for both '<folder>/' and '<folder>/index_N.html'.
    listing_base = urls.rsplit('/', 1)[0] + '/'
    for item in listing.find_all('li'):
        link = item.find('a')
        if link is None or not link.get('href'):
            continue
        stamp = item.find('span', class_='time')
        date = stamp.text if stamp is not None else ''
        url = listing_base + link['href']
        title = link.text.strip()
        if url.split('.')[-1] == 'pdf':
            text = get_pdf_content(url, title + '.pdf')
            if text.strip() == '':
                text = temp
            row = _make_row(year, date, title, text, url)
            contents.append(row)
            print(row)
            continue
        get_file(url, listing_base, year, date)


def main():
    """Crawl the 2018-2022 listings and write the collected rows to Excel."""
    for year in range(2018, 2023):
        base_urls = f'https://www.nanning.gov.cn/zwgk/fdzdgknr/czxx/sbjyjs/sbjbmys/{year}bmys/'
        get_url(base_urls, year)
        for i in range(1, 7):
            url = f'https://www.nanning.gov.cn/zwgk/fdzdgknr/czxx/sbjyjs/sbjbmys/{year}bmys/index_{i}.html'
            # Probe first: stop paging at the first index page that is missing.
            r = requests.get(url)
            if r.status_code != 200:
                break
            get_url(url, year)
    df = pd.DataFrame(
        contents,
        columns=['年份', '發布日期', '省份', '城市', '標題', '文本', '類型', '下載鏈接'],
    )
    df.to_excel('D:/廣西-南寧-部門預算2017.xlsx', index=False)


if __name__ == '__main__':
    main()
爬取網頁附件,根據文件類型分類處理,顯示文件內容並製成表格?