Python 門戶網站文件爬取並顯示

廣西南寧政府門戶網站

import requests
import os
import io
import numpy as np
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup
import time
import pdfplumber
import pandas as pd
from docx import Document
import docx
import win32com.client as win32
import zipfile
import xlrd
# Pool of browser-like request headers; the fetch code picks one at random
# for each page request.
headers = [
    {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    },
    {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.76',
    },
]

# Placeholder text read from D:/t.txt; the crawler stores it in the text
# column when nothing could be extracted from a document.
temp = ''
with open('D:/t.txt', 'r', encoding='utf-8') as f:
    temp = f.read()

# Accumulated result rows; the script turns them into a DataFrame at the end.
contents = []
def get_pdf_content(url, title):
    """Download a PDF to ``D:/<title>`` and return the text of all its pages.

    Args:
        url: Address the PDF is downloaded from.
        title: File name (relative to ``D:/``) the download is stored under.

    Returns:
        The extracted text of every page joined with single spaces, or ``''``
        for the one hard-coded file that is never parsed.
    """
    path = f'D:/{title}'
    response = requests.get(url)
    with open(path, 'wb') as f:
        f.write(response.content)
    print(path)

    # This specific file is skipped after download (the reason is not
    # recorded in the script); the path is kept exactly as the author wrote it.
    if 'D:/308 南寧市衛生和計劃生育委員會2018年部門預算及“三公”經費預算/308南寧市衛生和計劃生育委員會及所屬單位2018年部門預算及“三公”經費預算公開.pdf' == path:
        return ''

    with pdfplumber.open(path) as pdf:
        # extract_text() may hand back None for a page without a text layer
        # (older pdfplumber releases do); substitute '' so join() cannot fail.
        texts = [page.extract_text() or '' for page in pdf.pages]
    return ' '.join(texts)
def doc_to_docx(title):
    r"""Open ``D:\<title>`` in Word and save a copy named ``D:\<title>x``.

    Args:
        title: File name of the document, relative to ``D:\``.

    The Word instance is always shut down, even when opening or saving fails,
    so a failed conversion does not leave a Word process behind.
    """
    word = win32.Dispatch("Word.Application")
    try:
        doc = word.Documents.Open('D:\\' + title)
        try:
            # NOTE(review): no FileFormat argument is passed, so whether the
            # copy is really converted depends on Word's defaults - confirm.
            doc.SaveAs('D:\\' + title + 'x')
        finally:
            doc.Close()
    finally:
        word.Quit()
# Downloaded Word files that are saved but never opened (hard-coded by the
# author; the reason for each exclusion is not recorded in the script).
_SKIPPED_DOC_PATHS = frozenset({
    'D:/市工信委2017年部門預算及“三公”經費公開.docx',
    'D:/2017年部門預算公開-政府辦公廳.docx',
    'D:/政務辦2017年部門預算及“三公”經費公開 (1).doc',
    'D:/南寧市茅橋地區人民檢察院2018年部門預算及“三公”經費預算.doc',
    'D:/017  南寧市審計局2018年部門預算及“三公”經費預算.doc',
    'D:/南寧市人大常委會辦公廳2018年部門預算及“三公”經費預算 (1).doc',
    'D:/2017年市民宗委預算公開 (1).doc',
    'D:/預算 名詞解釋.docx',
    'D:/南寧市編辦2017年部門預算公開名詞解釋.docx',
    'D:/南寧市編辦2016年決算收支增減變化情況說明.docx',
})


def get_doc_content(url, title):
    """Download a Word document to ``D:/<title>`` and return its text.

    Args:
        url: Address the document is downloaded from.
        title: File name (relative to ``D:/``) the download is stored under.

    Returns:
        The document text as reported by Word, or ``''`` when the saved path
        is one of the hard-coded exclusions.
    """
    path = f'D:/{title}'
    response = requests.get(url)
    with open(path, 'wb') as f:
        f.write(response.content)
    print(path)

    if path in _SKIPPED_DOC_PATHS:
        return ''

    app = win32.DispatchEx("Word.Application")
    try:
        doc = app.Documents.Open(path)
        return doc.Content.Text
    finally:
        # Always shut Word down so a document that fails to open does not
        # leave a background Word process running.
        app.Quit()
# Downloaded spreadsheets that are saved but never parsed (hard-coded by the
# author; the reason for each exclusion is not recorded in the script).
_SKIPPED_XLS_PATHS = frozenset({
    'D:/附件—市發展改革委2018年預算公開附件.xlsx',
    'D:/2017預算公開附件-市人大.xls',
    'D:/南寧市科協2020年部門預算公開附件.xlsx',
    'D:/宣傳部2017年預算公開附件0309.xlsx',
})

# pandas reader engine for each supported (lower-cased) file extension.
_EXCEL_ENGINES = {'xls': 'xlrd', 'xlsx': 'openpyxl'}


def get_xls_content(url, title):
    """Download a spreadsheet to ``D:/<title>`` and return it rendered as text.

    Args:
        url: Address the spreadsheet is downloaded from.
        title: File name (relative to ``D:/``) the download is stored under.

    Returns:
        The first sheet formatted with ``DataFrame.to_string(index=False)``,
        or ``''`` when the file cannot be reopened, is one of the hard-coded
        exclusions, or has an extension other than xls/xlsx.
    """
    path = f'D:/{title}'
    response = requests.get(url)
    with open(path, 'wb') as f:
        f.write(response.content)
    print(path)

    # Sanity check kept from the original: make sure the saved file can be
    # opened for reading before handing it to pandas.
    try:
        with open(path, 'rb'):
            pass
    except OSError:
        return ''

    if path in _SKIPPED_XLS_PATHS:
        return ''

    # Compare case-insensitively: solve_file routes 'XLS' here as well, and a
    # case-sensitive check silently discarded every upper-case extension.
    extension = title.split('.')[-1].lower()
    engine = _EXCEL_ENGINES.get(extension)
    if engine is None:
        return ''

    df = pd.read_excel(path, engine=engine)
    return df.to_string(index=False)
def solve_file(file_type, file_url, title):
    """Download one attachment and return its text, chosen by file type.

    Args:
        file_type: Extension of the attachment (``'pdf'``, ``'doc'``,
            ``'docx'``, ``'xls'``, ``'xlsx'`` or ``'XLS'``).
        file_url: Address the attachment is downloaded from.
        title: File name (relative to ``D:/``) the download is stored under.

    Returns:
        The text produced by the matching reader, or ``''`` when the type is
        not one of the supported extensions.
    """
    if file_type == 'pdf':
        return get_pdf_content(file_url, title)
    if file_type in ('doc', 'docx'):
        return get_doc_content(file_url, title)
    if file_type in ('xls', 'xlsx', 'XLS'):
        return get_xls_content(file_url, title)
    return ''
def unzip_file(save_file, unzip_dir_path):
    """Extract every member of a zip archive, repairing GBK-encoded names.

    Args:
        save_file: Path of the zip archive to open.
        unzip_dir_path: Directory the members are extracted into.

    Returns:
        The (repaired) member names, in archive order.
    """
    file_name = []
    with zipfile.ZipFile(save_file, 'r') as zip_file:
        for zip_info in zip_file.infolist():
            # Bit 11 (0x800) of the flags marks a name stored as UTF-8, which
            # zipfile has already decoded correctly. Only names without that
            # flag were decoded as cp437 and need re-decoding as GBK.
            if not zip_info.flag_bits & 0x800:
                try:
                    zip_info.filename = zip_info.filename.encode('cp437').decode('gbk')
                except (UnicodeEncodeError, UnicodeDecodeError):
                    # Not GBK after all: keep the name zipfile produced
                    # rather than aborting the whole extraction.
                    pass
            zip_file.extract(zip_info, unzip_dir_path)
            file_name.append(zip_info.filename)
    return file_name
def _fetch_soup(page_url):
    """Fetch *page_url* after a 10 s pause and return it parsed.

    A non-200 reply is retried once. Returns ``None`` when both attempts
    fail.
    """
    time.sleep(10)
    r = requests.get(page_url, headers=np.random.choice(headers))
    if r.status_code != 200:
        print('connect error!')
        r = requests.get(page_url, headers=np.random.choice(headers))
        if r.status_code != 200:
            return None
    r.encoding = 'utf-8'
    return BeautifulSoup(r.text, 'html.parser')


def _make_row(year, date, title, text, link):
    """Build one result row in the column order of the output spreadsheet."""
    return [year, date, '廣西', '南寧市', title, text, '預算公開', link]


def _collect_zip(file_url, title, year, date):
    """Download a zip attachment, extract it and record one row per member."""
    archive_path = f'D:/{title}'
    response = requests.get(file_url)
    with open(archive_path, 'wb') as f:
        f.write(response.content)
    print(archive_path)
    # These two archives are downloaded but deliberately not unpacked.
    if archive_path in ('D:/17年預算信息公開.zip', 'D:/衛計委2017年部門預算公開.zip'):
        return
    for name in unzip_file(archive_path, 'D:/'):
        path = 'D:/' + name
        zip_file_type = path.split('.')[-1]
        if zip_file_type in ('xls', 'xlsx', 'doc', 'XLS'):
            # Members of these types are not parsed; the placeholder is stored.
            t = temp
        else:
            # NOTE(review): solve_file re-downloads this fixed URL and writes
            # it over the extracted member at D:/<name> before parsing, so the
            # recorded text is not the member's own. Looks unintended; confirm
            # before relying on these rows.
            t = solve_file(zip_file_type, 'https://www.nanning.gov.cn/zwgk/fdzdgknr/czxx/sbjyjs/sbjbmys/2017bmys/P020171030399005252620.doc', name)
        content = _make_row(year, date, path.split('/')[-1], t, file_url)
        contents.append(content)
        print(content)


def get_file(url, base, year, date):
    """Record every attachment linked from one article page.

    Args:
        url: Address of the article page.
        base: Prefix prepended to each attachment's ``href``.
        year: Budget year stored in the result rows.
        date: Publication date stored in the result rows.

    Rows are appended to the module-level ``contents`` list. Pages that cannot
    be fetched, or that have no ``div.downfile`` section, add nothing.
    """
    soup = _fetch_soup(url)
    if soup is None:
        return
    downfile = soup.find('div', class_='downfile')
    if downfile is None:
        # No attachment section on this page; previously this raised
        # AttributeError and stopped the whole crawl.
        return
    for href in downfile.find_all('a'):
        file_url = base + href['href']
        file_type = file_url.split('.')[-1]  # file extension of the attachment
        print(file_type)
        title = href.text
        if '.' not in title:
            title += '.' + file_type
        t = solve_file(file_type, file_url, title)
        if t != '':
            content = _make_row(year, date, title, t, file_url)
            contents.append(content)
            print(content)
        elif file_type == 'zip':
            _collect_zip(file_url, title, year, date)
        # Any other type with no extracted text (e.g. 'rar') is ignored.


def get_url(urls, year):
    """Walk one listing page of a budget year and process each entry.

    Args:
        urls: Address of the listing page.
        year: Budget year stored in the result rows.

    Entry links are resolved against the module-level ``base_urls`` that the
    driver loop sets for the current year. Entries that link straight to a PDF
    are recorded directly; all others are handed to ``get_file``.
    """
    # The original retry requested `url` instead of `urls`, which raised
    # UnboundLocalError; the shared helper retries the right address.
    soup = _fetch_soup(urls)
    if soup is None:
        return
    listing = soup.find('div', class_='nav1Cont')
    if listing is None:
        # Listing container missing: nothing to process on this page.
        return
    for item in listing.find_all('li'):
        date = item.find('span', class_='time').text
        link = item.find('a')
        base = link['href']
        url = base_urls + base
        title = link.text.strip()
        net_type = url.split('.')[-1]
        if net_type == 'pdf':
            t = get_pdf_content(url, title + '.pdf')
            if t.strip() == '':
                # Nothing extractable: store the placeholder text instead.
                t = temp
            content = _make_row(year, date, title, t, url)
            contents.append(content)
            print(content)
            continue
        get_file(url, base_urls, year, date)


# Driver: crawl the first listing page of each year, then the numbered
# follow-up pages until one of them does not answer with HTTP 200.
for year in range(2018, 2023):
    base_urls = f'https://www.nanning.gov.cn/zwgk/fdzdgknr/czxx/sbjyjs/sbjbmys/{year}bmys/'
    get_url(base_urls, year)
    for i in range(1, 7):
        url = f'https://www.nanning.gov.cn/zwgk/fdzdgknr/czxx/sbjyjs/sbjbmys/{year}bmys/index_{i}.html'
        r = requests.get(url)
        if r.status_code != 200:
            break
        get_url(url, year)

df = pd.DataFrame(contents, columns=['年份', '發布日期', '省份', '城市', '標題', '文本', '類型', '下載鏈接'])
df.to_excel('D:/廣西-南寧-部門預算2017.xlsx', index=False)

爬取網頁附件，根據文件類型分類處理，顯示文件內容並製成表格。

本文來自互聯網用戶投稿，該文觀點僅代表作者本人，不代表本站立場。本站僅提供信息存儲空間服務，不擁有所有權，不承擔相關法律責任。
如若轉載，請注明出處：http://www.pswp.cn/news/209728.shtml
繁體地址，請注明出處：http://hk.pswp.cn/news/209728.shtml
英文地址，請注明出處：http://en.pswp.cn/news/209728.shtml

如若內容造成侵權/違法違規/事實不符，請聯系多彩編程網進行投訴反饋email:809451989@qq.com，一經查實，立即刪除！