目錄
1.介紹
2.代碼
1.main.py
2.PageSpider.py
3.DetailSpider.py
4.DataParse.py
5.Constant.py
6.HanderRequest.py
1.介紹
1. 使用多線程爬取網站
2.爬取數據后保存至excel
3.爬取網站(僅做測試)網創類項目爬取:https://www.maomp.com/
4.實現效果
2.代碼
1.main.py
# coding:utf-8
import threading
import requests
from queue import Queue
from PageSpider import PageSpider
from DetailSpider import DetailSpider
from DataParse import DataParse
import xlsxwriter
import time

"""
Crawl https://www.maomp.com/wzjc/ with a three-stage thread pipeline
(listing pages -> detail pages -> parsed rows) and save the result to
an Excel workbook.
"""


def start_page(threadsize, page_queue, detail_queue):
    """Start the listing-page crawler threads and block until they finish.

    :param threadsize: number of PageSpider threads to start
    :param page_queue: queue of listing-page URLs to fetch
    :param detail_queue: queue the workers fill with detail-page URLs
    """
    page_spider_list = []
    for i in range(1, threadsize + 1):
        pageSpiderThread = PageSpider(
            thread_name="頁面采集線程" + str(i),
            page_queue=page_queue,
            detail_queue=detail_queue)
        pageSpiderThread.start()
        page_spider_list.append(pageSpiderThread)
    # NOTE(review): the original `while not page_queue: pass` was a no-op —
    # a Queue instance is always truthy — so it was removed. join() below
    # already blocks until each worker drains the queue and exits.
    for page_spider in page_spider_list:
        if page_spider.is_alive():
            page_spider.join()


def start_detail(threadsize, detail_queue, data_queue):
    """Start the detail-page crawler threads and block until they finish.

    :param threadsize: number of DetailSpider threads to start
    :param detail_queue: queue of detail-page URLs to fetch
    :param data_queue: queue the workers fill with raw page payloads
    """
    detail_spider_list = []
    for i in range(1, threadsize + 1):
        detailSpiderThread = DetailSpider(
            thread_name="詳情頁采集線程" + str(i),
            detail_queue=detail_queue,
            data_queue=data_queue)
        detailSpiderThread.start()
        detail_spider_list.append(detailSpiderThread)
    # join() is sufficient synchronization; see note in start_page().
    for detail_spider in detail_spider_list:
        if detail_spider.is_alive():
            detail_spider.join()


def start_data_parse(threadsize, data_queue, book):
    """Start the parser threads that write rows into the workbook.

    :param threadsize: number of DataParse threads to start
    :param data_queue: queue of {"url", "html_content"} dicts to parse
    :param book: open xlsxwriter.Workbook to receive the data
    """
    # One shared lock serializes worksheet writes and the shared row counter.
    lock = threading.Lock()
    sheet1 = book.add_worksheet("sheet1")
    title_data = ("網址", "標題", "發布時間", "內容")
    # Write the header row (row 0); data rows start at row 1.
    for index, title_datum in enumerate(title_data):
        sheet1.write(0, index, title_datum)
    spider_list = []
    for i in range(1, threadsize + 1):
        thread = DataParse(
            thread_name="數據解析線程" + str(i),
            data_queue=data_queue,
            lock=lock,
            sheet=sheet1)
        thread.start()
        spider_list.append(thread)
    # join() is sufficient synchronization; see note in start_page().
    for parse in spider_list:
        if parse.is_alive():
            parse.join()


def main(xlswriter=None):
    """Run the whole pipeline. `xlswriter` is unused; kept for
    backward compatibility with existing callers."""
    # Stage queues: listing pages -> detail URLs -> raw page payloads.
    page_queue = Queue()
    detail_queue = Queue()
    data_queue = Queue()
    page_start = 1
    page_end = 1
    for i in range(page_start, page_end + 1):
        page_url = "https://www.maomp.com/wzjc/page/{}/".format(i)
        page_queue.put(page_url)
    print("頁面隊列:", page_queue.qsize())
    # Each stage runs to completion before the next starts.
    start_page(threadsize=3, page_queue=page_queue, detail_queue=detail_queue)
    start_detail(threadsize=3, detail_queue=detail_queue, data_queue=data_queue)
    # NOTE(review): gmtime() stamps the filename in UTC — switch to
    # time.localtime() if local wall-clock time is expected.
    book = xlsxwriter.Workbook(
        time.strftime("%Y%m%d%H%M%S", time.gmtime()) + "文件.xlsx")
    start_data_parse(threadsize=5, data_queue=data_queue, book=book)
    book.close()
    print("分頁數據個數:", page_queue.qsize())
    print("詳情頁數據個數:", detail_queue.qsize())
    print("數據數據個數:", data_queue.qsize())


if __name__ == '__main__':
    main()
2.PageSpider.py
# coding:utf-8
import threading
from lxml import etree
import HanderRequest


class PageSpider(threading.Thread):
    """Worker thread: fetches listing pages from page_queue and pushes
    every detail-page URL found onto detail_queue."""

    def __init__(self, thread_name, page_queue, detail_queue):
        """
        :param thread_name: label used in log output
        :param page_queue: queue of listing-page URLs to consume
        :param detail_queue: queue to fill with detail-page URLs
        """
        super(PageSpider, self).__init__()
        self.thread_name = thread_name
        self.page_queue = page_queue
        self.detail_queue = detail_queue

    def parse_detail_url(self, content):
        """Parse one listing page and enqueue each detail-page URL.

        :param content: HTML text of the listing page
        """
        item_html = etree.HTML(content)
        # Each article title <h2 class="entry-title"> links to its detail page.
        detail_urls = item_html.xpath("//h2[@class='entry-title']/a/@href")
        for url in detail_urls:
            self.detail_queue.put(url)

    def run(self):
        """Drain page_queue, fetching and parsing until it is empty."""
        print("{}啟動".format(self.thread_name))
        try:
            while not self.page_queue.empty():
                # Non-blocking get: if a sibling thread emptied the queue
                # between the empty() check and here, queue.Empty is raised
                # and the except below ends this worker.
                page_url = self.page_queue.get(block=False)
                # NOTE: "send_reqeust" typo is the helper's actual name.
                response_text = HanderRequest.send_reqeust(page_url)
                if response_text:
                    self.parse_detail_url(response_text)
        except Exception as e:
            print("{} 執行異常:{}".format(self.thread_name, e))
        print("{}結束".format(self.thread_name))
3.DetailSpider.py
# coding:utf-8
import threading
from lxml import etree
import HanderRequest


class DetailSpider(threading.Thread):
    """Worker thread: fetches each detail page from detail_queue and
    pushes {"url", "html_content"} payloads onto data_queue."""

    def __init__(self, thread_name, detail_queue, data_queue):
        """
        :param thread_name: label used in log output
        :param detail_queue: queue of detail-page URLs to consume
        :param data_queue: queue to fill with raw page payloads
        """
        super(DetailSpider, self).__init__()
        self.thread_name = thread_name
        self.data_queue = data_queue
        self.detail_queue = detail_queue

    def run(self):
        """Drain detail_queue, forwarding each fetched page downstream."""
        print("{}啟動".format(self.thread_name))
        try:
            while not self.detail_queue.empty():
                # Non-blocking get: queue.Empty from a racing sibling thread
                # is caught below and simply ends this worker.
                detail_url = self.detail_queue.get(block=False)
                # NOTE: "send_reqeust" typo is the helper's actual name.
                response_text = HanderRequest.send_reqeust(detail_url)
                if response_text:
                    data = {
                        "url": detail_url,
                        "html_content": response_text
                    }
                    # Hand the raw HTML to the DataParse stage.
                    self.data_queue.put(data)
        except Exception as e:
            print("{} 執行異常:{}".format(self.thread_name, e))
        print("{}結束".format(self.thread_name))
4.DataParse.py
# coding:utf-8
import threading
from lxml import etree
import Constant


class DataParse(threading.Thread):
    """Worker thread: parses detail-page HTML from data_queue and writes
    one row per page into the shared xlsxwriter worksheet."""

    def __init__(self, thread_name, data_queue, lock, sheet):
        """
        :param thread_name: label used in log output
        :param data_queue: queue of {"url", "html_content"} dicts
        :param lock: shared lock guarding the worksheet and row counter
        :param sheet: shared xlsxwriter worksheet to write rows into
        """
        super(DataParse, self).__init__()
        self.thread_name = thread_name
        self.data_queue = data_queue
        self.lock = lock
        self.sheet = sheet

    def __list_join(self, parts):
        # xpath() returns a list of text fragments; collapse into one string.
        # (Parameter renamed from `list` — it shadowed the builtin.)
        return "".join(parts)

    def __parse(self, data):
        """Extract url/title/date/content from one page and append a row.

        :param data: dict with "url" and "html_content" keys
        """
        html = etree.HTML(data.get("html_content"))
        data = {
            "url": data.get("url"),
            "title": self.__list_join(html.xpath("//h1[@class='entry-title']/text()")),
            "put_date": self.__list_join(html.xpath("//span[@class='my-date']/text()")),
            "content_html": self.__list_join(html.xpath("//div[@class='single-content']//p/text()"))
        }
        # Serialize with the shared lock: xlsxwriter worksheets are not
        # thread-safe, and the shared row counter must be bumped atomically.
        with self.lock:
            # NOTE(review): Constant.CURR_EXCEL_COL is actually the current
            # ROW index (starts at 1; row 0 holds the header) despite the name.
            for index, e in enumerate(data):
                self.sheet.write(Constant.CURR_EXCEL_COL, index, data.get(e))
            Constant.CURR_EXCEL_COL += 1

    def run(self):
        """Drain data_queue, parsing every payload until it is empty."""
        print("{}啟動".format(self.thread_name))
        try:
            while not self.data_queue.empty():
                # Non-blocking get: queue.Empty from a racing sibling thread
                # is caught below and simply ends this worker.
                data_content = self.data_queue.get(block=False)
                self.__parse(data_content)
        except Exception as e:
            print("{} 執行異常:{}".format(self.thread_name, e))
        print("{}結束".format(self.thread_name))
5.Constant.py
# coding:utf-8
# Next Excel row index to write (row 0 holds the header), shared across
# DataParse threads and mutated only while holding their common lock.
# NOTE(review): the name says "COL" but the value is used as the ROW index.
CURR_EXCEL_COL=1
6.HanderRequest.py
注意:運行前請將下方 headers 中的 Cookie("xxx")替換為有效的 Cookie
# coding:utf-8
import requests


def send_reqeust(url):
    """GET `url` and return the response body, or None on any non-200 reply.

    NOTE(review): the "reqeust" typo is kept because PageSpider and
    DetailSpider call the function by this exact name.

    :param url: absolute URL to fetch
    :return: response text on HTTP 200, otherwise None
    """
    headers = {
        "Cookie": "xxx",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
    }
    # timeout keeps a dead server from hanging a crawler thread forever;
    # the worker threads catch any resulting exception and log it.
    response = requests.get(url, headers=headers, timeout=15)
    if response.status_code == 200:
        # The original `and response` was dropped: a Response's truthiness
        # is derived from the status code already checked here.
        return response.text