Python Crawler Examples
Code
import requests
import json
import threading
from queue import Queue
import time


class HeiMaTouTiao:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/92.0.4515.107 Safari/537.36",
            'Authorization': 'Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJIU'
                             'zI1NiJ9.eyJleHAiOjE2NTY2NTk3NjcsInVzZXJfaWQiOjEsInJlZn'
                             'Jlc2giOmZhbHNlLCJ2ZXJpZmllZCI6dHJ1ZX0.ZSdV5mT6w_yhEKLg'
                             'qcvWNln2GKHBxfxK7d8YXaoCMYg'
        }
        # queue of page URLs waiting to be fetched
        self.url_queue = Queue()
        # queue of extracted content waiting to be saved
        self.content_queue = Queue()

    def get_url_list(self, start_page, end_page):
        url_temp = 'http://api-toutiao-web.itheima.net/mp/v1_0/articles?' \
                   'page={}&per_page=10&response_type=comment'
        url_list = [url_temp.format(i) for i in range(start_page, end_page + 1)]
        for url in url_list:
            print('Requesting:', url)
            self.url_queue.put(url)

    def get_data(self):
        while True:
            url = self.url_queue.get()
            comment = requests.get(url=url, headers=self.headers).text
            data = json.loads(comment)['data']['results']
            # collect only the articles of this page, so earlier pages
            # are not written to the file again
            content_li = []
            for item in data:
                content = dict()
                content['title'] = item['title']
                if item['comment_status'] is True:
                    content['comment status'] = 'open'
                else:
                    content['comment status'] = 'closed'
                content['total comments'] = item['total_comment_count']
                content['fan comments'] = item['fans_comment_count']
                content_li.append(content)
            self.content_queue.put(content_li)
            self.url_queue.task_done()

    def save_data(self):
        while True:
            content_list = self.content_queue.get()
            with open('toutiao.json', mode='a+', encoding='utf-8') as f:
                f.write(json.dumps(content_list, ensure_ascii=False, indent=2))
            self.content_queue.task_done()

    def run(self):
        start_page = int(input('Enter the start page: '))
        end_page = int(input('Enter the end page: '))
        if start_page <= 0:
            print('The start page must be 1 or greater.')
            return
        # list of worker threads
        t_list = []
        t_url = threading.Thread(target=self.get_url_list,
                                 args=(start_page, end_page))
        t_list.append(t_url)
        # content-extraction threads
        for i in range(9):
            t_content = threading.Thread(target=self.get_data)
            t_list.append(t_content)
        # data-saving thread
        t_save = threading.Thread(target=self.save_data)
        t_list.append(t_save)
        for t in t_list:
            # daemon threads exit together with the main thread
            t.daemon = True
            t.start()
        # wait until both queues have been fully processed
        for q in [self.url_queue, self.content_queue]:
            q.join()


if __name__ == '__main__':
    heimatoutiao = HeiMaTouTiao()
    start_time = time.time()
    heimatoutiao.run()
    end_time = time.time()
    print(f'Total time: {end_time - start_time} seconds')
This Python code defines a class named HeiMaTouTiao that crawls article information (title, comment status, and comment counts) from a training site's API and appends it to a JSON file, using one thread to generate page URLs, nine threads to extract the data, and one thread to save the results.
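The threads are coordinated through the task_done()/join() protocol of queue.Queue rather than by joining the threads themselves: every thread is a daemon, so the process exits as soon as both queues report all items processed. A minimal sketch of the same producer-consumer pattern, with a print standing in for the request-and-parse work (worker and the integer items are placeholders, not part of the original code):

import threading
from queue import Queue

task_queue = Queue()

def worker():
    while True:
        item = task_queue.get()
        print(f'processed {item}')  # stand-in for the request/parse logic
        task_queue.task_done()      # mark this item as finished

# daemon threads die with the main thread, so the infinite loops are safe
for _ in range(3):
    threading.Thread(target=worker, daemon=True).start()

for i in range(10):
    task_queue.put(i)

task_queue.join()  # blocks until task_done() has been called once per put()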
Code
import requests
import json
from pymongo import MongoClient
class LittleRabbit:
    def __init__(self):
        # URL of the in-car accessories category page
        self.init_url = 'https://apipc-xiaotuxian-front.itheima.net/category/goods/temporary'
        # request headers
        self.headers = {
            "Content-Type": "application/json;charset=utf-8",
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/90.0.4430.212 Safari/537.36'
        }
        # client connected to the local MongoDB server
        self.client = MongoClient('127.0.0.1', 27017)

    def load_category_page(self, page):
        """
        Fetch one listing page of the in-car accessories category.
        :param page: page number to fetch
        :return: all goods on that page of the category
        """
        # prepare the request payload
        request_payload = {"page": page, "pageSize": 20, "categoryId": "1005009"}
        # convert the dict request_payload into a JSON string
        json_data = json.dumps(request_payload)
        response = requests.post(url=self.init_url, data=json_data,
                                 headers=self.headers)
        # parse the returned JSON string and pull out the goods list
        all_goods = json.loads(response.text)["result"]["items"]
        return all_goods

    def load_detail_page(self, all_goods):
        """
        Fetch the detail page of each product.
        :param all_goods: all goods in the category
        :return: detail data of every product
        """
        # base URL of the product-detail endpoint
        base_url = 'https://apipc-xiaotuxian-front.itheima.net/goods?'
        # list that collects the detail data of every product
        goods_detail_info = []
        for good_info in all_goods:
            # extract the product ID
            good_id = dict(id=good_info['id'])
            # send a GET request with the ID appended as a query parameter
            response = requests.get(url=base_url, params=good_id)
            # convert the returned JSON data into a dict
            good_detail = json.loads(response.text)
            goods_detail_info.append(good_detail)
        return goods_detail_info

    def parse_page(self, detail_data):
        """
        Parse the detail data and extract the target fields.
        :param detail_data: detail data of every product
        :return: extracted info of every product
        """
        # list that collects the extracted info of every product
        all_goods_info = []
        temp_url = 'http://erabbit.itheima.net/#/product/'
        for info in detail_data:
            dict_data = dict()
            dict_data['name'] = info['result']['name']
            dict_data['description'] = info['result']['desc']
            dict_data['link'] = temp_url + info['result']['id']
            dict_data['price'] = info['result']['price']
            # first picture on the detail page
            dict_data['picture'] = info['result']['mainPictures'][0]
            good_detail = info['result']['details']['properties']
            dict_data['details'] = ''.join(
                [':'.join(prop.values()) + '\n' for prop in good_detail])
            all_goods_info.append(dict_data)
        return all_goods_info

    def save_data(self, goods_info):
        """
        Store the product data in MongoDB.
        :param goods_info: extracted info of every product
        """
        client = self.client
        # access (or create) the database rabbit
        db = client.rabbit
        try:
            for good in goods_info:
                # create the collection little_rabbit and insert the document
                db.little_rabbit.insert_one(good)
                print('Saved successfully')
            # read back the documents in the collection
            result = db.little_rabbit.find()
            for doc in result:
                print(doc)
        except Exception as error:
            print(error)

    def run(self):
        """
        Start the crawler and control its workflow.
        """
        begin_page = int(input('Start page: '))
        end_page = int(input('End page: '))
        if begin_page <= 0:
            print('The start page must be 1 or greater.')
        else:
            for page in range(begin_page, end_page + 1):
                print(f'Fetching page {page}')
                all_goods = self.load_category_page(page)
                goods_detail = self.load_detail_page(all_goods)
                goods_info = self.parse_page(goods_detail)
                self.save_data(goods_info)


if __name__ == '__main__':
    lr = LittleRabbit()
    lr.run()
This code defines a class named LittleRabbit that crawls product information (name, description, link, price, picture, and details) from the specified site and stores it in a MongoDB database.
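To check the stored result outside the crawler, a quick read-back with pymongo works, assuming MongoDB is running locally on the default port as in the code above (database rabbit, collection little_rabbit):

from pymongo import MongoClient

client = MongoClient('127.0.0.1', 27017)
collection = client.rabbit.little_rabbit

# count how many product documents were inserted
print('documents stored:', collection.count_documents({}))
# show one sample record; the projection hides MongoDB's auto-generated _id
print(collection.find_one({}, {'_id': 0}))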