python 微博爬蟲 01

起因，目的:

✅ 下載單個視頻，完成。
✅ 獲取某用戶的視頻列表，完成。
剩下的就是，根據視頻列表，逐個下載視頻，我沒做，沒意思。
獲取視頻的評論，以後再說。

關鍵點記錄:

1. 對一個視頻的直接 url，

ssig 是變動的。我估計是有時效的。
使用 requests 來下載單獨視頻，還是可行的。

2. 獲取視頻播放列表

不能直接使用 selenium 庫，因為網頁沒有直接顯示，只能一個一個點擊，會很慢。
獲取視頻播放列表，可以訪問 api: https://weibo.com/ajax/profile/getWaterFallContent?uid=5653796775&cursor=4436755690237089
cursor 參數是從 0 開始，而且響應的 JSON 中，會給出 "next_cursor": "4560020807617171"
實際情況是，使用 Firefox 瀏覽器查看 JSON 響應，很方便查看 JSON 的結構，很清晰。

3. 爬取微博，不建議使用 requests 庫，理由是

靜態頁面和動態頁面的區別。
中間有個 js 驗證！

1. 使用 requests，單獨下載一個微博視頻，成功。

# -*- coding: UTF-8 -*-
"""Download a single Weibo video with ``requests``.

1. If the direct URL of the video is already known, downloading it works.
2. Starting from the video's landing page, locating the video URL and then
   downloading fails, because JavaScript is involved in between.
"""
import requests


def make_headers():
    """Return the HTTP headers sent with every request."""
    headers = {
        'Accept-Encoding': '*/*',
        'Referer': 'https://weibo.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
    }
    return headers


class PlayAround:
    """Small helper that downloads videos through one shared HTTP session."""

    def __init__(self):
        self.headers = make_headers()
        self.session = requests.Session()
        # URLs whose download did not succeed.
        self.fail = []

    def download_video(self, video_url, video_name, timeout=30):
        """Download ``video_url`` and save it as ``<video_name>.mp4``.

        Args:
            video_url: Direct URL of the video file.
            video_name: Target file name without the ``.mp4`` extension.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            True when the file was written, False when the server answered
            with a status other than 200 (the URL is then recorded in
            ``self.fail``).

        Raises:
            requests.RequestException: On connection errors or timeouts.
        """
        # stream=True keeps the video out of memory; it is written chunk by
        # chunk instead of being held completely in ``resp.content``.
        with self.session.get(
            video_url,
            headers=self.headers,
            allow_redirects=False,
            stream=True,
            timeout=timeout,
        ) as resp:
            if resp.status_code != 200:
                print(f"下載失敗: {video_name}.mp4 (狀態碼: {resp.status_code})")
                self.fail.append(video_url)
                return False

            with open(f"{video_name}.mp4", "wb") as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        return True


if __name__ == '__main__':
    p = PlayAround()
    # Direct link to the video file.
    mp4_url = "https://f.video.weibocdn.com/o0/QSQkAf0wlx08cD50PfXa01041201cfbq0E010.mp4?label=mp4_1080p&template=1080x1920.24.0&media_id=5002705307893807&tp=8x8A3El:YTkl0eM8&us=0&ori=1&bf=4&ot=v&lp=00002D9dZv&ps=mZ6WB&uid=6Ak7kf&ab=13038-g1,,8012-g2,3601-g32,3601-g31,8013-g0,3601-g29,3601-g39,3601-g19,3601-g36,3601-g27,12739-g1,3601-g38,3601-g37&Expires=1744548301&ssig=jh4Js32Fx1&KID=unistore,video"
    video_name = "趙露思的微博的微博視頻223"
    p.download_video(mp4_url, video_name)

2. 使用 selenium + cookies 登錄微博

實際上，修改 cookies, 可以登錄任意網站。

"""
Log in to Weibo with selenium + cookies.

1. For any website, export all cookies with the EditThisCookie extension and
   paste them into the JSON file named by ``JSON_NAME``.
2. Running this file generates the pickle file automatically.
3. The script then simulates a login: after the page is refreshed the
   session is logged in.
4. Continue with other selenium features. If they raise errors, the
   ``sameSite`` attribute has to be removed from the cookies and the pickle
   file regenerated.
"""
import time
import random
import json
import pickle
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

URL = "https://weibo.com/"
PKL_NAME = "weibo_cookies.pkl"
JSON_NAME = "weibo_cookies.json"


class SeleHeaders:
    """Chrome session that logs in to Weibo by injecting saved cookies."""

    def __init__(self):
        self.option = Options()
        self.bot = webdriver.Chrome(options=self.option)

    @staticmethod
    def make_cookie():
        """Convert the exported JSON cookies into the pickle file.

        Reads ``JSON_NAME``, drops the ``sameSite`` key from every cookie and
        writes the result to ``PKL_NAME``.
        """
        with open(JSON_NAME, encoding="utf-8") as f:
            cookies = json.load(f)

        # Remove the sameSite attribute.
        for c in cookies:
            c.pop("sameSite", None)

        # Save as a pickle file.
        with open(PKL_NAME, "wb") as f:
            pickle.dump(cookies, f)
        print("? 已生成新的 pickle 文件（sameSite 屬性已刪除）")

    def login(self):
        """Open Weibo, inject the saved cookies and refresh the page.

        Errors are reported on stdout instead of being raised.
        """
        self.make_cookie()
        bot = self.bot
        # Selenium only accepts cookies for the domain that is currently
        # loaded, so the site has to be opened before add_cookie is called.
        bot.get(URL)
        time.sleep(random.randint(3, 7))
        try:
            # NOTE: pickle must only be used on files this script created
            # itself; never load a pickle obtained from somewhere else.
            with open(PKL_NAME, "rb") as f:
                cookies = pickle.load(f)
            for c in cookies:
                bot.add_cookie(c)
            bot.refresh()
            print("? 登錄成功！")
            # Refresh the page a few times to check the result.
            for _ in range(3):
                time.sleep(random.randint(2, 5))
                bot.refresh()
        except Exception as e:
            print("? 失敗！請檢查 cookies 文件或登錄狀態")
            print(e)


if __name__ == "__main__":
    meme = SeleHeaders()
    meme.login()

3. 使用 selenium 提取一個視頻的基本信息

比如： ‘2,095萬次觀看 · 1月前 · 發布于四川’
還是需要使用 cookies, 與前面的很相似。

"""
Log in to weibo.com with selenium + cookies, then find and download a video.

2025-04-13 19:07:07  Runs successfully and can download a video.

This file:
1. Logs in to weibo.com with selenium + cookies.
2. Finds the URL of the video and downloads it.

Preparation:
1. For any website, export all cookies with the EditThisCookie extension and
   paste them into the JSON file named by ``JSON_NAME``.
2. Running this file generates the pickle file automatically.
3. The script then simulates a login: after the page is refreshed the
   session is logged in.
4. Continue with other selenium features. If they raise errors, the
   ``sameSite`` attribute has to be removed from the cookies and the pickle
   file regenerated.
"""
import os
import re
import time
import random
import json
import pickle
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
import requests

URL = "https://weibo.com/"
PKL_NAME = "weibo_cookies.pkl"
JSON_NAME = "weibo_cookies.json"

# Characters that are not allowed in file names on common file systems.
_ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def make_headers():
    """Return the HTTP headers used for the video download."""
    headers = {
        'Accept-Encoding': '*/*',
        'Referer': 'https://weibo.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
    }
    return headers


def make_cookies():
    """Create the cookie pickle file from the exported JSON, if missing."""
    if os.path.exists(PKL_NAME):
        return  # The pickle file already exists, nothing to do.

    with open(JSON_NAME, encoding="utf-8") as f:
        cookies = json.load(f)

    # Remove the sameSite attribute.
    for c in cookies:
        c.pop("sameSite", None)

    # Save as a pickle file.
    with open(PKL_NAME, "wb") as f:
        pickle.dump(cookies, f)
    print("? 已生成新的 pickle 文件（sameSite 屬性已刪除）")


def _default_video_name():
    """Return a timestamp-based fallback name for a video."""
    return f"weibo_video_{int(time.time())}"


def _safe_filename(name):
    """Replace characters that cannot appear in a file name."""
    cleaned = _ILLEGAL_FILENAME_CHARS.sub("_", name).strip(" .")
    return cleaned or _default_video_name()


class SeleniumSpider:
    """Chrome session that logs in to Weibo and extracts video information."""

    def __init__(self):
        self.option = Options()
        self.bot = webdriver.Chrome(options=self.option)
        make_cookies()

    def login(self):
        """Open Weibo, inject the saved cookies and refresh the page.

        Errors are reported on stdout instead of being raised.
        """
        bot = self.bot
        # Selenium only accepts cookies for the domain that is currently
        # loaded, so the site has to be opened before add_cookie is called.
        bot.get(URL)
        time.sleep(random.randint(3, 7))
        try:
            # NOTE: pickle must only be used on files this script created
            # itself; never load a pickle obtained from somewhere else.
            with open(PKL_NAME, "rb") as f:
                cookies = pickle.load(f)
            for c in cookies:
                bot.add_cookie(c)
            bot.refresh()
            print("? 登錄成功！")
        except Exception as e:
            print("? 失敗！請檢查 cookies 文件或登錄狀態")
            print(e)

    def get_video_info(self, url):
        """Collect URL, name and date line of the video behind ``url``.

        Args:
            url: Either a direct ``.mp4`` / ``.m3u8`` link or the address of
                a Weibo video page.

        Returns:
            A dict with the keys ``video_url``, ``video_name`` and
            ``video_date``, or None when no usable video source was found on
            the page.
        """
        video_info = {"video_url": "", "video_name": "", "video_date": ""}

        if url.endswith(('.mp4', '.m3u8')):
            # A direct media link: there is no page to parse.
            video_info["video_url"] = url
            video_info["video_name"] = url.split('/')[-1].split('.')[0]
            print(f"? 檢測到直接視頻鏈接: {url}")
        else:
            bot = self.bot
            bot.get(url)
            time.sleep(random.randint(3, 6))  # Wait for the page to load.

            # Find the <video> element and read its source.
            try:
                video_element = bot.find_element("tag name", "video")
            except NoSuchElementException:
                print(f"? 未找到視頻元素: {url}")
                return None
            src = video_element.get_attribute("src")
            # A missing src or a browser-internal blob: URL cannot be fetched
            # with requests, so treat it like a missing video.
            if not src or src.startswith("blob:"):
                print(f"? 未找到視頻元素: {url}")
                return None
            video_info["video_url"] = src

            # Use the page title as video name, with a timestamp fallback.
            title = (bot.title or "").strip()
            if title:
                video_info["video_name"] = title
            else:
                video_info["video_name"] = _default_video_name()
                print(f"?? 未找到標題，使用默認名稱: {video_info['video_name']}")

            # Extract the full views / date / location string, e.g.
            # '2,095萬次觀看 · 1月前 · 發布于 四川'.
            # Target: div.star-f16 whose parent is div.Detail_tith4_3_UzS.
            try:
                date_element = bot.find_element(
                    "css selector", "div.Detail_tith4_3_UzS > div.star-f16"
                )
                video_info["video_date"] = date_element.text.strip()
            except NoSuchElementException:
                video_info["video_date"] = ""
                print("?? 未找到日期信息")

        print(f"? 獲取視頻信息成功: {url}")
        print(video_info)
        return video_info

    def download_video(self, video_info, save_path=""):
        """Download the video described by ``video_info``.

        Args:
            video_info: Dict as returned by ``get_video_info``; the keys
                ``video_url`` and ``video_name`` are used.
            save_path: Directory to store the file in. It is created when it
                does not exist. The empty string means the current directory.

        Returns:
            True when the file was written, False when the server answered
            with a status other than 200.

        Raises:
            requests.RequestException: On connection errors or timeouts.
        """
        headers = make_headers()

        # Construct the full save path.
        if save_path:
            os.makedirs(save_path, exist_ok=True)
        filename = os.path.join(
            save_path, f"{_safe_filename(video_info['video_name'])}.mp4"
        )

        # Download the video.
        print(f"? 開始下載: {filename}")
        with requests.Session() as session, session.get(
            video_info["video_url"], headers=headers, stream=True, timeout=30
        ) as response:
            if response.status_code != 200:
                print(f"? 下載失敗: {filename} (狀態碼: {response.status_code})")
                return False

            with open(filename, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        print(f"? 下載成功: {filename}")
        return True


if __name__ == "__main__":
    spider = SeleniumSpider()
    try:
        # 1. Log in first.
        spider.login()

        # 2. Then fetch the information about the video.
        one_video_url = "https://weibo.com/tv/show/1034:5136658249744436?mid=5136667216183574"
        video_info = spider.get_video_info(one_video_url)

        # 3. Download the video.
        if video_info:
            spider.download_video(video_info, save_path="videos/")
    finally:
        spider.bot.quit()  # Close the browser even when a step failed.

輸出結果:
在這里插入圖片描述

4. 使用 requests + cookies + api, 獲取某用戶的視頻播放列表。

這部分是比較麻煩的。需要加上 cookies!! 否則的話，得到的是 js 代碼 !
下面的代碼，請使用自己的 cookies ，并且修改頁數，默認是5頁。

"""Fetch the video list of a Weibo user through the waterfall API.

Requested API:
https://weibo.com/ajax/profile/getWaterFallContent?uid=5653796775&cursor=4436755690237089

This part is a bit involved: cookies have to be sent along, otherwise the
server answers with JavaScript instead of JSON.
"""
import requests


def make_headers():
    """Return the request headers, including the login cookie.

    The ``cookie`` value is a placeholder and has to be replaced with the
    cookie string of your own logged-in session.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Referer': 'https://weibo.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
        "cookie": "SC*****很長很長*******FVH4X16",
    }
    return headers


def get_video_list(user_id, max_pages=5):
    """Collect the video URLs of a user, page by page.

    Each response carries a ``next_cursor`` value that is used to request the
    following page; paging starts with cursor ``"0"``.

    Args:
        user_id: Weibo uid of the user whose videos are listed.
        max_pages: Maximum number of API pages to request.

    Returns:
        A list of dicts, one per video, each with the key ``video_url``.
    """
    video_infos = []
    cursor = "0"
    headers = make_headers()

    for page in range(max_pages):
        print(f"? 獲取第 {page + 1} 頁視頻...")
        url = f"https://weibo.com/ajax/profile/getWaterFallContent?uid={user_id}&cursor={cursor}"
        response = requests.get(url, headers=headers, timeout=30)

        if response.status_code != 200:
            print(f"? API 請求失敗: {response.status_code}")
            break

        # Without valid cookies the body is JavaScript, not JSON.
        try:
            data = response.json()
        except ValueError:
            print("響應不是 JSON（可能是 js 驗證頁面），請檢查 cookies")
            break

        # ``or {}`` / ``or []`` also cover keys that are present but null.
        payload = data.get("data") or {}

        # Parse the JSON data. For every entry of data/list follow
        # page_info/media_info/playback_list/[0]/play_info/url.
        for item in payload.get("list") or []:
            media_info = (item.get("page_info") or {}).get("media_info") or {}
            playback_list = media_info.get("playback_list") or []
            if not playback_list:
                continue
            # Only the first entry of playback_list is needed.
            video_url = (playback_list[0].get("play_info") or {}).get("url", "")
            if not video_url:
                continue
            print(video_url)
            video_infos.append({"video_url": video_url})

        # Continue with the next API page.
        cursor = payload.get("next_cursor", "")
        if not cursor:
            print("? 已到達最后一頁")
            break

    print(f"? 共收集 {len(video_infos)} 個視頻信息")
    return video_infos


if __name__ == '__main__':
    get_video_list(user_id="5653796775", max_pages=5)