前言
本項目目前存在 Cookie 無法自動更新的問題,後續可能會另外發布解決教程。另外,微博網頁版最多只能看到 300 條評論;回復他人評論時,最多連續回復 15 條,之後需要休息 5 分鐘左右才能繼續評論。
1. 項目概述
本項目實現了一個微博評論自動化處理系統,主要功能包括:
- 微博評論區數據爬取
- 文本內容清洗過濾
- 使用預訓練模型進行情感分析
- 違法內容檢測與AI法律條文回復
- 數據存儲(MySQL+Excel)
- 異常情況短信提醒
技術棧:
- Python
- Transformers(情感分析模型)
- DeepSeek API(智能回復生成)
- MySQL(數據存儲)
- Requests(微博接口請求)
2. 核心功能模塊
2.1 數據爬取模塊
# Outline of the crawler class; the method bodies are elided in this excerpt.
class WeiboSpider:
    def get_id(self, theme): ...  # fetch the Weibo post ID
    def get_comments(self, com_id): ...  # crawl the comments page by page
    def filter_text(self, text): ...  # strip illegal characters with regular expressions
特點:
- 模擬瀏覽器請求頭
- 自動處理分頁邏輯
- 支持多種括號內容過濾
2.2 情感分析模塊
# Excerpt: only the pipeline construction is shown; the scoring loop is omitted here.
def ana_com(self, sample_comments):
    sentiment_pipeline = pipeline("sentiment-analysis", model=self.model_name)
    # Uses the uer/roberta-base-finetuned-jd model
模型選擇:
- 使用在中文電商評論上微調的RoBERTa模型
- 支持二分類(positive/negative)
2.3 AI智能回復模塊
# Excerpt: only the client construction is shown; the chat-completion call is omitted here.
def ai_com(self, sample_comment):
    client = OpenAI(api_key="your_key",base_url="https://api.deepseek.com")
    # Calls the DeepSeek model in the role of a legal expert
2.4 數據存儲模塊
# Pseudo-code outline of the two storage paths; "(...)" stands for the elided parameters.
class MySQLStorage:
    def insert_comment(...):  # MySQL storage
# NOTE: in the complete listing store_to_excel is defined with a `self`
# parameter among the WeiboSpider methods, not on MySQLStorage.
def store_to_excel(...):  # Excel backup
3. 關鍵代碼解析
3.1 評論過濾邏輯
# Abridged excerpt: the "..." inside both patterns appears to abbreviate the
# longer patterns used in the complete listing, so do not copy them verbatim.
def filter_text(text):
    # Remove several kinds of bracketed content
    text = re.sub(r'<[^>]*>|\{[^}]*\}|\[...]', '', text)
    # Keep Chinese characters, English letters, digits and common punctuation
    pattern = re.compile(r'[^\u4e00-\u9fa5a-zA-Z0-9,。!?...]+')
    return re.sub(pattern, '', text)
3.2 違法內容檢測
# Abridged excerpt: `probs` is not defined here; the complete listing computes
# it as torch.softmax(outputs.logits, dim=-1) before taking the argmax.
def zhengzhi_com(self, text):
    inputs = self.tokenizer(text, return_tensors="pt",truncation=True)
    outputs = self.model(**inputs)
    return torch.argmax(probs).item()  # 1 means illegal content
3.3 自動回復流程
# Sketch of the reply flow: a flagged comment gets an AI-generated reply posted to it.
if self.zhengzhi_com(comment) == 1:
    content = self.ai_com(comment)
    self.replay_comments(weibo_id, cid, content)
# Pseudo-code condition: "negative comments exceed the threshold".
if 負面評論超過閾值:
    self.send_mess()  # trigger the SMS alert
4. 環境配置指南
4.1 依賴安裝
pip install transformers torch requests pandas openpyxl openai mysql-connector-python
5. 效果展示
5.1 運行示例
6. 優化方向
- 反爬策略增強:
  - 添加IP代理池
  - 實現Cookie自動更新
- 模型優化:
  - 使用更大規模的中文預訓練模型
  - 加入自定義訓練數據
- 功能擴展:
  - 支持多微博同時監控
  - 添加可視化分析面板
7. 總結
本項目實現了微博評論的自動化處理閉環,主要創新點:
- 將情感分析與法律條文回復相結合
- 雙存儲方案保證數據可靠性
- 智能閾值判斷降低誤報率
完整代碼
import re
import time
import requests
import pandas as pd
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import torch
from openai import OpenAI # 請確保已安裝并正確配置 OpenAI SDK
import logging
import mysql.connector
from mysql.connector import Error


class MySQLStorage:
    """Persist moderated comments into the ``comments`` table of a MySQL database.

    The connection is opened lazily: ``insert_comment`` calls ``connect`` when
    no connection is currently held.
    """

    def __init__(self, host, user, password, database):
        """Store the connection settings; no connection is opened here."""
        self.host = host
        self.user = user
        self.password = password
        self.database = database
        self.connection = None

    def connect(self):
        """Open the MySQL connection.

        On failure the error is logged and ``self.connection`` is left as
        ``None``; no exception is propagated.
        """
        try:
            self.connection = mysql.connector.connect(
                host=self.host,
                user=self.user,
                password=self.password,
                database=self.database,
            )
            if self.connection.is_connected():
                logging.info("MySQL連接成功")
        except Error as e:
            logging.error("連接MySQL出錯: %s", e)
            self.connection = None

    def insert_comment(self, comment_id, comment_text, classification, reply):
        """Insert one comment row and commit it.

        Database errors are logged, not raised, so a failed insert never
        aborts the caller's loop.
        """
        if not self.connection:
            self.connect()
        if not self.connection:
            # connect() failed and already logged the cause; without this
            # guard the cursor() call below would raise AttributeError, which
            # the `except Error` clause does not catch.
            logging.error("插入評論時出錯: %s", "MySQL連接不可用")
            return
        cursor = None
        try:
            cursor = self.connection.cursor()
            sql = "INSERT INTO comments (comment_id, comment_text, classification, reply) VALUES (%s, %s, %s, %s)"
            values = (comment_id, comment_text, classification, reply)
            cursor.execute(sql, values)
            self.connection.commit()
            logging.info("插入評論ID %s 成功", comment_id)
        except Error as e:
            logging.error("插入評論時出錯: %s", e)
        finally:
            # Release the cursor on success and on failure alike.
            if cursor is not None:
                cursor.close()

    def close(self):
        """Close the connection if one is open."""
        if self.connection:
            self.connection.close()
            # Forget the closed handle so a later insert reconnects instead of
            # reusing a dead connection.
            self.connection = None
            logging.info("MySQL連接關閉")


class WeiboSpider:
    """Crawl the comments of a Weibo post, classify them and auto-reply.

    Workflow (see ``run``): resolve the post id, page through its comments,
    count positive/negative comments with a sentiment pipeline, reply to
    flagged comments with a DeepSeek-generated text, raise an alert when the
    thresholds are exceeded, and export the comments to Excel.
    """

    MODEL_NAME = "uer/roberta-base-finetuned-jd-binary-chinese"

    # Returned by ai_com when no reply could be generated; never posted publicly.
    AI_FALLBACK_REPLY = "無法生成回復"

    # Browser-like request headers captured from a logged-in web session.
    DEFAULT_HEADERS = {
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'cache-control': 'no-cache',
        'client-version': 'v2.47.42',
        'pragma': 'no-cache',
        'priority': 'u=1, i',
        'referer': 'https://weibo.com',
        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Microsoft Edge";v="134"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'server-version': 'v2025.03.13.1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0',
        'x-requested-with': 'XMLHttpRequest',
        'x-xsrf-token': 'dSkqzhoyOR93G8syKpQJyAK6',
    }

    # NOTE(review): these are hard-coded session credentials. Prefer passing
    # fresh values through the ``cookies=`` / ``headers=`` constructor
    # arguments rather than editing this table.
    DEFAULT_COOKIES = {
        'PC_TOKEN': 'b7063fd6a8',
        'SCF': 'ApLwKgU7wH8un2lyl7onZ1dcBvI3q1epuPNFSFxuMr2n8iv6RrnGBsMOizTQ8qxB5kNTwzX0lUmeqa8SNPeh8ME.',
        'SUB': '_2A25FLscfDeRhGeFH6lMV8yfNzz-IHXVmQkbXrDV8PUNbmtAbLUP3kW9Ne-lAJhc5FMVOy_Y3MCs3-DA0aRSLKoTc',
        'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9WWFGmFm5XRHK8Yuudiw2bKA5NHD95QN1K2pShe4eKB0Ws4Dqcj.i--ciKnRiK.pi--Ri-2ciKnpi--NiKnRi-i2i--NiKy8i-24',
        'ALF': '02_1750221904',
        'WBPSESS': 'FT0tjnG_uSkfs1RofpxMm5pUM9iNsMc-7Ud_mBJKmqp97lszgTfzKkQ2WdsmNdtegfc7aSrvkjrq05K1BbriOuzWdCHsg5myyuP3pi1vY0SScFtRt8S2HRwdljlfX5EBzTkwrmdbUHL4_A0kdHTeQQ==',
    }

    # Compiled once; filter_text is called for every crawled comment.
    _BRACKETED = re.compile(r'<[^>]*>|\{[^}]*\}|\[[^\]]*\]|\([^)]*\)')
    _DISALLOWED = re.compile(r'[^\u4e00-\u9fa5a-zA-Z0-9,。!?、;:“”‘’()—…《》〈〉【】]+')

    def __init__(self, mysql_storage=None, *, headers=None, cookies=None,
                 api_key="你自己的key", request_timeout=10):
        """Load the classifier and set up the request state.

        Args:
            mysql_storage: optional ``MySQLStorage`` used by ``store_to_mysql``.
            headers: request headers; defaults to ``DEFAULT_HEADERS``.
            cookies: session cookies; defaults to ``DEFAULT_COOKIES``.
            api_key: DeepSeek API key used by ``ai_com``.
            request_timeout: timeout in seconds for every Weibo HTTP request.
        """
        # Copy so per-instance changes never leak into the class-level tables.
        self.headers = dict(self.DEFAULT_HEADERS if headers is None else headers)
        self.cookies = dict(self.DEFAULT_COOKIES if cookies is None else cookies)
        self.api_key = api_key
        self.request_timeout = request_timeout
        self.model_name = self.MODEL_NAME
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name)
        # This assignment used to be commented out, so store_to_mysql raised
        # AttributeError as soon as it was called.
        self.mysql_storage = mysql_storage  # optional MySQL storage object
        self.excel_data = []  # rows exported by store_to_excel
        self._sentiment_pipeline = None  # built on first use by ana_com
        self._ai_client = None  # built on first use by ai_com
        self._weibo_by_comment = {}  # comment id -> id of the post it belongs to
        self._moderation = {}  # comment id -> (classification, reply)

    @staticmethod
    def filter_text(text):
        """Return *text* with bracketed spans and disallowed characters removed."""
        # 1. Drop brackets together with their content (< > { } [ ] ( )).
        text = WeiboSpider._BRACKETED.sub('', text)
        # 2. Keep only Chinese characters, letters, digits and common punctuation.
        return WeiboSpider._DISALLOWED.sub('', text)

    @staticmethod
    def _is_negative(label):
        """Tell whether a classifier label names the negative class."""
        return str(label).lower().startswith("negative")

    def _max_tokens(self):
        """Return the longest token sequence the loaded model accepts."""
        return getattr(self.model.config, "max_position_embeddings", 512)

    def get_id(self, theme):
        """Resolve the post identifier *theme* to the numeric Weibo id.

        Returns:
            The ``id`` field of the API response, or ``None`` when the request
            fails or the response carries no id (the reason is logged).
        """
        params = {
            'id': theme,
            'locale': 'zh-CN',
            'isGetLongText': 'true',
        }
        try:
            response = requests.get('https://weibo.com/ajax/statuses/show', params=params,
                                    cookies=self.cookies, headers=self.headers,
                                    timeout=self.request_timeout).json()
            weibo_id = response.get('id')
        except Exception as e:
            logging.error("get_id 出錯: %s", e)
            return None
        if not weibo_id:
            logging.error("get_id 出錯: %s", "未獲取到微博ID")
            return None
        return weibo_id

    def get_comments(self, com_id):
        """Page through the comments of post *com_id*.

        Every comment is filtered with ``filter_text``, printed, and appended
        to ``self.excel_data``. Crawling is best-effort: on an error the
        failure is logged and whatever was collected so far is returned.

        Returns:
            ``(all_texts, user_dict)`` where ``all_texts`` lists the filtered
            comment texts and ``user_dict`` maps text -> comment id. Comments
            with identical filtered text share one key, so ``user_dict`` only
            keeps the last id for them.
        """
        max_id = 0
        all_texts = []
        user_dict = {}
        try:
            while True:
                params = {
                    'is_reload': '1',
                    'id': com_id,
                    'is_show_bulletin': '2',
                    'is_mix': '0',
                    'max_id': max_id,
                    'count': '10',
                    # NOTE(review): presumably the uid of the post's author,
                    # which would tie this to one specific post — confirm.
                    'uid': '1798653494',
                    'fetch_level': '0',
                    'locale': 'zh-CN',
                }
                response = requests.get('https://weibo.com/ajax/statuses/buildComments', params=params,
                                        cookies=self.cookies, headers=self.headers,
                                        timeout=self.request_timeout).json()
                datas = response.get('data', [])
                if not datas:
                    break
                for data in datas:
                    cid = str(data.get('id', ''))
                    text = self.filter_text(str(data.get('text', '')))
                    all_texts.append(text)
                    user_dict[text] = cid
                    # Also record the row for the Excel export.
                    self.excel_data.append({
                        'comment_id': cid,
                        'comment_text': text,
                    })
                    # Remember the owning post so a reply can target it later.
                    self._weibo_by_comment[cid] = com_id
                    print(cid, text)
                next_max_id = response.get('max_id') or 0
                # 0 marks the last page; an unchanged cursor would otherwise
                # re-fetch the same page forever.
                if next_max_id == 0 or next_max_id == max_id:
                    break
                max_id = next_max_id
                time.sleep(3)
        except Exception as e:
            logging.error("get_comments 出錯: %s", e)
        return all_texts, user_dict

    def replay_comments(self, com_id, user_id, content):
        """Post *content* as a reply to comment *user_id* under post *com_id*.

        The response body is printed; request errors are logged, not raised.
        The method then always sleeps 5 seconds to space out replies.
        """
        data = {
            'id': com_id,
            'cid': user_id,
            'comment': content,
            'pic_id': '',
            'is_repost': '0',
            'comment_ori': '0',
            'is_comment': '0',
        }
        try:
            response = requests.post('https://weibo.com/ajax/comments/reply',
                                     cookies=self.cookies, headers=self.headers, data=data,
                                     timeout=self.request_timeout)
            print("回復結果:", response.text)
        except Exception as e:
            logging.error("replay_comments 出錯: %s", e)
        time.sleep(5)

    def ana_com(self, sample_comments):
        """Count positive and negative comments.

        Returns:
            ``(pos_score, neg_score)``. Both are 0 for empty input or when the
            pipeline fails (the error is logged).
        """
        pos_score = 0
        neg_score = 0
        if not sample_comments:
            return pos_score, neg_score
        try:
            if self._sentiment_pipeline is None:
                # Reuse the model and tokenizer loaded in __init__ instead of
                # loading a second copy from the model name on every call.
                self._sentiment_pipeline = pipeline("sentiment-analysis", model=self.model,
                                                    tokenizer=self.tokenizer)
            # Truncate so one over-long comment cannot fail the whole batch.
            results = self._sentiment_pipeline(list(sample_comments), truncation=True,
                                               max_length=self._max_tokens())
            neg_score = sum(1 for result in results if self._is_negative(result.get('label', '')))
            pos_score = len(results) - neg_score
        except Exception as e:
            logging.error("ana_com 出錯: %s", e)
        return pos_score, neg_score

    def zhengzhi_com(self, text):
        """Return 1 when *text* is classified as inappropriate, else 0.

        The predicted class index is translated through the model's
        ``id2label`` table and the negative class is treated as flagged, the
        same rule ``ana_com`` applies. If the checkpoint's labels are not
        named positive/negative, class index 1 is treated as flagged.
        Errors are logged and reported as 0.

        NOTE(review): the checkpoint is a sentiment classifier, so "negative"
        is only a proxy for inappropriate content — confirm this is intended.
        """
        try:
            inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True,
                                    max_length=self._max_tokens())
            with torch.no_grad():  # inference only; no gradients needed
                outputs = self.model(**inputs)
            # argmax over logits equals argmax over their softmax.
            predicted = torch.argmax(outputs.logits, dim=-1).item()
            label = str(self.model.config.id2label.get(predicted, "")).lower()
            if label.startswith(("negative", "positive")):
                return 1 if self._is_negative(label) else 0
            return 1 if predicted == 1 else 0
        except Exception as e:
            logging.error("zhengzhi_com 出錯: %s", e)
            return 0

    def ai_com(self, sample_comment):
        """Ask DeepSeek for a short legal-advice reply to *sample_comment*.

        Returns:
            The generated reply, or ``AI_FALLBACK_REPLY`` when the request
            fails or yields no text.
        """
        try:
            if self._ai_client is None:
                self._ai_client = OpenAI(api_key=self.api_key, base_url="https://api.deepseek.com")
            response = self._ai_client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "system",
                     "content": "你是一個精通法律且經常上網沖浪的人,懂得網友回復,幫我判斷微博評論的違法性,并給出法律條文回復和建議,要求簡潔精煉,字數不能超過50字,否則無法回復,法律條文可以說的不具體"},
                    {"role": "user", "content": sample_comment},
                ],
                stream=False,
            )
            reply = response.choices[0].message.content
            return reply or self.AI_FALLBACK_REPLY
        except Exception as e:
            logging.error("ai_com 出錯: %s", e)
            return self.AI_FALLBACK_REPLY

    def send_mess(self):
        """Placeholder for the SMS alert; plug a third-party SMS API in here."""
        print("發送短信提醒...")

    def _moderate(self, weibo_id, comment_id, text):
        """Classify one comment and reply to it at most once.

        Returns:
            ``(classification, reply)`` where classification is "不當" or
            "正常" and reply is "" for comments that are not flagged.
        """
        cached = self._moderation.get(comment_id) if comment_id else None
        if cached is not None:
            return cached
        # Empty text (e.g. an emoji-only comment after filtering) is not judged.
        if text and self.zhengzhi_com(text) == 1:
            reply = self.ai_com(text)
            # Never post the failure placeholder, and skip posting when the
            # target post or comment is unknown.
            if weibo_id and comment_id and reply != self.AI_FALLBACK_REPLY:
                self.replay_comments(weibo_id, comment_id, reply)
            outcome = ("不當", reply)
        else:
            outcome = ("正常", "")
        if comment_id:
            self._moderation[comment_id] = outcome
        return outcome

    def store_to_mysql(self):
        """Classify every collected comment and insert it into MySQL.

        Does nothing when no ``mysql_storage`` was supplied. Comments that
        ``run`` already handled reuse their stored result, so nobody is
        replied to twice; other comments are moderated here.
        """
        if not self.mysql_storage:
            return
        for data in self.excel_data:
            comment_text = data.get('comment_text', '')
            comment_id = data.get('comment_id', '')
            # Replies must target the owning post's id, not the comment id.
            weibo_id = self._weibo_by_comment.get(comment_id)
            classification, reply = self._moderate(weibo_id, comment_id, comment_text)
            self.mysql_storage.insert_comment(comment_id, comment_text, classification, reply)

    def store_to_excel(self, excel_file="comments.xlsx"):
        """Export the collected comments to *excel_file*; errors are logged."""
        try:
            df = pd.DataFrame(self.excel_data)
            df.to_excel(excel_file, index=False)
            print("數據已導出到", excel_file)
        except Exception as e:
            logging.error("store_to_excel 出錯: %s", e)

    def run(self, theme='PiV4XoZZM', cooldown=60):
        """Run one full crawl / analyse / reply / export cycle.

        Args:
            theme: post identifier passed to ``get_id``.
            cooldown: seconds to sleep after the export.

        MySQL storage is not triggered here; call ``store_to_mysql``
        explicitly when a storage object was supplied.
        """
        weibo_id = self.get_id(theme)
        if not weibo_id:
            print("獲取微博ID失敗")
            return
        first_new = len(self.excel_data)
        comments, _ = self.get_comments(weibo_id)
        # Use the per-comment rows rather than the text -> id dict, which
        # collapses comments that share the same filtered text.
        fetched = self.excel_data[first_new:]
        pos_sc, neg_sc = self.ana_com(comments)
        print("正面評論數量:", pos_sc)
        print("負面評論數量:", neg_sc)
        bad = 0
        # Decide whether to auto-reply at all.
        if neg_sc > pos_sc - 10:
            for row in fetched:
                classification, reply = self._moderate(
                    weibo_id, row.get('comment_id', ''), row.get('comment_text', ''))
                if classification == "不當":
                    print("AI回復:", reply)
                    bad += 1
        if neg_sc >= pos_sc and bad > pos_sc / 2:
            self.send_mess()
        self.store_to_excel()
        time.sleep(cooldown)
if __name__ == '__main__':
    # The root logger defaults to WARNING, which hides the logging.info
    # progress messages emitted by this file; enable INFO for script runs.
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s %(levelname)s %(message)s")
    weibo_spider = WeiboSpider()
    weibo_spider.run()