Python 實現的采集諸葛靈簽
項目介紹
這是一個基于 Python 開發的諸葛靈簽數據采集和展示項目。通過爬蟲技術獲取諸葛神簽的簽文和解簽內容,并提供數據存儲和查詢功能。
項目結構
zhuge/
├── zhuge_scraper.py        # 爬蟲主程序
├── zhuge_pages/            # 數據存儲目錄
│   ├── all_signs.json      # 匯總數據
│   └── zhuge_sign_*.json   # 單個簽文數據
└── zhuge.md                # 項目說明文檔
功能特點
- 支持批量爬取 384 個諸葛神簽
- 自動將中文數字轉換為阿拉伯數字
- 數據以 JSON 格式保存
- 實現斷點續傳和錯誤重試
- 智能延時,避免請求過頻
- 雙重保存機制(單獨文件 + 匯總文件)
技術棧
- Python 3.x
- requests:網絡請求
- BeautifulSoup4:HTML 解析
- json:數據序列化
- re:正則表達式處理
核心功能模塊
1. 中文數字轉換
實現了將"三百八十四"等中文數字轉換為阿拉伯數字的功能:
def chinese_to_arabic(chinese_num):
    # 將中文數字(如"三百八十四")轉換為阿拉伯數字(384)
2. 頁面解析
解析網頁內容,提取簽號、簽文和解簽信息:
def parse_zhuge_page(soup):
    # 解析頁面內容,返回包含簽號、簽文和解簽的字典
3. 數據爬取
處理單個頁面的爬取和數據保存:
def scrape_zhuge_page(url):
    # 爬取單個頁面并保存數據
4. 批量處理
控制批量爬取流程和請求頻率:
def scrape_zhuge_range(start=1, end=384):
    # 批量爬取指定范圍的簽文
數據存儲結構
數據以 JSON 格式存儲,包含以下字段:
{
  "sign_number": "簽號",
  "sign_text": "簽文內容",
  "interpretation": "解簽詳解"
}
使用說明
環境準備
pip install requests beautifulsoup4
運行方式
python zhuge_scraper.py
數據輸出
- 單個簽文:zhuge_pages/zhuge_sign_[編號].json
- 匯總文件:zhuge_pages/all_signs.json
注意事項
- 請合理控制爬取頻率
- 建議使用代理池輪換 IP
- 數據僅供學習研究使用
- 注意網站反爬蟲機制
后續優化計劃
- 添加代理池支持
- 優化中文數字轉換算法
- 添加數據驗證機制
- 實現更完善的錯誤處理
- 添加日志記錄系統
項目源碼
import requests
from bs4 import BeautifulSoup
import os
import time
import random
import re
import json


def chinese_to_arabic(chinese_num):
    """Convert a Chinese numeral string (e.g. "三百八十四") to an int (384).

    Supports the digits 零-九 plus the multipliers 十 (10) and 百 (100),
    which covers the full sign range 1-384. Characters outside that set
    are skipped (the original crashed with TypeError on them).
    """
    cn_num = {
        '零': 0, '一': 1, '二': 2, '三': 3, '四': 4,
        '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
        '十': 10, '百': 100,
    }
    temp_sum = 0  # accumulated value of completed 十/百 groups
    temp_num = 0  # digit waiting for a multiplier (or the final ones digit)
    for char in chinese_num:
        curr_num = cn_num.get(char)
        if curr_num is None:
            # Ignore non-numeral characters instead of crashing.
            continue
        if curr_num == 100:  # 百
            # A bare 百 means 1 * 100 (e.g. "百二十").
            temp_sum += (temp_num if temp_num > 0 else 1) * curr_num
            temp_num = 0
        elif curr_num == 10:  # 十
            # A bare 十 means 1 * 10 (e.g. "十三").
            temp_sum += (temp_num if temp_num > 0 else 1) * curr_num
            temp_num = 0
        else:  # units digit
            temp_num = curr_num
    return temp_sum + temp_num


def parse_zhuge_page(soup):
    """Parse a Zhuge divination page and extract key information.

    Args:
        soup: BeautifulSoup document of one sign page.

    Returns:
        dict with 'sign_number' (str or None), 'sign_text' (str or None)
        and 'interpretation' (str, possibly empty). Every lookup is
        guarded so a malformed page degrades to None/empty instead of
        raising AttributeError/IndexError as the original did.
    """
    # Sign number comes from a <dt> like "諸葛測算第三百八十四簽結果".
    sign_number = None
    sign_number_elem = soup.find(
        'dt', text=re.compile(r'諸葛測算第[零一二三四五六七八九十百]+簽結果'))
    if sign_number_elem:
        match = re.search(r'第([零一二三四五六七八九十百]+)簽',
                          sign_number_elem.text)
        if match:
            sign_number = str(chinese_to_arabic(match.group(1)))

    # The sign text lives in the first <dd>'s <em>.
    sign_text = None
    first_dd = soup.find('dd')
    if first_dd:
        sign_text_elem = first_dd.find('em')
        if sign_text_elem:
            sign_text = sign_text_elem.text.strip()

    # The detailed interpretation is every <p> inside the second <dd>.
    interpretation = ''
    dd_elems = soup.find_all('dd')
    if len(dd_elems) > 1:
        interpretation = '\n'.join(
            p.text.strip() for p in dd_elems[1].find_all('p'))

    return {
        'sign_number': sign_number,
        'sign_text': sign_text,
        'interpretation': interpretation,
    }


def scrape_zhuge_page(url):
    """Fetch one sign page, parse it and persist the data.

    Writes an individual zhuge_pages/zhuge_sign_<n>.json file and keeps
    zhuge_pages/all_signs.json up to date (replacing any earlier entry
    for the same sign so re-runs do not accumulate duplicates).

    Returns:
        The parsed dict on success, or None when all retries fail.
    """
    try:
        # Browser-like headers to reduce the chance of being blocked.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Referer': 'https://www.chazidian.com/',
            'Upgrade-Insecure-Requests': '1'
        }

        # Retry transient failures with a fixed delay; the final failure
        # is re-raised and handled by the outer except below.
        max_retries = 3
        retry_delay = 5
        for attempt in range(max_retries):
            try:
                response = requests.get(url, headers=headers, timeout=15)
                response.raise_for_status()
                break
            except requests.RequestException:
                if attempt == max_retries - 1:
                    raise
                print(f"Attempt {attempt + 1} failed, retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)

        soup = BeautifulSoup(response.text, 'html.parser')
        page_data = parse_zhuge_page(soup)

        os.makedirs('zhuge_pages', exist_ok=True)

        # Maintain the aggregate file: load, drop any stale entry for
        # this sign, append the fresh one, write back.
        all_data_file = 'zhuge_pages/all_signs.json'
        existing_data = []
        if os.path.exists(all_data_file):
            with open(all_data_file, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
        existing_data = [
            entry for entry in existing_data
            if entry.get('sign_number') != page_data['sign_number']
        ]
        existing_data.append(page_data)
        with open(all_data_file, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=2)

        # Also save the sign as its own file (original behavior).
        filename = f'zhuge_pages/zhuge_sign_{page_data["sign_number"]}.json'
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(page_data, f, ensure_ascii=False, indent=2)

        print(f"Successfully scraped and saved {url}")
        return page_data
    except requests.RequestException as e:
        print(f"Error scraping {url}: {e}")
        return None


def scrape_zhuge_range(start=1, end=384):
    """Scrape a range of Zhuge divination pages.

    Args:
        start (int): Starting page number.
        end (int): Ending page number (inclusive).
    """
    for page_num in range(start, end + 1):
        url = f'https://www.chazidian.com/zhuge{page_num}/'
        print(f"Scraping page {page_num}...")

        page_data = scrape_zhuge_page(url)
        if page_data:
            # Random delay to be nice to the server.
            time.sleep(random.uniform(0.5, 2))

        # Longer pause every 50 pages to spread the load.
        if page_num % 50 == 0:
            print(f"Paused at page {page_num}. Waiting a bit...")
            time.sleep(random.uniform(3, 7))


def main():
    """Entry point: scrape all 384 signs and report the outcome."""
    try:
        scrape_zhuge_range(1, 384)
        print("Scraping completed successfully!")
    except Exception as e:
        print(f"An error occurred during scraping: {e}")


if __name__ == '__main__':
    main()