目錄
Python實例題
題目
要求:
解題思路:
代碼實現:
Python實例題
題目
Web 爬蟲與數據可視化
要求:
- 編寫一個爬蟲,從豆瓣電影 Top 250 頁面(https://movie.douban.com/top250)抓取電影名稱、評分、導演、主演和上映年份。
- 將數據存儲到 SQLite 數據庫中。
- 基於數據庫數據,使用 Matplotlib 生成柱狀圖,展示評分最高的 10 部電影。
- 添加異常處理(如網絡請求失敗、解析錯誤、數據庫操作失敗等)。
解題思路:
- 使用 requests 和 BeautifulSoup 實現網頁爬取與解析。
- 使用 sqlite3 建立數據庫並存儲數據。
- 使用 matplotlib 繪製柱狀圖。
- 添加重試機制和異常捕獲。
代碼實現:
import requests
from bs4 import BeautifulSoup
import sqlite3
import matplotlib.pyplot as plt
from requests.exceptions import RequestException
import time


# Default SQLite file shared by the save and plot steps.
DB_PATH = 'douban_movies.db'

# Both the Traditional and the Simplified spelling of the "director" label
# are stripped.
# NOTE(review): the live Douban page presumably uses the Simplified form --
# confirm against a real response.
_DIRECTOR_LABELS = ('導演:', '导演:')
_ACTORS_LABEL = '主演:'


def fetch_movie_data(url, max_retries=3, retry_delay=2, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Page address to request.
        max_retries: Maximum number of attempts before giving up.
        retry_delay: Seconds to sleep between failed attempts.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        The response text, or ``None`` when every attempt failed (or when
        ``max_retries`` is not positive, in which case no request is made).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            return response.text
        except RequestException as e:
            print(f"請求失敗 ({attempt+1}/{max_retries}): {e}")
            # Only wait when another attempt is still going to happen.
            if attempt < max_retries - 1:
                time.sleep(retry_delay)
    return None


def parse_info_text(info):
    """Split the text of a movie's info paragraph into its fields.

    The first non-empty line is expected to hold the director label followed
    by an optional ``主演:`` (actors) segment; the last non-empty line is
    expected to hold ``year / country / genre``.

    Args:
        info: Raw text of the info paragraph.

    Returns:
        A ``(director, actors, year)`` tuple of stripped strings. ``actors``
        is ``''`` when the actors label is absent; ``year`` is ``''`` when
        the text has only one non-empty line.

    Raises:
        ValueError: If *info* contains no non-empty line.
    """
    lines = [line.strip() for line in info.splitlines()]
    lines = [line for line in lines if line]
    if not lines:
        raise ValueError("empty movie info text")

    # Partition on the actors label first so the actors segment is kept;
    # str.strip() also removes the non-breaking spaces used as separators.
    director_part, _, actors_part = lines[0].partition(_ACTORS_LABEL)
    for label in _DIRECTOR_LABELS:
        director_part = director_part.replace(label, '')

    year = lines[-1].split('/')[0].strip() if len(lines) > 1 else ''
    return director_part.strip(), actors_part.strip(), year


def parse_movie_data(html_content):
    """Extract movie records from one listing page.

    Args:
        html_content: HTML text of the page; falsy input yields ``[]``.

    Returns:
        A list of dicts with the keys ``title``, ``rating``, ``director``,
        ``actors`` and ``year``. Items that fail to parse are reported on
        stdout and skipped.
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    movies = []
    for item in soup.select('div.item'):
        try:
            title = item.select_one('span.title').text.strip()
            rating = float(item.select_one('span.rating_num').text)
            director, actors, year = parse_info_text(
                item.select_one('div.bd p').text
            )
        except (AttributeError, ValueError, IndexError) as e:
            # A missing element surfaces as AttributeError on ``None``.
            print(f"解析錯誤: {e}")
            continue
        movies.append({
            'title': title,
            'rating': rating,
            'director': director,
            'actors': actors,
            'year': year,
        })
    return movies


def save_to_database(movies, db_path=DB_PATH):
    """Append *movies* to the ``movies`` table, creating it when missing.

    All rows are inserted in a single transaction that is rolled back when
    SQLite reports an error; the error is printed rather than raised.

    Args:
        movies: Iterable of dicts as produced by ``parse_movie_data``.
        db_path: Path of the SQLite database file.
    """
    try:
        conn = sqlite3.connect(db_path)
    except sqlite3.Error as e:
        print(f"數據庫錯誤: {e}")
        return

    try:
        c = conn.cursor()
        c.execute(
            '''CREATE TABLE IF NOT EXISTS movies
               (id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT NOT NULL,
                rating REAL NOT NULL,
                director TEXT,
                actors TEXT,
                year TEXT)'''
        )
        c.executemany(
            '''INSERT INTO movies (title, rating, director, actors, year)
               VALUES (?, ?, ?, ?, ?)''',
            [
                (m['title'], m['rating'], m['director'], m['actors'], m['year'])
                for m in movies
            ],
        )
        conn.commit()
    except sqlite3.Error as e:
        print(f"數據庫錯誤: {e}")
        conn.rollback()
    finally:
        conn.close()


def query_top_movies(db_path=DB_PATH, limit=10):
    """Return the highest-rated movies as ``(title, rating)`` tuples.

    Rows are grouped by title so that data saved by repeated runs does not
    produce duplicate entries. Ties on rating keep insertion order.

    Args:
        db_path: Path of the SQLite database file.
        limit: Maximum number of movies to return.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.execute(
            '''SELECT title, MAX(rating) FROM movies
               GROUP BY title
               ORDER BY MAX(rating) DESC, MIN(id) ASC
               LIMIT ?''',
            (limit,),
        )
        return cursor.fetchall()
    finally:
        conn.close()


def plot_top_movies(db_path=DB_PATH):
    """Draw a horizontal bar chart of the ten best-rated stored movies.

    The chart is written to ``top_movies.png`` and then shown. Database
    errors and an empty table are reported on stdout.

    Args:
        db_path: Path of the SQLite database file.
    """
    try:
        top_movies = query_top_movies(db_path, 10)
    except sqlite3.Error as e:
        print(f"數據庫錯誤: {e}")
        return

    if not top_movies:
        print("數據庫中沒有電影數據")
        return

    titles, ratings = zip(*top_movies)

    # Prefer fonts with CJK glyphs for the Chinese labels while keeping the
    # configured fonts as a fallback.
    # NOTE(review): which of these fonts exist depends on the machine --
    # confirm on the target system.
    cjk_fonts = ['SimHei', 'Microsoft YaHei', 'PingFang SC',
                 'Noto Sans CJK SC', 'Arial Unicode MS']
    fallback = [f for f in plt.rcParams['font.sans-serif'] if f not in cjk_fonts]
    plt.rcParams['font.sans-serif'] = cjk_fonts + fallback
    plt.rcParams['axes.unicode_minus'] = False

    plt.figure(figsize=(10, 6))
    plt.barh(titles, ratings, color='skyblue')
    # barh places the first entry at the bottom; flip so the best is on top.
    plt.gca().invert_yaxis()
    plt.xlabel('評分')
    plt.ylabel('電影名稱')
    plt.title('豆瓣電影評分Top10')
    plt.tight_layout()
    plt.savefig('top_movies.png')
    plt.show()


def main():
    """Scrape all ten listing pages, store the results and plot the top ten."""
    all_movies = []
    for start in range(0, 250, 25):
        url = f"https://movie.douban.com/top250?start={start}"
        html = fetch_movie_data(url)
        all_movies.extend(parse_movie_data(html))
        # Report what was actually collected, not the page offset.
        print(f"已抓取 {len(all_movies)}/250 部電影")
        time.sleep(1)  # throttle requests to avoid hammering the site

    if all_movies:
        save_to_database(all_movies)
        plot_top_movies()
    else:
        print("未抓取到任何電影數據")


if __name__ == "__main__":
    main()