大三上實訓內容

項目一：爬取天氣預報數據

【內容】

在中國天氣網(http://www.weather.com.cn)中輸入城市的名稱，例如輸入信陽，進入http://www.weather.com.cn/weather1d/101180601.shtml#input

的網頁顯示信陽的天氣預報，其中101180601是信陽的代碼，每個城市或者地區都有一個代碼。如下圖所示，請爬取河南所有城市15天的天氣預報數據。

1到7天代碼

import requests
from bs4 import BeautifulSoup
import csv

# Request headers that imitate a desktop Chrome browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate',
}

# weather.com.cn city codes to scrape, in output order.
city_list = [
    101180101, 101180901, 101180801, 101180301, 101180501, 101181101,
    101180201, 101181201, 101181501, 101180701, 101180601, 101181401,
    101181001, 101180401, 101181701, 101181601, 101181301,
]

# City code -> display name written to the CSV.
# NOTE(review): "濮陽" lacks the "市" suffix used by every other entry — confirm
# whether that is intentional before normalizing it.
city_name_dict = {
    101180101: "鄭州市",
    101180901: "洛陽市",
    101180801: "開封市",
    101180301: "新鄉市",
    101180501: "平頂山市",
    101181101: "焦作市",
    101180201: "安陽市",
    101181201: "鶴壁市",
    101181501: "漯河市",
    101180701: "南陽市",
    101180601: "信陽市",
    101181401: "周口市",
    101181001: "商丘市",
    101180401: "許昌市",
    101181701: "三門峽市",
    101181601: "駐馬店市",
    101181301: "濮陽",
}

# Seconds to wait for the server before giving up on one city.
REQUEST_TIMEOUT = 10


def build_7d_url(city_id):
    """Return the 7-day forecast page URL for *city_id*."""
    return f'http://www.weather.com.cn/weather/{city_id}.shtml'


def flatten_text(text):
    """Strip surrounding whitespace from *text* and remove all newlines."""
    return text.strip().replace('\n', '')


def fetch_7d_weather(city_id):
    """Download and parse the 7-day forecast for one city.

    Returns:
        A list with one flattened text entry per ``<li>`` found under
        ``<div id="7d">`` / ``<ul class="t clearfix">``; an empty list when
        the page does not contain that structure.

    Raises:
        requests.RequestException: on timeout, connection failure or a
            non-2xx HTTP status.
    """
    response = requests.get(
        headers=headers, url=build_7d_url(city_id), timeout=REQUEST_TIMEOUT
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.content.decode('utf-8'), 'html.parser')

    # find() returns None when the element is absent, so guard each step
    # instead of letting an AttributeError abort the whole run.
    v_div = soup.find('div', {'id': '7d'})
    weather_info = v_div.find('ul', {'class': 't clearfix'}) if v_div is not None else None
    if weather_info is None:
        return []
    return [flatten_text(li.text) for li in weather_info.find_all('li')]


def main():
    """Scrape every city in ``city_list`` and write one CSV row per city."""
    with open('河南地級市7天天氣情況.csv', 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['City ID', 'City Name', 'Weather Info'])

        for city_id in city_list:
            city_name = city_name_dict.get(city_id, "未知城市")
            print(f"City ID: {city_id}, City Name: {city_name}")
            try:
                weather_list = fetch_7d_weather(city_id)
            except requests.RequestException as exc:
                # One unreachable city should not discard the others.
                print(f"Skipping city {city_id}: {exc}")
                continue
            csv_writer.writerow([city_id, city_name, ', '.join(weather_list)])


if __name__ == '__main__':
    main()

8到15天的代碼

import requests
from bs4 import BeautifulSoup
import csv

# Request headers that imitate a desktop Chrome browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate',
}

# weather.com.cn city codes to scrape, in output order.
city_list = [
    101180101, 101180901, 101180801, 101180301, 101180501, 101181101,
    101180201, 101181201, 101181501, 101180701, 101180601, 101181401,
    101181001, 101180401, 101181701, 101181601, 101181301,
]

# City code -> display name written to the CSV.
# NOTE(review): "濮陽" lacks the "市" suffix used by every other entry — confirm
# whether that is intentional before normalizing it.
city_name_dict = {
    101180101: "鄭州市",
    101180901: "洛陽市",
    101180801: "開封市",
    101180301: "新鄉市",
    101180501: "平頂山市",
    101181101: "焦作市",
    101180201: "安陽市",
    101181201: "鶴壁市",
    101181501: "漯河市",
    101180701: "南陽市",
    101180601: "信陽市",
    101181401: "周口市",
    101181001: "商丘市",
    101180401: "許昌市",
    101181701: "三門峽市",
    101181601: "駐馬店市",
    101181301: "濮陽",
}

# Seconds to wait for the server before giving up on one city.
REQUEST_TIMEOUT = 10

# CSS classes of the <span> elements read from each forecast <li>, in the
# order expected by format_forecast().
FORECAST_SPAN_CLASSES = ('time', 'wea', 'tem', 'wind', 'wind1')


def build_15d_url(city_id):
    """Return the 8-15 day forecast page URL for *city_id*."""
    return f'http://www.weather.com.cn/weather15d/{city_id}.shtml'


def format_forecast(day, wea, tem, wind, wind1):
    """Render one day's forecast fields as the text stored in the CSV."""
    return f"時間：{day}，天氣：{wea}，溫度：{tem}，風向：{wind}，風力：{wind1}"


def span_text(li, css_class):
    """Return the text of ``<span class=css_class>`` inside *li*, or '' if absent."""
    span = li.find('span', {'class': css_class})
    return span.text if span is not None else ''


def fetch_15d_weather(city_id):
    """Download and parse the 8-15 day forecast for one city.

    Returns:
        A list with one formatted string per ``<li>`` found under
        ``<div id="15d">`` / ``<ul class="t clearfix">``; an empty list when
        the page does not contain that structure.

    Raises:
        requests.RequestException: on timeout, connection failure or a
            non-2xx HTTP status.
    """
    response = requests.get(
        headers=headers, url=build_15d_url(city_id), timeout=REQUEST_TIMEOUT
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.content.decode('utf-8'), 'html.parser')

    # find() returns None when the element is absent, so guard each step.
    v_div = soup.find('div', {'id': '15d'})
    weather_info = v_div.find('ul', {'class': 't clearfix'}) if v_div is not None else None
    if weather_info is None:
        return []
    return [
        format_forecast(*(span_text(li, css_class) for css_class in FORECAST_SPAN_CLASSES))
        for li in weather_info.find_all('li')
    ]


def main():
    """Scrape every city in ``city_list`` and write one CSV row per forecast day."""
    with open('河南地級市8-15天天氣情況.csv', 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['City ID', 'City Name', 'Weather Info'])

        for city_id in city_list:
            city_name = city_name_dict.get(city_id, "未知城市")
            print(f"City ID: {city_id}, City Name: {city_name}")
            try:
                forecasts = fetch_15d_weather(city_id)
            except requests.RequestException as exc:
                # One unreachable city should not discard the others.
                print(f"Skipping city {city_id}: {exc}")
                continue
            for forecast in forecasts:
                csv_writer.writerow([city_id, city_name, forecast])


if __name__ == '__main__':
    main()

1到15天代碼（合併1到7天與8到15天的結果）

import requests
from bs4 import BeautifulSoup
import csv

# Request headers that imitate a desktop Chrome browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate',
}

# weather.com.cn city codes to scrape, in output order.
city_list = [
    101180101, 101180901, 101180801, 101180301, 101180501, 101181101,
    101180201, 101181201, 101181501, 101180701, 101180601, 101181401,
    101181001, 101180401, 101181701, 101181601, 101181301,
]

# City code -> display name written to the CSV.
# NOTE(review): "濮陽" lacks the "市" suffix used by every other entry — confirm
# whether that is intentional before normalizing it.
city_name_dict = {
    101180101: "鄭州市",
    101180901: "洛陽市",
    101180801: "開封市",
    101180301: "新鄉市",
    101180501: "平頂山市",
    101181101: "焦作市",
    101180201: "安陽市",
    101181201: "鶴壁市",
    101181501: "漯河市",
    101180701: "南陽市",
    101180601: "信陽市",
    101181401: "周口市",
    101181001: "商丘市",
    101180401: "許昌市",
    101181701: "三門峽市",
    101181601: "駐馬店市",
    101181301: "濮陽",
}

# Seconds to wait for the server before giving up on one request.
REQUEST_TIMEOUT = 10

# CSS classes of the <span> elements read from each 8-15 day <li>, in the
# order expected by format_forecast().
FORECAST_SPAN_CLASSES = ('time', 'wea', 'tem', 'wind', 'wind1')


def build_7d_url(city_id):
    """Return the 1-7 day forecast page URL for *city_id*."""
    return f'http://www.weather.com.cn/weather/{city_id}.shtml'


def build_15d_url(city_id):
    """Return the 8-15 day forecast page URL for *city_id*."""
    return f'http://www.weather.com.cn/weather15d/{city_id}.shtml'


def flatten_text(text):
    """Strip surrounding whitespace from *text* and remove all newlines."""
    return text.strip().replace('\n', '')


def format_forecast(day, wea, tem, wind, wind1):
    """Render one 8-15 day forecast entry as the text stored in the CSV."""
    return f"時間：{day}，天氣：{wea}，溫度：{tem}，風向：{wind}，風力：{wind1}"


def span_text(li, css_class):
    """Return the text of ``<span class=css_class>`` inside *li*, or '' if absent."""
    span = li.find('span', {'class': css_class})
    return span.text if span is not None else ''


def fetch_forecast_items(url, div_id):
    """Return the ``<li>`` tags under ``<div id=div_id>`` / ``<ul class="t clearfix">``.

    Returns an empty list when the page lacks that structure.

    Raises:
        requests.RequestException: on timeout, connection failure or a
            non-2xx HTTP status.
    """
    response = requests.get(headers=headers, url=url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.content.decode('utf-8'), 'html.parser')

    # find() returns None when the element is absent, so guard each step.
    v_div = soup.find('div', {'id': div_id})
    weather_info = v_div.find('ul', {'class': 't clearfix'}) if v_div is not None else None
    if weather_info is None:
        return []
    return weather_info.find_all('li')


def fetch_city_weather(city_id):
    """Return the 1-7 day entries followed by the 8-15 day entries for one city."""
    weather_list_7d = [
        flatten_text(li.text)
        for li in fetch_forecast_items(build_7d_url(city_id), '7d')
    ]
    weather_list_15d = [
        format_forecast(*(span_text(li, css_class) for css_class in FORECAST_SPAN_CLASSES))
        for li in fetch_forecast_items(build_15d_url(city_id), '15d')
    ]
    return weather_list_7d + weather_list_15d


def main():
    """Scrape every city in ``city_list`` and write one combined CSV row per city."""
    with open('河南地級市1-15天天氣情況.csv', 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['City ID', 'City Name', 'Weather Info'])

        for city_id in city_list:
            city_name = city_name_dict.get(city_id, "未知城市")
            print(f"City ID: {city_id}, City Name: {city_name}")
            try:
                weather_list = fetch_city_weather(city_id)
            except requests.RequestException as exc:
                # One unreachable city should not discard the others.
                print(f"Skipping city {city_id}: {exc}")
                continue
            csv_writer.writerow([city_id, city_name, ', '.join(weather_list)])


if __name__ == '__main__':
    main()

項目二：爬取紅色旅游數據

【內容】

信陽是大別山革命根據地，紅色旅游資源非常豐富。請爬取 http://www.bytravel.cn/view/red/index441_list.html 網頁的紅色旅游景點，并在地圖上標注出來。

相關代碼

import requests  # 導入requests庫，用于發送HTTP請求
import csv  # 導入csv庫，用于處理CSV文件
from bs4 import BeautifulSoup  # HTML parser for the listing pages

# Request headers that imitate a desktop Chrome browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate',
}

# Seconds to wait for the server before giving up on one page.
REQUEST_TIMEOUT = 10


def page_urls():
    """Return the listing-page URLs: the first page, then suffixes 1 through 4."""
    urls = ['http://www.bytravel.cn/view/red/index441_list.html']
    urls.extend(
        f'http://www.bytravel.cn/view/red/index441_list{page}.html'
        for page in range(1, 5)
    )
    return urls


def parse_attraction(div):
    """Extract ``(title, intro, star, img_url)`` from one attraction ``<div>``.

    Each field falls back to a placeholder string when its element is missing.
    """
    title_element = div.find('a', {'class': 'blue14b'})
    title = title_element.text if title_element is not None else "未找到標題"

    intro_element = div.find('div', id='tctitletop102')
    if intro_element is not None:
        # Drop the trailing "[詳細]" link text that follows the summary.
        intro = intro_element.text.strip().replace("[詳細]", "")
    else:
        intro = "無簡介"

    star_element = div.find('font', {'class': 'f14'})
    star = star_element.text if star_element is not None else "無星級"

    img_element = div.find('img', {'class': 'hpic'})
    # .get() also covers an <img> tag that has no src attribute.
    img_url = img_element.get('src', "無圖片鏈接") if img_element is not None else "無圖片鏈接"

    return title, intro, star, img_url


def scrape_page(url, csv_writer):
    """Fetch one listing page, print each attraction and append it to the CSV.

    Raises:
        requests.RequestException: on timeout, connection failure or a
            non-2xx HTTP status.
    """
    response = requests.get(headers=headers, url=url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.content.decode('gbk'), 'html.parser')

    target_div = soup.find('div', {'style': 'margin:5px 10px 0 10px'})
    if target_div is None:
        # find() returns None when the container is absent.
        print(f"No attraction list found on {url}")
        return

    for div in target_div.find_all('div', {'style': 'margin:2px 10px 0 7px;padding:3px 0 0 0'}):
        title, intro, star, img_url = parse_attraction(div)
        print('景點名稱：', title)
        print('景點簡介：', intro)
        print('星級：', star)
        print('圖片鏈接：', img_url)
        csv_writer.writerow([title, intro, star, img_url])


def main():
    """Scrape all listing pages into ``信陽紅色景點.csv``."""
    # The with-block closes the file even if a page fails half-way through.
    with open('信陽紅色景點.csv', 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['景點名稱', '景點簡介', '星級', '圖片鏈接'])

        for url in page_urls():
            try:
                scrape_page(url, csv_writer)
            except requests.RequestException as exc:
                # One unreachable page should not discard the others.
                print(f"Skipping page {url}: {exc}")


if __name__ == '__main__':
    main()

項目三：豆瓣網爬取top250電影數據

【內容】

運用scrapy框架從豆瓣電影top250網站爬取全部上榜的電影信息，并將電影的名稱、評分、排名、一句影評、劇情簡介分別保存到 MySQL 和 MongoDB 數據庫里面。

douban.py

import scrapy  # 導入scrapy庫
from scrapy import Selector, Request  # 從scrapy庫中導入Selector和Request類
from scrapy.http import HtmlResponse  # 從scrapy庫中導入HtmlResponse類
from ..items import DoubanspidersItem  # item class declared in the project's items module


class DoubanSpider(scrapy.Spider):
    """Crawl the Douban Top 250 list pages and each movie's detail page.

    List pages supply name, score and rank; the detail page supplies one
    short comment and the plot summary. Requests are generated in
    ``start_requests`` rather than through ``start_urls``.
    """

    name = 'douban'
    allowed_domains = ['movie.douban.com']

    def start_requests(self):
        """Yield one request per list page (10 pages, 25 entries apart)."""
        for page in range(10):
            yield Request(url=f'https://movie.douban.com/top250?start={page * 25}&filt=')

    def parse(self, response: HtmlResponse, **kwargs):
        """Parse a list page and schedule a detail-page request per movie.

        The partially filled item travels to ``parse_movie_info`` through
        ``cb_kwargs``.
        """
        sel = Selector(response)
        list_items = sel.css('#content > div > div.article > ol > li')
        for list_item in list_items:
            detail_url = list_item.css('div.info > div.hd > a::attr(href)').extract_first()

            movie_item = DoubanspidersItem()
            movie_item['name'] = list_item.css('span.title::text').extract_first()
            movie_item['score'] = list_item.css('span.rating_num::text').extract_first()
            movie_item['top'] = list_item.css('div.pic em ::text').extract_first()

            if not detail_url:
                # extract_first() returns None when the link is missing; emit
                # the item with empty detail fields instead of building a
                # Request from a None URL.
                movie_item['comment'] = None
                movie_item['introduction'] = ''
                yield movie_item
                continue

            yield Request(
                url=detail_url,
                callback=self.parse_movie_info,
                cb_kwargs={'item': movie_item},
            )

    def parse_movie_info(self, response, **kwargs):
        """Fill in comment and introduction from a detail page, then yield the item."""
        movie_item = kwargs['item']
        sel = Selector(response)
        movie_item['comment'] = sel.css(
            'div.comment p.comment-content span.short::text'
        ).extract_first()
        # Apply the '' fallback before strip(): extract_first() returns None
        # when the summary is absent, and None has no strip().
        summary = sel.css('span[property="v:summary"]::text').extract_first()
        movie_item['introduction'] = (summary or '').strip()
        yield movie_item

items.py


import scrapy


class DoubanspidersItem(scrapy.Item):
    """Container for one Douban Top 250 movie record."""

    # Rank on the Top 250 list.
    top = scrapy.Field()
    # Movie title.
    name = scrapy.Field()
    # Rating shown on the list page.
    score = scrapy.Field()
    # Plot summary taken from the detail page.
    introduction = scrapy.Field()
    # One short comment taken from the detail page.
    comment = scrapy.Field()

pipelines.py

from itemadapter import ItemAdapter
import openpyxl
import pymysql
from pymongo import MongoClient


class DoubanspidersPipeline:
    """Buffer scraped items and bulk-insert them into MySQL table ``doubantop250``."""

    # Number of buffered rows that triggers a bulk insert.
    BATCH_SIZE = 100

    def __init__(self):
        # NOTE(review): the database password is hard-coded in source; move it
        # to project settings or an environment variable before sharing this.
        self.conn = pymysql.connect(
            host='localhost',
            port=3306,
            user='root',
            password='789456MLq',
            db='sx_douban250',
            charset='utf8mb4',
        )
        self.cursor = self.conn.cursor()
        self.data = []

    def close_spider(self, spider):
        """Flush any remaining rows, then close the connection."""
        if self.data:
            self._write_to_db()
        self.conn.close()

    def process_item(self, item, spider):
        """Queue *item* as a row and flush once a full batch has accumulated."""
        self.data.append((
            item['top'],
            item['name'],
            item['score'],
            item['introduction'],
            item['comment'],
        ))
        if len(self.data) >= self.BATCH_SIZE:
            self._write_to_db()
        return item

    def _write_to_db(self):
        """Insert all buffered rows in one statement, commit, and empty the buffer."""
        self.cursor.executemany(
            'insert into doubantop250 (top,name,score,introduction,comment)'
            'values (%s,%s,%s,%s,%s)',
            self.data,
        )
        self.conn.commit()
        self.data.clear()

    # The method was originally defined as ``_writer_to_db`` while
    # close_spider() called ``_write_to_db``, so the final partial batch raised
    # AttributeError. Keep the old spelling as an alias.
    _writer_to_db = _write_to_db


class MyMongoDBPipeline:
    """Buffer scraped items and bulk-insert them into MongoDB collection ``doubantop250``."""

    # Number of buffered documents that triggers a bulk insert.
    BATCH_SIZE = 100

    def __init__(self):
        self.client = MongoClient('mongodb://localhost:27017/')
        self.db = self.client['sx_douban250']
        self.collection = self.db['doubantop250']
        self.data = []

    def close_spider(self, spider):
        """Flush any remaining documents, then close the client."""
        if self.data:
            self._write_to_db()
        self.client.close()

    def process_item(self, item, spider):
        """Queue *item* as a document and flush once a full batch has accumulated."""
        self.data.append({
            'top': item['top'],
            'name': item['name'],
            'score': item['score'],
            'introduction': item['introduction'],
            'comment': item['comment'],
        })
        if len(self.data) >= self.BATCH_SIZE:
            self._write_to_db()
        return item

    def _write_to_db(self):
        """Insert all buffered documents and empty the buffer."""
        self.collection.insert_many(self.data)
        self.data.clear()


class ExcelPipeline:
    """Append every scraped item to a worksheet and save it when the spider closes."""

    def __init__(self):
        self.wb = openpyxl.Workbook()
        self.ws = self.wb.active
        self.ws.title = 'Top250'
        # Header order matches the row written by process_item():
        # top, name, score, introduction, comment. The original header had
        # the score and title columns swapped.
        self.ws.append(('排名', '主題', '評分', '簡介', '評論'))

    def open_spider(self, spider):
        """No setup is needed beyond ``__init__``."""
        pass

    def close_spider(self, spider):
        """Write the workbook to ``豆瓣Top250.xlsx``."""
        self.wb.save('豆瓣Top250.xlsx')

    def process_item(self, item, spider):
        """Append one row for *item* and pass it on unchanged."""
        self.ws.append((
            item['top'],
            item['name'],
            item['score'],
            item['introduction'],
            item['comment'],
        ))
        return item