Source code:
https://github.com/Wist-fully/Attack/tree/pc
pc_p1
Goals:
1. Enter the listing page and follow it to crawl every movie's detail page
2. Use regular expressions to extract the poster, name, categories, release date, score, and plot summary
3. Save the scraped content
Logic:
1. Iterate over all page numbers and build each listing URL
2. Get the URLs of the detail pages
3. Use regex to extract the detail page links from the listing pages
4. Extract the fields from each detail page with regex
5. Save the data + optimizations
First, import the required modules and configure logging.
#!/usr/bin/env python
import logging
import requests
import re
from urllib.parse import urljoin
import pymongo

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

BASE_URL = 'https://ssr1.scrape.center'
TOTAL_PAGE = 10
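With this configuration every log record is printed as "timestamp - level - message"; a typical line looks like the following (timestamp illustrative):

2024-05-01 10:00:00,123 - INFO - scraping https://ssr1.scrape.center/page/1...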
Core code
1. Iterate over all page numbers and build the URLs
# Scrape one listing page
def scrape_index(page):
    index_url = f'{BASE_URL}/page/{page}'
    return scrape_page(index_url)
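The f-string above simply splices the page number into the listing URL; for reference, this loop prints the first two URLs without making any request:

for page in range(1, 3):
    print(f'{BASE_URL}/page/{page}')
# https://ssr1.scrape.center/page/1
# https://ssr1.scrape.center/page/2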
2. Get the URLs of the detail pages
# Define a generic function that fetches a page's HTML
def scrape_page(url):
    logging.info("scraping %s.....", url)
    # Send a GET request
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        else:
            logging.error("got invalid status code %s while scraping %s",
                          response.status_code, url)
    except requests.RequestException:
        # If the request raises, log the error with the full traceback
        logging.error("error occurred while scraping %s", url, exc_info=True)
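As written, scrape_page will hang indefinitely on a stalled connection and never retries. A minimal hardening sketch, not part of the original code; the timeout value, retry count, and function name are my own choices:

import time

def scrape_page_with_retry(url, retries=3, timeout=10):
    # Try a few times with a short pause between attempts; return None on failure
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code == 200:
                return response.text
            logging.error("got invalid status code %s while scraping %s",
                          response.status_code, url)
        except requests.RequestException:
            logging.error("attempt %s/%s failed for %s",
                          attempt, retries, url, exc_info=True)
        time.sleep(1)
    return None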
3. Use regex to extract the detail page links from the listing page
def parse_index(html):
    # Pull the detail page links out with a regex
    pattern = re.compile('<a.*?href="(.*?)".*?class="name">')
    items = re.findall(pattern, html)
    if not items:
        return
    for item in items:
        # Convert each relative link to an absolute one
        detail_url = urljoin(BASE_URL, item)
        logging.info('found detail page, url %s', detail_url)
        yield detail_url

def main():
    for page in range(1, TOTAL_PAGE + 1):
        index_html = scrape_index(page)
        detail_urls = parse_index(index_html)
        logging.info('detail urls %s', list(detail_urls))

if __name__ == '__main__':
    main()
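To see what the pattern actually captures, here is a quick check against a handcrafted fragment of listing HTML (the href and attribute values are illustrative):

snippet = '<a data-v-7f856186 href="/detail/1" class="name"><h2>霸王別姬</h2></a>'
print(re.findall('<a.*?href="(.*?)".*?class="name">', snippet))
# ['/detail/1']  -> urljoin(BASE_URL, '/detail/1') then yields the absolute detail URL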
4. Regex extraction on the detail page
Analysis:
1. The poster is an img tag; grab its url
2. The categories are span tags inside two button elements
3. The release date is a span tag inside a div
4. The score is a p tag with class "score"
5. The plot summary is also a p tag, wrapped in a div with class "drama"
def parse_detail(html):
    # Match the poster URL
    cover_pattern = re.compile(
        'class="el-col.*?<img.*?src="(.*?)".*?class="cover">', re.S)
    # Match the movie name
    name_pattern = re.compile('<h2.*?>(.*?)</h2>')
    # Match the categories
    categories_pattern = re.compile(
        '<button.*?category.*?<span>(.*?)</span>.*?</button>', re.S)
    # Match the release date
    published_at_pattern = re.compile(r'(\d{4}-\d{2}-\d{2})\s?上映')
    # Match the plot summary
    drama_pattern = re.compile('<div.*?drama.*?>.*?<p.*?>(.*?)</p>', re.S)
    # Match the score
    score_pattern = re.compile('<p.*?score.*?>(.*?)</p>', re.S)

    cover_match = re.search(cover_pattern, html)
    cover = cover_match.group(1).strip() if cover_match else None
    name_match = re.search(name_pattern, html)
    name = name_match.group(1).strip() if name_match else None
    categories = re.findall(categories_pattern, html)
    published_at_match = re.search(published_at_pattern, html)
    published_at = published_at_match.group(1) if published_at_match else None
    drama_match = re.search(drama_pattern, html)
    drama = drama_match.group(1).strip() if drama_match else None
    score_match = re.search(score_pattern, html)
    score = float(score_match.group(1).strip()) if score_match else None

    return {
        'cover': cover,
        'name': name,
        'categories': categories,
        'published_at': published_at,
        'drama': drama,
        'score': score
    }
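Before wiring parse_detail into the crawl, it can be sanity-checked against a handcrafted fragment that mimics the detail page's structure; every field value below is made up for illustration:

sample = '''
<div class="el-col">
  <img src="https://example.com/poster.jpg" class="cover">
</div>
<h2 class="m-b-sm">示例電影</h2>
<button class="category"><span>劇情</span></button>
<button class="category"><span>愛情</span></button>
<div class="info"><span>1993-07-26 上映</span></div>
<div class="drama"><p>一段示例劇情簡介。</p></div>
<p class="score m-t-md">9.5</p>
'''
print(parse_detail(sample))
# {'cover': 'https://example.com/poster.jpg', 'name': '示例電影',
#  'categories': ['劇情', '愛情'], 'published_at': '1993-07-26',
#  'drama': '一段示例劇情簡介。', 'score': 9.5}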
5. Save the data + optimizations
#!/usr/bin/env python
import logging
import requests
import re
from urllib.parse import urljoin
import pymongo
import multiprocessing

mongo_client = pymongo.MongoClient("mongodb://192.168.6.6:27017/")
db = mongo_client["my_movies"]
collection = db["movies"]

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

BASE_URL = 'https://ssr1.scrape.center'
TOTAL_PAGE = 10

def save_data(data):
    collection.insert_one(data)
    logging.info("data saved to mongodb!")

# scrape_detail is called below but was never defined above;
# fetching a detail page is the same as fetching any other page
def scrape_detail(url):
    return scrape_page(url)

def main(page):
    index_html = scrape_index(page)
    detail_urls = parse_index(index_html)
    for detail_url in detail_urls:
        detail_html = scrape_detail(detail_url)
        data = parse_detail(detail_html)
        logging.info('get detail data %s', data)
        save_data(data=data)
        logging.info('data saved successfully')

def run_main(page):
    main(page)

if __name__ == '__main__':
    # Use one worker per CPU core
    num_process = multiprocessing.cpu_count()
    # Create the process pool
    pool = multiprocessing.Pool(num_process)
    # The pages to scrape
    pages_to_scrape = list(range(1, TOTAL_PAGE + 1))
    # Scrape the pages in parallel
    pool.map(run_main, pages_to_scrape)
    # Shut the pool down and wait for the workers to finish
    pool.close()
    pool.join()
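One caveat with this layout: the MongoClient is created in the parent process and then inherited by the forked workers, which pymongo documents as not fork-safe. A sketch of the usual fix, giving each worker its own client through the pool's initializer (the function and variable names here are my own):

import multiprocessing
import pymongo

collection = None  # set independently inside each worker process

def init_worker():
    # Runs once in every worker: create a per-process client
    global collection
    client = pymongo.MongoClient("mongodb://192.168.6.6:27017/")
    collection = client["my_movies"]["movies"]

if __name__ == '__main__':
    pool = multiprocessing.Pool(multiprocessing.cpu_count(),
                                initializer=init_worker)
    pool.map(run_main, list(range(1, TOTAL_PAGE + 1)))
    pool.close()
    pool.join()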