Table of Contents
Preface
1. requests Library Basics
1.1 Installing requests
1.2 Basic Import
2. HTTP Request Methods
2.1 GET Requests
2.2 POST Requests
2.3 Other HTTP Methods
3. Setting Request Headers
3.1 Setting the User-Agent
3.2 Common Request Headers
4. Response Handling
4.1 Accessing Response Content
4.2 Response Status Codes
4.3 Response Headers
5. Session Management
5.1 Session Objects
5.2 Cookie Handling
6. Proxy Configuration
6.1 HTTP Proxies
6.2 SOCKS Proxies
7. Timeouts
8. SSL Certificate Verification
9. File Upload and Download
9.1 File Upload
9.2 File Download
10. Exception Handling
11. Retry Mechanism
12. Practical Scraping Examples
12.1 Scraping Web Pages
12.2 Handling Form Logins
13. Performance Optimization
13.1 Connection Pooling
13.2 Concurrent Requests
14. Anti-Scraping Countermeasures
14.1 Random User-Agent
14.2 IP Proxy Pool
15. Best Practices
15.1 A Complete Scraper Framework
Summary
Preface
requests is one of the most popular HTTP libraries in Python. It greatly simplifies sending HTTP requests and is the go-to tool for web scraper development. This article walks through the requests features you need for scraping, along with practical tips.
1. requests Library Basics
1.1 Installing requests
pip install requests
1.2 Basic Import
import requests
2. HTTP Request Methods
2.1 GET Requests
import requests

# Basic GET request
response = requests.get('https://httpbin.org/get')
print(response.text)

# GET request with query string parameters
params = {'key1': 'value1', 'key2': 'value2'}
response = requests.get('https://httpbin.org/get', params=params)
print(response.url)  # the parameters are encoded into the URL
2.2 POST Requests
# Form-encoded POST data
data = {'username': 'admin', 'password': '123456'}
response = requests.post('https://httpbin.org/post', data=data)

# JSON body: the json parameter serializes the dict and sets the Content-Type header for you
import json
json_data = {'name': '張三', 'age': 25}
response = requests.post('https://httpbin.org/post', json=json_data)

# Equivalent manual approach: serialize yourself and set the header explicitly
response = requests.post(
    'https://httpbin.org/post',
    data=json.dumps(json_data),
    headers={'Content-Type': 'application/json'}
)
2.3 Other HTTP Methods
response = requests.put('https://httpbin.org/put', data={'key': 'value'})
response = requests.delete('https://httpbin.org/delete')
response = requests.head('https://httpbin.org/get')
response = requests.options('https://httpbin.org/get')
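requests also exposes a patch() helper for partial updates. A minimal sketch against httpbin's echo endpoint:

# PATCH request (partial update); httpbin echoes the submitted form data back
response = requests.patch('https://httpbin.org/patch', data={'key': 'new_value'})
print(response.json()['form'])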
3. Setting Request Headers
3.1 Setting the User-Agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
response = requests.get('https://httpbin.org/headers', headers=headers)
3.2 Common Request Headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Referer': 'https://www.google.com/'
}
4. Response Handling
4.1 Accessing Response Content
response = requests.get('https://httpbin.org/get')

print(response.text)     # decoded text
print(response.content)  # raw bytes

# Parse JSON only when the Content-Type indicates JSON (the header may also carry a charset suffix)
if 'application/json' in response.headers.get('Content-Type', ''):
    json_data = response.json()
    print(json_data)

# Inspect and override the encoding used to decode response.text
print(response.encoding)
response.encoding = 'utf-8'
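If the server does not declare a charset, requests may guess the encoding wrong; as a fallback you can use the encoding it detects from the body itself. A small sketch:

response = requests.get('https://httpbin.org/get')
# apparent_encoding is detected from the raw response body
response.encoding = response.apparent_encoding
print(response.text)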
4.2 Response Status Codes
response = requests.get('https://httpbin.org/get')
print(response.status_code)

if response.status_code == 200:
    print('Request succeeded')
else:
    print(f'Request failed, status code: {response.status_code}')

# raise_for_status() raises an HTTPError for 4xx/5xx responses
try:
    response.raise_for_status()
except requests.exceptions.HTTPError as e:
    print(f'HTTP error: {e}')
4.3 Response Headers
response = requests.get('https://httpbin.org/get')
print(response.headers)                           # case-insensitive dict of all headers
print(response.headers['Content-Type'])
print(response.headers.get('Server', 'unknown'))  # get() with a default avoids a KeyError
5. Session Management
5.1 Session Objects
session = requests.Session()
# Headers set on the session are sent with every request it makes
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
})
response = session.get('https://httpbin.org/get')

# Cookies set by the server persist across requests within the same session
response = session.get('https://httpbin.org/cookies/set/sessioncookie/123456789')
response = session.get('https://httpbin.org/cookies')
print(response.json())
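A Session can also be used as a context manager so its connection pool is released automatically; a minimal sketch:

with requests.Session() as session:
    session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'})
    response = session.get('https://httpbin.org/get')
    print(response.status_code)
# The session and its connections are closed when the block exits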
5.2 Cookie Handling
# Send cookies with a single request
cookies = {'session_id': 'abc123', 'user_id': '456'}
response = requests.get('https://httpbin.org/cookies', cookies=cookies)

# Read cookies that the server sets on the response
response = requests.get('https://httpbin.org/cookies/set/test/value')
print(response.cookies)

# A Session stores cookies automatically and reuses them on later requests
session = requests.Session()
response = session.get('https://httpbin.org/cookies/set/auto/managed')
response = session.get('https://httpbin.org/cookies')
print(response.json())
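If you need the cookie jar as a plain dict (for logging or persistence), requests ships a helper for the conversion; a small sketch:

import requests

session = requests.Session()
session.get('https://httpbin.org/cookies/set/auto/managed')
# Convert the CookieJar into an ordinary dict
cookie_dict = requests.utils.dict_from_cookiejar(session.cookies)
print(cookie_dict)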
6. Proxy Configuration
6.1 HTTP Proxies
proxies = {
    'http': 'http://proxy.example.com:8080',
    'https': 'https://proxy.example.com:8080'
}
response = requests.get('https://httpbin.org/ip', proxies=proxies)
6.2 SOCKS Proxies
proxies = {
    'http': 'socks5://127.0.0.1:1080',
    'https': 'socks5://127.0.0.1:1080'
}
response = requests.get('https://httpbin.org/ip', proxies=proxies)
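SOCKS support is not bundled with requests itself; it relies on the optional PySocks dependency. requests also honors the standard proxy environment variables by default (trust_env=True). A quick note:

# Install the optional SOCKS extra first:
#   pip install requests[socks]
#
# Alternatively, let requests pick up proxies from the environment:
#   export HTTP_PROXY="http://proxy.example.com:8080"
#   export HTTPS_PROXY="http://proxy.example.com:8080"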
7. Timeouts
# requests has no default timeout, so always set one to keep a request from hanging forever

# Separate connect and read timeouts: (connect_timeout, read_timeout)
try:
    response = requests.get('https://httpbin.org/delay/5', timeout=(3, 10))
except requests.exceptions.Timeout:
    print('Request timed out')

# A single number applies to both the connect and read phases
try:
    response = requests.get('https://httpbin.org/delay/5', timeout=5)
except requests.exceptions.Timeout:
    print('Request timed out')
8. SSL Certificate Verification
# Disable certificate verification (not recommended outside testing)
response = requests.get('https://httpbin.org/get', verify=False)

# Verify against a custom CA bundle
response = requests.get('https://httpbin.org/get', verify='/path/to/ca-bundle.crt')

# Present a client certificate and key
response = requests.get('https://httpbin.org/get', cert=('/path/to/client.cert', '/path/to/client.key'))
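With verify=False, urllib3 emits an InsecureRequestWarning on every request; if you have deliberately accepted that risk, you can silence it. A minimal sketch:

import requests
import urllib3

# Suppress the InsecureRequestWarning triggered by verify=False
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
response = requests.get('https://httpbin.org/get', verify=False)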
9. File Upload and Download
9.1 File Upload
# Upload a single file
with open('test.txt', 'rb') as f:
    files = {'file': f}
    response = requests.post('https://httpbin.org/post', files=files)

# Upload multiple files (remember to close the handles afterwards)
files = {
    'file1': open('file1.txt', 'rb'),
    'file2': open('file2.txt', 'rb')
}
response = requests.post('https://httpbin.org/post', files=files)
for file in files.values():
    file.close()
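The files mapping also accepts a tuple when you want to control the uploaded filename and content type; a small sketch (the file name is illustrative):

# (filename, file object, content type)
with open('report.csv', 'rb') as f:
    files = {'file': ('report.csv', f, 'text/csv')}
    response = requests.post('https://httpbin.org/post', files=files)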
9.2 File Download
# Small file: load the whole body into memory, then write it out
response = requests.get('https://httpbin.org/image/png')
with open('image.png', 'wb') as f:
    f.write(response.content)

# Large file: stream the body and write it in chunks
url = 'https://httpbin.org/stream-bytes/1024'
response = requests.get(url, stream=True)
with open('large_file.bin', 'wb') as f:
    for chunk in response.iter_content(chunk_size=8192):
        if chunk:
            f.write(chunk)
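When streaming, it is also worth treating the response itself as a context manager so the connection is released even if the download fails partway; a minimal sketch:

import requests

url = 'https://httpbin.org/stream-bytes/1024'
with requests.get(url, stream=True) as response, open('large_file.bin', 'wb') as f:
    for chunk in response.iter_content(chunk_size=8192):
        if chunk:
            f.write(chunk)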
10. Exception Handling
import requests
from requests.exceptions import RequestException, ConnectionError, Timeout, HTTPError

def safe_request(url, **kwargs):
    try:
        response = requests.get(url, **kwargs)
        response.raise_for_status()
        return response
    except ConnectionError:
        print('Connection error')
    except Timeout:
        print('Request timed out')
    except HTTPError as e:
        print(f'HTTP error: {e}')
    except RequestException as e:
        print(f'Request exception: {e}')
    return None

response = safe_request('https://httpbin.org/get', timeout=5)
if response:
    print(response.text)
11. Retry Mechanism
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def create_session_with_retry():
    session = requests.Session()
    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    # Mount the adapter for both HTTP and HTTPS URLs
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session

session = create_session_with_retry()
response = session.get('https://httpbin.org/status/500')
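Note that once the status-based retries are exhausted, requests typically raises a RetryError rather than returning the failing response, so it is worth catching; a small sketch under that assumption:

from requests.exceptions import RetryError

session = create_session_with_retry()
try:
    response = session.get('https://httpbin.org/status/500')
except RetryError as e:
    print(f'Gave up after retries: {e}')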
12. Practical Scraping Examples
12.1 Scraping Web Pages
import requests
from bs4 import BeautifulSoup
import time
import random

class WebScraper:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def get_page(self, url, **kwargs):
        try:
            response = self.session.get(url, timeout=10, **kwargs)
            response.raise_for_status()
            return response
        except Exception as e:
            print(f'Failed to fetch page: {e}')
            return None

    def parse_html(self, html_content):
        soup = BeautifulSoup(html_content, 'html.parser')
        return soup

    def crawl_with_delay(self, urls):
        results = []
        for url in urls:
            response = self.get_page(url)
            if response:
                results.append(response.text)
            # Random delay between requests to avoid hammering the server
            time.sleep(random.uniform(1, 3))
        return results

scraper = WebScraper()
urls = ['https://httpbin.org/get', 'https://httpbin.org/headers']
results = scraper.crawl_with_delay(urls)
12.2 Handling Form Logins
def login_example():
    session = requests.Session()
    # Fetch the login page first to obtain the CSRF token
    login_page = session.get('https://example.com/login')
    soup = BeautifulSoup(login_page.text, 'html.parser')
    csrf_token = soup.find('input', {'name': 'csrf_token'})['value']

    login_data = {
        'username': 'your_username',
        'password': 'your_password',
        'csrf_token': csrf_token
    }
    response = session.post('https://example.com/login', data=login_data)

    # The success check depends on the target site: a marker string or a redirect URL
    if 'Login successful' in response.text or response.url == 'https://example.com/dashboard':
        print('Login succeeded')
        return session
    else:
        print('Login failed')
        return None
13. Performance Optimization
13.1 Connection Pooling
import requests
from requests.adapters import HTTPAdapter

session = requests.Session()
adapter = HTTPAdapter(
    pool_connections=10,  # number of connection pools to cache (one per host)
    pool_maxsize=20,      # maximum connections kept per pool
    max_retries=3
)
session.mount('http://', adapter)
session.mount('https://', adapter)
13.2 Concurrent Requests
import concurrent.futures
import requests

def fetch_url(url):
    try:
        response = requests.get(url, timeout=5)
        return response.status_code, len(response.content)
    except Exception as e:
        return None, str(e)

urls = ['https://httpbin.org/get'] * 10

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    results = list(executor.map(fetch_url, urls))

for i, (status, size) in enumerate(results):
    print(f'URL {i}: Status={status}, Size={size}')
14. Anti-Scraping Countermeasures
14.1 Random User-Agent
import random

user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
]

def get_random_headers():
    return {
        'User-Agent': random.choice(user_agents),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
    }

response = requests.get('https://httpbin.org/headers', headers=get_random_headers())
14.2 IP Proxy Pool
import random
import requests

class ProxyPool:
    def __init__(self):
        self.proxies = [
            {'http': 'http://proxy1:8080', 'https': 'https://proxy1:8080'},
            {'http': 'http://proxy2:8080', 'https': 'https://proxy2:8080'},
        ]

    def get_random_proxy(self):
        return random.choice(self.proxies)

    def request_with_proxy(self, url, **kwargs):
        proxy = self.get_random_proxy()
        try:
            response = requests.get(url, proxies=proxy, timeout=10, **kwargs)
            return response
        except Exception as e:
            print(f'Proxy request failed: {e}')
            return None

proxy_pool = ProxyPool()
response = proxy_pool.request_with_proxy('https://httpbin.org/ip')
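In practice, proxies in a pool fail regularly, so it helps to verify them before use. A hypothetical check_proxy helper (the name and the httpbin endpoint are illustrative, not part of the original example):

import requests

def check_proxy(proxy, timeout=5):
    """Return True if the proxy can successfully reach httpbin's IP echo endpoint."""
    try:
        response = requests.get('https://httpbin.org/ip', proxies=proxy, timeout=timeout)
        return response.status_code == 200
    except requests.exceptions.RequestException:
        return False

# Example usage: keep only the proxies that currently work
# working = [p for p in proxy_pool.proxies if check_proxy(p)]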
15. Best Practices
15.1 A Complete Scraper Framework
import requests
import time
import random
import logging
from urllib.parse import urljoin

class AdvancedScraper:
    def __init__(self, base_url=None, delay_range=(1, 3)):
        self.base_url = base_url
        self.delay_range = delay_range
        self.session = self._create_session()
        self.logger = self._setup_logger()

    def _create_session(self):
        session = requests.Session()
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        return session

    def _setup_logger(self):
        logger = logging.getLogger(__name__)
        logger.setLevel(logging.INFO)
        handler = logging.StreamHandler()
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        return logger

    def get(self, url, **kwargs):
        # Resolve relative URLs against the base URL
        if self.base_url and not url.startswith('http'):
            url = urljoin(self.base_url, url)
        try:
            self.logger.info(f'Requesting URL: {url}')
            response = self.session.get(url, timeout=10, **kwargs)
            response.raise_for_status()
            # Random delay between requests
            delay = random.uniform(*self.delay_range)
            time.sleep(delay)
            return response
        except Exception as e:
            self.logger.error(f'Request failed: {url}, error: {e}')
            return None

    def close(self):
        self.session.close()

scraper = AdvancedScraper(base_url='https://httpbin.org')
response = scraper.get('/get')
if response:
    print(response.json())
scraper.close()
Summary
The requests library is a powerful tool for Python scraper development. The key points to master are:

Basics: be fluent with GET, POST and the other HTTP methods
Session management: use Session objects to manage cookies and connections
Exception handling: build in thorough error handling
Performance optimization: connection pooling, concurrent requests and similar techniques
Anti-scraping countermeasures: randomized headers, proxy pools, request delays
Best practices: structured code, logging, resource management

By applying these techniques sensibly, you can build stable and efficient web scrapers. Remember to respect each site's robots.txt and the relevant laws and regulations, and collect data responsibly.