Basic version
After fetching the page, this version simply prints the raw front-end HTML without any processing.
# pip3 install requests
import requests

# request the target URL
def crawler():
    response = requests.get("https://www.scrapingcourse.com/ecommerce/")
    response.raise_for_status()
    print(response.text)

# execute the crawler
crawler()
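If you would rather inspect the markup offline than scroll through it in the terminal, a minimal variation of the same script can write the response to disk instead of printing it. This is just a sketch; the page.html filename is an assumption for illustration.

# pip3 install requests
import requests

def crawler():
    response = requests.get("https://www.scrapingcourse.com/ecommerce/")
    response.raise_for_status()
    # save the raw HTML to a local file instead of printing it
    with open("page.html", "w", encoding="utf-8") as f:
        f.write(response.text)

crawler()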
An infinitely growing crawler
Starting from the first link, keep a record of every link that has already been visited;
from the HTML fetched at each link, collect the a[href] links and add them to the list of links still to visit;
links that have already been crawled are simply skipped with continue. A minimal sketch of this queue-plus-set pattern follows; the full script comes after it.
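The sketch below isolates just that pattern: a queue of links still to visit plus a set of links already seen. The fetch_links helper is a hypothetical stand-in for the request and a[href] extraction that the full script performs.

# minimal sketch of the queue-plus-set crawl pattern (illustrative only)
def fetch_links(url):
    # hypothetical placeholder for the request + a[href] extraction shown in the full script
    return []

urls_to_visit = ["https://www.scrapingcourse.com/ecommerce/"]  # queue of links still to crawl
visited_urls = set()                                           # links already crawled

while urls_to_visit:
    current_url = urls_to_visit.pop(0)
    if current_url in visited_urls:
        continue  # already crawled: skip it
    visited_urls.add(current_url)
    for link in fetch_links(current_url):
        if link not in visited_urls and link not in urls_to_visit:
            urls_to_visit.append(link)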
# pip3 install requests beautifulsoup4
import requests
from bs4 import BeautifulSoup

target_url = "https://www.scrapingcourse.com/ecommerce/"

def crawler():
    while urls_to_visit:
        # get the page to visit from the list
        current_url = urls_to_visit.pop(0)
        print(current_url)
        if current_url in visited_urls:
            continue
        # record the URL as visited
        visited_urls.add(current_url)
        try:
            response = requests.get(current_url, timeout=5)  # set a timeout to avoid hanging forever
            response.raise_for_status()  # check that the request succeeded
        except requests.RequestException as e:
            print(f"Request failed: {current_url}, error: {e}")
            continue
        # parse the HTML
        soup = BeautifulSoup(response.text, "html.parser")
        # collect all the links
        link_elements = soup.select("a[href]")
        for link_element in link_elements:
            url = link_element["href"]
            if url.startswith("#"):
                continue  # ignore in-page anchors
            # convert relative links to absolute URLs
            if not url.startswith("http"):
                absolute_url = requests.compat.urljoin(target_url, url)
            else:
                absolute_url = url
            # ensure the link belongs to the target domain and hasn't been queued yet
            if (
                absolute_url.startswith(target_url)
                and absolute_url not in urls_to_visit
            ):
                urls_to_visit.append(absolute_url)
# initialize the list of discovered URLs
urls_to_visit = [target_url]
visited_urls = set()  # record visited URLs to avoid crawling the same page twice
# execute the crawler
crawler()
The effect of the infinite growth
Some links fail to fetch, and the crawler prints an error message for them instead of stopping.
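Because every crawled page keeps adding new links to the queue, the run can grow without bound. A minimal sketch of one way to rein it in, assuming a hypothetical max_pages cap and a failed_urls list (neither is part of the script above), could look like this:

# pip3 install requests beautifulsoup4
import requests
from bs4 import BeautifulSoup

target_url = "https://www.scrapingcourse.com/ecommerce/"
urls_to_visit = [target_url]
visited_urls = set()
failed_urls = []   # hypothetical: collect links whose requests failed
max_pages = 50     # hypothetical: stop after this many pages

while urls_to_visit and len(visited_urls) < max_pages:
    current_url = urls_to_visit.pop(0)
    if current_url in visited_urls:
        continue
    visited_urls.add(current_url)
    try:
        response = requests.get(current_url, timeout=5)
        response.raise_for_status()
    except requests.RequestException:
        failed_urls.append(current_url)  # remember the failure instead of just printing it
        continue
    soup = BeautifulSoup(response.text, "html.parser")
    for link_element in soup.select("a[href]"):
        href = link_element["href"]
        if href.startswith("#"):
            continue  # skip in-page anchors
        absolute_url = requests.compat.urljoin(current_url, href)
        if absolute_url.startswith(target_url) and absolute_url not in urls_to_visit:
            urls_to_visit.append(absolute_url)

print(f"crawled {len(visited_urls)} pages, {len(failed_urls)} failed")

With the cap in place the crawl terminates even though every page keeps discovering new links, and failed_urls records which requests raised errors.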