**************************************************************************************************************
author:keyinfo
date:2025-09-09 23:50
title:Web scraping: clicking through multi-level heading content
**************************************************************************************************************
(At this point we can do an initial crawl of a page's multi-level links and write them to a file; the next steps are bypassing anti-scraping measures and refining the content filtering.)
Questions asked to the AI:
Q: What causes the "TypeError: 'dict' object is not callable" error?
Scenario 1: Accessing a dictionary with parentheses () instead of brackets (the most common cause)
Scenario 2: A variable name shadows the built-in dict type
Scenario 3: Calling a dictionary method incorrectly (missing or extra parentheses)
Scenario 4: Confusing a function or variable name with a dictionary
Summary: how to avoid this error? (The scenarios are illustrated in the sketch after this list.)
Use [] rather than () to access dictionary keys: my_dict[key] or my_dict.get(key).
Do not use dict as a variable name, so you never shadow the built-in type.
Distinguish objects from method calls: a method call needs () (e.g. my_dict.keys()), but the object itself (a dict, a list) must never be called with ().
Check variable types: if you are unsure what a variable holds, print(type(my_var)) confirms it, so you never call a non-function object as a function.
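A minimal sketch of the scenarios and their fixes; the names config, value and keys are made up for illustration:

config = {"timeout": 5}

# Scenario 1: parentheses instead of brackets
# value = config("timeout")    # TypeError: 'dict' object is not callable
value = config["timeout"]      # correct: use [] or config.get("timeout")

# Scenario 2: shadowing the built-in dict type
# dict = {"a": 1}
# other = dict(b=2)            # now raises TypeError, dict is no longer the type

# Scenario 3: a method call needs (), the object itself does not
keys = config.keys()           # correct: method call
# bad = config()               # wrong: calling the dict object itself

# Scenario 4: when unsure what a name holds, check its type first
print(type(config))            # <class 'dict'>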
The source code was tuned by the AI, which also added the comments.
Core code:
Code segment 1:
# Extract the href link from every <a> tag
for link in soup.find_all("a"):
    href = link.get("href")
    # Skip invalid links
    if not href or href.startswith(("#", "javascript:", "mailto:")):
        continue
    # Build the full URL
    full_url = f"{url.rstrip('/')}/{href.lstrip('/')}"
    collected_links.append(full_url)
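The f-string above treats every href as relative to the site root. A sketch of a more robust alternative (not the code used in this article) is the standard library's urllib.parse.urljoin, which could replace the f-string line inside the loop and also handles absolute hrefs and paths relative to the current page:

from urllib.parse import urljoin

# Resolve href against the page URL, whether href is absolute
# ("https://..."), root-relative ("/a/b") or page-relative ("b.html")
full_url = urljoin(url, href)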
Code segment 2:
def save_links_to_file(links, file_path, buffer_size=1000):
    seen = set()
    buffer = []
    # Read existing links to avoid duplicate writes
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            seen.update(line.strip() for line in f)
    except FileNotFoundError:
        pass
    # Process each link
    for link in links:
        if link in seen:
            continue
        seen.add(link)
        buffer.append(f"{link}\n")
        # Flush the buffer to the file when it fills up
        if len(buffer) >= buffer_size:
            with open(file_path, "a", encoding="utf-8") as f:
                f.writelines(buffer)
            buffer = []
    # Write any remaining links
    if buffer:
        with open(file_path, "a", encoding="utf-8") as f:
            f.writelines(buffer)
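A quick usage sketch (the file path and links below are made up): because the function first loads the existing file into seen, running it a second time with the same links appends nothing new.

links = [
    "https://example.com/",
    "https://example.com/page1",
    "https://example.com/page1",   # duplicate, written only once
]
save_links_to_file(links, "./links.txt")
save_links_to_file(links, "./links.txt")   # second call finds every link in seen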
Complete code:
import time
from urllib.request import urlopen, Request
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from urllib.error import URLError, HTTPError

# Record the program start time
start_time = time.perf_counter()


def fetch_url(url, headers, collected_links):
    try:
        # Build the request object
        req = Request(url, headers=headers)
        # Send the request and read the response
        with urlopen(req) as response:
            content = response.read()
            status_code = response.getcode()
        # Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(content, "lxml")
        # Extract the href link from every <a> tag
        for link in soup.find_all("a"):
            href = link.get("href")
            # Skip invalid links
            if not href or href.startswith(("#", "javascript:", "mailto:")):
                continue
            # Build the full URL
            full_url = f"{url.rstrip('/')}/{href.lstrip('/')}"
            collected_links.append(full_url)
        # Print success info
        print(f"URL: {url}")
        print(f"Status: {status_code}")
        print(f"Description: {'success' if status_code == 200 else 'unknown'}")
    except HTTPError as e:
        print(f"HTTP error: URL={url}, status code={e.code}, reason={e.reason}")
    except URLError as e:
        print(f"URL error: URL={url}, reason={e.reason}")
    except Exception as e:
        print(f"Request failed: {e}")
    return collected_links


def extract_base_url(url):
    parsed = urlparse(url)
    base_url = f"{parsed.scheme}://{parsed.netloc}"
    return base_url


def save_links_to_file(links, file_path, buffer_size=1000):
    seen = set()
    buffer = []
    # Read existing links to avoid duplicate writes
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            seen.update(line.strip() for line in f)
    except FileNotFoundError:
        pass
    # Process each link
    for link in links:
        if link in seen:
            continue
        seen.add(link)
        buffer.append(f"{link}\n")
        # Flush the buffer to the file when it fills up
        if len(buffer) >= buffer_size:
            with open(file_path, "a", encoding="utf-8") as f:
                f.writelines(buffer)
            buffer = []
    # Write any remaining links
    if buffer:
        with open(file_path, "a", encoding="utf-8") as f:
            f.writelines(buffer)


if __name__ == "__main__":
    # User input
    input_url = input("Enter a URL: ")
    output_path = input("Enter the save path (e.g. ./links.txt): ")
    # Extract the base URL
    base_url = extract_base_url(input_url)
    # Set the request headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Referer": base_url,
        "Host": base_url[8:] if base_url.startswith("https://") else base_url[7:],
    }
    # Store all links
    all_links = [base_url]
    # Fetch the page links
    fetch_url(base_url, headers, all_links)
    # Save the links to a file
    save_links_to_file(all_links, output_path)
    # Print the running time
    end_time = time.perf_counter()
    print(f"Total running time: {end_time - start_time:.4f} seconds")
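For the multi-level crawl mentioned at the top, one possible follow-up is sketched below. It only reuses the functions defined above and would sit at the end of the main block, but the second pass itself (the second_level list, the one-second delay, and appending back into the same file) is my own addition, not part of the original script:

    # Hypothetical second pass: fetch every first-level link that was just saved
    second_level = []
    with open(output_path, "r", encoding="utf-8") as f:
        for line in f:
            link = line.strip()
            if not link:
                continue
            fetch_url(link, headers, second_level)
            time.sleep(1)  # small pause between requests
    # Append the newly found second-level links to the same file
    save_links_to_file(second_level, output_path)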