Get a Firecrawl API key
Open the official website and sign in with your GitHub account:
https://www.firecrawl.dev/
Go to the API keys page in your account dashboard:
https://www.firecrawl.dev/app/api-keys
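Before wiring the key into a project, you can confirm it works with a one-off scrape. A minimal sketch, assuming the firecrawl-py SDK is installed (pip install firecrawl-py); the key string is a placeholder:

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-your-key-here")  # placeholder; paste your real key
result = app.scrape_url("https://www.firecrawl.dev/", formats=["markdown"])
print(result.markdown[:200] if result.markdown else result)  # preview the scraped Markdown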
Create a Python project in PyCharm
Create .env
# API configuration
FIRECRAWL_API_KEY=fc-9*********0816d5ac6b20
# Output configuration
OUTPUT_DIR=output
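The script below calls load_dotenv(), which reads these values from the .env file in the working directory into environment variables. A quick sanity check, assuming python-dotenv is installed (pip install python-dotenv):

import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory
print(os.getenv("FIRECRAWL_API_KEY"))     # the key defined above
print(os.getenv("OUTPUT_DIR", "output"))  # falls back to "output" if unset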
Create url_list.txt (one article URL per line; blank lines and lines that do not start with http are skipped)
https://mp.weixin.qq.com/s/NH4Odi-xT_hlmZdGe0dw6Q
Create wechat_crawler.py
import logging
import os
import re
import time
from typing import Any, Dict, List, Optional

from dotenv import load_dotenv
from firecrawl import FirecrawlApp

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load the .env file
load_dotenv()


class WeChatCrawler:
    def __init__(self, api_key: Optional[str] = None):
        """Initialize the WeChat official account crawler.

        Args:
            api_key: Firecrawl API key.
        """
        self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
        if not self.api_key:
            raise ValueError("Please provide a Firecrawl API key")
        self.app = FirecrawlApp(api_key=self.api_key)
        self.output_dir = os.getenv('OUTPUT_DIR', 'output')
        # Create the output directory if it does not exist
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

    def read_urls_from_file(self, file_path: str = 'url_list.txt') -> List[str]:
        """Read a list of URLs from a file.

        Args:
            file_path: Path to the URL list file.

        Returns:
            List[str]: The URLs.
        """
        urls = []
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                for line in file:
                    url = line.strip()
                    if url and url.startswith('http'):  # keep only valid URLs
                        urls.append(url)
            logger.info(f"Read {len(urls)} URLs from {file_path}")
        except FileNotFoundError:
            logger.error(f"File {file_path} does not exist")
        except Exception as e:
            logger.error(f"Error reading file: {e}")
        return urls

    def scrape_url(self, url: str) -> Dict[str, Any]:
        """Scrape the article content of a single URL.

        Args:
            url: The URL to scrape.

        Returns:
            Dict: The scrape result, or {'error': ...} on failure.
        """
        try:
            logger.info(f"Scraping: {url}")
            # Scrape the URL with Firecrawl
            result = self.app.scrape_url(url, formats=['markdown', 'html'])
            logger.info(f"Scraped successfully: {url}")
            return result
        except Exception as e:
            logger.error(f"Error scraping {url}: {e}")
            return {'error': str(e)}

    def save_markdown(self, content: str, filename: str) -> None:
        """Save Markdown content to a file.

        Args:
            content: The Markdown content.
            filename: The file name.
        """
        try:
            file_path = os.path.join(self.output_dir, filename)
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(content)
            logger.info(f"Saved: {file_path}")
        except Exception as e:
            logger.error(f"Error saving file {filename}: {e}")

    def generate_filename(self, url: str) -> str:
        """Generate a file name from a URL.

        Args:
            url: The URL.

        Returns:
            str: The generated file name.
        """
        # WeChat article URLs look like: https://mp.weixin.qq.com/s/<identifier>
        match = re.search(r'/s/([A-Za-z0-9_-]+)', url)
        if match:
            return f"wechat_article_{match.group(1)}.md"
        # Fall back to a timestamp if the URL cannot be parsed
        return f"wechat_article_{int(time.time())}.md"

    def crawl_all_urls(self) -> None:
        """Scrape every URL and save each article as a Markdown file."""
        urls = self.read_urls_from_file()
        if not urls:
            logger.warning("No URLs to crawl")
            return
        logger.info(f"Starting to crawl {len(urls)} URLs")
        for i, url in enumerate(urls, 1):
            logger.info(f"Processing URL {i}/{len(urls)}")
            result = self.scrape_url(url)
            # The result may be a response object, a plain dict, or an error dict
            try:
                # scrape_url returns {'error': ...} on timeouts and other failures
                if isinstance(result, dict) and result.get('error'):
                    logger.error(f"Error crawling {url}: {result['error']}")
                    continue
                # Response objects expose the content as a markdown attribute
                if hasattr(result, 'markdown') and result.markdown:
                    self.save_markdown(result.markdown, self.generate_filename(url))
                else:
                    # Otherwise fall back to a dict representation
                    result_dict = result.dict() if hasattr(result, 'dict') else result
                    if isinstance(result_dict, dict) and result_dict.get('markdown'):
                        self.save_markdown(result_dict['markdown'], self.generate_filename(url))
                    else:
                        logger.error(f"Could not get Markdown content for {url}")
            except Exception as e:
                logger.error(f"Error processing result for {url}: {e}")


if __name__ == "__main__":
    try:
        # Create a crawler instance and scrape all URLs
        crawler = WeChatCrawler()
        crawler.crawl_all_urls()
        print("All articles crawled!")
    except Exception as e:
        logger.error(f"Program error: {e}")
        print(f"Program error: {e}")
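Run the script with python wechat_crawler.py, or import the class from another module. For the sample URL in url_list.txt, generate_filename derives the output name from the identifier after /s/, so the article lands under output/ as in this sketch:

from wechat_crawler import WeChatCrawler

crawler = WeChatCrawler()  # reads FIRECRAWL_API_KEY and OUTPUT_DIR from .env
name = crawler.generate_filename("https://mp.weixin.qq.com/s/NH4Odi-xT_hlmZdGe0dw6Q")
print(name)  # wechat_article_NH4Odi-xT_hlmZdGe0dw6Q.md
crawler.crawl_all_urls()  # saves each article as Markdown under output/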