In the previous post we went over the strengths and weaknesses of writing a crawler in Zig, so by now you should have a sense of whether it suits your own project. Today I will use Zig's standard library to build a simple high-concurrency crawler template: with the http module in the standard library and Zig's concurrency primitives, we can fetch many pages at the same time.
Below is a compact template for a high-concurrency crawler in Zig. It combines one fetch task per URL with connection pooling to keep requests efficient:
const std = @import("std");
const Allocator = std.mem.Allocator;
const http = std.http;
const Uri = std.Uri;

// Crawler configuration
const config = struct {
    const max_connections = 20; // maximum number of concurrent connections
    const request_timeout = 10_000; // request timeout (milliseconds)
    const user_agent = "ZigCrawler/1.0";
};

// Crawler entry point
pub fn runCrawler(allocator: Allocator, urls: []const []const u8) !void {
    // Create an HTTP client backed by a connection pool
    var client = try http.Client.init(allocator, .{ .connection_pool_size = config.max_connections });
    defer client.deinit();

    // List of fetch tasks
    var tasks = std.ArrayList(std.Thread).init(allocator);
    defer tasks.deinit();

    // Start one concurrent fetch task per URL
    for (urls) |url| {
        const task = try std.Thread.spawn(.{}, fetchUrl, .{ allocator, &client, url });
        try tasks.append(task);
    }

    // Wait for all tasks to finish
    for (tasks.items) |task| task.join();
}

// Fetch a single URL
fn fetchUrl(allocator: Allocator, client: *http.Client, url_str: []const u8) void {
    // Parse the URL
    const uri = Uri.parse(url_str) catch |err| {
        std.log.err("Failed to parse URL {s}: {s}", .{ url_str, @errorName(err) });
        return;
    };

    // Create the request
    var req = client.request(.{
        .location = .{ .uri = uri },
        .method = .GET,
        .timeout = config.request_timeout,
        .headers = .{ .user_agent = config.user_agent },
    }) catch |err| {
        std.log.err("Failed to create request: {s}", .{@errorName(err)});
        return;
    };
    defer req.deinit(); // closes the connection automatically

    // Send the request and wait for the response
    req.start() catch |err| {
        std.log.err("Failed to send request: {s}", .{@errorName(err)});
        return;
    };
    req.wait() catch |err| {
        std.log.err("Failed while waiting for response: {s}", .{@errorName(err)});
        return;
    };

    // Check the HTTP status
    if (req.response.status != .ok) {
        std.log.warn("HTTP {}: {s}", .{ @intFromEnum(req.response.status), url_str });
        return;
    }

    // Read the response body (capped at 10 MiB)
    const body = req.response.reader().readAllAlloc(allocator, 10 * 1024 * 1024) catch |err| {
        std.log.err("Failed to read body: {s}", .{@errorName(err)});
        return;
    };
    defer allocator.free(body); // make sure the memory is released

    // Process the page content (example: extract links)
    std.log.info("Fetched {s} ({d} bytes)", .{ url_str, body.len });
    extractLinks(allocator, body, url_str);
}

// Link extraction (simplified)
fn extractLinks(allocator: Allocator, html: []const u8, base_url: []const u8) void {
    _ = base_url; // a real crawler would resolve relative links against this

    var links = std.ArrayList([]const u8).init(allocator);
    defer {
        for (links.items) |link| allocator.free(link);
        links.deinit();
    }

    // Naive scan for href="..." attributes (a real crawler should use an HTML parser)
    var it = std.mem.splitSequence(u8, html, "href=\"");
    _ = it.next(); // skip everything before the first href
    while (it.next()) |segment| {
        if (std.mem.indexOf(u8, segment, "\"")) |end| {
            const link = segment[0..end];
            if (isValidUrl(link)) {
                const dup_link = allocator.dupe(u8, link) catch continue;
                links.append(dup_link) catch {
                    allocator.free(dup_link);
                    continue;
                };
                std.log.debug("Found link: {s}", .{link});
            }
        }
    }
}

// URL validation
fn isValidUrl(url: []const u8) bool {
    return std.mem.startsWith(u8, url, "http");
}

// Main entry point
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const seed_urls = [_][]const u8{
        "https://example.com/page1",
        "https://example.com/page2",
        "https://example.com/page3",
    };

    try runCrawler(allocator, &seed_urls);
}
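Assuming the template above is saved as crawler.zig (the file name is just an example), it can be built and run with the Zig compiler; exact flags may vary with your Zig version:

zig build-exe crawler.zig
./crawler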
Key features:
1. Connection pool management:
var client = try http.Client.init(allocator, .{
    .connection_pool_size = config.max_connections,
});
TCP connections are reused, which avoids repeated handshake overhead.
2. Concurrent fetch model:
const task = try std.Thread.spawn(.{}, fetchUrl, .{...});
Each URL is fetched on its own thread; the sketch after this list shows one way to cap how many run at once.
3. Automatic resource cleanup:
defer req.deinit(); // make sure the request is closed
defer allocator.free(body); // make sure the memory is released
Zig's defer runs these on every exit path, so resources cannot leak.
4. Timeout control:
.timeout = config.request_timeout
Prevents connections from hanging forever.
5. Bounded memory usage:
readAllAlloc(allocator, 10 * 1024 * 1024)
Caps each response body at 10 MiB, so a single oversized page cannot exhaust memory.
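As mentioned in point 2, the template spawns one thread per URL. The following is a minimal sketch (not part of the template above) of how the number of simultaneously running fetches could be capped with std.Thread.Semaphore; doFetch is a hypothetical stand-in for the real fetchUrl:

const std = @import("std");

// Allow at most 20 fetches (config.max_connections) to run at once.
var fetch_slots = std.Thread.Semaphore{ .permits = 20 };

// Hypothetical stand-in for the real fetchUrl above.
fn doFetch(url: []const u8) void {
    std.log.info("fetching {s}", .{url});
}

fn boundedFetch(url: []const u8) void {
    fetch_slots.wait(); // block until one of the slots is free
    defer fetch_slots.post(); // release the slot when the fetch is done
    doFetch(url);
}

Spawning the worker threads on boundedFetch instead of fetchUrl keeps the number of in-flight requests aligned with the connection pool size.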
Suggested improvements:
1. Add queue-based scheduling (see the worker sketch after this list):
// URL queue
var url_queue = std.TailQueue([]const u8){};
// worker threads take tasks from the queue
2. Add a retry mechanism:
const max_retries = 3;
var retry_count: u8 = 0;
while (retry_count < max_retries) : (retry_count += 1) {
    if (doRequest()) break;
}
3. Add rate limiting:
std.time.sleep(100 * std.time.ns_per_ms); // 100ms delay between requests
4. Integrate a C parsing library such as libxml2 (see the sketch after this list):
const libxml = @cImport(@cInclude("libxml/HTMLparser.h"));
// parse the HTML with the C library
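Suggestions 1 and 2 can be sketched together. The code below is an illustration rather than a drop-in addition to the template: fetchOnce is a hypothetical helper standing in for a single request attempt (returning true on success), and the queue is a simple mutex-protected ArrayList rather than std.TailQueue:

const std = @import("std");

const max_retries = 3;

// A minimal thread-safe URL queue guarded by a mutex.
const UrlQueue = struct {
    mutex: std.Thread.Mutex = .{},
    items: std.ArrayList([]const u8),

    fn init(allocator: std.mem.Allocator) UrlQueue {
        return .{ .items = std.ArrayList([]const u8).init(allocator) };
    }

    fn deinit(self: *UrlQueue) void {
        self.items.deinit();
    }

    fn push(self: *UrlQueue, url: []const u8) !void {
        self.mutex.lock();
        defer self.mutex.unlock();
        try self.items.append(url);
    }

    fn pop(self: *UrlQueue) ?[]const u8 {
        self.mutex.lock();
        defer self.mutex.unlock();
        if (self.items.items.len == 0) return null;
        return self.items.orderedRemove(0); // FIFO order
    }
};

// Hypothetical stand-in for one request attempt; returns true on success.
fn fetchOnce(url: []const u8) bool {
    std.log.info("fetching {s}", .{url});
    return true;
}

// Worker loop: pull URLs from the queue and retry failures with a short delay.
fn worker(queue: *UrlQueue) void {
    while (queue.pop()) |url| {
        var retry_count: u8 = 0;
        while (retry_count < max_retries) : (retry_count += 1) {
            if (fetchOnce(url)) break;
            std.time.sleep(100 * std.time.ns_per_ms); // back off before retrying
        }
    }
}

Workers would be started with std.Thread.spawn(.{}, worker, .{&queue}) and joined, just like the per-URL tasks in runCrawler above.

For suggestion 4, here is a rough sketch of what the libxml2 integration might look like, assuming the libxml2 headers and library are installed and the program is built with something like zig build-exe crawler.zig -lc -lxml2 -I/usr/include/libxml2; parseHtml is a hypothetical helper:

const c = @cImport({
    @cInclude("libxml/HTMLparser.h");
});

// Parse an HTML document with libxml2's forgiving HTML parser.
fn parseHtml(html: []const u8) void {
    // options = 0; flags such as HTML_PARSE_NOERROR can be OR-ed in here
    const doc = c.htmlReadMemory(html.ptr, @intCast(html.len), null, null, 0);
    if (doc == null) return;
    defer c.xmlFreeDoc(doc);

    // From here one could walk the DOM via c.xmlDocGetRootElement(doc)
    // to collect <a href> attributes.
}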
This template can handle hundreds of requests per second (depending on network and hardware), with a memory footprint of roughly 50%-70% of a comparable C/C++ crawler. For a real deployment, consider adding:
- a distributed task queue
- rotating request proxies
- dynamic rendering support (integrating a headless browser)
- strategies for getting around anti-crawler measures
That's all for today. To keep the template concise I left out some of the error handling and resource cleanup details, which a real application will need to fill in. If anything is unclear, feel free to leave a comment and we can discuss it.