使用 Go 語言調用 Chrome 瀏覽器進行網頁截圖,對電腦性能的要求比較高,因此速度有限;但就目前來看,這種方式最能準確地獲取網頁加載完成後的結果。
main.go
package main

import (
	"context"
	"errors"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"net/url"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/chromedp/chromedp"
)

// Task structure.
// Task describes one screenshot job: the page to open and the file the
// resulting image is written to.
type Task struct {
	URL      string // address of the page to capture
	Filename string // path of the image file to write
}

// Domain blacklist: entries containing any of these keywords are skipped.
var blacklist = []string{"edu.cn", "gov.cn"}var (totalTasks int64 // 總任務數finishedTasks int64 // 已完成任務數
)func main() {start := time.Now()defer func() {if r := recover(); r != nil {log.Printf("程序異常退出: %v", r)}}()// 定義命令行參數,增加初始等待時間參數urlFile := flag.String("urls", "urls.txt", "包含URL列表的文件路徑")outputDir := flag.String("output", "screenshots", "截圖保存的目錄")workers := flag.Int("workers", 50, "并發工作線程數(建議1~3)")width := flag.Int("width", 1280, "瀏覽器窗口寬度")height := flag.Int("height", 800, "瀏覽器窗口高度")fullPage := flag.Bool("full", false, "是否截取整個頁面")timeout := flag.Int("timeout", 20, "每個任務的超時時間(秒,建議大于頁面加載等待時間,默認120)")retry := flag.Int("retry", 3, "失敗重試次數")initialWait := flag.Int("initialWait", 1, "初始等待時間(秒),用于分散任務啟動")flag.Parse()// 確保timeout參數合理if *timeout <= 30 {log.Printf("警告:timeout參數過小,已自動調整為60秒以避免context canceled錯誤!")*timeout = 60}// 創建輸出目錄if _, err := os.Stat(*outputDir); os.IsNotExist(err) {if err := os.MkdirAll(*outputDir, 0755); err != nil {log.Fatalf("創建輸出目錄失敗: %v", err)}}// 讀取URL列表urls, err := readURLs(*urlFile)if err != nil {log.Fatalf("讀取URL文件失敗: %v", err)}if len(urls) == 0 {log.Fatal("URL列表為空")}// 統計總任務數totalTasks = int64(len(urls))// 創建任務通道,增加緩沖大小taskCh := make(chan Task, len(urls))// 填充任務通道go func() {for _, url := range urls {// 生成文件名filename := generateFilename(url, *outputDir)taskCh <- Task{URL: url, Filename: filename}}close(taskCh)}()// 創建等待組var wg sync.WaitGroup// 啟動進度監控協程go func() {startTime := time.Now()for {done := atomic.LoadInt64(&finishedTasks)total := totalTaskselapsed := time.Since(startTime).Seconds()var speed float64 = 0if elapsed > 0 {speed = float64(done) / elapsed}remain := 0.0if speed > 0 {remain = float64(total-done) / speed}percent := float64(done) / float64(total) * 100fmt.Printf("\r進度: %d/%d (%.2f%%) | 速度: %.2f/秒 | 已用: %.0fs | 預計剩余: %.0fs",done, total, percent, speed, elapsed, remain)if done >= total {fmt.Println()break}time.Sleep(1 * time.Second)}}()// 啟動工作線程,增加啟動間隔log.Printf("開始處理 %d 個URL,使用 %d 個工作線程\n", len(urls), *workers)for i := 0; i < *workers; i++ {// 增加啟動間隔,避免同時啟動過多線程time.Sleep(time.Duration(i*(*initialWait)) * time.Second)wg.Add(1)go 
func(workerID int) {defer wg.Done()processTasks(workerID, taskCh, *width, *height, *fullPage, *timeout, *retry)}(i)}// 等待所有工作線程完成wg.Wait()elapsed := time.Since(start)log.Printf("所有任務完成,耗時: %s\n", elapsed)
}// 讀取URL文件
func readURLs(filePath string) ([]string, error) {data, err := ioutil.ReadFile(filePath)if err != nil {return nil, err}// 按行分割URLvar urls []stringlines := strings.Split(string(data), "\n")for _, line := range lines {if url := strings.TrimSpace(line); url != "" {if !strings.HasPrefix(url, "http://") && !strings.HasPrefix(url, "https://") {url = "https://" + url}// 檢查黑名單blacklisted := falsefor _, keyword := range blacklist {if strings.Contains(url, keyword) {blacklisted = truebreak}}if blacklisted {continue}urls = append(urls, url)}}return urls, nil
}// 生成文件名
// generateFilename builds the output path for the screenshot of urlStr inside
// outputDir.
//
// The name is the URL's host followed by its path, with "/", "\" and ":"
// replaced by "_" so the result is a single path element, limited to about
// 100 bytes and suffixed with ".png". The limit is applied on a rune boundary
// so a multi-byte character is never cut in half. When urlStr cannot be
// parsed, or has neither host nor path, a timestamp-based "unknown_<ns>.png"
// name is used instead.
func generateFilename(urlStr, outputDir string) string {
	const maxNameLen = 100

	u, err := url.Parse(urlStr)
	if err != nil {
		return filepath.Join(outputDir, fmt.Sprintf("unknown_%d.png", time.Now().UnixNano()))
	}

	name := strings.NewReplacer("/", "_", "\\", "_", ":", "_").Replace(u.Host + u.Path)
	if name == "" {
		// Nothing usable in the URL; avoid producing a bare ".png".
		return filepath.Join(outputDir, fmt.Sprintf("unknown_%d.png", time.Now().UnixNano()))
	}

	if len(name) > maxNameLen {
		// Ranging over a string yields the byte offset of each rune start;
		// keep the last one that does not exceed the limit.
		cut := 0
		for i := range name {
			if i > maxNameLen {
				break
			}
			cut = i
		}
		name = name[:cut]
	}
	return filepath.Join(outputDir, name+".png")
}

// Process tasks.
// processTasks is the body of one worker: it receives tasks from taskCh until
// the channel is closed and captures a screenshot for each of them.
//
// Parameters:
//   - workerID: used only in log messages.
//   - taskCh: source of tasks.
//   - width, height: browser window size.
//   - fullPage, timeout: passed through to captureScreenshot.
//   - retry: maximum number of attempts per task; values below 1 are treated
//     as 1 so that every task is tried at least once.
//
// Each attempt runs in its own tab of a browser that is started when the
// worker gets its first task. After every task finishedTasks is incremented;
// URLs that could not be captured are appended to failed_urls.txt.
func processTasks(workerID int, taskCh <-chan Task, width, height int, fullPage bool, timeout, retry int) {
	// Chrome options, including anti-detection settings.
	opts := append(chromedp.DefaultExecAllocatorOptions[:],
		chromedp.Flag("headless", false),
		chromedp.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"),
		chromedp.Flag("disable-blink-features", "AutomationControlled"),
		chromedp.Flag("disable-web-security", true),           // disable the web security policy
		chromedp.Flag("allow-running-insecure-content", true), // allow insecure content
		chromedp.Flag("ignore-certificate-errors", true),      // ignore SSL certificate errors
		chromedp.WindowSize(width, height),
		chromedp.Flag("no-sandbox", true),             // disable the sandbox; needed in some environments
		chromedp.Flag("disable-setuid-sandbox", true), // disable the setuid sandbox
	)

	if retry < 1 {
		retry = 1
	}

	var (
		browserCtx   context.Context
		closeBrowser = func() {}
	)
	defer func() { closeBrowser() }()

	for task := range taskCh {
		// Start the browser lazily so a worker that never receives a task
		// does not launch Chrome, and start a new one if the previous browser
		// context has been cancelled.
		// NOTE(review): chromedp is expected to cancel this context when the
		// browser connection is lost — confirm against the chromedp version in use.
		if browserCtx == nil || browserCtx.Err() != nil {
			closeBrowser()
			browserCtx, closeBrowser = startWorkerBrowser(workerID, opts)
		}

		success := false
		for attempt := 1; attempt <= retry; attempt++ {
			log.Printf("工作線程 %d 正在處理 %s (嘗試 %d/%d)\n", workerID, task.URL, attempt, retry)
			if attempt > 1 {
				// Back off a little longer before each further attempt.
				time.Sleep(time.Duration(attempt*2) * time.Second)
			}

			// A fresh tab per attempt; cancelling it closes only that tab.
			tabCtx, closeTab := chromedp.NewContext(browserCtx)
			err := captureScreenshot(tabCtx, task.URL, fullPage, timeout, task.Filename)
			closeTab()
			if err == nil {
				log.Printf("工作線程 %d 成功保存截圖: %s\n", workerID, task.Filename)
				success = true
				break
			}

			log.Printf("工作線程 %d 處理 %s 失敗: %v (嘗試 %d/%d)\n", workerID, task.URL, err, attempt, retry)
			// Do not retry when the name does not resolve or the context was cancelled.
			if strings.Contains(err.Error(), "ERR_NAME_NOT_RESOLVED") {
				log.Printf("域名未被解析,停止對此URL的重試: %s", task.URL)
				break
			}
			if errors.Is(err, context.Canceled) || strings.Contains(err.Error(), "context canceled") {
				log.Printf("上下文已取消,停止對此URL的重試: %s", task.URL)
				break
			}
		}

		// Count the task as finished whether or not it succeeded.
		atomic.AddInt64(&finishedTasks, 1)
		if !success {
			log.Printf("工作線程 %d 處理 %s 失敗,已達到最大重試次數\n", workerID, task.URL)
			recordFailedURL(task.URL)
		}
	}
}

// startWorkerBrowser creates an exec allocator and a browser context from
// opts and runs the context once with no actions so the browser is started
// and owned by that context. Tab contexts derived from the returned context
// are then meant to open tabs in that browser instead of each launching their
// own. The returned function releases the context and the allocator.
func startWorkerBrowser(workerID int, opts []chromedp.ExecAllocatorOption) (context.Context, func()) {
	allocCtx, allocCancel := chromedp.NewExecAllocator(context.Background(), opts...)
	browserCtx, browserCancel := chromedp.NewContext(allocCtx)
	if err := chromedp.Run(browserCtx); err != nil {
		// Not fatal here: the failure is only logged and the per-task
		// attempts still run against this context.
		log.Printf("工作線程 %d 啟動瀏覽器失敗: %v", workerID, err)
	}
	return browserCtx, func() {
		browserCancel()
		allocCancel()
	}
}

// recordFailedURL appends rawURL as one line to failed_urls.txt in the
// current directory, logging any error instead of dropping it silently.
func recordFailedURL(rawURL string) {
	f, err := os.OpenFile("failed_urls.txt", os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
	if err != nil {
		log.Printf("無法打開 failed_urls.txt: %v", err)
		return
	}
	if _, err := f.WriteString(rawURL + "\n"); err != nil {
		log.Printf("寫入 failed_urls.txt 失敗: %v", err)
	}
	if err := f.Close(); err != nil {
		log.Printf("關閉 failed_urls.txt 失敗: %v", err)
	}
}

// Screenshot function with an optimized wait strategy.
func captureScreenshot(ctx context.Context, url string, fullPage bool, timeout int, outputPath string) error {// 設置更長的超時ctx, cancel := context.WithTimeout(ctx, time.Duration(timeout)*time.Second)defer cancel()var buf []byteerr := chromedp.Run(ctx, chromedp.Tasks{chromedp.Navigate(url),chromedp.ActionFunc(func(ctx context.Context) error {var readyState stringstart := time.Now()for {err := chromedp.Evaluate(`document.readyState`, &readyState).Do(ctx)if err != nil {return err}if readyState == "complete" {time.Sleep(2 * time.Second) // 頁面加載完成后再等2秒return nil}if time.Since(start) > time.Duration(timeout)*time.Second {return errors.New("等待頁面加載超時")}time.Sleep(500 * time.Millisecond)}}),chromedp.FullScreenshot(&buf, 95), // 提高截圖質量})if err != nil {if errors.Is(err, context.DeadlineExceeded) {log.Printf("截圖超時(context deadline exceeded):%s", url)} else if errors.Is(err, context.Canceled) {log.Printf("截圖被取消(context canceled):%s", url)} else {log.Printf("截圖失敗: %s, 錯誤: %+v", url, err)}return err}return ioutil.WriteFile(outputPath, buf, 0644)
}
go.mod
module screenshot-tool

go 1.24.4

require github.com/chromedp/chromedp v0.13.7

require (
	github.com/chromedp/cdproto v0.0.0-20250403032234-65de8f5d025b // indirect
	github.com/chromedp/sysutil v1.1.0 // indirect
	github.com/go-json-experiment/json v0.0.0-20250211171154-1ae217ad3535 // indirect
	github.com/gobwas/httphead v0.1.0 // indirect
	github.com/gobwas/pool v0.2.1 // indirect
	github.com/gobwas/ws v1.4.0 // indirect
	golang.org/x/sys v0.29.0 // indirect
)
運行命令:
go run main.go