使用 Colly 實現 豆瓣電影Top250爬取
package mainimport ("encoding/csv""github.com/PuerkitoBio/goquery""github.com/gocolly/colly""log""os""strings""time"
)type Movie struct {idx stringtitle stringyear stringinfo stringrating stringurl string
}func main() {// 存儲文件名fName := "douban_movie_top250.csv"file, err := os.Create(fName)if err != nil {log.Fatalf("創建文件失敗 %q: %s\n", fName, err)return}defer file.Close()writer := csv.NewWriter(file)defer writer.Flush()// 寫CSV頭部writer.Write([]string{"Idx", "Title", "Year", "Info", "Rating", "URL"})// 起始UrlstartUrl := "https://movie.douban.com/top250"// 創建Collectorcollector := colly.NewCollector(// 設置用戶代理colly.UserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36"),)// 設置抓取頻率限制collector.Limit(&colly.LimitRule{DomainGlob: "*",RandomDelay: 5 * time.Second, // 隨機延遲})// 異常處理collector.OnError(func(response *colly.Response, err error) {log.Println(err.Error())})collector.OnRequest(func(request *colly.Request) {log.Println("start visit: ", request.URL.String())})// 解析列表collector.OnHTML("ol.grid_view", func(element *colly.HTMLElement) {// 依次遍歷所有的li節點element.DOM.Find("li").Each(func(i int, selection *goquery.Selection) {href, found := selection.Find("div.hd > a").Attr("href")// 如果找到了詳情頁,則繼續下一步的處理if found {parseDetail(collector, href, writer)log.Println(href)}})})// 查找下一頁collector.OnHTML("div.paginator > span.next", func(element *colly.HTMLElement) {href, found := element.DOM.Find("a").Attr("href")// 如果有下一頁,則繼續訪問if found {element.Request.Visit(element.Request.AbsoluteURL(href))}})// 起始入口collector.Visit(startUrl)
}/*** 處理詳情頁*/
func parseDetail(collector *colly.Collector, url string, writer *csv.Writer) {collector = collector.Clone()collector.Limit(&colly.LimitRule{DomainGlob: "*",RandomDelay: 2 * time.Second,})collector.OnRequest(func(request *colly.Request) {log.Println("start visit: ", request.URL.String())})// 解析詳情頁數據collector.OnHTML("body", func(element *colly.HTMLElement) {selection := element.DOM.Find("div#content")idx := selection.Find("div.top250 > span.top250-no").Text()title := selection.Find("h1 > span").First().Text()year := selection.Find("h1 > span.year").Text()info := selection.Find("div#info").Text()info = strings.ReplaceAll(info, " ", "")info = strings.ReplaceAll(info, "\n", "; ")rating := selection.Find("strong.rating_num").Text()movie := Movie{idx: idx,title: title,year: year,info: info,rating: rating,url: element.Request.URL.String(),}writer.Write([]string{idx,title,year,info,rating,element.Request.URL.String(),})log.Printf("%+v", movie)})collector.Visit(url)
}