Colly實現豆瓣電影Top250爬取

使用 Colly 實現豆瓣電影Top250爬取

package mainimport ("encoding/csv""github.com/PuerkitoBio/goquery""github.com/gocolly/colly""log""os""strings""time"
)type Movie struct {idx    stringtitle  stringyear   stringinfo   stringrating stringurl    string
}func main() {// 存儲文件名fName := "douban_movie_top250.csv"file, err := os.Create(fName)if err != nil {log.Fatalf("創建文件失敗 %q: %s\n", fName, err)return}defer file.Close()writer := csv.NewWriter(file)defer writer.Flush()// 寫CSV頭部writer.Write([]string{"Idx", "Title", "Year", "Info", "Rating", "URL"})// 起始UrlstartUrl := "https://movie.douban.com/top250"// 創建Collectorcollector := colly.NewCollector(// 設置用戶代理colly.UserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36"),)// 設置抓取頻率限制collector.Limit(&colly.LimitRule{DomainGlob:  "*",RandomDelay: 5 * time.Second, // 隨機延遲})// 異常處理collector.OnError(func(response *colly.Response, err error) {log.Println(err.Error())})collector.OnRequest(func(request *colly.Request) {log.Println("start visit: ", request.URL.String())})// 解析列表collector.OnHTML("ol.grid_view", func(element *colly.HTMLElement) {// 依次遍歷所有的li節點element.DOM.Find("li").Each(func(i int, selection *goquery.Selection) {href, found := selection.Find("div.hd > a").Attr("href")// 如果找到了詳情頁，則繼續下一步的處理if found {parseDetail(collector, href, writer)log.Println(href)}})})// 查找下一頁collector.OnHTML("div.paginator > span.next", func(element *colly.HTMLElement) {href, found := element.DOM.Find("a").Attr("href")// 如果有下一頁，則繼續訪問if found {element.Request.Visit(element.Request.AbsoluteURL(href))}})// 起始入口collector.Visit(startUrl)
}/*** 處理詳情頁*/
func parseDetail(collector *colly.Collector, url string, writer *csv.Writer) {collector = collector.Clone()collector.Limit(&colly.LimitRule{DomainGlob:  "*",RandomDelay: 2 * time.Second,})collector.OnRequest(func(request *colly.Request) {log.Println("start visit: ", request.URL.String())})// 解析詳情頁數據collector.OnHTML("body", func(element *colly.HTMLElement) {selection := element.DOM.Find("div#content")idx := selection.Find("div.top250 > span.top250-no").Text()title := selection.Find("h1 > span").First().Text()year := selection.Find("h1 > span.year").Text()info := selection.Find("div#info").Text()info = strings.ReplaceAll(info, " ", "")info = strings.ReplaceAll(info, "\n", "; ")rating := selection.Find("strong.rating_num").Text()movie := Movie{idx:    idx,title:  title,year:   year,info:   info,rating: rating,url:    element.Request.URL.String(),}writer.Write([]string{idx,title,year,info,rating,element.Request.URL.String(),})log.Printf("%+v", movie)})collector.Visit(url)
}

本文來自互聯網用戶投稿，該文觀點僅代表作者本人，不代表本站立場。本站僅提供信息存儲空間服務，不擁有所有權，不承擔相關法律責任。
如若轉載，請注明出處：http://www.pswp.cn/news/455017.shtml
繁體地址，請注明出處：http://hk.pswp.cn/news/455017.shtml
英文地址，請注明出處：http://en.pswp.cn/news/455017.shtml

如若內容造成侵權/違法違規/事實不符，請聯繫多彩編程網進行投訴反饋，email: 809451989@qq.com，一經查實，立即刪除！