Python3 實現游戲主播人氣排行榜
import re
from urllib import request

# Scrape a popularity ranking (viewer count per streamer) for the streamers
# of one game category.
#
# Crawler groundwork:
#   1. Be clear about what the crawler is for.
#   2. Find the web page that holds the data.
#   3. Analyse the page structure and locate the tags containing the data.
#   4. Simulate an HTTP request, send it to the server and obtain the HTML
#      the server returns.
#   5. Use regular expressions to extract the data we want
#      (streamer name, popularity).
#
# Shorthand character classes:
#   \d  \D
#   \w  word characters        \W
#   \s  whitespace characters  \S
#   .   matches every character except the newline \n
#
# Crawler frameworks / libraries: Scrapy, BeautifulSoup.
#
# Going further: storing crawled data at scale, analysing the data.
#
# Common problems: anti-crawler measures, countering anti-crawler measures,
# IP bans, proxy IP pools.


class Spider:
    """Fetch a category page and print its streamers ranked by popularity.

    The class attributes hold the page URL and the regular expressions used
    to cut the HTML into per-streamer fragments and to pull the name and the
    popularity text out of each fragment.
    """

    url = 'https://www.panda.tv/cate/lol'
    # Seconds to wait for the server before giving up, so a dead host cannot
    # hang the script forever.
    timeout = 10
    # Raw strings: '\s', '\S' and '\d' are regex escapes, not string escapes.
    root_pattern = r'<div class="video-info">([\s\S]*?)</div>'
    name_pattern = r'</i>([\s\S]*?)</span>'
    number_pattern = r'<span class="video-number">([\s\S]*?)</span>'

    def __fetch_content(self):
        """Download ``Spider.url`` and return the body decoded as UTF-8."""
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(Spider.url, timeout=Spider.timeout) as response:
            raw = response.read()  # bytes
        return raw.decode('utf-8')

    def __analysis(self, htmls):
        """Split the page into fragments and extract the raw fields.

        Returns a list with one dict per ``root_pattern`` match. Each dict
        maps ``'name'`` and ``'number'`` to the list of strings that
        ``re.findall`` produced for that fragment (possibly empty).
        """
        anchors = []
        for fragment in re.findall(Spider.root_pattern, htmls):
            anchors.append({
                'name': re.findall(Spider.name_pattern, fragment),
                'number': re.findall(Spider.number_pattern, fragment),
            })
        return anchors

    def __refine(self, anchors):
        """Reduce each raw entry to its first name/number, stripped.

        Entries in which either field was not found are dropped instead of
        raising ``IndexError``.
        """
        refined = []
        for anchor in anchors:
            names = anchor['name']
            numbers = anchor['number']
            if not names or not numbers:
                continue
            refined.append({
                'name': names[0].strip(),
                'number': numbers[0].strip(),
            })
        return refined

    def __sort(self, anchors):
        """Return the entries ordered from most to least popular."""
        return sorted(anchors, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, anchor):
        """Convert an entry's popularity text into a number for sorting.

        Takes the first integer or decimal found in ``anchor['number']`` and
        multiplies it by 10000 when the text carries the "ten thousand"
        suffix. Text without any digits counts as ``0.0``.
        """
        text = anchor['number']
        # The fractional part is optional so plain integers such as '3456'
        # are accepted as well as decimals such as '12.5'.
        found = re.search(r'\d+(?:\.\d*)?|\.\d+', text)
        if found is None:
            return 0.0
        number = float(found.group())
        # Accept the traditional and the simplified form of the suffix;
        # which one the page actually serves is not visible from here.
        if '萬' in text or '万' in text:
            number *= 10000
        return number

    def __show(self, anchors):
        """Print one ``rank N:name number`` line per entry, in order."""
        for rank, anchor in enumerate(anchors, start=1):
            print('rank ' + str(rank)
                  + ':' + anchor['name']
                  + ' ' + anchor['number'])

    def go(self):
        """Run the pipeline: fetch, extract, clean, sort and print."""
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = list(self.__refine(anchors))
        anchors = self.__sort(anchors)
        self.__show(anchors)


if __name__ == '__main__':
    # Only hit the network when executed as a script, not on import.
    spider = Spider()
    spider.go()
?