Python 網頁爬取數據生成文字雲圖

1. 需要的三個包：

from wordcloud import WordCloud        #詞云庫
import matplotlib.pyplot as plt        #數學繪圖庫
import jieba;

2. 定義變量（將對應的變量放到一個全局的文件中，即後文 `from GrabData import Param` 所導入的 Param 模塊）：

import re;
# Shared scraper settings. The scraper snippet reads these as Param.pdurl_first,
# Param.head, Param.reg and Param.cookies, so the names must stay as they are.

# First page of the comment listing for Douban movie subject 26363254.
pdurl_first = 'https://movie.douban.com/subject/26363254/comments?start=0'

# Request headers: a desktop-browser User-Agent string.
head = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/59.0.3071.109 Chrome/59.0.3071.109 Safari/537.36',
}

# Next-page link: captures the href of the <a ... class="next"> anchor up to
# (not including) the first "&amp;".
reg = re.compile(r'<a href="(.*?)&amp;.*?class="next">')

# Cookies sent with every request.
# NOTE(review): "_utmb" has a single leading underscore while its siblings
# ("__utma", "__utmc", ...) have two — confirm against the browser session.
# NOTE(review): these look like session credentials captured from a browser;
# presumably they expire and should not be committed — verify.
cookies = {
    "__utma": "30149280.503249607.1504402391.1504402391.1504402391.1",
    "_utmb": "30149280.2.9.1504402391",
    "__utmc": "30149280",
    "__utmt": "1",
    "__utmz": "30149280.1504402391.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)",
    "ap": "1",
    "as": '"https://movie.douban.com/subject/26363254/comments?start=225&limit=20&sort=new_score&status=P"',
    "bid": "g7k4BGd2sRk",
    "ck": "76vs",
    "dbcl2": '"166279730:fohmXhoM9uU"',
    "ps": "y",
    # The original literal listed this key twice with the same value; one
    # entry is equivalent.
    "push_doumail_num": "0",
}

3. 抓取數據

import requests;
import re;
from GrabData import Param;
import pandas as pd;
from bs4 import BeautifulSoup


class GrabComent:
    """Scrape the paginated comment listing and append each page to a CSV file.

    Instantiating the class runs the whole scrape: it fetches
    ``Param.pdurl_first``, extracts the comment fields with ``ren``, appends
    them to ``CSV_PATH`` and follows the "next page" link matched by
    ``Param.reg`` until a page has no such link or a request returns a status
    other than 200.
    """

    # One match per comment, six capture groups taken from the page markup:
    # the text of <span class="votes">, the text before a closing </a>, the
    # text of a bare <span>, two title="..." attribute values, and the text of
    # the trailing <p>.
    # NOTE(review): presumably votes, user name, watch status, rating, date
    # and comment body — confirm against the live page markup.
    ren = re.compile(r'<span class="votes">(.*?)</span>.*?comment">.*?</span>.*?<span.*?class="">(.*?)</a>.*?<span>(.*?)</span>.*?title="(.*?)"></span>.*?title="(.*?)"><p .*? > (.*?)</p>', re.S)

    # Prefix prepended to the relative "next page" link.
    BASE_URL = 'https://movie.douban.com/subject/26363254/comments'
    # Output file; rows are appended, so re-running the scraper adds to it.
    CSV_PATH = 'H:\\python_projects\\ticket\\zhanlangpinglun.csv'
    # Seconds to wait for the server; without a timeout a request can hang
    # forever.
    TIMEOUT = 30

    def __init__(self):
        print('開始抓取數據')
        html = requests.get(Param.pdurl_first, headers=Param.head,
                            cookies=Param.cookies, timeout=self.TIMEOUT)
        while html.status_code == 200:
            # Save this page first so that its rows are kept even when it
            # turns out to be the last page.
            zhanlang = re.findall(self.ren, html.text)
            print(zhanlang)
            if zhanlang:
                # mode='a+' appends to the CSV instead of overwriting it.
                pd.DataFrame(zhanlang).to_csv(self.CSV_PATH, header=False,
                                              index=False, mode='a+',
                                              encoding='utf-8')

            # The last page has no "next" link; stop instead of failing with
            # an IndexError on the empty match list.
            next_links = re.findall(Param.reg, html.text)
            if not next_links:
                break
            url_next = self.BASE_URL + next_links[0]
            print("下一頁地址：" + url_next)
            html = requests.get(url_next, cookies=Param.cookies,
                                headers=Param.head, timeout=self.TIMEOUT)


if __name__ == '__main__':
    GrabComent()

4. 生成雲圖

from wordcloud import WordCloud        #詞云庫
import matplotlib.pyplot as plt        #數學繪圖庫
import jieba


class WordYun:
    """Render a word cloud from the scraped comments CSV.

    Instantiating the class runs the whole pipeline: read the CSV, segment it
    with jieba, build the word cloud and show it in a matplotlib window.
    """

    # Same file the scraper appends to. Written as a normal (non-raw) string
    # so the path contains single backslashes.
    CSV_PATH = 'H:\\python_projects\\ticket\\zhanlangpinglun.csv'
    # Without an explicit font the cloud shows square boxes instead of the
    # Chinese characters.
    # NOTE(review): presumably STFANGSO.ttf is a CJK-capable font — confirm.
    FONT_PATH = "D:\\Windows\\Fonts\\STFANGSO.ttf"

    def __init__(self):
        print("開始讀取文件!")
        self.main()

    def main(self):
        """Read and segment the CSV, then display the resulting word cloud."""
        text = self.readFile()
        self.showTitle(text)

    def showTitle(self, text1):
        """Generate a word cloud from ``text1`` and display it.

        Args:
            text1: The text passed to ``WordCloud.generate``; ``readFile``
                supplies it as space-separated words.
        """
        wc1 = WordCloud(
            background_color="white",
            width=1000,
            height=860,
            font_path=self.FONT_PATH,
            margin=2,
        )
        # generate() takes the text as a single str.
        wc2 = wc1.generate(text1)
        plt.imshow(wc2)
        plt.axis("off")
        plt.show()

    def readFile(self):
        """Return the CSV content as space-separated words longer than one character.

        Returns:
            The jieba-segmented words of the file joined by single spaces;
            tokens of length 1 are dropped.
        """
        # pandas writes the CSV as UTF-8, so decode it as UTF-8 rather than
        # with the platform default; the context manager closes the handle.
        with open(self.CSV_PATH, 'r', encoding='utf-8') as f:
            content = f.read()
        words = [word for word in jieba.cut(content) if len(word) > 1]
        txt = ' '.join(words)
        print("readFile返回的結果：" + txt)
        return txt


if __name__ == '__main__':
    WordYun()

轉載于:https://www.cnblogs.com/wangshunyao/p/7534883.html

本文來自互聯網用戶投稿，該文觀點僅代表作者本人，不代表本站立場。本站僅提供信息存儲空間服務，不擁有所有權，不承擔相關法律責任。
如若轉載，請注明出處：http://www.pswp.cn/news/251831.shtml
繁體地址，請注明出處：http://hk.pswp.cn/news/251831.shtml
英文地址，請注明出處：http://en.pswp.cn/news/251831.shtml

如若內容造成侵權/違法違規/事實不符，請聯系多彩編程網進行投訴反饋email:809451989@qq.com，一經查實，立即刪除！