複雜HTML解析

# Another serving of BeautifulSoup
# Get the character names from "War and Peace".
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Parse the page while the response is open, then let the `with` block
# close the connection.
with urlopen("http://www.pythonscraping.com/pages/warandpeace.html") as html:
    bsObj = BeautifulSoup(html, 'html.parser')

# Alternative: list every character name (the green <span> elements).
# namelist = bsObj.findAll("span", {"class": "green"})
# for name in namelist:
#     print(name.get_text())

# Count the text nodes whose content is exactly 'the prince'.
# NOTE(review): recent BeautifulSoup releases prefer the `string=` keyword
# over `text=` -- confirm against the installed version before switching.
name_number = bsObj.findAll(text='the prince')
print(len(name_number))

# bsObj.findAll(id='text') is equivalent to bsObj.findAll("", {"id": "text"}).
allText = bsObj.findAll(id="text")

# get_text() strips every tag from the HTML being processed and returns a
# string that contains only the text.
# As a rule, call get_text() last, right before printing, storing, or
# otherwise manipulating the final data.
print(allText[0].get_text())

# BeautifulSoup's find() and findAll()
# They are defined as follows:
#   findAll(tag, attributes, recursive, text, limit, keywords)
#   find(tag, attributes, recursive, text, keywords)
# find() is equivalent to findAll() with limit set to 1.
# If you are only interested in the first X results from the page, set `limit`.
# Be aware that with `limit` set, the results you get are the first ones in
# page order, which are not necessarily the ones you want.

# Other BeautifulSoup objects
# 1. NavigableString objects: represent the text inside a tag.
# 2. Comment objects: used to find HTML comment tags, <!-- text -->.

# Child and descendant tags
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Parse the page while the response is open, then let the `with` block
# close the connection.
with urlopen("http://www.pythonscraping.com/pages/page3.html") as html:
    bsObj = BeautifulSoup(html, 'html.parser')

# Print each direct child of the table whose id is "giftList".
for child in bsObj.find("table", {"id": "giftList"}).children:
    print(child)

# Working with sibling tags
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Parse the page while the response is open, then let the `with` block
# close the connection.
with urlopen("http://www.pythonscraping.com/pages/page3.html") as html:
    bsObj = BeautifulSoup(html, 'html.parser')

# Start from the first <tr> of the "giftList" table and print every sibling
# that follows it; the starting row itself is not part of next_siblings.
for sibling in bsObj.find('table', {'id': 'giftList'}).tr.next_siblings:
    print(sibling)

# Working with parent tags
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Parse the page while the response is open, then let the `with` block
# close the connection.
with urlopen('http://www.pythonscraping.com/pages/page3.html') as html:
    bsObj = BeautifulSoup(html, 'html.parser')

# Find the image by its src, go up to its parent tag, step to that parent's
# previous sibling, and print the sibling's text.
print(bsObj.find("img", {"src": "../img/gifts/img1.jpg"}).parent.previous_sibling.get_text())

# Regular expressions
# Look products up by the file path of their image.
import re
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Parse the page while the response is open, then let the `with` block
# close the connection.
with urlopen('http://www.pythonscraping.com/pages/page3.html') as html:
    bsObj = BeautifulSoup(html, 'html.parser')

# Raw string: the pattern is unchanged, but the backslashes no longer form
# invalid string escapes (which trigger warnings on modern Python).
images = bsObj.findAll("img", {"src": re.compile(r"\.\.\/img\/gifts/img.*\.jpg")})
for image in images:
    print(image["src"])

轉載于:https://www.cnblogs.com/geeker-xjl/p/11081532.html

本文來自互聯網用戶投稿，該文觀點僅代表作者本人，不代表本站立場。本站僅提供信息存儲空間服務，不擁有所有權，不承擔相關法律責任。
如若轉載，請注明出處：http://www.pswp.cn/news/386579.shtml
繁體地址，請注明出處：http://hk.pswp.cn/news/386579.shtml
英文地址，請注明出處：http://en.pswp.cn/news/386579.shtml

如若內容造成侵權/違法違規/事實不符，請聯繫多彩編程網進行投訴反饋，email: 809451989@qq.com，一經查實，立即刪除！