'''
Scraping approaches:
1. requests(url)
2. requests + json
3. requests + XPath
4. requests + BeautifulSoup
5. selenium
6. the scrapy framework
7. scrapy-redis and distributed crawling
===============================================
OS:
import os
os.system("C: && p.txt")  # Windows: switch to drive C:, then open p.txt with its associated program
os.system("ping 127.0.0.1")  # run a shell command; returns its exit status
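A minimal sketch showing that os.system returns the command's exit status (0 normally means success); the ping target is just an example:
import os
status = os.system("ping 127.0.0.1")  # runs the command in a subshell
if status == 0:
    print("command succeeded")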
===============================================
requests:
requests.get(url, headers=headers, params={'': ''}, proxies=proxies)  # for a GET request, query parameters belong in params; data is for POST bodies
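A minimal runnable sketch of the call above; httpbin.org is only a stand-in test endpoint:
import requests
headers = {'User-Agent': 'Mozilla/5.0'}  # pretend to be a browser
response = requests.get('https://httpbin.org/get',
                        headers=headers,
                        params={'q': 'python'},  # query-string parameters
                        timeout=10)
print(response.status_code)
print(response.text)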
===============================================
Proxies:
proxies = {'http': 'http://124.207.82.166:8008'}  # placeholder proxy; alternative: 47.98.129.198
response = requests.get(request_url, proxies=proxies)  # send the request through the proxy
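requests picks the proxy by URL scheme, so for HTTPS pages the dict usually needs an 'https' key as well (same placeholder address, likely expired):
proxies = {
    'http':  'http://124.207.82.166:8008',
    'https': 'http://124.207.82.166:8008',
}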
===============================================
File:
with open(path, 'w') as f:
    f.write(text)
===============================================
Threading:
import threading
threading.Thread(target=fun, kwargs={'list_url': list_url, 'path_order': path_order1}).start()
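A runnable sketch of the pattern above; fetch() and the URLs are illustrative placeholders:
import threading
import requests

def fetch(list_url, path_order):
    response = requests.get(list_url, timeout=10)
    print(list_url, response.status_code)

urls = ['https://httpbin.org/get', 'https://httpbin.org/html']
threads = [threading.Thread(target=fetch, kwargs={'list_url': u, 'path_order': None})
           for u in urls]
for t in threads:
    t.start()
for t in threads:
    t.join()  # wait for all downloads to finish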
===============================================
requests + json:
import json
1. Load JSON from a local file:
data = json.load(open("package1.json", encoding="utf-8"))
response = requests.get(url, headers=headers)
print(response.text)
2. Parse JSON out of a response body:
response = requests.get(url)
data = response.text
obj = json.loads(data)
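A runnable sketch of pattern 2; httpbin.org/json is a stand-in endpoint, and response.json() is a built-in shortcut for json.loads(response.text):
import json
import requests
response = requests.get('https://httpbin.org/json', timeout=10)
obj = json.loads(response.text)  # or simply: obj = response.json()
print(obj)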
===============================================
requests + XPath:
from lxml import etree
response = requests.get(list_url, headers=headers)
content = response.content
selector = etree.HTML(content)  # load the page into an etree tree
items = selector.xpath(path_order)  # search the tree with XPath; returns a list of matching elements
for item in items:
    title = item.xpath("./div/p[1]/a/text()")[0].strip()  # each matched item supports further relative XPath queries
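A self-contained sketch of the whole flow; the URL and the XPath expressions are illustrative only:
import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get('https://example.com/', headers=headers, timeout=10)
selector = etree.HTML(response.content)         # load the page into an etree tree
items = selector.xpath('//div[@class="item"]')  # hypothetical XPath for the list nodes
for item in items:
    title = item.xpath('./a/text()')  # relative query on each matched element
    print(title)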
===============================================
requests + BeautifulSoup:
from bs4 import BeautifulSoup
response = requests.get(url)
html = response.text
soup = BeautifulSoup(html, 'lxml')
soup_str = soup.prettify()  # pretty-print / normalize the HTML
tag = soup.b
...then a whole series of operations on tag (see the sketch below)
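A short sketch of the common tag operations referred to above:
from bs4 import BeautifulSoup

soup = BeautifulSoup('<b class="bold">hello</b>', 'lxml')
tag = soup.b
print(tag.name)        # 'b'                  - tag name
print(tag.attrs)       # {'class': ['bold']}  - attribute dict
print(tag['class'])    # ['bold']             - a single attribute
print(tag.string)      # 'hello'              - the enclosed text
print(tag.get_text())  # 'hello'              - all text inside the tag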
===============================================
selenium: install the ChromeDriver matching your Chrome version (https://www.cnblogs.com/JHblogs/p/7699951.html)
and install the dependency: pip install selenium
from selenium import webdriver
chromedriver = "G:/4Anaconda/chromedriver.exe"  # this step can be skipped if the driver is on the Python path
browser = webdriver.Chrome(chromedriver)
# open a web page
browser.get("http://www.baidu.com")
browser.find_element_by_id("kw").send_keys("selenium")
browser.find_element_by_id("su").click()
browser.title
browser.set_window_size(480, 800)  # the arguments are in pixels
browser.back()
browser.forward()
# quit and close every window associated with the driver
browser.quit()
# close only the current window
# browser.close()
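Note: in Selenium 4+ the find_element_by_* helpers were removed; a sketch of the equivalent By-based calls:
from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()  # assumes the driver is on the PATH
browser.get("http://www.baidu.com")
browser.find_element(By.ID, "kw").send_keys("selenium")
browser.find_element(By.ID, "su").click()
browser.quit()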
Implicit wait:
from selenium import webdriver
browser = webdriver.Chrome()
browser.implicitly_wait(10)  # implicitly_wait() makes every element lookup retry for up to 10 seconds
browser.get('https://www.zhihu.com/explore')
input = browser.find_element_by_class_name('zu-top-add-question')
print(input)
Explicit wait:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
browser = webdriver.Chrome()
browser.get('https://www.taobao.com/')
wait = WebDriverWait(browser, 10)
input = wait.until(EC.presence_of_element_located((By.ID, 'q')))
button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.btn-search')))
print(input, button)
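Once the wait returns, the located elements can be used directly; a hedged continuation (Taobao's markup may have changed since these notes were written):
input.send_keys('iPad')
button.click()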
'''
'''
# 1. Create a scrapy project (cmd):
scrapy startproject weibospider
cd weibospider
# 2. Create a spider (cmd): scrapy genspider WeiboSpider image.baidu.com
#    Ignore the robots.txt protocol: ROBOTSTXT_OBEY = False
#    Run the crawler: scrapy crawl baiduimg
# 3. Define the data structure (in items.py):
name = scrapy.Field()
# 4. Import the item class:
from hotnewsSpider.items import WeiboSpiderItem
# use it:
weiboitem = WeiboSpiderItem()
weiboitem['name'] = '123'
# return it:
yield weiboitem
# 5. Send a request (inside parse):
yield scrapy.Request(url=url, headers=self.headers, cookies=self.cookies, callback=self.clickFindMore)
# send a request and hand the response on via the callback argument
yield scrapy.Request(link, callback=self.parse_detail)
# 6. Override the initial requests:
def start_requests(self):
    for url in self.urls:
        yield scrapy.Request(url=url, headers=self.headers, cookies=self.cookies, callback=self.parse)
# 7. Receive the response:
def parse(self, response):
    pass
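A minimal spider sketch tying steps 2-7 together; WeiboSpiderItem and the start URL follow the notes above and are assumed to exist in the project:
import scrapy
from hotnewsSpider.items import WeiboSpiderItem

class WeiboSpider(scrapy.Spider):
    name = 'weibo'
    urls = ['https://weibo.com/']  # placeholder start URLs

    def start_requests(self):
        for url in self.urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        weiboitem = WeiboSpiderItem()
        weiboitem['name'] = '123'
        yield weiboitem  # hand the item to the pipeline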
'''