Download address: https://www.lfd.uci.edu/~gohlke/pythonlibs/
pip install wheel
pip install lxml
pip install pyopenssl
pip install Twisted
pip install pywin32
pip install scrapy
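On Windows, pip occasionally fails to compile Twisted or lxml from source; that is what the download address at the top is for. Pick the .whl matching your Python version and architecture from that page and install it directly (the filename here is only an example):

pip install Twisted-20.3.0-cp38-cp38-win_amd64.whl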
scrapy startproject jandan  # create the project
cd jandan
cd jandan
items.py: holds the scraped data
pipelines.py: the item pipeline file
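For orientation, startproject generates roughly this layout; the two cd commands work because the outer project directory and the inner package are both named jandan:

jandan/
    scrapy.cfg
    jandan/
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/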
Since jandan.net has anti-crawler measures, we need to do some extra work.
settings.py
ROBOTSTXT_OBEY = False   # do not obey the robots.txt protocol
DOWNLOAD_DELAY = 2       # download delay in seconds
DOWNLOAD_TIMEOUT = 15    # download timeout in seconds
COOKIES_ENABLED = False  # disable cookies
DOWNLOADER_MIDDLEWARES = {
    # random request header
    'jandan.middlewares.RandomUserAgent': 100,
    # random proxy IP
    'jandan.middlewares.RandomProxy': 200,
}
# User-Agent list
USER_AGENTS = [
    # Maxthon
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    # Firefox
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    # Chrome
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
]
# proxy IP list
PROXIES = [
    # proxy IPs without a password
    {"ip_port": "119.177.90.103:9999", "user_passwd": ""},
    {"ip_port": "101.132.122.230:3128", "user_passwd": ""},
    # proxy IP with a password
    # {"ip_port": "123.139.56.238:9999", "user_passwd": "root:admin"}
]
# item pipeline: uncomment to enable it
ITEM_PIPELINES = {
    'jandan.pipelines.JandanPipeline': 300,
}
IMAGES_STORE = "images"
middlewares.py
import random
import base64
from jandan.settings import USER_AGENTS
from jandan.settings import PROXIES


class RandomUserAgent(object):
    def process_request(self, request, spider):
        useragent = random.choice(USER_AGENTS)
        request.headers.setdefault("User-Agent", useragent)


class RandomProxy(object):
    def process_request(self, request, spider):
        proxy = random.choice(PROXIES)
        if not proxy["user_passwd"]:
            # proxy without credentials
            request.meta["proxy"] = "http://" + proxy["ip_port"]
        else:
            # b64encode takes a bytes object; in Python 3 str is unicode,
            # so encode it first (the result is bytes again)
            base64_userpasswd = base64.b64encode(proxy["user_passwd"].encode())
            request.meta["proxy"] = "http://" + proxy["ip_port"]
            # the header value is built as a string, so decode the bytes back
            request.headers["Proxy-Authorization"] = "Basic " + base64_userpasswd.decode()
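To see what the Proxy-Authorization header ends up looking like, here is a minimal standalone check of the base64 step (the credentials are the made-up root:admin pair from the commented-out PROXIES entry):

import base64

userpasswd = "root:admin"                      # hypothetical user:password pair
token = base64.b64encode(userpasswd.encode())  # bytes in, bytes out
header = "Basic " + token.decode()             # decode back to str for the header
print(header)                                  # Basic cm9vdDphZG1pbg==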
items.py
import scrapy


class JandanItem(scrapy.Item):
    name = scrapy.Field()
    url = scrapy.Field()
scrapy genspider -t crawl jd jandan.net  # create a spider from the crawl template (CrawlSpider)
This automatically creates jd.py under spiders/. The page content is rendered by JavaScript, so the spider drives PhantomJS and uses BeautifulSoup to locate the elements and pull out the data.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jandan.items import JandanItem
from selenium import webdriver
from bs4 import BeautifulSoup as bs4


class JdSpider(CrawlSpider):
    name = 'jd'
    allowed_domains = ['jandan.net']
    start_urls = ['http://jandan.net/pic/page-1#comments/']

    rules = (
        Rule(LinkExtractor(allow=r'pic/page-\d+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = JandanItem()
        # render the JS page with PhantomJS, then hand the HTML to BeautifulSoup
        driver = webdriver.PhantomJS()
        driver.get(response.url)
        soup = bs4(driver.page_source, 'html.parser')
        driver.quit()  # release the browser once we have the page source
        all_data = soup.find_all('div', {'class': 'row'})
        for i in all_data:
            name = i.find("strong")
            item["name"] = name.get_text().strip()
            link = i.find('a', {'class': 'view_img_link'})
            url = link.get("href")
            if len(url) == 0:
                return
            item["url"] = "http://" + url.split("//")[-1]
            yield item
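The selectors above assume every picture post sits in a div.row containing a strong tag (the poster's name) and an a.view_img_link (the full-size image link). A quick standalone check of that extraction logic, with hand-written HTML that only mirrors the shape the spider expects:

from bs4 import BeautifulSoup

html = """
<div class="row">
  <strong>someuser</strong>
  <a class="view_img_link" href="//example.com/large/demo.jpg">view original</a>
</div>
"""
soup = BeautifulSoup(html, 'html.parser')
row = soup.find('div', {'class': 'row'})
print(row.find('strong').get_text().strip())                 # someuser
url = row.find('a', {'class': 'view_img_link'}).get('href')
print("http://" + url.split("//")[-1])                       # http://example.com/large/demo.jpg

Also note that current Selenium releases have dropped PhantomJS support, so on a recent setup the same code would need headless Chrome or Firefox instead.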
pipelines.py
import json
import os
import requests
from scrapy.conf import settings


class JandanPipeline(object):
    # Save items to a JSON file
    # def __init__(self):
    #     self.filename = open("jandan.json", "wb")
    #     self.num = 0
    #
    # def process_item(self, item, spider):
    #     text = json.dumps(dict(item), ensure_ascii=False) + "\n"
    #     self.filename.write(text.encode("utf-8"))
    #     self.num += 1
    #     return item
    #
    # def close_spider(self, spider):
    #     self.filename.close()
    #     print("Got " + str(self.num) + " items in total")

    # Download the images to local disk
    def process_item(self, item, spider):
        if 'url' in item:
            dir_path = settings["IMAGES_STORE"]
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            su = "." + item["url"].split(".")[-1]
            path = item["name"] + su
            new_path = '%s/%s' % (dir_path, path)
            if not os.path.exists(new_path):
                with open(new_path, 'wb') as handle:
                    response = requests.get(item["url"], stream=True)
                    for block in response.iter_content(1024):
                        if not block:
                            break
                        handle.write(block)
        return item
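Note that scrapy.conf was deprecated long ago and is removed in newer Scrapy releases. If that import fails, the standard replacement is the from_crawler hook; a minimal sketch:

class JandanPipeline(object):
    def __init__(self, images_store):
        self.images_store = images_store

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this when building the pipeline and passes the crawler,
        # whose settings object replaces the old scrapy.conf.settings
        return cls(images_store=crawler.settings.get("IMAGES_STORE"))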
scrapy crawl jd  # start the spider
scrapy shell "https://hr.tencent.com/position.php?&start=0"  # send a request from an interactive shell
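Inside the shell you get a live response object to experiment with before committing selectors to spider code (the expressions below are just illustrations):

>>> response.status                                   # HTTP status of the fetched page
>>> response.xpath('//title/text()').extract_first()  # grab the page title
>>> fetch("http://jandan.net/pic")                    # fetch another URL in the same session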
Here is my GitHub address; I update the projects there regularly:
https://github.com/bjptw/workspace