Download address: https://www.lfd.uci.edu/~gohlke/pythonlibs/
pip install wheel
pip install lxml
pip install pyopenssl
pip install Twisted
pip install pywin32
pip install scrapy
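On Windows, pip occasionally fails to compile Twisted or lxml from source; that is what the download address at the top is for. Pick the .whl matching your Python version and architecture from that page and install it directly (the filename here is only an example):

pip install Twisted-20.3.0-cp38-cp38-win_amd64.whl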
scrapy startproject jandan  # create the project
cd jandan
cd jandan
items.py: holds the scraped data
pipelines.py: the item pipeline file
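For orientation, startproject generates roughly this layout; the two cd commands work because the outer project directory and the inner package are both named jandan:

jandan/
    scrapy.cfg
    jandan/
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/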
Since jandan.net has anti-crawler measures, we need to do some extra work.
settings.py
ROBOTSTXT_OBEY = False   # do not obey the robots.txt protocol
DOWNLOAD_DELAY = 2       # download delay in seconds
DOWNLOAD_TIMEOUT = 15    # download timeout in seconds
COOKIES_ENABLED = False  # disable cookies
DOWNLOADER_MIDDLEWARES = {
    # random request header
    'jandan.middlewares.RandomUserAgent': 100,
    # random proxy IP
    'jandan.middlewares.RandomProxy': 200,
}
# User-Agent list
USER_AGENTS = [
    # Maxthon
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    # Firefox
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    # Chrome
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
]
# proxy IP list
PROXIES = [
    # proxy IPs without a password
    {"ip_port": "119.177.90.103:9999", "user_passwd": ""},
    {"ip_port": "101.132.122.230:3128", "user_passwd": ""},
    # proxy IP with a password
    # {"ip_port": "123.139.56.238:9999", "user_passwd": "root:admin"}
]
# item pipeline: uncomment to enable it
ITEM_PIPELINES = {
    'jandan.pipelines.JandanPipeline': 300,
}
IMAGES_STORE = "images"
middlewares.py
import random
import base64
from jandan.settings import USER_AGENTS
from jandan.settings import PROXIES


class RandomUserAgent(object):
    def process_request(self, request, spider):
        useragent = random.choice(USER_AGENTS)
        request.headers.setdefault("User-Agent", useragent)


class RandomProxy(object):
    def process_request(self, request, spider):
        proxy = random.choice(PROXIES)
        if not proxy["user_passwd"]:
            # proxy without credentials
            request.meta["proxy"] = "http://" + proxy["ip_port"]
        else:
            # b64encode takes a bytes object; in Python 3 str is unicode,
            # so encode it first (the result is bytes again)
            base64_userpasswd = base64.b64encode(proxy["user_passwd"].encode())
            request.meta["proxy"] = "http://" + proxy["ip_port"]
            # the header value is built as a string, so decode the bytes back
            request.headers["Proxy-Authorization"] = "Basic " + base64_userpasswd.decode()
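To see what the Proxy-Authorization header ends up looking like, here is a minimal standalone check of the base64 step (the credentials are the made-up root:admin pair from the commented-out PROXIES entry):

import base64

userpasswd = "root:admin"                      # hypothetical user:password pair
token = base64.b64encode(userpasswd.encode())  # bytes in, bytes out
header = "Basic " + token.decode()             # decode back to str for the header
print(header)                                  # Basic cm9vdDphZG1pbg==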
items.py
import scrapy


class JandanItem(scrapy.Item):
    name = scrapy.Field()
    url = scrapy.Field()
scrapy genspider -t crawl jd jandan.net  # create a spider from the crawl template (CrawlSpider)
This automatically creates jd.py under spiders/. The page content is rendered by JavaScript, so the spider drives PhantomJS and uses BeautifulSoup to locate the elements and pull out the data.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jandan.items import JandanItem
from selenium import webdriver
from bs4 import BeautifulSoup as bs4


class JdSpider(CrawlSpider):
    name = 'jd'
    allowed_domains = ['jandan.net']
    start_urls = ['http://jandan.net/pic/page-1#comments/']

    rules = (
        Rule(LinkExtractor(allow=r'pic/page-\d+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = JandanItem()
        # render the JS page with PhantomJS, then hand the HTML to BeautifulSoup
        driver = webdriver.PhantomJS()
        driver.get(response.url)
        soup = bs4(driver.page_source, 'html.parser')
        driver.quit()  # release the browser once we have the page source
        all_data = soup.find_all('div', {'class': 'row'})
        for i in all_data:
            name = i.find("strong")
            item["name"] = name.get_text().strip()
            link = i.find('a', {'class': 'view_img_link'})
            url = link.get("href")
            if len(url) == 0:
                return
            item["url"] = "http://" + url.split("//")[-1]
            yield item
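The selectors above assume every picture post sits in a div.row containing a strong tag (the poster's name) and an a.view_img_link (the full-size image link). A quick standalone check of that extraction logic, with hand-written HTML that only mirrors the shape the spider expects:

from bs4 import BeautifulSoup

html = """
<div class="row">
  <strong>someuser</strong>
  <a class="view_img_link" href="//example.com/large/demo.jpg">view original</a>
</div>
"""
soup = BeautifulSoup(html, 'html.parser')
row = soup.find('div', {'class': 'row'})
print(row.find('strong').get_text().strip())                 # someuser
url = row.find('a', {'class': 'view_img_link'}).get('href')
print("http://" + url.split("//")[-1])                       # http://example.com/large/demo.jpg

Also note that current Selenium releases have dropped PhantomJS support, so on a recent setup the same code would need headless Chrome or Firefox instead.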
pipelines.py
import json
import os
import requests
from scrapy.conf import settings


class JandanPipeline(object):
    # Save items to a JSON file
    # def __init__(self):
    #     self.filename = open("jandan.json", "wb")
    #     self.num = 0
    #
    # def process_item(self, item, spider):
    #     text = json.dumps(dict(item), ensure_ascii=False) + "\n"
    #     self.filename.write(text.encode("utf-8"))
    #     self.num += 1
    #     return item
    #
    # def close_spider(self, spider):
    #     self.filename.close()
    #     print("Got " + str(self.num) + " items in total")

    # Download the images to local disk
    def process_item(self, item, spider):
        if 'url' in item:
            dir_path = settings["IMAGES_STORE"]
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            su = "." + item["url"].split(".")[-1]
            path = item["name"] + su
            new_path = '%s/%s' % (dir_path, path)
            if not os.path.exists(new_path):
                with open(new_path, 'wb') as handle:
                    response = requests.get(item["url"], stream=True)
                    for block in response.iter_content(1024):
                        if not block:
                            break
                        handle.write(block)
        return item
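Note that scrapy.conf was deprecated long ago and is removed in newer Scrapy releases. If that import fails, the standard replacement is the from_crawler hook; a minimal sketch:

class JandanPipeline(object):
    def __init__(self, images_store):
        self.images_store = images_store

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this when building the pipeline and passes the crawler,
        # whose settings object replaces the old scrapy.conf.settings
        return cls(images_store=crawler.settings.get("IMAGES_STORE"))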
scrapy crawl jd  # start the spider
scrapy shell "https://hr.tencent.com/position.php?&start=0"  # send a request from an interactive shell
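Inside the shell you get a live response object to experiment with before committing selectors to spider code (the expressions below are just illustrations):

>>> response.status                                   # HTTP status of the fetched page
>>> response.xpath('//title/text()').extract_first()  # grab the page title
>>> fetch("http://jandan.net/pic")                    # fetch another URL in the same session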
Here is my GitHub address; I update the projects there regularly:
https://github.com/bjptw/workspace