First, install the scrapy module.
There's a surprise here:
https://www.cnblogs.com/bobo-zhang/p/10068997.html
Create a Scrapy project
First, open a terminal, cd into a folder of your choice, and run:
scrapy startproject jy   (jy is the project name)
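After the command finishes, Scrapy generates a project skeleton roughly like the following (the exact files can vary slightly between Scrapy versions):

jy/
├── scrapy.cfg          # deployment configuration
└── jy/
    ├── __init__.py
    ├── items.py        # item (data structure) definitions
    ├── middlewares.py  # downloader / spider middlewares
    ├── pipelines.py    # item pipelines for persistent storage
    ├── settings.py     # project settings
    └── spiders/        # spider files go here
        └── __init__.py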
Modify the settings.py configuration:
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
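Optionally (this setting is not in the original configuration above, just a common convenience), you can keep the console output readable by only printing errors:

LOG_LEVEL = 'ERROR'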
cd into the spiders directory and create a spider file from the terminal:
scrapy genspider myjy www.xxx.com   (myjy is the spider file name)
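genspider creates myjy.py under the spiders directory with boilerplate roughly like this (the exact template depends on the Scrapy version):

# -*- coding: utf-8 -*-
import scrapy


class MyjySpider(scrapy.Spider):
    name = 'myjy'
    allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.xxx.com/']

    def parse(self, response):
        # response is the downloaded page for each start URL
        pass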
Now write our first piece of code in that file.
# Parsing + persistent storage
# -*- coding: utf-8 -*-
import scrapy


class FirstSpider(scrapy.Spider):
    # Name of the spider
    name = 'first'
    # Allowed domains
    # allowed_domains = ['www.xxx.com']
    # List of start URLs
    start_urls = ['https://www.qiushibaike.com/text/']

    # Basic parsing of the data only
    # def parse(self, response):
    #     div_list = response.xpath('//div[@id="content-left"]/div')
    #     for div in div_list:
    #         # author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
    #         # If the xpath is guaranteed to return a single-element list, extract_first() can be used; otherwise extract() must be used
    #         author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
    #         content = div.xpath('./a[1]/div/span//text()').extract()
    #         content = ''.join(content)
    #         print(author, content)

    # Parsing + persistent storage
    # 1. Persistence via terminal command:
    #    only the return value of the parse method can be persisted to a local text file
    # 2. Persistence via pipelines

    # 1. Persistence via terminal command
    def parse(self, response):
        div_list = response.xpath('//div[@id="content-left"]/div')
        all_data = []
        for div in div_list:
            # author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
            # If the xpath is guaranteed to return a single-element list, extract_first() can be used; otherwise extract() must be used
            author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
            content = div.xpath('./a[1]/div/span//text()').extract()
            content = ''.join(content)
            dic = {
                'author': author,
                'content': content
            }
            all_data.append(dic)
        return all_data
Finally, run the spider (the argument to scrapy crawl is the spider's name attribute):
scrapy crawl myjy
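To actually use option 1 (persistence via terminal command) from the comments above, add the -o option so the dicts returned by parse are written to a file; the spider name must match the name attribute ('first' in the code above), and qiubai.csv is just an example output file name:

scrapy crawl first -o qiubai.csv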
# Parsing + pipeline-based persistent storage
First, write the following in the spider file (the parse logic):
# -*- coding: utf-8 -*-
import scrapy
from bossPro.items import BossproItem


class BossSpider(scrapy.Spider):
    name = 'boss'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.zhipin.com/job_detail/?query=python%E7%88%AC%E8%99%AB&scity=101010100&industry=&position=']

    url = 'https://www.zhipin.com/c101010100/?query=python爬蟲&page=%d&ka=page-2'
    page = 1

    # Parsing + pipeline-based persistent storage
    def parse(self, response):
        li_list = response.xpath('//div[@class="job-list"]/ul/li')
        for li in li_list:
            job_name = li.xpath('.//div[@class="info-primary"]/h3/a/div/text()').extract_first()
            salary = li.xpath('.//div[@class="info-primary"]/h3/a/span/text()').extract_first()
            company = li.xpath('.//div[@class="company-text"]/h3/a/text()').extract_first()
            # Instantiate an item object
            item = BossproItem()
            # Pack all the parsed data into the item object
            item['job_name'] = job_name
            item['salary'] = salary
            item['company'] = company
            # Submit the item to the pipelines
            yield item

        if self.page <= 3:
            print('if executed!!!')
            self.page += 1
            new_url = format(self.url % self.page)
            print(new_url)
            # Send the next page request manually
            yield scrapy.Request(url=new_url, callback=self.parse)
Configure the items.py file, which acts as the data structure:
import scrapy


class BossproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    job_name = scrapy.Field()
    salary = scrapy.Field()
    company = scrapy.Field()
Write the following in pipelines.py:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
from redis import Redis


class BossproPipeline(object):
    fp = None

    def open_spider(self, spider):
        print('Spider started......')
        self.fp = open('./boss.txt', 'w', encoding='utf-8')

    def close_spider(self, spider):
        print('Spider finished......')
        self.fp.close()

    # This method is called once every time the spider submits an item to the pipeline.
    # Parameter: item is the item object received by the pipeline
    def process_item(self, item, spider):
        # print(item)
        self.fp.write(item['job_name'] + ':' + item['salary'] + ':' + item['company'] + '\n')
        return item  # pass the item on to the next pipeline class to be executed


class mysqlPileLine(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='', db='scrapy', charset="utf8")
        print(self.conn)

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        # print(item)
        # print('insert into boss values ("%s","%s","%s")' % (item['job_name'], item['salary'], item['company']))
        try:
            print('insert into boss values ("%s","%s","%s")' % (item['job_name'], item['salary'], item['company']))
            self.cursor.execute('insert into boss values ("%s","%s","%s")' % (item['job_name'], item['salary'], item['company']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()


class redisPileLine(object):
    conn = None

    def open_spider(self, spider):
        self.conn = Redis(host='127.0.0.1', port=6379)
        print(self.conn)

    def process_item(self, item, spider):
        # print(item)
        dic = {
            'name': item['job_name'],
            'salary': item['salary'],
            'company': item['company']
        }
        # note: redis-py 3.x and later only accept bytes/str/numbers as values,
        # so you may need to serialize first, e.g. self.conn.lpush('boss', json.dumps(dic))
        self.conn.lpush('boss', dic)
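A minimal sketch (not part of the original post) to check that the Redis pipeline really stored the data, assuming Redis is running locally on the default port and using the 'boss' list key from redisPileLine above:

from redis import Redis

conn = Redis(host='127.0.0.1', port=6379)
print(conn.llen('boss'))             # number of items pushed by the pipeline
for raw in conn.lrange('boss', 0, -1):
    print(raw)                       # each entry exactly as it was lpush-ed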
Don't forget to enable the pipeline in settings.py (the number is the priority; lower values run first):
ITEM_PIPELINES = {
    # 'bossPro.pipelines.BossproPipeline': 300,
    'bossPro.pipelines.redisPileLine': 301,
    # 'bossPro.pipelines.mysqlPileLine': 302,
}
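Then run the boss spider the same way (boss is the name attribute defined in the spider above):

scrapy crawl boss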