爬蟲之拉勾網職位獲取

重點在於演示 urllib.request.Request() 請求中各項參數的書寫格式，譬如：url、data、headers……

Demo演示(POST請求):

import urllib.request
import urllib.parse
import json, jsonpath, csv

# Demo: POST to Lagou's position-search Ajax endpoint with urllib, showing how
# the url / data / headers arguments of urllib.request.Request() are written.
# Fetches result pages 1-4 for the keyword below, keeps a handful of fields
# per job, then dumps them to 03.json and 04.csv.

# The query string already pins the city (URL-encoded "上海") and
# needAddtionalResult=false, so no extra params need to be appended.
url = "https://www.lagou.com/jobs/positionAjax.json?city=%E4%B8%8A%E6%B5%B7&needAddtionalResult=false"

# Request headers copied from a browser session.
# - "Accept-Encoding" is intentionally NOT sent: urllib does not decompress
#   responses, so advertising gzip/deflate/br could make json.loads() fail.
# - "Content-Length" is intentionally NOT sent: urllib computes it from the
#   encoded body, whereas a hard-coded value breaks as soon as the form
#   changes length (e.g. a two-digit page number or another keyword).
# NOTE(review): the Cookie below is a captured login session; it presumably
# expires and should be replaced with your own before running - confirm.
headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Cookie": "_ga=GA1.2.1963509933.1531996888; user_trace_token=20180719184127-4a8c7914-8b40-11e8-9eb6-525400f775ce; LGUID=20180719184127-4a8c7df2-8b40-11e8-9eb6-525400f775ce; JSESSIONID=ABAAABAAAIAACBI0F0B14254DA54E3CCF3B1F22FE32B179; _gid=GA1.2.1918046323.1536408617; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1536408620; X_HTTP_TOKEN=339034308973d0bd323cc0b9b6b3203a; LG_LOGIN_USER_ID=24096d6ba723e146bd326de981ab924b23c1f21775136c3a8be953e855211e61; _putrc=95519B7FB60FCF58123F89F2B170EADC; login=true; unick=%E9%A9%AC%E7%BB%A7%E4%B8%9A; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=55; gate_login_token=3b4fa15daef090780ae377bbcd66dc83af9af0cc6a7f1dd697770790f3b9f9ef; index_location_city=%E4%B8%8A%E6%B5%B7; TG-TRACK-CODE=search_code; _gat=1; LGSID=20180908221639-cd6d2a72-b371-11e8-b62b-5254005c3644; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_python%3FlabelWords%3D%26fromSearch%3Dtrue%26suginput%3D; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_%25E7%2588%25AC%25E8%2599%25AB%3Fcity%3D%25E4%25B8%258A%25E6%25B5%25B7%26cl%3Dfalse%26fromSearch%3Dtrue%26labelWords%3D%26suginput%3D; SEARCH_ID=e559a417b4464fd9bc0b439a67ef0a5a; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1536416580; LGRID=20180908222259-afed9b74-b372-11e8-b62b-5254005c3644",
    "Host": "www.lagou.com",
    "Origin": "https://www.lagou.com",
    "Referer": "https://www.lagou.com/jobs/list_%E7%88%AC%E8%99%AB?city=%E4%B8%8A%E6%B5%B7&cl=false&fromSearch=true&labelWords=&suginput=",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "X-Anit-Forge-Code": "0",
    "X-Anit-Forge-Token": "None",
    "X-Requested-With": "XMLHttpRequest",
}

# Accumulates one dict per job posting across all requested pages.
list_position = []
for pn in range(1, 5):
    # Form fields of the POST body: "pn" is the page number, "kd" the keyword.
    form = {
        "first": "false",
        "pn": pn,
        "kd": "爬蟲",
    }
    # Request() needs the body as bytes: url-encode first, then encode.
    data = urllib.parse.urlencode(form).encode('utf-8')
    req = urllib.request.Request(url, data=data, headers=headers)
    print('正在請求第%d頁' % pn)
    # Context manager guarantees the HTTP response is closed.
    with urllib.request.urlopen(req) as resp:
        str_data = resp.read()
    # Debug dump of the raw response; overwritten on every page.
    with open('03.html', 'wb') as f:
        f.write(str_data)
    # Parse the JSON payload into Python objects.
    data_list = json.loads(str_data)
    # jsonpath() returns False (not an empty list) when nothing matches,
    # so guard before indexing to avoid a TypeError on an unexpected reply.
    matches = jsonpath.jsonpath(data_list, "$..result")
    job_list = matches[0] if matches else []

    for item in job_list:
        list_position.append({
            'positionName': item.get('positionName'),
            'createTime': item.get('createTime'),
            'url': 'https://www.lagou.com/jobs/' + str(item.get('positionId')) + '.html',
            'salary': item.get('salary'),
            'workYear': item.get('workYear'),
            'companySize': item.get('companySize'),
        })

# Save to a JSON file (explicit encoding; file is closed deterministically).
with open('03.json', 'w', encoding='utf-8') as f:
    json.dump(list_position, f)

# Save to a CSV file. The encoding is explicit because the original author
# hit "'gbk' codec can't encode character '\u200b'" without it; newline=''
# is what the csv module requires to avoid blank lines between rows.
with open('04.csv', 'w', encoding='utf-8', newline='') as f:
    csv_writer = csv.writer(f)
    # With no scraped rows there is no header to derive; leave the file empty.
    if list_position:
        sheets = list(list_position[0].keys())  # header row
        row_content = [list(item.values()) for item in list_position]  # data rows
        try:
            csv_writer.writerow(sheets)
            csv_writer.writerows(row_content)
        except (OSError, UnicodeError, csv.Error) as e:
            # Best-effort export, as in the original: report and carry on.
            print(e)


 1 import urllib.request
 2 import urllib.parse
 3 import json, jsonpath, csv
 4 
 5 url = "https://www.lagou.com/jobs/positionAjax.json?city=%E4%B8%8A%E6%B5%B7&needAddtionalResult=false"
 6 headers = {
 7     "Accept": "application/json, text/javascript, */*; q=0.01",
 8     "Accept-Encoding": "gzip, deflate, br",
 9     "Accept-Language": "zh-CN,zh;q=0.9",
10     "Connection": "keep-alive",
11     "Content-Length": "38",
12     "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
13     "Cookie": "_ga=GA1.2.1963509933.1531996888; user_trace_token=20180719184127-4a8c7914-8b40-11e8-9eb6-525400f775ce; LGUID=20180719184127-4a8c7df2-8b40-11e8-9eb6-525400f775ce; JSESSIONID=ABAAABAAAIAACBI0F0B14254DA54E3CCF3B1F22FE32B179; _gid=GA1.2.1918046323.1536408617; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1536408620; X_HTTP_TOKEN=339034308973d0bd323cc0b9b6b3203a; LG_LOGIN_USER_ID=24096d6ba723e146bd326de981ab924b23c1f21775136c3a8be953e855211e61; _putrc=95519B7FB60FCF58123F89F2B170EADC; login=true; unick=%E9%A9%AC%E7%BB%A7%E4%B8%9A; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=55; gate_login_token=3b4fa15daef090780ae377bbcd66dc83af9af0cc6a7f1dd697770790f3b9f9ef; index_location_city=%E4%B8%8A%E6%B5%B7; TG-TRACK-CODE=search_code; _gat=1; LGSID=20180908221639-cd6d2a72-b371-11e8-b62b-5254005c3644; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_python%3FlabelWords%3D%26fromSearch%3Dtrue%26suginput%3D; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_%25E7%2588%25AC%25E8%2599%25AB%3Fcity%3D%25E4%25B8%258A%25E6%25B5%25B7%26cl%3Dfalse%26fromSearch%3Dtrue%26labelWords%3D%26suginput%3D; SEARCH_ID=e559a417b4464fd9bc0b439a67ef0a5a; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1536416580; LGRID=20180908222259-afed9b74-b372-11e8-b62b-5254005c3644",
14     "Host": "www.lagou.com",
15     "Origin": "https://www.lagou.com",
16     "Referer": "https://www.lagou.com/jobs/list_%E7%88%AC%E8%99%AB?city=%E4%B8%8A%E6%B5%B7&cl=false&fromSearch=true&labelWords=&suginput=",
17     "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
18     "X-Anit-Forge-Code": "0",
19     "X-Anit-Forge-Token": "None",
20     "X-Requested-With": "XMLHttpRequest"}
21 # params = {"city": "上海", "needAddtionalResult": "false"}
22 list_position = []
23 for pn in range(1, 5):
24     data = {
25         "first": "false",
26         "pn": pn,
27         "kd": "爬蟲"
28     }
29     # params = urllib.parse.urlencode(params)
30     # url = url + params
31     data = urllib.parse.urlencode(data).encode('utf-8')
32     req = urllib.request.Request(url, data=data, headers=headers)
33     print('正在請求第%d頁' % pn)
34     str_data = urllib.request.urlopen(req).read()
35     with open('03.html', 'wb') as f:
36         f.write(str_data)
37     # 轉換成python對象
38     data_list = json.loads(str_data)
39     job_list = jsonpath.jsonpath(data_list, "$..result")[0]
40 
41     for item in job_list:
42         position_dict = {}
43         position_dict['positionName'] = item.get('positionName')
44         position_dict['createTime'] = item.get('createTime')
45         position_dict['url'] = 'https://www.lagou.com/jobs/' + str(item.get('positionId')) + '.html'
46 
47         position_dict['salary'] = item.get('salary')
48         position_dict['workYear'] = item.get('workYear')
49         position_dict['companySize'] = item.get('companySize')
50         list_position.append(position_dict)
51 
52 # 保存到json文件
53 json.dump(list_position, open('03.json', 'w'))
54 
55 # 保存到csv文件  'gbk' codec can't encode character '\u200b' in position 0: illegal multibyte seq
56 csv_writer = csv.writer(open('04.csv', 'w', encoding='utf-8'))
57 sheets = list_position[0].keys()  # 表頭
58 row_content = []
59 for item in list_position:
60     row_content.append(item.values())  # 內容
61 try:
62     csv_writer.writerow(sheets)
63     csv_writer.writerows(row_content)
64 except Exception as e:
65     print(e)

轉載于:https://www.cnblogs.com/We612/p/9978288.html

本文來自互聯網用戶投稿,該文觀點僅代表作者本人,不代表本站立場。本站僅提供信息存儲空間服務,不擁有所有權,不承擔相關法律責任。
如若轉載,請注明出處:http://www.pswp.cn/news/250231.shtml
繁體地址,請注明出處:http://hk.pswp.cn/news/250231.shtml
英文地址,請注明出處:http://en.pswp.cn/news/250231.shtml

如若內容造成侵權/違法違規/事實不符,請聯系多彩編程網進行投訴反饋email:809451989@qq.com,一經查實,立即刪除!

相關文章

小程序 --- 點擊放大功能、獲取位置信息、文字樣式省略、頁面跳轉(navigateTo)

1. 點擊放大功能的實現 需求: 點擊輪播圖中的圖片會實現放大預覽的功能。首先有輪播圖的樣式如下 <!-- pages/goods_detail/index.wxml --> <!-- 輪播圖 --> <view class"detail_swiper"><swiperautoplaycircularindicator-dots><swip…

Axure實現多用戶注冊驗證

*****多用戶登錄驗證***** 一、（常規想法）方法：工作量較大，做起來繁瑣 1、當用戶名和密碼相同時怎么區分兩者，使用冒號和括號來區分： eg. (admin:123456)(123456:demo)(zhang:san);由此得出前面是括號后面是…

前端插件網址

http://www.swiper.com.cn/轉載于:https://www.cnblogs.com/luchuangao/p/9088057.html

python --- opencv部分學習

1. OpenCV 1.1 opencv概念 OpenCV是一個基于BSD許可(開源)發行的跨平臺計算機視覺庫可以運行在Linux、Windows、Android和Mac OS操作系統上它輕量級而且高效 – 有一系列C函數和少量 C 類構成同時提供了 Python、Ruby、MATLAB等語言的接口實現了圖像處理和計算機視覺方面的很…

hive與hbase集成

環境: hadoop2.7.7 hive3.1.0 hbase2.0.2 1.jar包拷貝(之所以用這種方式,是因為這種方式最為穩妥,最開始用的軟連接的方式,總是卻少jar包)到hive的lib目錄下刪除所有hbase相關的jar rm -rf hbase-*.jar 接著從hbase的lib目錄下拷貝所有的hbase相關jar cp -a hbasehome/lib/hba…

Winform(C#)輸入完畢后,按Enter鍵觸發Button事件

如在輸入“用戶名”和“密碼”之后，有些人習慣按“回車鍵”來代替頁面上的“確定”按鈕，那么這一功能在Winform(C#)里如何實現呢？ 觸發密碼文本框的KeyDown事件，代碼如下： [c-sharp] view plaincopy private void txtP…

Maximum Xor Secondary(單調棧好題)

Maximum Xor Secondary CodeForces - 280B Bike loves looking for the second maximum element in the sequence. The second maximum element in the sequence of distinct numbers x1,?x2,?...,?xk (k?>?1) is such maximum element xj, that the following inequa…

python --- udp的使用

1. python的模塊導入規則 參考 1.1 系統自帶模塊 系統自帶的模塊直接import導入 import time import unittest1.2 第三方下載模塊 第三方下載模塊也可以直接導入 import HTMLTestRunner import requests1.3 導入模塊的部分函數或類 from time import sleep,strftime fro…

雜項-公司:唯品會

ylbtech-雜項-公司：唯品會唯品會公司成立于2008年08月，2012年3月23日登陸美國紐約證券交易所上市（股票代碼：VIPS）。成為華南第一家在美國紐交所上市的電子商務企業。主營B2C商城唯品會名牌折扣網站是一家致力于打造中高…

python --- 使用socket創建tcp服務

1. 網絡-tcp 參考 1.1 tcp簡介 介紹 TCP協議,傳輸控制協議(英語: Transmission Control Protocol, 縮寫為TCP)是一種面向連接的、可靠的、基于字節流的傳輸層通信協議,由IETF的RFC 793定義. TCP通信需要經過創建連接、數據傳送、終止連接三個步驟. TCP通信模型中,在通信開…

Linux基本的操作

一、為什么我們要學習Linux 相信大部分人的PC端都是用Windows系統的，那我們為什么要學習Linux這個操作系統呢？？？Windows圖形化界面做得這么好，日常基本使用的話，學習成本幾乎為零。 而Linux不一樣…

匯編語言 實驗4

實驗4 實驗內容1：綜合使用 loop,[bx]，編寫完整匯編程序，實現向內存 b800:07b8 開始的連續 16 個 字單元重復填充字數據 0403H；修改0403H為0441H，再次運行 步驟1：在記事本中編寫好temp.asm文件 步驟2…

python --- 線程

1. 多任務 - 線程 參考 首先考慮一個沒有多任務的程序: import timedef sing():# 唱歌 5 秒鐘for i in range(5):print("-----菊花臺ing....-----")time.sleep(1)def dance():# 跳舞 5秒鐘for i in range(5):print("-----跳舞.....-----")time.sleep(5)d…

Python 鏈接匯總

MNIST手寫識別 轉載于:https://www.cnblogs.com/bycnboy/p/9095199.html

17種常用的JS正則表達式 非負浮點數 非負正數

<input typetext idSYS_PAGE_JumpPage nameSYS_PAGE_JumpPage size3 maxlength5 οnkeyupthis.valuethis.value.replace(/[^1-9]\D*$/,"") οndragenter"return false" οnpaste"return !clipboardData.getData(text).match(/\D/)"" sty…

python --- 使用conda配置pytorch

使用Conda配置PyTorch 1. 添加channels 下載地址 $ conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ $ conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/ $ conda config --add channels htt…

LDAP第三天 MySQL+LDAP 安裝

https://www.easysoft.com/applications/openldap/back-sql-odbc.html OpenLDAP 使用 SQLServer 和 Oracle 數據庫。 https://www.cnblogs.com/bigbrotherer/p/7251372.html          CentOS7安裝OpenLDAPMySQLPHPLDAPadmin 1.安裝和設置數據庫 在CentOS7下&…

Myeclipse連接Mysql數據庫時報錯:Error while performing database login with the pro driver:unable...

driver template: Mysql connector/j（下拉框進行選擇） driver name: 任意填，最好是數據庫名稱，方便查找 connection URL: jdbc:mysql://localhost:3306/programmableweb User name: 用戶名 password: 密碼 Driver jars: 添加jar包…

Centos6.5靜態IP設置

1.創建新的虛擬機 2.打開終端&#xff0c;打開/etc/sysconfig/network-scripts/ifcfg-eth0文件 3.將BOOTPROTOstatic&#xff0c;原值為dhcp 4.添加 IPADDR192.168.43.125  #靜態IP GATEWAY192.168.43.1  #網關 NETMASK255.255.255.0  #子網掩碼 NETWORK192.168.43.0  …

matlab --- 圖像處理基礎

MATLAB圖像處理 1. 數字圖像處理 參考 數字圖像處理(Digital Image Processing)又稱為計算機圖像處理,是一種將圖像信號數字化利用計算進行處理的過程。隨著計算機科學、電子學和光學的發展,數字圖像處理已經廣泛的應用到諸多領域之中。本小節主要介紹圖像的概念、分類和數字…