python爬取知網論文關鍵詞_Python爬蟲根據關鍵詞爬取知網論文摘要并保存到數據庫中...

由于實驗室需要一些語料做研究，語料要求是知網上的論文摘要，但是目前最新版的知網爬起來有些麻煩，所以我利用的是知網的另外一個搜索接口

搜索出來的結果和知網上的結果幾乎一樣

在這個基礎上，我簡單看了一下網頁的結構，很容易就能寫出爬取的代碼（這是最基礎的版本，相當不完善，其他功能可自行增加）

網頁的結構還是很清晰的

摘要信息也很清晰

我使用的是 pymysql 連接的數據庫，效率也還可以

下面直接貼代碼：

# -*- coding: utf-8 -*-

import time

import re

import random

import requests

from bs4 import BeautifulSoup

import pymysql

# Connection settings for the MySQL database that receives the scraped rows.
# The empty strings are placeholders that must be filled in before running.
_DB_SETTINGS = {
    "host": "",
    "user": "",
    "password": "",
    "db": "",
    "port": 3306,
    "charset": "utf8",  # note: the charset name is "utf8", not "utf-8"
}
connection = pymysql.connect(**_DB_SETTINGS)

# Obtain the cursor used by the rest of the script to run SQL statements.
cursor = connection.cursor()

#url = 'http://epub.cnki.net/grid2008/brief/detailj.aspx?filename=RLGY201806014&dbname=CJFDLAST2018'

# These headers must be sent with the request; otherwise the site redirects
# the request to a different page.
# NOTE(review): the fixed "Host" value is sent with every request that uses
# this dict — confirm it matches the host of each URL being fetched.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Connection": "keep-alive",
    "Host": "www.cnki.net",
    "Referer": "http://search.cnki.net/search.aspx?q=%E4%BD%9C%E8%80%85%E5%8D%95%E4%BD%8D%3a%E6%AD%A6%E6%B1%89%E5%A4%A7%E5%AD%A6&rank=relevant&cluster=zyk&val=CDFDTOTAL",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
}

# Minimal header set carrying only a browser-like User-Agent string.
headers1 = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
}

def get_url_list(start_url, depth=20):
    """Collect article links from the paginated search results.

    Requests ``depth`` result pages, advancing the ``p`` query parameter by
    15 for each page, and gathers the ``href`` of the first ``<a>`` inside
    every ``div.wz_tab`` element. Links that were already collected are
    skipped. A page that cannot be fetched is reported and skipped; the
    remaining pages are still attempted.

    Args:
        start_url: Search URL without the ``p`` paging parameter.
        depth: Number of result pages to request. Defaults to 20, the value
            that was previously hard-coded.

    Returns:
        A list of unique link strings in the order they were first seen.
    """
    url_list = []
    seen = set()  # O(1) duplicate check; url_list keeps first-seen order

    for i in range(depth):
        url = start_url + "&p=" + str(i * 15)
        try:
            # A timeout keeps one stalled connection from hanging the run.
            search = requests.get(url.replace('\n', ''), headers=headers1, timeout=30)
            search.raise_for_status()
        except requests.RequestException as exc:
            print("爬取第" + str(i) + "頁失敗！", exc)
            continue

        soup = BeautifulSoup(search.text, 'html.parser')
        for art in soup.find_all('div', class_='wz_tab'):
            link = art.find('a')
            href = link.get('href') if link is not None else None
            if not href:
                # Result entry without a usable link: nothing to collect.
                continue
            print(href)
            if href not in seen:
                seen.add(href)
                url_list.append(href)

        print("爬取第" + str(i) + "頁成功！")
        # Random pause between pages so the requests are not sent back to back.
        time.sleep(random.randint(1, 3))

    return url_list

def _joined_link_text(container):
    """Return the text of each ``a.KnowledgeNetLink`` in *container*, every item followed by a space."""
    if container is None:
        return ''
    links = container.find_all('a', class_='KnowledgeNetLink')
    return ''.join(link.get_text() + ' ' for link in links)


def _extract_record(soup):
    """Return ``(title, author, abstract, key)`` from a parsed detail page; missing parts become ''."""
    title_tag = soup.find('title')
    # The text before the first '-' in the <title> element is taken as the title.
    title = title_tag.get_text().split('-')[0] if title_tag is not None else ''

    summary = soup.find('div', class_='summary pad10')
    first_paragraph = summary.find('p') if summary is not None else None
    author = _joined_link_text(first_paragraph)

    abstract_tag = soup.find('span', id='ChDivSummary')
    abstract = abstract_tag.get_text() if abstract_tag is not None else ''

    # Keywords are optional: some articles have none.
    key = _joined_link_text(soup.find('span', id='ChDivKeyWord'))
    return title, author, abstract, key


def get_data(url_list, wordType):
    """Fetch every article page in ``url_list`` and store its fields in the database.

    For each URL the page is downloaded with the module-level ``headers``,
    the title, authors, abstract and keywords are extracted, printed, and
    inserted into the ``cnki`` table through the module-level ``cursor``;
    each insert is committed immediately on ``connection``.

    Blank, ``None`` or ``pymysql.NULL`` entries are skipped. A page that
    cannot be downloaded is reported and skipped, so a record is never built
    from the previous page's data. Fields that cannot be found on a page are
    stored as empty strings.

    Args:
        url_list: Iterable of article page URLs.
        wordType: Search keyword label stored alongside every record.

    Returns:
        None.
    """
    try:
        # Number the URLs from 1 so the progress message reflects the position.
        for i, url in enumerate(url_list, start=1):
            cleaned = url.replace('\n', '').strip() if url else ''
            if not cleaned or cleaned == pymysql.NULL:
                continue

            try:
                # A timeout keeps one stalled connection from hanging the run.
                html = requests.get(cleaned, headers=headers, timeout=30)
                html.raise_for_status()
            except requests.RequestException as exc:
                print("獲取網頁失敗", exc)
                continue

            print(cleaned)
            soup = BeautifulSoup(html.text, 'html.parser')
            title, author, abstract, key = _extract_record(soup)
            if not title or not abstract:
                print("部分獲取失敗")

            print("第" + str(i) + "個url")
            print("【Title】：" + title)
            print("【author】：" + author)
            print("【abstract】：" + abstract)
            print("【key】：" + key)

            # Parameterized insert; the leading NULL fills the table's first column.
            cursor.execute('INSERT INTO cnki VALUES (NULL, %s, %s, %s, %s, %s)', (wordType, title, author, abstract, key))
            # Commit right away so rows already stored survive a later failure.
            connection.commit()
            print()

        print("爬取完畢")
    finally:
        print()

if __name__ == '__main__':
    # Script entry point: for every keyword, collect the result links and
    # store the article data, then close the database connection whether or
    # not the crawl finished.
    try:
        # A tuple (rather than a set) keeps the crawl order deterministic.
        # NOTE(review): "菌群總落" may be a typo for "菌落總數" — confirm the
        # intended search term before changing it.
        for keyword in ("大腸桿菌", "菌群總落", "胭脂紅", "日落黃"):
            wordType = "肉+" + keyword
            start_url = "http://search.cnki.net/search.aspx?q=%s&rank=relevant&cluster=zyk&val=" % wordType
            url_list = get_url_list(start_url)
            print("開始爬取")
            get_data(url_list, wordType)
            print("一種類型爬取完畢")
        print("全部爬取完畢")
    finally:
        connection.close()

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

在這裡我只簡單選了幾個關鍵詞作為實驗；如果要爬取的關鍵詞很多，可以把它們寫在 txt 文件裡，由程序直接讀取，非常方便。

本文來自互聯網用戶投稿，該文觀點僅代表作者本人，不代表本站立場。本站僅提供信息存儲空間服務，不擁有所有權，不承擔相關法律責任。
如若轉載，請注明出處：http://www.pswp.cn/news/539163.shtml
繁體地址，請注明出處：http://hk.pswp.cn/news/539163.shtml
英文地址，請注明出處：http://en.pswp.cn/news/539163.shtml

如若內容造成侵權/違法違規/事實不符，請聯系多彩編程網進行投訴反饋email:809451989@qq.com，一經查實，立即刪除！