爬蟲腳本:遞歸下載 nginx 搭建的文件服務器上的所有文件
Windows 版
需要先安裝 Python 依賴包 requests:pip install requests
import os
import re
from urllib.parse import unquote

import requests

# Start URL to crawl; must end with "/".
index_url = "https://www.aaa.com/aaaaa/"  # NOTE(review): looks like a placeholder host — confirm before running
# Local directory to download into; must end with "/".
local_address = "D:/up/"

# Captures the value of the href attribute of every <a> tag (group 2).
# The href is used instead of the link text because the text is meant for
# display and may be shortened or HTML-escaped, whereas the href carries the
# complete, percent-encoded entry name.
_HREF_PATTERN = re.compile(
    r'<a\s+[^>]*?href\s*=\s*(["\'])(.*?)\1', re.IGNORECASE | re.DOTALL
)


def _is_child_entry(name):
    """Return True if *name* is one file or directory name directly below the listing."""
    stem = name[:-1] if name.endswith("/") else name
    if stem in ("", ".", ".."):
        return False
    # A remaining separator means an absolute link, a parent reference or a
    # nested path; following it could leave the tree being mirrored.
    return "/" not in stem and "\\" not in stem


def getHtml(index_url, local_address, timeout=30):
    """Recursively download everything linked from a directory-listing page.

    Every link on the page whose target is a direct child of the listing is
    processed: a target ending in "/" is treated as a sub-directory and
    crawled recursively, anything else is downloaded with ``downFile``.

    Args:
        index_url: URL of the listing page; must end with "/".
        local_address: Local directory that mirrors ``index_url``; must end
            with "/".
        timeout: Seconds to wait for the server before giving up on the
            listing request.
    """
    resp = requests.get(index_url, timeout=timeout)
    if resp.status_code != 200:
        # Do not parse an error page as if it were a listing.
        print("下載出錯:" + index_url)
        return

    for found in _HREF_PATTERN.finditer(resp.text):
        href = found.group(2)
        # Query-only and fragment-only links (e.g. column sort links) are not
        # entries of the directory.
        if href.startswith(("?", "#")):
            continue

        # The href is percent-encoded; the local name must be the decoded form.
        name = unquote(href)
        if not _is_child_entry(name):
            continue

        entry_url = index_url + href
        entry_local_address = local_address + name
        if name.endswith("/"):
            # Sub-directory: recurse with the matching local directory.
            getHtml(entry_url, entry_local_address, timeout)
        else:
            print(entry_url + "==========" + entry_local_address)
            downFile(entry_url, entry_local_address)


# Fetch the content and download it.
def downFile(url, local_address, timeout=30):
    """Download ``url`` and store it at ``local_address``.

    The body is streamed to disk in chunks so a large file is never held in
    memory as a whole. On any status other than 200 an error line is printed
    and nothing is written.

    Args:
        url: Address of the file to fetch.
        local_address: Full local path (directory + file name) to write to.
        timeout: Seconds to wait for the server before giving up.
    """
    # The context manager releases the connection even if writing fails;
    # a streamed response is otherwise kept open until fully consumed.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            print("下載出錯:" + url)
            return

        # Create the target directory only once there is something to save.
        # The emptiness check covers a path with no directory part, and
        # exist_ok avoids failing if the directory appears in the meantime.
        directory = os.path.dirname(local_address)
        if directory:
            os.makedirs(directory, exist_ok=True)

        with open(local_address, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                # Skip empty chunks.
                if chunk:
                    f.write(chunk)


# Script entry point: start crawling from the configured URL into the
# configured local directory.
getHtml(index_url, local_address)