從小紅書上爬取評論,但是目前還不能完全爬取子評論,使用GPT沒能解決這個問題。
後續博主可能會改進;如果你懂的話,也可以在博主的代碼基礎上自行改進。
需要安裝 Node.js 並配置好環境變量,另需用 npm 安裝代碼中引用的 lodash 和 xlsx 兩個庫。博主是在 PyCharm 中運行的。
代碼無套路獲取。自行修改參數(需要修改的地方已在代碼裡用中文標記)即可。
var http = require('http');
var https = require('https');
var _ = require('lodash');
const XLSX = require('xlsx'); // 引入 xlsx 庫
const path = require('path');// API 請求配置
// API request configuration. Before running, replace the placeholder note id
// inside `path` and the placeholder value of the `Cookie` header with your own.
const options = {
  hostname: 'edith.xiaohongshu.com',
  port: 443,
  path: '/api/sns/web/v2/comment/page?note_id=你要爬取的筆記id&cursor=&top_comment_id=&image_formats=jpg,webp,avif',
  method: 'GET',
  headers: { 'Cookie': '你的cookie' },
};

/**
 * Flattens the comment list from the API response into spreadsheet rows.
 *
 * Every top-level comment yields one row, immediately followed by one row for
 * each entry of its `sub_comments` array. Rows are numbered sequentially from
 * 1 in the order they are emitted, so the result is already sorted by
 * `comment_id` and needs no extra sort.
 *
 * @param {Array<object>} comments - Value of `data.comments` in the response.
 * @returns {Array<object>} Rows with the fields `comment_id`, `nickname`,
 *   `content`, `url` and `parent_comment_id`.
 */
function buildRecords(comments) {
  const records = [];
  let nextId = 1;

  // Shared row builder for parent comments and sub-comments.
  const toRecord = (comment, content, parentCommentId) => ({
    comment_id: nextId++,
    nickname: comment.user_info?.nickname || 'No Nickname',
    content,
    url: comment.pictures?.[0]?.url || '', // First image URL, if any
    parent_comment_id: parentCommentId,
  });

  for (const item of comments) {
    // Top-level rows carry the literal marker 'Parent Comment' instead of an id.
    const parentRecord = toRecord(item, item.content || '', 'Parent Comment');
    records.push(parentRecord);

    const subComments = Array.isArray(item.sub_comments) ? item.sub_comments : [];
    for (const subItem of subComments) {
      // The leading space visually indents replies under their parent row.
      records.push(
        toRecord(subItem, ' ' + (subItem.content || ''), parentRecord.comment_id)
      );
    }
  }

  return records;
}

/**
 * Writes the rows to an XLSX workbook with a single sheet named 'Comments',
 * stored next to this script.
 *
 * @param {Array<object>} records - Rows produced by `buildRecords`.
 * @returns {string} Absolute path of the file that was written.
 */
function saveRecords(records) {
  const workbook = XLSX.utils.book_new();
  const sheet = XLSX.utils.json_to_sheet(records);
  XLSX.utils.book_append_sheet(workbook, sheet, 'Comments');

  const filePath = path.join(__dirname, 'comments_with_parent_child_hierarchy.xlsx');
  XLSX.writeFile(workbook, filePath);
  return filePath;
}

// Fetch one page of comments, convert it to rows and save it as a spreadsheet.
https.get(options, (resp) => {
  // Decode at the stream level: converting each Buffer chunk to a string on its
  // own would corrupt a multi-byte UTF-8 character split across two chunks.
  resp.setEncoding('utf8');

  let data = '';
  resp.on('data', (chunk) => {
    data += chunk;
  });

  resp.on('error', (err) => {
    console.error('Request failed:', err);
  });

  resp.on('end', () => {
    console.log('Response Data:', data); // Print the raw response

    if (resp.statusCode !== 200) {
      // Still try to parse below; the body may carry a JSON error description.
      console.error('Unexpected HTTP status code:', resp.statusCode);
    }

    let jsonResponse;
    try {
      jsonResponse = JSON.parse(data);
    } catch (error) {
      console.error('Error parsing response data:', error);
      return;
    }

    // Check that the response contains the expected data structure.
    const comments = jsonResponse?.data?.comments;
    if (!Array.isArray(comments)) {
      console.error('No comments data found or data structure is incorrect');
      return;
    }

    // Kept separate from the parse step so a failure here is not reported as
    // a JSON parsing problem.
    try {
      const filePath = saveRecords(buildRecords(comments));
      console.log('The XLSX file was written successfully at:', filePath);
    } catch (error) {
      console.error('Error writing XLSX file:', error);
    }
  });
}).on('error', (err) => {
  console.error('Request failed:', err);
});