python爬取小红书 - 穆世明博客

window.scrollTo(0, window.scrollY + vh * 1.5)

ul = document.querySelector(‘#userPostedFeeds’).querySelectorAll(‘.cover’)

ul.forEach((e,index)=>{
// length 为 0 时是图片，为 1 时为视频
work_obj[e.href] = ul[index].querySelector(‘.play-icon’) ? 1 : 0
})
// 延迟500ms
await delay(500);
// console.log(last_height, document.body.offsetHeight)

// 判断是否滚动到底部
if(document.body.offsetHeight > last_height){
action()
}else{
console.log(‘end’)
// 作品的数量
console.log(Object.keys(work_obj).length)

// 转换格式，并下载为txt文件 var content = JSON.stringify(work_obj); var blob = new Blob([content], {type: "text/plain;charset=utf-8"}); var link = document.createElement("a"); link.href = URL.createObjectURL(blob); link.download = "xhs_works.txt"; link.click();

action()

写出的 txt 文件内容如下：

![](https://img-blog.csdnimg.cn/direct/08a7f89701a74dd69bab2af630f7c251.png)

3、在 Python 中读入该文件并做准备工作

获取当前时间

def get_current_time():
    """Return the current local time as a timestamp string.

    The result has the form ``_YYYY-MM-DD__HH-MM-SS-ffffff__`` (``ffffff``
    is microseconds), e.g. ``_2024-01-31__13-45-59-123456__``.

    Returns:
        str: the formatted timestamp, built from ``datetime.now()``.
    """
    # The published snippet used typographic quotes here, which is a
    # syntax error; plain ASCII quotes are required.
    return datetime.now().strftime("_%Y-%m-%d__%H-%M-%S-%f__")

下载的作品保存的路径，以作者主页的 id 号命名

# Directory where downloaded works are saved; named after the author's
# home-page id. A raw string keeps the Windows backslash literal (the
# original relied on the invalid escape "\c" and a needless f-prefix).
ABS_BASE_URL = r'G:\c006023'

检查作品是否已经下载过

def check_download_or_not(work_id, is_pictures):
    """Report whether a work has already been downloaded.

    Each work is stored in its own directory named
    ``{ABS_BASE_URL}/{work_id}-pictures`` or ``{ABS_BASE_URL}/{work_id}-video``.
    The work counts as downloaded when that directory exists and contains
    at least one entry.

    Args:
        work_id: identifier of the work; used as the directory name prefix.
        is_pictures: truthy selects the ``-pictures`` suffix, falsy the
            ``-video`` suffix.

    Returns:
        bool: True if the directory exists and is non-empty, else False.
    """
    end_str = 'pictures' if is_pictures else 'video'
    path = f'{ABS_BASE_URL}/{work_id}-{end_str}'
    # isdir() is already False for a missing path, so no separate exists()
    # check is needed; an empty directory means nothing was saved yet.
    return os.path.isdir(path) and bool(os.listdir(path))

下载资源

def download_resource(url, save_path):
    """Download ``url`` and write the response body to ``save_path``.

    The body is streamed in 1024-byte chunks so it is not held in memory
    all at once. Nothing is written when the server answers with a status
    other than 200.

    Args:
        url: address of the resource to fetch.
        save_path: path of the file to create/overwrite with the body.
    """
    # With stream=True the connection stays open until the body is consumed
    # or the response is closed; the context manager guarantees it is
    # released even on a non-200 status or a write error.
    with requests.get(url, stream=True) as response:
        if response.status_code == 200:
            with open(save_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)

 读入文件，判断作品数量然后进行任务分配：

读入文件

# Load the JSON written by the browser script (saved as xhs_works.txt).
# `content` starts as an empty string and is replaced by the parsed data.
content = ''
with open('./xhs_works.txt', mode='r', encoding='utf-8') as f:
    content = json.load(f)

转换成 [[h

上一篇：小红书出现验证滑块验证不好用了怎么办

下一篇：小红书小程序爬虫

版权声明：
本文来源网络，所有图片文章版权属于原作者，如有侵权，联系删除。

本文网址：https://www.mushiming.com/mjsbk/500.html