Python爬虫------批量爬取douyin视频,轻松爬取douyin视频(不包含付费内容)_批量爬取抖音视频
Python爬虫------批量爬取douyin视频
准备所需模块
DrissionPage、requests、tqdm(tqdm 仅进阶版代码需要)
我们以下图视频为例
基础版代码
缺点:无法爬取所有视频,因为douyin有懒加载机制,只有用户向下滑动页面才会更新新的数据。
"""Basic version: download the first page of one Douyin user's posted videos.

Only the first batch (``count=18``) is fetched, because the post list is
lazily loaded as the user scrolls. The signed query parameters in ``url``
(``msToken``, ``a_bogus``, ...) were captured from a browser session and
presumably expire, so they need to be refreshed before running.
"""
import os
import re

import requests

# Directory the videos are written to; created on demand below.
SAVE_DIR = "./章若楠"

headers = {
    'Referer': 'https://www.douyin.com/user/MS4wLjABAAAA3q_M7SAG4eQnFrskafFBDLnycg_2s21oi7Q_aI42C2Q',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
}
url = 'https://www.douyin.com/aweme/v1/web/aweme/post/?device_platform=webapp&aid=6383&channel=channel_pc_web&sec_user_id=MS4wLjABAAAA3q_M7SAG4eQnFrskafFBDLnycg_2s21oi7Q_aI42C2Q&max_cursor=0&locate_query=false&show_live_replay_strategy=1&need_time_list=1&time_list_query=0&whale_cut_token=&cut_version=1&count=18&publish_video_strategy_type=2&update_version_code=170400&pc_client_type=1&version_code=290100&version_name=29.1.0&cookie_enabled=true&screen_width=1536&screen_height=960&browser_language=zh-CN&browser_platform=Win32&browser_name=Edge&browser_version=120.0.0.0&browser_online=true&engine_name=Blink&engine_version=120.0.0.0&os_name=Windows&os_version=10&cpu_core_num=16&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=100&webid=7380299229899687463&msToken=8gdJJvLIyhS_BJtjtw7Fqu6mNVZbrR4qZwUd91GW7-zTxDLPHRc2ihWTkedEWHw8Toe1qvjz8KnzrBxK28MRww8keaOC98zmHIP3prq_ntj3oKTSSHJZObYF0oTkHHg%3D&a_bogus=Yv80QmhkDDDNXd6h5UCLfY3q6lZ3Ypsn0trEMD2fmVVbsg39HMPY9exErhTva26jN4%2FkIbfjy4hSYpqMxQIbA3v6HSRKlICh-g00t-P2so0j5ZhjCfukrUmF-vzWt-Bp-JV3EcvMoJKrFRw0AIee-wHvnwVxapt2&verifyFp=verify_lztf7ywm_4zA5RL96_bnv5_4Uke_BXUL_Vtk8j1ogZmPa&fp=verify_lztf7ywm_4zA5RL96_bnv5_4Uke_BXUL_Vtk8j1ogZmPa'

list_resp = requests.get(url, headers=headers, timeout=30)
list_resp.raise_for_status()
try:
    payload = list_resp.json()
except ValueError:
    # A non-JSON (often empty) body is what the API returns when the signed
    # parameters in the URL are no longer accepted.
    raise SystemExit("接口未返回JSON数据,请更新url中的msToken/a_bogus等参数后重试")

# The key can be missing or null; treat both as "no videos".
aweme_list = payload.get("aweme_list") or []

# open() does not create missing parent directories.
os.makedirs(SAVE_DIR, exist_ok=True)

for aweme in aweme_list:
    title = aweme.get("desc")
    video_url = aweme.get("video").get("play_addr").get("url_list")[-1]

    # Descriptions may contain newlines or characters that are illegal in
    # file names (notably on Windows), so sanitize before building the path.
    file_name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", title or "").strip() or "未命名视频"

    # Stream to disk instead of holding the whole video in memory.
    # NOTE(review): the same Referer/User-Agent headers are sent to the video
    # host, matching what the advanced script does - confirm this is accepted.
    with requests.get(video_url, headers=headers, stream=True, timeout=60) as video_resp:
        video_resp.raise_for_status()
        with open(f"{SAVE_DIR}/{file_name}.mp4", "wb") as f:
            for chunk in video_resp.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
    print(f"{title}下载成功!")
进阶版代码
可以自己写向下滑动等更自动化的操作,这里我不想写了。
"""Advanced version: collect a Douyin user's videos while the user scrolls, then download a selection.

Workflow:
1. Open Douyin in a browser controlled by DrissionPage and wait for a manual login.
2. Listen for ``aweme/post`` responses while the user scrolls the profile page.
3. Show the collected videos, ask which ones to download, and download them
   with a progress bar, retries and resume support.
"""
import os
import re
import select
import sys
import time

import requests
from DrissionPage import ChromiumPage
from tqdm import tqdm

DOWNLOAD_DIR = "download"
MAX_RETRIES = 3
RETRY_DELAY_SECONDS = 3

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Mobile Safari/537.36 Edg/129.0.0.0",
    "Referer": "https://www.douyin.com/",
}

# Characters that are not allowed in file names on Windows, plus control
# characters (video descriptions may contain newlines).
_INVALID_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1f]')


class DownloadError(Exception):
    """Raised when a download finished without network errors but the result is unusable."""


def format_duration(ms):
    """Format a duration given in milliseconds as minutes and seconds.

    This function was originally named ``time``, which shadowed the ``time``
    module and made every ``time.sleep()`` call in the retry path fail.

    Args:
        ms: Duration in milliseconds, or ``None`` when unknown.

    Returns:
        A string such as ``"1分5秒"`` or ``"42秒"``; ``"未知"`` for ``None``.
    """
    if ms is None:
        return "未知"
    minutes, remainder = divmod(ms, 60000)
    seconds = remainder // 1000
    return f"{minutes}分{seconds}秒" if minutes > 0 else f"{seconds}秒"


def clean_filename(name, max_length=150):
    """Turn a video description into a string that is safe to use as a file name.

    Args:
        name: Raw description; ``None`` is treated as empty.
        max_length: Maximum number of characters kept, so that long
            descriptions do not exceed file-system name limits.

    Returns:
        The sanitized name, or ``"未命名视频"`` if nothing usable is left.
    """
    cleaned = _INVALID_FILENAME_CHARS.sub("_", name or "")
    # Windows rejects names ending in a dot or a space.
    cleaned = cleaned[:max_length].strip().rstrip(". ")
    return cleaned or "未命名视频"


def first_url(video, field):
    """Return the first URL in ``video["video"][field]["url_list"]``, or ``None`` if absent."""
    try:
        return video["video"][field]["url_list"][0]
    except (KeyError, IndexError, TypeError):
        return None


def enter_pressed():
    """Report, without blocking, whether the user has pressed Enter in the console.

    ``msvcrt`` exists only on Windows, so other platforms poll stdin with
    ``select`` instead.
    """
    if os.name == "nt":
        import msvcrt  # Windows-only module, imported lazily on purpose.

        while msvcrt.kbhit():
            if msvcrt.getch() in (b"\r", b"\n"):
                return True
        return False

    ready, _, _ = select.select([sys.stdin], [], [], 0)
    if ready:
        sys.stdin.readline()  # Consume the line so it is not read again later.
        return True
    return False


def collect_videos(page):
    """Gather videos from intercepted ``aweme/post`` responses until Enter or Ctrl+C.

    Args:
        page: The ``ChromiumPage`` whose listener has already been started.

    Returns:
        A list of video dicts, de-duplicated by ``aweme_id``, in arrival order.
    """
    videos = []
    seen_ids = set()
    try:
        while True:
            try:
                packet = page.listen.wait(timeout=2)
                body = packet.response.body if packet else None
            except Exception as exc:
                # Best effort: report the problem and keep listening.
                print(f"监听数据时出错,继续等待: {exc}")
                body = None

            # The body is only useful when it was parsed into a JSON object.
            if isinstance(body, dict):
                added = 0
                for video in body.get("aweme_list") or []:
                    video_id = video.get("aweme_id")
                    if video_id and video_id not in seen_ids:
                        seen_ids.add(video_id)
                        videos.append(video)
                        added += 1
                if added > 0:
                    print(f"收集到 {added} 个新视频,总计 {len(videos)} 个")

            if enter_pressed():
                print("\n用户结束收集")
                break
    except KeyboardInterrupt:
        print("\n用户中断收集")
    except Exception as e:
        print(f"监听过程中出错: {e}")
    return videos


def parse_selection(text, count):
    """Convert user input such as ``"1,3"`` into zero-based indexes.

    Both ASCII and full-width commas are accepted, empty items are skipped,
    out-of-range numbers are dropped and duplicates are removed.

    Args:
        text: Raw user input.
        count: Number of videos available.

    Returns:
        The valid zero-based indexes in the order they were entered.

    Raises:
        ValueError: If an item is not an integer.
    """
    tokens = text.replace("\uff0c", ",").split(",")
    indexes = [int(token) - 1 for token in tokens if token.strip()]
    return list(dict.fromkeys(i for i in indexes if 0 <= i < count))


def download_once(video_url, file_path, label, resume):
    """Perform one download attempt and return the resulting file size in bytes.

    Args:
        video_url: Direct URL of the video.
        file_path: Destination path.
        label: Name shown in the progress bar.
        resume: Whether to continue from an existing partial file.

    Raises:
        requests.exceptions.RequestException: On network or HTTP errors.
        DownloadError: If the result is incomplete, missing or empty.
    """
    request_headers = dict(HEADERS)
    offset = 0
    if resume and os.path.exists(file_path):
        offset = os.path.getsize(file_path)
        if offset > 0:
            print(f"检测到部分下载文件,从 {offset:,} 字节处继续下载...")
            request_headers["Range"] = f"bytes={offset}-"

    with requests.get(video_url, headers=request_headers, stream=True, timeout=30) as response:
        response.raise_for_status()

        # Only a 206 reply honours the Range header. Any other success status
        # carries the whole file, so appending would corrupt it: start over.
        if offset > 0 and response.status_code != 206:
            offset = 0

        if "Content-Range" in response.headers:
            # Format "bytes start-end/total"; the total may be "*" when unknown.
            total_text = response.headers["Content-Range"].rpartition("/")[2]
            total_size = int(total_text) if total_text.isdigit() else 0
        else:
            total_size = int(response.headers.get("Content-Length", 0))
        if total_size == 0:
            print("警告:无法获取文件大小,继续下载...")

        downloaded_size = offset
        mode = "ab" if offset > 0 else "wb"
        progress = tqdm(
            total=total_size or None,  # None lets tqdm display an unknown total.
            initial=offset,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            desc=f"下载中:{label}",
            leave=True,
        )
        # Context managers close the file and the bar even if the stream fails.
        with open(file_path, mode) as f, progress:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
                    downloaded_size += len(chunk)
                    progress.update(len(chunk))

    if total_size > 0 and downloaded_size != total_size:
        print(f"警告:下载大小不匹配!预期:{total_size} 字节,实际:{downloaded_size} 字节")
        if downloaded_size < total_size * 0.95:
            raise DownloadError(f"下载不完整,只下载了 {downloaded_size}/{total_size} 字节")
        print("下载基本完整,继续...")

    if not os.path.exists(file_path):
        raise DownloadError("下载的文件不存在")
    file_size = os.path.getsize(file_path)
    if file_size <= 0:
        raise DownloadError("下载的文件大小为0")
    return file_size


def download_video(video_url, file_path, label):
    """Download one video, retrying up to ``MAX_RETRIES`` times.

    Retries resume from the partial file left by the previous attempt.

    Returns:
        ``True`` on success, ``False`` once all attempts have failed.
    """
    for attempt in range(1, MAX_RETRIES + 1):
        if attempt > 1:
            print(f"第{attempt}次尝试下载...")
        try:
            file_size = download_once(video_url, file_path, label, resume=attempt > 1)
        except requests.exceptions.RequestException as exc:
            # Must precede OSError: RequestException is a subclass of it.
            print(f"网络错误(第{attempt}次尝试): {exc}")
        except (OSError, ValueError, DownloadError) as exc:
            print(f"下载错误(第{attempt}次尝试): {exc}")
        else:
            print(f"\n下载成功!保存为 {file_path}")
            print(f"文件大小:{file_size:,} 字节")
            return True

        if attempt < MAX_RETRIES:
            print("等待3秒后重试...")
            time.sleep(RETRY_DELAY_SECONDS)
    return False


def ensure_download_dir():
    """Create the download directory if it does not exist yet."""
    print("\n检查下载文件夹...")
    if not os.path.exists(DOWNLOAD_DIR):
        print("\n正在创建下载文件夹...")
        os.makedirs(DOWNLOAD_DIR, exist_ok=True)
        print("下载文件夹创建成功!")
    else:
        print("\n下载文件夹已存在!")


def print_video_details(video):
    """Print title, URLs, id, creation time and duration of one video."""
    print("\n视频标题:", video.get("desc"))
    print("视频播放地址:", first_url(video, "play_addr"))
    print("视频封面地址:", first_url(video, "cover"))
    print("视频动态封面地址:", first_url(video, "dynamic_cover"))
    print("视频aweme_id:", video.get("aweme_id"))
    print("视频创建时间:", video.get("create_time"))
    print("视频时长:", format_duration((video.get("video") or {}).get("duration")))


def main():
    """Run the interactive collect-and-download session."""
    print("请输入抖音用户链接(如:https://www.douyin.com/user/MS4wLjABAAAA3q_M7SAG4eQnFrskafFBDLnycg_2s21oi7Q_aI42C2Q?from_tab_name=main&vid=7512800351692623114)\n")
    user_url = input("抖音用户链接: ").strip()

    page = ChromiumPage()
    try:
        print("正在打开抖音页面,请手动登录...")
        page.get("https://www.douyin.com/")
        input("请先登录抖音账号,登录完成后按回车键继续...")

        # Start listening before navigating so the first batch is captured too.
        print("启动数据监听...")
        page.listen.start("aweme/post")

        print("正在跳转到指定用户页面...")
        page.get(user_url)
        page.wait(3)  # Give the page time to finish loading.
        print("页面加载完成,请开始手动浏览")

        print("\n=== 手动滚动模式 ===")
        print("请手动滚动页面浏览视频,程序将自动收集浏览过的视频数据")
        print("当您浏览完成后,请在控制台按 Enter 键结束收集")
        print("监听已启动,请开始手动滚动浏览视频...")
        print("提示:滚动时程序会自动收集数据,完成后按 Enter 键结束")

        videos = collect_videos(page)

        print("\n=== 数据收集完成 ===")
        print(f"总共收集到 {len(videos)} 个视频")
        if not videos:
            print("未收集到任何视频,退出")
            return

        # All videos come from the same profile, so the first one is enough.
        author = videos[0].get("author") or {}
        print("抖音用户昵称:", author.get("nickname"))
        print("抖音用户uid:", author.get("uid"))

        print("\n视频列表:")
        for number, video in enumerate(videos, 1):
            print(f"{number}. {video.get('desc')}")

        selected = input("\n请输入要下载的视频序号(多个用逗号分隔,如1,3): ").strip()
        if not selected:
            print("未选择任何视频,退出下载")
            return

        try:
            valid_indexes = parse_selection(selected, len(videos))
        except ValueError:
            print("输入格式错误,请输入数字序号")
            return
        if not valid_indexes:
            print("输入的序号无效,退出下载")
            return

        ensure_download_dir()

        for index in valid_indexes:
            video = videos[index]
            video_desc = video.get("desc") or ""
            video_url = first_url(video, "play_addr")
            if video_url is None:
                print(f"下载失败({video_desc}): 未找到视频播放地址")
                continue

            cleaned_desc = clean_filename(video_desc)
            print(f"\n正在下载视频:{video_desc}")
            file_path = f"{DOWNLOAD_DIR}/{cleaned_desc}.mp4"
            if not download_video(video_url, file_path, cleaned_desc):
                print(f"下载失败({video_desc}): 已尝试{MAX_RETRIES}次,仍然失败")

        print("\n选中视频详细信息:")
        for index in valid_indexes:
            print_video_details(videos[index])
    finally:
        # Always release the browser page, even when something above failed.
        page.close()


if __name__ == "__main__":
    main()
相关技术细节可参考:
1.https://blog.csdn.net/YCH0309/article/details/141268110
2.https://www.drissionpage.cn/