基于 Python 爬取 TikTok 搜索数据(2025.3.17)
1. 前言
在数据分析和网络爬虫的应用场景中,我们经常需要获取社交媒体平台的数据,例如 TikTok。本篇文章介绍如何使用 Python 爬取 TikTok 用户搜索数据,并解析其返回的数据。
结果截图
2. 项目环境准备
在正式运行代码之前,我们需要安装相关的 Python 库:
pip install requests pandas PyExecJS loguru
(注:`import execjs` 对应的 PyPI 包名是 PyExecJS,而不是 execjs。)
此外,我们需要一个 JavaScript 运行环境(如 Node.js),用于执行加密签名代码。
3. 代码解析
3.1 初始化爬虫类
我们创建 TiktokUserSearch
类,并在初始化方法 __init__
中设置请求头信息,并初始化输出文件。
class TiktokUserSearch:
    """Scraper for TikTok's general search API; parsed rows are appended to a CSV file."""

    def __init__(self, output_file=None):
        """Set up browser-like request headers and decide the output CSV path.

        Args:
            output_file: target CSV path; when omitted, a timestamped
                ``tiktok_videos_YYYYmmdd_HHMMSS.csv`` name is generated.
        """
        # Headers that make the request look like a normal desktop browser.
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ...",
            "referer": "https://www.tiktok.com/",
        }
        # Parsed cookie dict; filled in later from the caller-supplied cookie string.
        self.cookies = None
        # Fall back to a timestamped file name when none was provided.
        if output_file:
            self.output_file = output_file
        else:
            self.output_file = f'tiktok_videos_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
3.2 处理 Cookie
我们需要将 TikTok 的 cookie 从字符串转换成字典格式,以便后续请求使用。
def cookie_str_to_dict(self, cookie_str) -> dict:
    """Convert a browser cookie header string into a ``{name: value}`` dict.

    FIX vs. original: split on ';' (not '; ') so cookies without a space
    after the semicolon are still separated, and skip malformed fragments
    that contain no '=' instead of raising ValueError. Values containing
    '=' are preserved intact (split at the first '=' only).

    Args:
        cookie_str: raw cookie string, e.g. ``"a=1; b=2"``.

    Returns:
        dict mapping cookie names to their (still URL-encoded) values.
    """
    cookie_dict = {}
    for fragment in cookie_str.split(';'):
        fragment = fragment.strip()
        # Ignore empty pieces (trailing ';') and pieces with no '=' at all.
        if not fragment or '=' not in fragment:
            continue
        key, value = fragment.split('=', 1)
        cookie_dict[key] = value
    return cookie_dict
3.3 发送请求
TikTok 需要使用 X-Bogus
进行签名,我们需要执行 JavaScript 代码来获取该参数。
为了防止网络不稳定,设置三次重试机制。
可根据自己需求设置代理。
def get(self, keyword, cursor, search_id, cookie_str):
    """Fetch one page of TikTok general-search results.

    Signs the query string with the X-Bogus parameter computed by
    ./encrypt.js (run through execjs), then issues the GET with up to
    three retries on network errors.

    Args:
        keyword: search keyword.
        cursor: pagination cursor; "0" means the first page.
        search_id: search session id returned by the previous page
            (only sent when cursor != "0").
        cookie_str: raw browser cookie string; must contain msToken.

    Returns:
        The decoded JSON response on success, or ``{"error": ...}`` after
        exhausting retries / on any other failure.
    """
    self.cookies = self.cookie_str_to_dict(cookie_str)
    url = "https://www.tiktok.com/api/search/general/full/"
    # The real web client only reports focus_state=true on the first page.
    focus_state = "true" if cursor == "0" else "false"
    params = {
        "WebIdLastTime": f"{int(time.time())}",
        "aid": "1988",
        "app_language": "zh-Hans",
        "app_name": "tiktok_web",
        "browser_language": "zh-CN",
        "browser_name": "Mozilla",
        "browser_online": "true",
        "browser_platform": "Win32",
        "browser_version": self.headers['user-agent'].replace('Mozilla/', ''),
        "channel": "tiktok_web",
        "cookie_enabled": "true",
        "cursor": cursor,
        "device_id": "7339506347602019870",
        "device_platform": "web_pc",
        "focus_state": focus_state,
        "from_page": "search",
        "history_len": "7",
        "is_fullscreen": "false",
        "is_page_visible": "true",
        "keyword": keyword,
        "os": "windows",
        "priority_region": "",
        "referer": "",
        "region": "KR",
        "screen_height": "1080",
        "screen_width": "1920",
        "tz_name": "Asia/Shanghai",
        "web_search_code": "{\"tiktok\":{\"client_params_x\":{\"search_engine\":{\"ies_mt_user_live_video_card_use_libra\":1,\"mt_search_general_user_live_card\":1}},\"search_server\":{}}}",
        "webcast_language": "zh-Hans",
        # NOTE(review): raises KeyError if the cookie string has no msToken — TODO confirm desired behavior.
        "msToken": self.cookies["msToken"],
    }
    if cursor != "0":
        params.update({"search_id": search_id})
    # FIX: read encrypt.js through a context manager (the original leaked the file handle).
    with open('./encrypt.js', encoding='utf-8') as f:
        sign_js = f.read()
    x_b = execjs.compile(sign_js).call("sign", urlencode(params), self.headers["user-agent"])
    params.update({"X-Bogus": x_b})
    headers = self.headers.copy()
    headers.update({"referer": "https://www.tiktok.com/search?q=" + keyword})
    max_retries = 3
    for attempt in range(max_retries):
        try:
            response = requests.get(
                url,
                headers=headers,
                cookies=self.cookies,
                params=params,
                timeout=(3, 10),  # (connect, read) timeouts in seconds
                proxies=None,     # plug a proxy dict in here if needed
            )
            return response.json()
        # FIX: the original caught undefined names (ex1, ex2, ex3), which would
        # itself raise NameError on the first network failure. Catch the real
        # requests exception hierarchy instead.
        except (requests.exceptions.Timeout,
                requests.exceptions.ConnectionError,
                requests.exceptions.RequestException) as e:
            print(f"尝试 {attempt + 1}/{max_retries} 发生网络错误:{e}")
            if attempt < max_retries - 1:
                time.sleep(2)  # brief back-off before the next retry
            else:
                return {"error": f"Network error after {max_retries} attempts: {str(e)}"}
        except Exception as e:
            # Non-network failure (e.g. invalid JSON body): give up immediately.
            print(f"发生其他错误:{e}")
            return {"error": str(e)}
3.4 解析数据并存储
解析 TikTok 返回的视频数据,并保存到 CSV 文件。
def parse_data(self, data_list):
    """Extract video/author fields from search-result entries and append them to the CSV.

    Args:
        data_list: the "data" array from the search API response; each entry
            is expected to carry a video object under the "item" key.

    Returns:
        list[str]: author profile URLs for every successfully parsed entry.
    """
    resultList = []
    video_data = []
    for u in data_list:
        try:
            item = u['item']
            author = item['author']
            stats = item['stats']
            author_stats = item['authorStats']  # author-level statistics
            video_info = {
                'video_id': item['id'],
                'desc': item['desc'],
                'create_time': datetime.fromtimestamp(item['createTime']).strftime('%Y-%m-%d %H:%M:%S'),
                'duration': item['video']['duration'],
                # author basic info
                'author_id': author['id'],
                'author_name': author['uniqueId'],
                'author_nickname': author['nickname'],
                'author_signature': author['signature'],
                'author_verified': author['verified'],
                # author statistics
                'author_following_count': author_stats['followingCount'],  # following
                'author_follower_count': author_stats['followerCount'],    # followers
                'author_heart_count': author_stats['heartCount'],          # total likes received
                'author_video_count': author_stats['videoCount'],          # total videos
                'author_digg_count': author_stats['diggCount'],            # likes given
                # video statistics
                'digg_count': stats['diggCount'],
                'share_count': stats['shareCount'],
                'comment_count': stats['commentCount'],
                'play_count': stats['playCount'],
                'collect_count': stats.get('collectCount', 0),
                'video_url': item['video']['playAddr'],
            }
            # Hashtag titles, comma-joined (empty string when absent).
            if 'challenges' in item:
                video_info['hashtags'] = ','.join([tag['title'] for tag in item['challenges']])
            else:
                video_info['hashtags'] = ''
            # Optional music metadata.
            if 'music' in item:
                music = item['music']
                video_info.update({
                    'music_id': music['id'],
                    'music_title': music['title'],
                    'music_author': music['authorName'],
                    'music_original': music['original'],
                })
            video_data.append(video_info)
            resultList.append(f"https://www.tiktok.com/@{author['uniqueId']}")
        except Exception as e:
            # A malformed entry only skips that entry, not the whole batch.
            logger.error(f"解析视频数据时出错: {str(e)}")
            continue
    # FIX: only touch the CSV when at least one row was parsed; the original
    # wrote an empty DataFrame (bogus header/blank append) on empty batches.
    if video_data:
        try:
            df = pd.DataFrame(video_data)
            file_exists = os.path.exists(self.output_file)
            # New file: write header. Existing file: append rows, no header.
            df.to_csv(self.output_file, mode='a', header=not file_exists,
                      index=False, encoding='utf-8-sig')
            logger.info(f"数据已{'追加' if file_exists else '保存'}到文件: {self.output_file}")
        except Exception as e:
            logger.error(f"保存CSV文件时出错: {str(e)}")
    return resultList
3.5 运行爬虫
我们定义 main
方法,负责调用 get
方法获取数据并解析。
def main(self, keyword, cookie_str, cursor="0", search_id=None):
    """Fetch one page of results for *keyword* and persist whatever parses.

    Returns the raw response dict on success; a synthetic dict with
    status "-2" (network/other error) or "-1" (verification challenge)
    on failure; or None when the fetch produced nothing.
    """
    response = self.get(keyword, cursor, search_id, cookie_str)
    # Guard clauses instead of nested if/elif.
    if not response:
        return None
    if "error" in response:
        return {"cursor": cursor, "search_id": search_id, "data": [], "status": "-2", "error": response["error"]}
    if "verify_event" in str(response):
        # TikTok is asking for a captcha/verification — signal the caller.
        return {"cursor": cursor, "search_id": search_id, "data": [], "status": "-1"}
    if 'data' in response:
        # Side effect: parsed rows are appended to the CSV.
        self.parse_data(response['data'])
    return response
3.6 运行入口
最后,我们编写 if __name__ == \'__main__\'
逻辑,定义要爬取的关键词,并进行循环爬取。
if __name__ == '__main__':
    # Entry point: crawl every topic in `topics`, one CSV per topic, paging
    # until the API reports has_more == 0.
    os.makedirs('results1', exist_ok=True)
    topics = [
        "Chen Duxiu",
        "Li Dazhao",]
    for keyword in topics:
        logger.info(f"开始爬取 {keyword} 的视频")
        output_file = f'results1/{keyword}_videos.csv'  # you can customize the file name
        tiktok = TiktokUserSearch(output_file=output_file)
        # NOTE(review): hard-coded, fully authenticated session cookie — it will
        # expire and should be replaced with your own (and never committed to VCS).
        cookie_str = '_ttp=2ZzUB37CLclhWsrgyW56Erox1XM; tiktok_webapp_theme_auto_dark_ab=1; delay_guest_mode_vid=5; passport_csrf_token=d8e4d28ec7abdf12a7829d524dca64de; passport_csrf_token_default=d8e4d28ec7abdf12a7829d524dca64de; tt_chain_token=SSmpjX/0in/IP8BYwawD+Q==; multi_sids=7361707798058615814%3A53b730c284c4eaaa9bb2157eef01d70d; cmpl_token=AgQQAPNoF-RO0rYU5JqLsx0__dmghl8Nv5IhYNkWMA; passport_auth_status=a8a7a1e1c4b96a994a45acb38dc83509%2C; passport_auth_status_ss=a8a7a1e1c4b96a994a45acb38dc83509%2C; uid_tt=7857882a3366539dc1d9ca226b3fdc91f76b1b072c7da11dd4120368d88bf861; uid_tt_ss=7857882a3366539dc1d9ca226b3fdc91f76b1b072c7da11dd4120368d88bf861; sid_tt=53b730c284c4eaaa9bb2157eef01d70d; sessionid=53b730c284c4eaaa9bb2157eef01d70d; sessionid_ss=53b730c284c4eaaa9bb2157eef01d70d; store-idc=maliva; store-country-code=ca; store-country-code-src=uid; tt-target-idc=useast1a; tt-target-idc-sign=t3pz21FprSb2qc1ucJWFQbxzCKwgoBX9PKUWEbPHh7_4mpPThOuO0EN9pm2ORzFqk0bLFt6MtI9-gofvcVtQFoGSTOI_JvUWIAAUSHz1mM1A9jP1kRk_qucQnxEMOLvir3s4ffm0hJSh62RyKNO5LBTlT-fsqbi2tQVUwrgIGF-2HFT04S52ciyRnKAXr_0NyD3Aa0lM4J4hUGplo46wKRfId1DwwajXudUfjqJ3rvAuA8qURTsSHCKuDjLbcdfhcC0WKqemrmHFBJ11hGFJxiL4VEOClIoJGrF1_S9jvlx0H0Nph9BHlHNA-wzwi3NF6hPK17WL3TSvsqfEiKclZ5ScpHMv7ATYfOK4BVOzKXrq6fCxzNBT5kCNc4-ImuvjBNqpY8yL2s2KusWxslveOyIq3gwU3Dhxl084w5Tsp13xzuFOGNVHK5ZPeS5ERmykYFB6uTIHty9W_Z6pwN1tT9yQ-34qyZRZB7WONZn_NAFsywU6Hj4wcHLQkJ-tIiAO; last_login_method=google; tiktok_webapp_theme_source=auto; tiktok_webapp_theme=dark; sid_guard=53b730c284c4eaaa9bb2157eef01d70d%7C1740207607%7C15551996%7CThu%2C+21-Aug-2025+07%3A00%3A03+GMT; sid_ucp_v1=1.0.0-KDc2NDMxMTQzMTkwNTY2NTJiOWZhMmZhM2ZlMDg3ZDE0YzNiOGU5NTUKGQiGiIec0MeClWYQ9-vlvQYYsws4CEASSAQQAxoGbWFsaXZhIiA1M2I3MzBjMjg0YzRlYWFhOWJiMjE1N2VlZjAxZDcwZA; ssid_ucp_v1=1.0.0-KDc2NDMxMTQzMTkwNTY2NTJiOWZhMmZhM2ZlMDg3ZDE0YzNiOGU5NTUKGQiGiIec0MeClWYQ9-vlvQYYsws4CEASSAQQAxoGbWFsaXZhIiA1M2I3MzBjMjg0YzRlYWFhOWJiMjE1N2VlZjAxZDcwZA; odin_tt=a7027d0b8a102be6dd20600ca35291f4aee8c003895d8913d5c2f8276f16d6b974345357a22ed0bdcb57930d326afded3e5bd3105e061197876c80a92bbdbbfc29402634b8e7439ba178a1c7ed9ca552; tt_csrf_token=qs6ncqIZ-SbUZVzUbkUZ2SViJyY7VzYRki0M; perf_feed_cache={%22expireTimestamp%22:1742302800000%2C%22itemIds%22:[%227481325874978000134%22%2C%227463463290065161505%22%2C%227466633420098112799%22]}; msToken=BnkIjkPpJEc1i9jiiwT_paC5FW-NL62UVF7-lzpHYki9WIA_KpLrplpY-qlZfuG7V12rbCDHiyQYNrZcOnTzZLk1cvnH3_E_89nfOpqpVquKbSR-Nqr6bGDmL220vjBHdutm4R-gfVnYIG7fvWOJUkZ7yg==; ttwid=1%7Cv5j4n07c_G3ZtA91KIuree-ptnDLwgTwFuM8BnZINnQ%7C1742131441%7Ce88a85fcc36fd7e79815fddb10d16ef553b5e2e4a51c65e2c40098ade19023e2; msToken=iYDKRqCM8rSqc_9ZDzQnWcQiv_iJqPk15-6Y-iFBUmk4uIzb61dM13b9fWHcg4hxGkl9L3n56glok05TllvGurkwpgBYEF8N76ZRIii7OvNEkrk004dagNuoqQVeV9Bzd0_9naXjFXtEiMRi330G5Jdakw==; passport_fe_beating_status=false'
        has_more = 1
        cursor = '0'
        search_id = None
        # Page through results; `main` returns the raw API dict (or a synthetic
        # error dict with status "-1"/"-2", which also carries no has_more and
        # therefore ends the loop).
        while has_more:
            data = tiktok.main(keyword, cookie_str, cursor, search_id)
            logger.info(data)
            if data and isinstance(data, dict):
                has_more = data.get('has_more', 0)
                logger.info(has_more)
                # Carry the pagination cursor and search session id forward.
                cursor = data.get('cursor', '0')
                search_id = data.get('log_pb', {}).get('impr_id')
                if 'data' in data:
                    # NOTE(review): this rebinding is never read afterwards —
                    # parse_data() already ran inside tiktok.main().
                    data = data['data']
                else:
                    logger.error("No data found in response")
                    break
            else:
                logger.error("Invalid response format")
                break
            time.sleep(1)  # small delay to avoid hammering the API
        logger.info(f"爬取 {keyword} 的视频完成")
        time.sleep(30)  # cool-down between topics
        logger.info(f"等待30秒后继续爬取下一个主题")
    logger.info("所有主题的视频爬取完成")
4. 关键问题与解决方案
-
TikTok 反爬措施:
- 需要定期更新 Cookie
- 适当增加请求间隔
- 使用代理提高稳定性
-
X-Bogus
参数:- 需要使用 JavaScript 计算签名
- 依赖
encrypt.js
进行加密
-
数据存储:
- 使用
pandas
处理数据 - 追加模式写入 CSV,避免数据丢失
- 使用 loguru 记录运行日志
5. 总结
本文介绍了如何使用 Python 爬取 TikTok 用户搜索数据,包括如何构造请求、解析数据并存储到 CSV 文件。希望对有类似需求的读者有所帮助!
帮助与咨询:私信博主或在评论区留言