
JD scraper, end to end: requests + json + etree + pandas

Opening PyCharm today I found the scraper notes I put together last December after digging through all kinds of references, so I'm writing them up here.

Friendly reminder: the better you get at scraping, the closer you get to a courtroom. Only use these techniques in lawful and reasonable ways.
A roundup of Chinese cases where Python scraping broke laws or regulations: https://blog.csdn.net/huang5333/article/details/114656888

Now to the main topic. First, a look at the final scraped data (screenshot omitted here).

Approach:

1. Enter a keyword and collect the URL of every product on the results page
2. Store the collected URLs in a set A; each time one is used it is taken out, so A shrinks by one
3. Take one URL out and scrape the product's details
4. Save the data
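For orientation, here is a compressed sketch of how those four steps chain together. It assumes the classes and module names defined in the sections below (url_data, URLManager, JDdata, DataSave), so treat it as a preview rather than standalone code:

from url_data import url_data
from urlmanager import URLManager
from goods_data import JDdata
from save_data import DataSave

manager = URLManager()                                                     # step 2: the URL set
manager.save_new_urls(url_data().get_url('https://search.jd.com/Search?keyword=小米8'))  # step 1
while manager.get_new_url_num() > 0:                                       # the set shrinks as URLs are popped
    item_url = 'https:' + manager.get_new_url()
    DataSave('E:\\jd.csv').save(JDdata().get_data(item_url))               # steps 3-4: scrape, then store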

1. Getting the product URLs

Simulate typing a keyword into the search box and collect the product URLs from the results page (the code below only grabs the first 30 URLs).
For example: url = 'https://search.jd.com/Search?keyword=小米8'

import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/92.0.4515.131 Safari/537.36'
}


class url_data:
    def get_url(self, url):
        urls = list()
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        detail = etree.HTML(response.text)  # parse the response into an etree document
        for i in range(30):
            # href of the (i+1)-th item in the result list
            [product_url] = detail.xpath(
                '//*[@id="J_goodsList"]/ul/li[' + str(i + 1) + ']/div/div[1]/a/@href')
            urls.append(str(product_url))
        print(urls)
        return urls


if __name__ == '__main__':
    url = 'https://search.jd.com/Search?keyword=小米8'
    u = url_data()
    u.get_url(url)
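One small detail: the keyword in the example URL is Chinese. requests will encode it when you pass the URL as a string, but if you assemble search URLs by hand elsewhere it is safer to percent-encode the keyword yourself. A minimal sketch using only the standard library (the keyword is just an example):

from urllib.parse import quote

keyword = '小米8'  # example keyword
search_url = 'https://search.jd.com/Search?keyword=' + quote(keyword)
print(search_url)  # https://search.jd.com/Search?keyword=%E5%B0%8F%E7%B1%B38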

2. Storing URLs in sets

class URLManager(object):
    def __init__(self):
        # sets of new (not yet crawled) and already crawled URLs
        self.new_urls = set()
        self.old_urls = set()

    def save_new_url(self, url):
        # save a single new URL into the to-crawl set
        if url is not None:
            if url not in self.new_urls and url not in self.old_urls:
                print("Saving new URL: {}".format(url))
                self.new_urls.add(url)

    def save_new_urls(self, url_list):
        # save URLs in batch
        for url in url_list:
            self.save_new_url(url)

    def get_new_url(self):
        # pop one uncrawled URL and move it to the crawled set
        if self.get_new_url_num() > 0:
            url = self.new_urls.pop()
            self.old_urls.add(url)
            return url
        else:
            return None

    def get_new_url_num(self):
        # number of URLs not yet crawled
        return len(self.new_urls)

    def get_old_url_num(self):
        # number of URLs already crawled
        return len(self.old_urls)
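A quick sanity check of the manager's behaviour with made-up URLs: duplicates are ignored, and a popped URL moves from the new set to the old one.

if __name__ == '__main__':
    m = URLManager()
    m.save_new_urls(['//item.jd.com/1.html', '//item.jd.com/2.html', '//item.jd.com/1.html'])
    print(m.get_new_url_num())                        # 2 -- the duplicate was dropped
    m.get_new_url()
    print(m.get_new_url_num(), m.get_old_url_num())   # 1 1 -- the popped URL is now "old"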

3. Scraping the product data

import re
import requests
import json
from lxml import etree
import pandas as pd

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/92.0.4515.131 Safari/537.36'
}


class JDdata:
    def get_data(self, url):
        # pull the goods id out of the item URL
        url_p = 'https://item.jd.com/(.*?).html'
        Goods_id = re.findall(url_p, url, re.S)
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        detail = etree.HTML(response.text)  # parse the item page

        # ---------------- product name ----------------
        try:
            [product_name] = detail.xpath(
                '//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li[1]/@title')
        except Exception:
            product_name = ''
            print('Could not extract the product name -- check the xpath expression')

        # ---------------- brand ----------------
        [product_brand] = detail.xpath('//*[@id="parameter-brand"]/li/a/text()')
        print("Brand: " + str(product_brand))
        # (a page-title xpath, '/html/body/div[6]/div/div[2]/div[1]/text()', was also tried but is not used here)

        # ---------------- price ----------------
        # request the price JSON for this sku
        p = requests.get('https://p.3.cn/prices/mgets?skuIds=J_' + Goods_id[0], headers=headers)
        [product_dict] = json.loads(p.text)
        product_m_price = product_dict['m']   # highest price
        product_price = product_dict['p']     # current price
        product_o_price = product_dict['op']  # guide (list) price

        # ---------------- comments ----------------
        # request the comment summary JSON for this sku
        c = requests.get('https://club.jd.com/comment/productCommentSummaries.action?referenceIds='
                         + Goods_id[0], headers=headers)
        comment_dict = json.loads(c.text.split('[')[-1].split(']')[0])  # cut the object out of the wrapper
        total_comment_num = comment_dict['CommentCountStr']
        good_comment_num = comment_dict['GoodCountStr']
        good_percent_com = comment_dict['GoodRate']
        bad_comment_num = comment_dict['PoorCountStr']
        bad_percent_com = comment_dict['PoorRate']
        print('Poor-review rate: ' + str(bad_percent_com))

        data = pd.DataFrame([[str(product_name), str(product_brand).strip(), product_price, product_o_price,
                              str(total_comment_num), str(good_comment_num), str(good_percent_com),
                              str(bad_comment_num), str(bad_percent_com)]],
                            columns=['product_name', 'product_brand', 'product_price', 'product_o_price',
                                     'total_comment_num', 'good_comment_num', 'good_percent_com',
                                     'bad_comment_num', 'bad_percent_com'])
        return data


# quick test
if __name__ == '__main__':
    url = 'https://item.jd.com/100020693650.html'
    jd = JDdata()
    jd.get_data(url)
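The line [product_dict] = json.loads(p.text) only works because the price endpoint is expected to return a JSON array containing exactly one object with the keys 'p', 'op' and 'm'. The snippet below illustrates that assumption with made-up values; the real response can differ.

import json

# Illustrative only: the response shape the code above assumes, with made-up values.
sample = '[{"op": "2999.00", "m": "3299.00", "p": "2599.00", "id": "J_100020693650"}]'
[product_dict] = json.loads(sample)   # unpacking fails unless the list holds exactly one object
print(product_dict['p'], product_dict['op'], product_dict['m'])   # current, guide and highest price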

4. Saving the data

import os
import pandas as pd


class DataSave:
    def __init__(self, path):
        # file path the data is written to
        self.path = path

    def save(self, data):
        # create the file (with header) on the first save; afterwards append rows without repeating the header
        if not os.path.exists(self.path):
            print("---> path '" + self.path + "' does not exist")
            print('---> it will be created automatically')
            data.to_csv(self.path, encoding='utf_8_sig', index=False, index_label=False, mode='a')
            print('---> data saved for the first time')
        else:
            data.to_csv(self.path, encoding='utf_8_sig', index=False, mode='a', header=False, index_label=False)
            print('---> data appended')


if __name__ == "__main__":
    data = pd.DataFrame([['李王三', '48', '男'], ['张一五', '43', '男']], columns=['name', 'age', 'sex'])
    save_path = 'E:\\jd.csv'
    ds = DataSave(save_path)
    ds.save(data)
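Note that os.path.exists only checks whether the CSV file itself exists; if the parent directory is missing, to_csv will still raise an error. A small optional helper (not part of the original class) that creates the directory first:

import os

def ensure_parent_dir(path):
    # hypothetical helper: create the parent directory of `path` if it does not exist yet
    parent = os.path.dirname(path)
    if parent and not os.path.exists(parent):
        os.makedirs(parent, exist_ok=True)

ensure_parent_dir('E:\\jd\\data\\jd.csv')   # example path -- adjust to your own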

5. Task scheduling

from urlmanager import URLManager
from goods_data import JDdata
from url_data import url_data
from save_data import DataSave

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/92.0.4515.131 Safari/537.36'
}


class Scheduler:
    def __init__(self, path, root_url):
        # wire up the components
        self.url_manager = URLManager()
        self.url_data = url_data()
        self.jd_data = JDdata()
        self.save_data = DataSave(path)
        self.root_url = root_url

    def run_spider(self):
        # seed the to-crawl set with the search URL
        self.url_manager.save_new_url(self.root_url)
        # take it back out and use it to collect the product URLs (currently 30)
        url = self.url_manager.get_new_url()
        new_urls = self.url_data.get_url(url)
        # push the product URLs into the to-crawl set
        self.url_manager.save_new_urls(new_urls)
        # get_new_url removes one URL per call, so looping while the count is > 0 is enough
        while self.url_manager.get_new_url_num() > 0:
            url = 'https:' + self.url_manager.get_new_url()  # search hrefs are protocol-relative
            print(url)
            data = self.jd_data.get_data(url)
            self.save_data.save(data)


if __name__ == "__main__":
    root_url = "https://search.jd.com/Search?keyword=裤子"
    save_url = "E:\\jd.csv"
    Spider = Scheduler(save_url, root_url)
    Spider.run_spider()
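Given the legal reminder at the top, it is also worth throttling the crawl and not letting a single broken page stop the whole run. A hedged variant of the same loop as run_spider, written as a standalone helper the original does not have; the delay value is arbitrary:

import time

def crawl_politely(scheduler, delay=2):
    # same flow as Scheduler.run_spider, plus a pause between requests and per-item error handling
    scheduler.url_manager.save_new_url(scheduler.root_url)
    seed = scheduler.url_manager.get_new_url()
    scheduler.url_manager.save_new_urls(scheduler.url_data.get_url(seed))
    while scheduler.url_manager.get_new_url_num() > 0:
        url = 'https:' + scheduler.url_manager.get_new_url()
        try:
            scheduler.save_data.save(scheduler.jd_data.get_data(url))
        except Exception as e:
            print('skipped {}: {}'.format(url, e))   # one broken page should not abort the run
        time.sleep(delay)                            # be polite: pause between requests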