> 文档中心 > 爬子第一篇:zol手机型号参数抓取

爬子第一篇:zol手机型号参数抓取


目标

爬取url:https://detail.zol.com.cn/cell_phone_advSearch/subcate57_1_s8975_1_1__2.html
数据需求:
抓取主流品牌的所有手机机型和相关参数。这应该是我写过的第一个正式的爬虫。
在这里插入图片描述

方法论

不需要登录,也没有加密参数,只需要 cookie 和 UA 就能获取页面。先通过 selenium 获取 cookie,再通过分页列表获取每款手机的 url【注:详情页的 url 需要手动拼接】,最后请求详情页抓取参数。
提前准备好要抓取的品牌列表(保存为 brandlist.csv,供脚本读取),包括品牌、url(页码位置用 {} 占位)和页数。如:
brand,url,pagenum
华为,https://detail.zol.com.cn/cell_phone_advSearch/subcate57_1_m613_1_1__{}.html,24
vivo,https://detail.zol.com.cn/cell_phone_advSearch/subcate57_1_m1795_1_1__{}.html,15
oppo,https://detail.zol.com.cn/cell_phone_advSearch/subcate57_1_m1673_1_1__{}.html,13

代码

import csv
import json
import time

import pandas as pd
import requests
from lxml import etree
from selenium import webdriver


def down_cookie():
    """Open the ZOL detail site in Chrome and save its cookies to cookies.json.

    The cookies returned by the browser session are serialized as JSON,
    printed, and written to ``cookies.json`` in the working directory.
    """
    url = 'https://detail.zol.com.cn/'
    driver = webdriver.Chrome(executable_path='/Users/fangli/Downloads/chromedriver')
    try:
        driver.get(url)
        dictCookies = driver.get_cookies()  # list of cookie dicts from the session
        jsonCookies = json.dumps(dictCookies)
        print(jsonCookies)
        # Persist the cookies so later requests can reuse them.
        with open('cookies.json', 'w', encoding='utf-8') as f:
            f.write(jsonCookies)
        time.sleep(3)
    finally:
        # quit() ends the whole driver session even if a step above failed.
        driver.quit()


def get_cookie():
    """Return the cookies stored in cookies.json as a ``Cookie`` header string.

    Returns:
        A string of the form ``name1=value1; name2=value2``.

    Raises:
        OSError: if cookies.json cannot be opened.
        ValueError: if cookies.json does not contain valid JSON.
    """
    with open('cookies.json', 'r', encoding='utf-8') as f:
        listCookies = json.load(f)
    return '; '.join(
        '{}={}'.format(item["name"], item["value"]) for item in listCookies
    )


def get_response(pageurl, retries=3):
    """Fetch ``pageurl`` with the saved cookies and return the response text.

    If the cookie file is missing/unreadable or the request fails, fresh
    cookies are downloaded with ``down_cookie()`` and the request is retried.

    Args:
        pageurl: URL to fetch.
        retries: number of additional attempts after the first one.

    Returns:
        The response body as text, or ``None`` when every attempt failed.
    """
    for _ in range(retries + 1):
        try:
            cookie = get_cookie()
        except (OSError, ValueError):
            # No usable cookie file yet (e.g. first run): create one and retry.
            print('更换cookie')
            down_cookie()
            continue
        headers = {
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36",
            "cookie": cookie,
        }
        try:
            # A timeout keeps a stalled connection from hanging the crawl.
            return requests.get(url=pageurl, headers=headers, timeout=30).text
        except requests.RequestException:
            print('更换cookie')
            down_cookie()
    return None


def get_datail(detail_url):
    """Scrape one phone's parameter page and append its rows to the CSV file.

    Each parameter is appended to ``手机参数数据采集全量.csv`` as a
    ``phone_name,key,value`` row and echoed to stdout. Pages that cannot be
    fetched or that lack the product-name heading are skipped.

    Args:
        detail_url: URL of the phone's ``param.shtml`` page.
    """
    response = get_response(detail_url)
    if not response:
        print('skip (no response):', detail_url)
        return
    html = etree.HTML(response)
    if html is None:
        print('skip (unparseable page):', detail_url)
        return
    names = html.xpath('//h1[@class="product-model__name"]/text()')
    if not names:
        print('skip (no product name):', detail_url)
        return
    phone_name = str(names[0]).replace('参数', '')

    with open(file='手机参数数据采集全量.csv', encoding='utf-8', mode='a', newline='') as files:
        # csv.writer quotes fields that contain separators; '\n' keeps the
        # same line ending the file has always used.
        writer = csv.writer(files, lineterminator='\n')
        # Only the first ten parameter tables of the page are read.
        for i in range(1, 11):
            trs = html.xpath('//div[@class="detailed-parameters"]/table[{}]/tr'.format(i))
            for tr in trs:
                keys = tr.xpath("./th/span/text() | ./th/a/text()")
                values = tr.xpath('./td/span/text() | ./td/span/a/text()')
                if not keys or not values:
                    # Rows without both a key cell and a value cell carry no data.
                    continue
                k = keys[0]
                v = str(values[0]).replace('>', '').replace(',', ';')
                writer.writerow([phone_name, k, v])
                print(phone_name, k, v)


def get_pagelist(pageurl):
    """Scrape every phone listed on one search-result page.

    For each list item the product id is read from the anchor's
    ``proName_<id>`` attribute and the detail URL is assembled by hand as
    ``https://detail.zol.com.cn/<int(id[:4]) + 1>/<id>/param.shtml``.

    Args:
        pageurl: URL of one page of the brand's search results.
    """
    response = get_response(pageurl)
    if not response:
        print('skip (no response):', pageurl)
        return
    html = etree.HTML(response)
    if html is None:
        print('skip (unparseable page):', pageurl)
        return
    for item in html.xpath('//*[@id="result_box"]/div[2]/ul/li'):
        ids = item.xpath('./dl/dt/a/@id')
        if not ids:
            continue
        proname = str(ids[0]).replace('proName_', '')
        try:
            prename = int(proname[0:4]) + 1
        except ValueError:
            # The id does not start with four digits, so no URL can be built.
            continue
        detail_url = 'https://detail.zol.com.cn/{}/{}/param.shtml'.format(prename, proname)
        get_datail(detail_url)


def get_data():
    """Crawl every brand listed in brandlist.csv.

    The first three columns of brandlist.csv are read positionally as brand,
    url template (``{}`` is replaced by the page number) and page count.
    """
    df = pd.read_csv('brandlist.csv')
    for row in df.itertuples(index=False):
        brand, url, pagenum = row[0], row[1], int(row[2])
        for j in range(1, pagenum + 1):
            pageurl = str(url).format(j)
            print(brand, pageurl, j)
            get_pagelist(pageurl)


if __name__ == '__main__':
    get_data()