Scraping all the ranking data from a university rankings website with Python
Covered rankings: the Best Chinese Universities Ranking (中国大学排名), the Ranking of Top Universities in Greater China (中国两岸四地大学排名), the Global Ranking of Sport Science Schools and Departments (全球体育类院系学术排名), and the Academic Ranking of World Universities (世界大学学术排名),
as well as the Best Chinese Subjects Ranking (中国最好学科排名), the Chinese university major rankings (中国大学专业排名), and the Global Ranking of Academic Subjects (世界一流学科排名), with a separate table for every individual subject and major.
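All of the data comes from shanghairanking.cn, through two channels: some rankings (ARWU and the per-subject/per-major tables) are served by a public JSON API under /api/pub/v1/, while the per-year ranking pages embed their tables in a Nuxt payload.js file that the script parses with regular expressions. As a minimal sketch of the API channel (it reuses the endpoint and JSON field names the full script below relies on; the live endpoint may additionally want the same Cookie header the script sends), fetching one year of ARWU looks roughly like this:

# Minimal sketch of the JSON-API channel; the endpoint and field names are the
# ones the full script below uses. Prints the top five ARWU 2021 entries.
import requests

url = "https://www.shanghairanking.cn/api/pub/v1/arwu/rank?year=2021"
resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
for r in resp.json()["data"]["rankings"][:5]:
    print(r["ranking"], r["univNameCn"], r["region"], r["score"])

The full script follows.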
import requests
import re
import os

import pandas as pd

# Mimic a browser to get past basic anti-scraping checks. The cookie/TOKEN was
# copied from a logged-in session and will expire; replace it with your own if
# requests start failing.
cookie = '''Hm_lvt_af1fda4748dacbd3ee2e3a69c3496570=1648800879,1648802769,1648806576,1648808126; Hm_lpvt_af1fda4748dacbd3ee2e3a69c3496570=1648808126; TOKEN=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjE2NDg4MjYxNDQsImp0aSI6IjI3MzA0NSIsImlzcyI6IjE1MCoqKio0NTcyIn0.FNeg0WENVRU90bOeT1HrrA0EeAy42t3LZkVvVwmIPus'''
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.7 Safari/537.36',
    'Cookie': cookie,
}

# One output directory per ranking, all under 排行/
b1 = "排行/"
os.makedirs(b1, exist_ok=True)
s1 = os.path.join(b1, "中国大学排名/")
os.makedirs(s1, exist_ok=True)
s2 = os.path.join(b1, "中国最好学科排名/")
os.makedirs(s2, exist_ok=True)
s3 = os.path.join(b1, "中国大学专业排名/")
os.makedirs(s3, exist_ok=True)
s4 = os.path.join(b1, "世界大学学术排名/")
os.makedirs(s4, exist_ok=True)
s5 = os.path.join(b1, "世界一流学科排名/")
os.makedirs(s5, exist_ok=True)
s6 = os.path.join(b1, "中国两岸四地大学排名/")
os.makedirs(s6, exist_ok=True)
s7 = os.path.join(b1, "全球体育类院系学术排名/")
os.makedirs(s7, exist_ok=True)


def get_payload_js(url):
    # Every ranking page has its own payload.js; locate it in the page HTML,
    # fetch it and return the JS text.
    resp = requests.get(url, headers=headers)
    if resp.status_code == 404:
        return None  # 404: this ranking does not exist for the requested year
    payload_href = re.findall(r'/_nuxt/static/(.*?)/payload\.js"', resp.text, re.S)[0]
    js_url = "https://www.shanghairanking.cn/_nuxt/static/{}/payload.js".format(payload_href)
    resp2 = requests.get(js_url, headers=headers)
    return resp2.text


def get_num(s):
    # Take the part before "-" in a range such as "101-150" and return it as an int.
    s = s.split("-")[0]
    return int(s)


def get_bcur_data(year):
    # 中国大学排名 (Best Chinese Universities Ranking)
    url = f"https://www.shanghairanking.cn/rankings/bcur/{year}"
    payload_js = get_payload_js(url)
    if not payload_js:
        return None
    params = get_params(payload_js)
    data = re.findall(r'\{univUp(.*?)\},', payload_js, re.S)
    if year not in (2020, 2021):
        k = "生源质量"  # earlier editions report 生源质量
    else:
        k = "办学层次"  # the 2020/2021 editions report 办学层次 instead
    items = {
        "排名": [],
        "学校名称": [],
        "省市": [],
        "类型": [],
        "总分": [],
        k: [],
    }
    for i, _ in enumerate(data):
        rank = get_data(_, r'ranking\:(.*?)\,', params)
        univNameCn = get_data(_, r'univNameCn\:(.*?)\,', params)
        score = get_data(_, r'score\:(.*?)\,', params)
        province = get_data(_, r'province\:(.*?)\,', params)
        univCategory = get_data(_, r'univCategory\:(.*?),', params)
        indData_score = get_data(_, r'indData\:\{\"(.*?)\"\:(.*?)\,', params)
        items['排名'].append(rank)
        items['学校名称'].append(univNameCn)
        items['省市'].append(province)
        items['类型'].append(univCategory)
        items['总分'].append(score)
        items[k].append(indData_score)
    return items


def get_rtugc_data(year):
    # 中国两岸四地大学排名 (Ranking of Top Universities in Greater China)
    url = "https://www.shanghairanking.cn/rankings/rtugc/{}".format(year)
    payload_js = get_payload_js(url)
    if not payload_js:
        return None
    params = get_params(payload_js)
    data = re.findall(r'\{ranking(.*?)\},', payload_js, re.S)
    items = {
        "排名": [],
        "学校名称": [],
        "国家/地区": [],
        "总分": [],
        "研究生比例": [],
    }
    for i, _ in enumerate(data):
        _ = "ranking" + _
        # Resolve each field and append it to the matching column.
        rank = get_data(_, r'ranking\:(.*?)\,', params)
        univNameCn = get_data(_, r'univNameCn\:(.*?)\,', params)
        score = get_data(_, r'score\:(.*?)\,', params)
        region = get_data(_, r'region\:(.*?)\,', params)
        indData = get_data(_, r'data\:\{\"(.*?)\"\:(.*?)\,', params)
        items['排名'].append(rank)
        items['学校名称'].append(univNameCn)
        items['国家/地区'].append(region)
        items['总分'].append(score)
        items['研究生比例'].append(indData)
    return items
def get_grsssd_data(year):
    # 全球体育类院系学术排名 (Global Ranking of Sport Science Schools and Departments)
    url = "https://www.shanghairanking.cn/rankings/grsssd/{}".format(year)
    payload_js = get_payload_js(url)
    if not payload_js:
        return None
    params = get_params(payload_js)
    data = re.findall(r'\{ranking(.*?)\},', payload_js, re.S)
    items = {
        "排名": [],
        "学校名称": [],
        "国家/地区": [],
        "总分": [],
        "论文数": [],
    }
    for i, _ in enumerate(data):
        _ = "ranking" + _
        rank = get_data(_, r'ranking\:(.*?)\,', params)
        univNameCn = get_data(_, r'univNameCn\:(.*?)\,', params)
        score = get_data(_, r'score\:(.*?)\,', params)
        region = get_data(_, r'region\:(.*?)\,', params)
        indData = get_data(_, r'indData\:\{\"(.*?)\"\:(.*?)\,', params)
        items['排名'].append(rank)
        items['学校名称'].append(univNameCn)
        items['国家/地区'].append(region)
        if score:
            items['总分'].append(score)
        else:
            items['总分'].append('')
        items['论文数'].append(indData)
    return items


def get_arwu_data(year):
    # 世界大学学术排名 (Academic Ranking of World Universities), served by a public JSON API
    url = f"https://www.shanghairanking.cn/api/pub/v1/arwu/rank?year={year}"
    print(url)
    resp = requests.get(url, headers=headers)
    data = resp.json()
    if int(year) != 2003:
        k = "校友获奖"
    else:
        k = "教师获奖"  # the 2003 edition reports a different indicator
    items = {
        "排名": [],
        "学校名称": [],
        "国家/地区": [],
        "国家/地区 排名": [],
        "总分": [],
        k: [],
    }
    inds = {_['nameCn']: _['code'] for _ in data['data']['inds']}  # indicator name -> code
    for r in data['data']['rankings']:
        items['排名'].append(r['ranking'])
        items['学校名称'].append(r['univNameCn'])
        items['国家/地区'].append(r['region'])
        items['国家/地区 排名'].append(r['regionRanking'])
        items['总分'].append(r['score'])
        if k in inds:
            items[k].append(r['indData'].get(inds[k], "0"))
        else:
            items[k].append('')
    return items


def get_bcsr_name(year):
    # 中国最好学科排名: fetch the subject codes and names for one year
    url = "https://www.shanghairanking.cn/_nuxt/static/1647508426/rankings/bcsr/{}/payload.js".format(year)
    resp = requests.get(url, headers=headers)
    params = get_params(resp.text)
    data = re.findall(r'\{code(.*?)\},', resp.text, re.S)
    id_items = []
    for _ in data:
        _ = "code" + _
        _id = get_data(_, r'code\:(.*?)\,', params)
        if len(_id) <= 2:
            continue  # skip short (category) codes, keep real subject codes
        nameCn = get_data(_, r'nameCn\:(.*?)\,', params)
        id_items.append({"id": _id, "name": nameCn})
    return id_items


def get_bcsr_data(id_s, year):
    # 中国最好学科排名: fetch the ranking table for one subject code
    url = f"https://www.shanghairanking.cn/api/pub/v1/bcsr/rank?target_yr={year}&yr={year - 1}&subj_code={id_s}"
    resp = requests.get(url, headers=headers)
    data = resp.json()
    items = {
        f"{year}排名": [],
        f"{year - 1}排名": [],
        "全部层次": [],
        "学校名称": [],
        "总分": [],
    }
    for j in data['data']['rankings']:
        items[f"{year}排名"].append(j['ranking'])
        if j['contrastRanking'] and j['contrastRanking'].get(f"{year - 1}"):
            items[f"{year - 1}排名"].append(j['contrastRanking'].get(f"{year - 1}"))
        else:
            items[f"{year - 1}排名"].append('')
        items['全部层次'].append(j['rankPctTop'])
        items['学校名称'].append(j['univNameCn'])
        items['总分'].append(j['score'])
    return items


def get_gras_name(year):
    # 世界一流学科排名: fetch the subject codes and names for one year
    url = "https://www.shanghairanking.cn/_nuxt/static/1647508426/rankings/gras/{}/payload.js".format(year)
    resp = requests.get(url, headers=headers)
    params = get_params(resp.text)
    data = re.findall(r'\{code(.*?)\},', resp.text, re.S)
    id_items = []
    for _ in data:
        _ = "code" + _
        _id = get_data(_, r'code\:(.*?)\,', params)
        if len(_id) <= 4:
            continue
        nameCn = get_data(_, r'nameCn\:(.*?)\,', params)
        id_items.append({"id": _id, "name": nameCn})
    return id_items


def get_gras_data(id_s, year):
    # 世界一流学科排名: fetch the ranking table for one subject code
    url = f"https://www.shanghairanking.cn/api/pub/v1/gras/rank?year={year}&subj_code={id_s}"
    resp = requests.get(url, headers=headers)
    data = resp.json()
    if year not in (2020, 2021):
        k = "论文总数"
    else:
        k = "重要期刊论文数"  # the 2020/2021 editions report a different indicator
    items = {
        f"{year}排名": [],
        "学校名称": [],
        "国家/地区": [],
        "总分": [],
        k: [],
    }
    inds = {_['nameCn']: _['code'] for _ in data['data']['inds']}
    for j in data['data']['rankings']:
        items[f"{year}排名"].append(j['ranking'])
        items['学校名称'].append(j['univNameCn'])
        items['国家/地区'].append(j['region'])
        items['总分'].append(j['score'])
        if k in inds:
            items[k].append(j['indData'].get(inds[k], "0"))
        else:
            items[k].append('')
    return items
items[f"{year}排名"].append(j['ranking']) items['学校名称'].append(j['univNameCn']) items['国家/地区'].append(j['region']) items['总分'].append(j['score']) if k in inds: items[k].append(j['indData'].get(inds[k], "0")) else: items[k].append('') return itemsdef get_bcmr_name(): #获取中国大学专业排名的专业编码及名称 url = "https://www.shanghairanking.cn/rankings/bcmr/2021" payload_js = get_payload_js(url) if not payload_js: return None params = get_params(payload_js) data = re.findall('name=(.*?);(.*?)code=(.*?);', payload_js, re.S) id_items = [] for _ in data: name = _[0] if name in params: name = params[name] name = name.replace('"', "") _id = _[-1] if _id in params: _id = params[_id] _id = _id.replace('"', "") if len(_id) <= 4: continue id_items.append({"id": _id, "name": name}) return id_itemsdef parse(p): p = p.replace('"', "") if p == "null": return None if p == "false": return False return p# 将js里面的null及false转为python里面的None,False,pyhon里面必须大写,将''去除def get_data(s, re_t,params): k = re.findall(re_t, s, re.S)[0] if isinstance(k, tuple): k = k[-1] if k in params: k = params[k] else: k = k.replace('"', "") return kdef get_params(payload_js): params1 = re.findall('function\((.*?)\)', payload_js, re.S)[0].split(",") params1 = [x.strip() for x in params1] # 参数列表 params2 = [parse(_) for _ in re.findall("""}\((.*?)\)""", payload_js, re.S)[0].replace("\n", "").replace('"2021,2020"','"2021|2020"').split( ",")] # 值的列表 # 映射关系 params = {x: y for x, y in zip(params1, params2)} return params#获取参数对应的payload_js数据,def get_bcmr_data(id_s): # 获取中国大学专业排名排名的学科编码及名称对应的排行榜 url = f"https://www.shanghairanking.cn/api/pub/v1/bcmr/rank?year=2021&majorCode={id_s}" resp = requests.get(url, headers=headers) data = resp.json() items = { "评级": [], "排名": [], "学校名称": [], "省市": [], "总分": [], } #inds = {_['nameCn']: _['code'] for _ in data['data']['inds']} for j in data['data']['rankings']: items["评级"].append(j['grade']) items["排名"].append(j['ranking']) items['学校名称'].append(j['univNameCn']) city=j['city'] if j['city'] else j['province'] items['省市'].append(city) # items['省市'].append(j['province']) items['总分'].append(j['score']) return itemsif __name__ == '__main__': begin_year = 2003 while begin_year = 2015: s1_d = os.path.join(s1, str(begin_year)) if not os.path.exists(s1_d ): os.makedirs(s1_d ) ch1 = get_bcur_data(begin_year) if ch1: file_name = os.path.join(s1_d, "中国大学排名.csv") # 在年份子目录下建立对应排名文件 pd.DataFrame(ch1).to_csv(file_name, index=False) #写入数据 if 2011 <= begin_year <= 2020: s6_d = os.path.join(s6, str(begin_year)) if not os.path.exists(s6_d): os.makedirs(s6_d) c2 = get_rtugc_data(begin_year) if c2: file_name = os.path.join(s6_d, "中国两岸四地大学排名.csv") pd.DataFrame(c2).to_csv(file_name, index=False) if 2016 <= begin_year =2017: s2_d = os.path.join(s2, str(begin_year)) if not os.path.exists(s2_d): os.makedirs(s2_d) for rk in get_bcsr_name(begin_year): # 中国最好学科排名 c4 = get_bcsr_data(rk['id'], begin_year) if c4: file_name = os.path.join(s2_d, rk['name'] + ".csv") pd.DataFrame(c4).to_csv(file_name, index=False) s5_d = os.path.join(s5, str(begin_year)) if not os.path.exists(s5_d): os.makedirs(s5_d) for gr in get_gras_name(begin_year): # 世界一流学科排名 c5 = get_gras_data(gr['id'], begin_year) if c5: file_name = os.path.join(s5_d, gr['name'] + ".csv") pd.DataFrame(c5).to_csv(file_name, index=False) if begin_year == 2021: s3_d = os.path.join(s3, str(begin_year)) if not os.path.exists(s3_d): os.makedirs(s3_d) for zr in get_bcmr_name(): c6 = get_bcmr_data(zr['id']) if c6: file_name = os.path.join(s3_d, zr['name'] + ".csv") pd.DataFrame(c6).to_csv(file_name, index=False) 
        # 世界大学学术排名 exists for every year from 2003, so it sits outside the conditions above.
        s4_d = os.path.join(s4, str(begin_year))
        os.makedirs(s4_d, exist_ok=True)
        c7 = get_arwu_data(str(begin_year))
        file_name = os.path.join(s4_d, "世界大学学术排名.csv")
        pd.DataFrame(c7).to_csv(file_name, index=False)
        begin_year += 1
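The least obvious part of the script is the get_params / get_data pair. The per-year ranking pages are statically generated (hence the /_nuxt/static/ path), and payload.js wraps the table data in a JS function whose single-letter parameters stand in for repeated values; the scraper therefore rebuilds the parameter-to-value mapping and resolves every field through it. A self-contained sketch of that idea on a made-up payload fragment (hypothetical data, not a real response):

import re

# Hypothetical, heavily simplified payload.js fragment; the real file is much
# larger but has the same function(...)(...)  shape.
payload_js = '(function(a,b){return {univNameCn:a,ranking:1,score:b}}("清华大学",969.2))'

names = [x.strip() for x in re.findall(r'function\((.*?)\)', payload_js, re.S)[0].split(",")]
values = re.findall(r'}\((.*?)\)', payload_js, re.S)[0].split(",")
params = dict(zip(names, values))                  # {'a': '"清华大学"', 'b': '969.2'}

token = re.findall(r'univNameCn:(.*?),', payload_js, re.S)[0]   # 'a', a placeholder
print(params.get(token, token).strip('"'))         # -> 清华大学

This is the same resolution step get_data performs for every field in the bcur/rtugc/grsssd parsers; values written literally in the payload (such as ranking:1 above) fall through the params lookup and only have their quotes stripped. When the main loop finishes, each ranking has its own folder under 排行/, with one subfolder per year and one CSV per table.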