> 文档中心 > 用python处理28万条人人贷数据,告诉你最详细的借款人结构分布情况

用python处理28万条人人贷数据,告诉你最详细的借款人结构分布情况

目录

一、import 

二、导入数据

三、借款人籍贯分布图

四、性别分布

五、教育程度分布

六、借款人年龄分布

七、借款人职位分布

八、借款人行业分布

九、借款金额分布图

十、借款人收入分布

十一、婚姻状况分布

十二、车贷情况

十三、房贷情况


零、写在前面 

①28万条数据我会尽快上传到CSDN的资源里,有兴趣的读者可以自行下载

②文章只是列举最简单的分布情况,比如还可以看看各年龄段学历组成等

③数据里有一条贷款理由,可以画出词云图

④数据里包含各个借款人的信用评级,可以尝试使用深度学习等方法训练信用评级预测模型

⑤pandas、matplotlib都是较为基础的用法,不做过多注释

⑥爬虫参考代码:人人贷散标爬虫实例进阶-使用异步io_小zhan柯基-CSDN博客、人人贷散标爬虫实例_小zhan柯基-CSDN博客

一、import 

import jieba
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import mpl_toolkits.axisartist as AA
import numpy as np
import pandas as pd
import pylab
from mpl_toolkits.axisartist.axislines import SubplotZero
from wordcloud import WordCloud

# Select the SimHei font so that Chinese text in the charts is displayed.
pylab.mpl.rcParams['font.sans-serif'] = ['SimHei']
# Works around the minus sign not being displayed.
plt.rcParams['axes.unicode_minus'] = False

二、导入数据

①使用read_csv导入数据

②设置列名

③花式索引

④将“id”设置为索引index

⑤去除所有都是nan的数据

# Names for the 27 columns of the header-less export, in file order.
# The three "none" entries label columns that are never used.
raw_column_names = [
    "id", "借款时间(月)", "剩余还款时间(月)", "借款金额",
    "notPayInterest", "productRepayType",
    "贷款类型", "利率", "性别", "籍贯", "出生日期", "教育程度",
    "工作单位", "行业", "公司规模", "职位", "收入",
    "车贷", "汽车数量", "婚姻状况", "房贷", "房子数量", "信用等级",
    "none", "none", "none",
    "借款理由",
]

# Read the GBK-encoded CSV (no header row) and attach the names above.
data = pd.read_csv("all.csv", encoding="gbk", header=None, parse_dates=True)
data.columns = raw_column_names

# Everything except the two repayment fields and the "none" placeholders is
# kept, in the original column order.
unused_names = ("notPayInterest", "productRepayType", "none")
kept_column_names = [name for name in raw_column_names if name not in unused_names]

# Index the selection by "id" and discard rows in which every remaining
# column is missing.
conciseData = (
    data[kept_column_names]
    .set_index("id")
    .dropna(how="all")
)

三、借款人籍贯分布图

# Bar chart: percentage of borrowers per region of origin.

# Left-to-right order of the bars. The chart title describes it as the 2020
# ranking of provinces by per-capita disposable income.
province_order = [
    "上海", "北京", "浙江", "天津", "江苏", "广东", "福建", "山东", "辽宁",
    "内蒙", "重庆", "湖南", "安徽", "江西", "海南", "湖北", "河北", "四川", "陕西",
    "吉林", "宁夏", "山西", "黑龙江", "河南", "广西", "青海", "新疆", "云南", "贵州", "西藏", "甘肃",
]
bar_colors = [
    "grey", "gold", "darkviolet", "turquoise", "r", "g", "b", "c",
    "k", "darkorange", "lightgreen", "plum",
    "tan", "khaki", "pink", "skyblue", "lawngreen", "salmon",
]

# Keep the text before the first ":" of every non-missing entry. This series
# is computed once and reused for both the counts and the denominator.
native_place = conciseData["籍贯"].dropna().apply(lambda x: x.split(":")[0])

# Strip the administrative suffixes so the values match the short names in
# province_order (removing "古" is what turns "内蒙古" into "内蒙").
short_names = native_place.apply(
    lambda x: x.replace("省", "").replace("市", "").replace("壮族自治区", "").replace("古", "")
)

# Percentages are relative to all non-missing entries, including values that
# are not plotted.
percentages = short_names.value_counts() / len(native_place) * 100

# Selecting the wanted labels directly makes the former
# drop(index=["保密", "null", "请选择", "深圳"]) and [:31] steps unnecessary:
# they could only raise KeyError (label absent from the data, or a listed
# province falling outside the 31 most frequent values) and never changed the
# values selected here. A province missing from the data still raises KeyError.
reigon = percentages[province_order]

plt.figure(figsize=(16, 8))
plt.title("借款人籍贯分布图(按2020年各省人均可支配收入排序)", fontsize=20)
plt.ylabel("百分比/%", size=20)
plt.xticks(rotation=45, fontsize=15)
plt.yticks(fontsize=15)
plt.bar(reigon.index, reigon, color=bar_colors)
plt.savefig("借款人籍贯分布图.jpg", dpi=500, bbox_inches="tight")

四、性别分布

# Pie chart of the gender split among rows that have a gender recorded.
gender_counts = conciseData["性别"].dropna().value_counts()
wedge_text_style = {'fontsize': 17, 'color': 'black'}

gender_counts.plot.pie(figsize=(5, 5), autopct='%.2f%%', textprops=wedge_text_style)
# The y-axis label doubles as the chart caption.
plt.ylabel("性别分布", fontsize=20)
# loc=2 is "upper left": that corner of the legend is anchored at x=1.05 in
# axes coordinates, i.e. just right of the axes.
plt.legend(loc=2, bbox_to_anchor=(1.05, 1.0), fontsize=15)
plt.savefig("性别分布图.jpg", dpi=500, bbox_inches="tight")

 

五、教育程度分布

# Pie chart of borrowers' education level.
#
# Side effect: the "教育程度" column of conciseData is overwritten with the
# cleaned values; rows holding an excluded value end up as NaN.

# Values that are not education levels and must not appear in the chart.
excluded_values = ["其他借款", "投资创业", "短期周转", "装修借款", "请选择", "购车借款", "专科", "大专高中或以下", ""]

# Strip commas, spaces and the stray "短期周转" text from string cells;
# non-string cells (e.g. NaN) become "" so that the exclusion list catches them.
# NOTE(review): the original chain ended with `.replace("", "")`, which is a
# no-op and was removed; possibly an invisible character was lost - confirm.
cleaned = conciseData["教育程度"].apply(
    lambda x: x.replace(",", "").replace(" ", "").replace("短期周转", "")
    if isinstance(x, str) else ""
)

# `where` keeps the row alignment and writes NaN for excluded values, which is
# what the former filter-then-assign produced through index alignment.
conciseData["教育程度"] = cleaned.where(~cleaned.isin(excluded_values))

# value_counts() skips NaN, so the shares are relative to the retained rows.
level_counts = conciseData["教育程度"].value_counts()
(level_counts / level_counts.sum()).plot.pie(
    figsize=(5, 5), autopct='%.1f%%', textprops={'fontsize': 17, 'color': 'black'})
plt.title("教育程度分布图", fontsize=20)
plt.ylabel("")
plt.legend(loc=2, bbox_to_anchor=(1.05, 1.0), fontsize=15)
plt.savefig("教育程度分布图.jpg", dpi=500, bbox_inches="tight")

六、借款人年龄分布

# Bar chart titled "borrower age distribution"; the x-axis shows the leading
# "/"-separated component of the birth date (presumably the year - confirm
# the date format in the data).
birth_dates = conciseData["出生日期"]
bar_colors = [
    "grey", "gold", "darkviolet", "turquoise", "r", "g", "b", "c",
    "k", "darkorange", "lightgreen", "plum",
    "tan", "khaki", "pink", "skyblue", "lawngreen", "salmon",
]

# Missing birth dates are skipped before splitting: a NaN cell has no
# .split() and used to abort the whole computation. The denominator is still
# the full column length, so results are unchanged when nothing is missing.
year = (
    birth_dates.dropna().apply(lambda x: x.split("/")[0]).value_counts()
    / len(birth_dates)
    * 100
)
# Sort by label and drop the first 10 and last 5 entries.
year = year.sort_index()[10:-5]

plt.figure(figsize=(16, 8))
plt.title("借款人年龄分布图", fontsize=20)
plt.ylabel("百分比/%", size=20)
plt.xticks(rotation=45, fontsize=15)
plt.yticks(fontsize=15)
plt.bar(year.index, year, color=bar_colors)
plt.savefig("借款人年龄分布图.jpg", dpi=500, bbox_inches="tight")

七、借款人职位分布

# Bar chart of the 25 most frequent job titles, as a percentage of all rows
# (the denominator is the column length, so rows without a title count too).
job_titles = conciseData["职位"]
title_counts = job_titles.value_counts()
position = (title_counts / len(job_titles) * 100)[:25]

bar_colors = [
    "grey", "gold", "darkviolet", "turquoise", "r", "g", "b", "c",
    "k", "darkorange", "lightgreen", "plum",
    "tan", "khaki", "pink", "skyblue", "lawngreen", "salmon",
]

plt.figure(figsize=(16, 8))
plt.title("借款人职位分布图", fontsize=20)
plt.ylabel("百分比/%", size=20)
plt.xticks(rotation=60, fontsize=14)
plt.yticks(fontsize=15)
plt.bar(position.index, position, color=bar_colors)
plt.savefig("借款人职位分布图.jpg", dpi=500, bbox_inches="tight")

八、借款人行业分布

# Bar chart of the 15 most frequent industries, as a percentage of all rows.
industries = conciseData["行业"]
bar_colors = [
    "grey", "gold", "darkviolet", "turquoise", "r", "g", "b", "c",
    "k", "darkorange", "lightgreen", "plum",
    "tan", "khaki", "pink", "skyblue", "lawngreen", "salmon",
]

# The denominator is the length of the industry column itself. It used to be
# len(conciseData["职位"]), copied from the job-title chart: the same number
# (both are columns of one frame) but misleading to read.
ind = (industries.value_counts() / len(industries) * 100)[:15]

plt.figure(figsize=(16, 8))
plt.title("借款人行业分布图", fontsize=20)
plt.ylabel("百分比/%", size=20)
plt.xticks(rotation=60, fontsize=20)
plt.yticks(fontsize=20)
plt.bar(ind.index, ind, color=bar_colors)
plt.savefig("借款人行业分布图.jpg", dpi=500, bbox_inches="tight")

九、借款金额分布图

# Bar chart of the 10 most common loan amounts; each bar is that amount's
# share among those 10 (not among all loans).
bar_colors = [
    "grey", "gold", "darkviolet", "turquoise", "r", "g", "b", "c",
    "k", "darkorange", "lightgreen", "plum",
    "tan", "khaki", "pink", "skyblue", "lawngreen", "salmon",
]

# Build the "<amount>元" labels on a separate series. The original overwrote
# conciseData["借款金额"] with these strings, so running the block a second
# time failed in int() and the column was no longer numeric afterwards.
# Missing amounts are skipped because int(NaN) raises.
amount_labels = conciseData["借款金额"].dropna().apply(lambda x: str(int(x)) + "元")

top_amounts = amount_labels.value_counts().iloc[:10]
loanAmount = top_amounts / top_amounts.sum() * 100

# Start a fresh default-sized figure so the bars are not drawn on top of
# whatever figure is still current when this runs outside a notebook cell.
plt.figure()
plt.title("借款金额分布图", fontsize=20)
plt.ylabel("百分比/%", size=20)
plt.xticks(rotation=60, fontsize=20)
plt.yticks(fontsize=15)
plt.bar(loanAmount.index, loanAmount, color=bar_colors)
plt.savefig("借款人金额分布图.jpg", dpi=500, bbox_inches="tight")

十、借款人收入分布

# Bar chart of borrower income brackets; each bar is the bracket's share
# among the 7 most frequent values of the income column.
income_brackets = [
    "1000元以下", "1001-2000元", "2000-5000元", "5000-10000元",
    "10000-20000元", "20000-50000元", "50000元以上",
]
bar_colors = [
    "grey", "gold", "darkviolet", "turquoise", "r", "g", "b", "c",
    "k", "darkorange", "lightgreen", "plum",
    "tan", "khaki", "pink", "skyblue", "lawngreen", "salmon",
]

# .iloc makes the positional "first seven" explicit, and Series.sum() replaces
# the builtin sum() over a Series.
top_counts = conciseData["收入"].value_counts().iloc[:7]
salary = top_counts / top_counts.sum() * 100
# Put the bars in ascending bracket order; raises KeyError if one of the
# brackets is not among the 7 most frequent values.
salary = salary[income_brackets]

# Start a fresh default-sized figure so the bars are not drawn on top of
# whatever figure is still current when this runs outside a notebook cell.
plt.figure()
plt.title("借款人收入分布图", fontsize=20)
plt.ylabel("百分比/%", size=20)
plt.xticks(rotation=60, fontsize=20)
plt.yticks(fontsize=15)
plt.bar(salary.index, salary, color=bar_colors)
plt.savefig("借款人收入分布图.jpg", dpi=500, bbox_inches="tight")

十一、婚姻状况分布

# Pie chart of marital status among rows that have one recorded.
marital_counts = conciseData["婚姻状况"].dropna().value_counts()
wedge_text_style = {'fontsize': 17, 'color': 'black'}

marital_counts.plot.pie(figsize=(5, 5), autopct='%.1f%%', textprops=wedge_text_style)
plt.title("婚姻状况分布图", fontsize=20)
# Blank out the y-axis label.
plt.ylabel("")
# loc=2 is "upper left": that corner of the legend is anchored at x=1.05 in
# axes coordinates, i.e. just right of the axes.
plt.legend(loc=2, bbox_to_anchor=(1.05, 1.0), fontsize=15)
plt.savefig("婚姻状况分布图.jpg", dpi=500, bbox_inches="tight")

十二、车贷情况

# Pie chart of the car-loan column among rows that have a value recorded.
car_loan_counts = conciseData["车贷"].dropna().value_counts()
wedge_text_style = {'fontsize': 17, 'color': 'black'}

car_loan_counts.plot.pie(figsize=(5, 5), autopct='%.1f%%', textprops=wedge_text_style)
plt.title("车贷情况分布图", fontsize=20)
# Blank out the y-axis label.
plt.ylabel("")
# loc=2 is "upper left": that corner of the legend is anchored at x=1.05 in
# axes coordinates, i.e. just right of the axes.
plt.legend(loc=2, bbox_to_anchor=(1.05, 1.0), fontsize=15)
plt.savefig("车贷情况分布图.jpg", dpi=500, bbox_inches="tight")

十三、房贷情况

# Pie chart of the mortgage column among rows that have a value recorded.
mortgage_counts = conciseData["房贷"].dropna().value_counts()
wedge_text_style = {'fontsize': 17, 'color': 'black'}

mortgage_counts.plot.pie(figsize=(5, 5), autopct='%.1f%%', textprops=wedge_text_style)
plt.title("房贷情况分布图", fontsize=20)
# Blank out the y-axis label.
plt.ylabel("")
# loc=2 is "upper left": that corner of the legend is anchored at x=1.05 in
# axes coordinates, i.e. just right of the axes.
plt.legend(loc=2, bbox_to_anchor=(1.05, 1.0), fontsize=15)
plt.savefig("房贷情况分布图.jpg", dpi=500, bbox_inches="tight")