> 文档中心 > python提取CSDN博客文章url和标题

Python 提取 CSDN 博客文章 URL 和标题

文档中心

# -*- coding: UTF-8 -*-#导入request和re模块from urllib import requestimport re#防止被应用到其他模块调用if __name__ == "__main__":#自己的CSDN list地址注意格式    url = 'https://blog.csdn.net/icanflyingg/article/list/'    head = {}#定义一个字典，将爬虫程序披上浏览器访问的外衣    head['User-Agent'] = 'User-Agent,Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.90 Safari/537.36 2345Explorer/9.3.2.17331'    #自己CSDN页码数    for i in range (1,10): url1=url+str(i) #把所需要的请求参数进行打包,然后一起交给urlopen 进行请求 req = request.Request(url1, headers=head)        #发起请求 response = request.urlopen(req)        #对收集的参数进行解码 html = response.read().decode('utf-8')            #re.M ：多行匹配            #"re.S"单行匹配,如果分行则显示为一行/n            #re.I : 忽略大小写 arr = re.findall(r'(.+?)
',html,re.I|re.S|re.M) for value in arr:     #打印文章url              #"lstrip()"取出字段前面空格            #"re.S"单行匹配,如果分行则显示为一行/n            #re.findall提取操作re.findall('匹配内容(.*)"匹配内容',value)[0])        weburl=str(re.findall('"(.*)" ',value)[0]).lstrip()     print(weburl)            ##打印标题     title = str(re.findall('(.*)', value, re.S)[0]).lstrip()     print(title)