Getting to Know the Scrapy Framework 04 - CrawlSpider
- CrawlSpider: a class, a subclass of Spider
- Ways to crawl an entire site:
    - Based on Spider: issue requests manually
    - Based on CrawlSpider
- Using CrawlSpider:
    - Create a project
    - cd XXX
    - Create the spider file (CrawlSpider); the generated skeleton is sketched below:
        - scrapy genspider -t crawl xxx www.xxxx.com
    - Link extractor:
        - Purpose: extract links that match the specified rule (allow)
    - Rule parser:
        - Purpose: parse the pages behind the extracted links according to the specified rule (callback)
- Requirement: crawl the number, news title, and news content from the sun website (the number is taken from both the list page and the detail page)
    - Analysis: the data to crawl is not all on the same page.
        - 1. Use a link extractor to extract all the pagination links
        - 2. Use a link extractor to extract all the news detail-page links
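As referenced in the list above, running `scrapy genspider -t crawl xxx www.xxxx.com` produces a skeleton roughly like the following (a sketch of the stock Scrapy crawl template; the spider name and domain are just the placeholders passed on the command line):

```python
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class XxxSpider(CrawlSpider):
    name = 'xxx'
    allowed_domains = ['www.xxxx.com']
    start_urls = ['http://www.xxxx.com/']

    rules = (
        # allow: regex deciding which links are extracted
        # callback: method that parses each matched page
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        return item
```

You then replace the allow regexes, callbacks, and parsing logic with your own, as in the sun spider below.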
For this example it is recommended to use proxy IPs; registering with Zhima Proxy (芝麻代理) gives a quota of free IPs every day.
Spider file (sun.py):

```python
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from sunPro.items import SunproItem, DetailItem


class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['']
    start_urls = ['https://wz.sun0769.com/political/index/politicsNewest?id=1&page=']

    # Link extractor: extracts links matching the specified rule (allow="regex"),
    # starting from the pages reached via start_urls
    link = LinkExtractor(allow=r'id=1&page=\d+')
    link_detail = LinkExtractor(allow=r'id=\d+')

    rules = (
        # Rule parser: parses the pages behind the extracted links with the specified callback
        Rule(link, callback='parse_item', follow=True),
        # follow=True: keep applying the link extractor to the pages those links lead to,
        # e.g. extract the page-3 link from page 2, and so on
        Rule(link_detail, callback='parse_detail'),
    )

    # The two methods below cannot share data via request passing (meta)!
    # How to store the data parsed by the two methods: use two separate item classes.

    # Parse the complaint number and title
    def parse_item(self, response):
        li_list = response.xpath('/html/body/div[2]/div[3]/ul[2]/li')
        for li in li_list:
            p_num = li.xpath('./span[1]/text()').extract_first()
            p_title = li.xpath('./span[3]/a/text()').extract_first()
            # print(p_num, p_title)
            item = SunproItem()
            item['p_title'] = p_title
            item['p_num'] = p_num
            yield item

    # Parse the complaint content and number
    def parse_detail(self, response):
        p_content = response.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/pre/text()').extract_first()
        p_id = response.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/span[4]/text()').extract_first()
        # print(p_id, p_content)
        if not p_id:
            print('Empty value handled!')
        else:
            p_id = p_id[3:]
            item = DetailItem()
            item['p_content'] = p_content
            item['p_id'] = p_id
            yield item
```
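The comment in the spider notes that the two Rule callbacks cannot share data via request passing, which is why two item classes are used and the rows are joined later in the database. For contrast, this is roughly how meta-based request passing looks when the requests are built manually in a plain Spider; the single `NewsItem` class and the `@href` XPath are illustrative assumptions, not part of this project:

```python
import scrapy


# Hypothetical item holding all fields in one place (not defined in this project's items.py)
class NewsItem(scrapy.Item):
    p_num = scrapy.Field()
    p_title = scrapy.Field()
    p_content = scrapy.Field()


class SunMetaSpider(scrapy.Spider):
    name = 'sun_meta'
    start_urls = ['https://wz.sun0769.com/political/index/politicsNewest?id=1&page=1']

    def parse(self, response):
        for li in response.xpath('/html/body/div[2]/div[3]/ul[2]/li'):
            item = NewsItem()
            item['p_num'] = li.xpath('./span[1]/text()').extract_first()
            item['p_title'] = li.xpath('./span[3]/a/text()').extract_first()
            # Assumed: the title link's href points at the detail page
            detail_url = response.urljoin(li.xpath('./span[3]/a/@href').extract_first())
            # meta carries the half-filled item into the detail callback
            yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})

    def parse_detail(self, response):
        item = response.meta['item']
        item['p_content'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/pre/text()').extract_first()
        yield item
```

Because Rule-driven requests are generated automatically, a CrawlSpider has no such place to attach meta, so the two-item approach is used instead.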
pipelines.py:

```python
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql


class SunproPipeline:
    def process_item(self, item, spider):
        # Tell the two item types apart by their class name
        if item.__class__.__name__ == 'SunproItem':
            print(item['p_num'], item['p_title'])
        else:
            print(item['p_id'], item['p_content'])
        return item


class mysqlPileLine:
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='127.0.0.1', user='root', password='123456',
                                    db='rxkc', charset='utf8')

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        try:
            # Parameterized queries let pymysql handle quoting of the values
            if item.__class__.__name__ == 'SunproItem':
                self.cursor.execute('insert into sun values(%s, %s)',
                                    (item["p_num"], item["p_title"]))
            else:
                self.cursor.execute('insert into sunn values(%s, %s)',
                                    (item["p_id"], item["p_content"]))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
```
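As the template comment reminds, both pipeline classes only run once they are registered in ITEM_PIPELINES. A minimal settings.py excerpt, assuming the project package is named sunPro as in the imports above (the priority numbers are the usual convention, not taken from the original note):

```python
# settings.py (excerpt) -- lower number = earlier in the pipeline chain
ITEM_PIPELINES = {
    'sunPro.pipelines.SunproPipeline': 300,
    'sunPro.pipelines.mysqlPileLine': 301,
}
```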
middlewares.py:

```python
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
import random


class SunproDownloaderMiddleware:
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]
    PROXY_http = [
        '125.106.138.41:4234',
        '42.6.114.117:7018',
    ]
    PROXY_https = [
        '110.89.123.129:4213',
        '118.123.40.30:4231',
        '42.6.114.104:2687',
    ]

    # Intercept every outgoing request: rotate the User-Agent
    # (a fixed proxy could also be attached here)
    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        # request.meta['proxy'] = 'https://42.6.114.99:9702'
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    # Intercept requests that raised an exception: attach a random proxy
    def process_exception(self, request, exception, spider):
        if request.url.split(':')[0] == 'http':
            request.meta['proxy'] = 'http://' + random.choice(self.PROXY_http)
        else:
            request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)
        # Resend the corrected request
        return request
```
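The downloader middleware only takes effect once it is enabled in settings.py as well. A minimal excerpt, assuming the same sunPro package layout (543 is the priority used in Scrapy's stock template):

```python
# settings.py (excerpt) -- enable UA rotation and the proxy retry on exceptions
DOWNLOADER_MIDDLEWARES = {
    'sunPro.middlewares.SunproDownloaderMiddleware': 543,
}
```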
items.py:

```python
import scrapy


class SunproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    p_num = scrapy.Field()
    p_title = scrapy.Field()


class DetailItem(scrapy.Item):
    p_content = scrapy.Field()
    p_id = scrapy.Field()
```
The result of joining the two tables:
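The join query itself is not shown in the original note. A minimal sketch of how the two tables could be joined on the number with pymysql, assuming the tables were created with columns named (p_num, p_title) and (p_id, p_content) to match the pipeline's insert statements:

```python
import pymysql

# Assumed connection parameters match the ones used in mysqlPileLine above
conn = pymysql.Connect(host='127.0.0.1', user='root', password='123456', db='rxkc', charset='utf8')
cursor = conn.cursor()

# Join the list table (number, title) with the detail table (number, content)
# so every row carries number, title and content together.
cursor.execute(
    'SELECT sun.p_num, sun.p_title, sunn.p_content '
    'FROM sun JOIN sunn ON sun.p_num = sunn.p_id'
)
for row in cursor.fetchall():
    print(row)

cursor.close()
conn.close()
```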