Getting to Know the Scrapy Framework 04 - CrawlSpider
- CrawlSpider: a class, a subclass of Spider
- Ways to crawl an entire site:
    - Based on Spider: issue requests manually
    - Based on CrawlSpider
- Using CrawlSpider:
    - Create a project
    - cd XXX
    - Create the spider file (CrawlSpider); the generated skeleton is sketched below:
        - scrapy genspider -t crawl xxx www.xxxx.com
    - Link extractor:
        - Purpose: extract links that match the specified rule (allow)
    - Rule parser:
        - Purpose: parse the pages behind the extracted links according to the specified rule (callback)
- Requirement: crawl the number, news title, and news content from the sun website (the number is taken from both the list page and the detail page)
    - Analysis: the data to crawl is not all on the same page.
        - 1. Use a link extractor to extract all the pagination links
        - 2. Use a link extractor to extract all the news detail-page links
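As referenced in the list above, running `scrapy genspider -t crawl xxx www.xxxx.com` produces a skeleton roughly like the following (a sketch of the stock Scrapy crawl template; the spider name and domain are just the placeholders passed on the command line):

```python
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class XxxSpider(CrawlSpider):
    name = 'xxx'
    allowed_domains = ['www.xxxx.com']
    start_urls = ['http://www.xxxx.com/']

    rules = (
        # allow: regex deciding which links are extracted
        # callback: method that parses each matched page
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        return item
```

You then replace the allow regexes, callbacks, and parsing logic with your own, as in the sun spider below.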
For this example it is recommended to use proxy IPs; registering with Zhima Proxy (芝麻代理) gives a quota of free IPs every day.
Spider file (sun.py):

```python
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from sunPro.items import SunproItem, DetailItem


class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['']
    start_urls = ['https://wz.sun0769.com/political/index/politicsNewest?id=1&page=']

    # Link extractor: extracts links matching the specified rule (allow="regex"),
    # starting from the pages reached via start_urls
    link = LinkExtractor(allow=r'id=1&page=\d+')
    link_detail = LinkExtractor(allow=r'id=\d+')

    rules = (
        # Rule parser: parses the pages behind the extracted links with the specified callback
        Rule(link, callback='parse_item', follow=True),
        # follow=True: keep applying the link extractor to the pages those links lead to,
        # e.g. extract the page-3 link from page 2, and so on
        Rule(link_detail, callback='parse_detail'),
    )

    # The two methods below cannot share data via request passing (meta)!
    # How to store the data parsed by the two methods: use two separate item classes.

    # Parse the complaint number and title
    def parse_item(self, response):
        li_list = response.xpath('/html/body/div[2]/div[3]/ul[2]/li')
        for li in li_list:
            p_num = li.xpath('./span[1]/text()').extract_first()
            p_title = li.xpath('./span[3]/a/text()').extract_first()
            # print(p_num, p_title)
            item = SunproItem()
            item['p_title'] = p_title
            item['p_num'] = p_num
            yield item

    # Parse the complaint content and number
    def parse_detail(self, response):
        p_content = response.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/pre/text()').extract_first()
        p_id = response.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/span[4]/text()').extract_first()
        # print(p_id, p_content)
        if not p_id:
            print('Empty value handled!')
        else:
            p_id = p_id[3:]
            item = DetailItem()
            item['p_content'] = p_content
            item['p_id'] = p_id
            yield item
```
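The comment in the spider notes that the two Rule callbacks cannot share data via request passing, which is why two item classes are used and the rows are joined later in the database. For contrast, this is roughly how meta-based request passing looks when the requests are built manually in a plain Spider; the single `NewsItem` class and the `@href` XPath are illustrative assumptions, not part of this project:

```python
import scrapy


# Hypothetical item holding all fields in one place (not defined in this project's items.py)
class NewsItem(scrapy.Item):
    p_num = scrapy.Field()
    p_title = scrapy.Field()
    p_content = scrapy.Field()


class SunMetaSpider(scrapy.Spider):
    name = 'sun_meta'
    start_urls = ['https://wz.sun0769.com/political/index/politicsNewest?id=1&page=1']

    def parse(self, response):
        for li in response.xpath('/html/body/div[2]/div[3]/ul[2]/li'):
            item = NewsItem()
            item['p_num'] = li.xpath('./span[1]/text()').extract_first()
            item['p_title'] = li.xpath('./span[3]/a/text()').extract_first()
            # Assumed: the title link's href points at the detail page
            detail_url = response.urljoin(li.xpath('./span[3]/a/@href').extract_first())
            # meta carries the half-filled item into the detail callback
            yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})

    def parse_detail(self, response):
        item = response.meta['item']
        item['p_content'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/pre/text()').extract_first()
        yield item
```

Because Rule-driven requests are generated automatically, a CrawlSpider has no such place to attach meta, so the two-item approach is used instead.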
pipelines.py:

```python
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql


class SunproPipeline:
    def process_item(self, item, spider):
        # Tell the two item types apart by their class name
        if item.__class__.__name__ == 'SunproItem':
            print(item['p_num'], item['p_title'])
        else:
            print(item['p_id'], item['p_content'])
        return item


class mysqlPileLine:
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='127.0.0.1', user='root', password='123456',
                                    db='rxkc', charset='utf8')

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        try:
            # Parameterized queries let pymysql handle quoting of the values
            if item.__class__.__name__ == 'SunproItem':
                self.cursor.execute('insert into sun values(%s, %s)',
                                    (item["p_num"], item["p_title"]))
            else:
                self.cursor.execute('insert into sunn values(%s, %s)',
                                    (item["p_id"], item["p_content"]))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
```
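As the template comment reminds, both pipeline classes only run once they are registered in ITEM_PIPELINES. A minimal settings.py excerpt, assuming the project package is named sunPro as in the imports above (the priority numbers are the usual convention, not taken from the original note):

```python
# settings.py (excerpt) -- lower number = earlier in the pipeline chain
ITEM_PIPELINES = {
    'sunPro.pipelines.SunproPipeline': 300,
    'sunPro.pipelines.mysqlPileLine': 301,
}
```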
middlewares.py:

```python
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
import random


class SunproDownloaderMiddleware:
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]
    PROXY_http = [
        '125.106.138.41:4234',
        '42.6.114.117:7018',
    ]
    PROXY_https = [
        '110.89.123.129:4213',
        '118.123.40.30:4231',
        '42.6.114.104:2687',
    ]

    # Intercept every outgoing request: rotate the User-Agent
    # (a fixed proxy could also be attached here)
    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        # request.meta['proxy'] = 'https://42.6.114.99:9702'
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    # Intercept requests that raised an exception: attach a random proxy
    def process_exception(self, request, exception, spider):
        if request.url.split(':')[0] == 'http':
            request.meta['proxy'] = 'http://' + random.choice(self.PROXY_http)
        else:
            request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)
        # Resend the corrected request
        return request
```
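The downloader middleware only takes effect once it is enabled in settings.py as well. A minimal excerpt, assuming the same sunPro package layout (543 is the priority used in Scrapy's stock template):

```python
# settings.py (excerpt) -- enable UA rotation and the proxy retry on exceptions
DOWNLOADER_MIDDLEWARES = {
    'sunPro.middlewares.SunproDownloaderMiddleware': 543,
}
```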
items.py:

```python
import scrapy


class SunproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    p_num = scrapy.Field()
    p_title = scrapy.Field()


class DetailItem(scrapy.Item):
    p_content = scrapy.Field()
    p_id = scrapy.Field()
```
The result of joining the two tables:
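The join query itself is not shown in the original note. A minimal sketch of how the two tables could be joined on the number with pymysql, assuming the tables were created with columns named (p_num, p_title) and (p_id, p_content) to match the pipeline's insert statements:

```python
import pymysql

# Assumed connection parameters match the ones used in mysqlPileLine above
conn = pymysql.Connect(host='127.0.0.1', user='root', password='123456', db='rxkc', charset='utf8')
cursor = conn.cursor()

# Join the list table (number, title) with the detail table (number, content)
# so every row carries number, title and content together.
cursor.execute(
    'SELECT sun.p_num, sun.p_title, sunn.p_content '
    'FROM sun JOIN sunn ON sun.p_num = sunn.p_id'
)
for row in cursor.fetchall():
    print(row)

cursor.close()
conn.close()
```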