
Linux Kernel Network Transmit Stack (Part 2)

Table of Contents

 Softirq Scheduling

 NIC Driver Transmit

 Supplement: NIC Startup

 TX Completion Hard Interrupt

 Summary


网络发送过程

 

邻居子系统后继续

Softirq Scheduling

When the softirq path is used: when the process-context send path cannot finish sending (its quota is exhausted, or the qdisc cannot be drained right away), __netif_schedule is called.

void __netif_schedule(struct Qdisc *q)
{
    if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
        __netif_reschedule(q);
}

static void __netif_reschedule(struct Qdisc *q)
{
    struct softnet_data *sd;
    unsigned long flags;

    local_irq_save(flags);
    sd = this_cpu_ptr(&softnet_data);
    q->next_sched = NULL;
    *sd->output_queue_tailp = q;
    sd->output_queue_tailp = &q->next_sched;
    raise_softirq_irqoff(NET_TX_SOFTIRQ);
    local_irq_restore(flags);
}

This raises a softirq of type NET_TX_SOFTIRQ.

Execution then enters the net_tx_action function, which picks up the send queue and ultimately calls into the driver's entry function dev_hard_start_xmit.

The CPU consumed by sending here is accounted as si (softirq time), not as the user process's system time.

static __latent_entropy void net_tx_action(struct softirq_action *h)
{
    // Get the send queue via softnet_data
    struct softnet_data *sd = this_cpu_ptr(&softnet_data);

    // If there are qdiscs on the output queue
    if (sd->output_queue) {
        struct Qdisc *head;

        local_irq_disable();
        // Point at the first qdisc
        head = sd->output_queue;
        sd->output_queue = NULL;
        sd->output_queue_tailp = &sd->output_queue;
        local_irq_enable();

        // Walk the list of qdiscs
        while (head) {
            struct Qdisc *q = head;
            spinlock_t *root_lock = NULL;

            head = head->next_sched;

            // Send the data; same path as process context:
            // qdisc_run => qdisc_restart => sch_direct_xmit -> dev_hard_start_xmit
            qdisc_run(q);
        }
    }

    xfrm_dev_backlog(sd);
}
  • In process context, __netif_reschedule appends the send queue (qdisc) to softnet_data's output_queue.
  • The softirq handler reads that queue back out of softnet_data.
  • The softirq loops over sd->output_queue and sends the queued frames.
  • qdisc_run(q) calls qdisc_restart => sch_direct_xmit -> dev_hard_start_xmit (a simplified sketch follows).
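For reference, here is a simplified sketch of __qdisc_run (based on net/sched/sch_generic.c; the exact code differs across kernel versions). It shows how the quota ties the two paths together: qdisc_restart is called in a loop, and once the budget runs out the remainder is handed back to the NET_TX softirq via __netif_schedule.

/* Simplified sketch of __qdisc_run; not verbatim kernel code. */
void __qdisc_run(struct Qdisc *q)
{
    int quota = dev_tx_weight;   /* per-run packet budget */
    int packets;

    /* qdisc_restart dequeues skbs and pushes them down via
     * sch_direct_xmit -> dev_hard_start_xmit */
    while (qdisc_restart(q, &packets)) {
        quota -= packets;
        if (quota <= 0 || need_resched()) {
            /* Budget used up (or we must yield the CPU):
             * defer the rest to the NET_TX_SOFTIRQ softirq. */
            __netif_schedule(q);
            break;
        }
    }
}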

NIC Driver Transmit

struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
                                    struct netdev_queue *txq, int *ret)
{
    struct sk_buff *skb = first;
    int rc = NETDEV_TX_OK;

    while (skb) {
        struct sk_buff *next = skb->next;

        skb_mark_not_on_list(skb);
        rc = xmit_one(skb, dev, txq, next != NULL);
        ...
    }
}

Call chain:

xmit_one

        netdev_start_xmit

                __netdev_start_xmit
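The two intermediate steps are thin wrappers. Roughly (simplified from net/core/dev.c and include/linux/netdevice.h; trace points omitted, details vary by kernel version), xmit_one hands the skb to any packet taps and then calls netdev_start_xmit, which looks up the device's netdev_ops and invokes the driver hook:

/* Simplified sketches, not verbatim kernel code. */
static int xmit_one(struct sk_buff *skb, struct net_device *dev,
                    struct netdev_queue *txq, bool more)
{
    int rc;

    /* Deliver a copy to packet taps (e.g. tcpdump) if any are active */
    if (dev_nit_active(dev))
        dev_queue_xmit_nit(skb, dev);

    rc = netdev_start_xmit(skb, dev, txq, more);
    return rc;
}

static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev,
                                            struct netdev_queue *txq, bool more)
{
    const struct net_device_ops *ops = dev->netdev_ops;
    netdev_tx_t rc;

    rc = __netdev_start_xmit(ops, skb, dev, more);
    if (rc == NETDEV_TX_OK)
        txq_trans_update(txq);
    return rc;
}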

static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops,
                                              struct sk_buff *skb, struct net_device *dev,
                                              bool more)
{
    __this_cpu_write(softnet_data.xmit.more, more);
    return ops->ndo_start_xmit(skb, dev);
}

For the ndo_start_xmit hook defined at the net device layer, igb's implementation is igb_xmit_frame. It is assigned when the NIC driver initializes:

static const struct net_device_ops igb_netdev_ops = {
    .ndo_open       = igb_open,
    .ndo_stop       = igb_close,
    .ndo_start_xmit = igb_xmit_frame,
};

Continuing into igb_xmit_frame:

static netdev_tx_t igb_xmit_frame(struct sk_buff *skb,
                                  struct net_device *netdev)
{
    struct igb_adapter *adapter = netdev_priv(netdev);

    return igb_xmit_frame_ring(skb, igb_tx_queue_mapping(adapter, skb));
}

which calls igb_xmit_frame_ring:

netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb, struct igb_ring *tx_ring)
{
    // Grab the next available buffer slot in the TX queue
    first = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
    first->skb = skb;
    first->bytecount = skb->len;
    first->gso_segs = 1;
    first->tx_flags = tx_flags;
    first->protocol = protocol;

    // Prepare the data for transmission
    if (igb_tx_map(tx_ring, first, hdr_len))
        ...
}

// Map the skb data into a DMA region the NIC can access
static int igb_tx_map(struct igb_ring *tx_ring,
                      struct igb_tx_buffer *first,
                      const u8 hdr_len)
{
    // Get a pointer to the next available descriptor
    tx_desc = IGB_TX_DESC(tx_ring, i);

    // Build a DMA mapping for skb->data so the device can read it from RAM
    dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);

    tx_buffer = first;

    // Walk all fragments of the packet and create a valid mapping for each
    for (frag = &skb_shinfo(skb)->frags[0];; frag++) {
        tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type ^ IGB_MAX_DATA_PER_TXD);
        tx_desc->read.olinfo_status = 0;
        tx_desc->read.buffer_addr = cpu_to_le64(dma);
    }

    // Set up the last descriptor
    cmd_type |= size | IGB_TXD_DCMD;
    tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type);

    netdev_tx_sent_queue(txring_txq(tx_ring), first->bytecount);

    dma_wmb();
    first->next_to_watch = tx_desc;
}

dma_map_single builds a memory mapping for skb->data so that the device can read the data from RAM via DMA.

Once all of the skb's data has been mapped to DMA addresses, transmission is triggered.
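For reference, this is the general streaming-DMA pattern at work here (a generic sketch using the standard DMA API, not igb's exact code): map the buffer before handing it to the hardware, and unmap it once the TX-completion interrupt reports that the hardware is done.

/* Generic streaming-DMA pattern for a TX buffer (sketch, not igb's code). */
#include <linux/dma-mapping.h>

static int example_map_tx(struct device *dev, void *buf, size_t len,
                          dma_addr_t *out_dma)
{
    dma_addr_t dma;

    /* Map the buffer so the device can read it via DMA */
    dma = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
    if (dma_mapping_error(dev, dma))
        return -ENOMEM;

    *out_dma = dma;   /* this address goes into the TX descriptor */
    return 0;
}

static void example_unmap_tx(struct device *dev, dma_addr_t dma, size_t len)
{
    /* Undo the mapping after the TX-completion interrupt says the
     * hardware is finished with the buffer. */
    dma_unmap_single(dev, dma, len, DMA_TO_DEVICE);
}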

Supplement: NIC Startup

static int __igb_open(struct net_device *netdev, bool resuming)
{
    // Allocate the ring buffers
    // TX queues
    err = igb_setup_all_tx_resources(adapter);
    // RX queues
    err = igb_setup_all_rx_resources(adapter);
    // Start all queues
    netif_tx_start_all_queues(netdev);
}

igb_setup_all_tx_resources -> igb_setup_tx_resources

int igb_setup_tx_resources(struct igb_ring *tx_ring)
{
    // Allocate the igb_tx_buffer array
    tx_ring->tx_buffer_info = vmalloc(size);

    // Allocate the e1000_adv_tx_desc DMA array
    /* round up to nearest 4K */
    tx_ring->size = tx_ring->count * sizeof(union e1000_adv_tx_desc);
    tx_ring->size = ALIGN(tx_ring->size, 4096);
    tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
                                       &tx_ring->dma, GFP_KERNEL);
    ...
}

static int igb_setup_all_tx_resources(struct igb_adapter *adapter)
{
    struct pci_dev *pdev = adapter->pdev;
    int i, err = 0;

    // Build one ring buffer per TX queue
    for (i = 0; i < adapter->num_tx_queues; i++) {
        err = igb_setup_tx_resources(adapter->tx_ring[i]);
    }
    return err;
}

Two arrays deserve attention:

  1. The igb_tx_buffer array: allocated with vzalloc; it is used only by the kernel.
  2. The e1000_adv_tx_desc array: allocated with dma_alloc_coherent; the hardware can access this memory directly via DMA, so this array is used by the NIC hardware.

The relationship between them:

        Both refer to the same skb, so the kernel and the hardware can work on the same data: the kernel writes the data into the skb, and the NIC hardware is responsible for sending it.
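To make this concrete, here is a small, self-contained user-space model (hypothetical and heavily simplified; the real igb structures contain many more fields): two parallel arrays share one ring index, next_to_use advances as packets are queued, and next_to_clean advances as the completion path reclaims slots.

/* Conceptual user-space model of a TX ring: two parallel arrays indexed by
 * the same ring position. Hypothetical and simplified, not igb's real layout. */
#include <stdint.h>
#include <stdio.h>

#define RING_SIZE 8

struct tx_buffer {            /* kernel-side bookkeeping (like igb_tx_buffer) */
    const char *skb;          /* stand-in for the struct sk_buff pointer */
    unsigned int len;
};

struct tx_desc {              /* device-visible descriptor (like e1000_adv_tx_desc) */
    uint64_t buffer_addr;     /* DMA address of the packet data */
    unsigned int cmd_type_len;
};

static struct tx_buffer tx_buffer_info[RING_SIZE]; /* vmalloc'd in the driver  */
static struct tx_desc   tx_descs[RING_SIZE];       /* dma_alloc_coherent'd     */
static unsigned int next_to_use, next_to_clean;

/* Queue a packet: fill both arrays at the same index, advance next_to_use. */
static void ring_queue(const char *skb, unsigned int len, uint64_t fake_dma)
{
    unsigned int i = next_to_use;

    tx_buffer_info[i].skb = skb;
    tx_buffer_info[i].len = len;
    tx_descs[i].buffer_addr  = fake_dma;   /* what the NIC would DMA from */
    tx_descs[i].cmd_type_len = len;

    next_to_use = (i + 1) % RING_SIZE;
}

/* TX-completion cleanup: release bookkeeping, advance next_to_clean. */
static void ring_clean(void)
{
    while (next_to_clean != next_to_use) {
        unsigned int i = next_to_clean;

        printf("reclaim slot %u: skb=%s len=%u\n",
               i, tx_buffer_info[i].skb, tx_buffer_info[i].len);
        tx_buffer_info[i].skb = NULL;   /* napi_consume_skb() in the driver */
        next_to_clean = (i + 1) % RING_SIZE;
    }
}

int main(void)
{
    ring_queue("skb-A", 1500, 0x1000);
    ring_queue("skb-B",  600, 0x2000);
    ring_clean();                /* what igb_clean_tx_irq does for real */
    return 0;
}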

TX Completion Hard Interrupt

Purpose: reclaim memory, free the skb, and sweep the RingBuffer.

static inline void ____napi_schedule(struct softnet_data *sd,
                                     struct napi_struct *napi)
{
    list_add_tail(&napi->poll_list, &sd->poll_list);
    __raise_softirq_irqoff(NET_RX_SOFTIRQ);
}

The softirq raised by this hard interrupt is NET_RX_SOFTIRQ.

static int igb_poll(struct napi_struct *napi, int budget)
{
    if (q_vector->tx.ring)
        clean_complete = igb_clean_tx_irq(q_vector, budget);
}

static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget)
{
    /* free the skb */
    napi_consume_skb(tx_buffer->skb, napi_budget);

    /* clear last DMA location and unmap remaining buffers */
    while (tx_desc != eop_desc) {
        tx_buffer++;
        tx_desc++;
        i++;
        if (unlikely(!i)) {
            i -= tx_ring->count;
            tx_buffer = tx_ring->tx_buffer_info;
            tx_desc = IGB_TX_DESC(tx_ring, 0);
        }

        /* unmap any remaining paged data */
        if (dma_unmap_len(tx_buffer, len)) {
            dma_unmap_page(tx_ring->dev,
                           dma_unmap_addr(tx_buffer, dma),
                           dma_unmap_len(tx_buffer, len),
                           DMA_TO_DEVICE);
            dma_unmap_len_set(tx_buffer, len, 0);
        }
    }
}

The skb is freed and the DMA mappings are unmapped.

Summary

1) Why is RX so much higher than TX in /proc/softirqs?

  1. When data has been sent, a hard interrupt signals transmit completion, and whether it is data reception or transmit completion, the softirq that gets triggered is NET_RX_SOFTIRQ.
  2. On the transmit side, the data is handled in the sending process's kernel context; NET_TX is only raised when that context's quota is exhausted and the softirq has to take over, so NET_TX stays low. On the receive side, everything goes through the NET_RX softirq (see the reader sketch below).
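The imbalance is easy to verify by summing the per-CPU counters in /proc/softirqs. A minimal user-space reader (a sketch; the file has one header row of CPU labels followed by one row per softirq type):

/* Sum the per-CPU NET_TX and NET_RX counters from /proc/softirqs. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static unsigned long long sum_row(const char *line)
{
    const char *p = strchr(line, ':');
    unsigned long long total = 0;
    char *end;

    if (!p)
        return 0;
    p++;
    for (;;) {
        unsigned long long v = strtoull(p, &end, 10);
        if (end == p)
            break;
        total += v;
        p = end;
    }
    return total;
}

int main(void)
{
    char line[4096];
    FILE *f = fopen("/proc/softirqs", "r");

    if (!f) {
        perror("/proc/softirqs");
        return 1;
    }
    while (fgets(line, sizeof(line), f)) {
        if (strstr(line, "NET_TX:"))
            printf("NET_TX total: %llu\n", sum_row(line));
        else if (strstr(line, "NET_RX:"))
            printf("NET_RX total: %llu\n", sum_row(line));
    }
    fclose(f);
    return 0;
}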

2) Memory copy operations

  1. After the skb is allocated, the data supplied by the user is copied into it.
  2. Between the transport layer and the network layer, each skb is cloned into a new copy. This is for retransmission: the original is only released once the ACK arrives.
  3. When the IP layer finds that the skb is larger than the MTU, it allocates additional skbs and copies the original skb into several smaller ones (see the MTU sketch below).
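Related to point 3, the MTU that triggers this fragmentation can be queried per interface. A minimal sketch using the SIOCGIFMTU ioctl (the interface name "eth0" is only an example):

/* Read an interface's MTU via SIOCGIFMTU; "eth0" is an example name. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>

int main(void)
{
    struct ifreq ifr;
    int fd = socket(AF_INET, SOCK_DGRAM, 0);

    if (fd < 0) {
        perror("socket");
        return 1;
    }

    memset(&ifr, 0, sizeof(ifr));
    strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);

    if (ioctl(fd, SIOCGIFMTU, &ifr) < 0) {
        perror("SIOCGIFMTU");
        close(fd);
        return 1;
    }

    /* An skb larger than this at the IP layer gets fragmented. */
    printf("%s MTU: %d\n", ifr.ifr_name, ifr.ifr_mtu);
    close(fd);
    return 0;
}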
