> 文档中心 > linux内核网络收包过程—TCP协议处理

linux内核网络收包过程—TCP协议处理

目录

socket创建过程

tcp_recvmsg

软中断模块

tcp_v4_rcv

sock_def_readable

__wake_up_common

总结


socket创建过程

/*
 * socket() system-call entry point: the three user arguments are
 * forwarded unchanged to __sys_socket(), which does the real work.
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
    return __sys_socket(family, type, protocol);
}

系统调用__sys_socket->sock_create->__sock_create

int __sock_create(struct net *net, int family, int type, int protocol, struct socket res, int kern){    ...    //分配sock对象sock = sock_alloc();    //获取协议族操作表pf = rcu_dereference(net_families[family]);    //协议族createerr = pf->create(net, sock, protocol, kern);}

看一下 PF_INET 协议族的注册过程及其 create 函数

/* PF_INET family descriptor: its create() hook is inet_create. */
static const struct net_proto_family inet_family_ops = {
    .family = PF_INET,
    .create = inet_create,
    .owner  = THIS_MODULE,
};

/* sock_register - add a socket protocol handler (excerpt) */
int sock_register(const struct net_proto_family *ops)
{
    spin_lock(&net_family_lock);
    /* refuse to register twice for the same family */
    if (rcu_dereference_protected(net_families[ops->family],
                                  lockdep_is_held(&net_family_lock)))
        err = -EEXIST;
    else {
        /* publish the ops table; readers use rcu_dereference() */
        rcu_assign_pointer(net_families[ops->family], ops);
        err = 0;
    }
}

/* At boot, the IPv4 stack registers its family ops. */
static int __init inet_init(void)
{
    ...
    (void)sock_register(&inet_family_ops);
}

AF_INET 协议族的 create 函数是 inet_create，它会把 inet_stream_ops 和 tcp_prot 分别设置到 socket->ops 与 sock->sk_prot 上。

/*
 * sock_init_data() (excerpt): installs the sock's default callbacks.
 * sk_data_ready (= sock_def_readable) is the pointer the softirq path
 * later calls to wake processes sleeping on this socket.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
    sk->sk_state_change = sock_def_wakeup;
    sk->sk_data_ready   = sock_def_readable;
    sk->sk_write_space  = sock_def_write_space;
    sk->sk_error_report = sock_def_error_report;
    sk->sk_destruct     = sock_def_destruct;
    ...
}

当软中断收到数据包时，会通过调用 sk_data_ready 函数指针来唤醒在 sock 上等待的进程。

tcp_recvmsg

/*
 * tcp_recvmsg() (excerpt): receive path behind recv()/recvfrom() for TCP.
 * Walks the skbs already queued on the socket; if not enough data has
 * been copied yet, blocks in sk_wait_data() until the softirq path
 * queues more data and wakes us.
 */
int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
                int flags, int *addr_len)
{
    do {
        /* iterate over skbs sitting in the receive queue */
        skb_queue_walk(&sk->sk_receive_queue, skb) {
            ...
        }
        if (copied >= target) {
            /* enough data copied: briefly drop/retake the sock lock */
            release_sock(sk);
            lock_sock(sk);
        } else {
            /* not enough data: sleep until data arrives or timeout */
            sk_wait_data(sk, &timeo, last);
        }
        ...
}

tcp_recvmsg 使用 skb_queue_walk 遍历接收队列来接收数据；若数据不足，则通过 sk_wait_data 把当前进程挂到等待队列上等待。

#define DEFINE_WAIT_FUNC(name, function)\struct wait_queue_entry name = {\.private= current,\.func= function,\.entry= LIST_HEAD_INIT((name).entry),\}static inline wait_queue_head_t *sk_sleep(struct sock *sk){BUILD_BUG_ON(offsetof(struct socket_wq, wait) != 0);return &rcu_dereference_raw(sk->sk_wq)->wait;}int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb){DEFINE_WAIT_FUNC(wait, woken_wake_function);int rc;add_wait_queue(sk_sleep(sk), &wait);sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);remove_wait_queue(sk_sleep(sk), &wait);return rc;}

DEFINE_WAIT_FUNC 宏：定义一个等待队列项 wait，注册回调函数 function，并把当前进程描述符 current 关联到其 .private 成员上；sk_sleep：获取 sock 对象下的等待队列头 wait_queue_head_t；add_wait_queue：把新定义的等待队列项 wait 插入到 sock 对象的等待队列中；sk_wait_event：让出 CPU，进程进入睡眠状态。


软中断模块

软中断 ksoftirqd 进程,收到数据包以后:

  1. 若是 tcp 的包 执行 tcp_v4_rcv 函数;
  2. 如果是 ESTABLISHED 状态下的数据包，把数据拆出来放到对应 socket 的接收队列中；
  3. 最后调用 sk_data_ready 来唤醒用户进程。

接下来分析源码,从tcp_v4_rcv 开始

tcp_v4_rcv

/*
 * tcp_v4_rcv() (excerpt): softirq entry point for incoming TCP/IPv4
 * packets. Looks up the owning sock from the packet headers and, when
 * no user context currently holds the sock, processes the skb at once.
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
    th = (const struct tcphdr *)skb->data;  /* TCP header */
    iph = ip_hdr(skb);                      /* IP header */
lookup:
    /* find the sock this packet belongs to */
    sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
                           th->dest, sdif, &refcounted);
    /* sock is not locked by a user-space context */
    if (!sock_owned_by_user(sk)) {
        skb_to_free = sk->sk_rx_skb_cache;
        sk->sk_rx_skb_cache = NULL;
        /* receive the data */
        ret = tcp_v4_do_rcv(sk, skb);
    }
    ...
}

/*
 * tcp_v4_do_rcv() (excerpt): dispatch by connection state.
 * ESTABLISHED sockets take the fast path via tcp_rcv_established().
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
    struct sock *rsk;
    /* packet for an ESTABLISHED connection */
    if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
        struct dst_entry *dst = sk->sk_rx_dst;
        ...
        tcp_rcv_established(sk, skb);
        return 0;
    }
    /* non-ESTABLISHED states are handled below (elided) */
}
/* Queue a received skb at the tail of the socket's receive queue (excerpt). */
static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
                                      bool *fragstolen)
{
    ...
    if (!eaten) {
        /* append the received data to the socket's receive queue */
        __skb_queue_tail(&sk->sk_receive_queue, skb);
        skb_set_owner_r(skb, sk);
    }
}

/* Notify readers via sk_data_ready (sock_def_readable by default). */
void tcp_data_ready(struct sock *sk)
{
    ...
    /* runs in softirq context; wakes any process waiting for data */
    sk->sk_data_ready(sk);
}

/* Main fast-path handler for ESTABLISHED sockets (excerpt). */
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
{
    /* put the payload on the receive queue */
    eaten = tcp_queue_rcv(sk, skb, &fragstolen);
    /* wake up the waiting reader */
    tcp_data_ready(sk);
    ...
}

sock_def_readable

sk_data_ready 是在 socket 创建时（sock_init_data 中）注册的 sock_def_readable 函数

/*
 * sock_def_readable(): default sk_data_ready callback, installed at
 * socket creation by sock_init_data(). Wakes processes sleeping on the
 * sock's wait queue when data becomes readable.
 */
void sock_def_readable(struct sock *sk)
{
    struct socket_wq *wq;
    rcu_read_lock();
    /* fetch the wait queue */
    wq = rcu_dereference(sk->sk_wq);
    /* only wake if some process is actually sleeping on this socket */
    if (skwq_has_sleeper(wq))
        /* wake the waiters, reporting readable/priority poll events */
        wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
                                        EPOLLRDNORM | EPOLLRDBAND);
    sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
    rcu_read_unlock();
}

recvfrom 执行的最后，通过 DEFINE_WAIT_FUNC 将当前进程关联的等待队列项添加到 sock->sk_wq 下的 wait 里了。

/*
 * wake_up_interruptible_sync_poll eventually lands here: wake one
 * exclusive TASK_INTERRUPTIBLE waiter (nr_exclusive = 1), passing
 * WF_SYNC to hint the scheduler not to migrate the woken task.
 */
void __wake_up_locked_sync_key(struct wait_queue_head *wq_head,
                               unsigned int mode, void *key)
{
    __wake_up_common(wq_head, mode, 1, WF_SYNC, key, NULL);
}

#define wake_up_interruptible_sync_poll_locked(x, m)    \
    __wake_up_locked_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m))

__wake_up_common

__wake_up_common 遍历等待队列，调用各等待项注册的回调函数（如 woken_wake_function），从而唤醒在 socket 上等待的用户进程。

总结

  1. 应用程序调用 socket 函数后进入内核态，创建相应的内核对象
  2. recv函数在进入内核态后查看接收队列,在没有数据到来时把当前进程阻塞
  3. 软中断上下文将处理完的数据放到 socket 的接收队列中
  4. 根据socket对象找到其等待队列中正在因等待而被阻塞的进程,把它唤醒。

注意上述有两次进程上下文切换的开销

1)每次进程为了等一个socket上数据就得从CPU上拿下来,然后再换上另一个进程

2)等到数据ready了,睡眠的进程又会被唤醒

每一次切换大约3~5us左右,如果是网络IO密集型的应用,CPU在不停做无用功。

高效的网络IO模型 select,poll,epoll

参考

https://course.0voice.com/v1/course/intro?courseId=2&agentId=0