> 文档中心 > linux 内核中epoll实现

linux 内核中epoll实现

目录

epoll_create

epoll_ctl

epoll_wait


epoll_create

struct eventpoll {/* Wait queue used by sys_epoll_wait() */wait_queue_head_t wq;/* Wait queue used by file->poll() */wait_queue_head_t poll_wait;/* List of ready file descriptors */struct list_head rdllist;/* RB tree root used to store monitored fd structs */struct rb_root_cached rbr;}static int ep_alloc(struct eventpoll **pep){struct eventpoll *ep;    //申请epollevent内存ep = kzalloc(sizeof(*ep), GFP_KERNEL);    //初始化等待队列init_waitqueue_head(&ep->wq);init_waitqueue_head(&ep->poll_wait);    //初始化就绪列表    INIT_LIST_HEAD(&ep->rdllist);    //初始化红黑树指针ep->rbr = RB_ROOT_CACHED;...}
  • wq:等待队列链表。软中断数据就绪的时候,会通过 wq 找到阻塞在 epoll 对象上的用户进程。
  • rbr:红黑树。为了支持对海量连接的高效查找、插入和删除,eventpoll 内部使用了一棵红黑树,通过这棵树来管理用户进程添加进来的所有 socket 连接。
  • rdllist:就绪描述符的链表。当有连接就绪的时候,内核会把就绪的连接放到 rdllist 链表里。应用进程只需要检查这个链表就能找出就绪的连接,而不用去遍历整棵树。

epoll_ctl

添加 socket

1,分配一个红黑树节点对象epitem

2,添加等待事件到socket的等待队列中,其回调函数是ep_poll_callback

3,将epitem插入到epoll对象的红黑树里

int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, bool nonblock){    //根据epfd找到eventpoll对象f = fdget(epfd);/* Get the "struct file *" for the target file */    //根据socket句柄号,找到tftf = fdget(fd);switch (op) {case EPOLL_CTL_ADD:if (!epi) {epds->events |= EPOLLERR | EPOLLHUP;     //error = ep_insert(ep, epds, tf.file, fd, full_check);} elseerror = -EEXIST;if (full_check)clear_tfile_check_list();break;}return error;}
static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,     struct file *tfile, int fd, int full_check){//分配epitemif (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))INIT_LIST_HEAD(&epi->rdllink);INIT_LIST_HEAD(&epi->fllink);INIT_LIST_HEAD(&epi->pwqlist);epi->ep = ep;ep_set_ffd(&epi->ffd, tfile, fd);epi->event = *event;epq.epi = epi;//设置监听的fd事件    //设置socket等待队列 并初始化ep_pqueue对象init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);//注册回调函数    //实际调用ep_ptable_queue_proc,注册函数为ep_poll_callbackrevents = ep_item_poll(epi, &epq.pt, 1);//插入红黑树ep_rbtree_insert(ep, epi);}
  • 分配epitem,它会被挂载到红黑树上;
  • 设置 socket 对象上的等待任务队列
  • ep_poll_callback 设置为数据就绪时候的回调函数

ep_ptable_queue_proc->ep_poll_callback

init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

ep_ptable_queue_proc 为当前被监听的 socket 创建一个等待队列项(回调函数设置为 ep_poll_callback),并把它加入到该 socket 的等待队列中。

static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, poll_table *pt){if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {//初始化回调方法init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);pwq->whead = whead;pwq->base = epi; //如果设置了EPOLLEXCLUSIVEif (epi->event.events & EPOLLEXCLUSIVE)add_wait_queue_exclusive(whead, &pwq->wait);elseadd_wait_queue(whead, &pwq->wait);list_add_tail(&pwq->llink, &epi->pwqlist);epi->nwait++;}}

 关于 EPOLLEXCLUSIVE:如果一个 socket fd 被添加到了多个 epoll 中进行监控,设置了这个标志后,这个 fd 上有事件发生时,只会唤醒其中一个(或少数几个)epoll,而不是唤醒全部 epoll,从而避免惊群。

static inline voidinit_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t func){wq_entry->flags= 0;wq_entry->private= NULL;wq_entry->func= func;}
  • private = NULL:socket 是交给 epoll 来管理的,不需要在一个 socket 就绪的时候就直接唤醒进程,所以这里设置为 NULL;而在前面分析 recvfrom 时,此处设置的是当前进程描述符 current。
  • ep_poll_callback注册到wait_queue_entry上,有数据到达时调用。

什么时候调用?软中断将数据收到 socket 的接收队列后,会通过注册的这个 ep_poll_callback 函数来回调,进而通知到 epoll 对象。

插入红黑树

红黑树在查找效率、插入效率、内存开销等多个方面比较均衡,因此 epoll 使用红黑树来管理所有的 epitem。

ep_rbtree_insert(ep, epi);


epoll_wait

等待接收

SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,int, maxevents, int, timeout){return do_epoll_wait(epfd, events, maxevents, timeout);}static int do_epoll_wait(int epfd, struct epoll_event __user *events, int maxevents, int timeout){ep = f.file->private_data;error = ep_poll(ep, events, maxevents, timeout);    ...}static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,   int maxevents, long timeout){     //是否有就绪事件eavail = ep_events_available(ep);    do { //Internally init_wait() uses autoremove_wake_function() //初始化wait,保存上下文,当前进程,即当前进程插入epoll等待队列 init_wait(&wait);    //是否有就绪队列eavail = ep_events_available(ep);if (!eavail) {//没有就绪队列,但是有信号需要处理if (signal_pending(current))res = -EINTR;else__add_wait_queue_exclusive(&ep->wq, &wait);//添加到等待队列}//阻塞当前进程if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {timed_out = 1;break;}} while (0);    }}
static int ep_send_events(struct eventpoll *ep,  struct epoll_event __user *events, int maxevents){struct ep_send_events_data esed;//定义保存触发的事件的结构体esed.maxevents = maxevents;esed.events = events;    //扫描就绪列表,调用f_op->poll()ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);return esed.res;}

看一下ep_send_events_proc

static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head,void *priv){//遍历就绪队列list_for_each_entry_safe(epi, tmp, head, rdllink) {if (esed->res >= esed->maxevents)break;//移出就绪队列list_del_init(&epi->rdllink);revents = ep_item_poll(epi, &pt, 1);if (!revents)continue;//写入调用方传入的结构体,返回0则说明成功if (__put_user(revents, &uevent->events) ||    __put_user(epi->event.data, &uevent->data)) {//失败则插入队列中list_add(&epi->rdllink, head);}esed->res++;uevent++;//设置了EPOLLONESHOT 则清除订阅的事件if (epi->event.events & EPOLLONESHOT)epi->event.events &= EP_PRIVATE_BITS;//水平触发则重新插入,下次epoll_wait继续触发else if (!(epi->event.events & EPOLLET)) {list_add_tail(&epi->rdllink, &ep->rdllist);ep_pm_stay_awake(epi);}}return 0;}

 

static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth){struct eventpoll *ep;bool locked;pt->_key = epi->event.events;if (!is_file_epoll(epi->ffd.file))return vfs_poll(epi->ffd.file, pt) & epi->event.events;ep = epi->ffd.file->private_data;poll_wait(epi->ffd.file, &ep->poll_wait, pt);locked = pt && (pt->_qproc == ep_ptable_queue_proc);return ep_scan_ready_list(epi->ffd.file->private_data,  ep_read_events_proc, &depth, depth,  locked) & epi->event.events;}static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt){if (unlikely(!file->f_op->poll))return DEFAULT_POLLMASK;return file->f_op->poll(file, pt);}

vfs_poll 调用了 file->f_op->poll;对于 socket 文件,实际调用的是 sock_poll,它再通过 sock->ops->poll 调用到具体协议的实现(TCP 为 tcp_poll)。

static __poll_t sock_poll(struct file *file, poll_table *wait){struct socket *sock = file->private_data;return sock->ops->poll(file, sock, wait) | flag;}
__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait){struct sock *sk = sock->sk;sock_poll_wait(file, sock, wait);}

参考

https://zhuanlan.zhihu.com/p/384098769

https://course.0voice.com/v1/course/intro?courseId=2&agentId=0