epoll是由一組系統調用組成。
    int epoll_create(int size);
    int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
    int epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout);
select/poll的缺點在于:
1. 每次調用時要重復地從用戶態讀入參數。
2. 每次調用時要重復地掃描文件描述符。
3. 每次在調用開始時,要把當前進程放入各個文件描述符的等待隊列;在調用結束后,又要把進程從各個等待隊列中刪除。
在實際應用中,select/poll監視的文件描述符可能會非常多,如果每次只是返回一小部分,那么在這種情況下select/poll
顯得不夠高效。epoll的設計思路,是把select/poll單個的操作拆分為1個epoll_create+多個epoll_ctl+1個epoll_wait。
epoll機制實現了自己特有的文件系統eventpoll filesystem
- /*?File?callbacks?that?implement?the?eventpoll?file?behaviour?*/??
- static?const?struct?file_operations?eventpoll_fops?=?{??
- ????.release????=?ep_eventpoll_release,??
- ????.poll???????=?ep_eventpoll_poll??
- };??
epoll_create創建一個屬于該文件系統的文件,然后返回其文件描述符。
?
struct eventpoll 保存了epoll文件節點的擴展信息,該結構保存于file結構體的private_data域中,每個epoll_create創建的epoll
描述符都分配一個該結構體。該結構的各個成員的定義如下,注釋也很詳細。
- /*?
- ?*?This?structure?is?stored?inside?the?"private_data"?member?of?the?file?
- ?*?structure?and?rapresent?the?main?data?sructure?for?the?eventpoll?
- ?*?interface.?
- ?*/??
- struct?eventpoll?{??
- ????/*?Protect?the?this?structure?access,可用于中斷上下文?*/??
- ????spinlock_t?lock;??
- ????/*?
- ?????*?This?mutex?is?used?to?ensure?that?files?are?not?removed?
- ?????*?while?epoll?is?using?them.?This?is?held?during?the?event?
- ?????*?collection?loop,?the?file?cleanup?path,?the?epoll?file?exit?
- ?????*?code?and?the?ctl?operations.用戶進程上下文中?
- ?????*/??
- ????struct?mutex?mtx;??
- ????/*?Wait?queue?used?by?sys_epoll_wait()?*/??
- ????wait_queue_head_t?wq;??
- ????/*?Wait?queue?used?by?file->poll()?*/??
- ????wait_queue_head_t?poll_wait;??
- ????/*?List?of?ready?file?descriptors?*/??
- ????struct?list_head?rdllist;??
- ????/*?RB?tree?root?used?to?store?monitored?fd?structs?*/??
- ????struct?rb_root?rbr;??
- ????/*?
- ?????*?This?is?a?single?linked?list?that?chains?all?the?"struct?epitem"?that?
- ?????*?happened?while?transfering?ready?events?to?userspace?w/out?
- ?????*?holding?->lock.?
- ?????*/??
- ????struct?epitem?*ovflist;??
- ????/*?The?user?that?created?the?eventpoll?descriptor?*/??
- ????struct?user_struct?*user;??
- };??
?
而通過epoll_ctl接口加入該epoll描述符監聽的套接字則屬于socket filesystem,這點一定要注意。每個添加的待監聽(這里監聽
和listen調用不同)的文件描述符都對應于一個epitem結構體,該結構體以紅黑樹的結構組織,eventpoll結構中保存了樹的根節點(rbr成員)。
同時,有監聽事件到來的套接字的epitem結構以雙向鏈表組織起來,鏈表頭也保存在eventpoll中(rdllist成員)。
- /*?
- ?*?Each?file?descriptor?added?to?the?eventpoll?interface?will?
- ?*?have?an?entry?of?this?type?linked?to?the?"rbr"?RB?tree.?
- ?*/??
- struct?epitem?{??
- ????/*?RB?tree?node?used?to?link?this?structure?to?the?eventpoll?RB?tree?*/??
- ????struct?rb_node?rbn;??
- ????/*?List?header?used?to?link?this?structure?to?the?eventpoll?ready?list?*/??
- ????struct?list_head?rdllink;??
- ????/*?
- ?????*?Works?together?"struct?eventpoll"->ovflist?in?keeping?the?
- ?????*?single?linked?chain?of?items.?
- ?????*/??
- ????struct?epitem?*next;??
- ????/*?The?file?descriptor?information?this?item?refers?to?*/??
- ????struct?epoll_filefd?ffd;??
- ????/*?Number?of?active?wait?queue?attached?to?poll?operations?*/??
- ????int?nwait;??
- ????/*?List?containing?poll?wait?queues?*/??
- ????struct?list_head?pwqlist;??
- ????/*?The?"container"?of?this?item?*/??
- ????struct?eventpoll?*ep;??
- ????/*?List?header?used?to?link?this?item?to?the?"struct?file"?items?list?*/??
- ????struct?list_head?fllink;??
- ????/*?The?structure?that?describe?the?interested?events?and?the?source?fd?*/??
- ????struct?epoll_event?event;??
- };??
?
epoll_create的調用很簡單,就是創建一個eventpoll的文件,并返回文件描述符。
epoll_ctl用來添加,刪除以及修改監聽項。
- /*?
- ?*?The?following?function?implements?the?controller?interface?for?
- ?*?the?eventpoll?file?that?enables?the?insertion/removal/change?of?
- ?*?file?descriptors?inside?the?interest?set.?
- ?*/??
- SYSCALL_DEFINE4(epoll_ctl,?int,?epfd,?int,?op,?int,?fd,??
- ????????struct?epoll_event?__user?*,?event)??
- {??
- ????int?error;??
- ????struct?file?*file,?*tfile;??
- ????struct?eventpoll?*ep;??
- ????struct?epitem?*epi;??
- ????struct?epoll_event?epds;??
- ????DNPRINTK(3,?(KERN_INFO?"[%p]?eventpoll:?sys_epoll_ctl(%d,?%d,?%d,?%p)/n",??
- ?????????????current,?epfd,?op,?fd,?event));??
- ????error?=?-EFAULT;??
- ????if?(ep_op_has_event(op)?&&??
- ????????copy_from_user(&epds,?event,?sizeof(struct?epoll_event)))??
- ????????goto?error_return;??
- ????/*?Get?the?"struct?file?*"?for?the?eventpoll?file?*/??
- ????error?=?-EBADF;??
- ????file?=?fget(epfd);??
- ????if?(!file)??
- ????????goto?error_return;??
- ????/*?Get?the?"struct?file?*"?for?the?target?file?*/??
- ????tfile?=?fget(fd);??
- ????if?(!tfile)??
- ????????goto?error_fput;??
- ????/*?The?target?file?descriptor?must?support?poll?*/??
- ????error?=?-EPERM;??
- ????if?(!tfile->f_op?||?!tfile->f_op->poll)??
- ????????goto?error_tgt_fput;??
- ????/*?
- ?????*?We?have?to?check?that?the?file?structure?underneath?the?file?descriptor?
- ?????*?the?user?passed?to?us?_is_?an?eventpoll?file.?And?also?we?do?not?permit?
- ?????*?adding?an?epoll?file?descriptor?inside?itself.?
- ?????*/??
- ????error?=?-EINVAL;??
- ????if?(file?==?tfile?||?!is_file_epoll(file))??
- ????????goto?error_tgt_fput;??
- ????/*?
- ?????*?At?this?point?it?is?safe?to?assume?that?the?"private_data"?contains?
- ?????*?our?own?data?structure.?
- ?????*/??
- ????ep?=?file->private_data;??
- ????mutex_lock(&ep->mtx);??
- ????/*?
- ?????*?Try?to?lookup?the?file?inside?our?RB?tree,?Since?we?grabbed?"mtx"?
- ?????*?above,?we?can?be?sure?to?be?able?to?use?the?item?looked?up?by?
- ?????*?ep_find()?till?we?release?the?mutex.?
- ?????*/??
- ????epi?=?ep_find(ep,?tfile,?fd);??
- ????error?=?-EINVAL;??
- ????switch?(op)?{??
- ????case?EPOLL_CTL_ADD:??
- ????????if?(!epi)?{??
- ????????????epds.events?|=?POLLERR?|?POLLHUP;??
- ????????????error?=?ep_insert(ep,?&epds,?tfile,?fd);??
- ????????}?else??
- ????????????error?=?-EEXIST;??
- ????????break;??
- ????case?EPOLL_CTL_DEL:??
- ????????if?(epi)??
- ????????????error?=?ep_remove(ep,?epi);??
- ????????else??
- ????????????error?=?-ENOENT;??
- ????????break;??
- ????case?EPOLL_CTL_MOD:??
- ????????if?(epi)?{??
- ????????????epds.events?|=?POLLERR?|?POLLHUP;??
- ????????????error?=?ep_modify(ep,?epi,?&epds);??
- ????????}?else??
- ????????????error?=?-ENOENT;??
- ????????break;??
- ????}??
- ????mutex_unlock(&ep->mtx);??
- error_tgt_fput:??
- ????fput(tfile);??
- error_fput:??
- ????fput(file);??
- error_return:??
- ????DNPRINTK(3,?(KERN_INFO?"[%p]?eventpoll:?sys_epoll_ctl(%d,?%d,?%d,?%p)?=?%d/n",??
- ?????????????current,?epfd,?op,?fd,?event,?error));??
- ????return?error;??
- }??
同樣,代碼很清楚。先來看看添加流程
- /*?
- ?*?Must?be?called?with?"mtx"?held.?
- ?*/??
- static?int?ep_insert(struct?eventpoll?*ep,?struct?epoll_event?*event,??
- ?????????????struct?file?*tfile,?int?fd)??
- {??
- ????int?error,?revents,?pwake?=?0;??
- ????unsigned?long?flags;??
- ????struct?epitem?*epi;??
- ????struct?ep_pqueue?epq;??
- ????????/*?不允許超過最大監聽個數*/??
- ????if?(unlikely(atomic_read(&ep->user->epoll_watches)?>=??
- ?????????????max_user_watches))??
- ????????return?-ENOSPC;??
- ????if?(!(epi?=?kmem_cache_alloc(epi_cache,?GFP_KERNEL)))??
- ????????return?-ENOMEM;??
- ????/*?Item?initialization?follow?here?...?*/??
- ????INIT_LIST_HEAD(&epi->rdllink);??
- ????INIT_LIST_HEAD(&epi->fllink);??
- ????INIT_LIST_HEAD(&epi->pwqlist);??
- ????epi->ep?=?ep;??
- ????ep_set_ffd(&epi->ffd,?tfile,?fd);??
- ????epi->event?=?*event;??
- ????epi->nwait?=?0;??
- ????epi->next?=?EP_UNACTIVE_PTR;??
- ????/*?Initialize?the?poll?table?using?the?queue?callback?*/??
- ????epq.epi?=?epi;??
- ????init_poll_funcptr(&epq.pt,?ep_ptable_queue_proc);??
- ????/*?
- ?????*?Attach?the?item?to?the?poll?hooks?and?get?current?event?bits.?
- ?????*?We?can?safely?use?the?file*?here?because?its?usage?count?has?
- ?????*?been?increased?by?the?caller?of?this?function.?Note?that?after?
- ?????*?this?operation?completes,?the?poll?callback?can?start?hitting?
- ?????*?the?new?item.?
- ?????*/??
- ????revents?=?tfile->f_op->poll(tfile,?&epq.pt);??
- ????/*?
- ?????*?We?have?to?check?if?something?went?wrong?during?the?poll?wait?queue?
- ?????*?install?process.?Namely?an?allocation?for?a?wait?queue?failed?due?
- ?????*?high?memory?pressure.?
- ?????*/??
- ????error?=?-ENOMEM;??
- ????if?(epi->nwait?<?0)??
- ????????goto?error_unregister;??
- ????/*?Add?the?current?item?to?the?list?of?active?epoll?hook?for?this?file?*/??
- ????spin_lock(&tfile->f_ep_lock);??
- ????list_add_tail(&epi->fllink,?&tfile->f_ep_links);??
- ????spin_unlock(&tfile->f_ep_lock);??
- ????/*?
- ?????*?Add?the?current?item?to?the?RB?tree.?All?RB?tree?operations?are?
- ?????*?protected?by?"mtx",?and?ep_insert()?is?called?with?"mtx"?held.?
- ?????*/??
- ????ep_rbtree_insert(ep,?epi);??
- ????/*?We?have?to?drop?the?new?item?inside?our?item?list?to?keep?track?of?it?*/??
- ????spin_lock_irqsave(&ep->lock,?flags);??
- ????/*?If?the?file?is?already?"ready"?we?drop?it?inside?the?ready?list?*/??
- ????if?((revents?&?event->events)?&&?!ep_is_linked(&epi->rdllink))?{??
- ????????list_add_tail(&epi->rdllink,?&ep->rdllist);??
- ????????/*?Notify?waiting?tasks?that?events?are?available?*/??
- ????????if?(waitqueue_active(&ep->wq))??
- ????????????wake_up_locked(&ep->wq);??
- ????????if?(waitqueue_active(&ep->poll_wait))??
- ????????????pwake++;??
- ????}??
- ????spin_unlock_irqrestore(&ep->lock,?flags);??
- ????atomic_inc(&ep->user->epoll_watches);??
- ????/*?We?have?to?call?this?outside?the?lock?*/??
- ????if?(pwake)??
- ????????ep_poll_safewake(&psw,?&ep->poll_wait);??
- ????DNPRINTK(3,?(KERN_INFO?"[%p]?eventpoll:?ep_insert(%p,?%p,?%d)/n",??
- ?????????????current,?ep,?tfile,?fd));??
- ????return?0;??
- error_unregister:??
- ????ep_unregister_pollwait(ep,?epi);??
- ????/*?
- ?????*?We?need?to?do?this?because?an?event?could?have?been?arrived?on?some?
- ?????*?allocated?wait?queue.?Note?that?we?don't?care?about?the?ep->ovflist?
- ?????*?list,?since?that?is?used/cleaned?only?inside?a?section?bound?by?"mtx".?
- ?????*?And?ep_insert()?is?called?with?"mtx"?held.?
- ?????*/??
- ????spin_lock_irqsave(&ep->lock,?flags);??
- ????if?(ep_is_linked(&epi->rdllink))??
- ????????list_del_init(&epi->rdllink);??
- ????spin_unlock_irqrestore(&ep->lock,?flags);??
- ????kmem_cache_free(epi_cache,?epi);??
- ????return?error;??
- }??
init_poll_funcptr函數注冊poll table回調函數。然后程序的下一步是調用tfile的poll函數,并且poll函數的第2個參數為poll table,
這是epoll機制中唯一對監聽套接字調用poll時第2個參數不為NULL的時機。ep_ptable_queue_proc函數的作用是注冊等待函數
并添加到指定的等待隊列,所以在第一次調用后,該信息已經存在了,無需在poll函數中再次調用了。
- /*?
- ?*?This?is?the?callback?that?is?used?to?add?our?wait?queue?to?the?
- ?*?target?file?wakeup?lists.?
- ?*/??
- static?void?ep_ptable_queue_proc(struct?file?*file,?wait_queue_head_t?*whead,??
- ?????????????????poll_table?*pt)??
- {??
- ????struct?epitem?*epi?=?ep_item_from_epqueue(pt);??
- ????struct?eppoll_entry?*pwq;??
- ????if?(epi->nwait?>=?0?&&?(pwq?=?kmem_cache_alloc(pwq_cache,?GFP_KERNEL)))?{??
- ????????????????/*?為監聽套接字注冊一個等待回調函數,在喚醒時調用*/??
- ????????init_waitqueue_func_entry(&pwq->wait,?ep_poll_callback);??
- ????????pwq->whead?=?whead;??
- ????????pwq->base?=?epi;??
- ????????add_wait_queue(whead,?&pwq->wait);??
- ????????list_add_tail(&pwq->llink,?&epi->pwqlist);??
- ????????epi->nwait++;??
- ????}?else?{??
- ????????/*?We?have?to?signal?that?an?error?occurred?*/??
- ????????epi->nwait?=?-1;??
- ????}??
- }??
?
那么該poll函數到底是怎樣的呢,這就要看我們在傳入到epoll_ctl前創建的套接字的類型(socket調用)。對于創建的tcp套接字
來說,可以按照創建流程找到其對應的函數是tcp_poll。
tcp_poll的主要功能為:
- 如果poll table回調函數存在(ep_ptable_queue_proc),則調用它來等待。注意這只限第一次調用,在后面的poll中都無需此步
- 判斷事件的到達。(根據tcp的相關成員)
tcp_poll注冊到的等待隊列是sock成員的sk_sleep,等待隊列在對應的IO事件中被喚醒。當等待隊列被喚醒時會調用相應的等待回調函數,
前面看到我們注冊的是函數ep_poll_callback。該函數可能在中斷上下文中調用。
- /*?
- ?*?This?is?the?callback?that?is?passed?to?the?wait?queue?wakeup?
- ?*?machanism.?It?is?called?by?the?stored?file?descriptors?when?they?
- ?*?have?events?to?report.?
- ?*/??
- static?int?ep_poll_callback(wait_queue_t?*wait,?unsigned?mode,?int?sync,?void?*key)??
- {??
- ????int?pwake?=?0;??
- ????unsigned?long?flags;??
- ????struct?epitem?*epi?=?ep_item_from_wait(wait);??
- ????struct?eventpoll?*ep?=?epi->ep;??
- ????DNPRINTK(3,?(KERN_INFO?"[%p]?eventpoll:?poll_callback(%p)?epi=%p?ep=%p/n",??
- ?????????????current,?epi->ffd.file,?epi,?ep));??
- ????????/*?對eventpoll的spinlock加鎖,因為是在中斷上下文中*/??
- ????spin_lock_irqsave(&ep->lock,?flags);??
- ????/*?沒有事件到來?
- ?????*?If?the?event?mask?does?not?contain?any?poll(2)?event,?we?consider?the?
- ?????*?descriptor?to?be?disabled.?This?condition?is?likely?the?effect?of?the?
- ?????*?EPOLLONESHOT?bit?that?disables?the?descriptor?when?an?event?is?received,?
- ?????*?until?the?next?EPOLL_CTL_MOD?will?be?issued.?
- ?????*/??
- ????if?(!(epi->event.events?&?~EP_PRIVATE_BITS))??
- ????????goto?out_unlock;??
- ????/*?
- ?????*?If?we?are?trasfering?events?to?userspace,?we?can?hold?no?locks?
- ?????*?(because?we're?accessing?user?memory,?and?because?of?linux?f_op->poll()?
- ?????*?semantics).?All?the?events?that?happens?during?that?period?of?time?are?
- ?????*?chained?in?ep->ovflist?and?requeued?later?on.?
- ?????*/??
- ????if?(unlikely(ep->ovflist?!=?EP_UNACTIVE_PTR))?{??
- ????????if?(epi->next?==?EP_UNACTIVE_PTR)?{??
- ????????????epi->next?=?ep->ovflist;??
- ????????????ep->ovflist?=?epi;??
- ????????}??
- ????????goto?out_unlock;??
- ????}??
- ????/*?If?this?file?is?already?in?the?ready?list?we?exit?soon?*/??
- ????if?(ep_is_linked(&epi->rdllink))??
- ????????goto?is_linked;??
- ????????/*?加入ready?queue*/??
- ????list_add_tail(&epi->rdllink,?&ep->rdllist);??
- is_linked:??
- ????/*?
- ?????*?Wake?up?(?if?active?)?both?the?eventpoll?wait?list?and?the?->poll()?
- ?????*?wait?list.?
- ?????*/??
- ????if?(waitqueue_active(&ep->wq))??
- ????????wake_up_locked(&ep->wq);??
- ????if?(waitqueue_active(&ep->poll_wait))??
- ????????pwake++;??
- out_unlock:??
- ????spin_unlock_irqrestore(&ep->lock,?flags);??
- ????/*?We?have?to?call?this?outside?the?lock?*/??
- ????if?(pwake)??
- ????????ep_poll_safewake(&psw,?&ep->poll_wait);??
- ????return?1;??
- }??
?
注意這里有2種隊列,一種是在epoll_wait調用中使用的eventpoll的等待隊列,用于判斷是否有監聽套接字可用,一種是對應于每個套接字
的等待隊列sk_sleep,用于判斷每個監聽套接字上的事件,該隊列喚醒后調用ep_poll_callback,在該函數中又調用wakeup函數來喚醒前一種
隊列,來通知epoll_wait調用進程。
- static?int?ep_poll(struct?eventpoll?*ep,?struct?epoll_event?__user?*events,??
- ???????????int?maxevents,?long?timeout)??
- {??
- ????int?res,?eavail;??
- ????unsigned?long?flags;??
- ????long?jtimeout;??
- ????wait_queue_t?wait;??
- ????/*?
- ?????*?Calculate?the?timeout?by?checking?for?the?"infinite"?value?(?-1?)?
- ?????*?and?the?overflow?condition.?The?passed?timeout?is?in?milliseconds,?
- ?????*?that?why?(t?*?HZ)?/?1000.?
- ?????*/??
- ????jtimeout?=?(timeout?<?0?||?timeout?>=?EP_MAX_MSTIMEO)????
- ????????MAX_SCHEDULE_TIMEOUT?:?(timeout?*?HZ?+?999)?/?1000;??
- retry:??
- ????spin_lock_irqsave(&ep->lock,?flags);??
- ????res?=?0;??
- ????if?(list_empty(&ep->rdllist))?{??
- ????????/*?
- ?????????*?We?don't?have?any?available?event?to?return?to?the?caller.?
- ?????????*?We?need?to?sleep?here,?and?we?will?be?wake?up?by?
- ?????????*?ep_poll_callback()?when?events?will?become?available.?
- ?????????*/??
- ????????init_waitqueue_entry(&wait,?current);??
- ????????wait.flags?|=?WQ_FLAG_EXCLUSIVE;??
- ????????__add_wait_queue(&ep->wq,?&wait);??
- ????????for?(;;)?{??
- ????????????/*?
- ?????????????*?We?don't?want?to?sleep?if?the?ep_poll_callback()?sends?us?
- ?????????????*?a?wakeup?in?between.?That's?why?we?set?the?task?state?
- ?????????????*?to?TASK_INTERRUPTIBLE?before?doing?the?checks.?
- ?????????????*/??
- ????????????set_current_state(TASK_INTERRUPTIBLE);??
- ????????????if?(!list_empty(&ep->rdllist)?||?!jtimeout)??
- ????????????????break;??
- ????????????if?(signal_pending(current))?{??
- ????????????????res?=?-EINTR;??
- ????????????????break;??
- ????????????}??
- ????????????spin_unlock_irqrestore(&ep->lock,?flags);??
- ????????????jtimeout?=?schedule_timeout(jtimeout);??
- ????????????spin_lock_irqsave(&ep->lock,?flags);??
- ????????}??
- ????????__remove_wait_queue(&ep->wq,?&wait);??
- ????????set_current_state(TASK_RUNNING);??
- ????}??
- ????/*?Is?it?worth?to?try?to?dig?for?events???*/??
- ????eavail?=?!list_empty(&ep->rdllist);??
- ????spin_unlock_irqrestore(&ep->lock,?flags);??
- ????/*?
- ?????*?Try?to?transfer?events?to?user?space.?In?case?we?get?0?events?and?
- ?????*?there's?still?timeout?left?over,?we?go?trying?again?in?search?of?
- ?????*?more?luck.?
- ?????*/??
- ????if?(!res?&&?eavail?&&??
- ????????!(res?=?ep_send_events(ep,?events,?maxevents))?&&?jtimeout)??
- ????????goto?retry;??
- ????return?res;??
- }??
該函數是在epoll_wait中調用的等待函數,其等待被ep_poll_callback喚醒,然后調用ep_send_events來把到達事件copy到用戶空間,然后
epoll_wait才返回。
?
最后我們來看看ep_poll_callback函數和ep_send_events函數的同步,因為他們都要操作ready queue。
eventpoll中巧妙地設置了2種類型的鎖,一個是mtx,是個mutex類型,是對該描述符操作的基本同步鎖,可以睡眠,因此不能用于中斷上下文;所以又存在了另外一個
鎖,lock,它是一個spinlock類型,不允許睡眠,所以用在ep_poll_callback中,注意mtx不能用于此。
注意由于ep_poll_callback函數中會涉及到對eventpoll的ovflist和rdllist成員的訪問,所以在任意其它地方要訪問時都要先加mtx,再加lock鎖。
?
由于中斷的到來是異步的,為了方便,先看ep_send_events函數。
- static?int?ep_send_events(struct?eventpoll?*ep,?struct?epoll_event?__user?*events,??
- ??????????????int?maxevents)??
- {??
- ????int?eventcnt,?error?=?-EFAULT,?pwake?=?0;??
- ????unsigned?int?revents;??
- ????unsigned?long?flags;??
- ????struct?epitem?*epi,?*nepi;??
- ????struct?list_head?txlist;??
- ????INIT_LIST_HEAD(&txlist);??
- ????/*?
- ?????*?We?need?to?lock?this?because?we?could?be?hit?by?
- ?????*?eventpoll_release_file()?and?epoll_ctl(EPOLL_CTL_DEL).?
- ?????*/??
- ????mutex_lock(&ep->mtx);??
- ????/*?
- ?????*?Steal?the?ready?list,?and?re-init?the?original?one?to?the?
- ?????*?empty?list.?Also,?set?ep->ovflist?to?NULL?so?that?events?
- ?????*?happening?while?looping?w/out?locks,?are?not?lost.?We?cannot?
- ?????*?have?the?poll?callback?to?queue?directly?on?ep->rdllist,?
- ?????*?because?we?are?doing?it?in?the?loop?below,?in?a?lockless?way.?
- ?????*/??
- ????spin_lock_irqsave(&ep->lock,?flags);??
- ????list_splice(&ep->rdllist,?&txlist);??
- ????INIT_LIST_HEAD(&ep->rdllist);??
- ????ep->ovflist?=?NULL;??
- ????spin_unlock_irqrestore(&ep->lock,?flags);??
- ????/*?
- ?????*?We?can?loop?without?lock?because?this?is?a?task?private?list.?
- ?????*?We?just?splice'd?out?the?ep->rdllist?in?ep_collect_ready_items().?
- ?????*?Items?cannot?vanish?during?the?loop?because?we?are?holding?"mtx".?
- ?????*/??
- ????for?(eventcnt?=?0;?!list_empty(&txlist)?&&?eventcnt?<?maxevents;)?{??
- ????????epi?=?list_first_entry(&txlist,?struct?epitem,?rdllink);??
- ????????list_del_init(&epi->rdllink);??
- ????????/*?
- ?????????*?Get?the?ready?file?event?set.?We?can?safely?use?the?file?
- ?????????*?because?we?are?holding?the?"mtx"?and?this?will?guarantee?
- ?????????*?that?both?the?file?and?the?item?will?not?vanish.?
- ?????????*/??
- ????????revents?=?epi->ffd.file->f_op->poll(epi->ffd.file,?NULL);??
- ????????revents?&=?epi->event.events;??
- ????????/*?
- ?????????*?Is?the?event?mask?intersect?the?caller-requested?one,?
- ?????????*?deliver?the?event?to?userspace.?Again,?we?are?holding?
- ?????????*?"mtx",?so?no?operations?coming?from?userspace?can?change?
- ?????????*?the?item.?
- ?????????*/??
- ????????if?(revents)?{??
- ????????????if?(__put_user(revents,??
- ???????????????????????&events[eventcnt].events)?||??
- ????????????????__put_user(epi->event.data,??
- ???????????????????????&events[eventcnt].data))??
- ????????????????goto?errxit;??
- ????????????if?(epi->event.events?&?EPOLLONESHOT)??
- ????????????????epi->event.events?&=?EP_PRIVATE_BITS;??
- ????????????eventcnt++;??
- ????????}??
- ????????/*?
- ?????????*?At?this?point,?noone?can?insert?into?ep->rdllist?besides?
- ?????????*?us.?The?epoll_ctl()?callers?are?locked?out?by?us?holding?
- ?????????*?"mtx"?and?the?poll?callback?will?queue?them?in?ep->ovflist.?
- ?????????*/??
- ????????if?(!(epi->event.events?&?EPOLLET)?&&??
- ????????????(revents?&?epi->event.events))??
- ????????????list_add_tail(&epi->rdllink,?&ep->rdllist);??
- ????}??
- ????error?=?0;??
- errxit:??
- ????spin_lock_irqsave(&ep->lock,?flags);??
- ????/*?
- ?????*?During?the?time?we?spent?in?the?loop?above,?some?other?events?
- ?????*?might?have?been?queued?by?the?poll?callback.?We?re-insert?them?
- ?????*?inside?the?main?ready-list?here.?
- ?????*/??
- ????for?(nepi?=?ep->ovflist;?(epi?=?nepi)?!=?NULL;??
- ?????????nepi?=?epi->next,?epi->next?=?EP_UNACTIVE_PTR)?{??
- ????????/*?
- ?????????*?If?the?above?loop?quit?with?errors,?the?epoll?item?might?still?
- ?????????*?be?linked?to?"txlist",?and?the?list_splice()?done?below?will?
- ?????????*?take?care?of?those?cases.?
- ?????????*/??
- ????????if?(!ep_is_linked(&epi->rdllink))??
- ????????????list_add_tail(&epi->rdllink,?&ep->rdllist);??
- ????}??
- ????/*?
- ?????*?We?need?to?set?back?ep->ovflist?to?EP_UNACTIVE_PTR,?so?that?after?
- ?????*?releasing?the?lock,?events?will?be?queued?in?the?normal?way?inside?
- ?????*?ep->rdllist.?
- ?????*/??
- ????ep->ovflist?=?EP_UNACTIVE_PTR;??
- ????/*?
- ?????*?In?case?of?error?in?the?event-send?loop,?or?in?case?the?number?of?
- ?????*?ready?events?exceeds?the?userspace?limit,?we?need?to?splice?the?
- ?????*?"txlist"?back?inside?ep->rdllist.?
- ?????*/??
- ????list_splice(&txlist,?&ep->rdllist);??
- ????if?(!list_empty(&ep->rdllist))?{??
- ????????/*?
- ?????????*?Wake?up?(if?active)?both?the?eventpoll?wait?list?and?the?->poll()?
- ?????????*?wait?list?(delayed?after?we?release?the?lock).?
- ?????????*/??
- ????????if?(waitqueue_active(&ep->wq))??
- ????????????wake_up_locked(&ep->wq);??
- ????????if?(waitqueue_active(&ep->poll_wait))??
- ????????????pwake++;??
- ????}??
- ????spin_unlock_irqrestore(&ep->lock,?flags);??
- ????mutex_unlock(&ep->mtx);??
- ????/*?We?have?to?call?this?outside?the?lock?*/??
- ????if?(pwake)??
- ????????ep_poll_safewake(&psw,?&ep->poll_wait);??
- ????return?eventcnt?==?0???error:?eventcnt;??
- }??
該函數的注釋也很清晰,不過我們從總體上分析下。
?
首先函數加mtx鎖,這是必須的。
然后的工作是要讀取ready queue,但是中斷會寫這個成員,所以要加spinlock;但是接下來的工作會sleep,所以在整個loop都加spinlock顯然
會阻塞ep_poll_callback函數,從而阻塞中斷,這是個很不好的行為,也不可取。于是epoll中在eventpoll中設置了另一個成員ovflist。在讀取ready
queue前,我們設置該成員為NULL,然后就可以釋放spinlock了。為什么這樣可行呢,因為對應的,在ep_poll_callback中,獲取spinlock后,對于
到達的事件并不總是放入ready queue,而是先判斷ovflist是否為EP_UNACTIVE_PTR。
- if?(unlikely(ep->ovflist?!=?EP_UNACTIVE_PTR))?{??
- /*?進入此處說明有用戶進程正在執行ep_send_events(ovflist已被置為NULL),所以把事件加入ovflist中,而不是ready?queue中*/??
- ????????if?(epi->next?==?EP_UNACTIVE_PTR)?{/*?如果此處條件不成立,說明該epi已經在ovflist中,所以直接返回*/??
- ????????????epi->next?=?ep->ovflist;??
- ????????????ep->ovflist?=?epi;??
- ????????}??
- ????????goto?out_unlock;??
- ????}??
?
所以在此期間,到達的事件放入了ovflist中。當loop結束后,函數接著遍歷該list,添加到ready queue中,最后設置ovflist為EP_UNACTIVE_PTR,
這樣下次中斷中的事件可以放入ready queue了。最后判斷是否有其他epoll_wait調用被阻塞,如果有則喚醒。
?
?
?
從源代碼中,可以看出epoll的幾大優點:
- 用戶傳入的信息保存在內核中了,無需每次傳入
- 事件監聽機制不再是掃描整個監聽隊列,而是每個監聽套接字在有事件到達時通過等待回調函數異步通知epoll,然后再返回給用戶。
同時epoll中的同步機制也是一個內核編程的設計經典,值得深入理解。
?