You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

ev_epoll.c 10KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298
  1. /*
  2. * libev epoll fd activity backend
  3. *
  4. * Copyright (c) 2007,2008,2009,2010,2011,2016,2017,2019 Marc Alexander Lehmann <libev@schmorp.de>
  5. * All rights reserved.
  6. *
  7. * Redistribution and use in source and binary forms, with or without modifica-
  8. * tion, are permitted provided that the following conditions are met:
  9. *
  10. * 1. Redistributions of source code must retain the above copyright notice,
  11. * this list of conditions and the following disclaimer.
  12. *
  13. * 2. Redistributions in binary form must reproduce the above copyright
  14. * notice, this list of conditions and the following disclaimer in the
  15. * documentation and/or other materials provided with the distribution.
  16. *
  17. * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
  18. * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
  19. * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  20. * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
  21. * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  22. * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  23. * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  24. * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
  25. * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
  26. * OF THE POSSIBILITY OF SUCH DAMAGE.
  27. *
  28. * Alternatively, the contents of this file may be used under the terms of
  29. * the GNU General Public License ("GPL") version 2 or any later version,
  30. * in which case the provisions of the GPL are applicable instead of
  31. * the above. If you wish to allow the use of your version of this file
  32. * only under the terms of the GPL and not to allow others to use your
  33. * version of this file under the BSD license, indicate your decision
  34. * by deleting the provisions above and replace them with the notice
  35. * and other provisions required by the GPL. If you do not delete the
  36. * provisions above, a recipient may use your version of this file under
  37. * either the BSD or the GPL.
  38. */
  39. /*
  40. * general notes about epoll:
  41. *
  42. * a) epoll silently removes fds from the fd set. as nothing tells us
  43. * that an fd has been removed otherwise, we have to continually
  44. * "rearm" fds that we suspect *might* have changed (same
  45. * problem with kqueue, but much less costly there).
  46. * b) the fact that ADD != MOD creates a lot of extra syscalls due to a)
  47. * and seems not to have any advantage.
  48. * c) the inability to handle fork or file descriptors (think dup)
  49. * limits the applicability over poll, so this is not a generic
  50. * poll replacement.
  51. * d) epoll doesn't work the same as select with many file descriptors
  52. * (such as files). while not critical, no other advanced interface
  53. * seems to share this (rather non-unixy) limitation.
  54. * e) epoll claims to be embeddable, but in practice you never get
  55. * a ready event for the epoll fd (broken: <=2.6.26, working: >=2.6.32).
  56. * f) epoll_ctl returning EPERM means the fd is always ready.
  57. *
  58. * lots of "weird code" and complication handling in this file is due
  59. * to these design problems with epoll, as we try very hard to avoid
  60. * epoll_ctl syscalls for common usage patterns and handle the breakage
  61. * ensuing from receiving events for closed and otherwise long gone
  62. * file descriptors.
  63. */
  64. #include <sys/epoll.h>
  65. #define EV_EMASK_EPERM 0x80
  66. static void
  67. epoll_modify (EV_P_ int fd, int oev, int nev)
  68. {
  69. struct epoll_event ev;
  70. unsigned char oldmask;
  71. /*
  72. * we handle EPOLL_CTL_DEL by ignoring it here
  73. * on the assumption that the fd is gone anyways
  74. * if that is wrong, we have to handle the spurious
  75. * event in epoll_poll.
  76. * if the fd is added again, we try to ADD it, and, if that
  77. * fails, we assume it still has the same eventmask.
  78. */
  79. if (!nev)
  80. return;
  81. oldmask = anfds [fd].emask;
  82. anfds [fd].emask = nev;
  83. /* store the generation counter in the upper 32 bits, the fd in the lower 32 bits */
  84. ev.data.u64 = (uint64_t)(uint32_t)fd
  85. | ((uint64_t)(uint32_t)++anfds [fd].egen << 32);
  86. ev.events = (nev & EV_READ ? EPOLLIN : 0)
  87. | (nev & EV_WRITE ? EPOLLOUT : 0);
  88. if (ecb_expect_true (!epoll_ctl (backend_fd, oev && oldmask != nev ? EPOLL_CTL_MOD : EPOLL_CTL_ADD, fd, &ev)))
  89. return;
  90. if (ecb_expect_true (errno == ENOENT))
  91. {
  92. /* if ENOENT then the fd went away, so try to do the right thing */
  93. if (!nev)
  94. goto dec_egen;
  95. if (!epoll_ctl (backend_fd, EPOLL_CTL_ADD, fd, &ev))
  96. return;
  97. }
  98. else if (ecb_expect_true (errno == EEXIST))
  99. {
  100. /* EEXIST means we ignored a previous DEL, but the fd is still active */
  101. /* if the kernel mask is the same as the new mask, we assume it hasn't changed */
  102. if (oldmask == nev)
  103. goto dec_egen;
  104. if (!epoll_ctl (backend_fd, EPOLL_CTL_MOD, fd, &ev))
  105. return;
  106. }
  107. else if (ecb_expect_true (errno == EPERM))
  108. {
  109. /* EPERM means the fd is always ready, but epoll is too snobbish */
  110. /* to handle it, unlike select or poll. */
  111. anfds [fd].emask = EV_EMASK_EPERM;
  112. /* add fd to epoll_eperms, if not already inside */
  113. if (!(oldmask & EV_EMASK_EPERM))
  114. {
  115. array_needsize (int, epoll_eperms, epoll_epermmax, epoll_epermcnt + 1, array_needsize_noinit);
  116. epoll_eperms [epoll_epermcnt++] = fd;
  117. }
  118. return;
  119. }
  120. else
  121. assert (("libev: I/O watcher with invalid fd found in epoll_ctl", errno != EBADF && errno != ELOOP && errno != EINVAL));
  122. fd_kill (EV_A_ fd);
  123. dec_egen:
  124. /* we didn't successfully call epoll_ctl, so decrement the generation counter again */
  125. --anfds [fd].egen;
  126. }
  127. static void
  128. epoll_poll (EV_P_ ev_tstamp timeout)
  129. {
  130. int i;
  131. int eventcnt;
  132. if (ecb_expect_false (epoll_epermcnt))
  133. timeout = EV_TS_CONST (0.);
  134. /* epoll wait times cannot be larger than (LONG_MAX - 999UL) / HZ msecs, which is below */
  135. /* the default libev max wait time, however. */
  136. EV_RELEASE_CB;
  137. eventcnt = epoll_wait (backend_fd, epoll_events, epoll_eventmax, EV_TS_TO_MSEC (timeout));
  138. EV_ACQUIRE_CB;
  139. if (ecb_expect_false (eventcnt < 0))
  140. {
  141. if (errno != EINTR)
  142. ev_syserr ("(libev) epoll_wait");
  143. return;
  144. }
  145. for (i = 0; i < eventcnt; ++i)
  146. {
  147. struct epoll_event *ev = epoll_events + i;
  148. int fd = (uint32_t)ev->data.u64; /* mask out the lower 32 bits */
  149. int want = anfds [fd].events;
  150. int got = (ev->events & (EPOLLOUT | EPOLLERR | EPOLLHUP) ? EV_WRITE : 0)
  151. | (ev->events & (EPOLLIN | EPOLLERR | EPOLLHUP) ? EV_READ : 0);
  152. /*
  153. * check for spurious notification.
  154. * this only finds spurious notifications on egen updates
  155. * other spurious notifications will be found by epoll_ctl, below
  156. * we assume that fd is always in range, as we never shrink the anfds array
  157. */
  158. if (ecb_expect_false ((uint32_t)anfds [fd].egen != (uint32_t)(ev->data.u64 >> 32)))
  159. {
  160. /* recreate kernel state */
  161. postfork |= 2;
  162. continue;
  163. }
  164. if (ecb_expect_false (got & ~want))
  165. {
  166. anfds [fd].emask = want;
  167. /*
  168. * we received an event but are not interested in it, try mod or del
  169. * this often happens because we optimistically do not unregister fds
  170. * when we are no longer interested in them, but also when we get spurious
  171. * notifications for fds from another process. this is partially handled
  172. * above with the gencounter check (== our fd is not the event fd), and
  173. * partially here, when epoll_ctl returns an error (== a child has the fd
  174. * but we closed it).
  175. * note: for events such as POLLHUP, where we can't know whether it refers
  176. * to EV_READ or EV_WRITE, we might issue redundant EPOLL_CTL_MOD calls.
  177. */
  178. ev->events = (want & EV_READ ? EPOLLIN : 0)
  179. | (want & EV_WRITE ? EPOLLOUT : 0);
  180. /* pre-2.6.9 kernels require a non-null pointer with EPOLL_CTL_DEL, */
  181. /* which is fortunately easy to do for us. */
  182. if (epoll_ctl (backend_fd, want ? EPOLL_CTL_MOD : EPOLL_CTL_DEL, fd, ev))
  183. {
  184. postfork |= 2; /* an error occurred, recreate kernel state */
  185. continue;
  186. }
  187. }
  188. fd_event (EV_A_ fd, got);
  189. }
  190. /* if the receive array was full, increase its size */
  191. if (ecb_expect_false (eventcnt == epoll_eventmax))
  192. {
  193. ev_free (epoll_events);
  194. epoll_eventmax = array_nextsize (sizeof (struct epoll_event), epoll_eventmax, epoll_eventmax + 1);
  195. epoll_events = (struct epoll_event *)ev_malloc (sizeof (struct epoll_event) * epoll_eventmax);
  196. }
  197. /* now synthesize events for all fds where epoll fails, while select works... */
  198. for (i = epoll_epermcnt; i--; )
  199. {
  200. int fd = epoll_eperms [i];
  201. unsigned char events = anfds [fd].events & (EV_READ | EV_WRITE);
  202. if (anfds [fd].emask & EV_EMASK_EPERM && events)
  203. fd_event (EV_A_ fd, events);
  204. else
  205. {
  206. epoll_eperms [i] = epoll_eperms [--epoll_epermcnt];
  207. anfds [fd].emask = 0;
  208. }
  209. }
  210. }
  /*
   * Create an epoll file descriptor with the close-on-exec flag requested.
   *
   * Where EPOLL_CLOEXEC is available (and not on Android), epoll_create1 is
   * tried first. Only if it fails with EINVAL or ENOSYS does the function
   * fall back to epoll_create followed by fcntl (F_SETFD, FD_CLOEXEC).
   * The fcntl result is not checked.
   *
   * Returns the new fd, or a negative value (with errno set by the failing
   * call) on error.
   */
  static int
  epoll_epoll_create (void)
  {
  #if defined EPOLL_CLOEXEC && !defined __ANDROID__
    int fd = epoll_create1 (EPOLL_CLOEXEC);

    /* done unless epoll_create1 itself is unsupported */
    if (fd >= 0 || (errno != EINVAL && errno != ENOSYS))
      return fd;
  #else
    int fd;
  #endif

    fd = epoll_create (256);

    if (fd < 0)
      return fd;

    fcntl (fd, F_SETFD, FD_CLOEXEC);
    return fd;
  }
  226. inline_size
  227. int
  228. epoll_init (EV_P_ int flags)
  229. {
  230. if ((backend_fd = epoll_epoll_create ()) < 0)
  231. return 0;
  232. backend_mintime = EV_TS_CONST (1e-3); /* epoll does sometimes return early, this is just to avoid the worst */
  233. backend_modify = epoll_modify;
  234. backend_poll = epoll_poll;
  235. epoll_eventmax = 64; /* initial number of events receivable per poll */
  236. epoll_events = (struct epoll_event *)ev_malloc (sizeof (struct epoll_event) * epoll_eventmax);
  237. return EVBACKEND_EPOLL;
  238. }
  239. inline_size
  240. void
  241. epoll_destroy (EV_P)
  242. {
  243. ev_free (epoll_events);
  244. array_free (epoll_eperm, EMPTY);
  245. }
  246. ecb_cold
  247. static void
  248. epoll_fork (EV_P)
  249. {
  250. close (backend_fd);
  251. while ((backend_fd = epoll_epoll_create ()) < 0)
  252. ev_syserr ("(libev) epoll_create");
  253. fd_rearm_all (EV_A);
  254. }