Changelog: http://cvs.schmorp.de/libev/Changes?pathrev=rel-4_33
@@ -9,8 +9,12 @@ CHECK_INCLUDE_FILES(sys/stat.h HAVE_SYS_STAT_H) | |||
CHECK_INCLUDE_FILES(sys/signalfd.h HAVE_SYS_SIGNALFD_H) | |||
CHECK_INCLUDE_FILES(port.h HAVE_PORT_H) | |||
CHECK_INCLUDE_FILES(poll.h HAVE_POLL_H) | |||
CHECK_INCLUDE_FILES(memory.h HAVE_MEMORY_H) | |||
CHECK_INCLUDE_FILES(sys/select.h HAVE_SYS_SELECT_H) | |||
CHECK_INCLUDE_FILES(sys/eventfd.h HAVE_SYS_EVENTFD_H) | |||
CHECK_INCLUDE_FILES(sys/timerfd.h HAVE_SYS_TIMERFD_H) | |||
CHECK_INCLUDE_FILES(linux/fs.h HAVE_LINUX_FS_H) | |||
CHECK_INCLUDE_FILES(linux/aio_abi.h HAVE_LINUX_AIO_ABI_H) | |||
IF(HAVE_SYS_INOTIFY_H) | |||
CHECK_SYMBOL_EXISTS(inotify_init "sys/types.h;sys/inotify.h" HAVE_INOTIFY_INIT) | |||
@@ -36,7 +40,9 @@ ENDIF() | |||
IF(HAVE_SYS_SIGNALFD_H) | |||
CHECK_SYMBOL_EXISTS(signalfd sys/signalfd.h HAVE_EVENTFD) | |||
ENDIF() | |||
IF(HAVE_LINUX_FS_H) | |||
CHECK_SYMBOL_EXISTS(RWF_SUPPORTED linux/fs.h HAVE_KERNEL_RWF_T) | |||
ENDIF() | |||
CHECK_SYMBOL_EXISTS(time.h nanosleep HAVE_NANOSLEEP) | |||
# check first without rt |
@@ -24,12 +24,21 @@ | |||
/* Define to 1 if you have the <inttypes.h> header file. */ | |||
#cmakedefine HAVE_INTTYPES_H 1 | |||
/* Define to 1 if linux/fs.h defined kernel_rwf_t */ | |||
#cmakedefine HAVE_KERNEL_RWF_T 1 | |||
/* Define to 1 if you have the `kqueue' function. */ | |||
#cmakedefine HAVE_KQUEUE 1 | |||
/* Define to 1 if you have the `rt' library (-lrt). */ | |||
#cmakedefine HAVE_LIBRT 1 | |||
/* Define to 1 if you have the <linux/aio_abi.h> header file. */ | |||
#cmakedefine HAVE_LINUX_AIO_ABI_H 1 | |||
/* Define to 1 if you have the <linux/fs.h> header file. */ | |||
#cmakedefine HAVE_LINUX_FS_H 1 | |||
/* Define to 1 if you have the <memory.h> header file. */ | |||
#cmakedefine HAVE_MEMORY_H 1 | |||
@@ -87,18 +96,14 @@ | |||
/* Define to 1 if you have the <sys/stat.h> header file. */ | |||
#cmakedefine HAVE_SYS_STAT_H 1 | |||
/* Define to 1 if you have the <sys/timerfd.h> header file. */ | |||
#cmakedefine HAVE_SYS_TIMERFD_H 1 | |||
/* Define to 1 if you have the <sys/types.h> header file. */ | |||
#cmakedefine HAVE_SYS_TYPES_H 1 | |||
/* Define to 1 if you have the <unistd.h> header file. */ | |||
#cmakedefine HAVE_UNISTD_H 1 | |||
/* Define to the version of this package. */ | |||
#define PACKAGE_VERSION 4.25 | |||
/* Define to 1 if you have the ANSI C header files. */ | |||
#define STDC_HEADERS 1 | |||
/* Version number of package */ | |||
#undef VERSION |
@@ -1,7 +1,7 @@ | |||
/* | |||
* libev native API header | |||
* | |||
* Copyright (c) 2007-2018 Marc Alexander Lehmann <libev@schmorp.de> | |||
* Copyright (c) 2007-2020 Marc Alexander Lehmann <libev@schmorp.de> | |||
* All rights reserved. | |||
* | |||
* Redistribution and use in source and binary forms, with or without modifica- | |||
@@ -151,7 +151,10 @@ EV_CPP(extern "C" {) | |||
/*****************************************************************************/ | |||
typedef double ev_tstamp; | |||
#ifndef EV_TSTAMP_T | |||
# define EV_TSTAMP_T double | |||
#endif | |||
typedef EV_TSTAMP_T ev_tstamp; | |||
#include <string.h> /* for memmove */ | |||
@@ -212,7 +215,7 @@ struct ev_loop; | |||
/*****************************************************************************/ | |||
#define EV_VERSION_MAJOR 4 | |||
#define EV_VERSION_MINOR 25 | |||
#define EV_VERSION_MINOR 33 | |||
/* eventmask, revents, events... */ | |||
enum { | |||
@@ -389,14 +392,12 @@ typedef struct ev_stat | |||
} ev_stat; | |||
#endif | |||
#if EV_IDLE_ENABLE | |||
/* invoked when the nothing else needs to be done, keeps the process from blocking */ | |||
/* revent EV_IDLE */ | |||
typedef struct ev_idle | |||
{ | |||
EV_WATCHER (ev_idle) | |||
} ev_idle; | |||
#endif | |||
/* invoked for each run of the mainloop, just before the blocking call */ | |||
/* you can still change events in any way you like */ | |||
@@ -413,23 +414,19 @@ typedef struct ev_check | |||
EV_WATCHER (ev_check) | |||
} ev_check; | |||
#if EV_FORK_ENABLE | |||
/* the callback gets invoked before check in the child process when a fork was detected */ | |||
/* revent EV_FORK */ | |||
typedef struct ev_fork | |||
{ | |||
EV_WATCHER (ev_fork) | |||
} ev_fork; | |||
#endif | |||
#if EV_CLEANUP_ENABLE | |||
/* is invoked just before the loop gets destroyed */ | |||
/* revent EV_CLEANUP */ | |||
typedef struct ev_cleanup | |||
{ | |||
EV_WATCHER (ev_cleanup) | |||
} ev_cleanup; | |||
#endif | |||
#if EV_EMBED_ENABLE | |||
/* used to embed an event loop inside another */ | |||
@@ -439,16 +436,18 @@ typedef struct ev_embed | |||
EV_WATCHER (ev_embed) | |||
struct ev_loop *other; /* ro */ | |||
#undef EV_IO_ENABLE | |||
#define EV_IO_ENABLE 1 | |||
ev_io io; /* private */ | |||
#undef EV_PREPARE_ENABLE | |||
#define EV_PREPARE_ENABLE 1 | |||
ev_prepare prepare; /* private */ | |||
ev_check check; /* unused */ | |||
ev_timer timer; /* unused */ | |||
ev_periodic periodic; /* unused */ | |||
ev_idle idle; /* unused */ | |||
ev_fork fork; /* private */ | |||
#if EV_CLEANUP_ENABLE | |||
ev_cleanup cleanup; /* unused */ | |||
#endif | |||
} ev_embed; | |||
#endif | |||
@@ -501,29 +500,32 @@ union ev_any_watcher | |||
/* flag bits for ev_default_loop and ev_loop_new */ | |||
enum { | |||
/* the default */ | |||
EVFLAG_AUTO = 0x00000000U, /* not quite a mask */ | |||
EVFLAG_AUTO = 0x00000000U, /* not quite a mask */ | |||
/* flag bits */ | |||
EVFLAG_NOENV = 0x01000000U, /* do NOT consult environment */ | |||
EVFLAG_FORKCHECK = 0x02000000U, /* check for a fork in each iteration */ | |||
EVFLAG_NOENV = 0x01000000U, /* do NOT consult environment */ | |||
EVFLAG_FORKCHECK = 0x02000000U, /* check for a fork in each iteration */ | |||
/* debugging/feature disable */ | |||
EVFLAG_NOINOTIFY = 0x00100000U, /* do not attempt to use inotify */ | |||
EVFLAG_NOINOTIFY = 0x00100000U, /* do not attempt to use inotify */ | |||
#if EV_COMPAT3 | |||
EVFLAG_NOSIGFD = 0, /* compatibility to pre-3.9 */ | |||
EVFLAG_NOSIGFD = 0, /* compatibility to pre-3.9 */ | |||
#endif | |||
EVFLAG_SIGNALFD = 0x00200000U, /* attempt to use signalfd */ | |||
EVFLAG_NOSIGMASK = 0x00400000U /* avoid modifying the signal mask */ | |||
EVFLAG_SIGNALFD = 0x00200000U, /* attempt to use signalfd */ | |||
EVFLAG_NOSIGMASK = 0x00400000U, /* avoid modifying the signal mask */ | |||
EVFLAG_NOTIMERFD = 0x00800000U /* avoid creating a timerfd */ | |||
}; | |||
/* method bits to be ored together */ | |||
enum { | |||
EVBACKEND_SELECT = 0x00000001U, /* available just about anywhere */ | |||
EVBACKEND_POLL = 0x00000002U, /* !win, !aix, broken on osx */ | |||
EVBACKEND_EPOLL = 0x00000004U, /* linux */ | |||
EVBACKEND_KQUEUE = 0x00000008U, /* bsd, broken on osx */ | |||
EVBACKEND_DEVPOLL = 0x00000010U, /* solaris 8 */ /* NYI */ | |||
EVBACKEND_PORT = 0x00000020U, /* solaris 10 */ | |||
EVBACKEND_ALL = 0x0000003FU, /* all known backends */ | |||
EVBACKEND_MASK = 0x0000FFFFU /* all future backends */ | |||
EVBACKEND_SELECT = 0x00000001U, /* available just about anywhere */ | |||
EVBACKEND_POLL = 0x00000002U, /* !win, !aix, broken on osx */ | |||
EVBACKEND_EPOLL = 0x00000004U, /* linux */ | |||
EVBACKEND_KQUEUE = 0x00000008U, /* bsd, broken on osx */ | |||
EVBACKEND_DEVPOLL = 0x00000010U, /* solaris 8 */ /* NYI */ | |||
EVBACKEND_PORT = 0x00000020U, /* solaris 10 */ | |||
EVBACKEND_LINUXAIO = 0x00000040U, /* linux AIO, 4.19+ */ | |||
EVBACKEND_IOURING = 0x00000080U, /* linux io_uring, 5.1+ */ | |||
EVBACKEND_ALL = 0x000000FFU, /* all known backends */ | |||
EVBACKEND_MASK = 0x0000FFFFU /* all future backends */ | |||
}; | |||
#if EV_PROTOTYPES | |||
@@ -557,7 +559,6 @@ EV_API_DECL void ev_set_syserr_cb (void (*cb)(const char *msg) EV_NOEXCEPT) EV_N | |||
/* you can call this as often as you like */ | |||
EV_API_DECL struct ev_loop *ev_default_loop (unsigned int flags EV_CPP (= 0)) EV_NOEXCEPT; | |||
/* create and destroy alternative loops that don't handle signals */ | |||
EV_API_DECL struct ev_loop *ev_loop_new (unsigned int flags EV_CPP (= 0)) EV_NOEXCEPT; | |||
@@ -643,6 +644,8 @@ EV_API_DECL int ev_active_cnt (EV_P) EV_NOEXCEPT; | |||
*/ | |||
EV_API_DECL void ev_once (EV_P_ int fd, int events, ev_tstamp timeout, void (*cb)(int revents, void *arg), void *arg) EV_NOEXCEPT; | |||
EV_API_DECL void ev_invoke_pending (EV_P); /* invoke all pending watchers */ | |||
# if EV_FEATURE_API | |||
EV_API_DECL unsigned int ev_iteration (EV_P) EV_NOEXCEPT; /* number of loop iterations */ | |||
EV_API_DECL unsigned int ev_depth (EV_P) EV_NOEXCEPT; /* #ev_loop enters - #ev_loop leaves */ | |||
@@ -660,7 +663,6 @@ EV_API_DECL void ev_set_invoke_pending_cb (EV_P_ ev_loop_callback invoke_pending | |||
EV_API_DECL void ev_set_loop_release_cb (EV_P_ void (*release)(EV_P) EV_NOEXCEPT, void (*acquire)(EV_P) EV_NOEXCEPT) EV_NOEXCEPT; | |||
EV_API_DECL unsigned int ev_pending_count (EV_P) EV_NOEXCEPT; /* number of pending events, if any */ | |||
EV_API_DECL void ev_invoke_pending (EV_P); /* invoke all pending watchers */ | |||
/* | |||
* stop/start the timer handling. | |||
@@ -680,6 +682,7 @@ EV_API_DECL void ev_resume (EV_P) EV_NOEXCEPT; | |||
ev_set_cb ((ev), cb_); \ | |||
} while (0) | |||
#define ev_io_modify(ev,events_) do { (ev)->events = (ev)->events & EV__IOFDSET | (events_); } while (0) | |||
#define ev_io_set(ev,fd_,events_) do { (ev)->fd = (fd_); (ev)->events = (events_) | EV__IOFDSET; } while (0) | |||
#define ev_timer_set(ev,after_,repeat_) do { ((ev_watcher_time *)(ev))->at = (after_); (ev)->repeat = (repeat_); } while (0) | |||
#define ev_periodic_set(ev,ofs_,ival_,rcb_) do { (ev)->offset = (ofs_); (ev)->interval = (ival_); (ev)->reschedule_cb = (rcb_); } while (0) | |||
@@ -712,7 +715,6 @@ EV_API_DECL void ev_resume (EV_P) EV_NOEXCEPT; | |||
#define ev_is_active(ev) (0 + ((ev_watcher *)(void *)(ev))->active) /* ro, true when the watcher has been started */ | |||
#define ev_can_stop(ev) (ev_is_pending(ev) || ev_is_active(ev)) /* ro, true when the watcher has been started */ | |||
#define ev_cb_(ev) (ev)->cb /* rw */ | |||
#define ev_cb(ev) (memmove (&ev_cb_ (ev), &((ev_watcher *)(ev))->cb, sizeof (ev_cb_ (ev))), (ev)->cb) | |||
@@ -727,6 +729,7 @@ EV_API_DECL void ev_resume (EV_P) EV_NOEXCEPT; | |||
#define ev_periodic_at(ev) (+((ev_watcher_time *)(ev))->at) | |||
#ifndef ev_set_cb | |||
/* memmove is used here to avoid strict aliasing violations, and hopefully is optimized out by any reasonable compiler */ | |||
# define ev_set_cb(ev,cb_) (ev_cb_ (ev) = (cb_), memmove (&((ev_watcher *)(ev))->cb, &ev_cb_ (ev), sizeof (ev_cb_ (ev)))) | |||
#endif | |||
@@ -1,7 +1,7 @@ | |||
/* | |||
* libev epoll fd activity backend | |||
* | |||
* Copyright (c) 2007,2008,2009,2010,2011 Marc Alexander Lehmann <libev@schmorp.de> | |||
* Copyright (c) 2007,2008,2009,2010,2011,2016,2017,2019 Marc Alexander Lehmann <libev@schmorp.de> | |||
* All rights reserved. | |||
* | |||
* Redistribution and use in source and binary forms, with or without modifica- | |||
@@ -93,10 +93,10 @@ epoll_modify (EV_P_ int fd, int oev, int nev) | |||
ev.events = (nev & EV_READ ? EPOLLIN : 0) | |||
| (nev & EV_WRITE ? EPOLLOUT : 0); | |||
if (expect_true (!epoll_ctl (backend_fd, oev && oldmask != nev ? EPOLL_CTL_MOD : EPOLL_CTL_ADD, fd, &ev))) | |||
if (ecb_expect_true (!epoll_ctl (backend_fd, oev && oldmask != nev ? EPOLL_CTL_MOD : EPOLL_CTL_ADD, fd, &ev))) | |||
return; | |||
if (expect_true (errno == ENOENT)) | |||
if (ecb_expect_true (errno == ENOENT)) | |||
{ | |||
/* if ENOENT then the fd went away, so try to do the right thing */ | |||
if (!nev) | |||
@@ -105,7 +105,7 @@ epoll_modify (EV_P_ int fd, int oev, int nev) | |||
if (!epoll_ctl (backend_fd, EPOLL_CTL_ADD, fd, &ev)) | |||
return; | |||
} | |||
else if (expect_true (errno == EEXIST)) | |||
else if (ecb_expect_true (errno == EEXIST)) | |||
{ | |||
/* EEXIST means we ignored a previous DEL, but the fd is still active */ | |||
/* if the kernel mask is the same as the new mask, we assume it hasn't changed */ | |||
@@ -115,7 +115,7 @@ epoll_modify (EV_P_ int fd, int oev, int nev) | |||
if (!epoll_ctl (backend_fd, EPOLL_CTL_MOD, fd, &ev)) | |||
return; | |||
} | |||
else if (expect_true (errno == EPERM)) | |||
else if (ecb_expect_true (errno == EPERM)) | |||
{ | |||
/* EPERM means the fd is always ready, but epoll is too snobbish */ | |||
/* to handle it, unlike select or poll. */ | |||
@@ -124,12 +124,14 @@ epoll_modify (EV_P_ int fd, int oev, int nev) | |||
/* add fd to epoll_eperms, if not already inside */ | |||
if (!(oldmask & EV_EMASK_EPERM)) | |||
{ | |||
array_needsize (int, epoll_eperms, epoll_epermmax, epoll_epermcnt + 1, EMPTY2); | |||
array_needsize (int, epoll_eperms, epoll_epermmax, epoll_epermcnt + 1, array_needsize_noinit); | |||
epoll_eperms [epoll_epermcnt++] = fd; | |||
} | |||
return; | |||
} | |||
else | |||
assert (("libev: I/O watcher with invalid fd found in epoll_ctl", errno != EBADF && errno != ELOOP && errno != EINVAL)); | |||
fd_kill (EV_A_ fd); | |||
@@ -144,16 +146,16 @@ epoll_poll (EV_P_ ev_tstamp timeout) | |||
int i; | |||
int eventcnt; | |||
if (expect_false (epoll_epermcnt)) | |||
timeout = 0.; | |||
if (ecb_expect_false (epoll_epermcnt)) | |||
timeout = EV_TS_CONST (0.); | |||
/* epoll wait times cannot be larger than (LONG_MAX - 999UL) / HZ msecs, which is below */ | |||
/* the default libev max wait time, however. */ | |||
EV_RELEASE_CB; | |||
eventcnt = epoll_wait (backend_fd, epoll_events, epoll_eventmax, timeout * 1e3); | |||
eventcnt = epoll_wait (backend_fd, epoll_events, epoll_eventmax, EV_TS_TO_MSEC (timeout)); | |||
EV_ACQUIRE_CB; | |||
if (expect_false (eventcnt < 0)) | |||
if (ecb_expect_false (eventcnt < 0)) | |||
{ | |||
if (errno != EINTR) | |||
ev_syserr ("(libev) epoll_wait"); | |||
@@ -176,14 +178,14 @@ epoll_poll (EV_P_ ev_tstamp timeout) | |||
* other spurious notifications will be found by epoll_ctl, below | |||
* we assume that fd is always in range, as we never shrink the anfds array | |||
*/ | |||
if (expect_false ((uint32_t)anfds [fd].egen != (uint32_t)(ev->data.u64 >> 32))) | |||
if (ecb_expect_false ((uint32_t)anfds [fd].egen != (uint32_t)(ev->data.u64 >> 32))) | |||
{ | |||
/* recreate kernel state */ | |||
postfork |= 2; | |||
continue; | |||
} | |||
if (expect_false (got & ~want)) | |||
if (ecb_expect_false (got & ~want)) | |||
{ | |||
anfds [fd].emask = want; | |||
@@ -195,6 +197,8 @@ epoll_poll (EV_P_ ev_tstamp timeout) | |||
* above with the gencounter check (== our fd is not the event fd), and | |||
* partially here, when epoll_ctl returns an error (== a child has the fd | |||
* but we closed it). | |||
* note: for events such as POLLHUP, where we can't know whether it refers | |||
* to EV_READ or EV_WRITE, we might issue redundant EPOLL_CTL_MOD calls. | |||
*/ | |||
ev->events = (want & EV_READ ? EPOLLIN : 0) | |||
| (want & EV_WRITE ? EPOLLOUT : 0); | |||
@@ -212,7 +216,7 @@ epoll_poll (EV_P_ ev_tstamp timeout) | |||
} | |||
/* if the receive array was full, increase its size */ | |||
if (expect_false (eventcnt == epoll_eventmax)) | |||
if (ecb_expect_false (eventcnt == epoll_eventmax)) | |||
{ | |||
ev_free (epoll_events); | |||
epoll_eventmax = array_nextsize (sizeof (struct epoll_event), epoll_eventmax, epoll_eventmax + 1); | |||
@@ -235,23 +239,34 @@ epoll_poll (EV_P_ ev_tstamp timeout) | |||
} | |||
} | |||
inline_size | |||
int | |||
epoll_init (EV_P_ int flags) | |||
static int | |||
epoll_epoll_create (void) | |||
{ | |||
int fd; | |||
#if defined EPOLL_CLOEXEC && !defined __ANDROID__ | |||
backend_fd = epoll_create1 (EPOLL_CLOEXEC); | |||
fd = epoll_create1 (EPOLL_CLOEXEC); | |||
if (backend_fd < 0 && (errno == EINVAL || errno == ENOSYS)) | |||
if (fd < 0 && (errno == EINVAL || errno == ENOSYS)) | |||
#endif | |||
backend_fd = epoll_create (256); | |||
{ | |||
fd = epoll_create (256); | |||
if (backend_fd < 0) | |||
return 0; | |||
if (fd >= 0) | |||
fcntl (fd, F_SETFD, FD_CLOEXEC); | |||
} | |||
return fd; | |||
} | |||
fcntl (backend_fd, F_SETFD, FD_CLOEXEC); | |||
inline_size | |||
int | |||
epoll_init (EV_P_ int flags) | |||
{ | |||
if ((backend_fd = epoll_epoll_create ()) < 0) | |||
return 0; | |||
backend_mintime = 1e-3; /* epoll does sometimes return early, this is just to avoid the worst */ | |||
backend_mintime = EV_TS_CONST (1e-3); /* epoll does sometimes return early, this is just to avoid the worst */ | |||
backend_modify = epoll_modify; | |||
backend_poll = epoll_poll; | |||
@@ -269,17 +284,15 @@ epoll_destroy (EV_P) | |||
array_free (epoll_eperm, EMPTY); | |||
} | |||
inline_size | |||
void | |||
ecb_cold | |||
static void | |||
epoll_fork (EV_P) | |||
{ | |||
close (backend_fd); | |||
while ((backend_fd = epoll_create (256)) < 0) | |||
while ((backend_fd = epoll_epoll_create ()) < 0) | |||
ev_syserr ("(libev) epoll_create"); | |||
fcntl (backend_fd, F_SETFD, FD_CLOEXEC); | |||
fd_rearm_all (EV_A); | |||
} | |||
@@ -0,0 +1,694 @@ | |||
/* | |||
* libev linux io_uring fd activity backend | |||
* | |||
* Copyright (c) 2019-2020 Marc Alexander Lehmann <libev@schmorp.de> | |||
* All rights reserved. | |||
* | |||
* Redistribution and use in source and binary forms, with or without modifica- | |||
* tion, are permitted provided that the following conditions are met: | |||
* | |||
* 1. Redistributions of source code must retain the above copyright notice, | |||
* this list of conditions and the following disclaimer. | |||
* | |||
* 2. Redistributions in binary form must reproduce the above copyright | |||
* notice, this list of conditions and the following disclaimer in the | |||
* documentation and/or other materials provided with the distribution. | |||
* | |||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED | |||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- | |||
* CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO | |||
* EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- | |||
* CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; | |||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, | |||
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- | |||
* ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | |||
* OF THE POSSIBILITY OF SUCH DAMAGE. | |||
* | |||
* Alternatively, the contents of this file may be used under the terms of | |||
* the GNU General Public License ("GPL") version 2 or any later version, | |||
* in which case the provisions of the GPL are applicable instead of | |||
* the above. If you wish to allow the use of your version of this file | |||
* only under the terms of the GPL and not to allow others to use your | |||
* version of this file under the BSD license, indicate your decision | |||
* by deleting the provisions above and replace them with the notice | |||
* and other provisions required by the GPL. If you do not delete the | |||
* provisions above, a recipient may use your version of this file under | |||
* either the BSD or the GPL. | |||
*/ | |||
/* | |||
* general notes about linux io_uring: | |||
* | |||
* a) it's the best interface I have seen so far. on linux. | |||
* b) best is not necessarily very good. | |||
* c) it's better than the aio mess, doesn't suffer from the fork problems | |||
* of linux aio or epoll and so on and so on. and you could do event stuff | |||
* without any syscalls. what's not to like? | |||
* d) ok, it's vastly more complex, but that's ok, really. | |||
* e) why two mmaps instead of one? one would be more space-efficient, | |||
* and I can't see what benefit two would have (other than being | |||
* somehow resizable/relocatable, but that's apparently not possible). | |||
* f) hmm, it's practically undebuggable (gdb can't access the memory, and | |||
* the bizarre way structure offsets are communicated makes it hard to | |||
* just print the ring buffer heads, even *iff* the memory were visible | |||
* in gdb. but then, that's also ok, really. | |||
* g) well, you cannot specify a timeout when waiting for events. no, | |||
* seriously, the interface doesn't support a timeout. never seen _that_ | |||
* before. sure, you can use a timerfd, but that's another syscall | |||
* you could have avoided. overall, this bizarre omission smells | |||
* like a µ-optimisation by the io_uring author for his personal | |||
* applications, to the detriment of everybody else who just wants | |||
* an event loop. but, umm, ok, if that's all, it could be worse. | |||
* (from what I gather from the author Jens Axboe, it simply didn't | |||
 *    occur to him, and he made good on it by adding an unlimited number
* of timeouts later :). | |||
* h) initially there was a hardcoded limit of 4096 outstanding events. | |||
* later versions not only bump this to 32k, but also can handle | |||
* an unlimited amount of events, so this only affects the batch size. | |||
 * i) unlike linux aio, you *can* register more than the limit
 *    of fd events. while early versions of io_uring signalled an overflow
* and you ended up getting wet. 5.5+ does not do this anymore. | |||
* j) but, oh my! it had exactly the same bugs as the linux aio backend, | |||
* where some undocumented poll combinations just fail. fortunately, | |||
* after finally reaching the author, he was more than willing to fix | |||
* this probably in 5.6+. | |||
* k) overall, the *API* itself is, I dare to say, not a total trainwreck. | |||
 *    once the bugs are fixed (probably in 5.6+), it will be without
* competition. | |||
*/ | |||
/* TODO: use internal TIMEOUT */ | |||
/* TODO: take advantage of single mmap, NODROP etc. */ | |||
/* TODO: resize cq/sq size independently */ | |||
#include <sys/timerfd.h> | |||
#include <sys/mman.h> | |||
#include <poll.h> | |||
#include <stdint.h> | |||
#define IOURING_INIT_ENTRIES 32 | |||
/*****************************************************************************/ | |||
/* syscall wrapdadoop - this section has the raw api/abi definitions */ | |||
#include <linux/fs.h> | |||
#include <linux/types.h> | |||
/* mostly directly taken from the kernel or documentation */ | |||
/* submission queue entry - must match the kernel ABI (linux/io_uring.h) */
struct io_uring_sqe
{
  __u8 opcode;   /* IORING_OP_* */
  __u8 flags;    /* IOSQE_* */
  __u16 ioprio;
  __s32 fd;      /* file descriptor the operation applies to */
  union {
    __u64 off;
    __u64 addr2;
  };
  __u64 addr;
  __u32 len;
  union {
    /* per-opcode flags; this backend only uses poll_events */
    __kernel_rwf_t rw_flags;
    __u32 fsync_flags;
    __u16 poll_events;       /* POLLIN/POLLOUT... for IORING_OP_POLL_ADD */
    __u32 sync_range_flags;
    __u32 msg_flags;
    __u32 timeout_flags;
    __u32 accept_flags;
    __u32 cancel_flags;
    __u32 open_flags;
    __u32 statx_flags;
  };
  __u64 user_data;           /* opaque cookie, echoed back in the cqe */
  union {
    __u16 buf_index;
    __u64 __pad2[3];         /* pads the sqe to 64 bytes */
  };
};
/* completion queue entry - must match the kernel ABI */
struct io_uring_cqe
{
  __u64 user_data;  /* cookie from the originating sqe */
  __s32 res;        /* result (e.g. revents for poll), or -errno */
  __u32 flags;
};
/* byte offsets into the sq ring mmap, filled in by io_uring_setup */
struct io_sqring_offsets
{
  __u32 head;
  __u32 tail;
  __u32 ring_mask;
  __u32 ring_entries;
  __u32 flags;
  __u32 dropped;
  __u32 array;       /* the sqe index array */
  __u32 resv1;
  __u64 resv2;
};
/* byte offsets into the cq ring mmap, filled in by io_uring_setup */
struct io_cqring_offsets
{
  __u32 head;
  __u32 tail;
  __u32 ring_mask;
  __u32 ring_entries;
  __u32 overflow;    /* count of cqes dropped by the kernel */
  __u32 cqes;        /* offset of the cqe array itself */
  __u64 resv[2];
};
/* setup parameters for io_uring_setup: in/out, the kernel fills in the
 * actual ring sizes, feature bits and the ring mmap offsets */
struct io_uring_params
{
  __u32 sq_entries;
  __u32 cq_entries;
  __u32 flags;
  __u32 sq_thread_cpu;
  __u32 sq_thread_idle;
  __u32 features;    /* IORING_FEAT_* reported by the kernel */
  __u32 resv[4];
  struct io_sqring_offsets sq_off;
  struct io_cqring_offsets cq_off;
};
#define IORING_SETUP_CQSIZE 0x00000008 | |||
#define IORING_OP_POLL_ADD 6 | |||
#define IORING_OP_POLL_REMOVE 7 | |||
#define IORING_OP_TIMEOUT 11 | |||
#define IORING_OP_TIMEOUT_REMOVE 12 | |||
/* relative or absolute, reference clock is CLOCK_MONOTONIC */ | |||
/* the kernel's __kernel_timespec: 64-bit seconds even on 32-bit abis,
 * unlike userspace struct timespec */
struct iouring_kernel_timespec
{
  int64_t tv_sec;
  long long tv_nsec;
};
#define IORING_TIMEOUT_ABS 0x00000001 | |||
#define IORING_ENTER_GETEVENTS 0x01 | |||
#define IORING_OFF_SQ_RING 0x00000000ULL | |||
#define IORING_OFF_CQ_RING 0x08000000ULL | |||
#define IORING_OFF_SQES 0x10000000ULL | |||
#define IORING_FEAT_SINGLE_MMAP 0x00000001 | |||
#define IORING_FEAT_NODROP 0x00000002 | |||
#define IORING_FEAT_SUBMIT_STABLE 0x00000004 | |||
/* raw io_uring_setup(2) syscall - called directly because libc may not
 * provide a wrapper; returns the ring fd or -1/errno */
inline_size
int
evsys_io_uring_setup (unsigned entries, struct io_uring_params *params)
{
  return ev_syscall2 (SYS_io_uring_setup, entries, params);
}
/* raw io_uring_enter(2) syscall - submits to_submit sqes and, with
 * IORING_ENTER_GETEVENTS in flags, waits for min_complete completions */
inline_size
int
evsys_io_uring_enter (int fd, unsigned to_submit, unsigned min_complete, unsigned flags, const sigset_t *sig, size_t sigsz)
{
  return ev_syscall6 (SYS_io_uring_enter, fd, to_submit, min_complete, flags, sig, sigsz);
}
/*****************************************************************************/ | |||
/* actual backed implementation */ | |||
/* we hope that volatile will make the compiler access this variables only once */ | |||
#define EV_SQ_VAR(name) *(volatile unsigned *)((char *)iouring_sq_ring + iouring_sq_ ## name) | |||
#define EV_CQ_VAR(name) *(volatile unsigned *)((char *)iouring_cq_ring + iouring_cq_ ## name) | |||
/* the index array */ | |||
#define EV_SQ_ARRAY ((unsigned *)((char *)iouring_sq_ring + iouring_sq_array)) | |||
/* the submit/completion queue entries */ | |||
#define EV_SQES ((struct io_uring_sqe *) iouring_sqes) | |||
#define EV_CQES ((struct io_uring_cqe *)((char *)iouring_cq_ring + iouring_cq_cqes)) | |||
/* flush all pending sqes to the kernel and, if timeout > 0, also wait for
 * at least one completion. releases the loop lock around the syscall.
 * returns the kernel's result (sqes consumed, or -1 with errno set). */
inline_speed
int
iouring_enter (EV_P_ ev_tstamp timeout)
{
  int res;

  EV_RELEASE_CB;

  res = evsys_io_uring_enter (iouring_fd, iouring_to_submit, 1,
                              timeout > EV_TS_CONST (0.) ? IORING_ENTER_GETEVENTS : 0, 0, 0);

  assert (("libev: io_uring_enter did not consume all sqes", (res < 0 || res == iouring_to_submit)));

  /* everything handed to the kernel (or lost on error) - reset the counter */
  iouring_to_submit = 0;

  EV_ACQUIRE_CB;

  return res;
}
/* TODO: can we move things around so we don't need this forward-reference? */ | |||
static void | |||
iouring_poll (EV_P_ ev_tstamp timeout); | |||
/* return a pointer to the next free sqe in the submission ring, flushing
 * (and, if need be, fully polling) until one becomes available.
 * the caller must fill it in and pass it to iouring_sqe_submit. */
static
struct io_uring_sqe *
iouring_sqe_get (EV_P)
{
  unsigned tail;

  for (;;)
    {
      tail = EV_SQ_VAR (tail);

      /* unsigned wraparound makes tail - head the number of in-flight sqes */
      if (ecb_expect_true (tail + 1 - EV_SQ_VAR (head) <= EV_SQ_VAR (ring_entries)))
        break; /* what's the problem, we have free sqes */

      /* queue full, need to flush and possibly handle some events */

#if EV_FEATURE_CODE
      /* first we ask the kernel nicely, most often this frees up some sqes */
      int res = iouring_enter (EV_A_ EV_TS_CONST (0.));

      ECB_MEMORY_FENCE_ACQUIRE; /* better safe than sorry */

      if (res >= 0)
        continue; /* yes, it worked, try again */
#endif

      /* some problem, possibly EBUSY - do the full poll and let it handle any issues */

      iouring_poll (EV_A_ EV_TS_CONST (0.));
      /* iouring_poll should have done ECB_MEMORY_FENCE_ACQUIRE for us */
    }

  /*assert (("libev: io_uring queue full after flush", tail + 1 - EV_SQ_VAR (head) <= EV_SQ_VAR (ring_entries)));*/

  return EV_SQES + (tail & EV_SQ_VAR (ring_mask));
}
inline_size | |||
struct io_uring_sqe * | |||
iouring_sqe_submit (EV_P_ struct io_uring_sqe *sqe) | |||
{ | |||
unsigned idx = sqe - EV_SQES; | |||
EV_SQ_ARRAY [idx] = idx; | |||
ECB_MEMORY_FENCE_RELEASE; | |||
++EV_SQ_VAR (tail); | |||
/*ECB_MEMORY_FENCE_RELEASE; /* for the time being we assume this is not needed */ | |||
++iouring_to_submit; | |||
} | |||
/*****************************************************************************/ | |||
/* when the timerfd expires we simply note the fact,
 * as the purpose of the timerfd is to wake us up, nothing else.
 * the next iteration should re-set it.
 */
static void
iouring_tfd_cb (EV_P_ struct ev_io *w, int revents)
{
  /* invalidate the cached expiry so iouring_tfd_update re-arms the timerfd */
  iouring_tfd_to = EV_TSTAMP_HUGE;
}
/* called for full and partial cleanup */ | |||
ecb_cold | |||
static int | |||
iouring_internal_destroy (EV_P) | |||
{ | |||
close (iouring_tfd); | |||
close (iouring_fd); | |||
if (iouring_sq_ring != MAP_FAILED) munmap (iouring_sq_ring, iouring_sq_ring_size); | |||
if (iouring_cq_ring != MAP_FAILED) munmap (iouring_cq_ring, iouring_cq_ring_size); | |||
if (iouring_sqes != MAP_FAILED) munmap (iouring_sqes , iouring_sqes_size ); | |||
if (ev_is_active (&iouring_tfd_w)) | |||
{ | |||
ev_ref (EV_A); | |||
ev_io_stop (EV_A_ &iouring_tfd_w); | |||
} | |||
} | |||
ecb_cold | |||
static int | |||
iouring_internal_init (EV_P) | |||
{ | |||
struct io_uring_params params = { 0 }; | |||
iouring_to_submit = 0; | |||
iouring_tfd = -1; | |||
iouring_sq_ring = MAP_FAILED; | |||
iouring_cq_ring = MAP_FAILED; | |||
iouring_sqes = MAP_FAILED; | |||
if (!have_monotonic) /* cannot really happen, but what if11 */ | |||
return -1; | |||
for (;;) | |||
{ | |||
iouring_fd = evsys_io_uring_setup (iouring_entries, ¶ms); | |||
if (iouring_fd >= 0) | |||
break; /* yippie */ | |||
if (errno != EINVAL) | |||
return -1; /* we failed */ | |||
#if TODO | |||
if ((~params.features) & (IORING_FEAT_NODROP | IORING_FEATURE_SINGLE_MMAP | IORING_FEAT_SUBMIT_STABLE)) | |||
return -1; /* we require the above features */ | |||
#endif | |||
/* EINVAL: lots of possible reasons, but maybe | |||
* it is because we hit the unqueryable hardcoded size limit | |||
*/ | |||
/* we hit the limit already, give up */ | |||
if (iouring_max_entries) | |||
return -1; | |||
/* first time we hit EINVAL? assume we hit the limit, so go back and retry */ | |||
iouring_entries >>= 1; | |||
iouring_max_entries = iouring_entries; | |||
} | |||
iouring_sq_ring_size = params.sq_off.array + params.sq_entries * sizeof (unsigned); | |||
iouring_cq_ring_size = params.cq_off.cqes + params.cq_entries * sizeof (struct io_uring_cqe); | |||
iouring_sqes_size = params.sq_entries * sizeof (struct io_uring_sqe); | |||
iouring_sq_ring = mmap (0, iouring_sq_ring_size, PROT_READ | PROT_WRITE, | |||
MAP_SHARED | MAP_POPULATE, iouring_fd, IORING_OFF_SQ_RING); | |||
iouring_cq_ring = mmap (0, iouring_cq_ring_size, PROT_READ | PROT_WRITE, | |||
MAP_SHARED | MAP_POPULATE, iouring_fd, IORING_OFF_CQ_RING); | |||
iouring_sqes = mmap (0, iouring_sqes_size, PROT_READ | PROT_WRITE, | |||
MAP_SHARED | MAP_POPULATE, iouring_fd, IORING_OFF_SQES); | |||
if (iouring_sq_ring == MAP_FAILED || iouring_cq_ring == MAP_FAILED || iouring_sqes == MAP_FAILED) | |||
return -1; | |||
iouring_sq_head = params.sq_off.head; | |||
iouring_sq_tail = params.sq_off.tail; | |||
iouring_sq_ring_mask = params.sq_off.ring_mask; | |||
iouring_sq_ring_entries = params.sq_off.ring_entries; | |||
iouring_sq_flags = params.sq_off.flags; | |||
iouring_sq_dropped = params.sq_off.dropped; | |||
iouring_sq_array = params.sq_off.array; | |||
iouring_cq_head = params.cq_off.head; | |||
iouring_cq_tail = params.cq_off.tail; | |||
iouring_cq_ring_mask = params.cq_off.ring_mask; | |||
iouring_cq_ring_entries = params.cq_off.ring_entries; | |||
iouring_cq_overflow = params.cq_off.overflow; | |||
iouring_cq_cqes = params.cq_off.cqes; | |||
iouring_tfd = timerfd_create (CLOCK_MONOTONIC, TFD_CLOEXEC); | |||
if (iouring_tfd < 0) | |||
return iouring_tfd; | |||
iouring_tfd_to = EV_TSTAMP_HUGE; | |||
return 0; | |||
} | |||
/* after a fork the ring fds and mmaps are shared with (and unusable in)
 * the child: tear everything down, build a fresh ring, re-register all
 * watched fds and re-point the timerfd watcher at the new timerfd. */
ecb_cold
static void
iouring_fork (EV_P)
{
  iouring_internal_destroy (EV_A);

  while (iouring_internal_init (EV_A) < 0)
    ev_syserr ("(libev) io_uring_setup");

  /* ask the core to re-issue modify requests for every active fd */
  fd_rearm_all (EV_A);

  ev_io_stop (EV_A_ &iouring_tfd_w);
  ev_io_set (EV_A_ &iouring_tfd_w, iouring_tfd, EV_READ);
  ev_io_start (EV_A_ &iouring_tfd_w);
}
/*****************************************************************************/ | |||
/* change the kernel poll registration for fd from event mask oev to nev.
 * poll requests are identified by (fd, generation) packed into user_data,
 * so a change is expressed as "remove the old request, add a new one". */
static void
iouring_modify (EV_P_ int fd, int oev, int nev)
{
  if (oev)
    {
      /* we assume the sqe's are all "properly" initialised */
      struct io_uring_sqe *sqe = iouring_sqe_get (EV_A);
      sqe->opcode = IORING_OP_POLL_REMOVE;
      sqe->fd = fd;
      /* Jens Axboe notified me that user_data is not what is documented, but is
       * some kind of unique ID that has to match, otherwise the request cannot
       * be removed. Since we don't *really* have that, we pass in the old
       * generation counter - if that fails, too bad, it will hopefully be removed
       * at close time and then be ignored. */
      sqe->addr = (uint32_t)fd | ((__u64)(uint32_t)anfds [fd].egen << 32);
      sqe->user_data = (uint64_t)-1;
      iouring_sqe_submit (EV_A_ sqe);

      /* increment generation counter to avoid handling old events */
      ++anfds [fd].egen;
    }

  if (nev)
    {
      struct io_uring_sqe *sqe = iouring_sqe_get (EV_A);
      sqe->opcode = IORING_OP_POLL_ADD;
      sqe->fd = fd;
      sqe->addr = 0;
      /* pack fd and generation so stale completions can be recognised */
      sqe->user_data = (uint32_t)fd | ((__u64)(uint32_t)anfds [fd].egen << 32);
      sqe->poll_events =
        (nev & EV_READ ? POLLIN : 0)
        | (nev & EV_WRITE ? POLLOUT : 0);
      iouring_sqe_submit (EV_A_ sqe);
    }
}
/* ensure the timerfd fires no later than mn_now + timeout, so the blocking
 * wait inside the kernel is interrupted in time. */
inline_size
void
iouring_tfd_update (EV_P_ ev_tstamp timeout)
{
  ev_tstamp tfd_to = mn_now + timeout;

  /* we assume there will be many iterations per timer change, so
   * we only re-set the timerfd when we have to because its expiry
   * is too late.
   */
  if (ecb_expect_false (tfd_to < iouring_tfd_to))
    {
      struct itimerspec its;

      iouring_tfd_to = tfd_to;
      EV_TS_SET (its.it_interval, 0.);
      EV_TS_SET (its.it_value, tfd_to);

      /* TFD_TIMER_ABSTIME because tfd_to is an absolute mn_now-based stamp */
      if (timerfd_settime (iouring_tfd, TFD_TIMER_ABSTIME, &its, 0) < 0)
        assert (("libev: iouring timerfd_settime failed", 0));
    }
}
/* handle a single completion queue entry: decode fd and generation counter
 * from user_data, discard cancel completions and stale-generation events,
 * translate the poll result into libev events and queue the fd for
 * re-arming (io_uring poll is oneshot). */
inline_size
void
iouring_process_cqe (EV_P_ struct io_uring_cqe *cqe)
{
  int fd = cqe->user_data & 0xffffffffU;
  uint32_t gen = cqe->user_data >> 32;
  int res = cqe->res;

  /* user_data -1 is a remove that we are not atm. interested in */
  if (cqe->user_data == (uint64_t)-1)
    return;

  assert (("libev: io_uring fd must be in-bounds", fd >= 0 && fd < anfdmax));

  /* documentation lies, of course. the result value is NOT like
   * normal syscalls, but like linux raw syscalls, i.e. negative
   * error numbers. fortunate, as otherwise there would be no way
   * to get error codes at all. still, why not document this?
   */

  /* ignore event if generation doesn't match */
  /* other than skipping removal events, */
  /* this should actually be very rare */
  if (ecb_expect_false (gen != (uint32_t)anfds [fd].egen))
    return;

  if (ecb_expect_false (res < 0))
    {
      /*TODO: EINVAL handling (was something failed with this fd)*/

      if (res == -EBADF)
        {
          assert (("libev: event loop rejected bad fd", res != -EBADF));
          fd_kill (EV_A_ fd);
        }
      else
        {
          errno = -res;
          ev_syserr ("(libev) IORING_OP_POLL_ADD");
        }

      return;
    }

  /* feed events, we do not expect or handle POLLNVAL */
  fd_event (
    EV_A_
    fd,
    (res & (POLLOUT | POLLERR | POLLHUP) ? EV_WRITE : 0)
    | (res & (POLLIN | POLLERR | POLLHUP) ? EV_READ : 0)
  );

  /* io_uring is oneshot, so we need to re-arm the fd next iteration */
  /* this also means we usually have to do at least one syscall per iteration */
  anfds [fd].events = 0;
  fd_change (EV_A_ fd, EV_ANFD_REIFY);
}
/* called when the event queue overflows */
ecb_cold
static void
iouring_overflow (EV_P)
{
  /* we have two options, resize the queue (by tearing down
   * everything and recreating it, or living with it
   * and polling.
   * we implement this by resizing the queue, and, if that fails,
   * we just recreate the state on every failure, which
   * kind of is a very inefficient poll.
   * one danger is, due to the bias toward lower fds,
   * we will only really get events for those, so
   * maybe we need a poll() fallback, after all.
   */
  /*EV_CQ_VAR (overflow) = 0;*/ /* need to do this if we keep the state and poll manually */

  fd_rearm_all (EV_A);

  /* we double the size until we hit the hard-to-probe maximum */
  if (!iouring_max_entries)
    {
      iouring_entries <<= 1;
      /* iouring_fork tears down and recreates the ring with the new size */
      iouring_fork (EV_A);
    }
  else
    {
      /* we hit the kernel limit, we should fall back to something else.
       * we can either poll() a few times and hope for the best,
       * poll always, or switch to epoll.
       * TODO: is this necessary with newer kernels?
       */

      iouring_internal_destroy (EV_A);

      /* this should make it so that on return, we don't call any uring functions */
      iouring_to_submit = 0;

      for (;;)
        {
          backend = epoll_init (EV_A_ 0);

          if (backend)
            break;

          ev_syserr ("(libev) iouring switch to epoll");
        }
    }
}
/* handle any events in the completion queue, return true if there were any */
static int
iouring_handle_cq (EV_P)
{
  unsigned head, tail, mask;

  /* the kernel publishes new entries via tail; the acquire fence pairs
   * with the kernel's release store */
  head = EV_CQ_VAR (head);
  ECB_MEMORY_FENCE_ACQUIRE;
  tail = EV_CQ_VAR (tail);

  if (head == tail)
    return 0;

  /* it can only overflow if we have events, yes, yes? */
  if (ecb_expect_false (EV_CQ_VAR (overflow)))
    {
      iouring_overflow (EV_A);
      return 1;
    }

  mask = EV_CQ_VAR (ring_mask);

  do
    iouring_process_cqe (EV_A_ &EV_CQES [head++ & mask]);
  while (head != tail);

  /* publish the consumed head so the kernel can reuse those cqes */
  EV_CQ_VAR (head) = head;
  ECB_MEMORY_FENCE_RELEASE;

  return 1;
}
/* backend poll function: drain the completion queue first; if nothing was
 * pending, arm the timerfd for the timeout, then enter the kernel to submit
 * queued sqes and/or wait for completions. */
static void
iouring_poll (EV_P_ ev_tstamp timeout)
{
  /* if we have events, no need for extra syscalls, but we might have to queue events */
  /* we also clear the timeout if there are outstanding fdchanges */
  /* the latter should only happen if both the sq and cq are full, most likely */
  /* because we have a lot of event sources that immediately complete */
  /* TODO: fdchangecnt is always 0 because fd_reify does not have two buffers yet */
  if (iouring_handle_cq (EV_A) || fdchangecnt)
    timeout = EV_TS_CONST (0.);
  else
    /* no events, so maybe wait for some */
    iouring_tfd_update (EV_A_ timeout);

  /* only enter the kernel if we have something to submit, or we need to wait */
  if (timeout || iouring_to_submit)
    {
      int res = iouring_enter (EV_A_ timeout);

      if (ecb_expect_false (res < 0))
        if (errno == EINTR)
          /* ignore */;
        else if (errno == EBUSY)
          /* cq full, cannot submit - should be rare because we flush the cq first, so simply ignore */;
        else
          ev_syserr ("(libev) iouring setup");
      else
        iouring_handle_cq (EV_A);
    }
}
/* backend initialisation: create the ring and start the timerfd watcher
 * used for timeouts. returns 0 on failure, EVBACKEND_IOURING on success. */
inline_size
int
iouring_init (EV_P_ int flags)
{
  iouring_entries     = IOURING_INIT_ENTRIES;
  iouring_max_entries = 0;

  if (iouring_internal_init (EV_A) < 0)
    {
      iouring_internal_destroy (EV_A);
      return 0;
    }

  /* the timerfd delivers timeouts to the blocking io_uring_enter call */
  ev_io_init (&iouring_tfd_w, iouring_tfd_cb, iouring_tfd, EV_READ);
  ev_set_priority (&iouring_tfd_w, EV_MINPRI);
  ev_io_start (EV_A_ &iouring_tfd_w);
  ev_unref (EV_A); /* watcher should not keep loop alive */

  backend_modify = iouring_modify;
  backend_poll = iouring_poll;

  return EVBACKEND_IOURING;
}
/* backend destructor - unmap the rings and close the fds */
inline_size
void
iouring_destroy (EV_P)
{
  iouring_internal_destroy (EV_A);
}
@@ -1,7 +1,7 @@ | |||
/* | |||
* libev kqueue backend | |||
* | |||
* Copyright (c) 2007,2008,2009,2010,2011,2012,2013 Marc Alexander Lehmann <libev@schmorp.de> | |||
* Copyright (c) 2007,2008,2009,2010,2011,2012,2013,2016,2019 Marc Alexander Lehmann <libev@schmorp.de> | |||
* All rights reserved. | |||
* | |||
* Redistribution and use in source and binary forms, with or without modifica- | |||
@@ -48,7 +48,7 @@ void | |||
kqueue_change (EV_P_ int fd, int filter, int flags, int fflags) | |||
{ | |||
++kqueue_changecnt; | |||
array_needsize (struct kevent, kqueue_changes, kqueue_changemax, kqueue_changecnt, EMPTY2); | |||
array_needsize (struct kevent, kqueue_changes, kqueue_changemax, kqueue_changecnt, array_needsize_noinit); | |||
EV_SET (&kqueue_changes [kqueue_changecnt - 1], fd, filter, flags, fflags, 0, 0); | |||
} | |||
@@ -103,10 +103,10 @@ kqueue_poll (EV_P_ ev_tstamp timeout) | |||
EV_ACQUIRE_CB; | |||
kqueue_changecnt = 0; | |||
if (expect_false (res < 0)) | |||
if (ecb_expect_false (res < 0)) | |||
{ | |||
if (errno != EINTR) | |||
ev_syserr ("(libev) kevent"); | |||
ev_syserr ("(libev) kqueue kevent"); | |||
return; | |||
} | |||
@@ -115,7 +115,7 @@ kqueue_poll (EV_P_ ev_tstamp timeout) | |||
{ | |||
int fd = kqueue_events [i].ident; | |||
if (expect_false (kqueue_events [i].flags & EV_ERROR)) | |||
if (ecb_expect_false (kqueue_events [i].flags & EV_ERROR)) | |||
{ | |||
int err = kqueue_events [i].data; | |||
@@ -129,10 +129,16 @@ kqueue_poll (EV_P_ ev_tstamp timeout) | |||
if (fd_valid (fd)) | |||
kqueue_modify (EV_A_ fd, 0, anfds [fd].events); | |||
else | |||
fd_kill (EV_A_ fd); | |||
{ | |||
assert (("libev: kqueue found invalid fd", 0)); | |||
fd_kill (EV_A_ fd); | |||
} | |||
} | |||
else /* on all other errors, we error out on the fd */ | |||
fd_kill (EV_A_ fd); | |||
{ | |||
assert (("libev: kqueue found invalid fd", 0)); | |||
fd_kill (EV_A_ fd); | |||
} | |||
} | |||
} | |||
else | |||
@@ -145,7 +151,7 @@ kqueue_poll (EV_P_ ev_tstamp timeout) | |||
); | |||
} | |||
if (expect_false (res == kqueue_eventmax)) | |||
if (ecb_expect_false (res == kqueue_eventmax)) | |||
{ | |||
ev_free (kqueue_events); | |||
kqueue_eventmax = array_nextsize (sizeof (struct kevent), kqueue_eventmax, kqueue_eventmax + 1); | |||
@@ -164,7 +170,7 @@ kqueue_init (EV_P_ int flags) | |||
fcntl (backend_fd, F_SETFD, FD_CLOEXEC); /* not sure if necessary, hopefully doesn't hurt */ | |||
backend_mintime = 1e-9; /* apparently, they did the right thing in freebsd */ | |||
backend_mintime = EV_TS_CONST (1e-9); /* apparently, they did the right thing in freebsd */ | |||
backend_modify = kqueue_modify; | |||
backend_poll = kqueue_poll; | |||
@@ -0,0 +1,620 @@ | |||
/* | |||
* libev linux aio fd activity backend | |||
* | |||
* Copyright (c) 2019 Marc Alexander Lehmann <libev@schmorp.de> | |||
* All rights reserved. | |||
* | |||
* Redistribution and use in source and binary forms, with or without modifica- | |||
* tion, are permitted provided that the following conditions are met: | |||
* | |||
* 1. Redistributions of source code must retain the above copyright notice, | |||
* this list of conditions and the following disclaimer. | |||
* | |||
* 2. Redistributions in binary form must reproduce the above copyright | |||
* notice, this list of conditions and the following disclaimer in the | |||
* documentation and/or other materials provided with the distribution. | |||
* | |||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED | |||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- | |||
* CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO | |||
* EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- | |||
* CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; | |||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, | |||
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- | |||
* ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | |||
* OF THE POSSIBILITY OF SUCH DAMAGE. | |||
* | |||
* Alternatively, the contents of this file may be used under the terms of | |||
* the GNU General Public License ("GPL") version 2 or any later version, | |||
* in which case the provisions of the GPL are applicable instead of | |||
* the above. If you wish to allow the use of your version of this file | |||
* only under the terms of the GPL and not to allow others to use your | |||
* version of this file under the BSD license, indicate your decision | |||
* by deleting the provisions above and replace them with the notice | |||
* and other provisions required by the GPL. If you do not delete the | |||
* provisions above, a recipient may use your version of this file under | |||
* either the BSD or the GPL. | |||
*/ | |||
/* | |||
* general notes about linux aio: | |||
* | |||
* a) at first, the linux aio IOCB_CMD_POLL functionality introduced in | |||
* 4.18 looks too good to be true: both watchers and events can be | |||
* batched, and events can even be handled in userspace using | |||
* a ring buffer shared with the kernel. watchers can be canceled | |||
* regardless of whether the fd has been closed. no problems with fork. | |||
* ok, the ring buffer is 200% undocumented (there isn't even a | |||
* header file), but otherwise, it's pure bliss! | |||
* b) ok, watchers are one-shot, so you have to re-arm active ones | |||
* on every iteration. so much for syscall-less event handling, | |||
* but at least these re-arms can be batched, no big deal, right? | |||
* c) well, linux as usual: the documentation lies to you: io_submit | |||
* sometimes returns EINVAL because the kernel doesn't feel like | |||
* handling your poll mask - ttys can be polled for POLLOUT, | |||
* POLLOUT|POLLIN, but polling for POLLIN fails. just great, | |||
* so we have to fall back to something else (hello, epoll), | |||
* but at least the fallback can be slow, because these are | |||
* exceptional cases, right? | |||
* d) hmm, you have to tell the kernel the maximum number of watchers | |||
* you want to queue when initialising the aio context. but of | |||
* course the real limit is magically calculated in the kernel, and | |||
 *    is often higher than we asked for. so we just have to destroy
* the aio context and re-create it a bit larger if we hit the limit. | |||
* (starts to remind you of epoll? well, it's a bit more deterministic | |||
* and less gambling, but still ugly as hell). | |||
* e) that's when you find out you can also hit an arbitrary system-wide | |||
* limit. or the kernel simply doesn't want to handle your watchers. | |||
* what the fuck do we do then? you guessed it, in the middle | |||
* of event handling we have to switch to 100% epoll polling. and | |||
* that better is as fast as normal epoll polling, so you practically | |||
* have to use the normal epoll backend with all its quirks. | |||
* f) end result of this train wreck: it inherits all the disadvantages | |||
* from epoll, while adding a number on its own. why even bother to use | |||
* it? because if conditions are right and your fds are supported and you | |||
* don't hit a limit, this backend is actually faster, doesn't gamble with | |||
* your fds, batches watchers and events and doesn't require costly state | |||
* recreates. well, until it does. | |||
* g) all of this makes this backend use almost twice as much code as epoll. | |||
 *    which in turn uses twice as much code as poll. and that's not counting
* the fact that this backend also depends on the epoll backend, making | |||
* it three times as much code as poll, or kqueue. | |||
* h) bleah. why can't linux just do kqueue. sure kqueue is ugly, but by now | |||
* it's clear that whatever linux comes up with is far, far, far worse. | |||
*/ | |||
#include <sys/time.h> /* actually linux/time.h, but we must assume they are compatible */ | |||
#include <poll.h> | |||
#include <linux/aio_abi.h> | |||
/*****************************************************************************/ | |||
/* syscall wrapdadoop - this section has the raw api/abi definitions */ | |||
#include <sys/syscall.h> /* no glibc wrappers */ | |||
/* aio_abi.h is not versioned in any way, so we cannot test for its existence */
#define IOCB_CMD_POLL 5

/* taken from linux/fs/aio.c. yup, that's a .c file.
 * not only is this totally undocumented, not even the source code
 * can tell you what the future semantics of compat_features and
 * incompat_features are, or what header_length actually is for.
 */
#define AIO_RING_MAGIC                  0xa10a10a1
#define EV_AIO_RING_INCOMPAT_FEATURES   0

/* layout of the ring buffer the kernel shares with userspace - a small
 * header followed by the io_event array the kernel appends to at tail */
struct aio_ring
{
  unsigned id;    /* kernel internal index number */
  unsigned nr;    /* number of io_events */
  unsigned head;  /* Written to by userland or by kernel. */
  unsigned tail;

  unsigned magic;
  unsigned compat_features;
  unsigned incompat_features;
  unsigned header_length; /* size of aio_ring */

  /* C99 flexible array member - same layout as the GNU zero-length
   * array extension ([0]) used previously, but standard C */
  struct io_event io_events[];
};
/* raw io_setup(2): create an aio context able to hold nr_events requests */
inline_size
int
evsys_io_setup (unsigned nr_events, aio_context_t *ctx_idp)
{
  return ev_syscall2 (SYS_io_setup, nr_events, ctx_idp);
}
/* raw io_destroy(2): tear down an aio context and cancel outstanding requests */
inline_size
int
evsys_io_destroy (aio_context_t ctx_id)
{
  return ev_syscall1 (SYS_io_destroy, ctx_id);
}
/* raw io_submit(2): queue nr iocbs; returns the number accepted, or -1 */
inline_size
int
evsys_io_submit (aio_context_t ctx_id, long nr, struct iocb *cbp[])
{
  return ev_syscall3 (SYS_io_submit, ctx_id, nr, cbp);
}
/* raw io_cancel(2): attempt to cancel a previously submitted iocb */
inline_size
int
evsys_io_cancel (aio_context_t ctx_id, struct iocb *cbp, struct io_event *result)
{
  return ev_syscall3 (SYS_io_cancel, ctx_id, cbp, result);
}
/* raw io_getevents(2): wait for between min_nr and nr completions, or timeout */
inline_size
int
evsys_io_getevents (aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout)
{
  return ev_syscall5 (SYS_io_getevents, ctx_id, min_nr, nr, events, timeout);
}
/*****************************************************************************/ | |||
/* actual backend implementation */
ecb_cold | |||
static int | |||
linuxaio_nr_events (EV_P) | |||
{ | |||
/* we start with 16 iocbs and incraese from there | |||
* that's tiny, but the kernel has a rather low system-wide | |||
* limit that can be reached quickly, so let's be parsimonious | |||
* with this resource. | |||
* Rest assured, the kernel generously rounds up small and big numbers | |||
* in different ways (but doesn't seem to charge you for it). | |||
* The 15 here is because the kernel usually has a power of two as aio-max-nr, | |||
* and this helps to take advantage of that limit. | |||
*/ | |||
/* we try to fill 4kB pages exactly. | |||
* the ring buffer header is 32 bytes, every io event is 32 bytes. | |||
* the kernel takes the io requests number, doubles it, adds 2 | |||
* and adds the ring buffer. | |||
* the way we use this is by starting low, and then roughly doubling the | |||
* size each time we hit a limit. | |||
*/ | |||
int requests = 15 << linuxaio_iteration; | |||
int one_page = (4096 | |||
/ sizeof (struct io_event) ) / 2; /* how many fit into one page */ | |||
int first_page = ((4096 - sizeof (struct aio_ring)) | |||
/ sizeof (struct io_event) - 2) / 2; /* how many fit into the first page */ | |||
/* if everything fits into one page, use count exactly */ | |||
if (requests > first_page) | |||
/* otherwise, round down to full pages and add the first page */ | |||
requests = requests / one_page * one_page + first_page; | |||
return requests; | |||
} | |||
/* we use our own wrapper structure in case we ever want to do something "clever" */
typedef struct aniocb
{
  struct iocb io; /* the actual kernel iocb, indexed by fd */
  /*int inuse;*/
} *ANIOCBP;
inline_size | |||
void | |||
linuxaio_array_needsize_iocbp (ANIOCBP *base, int offset, int count) | |||
{ | |||
while (count--) | |||
{ | |||
/* TODO: quite the overhead to allocate every iocb separately, maybe use our own allocator? */ | |||
ANIOCBP iocb = (ANIOCBP)ev_malloc (sizeof (*iocb)); | |||
/* full zero initialise is probably not required at the moment, but | |||
* this is not well documented, so we better do it. | |||
*/ | |||
memset (iocb, 0, sizeof (*iocb)); | |||
iocb->io.aio_lio_opcode = IOCB_CMD_POLL; | |||
iocb->io.aio_fildes = offset; | |||
base [offset++] = iocb; | |||
} | |||
} | |||
/* free every allocated iocb wrapper; resetting linuxaio_iocbpmax to 0 makes
 * the next array_needsize re-initialise the whole array from scratch. */
ecb_cold
static void
linuxaio_free_iocbp (EV_P)
{
  while (linuxaio_iocbpmax--)
    ev_free (linuxaio_iocbps [linuxaio_iocbpmax]);

  linuxaio_iocbpmax = 0; /* next resize will completely reallocate the array, at some overhead */
}
/* update the event set for fd: undo any epoll fallback registration, cancel
 * a still-active iocb, then (if nev is non-empty) queue a fresh oneshot
 * IOCB_CMD_POLL iocb for submission in linuxaio_poll.
 * aio_reqprio < 0 marks an fd handed over to epoll; aio_buf != 0 marks an
 * iocb currently submitted to the kernel. */
static void
linuxaio_modify (EV_P_ int fd, int oev, int nev)
{
  array_needsize (ANIOCBP, linuxaio_iocbps, linuxaio_iocbpmax, fd + 1, linuxaio_array_needsize_iocbp);
  ANIOCBP iocb = linuxaio_iocbps [fd];
  ANFD *anfd = &anfds [fd];

  if (ecb_expect_false (iocb->io.aio_reqprio < 0))
    {
      /* we handed this fd over to epoll, so undo this first */
      /* we do it manually because the optimisations on epoll_modify won't do us any good */
      epoll_ctl (backend_fd, EPOLL_CTL_DEL, fd, 0);
      anfd->emask = 0;
      iocb->io.aio_reqprio = 0;
    }
  else if (ecb_expect_false (iocb->io.aio_buf))
    {
      /* iocb active, so cancel it first before resubmit */
      /* this assumes we only ever get one call per fd per loop iteration */
      for (;;)
        {
          /* on all relevant kernels, io_cancel fails with EINPROGRESS on "success" */
          if (ecb_expect_false (evsys_io_cancel (linuxaio_ctx, &iocb->io, (struct io_event *)0) == 0))
            break;

          if (ecb_expect_true (errno == EINPROGRESS))
            break;

          /* the EINPROGRESS test is for nicer error message. clumsy. */
          if (errno != EINTR)
            {
              assert (("libev: linuxaio unexpected io_cancel failed", errno != EINTR && errno != EINPROGRESS));
              break;
            }
        }

      /* increment generation counter to avoid handling old events */
      ++anfd->egen;
    }

  /* the wanted poll mask is stored in aio_buf; 0 means "inactive" */
  iocb->io.aio_buf = (nev & EV_READ ? POLLIN : 0)
                   | (nev & EV_WRITE ? POLLOUT : 0);

  if (nev)
    {
      /* fd in the low 32 bits, generation counter in the high 32 bits */
      iocb->io.aio_data = (uint32_t)fd | ((__u64)(uint32_t)anfd->egen << 32);

      /* queue iocb up for io_submit */
      /* this assumes we only ever get one call per fd per loop iteration */
      ++linuxaio_submitcnt;
      array_needsize (struct iocb *, linuxaio_submits, linuxaio_submitmax, linuxaio_submitcnt, array_needsize_noinit);
      linuxaio_submits [linuxaio_submitcnt - 1] = &iocb->io;
    }
}
/* callback for the embedded epoll watcher: let the epoll backend process
 * events for the fds that linux aio refused to handle */
static void
linuxaio_epoll_cb (EV_P_ struct ev_io *w, int revents)
{
  epoll_poll (EV_A_ 0);
}
/* mark fd for re-arming on the next loop iteration - aio polls are oneshot,
 * so a fired iocb must be resubmitted before it delivers events again */
inline_speed
void
linuxaio_fd_rearm (EV_P_ int fd)
{
  anfds [fd].events = 0;                /* forget the currently armed event set */
  linuxaio_iocbps [fd]->io.aio_buf = 0; /* iocb is no longer submitted */
  fd_change (EV_A_ fd, EV_ANFD_REIFY);
}
static void | |||
linuxaio_parse_events (EV_P_ struct io_event *ev, int nr) | |||
{ | |||
while (nr) | |||
{ | |||
int fd = ev->data & 0xffffffff; | |||
uint32_t gen = ev->data >> 32; | |||
int res = ev->res; | |||
assert (("libev: iocb fd must be in-bounds", fd >= 0 && fd < anfdmax)); | |||
/* only accept events if generation counter matches */ | |||
if (ecb_expect_true (gen == (uint32_t)anfds [fd].egen)) | |||
{ | |||
/* feed events, we do not expect or handle POLLNVAL */ | |||
fd_event ( | |||
EV_A_ | |||
fd, | |||
(res & (POLLOUT | POLLERR | POLLHUP) ? EV_WRITE : 0) | |||
| (res & (POLLIN | POLLERR | POLLHUP) ? EV_READ : 0) | |||
); | |||
/* linux aio is oneshot: rearm fd. TODO: this does more work than strictly needed */ | |||
linuxaio_fd_rearm (EV_A_ fd); | |||
} | |||
--nr; | |||
++ev; | |||
} | |||
} | |||
/* get any events from ring buffer, return true if any were handled */
/* NOTE(review): the cast relies on the context id apparently being the
 * userspace address of the shared ring - undocumented kernel behaviour */
static int
linuxaio_get_events_from_ring (EV_P)
{
  struct aio_ring *ring = (struct aio_ring *)linuxaio_ctx;
  unsigned head, tail;

  /* the kernel reads and writes both of these variables, */
  /* as a C extension, we assume that volatile use here */
  /* both makes reads atomic and once-only */
  head = *(volatile unsigned *)&ring->head;
  ECB_MEMORY_FENCE_ACQUIRE;
  tail = *(volatile unsigned *)&ring->tail;

  if (head == tail)
    return 0;

  /* parse all available events, but only once, to avoid starvation */
  if (ecb_expect_true (tail > head)) /* normal case around */
    linuxaio_parse_events (EV_A_ ring->io_events + head, tail - head);
  else /* wrapped around */
    {
      linuxaio_parse_events (EV_A_ ring->io_events + head, ring->nr - head);
      linuxaio_parse_events (EV_A_ ring->io_events, tail);
    }

  ECB_MEMORY_FENCE_RELEASE;
  /* as an extension to C, we hope that the volatile will make this atomic and once-only */
  *(volatile unsigned *)&ring->head = tail;

  return 1;
}
/* check whether the shared ring buffer has the layout we expect; if not,
 * we fall back to fetching events via io_getevents only */
inline_size
int
linuxaio_ringbuf_valid (EV_P)
{
  struct aio_ring *ring = (struct aio_ring *)linuxaio_ctx;

  return ecb_expect_true (ring->magic == AIO_RING_MAGIC)
                      && ring->incompat_features == EV_AIO_RING_INCOMPAT_FEATURES
                      && ring->header_length == sizeof (struct aio_ring); /* TODO: or use it to find io_event[0]? */
}
/* read at least one event from kernel, or timeout */
/* prefers the shared ring buffer; only calls io_getevents when the ring is
 * empty (and we have a timeout) or the ring layout is not recognised */
inline_size
void
linuxaio_get_events (EV_P_ ev_tstamp timeout)
{
  struct timespec ts;
  struct io_event ioev[8]; /* 256 octet stack space */
  int want = 1; /* how many events to request */
  int ringbuf_valid = linuxaio_ringbuf_valid (EV_A);

  if (ecb_expect_true (ringbuf_valid))
    {
      /* if the ring buffer has any events, we don't wait or call the kernel at all */
      if (linuxaio_get_events_from_ring (EV_A))
        return;

      /* if the ring buffer is empty, and we don't have a timeout, then don't call the kernel */
      if (!timeout)
        return;
    }
  else
    /* no ringbuffer, request slightly larger batch */
    want = sizeof (ioev) / sizeof (ioev [0]);

  /* no events, so wait for some
   * for fairness reasons, we do this in a loop, to fetch all events
   */
  for (;;)
    {
      int res;

      EV_RELEASE_CB;

      EV_TS_SET (ts, timeout);
      res = evsys_io_getevents (linuxaio_ctx, 1, want, ioev, &ts);

      EV_ACQUIRE_CB;

      if (res < 0)
        if (errno == EINTR)
          /* ignored, retry */;
        else
          ev_syserr ("(libev) linuxaio io_getevents");
      else if (res)
        {
          /* at least one event available, handle them */
          linuxaio_parse_events (EV_A_ ioev, res);

          if (ecb_expect_true (ringbuf_valid))
            {
              /* if we have a ring buffer, handle any remaining events in it */
              linuxaio_get_events_from_ring (EV_A);

              /* at this point, we should have handled all outstanding events */
              break;
            }
          else if (res < want)
            /* otherwise, if there were fewer events than we wanted, we assume there are no more */
            break;
        }
      else
        break; /* no events from the kernel, we are done */

      timeout = EV_TS_CONST (0.); /* only wait in the first iteration */
    }
}
/* (re-)create the aio context, sized for the current resize iteration */
inline_size
int
linuxaio_io_setup (EV_P)
{
  linuxaio_ctx = 0; /* io_setup requires a zeroed context id on input */
  return evsys_io_setup (linuxaio_nr_events (EV_A), &linuxaio_ctx);
}
/* backend poll function: first submit all queued iocbs (handling the
 * various creative ways io_submit can fail), then fetch and dispatch
 * completion events */
static void
linuxaio_poll (EV_P_ ev_tstamp timeout)
{
  int submitted;

  /* first phase: submit new iocbs */

  /* io_submit might return less than the requested number of iocbs */
  /* this is, afaics, only because of errors, but we go by the book and use a loop, */
  /* which allows us to pinpoint the erroneous iocb */
  for (submitted = 0; submitted < linuxaio_submitcnt; )
    {
      int res = evsys_io_submit (linuxaio_ctx, linuxaio_submitcnt - submitted, linuxaio_submits + submitted);

      if (ecb_expect_false (res < 0))
        if (errno == EINVAL)
          {
            /* This happens for unsupported fds, officially, but in my testing,
             * also randomly happens for supported fds. We fall back to good old
             * poll() here, under the assumption that this is a very rare case.
             * See https://lore.kernel.org/patchwork/patch/1047453/ to see
             * discussion about such a case (ttys) where polling for POLLIN
             * fails but POLLIN|POLLOUT works.
             */
            struct iocb *iocb = linuxaio_submits [submitted];
            epoll_modify (EV_A_ iocb->aio_fildes, 0, anfds [iocb->aio_fildes].events);
            iocb->aio_reqprio = -1; /* mark iocb as epoll */

            res = 1; /* skip this iocb - another iocb, another chance */
          }
        else if (errno == EAGAIN)
          {
            /* This happens when the ring buffer is full, or some other shit we
             * don't know and isn't documented. Most likely because we have too
             * many requests and linux aio can't be assed to handle them.
             * In this case, we try to allocate a larger ring buffer, freeing
             * ours first. This might fail, in which case we have to fall back to 100%
             * epoll.
             * God, how I hate linux not getting its act together. Ever.
             */
            evsys_io_destroy (linuxaio_ctx);
            linuxaio_submitcnt = 0;

            /* rearm all fds with active iocbs */
            {
              int fd;
              for (fd = 0; fd < linuxaio_iocbpmax; ++fd)
                if (linuxaio_iocbps [fd]->io.aio_buf)
                  linuxaio_fd_rearm (EV_A_ fd);
            }

            ++linuxaio_iteration;
            if (linuxaio_io_setup (EV_A) < 0)
              {
                /* TODO: rearm all and recreate epoll backend from scratch */
                /* TODO: might be more prudent? */

                /* too bad, we can't get a new aio context, go 100% epoll */
                linuxaio_free_iocbp (EV_A);
                ev_io_stop (EV_A_ &linuxaio_epoll_w);
                ev_ref (EV_A);
                linuxaio_ctx = 0;

                backend        = EVBACKEND_EPOLL;
                backend_modify = epoll_modify;
                backend_poll   = epoll_poll;
              }

            timeout = EV_TS_CONST (0.);
            /* it's easiest to handle this mess in another iteration */
            return;
          }
        else if (errno == EBADF)
          {
            assert (("libev: event loop rejected bad fd", errno != EBADF));
            fd_kill (EV_A_ linuxaio_submits [submitted]->aio_fildes);

            res = 1; /* skip this iocb */
          }
        else if (errno == EINTR) /* not seen in reality, not documented */
          res = 0; /* silently ignore and retry */
        else
          {
            ev_syserr ("(libev) linuxaio io_submit");
            res = 0;
          }

      submitted += res;
    }

  linuxaio_submitcnt = 0;

  /* second phase: fetch and parse events */

  linuxaio_get_events (EV_A_ timeout);
}
/* backend initialisation: requires a kernel with working IOCB_CMD_POLL and
 * a functioning epoll backend to fall back on for unsupported fds.
 * returns 0 on failure, EVBACKEND_LINUXAIO on success. */
inline_size
int
linuxaio_init (EV_P_ int flags)
{
  /* would be great to have a nice test for IOCB_CMD_POLL instead */
  /* also: test some semi-common fd types, such as files and ttys in recommended_backends */
  /* 4.18 introduced IOCB_CMD_POLL, 4.19 made epoll work, and we need that */
  if (ev_linux_version () < 0x041300)
    return 0;

  if (!epoll_init (EV_A_ 0))
    return 0;

  linuxaio_iteration = 0;

  if (linuxaio_io_setup (EV_A) < 0)
    {
      epoll_destroy (EV_A);
      return 0;
    }

  /* watch the epoll fd so fallback events get picked up promptly */
  ev_io_init (&linuxaio_epoll_w, linuxaio_epoll_cb, backend_fd, EV_READ);
  ev_set_priority (&linuxaio_epoll_w, EV_MAXPRI);
  ev_io_start (EV_A_ &linuxaio_epoll_w);
  ev_unref (EV_A); /* watcher should not keep loop alive */

  backend_modify = linuxaio_modify;
  backend_poll = linuxaio_poll;

  linuxaio_iocbpmax = 0;
  linuxaio_iocbps = 0;

  linuxaio_submits = 0;
  linuxaio_submitmax = 0;
  linuxaio_submitcnt = 0;

  return EVBACKEND_LINUXAIO;
}
/* backend destructor: tear down the embedded epoll backend, the iocb
 * wrappers and the aio context */
inline_size
void
linuxaio_destroy (EV_P)
{
  epoll_destroy (EV_A);
  linuxaio_free_iocbp (EV_A);
  evsys_io_destroy (linuxaio_ctx); /* fails in child, aio context is destroyed */
}
/* called in the child after a fork: the aio context does not survive, so
 * discard all queued/allocated iocbs, build a fresh context and re-register
 * the embedded epoll watcher */
ecb_cold
static void
linuxaio_fork (EV_P)
{
  linuxaio_submitcnt = 0; /* all pointers were invalidated */
  linuxaio_free_iocbp (EV_A); /* this frees all iocbs, which is very heavy-handed */
  evsys_io_destroy (linuxaio_ctx); /* fails in child, aio context is destroyed */

  linuxaio_iteration = 0; /* we start over in the child */

  while (linuxaio_io_setup (EV_A) < 0)
    ev_syserr ("(libev) linuxaio io_setup");

  /* forking epoll should also effectively unregister all fds from the backend */
  epoll_fork (EV_A);
  /* epoll_fork already did this. hopefully */
  /*fd_rearm_all (EV_A);*/

  ev_io_stop (EV_A_ &linuxaio_epoll_w);
  ev_io_set (EV_A_ &linuxaio_epoll_w, backend_fd, EV_READ);
  ev_io_start (EV_A_ &linuxaio_epoll_w);
}
@@ -1,7 +1,7 @@ | |||
/* | |||
* libev poll fd activity backend | |||
* | |||
* Copyright (c) 2007,2008,2009,2010,2011 Marc Alexander Lehmann <libev@schmorp.de> | |||
* Copyright (c) 2007,2008,2009,2010,2011,2016,2019 Marc Alexander Lehmann <libev@schmorp.de> | |||
* All rights reserved. | |||
* | |||
* Redistribution and use in source and binary forms, with or without modifica- | |||
@@ -41,10 +41,12 @@ | |||
inline_size | |||
void | |||
pollidx_init (int *base, int count) | |||
array_needsize_pollidx (int *base, int offset, int count) | |||
{ | |||
/* consider using memset (.., -1, ...), which is practically guaranteed | |||
* to work on all systems implementing poll */ | |||
/* using memset (.., -1, ...) is tempting, we we try | |||
* to be ultraportable | |||
*/ | |||
base += offset; | |||
while (count--) | |||
*base++ = -1; | |||
} | |||
@@ -57,14 +59,14 @@ poll_modify (EV_P_ int fd, int oev, int nev) | |||
if (oev == nev) | |||
return; | |||
array_needsize (int, pollidxs, pollidxmax, fd + 1, pollidx_init); | |||
array_needsize (int, pollidxs, pollidxmax, fd + 1, array_needsize_pollidx); | |||
idx = pollidxs [fd]; | |||
if (idx < 0) /* need to allocate a new pollfd */ | |||
{ | |||
pollidxs [fd] = idx = pollcnt++; | |||
array_needsize (struct pollfd, polls, pollmax, pollcnt, EMPTY2); | |||
array_needsize (struct pollfd, polls, pollmax, pollcnt, array_needsize_noinit); | |||
polls [idx].fd = fd; | |||
} | |||
@@ -78,7 +80,7 @@ poll_modify (EV_P_ int fd, int oev, int nev) | |||
{ | |||
pollidxs [fd] = -1; | |||
if (expect_true (idx < --pollcnt)) | |||
if (ecb_expect_true (idx < --pollcnt)) | |||
{ | |||
polls [idx] = polls [pollcnt]; | |||
pollidxs [polls [idx].fd] = idx; | |||
@@ -93,10 +95,10 @@ poll_poll (EV_P_ ev_tstamp timeout) | |||
int res; | |||
EV_RELEASE_CB; | |||
res = poll (polls, pollcnt, timeout * 1e3); | |||
res = poll (polls, pollcnt, EV_TS_TO_MSEC (timeout)); | |||
EV_ACQUIRE_CB; | |||
if (expect_false (res < 0)) | |||
if (ecb_expect_false (res < 0)) | |||
{ | |||
if (errno == EBADF) | |||
fd_ebadf (EV_A); | |||
@@ -108,14 +110,17 @@ poll_poll (EV_P_ ev_tstamp timeout) | |||
else | |||
for (p = polls; res; ++p) | |||
{ | |||
assert (("libev: poll() returned illegal result, broken BSD kernel?", p < polls + pollcnt)); | |||
assert (("libev: poll returned illegal result, broken BSD kernel?", p < polls + pollcnt)); | |||
if (expect_false (p->revents)) /* this expect is debatable */ | |||
if (ecb_expect_false (p->revents)) /* this expect is debatable */ | |||
{ | |||
--res; | |||
if (expect_false (p->revents & POLLNVAL)) | |||
fd_kill (EV_A_ p->fd); | |||
if (ecb_expect_false (p->revents & POLLNVAL)) | |||
{ | |||
assert (("libev: poll found invalid fd in poll set", 0)); | |||
fd_kill (EV_A_ p->fd); | |||
} | |||
else | |||
fd_event ( | |||
EV_A_ | |||
@@ -131,7 +136,7 @@ inline_size | |||
int | |||
poll_init (EV_P_ int flags) | |||
{ | |||
backend_mintime = 1e-3; | |||
backend_mintime = EV_TS_CONST (1e-3); | |||
backend_modify = poll_modify; | |||
backend_poll = poll_poll; | |||
@@ -1,7 +1,7 @@ | |||
/* | |||
* libev solaris event port backend | |||
* | |||
* Copyright (c) 2007,2008,2009,2010,2011 Marc Alexander Lehmann <libev@schmorp.de> | |||
* Copyright (c) 2007,2008,2009,2010,2011,2019 Marc Alexander Lehmann <libev@schmorp.de> | |||
* All rights reserved. | |||
* | |||
* Redistribution and use in source and binary forms, with or without modifica- | |||
@@ -69,7 +69,10 @@ port_associate_and_check (EV_P_ int fd, int ev) | |||
) | |||
{ | |||
if (errno == EBADFD) | |||
fd_kill (EV_A_ fd); | |||
{ | |||
assert (("libev: port_associate found invalid fd", errno != EBADFD)); | |||
fd_kill (EV_A_ fd); | |||
} | |||
else | |||
ev_syserr ("(libev) port_associate"); | |||
} | |||
@@ -129,7 +132,7 @@ port_poll (EV_P_ ev_tstamp timeout) | |||
} | |||
} | |||
if (expect_false (nget == port_eventmax)) | |||
if (ecb_expect_false (nget == port_eventmax)) | |||
{ | |||
ev_free (port_events); | |||
port_eventmax = array_nextsize (sizeof (port_event_t), port_eventmax, port_eventmax + 1); | |||
@@ -151,11 +154,11 @@ port_init (EV_P_ int flags) | |||
/* if my reading of the opensolaris kernel sources are correct, then | |||
* opensolaris does something very stupid: it checks if the time has already | |||
* elapsed and doesn't round up if that is the case,m otherwise it DOES round | |||
* elapsed and doesn't round up if that is the case, otherwise it DOES round | |||
* up. Since we can't know what the case is, we need to guess by using a | |||
* "large enough" timeout. Normally, 1e-9 would be correct. | |||
*/ | |||
backend_mintime = 1e-3; /* needed to compensate for port_getn returning early */ | |||
backend_mintime = EV_TS_CONST (1e-3); /* needed to compensate for port_getn returning early */ | |||
backend_modify = port_modify; | |||
backend_poll = port_poll; | |||
@@ -108,7 +108,7 @@ select_modify (EV_P_ int fd, int oev, int nev) | |||
int word = fd / NFDBITS; | |||
fd_mask mask = 1UL << (fd % NFDBITS); | |||
if (expect_false (vec_max <= word)) | |||
if (ecb_expect_false (vec_max <= word)) | |||
{ | |||
int new_max = word + 1; | |||
@@ -171,7 +171,7 @@ select_poll (EV_P_ ev_tstamp timeout) | |||
#endif | |||
EV_ACQUIRE_CB; | |||
if (expect_false (res < 0)) | |||
if (ecb_expect_false (res < 0)) | |||
{ | |||
#if EV_SELECT_IS_WINSOCKET | |||
errno = WSAGetLastError (); | |||
@@ -197,7 +197,7 @@ select_poll (EV_P_ ev_tstamp timeout) | |||
{ | |||
if (timeout) | |||
{ | |||
unsigned long ms = timeout * 1e3; | |||
unsigned long ms = EV_TS_TO_MSEC (timeout); | |||
Sleep (ms ? ms : 1); | |||
} | |||
@@ -236,7 +236,7 @@ select_poll (EV_P_ ev_tstamp timeout) | |||
if (FD_ISSET (handle, (fd_set *)vec_eo)) events |= EV_WRITE; | |||
#endif | |||
if (expect_true (events)) | |||
if (ecb_expect_true (events)) | |||
fd_event (EV_A_ fd, events); | |||
} | |||
} | |||
@@ -262,7 +262,7 @@ select_poll (EV_P_ ev_tstamp timeout) | |||
events |= word_r & mask ? EV_READ : 0; | |||
events |= word_w & mask ? EV_WRITE : 0; | |||
if (expect_true (events)) | |||
if (ecb_expect_true (events)) | |||
fd_event (EV_A_ word * NFDBITS + bit, events); | |||
} | |||
} | |||
@@ -275,7 +275,7 @@ inline_size | |||
int | |||
select_init (EV_P_ int flags) | |||
{ | |||
backend_mintime = 1e-6; | |||
backend_mintime = EV_TS_CONST (1e-6); | |||
backend_modify = select_modify; | |||
backend_poll = select_poll; | |||
@@ -1,7 +1,7 @@ | |||
/* | |||
* loop member variable declarations | |||
* | |||
* Copyright (c) 2007,2008,2009,2010,2011,2012,2013 Marc Alexander Lehmann <libev@schmorp.de> | |||
* Copyright (c) 2007,2008,2009,2010,2011,2012,2013,2019 Marc Alexander Lehmann <libev@schmorp.de> | |||
* All rights reserved. | |||
* | |||
* Redistribution and use in source and binary forms, with or without modifica- | |||
@@ -107,6 +107,46 @@ VARx(int, epoll_epermcnt) | |||
VARx(int, epoll_epermmax) | |||
#endif | |||
#if EV_USE_LINUXAIO || EV_GENWRAP | |||
VARx(aio_context_t, linuxaio_ctx) | |||
VARx(int, linuxaio_iteration) | |||
VARx(struct aniocb **, linuxaio_iocbps) | |||
VARx(int, linuxaio_iocbpmax) | |||
VARx(struct iocb **, linuxaio_submits) | |||
VARx(int, linuxaio_submitcnt) | |||
VARx(int, linuxaio_submitmax) | |||
VARx(ev_io, linuxaio_epoll_w) | |||
#endif | |||
#if EV_USE_IOURING || EV_GENWRAP | |||
VARx(int, iouring_fd) | |||
VARx(unsigned, iouring_to_submit); | |||
VARx(int, iouring_entries) | |||
VARx(int, iouring_max_entries) | |||
VARx(void *, iouring_sq_ring) | |||
VARx(void *, iouring_cq_ring) | |||
VARx(void *, iouring_sqes) | |||
VARx(uint32_t, iouring_sq_ring_size) | |||
VARx(uint32_t, iouring_cq_ring_size) | |||
VARx(uint32_t, iouring_sqes_size) | |||
VARx(uint32_t, iouring_sq_head) | |||
VARx(uint32_t, iouring_sq_tail) | |||
VARx(uint32_t, iouring_sq_ring_mask) | |||
VARx(uint32_t, iouring_sq_ring_entries) | |||
VARx(uint32_t, iouring_sq_flags) | |||
VARx(uint32_t, iouring_sq_dropped) | |||
VARx(uint32_t, iouring_sq_array) | |||
VARx(uint32_t, iouring_cq_head) | |||
VARx(uint32_t, iouring_cq_tail) | |||
VARx(uint32_t, iouring_cq_ring_mask) | |||
VARx(uint32_t, iouring_cq_ring_entries) | |||
VARx(uint32_t, iouring_cq_overflow) | |||
VARx(uint32_t, iouring_cq_cqes) | |||
VARx(ev_tstamp, iouring_tfd_to) | |||
VARx(int, iouring_tfd) | |||
VARx(ev_io, iouring_tfd_w) | |||
#endif | |||
#if EV_USE_KQUEUE || EV_GENWRAP | |||
VARx(pid_t, kqueue_fd_pid) | |||
VARx(struct kevent *, kqueue_changes) | |||
@@ -187,6 +227,11 @@ VARx(ev_io, sigfd_w) | |||
VARx(sigset_t, sigfd_set) | |||
#endif | |||
#if EV_USE_TIMERFD || EV_GENWRAP | |||
VARx(int, timerfd) /* timerfd for time jump detection */ | |||
VARx(ev_io, timerfd_w) | |||
#endif | |||
VARx(unsigned int, origflags) /* original loop flags */ | |||
#if EV_FEATURE_API || EV_GENWRAP |
@@ -154,8 +154,8 @@ ev_time (void) | |||
ui.u.LowPart = ft.dwLowDateTime; | |||
ui.u.HighPart = ft.dwHighDateTime; | |||
/* msvc cannot convert ulonglong to double... yes, it is that sucky */ | |||
return (LONGLONG)(ui.QuadPart - 116444736000000000) * 1e-7; | |||
/* also, msvc cannot convert ulonglong to double... yes, it is that sucky */ | |||
return EV_TS_FROM_USEC (((LONGLONG)(ui.QuadPart - 116444736000000000) * 1e-1)); | |||
} | |||
#endif |
@@ -44,12 +44,46 @@ | |||
#define invoke_cb ((loop)->invoke_cb) | |||
#define io_blocktime ((loop)->io_blocktime) | |||
#define iocp ((loop)->iocp) | |||
#define iouring_cq_cqes ((loop)->iouring_cq_cqes) | |||
#define iouring_cq_head ((loop)->iouring_cq_head) | |||
#define iouring_cq_overflow ((loop)->iouring_cq_overflow) | |||
#define iouring_cq_ring ((loop)->iouring_cq_ring) | |||
#define iouring_cq_ring_entries ((loop)->iouring_cq_ring_entries) | |||
#define iouring_cq_ring_mask ((loop)->iouring_cq_ring_mask) | |||
#define iouring_cq_ring_size ((loop)->iouring_cq_ring_size) | |||
#define iouring_cq_tail ((loop)->iouring_cq_tail) | |||
#define iouring_entries ((loop)->iouring_entries) | |||
#define iouring_fd ((loop)->iouring_fd) | |||
#define iouring_max_entries ((loop)->iouring_max_entries) | |||
#define iouring_sq_array ((loop)->iouring_sq_array) | |||
#define iouring_sq_dropped ((loop)->iouring_sq_dropped) | |||
#define iouring_sq_flags ((loop)->iouring_sq_flags) | |||
#define iouring_sq_head ((loop)->iouring_sq_head) | |||
#define iouring_sq_ring ((loop)->iouring_sq_ring) | |||
#define iouring_sq_ring_entries ((loop)->iouring_sq_ring_entries) | |||
#define iouring_sq_ring_mask ((loop)->iouring_sq_ring_mask) | |||
#define iouring_sq_ring_size ((loop)->iouring_sq_ring_size) | |||
#define iouring_sq_tail ((loop)->iouring_sq_tail) | |||
#define iouring_sqes ((loop)->iouring_sqes) | |||
#define iouring_sqes_size ((loop)->iouring_sqes_size) | |||
#define iouring_tfd ((loop)->iouring_tfd) | |||
#define iouring_tfd_to ((loop)->iouring_tfd_to) | |||
#define iouring_tfd_w ((loop)->iouring_tfd_w) | |||
#define iouring_to_submit ((loop)->iouring_to_submit) | |||
#define kqueue_changecnt ((loop)->kqueue_changecnt) | |||
#define kqueue_changemax ((loop)->kqueue_changemax) | |||
#define kqueue_changes ((loop)->kqueue_changes) | |||
#define kqueue_eventmax ((loop)->kqueue_eventmax) | |||
#define kqueue_events ((loop)->kqueue_events) | |||
#define kqueue_fd_pid ((loop)->kqueue_fd_pid) | |||
#define linuxaio_ctx ((loop)->linuxaio_ctx) | |||
#define linuxaio_epoll_w ((loop)->linuxaio_epoll_w) | |||
#define linuxaio_iocbpmax ((loop)->linuxaio_iocbpmax) | |||
#define linuxaio_iocbps ((loop)->linuxaio_iocbps) | |||
#define linuxaio_iteration ((loop)->linuxaio_iteration) | |||
#define linuxaio_submitcnt ((loop)->linuxaio_submitcnt) | |||
#define linuxaio_submitmax ((loop)->linuxaio_submitmax) | |||
#define linuxaio_submits ((loop)->linuxaio_submits) | |||
#define loop_count ((loop)->loop_count) | |||
#define loop_depth ((loop)->loop_depth) | |||
#define loop_done ((loop)->loop_done) | |||
@@ -89,6 +123,8 @@ | |||
#define sigfd_w ((loop)->sigfd_w) | |||
#define timeout_blocktime ((loop)->timeout_blocktime) | |||
#define timercnt ((loop)->timercnt) | |||
#define timerfd ((loop)->timerfd) | |||
#define timerfd_w ((loop)->timerfd_w) | |||
#define timermax ((loop)->timermax) | |||
#define timers ((loop)->timers) | |||
#define userdata ((loop)->userdata) | |||
@@ -143,12 +179,46 @@ | |||
#undef invoke_cb | |||
#undef io_blocktime | |||
#undef iocp | |||
#undef iouring_cq_cqes | |||
#undef iouring_cq_head | |||
#undef iouring_cq_overflow | |||
#undef iouring_cq_ring | |||
#undef iouring_cq_ring_entries | |||
#undef iouring_cq_ring_mask | |||
#undef iouring_cq_ring_size | |||
#undef iouring_cq_tail | |||
#undef iouring_entries | |||
#undef iouring_fd | |||
#undef iouring_max_entries | |||
#undef iouring_sq_array | |||
#undef iouring_sq_dropped | |||
#undef iouring_sq_flags | |||
#undef iouring_sq_head | |||
#undef iouring_sq_ring | |||
#undef iouring_sq_ring_entries | |||
#undef iouring_sq_ring_mask | |||
#undef iouring_sq_ring_size | |||
#undef iouring_sq_tail | |||
#undef iouring_sqes | |||
#undef iouring_sqes_size | |||
#undef iouring_tfd | |||
#undef iouring_tfd_to | |||
#undef iouring_tfd_w | |||
#undef iouring_to_submit | |||
#undef kqueue_changecnt | |||
#undef kqueue_changemax | |||
#undef kqueue_changes | |||
#undef kqueue_eventmax | |||
#undef kqueue_events | |||
#undef kqueue_fd_pid | |||
#undef linuxaio_ctx | |||
#undef linuxaio_epoll_w | |||
#undef linuxaio_iocbpmax | |||
#undef linuxaio_iocbps | |||
#undef linuxaio_iteration | |||
#undef linuxaio_submitcnt | |||
#undef linuxaio_submitmax | |||
#undef linuxaio_submits | |||
#undef loop_count | |||
#undef loop_depth | |||
#undef loop_done | |||
@@ -188,6 +258,8 @@ | |||
#undef sigfd_w | |||
#undef timeout_blocktime | |||
#undef timercnt | |||
#undef timerfd | |||
#undef timerfd_w | |||
#undef timermax | |||
#undef timers | |||
#undef userdata |
@@ -2631,6 +2631,14 @@ rspamd_config_ev_backend_to_string (int ev_backend, gboolean *effective) | |||
} | |||
if (ev_backend & EVBACKEND_EPOLL) { | |||
if (ev_backend & EVBACKEND_IOURING) { | |||
SET_EFFECTIVE (TRUE); | |||
return "epoll+io_uring"; | |||
} | |||
if (ev_backend & EVBACKEND_LINUXAIO) { | |||
SET_EFFECTIVE (TRUE); | |||
return "epoll+aio"; | |||
} | |||
SET_EFFECTIVE (TRUE); | |||
return "epoll"; | |||
} |