author    Lennart Poettering <lennart@poettering.net> 2018-05-28 16:26:50 +0200
committer Sven Eden <yamakuzure@gmx.net> 2018-08-24 16:47:08 +0200
commit    2f4c9492056c9170b3c673638f9f1d14df3a5de2 (patch)
tree      bb7236dd7f85e482e0bc2dbd63ce0ff1254f5de0 /src
parent    8017c497ece57371624c52b415ea52acfb3bfc9c (diff)
sd-event: add new API for subscribing to inotify events
This adds a new call sd_event_add_inotify() which allows watching for inotify events on specified paths. sd-event will try to minimize the number of inotify fds allocated, and will try to add file watches to the same inotify fd objects as far as that's possible. Sharing inotify objects like this should optimize behaviour in programs that watch a limited set of mostly independent files, as in most cases a single inotify object will suffice for watching all files.

Traditionally, this kind of coalescing logic (i.e. implementing multiple event sources on top of a single inotify object) was very hard to do, as the inotify API had serious limitations: it only allowed adding watches by path, and would implicitly merge watches installed on the same inode via different paths, without letting the caller know whether such merging took place.

With the advent of O_PATH this issue can be dealt with to some degree: instead of adding a path to an inotify object with inotify_add_watch() right away, we can open the path with O_PATH first, call fstat() on the fd, and check the .st_dev/.st_ino fields against a list of watches we already have in place. If we find a match, we know that inotify_add_watch() will update the watch mask of the existing watch; otherwise it will create a new watch. To make this race-free we call inotify_add_watch() on the /proc/self/fd/ path of the O_PATH fd, instead of the original path, so that the check and the watch update are guaranteed to operate on the same inode. This approach lets us deal safely with inodes that may appear under various different paths (due to symlinks, hardlinks, bind mounts, fs namespaces).

However, it's not a perfect solution: currently the kernel has no API for changing the watch mask of an existing watch -- unless you have a path or fd to the original inode. This means we can "merge" the watches of multiple event sources on the same inode correctly, but in many cases we cannot "unmerge" them again, as access to the original inode might have been lost due to renames, mounts/unmounts, or deletions. We could in theory always keep an O_PATH fd of the watched inode open, so that we can change the mask anytime we want, but this is highly problematic, as it would consume too many fds (and in fact the scarcity of fds is the reason why watch descriptors are a separate concept from fds) and would keep the backing mounts busy (wds do not keep mounts busy, fds do). The approach implemented here: filter in userspace, and accept that the watch mask on some inode might be wider than necessary due to earlier-installed event sources that have since ceased to exist. This approach, while ugly, shouldn't be too bad for most cases, as the same inodes are probably watched for the same masks in most programs.

In order to implement priorities correctly, a separate inotify object is allocated for each priority that is used. This way we get separate per-priority event queues, of which we never dequeue more than a few events at a time.

Fixes: #3982
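For context, a minimal caller-side sketch of the new API, assuming only the declarations this commit adds to sd-event.h; the watched path, mask, and include path are illustrative:

```c
#include <inttypes.h>
#include <stdio.h>
#include <sys/inotify.h>

#include "sd-event.h"   /* header extended by this commit; installed path may vary */

static int on_inotify(sd_event_source *s, const struct inotify_event *event, void *userdata) {
        /* event->name is only populated for watches on directories */
        printf("mask=0x%" PRIx32 " name=%s\n",
               event->mask, event->len > 0 ? event->name : "(the inode itself)");
        return 0;
}

int main(void) {
        sd_event *e = NULL;
        int r;

        r = sd_event_default(&e);
        if (r < 0)
                return 1;

        /* Watch /tmp for creation/deletion. Passing NULL as the source pointer
         * makes the source "floating", i.e. owned by the event loop itself
         * (see the !ret argument to source_new() in the patch below). */
        r = sd_event_add_inotify(e, NULL, "/tmp", IN_CREATE|IN_DELETE, on_inotify, NULL);
        if (r < 0)
                return 1;

        r = sd_event_loop(e);   /* runs until the loop is told to exit */
        sd_event_unref(e);
        return r < 0;
}
```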
Diffstat (limited to 'src')
-rw-r--r--  src/libelogind/libelogind.sym        2
-rw-r--r--  src/libelogind/sd-event/sd-event.c 816
-rw-r--r--  src/systemd/sd-event.h               4
3 files changed, 821 insertions(+), 1 deletion(-)
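For illustration, here is the commit message's O_PATH/fstat() trick reduced to a standalone sketch. add_watch_by_fd() is a hypothetical stand-in for the inotify_add_watch_fd() helper the patch calls, and the (dev, ino) hashmap lookup that distinguishes "new watch" from "mask update" is elided:

```c
#include <fcntl.h>
#include <stdio.h>
#include <sys/inotify.h>
#include <sys/stat.h>

/* Watch an inode via its O_PATH fd: inotify_add_watch() on the magic
 * /proc/self/fd/ symlink resolves to exactly the inode the fd pins, so the
 * dev/ino check below and the watch installation cannot race against renames. */
static int add_watch_by_fd(int inotify_fd, int path_fd, uint32_t mask) {
        char p[sizeof("/proc/self/fd/") + 10];

        snprintf(p, sizeof(p), "/proc/self/fd/%i", path_fd);
        return inotify_add_watch(inotify_fd, p, mask);
}

int main(void) {
        struct stat st;
        int inotify_fd, path_fd, wd;

        inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
        path_fd = open("/tmp", O_PATH|O_CLOEXEC);   /* pin the inode, don't open it */
        if (inotify_fd < 0 || path_fd < 0)
                return 1;

        if (fstat(path_fd, &st) < 0)
                return 1;

        /* st.st_dev/st.st_ino now identify the inode; looking them up in a table
         * of existing watches (elided here) tells us whether the call below will
         * create a new watch or merely update the mask of an existing one. */
        wd = add_watch_by_fd(inotify_fd, path_fd, IN_CREATE|IN_DELETE);
        if (wd < 0)
                return 1;

        printf("watching %lu:%lu via wd %i\n",
               (unsigned long) st.st_dev, (unsigned long) st.st_ino, wd);
        return 0;
}
```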
diff --git a/src/libelogind/libelogind.sym b/src/libelogind/libelogind.sym
index fd00c2198..9dd25c738 100644
--- a/src/libelogind/libelogind.sym
+++ b/src/libelogind/libelogind.sym
@@ -569,4 +569,6 @@ global:
sd_bus_open_system_with_description;
sd_bus_slot_get_floating;
sd_bus_slot_set_floating;
+ sd_event_add_inotify;
+ sd_event_source_get_inotify_mask;
} LIBSYSTEMD_238;
diff --git a/src/libelogind/sd-event/sd-event.c b/src/libelogind/sd-event/sd-event.c
index fd8ba14ce..f05c02d1d 100644
--- a/src/libelogind/sd-event/sd-event.c
+++ b/src/libelogind/sd-event/sd-event.c
@@ -15,6 +15,7 @@
#include "alloc-util.h"
#include "fd-util.h"
+//#include "fs-util.h"
#include "hashmap.h"
#include "list.h"
#include "macro.h"
@@ -43,6 +44,7 @@ typedef enum EventSourceType {
SOURCE_POST,
SOURCE_EXIT,
SOURCE_WATCHDOG,
+ SOURCE_INOTIFY,
_SOURCE_EVENT_SOURCE_TYPE_MAX,
_SOURCE_EVENT_SOURCE_TYPE_INVALID = -1
} EventSourceType;
@@ -60,6 +62,7 @@ static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX]
[SOURCE_POST] = "post",
[SOURCE_EXIT] = "exit",
[SOURCE_WATCHDOG] = "watchdog",
+ [SOURCE_INOTIFY] = "inotify",
};
DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
@@ -71,12 +74,15 @@ typedef enum WakeupType {
WAKEUP_EVENT_SOURCE,
WAKEUP_CLOCK_DATA,
WAKEUP_SIGNAL_DATA,
+ WAKEUP_INOTIFY_DATA,
_WAKEUP_TYPE_MAX,
_WAKEUP_TYPE_INVALID = -1,
} WakeupType;
#define EVENT_SOURCE_IS_TIME(t) IN_SET((t), SOURCE_TIME_REALTIME, SOURCE_TIME_BOOTTIME, SOURCE_TIME_MONOTONIC, SOURCE_TIME_REALTIME_ALARM, SOURCE_TIME_BOOTTIME_ALARM)
+struct inode_data;
+
struct sd_event_source {
WakeupType wakeup;
@@ -138,6 +144,12 @@ struct sd_event_source {
sd_event_handler_t callback;
unsigned prioq_index;
} exit;
+ struct {
+ sd_event_inotify_handler_t callback;
+ uint32_t mask;
+ struct inode_data *inode_data;
+ LIST_FIELDS(sd_event_source, by_inode_data);
+ } inotify;
};
};
@@ -172,6 +184,64 @@ struct signal_data {
sd_event_source *current;
};
+/* A structure listing all event sources currently watching a specific inode */
+struct inode_data {
+ /* The identifier for the inode, the combination of the .st_dev + .st_ino fields of the file */
+ ino_t ino;
+ dev_t dev;
+
+ /* An fd of the inode to watch. The fd is kept open until the next iteration of the event loop, so that we
+ * can still change the priority until then: moving the source to another priority means adding a watch
+ * descriptor to the inotify object of that priority, which is only possible while we still have a handle to
+ * the original inode. We keep a list of all inode_data objects with an open fd in the to_close list (see
+ * below) of the sd-event object, so that it is efficient to close them all before entering the next event
+ * loop iteration. */
+ int fd;
+
+ /* The inotify "watch descriptor" */
+ int wd;
+
+ /* The combined mask of all inotify watches we manage on this inode. This is also the mask that has most
+ * recently been set on the watch descriptor. */
+ uint32_t combined_mask;
+
+ /* All event sources subscribed to this inode */
+ LIST_HEAD(sd_event_source, event_sources);
+
+ /* The inotify object we watch this inode with */
+ struct inotify_data *inotify_data;
+
+ /* A linked list of all inode data objects with fds to close (see above) */
+ LIST_FIELDS(struct inode_data, to_close);
+};
+
+/* A structure encapsulating an inotify fd */
+struct inotify_data {
+ WakeupType wakeup;
+
+ /* For each priority we maintain one inotify fd, so that we only have to dequeue a single event per priority at
+ * a time */
+
+ int fd;
+ int64_t priority;
+
+ Hashmap *inodes; /* The inode_data structures keyed by dev+ino */
+ Hashmap *wd; /* The inode_data structures keyed by their watch descriptors */
+
+ /* The buffer we read inotify events into */
+ union inotify_event_buffer buffer;
+ size_t buffer_filled; /* fill level of the buffer */
+
+ /* How many event sources are currently marked pending for this inotify. We won't read new events off the
+ * inotify fd as long as there are still pending events on the inotify (because we have no way of queuing
+ * the events locally if they can't be coalesced). */
+ unsigned n_pending;
+
+ /* A linked list of all inotify objects with data already read, that still need processing. We keep this list
+ * to make it efficient to figure out what inotify objects to process data on next. */
+ LIST_FIELDS(struct inotify_data, buffered);
+};
+
struct sd_event {
unsigned n_ref;
@@ -202,6 +272,14 @@ struct sd_event {
Prioq *exit;
+ Hashmap *inotify_data; /* indexed by priority */
+
+ /* A list of inode structures that still have an fd open, that we need to close before the next loop iteration */
+ LIST_HEAD(struct inode_data, inode_data_to_close);
+
+ /* A list of inotify objects that already have events buffered which aren't processed yet */
+ LIST_HEAD(struct inotify_data, inotify_data_buffered);
+
pid_t original_pid;
uint64_t iteration;
@@ -231,6 +309,7 @@ struct sd_event {
static thread_local sd_event *default_event = NULL;
static void source_disconnect(sd_event_source *s);
+static void event_gc_inode_data(sd_event *e, struct inode_data *d);
static sd_event *event_resolve(sd_event *e) {
return e == SD_EVENT_DEFAULT ? default_event : e;
@@ -412,6 +491,8 @@ static void event_free(sd_event *e) {
free(e->signal_sources);
hashmap_free(e->signal_data);
+ hashmap_free(e->inotify_data);
+
hashmap_free(e->child_sources);
set_free(e->post_sources);
free(e);
@@ -855,6 +936,41 @@ static void source_disconnect(sd_event_source *s) {
prioq_remove(s->event->exit, s, &s->exit.prioq_index);
break;
+ case SOURCE_INOTIFY: {
+ struct inode_data *inode_data;
+
+ inode_data = s->inotify.inode_data;
+ if (inode_data) {
+ struct inotify_data *inotify_data;
+ assert_se(inotify_data = inode_data->inotify_data);
+
+ /* Detach this event source from the inode object */
+ LIST_REMOVE(inotify.by_inode_data, inode_data->event_sources, s);
+ s->inotify.inode_data = NULL;
+
+ if (s->pending) {
+ assert(inotify_data->n_pending > 0);
+ inotify_data->n_pending--;
+ }
+
+ /* Note that we don't reduce the inotify mask for the watch descriptor here if the inode
+ * continues to be watched. That's because inotify doesn't really have an API for that: we
+ * can only change watch masks with access to the original inode, either by fd or by path. But
+ * paths aren't stable, and keeping an O_PATH fd open all the time would mean wasting an fd
+ * continuously and keeping the mount busy, which we can't really do. We could reconstruct the
+ * original inode from /proc/self/fdinfo/$INOTIFY_FD (as all watch descriptors are listed
+ * there), but given the need for open_by_handle_at(), which is privileged and not universally
+ * available, this would be quite an incomplete solution. Hence we go the other way: leave the
+ * mask set, even if it is not minimized now, and ignore all events we aren't interested in
+ * anymore after reception. Yes, this sucks, but … Linux … */
+
+ /* Maybe release the inode data (and its inotify) */
+ event_gc_inode_data(s->event, inode_data);
+ }
+
+ break;
+ }
+
default:
assert_not_reached("Wut? I shouldn't exist.");
}
@@ -929,6 +1045,19 @@ static int source_set_pending(sd_event_source *s, bool b) {
d->current = NULL;
}
+ if (s->type == SOURCE_INOTIFY) {
+
+ assert(s->inotify.inode_data);
+ assert(s->inotify.inode_data->inotify_data);
+
+ if (b)
+ s->inotify.inode_data->inotify_data->n_pending++;
+ else {
+ assert(s->inotify.inode_data->inotify_data->n_pending > 0);
+ s->inotify.inode_data->inotify_data->n_pending--;
+ }
+ }
+
return 0;
}
@@ -1377,6 +1506,405 @@ _public_ int sd_event_add_exit(
return 0;
}
+static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
+ assert(e);
+
+ if (!d)
+ return;
+
+ assert(hashmap_isempty(d->inodes));
+ assert(hashmap_isempty(d->wd));
+
+ if (d->buffer_filled > 0)
+ LIST_REMOVE(buffered, e->inotify_data_buffered, d);
+
+ hashmap_free(d->inodes);
+ hashmap_free(d->wd);
+
+ assert_se(hashmap_remove(e->inotify_data, &d->priority) == d);
+
+ if (d->fd >= 0) {
+ if (epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0)
+ log_debug_errno(errno, "Failed to remove inotify fd from epoll, ignoring: %m");
+
+ safe_close(d->fd);
+ }
+ free(d);
+}
+
+static int event_make_inotify_data(
+ sd_event *e,
+ int64_t priority,
+ struct inotify_data **ret) {
+
+ _cleanup_close_ int fd = -1;
+ struct inotify_data *d;
+ struct epoll_event ev;
+ int r;
+
+ assert(e);
+
+ d = hashmap_get(e->inotify_data, &priority);
+ if (d) {
+ if (ret)
+ *ret = d;
+ return 0;
+ }
+
+ fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
+ if (fd < 0)
+ return -errno;
+
+ fd = fd_move_above_stdio(fd);
+
+ r = hashmap_ensure_allocated(&e->inotify_data, &uint64_hash_ops);
+ if (r < 0)
+ return r;
+
+ d = new(struct inotify_data, 1);
+ if (!d)
+ return -ENOMEM;
+
+ *d = (struct inotify_data) {
+ .wakeup = WAKEUP_INOTIFY_DATA,
+ .fd = TAKE_FD(fd),
+ .priority = priority,
+ };
+
+ r = hashmap_put(e->inotify_data, &d->priority, d);
+ if (r < 0) {
+ d->fd = safe_close(d->fd);
+ free(d);
+ return r;
+ }
+
+ ev = (struct epoll_event) {
+ .events = EPOLLIN,
+ .data.ptr = d,
+ };
+
+ if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
+ r = -errno;
+ d->fd = safe_close(d->fd); /* let's close this ourselves, as event_free_inotify_data() would otherwise
+ * remove the fd from the epoll first, which we don't want since we never
+ * managed to add it in the first place. */
+ event_free_inotify_data(e, d);
+ return r;
+ }
+
+ if (ret)
+ *ret = d;
+
+ return 1;
+}
+
+static int inode_data_compare(const void *a, const void *b) {
+ const struct inode_data *x = a, *y = b;
+
+ assert(x);
+ assert(y);
+
+ if (x->dev < y->dev)
+ return -1;
+ if (x->dev > y->dev)
+ return 1;
+
+ if (x->ino < y->ino)
+ return -1;
+ if (x->ino > y->ino)
+ return 1;
+
+ return 0;
+}
+
+static void inode_data_hash_func(const void *p, struct siphash *state) {
+ const struct inode_data *d = p;
+
+ assert(p);
+
+ siphash24_compress(&d->dev, sizeof(d->dev), state);
+ siphash24_compress(&d->ino, sizeof(d->ino), state);
+}
+
+const struct hash_ops inode_data_hash_ops = {
+ .hash = inode_data_hash_func,
+ .compare = inode_data_compare
+};
+
+static void event_free_inode_data(
+ sd_event *e,
+ struct inode_data *d) {
+
+ assert(e);
+
+ if (!d)
+ return;
+
+ assert(!d->event_sources);
+
+ if (d->fd >= 0) {
+ LIST_REMOVE(to_close, e->inode_data_to_close, d);
+ safe_close(d->fd);
+ }
+
+ if (d->inotify_data) {
+
+ if (d->wd >= 0) {
+ if (d->inotify_data->fd >= 0) {
+ /* So here's a problem. At the time this runs the watch descriptor might already be
+ * invalidated, because an IN_IGNORED event might be queued right at the moment we enter
+ * the syscall. Hence, whenever we get EINVAL, ignore it entirely, since it's quite
+ * likely to happen. */
+
+ if (inotify_rm_watch(d->inotify_data->fd, d->wd) < 0 && errno != EINVAL)
+ log_debug_errno(errno, "Failed to remove watch descriptor %i from inotify, ignoring: %m", d->wd);
+ }
+
+ assert_se(hashmap_remove(d->inotify_data->wd, INT_TO_PTR(d->wd)) == d);
+ }
+
+ assert_se(hashmap_remove(d->inotify_data->inodes, d) == d);
+ }
+
+ free(d);
+}
+
+static void event_gc_inode_data(
+ sd_event *e,
+ struct inode_data *d) {
+
+ struct inotify_data *inotify_data;
+
+ assert(e);
+
+ if (!d)
+ return;
+
+ if (d->event_sources)
+ return;
+
+ inotify_data = d->inotify_data;
+ event_free_inode_data(e, d);
+
+ if (inotify_data && hashmap_isempty(inotify_data->inodes))
+ event_free_inotify_data(e, inotify_data);
+}
+
+static int event_make_inode_data(
+ sd_event *e,
+ struct inotify_data *inotify_data,
+ dev_t dev,
+ ino_t ino,
+ struct inode_data **ret) {
+
+ struct inode_data *d, key;
+ int r;
+
+ assert(e);
+ assert(inotify_data);
+
+ key = (struct inode_data) {
+ .ino = ino,
+ .dev = dev,
+ };
+
+ d = hashmap_get(inotify_data->inodes, &key);
+ if (d) {
+ if (ret)
+ *ret = d;
+
+ return 0;
+ }
+
+ r = hashmap_ensure_allocated(&inotify_data->inodes, &inode_data_hash_ops);
+ if (r < 0)
+ return r;
+
+ d = new(struct inode_data, 1);
+ if (!d)
+ return -ENOMEM;
+
+ *d = (struct inode_data) {
+ .dev = dev,
+ .ino = ino,
+ .wd = -1,
+ .fd = -1,
+ .inotify_data = inotify_data,
+ };
+
+ r = hashmap_put(inotify_data->inodes, d, d);
+ if (r < 0) {
+ free(d);
+ return r;
+ }
+
+ if (ret)
+ *ret = d;
+
+ return 1;
+}
+
+static uint32_t inode_data_determine_mask(struct inode_data *d) {
+ bool excl_unlink = true;
+ uint32_t combined = 0;
+ sd_event_source *s;
+
+ assert(d);
+
+ /* Combines the watch masks of all event sources watching this inode. We generally just OR them together, but
+ * the IN_EXCL_UNLINK flag is ANDed instead.
+ *
+ * Note that we add all sources to the mask here, regardless of whether they are enabled, disabled or oneshot.
+ * That's because we cannot change the mask anymore after the event source has been created, since the kernel
+ * has no API for that. Hence we need to subscribe to the maximum mask we might ever be interested in, and
+ * suppress events we don't care for client-side. */
+
+ LIST_FOREACH(inotify.by_inode_data, s, d->event_sources) {
+
+ if ((s->inotify.mask & IN_EXCL_UNLINK) == 0)
+ excl_unlink = false;
+
+ combined |= s->inotify.mask;
+ }
+
+ return (combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) | (excl_unlink ? IN_EXCL_UNLINK : 0);
+}
+
+static int inode_data_realize_watch(sd_event *e, struct inode_data *d) {
+ uint32_t combined_mask;
+ int wd, r;
+
+ assert(d);
+ assert(d->fd >= 0);
+
+ combined_mask = inode_data_determine_mask(d);
+
+ if (d->wd >= 0 && combined_mask == d->combined_mask)
+ return 0;
+
+ r = hashmap_ensure_allocated(&d->inotify_data->wd, NULL);
+ if (r < 0)
+ return r;
+
+ wd = inotify_add_watch_fd(d->inotify_data->fd, d->fd, combined_mask);
+ if (wd < 0)
+ return -errno;
+
+ if (d->wd < 0) {
+ r = hashmap_put(d->inotify_data->wd, INT_TO_PTR(wd), d);
+ if (r < 0) {
+ (void) inotify_rm_watch(d->inotify_data->fd, wd);
+ return r;
+ }
+
+ d->wd = wd;
+
+ } else if (d->wd != wd) {
+
+ log_debug("Weird, the watch descriptor we already knew for this inode changed?");
+ (void) inotify_rm_watch(d->inotify_data->fd, wd);
+ return -EINVAL;
+ }
+
+ d->combined_mask = combined_mask;
+ return 1;
+}
+
+_public_ int sd_event_add_inotify(
+ sd_event *e,
+ sd_event_source **ret,
+ const char *path,
+ uint32_t mask,
+ sd_event_inotify_handler_t callback,
+ void *userdata) {
+
+ bool rm_inotify = false, rm_inode = false;
+ struct inotify_data *inotify_data = NULL;
+ struct inode_data *inode_data = NULL;
+ _cleanup_close_ int fd = -1;
+ sd_event_source *s;
+ struct stat st;
+ int r;
+
+ assert_return(e, -EINVAL);
+ assert_return(e = event_resolve(e), -ENOPKG);
+ assert_return(path, -EINVAL);
+ assert_return(callback, -EINVAL);
+ assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+ assert_return(!event_pid_changed(e), -ECHILD);
+
+ /* Refuse IN_MASK_ADD since we coalesce watches on the same inode, and hence really don't want to merge
+ * masks. Or in other words, this whole code exists only to manage IN_MASK_ADD type operations for you, hence
+ * callers don't get to use the flag themselves. */
+ if (mask & IN_MASK_ADD)
+ return -EINVAL;
+
+ fd = open(path, O_PATH|O_CLOEXEC|
+ (mask & IN_ONLYDIR ? O_DIRECTORY : 0)|
+ (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0));
+ if (fd < 0)
+ return -errno;
+
+ if (fstat(fd, &st) < 0)
+ return -errno;
+
+ s = source_new(e, !ret, SOURCE_INOTIFY);
+ if (!s)
+ return -ENOMEM;
+
+ s->enabled = mask & IN_ONESHOT ? SD_EVENT_ONESHOT : SD_EVENT_ON;
+ s->inotify.mask = mask;
+ s->inotify.callback = callback;
+ s->userdata = userdata;
+
+ /* Allocate an inotify object for this priority, and an inode object within it */
+ r = event_make_inotify_data(e, SD_EVENT_PRIORITY_NORMAL, &inotify_data);
+ if (r < 0)
+ goto fail;
+ rm_inotify = r > 0;
+
+ r = event_make_inode_data(e, inotify_data, st.st_dev, st.st_ino, &inode_data);
+ if (r < 0)
+ goto fail;
+ rm_inode = r > 0;
+
+ /* Keep the O_PATH fd around until the first iteration of the event loop, so that the priority of the event
+ * source can still be changed until then; changing it requires access to the original inode. */
+ if (inode_data->fd < 0) {
+ inode_data->fd = TAKE_FD(fd);
+ LIST_PREPEND(to_close, e->inode_data_to_close, inode_data);
+ }
+
+ /* Link our event source to the inode data object */
+ LIST_PREPEND(inotify.by_inode_data, inode_data->event_sources, s);
+ s->inotify.inode_data = inode_data;
+
+ rm_inode = rm_inotify = false;
+
+ /* Actually realize the watch now */
+ r = inode_data_realize_watch(e, inode_data);
+ if (r < 0)
+ goto fail;
+
+ (void) sd_event_source_set_description(s, path);
+
+ if (ret)
+ *ret = s;
+
+ return 0;
+
+fail:
+ source_free(s);
+
+ if (rm_inode)
+ event_free_inode_data(e, inode_data);
+
+ if (rm_inotify)
+ event_free_inotify_data(e, inotify_data);
+
+ return r;
+}
+
_public_ sd_event_source* sd_event_source_ref(sd_event_source *s) {
if (!s)
@@ -1574,6 +2102,9 @@ _public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority)
}
_public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
+ bool rm_inotify = false, rm_inode = false;
+ struct inotify_data *new_inotify_data = NULL;
+ struct inode_data *new_inode_data = NULL;
int r;
assert_return(s, -EINVAL);
@@ -1583,7 +2114,59 @@ _public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority)
if (s->priority == priority)
return 0;
- if (s->type == SOURCE_SIGNAL && s->enabled != SD_EVENT_OFF) {
+ if (s->type == SOURCE_INOTIFY) {
+ struct inode_data *old_inode_data;
+
+ assert(s->inotify.inode_data);
+ old_inode_data = s->inotify.inode_data;
+
+ /* We need the original fd to change the priority. If we don't have it, we can't change the priority
+ * anymore. Note that we close any fds when entering the next event loop iteration, i.e. for inotify
+ * events we allow priority changes only until the first following iteration. */
+ if (old_inode_data->fd < 0)
+ return -EOPNOTSUPP;
+
+ r = event_make_inotify_data(s->event, priority, &new_inotify_data);
+ if (r < 0)
+ return r;
+ rm_inotify = r > 0;
+
+ r = event_make_inode_data(s->event, new_inotify_data, old_inode_data->dev, old_inode_data->ino, &new_inode_data);
+ if (r < 0)
+ goto fail;
+ rm_inode = r > 0;
+
+ if (new_inode_data->fd < 0) {
+ /* Duplicate the fd for the new inode object if we don't have any yet */
+ new_inode_data->fd = fcntl(old_inode_data->fd, F_DUPFD_CLOEXEC, 3);
+ if (new_inode_data->fd < 0) {
+ r = -errno;
+ goto fail;
+ }
+
+ LIST_PREPEND(to_close, s->event->inode_data_to_close, new_inode_data);
+ }
+
+ /* Move the event source to the new inode data structure */
+ LIST_REMOVE(inotify.by_inode_data, old_inode_data->event_sources, s);
+ LIST_PREPEND(inotify.by_inode_data, new_inode_data->event_sources, s);
+ s->inotify.inode_data = new_inode_data;
+
+ /* Now create the new watch */
+ r = inode_data_realize_watch(s->event, new_inode_data);
+ if (r < 0) {
+ /* Move it back */
+ LIST_REMOVE(inotify.by_inode_data, new_inode_data->event_sources, s);
+ LIST_PREPEND(inotify.by_inode_data, old_inode_data->event_sources, s);
+ s->inotify.inode_data = old_inode_data;
+ goto fail;
+ }
+
+ s->priority = priority;
+
+ event_gc_inode_data(s->event, old_inode_data);
+
+ } else if (s->type == SOURCE_SIGNAL && s->enabled != SD_EVENT_OFF) {
struct signal_data *old, *d;
/* Move us from the signalfd belonging to the old
@@ -1613,6 +2196,15 @@ _public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority)
prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
return 0;
+
+fail:
+ if (rm_inode)
+ event_free_inode_data(s->event, new_inode_data);
+
+ if (rm_inotify)
+ event_free_inotify_data(s->event, new_inotify_data);
+
+ return r;
}
_public_ int sd_event_source_get_enabled(sd_event_source *s, int *m) {
@@ -1694,6 +2286,7 @@ _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
case SOURCE_DEFER:
case SOURCE_POST:
+ case SOURCE_INOTIFY:
s->enabled = m;
break;
@@ -1774,6 +2367,7 @@ _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
case SOURCE_DEFER:
case SOURCE_POST:
+ case SOURCE_INOTIFY:
s->enabled = m;
break;
@@ -1884,6 +2478,16 @@ _public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
return 0;
}
+_public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *mask) {
+ assert_return(s, -EINVAL);
+ assert_return(mask, -EINVAL);
+ assert_return(s->type == SOURCE_INOTIFY, -EDOM);
+ assert_return(!event_pid_changed(s->event), -ECHILD);
+
+ *mask = s->inotify.mask;
+ return 0;
+}
+
_public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
int r;
@@ -2217,6 +2821,7 @@ static int process_signal(sd_event *e, struct signal_data *d, uint32_t events) {
int r;
assert(e);
+ assert(d);
assert_return(events == EPOLLIN, -EIO);
/* If there's a signal queued on this priority and SIGCHLD is
@@ -2273,6 +2878,160 @@ static int process_signal(sd_event *e, struct signal_data *d, uint32_t events) {
}
}
+static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents) {
+ ssize_t n;
+
+ assert(e);
+ assert(d);
+
+ assert_return(revents == EPOLLIN, -EIO);
+
+ /* If there's already an event source pending for this priority, don't read another */
+ if (d->n_pending > 0)
+ return 0;
+
+ /* Is the read buffer non-empty? If so, let's not read more */
+ if (d->buffer_filled > 0)
+ return 0;
+
+ n = read(d->fd, &d->buffer, sizeof(d->buffer));
+ if (n < 0) {
+ if (IN_SET(errno, EAGAIN, EINTR))
+ return 0;
+
+ return -errno;
+ }
+
+ assert(n > 0);
+ d->buffer_filled = (size_t) n;
+ LIST_PREPEND(buffered, e->inotify_data_buffered, d);
+
+ return 1;
+}
+
+static void event_inotify_data_drop(sd_event *e, struct inotify_data *d, size_t sz) {
+ assert(e);
+ assert(d);
+ assert(sz <= d->buffer_filled);
+
+ if (sz == 0)
+ return;
+
+ /* Move the rest of the buffer to the front, in order to get things properly aligned again */
+ memmove(d->buffer.raw, d->buffer.raw + sz, d->buffer_filled - sz);
+ d->buffer_filled -= sz;
+
+ if (d->buffer_filled == 0)
+ LIST_REMOVE(buffered, e->inotify_data_buffered, d);
+}
+
+static int event_inotify_data_process(sd_event *e, struct inotify_data *d) {
+ int r;
+
+ assert(e);
+ assert(d);
+
+ /* If there's already an event source pending for this priority, don't read another */
+ if (d->n_pending > 0)
+ return 0;
+
+ while (d->buffer_filled > 0) {
+ size_t sz;
+
+ /* Let's validate that the event structures are complete */
+ if (d->buffer_filled < offsetof(struct inotify_event, name))
+ return -EIO;
+
+ sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
+ if (d->buffer_filled < sz)
+ return -EIO;
+
+ if (d->buffer.ev.mask & IN_Q_OVERFLOW) {
+ struct inode_data *inode_data;
+ Iterator i;
+
+ /* The queue overran, let's pass this event to all event sources connected to this inotify
+ * object */
+
+ HASHMAP_FOREACH(inode_data, d->inodes, i) {
+ sd_event_source *s;
+
+ LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
+
+ if (s->enabled == SD_EVENT_OFF)
+ continue;
+
+ r = source_set_pending(s, true);
+ if (r < 0)
+ return r;
+ }
+ }
+ } else {
+ struct inode_data *inode_data;
+ sd_event_source *s;
+
+ /* Find the inode object for this watch descriptor. If IN_IGNORED is set we also remove it from
+ * our watch descriptor table. */
+ if (d->buffer.ev.mask & IN_IGNORED) {
+
+ inode_data = hashmap_remove(d->wd, INT_TO_PTR(d->buffer.ev.wd));
+ if (!inode_data) {
+ event_inotify_data_drop(e, d, sz);
+ continue;
+ }
+
+ /* The watch descriptor was removed by the kernel, let's drop it here too */
+ inode_data->wd = -1;
+ } else {
+ inode_data = hashmap_get(d->wd, INT_TO_PTR(d->buffer.ev.wd));
+ if (!inode_data) {
+ event_inotify_data_drop(e, d, sz);
+ continue;
+ }
+ }
+
+ /* Trigger all event sources that are interested in these events. Also trigger all event
+ * sources if IN_IGNORED or IN_UNMOUNT is set. */
+ LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
+
+ if (s->enabled == SD_EVENT_OFF)
+ continue;
+
+ if ((d->buffer.ev.mask & (IN_IGNORED|IN_UNMOUNT)) == 0 &&
+ (s->inotify.mask & d->buffer.ev.mask & IN_ALL_EVENTS) == 0)
+ continue;
+
+ r = source_set_pending(s, true);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ /* Something pending now? If so, let's finish, otherwise let's read more. */
+ if (d->n_pending > 0)
+ return 1;
+ }
+
+ return 0;
+}
+
+static int process_inotify(sd_event *e) {
+ struct inotify_data *d;
+ int r, done = 0;
+
+ assert(e);
+
+ LIST_FOREACH(buffered, d, e->inotify_data_buffered) {
+ r = event_inotify_data_process(e, d);
+ if (r < 0)
+ return r;
+ if (r > 0)
+ done++;
+ }
+
+ return done;
+}
+
static int source_dispatch(sd_event_source *s) {
EventSourceType saved_type;
int r = 0;
@@ -2359,6 +3118,28 @@ static int source_dispatch(sd_event_source *s) {
r = s->exit.callback(s, s->userdata);
break;
+ case SOURCE_INOTIFY: {
+ struct sd_event *e = s->event;
+ struct inotify_data *d;
+ size_t sz;
+
+ assert(s->inotify.inode_data);
+ assert_se(d = s->inotify.inode_data->inotify_data);
+
+ assert(d->buffer_filled >= offsetof(struct inotify_event, name));
+ sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
+ assert(d->buffer_filled >= sz);
+
+ r = s->inotify.callback(s, &d->buffer.ev, s->userdata);
+
+ /* When no event is pending on this inotify object anymore, let's drop the event from the read
+ * buffer. */
+ if (d->n_pending == 0)
+ event_inotify_data_drop(e, d, sz);
+
+ break;
+ }
+
case SOURCE_WATCHDOG:
case _SOURCE_EVENT_SOURCE_TYPE_MAX:
case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
@@ -2493,6 +3274,25 @@ static int process_watchdog(sd_event *e) {
return arm_watchdog(e);
}
+static void event_close_inode_data_fds(sd_event *e) {
+ struct inode_data *d;
+
+ assert(e);
+
+ /* Close the fds pointing to the inodes to watch now. We need to close them as they might otherwise pin
+ * filesystems. But we can't close them right away as we need them as long as the user still wants to make
+ * adjustments to the event source, such as changing the priority (which requires us to remove and re-add a
+ * watch for the inode). Hence, let's close them when entering the first iteration after they were added, as
+ * a compromise. */
+
+ while ((d = e->inode_data_to_close)) {
+ assert(d->fd >= 0);
+ d->fd = safe_close(d->fd);
+
+ LIST_REMOVE(to_close, e->inode_data_to_close, d);
+ }
+}
+
_public_ int sd_event_prepare(sd_event *e) {
int r;
@@ -2533,6 +3333,8 @@ _public_ int sd_event_prepare(sd_event *e) {
if (r < 0)
return r;
+ event_close_inode_data_fds(e);
+
if (event_next_pending(e) || e->need_process_child)
goto pending;
@@ -2568,6 +3370,10 @@ _public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
ev_queue_max = MAX(e->n_sources, 1u);
ev_queue = newa(struct epoll_event, ev_queue_max);
+ /* If we still have inotify data buffered, then query the other fds, but don't block */
+ if (e->inotify_data_buffered)
+ timeout = 0;
+
m = epoll_wait(e->epoll_fd, ev_queue, ev_queue_max,
timeout == (uint64_t) -1 ? -1 : (int) ((timeout + USEC_PER_MSEC - 1) / USEC_PER_MSEC));
if (m < 0) {
@@ -2605,6 +3411,10 @@ _public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
r = process_signal(e, ev_queue[i].data.ptr, ev_queue[i].events);
break;
+ case WAKEUP_INOTIFY_DATA:
+ r = event_inotify_data_read(e, ev_queue[i].data.ptr, ev_queue[i].events);
+ break;
+
default:
assert_not_reached("Invalid wake-up pointer");
}
@@ -2643,6 +3453,10 @@ _public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
goto finish;
}
+ r = process_inotify(e);
+ if (r < 0)
+ goto finish;
+
if (event_next_pending(e)) {
e->state = SD_EVENT_PENDING;
diff --git a/src/systemd/sd-event.h b/src/systemd/sd-event.h
index 820962cbd..422393888 100644
--- a/src/systemd/sd-event.h
+++ b/src/systemd/sd-event.h
@@ -24,6 +24,7 @@
#include <inttypes.h>
#include <signal.h>
#include <sys/epoll.h>
+//#include <sys/inotify.h>
#include <sys/signalfd.h>
#include <sys/types.h>
/*#include <time.h>*/
@@ -78,6 +79,7 @@ typedef int (*sd_event_child_handler_t)(sd_event_source *s, const siginfo_t *si,
#else
typedef void* sd_event_child_handler_t;
#endif
+typedef int (*sd_event_inotify_handler_t)(sd_event_source *s, const struct inotify_event *event, void *userdata);
int sd_event_default(sd_event **e);
@@ -89,6 +91,7 @@ int sd_event_add_io(sd_event *e, sd_event_source **s, int fd, uint32_t events, s
int sd_event_add_time(sd_event *e, sd_event_source **s, clockid_t clock, uint64_t usec, uint64_t accuracy, sd_event_time_handler_t callback, void *userdata);
int sd_event_add_signal(sd_event *e, sd_event_source **s, int sig, sd_event_signal_handler_t callback, void *userdata);
int sd_event_add_child(sd_event *e, sd_event_source **s, pid_t pid, int options, sd_event_child_handler_t callback, void *userdata);
+int sd_event_add_inotify(sd_event *e, sd_event_source **s, const char *path, uint32_t mask, sd_event_inotify_handler_t callback, void *userdata);
int sd_event_add_defer(sd_event *e, sd_event_source **s, sd_event_handler_t callback, void *userdata);
int sd_event_add_post(sd_event *e, sd_event_source **s, sd_event_handler_t callback, void *userdata);
int sd_event_add_exit(sd_event *e, sd_event_source **s, sd_event_handler_t callback, void *userdata);
@@ -139,6 +142,7 @@ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec);
int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock);
int sd_event_source_get_signal(sd_event_source *s);
int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid);
+int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *ret);
/* Define helpers so that __attribute__((cleanup(sd_event_unrefp))) and similar may be used. */
_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_event, sd_event_unref);