From 12001b1bf067339db089d52e08fd0b4c6a9945df Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 23 Apr 2024 23:22:07 +0200 Subject: [PATCH] execute: send handoff timestamps from executor to service manager This changes the executor to systematically send handoff timestamps to the service manager if a socket for that is supplied. This drops the code that did this via Type=exec messages, and reverts that part to the old behaviour before 93cb78aee2cff8109a5a70128287732f03d7a062. Benefits of this approach: 1. We can collect the handoff for any command we fork off, regardless if it's ExecStart= something else, regardless whether it's Type=exec, Type=simple or some any other service type, regardless of the unit type. 2. We collect both CLOCK_REALTIME and CLOCK_MONOTONIC, as we do for the other process timestamps. 3. It's entirely backwards compatible, as this doesn't change the protocol between service manager and executor, but just extends it. --- src/core/exec-invoke.c | 102 ++++++++++++++++++++++++++--------- src/core/execute-serialize.c | 12 +++++ src/core/execute.h | 2 + src/core/service.c | 69 ++++++++---------------- src/core/unit.c | 1 + 5 files changed, 113 insertions(+), 73 deletions(-) diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index 8b16acb399..1a734a972a 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -3519,7 +3519,7 @@ static int close_remaining_fds( const int *fds, size_t n_fds) { size_t n_dont_close = 0; - int dont_close[n_fds + 15]; + int dont_close[n_fds + 16]; assert(params); @@ -3555,6 +3555,9 @@ static int close_remaining_fds( if (params->user_lookup_fd >= 0) dont_close[n_dont_close++] = params->user_lookup_fd; + if (params->handoff_timestamp_fd >= 0) + dont_close[n_dont_close++] = params->handoff_timestamp_fd; + assert(n_dont_close <= ELEMENTSOF(dont_close)); return close_all_fds(dont_close, n_dont_close); @@ -3974,6 +3977,52 @@ static void exec_params_close(ExecParameters *p) { p->stderr_fd = safe_close(p->stderr_fd); } +static int exec_fd_mark_hot( + const ExecContext *c, + ExecParameters *p, + bool hot, + int *reterr_exit_status) { + + assert(c); + assert(p); + + if (p->exec_fd < 0) + return 0; + + uint8_t x = hot; + + if (write(p->exec_fd, &x, sizeof(x)) < 0) { + if (reterr_exit_status) + *reterr_exit_status = EXIT_EXEC; + return log_exec_error_errno(c, p, errno, "Failed to mark exec_fd as %s: %m", hot ? "hot" : "cold"); + } + + return 1; +} + +static int send_handoff_timestamp( + const ExecContext *c, + ExecParameters *p, + int *reterr_exit_status) { + + assert(c); + assert(p); + + if (p->handoff_timestamp_fd < 0) + return 0; + + dual_timestamp dt; + dual_timestamp_now(&dt); + + if (send(p->handoff_timestamp_fd, (const usec_t[2]) { dt.realtime, dt.monotonic }, sizeof(usec_t) * 2, 0) < 0) { + if (reterr_exit_status) + *reterr_exit_status = EXIT_EXEC; + return log_exec_error_errno(c, p, errno, "Failed to send handoff timestamp: %m"); + } + + return 1; +} + int exec_invoke( const ExecCommand *command, const ExecContext *context, @@ -4105,7 +4154,7 @@ int exec_invoke( return log_exec_error_errno(context, params, r, "Failed to get OpenFile= file descriptors: %m"); } - int keep_fds[n_fds + 3]; + int keep_fds[n_fds + 4]; memcpy_safe(keep_fds, params->fds, n_fds * sizeof(int)); n_keep_fds = n_fds; @@ -4115,6 +4164,12 @@ int exec_invoke( return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m"); } + r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, ¶ms->handoff_timestamp_fd); + if (r < 0) { + *exit_status = EXIT_FDS; + return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m"); + } + #if HAVE_LIBBPF r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, ¶ms->bpf_restrict_fs_map_fd); if (r < 0) { @@ -4849,8 +4904,9 @@ int exec_invoke( /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any - * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final - * execve(). But first, close the remaining sockets in the context objects. */ + * more. We do keep exec_fd and handoff_timestamp_fd however, if we have it, since we need to keep + * them open until the final execve(). But first, close the remaining sockets in the context + * objects. */ exec_runtime_close(runtime); exec_params_close(params); @@ -5266,33 +5322,29 @@ int exec_invoke( log_command_line(context, params, "Executing", executable, final_argv); - if (params->exec_fd >= 0) { - usec_t t = now(CLOCK_MONOTONIC); + /* We have finished with all our initializations. Let's now let the manager know that. From this + * point on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */ - /* We have finished with all our initializations. Let's now let the manager know that. From this point - * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. We send a - * timestamp so that the service manager and users can track the precise moment we handed - * over execution of the service to the kernel. */ + r = exec_fd_mark_hot(context, params, /* hot= */ true, exit_status); + if (r < 0) + return r; - if (write(params->exec_fd, &t, sizeof(t)) < 0) { - *exit_status = EXIT_EXEC; - return log_exec_error_errno(context, params, errno, "Failed to enable exec_fd: %m"); - } + /* As last thing before the execve(), let's send the handoff timestamp */ + r = send_handoff_timestamp(context, params, exit_status); + if (r < 0) { + /* If this handoff timestamp failed, let's undo the marking as hot */ + (void) exec_fd_mark_hot(context, params, /* hot= */ false, /* reterr_exit_status= */ NULL); + return r; } - r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env); - - if (params->exec_fd >= 0) { - uint64_t hot = 0; + /* NB: we leave executable_fd, exec_fd, handoff_timestamp_fd open here. This is safe, because they + * have O_CLOEXEC set, and the execve() below will thus automatically close them. In fact, for + * exec_fd this is pretty much the whole raison d'etre. */ - /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager - * that POLLHUP on it no longer means execve() succeeded. */ + r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env); - if (write(params->exec_fd, &hot, sizeof(hot)) < 0) { - *exit_status = EXIT_EXEC; - return log_exec_error_errno(context, params, errno, "Failed to disable exec_fd: %m"); - } - } + /* The execve() failed, let's undo the marking as hot */ + (void) exec_fd_mark_hot(context, params, /* hot= */ false, /* reterr_exit_status= */ NULL); *exit_status = EXIT_EXEC; return log_exec_error_errno(context, params, r, "Failed to execute %s: %m", executable); diff --git a/src/core/execute-serialize.c b/src/core/execute-serialize.c index 9e402e5e69..0b6939b5d5 100644 --- a/src/core/execute-serialize.c +++ b/src/core/execute-serialize.c @@ -1373,6 +1373,10 @@ static int exec_parameters_serialize(const ExecParameters *p, const ExecContext if (r < 0) return r; + r = serialize_fd(f, fds, "exec-parameters-handoff-timestamp-fd", p->handoff_timestamp_fd); + if (r < 0) + return r; + if (c && exec_context_restrict_filesystems_set(c)) { r = serialize_fd(f, fds, "exec-parameters-bpf-outer-map-fd", p->bpf_restrict_fs_map_fd); if (r < 0) @@ -1620,6 +1624,14 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) { continue; p->exec_fd = fd; + } else if ((val = startswith(l, "exec-parameters-handoff-timestamp-fd="))) { + int fd; + + fd = deserialize_fd(fds, val); + if (fd < 0) + continue; + + close_and_replace(p->handoff_timestamp_fd, fd); } else if ((val = startswith(l, "exec-parameters-bpf-outer-map-fd="))) { int fd; diff --git a/src/core/execute.h b/src/core/execute.h index f6595cf93e..202ef5f82b 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -457,6 +457,7 @@ struct ExecParameters { char **files_env; int user_lookup_fd; + int handoff_timestamp_fd; int bpf_restrict_fs_map_fd; @@ -475,6 +476,7 @@ struct ExecParameters { .exec_fd = -EBADF, \ .bpf_restrict_fs_map_fd = -EBADF, \ .user_lookup_fd = -EBADF, \ + .handoff_timestamp_fd = -EBADF, \ } #include "unit.h" diff --git a/src/core/service.c b/src/core/service.c index b2dc42f440..4a512fd24b 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -1676,8 +1676,7 @@ static int service_spawn_internal( log_unit_debug(UNIT(s), "Passing %zu fds to service", exec_params.n_socket_fds + exec_params.n_storage_fds); } - if (!FLAGS_SET(exec_params.flags, EXEC_IS_CONTROL) && - IN_SET(s->type, SERVICE_NOTIFY, SERVICE_NOTIFY_RELOAD, SERVICE_EXEC, SERVICE_DBUS)) { + if (!FLAGS_SET(exec_params.flags, EXEC_IS_CONTROL) && s->type == SERVICE_EXEC) { r = service_allocate_exec_fd(s, &exec_fd_source, &exec_params.exec_fd); if (r < 0) return r; @@ -3519,28 +3518,27 @@ static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t ev log_unit_debug(UNIT(s), "got exec-fd event"); /* If Type=exec is set, we'll consider a service started successfully the instant we invoked execve() - * successfully for it. We implement this through a pipe() towards the child, which the kernel automatically - * closes for us due to O_CLOEXEC on execve() in the child, which then triggers EOF on the pipe in the - * parent. We need to be careful however, as there are other reasons that we might cause the child's side of - * the pipe to be closed (for example, a simple exit()). To deal with that we'll ignore EOFs on the pipe unless - * the child signalled us first that it is about to call the execve(). It does so by sending us a simple - * non-zero byte via the pipe. We also provide the child with a way to inform us in case execve() failed: if it - * sends a zero byte we'll ignore POLLHUP on the fd again. - * For exec, dbus, notify and notify-reload types we also get a timestamp from sd-executor, taken immediately - * before the target executable is execve'd, so that we can accurately track when control is handed over to the - * payload. */ + * successfully for it. We implement this through a pipe() towards the child, which the kernel + * automatically closes for us due to O_CLOEXEC on execve() in the child, which then triggers EOF on + * the pipe in the parent. We need to be careful however, as there are other reasons that we might + * cause the child's side of the pipe to be closed (for example, a simple exit()). To deal with that + * we'll ignore EOFs on the pipe unless the child signalled us first that it is about to call the + * execve(). It does so by sending us a simple non-zero byte via the pipe. We also provide the child + * with a way to inform us in case execve() failed: if it sends a zero byte we'll ignore POLLHUP on + * the fd again. */ for (;;) { - uint64_t x = 0; + uint8_t x; ssize_t n; - n = loop_read(fd, &x, sizeof(x), /* do_poll= */ false); - if (n == -EAGAIN) /* O_NONBLOCK in effect → everything queued has now been processed. */ - return 0; - if (n < 0) - return log_unit_error_errno(UNIT(s), n, "Failed to read from exec_fd: %m"); - if (n == 0) { - /* EOF → the event we are waiting for in case of Type=exec */ + n = read(fd, &x, sizeof(x)); + if (n < 0) { + if (errno == EAGAIN) /* O_NONBLOCK in effect → everything queued has now been processed. */ + return 0; + + return log_unit_error_errno(UNIT(s), errno, "Failed to read from exec_fd: %m"); + } + if (n == 0) { /* EOF → the event we are waiting for in case of Type=exec */ s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source); if (s->exec_fd_hot) { /* Did the child tell us to expect EOF now? */ @@ -3556,36 +3554,11 @@ static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t ev return 0; } - if (n == 1) { - /* Backward compatibility block: a single byte was read, which means somehow an old - * executor before this logic was introduced sent the notification, so there is no - * timestamp (reset it). */ - - s->exec_fd_hot = x; - if (s->state == SERVICE_START) - s->main_exec_status.handover_timestamp = DUAL_TIMESTAMP_NULL; - - continue; - } - if (x == 0) { - /* If we get x=0 then the execve actually failed, which means the exec fd logic is - * now off. Also reset the exec timestamp, given it has no meaning anymore. */ + /* A byte was read → this turns on/off the exec fd logic */ + assert(n == sizeof(x)); - s->exec_fd_hot = false; - if (s->state == SERVICE_START) - s->main_exec_status.handover_timestamp = DUAL_TIMESTAMP_NULL; - } else { - /* A non-zero value was read, which means the exec fd logic is now enabled. Record - * the received timestamp so that users can track when control is handed over to the - * service payload. */ - - s->exec_fd_hot = true; - if (s->state == SERVICE_START) { - assert_cc(sizeof(uint64_t) == sizeof(usec_t)); - dual_timestamp_from_monotonic(&s->main_exec_status.handover_timestamp, (usec_t)x); - } - } + s->exec_fd_hot = x; } } diff --git a/src/core/unit.c b/src/core/unit.c index 15285a3ac4..ca5623b5c2 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -5367,6 +5367,7 @@ int unit_set_exec_params(Unit *u, ExecParameters *p) { } p->user_lookup_fd = u->manager->user_lookup_fds[1]; + p->handoff_timestamp_fd = u->manager->handoff_timestamp_fds[1]; p->cgroup_id = crt ? crt->cgroup_id : 0; p->invocation_id = u->invocation_id; -- 2.25.1