From a7a7163df7fc8a9f794f6803b2f6c9c9b0745a1f Mon Sep 17 00:00:00 2001 From: David Tardon Date: Tue, 24 Apr 2018 15:19:38 +0200 Subject: [PATCH] fix race between daemon-reload and other commands When "systemctl daemon-reload" is run at the same time as "systemctl start foo", the latter might hang. That's because commands like start wait for the JobRemoved signal to know when the job is finished. But if the job is finished during reloading, the signal is never sent. The hang can be easily reproduced by running # for ((N=1; N>0; N++)) ; do echo $N ; systemctl daemon-reload ; done # for ((N=1; N>0; N++)) ; do echo $N ; systemctl start systemd-coredump.socket ; done in two different terminals. The start command will hang after 1-2 iterations. This patch keeps track of jobs that were started before the reload and finished during it, and sends JobRemoved for them after the reload has finished. --- src/core/job.c | 49 ++++++++++++++++++++++++++++++++++++++++------ src/core/job.h | 2 ++ src/core/manager.c | 14 +++++++++++++ src/core/manager.h | 3 +++ 4 files changed, 62 insertions(+), 6 deletions(-) diff --git a/src/core/job.c b/src/core/job.c index 32f6fc1a50..a0f7685c4e 100644 --- a/src/core/job.c +++ b/src/core/job.c @@ -43,6 +43,7 @@ Job* job_new_raw(Unit *unit) { j->manager = unit->manager; j->unit = unit; j->type = _JOB_TYPE_INVALID; + j->reloaded = false; return j; } @@ -64,7 +65,7 @@ Job* job_new(Unit *unit, JobType type) { return j; } -void job_free(Job *j) { +void job_unlink(Job *j) { assert(j); assert(!j->installed); assert(!j->transaction_prev); @@ -72,16 +73,33 @@ void job_free(Job *j) { assert(!j->subject_list); assert(!j->object_list); - if (j->in_run_queue) + if (j->in_run_queue) { LIST_REMOVE(run_queue, j->manager->run_queue, j); + j->in_run_queue = false; + } - if (j->in_dbus_queue) + if (j->in_dbus_queue) { LIST_REMOVE(dbus_queue, j->manager->dbus_job_queue, j); + j->in_dbus_queue = false; + } - if (j->in_gc_queue) + if (j->in_gc_queue) { LIST_REMOVE(gc_queue, 
j->manager->gc_job_queue, j); + j->in_gc_queue = false; + } - sd_event_source_unref(j->timer_event_source); + j->timer_event_source = sd_event_source_unref(j->timer_event_source); +} + +void job_free(Job *j) { + assert(j); + assert(!j->installed); + assert(!j->transaction_prev); + assert(!j->transaction_next); + assert(!j->subject_list); + assert(!j->object_list); + + job_unlink(j); sd_bus_track_unref(j->bus_track); strv_free(j->deserialized_clients); @@ -241,6 +259,7 @@ int job_install_deserialized(Job *j) { *pj = j; j->installed = true; + j->reloaded = true; if (j->state == JOB_RUNNING) j->unit->manager->n_running_jobs++; @@ -844,6 +863,19 @@ static void job_fail_dependencies(Unit *u, UnitDependency d) { } } +static int job_save_pending_finished_job(Job *j) { + int r; + + assert(j); + + r = set_ensure_allocated(&j->manager->pending_finished_jobs, NULL); + if (r < 0) + return r; + + job_unlink(j); + return set_put(j->manager->pending_finished_jobs, j); +} + int job_finish_and_invalidate(Job *j, JobResult result, bool recursive, bool already) { Unit *u; Unit *other; @@ -883,7 +915,12 @@ int job_finish_and_invalidate(Job *j, JobResult result, bool recursive, bool alr j->manager->n_failed_jobs++; job_uninstall(j); - job_free(j); + /* Remember jobs started before the reload */ + if (MANAGER_IS_RELOADING(j->manager) && j->reloaded) { + if (job_save_pending_finished_job(j) < 0) + job_free(j); + } else + job_free(j); /* Fail depending jobs on failure */ if (result != JOB_DONE && recursive) { diff --git a/src/core/job.h b/src/core/job.h index ccb8e1b674..338670e393 100644 --- a/src/core/job.h +++ b/src/core/job.h @@ -162,10 +162,12 @@ struct Job { bool irreversible:1; bool in_gc_queue:1; bool ref_by_private_bus:1; + bool reloaded:1; }; Job* job_new(Unit *unit, JobType type); Job* job_new_raw(Unit *unit); +void job_unlink(Job *job); void job_free(Job *job); Job* job_install(Job *j); int job_install_deserialized(Job *j); diff --git a/src/core/manager.c b/src/core/manager.c 
index 204fc8b819..b4f197c204 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -3186,6 +3186,17 @@ finish: return r; } +static void manager_flush_finished_jobs(Manager *m) { + Job *j; + + while ((j = set_steal_first(m->pending_finished_jobs))) { + bus_job_send_removed_signal(j); + job_free(j); + } + + m->pending_finished_jobs = set_free(m->pending_finished_jobs); +} + int manager_reload(Manager *m) { int r, q; _cleanup_fclose_ FILE *f = NULL; @@ -3294,6 +3305,9 @@ int manager_reload(Manager *m) { if (q < 0 && r >= 0) r = q; + if (!MANAGER_IS_RELOADING(m)) + manager_flush_finished_jobs(m); + m->send_reloading_done = true; return r; diff --git a/src/core/manager.h b/src/core/manager.h index f41cce1c09..1f97c15365 100644 --- a/src/core/manager.h +++ b/src/core/manager.h @@ -300,6 +300,9 @@ struct Manager { /* non-zero if we are reloading or reexecuting, */ int n_reloading; + /* A set which contains all jobs that started before reload and finished + * during it */ + Set *pending_finished_jobs; unsigned n_installed_jobs; unsigned n_failed_jobs; -- 2.25.1