From 8d13d808a5634701c55276796683857ac2d5c99a Mon Sep 17 00:00:00 2001 From: Tom Cherry Date: Thu, 29 Jun 2017 16:18:08 -0700 Subject: [PATCH] init: reap zombies only after kill(-pid, ...) When init gets SIGCHLD, it uses waitpid() to get the pid of an exited process. It then calls kill(-pid, ...) to ensure that all processes in the process group started by that process are killed as well. There is a bug here however as waitpid() reaps the pid when it returns, meaning that the call to kill(-pid, ...) may fail with ESRCH as there are no remaining references to that pid. Or worse, if the pid is reused, the wrong processes may get the signal. This fixes the bug by using waitid() with WNOWAIT to get the pid of an exited process, which does not reap the pid. It then uses waitpid() with the returned pid to do the reap only after the above kill(-pid, ...) and other operations have completed. Bug: 38164998 Test: kill surfaceflinger and see that processes exit and are reaped appropriately Test: `adb reboot` and observe that the extraneous kill() failed messages do not appear Change-Id: Ic0213e1c97e0141e6c13129dc2abbfed86de138b --- init/service.cpp | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/init/service.cpp b/init/service.cpp index f9a452ba9..b222305cf 100644 --- a/init/service.cpp +++ b/init/service.cpp @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -47,6 +48,7 @@ using android::base::boot_clock; using android::base::GetProperty; using android::base::Join; +using android::base::make_scope_guard; using android::base::ParseInt; using android::base::StartsWith; using android::base::StringPrintf; @@ -1081,14 +1083,24 @@ void ServiceManager::DumpState() const { } bool ServiceManager::ReapOneProcess() { - int status; - pid_t pid = TEMP_FAILURE_RETRY(waitpid(-1, &status, WNOHANG)); - if (pid == 0) { + siginfo_t siginfo = {}; + // This returns a zombie pid or informs us that there are no zombies left to be reaped. + // It does NOT reap the pid; that is done below. + if (TEMP_FAILURE_RETRY(waitid(P_ALL, 0, &siginfo, WEXITED | WNOHANG | WNOWAIT)) != 0) { + PLOG(ERROR) << "waitid failed"; return false; - } else if (pid == -1) { - PLOG(ERROR) << "waitpid failed"; - return false; - } else if (PropertyChildReap(pid)) { + } + + auto pid = siginfo.si_pid; + if (pid == 0) return false; + + // At this point we know we have a zombie pid, so we use this scopeguard to reap the pid + // whenever the function returns from this point forward. + // We do NOT want to reap the zombie earlier as in Service::Reap(), we kill(-pid, ...) and we + // want the pid to remain valid throughout that (and potentially future) usages. + auto reaper = make_scope_guard([pid] { TEMP_FAILURE_RETRY(waitpid(pid, nullptr, WNOHANG)); }); + + if (PropertyChildReap(pid)) { return true; } @@ -1105,14 +1117,11 @@ bool ServiceManager::ReapOneProcess() { name = StringPrintf("Untracked pid %d", pid); } + auto status = siginfo.si_status; if (WIFEXITED(status)) { LOG(INFO) << name << " exited with status " << WEXITSTATUS(status) << wait_string; } else if (WIFSIGNALED(status)) { LOG(INFO) << name << " killed by signal " << WTERMSIG(status) << wait_string; - } else if (WIFSTOPPED(status)) { - LOG(INFO) << name << " stopped by signal " << WSTOPSIG(status) << wait_string; - } else { - LOG(INFO) << name << " state changed" << wait_string; } if (!svc) {