init: reap zombies only after kill(-pid, ...)

When init gets SIGCHLD, it uses waitpid() to get the pid of an exited
process.  It then calls kill(-pid, ...) to ensure that all processes
in the process group started by that process are killed as well.

There is a bug here however as waitpid() reaps the pid when it
returns, meaning that the call to kill(-pid, ...) may fail with ESRCH
as there are no remaining references to that pid.  Or worse, if the
pid is reused, the wrong processes may get the signal.

This fixes the bug by using waitid() with WNOWAIT to get the pid of an
exited process, which does not reap the pid.  It then uses waitpid()
with the returned pid to do the reap only after the above kill(-pid,
...) and other operations have completed.

Bug: 38164998
Test: kill surfaceflinger and see that processes exit and are reaped
      appropriately
Test: `adb reboot` and observe that the extraneous kill() failed
      messages do not appear

Change-Id: Ic0213e1c97e0141e6c13129dc2abbfed86de138b
This commit is contained in:
Tom Cherry 2017-06-29 16:18:08 -07:00
parent c70bf5836c
commit 8d13d808a5
1 changed files with 20 additions and 11 deletions

View File

@ -34,6 +34,7 @@
#include <android-base/logging.h>
#include <android-base/parseint.h>
#include <android-base/properties.h>
#include <android-base/scopeguard.h>
#include <android-base/stringprintf.h>
#include <android-base/strings.h>
#include <processgroup/processgroup.h>
@ -47,6 +48,7 @@
using android::base::boot_clock;
using android::base::GetProperty;
using android::base::Join;
using android::base::make_scope_guard;
using android::base::ParseInt;
using android::base::StartsWith;
using android::base::StringPrintf;
@ -1081,14 +1083,24 @@ void ServiceManager::DumpState() const {
}
bool ServiceManager::ReapOneProcess() {
int status;
pid_t pid = TEMP_FAILURE_RETRY(waitpid(-1, &status, WNOHANG));
if (pid == 0) {
siginfo_t siginfo = {};
// This returns a zombie pid or informs us that there are no zombies left to be reaped.
// It does NOT reap the pid; that is done below.
if (TEMP_FAILURE_RETRY(waitid(P_ALL, 0, &siginfo, WEXITED | WNOHANG | WNOWAIT)) != 0) {
PLOG(ERROR) << "waitid failed";
return false;
} else if (pid == -1) {
PLOG(ERROR) << "waitpid failed";
return false;
} else if (PropertyChildReap(pid)) {
}
auto pid = siginfo.si_pid;
if (pid == 0) return false;
// At this point we know we have a zombie pid, so we use this scopeguard to reap the pid
// whenever the function returns from this point forward.
// We do NOT want to reap the zombie earlier as in Service::Reap(), we kill(-pid, ...) and we
// want the pid to remain valid throughout that (and potentially future) usages.
auto reaper = make_scope_guard([pid] { TEMP_FAILURE_RETRY(waitpid(pid, nullptr, WNOHANG)); });
if (PropertyChildReap(pid)) {
return true;
}
@ -1105,14 +1117,11 @@ bool ServiceManager::ReapOneProcess() {
name = StringPrintf("Untracked pid %d", pid);
}
auto status = siginfo.si_status;
if (WIFEXITED(status)) {
LOG(INFO) << name << " exited with status " << WEXITSTATUS(status) << wait_string;
} else if (WIFSIGNALED(status)) {
LOG(INFO) << name << " killed by signal " << WTERMSIG(status) << wait_string;
} else if (WIFSTOPPED(status)) {
LOG(INFO) << name << " stopped by signal " << WSTOPSIG(status) << wait_string;
} else {
LOG(INFO) << name << " state changed" << wait_string;
}
if (!svc) {