From 20514c4411923e03df4a5057ef43bf9716983e14 Mon Sep 17 00:00:00 2001 From: Tom Cherry Date: Fri, 21 Apr 2017 13:48:49 -0700 Subject: [PATCH 1/2] Better logging in libprocessgroup and make resources clean up themselves Bug: 36661364 Bug: 36701253 Bug: 37540956 Test: Reboot bullhead Test: Start and stop services Change-Id: I97b6e17a7350850db733bee9fc83bdd296d88c61 --- .../include/processgroup/processgroup.h | 5 + libprocessgroup/processgroup.cpp | 181 ++++++++++-------- 2 files changed, 103 insertions(+), 83 deletions(-) diff --git a/libprocessgroup/include/processgroup/processgroup.h b/libprocessgroup/include/processgroup/processgroup.h index 47f6ff3cd..f0c37954c 100644 --- a/libprocessgroup/include/processgroup/processgroup.h +++ b/libprocessgroup/include/processgroup/processgroup.h @@ -22,8 +22,13 @@ __BEGIN_DECLS +// Return 0 and removes the cgroup if there are no longer any processes in it. +// Returns -1 in the case of an error occurring or if there are processes still running +// even after retrying for up to 200ms. int killProcessGroup(uid_t uid, int initialPid, int signal); +// Returns the same as killProcessGroup(), however it does not retry, which means +// that it only returns 0 in the case that the cgroup exists and it contains no processes. int killProcessGroupOnce(uid_t uid, int initialPid, int signal); int createProcessGroup(uid_t uid, int initialPid); diff --git a/libprocessgroup/processgroup.cpp b/libprocessgroup/processgroup.cpp index 1572cb312..27b4065da 100644 --- a/libprocessgroup/processgroup.cpp +++ b/libprocessgroup/processgroup.cpp @@ -34,6 +34,7 @@ #include #include +#include #include #include @@ -64,12 +65,27 @@ using namespace std::chrono_literals; std::once_flag init_path_flag; -struct ctx { - bool initialized; - int fd; - char buf[128]; - char *buf_ptr; - size_t buf_len; +class ProcessGroup { + public: + ProcessGroup() : buf_ptr_(buf_), buf_len_(0) {} + + bool Open(uid_t uid, int pid); + + // Return positive number and sets *pid = next pid in process cgroup on success + // Returns 0 if there are no pids left in the process cgroup + // Returns -errno if an error was encountered + int GetOneAppProcess(pid_t* pid); + + private: + // Returns positive number of bytes filled on success + // Returns 0 if there was nothing to read + // Returns -errno if an error was encountered + int RefillBuffer(); + + android::base::unique_fd fd_; + char buf_[128]; + char* buf_ptr_; + size_t buf_len_; }; static const char* getCgroupRootPath() { @@ -105,87 +121,67 @@ static int convertUidPidToPath(char *path, size_t size, uid_t uid, int pid) pid); } -static int initCtx(uid_t uid, int pid, struct ctx *ctx) -{ - int ret; +bool ProcessGroup::Open(uid_t uid, int pid) { char path[PROCESSGROUP_MAX_PATH_LEN] = {0}; convertUidPidToPath(path, sizeof(path), uid, pid); strlcat(path, PROCESSGROUP_CGROUP_PROCS_FILE, sizeof(path)); int fd = open(path, O_RDONLY); - if (fd < 0) { - ret = -errno; - PLOG(WARNING) << "failed to open " << path; - return ret; - } + if (fd < 0) return false; - ctx->fd = fd; - ctx->buf_ptr = ctx->buf; - ctx->buf_len = 0; - ctx->initialized = true; + fd_.reset(fd); LOG(VERBOSE) << "Initialized context for " << path; - return 0; + return true; } -static int refillBuffer(struct ctx *ctx) -{ - memmove(ctx->buf, ctx->buf_ptr, ctx->buf_len); - ctx->buf_ptr = ctx->buf; +int ProcessGroup::RefillBuffer() { + memmove(buf_, buf_ptr_, buf_len_); + buf_ptr_ = buf_; - ssize_t ret = read(ctx->fd, ctx->buf_ptr + ctx->buf_len, - sizeof(ctx->buf) - ctx->buf_len - 1); + ssize_t ret = read(fd_, buf_ptr_ + buf_len_, sizeof(buf_) - buf_len_ - 1); if (ret < 0) { return -errno; } else if (ret == 0) { return 0; } - ctx->buf_len += ret; - ctx->buf[ctx->buf_len] = 0; - LOG(VERBOSE) << "Read " << ret << " to buffer: " << ctx->buf; + buf_len_ += ret; + buf_[buf_len_] = 0; + LOG(VERBOSE) << "Read " << ret << " to buffer: " << buf_; - assert(ctx->buf_len <= sizeof(ctx->buf)); + assert(buf_len_ <= sizeof(buf_)); return ret; } -static pid_t getOneAppProcess(uid_t uid, int appProcessPid, struct ctx *ctx) -{ - if (!ctx->initialized) { - int ret = initCtx(uid, appProcessPid, ctx); - if (ret < 0) { - return ret; - } - } +int ProcessGroup::GetOneAppProcess(pid_t* out_pid) { + *out_pid = 0; - char *eptr; - while ((eptr = (char *)memchr(ctx->buf_ptr, '\n', ctx->buf_len)) == NULL) { - int ret = refillBuffer(ctx); - if (ret == 0) { - return -ERANGE; - } - if (ret < 0) { - return ret; - } + char* eptr; + while ((eptr = static_cast(memchr(buf_ptr_, '\n', buf_len_))) == nullptr) { + int ret = RefillBuffer(); + if (ret <= 0) return ret; } *eptr = '\0'; - char *pid_eptr = NULL; + char* pid_eptr = nullptr; errno = 0; - long pid = strtol(ctx->buf_ptr, &pid_eptr, 10); + long pid = strtol(buf_ptr_, &pid_eptr, 10); if (errno != 0) { return -errno; } if (pid_eptr != eptr) { - return -EINVAL; + errno = EINVAL; + return -errno; } - ctx->buf_len -= (eptr - ctx->buf_ptr) + 1; - ctx->buf_ptr = eptr + 1; + buf_len_ -= (eptr - buf_ptr_) + 1; + buf_ptr_ = eptr + 1; - return (pid_t)pid; + *out_pid = static_cast(pid); + return 1; } static int removeProcessGroup(uid_t uid, int pid) @@ -219,8 +215,8 @@ static void removeUidProcessGroups(const char *uid_path) } snprintf(path, sizeof(path), "%s/%s", uid_path, dir->d_name); - LOG(VERBOSE) << "removing " << path; - if (rmdir(path) == -1) PLOG(WARNING) << "failed to remove " << path; + LOG(VERBOSE) << "Removing " << path; + if (rmdir(path) == -1) PLOG(WARNING) << "Failed to remove " << path; } } } @@ -231,7 +227,7 @@ void removeAllProcessGroups() const char* cgroup_root_path = getCgroupRootPath(); std::unique_ptr root(opendir(cgroup_root_path), closedir); if (root == NULL) { - PLOG(ERROR) << "failed to open " << cgroup_root_path; + PLOG(ERROR) << "Failed to open " << cgroup_root_path; } else { dirent* dir; while ((dir = readdir(root.get())) != nullptr) { @@ -246,20 +242,26 @@ void removeAllProcessGroups() snprintf(path, sizeof(path), "%s/%s", cgroup_root_path, dir->d_name); removeUidProcessGroups(path); - LOG(VERBOSE) << "removing " << path; - if (rmdir(path) == -1) PLOG(WARNING) << "failed to remove " << path; + LOG(VERBOSE) << "Removing " << path; + if (rmdir(path) == -1) PLOG(WARNING) << "Failed to remove " << path; } } } +// Returns number of processes killed on success +// Returns 0 if there are no processes in the process cgroup left to kill +// Returns -errno on error static int doKillProcessGroupOnce(uid_t uid, int initialPid, int signal) { - int processes = 0; - struct ctx ctx; + ProcessGroup process_group; + if (!process_group.Open(uid, initialPid)) { + PLOG(WARNING) << "Failed to open process cgroup uid " << uid << " pid " << initialPid; + return -errno; + } + + int ret; pid_t pid; - - ctx.initialized = false; - - while ((pid = getOneAppProcess(uid, initialPid, &ctx)) >= 0) { + int processes = 0; + while ((ret = process_group.GetOneAppProcess(&pid)) > 0 && pid >= 0) { processes++; if (pid == 0) { // Should never happen... but if it does, trying to kill this @@ -267,55 +269,68 @@ static int doKillProcessGroupOnce(uid_t uid, int initialPid, int signal) { LOG(WARNING) << "Yikes, we've been told to kill pid 0! How about we don't do that?"; continue; } - LOG(VERBOSE) << "Killing pid " << pid << " in uid " << uid - << " as part of process group " << initialPid; + LOG(VERBOSE) << "Killing pid " << pid << " in uid " << uid << " as part of process cgroup " + << initialPid; if (kill(pid, signal) == -1) { PLOG(WARNING) << "kill(" << pid << ", " << signal << ") failed"; } } - if (ctx.initialized) { - close(ctx.fd); - } - - return processes; + return ret >= 0 ? processes : ret; } -static int killProcessGroup(uid_t uid, int initialPid, int signal, int retry) { +static int killProcessGroup(uid_t uid, int initialPid, int signal, int retries) { std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); + int retry = retries; int processes; while ((processes = doKillProcessGroupOnce(uid, initialPid, signal)) > 0) { - LOG(VERBOSE) << "killed " << processes << " processes for processgroup " << initialPid; + LOG(VERBOSE) << "Killed " << processes << " processes for processgroup " << initialPid; if (retry > 0) { std::this_thread::sleep_for(5ms); --retry; } else { - LOG(ERROR) << "failed to kill " << processes << " processes for processgroup " - << initialPid; break; } } - std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); + if (processes < 0) { + PLOG(ERROR) << "Error encountered killing process cgroup uid " << uid << " pid " + << initialPid; + return -1; + } + std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); auto ms = std::chrono::duration_cast(end - start).count(); - LOG(VERBOSE) << "Killed process group uid " << uid << " pid " << initialPid << " in " - << static_cast(ms) << "ms, " << processes << " procs remain"; + + // We only calculate the number of 'processes' when killing the processes. + // In the retries == 0 case, we only kill the processes once and therefore + // will not have waited then recalculated how many processes are remaining + // after the first signals have been sent. + // Logging anything regarding the number of 'processes' here does not make sense. if (processes == 0) { + if (retries > 0) { + LOG(INFO) << "Successfully killed process cgroup uid " << uid << " pid " << initialPid + << " in " << static_cast(ms) << "ms"; + } return removeProcessGroup(uid, initialPid); } else { + if (retries > 0) { + LOG(ERROR) << "Failed to kill process cgroup uid " << uid << " pid " << initialPid + << " in " << static_cast(ms) << "ms, " << processes + << " processes remain"; + } return -1; } } int killProcessGroup(uid_t uid, int initialPid, int signal) { - return killProcessGroup(uid, initialPid, signal, 40 /*maxRetry*/); + return killProcessGroup(uid, initialPid, signal, 40 /*retries*/); } int killProcessGroupOnce(uid_t uid, int initialPid, int signal) { - return killProcessGroup(uid, initialPid, signal, 0 /*maxRetry*/); + return killProcessGroup(uid, initialPid, signal, 0 /*retries*/); } static bool mkdirAndChown(const char *path, mode_t mode, uid_t uid, gid_t gid) @@ -341,14 +356,14 @@ int createProcessGroup(uid_t uid, int initialPid) convertUidToPath(path, sizeof(path), uid); if (!mkdirAndChown(path, 0750, AID_SYSTEM, AID_SYSTEM)) { - PLOG(ERROR) << "failed to make and chown " << path; + PLOG(ERROR) << "Failed to make and chown " << path; return -errno; } convertUidPidToPath(path, sizeof(path), uid, initialPid); if (!mkdirAndChown(path, 0750, AID_SYSTEM, AID_SYSTEM)) { - PLOG(ERROR) << "failed to make and chown " << path; + PLOG(ERROR) << "Failed to make and chown " << path; return -errno; } @@ -357,7 +372,7 @@ int createProcessGroup(uid_t uid, int initialPid) int fd = open(path, O_WRONLY); if (fd == -1) { int ret = -errno; - PLOG(ERROR) << "failed to open " << path; + PLOG(ERROR) << "Failed to open " << path; return ret; } @@ -367,7 +382,7 @@ int createProcessGroup(uid_t uid, int initialPid) int ret = 0; if (write(fd, pid, len) < 0) { ret = -errno; - PLOG(ERROR) << "failed to write '" << pid << "' to " << path; + PLOG(ERROR) << "Failed to write '" << pid << "' to " << path; } close(fd); From 33838b1156cd1a5f8eec94756bee9e030a3dd2eb Mon Sep 17 00:00:00 2001 From: Tom Cherry Date: Thu, 4 May 2017 11:32:36 -0700 Subject: [PATCH 2/2] init: change kill order and fix error reporting in KillProcessGroup() First kill the process group before killing the cgroup to catch the hopeful case that killing the cgroup becomes a no-op as all of its processes have already been killed. Do not report an error if kill fails due to ESRCH, as this happens often when reaping processes due to the order in which we call waitpid() and kill(). Do not call killProcessGroup in libprocessgroup if we have already successfully killed and removed a process group. Bug: 36661364 Bug: 36701253 Bug: 37540956 Test: Reboot bullhead Test: Start and stop services Test: Init unit tests Change-Id: I172acf0f8e00189f910f865f4635a7b1782fc7e3 --- init/service.cpp | 39 ++++++++++++++++++++++++++------------- init/service.h | 3 +++ init/service_test.cpp | 2 ++ 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/init/service.cpp b/init/service.cpp index 3a9f62228..881825fe8 100644 --- a/init/service.cpp +++ b/init/service.cpp @@ -209,21 +209,33 @@ void Service::NotifyStateChange(const std::string& new_state) const { } void Service::KillProcessGroup(int signal) { - LOG(INFO) << "Sending signal " << signal - << " to service '" << name_ - << "' (pid " << pid_ << ") process group..."; - int r; - if (signal == SIGTERM) { - r = killProcessGroupOnce(uid_, pid_, signal); - } else { - r = killProcessGroup(uid_, pid_, signal); - } - if (r == -1) { - LOG(ERROR) << "killProcessGroup(" << uid_ << ", " << pid_ << ", " << signal << ") failed"; - } - if (kill(-pid_, signal) == -1) { + // We ignore reporting errors of ESRCH as this commonly happens in the below case, + // 1) Terminate() is called, which sends SIGTERM to the process + // 2) The process successfully exits + // 3) ReapOneProcess() is called, which calls waitpid(-1, ...) which removes the pid entry. + // 4) Reap() is called, which sends SIGKILL, but the pid no longer exists. + // TODO: sigaction for SIGCHLD reports the pid of the exiting process, + // we should do this kill with that pid first before calling waitpid(). + if (kill(-pid_, signal) == -1 && errno != ESRCH) { PLOG(ERROR) << "kill(" << pid_ << ", " << signal << ") failed"; } + + // If we've already seen a successful result from killProcessGroup*(), then we have removed + // the cgroup already and calling these functions a second time will simply result in an error. + // This is true regardless of which signal was sent. + // These functions handle their own logging, so no additional logging is needed. + if (!process_cgroup_empty_) { + LOG(INFO) << "Sending signal " << signal << " to service '" << name_ << "' (pid " << pid_ + << ") process group..."; + int r; + if (signal == SIGTERM) { + r = killProcessGroupOnce(uid_, pid_, signal); + } else { + r = killProcessGroup(uid_, pid_, signal); + } + + if (r == 0) process_cgroup_empty_ = true; + } } void Service::SetProcessAttributes() { @@ -736,6 +748,7 @@ bool Service::Start() { time_started_ = boot_clock::now(); pid_ = pid; flags_ |= SVC_RUNNING; + process_cgroup_empty_ = false; errno = -createProcessGroup(uid_, pid_); if (errno != 0) { diff --git a/init/service.h b/init/service.h index 426577ffe..b9c270aa6 100644 --- a/init/service.h +++ b/init/service.h @@ -107,6 +107,7 @@ class Service { int ioprio_pri() const { return ioprio_pri_; } int priority() const { return priority_; } int oom_score_adjust() const { return oom_score_adjust_; } + bool process_cgroup_empty() const { return process_cgroup_empty_; } const std::vector& args() const { return args_; } private: @@ -179,6 +180,8 @@ class Service { int oom_score_adjust_; + bool process_cgroup_empty_ = false; + std::vector args_; }; diff --git a/init/service_test.cpp b/init/service_test.cpp index 4493f254a..b9c4627ad 100644 --- a/init/service_test.cpp +++ b/init/service_test.cpp @@ -45,6 +45,7 @@ TEST(service, pod_initialized) { EXPECT_EQ(0, service_in_old_memory->ioprio_pri()); EXPECT_EQ(0, service_in_old_memory->priority()); EXPECT_EQ(-1000, service_in_old_memory->oom_score_adjust()); + EXPECT_FALSE(service_in_old_memory->process_cgroup_empty()); for (std::size_t i = 0; i < memory_size; ++i) { old_memory[i] = 0xFF; @@ -64,4 +65,5 @@ TEST(service, pod_initialized) { EXPECT_EQ(0, service_in_old_memory2->ioprio_pri()); EXPECT_EQ(0, service_in_old_memory2->priority()); EXPECT_EQ(-1000, service_in_old_memory2->oom_score_adjust()); + EXPECT_FALSE(service_in_old_memory->process_cgroup_empty()); }