gh-117293: Fix race condition in run_workers.py (#117298)

The worker thread may still be alive after it enqueues it's last result,
which can lead to a delay of 30 seconds after the test finishes. This
happens much more frequently in the free-threaded build with the GIL
disabled.

This changes run_workers.py to track of live workers by enqueueing a
`WorkerExited()` instance before the worker exits.
This commit is contained in:
Sam Gross 2024-04-08 10:47:42 -04:00 committed by GitHub
parent 59864edd57
commit 26a680a585
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed files with 21 additions and 16 deletions

View File

@ -79,8 +79,12 @@ class MultiprocessResult:
err_msg: str | None = None err_msg: str | None = None
class WorkerThreadExited:
"""Indicates that a worker thread has exited"""
ExcStr = str ExcStr = str
QueueOutput = tuple[Literal[False], MultiprocessResult] | tuple[Literal[True], ExcStr] QueueOutput = tuple[Literal[False], MultiprocessResult] | tuple[Literal[True], ExcStr]
QueueContent = QueueOutput | WorkerThreadExited
class ExitThread(Exception): class ExitThread(Exception):
@ -376,8 +380,8 @@ def _runtest(self, test_name: TestName) -> MultiprocessResult:
def run(self) -> None: def run(self) -> None:
fail_fast = self.runtests.fail_fast fail_fast = self.runtests.fail_fast
fail_env_changed = self.runtests.fail_env_changed fail_env_changed = self.runtests.fail_env_changed
while not self._stopped:
try: try:
while not self._stopped:
try: try:
test_name = next(self.pending) test_name = next(self.pending)
except StopIteration: except StopIteration:
@ -397,10 +401,11 @@ def run(self) -> None:
if mp_result.result.must_stop(fail_fast, fail_env_changed): if mp_result.result.must_stop(fail_fast, fail_env_changed):
break break
except ExitThread: except ExitThread:
break pass
except BaseException: except BaseException:
self.output.put((True, traceback.format_exc())) self.output.put((True, traceback.format_exc()))
break finally:
self.output.put(WorkerThreadExited())
def _wait_completed(self) -> None: def _wait_completed(self) -> None:
popen = self._popen popen = self._popen
@ -458,8 +463,9 @@ def __init__(self, num_workers: int, runtests: RunTests,
self.log = logger.log self.log = logger.log
self.display_progress = logger.display_progress self.display_progress = logger.display_progress
self.results: TestResults = results self.results: TestResults = results
self.live_worker_count = 0
self.output: queue.Queue[QueueOutput] = queue.Queue() self.output: queue.Queue[QueueContent] = queue.Queue()
tests_iter = runtests.iter_tests() tests_iter = runtests.iter_tests()
self.pending = MultiprocessIterator(tests_iter) self.pending = MultiprocessIterator(tests_iter)
self.timeout = runtests.timeout self.timeout = runtests.timeout
@ -497,6 +503,7 @@ def start_workers(self) -> None:
self.log(msg) self.log(msg)
for worker in self.workers: for worker in self.workers:
worker.start() worker.start()
self.live_worker_count += 1
def stop_workers(self) -> None: def stop_workers(self) -> None:
start_time = time.monotonic() start_time = time.monotonic()
@ -511,14 +518,18 @@ def _get_result(self) -> QueueOutput | None:
# bpo-46205: check the status of workers every iteration to avoid # bpo-46205: check the status of workers every iteration to avoid
# waiting forever on an empty queue. # waiting forever on an empty queue.
while any(worker.is_alive() for worker in self.workers): while self.live_worker_count > 0:
if use_faulthandler: if use_faulthandler:
faulthandler.dump_traceback_later(MAIN_PROCESS_TIMEOUT, faulthandler.dump_traceback_later(MAIN_PROCESS_TIMEOUT,
exit=True) exit=True)
# wait for a thread # wait for a thread
try: try:
return self.output.get(timeout=PROGRESS_UPDATE) result = self.output.get(timeout=PROGRESS_UPDATE)
if isinstance(result, WorkerThreadExited):
self.live_worker_count -= 1
continue
return result
except queue.Empty: except queue.Empty:
pass pass
@ -528,12 +539,6 @@ def _get_result(self) -> QueueOutput | None:
if running: if running:
self.log(running) self.log(running)
# all worker threads are done: consume pending results
try:
return self.output.get(timeout=0)
except queue.Empty:
return None
def display_result(self, mp_result: MultiprocessResult) -> None: def display_result(self, mp_result: MultiprocessResult) -> None:
result = mp_result.result result = mp_result.result
pgo = self.runtests.pgo pgo = self.runtests.pgo