mirror of https://github.com/python/cpython.git
gh-117293: Fix race condition in run_workers.py (#117298)
The worker thread may still be alive after it enqueues it's last result, which can lead to a delay of 30 seconds after the test finishes. This happens much more frequently in the free-threaded build with the GIL disabled. This changes run_workers.py to track of live workers by enqueueing a `WorkerExited()` instance before the worker exits.
This commit is contained in:
parent
59864edd57
commit
26a680a585
|
@ -79,8 +79,12 @@ class MultiprocessResult:
|
||||||
err_msg: str | None = None
|
err_msg: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class WorkerThreadExited:
|
||||||
|
"""Indicates that a worker thread has exited"""
|
||||||
|
|
||||||
ExcStr = str
|
ExcStr = str
|
||||||
QueueOutput = tuple[Literal[False], MultiprocessResult] | tuple[Literal[True], ExcStr]
|
QueueOutput = tuple[Literal[False], MultiprocessResult] | tuple[Literal[True], ExcStr]
|
||||||
|
QueueContent = QueueOutput | WorkerThreadExited
|
||||||
|
|
||||||
|
|
||||||
class ExitThread(Exception):
|
class ExitThread(Exception):
|
||||||
|
@ -376,8 +380,8 @@ def _runtest(self, test_name: TestName) -> MultiprocessResult:
|
||||||
def run(self) -> None:
|
def run(self) -> None:
|
||||||
fail_fast = self.runtests.fail_fast
|
fail_fast = self.runtests.fail_fast
|
||||||
fail_env_changed = self.runtests.fail_env_changed
|
fail_env_changed = self.runtests.fail_env_changed
|
||||||
while not self._stopped:
|
try:
|
||||||
try:
|
while not self._stopped:
|
||||||
try:
|
try:
|
||||||
test_name = next(self.pending)
|
test_name = next(self.pending)
|
||||||
except StopIteration:
|
except StopIteration:
|
||||||
|
@ -396,11 +400,12 @@ def run(self) -> None:
|
||||||
|
|
||||||
if mp_result.result.must_stop(fail_fast, fail_env_changed):
|
if mp_result.result.must_stop(fail_fast, fail_env_changed):
|
||||||
break
|
break
|
||||||
except ExitThread:
|
except ExitThread:
|
||||||
break
|
pass
|
||||||
except BaseException:
|
except BaseException:
|
||||||
self.output.put((True, traceback.format_exc()))
|
self.output.put((True, traceback.format_exc()))
|
||||||
break
|
finally:
|
||||||
|
self.output.put(WorkerThreadExited())
|
||||||
|
|
||||||
def _wait_completed(self) -> None:
|
def _wait_completed(self) -> None:
|
||||||
popen = self._popen
|
popen = self._popen
|
||||||
|
@ -458,8 +463,9 @@ def __init__(self, num_workers: int, runtests: RunTests,
|
||||||
self.log = logger.log
|
self.log = logger.log
|
||||||
self.display_progress = logger.display_progress
|
self.display_progress = logger.display_progress
|
||||||
self.results: TestResults = results
|
self.results: TestResults = results
|
||||||
|
self.live_worker_count = 0
|
||||||
|
|
||||||
self.output: queue.Queue[QueueOutput] = queue.Queue()
|
self.output: queue.Queue[QueueContent] = queue.Queue()
|
||||||
tests_iter = runtests.iter_tests()
|
tests_iter = runtests.iter_tests()
|
||||||
self.pending = MultiprocessIterator(tests_iter)
|
self.pending = MultiprocessIterator(tests_iter)
|
||||||
self.timeout = runtests.timeout
|
self.timeout = runtests.timeout
|
||||||
|
@ -497,6 +503,7 @@ def start_workers(self) -> None:
|
||||||
self.log(msg)
|
self.log(msg)
|
||||||
for worker in self.workers:
|
for worker in self.workers:
|
||||||
worker.start()
|
worker.start()
|
||||||
|
self.live_worker_count += 1
|
||||||
|
|
||||||
def stop_workers(self) -> None:
|
def stop_workers(self) -> None:
|
||||||
start_time = time.monotonic()
|
start_time = time.monotonic()
|
||||||
|
@ -511,14 +518,18 @@ def _get_result(self) -> QueueOutput | None:
|
||||||
|
|
||||||
# bpo-46205: check the status of workers every iteration to avoid
|
# bpo-46205: check the status of workers every iteration to avoid
|
||||||
# waiting forever on an empty queue.
|
# waiting forever on an empty queue.
|
||||||
while any(worker.is_alive() for worker in self.workers):
|
while self.live_worker_count > 0:
|
||||||
if use_faulthandler:
|
if use_faulthandler:
|
||||||
faulthandler.dump_traceback_later(MAIN_PROCESS_TIMEOUT,
|
faulthandler.dump_traceback_later(MAIN_PROCESS_TIMEOUT,
|
||||||
exit=True)
|
exit=True)
|
||||||
|
|
||||||
# wait for a thread
|
# wait for a thread
|
||||||
try:
|
try:
|
||||||
return self.output.get(timeout=PROGRESS_UPDATE)
|
result = self.output.get(timeout=PROGRESS_UPDATE)
|
||||||
|
if isinstance(result, WorkerThreadExited):
|
||||||
|
self.live_worker_count -= 1
|
||||||
|
continue
|
||||||
|
return result
|
||||||
except queue.Empty:
|
except queue.Empty:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@ -528,12 +539,6 @@ def _get_result(self) -> QueueOutput | None:
|
||||||
if running:
|
if running:
|
||||||
self.log(running)
|
self.log(running)
|
||||||
|
|
||||||
# all worker threads are done: consume pending results
|
|
||||||
try:
|
|
||||||
return self.output.get(timeout=0)
|
|
||||||
except queue.Empty:
|
|
||||||
return None
|
|
||||||
|
|
||||||
def display_result(self, mp_result: MultiprocessResult) -> None:
|
def display_result(self, mp_result: MultiprocessResult) -> None:
|
||||||
result = mp_result.result
|
result = mp_result.result
|
||||||
pgo = self.runtests.pgo
|
pgo = self.runtests.pgo
|
||||||
|
|
Loading…
Reference in New Issue