fix(rl): ensure queue and process cleanup on abnormal exit (#3063)

Wrap the main execution in actor_cli and start_learner_threads with
try/finally so that queues are closed and processes are joined even
when an unhandled exception occurs. Previously, exceptions in
act_with_policy or add_actor_information_and_train would skip all
cleanup code, leaking GPU/CPU resources.

Also sets the shutdown_event on exception so child processes exit
gracefully.

Fixes #3059

Co-authored-by: Khalil Meftah <khalil.meftah@huggingface.co>
This commit is contained in:
Jash Shah
2026-04-13 07:25:42 -07:00
committed by GitHub
parent df0763a2bc
commit 9bd844a3b9
2 changed files with 51 additions and 45 deletions
+27 -24
View File
@@ -175,33 +175,36 @@ def actor_cli(cfg: TrainRLServerPipelineConfig):
interactions_process.start()
receive_policy_process.start()
act_with_policy(
cfg=cfg,
shutdown_event=shutdown_event,
parameters_queue=parameters_queue,
transitions_queue=transitions_queue,
interactions_queue=interactions_queue,
)
logging.info("[ACTOR] Policy process joined")
try:
act_with_policy(
cfg=cfg,
shutdown_event=shutdown_event,
parameters_queue=parameters_queue,
transitions_queue=transitions_queue,
interactions_queue=interactions_queue,
)
logging.info("[ACTOR] Policy loop finished")
except Exception:
logging.exception("[ACTOR] Unhandled exception in act_with_policy")
shutdown_event.set()
finally:
logging.info("[ACTOR] Closing queues")
transitions_queue.close()
interactions_queue.close()
parameters_queue.close()
logging.info("[ACTOR] Closing queues")
transitions_queue.close()
interactions_queue.close()
parameters_queue.close()
transitions_process.join()
logging.info("[ACTOR] Transitions process joined")
interactions_process.join()
logging.info("[ACTOR] Interactions process joined")
receive_policy_process.join()
logging.info("[ACTOR] Receive policy process joined")
transitions_process.join()
logging.info("[ACTOR] Transitions process joined")
interactions_process.join()
logging.info("[ACTOR] Interactions process joined")
receive_policy_process.join()
logging.info("[ACTOR] Receive policy process joined")
transitions_queue.cancel_join_thread()
interactions_queue.cancel_join_thread()
parameters_queue.cancel_join_thread()
logging.info("[ACTOR] join queues")
transitions_queue.cancel_join_thread()
interactions_queue.cancel_join_thread()
parameters_queue.cancel_join_thread()
logging.info("[ACTOR] queues closed")
logging.info("[ACTOR] Cleanup complete")
# Core algorithm functions
+24 -21
View File
@@ -218,30 +218,33 @@ def start_learner_threads(
)
communication_process.start()
add_actor_information_and_train(
cfg=cfg,
wandb_logger=wandb_logger,
shutdown_event=shutdown_event,
transition_queue=transition_queue,
interaction_message_queue=interaction_message_queue,
parameters_queue=parameters_queue,
)
logging.info("[LEARNER] Training process stopped")
try:
add_actor_information_and_train(
cfg=cfg,
wandb_logger=wandb_logger,
shutdown_event=shutdown_event,
transition_queue=transition_queue,
interaction_message_queue=interaction_message_queue,
parameters_queue=parameters_queue,
)
logging.info("[LEARNER] Training process stopped")
except Exception:
logging.exception("[LEARNER] Unhandled exception in training loop")
shutdown_event.set()
finally:
logging.info("[LEARNER] Closing queues")
transition_queue.close()
interaction_message_queue.close()
parameters_queue.close()
logging.info("[LEARNER] Closing queues")
transition_queue.close()
interaction_message_queue.close()
parameters_queue.close()
communication_process.join()
logging.info("[LEARNER] Communication process joined")
communication_process.join()
logging.info("[LEARNER] Communication process joined")
transition_queue.cancel_join_thread()
interaction_message_queue.cancel_join_thread()
parameters_queue.cancel_join_thread()
logging.info("[LEARNER] join queues")
transition_queue.cancel_join_thread()
interaction_message_queue.cancel_join_thread()
parameters_queue.cancel_join_thread()
logging.info("[LEARNER] queues closed")
logging.info("[LEARNER] Cleanup complete")
# Core algorithm functions