Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions lmms_eval/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -563,6 +563,16 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
traceback.print_exc()
eval_logger.error(f"Error during evaluation: {e}. Please set `--verbosity=DEBUG` to get more information.")
results_list.append(None)
# Under torchrun/accelerate, a rank that returns after a local
# exception leaves its peers blocked in NCCL collectives until the
# launcher's timeout. Tear down this rank's process group and exit
# with a non-zero status so the launcher sees the failure immediately
# and propagates it to the other ranks instead of deadlocking.
if torch.distributed.is_available() and torch.distributed.is_initialized():
try:
torch.distributed.destroy_process_group()
except Exception:
pass
sys.exit(1)

for args, results in zip(args_list, results_list):
# cli_evaluate will return none if the process is not the main process (rank 0)
Expand Down
Loading