Skip to content

Commit 6d59b77

Browse files
author
Minh Quan Ho
committed
ompi/instance: fix cleanup function registration order
- Append the PML cleanup to the finalize list of the instance domain ('ompi_instance_common_domain') before RTE/OPAL init.
- The reason is that RTE init (ompi_rte_init()) calls opal_init(), which in turn sets the internal tracking domain to OPAL's own ('opal_init_domain'); the PML cleanup function would then be mis-registered as belonging to 'opal_init_domain' instead of the current 'ompi_instance_common_domain'.
- The consequence of this mis-registration is that, at MPI_Finalize(), the PML cleanup (*_del_procs()) is executed by RTE; depending on the registration order, this may pull the rug out from under other components that are still running (*_progress()).
- This may be the root cause of issue #10117.

Signed-off-by: Minh Quan Ho <[email protected]>
1 parent c5e02ab commit 6d59b77

File tree

1 file changed

+8
-4
lines changed

1 file changed

+8
-4
lines changed

ompi/instance/instance.c

+8-4
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88
* reserved.
99
* Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved.
1010
* Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
11+
* Copyright (c) 2025 SiPearl. All rights reserved.
1112
* $COPYRIGHT$
1213
*
1314
* Additional copyrights may follow
@@ -381,6 +382,10 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
381382
opal_finalize_domain_init (&ompi_instance_common_domain, "ompi_mpi_instance_init_common");
382383
opal_finalize_set_domain (&ompi_instance_common_domain);
383384

385+
/* Append PML cleanup into the finalize of this domain ('ompi_instance_common_domain')
386+
before RTE init */
387+
ompi_mpi_instance_append_finalize (ompi_mpi_instance_cleanup_pml);
388+
384389
if (OPAL_SUCCESS != (ret = opal_arch_set_fortran_logical_size(sizeof(ompi_fortran_logical_t)))) {
385390
return ompi_instance_print_error ("ompi_mpi_init: opal_arch_set_fortran_logical_size failed", ret);
386391
}
@@ -638,8 +643,6 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
638643
return ompi_instance_print_error ("ompi_group_init() failed", ret);
639644
}
640645

641-
ompi_mpi_instance_append_finalize (ompi_mpi_instance_cleanup_pml);
642-
643646
/* initialize communicator subsystem */
644647
if (OMPI_SUCCESS != (ret = ompi_comm_init ())) {
645648
opal_mutex_unlock (&instance_lock);
@@ -906,8 +909,6 @@ static int ompi_mpi_instance_finalize_common (void)
906909
mca_mpool_base_tree_print (ompi_debug_show_mpi_alloc_mem_leaks);
907910
}
908911

909-
opal_finalize_cleanup_domain (&ompi_instance_common_domain);
910-
911912
if (NULL != ompi_mpi_main_thread) {
912913
OBJ_RELEASE(ompi_mpi_main_thread);
913914
ompi_mpi_main_thread = NULL;
@@ -960,6 +961,9 @@ static int ompi_mpi_instance_finalize_common (void)
960961
}
961962
}
962963

964+
/* Should be called in reverse order of init, i.e. after RTE finalize */
965+
opal_finalize_cleanup_domain (&ompi_instance_common_domain);
966+
963967
ompi_proc_finalize();
964968

965969
ompi_mpi_instance_release ();

0 commit comments

Comments
 (0)