Skip to content
35 changes: 5 additions & 30 deletions packages/teuchos/core/src/Teuchos_GlobalMPISession.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,6 @@
# include "mpi.h"
#endif

#ifdef HAVE_TEUCHOSCORE_KOKKOS
# include "Kokkos_Core.hpp"
#endif // HAVE_TEUCHOSCORE_KOKKOS



namespace Teuchos {


Expand All @@ -31,15 +25,10 @@ bool GlobalMPISession::mpiIsFinalized_ = false;
int GlobalMPISession::rank_ = 0 ;
int GlobalMPISession::nProc_ = 1 ;

#ifdef HAVE_TEUCHOSCORE_KOKKOS

// We have to invoke the std::vector's constructor here,
// because it's a class (static) variable.
std::vector<std::string> GlobalMPISession::argvCopy_;

#endif // HAVE_TEUCHOSCORE_KOKKOS


GlobalMPISession::GlobalMPISession( int* argc, char*** argv, std::ostream *out )
{
std::ostringstream oss;
Expand Down Expand Up @@ -120,7 +109,6 @@ GlobalMPISession::GlobalMPISession( int* argc, char*** argv, std::ostream *out )

#endif

#ifdef HAVE_TEUCHOSCORE_KOKKOS
// mfh 15/16 Apr 2016: This is the one chance we get to save the
// command-line arguments, so that we can (later) initialize Kokkos
// with the correct number of threads as specified by (e.g.,) the
Expand All @@ -136,36 +124,23 @@ GlobalMPISession::GlobalMPISession( int* argc, char*** argv, std::ostream *out )
// requirement.

const int numArgs = *argc;
argvCopy_.resize (numArgs);
argvCopy_.resize (numArgs);
for (int c = 0; c < numArgs; ++c) {
argvCopy_[c] = std::string ((*argv)[c]); // deep copy
}
#endif // HAVE_TEUCHOSCORE_KOKKOS


}


#ifdef HAVE_TEUCHOSCORE_KOKKOS

/// \brief Return the saved command-line arguments by value.
///
/// The result is a copy of the class-level \c argvCopy_ vector, which
/// the constructor fills with deep copies of the \c argv strings.  If
/// nothing has been stored yet, the returned vector is empty.
std::vector<std::string> GlobalMPISession::getArgv ()
{
  // Build the caller's copy explicitly; the stored vector is not exposed.
  std::vector<std::string> savedArgs (argvCopy_);
  return savedArgs;
}
#endif // HAVE_TEUCHOSCORE_KOKKOS


GlobalMPISession::~GlobalMPISession()
{

#ifdef HAVE_TEUCHOSCORE_KOKKOS
try {
if (Kokkos::is_initialized())
Kokkos::finalize();
}
catch (const std::runtime_error& e) {
std::cerr << "Kokkos::finalize failed:\n"
<< e.what() << "\n";
}
#endif

haveMPIState_ = false;
#ifdef HAVE_MPI
const int mpierr = ::MPI_Finalize();
Expand Down
5 changes: 1 addition & 4 deletions packages/teuchos/core/src/Teuchos_GlobalMPISession.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,6 @@ class TEUCHOSCORE_LIB_DLL_EXPORT GlobalMPISession
*/
static void allGather(int localVal, const ArrayView<int> &allVals);

#ifdef HAVE_TEUCHOSCORE_KOKKOS
/// \brief Fetch a deep copy of the input arguments to \c main()
/// (that is, \c argv), as given to GlobalMPISession's
/// constructor.
Expand All @@ -240,7 +239,6 @@ class TEUCHOSCORE_LIB_DLL_EXPORT GlobalMPISession
/// yet, return an empty vector. Else, return the input
/// arguments.
static std::vector<std::string> getArgv ();
#endif // HAVE_TEUCHOSCORE_KOKKOS
//@}

private:
Expand All @@ -249,7 +247,7 @@ class TEUCHOSCORE_LIB_DLL_EXPORT GlobalMPISession
static bool mpiIsFinalized_;
static int rank_;
static int nProc_;
#ifdef HAVE_TEUCHOSCORE_KOKKOS

/// \brief Deep copy of the input arguments.
///
/// This is useful if we want to call Kokkos::initialize later with
Expand All @@ -258,7 +256,6 @@ class TEUCHOSCORE_LIB_DLL_EXPORT GlobalMPISession
/// after calling this object's constructor. That could mess up
/// indexing if we just keep a pointer to the original.
static std::vector<std::string> argvCopy_;
#endif // HAVE_TEUCHOSCORE_KOKKOS

static void initialize( std::ostream *out );

Expand Down
55 changes: 29 additions & 26 deletions packages/tpetra/core/src/Tpetra_Details_initializeKokkos.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,36 +19,39 @@
namespace Tpetra {
namespace Details {

/// \brief Finalize Kokkos only if it is currently initialized.
///
/// Intended for use as a \c std::atexit handler (see the registration
/// in initializeKokkos()).  The call is a no-op when Kokkos was never
/// initialized or has already been finalized, so it is safe to invoke
/// regardless of who else manages the Kokkos lifetime.
void finalizeKokkosIfNeeded() {
  // `!is_finalized()` alone is also true when Kokkos was never
  // initialized; calling Kokkos::finalize() in that state is an error.
  // Require an active (initialized, not yet finalized) runtime.
  if (Kokkos::is_initialized() && !Kokkos::is_finalized()) {
    Kokkos::finalize();
  }
}

void initializeKokkos() {
if (!Kokkos::is_initialized()) {
std::vector<std::string> args = Teuchos::GlobalMPISession::getArgv();
int narg = static_cast<int>(args.size()); // must be nonconst

std::vector<char*> args_c;
std::vector<std::unique_ptr<char[]>> args_;
for (auto const& x : args) {
args_.emplace_back(new char[x.size() + 1]);
char* ptr = args_.back().get();
strcpy(ptr, x.c_str());
args_c.push_back(ptr);
static const int initialized = []() {
TEUCHOS_TEST_FOR_EXCEPTION(Kokkos::is_finalized(), std::runtime_error,
"Tpetra::Details::initializeKokkos: Kokkos is already finalized");
if (!Kokkos::is_initialized()) {
std::vector<std::string> args = Teuchos::GlobalMPISession::getArgv();
int narg = static_cast<int>(args.size()); // must be nonconst

std::vector<char*> args_c;
std::vector<std::unique_ptr<char[]>> args_;
for (auto const& x : args) {
args_.emplace_back(new char[x.size() + 1]);
char* ptr = args_.back().get();
strcpy(ptr, x.c_str());
args_c.push_back(ptr);
}
args_c.push_back(nullptr);

Kokkos::initialize(narg, args_c.data());
checkOldCudaLaunchBlocking();

std::atexit(Kokkos::finalize);
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Using std::atexit(Kokkos::finalize) to finalize Kokkos can lead to an incorrect finalization order when Teuchos::GlobalMPISession is used. In typical Trilinos applications, GlobalMPISession is instantiated on the stack in main(). Its destructor, which calls MPI_Finalize(), will execute when main() returns, which is before any functions registered with std::atexit() are called. This results in Kokkos::finalize() being called after MPI_Finalize(). This is generally considered unsafe in MPI environments and can lead to crashes or undefined behavior, especially with MPI-aware Kokkos backends or when Kokkos cleanup tasks (such as releasing memory in certain execution spaces) depend on a valid MPI state.

}
args_c.push_back(nullptr);

Kokkos::initialize(narg, narg == 0 ? nullptr : args_c.data());
checkOldCudaLaunchBlocking();
// Add Kokkos calls to the TimeMonitor if the environment says so
Tpetra::Details::AddKokkosDeepCopyToTimeMonitor();
Tpetra::Details::AddKokkosFenceToTimeMonitor();
Tpetra::Details::AddKokkosFunctionsToTimeMonitor();
return 1;
}();

std::atexit(finalizeKokkosIfNeeded);
}
// Add Kokkos calls to the TimeMonitor if the environment says so
Tpetra::Details::AddKokkosDeepCopyToTimeMonitor();
Tpetra::Details::AddKokkosFenceToTimeMonitor();
Tpetra::Details::AddKokkosFunctionsToTimeMonitor();
TEUCHOS_TEST_FOR_EXCEPTION(!Kokkos::is_initialized() || initialized != 1, std::runtime_error,
"Tpetra::Details::initializeKokkos: Initialization failed or Kokkos has already been finalized.");
}

} // namespace Details
Expand Down
Loading