Skip to content

Commit 2c0879d

Browse files
authored
feat: Adding signal and external error detection & output (#3856)
- output the error message & context reliably in the log, even if stderr gets lost or is used for another purpose, - be sure to detect any kernel / system allocator errors and add the stack trace of these errors, - factorize them with external tools / scripts, thus highlighting which rank(s) are the source of the issue, - also prevents the stack trace from being cut by other ranks' messages, which could previously happen on a signal.
1 parent 4103efd commit 2c0879d

File tree

7 files changed

+699
-23
lines changed

7 files changed

+699
-23
lines changed

src/coreComponents/common/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ set( common_headers
3131
format/LogPart.hpp
3232
format/Format.hpp
3333
format/StringUtilities.hpp
34-
logger/Logger.hpp
3534
BufferAllocator.hpp
3635
DataLayouts.hpp
3736
DataTypes.hpp
@@ -41,6 +40,7 @@ set( common_headers
4140
MemoryInfos.hpp
4241
logger/Logger.hpp
4342
logger/ErrorHandling.hpp
43+
logger/ExternalErrorHandler.hpp
4444
MpiWrapper.hpp
4545
Path.hpp
4646
Span.hpp
@@ -77,6 +77,7 @@ set( common_sources
7777
format/StringUtilities.cpp
7878
logger/Logger.cpp
7979
logger/ErrorHandling.cpp
80+
logger/ExternalErrorHandler.cpp
8081
BufferAllocator.cpp
8182
MemoryInfos.cpp
8283
MpiWrapper.cpp

src/coreComponents/common/initializeEnvironment.cpp

Lines changed: 84 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
#include "LvArray/src/system.hpp"
2121
#include "common/LifoStorageCommon.hpp"
2222
#include "common/MemoryInfos.hpp"
23+
#include "logger/ErrorHandling.hpp"
24+
#include "logger/ExternalErrorHandler.hpp"
2325
#include <umpire/TypedAllocator.hpp>
2426
// TPL includes
2527
#include <umpire/ResourceManager.hpp>
@@ -66,6 +68,88 @@ void setupLogger()
6668
#else
6769
logger::InitializeLogger();
6870
#endif
71+
72+
{ // setup error handling (using LvArray helper system functions)
73+
using ErrorContext = ErrorLogger::ErrorContext;
74+
75+
///// set Post-Handled Error behaviour /////
76+
LvArray::system::setErrorHandler( []()
77+
{
78+
#if defined( GEOS_USE_MPI )
79+
int mpi = 0;
80+
MPI_Initialized( &mpi );
81+
if( mpi )
82+
{
83+
MPI_Abort( MPI_COMM_WORLD, EXIT_FAILURE );
84+
}
85+
#endif
86+
std::abort();
87+
} );
88+
89+
///// set external error handling behaviour /////
90+
ExternalErrorHandler::instance().setErrorHandling( []( string_view errorMsg,
91+
string_view detectionLocation )
92+
{
93+
std::string const stackHistory = LvArray::system::stackTrace( true );
94+
95+
GEOS_LOG( GEOS_FMT( "***** ERROR\n"
96+
"***** LOCATION: (external error, detected {})\n"
97+
"{}\n{}",
98+
detectionLocation, errorMsg, stackHistory ) );
99+
if( ErrorLogger::global().isOutputFileEnabled() )
100+
{
101+
ErrorLogger::ErrorMsg error;
102+
error.setType( ErrorLogger::MsgType::Error );
103+
error.addToMsg( errorMsg );
104+
error.addRank( ::geos::logger::internal::g_rank );
105+
error.addCallStackInfo( stackHistory );
106+
error.addContextInfo(
107+
ErrorContext{ { { ErrorContext::Attribute::DetectionLoc, string( detectionLocation ) } } } );
108+
109+
ErrorLogger::global().flushErrorMsg( error );
110+
}
111+
112+
// we do not terminate the program as 1. the error could be non-fatal, 2. there may be more messages to output.
113+
} );
114+
ExternalErrorHandler::instance().enableStderrPipeDeviation( true );
115+
116+
///// set signal handling behaviour /////
117+
LvArray::system::setSignalHandling( []( int const signal )
118+
{
119+
// Disable signal handling to prevent catching exit signal (infinite loop)
120+
LvArray::system::setSignalHandling( nullptr );
121+
122+
// first of all, external errors may still be waiting to be output: we must output them now
123+
ExternalErrorHandler::instance().flush( "before signal error output" );
124+
125+
// error message output
126+
std::string const stackHistory = LvArray::system::stackTrace( true );
127+
ErrorLogger::ErrorMsg error;
128+
error.addSignalToMsg( signal );
129+
130+
GEOS_LOG( GEOS_FMT( "***** ERROR\n"
131+
"***** SIGNAL: {}\n"
132+
"***** LOCATION: (external error, captured by signal handler)\n"
133+
"{}\n{}",
134+
signal, error.m_msg, stackHistory ) );
135+
136+
if( ErrorLogger::global().isOutputFileEnabled() )
137+
{
138+
error.setType( ErrorLogger::MsgType::Error );
139+
error.addRank( ::geos::logger::internal::g_rank );
140+
error.addCallStackInfo( stackHistory );
141+
error.addContextInfo(
142+
ErrorContext{ { { ErrorContext::Attribute::Signal, std::to_string( signal ) } }, 1 },
143+
ErrorContext{ { { ErrorContext::Attribute::DetectionLoc, string( "signal handler" ) } }, 0 } );
144+
145+
ErrorLogger::global().flushErrorMsg( error );
146+
}
147+
148+
// call program termination
149+
LvArray::system::callErrorHandler();
150+
} );
151+
152+
}
69153
}
70154

71155
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -77,21 +161,6 @@ void finalizeLogger()
77161
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
78162
void setupLvArray()
79163
{
80-
LvArray::system::setErrorHandler( []()
81-
{
82-
#if defined( GEOS_USE_MPI )
83-
int mpi = 0;
84-
MPI_Initialized( &mpi );
85-
if( mpi )
86-
{
87-
MPI_Abort( MPI_COMM_WORLD, EXIT_FAILURE );
88-
}
89-
#endif
90-
std::abort();
91-
} );
92-
93-
LvArray::system::setSignalHandling( []( int const signal ) { LvArray::system::stackTraceHandler( signal, true ); } );
94-
95164
#if defined(GEOS_USE_FPE)
96165
LvArray::system::setFPE();
97166
#else

src/coreComponents/common/logger/ErrorHandling.cpp

Lines changed: 48 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,11 @@
2525
#include <regex>
2626
#include <string_view>
2727

28+
// signal management
29+
#include <csignal>
30+
#include <cfenv>
31+
#include <cstring>
32+
2833
namespace geos
2934
{
3035
static constexpr std::string_view g_level1Start = " - ";
@@ -68,7 +73,9 @@ std::string ErrorLogger::ErrorContext::attributeToString( ErrorLogger::ErrorCont
6873
case ErrorLogger::ErrorContext::Attribute::InputFile: return "inputFile";
6974
case ErrorLogger::ErrorContext::Attribute::InputLine: return "inputLine";
7075
case ErrorLogger::ErrorContext::Attribute::DataPath: return "dataPath";
71-
default: return "Unknown";
76+
case ErrorLogger::ErrorContext::Attribute::DetectionLoc: return "detectionLocation";
77+
case ErrorLogger::ErrorContext::Attribute::Signal: return "signal";
78+
default: return "unknown";
7279
}
7380
}
7481

@@ -98,6 +105,39 @@ ErrorLogger::ErrorMsg & ErrorLogger::ErrorMsg::addToMsg( std::string_view errorM
98105
return *this;
99106
}
100107

108+
ErrorLogger::ErrorMsg & ErrorLogger::ErrorMsg::addSignalToMsg( int sig, bool toEnd )
109+
{
110+
if( sig == SIGFPE )
111+
{
112+
std::string errorMsg = "Floating point error encountered: \n";
113+
114+
if( std::fetestexcept( FE_DIVBYZERO ) )
115+
errorMsg += "- Division by zero operation.\n";
116+
117+
if( std::fetestexcept( FE_INEXACT ) )
118+
errorMsg += "- Inexact result.\n";
119+
120+
if( std::fetestexcept( FE_INVALID ) )
121+
errorMsg += "- Domain error occurred in an earlier floating-point operation.\n";
122+
123+
if( std::fetestexcept( FE_OVERFLOW ) )
124+
errorMsg += "- The result of the earlier floating-point operation was too large to be representable.\n";
125+
126+
if( std::fetestexcept( FE_UNDERFLOW ) )
127+
errorMsg += "- The result of the earlier floating-point operation was subnormal with a loss of precision.\n";
128+
129+
return addToMsg( errorMsg,
130+
toEnd );
131+
}
132+
else
133+
{
134+
// standard messages
135+
return addToMsg( GEOS_FMT( "Signal no. {} encountered: {}\n",
136+
sig, ::strsignal( sig ) ),
137+
toEnd );
138+
}
139+
}
140+
101141
ErrorLogger::ErrorMsg & ErrorLogger::ErrorMsg::setCodeLocation( std::string_view msgFile, integer msgLine )
102142
{
103143
m_file = msgFile;
@@ -230,10 +270,12 @@ void ErrorLogger::flushErrorMsg( ErrorLogger::ErrorMsg & errorMsg )
230270
}
231271

232272
// Location of the error in the code
233-
yamlFile << g_level1Next << "sourceLocation:\n";
234-
yamlFile << g_level2Next << "file: " << errorMsg.m_file << "\n";
235-
yamlFile << g_level2Next << "line: " << errorMsg.m_line << "\n";
236-
273+
if( !errorMsg.m_file.empty() )
274+
{
275+
yamlFile << g_level1Next << "sourceLocation:\n";
276+
yamlFile << g_level2Next << "file: " << errorMsg.m_file << "\n";
277+
yamlFile << g_level2Next << "line: " << errorMsg.m_line << "\n";
278+
}
237279
// Information about the stack trace
238280
if( !errorMsg.m_sourceCallStack.empty() )
239281
{
@@ -249,7 +291,7 @@ void ErrorLogger::flushErrorMsg( ErrorLogger::ErrorMsg & errorMsg )
249291
yamlFile << "\n";
250292
yamlFile.flush();
251293
errorMsg = ErrorMsg();
252-
GEOS_LOG_RANK( GEOS_FMT( "The error file {} was appended.", m_filename ) );
294+
GEOS_LOG_RANK( GEOS_FMT( "The error file {} has been appended.\n", m_filename ) );
253295
}
254296
else
255297
{

src/coreComponents/common/logger/ErrorHandling.hpp

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,9 @@ class ErrorLogger
6161
{
6262
InputFile,
6363
InputLine,
64-
DataPath
64+
DataPath,
65+
DetectionLoc,
66+
Signal,
6567
};
6668

6769
/// The map contains contextual information about the error
@@ -157,6 +159,16 @@ class ErrorLogger
157159
*/
158160
ErrorMsg & addToMsg( std::string_view msg, bool toEnd = false );
159161

162+
/**
163+
* @brief Add text describing the specified signal to the error message.
164+
* - the signal can be one of the main error signals.
165+
* - if the signal is SIGFPE, the nature of floating point error will be interpreted.
166+
* @param signal The signal, from ISO C99 or POSIX standard.
167+
* @param toEnd adds the message to the end if true, at the start otherwise.
168+
* @return The instance, for builder pattern.
169+
*/
170+
ErrorMsg & addSignalToMsg( int signal, bool toEnd = false );
171+
160172
/**
161173
* @brief Set the source code location values (file and line where the error is detected)
162174
* @param msgFile Name of the source file location to add

0 commit comments

Comments
 (0)