Skip to content

Commit 312fc25

Browse files
jbachorikclaude
andauthored
fix(sigaction): prevent infinite loop in signal handler chaining (#437)
* fix(sigaction): prevent infinite loop in signal handler chaining When intercepting sigaction calls from other libraries (e.g., wasmtime), we were returning our handler as oldact. This caused infinite loops: profiler -> wasmtime -> profiler -> wasmtime -> ... Fix: Save original (JVM's) handlers in protectSignalHandlers() BEFORE installing ours, then return those saved handlers as oldact. Now the chain is: profiler -> wasmtime -> JVM Also: - Add sigaction_interception_ut.cpp test to catch this bug - Wire gtest to run before Java tests in testdebug - Remove broken test_tlsPriming.cpp (referenced removed APIs) - Add getSigactionHook() stub for macOS Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * fix: address PR review comments - Add missing <cstring> include for memset - Fix comment to match int return type (0 = not handled, non-zero = handled) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * fix: close query-only sigaction loop and fix debug-mode SIGABRT Extend sigaction interception to cover query-only calls (sigaction(SIGSEGV/SIGBUS, nullptr, &oldact)): return the saved JVM handler instead of ours, so callers that store oldact and later chain to it don't loop back into our handler. Fix intermittent SIGABRT in debug builds on aarch64 GraalVM: extend crashProtectionActive() with a VMThread::isExceptionActive() fallback so the cast_to() assert no longer fires for threads without ProfiledThread TLS when setjmp crash protection is already active in walkVM. Add OS::resetSignalHandlersForTesting() to prevent static state from leaking between sigaction interception unit tests. Add ASCII flow diagram to sigaction_hook documenting the full handler chain and interception cases. Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1 parent f34b274 commit 312fc25

9 files changed

Lines changed: 484 additions & 356 deletions

File tree

build-logic/conventions/src/main/kotlin/com/datadoghq/profiler/ProfilerTestPlugin.kt

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -560,21 +560,22 @@ class ProfilerTestPlugin : Plugin<Project> {
560560
}
561561
}
562562

563-
// Wire up assemble/gtest dependencies
563+
// Wire up gtest -> test dependencies (C++ tests run before Java tests)
564564
project.gradle.projectsEvaluated {
565565
configNames.forEach { cfgName ->
566-
val testTask = project.tasks.findByName("test$cfgName")
566+
val capitalizedCfgName = cfgName.replaceFirstChar { it.uppercaseChar() }
567+
val testTaskName = "test$capitalizedCfgName"
568+
val testTask = project.tasks.findByName(testTaskName)
567569
val profilerLibProject = project.rootProject.findProject(profilerLibProjectPath)
568570

569-
if (profilerLibProject != null) {
570-
val assembleTask = profilerLibProject.tasks.findByName("assemble${cfgName.replaceFirstChar { it.uppercaseChar() }}")
571-
if (testTask != null && assembleTask != null) {
572-
assembleTask.dependsOn(testTask)
573-
}
574-
575-
val gtestTask = profilerLibProject.tasks.findByName("gtest${cfgName.replaceFirstChar { it.uppercaseChar() }}")
576-
if (testTask != null && gtestTask != null) {
571+
if (profilerLibProject != null && testTask != null) {
572+
// gtest runs before test (C++ unit tests run before Java integration tests)
573+
val gtestTaskName = "gtest${capitalizedCfgName}"
574+
try {
575+
val gtestTask = profilerLibProject.tasks.named(gtestTaskName)
577576
testTask.dependsOn(gtestTask)
577+
} catch (e: org.gradle.api.UnknownTaskException) {
578+
project.logger.info("Task $gtestTaskName not found in $profilerLibProjectPath - gtest may not be available")
578579
}
579580
}
580581
}

ddprof-lib/src/main/cpp/os.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@ class OS {
157157
static SigAction getSegvChainTarget();
158158
static SigAction getBusChainTarget();
159159
static void* getSigactionHook();
160+
static void resetSignalHandlersForTesting();
160161

161162
static int getMaxThreadId(int floor) {
162163
int maxThreadId = getMaxThreadId();

ddprof-lib/src/main/cpp/os_linux.cpp

Lines changed: 108 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -739,6 +739,11 @@ static SigAction _protected_bus_handler = nullptr;
739739
static volatile SigAction _segv_chain_target = nullptr;
740740
static volatile SigAction _bus_chain_target = nullptr;
741741

742+
// Original handlers (JVM's) saved before we install ours - used for oldact in sigaction hook
743+
static struct sigaction _orig_segv_sigaction;
744+
static struct sigaction _orig_bus_sigaction;
745+
static bool _orig_handlers_saved = false;
746+
742747
// Real sigaction function pointer (resolved via dlsym)
743748
typedef int (*real_sigaction_t)(int, const struct sigaction*, struct sigaction*);
744749
static real_sigaction_t _real_sigaction = nullptr;
@@ -748,6 +753,14 @@ void OS::protectSignalHandlers(SigAction segvHandler, SigAction busHandler) {
748753
if (_real_sigaction == nullptr) {
749754
_real_sigaction = (real_sigaction_t)dlsym(RTLD_DEFAULT, "sigaction");
750755
}
756+
// Save the current (JVM's) signal handlers BEFORE we install ours.
757+
// These will be returned as oldact when we intercept other libraries' sigaction calls,
758+
// so they chain to JVM instead of back to us (which would cause infinite loops).
759+
if (!__atomic_load_n(&_orig_handlers_saved, __ATOMIC_ACQUIRE) && _real_sigaction != nullptr) {
760+
_real_sigaction(SIGSEGV, nullptr, &_orig_segv_sigaction);
761+
_real_sigaction(SIGBUS, nullptr, &_orig_bus_sigaction);
762+
__atomic_store_n(&_orig_handlers_saved, true, __ATOMIC_RELEASE);
763+
}
751764
_protected_segv_handler = segvHandler;
752765
_protected_bus_handler = busHandler;
753766
}
@@ -760,7 +773,67 @@ SigAction OS::getBusChainTarget() {
760773
return __atomic_load_n(&_bus_chain_target, __ATOMIC_ACQUIRE);
761774
}
762775

763-
// sigaction hook - called via GOT patching to intercept sigaction calls
776+
// sigaction_hook - intercepts sigaction(2) calls from any library via GOT patching.
777+
//
778+
// PROBLEM SOLVED
779+
// ==============
780+
// Without interception, a library (e.g. wasmtime) can overwrite our SIGSEGV handler:
781+
//
782+
// Before: kernel --> our_handler --> JVM_handler
783+
// After lib calls sigaction(SIGSEGV, lib_handler, &oldact):
784+
// kernel --> lib_handler
785+
// lib_handler stores oldact = our_handler as its chain target
786+
// => when lib chains on unhandled fault: lib_handler --> our_handler --> lib_handler --> ...
787+
// INFINITE LOOP
788+
//
789+
// HANDLER CHAIN AFTER SETUP
790+
// ==========================
791+
//
792+
// protectSignalHandlers() replaceSigsegvHandler() LibraryPatcher::patch_sigaction()
793+
// | | |
794+
// v v v
795+
// save JVM handler install our_handler GOT-patch sigaction
796+
// into _orig_segv_sigaction as real OS handler => all future sigaction()
797+
// calls go through us
798+
//
799+
// Signal delivery chain:
800+
//
801+
// kernel
802+
// |
803+
// v
804+
// our_handler (installed via replaceSigsegvHandler, never displaced)
805+
// |
806+
// +-- handled by us? --> done
807+
// |
808+
// v (not handled)
809+
// _segv_chain_target (lib_handler, set when we intercepted lib's sigaction call)
810+
// |
811+
// +-- handled by lib? --> done
812+
// |
813+
// v (lib chains to its saved oldact)
814+
// _orig_segv_sigaction (JVM's original handler, what we returned as oldact to lib)
815+
// |
816+
// v
817+
// JVM handles or terminates
818+
//
819+
// INTERCEPTION LOGIC (this function)
820+
// ===================================
821+
// Case 1 - Install call [act != nullptr, SA_SIGINFO]:
822+
// - Save lib's handler as _segv_chain_target (we'll call it if we can't handle)
823+
// - Return _orig_segv_sigaction as oldact (NOT our handler, to break the loop)
824+
// - Do NOT actually install lib's handler (keep ours on top)
825+
//
826+
// Case 2 - Query-only call [act == nullptr, oldact != nullptr]:
827+
// - Return _orig_segv_sigaction as oldact (same reason: lib must not see our handler)
828+
// - A lib that queries, stores the result, then uses it as a chain target would
829+
// loop if we returned our handler here.
830+
//
831+
// Case 3 - 1-arg handler [act != nullptr, no SA_SIGINFO]:
832+
// - Pass through: we cannot safely chain 1-arg handlers (different calling convention)
833+
//
834+
// Case 4 - Any other signal, or protection not yet active:
835+
// - Pass through to real sigaction unchanged.
836+
//
764837
static int sigaction_hook(int signum, const struct sigaction* act, struct sigaction* oldact) {
765838
// _real_sigaction must be resolved before any GOT patching happens
766839
if (_real_sigaction == nullptr) {
@@ -769,37 +842,53 @@ static int sigaction_hook(int signum, const struct sigaction* act, struct sigact
769842
}
770843

771844
// If this is SIGSEGV or SIGBUS and we have protected handlers installed,
772-
// intercept the call to keep our handler on top
773-
if (act != nullptr) {
774-
if (signum == SIGSEGV && _protected_segv_handler != nullptr) {
775-
// Only intercept SA_SIGINFO handlers (3-arg form) for safe chaining
845+
// intercept the call to keep our handler on top.
846+
// We intercept both install calls (act != nullptr) and query-only calls (act == nullptr)
847+
// to ensure callers always see the JVM's original handler, never ours.
848+
// A caller that gets our handler as oldact and later chains to it would cause an
849+
// infinite loop: us -> them -> us -> ...
850+
if (signum == SIGSEGV && _protected_segv_handler != nullptr) {
851+
if (act != nullptr) {
852+
// Install call: only intercept SA_SIGINFO handlers (3-arg form) for safe chaining
776853
if (act->sa_flags & SA_SIGINFO) {
777854
SigAction new_handler = act->sa_sigaction;
778855
// Don't intercept if it's our own handler being installed
779856
if (new_handler != _protected_segv_handler) {
780857
// Save their handler as our chain target
781858
__atomic_exchange_n(&_segv_chain_target, new_handler, __ATOMIC_ACQ_REL);
782859
if (oldact != nullptr) {
783-
_real_sigaction(signum, nullptr, oldact);
860+
// Return the original (JVM's) handler, not ours, to prevent
861+
// the caller from chaining back to us.
862+
*oldact = _orig_segv_sigaction;
784863
}
785864
Counters::increment(SIGACTION_INTERCEPTED);
786865
// Don't actually install their handler - keep ours on top
787866
return 0;
788867
}
789868
}
790869
// Let 1-arg handlers (without SA_SIGINFO) pass through - we can't safely chain them
791-
} else if (signum == SIGBUS && _protected_bus_handler != nullptr) {
870+
} else if (oldact != nullptr) {
871+
// Query-only call: return the JVM's original handler, not ours.
872+
// Same reason: a caller that stores our handler and later chains to it causes loops.
873+
*oldact = _orig_segv_sigaction;
874+
return 0;
875+
}
876+
} else if (signum == SIGBUS && _protected_bus_handler != nullptr) {
877+
if (act != nullptr) {
792878
if (act->sa_flags & SA_SIGINFO) {
793879
SigAction new_handler = act->sa_sigaction;
794880
if (new_handler != _protected_bus_handler) {
795881
__atomic_exchange_n(&_bus_chain_target, new_handler, __ATOMIC_ACQ_REL);
796882
if (oldact != nullptr) {
797-
_real_sigaction(signum, nullptr, oldact);
883+
*oldact = _orig_bus_sigaction;
798884
}
799885
Counters::increment(SIGACTION_INTERCEPTED);
800886
return 0;
801887
}
802888
}
889+
} else if (oldact != nullptr) {
890+
*oldact = _orig_bus_sigaction;
891+
return 0;
803892
}
804893
}
805894

@@ -811,4 +900,15 @@ void* OS::getSigactionHook() {
811900
return (void*)sigaction_hook;
812901
}
813902

903+
void OS::resetSignalHandlersForTesting() {
904+
__atomic_store_n(&_orig_handlers_saved, false, __ATOMIC_RELEASE);
905+
memset(&_orig_segv_sigaction, 0, sizeof(_orig_segv_sigaction));
906+
memset(&_orig_bus_sigaction, 0, sizeof(_orig_bus_sigaction));
907+
_protected_segv_handler = nullptr;
908+
_protected_bus_handler = nullptr;
909+
__atomic_store_n(&_segv_chain_target, (SigAction)nullptr, __ATOMIC_RELEASE);
910+
__atomic_store_n(&_bus_chain_target, (SigAction)nullptr, __ATOMIC_RELEASE);
911+
// _real_sigaction is intentionally not reset: safe to reuse across tests
912+
}
913+
814914
#endif // __linux__

ddprof-lib/src/main/cpp/os_macos.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -496,4 +496,12 @@ SigAction OS::getBusChainTarget() {
496496
return nullptr;
497497
}
498498

499+
void* OS::getSigactionHook() {
500+
return nullptr; // No sigaction interception on macOS
501+
}
502+
503+
void OS::resetSignalHandlersForTesting() {
504+
// No-op: no sigaction interception state on macOS
505+
}
506+
499507
#endif // __APPLE__

ddprof-lib/src/main/cpp/profiler.cpp

Lines changed: 59 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1044,93 +1044,116 @@ void Profiler::disableEngines() {
10441044
}
10451045

10461046
void Profiler::segvHandler(int signo, siginfo_t *siginfo, void *ucontext) {
1047-
if (!crashHandler(signo, siginfo, ucontext)) {
1048-
// Check dynamic chain target first (set by intercepted sigaction calls)
1049-
SigAction chain = OS::getSegvChainTarget();
1050-
if (chain != nullptr) {
1051-
chain(signo, siginfo, ucontext);
1052-
} else if (orig_segvHandler != nullptr) {
1053-
orig_segvHandler(signo, siginfo, ucontext);
1054-
}
1047+
if (crashHandlerInternal(signo, siginfo, ucontext)) {
1048+
return; // Handled
1049+
}
1050+
// Not handled, chain to next handler
1051+
SigAction chain = OS::getSegvChainTarget();
1052+
if (chain != nullptr) {
1053+
chain(signo, siginfo, ucontext);
1054+
} else if (orig_segvHandler != nullptr) {
1055+
orig_segvHandler(signo, siginfo, ucontext);
10551056
}
10561057
}
10571058

10581059
void Profiler::busHandler(int signo, siginfo_t *siginfo, void *ucontext) {
1059-
if (!crashHandler(signo, siginfo, ucontext)) {
1060-
// Check dynamic chain target first (set by intercepted sigaction calls)
1061-
SigAction chain = OS::getBusChainTarget();
1062-
if (chain != nullptr) {
1063-
chain(signo, siginfo, ucontext);
1064-
} else if (orig_busHandler != nullptr) {
1065-
orig_busHandler(signo, siginfo, ucontext);
1066-
}
1060+
if (crashHandlerInternal(signo, siginfo, ucontext)) {
1061+
return; // Handled
1062+
}
1063+
// Not handled, chain to next handler
1064+
SigAction chain = OS::getBusChainTarget();
1065+
if (chain != nullptr) {
1066+
chain(signo, siginfo, ucontext);
1067+
} else if (orig_busHandler != nullptr) {
1068+
orig_busHandler(signo, siginfo, ucontext);
10671069
}
10681070
}
10691071

1070-
bool Profiler::crashHandler(int signo, siginfo_t *siginfo, void *ucontext) {
1072+
// Returns: 0 = not handled (chain to next handler), non-zero = handled
1073+
int Profiler::crashHandlerInternal(int signo, siginfo_t *siginfo, void *ucontext) {
10711074
ProfiledThread* thrd = ProfiledThread::currentSignalSafe();
1072-
if (thrd != nullptr && !thrd->enterCrashHandler()) {
1073-
// we are already in a crash handler; don't recurse!
1074-
return false;
1075-
}
10761075

1076+
// First, try to handle safefetch - this doesn't need TLS or any protection
1077+
// because it directly checks the PC and modifies ucontext to skip the fault.
1078+
// This must be checked first before any reentrancy checks.
10771079
if (SafeAccess::handle_safefetch(signo, ucontext)) {
1078-
if (thrd != nullptr) {
1079-
thrd->exitCrashHandler();
1080+
return 1; // handled
1081+
}
1082+
1083+
// Reentrancy protection: use TLS-based tracking if available.
1084+
// If TLS is not available, we can only safely handle faults that we can
1085+
// prove are from our protected code paths (checked via sameStack heuristic
1086+
// in StackWalker::checkFault). For anything else, we must chain immediately
1087+
// to avoid claiming faults that aren't ours.
1088+
bool have_tls_protection = false;
1089+
if (thrd != nullptr) {
1090+
if (!thrd->enterCrashHandler()) {
1091+
// we are already in a crash handler; don't recurse!
1092+
return 0; // not handled, safe to chain
10801093
}
1081-
return true;
1094+
have_tls_protection = true;
10821095
}
1096+
// If thrd == nullptr, we proceed but with limited handling capability.
1097+
// Only StackWalker::checkFault (which has its own sameStack fallback)
1098+
// and the JDK-8313796 workaround can safely handle faults without TLS.
10831099

1084-
uintptr_t fault_address = (uintptr_t)siginfo->si_addr;
10851100
StackFrame frame(ucontext);
10861101
uintptr_t pc = frame.pc();
1102+
1103+
uintptr_t fault_address = (uintptr_t)siginfo->si_addr;
10871104
if (pc == fault_address) {
10881105
// it is 'pc' that is causing the fault; can not access it safely
1089-
if (thrd != nullptr) {
1106+
if (have_tls_protection) {
10901107
thrd->exitCrashHandler();
10911108
}
1092-
return false;
1109+
return 0; // not handled, safe to chain
10931110
}
10941111

10951112
if (WX_MEMORY && Trap::isFaultInstruction(pc)) {
1096-
if (thrd != nullptr) {
1113+
if (have_tls_protection) {
10971114
thrd->exitCrashHandler();
10981115
}
1099-
return true;
1116+
return 1; // handled
11001117
}
11011118

11021119
if (VM::isHotspot()) {
11031120
// the following checks require vmstructs and therefore HotSpot
11041121

1122+
// StackWalker::checkFault has its own fallback for when TLS is unavailable:
1123+
// it uses sameStack() heuristic to check if we're in a protected stack walk.
1124+
// If the fault is from our protected walk, it will longjmp and never return.
1125+
// If it returns, the fault wasn't from our code.
11051126
StackWalker::checkFault(thrd);
11061127

11071128
// Workaround for JDK-8313796 if needed. Setting cstack=dwarf also helps
11081129
if (_need_JDK_8313796_workaround &&
11091130
VMStructs::isInterpretedFrameValidFunc((const void *)pc) &&
11101131
frame.skipFaultInstruction()) {
1111-
if (thrd != nullptr) {
1132+
if (have_tls_protection) {
11121133
thrd->exitCrashHandler();
11131134
}
1114-
return true;
1135+
return 1; // handled
11151136
}
11161137
}
11171138

1118-
if (thrd != nullptr) {
1139+
if (have_tls_protection) {
11191140
thrd->exitCrashHandler();
11201141
}
1121-
return false;
1142+
return 0; // not handled, safe to chain
11221143
}
11231144

11241145
void Profiler::setupSignalHandlers() {
11251146
// Do not re-run the signal setup (run only when VM has not been loaded yet)
11261147
if (__sync_bool_compare_and_swap(&_signals_initialized, false, true)) {
11271148
if (VM::isHotspot() || VM::isOpenJ9()) {
11281149
// HotSpot and J9 tolerate interposed SIGSEGV/SIGBUS handler; other JVMs probably not
1150+
// IMPORTANT: protectSignalHandlers must be called BEFORE replaceSigsegvHandler so that
1151+
// the original (JVM's) handlers are saved before we install ours. This way, when we
1152+
// intercept other libraries' sigaction calls and return oldact, we return the JVM's
1153+
// handler (not ours), preventing infinite chaining loops.
1154+
OS::protectSignalHandlers(segvHandler, busHandler);
11291155
orig_segvHandler = OS::replaceSigsegvHandler(segvHandler);
11301156
orig_busHandler = OS::replaceSigbusHandler(busHandler);
1131-
// Protect our handlers from being overwritten by other libraries (e.g., wasmtime).
1132-
// Their handlers will be stored as chain targets and called from our handlers.
1133-
OS::protectSignalHandlers(segvHandler, busHandler);
11341157
// Patch sigaction GOT in libraries with broken signal handlers (already loaded)
11351158
LibraryPatcher::patch_sigaction();
11361159
}

ddprof-lib/src/main/cpp/profiler.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ class alignas(alignof(SpinLock)) Profiler {
193193
void lockAll();
194194
void unlockAll();
195195

196-
static bool crashHandler(int signo, siginfo_t *siginfo, void *ucontext);
196+
static int crashHandlerInternal(int signo, siginfo_t *siginfo, void *ucontext);
197197
static void check_JDK_8313796_workaround();
198198

199199
static Profiler *const _instance;

0 commit comments

Comments
 (0)