Skip to content

Commit e6126ee

Browse files
committed
Fix OWN_GIL safety issues: mutex leak, ABBA deadlock, dangling env
- Fix mutex leak in erlang_module_free: always destroy async_futures_mutex regardless of pipe_initialized flag since mutex is always initialized - Fix ABBA deadlock in event_loop_down and event_loop_destructor: acquire GIL before namespaces_mutex to match normal execution path lock ordering - Add interp_id validation in owngil_execute_*_with_env functions to detect env resources from wrong interpreter, preventing dangling pointer access - Document OWN_GIL callback re-entry limitation: erlang.call() uses thread_worker_call rather than suspension/resume protocol
1 parent 9c5435c commit e6126ee

3 files changed

Lines changed: 120 additions & 30 deletions

File tree

c_src/py_callback.c

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1666,6 +1666,13 @@ static PyObject *erlang_call_impl(PyObject *self, PyObject *args) {
16661666
* 2. tl_current_context with callback_handler (old blocking pipe mode)
16671667
* 3. tl_current_worker (legacy worker API)
16681668
* 4. thread_worker_call (spawned threads)
1669+
*
1670+
* NOTE: In OWN_GIL mode, erlang.call() goes through thread_worker_call()
1671+
* rather than using suspension/resume. This is because OWN_GIL contexts
1672+
* bypass the suspension protocol - the dedicated pthread that owns the GIL
1673+
* cannot be suspended. As a result, the call executes on a different
1674+
* context/interpreter (the thread worker), not the calling OWN_GIL context.
1675+
* Re-entrant calls back to the same OWN_GIL context are not supported.
16691676
*/
16701677
bool has_context_suspension = (tl_current_context != NULL && tl_allow_suspension);
16711678
bool has_context_handler = (tl_current_context != NULL && tl_current_context->has_callback_handler);
@@ -1678,6 +1685,7 @@ static PyObject *erlang_call_impl(PyObject *self, PyObject *args) {
16781685
* - threading.Thread instances
16791686
* - concurrent.futures.ThreadPoolExecutor workers
16801687
* - Any other Python threads
1688+
* - OWN_GIL contexts (which don't support suspension)
16811689
*/
16821690
Py_ssize_t nargs = PyTuple_Size(args);
16831691
if (nargs < 1) {
@@ -2783,10 +2791,9 @@ static void erlang_module_free(void *module) {
27832791
Py_XDECREF(state->async_pending_futures);
27842792
state->async_pending_futures = NULL;
27852793

2786-
if (state->pipe_initialized) {
2787-
pthread_mutex_destroy(&state->async_futures_mutex);
2788-
state->pipe_initialized = false;
2789-
}
2794+
/* Always destroy mutex - it was always initialized in create_erlang_module */
2795+
pthread_mutex_destroy(&state->async_futures_mutex);
2796+
state->pipe_initialized = false;
27902797
}
27912798

27922799
/* Module definition */

c_src/py_event_loop.c

Lines changed: 76 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -465,29 +465,51 @@ void event_loop_destructor(ErlNifEnv *env, void *obj) {
465465
loop->msg_env = NULL;
466466
}
467467

468-
/* Clean up per-process namespaces.
469-
* Note: Same leak-vs-crash tradeoff as above. If we can't safely
470-
* acquire the GIL, we skip Py_XDECREF and accept leaking the Python
471-
* dict objects. The native namespace struct is always freed. */
472-
pthread_mutex_lock(&loop->namespaces_mutex);
473-
process_namespace_t *ns = loop->namespaces_head;
474-
while (ns != NULL) {
475-
process_namespace_t *next = ns->next;
476-
/* Only cleanup Python objects if runtime is still running */
477-
if (runtime_is_running() && loop->interp_id == 0 &&
478-
PyGILState_GetThisThreadState() == NULL &&
479-
!PyGILState_Check()) {
480-
PyGILState_STATE gstate = PyGILState_Ensure();
468+
/*
469+
* Clean up per-process namespaces.
470+
*
471+
* Lock ordering: GIL first, then namespaces_mutex (consistent with normal path).
472+
* This prevents ABBA deadlock with execution paths that acquire GIL then mutex.
473+
*
474+
* For subinterpreters (interp_id != 0), we can't use PyGILState_Ensure.
475+
* Just free the native structs without Py_DECREF - Python objects will be
476+
* cleaned up when the interpreter is destroyed.
477+
*/
478+
if (runtime_is_running() && loop->interp_id == 0 &&
479+
PyGILState_GetThisThreadState() == NULL &&
480+
!PyGILState_Check()) {
481+
/* Main interpreter: GIL first, then mutex */
482+
PyGILState_STATE gstate = PyGILState_Ensure();
483+
pthread_mutex_lock(&loop->namespaces_mutex);
484+
485+
process_namespace_t *ns = loop->namespaces_head;
486+
while (ns != NULL) {
487+
process_namespace_t *next = ns->next;
481488
Py_XDECREF(ns->globals);
482489
Py_XDECREF(ns->locals);
483490
Py_XDECREF(ns->module_cache);
484-
PyGILState_Release(gstate);
491+
enif_free(ns);
492+
ns = next;
485493
}
486-
enif_free(ns);
487-
ns = next;
494+
loop->namespaces_head = NULL;
495+
496+
pthread_mutex_unlock(&loop->namespaces_mutex);
497+
PyGILState_Release(gstate);
498+
} else {
499+
/* Subinterpreter or runtime not running: just free structs */
500+
pthread_mutex_lock(&loop->namespaces_mutex);
501+
502+
process_namespace_t *ns = loop->namespaces_head;
503+
while (ns != NULL) {
504+
process_namespace_t *next = ns->next;
505+
/* Skip Py_XDECREF - can't safely acquire GIL */
506+
enif_free(ns);
507+
ns = next;
508+
}
509+
loop->namespaces_head = NULL;
510+
511+
pthread_mutex_unlock(&loop->namespaces_mutex);
488512
}
489-
loop->namespaces_head = NULL;
490-
pthread_mutex_unlock(&loop->namespaces_mutex);
491513
pthread_mutex_destroy(&loop->namespaces_mutex);
492514

493515
/* Destroy synchronization primitives */
@@ -616,13 +638,45 @@ void timer_resource_destructor(ErlNifEnv *env, void *obj) {
616638
* @brief Down callback for event loop resources (process monitor)
617639
*
618640
* Called when a monitored process dies. Cleans up the process's namespace.
641+
*
642+
* Lock ordering: GIL first, then namespaces_mutex (consistent with normal path)
619643
*/
620644
void event_loop_down(ErlNifEnv *env, void *obj, ErlNifPid *pid,
621645
ErlNifMonitor *mon) {
622646
(void)env;
623647
(void)mon;
624648
erlang_event_loop_t *loop = (erlang_event_loop_t *)obj;
625649

650+
/*
651+
* For subinterpreters (interp_id != 0), we can't use PyGILState_Ensure.
652+
* Just remove from the list without Py_DECREF - the Python objects will
653+
* be cleaned up when the interpreter is destroyed.
654+
*/
655+
if (!runtime_is_running() || loop->interp_id != 0) {
656+
pthread_mutex_lock(&loop->namespaces_mutex);
657+
658+
process_namespace_t **pp = &loop->namespaces_head;
659+
while (*pp != NULL) {
660+
if (enif_compare_pids(&(*pp)->owner_pid, pid) == 0) {
661+
process_namespace_t *to_free = *pp;
662+
*pp = to_free->next;
663+
/* Skip Py_XDECREF - can't safely acquire GIL for subinterp */
664+
enif_free(to_free);
665+
break;
666+
}
667+
pp = &(*pp)->next;
668+
}
669+
670+
pthread_mutex_unlock(&loop->namespaces_mutex);
671+
return;
672+
}
673+
674+
/*
675+
* For main interpreter: acquire GIL FIRST to maintain consistent lock
676+
* ordering with the normal execution path (which acquires GIL, then mutex).
677+
* This prevents ABBA deadlock.
678+
*/
679+
PyGILState_STATE gstate = PyGILState_Ensure();
626680
pthread_mutex_lock(&loop->namespaces_mutex);
627681

628682
/* Find and remove namespace for this pid */
@@ -632,14 +686,9 @@ void event_loop_down(ErlNifEnv *env, void *obj, ErlNifPid *pid,
632686
process_namespace_t *to_free = *pp;
633687
*pp = to_free->next;
634688

635-
/* Must hold GIL to free Python objects */
636-
if (runtime_is_running() && loop->interp_id == 0) {
637-
PyGILState_STATE gstate = PyGILState_Ensure();
638-
Py_XDECREF(to_free->globals);
639-
Py_XDECREF(to_free->locals);
640-
Py_XDECREF(to_free->module_cache);
641-
PyGILState_Release(gstate);
642-
}
689+
Py_XDECREF(to_free->globals);
690+
Py_XDECREF(to_free->locals);
691+
Py_XDECREF(to_free->module_cache);
643692

644693
enif_free(to_free);
645694
break;
@@ -648,6 +697,7 @@ void event_loop_down(ErlNifEnv *env, void *obj, ErlNifPid *pid,
648697
}
649698

650699
pthread_mutex_unlock(&loop->namespaces_mutex);
700+
PyGILState_Release(gstate);
651701
}
652702

653703
/**

c_src/py_nif.c

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2775,6 +2775,17 @@ static void owngil_execute_exec_with_env(py_context_t *ctx) {
27752775
return;
27762776
}
27772777

2778+
/* Verify interpreter ownership - prevent dangling pointer access.
2779+
* Compare env's interp_id with the current Python interpreter's ID. */
2780+
PyInterpreterState *current_interp = PyInterpreterState_Get();
2781+
if (current_interp != NULL && penv->interp_id != PyInterpreterState_GetID(current_interp)) {
2782+
ctx->response_term = enif_make_tuple2(ctx->shared_env,
2783+
enif_make_atom(ctx->shared_env, "error"),
2784+
enif_make_atom(ctx->shared_env, "env_wrong_interpreter"));
2785+
ctx->response_ok = false;
2786+
return;
2787+
}
2788+
27782789
ErlNifBinary code_bin;
27792790
if (!enif_inspect_binary(ctx->shared_env, ctx->request_term, &code_bin)) {
27802791
ctx->response_term = enif_make_tuple2(ctx->shared_env,
@@ -2841,6 +2852,17 @@ static void owngil_execute_eval_with_env(py_context_t *ctx) {
28412852
return;
28422853
}
28432854

2855+
/* Verify interpreter ownership - prevent dangling pointer access.
2856+
* Compare env's interp_id with the current Python interpreter's ID. */
2857+
PyInterpreterState *current_interp = PyInterpreterState_Get();
2858+
if (current_interp != NULL && penv->interp_id != PyInterpreterState_GetID(current_interp)) {
2859+
ctx->response_term = enif_make_tuple2(ctx->shared_env,
2860+
enif_make_atom(ctx->shared_env, "error"),
2861+
enif_make_atom(ctx->shared_env, "env_wrong_interpreter"));
2862+
ctx->response_ok = false;
2863+
return;
2864+
}
2865+
28442866
/* Decode request: {Code, Locals} */
28452867
const ERL_NIF_TERM *tuple_terms;
28462868
int tuple_arity;
@@ -2933,6 +2955,17 @@ static void owngil_execute_call_with_env(py_context_t *ctx) {
29332955
return;
29342956
}
29352957

2958+
/* Verify interpreter ownership - prevent dangling pointer access.
2959+
* Compare env's interp_id with the current Python interpreter's ID. */
2960+
PyInterpreterState *current_interp = PyInterpreterState_Get();
2961+
if (current_interp != NULL && penv->interp_id != PyInterpreterState_GetID(current_interp)) {
2962+
ctx->response_term = enif_make_tuple2(ctx->shared_env,
2963+
enif_make_atom(ctx->shared_env, "error"),
2964+
enif_make_atom(ctx->shared_env, "env_wrong_interpreter"));
2965+
ctx->response_ok = false;
2966+
return;
2967+
}
2968+
29362969
/* Decode request from shared_env: {Module, Func, Args, Kwargs} */
29372970
ERL_NIF_TERM module_term, func_term, args_term, kwargs_term;
29382971
const ERL_NIF_TERM *tuple_terms;

0 commit comments

Comments
 (0)