
Commit a94d736

Optimize event loop for improved performance (#19)
* Optimize event loop for improved performance
  - Fix O(2n) list reversal in nif_get_pending using enif_make_list_from_array
  - Use bitwise AND instead of modulo for hash index calculation
  - Add single-lookup callback dispatch with _callbacks_by_cid dict
  - Implement coalesced wakeup flag to reduce condition variable signals
  - Add timer heap cleanup iteration limit (max 10 per cycle)
  - Lazy-load transport extra info (sockname/peername) on first access
  - Remove per-event stats updates from py_event_worker

* Remove redundant wakeups and optimize transport buffer
  - Remove py_nif:event_loop_wakeup calls from worker FD and timer handlers
    (already handled by coalesced wakeup in event_loop_add_pending)
  - Replace O(n) buffer deletion with O(1) offset tracking in transport
  - Use memoryview for efficient buffer slicing during writes

* Increase hash table size and add backpressure
  - Increase PENDING_HASH_SIZE from 128 to 256 for better load factor
  - Add backpressure check before acquiring lock in event_loop_add_pending
  - Drop events when queue exceeds MAX_PENDING_EVENTS (256)

* Add benchmark scripts for event loop performance testing
  - run_benchmark.erl: Erlang module for callback and TCP benchmarks
  - simple_bench.py: Simple callback dispatch benchmark
  - io_bench.py: TCP echo I/O benchmark

* Optimize pool and router for event loop performance
  - Pool: Create independent loops instead of duplicating the same reference
  - Pool: Use tuple for O(1) element access instead of lists:nth
  - Pool: Each loop gets its own worker for true parallelism
  - Router: Use combined handle_fd_event_and_reselect NIF call
  - Router: Add message_queue_data off_heap for large queues

* Cache reactor callables for faster hot path
  - Add global cache for erlang.reactor module and callbacks
  - Cache on_read_ready and on_write_ready function references
  - Use PyObject_CallFunctionObjArgs instead of PyObject_CallMethod
  - Avoids PyImport and attribute lookup on every read/write callback
  - Thread-safe lazy initialization with double-checked locking

* Add callable caching and simplify ASGI/WSGI code
  - Add per-interpreter callable cache to avoid per-request module imports
    - Cache module/callable for ASGI apps
    - Cache runner module and _run_asgi_sync/_run_wsgi_sync functions
    - ~3-5µs saved per request on cache hit
  - Remove scope template cache (~200 lines)
    - Thread-local 64-entry cache added complexity for marginal gain
    - Direct scope building with interned keys is fast enough
  - Remove lazy header conversion (~450 lines)
    - LazyHeaderList type and iterator removed
    - Always use eager header conversion
    - Simpler code path; frameworks typically access headers eagerly
  - Update documentation to reflect new optimization strategy

Net reduction: ~566 lines of code
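The O(1) offset-tracking change in the transport lives in Python (memoryview slicing), but the idea can be sketched in C with hypothetical names: instead of deleting consumed bytes from the front of the buffer on every partial write, keep a read offset and reset only when the buffer fully drains.

```c
#include <assert.h>
#include <stddef.h>
#include <string.h>

/* Illustrative write buffer: a read offset replaces per-write front
 * deletion (which would cost an O(n) memmove on every partial flush).
 * Names are hypothetical; the real transport is Python code. */
typedef struct {
    char   data[4096];
    size_t offset; /* bytes already flushed to the socket */
    size_t len;    /* total bytes buffered */
} write_buf_t;

static size_t wbuf_pending(const write_buf_t *b) {
    return b->len - b->offset;
}

static const char *wbuf_head(const write_buf_t *b) {
    return b->data + b->offset;
}

/* Record that n bytes were flushed; O(1), no data movement. */
static void wbuf_consume(write_buf_t *b, size_t n) {
    b->offset += n;
    if (b->offset == b->len) { /* fully drained: reset in O(1) */
        b->offset = 0;
        b->len = 0;
    }
}
```

The trade-off is that already-flushed bytes occupy the buffer until it drains; for an echo-style workload that drains frequently, this wins over repeated memmoves.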
1 parent a46b940 commit a94d736

16 files changed

Lines changed: 969 additions & 996 deletions

c_src/py_asgi.c

Lines changed: 193 additions & 843 deletions
Large diffs are not rendered by default.

c_src/py_asgi.h

Lines changed: 9 additions & 19 deletions
@@ -90,22 +90,6 @@
  */
 #define ASGI_MAX_INTERPRETERS 64
 
-/**
- * @def SCOPE_CACHE_SIZE
- * @brief Number of scope templates to cache per thread
- */
-#define SCOPE_CACHE_SIZE 64
-
-/**
- * @def LAZY_HEADERS_THRESHOLD
- * @brief Minimum number of headers to use lazy conversion
- *
- * For small header counts, eager conversion is faster due to lower overhead.
- * Only use lazy conversion when there are enough headers to benefit.
- */
-#ifndef LAZY_HEADERS_THRESHOLD
-#define LAZY_HEADERS_THRESHOLD 4
-#endif
 
 /* ============================================================================
  * ASGI Erlang Atoms
@@ -120,9 +104,6 @@ extern ERL_NIF_TERM ATOM_ASGI_METHOD;
 /* Resource type for zero-copy body buffers */
 extern ErlNifResourceType *ASGI_BUFFER_RESOURCE_TYPE;
 
-/* Resource type for lazy header conversion */
-extern ErlNifResourceType *ASGI_LAZY_HEADERS_RESOURCE_TYPE;
-
 /* ============================================================================
  * Per-Interpreter State (Sub-interpreter & Free-threading Support)
  * ============================================================================ */
@@ -237,6 +218,15 @@ typedef struct asgi_interp_state {
     PyObject *status_500; /**< 500 Internal Server Error */
     PyObject *status_502; /**< 502 Bad Gateway */
     PyObject *status_503; /**< 503 Service Unavailable */
+
+    /* Callable cache (avoids per-request module imports) */
+    char *cached_module_name;   /**< Last used module name */
+    char *cached_callable_name; /**< Last used callable name */
+    PyObject *cached_callable;  /**< Cached callable object */
+
+    char *cached_runner_name;   /**< Last used runner module name */
+    PyObject *cached_runner;    /**< Cached runner module */
+    PyObject *cached_run_func;  /**< Cached _run_asgi_sync function */
 } asgi_interp_state_t;
 
 /**
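The per-interpreter callable cache added above is a last-used (module, callable) memo: a string compare on a hit replaces an import plus attribute lookup. A minimal sketch of the lookup/store logic, with hypothetical helper names and `void *` standing in for `PyObject *`:

```c
#include <assert.h>
#include <stdlib.h>
#include <string.h>

/* Simplified stand-in for the cache fields in asgi_interp_state_t.
 * Hypothetical sketch, not the real struct or lookup code. */
typedef struct {
    char *cached_module_name;
    char *cached_callable_name;
    void *cached_callable; /* PyObject* in the real code */
} callable_cache_t;

static char *dup_str(const char *s) {
    size_t n = strlen(s) + 1;
    char *p = malloc(n);
    if (p) memcpy(p, s, n);
    return p;
}

/* Returns the cached callable on a name match, NULL on a miss. */
static void *cache_lookup(callable_cache_t *c,
                          const char *module, const char *callable) {
    if (c->cached_callable != NULL &&
        c->cached_module_name != NULL &&
        strcmp(c->cached_module_name, module) == 0 &&
        c->cached_callable_name != NULL &&
        strcmp(c->cached_callable_name, callable) == 0) {
        return c->cached_callable; /* hit: no import, no getattr */
    }
    return NULL;
}

/* Store a freshly resolved callable, replacing any previous entry. */
static void cache_store(callable_cache_t *c,
                        const char *module, const char *callable, void *obj) {
    free(c->cached_module_name);
    free(c->cached_callable_name);
    c->cached_module_name = dup_str(module);
    c->cached_callable_name = dup_str(callable);
    c->cached_callable = obj;
}
```

Because the cache is per-interpreter, no locking is needed in the real code; the ~3-5µs saving quoted in the commit message comes from skipping the import machinery on the hit path.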

c_src/py_event_loop.c

Lines changed: 170 additions & 33 deletions
@@ -120,6 +120,76 @@ static ErlNifPid g_global_shared_router;
 static bool g_global_shared_router_valid = false;
 static pthread_mutex_t g_global_router_mutex = PTHREAD_MUTEX_INITIALIZER;
 
+/* ============================================================================
+ * Cached Reactor Callables (Performance Optimization)
+ * ============================================================================
+ *
+ * Cache erlang.reactor module and callbacks to avoid expensive PyImport
+ * on every read/write callback in the hot path.
+ */
+static PyObject *g_reactor_module = NULL;
+static PyObject *g_on_read_ready = NULL;
+static PyObject *g_on_write_ready = NULL;
+static bool g_reactor_cached = false;
+static pthread_mutex_t g_reactor_cache_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/**
+ * Initialize cached reactor callables.
+ * MUST be called with GIL held.
+ * Thread-safe: uses mutex for first initialization.
+ *
+ * @return true if callables are cached and ready, false on error
+ */
+static bool ensure_reactor_cached(void) {
+    /* Fast path: already cached */
+    if (g_reactor_cached) {
+        return true;
+    }
+
+    pthread_mutex_lock(&g_reactor_cache_mutex);
+
+    /* Double-check after acquiring lock */
+    if (g_reactor_cached) {
+        pthread_mutex_unlock(&g_reactor_cache_mutex);
+        return true;
+    }
+
+    /* Import erlang.reactor module */
+    PyObject *module = PyImport_ImportModule("erlang.reactor");
+    if (module == NULL) {
+        pthread_mutex_unlock(&g_reactor_cache_mutex);
+        return false;
+    }
+
+    /* Get on_read_ready function */
+    PyObject *on_read = PyObject_GetAttrString(module, "on_read_ready");
+    if (on_read == NULL || !PyCallable_Check(on_read)) {
+        Py_XDECREF(on_read);
+        Py_DECREF(module);
+        pthread_mutex_unlock(&g_reactor_cache_mutex);
+        return false;
+    }
+
+    /* Get on_write_ready function */
+    PyObject *on_write = PyObject_GetAttrString(module, "on_write_ready");
+    if (on_write == NULL || !PyCallable_Check(on_write)) {
+        Py_XDECREF(on_write);
+        Py_DECREF(on_read);
+        Py_DECREF(module);
+        pthread_mutex_unlock(&g_reactor_cache_mutex);
+        return false;
+    }
+
+    /* Store cached references */
+    g_reactor_module = module;
+    g_on_read_ready = on_read;
+    g_on_write_ready = on_write;
+    g_reactor_cached = true;
+
+    pthread_mutex_unlock(&g_reactor_cache_mutex);
+    return true;
+}
+
 /* Forward declaration for module state access */
 static py_event_loop_module_state_t *get_module_state(void);
 static py_event_loop_module_state_t *get_module_state_from_module(PyObject *module);
@@ -977,6 +1047,9 @@ static int poll_events_wait(erlang_event_loop_t *loop, int timeout_ms) {
 
     pthread_mutex_lock(&loop->mutex);
 
+    /* Reset wake_pending flag since we're about to process events */
+    atomic_store(&loop->wake_pending, false);
+
     int current_count = atomic_load(&loop->pending_count);
     if (current_count == 0 && !loop->shutdown) {
         /* No events, wait with timeout */
@@ -1088,11 +1161,49 @@ ERL_NIF_TERM nif_get_pending(ErlNifEnv *env, int argc,
     /*
      * Phase 2: Build Erlang list outside lock (no contention)
      * Term creation and memory operations happen without holding the mutex.
+     *
+     * Optimization: Count elements first, then use enif_make_list_from_array
+     * to build the list in O(n) instead of O(2n) with build-then-reverse.
      */
-    ERL_NIF_TERM list = enif_make_list(env, 0);
-    pending_event_t *current = snapshot_head;
 
+    /* Count events in the snapshot */
+    size_t count = 0;
+    pending_event_t *current = snapshot_head;
     while (current != NULL) {
+        count++;
+        current = current->next;
+    }
+
+    if (count == 0) {
+        return enif_make_list(env, 0);
+    }
+
+    /* Allocate array for terms - use stack for small counts, heap for large */
+    ERL_NIF_TERM *terms;
+    ERL_NIF_TERM stack_terms[64];
+    bool heap_allocated = false;
+
+    if (count <= 64) {
+        terms = stack_terms;
+    } else {
+        terms = enif_alloc(count * sizeof(ERL_NIF_TERM));
+        if (terms == NULL) {
+            /* Fallback: free events and return empty list */
+            current = snapshot_head;
+            while (current != NULL) {
+                pending_event_t *next = current->next;
+                enif_free(current);
+                current = next;
+            }
+            return enif_make_list(env, 0);
+        }
+        heap_allocated = true;
+    }
+
+    /* Build terms array in forward order (matching linked list order) */
+    current = snapshot_head;
+    size_t i = 0;
+    while (current != NULL && i < count) {
         ERL_NIF_TERM type_atom;
         switch (current->type) {
         case EVENT_TYPE_READ:
@@ -1108,26 +1219,26 @@ ERL_NIF_TERM nif_get_pending(ErlNifEnv *env, int argc,
             type_atom = ATOM_UNDEFINED;
         }
 
-        ERL_NIF_TERM event = enif_make_tuple2(
+        terms[i] = enif_make_tuple2(
             env,
             enif_make_uint64(env, current->callback_id),
             type_atom
         );
 
-        list = enif_make_list_cell(env, event, list);
         pending_event_t *next = current->next;
         enif_free(current);
         current = next;
+        i++;
     }
 
-    /* Reverse the list to maintain order */
-    ERL_NIF_TERM reversed = enif_make_list(env, 0);
-    ERL_NIF_TERM head;
-    while (enif_get_list_cell(env, list, &head, &list)) {
-        reversed = enif_make_list_cell(env, head, reversed);
+    /* Build list from array in O(n) */
+    ERL_NIF_TERM result = enif_make_list_from_array(env, terms, (unsigned int)i);
+
+    if (heap_allocated) {
+        enif_free(terms);
     }
 
-    return reversed;
+    return result;
 }
 
 /**
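The count-then-build pattern above can be shown on a plain linked list of ints — an illustrative analog of the NIF change, not the NIF code itself. One pass counts, one pass fills an array in forward order, and a small stack buffer avoids allocation for common sizes:

```c
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>

typedef struct node { int value; struct node *next; } node_t;

/* Convert a linked list to an array in two forward passes instead of
 * building a reversed list and reversing it again. Lists that fit in
 * the caller-provided stack buffer need no heap allocation. */
static int *list_to_array(const node_t *head, int *stack_buf,
                          size_t stack_cap, size_t *out_n,
                          bool *heap_allocated) {
    size_t count = 0;
    for (const node_t *p = head; p != NULL; p = p->next) count++;

    int *out = stack_buf;
    *heap_allocated = false;
    if (count > stack_cap) {
        out = malloc(count * sizeof(int));
        if (out == NULL) { *out_n = 0; return NULL; }
        *heap_allocated = true;
    }

    size_t i = 0;
    for (const node_t *p = head; p != NULL; p = p->next) out[i++] = p->value;
    *out_n = count;
    return out;
}
```

The NIF version then hands the array to `enif_make_list_from_array`, which builds the Erlang list in a single O(n) step with the order already correct.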
@@ -1705,10 +1816,13 @@ static inline uint64_t pending_hash_key(uint64_t callback_id, event_type_t type)
 
 /**
  * @brief Compute hash bucket index
+ *
+ * Note: PENDING_HASH_SIZE must be a power of 2 for bitwise AND to work.
+ * Using AND instead of modulo is faster (single instruction vs division).
  */
 static inline uint32_t pending_hash_index(uint64_t key) {
-    /* Simple hash: XOR fold and modulo */
-    return (uint32_t)((key ^ (key >> 32)) % PENDING_HASH_SIZE);
+    /* Simple hash: XOR fold and bitwise AND (faster than modulo) */
+    return (uint32_t)((key ^ (key >> 32)) & (PENDING_HASH_SIZE - 1));
 }
 
 /**
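The mask trick relies on the table size being a power of two: for such sizes, `x % SIZE` and `x & (SIZE - 1)` agree bit-for-bit, since the low bits of x are exactly the remainder. A standalone check, with `TABLE_SIZE` standing in for `PENDING_HASH_SIZE`:

```c
#include <assert.h>
#include <stdint.h>

#define TABLE_SIZE 256 /* must be a power of 2, like PENDING_HASH_SIZE */

/* XOR-fold a 64-bit key and mask it into the table. The AND replaces
 * an integer division; valid only because TABLE_SIZE is a power of 2. */
static uint32_t hash_index(uint64_t key) {
    return (uint32_t)((key ^ (key >> 32)) & (TABLE_SIZE - 1));
}

/* Reference version using modulo, for comparison. */
static uint32_t hash_index_modulo(uint64_t key) {
    return (uint32_t)((key ^ (key >> 32)) % TABLE_SIZE);
}
```

Compilers can sometimes strength-reduce `% constant` themselves, but only when the operand is known non-negative or unsigned; writing the mask explicitly guarantees the single-instruction form.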
@@ -1728,9 +1842,9 @@ static inline bool pending_hash_contains(erlang_event_loop_t *loop,
     uint64_t key = pending_hash_key(callback_id, type);
     uint32_t idx = pending_hash_index(key);
 
-    /* Linear probing */
+    /* Linear probing with bitwise AND for wrap-around */
     for (int i = 0; i < PENDING_HASH_SIZE; i++) {
-        uint32_t probe = (idx + i) % PENDING_HASH_SIZE;
+        uint32_t probe = (idx + i) & (PENDING_HASH_SIZE - 1);
         if (!loop->pending_hash_occupied[probe]) {
             return false; /* Empty slot means key not present */
         }
@@ -1759,9 +1873,9 @@ static inline bool pending_hash_insert(erlang_event_loop_t *loop,
     uint64_t key = pending_hash_key(callback_id, type);
     uint32_t idx = pending_hash_index(key);
 
-    /* Linear probing */
+    /* Linear probing with bitwise AND for wrap-around */
     for (int i = 0; i < PENDING_HASH_SIZE; i++) {
-        uint32_t probe = (idx + i) % PENDING_HASH_SIZE;
+        uint32_t probe = (idx + i) & (PENDING_HASH_SIZE - 1);
         if (!loop->pending_hash_occupied[probe]) {
             loop->pending_hash_keys[probe] = key;
             loop->pending_hash_occupied[probe] = true;
@@ -1789,6 +1903,11 @@ static inline void pending_hash_clear(erlang_event_loop_t *loop) {
 
 void event_loop_add_pending(erlang_event_loop_t *loop, event_type_t type,
                             uint64_t callback_id, int fd) {
+    /* Backpressure: check pending count before acquiring lock (fast path) */
+    if (atomic_load(&loop->pending_count) >= MAX_PENDING_EVENTS) {
+        return; /* Queue full, drop event */
+    }
+
     pthread_mutex_lock(&loop->mutex);
 
     /* O(1) duplicate check using hash set */
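The backpressure fast path can be isolated as below. This is a sketch: the real function takes the mutex and enqueues after the check, and the atomic load is deliberately a slightly stale read — a few events near the boundary may be admitted or dropped either way, which is acceptable for load shedding.

```c
#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>

#define MAX_PENDING_EVENTS 256

/* Reject new work before touching the mutex when the queue is full.
 * The lock-free load keeps the drop path cheap under overload. */
static bool try_admit(atomic_int *pending_count) {
    if (atomic_load(pending_count) >= MAX_PENDING_EVENTS) {
        return false; /* queue full, drop event */
    }
    /* In the real code: lock the mutex, enqueue, then increment. */
    atomic_fetch_add(pending_count, 1);
    return true;
}
```

Checking before locking matters because under overload the mutex itself becomes the contended resource; dropped events never pay for it.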
@@ -1822,7 +1941,14 @@ void event_loop_add_pending(erlang_event_loop_t *loop, event_type_t type,
     pending_hash_insert(loop, callback_id, type);
 
     atomic_fetch_add(&loop->pending_count, 1);
-    pthread_cond_signal(&loop->event_cond);
+
+    /*
+     * Coalesced wakeup (uvloop-style): Only signal if no wakeup is pending.
+     * This reduces condition variable signals under high event rates.
+     */
+    if (!atomic_exchange(&loop->wake_pending, true)) {
+        pthread_cond_signal(&loop->event_cond);
+    }
 
     pthread_mutex_unlock(&loop->mutex);
 }
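The coalescing hinges on `atomic_exchange` returning the prior value: only the first producer in a burst sees `false` and signals; later producers see `true` and skip. A single-threaded sketch with hypothetical `notify`/`consume` names:

```c
#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool wake_pending = false;
static int signals_sent = 0;

/* Producer side: signal only when no wakeup is already pending. */
static void notify(void) {
    if (!atomic_exchange(&wake_pending, true)) {
        signals_sent++; /* pthread_cond_signal(...) in the real loop */
    }
}

/* Consumer side: clear the flag before draining the queue, so the
 * first event arriving after the drain triggers a fresh signal. */
static void consume(void) {
    atomic_store(&wake_pending, false);
}
```

Clearing the flag before draining (as `poll_events_wait` does) is what makes this safe: any event enqueued during the drain sets the flag again and produces a new signal, so no wakeup is lost.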
@@ -3120,19 +3246,25 @@ ERL_NIF_TERM nif_reactor_on_read_ready(ErlNifEnv *env, int argc,
         return make_error(env, "buffer_creation_failed");
     }
 
-    /* Import erlang.reactor module */
-    PyObject *reactor_module = PyImport_ImportModule("erlang.reactor");
-    if (reactor_module == NULL) {
+    /* Ensure reactor callables are cached (fast path after first call) */
+    if (!ensure_reactor_cached()) {
         PyErr_Clear();
         Py_DECREF(py_buffer);
         py_context_release(&guard);
-        return make_error(env, "import_erlang_reactor_failed");
+        return make_error(env, "reactor_cache_init_failed");
     }
 
-    /* Call on_read_ready(fd, data) with the buffer */
-    PyObject *result = PyObject_CallMethod(reactor_module, "on_read_ready",
-                                           "iO", fd, py_buffer);
-    Py_DECREF(reactor_module);
+    /* Call cached on_read_ready(fd, data) - avoids PyImport on every call */
+    PyObject *py_fd = PyLong_FromLong(fd);
+    if (py_fd == NULL) {
+        PyErr_Clear();
+        Py_DECREF(py_buffer);
+        py_context_release(&guard);
+        return make_error(env, "fd_conversion_failed");
+    }
+
+    PyObject *result = PyObject_CallFunctionObjArgs(g_on_read_ready, py_fd, py_buffer, NULL);
+    Py_DECREF(py_fd);
     Py_DECREF(py_buffer);
 
     if (result == NULL) {
@@ -3190,18 +3322,23 @@ ERL_NIF_TERM nif_reactor_on_write_ready(ErlNifEnv *env, int argc,
         return make_error(env, "acquire_failed");
     }
 
-    /* Import erlang.reactor module */
-    PyObject *reactor_module = PyImport_ImportModule("erlang.reactor");
-    if (reactor_module == NULL) {
+    /* Ensure reactor callables are cached (fast path after first call) */
+    if (!ensure_reactor_cached()) {
         PyErr_Clear();
         py_context_release(&guard);
-        return make_error(env, "import_erlang_reactor_failed");
+        return make_error(env, "reactor_cache_init_failed");
     }
 
-    /* Call on_write_ready(fd) */
-    PyObject *result = PyObject_CallMethod(reactor_module, "on_write_ready",
-                                           "i", fd);
-    Py_DECREF(reactor_module);
+    /* Call cached on_write_ready(fd) - avoids PyImport on every call */
+    PyObject *py_fd = PyLong_FromLong(fd);
+    if (py_fd == NULL) {
+        PyErr_Clear();
+        py_context_release(&guard);
+        return make_error(env, "fd_conversion_failed");
+    }
+
+    PyObject *result = PyObject_CallFunctionObjArgs(g_on_write_ready, py_fd, NULL);
+    Py_DECREF(py_fd);
 
     if (result == NULL) {
         PyErr_Clear();

c_src/py_event_loop.h

Lines changed: 8 additions & 2 deletions
@@ -49,8 +49,9 @@
 /** @brief Maximum events to keep in freelist (Phase 7 optimization) */
 #define EVENT_FREELIST_SIZE 256
 
-/** @brief Size of pending event hash set for O(1) duplicate detection */
-#define PENDING_HASH_SIZE 128
+/** @brief Size of pending event hash set for O(1) duplicate detection
+ *  Note: Must be a power of 2 for efficient bitwise AND indexing */
+#define PENDING_HASH_SIZE 256
 
 /** @brief Event types for pending callbacks */
 typedef enum {
@@ -240,6 +241,11 @@ typedef struct erlang_event_loop {
     /** @brief Count of occupied slots in hash set */
     int pending_hash_count;
 
+    /* ========== Coalesced Wakeup Support ========== */
+
+    /** @brief Flag indicating a wakeup is pending (uvloop-style coalescing) */
+    _Atomic bool wake_pending;
+
     /* ========== Synchronous Sleep Support ========== */
 
     /** @brief Current synchronous sleep ID being waited on */
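Since the mask indexing silently breaks for non-power-of-two sizes, the requirement the comment states could also be enforced at compile time. A possible guard (not part of the commit):

```c
#include <assert.h>

#define PENDING_HASH_SIZE 256

/* A power of two has exactly one bit set, so n & (n - 1) == 0.
 * _Static_assert turns a bad size into a compile error instead of
 * silent mis-masking at runtime. */
_Static_assert(PENDING_HASH_SIZE > 0 &&
               (PENDING_HASH_SIZE & (PENDING_HASH_SIZE - 1)) == 0,
               "PENDING_HASH_SIZE must be a power of 2");
```

With this in the header, changing the constant to, say, 200 fails the build at the point of definition rather than corrupting the hash set.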

c_src/py_nif.c

Lines changed: 0 additions & 6 deletions
@@ -3659,12 +3659,6 @@ static int load(ErlNifEnv *env, void **priv_data, ERL_NIF_TERM load_info) {
         asgi_buffer_resource_dtor,
         ERL_NIF_RT_CREATE | ERL_NIF_RT_TAKEOVER, NULL);
 
-    /* ASGI lazy headers resource type for on-demand header conversion */
-    ASGI_LAZY_HEADERS_RESOURCE_TYPE = enif_open_resource_type(
-        env, NULL, "asgi_lazy_headers",
-        lazy_headers_resource_dtor,
-        ERL_NIF_RT_CREATE | ERL_NIF_RT_TAKEOVER, NULL);
-
     /* Reactor buffer resource type for zero-copy read handling */
     REACTOR_BUFFER_RESOURCE_TYPE = enif_open_resource_type(
         env, NULL, "reactor_buffer",
