diff --git a/Lib/profiling/sampling/sample.py b/Lib/profiling/sampling/sample.py index 5bbe2483581333..235eacd889877b 100644 --- a/Lib/profiling/sampling/sample.py +++ b/Lib/profiling/sampling/sample.py @@ -296,6 +296,33 @@ def _print_unwinder_stats(self): print(f" Hits: {code_hits:n} ({ANSIColors.GREEN}{fmt(code_hits_pct)}%{ANSIColors.RESET})") print(f" Misses: {code_misses:n} ({ANSIColors.RED}{fmt(code_misses_pct)}%{ANSIColors.RESET})") + batched_attempts = stats.get('batched_read_attempts', 0) + batched_successes = stats.get('batched_read_successes', 0) + batched_misses = stats.get('batched_read_misses', 0) + segments_requested = stats.get('batched_read_segments_requested', 0) + segments_completed = stats.get('batched_read_segments_completed', 0) + if batched_attempts > 0: + batched_success_rate = stats.get('batched_read_success_rate', 0.0) + batched_miss_rate = 100.0 - batched_success_rate + segment_completion_rate = stats.get( + 'batched_read_segment_completion_rate', 0.0 + ) + + print(f" {ANSIColors.CYAN}Batched Reads:{ANSIColors.RESET}") + print(f" Attempts: {batched_attempts:n}") + print( + f" Successes: {batched_successes:n} " + f"({ANSIColors.GREEN}{fmt(batched_success_rate)}%{ANSIColors.RESET})" + ) + print( + f" Misses: {batched_misses:n} " + f"({ANSIColors.RED}{fmt(batched_miss_rate)}%{ANSIColors.RESET})" + ) + print( + f" Segments read: {segments_completed:n}/{segments_requested:n} " + f"({ANSIColors.GREEN}{fmt(segment_completion_rate)}%{ANSIColors.RESET})" + ) + # Memory operations memory_reads = stats.get('memory_reads', 0) memory_bytes = stats.get('memory_bytes_read', 0) diff --git a/Lib/test/test_external_inspection.py b/Lib/test/test_external_inspection.py index a29e6cdbbf6c78..6b1529aa173f01 100644 --- a/Lib/test/test_external_inspection.py +++ b/Lib/test/test_external_inspection.py @@ -3767,6 +3767,13 @@ def test_get_stats(self): "frames_read_from_cache", "frames_read_from_memory", "frame_cache_hit_rate", + "batched_read_attempts", + "batched_read_successes", + "batched_read_misses", + "batched_read_segments_requested", + "batched_read_segments_completed", + "batched_read_success_rate", + "batched_read_segment_completion_rate", ] for key in expected_keys: self.assertIn(key, stats) diff --git a/Misc/NEWS.d/next/Library/2026-05-10-19-26-50.gh-issue-149584.x7Qm9A.rst b/Misc/NEWS.d/next/Library/2026-05-10-19-26-50.gh-issue-149584.x7Qm9A.rst new file mode 100644 index 00000000000000..6734250fdd6af3 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-05-10-19-26-50.gh-issue-149584.x7Qm9A.rst @@ -0,0 +1,4 @@ +Fix excessive overhead in the Tachyon profiler when inspecting a remote +process by avoiding repeated remote page-cache scans, batching predicted +remote reads, and reusing cached profiler result objects. Patch by Pablo +Galindo and Maurycy Pawłowski-Wieroński. diff --git a/Modules/_remote_debugging/_remote_debugging.h b/Modules/_remote_debugging/_remote_debugging.h index 7369cd1514c296..9eac4bfd7ec6ea 100644 --- a/Modules/_remote_debugging/_remote_debugging.h +++ b/Modules/_remote_debugging/_remote_debugging.h @@ -30,6 +30,7 @@ extern "C" { #include "internal/pycore_llist.h" // struct llist_node #include "internal/pycore_long.h" // _PyLong_GetZero #include "internal/pycore_pyerrors.h" // _PyErr_FormatFromCause +#include "internal/pycore_pyhash.h" // _Py_HashPointerRaw #include "internal/pycore_stackref.h" // Py_TAG_BITS #include "../../Python/remote_debug.h" @@ -215,6 +216,8 @@ typedef struct { PyObject *file_name; int first_lineno; PyObject *linetable; // bytes + PyObject *last_frame_info; + ptrdiff_t last_addrq; uintptr_t addr_code_adaptive; } CachedCodeMetadata; @@ -224,11 +227,34 @@ typedef struct { typedef struct { uint64_t thread_id; // 0 = empty slot + uintptr_t thread_state_addr; uintptr_t addrs[FRAME_CACHE_MAX_FRAMES]; Py_ssize_t num_addrs; + PyObject *thread_id_obj; // owned reference, NULL if empty PyObject *frame_list; // owned reference, NULL if empty } FrameCacheEntry; +#define INTERPRETER_THREAD_CACHE_SIZE 32 +#if (INTERPRETER_THREAD_CACHE_SIZE & (INTERPRETER_THREAD_CACHE_SIZE - 1)) != 0 +# error "INTERPRETER_THREAD_CACHE_SIZE must be a power of two" +#endif + +typedef struct { + uintptr_t interpreter_addr; + uintptr_t thread_state_addr; +} InterpreterThreadCacheEntry; + +// Carries already-read thread state and/or frame buffers across helpers so the +// downstream callee can skip a remote read. Address fields are caller-supplied +// inputs; buffer pointers (tstate, frame) are NULL unless a prior batched read +// successfully populated them. +typedef struct { + const char *tstate; + uintptr_t tstate_addr; + const char *frame; + uintptr_t frame_addr; +} RemoteReadPrefetch; + /* Statistics for profiling performance analysis */ typedef struct { uint64_t total_samples; // Total number of get_stack_trace calls @@ -242,14 +268,40 @@ typedef struct { uint64_t code_object_cache_hits; // Code object cache hits uint64_t code_object_cache_misses; // Code object cache misses uint64_t stale_cache_invalidations; // Times stale entries were cleared + uint64_t batched_read_attempts; // Batched remote-read attempts + uint64_t batched_read_successes; // Attempts that read all requested segments + uint64_t batched_read_misses; // Attempts that fell back or partially read + uint64_t batched_read_segments_requested; // Segments requested by batched reads + uint64_t batched_read_segments_completed; // Segments completed by batched reads } UnwinderStats; +#if defined(__GNUC__) || defined(__clang__) +# define REMOTE_DEBUG_UNLIKELY(value) __builtin_expect(!!(value), 0) +#else +# define REMOTE_DEBUG_UNLIKELY(value) (value) +#endif + /* Stats tracking macros - no-op when stats collection is disabled */ #define STATS_INC(unwinder, field) \ - do { if ((unwinder)->collect_stats) (unwinder)->stats.field++; } while(0) + do { if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) (unwinder)->stats.field++; } while(0) #define STATS_ADD(unwinder, field, val) \ - do { if ((unwinder)->collect_stats) (unwinder)->stats.field += (val); } while(0) + do { if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) (unwinder)->stats.field += (val); } while(0) + +#define STATS_BATCHED_READ(unwinder, requested, completed) \ + do { \ + if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) { \ + (unwinder)->stats.batched_read_attempts++; \ + (unwinder)->stats.batched_read_segments_requested += (uint64_t)(requested); \ + (unwinder)->stats.batched_read_segments_completed += (uint64_t)(completed); \ + if ((completed) == (requested)) { \ + (unwinder)->stats.batched_read_successes++; \ + } \ + else { \ + (unwinder)->stats.batched_read_misses++; \ + } \ + } \ + } while(0) typedef struct { PyTypeObject *RemoteDebugging_Type; @@ -302,9 +354,14 @@ typedef struct { int cache_frames; int collect_stats; // whether to collect statistics uint32_t stale_invalidation_counter; // counter for throttling frame_cache_invalidate_stale + // L1 single-entry shortcut over cached_tstates[]: most workloads sample one + // interpreter, so check this pair before hashing into the table below. + uintptr_t cached_tstate_interpreter_addr; + uintptr_t cached_tstate_addr; RemoteDebuggingState *cached_state; FrameCacheEntry *frame_cache; // preallocated array of FRAME_CACHE_MAX_THREADS entries UnwinderStats stats; // statistics for performance analysis + InterpreterThreadCacheEntry cached_tstates[INTERPRETER_THREAD_CACHE_SIZE]; #ifdef Py_GIL_DISABLED uint32_t tlbc_generation; _Py_hashtable_t *tlbc_cache; @@ -361,11 +418,13 @@ typedef struct { typedef struct { /* Inputs */ uintptr_t frame_addr; // Starting frame address + uintptr_t thread_state_addr; // Owning thread state address uintptr_t base_frame_addr; // Sentinel at bottom (for validation) uintptr_t gc_frame; // GC frame address (0 if not tracking) uintptr_t last_profiled_frame; // Last cached frame (0 if no cache) StackChunkList *chunks; // Pre-copied stack chunks int skip_first_frame; // Skip frame_addr itself (continue from its caller) + RemoteReadPrefetch prefetch; // Optional already-read thread/frame buffers /* Outputs */ PyObject *frame_info; // List to append FrameInfo objects @@ -548,6 +607,7 @@ extern int process_frame_chain( extern int frame_cache_init(RemoteUnwinderObject *unwinder); extern void frame_cache_cleanup(RemoteUnwinderObject *unwinder); extern FrameCacheEntry *frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id); +extern FrameCacheEntry *frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr); extern int clear_last_profiled_frames(RemoteUnwinderObject *unwinder); extern void frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result); extern int frame_cache_lookup_and_extend( @@ -566,6 +626,7 @@ extern int frame_cache_store( PyObject *frame_list, const uintptr_t *addrs, Py_ssize_t num_addrs, + uintptr_t thread_state_addr, uintptr_t base_frame_addr, uintptr_t last_frame_visited); @@ -605,7 +666,8 @@ extern PyObject* unwind_stack_for_thread( uintptr_t *current_tstate, uintptr_t gil_holder_tstate, uintptr_t gc_frame, - uintptr_t main_thread_tstate + uintptr_t main_thread_tstate, + const RemoteReadPrefetch *prefetch ); /* Thread stopping functions (for blocking mode) */ diff --git a/Modules/_remote_debugging/clinic/module.c.h b/Modules/_remote_debugging/clinic/module.c.h index d56622fb82ab56..78b1d3e8d80962 100644 --- a/Modules/_remote_debugging/clinic/module.c.h +++ b/Modules/_remote_debugging/clinic/module.c.h @@ -411,8 +411,15 @@ PyDoc_STRVAR(_remote_debugging_RemoteUnwinder_get_stats__doc__, " - code_object_cache_hits: Code object cache hits\n" " - code_object_cache_misses: Code object cache misses\n" " - stale_cache_invalidations: Times stale cache entries were cleared\n" +" - batched_read_attempts: Batched remote-read attempts\n" +" - batched_read_successes: Attempts that read all requested segments\n" +" - batched_read_misses: Attempts that fell back or partially read\n" +" - batched_read_segments_requested: Segments requested by batched reads\n" +" - batched_read_segments_completed: Segments completed by batched reads\n" " - frame_cache_hit_rate: Percentage of samples that hit the cache\n" " - code_object_cache_hit_rate: Percentage of code object lookups that hit cache\n" +" - batched_read_success_rate: Percentage of batched reads that completed all segments\n" +" - batched_read_segment_completion_rate: Percentage of requested segments read by batched reads\n" "\n" "Raises:\n" " RuntimeError: If stats collection was not enabled (stats=False)"); @@ -1540,4 +1547,4 @@ _remote_debugging_get_gc_stats(PyObject *module, PyObject *const *args, Py_ssize exit: return return_value; } -/*[clinic end generated code: output=5e2a29746a0c5d65 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=884914b100e9c90c input=a9049054013a1b77]*/ diff --git a/Modules/_remote_debugging/code_objects.c b/Modules/_remote_debugging/code_objects.c index 7b95c0f2d4fa8d..2ac6edb3f662f6 100644 --- a/Modules/_remote_debugging/code_objects.c +++ b/Modules/_remote_debugging/code_objects.c @@ -405,6 +405,8 @@ parse_code_object(RemoteUnwinderObject *unwinder, meta->func_name = func; meta->file_name = file; meta->linetable = linetable; + meta->last_frame_info = NULL; + meta->last_addrq = -1; meta->first_lineno = GET_MEMBER(int, code_object, unwinder->debug_offsets.code_object.firstlineno); meta->addr_code_adaptive = real_address + (uintptr_t)unwinder->debug_offsets.code_object.co_code_adaptive; @@ -482,6 +484,12 @@ parse_code_object(RemoteUnwinderObject *unwinder, addrq = (uint16_t *)ip - (uint16_t *)meta->addr_code_adaptive; #endif ; // Empty statement to avoid C23 extension warning + + if (!unwinder->opcodes && meta->last_frame_info != NULL && meta->last_addrq == addrq) { + *result = Py_NewRef(meta->last_frame_info); + return 0; + } + LocationInfo info = {0}; bool ok = parse_linetable(addrq, PyBytes_AS_STRING(meta->linetable), PyBytes_GET_SIZE(meta->linetable), @@ -529,6 +537,11 @@ parse_code_object(RemoteUnwinderObject *unwinder, goto error; } + if (!unwinder->opcodes) { + Py_XSETREF(meta->last_frame_info, Py_NewRef(tuple)); + meta->last_addrq = addrq; + } + *result = tuple; return 0; diff --git a/Modules/_remote_debugging/frame_cache.c b/Modules/_remote_debugging/frame_cache.c index b6566d7cff7b54..19fc406bca9ac9 100644 --- a/Modules/_remote_debugging/frame_cache.c +++ b/Modules/_remote_debugging/frame_cache.c @@ -30,6 +30,7 @@ frame_cache_cleanup(RemoteUnwinderObject *unwinder) return; } for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) { + Py_CLEAR(unwinder->frame_cache[i].thread_id_obj); Py_CLEAR(unwinder->frame_cache[i].frame_list); } PyMem_Free(unwinder->frame_cache); @@ -53,6 +54,21 @@ frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id) return NULL; } +FrameCacheEntry * +frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr) +{ + if (!unwinder->frame_cache || tstate_addr == 0) { + return NULL; + } + for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) { + if (unwinder->frame_cache[i].thread_state_addr == tstate_addr) { + assert(unwinder->frame_cache[i].num_addrs <= FRAME_CACHE_MAX_FRAMES); + return &unwinder->frame_cache[i]; + } + } + return NULL; +} + // Allocate a cache slot for a thread // Returns NULL if cache is full (graceful degradation) static FrameCacheEntry * @@ -127,8 +143,10 @@ frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result) } if (!found) { // Clear this entry + Py_CLEAR(unwinder->frame_cache[i].thread_id_obj); Py_CLEAR(unwinder->frame_cache[i].frame_list); unwinder->frame_cache[i].thread_id = 0; + unwinder->frame_cache[i].thread_state_addr = 0; unwinder->frame_cache[i].num_addrs = 0; STATS_INC(unwinder, stale_cache_invalidations); } @@ -216,6 +234,7 @@ frame_cache_store( PyObject *frame_list, const uintptr_t *addrs, Py_ssize_t num_addrs, + uintptr_t thread_state_addr, uintptr_t base_frame_addr, uintptr_t last_frame_visited) { @@ -257,6 +276,13 @@ frame_cache_store( return -1; } entry->thread_id = thread_id; + entry->thread_state_addr = thread_state_addr; + if (entry->thread_id_obj == NULL) { + entry->thread_id_obj = PyLong_FromUnsignedLongLong(thread_id); + if (entry->thread_id_obj == NULL) { + return -1; + } + } memcpy(entry->addrs, addrs, num_addrs * sizeof(uintptr_t)); entry->num_addrs = num_addrs; assert(entry->num_addrs == num_addrs); diff --git a/Modules/_remote_debugging/frames.c b/Modules/_remote_debugging/frames.c index bbdfce3f7201d9..8d8019396b3e31 100644 --- a/Modules/_remote_debugging/frames.c +++ b/Modules/_remote_debugging/frames.c @@ -186,30 +186,16 @@ is_frame_valid( return 1; } -int -parse_frame_object( +static int +parse_frame_buffer( RemoteUnwinderObject *unwinder, PyObject** result, - uintptr_t address, + const char *frame, uintptr_t* address_of_code_object, uintptr_t* previous_frame ) { - char frame[SIZEOF_INTERP_FRAME]; *address_of_code_object = 0; - Py_ssize_t bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory( - &unwinder->handle, - address, - SIZEOF_INTERP_FRAME, - frame - ); - if (bytes_read < 0) { - set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read interpreter frame"); - return -1; - } - STATS_INC(unwinder, memory_reads); - STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME); - *previous_frame = GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.previous); uintptr_t code_object = GET_MEMBER_NO_TAG(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.executable); int frame_valid = is_frame_valid(unwinder, (uintptr_t)frame, code_object); @@ -237,6 +223,31 @@ parse_frame_object( return parse_code_object(unwinder, result, &code_ctx); } +int +parse_frame_object( + RemoteUnwinderObject *unwinder, + PyObject** result, + uintptr_t address, + uintptr_t* address_of_code_object, + uintptr_t* previous_frame +) { + char frame[SIZEOF_INTERP_FRAME]; + Py_ssize_t bytes_read = _Py_RemoteDebug_ReadRemoteMemory( + &unwinder->handle, + address, + SIZEOF_INTERP_FRAME, + frame + ); + if (bytes_read < 0) { + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read interpreter frame"); + return -1; + } + STATS_INC(unwinder, memory_reads); + STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME); + + return parse_frame_buffer(unwinder, result, frame, address_of_code_object, previous_frame); +} + int parse_frame_from_chunks( RemoteUnwinderObject *unwinder, @@ -312,15 +323,32 @@ process_frame_chain( } assert(frame_count <= MAX_FRAMES); - if (parse_frame_from_chunks(unwinder, &frame, frame_addr, &next_frame_addr, &stackpointer, ctx->chunks) < 0) { + if (ctx->chunks && ctx->chunks->count > 0) { + if (parse_frame_from_chunks(unwinder, &frame, frame_addr, &next_frame_addr, &stackpointer, ctx->chunks) == 0) { + goto parsed_frame; + } PyErr_Clear(); + } + { uintptr_t address_of_code_object = 0; - if (parse_frame_object(unwinder, &frame, frame_addr, &address_of_code_object, &next_frame_addr) < 0) { + int parse_result; + if (ctx->prefetch.frame && ctx->prefetch.frame_addr == frame_addr) { + parse_result = parse_frame_buffer( + unwinder, &frame, ctx->prefetch.frame, + &address_of_code_object, &next_frame_addr); + } + else { + parse_result = parse_frame_object( + unwinder, &frame, frame_addr, + &address_of_code_object, &next_frame_addr); + } + if (parse_result < 0) { set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to parse frame object in chain"); return -1; } } +parsed_frame: // Skip first frame if requested (used for cache miss continuation) if (ctx->skip_first_frame && frame_count == 1) { Py_XDECREF(frame); @@ -501,41 +529,37 @@ try_full_cache_hit( PyObject *current_frame = NULL; uintptr_t code_object_addr = 0; uintptr_t previous_frame = 0; - int parse_result = parse_frame_object(unwinder, ¤t_frame, ctx->frame_addr, + int parse_result; + if (ctx->prefetch.frame && ctx->prefetch.frame_addr == ctx->frame_addr) { + parse_result = parse_frame_buffer(unwinder, ¤t_frame, + ctx->prefetch.frame, &code_object_addr, &previous_frame); + } + else { + parse_result = parse_frame_object(unwinder, ¤t_frame, ctx->frame_addr, + &code_object_addr, &previous_frame); + } if (parse_result < 0) { return -1; } - Py_ssize_t cached_size = PyList_GET_SIZE(entry->frame_list); - PyObject *parent_slice = NULL; - if (cached_size > 1) { - parent_slice = PyList_GetSlice(entry->frame_list, 1, cached_size); - if (!parent_slice) { - Py_XDECREF(current_frame); - return -1; - } - } - if (current_frame != NULL) { if (PyList_Append(ctx->frame_info, current_frame) < 0) { Py_DECREF(current_frame); - Py_XDECREF(parent_slice); return -1; } Py_DECREF(current_frame); STATS_ADD(unwinder, frames_read_from_memory, 1); } - if (parent_slice) { - Py_ssize_t cur_size = PyList_GET_SIZE(ctx->frame_info); - int result = PyList_SetSlice(ctx->frame_info, cur_size, cur_size, parent_slice); - Py_DECREF(parent_slice); - if (result < 0) { + Py_ssize_t cached_size = PyList_GET_SIZE(entry->frame_list); + for (Py_ssize_t i = 1; i < cached_size; i++) { + PyObject *cached_frame = PyList_GET_ITEM(entry->frame_list, i); + if (PyList_Append(ctx->frame_info, cached_frame) < 0) { return -1; } - STATS_ADD(unwinder, frames_read_from_cache, cached_size - 1); } + STATS_ADD(unwinder, frames_read_from_cache, cached_size > 1 ? cached_size - 1 : 0); STATS_INC(unwinder, frame_cache_hits); return 1; @@ -606,7 +630,8 @@ collect_frames_with_cache( } if (frame_cache_store(unwinder, thread_id, ctx->frame_info, ctx->frame_addrs, ctx->num_addrs, - ctx->base_frame_addr, ctx->last_frame_visited) < 0) { + ctx->thread_state_addr, ctx->base_frame_addr, + ctx->last_frame_visited) < 0) { return -1; } diff --git a/Modules/_remote_debugging/module.c b/Modules/_remote_debugging/module.c index efdd2e1a2d7b7a..ea138b03252367 100644 --- a/Modules/_remote_debugging/module.c +++ b/Modules/_remote_debugging/module.c @@ -166,6 +166,7 @@ cached_code_metadata_destroy(void *ptr) Py_DECREF(meta->func_name); Py_DECREF(meta->file_name); Py_DECREF(meta->linetable); + Py_XDECREF(meta->last_frame_info); PyMem_RawFree(meta); } @@ -360,6 +361,9 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, self->cache_frames = cache_frames; self->collect_stats = stats; self->stale_invalidation_counter = 0; + self->cached_tstate_interpreter_addr = 0; + self->cached_tstate_addr = 0; + memset(self->cached_tstates, 0, sizeof(self->cached_tstates)); self->debug = debug; self->only_active_thread = only_active_thread; self->mode = mode; @@ -473,6 +477,152 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, return 0; } +static inline size_t +interpreter_thread_cache_index(uintptr_t interpreter_addr) +{ + // Direct-mapped table indexed by the remote interpreter address. Each entry + // stores the full address and verifies it on lookup, so hash collisions + // degrade to misses and cannot return a wrong tstate. + return (size_t)_Py_HashPointerRaw((const void *)interpreter_addr) + & (INTERPRETER_THREAD_CACHE_SIZE - 1); +} + +static inline uintptr_t +get_cached_tstate_for_interpreter( + RemoteUnwinderObject *self, + uintptr_t interpreter_addr) +{ + if (interpreter_addr == 0) { + return 0; + } + + if (self->cached_tstate_interpreter_addr == interpreter_addr) { + return self->cached_tstate_addr; + } + + InterpreterThreadCacheEntry *entry = + &self->cached_tstates[interpreter_thread_cache_index(interpreter_addr)]; + if (entry->interpreter_addr == interpreter_addr) { + self->cached_tstate_interpreter_addr = interpreter_addr; + self->cached_tstate_addr = entry->thread_state_addr; + return entry->thread_state_addr; + } + return 0; +} + +static inline void +set_cached_tstate_for_interpreter( + RemoteUnwinderObject *self, + uintptr_t interpreter_addr, + uintptr_t thread_state_addr) +{ + if (interpreter_addr == 0 || thread_state_addr == 0) { + return; + } + + self->cached_tstate_interpreter_addr = interpreter_addr; + self->cached_tstate_addr = thread_state_addr; + + InterpreterThreadCacheEntry *entry = + &self->cached_tstates[interpreter_thread_cache_index(interpreter_addr)]; + entry->interpreter_addr = interpreter_addr; + entry->thread_state_addr = thread_state_addr; +} + +static void +refresh_generation_caches_from_interp_state( + RemoteUnwinderObject *self, + const char *interp_state_buffer) +{ + uint64_t code_object_generation = GET_MEMBER(uint64_t, interp_state_buffer, + self->debug_offsets.interpreter_state.code_object_generation); + + if (code_object_generation != self->code_object_generation) { + self->code_object_generation = code_object_generation; + _Py_hashtable_clear(self->code_object_cache); + } + +#ifdef Py_GIL_DISABLED + uint32_t current_tlbc_generation = GET_MEMBER(uint32_t, interp_state_buffer, + self->debug_offsets.interpreter_state.tlbc_generation); + if (current_tlbc_generation != self->tlbc_generation) { + self->tlbc_generation = current_tlbc_generation; + _Py_hashtable_clear(self->tlbc_cache); + } +#endif +} + +static int +refresh_generation_caches_for_interpreter( + RemoteUnwinderObject *self, + uintptr_t interpreter_addr) +{ + char interp_state_buffer[INTERP_STATE_BUFFER_SIZE]; + if (_Py_RemoteDebug_ReadRemoteMemory( + &self->handle, + interpreter_addr, + INTERP_STATE_BUFFER_SIZE, + interp_state_buffer) < 0) { + set_exception_cause(self, PyExc_RuntimeError, + "Failed to read interpreter state buffer"); + return -1; + } + refresh_generation_caches_from_interp_state(self, interp_state_buffer); + return 0; +} + +static int +read_interp_state_and_maybe_thread_frame( + RemoteUnwinderObject *unwinder, + uintptr_t interpreter_addr, + char *interp_state_buffer, + char *tstate_buffer, + char *frame_buffer, + RemoteReadPrefetch *prefetch) +{ + prefetch->tstate = NULL; + prefetch->frame = NULL; + if (prefetch->tstate_addr != 0) { + size_t tstate_size = (size_t)unwinder->debug_offsets.thread_state.size; + _Py_RemoteReadSegment segments[3] = { + {interpreter_addr, interp_state_buffer, INTERP_STATE_BUFFER_SIZE}, + {prefetch->tstate_addr, tstate_buffer, tstate_size}, + {prefetch->frame_addr, frame_buffer, SIZEOF_INTERP_FRAME}, + }; + int nsegs = prefetch->frame_addr != 0 ? 3 : 2; + Py_ssize_t nread = _Py_RemoteDebug_BatchedReadRemoteMemory( + &unwinder->handle, segments, nsegs); + int completed = 0; + if (nread >= (Py_ssize_t)INTERP_STATE_BUFFER_SIZE) { + completed = 1; + Py_ssize_t with_tstate = (Py_ssize_t)INTERP_STATE_BUFFER_SIZE + + (Py_ssize_t)tstate_size; + if (nread >= with_tstate) { + completed = 2; + } + if (nsegs == 3 + && nread == with_tstate + (Py_ssize_t)SIZEOF_INTERP_FRAME) { + completed = 3; + } + } + STATS_BATCHED_READ(unwinder, nsegs, completed); + if (completed >= 1) { + if (completed >= 2) { + prefetch->tstate = tstate_buffer; + } + if (completed >= 3) { + prefetch->frame = frame_buffer; + } + return 0; + } + } + return _Py_RemoteDebug_ReadRemoteMemory( + &unwinder->handle, + interpreter_addr, + INTERP_STATE_BUFFER_SIZE, + interp_state_buffer); +} + /*[clinic input] @permit_long_docstring_body @critical_section @@ -537,15 +687,32 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self while (current_interpreter != 0) { // Read interpreter state to get the interpreter ID char interp_state_buffer[INTERP_STATE_BUFFER_SIZE]; - if (_Py_RemoteDebug_PagedReadRemoteMemory( - &self->handle, + char prefetched_tstate[SIZEOF_THREAD_STATE]; + char prefetched_frame[SIZEOF_INTERP_FRAME]; + RemoteReadPrefetch prefetch = {0}; + if (self->cache_frames) { + prefetch.tstate_addr = get_cached_tstate_for_interpreter( + self, current_interpreter); + } + if (prefetch.tstate_addr != 0) { + FrameCacheEntry *entry = frame_cache_find_by_tstate(self, prefetch.tstate_addr); + if (entry && entry->num_addrs > 0) { + prefetch.frame_addr = entry->addrs[0]; + } + } + + if (read_interp_state_and_maybe_thread_frame( + self, current_interpreter, - INTERP_STATE_BUFFER_SIZE, - interp_state_buffer) < 0) { + interp_state_buffer, + prefetched_tstate, + prefetched_frame, + &prefetch) < 0) { set_exception_cause(self, PyExc_RuntimeError, "Failed to read interpreter state buffer"); Py_CLEAR(result); goto exit; } + refresh_generation_caches_from_interp_state(self, interp_state_buffer); uintptr_t gc_frame = 0; if (self->gc) { @@ -557,25 +724,6 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self int64_t interpreter_id = GET_MEMBER(int64_t, interp_state_buffer, self->debug_offsets.interpreter_state.id); - // Get code object generation from buffer - uint64_t code_object_generation = GET_MEMBER(uint64_t, interp_state_buffer, - self->debug_offsets.interpreter_state.code_object_generation); - - if (code_object_generation != self->code_object_generation) { - self->code_object_generation = code_object_generation; - _Py_hashtable_clear(self->code_object_cache); - } - -#ifdef Py_GIL_DISABLED - // Check TLBC generation and invalidate cache if needed - uint32_t current_tlbc_generation = GET_MEMBER(uint32_t, interp_state_buffer, - self->debug_offsets.interpreter_state.tlbc_generation); - if (current_tlbc_generation != self->tlbc_generation) { - self->tlbc_generation = current_tlbc_generation; - _Py_hashtable_clear(self->tlbc_cache); - } -#endif - // Create a list to hold threads for this interpreter PyObject *interpreter_threads = PyList_New(0); if (!interpreter_threads) { @@ -611,6 +759,9 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self // Target specific thread (only process first interpreter) current_tstate = self->tstate_addr; } + if (current_tstate != 0 && self->cache_frames) { + set_cached_tstate_for_interpreter(self, current_interpreter, current_tstate); + } // Acquire main thread state information uintptr_t main_thread_tstate = GET_MEMBER(uintptr_t, interp_state_buffer, @@ -621,7 +772,8 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self PyObject* frame_info = unwind_stack_for_thread(self, ¤t_tstate, gil_holder_tstate, gc_frame, - main_thread_tstate); + main_thread_tstate, + &prefetch); if (!frame_info) { // Check if this was an intentional skip due to mode-based filtering if ((self->mode == PROFILING_MODE_CPU || self->mode == PROFILING_MODE_GIL || @@ -771,6 +923,9 @@ _remote_debugging_RemoteUnwinder_get_all_awaited_by_impl(RemoteUnwinderObject *s if (ensure_async_debug_offsets(self) < 0) { return NULL; } + if (refresh_generation_caches_for_interpreter(self, self->interpreter_addr) < 0) { + return NULL; + } PyObject *result = PyList_New(0); if (result == NULL) { @@ -860,6 +1015,9 @@ _remote_debugging_RemoteUnwinder_get_async_stack_trace_impl(RemoteUnwinderObject if (ensure_async_debug_offsets(self) < 0) { return NULL; } + if (refresh_generation_caches_for_interpreter(self, self->interpreter_addr) < 0) { + return NULL; + } PyObject *result = PyList_New(0); if (result == NULL) { @@ -904,8 +1062,15 @@ RemoteUnwinder was created with stats=True. - code_object_cache_hits: Code object cache hits - code_object_cache_misses: Code object cache misses - stale_cache_invalidations: Times stale cache entries were cleared + - batched_read_attempts: Batched remote-read attempts + - batched_read_successes: Attempts that read all requested segments + - batched_read_misses: Attempts that fell back or partially read + - batched_read_segments_requested: Segments requested by batched reads + - batched_read_segments_completed: Segments completed by batched reads - frame_cache_hit_rate: Percentage of samples that hit the cache - code_object_cache_hit_rate: Percentage of code object lookups that hit cache + - batched_read_success_rate: Percentage of batched reads that completed all segments + - batched_read_segment_completion_rate: Percentage of requested segments read by batched reads Raises: RuntimeError: If stats collection was not enabled (stats=False) @@ -913,7 +1078,7 @@ RemoteUnwinder was created with stats=True. static PyObject * _remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self) -/*[clinic end generated code: output=21e36477122be2a0 input=75fef4134c12a8c9]*/ +/*[clinic end generated code: output=21e36477122be2a0 input=0392d62b278e9c35]*/ { if (!self->collect_stats) { PyErr_SetString(PyExc_RuntimeError, @@ -948,9 +1113,24 @@ _remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self) ADD_STAT(code_object_cache_hits); ADD_STAT(code_object_cache_misses); ADD_STAT(stale_cache_invalidations); + ADD_STAT(batched_read_attempts); + ADD_STAT(batched_read_successes); + ADD_STAT(batched_read_misses); + ADD_STAT(batched_read_segments_requested); + ADD_STAT(batched_read_segments_completed); #undef ADD_STAT +#define ADD_DERIVED_STAT(name, value) do { \ + PyObject *val = PyFloat_FromDouble(value); \ + if (!val || PyDict_SetItemString(result, name, val) < 0) { \ + Py_XDECREF(val); \ + Py_DECREF(result); \ + return NULL; \ + } \ + Py_DECREF(val); \ +} while(0) + // Calculate and add derived statistics // Hit rate is calculated as (hits + partial_hits) / total_cache_lookups double frame_cache_hit_rate = 0.0; @@ -959,26 +1139,33 @@ _remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self) frame_cache_hit_rate = 100.0 * (double)(self->stats.frame_cache_hits + self->stats.frame_cache_partial_hits) / (double)total_cache_lookups; } - PyObject *hit_rate = PyFloat_FromDouble(frame_cache_hit_rate); - if (!hit_rate || PyDict_SetItemString(result, "frame_cache_hit_rate", hit_rate) < 0) { - Py_XDECREF(hit_rate); - Py_DECREF(result); - return NULL; - } - Py_DECREF(hit_rate); + ADD_DERIVED_STAT("frame_cache_hit_rate", frame_cache_hit_rate); double code_object_hit_rate = 0.0; uint64_t total_code_lookups = self->stats.code_object_cache_hits + self->stats.code_object_cache_misses; if (total_code_lookups > 0) { code_object_hit_rate = 100.0 * (double)self->stats.code_object_cache_hits / (double)total_code_lookups; } - PyObject *code_hit_rate = PyFloat_FromDouble(code_object_hit_rate); - if (!code_hit_rate || PyDict_SetItemString(result, "code_object_cache_hit_rate", code_hit_rate) < 0) { - Py_XDECREF(code_hit_rate); - Py_DECREF(result); - return NULL; + ADD_DERIVED_STAT("code_object_cache_hit_rate", code_object_hit_rate); + + double batched_read_success_rate = 0.0; + if (self->stats.batched_read_attempts > 0) { + batched_read_success_rate = + 100.0 * (double)self->stats.batched_read_successes + / (double)self->stats.batched_read_attempts; + } + ADD_DERIVED_STAT("batched_read_success_rate", batched_read_success_rate); + + double batched_read_segment_completion_rate = 0.0; + if (self->stats.batched_read_segments_requested > 0) { + batched_read_segment_completion_rate = + 100.0 * (double)self->stats.batched_read_segments_completed + / (double)self->stats.batched_read_segments_requested; } - Py_DECREF(code_hit_rate); + ADD_DERIVED_STAT("batched_read_segment_completion_rate", + batched_read_segment_completion_rate); + +#undef ADD_DERIVED_STAT return result; } diff --git a/Modules/_remote_debugging/threads.c b/Modules/_remote_debugging/threads.c index 4daa5e5f92bcd9..7284f43042061a 100644 --- a/Modules/_remote_debugging/threads.c +++ b/Modules/_remote_debugging/threads.c @@ -289,28 +289,110 @@ typedef struct { unsigned int :24; } _thread_status; +static int +read_thread_state_and_maybe_frame( + RemoteUnwinderObject *unwinder, + uintptr_t tstate_addr, + size_t tstate_size, + char *tstate_buffer, + uintptr_t predicted_frame_addr, + char *frame_buffer, + int *frame_read) +{ + *frame_read = 0; + if (predicted_frame_addr != 0) { + _Py_RemoteReadSegment segments[2] = { + {tstate_addr, tstate_buffer, tstate_size}, + {predicted_frame_addr, frame_buffer, SIZEOF_INTERP_FRAME}, + }; + Py_ssize_t nread = _Py_RemoteDebug_BatchedReadRemoteMemory( + &unwinder->handle, segments, 2); + int completed = 0; + if (nread >= (Py_ssize_t)tstate_size) { + completed = 1; + if (nread == (Py_ssize_t)(tstate_size + SIZEOF_INTERP_FRAME)) { + completed = 2; + } + } + STATS_BATCHED_READ(unwinder, 2, completed); + if (completed >= 1) { + *frame_read = completed == 2; + return 0; + } + } + return _Py_RemoteDebug_ReadRemoteMemory( + &unwinder->handle, tstate_addr, tstate_size, tstate_buffer); +} + PyObject* unwind_stack_for_thread( RemoteUnwinderObject *unwinder, uintptr_t *current_tstate, uintptr_t gil_holder_tstate, uintptr_t gc_frame, - uintptr_t main_thread_tstate + uintptr_t main_thread_tstate, + const RemoteReadPrefetch *prefetch ) { PyObject *frame_info = NULL; PyObject *thread_id = NULL; PyObject *result = NULL; StackChunkList chunks = {0}; - char ts[SIZEOF_THREAD_STATE]; - int bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory( - &unwinder->handle, *current_tstate, (size_t)unwinder->debug_offsets.thread_state.size, ts); - if (bytes_read < 0) { - set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state"); - goto error; + char local_ts[SIZEOF_THREAD_STATE]; + char local_prefetched_frame[SIZEOF_INTERP_FRAME]; + const char *ts; + RemoteReadPrefetch ctx_prefetch = {0}; + if (prefetch->tstate && prefetch->tstate_addr == *current_tstate) { + ts = prefetch->tstate; + if (prefetch->frame) { + ctx_prefetch.frame = prefetch->frame; + ctx_prefetch.frame_addr = prefetch->frame_addr; + } + } + else if (unwinder->cache_frames) { + uintptr_t predicted_frame_addr = 0; + int have_prefetched_frame = 0; + FrameCacheEntry *entry = frame_cache_find_by_tstate(unwinder, *current_tstate); + if (entry && entry->num_addrs > 0) { + predicted_frame_addr = entry->addrs[0]; + } + + int rc = read_thread_state_and_maybe_frame( + unwinder, + *current_tstate, + (size_t)unwinder->debug_offsets.thread_state.size, + local_ts, + predicted_frame_addr, + local_prefetched_frame, + &have_prefetched_frame); + if (rc < 0) { + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state"); + goto error; + } + ts = local_ts; + if (have_prefetched_frame) { + ctx_prefetch.frame = local_prefetched_frame; + ctx_prefetch.frame_addr = predicted_frame_addr; + } + } + else { + int rc = _Py_RemoteDebug_ReadRemoteMemory( + &unwinder->handle, + *current_tstate, + (size_t)unwinder->debug_offsets.thread_state.size, + local_ts); + if (rc < 0) { + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state"); + goto error; + } + ts = local_ts; } STATS_INC(unwinder, memory_reads); STATS_ADD(unwinder, memory_bytes_read, unwinder->debug_offsets.thread_state.size); + if (ctx_prefetch.frame) { + STATS_INC(unwinder, memory_reads); + STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME); + } long tid = GET_MEMBER(long, ts, unwinder->debug_offsets.thread_state.native_thread_id); @@ -432,9 +514,11 @@ unwind_stack_for_thread( uintptr_t addrs[FRAME_CACHE_MAX_FRAMES]; FrameWalkContext ctx = { .frame_addr = frame_addr, + .thread_state_addr = *current_tstate, .base_frame_addr = base_frame_addr, .gc_frame = gc_frame, .chunks = &chunks, + .prefetch = ctx_prefetch, .frame_info = frame_info, .frame_addrs = addrs, .num_addrs = 0, @@ -469,10 +553,18 @@ unwind_stack_for_thread( *current_tstate = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.next); - thread_id = PyLong_FromLongLong(tid); + if (unwinder->cache_frames) { + FrameCacheEntry *entry = frame_cache_find(unwinder, (uint64_t)tid); + if (entry && entry->thread_id_obj) { + thread_id = Py_NewRef(entry->thread_id_obj); + } + } if (thread_id == NULL) { - set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to create thread ID"); - goto error; + thread_id = PyLong_FromLongLong(tid); + if (thread_id == NULL) { + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to create thread ID"); + goto error; + } } RemoteDebuggingState *state = RemoteDebugging_GetStateFromObject((PyObject*)unwinder); diff --git a/Python/remote_debug.h b/Python/remote_debug.h index 6c089a834dcd40..7b2c4f3bcb8077 100644 --- a/Python/remote_debug.h +++ b/Python/remote_debug.h @@ -147,6 +147,7 @@ typedef struct { int memfd; #endif page_cache_entry_t pages[MAX_PAGES]; + int page_cache_count; Py_ssize_t page_size; } proc_handle_t; @@ -185,14 +186,16 @@ _Py_RemoteDebug_FreePageCache(proc_handle_t *handle) handle->pages[i].data = NULL; handle->pages[i].valid = 0; } + handle->page_cache_count = 0; } UNUSED static void _Py_RemoteDebug_ClearCache(proc_handle_t *handle) { - for (int i = 0; i < MAX_PAGES; i++) { + for (int i = 0; i < handle->page_cache_count; i++) { handle->pages[i].valid = 0; } + handle->page_cache_count = 0; } #if defined(__APPLE__) && defined(TARGET_OS_OSX) && TARGET_OS_OSX @@ -222,6 +225,7 @@ _Py_RemoteDebug_InitProcHandle(proc_handle_t *handle, pid_t pid) { handle->memfd = -1; #endif handle->page_size = get_page_size(); + handle->page_cache_count = 0; for (int i = 0; i < MAX_PAGES; i++) { handle->pages[i].data = NULL; handle->pages[i].valid = 0; @@ -1287,8 +1291,9 @@ _Py_RemoteDebug_PagedReadRemoteMemory(proc_handle_t *handle, return _Py_RemoteDebug_ReadRemoteMemory(handle, addr, size, out); } - // Search for valid cached page - for (int i = 0; i < MAX_PAGES; i++) { + // Search only the pages used since the last clear. The cache is cleared + // between profiler samples, so entries are packed at the front. + for (int i = 0; i < handle->page_cache_count; i++) { page_cache_entry_t *entry = &handle->pages[i]; if (entry->valid && entry->page_addr == page_base) { memcpy(out, entry->data + offset_in_page, size); @@ -1296,33 +1301,31 @@ _Py_RemoteDebug_PagedReadRemoteMemory(proc_handle_t *handle, } } - // Find reusable slot - for (int i = 0; i < MAX_PAGES; i++) { - page_cache_entry_t *entry = &handle->pages[i]; - if (!entry->valid) { + if (handle->page_cache_count < MAX_PAGES) { + page_cache_entry_t *entry = &handle->pages[handle->page_cache_count]; + if (entry->data == NULL) { + entry->data = PyMem_RawMalloc(page_size); if (entry->data == NULL) { - entry->data = PyMem_RawMalloc(page_size); - if (entry->data == NULL) { - PyErr_NoMemory(); - _set_debug_exception_cause(PyExc_MemoryError, - "Cannot allocate %zu bytes for page cache entry " - "during read from PID %d at address 0x%lx", - page_size, handle->pid, addr); - return -1; - } - } - - if (_Py_RemoteDebug_ReadRemoteMemory(handle, page_base, page_size, entry->data) < 0) { - // Try to just copy the exact amount as a fallback - PyErr_Clear(); - goto fallback; + PyErr_NoMemory(); + _set_debug_exception_cause(PyExc_MemoryError, + "Cannot allocate %zu bytes for page cache entry " + "during read from PID %d at address 0x%lx", + page_size, handle->pid, addr); + return -1; } + } - entry->page_addr = page_base; - entry->valid = 1; - memcpy(out, entry->data + offset_in_page, size); - return 0; + if (_Py_RemoteDebug_ReadRemoteMemory(handle, page_base, page_size, entry->data) < 0) { + // Try to just copy the exact amount as a fallback + PyErr_Clear(); + goto fallback; } + + entry->page_addr = page_base; + entry->valid = 1; + handle->page_cache_count++; + memcpy(out, entry->data + offset_in_page, size); + return 0; } fallback: @@ -1330,6 +1333,49 @@ _Py_RemoteDebug_PagedReadRemoteMemory(proc_handle_t *handle, return _Py_RemoteDebug_ReadRemoteMemory(handle, addr, size, out); } +typedef struct { + uintptr_t remote_addr; + void *local_buf; + size_t size; +} _Py_RemoteReadSegment; + +#define _PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS 4 + +// Batched read of multiple remote regions in a single syscall when supported. +// Returns total bytes read (>= 0) on success, -1 if batched reads are +// unavailable or the syscall failed. Callers compare the return value against +// cumulative segment sizes to determine which segments were fully populated. +UNUSED static Py_ssize_t +_Py_RemoteDebug_BatchedReadRemoteMemory( + proc_handle_t *handle, + const _Py_RemoteReadSegment *segments, + int nsegs) +{ +#if defined(__linux__) && HAVE_PROCESS_VM_READV + if (handle->memfd == -1 + && nsegs > 0 + && nsegs <= _PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS) { + struct iovec local[_PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS]; + struct iovec remote[_PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS]; + for (int i = 0; i < nsegs; i++) { + local[i].iov_base = segments[i].local_buf; + local[i].iov_len = segments[i].size; + remote[i].iov_base = (void *)segments[i].remote_addr; + remote[i].iov_len = segments[i].size; + } + ssize_t nread = process_vm_readv(handle->pid, local, nsegs, remote, nsegs, 0); + if (nread >= 0) { + return (Py_ssize_t)nread; + } + } +#else + (void)handle; + (void)segments; + (void)nsegs; +#endif + return -1; +} + UNUSED static int _Py_RemoteDebug_ReadDebugOffsets( proc_handle_t *handle, diff --git a/Tools/inspection/benchmark_external_inspection.py b/Tools/inspection/benchmark_external_inspection.py index fee3435496da0b..8e367422a961da 100644 --- a/Tools/inspection/benchmark_external_inspection.py +++ b/Tools/inspection/benchmark_external_inspection.py @@ -151,6 +151,45 @@ def create_threads(n): time.sleep(0.05) ''' +ASYNC_CODE = '''\ +import asyncio +import contextlib +import math + +def compute_slice(seed): + result = 0.0 + for i in range(2000): + result += math.sin(seed + i) * math.sqrt(i + 1) + return result + +async def leaf_task(seed): + total = 0.0 + while True: + total += compute_slice(seed) + await asyncio.sleep(0) + +async def parent_task(seed): + child = asyncio.create_task(leaf_task(seed + 1000), name=f"leaf-{seed}") + try: + while True: + compute_slice(seed) + await asyncio.sleep(0.001) + finally: + child.cancel() + with contextlib.suppress(asyncio.CancelledError): + await child + +async def main(): + tasks = [ + asyncio.create_task(parent_task(i), name=f"parent-{i}") + for i in range(8) + ] + await asyncio.gather(*tasks) + +if __name__ == "__main__": + asyncio.run(main()) +''' + CODE_EXAMPLES = { "basic": { "code": CODE, @@ -164,10 +203,29 @@ def create_threads(n): "code": CODE_WITH_TONS_OF_THREADS, "description": "Tons of threads doing mixed CPU/IO work", }, + "asyncio": { + "code": ASYNC_CODE, + "description": "Asyncio tasks with active and awaited coroutine chains", + }, +} + +OPERATIONS = { + "stack_trace": { + "method": "get_stack_trace", + "label": "get_stack_trace()", + }, + "async_stack_trace": { + "method": "get_async_stack_trace", + "label": "get_async_stack_trace()", + }, + "all_awaited_by": { + "method": "get_all_awaited_by", + "label": "get_all_awaited_by()", + }, } -def benchmark(unwinder, duration_seconds=10, blocking=False): +def benchmark(unwinder, duration_seconds=10, blocking=False, operation="stack_trace"): """Benchmark mode - measure raw sampling speed for specified duration""" sample_count = 0 fail_count = 0 @@ -175,11 +233,14 @@ def benchmark(unwinder, duration_seconds=10, blocking=False): start_time = time.perf_counter() end_time = start_time + duration_seconds total_attempts = 0 + operation_info = OPERATIONS[operation] + operation_method = getattr(unwinder, operation_info["method"]) colors = get_colors(can_colorize()) print( - f"{colors.BOLD_BLUE}Benchmarking sampling speed for {duration_seconds} seconds...{colors.RESET}" + f"{colors.BOLD_BLUE}Benchmarking {operation_info['label']} speed " + f"for {duration_seconds} seconds...{colors.RESET}" ) try: @@ -190,8 +251,8 @@ def benchmark(unwinder, duration_seconds=10, blocking=False): if blocking: unwinder.pause_threads() try: - stack_trace = unwinder.get_stack_trace() - if stack_trace: + sample = operation_method() + if sample: sample_count += 1 finally: if blocking: @@ -239,6 +300,7 @@ def benchmark(unwinder, duration_seconds=10, blocking=False): (sample_count / total_attempts) * 100 if total_attempts > 0 else 0 ), "total_work_time": total_work_time, + "operation": operation_info["label"], "avg_work_time_us": ( (total_work_time / total_attempts) * 1e6 if total_attempts > 0 else 0 ), @@ -252,7 +314,7 @@ def print_benchmark_results(results): colors = get_colors(can_colorize()) print(f"\n{colors.BOLD_GREEN}{'='*60}{colors.RESET}") - print(f"{colors.BOLD_GREEN}get_stack_trace() Benchmark Results{colors.RESET}") + print(f"{colors.BOLD_GREEN}{results['operation']} Benchmark Results{colors.RESET}") print(f"{colors.BOLD_GREEN}{'='*60}{colors.RESET}") # Basic statistics @@ -329,6 +391,8 @@ def parse_arguments(): %(prog)s -d 60 # Run basic benchmark for 60 seconds %(prog)s --code deep_static # Run deep static call stack benchmark %(prog)s --code deep_static -d 30 # Run deep static benchmark for 30 seconds + %(prog)s --operation async_stack_trace + %(prog)s --operation all_awaited_by Available code examples: {examples_desc} @@ -348,8 +412,15 @@ def parse_arguments(): "--code", "-c", choices=list(CODE_EXAMPLES.keys()), - default="basic", - help="Code example to benchmark (default: basic)", + default=None, + help="Code example to benchmark (default: basic, or asyncio for async operations)", + ) + + parser.add_argument( + "--operation", + choices=list(OPERATIONS.keys()), + default="stack_trace", + help="Remote unwinder operation to benchmark (default: stack_trace)", ) parser.add_argument( @@ -365,7 +436,10 @@ def parse_arguments(): help="Stop all threads before sampling for consistent snapshots", ) - return parser.parse_args() + args = parser.parse_args() + if args.code is None: + args.code = "asyncio" if args.operation != "stack_trace" else "basic" + return args def create_target_process(temp_file, code_example="basic"): @@ -420,6 +494,9 @@ def main(): print( f"{colors.CYAN}Benchmark Duration:{colors.RESET} {colors.YELLOW}{args.duration}{colors.RESET} seconds" ) + print( + f"{colors.CYAN}Operation:{colors.RESET} {colors.GREEN}{OPERATIONS[args.operation]['label']}{colors.RESET}" + ) print( f"{colors.CYAN}Blocking Mode:{colors.RESET} {colors.GREEN if args.blocking else colors.YELLOW}{'enabled' if args.blocking else 'disabled'}{colors.RESET}" ) @@ -451,7 +528,12 @@ def main(): unwinder = _remote_debugging.RemoteUnwinder( process.pid, cache_frames=True, **kwargs ) - results = benchmark(unwinder, duration_seconds=args.duration, blocking=args.blocking) + results = benchmark( + unwinder, + duration_seconds=args.duration, + blocking=args.blocking, + operation=args.operation, + ) finally: cleanup_process(process, temp_file_path)