Skip to content

Commit 5848408

Browse files
Add portable SIMD optimization for find_max_char (STRINGLIB_SIZEOF_CHAR==1)
Use GCC/Clang vector extensions to check 64 bytes at a time for non-ASCII characters, similar to the approach in pythongh-143991 (pystrhex). The compiler lowers the 64-byte vector to the widest native SIMD available (4x NEON on AArch64, 2x AVX-256 or 1x AVX-512 on x86-64). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 2cf6a68 commit 5848408

1 file changed

Lines changed: 53 additions & 0 deletions

File tree

Objects/stringlib/find_max_char.h

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,57 @@
1717

1818
#if STRINGLIB_SIZEOF_CHAR == 1
1919

20+
#ifdef HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR
21+
22+
/* 512-bit vector of 64 unsigned bytes.  GCC/Clang split operations on
   this type into the widest SIMD registers enabled for the target
   (e.g. 4x 128-bit NEON on AArch64 or SSE2 on baseline x86-64,
   2x 256-bit AVX2, or 1x AVX-512). */
25+
typedef unsigned char v64u8 __attribute__((vector_size(64)));
26+
27+
Py_LOCAL_INLINE(Py_UCS4)
28+
STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)
29+
{
30+
const unsigned char *p = (const unsigned char *)begin;
31+
const unsigned char *_end = (const unsigned char *)end;
32+
33+
/* SIMD path: OR 64 bytes at a time into an accumulator, then check
34+
the high bits once at the end. The branchless inner loop lets
35+
the compiler schedule multiple independent vector loads.
36+
Uses memcpy for safe unaligned loads; the compiler optimizes
37+
these to native vector load instructions. */
38+
v64u8 accum = {0};
39+
while (p + 64 <= _end) {
40+
v64u8 data;
41+
memcpy(&data, p, 64);
42+
accum |= data;
43+
p += 64;
44+
}
45+
46+
/* Reduce: OR 64-byte accumulator down to a scalar high-bit check */
47+
uint64_t a, b, c, d, e, f, g, h;
48+
memcpy(&a, (const char *)&accum + 0, 8);
49+
memcpy(&b, (const char *)&accum + 8, 8);
50+
memcpy(&c, (const char *)&accum + 16, 8);
51+
memcpy(&d, (const char *)&accum + 24, 8);
52+
memcpy(&e, (const char *)&accum + 32, 8);
53+
memcpy(&f, (const char *)&accum + 40, 8);
54+
memcpy(&g, (const char *)&accum + 48, 8);
55+
memcpy(&h, (const char *)&accum + 56, 8);
56+
if ((a | b | c | d | e | f | g | h) & UCS1_ASCII_CHAR_MASK) {
57+
return 255;
58+
}
59+
60+
/* Scalar tail for remaining bytes */
61+
while (p < _end) {
62+
if (*p++ & 0x80) {
63+
return 255;
64+
}
65+
}
66+
return 127;
67+
}
68+
69+
#else /* !HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR */
70+
2071
Py_LOCAL_INLINE(Py_UCS4)
2172
STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)
2273
{
@@ -43,6 +94,8 @@ STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)
4394
return 127;
4495
}
4596

97+
#endif /* HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR */
98+
4699
#undef ASCII_CHAR_MASK
47100

48101
#else /* STRINGLIB_SIZEOF_CHAR == 1 */

0 commit comments

Comments
 (0)