diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c
index d4135e4baf..54843dbcc5 100644
--- a/ext/strscan/strscan.c
+++ b/ext/strscan/strscan.c
@@ -1852,6 +1852,99 @@ strscan_values_at(int argc, VALUE *argv, VALUE self)
     return new_ary;
 }
 
+/* rb_int_parse_cstr is declared in internal/bignum.h, which is not
+ * available to extensions. Declare it here since the symbol is
+ * exported from libruby.
+ * NOTE(review): this relies on a non-public symbol; confirm that it is
+ * exported by every Ruby this extension is built against. */
+VALUE rb_int_parse_cstr(const char *str, ssize_t len, char **endp,
+                        size_t *ndigits, int base, int flags);
+#define RB_INT_PARSE_SIGN 0x01
+
+/* Longest prefix of a rejected capture echoed in the error message. */
+#define INTEGER_AT_ERROR_PREVIEW_MAX 256
+
+/*
+ * call-seq:
+ *   integer_at(index) -> integer or nil
+ *
+ * Returns the capture at the given +index+ of the most recent match,
+ * parsed as a base-10 Integer directly from the scanned bytes, without
+ * creating an intermediate String object.
+ *
+ * An +index+ of +0+ selects the entire match; a negative +index+ counts
+ * back from the last capture group.
+ *
+ * Returns +nil+ if the most recent match failed, if +index+ is out of
+ * range, or if the capture at +index+ did not participate in the match.
+ *
+ * Raises ArgumentError if the capture is empty or cannot be consumed in
+ * full as a base-10 integer (a leading sign is accepted). This is
+ * stricter than <tt>self[index].to_i</tt>, which converts such captures
+ * to +0+ or to their leading digits instead of raising.
+ *
+ *   scanner = StringScanner.new("2024-06-15")
+ *   scanner.scan(/(\d{4})-(\d{2})-(\d{2})/)
+ *   scanner.integer_at(1)   # => 2024
+ *   scanner.integer_at(2)   # => 6
+ *   scanner.integer_at(-1)  # => 15
+ *   scanner.integer_at(4)   # => nil (no such capture)
+ *   scanner.integer_at(0)   # raises ArgumentError (match contains "-")
+ *
+ */
+static VALUE
+strscan_integer_at(VALUE self, VALUE idx)
+{
+    struct strscanner *p;
+    long i;
+    long beg, end, len;
+    const char *ptr;
+    char *endp = NULL;
+    VALUE integer;
+
+    GET_SCANNER(self, p);
+    if (! MATCHED_P(p)) return Qnil;
+
+    i = NUM2LONG(idx);
+
+    /* A negative index counts back from the last register. */
+    if (i < 0)
+        i += p->regs.num_regs;
+    if (i < 0) return Qnil;
+    if (i >= p->regs.num_regs) return Qnil;
+    /* A group that did not participate has a begin offset of -1. */
+    if (p->regs.beg[i] == -1) return Qnil;
+
+    beg = adjust_register_position(p, p->regs.beg[i]);
+    end = adjust_register_position(p, p->regs.end[i]);
+    len = end - beg;
+
+    if (len <= 0) {
+        rb_raise(rb_eArgError, "empty capture for integer conversion");
+    }
+
+    ptr = S_PBEG(p) + beg;
+
+    /* Parse directly from the source bytes without a buffer allocation.
+     * rb_int_parse_cstr takes a length, so no NUL-termination is needed;
+     * endp reports where parsing stopped. */
+    integer = rb_int_parse_cstr(ptr, len, &endp, NULL, 10, RB_INT_PARSE_SIGN);
+
+    /* nil is reserved for "no match / no such capture", so a failed or
+     * partial parse must raise rather than leak out as a return value. */
+    if (NIL_P(integer) || endp != ptr + len) {
+        /* %.*s takes an int precision: echo a bounded prefix instead of
+         * narrowing an arbitrary long to int. */
+        const int shown = len > INTEGER_AT_ERROR_PREVIEW_MAX
+            ? INTEGER_AT_ERROR_PREVIEW_MAX : (int)len;
+
+        rb_raise(rb_eArgError,
+                 "non-digit character in capture: %.*s",
+                 shown, ptr);
+    }
+    return integer;
+}
+
 /*
  * :markup: markdown
  * :include: strscan/link_refs.txt
@@ -2290,6 +2383,7 @@ Init_strscan(void)
     rb_define_method(StringScanner, "size", strscan_size, 0);
     rb_define_method(StringScanner, "captures", strscan_captures, 0);
     rb_define_method(StringScanner, "values_at", strscan_values_at, -1);
+    rb_define_method(StringScanner, "integer_at", strscan_integer_at, 1);
     rb_define_method(StringScanner, "rest", strscan_rest, 0);
     rb_define_method(StringScanner, "rest_size", strscan_rest_size, 0);
 
diff --git a/test/strscan/test_stringscanner.rb b/test/strscan/test_stringscanner.rb
index dd3663ea6a..a3f32aac1c 100644
--- a/test/strscan/test_stringscanner.rb
+++ b/test/strscan/test_stringscanner.rb
@@ -968,6 +968,112 @@ def test_named_captures_same_name_union
     assert_equal({"number" => "1"}, scan.named_captures)
   end
 
+  def test_integer_at
+    s = create_string_scanner("2024-06-15")
+    s.scan(/(\d{4})-(\d{2})-(\d{2})/)
+    assert_equal(2024, s.integer_at(1))
+    assert_equal(6, s.integer_at(2))
+    assert_equal(15, s.integer_at(3))
+  end
+
+  def test_integer_at_index_zero
+    s = create_string_scanner("42 abc")
+    s.scan(/(\d+)/)
+    assert_equal(42, s.integer_at(0))
+  end
+
+  def test_integer_at_negative_index
+    s = create_string_scanner("2024-06-15")
+    s.scan(/(\d{4})-(\d{2})-(\d{2})/)
+    assert_equal(15, s.integer_at(-1))
+    assert_equal(6, s.integer_at(-2))
+    assert_equal(2024, s.integer_at(-3))
+  end
+
+  def test_integer_at_no_match
+    s = create_string_scanner("abc")
+    s.scan(/\d+/)
+    assert_nil(s.integer_at(0))
+  end
+
+  def test_integer_at_before_match
+    s = create_string_scanner("abc")
+    assert_nil(s.integer_at(0))
+  end
+
+  def test_integer_at_index_out_of_range
+    s = create_string_scanner("42")
+    s.scan(/(\d+)/)
+    assert_nil(s.integer_at(2))
+    assert_nil(s.integer_at(100))
+    assert_nil(s.integer_at(-3))
+  end
+
+  def test_integer_at_optional_group_not_matched
+    s = create_string_scanner("2024-06")
+    s.scan(/(\d{4})-(\d{2})(-(\d{2}))?/)
+    assert_equal(2024, s.integer_at(1))
+    assert_equal(6, s.integer_at(2))
+    assert_nil(s.integer_at(4))
+  end
+
+  def test_integer_at_large_number
+    huge = '9' * 100
+    s = create_string_scanner(huge)
+    s.scan(/(#{huge})/)
+    assert_equal(huge.to_i, s.integer_at(1))
+  end
+
+  def test_integer_at_non_digit
+    s = create_string_scanner("1.5")
+    s.scan(/([\d.]+)/)
+    assert_raise(ArgumentError) { s.integer_at(1) }
+  end
+
+  def test_integer_at_non_digit_alpha
+    s = create_string_scanner("foo bar")
+    s.scan(/(\w+)/)
+    assert_raise(ArgumentError) { s.integer_at(1) }
+  end
+
+  def test_integer_at_empty_capture
+    s = create_string_scanner("abc")
+    s.scan(/()abc/)
+    assert_raise(ArgumentError) { s.integer_at(1) }
+  end
+
+  def test_integer_at_sign_only
+    s = create_string_scanner("+")
+    s.scan(/([+\-])/)
+    assert_raise(ArgumentError) { s.integer_at(1) }
+
+    s = create_string_scanner("-")
+    s.scan(/([+\-])/)
+    assert_raise(ArgumentError) { s.integer_at(1) }
+  end
+
+  def test_integer_at_signed_number
+    s = create_string_scanner("-42")
+    s.scan(/([+\-]?\d+)/)
+    assert_equal(-42, s.integer_at(1))
+
+    s = create_string_scanner("+42")
+    s.scan(/([+\-]?\d+)/)
+    assert_equal(42, s.integer_at(1))
+  end
+
+  def test_integer_at_leading_zeros
+    s = create_string_scanner("007")
+    s.scan(/(\d+)/)
+    assert_equal(7, s.integer_at(1))
+  end
+
+  def test_integer_at_full_match_with_non_digits
+    s = create_string_scanner("2024-06-15")
+    s.scan(/(\d{4})-(\d{2})-(\d{2})/)
+    assert_raise(ArgumentError) { s.integer_at(0) }
+  end
+
   def test_scan_integer
     s = create_string_scanner('abc')
     assert_equal(3, s.match?(/(?abc)/)) # set named_captures