Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions rolling_hash/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Rolling hash algorithms for string matching and similarity."""
77 changes: 77 additions & 0 deletions rolling_hash/rabin_karp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""Rabin-Karp rolling hash algorithm for substring search.

Implements the classic Rabin-Karp algorithm using a rolling hash to find
all occurrences of a pattern in a text in O(n + m) average time, where n is
the text length and m is the pattern length.

The algorithm uses a simple polynomial rolling hash reduced modulo a prime
to keep hash values bounded. Candidate matches are verified by direct
comparison, so results are exact for ASCII and Unicode strings alike.

References:
- Karp, R. M., & Rabin, M. O. (1987). Efficient randomized
  pattern-matching algorithms. IBM J. Res. Dev., 31(2), 249-260.
"""
from typing import List

Check failure on line 12 in rolling_hash/rabin_karp.py

View workflow job for this annotation

GitHub Actions / ruff

ruff (UP035)

rolling_hash/rabin_karp.py:12:1: UP035 `typing.List` is deprecated, use `list` instead


def rabin_karp(text: str, pattern: str) -> list[int]:
    """Return starting indices of pattern in text using a rolling hash.

    A polynomial hash of every window of ``len(pattern)`` characters in
    ``text`` is compared with the hash of ``pattern``. Windows whose hashes
    agree are confirmed by a direct string comparison, so hash collisions
    never produce false positives.

    Args:
        text: The text to search within.
        pattern: The pattern to find.

    Returns:
        List of starting indices (0-based, ascending) where pattern occurs.
        Overlapping occurrences are all reported. An empty pattern matches
        at every index from 0 through ``len(text)`` inclusive.

    Example:
        >>> rabin_karp("abracadabra", "abra")
        [0, 7]
    """
    m, n = len(pattern), len(text)

    # Edge cases: the empty pattern matches before every character and once
    # more at the very end; a pattern longer than the text cannot match.
    if m == 0:
        return list(range(n + 1))
    if m > n:
        return []

    # Rolling hash parameters. A large prime modulus keeps spurious hash
    # collisions (each of which costs a full string comparison) rare.
    base = 256
    prime = 1_000_000_007

    # Weight of the leading character in a window: base^(m-1) mod prime.
    h = pow(base, m - 1, prime)

    # Hash the pattern and the first window of the text in lockstep;
    # zip stops after m characters because m <= n here.
    pattern_hash = 0
    window_hash = 0
    for p_char, t_char in zip(pattern, text):
        pattern_hash = (base * pattern_hash + ord(p_char)) % prime
        window_hash = (base * window_hash + ord(t_char)) % prime

    matches: list[int] = []
    last_start = n - m
    for i in range(last_start + 1):
        # Equal hashes only suggest a match; confirm it character by character.
        if pattern_hash == window_hash and text.startswith(pattern, i):
            matches.append(i)
        if i < last_start:
            # Roll: drop the leading character, shift, append the next one.
            # Python's % with a positive modulus is never negative, so no
            # sign correction is needed after the subtraction.
            leading = ord(text[i]) * h
            window_hash = (
                base * (window_hash - leading) + ord(text[i + m])
            ) % prime
    return matches


def demo() -> None:
    """Search a fixed sample text and print where the pattern occurs."""
    pattern = "abra"
    sample = "abracadabra"
    indices = rabin_karp(sample, pattern)
    print(f"Pattern '{pattern}' found at positions: {indices}")


# Run the demonstration only when executed as a script, not on import.
if __name__ == "__main__":
    demo()
49 changes: 49 additions & 0 deletions tests/test_rolling_hash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
"""Tests for rolling hash Rabin-Karp implementation."""

Check failure on line 1 in tests/test_rolling_hash.py

View workflow job for this annotation

GitHub Actions / ruff

ruff (INP001)

tests/test_rolling_hash.py:1:1: INP001 File `tests/test_rolling_hash.py` is part of an implicit namespace package. Add an `__init__.py`.
import pytest

Check failure on line 2 in tests/test_rolling_hash.py

View workflow job for this annotation

GitHub Actions / ruff

ruff (F401)

tests/test_rolling_hash.py:2:8: F401 `pytest` imported but unused help: Remove unused import: `pytest`
from rolling_hash.rabin_karp import rabin_karp

Check failure on line 3 in tests/test_rolling_hash.py

View workflow job for this annotation

GitHub Actions / ruff

ruff (I001)

tests/test_rolling_hash.py:2:1: I001 Import block is un-sorted or un-formatted help: Organize imports


def test_basic_matches():
    """Occurrences are reported as ascending 0-based start indices."""
    cases = [
        ("abracadabra", "abra", [0, 7]),
        ("aaaaa", "aa", [0, 1, 2, 3]),
        ("hello world", "world", [6]),
    ]
    for haystack, needle, expected in cases:
        assert rabin_karp(haystack, needle) == expected


def test_no_match():
    """Absent patterns and patterns longer than the text yield no hits."""
    for haystack, needle in (("abcdef", "gh"), ("abc", "abcd")):
        assert rabin_karp(haystack, needle) == []


def test_empty_pattern():
    """The empty pattern matches at every offset, including end of text."""
    expected_by_text = {
        "abc": [0, 1, 2, 3],
        "": [0],
    }
    for haystack, expected in expected_by_text.items():
        assert rabin_karp(haystack, "") == expected


def test_single_character():
    """One-character patterns are found at the correct offset."""
    cases = (
        ("a", "a", [0]),
        ("ab", "a", [0]),
        ("ab", "b", [1]),
    )
    for haystack, needle, expected in cases:
        assert rabin_karp(haystack, needle) == expected


def test_overlapping():
    """Occurrences that share characters are each reported."""
    assert rabin_karp("aaa", "aa") == [0, 1]


def test_case_sensitive():
    """Matching distinguishes upper-case from lower-case letters."""
    mixed = "ABCabc"
    assert rabin_karp(mixed, "abc") == [3]
    assert rabin_karp(mixed, "ABC") == [0]


def test_unicode():
    """Non-ASCII text is searched by code point, like any other string."""
    haystack = "你好世界你好"
    needle = "你好"
    assert rabin_karp(haystack, needle) == [0, 4]


def test_long_pattern():
    """A long run of one character matches at every feasible offset."""
    text = "a" * 1000
    pattern = "a" * 100
    # Valid starts run from 0 through len(text) - len(pattern), i.e. 0..900.
    expected = list(range(len(text) - len(pattern) + 1))
    assert rabin_karp(text, pattern) == expected
Loading