Skip to content

Commit 5cc98a3

Browse files
fix: handle Boolean search syntax with hyphenated terms
- Add _prepare_boolean_query method to parse Boolean queries and quote individual terms
- Add _prepare_single_term method to handle single-term preparation logic
- Update _prepare_search_term to use the new Boolean query processing
- Simplify the search method by removing redundant Boolean handling logic
- Add test cases for hyphenated terms with Boolean operators

This fixes the issue where queries like "tier1-test AND unicode" would cause SQL parsing errors because FTS5 was treating "tier1-test" as a column name instead of a search term.

The fix ensures that:
- Boolean operators (AND, OR, NOT) are preserved
- Terms with special characters (hyphens, dots, etc.) are properly quoted
- Simple terms in Boolean queries remain unquoted for optimal FTS5 performance
- Complex Boolean expressions with parentheses are handled correctly

Fixes #178

Co-authored-by: Paul Hernandez <phernandez@users.noreply.github.com>
1 parent e5923a0 commit 5cc98a3

2 files changed

Lines changed: 111 additions & 26 deletions

File tree

src/basic_memory/repository/search_repository.py

Lines changed: 96 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -120,23 +120,82 @@ async def init_search_index(self):
120120
logger.error(f"Error initializing search index: {e}")
121121
raise e
122122

123-
def _prepare_search_term(self, term: str, is_prefix: bool = True) -> str:
124-
"""Prepare a search term for FTS5 query.
125-
123+
def _prepare_boolean_query(self, query: str) -> str:
    """Prepare a Boolean query by preparing each term while preserving operators.

    The query is split on the upper-case operators AND, OR and NOT. Every
    operand between operators is handed to ``_prepare_single_term`` with
    ``is_prefix=False`` (no prefix wildcard inside Boolean queries), while the
    operators and any grouping parentheses around an operand are passed
    through unchanged.

    Args:
        query: A Boolean query like "tier1-test AND unicode" or
            "(hello OR world) NOT test"

    Returns:
        The operators and prepared operands joined by single spaces.
    """
    import re

    # Exactly ONE capture group: re.split() returns the text of *every* group,
    # so a nested group would emit each operator twice ("a AND AND b").
    # An operator must be delimited by whitespace, a parenthesis, or the ends
    # of the string. A plain \b would also match inside hyphenated terms such
    # as "rock-AND-roll" and tear them apart.
    # NOTE(review): operators inside a double-quoted phrase are still treated
    # as operators here; confirm whether phrase queries need to be supported.
    operator_pattern = r"(?<![^\s()])(AND|OR|NOT)(?![^\s()])"

    # With a single capture group the result alternates strictly:
    # operand, operator, operand, operator, ..., operand.
    pieces = re.split(operator_pattern, query)

    processed_parts = []
    for index, piece in enumerate(pieces):
        piece = piece.strip()
        if not piece:
            continue

        # Odd positions hold the captured operators; keep them verbatim.
        if index % 2 == 1:
            processed_parts.append(piece)
            continue

        # Peel grouping parentheses off both ends so only the bare operand is
        # prepared, then put the parentheses back in place. This covers
        # "(term", "term)", "(term)" and nested forms like "((term".
        without_opening = piece.lstrip("(")
        opening = piece[: len(piece) - len(without_opening)]
        core = without_opening.rstrip(")")
        closing = without_opening[len(core):]

        core = core.strip()
        prepared_term = self._prepare_single_term(core, is_prefix=False) if core else ""
        processed_parts.append(f"{opening}{prepared_term}{closing}")

    return " ".join(processed_parts)
183+
184+
def _prepare_single_term(self, term: str, is_prefix: bool = True) -> str:
185+
"""Prepare a single search term (no Boolean operators).
186+
187+
Args:
188+
term: A single search term
128189
is_prefix: Whether to add prefix search capability (* suffix)
129-
130-
For FTS5:
131-
- Boolean operators (AND, OR, NOT) are preserved for complex queries
132-
- Terms with FTS5 special characters are quoted to prevent syntax errors
133-
- Simple terms get prefix wildcards for better matching
190+
191+
Returns:
192+
A properly formatted single term
134193
"""
135-
# Check for explicit boolean operators - if present, return the term as is
136-
boolean_operators = [" AND ", " OR ", " NOT "]
137-
if any(op in f" {term} " for op in boolean_operators):
194+
if not term or not term.strip():
138195
return term
139-
196+
197+
term = term.strip()
198+
140199
# Check if term is already a proper wildcard pattern (alphanumeric + *)
141200
# e.g., "hello*", "test*world" - these should be left alone
142201
if "*" in term and all(c.isalnum() or c in "*_-" for c in term):
@@ -218,6 +277,26 @@ def _prepare_search_term(self, term: str, is_prefix: bool = True) -> str:
218277

219278
return term
220279

280+
def _prepare_search_term(self, term: str, is_prefix: bool = True) -> str:
    """Turn raw search text into an FTS5 query string.

    Acts as a dispatcher between the two preparation strategies:

    - If the text contains AND, OR or NOT as a space-delimited word, the whole
      text is processed by ``_prepare_boolean_query`` (``is_prefix`` is not
      forwarded on this path).
    - Otherwise the text is processed by ``_prepare_single_term`` together
      with ``is_prefix``.

    Args:
        term: The search term to prepare
        is_prefix: Whether to add prefix search capability (* suffix)

    Returns:
        The prepared query string.
    """
    # Pad with spaces so an operator at the very start or end is also seen
    # as a standalone word.
    padded = f" {term} "
    for operator in (" AND ", " OR ", " NOT "):
        if operator in padded:
            return self._prepare_boolean_query(term)

    # No Boolean operator present: plain single-term preparation.
    return self._prepare_single_term(term, is_prefix)
299+
221300
async def search(
222301
self,
223302
search_text: Optional[str] = None,
@@ -242,19 +321,10 @@ async def search(
242321
# For wildcard searches, don't add any text conditions - return all results
243322
pass
244323
else:
245-
# Check for explicit boolean operators - only detect them in proper boolean contexts
246-
has_boolean = any(op in f" {search_text} " for op in [" AND ", " OR ", " NOT "])
247-
248-
if has_boolean:
249-
# If boolean operators are present, use the raw query
250-
# No need to prepare it, FTS5 will understand the operators
251-
params["text"] = search_text
252-
conditions.append("(title MATCH :text OR content_stems MATCH :text)")
253-
else:
254-
# Standard search with term preparation
255-
processed_text = self._prepare_search_term(search_text.strip())
256-
params["text"] = processed_text
257-
conditions.append("(title MATCH :text OR content_stems MATCH :text)")
324+
# Use _prepare_search_term to handle both Boolean and non-Boolean queries
325+
processed_text = self._prepare_search_term(search_text.strip())
326+
params["text"] = processed_text
327+
conditions.append("(title MATCH :text OR content_stems MATCH :text)")
258328

259329
# Handle title match search
260330
if title:

tests/repository/test_search_repository.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,21 @@ def test_boolean_operators_preserved(self, search_repository):
329329
== "(hello AND world) OR test"
330330
)
331331

332+
def test_hyphenated_terms_with_boolean_operators(self, search_repository):
    """Terms with hyphens or other special characters are quoted in Boolean queries."""
    # (raw query, expected prepared query), checked in order.
    expectations = [
        # The specific case from the GitHub issue
        ("tier1-test AND unicode", '"tier1-test" AND unicode'),
        # Other hyphenated Boolean combinations
        ("multi-word OR single", '"multi-word" OR single'),
        ("well-formed NOT badly-formed", '"well-formed" NOT "badly-formed"'),
        ("test-case AND (hello OR world)", '"test-case" AND (hello OR world)'),
        # Mixed special characters with Boolean operators
        ("config.json AND test-file", '"config.json" AND "test-file"'),
        ("C++ OR python-script", '"C++" OR "python-script"'),
    ]

    for raw_query, expected in expectations:
        assert search_repository._prepare_search_term(raw_query) == expected
346+
332347
def test_programming_terms_should_work(self, search_repository):
333348
"""Programming-related terms with special chars should be searchable."""
334349
# These should be quoted to handle special characters safely

0 commit comments

Comments
 (0)