Skip to content

Commit 30a8935

Browse files
authored
fix(core): handle SQLite and Windows semantic regressions (#655)
Signed-off-by: phernandez <paul@basicmachines.co>
1 parent 222ec5d commit 30a8935

6 files changed

Lines changed: 351 additions & 55 deletions

File tree

src/basic_memory/repository/search_repository_base.py

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -451,21 +451,36 @@ def _compose_row_source_text(self, row) -> str:
451451
return "\n\n".join(part for part in row_parts if part)
452452

453453
def _build_chunk_records(self, rows) -> list[dict[str, str]]:
454-
records: list[dict[str, str]] = []
454+
records_by_key: dict[str, dict[str, str]] = {}
455+
duplicate_chunk_keys = 0
455456
for row in rows:
456457
source_text = self._compose_row_source_text(row)
457458
chunks = self._split_text_into_chunks(source_text)
458459
for chunk_index, chunk_text in enumerate(chunks):
459460
chunk_key = f"{row.type}:{row.id}:{chunk_index}"
460461
source_hash = hashlib.sha256(chunk_text.encode("utf-8")).hexdigest()
461-
records.append(
462-
{
463-
"chunk_key": chunk_key,
464-
"chunk_text": chunk_text,
465-
"source_hash": source_hash,
466-
}
467-
)
468-
return records
462+
# Trigger: SQLite FTS5 can accumulate duplicate logical rows for the
463+
# same search_index id because it does not enforce relational uniqueness.
464+
# Why: duplicate chunk keys would schedule duplicate writes for the same
465+
# chunk row and eventually trip UNIQUE(rowid) in search_vector_embeddings.
466+
# Outcome: collapse chunk work to one deterministic record per chunk key.
467+
if chunk_key in records_by_key:
468+
duplicate_chunk_keys += 1
469+
records_by_key[chunk_key] = {
470+
"chunk_key": chunk_key,
471+
"chunk_text": chunk_text,
472+
"source_hash": source_hash,
473+
}
474+
475+
if duplicate_chunk_keys:
476+
logger.warning(
477+
"Collapsed duplicate vector chunk keys before embedding sync: "
478+
"project_id={project_id} duplicate_chunk_keys={duplicate_chunk_keys}",
479+
project_id=self.project_id,
480+
duplicate_chunk_keys=duplicate_chunk_keys,
481+
)
482+
483+
return list(records_by_key.values())
469484

470485
# --- Text splitting ---
471486

src/basic_memory/services/project_service.py

Lines changed: 70 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
from loguru import logger
1313
from sqlalchemy import text
14+
from sqlalchemy.exc import OperationalError as SAOperationalError
1415

1516
from basic_memory.models import Project
1617
from basic_memory.repository.project_repository import ProjectRepository
@@ -1004,56 +1005,81 @@ async def get_embedding_status(self, project_id: int) -> EmbeddingStatus:
10041005
)
10051006
total_indexed_entities = si_result.scalar() or 0
10061007

1007-
chunks_result = await self.repository.execute_query(
1008-
text("SELECT COUNT(*) FROM search_vector_chunks WHERE project_id = :project_id"),
1009-
{"project_id": project_id},
1010-
)
1011-
total_chunks = chunks_result.scalar() or 0
1012-
1013-
entities_with_chunks_result = await self.repository.execute_query(
1014-
text(
1015-
"SELECT COUNT(DISTINCT entity_id) FROM search_vector_chunks "
1016-
"WHERE project_id = :project_id"
1017-
),
1018-
{"project_id": project_id},
1019-
)
1020-
total_entities_with_chunks = entities_with_chunks_result.scalar() or 0
1021-
1022-
# Embeddings count — join pattern differs between SQLite and Postgres
1023-
if is_postgres:
1024-
embeddings_sql = text(
1025-
"SELECT COUNT(*) FROM search_vector_chunks c "
1026-
"JOIN search_vector_embeddings e ON e.chunk_id = c.id "
1027-
"WHERE c.project_id = :project_id"
1028-
)
1029-
else:
1030-
embeddings_sql = text(
1031-
"SELECT COUNT(*) FROM search_vector_chunks c "
1032-
"JOIN search_vector_embeddings e ON e.rowid = c.id "
1033-
"WHERE c.project_id = :project_id"
1008+
try:
1009+
chunks_result = await self.repository.execute_query(
1010+
text("SELECT COUNT(*) FROM search_vector_chunks WHERE project_id = :project_id"),
1011+
{"project_id": project_id},
10341012
)
1013+
total_chunks = chunks_result.scalar() or 0
10351014

1036-
embeddings_result = await self.repository.execute_query(
1037-
embeddings_sql, {"project_id": project_id}
1038-
)
1039-
total_embeddings = embeddings_result.scalar() or 0
1015+
entities_with_chunks_result = await self.repository.execute_query(
1016+
text(
1017+
"SELECT COUNT(DISTINCT entity_id) FROM search_vector_chunks "
1018+
"WHERE project_id = :project_id"
1019+
),
1020+
{"project_id": project_id},
1021+
)
1022+
total_entities_with_chunks = entities_with_chunks_result.scalar() or 0
1023+
1024+
# Embeddings count — join pattern differs between SQLite and Postgres
1025+
if is_postgres:
1026+
embeddings_sql = text(
1027+
"SELECT COUNT(*) FROM search_vector_chunks c "
1028+
"JOIN search_vector_embeddings e ON e.chunk_id = c.id "
1029+
"WHERE c.project_id = :project_id"
1030+
)
1031+
else:
1032+
embeddings_sql = text(
1033+
"SELECT COUNT(*) FROM search_vector_chunks c "
1034+
"JOIN search_vector_embeddings e ON e.rowid = c.id "
1035+
"WHERE c.project_id = :project_id"
1036+
)
10401037

1041-
# Orphaned chunks (chunks without embeddings — indicates interrupted indexing)
1042-
if is_postgres:
1043-
orphan_sql = text(
1044-
"SELECT COUNT(*) FROM search_vector_chunks c "
1045-
"LEFT JOIN search_vector_embeddings e ON e.chunk_id = c.id "
1046-
"WHERE c.project_id = :project_id AND e.chunk_id IS NULL"
1038+
embeddings_result = await self.repository.execute_query(
1039+
embeddings_sql, {"project_id": project_id}
10471040
)
1048-
else:
1049-
orphan_sql = text(
1050-
"SELECT COUNT(*) FROM search_vector_chunks c "
1051-
"LEFT JOIN search_vector_embeddings e ON e.rowid = c.id "
1052-
"WHERE c.project_id = :project_id AND e.rowid IS NULL"
1041+
total_embeddings = embeddings_result.scalar() or 0
1042+
1043+
# Orphaned chunks (chunks without embeddings — indicates interrupted indexing)
1044+
if is_postgres:
1045+
orphan_sql = text(
1046+
"SELECT COUNT(*) FROM search_vector_chunks c "
1047+
"LEFT JOIN search_vector_embeddings e ON e.chunk_id = c.id "
1048+
"WHERE c.project_id = :project_id AND e.chunk_id IS NULL"
1049+
)
1050+
else:
1051+
orphan_sql = text(
1052+
"SELECT COUNT(*) FROM search_vector_chunks c "
1053+
"LEFT JOIN search_vector_embeddings e ON e.rowid = c.id "
1054+
"WHERE c.project_id = :project_id AND e.rowid IS NULL"
1055+
)
1056+
1057+
orphan_result = await self.repository.execute_query(
1058+
orphan_sql, {"project_id": project_id}
10531059
)
1060+
orphaned_chunks = orphan_result.scalar() or 0
1061+
except SAOperationalError as exc:
1062+
# Trigger: sqlite_master can list vec0 virtual tables even when sqlite-vec
1063+
# is not loaded in the current Python runtime.
1064+
# Why: project info should degrade gracefully instead of crashing on stats queries.
1065+
# Outcome: report vector tables as unavailable and point the user to install the
1066+
# missing dependency before rebuilding embeddings.
1067+
if is_postgres or "no such module: vec0" not in str(exc).lower():
1068+
raise
10541069

1055-
orphan_result = await self.repository.execute_query(orphan_sql, {"project_id": project_id})
1056-
orphaned_chunks = orphan_result.scalar() or 0
1070+
return EmbeddingStatus(
1071+
semantic_search_enabled=True,
1072+
embedding_provider=provider,
1073+
embedding_model=model,
1074+
embedding_dimensions=dimensions,
1075+
total_indexed_entities=total_indexed_entities,
1076+
vector_tables_exist=False,
1077+
reindex_recommended=True,
1078+
reindex_reason=(
1079+
"SQLite vector tables exist but sqlite-vec is unavailable in this Python "
1080+
"environment — install/update basic-memory, then run: bm reindex --embeddings"
1081+
),
1082+
)
10571083

10581084
# --- Reindex recommendation logic (priority order) ---
10591085
reindex_recommended = False

src/basic_memory/utils.py

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ def __str__(self) -> str: ...
6666
# In type annotations, use Union[Path, str] instead of FilePath for now
6767
# This preserves compatibility with existing code while we migrate
6868
FilePath = Union[Path, str]
69+
WINDOWS_LOG_FILE_RETENTION = 5
6970

7071

7172
def generate_permalink(file_path: Union[Path, str, PathLike], split_extension: bool = True) -> str:
@@ -250,7 +251,7 @@ def setup_logging(
250251
log_to_file: bool = False,
251252
log_to_stdout: bool = False,
252253
structured_context: bool = False,
253-
) -> None: # pragma: no cover
254+
) -> None:
254255
"""Configure logging with explicit settings.
255256
256257
This function provides a simple, explicit interface for configuring logging.
@@ -273,8 +274,14 @@ def setup_logging(
273274

274275
# Add file handler with rotation
275276
if log_to_file:
276-
log_path = Path.home() / ".basic-memory" / "basic-memory.log"
277+
# Trigger: Windows does not allow renaming an open file held by another process.
278+
# Why: multiple basic-memory processes can share the same log directory at once.
279+
# Outcome: use per-process log files on Windows so log rotation stays local.
280+
log_filename = f"basic-memory-{os.getpid()}.log" if os.name == "nt" else "basic-memory.log"
281+
log_path = Path.home() / ".basic-memory" / log_filename
277282
log_path.parent.mkdir(parents=True, exist_ok=True)
283+
if os.name == "nt":
284+
_cleanup_windows_log_files(log_path.parent, log_path.name)
278285
# Keep logging synchronous (enqueue=False) to avoid background logging threads.
279286
# Background threads are a common source of "hang on exit" issues in CLI/test runs.
280287
logger.add(
@@ -308,6 +315,28 @@ def setup_logging(
308315
logging.getLogger("watchfiles.main").setLevel(logging.WARNING)
309316

310317

318+
def _cleanup_windows_log_files(log_dir: Path, current_log_name: str) -> None:
    """Trim stale per-process Windows log files so the directory stays bounded.

    Args:
        log_dir: Directory that holds the basic-memory log files.
        current_log_name: This process's log file name; never deleted.
    """
    stale_logs = [
        path
        for path in log_dir.glob("basic-memory-*.log*")
        if path.is_file() and path.name != current_log_name
    ]

    # Current file plus up to (retention - 1) stale files is within budget.
    if len(stale_logs) <= WINDOWS_LOG_FILE_RETENTION - 1:
        return

    def _mtime(path: Path) -> float:
        # Another basic-memory process can delete a stale log between the
        # glob above and this stat; treat vanished files as oldest so they
        # sort into the deletion tail instead of raising out of setup_logging.
        try:
            return path.stat().st_mtime
        except OSError:
            return 0.0

    # Trigger: per-process log filenames avoid Windows rename contention but fragment retention.
    # Why: loguru retention applies per sink, not across the whole basic-memory log directory.
    # Outcome: keep only the newest stale PID logs so repeated CLI/server launches stay bounded.
    stale_logs.sort(key=_mtime, reverse=True)
    for stale_log in stale_logs[WINDOWS_LOG_FILE_RETENTION - 1 :]:
        try:
            stale_log.unlink()
        except OSError:
            logger.debug("Failed to delete stale Windows log file: {path}", path=stale_log)
338+
339+
311340
def parse_tags(tags: Union[List[str], str, None]) -> List[str]:
312341
"""Parse tags from various input formats into a consistent list.
313342

tests/repository/test_semantic_search_base.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,29 @@ def test_chunk_key_includes_row_id(self):
237237
records = self.repo._build_chunk_records(rows)
238238
assert any("99" in r["chunk_key"] for r in records)
239239

240+
def test_duplicate_rows_collapse_to_unique_chunk_keys(self):
    """Duplicate FTS5 rows for one entity must collapse to a single chunk record."""
    # Two physically distinct rows describing the same logical entity (id 77).
    rows = [
        _make_row(
            row_type=SearchItemType.ENTITY.value,
            title="Spec",
            permalink="spec",
            content_snippet="shared content",
            row_id=77,
        )
        for _ in range(2)
    ]

    records = self.repo._build_chunk_records(rows)

    # Exactly one deterministic record remains, keyed as type:id:chunk_index.
    assert len(records) == 1
    assert records[0]["chunk_key"] == "entity:77:0"
262+
240263

241264
# --- SQLite SemanticSearchDisabledError ---
242265

tests/services/test_project_service_embedding_status.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import pytest
77
from sqlalchemy import text
8+
from sqlalchemy.exc import OperationalError as SAOperationalError
89

910
from basic_memory.schemas.project_info import EmbeddingStatus
1011
from basic_memory.services.project_service import ProjectService
@@ -142,6 +143,46 @@ async def test_embedding_status_orphaned_chunks(
142143
assert "orphaned chunks" in (status.reindex_reason or "")
143144

144145

146+
@pytest.mark.asyncio
async def test_embedding_status_handles_sqlite_vec_unavailable(
    project_service: ProjectService, test_graph, test_project
):
    """Unreadable vec0 tables should degrade to unavailable status instead of crashing."""
    # Trigger: Postgres test matrix executes the same unit suite.
    # Why: sqlite-vec loading failures are specific to SQLite virtual tables, not Postgres joins.
    # Outcome: keep the regression focused on the backend that can actually hit this path.
    if _is_postgres():
        pytest.skip("sqlite-vec unavailable handling is SQLite-specific.")

    # Keep a handle on the real query path so non-vec0 statements still run.
    original_execute_query = project_service.repository.execute_query

    async def _execute_query_with_vec0_failure(query, params):
        # Only the embeddings-join statements touch the vec0 virtual table;
        # fail those with the SQLite "no such module" error and delegate the
        # remaining count queries to the real repository.
        query_text = str(query)
        if "JOIN search_vector_embeddings" in query_text:
            raise SAOperationalError(query_text, params, Exception("no such module: vec0"))
        return await original_execute_query(query, params)

    # Patch the class-level property so semantic search reads as enabled,
    # then swap in the failing query shim for this one status call.
    with patch.object(
        type(project_service),
        "config_manager",
        new_callable=lambda: property(
            lambda self: _config_manager_with(semantic_search_enabled=True)
        ),
    ):
        with patch.object(
            project_service.repository,
            "execute_query",
            side_effect=_execute_query_with_vec0_failure,
        ):
            status = await project_service.get_embedding_status(test_project.id)

    # Degraded-but-informative status: counts gathered before the failure
    # survive, vector tables report unavailable, and the reason points at
    # the missing sqlite-vec dependency.
    assert status.semantic_search_enabled is True
    assert status.total_indexed_entities > 0
    assert status.vector_tables_exist is False
    assert status.reindex_recommended is True
    assert "sqlite-vec is unavailable" in (status.reindex_reason or "")
185+
145186
@pytest.mark.asyncio
146187
async def test_embedding_status_healthy(project_service: ProjectService, test_graph, test_project):
147188
"""When all entities have embeddings, no reindex recommended."""

0 commit comments

Comments
 (0)