Skip to content

Commit 6bf5343

Browse files
phernandez and claude committed
fix(core): exclude stale entity rows from embedding coverage stats
When entities are deleted, their rows in search_index and search_vector_chunks can remain as orphans. This caused `bm project info` to overcount total_indexed_entities and report false "missing embeddings" recommendations after a successful reindex.

Two-pronged fix:
1. Filter embedding status queries to only count entity_ids that still exist in the entity table (fixes reporting)
2. Purge stale search rows during reindex (fixes root cause)

Fixes #670

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: phernandez <paul@basicmachines.co>
1 parent dd91b49 commit 6bf5343

3 files changed

Lines changed: 103 additions & 3 deletions

File tree

src/basic_memory/services/project_service.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -997,25 +997,33 @@ async def get_embedding_status(self, project_id: int) -> EmbeddingStatus:
997997
)
998998

999999
# --- Count queries (tables exist) ---
1000+
# Filter by entity existence to exclude stale rows from deleted entities
1001+
# that remain in derived search tables (search_index, search_vector_chunks)
1002+
entity_exists = "AND entity_id IN (SELECT id FROM entity WHERE project_id = :project_id)"
1003+
10001004
si_result = await self.repository.execute_query(
10011005
text(
1002-
"SELECT COUNT(DISTINCT entity_id) FROM search_index WHERE project_id = :project_id"
1006+
"SELECT COUNT(DISTINCT entity_id) FROM search_index "
1007+
f"WHERE project_id = :project_id {entity_exists}"
10031008
),
10041009
{"project_id": project_id},
10051010
)
10061011
total_indexed_entities = si_result.scalar() or 0
10071012

10081013
try:
10091014
chunks_result = await self.repository.execute_query(
1010-
text("SELECT COUNT(*) FROM search_vector_chunks WHERE project_id = :project_id"),
1015+
text(
1016+
"SELECT COUNT(*) FROM search_vector_chunks "
1017+
f"WHERE project_id = :project_id {entity_exists}"
1018+
),
10111019
{"project_id": project_id},
10121020
)
10131021
total_chunks = chunks_result.scalar() or 0
10141022

10151023
entities_with_chunks_result = await self.repository.execute_query(
10161024
text(
10171025
"SELECT COUNT(DISTINCT entity_id) FROM search_vector_chunks "
1018-
"WHERE project_id = :project_id"
1026+
f"WHERE project_id = :project_id {entity_exists}"
10191027
),
10201028
{"project_id": project_id},
10211029
)

src/basic_memory/services/search_service.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -403,6 +403,11 @@ async def reindex_vectors(self, progress_callback=None) -> dict:
403403
"""
404404
entities = await self.entity_repository.find_all()
405405
entity_ids = [entity.id for entity in entities]
406+
407+
# Clean up stale rows in search_index and search_vector_chunks
408+
# that reference entity_ids no longer in the entity table
409+
await self._purge_stale_search_rows(set(entity_ids))
410+
406411
batch_result = await self.repository.sync_entity_vectors_batch(
407412
entity_ids,
408413
progress_callback=progress_callback,
@@ -419,6 +424,52 @@ async def reindex_vectors(self, progress_callback=None) -> dict:
419424

420425
return stats
421426

427+
async def _purge_stale_search_rows(self, valid_entity_ids: set[int]) -> None:
    """Delete derived search rows whose entity no longer exists in this project.

    Trigger: entities are deleted but their derived search rows remain
    Why: stale rows inflate embedding coverage stats in project info
    Outcome: search tables only contain rows for entities that still exist

    Staleness is decided entirely in SQL: a row is stale when its entity_id is
    not present in the entity table for the repository's project_id.

    Args:
        valid_entity_ids: Accepted for the caller's convenience; this method
            does not read it.
    """
    from basic_memory.repository.sqlite_search_repository import SQLiteSearchRepository
    from sqlalchemy import text

    project_id = self.repository.project_id
    bind_params = {"project_id": project_id}

    # One shared WHERE clause keeps all three deletes scoped identically.
    orphan_predicate = "entity_id NOT IN (SELECT id FROM entity WHERE project_id = :project_id)"
    scoped_orphans = f"WHERE project_id = :project_id AND {orphan_predicate}"

    # Statements are collected in the order they must run.
    statements = [f"DELETE FROM search_index {scoped_orphans}"]

    # SQLite vec has no CASCADE — must delete embeddings before chunks
    if isinstance(self.repository, SQLiteSearchRepository):
        statements.append(
            "DELETE FROM search_vector_embeddings WHERE rowid IN ("
            f"SELECT id FROM search_vector_chunks {scoped_orphans})"
        )

    # Postgres CASCADE handles embedding deletion automatically
    statements.append(f"DELETE FROM search_vector_chunks {scoped_orphans}")

    for sql in statements:
        await self.repository.execute_query(text(sql), bind_params)

    logger.info("Purged stale search rows for deleted entities", project_id=project_id)
472+
422473
async def index_entity_file(
423474
self,
424475
entity: Entity,

tests/services/test_project_service_embedding_status.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,47 @@ async def test_embedding_status_healthy(project_service: ProjectService, test_gr
251251
assert status.reindex_reason is None
252252

253253

254+
@pytest.mark.asyncio
async def test_embedding_status_excludes_stale_entity_ids(
    project_service: ProjectService, test_graph, test_project
):
    """Stale rows in search_index for deleted entities should not inflate counts.

    Regression test for #670: after reindex, project info reported missing embeddings
    because stale entity_ids in search_index/search_vector_chunks inflated total_indexed_entities.

    The status is sampled before and after inserting a stale row so the test fails
    whenever the stale row changes the count, even if some real entities are not
    indexed (where a bare upper-bound check would still pass).
    """

    async def _embedding_status():
        # Each call builds a fresh patch so semantic search is enabled only
        # for the duration of the status query itself.
        with patch.object(
            type(project_service),
            "config_manager",
            new_callable=lambda: property(
                lambda self: _config_manager_with(semantic_search_enabled=True)
            ),
        ):
            return await project_service.get_embedding_status(test_project.id)

    # Baseline: the count reported before any stale row exists
    baseline = await _embedding_status()

    # Insert a stale search_index row for an entity_id that doesn't exist in the entity table
    stale_entity_id = 999999
    await project_service.repository.execute_query(
        text(
            "INSERT INTO search_index "
            "(entity_id, project_id, type, title, permalink, content_stems, "
            "content_snippet, file_path, metadata) "
            "VALUES (:eid, :pid, 'entity', 'Stale Note', 'stale-note', "
            "'stale content', 'stale snippet', 'stale.md', '{}')"
        ),
        {"eid": stale_entity_id, "pid": test_project.id},
    )

    status = await _embedding_status()

    # The stale entity_id should NOT be counted in total_indexed_entities
    assert status.total_indexed_entities == baseline.total_indexed_entities

    # Original upper bound retained: never more indexed entities than real ones
    real_entity_result = await project_service.repository.execute_query(
        text("SELECT COUNT(*) FROM entity WHERE project_id = :pid"),
        {"pid": test_project.id},
    )
    real_entity_count = real_entity_result.scalar() or 0

    assert status.total_indexed_entities <= real_entity_count
293+
294+
254295
@pytest.mark.asyncio
255296
async def test_get_project_info_includes_embedding_status(
256297
project_service: ProjectService, test_graph, test_project

0 commit comments

Comments
 (0)