Skip to content

Commit c8b0044

Browse files
phernandez and claude authored
fix(core): exclude stale entity rows from embedding coverage stats (#675)
Signed-off-by: phernandez <paul@basicmachines.co> Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 013864e commit c8b0044

3 files changed

Lines changed: 119 additions & 7 deletions

File tree

src/basic_memory/services/project_service.py

Lines changed: 19 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -997,25 +997,37 @@ async def get_embedding_status(self, project_id: int) -> EmbeddingStatus:
997997
)
998998

999999
# --- Count queries (tables exist) ---
1000+
# Filter by entity existence to exclude stale rows from deleted entities
1001+
# that remain in derived search tables (search_index, search_vector_chunks)
1002+
entity_exists = "AND entity_id IN (SELECT id FROM entity WHERE project_id = :project_id)"
1003+
# Same filter for aliased chunks table (used in JOIN queries below)
1004+
chunk_entity_exists = (
1005+
"AND c.entity_id IN (SELECT id FROM entity WHERE project_id = :project_id)"
1006+
)
1007+
10001008
si_result = await self.repository.execute_query(
10011009
text(
1002-
"SELECT COUNT(DISTINCT entity_id) FROM search_index WHERE project_id = :project_id"
1010+
"SELECT COUNT(DISTINCT entity_id) FROM search_index "
1011+
f"WHERE project_id = :project_id {entity_exists}"
10031012
),
10041013
{"project_id": project_id},
10051014
)
10061015
total_indexed_entities = si_result.scalar() or 0
10071016

10081017
try:
10091018
chunks_result = await self.repository.execute_query(
1010-
text("SELECT COUNT(*) FROM search_vector_chunks WHERE project_id = :project_id"),
1019+
text(
1020+
"SELECT COUNT(*) FROM search_vector_chunks "
1021+
f"WHERE project_id = :project_id {entity_exists}"
1022+
),
10111023
{"project_id": project_id},
10121024
)
10131025
total_chunks = chunks_result.scalar() or 0
10141026

10151027
entities_with_chunks_result = await self.repository.execute_query(
10161028
text(
10171029
"SELECT COUNT(DISTINCT entity_id) FROM search_vector_chunks "
1018-
"WHERE project_id = :project_id"
1030+
f"WHERE project_id = :project_id {entity_exists}"
10191031
),
10201032
{"project_id": project_id},
10211033
)
@@ -1026,13 +1038,13 @@ async def get_embedding_status(self, project_id: int) -> EmbeddingStatus:
10261038
embeddings_sql = text(
10271039
"SELECT COUNT(*) FROM search_vector_chunks c "
10281040
"JOIN search_vector_embeddings e ON e.chunk_id = c.id "
1029-
"WHERE c.project_id = :project_id"
1041+
f"WHERE c.project_id = :project_id {chunk_entity_exists}"
10301042
)
10311043
else:
10321044
embeddings_sql = text(
10331045
"SELECT COUNT(*) FROM search_vector_chunks c "
10341046
"JOIN search_vector_embeddings e ON e.rowid = c.id "
1035-
"WHERE c.project_id = :project_id"
1047+
f"WHERE c.project_id = :project_id {chunk_entity_exists}"
10361048
)
10371049

10381050
embeddings_result = await self.repository.execute_query(
@@ -1045,13 +1057,13 @@ async def get_embedding_status(self, project_id: int) -> EmbeddingStatus:
10451057
orphan_sql = text(
10461058
"SELECT COUNT(*) FROM search_vector_chunks c "
10471059
"LEFT JOIN search_vector_embeddings e ON e.chunk_id = c.id "
1048-
"WHERE c.project_id = :project_id AND e.chunk_id IS NULL"
1060+
f"WHERE c.project_id = :project_id AND e.chunk_id IS NULL {chunk_entity_exists}"
10491061
)
10501062
else:
10511063
orphan_sql = text(
10521064
"SELECT COUNT(*) FROM search_vector_chunks c "
10531065
"LEFT JOIN search_vector_embeddings e ON e.rowid = c.id "
1054-
"WHERE c.project_id = :project_id AND e.rowid IS NULL"
1066+
f"WHERE c.project_id = :project_id AND e.rowid IS NULL {chunk_entity_exists}"
10551067
)
10561068

10571069
orphan_result = await self.repository.execute_query(

src/basic_memory/services/search_service.py

Lines changed: 51 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -403,6 +403,11 @@ async def reindex_vectors(self, progress_callback=None) -> dict:
403403
"""
404404
entities = await self.entity_repository.find_all()
405405
entity_ids = [entity.id for entity in entities]
406+
407+
# Clean up stale rows in search_index and search_vector_chunks
408+
# that reference entity_ids no longer in the entity table
409+
await self._purge_stale_search_rows()
410+
406411
batch_result = await self.repository.sync_entity_vectors_batch(
407412
entity_ids,
408413
progress_callback=progress_callback,
@@ -419,6 +424,52 @@ async def reindex_vectors(self, progress_callback=None) -> dict:
419424

420425
return stats
421426

427+
async def _purge_stale_search_rows(self) -> None:
    """Delete derived search rows whose entity is gone from this project.

    Trigger: an entity was deleted but rows keyed by its entity_id are still
        present in search_index / search_vector_chunks.
    Why: those leftover rows inflate the embedding coverage stats shown in
        project info.
    Outcome: both search tables only hold rows whose entity_id is still found
        in the entity table for this repository's project.
    """
    from basic_memory.repository.sqlite_search_repository import SQLiteSearchRepository
    from sqlalchemy import text

    search_repo = self.repository
    bind = {"project_id": search_repo.project_id}

    # One WHERE clause shared by every statement: rows scoped to this project
    # whose entity_id is absent from the project's entity rows.
    stale_where = (
        "WHERE project_id = :project_id "
        "AND entity_id NOT IN (SELECT id FROM entity WHERE project_id = :project_id)"
    )

    # Statements run strictly in list order.
    statements = [f"DELETE FROM search_index {stale_where}"]

    if isinstance(search_repo, SQLiteSearchRepository):
        # SQLite vec has no CASCADE, so the embeddings that belong to stale
        # chunks have to be removed before the chunks themselves.
        statements.append(
            "DELETE FROM search_vector_embeddings WHERE rowid IN ("
            f"SELECT id FROM search_vector_chunks {stale_where})"
        )

    # On Postgres, CASCADE removes the embeddings together with the chunks.
    statements.append(f"DELETE FROM search_vector_chunks {stale_where}")

    for sql in statements:
        await search_repo.execute_query(text(sql), bind)

    logger.info(
        "Purged stale search rows for deleted entities",
        project_id=bind["project_id"],
    )
472+
422473
async def index_entity_file(
423474
self,
424475
entity: Entity,

tests/services/test_project_service_embedding_status.py

Lines changed: 49 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -251,6 +251,55 @@ async def test_embedding_status_healthy(project_service: ProjectService, test_gr
251251
assert status.reindex_reason is None
252252

253253

254+
@pytest.mark.asyncio
async def test_embedding_status_excludes_stale_entity_ids(
    project_service: ProjectService, test_graph, test_project
):
    """A search_index row for a non-existent entity must not be counted.

    Regression test for #670: after reindex, project info reported missing
    embeddings because stale entity_ids in search_index/search_vector_chunks
    inflated total_indexed_entities.
    """
    # An entity_id with no matching row in the entity table.
    orphan_id = 999999

    # The 'id' column is included because it is required NOT NULL on Postgres
    # (regular table); on SQLite (FTS5 virtual table where id is UNINDEXED) it
    # is ignored.
    insert_stale_row = text(
        "INSERT INTO search_index "
        "(id, entity_id, project_id, type, title, permalink, content_stems, "
        "content_snippet, file_path, metadata) "
        "VALUES (:id, :eid, :pid, 'entity', 'Stale Note', 'stale-note', "
        "'stale content', 'stale snippet', 'stale.md', '{}')"
    )
    await project_service.repository.execute_query(
        insert_stale_row,
        {"id": orphan_id, "eid": orphan_id, "pid": test_project.id},
    )

    # Swap config_manager for a property that reports semantic search enabled
    # while the status is computed.
    def _semantic_enabled_property():
        return property(lambda self: _config_manager_with(semantic_search_enabled=True))

    with patch.object(
        type(project_service),
        "config_manager",
        new_callable=_semantic_enabled_property,
    ):
        status = await project_service.get_embedding_status(test_project.id)

    # Reference count: distinct indexed entity_ids that still join to a real
    # entity row, which leaves the stale one out.
    count_live_indexed = text(
        "SELECT COUNT(DISTINCT si.entity_id) FROM search_index si "
        "JOIN entity e ON e.id = si.entity_id "
        "WHERE si.project_id = :pid"
    )
    live_result = await project_service.repository.execute_query(
        count_live_indexed,
        {"pid": test_project.id},
    )
    expected_indexed = live_result.scalar() or 0

    # Exact equality: the stale entity_id must not inflate the reported total.
    assert status.total_indexed_entities == expected_indexed
301+
302+
254303
@pytest.mark.asyncio
255304
async def test_get_project_info_includes_embedding_status(
256305
project_service: ProjectService, test_graph, test_project

0 commit comments

Comments
 (0)