fix(core): exclude stale entity rows from embedding coverage stats (#675)

phernandez · claude · web-flow · commit c8b00449d279 · 2026-03-15T18:45:36.000-05:00
Signed-off-by: phernandez <paul@basicmachines.co>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/src/basic_memory/services/project_service.py b/src/basic_memory/services/project_service.py
@@ -997,25 +997,37 @@ async def get_embedding_status(self, project_id: int) -> EmbeddingStatus:
             )
 
         # --- Count queries (tables exist) ---
+        # Filter by entity existence to exclude stale rows from deleted entities
+        # that remain in derived search tables (search_index, search_vector_chunks)
+        entity_exists = "AND entity_id IN (SELECT id FROM entity WHERE project_id = :project_id)"
+        # Same filter for aliased chunks table (used in JOIN queries below)
+        chunk_entity_exists = (
+            "AND c.entity_id IN (SELECT id FROM entity WHERE project_id = :project_id)"
+        )
+
         si_result = await self.repository.execute_query(
             text(
-                "SELECT COUNT(DISTINCT entity_id) FROM search_index WHERE project_id = :project_id"
+                "SELECT COUNT(DISTINCT entity_id) FROM search_index "
+                f"WHERE project_id = :project_id {entity_exists}"
             ),
             {"project_id": project_id},
         )
         total_indexed_entities = si_result.scalar() or 0
 
         try:
             chunks_result = await self.repository.execute_query(
-                text("SELECT COUNT(*) FROM search_vector_chunks WHERE project_id = :project_id"),
+                text(
+                    "SELECT COUNT(*) FROM search_vector_chunks "
+                    f"WHERE project_id = :project_id {entity_exists}"
+                ),
                 {"project_id": project_id},
             )
             total_chunks = chunks_result.scalar() or 0
 
             entities_with_chunks_result = await self.repository.execute_query(
                 text(
                     "SELECT COUNT(DISTINCT entity_id) FROM search_vector_chunks "
-                    "WHERE project_id = :project_id"
+                    f"WHERE project_id = :project_id {entity_exists}"
                 ),
                 {"project_id": project_id},
             )
@@ -1026,13 +1038,13 @@ async def get_embedding_status(self, project_id: int) -> EmbeddingStatus:
                 embeddings_sql = text(
                     "SELECT COUNT(*) FROM search_vector_chunks c "
                     "JOIN search_vector_embeddings e ON e.chunk_id = c.id "
-                    "WHERE c.project_id = :project_id"
+                    f"WHERE c.project_id = :project_id {chunk_entity_exists}"
                 )
             else:
                 embeddings_sql = text(
                     "SELECT COUNT(*) FROM search_vector_chunks c "
                     "JOIN search_vector_embeddings e ON e.rowid = c.id "
-                    "WHERE c.project_id = :project_id"
+                    f"WHERE c.project_id = :project_id {chunk_entity_exists}"
                 )
 
             embeddings_result = await self.repository.execute_query(
@@ -1045,13 +1057,13 @@ async def get_embedding_status(self, project_id: int) -> EmbeddingStatus:
                 orphan_sql = text(
                     "SELECT COUNT(*) FROM search_vector_chunks c "
                     "LEFT JOIN search_vector_embeddings e ON e.chunk_id = c.id "
-                    "WHERE c.project_id = :project_id AND e.chunk_id IS NULL"
+                    f"WHERE c.project_id = :project_id AND e.chunk_id IS NULL {chunk_entity_exists}"
                 )
             else:
                 orphan_sql = text(
                     "SELECT COUNT(*) FROM search_vector_chunks c "
                     "LEFT JOIN search_vector_embeddings e ON e.rowid = c.id "
-                    "WHERE c.project_id = :project_id AND e.rowid IS NULL"
+                    f"WHERE c.project_id = :project_id AND e.rowid IS NULL {chunk_entity_exists}"
                 )
 
             orphan_result = await self.repository.execute_query(
diff --git a/src/basic_memory/services/search_service.py b/src/basic_memory/services/search_service.py
@@ -403,6 +403,11 @@ async def reindex_vectors(self, progress_callback=None) -> dict:
         """
         entities = await self.entity_repository.find_all()
         entity_ids = [entity.id for entity in entities]
+
+        # Clean up stale rows in search_index and search_vector_chunks
+        # that reference entity_ids no longer in the entity table
+        await self._purge_stale_search_rows()
+
         batch_result = await self.repository.sync_entity_vectors_batch(
             entity_ids,
             progress_callback=progress_callback,
@@ -419,6 +424,52 @@ async def reindex_vectors(self, progress_callback=None) -> dict:
 
         return stats
 
+    async def _purge_stale_search_rows(self) -> None:
+        """Remove rows from search_index and search_vector_chunks for deleted entities.
+
+        Trigger: entities are deleted but their derived search rows remain
+        Why: stale rows inflate embedding coverage stats in project info
+        Outcome: search tables (and SQLite embeddings) only hold rows for existing entities
+        """
+        from basic_memory.repository.sqlite_search_repository import SQLiteSearchRepository
+        from sqlalchemy import text
+
+        project_id = self.repository.project_id
+        stale_entity_filter = (
+            "entity_id NOT IN (SELECT id FROM entity WHERE project_id = :project_id)"
+        )
+        params = {"project_id": project_id}
+
+        # Delete search_index rows whose entity_id is no longer in this project's entity table
+        await self.repository.execute_query(
+            text(
+                f"DELETE FROM search_index WHERE project_id = :project_id AND {stale_entity_filter}"
+            ),
+            params,
+        )
+
+        # SQLite vec has no CASCADE — must delete embeddings (keyed by rowid) before chunks
+        if isinstance(self.repository, SQLiteSearchRepository):
+            await self.repository.execute_query(
+                text(
+                    "DELETE FROM search_vector_embeddings WHERE rowid IN ("
+                    "SELECT id FROM search_vector_chunks "
+                    f"WHERE project_id = :project_id AND {stale_entity_filter})"
+                ),
+                params,
+            )
+
+        # Runs on every backend; Postgres CASCADE handles embedding deletion automatically
+        await self.repository.execute_query(
+            text(
+                "DELETE FROM search_vector_chunks "
+                f"WHERE project_id = :project_id AND {stale_entity_filter}"
+            ),
+            params,
+        )
+
+        logger.info("Purged stale search rows for deleted entities", project_id=project_id)
+
     async def index_entity_file(
         self,
         entity: Entity,
diff --git a/tests/services/test_project_service_embedding_status.py b/tests/services/test_project_service_embedding_status.py
@@ -251,6 +251,55 @@ async def test_embedding_status_healthy(project_service: ProjectService, test_gr
     assert status.reindex_reason is None
 
 
+@pytest.mark.asyncio
+async def test_embedding_status_excludes_stale_entity_ids(
+    project_service: ProjectService, test_graph, test_project
+):
+    """Stale rows in search_index for deleted entities should not inflate counts.
+
+    Regression test for #670: after reindex, project info reported missing embeddings
+    because stale entity_ids in search_index/search_vector_chunks inflated total_indexed_entities.
+    """
+    # Insert a stale search_index row for an entity_id that doesn't exist in the entity table.
+    # Include 'id' column — required NOT NULL on Postgres (regular table),
+    # ignored on SQLite (FTS5 virtual table where id is UNINDEXED).
+    stale_entity_id = 999999
+    await project_service.repository.execute_query(
+        text(
+            "INSERT INTO search_index "
+            "(id, entity_id, project_id, type, title, permalink, content_stems, "
+            "content_snippet, file_path, metadata) "
+            "VALUES (:id, :eid, :pid, 'entity', 'Stale Note', 'stale-note', "
+            "'stale content', 'stale snippet', 'stale.md', '{}')"
+        ),
+        {"id": stale_entity_id, "eid": stale_entity_id, "pid": test_project.id},
+    )
+
+    with patch.object(
+        type(project_service),
+        "config_manager",
+        new_callable=lambda: property(
+            lambda self: _config_manager_with(semantic_search_enabled=True)
+        ),
+    ):
+        status = await project_service.get_embedding_status(test_project.id)
+
+    # Baseline: count distinct search_index entity_ids that still join to an entity row.
+    # The inner JOIN drops the stale id inserted above, since no entity row matches it.
+    real_indexed_result = await project_service.repository.execute_query(
+        text(
+            "SELECT COUNT(DISTINCT si.entity_id) FROM search_index si "
+            "JOIN entity e ON e.id = si.entity_id "
+            "WHERE si.project_id = :pid"
+        ),
+        {"pid": test_project.id},
+    )
+    real_indexed_count = real_indexed_result.scalar() or 0
+
+    # Exact match — the reported total must equal the baseline, so the stale id is not counted
+    assert status.total_indexed_entities == real_indexed_count
+
+
 @pytest.mark.asyncio
 async def test_get_project_info_includes_embedding_status(
     project_service: ProjectService, test_graph, test_project