Skip to content

Commit b09eca1

Browse files
phernandez and claude committed
feat: add EmbeddingStatus schema and get_embedding_status() service method
Add EmbeddingStatus model to project_info schemas and wire it into ProjectInfoResponse. ProjectService.get_embedding_status() queries vector tables for chunk/embedding counts, detects orphaned chunks and missing embeddings, and recommends reindex when appropriate. Handles both SQLite and Postgres backends.

🔍 Includes 6 unit tests covering: disabled search, missing vector tables, entities without chunks, orphaned chunks, healthy state, and integration with get_project_info().

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: phernandez <paul@basicmachines.co>
1 parent 3004d0d commit b09eca1

4 files changed

Lines changed: 458 additions & 0 deletions

File tree

src/basic_memory/schemas/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
ProjectStatistics,
4242
ActivityMetrics,
4343
SystemStatus,
44+
EmbeddingStatus,
4445
ProjectInfoResponse,
4546
)
4647

@@ -78,6 +79,7 @@
7879
"ProjectStatistics",
7980
"ActivityMetrics",
8081
"SystemStatus",
82+
"EmbeddingStatus",
8183
"ProjectInfoResponse",
8284
# Directory
8385
"DirectoryNode",

src/basic_memory/schemas/project_info.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,28 @@ class SystemStatus(BaseModel):
7979
timestamp: datetime = Field(description="Timestamp when the information was collected")
8080

8181

82+
class EmbeddingStatus(BaseModel):
    """Embedding/vector index status for a project.

    Groups three kinds of information: the semantic-search configuration,
    raw counts describing the vector index, and a derived recommendation on
    whether the embeddings should be rebuilt.
    """

    # Config
    semantic_search_enabled: bool = Field(
        description="Whether semantic (vector) search is enabled in the configuration"
    )
    embedding_provider: Optional[str] = Field(
        default=None, description="Configured embedding provider"
    )
    embedding_model: Optional[str] = Field(
        default=None, description="Configured embedding model"
    )
    embedding_dimensions: Optional[int] = Field(
        default=None, description="Configured embedding vector dimensions"
    )

    # Counts
    total_indexed_entities: int = Field(
        default=0, description="Number of distinct entities in the search index"
    )
    total_entities_with_chunks: int = Field(
        default=0, description="Number of distinct entities that have vector chunks"
    )
    total_chunks: int = Field(default=0, description="Number of vector chunks")
    total_embeddings: int = Field(
        default=0, description="Number of chunks that have a stored embedding"
    )
    orphaned_chunks: int = Field(
        default=0, description="Number of chunks that have no stored embedding"
    )
    vector_tables_exist: bool = Field(
        default=False, description="Whether the vector tables exist in the database"
    )

    # Derived
    reindex_recommended: bool = Field(
        default=False, description="Whether rebuilding the embeddings is recommended"
    )
    reindex_reason: Optional[str] = Field(
        default=None, description="Human-readable reason for the reindex recommendation"
    )
102+
103+
82104
class ProjectInfoResponse(BaseModel):
83105
"""Response for the project_info tool."""
84106

@@ -99,6 +121,11 @@ class ProjectInfoResponse(BaseModel):
99121
# System status
100122
system: SystemStatus = Field(description="System and service status information")
101123

124+
# Embedding status
125+
embedding_status: Optional[EmbeddingStatus] = Field(
126+
default=None, description="Embedding/vector index status"
127+
)
128+
102129

103130
class ProjectInfoRequest(BaseModel):
104131
"""Request model for switching projects."""

src/basic_memory/services/project_service.py

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from basic_memory.repository.project_repository import ProjectRepository
1717
from basic_memory.schemas import (
1818
ActivityMetrics,
19+
EmbeddingStatus,
1920
ProjectInfoResponse,
2021
ProjectStatistics,
2122
SystemStatus,
@@ -597,6 +598,9 @@ async def get_project_info(self, project_name: Optional[str] = None) -> ProjectI
597598
# Get activity metrics for the specified project
598599
activity = await self.get_activity_metrics(db_project.id)
599600

601+
# Get embedding status for the specified project
602+
embedding_status = await self.get_embedding_status(db_project.id)
603+
600604
# Get system status
601605
system = self.get_system_status()
602606

@@ -650,6 +654,7 @@ async def get_project_info(self, project_name: Optional[str] = None) -> ProjectI
650654
statistics=statistics,
651655
activity=activity,
652656
system=system,
657+
embedding_status=embedding_status,
653658
)
654659

655660
async def get_statistics(self, project_id: int) -> ProjectStatistics:
@@ -918,6 +923,163 @@ async def get_activity_metrics(self, project_id: int) -> ActivityMetrics:
918923
monthly_growth=monthly_growth,
919924
)
920925

926+
async def get_embedding_status(self, project_id: int) -> EmbeddingStatus:
    """Get embedding/vector index status for the specified project.

    Reports the embedding configuration, chunk/embedding counts, and whether
    a reindex is recommended.

    Args:
        project_id: ID of the project whose search/vector rows are counted.

    Returns:
        An EmbeddingStatus. When semantic search is disabled, only
        ``semantic_search_enabled=False`` is set. When the vector tables are
        missing, the vector counts keep their defaults and a reindex is
        recommended.
    """
    config = self.config_manager.config

    # When semantic search is disabled, return minimal status
    if not config.semantic_search_enabled:
        return EmbeddingStatus(semantic_search_enabled=False)

    is_postgres = config.database_backend == DatabaseBackend.POSTGRES
    project_params = {"project_id": project_id}

    async def scalar_count(sql: str, params: dict) -> int:
        """Run a single-value COUNT query and return it as an int (NULL -> 0)."""
        result = await self.repository.execute_query(text(sql), params)
        return result.scalar() or 0

    # --- Check vector table existence ---
    # Both tables are required: the count queries below join
    # search_vector_chunks against search_vector_embeddings, so a schema with
    # only one of them must be reported as "not initialized" instead of
    # letting those queries fail.
    if is_postgres:
        table_check_sql = (
            "SELECT COUNT(DISTINCT table_name) FROM information_schema.tables "
            "WHERE table_name IN ('search_vector_chunks', 'search_vector_embeddings')"
        )
    else:
        table_check_sql = (
            "SELECT COUNT(DISTINCT name) FROM sqlite_master "
            "WHERE type = 'table' "
            "AND name IN ('search_vector_chunks', 'search_vector_embeddings')"
        )
    vector_tables_exist = await scalar_count(table_check_sql, {}) >= 2

    # Distinct entities in the search index; needed by both outcomes below.
    total_indexed_entities = await scalar_count(
        "SELECT COUNT(DISTINCT entity_id) FROM search_index "
        "WHERE project_id = :project_id",
        project_params,
    )

    if not vector_tables_exist:
        return EmbeddingStatus(
            semantic_search_enabled=True,
            embedding_provider=config.semantic_embedding_provider,
            embedding_model=config.semantic_embedding_model,
            embedding_dimensions=config.semantic_embedding_dimensions,
            total_indexed_entities=total_indexed_entities,
            vector_tables_exist=False,
            reindex_recommended=True,
            reindex_reason=(
                "Vector tables not initialized — run: bm reindex --embeddings"
            ),
        )

    # --- Count queries (tables exist) ---
    total_chunks = await scalar_count(
        "SELECT COUNT(*) FROM search_vector_chunks WHERE project_id = :project_id",
        project_params,
    )

    total_entities_with_chunks = await scalar_count(
        "SELECT COUNT(DISTINCT entity_id) FROM search_vector_chunks "
        "WHERE project_id = :project_id",
        project_params,
    )

    # The embeddings table is keyed differently per backend:
    # chunk_id on Postgres, rowid on SQLite.
    embedding_key = "chunk_id" if is_postgres else "rowid"

    total_embeddings = await scalar_count(
        "SELECT COUNT(*) FROM search_vector_chunks c "
        f"JOIN search_vector_embeddings e ON e.{embedding_key} = c.id "
        "WHERE c.project_id = :project_id",
        project_params,
    )

    # Orphaned chunks (chunks without embeddings — indicates interrupted indexing)
    orphaned_chunks = await scalar_count(
        "SELECT COUNT(*) FROM search_vector_chunks c "
        f"LEFT JOIN search_vector_embeddings e ON e.{embedding_key} = c.id "
        f"WHERE c.project_id = :project_id AND e.{embedding_key} IS NULL",
        project_params,
    )

    # --- Reindex recommendation logic (priority order) ---
    reindex_recommended = False
    reindex_reason = None

    if total_indexed_entities > 0 and total_chunks == 0:
        reindex_recommended = True
        reindex_reason = (
            "Embeddings have never been built — run: bm reindex --embeddings"
        )
    elif orphaned_chunks > 0:
        reindex_recommended = True
        reindex_reason = (
            f"{orphaned_chunks} orphaned chunks found (interrupted indexing) "
            "— run: bm reindex --embeddings"
        )
    elif total_indexed_entities > total_entities_with_chunks:
        missing = total_indexed_entities - total_entities_with_chunks
        reindex_recommended = True
        reindex_reason = (
            f"{missing} entities missing embeddings — run: bm reindex --embeddings"
        )

    return EmbeddingStatus(
        semantic_search_enabled=True,
        embedding_provider=config.semantic_embedding_provider,
        embedding_model=config.semantic_embedding_model,
        embedding_dimensions=config.semantic_embedding_dimensions,
        total_indexed_entities=total_indexed_entities,
        total_entities_with_chunks=total_entities_with_chunks,
        total_chunks=total_chunks,
        total_embeddings=total_embeddings,
        orphaned_chunks=orphaned_chunks,
        vector_tables_exist=True,
        reindex_recommended=reindex_recommended,
        reindex_reason=reindex_reason,
    )
1082+
9211083
def get_system_status(self) -> SystemStatus:
9221084
"""Get system status information."""
9231085
import basic_memory

0 commit comments

Comments
 (0)