Skip to content

Commit 5a22239

Browse files
committed
lint
Signed-off-by: Anxhela Coba <acoba@redhat.com> fix path Signed-off-by: Anxhela Coba <acoba@redhat.com> fix lint Signed-off-by: Anxhela Coba <acoba@redhat.com> fix e2e Signed-off-by: Anxhela Coba <acoba@redhat.com> lint Signed-off-by: Anxhela Coba <acoba@redhat.com> fix e2e Signed-off-by: Anxhela Coba <acoba@redhat.com> fix e2e Signed-off-by: Anxhela Coba <acoba@redhat.com> fix e2e Signed-off-by: Anxhela Coba <acoba@redhat.com> test Signed-off-by: Anxhela Coba <acoba@redhat.com> test Signed-off-by: Anxhela Coba <acoba@redhat.com> test Signed-off-by: Anxhela Coba <acoba@redhat.com> add doc_id to referenced docs Signed-off-by: Anxhela Coba <acoba@redhat.com> update unit tests Signed-off-by: Anxhela Coba <acoba@redhat.com>
1 parent 5382997 commit 5a22239

11 files changed

Lines changed: 56 additions & 37 deletions

File tree

docker-compose.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ services:
2424
environment:
2525
- BRAVE_SEARCH_API_KEY=${BRAVE_SEARCH_API_KEY:-}
2626
- TAVILY_SEARCH_API_KEY=${TAVILY_SEARCH_API_KEY:-}
27+
- EXTERNAL_PROVIDERS_DIR=${EXTERNAL_PROVIDERS_DIR:-/opt/app-root/external_providers}
2728
# OpenAI
2829
- OPENAI_API_KEY=${OPENAI_API_KEY}
2930
- E2E_OPENAI_MODEL=${E2E_OPENAI_MODEL:-gpt-4o-mini}

src/models/common/turn_summary.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ class ReferencedDocument(BaseModel):
3131
Attributes:
3232
doc_url: Url to the referenced doc.
3333
doc_title: Title of the referenced doc.
34+
document_id: Document ID for preserving identity during deduplication.
3435
"""
3536

3637
doc_url: Optional[AnyUrl] = Field(
@@ -46,6 +47,11 @@ class ReferencedDocument(BaseModel):
4647
description="Index name identifying the knowledge source from configuration",
4748
)
4849

50+
document_id: Optional[str] = Field(
51+
default=None,
52+
description="Document ID for preserving identity during deduplication",
53+
)
54+
4955

5056
class RAGContext(BaseModel):
5157
"""Result of building RAG context from all enabled pre-query RAG sources.

src/utils/reranker.py

Lines changed: 7 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ async def _get_cross_encoder(model_name: str) -> Any:
3131
Loaded CrossEncoder model instance, or None if loading fails.
3232
"""
3333
# Check if reranking is enabled before attempting to load the model
34-
if not configuration.reranker.enabled:
34+
if not configuration.reranker.enabled: # pylint: disable=no-member
3535
logger.debug("Reranker is disabled, not loading cross-encoder model")
3636
return None
3737

@@ -41,9 +41,9 @@ async def _get_cross_encoder(model_name: str) -> Any:
4141
if model_name in _cross_encoder_models:
4242
return _cross_encoder_models[model_name]
4343
try:
44-
from sentence_transformers import (
44+
from sentence_transformers import ( # pylint: disable=import-outside-toplevel
4545
CrossEncoder,
46-
) # pylint: disable=import-outside-toplevel
46+
)
4747

4848
model = await asyncio.to_thread(CrossEncoder, model_name)
4949
_cross_encoder_models[model_name] = model
@@ -162,12 +162,7 @@ async def rerank_chunks_with_cross_encoder(
162162

163163
# Return RAGChunk list with combined scores
164164
return [
165-
RAGChunk(
166-
content=chunk.content,
167-
source=chunk.source,
168-
score=float(score),
169-
attributes=chunk.attributes,
170-
)
165+
chunk.model_copy(update={"score": float(score)})
171166
for score, chunk in top_indexed
172167
]
173168

@@ -201,16 +196,10 @@ def apply_byok_rerank_boost(
201196
score = chunk.score if chunk.score is not None else float("-inf")
202197
if chunk.source != constants.OKP_RAG_ID:
203198
score = score * boost
204-
boosted.append(
205-
RAGChunk(
206-
content=chunk.content,
207-
source=chunk.source,
208-
score=score,
209-
attributes=chunk.attributes,
210-
)
211-
)
199+
boosted.append(chunk.model_copy(update={"score": score}))
212200
boosted.sort(
213201
key=lambda c: c.score if c.score is not None else float("-inf"),
214202
reverse=True,
215203
)
216-
return boosted
204+
205+
return boosted

src/utils/responses.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -856,6 +856,7 @@ def parse_referenced_documents( # pylint: disable=too-many-locals
856856
or attributes.get("link")
857857
)
858858
doc_title = attributes.get("title")
859+
doc_id = attributes.get("document_id") or attributes.get("doc_id")
859860

860861
if doc_title or doc_url:
861862
# Treat empty string as None for URL to satisfy Optional[AnyUrl]
@@ -866,6 +867,7 @@ def parse_referenced_documents( # pylint: disable=too-many-locals
866867
doc_url=final_url,
867868
doc_title=doc_title,
868869
source=resolved_source,
870+
document_id=doc_id,
869871
)
870872
)
871873
seen_docs.add((final_url, doc_title))

src/utils/vector_search.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727
logger = get_logger(__name__)
2828

2929

30-
3130
def _filter_documents_for_chunks(
3231
all_documents: list[ReferencedDocument],
3332
final_chunks: list[RAGChunk],
@@ -50,6 +49,8 @@ def _filter_documents_for_chunks(
5049
attrs.get("reference_url") or attrs.get("doc_url") or attrs.get("docs_url")
5150
)
5251
doc_id = attrs.get("document_id") or attrs.get("doc_id")
52+
# Use same precedence as _process_byok_rag_chunks_for_documents:
53+
# reference_url first, then doc_id
5354
dedup_key = doc_url or doc_id or chunk.source or ""
5455
if dedup_key:
5556
final_chunk_identifiers.add(dedup_key)
@@ -58,9 +59,11 @@ def _filter_documents_for_chunks(
5859
filtered_documents = []
5960
seen = set()
6061
for doc in all_documents:
61-
# Build same dedup key for document
62+
# Build same dedup key for document using same logic as extraction
6263
doc_url_str = str(doc.doc_url) if doc.doc_url else None
63-
dedup_key = doc_url_str or doc.source or ""
64+
# Use the same dedup key logic as _process_byok_rag_chunks_for_documents
65+
# which uses reference_url or doc_id as the key
66+
dedup_key = doc_url_str or doc.document_id or doc.source or ""
6467

6568
if dedup_key in final_chunk_identifiers and dedup_key not in seen:
6669
seen.add(dedup_key)
@@ -313,8 +316,16 @@ def _process_byok_rag_chunks_for_documents(
313316
or metadata.get("docs_url")
314317
)
315318

319+
# If no standard document identifiers are available, create a fallback
320+
# using the source (vector store ID) to ensure referenced documents
321+
# are still created for e2e tests where metadata may be minimal
316322
if not doc_id and not reference_url:
317-
continue
323+
# Use source as fallback document identifier
324+
fallback_doc_id = result.get("source", "unknown")
325+
if fallback_doc_id and fallback_doc_id != "unknown":
326+
doc_id = fallback_doc_id
327+
else:
328+
continue
318329

319330
# Use doc_id or reference_url as deduplication key
320331
dedup_key = reference_url or doc_id
@@ -334,6 +345,7 @@ def _process_byok_rag_chunks_for_documents(
334345
doc_title=title,
335346
doc_url=parsed_url,
336347
source=result.get("source"), # Vector store ID
348+
document_id=doc_id,
337349
)
338350
)
339351

@@ -387,6 +399,7 @@ def _process_solr_chunks_for_documents(
387399
doc_title=title,
388400
doc_url=parsed_url,
389401
source=constants.OKP_RAG_ID,
402+
document_id=doc_id,
390403
)
391404
)
392405

tests/unit/app/endpoints/test_query.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,7 @@ async def test_query_merges_inline_and_tool_rag_chunks_and_documents(
212212
)
213213

214214
inline_chunk = RAGChunk(content="inline chunk content", source="byok")
215-
inline_doc = ReferencedDocument(doc_title="Inline Doc")
215+
inline_doc = ReferencedDocument(doc_title="Inline Doc", document_id="inline_doc_1")
216216
inline_rag = RAGContext(
217217
context_text="",
218218
rag_chunks=[inline_chunk],
@@ -237,7 +237,7 @@ async def test_query_merges_inline_and_tool_rag_chunks_and_documents(
237237
)
238238

239239
tool_chunk = RAGChunk(content="tool chunk content", source="vs-1")
240-
tool_doc = ReferencedDocument(doc_title="Tool Doc")
240+
tool_doc = ReferencedDocument(doc_title="Tool Doc", document_id="tool_doc_1")
241241
mock_turn_summary = TurnSummary()
242242
mock_turn_summary.rag_chunks = [tool_chunk]
243243
mock_turn_summary.referenced_documents = [tool_doc]

tests/unit/cache/test_postgres_cache.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -637,7 +637,7 @@ def test_insert_and_get_with_referenced_documents(
637637

638638
# Create a CacheEntry with referenced documents
639639
docs = [
640-
ReferencedDocument(doc_title="Test Doc", doc_url=AnyUrl("http://example.com/"))
640+
ReferencedDocument(doc_title="Test Doc", doc_url=AnyUrl("http://example.com/"), document_id="test_doc_postgres_1")
641641
]
642642
entry_with_docs = CacheEntry(
643643
query="user message",
@@ -664,7 +664,7 @@ def test_insert_and_get_with_referenced_documents(
664664
inserted_json_str = sql_params[-3]
665665

666666
assert json.loads(inserted_json_str) == [
667-
{"doc_url": "http://example.com/", "doc_title": "Test Doc", "source": None}
667+
{"doc_url": "http://example.com/", "doc_title": "Test Doc", "source": None, "document_id": "test_doc_postgres_1"}
668668
]
669669

670670
# Simulate the database returning that data
@@ -675,7 +675,7 @@ def test_insert_and_get_with_referenced_documents(
675675
"bar",
676676
"start_time",
677677
"end_time",
678-
[{"doc_url": "http://example.com/", "doc_title": "Test Doc"}],
678+
[{"doc_url": "http://example.com/", "doc_title": "Test Doc", "document_id": "test_doc_postgres_1"}],
679679
None, # tool_calls
680680
None, # tool_results
681681
)

tests/unit/cache/test_sqlite_cache.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -467,7 +467,7 @@ def test_insert_and_get_with_referenced_documents(tmpdir: Path) -> None:
467467

468468
# Create a CacheEntry with referenced documents
469469
docs = [
470-
ReferencedDocument(doc_title="Test Doc", doc_url=AnyUrl("http://example.com"))
470+
ReferencedDocument(doc_title="Test Doc", doc_url=AnyUrl("http://example.com"), document_id="test_doc_cache_1")
471471
]
472472
entry_with_docs = CacheEntry(
473473
query="user message",
@@ -571,7 +571,7 @@ def test_insert_and_get_with_all_fields(tmpdir: Path) -> None:
571571

572572
# Create all fields
573573
docs = [
574-
ReferencedDocument(doc_title="Test Doc", doc_url=AnyUrl("http://example.com"))
574+
ReferencedDocument(doc_title="Test Doc", doc_url=AnyUrl("http://example.com"), document_id="test_doc_cache_1")
575575
]
576576
tool_calls = [
577577
ToolCallSummary(

tests/unit/models/responses/test_successful_responses.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -321,7 +321,7 @@ def test_constructor_full(self) -> None:
321321
)
322322
]
323323
referenced_docs = [
324-
ReferencedDocument(doc_url=AnyUrl("https://example.com"), doc_title="Doc")
324+
ReferencedDocument(doc_url=AnyUrl("https://example.com"), doc_title="Doc", document_id="test_doc_1")
325325
]
326326

327327
response = QueryResponse( # type: ignore[call-arg]

tests/unit/utils/test_responses.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2126,10 +2126,11 @@ def test_parse_referenced_documents_file_search_call(
21262126
mock_result1.attributes = {
21272127
"link": "https://example.com/doc1",
21282128
"title": "Document 1",
2129+
"document_id": "doc_1",
21292130
}
21302131

21312132
mock_result2 = {
2132-
"attributes": {"url": "https://example.com/doc2", "title": "Document 2"},
2133+
"attributes": {"url": "https://example.com/doc2", "title": "Document 2", "doc_id": "doc_2"},
21332134
}
21342135

21352136
mock_output_item = mocker.Mock()
@@ -2143,8 +2144,10 @@ def test_parse_referenced_documents_file_search_call(
21432144
assert len(result) == 2
21442145
assert result[0].doc_title == "Document 1"
21452146
assert result[0].doc_url == AnyUrl("https://example.com/doc1")
2147+
assert result[0].document_id == "doc_1"
21462148
assert result[1].doc_title == "Document 2"
21472149
assert result[1].doc_url == AnyUrl("https://example.com/doc2")
2150+
assert result[1].document_id == "doc_2"
21482151

21492152
def test_parse_referenced_documents_message_annotations(
21502153
self, mocker: MockerFixture
@@ -2206,6 +2209,7 @@ def test_parse_referenced_documents_deduplication(
22062209
mock_result.attributes = {
22072210
"link": "https://example.com/doc1",
22082211
"title": "Document 1",
2212+
"document_id": "doc_1",
22092213
}
22102214

22112215
mock_output_item = mocker.Mock()
@@ -2992,6 +2996,7 @@ def test_single_store_source_populated(self, mocker: MockerFixture) -> None:
29922996
mock_result.attributes = {
29932997
"url": "https://docs.example.com/page",
29942998
"title": "Example Page",
2999+
"document_id": "doc_page_1",
29953000
}
29963001

29973002
mock_output = mocker.Mock()
@@ -3014,7 +3019,7 @@ def test_single_store_source_populated(self, mocker: MockerFixture) -> None:
30143019
def test_no_mapping_source_is_none(self, mocker: MockerFixture) -> None:
30153020
"""Test that source is None when no mapping provided."""
30163021
mock_result = mocker.Mock()
3017-
mock_result.attributes = {"title": "Doc"}
3022+
mock_result.attributes = {"title": "Doc", "document_id": "doc_no_mapping"}
30183023

30193024
mock_output = mocker.Mock()
30203025
mock_output.type = "file_search_call"
@@ -3031,7 +3036,7 @@ def test_no_mapping_source_is_none(self, mocker: MockerFixture) -> None:
30313036
def test_multiple_stores_source_is_none(self, mocker: MockerFixture) -> None:
30323037
"""Test that source is None with multiple stores (ambiguous)."""
30333038
mock_result = mocker.Mock()
3034-
mock_result.attributes = {"title": "Doc"}
3039+
mock_result.attributes = {"title": "Doc", "document_id": "doc_multi_stores"}
30353040

30363041
mock_output = mocker.Mock()
30373042
mock_output.type = "file_search_call"

0 commit comments

Comments
 (0)