Skip to content

Commit a36e2bd

Browse files
vertex-sdk-bot authored and copybara-github committed
feat: GenAI Client(evals) - add rich HTML visualization for loss pattern analysis
PiperOrigin-RevId: 894799725
1 parent 9ed3759 commit a36e2bd

6 files changed

Lines changed: 686 additions & 73 deletions

File tree

tests/unit/vertexai/genai/test_evals.py

Lines changed: 90 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -520,7 +520,82 @@ def test_response_structure(self):
520520
assert result.clusters[0].item_count == 3
521521
assert result.clusters[1].cluster_id == "cluster-2"
522522

523-
def test_response_show_with_results(self, capsys):
523+
def test_get_loss_analysis_html(self):
    """Tests that _get_loss_analysis_html generates valid HTML with data."""
    from vertexai._genai import _evals_visualization
    import json

    # Build the payload from named pieces, innermost first.
    user_event = {
        "author": "user",
        "content": {
            "parts": [
                {"text": "Find flights to Paris"}
            ],
        },
    }
    evaluation_result = {
        "request": {
            "prompt": {
                "agent_data": {
                    "turns": [
                        {
                            "turn_index": 0,
                            "events": [user_event],
                        }
                    ],
                },
            },
        },
    }
    cluster = {
        "cluster_id": "c1",
        "taxonomy_entry": {
            "l1_category": "Tool Calling",
            "l2_category": "Missing Invocation",
            "description": "Agent failed to call the tool.",
        },
        "item_count": 5,
        "examples": [
            {
                "evaluation_result": evaluation_result,
                "failed_rubrics": [
                    {
                        "rubric_id": "tool_use",
                        "classification_rationale": "Did not invoke find_flights.",
                    }
                ],
            }
        ],
    }
    payload = {
        "results": [
            {
                "config": {
                    "metric": "test_metric",
                    "candidate": "test-candidate",
                },
                "clusters": [cluster],
            }
        ]
    }

    html = _evals_visualization._get_loss_analysis_html(json.dumps(payload))
    assert "Loss Pattern Analysis" in html
    assert "test_metric" not in html  # data is Base64-encoded in the HTML
    assert "<!DOCTYPE html>" in html
    assert "extractScenarioPreview" in html
    assert "example-scenario" in html
    assert "DOMPurify" in html  # uses DOMPurify for sanitization
    assert "example-section-label" in html  # labels for scenario/rubrics
    assert "Analysis Summary" in html  # summary heading
593+
594+
def test_display_loss_clusters_response_no_ipython(self):
595+
"""Tests graceful fallback when not in IPython."""
596+
from vertexai._genai import _evals_visualization
597+
from unittest import mock
598+
524599
response = common_types.GenerateLossClustersResponse(
525600
results=[
526601
common_types.LossAnalysisResult(
@@ -541,12 +616,17 @@ def test_response_show_with_results(self, capsys):
541616
)
542617
],
543618
)
544-
response.show()
545-
captured = capsys.readouterr()
546-
assert "test_metric" in captured.out
547-
assert "c1" in captured.out
619+
with mock.patch.object(
620+
_evals_visualization, "_is_ipython_env", return_value=False
621+
):
622+
# Should not raise, just log a warning
623+
response.show()
624+
625+
def test_display_loss_analysis_result_no_ipython(self):
626+
"""Tests graceful fallback for individual result when not in IPython."""
627+
from vertexai._genai import _evals_visualization
628+
from unittest import mock
548629

549-
def test_loss_analysis_result_show(self, capsys):
550630
result = common_types.LossAnalysisResult(
551631
config=common_types.LossAnalysisConfig(
552632
metric="test_metric",
@@ -563,10 +643,10 @@ def test_loss_analysis_result_show(self, capsys):
563643
),
564644
],
565645
)
566-
result.show()
567-
captured = capsys.readouterr()
568-
assert "test_metric" in captured.out
569-
assert "c1" in captured.out
646+
with mock.patch.object(
647+
_evals_visualization, "_is_ipython_env", return_value=False
648+
):
649+
result.show()
570650

571651

572652
def _make_eval_result(

vertexai/_genai/_evals_utils.py

Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -591,6 +591,218 @@ def _resolve_loss_analysis_config(
591591
return resolved_config
592592

593593

594+
def _build_rubric_description_map(
595+
eval_result: types.EvaluationResult,
596+
) -> dict[str, str]:
597+
"""Builds a rubric_id -> description map from the EvaluationResult."""
598+
rubric_map: dict[str, str] = {}
599+
for case_result in eval_result.eval_case_results or []:
600+
for resp_cand in case_result.response_candidate_results or []:
601+
for metric_res in (resp_cand.metric_results or {}).values():
602+
for verdict in metric_res.rubric_verdicts or []:
603+
rubric = verdict.evaluated_rubric
604+
if rubric and rubric.rubric_id and rubric.content:
605+
if (
606+
rubric.content.property
607+
and rubric.content.property.description
608+
):
609+
rubric_map[rubric.rubric_id] = (
610+
rubric.content.property.description
611+
)
612+
return rubric_map
613+
614+
615+
def _extract_scenario_preview_from_dict(
616+
eval_result_dict: dict[str, Any],
617+
) -> Optional[str]:
618+
"""Extracts the first user message from an evaluation_result dict.
619+
620+
Handles both snake_case (SDK-side) and camelCase (API echo-back) keys.
621+
"""
622+
request = eval_result_dict.get("request")
623+
if not request:
624+
return None
625+
prompt = request.get("prompt")
626+
if not prompt:
627+
return None
628+
# Try agent_data (snake_case or camelCase)
629+
agent_data = prompt.get("agent_data") or prompt.get("agentData")
630+
if agent_data and isinstance(agent_data, dict):
631+
turns = agent_data.get("turns", [])
632+
for turn in turns:
633+
events = turn.get("events", [])
634+
for event in events:
635+
author = event.get("author", "")
636+
content = event.get("content")
637+
if (
638+
author.lower() == "user"
639+
and content
640+
and isinstance(content, dict)
641+
):
642+
parts = content.get("parts", [])
643+
for part in parts:
644+
text = str(part.get("text", "")).strip()
645+
if text:
646+
if len(text) > 150:
647+
return text[:150] + "..."
648+
return text
649+
# Try simple prompt path
650+
parts = prompt.get("parts", [])
651+
for part in parts:
652+
text = str(part.get("text", "")).strip()
653+
if text:
654+
if len(text) > 150:
655+
return text[:150] + "..."
656+
return text
657+
return None
658+
659+
660+
def _extract_scenario_from_agent_data(agent_data: Any) -> Optional[str]:
661+
"""Extracts the first user message from an AgentData object or dict."""
662+
if agent_data is None:
663+
return None
664+
if hasattr(agent_data, "model_dump"):
665+
agent_data = agent_data.model_dump()
666+
if isinstance(agent_data, str):
667+
try:
668+
agent_data = json.loads(agent_data)
669+
except (json.JSONDecodeError, ValueError):
670+
return None
671+
if not isinstance(agent_data, dict):
672+
return None
673+
turns = agent_data.get("turns", [])
674+
if not isinstance(turns, list):
675+
return None
676+
for turn in turns:
677+
if not isinstance(turn, dict):
678+
continue
679+
events = turn.get("events", [])
680+
if not isinstance(events, list):
681+
continue
682+
for event in events:
683+
if not isinstance(event, dict):
684+
continue
685+
author = event.get("author", "")
686+
if not isinstance(author, str) or author.lower() != "user":
687+
continue
688+
content = event.get("content")
689+
if not content or not isinstance(content, dict):
690+
continue
691+
parts = content.get("parts", [])
692+
if not isinstance(parts, list):
693+
continue
694+
for part in parts:
695+
if not isinstance(part, dict):
696+
continue
697+
text = str(part.get("text", "")).strip()
698+
if text:
699+
if len(text) > 150:
700+
return text[:150] + "..."
701+
return text
702+
return None
703+
704+
705+
def _build_scenario_preview_list(
    eval_result: types.EvaluationResult,
) -> list[Optional[str]]:
    """Builds an ordered list of scenario previews from the EvaluationResult.

    Returns one scenario preview per eval_case_result, in the same order as
    eval_case_results. This extracts the first user message from the original
    SDK EvaluationResult (via eval_cases or DataFrame), rather than relying
    on the API echo-back which may not preserve the request data.
    """
    dataset = eval_result.evaluation_dataset
    cases: list[Any] = []
    if isinstance(dataset, list) and dataset:
        cases = getv(dataset[0], ["eval_cases"]) or []

    def _clip(raw: Any) -> str:
        # Normalizes and clips prompt text to the 150-char preview limit.
        text = str(raw).strip()
        return text[:150] + "..." if len(text) > 150 else text

    def _preview_for(case_idx: int) -> Optional[str]:
        # Prefer the structured eval_case at case_idx, when one exists.
        case = cases[case_idx] if 0 <= case_idx < len(cases) else None
        preview: Optional[str] = None
        if case:
            agent_data = getv(case, ["agent_data"])
            if agent_data:
                preview = _extract_scenario_from_agent_data(agent_data)
            elif getv(case, ["prompt"]):
                from . import _evals_data_converters

                raw = _evals_data_converters._get_content_text(
                    getv(case, ["prompt"])
                )
                if raw:
                    preview = _clip(raw)
        # Fallback: extract from DataFrame
        if preview is None:
            df_agent_data = _transformers._extract_agent_data_from_df(
                dataset, case_idx
            )
            if df_agent_data is not None:
                preview = _extract_scenario_from_agent_data(df_agent_data)
        return preview

    return [
        _preview_for(result.eval_case_index or 0)
        for result in (eval_result.eval_case_results or [])
    ]
759+
760+
761+
def _enrich_loss_response_with_rubric_descriptions(
    response: types.GenerateLossClustersResponse,
    eval_result: types.EvaluationResult,
) -> None:
    """Enriches loss response with rubric descriptions and scenario previews.

    Rubric descriptions and scenario previews are extracted from the original
    SDK EvaluationResult object, because the API echo-back in
    LossExample.evaluation_result may not preserve all request data (e.g.,
    agent_data turns with user messages).
    """
    rubric_map = _build_rubric_description_map(eval_result)
    scenario_list = _build_scenario_preview_list(eval_result)
    # Precompute the shared fallback: the first non-empty scenario preview.
    # A single eval_result typically maps to one scenario per eval case,
    # and loss examples share these.
    fallback_scenario = next((s for s in scenario_list if s), None)

    all_examples = (
        example
        for result in response.results or []
        for cluster in result.clusters or []
        for example in cluster.examples or []
    )
    for example in all_examples:
        record = example.evaluation_result
        if record is None:
            record = {}
            example.evaluation_result = record
        if rubric_map:
            record["rubric_descriptions"] = rubric_map
        if "scenario_preview" not in record:
            # Try extracting scenario from the API echo-back first.
            echoed = _extract_scenario_preview_from_dict(record)
            if echoed:
                record["scenario_preview"] = echoed
            elif fallback_scenario:
                # Fallback: match against scenarios from original eval_result.
                record["scenario_preview"] = fallback_scenario
804+
805+
594806
def _poll_operation(
595807
api_client: BaseApiClient,
596808
operation: types.GenerateLossClustersOperation,

0 commit comments

Comments
 (0)