Skip to content

Commit a36e2bd

Browse files
vertex-sdk-bot authored and copybara-github committed
feat: GenAI Client(evals) - add rich HTML visualization for loss pattern analysis
PiperOrigin-RevId: 894799725
1 parent 9ed3759 commit a36e2bd

6 files changed

Lines changed: 686 additions & 73 deletions

File tree

tests/unit/vertexai/genai/test_evals.py

Lines changed: 90 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -520,7 +520,82 @@ def test_response_structure(self):
520520
assert result.clusters[0].item_count == 3
521521
assert result.clusters[1].cluster_id == "cluster-2"
522522

523-
def test_response_show_with_results(self, capsys):
523+
def test_get_loss_analysis_html(self):
    """Tests that _get_loss_analysis_html generates valid HTML with data."""
    from vertexai._genai import _evals_visualization
    import json

    # Build the payload from named pieces, innermost first.
    user_event = {
        "author": "user",
        "content": {
            "parts": [
                {"text": "Find flights to Paris"}
            ],
        },
    }
    evaluation_result = {
        "request": {
            "prompt": {
                "agent_data": {
                    "turns": [
                        {
                            "turn_index": 0,
                            "events": [user_event],
                        }
                    ],
                },
            },
        },
    }
    cluster = {
        "cluster_id": "c1",
        "taxonomy_entry": {
            "l1_category": "Tool Calling",
            "l2_category": "Missing Invocation",
            "description": "Agent failed to call the tool.",
        },
        "item_count": 5,
        "examples": [
            {
                "evaluation_result": evaluation_result,
                "failed_rubrics": [
                    {
                        "rubric_id": "tool_use",
                        "classification_rationale": "Did not invoke find_flights.",
                    }
                ],
            }
        ],
    }
    payload = {
        "results": [
            {
                "config": {
                    "metric": "test_metric",
                    "candidate": "test-candidate",
                },
                "clusters": [cluster],
            }
        ]
    }

    html = _evals_visualization._get_loss_analysis_html(json.dumps(payload))
    assert "Loss Pattern Analysis" in html
    assert "test_metric" not in html  # data is Base64-encoded in the HTML
    assert "<!DOCTYPE html>" in html
    assert "extractScenarioPreview" in html
    assert "example-scenario" in html
    assert "DOMPurify" in html  # uses DOMPurify for sanitization
    assert "example-section-label" in html  # labels for scenario/rubrics
    assert "Analysis Summary" in html  # summary heading
593+
594+
def test_display_loss_clusters_response_no_ipython(self):
595+
"""Tests graceful fallback when not in IPython."""
596+
from vertexai._genai import _evals_visualization
597+
from unittest import mock
598+
524599
response = common_types.GenerateLossClustersResponse(
525600
results=[
526601
common_types.LossAnalysisResult(
@@ -541,12 +616,17 @@ def test_response_show_with_results(self, capsys):
541616
)
542617
],
543618
)
544-
response.show()
545-
captured = capsys.readouterr()
546-
assert "test_metric" in captured.out
547-
assert "c1" in captured.out
619+
with mock.patch.object(
620+
_evals_visualization, "_is_ipython_env", return_value=False
621+
):
622+
# Should not raise, just log a warning
623+
response.show()
624+
625+
def test_display_loss_analysis_result_no_ipython(self):
626+
"""Tests graceful fallback for individual result when not in IPython."""
627+
from vertexai._genai import _evals_visualization
628+
from unittest import mock
548629

549-
def test_loss_analysis_result_show(self, capsys):
550630
result = common_types.LossAnalysisResult(
551631
config=common_types.LossAnalysisConfig(
552632
metric="test_metric",
@@ -563,10 +643,10 @@ def test_loss_analysis_result_show(self, capsys):
563643
),
564644
],
565645
)
566-
result.show()
567-
captured = capsys.readouterr()
568-
assert "test_metric" in captured.out
569-
assert "c1" in captured.out
646+
with mock.patch.object(
647+
_evals_visualization, "_is_ipython_env", return_value=False
648+
):
649+
result.show()
570650

571651

572652
def _make_eval_result(

vertexai/_genai/_evals_utils.py

Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -591,6 +591,218 @@ def _resolve_loss_analysis_config(
591591
return resolved_config
592592

593593

594+
def _build_rubric_description_map(
595+
eval_result: types.EvaluationResult,
596+
) -> dict[str, str]:
597+
"""Builds a rubric_id -> description map from the EvaluationResult."""
598+
rubric_map: dict[str, str] = {}
599+
for case_result in eval_result.eval_case_results or []:
600+
for resp_cand in case_result.response_candidate_results or []:
601+
for metric_res in (resp_cand.metric_results or {}).values():
602+
for verdict in metric_res.rubric_verdicts or []:
603+
rubric = verdict.evaluated_rubric
604+
if rubric and rubric.rubric_id and rubric.content:
605+
if (
606+
rubric.content.property
607+
and rubric.content.property.description
608+
):
609+
rubric_map[rubric.rubric_id] = (
610+
rubric.content.property.description
611+
)
612+
return rubric_map
613+
614+
615+
def _extract_scenario_preview_from_dict(
616+
eval_result_dict: dict[str, Any],
617+
) -> Optional[str]:
618+
"""Extracts the first user message from an evaluation_result dict.
619+
620+
Handles both snake_case (SDK-side) and camelCase (API echo-back) keys.
621+
"""
622+
request = eval_result_dict.get("request")
623+
if not request:
624+
return None
625+
prompt = request.get("prompt")
626+
if not prompt:
627+
return None
628+
# Try agent_data (snake_case or camelCase)
629+
agent_data = prompt.get("agent_data") or prompt.get("agentData")
630+
if agent_data and isinstance(agent_data, dict):
631+
turns = agent_data.get("turns", [])
632+
for turn in turns:
633+
events = turn.get("events", [])
634+
for event in events:
635+
author = event.get("author", "")
636+
content = event.get("content")
637+
if (
638+
author.lower() == "user"
639+
and content
640+
and isinstance(content, dict)
641+
):
642+
parts = content.get("parts", [])
643+
for part in parts:
644+
text = str(part.get("text", "")).strip()
645+
if text:
646+
if len(text) > 150:
647+
return text[:150] + "..."
648+
return text
649+
# Try simple prompt path
650+
parts = prompt.get("parts", [])
651+
for part in parts:
652+
text = str(part.get("text", "")).strip()
653+
if text:
654+
if len(text) > 150:
655+
return text[:150] + "..."
656+
return text
657+
return None
658+
659+
660+
def _extract_scenario_from_agent_data(agent_data: Any) -> Optional[str]:
661+
"""Extracts the first user message from an AgentData object or dict."""
662+
if agent_data is None:
663+
return None
664+
if hasattr(agent_data, "model_dump"):
665+
agent_data = agent_data.model_dump()
666+
if isinstance(agent_data, str):
667+
try:
668+
agent_data = json.loads(agent_data)
669+
except (json.JSONDecodeError, ValueError):
670+
return None
671+
if not isinstance(agent_data, dict):
672+
return None
673+
turns = agent_data.get("turns", [])
674+
if not isinstance(turns, list):
675+
return None
676+
for turn in turns:
677+
if not isinstance(turn, dict):
678+
continue
679+
events = turn.get("events", [])
680+
if not isinstance(events, list):
681+
continue
682+
for event in events:
683+
if not isinstance(event, dict):
684+
continue
685+
author = event.get("author", "")
686+
if not isinstance(author, str) or author.lower() != "user":
687+
continue
688+
content = event.get("content")
689+
if not content or not isinstance(content, dict):
690+
continue
691+
parts = content.get("parts", [])
692+
if not isinstance(parts, list):
693+
continue
694+
for part in parts:
695+
if not isinstance(part, dict):
696+
continue
697+
text = str(part.get("text", "")).strip()
698+
if text:
699+
if len(text) > 150:
700+
return text[:150] + "..."
701+
return text
702+
return None
703+
704+
705+
def _build_scenario_preview_list(
    eval_result: types.EvaluationResult,
) -> list[Optional[str]]:
    """Builds an ordered list of scenario previews from the EvaluationResult.

    Returns one scenario preview per eval_case_result, in the same order as
    eval_case_results. This extracts the first user message from the original
    SDK EvaluationResult (via eval_cases or DataFrame), rather than relying
    on the API echo-back which may not preserve the request data.
    """
    dataset = eval_result.evaluation_dataset
    cases: list[Any] = []
    if isinstance(dataset, list) and dataset:
        cases = getv(dataset[0], ["eval_cases"]) or []

    def _clip(raw: Any) -> str:
        # Normalizes and clips prompt text to the 150-char preview limit.
        text = str(raw).strip()
        return text[:150] + "..." if len(text) > 150 else text

    def _preview_for(case_idx: int) -> Optional[str]:
        # Prefer the structured eval_case at case_idx, when one exists.
        case = cases[case_idx] if 0 <= case_idx < len(cases) else None
        preview: Optional[str] = None
        if case:
            agent_data = getv(case, ["agent_data"])
            if agent_data:
                preview = _extract_scenario_from_agent_data(agent_data)
            elif getv(case, ["prompt"]):
                from . import _evals_data_converters

                raw = _evals_data_converters._get_content_text(
                    getv(case, ["prompt"])
                )
                if raw:
                    preview = _clip(raw)
        # Fallback: extract from DataFrame
        if preview is None:
            df_agent_data = _transformers._extract_agent_data_from_df(
                dataset, case_idx
            )
            if df_agent_data is not None:
                preview = _extract_scenario_from_agent_data(df_agent_data)
        return preview

    return [
        _preview_for(result.eval_case_index or 0)
        for result in (eval_result.eval_case_results or [])
    ]
759+
760+
761+
def _enrich_loss_response_with_rubric_descriptions(
    response: types.GenerateLossClustersResponse,
    eval_result: types.EvaluationResult,
) -> None:
    """Enriches loss response with rubric descriptions and scenario previews.

    Rubric descriptions and scenario previews are extracted from the original
    SDK EvaluationResult object, because the API echo-back in
    LossExample.evaluation_result may not preserve all request data (e.g.,
    agent_data turns with user messages).
    """
    rubric_map = _build_rubric_description_map(eval_result)
    scenario_list = _build_scenario_preview_list(eval_result)
    # Precompute the shared fallback: the first non-empty scenario preview.
    # A single eval_result typically maps to one scenario per eval case,
    # and loss examples share these.
    fallback_scenario = next((s for s in scenario_list if s), None)

    all_examples = (
        example
        for result in response.results or []
        for cluster in result.clusters or []
        for example in cluster.examples or []
    )
    for example in all_examples:
        record = example.evaluation_result
        if record is None:
            record = {}
            example.evaluation_result = record
        if rubric_map:
            record["rubric_descriptions"] = rubric_map
        if "scenario_preview" not in record:
            # Try extracting scenario from the API echo-back first.
            echoed = _extract_scenario_preview_from_dict(record)
            if echoed:
                record["scenario_preview"] = echoed
            elif fallback_scenario:
                # Fallback: match against scenarios from original eval_result.
                record["scenario_preview"] = fallback_scenario
804+
805+
594806
def _poll_operation(
595807
api_client: BaseApiClient,
596808
operation: types.GenerateLossClustersOperation,

0 commit comments

Comments
 (0)