Skip to content

Commit 37f72e5

Browse files
vertex-sdk-bot authored and copybara-github committed
fix: Fix column name inconsistency (history vs conversation_history) across evaluation flows
PiperOrigin-RevId: 900310250
1 parent 1f5af01 commit 37f72e5

4 files changed

Lines changed: 228 additions & 6 deletions

File tree

tests/unit/vertexai/genai/test_evals.py

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4691,6 +4691,34 @@ def test_convert_with_conversation_history(self):
46914691
eval_case.conversation_history[1].content.parts[0].text == "Old model msg"
46924692
)
46934693

4694+
def test_convert_with_conversation_history_column_name(self):
    """Tests that 'conversation_history' is accepted as a column name alias for 'history'."""
    # One-row dataset whose history column uses the long-form column name.
    history_rows = [
        {"role": "user", "parts": [{"text": "Old user msg"}]},
        {"role": "model", "parts": [{"text": "Old model msg"}]},
    ]
    frame = pd.DataFrame(
        {
            "prompt": ["Current prompt"],
            "response": ["A response"],
            "conversation_history": [history_rows],
        }
    )
    records = frame.to_dict(orient="records")

    converted = self.converter.convert(records)
    case = converted.eval_cases[0]

    expected_prompt = genai_types.Content(
        parts=[genai_types.Part(text="Current prompt")]
    )
    assert case.prompt == expected_prompt
    assert case.reference is None

    history = case.conversation_history
    assert len(history) == 2
    assert history[0].content.parts[0].text == "Old user msg"
    assert history[1].content.parts[0].text == "Old model msg"
4721+
46944722
def test_convert_missing_response_raises_value_error(self):
46954723
raw_data_df = pd.DataFrame({"prompt": ["Hello"]}) # Missing response
46964724
raw_data = raw_data_df.to_dict(orient="records")
@@ -8355,6 +8383,176 @@ def test_create_evaluation_set_with_agent_data(
83558383
assert candidate_response["candidate"] == "test-candidate"
83568384
assert candidate_response["agent_data"] == agent_data
83578385

8386+
@mock.patch.object(_evals_common, "evals")
@mock.patch.object(_evals_common, "_gcs_utils")
def test_create_evaluation_set_with_history_column(
    self, mock_gcs_utils, mock_evals_module
):
    """Tests that 'history' column is accepted and mapped to prompt_template_data."""
    frame = pd.DataFrame(
        [
            {
                "prompt": "test prompt",
                "response": "test response",
                "history": "previous conversation",
            }
        ]
    )

    # Stub out the GCS upload and the Evals service so no RPCs are made.
    gcs_stub = mock_gcs_utils.GcsUtils.return_value
    gcs_stub.upload_json_to_prefix.return_value = "gs://bucket/path/request.json"

    evals_stub = mock_evals_module.Evals.return_value
    item_stub = mock.Mock()
    item_stub.name = "eval_item_1"
    evals_stub.create_evaluation_item.return_value = item_stub
    set_stub = mock.Mock()
    evals_stub.create_evaluation_set.return_value = set_stub

    created = _evals_common._create_evaluation_set_from_dataframe(
        api_client=self.mock_api_client,
        gcs_dest_prefix="gs://bucket/prefix",
        eval_df=frame,
        candidate_name="test-candidate",
    )

    assert created == set_stub

    # The uploaded request must carry the history under the canonical
    # "conversation_history" key inside prompt_template_data.
    gcs_stub.upload_json_to_prefix.assert_called_once()
    payload = gcs_stub.upload_json_to_prefix.call_args.kwargs["data"]
    assert "prompt_template_data" in payload["prompt"]
    template_values = payload["prompt"]["prompt_template_data"]["values"]
    assert "conversation_history" in template_values
8431+
8432+
@mock.patch.object(_evals_common, "evals")
@mock.patch.object(_evals_common, "_gcs_utils")
def test_create_evaluation_set_with_conversation_history_column(
    self, mock_gcs_utils, mock_evals_module
):
    """Tests that 'conversation_history' column is accepted and mapped to prompt_template_data."""
    frame = pd.DataFrame(
        [
            {
                "prompt": "test prompt",
                "response": "test response",
                "conversation_history": "previous conversation",
            }
        ]
    )

    # Stub out the GCS upload and the Evals service so no RPCs are made.
    gcs_stub = mock_gcs_utils.GcsUtils.return_value
    gcs_stub.upload_json_to_prefix.return_value = "gs://bucket/path/request.json"

    evals_stub = mock_evals_module.Evals.return_value
    item_stub = mock.Mock()
    item_stub.name = "eval_item_1"
    evals_stub.create_evaluation_item.return_value = item_stub
    set_stub = mock.Mock()
    evals_stub.create_evaluation_set.return_value = set_stub

    created = _evals_common._create_evaluation_set_from_dataframe(
        api_client=self.mock_api_client,
        gcs_dest_prefix="gs://bucket/prefix",
        eval_df=frame,
        candidate_name="test-candidate",
    )

    assert created == set_stub

    # The long-form column name should land under the same canonical key.
    gcs_stub.upload_json_to_prefix.assert_called_once()
    payload = gcs_stub.upload_json_to_prefix.call_args.kwargs["data"]
    assert "prompt_template_data" in payload["prompt"]
    template_values = payload["prompt"]["prompt_template_data"]["values"]
    assert "conversation_history" in template_values
8477+
8478+
8479+
class TestResolveDataset:
    """Unit tests for the _resolve_dataset function."""

    def setup_method(self):
        # Fake API client with just the fields _resolve_dataset reads.
        self.mock_api_client = mock.Mock(spec=client.Client)
        self.mock_api_client.project = "test-project"
        self.mock_api_client.location = "us-central1"

    @mock.patch.object(_evals_common, "evals")
    @mock.patch.object(_evals_common, "_gcs_utils")
    def test_resolve_dataset_preserves_conversation_history(
        self, mock_gcs_utils, mock_evals_module
    ):
        """Tests that conversation_history from EvalCase is included in the DataFrame."""
        gcs_stub = mock_gcs_utils.GcsUtils.return_value
        gcs_stub.upload_json_to_prefix.return_value = "gs://bucket/path/request.json"

        evals_stub = mock_evals_module.Evals.return_value
        item_stub = mock.Mock()
        item_stub.name = "eval_item_1"
        evals_stub.create_evaluation_item.return_value = item_stub

        set_stub = mock.Mock()
        set_stub.name = "eval_set_1"
        evals_stub.create_evaluation_set.return_value = set_stub

        # Two prior turns that should survive the round-trip into the payload.
        prior_turns = [
            genai_types.Content(
                role="user", parts=[genai_types.Part(text="Old user msg")]
            ),
            genai_types.Content(
                role="model", parts=[genai_types.Part(text="Old model msg")]
            ),
        ]

        dataset = vertexai_genai_types.EvaluationDataset(
            eval_cases=[
                vertexai_genai_types.EvalCase(
                    prompt=genai_types.Content(
                        parts=[genai_types.Part(text="test prompt")]
                    ),
                    responses=[
                        vertexai_genai_types.ResponseCandidate(
                            response=genai_types.Content(
                                parts=[genai_types.Part(text="test response")]
                            )
                        )
                    ],
                    conversation_history=[
                        vertexai_genai_types.evals.Message(
                            turn_id=str(idx), content=turn
                        )
                        for idx, turn in enumerate(prior_turns)
                    ],
                )
            ]
        )

        resolved = _evals_common._resolve_dataset(
            api_client=self.mock_api_client,
            dataset=dataset,
            dest="gs://bucket/prefix",
        )

        assert resolved.evaluation_set == "eval_set_1"

        # Verify that conversation_history was passed through to the GCS upload
        gcs_stub.upload_json_to_prefix.assert_called_once()
        payload = gcs_stub.upload_json_to_prefix.call_args.kwargs["data"]
        assert "prompt_template_data" in payload["prompt"]
        template_values = payload["prompt"]["prompt_template_data"]["values"]
        assert "conversation_history" in template_values
8555+
83588556

83598557
class TestRateLimiter:
83608558
"""Tests for the RateLimiter class in _evals_utils."""

vertexai/_genai/_evals_common.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,18 @@ def _resolve_dataset(
326326
if event.content
327327
]
328328

329+
if case.conversation_history:
330+
history_parts = []
331+
for msg in case.conversation_history:
332+
if msg.content:
333+
role = msg.content.role or "user"
334+
text = _evals_data_converters._get_content_text(msg.content)
335+
history_parts.append(f"{role}: {text}")
336+
if history_parts:
337+
row[_evals_constant.CONVERSATION_HISTORY] = "\n".join(
338+
history_parts
339+
)
340+
329341
if case.user_scenario:
330342
if case.user_scenario.starting_prompt:
331343
row[_evals_constant.STARTING_PROMPT] = (
@@ -2586,6 +2598,14 @@ def _create_evaluation_set_from_dataframe(
25862598
)
25872599

25882600
prompt = None
2601+
# Determine which history column name is present, preferring
2602+
# "conversation_history" over "history" if both exist.
2603+
history_col = None
2604+
if _evals_constant.CONVERSATION_HISTORY in row:
2605+
history_col = _evals_constant.CONVERSATION_HISTORY
2606+
elif _evals_constant.HISTORY in row:
2607+
history_col = _evals_constant.HISTORY
2608+
25892609
if (
25902610
_evals_constant.STARTING_PROMPT in row
25912611
and _evals_constant.CONVERSATION_PLAN in row
@@ -2596,15 +2616,15 @@ def _create_evaluation_set_from_dataframe(
25962616
conversation_plan=row[_evals_constant.CONVERSATION_PLAN],
25972617
)
25982618
)
2599-
elif _evals_constant.CONTEXT in row or _evals_constant.HISTORY in row:
2619+
elif _evals_constant.CONTEXT in row or history_col:
26002620
values = {}
26012621
if _evals_constant.CONTEXT in row:
26022622
values[_evals_constant.CONTEXT] = _get_content(
26032623
row, _evals_constant.CONTEXT
26042624
)
2605-
if _evals_constant.HISTORY in row:
2606-
values[_evals_constant.HISTORY] = _get_content(
2607-
row, _evals_constant.HISTORY
2625+
if history_col:
2626+
values[_evals_constant.CONVERSATION_HISTORY] = _get_content(
2627+
row, history_col
26082628
)
26092629
if _evals_constant.PROMPT in row:
26102630
values[_evals_constant.PROMPT] = _get_content(

vertexai/_genai/_evals_constant.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,8 @@
5959
AGENT_DATA = "agent_data"
6060
STARTING_PROMPT = "starting_prompt"
6161
CONVERSATION_PLAN = "conversation_plan"
62-
HISTORY = "conversation_history"
62+
HISTORY = "history"
63+
CONVERSATION_HISTORY = "conversation_history"
6364

6465
COMMON_DATASET_COLUMNS = frozenset(
6566
{
@@ -69,6 +70,7 @@
6970
SESSION_INPUT,
7071
CONTEXT,
7172
HISTORY,
73+
CONVERSATION_HISTORY,
7274
STARTING_PROMPT,
7375
CONVERSATION_PLAN,
7476
AGENT_DATA,

vertexai/_genai/_evals_data_converters.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,9 @@ def convert(self, raw_data: list[dict[str, Any]]) -> types.EvaluationDataset:
196196
if not prompt_data:
197197
prompt_data = item.pop("source", None)
198198

199-
conversation_history_data = item.pop("history", None)
199+
conversation_history_data = item.pop("conversation_history", None)
200+
if conversation_history_data is None:
201+
conversation_history_data = item.pop("history", None)
200202
response_data = item.pop("response", None)
201203
reference_data = item.pop("reference", None)
202204
system_instruction_data = item.pop("instruction", None)

0 commit comments

Comments
 (0)