
Commit b24330d

vertex-sdk-bot authored and copybara-github committed

feat: GenAI Client(evals): Update to enable agent run functionality in eval management service

PiperOrigin-RevId: 877600757
1 parent fba5350 commit b24330d

5 files changed: 372 additions & 19 deletions

tests/unit/vertexai/genai/test_evals.py

Lines changed: 261 additions & 0 deletions
@@ -5822,3 +5822,264 @@ async def test_async_generate_user_scenarios(self):
         assert len(eval_dataset.eval_dataset_df) == 2
 
         self.mock_api_client.async_request.assert_called_once()
+
+
+class TestCreateEvaluationSetFromDataFrame:
+    """Unit tests for the _create_evaluation_set_from_dataframe function."""
+
+    def setup_method(self):
+        self.mock_api_client = mock.Mock(spec=client.Client)
+        self.mock_api_client.project = "test-project"
+        self.mock_api_client.location = "us-central1"
+
+    @mock.patch.object(_evals_common, "evals")
+    @mock.patch.object(_evals_common, "_gcs_utils")
+    def test_create_evaluation_set_with_intermediate_events(
+        self, mock_gcs_utils, mock_evals_module
+    ):
+        intermediate_events = [
+            {
+                "content": {"parts": [{"text": "thought 1"}]},
+                "timestamp": "2024-01-01T00:00:00Z",
+            },
+            {
+                "content": {"parts": [{"functionCall": {"name": "foo"}}]},
+                "timestamp": "2024-01-01T00:00:01Z",
+            },
+        ]
+
+        eval_df = pd.DataFrame(
+            [
+                {
+                    "prompt": "test prompt",
+                    "response": "test response",
+                    "intermediate_events": intermediate_events,
+                }
+            ]
+        )
+
+        mock_gcs_instance = mock_gcs_utils.GcsUtils.return_value
+        mock_gcs_instance.upload_json_to_prefix.return_value = (
+            "gs://bucket/path/request.json"
+        )
+
+        mock_evals_instance = mock_evals_module.Evals.return_value
+        mock_eval_item = mock.Mock()
+        mock_eval_item.name = "eval_item_1"
+        mock_evals_instance.create_evaluation_item.return_value = mock_eval_item
+
+        mock_eval_set = mock.Mock()
+        mock_evals_instance.create_evaluation_set.return_value = mock_eval_set
+
+        result = _evals_common._create_evaluation_set_from_dataframe(
+            api_client=self.mock_api_client,
+            gcs_dest_prefix="gs://bucket/prefix",
+            eval_df=eval_df,
+            candidate_name="test-candidate",
+        )
+
+        assert result == mock_eval_set
+
+        mock_gcs_instance.upload_json_to_prefix.assert_called_once()
+        call_args = mock_gcs_instance.upload_json_to_prefix.call_args
+        uploaded_data = call_args.kwargs["data"]
+
+        candidate_responses = uploaded_data["candidate_responses"]
+        assert len(candidate_responses) == 1
+        candidate_response = candidate_responses[0]
+        assert candidate_response["candidate"] == "test-candidate"
+        assert candidate_response["text"] == "test response"
+
+        expected_events = [
+            {"parts": [{"text": "thought 1"}]},
+            {"parts": [{"function_call": {"name": "foo"}}]},
+        ]
+        assert candidate_response["events"] == expected_events
+
+    @mock.patch.object(_evals_common, "evals")
+    @mock.patch.object(_evals_common, "_gcs_utils")
+    def test_create_evaluation_set_with_user_scenario(
+        self, mock_gcs_utils, mock_evals_module
+    ):
+        eval_df = pd.DataFrame(
+            [
+                {
+                    "starting_prompt": "test starting prompt",
+                    "conversation_plan": "test conversation plan",
+                }
+            ]
+        )
+
+        mock_gcs_instance = mock_gcs_utils.GcsUtils.return_value
+        mock_gcs_instance.upload_json_to_prefix.return_value = (
+            "gs://bucket/path/request.json"
+        )
+
+        mock_evals_instance = mock_evals_module.Evals.return_value
+        mock_eval_item = mock.Mock()
+        mock_eval_item.name = "eval_item_1"
+        mock_evals_instance.create_evaluation_item.return_value = mock_eval_item
+
+        mock_eval_set = mock.Mock()
+        mock_evals_instance.create_evaluation_set.return_value = mock_eval_set
+
+        result = _evals_common._create_evaluation_set_from_dataframe(
+            api_client=self.mock_api_client,
+            gcs_dest_prefix="gs://bucket/prefix",
+            eval_df=eval_df,
+            candidate_name="test-candidate",
+        )
+
+        assert result == mock_eval_set
+
+        mock_gcs_instance.upload_json_to_prefix.assert_called_once()
+        call_args = mock_gcs_instance.upload_json_to_prefix.call_args
+        uploaded_data = call_args.kwargs["data"]
+
+        assert uploaded_data.get("candidate_responses") is None
+        assert uploaded_data["prompt"]["user_scenario"] == {
+            "starting_prompt": "test starting prompt",
+            "conversation_plan": "test conversation plan",
+        }
+
+    @mock.patch.object(_evals_common, "evals")
+    @mock.patch.object(_evals_common, "_gcs_utils")
+    def test_create_evaluation_set_with_agent_data(
+        self, mock_gcs_utils, mock_evals_module
+    ):
+        agent_data = {"turns": [{"turn_id": "turn1", "events": []}]}
+        eval_df = pd.DataFrame(
+            [
+                {
+                    "prompt": "test prompt",
+                    "agent_data": agent_data,
+                }
+            ]
+        )
+
+        mock_gcs_instance = mock_gcs_utils.GcsUtils.return_value
+        mock_gcs_instance.upload_json_to_prefix.return_value = (
+            "gs://bucket/path/request.json"
+        )
+
+        mock_evals_instance = mock_evals_module.Evals.return_value
+        mock_eval_item = mock.Mock()
+        mock_eval_item.name = "eval_item_1"
+        mock_evals_instance.create_evaluation_item.return_value = mock_eval_item
+
+        mock_eval_set = mock.Mock()
+        mock_evals_instance.create_evaluation_set.return_value = mock_eval_set
+
+        result = _evals_common._create_evaluation_set_from_dataframe(
+            api_client=self.mock_api_client,
+            gcs_dest_prefix="gs://bucket/prefix",
+            eval_df=eval_df,
+            candidate_name="test-candidate",
+        )
+
+        assert result == mock_eval_set
+
+        mock_gcs_instance.upload_json_to_prefix.assert_called_once()
+        call_args = mock_gcs_instance.upload_json_to_prefix.call_args
+        uploaded_data = call_args.kwargs["data"]
+
+        assert uploaded_data["prompt"]["text"] == "test prompt"
+        candidate_responses = uploaded_data["candidate_responses"]
+        assert len(candidate_responses) == 1
+        candidate_response = candidate_responses[0]
+        assert candidate_response["candidate"] == "test-candidate"
+        assert candidate_response["agent_data"] == agent_data
+
+    @mock.patch.object(_evals_common, "evals")
+    @mock.patch.object(_evals_common, "_gcs_utils")
+    def test_create_evaluation_set_with_user_scenario(
+        self, mock_gcs_utils, mock_evals_module
+    ):
+        eval_df = pd.DataFrame(
+            [
+                {
+                    "starting_prompt": "test starting prompt",
+                    "conversation_plan": "test conversation plan",
+                }
+            ]
+        )
+
+        mock_gcs_instance = mock_gcs_utils.GcsUtils.return_value
+        mock_gcs_instance.upload_json_to_prefix.return_value = (
+            "gs://bucket/path/request.json"
+        )
+
+        mock_evals_instance = mock_evals_module.Evals.return_value
+        mock_eval_item = mock.Mock()
+        mock_eval_item.name = "eval_item_1"
+        mock_evals_instance.create_evaluation_item.return_value = mock_eval_item
+
+        mock_eval_set = mock.Mock()
+        mock_evals_instance.create_evaluation_set.return_value = mock_eval_set
+
+        result = _evals_common._create_evaluation_set_from_dataframe(
+            api_client=self.mock_api_client,
+            gcs_dest_prefix="gs://bucket/prefix",
+            eval_df=eval_df,
+            candidate_name="test-candidate",
+        )
+
+        assert result == mock_eval_set
+
+        mock_gcs_instance.upload_json_to_prefix.assert_called_once()
+        call_args = mock_gcs_instance.upload_json_to_prefix.call_args
+        uploaded_data = call_args.kwargs["data"]
+
+        assert uploaded_data.get("candidate_responses") is None
+        assert uploaded_data["prompt"]["user_scenario"] == {
+            "starting_prompt": "test starting prompt",
+            "conversation_plan": "test conversation plan",
+        }
+
+    @mock.patch.object(_evals_common, "evals")
+    @mock.patch.object(_evals_common, "_gcs_utils")
+    def test_create_evaluation_set_with_agent_data(
+        self, mock_gcs_utils, mock_evals_module
+    ):
+        agent_data = {"turns": [{"turn_id": "turn1", "events": []}]}
+        eval_df = pd.DataFrame(
+            [
+                {
+                    "prompt": "test prompt",
+                    "agent_data": agent_data,
+                }
+            ]
+        )
+
+        mock_gcs_instance = mock_gcs_utils.GcsUtils.return_value
+        mock_gcs_instance.upload_json_to_prefix.return_value = (
+            "gs://bucket/path/request.json"
+        )
+
+        mock_evals_instance = mock_evals_module.Evals.return_value
+        mock_eval_item = mock.Mock()
+        mock_eval_item.name = "eval_item_1"
+        mock_evals_instance.create_evaluation_item.return_value = mock_eval_item
+
+        mock_eval_set = mock.Mock()
+        mock_evals_instance.create_evaluation_set.return_value = mock_eval_set
+
+        result = _evals_common._create_evaluation_set_from_dataframe(
+            api_client=self.mock_api_client,
+            gcs_dest_prefix="gs://bucket/prefix",
+            eval_df=eval_df,
+            candidate_name="test-candidate",
+        )
+
+        assert result == mock_eval_set
+
+        mock_gcs_instance.upload_json_to_prefix.assert_called_once()
+        call_args = mock_gcs_instance.upload_json_to_prefix.call_args
+        uploaded_data = call_args.kwargs["data"]
+
+        assert uploaded_data["prompt"]["text"] == "test prompt"
+        candidate_responses = uploaded_data["candidate_responses"]
+        assert len(candidate_responses) == 1
+        candidate_response = candidate_responses[0]
+        assert candidate_response["candidate"] == "test-candidate"
+        assert candidate_response["agent_data"] == agent_data
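
For orientation: the three tests above cover the three DataFrame row shapes that _create_evaluation_set_from_dataframe now accepts. A minimal sketch of those inputs (not part of the commit; only the column names and values are taken from the diff):

    import pandas as pd

    # Row with a response plus intermediate events (agent thoughts / tool
    # calls); each event's "content" is copied into the candidate response's
    # events list.
    events_df = pd.DataFrame([{
        "prompt": "test prompt",
        "response": "test response",
        "intermediate_events": [
            {"content": {"parts": [{"text": "thought 1"}]},
             "timestamp": "2024-01-01T00:00:00Z"},
        ],
    }])

    # Row describing a user scenario: no model response yet, just a starting
    # prompt and a conversation plan, mapped to EvaluationPrompt.user_scenario.
    scenario_df = pd.DataFrame([{
        "starting_prompt": "test starting prompt",
        "conversation_plan": "test conversation plan",
    }])

    # Row carrying a structured agent trace, attached to the candidate
    # response as agent_data.
    agent_df = pd.DataFrame([{
        "prompt": "test prompt",
        "agent_data": {"turns": [{"turn_id": "turn1", "events": []}]},
    }])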

vertexai/_genai/_evals_common.py

Lines changed: 51 additions & 11 deletions
@@ -35,6 +35,7 @@
 from google.genai.models import Models
 import pandas as pd
 from tqdm import tqdm
+from pydantic import ValidationError
 
 from . import _evals_constant
 from . import _evals_data_converters
@@ -2247,7 +2248,52 @@ def _create_evaluation_set_from_dataframe(
         for event in row[_evals_constant.INTERMEDIATE_EVENTS]:
             if CONTENT in event:
                 intermediate_events.append(event[CONTENT])
-        if _evals_constant.CONTEXT in row or _evals_constant.HISTORY in row:
+
+        agent_data_obj = None
+        if _evals_constant.AGENT_DATA in row:
+            agent_data_val = row[AGENT_DATA]
+            if isinstance(agent_data_val, str):
+                try:
+                    agent_data_val = json.loads(agent_data_val)
+                except json.JSONDecodeError:
+                    pass
+            if isinstance(agent_data_val, dict):
+                try:
+                    agent_data_obj = types.evals.AgentData.model_validate(
+                        agent_data_val
+                    )
+                except ValidationError:
+                    pass
+            elif isinstance(agent_data_val, types.evals.AgentData):
+                agent_data_obj = agent_data_val
+
+        candidate_responses = []
+        if (
+            _evals_constant.RESPONSE in row
+            or agent_data_obj
+            or intermediate_events
+        ):
+            candidate_responses.append(
+                types.CandidateResponse(
+                    candidate=candidate_name or "Candidate 1",
+                    text=row.get(_evals_constant.RESPONSE) or None,
+                    events=intermediate_events or None,
+                    agent_data=agent_data_obj,
+                )
+            )
+
+        prompt = None
+        if (
+            _evals_constant.STARTING_PROMPT in row
+            and _evals_constant.CONVERSATION_PLAN in row
+        ):
+            prompt = types.EvaluationPrompt(
+                user_scenario=types.evals.UserScenario(
+                    starting_prompt=row[_evals_constant.STARTING_PROMPT],
+                    conversation_plan=row[_evals_constant.CONVERSATION_PLAN],
+                )
+            )
+        elif _evals_constant.CONTEXT in row or _evals_constant.HISTORY in row:
             values = {}
             if _evals_constant.CONTEXT in row:
                 values[_evals_constant.CONTEXT] = _get_content(
@@ -2266,15 +2312,7 @@ def _create_evaluation_set_from_dataframe(
             )
         elif _evals_constant.PROMPT in row:
             prompt = types.EvaluationPrompt(text=row[_evals_constant.PROMPT])
-        candidate_responses = []
-        if _evals_constant.RESPONSE in row:
-            candidate_responses.append(
-                types.CandidateResponse(
-                    candidate=candidate_name or "Candidate 1",
-                    text=row[_evals_constant.RESPONSE],
-                    events=intermediate_events or None,
-                )
-            )
+
         eval_item_requests.append(
             types.EvaluationItemRequest(
                 prompt=prompt or None,
@@ -2283,7 +2321,9 @@ def _create_evaluation_set_from_dataframe(
                     if _evals_constant.REFERENCE in row
                     else None
                 ),
-                candidate_responses=candidate_responses,
+                candidate_responses=(
+                    candidate_responses if candidate_responses else None
+                ),
             )
         )
     logger.info("Writing evaluation item requests to GCS.")
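
Behaviorally, the candidate-response construction moves ahead of the prompt handling and now fires when any of response, agent_data, or intermediate_events is present (an empty list is normalized to None on the EvaluationItemRequest). The agent_data coercion accepts a JSON string, a dict, or an AgentData instance. A self-contained sketch of that coercion, assuming a simple pydantic stand-in for types.evals.AgentData, whose real fields this diff does not show:

    import json
    from typing import Any, Optional

    from pydantic import BaseModel, ValidationError

    class AgentData(BaseModel):  # hypothetical stand-in for types.evals.AgentData
        turns: list = []

    def coerce_agent_data(value: Any) -> Optional[AgentData]:
        # JSON strings are decoded first; strings that fail to parse fall
        # through unchanged and end up ignored, as in the diff.
        if isinstance(value, str):
            try:
                value = json.loads(value)
            except json.JSONDecodeError:
                pass
        # Dicts are validated into the model; validation errors are swallowed,
        # mirroring the `except ValidationError: pass` above.
        if isinstance(value, dict):
            try:
                return AgentData.model_validate(value)
            except ValidationError:
                return None
        if isinstance(value, AgentData):
            return value
        return None

    assert coerce_agent_data('{"turns": []}') is not None
    assert coerce_agent_data("not json") is None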

vertexai/_genai/_evals_constant.py

Lines changed: 2 additions & 0 deletions
@@ -57,6 +57,8 @@
 PARTS = "parts"
 USER_AUTHOR = "user"
 AGENT_DATA = "agent_data"
+STARTING_PROMPT = "starting_prompt"
+CONVERSATION_PLAN = "conversation_plan"
 HISTORY = "conversation_history"
 
 COMMON_DATASET_COLUMNS = frozenset(
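
These two column names gate the new user-scenario branch in _evals_common: a row must contain both for it to fire. A tiny illustration with a hypothetical row (not from the commit):

    STARTING_PROMPT = "starting_prompt"
    CONVERSATION_PLAN = "conversation_plan"

    row = {
        "starting_prompt": "Book a flight",
        "conversation_plan": "User changes the date once",
    }
    # Both columns are required; with only one present, the code falls back to
    # the context/history or plain-prompt branches.
    is_user_scenario = STARTING_PROMPT in row and CONVERSATION_PLAN in row
    assert is_user_scenario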

vertexai/_genai/types/__init__.py

Lines changed: 6 additions & 0 deletions
@@ -130,6 +130,9 @@
 from .common import AgentEngineSessionOperation
 from .common import AgentEngineSessionOperationDict
 from .common import AgentEngineSessionOperationOrDict
+from .common import AgentRunConfig
+from .common import AgentRunConfigDict
+from .common import AgentRunConfigOrDict
 from .common import AgentServerMode
 from .common import AggregatedMetricResult
 from .common import AggregatedMetricResultDict
@@ -1391,6 +1394,9 @@
     "EvaluationRunAgentConfig",
     "EvaluationRunAgentConfigDict",
     "EvaluationRunAgentConfigOrDict",
+    "AgentRunConfig",
+    "AgentRunConfigDict",
+    "AgentRunConfigOrDict",
     "EvaluationRunInferenceConfig",
     "EvaluationRunInferenceConfigDict",
     "EvaluationRunInferenceConfigOrDict",

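The new exports follow the SDK's existing Name / NameDict / NameOrDict convention (a typed class, its TypedDict form, and their union). Since the commit does not show AgentRunConfig's fields, only the imports are sketched here:

    from vertexai._genai.types import (
        AgentRunConfig,
        AgentRunConfigDict,
        AgentRunConfigOrDict,
    )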