
Commit 3af9262

jsondai authored and copybara-github committed
chore: GenAI Client - Add replay tests for 17 RubricMetrics in evals SDK
PiperOrigin-RevId: 900984771
1 parent f2d73fd commit 3af9262
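
Context for reviewers: the flow these replay tests drive, reduced to a standalone call. This is a minimal sketch, not part of the commit; the client construction, project, and location values are assumptions based on the public Vertex AI SDK docs, while the dataset columns, the `types.RubricMetric` constants, and the `client.evals.evaluate` call mirror the test code below.

# Sketch only: placeholder project/location; import paths follow the public
# Vertex AI SDK docs (the replay tests resolve `types` via test helpers).
import pandas as pd
import vertexai
from vertexai import types

client = vertexai.Client(project="my-project", location="us-central1")

eval_dataset = types.EvaluationDataset(
    eval_dataset_df=pd.DataFrame(
        {
            "prompt": ["Summarize the benefits of regular exercise."],
            "response": ["Exercise improves heart health, mood, and sleep."],
        }
    ),
)

# RubricMetric constants are predefined rubric-based evaluators, as used in
# the tests in this commit.
evaluation_result = client.evals.evaluate(
    dataset=eval_dataset,
    metrics=[types.RubricMetric.GENERAL_QUALITY, types.RubricMetric.TEXT_QUALITY],
)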

2 files changed: 269 additions & 1 deletion


tests/unit/vertexai/genai/replays/test_evaluate.py

Lines changed: 128 additions & 1 deletion
@@ -329,6 +329,8 @@ def test_evaluation_agent_data(client):
 
     metrics = [
         types.RubricMetric.MULTI_TURN_TRAJECTORY_QUALITY,
+        types.RubricMetric.MULTI_TURN_TOOL_USE_QUALITY,
+        types.RubricMetric.MULTI_TURN_TASK_SUCCESS,
     ]
 
     evaluation_result = client.evals.evaluate(dataset=eval_dataset, metrics=metrics)
@@ -458,10 +460,135 @@ def parse_results(responses):
         "my_custom_metric"
     ]
     assert metric_result.score is not None
-    assert metric_result.score > 0.2
+    assert metric_result.score >= 0.0
     assert metric_result.error_message is None
 
 
+def test_evaluation_single_turn_agent_data(client):
+    """Tests single-turn AgentData eval with agent quality metrics."""
+    client._api_client._http_options.api_version = "v1beta1"
+
+    weather_agent = {
+        "weather_bot": types.evals.AgentConfig(
+            agent_id="weather_bot",
+            agent_type="SpecialistAgent",
+            description="Handles weather queries.",
+            instruction=(
+                "You are a weather assistant. Use the get_weather tool to"
+                " answer weather questions."
+            ),
+            tools=[
+                genai_types.Tool(
+                    function_declarations=[
+                        genai_types.FunctionDeclaration(
+                            name="get_weather",
+                            description=(
+                                "Gets the current weather for a given location."
+                            ),
+                        )
+                    ]
+                )
+            ],
+        ),
+    }
+
+    eval_case = types.EvalCase(
+        eval_case_id="successful-tool-use",
+        agent_data=types.evals.AgentData(
+            agents=weather_agent,
+            turns=[
+                types.evals.ConversationTurn(
+                    turn_index=0,
+                    events=[
+                        types.evals.AgentEvent(
+                            author="user",
+                            content=genai_types.Content(
+                                role="user",
+                                parts=[
+                                    genai_types.Part(
+                                        text="What is the weather in Tokyo?"
+                                    )
+                                ],
+                            ),
+                        ),
+                        types.evals.AgentEvent(
+                            author="weather_bot",
+                            content=genai_types.Content(
+                                role="model",
+                                parts=[
+                                    genai_types.Part(
+                                        function_call=genai_types.FunctionCall(
+                                            id="tool_call_0",
+                                            name="get_weather",
+                                            args={"location": "Tokyo"},
+                                        )
+                                    )
+                                ],
+                            ),
+                        ),
+                        types.evals.AgentEvent(
+                            author="weather_bot",
+                            content=genai_types.Content(
+                                role="tool",
+                                parts=[
+                                    genai_types.Part(
+                                        function_response=genai_types.FunctionResponse(
+                                            id="tool_call_0",
+                                            name="get_weather",
+                                            response={
+                                                "weather": "75F and sunny"
+                                            },
+                                        )
+                                    )
+                                ],
+                            ),
+                        ),
+                        types.evals.AgentEvent(
+                            author="weather_bot",
+                            content=genai_types.Content(
+                                role="model",
+                                parts=[
+                                    genai_types.Part(
+                                        text=(
+                                            "It is currently 75F and sunny in"
+                                            " Tokyo."
+                                        )
+                                    )
+                                ],
+                            ),
+                        ),
+                    ],
+                )
+            ],
+        ),
+    )
+
+    eval_dataset = types.EvaluationDataset(eval_cases=[eval_case])
+
+    metrics = [
+        types.RubricMetric.FINAL_RESPONSE_QUALITY,
+        types.RubricMetric.TOOL_USE_QUALITY,
+        types.RubricMetric.HALLUCINATION,
+        types.RubricMetric.SAFETY,
+        types.RubricMetric.GENERAL_QUALITY,
+        types.RubricMetric.TEXT_QUALITY,
+    ]
+
+    evaluation_result = client.evals.evaluate(
+        dataset=eval_dataset, metrics=metrics
+    )
+
+    assert isinstance(evaluation_result, types.EvaluationResult)
+    assert evaluation_result.summary_metrics is not None
+    assert len(evaluation_result.summary_metrics) > 0
+    for summary in evaluation_result.summary_metrics:
+        assert isinstance(summary, types.AggregatedMetricResult)
+        assert summary.metric_name is not None
+
+    assert evaluation_result.eval_case_results is not None
+    assert len(evaluation_result.eval_case_results) == 1
+
+
 pytestmark = pytest_helper.setup(
     file=__file__,
     globals_for_file=globals(),
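
The assertions above walk the result structure field by field; outside a test the same fields can be read directly. A short sketch, continuing from the standalone example after the commit header (the printed values are illustrative):

# Sketch: reading an EvaluationResult the way the assertions above do.
# `evaluation_result` comes from the client.evals.evaluate(...) sketch earlier.
for summary in evaluation_result.summary_metrics:
    # One AggregatedMetricResult per metric.
    print(summary.metric_name)

for case_result in evaluation_result.eval_case_results:
    # One EvalCaseResult per dataset row, with per-candidate metric results.
    print(case_result.eval_case_index, case_result.response_candidate_results)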

tests/unit/vertexai/genai/replays/test_evaluate_predefined_metrics.py

Lines changed: 141 additions & 0 deletions
@@ -415,6 +415,147 @@ def test_evaluation_gecko_text2video_metric(client):
         assert case_result.response_candidate_results is not None
 
 
+def test_single_turn_rubric_metrics(client):
+    """Tests single-turn text quality RubricMetrics with reference."""
+    prompts_df = pd.DataFrame(
+        {
+            "prompt": [
+                "Summarize the benefits of regular exercise."
+            ],
+            "response": [
+                "Exercise improves cardiovascular health, boosts mood through"
+                " endorphin release, strengthens muscles and bones, and enhances"
+                " sleep quality. Regular physical activity also helps maintain a"
+                " healthy weight and reduces the risk of chronic diseases."
+            ],
+            "reference": [
+                "Exercise improves heart health, mood, muscle strength,"
+                " and sleep."
+            ],
+            "context": [
+                "Exercise improves heart health, mood, muscle strength,"
+                " and sleep."
+            ],
+        }
+    )
+
+    eval_dataset = types.EvaluationDataset(
+        eval_dataset_df=prompts_df,
+        candidate_name="gemini-2.5-flash",
+    )
+
+    predefined_metrics = [
+        types.RubricMetric.INSTRUCTION_FOLLOWING,
+        types.RubricMetric.GENERAL_QUALITY,
+        types.RubricMetric.TEXT_QUALITY,
+        types.RubricMetric.GROUNDING,
+        types.RubricMetric.SAFETY,
+        types.RubricMetric.FINAL_RESPONSE_MATCH,
+        types.RubricMetric.FINAL_RESPONSE_REFERENCE_FREE,
+    ]
+
+    evaluation_result = client.evals.evaluate(
+        dataset=eval_dataset,
+        metrics=predefined_metrics,
+    )
+
+    assert isinstance(evaluation_result, types.EvaluationResult)
+    assert evaluation_result.summary_metrics is not None
+    assert len(evaluation_result.summary_metrics) > 0
+    for summary in evaluation_result.summary_metrics:
+        assert isinstance(summary, types.AggregatedMetricResult)
+        assert summary.metric_name is not None
+
+    assert evaluation_result.eval_case_results is not None
+    assert len(evaluation_result.eval_case_results) > 0
+    for case_result in evaluation_result.eval_case_results:
+        assert isinstance(case_result, types.EvalCaseResult)
+        assert case_result.eval_case_index is not None
+        assert case_result.response_candidate_results is not None
+
+
+def test_multi_turn_additional_chat_metrics(client):
+    """Tests additional multi-turn chat quality metrics."""
+    prompts_data = {
+        "request": [
+            {
+                "contents": [
+                    {
+                        "parts": [
+                            {
+                                "text": (
+                                    "I need to book a flight to NYC for next"
+                                    " Monday."
+                                )
+                            }
+                        ],
+                        "role": "user",
+                    },
+                    {
+                        "parts": [
+                            {
+                                "text": (
+                                    "I found flight UA100 to NYC for $300."
+                                    " Would you like to book it?"
+                                )
+                            }
+                        ],
+                        "role": "model",
+                    },
+                    {
+                        "parts": [
+                            {
+                                "text": (
+                                    "Yes, book that. I also need a hotel"
+                                    " in NYC."
+                                )
+                            }
+                        ],
+                        "role": "user",
+                    },
+                ]
+            },
+        ],
+        "response": [
+            (
+                "I recommend the Central Park Hotel, rated 4.5 stars."
+                " Shall I book it for you?"
+            ),
+        ],
+    }
+
+    prompts_df = pd.DataFrame(prompts_data)
+
+    eval_dataset = types.EvaluationDataset(
+        eval_dataset_df=prompts_df,
+        candidate_name="gemini-2.5-flash",
+    )
+
+    predefined_metrics = [
+        types.RubricMetric.MULTI_TURN_TEXT_QUALITY,
+        types.RubricMetric.MULTI_TURN_GENERAL_QUALITY,
+    ]
+
+    evaluation_result = client.evals.evaluate(
+        dataset=eval_dataset,
+        metrics=predefined_metrics,
+    )
+
+    assert isinstance(evaluation_result, types.EvaluationResult)
+    assert evaluation_result.summary_metrics is not None
+    assert len(evaluation_result.summary_metrics) > 0
+    for summary in evaluation_result.summary_metrics:
+        assert isinstance(summary, types.AggregatedMetricResult)
+        assert summary.metric_name is not None
+
+    assert evaluation_result.eval_case_results is not None
+    assert len(evaluation_result.eval_case_results) > 0
+    for case_result in evaluation_result.eval_case_results:
+        assert isinstance(case_result, types.EvalCaseResult)
+        assert case_result.eval_case_index is not None
+        assert case_result.response_candidate_results is not None
+
+
 pytestmark = pytest_helper.setup(
     file=__file__,
     globals_for_file=globals(),
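
Condensed from test_multi_turn_additional_chat_metrics above: the multi-turn chat dataset is a plain DataFrame in which "request" carries the prior conversation as Content-style dicts and "response" carries the candidate's final reply. A sketch; the column names and payload shape come straight from the test:

import pandas as pd

# Multi-turn rows: "request" holds the conversation so far, "response" the
# final model reply that the MULTI_TURN_* rubric metrics judge.
multi_turn_df = pd.DataFrame(
    {
        "request": [
            {
                "contents": [
                    {"role": "user", "parts": [{"text": "I need to book a flight to NYC for next Monday."}]},
                    {"role": "model", "parts": [{"text": "I found flight UA100 to NYC for $300. Would you like to book it?"}]},
                    {"role": "user", "parts": [{"text": "Yes, book that. I also need a hotel in NYC."}]},
                ]
            }
        ],
        "response": [
            "I recommend the Central Park Hotel, rated 4.5 stars. Shall I book it for you?"
        ],
    }
)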
