Skip to content

Commit 065b347

Browse files
vertex-sdk-botcopybara-github
authored and committed
feat: Add EvaluationMetric Create, Get and List methods to Vertex SDK GenAI evals
PiperOrigin-RevId: 882635735
1 parent b7738a4 commit 065b347

5 files changed

Lines changed: 1042 additions & 163 deletions

File tree

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
# pylint: disable=protected-access,bad-continuation,missing-function-docstring
16+
17+
from tests.unit.vertexai.genai.replays import pytest_helper
18+
from vertexai._genai import types
19+
20+
_TEST_PROJECT = "977012026409"
21+
_TEST_LOCATION = "us-central1"
22+
23+
def test_create_evaluation_metric(client):
    """Creates a metric resource and verifies the returned object's identity."""
    http_options = client._api_client._http_options
    http_options.api_version = "v1beta1"
    http_options.base_url = (
        "https://us-central1-staging-aiplatform.sandbox.googleapis.com/"
    )
    created = client.evals.create_evaluation_metric(
        display_name="test_metric",
        description="test_description",
        metric=types.RubricMetric.GENERAL_QUALITY,
    )
    assert isinstance(created, types.EvaluationMetric)
    assert created.display_name == "test_metric"
35+
36+
37+
def test_get_evaluation_metric(client):
    """Fetches an existing metric resource by its full resource name."""
    http_options = client._api_client._http_options
    http_options.api_version = "v1beta1"
    http_options.base_url = (
        "https://us-central1-staging-aiplatform.sandbox.googleapis.com/"
    )
    resource_name = (
        "projects/977012026409/locations/us-central1"
        "/evaluationMetrics/6048334299558576128"
    )
    fetched = client.evals.get_evaluation_metric(
        metric_resource_name=resource_name
    )
    assert isinstance(fetched, types.EvaluationMetric)
    assert fetched.display_name == "tone-check-v1"
48+
49+
50+
def test_list_evaluation_metrics(client):
    """Lists metric resources and validates the response wrapper.

    The original assertion ``len(response.evaluation_metrics) >= 0`` was
    vacuous: it is always true whenever it does not raise.  Asserting that
    the field is an actual (possibly empty) list fails loudly if the SDK
    returns None or drops the field.
    """
    http_options = client._api_client._http_options
    http_options.api_version = "v1beta1"
    http_options.base_url = (
        "https://us-central1-staging-aiplatform.sandbox.googleapis.com/"
    )
    response = client.evals.list_evaluation_metrics()
    assert isinstance(response, types.ListEvaluationMetricsResponse)
    # Meaningful replacement for the vacuous `len(...) >= 0` check.
    assert isinstance(response.evaluation_metrics, list)
58+
59+
60+
# The setup function registers the module and method for the recorder.
# NOTE(review): only "evals.create_evaluation_metric" is named here even
# though this module also exercises get/list — presumably the replay
# helper resolves recordings per test function; confirm against
# pytest_helper.setup before relying on it.
pytestmark = pytest_helper.setup(
    file=__file__,
    globals_for_file=globals(),
    test_method="evals.create_evaluation_metric",
)

vertexai/_genai/_transformers.py

Lines changed: 64 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,63 @@
2222
from . import types
2323

2424

25+
def _transform_metric(
26+
metric: "types.MetricSubclass",
27+
set_default_aggregation_metrics: bool = False,
28+
) -> dict[str, Any]:
29+
"""Transforms a single metric to its payload representation."""
30+
metric_payload_item: dict[str, Any] = {}
31+
if hasattr(metric, "metric_resource_name") and metric.metric_resource_name:
32+
metric_payload_item["metric_resource_name"] = metric.metric_resource_name
33+
34+
metric_name = getv(metric, ["name"]).lower()
35+
36+
if set_default_aggregation_metrics:
37+
metric_payload_item["aggregation_metrics"] = [
38+
"AVERAGE",
39+
"STANDARD_DEVIATION",
40+
]
41+
42+
if metric_name == "exact_match":
43+
metric_payload_item["exact_match_spec"] = {}
44+
elif metric_name == "bleu":
45+
metric_payload_item["bleu_spec"] = {}
46+
elif metric_name.startswith("rouge"):
47+
rouge_type = metric_name.replace("_", "")
48+
metric_payload_item["rouge_spec"] = {"rouge_type": rouge_type}
49+
# API Pre-defined metrics
50+
elif metric_name in _evals_constant.SUPPORTED_PREDEFINED_METRICS:
51+
metric_payload_item["predefined_metric_spec"] = {
52+
"metric_spec_name": metric_name,
53+
"metric_spec_parameters": metric.metric_spec_parameters,
54+
}
55+
# Custom Code Execution Metric
56+
elif hasattr(metric, "remote_custom_function") and metric.remote_custom_function:
57+
metric_payload_item["custom_code_execution_spec"] = {
58+
"evaluation_function": metric.remote_custom_function
59+
}
60+
# Pointwise metrics
61+
elif hasattr(metric, "prompt_template") and metric.prompt_template:
62+
pointwise_spec = {"metric_prompt_template": metric.prompt_template}
63+
system_instruction = getv(metric, ["judge_model_system_instruction"])
64+
if system_instruction:
65+
pointwise_spec["system_instruction"] = system_instruction
66+
return_raw_output = getv(metric, ["return_raw_output"])
67+
if return_raw_output:
68+
pointwise_spec["custom_output_format_config"] = {
69+
"return_raw_output": return_raw_output
70+
}
71+
metric_payload_item["pointwise_metric_spec"] = pointwise_spec
72+
elif "metric_resource_name" in metric_payload_item:
73+
# Valid case: Metric is identified by resource name; no inline spec required.
74+
pass
75+
else:
76+
raise ValueError(
77+
f"Unsupported metric type or invalid metric name: {metric_name}"
78+
)
79+
return metric_payload_item
80+
81+
2582
def t_metrics(
2683
metrics: list["types.MetricSubclass"],
2784
set_default_aggregation_metrics: bool = False,
@@ -35,58 +92,13 @@ def t_metrics(
3592
A list of resolved metric payloads for the evaluation request.
3693
"""
3794
metrics_payload = []
38-
3995
for metric in metrics:
40-
metric_payload_item: dict[str, Any] = {}
41-
if hasattr(metric, "metric_resource_name") and metric.metric_resource_name:
42-
metric_payload_item["metric_resource_name"] = metric.metric_resource_name
43-
44-
metric_name = getv(metric, ["name"]).lower()
96+
metrics_payload.append(
97+
_transform_metric(metric, set_default_aggregation_metrics)
98+
)
99+
return metrics_payload
45100

46-
if set_default_aggregation_metrics:
47-
metric_payload_item["aggregation_metrics"] = [
48-
"AVERAGE",
49-
"STANDARD_DEVIATION",
50-
]
51101

52-
if metric_name == "exact_match":
53-
metric_payload_item["exact_match_spec"] = {}
54-
elif metric_name == "bleu":
55-
metric_payload_item["bleu_spec"] = {}
56-
elif metric_name.startswith("rouge"):
57-
rouge_type = metric_name.replace("_", "")
58-
metric_payload_item["rouge_spec"] = {"rouge_type": rouge_type}
59-
# API Pre-defined metrics
60-
elif metric_name in _evals_constant.SUPPORTED_PREDEFINED_METRICS:
61-
metric_payload_item["predefined_metric_spec"] = {
62-
"metric_spec_name": metric_name,
63-
"metric_spec_parameters": metric.metric_spec_parameters,
64-
}
65-
# Custom Code Execution Metric
66-
elif (
67-
hasattr(metric, "remote_custom_function") and metric.remote_custom_function
68-
):
69-
metric_payload_item["custom_code_execution_spec"] = {
70-
"evaluation_function": metric.remote_custom_function
71-
}
72-
# Pointwise metrics
73-
elif hasattr(metric, "prompt_template") and metric.prompt_template:
74-
pointwise_spec = {"metric_prompt_template": metric.prompt_template}
75-
system_instruction = getv(metric, ["judge_model_system_instruction"])
76-
if system_instruction:
77-
pointwise_spec["system_instruction"] = system_instruction
78-
return_raw_output = getv(metric, ["return_raw_output"])
79-
if return_raw_output:
80-
pointwise_spec["custom_output_format_config"] = {
81-
"return_raw_output": return_raw_output
82-
}
83-
metric_payload_item["pointwise_metric_spec"] = pointwise_spec
84-
elif "metric_resource_name" in metric_payload_item:
85-
# Valid case: Metric is identified by resource name; no inline spec required.
86-
pass
87-
else:
88-
raise ValueError(
89-
f"Unsupported metric type or invalid metric name: {metric_name}"
90-
)
91-
metrics_payload.append(metric_payload_item)
92-
return metrics_payload
102+
def t_metric(metric: "types.MetricOrDict") -> dict[str, Any]:
    """Prepares the metric payload for the evaluation metric resource."""
    payload = _transform_metric(metric)
    return payload

0 commit comments

Comments
 (0)