
Commit ba5020d

vertex-sdk-bot authored and copybara-github committed
feat: Add support for metric_resource_name in rubric generation
PiperOrigin-RevId: 880805629
1 parent 1ecaa9b commit ba5020d

6 files changed

Lines changed: 157 additions & 10 deletions
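In short: a metric registered as a Vertex AI EvaluationMetric resource can now drive rubric generation directly. A minimal usage sketch, assuming the `vertexai.Client` / `vertexai.types` entry points of the GenAI SDK; the project ID, metric ID, and client setup are placeholders, not taken from this commit:

    import pandas as pd
    from vertexai import Client, types  # assumed entry points

    client = Client(project="my-project", location="us-central1")  # placeholder setup
    prompts_df = pd.DataFrame({"prompt": ["Explain relativity in one sentence."]})

    # A metric previously registered as an EvaluationMetric resource (placeholder ID).
    metric = types.Metric(
        name="my_custom_metric",
        metric_resource_name="projects/my-project/locations/us-central1/evaluationMetrics/123",
    )

    # The resource name is forwarded to the backend, so no prompt_template
    # or predefined_spec_name is needed.
    eval_dataset = client.evals.generate_rubrics(
        src=prompts_df,
        rubric_group_name="my_registered_rubrics",
        metric=metric,
    )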


tests/unit/vertexai/genai/replays/test_create_evaluation_run.py

Lines changed: 22 additions & 0 deletions
@@ -238,6 +238,28 @@ def test_create_eval_run_with_inference_configs(client):
     assert evaluation_run.error is None


+def test_create_eval_run_with_metric_resource_name(client):
+    """Tests create_evaluation_run with metric_resource_name."""
+    client._api_client._http_options.api_version = "v1beta1"
+    client._api_client._http_options.base_url = (
+        "https://us-central1-autopush-aiplatform.sandbox.googleapis.com/"
+    )
+    metric_resource_name = "projects/977012026409/locations/us-central1/evaluationMetrics/6048334299558576128"
+    metric = types.EvaluationRunMetric(
+        metric="my_custom_metric",
+        metric_resource_name=metric_resource_name,
+    )
+    evaluation_run = client.evals.create_evaluation_run(
+        dataset=types.EvaluationDataset(
+            eval_dataset_df=INPUT_DF_WITH_CONTEXT_AND_HISTORY
+        ),
+        metrics=[metric],
+        dest=GCS_DEST,
+    )
+    assert isinstance(evaluation_run, types.EvaluationRun)
+    assert evaluation_run.evaluation_config.metrics[0].metric == "my_custom_metric"
+
+
 # Dataframe tests fail in replay mode because of UUID generation mismatch.
 # def test_create_eval_run_data_source_evaluation_dataset(client):
 #     """Tests that create_evaluation_run() creates a correctly structured

tests/unit/vertexai/genai/replays/test_public_generate_rubrics.py

Lines changed: 38 additions & 5 deletions
@@ -143,19 +143,21 @@
 User prompt:
 {prompt}"""

-
-def test_public_method_generate_rubrics(client):
-    """Tests the public generate_rubrics method."""
-    prompts_df = pd.DataFrame(
+_PROMPTS_DF = pd.DataFrame(
     {
         "prompt": [
             "Explain the theory of relativity in one sentence.",
             "Write a short poem about a cat.",
         ]
     }
 )
+
+
+def test_public_method_generate_rubrics(client):
+    """Tests the public generate_rubrics method."""
+
     eval_dataset = client.evals.generate_rubrics(
-        src=prompts_df,
+        src=_PROMPTS_DF,
         prompt_template=_TEST_RUBRIC_GENERATION_PROMPT,
         rubric_group_name="text_quality_rubrics",
     )
@@ -176,6 +178,37 @@ def test_public_method_generate_rubrics(client):
     assert isinstance(first_rubric_group["text_quality_rubrics"][0], types.evals.Rubric)


+def test_public_method_generate_rubrics_with_metric(client):
+    """Tests the public generate_rubrics method with a metric."""
+    client._api_client._http_options.api_version = "v1beta1"
+    client._api_client._http_options.base_url = (
+        "https://us-central1-staging-aiplatform.sandbox.googleapis.com/"
+    )
+    metric_resource_name = "projects/977012026409/locations/us-central1/evaluationMetrics/6048334299558576128"
+    metric = types.Metric(
+        name="my_custom_metric",
+        metric_resource_name=metric_resource_name,
+    )
+    eval_dataset = client.evals.generate_rubrics(
+        src=_PROMPTS_DF,
+        rubric_group_name="my_registered_rubrics",
+        metric=metric,
+    )
+    eval_dataset_df = eval_dataset.eval_dataset_df
+
+    assert isinstance(eval_dataset, types.EvaluationDataset)
+    assert isinstance(eval_dataset_df, pd.DataFrame)
+    assert "rubric_groups" in eval_dataset_df.columns
+    assert len(eval_dataset_df) == 2
+
+    first_rubric_group = eval_dataset_df["rubric_groups"][0]
+    assert isinstance(first_rubric_group, dict)
+    assert "my_registered_rubrics" in first_rubric_group
+    assert isinstance(first_rubric_group["my_registered_rubrics"], list)
+    assert first_rubric_group["my_registered_rubrics"]
+    assert isinstance(first_rubric_group["my_registered_rubrics"][0], types.evals.Rubric)
+
+
 pytestmark = pytest_helper.setup(
     file=__file__,
     globals_for_file=globals(),

vertexai/_genai/_evals_common.py

Lines changed: 12 additions & 1 deletion
@@ -45,6 +45,7 @@
 from . import _gcs_utils
 from . import evals
 from . import types
+from . import _transformers as t

 logger = logging.getLogger(__name__)

@@ -1328,7 +1329,7 @@ def _resolve_dataset_inputs(


 def _resolve_evaluation_run_metrics(
-    metrics: list[types.EvaluationRunMetric], api_client: Any
+    metrics: Union[list[types.EvaluationRunMetric], list[types.Metric]], api_client: Any
 ) -> list[types.EvaluationRunMetric]:
     """Resolves a list of evaluation run metric instances, loading RubricMetric if necessary."""
     if not metrics:
@@ -1361,6 +1362,16 @@
                     e,
                 )
                 raise
+        elif isinstance(metric_instance, types.Metric):
+            config_dict = t.t_metrics([metric_instance])[0]
+            res_name = config_dict.pop("metric_resource_name", None)
+            resolved_metrics_list.append(
+                types.EvaluationRunMetric(
+                    metric=metric_instance.name,
+                    metric_config=config_dict if config_dict else None,
+                    metric_resource_name=res_name,
+                )
+            )
         else:
             try:
                 metric_name_str = str(metric_instance)
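A practical consequence of the change above: create_evaluation_run can now be fed plain types.Metric objects, which the new elif branch converts via t_metrics. A hedged sketch of the two equivalent call styles, assuming a configured `client`, a DataFrame `df`, and placeholder resource IDs:

    # Style 1: the explicit EvaluationRunMetric used by the test above.
    run_metric = types.EvaluationRunMetric(
        metric="my_custom_metric",
        metric_resource_name="projects/p/locations/l/evaluationMetrics/m",
    )

    # Style 2: a plain Metric, resolved through the new elif branch; the
    # resource name is popped from the t_metrics payload and any remaining
    # payload keys ride along as metric_config.
    plain_metric = types.Metric(
        name="my_custom_metric",
        metric_resource_name="projects/p/locations/l/evaluationMetrics/m",
    )

    evaluation_run = client.evals.create_evaluation_run(
        dataset=types.EvaluationDataset(eval_dataset_df=df),
        metrics=[plain_metric],  # a list of EvaluationRunMetric works the same way
        dest="gs://my-bucket/eval-output",  # assumed GCS destination
    )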

vertexai/_genai/_transformers.py

Lines changed: 5 additions & 0 deletions
@@ -38,6 +38,8 @@ def t_metrics(

     for metric in metrics:
         metric_payload_item: dict[str, Any] = {}
+        if hasattr(metric, "metric_resource_name") and metric.metric_resource_name:
+            metric_payload_item["metric_resource_name"] = metric.metric_resource_name

         metric_name = getv(metric, ["name"]).lower()

@@ -79,6 +81,9 @@
                 "return_raw_output": return_raw_output
             }
             metric_payload_item["pointwise_metric_spec"] = pointwise_spec
+        elif "metric_resource_name" in metric_payload_item:
+            # Valid case: the metric is identified by resource name; no inline spec is required.
+            pass
         else:
             raise ValueError(
                 f"Unsupported metric type or invalid metric name: {metric_name}"

vertexai/_genai/evals.py

Lines changed: 59 additions & 4 deletions
@@ -392,6 +392,13 @@ def _EvaluationRunMetric_from_vertex(
     if getv(from_object, ["metric"]) is not None:
         setv(to_object, ["metric"], getv(from_object, ["metric"]))

+    if getv(from_object, ["metricResourceName"]) is not None:
+        setv(
+            to_object,
+            ["metric_resource_name"],
+            getv(from_object, ["metricResourceName"]),
+        )
+
     if getv(from_object, ["metricConfig"]) is not None:
         setv(
             to_object,
@@ -410,6 +417,13 @@ def _EvaluationRunMetric_to_vertex(
     if getv(from_object, ["metric"]) is not None:
         setv(to_object, ["metric"], getv(from_object, ["metric"]))

+    if getv(from_object, ["metric_resource_name"]) is not None:
+        setv(
+            to_object,
+            ["metricResourceName"],
+            getv(from_object, ["metric_resource_name"]),
+        )
+
     if getv(from_object, ["metric_config"]) is not None:
         setv(
             to_object,
@@ -512,6 +526,13 @@ def _GenerateInstanceRubricsRequest_to_vertex(
         ),
     )

+    if getv(from_object, ["metric_resource_name"]) is not None:
+        setv(
+            to_object,
+            ["metricResourceName"],
+            getv(from_object, ["metric_resource_name"]),
+        )
+
     if getv(from_object, ["config"]) is not None:
         setv(to_object, ["config"], getv(from_object, ["config"]))

@@ -1049,6 +1070,7 @@ def _generate_rubrics(
             types.PredefinedMetricSpecOrDict
         ] = None,
         rubric_generation_spec: Optional[types.RubricGenerationSpecOrDict] = None,
+        metric_resource_name: Optional[str] = None,
         config: Optional[types.RubricGenerationConfigOrDict] = None,
     ) -> types.GenerateInstanceRubricsResponse:
         """
@@ -1059,6 +1081,7 @@ def _generate_rubrics(
             contents=contents,
             predefined_rubric_generation_spec=predefined_rubric_generation_spec,
             rubric_generation_spec=rubric_generation_spec,
+            metric_resource_name=metric_resource_name,
             config=config,
         )

@@ -1561,16 +1584,20 @@ def generate_rubrics(
         rubric_type_ontology: Optional[list[str]] = None,
         predefined_spec_name: Optional[Union[str, "types.PrebuiltMetric"]] = None,
         metric_spec_parameters: Optional[dict[str, Any]] = None,
+        metric: Optional[types.MetricOrDict] = None,
         config: Optional[types.RubricGenerationConfigOrDict] = None,
     ) -> types.EvaluationDataset:
         """Generates rubrics for each prompt in the source and adds them as a new column
         structured as a dictionary.

         You can generate rubrics by providing either:
-        1. A `predefined_spec_name` to use a Vertex AI backend recipe.
-        2. A `prompt_template` along with other configuration parameters
+        1. A `metric` to use a pre-registered metric resource.
+        2. A `predefined_spec_name` to use a Vertex AI backend recipe.
+        3. A `prompt_template` along with other configuration parameters
            (`generator_model_config`, `rubric_content_type`, `rubric_type_ontology`)
            for custom rubric generation.
+        `metric` takes precedence over `predefined_spec_name`, which in turn
+        takes precedence over `prompt_template`.

         These two modes are mutually exclusive.

@@ -1600,6 +1627,9 @@
             metric_spec_parameters: Optional. Parameters for the Predefined Metric,
               used to customize rubric generation. Only used if `predefined_spec_name` is set.
               Example: {"guidelines": ["The response must be in Japanese."]}
+            metric: Optional. A types.Metric object containing a metric_resource_name,
+              or a resource name string. If provided, this will take precedence over
+              predefined_spec_name and prompt_template.
             config: Optional. Configuration for the rubric generation process.

         Returns:
@@ -1639,10 +1669,32 @@
         )
         all_rubric_groups: list[dict[str, list[types.Rubric]]] = []

+        actual_metric_resource_name = None
+        if metric:
+            if isinstance(metric, str) and metric.startswith("projects/"):
+                actual_metric_resource_name = metric
+            else:
+                metric_obj = (
+                    types.Metric.model_validate(metric)
+                    if isinstance(metric, dict)
+                    else metric
+                )
+                actual_metric_resource_name = getattr(
+                    metric_obj, "metric_resource_name", None
+                )
+                if not actual_metric_resource_name:
+                    raise ValueError(
+                        "The provided Metric object must have metric_resource_name set."
+                    )
+
         rubric_gen_spec = None
         predefined_spec = None

-        if predefined_spec_name:
+        if actual_metric_resource_name:
+            # Precedence: the registered metric resource overrides everything else.
+            predefined_spec = None
+            rubric_gen_spec = None
+        elif predefined_spec_name:
             if prompt_template:
                 logger.warning(
                     "prompt_template is ignored when predefined_spec_name is provided."
@@ -1699,7 +1751,7 @@
             rubric_gen_spec = types.RubricGenerationSpec.model_validate(spec_dict)
         else:
             raise ValueError(
-                "Either predefined_spec_name or prompt_template must be provided."
+                "Either metric, predefined_spec_name or prompt_template must be provided."
             )

         for _, row in prompts_df.iterrows():
@@ -1722,6 +1774,7 @@
                 contents=contents,
                 rubric_generation_spec=rubric_gen_spec,
                 predefined_rubric_generation_spec=predefined_spec,
+                metric_resource_name=actual_metric_resource_name,
                 config=config,
             )
             rubric_group = {rubric_group_name: response.generated_rubrics}
@@ -2307,6 +2360,7 @@ async def _generate_rubrics(
             types.PredefinedMetricSpecOrDict
         ] = None,
         rubric_generation_spec: Optional[types.RubricGenerationSpecOrDict] = None,
+        metric_resource_name: Optional[str] = None,
         config: Optional[types.RubricGenerationConfigOrDict] = None,
     ) -> types.GenerateInstanceRubricsResponse:
         """
@@ -2317,6 +2371,7 @@ async def _generate_rubrics(
             contents=contents,
             predefined_rubric_generation_spec=predefined_rubric_generation_spec,
             rubric_generation_spec=rubric_generation_spec,
+            metric_resource_name=metric_resource_name,
             config=config,
         )

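Taken together, generate_rubrics gains a third input mode with a defined precedence order: metric, then predefined_spec_name, then prompt_template. A hedged sketch of the two accepted forms of the metric argument, reusing the `client`, `prompts_df`, and `metric` placeholders from the earlier sketch:

    # A bare resource-name string is accepted directly:
    eval_dataset = client.evals.generate_rubrics(
        src=prompts_df,
        rubric_group_name="my_registered_rubrics",
        metric="projects/my-project/locations/us-central1/evaluationMetrics/123",
    )

    # When several inputs are supplied, the registered metric wins:
    eval_dataset = client.evals.generate_rubrics(
        src=prompts_df,
        rubric_group_name="my_registered_rubrics",
        metric=metric,                        # used; its resource name is forwarded
        predefined_spec_name="some_recipe",   # ignored (placeholder name)
        prompt_template="Rate: {prompt}",     # ignored
    )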
vertexai/_genai/types/common.py

Lines changed: 21 additions & 0 deletions
@@ -2479,6 +2479,10 @@ class EvaluationRunMetric(_common.BaseModel):
     metric: Optional[str] = Field(
         default=None, description="""The name of the metric."""
     )
+    metric_resource_name: Optional[str] = Field(
+        default=None,
+        description="""The resource name of the metric definition. Example: projects/{project}/locations/{location}/evaluationMetrics/{evaluation_metric_id}""",
+    )
     metric_config: Optional[UnifiedMetric] = Field(
         default=None, description="""The unified metric used for evaluation run."""
     )
@@ -2490,6 +2494,9 @@ class EvaluationRunMetricDict(TypedDict, total=False):
     metric: Optional[str]
     """The name of the metric."""

+    metric_resource_name: Optional[str]
+    """The resource name of the metric definition. Example: projects/{project}/locations/{location}/evaluationMetrics/{evaluation_metric_id}"""
+
     metric_config: Optional[UnifiedMetricDict]
     """The unified metric used for evaluation run."""

@@ -4439,6 +4446,10 @@ class Metric(_common.BaseModel):
         default=None,
         description="""Optional steering instruction parameters for the automated predefined metric.""",
     )
+    metric_resource_name: Optional[str] = Field(
+        default=None,
+        description="""The resource name of the metric definition. Example: projects/{project}/locations/{location}/evaluationMetrics/{evaluation_metric_id}""",
+    )

     # Allow extra fields to support metric-specific config fields.
     model_config = ConfigDict(extra="allow")
@@ -4643,6 +4654,9 @@ class MetricDict(TypedDict, total=False):
     metric_spec_parameters: Optional[dict[str, Any]]
     """Optional steering instruction parameters for the automated predefined metric."""

+    metric_resource_name: Optional[str]
+    """The resource name of the metric definition. Example: projects/{project}/locations/{location}/evaluationMetrics/{evaluation_metric_id}"""
+

 MetricOrDict = Union[Metric, MetricDict]

@@ -5354,6 +5368,10 @@ class _GenerateInstanceRubricsRequest(_common.BaseModel):
         default=None,
         description="""Specification for how the rubrics should be generated.""",
     )
+    metric_resource_name: Optional[str] = Field(
+        default=None,
+        description="""Registered metric resource name. If set, the metric configuration referenced by this resource name is used for rubric generation, and the `predefined_rubric_generation_spec` and `rubric_generation_spec` fields are ignored.""",
+    )
     config: Optional[RubricGenerationConfig] = Field(default=None, description="""""")


@@ -5374,6 +5392,9 @@ class _GenerateInstanceRubricsRequestDict(TypedDict, total=False):
     rubric_generation_spec: Optional[RubricGenerationSpecDict]
     """Specification for how the rubrics should be generated."""

+    metric_resource_name: Optional[str]
+    """Registered metric resource name. If set, the metric configuration referenced by this resource name is used for rubric generation, and the `predefined_rubric_generation_spec` and `rubric_generation_spec` fields are ignored."""
+
     config: Optional[RubricGenerationConfigDict]
     """"""
