Skip to content

Commit 06e328b

Browse files
committed
feat(EvalSchema): updating eval schema
Specifically updating evaluation and evaluationSet schema to adhere to agent definitions.
1 parent 9405619 commit 06e328b

6 files changed

Lines changed: 299 additions & 16 deletions

File tree

src/uipath/_cli/_evals/_models/_evaluation_set.py

Lines changed: 25 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55
from pydantic.alias_generators import to_camel
66

77

8+
class EvaluationSimulationTool(BaseModel):
9+
name: str = Field(..., alias="name")
10+
11+
812
class EvaluationItem(BaseModel):
913
"""Individual evaluation item within an evaluation set."""
1014

@@ -14,15 +18,19 @@ class EvaluationItem(BaseModel):
1418
name: str
1519
inputs: Dict[str, Any]
1620
expected_output: Dict[str, Any]
17-
expected_agent_behavior: str = ""
18-
simulation_instructions: str = ""
19-
simulate_input: bool = False
20-
input_generation_instructions: str = ""
21-
simulate_tools: bool = False
22-
tools_to_simulate: List[str] = Field(default_factory=list)
23-
eval_set_id: str
24-
created_at: str
25-
updated_at: str
21+
expected_agent_behavior: str = Field(default="", alias="expectedAgentBehavior")
22+
simulation_instructions: str = Field(default="", alias="simulationInstructions")
23+
simulate_input: bool = Field(default=False, alias="simulateInput")
24+
input_generation_instructions: str = Field(
25+
default="", alias="inputGenerationInstructions"
26+
)
27+
simulate_tools: bool = Field(default=False, alias="simulateTools")
28+
tools_to_simulate: List[EvaluationSimulationTool] = Field(
29+
default_factory=list, alias="toolsToSimulate"
30+
)
31+
eval_set_id: str = Field(alias="evalSetId")
32+
created_at: str = Field(alias="createdAt")
33+
updated_at: str = Field(alias="updatedAt")
2634

2735

2836
class EvaluationSet(BaseModel):
@@ -31,15 +39,17 @@ class EvaluationSet(BaseModel):
3139
model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
3240

3341
id: str
34-
file_name: str
42+
file_name: str = Field(..., alias="fileName")
3543
evaluator_refs: List[str] = Field(default_factory=list)
3644
evaluations: List[EvaluationItem] = Field(default_factory=list)
3745
name: str
38-
batch_size: int = 10
39-
timeout_minutes: int = 20
40-
model_settings: List[Dict[str, Any]] = Field(default_factory=list)
41-
created_at: str
42-
updated_at: str
46+
batch_size: int = Field(default=10, alias="batchSize")
47+
timeout_minutes: int = Field(default=20, alias="timeoutMinutes")
48+
model_settings: List[Dict[str, Any]] = Field(
49+
default_factory=list, alias="modelSettings"
50+
)
51+
created_at: str = Field(alias="createdAt")
52+
updated_at: str = Field(alias="updatedAt")
4353

4454
def extract_selected_evals(self, eval_ids) -> None:
4555
selected_evals: list[EvaluationItem] = []
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
from typing import Annotated, Any, Union
2+
3+
from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag
4+
5+
from uipath.eval.models.models import EvaluatorCategory, EvaluatorType
6+
7+
8+
class EvaluatorBaseParams(BaseModel):
9+
"""Parameters for initializing the base evaluator."""
10+
11+
id: str
12+
name: str
13+
description: str
14+
category: EvaluatorCategory = Field(..., alias="category")
15+
evaluator_type: EvaluatorType = Field(..., alias="type")
16+
created_at: str = Field(..., alias="createdAt")
17+
updated_at: str = Field(..., alias="updatedAt")
18+
target_output_key: str = Field(..., alias="targetOutputKey")
19+
file_name: str = Field(..., alias="fileName")
20+
21+
22+
class LLMEvaluatorParams(EvaluatorBaseParams):
23+
prompt: str = Field(..., alias="prompt")
24+
model: str = Field(..., alias="model")
25+
26+
model_config = ConfigDict(
27+
validate_by_name=True, validate_by_alias=True, extra="allow"
28+
)
29+
30+
31+
class UnknownEvaluatorParams(EvaluatorBaseParams):
32+
model_config = ConfigDict(
33+
validate_by_name=True, validate_by_alias=True, extra="allow"
34+
)
35+
36+
37+
def evaluator_discriminator(data: Any) -> str:
38+
if isinstance(data, dict):
39+
category = data.get("category")
40+
evaluator_type = data.get("type")
41+
if (
42+
category == EvaluatorCategory.LlmAsAJudge
43+
or evaluator_type == EvaluatorType.Trajectory
44+
):
45+
return "LLMEvaluatorParams"
46+
return "UnknownEvaluatorParams"
47+
48+
49+
Evaluator = Annotated[
50+
Union[
51+
Annotated[
52+
LLMEvaluatorParams,
53+
Tag("LLMEvaluatorParams"),
54+
],
55+
Annotated[
56+
UnknownEvaluatorParams,
57+
Tag("UnknownEvaluatorParams"),
58+
],
59+
],
60+
Field(discriminator=Discriminator(evaluator_discriminator)),
61+
]

src/uipath/_cli/cli_pull.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ async def download_folder_files(
112112
if local_hash != remote_hash:
113113
styled_path = click.style(str(file_path), fg="cyan")
114114
console.warning(f"File {styled_path}" + " differs from remote version.")
115-
response = click.prompt("Do you want to override it? (y/n)", type=str)
115+
response = click.prompt("Do you want to overwrite it? (y/n)", type=str)
116116
if response.lower() == "y":
117117
with open(local_path, "w", encoding="utf-8", newline="\n") as f:
118118
f.write(remote_content)

src/uipath/agent/_utils.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,52 @@ async def load_agent_definition(project_id: str):
3131
await get_file(project_structure, PurePath("agent.json"), studio_client)
3232
).json()
3333

34+
evaluators = []
35+
try:
36+
evaluators_path = resolve_path(
37+
project_structure, PurePath("evals", "evaluators")
38+
)
39+
if isinstance(evaluators_path, ProjectFolder):
40+
for file in evaluators_path.files:
41+
evaluators.append(
42+
(
43+
await get_file(
44+
evaluators_path, PurePath(file.name), studio_client
45+
)
46+
).json()
47+
)
48+
else:
49+
logger.warning(
50+
"Unable to read evaluators from project. Defaulting to empty evaluators."
51+
)
52+
except Exception:
53+
logger.warning(
54+
"Unable to read evaluators from project. Defaulting to empty evaluators."
55+
)
56+
57+
evaluation_sets = []
58+
try:
59+
evaluation_sets_path = resolve_path(
60+
project_structure, PurePath("evals", "eval-sets")
61+
)
62+
if isinstance(evaluation_sets_path, ProjectFolder):
63+
for file in evaluation_sets_path.files:
64+
evaluation_sets.append(
65+
(
66+
await get_file(
67+
evaluation_sets_path, PurePath(file.name), studio_client
68+
)
69+
).json()
70+
)
71+
else:
72+
logger.warning(
73+
"Unable to read eval-sets from project. Defaulting to empty eval-sets."
74+
)
75+
except Exception:
76+
logger.warning(
77+
"Unable to read eval-sets from project. Defaulting to empty eval-sets."
78+
)
79+
3480
resolved_path = resolve_path(project_structure, PurePath("resources"))
3581
if isinstance(resolved_path, ProjectFolder):
3682
resource_folders = resolved_path.folders
@@ -50,6 +96,8 @@ async def load_agent_definition(project_id: str):
5096
"id": project_id,
5197
"name": project_structure.name,
5298
"resources": resources,
99+
"evaluators": evaluators,
100+
"evaluationSets": evaluation_sets,
53101
**agent,
54102
}
55103
return TypeAdapter(AgentDefinition).validate_python(agent_definition)

src/uipath/agent/models/agent.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55

66
from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag
77

8+
from uipath._cli._evals._models._evaluation_set import EvaluationSet
9+
from uipath._cli._evals._models._evaluator import Evaluator
810
from uipath.models import Connection
911

1012

@@ -307,6 +309,12 @@ class BaseAgentDefinition(BaseModel):
307309
resources: List[AgentResourceConfig] = Field(
308310
..., description="List of tools, context, and escalation resources"
309311
)
312+
evaluation_sets: List[EvaluationSet] = Field(
313+
...,
314+
alias="evaluationSets",
315+
description="List of agent evaluation sets",
316+
)
317+
evaluators: List[Evaluator] = Field(..., description="List of agent evaluators")
310318

311319
model_config = ConfigDict(
312320
validate_by_name=True, validate_by_alias=True, extra="allow"

tests/agent/models/test_agent.py

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,158 @@ def test_agent_config_loads_complete_json(self):
262262
"model": "gpt-4o-2024-11-20",
263263
"temperature": 0,
264264
},
265+
"evaluators": [
266+
{
267+
"fileName": "evaluator-default.json",
268+
"id": "c395579a-4e15-425e-b400-a630a63a6237",
269+
"name": "Default Evaluator",
270+
"description": "An evaluator that uses a LLM to score the similarity of the actual output to the expected output",
271+
"category": 1,
272+
"type": 5,
273+
"prompt": "As an expert evaluator, analyze the semantic similarity of these JSON contents to determine a score from 0-100. Focus on comparing the meaning and contextual equivalence of corresponding fields, accounting for alternative valid expressions, synonyms, and reasonable variations in language while maintaining high standards for accuracy and completeness. Provide your score with a justification, explaining briefly and concisely why you gave that score.\n----\nExpectedOutput:\n{{ExpectedOutput}}\n----\nActualOutput:\n{{ActualOutput}}\n",
274+
"model": "same-as-agent",
275+
"targetOutputKey": "*",
276+
"createdAt": "2025-06-09T18:20:06.080Z",
277+
"updatedAt": "2025-06-09T18:20:06.080Z",
278+
},
279+
{
280+
"fileName": "evaluator-default-simulation.json",
281+
"id": "3c82e1a2-0112-4e3b-ba45-25f379298faa",
282+
"name": "Default Simulation Evaluator",
283+
"description": "An evaluator that uses a LLM to score the similarity of the actual output to the expected output",
284+
"category": 3,
285+
"type": 7,
286+
"prompt": "As an expert evaluator, determine how well the agent did on a scale of 0-100. Focus on if the simulation was successful and if the agent behaved according to the expected output accounting for alternative valid expressions, and reasonable variations in language while maintaining high standards for accuracy and completeness. Provide your score with a justification, explaining briefly and concisely why you gave that score.\n----\nUserOrSyntheticInputGivenToAgent:\n{{UserOrSyntheticInput}}\n----\nSimulationInstructions:\n{{SimulationInstructions}}\n----\nExpectedAgentBehavior:\n{{ExpectedAgentBehavior}}\n----\nAgentRunHistory:\n{{AgentRunHistory}}\n",
287+
"model": "same-as-agent",
288+
"targetOutputKey": "*",
289+
"createdAt": "2025-06-09T18:20:06.335Z",
290+
"updatedAt": "2025-06-09T18:20:06.335Z",
291+
},
292+
{
293+
"fileName": "evaluator-default-trajectory.json",
294+
"id": "a544a330-5e6b-4dca-a4e5-ea5fd024779b",
295+
"name": "Default Trajectory Evaluator",
296+
"description": "An evaluator that judges the agent based on its run history and expected behavior",
297+
"category": 3,
298+
"type": 7,
299+
"prompt": "As an expert evaluator, determine how well the agent did on a scale of 0-100. Focus on if the simulation was successful and if the agent behaved according to the expected output accounting for alternative valid expressions, and reasonable variations in language while maintaining high standards for accuracy and completeness. Provide your score with a justification, explaining briefly and concisely why you gave that score.\n----\nUserOrSyntheticInputGivenToAgent:\n{{UserOrSyntheticInput}}\n----\nSimulationInstructions:\n{{SimulationInstructions}}\n----\nExpectedAgentBehavior:\n{{ExpectedAgentBehavior}}\n----\nAgentRunHistory:\n{{AgentRunHistory}}\n",
300+
"model": "same-as-agent",
301+
"targetOutputKey": "*",
302+
"createdAt": "2025-06-26T17:45:39.651Z",
303+
"updatedAt": "2025-06-26T17:45:39.651Z",
304+
},
305+
],
306+
"evaluationSets": [
307+
{
308+
"fileName": "evaluation-set-1757012098378.json",
309+
"id": "d649e632-1582-4cb1-9e68-c3aff46c2802",
310+
"name": "Loan Agent Evaluation Set",
311+
"batchSize": 10,
312+
"evaluatorRefs": [
313+
"c395579a-4e15-425e-b400-a630a63a6237",
314+
"a544a330-5e6b-4dca-a4e5-ea5fd024779b",
315+
],
316+
"evaluations": [
317+
{
318+
"id": "7309b5dc-46c5-46cb-b6cb-dbb5d9ff5ccf",
319+
"name": "Low Credit Score Rejection",
320+
"inputs": {},
321+
"expectedOutput": {"content": '"rejected"'},
322+
"simulationInstructions": "The A2ALoanCreditRatingTool should return a credit rating of 650.",
323+
"expectedAgentBehavior": "The agent should reject the loan application due to the credit rating being below 700.",
324+
"simulateInput": True,
325+
"inputGenerationInstructions": "Generate a loan application query with name, loan amount, and loan type (mortgage or personal loan).",
326+
"simulateTools": True,
327+
"toolsToSimulate": [
328+
{"name": "A2ALoanCreditRatingTool"},
329+
{"name": "escalate_escalation_1"},
330+
],
331+
"evalSetId": "7e4a91a3-e387-47c6-b4e2-75cd503a77d3",
332+
"createdAt": "2025-09-04T18:54:58.378Z",
333+
"updatedAt": "2025-09-04T18:55:55.416Z",
334+
},
335+
{
336+
"id": "f8e31cc4-1e70-4043-80df-eac1439f6120",
337+
"name": "High Credit Score Small Loan Approval",
338+
"inputs": {},
339+
"expectedOutput": {},
340+
"simulationInstructions": "The A2ALoanCreditRatingTool should return a credit rating of 850.",
341+
"expectedAgentBehavior": "The agent should approve the loan application due to the credit rating being above 800 and the loan amount being less than $10,000.",
342+
"simulateInput": True,
343+
"inputGenerationInstructions": "Generate a loan application query with name, loan amount under $10,000, and loan type (mortgage or personal loan).",
344+
"simulateTools": True,
345+
"toolsToSimulate": [
346+
{"name": "A2ALoanCreditRatingTool"},
347+
{"name": "escalate_escalation_1"},
348+
],
349+
"evalSetId": "7e4a91a3-e387-47c6-b4e2-75cd503a77d3",
350+
"createdAt": "2025-09-04T18:54:58.378Z",
351+
"updatedAt": "2025-09-04T18:54:58.378Z",
352+
},
353+
{
354+
"id": "73a5dc37-9147-4184-9427-dd7306ed8e71",
355+
"name": "Manual Review Escalation",
356+
"inputs": {},
357+
"expectedOutput": {},
358+
"simulationInstructions": "The A2ALoanCreditRatingTool should return a credit rating of 750.",
359+
"expectedAgentBehavior": "The agent should escalate the application for manual review as the credit rating is between 700 and 800.",
360+
"simulateInput": True,
361+
"inputGenerationInstructions": "Generate a loan application query with name, loan amount over $10,000, and loan type (mortgage or personal loan).",
362+
"simulateTools": True,
363+
"toolsToSimulate": [
364+
{"name": "A2ALoanCreditRatingTool"},
365+
{"name": "escalate_escalation_1"},
366+
],
367+
"evalSetId": "7e4a91a3-e387-47c6-b4e2-75cd503a77d3",
368+
"createdAt": "2025-09-04T18:54:58.378Z",
369+
"updatedAt": "2025-09-04T18:54:58.378Z",
370+
},
371+
{
372+
"id": "5c8f2030-0129-478f-8c56-140c287f22ab",
373+
"name": "Incomplete Application",
374+
"inputs": {},
375+
"expectedOutput": {},
376+
"simulationInstructions": "No tool calls should be made.",
377+
"expectedAgentBehavior": "The agent should inform the user that all mandatory details (name, loan amount, and loan type) are required to process the application.",
378+
"simulateInput": True,
379+
"inputGenerationInstructions": "Generate a loan application query missing one of the mandatory details (name, loan amount, or loan type).",
380+
"simulateTools": True,
381+
"toolsToSimulate": [
382+
{"name": "A2ALoanCreditRatingTool"},
383+
{"name": "escalate_escalation_1"},
384+
],
385+
"evalSetId": "7e4a91a3-e387-47c6-b4e2-75cd503a77d3",
386+
"createdAt": "2025-09-04T18:54:58.378Z",
387+
"updatedAt": "2025-09-04T18:54:58.378Z",
388+
},
389+
],
390+
"modelSettings": [],
391+
"createdAt": "2025-09-04T18:54:58.379Z",
392+
"updatedAt": "2025-09-04T18:55:55.416Z",
393+
},
394+
{
395+
"fileName": "evaluation-set-default.json",
396+
"id": "aee3efd3-252a-439b-baf7-565cef3d0ef4",
397+
"name": "Default Evaluation Set",
398+
"batchSize": 10,
399+
"evaluatorRefs": ["c395579a-4e15-425e-b400-a630a63a6237"],
400+
"evaluations": [],
401+
"modelSettings": [],
402+
"createdAt": "2025-06-09T18:20:06.644Z",
403+
"updatedAt": "2025-06-09T18:20:06.644Z",
404+
},
405+
{
406+
"fileName": "evaluation-set-simulation-default-simulation_set.json",
407+
"id": "f52b67e1-6fe5-4cb6-966a-082f2ccbf0ae",
408+
"name": "Default Simulation Evaluation Set",
409+
"batchSize": 10,
410+
"evaluatorRefs": ["3c82e1a2-0112-4e3b-ba45-25f379298faa"],
411+
"evaluations": [],
412+
"modelSettings": [],
413+
"createdAt": "2025-06-09T18:20:07.045Z",
414+
"updatedAt": "2025-06-09T18:20:07.045Z",
415+
},
416+
],
265417
"version": "1.0.0",
266418
}
267419

@@ -351,6 +503,8 @@ def test_agent_config_loads_unknown_resource_json(self):
351503
"extraField": {"foo": "bar"},
352504
}
353505
],
506+
"evaluators": [],
507+
"evaluationSets": [],
354508
}
355509

356510
config: AgentDefinition = TypeAdapter(AgentDefinition).validate_python(
@@ -398,6 +552,8 @@ def test_agent_config_loads_unknown_agent_type(self):
398552
},
399553
},
400554
"resources": [],
555+
"evaluators": [],
556+
"evaluationSets": [],
401557
}
402558

403559
config: AgentDefinition = TypeAdapter(AgentDefinition).validate_python(

0 commit comments

Comments
 (0)