Skip to content

Commit 06e328b

Browse files
committed
feat(EvalSchema): updating eval schema
Specifically updating evaluation and evaluationSet schema to adhere to agent definitions.
1 parent 9405619 commit 06e328b

6 files changed

Lines changed: 299 additions & 16 deletions

File tree

src/uipath/_cli/_evals/_models/_evaluation_set.py

Lines changed: 25 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55
from pydantic.alias_generators import to_camel
66

77

8+
class EvaluationSimulationTool(BaseModel):
9+
name: str = Field(..., alias="name")
10+
11+
812
class EvaluationItem(BaseModel):
913
"""Individual evaluation item within an evaluation set."""
1014

@@ -14,15 +18,19 @@ class EvaluationItem(BaseModel):
1418
name: str
1519
inputs: Dict[str, Any]
1620
expected_output: Dict[str, Any]
17-
expected_agent_behavior: str = ""
18-
simulation_instructions: str = ""
19-
simulate_input: bool = False
20-
input_generation_instructions: str = ""
21-
simulate_tools: bool = False
22-
tools_to_simulate: List[str] = Field(default_factory=list)
23-
eval_set_id: str
24-
created_at: str
25-
updated_at: str
21+
expected_agent_behavior: str = Field(default="", alias="expectedAgentBehavior")
22+
simulation_instructions: str = Field(default="", alias="simulationInstructions")
23+
simulate_input: bool = Field(default=False, alias="simulateInput")
24+
input_generation_instructions: str = Field(
25+
default="", alias="inputGenerationInstructions"
26+
)
27+
simulate_tools: bool = Field(default=False, alias="simulateTools")
28+
tools_to_simulate: List[EvaluationSimulationTool] = Field(
29+
default_factory=list, alias="toolsToSimulate"
30+
)
31+
eval_set_id: str = Field(alias="evalSetId")
32+
created_at: str = Field(alias="createdAt")
33+
updated_at: str = Field(alias="updatedAt")
2634

2735

2836
class EvaluationSet(BaseModel):
@@ -31,15 +39,17 @@ class EvaluationSet(BaseModel):
3139
model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
3240

3341
id: str
34-
file_name: str
42+
file_name: str = Field(..., alias="fileName")
3543
evaluator_refs: List[str] = Field(default_factory=list)
3644
evaluations: List[EvaluationItem] = Field(default_factory=list)
3745
name: str
38-
batch_size: int = 10
39-
timeout_minutes: int = 20
40-
model_settings: List[Dict[str, Any]] = Field(default_factory=list)
41-
created_at: str
42-
updated_at: str
46+
batch_size: int = Field(default=10, alias="batchSize")
47+
timeout_minutes: int = Field(default=20, alias="timeoutMinutes")
48+
model_settings: List[Dict[str, Any]] = Field(
49+
default_factory=list, alias="modelSettings"
50+
)
51+
created_at: str = Field(alias="createdAt")
52+
updated_at: str = Field(alias="updatedAt")
4353

4454
def extract_selected_evals(self, eval_ids) -> None:
4555
selected_evals: list[EvaluationItem] = []
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
from typing import Annotated, Any, Union
2+
3+
from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag
4+
5+
from uipath.eval.models.models import EvaluatorCategory, EvaluatorType
6+
7+
8+
class EvaluatorBaseParams(BaseModel):
9+
"""Parameters for initializing the base evaluator."""
10+
11+
id: str
12+
name: str
13+
description: str
14+
category: EvaluatorCategory = Field(..., alias="category")
15+
evaluator_type: EvaluatorType = Field(..., alias="type")
16+
created_at: str = Field(..., alias="createdAt")
17+
updated_at: str = Field(..., alias="updatedAt")
18+
target_output_key: str = Field(..., alias="targetOutputKey")
19+
file_name: str = Field(..., alias="fileName")
20+
21+
22+
class LLMEvaluatorParams(EvaluatorBaseParams):
23+
prompt: str = Field(..., alias="prompt")
24+
model: str = Field(..., alias="model")
25+
26+
model_config = ConfigDict(
27+
validate_by_name=True, validate_by_alias=True, extra="allow"
28+
)
29+
30+
31+
class UnknownEvaluatorParams(EvaluatorBaseParams):
32+
model_config = ConfigDict(
33+
validate_by_name=True, validate_by_alias=True, extra="allow"
34+
)
35+
36+
37+
def evaluator_discriminator(data: Any) -> str:
38+
if isinstance(data, dict):
39+
category = data.get("category")
40+
evaluator_type = data.get("type")
41+
if (
42+
category == EvaluatorCategory.LlmAsAJudge
43+
or evaluator_type == EvaluatorType.Trajectory
44+
):
45+
return "LLMEvaluatorParams"
46+
return "UnknownEvaluatorParams"
47+
48+
49+
Evaluator = Annotated[
50+
Union[
51+
Annotated[
52+
LLMEvaluatorParams,
53+
Tag("LLMEvaluatorParams"),
54+
],
55+
Annotated[
56+
UnknownEvaluatorParams,
57+
Tag("UnknownEvaluatorParams"),
58+
],
59+
],
60+
Field(discriminator=Discriminator(evaluator_discriminator)),
61+
]

src/uipath/_cli/cli_pull.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ async def download_folder_files(
112112
if local_hash != remote_hash:
113113
styled_path = click.style(str(file_path), fg="cyan")
114114
console.warning(f"File {styled_path}" + " differs from remote version.")
115-
response = click.prompt("Do you want to override it? (y/n)", type=str)
115+
response = click.prompt("Do you want to overwrite it? (y/n)", type=str)
116116
if response.lower() == "y":
117117
with open(local_path, "w", encoding="utf-8", newline="\n") as f:
118118
f.write(remote_content)

src/uipath/agent/_utils.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,52 @@ async def load_agent_definition(project_id: str):
3131
await get_file(project_structure, PurePath("agent.json"), studio_client)
3232
).json()
3333

34+
evaluators = []
35+
try:
36+
evaluators_path = resolve_path(
37+
project_structure, PurePath("evals", "evaluators")
38+
)
39+
if isinstance(evaluators_path, ProjectFolder):
40+
for file in evaluators_path.files:
41+
evaluators.append(
42+
(
43+
await get_file(
44+
evaluators_path, PurePath(file.name), studio_client
45+
)
46+
).json()
47+
)
48+
else:
49+
logger.warning(
50+
"Unable to read evaluators from project. Defaulting to empty evaluators."
51+
)
52+
except Exception:
53+
logger.warning(
54+
"Unable to read evaluators from project. Defaulting to empty evaluators."
55+
)
56+
57+
evaluation_sets = []
58+
try:
59+
evaluation_sets_path = resolve_path(
60+
project_structure, PurePath("evals", "eval-sets")
61+
)
62+
if isinstance(evaluation_sets_path, ProjectFolder):
63+
for file in evaluation_sets_path.files:
64+
evaluation_sets.append(
65+
(
66+
await get_file(
67+
evaluation_sets_path, PurePath(file.name), studio_client
68+
)
69+
).json()
70+
)
71+
else:
72+
logger.warning(
73+
"Unable to read eval-sets from project. Defaulting to empty eval-sets."
74+
)
75+
except Exception:
76+
logger.warning(
77+
"Unable to read eval-sets from project. Defaulting to empty eval-sets."
78+
)
79+
3480
resolved_path = resolve_path(project_structure, PurePath("resources"))
3581
if isinstance(resolved_path, ProjectFolder):
3682
resource_folders = resolved_path.folders
@@ -50,6 +96,8 @@ async def load_agent_definition(project_id: str):
5096
"id": project_id,
5197
"name": project_structure.name,
5298
"resources": resources,
99+
"evaluators": evaluators,
100+
"evaluationSets": evaluation_sets,
53101
**agent,
54102
}
55103
return TypeAdapter(AgentDefinition).validate_python(agent_definition)

src/uipath/agent/models/agent.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55

66
from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag
77

8+
from uipath._cli._evals._models._evaluation_set import EvaluationSet
9+
from uipath._cli._evals._models._evaluator import Evaluator
810
from uipath.models import Connection
911

1012

@@ -307,6 +309,12 @@ class BaseAgentDefinition(BaseModel):
307309
resources: List[AgentResourceConfig] = Field(
308310
..., description="List of tools, context, and escalation resources"
309311
)
312+
evaluation_sets: List[EvaluationSet] = Field(
313+
...,
314+
alias="evaluationSets",
315+
description="List of agent evaluation sets",
316+
)
317+
evaluators: List[Evaluator] = Field(..., description="List of agent evaluators")
310318

311319
model_config = ConfigDict(
312320
validate_by_name=True, validate_by_alias=True, extra="allow"

tests/agent/models/test_agent.py

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,158 @@ def test_agent_config_loads_complete_json(self):
262262
"model": "gpt-4o-2024-11-20",
263263
"temperature": 0,
264264
},
265+
"evaluators": [
266+
{
267+
"fileName": "evaluator-default.json",
268+
"id": "c395579a-4e15-425e-b400-a630a63a6237",
269+
"name": "Default Evaluator",
270+
"description": "An evaluator that uses a LLM to score the similarity of the actual output to the expected output",
271+
"category": 1,
272+
"type": 5,
273+
"prompt": "As an expert evaluator, analyze the semantic similarity of these JSON contents to determine a score from 0-100. Focus on comparing the meaning and contextual equivalence of corresponding fields, accounting for alternative valid expressions, synonyms, and reasonable variations in language while maintaining high standards for accuracy and completeness. Provide your score with a justification, explaining briefly and concisely why you gave that score.\n----\nExpectedOutput:\n{{ExpectedOutput}}\n----\nActualOutput:\n{{ActualOutput}}\n",
274+
"model": "same-as-agent",
275+
"targetOutputKey": "*",
276+
"createdAt": "2025-06-09T18:20:06.080Z",
277+
"updatedAt": "2025-06-09T18:20:06.080Z",
278+
},
279+
{
280+
"fileName": "evaluator-default-simulation.json",
281+
"id": "3c82e1a2-0112-4e3b-ba45-25f379298faa",
282+
"name": "Default Simulation Evaluator",
283+
"description": "An evaluator that uses a LLM to score the similarity of the actual output to the expected output",
284+
"category": 3,
285+
"type": 7,
286+
"prompt": "As an expert evaluator, determine how well the agent did on a scale of 0-100. Focus on if the simulation was successful and if the agent behaved according to the expected output accounting for alternative valid expressions, and reasonable variations in language while maintaining high standards for accuracy and completeness. Provide your score with a justification, explaining briefly and concisely why you gave that score.\n----\nUserOrSyntheticInputGivenToAgent:\n{{UserOrSyntheticInput}}\n----\nSimulationInstructions:\n{{SimulationInstructions}}\n----\nExpectedAgentBehavior:\n{{ExpectedAgentBehavior}}\n----\nAgentRunHistory:\n{{AgentRunHistory}}\n",
287+
"model": "same-as-agent",
288+
"targetOutputKey": "*",
289+
"createdAt": "2025-06-09T18:20:06.335Z",
290+
"updatedAt": "2025-06-09T18:20:06.335Z",
291+
},
292+
{
293+
"fileName": "evaluator-default-trajectory.json",
294+
"id": "a544a330-5e6b-4dca-a4e5-ea5fd024779b",
295+
"name": "Default Trajectory Evaluator",
296+
"description": "An evaluator that judges the agent based on its run history and expected behavior",
297+
"category": 3,
298+
"type": 7,
299+
"prompt": "As an expert evaluator, determine how well the agent did on a scale of 0-100. Focus on if the simulation was successful and if the agent behaved according to the expected output accounting for alternative valid expressions, and reasonable variations in language while maintaining high standards for accuracy and completeness. Provide your score with a justification, explaining briefly and concisely why you gave that score.\n----\nUserOrSyntheticInputGivenToAgent:\n{{UserOrSyntheticInput}}\n----\nSimulationInstructions:\n{{SimulationInstructions}}\n----\nExpectedAgentBehavior:\n{{ExpectedAgentBehavior}}\n----\nAgentRunHistory:\n{{AgentRunHistory}}\n",
300+
"model": "same-as-agent",
301+
"targetOutputKey": "*",
302+
"createdAt": "2025-06-26T17:45:39.651Z",
303+
"updatedAt": "2025-06-26T17:45:39.651Z",
304+
},
305+
],
306+
"evaluationSets": [
307+
{
308+
"fileName": "evaluation-set-1757012098378.json",
309+
"id": "d649e632-1582-4cb1-9e68-c3aff46c2802",
310+
"name": "Loan Agent Evaluation Set",
311+
"batchSize": 10,
312+
"evaluatorRefs": [
313+
"c395579a-4e15-425e-b400-a630a63a6237",
314+
"a544a330-5e6b-4dca-a4e5-ea5fd024779b",
315+
],
316+
"evaluations": [
317+
{
318+
"id": "7309b5dc-46c5-46cb-b6cb-dbb5d9ff5ccf",
319+
"name": "Low Credit Score Rejection",
320+
"inputs": {},
321+
"expectedOutput": {"content": '"rejected"'},
322+
"simulationInstructions": "The A2ALoanCreditRatingTool should return a credit rating of 650.",
323+
"expectedAgentBehavior": "The agent should reject the loan application due to the credit rating being below 700.",
324+
"simulateInput": True,
325+
"inputGenerationInstructions": "Generate a loan application query with name, loan amount, and loan type (mortgage or personal loan).",
326+
"simulateTools": True,
327+
"toolsToSimulate": [
328+
{"name": "A2ALoanCreditRatingTool"},
329+
{"name": "escalate_escalation_1"},
330+
],
331+
"evalSetId": "7e4a91a3-e387-47c6-b4e2-75cd503a77d3",
332+
"createdAt": "2025-09-04T18:54:58.378Z",
333+
"updatedAt": "2025-09-04T18:55:55.416Z",
334+
},
335+
{
336+
"id": "f8e31cc4-1e70-4043-80df-eac1439f6120",
337+
"name": "High Credit Score Small Loan Approval",
338+
"inputs": {},
339+
"expectedOutput": {},
340+
"simulationInstructions": "The A2ALoanCreditRatingTool should return a credit rating of 850.",
341+
"expectedAgentBehavior": "The agent should approve the loan application due to the credit rating being above 800 and the loan amount being less than $10,000.",
342+
"simulateInput": True,
343+
"inputGenerationInstructions": "Generate a loan application query with name, loan amount under $10,000, and loan type (mortgage or personal loan).",
344+
"simulateTools": True,
345+
"toolsToSimulate": [
346+
{"name": "A2ALoanCreditRatingTool"},
347+
{"name": "escalate_escalation_1"},
348+
],
349+
"evalSetId": "7e4a91a3-e387-47c6-b4e2-75cd503a77d3",
350+
"createdAt": "2025-09-04T18:54:58.378Z",
351+
"updatedAt": "2025-09-04T18:54:58.378Z",
352+
},
353+
{
354+
"id": "73a5dc37-9147-4184-9427-dd7306ed8e71",
355+
"name": "Manual Review Escalation",
356+
"inputs": {},
357+
"expectedOutput": {},
358+
"simulationInstructions": "The A2ALoanCreditRatingTool should return a credit rating of 750.",
359+
"expectedAgentBehavior": "The agent should escalate the application for manual review as the credit rating is between 700 and 800.",
360+
"simulateInput": True,
361+
"inputGenerationInstructions": "Generate a loan application query with name, loan amount over $10,000, and loan type (mortgage or personal loan).",
362+
"simulateTools": True,
363+
"toolsToSimulate": [
364+
{"name": "A2ALoanCreditRatingTool"},
365+
{"name": "escalate_escalation_1"},
366+
],
367+
"evalSetId": "7e4a91a3-e387-47c6-b4e2-75cd503a77d3",
368+
"createdAt": "2025-09-04T18:54:58.378Z",
369+
"updatedAt": "2025-09-04T18:54:58.378Z",
370+
},
371+
{
372+
"id": "5c8f2030-0129-478f-8c56-140c287f22ab",
373+
"name": "Incomplete Application",
374+
"inputs": {},
375+
"expectedOutput": {},
376+
"simulationInstructions": "No tool calls should be made.",
377+
"expectedAgentBehavior": "The agent should inform the user that all mandatory details (name, loan amount, and loan type) are required to process the application.",
378+
"simulateInput": True,
379+
"inputGenerationInstructions": "Generate a loan application query missing one of the mandatory details (name, loan amount, or loan type).",
380+
"simulateTools": True,
381+
"toolsToSimulate": [
382+
{"name": "A2ALoanCreditRatingTool"},
383+
{"name": "escalate_escalation_1"},
384+
],
385+
"evalSetId": "7e4a91a3-e387-47c6-b4e2-75cd503a77d3",
386+
"createdAt": "2025-09-04T18:54:58.378Z",
387+
"updatedAt": "2025-09-04T18:54:58.378Z",
388+
},
389+
],
390+
"modelSettings": [],
391+
"createdAt": "2025-09-04T18:54:58.379Z",
392+
"updatedAt": "2025-09-04T18:55:55.416Z",
393+
},
394+
{
395+
"fileName": "evaluation-set-default.json",
396+
"id": "aee3efd3-252a-439b-baf7-565cef3d0ef4",
397+
"name": "Default Evaluation Set",
398+
"batchSize": 10,
399+
"evaluatorRefs": ["c395579a-4e15-425e-b400-a630a63a6237"],
400+
"evaluations": [],
401+
"modelSettings": [],
402+
"createdAt": "2025-06-09T18:20:06.644Z",
403+
"updatedAt": "2025-06-09T18:20:06.644Z",
404+
},
405+
{
406+
"fileName": "evaluation-set-simulation-default-simulation_set.json",
407+
"id": "f52b67e1-6fe5-4cb6-966a-082f2ccbf0ae",
408+
"name": "Default Simulation Evaluation Set",
409+
"batchSize": 10,
410+
"evaluatorRefs": ["3c82e1a2-0112-4e3b-ba45-25f379298faa"],
411+
"evaluations": [],
412+
"modelSettings": [],
413+
"createdAt": "2025-06-09T18:20:07.045Z",
414+
"updatedAt": "2025-06-09T18:20:07.045Z",
415+
},
416+
],
265417
"version": "1.0.0",
266418
}
267419

@@ -351,6 +503,8 @@ def test_agent_config_loads_unknown_resource_json(self):
351503
"extraField": {"foo": "bar"},
352504
}
353505
],
506+
"evaluators": [],
507+
"evaluationSets": [],
354508
}
355509

356510
config: AgentDefinition = TypeAdapter(AgentDefinition).validate_python(
@@ -398,6 +552,8 @@ def test_agent_config_loads_unknown_agent_type(self):
398552
},
399553
},
400554
"resources": [],
555+
"evaluators": [],
556+
"evaluationSets": [],
401557
}
402558

403559
config: AgentDefinition = TypeAdapter(AgentDefinition).validate_python(

0 commit comments

Comments
 (0)