
Commit 9803899

add evaluator_name and datapoint_id identifiers to EvaluationResult + initial version for results aggregation
1 parent 0312217 commit 9803899

12 files changed

Lines changed: 558 additions & 5 deletions

src/uipath/_cli/_evals/_models/_output.py

Lines changed: 4 additions & 0 deletions
@@ -24,6 +24,8 @@ class EvaluationResultDto(BaseModel):
     score: float
     details: Optional[str | BaseModel] = None
     evaluation_time: Optional[float] = None
+    evaluator_name: Optional[str] = None
+    datapoint_id: Optional[str] = None

     @model_serializer(mode="wrap")
     def serialize_model(self, serializer, info):
@@ -49,6 +51,8 @@ def from_evaluation_result(
             score=score,
             details=evaluation_result.details,
             evaluation_time=evaluation_result.evaluation_time,
+            evaluator_name=evaluation_result.evaluator_name,
+            datapoint_id=evaluation_result.datapoint_id,
         )
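Both new fields are optional and default to None, so result payloads produced before this commit still validate. A minimal stand-in sketch of the new shape (assuming pydantic v2, consistent with the model_serializer usage above; the DTO's custom wrap-mode serializer is omitted):

from typing import Optional

from pydantic import BaseModel


class ResultDtoSketch(BaseModel):
    # Stand-in mirroring the new EvaluationResultDto identifier fields
    score: float
    evaluator_name: Optional[str] = None
    datapoint_id: Optional[str] = None


print(ResultDtoSketch(score=0.75).model_dump())
# {'score': 0.75, 'evaluator_name': None, 'datapoint_id': None}
print(ResultDtoSketch(score=0.75, evaluator_name="exact_match", datapoint_id="d1_ab12cd34").model_dump())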

src/uipath/eval/_helpers/coded_evaluators_helpers.py

Lines changed: 137 additions & 1 deletion
@@ -1,12 +1,20 @@
 import ast
+import hashlib
 import json
+from collections import defaultdict
 from collections.abc import Mapping, Sequence
 from datetime import datetime
 from typing import Any

 from opentelemetry.sdk.trace import ReadableSpan

-from ..models import ToolCall, ToolOutput
+from ..models import (
+    AgentExecution,
+    EvaluationResult,
+    NumericEvaluationResult,
+    ToolCall,
+    ToolOutput,
+)

 COMPARATOR_MAPPINGS = {
     ">": "gt",
@@ -21,6 +29,48 @@
 COMMUNITY_agents_SUFFIX = "-community-agents"


+def generate_datapoint_id(agent_execution: AgentExecution) -> str:
+    """Generate a collision-safe but readable datapoint ID from agent_input.
+
+    Creates a short, readable ID that includes meaningful content from the input
+    plus a hash suffix for collision safety.
+
+    Args:
+        agent_execution: The agent execution containing agent_input
+
+    Returns:
+        String datapoint ID in format: "readable_part_HASH"
+    """
+    if not agent_execution.agent_input:
+        # Handle empty input case
+        raw_input = "empty_input"
+    else:
+        # Convert agent_input to JSON string for hashing
+        raw_input = json.dumps(
+            agent_execution.agent_input, sort_keys=True, separators=(",", ":")
+        )
+
+    # Create readable part from input (first 30 chars, alphanumeric only)
+    readable_part = ""
+    if isinstance(agent_execution.agent_input, dict):
+        # Try to extract meaningful text from common fields
+        for key in ["query", "question", "input", "prompt", "text", "message"]:
+            if key in agent_execution.agent_input and agent_execution.agent_input[key]:
+                text = str(agent_execution.agent_input[key])
+                readable_part = "".join(c for c in text if c.isalnum() or c in " _-")
+                readable_part = readable_part.replace(" ", "_").lower()[:30]
+                break
+
+    # If no readable part found, use "input" prefix
+    if not readable_part:
+        readable_part = "input"
+
+    # Generate 8-character hash for collision safety
+    hash_part = hashlib.md5(raw_input.encode("utf-8")).hexdigest()[:8]
+
+    return f"{readable_part}_{hash_part}"
+
+
 def extract_tool_calls_names(spans: Sequence[ReadableSpan]) -> list[str]:
     """Extract the tool call names from execution spans IN ORDER.
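The ID scheme above can be exercised standalone; a quick sketch with a hypothetical agent_input dict (the AgentExecution wrapper is skipped, since only agent_input feeds the ID):

import hashlib
import json

agent_input = {"query": "What is the refund policy?"}  # hypothetical datapoint

# Canonical JSON (sorted keys, compact separators) so key order never changes the hash
raw = json.dumps(agent_input, sort_keys=True, separators=(",", ":"))
text = str(agent_input["query"])
readable = "".join(c for c in text if c.isalnum() or c in " _-")
readable = readable.replace(" ", "_").lower()[:30]

print(f"{readable}_{hashlib.md5(raw.encode('utf-8')).hexdigest()[:8]}")
# -> "what_is_the_refund_policy_" plus an 8-hex-char MD5 prefix

The same input always maps to the same ID, which is what lets the aggregation below deduplicate repeated runs of a datapoint.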
@@ -456,3 +506,89 @@ def trace_to_str(agent_trace: Sequence[ReadableSpan]) -> str:
         platform_history.append("")

     return "\n".join(platform_history)
+
+
+def calculate_final_score(
+    evaluation_results: list[EvaluationResult],
+    evaluator_weights: dict[str, float] | None = None,
+    default_weight: float = 1.0,
+) -> tuple[float, dict[str, float]]:
+    """Aggregate evaluation results with deduplication and weighted scoring.
+
+    Only NumericEvaluationResult instances are aggregated; other result types are ignored.
+
+    This function performs the following steps:
+    1. Deduplicates results by datapoint_id and evaluator_name (averages duplicates)
+    2. Calculates the average score per evaluator across all datapoints
+    3. Computes the final weighted score across evaluators
+
+    Args:
+        evaluation_results: List of EvaluationResult objects with datapoint_id and evaluator_name
+        evaluator_weights: Optional dict mapping evaluator names to weights; unlisted evaluators use default_weight
+
+    Returns:
+        Tuple of (final_score, agg_metrics_per_evaluator)
+        - final_score: Weighted average across evaluators
+        - agg_metrics_per_evaluator: Dict mapping evaluator names to their average scores
+    """
+    if not evaluation_results:
+        return 0.0, {}
+
+    if evaluator_weights is None:
+        evaluator_weights = {}
+
+    # Step 1: Group by datapoint_id and evaluator_name for deduplication
+    grouped_by_datapoint_evaluator = defaultdict(
+        lambda: defaultdict(list[NumericEvaluationResult])
+    )
+
+    for result in evaluation_results:
+        # Only NumericEvaluationResult can be aggregated
+        if isinstance(result, NumericEvaluationResult):
+            datapoint_id = result.datapoint_id or "unknown_datapoint"
+            evaluator_name = result.evaluator_name or "unknown_evaluator"
+            grouped_by_datapoint_evaluator[datapoint_id][evaluator_name].append(result)
+
+    # Step 2: Deduplicate by averaging same evaluator results for same datapoint
+    dedup_results: list[NumericEvaluationResult] = []
+    for datapoint_id, evaluators_dict in grouped_by_datapoint_evaluator.items():
+        for evaluator_name, results_list in evaluators_dict.items():
+            if results_list:
+                # Average the scores for this evaluator on this datapoint
+                avg_score = sum(r.score for r in results_list) / len(results_list)
+                # Create a representative result (use first result as template)
+                first_result = results_list[0]
+                dedup_result = type(first_result)(
+                    score=avg_score,
+                    datapoint_id=datapoint_id,
+                    evaluator_name=evaluator_name,
+                    details=first_result.details,
+                    evaluation_time=first_result.evaluation_time,
+                )
+                dedup_results.append(dedup_result)
+
+    # Step 3: Group by evaluator and calculate average score per evaluator
+    grouped_by_evaluator = defaultdict(list[NumericEvaluationResult])
+    for result in dedup_results:
+        grouped_by_evaluator[result.evaluator_name].append(result)
+
+    agg_metrics_per_evaluator = {}
+    for evaluator_name, results_list in grouped_by_evaluator.items():
+        avg_score = sum(r.score for r in results_list) / len(results_list)
+        agg_metrics_per_evaluator[evaluator_name] = avg_score
+
+    # Step 4: Calculate final weighted score
+    if not agg_metrics_per_evaluator:
+        return 0.0, {}
+
+    total_weighted_score = 0.0
+    total_weight = 0.0
+
+    for evaluator_name, avg_score in agg_metrics_per_evaluator.items():
+        weight = evaluator_weights.get(evaluator_name, default_weight)
+        total_weighted_score += avg_score * weight
+        total_weight += weight
+
+    final_score = total_weighted_score / total_weight if total_weight > 0 else 0.0
+
+    return final_score, agg_metrics_per_evaluator
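A self-contained sketch of the same three-step aggregation with hypothetical scores and weights (Result stands in for NumericEvaluationResult, keeping only the fields the aggregation reads):

from collections import defaultdict
from dataclasses import dataclass


@dataclass
class Result:  # stand-in for NumericEvaluationResult
    score: float
    evaluator_name: str
    datapoint_id: str


results = [
    Result(1.0, "exact_match", "d1"),
    Result(0.0, "exact_match", "d1"),  # duplicate run on d1, deduped to 0.5
    Result(1.0, "exact_match", "d2"),
    Result(0.8, "llm_judge", "d1"),
]
weights = {"exact_match": 1.0, "llm_judge": 3.0}  # illustrative weights

# Steps 1+2: average duplicates per (datapoint, evaluator) pair
per_pair = defaultdict(list)
for r in results:
    per_pair[(r.datapoint_id, r.evaluator_name)].append(r.score)

# Step 3: average per evaluator across datapoints
per_eval = defaultdict(list)
for (_, evaluator), scores in per_pair.items():
    per_eval[evaluator].append(sum(scores) / len(scores))
metrics = {ev: sum(s) / len(s) for ev, s in per_eval.items()}

# Step 4: weighted average across evaluators
total_weight = sum(weights.get(ev, 1.0) for ev in metrics)
final = sum(m * weights.get(ev, 1.0) for ev, m in metrics.items()) / total_weight
print(metrics, final)  # {'exact_match': 0.75, 'llm_judge': 0.8} 0.7875

With equal weights the final score would be (0.75 + 0.8) / 2 = 0.775; weights shift only the cross-evaluator average, never the per-evaluator metrics.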

src/uipath/eval/coded_evaluators/contains_evaluator.py

Lines changed: 6 additions & 1 deletion
@@ -1,5 +1,6 @@
 """Contains evaluator for agent outputs."""

+from .._helpers.coded_evaluators_helpers import generate_datapoint_id
 from ..models import AgentExecution, EvaluationResult, NumericEvaluationResult
 from .base_evaluator import BaseEvaluationCriteria, BaseEvaluator, BaseEvaluatorConfig

@@ -57,7 +58,11 @@ async def evaluate(
         if self.evaluator_config.negated:
             is_contains = not is_contains

-        return NumericEvaluationResult(score=float(is_contains))
+        return NumericEvaluationResult(
+            score=float(is_contains),
+            evaluator_name=self.evaluator_config.name,
+            datapoint_id=generate_datapoint_id(agent_execution),
+        )

     def _get_actual_output(self, agent_execution: AgentExecution) -> str:
         """Get the actual output from the agent execution."""

src/uipath/eval/coded_evaluators/exact_match_evaluator.py

Lines changed: 6 additions & 1 deletion
@@ -1,5 +1,6 @@
 """Exact match evaluator for agent outputs."""

+from .._helpers.coded_evaluators_helpers import generate_datapoint_id
 from ..models import AgentExecution, EvaluationResult, NumericEvaluationResult
 from .output_evaluator import (
     OutputEvaluationCriteria,
@@ -52,4 +53,8 @@ async def evaluate(
         if self.evaluator_config.negated:
             is_exact_match = not is_exact_match

-        return NumericEvaluationResult(score=float(is_exact_match))
+        return NumericEvaluationResult(
+            score=float(is_exact_match),
+            evaluator_name=self.evaluator_config.name,
+            datapoint_id=generate_datapoint_id(agent_execution),
+        )

src/uipath/eval/coded_evaluators/json_similarity_evaluator.py

Lines changed: 3 additions & 0 deletions
@@ -5,6 +5,7 @@

 from pydantic import Field

+from .._helpers.coded_evaluators_helpers import generate_datapoint_id
 from ..models import AgentExecution, EvaluationResult, NumericEvaluationResult
 from .output_evaluator import (
     OutputEvaluationCriteria,
@@ -55,6 +56,8 @@ async def evaluate(
         return NumericEvaluationResult(
             score=score,
             details=validated_justification,
+            evaluator_name=self.evaluator_config.name,
+            datapoint_id=generate_datapoint_id(agent_execution),
         )

     def _compare_json(self, expected: Any, actual: Any) -> tuple[float, str]:

src/uipath/eval/coded_evaluators/llm_as_judge_evaluator.py

Lines changed: 7 additions & 2 deletions
@@ -7,7 +7,10 @@

 from pydantic import BaseModel, Field, model_validator

-from .._helpers.coded_evaluators_helpers import COMMUNITY_agents_SUFFIX
+from .._helpers.coded_evaluators_helpers import (
+    COMMUNITY_agents_SUFFIX,
+    generate_datapoint_id,
+)
 from ..models import (
     AgentExecution,
     EvaluationResult,
@@ -99,8 +102,10 @@ async def evaluate(
         )

         return NumericEvaluationResult(
-            score=round(llm_response.score / 100.0, 2),
+            score=max(0.0, min(1.0, round(llm_response.score / 100.0, 2))),
             details=validated_justification,
+            evaluator_name=self.evaluator_config.name,
+            datapoint_id=generate_datapoint_id(agent_execution),
         )

     def _create_evaluation_prompt(
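The new max/min wrapper pins the normalized score to [0, 1], guarding against judge responses outside the expected 0-100 range. A tiny sketch with hypothetical raw scores:

for raw in (-5, 42, 87.6, 250):  # hypothetical judge outputs, expected range 0-100
    print(raw, "->", max(0.0, min(1.0, round(raw / 100.0, 2))))
# -5 -> 0.0, 42 -> 0.42, 87.6 -> 0.88, 250 -> 1.0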

src/uipath/eval/coded_evaluators/tool_call_args_evaluator.py

Lines changed: 3 additions & 0 deletions
@@ -2,6 +2,7 @@

 from .._helpers.coded_evaluators_helpers import (
     extract_tool_calls,
+    generate_datapoint_id,
     tool_calls_args_score,
 )
 from ..models import AgentExecution, EvaluationResult, NumericEvaluationResult, ToolCall
@@ -74,4 +75,6 @@ async def evaluate(
         return NumericEvaluationResult(
             score=score,
             details=validated_justification,
+            evaluator_name=self.evaluator_config.name,
+            datapoint_id=generate_datapoint_id(agent_execution),
         )

src/uipath/eval/coded_evaluators/tool_call_count_evaluator.py

Lines changed: 3 additions & 0 deletions
@@ -4,6 +4,7 @@

 from .._helpers.coded_evaluators_helpers import (
     extract_tool_calls_names,
+    generate_datapoint_id,
     tool_calls_count_score,
 )
 from ..models import AgentExecution, EvaluationResult, NumericEvaluationResult
@@ -77,4 +78,6 @@ async def evaluate(
         return NumericEvaluationResult(
             score=score,
             details=validated_justification,
+            evaluator_name=self.evaluator_config.name,
+            datapoint_id=generate_datapoint_id(agent_execution),
         )

src/uipath/eval/coded_evaluators/tool_call_order_evaluator.py

Lines changed: 3 additions & 0 deletions
@@ -2,6 +2,7 @@

 from .._helpers.coded_evaluators_helpers import (
     extract_tool_calls_names,
+    generate_datapoint_id,
     tool_calls_order_score,
 )
 from ..models import AgentExecution, EvaluationResult, NumericEvaluationResult
@@ -74,4 +75,6 @@ async def evaluate(
         return NumericEvaluationResult(
             score=score,
             details=validated_justification,
+            evaluator_name=self.evaluator_config.name,
+            datapoint_id=generate_datapoint_id(agent_execution),
         )

src/uipath/eval/coded_evaluators/tool_call_output_evaluator.py

Lines changed: 3 additions & 0 deletions
@@ -2,6 +2,7 @@

 from .._helpers.coded_evaluators_helpers import (
     extract_tool_calls_outputs,
+    generate_datapoint_id,
     tool_calls_output_score,
 )
 from ..models import (
@@ -77,4 +78,6 @@ async def evaluate(
         return NumericEvaluationResult(
             score=score,
             details=validated_justification,
+            evaluator_name=self.evaluator_config.name,
+            datapoint_id=generate_datapoint_id(agent_execution),
         )
