Skip to content

Commit 9516bea

Browse files
committed
fix: move some functions to utility and dedup the legacy and coded
1 parent 3578e81 commit 9516bea

11 files changed

Lines changed: 772 additions & 245 deletions

File tree

packages/uipath/samples/line_by_line_test/README.md

Lines changed: 68 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -45,17 +45,23 @@ uv run uipath eval main evaluations/eval-sets/default.json --workers 1
4545

4646
## Evaluation Results
4747

48-
The sample includes three test cases with three evaluators:
48+
The sample includes three test cases with five evaluators:
49+
50+
### ExactMatch Evaluators
4951
- **LineByLineExactMatch** - New evaluator with line-by-line support
5052
- **RegularExactMatch** - New evaluator without line-by-line (for comparison)
5153
- **LegacyLineByLineExactMatch** - Legacy evaluator with line-by-line support
5254

55+
### Contains Evaluators
56+
- **LineByLineContains** - New evaluator with line-by-line support (checks if each line contains the search text)
57+
- **RegularContains** - New evaluator without line-by-line (checks if the entire output contains the search text)
58+
5359
Test cases:
5460
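1. **All lines match exactly** - All ExactMatch evaluators and RegularContains score 1.0; LineByLineContains searches for "apple" in this case, so it only gets partial credit (1 of 3 lines contains it)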
55-
2. **One line doesn't match** - Line-by-line evaluators: 0.67, Regular: 0.0 (shows partial credit!)
61+
2. **One line doesn't match** - Line-by-line ExactMatch: 0.67, Regular ExactMatch: 0.0 (shows partial credit!)
5662
3. **Single item** - All evaluators score 1.0
5763

58-
Expected output:
64+
Expected output (showing ExactMatch evaluators):
5965
```
6066
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
6167
┃ Evaluation ┃ LineByLineExactMatch ┃ RegularExactMatch ┃ LegacyLineByLineExactMatch ┃
@@ -68,6 +74,8 @@ Expected output:
6874
└───────────────────────────────┴────────────────────────┴─────────────────────┴───────────────────────────────┘
6975
```
7076

77+
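Contains evaluators score 1.0 whenever the search text is found: test cases 2 and 3 search for "Item:", which appears on every output line, so both Contains evaluators score 1.0. Test case 1 searches for "apple", which appears on only one of the three lines, so RegularContains scores 1.0 while LineByLineContains gives partial credit (1/3).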
78+
7179
## Configuration
7280

7381
### Evaluator Configuration
@@ -104,9 +112,59 @@ Legacy evaluators also support line-by-line evaluation in `evaluations/evaluator
104112
}
105113
```
106114

107-
Key options for both evaluator types:
115+
#### Contains Evaluators
116+
117+
The Contains evaluator checks if the output contains a specific search text. In line-by-line mode, it checks each line independently:
118+
119+
**Line-by-line Contains** (`evaluations/evaluators/line-by-line-contains.json`):
120+
```json
121+
{
122+
"version": "1.0",
123+
"evaluatorTypeId": "uipath-contains",
124+
"evaluatorConfig": {
125+
"name": "LineByLineContains",
126+
"target_output_key": "result",
127+
"line_by_line_evaluator": true,
128+
"line_delimiter": "\n",
129+
"case_sensitive": false,
130+
"negated": false
131+
}
132+
}
133+
```
134+
135+
**Regular Contains** (`evaluations/evaluators/regular-contains.json`):
136+
```json
137+
{
138+
"version": "1.0",
139+
"evaluatorTypeId": "uipath-contains",
140+
"evaluatorConfig": {
141+
"name": "RegularContains",
142+
"target_output_key": "result",
143+
"line_by_line_evaluator": false,
144+
"case_sensitive": false,
145+
"negated": false
146+
}
147+
}
148+
```
149+
150+
In evaluation criteria, specify the search text:
151+
```json
152+
{
153+
"LineByLineContains": {
154+
"searchText": "Item:"
155+
}
156+
}
157+
```
158+
159+
**Behavior difference**:
160+
- **Line-by-line**: Checks if each line contains "Item:", gives partial credit (e.g., 2/3 if one line is missing it)
161+
- **Regular**: Checks if the entire output contains "Item:" at least once, returns 1.0 or 0.0
162+
163+
Key options for all evaluator types:
108164
- `lineByLineEvaluator`/`lineByLineEvaluation`: Enable line-by-line evaluation (default: `false`)
109165
- `lineDelimiter`: Delimiter to split lines (default: `"\n"`)
166+
- `case_sensitive`: Case-sensitive comparison (default: `false` for Contains, `true` for ExactMatch)
167+
- `negated`: Invert the result (default: `false`, only for Contains)
110168

111169
### Custom Delimiters
112170

@@ -130,11 +188,13 @@ line_by_line_test/
130188
├── pyproject.toml # Dependencies (uses TestPyPI)
131189
└── evaluations/
132190
├── evaluators/
133-
│ ├── line-by-line-exact-match.json # New line-by-line evaluator
134-
│ ├── regular-exact-match.json # New regular evaluator (for comparison)
135-
│ └── legacy-line-by-line-exact-match.json # Legacy line-by-line evaluator
191+
│ ├── line-by-line-exact-match.json # New line-by-line ExactMatch evaluator
192+
│ ├── regular-exact-match.json # New regular ExactMatch evaluator
193+
│ ├── legacy-line-by-line-exact-match.json # Legacy line-by-line ExactMatch evaluator
194+
│ ├── line-by-line-contains.json # New line-by-line Contains evaluator
195+
│ └── regular-contains.json # New regular Contains evaluator
136196
└── eval-sets/
137-
└── default.json # Test cases
197+
└── default.json # Test cases with all 5 evaluators
138198
```
139199

140200
## Learn More

packages/uipath/samples/line_by_line_test/evaluations/eval-sets/default.json

Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,20 @@
55
"evaluatorRefs": [
66
"LineByLineExactMatch",
77
"RegularExactMatch",
8-
"LegacyLineByLineExactMatch"
8+
"LegacyLineByLineExactMatch",
9+
"LineByLineContains",
10+
"RegularContains"
911
],
1012
"evaluations": [
1113
{
1214
"id": "test-all-lines-match",
1315
"name": "Test all lines match exactly",
1416
"inputs": {
15-
"items": ["apple", "banana", "cherry"]
17+
"items": [
18+
"apple",
19+
"banana",
20+
"cherry"
21+
]
1622
},
1723
"evaluationCriterias": {
1824
"LineByLineExactMatch": {
@@ -30,14 +36,24 @@
3036
"result": "Item: apple\nItem: banana\nItem: cherry"
3137
},
3238
"expectedAgentBehavior": ""
39+
},
40+
"LineByLineContains": {
41+
"searchText": "apple"
42+
},
43+
"RegularContains": {
44+
"searchText": "apple"
3345
}
3446
}
3547
},
3648
{
3749
"id": "test-partial-line-mismatch",
3850
"name": "Test when one line doesn't match",
3951
"inputs": {
40-
"items": ["apple", "banana", "cherry"]
52+
"items": [
53+
"apple",
54+
"banana",
55+
"cherry"
56+
]
4157
},
4258
"evaluationCriterias": {
4359
"LineByLineExactMatch": {
@@ -55,14 +71,22 @@
5571
"result": "Item: apple\nItem: WRONG\nItem: cherry"
5672
},
5773
"expectedAgentBehavior": ""
74+
},
75+
"LineByLineContains": {
76+
"searchText": "Item:"
77+
},
78+
"RegularContains": {
79+
"searchText": "Item:"
5880
}
5981
}
6082
},
6183
{
6284
"id": "test-single-item",
6385
"name": "Test with single item",
6486
"inputs": {
65-
"items": ["orange"]
87+
"items": [
88+
"orange"
89+
]
6690
},
6791
"evaluationCriterias": {
6892
"LineByLineExactMatch": {
@@ -80,8 +104,14 @@
80104
"result": "Item: orange"
81105
},
82106
"expectedAgentBehavior": ""
107+
},
108+
"LineByLineContains": {
109+
"searchText": "Item:"
110+
},
111+
"RegularContains": {
112+
"searchText": "Item:"
83113
}
84114
}
85115
}
86116
]
87-
}
117+
}
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
{
2+
"version": "1.0",
3+
"evaluatorTypeId": "uipath-contains",
4+
"id": "LineByLineContains",
5+
"name": "LineByLineContains",
6+
"evaluatorConfig": {
7+
"name": "LineByLineContains",
8+
"target_output_key": "result",
9+
"line_by_line_evaluator": true,
10+
"line_delimiter": "\n",
11+
"case_sensitive": false,
12+
"negated": false
13+
}
14+
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
{
2+
"version": "1.0",
3+
"evaluatorTypeId": "uipath-contains",
4+
"id": "RegularContains",
5+
"name": "RegularContains",
6+
"evaluatorConfig": {
7+
"name": "RegularContains",
8+
"target_output_key": "result",
9+
"line_by_line_evaluator": false,
10+
"case_sensitive": false,
11+
"negated": false
12+
}
13+
}

packages/uipath/src/uipath/eval/evaluators/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
LLMJudgeTrajectorySimulationEvaluator,
3535
)
3636
from .multiclass_classification_evaluator import MulticlassClassificationEvaluator
37+
from .output_evaluator import AggregationMethod
3738
from .tool_call_args_evaluator import ToolCallArgsEvaluator
3839
from .tool_call_count_evaluator import ToolCallCountEvaluator
3940
from .tool_call_order_evaluator import ToolCallOrderEvaluator
@@ -84,4 +85,5 @@
8485
"BaseEvaluatorConfig",
8586
"BaseEvaluatorJustification",
8687
"LLMJudgeJustification",
88+
"AggregationMethod",
8789
]

0 commit comments

Comments
 (0)