You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
"description": "An evaluator that uses a LLM to score the similarity of the actual output to the expected output",
271
+
"category": 1,
272
+
"type": 5,
273
+
"prompt": "As an expert evaluator, analyze the semantic similarity of these JSON contents to determine a score from 0-100. Focus on comparing the meaning and contextual equivalence of corresponding fields, accounting for alternative valid expressions, synonyms, and reasonable variations in language while maintaining high standards for accuracy and completeness. Provide your score with a justification, explaining briefly and concisely why you gave that score.\n----\nExpectedOutput:\n{{ExpectedOutput}}\n----\nActualOutput:\n{{ActualOutput}}\n",
274
+
"model": "same-as-agent",
275
+
"targetOutputKey": "*",
276
+
"createdAt": "2025-06-09T18:20:06.080Z",
277
+
"updatedAt": "2025-06-09T18:20:06.080Z",
278
+
},
279
+
{
280
+
"fileName": "evaluator-default-simulation.json",
281
+
"id": "3c82e1a2-0112-4e3b-ba45-25f379298faa",
282
+
"name": "Default Simulation Evaluator",
283
+
"description": "An evaluator that uses a LLM to score the similarity of the actual output to the expected output",
284
+
"category": 3,
285
+
"type": 7,
286
+
"prompt": "As an expert evaluator, determine how well the agent did on a scale of 0-100. Focus on if the simulation was successful and if the agent behaved according to the expected output accounting for alternative valid expressions, and reasonable variations in language while maintaining high standards for accuracy and completeness. Provide your score with a justification, explaining briefly and concisely why you gave that score.\n----\nUserOrSyntheticInputGivenToAgent:\n{{UserOrSyntheticInput}}\n----\nSimulationInstructions:\n{{SimulationInstructions}}\n----\nExpectedAgentBehavior:\n{{ExpectedAgentBehavior}}\n----\nAgentRunHistory:\n{{AgentRunHistory}}\n",
287
+
"model": "same-as-agent",
288
+
"targetOutputKey": "*",
289
+
"createdAt": "2025-06-09T18:20:06.335Z",
290
+
"updatedAt": "2025-06-09T18:20:06.335Z",
291
+
},
292
+
{
293
+
"fileName": "evaluator-default-trajectory.json",
294
+
"id": "a544a330-5e6b-4dca-a4e5-ea5fd024779b",
295
+
"name": "Default Trajectory Evaluator",
296
+
"description": "An evaluator that judges the agent based on it's run history and expected behavior",
297
+
"category": 3,
298
+
"type": 7,
299
+
"prompt": "As an expert evaluator, determine how well the agent did on a scale of 0-100. Focus on if the simulation was successful and if the agent behaved according to the expected output accounting for alternative valid expressions, and reasonable variations in language while maintaining high standards for accuracy and completeness. Provide your score with a justification, explaining briefly and concisely why you gave that score.\n----\nUserOrSyntheticInputGivenToAgent:\n{{UserOrSyntheticInput}}\n----\nSimulationInstructions:\n{{SimulationInstructions}}\n----\nExpectedAgentBehavior:\n{{ExpectedAgentBehavior}}\n----\nAgentRunHistory:\n{{AgentRunHistory}}\n",
300
+
"model": "same-as-agent",
301
+
"targetOutputKey": "*",
302
+
"createdAt": "2025-06-26T17:45:39.651Z",
303
+
"updatedAt": "2025-06-26T17:45:39.651Z",
304
+
},
305
+
],
306
+
"evaluationSets": [
307
+
{
308
+
"fileName": "evaluation-set-1757012098378.json",
309
+
"id": "d649e632-1582-4cb1-9e68-c3aff46c2802",
310
+
"name": "Loan Agent Evaluation Set",
311
+
"batchSize": 10,
312
+
"evaluatorRefs": [
313
+
"c395579a-4e15-425e-b400-a630a63a6237",
314
+
"a544a330-5e6b-4dca-a4e5-ea5fd024779b",
315
+
],
316
+
"evaluations": [
317
+
{
318
+
"id": "7309b5dc-46c5-46cb-b6cb-dbb5d9ff5ccf",
319
+
"name": "Low Credit Score Rejection",
320
+
"inputs": {},
321
+
"expectedOutput": {"content": '"rejected"'},
322
+
"simulationInstructions": "The A2ALoanCreditRatingTool should return a credit rating of 650.",
323
+
"expectedAgentBehavior": "The agent should reject the loan application due to the credit rating being below 700.",
324
+
"simulateInput": True,
325
+
"inputGenerationInstructions": "Generate a loan application query with name, loan amount, and loan type (mortgage or personal loan).",
"simulationInstructions": "The A2ALoanCreditRatingTool should return a credit rating of 850.",
341
+
"expectedAgentBehavior": "The agent should approve the loan application due to the credit rating being above 800 and the loan amount being less than $10,000.",
342
+
"simulateInput": True,
343
+
"inputGenerationInstructions": "Generate a loan application query with name, loan amount under $10,000, and loan type (mortgage or personal loan).",
"simulationInstructions": "No tool calls should be made.",
377
+
"expectedAgentBehavior": "The agent should inform the user that all mandatory details (name, loan amount, and loan type) are required to process the application.",
378
+
"simulateInput": True,
379
+
"inputGenerationInstructions": "Generate a loan application query missing one of the mandatory details (name, loan amount, or loan type).",
0 commit comments