Skip to content

Commit bd20483

Browse files
andrei-rusuradugheo
authored and committed
feat: add evals
1 parent 109aff8 commit bd20483

9 files changed

Lines changed: 2527 additions & 1663 deletions

File tree

pyproject.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ dependencies = [
99
"llama-index-embeddings-azure-openai>=0.3.8",
1010
"llama-index-llms-azure-openai>=0.3.2",
1111
"openinference-instrumentation-llama-index>=4.3.0",
12-
"uipath>=2.1.54, <2.2.0",
12+
"uipath>=2.1.106.dev1007760000,<2.1.106.dev1007770000"
1313
]
1414
classifiers = [
1515
"Development Status :: 3 - Alpha",
@@ -91,3 +91,5 @@ url = "https://test.pypi.org/simple/"
9191
publish-url = "https://test.pypi.org/legacy/"
9292
explicit = true
9393

94+
[tool.uv.sources]
95+
uipath = { index = "testpypi" }

src/uipath_llamaindex/_cli/_runtime/_context.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,5 @@ class UiPathLlamaIndexRuntimeContext(UiPathRuntimeContext):
1111

1212
config: Optional[LlamaIndexConfig] = None
1313
workflow: Optional[Workflow] = None
14-
workflow_context: Optional[Context] = None # type: ignore[type-arg]
14+
workflow_context: Optional[Context] = None
1515
resumed_trigger: Optional[UiPathResumeTrigger] = None

src/uipath_llamaindex/_cli/_runtime/_exception.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,4 @@ def __init__(
1414
category: UiPathErrorCategory = UiPathErrorCategory.UNKNOWN,
1515
status: Optional[int] = None,
1616
):
17-
super().__init__(code, title, detail, category, status, prefix="LlamaIndex")
17+
super().__init__(code, title, detail, category, status, prefix="LlamaIndex") # type: ignore[arg-type]
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
"""OpenTelemetry SpanProcessor for normalizing LlamaIndex tool call attributes.
2+
3+
LlamaIndex wraps tool arguments in {"kwargs": {...}} which differs from other
4+
frameworks like LangChain that use flat {"arg": value} format. This processor
5+
normalizes the format at the span level before exporters or dev terminal read it.
6+
"""
7+
8+
import json
9+
import logging
10+
from typing import Any, Optional
11+
12+
from opentelemetry.context import Context
13+
from opentelemetry.sdk.trace import ReadableSpan, Span, SpanProcessor
14+
15+
logger = logging.getLogger(__name__)
16+
17+
18+
class AttributeNormalizingSpanProcessor(SpanProcessor):
    """Normalizes LlamaIndex tool call attributes to match other frameworks.

    Unwraps {"kwargs": {...}} to flat {...} format for consistency with LangChain.
    """

    def on_start(self, span: Span, parent_context: Optional[Context] = None) -> None:
        """Called when span starts - no action needed."""
        pass

    def on_end(self, span: ReadableSpan) -> None:
        """Normalize tool call attributes before span is consumed by exporters/terminal.

        Only spans marked as TOOL by OpenInference are touched; the rewrite
        happens in-place on the span's internal attribute dict so every
        downstream consumer (exporter, dev terminal) sees the same view.
        """
        if not span._attributes:
            return

        try:
            # Get the mutable internal attributes dict. ReadableSpan exposes
            # attributes as read-only, so we reach into the private field on
            # purpose; any surprise here is swallowed by the except below.
            attrs: dict[str, Any] = span._attributes  # type: ignore

            if attrs.get("openinference.span.kind", None) == "TOOL":
                # Normalize tool call attributes
                for key in ("input.value", "output.value"):
                    if key in attrs:
                        original = attrs[key]
                        normalized = self._normalize_attribute(key, original)

                        # Only write back (and log) when something changed,
                        # to avoid churning unmodified spans.
                        if normalized != original:
                            attrs[key] = normalized
                            if logger.isEnabledFor(logging.DEBUG):
                                logger.debug(
                                    f"Normalized {key} in span '{span.name}': "
                                    f"{str(original)[:50]}... → {normalized[:50]}..."
                                )

        except Exception as e:
            # Don't crash span processing if normalization fails
            logger.debug(
                f"Failed to normalize span '{getattr(span, 'name', 'unknown')}': {e}"
            )

    def _normalize_attribute(self, key: str, value: Any) -> str:
        """Unwrap LlamaIndex's kwargs wrapper if present.

        Accepts either a JSON string or a dict; always returns a string.
        Dict values are re-serialized with json.dumps so the attribute stays
        valid JSON. (Previously a dict without a "kwargs" key fell through to
        str(value), emitting a Python repr with single quotes and corrupting
        the originally-valid JSON attribute.)
        """
        if isinstance(value, str):
            try:
                value = json.loads(value)
            except Exception:
                # Not JSON - leave the raw string untouched.
                pass
        if isinstance(value, dict):
            if key == "input.value":
                # Unwrap {"kwargs": {...}} -> {...}; keep other dicts as-is
                # but still serialize them as JSON, not Python repr.
                value = json.dumps(value.get("kwargs", value))
            elif key == "output.value":
                # Reshape LlamaIndex tool output into the flat schema other
                # frameworks emit: content / status / tool_call_id.
                value = json.dumps(
                    {
                        "content": value.get("raw_output"),
                        "status": "success"
                        if not value.get("is_error", False)
                        else "error",
                        "tool_call_id": value.get("tool_call_id"),
                    }
                )
        return str(value)

    def shutdown(self) -> None:
        """Called on processor shutdown - no cleanup needed."""
        pass

    def force_flush(self, timeout_millis: int = 30000) -> bool:
        """Force flush - always succeeds (nothing to flush)."""
        return True

src/uipath_llamaindex/_cli/cli_dev.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
from ._runtime._context import UiPathLlamaIndexRuntimeContext
1414
from ._runtime._runtime import UiPathLlamaIndexRuntime
15+
from ._tracing._attribute_normalizer import AttributeNormalizingSpanProcessor
1516

1617
console = ConsoleLogger()
1718

@@ -22,8 +23,10 @@ def llamaindex_dev_middleware(interface: Optional[str]) -> MiddlewareResult:
2223
try:
2324
if interface == "terminal":
2425
runtime_factory = UiPathRuntimeFactory(
25-
UiPathLlamaIndexRuntime, UiPathLlamaIndexRuntimeContext
26+
UiPathLlamaIndexRuntime,
27+
UiPathLlamaIndexRuntimeContext,
2628
)
29+
runtime_factory.add_span_processor(AttributeNormalizingSpanProcessor())
2730
runtime_factory.add_instrumentor(LlamaIndexInstrumentor, get_current_span)
2831
app = UiPathDevTerminal(runtime_factory)
2932
asyncio.run(app.run_async())
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
import asyncio
2+
from typing import List, Optional
3+
4+
from openinference.instrumentation.llama_index import (
5+
LlamaIndexInstrumentor,
6+
get_current_span,
7+
)
8+
from uipath._cli._evals._console_progress_reporter import ConsoleProgressReporter
9+
from uipath._cli._evals._progress_reporter import StudioWebProgressReporter
10+
from uipath._cli._evals._runtime import UiPathEvalContext, UiPathEvalRuntime
11+
from uipath._cli._runtime._contracts import (
12+
UiPathRuntimeFactory,
13+
)
14+
from uipath._cli._utils._eval_set import EvalHelpers
15+
from uipath._cli.middlewares import MiddlewareResult
16+
from uipath._events._event_bus import EventBus
17+
from uipath.eval._helpers import auto_discover_entrypoint
18+
19+
from ._runtime._context import UiPathLlamaIndexRuntimeContext
20+
from ._runtime._runtime import UiPathLlamaIndexRuntime
21+
from ._tracing._attribute_normalizer import AttributeNormalizingSpanProcessor
22+
from ._tracing._oteladapter import LlamaIndexExporter
23+
from ._utils._config import LlamaIndexConfig
24+
25+
26+
def llamaindex_eval_middleware(
    entrypoint: Optional[str], eval_set: Optional[str], eval_ids: List[str], **kwargs
) -> MiddlewareResult:
    """Middleware to handle LlamaIndex evaluation runs.

    Runs the UiPath eval runtime against a LlamaIndex agent when a
    llama_index.json config is present; otherwise defers to the next
    middleware in the chain.

    Args:
        entrypoint: Explicit agent entrypoint, or None to auto-discover.
        eval_set: Path/identifier of the eval set, or None to auto-discover.
        eval_ids: Specific evaluation ids to run (empty means all).
        **kwargs: Extra context options (e.g. register_progress_reporter),
            forwarded into the eval context.

    Returns:
        MiddlewareResult with should_continue=True when no LlamaIndex config
        exists, should_continue=False (with optional error_message) otherwise.
    """
    config = LlamaIndexConfig()
    if not config.exists:
        return MiddlewareResult(
            should_continue=True
        )  # Continue with normal flow if no llama_index.json

    try:
        event_bus = EventBus()
        # Optional Studio Web reporter: only wired up when the caller asks
        # for it; spans are shipped through the LlamaIndex exporter.
        if kwargs.get("register_progress_reporter", False):
            progress_reporter = StudioWebProgressReporter(
                spans_exporter=LlamaIndexExporter()
            )
            asyncio.run(progress_reporter.subscribe_to_eval_runtime_events(event_bus))
        # Console reporter is always attached so local runs show progress.
        console_reporter = ConsoleProgressReporter()
        asyncio.run(console_reporter.subscribe_to_eval_runtime_events(event_bus))

        def generate_runtime_context(
            context_entrypoint: str, **context_kwargs
        ) -> UiPathLlamaIndexRuntimeContext:
            # Fresh runtime context per eval run, pinned to this entrypoint
            # and the discovered LlamaIndex config.
            context = UiPathLlamaIndexRuntimeContext.with_defaults(**context_kwargs)
            context.entrypoint = context_entrypoint
            context.config = config
            return context

        runtime_entrypoint = entrypoint or auto_discover_entrypoint()

        eval_context = UiPathEvalContext.with_defaults(
            entrypoint=runtime_entrypoint, **kwargs
        )
        eval_context.eval_set = eval_set or EvalHelpers.auto_discover_eval_set()
        eval_context.eval_ids = eval_ids

        def generate_runtime(
            ctx: UiPathLlamaIndexRuntimeContext,
        ) -> UiPathLlamaIndexRuntime:
            return UiPathLlamaIndexRuntime(ctx)

        runtime_factory = UiPathRuntimeFactory(
            UiPathLlamaIndexRuntime,
            UiPathLlamaIndexRuntimeContext,
            context_generator=lambda **context_kwargs: generate_runtime_context(
                context_entrypoint=runtime_entrypoint,
                **context_kwargs,
            ),
            runtime_generator=generate_runtime,
        )
        # Normalize LlamaIndex tool-call span attributes before export,
        # matching what cli_run/cli_dev do.
        runtime_factory.add_span_processor(AttributeNormalizingSpanProcessor())

        # Export spans upstream only when running inside a job.
        if eval_context.job_id:
            runtime_factory.add_span_exporter(LlamaIndexExporter())

        runtime_factory.add_instrumentor(LlamaIndexInstrumentor, get_current_span)

        async def execute():
            # Run the eval set, then drain the event bus so reporters finish
            # before the loop is torn down.
            async with UiPathEvalRuntime.from_eval_context(
                factory=runtime_factory, context=eval_context, event_bus=event_bus
            ) as eval_runtime:
                await eval_runtime.execute()
                await event_bus.wait_for_all()

        asyncio.run(execute())
        return MiddlewareResult(should_continue=False)

    except Exception as e:
        # Surface the failure to the CLI instead of letting it bubble up.
        return MiddlewareResult(
            should_continue=False, error_message=f"Error running evaluation: {str(e)}"
        )

src/uipath_llamaindex/_cli/cli_run.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import asyncio
22
import logging
33
from os import environ as env
4-
from typing import Optional
4+
from typing import Any, Optional
55

66
from openinference.instrumentation.llama_index import (
77
LlamaIndexInstrumentor,
@@ -13,14 +13,15 @@
1313
from ._runtime._context import UiPathLlamaIndexRuntimeContext
1414
from ._runtime._exception import UiPathLlamaIndexRuntimeError
1515
from ._runtime._runtime import UiPathLlamaIndexRuntime
16+
from ._tracing._attribute_normalizer import AttributeNormalizingSpanProcessor
1617
from ._tracing._oteladapter import LlamaIndexExporter
1718
from ._utils._config import LlamaIndexConfig
1819

1920
logger = logging.getLogger(__name__)
2021

2122

2223
def llamaindex_run_middleware(
23-
entrypoint: Optional[str], input: Optional[str], resume: bool, **kwargs
24+
entrypoint: Optional[str], input: Optional[str], resume: bool, **kwargs: Any
2425
) -> MiddlewareResult:
2526
"""Middleware to handle LlamaIndex agent execution"""
2627

@@ -64,8 +65,10 @@ async def execute():
6465
env["UIPATH_REQUESTING_FEATURE"] = "llamaindex"
6566

6667
runtime_factory = UiPathRuntimeFactory(
67-
UiPathLlamaIndexRuntime, UiPathLlamaIndexRuntimeContext
68+
UiPathLlamaIndexRuntime,
69+
UiPathLlamaIndexRuntimeContext,
6870
)
71+
runtime_factory.add_span_processor(AttributeNormalizingSpanProcessor())
6972

7073
if context.job_id:
7174
runtime_factory.add_span_exporter(LlamaIndexExporter())

src/uipath_llamaindex/middlewares.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from uipath._cli.middlewares import Middlewares
22

33
from ._cli.cli_dev import llamaindex_dev_middleware
4+
from ._cli.cli_eval import llamaindex_eval_middleware
45
from ._cli.cli_init import llamaindex_init_middleware
56
from ._cli.cli_new import llamaindex_new_middleware
67
from ._cli.cli_run import llamaindex_run_middleware
@@ -12,3 +13,4 @@ def register_middleware():
1213
Middlewares.register("run", llamaindex_run_middleware)
1314
Middlewares.register("new", llamaindex_new_middleware)
1415
Middlewares.register("dev", llamaindex_dev_middleware)
16+
Middlewares.register("eval", llamaindex_eval_middleware)

0 commit comments

Comments
 (0)