Skip to content

Commit bd20483

Browse files
andrei-rusuradugheo
authored and committed
feat: add evals
1 parent 109aff8 commit bd20483

9 files changed

Lines changed: 2527 additions & 1663 deletions

File tree

pyproject.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ dependencies = [
99
"llama-index-embeddings-azure-openai>=0.3.8",
1010
"llama-index-llms-azure-openai>=0.3.2",
1111
"openinference-instrumentation-llama-index>=4.3.0",
12-
"uipath>=2.1.54, <2.2.0",
12+
"uipath>=2.1.106.dev1007760000,<2.1.106.dev1007770000"
1313
]
1414
classifiers = [
1515
"Development Status :: 3 - Alpha",
@@ -91,3 +91,5 @@ url = "https://test.pypi.org/simple/"
9191
publish-url = "https://test.pypi.org/legacy/"
9292
explicit = true
9393

94+
[tool.uv.sources]
95+
uipath = { index = "testpypi" }

src/uipath_llamaindex/_cli/_runtime/_context.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,5 @@ class UiPathLlamaIndexRuntimeContext(UiPathRuntimeContext):
1111

1212
config: Optional[LlamaIndexConfig] = None
1313
workflow: Optional[Workflow] = None
14-
workflow_context: Optional[Context] = None # type: ignore[type-arg]
14+
workflow_context: Optional[Context] = None
1515
resumed_trigger: Optional[UiPathResumeTrigger] = None

src/uipath_llamaindex/_cli/_runtime/_exception.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,4 @@ def __init__(
1414
category: UiPathErrorCategory = UiPathErrorCategory.UNKNOWN,
1515
status: Optional[int] = None,
1616
):
17-
super().__init__(code, title, detail, category, status, prefix="LlamaIndex")
17+
super().__init__(code, title, detail, category, status, prefix="LlamaIndex") # type: ignore[arg-type]
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
"""OpenTelemetry SpanProcessor for normalizing LlamaIndex tool call attributes.
2+
3+
LlamaIndex wraps tool arguments in {"kwargs": {...}} which differs from other
4+
frameworks like LangChain that use flat {"arg": value} format. This processor
5+
normalizes the format at the span level before exporters or dev terminal read it.
6+
"""
7+
8+
import json
9+
import logging
10+
from typing import Any, Optional
11+
12+
from opentelemetry.context import Context
13+
from opentelemetry.sdk.trace import ReadableSpan, Span, SpanProcessor
14+
15+
logger = logging.getLogger(__name__)
16+
17+
18+
class AttributeNormalizingSpanProcessor(SpanProcessor):
    """Normalizes LlamaIndex tool call attributes to match other frameworks.

    Unwraps {"kwargs": {...}} to flat {...} format for consistency with LangChain.
    """

    def on_start(self, span: Span, parent_context: Optional[Context] = None) -> None:
        """Called when span starts - no action needed."""
        pass

    def on_end(self, span: ReadableSpan) -> None:
        """Normalize tool call attributes before span is consumed by exporters/terminal.

        Only spans marked as TOOL by OpenInference are touched; the rewrite
        happens in-place on the span's internal attribute dict so every
        downstream consumer (exporter, dev terminal) sees the same view.
        """
        if not span._attributes:
            return

        try:
            # Get the mutable internal attributes dict. ReadableSpan exposes
            # attributes as read-only, so we reach into the private field on
            # purpose; any surprise here is swallowed by the except below.
            attrs: dict[str, Any] = span._attributes  # type: ignore

            if attrs.get("openinference.span.kind", None) == "TOOL":
                # Normalize tool call attributes
                for key in ("input.value", "output.value"):
                    if key in attrs:
                        original = attrs[key]
                        normalized = self._normalize_attribute(key, original)

                        # Only write back (and log) when something changed,
                        # to avoid churning unmodified spans.
                        if normalized != original:
                            attrs[key] = normalized
                            if logger.isEnabledFor(logging.DEBUG):
                                logger.debug(
                                    f"Normalized {key} in span '{span.name}': "
                                    f"{str(original)[:50]}... → {normalized[:50]}..."
                                )

        except Exception as e:
            # Don't crash span processing if normalization fails
            logger.debug(
                f"Failed to normalize span '{getattr(span, 'name', 'unknown')}': {e}"
            )

    def _normalize_attribute(self, key: str, value: Any) -> str:
        """Unwrap LlamaIndex's kwargs wrapper if present.

        Accepts either a JSON string or a dict; always returns a string.
        Dict values are re-serialized with json.dumps so the attribute stays
        valid JSON. (Previously a dict without a "kwargs" key fell through to
        str(value), emitting a Python repr with single quotes and corrupting
        the originally-valid JSON attribute.)
        """
        if isinstance(value, str):
            try:
                value = json.loads(value)
            except Exception:
                # Not JSON - leave the raw string untouched.
                pass
        if isinstance(value, dict):
            if key == "input.value":
                # Unwrap {"kwargs": {...}} -> {...}; keep other dicts as-is
                # but still serialize them as JSON, not Python repr.
                value = json.dumps(value.get("kwargs", value))
            elif key == "output.value":
                # Reshape LlamaIndex tool output into the flat schema other
                # frameworks emit: content / status / tool_call_id.
                value = json.dumps(
                    {
                        "content": value.get("raw_output"),
                        "status": "success"
                        if not value.get("is_error", False)
                        else "error",
                        "tool_call_id": value.get("tool_call_id"),
                    }
                )
        return str(value)

    def shutdown(self) -> None:
        """Called on processor shutdown - no cleanup needed."""
        pass

    def force_flush(self, timeout_millis: int = 30000) -> bool:
        """Force flush - always succeeds (nothing to flush)."""
        return True

src/uipath_llamaindex/_cli/cli_dev.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
from ._runtime._context import UiPathLlamaIndexRuntimeContext
1414
from ._runtime._runtime import UiPathLlamaIndexRuntime
15+
from ._tracing._attribute_normalizer import AttributeNormalizingSpanProcessor
1516

1617
console = ConsoleLogger()
1718

@@ -22,8 +23,10 @@ def llamaindex_dev_middleware(interface: Optional[str]) -> MiddlewareResult:
2223
try:
2324
if interface == "terminal":
2425
runtime_factory = UiPathRuntimeFactory(
25-
UiPathLlamaIndexRuntime, UiPathLlamaIndexRuntimeContext
26+
UiPathLlamaIndexRuntime,
27+
UiPathLlamaIndexRuntimeContext,
2628
)
29+
runtime_factory.add_span_processor(AttributeNormalizingSpanProcessor())
2730
runtime_factory.add_instrumentor(LlamaIndexInstrumentor, get_current_span)
2831
app = UiPathDevTerminal(runtime_factory)
2932
asyncio.run(app.run_async())
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
import asyncio
2+
from typing import List, Optional
3+
4+
from openinference.instrumentation.llama_index import (
5+
LlamaIndexInstrumentor,
6+
get_current_span,
7+
)
8+
from uipath._cli._evals._console_progress_reporter import ConsoleProgressReporter
9+
from uipath._cli._evals._progress_reporter import StudioWebProgressReporter
10+
from uipath._cli._evals._runtime import UiPathEvalContext, UiPathEvalRuntime
11+
from uipath._cli._runtime._contracts import (
12+
UiPathRuntimeFactory,
13+
)
14+
from uipath._cli._utils._eval_set import EvalHelpers
15+
from uipath._cli.middlewares import MiddlewareResult
16+
from uipath._events._event_bus import EventBus
17+
from uipath.eval._helpers import auto_discover_entrypoint
18+
19+
from ._runtime._context import UiPathLlamaIndexRuntimeContext
20+
from ._runtime._runtime import UiPathLlamaIndexRuntime
21+
from ._tracing._attribute_normalizer import AttributeNormalizingSpanProcessor
22+
from ._tracing._oteladapter import LlamaIndexExporter
23+
from ._utils._config import LlamaIndexConfig
24+
25+
26+
def llamaindex_eval_middleware(
    entrypoint: Optional[str], eval_set: Optional[str], eval_ids: List[str], **kwargs
) -> MiddlewareResult:
    """Middleware to handle LlamaIndex evaluation runs.

    Runs the UiPath eval runtime against a LlamaIndex agent when a
    llama_index.json config is present; otherwise defers to the next
    middleware in the chain.

    Args:
        entrypoint: Explicit agent entrypoint, or None to auto-discover.
        eval_set: Path/identifier of the eval set, or None to auto-discover.
        eval_ids: Specific evaluation ids to run (empty means all).
        **kwargs: Extra context options (e.g. register_progress_reporter),
            forwarded into the eval context.

    Returns:
        MiddlewareResult with should_continue=True when no LlamaIndex config
        exists, should_continue=False (with optional error_message) otherwise.
    """
    config = LlamaIndexConfig()
    if not config.exists:
        return MiddlewareResult(
            should_continue=True
        )  # Continue with normal flow if no llama_index.json

    try:
        event_bus = EventBus()
        # Optional Studio Web reporter: only wired up when the caller asks
        # for it; spans are shipped through the LlamaIndex exporter.
        if kwargs.get("register_progress_reporter", False):
            progress_reporter = StudioWebProgressReporter(
                spans_exporter=LlamaIndexExporter()
            )
            asyncio.run(progress_reporter.subscribe_to_eval_runtime_events(event_bus))
        # Console reporter is always attached so local runs show progress.
        console_reporter = ConsoleProgressReporter()
        asyncio.run(console_reporter.subscribe_to_eval_runtime_events(event_bus))

        def generate_runtime_context(
            context_entrypoint: str, **context_kwargs
        ) -> UiPathLlamaIndexRuntimeContext:
            # Fresh runtime context per eval run, pinned to this entrypoint
            # and the discovered LlamaIndex config.
            context = UiPathLlamaIndexRuntimeContext.with_defaults(**context_kwargs)
            context.entrypoint = context_entrypoint
            context.config = config
            return context

        runtime_entrypoint = entrypoint or auto_discover_entrypoint()

        eval_context = UiPathEvalContext.with_defaults(
            entrypoint=runtime_entrypoint, **kwargs
        )
        eval_context.eval_set = eval_set or EvalHelpers.auto_discover_eval_set()
        eval_context.eval_ids = eval_ids

        def generate_runtime(
            ctx: UiPathLlamaIndexRuntimeContext,
        ) -> UiPathLlamaIndexRuntime:
            return UiPathLlamaIndexRuntime(ctx)

        runtime_factory = UiPathRuntimeFactory(
            UiPathLlamaIndexRuntime,
            UiPathLlamaIndexRuntimeContext,
            context_generator=lambda **context_kwargs: generate_runtime_context(
                context_entrypoint=runtime_entrypoint,
                **context_kwargs,
            ),
            runtime_generator=generate_runtime,
        )
        # Normalize LlamaIndex tool-call span attributes before export,
        # matching what cli_run/cli_dev do.
        runtime_factory.add_span_processor(AttributeNormalizingSpanProcessor())

        # Export spans upstream only when running inside a job.
        if eval_context.job_id:
            runtime_factory.add_span_exporter(LlamaIndexExporter())

        runtime_factory.add_instrumentor(LlamaIndexInstrumentor, get_current_span)

        async def execute():
            # Run the eval set, then drain the event bus so reporters finish
            # before the loop is torn down.
            async with UiPathEvalRuntime.from_eval_context(
                factory=runtime_factory, context=eval_context, event_bus=event_bus
            ) as eval_runtime:
                await eval_runtime.execute()
                await event_bus.wait_for_all()

        asyncio.run(execute())
        return MiddlewareResult(should_continue=False)

    except Exception as e:
        # Surface the failure to the CLI instead of letting it bubble up.
        return MiddlewareResult(
            should_continue=False, error_message=f"Error running evaluation: {str(e)}"
        )

src/uipath_llamaindex/_cli/cli_run.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import asyncio
22
import logging
33
from os import environ as env
4-
from typing import Optional
4+
from typing import Any, Optional
55

66
from openinference.instrumentation.llama_index import (
77
LlamaIndexInstrumentor,
@@ -13,14 +13,15 @@
1313
from ._runtime._context import UiPathLlamaIndexRuntimeContext
1414
from ._runtime._exception import UiPathLlamaIndexRuntimeError
1515
from ._runtime._runtime import UiPathLlamaIndexRuntime
16+
from ._tracing._attribute_normalizer import AttributeNormalizingSpanProcessor
1617
from ._tracing._oteladapter import LlamaIndexExporter
1718
from ._utils._config import LlamaIndexConfig
1819

1920
logger = logging.getLogger(__name__)
2021

2122

2223
def llamaindex_run_middleware(
23-
entrypoint: Optional[str], input: Optional[str], resume: bool, **kwargs
24+
entrypoint: Optional[str], input: Optional[str], resume: bool, **kwargs: Any
2425
) -> MiddlewareResult:
2526
"""Middleware to handle LlamaIndex agent execution"""
2627

@@ -64,8 +65,10 @@ async def execute():
6465
env["UIPATH_REQUESTING_FEATURE"] = "llamaindex"
6566

6667
runtime_factory = UiPathRuntimeFactory(
67-
UiPathLlamaIndexRuntime, UiPathLlamaIndexRuntimeContext
68+
UiPathLlamaIndexRuntime,
69+
UiPathLlamaIndexRuntimeContext,
6870
)
71+
runtime_factory.add_span_processor(AttributeNormalizingSpanProcessor())
6972

7073
if context.job_id:
7174
runtime_factory.add_span_exporter(LlamaIndexExporter())

src/uipath_llamaindex/middlewares.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from uipath._cli.middlewares import Middlewares
22

33
from ._cli.cli_dev import llamaindex_dev_middleware
4+
from ._cli.cli_eval import llamaindex_eval_middleware
45
from ._cli.cli_init import llamaindex_init_middleware
56
from ._cli.cli_new import llamaindex_new_middleware
67
from ._cli.cli_run import llamaindex_run_middleware
@@ -12,3 +13,4 @@ def register_middleware():
1213
Middlewares.register("run", llamaindex_run_middleware)
1314
Middlewares.register("new", llamaindex_new_middleware)
1415
Middlewares.register("dev", llamaindex_dev_middleware)
16+
Middlewares.register("eval", llamaindex_eval_middleware)

0 commit comments

Comments
 (0)