fix(langchain): normalize tool definitions and tool_calls for Langfuse UI

udayshnk · udayshnk · commit d47bfefdbdb6 · 2026-03-19T12:46:48.000+05:30
1. Add _to_langfuse_tool() to convert the Anthropic (input_schema) and OpenAI (function wrapper) tool formats to the flat {name, description, parameters} shape. 2. Structure LLM input as {messages, tools} when tools are present, so that extractToolsFromObservation finds the definitions under the top-level tools key. 3. Convert AIMessage.tool_calls and invalid_tool_calls from {name, args, id} to {id, type, name, arguments}, with args serialized as a JSON string; invalid_tool_calls entries also keep their error field. Each converted list is assigned to message_dict only when it is non-empty. Fixes: langfuse/langfuse#11850
diff --git a/langfuse/langchain/CallbackHandler.py b/langfuse/langchain/CallbackHandler.py
@@ -1,3 +1,4 @@
+import json
 from contextvars import Token
 from typing import (
     Any,
@@ -35,6 +36,38 @@
 from langfuse.logger import langfuse_logger
 from langfuse.types import TraceContext
 
+
+def _to_langfuse_tool(tool: Any) -> Any:
+    """Normalize a tool definition to Langfuse's LLMToolDefinitionSchema: {name, description, parameters}.
+
+    Recognized provider serializations, each mapped to the flat canonical shape:
+    - Anthropic (ChatAnthropic): {name, description, input_schema}
+    - OpenAI / LiteLLM: {type: "function", function: {name, description, parameters}}
+
+    Args:
+        tool: A tool definition of any type; only dicts are inspected.
+    Returns:
+        A new flat dict for a recognized format; otherwise ``tool`` unchanged.
+    """
+    if not isinstance(tool, dict):
+        return tool
+    # OpenAI wrapper. A non-dict "function" payload falls through instead of raising.
+    fn = tool.get("function")
+    if tool.get("type") == "function" and isinstance(fn, dict):
+        return {
+            "name": fn.get("name", ""),
+            "description": fn.get("description", ""),
+            "parameters": fn.get("parameters", {}),
+        }
+    if "name" in tool and "input_schema" in tool:  # Anthropic format
+        return {
+            "name": tool["name"],
+            "description": tool.get("description", ""),
+            "parameters": tool["input_schema"],
+        }
+    return tool
+
+
 try:
     import langchain
 
@@ -841,9 +874,16 @@ def __on_llm_action(
         self._child_to_parent_run_id_map[run_id] = parent_run_id
 
         try:
+            observation_input: Any = prompts
             tools = kwargs.get("invocation_params", {}).get("tools", None)
             if tools and isinstance(tools, list):
-                prompts.extend([{"role": "tool", "content": tool} for tool in tools])
+                # Structure input as {messages, tools} so extractToolsFromObservation
+                # can find tool definitions at the top-level tools key — the canonical
+                # format expected by the backend's LLMToolDefinitionSchema.
+                observation_input = {
+                    "messages": prompts,
+                    "tools": [_to_langfuse_tool(t) for t in tools],
+                }
 
             model_name = self._parse_model_and_log_errors(
                 serialized=serialized, metadata=metadata, kwargs=kwargs
@@ -868,7 +908,7 @@ def __on_llm_action(
 
             content = {
                 "name": self.get_langchain_run_name(serialized, **kwargs),
-                "input": prompts,
+                "input": observation_input,
                 "metadata": self.__join_tags_and_metadata(
                     tags,
                     metadata,
@@ -1056,14 +1096,66 @@ def _convert_message_to_dict(self, message: BaseMessage) -> Dict[str, Any]:
                 and message.tool_calls is not None
                 and len(message.tool_calls) > 0
             ):
-                message_dict["tool_calls"] = message.tool_calls
+                # Convert LangChain's tool_calls format {name, args, id} to
+                # the flat ToolCallSchema: {id, type, name, arguments}.
+                # Langfuse's frontend ToolCallSchema expects the flat format with
+                # arguments as a JSON string.
+                converted_tool_calls = []
+                for tc in message.tool_calls:
+                    if not isinstance(tc, dict):
+                        langfuse_logger.debug(
+                            "Skipping tool_call entry that is not a dict: %s", tc
+                        )
+                        continue
+                    try:
+                        arguments = json.dumps(tc.get("args", {}))
+                    except (TypeError, ValueError) as e:
+                        langfuse_logger.debug(
+                            "Failed to serialize tool call args to JSON: %s", e
+                        )
+                        arguments = "{}"
+                    converted_tool_calls.append(
+                        {
+                            "id": tc.get("id", ""),
+                            "type": "function",
+                            "name": tc.get("name", ""),
+                            "arguments": arguments,
+                        }
+                    )
+                if converted_tool_calls:
+                    message_dict["tool_calls"] = converted_tool_calls
             
             if (
-                hasattr(message, "invalid_tool_calls") 
-                and message.invalid_tool_calls is not None 
+                hasattr(message, "invalid_tool_calls")
+                and message.invalid_tool_calls is not None
                 and len(message.invalid_tool_calls) > 0
             ):
-                message_dict["invalid_tool_calls"] = message.invalid_tool_calls
+                converted_invalid_tool_calls = []
+                for tc in message.invalid_tool_calls:
+                    if not isinstance(tc, dict):
+                        langfuse_logger.debug(
+                            "Skipping invalid_tool_call entry that is not a dict: %s",
+                            tc,
+                        )
+                        continue
+                    try:
+                        arguments = json.dumps(tc.get("args", {}))
+                    except (TypeError, ValueError) as e:
+                        langfuse_logger.debug(
+                            "Failed to serialize invalid tool call args to JSON: %s", e
+                        )
+                        arguments = "{}"
+                    converted_invalid_tool_calls.append(
+                        {
+                            "id": tc.get("id", ""),
+                            "type": "function",
+                            "name": tc.get("name", ""),
+                            "arguments": arguments,
+                            "error": tc.get("error", ""),
+                        }
+                    )
+                if converted_invalid_tool_calls:
+                    message_dict["invalid_tool_calls"] = converted_invalid_tool_calls
 
         elif isinstance(message, SystemMessage):
             message_dict = {"role": "system", "content": message.content}
diff --git a/tests/test_langchain.py b/tests/test_langchain.py
@@ -18,9 +18,58 @@
 
 from langfuse._client.client import Langfuse
 from langfuse.langchain import CallbackHandler
+from langfuse.langchain.CallbackHandler import _to_langfuse_tool
 from tests.utils import create_uuid, encode_file_to_base64, get_api
 
 
+# --- Unit tests for _to_langfuse_tool ---
+
+
+def test_to_langfuse_tool_openai_format():
+    """The OpenAI function wrapper is flattened to {name, description, parameters}."""
+    # The payload under "function" already has the canonical keys, so the
+    # expected result equals that payload with the wrapper removed.
+    flat = {
+        "name": "get_weather",
+        "description": "Get the weather",
+        "parameters": {"type": "object", "properties": {}},
+    }
+    wrapped = {
+        "type": "function",
+        "function": dict(flat),
+    }
+
+    assert _to_langfuse_tool(wrapped) == flat
+
+
+def test_to_langfuse_tool_anthropic_format():
+    """Anthropic's input_schema is exposed as parameters next to name and description."""
+    schema = {"type": "object", "properties": {}}
+    shared = {"name": "get_weather", "description": "Get the weather"}
+    anthropic_tool = {**shared, "input_schema": schema}
+
+    normalized = _to_langfuse_tool(anthropic_tool)
+
+    assert normalized == {
+        **shared,
+        "parameters": schema,
+    }
+
+
+def test_to_langfuse_tool_passthrough_unknown_dict():
+    """A dict matching neither provider format comes back unchanged."""
+    unrecognized = {"name": "my_tool", "custom_field": "value"}
+    assert _to_langfuse_tool(unrecognized) == unrecognized
+
+
+def test_to_langfuse_tool_passthrough_non_dict():
+    """Non-dict tool definitions are returned as-is."""
+    assert _to_langfuse_tool("not a dict") == "not a dict"
+
+
+# --- End unit tests ---
+
+
 def test_callback_generated_from_trace_chat():
     langfuse = Langfuse()
 
@@ -762,15 +811,17 @@ class GetWeather(BaseModel):
 
     for generation in generations:
         assert generation.input is not None
-        tool_messages = [msg for msg in generation.input if msg["role"] == "tool"]
-        assert len(tool_messages) == 2
-        assert any(
-            "standardize_address" == msg["content"]["function"]["name"]
-            for msg in tool_messages
-        )
-        assert any(
-            "get_weather" == msg["content"]["function"]["name"] for msg in tool_messages
-        )
+        # Input is structured as {messages, tools} for extractToolsFromObservation
+        assert "messages" in generation.input
+        assert "tools" in generation.input
+        tool_names = [t["name"] for t in generation.input["tools"]]
+        assert "standardize_address" in tool_names
+        assert "get_weather" in tool_names
+        # Each tool must conform to LLMToolDefinitionSchema
+        for t in generation.input["tools"]:
+            assert "name" in t
+            assert "description" in t
+            assert "parameters" in t
 
         assert generation.output is not None