# hermes-bsd/agent/tool_executor.py

"""Tool-call execution — sequential and concurrent dispatch.

Both AIAgent methods (``_execute_tool_calls_sequential`` and
``_execute_tool_calls_concurrent``) live here as module-level
functions that take the parent ``AIAgent`` as their first argument.

``run_agent`` keeps thin wrappers so existing call sites work; tests
that patch ``run_agent._set_interrupt`` are honored because the
extracted functions reach back through the ``run_agent`` module via
``_ra()`` for that symbol.
"""

from __future__ import annotations

import concurrent.futures
import json
import logging
import os
import random
import threading
import time
from typing import Any, Optional

from agent.display import (
    KawaiiSpinner,
    build_tool_preview as _build_tool_preview,
    get_cute_tool_message as _get_cute_tool_message_impl,
    get_tool_emoji as _get_tool_emoji,
    _detect_tool_failure,
)
from agent.tool_guardrails import ToolGuardrailDecision
from agent.tool_dispatch_helpers import (
    _is_destructive_command,
    _is_multimodal_tool_result,
    _multimodal_text_summary,
    _append_subdir_hint_to_multimodal,
    make_tool_result_message,
)
from tools.terminal_tool import (
    get_active_env,
)
from tools.thread_context import propagate_context_to_thread
from tools.tool_result_storage import (
    maybe_persist_tool_result,
    enforce_turn_budget,
)

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Maximum number of concurrent worker threads for parallel tool execution.
# Mirrors the constant in ``run_agent`` for tests/imports that look here.
# Used below as the upper bound on the thread pool's ``max_workers``.
_MAX_TOOL_WORKERS = 8


def _ra():
    """Import ``run_agent`` on demand and hand back the module object.

    The import happens at call time rather than when this file is loaded, so
    attributes patched onto ``run_agent`` (for example
    ``run_agent._set_interrupt``) are the ones looked up through the
    returned module.
    """
    import run_agent as _run_agent_module
    return _run_agent_module


def _emit_terminal_post_tool_call(
    agent,
    *,
    function_name: str,
    function_args: dict,
    result: Any,
    effective_task_id: str,
    tool_call_id: str,
    duration_ms: int = 0,
    status: str | None = None,
    error_type: str | None = None,
    error_message: str | None = None,
) -> None:
    """Forward one tool call's final outcome to the post-tool-call hook.

    The hook (``model_tools._emit_post_tool_call_hook``) is imported at call
    time.  Identifier fields are normalised so the hook never receives a
    falsy value other than ``""``: the task id, tool-call id, and the
    session/turn/API-request ids read off ``agent`` (missing attributes count
    as empty).  ``duration_ms``, ``status``, ``error_type`` and
    ``error_message`` are passed through untouched.

    Delivery is best-effort: any ``Exception`` raised while importing or
    calling the hook is swallowed, and the function always returns ``None``.
    """

    def _blank_if_falsy(value):
        # Collapse None / "" / other falsy ids to the empty string.
        return value or ""

    try:
        from model_tools import _emit_post_tool_call_hook as _post_hook

        hook_kwargs = {
            "function_name": function_name,
            "function_args": function_args,
            "result": result,
            "task_id": _blank_if_falsy(effective_task_id),
            "session_id": _blank_if_falsy(getattr(agent, "session_id", "")),
            "tool_call_id": _blank_if_falsy(tool_call_id),
            "turn_id": _blank_if_falsy(getattr(agent, "_current_turn_id", "")),
            "api_request_id": _blank_if_falsy(
                getattr(agent, "_current_api_request_id", "")
            ),
            "duration_ms": duration_ms,
            "status": status,
            "error_type": error_type,
            "error_message": error_message,
        }
        _post_hook(**hook_kwargs)
    except Exception:
        # Deliberately silent: a broken or missing hook must not affect
        # tool execution.
        pass


def _cancelled_tool_result(reason: str = "user interrupt") -> str:
    """Build the JSON string reported as the result of a cancelled tool.

    The payload has two keys: ``error`` (a sentence naming *reason*) and
    ``status`` (always ``"cancelled"``).  Non-ASCII characters in *reason*
    are emitted verbatim rather than escaped.
    """
    message = f"Tool execution cancelled by {reason}"
    payload = {"error": message, "status": "cancelled"}
    return json.dumps(payload, ensure_ascii=False)


def _emit_cancelled_terminal_post_tool_call(
    agent,
    *,
    function_name: str,
    function_args: dict,
    effective_task_id: str,
    tool_call_id: str,
    start_time: float,
    reason: str = "user interrupt",
    error_type: str = "keyboard_interrupt",
) -> str:
    """Report a cancelled tool call to the post-tool-call hook.

    Builds the cancellation payload for *reason*, emits it through
    ``_emit_terminal_post_tool_call`` with ``status="cancelled"`` and a
    duration measured from *start_time* (a ``time.time()`` timestamp) to now,
    and returns that same payload string so the caller can use it as the
    tool's result.
    """
    cancelled_payload = _cancelled_tool_result(reason)
    elapsed_ms = int((time.time() - start_time) * 1000)
    cancel_message = f"Tool execution cancelled by {reason}"
    _emit_terminal_post_tool_call(
        agent,
        function_name=function_name,
        function_args=function_args,
        result=cancelled_payload,
        effective_task_id=effective_task_id,
        tool_call_id=tool_call_id,
        duration_ms=elapsed_ms,
        status="cancelled",
        error_type=error_type,
        error_message=cancel_message,
    )
    return cancelled_payload


def _tool_search_scoped_names(agent) -> frozenset:
    """Return the deferrable tool names the session may invoke via tool_call.

    The Tool Search unwrap dispatches the underlying tool directly, bypassing
    the bridge branch (and its scope check) in
    ``model_tools.handle_function_call``. To keep a restricted-toolset session
    (subagent, kanban worker, curated gateway session) from reaching tools it
    was never granted, the unwrap validates the underlying name against this
    set: the deferrable subset of the session's own enabled/disabled toolset
    scope.

    A successful result is cached on the agent (``_tool_search_scope_cache``)
    and refreshed when the tool registry's generation or the agent's
    enabled/disabled toolsets change, so the common case is a cache hit, not
    a full tool-defs rebuild on every tool call.

    Failures are never cached: if the scope cannot be computed, this call
    fails closed (empty set, so the unwrap is denied) and the next call
    retries. Caching the empty set would turn one transient error into a
    session-long denial until the registry generation happened to change.

    Args:
        agent: The agent whose ``enabled_toolsets`` / ``disabled_toolsets``
            attributes define the scope (missing attributes read as ``None``).

    Returns:
        A frozenset of tool names; empty when the supporting modules cannot
        be imported or the scope computation raises.
    """
    try:
        import model_tools
        from tools import tool_search as _ts
        from tools.registry import registry as _registry
    except Exception:
        # Tool Search machinery unavailable: fail closed.
        return frozenset()

    enabled = getattr(agent, "enabled_toolsets", None)
    disabled = getattr(agent, "disabled_toolsets", None)
    cache_key = (
        getattr(_registry, "_generation", 0),
        frozenset(enabled) if enabled is not None else None,
        frozenset(disabled) if disabled is not None else None,
    )
    cached = getattr(agent, "_tool_search_scope_cache", None)
    if cached is not None and cached[0] == cache_key:
        return cached[1]
    try:
        scoped_defs = model_tools.get_tool_definitions(
            enabled_toolsets=enabled,
            disabled_toolsets=disabled,
            quiet_mode=True,
            skip_tool_search_assembly=True,
        ) or []
        names = _ts.scoped_deferrable_names(scoped_defs)
    except Exception as scope_err:
        # Deny this call but do NOT store the empty set in the cache, so a
        # transient failure is retried on the next tool_call.
        logger.warning(
            "Tool Search scope computation failed; denying unwrap for this call: %s",
            scope_err,
        )
        return frozenset()
    try:
        agent._tool_search_scope_cache = (cache_key, names)
    except Exception:
        # Agent object refuses attribute assignment; just skip caching.
        pass
    return names


def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
    """Execute multiple tool calls concurrently using a thread pool.

    Results are collected in the original tool-call order and appended to
    messages so the API sees them in the expected sequence.

    Phases, in order:

    1. Pre-flight interrupt check — if an interrupt is already pending, every
       call gets a "cancelled" tool message and nothing runs.
    2. Per-call parsing: JSON args, Tool Search unwrap + scope gate, plugin
       block, guardrail block, then checkpoint preflight for calls that will
       actually execute.
    3. Start logging/callbacks for the runnable calls.
    4. Thread-pool execution with a 5s polling loop that emits heartbeats and
       reacts to interrupts.
    5. Per-call post-processing (guardrail observation, callbacks, result
       persistence, subdirectory hints) and one tool message per call.
    6. Per-turn budget enforcement and final /steer injection.

    Args:
        agent: The parent ``AIAgent``; its callbacks, guardrails, checkpoint
            manager and interrupt state are used throughout.
        assistant_message: The assistant message whose ``tool_calls`` are run.
        messages: Conversation message list; mutated in place (one tool
            result message is appended per tool call).
        effective_task_id: Task id forwarded to tool invocation, hooks and
            the active-environment lookup.
        api_call_count: Not referenced in this function body.
    """
    tool_calls = assistant_message.tool_calls
    num_tools = len(tool_calls)

    # ── Pre-flight: interrupt check ──────────────────────────────────
    if agent._interrupt_requested:
        print(f"{agent.log_prefix}⚡ Interrupt: skipping {num_tools} tool call(s)")
        for tc in tool_calls:
            messages.append(make_tool_result_message(
                tc.function.name,
                f"[Tool execution cancelled — {tc.function.name} was skipped due to user interrupt]",
                tc.id,
            ))
        return

    # ── Parse args + pre-execution bookkeeping ───────────────────────
    # list of (tool_call, function_name, function_args, block_result, blocked_by_guardrail)
    parsed_calls = []
    for tool_call in tool_calls:
        function_name = tool_call.function.name

        # Reset nudge counters
        if function_name == "memory":
            agent._turns_since_memory = 0
        elif function_name == "skill_manage":
            agent._iters_since_skill = 0

        try:
            function_args = json.loads(tool_call.function.arguments)
        except json.JSONDecodeError:
            function_args = {}
        if not isinstance(function_args, dict):
            function_args = {}

        # ── Tool Search unwrap ────────────────────────────────────────
        # When the model invokes the tool_call bridge, peel it open so
        # every downstream check (checkpointing, guardrails, plugin
        # pre-tool-call hooks, the display/activity feed, the post-call
        # callback) sees the underlying tool — not the bridge. This is
        # the OpenClaw lesson: hooks must observe the real tool name.
        #
        # The original tool_call entry on ``tool_call.function`` is left
        # untouched so the conversation transcript and the matching
        # tool_call_id are preserved exactly as the model emitted them.
        #
        # Scope gate: the unwrap dispatches the underlying tool directly
        # (bypassing the bridge branch in handle_function_call and its
        # scope check), so we enforce session toolset scope HERE. A tool
        # the session was not granted is rejected before any checkpoint,
        # hook, or dispatch fires.
        _ts_scope_block = None
        try:
            from tools import tool_search as _ts
            if function_name == _ts.TOOL_CALL_NAME:
                _underlying, _underlying_args, _err = _ts.resolve_underlying_call(function_args)
                if not _err and _underlying:
                    if _underlying in _tool_search_scoped_names(agent):
                        function_name = _underlying
                        function_args = _underlying_args
                    else:
                        _ts_scope_block = json.dumps({
                            "error": (
                                f"'{_underlying}' is not available in this session. "
                                "Use tool_search to find tools you can call."
                            ),
                        }, ensure_ascii=False)
        except Exception:
            pass

        # ── Block evaluation (BEFORE checkpoint preflight) ───────────
        # We must know whether the tool will execute before touching
        # checkpoint state (dedup slot, real snapshots).
        block_result = None
        blocked_by_guardrail = False
        if _ts_scope_block is not None:
            # Out-of-scope tool_call: reject before hooks/guardrails/dispatch.
            block_result = _ts_scope_block
            _emit_terminal_post_tool_call(
                agent,
                function_name=function_name,
                function_args=function_args,
                result=block_result,
                effective_task_id=effective_task_id,
                tool_call_id=getattr(tool_call, "id", "") or "",
                status="blocked",
                error_type="tool_scope_block",
                error_message=_ts_scope_block,
            )
        else:
            try:
                from hermes_cli.plugins import get_pre_tool_call_block_message
                block_message = get_pre_tool_call_block_message(
                    function_name,
                    function_args,
                    task_id=effective_task_id or "",
                    session_id=getattr(agent, "session_id", "") or "",
                    tool_call_id=getattr(tool_call, "id", "") or "",
                    turn_id=getattr(agent, "_current_turn_id", "") or "",
                    api_request_id=getattr(agent, "_current_api_request_id", "") or "",
                )
            except Exception:
                block_message = None

            if block_message is not None:
                block_result = json.dumps({"error": block_message}, ensure_ascii=False)
                _emit_terminal_post_tool_call(
                    agent,
                    function_name=function_name,
                    function_args=function_args,
                    result=block_result,
                    effective_task_id=effective_task_id,
                    tool_call_id=getattr(tool_call, "id", "") or "",
                    status="blocked",
                    error_type="plugin_block",
                    error_message=block_message,
                )
            else:
                guardrail_decision = agent._tool_guardrails.before_call(function_name, function_args)
                if not guardrail_decision.allows_execution:
                    block_result = agent._guardrail_block_result(guardrail_decision)
                    blocked_by_guardrail = True
                    _emit_terminal_post_tool_call(
                        agent,
                        function_name=function_name,
                        function_args=function_args,
                        result=block_result,
                        effective_task_id=effective_task_id,
                        tool_call_id=getattr(tool_call, "id", "") or "",
                        status="blocked",
                        error_type="guardrail_block",
                        error_message=getattr(guardrail_decision, "message", None) or "Tool blocked by guardrail policy",
                    )

        # ── Checkpoint preflight (only for tools that will execute) ──
        if block_result is None:
            # Checkpoint for file-mutating tools
            if function_name in {"write_file", "patch"} and agent._checkpoint_mgr.enabled:
                try:
                    file_path = function_args.get("path", "")
                    if file_path:
                        work_dir = agent._checkpoint_mgr.get_working_dir_for_path(file_path)
                        agent._checkpoint_mgr.ensure_checkpoint(work_dir, f"before {function_name}")
                except Exception:
                    pass

            # Checkpoint before destructive terminal commands
            if function_name == "terminal" and agent._checkpoint_mgr.enabled:
                try:
                    cmd = function_args.get("command", "")
                    if _is_destructive_command(cmd):
                        cwd = function_args.get("workdir") or os.getenv("TERMINAL_CWD", os.getcwd())
                        agent._checkpoint_mgr.ensure_checkpoint(
                            cwd, f"before terminal: {cmd[:60]}"
                        )
                except Exception:
                    pass

        parsed_calls.append((tool_call, function_name, function_args, block_result, blocked_by_guardrail))

    # ── Logging / callbacks ──────────────────────────────────────────
    tool_names_str = ", ".join(name for _, name, _, _, _ in parsed_calls)
    if not agent.quiet_mode:
        print(f"  ⚡ Concurrent: {num_tools} tool calls — {tool_names_str}")
        for i, (tc, name, args, block_result, blocked_by_guardrail) in enumerate(parsed_calls, 1):
            args_str = json.dumps(args, ensure_ascii=False)
            if agent.verbose_logging:
                print(f"  📞 Tool {i}: {name}({list(args.keys())})")
                print(agent._wrap_verbose("Args: ", json.dumps(args, indent=2, ensure_ascii=False)))
            else:
                args_preview = args_str[:agent.log_prefix_chars] + "..." if len(args_str) > agent.log_prefix_chars else args_str
                print(f"  📞 Tool {i}: {name}({list(args.keys())}) - {args_preview}")

    # Blocked calls never start, so they get no "started" notifications.
    for tc, name, args, block_result, blocked_by_guardrail in parsed_calls:
        if block_result is not None:
            continue
        if agent.tool_progress_callback:
            try:
                preview = _build_tool_preview(name, args)
                agent.tool_progress_callback("tool.started", name, preview, args)
            except Exception as cb_err:
                logging.debug(f"Tool progress callback error: {cb_err}")

    for tc, name, args, block_result, blocked_by_guardrail in parsed_calls:
        if block_result is not None:
            continue
        if agent.tool_start_callback:
            try:
                agent.tool_start_callback(tc.id, name, args)
            except Exception as cb_err:
                logging.debug(f"Tool start callback error: {cb_err}")

    # ── Concurrent execution ─────────────────────────────────────────
    # Each slot holds (function_name, function_args, function_result, duration, error_flag, blocked_flag)
    results = [None] * num_tools
    # Pre-fill the slots of blocked calls; workers fill the rest by index.
    for i, (tc, name, args, block_result, blocked_by_guardrail) in enumerate(parsed_calls):
        if block_result is not None:
            results[i] = (name, args, block_result, 0.0, True, True)

    # Touch activity before launching workers so the gateway knows
    # we're executing tools (not stuck).
    agent._current_tool = tool_names_str
    agent._touch_activity(f"executing {num_tools} tools concurrently: {tool_names_str}")

    def _run_tool(index, tool_call, function_name, function_args):
        """Worker function executed in a thread."""
        # Register this worker tid so the agent can fan out an interrupt
        # to it — see AIAgent.interrupt().  Must happen first thing, and
        # must be paired with discard + clear in the finally block.
        _worker_tid = threading.current_thread().ident
        with agent._tool_worker_threads_lock:
            agent._tool_worker_threads.add(_worker_tid)
        # Race: if the agent was interrupted between fan-out (which
        # snapshotted an empty/earlier set) and our registration, apply
        # the interrupt to our own tid now so is_interrupted() inside
        # the tool returns True on the next poll.
        if agent._interrupt_requested:
            try:
                _ra()._set_interrupt(True, _worker_tid)
            except Exception:
                pass
        # Set the activity callback on THIS worker thread so
        # _wait_for_process (terminal commands) can fire heartbeats.
        # The callback is thread-local; the main thread's callback
        # is invisible to worker threads.
        try:
            from tools.environments.base import set_activity_callback
            set_activity_callback(agent._touch_activity)
        except Exception:
            pass
        # Approval/sudo callbacks (thread-local) and the agent turn's
        # ContextVars are propagated by propagate_context_to_thread() at the
        # submit site below (GHSA-qg5c-hvr5-hjgr, #13617).
        start = time.time()
        try:
            try:
                result = agent._invoke_tool(
                    function_name,
                    function_args,
                    effective_task_id,
                    tool_call.id,
                    messages=messages,
                    pre_tool_block_checked=True,
                )
            except KeyboardInterrupt:
                try:
                    agent.interrupt("keyboard interrupt")
                except Exception:
                    pass
                result = _emit_cancelled_terminal_post_tool_call(
                    agent,
                    function_name=function_name,
                    function_args=function_args,
                    effective_task_id=effective_task_id,
                    tool_call_id=getattr(tool_call, "id", "") or "",
                    start_time=start,
                )
                duration = time.time() - start
                logger.info("tool %s cancelled (%.2fs)", function_name, duration)
                results[index] = (function_name, function_args, result, duration, True, False)
                return
            except Exception as tool_error:
                result = f"Error executing tool '{function_name}': {tool_error}"
                logger.error("_invoke_tool raised for %s: %s", function_name, tool_error, exc_info=True)
            duration = time.time() - start
            is_error, _ = _detect_tool_failure(function_name, result)
            if is_error:
                # A result may be a multimodal dict rather than a string, and
                # slicing a dict raises TypeError — which would drop this
                # tool's result entirely.  Summarize to text first, the same
                # way the post-execution error log does.
                logger.info(
                    "tool %s failed (%.2fs): %s",
                    function_name, duration, _multimodal_text_summary(result)[:200],
                )
            else:
                logger.info("tool %s completed (%.2fs, %d chars)", function_name, duration, len(result))
            results[index] = (function_name, function_args, result, duration, is_error, False)
        finally:
            # Tear down worker-tid tracking.  Clear any interrupt bit we may
            # have set so the next task scheduled onto this recycled tid
            # starts with a clean slate.  This MUST be in a finally block
            # because BaseException subclasses (CancelledError, KeyboardInterrupt)
            # bypass ``except Exception`` and would otherwise leak the tid
            # into _interrupted_threads, poisoning the recycled thread.
            with agent._tool_worker_threads_lock:
                agent._tool_worker_threads.discard(_worker_tid)
            try:
                _ra()._set_interrupt(False, _worker_tid)
            except Exception:
                pass

    # Start spinner for CLI mode (skip when TUI handles tool progress)
    spinner = None
    if agent._should_emit_quiet_tool_messages() and agent._should_start_quiet_spinner():
        face = random.choice(KawaiiSpinner.get_waiting_faces())
        spinner = KawaiiSpinner(f"{face} ⚡ running {num_tools} tools concurrently", spinner_type='dots', print_fn=agent._print_fn)
        spinner.start()

    try:
        runnable_calls = [
            (i, tc, name, args)
            for i, (tc, name, args, block_result, blocked_by_guardrail) in enumerate(parsed_calls)
            if block_result is None
        ]
        futures = []
        # Tool name for each submitted future.  ``futures`` only contains the
        # runnable (non-blocked) calls, so its positions do NOT line up with
        # ``parsed_calls`` whenever an earlier call in the batch was blocked;
        # look names up through this map instead of by list position.
        future_names = {}
        if runnable_calls:
            max_workers = min(len(runnable_calls), _MAX_TOOL_WORKERS)
            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                for i, tc, name, args in runnable_calls:
                    # Propagate the agent turn's ContextVars (e.g.
                    # _approval_session_key) AND thread-local approval/sudo
                    # callbacks into the worker thread; clears callbacks on exit.
                    f = executor.submit(
                        propagate_context_to_thread(_run_tool), i, tc, name, args
                    )
                    futures.append(f)
                    future_names[f] = name

                # Wait for all to complete with periodic heartbeats so the
                # gateway's inactivity monitor doesn't kill us during long
                # concurrent tool batches. Also check for user interrupts
                # so we don't block indefinitely when the user sends /stop
                # or a new message during concurrent tool execution.
                _conc_start = time.time()
                _interrupt_logged = False
                while True:
                    done, not_done = concurrent.futures.wait(
                        futures, timeout=5.0,
                    )
                    if not not_done:
                        break

                    # Check for interrupt — the per-thread interrupt signal
                    # already causes individual tools (terminal, execute_code)
                    # to abort, but tools without interrupt checks (web_search,
                    # read_file) will run to completion. Cancel any futures
                    # that haven't started yet so we don't block on them.
                    if agent._interrupt_requested:
                        if not _interrupt_logged:
                            _interrupt_logged = True
                            agent._vprint(
                                f"{agent.log_prefix}⚡ Interrupt: cancelling "
                                f"{len(not_done)} pending concurrent tool(s)",
                                force=True,
                            )
                        for f in not_done:
                            f.cancel()
                        # Give already-running tools a moment to notice the
                        # per-thread interrupt signal and exit gracefully.
                        concurrent.futures.wait(not_done, timeout=3.0)
                        # NOTE(review): leaving the ``with`` block calls
                        # ``executor.shutdown(wait=True)``, so tools that are
                        # already running are still joined on exit; only the
                        # futures cancelled above are actually skipped.
                        break

                    _conc_elapsed = int(time.time() - _conc_start)
                    # Heartbeat every ~30s (6 × 5s poll intervals)
                    if _conc_elapsed > 0 and _conc_elapsed % 30 < 6:
                        _still_running = [
                            future_names[f]
                            for f in not_done
                            if f in future_names
                        ]
                        agent._touch_activity(
                            f"concurrent tools running ({_conc_elapsed}s, "
                            f"{len(not_done)} remaining: {', '.join(_still_running[:3])})"
                        )
    finally:
        if spinner:
            # Build a summary message for the spinner stop
            completed = sum(1 for r in results if r is not None)
            total_dur = sum(r[3] for r in results if r is not None)
            spinner.stop(f"⚡ {completed}/{num_tools} tools completed in {total_dur:.1f}s total")

    # ── Post-execution: display per-tool results ─────────────────────
    for i, (tc, name, args, block_result, blocked_by_guardrail) in enumerate(parsed_calls):
        r = results[i]
        blocked = False
        if r is None:
            # Tool was cancelled (interrupt) or thread didn't return
            if agent._interrupt_requested:
                function_result = f"[Tool execution cancelled — {name} was skipped due to user interrupt]"
                _emit_terminal_post_tool_call(
                    agent,
                    function_name=name,
                    function_args=args,
                    result=function_result,
                    effective_task_id=effective_task_id,
                    tool_call_id=getattr(tc, "id", "") or "",
                    status="cancelled",
                    error_type="keyboard_interrupt",
                    error_message="Tool execution cancelled by user interrupt",
                )
            else:
                function_result = f"Error executing tool '{name}': thread did not return a result"
                _emit_terminal_post_tool_call(
                    agent,
                    function_name=name,
                    function_args=args,
                    result=function_result,
                    effective_task_id=effective_task_id,
                    tool_call_id=getattr(tc, "id", "") or "",
                    status="error",
                    error_type="thread_missing_result",
                    error_message=function_result,
                )
            tool_duration = 0.0
        else:
            function_name, function_args, function_result, tool_duration, is_error, blocked = r

            if not blocked:
                function_result = agent._append_guardrail_observation(
                    function_name,
                    function_args,
                    function_result,
                    failed=is_error,
                )

            if is_error:
                _err_text = _multimodal_text_summary(function_result)
                result_preview = _err_text[:200] if len(_err_text) > 200 else _err_text
                logger.warning("Tool %s returned error (%.2fs): %s", function_name, tool_duration, result_preview)

            # Track file-mutation outcome for the turn-end verifier.
            # `blocked` calls never actually ran — don't let a guardrail
            # block count as either a failure or a success.
            if not blocked:
                try:
                    agent._record_file_mutation_result(
                        function_name, function_args, function_result, is_error,
                    )
                except Exception as _ver_err:
                    logging.debug("file-mutation verifier record failed: %s", _ver_err)

            if not blocked and agent.tool_progress_callback:
                try:
                    agent.tool_progress_callback(
                        "tool.completed", function_name, None, None,
                        duration=tool_duration, is_error=is_error,
                        result=function_result,
                    )
                except Exception as cb_err:
                    logging.debug(f"Tool progress callback error: {cb_err}")

            if agent.verbose_logging:
                logging.debug(f"Tool {function_name} completed in {tool_duration:.2f}s")
                logging.debug(f"Tool result ({len(function_result)} chars): {function_result}")

        # Print cute message per tool
        if agent._should_emit_quiet_tool_messages():
            cute_msg = _get_cute_tool_message_impl(name, args, tool_duration, result=function_result)
            agent._safe_print(f"  {cute_msg}")
        elif not agent.quiet_mode:
            _preview_str = _multimodal_text_summary(function_result)
            if agent.verbose_logging:
                print(f"  ✅ Tool {i+1} completed in {tool_duration:.2f}s")
                print(agent._wrap_verbose("Result: ", _preview_str))
            else:
                response_preview = _preview_str[:agent.log_prefix_chars] + "..." if len(_preview_str) > agent.log_prefix_chars else _preview_str
                print(f"  ✅ Tool {i+1} completed in {tool_duration:.2f}s - {response_preview}")

        agent._current_tool = None
        agent._touch_activity(f"tool completed: {name} ({tool_duration:.1f}s)")

        if not blocked and agent.tool_complete_callback:
            try:
                agent.tool_complete_callback(tc.id, name, args, function_result)
            except Exception as cb_err:
                logging.debug(f"Tool complete callback error: {cb_err}")

        # Multimodal results skip the persistence step and pass through as-is.
        function_result = maybe_persist_tool_result(
            content=function_result,
            tool_name=name,
            tool_use_id=tc.id,
            env=get_active_env(effective_task_id),
        ) if not _is_multimodal_tool_result(function_result) else function_result

        subdir_hints = agent._subdirectory_hints.check_tool_call(name, args)
        if subdir_hints:
            if _is_multimodal_tool_result(function_result):
                # Append the hint to the text summary part so the model
                # still sees it; don't touch the image blocks.
                _append_subdir_hint_to_multimodal(function_result, subdir_hints)
            else:
                function_result += subdir_hints

        # Unwrap _multimodal dicts to an OpenAI-style content list so any
        # vision-capable provider receives [{type:text},{type:image_url}]
        # rather than a raw Python dict.  The Anthropic adapter already
        # accepts content lists; vision-capable OpenAI-compatible servers
        # (mlx-vlm, GPT-4o, …) accept image_url in tool messages natively.
        # Text-only servers get a string-safe fallback here so a rejected
        # image tool result never poisons canonical session history.
        # String results pass through unchanged.
        _tool_content = agent._tool_result_content_for_active_model(name, function_result)
        messages.append(make_tool_result_message(name, _tool_content, tc.id))

        # ── Per-tool /steer drain ───────────────────────────────────
        # Same as the sequential path: drain between each collected
        # result so the steer lands as early as possible.
        agent._apply_pending_steer_to_tool_results(messages, 1)

    # ── Per-turn aggregate budget enforcement ─────────────────────────
    num_tools = len(parsed_calls)
    if num_tools > 0:
        turn_tool_msgs = messages[-num_tools:]
        enforce_turn_budget(turn_tool_msgs, env=get_active_env(effective_task_id))

    # ── /steer injection ──────────────────────────────────────────────
    # Append any pending user steer text to the last tool result so the
    # agent sees it on its next iteration. Runs AFTER budget enforcement
    # so the steer marker is never truncated. See steer() for details.
    if num_tools > 0:
        agent._apply_pending_steer_to_tool_results(messages, num_tools)


def execute_tool_calls_sequential(agent, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
    """Execute the assistant's tool calls one at a time, in order.

    This is the original (non-concurrent) behavior, used for single calls or
    interactive tools.  For each entry in ``assistant_message.tool_calls`` a
    ``role="tool"`` message is appended to ``messages``: the real result, a
    plugin/scope/guardrail block error, or a "skipped" placeholder when an
    interrupt is observed before the tool starts.

    Per-call pipeline:

    1. Interrupt pre-flight (skip this and all remaining calls).
    2. Parse JSON arguments (non-dict / invalid JSON degrade to ``{}``).
    3. Tool Search unwrap with session scope gate.
    4. Plugin block check, then tool-loop guardrail check.
    5. Start callbacks and checkpoints (only when not blocked).
    6. Dispatch: inline agent-runtime tools, context-engine tools, memory
       provider tools, or ``run_agent.handle_function_call`` via ``_ra()``.
    7. Post hooks, guardrail observation, logging, completion callbacks.
    8. Result persistence, subdirectory hints, message append, steer drain.

    After the loop the per-turn aggregate budget is enforced over the last
    ``len(tool_calls)`` messages and any pending steer text is applied.

    Args:
        agent: Parent ``AIAgent`` whose state, callbacks and helpers are used.
        assistant_message: Assistant message carrying ``tool_calls``; each
            call exposes ``id``, ``function.name`` and ``function.arguments``.
        messages: Conversation history; mutated in place.
        effective_task_id: Task id forwarded to hooks, dispatch and
            ``get_active_env``.
        api_call_count: Part of the signature for callers; not read here.

    Raises:
        KeyboardInterrupt: Re-raised from registry dispatch after a cancelled
            post-tool-call event is emitted and ``agent.interrupt`` is tried.
        Exception: Anything raised by ``agent._dispatch_delegate_task``
            propagates (that branch has cleanup but no ``except`` clause).
    """
    for i, tool_call in enumerate(assistant_message.tool_calls, 1):
        # SAFETY: check interrupt BEFORE starting each tool.
        # If the user sent "stop" during a previous tool's execution,
        # do NOT start any more tools -- skip them all immediately.
        if agent._interrupt_requested:
            remaining_calls = assistant_message.tool_calls[i-1:]
            if remaining_calls:
                agent._vprint(f"{agent.log_prefix}⚡ Interrupt: skipping {len(remaining_calls)} tool call(s)", force=True)
            for skipped_tc in remaining_calls:
                skipped_name = skipped_tc.function.name
                skip_msg = {
                    "role": "tool",
                    "name": skipped_name,
                    "content": f"[Tool execution cancelled — {skipped_name} was skipped due to user interrupt]",
                    "tool_call_id": skipped_tc.id,
                }
                messages.append(skip_msg)
            break

        function_name = tool_call.function.name

        try:
            function_args = json.loads(tool_call.function.arguments)
        except json.JSONDecodeError as e:
            logger.warning("Unexpected JSON error after validation: %s", e)
            function_args = {}
        if not isinstance(function_args, dict):
            function_args = {}

        # Tool Search unwrap — see execute_tool_calls_concurrent for full
        # rationale, including the scope gate (the unwrap dispatches the
        # underlying tool directly, so session toolset scope is enforced here).
        _ts_scope_block: Optional[str] = None
        try:
            from tools import tool_search as _ts
            if function_name == _ts.TOOL_CALL_NAME:
                _underlying, _underlying_args, _err = _ts.resolve_underlying_call(function_args)
                if not _err and _underlying:
                    if _underlying in _tool_search_scoped_names(agent):
                        function_name = _underlying
                        function_args = _underlying_args
                    else:
                        _ts_scope_block = (
                            f"'{_underlying}' is not available in this session. "
                            "Use tool_search to find tools you can call."
                        )
        except Exception as ts_err:
            # Best-effort: on any failure the call is dispatched under its
            # original name.  Leave a trace instead of swallowing silently.
            logger.debug("Tool Search unwrap skipped for %s: %s", function_name, ts_err)

        # Check plugin hooks for a block directive before executing.
        _block_msg: Optional[str] = None
        _block_error_type = "plugin_block"
        if _ts_scope_block is not None:
            _block_msg = _ts_scope_block
            _block_error_type = "tool_scope_block"
        else:
            try:
                from hermes_cli.plugins import get_pre_tool_call_block_message
                _block_msg = get_pre_tool_call_block_message(
                    function_name,
                    function_args,
                    task_id=effective_task_id or "",
                    session_id=getattr(agent, "session_id", "") or "",
                    tool_call_id=getattr(tool_call, "id", "") or "",
                    turn_id=getattr(agent, "_current_turn_id", "") or "",
                    api_request_id=getattr(agent, "_current_api_request_id", "") or "",
                )
            except Exception as hook_err:
                # Fails open: a failing block check leaves _block_msg as None
                # and the tool proceeds.  Record why the check was skipped.
                logger.debug("pre_tool_call block check failed for %s: %s", function_name, hook_err)

        # The guardrail is only consulted when no plugin/scope block applies.
        _guardrail_block_decision: ToolGuardrailDecision | None = None
        if _block_msg is None:
            guardrail_decision = agent._tool_guardrails.before_call(function_name, function_args)
            if not guardrail_decision.allows_execution:
                _guardrail_block_decision = guardrail_decision

        _execution_blocked = _block_msg is not None or _guardrail_block_decision is not None

        if _execution_blocked:
            # Tool blocked by plugin or guardrail policy — skip counters,
            # callbacks, checkpointing, activity mutation, and real execution.
            pass
        # Reset nudge counters when the relevant tool is actually used
        elif function_name == "memory":
            agent._turns_since_memory = 0
        elif function_name == "skill_manage":
            agent._iters_since_skill = 0

        if not agent.quiet_mode:
            args_str = json.dumps(function_args, ensure_ascii=False)
            if agent.verbose_logging:
                print(f"  📞 Tool {i}: {function_name}({list(function_args.keys())})")
                print(agent._wrap_verbose("Args: ", json.dumps(function_args, indent=2, ensure_ascii=False)))
            else:
                args_preview = args_str[:agent.log_prefix_chars] + "..." if len(args_str) > agent.log_prefix_chars else args_str
                print(f"  📞 Tool {i}: {function_name}({list(function_args.keys())}) - {args_preview}")

        if not _execution_blocked:
            agent._current_tool = function_name
            agent._touch_activity(f"executing tool: {function_name}")

        # Set activity callback for long-running tool execution (terminal
        # commands, etc.) so the gateway's inactivity monitor doesn't kill
        # the agent while a command is running.
        if not _execution_blocked:
            try:
                from tools.environments.base import set_activity_callback
                set_activity_callback(agent._touch_activity)
            except Exception as act_err:
                logger.debug("set_activity_callback failed: %s", act_err)

        if not _execution_blocked and agent.tool_progress_callback:
            try:
                preview = _build_tool_preview(function_name, function_args)
                agent.tool_progress_callback("tool.started", function_name, preview, function_args)
            except Exception as cb_err:
                logger.debug("Tool progress callback error: %s", cb_err)

        if not _execution_blocked and agent.tool_start_callback:
            try:
                agent.tool_start_callback(tool_call.id, function_name, function_args)
            except Exception as cb_err:
                logger.debug("Tool start callback error: %s", cb_err)

        # Checkpoint: snapshot working dir before file-mutating tools
        if not _execution_blocked and function_name in {"write_file", "patch"} and agent._checkpoint_mgr.enabled:
            try:
                file_path = function_args.get("path", "")
                if file_path:
                    work_dir = agent._checkpoint_mgr.get_working_dir_for_path(file_path)
                    agent._checkpoint_mgr.ensure_checkpoint(
                        work_dir, f"before {function_name}"
                    )
            except Exception as ckpt_err:
                # never block tool execution
                logger.debug("Checkpoint before %s failed: %s", function_name, ckpt_err)

        # Checkpoint before destructive terminal commands
        if not _execution_blocked and function_name == "terminal" and agent._checkpoint_mgr.enabled:
            try:
                cmd = function_args.get("command", "")
                if _is_destructive_command(cmd):
                    cwd = function_args.get("workdir") or os.getenv("TERMINAL_CWD", os.getcwd())
                    agent._checkpoint_mgr.ensure_checkpoint(
                        cwd, f"before terminal: {cmd[:60]}"
                    )
            except Exception as ckpt_err:
                # never block tool execution
                logger.debug("Checkpoint before terminal command failed: %s", ckpt_err)

        tool_start_time = time.time()

        if _block_msg is not None:
            # Tool blocked by plugin policy — return error without executing.
            function_result = json.dumps({"error": _block_msg}, ensure_ascii=False)
            tool_duration = 0.0
            _emit_terminal_post_tool_call(
                agent,
                function_name=function_name,
                function_args=function_args,
                result=function_result,
                effective_task_id=effective_task_id,
                tool_call_id=getattr(tool_call, "id", "") or "",
                status="blocked",
                error_type=_block_error_type,
                error_message=_block_msg,
            )
        elif _guardrail_block_decision is not None:
            # Tool blocked by tool-loop guardrail — synthesize exactly one
            # tool result for the original tool_call_id without executing.
            function_result = agent._guardrail_block_result(_guardrail_block_decision)
            tool_duration = 0.0
            _emit_terminal_post_tool_call(
                agent,
                function_name=function_name,
                function_args=function_args,
                result=function_result,
                effective_task_id=effective_task_id,
                tool_call_id=getattr(tool_call, "id", "") or "",
                status="blocked",
                error_type="guardrail_block",
                error_message=getattr(_guardrail_block_decision, "message", None) or "Tool blocked by guardrail policy",
            )
        elif function_name == "todo":
            from tools.todo_tool import todo_tool as _todo_tool
            function_result = _todo_tool(
                todos=function_args.get("todos"),
                merge=function_args.get("merge", False),
                store=agent._todo_store,
            )
            tool_duration = time.time() - tool_start_time
            if agent._should_emit_quiet_tool_messages():
                agent._vprint(f"  {_get_cute_tool_message_impl('todo', function_args, tool_duration, result=function_result)}")
        elif function_name == "session_search":
            session_db = agent._get_session_db_for_recall()
            if not session_db:
                from hermes_state import format_session_db_unavailable
                function_result = json.dumps({"success": False, "error": format_session_db_unavailable()})
            else:
                from tools.session_search_tool import session_search as _session_search
                function_result = _session_search(
                    query=function_args.get("query", ""),
                    role_filter=function_args.get("role_filter"),
                    limit=function_args.get("limit", 3),
                    session_id=function_args.get("session_id"),
                    around_message_id=function_args.get("around_message_id"),
                    window=function_args.get("window", 5),
                    sort=function_args.get("sort"),
                    db=session_db,
                    current_session_id=agent.session_id,
                )
            tool_duration = time.time() - tool_start_time
            if agent._should_emit_quiet_tool_messages():
                agent._vprint(f"  {_get_cute_tool_message_impl('session_search', function_args, tool_duration, result=function_result)}")
        elif function_name == "memory":
            target = function_args.get("target", "memory")
            from tools.memory_tool import memory_tool as _memory_tool
            function_result = _memory_tool(
                action=function_args.get("action"),
                target=target,
                content=function_args.get("content"),
                old_text=function_args.get("old_text"),
                store=agent._memory_store,
            )
            # Bridge: notify external memory provider of built-in memory writes
            if agent._memory_manager and function_args.get("action") in {"add", "replace"}:
                try:
                    agent._memory_manager.on_memory_write(
                        function_args.get("action", ""),
                        target,
                        function_args.get("content", ""),
                        metadata=agent._build_memory_write_metadata(
                            task_id=effective_task_id,
                            tool_call_id=getattr(tool_call, "id", None),
                        ),
                    )
                except Exception as bridge_err:
                    # Best-effort notification; the built-in write already ran.
                    logger.debug("memory_manager.on_memory_write failed: %s", bridge_err)
            tool_duration = time.time() - tool_start_time
            if agent._should_emit_quiet_tool_messages():
                agent._vprint(f"  {_get_cute_tool_message_impl('memory', function_args, tool_duration, result=function_result)}")
        elif function_name == "clarify":
            from tools.clarify_tool import clarify_tool as _clarify_tool
            function_result = _clarify_tool(
                question=function_args.get("question", ""),
                choices=function_args.get("choices"),
                callback=agent.clarify_callback,
            )
            tool_duration = time.time() - tool_start_time
            if agent._should_emit_quiet_tool_messages():
                agent._vprint(f"  {_get_cute_tool_message_impl('clarify', function_args, tool_duration, result=function_result)}")
        elif function_name == "delegate_task":
            tasks_arg = function_args.get("tasks")
            if tasks_arg and isinstance(tasks_arg, list):
                spinner_label = f"🔀 delegating {len(tasks_arg)} tasks · (/agents to monitor)"
            else:
                goal_preview = (function_args.get("goal") or "")[:30]
                spinner_label = (
                    f"🔀 {goal_preview} · (/agents to monitor)"
                    if goal_preview
                    else "🔀 delegating · (/agents to monitor)"
                )
            spinner = None
            if agent._should_emit_quiet_tool_messages() and agent._should_start_quiet_spinner():
                face = random.choice(KawaiiSpinner.get_waiting_faces())
                spinner = KawaiiSpinner(f"{face} {spinner_label}", spinner_type='dots', print_fn=agent._print_fn)
                spinner.start()
            agent._delegate_spinner = spinner
            _delegate_result = None
            # No ``except`` here: a dispatch failure propagates to the caller
            # once the spinner has been stopped and the handle cleared.
            try:
                function_result = agent._dispatch_delegate_task(function_args)
                _delegate_result = function_result
            finally:
                agent._delegate_spinner = None
                tool_duration = time.time() - tool_start_time
                cute_msg = _get_cute_tool_message_impl('delegate_task', function_args, tool_duration, result=_delegate_result)
                if spinner:
                    spinner.stop(cute_msg)
                elif agent._should_emit_quiet_tool_messages():
                    agent._vprint(f"  {cute_msg}")
        elif agent._context_engine_tool_names and function_name in agent._context_engine_tool_names:
            # Context engine tools (lcm_grep, lcm_describe, lcm_expand, etc.)
            # NOTE(review): unlike the sibling spinner branches, this one does
            # not also check _should_start_quiet_spinner() — confirm intended.
            spinner = None
            if agent._should_emit_quiet_tool_messages():
                face = random.choice(KawaiiSpinner.get_waiting_faces())
                emoji = _get_tool_emoji(function_name)
                preview = _build_tool_preview(function_name, function_args) or function_name
                spinner = KawaiiSpinner(f"{face} {emoji} {preview}", spinner_type='dots', print_fn=agent._print_fn)
                spinner.start()
            _ce_result = None
            try:
                function_result = agent.context_compressor.handle_tool_call(function_name, function_args, messages=messages)
                _ce_result = function_result
            except Exception as tool_error:
                function_result = json.dumps({"error": f"Context engine tool '{function_name}' failed: {tool_error}"})
                logger.error("context_engine.handle_tool_call raised for %s: %s", function_name, tool_error, exc_info=True)
            finally:
                tool_duration = time.time() - tool_start_time
                cute_msg = _get_cute_tool_message_impl(function_name, function_args, tool_duration, result=_ce_result)
                if spinner:
                    spinner.stop(cute_msg)
                elif agent._should_emit_quiet_tool_messages():
                    agent._vprint(f"  {cute_msg}")
        elif agent._memory_manager and agent._memory_manager.has_tool(function_name):
            # Memory provider tools (hindsight_retain, honcho_search, etc.)
            # These are not in the tool registry — route through MemoryManager.
            spinner = None
            if agent._should_emit_quiet_tool_messages() and agent._should_start_quiet_spinner():
                face = random.choice(KawaiiSpinner.get_waiting_faces())
                emoji = _get_tool_emoji(function_name)
                preview = _build_tool_preview(function_name, function_args) or function_name
                spinner = KawaiiSpinner(f"{face} {emoji} {preview}", spinner_type='dots', print_fn=agent._print_fn)
                spinner.start()
            _mem_result = None
            try:
                function_result = agent._memory_manager.handle_tool_call(function_name, function_args)
                _mem_result = function_result
            except Exception as tool_error:
                function_result = json.dumps({"error": f"Memory tool '{function_name}' failed: {tool_error}"})
                logger.error("memory_manager.handle_tool_call raised for %s: %s", function_name, tool_error, exc_info=True)
            finally:
                tool_duration = time.time() - tool_start_time
                cute_msg = _get_cute_tool_message_impl(function_name, function_args, tool_duration, result=_mem_result)
                if spinner:
                    spinner.stop(cute_msg)
                elif agent._should_emit_quiet_tool_messages():
                    agent._vprint(f"  {cute_msg}")
        elif agent.quiet_mode:
            # Registry dispatch with optional spinner / cute message output.
            spinner = None
            if agent._should_emit_quiet_tool_messages() and agent._should_start_quiet_spinner():
                face = random.choice(KawaiiSpinner.get_waiting_faces())
                emoji = _get_tool_emoji(function_name)
                preview = _build_tool_preview(function_name, function_args) or function_name
                spinner = KawaiiSpinner(f"{face} {emoji} {preview}", spinner_type='dots', print_fn=agent._print_fn)
                spinner.start()
            _spinner_result = None
            try:
                # The pre-tool-call hook was already consulted above, hence
                # skip_pre_tool_call_hook=True.
                function_result = _ra().handle_function_call(
                    function_name, function_args, effective_task_id,
                    tool_call_id=tool_call.id,
                    session_id=agent.session_id or "",
                    turn_id=getattr(agent, "_current_turn_id", "") or "",
                    api_request_id=getattr(agent, "_current_api_request_id", "") or "",
                    enabled_tools=list(agent.valid_tool_names) if agent.valid_tool_names else None,
                    skip_pre_tool_call_hook=True,
                    enabled_toolsets=getattr(agent, "enabled_toolsets", None),
                    disabled_toolsets=getattr(agent, "disabled_toolsets", None),
                )
                _spinner_result = function_result
            except KeyboardInterrupt:
                function_result = _emit_cancelled_terminal_post_tool_call(
                    agent,
                    function_name=function_name,
                    function_args=function_args,
                    effective_task_id=effective_task_id,
                    tool_call_id=getattr(tool_call, "id", "") or "",
                    start_time=tool_start_time,
                )
                _spinner_result = function_result
                try:
                    agent.interrupt("keyboard interrupt")
                except Exception:
                    # Best-effort; the KeyboardInterrupt is re-raised below.
                    pass
                raise
            except Exception as tool_error:
                function_result = f"Error executing tool '{function_name}': {tool_error}"
                logger.error("handle_function_call raised for %s: %s", function_name, tool_error, exc_info=True)
            finally:
                tool_duration = time.time() - tool_start_time
                cute_msg = _get_cute_tool_message_impl(function_name, function_args, tool_duration, result=_spinner_result)
                if spinner:
                    spinner.stop(cute_msg)
                elif agent._should_emit_quiet_tool_messages():
                    agent._vprint(f"  {cute_msg}")
        else:
            # Registry dispatch without spinner output (non-quiet mode).
            try:
                function_result = _ra().handle_function_call(
                    function_name, function_args, effective_task_id,
                    tool_call_id=tool_call.id,
                    session_id=agent.session_id or "",
                    turn_id=getattr(agent, "_current_turn_id", "") or "",
                    api_request_id=getattr(agent, "_current_api_request_id", "") or "",
                    enabled_tools=list(agent.valid_tool_names) if agent.valid_tool_names else None,
                    skip_pre_tool_call_hook=True,
                    enabled_toolsets=getattr(agent, "enabled_toolsets", None),
                    disabled_toolsets=getattr(agent, "disabled_toolsets", None),
                )
            except KeyboardInterrupt:
                _emit_cancelled_terminal_post_tool_call(
                    agent,
                    function_name=function_name,
                    function_args=function_args,
                    effective_task_id=effective_task_id,
                    tool_call_id=getattr(tool_call, "id", "") or "",
                    start_time=tool_start_time,
                )
                try:
                    agent.interrupt("keyboard interrupt")
                except Exception:
                    # Best-effort; the KeyboardInterrupt is re-raised below.
                    pass
                raise
            except Exception as tool_error:
                function_result = f"Error executing tool '{function_name}': {tool_error}"
                logger.error("handle_function_call raised for %s: %s", function_name, tool_error, exc_info=True)
            tool_duration = time.time() - tool_start_time

        if isinstance(function_result, str):
            # Slicing past the end is a no-op, so no explicit length check.
            result_preview = function_result if agent.verbose_logging else function_result[:200]
            _result_len = len(function_result)
        else:
            # Multimodal dict result (_multimodal=True) — not sliceable as string
            result_preview = function_result
            _result_len = len(str(function_result))

        # Log tool errors to the persistent error log so [error] tags
        # in the UI always have a corresponding detailed entry on disk.
        _is_error_result, _ = _detect_tool_failure(function_name, function_result)
        # The agent-runtime tools above (todo, session_search, memory,
        # context-engine, memory-manager, clarify, delegate_task) are
        # dispatched inline — they never reach handle_function_call, so the
        # executor is the one that has to fire post_tool_call. For
        # registry-dispatched tools the else-branch above invoked
        # handle_function_call, which already fires the hook.
        from agent.agent_runtime_helpers import agent_runtime_owns_post_tool_hook
        _executor_must_emit_post_hook = (
            not _execution_blocked
            and agent_runtime_owns_post_tool_hook(agent, function_name)
        )
        if _executor_must_emit_post_hook:
            _emit_terminal_post_tool_call(
                agent,
                function_name=function_name,
                function_args=function_args,
                result=function_result,
                effective_task_id=effective_task_id,
                tool_call_id=getattr(tool_call, "id", "") or "",
                duration_ms=int(tool_duration * 1000),
            )
        if not _execution_blocked:
            function_result = agent._append_guardrail_observation(
                function_name,
                function_args,
                function_result,
                failed=_is_error_result,
            )
            # Refresh the preview from the (possibly amended) result.  Apply
            # the same str guard as the first preview so a multimodal dict
            # is never sliced.
            if isinstance(function_result, str) and not agent.verbose_logging:
                result_preview = function_result[:200]
            else:
                result_preview = function_result
        if _is_error_result:
            logger.warning("Tool %s returned error (%.2fs): %s", function_name, tool_duration, result_preview)
        else:
            logger.info("tool %s completed (%.2fs, %d chars)", function_name, tool_duration, _result_len)

        # Track file-mutation outcome for the turn-end verifier.  See
        # the concurrent path for the rationale; both paths must feed
        # the same state so the footer reflects every tool call in the
        # turn, not just the parallel ones.
        if not _execution_blocked:
            try:
                agent._record_file_mutation_result(
                    function_name, function_args, function_result, _is_error_result,
                )
            except Exception as _ver_err:
                logger.debug("file-mutation verifier record failed: %s", _ver_err)

        if not _execution_blocked and agent.tool_progress_callback:
            try:
                agent.tool_progress_callback(
                    "tool.completed", function_name, None, None,
                    duration=tool_duration, is_error=_is_error_result,
                    result=function_result,
                )
            except Exception as cb_err:
                logger.debug("Tool progress callback error: %s", cb_err)

        # Runs for blocked calls too: clears the current-tool marker and
        # records activity regardless of whether the tool actually executed.
        agent._current_tool = None
        agent._touch_activity(f"tool completed: {function_name} ({tool_duration:.1f}s)")

        if agent.verbose_logging:
            logger.debug("Tool %s completed in %.2fs", function_name, tool_duration)
            _log_result = _multimodal_text_summary(function_result)
            logger.debug("Tool result (%d chars): %s", len(_log_result), _log_result)

        if not _execution_blocked and agent.tool_complete_callback:
            try:
                agent.tool_complete_callback(tool_call.id, function_name, function_args, function_result)
            except Exception as cb_err:
                logger.debug("Tool complete callback error: %s", cb_err)

        # Multimodal results bypass persistence and are kept as-is.
        function_result = maybe_persist_tool_result(
            content=function_result,
            tool_name=function_name,
            tool_use_id=tool_call.id,
            env=get_active_env(effective_task_id),
        ) if not _is_multimodal_tool_result(function_result) else function_result

        # Discover subdirectory context files from tool arguments
        subdir_hints = agent._subdirectory_hints.check_tool_call(function_name, function_args)
        if subdir_hints:
            if _is_multimodal_tool_result(function_result):
                _append_subdir_hint_to_multimodal(function_result, subdir_hints)
            else:
                function_result += subdir_hints

        # Unwrap _multimodal dicts to an OpenAI-style content list
        # (see parallel path for rationale). String results pass through.
        _tool_content = agent._tool_result_content_for_active_model(function_name, function_result)
        messages.append(make_tool_result_message(function_name, _tool_content, tool_call.id))

        # ── Per-tool /steer drain ───────────────────────────────────
        # Drain pending steer BETWEEN individual tool calls so the
        # injection lands as soon as a tool finishes — not after the
        # entire batch.  The model sees it on the next API iteration.
        agent._apply_pending_steer_to_tool_results(messages, 1)

        if not agent.quiet_mode:
            if agent.verbose_logging:
                print(f"  ✅ Tool {i} completed in {tool_duration:.2f}s")
                print(agent._wrap_verbose("Result: ", function_result))
            else:
                _fr_str = function_result if isinstance(function_result, str) else str(function_result)
                response_preview = _fr_str[:agent.log_prefix_chars] + "..." if len(_fr_str) > agent.log_prefix_chars else _fr_str
                print(f"  ✅ Tool {i} completed in {tool_duration:.2f}s - {response_preview}")

        # Interrupt arrived while this tool ran: answer every not-yet-started
        # call with a placeholder so each tool_call_id still gets a result.
        if agent._interrupt_requested and i < len(assistant_message.tool_calls):
            remaining = len(assistant_message.tool_calls) - i
            agent._vprint(f"{agent.log_prefix}⚡ Interrupt: skipping {remaining} remaining tool call(s)", force=True)
            for skipped_tc in assistant_message.tool_calls[i:]:
                skipped_name = skipped_tc.function.name
                messages.append(make_tool_result_message(
                    skipped_name,
                    f"[Tool execution skipped — {skipped_name} was not started. User sent a new message]",
                    skipped_tc.id,
                ))
            break

        # Optional pacing between calls; never sleeps after the last one.
        if agent.tool_delay > 0 and i < len(assistant_message.tool_calls):
            time.sleep(agent.tool_delay)

    # ── Per-turn aggregate budget enforcement ─────────────────────────
    num_tools_seq = len(assistant_message.tool_calls)
    if num_tools_seq > 0:
        enforce_turn_budget(messages[-num_tools_seq:], env=get_active_env(effective_task_id))

    # ── /steer injection ──────────────────────────────────────────────
    # See execute_tool_calls_concurrent for the rationale. Same hook,
    # applied to sequential execution as well.
    if num_tools_seq > 0:
        agent._apply_pending_steer_to_tool_results(messages, num_tools_seq)


# Public API of this module: the two tool-dispatch entry points (concurrent
# and sequential) that take the parent ``AIAgent`` as their first argument.
__all__ = [
    "execute_tool_calls_concurrent",
    "execute_tool_calls_sequential",
]
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working. _set_interrupt likewise.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								"""Tool-call execution — sequential and concurrent dispatch.
 								Both AIAgent methods (``_execute_tool_calls_sequential`` and
 								``_execute_tool_calls_concurrent``) live here as module-level
 								functions that take the parent ``AIAgent`` as their first argument.
 								``run_agent`` keeps thin wrappers so existing call sites work; tests
 								that patch ``run_agent._set_interrupt`` are honored because the
 								extracted functions reach back through the ``run_agent`` module via
 								``_ra()`` for that symbol.
 								"""
 								from __future__ import annotations
 								import concurrent.futures
 								import json
 								import logging
 								import os
 								import random
 								import threading
 								import time
-												feat(observability): observer-grade telemetry hooks + NeMo-Relay plugin

Adds backend-neutral observer hooks for plugins: session, turn, API
request, tool, approval, and subagent lifecycle events with stable
correlation IDs (session_id, task_id, turn_id, api_request_id,
tool_call_id, parent/child subagent ids). Extends VALID_HOOKS with
api_request_error and subagent_start.

Hot path is zero-cost when no plugin subscribes: has_hook()/presence
checks gate all payload construction, request payloads are returned
by reference when no middleware rewrites, and the sanitized response
payload no longer embeds raw response objects.

Bundles the optional NeMo-Relay observability plugin
(plugins/observability/nemo_relay) as an in-repo consumer of the new
hooks, peer to the existing langfuse plugin. Fails open when the
optional nemo-relay package is not installed.

Authored-by: Bryan Bednarski <bbednarski@nvidia.com>
Salvaged from #29722 onto current main.

											
										
										
											2026-06-03 17:44:13 +05:30
+								from typing import Any, Optional
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working; _set_interrupt is routed the same way.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
 								from agent.display import (
 								    KawaiiSpinner,
 								    build_tool_preview as _build_tool_preview,
 								    get_cute_tool_message as _get_cute_tool_message_impl,
 								    get_tool_emoji as _get_tool_emoji,
 								    _detect_tool_failure,
 								)
 								from agent.tool_guardrails import ToolGuardrailDecision
 								from agent.tool_dispatch_helpers import (
 								    _is_destructive_command,
 								    _is_multimodal_tool_result,
 								    _multimodal_text_summary,
 								    _append_subdir_hint_to_multimodal,
-												fix(agent): set tool_name on tool-result messages at construction time

Introduces make_tool_result_message() in tool_dispatch_helpers.py as the
single place where tool-result message dicts are built. All six construction
sites in tool_executor.py, agent_runtime_helpers.py, and mini_swe_runner.py
now use it, so tool_name is set in memory from the moment a message is
created rather than relying on fallback logic in the flush paths.

Fixes blank tool_name in both state.db and JSON session logs.

Adds tests.

											
										
										
											2026-05-19 20:24:30 +01:00
+								    make_tool_result_message,
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working; _set_interrupt is routed the same way.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								)
 								from tools.terminal_tool import (
 								    get_active_env,
 								)
-												fix(code-exec): propagate agent-turn context into tool worker threads

Worker threads that dispatch Hermes tools started with an empty contextvars.Context and no thread-local approval/sudo callbacks. Add tools/thread_context.propagate_context_to_thread factoring that capture/install/clear lifecycle (mirrors the GHSA-qg5c-hvr5-hjgr pattern), and refactor agent/tool_executor onto it so the security-critical logic lives in one audited place. Update the contextvar-propagation source guard for the new call shape.

Refs #33057

											
										
										
											2026-05-28 17:47:09 -04:00
+								from tools.thread_context import propagate_context_to_thread
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working; _set_interrupt is routed the same way.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								from tools.tool_result_storage import (
 								    maybe_persist_tool_result,
 								    enforce_turn_budget,
 								)
 								# Module-level logger, named after this module via ``__name__``.
 								logger = logging.getLogger(__name__)
 								# Maximum number of concurrent worker threads for parallel tool execution.
 								# Mirrors the constant in ``run_agent`` for tests/imports that look here.
 								_MAX_TOOL_WORKERS = 8
 								def _ra():
 								    """Lazy reference to ``run_agent`` so patches like ``run_agent._set_interrupt`` work."""
 								    import run_agent
 								    return run_agent
-												feat(observability): observer-grade telemetry hooks + NeMo-Relay plugin

Adds backend-neutral observer hooks for plugins: session, turn, API
request, tool, approval, and subagent lifecycle events with stable
correlation IDs (session_id, task_id, turn_id, api_request_id,
tool_call_id, parent/child subagent ids). Extends VALID_HOOKS with
api_request_error and subagent_start.

Hot path is zero-cost when no plugin subscribes: has_hook()/presence
checks gate all payload construction, request payloads are returned
by reference when no middleware rewrites, and the sanitized response
payload no longer embeds raw response objects.

Bundles the optional NeMo-Relay observability plugin
(plugins/observability/nemo_relay) as an in-repo consumer of the new
hooks, peer to the existing langfuse plugin. Fails open when the
optional nemo-relay package is not installed.

Authored-by: Bryan Bednarski <bbednarski@nvidia.com>
Salvaged from #29722 onto current main.

											
										
										
											2026-06-03 17:44:13 +05:30
+								def _emit_terminal_post_tool_call(
 								    agent,
 								    *,
 								    function_name: str,
 								    function_args: dict,
 								    result: Any,
 								    effective_task_id: str,
 								    tool_call_id: str,
 								    duration_ms: int = 0,
 								    status: str | None = None,
 								    error_type: str | None = None,
 								    error_message: str | None = None,
 								) -> None:
 								    try:
-												perf(observability): gate tool-hook emit on has_hook; slim per-tool footprint

The salvaged observer contract gated the API-request hot path on has_hook()
but left the per-tool emit ungated: every tool call ran result-field
derivation + payload dict build + invoke_hook dispatch even with zero
plugins registered.

- _emit_post_tool_call_hook now short-circuits on has_hook("post_tool_call")
  and derives status/error fields lazily (after the gate, only when a
  listener will consume them). status defaults to None -> derived; explicit
  blocked/cancelled callers still pass status through.
- transform_tool_result emit (pre-existing hook) likewise gated on
  has_hook(); skips _tool_result_observer_fields when no listener.
- Removed the now-redundant _tool_result_observer_fields pre-computation at
  the three ok-path call sites (model_tools, agent_runtime_helpers,
  tool_executor) — the helper derives them, so the no-listener path costs
  one dict lookup and the call sites shrink.
- Tests: stub has_hook=True where payload correctness is asserted; add a
  no-listener regression proving post_tool_call/transform_tool_result emit
  is skipped when nothing is registered.

											
										
										
											2026-06-03 06:05:35 -07:00
+								        from model_tools import _emit_post_tool_call_hook
-												feat(observability): observer-grade telemetry hooks + NeMo-Relay plugin

Adds backend-neutral observer hooks for plugins: session, turn, API
request, tool, approval, and subagent lifecycle events with stable
correlation IDs (session_id, task_id, turn_id, api_request_id,
tool_call_id, parent/child subagent ids). Extends VALID_HOOKS with
api_request_error and subagent_start.

Hot path is zero-cost when no plugin subscribes: has_hook()/presence
checks gate all payload construction, request payloads are returned
by reference when no middleware rewrites, and the sanitized response
payload no longer embeds raw response objects.

Bundles the optional NeMo-Relay observability plugin
(plugins/observability/nemo_relay) as an in-repo consumer of the new
hooks, peer to the existing langfuse plugin. Fails open when the
optional nemo-relay package is not installed.

Authored-by: Bryan Bednarski <bbednarski@nvidia.com>
Salvaged from #29722 onto current main.

											
										
										
											2026-06-03 17:44:13 +05:30
+								        _emit_post_tool_call_hook(
 								            function_name=function_name,
 								            function_args=function_args,
 								            result=result,
 								            task_id=effective_task_id or "",
 								            session_id=getattr(agent, "session_id", "") or "",
 								            tool_call_id=tool_call_id or "",
 								            turn_id=getattr(agent, "_current_turn_id", "") or "",
 								            api_request_id=getattr(agent, "_current_api_request_id", "") or "",
 								            duration_ms=duration_ms,
 								            status=status,
 								            error_type=error_type,
 								            error_message=error_message,
 								        )
 								    except Exception:
 								        pass
 								def _cancelled_tool_result(reason: str = "user interrupt") -> str:
 								    return json.dumps(
 								        {
 								            "error": f"Tool execution cancelled by {reason}",
 								            "status": "cancelled",
 								        },
 								        ensure_ascii=False,
 								    )
 								def _emit_cancelled_terminal_post_tool_call(
 								    agent,
 								    *,
 								    function_name: str,
 								    function_args: dict,
 								    effective_task_id: str,
 								    tool_call_id: str,
 								    start_time: float,
 								    reason: str = "user interrupt",
 								    error_type: str = "keyboard_interrupt",
 								) -> str:
 								    """Report a cancelled tool call to the post-tool-call hook and return its result.
 								    Builds the cancelled-result JSON for ``reason``, forwards it through
 								    ``_emit_terminal_post_tool_call`` with ``status="cancelled"`` and the
 								    milliseconds elapsed between ``start_time`` and ``time.time()``, then
 								    returns the same JSON string to the caller.
 								    """
 								    cancelled_payload = _cancelled_tool_result(reason)
 								    # start_time is compared against time.time(), so it must come from that clock.
 								    elapsed_ms = int((time.time() - start_time) * 1000)
 								    hook_fields = {
 								        "function_name": function_name,
 								        "function_args": function_args,
 								        "result": cancelled_payload,
 								        "effective_task_id": effective_task_id,
 								        "tool_call_id": tool_call_id,
 								        "duration_ms": elapsed_ms,
 								        "status": "cancelled",
 								        "error_type": error_type,
 								        "error_message": f"Tool execution cancelled by {reason}",
 								    }
 								    _emit_terminal_post_tool_call(agent, **hook_fields)
 								    return cancelled_payload
-												fix(tool-search): scope bridge catalog + dispatch to the session's toolsets

Tool Search read its catalog from the global registry (get_tool_definitions
with no toolset scope = 'start with everything'), so a restricted-toolset
session — subagent, kanban worker, curated gateway session — could:

  1. tool_search the entire process registry, not just its granted tools, and
  2. tool_call any registered plugin/MCP tool it was never given, because
     registry.dispatch() has no enabled_tools gate for non-execute_code tools.

A scoped session (enabled_toolsets=['mcp-github']) reported total_available=26
and successfully invoked an out-of-scope plugin tool via tool_call.

Fix:
- handle_function_call gains enabled_toolsets/disabled_toolsets; the bridge
  dispatch scopes get_tool_definitions to them (also stops polluting the
  process-global _last_resolved_tool_names with out-of-scope tools, which
  leaked into execute_code's sandbox-tool fallback).
- A defense-in-depth gate rejects any tool_call'd name not in the scoped
  deferrable catalog.
- tool_executor's unwrap (both concurrent + sequential paths) enforces the
  same scope before dispatch, since it unwraps tool_call -> underlying name
  and bypasses the bridge branch. New _tool_search_scoped_names() helper,
  cached per-agent on registry generation + toolset scope.
- New scoped_deferrable_names() helper in tool_search.py shared by both sites.

Tests: 4 new regression tests in TestRegression_ToolsetScoping (scoped
catalog, out-of-scope tool_call rejection, no global pollution, helper).

											
										
										
											2026-05-29 01:21:41 -07:00
+								def _tool_search_scoped_names(agent) -> frozenset:
 								    """Return the deferrable tool names the session may invoke via tool_call.
 								    The Tool Search unwrap dispatches the underlying tool directly, bypassing
 								    the bridge branch (and its scope check) in
 								    ``model_tools.handle_function_call``. To keep a restricted-toolset session
 								    (subagent, kanban worker, curated gateway session) from reaching tools it
 								    was never granted, the unwrap validates the underlying name against this
 								    set: the deferrable subset of the session's own enabled/disabled toolset
 								    scope.
 								    Result is cached on the agent and refreshed when the tool registry's
 								    generation changes (e.g. an MCP server reconnects), so the common case is
 								    a dict lookup, not a full tool-defs rebuild on every tool call.
 								    """
 								    try:
 								        import model_tools
 								        from tools import tool_search as _ts
 								        from tools.registry import registry as _registry
 								    except Exception:
 								        return frozenset()
 								    enabled = getattr(agent, "enabled_toolsets", None)
 								    disabled = getattr(agent, "disabled_toolsets", None)
 								    cache_key = (
 								        getattr(_registry, "_generation", 0),
 								        frozenset(enabled) if enabled is not None else None,
 								        frozenset(disabled) if disabled is not None else None,
 								    )
 								    cached = getattr(agent, "_tool_search_scope_cache", None)
 								    if cached is not None and cached[0] == cache_key:
 								        return cached[1]
 								    try:
 								        scoped_defs = model_tools.get_tool_definitions(
 								            enabled_toolsets=enabled,
 								            disabled_toolsets=disabled,
 								            quiet_mode=True,
 								            skip_tool_search_assembly=True,
 								        ) or []
 								        names = _ts.scoped_deferrable_names(scoped_defs)
 								    except Exception:
 								        names = frozenset()
 								    try:
 								        agent._tool_search_scope_cache = (cache_key, names)
 								    except Exception:
 								        pass
 								    return names
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working; _set_interrupt is routed the same way.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
 								    """Execute multiple tool calls concurrently using a thread pool.
 								    Results are collected in the original tool-call order and appended to
 								    messages so the API sees them in the expected sequence.
 								    """
 								    tool_calls = assistant_message.tool_calls
 								    num_tools = len(tool_calls)
 								    # ── Pre-flight: interrupt check ──────────────────────────────────
 								    if agent._interrupt_requested:
 								        print(f"{agent.log_prefix}⚡ Interrupt: skipping {num_tools} tool call(s)")
 								        for tc in tool_calls:
-												fix(agent): set tool_name on tool-result messages at construction time

Introduces make_tool_result_message() in tool_dispatch_helpers.py as the
single place where tool-result message dicts are built. All six construction
sites in tool_executor.py, agent_runtime_helpers.py, and mini_swe_runner.py
now use it, so tool_name is set in memory from the moment a message is
created rather than relying on fallback logic in the flush paths.

Fixes blank tool_name in both state.db and JSON session logs.

Adds tests.

											
										
										
											2026-05-19 20:24:30 +01:00
+								            messages.append(make_tool_result_message(
 								                tc.function.name,
 								                f"[Tool execution cancelled — {tc.function.name} was skipped due to user interrupt]",
 								                tc.id,
 								            ))
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working; _set_interrupt is routed the same way.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								        return
 								    # ── Parse args + pre-execution bookkeeping ───────────────────────
 								    parsed_calls = []  # list of (tool_call, function_name, function_args)
 								    for tool_call in tool_calls:
 								        function_name = tool_call.function.name
 								        # Reset nudge counters
 								        if function_name == "memory":
 								            agent._turns_since_memory = 0
 								        elif function_name == "skill_manage":
 								            agent._iters_since_skill = 0
 								        try:
 								            function_args = json.loads(tool_call.function.arguments)
 								        except json.JSONDecodeError:
 								            function_args = {}
 								        if not isinstance(function_args, dict):
 								            function_args = {}
-												feat(tools): progressive tool disclosure for MCP and plugin tools

Adds Tool Search, a structured-tools progressive-disclosure layer that
replaces MCP and non-core plugin tools in the model-visible tools array
with three bridge tools (tool_search / tool_describe / tool_call) when
the deferrable surface would consume more than a configurable percentage
of the active model's context window. Core Hermes tools are never deferred.

Default mode is 'auto' with a 10% context threshold, so small toolsets
pay no overhead. Set tools.tool_search.enabled to 'on' to force or 'off'
to disable.

Design carefully reflects the OpenClaw production failure modes
documented in the openclaw-tool-search-report:

  - Core tools never defer (toolsets._HERMES_CORE_TOOLS). Addresses the
    'tools silently missing from isolated cron turns' regression class
    (openclaw#84141) by construction: there is no code path that can
    drop a core tool.
  - Catalog is stateless across turns — rebuilt from the live tool-defs
    list on every assembly. No session-keyed Map that can drift out of
    sync with the registry.
  - tool_call unwraps the bridge call before any hook fires, so plugin
    pre/post hooks, guardrails, approval flows, and the activity feed
    all see the underlying tool name, not the bridge (addresses
    openclaw#85588 and the verbose-mode complaint on openclaw#79823).
  - The unwrap happens in both the parallel and sequential paths of
    agent/tool_executor.py and also in handle_function_call, so direct
    callers (sandboxed code, eval harnesses) are covered too.
  - Bridge tools cannot invoke each other (recursion guard) and cannot
    invoke core tools (those must be called directly).
  - Tools mode only — no JS-sandbox code-mode. Keeps the surface small.
  - Token estimation via cheap char/4 heuristic; precision isn't needed
    for the threshold decision.

Files:
  - tools/tool_search.py — new module (BM25 retrieval, classification,
    threshold gate, bridge dispatch, unwrap helper).
  - tests/tools/test_tool_search.py — 35 tests including the OpenClaw
    #84141 regression guard.
  - model_tools.py — wires assembly into _compute_tool_definitions as the
    final step, adds skip_tool_search_assembly kwarg so the bridge can
    see the real catalog, dispatches the three bridge tools.
  - agent/tool_executor.py — unwraps tool_call in both parallel and
    sequential parsing loops so checkpointing, guardrails, plugin hooks,
    and tool-progress callbacks all observe the underlying tool name.
  - hermes_cli/config.py — DEFAULT_CONFIG['tools']['tool_search'] block.
  - website/docs/user-guide/features/tool-search.md — user docs.

Validation:
  - 35/35 new tests pass.
  - Existing tool/registry/model_tools/config/coercion/executor tests
    (82 + 74 + small adjacents) green.
  - Live E2E: 20 fake MCP tools registered, get_tool_definitions returns
    3 bridges, tool_search returns top 3 hits, tool_describe returns
    full schema, tool_call dispatches to the real underlying handler
    and the underlying result is what the model sees.
  - Reserved-name recursion guard verified live.
  - Core-tool refusal via tool_call verified live.

											
										
										
											2026-05-23 15:22:01 -07:00
+								        # ── Tool Search unwrap ────────────────────────────────────────
 								        # When the model invokes the tool_call bridge, peel it open so
 								        # every downstream check (checkpointing, guardrails, plugin
 								        # pre-tool-call hooks, the display/activity feed, the post-call
 								        # callback) sees the underlying tool — not the bridge. This is
 								        # the OpenClaw lesson: hooks must observe the real tool name.
 								        #
 								        # The original tool_call entry on ``tool_call.function`` is left
 								        # untouched so the conversation transcript and the matching
 								        # tool_call_id are preserved exactly as the model emitted them.
-												fix(tool-search): scope bridge catalog + dispatch to the session's toolsets

Tool Search read its catalog from the global registry (get_tool_definitions
with no toolset scope = 'start with everything'), so a restricted-toolset
session — subagent, kanban worker, curated gateway session — could:

  1. tool_search the entire process registry, not just its granted tools, and
  2. tool_call any registered plugin/MCP tool it was never given, because
     registry.dispatch() has no enabled_tools gate for non-execute_code tools.

A scoped session (enabled_toolsets=['mcp-github']) reported total_available=26
and successfully invoked an out-of-scope plugin tool via tool_call.

Fix:
- handle_function_call gains enabled_toolsets/disabled_toolsets; the bridge
  dispatch scopes get_tool_definitions to them (also stops polluting the
  process-global _last_resolved_tool_names with out-of-scope tools, which
  leaked into execute_code's sandbox-tool fallback).
- A defense-in-depth gate rejects any tool_call'd name not in the scoped
  deferrable catalog.
- tool_executor's unwrap (both concurrent + sequential paths) enforces the
  same scope before dispatch, since it unwraps tool_call -> underlying name
  and bypasses the bridge branch. New _tool_search_scoped_names() helper,
  cached per-agent on registry generation + toolset scope.
- New scoped_deferrable_names() helper in tool_search.py shared by both sites.

Tests: 4 new regression tests in TestRegression_ToolsetScoping (scoped
catalog, out-of-scope tool_call rejection, no global pollution, helper).

											
										
										
											2026-05-29 01:21:41 -07:00
+								        #
 								        # Scope gate: the unwrap dispatches the underlying tool directly
 								        # (bypassing the bridge branch in handle_function_call and its
 								        # scope check), so we enforce session toolset scope HERE. A tool
 								        # the session was not granted is rejected before any checkpoint,
 								        # hook, or dispatch fires.
 								        _ts_scope_block = None
-												feat(tools): progressive tool disclosure for MCP and plugin tools

Adds Tool Search, a structured-tools progressive-disclosure layer that
replaces MCP and non-core plugin tools in the model-visible tools array
with three bridge tools (tool_search / tool_describe / tool_call) when
the deferrable surface would consume more than a configurable percentage
of the active model's context window. Core Hermes tools are never deferred.

Default mode is 'auto' with a 10% context threshold, so small toolsets
pay no overhead. Set tools.tool_search.enabled to 'on' to force or 'off'
to disable.

Design carefully reflects the OpenClaw production failure modes
documented in the openclaw-tool-search-report:

  - Core tools never defer (toolsets._HERMES_CORE_TOOLS). Addresses the
    'tools silently missing from isolated cron turns' regression class
    (openclaw#84141) by construction: there is no code path that can
    drop a core tool.
  - Catalog is stateless across turns — rebuilt from the live tool-defs
    list on every assembly. No session-keyed Map that can drift out of
    sync with the registry.
  - tool_call unwraps the bridge call before any hook fires, so plugin
    pre/post hooks, guardrails, approval flows, and the activity feed
    all see the underlying tool name, not the bridge (addresses
    openclaw#85588 and the verbose-mode complaint on openclaw#79823).
  - The unwrap happens in both the parallel and sequential paths of
    agent/tool_executor.py and also in handle_function_call, so direct
    callers (sandboxed code, eval harnesses) are covered too.
  - Bridge tools cannot invoke each other (recursion guard) and cannot
    invoke core tools (those must be called directly).
  - Tools mode only — no JS-sandbox code-mode. Keeps the surface small.
  - Token estimation via cheap char/4 heuristic; precision isn't needed
    for the threshold decision.

Files:
  - tools/tool_search.py — new module (BM25 retrieval, classification,
    threshold gate, bridge dispatch, unwrap helper).
  - tests/tools/test_tool_search.py — 35 tests including the OpenClaw
    #84141 regression guard.
  - model_tools.py — wires assembly into _compute_tool_definitions as the
    final step, adds skip_tool_search_assembly kwarg so the bridge can
    see the real catalog, dispatches the three bridge tools.
  - agent/tool_executor.py — unwraps tool_call in both parallel and
    sequential parsing loops so checkpointing, guardrails, plugin hooks,
    and tool-progress callbacks all observe the underlying tool name.
  - hermes_cli/config.py — DEFAULT_CONFIG['tools']['tool_search'] block.
  - website/docs/user-guide/features/tool-search.md — user docs.

Validation:
  - 35/35 new tests pass.
  - Existing tool/registry/model_tools/config/coercion/executor tests
    (82 + 74 + small adjacents) green.
  - Live E2E: 20 fake MCP tools registered, get_tool_definitions returns
    3 bridges, tool_search returns top 3 hits, tool_describe returns
    full schema, tool_call dispatches to the real underlying handler
    and the underlying result is what the model sees.
  - Reserved-name recursion guard verified live.
  - Core-tool refusal via tool_call verified live.

											
										
										
											2026-05-23 15:22:01 -07:00
+								        try:
 								            from tools import tool_search as _ts
 								            if function_name == _ts.TOOL_CALL_NAME:
 								                _underlying, _underlying_args, _err = _ts.resolve_underlying_call(function_args)
 								                if not _err and _underlying:
-												fix(tool-search): scope bridge catalog + dispatch to the session's toolsets

Tool Search read its catalog from the global registry (get_tool_definitions
with no toolset scope = 'start with everything'), so a restricted-toolset
session — subagent, kanban worker, curated gateway session — could:

  1. tool_search the entire process registry, not just its granted tools, and
  2. tool_call any registered plugin/MCP tool it was never given, because
     registry.dispatch() has no enabled_tools gate for non-execute_code tools.

A scoped session (enabled_toolsets=['mcp-github']) reported total_available=26
and successfully invoked an out-of-scope plugin tool via tool_call.

Fix:
- handle_function_call gains enabled_toolsets/disabled_toolsets; the bridge
  dispatch scopes get_tool_definitions to them (also stops polluting the
  process-global _last_resolved_tool_names with out-of-scope tools, which
  leaked into execute_code's sandbox-tool fallback).
- A defense-in-depth gate rejects any tool_call'd name not in the scoped
  deferrable catalog.
- tool_executor's unwrap (both concurrent + sequential paths) enforces the
  same scope before dispatch, since it unwraps tool_call -> underlying name
  and bypasses the bridge branch. New _tool_search_scoped_names() helper,
  cached per-agent on registry generation + toolset scope.
- New scoped_deferrable_names() helper in tool_search.py shared by both sites.

Tests: 4 new regression tests in TestRegression_ToolsetScoping (scoped
catalog, out-of-scope tool_call rejection, no global pollution, helper).

											
										
										
											2026-05-29 01:21:41 -07:00
+								                    if _underlying in _tool_search_scoped_names(agent):
 								                        function_name = _underlying
 								                        function_args = _underlying_args
 								                    else:
 								                        _ts_scope_block = json.dumps({
 								                            "error": (
 								                                f"'{_underlying}' is not available in this session. "
 								                                "Use tool_search to find tools you can call."
 								                            ),
 								                        }, ensure_ascii=False)
-												feat(tools): progressive tool disclosure for MCP and plugin tools

Adds Tool Search, a structured-tools progressive-disclosure layer that
replaces MCP and non-core plugin tools in the model-visible tools array
with three bridge tools (tool_search / tool_describe / tool_call) when
the deferrable surface would consume more than a configurable percentage
of the active model's context window. Core Hermes tools are never deferred.

Default mode is 'auto' with a 10% context threshold, so small toolsets
pay no overhead. Set tools.tool_search.enabled to 'on' to force or 'off'
to disable.

Design carefully reflects the OpenClaw production failure modes
documented in the openclaw-tool-search-report:

  - Core tools never defer (toolsets._HERMES_CORE_TOOLS). Addresses the
    'tools silently missing from isolated cron turns' regression class
    (openclaw#84141) by construction: there is no code path that can
    drop a core tool.
  - Catalog is stateless across turns — rebuilt from the live tool-defs
    list on every assembly. No session-keyed Map that can drift out of
    sync with the registry.
  - tool_call unwraps the bridge call before any hook fires, so plugin
    pre/post hooks, guardrails, approval flows, and the activity feed
    all see the underlying tool name, not the bridge (addresses
    openclaw#85588 and the verbose-mode complaint on openclaw#79823).
  - The unwrap happens in both the parallel and sequential paths of
    agent/tool_executor.py and also in handle_function_call, so direct
    callers (sandboxed code, eval harnesses) are covered too.
  - Bridge tools cannot invoke each other (recursion guard) and cannot
    invoke core tools (those must be called directly).
  - Tools mode only — no JS-sandbox code-mode. Keeps the surface small.
  - Token estimation via cheap char/4 heuristic; precision isn't needed
    for the threshold decision.

Files:
  - tools/tool_search.py — new module (BM25 retrieval, classification,
    threshold gate, bridge dispatch, unwrap helper).
  - tests/tools/test_tool_search.py — 35 tests including the OpenClaw
    #84141 regression guard.
  - model_tools.py — wires assembly into _compute_tool_definitions as the
    final step, adds skip_tool_search_assembly kwarg so the bridge can
    see the real catalog, dispatches the three bridge tools.
  - agent/tool_executor.py — unwraps tool_call in both parallel and
    sequential parsing loops so checkpointing, guardrails, plugin hooks,
    and tool-progress callbacks all observe the underlying tool name.
  - hermes_cli/config.py — DEFAULT_CONFIG['tools']['tool_search'] block.
  - website/docs/user-guide/features/tool-search.md — user docs.

Validation:
  - 35/35 new tests pass.
  - Existing tool/registry/model_tools/config/coercion/executor tests
    (82 + 74 + small adjacents) green.
  - Live E2E: 20 fake MCP tools registered, get_tool_definitions returns
    3 bridges, tool_search returns top 3 hits, tool_describe returns
    full schema, tool_call dispatches to the real underlying handler
    and the underlying result is what the model sees.
  - Reserved-name recursion guard verified live.
  - Core-tool refusal via tool_call verified live.

											
										
										
											2026-05-23 15:22:01 -07:00
+								        except Exception:
 								            pass
-												fix(run_agent): gate concurrent checkpoint preflight on block_result (fixes #34827)

In the concurrent tool-execution path, checkpoint preflight (write_file,
patch, destructive terminal) fired BEFORE plugin guardrail block_result
was computed. A blocked write_file could still dirty checkpoint state
(doc_modified_this_turn, _last_write_file_call_id, turn_counter).

Move checkpoint preflight to AFTER block_result computation, gated on
`if block_result is None:` — matching the invariant the sequential path
already enforces.

											
										
										
											2026-05-29 15:40:01 -05:00
+								        # ── Block evaluation (BEFORE checkpoint preflight) ───────────
 								        # We must know whether the tool will execute before touching
 								        # checkpoint state (dedup slot, real snapshots).
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working; _set_interrupt is routed the same way.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								        block_result = None
 								        blocked_by_guardrail = False
-												fix(tool-search): scope bridge catalog + dispatch to the session's toolsets

Tool Search read its catalog from the global registry (get_tool_definitions
with no toolset scope = 'start with everything'), so a restricted-toolset
session — subagent, kanban worker, curated gateway session — could:

  1. tool_search the entire process registry, not just its granted tools, and
  2. tool_call any registered plugin/MCP tool it was never given, because
     registry.dispatch() has no enabled_tools gate for non-execute_code tools.

A scoped session (enabled_toolsets=['mcp-github']) reported total_available=26
and successfully invoked an out-of-scope plugin tool via tool_call.

Fix:
- handle_function_call gains enabled_toolsets/disabled_toolsets; the bridge
  dispatch scopes get_tool_definitions to them (also stops polluting the
  process-global _last_resolved_tool_names with out-of-scope tools, which
  leaked into execute_code's sandbox-tool fallback).
- A defense-in-depth gate rejects any tool_call'd name not in the scoped
  deferrable catalog.
- tool_executor's unwrap (both concurrent + sequential paths) enforces the
  same scope before dispatch, since it unwraps tool_call -> underlying name
  and bypasses the bridge branch. New _tool_search_scoped_names() helper,
  cached per-agent on registry generation + toolset scope.
- New scoped_deferrable_names() helper in tool_search.py shared by both sites.

Tests: 4 new regression tests in TestRegression_ToolsetScoping (scoped
catalog, out-of-scope tool_call rejection, no global pollution, helper).

											
										
										
											2026-05-29 01:21:41 -07:00
+								        if _ts_scope_block is not None:
 								            # Out-of-scope tool_call: reject before hooks/guardrails/dispatch.
 								            block_result = _ts_scope_block
-												feat(observability): observer-grade telemetry hooks + NeMo-Relay plugin

Adds backend-neutral observer hooks for plugins: session, turn, API
request, tool, approval, and subagent lifecycle events with stable
correlation IDs (session_id, task_id, turn_id, api_request_id,
tool_call_id, parent/child subagent ids). Extends VALID_HOOKS with
api_request_error and subagent_start.

Hot path is zero-cost when no plugin subscribes: has_hook()/presence
checks gate all payload construction, request payloads are returned
by reference when no middleware rewrites, and the sanitized response
payload no longer embeds raw response objects.

Bundles the optional NeMo-Relay observability plugin
(plugins/observability/nemo_relay) as an in-repo consumer of the new
hooks, peer to the existing langfuse plugin. Fails open when the
optional nemo-relay package is not installed.

Authored-by: Bryan Bednarski <bbednarski@nvidia.com>
Salvaged from #29722 onto current main.

											
										
										
											2026-06-03 17:44:13 +05:30
+								            _emit_terminal_post_tool_call(
 								                agent,
 								                function_name=function_name,
 								                function_args=function_args,
 								                result=block_result,
 								                effective_task_id=effective_task_id,
 								                tool_call_id=getattr(tool_call, "id", "") or "",
 								                status="blocked",
 								                error_type="tool_scope_block",
 								                error_message=_ts_scope_block,
 								            )
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working; _set_interrupt is routed the same way.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								        else:
-												fix(tool-search): scope bridge catalog + dispatch to the session's toolsets

Tool Search read its catalog from the global registry (get_tool_definitions
with no toolset scope = 'start with everything'), so a restricted-toolset
session — subagent, kanban worker, curated gateway session — could:

  1. tool_search the entire process registry, not just its granted tools, and
  2. tool_call any registered plugin/MCP tool it was never given, because
     registry.dispatch() has no enabled_tools gate for non-execute_code tools.

A scoped session (enabled_toolsets=['mcp-github']) reported total_available=26
and successfully invoked an out-of-scope plugin tool via tool_call.

Fix:
- handle_function_call gains enabled_toolsets/disabled_toolsets; the bridge
  dispatch scopes get_tool_definitions to them (also stops polluting the
  process-global _last_resolved_tool_names with out-of-scope tools, which
  leaked into execute_code's sandbox-tool fallback).
- A defense-in-depth gate rejects any tool_call'd name not in the scoped
  deferrable catalog.
- tool_executor's unwrap (both concurrent + sequential paths) enforces the
  same scope before dispatch, since it unwraps tool_call -> underlying name
  and bypasses the bridge branch. New _tool_search_scoped_names() helper,
  cached per-agent on registry generation + toolset scope.
- New scoped_deferrable_names() helper in tool_search.py shared by both sites.

Tests: 4 new regression tests in TestRegression_ToolsetScoping (scoped
catalog, out-of-scope tool_call rejection, no global pollution, helper).

											
										
										
											2026-05-29 01:21:41 -07:00
+								            try:
 								                from hermes_cli.plugins import get_pre_tool_call_block_message
 								                block_message = get_pre_tool_call_block_message(
-												feat(observability): observer-grade telemetry hooks + NeMo-Relay plugin

Adds backend-neutral observer hooks for plugins: session, turn, API
request, tool, approval, and subagent lifecycle events with stable
correlation IDs (session_id, task_id, turn_id, api_request_id,
tool_call_id, parent/child subagent ids). Extends VALID_HOOKS with
api_request_error and subagent_start.

Hot path is zero-cost when no plugin subscribes: has_hook()/presence
checks gate all payload construction, request payloads are returned
by reference when no middleware rewrites, and the sanitized response
payload no longer embeds raw response objects.

Bundles the optional NeMo-Relay observability plugin
(plugins/observability/nemo_relay) as an in-repo consumer of the new
hooks, peer to the existing langfuse plugin. Fails open when the
optional nemo-relay package is not installed.

Authored-by: Bryan Bednarski <bbednarski@nvidia.com>
Salvaged from #29722 onto current main.

											
										
										
											2026-06-03 17:44:13 +05:30
+								                    function_name,
 								                    function_args,
 								                    task_id=effective_task_id or "",
 								                    session_id=getattr(agent, "session_id", "") or "",
 								                    tool_call_id=getattr(tool_call, "id", "") or "",
 								                    turn_id=getattr(agent, "_current_turn_id", "") or "",
 								                    api_request_id=getattr(agent, "_current_api_request_id", "") or "",
-												fix(tool-search): scope bridge catalog + dispatch to the session's toolsets

Tool Search read its catalog from the global registry (get_tool_definitions
with no toolset scope = 'start with everything'), so a restricted-toolset
session — subagent, kanban worker, curated gateway session — could:

  1. tool_search the entire process registry, not just its granted tools, and
  2. tool_call any registered plugin/MCP tool it was never given, because
     registry.dispatch() has no enabled_tools gate for non-execute_code tools.

A scoped session (enabled_toolsets=['mcp-github']) reported total_available=26
and successfully invoked an out-of-scope plugin tool via tool_call.

Fix:
- handle_function_call gains enabled_toolsets/disabled_toolsets; the bridge
  dispatch scopes get_tool_definitions to them (also stops polluting the
  process-global _last_resolved_tool_names with out-of-scope tools, which
  leaked into execute_code's sandbox-tool fallback).
- A defense-in-depth gate rejects any tool_call'd name not in the scoped
  deferrable catalog.
- tool_executor's unwrap (both concurrent + sequential paths) enforces the
  same scope before dispatch, since it unwraps tool_call -> underlying name
  and bypasses the bridge branch. New _tool_search_scoped_names() helper,
  cached per-agent on registry generation + toolset scope.
- New scoped_deferrable_names() helper in tool_search.py shared by both sites.

Tests: 4 new regression tests in TestRegression_ToolsetScoping (scoped
catalog, out-of-scope tool_call rejection, no global pollution, helper).

											
										
										
											2026-05-29 01:21:41 -07:00
+								                )
 								            except Exception:
 								                block_message = None
 								            if block_message is not None:
 								                block_result = json.dumps({"error": block_message}, ensure_ascii=False)
-												feat(observability): observer-grade telemetry hooks + NeMo-Relay plugin

Adds backend-neutral observer hooks for plugins: session, turn, API
request, tool, approval, and subagent lifecycle events with stable
correlation IDs (session_id, task_id, turn_id, api_request_id,
tool_call_id, parent/child subagent ids). Extends VALID_HOOKS with
api_request_error and subagent_start.

Hot path is zero-cost when no plugin subscribes: has_hook()/presence
checks gate all payload construction, request payloads are returned
by reference when no middleware rewrites, and the sanitized response
payload no longer embeds raw response objects.

Bundles the optional NeMo-Relay observability plugin
(plugins/observability/nemo_relay) as an in-repo consumer of the new
hooks, peer to the existing langfuse plugin. Fails open when the
optional nemo-relay package is not installed.

Authored-by: Bryan Bednarski <bbednarski@nvidia.com>
Salvaged from #29722 onto current main.

											
										
										
											2026-06-03 17:44:13 +05:30
+								                _emit_terminal_post_tool_call(
 								                    agent,
 								                    function_name=function_name,
 								                    function_args=function_args,
 								                    result=block_result,
 								                    effective_task_id=effective_task_id,
 								                    tool_call_id=getattr(tool_call, "id", "") or "",
 								                    status="blocked",
 								                    error_type="plugin_block",
 								                    error_message=block_message,
 								                )
-												fix(tool-search): scope bridge catalog + dispatch to the session's toolsets

Tool Search read its catalog from the global registry (get_tool_definitions
with no toolset scope = 'start with everything'), so a restricted-toolset
session — subagent, kanban worker, curated gateway session — could:

  1. tool_search the entire process registry, not just its granted tools, and
  2. tool_call any registered plugin/MCP tool it was never given, because
     registry.dispatch() has no enabled_tools gate for non-execute_code tools.

A scoped session (enabled_toolsets=['mcp-github']) reported total_available=26
and successfully invoked an out-of-scope plugin tool via tool_call.

Fix:
- handle_function_call gains enabled_toolsets/disabled_toolsets; the bridge
  dispatch scopes get_tool_definitions to them (also stops polluting the
  process-global _last_resolved_tool_names with out-of-scope tools, which
  leaked into execute_code's sandbox-tool fallback).
- A defense-in-depth gate rejects any tool_call'd name not in the scoped
  deferrable catalog.
- tool_executor's unwrap (both concurrent + sequential paths) enforces the
  same scope before dispatch, since it unwraps tool_call -> underlying name
  and bypasses the bridge branch. New _tool_search_scoped_names() helper,
  cached per-agent on registry generation + toolset scope.
- New scoped_deferrable_names() helper in tool_search.py shared by both sites.

Tests: 4 new regression tests in TestRegression_ToolsetScoping (scoped
catalog, out-of-scope tool_call rejection, no global pollution, helper).

											
										
										
											2026-05-29 01:21:41 -07:00
+								            else:
 								                guardrail_decision = agent._tool_guardrails.before_call(function_name, function_args)
 								                if not guardrail_decision.allows_execution:
 								                    block_result = agent._guardrail_block_result(guardrail_decision)
 								                    blocked_by_guardrail = True
-												feat(observability): observer-grade telemetry hooks + NeMo-Relay plugin

Adds backend-neutral observer hooks for plugins: session, turn, API
request, tool, approval, and subagent lifecycle events with stable
correlation IDs (session_id, task_id, turn_id, api_request_id,
tool_call_id, parent/child subagent ids). Extends VALID_HOOKS with
api_request_error and subagent_start.

Hot path is zero-cost when no plugin subscribes: has_hook()/presence
checks gate all payload construction, request payloads are returned
by reference when no middleware rewrites, and the sanitized response
payload no longer embeds raw response objects.

Bundles the optional NeMo-Relay observability plugin
(plugins/observability/nemo_relay) as an in-repo consumer of the new
hooks, peer to the existing langfuse plugin. Fails open when the
optional nemo-relay package is not installed.

Authored-by: Bryan Bednarski <bbednarski@nvidia.com>
Salvaged from #29722 onto current main.

											
										
										
											2026-06-03 17:44:13 +05:30
+								                    _emit_terminal_post_tool_call(
 								                        agent,
 								                        function_name=function_name,
 								                        function_args=function_args,
 								                        result=block_result,
 								                        effective_task_id=effective_task_id,
 								                        tool_call_id=getattr(tool_call, "id", "") or "",
 								                        status="blocked",
 								                        error_type="guardrail_block",
 								                        error_message=getattr(guardrail_decision, "message", None) or "Tool blocked by guardrail policy",
 								                    )
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working; _set_interrupt is routed the same way.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
-												fix(run_agent): gate concurrent checkpoint preflight on block_result (fixes #34827)

In the concurrent tool-execution path, checkpoint preflight (write_file,
patch, destructive terminal) fired BEFORE plugin guardrail block_result
was computed. A blocked write_file could still dirty checkpoint state
(doc_modified_this_turn, _last_write_file_call_id, turn_counter).

Move checkpoint preflight to AFTER block_result computation, gated on
`if block_result is None:` — matching the invariant the sequential path
already enforces.

											
										
										
											2026-05-29 15:40:01 -05:00
+								        # ── Checkpoint preflight (only for tools that will execute) ──
 								        if block_result is None:
 								            # Checkpoint for file-mutating tools
 								            if function_name in {"write_file", "patch"} and agent._checkpoint_mgr.enabled:
 								                try:
 								                    file_path = function_args.get("path", "")
 								                    if file_path:
 								                        work_dir = agent._checkpoint_mgr.get_working_dir_for_path(file_path)
 								                        agent._checkpoint_mgr.ensure_checkpoint(work_dir, f"before {function_name}")
 								                except Exception:
 								                    pass
 								            # Checkpoint before destructive terminal commands
 								            if function_name == "terminal" and agent._checkpoint_mgr.enabled:
 								                try:
 								                    cmd = function_args.get("command", "")
 								                    if _is_destructive_command(cmd):
 								                        cwd = function_args.get("workdir") or os.getenv("TERMINAL_CWD", os.getcwd())
 								                        agent._checkpoint_mgr.ensure_checkpoint(
 								                            cwd, f"before terminal: {cmd[:60]}"
 								                        )
 								                except Exception:
 								                    pass
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working; _set_interrupt is routed the same way.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								        parsed_calls.append((tool_call, function_name, function_args, block_result, blocked_by_guardrail))
 								    # ── Logging / callbacks ──────────────────────────────────────────
 								    tool_names_str = ", ".join(name for _, name, _, _, _ in parsed_calls)
 								    if not agent.quiet_mode:
 								        print(f"  ⚡ Concurrent: {num_tools} tool calls — {tool_names_str}")
 								        for i, (tc, name, args, block_result, blocked_by_guardrail) in enumerate(parsed_calls, 1):
 								            args_str = json.dumps(args, ensure_ascii=False)
 								            if agent.verbose_logging:
 								                print(f"  📞 Tool {i}: {name}({list(args.keys())})")
 								                print(agent._wrap_verbose("Args: ", json.dumps(args, indent=2, ensure_ascii=False)))
 								            else:
 								                args_preview = args_str[:agent.log_prefix_chars] + "..." if len(args_str) > agent.log_prefix_chars else args_str
 								                print(f"  📞 Tool {i}: {name}({list(args.keys())}) - {args_preview}")
 								    for tc, name, args, block_result, blocked_by_guardrail in parsed_calls:
 								        if block_result is not None:
 								            continue
 								        if agent.tool_progress_callback:
 								            try:
 								                preview = _build_tool_preview(name, args)
 								                agent.tool_progress_callback("tool.started", name, preview, args)
 								            except Exception as cb_err:
 								                logging.debug(f"Tool progress callback error: {cb_err}")
 								    for tc, name, args, block_result, blocked_by_guardrail in parsed_calls:
 								        if block_result is not None:
 								            continue
 								        if agent.tool_start_callback:
 								            try:
 								                agent.tool_start_callback(tc.id, name, args)
 								            except Exception as cb_err:
 								                logging.debug(f"Tool start callback error: {cb_err}")
 								    # ── Concurrent execution ─────────────────────────────────────────
 								    # Each slot holds (function_name, function_args, function_result, duration, error_flag, blocked_flag)
 								    results = [None] * num_tools
 								    for i, (tc, name, args, block_result, blocked_by_guardrail) in enumerate(parsed_calls):
 								        if block_result is not None:
 								            results[i] = (name, args, block_result, 0.0, True, True)
 								    # Touch activity before launching workers so the gateway knows
 								    # we're executing tools (not stuck).
 								    agent._current_tool = tool_names_str
 								    agent._touch_activity(f"executing {num_tools} tools concurrently: {tool_names_str}")
 								    def _run_tool(index, tool_call, function_name, function_args):
 								        """Worker function executed in a thread."""
 								        # Register this worker tid so the agent can fan out an interrupt
 								        # to it — see AIAgent.interrupt().  Must happen first thing, and
 								        # must be paired with discard + clear in the finally block.
 								        _worker_tid = threading.current_thread().ident
 								        with agent._tool_worker_threads_lock:
 								            agent._tool_worker_threads.add(_worker_tid)
 								        # Race: if the agent was interrupted between fan-out (which
 								        # snapshotted an empty/earlier set) and our registration, apply
 								        # the interrupt to our own tid now so is_interrupted() inside
 								        # the tool returns True on the next poll.
 								        if agent._interrupt_requested:
 								            try:
 								                _ra()._set_interrupt(True, _worker_tid)
 								            except Exception:
 								                pass
 								        # Set the activity callback on THIS worker thread so
 								        # _wait_for_process (terminal commands) can fire heartbeats.
 								        # The callback is thread-local; the main thread's callback
 								        # is invisible to worker threads.
 								        try:
 								            from tools.environments.base import set_activity_callback
 								            set_activity_callback(agent._touch_activity)
 								        except Exception:
 								            pass
-												fix(code-exec): propagate agent-turn context into tool worker threads

Worker threads that dispatch Hermes tools started with an empty contextvars.Context and no thread-local approval/sudo callbacks. Add tools/thread_context.propagate_context_to_thread, which factors out that capture/install/clear lifecycle (mirrors the GHSA-qg5c-hvr5-hjgr pattern), and refactor agent/tool_executor onto it so the security-critical logic lives in one audited place. Update the contextvar-propagation source guard for the new call shape.

Refs #33057

											
										
										
											2026-05-28 17:47:09 -04:00
+								        # Approval/sudo callbacks (thread-local) and the agent turn's
 								        # ContextVars are propagated by propagate_context_to_thread() at the
 								        # submit site below (GHSA-qg5c-hvr5-hjgr, #13617).
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working. _set_interrupt likewise.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								        start = time.time()
 								        try:
-												fix(tools): wrap _run_tool cleanup in finally to prevent interrupt state leak

When _invoke_tool raises a BaseException (CancelledError, KeyboardInterrupt),
the cleanup code at the end of _run_tool was bypassed because it sat outside
the except block (which only catches Exception).  ThreadPoolExecutor recycles
thread IDs, so the leaked tid in _interrupted_threads poisons the next tool
scheduled on that thread — it instantly aborts with 'Interrupted'.

Move the discard + _set_interrupt(False) into a finally block so cleanup
runs regardless of how the worker exits.

Fixes #35309

											
										
										
											2026-05-30 19:45:18 +08:00
+								            try:
 								                result = agent._invoke_tool(
 								                    function_name,
 								                    function_args,
 								                    effective_task_id,
 								                    tool_call.id,
 								                    messages=messages,
 								                    pre_tool_block_checked=True,
 								                )
-												feat(observability): observer-grade telemetry hooks + NeMo-Relay plugin

Adds backend-neutral observer hooks for plugins: session, turn, API
request, tool, approval, and subagent lifecycle events with stable
correlation IDs (session_id, task_id, turn_id, api_request_id,
tool_call_id, parent/child subagent ids). Extends VALID_HOOKS with
api_request_error and subagent_start.

Hot path is zero-cost when no plugin subscribes: has_hook()/presence
checks gate all payload construction, request payloads are returned
by reference when no middleware rewrites, and the sanitized response
payload no longer embeds raw response objects.

Bundles the optional NeMo-Relay observability plugin
(plugins/observability/nemo_relay) as an in-repo consumer of the new
hooks, peer to the existing langfuse plugin. Fails open when the
optional nemo-relay package is not installed.

Authored-by: Bryan Bednarski <bbednarski@nvidia.com>
Salvaged from #29722 onto current main.

											
										
										
											2026-06-03 17:44:13 +05:30
+								            except KeyboardInterrupt:
 								                try:
 								                    agent.interrupt("keyboard interrupt")
 								                except Exception:
 								                    pass
 								                result = _emit_cancelled_terminal_post_tool_call(
 								                    agent,
 								                    function_name=function_name,
 								                    function_args=function_args,
 								                    effective_task_id=effective_task_id,
 								                    tool_call_id=getattr(tool_call, "id", "") or "",
 								                    start_time=start,
 								                )
 								                duration = time.time() - start
 								                logger.info("tool %s cancelled (%.2fs)", function_name, duration)
 								                results[index] = (function_name, function_args, result, duration, True, False)
 								                return
-												fix(tools): wrap _run_tool cleanup in finally to prevent interrupt state leak

When _invoke_tool raises a BaseException (CancelledError, KeyboardInterrupt),
the cleanup code at the end of _run_tool was bypassed because it sat outside
the except block (which only catches Exception).  ThreadPoolExecutor recycles
thread IDs, so the leaked tid in _interrupted_threads poisons the next tool
scheduled on that thread — it instantly aborts with 'Interrupted'.

Move the discard + _set_interrupt(False) into a finally block so cleanup
runs regardless of how the worker exits.

Fixes #35309

											
										
										
											2026-05-30 19:45:18 +08:00
+								            except Exception as tool_error:
 								                result = f"Error executing tool '{function_name}': {tool_error}"
 								                logger.error("_invoke_tool raised for %s: %s", function_name, tool_error, exc_info=True)
 								            duration = time.time() - start
 								            is_error, _ = _detect_tool_failure(function_name, result)
 								            if is_error:
 								                logger.info("tool %s failed (%.2fs): %s", function_name, duration, result[:200])
 								            else:
 								                logger.info("tool %s completed (%.2fs, %d chars)", function_name, duration, len(result))
 								            results[index] = (function_name, function_args, result, duration, is_error, False)
 								        finally:
 								            # Tear down worker-tid tracking.  Clear any interrupt bit we may
 								            # have set so the next task scheduled onto this recycled tid
 								            # starts with a clean slate.  This MUST be in a finally block
 								            # because BaseException subclasses (CancelledError, KeyboardInterrupt)
 								            # bypass ``except Exception`` and would otherwise leak the tid
 								            # into _interrupted_threads, poisoning the recycled thread.
 								            with agent._tool_worker_threads_lock:
 								                agent._tool_worker_threads.discard(_worker_tid)
 								            try:
 								                _ra()._set_interrupt(False, _worker_tid)
 								            except Exception:
 								                pass
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working. _set_interrupt likewise.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
 								    # Start spinner for CLI mode (skip when TUI handles tool progress)
 								    spinner = None
 								    if agent._should_emit_quiet_tool_messages() and agent._should_start_quiet_spinner():
 								        face = random.choice(KawaiiSpinner.get_waiting_faces())
 								        spinner = KawaiiSpinner(f"{face} ⚡ running {num_tools} tools concurrently", spinner_type='dots', print_fn=agent._print_fn)
 								        spinner.start()
 								    try:
 								        runnable_calls = [
 								            (i, tc, name, args)
 								            for i, (tc, name, args, block_result, blocked_by_guardrail) in enumerate(parsed_calls)
 								            if block_result is None
 								        ]
 								        futures = []
 								        if runnable_calls:
 								            max_workers = min(len(runnable_calls), _MAX_TOOL_WORKERS)
 								            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
 								                for i, tc, name, args in runnable_calls:
-												fix(code-exec): propagate agent-turn context into tool worker threads

Worker threads that dispatch Hermes tools started with an empty contextvars.Context and no thread-local approval/sudo callbacks. Add tools/thread_context.propagate_context_to_thread, which factors out that capture/install/clear lifecycle (mirroring the GHSA-qg5c-hvr5-hjgr pattern), and refactor agent/tool_executor onto it so the security-critical logic lives in one audited place. Update the contextvar-propagation source guard for the new call shape.

Refs #33057

											
										
										
											2026-05-28 17:47:09 -04:00
+								                    # Propagate the agent turn's ContextVars (e.g.
 								                    # _approval_session_key) AND thread-local approval/sudo
 								                    # callbacks into the worker thread; clears callbacks on exit.
 								                    f = executor.submit(
 								                        propagate_context_to_thread(_run_tool), i, tc, name, args
 								                    )
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working. _set_interrupt likewise.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								                    futures.append(f)
 								                # Wait for all to complete with periodic heartbeats so the
 								                # gateway's inactivity monitor doesn't kill us during long
 								                # concurrent tool batches. Also check for user interrupts
 								                # so we don't block indefinitely when the user sends /stop
 								                # or a new message during concurrent tool execution.
 								                _conc_start = time.time()
 								                _interrupt_logged = False
 								                while True:
 								                    done, not_done = concurrent.futures.wait(
 								                        futures, timeout=5.0,
 								                    )
 								                    if not not_done:
 								                        break
 								                    # Check for interrupt — the per-thread interrupt signal
 								                    # already causes individual tools (terminal, execute_code)
 								                    # to abort, but tools without interrupt checks (web_search,
 								                    # read_file) will run to completion. Cancel any futures
 								                    # that haven't started yet so we don't block on them.
 								                    if agent._interrupt_requested:
 								                        if not _interrupt_logged:
 								                            _interrupt_logged = True
 								                            agent._vprint(
 								                                f"{agent.log_prefix}⚡ Interrupt: cancelling "
 								                                f"{len(not_done)} pending concurrent tool(s)",
 								                                force=True,
 								                            )
 								                        for f in not_done:
 								                            f.cancel()
 								                        # Give already-running tools a moment to notice the
 								                        # per-thread interrupt signal and exit gracefully.
 								                        concurrent.futures.wait(not_done, timeout=3.0)
 								                        break
 								                    _conc_elapsed = int(time.time() - _conc_start)
 								                    # Heartbeat every ~30s (6 × 5s poll intervals)
 								                    if _conc_elapsed > 0 and _conc_elapsed % 30 < 6:
 								                        _still_running = [
 								                            parsed_calls[futures.index(f)][1]
 								                            for f in not_done
 								                            if f in futures
 								                        ]
 								                        agent._touch_activity(
 								                            f"concurrent tools running ({_conc_elapsed}s, "
 								                            f"{len(not_done)} remaining: {', '.join(_still_running[:3])})"
 								                        )
 								    finally:
 								        if spinner:
 								            # Build a summary message for the spinner stop
 								            completed = sum(1 for r in results if r is not None)
 								            total_dur = sum(r[3] for r in results if r is not None)
 								            spinner.stop(f"⚡ {completed}/{num_tools} tools completed in {total_dur:.1f}s total")
 								    # ── Post-execution: display per-tool results ─────────────────────
 								    for i, (tc, name, args, block_result, blocked_by_guardrail) in enumerate(parsed_calls):
 								        r = results[i]
 								        blocked = False
 								        if r is None:
 								            # Tool was cancelled (interrupt) or thread didn't return
 								            if agent._interrupt_requested:
 								                function_result = f"[Tool execution cancelled — {name} was skipped due to user interrupt]"
-												feat(observability): observer-grade telemetry hooks + NeMo-Relay plugin

Adds backend-neutral observer hooks for plugins: session, turn, API
request, tool, approval, and subagent lifecycle events with stable
correlation IDs (session_id, task_id, turn_id, api_request_id,
tool_call_id, parent/child subagent ids). Extends VALID_HOOKS with
api_request_error and subagent_start.

Hot path is zero-cost when no plugin subscribes: has_hook()/presence
checks gate all payload construction, request payloads are returned
by reference when no middleware rewrites, and the sanitized response
payload no longer embeds raw response objects.

Bundles the optional NeMo-Relay observability plugin
(plugins/observability/nemo_relay) as an in-repo consumer of the new
hooks, peer to the existing langfuse plugin. Fails open when the
optional nemo-relay package is not installed.

Authored-by: Bryan Bednarski <bbednarski@nvidia.com>
Salvaged from #29722 onto current main.

											
										
										
											2026-06-03 17:44:13 +05:30
+								                _emit_terminal_post_tool_call(
 								                    agent,
 								                    function_name=name,
 								                    function_args=args,
 								                    result=function_result,
 								                    effective_task_id=effective_task_id,
 								                    tool_call_id=getattr(tc, "id", "") or "",
 								                    status="cancelled",
 								                    error_type="keyboard_interrupt",
 								                    error_message="Tool execution cancelled by user interrupt",
 								                )
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working. _set_interrupt likewise.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								            else:
 								                function_result = f"Error executing tool '{name}': thread did not return a result"
-												feat(observability): observer-grade telemetry hooks + NeMo-Relay plugin

Adds backend-neutral observer hooks for plugins: session, turn, API
request, tool, approval, and subagent lifecycle events with stable
correlation IDs (session_id, task_id, turn_id, api_request_id,
tool_call_id, parent/child subagent ids). Extends VALID_HOOKS with
api_request_error and subagent_start.

Hot path is zero-cost when no plugin subscribes: has_hook()/presence
checks gate all payload construction, request payloads are returned
by reference when no middleware rewrites, and the sanitized response
payload no longer embeds raw response objects.

Bundles the optional NeMo-Relay observability plugin
(plugins/observability/nemo_relay) as an in-repo consumer of the new
hooks, peer to the existing langfuse plugin. Fails open when the
optional nemo-relay package is not installed.

Authored-by: Bryan Bednarski <bbednarski@nvidia.com>
Salvaged from #29722 onto current main.

											
										
										
											2026-06-03 17:44:13 +05:30
+								                _emit_terminal_post_tool_call(
 								                    agent,
 								                    function_name=name,
 								                    function_args=args,
 								                    result=function_result,
 								                    effective_task_id=effective_task_id,
 								                    tool_call_id=getattr(tc, "id", "") or "",
 								                    status="error",
 								                    error_type="thread_missing_result",
 								                    error_message=function_result,
 								                )
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working. _set_interrupt likewise.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								            tool_duration = 0.0
 								        else:
 								            function_name, function_args, function_result, tool_duration, is_error, blocked = r
 								            if not blocked:
 								                function_result = agent._append_guardrail_observation(
 								                    function_name,
 								                    function_args,
 								                    function_result,
 								                    failed=is_error,
 								                )
 								            if is_error:
 								                _err_text = _multimodal_text_summary(function_result)
 								                result_preview = _err_text[:200] if len(_err_text) > 200 else _err_text
 								                logger.warning("Tool %s returned error (%.2fs): %s", function_name, tool_duration, result_preview)
 								            # Track file-mutation outcome for the turn-end verifier.
 								            # `blocked` calls never actually ran — don't let a guardrail
 								            # block count as either a failure or a success.
 								            if not blocked:
 								                try:
 								                    agent._record_file_mutation_result(
 								                        function_name, function_args, function_result, is_error,
 								                    )
 								                except Exception as _ver_err:
 								                    logging.debug("file-mutation verifier record failed: %s", _ver_err)
 								            if not blocked and agent.tool_progress_callback:
 								                try:
 								                    agent.tool_progress_callback(
 								                        "tool.completed", function_name, None, None,
 								                        duration=tool_duration, is_error=is_error,
-												fix(cli): surface tool failures with specific error messages

Improves the failure suffix on tool completion lines. Instead of always
showing '[error]' for non-terminal failures, parse the tool's JSON result
and surface the actual message:

  Before:  ┊ 📖 read      foo.py  0.1s [error]
  After:   ┊ 📖 read      foo.py  0.1s [File not found: foo.py]

  Before:  ┊ 💻 $         ls bad  0.1s [exit 127]
  After:   ┊ 💻 $         ls bad  0.1s [ls: cannot access 'bad'...]

Adds a _trim_error helper that strips long absolute paths down to the
filename and caps the suffix at 48 chars so it stays readable on narrow
terminals.

Threads the tool result through the tool.completed progress callback so
agent/display.get_cute_tool_message can inspect it. The cli.py [error]
post-suffix is removed in favor of the richer suffix _detect_tool_failure
now produces directly.

Originally proposed in PR #17194 by Albert.Zhou; salvaged onto current
main with the dead-code preview-length bumps dropped (tool_preview_length
config already strictly caps previews, so the per-tool n= defaults are
unreachable).

Co-authored-by: Albert.Zhou <albert748@gmail.com>

											
										
										
											2026-05-23 20:54:17 -07:00
+								                        result=function_result,
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working. _set_interrupt likewise.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								                    )
 								                except Exception as cb_err:
 								                    logging.debug(f"Tool progress callback error: {cb_err}")
 								            if agent.verbose_logging:
 								                logging.debug(f"Tool {function_name} completed in {tool_duration:.2f}s")
 								                logging.debug(f"Tool result ({len(function_result)} chars): {function_result}")
 								        # Print cute message per tool
 								        if agent._should_emit_quiet_tool_messages():
 								            cute_msg = _get_cute_tool_message_impl(name, args, tool_duration, result=function_result)
 								            agent._safe_print(f"  {cute_msg}")
 								        elif not agent.quiet_mode:
 								            _preview_str = _multimodal_text_summary(function_result)
 								            if agent.verbose_logging:
 								                print(f"  ✅ Tool {i+1} completed in {tool_duration:.2f}s")
 								                print(agent._wrap_verbose("Result: ", _preview_str))
 								            else:
 								                response_preview = _preview_str[:agent.log_prefix_chars] + "..." if len(_preview_str) > agent.log_prefix_chars else _preview_str
 								                print(f"  ✅ Tool {i+1} completed in {tool_duration:.2f}s - {response_preview}")
 								        agent._current_tool = None
 								        agent._touch_activity(f"tool completed: {name} ({tool_duration:.1f}s)")
 								        if not blocked and agent.tool_complete_callback:
 								            try:
 								                agent.tool_complete_callback(tc.id, name, args, function_result)
 								            except Exception as cb_err:
 								                logging.debug(f"Tool complete callback error: {cb_err}")
 								        function_result = maybe_persist_tool_result(
 								            content=function_result,
 								            tool_name=name,
 								            tool_use_id=tc.id,
 								            env=get_active_env(effective_task_id),
 								        ) if not _is_multimodal_tool_result(function_result) else function_result
 								        subdir_hints = agent._subdirectory_hints.check_tool_call(name, args)
 								        if subdir_hints:
 								            if _is_multimodal_tool_result(function_result):
 								                # Append the hint to the text summary part so the model
 								                # still sees it; don't touch the image blocks.
 								                _append_subdir_hint_to_multimodal(function_result, subdir_hints)
 								            else:
 								                function_result += subdir_hints
 								        # Unwrap _multimodal dicts to an OpenAI-style content list so any
 								        # vision-capable provider receives [{type:text},{type:image_url}]
 								        # rather than a raw Python dict.  The Anthropic adapter already
 								        # accepts content lists; vision-capable OpenAI-compatible servers
 								        # (mlx-vlm, GPT-4o, …) accept image_url in tool messages natively.
 								        # Text-only servers get a string-safe fallback here so a rejected
 								        # image tool result never poisons canonical session history.
 								        # String results pass through unchanged.
 								        _tool_content = agent._tool_result_content_for_active_model(name, function_result)
-												fix(agent): set tool_name on tool-result messages at construction time

Introduces make_tool_result_message() in tool_dispatch_helpers.py as the
single place where tool-result message dicts are built. All six construction
sites in tool_executor.py, agent_runtime_helpers.py, and mini_swe_runner.py
now use it, so tool_name is set in memory from the moment a message is
created rather than relying on fallback logic in the flush paths.

Fixes blank tool_name in both state.db and JSON session logs.

Adds tests.

											
										
										
											2026-05-19 20:24:30 +01:00
+								        messages.append(make_tool_result_message(name, _tool_content, tc.id))
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working. _set_interrupt likewise.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
 								        # ── Per-tool /steer drain ───────────────────────────────────
 								        # Same as the sequential path: drain between each collected
 								        # result so the steer lands as early as possible.
 								        agent._apply_pending_steer_to_tool_results(messages, 1)
 								    # ── Per-turn aggregate budget enforcement ─────────────────────────
 								    num_tools = len(parsed_calls)
 								    if num_tools > 0:
 								        turn_tool_msgs = messages[-num_tools:]
 								        enforce_turn_budget(turn_tool_msgs, env=get_active_env(effective_task_id))
 								    # ── /steer injection ──────────────────────────────────────────────
 								    # Append any pending user steer text to the last tool result so the
 								    # agent sees it on its next iteration. Runs AFTER budget enforcement
 								    # so the steer marker is never truncated. See steer() for details.
 								    if num_tools > 0:
 								        agent._apply_pending_steer_to_tool_results(messages, num_tools)
 								def execute_tool_calls_sequential(agent, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
 								    """Execute tool calls sequentially (original behavior). Used for single calls or interactive tools."""
 								    for i, tool_call in enumerate(assistant_message.tool_calls, 1):
 								        # SAFETY: check interrupt BEFORE starting each tool.
 								        # If the user sent "stop" during a previous tool's execution,
 								        # do NOT start any more tools -- skip them all immediately.
 								        if agent._interrupt_requested:
 								            remaining_calls = assistant_message.tool_calls[i-1:]
 								            if remaining_calls:
 								                agent._vprint(f"{agent.log_prefix}⚡ Interrupt: skipping {len(remaining_calls)} tool call(s)", force=True)
 								            for skipped_tc in remaining_calls:
 								                skipped_name = skipped_tc.function.name
 								                skip_msg = {
 								                    "role": "tool",
 								                    "name": skipped_name,
 								                    "content": f"[Tool execution cancelled — {skipped_name} was skipped due to user interrupt]",
 								                    "tool_call_id": skipped_tc.id,
 								                }
 								                messages.append(skip_msg)
 								            break
 								        function_name = tool_call.function.name
 								        try:
 								            function_args = json.loads(tool_call.function.arguments)
 								        except json.JSONDecodeError as e:
-												fix(compressor): propagate api_mode and fix root logger calls

- Add api_mode to 4 update_model() call sites:
  - conversation_loop.py: long_context failover and probe stepping
  - agent_runtime_helpers.py: rollback restore (also saves compressor_api_mode)
  - chat_completion_helpers.py: fallback activation
- Fix 31 root-logger calls across 5 files (logging.warning/error/info
  -> logger.warning/error/info) to respect module-level log filtering

											
										
										
											2026-05-21 14:09:30 +03:00
+								            logger.warning(f"Unexpected JSON error after validation: {e}")
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working. _set_interrupt likewise.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								            function_args = {}
 								        if not isinstance(function_args, dict):
 								            function_args = {}
-												fix(tool-search): scope bridge catalog + dispatch to the session's toolsets

Tool Search read its catalog from the global registry (get_tool_definitions
with no toolset scope = 'start with everything'), so a restricted-toolset
session — subagent, kanban worker, curated gateway session — could:

  1. tool_search the entire process registry, not just its granted tools, and
  2. tool_call any registered plugin/MCP tool it was never given, because
     registry.dispatch() has no enabled_tools gate for non-execute_code tools.

A scoped session (enabled_toolsets=['mcp-github']) reported total_available=26
and successfully invoked an out-of-scope plugin tool via tool_call.

Fix:
- handle_function_call gains enabled_toolsets/disabled_toolsets; the bridge
  dispatch scopes get_tool_definitions to them (also stops polluting the
  process-global _last_resolved_tool_names with out-of-scope tools, which
  leaked into execute_code's sandbox-tool fallback).
- A defense-in-depth gate rejects any tool_call'd name not in the scoped
  deferrable catalog.
- tool_executor's unwrap (both concurrent + sequential paths) enforces the
  same scope before dispatch, since it unwraps tool_call -> underlying name
  and bypasses the bridge branch. New _tool_search_scoped_names() helper,
  cached per-agent on registry generation + toolset scope.
- New scoped_deferrable_names() helper in tool_search.py shared by both sites.

Tests: 4 new regression tests in TestRegression_ToolsetScoping (scoped
catalog, out-of-scope tool_call rejection, no global pollution, helper).

											
										
										
											2026-05-29 01:21:41 -07:00
+								        # Tool Search unwrap — see execute_tool_calls_concurrent for full
 								        # rationale, including the scope gate (the unwrap dispatches the
 								        # underlying tool directly, so session toolset scope is enforced here).
 								        _ts_scope_block: Optional[str] = None
-												feat(tools): progressive tool disclosure for MCP and plugin tools

Adds Tool Search, a structured-tools progressive-disclosure layer that
replaces MCP and non-core plugin tools in the model-visible tools array
with three bridge tools (tool_search / tool_describe / tool_call) when
the deferrable surface would consume more than a configurable percentage
of the active model's context window. Core Hermes tools are never deferred.

Default mode is 'auto' with a 10% context threshold, so small toolsets
pay no overhead. Set tools.tool_search.enabled to 'on' to force or 'off'
to disable.

Design carefully reflects the OpenClaw production failure modes
documented in the openclaw-tool-search-report:

  - Core tools never defer (toolsets._HERMES_CORE_TOOLS). Addresses the
    'tools silently missing from isolated cron turns' regression class
    (openclaw#84141) by construction: there is no code path that can
    drop a core tool.
  - Catalog is stateless across turns — rebuilt from the live tool-defs
    list on every assembly. No session-keyed Map that can drift out of
    sync with the registry.
  - tool_call unwraps the bridge call before any hook fires, so plugin
    pre/post hooks, guardrails, approval flows, and the activity feed
    all see the underlying tool name, not the bridge (addresses
    openclaw#85588 and the verbose-mode complaint on openclaw#79823).
  - The unwrap happens in both the parallel and sequential paths of
    agent/tool_executor.py and also in handle_function_call, so direct
    callers (sandboxed code, eval harnesses) are covered too.
  - Bridge tools cannot invoke each other (recursion guard) and cannot
    invoke core tools (those must be called directly).
  - Tools mode only — no JS-sandbox code-mode. Keeps the surface small.
  - Token estimation via cheap char/4 heuristic; precision isn't needed
    for the threshold decision.

Files:
  - tools/tool_search.py — new module (BM25 retrieval, classification,
    threshold gate, bridge dispatch, unwrap helper).
  - tests/tools/test_tool_search.py — 35 tests including the OpenClaw
    #84141 regression guard.
  - model_tools.py — wires assembly into _compute_tool_definitions as the
    final step, adds skip_tool_search_assembly kwarg so the bridge can
    see the real catalog, dispatches the three bridge tools.
  - agent/tool_executor.py — unwraps tool_call in both parallel and
    sequential parsing loops so checkpointing, guardrails, plugin hooks,
    and tool-progress callbacks all observe the underlying tool name.
  - hermes_cli/config.py — DEFAULT_CONFIG['tools']['tool_search'] block.
  - website/docs/user-guide/features/tool-search.md — user docs.

Validation:
  - 35/35 new tests pass.
  - Existing tool/registry/model_tools/config/coercion/executor tests
    (82 + 74 + small adjacents) green.
  - Live E2E: 20 fake MCP tools registered, get_tool_definitions returns
    3 bridges, tool_search returns top 3 hits, tool_describe returns
    full schema, tool_call dispatches to the real underlying handler
    and the underlying result is what the model sees.
  - Reserved-name recursion guard verified live.
  - Core-tool refusal via tool_call verified live.

											
										
										
											2026-05-23 15:22:01 -07:00
+								        try:
 								            from tools import tool_search as _ts
 								            if function_name == _ts.TOOL_CALL_NAME:
 								                _underlying, _underlying_args, _err = _ts.resolve_underlying_call(function_args)
 								                if not _err and _underlying:
-												fix(tool-search): scope bridge catalog + dispatch to the session's toolsets

Tool Search read its catalog from the global registry (get_tool_definitions
with no toolset scope = 'start with everything'), so a restricted-toolset
session — subagent, kanban worker, curated gateway session — could:

  1. tool_search the entire process registry, not just its granted tools, and
  2. tool_call any registered plugin/MCP tool it was never given, because
     registry.dispatch() has no enabled_tools gate for non-execute_code tools.

A scoped session (enabled_toolsets=['mcp-github']) reported total_available=26
and successfully invoked an out-of-scope plugin tool via tool_call.

Fix:
- handle_function_call gains enabled_toolsets/disabled_toolsets; the bridge
  dispatch scopes get_tool_definitions to them (also stops polluting the
  process-global _last_resolved_tool_names with out-of-scope tools, which
  leaked into execute_code's sandbox-tool fallback).
- A defense-in-depth gate rejects any tool_call'd name not in the scoped
  deferrable catalog.
- tool_executor's unwrap (both concurrent + sequential paths) enforces the
  same scope before dispatch, since it unwraps tool_call -> underlying name
  and bypasses the bridge branch. New _tool_search_scoped_names() helper,
  cached per-agent on registry generation + toolset scope.
- New scoped_deferrable_names() helper in tool_search.py shared by both sites.

Tests: 4 new regression tests in TestRegression_ToolsetScoping (scoped
catalog, out-of-scope tool_call rejection, no global pollution, helper).

											
										
										
											2026-05-29 01:21:41 -07:00
+								                    if _underlying in _tool_search_scoped_names(agent):
 								                        function_name = _underlying
 								                        function_args = _underlying_args
 								                    else:
 								                        _ts_scope_block = (
 								                            f"'{_underlying}' is not available in this session. "
 								                            "Use tool_search to find tools you can call."
 								                        )
-												feat(tools): progressive tool disclosure for MCP and plugin tools

Adds Tool Search, a structured-tools progressive-disclosure layer that
replaces MCP and non-core plugin tools in the model-visible tools array
with three bridge tools (tool_search / tool_describe / tool_call) when
the deferrable surface would consume more than a configurable percentage
of the active model's context window. Core Hermes tools are never deferred.

Default mode is 'auto' with a 10% context threshold, so small toolsets
pay no overhead. Set tools.tool_search.enabled to 'on' to force or 'off'
to disable.

Design carefully reflects the OpenClaw production failure modes
documented in the openclaw-tool-search-report:

  - Core tools never defer (toolsets._HERMES_CORE_TOOLS). Addresses the
    'tools silently missing from isolated cron turns' regression class
    (openclaw#84141) by construction: there is no code path that can
    drop a core tool.
  - Catalog is stateless across turns — rebuilt from the live tool-defs
    list on every assembly. No session-keyed Map that can drift out of
    sync with the registry.
  - tool_call unwraps the bridge call before any hook fires, so plugin
    pre/post hooks, guardrails, approval flows, and the activity feed
    all see the underlying tool name, not the bridge (addresses
    openclaw#85588 and the verbose-mode complaint on openclaw#79823).
  - The unwrap happens in both the parallel and sequential paths of
    agent/tool_executor.py and also in handle_function_call, so direct
    callers (sandboxed code, eval harnesses) are covered too.
  - Bridge tools cannot invoke each other (recursion guard) and cannot
    invoke core tools (those must be called directly).
  - Tools mode only — no JS-sandbox code-mode. Keeps the surface small.
  - Token estimation via cheap char/4 heuristic; precision isn't needed
    for the threshold decision.

Files:
  - tools/tool_search.py — new module (BM25 retrieval, classification,
    threshold gate, bridge dispatch, unwrap helper).
  - tests/tools/test_tool_search.py — 35 tests including the OpenClaw
    #84141 regression guard.
  - model_tools.py — wires assembly into _compute_tool_definitions as the
    final step, adds skip_tool_search_assembly kwarg so the bridge can
    see the real catalog, dispatches the three bridge tools.
  - agent/tool_executor.py — unwraps tool_call in both parallel and
    sequential parsing loops so checkpointing, guardrails, plugin hooks,
    and tool-progress callbacks all observe the underlying tool name.
  - hermes_cli/config.py — DEFAULT_CONFIG['tools']['tool_search'] block.
  - website/docs/user-guide/features/tool-search.md — user docs.

Validation:
  - 35/35 new tests pass.
  - Existing tool/registry/model_tools/config/coercion/executor tests
    (82 + 74 + small adjacents) green.
  - Live E2E: 20 fake MCP tools registered, get_tool_definitions returns
    3 bridges, tool_search returns top 3 hits, tool_describe returns
    full schema, tool_call dispatches to the real underlying handler
    and the underlying result is what the model sees.
  - Reserved-name recursion guard verified live.
  - Core-tool refusal via tool_call verified live.

											
										
										
											2026-05-23 15:22:01 -07:00
+								        except Exception:
 								            pass
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working. _set_interrupt likewise.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								        # Check plugin hooks for a block directive before executing.
 								        _block_msg: Optional[str] = None
-												feat(observability): observer-grade telemetry hooks + NeMo-Relay plugin

Adds backend-neutral observer hooks for plugins: session, turn, API
request, tool, approval, and subagent lifecycle events with stable
correlation IDs (session_id, task_id, turn_id, api_request_id,
tool_call_id, parent/child subagent ids). Extends VALID_HOOKS with
api_request_error and subagent_start.

Hot path is zero-cost when no plugin subscribes: has_hook()/presence
checks gate all payload construction, request payloads are returned
by reference when no middleware rewrites, and the sanitized response
payload no longer embeds raw response objects.

Bundles the optional NeMo-Relay observability plugin
(plugins/observability/nemo_relay) as an in-repo consumer of the new
hooks, peer to the existing langfuse plugin. Fails open when the
optional nemo-relay package is not installed.

Authored-by: Bryan Bednarski <bbednarski@nvidia.com>
Salvaged from #29722 onto current main.

											
										
										
											2026-06-03 17:44:13 +05:30
+								        _block_error_type = "plugin_block"
-												fix(tool-search): scope bridge catalog + dispatch to the session's toolsets

Tool Search read its catalog from the global registry (get_tool_definitions
with no toolset scope = 'start with everything'), so a restricted-toolset
session — subagent, kanban worker, curated gateway session — could:

  1. tool_search the entire process registry, not just its granted tools, and
  2. tool_call any registered plugin/MCP tool it was never given, because
     registry.dispatch() has no enabled_tools gate for non-execute_code tools.

A scoped session (enabled_toolsets=['mcp-github']) reported total_available=26
and successfully invoked an out-of-scope plugin tool via tool_call.

Fix:
- handle_function_call gains enabled_toolsets/disabled_toolsets; the bridge
  dispatch scopes get_tool_definitions to them (also stops polluting the
  process-global _last_resolved_tool_names with out-of-scope tools, which
  leaked into execute_code's sandbox-tool fallback).
- A defense-in-depth gate rejects any tool_call'd name not in the scoped
  deferrable catalog.
- tool_executor's unwrap (both concurrent + sequential paths) enforces the
  same scope before dispatch, since it unwraps tool_call -> underlying name
  and bypasses the bridge branch. New _tool_search_scoped_names() helper,
  cached per-agent on registry generation + toolset scope.
- New scoped_deferrable_names() helper in tool_search.py shared by both sites.

Tests: 4 new regression tests in TestRegression_ToolsetScoping (scoped
catalog, out-of-scope tool_call rejection, no global pollution, helper).

											
										
										
											2026-05-29 01:21:41 -07:00
+								        if _ts_scope_block is not None:
 								            _block_msg = _ts_scope_block
-												feat(observability): observer-grade telemetry hooks + NeMo-Relay plugin

Adds backend-neutral observer hooks for plugins: session, turn, API
request, tool, approval, and subagent lifecycle events with stable
correlation IDs (session_id, task_id, turn_id, api_request_id,
tool_call_id, parent/child subagent ids). Extends VALID_HOOKS with
api_request_error and subagent_start.

Hot path is zero-cost when no plugin subscribes: has_hook()/presence
checks gate all payload construction, request payloads are returned
by reference when no middleware rewrites, and the sanitized response
payload no longer embeds raw response objects.

Bundles the optional NeMo-Relay observability plugin
(plugins/observability/nemo_relay) as an in-repo consumer of the new
hooks, peer to the existing langfuse plugin. Fails open when the
optional nemo-relay package is not installed.

Authored-by: Bryan Bednarski <bbednarski@nvidia.com>
Salvaged from #29722 onto current main.

											
										
										
											2026-06-03 17:44:13 +05:30
+								            _block_error_type = "tool_scope_block"
-												fix(tool-search): scope bridge catalog + dispatch to the session's toolsets

Tool Search read its catalog from the global registry (get_tool_definitions
with no toolset scope = 'start with everything'), so a restricted-toolset
session — subagent, kanban worker, curated gateway session — could:

  1. tool_search the entire process registry, not just its granted tools, and
  2. tool_call any registered plugin/MCP tool it was never given, because
     registry.dispatch() has no enabled_tools gate for non-execute_code tools.

A scoped session (enabled_toolsets=['mcp-github']) reported total_available=26
and successfully invoked an out-of-scope plugin tool via tool_call.

Fix:
- handle_function_call gains enabled_toolsets/disabled_toolsets; the bridge
  dispatch scopes get_tool_definitions to them (also stops polluting the
  process-global _last_resolved_tool_names with out-of-scope tools, which
  leaked into execute_code's sandbox-tool fallback).
- A defense-in-depth gate rejects any tool_call'd name not in the scoped
  deferrable catalog.
- tool_executor's unwrap (both concurrent + sequential paths) enforces the
  same scope before dispatch, since it unwraps tool_call -> underlying name
  and bypasses the bridge branch. New _tool_search_scoped_names() helper,
  cached per-agent on registry generation + toolset scope.
- New scoped_deferrable_names() helper in tool_search.py shared by both sites.

Tests: 4 new regression tests in TestRegression_ToolsetScoping (scoped
catalog, out-of-scope tool_call rejection, no global pollution, helper).

											
										
										
											2026-05-29 01:21:41 -07:00
+								        else:
 								            try:
 								                from hermes_cli.plugins import get_pre_tool_call_block_message
 								                _block_msg = get_pre_tool_call_block_message(
-												feat(observability): observer-grade telemetry hooks + NeMo-Relay plugin

Adds backend-neutral observer hooks for plugins: session, turn, API
request, tool, approval, and subagent lifecycle events with stable
correlation IDs (session_id, task_id, turn_id, api_request_id,
tool_call_id, parent/child subagent ids). Extends VALID_HOOKS with
api_request_error and subagent_start.

Hot path is zero-cost when no plugin subscribes: has_hook()/presence
checks gate all payload construction, request payloads are returned
by reference when no middleware rewrites, and the sanitized response
payload no longer embeds raw response objects.

Bundles the optional NeMo-Relay observability plugin
(plugins/observability/nemo_relay) as an in-repo consumer of the new
hooks, peer to the existing langfuse plugin. Fails open when the
optional nemo-relay package is not installed.

Authored-by: Bryan Bednarski <bbednarski@nvidia.com>
Salvaged from #29722 onto current main.

											
										
										
											2026-06-03 17:44:13 +05:30
+								                    function_name,
 								                    function_args,
 								                    task_id=effective_task_id or "",
 								                    session_id=getattr(agent, "session_id", "") or "",
 								                    tool_call_id=getattr(tool_call, "id", "") or "",
 								                    turn_id=getattr(agent, "_current_turn_id", "") or "",
 								                    api_request_id=getattr(agent, "_current_api_request_id", "") or "",
-												fix(tool-search): scope bridge catalog + dispatch to the session's toolsets

Tool Search read its catalog from the global registry (get_tool_definitions
with no toolset scope = 'start with everything'), so a restricted-toolset
session — subagent, kanban worker, curated gateway session — could:

  1. tool_search the entire process registry, not just its granted tools, and
  2. tool_call any registered plugin/MCP tool it was never given, because
     registry.dispatch() has no enabled_tools gate for non-execute_code tools.

A scoped session (enabled_toolsets=['mcp-github']) reported total_available=26
and successfully invoked an out-of-scope plugin tool via tool_call.

Fix:
- handle_function_call gains enabled_toolsets/disabled_toolsets; the bridge
  dispatch scopes get_tool_definitions to them (also stops polluting the
  process-global _last_resolved_tool_names with out-of-scope tools, which
  leaked into execute_code's sandbox-tool fallback).
- A defense-in-depth gate rejects any tool_call'd name not in the scoped
  deferrable catalog.
- tool_executor's unwrap (both concurrent + sequential paths) enforces the
  same scope before dispatch, since it unwraps tool_call -> underlying name
  and bypasses the bridge branch. New _tool_search_scoped_names() helper,
  cached per-agent on registry generation + toolset scope.
- New scoped_deferrable_names() helper in tool_search.py shared by both sites.

Tests: 4 new regression tests in TestRegression_ToolsetScoping (scoped
catalog, out-of-scope tool_call rejection, no global pollution, helper).

											
										
										
											2026-05-29 01:21:41 -07:00
+								                )
 								            except Exception:
 								                pass
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working. _set_interrupt likewise.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
 								        _guardrail_block_decision: ToolGuardrailDecision | None = None
 								        if _block_msg is None:
 								            guardrail_decision = agent._tool_guardrails.before_call(function_name, function_args)
 								            if not guardrail_decision.allows_execution:
 								                _guardrail_block_decision = guardrail_decision
 								        _execution_blocked = _block_msg is not None or _guardrail_block_decision is not None
 								        if _execution_blocked:
 								            # Tool blocked by plugin or guardrail policy — skip counters,
 								            # callbacks, checkpointing, activity mutation, and real execution.
 								            pass
 								        # Reset nudge counters when the relevant tool is actually used
 								        elif function_name == "memory":
 								            agent._turns_since_memory = 0
 								        elif function_name == "skill_manage":
 								            agent._iters_since_skill = 0
 								        if not agent.quiet_mode:
 								            args_str = json.dumps(function_args, ensure_ascii=False)
 								            if agent.verbose_logging:
 								                print(f"  📞 Tool {i}: {function_name}({list(function_args.keys())})")
 								                print(agent._wrap_verbose("Args: ", json.dumps(function_args, indent=2, ensure_ascii=False)))
 								            else:
 								                args_preview = args_str[:agent.log_prefix_chars] + "..." if len(args_str) > agent.log_prefix_chars else args_str
 								                print(f"  📞 Tool {i}: {function_name}({list(function_args.keys())}) - {args_preview}")
 								        if not _execution_blocked:
 								            agent._current_tool = function_name
 								            agent._touch_activity(f"executing tool: {function_name}")
 								        # Set activity callback for long-running tool execution (terminal
 								        # commands, etc.) so the gateway's inactivity monitor doesn't kill
 								        # the agent while a command is running.
 								        if not _execution_blocked:
 								            try:
 								                from tools.environments.base import set_activity_callback
 								                set_activity_callback(agent._touch_activity)
 								            except Exception:
 								                pass
 								        if not _execution_blocked and agent.tool_progress_callback:
 								            try:
 								                preview = _build_tool_preview(function_name, function_args)
 								                agent.tool_progress_callback("tool.started", function_name, preview, function_args)
 								            except Exception as cb_err:
 								                logging.debug(f"Tool progress callback error: {cb_err}")
 								        if not _execution_blocked and agent.tool_start_callback:
 								            try:
 								                agent.tool_start_callback(tool_call.id, function_name, function_args)
 								            except Exception as cb_err:
 								                logging.debug(f"Tool start callback error: {cb_err}")
 								        # Checkpoint: snapshot working dir before file-mutating tools
 								        if not _execution_blocked and function_name in {"write_file", "patch"} and agent._checkpoint_mgr.enabled:
 								            try:
 								                file_path = function_args.get("path", "")
 								                if file_path:
 								                    work_dir = agent._checkpoint_mgr.get_working_dir_for_path(file_path)
 								                    agent._checkpoint_mgr.ensure_checkpoint(
 								                        work_dir, f"before {function_name}"
 								                    )
 								            except Exception:
 								                pass  # never block tool execution
 								        # Checkpoint before destructive terminal commands
 								        if not _execution_blocked and function_name == "terminal" and agent._checkpoint_mgr.enabled:
 								            try:
 								                cmd = function_args.get("command", "")
 								                if _is_destructive_command(cmd):
 								                    cwd = function_args.get("workdir") or os.getenv("TERMINAL_CWD", os.getcwd())
 								                    agent._checkpoint_mgr.ensure_checkpoint(
 								                        cwd, f"before terminal: {cmd[:60]}"
 								                    )
 								            except Exception:
 								                pass  # never block tool execution
 								        tool_start_time = time.time()
 								        if _block_msg is not None:
 								            # Tool blocked by plugin policy — return error without executing.
 								            function_result = json.dumps({"error": _block_msg}, ensure_ascii=False)
 								            tool_duration = 0.0
-												feat(observability): observer-grade telemetry hooks + NeMo-Relay plugin

Adds backend-neutral observer hooks for plugins: session, turn, API
request, tool, approval, and subagent lifecycle events with stable
correlation IDs (session_id, task_id, turn_id, api_request_id,
tool_call_id, parent/child subagent ids). Extends VALID_HOOKS with
api_request_error and subagent_start.

Hot path is zero-cost when no plugin subscribes: has_hook()/presence
checks gate all payload construction, request payloads are returned
by reference when no middleware rewrites, and the sanitized response
payload no longer embeds raw response objects.

Bundles the optional NeMo-Relay observability plugin
(plugins/observability/nemo_relay) as an in-repo consumer of the new
hooks, peer to the existing langfuse plugin. Fails open when the
optional nemo-relay package is not installed.

Authored-by: Bryan Bednarski <bbednarski@nvidia.com>
Salvaged from #29722 onto current main.

											
										
										
											2026-06-03 17:44:13 +05:30
+								            _emit_terminal_post_tool_call(
 								                agent,
 								                function_name=function_name,
 								                function_args=function_args,
 								                result=function_result,
 								                effective_task_id=effective_task_id,
 								                tool_call_id=getattr(tool_call, "id", "") or "",
 								                status="blocked",
 								                error_type=_block_error_type,
 								                error_message=_block_msg,
 								            )
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working. _set_interrupt likewise.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								        elif _guardrail_block_decision is not None:
 								            # Tool blocked by tool-loop guardrail — synthesize exactly one
 								            # tool result for the original tool_call_id without executing.
 								            function_result = agent._guardrail_block_result(_guardrail_block_decision)
 								            tool_duration = 0.0
-												feat(observability): observer-grade telemetry hooks + NeMo-Relay plugin

Adds backend-neutral observer hooks for plugins: session, turn, API
request, tool, approval, and subagent lifecycle events with stable
correlation IDs (session_id, task_id, turn_id, api_request_id,
tool_call_id, parent/child subagent ids). Extends VALID_HOOKS with
api_request_error and subagent_start.

Hot path is zero-cost when no plugin subscribes: has_hook()/presence
checks gate all payload construction, request payloads are returned
by reference when no middleware rewrites, and the sanitized response
payload no longer embeds raw response objects.

Bundles the optional NeMo-Relay observability plugin
(plugins/observability/nemo_relay) as an in-repo consumer of the new
hooks, peer to the existing langfuse plugin. Fails open when the
optional nemo-relay package is not installed.

Authored-by: Bryan Bednarski <bbednarski@nvidia.com>
Salvaged from #29722 onto current main.

											
										
										
											2026-06-03 17:44:13 +05:30
+								            _emit_terminal_post_tool_call(
 								                agent,
 								                function_name=function_name,
 								                function_args=function_args,
 								                result=function_result,
 								                effective_task_id=effective_task_id,
 								                tool_call_id=getattr(tool_call, "id", "") or "",
 								                status="blocked",
 								                error_type="guardrail_block",
 								                error_message=getattr(_guardrail_block_decision, "message", None) or "Tool blocked by guardrail policy",
 								            )
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working. _set_interrupt likewise.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								        elif function_name == "todo":
 								            from tools.todo_tool import todo_tool as _todo_tool
 								            function_result = _todo_tool(
 								                todos=function_args.get("todos"),
 								                merge=function_args.get("merge", False),
 								                store=agent._todo_store,
 								            )
 								            tool_duration = time.time() - tool_start_time
 								            if agent._should_emit_quiet_tool_messages():
 								                agent._vprint(f"  {_get_cute_tool_message_impl('todo', function_args, tool_duration, result=function_result)}")
 								        elif function_name == "session_search":
 								            session_db = agent._get_session_db_for_recall()
 								            if not session_db:
 								                from hermes_state import format_session_db_unavailable
 								                function_result = json.dumps({"success": False, "error": format_session_db_unavailable()})
 								            else:
 								                from tools.session_search_tool import session_search as _session_search
 								                function_result = _session_search(
 								                    query=function_args.get("query", ""),
 								                    role_filter=function_args.get("role_filter"),
 								                    limit=function_args.get("limit", 3),
-												feat(session_search): single-shape tool with discovery, scroll, browse — no LLM (#27590)

* feat(session_search): single-shape tool with discovery, scroll, browse — no LLM

Replaces the LLM-summarized session_search with a single-shape tool that
returns actual messages from the DB. Three calling shapes inferred from
args (no mode parameter):

  1. Discovery — pass query. FTS5 + anchored ±5 window + bookends per hit,
     all in one call. ~20ms on a real DB instead of ~90s for the previous
     three aux-LLM calls.
  2. Scroll — pass session_id + around_message_id. Returns a window
     centered on the anchor. To paginate, re-anchor on the first/last id
     of the returned window. Boundary message appears in both windows
     as the orientation marker. ~1ms per scroll call.
  3. Browse — no args. Recent sessions chronologically.

Bookend_start (first 3 user+assistant msgs) and bookend_end (last 3) give
the agent goal + resolution on every discovery hit, so a single tool call
reconstructs a long session's arc without loading the whole transcript.

The aux-LLM summary path is gone: it cost ~$0.30/call, took ~30s, and
laundered FTS5 hits through a model that could confabulate when the right
session wasn't in the hit list. The merged shape returns byte-for-byte
content from SQLite.

History:
- PR #20238 (JabberELF) seeded the fast/summary dual-mode split.
- PR #26419 (yoniebans) expanded to fast/guided/summary with bookends,
  multi-anchor drill-down, default-mode config, and a teaching skill.

This PR collapses that toolkit into one shape with explicit scroll
support, drops the summary path, drops the mode parameter, drops the
config knob, drops the skill. JabberELF's seed work is acknowledged via
the AUTHOR_MAP entry.

Validation:
- 38/38 tool tests pass (tests/tools/test_session_search.py)
- 12/12 get_messages_around tests pass (tests/hermes_state/)
- 11/11 get_anchored_view tests pass (tests/hermes_state/)
- Full tests/tools/ run: 5168 passing, 2 failures pre-exist on main
  (test ordering in test_delegate.py, unrelated)
- E2E against live state DB: discovery 20ms, scroll 1ms, browse 280ms;
  pagination forward+backward works with boundary-message orientation;
  error paths return clean tool_error responses

Co-authored-by: JabberELF <abcdjmm970703@gmail.com>
Co-authored-by: yoniebans <jonny@nousresearch.com>

* chore(session_search): prune dead LLM-summary config and docs

Companion to the single-shape rewrite. The auxiliary.session_search config
block, max_concurrency / extra_body tunables, and matching docs sections
all referenced the removed LLM summarization path. Removing them so users
don't try to tune knobs that nothing reads.

- hermes_cli/config.py: drop dead auxiliary.session_search block from
  DEFAULT_CONFIG. Leftover keys in user config.yaml are harmless and
  ignored.
- hermes_cli/tips.py: drop two tips referencing the removed
  max_concurrency / extra_body knobs.
- website/docs/user-guide/configuration.md: drop 'Session Search Tuning'
  section and the auxiliary.session_search block from the example.
- website/docs/user-guide/features/fallback-providers.md: drop session_search
  rows from the auxiliary-tasks tables and the dedicated tuning subsection.
- website/docs/reference/tools-reference.md: rewrite the session_search
  entry to describe the new three-shape behaviour.
- CONTRIBUTING.md: update the file-tree description.
- tests/tools/test_llm_content_none_guard.py: remove TestSessionSearchContentNone
  class and test_session_search_tool_guarded — both guard against an
  unguarded .content.strip() call site in _summarize_session() that no
  longer exists.

Validation: 97/97 targeted tests still pass (hermes_state + session_search +
llm_content_none_guard). Config tests 55/55.

---------

Co-authored-by: JabberELF <abcdjmm970703@gmail.com>
Co-authored-by: yoniebans <jonny@nousresearch.com>
											
										
										
											2026-05-17 23:28:45 -07:00
+								                    session_id=function_args.get("session_id"),
 								                    around_message_id=function_args.get("around_message_id"),
 								                    window=function_args.get("window", 5),
 								                    sort=function_args.get("sort"),
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working. _set_interrupt likewise.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								                    db=session_db,
 								                    current_session_id=agent.session_id,
 								                )
 								            tool_duration = time.time() - tool_start_time
 								            if agent._should_emit_quiet_tool_messages():
 								                agent._vprint(f"  {_get_cute_tool_message_impl('session_search', function_args, tool_duration, result=function_result)}")
 								        elif function_name == "memory":
 								            target = function_args.get("target", "memory")
 								            from tools.memory_tool import memory_tool as _memory_tool
 								            function_result = _memory_tool(
 								                action=function_args.get("action"),
 								                target=target,
 								                content=function_args.get("content"),
 								                old_text=function_args.get("old_text"),
 								                store=agent._memory_store,
 								            )
 								            # Bridge: notify external memory provider of built-in memory writes
 								            if agent._memory_manager and function_args.get("action") in {"add", "replace"}:
 								                try:
 								                    agent._memory_manager.on_memory_write(
 								                        function_args.get("action", ""),
 								                        target,
 								                        function_args.get("content", ""),
 								                        metadata=agent._build_memory_write_metadata(
 								                            task_id=effective_task_id,
 								                            tool_call_id=getattr(tool_call, "id", None),
 								                        ),
 								                    )
 								                except Exception:
 								                    pass
 								            tool_duration = time.time() - tool_start_time
 								            if agent._should_emit_quiet_tool_messages():
 								                agent._vprint(f"  {_get_cute_tool_message_impl('memory', function_args, tool_duration, result=function_result)}")
 								        elif function_name == "clarify":
 								            from tools.clarify_tool import clarify_tool as _clarify_tool
 								            function_result = _clarify_tool(
 								                question=function_args.get("question", ""),
 								                choices=function_args.get("choices"),
 								                callback=agent.clarify_callback,
 								            )
 								            tool_duration = time.time() - tool_start_time
 								            if agent._should_emit_quiet_tool_messages():
 								                agent._vprint(f"  {_get_cute_tool_message_impl('clarify', function_args, tool_duration, result=function_result)}")
 								        elif function_name == "delegate_task":
 								            tasks_arg = function_args.get("tasks")
 								            if tasks_arg and isinstance(tasks_arg, list):
-												fix: surface /agents nudge while delegate_task is in-flight (TUI + CLI)

The subagent spawn-observability overlay added a `(/agents)` hint, but
only on the standalone "Spawn tree" panel, gated behind `!inlineDelegateKey`
— it never showed for a single delegate_task call, and only appeared once
subagents had already registered. A nudge that arrives at the end (or only
after spawn) is useless for the actual goal: letting users open the live
monitor *while* delegation is running.

Surface it the moment delegation starts, on both surfaces:

TUI (ui-tui/src/components/thinking.tsx)
- Show `(/agents)` on any "Delegate Task" tool group as soon as it appears
  (in-flight, before any subagent registers), not gated on subagents
  already existing. Same `startsWith('Delegate Task')` predicate already
  used for delegateGroups.

CLI (agent/tool_executor.py)
- Append `· (/agents to monitor)` to the delegate spinner label, which is
  displayed for the full duration of the delegate_task call. The previous
  attempt put the hint on the completion line (get_cute_tool_message),
  which only renders after the call finishes — reverted.

TUI tsc clean (pre-existing execFileNoThrow type errors unrelated);
subagentTree 35/35; display.py reverted to upstream.

											
										
										
											2026-05-30 12:54:41 +05:30
+								                spinner_label = f"🔀 delegating {len(tasks_arg)} tasks · (/agents to monitor)"
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working. _set_interrupt likewise.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								            else:
 								                goal_preview = (function_args.get("goal") or "")[:30]
-												fix: surface /agents nudge while delegate_task is in-flight (TUI + CLI)

The subagent spawn-observability overlay added a `(/agents)` hint, but
only on the standalone "Spawn tree" panel, gated behind `!inlineDelegateKey`
— it never showed for a single delegate_task call, and only appeared once
subagents had already registered. A nudge that arrives at the end (or only
after spawn) is useless for the actual goal: letting users open the live
monitor *while* delegation is running.

Surface it the moment delegation starts, on both surfaces:

TUI (ui-tui/src/components/thinking.tsx)
- Show `(/agents)` on any "Delegate Task" tool group as soon as it appears
  (in-flight, before any subagent registers), not gated on subagents
  already existing. Same `startsWith('Delegate Task')` predicate already
  used for delegateGroups.

CLI (agent/tool_executor.py)
- Append `· (/agents to monitor)` to the delegate spinner label, which is
  displayed for the full duration of the delegate_task call. The previous
  attempt put the hint on the completion line (get_cute_tool_message),
  which only renders after the call finishes — reverted.

TUI tsc clean (pre-existing execFileNoThrow type errors unrelated);
subagentTree 35/35; display.py reverted to upstream.

											
										
										
											2026-05-30 12:54:41 +05:30
+								                spinner_label = (
 								                    f"🔀 {goal_preview} · (/agents to monitor)"
 								                    if goal_preview
 								                    else "🔀 delegating · (/agents to monitor)"
 								                )
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working. _set_interrupt likewise.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								            spinner = None
 								            if agent._should_emit_quiet_tool_messages() and agent._should_start_quiet_spinner():
 								                face = random.choice(KawaiiSpinner.get_waiting_faces())
 								                spinner = KawaiiSpinner(f"{face} {spinner_label}", spinner_type='dots', print_fn=agent._print_fn)
 								                spinner.start()
 								            agent._delegate_spinner = spinner
 								            _delegate_result = None
 								            try:
 								                function_result = agent._dispatch_delegate_task(function_args)
 								                _delegate_result = function_result
 								            finally:
 								                agent._delegate_spinner = None
 								                tool_duration = time.time() - tool_start_time
 								                cute_msg = _get_cute_tool_message_impl('delegate_task', function_args, tool_duration, result=_delegate_result)
 								                if spinner:
 								                    spinner.stop(cute_msg)
 								                elif agent._should_emit_quiet_tool_messages():
 								                    agent._vprint(f"  {cute_msg}")
 								        elif agent._context_engine_tool_names and function_name in agent._context_engine_tool_names:
 								            # Context engine tools (lcm_grep, lcm_describe, lcm_expand, etc.)
 								            spinner = None
 								            if agent._should_emit_quiet_tool_messages():
 								                face = random.choice(KawaiiSpinner.get_waiting_faces())
 								                emoji = _get_tool_emoji(function_name)
 								                preview = _build_tool_preview(function_name, function_args) or function_name
 								                spinner = KawaiiSpinner(f"{face} {emoji} {preview}", spinner_type='dots', print_fn=agent._print_fn)
 								                spinner.start()
 								            _ce_result = None
 								            try:
 								                function_result = agent.context_compressor.handle_tool_call(function_name, function_args, messages=messages)
 								                _ce_result = function_result
 								            except Exception as tool_error:
 								                function_result = json.dumps({"error": f"Context engine tool '{function_name}' failed: {tool_error}"})
 								                logger.error("context_engine.handle_tool_call raised for %s: %s", function_name, tool_error, exc_info=True)
 								            finally:
 								                tool_duration = time.time() - tool_start_time
 								                cute_msg = _get_cute_tool_message_impl(function_name, function_args, tool_duration, result=_ce_result)
 								                if spinner:
 								                    spinner.stop(cute_msg)
 								                elif agent._should_emit_quiet_tool_messages():
 								                    agent._vprint(f"  {cute_msg}")
 								        elif agent._memory_manager and agent._memory_manager.has_tool(function_name):
 								            # Memory provider tools (hindsight_retain, honcho_search, etc.)
 								            # These are not in the tool registry — route through MemoryManager.
 								            spinner = None
 								            if agent._should_emit_quiet_tool_messages() and agent._should_start_quiet_spinner():
 								                face = random.choice(KawaiiSpinner.get_waiting_faces())
 								                emoji = _get_tool_emoji(function_name)
 								                preview = _build_tool_preview(function_name, function_args) or function_name
 								                spinner = KawaiiSpinner(f"{face} {emoji} {preview}", spinner_type='dots', print_fn=agent._print_fn)
 								                spinner.start()
 								            _mem_result = None
 								            try:
 								                function_result = agent._memory_manager.handle_tool_call(function_name, function_args)
 								                _mem_result = function_result
 								            except Exception as tool_error:
 								                function_result = json.dumps({"error": f"Memory tool '{function_name}' failed: {tool_error}"})
 								                logger.error("memory_manager.handle_tool_call raised for %s: %s", function_name, tool_error, exc_info=True)
 								            finally:
 								                tool_duration = time.time() - tool_start_time
 								                cute_msg = _get_cute_tool_message_impl(function_name, function_args, tool_duration, result=_mem_result)
 								                if spinner:
 								                    spinner.stop(cute_msg)
 								                elif agent._should_emit_quiet_tool_messages():
 								                    agent._vprint(f"  {cute_msg}")
 								        elif agent.quiet_mode:
 								            spinner = None
 								            if agent._should_emit_quiet_tool_messages() and agent._should_start_quiet_spinner():
 								                face = random.choice(KawaiiSpinner.get_waiting_faces())
 								                emoji = _get_tool_emoji(function_name)
 								                preview = _build_tool_preview(function_name, function_args) or function_name
 								                spinner = KawaiiSpinner(f"{face} {emoji} {preview}", spinner_type='dots', print_fn=agent._print_fn)
 								                spinner.start()
 								            _spinner_result = None
 								            try:
 								                function_result = _ra().handle_function_call(
 								                    function_name, function_args, effective_task_id,
 								                    tool_call_id=tool_call.id,
 								                    session_id=agent.session_id or "",
-												feat(observability): observer-grade telemetry hooks + NeMo-Relay plugin

Adds backend-neutral observer hooks for plugins: session, turn, API
request, tool, approval, and subagent lifecycle events with stable
correlation IDs (session_id, task_id, turn_id, api_request_id,
tool_call_id, parent/child subagent ids). Extends VALID_HOOKS with
api_request_error and subagent_start.

Hot path is zero-cost when no plugin subscribes: has_hook()/presence
checks gate all payload construction, request payloads are returned
by reference when no middleware rewrites, and the sanitized response
payload no longer embeds raw response objects.

Bundles the optional NeMo-Relay observability plugin
(plugins/observability/nemo_relay) as an in-repo consumer of the new
hooks, peer to the existing langfuse plugin. Fails open when the
optional nemo-relay package is not installed.

Authored-by: Bryan Bednarski <bbednarski@nvidia.com>
Salvaged from #29722 onto current main.

											
										
										
											2026-06-03 17:44:13 +05:30
+								                    turn_id=getattr(agent, "_current_turn_id", "") or "",
 								                    api_request_id=getattr(agent, "_current_api_request_id", "") or "",
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working. _set_interrupt likewise.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								                    enabled_tools=list(agent.valid_tool_names) if agent.valid_tool_names else None,
 								                    skip_pre_tool_call_hook=True,
-												fix(tool-search): scope bridge catalog + dispatch to the session's toolsets

Tool Search read its catalog from the global registry (get_tool_definitions
with no toolset scope = 'start with everything'), so a restricted-toolset
session — subagent, kanban worker, curated gateway session — could:

  1. tool_search the entire process registry, not just its granted tools, and
  2. tool_call any registered plugin/MCP tool it was never given, because
     registry.dispatch() has no enabled_tools gate for non-execute_code tools.

A scoped session (enabled_toolsets=['mcp-github']) reported total_available=26
and successfully invoked an out-of-scope plugin tool via tool_call.

Fix:
- handle_function_call gains enabled_toolsets/disabled_toolsets; the bridge
  dispatch scopes get_tool_definitions to them (also stops polluting the
  process-global _last_resolved_tool_names with out-of-scope tools, which
  leaked into execute_code's sandbox-tool fallback).
- A defense-in-depth gate rejects any tool_call'd name not in the scoped
  deferrable catalog.
- tool_executor's unwrap (both concurrent + sequential paths) enforces the
  same scope before dispatch, since it unwraps tool_call -> underlying name
  and bypasses the bridge branch. New _tool_search_scoped_names() helper,
  cached per-agent on registry generation + toolset scope.
- New scoped_deferrable_names() helper in tool_search.py shared by both sites.

Tests: 4 new regression tests in TestRegression_ToolsetScoping (scoped
catalog, out-of-scope tool_call rejection, no global pollution, helper).

											
										
										
											2026-05-29 01:21:41 -07:00
+								                    enabled_toolsets=getattr(agent, "enabled_toolsets", None),
 								                    disabled_toolsets=getattr(agent, "disabled_toolsets", None),
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working. _set_interrupt likewise.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								                )
 								                _spinner_result = function_result
-												feat(observability): observer-grade telemetry hooks + NeMo-Relay plugin

Adds backend-neutral observer hooks for plugins: session, turn, API
request, tool, approval, and subagent lifecycle events with stable
correlation IDs (session_id, task_id, turn_id, api_request_id,
tool_call_id, parent/child subagent ids). Extends VALID_HOOKS with
api_request_error and subagent_start.

Hot path is zero-cost when no plugin subscribes: has_hook()/presence
checks gate all payload construction, request payloads are returned
by reference when no middleware rewrites, and the sanitized response
payload no longer embeds raw response objects.

Bundles the optional NeMo-Relay observability plugin
(plugins/observability/nemo_relay) as an in-repo consumer of the new
hooks, peer to the existing langfuse plugin. Fails open when the
optional nemo-relay package is not installed.

Authored-by: Bryan Bednarski <bbednarski@nvidia.com>
Salvaged from #29722 onto current main.

											
										
										
											2026-06-03 17:44:13 +05:30
+								            except KeyboardInterrupt:
 								                function_result = _emit_cancelled_terminal_post_tool_call(
 								                    agent,
 								                    function_name=function_name,
 								                    function_args=function_args,
 								                    effective_task_id=effective_task_id,
 								                    tool_call_id=getattr(tool_call, "id", "") or "",
 								                    start_time=tool_start_time,
 								                )
 								                _spinner_result = function_result
 								                try:
 								                    agent.interrupt("keyboard interrupt")
 								                except Exception:
 								                    pass
 								                raise
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working. _set_interrupt likewise.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								            except Exception as tool_error:
 								                function_result = f"Error executing tool '{function_name}': {tool_error}"
 								                logger.error("handle_function_call raised for %s: %s", function_name, tool_error, exc_info=True)
 								            finally:
 								                tool_duration = time.time() - tool_start_time
 								                cute_msg = _get_cute_tool_message_impl(function_name, function_args, tool_duration, result=_spinner_result)
 								                if spinner:
 								                    spinner.stop(cute_msg)
 								                elif agent._should_emit_quiet_tool_messages():
 								                    agent._vprint(f"  {cute_msg}")
 								        else:
 								            try:
 								                function_result = _ra().handle_function_call(
 								                    function_name, function_args, effective_task_id,
 								                    tool_call_id=tool_call.id,
 								                    session_id=agent.session_id or "",
-												feat(observability): observer-grade telemetry hooks + NeMo-Relay plugin

Adds backend-neutral observer hooks for plugins: session, turn, API
request, tool, approval, and subagent lifecycle events with stable
correlation IDs (session_id, task_id, turn_id, api_request_id,
tool_call_id, parent/child subagent ids). Extends VALID_HOOKS with
api_request_error and subagent_start.

Hot path is zero-cost when no plugin subscribes: has_hook()/presence
checks gate all payload construction, request payloads are returned
by reference when no middleware rewrites, and the sanitized response
payload no longer embeds raw response objects.

Bundles the optional NeMo-Relay observability plugin
(plugins/observability/nemo_relay) as an in-repo consumer of the new
hooks, peer to the existing langfuse plugin. Fails open when the
optional nemo-relay package is not installed.

Authored-by: Bryan Bednarski <bbednarski@nvidia.com>
Salvaged from #29722 onto current main.

											
										
										
											2026-06-03 17:44:13 +05:30
+								                    turn_id=getattr(agent, "_current_turn_id", "") or "",
 								                    api_request_id=getattr(agent, "_current_api_request_id", "") or "",
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working. _set_interrupt likewise.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								                    enabled_tools=list(agent.valid_tool_names) if agent.valid_tool_names else None,
 								                    skip_pre_tool_call_hook=True,
-												fix(tool-search): scope bridge catalog + dispatch to the session's toolsets

Tool Search read its catalog from the global registry (get_tool_definitions
with no toolset scope = 'start with everything'), so a restricted-toolset
session — subagent, kanban worker, curated gateway session — could:

  1. tool_search the entire process registry, not just its granted tools, and
  2. tool_call any registered plugin/MCP tool it was never given, because
     registry.dispatch() has no enabled_tools gate for non-execute_code tools.

A scoped session (enabled_toolsets=['mcp-github']) reported total_available=26
and successfully invoked an out-of-scope plugin tool via tool_call.

Fix:
- handle_function_call gains enabled_toolsets/disabled_toolsets; the bridge
  dispatch scopes get_tool_definitions to them (also stops polluting the
  process-global _last_resolved_tool_names with out-of-scope tools, which
  leaked into execute_code's sandbox-tool fallback).
- A defense-in-depth gate rejects any tool_call'd name not in the scoped
  deferrable catalog.
- tool_executor's unwrap (both concurrent + sequential paths) enforces the
  same scope before dispatch, since it unwraps tool_call -> underlying name
  and bypasses the bridge branch. New _tool_search_scoped_names() helper,
  cached per-agent on registry generation + toolset scope.
- New scoped_deferrable_names() helper in tool_search.py shared by both sites.

Tests: 4 new regression tests in TestRegression_ToolsetScoping (scoped
catalog, out-of-scope tool_call rejection, no global pollution, helper).

											
										
										
											2026-05-29 01:21:41 -07:00
+								                    enabled_toolsets=getattr(agent, "enabled_toolsets", None),
 								                    disabled_toolsets=getattr(agent, "disabled_toolsets", None),
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working. _set_interrupt likewise.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								                )
-												feat(observability): observer-grade telemetry hooks + NeMo-Relay plugin

Adds backend-neutral observer hooks for plugins: session, turn, API
request, tool, approval, and subagent lifecycle events with stable
correlation IDs (session_id, task_id, turn_id, api_request_id,
tool_call_id, parent/child subagent ids). Extends VALID_HOOKS with
api_request_error and subagent_start.

Hot path is zero-cost when no plugin subscribes: has_hook()/presence
checks gate all payload construction, request payloads are returned
by reference when no middleware rewrites, and the sanitized response
payload no longer embeds raw response objects.

Bundles the optional NeMo-Relay observability plugin
(plugins/observability/nemo_relay) as an in-repo consumer of the new
hooks, peer to the existing langfuse plugin. Fails open when the
optional nemo-relay package is not installed.

Authored-by: Bryan Bednarski <bbednarski@nvidia.com>
Salvaged from #29722 onto current main.

											
										
										
											2026-06-03 17:44:13 +05:30
+								            except KeyboardInterrupt:
 								                _emit_cancelled_terminal_post_tool_call(
 								                    agent,
 								                    function_name=function_name,
 								                    function_args=function_args,
 								                    effective_task_id=effective_task_id,
 								                    tool_call_id=getattr(tool_call, "id", "") or "",
 								                    start_time=tool_start_time,
 								                )
 								                try:
 								                    agent.interrupt("keyboard interrupt")
 								                except Exception:
 								                    pass
 								                raise
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working. _set_interrupt likewise.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								            except Exception as tool_error:
 								                function_result = f"Error executing tool '{function_name}': {tool_error}"
 								                logger.error("handle_function_call raised for %s: %s", function_name, tool_error, exc_info=True)
 								            tool_duration = time.time() - tool_start_time
 								        if isinstance(function_result, str):
 								            result_preview = function_result if agent.verbose_logging else (
 								                function_result[:200] if len(function_result) > 200 else function_result
 								            )
 								            _result_len = len(function_result)
 								        else:
 								            # Multimodal dict result (_multimodal=True) — not sliceable as string
 								            result_preview = function_result
 								            _result_len = len(str(function_result))
 								        # Log tool errors to the persistent error log so [error] tags
 								        # in the UI always have a corresponding detailed entry on disk.
 								        _is_error_result, _ = _detect_tool_failure(function_name, function_result)
-												feat(observability): observer-grade telemetry hooks + NeMo-Relay plugin

Adds backend-neutral observer hooks for plugins: session, turn, API
request, tool, approval, and subagent lifecycle events with stable
correlation IDs (session_id, task_id, turn_id, api_request_id,
tool_call_id, parent/child subagent ids). Extends VALID_HOOKS with
api_request_error and subagent_start.

Hot path is zero-cost when no plugin subscribes: has_hook()/presence
checks gate all payload construction, request payloads are returned
by reference when no middleware rewrites, and the sanitized response
payload no longer embeds raw response objects.

Bundles the optional NeMo-Relay observability plugin
(plugins/observability/nemo_relay) as an in-repo consumer of the new
hooks, peer to the existing langfuse plugin. Fails open when the
optional nemo-relay package is not installed.

Authored-by: Bryan Bednarski <bbednarski@nvidia.com>
Salvaged from #29722 onto current main.

											
										
										
											2026-06-03 17:44:13 +05:30
+								        # The agent-runtime tools above (todo, session_search, memory,
 								        # context-engine, memory-manager, clarify, delegate_task) are
 								        # dispatched inline — they never reach handle_function_call, so the
 								        # executor is the one that has to fire post_tool_call. For
 								        # registry-dispatched tools the else-branch above invoked
 								        # handle_function_call, which already fires the hook.
 								        from agent.agent_runtime_helpers import agent_runtime_owns_post_tool_hook
 								        _executor_must_emit_post_hook = (
 								            not _execution_blocked
 								            and agent_runtime_owns_post_tool_hook(agent, function_name)
 								        )
 								        if _executor_must_emit_post_hook:
 								            _emit_terminal_post_tool_call(
 								                agent,
 								                function_name=function_name,
 								                function_args=function_args,
 								                result=function_result,
 								                effective_task_id=effective_task_id,
 								                tool_call_id=getattr(tool_call, "id", "") or "",
 								                duration_ms=int(tool_duration * 1000),
 								            )
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working. _set_interrupt likewise.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								        if not _execution_blocked:
 								            function_result = agent._append_guardrail_observation(
 								                function_name,
 								                function_args,
 								                function_result,
 								                failed=_is_error_result,
 								            )
 								            result_preview = function_result if agent.verbose_logging else (
 								                function_result[:200] if len(function_result) > 200 else function_result
 								            )
 								        if _is_error_result:
 								            logger.warning("Tool %s returned error (%.2fs): %s", function_name, tool_duration, result_preview)
 								        else:
 								            logger.info("tool %s completed (%.2fs, %d chars)", function_name, tool_duration, _result_len)
 								        # Track file-mutation outcome for the turn-end verifier.  See
 								        # the concurrent path for the rationale; both paths must feed
 								        # the same state so the footer reflects every tool call in the
 								        # turn, not just the parallel ones.
 								        if not _execution_blocked:
 								            try:
 								                agent._record_file_mutation_result(
 								                    function_name, function_args, function_result, _is_error_result,
 								                )
 								            except Exception as _ver_err:
 								                logging.debug("file-mutation verifier record failed: %s", _ver_err)
 								        if not _execution_blocked and agent.tool_progress_callback:
 								            try:
 								                agent.tool_progress_callback(
 								                    "tool.completed", function_name, None, None,
 								                    duration=tool_duration, is_error=_is_error_result,
-												fix(cli): surface tool failures with specific error messages

Improves the failure suffix on tool completion lines. Instead of always
showing '[error]' for non-terminal failures, parse the tool's JSON result
and surface the actual message:

  Before:  ┊ 📖 read      foo.py  0.1s [error]
  After:   ┊ 📖 read      foo.py  0.1s [File not found: foo.py]

  Before:  ┊ 💻 $         ls bad  0.1s [exit 127]
  After:   ┊ 💻 $         ls bad  0.1s [ls: cannot access 'bad'...]

Adds a _trim_error helper that strips long absolute paths down to the
filename and caps the suffix at 48 chars so it stays readable on narrow
terminals.

Threads the tool result through the tool.completed progress callback so
agent/display.get_cute_tool_message can inspect it. The cli.py [error]
post-suffix is removed in favor of the richer suffix _detect_tool_failure
now produces directly.

Originally proposed in PR #17194 by Albert.Zhou; salvaged onto current
main with the dead-code preview-length bumps dropped (tool_preview_length
config already strictly caps previews, so the per-tool n= defaults are
unreachable).

Co-authored-by: Albert.Zhou <albert748@gmail.com>

											
										
										
											2026-05-23 20:54:17 -07:00
+								                    result=function_result,
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working. _set_interrupt likewise.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								                )
 								            except Exception as cb_err:
 								                logging.debug(f"Tool progress callback error: {cb_err}")
 								        agent._current_tool = None
 								        agent._touch_activity(f"tool completed: {function_name} ({tool_duration:.1f}s)")
 								        if agent.verbose_logging:
 								            logging.debug(f"Tool {function_name} completed in {tool_duration:.2f}s")
 								            _log_result = _multimodal_text_summary(function_result)
 								            logging.debug(f"Tool result ({len(_log_result)} chars): {_log_result}")
 								        if not _execution_blocked and agent.tool_complete_callback:
 								            try:
 								                agent.tool_complete_callback(tool_call.id, function_name, function_args, function_result)
 								            except Exception as cb_err:
 								                logging.debug(f"Tool complete callback error: {cb_err}")
 								        function_result = maybe_persist_tool_result(
 								            content=function_result,
 								            tool_name=function_name,
 								            tool_use_id=tool_call.id,
 								            env=get_active_env(effective_task_id),
 								        ) if not _is_multimodal_tool_result(function_result) else function_result
 								        # Discover subdirectory context files from tool arguments
 								        subdir_hints = agent._subdirectory_hints.check_tool_call(function_name, function_args)
 								        if subdir_hints:
 								            if _is_multimodal_tool_result(function_result):
 								                _append_subdir_hint_to_multimodal(function_result, subdir_hints)
 								            else:
 								                function_result += subdir_hints
 								        # Unwrap _multimodal dicts to an OpenAI-style content list
 								        # (see parallel path for rationale). String results pass through.
 								        _tool_content = agent._tool_result_content_for_active_model(function_name, function_result)
-												fix(agent): set tool_name on tool-result messages at construction time

Introduces make_tool_result_message() in tool_dispatch_helpers.py as the
single place where tool-result message dicts are built. All six construction
sites in tool_executor.py, agent_runtime_helpers.py, and mini_swe_runner.py
now use it, so tool_name is set in memory from the moment a message is
created rather than relying on fallback logic in the flush paths.

Fixes blank tool_name in both state.db and JSON session logs.

Adds tests.

											
										
										
											2026-05-19 20:24:30 +01:00
+								        messages.append(make_tool_result_message(function_name, _tool_content, tool_call.id))
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working. _set_interrupt likewise.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
 								        # ── Per-tool /steer drain ───────────────────────────────────
 								        # Drain pending steer BETWEEN individual tool calls so the
 								        # injection lands as soon as a tool finishes — not after the
 								        # entire batch.  The model sees it on the next API iteration.
 								        agent._apply_pending_steer_to_tool_results(messages, 1)
 								        if not agent.quiet_mode:
 								            if agent.verbose_logging:
 								                print(f"  ✅ Tool {i} completed in {tool_duration:.2f}s")
 								                print(agent._wrap_verbose("Result: ", function_result))
 								            else:
 								                _fr_str = function_result if isinstance(function_result, str) else str(function_result)
 								                response_preview = _fr_str[:agent.log_prefix_chars] + "..." if len(_fr_str) > agent.log_prefix_chars else _fr_str
 								                print(f"  ✅ Tool {i} completed in {tool_duration:.2f}s - {response_preview}")
 								        if agent._interrupt_requested and i < len(assistant_message.tool_calls):
 								            remaining = len(assistant_message.tool_calls) - i
 								            agent._vprint(f"{agent.log_prefix}⚡ Interrupt: skipping {remaining} remaining tool call(s)", force=True)
 								            for skipped_tc in assistant_message.tool_calls[i:]:
 								                skipped_name = skipped_tc.function.name
-												fix(agent): set tool_name on tool-result messages at construction time

Introduces make_tool_result_message() in tool_dispatch_helpers.py as the
single place where tool-result message dicts are built. All six construction
sites in tool_executor.py, agent_runtime_helpers.py, and mini_swe_runner.py
now use it, so tool_name is set in memory from the moment a message is
created rather than relying on fallback logic in the flush paths.

Fixes blank tool_name in both state.db and JSON session logs.

Adds tests.

											
										
										
											2026-05-19 20:24:30 +01:00
+								                messages.append(make_tool_result_message(
 								                    skipped_name,
 								                    f"[Tool execution skipped — {skipped_name} was not started. User sent a new message]",
 								                    skipped_tc.id,
 								                ))
-												refactor(run_agent): extract tool execution to agent/tool_executor.py

Move the two big tool-dispatch methods out of run_agent.py:

* execute_tool_calls_concurrent — 408-line concurrent path (interrupt
  pre-flight, guardrail+plugin block, callback fan-out, ContextVar-
  preserving ThreadPoolExecutor, periodic heartbeats for the gateway
  inactivity monitor, per-tool result handling with subdir hints +
  guardrail observations + checkpoint, /steer drain)
* execute_tool_calls_sequential — 441-line sequential path (the
  original behavior used for single-tool batches and interactive
  tools)

Both take the parent AIAgent as their first argument; AIAgent keeps
thin forwarders so call sites are unchanged. handle_function_call is
routed through _ra() so tests that patch run_agent.handle_function_call
keep working. _set_interrupt likewise.

The AST guard in test_tool_executor_contextvar_propagation.py is
updated to scan both run_agent.py AND agent/tool_executor.py so it
still catches the executor.submit(_run_tool, ...) regression
regardless of which file the body lives in.

tests/run_agent/ + tests/agent/: 4313 passed (same pre-existing
test_auxiliary_client failure as before).

run_agent.py: 14309 -> 13461 lines (-848).

											
										
										
											2026-05-16 18:24:05 -07:00
+								            break
 								        if agent.tool_delay > 0 and i < len(assistant_message.tool_calls):
 								            time.sleep(agent.tool_delay)
 								    # ── Per-turn aggregate budget enforcement ─────────────────────────
 								    num_tools_seq = len(assistant_message.tool_calls)
 								    if num_tools_seq > 0:
 								        enforce_turn_budget(messages[-num_tools_seq:], env=get_active_env(effective_task_id))
 								    # ── /steer injection ──────────────────────────────────────────────
 								    # See execute_tool_calls_concurrent for the rationale. Same hook,
 								    # applied to sequential execution as well.
 								    if num_tools_seq > 0:
 								        agent._apply_pending_steer_to_tool_results(messages, num_tools_seq)
 								# Names exported by ``from agent.tool_executor import *``: the two
 								# tool-call dispatch entry points (concurrent and sequential).
 								__all__ = [
 								    "execute_tool_calls_concurrent",
 								    "execute_tool_calls_sequential",
 								]