hermes-bsd/tests/tools/test_code_execution_windows_env.py

"""Tests for execute_code env scrubbing on Windows.

On Windows the child process needs a small set of OS-essential env vars
(SYSTEMROOT, WINDIR, COMSPEC, ...) to run.  Without SYSTEMROOT in particular,
``socket.socket(AF_INET, SOCK_STREAM)`` fails inside the sandbox with
WinError 10106 (Winsock can't locate mswsock.dll) and no tool call over
loopback TCP can ever succeed.

These tests cover ``_scrub_child_env`` directly so they run on every OS
— the logic is conditional on a passed-in ``is_windows`` flag, not on
the host platform.  We also keep a live Winsock smoke test that only runs
on a real Windows host.

Also covers the companion Windows bug: the sandbox writes
``hermes_tools.py`` and ``script.py`` into a temp dir, and those files
must be written as UTF-8 on every platform — the generated stub contains
em-dash/en-dash characters in docstrings, and the default ``open(path, "w")``
on Windows uses the system locale (cp1252 typically), corrupting those
bytes.  The child then fails to import with a SyntaxError:
``'utf-8' codec can't decode byte 0x97``.
"""

import os
import subprocess
import sys
import textwrap

import pytest

from tools.code_execution_tool import (
    _SECRET_SUBSTRINGS,
    _WINDOWS_ESSENTIAL_ENV_VARS,
    _scrub_child_env,
)


def _no_passthrough(_name):
    return False


class TestWindowsEssentialAllowlist:
    """The allowlist itself — contents, shape, and invariants."""

    def test_contains_winsock_required_vars(self):
        # Without SYSTEMROOT the child cannot initialize Winsock.
        assert "SYSTEMROOT" in _WINDOWS_ESSENTIAL_ENV_VARS

    def test_contains_subprocess_required_vars(self):
        # Without COMSPEC, subprocess can't resolve the default shell.
        assert "COMSPEC" in _WINDOWS_ESSENTIAL_ENV_VARS

    def test_contains_user_profile_vars(self):
        # os.path.expanduser("~") on Windows uses USERPROFILE.
        assert "USERPROFILE" in _WINDOWS_ESSENTIAL_ENV_VARS
        assert "APPDATA" in _WINDOWS_ESSENTIAL_ENV_VARS
        assert "LOCALAPPDATA" in _WINDOWS_ESSENTIAL_ENV_VARS

    def test_contains_only_uppercase_names(self):
        # Windows env var names are case-insensitive but we canonicalize to
        # uppercase for the membership check (``k.upper() in _WINDOWS_...``).
        for name in _WINDOWS_ESSENTIAL_ENV_VARS:
            assert name == name.upper(), f"{name!r} should be uppercase"

    def test_no_overlap_with_secret_substrings(self):
        # Sanity: none of the essential OS vars should look like secrets.
        # If this ever fires, we'd have a precedence ordering bug (secrets
        # are blocked *before* the essentials check).
        for name in _WINDOWS_ESSENTIAL_ENV_VARS:
            assert not any(s in name for s in _SECRET_SUBSTRINGS), (
                f"{name!r} looks secret-like — would be blocked before the "
                "essentials allowlist can match"
            )


class TestScrubChildEnvWindows:
    """Verify _scrub_child_env passes Windows essentials through when
    is_windows=True and blocks them when is_windows=False (so POSIX hosts
    don't inherit pointless Windows vars)."""

    def _sample_windows_env(self):
        """A realistic subset of what os.environ looks like on Windows."""
        return {
            "SYSTEMROOT": r"C:\Windows",
            "SystemDrive": "C:",        # Windows preserves native case
            "WINDIR": r"C:\Windows",
            "ComSpec": r"C:\Windows\System32\cmd.exe",
            "PATHEXT": ".COM;.EXE;.BAT;.CMD;.PY",
            "USERPROFILE": r"C:\Users\alice",
            "APPDATA": r"C:\Users\alice\AppData\Roaming",
            "LOCALAPPDATA": r"C:\Users\alice\AppData\Local",
            "PATH": r"C:\Windows\System32;C:\Python311",
            "HOME": r"C:\Users\alice",
            "TEMP": r"C:\Users\alice\AppData\Local\Temp",
            # Should still be blocked:
            "OPENAI_API_KEY": "sk-secret",
            "GITHUB_TOKEN": "ghp_secret",
            "MY_PASSWORD": "hunter2",
            # Not matched by any rule — should be dropped on both OSes:
            "RANDOM_UNKNOWN_VAR": "value",
        }

    def test_windows_essentials_passed_through_when_is_windows_true(self):
        env = self._sample_windows_env()
        scrubbed = _scrub_child_env(env,
                                    is_passthrough=_no_passthrough,
                                    is_windows=True)

        # Every essential var from the sample env should survive.
        assert scrubbed["SYSTEMROOT"] == r"C:\Windows"
        assert scrubbed["SystemDrive"] == "C:"  # case preserved
        assert scrubbed["WINDIR"] == r"C:\Windows"
        assert scrubbed["ComSpec"] == r"C:\Windows\System32\cmd.exe"
        assert scrubbed["PATHEXT"] == ".COM;.EXE;.BAT;.CMD;.PY"
        assert scrubbed["USERPROFILE"] == r"C:\Users\alice"
        assert scrubbed["APPDATA"].endswith("Roaming")
        assert scrubbed["LOCALAPPDATA"].endswith("Local")

        # Safe-prefix vars still pass (baseline behavior).
        assert "PATH" in scrubbed
        assert "HOME" in scrubbed
        assert "TEMP" in scrubbed

    def test_secrets_still_blocked_on_windows(self):
        """The Windows allowlist must NOT defeat the secret-substring block.

        This is the key security invariant: essentials are allowed by
        *exact name*, and the secret-substring block runs before the
        essentials check anyway, so a variable named e.g. ``API_KEY`` can
        never sneak through just because we added Windows support.
        """
        env = self._sample_windows_env()
        scrubbed = _scrub_child_env(env,
                                    is_passthrough=_no_passthrough,
                                    is_windows=True)
        assert "OPENAI_API_KEY" not in scrubbed
        assert "GITHUB_TOKEN" not in scrubbed
        assert "MY_PASSWORD" not in scrubbed

    def test_unknown_vars_still_dropped_on_windows(self):
        env = self._sample_windows_env()
        scrubbed = _scrub_child_env(env,
                                    is_passthrough=_no_passthrough,
                                    is_windows=True)
        assert "RANDOM_UNKNOWN_VAR" not in scrubbed

    def test_essentials_blocked_when_is_windows_false(self):
        """On POSIX hosts, Windows-specific vars should not pass — they
        have no meaning and could confuse child tooling."""
        env = self._sample_windows_env()
        scrubbed = _scrub_child_env(env,
                                    is_passthrough=_no_passthrough,
                                    is_windows=False)
        # Safe prefixes still match (PATH, HOME, TEMP).
        assert "PATH" in scrubbed
        assert "HOME" in scrubbed
        assert "TEMP" in scrubbed
        # But Windows OS vars should be dropped.
        assert "SYSTEMROOT" not in scrubbed
        assert "WINDIR" not in scrubbed
        assert "ComSpec" not in scrubbed
        assert "APPDATA" not in scrubbed

    def test_case_insensitive_essential_match(self):
        """Windows env var names are case-insensitive at the OS level but
        Python preserves whatever case os.environ reported.  The scrubber
        must normalize to uppercase for the membership check."""
        env = {
            "SystemRoot": r"C:\Windows",       # mixed case
            "comspec": r"C:\Windows\System32\cmd.exe",  # lowercase
            "APPDATA": r"C:\Users\x\AppData\Roaming",   # uppercase
        }
        scrubbed = _scrub_child_env(env,
                                    is_passthrough=_no_passthrough,
                                    is_windows=True)
        assert "SystemRoot" in scrubbed
        assert "comspec" in scrubbed
        assert "APPDATA" in scrubbed


class TestScrubChildEnvPassthroughInteraction:
    """The passthrough hook runs *before* the secret block, so a skill
    can legitimately forward a third-party API key.  The Windows
    essentials addition must not interfere with that."""

    def test_passthrough_wins_over_secret_block(self):
        env = {"TENOR_API_KEY": "x", "PATH": "/bin"}
        scrubbed = _scrub_child_env(env,
                                    is_passthrough=lambda k: k == "TENOR_API_KEY",
                                    is_windows=False)
        assert scrubbed.get("TENOR_API_KEY") == "x"
        assert scrubbed.get("PATH") == "/bin"

    def test_passthrough_still_works_on_windows(self):
        env = {
            "TENOR_API_KEY": "x",
            "SYSTEMROOT": r"C:\Windows",
            "OPENAI_API_KEY": "sk-secret",  # not passthrough
        }
        scrubbed = _scrub_child_env(
            env,
            is_passthrough=lambda k: k == "TENOR_API_KEY",
            is_windows=True,
        )
        assert scrubbed.get("TENOR_API_KEY") == "x"
        assert scrubbed.get("SYSTEMROOT") == r"C:\Windows"
        assert "OPENAI_API_KEY" not in scrubbed


@pytest.mark.skipif(
    sys.platform != "win32",
    reason="Winsock-specific regression — only meaningful on Windows",
)
class TestWindowsSocketSmokeTest:
    """Integration-ish smoke test: spawn a child Python with a scrubbed
    env and confirm it can create an AF_INET socket.  This is the
    regression that motivated the fix — without SYSTEMROOT the child
    hits WinError 10106 before any RPC is attempted."""

    def test_child_can_create_socket_with_scrubbed_env(self):
        scrubbed = _scrub_child_env(os.environ, is_passthrough=_no_passthrough)

        # Build a tiny child script that simply opens an AF_INET socket.
        script = textwrap.dedent("""
            import socket, sys
            try:
                s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                s.close()
                print("OK")
                sys.exit(0)
            except OSError as exc:
                print(f"FAIL: {exc}")
                sys.exit(1)
        """).strip()

        result = subprocess.run(
            [sys.executable, "-c", script],
            env=scrubbed,
            capture_output=True,
            text=True,
            timeout=15,
        )
        assert result.returncode == 0, (
            f"Child failed to create socket with scrubbed env:\n"
            f"  stdout={result.stdout!r}\n"
            f"  stderr={result.stderr!r}\n"
            f"  scrubbed keys={sorted(scrubbed.keys())}"
        )
        assert "OK" in result.stdout


# ---------------------------------------------------------------------------
# POSIX equivalence guard
# ---------------------------------------------------------------------------

def _legacy_posix_scrubber(source_env, is_passthrough):
    """Independent oracle for TestPosixEquivalence — a from-scratch reimpl of
    _scrub_child_env's POSIX behavior, used to prove the production helper does
    what we think it does.

    Deliberately updated for #27303 (the broad ``HERMES_`` prefix was dropped
    in favor of an explicit operational allowlist, and DSN/WEBHOOK were added
    to the secret substrings).  The original docstring said: if POSIX behavior
    legitimately needs to evolve, adjust this oracle on purpose so the churn is
    visible in review — that is what this change is.
    """
    _SAFE_ENV_PREFIXES = ("PATH", "HOME", "USER", "LANG", "LC_", "TERM",
                          "TMPDIR", "TMP", "TEMP", "SHELL", "LOGNAME",
                          "XDG_", "PYTHONPATH", "VIRTUAL_ENV", "CONDA")
    _SECRET_SUBSTRINGS = ("KEY", "TOKEN", "SECRET", "PASSWORD", "CREDENTIAL",
                          "PASSWD", "AUTH", "DSN", "WEBHOOK")
    _HERMES_CHILD_ALLOWED = frozenset({
        "HERMES_HOME", "HERMES_PROFILE", "HERMES_CONFIG", "HERMES_ENV",
    })
    out = {}
    for k, v in source_env.items():
        if is_passthrough(k):
            out[k] = v
            continue
        if any(s in k.upper() for s in _SECRET_SUBSTRINGS):
            continue
        if any(k.startswith(p) for p in _SAFE_ENV_PREFIXES):
            out[k] = v
            continue
        if k in _HERMES_CHILD_ALLOWED:
            out[k] = v
    return out


class TestPosixEquivalence:
    """Lock in the invariant that _scrub_child_env(env, is_windows=False)
    behaves *bit-for-bit identically* to the pre-refactor inline scrubber.

    If this ever fails, it means somebody changed POSIX env-scrubbing
    behavior — maybe on purpose, maybe not.  Either way it should land
    as a deliberate, reviewed change (update _legacy_posix_scrubber
    above in the same PR).

    Rationale: the Windows-essentials patch refactored the scrubber into
    a helper.  Linux/macOS must not regress.  This class gates that.
    """

    _POSIX_SYNTHETIC_ENV = {
        # Safe-prefix matches
        "PATH": "/usr/bin:/bin",
        "HOME": "/home/alice",
        "USER": "alice",
        "LANG": "en_US.UTF-8",
        "LC_CTYPE": "en_US.UTF-8",
        "TERM": "xterm-256color",
        "SHELL": "/bin/zsh",
        "LOGNAME": "alice",
        "TMPDIR": "/tmp",
        "XDG_RUNTIME_DIR": "/run/user/1000",
        "XDG_CONFIG_HOME": "/home/alice/.config",
        "PYTHONPATH": "/opt/lib",
        "VIRTUAL_ENV": "/home/alice/.venv",
        "CONDA_PREFIX": "/opt/conda",
        # HERMES_* handling (#27303): only the operational allowlist passes;
        # every other HERMES_* is dropped (the broad prefix was removed).
        "HERMES_HOME": "/home/alice/.hermes",        # allowlisted → kept
        "HERMES_PROFILE": "default",                 # allowlisted → kept
        "HERMES_INTERACTIVE": "1",                   # not allowlisted → dropped
        "HERMES_BASE_URL": "https://api.internal",   # not allowlisted → dropped
        "HERMES_KANBAN_DB": "postgres://u:p@h/db",   # not allowlisted → dropped
        # Secret-substring blocks
        "OPENAI_API_KEY": "sk-xxx",
        "GITHUB_TOKEN": "ghp_xxx",
        "AWS_SECRET_ACCESS_KEY": "yyy",
        "MY_PASSWORD": "hunter2",
        "SENTRY_DSN": "https://abc@sentry.io/1",     # DSN substring → blocked
        "SLACK_WEBHOOK": "https://hooks.slack/x",    # WEBHOOK substring → blocked
        # Uncategorized — must be dropped
        "RANDOM_UNKNOWN": "drop-me",
        "DISPLAY": ":0",
        "SSH_AUTH_SOCK": "/run/user/1000/ssh-agent",
        # Passthrough candidate (also matches secret block by default)
        "TENOR_API_KEY": "tenor-xxx",
    }

    _WINDOWS_SYNTHETIC_ENV = {
        # Windows-essential names (must be dropped on POSIX, passed on Win)
        "SYSTEMROOT": r"C:\Windows",
        "SystemDrive": "C:",
        "WINDIR": r"C:\Windows",
        "ComSpec": r"C:\Windows\System32\cmd.exe",
        "PATHEXT": ".COM;.EXE;.BAT",
        "USERPROFILE": r"C:\Users\alice",
        "APPDATA": r"C:\Users\alice\AppData\Roaming",
        "LOCALAPPDATA": r"C:\Users\alice\AppData\Local",
        # Safe-prefix matches (cross-platform)
        "PATH": r"C:\Python311;C:\Windows\System32",
        "HOME": r"C:\Users\alice",
        "TEMP": r"C:\Users\alice\AppData\Local\Temp",
        # Secret-looking (always blocked)
        "OPENAI_API_KEY": "sk-xxx",
        "GITHUB_TOKEN": "ghp_xxx",
    }

    @pytest.mark.parametrize("env_name,env", [
        ("posix_synthetic", _POSIX_SYNTHETIC_ENV),
        ("windows_synthetic_on_posix", _WINDOWS_SYNTHETIC_ENV),
    ])
    @pytest.mark.parametrize("pt_name,pt", [
        ("no_passthrough", lambda _: False),
        ("tenor_passthrough", lambda k: k == "TENOR_API_KEY"),
        ("all_passthrough", lambda _: True),
    ])
    def test_posix_behavior_unchanged(self, env_name, env, pt_name, pt):
        """For every combination of (env shape × passthrough rule), the
        new helper with is_windows=False must produce the exact same dict
        as the legacy inline scrubber.

        We parametrize over three passthrough rules to cover the full
        surface: no passthrough, single-var passthrough (the common
        skill-registered case), and everything-passes (edge case that
        could expose precedence bugs)."""
        expected = _legacy_posix_scrubber(env, pt)
        actual = _scrub_child_env(env, is_passthrough=pt, is_windows=False)
        assert actual == expected, (
            f"POSIX behavior regressed for env={env_name}, passthrough={pt_name}\n"
            f"  only in legacy: {sorted(set(expected) - set(actual))}\n"
            f"  only in new:    {sorted(set(actual) - set(expected))}\n"
            f"  value diffs:    {[k for k in expected if k in actual and expected[k] != actual[k]]}"
        )

    def test_posix_behavior_unchanged_on_real_os_environ(self):
        """Bonus check against the actual os.environ of the host running
        the test.  This covers vars we might not have thought to put in
        the synthetic fixtures."""
        expected = _legacy_posix_scrubber(os.environ, lambda _: False)
        actual = _scrub_child_env(os.environ,
                                  is_passthrough=lambda _: False,
                                  is_windows=False)
        assert actual == expected, (
            "POSIX-mode scrubber diverged from legacy behavior on real "
            f"os.environ (host platform={sys.platform})"
        )

    def test_windows_mode_is_strict_superset_of_posix_mode(self):
        """Correctness check on the NEW behavior: is_windows=True must
        keep everything POSIX mode keeps, and *may* add Windows
        essentials.  It must never drop a var that POSIX mode would keep
        — if it did, we'd have broken same-host reuse of the scrubber."""
        env = {**self._POSIX_SYNTHETIC_ENV, **self._WINDOWS_SYNTHETIC_ENV}
        posix_result = _scrub_child_env(env,
                                        is_passthrough=lambda _: False,
                                        is_windows=False)
        windows_result = _scrub_child_env(env,
                                          is_passthrough=lambda _: False,
                                          is_windows=True)
        missing = set(posix_result) - set(windows_result)
        assert not missing, (
            f"is_windows=True dropped vars that is_windows=False kept: {missing}"
        )
        # And any extras must come from the Windows essentials allowlist.
        extras = set(windows_result) - set(posix_result)
        for k in extras:
            assert k.upper() in _WINDOWS_ESSENTIAL_ENV_VARS, (
                f"Unexpected extra var in windows-mode output: {k} "
                f"(not in _WINDOWS_ESSENTIAL_ENV_VARS)"
            )


# ---------------------------------------------------------------------------
# UTF-8 file-write regression test
# ---------------------------------------------------------------------------
#
# The sandbox writes two Python files into a temp dir — the generated
# ``hermes_tools.py`` stub, and the LLM's ``script.py``.  Both contain
# non-ASCII characters in practice: the stub has em-dashes in docstrings
# ("``tcp://host:port`` — the parent falls back..."), and user scripts
# routinely contain non-ASCII strings, comments, or Unicode identifiers.
#
# On Windows, ``open(path, "w")`` without encoding= uses the system locale
# (cp1252 on US/UK installs), which cannot encode em-dashes.  Python then
# tries to decode the file as UTF-8 when importing it (PEP 3120), fails,
# and the sandbox aborts with:
#
#     SyntaxError: (unicode error) 'utf-8' codec can't decode byte 0x97
#                  in position N: invalid start byte
#
# This was the *second* Windows-specific bug (WinError 10106 was the first).
# The fix is to always pass ``encoding="utf-8"`` when writing Python source.


class TestSandboxWritesUtf8:
    """Verify the file-write call sites use UTF-8 explicitly, not the
    platform default.  We check the source of ``execute_code`` rather
    than spawning a real sandbox because the latter needs a full agent
    context — but the code inspection is deterministic and fast."""

    def test_stub_and_script_writes_specify_utf8(self):
        """Both ``hermes_tools.py`` and ``script.py`` writes in
        ``_execute_local`` must pass ``encoding="utf-8"``."""
        import tools.code_execution_tool as cet
        src = open(cet.__file__, encoding="utf-8").read()

        # There should be no ``open(path, "w")`` without encoding= for
        # the two staging files.  Grep-style check: find every write of
        # a .py file inside tmpdir and assert the line also contains
        # ``encoding="utf-8"`` within a short window.
        import re
        pattern = re.compile(
            r'open\(\s*os\.path\.join\(\s*tmpdir\s*,\s*"[^"]+\.py"\s*\)\s*,\s*"w"[^)]*\)'
        )
        for match in pattern.finditer(src):
            line = match.group(0)
            assert 'encoding="utf-8"' in line or "encoding='utf-8'" in line, (
                f"Sandbox file write missing encoding=\"utf-8\" on Windows: {line!r}"
            )

    def test_file_rpc_stub_uses_utf8(self):
        """The file-based RPC transport stub (used by remote backends)
        reads/writes JSON response files.  Those must also specify UTF-8
        so non-ASCII tool results survive the round-trip intact."""
        from tools.code_execution_tool import generate_hermes_tools_module
        stub = generate_hermes_tools_module(["terminal"], transport="file")
        # The generated stub should open response + request files as UTF-8.
        assert 'encoding="utf-8"' in stub, (
            "File-based RPC stub does not specify encoding=\"utf-8\" — "
            "will corrupt non-ASCII tool results on non-UTF-8 locales."
        )

    def test_stub_source_roundtrips_through_utf8(self):
        """Concrete regression: write the generated stub to a temp file
        using ``encoding="utf-8"``, then parse it.  This is what the
        sandbox does, and it must succeed even when the stub contains
        em-dashes (which it does — check the transport-header docstring).
        """
        from tools.code_execution_tool import generate_hermes_tools_module
        import tempfile, ast
        stub = generate_hermes_tools_module(
            ["terminal", "read_file", "write_file"], transport="uds"
        )
        # Sanity: stub actually contains a non-ASCII character, otherwise
        # this test wouldn't prove anything meaningful.
        non_ascii = [c for c in stub if ord(c) > 127]
        assert non_ascii, (
            "Generated stub is pure ASCII — test is meaningless.  If the "
            "stub's docstrings have lost their em-dashes, update this "
            "assertion, but be aware the original regression is no longer "
            "covered."
        )

        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".py", delete=False, encoding="utf-8"
        ) as f:
            f.write(stub)
            tmp_path = f.name

        try:
            # Re-read and parse exactly like the child Python would.
            with open(tmp_path, encoding="utf-8") as fh:
                round_tripped = fh.read()
            assert round_tripped == stub, "UTF-8 round-trip corrupted the stub"
            ast.parse(round_tripped)  # must not raise SyntaxError
        finally:
            os.unlink(tmp_path)

    @pytest.mark.skipif(
        sys.platform != "win32",
        reason="cp1252 default-encoding regression is Windows-specific",
    )
    def test_windows_default_encoding_would_have_failed(self):
        """Negative control: prove that on Windows, writing the stub
        *without* ``encoding="utf-8"`` would corrupt the file.  If this
        test ever starts failing (i.e. default write succeeds), it means
        Python's default encoding has changed and the explicit UTF-8
        requirement may be obsolete — reconsider the fix."""
        from tools.code_execution_tool import generate_hermes_tools_module
        import tempfile

        stub = generate_hermes_tools_module(["terminal"], transport="uds")
        # Find a non-ASCII character we can use to prove the corruption.
        non_ascii = [c for c in stub if ord(c) > 127]
        if not non_ascii:
            pytest.skip("stub has no non-ASCII chars — nothing to corrupt")

        # Write with default encoding (simulating the old buggy code).
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".py", delete=False
        ) as f:
            try:
                f.write(stub)
                tmp_path = f.name
                wrote_successfully = True
            except UnicodeEncodeError:
                # Default encoding can't even encode it — that's the bug
                # in a different form.  Still proves the point.
                tmp_path = f.name
                wrote_successfully = False

        try:
            if not wrote_successfully:
                # Default-encoding write raised outright.  The bug is real.
                return

            # Read back as UTF-8 (what Python does on import).
            with open(tmp_path, encoding="utf-8") as fh:
                try:
                    fh.read()
                    # If this succeeds on Windows, the platform default is
                    # already UTF-8 (e.g. Python 3.15 with UTF-8 mode on).
                    # In that case the explicit encoding= is belt-and-
                    # suspenders but no longer strictly required.  Skip.
                    pytest.skip(
                        "Default text-file encoding is UTF-8-compatible on "
                        "this Windows build — explicit encoding= is no "
                        "longer load-bearing, but keep it for belt-and-"
                        "suspenders."
                    )
                except UnicodeDecodeError:
                    # Exactly the failure mode that motivated the fix.
                    pass
        finally:
            os.unlink(tmp_path)


# ---------------------------------------------------------------------------
# UTF-8 stdio regression test
# ---------------------------------------------------------------------------
#
# The third Windows-specific sandbox bug: after the UTF-8 file-write fix
# let the child import hermes_tools, a user script that printed non-ASCII
# to stdout still crashed with:
#
#     UnicodeEncodeError: 'charmap' codec can't encode character '\u2192'
#                         in position N: character maps to <undefined>
#
# Python's sys.stdout on Windows is bound to the console code page
# (cp1252 on US-locale installs) when the process is attached to a pipe
# without PYTHONIOENCODING set.  LLM-generated scripts routinely print
# em-dashes, arrows, accented chars, emoji — all of which break.
#
# Fix: spawn the child with PYTHONIOENCODING=utf-8 and PYTHONUTF8=1.
# The latter also makes open()'s default encoding UTF-8 (PEP 540),
# belt-and-suspenders for user scripts that do their own file I/O.


class TestChildStdioIsUtf8:
    """Verify the sandbox child is spawned with UTF-8 stdio encoding,
    so LLM scripts can print non-ASCII without crashing on Windows."""

    def test_popen_env_sets_pythonioencoding_utf8(self):
        """Source-level check: the Popen call site must set
        PYTHONIOENCODING=utf-8 in child_env."""
        import tools.code_execution_tool as cet
        src = open(cet.__file__, encoding="utf-8").read()
        assert 'child_env["PYTHONIOENCODING"] = "utf-8"' in src, (
            "PYTHONIOENCODING=utf-8 missing from child env — Windows "
            "scripts that print non-ASCII will crash with "
            "UnicodeEncodeError."
        )

    def test_popen_env_sets_pythonutf8_mode(self):
        """Source-level check: PYTHONUTF8=1 must be set too — it makes
        open()'s default encoding UTF-8 in user-written file I/O."""
        import tools.code_execution_tool as cet
        src = open(cet.__file__, encoding="utf-8").read()
        assert 'child_env["PYTHONUTF8"] = "1"' in src, (
            "PYTHONUTF8=1 missing from child env — user scripts that "
            "call open(path, 'w') without encoding= will produce "
            "locale-encoded files on Windows."
        )

    def test_live_child_can_print_non_ascii(self):
        """Live regression: spawn a Python child with the same env
        treatment the sandbox uses (PYTHONIOENCODING=utf-8 + PYTHONUTF8=1)
        and verify it can print em-dashes, arrows, and emoji to stdout
        without crashing.  This is the exact scenario that broke in live
        usage.

        Runs on every OS — on POSIX the fix is belt-and-suspenders but
        still load-bearing for C.ASCII locale environments.
        """
        script = textwrap.dedent("""
            import sys
            # Mix of chars that cp1252 can't encode: arrow, emoji.
            print("em-dash \\u2014 arrow \\u2192 emoji \\U0001f680")
            sys.exit(0)
        """).strip()

        # Build a scrubbed env the same way the sandbox does, then apply
        # the stdio overrides.
        scrubbed = _scrub_child_env(os.environ, is_passthrough=_no_passthrough)
        scrubbed["PYTHONIOENCODING"] = "utf-8"
        scrubbed["PYTHONUTF8"] = "1"

        result = subprocess.run(
            [sys.executable, "-c", script],
            env=scrubbed,
            capture_output=True,
            timeout=15,
            # Don't decode at the subprocess boundary — we want to check
            # the raw bytes match UTF-8, same as what the sandbox does.
        )
        assert result.returncode == 0, (
            f"Child crashed printing non-ASCII:\n"
            f"  stdout (raw): {result.stdout!r}\n"
            f"  stderr (raw): {result.stderr!r}"
        )
        decoded = result.stdout.decode("utf-8")
        assert "\u2014" in decoded, f"em-dash missing from output: {decoded!r}"
        assert "\u2192" in decoded, f"arrow missing from output: {decoded!r}"
        assert "\U0001f680" in decoded, f"emoji missing from output: {decoded!r}"

    @pytest.mark.skipif(
        sys.platform != "win32",
        reason="cp1252 stdout default is Windows-specific",
    )
    def test_windows_child_without_utf8_env_would_fail(self):
        """Negative control: spawn a Python child *without* our env
        overrides and prove that on Windows, printing non-ASCII fails.
        If this ever starts passing, Python has changed its default
        stdio encoding on Windows and the fix may be obsolete — but
        keep the env vars anyway for belt-and-suspenders."""
        script = textwrap.dedent("""
            import sys
            print("em-dash \\u2014 arrow \\u2192")
            sys.exit(0)
        """).strip()

        # Scrubbed env WITHOUT the PYTHONIOENCODING / PYTHONUTF8 overrides.
        # Also scrub PYTHONUTF8 and PYTHONIOENCODING from the inherited
        # env so we reproduce the buggy state even if the parent test
        # runner has them set.
        scrubbed = _scrub_child_env(os.environ, is_passthrough=_no_passthrough)
        for k in ("PYTHONIOENCODING", "PYTHONUTF8", "PYTHONLEGACYWINDOWSSTDIO"):
            scrubbed.pop(k, None)

        result = subprocess.run(
            [sys.executable, "-c", script],
            env=scrubbed,
            capture_output=True,
            text=False,
            timeout=15,
        )
        # Either the child crashed (expected), or modern Python handled
        # it anyway — in which case the fix is still defensive but no
        # longer strictly required.  Skip with a note if so.
        if result.returncode == 0 and b"\xe2\x80\x94" in result.stdout:
            pytest.skip(
                "This Python/Windows build handles non-ASCII stdout even "
                "without PYTHONIOENCODING/PYTHONUTF8 — fix is defensive "
                "but no longer strictly load-bearing.  Keep the env vars "
                "for older Python builds and C.ASCII-locale containers."
            )
        # Otherwise: crash OR garbled output — both count as proving the
        # bug is real on this system.
-												execute_code: pass through Windows OS-essential env vars

The sandbox's env scrubbing was dropping SYSTEMROOT, WINDIR, COMSPEC,
APPDATA, etc. On Windows this broke the child process before any RPC
could happen:

    OSError: [WinError 10106] The requested service provider could not
    be loaded or initialized

Python's socket module uses SYSTEMROOT to locate mswsock.dll during
Winsock initialization. Without it, socket.socket(AF_INET, SOCK_STREAM)
fails — and the existing loopback-TCP fallback for Windows couldn't work.

Fix: add a small Windows-only allowlist (_WINDOWS_ESSENTIAL_ENV_VARS)
matched by exact uppercase name, after the existing secret-substring
block. The secret block still runs first, so the allowlist cannot be
used to exfiltrate credentials. Also extract the env scrubber into a
testable helper (_scrub_child_env) that takes is_windows as a parameter,
so the logic can be unit-tested on any OS.

Live Winsock smoke test verifies that a child spawned with the scrubbed
env can now create an AF_INET socket on a real Windows host; the test
is guarded by sys.platform == 'win32' so POSIX CI stays green.

											
										
										
											2026-05-07 18:39:38 -07:00
+								"""Tests for execute_code env scrubbing on Windows.
 								On Windows the child process needs a small set of OS-essential env vars
 								(SYSTEMROOT, WINDIR, COMSPEC, ...) to run.  Without SYSTEMROOT in particular,
 								``socket.socket(AF_INET, SOCK_STREAM)`` fails inside the sandbox with
 								WinError 10106 (Winsock can't locate mswsock.dll) and no tool call over
 								loopback TCP can ever succeed.
 								These tests cover ``_scrub_child_env`` directly so they run on every OS
 								— the logic is conditional on a passed-in ``is_windows`` flag, not on
 								the host platform.  We also keep a live Winsock smoke test that only runs
 								on a real Windows host.
-												execute_code: write sandbox files as UTF-8 on Windows

Second Windows-specific sandbox bug (WinError 10106 was the first):
after the env-scrub fix let the child start, it immediately failed to
import hermes_tools with:

    SyntaxError: (unicode error) 'utf-8' codec can't decode byte 0x97
                 in position 154: invalid start byte

Root cause: _execute_local wrote the generated hermes_tools.py stub and
the user's script.py via open(path, 'w') without encoding=.  On Windows
the default text-mode encoding is cp1252 (system locale), which encodes
em-dashes (used in the stub's docstrings) as 0x97.  Python then decodes
source files as UTF-8 (PEP 3120) on import, chokes on 0x97, and the
sandbox dies before any tool call.

Fix: pass encoding='utf-8' to all four file opens in the code_execution
path — the two staging writes in _execute_local (hermes_tools.py +
script.py) and the two RPC file-transport reads/writes in the generated
remote stub.  JSON is ASCII-safe for most payloads but tool results
(terminal output, web_extract content) routinely carry non-ASCII.

Tests added (4):
  - test_stub_and_script_writes_specify_utf8 — source grep guard
  - test_file_rpc_stub_uses_utf8 — generated remote stub check
  - test_stub_source_roundtrips_through_utf8 — concrete round-trip
  - test_windows_default_encoding_would_have_failed — negative control
    (skips on modern Python builds where default is already UTF-8
    compatible, but retained for platforms where the regression could
    return)

24/25 tests pass on Windows 3.11 (negative control skips because this
Python build handles em-dashes via cp1252 subset — the fix is still
correct, just the corruption path isn't always triggerable).

											
										
										
											2026-05-07 18:52:59 -07:00
 								Also covers the companion Windows bug: the sandbox writes
 								``hermes_tools.py`` and ``script.py`` into a temp dir, and those files
 								must be written as UTF-8 on every platform — the generated stub contains
 								em-dash/en-dash characters in docstrings, and the default ``open(path, "w")``
 								on Windows uses the system locale (cp1252 typically), corrupting those
 								bytes.  The child then fails to import with a SyntaxError:
 								``'utf-8' codec can't decode byte 0x97``.
-												execute_code: pass through Windows OS-essential env vars

The sandbox's env scrubbing was dropping SYSTEMROOT, WINDIR, COMSPEC,
APPDATA, etc. On Windows this broke the child process before any RPC
could happen:

    OSError: [WinError 10106] The requested service provider could not
    be loaded or initialized

Python's socket module uses SYSTEMROOT to locate mswsock.dll during
Winsock initialization. Without it, socket.socket(AF_INET, SOCK_STREAM)
fails — and the existing loopback-TCP fallback for Windows couldn't work.

Fix: add a small Windows-only allowlist (_WINDOWS_ESSENTIAL_ENV_VARS)
matched by exact uppercase name, after the existing secret-substring
block. The secret block still runs first, so the allowlist cannot be
used to exfiltrate credentials. Also extract the env scrubber into a
testable helper (_scrub_child_env) that takes is_windows as a parameter,
so the logic can be unit-tested on any OS.

Live Winsock smoke test verifies that a child spawned with the scrubbed
env can now create an AF_INET socket on a real Windows host; the test
is guarded by sys.platform == 'win32' so POSIX CI stays green.

											
										
										
											2026-05-07 18:39:38 -07:00
+								"""
 								import os
 								import subprocess
 								import sys
 								import textwrap
 								import pytest
 								from tools.code_execution_tool import (
 								    _SECRET_SUBSTRINGS,
 								    _WINDOWS_ESSENTIAL_ENV_VARS,
 								    _scrub_child_env,
 								)
 								def _no_passthrough(_name):
 								    return False
 								class TestWindowsEssentialAllowlist:
 								    """The allowlist itself — contents, shape, and invariants."""
 								    def test_contains_winsock_required_vars(self):
 								        # Without SYSTEMROOT the child cannot initialize Winsock.
 								        assert "SYSTEMROOT" in _WINDOWS_ESSENTIAL_ENV_VARS
 								    def test_contains_subprocess_required_vars(self):
 								        # Without COMSPEC, subprocess can't resolve the default shell.
 								        assert "COMSPEC" in _WINDOWS_ESSENTIAL_ENV_VARS
 								    def test_contains_user_profile_vars(self):
 								        # os.path.expanduser("~") on Windows uses USERPROFILE.
 								        assert "USERPROFILE" in _WINDOWS_ESSENTIAL_ENV_VARS
 								        assert "APPDATA" in _WINDOWS_ESSENTIAL_ENV_VARS
 								        assert "LOCALAPPDATA" in _WINDOWS_ESSENTIAL_ENV_VARS
 								    def test_contains_only_uppercase_names(self):
 								        # Windows env var names are case-insensitive but we canonicalize to
 								        # uppercase for the membership check (``k.upper() in _WINDOWS_...``).
 								        for name in _WINDOWS_ESSENTIAL_ENV_VARS:
 								            assert name == name.upper(), f"{name!r} should be uppercase"
 								    def test_no_overlap_with_secret_substrings(self):
 								        # Sanity: none of the essential OS vars should look like secrets.
 								        # If this ever fires, we'd have a precedence ordering bug (secrets
 								        # are blocked *before* the essentials check).
 								        for name in _WINDOWS_ESSENTIAL_ENV_VARS:
 								            assert not any(s in name for s in _SECRET_SUBSTRINGS), (
 								                f"{name!r} looks secret-like — would be blocked before the "
 								                "essentials allowlist can match"
 								            )
 								class TestScrubChildEnvWindows:
 								    """Verify _scrub_child_env passes Windows essentials through when
 								    is_windows=True and blocks them when is_windows=False (so POSIX hosts
 								    don't inherit pointless Windows vars)."""
 								    def _sample_windows_env(self):
 								        """A realistic subset of what os.environ looks like on Windows."""
 								        return {
 								            "SYSTEMROOT": r"C:\Windows",
 								            "SystemDrive": "C:",        # Windows preserves native case
 								            "WINDIR": r"C:\Windows",
 								            "ComSpec": r"C:\Windows\System32\cmd.exe",
 								            "PATHEXT": ".COM;.EXE;.BAT;.CMD;.PY",
 								            "USERPROFILE": r"C:\Users\alice",
 								            "APPDATA": r"C:\Users\alice\AppData\Roaming",
 								            "LOCALAPPDATA": r"C:\Users\alice\AppData\Local",
 								            "PATH": r"C:\Windows\System32;C:\Python311",
 								            "HOME": r"C:\Users\alice",
 								            "TEMP": r"C:\Users\alice\AppData\Local\Temp",
 								            # Should still be blocked:
 								            "OPENAI_API_KEY": "sk-secret",
 								            "GITHUB_TOKEN": "ghp_secret",
 								            "MY_PASSWORD": "hunter2",
 								            # Not matched by any rule — should be dropped on both OSes:
 								            "RANDOM_UNKNOWN_VAR": "value",
 								        }
 								    def test_windows_essentials_passed_through_when_is_windows_true(self):
 								        env = self._sample_windows_env()
 								        scrubbed = _scrub_child_env(env,
 								                                    is_passthrough=_no_passthrough,
 								                                    is_windows=True)
 								        # Every essential var from the sample env should survive.
 								        assert scrubbed["SYSTEMROOT"] == r"C:\Windows"
 								        assert scrubbed["SystemDrive"] == "C:"  # case preserved
 								        assert scrubbed["WINDIR"] == r"C:\Windows"
 								        assert scrubbed["ComSpec"] == r"C:\Windows\System32\cmd.exe"
 								        assert scrubbed["PATHEXT"] == ".COM;.EXE;.BAT;.CMD;.PY"
 								        assert scrubbed["USERPROFILE"] == r"C:\Users\alice"
 								        assert scrubbed["APPDATA"].endswith("Roaming")
 								        assert scrubbed["LOCALAPPDATA"].endswith("Local")
 								        # Safe-prefix vars still pass (baseline behavior).
 								        assert "PATH" in scrubbed
 								        assert "HOME" in scrubbed
 								        assert "TEMP" in scrubbed
 								    def test_secrets_still_blocked_on_windows(self):
 								        """The Windows allowlist must NOT defeat the secret-substring block.
 								        This is the key security invariant: essentials are allowed by
 								        *exact name*, and the secret-substring block runs before the
 								        essentials check anyway, so a variable named e.g. ``API_KEY`` can
 								        never sneak through just because we added Windows support.
 								        """
 								        env = self._sample_windows_env()
 								        scrubbed = _scrub_child_env(env,
 								                                    is_passthrough=_no_passthrough,
 								                                    is_windows=True)
 								        assert "OPENAI_API_KEY" not in scrubbed
 								        assert "GITHUB_TOKEN" not in scrubbed
 								        assert "MY_PASSWORD" not in scrubbed
 								    def test_unknown_vars_still_dropped_on_windows(self):
 								        env = self._sample_windows_env()
 								        scrubbed = _scrub_child_env(env,
 								                                    is_passthrough=_no_passthrough,
 								                                    is_windows=True)
 								        assert "RANDOM_UNKNOWN_VAR" not in scrubbed
 								    def test_essentials_blocked_when_is_windows_false(self):
 								        """On POSIX hosts, Windows-specific vars should not pass — they
 								        have no meaning and could confuse child tooling."""
 								        env = self._sample_windows_env()
 								        scrubbed = _scrub_child_env(env,
 								                                    is_passthrough=_no_passthrough,
 								                                    is_windows=False)
 								        # Safe prefixes still match (PATH, HOME, TEMP).
 								        assert "PATH" in scrubbed
 								        assert "HOME" in scrubbed
 								        assert "TEMP" in scrubbed
 								        # But Windows OS vars should be dropped.
 								        assert "SYSTEMROOT" not in scrubbed
 								        assert "WINDIR" not in scrubbed
 								        assert "ComSpec" not in scrubbed
 								        assert "APPDATA" not in scrubbed
 								    def test_case_insensitive_essential_match(self):
 								        """Windows env var names are case-insensitive at the OS level but
 								        Python preserves whatever case os.environ reported.  The scrubber
 								        must normalize to uppercase for the membership check."""
 								        env = {
 								            "SystemRoot": r"C:\Windows",       # mixed case
 								            "comspec": r"C:\Windows\System32\cmd.exe",  # lowercase
 								            "APPDATA": r"C:\Users\x\AppData\Roaming",   # uppercase
 								        }
 								        scrubbed = _scrub_child_env(env,
 								                                    is_passthrough=_no_passthrough,
 								                                    is_windows=True)
 								        assert "SystemRoot" in scrubbed
 								        assert "comspec" in scrubbed
 								        assert "APPDATA" in scrubbed
 								class TestScrubChildEnvPassthroughInteraction:
 								    """The passthrough hook runs *before* the secret block, so a skill
 								    can legitimately forward a third-party API key.  The Windows
 								    essentials addition must not interfere with that."""
 								    def test_passthrough_wins_over_secret_block(self):
 								        env = {"TENOR_API_KEY": "x", "PATH": "/bin"}
 								        scrubbed = _scrub_child_env(env,
 								                                    is_passthrough=lambda k: k == "TENOR_API_KEY",
 								                                    is_windows=False)
 								        assert scrubbed.get("TENOR_API_KEY") == "x"
 								        assert scrubbed.get("PATH") == "/bin"
 								    def test_passthrough_still_works_on_windows(self):
 								        env = {
 								            "TENOR_API_KEY": "x",
 								            "SYSTEMROOT": r"C:\Windows",
 								            "OPENAI_API_KEY": "sk-secret",  # not passthrough
 								        }
 								        scrubbed = _scrub_child_env(
 								            env,
 								            is_passthrough=lambda k: k == "TENOR_API_KEY",
 								            is_windows=True,
 								        )
 								        assert scrubbed.get("TENOR_API_KEY") == "x"
 								        assert scrubbed.get("SYSTEMROOT") == r"C:\Windows"
 								        assert "OPENAI_API_KEY" not in scrubbed
 								@pytest.mark.skipif(
 								    sys.platform != "win32",
 								    reason="Winsock-specific regression — only meaningful on Windows",
 								)
 								class TestWindowsSocketSmokeTest:
 								    """Integration-ish smoke test: spawn a child Python with a scrubbed
 								    env and confirm it can create an AF_INET socket.  This is the
 								    regression that motivated the fix — without SYSTEMROOT the child
 								    hits WinError 10106 before any RPC is attempted."""
 								    def test_child_can_create_socket_with_scrubbed_env(self):
 								        scrubbed = _scrub_child_env(os.environ, is_passthrough=_no_passthrough)
 								        # Build a tiny child script that simply opens an AF_INET socket.
 								        script = textwrap.dedent("""
 								            import socket, sys
 								            try:
 								                s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
 								                s.close()
 								                print("OK")
 								                sys.exit(0)
 								            except OSError as exc:
 								                print(f"FAIL: {exc}")
 								                sys.exit(1)
 								        """).strip()
 								        result = subprocess.run(
 								            [sys.executable, "-c", script],
 								            env=scrubbed,
 								            capture_output=True,
 								            text=True,
 								            timeout=15,
 								        )
 								        assert result.returncode == 0, (
 								            f"Child failed to create socket with scrubbed env:\n"
 								            f"  stdout={result.stdout!r}\n"
 								            f"  stderr={result.stderr!r}\n"
 								            f"  scrubbed keys={sorted(scrubbed.keys())}"
 								        )
 								        assert "OK" in result.stdout
-												tests: lock in POSIX-equivalence guard for execute_code env scrubber

Adds TestPosixEquivalence to test_code_execution_windows_env.py.  The
class pins the invariant that _scrub_child_env(env, is_windows=False)
produces byte-for-byte identical output to the pre-refactor inline
scrubber, across a matrix of:

  - 2 synthetic envs (POSIX-shaped, Windows-shaped-on-POSIX)
  - 3 passthrough rules (none, single-var, everything)
  - 1 real-os.environ check on whatever platform runs the test

Plus a superset sanity check: is_windows=True must keep everything
is_windows=False keeps, and any extras must come from the
_WINDOWS_ESSENTIAL_ENV_VARS allowlist.

Rationale: the previous commit refactored the env-scrubbing inline
block into a helper.  Future changes to that helper must not silently
regress POSIX behavior — if someone needs to change it, they update
_legacy_posix_scrubber in lockstep so the churn is visible in review.

All 21 tests in the file pass locally on Windows (pytest 9.0.3).  8 of
them are parametrized equivalence checks that run on every OS.

											
										
										
											2026-05-07 18:45:34 -07:00
 								# ---------------------------------------------------------------------------
 								# POSIX equivalence guard
 								# ---------------------------------------------------------------------------
 								def _legacy_posix_scrubber(source_env, is_passthrough):
-												fix(code-exec): restore approval context in execute_code RPC threads + guard entry

Wrap both execute_code RPC threads (local UDS + remote file-RPC) with propagate_context_to_thread so gateway sessions no longer fall into check_dangerous_command's non-interactive auto-approve branch and the CLI approval prompt stays reachable. Add check_execute_code_guard: one-shot fail-closed approval of the whole script in gateway/ask/cron-deny before the child spawns (skips isolated backends; command-string built only past the early returns). Drop the broad HERMES_ env passthrough for an explicit operational allowlist plus DSN/WEBHOOK secret substrings, and update the POSIX-equivalence oracle.

Refs #4146, #27303, #30882, #33057

											
										
										
											2026-05-28 17:47:09 -04:00
+								    """Independent oracle for TestPosixEquivalence — a from-scratch reimpl of
 								    _scrub_child_env's POSIX behavior, used to prove the production helper does
 								    what we think it does.
 								    Deliberately updated for #27303 (the broad ``HERMES_`` prefix was dropped
 								    in favor of an explicit operational allowlist, and DSN/WEBHOOK were added
 								    to the secret substrings).  The original docstring said: if POSIX behavior
 								    legitimately needs to evolve, adjust this oracle on purpose so the churn is
 								    visible in review — that is what this change is.
-												tests: lock in POSIX-equivalence guard for execute_code env scrubber

Adds TestPosixEquivalence to test_code_execution_windows_env.py.  The
class pins the invariant that _scrub_child_env(env, is_windows=False)
produces byte-for-byte identical output to the pre-refactor inline
scrubber, across a matrix of:

  - 2 synthetic envs (POSIX-shaped, Windows-shaped-on-POSIX)
  - 3 passthrough rules (none, single-var, everything)
  - 1 real-os.environ check on whatever platform runs the test

Plus a superset sanity check: is_windows=True must keep everything
is_windows=False keeps, and any extras must come from the
_WINDOWS_ESSENTIAL_ENV_VARS allowlist.

Rationale: the previous commit refactored the env-scrubbing inline
block into a helper.  Future changes to that helper must not silently
regress POSIX behavior — if someone needs to change it, they update
_legacy_posix_scrubber in lockstep so the churn is visible in review.

All 21 tests in the file pass locally on Windows (pytest 9.0.3).  8 of
them are parametrized equivalence checks that run on every OS.

											
										
										
											2026-05-07 18:45:34 -07:00
+								    """
 								    _SAFE_ENV_PREFIXES = ("PATH", "HOME", "USER", "LANG", "LC_", "TERM",
 								                          "TMPDIR", "TMP", "TEMP", "SHELL", "LOGNAME",
-												fix(code-exec): restore approval context in execute_code RPC threads + guard entry

Wrap both execute_code RPC threads (local UDS + remote file-RPC) with propagate_context_to_thread so gateway sessions no longer fall into check_dangerous_command's non-interactive auto-approve branch and the CLI approval prompt stays reachable. Add check_execute_code_guard: one-shot fail-closed approval of the whole script in gateway/ask/cron-deny before the child spawns (skips isolated backends; command-string built only past the early returns). Drop the broad HERMES_ env passthrough for an explicit operational allowlist plus DSN/WEBHOOK secret substrings, and update the POSIX-equivalence oracle.

Refs #4146, #27303, #30882, #33057

											
										
										
											2026-05-28 17:47:09 -04:00
+								                          "XDG_", "PYTHONPATH", "VIRTUAL_ENV", "CONDA")
-												tests: lock in POSIX-equivalence guard for execute_code env scrubber

Adds TestPosixEquivalence to test_code_execution_windows_env.py.  The
class pins the invariant that _scrub_child_env(env, is_windows=False)
produces byte-for-byte identical output to the pre-refactor inline
scrubber, across a matrix of:

  - 2 synthetic envs (POSIX-shaped, Windows-shaped-on-POSIX)
  - 3 passthrough rules (none, single-var, everything)
  - 1 real-os.environ check on whatever platform runs the test

Plus a superset sanity check: is_windows=True must keep everything
is_windows=False keeps, and any extras must come from the
_WINDOWS_ESSENTIAL_ENV_VARS allowlist.

Rationale: the previous commit refactored the env-scrubbing inline
block into a helper.  Future changes to that helper must not silently
regress POSIX behavior — if someone needs to change it, they update
_legacy_posix_scrubber in lockstep so the churn is visible in review.

All 21 tests in the file pass locally on Windows (pytest 9.0.3).  8 of
them are parametrized equivalence checks that run on every OS.

											
										
										
											2026-05-07 18:45:34 -07:00
+								    _SECRET_SUBSTRINGS = ("KEY", "TOKEN", "SECRET", "PASSWORD", "CREDENTIAL",
-												fix(code-exec): restore approval context in execute_code RPC threads + guard entry

Wrap both execute_code RPC threads (local UDS + remote file-RPC) with propagate_context_to_thread so gateway sessions no longer fall into check_dangerous_command's non-interactive auto-approve branch and the CLI approval prompt stays reachable. Add check_execute_code_guard: one-shot fail-closed approval of the whole script in gateway/ask/cron-deny before the child spawns (skips isolated backends; command-string built only past the early returns). Drop the broad HERMES_ env passthrough for an explicit operational allowlist plus DSN/WEBHOOK secret substrings, and update the POSIX-equivalence oracle.

Refs #4146, #27303, #30882, #33057

											
										
										
											2026-05-28 17:47:09 -04:00
+								                          "PASSWD", "AUTH", "DSN", "WEBHOOK")
 								    _HERMES_CHILD_ALLOWED = frozenset({
 								        "HERMES_HOME", "HERMES_PROFILE", "HERMES_CONFIG", "HERMES_ENV",
 								    })
-												tests: lock in POSIX-equivalence guard for execute_code env scrubber

Adds TestPosixEquivalence to test_code_execution_windows_env.py.  The
class pins the invariant that _scrub_child_env(env, is_windows=False)
produces byte-for-byte identical output to the pre-refactor inline
scrubber, across a matrix of:

  - 2 synthetic envs (POSIX-shaped, Windows-shaped-on-POSIX)
  - 3 passthrough rules (none, single-var, everything)
  - 1 real-os.environ check on whatever platform runs the test

Plus a superset sanity check: is_windows=True must keep everything
is_windows=False keeps, and any extras must come from the
_WINDOWS_ESSENTIAL_ENV_VARS allowlist.

Rationale: the previous commit refactored the env-scrubbing inline
block into a helper.  Future changes to that helper must not silently
regress POSIX behavior — if someone needs to change it, they update
_legacy_posix_scrubber in lockstep so the churn is visible in review.

All 21 tests in the file pass locally on Windows (pytest 9.0.3).  8 of
them are parametrized equivalence checks that run on every OS.

											
										
										
											2026-05-07 18:45:34 -07:00
+								    out = {}
 								    for k, v in source_env.items():
 								        if is_passthrough(k):
 								            out[k] = v
 								            continue
 								        if any(s in k.upper() for s in _SECRET_SUBSTRINGS):
 								            continue
 								        if any(k.startswith(p) for p in _SAFE_ENV_PREFIXES):
 								            out[k] = v
-												fix(code-exec): restore approval context in execute_code RPC threads + guard entry

Wrap both execute_code RPC threads (local UDS + remote file-RPC) with propagate_context_to_thread so gateway sessions no longer fall into check_dangerous_command's non-interactive auto-approve branch and the CLI approval prompt stays reachable. Add check_execute_code_guard: one-shot fail-closed approval of the whole script in gateway/ask/cron-deny before the child spawns (skips isolated backends; command-string built only past the early returns). Drop the broad HERMES_ env passthrough for an explicit operational allowlist plus DSN/WEBHOOK secret substrings, and update the POSIX-equivalence oracle.

Refs #4146, #27303, #30882, #33057

											
										
										
											2026-05-28 17:47:09 -04:00
+								            continue
 								        if k in _HERMES_CHILD_ALLOWED:
 								            out[k] = v
-												tests: lock in POSIX-equivalence guard for execute_code env scrubber

Adds TestPosixEquivalence to test_code_execution_windows_env.py.  The
class pins the invariant that _scrub_child_env(env, is_windows=False)
produces byte-for-byte identical output to the pre-refactor inline
scrubber, across a matrix of:

  - 2 synthetic envs (POSIX-shaped, Windows-shaped-on-POSIX)
  - 3 passthrough rules (none, single-var, everything)
  - 1 real-os.environ check on whatever platform runs the test

Plus a superset sanity check: is_windows=True must keep everything
is_windows=False keeps, and any extras must come from the
_WINDOWS_ESSENTIAL_ENV_VARS allowlist.

Rationale: the previous commit refactored the env-scrubbing inline
block into a helper.  Future changes to that helper must not silently
regress POSIX behavior — if someone needs to change it, they update
_legacy_posix_scrubber in lockstep so the churn is visible in review.

All 21 tests in the file pass locally on Windows (pytest 9.0.3).  8 of
them are parametrized equivalence checks that run on every OS.

											
										
										
											2026-05-07 18:45:34 -07:00
+								    return out
 								class TestPosixEquivalence:
 								    """Lock in the invariant that _scrub_child_env(env, is_windows=False)
 								    behaves *bit-for-bit identically* to the pre-refactor inline scrubber.
 								    If this ever fails, it means somebody changed POSIX env-scrubbing
 								    behavior — maybe on purpose, maybe not.  Either way it should land
 								    as a deliberate, reviewed change (update _legacy_posix_scrubber
 								    above in the same PR).
 								    Rationale: the Windows-essentials patch refactored the scrubber into
 								    a helper.  Linux/macOS must not regress.  This class gates that.
 								    """
 								    _POSIX_SYNTHETIC_ENV = {
 								        # Safe-prefix matches
 								        "PATH": "/usr/bin:/bin",
 								        "HOME": "/home/alice",
 								        "USER": "alice",
 								        "LANG": "en_US.UTF-8",
 								        "LC_CTYPE": "en_US.UTF-8",
 								        "TERM": "xterm-256color",
 								        "SHELL": "/bin/zsh",
 								        "LOGNAME": "alice",
 								        "TMPDIR": "/tmp",
 								        "XDG_RUNTIME_DIR": "/run/user/1000",
 								        "XDG_CONFIG_HOME": "/home/alice/.config",
 								        "PYTHONPATH": "/opt/lib",
 								        "VIRTUAL_ENV": "/home/alice/.venv",
 								        "CONDA_PREFIX": "/opt/conda",
-												fix(code-exec): restore approval context in execute_code RPC threads + guard entry

Wrap both execute_code RPC threads (local UDS + remote file-RPC) with propagate_context_to_thread so gateway sessions no longer fall into check_dangerous_command's non-interactive auto-approve branch and the CLI approval prompt stays reachable. Add check_execute_code_guard: one-shot fail-closed approval of the whole script in gateway/ask/cron-deny before the child spawns (skips isolated backends; command-string built only past the early returns). Drop the broad HERMES_ env passthrough for an explicit operational allowlist plus DSN/WEBHOOK secret substrings, and update the POSIX-equivalence oracle.

Refs #4146, #27303, #30882, #33057

											
										
										
											2026-05-28 17:47:09 -04:00
+								        # HERMES_* handling (#27303): only the operational allowlist passes;
 								        # every other HERMES_* is dropped (the broad prefix was removed).
 								        "HERMES_HOME": "/home/alice/.hermes",        # allowlisted → kept
 								        "HERMES_PROFILE": "default",                 # allowlisted → kept
 								        "HERMES_INTERACTIVE": "1",                   # not allowlisted → dropped
 								        "HERMES_BASE_URL": "https://api.internal",   # not allowlisted → dropped
 								        "HERMES_KANBAN_DB": "postgres://u:p@h/db",   # not allowlisted → dropped
-												tests: lock in POSIX-equivalence guard for execute_code env scrubber

Adds TestPosixEquivalence to test_code_execution_windows_env.py.  The
class pins the invariant that _scrub_child_env(env, is_windows=False)
produces byte-for-byte identical output to the pre-refactor inline
scrubber, across a matrix of:

  - 2 synthetic envs (POSIX-shaped, Windows-shaped-on-POSIX)
  - 3 passthrough rules (none, single-var, everything)
  - 1 real-os.environ check on whatever platform runs the test

Plus a superset sanity check: is_windows=True must keep everything
is_windows=False keeps, and any extras must come from the
_WINDOWS_ESSENTIAL_ENV_VARS allowlist.

Rationale: the previous commit refactored the env-scrubbing inline
block into a helper.  Future changes to that helper must not silently
regress POSIX behavior — if someone needs to change it, they update
_legacy_posix_scrubber in lockstep so the churn is visible in review.

All 21 tests in the file pass locally on Windows (pytest 9.0.3).  8 of
them are parametrized equivalence checks that run on every OS.

											
										
										
											2026-05-07 18:45:34 -07:00
+								        # Secret-substring blocks
 								        "OPENAI_API_KEY": "sk-xxx",
 								        "GITHUB_TOKEN": "ghp_xxx",
 								        "AWS_SECRET_ACCESS_KEY": "yyy",
 								        "MY_PASSWORD": "hunter2",
-												fix(code-exec): restore approval context in execute_code RPC threads + guard entry

Wrap both execute_code RPC threads (local UDS + remote file-RPC) with propagate_context_to_thread so gateway sessions no longer fall into check_dangerous_command's non-interactive auto-approve branch and the CLI approval prompt stays reachable. Add check_execute_code_guard: one-shot fail-closed approval of the whole script in gateway/ask/cron-deny before the child spawns (skips isolated backends; command-string built only past the early returns). Drop the broad HERMES_ env passthrough for an explicit operational allowlist plus DSN/WEBHOOK secret substrings, and update the POSIX-equivalence oracle.

Refs #4146, #27303, #30882, #33057

											
										
										
											2026-05-28 17:47:09 -04:00
+								        "SENTRY_DSN": "https://abc@sentry.io/1",     # DSN substring → blocked
 								        "SLACK_WEBHOOK": "https://hooks.slack/x",    # WEBHOOK substring → blocked
-												tests: lock in POSIX-equivalence guard for execute_code env scrubber

Adds TestPosixEquivalence to test_code_execution_windows_env.py.  The
class pins the invariant that _scrub_child_env(env, is_windows=False)
produces byte-for-byte identical output to the pre-refactor inline
scrubber, across a matrix of:

  - 2 synthetic envs (POSIX-shaped, Windows-shaped-on-POSIX)
  - 3 passthrough rules (none, single-var, everything)
  - 1 real-os.environ check on whatever platform runs the test

Plus a superset sanity check: is_windows=True must keep everything
is_windows=False keeps, and any extras must come from the
_WINDOWS_ESSENTIAL_ENV_VARS allowlist.

Rationale: the previous commit refactored the env-scrubbing inline
block into a helper.  Future changes to that helper must not silently
regress POSIX behavior — if someone needs to change it, they update
_legacy_posix_scrubber in lockstep so the churn is visible in review.

All 21 tests in the file pass locally on Windows (pytest 9.0.3).  8 of
them are parametrized equivalence checks that run on every OS.

											
										
										
											2026-05-07 18:45:34 -07:00
+								        # Uncategorized — must be dropped
 								        "RANDOM_UNKNOWN": "drop-me",
 								        "DISPLAY": ":0",
 								        "SSH_AUTH_SOCK": "/run/user/1000/ssh-agent",
 								        # Passthrough candidate (also matches secret block by default)
 								        "TENOR_API_KEY": "tenor-xxx",
 								    }
 								    _WINDOWS_SYNTHETIC_ENV = {
 								        # Windows-essential names (must be dropped on POSIX, passed on Win)
 								        "SYSTEMROOT": r"C:\Windows",
 								        "SystemDrive": "C:",
 								        "WINDIR": r"C:\Windows",
 								        "ComSpec": r"C:\Windows\System32\cmd.exe",
 								        "PATHEXT": ".COM;.EXE;.BAT",
 								        "USERPROFILE": r"C:\Users\alice",
 								        "APPDATA": r"C:\Users\alice\AppData\Roaming",
 								        "LOCALAPPDATA": r"C:\Users\alice\AppData\Local",
 								        # Safe-prefix matches (cross-platform)
 								        "PATH": r"C:\Python311;C:\Windows\System32",
 								        "HOME": r"C:\Users\alice",
 								        "TEMP": r"C:\Users\alice\AppData\Local\Temp",
 								        # Secret-looking (always blocked)
 								        "OPENAI_API_KEY": "sk-xxx",
 								        "GITHUB_TOKEN": "ghp_xxx",
 								    }
 								    @pytest.mark.parametrize("env_name,env", [
 								        ("posix_synthetic", _POSIX_SYNTHETIC_ENV),
 								        ("windows_synthetic_on_posix", _WINDOWS_SYNTHETIC_ENV),
 								    ])
 								    @pytest.mark.parametrize("pt_name,pt", [
 								        ("no_passthrough", lambda _: False),
 								        ("tenor_passthrough", lambda k: k == "TENOR_API_KEY"),
 								        ("all_passthrough", lambda _: True),
 								    ])
 								    def test_posix_behavior_unchanged(self, env_name, env, pt_name, pt):
 								        """For every combination of (env shape × passthrough rule), the
 								        new helper with is_windows=False must produce the exact same dict
 								        as the legacy inline scrubber.
 								        We parametrize over three passthrough rules to cover the full
 								        surface: no passthrough, single-var passthrough (the common
 								        skill-registered case), and everything-passes (edge case that
 								        could expose precedence bugs)."""
 								        expected = _legacy_posix_scrubber(env, pt)
 								        actual = _scrub_child_env(env, is_passthrough=pt, is_windows=False)
 								        assert actual == expected, (
 								            f"POSIX behavior regressed for env={env_name}, passthrough={pt_name}\n"
 								            f"  only in legacy: {sorted(set(expected) - set(actual))}\n"
 								            f"  only in new:    {sorted(set(actual) - set(expected))}\n"
 								            f"  value diffs:    {[k for k in expected if k in actual and expected[k] != actual[k]]}"
 								        )
 								    def test_posix_behavior_unchanged_on_real_os_environ(self):
 								        """Bonus check against the actual os.environ of the host running
 								        the test.  This covers vars we might not have thought to put in
 								        the synthetic fixtures."""
 								        expected = _legacy_posix_scrubber(os.environ, lambda _: False)
 								        actual = _scrub_child_env(os.environ,
 								                                  is_passthrough=lambda _: False,
 								                                  is_windows=False)
 								        assert actual == expected, (
 								            "POSIX-mode scrubber diverged from legacy behavior on real "
 								            f"os.environ (host platform={sys.platform})"
 								        )
 								    def test_windows_mode_is_strict_superset_of_posix_mode(self):
 								        """Correctness check on the NEW behavior: is_windows=True must
 								        keep everything POSIX mode keeps, and *may* add Windows
 								        essentials.  It must never drop a var that POSIX mode would keep
 								        — if it did, we'd have broken same-host reuse of the scrubber."""
 								        env = {**self._POSIX_SYNTHETIC_ENV, **self._WINDOWS_SYNTHETIC_ENV}
 								        posix_result = _scrub_child_env(env,
 								                                        is_passthrough=lambda _: False,
 								                                        is_windows=False)
 								        windows_result = _scrub_child_env(env,
 								                                          is_passthrough=lambda _: False,
 								                                          is_windows=True)
 								        missing = set(posix_result) - set(windows_result)
 								        assert not missing, (
 								            f"is_windows=True dropped vars that is_windows=False kept: {missing}"
 								        )
 								        # And any extras must come from the Windows essentials allowlist.
 								        extras = set(windows_result) - set(posix_result)
 								        for k in extras:
 								            assert k.upper() in _WINDOWS_ESSENTIAL_ENV_VARS, (
 								                f"Unexpected extra var in windows-mode output: {k} "
 								                f"(not in _WINDOWS_ESSENTIAL_ENV_VARS)"
 								            )
-												execute_code: write sandbox files as UTF-8 on Windows

Second Windows-specific sandbox bug (WinError 10106 was the first):
after the env-scrub fix let the child start, it immediately failed to
import hermes_tools with:

    SyntaxError: (unicode error) 'utf-8' codec can't decode byte 0x97
                 in position 154: invalid start byte

Root cause: _execute_local wrote the generated hermes_tools.py stub and
the user's script.py via open(path, 'w') without encoding=.  On Windows
the default text-mode encoding is cp1252 (system locale), which encodes
em-dashes (used in the stub's docstrings) as 0x97.  Python then decodes
source files as UTF-8 (PEP 3120) on import, chokes on 0x97, and the
sandbox dies before any tool call.

Fix: pass encoding='utf-8' to all four file opens in the code_execution
path — the two staging writes in _execute_local (hermes_tools.py +
script.py) and the two RPC file-transport reads/writes in the generated
remote stub.  JSON is ASCII-safe for most payloads but tool results
(terminal output, web_extract content) routinely carry non-ASCII.

Tests added (4):
  - test_stub_and_script_writes_specify_utf8 — source grep guard
  - test_file_rpc_stub_uses_utf8 — generated remote stub check
  - test_stub_source_roundtrips_through_utf8 — concrete round-trip
  - test_windows_default_encoding_would_have_failed — negative control
    (skips on modern Python builds where default is already UTF-8
    compatible, but retained for platforms where the regression could
    return)

24/25 tests pass on Windows 3.11 (negative control skips because this
Python build handles em-dashes via cp1252 subset — the fix is still
correct, just the corruption path isn't always triggerable).

											
										
										
											2026-05-07 18:52:59 -07:00
 								# ---------------------------------------------------------------------------
 								# UTF-8 file-write regression test
 								# ---------------------------------------------------------------------------
 								#
 								# The sandbox writes two Python files into a temp dir — the generated
 								# ``hermes_tools.py`` stub, and the LLM's ``script.py``.  Both contain
 								# non-ASCII characters in practice: the stub has em-dashes in docstrings
 								# ("``tcp://host:port`` — the parent falls back..."), and user scripts
 								# routinely contain non-ASCII strings, comments, or Unicode identifiers.
 								#
 								# On Windows, ``open(path, "w")`` without encoding= uses the system locale
 								# (cp1252 on US/UK installs), which cannot encode em-dashes.  Python then
 								# tries to decode the file as UTF-8 when importing it (PEP 3120), fails,
 								# and the sandbox aborts with:
 								#
 								#     SyntaxError: (unicode error) 'utf-8' codec can't decode byte 0x97
 								#                  in position N: invalid start byte
 								#
 								# This was the *second* Windows-specific bug (WinError 10106 was the first).
 								# The fix is to always pass ``encoding="utf-8"`` when writing Python source.
 								class TestSandboxWritesUtf8:
 								    """Verify the file-write call sites use UTF-8 explicitly, not the
 								    platform default.  We check the source of ``execute_code`` rather
 								    than spawning a real sandbox because the latter needs a full agent
 								    context — but the code inspection is deterministic and fast."""
 								    def test_stub_and_script_writes_specify_utf8(self):
 								        """Both ``hermes_tools.py`` and ``script.py`` writes in
 								        ``_execute_local`` must pass ``encoding="utf-8"``."""
 								        import tools.code_execution_tool as cet
 								        src = open(cet.__file__, encoding="utf-8").read()
 								        # There should be no ``open(path, "w")`` without encoding= for
 								        # the two staging files.  Grep-style check: find every write of
 								        # a .py file inside tmpdir and assert the line also contains
 								        # ``encoding="utf-8"`` within a short window.
 								        import re
 								        pattern = re.compile(
 								            r'open\(\s*os\.path\.join\(\s*tmpdir\s*,\s*"[^"]+\.py"\s*\)\s*,\s*"w"[^)]*\)'
 								        )
 								        for match in pattern.finditer(src):
 								            line = match.group(0)
 								            assert 'encoding="utf-8"' in line or "encoding='utf-8'" in line, (
 								                f"Sandbox file write missing encoding=\"utf-8\" on Windows: {line!r}"
 								            )
 								    def test_file_rpc_stub_uses_utf8(self):
 								        """The file-based RPC transport stub (used by remote backends)
 								        reads/writes JSON response files.  Those must also specify UTF-8
 								        so non-ASCII tool results survive the round-trip intact."""
 								        from tools.code_execution_tool import generate_hermes_tools_module
 								        stub = generate_hermes_tools_module(["terminal"], transport="file")
 								        # The generated stub should open response + request files as UTF-8.
 								        assert 'encoding="utf-8"' in stub, (
 								            "File-based RPC stub does not specify encoding=\"utf-8\" — "
 								            "will corrupt non-ASCII tool results on non-UTF-8 locales."
 								        )
 								    def test_stub_source_roundtrips_through_utf8(self):
 								        """Concrete regression: write the generated stub to a temp file
 								        using ``encoding="utf-8"``, then parse it.  This is what the
 								        sandbox does, and it must succeed even when the stub contains
 								        em-dashes (which it does — check the transport-header docstring).
 								        """
 								        from tools.code_execution_tool import generate_hermes_tools_module
 								        import tempfile, ast
 								        stub = generate_hermes_tools_module(
 								            ["terminal", "read_file", "write_file"], transport="uds"
 								        )
 								        # Sanity: stub actually contains a non-ASCII character, otherwise
 								        # this test wouldn't prove anything meaningful.
 								        non_ascii = [c for c in stub if ord(c) > 127]
 								        assert non_ascii, (
 								            "Generated stub is pure ASCII — test is meaningless.  If the "
 								            "stub's docstrings have lost their em-dashes, update this "
 								            "assertion, but be aware the original regression is no longer "
 								            "covered."
 								        )
 								        with tempfile.NamedTemporaryFile(
 								            mode="w", suffix=".py", delete=False, encoding="utf-8"
 								        ) as f:
 								            f.write(stub)
 								            tmp_path = f.name
 								        try:
 								            # Re-read and parse exactly like the child Python would.
 								            with open(tmp_path, encoding="utf-8") as fh:
 								                round_tripped = fh.read()
 								            assert round_tripped == stub, "UTF-8 round-trip corrupted the stub"
 								            ast.parse(round_tripped)  # must not raise SyntaxError
 								        finally:
 								            os.unlink(tmp_path)
 								    @pytest.mark.skipif(
 								        sys.platform != "win32",
 								        reason="cp1252 default-encoding regression is Windows-specific",
 								    )
 								    def test_windows_default_encoding_would_have_failed(self):
 								        """Negative control: prove that on Windows, writing the stub
 								        *without* ``encoding="utf-8"`` would corrupt the file.  If this
 								        test ever starts failing (i.e. default write succeeds), it means
 								        Python's default encoding has changed and the explicit UTF-8
 								        requirement may be obsolete — reconsider the fix."""
 								        from tools.code_execution_tool import generate_hermes_tools_module
 								        import tempfile
 								        stub = generate_hermes_tools_module(["terminal"], transport="uds")
 								        # Find a non-ASCII character we can use to prove the corruption.
 								        non_ascii = [c for c in stub if ord(c) > 127]
 								        if not non_ascii:
 								            pytest.skip("stub has no non-ASCII chars — nothing to corrupt")
 								        # Write with default encoding (simulating the old buggy code).
 								        with tempfile.NamedTemporaryFile(
 								            mode="w", suffix=".py", delete=False
 								        ) as f:
 								            try:
 								                f.write(stub)
 								                tmp_path = f.name
 								                wrote_successfully = True
 								            except UnicodeEncodeError:
 								                # Default encoding can't even encode it — that's the bug
 								                # in a different form.  Still proves the point.
 								                tmp_path = f.name
 								                wrote_successfully = False
 								        try:
 								            if not wrote_successfully:
 								                # Default-encoding write raised outright.  The bug is real.
 								                return
 								            # Read back as UTF-8 (what Python does on import).
 								            with open(tmp_path, encoding="utf-8") as fh:
 								                try:
 								                    fh.read()
 								                    # If this succeeds on Windows, the platform default is
 								                    # already UTF-8 (e.g. Python 3.15 with UTF-8 mode on).
 								                    # In that case the explicit encoding= is belt-and-
 								                    # suspenders but no longer strictly required.  Skip.
 								                    pytest.skip(
 								                        "Default text-file encoding is UTF-8-compatible on "
 								                        "this Windows build — explicit encoding= is no "
 								                        "longer load-bearing, but keep it for belt-and-"
 								                        "suspenders."
 								                    )
 								                except UnicodeDecodeError:
 								                    # Exactly the failure mode that motivated the fix.
 								                    pass
 								        finally:
 								            os.unlink(tmp_path)
-												execute_code: set PYTHONIOENCODING=utf-8 + PYTHONUTF8=1 in child env

Third Windows-specific sandbox bug (after WinError 10106 and the UTF-8
file-write bug): user scripts that print non-ASCII to stdout crash with

    UnicodeEncodeError: 'charmap' codec can't encode character '\u2192'
                        in position N: character maps to <undefined>

Root cause: Python's sys.stdout on Windows is bound to the console code
page (cp1252 on US-locale installs) when the process is attached to a
pipe without PYTHONIOENCODING set.  LLM-generated scripts routinely
print em-dashes, arrows, accented chars, and emoji — all of which cp1252
can't encode.

Fix: spawn the sandbox child with:

    PYTHONIOENCODING=utf-8   # sys.stdin/stdout/stderr all UTF-8
    PYTHONUTF8=1             # PEP 540 UTF-8 mode — open() defaults to UTF-8 too

PYTHONUTF8 is the belt-and-suspenders half: LLM scripts that call
open(path, 'w') without encoding= in user code will now produce UTF-8
files by default, matching what the sandbox already does for its own
staging files.

The parent side already decodes child stdout/stderr as UTF-8 with
errors='replace' (lines 1345-1347) so the end-to-end chain is clean.

On POSIX these values usually match the locale default already, so
setting them is harmless belt-and-suspenders for C/POSIX-locale
containers and minimal base images.

Tests added (4) — total file now at 28 passed, 1 skipped on Windows:
  - test_popen_env_sets_pythonioencoding_utf8 (source grep)
  - test_popen_env_sets_pythonutf8_mode (source grep)
  - test_live_child_can_print_non_ascii (cross-platform live test)
  - test_windows_child_without_utf8_env_would_fail (Windows negative
    control — actually reproduces the bug without our env overrides,
    proving the fix is load-bearing on this system)

											
										
										
											2026-05-07 18:59:35 -07:00
 								# ---------------------------------------------------------------------------
 								# UTF-8 stdio regression test
 								# ---------------------------------------------------------------------------
 								#
 								# The third Windows-specific sandbox bug: after the UTF-8 file-write fix
 								# let the child import hermes_tools, a user script that printed non-ASCII
 								# to stdout still crashed with:
 								#
 								#     UnicodeEncodeError: 'charmap' codec can't encode character '\u2192'
 								#                         in position N: character maps to <undefined>
 								#
 								# Python's sys.stdout on Windows is bound to the console code page
 								# (cp1252 on US-locale installs) when the process is attached to a pipe
 								# without PYTHONIOENCODING set.  LLM-generated scripts routinely print
 								# em-dashes, arrows, accented chars, emoji — all of which break.
 								#
 								# Fix: spawn the child with PYTHONIOENCODING=utf-8 and PYTHONUTF8=1.
 								# The latter also makes open()'s default encoding UTF-8 (PEP 540),
 								# belt-and-suspenders for user scripts that do their own file I/O.
 								class TestChildStdioIsUtf8:
 								    """Verify the sandbox child is spawned with UTF-8 stdio encoding,
 								    so LLM scripts can print non-ASCII without crashing on Windows."""
 								    def test_popen_env_sets_pythonioencoding_utf8(self):
 								        """Source-level check: the Popen call site must set
 								        PYTHONIOENCODING=utf-8 in child_env."""
 								        import tools.code_execution_tool as cet
 								        src = open(cet.__file__, encoding="utf-8").read()
 								        assert 'child_env["PYTHONIOENCODING"] = "utf-8"' in src, (
 								            "PYTHONIOENCODING=utf-8 missing from child env — Windows "
 								            "scripts that print non-ASCII will crash with "
 								            "UnicodeEncodeError."
 								        )
 								    def test_popen_env_sets_pythonutf8_mode(self):
 								        """Source-level check: PYTHONUTF8=1 must be set too — it makes
 								        open()'s default encoding UTF-8 in user-written file I/O."""
 								        import tools.code_execution_tool as cet
 								        src = open(cet.__file__, encoding="utf-8").read()
 								        assert 'child_env["PYTHONUTF8"] = "1"' in src, (
 								            "PYTHONUTF8=1 missing from child env — user scripts that "
 								            "call open(path, 'w') without encoding= will produce "
 								            "locale-encoded files on Windows."
 								        )
 								    def test_live_child_can_print_non_ascii(self):
 								        """Live regression: spawn a Python child with the same env
 								        treatment the sandbox uses (PYTHONIOENCODING=utf-8 + PYTHONUTF8=1)
 								        and verify it can print em-dashes, arrows, and emoji to stdout
 								        without crashing.  This is the exact scenario that broke in live
 								        usage.
 								        Runs on every OS — on POSIX the fix is belt-and-suspenders but
 								        still load-bearing for C.ASCII locale environments.
 								        """
 								        script = textwrap.dedent("""
 								            import sys
 								            # Mix of chars that cp1252 can't encode: arrow, emoji.
 								            print("em-dash \\u2014 arrow \\u2192 emoji \\U0001f680")
 								            sys.exit(0)
 								        """).strip()
 								        # Build a scrubbed env the same way the sandbox does, then apply
 								        # the stdio overrides.
 								        scrubbed = _scrub_child_env(os.environ, is_passthrough=_no_passthrough)
 								        scrubbed["PYTHONIOENCODING"] = "utf-8"
 								        scrubbed["PYTHONUTF8"] = "1"
 								        result = subprocess.run(
 								            [sys.executable, "-c", script],
 								            env=scrubbed,
 								            capture_output=True,
 								            timeout=15,
 								            # Don't decode at the subprocess boundary — we want to check
 								            # the raw bytes match UTF-8, same as what the sandbox does.
 								        )
 								        assert result.returncode == 0, (
 								            f"Child crashed printing non-ASCII:\n"
 								            f"  stdout (raw): {result.stdout!r}\n"
 								            f"  stderr (raw): {result.stderr!r}"
 								        )
 								        decoded = result.stdout.decode("utf-8")
 								        assert "\u2014" in decoded, f"em-dash missing from output: {decoded!r}"
 								        assert "\u2192" in decoded, f"arrow missing from output: {decoded!r}"
 								        assert "\U0001f680" in decoded, f"emoji missing from output: {decoded!r}"
 								    @pytest.mark.skipif(
 								        sys.platform != "win32",
 								        reason="cp1252 stdout default is Windows-specific",
 								    )
 								    def test_windows_child_without_utf8_env_would_fail(self):
 								        """Negative control: spawn a Python child *without* our env
 								        overrides and prove that on Windows, printing non-ASCII fails.
 								        If this ever starts passing, Python has changed its default
 								        stdio encoding on Windows and the fix may be obsolete — but
 								        keep the env vars anyway for belt-and-suspenders."""
 								        script = textwrap.dedent("""
 								            import sys
 								            print("em-dash \\u2014 arrow \\u2192")
 								            sys.exit(0)
 								        """).strip()
 								        # Scrubbed env WITHOUT the PYTHONIOENCODING / PYTHONUTF8 overrides.
 								        # Also scrub PYTHONUTF8 and PYTHONIOENCODING from the inherited
 								        # env so we reproduce the buggy state even if the parent test
 								        # runner has them set.
 								        scrubbed = _scrub_child_env(os.environ, is_passthrough=_no_passthrough)
 								        for k in ("PYTHONIOENCODING", "PYTHONUTF8", "PYTHONLEGACYWINDOWSSTDIO"):
 								            scrubbed.pop(k, None)
 								        result = subprocess.run(
 								            [sys.executable, "-c", script],
 								            env=scrubbed,
 								            capture_output=True,
 								            text=False,
 								            timeout=15,
 								        )
 								        # Either the child crashed (expected), or modern Python handled
 								        # it anyway — in which case the fix is still defensive but no
 								        # longer strictly required.  Skip with a note if so.
 								        if result.returncode == 0 and b"\xe2\x80\x94" in result.stdout:
 								            pytest.skip(
 								                "This Python/Windows build handles non-ASCII stdout even "
 								                "without PYTHONIOENCODING/PYTHONUTF8 — fix is defensive "
 								                "but no longer strictly load-bearing.  Keep the env vars "
 								                "for older Python builds and C.ASCII-locale containers."
 								            )
 								        # Otherwise: crash OR garbled output — both count as proving the
 								        # bug is real on this system.