From 0d9ad52922ce1d43fd110b3be1fe6e323d376609 Mon Sep 17 00:00:00 2001 From: Operator & Codex Date: Sat, 25 Apr 2026 19:37:54 +0200 Subject: [PATCH] fix(controlplane): stop git push token burn in jail --- Build: FAIL | Tests: FAIL --- docs/internal/AGENT-WORKFLOW-CHECKLIST.md | 7 ++ docs/internal/AGENT-WORKTREE-WORKFLOW.md | 40 +++++++++++ docs/internal/TOKEN-OPTIMIZATION.md | 81 +++++++++++++++++++++ justfile | 3 + scripts/git-doctor.sh | 68 ++++++++++++++++++ src/controlplane-heartbeat.test.ts | 85 +++++++++++++++++++++++ src/controlplane-heartbeat.ts | 33 ++++++++- 7 files changed, 316 insertions(+), 1 deletion(-) create mode 100644 docs/internal/TOKEN-OPTIMIZATION.md create mode 100755 scripts/git-doctor.sh diff --git a/docs/internal/AGENT-WORKFLOW-CHECKLIST.md b/docs/internal/AGENT-WORKFLOW-CHECKLIST.md index 13ae712..8237acb 100644 --- a/docs/internal/AGENT-WORKFLOW-CHECKLIST.md +++ b/docs/internal/AGENT-WORKFLOW-CHECKLIST.md @@ -6,6 +6,7 @@ Short operator checklist for the per-agent worktree model. - verify `git config core.hooksPath` returns `hooks` - verify `git config extensions.worktreeConfig` is `true` +- run `just git-doctor` - confirm the primary FreeBSD checkout is the only place that will push `multitenant` @@ -32,6 +33,7 @@ Short operator checklist for the per-agent worktree model. - rebase on latest `origin/multitenant` - if local commits were rewritten, push with `--force-with-lease` - report the feature branch name and tip commit hash +- do not ask jailed `git-admin` to push/fetch/pull remote branches ## Before Codex Integrates @@ -57,3 +59,8 @@ Short operator checklist for the per-agent worktree model. - fetch and rebase before retrying - rebase asks for an editor: - use `GIT_EDITOR=true git rebase --continue` +- every git command says `must be run in a work tree`: + - run `just git-doctor` + - if broken, run `sh scripts/git-doctor.sh --fix` +- git-admin push/fetch task burns tokens: + - remote git work belongs in a host worktree, not in `git-worker` diff --git a/docs/internal/AGENT-WORKTREE-WORKFLOW.md b/docs/internal/AGENT-WORKTREE-WORKFLOW.md index e7e65a3..1f7ac40 100644 --- a/docs/internal/AGENT-WORKTREE-WORKFLOW.md +++ b/docs/internal/AGENT-WORKTREE-WORKFLOW.md @@ -42,12 +42,15 @@ Run once in the primary clone: ```sh git config extensions.worktreeConfig true git config core.hooksPath hooks +just git-doctor ``` Why: - `extensions.worktreeConfig=true` enables per-worktree `user.name` - `core.hooksPath hooks` activates the attribution guard and README sync hook +- `just git-doctor` catches shared git config corruption before it bricks the + main checkout and sends helpers into retry loops ## Per-Agent Setup @@ -85,6 +88,7 @@ Both must start with `Operator & ...`. 1. Pull latest refs: ```sh +just git-doctor git fetch origin ``` @@ -117,6 +121,7 @@ git push --force-with-lease origin multitenant-zai 1. Fetch latest refs: ```sh +just git-doctor git fetch origin ``` @@ -207,6 +212,41 @@ Fix: - resolve any conflicts - push again +### Every git command says `fatal: this operation must be run in a work tree` + +Cause: + +- `core.worktree` or `core.bare` leaked into the shared git config instead of + staying in worktree-local config + +Check: + +```sh +just git-doctor +``` + +Fix: + +```sh +sh scripts/git-doctor.sh --fix +``` + +Then retry the original git command from the affected worktree. + +### Git helper spends huge tokens on a simple push/fetch request + +Cause: + +- jailed `git-admin` only has reliable local repository access +- remote push/fetch/pull work belongs in a host worktree, not in `git-worker` + +Rule: + +- use `git-admin` for local repo inspection and local branch operations +- use Codex or a helper worktree for remote branch push/fetch/rebase work +- if a controlplane task asks jailed `git-admin` to push or fetch, it should + fail fast instead of exploring the repo + ### `git rebase --continue` opens `vi` Cause: diff --git a/docs/internal/TOKEN-OPTIMIZATION.md b/docs/internal/TOKEN-OPTIMIZATION.md new file mode 100644 index 0000000..0040b36 --- /dev/null +++ b/docs/internal/TOKEN-OPTIMIZATION.md @@ -0,0 +1,81 @@ +# Token Optimization + +Plan to reduce LLM token spend across Clawdie agents and make the spend +explainable. + +## What We Found + +- Idle `sysadmin` heartbeats were reusing a long-lived Pi session and burning + about `86k` tokens per run. +- Fresh heartbeat-only Pi sessions plus compact carry-forward context reduced + the same path to about `12.5k` tokens. +- Jailed `git-admin` was spending `26k–31k` tokens rediscovering that remote + push/fetch/pull work is not possible from `git-worker`. +- Telegram `/model` overrides exist only for chat groups; controlplane + specialists still use the global default model path. +- Startup/status surfaces showed only configured defaults, not the actual model + Pi resolved to at runtime. + +## Phase 0: Fail Fast On Impossible Work + +Shipped: + +- `src/controlplane-heartbeat.ts` now blocks remote git tasks assigned to jailed + `git-admin` with + `remote_git_operation_not_supported_in_git_worker` +- `scripts/git-doctor.sh` guards shared worktree corruption (`core.worktree` + and `core.bare`) + +Rule: + +- jailed `git-admin` is for local repo work +- host worktrees are for remote branch push/fetch/rebase work + +## Phase 1: Observability + +Recommended next: + +1. Record `provider`, configured `model`, and resolved runtime model in + `agent_activity.payload` +2. Keep `/usage` as the budget view +3. Add `/tokens` as the runtime token analytics view: + - runs today + - tokens today + - avg / max tokens per run + - by event type + - by model + - configured vs actual drift warning +4. Add `Configured / Override / Actual` LLM lines to startup and status output + +## Phase 2: Role-Based Model Policy + +Recommended policy order: + +1. global default +2. role override for controlplane specialists +3. chat override for the user-facing Telegram path + +Recommended first override: + +- `sysadmin` idle heartbeat should use a cheaper, terse model than the main + operator path + +## Phase 3: Tighten Model Resolution + +The runtime showed a drift from configured `GLM-5.1` to actual `glm-5`. Before +changing more model policy, make model resolution explicit: + +1. exact match +2. case-fold match +3. only then fuzzy fallback, with a warning + +At minimum, if Pi resolves a different model than requested, we should record +that fact visibly. + +## Current Recommendations + +- keep fresh heartbeat-only sessions for idle `sysadmin` +- do not raise `sysadmin` budget yet +- prefer behavior fixes before budget increases +- move low-value repetitive specialist work off expensive reasoning defaults +- keep token and model observability separate from budget reporting diff --git a/justfile b/justfile index 576bbee..46e52c2 100644 --- a/justfile +++ b/justfile @@ -363,6 +363,9 @@ gen-changelog: install-hooks: git config core.hooksPath hooks +git-doctor: + sh scripts/git-doctor.sh + # Pull latest changes and rebuild [group("dev")] pull: diff --git a/scripts/git-doctor.sh b/scripts/git-doctor.sh new file mode 100755 index 0000000..a15a3ec --- /dev/null +++ b/scripts/git-doctor.sh @@ -0,0 +1,68 @@ +#!/bin/sh +# git-doctor: catch the shared config corruption that breaks the main +# checkout and sends agents into retry loops. + +set -eu + +FIX=0 +[ "${1:-}" = "--fix" ] && FIX=1 + +COMMON_DIR=$(git rev-parse --git-common-dir 2>/dev/null || true) +if [ -z "$COMMON_DIR" ]; then + echo "git-doctor: not inside a git repo" >&2 + exit 1 +fi + +SHARED_CONFIG="$COMMON_DIR/config" +if [ ! -f "$SHARED_CONFIG" ]; then + echo "git-doctor: $SHARED_CONFIG missing" >&2 + exit 1 +fi + +STRAY=$(git config --file "$SHARED_CONFIG" --get core.worktree 2>/dev/null || true) +FIXED=0 + +if [ -n "$STRAY" ]; then + echo "git-doctor: shared $SHARED_CONFIG has core.worktree = $STRAY" >&2 + echo " This breaks git commands from the main worktree." >&2 + if [ "$FIX" -eq 1 ]; then + git config --file "$SHARED_CONFIG" --unset core.worktree + echo "git-doctor: unset core.worktree from shared config" >&2 + FIXED=1 + else + echo " Fix: scripts/git-doctor.sh --fix" >&2 + exit 1 + fi +fi + +BARE=$(git config --file "$SHARED_CONFIG" --get core.bare 2>/dev/null || true) +if [ "$BARE" = "true" ] && [ -d "$COMMON_DIR/worktrees" ]; then + echo "git-doctor: shared $SHARED_CONFIG has core.bare = true" >&2 + echo " But linked worktrees exist under $COMMON_DIR/worktrees — this is wrong." >&2 + if [ "$FIX" -eq 1 ]; then + git config --file "$SHARED_CONFIG" core.bare false + echo "git-doctor: set core.bare = false in shared config" >&2 + FIXED=1 + else + echo " Fix: scripts/git-doctor.sh --fix" >&2 + exit 1 + fi +fi + +WT_DIR="$COMMON_DIR/worktrees" +if [ -d "$WT_DIR" ]; then + for wt in "$WT_DIR"/*; do + [ -d "$wt" ] || continue + [ -f "$wt/gitdir" ] || continue + target=$(sed -n '1p' "$wt/gitdir") + parent=$(dirname "$target") + if [ ! -d "$parent" ]; then + echo "git-doctor: linked worktree $(basename "$wt") points to missing $parent" >&2 + echo " Fix: git worktree prune" >&2 + exit 1 + fi + done +fi + +[ "$FIXED" -eq 1 ] && exit 2 +exit 0 diff --git a/src/controlplane-heartbeat.test.ts b/src/controlplane-heartbeat.test.ts index 23abbd8..c0360be 100644 --- a/src/controlplane-heartbeat.test.ts +++ b/src/controlplane-heartbeat.test.ts @@ -10,10 +10,12 @@ import { describe, it, expect, vi, beforeEach } from 'vitest'; import path from 'path'; import fs from 'fs'; +import * as controlplaneRunner from './controlplane-runner.js'; import { runAgentHeartbeat, type ControlplaneHeartbeatConfig, + requiresRemoteGitAccess, } from './controlplane-heartbeat.js'; import { createInspectableMockPool } from './test-helpers.js'; import type { Pool } from 'pg'; @@ -170,3 +172,86 @@ describe('runAgentHeartbeat — budget exhausted, real task pending', () => { expect(insertCalls.length).toBeGreaterThan(0); }); }); + +describe('runAgentHeartbeat — git-admin remote task fast-fail', () => { + it('fails fast for remote git operations in git-worker jail without spawning work', async () => { + const jailSpy = vi + .spyOn(controlplaneRunner, 'resolveAgentJail') + .mockReturnValue('mevy_git_worker'); + const querySpy = vi.fn().mockImplementation((sql: string) => { + const s = sql.trim().toUpperCase(); + if (s.includes('AGENT_BUDGETS') && s.startsWith('SELECT')) { + return Promise.resolve({ + rows: [{ + agent_id: 'git-admin', + daily_tokens: 50000, + spent_today: 0, + hard_limit_exceeded: false, + reset_at: new Date().toISOString(), + }], + rowCount: 1, + command: 'SELECT', + oid: 0, + fields: [], + }); + } + if (s.includes('TASKS') && s.startsWith('SELECT')) { + return Promise.resolve({ + rows: [{ + id: 'task-git-push', + title: 'Push multitenant branch to origin', + assigned_to: 'git-admin', + status: 'pending', + priority: 'high', + description: '', + context: null, + deadline: null, + created_at: new Date().toISOString(), + }], + rowCount: 1, + command: 'SELECT', + oid: 0, + fields: [], + }); + } + if (/UPDATE TASKS SET STATUS = 'IN_PROGRESS'/i.test(s)) { + return Promise.resolve({ rows: [], rowCount: 1, command: 'UPDATE', oid: 0, fields: [] }); + } + return Promise.resolve({ rows: [], rowCount: 1, command: 'UPDATE', oid: 0, fields: [] }); + }); + + const mockPool = { query: querySpy } as unknown as Pool; + const config = makeConfig({ pool: mockPool }); + + const result = await runAgentHeartbeat( + config, + 'git-admin', + 'assignment', + 'task-git-push', + ); + + expect(result.woke).toBe(false); + expect(result.reason).toBe('remote_git_operation_not_supported_in_git_worker'); + + const activityInserts = querySpy.mock.calls.filter(([sql]: [string]) => + /INSERT INTO agent_activity/i.test(sql), + ); + expect(activityInserts).toHaveLength(1); + + const failedUpdates = querySpy.mock.calls.filter(([sql]: [string]) => + /UPDATE tasks SET status = \$1 WHERE id = \$2/i.test(sql), + ); + expect(failedUpdates).toHaveLength(1); + + jailSpy.mockRestore(); + }); +}); + +describe('git-admin remote access guard', () => { + it('detects remote git tasks that should not run in git-worker', () => { + expect(requiresRemoteGitAccess('git-admin', 'Push multitenant branch to origin')).toBe(true); + expect(requiresRemoteGitAccess('git-admin', 'Fetch latest from Codeberg and sync mirror')).toBe(true); + expect(requiresRemoteGitAccess('git-admin', 'Create a local branch and show current HEAD')).toBe(false); + expect(requiresRemoteGitAccess('sysadmin', 'Push multitenant branch to origin')).toBe(false); + }); +}); diff --git a/src/controlplane-heartbeat.ts b/src/controlplane-heartbeat.ts index 61b60e9..239526e 100644 --- a/src/controlplane-heartbeat.ts +++ b/src/controlplane-heartbeat.ts @@ -88,6 +88,17 @@ export interface HeartbeatResult { error?: string; } +const REMOTE_GIT_TASK_RE = + /\b(push|pull|fetch|clone|mirror|sync|publish branch|publish to|remote branch|remote push|codeberg|origin\/|git@|https:\/\/codeberg\.org)\b/i; + +export function requiresRemoteGitAccess( + agentId: string, + taskDescription: string, +): boolean { + if (agentId !== 'git-admin') return false; + return REMOTE_GIT_TASK_RE.test(taskDescription); +} + function splitArgs(input: string): string[] { const args: string[] = []; let current = ''; @@ -325,6 +336,27 @@ export async function runAgentHeartbeat( const taskDescription = task?.title ?? idlePrompt ?? `Heartbeat check for ${agentId}`; + const effectiveTaskId = task?.id ?? 'HEARTBEAT'; + + if ( + task && + resolveAgentJail(agentId) && + requiresRemoteGitAccess(agentId, taskDescription) + ) { + const reason = 'remote_git_operation_not_supported_in_git_worker'; + await insertActivity(pool, { + agent_id: agentId, + event_type: 'error', + payload: { + task_id: effectiveTaskId, + error: reason, + action_taken: 'blocked_fast_fail', + }, + }); + await updateTaskStatus(pool, task.id, 'failed'); + return { agentId, woke: false, reason }; + } + const skills = loadSkillsCatalog(); const match = matchTaskToSkill(taskDescription, skills); @@ -335,7 +367,6 @@ export async function runAgentHeartbeat( // Only load session context for real tasks, not idle heartbeats. // Session context grows over time and inflates token usage each tick. - const effectiveTaskId = task?.id ?? 'HEARTBEAT'; const promptWithContext = task ? (() => { const session = loadSession(sessionCwd, agentId, workspaceCwd);