fix(controlplane): stop git push token burn in jail
--- Build: FAIL | Tests: FAIL
This commit is contained in:
parent
9a7cb820b2
commit
0d9ad52922
7 changed files with 316 additions and 1 deletions
|
|
@ -6,6 +6,7 @@ Short operator checklist for the per-agent worktree model.
|
|||
|
||||
- verify `git config core.hooksPath` returns `hooks`
|
||||
- verify `git config extensions.worktreeConfig` is `true`
|
||||
- run `just git-doctor`
|
||||
- confirm the primary FreeBSD checkout is the only place that will push
|
||||
`multitenant`
|
||||
|
||||
|
|
@ -32,6 +33,7 @@ Short operator checklist for the per-agent worktree model.
|
|||
- rebase on latest `origin/multitenant`
|
||||
- if local commits were rewritten, push with `--force-with-lease`
|
||||
- report the feature branch name and tip commit hash
|
||||
- do not ask jailed `git-admin` to push/fetch/pull remote branches
|
||||
|
||||
## Before Codex Integrates
|
||||
|
||||
|
|
@ -57,3 +59,8 @@ Short operator checklist for the per-agent worktree model.
|
|||
- fetch and rebase before retrying
|
||||
- rebase asks for an editor:
|
||||
- use `GIT_EDITOR=true git rebase --continue`
|
||||
- every git command says `must be run in a work tree`:
|
||||
- run `just git-doctor`
|
||||
- if broken, run `sh scripts/git-doctor.sh --fix`
|
||||
- git-admin push/fetch task burns tokens:
|
||||
- remote git work belongs in a host worktree, not in `git-worker`
|
||||
|
|
|
|||
|
|
@ -42,12 +42,15 @@ Run once in the primary clone:
|
|||
```sh
|
||||
git config extensions.worktreeConfig true
|
||||
git config core.hooksPath hooks
|
||||
just git-doctor
|
||||
```
|
||||
|
||||
Why:
|
||||
|
||||
- `extensions.worktreeConfig=true` enables per-worktree `user.name`
|
||||
- `core.hooksPath hooks` activates the attribution guard and README sync hook
|
||||
- `just git-doctor` catches shared git config corruption before it bricks the
|
||||
main checkout and sends helpers into retry loops
|
||||
|
||||
## Per-Agent Setup
|
||||
|
||||
|
|
@ -85,6 +88,7 @@ Both must start with `Operator & ...`.
|
|||
1. Pull latest refs:
|
||||
|
||||
```sh
|
||||
just git-doctor
|
||||
git fetch origin
|
||||
```
|
||||
|
||||
|
|
@ -117,6 +121,7 @@ git push --force-with-lease origin multitenant-zai
|
|||
1. Fetch latest refs:
|
||||
|
||||
```sh
|
||||
just git-doctor
|
||||
git fetch origin
|
||||
```
|
||||
|
||||
|
|
@ -207,6 +212,41 @@ Fix:
|
|||
- resolve any conflicts
|
||||
- push again
|
||||
|
||||
### Every git command says `fatal: this operation must be run in a work tree`
|
||||
|
||||
Cause:
|
||||
|
||||
- `core.worktree` or `core.bare` leaked into the shared git config instead of
|
||||
staying in worktree-local config
|
||||
|
||||
Check:
|
||||
|
||||
```sh
|
||||
just git-doctor
|
||||
```
|
||||
|
||||
Fix:
|
||||
|
||||
```sh
|
||||
sh scripts/git-doctor.sh --fix
|
||||
```
|
||||
|
||||
Then retry the original git command from the affected worktree.
|
||||
|
||||
### Git helper spends huge tokens on a simple push/fetch request
|
||||
|
||||
Cause:
|
||||
|
||||
- jailed `git-admin` only has reliable local repository access
|
||||
- remote push/fetch/pull work belongs in a host worktree, not in `git-worker`
|
||||
|
||||
Rule:
|
||||
|
||||
- use `git-admin` for local repo inspection and local branch operations
|
||||
- use Codex or a helper worktree for remote branch push/fetch/rebase work
|
||||
- if a controlplane task asks jailed `git-admin` to push or fetch, it should
|
||||
fail fast instead of exploring the repo
|
||||
|
||||
### `git rebase --continue` opens `vi`
|
||||
|
||||
Cause:
|
||||
|
|
|
|||
81
docs/internal/TOKEN-OPTIMIZATION.md
Normal file
81
docs/internal/TOKEN-OPTIMIZATION.md
Normal file
|
|
@ -0,0 +1,81 @@
|
|||
# Token Optimization
|
||||
|
||||
Plan to reduce LLM token spend across Clawdie agents and make the spend
|
||||
explainable.
|
||||
|
||||
## What We Found
|
||||
|
||||
- Idle `sysadmin` heartbeats were reusing a long-lived Pi session and burning
|
||||
about `86k` tokens per run.
|
||||
- Fresh heartbeat-only Pi sessions plus compact carry-forward context reduced
|
||||
the same path to about `12.5k` tokens.
|
||||
- Jailed `git-admin` was spending `26k–31k` tokens rediscovering that remote
|
||||
push/fetch/pull work is not possible from `git-worker`.
|
||||
- Telegram `/model` overrides exist only for chat groups; controlplane
|
||||
specialists still use the global default model path.
|
||||
- Startup/status surfaces showed only configured defaults, not the actual model
|
||||
Pi resolved to at runtime.
|
||||
|
||||
## Phase 0: Fail Fast On Impossible Work
|
||||
|
||||
Shipped:
|
||||
|
||||
- `src/controlplane-heartbeat.ts` now blocks remote git tasks assigned to jailed
|
||||
`git-admin` with
|
||||
`remote_git_operation_not_supported_in_git_worker`
|
||||
- `scripts/git-doctor.sh` guards shared worktree corruption (`core.worktree`
|
||||
and `core.bare`)
|
||||
|
||||
Rule:
|
||||
|
||||
- jailed `git-admin` is for local repo work
|
||||
- host worktrees are for remote branch push/fetch/rebase work
|
||||
|
||||
## Phase 1: Observability
|
||||
|
||||
Recommended next:
|
||||
|
||||
1. Record `provider`, configured `model`, and resolved runtime model in
|
||||
`agent_activity.payload`
|
||||
2. Keep `/usage` as the budget view
|
||||
3. Add `/tokens` as the runtime token analytics view:
|
||||
- runs today
|
||||
- tokens today
|
||||
- avg / max tokens per run
|
||||
- by event type
|
||||
- by model
|
||||
- configured vs actual drift warning
|
||||
4. Add `Configured / Override / Actual` LLM lines to startup and status output
|
||||
|
||||
## Phase 2: Role-Based Model Policy
|
||||
|
||||
Recommended policy order:
|
||||
|
||||
1. global default
|
||||
2. role override for controlplane specialists
|
||||
3. chat override for the user-facing Telegram path
|
||||
|
||||
Recommended first override:
|
||||
|
||||
- `sysadmin` idle heartbeat should use a cheaper, terse model than the main
|
||||
operator path
|
||||
|
||||
## Phase 3: Tighten Model Resolution
|
||||
|
||||
The runtime showed a drift from configured `GLM-5.1` to actual `glm-5`. Before
|
||||
changing more model policy, make model resolution explicit:
|
||||
|
||||
1. exact match
|
||||
2. case-fold match
|
||||
3. only then fuzzy fallback, with a warning
|
||||
|
||||
At minimum, if Pi resolves a different model than requested, we should record
|
||||
that fact visibly.
|
||||
|
||||
## Current Recommendations
|
||||
|
||||
- keep fresh heartbeat-only sessions for idle `sysadmin`
|
||||
- do not raise `sysadmin` budget yet
|
||||
- prefer behavior fixes before budget increases
|
||||
- move low-value repetitive specialist work off expensive reasoning defaults
|
||||
- keep token and model observability separate from budget reporting
|
||||
3
justfile
3
justfile
|
|
@ -363,6 +363,9 @@ gen-changelog:
|
|||
install-hooks:
|
||||
git config core.hooksPath hooks
|
||||
|
||||
git-doctor:
|
||||
sh scripts/git-doctor.sh
|
||||
|
||||
# Pull latest changes and rebuild
|
||||
[group("dev")]
|
||||
pull:
|
||||
|
|
|
|||
68
scripts/git-doctor.sh
Executable file
68
scripts/git-doctor.sh
Executable file
|
|
@ -0,0 +1,68 @@
|
|||
#!/bin/sh
|
||||
# git-doctor: catch the shared config corruption that breaks the main
|
||||
# checkout and sends agents into retry loops.
|
||||
|
||||
set -eu
|
||||
|
||||
FIX=0
|
||||
[ "${1:-}" = "--fix" ] && FIX=1
|
||||
|
||||
COMMON_DIR=$(git rev-parse --git-common-dir 2>/dev/null || true)
|
||||
if [ -z "$COMMON_DIR" ]; then
|
||||
echo "git-doctor: not inside a git repo" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
SHARED_CONFIG="$COMMON_DIR/config"
|
||||
if [ ! -f "$SHARED_CONFIG" ]; then
|
||||
echo "git-doctor: $SHARED_CONFIG missing" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
STRAY=$(git config --file "$SHARED_CONFIG" --get core.worktree 2>/dev/null || true)
|
||||
FIXED=0
|
||||
|
||||
if [ -n "$STRAY" ]; then
|
||||
echo "git-doctor: shared $SHARED_CONFIG has core.worktree = $STRAY" >&2
|
||||
echo " This breaks git commands from the main worktree." >&2
|
||||
if [ "$FIX" -eq 1 ]; then
|
||||
git config --file "$SHARED_CONFIG" --unset core.worktree
|
||||
echo "git-doctor: unset core.worktree from shared config" >&2
|
||||
FIXED=1
|
||||
else
|
||||
echo " Fix: scripts/git-doctor.sh --fix" >&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
BARE=$(git config --file "$SHARED_CONFIG" --get core.bare 2>/dev/null || true)
|
||||
if [ "$BARE" = "true" ] && [ -d "$COMMON_DIR/worktrees" ]; then
|
||||
echo "git-doctor: shared $SHARED_CONFIG has core.bare = true" >&2
|
||||
echo " But linked worktrees exist under $COMMON_DIR/worktrees — this is wrong." >&2
|
||||
if [ "$FIX" -eq 1 ]; then
|
||||
git config --file "$SHARED_CONFIG" core.bare false
|
||||
echo "git-doctor: set core.bare = false in shared config" >&2
|
||||
FIXED=1
|
||||
else
|
||||
echo " Fix: scripts/git-doctor.sh --fix" >&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
WT_DIR="$COMMON_DIR/worktrees"
|
||||
if [ -d "$WT_DIR" ]; then
|
||||
for wt in "$WT_DIR"/*; do
|
||||
[ -d "$wt" ] || continue
|
||||
[ -f "$wt/gitdir" ] || continue
|
||||
target=$(sed -n '1p' "$wt/gitdir")
|
||||
parent=$(dirname "$target")
|
||||
if [ ! -d "$parent" ]; then
|
||||
echo "git-doctor: linked worktree $(basename "$wt") points to missing $parent" >&2
|
||||
echo " Fix: git worktree prune" >&2
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
[ "$FIXED" -eq 1 ] && exit 2
|
||||
exit 0
|
||||
|
|
@ -10,10 +10,12 @@
|
|||
import { describe, it, expect, vi, beforeEach } from 'vitest';
|
||||
import path from 'path';
|
||||
import fs from 'fs';
|
||||
import * as controlplaneRunner from './controlplane-runner.js';
|
||||
|
||||
import {
|
||||
runAgentHeartbeat,
|
||||
type ControlplaneHeartbeatConfig,
|
||||
requiresRemoteGitAccess,
|
||||
} from './controlplane-heartbeat.js';
|
||||
import { createInspectableMockPool } from './test-helpers.js';
|
||||
import type { Pool } from 'pg';
|
||||
|
|
@ -170,3 +172,86 @@ describe('runAgentHeartbeat — budget exhausted, real task pending', () => {
|
|||
expect(insertCalls.length).toBeGreaterThan(0);
|
||||
});
|
||||
});
|
||||
|
||||
describe('runAgentHeartbeat — git-admin remote task fast-fail', () => {
|
||||
it('fails fast for remote git operations in git-worker jail without spawning work', async () => {
|
||||
const jailSpy = vi
|
||||
.spyOn(controlplaneRunner, 'resolveAgentJail')
|
||||
.mockReturnValue('mevy_git_worker');
|
||||
const querySpy = vi.fn().mockImplementation((sql: string) => {
|
||||
const s = sql.trim().toUpperCase();
|
||||
if (s.includes('AGENT_BUDGETS') && s.startsWith('SELECT')) {
|
||||
return Promise.resolve({
|
||||
rows: [{
|
||||
agent_id: 'git-admin',
|
||||
daily_tokens: 50000,
|
||||
spent_today: 0,
|
||||
hard_limit_exceeded: false,
|
||||
reset_at: new Date().toISOString(),
|
||||
}],
|
||||
rowCount: 1,
|
||||
command: 'SELECT',
|
||||
oid: 0,
|
||||
fields: [],
|
||||
});
|
||||
}
|
||||
if (s.includes('TASKS') && s.startsWith('SELECT')) {
|
||||
return Promise.resolve({
|
||||
rows: [{
|
||||
id: 'task-git-push',
|
||||
title: 'Push multitenant branch to origin',
|
||||
assigned_to: 'git-admin',
|
||||
status: 'pending',
|
||||
priority: 'high',
|
||||
description: '',
|
||||
context: null,
|
||||
deadline: null,
|
||||
created_at: new Date().toISOString(),
|
||||
}],
|
||||
rowCount: 1,
|
||||
command: 'SELECT',
|
||||
oid: 0,
|
||||
fields: [],
|
||||
});
|
||||
}
|
||||
if (/UPDATE TASKS SET STATUS = 'IN_PROGRESS'/i.test(s)) {
|
||||
return Promise.resolve({ rows: [], rowCount: 1, command: 'UPDATE', oid: 0, fields: [] });
|
||||
}
|
||||
return Promise.resolve({ rows: [], rowCount: 1, command: 'UPDATE', oid: 0, fields: [] });
|
||||
});
|
||||
|
||||
const mockPool = { query: querySpy } as unknown as Pool;
|
||||
const config = makeConfig({ pool: mockPool });
|
||||
|
||||
const result = await runAgentHeartbeat(
|
||||
config,
|
||||
'git-admin',
|
||||
'assignment',
|
||||
'task-git-push',
|
||||
);
|
||||
|
||||
expect(result.woke).toBe(false);
|
||||
expect(result.reason).toBe('remote_git_operation_not_supported_in_git_worker');
|
||||
|
||||
const activityInserts = querySpy.mock.calls.filter(([sql]: [string]) =>
|
||||
/INSERT INTO agent_activity/i.test(sql),
|
||||
);
|
||||
expect(activityInserts).toHaveLength(1);
|
||||
|
||||
const failedUpdates = querySpy.mock.calls.filter(([sql]: [string]) =>
|
||||
/UPDATE tasks SET status = \$1 WHERE id = \$2/i.test(sql),
|
||||
);
|
||||
expect(failedUpdates).toHaveLength(1);
|
||||
|
||||
jailSpy.mockRestore();
|
||||
});
|
||||
});
|
||||
|
||||
describe('git-admin remote access guard', () => {
|
||||
it('detects remote git tasks that should not run in git-worker', () => {
|
||||
expect(requiresRemoteGitAccess('git-admin', 'Push multitenant branch to origin')).toBe(true);
|
||||
expect(requiresRemoteGitAccess('git-admin', 'Fetch latest from Codeberg and sync mirror')).toBe(true);
|
||||
expect(requiresRemoteGitAccess('git-admin', 'Create a local branch and show current HEAD')).toBe(false);
|
||||
expect(requiresRemoteGitAccess('sysadmin', 'Push multitenant branch to origin')).toBe(false);
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -88,6 +88,17 @@ export interface HeartbeatResult {
|
|||
error?: string;
|
||||
}
|
||||
|
||||
const REMOTE_GIT_TASK_RE =
|
||||
/\b(push|pull|fetch|clone|mirror|sync|publish branch|publish to|remote branch|remote push|codeberg|origin\/|git@|https:\/\/codeberg\.org)\b/i;
|
||||
|
||||
export function requiresRemoteGitAccess(
|
||||
agentId: string,
|
||||
taskDescription: string,
|
||||
): boolean {
|
||||
if (agentId !== 'git-admin') return false;
|
||||
return REMOTE_GIT_TASK_RE.test(taskDescription);
|
||||
}
|
||||
|
||||
function splitArgs(input: string): string[] {
|
||||
const args: string[] = [];
|
||||
let current = '';
|
||||
|
|
@ -325,6 +336,27 @@ export async function runAgentHeartbeat(
|
|||
|
||||
const taskDescription =
|
||||
task?.title ?? idlePrompt ?? `Heartbeat check for ${agentId}`;
|
||||
const effectiveTaskId = task?.id ?? 'HEARTBEAT';
|
||||
|
||||
if (
|
||||
task &&
|
||||
resolveAgentJail(agentId) &&
|
||||
requiresRemoteGitAccess(agentId, taskDescription)
|
||||
) {
|
||||
const reason = 'remote_git_operation_not_supported_in_git_worker';
|
||||
await insertActivity(pool, {
|
||||
agent_id: agentId,
|
||||
event_type: 'error',
|
||||
payload: {
|
||||
task_id: effectiveTaskId,
|
||||
error: reason,
|
||||
action_taken: 'blocked_fast_fail',
|
||||
},
|
||||
});
|
||||
await updateTaskStatus(pool, task.id, 'failed');
|
||||
return { agentId, woke: false, reason };
|
||||
}
|
||||
|
||||
const skills = loadSkillsCatalog();
|
||||
const match = matchTaskToSkill(taskDescription, skills);
|
||||
|
||||
|
|
@ -335,7 +367,6 @@ export async function runAgentHeartbeat(
|
|||
|
||||
// Only load session context for real tasks, not idle heartbeats.
|
||||
// Session context grows over time and inflates token usage each tick.
|
||||
const effectiveTaskId = task?.id ?? 'HEARTBEAT';
|
||||
const promptWithContext = task
|
||||
? (() => {
|
||||
const session = loadSession(sessionCwd, agentId, workspaceCwd);
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue