fix: idle heartbeat runs on 2h cadence, budget noise silenced
Two problems found from activity log analysis:

1. The idle maintenance prompt fired every 30s (same as the task pickup tick). With a 50k daily budget and 15-22k tokens/run, sysadmin exhausted its budget after 3 runs, then logged a budget error every 30s for the rest of the day (160+ noise rows in 2h).
2. Budget-exceeded errors were logged for idle ticks even when no task was being blocked — only the tick cadence was the issue.

Fixes:

- Add agentIdleIntervalMs to ControlplaneHeartbeatConfig (default 2h via AGENT_HEARTBEAT_IDLE_INTERVAL_MS). The 30s tick still handles task pickup; idle prompts only fire when the interval has elapsed. Attempt time is recorded before running so budget exhaustion doesn't cause a retry every 30s for the rest of the day.
- Reorder runAgentHeartbeat: fetch tasks before the budget check. Only log budget-exceeded activity when a real assigned task is being blocked. Idle ticks with no task return silently.

Tests: cover an idle tick with no log on budget exhaustion, and confirm the error IS logged when a real task is blocked.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
7d3ce4d97c
commit
3d2442eea3
5 changed files with 158 additions and 13 deletions
|
|
@ -291,3 +291,8 @@ TELEGRAM_OPS_CHAT_ID=
|
|||
# AGENT_COMPACTION_PROVIDER=zai
|
||||
# AGENT_COMPACTION_MODEL=glm-4.6
|
||||
|
||||
# Idle maintenance interval for heartbeat-enabled agents in ms (default: 7200000 = 2h).
|
||||
# Controls how often agents with no pending tasks receive a maintenance idle prompt.
|
||||
# Separate from the 30s task pickup tick.
|
||||
# AGENT_HEARTBEAT_IDLE_INTERVAL_MS=7200000
|
||||
|
||||
|
|
|
|||
|
|
@ -143,6 +143,8 @@ const envConfig = readEnvFile([
|
|||
'AGENT_SESSION_COMPACT_TIMEOUT_MS',
|
||||
'AGENT_SESSION_COMPACT_KEEP_TURNS',
|
||||
'AGENT_SESSION_COMPACT_MIN_ENTRIES',
|
||||
// Controlplane heartbeat
|
||||
'AGENT_HEARTBEAT_IDLE_INTERVAL_MS',
|
||||
// Vision (optional helper model for OCR/screenshot reading)
|
||||
'VISION_PROVIDER',
|
||||
'VISION_MODEL',
|
||||
|
|
@ -328,6 +330,14 @@ export const AGENT_CHAT_DAILY_TOKEN_LIMIT =
|
|||
process.env.AGENT_CHAT_DAILY_TOKEN_LIMIT ||
|
||||
envConfig.AGENT_CHAT_DAILY_TOKEN_LIMIT,
|
||||
) ?? 0;
|
||||
// How long between idle maintenance runs for heartbeat-enabled agents (ms).
|
||||
// Controls how often agents with no pending tasks receive an idle prompt.
|
||||
// Separate from tickIntervalMs (task pickup frequency).
|
||||
export const AGENT_HEARTBEAT_IDLE_INTERVAL_MS =
|
||||
parseOptionalInt(
|
||||
process.env.AGENT_HEARTBEAT_IDLE_INTERVAL_MS ||
|
||||
envConfig.AGENT_HEARTBEAT_IDLE_INTERVAL_MS,
|
||||
) ?? 7_200_000; // 2 hours
|
||||
export const AGENT_MAX_INBOUND_CHARS =
|
||||
parseOptionalInt(
|
||||
process.env.AGENT_MAX_INBOUND_CHARS || envConfig.AGENT_MAX_INBOUND_CHARS,
|
||||
|
|
|
|||
|
|
@ -35,6 +35,7 @@ function makeConfig(
|
|||
apiKey: 'test-key',
|
||||
agentName: 'clawdie',
|
||||
tickIntervalMs: 60000,
|
||||
agentIdleIntervalMs: 7_200_000,
|
||||
maxSessionEntries: 50,
|
||||
...overrides,
|
||||
};
|
||||
|
|
@ -61,3 +62,111 @@ describe('runAgentHeartbeat — no task', () => {
|
|||
expect(result.woke).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe('runAgentHeartbeat — budget exhausted, no task (idle tick)', () => {
|
||||
it('returns woke:false with budget reason and does NOT write to activity log', async () => {
|
||||
// Pool returns hard_limit_exceeded budget and no tasks
|
||||
const querySpy = vi.fn().mockImplementation((sql: string) => {
|
||||
const s = sql.trim().toUpperCase();
|
||||
// Budget query: SELECT * FROM agent_budgets WHERE agent_id = $1
|
||||
if (s.includes('AGENT_BUDGETS') && s.startsWith('SELECT')) {
|
||||
return Promise.resolve({
|
||||
rows: [{
|
||||
agent_id: 'sysadmin',
|
||||
daily_tokens: 50000,
|
||||
spent_today: 50000,
|
||||
hard_limit_exceeded: true,
|
||||
reset_at: new Date(Date.now() - 3600000).toISOString(),
|
||||
}],
|
||||
rowCount: 1,
|
||||
command: 'SELECT',
|
||||
oid: 0,
|
||||
fields: [],
|
||||
});
|
||||
}
|
||||
// Tasks query: return empty (no pending tasks)
|
||||
if (s.includes('AGENT_TASKS') || s.includes('TASKS')) {
|
||||
return Promise.resolve({ rows: [], rowCount: 0, command: 'SELECT', oid: 0, fields: [] });
|
||||
}
|
||||
return Promise.resolve({ rows: [], rowCount: 0, command: '', oid: 0, fields: [] });
|
||||
});
|
||||
|
||||
const mockPool = { query: querySpy } as unknown as Pool;
|
||||
const config = makeConfig({ pool: mockPool });
|
||||
|
||||
const result = await runAgentHeartbeat(
|
||||
config,
|
||||
'sysadmin',
|
||||
'interval_elapsed',
|
||||
undefined,
|
||||
'Run your scheduled maintenance check.',
|
||||
);
|
||||
|
||||
expect(result.woke).toBe(false);
|
||||
expect(result.reason).toBe('hard_limit_exceeded');
|
||||
|
||||
// Must NOT have written an activity INSERT (no task to block)
|
||||
const insertCalls = querySpy.mock.calls.filter(([sql]: [string]) =>
|
||||
/INSERT INTO agent_activity/i.test(sql),
|
||||
);
|
||||
expect(insertCalls).toHaveLength(0);
|
||||
});
|
||||
});
|
||||
|
||||
describe('runAgentHeartbeat — budget exhausted, real task pending', () => {
|
||||
it('returns woke:false and DOES write activity error when a task is blocked', async () => {
|
||||
const querySpy = vi.fn().mockImplementation((sql: string) => {
|
||||
const s = sql.trim().toUpperCase();
|
||||
if (s.includes('AGENT_BUDGETS') && s.startsWith('SELECT')) {
|
||||
return Promise.resolve({
|
||||
rows: [{
|
||||
agent_id: 'sysadmin',
|
||||
daily_tokens: 50000,
|
||||
spent_today: 50000,
|
||||
hard_limit_exceeded: true,
|
||||
reset_at: new Date(Date.now() - 3600000).toISOString(),
|
||||
}],
|
||||
rowCount: 1,
|
||||
command: 'SELECT',
|
||||
oid: 0,
|
||||
fields: [],
|
||||
});
|
||||
}
|
||||
// Tasks query: return a pending task
|
||||
if (s.includes('TASKS') && s.startsWith('SELECT')) {
|
||||
return Promise.resolve({
|
||||
rows: [{
|
||||
id: 'task-123',
|
||||
title: 'Check jail status',
|
||||
assigned_to: 'sysadmin',
|
||||
status: 'pending',
|
||||
priority: 'normal',
|
||||
description: '',
|
||||
context: null,
|
||||
deadline: null,
|
||||
created_at: new Date().toISOString(),
|
||||
}],
|
||||
rowCount: 1,
|
||||
command: 'SELECT',
|
||||
oid: 0,
|
||||
fields: [],
|
||||
});
|
||||
}
|
||||
return Promise.resolve({ rows: [], rowCount: 0, command: '', oid: 0, fields: [] });
|
||||
});
|
||||
|
||||
const mockPool = { query: querySpy } as unknown as Pool;
|
||||
const config = makeConfig({ pool: mockPool });
|
||||
|
||||
const result = await runAgentHeartbeat(config, 'sysadmin', 'assignment', 'task-123');
|
||||
|
||||
expect(result.woke).toBe(false);
|
||||
expect(result.reason).toBe('hard_limit_exceeded');
|
||||
|
||||
// MUST have written an activity INSERT because a real task was blocked
|
||||
const insertCalls = querySpy.mock.calls.filter(([sql]: [string]) =>
|
||||
/INSERT INTO agent_activity/i.test(sql),
|
||||
);
|
||||
expect(insertCalls.length).toBeGreaterThan(0);
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -73,6 +73,8 @@ export interface ControlplaneHeartbeatConfig {
|
|||
apiKey: string;
|
||||
agentName: string;
|
||||
tickIntervalMs: number;
|
||||
/** Minimum ms between idle maintenance runs per agent. Separate from tickIntervalMs. */
|
||||
agentIdleIntervalMs: number;
|
||||
maxSessionEntries: number;
|
||||
sendMessage?: (jid: string, text: string) => Promise<void>;
|
||||
}
|
||||
|
|
@ -274,25 +276,31 @@ export async function runAgentHeartbeat(
|
|||
): Promise<HeartbeatResult> {
|
||||
const { pool, workspaceCwd, sessionCwd, apiKey } = config;
|
||||
|
||||
const budget = await checkBudget(pool, agentId, 500);
|
||||
if (!budget.allowed) {
|
||||
await insertActivity(pool, {
|
||||
agent_id: agentId,
|
||||
event_type: 'error',
|
||||
payload: { reason: budget.reason, wake_blocked: true },
|
||||
});
|
||||
return { agentId, woke: false, reason: budget.reason };
|
||||
}
|
||||
|
||||
// Fetch tasks first so we know whether there is real work before deciding
|
||||
// whether to log a budget-exceeded error (idle ticks should not spam activity).
|
||||
const tasks = await getTasksByRole(pool, agentId);
|
||||
const task = taskId ? tasks.find((t) => t.id === taskId) : tasks[0];
|
||||
|
||||
// Skip heartbeat if no pending tasks and this is an interval tick,
|
||||
// unless an idlePrompt is provided (e.g. scheduled maintenance agents).
|
||||
// Skip early: no pending task, interval tick, no idle prompt → silent return.
|
||||
if (!task && wakeReason === 'interval_elapsed' && !idlePrompt) {
|
||||
return { agentId, woke: false, reason: 'no_pending_tasks' };
|
||||
}
|
||||
|
||||
const hasRealTask = !!task;
|
||||
const budget = await checkBudget(pool, agentId, 500);
|
||||
if (!budget.allowed) {
|
||||
// Only log to activity when a real assigned task is being blocked.
|
||||
// Idle maintenance ticks silently skip to avoid flooding the activity log.
|
||||
if (hasRealTask) {
|
||||
await insertActivity(pool, {
|
||||
agent_id: agentId,
|
||||
event_type: 'error',
|
||||
payload: { reason: budget.reason, wake_blocked: true },
|
||||
});
|
||||
}
|
||||
return { agentId, woke: false, reason: budget.reason };
|
||||
}
|
||||
|
||||
if (!task && wakeReason === 'assignment') {
|
||||
return { agentId, woke: false, reason: 'no_task_found' };
|
||||
}
|
||||
|
|
@ -549,6 +557,9 @@ export function startControlplaneHeartbeatLoop(
|
|||
}
|
||||
|
||||
let lastBudgetResetCheck = 0;
|
||||
// Track when each agent last received an idle maintenance prompt (ms timestamp).
|
||||
// Prevents idle prompts from firing every tick — they run on agentIdleIntervalMs cadence.
|
||||
const lastIdleRunAt = new Map<string, number>();
|
||||
|
||||
const loop = async () => {
|
||||
try {
|
||||
|
|
@ -577,12 +588,20 @@ export function startControlplaneHeartbeatLoop(
|
|||
for (const agent of agents) {
|
||||
if (!agent.heartbeat_enabled) continue;
|
||||
try {
|
||||
const lastIdle = lastIdleRunAt.get(agent.id) ?? 0;
|
||||
const idleReady = now - lastIdle >= config.agentIdleIntervalMs;
|
||||
const idlePrompt = idleReady
|
||||
? 'Run your scheduled maintenance check.'
|
||||
: undefined;
|
||||
// Record attempt time before running so budget exhaustion doesn't
|
||||
// cause a retry every 30s for the rest of the day.
|
||||
if (idleReady) lastIdleRunAt.set(agent.id, now);
|
||||
await runAgentHeartbeat(
|
||||
config,
|
||||
agent.id,
|
||||
'interval_elapsed',
|
||||
undefined,
|
||||
'Run your scheduled maintenance check.',
|
||||
idlePrompt,
|
||||
);
|
||||
} catch (err) {
|
||||
logger.error(
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@ import {
|
|||
AGENT_BUDGET_PAUSE_PCT,
|
||||
AGENT_BUDGET_PAUSE_NOTICE_COOLDOWN_SEC,
|
||||
AGENT_CHAT_DAILY_TOKEN_LIMIT,
|
||||
AGENT_HEARTBEAT_IDLE_INTERVAL_MS,
|
||||
TELEGRAM_OPS_CHAT_ID,
|
||||
WATCHDOG_MODE,
|
||||
} from './config.js';
|
||||
|
|
@ -1075,6 +1076,7 @@ async function main(): Promise<void> {
|
|||
apiKey: CONTROLPLANE_SHARED_SECRET || OPENAI_API_KEY,
|
||||
agentName: AGENT_NAME,
|
||||
tickIntervalMs: 30000,
|
||||
agentIdleIntervalMs: AGENT_HEARTBEAT_IDLE_INTERVAL_MS,
|
||||
maxSessionEntries: 100,
|
||||
sendMessage: async (jid, rawText) => {
|
||||
const channel = findChannel(channels, jid);
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue