fix: idle heartbeat runs on 2h cadence, budget noise silenced

Two problems found from activity log analysis:

1. Idle maintenance prompt fired every 30s (same as task pickup tick).
   With a 50k daily budget and 15-22k tokens/run, sysadmin exhausted its
   budget after 3 runs, then logged a budget error every 30s for the
   rest of the day (160+ noise rows in 2h).

2. Budget-exceeded errors were logged for idle ticks even when no task
   was being blocked, so the activity log filled with errors caused
   purely by the tick cadence rather than by any blocked work.

Fixes:
- Add agentIdleIntervalMs to ControlplaneHeartbeatConfig (default 2h
  via AGENT_HEARTBEAT_IDLE_INTERVAL_MS). The 30s tick still handles
  task pickup; idle prompts only fire when the interval has elapsed.
  Attempt time is recorded before running so budget exhaustion doesn't
  cause a retry every 30s for the rest of the day.
- Reorder runAgentHeartbeat: fetch tasks before budget check. Only log
  budget-exceeded activity when a real assigned task is being blocked.
  Idle ticks with no task return silently.

Tests: cover an idle tick with no activity log entry on budget
exhaustion, and confirm the error IS logged when a real task is blocked.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mevy Assistant 2026-04-21 11:18:50 +02:00
parent 7d3ce4d97c
commit 3d2442eea3
5 changed files with 158 additions and 13 deletions

View file

@ -291,3 +291,8 @@ TELEGRAM_OPS_CHAT_ID=
# AGENT_COMPACTION_PROVIDER=zai
# AGENT_COMPACTION_MODEL=glm-4.6
# Idle maintenance interval for heartbeat-enabled agents in ms (default: 7200000 = 2h).
# Controls how often agents with no pending tasks receive a maintenance idle prompt.
# Separate from the 30s task pickup tick.
# AGENT_HEARTBEAT_IDLE_INTERVAL_MS=7200000

View file

@ -143,6 +143,8 @@ const envConfig = readEnvFile([
'AGENT_SESSION_COMPACT_TIMEOUT_MS',
'AGENT_SESSION_COMPACT_KEEP_TURNS',
'AGENT_SESSION_COMPACT_MIN_ENTRIES',
// Controlplane heartbeat
'AGENT_HEARTBEAT_IDLE_INTERVAL_MS',
// Vision (optional helper model for OCR/screenshot reading)
'VISION_PROVIDER',
'VISION_MODEL',
@ -328,6 +330,14 @@ export const AGENT_CHAT_DAILY_TOKEN_LIMIT =
process.env.AGENT_CHAT_DAILY_TOKEN_LIMIT ||
envConfig.AGENT_CHAT_DAILY_TOKEN_LIMIT,
) ?? 0;
// Minimum gap (ms) between idle maintenance prompts for heartbeat-enabled
// agents, i.e. how often an agent with no pending tasks is asked to run its
// maintenance check. Independent of tickIntervalMs (task pickup frequency).
// A non-empty process.env value takes precedence over the env file; when
// parseOptionalInt yields no value the interval falls back to 2 hours.
export const AGENT_HEARTBEAT_IDLE_INTERVAL_MS =
  parseOptionalInt(
    process.env.AGENT_HEARTBEAT_IDLE_INTERVAL_MS ||
      envConfig.AGENT_HEARTBEAT_IDLE_INTERVAL_MS,
  ) ?? 2 * 60 * 60 * 1000; // 7_200_000 ms
export const AGENT_MAX_INBOUND_CHARS =
parseOptionalInt(
process.env.AGENT_MAX_INBOUND_CHARS || envConfig.AGENT_MAX_INBOUND_CHARS,

View file

@ -35,6 +35,7 @@ function makeConfig(
apiKey: 'test-key',
agentName: 'clawdie',
tickIntervalMs: 60000,
agentIdleIntervalMs: 7_200_000,
maxSessionEntries: 50,
...overrides,
};
@ -61,3 +62,111 @@ describe('runAgentHeartbeat — no task', () => {
expect(result.woke).toBe(false);
});
});
describe('runAgentHeartbeat — budget exhausted, no task (idle tick)', () => {
  it('returns woke:false with budget reason and does NOT write to activity log', async () => {
    // Matches the statement that would record an activity row.
    const activityInsertPattern = /INSERT INTO agent_activity/i;

    // Fake pool.query: budget lookups report an exhausted daily budget, task
    // lookups find nothing, and any other statement yields an empty result.
    const queryMock = vi.fn().mockImplementation((sql: string) => {
      const normalized = sql.trim().toUpperCase();
      const isBudgetSelect =
        normalized.startsWith('SELECT') && normalized.includes('AGENT_BUDGETS');

      if (isBudgetSelect) {
        // Budget row: SELECT * FROM agent_budgets WHERE agent_id = $1
        const exhaustedBudget = {
          agent_id: 'sysadmin',
          daily_tokens: 50000,
          spent_today: 50000,
          hard_limit_exceeded: true,
          reset_at: new Date(Date.now() - 3600000).toISOString(),
        };
        return Promise.resolve({
          rows: [exhaustedBudget],
          rowCount: 1,
          command: 'SELECT',
          oid: 0,
          fields: [],
        });
      }

      // No pending tasks. 'AGENT_TASKS' contains 'TASKS', so one substring
      // check covers both spellings of the tasks query.
      const command = normalized.includes('TASKS') ? 'SELECT' : '';
      return Promise.resolve({ rows: [], rowCount: 0, command, oid: 0, fields: [] });
    });

    const pool = { query: queryMock } as unknown as Pool;
    const outcome = await runAgentHeartbeat(
      makeConfig({ pool }),
      'sysadmin',
      'interval_elapsed',
      undefined,
      'Run your scheduled maintenance check.',
    );

    expect(outcome.woke).toBe(false);
    expect(outcome.reason).toBe('hard_limit_exceeded');

    // Must NOT have written an activity INSERT (no task to block)
    const activityInserts = queryMock.mock.calls
      .map(([sql]: [string]) => sql)
      .filter((sql: string) => activityInsertPattern.test(sql));
    expect(activityInserts).toHaveLength(0);
  });
});
describe('runAgentHeartbeat — budget exhausted, real task pending', () => {
  it('returns woke:false and DOES write activity error when a task is blocked', async () => {
    // Matches the statement that records an activity row.
    const activityInsertPattern = /INSERT INTO agent_activity/i;

    // Wraps a single row in the result shape the mocked pool returns for SELECTs.
    const singleRowSelect = (row: Record<string, unknown>) =>
      Promise.resolve({
        rows: [row],
        rowCount: 1,
        command: 'SELECT',
        oid: 0,
        fields: [],
      });

    // Fake pool.query: the budget lookup reports an exhausted daily budget and
    // the tasks lookup returns one pending task assigned to the agent.
    const queryMock = vi.fn().mockImplementation((sql: string) => {
      const normalized = sql.trim().toUpperCase();
      const isSelect = normalized.startsWith('SELECT');

      // Budget check comes first so a budget SELECT is never treated as a tasks query.
      if (isSelect && normalized.includes('AGENT_BUDGETS')) {
        return singleRowSelect({
          agent_id: 'sysadmin',
          daily_tokens: 50000,
          spent_today: 50000,
          hard_limit_exceeded: true,
          reset_at: new Date(Date.now() - 3600000).toISOString(),
        });
      }

      // Tasks query: return a pending task
      if (isSelect && normalized.includes('TASKS')) {
        return singleRowSelect({
          id: 'task-123',
          title: 'Check jail status',
          assigned_to: 'sysadmin',
          status: 'pending',
          priority: 'normal',
          description: '',
          context: null,
          deadline: null,
          created_at: new Date().toISOString(),
        });
      }

      return Promise.resolve({ rows: [], rowCount: 0, command: '', oid: 0, fields: [] });
    });

    const pool = { query: queryMock } as unknown as Pool;
    const outcome = await runAgentHeartbeat(
      makeConfig({ pool }),
      'sysadmin',
      'assignment',
      'task-123',
    );

    expect(outcome.woke).toBe(false);
    expect(outcome.reason).toBe('hard_limit_exceeded');

    // MUST have written an activity INSERT because a real task was blocked
    const activityInserts = queryMock.mock.calls
      .map(([sql]: [string]) => sql)
      .filter((sql: string) => activityInsertPattern.test(sql));
    expect(activityInserts.length).toBeGreaterThan(0);
  });
});

View file

@ -73,6 +73,8 @@ export interface ControlplaneHeartbeatConfig {
apiKey: string;
agentName: string;
tickIntervalMs: number;
/** Minimum ms between idle maintenance runs per agent. Separate from tickIntervalMs. */
agentIdleIntervalMs: number;
maxSessionEntries: number;
sendMessage?: (jid: string, text: string) => Promise<void>;
}
@ -274,25 +276,31 @@ export async function runAgentHeartbeat(
): Promise<HeartbeatResult> {
const { pool, workspaceCwd, sessionCwd, apiKey } = config;
const budget = await checkBudget(pool, agentId, 500);
if (!budget.allowed) {
await insertActivity(pool, {
agent_id: agentId,
event_type: 'error',
payload: { reason: budget.reason, wake_blocked: true },
});
return { agentId, woke: false, reason: budget.reason };
}
// Fetch tasks first so we know whether there is real work before deciding
// whether to log a budget-exceeded error (idle ticks should not spam activity).
const tasks = await getTasksByRole(pool, agentId);
const task = taskId ? tasks.find((t) => t.id === taskId) : tasks[0];
// Skip heartbeat if no pending tasks and this is an interval tick,
// unless an idlePrompt is provided (e.g. scheduled maintenance agents).
// Skip early: no pending task, interval tick, no idle prompt → silent return.
if (!task && wakeReason === 'interval_elapsed' && !idlePrompt) {
return { agentId, woke: false, reason: 'no_pending_tasks' };
}
const hasRealTask = !!task;
const budget = await checkBudget(pool, agentId, 500);
if (!budget.allowed) {
// Only log to activity when a real assigned task is being blocked.
// Idle maintenance ticks silently skip to avoid flooding the activity log.
if (hasRealTask) {
await insertActivity(pool, {
agent_id: agentId,
event_type: 'error',
payload: { reason: budget.reason, wake_blocked: true },
});
}
return { agentId, woke: false, reason: budget.reason };
}
if (!task && wakeReason === 'assignment') {
return { agentId, woke: false, reason: 'no_task_found' };
}
@ -549,6 +557,9 @@ export function startControlplaneHeartbeatLoop(
}
let lastBudgetResetCheck = 0;
// Track when each agent last received an idle maintenance prompt (ms timestamp).
// Prevents idle prompts from firing every tick — they run on agentIdleIntervalMs cadence.
const lastIdleRunAt = new Map<string, number>();
const loop = async () => {
try {
@ -577,12 +588,20 @@ export function startControlplaneHeartbeatLoop(
for (const agent of agents) {
if (!agent.heartbeat_enabled) continue;
try {
const lastIdle = lastIdleRunAt.get(agent.id) ?? 0;
const idleReady = now - lastIdle >= config.agentIdleIntervalMs;
const idlePrompt = idleReady
? 'Run your scheduled maintenance check.'
: undefined;
// Record attempt time before running so budget exhaustion doesn't
// cause a retry every 30s for the rest of the day.
if (idleReady) lastIdleRunAt.set(agent.id, now);
await runAgentHeartbeat(
config,
agent.id,
'interval_elapsed',
undefined,
'Run your scheduled maintenance check.',
idlePrompt,
);
} catch (err) {
logger.error(

View file

@ -30,6 +30,7 @@ import {
AGENT_BUDGET_PAUSE_PCT,
AGENT_BUDGET_PAUSE_NOTICE_COOLDOWN_SEC,
AGENT_CHAT_DAILY_TOKEN_LIMIT,
AGENT_HEARTBEAT_IDLE_INTERVAL_MS,
TELEGRAM_OPS_CHAT_ID,
WATCHDOG_MODE,
} from './config.js';
@ -1075,6 +1076,7 @@ async function main(): Promise<void> {
apiKey: CONTROLPLANE_SHARED_SECRET || OPENAI_API_KEY,
agentName: AGENT_NAME,
tickIntervalMs: 30000,
agentIdleIntervalMs: AGENT_HEARTBEAT_IDLE_INTERVAL_MS,
maxSessionEntries: 100,
sendMessage: async (jid, rawText) => {
const channel = findChannel(channels, jid);