fix: idle heartbeat runs on 2h cadence, budget noise silenced

Two problems found from activity log analysis:

1. The idle maintenance prompt fired every 30s (the same cadence as the task pickup tick). With a 50k daily token budget and 15-22k tokens per run, sysadmin exhausted its budget after 3 runs and then logged a budget error every 30s for the rest of the day (160+ noise rows in 2h).
2. Budget-exceeded errors were logged for idle ticks even when no task was being blocked; the only issue was the tick cadence.

Fixes:

- Add agentIdleIntervalMs to ControlplaneHeartbeatConfig (default 2h, set via AGENT_HEARTBEAT_IDLE_INTERVAL_MS). The 30s tick still handles task pickup; idle prompts fire only once the interval has elapsed. The attempt time is recorded before running, so budget exhaustion doesn't cause a retry every 30s for the rest of the day.
- Reorder runAgentHeartbeat to fetch tasks before the budget check. Budget-exceeded activity is logged only when a real assigned task is being blocked; idle ticks with no task return silently.

Tests: cover an idle tick that writes no activity log entry on budget exhaustion, and confirm the error IS logged when a real task is blocked.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-21 11:18:50 +02:00 · 2026-04-21 11:18:50 +02:00 · 3d2442eea3
commit 3d2442eea3
parent 7d3ce4d97c
5 changed files with 158 additions and 13 deletions
--- a/.env.example
+++ b/.env.example
@ -291,3 +291,8 @@ TELEGRAM_OPS_CHAT_ID=
 # AGENT_COMPACTION_PROVIDER=zai
 # AGENT_COMPACTION_MODEL=glm-4.6

+# Idle maintenance interval for heartbeat-enabled agents in ms (default: 7200000 = 2h).
+# Controls how often agents with no pending tasks receive a maintenance idle prompt.
+# Separate from the 30s task pickup tick.
+# AGENT_HEARTBEAT_IDLE_INTERVAL_MS=7200000
+
--- a/src/config.ts
+++ b/src/config.ts
@ -143,6 +143,8 @@ const envConfig = readEnvFile([
  'AGENT_SESSION_COMPACT_TIMEOUT_MS',
  'AGENT_SESSION_COMPACT_KEEP_TURNS',
  'AGENT_SESSION_COMPACT_MIN_ENTRIES',
+  // Controlplane heartbeat
+  'AGENT_HEARTBEAT_IDLE_INTERVAL_MS',
  // Vision (optional helper model for OCR/screenshot reading)
  'VISION_PROVIDER',
  'VISION_MODEL',
@ -328,6 +330,14 @@ export const AGENT_CHAT_DAILY_TOKEN_LIMIT =
    process.env.AGENT_CHAT_DAILY_TOKEN_LIMIT ||
      envConfig.AGENT_CHAT_DAILY_TOKEN_LIMIT,
  ) ?? 0;
+// How long between idle maintenance runs for heartbeat-enabled agents (ms).
+// Controls how often agents with no pending tasks receive an idle prompt.
+// Separate from tickIntervalMs (task pickup frequency).
+export const AGENT_HEARTBEAT_IDLE_INTERVAL_MS =
+  parseOptionalInt(
+    process.env.AGENT_HEARTBEAT_IDLE_INTERVAL_MS ||
+      envConfig.AGENT_HEARTBEAT_IDLE_INTERVAL_MS,
+  ) ?? 7_200_000; // 2 hours
 export const AGENT_MAX_INBOUND_CHARS =
  parseOptionalInt(
    process.env.AGENT_MAX_INBOUND_CHARS || envConfig.AGENT_MAX_INBOUND_CHARS,
--- a/src/controlplane-heartbeat.test.ts
+++ b/src/controlplane-heartbeat.test.ts
@ -35,6 +35,7 @@ function makeConfig(
    apiKey: 'test-key',
    agentName: 'clawdie',
    tickIntervalMs: 60000,
+    agentIdleIntervalMs: 7_200_000,
    maxSessionEntries: 50,
    ...overrides,
  };
@ -61,3 +62,111 @@ describe('runAgentHeartbeat — no task', () => {
    expect(result.woke).toBe(false);
  });
 });
+
+describe('runAgentHeartbeat — budget exhausted, no task (idle tick)', () => {
+  it('returns woke:false with budget reason and does NOT write to activity log', async () => {
+    // Pool returns hard_limit_exceeded budget and no tasks
+    const querySpy = vi.fn().mockImplementation((sql: string) => {
+      const s = sql.trim().toUpperCase();
+      // Budget query: SELECT * FROM agent_budgets WHERE agent_id = $1
+      if (s.includes('AGENT_BUDGETS') && s.startsWith('SELECT')) {
+        return Promise.resolve({
+          rows: [{
+            agent_id: 'sysadmin',
+            daily_tokens: 50000,
+            spent_today: 50000,
+            hard_limit_exceeded: true,
+            reset_at: new Date(Date.now() - 3600000).toISOString(),
+          }],
+          rowCount: 1,
+          command: 'SELECT',
+          oid: 0,
+          fields: [],
+        });
+      }
+      // Tasks query: return empty (no pending tasks)
+      if (s.includes('AGENT_TASKS') || s.includes('TASKS')) {
+        return Promise.resolve({ rows: [], rowCount: 0, command: 'SELECT', oid: 0, fields: [] });
+      }
+      return Promise.resolve({ rows: [], rowCount: 0, command: '', oid: 0, fields: [] });
+    });
+
+    const mockPool = { query: querySpy } as unknown as Pool;
+    const config = makeConfig({ pool: mockPool });
+
+    const result = await runAgentHeartbeat(
+      config,
+      'sysadmin',
+      'interval_elapsed',
+      undefined,
+      'Run your scheduled maintenance check.',
+    );
+
+    expect(result.woke).toBe(false);
+    expect(result.reason).toBe('hard_limit_exceeded');
+
+    // Must NOT have written an activity INSERT (no task to block)
+    const insertCalls = querySpy.mock.calls.filter(([sql]: [string]) =>
+      /INSERT INTO agent_activity/i.test(sql),
+    );
+    expect(insertCalls).toHaveLength(0);
+  });
+});
+
+describe('runAgentHeartbeat — budget exhausted, real task pending', () => {
+  it('returns woke:false and DOES write activity error when a task is blocked', async () => {
+    const querySpy = vi.fn().mockImplementation((sql: string) => {
+      const s = sql.trim().toUpperCase();
+      if (s.includes('AGENT_BUDGETS') && s.startsWith('SELECT')) {
+        return Promise.resolve({
+          rows: [{
+            agent_id: 'sysadmin',
+            daily_tokens: 50000,
+            spent_today: 50000,
+            hard_limit_exceeded: true,
+            reset_at: new Date(Date.now() - 3600000).toISOString(),
+          }],
+          rowCount: 1,
+          command: 'SELECT',
+          oid: 0,
+          fields: [],
+        });
+      }
+      // Tasks query: return a pending task
+      if (s.includes('TASKS') && s.startsWith('SELECT')) {
+        return Promise.resolve({
+          rows: [{
+            id: 'task-123',
+            title: 'Check jail status',
+            assigned_to: 'sysadmin',
+            status: 'pending',
+            priority: 'normal',
+            description: '',
+            context: null,
+            deadline: null,
+            created_at: new Date().toISOString(),
+          }],
+          rowCount: 1,
+          command: 'SELECT',
+          oid: 0,
+          fields: [],
+        });
+      }
+      return Promise.resolve({ rows: [], rowCount: 0, command: '', oid: 0, fields: [] });
+    });
+
+    const mockPool = { query: querySpy } as unknown as Pool;
+    const config = makeConfig({ pool: mockPool });
+
+    const result = await runAgentHeartbeat(config, 'sysadmin', 'assignment', 'task-123');
+
+    expect(result.woke).toBe(false);
+    expect(result.reason).toBe('hard_limit_exceeded');
+
+    // MUST have written an activity INSERT because a real task was blocked
+    const insertCalls = querySpy.mock.calls.filter(([sql]: [string]) =>
+      /INSERT INTO agent_activity/i.test(sql),
+    );
+    expect(insertCalls.length).toBeGreaterThan(0);
+  });
+});
--- a/src/controlplane-heartbeat.ts
+++ b/src/controlplane-heartbeat.ts
@ -73,6 +73,8 @@ export interface ControlplaneHeartbeatConfig {
  apiKey: string;
  agentName: string;
  tickIntervalMs: number;
+  /** Minimum ms between idle maintenance runs per agent. Separate from tickIntervalMs. */
+  agentIdleIntervalMs: number;
  maxSessionEntries: number;
  sendMessage?: (jid: string, text: string) => Promise<void>;
 }
@ -274,25 +276,31 @@ export async function runAgentHeartbeat(
 ): Promise<HeartbeatResult> {
  const { pool, workspaceCwd, sessionCwd, apiKey } = config;

-  const budget = await checkBudget(pool, agentId, 500);
-  if (!budget.allowed) {
-    await insertActivity(pool, {
-      agent_id: agentId,
-      event_type: 'error',
-      payload: { reason: budget.reason, wake_blocked: true },
-    });
-    return { agentId, woke: false, reason: budget.reason };
-  }
-
+  // Fetch tasks first so we know whether there is real work before deciding
+  // whether to log a budget-exceeded error (idle ticks should not spam activity).
  const tasks = await getTasksByRole(pool, agentId);
  const task = taskId ? tasks.find((t) => t.id === taskId) : tasks[0];

-  // Skip heartbeat if no pending tasks and this is an interval tick,
-  // unless an idlePrompt is provided (e.g. scheduled maintenance agents).
+  // Skip early: no pending task, interval tick, no idle prompt → silent return.
  if (!task && wakeReason === 'interval_elapsed' && !idlePrompt) {
    return { agentId, woke: false, reason: 'no_pending_tasks' };
  }

+  const hasRealTask = !!task;
+  const budget = await checkBudget(pool, agentId, 500);
+  if (!budget.allowed) {
+    // Only log to activity when a real assigned task is being blocked.
+    // Idle maintenance ticks silently skip to avoid flooding the activity log.
+    if (hasRealTask) {
+      await insertActivity(pool, {
+        agent_id: agentId,
+        event_type: 'error',
+        payload: { reason: budget.reason, wake_blocked: true },
+      });
+    }
+    return { agentId, woke: false, reason: budget.reason };
+  }
+
  if (!task && wakeReason === 'assignment') {
    return { agentId, woke: false, reason: 'no_task_found' };
  }
@ -549,6 +557,9 @@ export function startControlplaneHeartbeatLoop(
  }

  let lastBudgetResetCheck = 0;
+  // Track when each agent last received an idle maintenance prompt (ms timestamp).
+  // Prevents idle prompts from firing every tick — they run on agentIdleIntervalMs cadence.
+  const lastIdleRunAt = new Map<string, number>();

  const loop = async () => {
    try {
@ -577,12 +588,20 @@ export function startControlplaneHeartbeatLoop(
      for (const agent of agents) {
        if (!agent.heartbeat_enabled) continue;
        try {
+          const lastIdle = lastIdleRunAt.get(agent.id) ?? 0;
+          const idleReady = now - lastIdle >= config.agentIdleIntervalMs;
+          const idlePrompt = idleReady
+            ? 'Run your scheduled maintenance check.'
+            : undefined;
+          // Record attempt time before running so budget exhaustion doesn't
+          // cause a retry every 30s for the rest of the day.
+          if (idleReady) lastIdleRunAt.set(agent.id, now);
          await runAgentHeartbeat(
            config,
            agent.id,
            'interval_elapsed',
            undefined,
-            'Run your scheduled maintenance check.',
+            idlePrompt,
          );
        } catch (err) {
          logger.error(
--- a/src/index.ts
+++ b/src/index.ts
@ -30,6 +30,7 @@ import {
  AGENT_BUDGET_PAUSE_PCT,
  AGENT_BUDGET_PAUSE_NOTICE_COOLDOWN_SEC,
  AGENT_CHAT_DAILY_TOKEN_LIMIT,
+  AGENT_HEARTBEAT_IDLE_INTERVAL_MS,
  TELEGRAM_OPS_CHAT_ID,
  WATCHDOG_MODE,
 } from './config.js';
@ -1075,6 +1076,7 @@ async function main(): Promise<void> {
    apiKey: CONTROLPLANE_SHARED_SECRET || OPENAI_API_KEY,
    agentName: AGENT_NAME,
    tickIntervalMs: 30000,
+    agentIdleIntervalMs: AGENT_HEARTBEAT_IDLE_INTERVAL_MS,
    maxSessionEntries: 100,
    sendMessage: async (jid, rawText) => {
      const channel = findChannel(channels, jid);