feat(runtime): validate fallback config and add /clearcooldown

Two operator-facing safety nets for the provider-fallback feature shipped earlier today: 1. Startup config check: when LLM_FALLBACK_PROVIDER is set, verify the matching API key (OPENROUTER_API_KEY etc.) is present. Without this, the fallback would silently fail the moment the primary provider caps — which is exactly when the operator can least afford to find out. Also warns when LLM_FALLBACK_PROVIDER is set without LLM_FALLBACK_MODEL since the primary model name rarely exists on the fallback provider. 2. /clearcooldown admin command: manually release a provider cooldown if the cap is lifted early or if the cap-error parser ever false-trips. Lists active cooldowns when called without args; takes <provider> or "all". Persists immediately so the cleared state survives restart. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> --- Build: pass | Tests: pass — Tests 1941 passed (1941) --- Build: pass | Tests: pass — Tests 1944 passed (1944)
2026-04-26 08:44:37 +02:00 · 2026-04-26 08:44:37 +02:00 · 3d33482c14
commit 3d33482c14
parent b4996f732f
3 changed files with 103 additions and 1 deletions
--- a/src/channels/telegram.ts
+++ b/src/channels/telegram.ts
@ -58,6 +58,7 @@ import {
  handleUsageCommand,
  handleUpdatesCommand,
  handleBudgetResetCommand,
+  handleClearCooldownCommand,
  handleWhoamiCommand,
  handleResumeCommand,
  handlePolicyCommand,
@ -134,6 +135,7 @@ export class TelegramChannel implements Channel {
          '/tokens — Show runtime token burn per agent\n' +
          '/updates — Show FreeBSD base/ports updates\n' +
          '/budgetreset — Reset agent token budget (admin)\n' +
+          '/clearcooldown — Clear provider fallback cooldown (admin)\n' +
          '/tts — Control voice replies (on/off/status)\n' +
          '/stop — Stop running agent\n' +
          '/new — Reset session, start fresh\n' +
@ -333,6 +335,11 @@ export class TelegramChannel implements Channel {
      await handleBudgetResetCommand(ctx, chatJid);
    });

+    this.bot.command('clearcooldown', async (ctx) => {
+      const chatJid = `tg:${ctx.chat.id}`;
+      await handleClearCooldownCommand(ctx, chatJid);
+    });
+
    this.bot.command('rescue', async (ctx) => {
      const chatJid = `tg:${ctx.chat.id}`;
      await handleBudgetResetCommand(ctx, chatJid);
--- a/src/index.ts
+++ b/src/index.ts
@ -124,7 +124,10 @@ import { startSchedulerLoop } from './task-scheduler.js';
 import { bridgeTelegramMessage } from './controlplane-telegram.js';
 import { Channel, NewMessage, RegisteredGroup } from './types.js';
 import { logger } from './logger.js';
-import { loadProviderCooldowns } from './provider-fallback.js';
+import {
+  getFallbackPolicy,
+  loadProviderCooldowns,
+} from './provider-fallback.js';
 import { formatSkillsListText } from './skills-list.js';
 import { shouldApplyTts, stripTtsMarker, synthesize } from './tts.js';
 import { getTtsModeForChat, setCommandContext } from './telegram-commands.js';
@ -996,6 +999,38 @@ async function main(): Promise<void> {
  if (AGENT_ENGINE === 'pi-tui' && llmKey) criticalConfig.push(llmKey);
  if (sttKey) criticalConfig.push(sttKey);

+  // If a provider fallback is configured, make sure the fallback's API key is
+  // present too — otherwise the fallback silently fails the moment the primary
+  // provider hits its cap, which is exactly when we can least afford it.
+  const fallbackPolicy = getFallbackPolicy();
+  const fallbackProvider = (fallbackPolicy.fallbackProvider || '')
+    .trim()
+    .toLowerCase();
+  if (fallbackProvider) {
+    const fallbackKey =
+      fallbackProvider === 'openrouter'
+        ? { name: 'OPENROUTER_API_KEY', value: OPENROUTER_API_KEY }
+        : fallbackProvider === 'zai'
+          ? { name: 'ZAI_API_KEY', value: ZAI_API_KEY }
+          : fallbackProvider === 'anthropic'
+            ? { name: 'ANTHROPIC_API_KEY', value: ANTHROPIC_API_KEY }
+            : fallbackProvider === 'google'
+              ? { name: 'GOOGLE_API_KEY', value: GOOGLE_API_KEY }
+              : null;
+    if (fallbackKey) {
+      criticalConfig.push({
+        ...fallbackKey,
+        hint: `LLM_FALLBACK_PROVIDER=${fallbackProvider} — fallback will fail when primary provider hits its cap`,
+      });
+    }
+    if (!fallbackPolicy.fallbackModel) {
+      logger.warn(
+        { fallbackProvider },
+        'LLM_FALLBACK_PROVIDER set without LLM_FALLBACK_MODEL — fallback will reuse the primary model name, which may not exist on the fallback provider',
+      );
+    }
+  }
+
  for (const c of criticalConfig) {
    if (!c.value) {
      logger.warn({ config: c.name }, `Missing ${c.name} — ${c.hint}`);
--- a/src/telegram-commands.ts
+++ b/src/telegram-commands.ts
@ -59,8 +59,10 @@ import { buildDiskReport, renderDiskReport } from './reports/disk-report.js';
 import { buildBudgetReport, renderBudgetReport } from './reports/budget-report.js';
 import { buildTasksReport, renderTasksReport } from './reports/tasks-report.js';
 import {
+  clearProviderCooldown,
  getFallbackPolicy,
  listProviderCooldowns,
+  persistProviderCooldowns,
 } from './provider-fallback.js';
 import {
  buildPublishReport,
@ -965,6 +967,64 @@ export async function handleBudgetResetCallback(
  });
 }

+// ── /clearcooldown ───────────────────────────────────────────────────────
+
+/**
+ * `/clearcooldown [provider|all]` (admin, ops chat): lists cooldowns, or clears
+ * one (case-insensitive) or all, then persists and flags any save failure.
+ */
+export async function handleClearCooldownCommand(
+  ctxArg: any,
+  chatJid: string,
+): Promise<void> {
+  if (!(await requireAdmin(ctxArg))) return;
+  if (!(await requireOpsChat(ctxArg, chatJid))) return;
+
+  const text = (ctxArg.message?.text || '').trim();
+  const target = (text.split(/\s+/)[1] || '').toLowerCase();
+  const active = listProviderCooldowns();
+
+  if (!target) {
+    const lines = active
+      .map((c) => `- ${c.provider} until ${c.until.toISOString()} (${c.reason})`)
+      .join('\n');
+    await ctxArg.reply(
+      active.length === 0
+        ? 'No provider cooldowns active.'
+        : `Active provider cooldowns:\n${lines}\n\nUsage: /clearcooldown <provider|all>`,
+    );
+    return;
+  }
+
+  const clearAll = target === 'all';
+  const hits = clearAll
+    ? active
+    : active.filter((c) => c.provider.toLowerCase() === target);
+  if (hits.length === 0) {
+    const names = active.map((c) => c.provider).join(', ') || 'none';
+    await ctxArg.reply(
+      clearAll
+        ? 'No provider cooldowns to clear.'
+        : `No active cooldown for "${target}". Active: ${names}`,
+    );
+    return;
+  }
+  for (const c of hits) clearProviderCooldown(c.provider);
+
+  // Do not swallow a failed save: the operator must know it may not stick.
+  let note = '';
+  try {
+    await persistProviderCooldowns();
+  } catch {
+    note = '\nWarning: could not persist — cooldown may return after restart.';
+  }
+  const cleared = hits.map((c) => c.provider).join(', ');
+  const summary = clearAll
+    ? `Cleared ${hits.length} provider cooldown${hits.length === 1 ? '' : 's'}: ${cleared}`
+    : `Cleared provider cooldown: ${cleared}`;
+  await ctxArg.reply(summary + note);
+}
+
 function splitTextChunks(text: string, maxLen: number): string[] {
  if (!text) return [];
  if (maxLen <= 0) return [text];