diff --git a/.env.example b/.env.example index 1fdfe44..f6ea82b 100644 --- a/.env.example +++ b/.env.example @@ -229,3 +229,8 @@ OPERATOR_PASSWORD= # TTS_RATE=+0% # TTS_VOLUME=+0% # TTS_MAX_CHARS=2000 + +# STT (optional) — transcribe Telegram voice notes. +# Default implementation supports OpenAI Whisper, or attempts OpenRouter with an OpenAI-compatible endpoint. +# STT_PROVIDER=openrouter +# STT_MODEL=openai/whisper-1 diff --git a/src/channels/telegram.ts b/src/channels/telegram.ts index b52571b..df80902 100644 --- a/src/channels/telegram.ts +++ b/src/channels/telegram.ts @@ -25,7 +25,11 @@ import { RegisteredGroup, } from '../types.js'; import { registerChannel } from './registry.js'; -import { isTranscriptionAvailable, transcribeAudio } from '../transcription.js'; +import { + getTranscriptionStatus, + isTranscriptionAvailable, + transcribeAudio, +} from '../transcription.js'; export interface TelegramChannelOpts { onMessage: OnInboundMessage; @@ -296,8 +300,8 @@ export class TelegramChannel implements Channel { try { if (!isTranscriptionAvailable()) { - content = - '[Voice message — transcription unavailable (missing OPENAI_API_KEY)]'; + const stt = getTranscriptionStatus(); + content = `[Voice message — transcription unavailable (${stt.reason})]`; throw new Error('transcription-unavailable'); } diff --git a/src/config.ts b/src/config.ts index b496ff9..02711b5 100644 --- a/src/config.ts +++ b/src/config.ts @@ -128,6 +128,9 @@ const envConfig = readEnvFile([ 'TTS_RATE', 'TTS_VOLUME', 'TTS_MAX_CHARS', + // STT (speech-to-text, optional) + 'STT_PROVIDER', + 'STT_MODEL', ]); // ── Agent identity (single source — everything derives from this) ── @@ -238,6 +241,8 @@ export const TELEGRAM_BOT_TOKEN = process.env.TELEGRAM_BOT_TOKEN || envConfig.TELEGRAM_BOT_TOKEN || ''; export const OPENAI_API_KEY = process.env.OPENAI_API_KEY || envConfig.OPENAI_API_KEY || ''; +export const OPENROUTER_API_KEY = + process.env.OPENROUTER_API_KEY || envConfig.OPENROUTER_API_KEY || ''; // 
── Prompt/token guardrails ──────────────────────────────────────────────── @@ -645,6 +650,14 @@ export const TTS_MAX_CHARS = Math.max( 200, parseInt(process.env.TTS_MAX_CHARS || envConfig.TTS_MAX_CHARS || '2000', 10), ); + +// ── STT (speech-to-text) ─────────────────────────────────────────────────── +export type SttProvider = 'openai' | 'openrouter' | ''; +export const STT_PROVIDER: SttProvider = (process.env.STT_PROVIDER || + envConfig.STT_PROVIDER || + '') as SttProvider; +export const STT_MODEL = + process.env.STT_MODEL || envConfig.STT_MODEL || 'whisper-1'; export const WATCHDOG_MODE = (process.env.WATCHDOG_MODE || 'auto') as | 'auto' | 'slow' diff --git a/src/index.ts b/src/index.ts index edcac3c..890e3e6 100644 --- a/src/index.ts +++ b/src/index.ts @@ -13,8 +13,11 @@ import { MAIN_GROUP_FOLDER, METRICS_PORT, OPENAI_API_KEY, + OPENROUTER_API_KEY, PROJECT_ROOT, TELEGRAM_BOT_TOKEN, + STT_MODEL, + STT_PROVIDER, TTS_MODE, TMP_DIR, TRIGGER_PATTERN, @@ -844,7 +847,12 @@ async function main(): Promise { await loadState(); // Initialize voice transcription - initTranscription(OPENAI_API_KEY); + initTranscription({ + provider: STT_PROVIDER, + model: STT_MODEL, + openaiApiKey: OPENAI_API_KEY, + openrouterApiKey: OPENROUTER_API_KEY, + }); // Connect memory pool — required (db jail must be running) try { diff --git a/src/transcription.test.ts b/src/transcription.test.ts index 2544393..207a2b0 100644 --- a/src/transcription.test.ts +++ b/src/transcription.test.ts @@ -60,7 +60,13 @@ describe('initTranscription', () => { it('does not throw when called with an API key', async () => { const { initTranscription: init } = await import('./transcription.js'); - expect(() => init('sk-test-key')).not.toThrow(); + expect(() => + init({ + provider: 'openai', + model: 'whisper-1', + openaiApiKey: 'sk-test-key', + }), + ).not.toThrow(); }); }); @@ -96,7 +102,7 @@ describe('transcribeAudio — initialized', () => { it('returns transcript text on success', async () => { 
mockCreate.mockResolvedValue({ text: ' hello world ' }); const { initTranscription: init, transcribeAudio: transcribe } = await import('./transcription.js'); - init('sk-test-key'); + init({ provider: 'openai', model: 'whisper-1', openaiApiKey: 'sk-test-key' }); const result = await transcribe( require('path').resolve(process.cwd(), 'tmp', 'audio.ogg'), ); @@ -106,7 +112,7 @@ describe('transcribeAudio — initialized', () => { it('returns null on OpenAI error', async () => { mockCreate.mockRejectedValue(new Error('API error')); const { initTranscription: init, transcribeAudio: transcribe } = await import('./transcription.js'); - init('sk-test-key'); + init({ provider: 'openai', model: 'whisper-1', openaiApiKey: 'sk-test-key' }); const result = await transcribe( require('path').resolve(process.cwd(), 'tmp', 'audio.ogg'), ); @@ -116,7 +122,7 @@ describe('transcribeAudio — initialized', () => { it('calls the transcription API with whisper-1 model', async () => { mockCreate.mockResolvedValue({ text: 'test' }); const { initTranscription: init, transcribeAudio: transcribe } = await import('./transcription.js'); - init('sk-test-key'); + init({ provider: 'openai', model: 'whisper-1', openaiApiKey: 'sk-test-key' }); await transcribe(require('path').resolve(process.cwd(), 'tmp', 'audio.ogg')); expect(mockCreate).toHaveBeenCalledWith( expect.objectContaining({ model: 'whisper-1' }), @@ -126,7 +132,7 @@ describe('transcribeAudio — initialized', () => { it('returns null for empty transcript text', async () => { mockCreate.mockResolvedValue({ text: ' ' }); const { initTranscription: init, transcribeAudio: transcribe } = await import('./transcription.js'); - init('sk-test-key'); + init({ provider: 'openai', model: 'whisper-1', openaiApiKey: 'sk-test-key' }); const result = await transcribe( require('path').resolve(process.cwd(), 'tmp', 'audio.ogg'), ); diff --git a/src/transcription.ts b/src/transcription.ts index b58afc4..8889423 100644 --- a/src/transcription.ts +++ 
b/src/transcription.ts
@@ -4,22 +4,116 @@ import fs from 'fs';
 
 let openaiClient: OpenAI | null = null;
 let transcriptionReady = false;
+let transcriptionReason = 'not initialized';
+let transcriptionProvider: string | null = null;
+let transcriptionModel: string | null = null;
 
-export function initTranscription(apiKey?: string): void {
-  if (!apiKey) {
-    logger.warn('OPENAI_API_KEY not set - voice transcription unavailable');
+/**
+ * Configure voice transcription for the selected STT provider.
+ *
+ * Stores the provider, the effective model and a status reason (all
+ * reported by getTranscriptionStatus) and builds the API client when the
+ * provider is supported and its API key is present; otherwise
+ * transcription stays unavailable.
+ *
+ * @param options Provider, model and API keys; when omitted, the provider
+ *   is treated as unset and transcription is disabled.
+ */
+export function initTranscription(
+  options?: {
+    provider: 'openai' | 'openrouter' | '';
+    model: string;
+    openaiApiKey?: string;
+    openrouterApiKey?: string;
+  },
+): void {
+  const provider = options?.provider || '';
+  const model = options?.model || 'whisper-1';
+  const openaiApiKey = options?.openaiApiKey;
+  const openrouterApiKey = options?.openrouterApiKey;
+
+  transcriptionProvider = provider || null;
+  transcriptionModel = model || null;
+  // Drop any client left over from an earlier call so that no stale client
+  // survives a re-init that ends up unavailable.
+  openaiClient = null;
+
+  if (!provider) {
     transcriptionReady = false;
+    transcriptionReason = 'STT_PROVIDER not set';
+    logger.info('Voice transcription disabled (no STT_PROVIDER configured)');
     return;
   }
-  openaiClient = new OpenAI({ apiKey });
-  transcriptionReady = true;
-  logger.info('Voice transcription initialized with OpenAI Whisper');
+
+  if (provider === 'openai') {
+    if (!openaiApiKey) {
+      logger.warn('OPENAI_API_KEY not set - voice transcription unavailable');
+      transcriptionReady = false;
+      transcriptionReason = 'missing OPENAI_API_KEY';
+      return;
+    }
+    openaiClient = new OpenAI({ apiKey: openaiApiKey });
+    transcriptionReady = true;
+    transcriptionReason = 'ok';
+    logger.info({ provider: 'openai', model }, 'Voice transcription initialized');
+    return;
+  }
+
+  if (provider === 'openrouter') {
+    if (!openrouterApiKey) {
+      logger.warn('OPENROUTER_API_KEY not set - voice transcription unavailable');
+      transcriptionReady = false;
+      transcriptionReason = 'missing OPENROUTER_API_KEY';
+      return;
+    }
+    // OpenRouter uses provider-scoped model IDs (e.g. "openai/whisper-1").
+    const normalizedModel =
+      model === 'whisper-1' ? 'openai/whisper-1' : model;
+    transcriptionModel = normalizedModel;
+    openaiClient = new OpenAI({
+      apiKey: openrouterApiKey,
+      baseURL: 'https://openrouter.ai/api/v1',
+      defaultHeaders: {
+        'HTTP-Referer': 'https://codeberg.org/Clawdie/Clawdie-AI',
+        'X-Title': 'mevy-ai',
+      },
+    });
+    transcriptionReady = true;
+    transcriptionReason = 'ok';
+    logger.info(
+      { provider: 'openrouter', model: normalizedModel },
+      'Voice transcription initialized (OpenRouter)',
+    );
+    return;
+  }
+
+  transcriptionReady = false;
+  transcriptionReason = `unsupported provider: ${provider}`;
+  logger.warn({ provider }, 'Voice transcription unavailable');
 }
 
 export function isTranscriptionAvailable(): boolean {
   return transcriptionReady && !!openaiClient;
 }
 
+/**
+ * Report whether transcription is usable, together with the status reason
+ * and the provider/model recorded by the last initTranscription call.
+ */
+export function getTranscriptionStatus(): {
+  available: boolean;
+  reason: string;
+  provider: string | null;
+  model: string | null;
+} {
+  return {
+    available: isTranscriptionAvailable(),
+    reason: transcriptionReason,
+    provider: transcriptionProvider,
+    model: transcriptionModel,
+  };
+}
+
 export async function transcribeAudio(
   audioPath: string,
 ): Promise<string | null> {
@@ -32,7 +126,7 @@ export async function transcribeAudio(
     const audioFile = fs.createReadStream(audioPath);
     const response = await openaiClient.audio.transcriptions.create({
       file: audioFile,
-      model: 'whisper-1',
+      model: transcriptionModel || 'whisper-1',
     });
 
     const transcript = response.text.trim();