STT: support OpenRouter keys (Whisper-compatible)

---

Build: pass | Tests: pass — 1541 passed (94 files)
This commit is contained in:
Mevy Assistant 2026-04-19 11:46:53 +00:00
parent 65fa7873f2
commit 125ecb92f2
6 changed files with 129 additions and 16 deletions

View file

@ -229,3 +229,8 @@ OPERATOR_PASSWORD=
# TTS_RATE=+0%
# TTS_VOLUME=+0%
# TTS_MAX_CHARS=2000
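# STT (optional) — transcribe Telegram voice notes.
# Disabled unless STT_PROVIDER is set. Supported providers: "openai" (OpenAI Whisper, requires OPENAI_API_KEY)
# and "openrouter" (requires OPENROUTER_API_KEY; attempts OpenRouter through its OpenAI-compatible endpoint).
# STT_MODEL defaults to whisper-1; with openrouter that default is sent as openai/whisper-1.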
# STT_PROVIDER=openrouter
# STT_MODEL=openai/whisper-1

View file

@ -25,7 +25,11 @@ import {
RegisteredGroup,
} from '../types.js';
import { registerChannel } from './registry.js';
import { isTranscriptionAvailable, transcribeAudio } from '../transcription.js';
import {
getTranscriptionStatus,
isTranscriptionAvailable,
transcribeAudio,
} from '../transcription.js';
export interface TelegramChannelOpts {
onMessage: OnInboundMessage;
@ -296,8 +300,8 @@ export class TelegramChannel implements Channel {
try {
if (!isTranscriptionAvailable()) {
content =
'[Voice message — transcription unavailable (missing OPENAI_API_KEY)]';
const stt = getTranscriptionStatus();
content = `[Voice message — transcription unavailable (${stt.reason})]`;
throw new Error('transcription-unavailable');
}

View file

@ -128,6 +128,9 @@ const envConfig = readEnvFile([
'TTS_RATE',
'TTS_VOLUME',
'TTS_MAX_CHARS',
// STT (speech-to-text, optional)
'STT_PROVIDER',
'STT_MODEL',
]);
// ── Agent identity (single source — everything derives from this) ──
@ -238,6 +241,8 @@ export const TELEGRAM_BOT_TOKEN =
process.env.TELEGRAM_BOT_TOKEN || envConfig.TELEGRAM_BOT_TOKEN || '';
/** OpenAI API key: process environment first, then the env file, else empty. */
export const OPENAI_API_KEY =
  process.env.OPENAI_API_KEY ||
  envConfig.OPENAI_API_KEY ||
  '';

/** OpenRouter API key: process environment first, then the env file, else empty. */
export const OPENROUTER_API_KEY =
  process.env.OPENROUTER_API_KEY ||
  envConfig.OPENROUTER_API_KEY ||
  '';
// ── Prompt/token guardrails ────────────────────────────────────────────────
@ -645,6 +650,14 @@ export const TTS_MAX_CHARS = Math.max(
200,
parseInt(process.env.TTS_MAX_CHARS || envConfig.TTS_MAX_CHARS || '2000', 10),
);
// ── STT (speech-to-text) ───────────────────────────────────────────────────
/**
 * Speech-to-text backends selectable through STT_PROVIDER. The empty string
 * stands for "nothing configured".
 */
export type SttProvider = '' | 'openai' | 'openrouter';

/**
 * Configured provider: process environment first, then the env file, else ''.
 * The value is cast rather than validated, so at runtime it can hold a string
 * outside the union.
 */
export const STT_PROVIDER = (process.env.STT_PROVIDER ||
  envConfig.STT_PROVIDER ||
  '') as SttProvider;

/** Transcription model name; falls back to 'whisper-1' when unset. */
export const STT_MODEL =
  process.env.STT_MODEL ||
  envConfig.STT_MODEL ||
  'whisper-1';
export const WATCHDOG_MODE = (process.env.WATCHDOG_MODE || 'auto') as
| 'auto'
| 'slow'

View file

@ -13,8 +13,11 @@ import {
MAIN_GROUP_FOLDER,
METRICS_PORT,
OPENAI_API_KEY,
OPENROUTER_API_KEY,
PROJECT_ROOT,
TELEGRAM_BOT_TOKEN,
STT_MODEL,
STT_PROVIDER,
TTS_MODE,
TMP_DIR,
TRIGGER_PATTERN,
@ -844,7 +847,12 @@ async function main(): Promise<void> {
await loadState();
// Initialize voice transcription
initTranscription(OPENAI_API_KEY);
initTranscription({
provider: STT_PROVIDER,
model: STT_MODEL,
openaiApiKey: OPENAI_API_KEY,
openrouterApiKey: OPENROUTER_API_KEY,
});
// Connect memory pool — required (db jail must be running)
try {

View file

@ -60,7 +60,13 @@ describe('initTranscription', () => {
it('does not throw when called with an API key', async () => {
const { initTranscription: init } = await import('./transcription.js');
expect(() => init('sk-test-key')).not.toThrow();
expect(() =>
init({
provider: 'openai',
model: 'whisper-1',
openaiApiKey: 'sk-test-key',
}),
).not.toThrow();
});
});
@ -96,7 +102,7 @@ describe('transcribeAudio — initialized', () => {
it('returns transcript text on success', async () => {
mockCreate.mockResolvedValue({ text: ' hello world ' });
const { initTranscription: init, transcribeAudio: transcribe } = await import('./transcription.js');
init('sk-test-key');
init({ provider: 'openai', model: 'whisper-1', openaiApiKey: 'sk-test-key' });
const result = await transcribe(
require('path').resolve(process.cwd(), 'tmp', 'audio.ogg'),
);
@ -106,7 +112,7 @@ describe('transcribeAudio — initialized', () => {
it('returns null on OpenAI error', async () => {
mockCreate.mockRejectedValue(new Error('API error'));
const { initTranscription: init, transcribeAudio: transcribe } = await import('./transcription.js');
init('sk-test-key');
init({ provider: 'openai', model: 'whisper-1', openaiApiKey: 'sk-test-key' });
const result = await transcribe(
require('path').resolve(process.cwd(), 'tmp', 'audio.ogg'),
);
@ -116,7 +122,7 @@ describe('transcribeAudio — initialized', () => {
it('calls the transcription API with whisper-1 model', async () => {
mockCreate.mockResolvedValue({ text: 'test' });
const { initTranscription: init, transcribeAudio: transcribe } = await import('./transcription.js');
init('sk-test-key');
init({ provider: 'openai', model: 'whisper-1', openaiApiKey: 'sk-test-key' });
await transcribe(require('path').resolve(process.cwd(), 'tmp', 'audio.ogg'));
expect(mockCreate).toHaveBeenCalledWith(
expect.objectContaining({ model: 'whisper-1' }),
@ -126,7 +132,7 @@ describe('transcribeAudio — initialized', () => {
it('returns null for empty transcript text', async () => {
mockCreate.mockResolvedValue({ text: ' ' });
const { initTranscription: init, transcribeAudio: transcribe } = await import('./transcription.js');
init('sk-test-key');
init({ provider: 'openai', model: 'whisper-1', openaiApiKey: 'sk-test-key' });
const result = await transcribe(
require('path').resolve(process.cwd(), 'tmp', 'audio.ogg'),
);

View file

@ -4,22 +4,99 @@ import fs from 'fs';
let openaiClient: OpenAI | null = null;
let transcriptionReady = false;
let transcriptionReason = 'not initialized';
let transcriptionProvider: string | null = null;
let transcriptionModel: string | null = null;
/**
 * Configures the module-level speech-to-text client.
 *
 * Never throws for configuration problems; the outcome is stored in module
 * state and can be read back with `isTranscriptionAvailable()` and
 * `getTranscriptionStatus()`:
 * - no provider: disabled, reason `STT_PROVIDER not set`
 * - `openai`: requires `openaiApiKey`
 * - `openrouter`: requires `openrouterApiKey`; the OpenAI SDK client is
 *   pointed at OpenRouter's `/api/v1` base URL
 * - anything else: reason `unsupported provider: <value>`
 *
 * @param options - Provider selection, model name and the per-provider API
 *   keys. Omitting it is treated the same as an empty provider.
 */
export function initTranscription(options?: {
  provider: 'openai' | 'openrouter' | '';
  model: string;
  openaiApiKey?: string;
  openrouterApiKey?: string;
}): void {
  // The provider normally originates from an unchecked environment string, so
  // tolerate stray whitespace and letter case instead of rejecting "OpenAI".
  const rawProvider: string = options?.provider ?? '';
  const provider = rawProvider.trim().toLowerCase();
  const model = options?.model || 'whisper-1';

  // Start from a clean slate: a repeat call that ends up disabled must not
  // leave an earlier client or ready flag behind.
  openaiClient = null;
  transcriptionReady = false;
  transcriptionProvider = provider || null;
  transcriptionModel = model;

  if (!provider) {
    transcriptionReason = 'STT_PROVIDER not set';
    logger.info('Voice transcription disabled (no STT_PROVIDER configured)');
    return;
  }

  if (provider === 'openai') {
    const openaiApiKey = options?.openaiApiKey;
    if (!openaiApiKey) {
      logger.warn('OPENAI_API_KEY not set - voice transcription unavailable');
      transcriptionReason = 'missing OPENAI_API_KEY';
      return;
    }
    openaiClient = new OpenAI({ apiKey: openaiApiKey });
    transcriptionReady = true;
    transcriptionReason = 'ok';
    logger.info({ provider: 'openai', model }, 'Voice transcription initialized');
    return;
  }

  if (provider === 'openrouter') {
    const openrouterApiKey = options?.openrouterApiKey;
    if (!openrouterApiKey) {
      logger.warn('OPENROUTER_API_KEY not set - voice transcription unavailable');
      transcriptionReason = 'missing OPENROUTER_API_KEY';
      return;
    }
    // OpenRouter uses provider-scoped model IDs (e.g. "openai/whisper-1").
    const normalizedModel = model === 'whisper-1' ? 'openai/whisper-1' : model;
    transcriptionModel = normalizedModel;
    openaiClient = new OpenAI({
      apiKey: openrouterApiKey,
      baseURL: 'https://openrouter.ai/api/v1',
      defaultHeaders: {
        'HTTP-Referer': 'https://codeberg.org/Clawdie/Clawdie-AI',
        'X-Title': 'mevy-ai',
      },
    });
    transcriptionReady = true;
    transcriptionReason = 'ok';
    logger.info(
      { provider: 'openrouter', model: normalizedModel },
      'Voice transcription initialized (OpenRouter)',
    );
    return;
  }

  // Echo the value exactly as configured so the operator can spot the typo.
  transcriptionReason = `unsupported provider: ${rawProvider}`;
  logger.warn({ provider: rawProvider }, 'Voice transcription unavailable');
}
/**
 * Reports whether voice transcription can be used right now.
 *
 * True only when the module's ready flag is set and a client instance exists.
 */
export function isTranscriptionAvailable(): boolean {
  if (!transcriptionReady) {
    return false;
  }
  return Boolean(openaiClient);
}
/**
 * Returns a snapshot of the module's transcription state.
 *
 * @returns `available` mirrors `isTranscriptionAvailable()`; `reason`,
 *   `provider` and `model` are the module-level values as they stand at the
 *   time of the call.
 */
export function getTranscriptionStatus(): {
  available: boolean;
  reason: string;
  provider: string | null;
  model: string | null;
} {
  const available = isTranscriptionAvailable();
  const snapshot = {
    available,
    reason: transcriptionReason,
    provider: transcriptionProvider,
    model: transcriptionModel,
  };
  return snapshot;
}
export async function transcribeAudio(
audioPath: string,
): Promise<string | null> {
@ -32,7 +109,7 @@ export async function transcribeAudio(
const audioFile = fs.createReadStream(audioPath);
const response = await openaiClient.audio.transcriptions.create({
file: audioFile,
model: 'whisper-1',
model: transcriptionModel || 'whisper-1',
});
const transcript = response.text.trim();