STT: support OpenRouter keys (Whisper-compatible)
--- Build: pass | Tests: pass — 1541 passed (94 files)
This commit is contained in:
parent
65fa7873f2
commit
125ecb92f2
6 changed files with 129 additions and 16 deletions
|
|
@ -229,3 +229,8 @@ OPERATOR_PASSWORD=
|
|||
# TTS_RATE=+0%
|
||||
# TTS_VOLUME=+0%
|
||||
# TTS_MAX_CHARS=2000
|
||||
|
||||
# STT (optional) — transcribe Telegram voice notes.
|
||||
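# Set STT_PROVIDER to "openai" (uses OPENAI_API_KEY) or "openrouter" (uses OPENROUTER_API_KEY through its OpenAI-compatible endpoint); transcription stays disabled while STT_PROVIDER is unset.
# STT_MODEL defaults to whisper-1 (sent to OpenRouter as openai/whisper-1).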
|
||||
# STT_PROVIDER=openrouter
|
||||
# STT_MODEL=openai/whisper-1
|
||||
|
|
|
|||
|
|
@ -25,7 +25,11 @@ import {
|
|||
RegisteredGroup,
|
||||
} from '../types.js';
|
||||
import { registerChannel } from './registry.js';
|
||||
import { isTranscriptionAvailable, transcribeAudio } from '../transcription.js';
|
||||
import {
|
||||
getTranscriptionStatus,
|
||||
isTranscriptionAvailable,
|
||||
transcribeAudio,
|
||||
} from '../transcription.js';
|
||||
|
||||
export interface TelegramChannelOpts {
|
||||
onMessage: OnInboundMessage;
|
||||
|
|
@ -296,8 +300,8 @@ export class TelegramChannel implements Channel {
|
|||
|
||||
try {
|
||||
if (!isTranscriptionAvailable()) {
|
||||
content =
|
||||
'[Voice message — transcription unavailable (missing OPENAI_API_KEY)]';
|
||||
const stt = getTranscriptionStatus();
|
||||
content = `[Voice message — transcription unavailable (${stt.reason})]`;
|
||||
throw new Error('transcription-unavailable');
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -128,6 +128,9 @@ const envConfig = readEnvFile([
|
|||
'TTS_RATE',
|
||||
'TTS_VOLUME',
|
||||
'TTS_MAX_CHARS',
|
||||
// STT (speech-to-text, optional)
|
||||
'STT_PROVIDER',
|
||||
'STT_MODEL',
|
||||
]);
|
||||
|
||||
// ── Agent identity (single source — everything derives from this) ──
|
||||
|
|
@ -238,6 +241,8 @@ export const TELEGRAM_BOT_TOKEN =
|
|||
process.env.TELEGRAM_BOT_TOKEN || envConfig.TELEGRAM_BOT_TOKEN || '';
|
||||
export const OPENAI_API_KEY =
|
||||
process.env.OPENAI_API_KEY || envConfig.OPENAI_API_KEY || '';
|
||||
export const OPENROUTER_API_KEY =
|
||||
process.env.OPENROUTER_API_KEY || envConfig.OPENROUTER_API_KEY || '';
|
||||
|
||||
// ── Prompt/token guardrails ────────────────────────────────────────────────
|
||||
|
||||
|
|
@ -645,6 +650,14 @@ export const TTS_MAX_CHARS = Math.max(
|
|||
200,
|
||||
parseInt(process.env.TTS_MAX_CHARS || envConfig.TTS_MAX_CHARS || '2000', 10),
|
||||
);
|
||||
|
||||
// ── STT (speech-to-text) ───────────────────────────────────────────────────
|
||||
/** Speech-to-text backends the transcription module recognises; `''` means STT is not configured. */
export type SttProvider = 'openai' | 'openrouter' | '';

/**
 * Selected STT backend, taken from `process.env` first and from `envConfig`
 * (the values loaded by `readEnvFile`) second.
 *
 * The raw value is trimmed and lower-cased so that `STT_PROVIDER=OpenAI` or a
 * value with stray whitespace still selects the intended backend, and a
 * whitespace-only value counts as "not set".
 *
 * NOTE: the cast is unchecked — an unrecognised value is passed through (after
 * normalisation) so that `initTranscription` can report it as an unsupported
 * provider instead of it being silently treated as "not set".
 */
export const STT_PROVIDER: SttProvider = (
  process.env.STT_PROVIDER ||
  envConfig.STT_PROVIDER ||
  ''
)
  .trim()
  .toLowerCase() as SttProvider;

/** Transcription model id; falls back to `whisper-1` when neither source provides one. */
export const STT_MODEL =
  process.env.STT_MODEL || envConfig.STT_MODEL || 'whisper-1';
|
||||
export const WATCHDOG_MODE = (process.env.WATCHDOG_MODE || 'auto') as
|
||||
| 'auto'
|
||||
| 'slow'
|
||||
|
|
|
|||
10
src/index.ts
10
src/index.ts
|
|
@ -13,8 +13,11 @@ import {
|
|||
MAIN_GROUP_FOLDER,
|
||||
METRICS_PORT,
|
||||
OPENAI_API_KEY,
|
||||
OPENROUTER_API_KEY,
|
||||
PROJECT_ROOT,
|
||||
TELEGRAM_BOT_TOKEN,
|
||||
STT_MODEL,
|
||||
STT_PROVIDER,
|
||||
TTS_MODE,
|
||||
TMP_DIR,
|
||||
TRIGGER_PATTERN,
|
||||
|
|
@ -844,7 +847,12 @@ async function main(): Promise<void> {
|
|||
await loadState();
|
||||
|
||||
// Initialize voice transcription
|
||||
initTranscription(OPENAI_API_KEY);
|
||||
initTranscription({
|
||||
provider: STT_PROVIDER,
|
||||
model: STT_MODEL,
|
||||
openaiApiKey: OPENAI_API_KEY,
|
||||
openrouterApiKey: OPENROUTER_API_KEY,
|
||||
});
|
||||
|
||||
// Connect memory pool — required (db jail must be running)
|
||||
try {
|
||||
|
|
|
|||
|
|
@ -60,7 +60,13 @@ describe('initTranscription', () => {
|
|||
|
||||
it('does not throw when called with an API key', async () => {
|
||||
const { initTranscription: init } = await import('./transcription.js');
|
||||
expect(() => init('sk-test-key')).not.toThrow();
|
||||
expect(() =>
|
||||
init({
|
||||
provider: 'openai',
|
||||
model: 'whisper-1',
|
||||
openaiApiKey: 'sk-test-key',
|
||||
}),
|
||||
).not.toThrow();
|
||||
});
|
||||
});
|
||||
|
||||
|
|
@ -96,7 +102,7 @@ describe('transcribeAudio — initialized', () => {
|
|||
it('returns transcript text on success', async () => {
|
||||
mockCreate.mockResolvedValue({ text: ' hello world ' });
|
||||
const { initTranscription: init, transcribeAudio: transcribe } = await import('./transcription.js');
|
||||
init('sk-test-key');
|
||||
init({ provider: 'openai', model: 'whisper-1', openaiApiKey: 'sk-test-key' });
|
||||
const result = await transcribe(
|
||||
require('path').resolve(process.cwd(), 'tmp', 'audio.ogg'),
|
||||
);
|
||||
|
|
@ -106,7 +112,7 @@ describe('transcribeAudio — initialized', () => {
|
|||
it('returns null on OpenAI error', async () => {
|
||||
mockCreate.mockRejectedValue(new Error('API error'));
|
||||
const { initTranscription: init, transcribeAudio: transcribe } = await import('./transcription.js');
|
||||
init('sk-test-key');
|
||||
init({ provider: 'openai', model: 'whisper-1', openaiApiKey: 'sk-test-key' });
|
||||
const result = await transcribe(
|
||||
require('path').resolve(process.cwd(), 'tmp', 'audio.ogg'),
|
||||
);
|
||||
|
|
@ -116,7 +122,7 @@ describe('transcribeAudio — initialized', () => {
|
|||
it('calls the transcription API with whisper-1 model', async () => {
|
||||
mockCreate.mockResolvedValue({ text: 'test' });
|
||||
const { initTranscription: init, transcribeAudio: transcribe } = await import('./transcription.js');
|
||||
init('sk-test-key');
|
||||
init({ provider: 'openai', model: 'whisper-1', openaiApiKey: 'sk-test-key' });
|
||||
await transcribe(require('path').resolve(process.cwd(), 'tmp', 'audio.ogg'));
|
||||
expect(mockCreate).toHaveBeenCalledWith(
|
||||
expect.objectContaining({ model: 'whisper-1' }),
|
||||
|
|
@ -126,7 +132,7 @@ describe('transcribeAudio — initialized', () => {
|
|||
it('returns null for empty transcript text', async () => {
|
||||
mockCreate.mockResolvedValue({ text: ' ' });
|
||||
const { initTranscription: init, transcribeAudio: transcribe } = await import('./transcription.js');
|
||||
init('sk-test-key');
|
||||
init({ provider: 'openai', model: 'whisper-1', openaiApiKey: 'sk-test-key' });
|
||||
const result = await transcribe(
|
||||
require('path').resolve(process.cwd(), 'tmp', 'audio.ogg'),
|
||||
);
|
||||
|
|
|
|||
|
|
@ -4,22 +4,99 @@ import fs from 'fs';
|
|||
|
||||
let openaiClient: OpenAI | null = null;
|
||||
let transcriptionReady = false;
|
||||
let transcriptionReason = 'not initialized';
|
||||
let transcriptionProvider: string | null = null;
|
||||
let transcriptionModel: string | null = null;
|
||||
|
||||
/**
 * Configure voice transcription for the selected speech-to-text provider.
 *
 * Every call rebuilds the module state from scratch: the previous client is
 * dropped and readiness is cleared before anything else happens, so a failed
 * re-initialisation can never leave an earlier provider's client behind.
 *
 * Outcomes (recorded in `transcriptionReason`):
 * - no options / empty provider → disabled, `'STT_PROVIDER not set'`
 * - `'openai'`     → requires `openaiApiKey`, otherwise `'missing OPENAI_API_KEY'`
 * - `'openrouter'` → requires `openrouterApiKey`, otherwise
 *   `'missing OPENROUTER_API_KEY'`; the bare model id `whisper-1` is rewritten
 *   to `openai/whisper-1`
 * - any other value → `'unsupported provider: <value>'`
 * On success the reason is `'ok'`.
 *
 * @param options - Provider selection, model id and the API keys available.
 *   An empty `model` falls back to `whisper-1`.
 */
export function initTranscription(options?: {
  provider: 'openai' | 'openrouter' | '';
  model: string;
  openaiApiKey?: string;
  openrouterApiKey?: string;
}): void {
  const provider = options?.provider || '';
  const model = options?.model || 'whisper-1';

  // Clean slate: only a fully successful branch below may flip these back on.
  openaiClient = null;
  transcriptionReady = false;
  transcriptionProvider = provider || null;
  transcriptionModel = model;

  if (!provider) {
    transcriptionReason = 'STT_PROVIDER not set';
    logger.info('Voice transcription disabled (no STT_PROVIDER configured)');
    return;
  }

  if (provider === 'openai') {
    const openaiApiKey = options?.openaiApiKey;
    if (!openaiApiKey) {
      logger.warn('OPENAI_API_KEY not set - voice transcription unavailable');
      transcriptionReason = 'missing OPENAI_API_KEY';
      return;
    }
    openaiClient = new OpenAI({ apiKey: openaiApiKey });
    transcriptionReady = true;
    transcriptionReason = 'ok';
    logger.info({ provider: 'openai', model }, 'Voice transcription initialized');
    return;
  }

  if (provider === 'openrouter') {
    const openrouterApiKey = options?.openrouterApiKey;
    if (!openrouterApiKey) {
      logger.warn('OPENROUTER_API_KEY not set - voice transcription unavailable');
      transcriptionReason = 'missing OPENROUTER_API_KEY';
      return;
    }
    // OpenRouter uses provider-scoped model IDs (e.g. "openai/whisper-1").
    const normalizedModel =
      model === 'whisper-1' ? 'openai/whisper-1' : model;
    transcriptionModel = normalizedModel;
    openaiClient = new OpenAI({
      apiKey: openrouterApiKey,
      baseURL: 'https://openrouter.ai/api/v1',
      defaultHeaders: {
        'HTTP-Referer': 'https://codeberg.org/Clawdie/Clawdie-AI',
        'X-Title': 'mevy-ai',
      },
    });
    transcriptionReady = true;
    transcriptionReason = 'ok';
    logger.info(
      { provider: 'openrouter', model: normalizedModel },
      'Voice transcription initialized (OpenRouter)',
    );
    return;
  }

  // Values outside the declared union can still arrive from an unchecked env cast.
  transcriptionReason = `unsupported provider: ${provider}`;
  logger.warn({ provider }, 'Voice transcription unavailable');
}
|
||||
|
||||
/**
 * Reports whether voice transcription can be used right now.
 *
 * @returns `true` only when the module-level `transcriptionReady` flag is set
 *   AND a client instance is held in `openaiClient`; `false` otherwise.
 */
export function isTranscriptionAvailable(): boolean {
|
||||
return transcriptionReady && !!openaiClient;
|
||||
}
|
||||
|
||||
/**
 * Returns a snapshot of the transcription module state for diagnostics and
 * user-facing messages (the Telegram channel embeds `reason` in its
 * "transcription unavailable" placeholder).
 *
 * @returns An object with:
 *   - `available`: the current result of `isTranscriptionAvailable()`
 *   - `reason`: the module's `transcriptionReason` string (initially
 *     `'not initialized'`; `initTranscription` overwrites it)
 *   - `provider`: the module's `transcriptionProvider`, or `null` when none is recorded
 *   - `model`: the module's `transcriptionModel`, or `null` when none is recorded
 */
export function getTranscriptionStatus(): {
|
||||
available: boolean;
|
||||
reason: string;
|
||||
provider: string | null;
|
||||
model: string | null;
|
||||
} {
|
||||
return {
|
||||
available: isTranscriptionAvailable(),
|
||||
reason: transcriptionReason,
|
||||
provider: transcriptionProvider,
|
||||
model: transcriptionModel,
|
||||
};
|
||||
}
|
||||
|
||||
export async function transcribeAudio(
|
||||
audioPath: string,
|
||||
): Promise<string | null> {
|
||||
|
|
@ -32,7 +109,7 @@ export async function transcribeAudio(
|
|||
const audioFile = fs.createReadStream(audioPath);
|
||||
const response = await openaiClient.audio.transcriptions.create({
|
||||
file: audioFile,
|
||||
model: 'whisper-1',
|
||||
model: transcriptionModel || 'whisper-1',
|
||||
});
|
||||
|
||||
const transcript = response.text.trim();
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue