diff --git a/gateway/run.py b/gateway/run.py index 984ce1f0a..54de48e66 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -11538,7 +11538,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew # when we successfully transcribed the audio — it's redundant. _placeholder = "(The user sent a message with no text content)" if user_text and user_text.strip() == _placeholder: - return prefix + return prefix, successful_transcripts if user_text: return f"{prefix}\n\n{user_text}", successful_transcripts return prefix, successful_transcripts diff --git a/tests/gateway/test_stt_config.py b/tests/gateway/test_stt_config.py index 004dd907e..6f98a058d 100644 --- a/tests/gateway/test_stt_config.py +++ b/tests/gateway/test_stt_config.py @@ -102,6 +102,45 @@ async def test_enrich_message_with_transcription_avoids_bogus_no_provider_messag assert transcripts == [] +@pytest.mark.asyncio +async def test_enrich_message_with_transcription_returns_tuple_for_empty_content_placeholder(): + """A successful transcription whose caption is the empty-content placeholder + must still return the ``(text, transcripts)`` tuple. + + The Discord adapter delivers a captionless voice note as the literal + ``"(The user sent a message with no text content)"`` placeholder. When STT + succeeds we strip that redundant placeholder and return just the transcript + prefix — but the method's contract (and every caller, which unpacks the + result as ``text, transcripts = ...``) requires a 2-tuple. Returning a bare + string here raised ``ValueError: too many values to unpack`` and dropped the + whole voice message on the floor. + """ + from gateway.run import GatewayRunner + + runner = GatewayRunner.__new__(GatewayRunner) + runner.config = GatewayConfig(stt_enabled=True) + runner._has_setup_skill = lambda: False + + with patch( + "tools.transcription_tools.transcribe_audio", + return_value={ + "success": True, + "transcript": "hello from a captionless voice note", + "provider": "local_command", + }, + ): + result, transcripts = await runner._enrich_message_with_transcription( + "(The user sent a message with no text content)", + ["/tmp/voice.ogg"], + ) + + # The redundant placeholder is stripped, leaving only the transcript prefix. + assert "hello from a captionless voice note" in result + assert "(The user sent a message with no text content)" not in result + # Crucially, the transcripts are still surfaced so callers can echo them. + assert transcripts == ["hello from a captionless voice note"] + + @pytest.mark.asyncio async def test_prepare_inbound_message_text_transcribes_queued_voice_event(): from gateway.run import GatewayRunner