From ac17a9f292cbfbd68406df123449edcb7e58eba9 Mon Sep 17 00:00:00 2001 From: josedario87 Date: Sat, 14 Feb 2026 00:16:01 -0600 Subject: [PATCH] fix: Improve Whisper transcription with WebM to WAV conversion - Add ffmpeg conversion from WebM/Opus to WAV (16kHz mono PCM) - Optimize transcription parameters (VAD, temperature, beam_size) - Add Honduras Spanish context prompt with local expressions - Fix chunk accumulation display in voice panel - Add 1.5s recording buffer after releasing Ctrl+Space - Skip small audio chunks (<5KB) that cause ffmpeg errors - Use large-v3 model for better accuracy --- frontend/src/components/FloatingVoice.vue | 82 ++++++++++----- server/whisper_server.py | 123 +++++++++++++++++++--- 2 files changed, 163 insertions(+), 42 deletions(-) diff --git a/frontend/src/components/FloatingVoice.vue b/frontend/src/components/FloatingVoice.vue index 24cc6f0..8952a1a 100644 --- a/frontend/src/components/FloatingVoice.vue +++ b/frontend/src/components/FloatingVoice.vue @@ -42,6 +42,7 @@ const connected = ref(false) let keyDownTime = 0 let holdTimeout: number | null = null const isPushToTalk = ref(false) +let pendingWhisperSend = false // Flag to send transcript when Whisper responds // ============ WHISPER MODE ============ const useWhisper = ref(false) @@ -194,19 +195,27 @@ function connectWhisperSocket() { const fullText = msg.text.trim() if (msg.partial) { - // For partial results, show as interim (will be replaced) - // Only show text that's new since last transcription - const newText = fullText.substring(lastTranscriptLength).trim() - if (newText) { - interimTranscript.value = newText - console.log(`[Voice] 馃攧 WHISPER partial:`, newText) - } - } else { - // Final result - replace everything + // For partial results, show full accumulated transcription + transcript.value = fullText + ' ' + interimTranscript.value = '' + console.log(`[Voice] 馃攧 WHISPER partial:`, fullText) + } else { + // Final result transcript.value = fullText + ' ' interimTranscript.value = '' - lastTranscriptLength = 0 console.log(`[Voice] 馃幆 WHISPER-GPU (${msg.model}/${msg.device}):`, fullText) + + // Auto-send if push-to-talk was waiting for this + if (pendingWhisperSend) { + pendingWhisperSend = false + console.log('[Voice] Whisper response received, sending transcript') + if (transcript.value.trim()) { + sendTranscriptAndClose() + } else { + isPushToTalk.value = false + close() + } + } } // Update last transcript length for next partial @@ -214,6 +223,11 @@ function connectWhisperSocket() { } else if (msg.error) { error.value = msg.error console.error('[Voice] Whisper error:', msg.error) + // Clear pending send on error + if (pendingWhisperSend) { + pendingWhisperSend = false + isPushToTalk.value = false + } } } } catch (e) { @@ -263,7 +277,6 @@ async function startWhisperRecording() { // Start recording mediaRecorder.start(100) // Collect data every 100ms isRecording.value = true - interimTranscript.value = 'Listening (Whisper GPU)...' // Send chunks periodically for progressive transcription chunkInterval = window.setInterval(() => { @@ -283,6 +296,16 @@ function sendAudioChunk(isFinal: boolean) { // Always send ALL accumulated audio (webm needs header from first chunk) const audioBlob = new Blob(audioChunks, { type: 'audio/webm' }) + const chunkCount = audioChunks.length + + // Skip if audio is too small (< 5KB) - WebM header alone is ~1-2KB + if (audioBlob.size < 5000) { + console.log(`[Voice] Skipping small chunk (${audioBlob.size} bytes)`) + if (isFinal) { + audioChunks = [] + } + return + } // Clear chunks only if final if (isFinal) { @@ -295,16 +318,13 @@ function sendAudioChunk(isFinal: boolean) { const base64 = (reader.result as string).split(',')[1] if (whisperSocket?.readyState === WebSocket.OPEN) { - if (!isFinal) { - interimTranscript.value = 'Processing...' - } whisperSocket.send(JSON.stringify({ type: 'transcribe', audio: base64, language: 'es', partial: !isFinal })) - console.log(`[Voice] Sent ${isFinal ? 'FINAL' : 'partial'} audio (${audioChunks.length} chunks, ${audioBlob.size} bytes)`) + console.log(`[Voice] Sent ${isFinal ? 'FINAL' : 'partial'} audio (${chunkCount} chunks, ${audioBlob.size} bytes)`) } } reader.readAsDataURL(audioBlob) @@ -524,21 +544,31 @@ function handleKeyUp(e: KeyboardEvent) { holdTimeout = null } - // If was push-to-talk recording, stop and send after 1200ms + // If was push-to-talk recording, continue recording for 1.5s buffer then stop if (isPushToTalk.value && isRecording.value) { - console.log('[Voice] Stopping recording, will send in 1200ms') - stopRecording() + console.log('[Voice] Key released, continuing recording for 1.5s buffer...') + + // Keep recording for 1.5s more (UX buffer for trailing words) setTimeout(() => { - console.log('[Voice] Sending transcript:', transcript.value.trim()) - console.log('[Voice] Socket state:', socket?.readyState) - if (transcript.value.trim()) { - sendTranscriptAndClose() + console.log('[Voice] Buffer complete, stopping recording') + stopRecording() + + if (useWhisper.value) { + // For Whisper: wait for server response (handled in onmessage) + console.log('[Voice] Waiting for Whisper transcription...') + pendingWhisperSend = true } else { - // No transcript, just close - isPushToTalk.value = false - close() + // For Web Speech API: send after short delay for final results + setTimeout(() => { + if (transcript.value.trim()) { + sendTranscriptAndClose() + } else { + isPushToTalk.value = false + close() + } + }, 300) } - }, 1200) + }, 1500) } keyDownTime = 0 diff --git a/server/whisper_server.py b/server/whisper_server.py index 835bb3e..e63b00f 100644 --- a/server/whisper_server.py +++ b/server/whisper_server.py @@ -11,6 +11,7 @@ import io import wave import tempfile import os +import subprocess from pathlib import Path try: @@ -21,13 +22,75 @@ except ImportError as e: print("Run: pip install faster-whisper websockets") sys.exit(1) + +def convert_audio_to_wav(input_data: bytes, input_format: str = "webm") -> bytes: + """ + Convert audio data to WAV format using ffmpeg. + Whisper requires WAV/PCM format, but browsers typically record in WebM/Opus. + """ + # Create temp files for input and output + with tempfile.NamedTemporaryFile(suffix=f".{input_format}", delete=False) as in_file: + in_file.write(input_data) + input_path = in_file.name + + output_path = input_path.replace(f".{input_format}", ".wav") + + try: + # Use ffmpeg to convert to WAV (16kHz mono, which Whisper prefers) + result = subprocess.run([ + "ffmpeg", "-y", # Overwrite output + "-i", input_path, # Input file + "-ar", "16000", # Sample rate 16kHz + "-ac", "1", # Mono + "-c:a", "pcm_s16le", # PCM 16-bit little-endian + output_path + ], capture_output=True, text=True, timeout=30) + + if result.returncode != 0: + print(f"[Whisper] ffmpeg error: {result.stderr}") + return None + + # Read the converted WAV file + with open(output_path, "rb") as f: + wav_data = f.read() + + return wav_data + + except subprocess.TimeoutExpired: + print("[Whisper] ffmpeg conversion timed out") + return None + except FileNotFoundError: + print("[Whisper] ffmpeg not found - please install ffmpeg") + return None + except Exception as e: + print(f"[Whisper] Conversion error: {e}") + return None + finally: + # Cleanup temp files + try: + os.unlink(input_path) + except: + pass + try: + os.unlink(output_path) + except: + pass + # Configuration HOST = "localhost" PORT = 4104 -MODEL_SIZE = "medium" # tiny, base, small, medium, large-v2, large-v3 +MODEL_SIZE = "large-v3" # tiny, base, small, medium, large-v2, large-v3 DEVICE = "cuda" # cuda or cpu COMPUTE_TYPE = "float16" # float16 for GPU, int8 for CPU +# Spanish context prompt to improve accuracy (Honduras Spanish + tech context) +INITIAL_PROMPT = """Transcripci贸n en espa帽ol hondure帽o de un desarrollador de software. +Contexto: programaci贸n, TypeScript, Vue, Python, comandos de terminal, c贸digo. +Vocabulario t茅cnico: servidor, frontend, backend, chunks, WebSocket, transcripci贸n, +componente, funci贸n, variable, API, modelo, Whisper, Claude, MCP, configuraci贸n. +Expresiones hondure帽as: vos, ten茅s, pod茅s, mir谩, pues, verdad, aj谩, entonces. +Diminutivos comunes: ahorita, ratito, prontito, despuesito, chiquito, tantito, poquito.""" + # Global model instance model = None model_loading = False @@ -70,29 +133,51 @@ async def load_model(): model_loading = False return model -def transcribe_audio(audio_data: bytes, language: str = "es") -> dict: +def transcribe_audio(audio_data: bytes, language: str = "es", is_webm: bool = True) -> dict: """Transcribe audio data using Whisper""" global model if model is None: return {"error": "Model not loaded"} + print(f"[Whisper] Received {len(audio_data)} bytes of audio data") + + # Convert WebM to WAV if needed + if is_webm: + print("[Whisper] Converting WebM to WAV...") + wav_data = convert_audio_to_wav(audio_data, "webm") + if wav_data is None: + return {"error": "Failed to convert audio format. Ensure ffmpeg is installed."} + print(f"[Whisper] Converted to {len(wav_data)} bytes WAV") + else: + wav_data = audio_data + # Save audio to temp file (faster-whisper needs a file path) with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: - f.write(audio_data) + f.write(wav_data) temp_path = f.name try: - # Transcribe + # Transcribe with optimized parameters + print(f"[Whisper] Transcribing {temp_path}...") segments, info = model.transcribe( temp_path, language=language, beam_size=5, + best_of=5, # Number of candidates when sampling + temperature=0.0, # Use greedy decoding (most accurate) vad_filter=True, # Voice activity detection vad_parameters=dict( - min_silence_duration_ms=500, - speech_pad_ms=400 - ) + min_silence_duration_ms=300, # Shorter silence detection + speech_pad_ms=200, # Padding around speech + threshold=0.5 # VAD sensitivity (lower = more sensitive) + ), + initial_prompt=INITIAL_PROMPT, # Context for better Spanish transcription + condition_on_previous_text=True, # Use context from previous segments + no_speech_threshold=0.6, + log_prob_threshold=-1.0, + compression_ratio_threshold=2.4, + word_timestamps=False # Faster without word-level timestamps ) # Collect all segments @@ -106,6 +191,8 @@ def transcribe_audio(audio_data: bytes, language: str = "es") -> dict: "text": segment.text }) + print(f"[Whisper] Transcription result: '{text.strip()}'") + return { "success": True, "text": text.strip(), @@ -119,6 +206,7 @@ def transcribe_audio(audio_data: bytes, language: str = "es") -> dict: } except Exception as e: + print(f"[Whisper] Transcription error: {e}") return {"error": str(e)} finally: @@ -145,16 +233,14 @@ async def handle_client(websocket): try: async for message in websocket: if isinstance(message, bytes): - # Binary audio data - print(f"[Whisper] Received {len(message)} bytes of audio") + # Binary audio data (likely WebM format from browser) + print(f"[Whisper] Received {len(message)} bytes of binary audio") # Transcribe in thread pool to not block loop = asyncio.get_event_loop() result = await loop.run_in_executor( None, - transcribe_audio, - message, - "es" # Default to Spanish + lambda: transcribe_audio(message, "es", is_webm=True) ) await websocket.send(json.dumps({ @@ -168,19 +254,24 @@ async def handle_client(websocket): cmd = json.loads(message) if cmd.get("type") == "transcribe": - # Audio data sent as base64 + # Audio data sent as base64 (WebM format from browser) import base64 audio_data = base64.b64decode(cmd.get("audio", "")) language = cmd.get("language", "es") + is_partial = cmd.get("partial", False) + + print(f"[Whisper] Transcribe request: {len(audio_data)} bytes, lang={language}, partial={is_partial}") loop = asyncio.get_event_loop() result = await loop.run_in_executor( None, - transcribe_audio, - audio_data, - language + lambda: transcribe_audio(audio_data, language, is_webm=True) ) + # Add partial flag to result + if is_partial: + result["partial"] = True + await websocket.send(json.dumps({ "type": "transcription", **result