From ac17a9f292cbfbd68406df123449edcb7e58eba9 Mon Sep 17 00:00:00 2001
From: josedario87 <jodarioel87@gmail.com>
Date: Sat, 14 Feb 2026 00:16:01 -0600
Subject: [PATCH] fix: Improve Whisper transcription with WebM to WAV
 conversion

- Add ffmpeg conversion from WebM/Opus to WAV (16kHz mono PCM)
- Tune transcription parameters (VAD, temperature, best_of, decoding thresholds)
- Add Honduras Spanish context prompt with local expressions
- Fix chunk accumulation display in voice panel
- Add 1.5s recording buffer after releasing Ctrl+Space
- Skip small audio chunks (<5KB) that cause ffmpeg errors
- Use large-v3 model for better accuracy
---
 frontend/src/components/FloatingVoice.vue |  82 ++++++++++-----
 server/whisper_server.py                  | 123 +++++++++++++++++++---
 2 files changed, 163 insertions(+), 42 deletions(-)

diff --git a/frontend/src/components/FloatingVoice.vue b/frontend/src/components/FloatingVoice.vue
index 24cc6f0..8952a1a 100644
--- a/frontend/src/components/FloatingVoice.vue
+++ b/frontend/src/components/FloatingVoice.vue
@@ -42,6 +42,7 @@ const connected = ref(false)
 let keyDownTime = 0
 let holdTimeout: number | null = null
 const isPushToTalk = ref(false)
+let pendingWhisperSend = false // Flag to send transcript when Whisper responds
 
 // ============ WHISPER MODE ============
 const useWhisper = ref(false)
@@ -194,19 +195,27 @@ function connectWhisperSocket() {
           const fullText = msg.text.trim()
 
           if (msg.partial) {
-            // For partial results, show as interim (will be replaced)
-            // Only show text that's new since last transcription
-            const newText = fullText.substring(lastTranscriptLength).trim()
-            if (newText) {
-              interimTranscript.value = newText
-              console.log(`[Voice] 🔄 WHISPER partial:`, newText)
-            }
-          } else {
-            // Final result - replace everything
+            // For partial results, show full accumulated transcription
+            transcript.value = fullText + ' '
+            interimTranscript.value = ''
+            console.log(`[Voice] 🔄 WHISPER partial:`, fullText)
+          } else {
+            // Final result
             transcript.value = fullText + ' '
             interimTranscript.value = ''
-            lastTranscriptLength = 0
             console.log(`[Voice] 🎯 WHISPER-GPU (${msg.model}/${msg.device}):`, fullText)
+
+            // Auto-send if push-to-talk was waiting for this
+            if (pendingWhisperSend) {
+              pendingWhisperSend = false
+              console.log('[Voice] Whisper response received, sending transcript')
+              if (transcript.value.trim()) {
+                sendTranscriptAndClose()
+              } else {
+                isPushToTalk.value = false
+                close()
+              }
+            }
           }
 
           // Update last transcript length for next partial
@@ -214,6 +223,11 @@ function connectWhisperSocket() {
         } else if (msg.error) {
           error.value = msg.error
           console.error('[Voice] Whisper error:', msg.error)
+          // Clear pending send on error
+          if (pendingWhisperSend) {
+            pendingWhisperSend = false
+            isPushToTalk.value = false
+          }
         }
       }
     } catch (e) {
@@ -263,7 +277,6 @@ async function startWhisperRecording() {
     // Start recording
     mediaRecorder.start(100) // Collect data every 100ms
     isRecording.value = true
-    interimTranscript.value = 'Listening (Whisper GPU)...'
 
     // Send chunks periodically for progressive transcription
     chunkInterval = window.setInterval(() => {
@@ -283,6 +296,16 @@ function sendAudioChunk(isFinal: boolean) {
 
   // Always send ALL accumulated audio (webm needs header from first chunk)
   const audioBlob = new Blob(audioChunks, { type: 'audio/webm' })
+  const chunkCount = audioChunks.length
+
+  // Skip if audio is too small (< 5KB) - WebM header alone is ~1-2KB
+  if (audioBlob.size < 5000) {
+    console.log(`[Voice] Skipping small chunk (${audioBlob.size} bytes)`)
+    if (isFinal) {
+      audioChunks = []
+    }
+    return
+  }
 
   // Clear chunks only if final
   if (isFinal) {
@@ -295,16 +318,13 @@ function sendAudioChunk(isFinal: boolean) {
     const base64 = (reader.result as string).split(',')[1]
 
     if (whisperSocket?.readyState === WebSocket.OPEN) {
-      if (!isFinal) {
-        interimTranscript.value = 'Processing...'
-      }
       whisperSocket.send(JSON.stringify({
         type: 'transcribe',
         audio: base64,
         language: 'es',
         partial: !isFinal
       }))
-      console.log(`[Voice] Sent ${isFinal ? 'FINAL' : 'partial'} audio (${audioChunks.length} chunks, ${audioBlob.size} bytes)`)
+      console.log(`[Voice] Sent ${isFinal ? 'FINAL' : 'partial'} audio (${chunkCount} chunks, ${audioBlob.size} bytes)`)
     }
   }
   reader.readAsDataURL(audioBlob)
@@ -524,21 +544,31 @@ function handleKeyUp(e: KeyboardEvent) {
       holdTimeout = null
     }
 
-    // If was push-to-talk recording, stop and send after 1200ms
+    // If was push-to-talk recording, continue recording for 1.5s buffer then stop
     if (isPushToTalk.value && isRecording.value) {
-      console.log('[Voice] Stopping recording, will send in 1200ms')
-      stopRecording()
+      console.log('[Voice] Key released, continuing recording for 1.5s buffer...')
+
+      // Keep recording for 1.5s more (UX buffer for trailing words)
       setTimeout(() => {
-        console.log('[Voice] Sending transcript:', transcript.value.trim())
-        console.log('[Voice] Socket state:', socket?.readyState)
-        if (transcript.value.trim()) {
-          sendTranscriptAndClose()
+        console.log('[Voice] Buffer complete, stopping recording')
+        stopRecording()
+
+        if (useWhisper.value) {
+          // For Whisper: wait for server response (handled in onmessage)
+          console.log('[Voice] Waiting for Whisper transcription...')
+          pendingWhisperSend = true
         } else {
-          // No transcript, just close
-          isPushToTalk.value = false
-          close()
+          // For Web Speech API: send after short delay for final results
+          setTimeout(() => {
+            if (transcript.value.trim()) {
+              sendTranscriptAndClose()
+            } else {
+              isPushToTalk.value = false
+              close()
+            }
+          }, 300)
         }
-      }, 1200)
+      }, 1500)
     }
 
     keyDownTime = 0
diff --git a/server/whisper_server.py b/server/whisper_server.py
index 835bb3e..e63b00f 100644
--- a/server/whisper_server.py
+++ b/server/whisper_server.py
@@ -11,6 +11,7 @@ import io
 import wave
 import tempfile
 import os
+import subprocess
 from pathlib import Path
 
 try:
@@ -21,13 +22,75 @@ except ImportError as e:
     print("Run: pip install faster-whisper websockets")
     sys.exit(1)
 
+
+def convert_audio_to_wav(input_data: bytes, input_format: str = "webm") -> "bytes | None":
+    """
+    Convert audio bytes to 16 kHz mono 16-bit PCM WAV by shelling out to ffmpeg.
+
+    Args:
+        input_data: Raw encoded audio (e.g. WebM/Opus recorded by a browser).
+        input_format: File extension given to the temporary input file.
+
+    Returns:
+        The WAV bytes, or None if ffmpeg is missing, fails, or times out.
+    """
+    # delete=False: ffmpeg must reopen the file by path after we close it.
+    with tempfile.NamedTemporaryFile(suffix=f".{input_format}", delete=False) as in_file:
+        in_file.write(input_data)
+        input_path = in_file.name
+    # splitext swaps only the trailing extension; str.replace would also rewrite
+    # a matching ".<format>" substring elsewhere in the temp directory path.
+    output_path = os.path.splitext(input_path)[0] + ".wav"
+
+    try:
+        result = subprocess.run([
+            "ffmpeg", "-y",  # Overwrite output
+            "-i", input_path,  # Input file
+            "-ar", "16000",  # Sample rate 16kHz
+            "-ac", "1",  # Mono
+            "-c:a", "pcm_s16le",  # PCM 16-bit little-endian
+            output_path
+        ], capture_output=True, text=True, timeout=30)
+
+        if result.returncode != 0:
+            print(f"[Whisper] ffmpeg error: {result.stderr}")
+            return None
+
+        with open(output_path, "rb") as f:
+            return f.read()
+
+    except subprocess.TimeoutExpired:
+        print("[Whisper] ffmpeg conversion timed out")
+        return None
+    except FileNotFoundError:
+        print("[Whisper] ffmpeg not found - please install ffmpeg")
+        return None
+    except Exception as e:
+        print(f"[Whisper] Conversion error: {e}")
+        return None
+    finally:
+        # Best-effort cleanup; the output file is absent when ffmpeg failed.
+        for path in (input_path, output_path):
+            try:
+                os.unlink(path)
+            except OSError:
+                pass
+
 # Configuration
 HOST = "localhost"
 PORT = 4104
-MODEL_SIZE = "medium"  # tiny, base, small, medium, large-v2, large-v3
+MODEL_SIZE = "large-v3"  # tiny, base, small, medium, large-v2, large-v3
 DEVICE = "cuda"  # cuda or cpu
 COMPUTE_TYPE = "float16"  # float16 for GPU, int8 for CPU
 
+# Spanish context prompt to improve accuracy (Honduras Spanish + tech context)
+INITIAL_PROMPT = """Transcripción en español hondureño de un desarrollador de software.
+Contexto: programación, TypeScript, Vue, Python, comandos de terminal, código.
+Vocabulario técnico: servidor, frontend, backend, chunks, WebSocket, transcripción,
+componente, función, variable, API, modelo, Whisper, Claude, MCP, configuración.
+Expresiones hondureñas: vos, tenés, podés, mirá, pues, verdad, ajá, entonces.
+Diminutivos comunes: ahorita, ratito, prontito, despuesito, chiquito, tantito, poquito."""
+
 # Global model instance
 model = None
 model_loading = False
@@ -70,29 +133,51 @@ async def load_model():
     model_loading = False
     return model
 
-def transcribe_audio(audio_data: bytes, language: str = "es") -> dict:
+def transcribe_audio(audio_data: bytes, language: str = "es", is_webm: bool = True) -> dict:
     """Transcribe audio data using Whisper"""
     global model
 
     if model is None:
         return {"error": "Model not loaded"}
 
+    print(f"[Whisper] Received {len(audio_data)} bytes of audio data")
+
+    # Convert WebM to WAV if needed
+    if is_webm:
+        print("[Whisper] Converting WebM to WAV...")
+        wav_data = convert_audio_to_wav(audio_data, "webm")
+        if wav_data is None:
+            return {"error": "Failed to convert audio format. Ensure ffmpeg is installed."}
+        print(f"[Whisper] Converted to {len(wav_data)} bytes WAV")
+    else:
+        wav_data = audio_data
+
     # Save audio to temp file (faster-whisper needs a file path)
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-        f.write(audio_data)
+        f.write(wav_data)
         temp_path = f.name
 
     try:
-        # Transcribe
+        # Transcribe with optimized parameters
+        print(f"[Whisper] Transcribing {temp_path}...")
         segments, info = model.transcribe(
             temp_path,
             language=language,
             beam_size=5,
+            best_of=5,  # Number of candidates when sampling
+            temperature=0.0,  # Use greedy decoding (most accurate)
             vad_filter=True,  # Voice activity detection
             vad_parameters=dict(
-                min_silence_duration_ms=500,
-                speech_pad_ms=400
-            )
+                min_silence_duration_ms=300,  # Shorter silence detection
+                speech_pad_ms=200,  # Padding around speech
+                threshold=0.5  # VAD sensitivity (lower = more sensitive)
+            ),
+            initial_prompt=INITIAL_PROMPT,  # Context for better Spanish transcription
+            condition_on_previous_text=True,  # Use context from previous segments
+            no_speech_threshold=0.6,
+            log_prob_threshold=-1.0,
+            compression_ratio_threshold=2.4,
+            word_timestamps=False  # Faster without word-level timestamps
         )
 
         # Collect all segments
@@ -106,6 +191,8 @@ def transcribe_audio(audio_data: bytes, language: str = "es") -> dict:
                 "text": segment.text
             })
 
+        print(f"[Whisper] Transcription result: '{text.strip()}'")
+
         return {
             "success": True,
             "text": text.strip(),
@@ -119,6 +206,7 @@ def transcribe_audio(audio_data: bytes, language: str = "es") -> dict:
         }
 
     except Exception as e:
+        print(f"[Whisper] Transcription error: {e}")
         return {"error": str(e)}
 
     finally:
@@ -145,16 +233,14 @@ async def handle_client(websocket):
     try:
         async for message in websocket:
             if isinstance(message, bytes):
-                # Binary audio data
-                print(f"[Whisper] Received {len(message)} bytes of audio")
+                # Binary audio data (likely WebM format from browser)
+                print(f"[Whisper] Received {len(message)} bytes of binary audio")
 
                 # Transcribe in thread pool to not block
                 loop = asyncio.get_event_loop()
                 result = await loop.run_in_executor(
                     None,
-                    transcribe_audio,
-                    message,
-                    "es"  # Default to Spanish
+                    lambda: transcribe_audio(message, "es", is_webm=True)
                 )
 
                 await websocket.send(json.dumps({
@@ -168,19 +254,24 @@ async def handle_client(websocket):
                     cmd = json.loads(message)
 
                     if cmd.get("type") == "transcribe":
-                        # Audio data sent as base64
+                        # Audio data sent as base64 (WebM format from browser)
                         import base64
                         audio_data = base64.b64decode(cmd.get("audio", ""))
                         language = cmd.get("language", "es")
+                        is_partial = cmd.get("partial", False)
+
+                        print(f"[Whisper] Transcribe request: {len(audio_data)} bytes, lang={language}, partial={is_partial}")
 
                         loop = asyncio.get_event_loop()
                         result = await loop.run_in_executor(
                             None,
-                            transcribe_audio,
-                            audio_data,
-                            language
+                            lambda: transcribe_audio(audio_data, language, is_webm=True)
                         )
 
+                        # Add partial flag to result
+                        if is_partial:
+                            result["partial"] = True
+
                         await websocket.send(json.dumps({
                             "type": "transcription",
                             **result