fix: Improve Whisper transcription with WebM to WAV conversion

- Add ffmpeg conversion from WebM/Opus to WAV (16kHz mono PCM)
- Optimize transcription parameters (VAD, temperature, beam_size)
- Add Honduras Spanish context prompt with local expressions
- Fix chunk accumulation display in voice panel
- Add 1.5s recording buffer after releasing Ctrl+Space
- Skip small audio chunks (<5KB) that cause ffmpeg errors
- Use large-v3 model for better accuracy
2026-02-14 00:16:01 -06:00
parent 638e6ac8e0
commit ac17a9f292
2 changed files with 163 additions and 42 deletions
--- a/frontend/src/components/FloatingVoice.vue
+++ b/frontend/src/components/FloatingVoice.vue
@@ -42,6 +42,7 @@ const connected = ref(false)
 let keyDownTime = 0
 let holdTimeout: number | null = null
 const isPushToTalk = ref(false)
 let pendingWhisperSend = false // Flag to send transcript when Whisper responds
 // ============ WHISPER MODE ============
 const useWhisper = ref(false)
@@ -194,19 +195,27 @@ function connectWhisperSocket() {
          const fullText = msg.text.trim()
          if (msg.partial) {
-            // For partial results, show as interim (will be replaced)
+            // For partial results, show full accumulated transcription
-            // Only show text that's new since last transcription
+            transcript.value = fullText + ' '
-            const newText = fullText.substring(lastTranscriptLength).trim()
+            interimTranscript.value = ''
-            if (newText) {
+            console.log(`[Voice] 🔄 WHISPER partial:`, fullText)
-              interimTranscript.value = newText
+          } else {
-              console.log(`[Voice] 🔄 WHISPER partial:`, newText)
+            // Final result
            }
          } else {
            // Final result - replace everything
            transcript.value = fullText + ' '
            interimTranscript.value = ''
            lastTranscriptLength = 0
            console.log(`[Voice] 🎯 WHISPER-GPU (${msg.model}/${msg.device}):`, fullText)
            // Auto-send if push-to-talk was waiting for this
            if (pendingWhisperSend) {
              pendingWhisperSend = false
              console.log('[Voice] Whisper response received, sending transcript')
              if (transcript.value.trim()) {
                sendTranscriptAndClose()
              } else {
                isPushToTalk.value = false
                close()
              }
            }
          }
          // Update last transcript length for next partial
@@ -214,6 +223,11 @@ function connectWhisperSocket() {
        } else if (msg.error) {
          error.value = msg.error
          console.error('[Voice] Whisper error:', msg.error)
          // Clear pending send on error
          if (pendingWhisperSend) {
            pendingWhisperSend = false
            isPushToTalk.value = false
          }
        }
      }
    } catch (e) {
@@ -263,7 +277,6 @@ async function startWhisperRecording() {
    // Start recording
    mediaRecorder.start(100) // Collect data every 100ms
    isRecording.value = true
    interimTranscript.value = 'Listening (Whisper GPU)...'
    // Send chunks periodically for progressive transcription
    chunkInterval = window.setInterval(() => {
@@ -283,6 +296,16 @@ function sendAudioChunk(isFinal: boolean) {
  // Always send ALL accumulated audio (webm needs header from first chunk)
  const audioBlob = new Blob(audioChunks, { type: 'audio/webm' })
  const chunkCount = audioChunks.length
  // Skip if audio is too small (< 5KB) - WebM header alone is ~1-2KB
  if (audioBlob.size < 5000) {
    console.log(`[Voice] Skipping small chunk (${audioBlob.size} bytes)`)
    if (isFinal) {
      audioChunks = []
    }
    return
  }
  // Clear chunks only if final
  if (isFinal) {
@@ -295,16 +318,13 @@ function sendAudioChunk(isFinal: boolean) {
    const base64 = (reader.result as string).split(',')[1]
    if (whisperSocket?.readyState === WebSocket.OPEN) {
      if (!isFinal) {
        interimTranscript.value = 'Processing...'
      }
      whisperSocket.send(JSON.stringify({
        type: 'transcribe',
        audio: base64,
        language: 'es',
        partial: !isFinal
      }))
-      console.log(`[Voice] Sent ${isFinal ? 'FINAL' : 'partial'} audio (${audioChunks.length} chunks, ${audioBlob.size} bytes)`)
+      console.log(`[Voice] Sent ${isFinal ? 'FINAL' : 'partial'} audio (${chunkCount} chunks, ${audioBlob.size} bytes)`)
    }
  }
  reader.readAsDataURL(audioBlob)
@@ -524,21 +544,31 @@ function handleKeyUp(e: KeyboardEvent) {
      holdTimeout = null
    }
-    // If was push-to-talk recording, stop and send after 1200ms
+    // If was push-to-talk recording, continue recording for 1.5s buffer then stop
    if (isPushToTalk.value && isRecording.value) {
-      console.log('[Voice] Stopping recording, will send in 1200ms')
+      console.log('[Voice] Key released, continuing recording for 1.5s buffer...')
-      stopRecording()
+
      // Keep recording for 1.5s more (UX buffer for trailing words)
      setTimeout(() => {
        console.log('[Voice] Buffer complete, stopping recording')
        stopRecording()
        if (useWhisper.value) {
          // For Whisper: wait for server response (handled in onmessage)
          console.log('[Voice] Waiting for Whisper transcription...')
          pendingWhisperSend = true
        } else {
          // For Web Speech API: send after short delay for final results
          setTimeout(() => {
        console.log('[Voice] Sending transcript:', transcript.value.trim())
        console.log('[Voice] Socket state:', socket?.readyState)
            if (transcript.value.trim()) {
              sendTranscriptAndClose()
            } else {
          // No transcript, just close
              isPushToTalk.value = false
              close()
            }
-      }, 1200)
+          }, 300)
        }
      }, 1500)
    }
    keyDownTime = 0
--- a/server/whisper_server.py
+++ b/server/whisper_server.py
@@ -11,6 +11,7 @@ import io
 import wave
 import tempfile
 import os
 import subprocess
 from pathlib import Path
 try:
@@ -21,13 +22,75 @@ except ImportError as e:
    print("Run: pip install faster-whisper websockets")
    sys.exit(1)
 def convert_audio_to_wav(input_data: bytes, input_format: str = "webm") -> "bytes | None":
     """
     Convert audio data to WAV (16 kHz, mono, 16-bit PCM) using ffmpeg.

     The input bytes are written to a temporary file, ffmpeg is run on it,
     and the resulting WAV file is read back. Both temporary files are
     removed before returning, whether or not the conversion succeeded.

     Args:
         input_data: Raw encoded audio bytes.
         input_format: File extension (without the dot) used for the
             temporary input file, e.g. "webm".

     Returns:
         The converted WAV bytes, or None if ffmpeg is missing, exits with a
         non-zero status, times out (30 s), or any other error occurs.
         Failures are reported with a "[Whisper]" print rather than raised.
     """
     # delete=False: ffmpeg has to open the file by path after we close it.
     with tempfile.NamedTemporaryFile(suffix=f".{input_format}", delete=False) as in_file:
         in_file.write(input_data)
         input_path = in_file.name
     # Append ".wav" instead of str.replace(): replace() rewrites *every*
     # occurrence of the extension (including ones in directory names), and
     # yields output == input when input_format is already "wav".
     output_path = f"{input_path}.wav"
     try:
         # Use ffmpeg to convert to WAV (16kHz mono, which Whisper prefers)
         result = subprocess.run([
             "ffmpeg", "-y",  # Overwrite output
             "-i", input_path,  # Input file
             "-ar", "16000",  # Sample rate 16kHz
             "-ac", "1",  # Mono
             "-c:a", "pcm_s16le",  # PCM 16-bit little-endian
             output_path
         ], capture_output=True, text=True, timeout=30)
         if result.returncode != 0:
             print(f"[Whisper] ffmpeg error: {result.stderr}")
             return None
         # Read the converted WAV file
         with open(output_path, "rb") as f:
             return f.read()
     except subprocess.TimeoutExpired:
         print("[Whisper] ffmpeg conversion timed out")
         return None
     except FileNotFoundError:
         # Raised by subprocess.run when the ffmpeg executable is not on PATH
         print("[Whisper] ffmpeg not found - please install ffmpeg")
         return None
     except Exception as e:
         print(f"[Whisper] Conversion error: {e}")
         return None
     finally:
         # Best-effort cleanup: the output file may not exist if ffmpeg failed.
         # Catch only OSError so KeyboardInterrupt/SystemExit still propagate.
         for path in (input_path, output_path):
             try:
                 os.unlink(path)
             except OSError:
                 pass
 # Configuration
 HOST = "localhost"
 PORT = 4104
-MODEL_SIZE = "medium"  # tiny, base, small, medium, large-v2, large-v3
+MODEL_SIZE = "large-v3"  # tiny, base, small, medium, large-v2, large-v3
 DEVICE = "cuda"  # cuda or cpu
 COMPUTE_TYPE = "float16"  # float16 for GPU, int8 for CPU
 # Spanish context prompt to improve accuracy (Honduras Spanish + tech context)
 INITIAL_PROMPT = """Transcripción en español hondureño de un desarrollador de software.
 Contexto: programación, TypeScript, Vue, Python, comandos de terminal, código.
 Vocabulario técnico: servidor, frontend, backend, chunks, WebSocket, transcripción,
 componente, función, variable, API, modelo, Whisper, Claude, MCP, configuración.
 Expresiones hondureñas: vos, tenés, podés, mirá, pues, verdad, ajá, entonces.
 Diminutivos comunes: ahorita, ratito, prontito, despuesito, chiquito, tantito, poquito."""
 # Global model instance
 model = None
 model_loading = False
@@ -70,29 +133,51 @@ async def load_model():
    model_loading = False
    return model
-def transcribe_audio(audio_data: bytes, language: str = "es") -> dict:
+def transcribe_audio(audio_data: bytes, language: str = "es", is_webm: bool = True) -> dict:
    """Transcribe audio data using Whisper"""
    global model
    if model is None:
        return {"error": "Model not loaded"}
    print(f"[Whisper] Received {len(audio_data)} bytes of audio data")
    # Convert WebM to WAV if needed
    if is_webm:
        print("[Whisper] Converting WebM to WAV...")
        wav_data = convert_audio_to_wav(audio_data, "webm")
        if wav_data is None:
            return {"error": "Failed to convert audio format. Ensure ffmpeg is installed."}
        print(f"[Whisper] Converted to {len(wav_data)} bytes WAV")
    else:
        wav_data = audio_data
    # Save audio to temp file (faster-whisper needs a file path)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-        f.write(audio_data)
+        f.write(wav_data)
        temp_path = f.name
    try:
-        # Transcribe
+        # Transcribe with optimized parameters
        print(f"[Whisper] Transcribing {temp_path}...")
        segments, info = model.transcribe(
            temp_path,
            language=language,
            beam_size=5,
            best_of=5,  # Number of candidates when sampling
            temperature=0.0,  # Use greedy decoding (most accurate)
            vad_filter=True,  # Voice activity detection
            vad_parameters=dict(
-                min_silence_duration_ms=500,
+                min_silence_duration_ms=300,  # Shorter silence detection
-                speech_pad_ms=400
+                speech_pad_ms=200,  # Padding around speech
-            )
+                threshold=0.5  # VAD sensitivity (lower = more sensitive)
            ),
            initial_prompt=INITIAL_PROMPT,  # Context for better Spanish transcription
            condition_on_previous_text=True,  # Use context from previous segments
            no_speech_threshold=0.6,
            log_prob_threshold=-1.0,
            compression_ratio_threshold=2.4,
            word_timestamps=False  # Faster without word-level timestamps
        )
        # Collect all segments
@@ -106,6 +191,8 @@ def transcribe_audio(audio_data: bytes, language: str = "es") -> dict:
                "text": segment.text
            })
        print(f"[Whisper] Transcription result: '{text.strip()}'")
        return {
            "success": True,
            "text": text.strip(),
@@ -119,6 +206,7 @@ def transcribe_audio(audio_data: bytes, language: str = "es") -> dict:
        }
    except Exception as e:
        print(f"[Whisper] Transcription error: {e}")
        return {"error": str(e)}
    finally:
@@ -145,16 +233,14 @@ async def handle_client(websocket):
    try:
        async for message in websocket:
            if isinstance(message, bytes):
-                # Binary audio data
+                # Binary audio data (likely WebM format from browser)
-                print(f"[Whisper] Received {len(message)} bytes of audio")
+                print(f"[Whisper] Received {len(message)} bytes of binary audio")
                # Transcribe in thread pool to not block
                loop = asyncio.get_event_loop()
                result = await loop.run_in_executor(
                    None,
-                    transcribe_audio,
+                    lambda: transcribe_audio(message, "es", is_webm=True)
                    message,
                    "es"  # Default to Spanish
                )
                await websocket.send(json.dumps({
@@ -168,19 +254,24 @@ async def handle_client(websocket):
                    cmd = json.loads(message)
                    if cmd.get("type") == "transcribe":
-                        # Audio data sent as base64
+                        # Audio data sent as base64 (WebM format from browser)
                        import base64
                        audio_data = base64.b64decode(cmd.get("audio", ""))
                        language = cmd.get("language", "es")
                        is_partial = cmd.get("partial", False)
                        print(f"[Whisper] Transcribe request: {len(audio_data)} bytes, lang={language}, partial={is_partial}")
                        loop = asyncio.get_event_loop()
                        result = await loop.run_in_executor(
                            None,
-                            transcribe_audio,
+                            lambda: transcribe_audio(audio_data, language, is_webm=True)
                            audio_data,
                            language
                        )
                        # Add partial flag to result
                        if is_partial:
                            result["partial"] = True
                        await websocket.send(json.dumps({
                            "type": "transcription",
                            **result