fix: Improve Whisper transcription with WebM to WAV conversion

- Add ffmpeg conversion from WebM/Opus to WAV (16kHz mono PCM)
- Optimize transcription parameters (VAD, temperature, beam_size)
- Add Honduras Spanish context prompt with local expressions
- Fix chunk accumulation display in voice panel
- Add 1.5s recording buffer after releasing Ctrl+Space
- Skip small audio chunks (<5KB) that cause ffmpeg errors
- Use large-v3 model for better accuracy
2026-02-14 00:16:01 -06:00
parent 638e6ac8e0
commit ac17a9f292
2 changed files with 163 additions and 42 deletions
--- a/server/whisper_server.py
+++ b/server/whisper_server.py
@@ -11,6 +11,7 @@ import io
 import wave
 import tempfile
 import os
+import subprocess
 from pathlib import Path

 try:
@@ -21,13 +22,75 @@ except ImportError as e:
    print("Run: pip install faster-whisper websockets")
    sys.exit(1)

+
+def convert_audio_to_wav(input_data: bytes, input_format: str = "webm") -> "bytes | None":
+    """
+    Convert audio data to WAV (16 kHz mono, 16-bit PCM) using the ffmpeg CLI.
+    Whisper requires WAV/PCM format, but browsers typically record in WebM/Opus.
+
+    Args:
+        input_data: Raw encoded audio bytes.
+        input_format: File extension (without dot) given to the temp input file.
+    Returns:
+        The WAV file contents, or None if ffmpeg is missing, exits non-zero,
+        times out (30 s) or any other error occurs; the reason is printed.
+    """
+    with tempfile.NamedTemporaryFile(suffix=f".{input_format}", delete=False) as in_file:
+        in_file.write(input_data)
+        input_path = in_file.name
+
+    # Append ".wav": unlike str.replace(), this never equals the input path or edits the directory part.
+    output_path = input_path + ".wav"
+
+    try:
+        # Use ffmpeg to convert to WAV (16kHz mono, which Whisper prefers)
+        result = subprocess.run([
+            "ffmpeg", "-y",  # Overwrite output
+            "-i", input_path,  # Input file
+            "-ar", "16000",  # Sample rate 16kHz
+            "-ac", "1",  # Mono
+            "-c:a", "pcm_s16le",  # PCM 16-bit little-endian
+            output_path
+        ], capture_output=True, text=True, timeout=30)
+
+        if result.returncode != 0:
+            print(f"[Whisper] ffmpeg error: {result.stderr}")
+            return None
+
+        with open(output_path, "rb") as f:
+            return f.read()
+    except subprocess.TimeoutExpired:
+        print("[Whisper] ffmpeg conversion timed out")
+        return None
+    except FileNotFoundError:
+        print("[Whisper] ffmpeg not found - please install ffmpeg")
+        return None
+    except Exception as e:
+        print(f"[Whisper] Conversion error: {e}")
+        return None
+    finally:
+        # Best-effort cleanup; the output file may not exist if ffmpeg failed
+        for path in (input_path, output_path):
+            try:
+                os.unlink(path)
+            except OSError:
+                pass
+
 # Configuration
 HOST = "localhost"
 PORT = 4104
-MODEL_SIZE = "medium"  # tiny, base, small, medium, large-v2, large-v3
+MODEL_SIZE = "large-v3"  # tiny, base, small, medium, large-v2, large-v3
 DEVICE = "cuda"  # cuda or cpu
 COMPUTE_TYPE = "float16"  # float16 for GPU, int8 for CPU

+# Spanish context prompt to improve accuracy (Honduras Spanish + tech context)
+INITIAL_PROMPT = """Transcripción en español hondureño de un desarrollador de software.
+Contexto: programación, TypeScript, Vue, Python, comandos de terminal, código.
+Vocabulario técnico: servidor, frontend, backend, chunks, WebSocket, transcripción,
+componente, función, variable, API, modelo, Whisper, Claude, MCP, configuración.
+Expresiones hondureñas: vos, tenés, podés, mirá, pues, verdad, ajá, entonces.
+Diminutivos comunes: ahorita, ratito, prontito, despuesito, chiquito, tantito, poquito."""
+
 # Global model instance
 model = None
 model_loading = False
@@ -70,29 +133,51 @@ async def load_model():
    model_loading = False
    return model

-def transcribe_audio(audio_data: bytes, language: str = "es") -> dict:
+def transcribe_audio(audio_data: bytes, language: str = "es", is_webm: bool = True) -> dict:
    """Transcribe audio data using Whisper"""
    global model

    if model is None:
        return {"error": "Model not loaded"}

+    print(f"[Whisper] Received {len(audio_data)} bytes of audio data")
+
+    # Convert WebM to WAV if needed
+    if is_webm:
+        print("[Whisper] Converting WebM to WAV...")
+        wav_data = convert_audio_to_wav(audio_data, "webm")
+        if wav_data is None:
+            return {"error": "Failed to convert audio format. Ensure ffmpeg is installed."}
+        print(f"[Whisper] Converted to {len(wav_data)} bytes WAV")
+    else:
+        wav_data = audio_data
+
    # Save audio to temp file (faster-whisper needs a file path)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-        f.write(audio_data)
+        f.write(wav_data)
        temp_path = f.name

    try:
-        # Transcribe
+        # Transcribe with optimized parameters
+        print(f"[Whisper] Transcribing {temp_path}...")
        segments, info = model.transcribe(
            temp_path,
            language=language,
            beam_size=5,
+            best_of=5,  # Number of candidates when sampling
+            temperature=0.0,  # Use greedy decoding (most accurate)
            vad_filter=True,  # Voice activity detection
            vad_parameters=dict(
-                min_silence_duration_ms=500,
-                speech_pad_ms=400
-            )
+                min_silence_duration_ms=300,  # Shorter silence detection
+                speech_pad_ms=200,  # Padding around speech
+                threshold=0.5  # VAD sensitivity (lower = more sensitive)
+            ),
+            initial_prompt=INITIAL_PROMPT,  # Context for better Spanish transcription
+            condition_on_previous_text=True,  # Use context from previous segments
+            no_speech_threshold=0.6,
+            log_prob_threshold=-1.0,
+            compression_ratio_threshold=2.4,
+            word_timestamps=False  # Faster without word-level timestamps
        )

        # Collect all segments
@@ -106,6 +191,8 @@ def transcribe_audio(audio_data: bytes, language: str = "es") -> dict:
                "text": segment.text
            })

+        print(f"[Whisper] Transcription result: '{text.strip()}'")
+
        return {
            "success": True,
            "text": text.strip(),
@@ -119,6 +206,7 @@ def transcribe_audio(audio_data: bytes, language: str = "es") -> dict:
        }

    except Exception as e:
+        print(f"[Whisper] Transcription error: {e}")
        return {"error": str(e)}

    finally:
@@ -145,16 +233,14 @@ async def handle_client(websocket):
    try:
        async for message in websocket:
            if isinstance(message, bytes):
-                # Binary audio data
-                print(f"[Whisper] Received {len(message)} bytes of audio")
+                # Binary audio data (likely WebM format from browser)
+                print(f"[Whisper] Received {len(message)} bytes of binary audio")

                # Transcribe in thread pool to not block
                loop = asyncio.get_event_loop()
                result = await loop.run_in_executor(
                    None,
-                    transcribe_audio,
-                    message,
-                    "es"  # Default to Spanish
+                    lambda: transcribe_audio(message, "es", is_webm=True)
                )

                await websocket.send(json.dumps({
@@ -168,19 +254,24 @@ async def handle_client(websocket):
                    cmd = json.loads(message)

                    if cmd.get("type") == "transcribe":
-                        # Audio data sent as base64
+                        # Audio data sent as base64 (WebM format from browser)
                        import base64
                        audio_data = base64.b64decode(cmd.get("audio", ""))
                        language = cmd.get("language", "es")
+                        is_partial = cmd.get("partial", False)
+
+                        print(f"[Whisper] Transcribe request: {len(audio_data)} bytes, lang={language}, partial={is_partial}")

                        loop = asyncio.get_event_loop()
                        result = await loop.run_in_executor(
                            None,
-                            transcribe_audio,
-                            audio_data,
-                            language
+                            lambda: transcribe_audio(audio_data, language, is_webm=True)
                        )

+                        # Add partial flag to result
+                        if is_partial:
+                            result["partial"] = True
+
                        await websocket.send(json.dumps({
                            "type": "transcription",
                            **result