fix: Improve Whisper server startup with async polling and reduce logs

- Make server startup async to avoid Bun's 10s timeout
- Add frontend polling to detect when server is ready
- Use PowerShell Get-NetTCPConnection for reliable port detection
- Add starting state to prevent multiple simultaneous starts
- Reduce verbose logging, keep only essential info
- Add dev-dist and nul to gitignore
2026-02-14 01:02:54 -06:00
parent 9f1e10b8d5
commit 5be0fb91ab
5 changed files with 180 additions and 73 deletions
--- a/server/whisper_server.py
+++ b/server/whisper_server.py
@@ -79,10 +79,13 @@ def convert_audio_to_wav(input_data: bytes, input_format: str = "webm") -> bytes
 # Configuration
 HOST = "localhost"
 PORT = 4104
-MODEL_SIZE = "large-v3"  # tiny, base, small, medium, large-v2, large-v3
+MODEL_SIZE = "large-v3"  # Best standard model for Spanish
 DEVICE = "cuda"  # cuda or cpu
 COMPUTE_TYPE = "float16"  # float16 for GPU, int8 for CPU

+# Model display name (extract from path if needed)
+MODEL_NAME = MODEL_SIZE.split("/")[-1] if "/" in MODEL_SIZE else MODEL_SIZE
+
 # Spanish context prompt to improve accuracy (Honduras Spanish + tech context)
 INITIAL_PROMPT = """Transcripción en español hondureño de un desarrollador de software.
 Contexto: programación, TypeScript, Vue, Python, comandos de terminal, código.
@@ -109,7 +112,7 @@ async def load_model():
        return model

    model_loading = True
-    print(f"[Whisper] Loading model '{MODEL_SIZE}' on {DEVICE}...")
+    print(f"[Whisper] Loading model '{MODEL_NAME}' on {DEVICE}...")

    try:
        # Load model - this downloads on first run
@@ -140,15 +143,11 @@ def transcribe_audio(audio_data: bytes, language: str = "es", is_webm: bool = Tr
    if model is None:
        return {"error": "Model not loaded"}

-    print(f"[Whisper] Received {len(audio_data)} bytes of audio data")
-
    # Convert WebM to WAV if needed
    if is_webm:
-        print("[Whisper] Converting WebM to WAV...")
        wav_data = convert_audio_to_wav(audio_data, "webm")
        if wav_data is None:
            return {"error": "Failed to convert audio format. Ensure ffmpeg is installed."}
-        print(f"[Whisper] Converted to {len(wav_data)} bytes WAV")
    else:
        wav_data = audio_data

@@ -159,7 +158,6 @@ def transcribe_audio(audio_data: bytes, language: str = "es", is_webm: bool = Tr

    try:
        # Transcribe with optimized parameters
-        print(f"[Whisper] Transcribing {temp_path}...")
        segments, info = model.transcribe(
            temp_path,
            language=language,
@@ -191,7 +189,6 @@ def transcribe_audio(audio_data: bytes, language: str = "es", is_webm: bool = Tr
                "text": segment.text
            })

-        print(f"[Whisper] Transcription result: '{text.strip()}'")

        return {
            "success": True,
@@ -201,7 +198,7 @@ def transcribe_audio(audio_data: bytes, language: str = "es", is_webm: bool = Tr
            "duration": info.duration,
            "segments": segments_list,
            "engine": "whisper-gpu",
-            "model": MODEL_SIZE,
+            "model": MODEL_NAME,
            "device": DEVICE
        }

@@ -218,7 +215,6 @@ def transcribe_audio(audio_data: bytes, language: str = "es", is_webm: bool = Tr

 async def handle_client(websocket):
    """Handle WebSocket client connection"""
-    print(f"[Whisper] Client connected")

    # Ensure model is loaded
    await load_model()
@@ -226,7 +222,7 @@ async def handle_client(websocket):
    # Send ready message
    await websocket.send(json.dumps({
        "type": "ready",
-        "model": MODEL_SIZE,
+        "model": MODEL_NAME,
        "device": DEVICE
    }))

@@ -234,8 +230,6 @@ async def handle_client(websocket):
        async for message in websocket:
            if isinstance(message, bytes):
                # Binary audio data (likely WebM format from browser)
-                print(f"[Whisper] Received {len(message)} bytes of binary audio")
-
                # Transcribe in thread pool to not block
                loop = asyncio.get_event_loop()
                result = await loop.run_in_executor(
@@ -260,8 +254,6 @@ async def handle_client(websocket):
                        language = cmd.get("language", "es")
                        is_partial = cmd.get("partial", False)

-                        print(f"[Whisper] Transcribe request: {len(audio_data)} bytes, lang={language}, partial={is_partial}")
-
                        loop = asyncio.get_event_loop()
                        result = await loop.run_in_executor(
                            None,
@@ -283,7 +275,7 @@ async def handle_client(websocket):
                    elif cmd.get("type") == "status":
                        await websocket.send(json.dumps({
                            "type": "status",
-                            "model": MODEL_SIZE,
+                            "model": MODEL_NAME,
                            "device": DEVICE,
                            "ready": model is not None
                        }))
@@ -295,21 +287,19 @@ async def handle_client(websocket):
                    }))

    except websockets.exceptions.ConnectionClosed:
-        print("[Whisper] Client disconnected")
+        pass
    except Exception as e:
        print(f"[Whisper] Error: {e}")

 async def main():
    """Start WebSocket server"""
-    print(f"[Whisper] Starting server on ws://{HOST}:{PORT}")
-    print(f"[Whisper] Model: {MODEL_SIZE}, Device: {DEVICE}")
+    print(f"[Whisper] Model: {MODEL_NAME} | Device: {DEVICE} | Port: {PORT}")

    # Pre-load model
-    print("[Whisper] Pre-loading model...")
    await load_model()

    async with websockets.serve(handle_client, HOST, PORT):
-        print(f"[Whisper] Server ready! Listening on ws://{HOST}:{PORT}")
+        print(f"[Whisper] Ready")
        await asyncio.Future()  # Run forever

 if __name__ == "__main__":