#!/usr/bin/env python3
"""
Whisper Server - GPU-accelerated speech-to-text using faster-whisper
WebSocket server that receives audio and returns transcriptions

Protocol (as implemented below):
  * binary frame                      -> transcribed with language "es"
  * {"type": "transcribe", "audio": <base64>, "language": <code>}
  * {"type": "ping"}                  -> {"type": "pong"}
  * {"type": "status"}                -> model / device / ready flag
Every transcription reply is a JSON object with "type": "transcription"
merged with the dict returned by transcribe_audio().
"""

import asyncio
import base64
import json
import sys
import io
import wave
import tempfile
import os
from pathlib import Path

try:
    import websockets
    from faster_whisper import WhisperModel
except ImportError as e:
    print(f"Missing dependency: {e}")
    print("Run: pip install faster-whisper websockets")
    sys.exit(1)

# Configuration
HOST = "localhost"
PORT = 4104
MODEL_SIZE = "medium"  # tiny, base, small, medium, large-v2, large-v3
DEVICE = "cuda"  # cuda or cpu
COMPUTE_TYPE = "float16"  # float16 for GPU, int8 for CPU

# Global model instance
model = None
model_loading = False
# Device the model was actually loaded on; becomes "cpu" when the
# configured DEVICE fails and load_model() falls back.
active_device = DEVICE


async def load_model():
    """Load the Whisper model once and return the shared instance.

    The first call builds the model on DEVICE with COMPUTE_TYPE. If that
    raises, a second attempt is made on the CPU with int8 and
    ``active_device`` is updated accordingly. Calls made while a load is
    in progress poll until it finishes.

    Returns:
        The loaded WhisperModel. A caller that was waiting on a load that
        failed gets ``None``.

    Raises:
        Exception: whatever the CPU fallback raises if it fails as well.
    """
    global model, model_loading, active_device

    if model is not None:
        return model

    if model_loading:
        # Wait for model to finish loading
        while model_loading:
            await asyncio.sleep(0.1)
        return model

    model_loading = True
    print(f"[Whisper] Loading model '{MODEL_SIZE}' on {DEVICE}...")
    download_root = str(Path.home() / ".cache" / "whisper")

    try:
        try:
            # Load model - this downloads on first run
            model = WhisperModel(
                MODEL_SIZE,
                device=DEVICE,
                compute_type=COMPUTE_TYPE,
                download_root=download_root,
            )
            print("[Whisper] Model loaded successfully!")
        except Exception as e:
            print(f"[Whisper] Error loading model: {e}")
            print("[Whisper] Falling back to CPU...")
            model = WhisperModel(
                MODEL_SIZE,
                device="cpu",
                compute_type="int8",
                download_root=download_root,
            )
            active_device = "cpu"
    finally:
        # Always clear the flag: if the fallback raises too, later callers
        # must not spin forever in the wait loop above.
        model_loading = False

    return model


def transcribe_audio(audio_data: bytes, language: str = "es") -> dict:
    """Transcribe audio data using Whisper.

    Args:
        audio_data: Raw bytes of the audio file; written unchanged to a
            temporary ``.wav`` file whose path is handed to the model.
        language: Language code passed to ``model.transcribe``.

    Returns:
        On success, a dict with ``success``, ``text``, ``language``,
        ``language_probability``, ``duration``, ``segments``, ``engine``,
        ``model`` and ``device``. On any failure (including the model not
        being loaded), a dict with a single ``error`` key.
    """
    if model is None:
        return {"error": "Model not loaded"}

    temp_path = None
    try:
        # Save audio to a temp file and pass its path to the model.
        # delete=False so the file survives the ``with`` block; it is
        # removed in ``finally`` below.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            temp_path = f.name
            f.write(audio_data)

        # Transcribe
        segments, info = model.transcribe(
            temp_path,
            language=language,
            beam_size=5,
            vad_filter=True,  # Voice activity detection
            vad_parameters=dict(
                min_silence_duration_ms=500,
                speech_pad_ms=400,
            ),
        )

        # Collect all segments (iterating inside the try so errors raised
        # while consuming ``segments`` are reported as an error dict too)
        segments_list = [
            {
                "start": segment.start,
                "end": segment.end,
                "text": segment.text,
            }
            for segment in segments
        ]
        text = " ".join(item["text"] for item in segments_list)

        return {
            "success": True,
            "text": text.strip(),
            "language": info.language,
            "language_probability": info.language_probability,
            "duration": info.duration,
            "segments": segments_list,
            "engine": "whisper-gpu",
            "model": MODEL_SIZE,
            "device": active_device,
        }

    except Exception as e:
        return {"error": str(e)}

    finally:
        # Cleanup temp file (best effort)
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError:
                pass


async def _send_error(websocket, message):
    """Send a ``{"type": "error"}`` reply carrying *message*."""
    await websocket.send(json.dumps({
        "type": "error",
        "message": message,
    }))


async def _transcribe_and_send(websocket, audio_data, language):
    """Run transcribe_audio in the default executor and send the result."""
    # Transcribe in thread pool to not block the event loop
    loop = asyncio.get_running_loop()
    result = await loop.run_in_executor(
        None,
        transcribe_audio,
        audio_data,
        language,
    )
    await websocket.send(json.dumps({
        "type": "transcription",
        **result,
    }))


async def handle_client(websocket):
    """Handle one WebSocket client connection.

    Sends a "ready" message, then serves messages until the client
    disconnects. Binary frames are transcribed as Spanish; text frames
    are parsed as JSON commands ("transcribe", "ping", "status").
    Commands with any other "type" are ignored. Invalid JSON, a JSON
    value that is not an object, or undecodable base64 audio produce an
    "error" reply and the connection stays open.
    """
    print("[Whisper] Client connected")

    # Ensure model is loaded
    await load_model()

    # Send ready message
    await websocket.send(json.dumps({
        "type": "ready",
        "model": MODEL_SIZE,
        "device": active_device,
    }))

    try:
        async for message in websocket:
            if isinstance(message, bytes):
                # Binary audio data
                print(f"[Whisper] Received {len(message)} bytes of audio")
                # Default to Spanish
                await _transcribe_and_send(websocket, message, "es")
                continue

            # JSON command
            try:
                cmd = json.loads(message)
            except json.JSONDecodeError:
                await _send_error(websocket, "Invalid JSON")
                continue

            if not isinstance(cmd, dict):
                # Valid JSON but not an object; cmd.get() would fail.
                await _send_error(websocket, "Invalid command")
                continue

            cmd_type = cmd.get("type")

            if cmd_type == "transcribe":
                # Audio data sent as base64
                try:
                    audio_data = base64.b64decode(cmd.get("audio", ""))
                except (ValueError, TypeError):
                    # binascii.Error is a ValueError subclass
                    await _send_error(websocket, "Invalid base64 audio")
                    continue
                language = cmd.get("language", "es")
                await _transcribe_and_send(websocket, audio_data, language)

            elif cmd_type == "ping":
                await websocket.send(json.dumps({"type": "pong"}))

            elif cmd_type == "status":
                await websocket.send(json.dumps({
                    "type": "status",
                    "model": MODEL_SIZE,
                    "device": active_device,
                    "ready": model is not None,
                }))

    except websockets.exceptions.ConnectionClosed:
        print("[Whisper] Client disconnected")
    except Exception as e:
        print(f"[Whisper] Error: {e}")


async def main():
    """Pre-load the model, then serve WebSocket clients on HOST:PORT."""
    print(f"[Whisper] Starting server on ws://{HOST}:{PORT}")
    print(f"[Whisper] Model: {MODEL_SIZE}, Device: {DEVICE}")

    # Pre-load model
    print("[Whisper] Pre-loading model...")
    await load_model()

    async with websockets.serve(handle_client, HOST, PORT):
        print(f"[Whisper] Server ready! Listening on ws://{HOST}:{PORT}")
        await asyncio.Future()  # Run forever


if __name__ == "__main__":
    # websockets is already imported (or the process has exited) by the
    # dependency guard at the top of the file, so no install step here.
    asyncio.run(main())