#!/usr/bin/env python3
"""
Whisper Server - GPU-accelerated speech-to-text using faster-whisper
WebSocket server that receives audio and returns transcriptions
"""

import asyncio
import base64
import io
import json
import os
import subprocess
import sys
import tempfile
import wave
from pathlib import Path
from typing import Optional

try:
    import websockets
    from faster_whisper import WhisperModel
except ImportError as e:
    print(f"Missing dependency: {e}")
    print("Run: pip install faster-whisper websockets")
    sys.exit(1)


# Configuration
HOST = "0.0.0.0"  # Listen on all interfaces (needed for Traefik proxy)
PORT = 4104
MODEL_SIZE = "large-v3"  # Best standard model for Spanish
DEVICE = "cuda"  # cuda or cpu
COMPUTE_TYPE = "float16"  # float16 for GPU, int8 for CPU

# Model display name (last path component when MODEL_SIZE is a path)
MODEL_NAME = MODEL_SIZE.split("/")[-1]

# Directory where model weights are downloaded / cached
MODEL_CACHE_DIR = str(Path.home() / ".cache" / "whisper")

# Spanish context prompt to improve accuracy (Honduras Spanish + tech context).
# The sentences are joined with single spaces.
INITIAL_PROMPT = (
    "Transcripción en español hondureño de un desarrollador de software. "
    "Contexto: programación, TypeScript, Vue, Python, comandos de terminal, código. "
    "Vocabulario técnico: servidor, frontend, backend, chunks, WebSocket, "
    "transcripción, componente, función, variable, API, modelo, Whisper, Claude, "
    "MCP, configuración. "
    "Expresiones hondureñas: vos, tenés, podés, mirá, pues, verdad, ajá, entonces. "
    "Diminutivos comunes: ahorita, ratito, prontito, despuesito, chiquito, "
    "tantito, poquito."
)

# Global model instance
model = None
model_loading = False
# Device the model was actually loaded on; differs from DEVICE after a CPU fallback
active_device = DEVICE


def convert_audio_to_wav(input_data: bytes, input_format: str = "webm") -> Optional[bytes]:
    """
    Convert audio data to 16 kHz mono 16-bit PCM WAV using ffmpeg.

    Uses stdin/stdout pipes so ffmpeg probes the actual data format
    instead of relying on file extensions.

    Args:
        input_data: Raw encoded audio bytes.
        input_format: Unused (ffmpeg auto-detects the container); kept so
            existing callers that pass it keep working.

    Returns:
        The WAV bytes, or None if ffmpeg is missing, times out, exits with
        a non-zero status, or any other error occurs. Failures are logged.
    """
    command = [
        "ffmpeg", "-y",
        "-i", "pipe:0",        # Read from stdin (auto-detect format)
        "-ar", "16000",        # Sample rate 16kHz
        "-ac", "1",            # Mono
        "-c:a", "pcm_s16le",   # PCM 16-bit little-endian
        "-f", "wav",           # Output format
        "pipe:1",              # Write to stdout
    ]
    try:
        result = subprocess.run(
            command, input=input_data, capture_output=True, timeout=30
        )
    except subprocess.TimeoutExpired:
        print("[Whisper] ffmpeg conversion timed out")
        return None
    except FileNotFoundError:
        print("[Whisper] ffmpeg not found - please install ffmpeg")
        return None
    except Exception as e:
        print(f"[Whisper] Conversion error: {e}")
        return None

    if result.returncode != 0:
        print(f"[Whisper] ffmpeg error: {result.stderr.decode('utf-8', errors='replace')}")
        return None
    return result.stdout


def _build_model(device: str, compute_type: str):
    """Construct a WhisperModel for MODEL_SIZE on the given device."""
    return WhisperModel(
        MODEL_SIZE,
        device=device,
        compute_type=compute_type,
        download_root=MODEL_CACHE_DIR,
    )


async def load_model():
    """
    Load the Whisper model once and return it (lazy loading on first request).

    Tries DEVICE/COMPUTE_TYPE first and falls back to CPU/int8 if that
    raises. If the fallback also raises, the exception propagates to the
    caller. Coroutines that arrive while a load is in progress poll until
    it finishes and then return whatever `model` is (None if loading failed).

    NOTE: WhisperModel construction is synchronous, so the event loop is
    blocked while the model loads.
    """
    global model, model_loading, active_device

    if model is not None:
        return model

    if model_loading:
        # Wait for the in-progress load to finish
        while model_loading:
            await asyncio.sleep(0.1)
        return model

    model_loading = True
    print(f"[Whisper] Loading model '{MODEL_NAME}' on {DEVICE}...")
    try:
        try:
            # Load model - this downloads on first run
            model = _build_model(DEVICE, COMPUTE_TYPE)
            active_device = DEVICE
            print("[Whisper] Model loaded successfully!")
        except Exception as e:
            print(f"[Whisper] Error loading model: {e}")
            print("[Whisper] Falling back to CPU...")
            model = _build_model("cpu", "int8")
            active_device = "cpu"
    finally:
        # Always clear the flag so waiters are released even if both loads fail
        model_loading = False

    return model


def transcribe_audio(audio_data: bytes, language: str = "es", is_webm: bool = True) -> dict:
    """
    Transcribe audio data using the loaded Whisper model.

    Args:
        audio_data: Encoded audio bytes (converted through ffmpeg when
            `is_webm` is true, otherwise treated as WAV already).
        language: Language code passed to the model.
        is_webm: Whether to run the ffmpeg conversion step first.

    Returns:
        On success, a dict with "success", "text", "language",
        "language_probability", "duration", "segments", "engine", "model"
        and "device". On any failure, a dict with a single "error" key.
        This function does not raise for transcription or I/O errors.
    """
    if model is None:
        return {"error": "Model not loaded"}

    if not audio_data:
        # Avoid spawning ffmpeg / the model on an empty payload
        return {"error": "No audio data received"}

    # Convert WebM to WAV if needed
    if is_webm:
        wav_data = convert_audio_to_wav(audio_data, "webm")
        if wav_data is None:
            return {"error": "Failed to convert audio format. Ensure ffmpeg is installed."}
    else:
        wav_data = audio_data

    temp_path = None
    try:
        # Save audio to a temp file and hand faster-whisper its path
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            temp_path = f.name
            f.write(wav_data)

        # Transcribe with optimized parameters
        segments, info = model.transcribe(
            temp_path,
            language=language,
            beam_size=5,
            best_of=5,                      # Number of candidates when sampling
            temperature=0.0,                # Deterministic decoding, no sampling
            vad_filter=True,                # Voice activity detection
            vad_parameters=dict(
                min_silence_duration_ms=300,  # Shorter silence detection
                speech_pad_ms=200,            # Padding around speech
                threshold=0.5,                # VAD sensitivity (lower = more sensitive)
            ),
            initial_prompt=INITIAL_PROMPT,  # Context for better Spanish transcription
            condition_on_previous_text=True,  # Use context from previous segments
            no_speech_threshold=0.6,
            log_prob_threshold=-1.0,
            compression_ratio_threshold=2.4,
            word_timestamps=False,          # Faster without word-level timestamps
        )

        # Collect all segments (iterating `segments` is what drives decoding)
        segments_list = [
            {"start": segment.start, "end": segment.end, "text": segment.text}
            for segment in segments
        ]
        text = " ".join(item["text"] for item in segments_list).strip()

        return {
            "success": True,
            "text": text,
            "language": info.language,
            "language_probability": info.language_probability,
            "duration": info.duration,
            "segments": segments_list,
            "engine": "whisper-gpu",
            "model": MODEL_NAME,
            "device": active_device,
        }
    except Exception as e:
        print(f"[Whisper] Transcription error: {e}")
        return {"error": str(e)}
    finally:
        # Best-effort cleanup of the temp file
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError:
                pass


async def _send_json(websocket, payload: dict) -> None:
    """Serialize `payload` as JSON and send it over the websocket."""
    await websocket.send(json.dumps(payload))


async def _send_error(websocket, message: str) -> None:
    """Send an {"type": "error"} reply with the given message."""
    await _send_json(websocket, {"type": "error", "message": message})


async def _handle_command(websocket, loop, message) -> None:
    """
    Handle one text (JSON) message from a client.

    Supported command types: "transcribe" (base64 audio in "audio",
    optional "language" and "partial"), "ping" and "status". Other types
    are ignored. Malformed input gets an error reply instead of dropping
    the connection.
    """
    try:
        cmd = json.loads(message)
    except json.JSONDecodeError:
        await _send_error(websocket, "Invalid JSON")
        return

    if not isinstance(cmd, dict):
        await _send_error(websocket, "Command must be a JSON object")
        return

    cmd_type = cmd.get("type")

    if cmd_type == "transcribe":
        # Audio data sent as base64 (WebM format from browser)
        try:
            audio_data = base64.b64decode(cmd.get("audio", ""))
        except (ValueError, TypeError):
            # binascii.Error is a ValueError; TypeError covers non-string payloads
            await _send_error(websocket, "Invalid base64 audio data")
            return

        language = cmd.get("language", "es")
        # Run the blocking transcription in the default thread pool
        result = await loop.run_in_executor(
            None, transcribe_audio, audio_data, language, True
        )
        # Add partial flag to result
        if cmd.get("partial", False):
            result["partial"] = True
        await _send_json(websocket, {"type": "transcription", **result})
    elif cmd_type == "ping":
        await _send_json(websocket, {"type": "pong"})
    elif cmd_type == "status":
        await _send_json(websocket, {
            "type": "status",
            "model": MODEL_NAME,
            "device": active_device,
            "ready": model is not None,
        })


async def handle_client(websocket):
    """
    Handle one WebSocket client connection.

    Ensures the model is loaded, sends a "ready" message, then processes
    messages until the client disconnects: binary messages are transcribed
    as Spanish WebM audio; text messages are treated as JSON commands.
    """
    # Ensure model is loaded
    await load_model()

    # Send ready message
    await _send_json(websocket, {
        "type": "ready",
        "model": MODEL_NAME,
        "device": active_device,
    })

    loop = asyncio.get_running_loop()
    try:
        async for message in websocket:
            if isinstance(message, bytes):
                # Binary audio data (likely WebM format from browser);
                # transcribe in the thread pool so the event loop is not blocked
                result = await loop.run_in_executor(
                    None, transcribe_audio, message, "es", True
                )
                await _send_json(websocket, {"type": "transcription", **result})
            else:
                await _handle_command(websocket, loop, message)
    except websockets.exceptions.ConnectionClosed:
        pass
    except Exception as e:
        print(f"[Whisper] Error: {e}")


async def main():
    """Pre-load the model, then start the WebSocket server and run forever."""
    print(f"[Whisper] Model: {MODEL_NAME} | Device: {DEVICE} | Port: {PORT}")

    # Pre-load model
    await load_model()

    async with websockets.serve(handle_client, HOST, PORT):
        print("[Whisper] Ready")
        await asyncio.Future()  # Run forever


if __name__ == "__main__":
    # websockets is already imported (or the process has exited) at module top,
    # so no install-on-demand step is needed here.
    asyncio.run(main())