# agent-ui/server/whisper_server.py

#!/usr/bin/env python3
"""
Whisper Server - GPU-accelerated speech-to-text using faster-whisper
WebSocket server that receives audio and returns transcriptions
"""

import asyncio
import io
import json
import os
import subprocess
import sys
import tempfile
import wave
from pathlib import Path
from typing import Optional

try:
    import websockets
    from faster_whisper import WhisperModel
except ImportError as e:
    print(f"Missing dependency: {e}")
    print("Run: pip install faster-whisper websockets")
    sys.exit(1)


def convert_audio_to_wav(input_data: bytes, input_format: str = "webm") -> Optional[bytes]:
    """
    Convert encoded audio to 16 kHz mono 16-bit PCM WAV using ffmpeg.

    The input is written to a private temporary directory, converted by an
    ``ffmpeg`` subprocess, and the resulting WAV file is read back. The
    directory (and both files) is removed before returning, whatever happens.

    Args:
        input_data: Raw bytes of the encoded audio (e.g. a WebM/Opus recording).
        input_format: File extension of the input, used to name the temporary
            input file. A leading dot is tolerated.

    Returns:
        The WAV file contents, or ``None`` if the conversion failed (ffmpeg is
        missing, timed out, exited with an error, or a file operation failed).
        Failures are reported on stdout rather than raised.
    """
    extension = input_format.lstrip(".")

    try:
        # A dedicated directory gives distinct, predictable input/output paths
        # (even when input_format is "wav") and a single cleanup step.
        with tempfile.TemporaryDirectory(prefix="whisper_") as work_dir:
            input_path = os.path.join(work_dir, f"input.{extension}")
            output_path = os.path.join(work_dir, "output.wav")

            with open(input_path, "wb") as in_file:
                in_file.write(input_data)

            try:
                # Use ffmpeg to convert to WAV (16kHz mono, which Whisper prefers)
                result = subprocess.run(
                    [
                        "ffmpeg", "-y",  # Overwrite output
                        "-i", input_path,  # Input file
                        "-ar", "16000",  # Sample rate 16kHz
                        "-ac", "1",  # Mono
                        "-c:a", "pcm_s16le",  # PCM 16-bit little-endian
                        output_path,
                    ],
                    capture_output=True,
                    text=True,
                    errors="replace",  # ffmpeg stderr is not guaranteed to be valid UTF-8
                    timeout=30,
                )
            except subprocess.TimeoutExpired:
                print("[Whisper] ffmpeg conversion timed out")
                return None
            except FileNotFoundError:
                # Raised by subprocess.run when the ffmpeg executable is absent.
                print("[Whisper] ffmpeg not found - please install ffmpeg")
                return None

            if result.returncode != 0:
                print(f"[Whisper] ffmpeg error: {result.stderr}")
                return None

            # Read the converted WAV file
            with open(output_path, "rb") as out_file:
                return out_file.read()

    except Exception as e:
        # Best-effort contract: callers only distinguish bytes from None.
        print(f"[Whisper] Conversion error: {e}")
        return None

# Configuration
# HOST/PORT are the address the WebSocket server is bound to in main().
HOST = "localhost"
PORT = 4104
# Model settings passed to WhisperModel in load_model(); if loading with these
# fails, load_model() retries with device="cpu" and compute_type="int8".
MODEL_SIZE = "large-v3"  # tiny, base, small, medium, large-v2, large-v3
DEVICE = "cuda"  # cuda or cpu
COMPUTE_TYPE = "float16"  # float16 for GPU, int8 for CPU

# Spanish context prompt to improve accuracy (Honduras Spanish + tech context).
# Passed as `initial_prompt` to every model.transcribe() call in transcribe_audio().
INITIAL_PROMPT = """Transcripción en español hondureño de un desarrollador de software.
Contexto: programación, TypeScript, Vue, Python, comandos de terminal, código.
Vocabulario técnico: servidor, frontend, backend, chunks, WebSocket, transcripción,
componente, función, variable, API, modelo, Whisper, Claude, MCP, configuración.
Expresiones hondureñas: vos, tenés, podés, mirá, pues, verdad, ajá, entonces.
Diminutivos comunes: ahorita, ratito, prontito, despuesito, chiquito, tantito, poquito."""

# Global model instance, set by load_model(); None until loading succeeds.
model = None
# True while load_model() is in the middle of loading; other callers poll it.
model_loading = False

async def load_model():
    """
    Load the Whisper model once and return the shared instance.

    The first caller loads the model with the configured DEVICE and
    COMPUTE_TYPE; if that raises, it retries on CPU with int8 and updates
    DEVICE/COMPUTE_TYPE so later status reports reflect what is actually in
    use. Loading runs in the default executor so the event loop stays
    responsive. Callers that arrive while a load is in progress poll until it
    finishes.

    Returns:
        The loaded WhisperModel. A caller that was waiting on a load which
        ultimately failed receives ``None``.

    Raises:
        Exception: whatever the CPU fallback raised, in the caller that
            performed the load.
    """
    global model, model_loading, DEVICE, COMPUTE_TYPE

    if model is not None:
        return model

    if model_loading:
        # Wait for model to finish loading
        while model_loading:
            await asyncio.sleep(0.1)
        return model

    def build(device, compute_type):
        # Load model - this downloads on first run
        return WhisperModel(
            MODEL_SIZE,
            device=device,
            compute_type=compute_type,
            download_root=str(Path.home() / ".cache" / "whisper"),
        )

    # No await between the check above and this assignment, so only one
    # coroutine can become the loader.
    model_loading = True
    loop = asyncio.get_running_loop()
    try:
        print(f"[Whisper] Loading model '{MODEL_SIZE}' on {DEVICE}...")
        try:
            model = await loop.run_in_executor(None, build, DEVICE, COMPUTE_TYPE)
            print("[Whisper] Model loaded successfully!")
        except Exception as e:
            print(f"[Whisper] Error loading model: {e}")
            print("[Whisper] Falling back to CPU...")
            model = await loop.run_in_executor(None, build, "cpu", "int8")
            # Report the device actually in use from here on.
            DEVICE, COMPUTE_TYPE = "cpu", "int8"
    finally:
        # Always clear the flag, otherwise waiters would poll forever after a
        # failed load.
        model_loading = False

    return model

def transcribe_audio(audio_data: bytes, language: str = "es", is_webm: bool = True) -> dict:
    """
    Transcribe audio bytes with the globally loaded Whisper model.

    Args:
        audio_data: Encoded audio. Treated as WebM and converted to WAV via
            convert_audio_to_wav() when ``is_webm`` is true; otherwise written
            to disk as-is with a ``.wav`` suffix.
        language: Language code passed to the model.
        is_webm: Whether ``audio_data`` must be converted before transcription.

    Returns:
        On success, a dict with ``success``, ``text``, ``language``,
        ``language_probability``, ``duration``, ``segments`` (list of
        ``start``/``end``/``text`` dicts), ``engine``, ``model`` and ``device``.
        On failure, ``{"error": <message>}``. This function does not raise.
    """
    if model is None:
        return {"error": "Model not loaded"}

    print(f"[Whisper] Received {len(audio_data)} bytes of audio data")

    if not audio_data:
        # Avoid spawning ffmpeg just to report a misleading conversion error.
        return {"error": "No audio data received"}

    # Convert WebM to WAV if needed
    if is_webm:
        print("[Whisper] Converting WebM to WAV...")
        wav_data = convert_audio_to_wav(audio_data, "webm")
        if wav_data is None:
            return {"error": "Failed to convert audio format. Ensure ffmpeg is installed."}
        print(f"[Whisper] Converted to {len(wav_data)} bytes WAV")
    else:
        wav_data = audio_data

    temp_path = None
    try:
        # Save audio to a temp file and hand its path to the model. Creating it
        # inside the try ensures it is removed even if the write fails.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            temp_path = f.name
            f.write(wav_data)

        # Transcribe with optimized parameters
        print(f"[Whisper] Transcribing {temp_path}...")
        segments, info = model.transcribe(
            temp_path,
            language=language,
            beam_size=5,
            best_of=5,  # Number of candidates when sampling (temperature > 0)
            temperature=0.0,  # Deterministic decoding, no sampling
            vad_filter=True,  # Voice activity detection
            vad_parameters=dict(
                min_silence_duration_ms=300,  # Shorter silence detection
                speech_pad_ms=200,  # Padding around speech
                threshold=0.5  # VAD sensitivity (lower = more sensitive)
            ),
            initial_prompt=INITIAL_PROMPT,  # Context for better Spanish transcription
            condition_on_previous_text=True,  # Use context from previous segments
            no_speech_threshold=0.6,
            log_prob_threshold=-1.0,
            compression_ratio_threshold=2.4,
            word_timestamps=False  # Faster without word-level timestamps
        )

        # Collect all segments (iterating `segments` is what drives decoding)
        segments_list = [
            {"start": segment.start, "end": segment.end, "text": segment.text}
            for segment in segments
        ]
        text = " ".join(item["text"] for item in segments_list).strip()

        print(f"[Whisper] Transcription result: '{text}'")

        return {
            "success": True,
            "text": text,
            "language": info.language,
            "language_probability": info.language_probability,
            "duration": info.duration,
            "segments": segments_list,
            "engine": "whisper-gpu",
            "model": MODEL_SIZE,
            "device": DEVICE
        }

    except Exception as e:
        print(f"[Whisper] Transcription error: {e}")
        return {"error": str(e)}

    finally:
        # Cleanup temp file (best effort; it may never have been created)
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError:
                pass

async def handle_client(websocket):
    """
    Serve one WebSocket client until it disconnects.

    After the model is loaded a ``ready`` message is sent. Then, per message:

    * binary frame: treated as WebM audio, transcribed in Spanish, answered
      with a ``transcription`` message;
    * JSON ``{"type": "transcribe", "audio": <base64>, "language"?, "partial"?}``:
      decoded, transcribed, answered with a ``transcription`` message (with
      ``partial: true`` echoed back when requested);
    * JSON ``{"type": "ping"}``: answered with ``pong``;
    * JSON ``{"type": "status"}``: answered with model/device/readiness;
    * malformed input (invalid JSON, non-object JSON, invalid base64):
      answered with an ``error`` message, and the connection stays open.

    Other command types are logged and ignored.

    Args:
        websocket: The connection object supplied by ``websockets.serve``.
    """
    import base64

    print("[Whisper] Client connected")

    # Ensure model is loaded
    await load_model()

    loop = asyncio.get_running_loop()

    async def send_json(payload):
        await websocket.send(json.dumps(payload))

    async def transcribe(audio, language):
        # Transcribe in thread pool to not block the event loop
        return await loop.run_in_executor(
            None,
            lambda: transcribe_audio(audio, language, is_webm=True)
        )

    try:
        # Sent inside the try so a client that left during model loading is
        # reported as a normal disconnect.
        await send_json({
            "type": "ready",
            "model": MODEL_SIZE,
            "device": DEVICE
        })

        async for message in websocket:
            if isinstance(message, bytes):
                # Binary audio data (likely WebM format from browser)
                print(f"[Whisper] Received {len(message)} bytes of binary audio")
                result = await transcribe(message, "es")
                await send_json({"type": "transcription", **result})
                continue

            # JSON command
            try:
                cmd = json.loads(message)
            except json.JSONDecodeError:
                await send_json({"type": "error", "message": "Invalid JSON"})
                continue

            if not isinstance(cmd, dict):
                # Valid JSON but not an object (e.g. a list or a number).
                await send_json({"type": "error", "message": "Expected a JSON object"})
                continue

            cmd_type = cmd.get("type")

            if cmd_type == "transcribe":
                # Audio data sent as base64 (WebM format from browser)
                try:
                    audio_data = base64.b64decode(cmd.get("audio", ""))
                except (ValueError, TypeError):
                    # binascii.Error is a ValueError; TypeError covers non-string payloads.
                    await send_json({"type": "error", "message": "Invalid base64 audio"})
                    continue

                language = cmd.get("language", "es")
                is_partial = cmd.get("partial", False)

                print(f"[Whisper] Transcribe request: {len(audio_data)} bytes, lang={language}, partial={is_partial}")

                result = await transcribe(audio_data, language)

                # Add partial flag to result
                if is_partial:
                    result["partial"] = True

                await send_json({"type": "transcription", **result})

            elif cmd_type == "ping":
                await send_json({"type": "pong"})

            elif cmd_type == "status":
                await send_json({
                    "type": "status",
                    "model": MODEL_SIZE,
                    "device": DEVICE,
                    "ready": model is not None
                })

            else:
                print(f"[Whisper] Ignoring unknown command type: {cmd_type!r}")

    except websockets.exceptions.ConnectionClosed:
        print("[Whisper] Client disconnected")
    except Exception as e:
        print(f"[Whisper] Error: {e}")

async def main():
    """Announce the configuration, pre-load the model, then serve clients indefinitely."""
    print(f"[Whisper] Starting server on ws://{HOST}:{PORT}")
    print(f"[Whisper] Model: {MODEL_SIZE}, Device: {DEVICE}")

    # Load the model before accepting connections
    print("[Whisper] Pre-loading model...")
    await load_model()

    server = websockets.serve(handle_client, HOST, PORT)
    async with server:
        print(f"[Whisper] Server ready! Listening on ws://{HOST}:{PORT}")
        # A future that is never resolved keeps the server context open.
        never_done = asyncio.get_running_loop().create_future()
        await never_done

if __name__ == "__main__":
    # No install-on-demand here: the module-level import guard has already
    # imported websockets or exited with installation instructions.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # Ctrl+C is the normal way to stop the server; exit without a traceback.
        print("[Whisper] Server stopped")