# agent-ui/server/whisper_server.py

#!/usr/bin/env python3
"""
Whisper Server - GPU-accelerated speech-to-text using faster-whisper
WebSocket server that receives audio and returns transcriptions
"""

import asyncio
import json
import sys
import io
import wave
import tempfile
import os
from pathlib import Path

try:
    import websockets
    from faster_whisper import WhisperModel
except ImportError as e:
    print(f"Missing dependency: {e}")
    print("Run: pip install faster-whisper websockets")
    sys.exit(1)

# Configuration
# Address the WebSocket server listens on (passed to websockets.serve in main()).
HOST = "localhost"
PORT = 4104
# Model name handed to WhisperModel by load_model().
MODEL_SIZE = "medium"  # tiny, base, small, medium, large-v2, large-v3
# Preferred device and precision. load_model() retries with "cpu" / "int8"
# when loading with these values fails.
# NOTE(review): these constants are not updated after such a fallback, so the
# "device" value reported to clients can still say "cuda" -- confirm intended.
DEVICE = "cuda"  # cuda or cpu
COMPUTE_TYPE = "float16"  # float16 for GPU, int8 for CPU

# Global model instance
# Assigned by load_model() and read by transcribe_audio(); None until a load
# has succeeded.
model = None
# True while load_model() is constructing the model; other callers of
# load_model() poll this flag instead of starting a second load.
model_loading = False

async def load_model():
    """Return the shared Whisper model, loading it on first use.

    The first caller constructs the model with ``DEVICE`` / ``COMPUTE_TYPE``;
    if that raises, a second attempt is made with ``"cpu"`` / ``"int8"``.
    Construction runs in the default executor so the event loop keeps
    running, which is what allows concurrent callers to wait on the
    ``model_loading`` flag rather than start a second load.

    Returns:
        The loaded ``WhisperModel``. A caller that was waiting on another
        caller's load receives ``None`` if that load failed.

    Raises:
        Exception: whatever the CPU fallback raises when it fails as well.
            ``model_loading`` is reset in that case, so a later call retries
            instead of waiting forever.
    """
    global model, model_loading

    if model is not None:
        return model

    if model_loading:
        # Another caller is already loading; poll until it finishes.
        while model_loading:
            await asyncio.sleep(0.1)
        return model

    model_loading = True
    print(f"[Whisper] Loading model '{MODEL_SIZE}' on {DEVICE}...")

    download_root = str(Path.home() / ".cache" / "whisper")
    loop = asyncio.get_running_loop()

    def _build(device, compute_type):
        # Blocking call: downloads the weights on first run.
        return WhisperModel(
            MODEL_SIZE,
            device=device,
            compute_type=compute_type,
            download_root=download_root,
        )

    try:
        try:
            model = await loop.run_in_executor(
                None, _build, DEVICE, COMPUTE_TYPE
            )
            print("[Whisper] Model loaded successfully!")
        except Exception as e:
            print(f"[Whisper] Error loading model: {e}")
            print("[Whisper] Falling back to CPU...")
            model = await loop.run_in_executor(None, _build, "cpu", "int8")
    finally:
        # Always clear the flag, even when both attempts fail; otherwise
        # every waiter above would spin forever.
        model_loading = False

    return model

def transcribe_audio(audio_data: bytes, language: str = "es") -> dict:
    """Transcribe one audio payload with the globally loaded Whisper model.

    The bytes are written to a temporary ``.wav``-suffixed file whose path is
    passed to ``model.transcribe``; the file is removed afterwards.

    Args:
        audio_data: Raw bytes of the audio to transcribe.
        language: Language code forwarded to ``model.transcribe``.

    Returns:
        On success, a dict with ``success``, ``text``, ``language``,
        ``language_probability``, ``duration``, ``segments`` (a list of
        ``start``/``end``/``text`` dicts), ``engine``, ``model`` and
        ``device``. On any failure, or when no model is loaded, a dict with
        a single ``error`` key. This function does not raise for
        transcription or temp-file errors.
    """
    if model is None:
        return {"error": "Model not loaded"}

    # delete=False: the file is closed before transcription and removed
    # explicitly in the ``finally`` clause below.
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    temp_path = tmp.name

    try:
        # Writing inside the try block means a failed write is reported as an
        # error dict and the temp file is still cleaned up.
        with tmp:
            tmp.write(audio_data)

        segments, info = model.transcribe(
            temp_path,
            language=language,
            beam_size=5,
            vad_filter=True,  # Voice activity detection
            vad_parameters=dict(
                min_silence_duration_ms=500,
                speech_pad_ms=400
            )
        )

        # ``segments`` is consumed exactly once while building the list.
        segments_list = [
            {"start": segment.start, "end": segment.end, "text": segment.text}
            for segment in segments
        ]
        text = " ".join(item["text"] for item in segments_list).strip()

        return {
            "success": True,
            "text": text,
            "language": info.language,
            "language_probability": info.language_probability,
            "duration": info.duration,
            "segments": segments_list,
            # NOTE(review): engine/device are reported from constants, not
            # from the model actually loaded -- confirm this is intended.
            "engine": "whisper-gpu",
            "model": MODEL_SIZE,
            "device": DEVICE
        }

    except Exception as e:
        return {"error": str(e)}

    finally:
        # Best-effort cleanup: a file that is already gone or cannot be
        # removed must not mask the transcription result.
        try:
            os.unlink(temp_path)
        except OSError:
            pass

async def _send_error(websocket, message):
    """Send a ``{"type": "error"}`` message carrying *message* to the client."""
    await websocket.send(json.dumps({"type": "error", "message": message}))

async def _send_transcription(websocket, audio_data, language):
    """Transcribe *audio_data* in the default executor and send the result."""
    loop = asyncio.get_running_loop()
    # transcribe_audio blocks, so it is kept off the event loop.
    result = await loop.run_in_executor(
        None, transcribe_audio, audio_data, language
    )
    await websocket.send(json.dumps({"type": "transcription", **result}))

async def handle_client(websocket):
    """Serve one WebSocket client until it disconnects.

    After the model is available a ``ready`` message is sent. Then each
    incoming message is handled as follows:

    * binary frame: transcribed as Spanish (``"es"``) audio;
    * text frame: parsed as a JSON object whose ``type`` selects the action:
      ``transcribe`` (base64 ``audio`` plus optional ``language``), ``ping``
      or ``status``. Other ``type`` values are ignored.

    Invalid JSON, a JSON value that is not an object, or undecodable base64
    audio produces an ``error`` message and the session continues.

    Args:
        websocket: The connection object supplied by ``websockets.serve``.
    """
    import base64

    print(f"[Whisper] Client connected")

    # Ensure model is loaded
    await load_model()

    try:
        # Sent inside the try block so a client that vanishes immediately is
        # handled like any other disconnect.
        await websocket.send(json.dumps({
            "type": "ready",
            "model": MODEL_SIZE,
            "device": DEVICE
        }))

        async for message in websocket:
            if isinstance(message, bytes):
                # Binary audio data
                print(f"[Whisper] Received {len(message)} bytes of audio")
                # Binary frames carry no metadata, so default to Spanish.
                await _send_transcription(websocket, message, "es")
                continue

            # JSON command
            try:
                cmd = json.loads(message)
            except json.JSONDecodeError:
                await _send_error(websocket, "Invalid JSON")
                continue

            if not isinstance(cmd, dict):
                # Valid JSON but not an object (e.g. a list or a number).
                await _send_error(websocket, "Invalid command")
                continue

            cmd_type = cmd.get("type")

            if cmd_type == "transcribe":
                # Audio data sent as base64
                try:
                    audio_data = base64.b64decode(cmd.get("audio", ""))
                except (ValueError, TypeError):
                    # binascii.Error (bad padding) is a ValueError subclass;
                    # TypeError covers a non-string "audio" value.
                    await _send_error(websocket, "Invalid base64 audio")
                    continue

                await _send_transcription(
                    websocket, audio_data, cmd.get("language", "es")
                )

            elif cmd_type == "ping":
                await websocket.send(json.dumps({"type": "pong"}))

            elif cmd_type == "status":
                await websocket.send(json.dumps({
                    "type": "status",
                    "model": MODEL_SIZE,
                    "device": DEVICE,
                    "ready": model is not None
                }))

    except websockets.exceptions.ConnectionClosed:
        print("[Whisper] Client disconnected")
    except Exception as e:
        print(f"[Whisper] Error: {e}")

async def main():
    """Load the model, then serve WebSocket clients until the process ends."""
    endpoint = f"ws://{HOST}:{PORT}"
    print(f"[Whisper] Starting server on {endpoint}")
    print(f"[Whisper] Model: {MODEL_SIZE}, Device: {DEVICE}")

    # Load the model before accepting connections.
    print("[Whisper] Pre-loading model...")
    await load_model()

    server = websockets.serve(handle_client, HOST, PORT)
    async with server:
        print(f"[Whisper] Server ready! Listening on {endpoint}")
        # A future that is never resolved keeps the server context open.
        never_done = asyncio.Future()
        await never_done

if __name__ == "__main__":
    # No dependency check is needed here: the guarded import at the top of
    # this file already exits with install instructions when `websockets`
    # (or faster-whisper) is missing, so a second ImportError handler at this
    # point could never run.
    asyncio.run(main())