Files
agent-ui/server/whisper_server.py
josedario87 f9b5ad3db6 feat: Push-to-talk on voice FAB button
- Hold FAB to open panel and start recording immediately
- Release to stop recording and send after 1s buffer
- Orange pulsing animation when PTT active
- PTT also works on record button inside modal
- Added stopRecordingAndSend exposed method
2026-02-14 04:51:50 -06:00

315 lines
10 KiB
Python

#!/usr/bin/env python3
"""
Whisper Server - GPU-accelerated speech-to-text using faster-whisper
WebSocket server that receives audio and returns transcriptions
"""
import asyncio
import json
import sys
import io
import wave
import tempfile
import os
import subprocess
from pathlib import Path
try:
import websockets
from faster_whisper import WhisperModel
except ImportError as e:
print(f"Missing dependency: {e}")
print("Run: pip install faster-whisper websockets")
sys.exit(1)
def convert_audio_to_wav(input_data: bytes, input_format: str = "webm") -> "bytes | None":
    """
    Convert encoded audio bytes to WAV (16 kHz, mono, 16-bit PCM) using ffmpeg.

    The input is written to a temporary file, an ``ffmpeg`` subprocess converts
    it into a second, separately created temporary file, and the converted
    bytes are read back. Both temporary files are removed before returning.

    Args:
        input_data: Raw encoded audio (browsers typically record WebM/Opus).
        input_format: File extension, without the dot, used for the temporary
            input file.

    Returns:
        The contents of the converted WAV file, or ``None`` when ffmpeg is not
        installed, exceeds the 30 second timeout, exits with a non-zero
        status, or any other error occurs during conversion.
    """
    with tempfile.NamedTemporaryFile(suffix=f".{input_format}", delete=False) as in_file:
        in_file.write(input_data)
        input_path = in_file.name

    output_path = None
    try:
        # Reserve a distinct output file instead of deriving its name from the
        # input path: a string replace would yield the same path when
        # input_format is "wav" and could also rewrite a directory component.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_file:
            output_path = out_file.name

        result = subprocess.run(
            [
                "ffmpeg", "-y",          # Overwrite the (empty) reserved output file
                "-i", input_path,        # Input file
                "-ar", "16000",          # Sample rate 16 kHz
                "-ac", "1",              # Mono
                "-c:a", "pcm_s16le",     # PCM 16-bit little-endian
                output_path,
            ],
            capture_output=True,
            text=True,
            timeout=30,
        )
        if result.returncode != 0:
            print(f"[Whisper] ffmpeg error: {result.stderr}")
            return None

        with open(output_path, "rb") as f:
            return f.read()
    except subprocess.TimeoutExpired:
        print("[Whisper] ffmpeg conversion timed out")
        return None
    except FileNotFoundError:
        print("[Whisper] ffmpeg not found - please install ffmpeg")
        return None
    except Exception as e:
        print(f"[Whisper] Conversion error: {e}")
        return None
    finally:
        # Best-effort cleanup; only filesystem errors are ignored.
        for path in (input_path, output_path):
            if path is None:
                continue
            try:
                os.unlink(path)
            except OSError:
                pass
# --- Network settings ---
HOST = "0.0.0.0"  # Bind to every interface (needed for the Traefik proxy)
PORT = 4104

# --- Model settings ---
MODEL_SIZE = "large-v3"  # Best standard model for Spanish
DEVICE = "cuda"  # "cuda" or "cpu"
COMPUTE_TYPE = "float16"  # float16 for GPU, int8 for CPU

# Display name: the last "/"-separated component of MODEL_SIZE
# (MODEL_SIZE itself when it contains no "/").
MODEL_NAME = MODEL_SIZE.rsplit("/", 1)[-1]

# Spanish context prompt passed to the model to improve accuracy
# (Honduran Spanish plus software-development vocabulary).
INITIAL_PROMPT = "\n".join((
    "Transcripción en español hondureño de un desarrollador de software.",
    "Contexto: programación, TypeScript, Vue, Python, comandos de terminal, código.",
    "Vocabulario técnico: servidor, frontend, backend, chunks, WebSocket, transcripción,",
    "componente, función, variable, API, modelo, Whisper, Claude, MCP, configuración.",
    "Expresiones hondureñas: vos, tenés, podés, mirá, pues, verdad, ajá, entonces.",
    "Diminutivos comunes: ahorita, ratito, prontito, despuesito, chiquito, tantito, poquito.",
))

# --- Shared state ---
model = None           # Global model instance, populated on first load
model_loading = False  # True while a load is in progress
async def load_model():
    """Load the Whisper model once and return the shared instance.

    The first caller constructs a ``WhisperModel`` using ``MODEL_SIZE``,
    ``DEVICE`` and ``COMPUTE_TYPE``; if that raises, it retries on CPU with
    ``int8`` compute. Callers that arrive while ``model_loading`` is set poll
    every 0.1 s until the flag clears.

    Returns:
        The global ``model``. A caller that waited on another load receives
        ``None`` if that load failed.

    Raises:
        Exception: Whatever the CPU fallback raises when it also fails.
    """
    global model, model_loading

    if model is not None:
        return model

    if model_loading:
        # Another caller is loading; wait for it to finish.
        while model_loading:
            await asyncio.sleep(0.1)
        return model

    model_loading = True
    try:
        print(f"[Whisper] Loading model '{MODEL_NAME}' on {DEVICE}...")
        cache_dir = str(Path.home() / ".cache" / "whisper")
        try:
            # Load model - this downloads on first run
            model = WhisperModel(
                MODEL_SIZE,
                device=DEVICE,
                compute_type=COMPUTE_TYPE,
                download_root=cache_dir,
            )
            print("[Whisper] Model loaded successfully!")
        except Exception as e:
            print(f"[Whisper] Error loading model: {e}")
            print("[Whisper] Falling back to CPU...")
            model = WhisperModel(
                MODEL_SIZE,
                device="cpu",
                compute_type="int8",
                download_root=cache_dir,
            )
    finally:
        # Always clear the flag, even when the CPU fallback raises; otherwise
        # every later caller would spin forever in the wait loop above.
        model_loading = False

    return model
def transcribe_audio(audio_data: bytes, language: str = "es", is_webm: bool = True) -> dict:
    """Transcribe audio bytes with the globally loaded Whisper model.

    Args:
        audio_data: Encoded audio bytes.
        language: Language code passed to the model.
        is_webm: When true, the audio is first converted to WAV with
            ``convert_audio_to_wav``; otherwise the bytes are used as-is.

    Returns:
        On success, a dict with ``success``, ``text``, ``language``,
        ``language_probability``, ``duration``, ``segments``, ``engine``,
        ``model`` and ``device``. On failure, a dict with a single ``error``
        key describing the problem. This function does not raise for
        conversion or transcription errors.
    """
    global model
    if model is None:
        return {"error": "Model not loaded"}

    # Reject empty payloads up front instead of spawning ffmpeg on nothing.
    if not audio_data:
        return {"error": "No audio data received"}

    # Convert WebM to WAV if needed
    if is_webm:
        wav_data = convert_audio_to_wav(audio_data, "webm")
        if wav_data is None:
            return {"error": "Failed to convert audio format. Ensure ffmpeg is installed."}
    else:
        wav_data = audio_data

    # The model is given a file path, so stage the audio in a temp file.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        f.write(wav_data)
        temp_path = f.name

    try:
        segments, info = model.transcribe(
            temp_path,
            language=language,
            beam_size=5,
            best_of=5,                        # Number of candidates when sampling
            temperature=0.0,                  # No sampling randomness
            vad_filter=True,                  # Voice activity detection
            vad_parameters=dict(
                min_silence_duration_ms=300,  # Shorter silence detection
                speech_pad_ms=200,            # Padding around speech
                threshold=0.5,                # VAD sensitivity (lower = more sensitive)
            ),
            initial_prompt=INITIAL_PROMPT,    # Context for better Spanish transcription
            condition_on_previous_text=True,  # Use context from previous segments
            no_speech_threshold=0.6,
            log_prob_threshold=-1.0,
            compression_ratio_threshold=2.4,
            word_timestamps=False,            # Faster without word-level timestamps
        )

        # Iterate the segments once, keeping timing and text for each.
        segments_list = [
            {"start": segment.start, "end": segment.end, "text": segment.text}
            for segment in segments
        ]
        text = " ".join(entry["text"] for entry in segments_list).strip()

        # NOTE(review): "engine" and "device" report the configured values;
        # they are not updated if load_model fell back to CPU - confirm
        # whether clients rely on them.
        return {
            "success": True,
            "text": text,
            "language": info.language,
            "language_probability": info.language_probability,
            "duration": info.duration,
            "segments": segments_list,
            "engine": "whisper-gpu",
            "model": MODEL_NAME,
            "device": DEVICE,
        }
    except Exception as e:
        print(f"[Whisper] Transcription error: {e}")
        return {"error": str(e)}
    finally:
        # Best-effort cleanup; only filesystem errors are ignored.
        try:
            os.unlink(temp_path)
        except OSError:
            pass
async def handle_client(websocket):
    """Serve one WebSocket client until it disconnects.

    After making sure the model is loaded, sends a ``ready`` message and then
    processes incoming messages:

    * Binary frames are treated as WebM audio and transcribed in Spanish.
    * Text frames must be JSON objects with a ``type`` field:
      ``transcribe`` (base64 ``audio``, optional ``language`` and
      ``partial``), ``ping`` or ``status``. Other types are ignored.

    Malformed JSON, non-object JSON and undecodable base64 audio are answered
    with an ``error`` message and the connection stays open. Transcription
    runs in the default executor so the event loop is not blocked.

    Args:
        websocket: The connection object supplied by ``websockets.serve``.
    """
    import base64

    async def send_json(payload):
        await websocket.send(json.dumps(payload))

    # Ensure model is loaded
    await load_model()

    await send_json({
        "type": "ready",
        "model": MODEL_NAME,
        "device": DEVICE,
    })

    loop = asyncio.get_running_loop()

    try:
        async for message in websocket:
            if isinstance(message, bytes):
                # Binary audio data (likely WebM format from browser)
                result = await loop.run_in_executor(
                    None, transcribe_audio, message, "es", True
                )
                await send_json({"type": "transcription", **result})
                continue

            # Text frame: expected to be a JSON command
            try:
                cmd = json.loads(message)
            except json.JSONDecodeError:
                await send_json({"type": "error", "message": "Invalid JSON"})
                continue

            if not isinstance(cmd, dict):
                # Valid JSON but not an object (e.g. a list or number); a
                # .get() call on it would otherwise drop the connection.
                await send_json({
                    "type": "error",
                    "message": "Expected a JSON object",
                })
                continue

            cmd_type = cmd.get("type")

            if cmd_type == "transcribe":
                # Audio data sent as base64 (WebM format from browser)
                try:
                    audio_data = base64.b64decode(cmd.get("audio", ""))
                except (ValueError, TypeError):
                    # binascii.Error is a ValueError subclass; a non-string
                    # "audio" value raises TypeError.
                    await send_json({
                        "type": "error",
                        "message": "Invalid base64 audio",
                    })
                    continue

                language = cmd.get("language", "es")
                result = await loop.run_in_executor(
                    None, transcribe_audio, audio_data, language, True
                )
                # Add partial flag to result
                if cmd.get("partial", False):
                    result["partial"] = True
                await send_json({"type": "transcription", **result})
            elif cmd_type == "ping":
                await send_json({"type": "pong"})
            elif cmd_type == "status":
                await send_json({
                    "type": "status",
                    "model": MODEL_NAME,
                    "device": DEVICE,
                    "ready": model is not None,
                })
            # Any other command type is silently ignored.
    except websockets.exceptions.ConnectionClosed:
        pass
    except Exception as e:
        print(f"[Whisper] Error: {e}")
async def main():
    """Print the startup banner, preload the model and serve clients.

    The server listens on ``HOST``/``PORT`` and dispatches each connection to
    ``handle_client``. The coroutine then blocks on a future that is never
    resolved, keeping the server open.
    """
    banner = f"[Whisper] Model: {MODEL_NAME} | Device: {DEVICE} | Port: {PORT}"
    print(banner)

    # Load the model before accepting connections.
    await load_model()

    server = websockets.serve(handle_client, HOST, PORT)
    async with server:
        print("[Whisper] Ready")
        never_done = asyncio.Future()
        await never_done
if __name__ == "__main__":
    # The import guard at the top of this file has already imported
    # websockets (or exited with status 1 if it was missing), so the former
    # install-on-demand fallback here could never run and has been removed.
    asyncio.run(main())