- Add faster-whisper Python server for GPU-accelerated transcription - Support dual mode: Web Speech API or Whisper GPU (toggleable) - Progressive transcription every 3 seconds while recording - Separate terminal server process (stable during hot-reload) - Add Ctrl+V paste and Ctrl+C copy support in FloatingTerminal - Add MCP tools: whisper_start, whisper_stop, whisper_toggle, whisper_status - Update package.json with separate api/terminal/frontend processes
234 lines
6.7 KiB
Python
234 lines
6.7 KiB
Python
#!/usr/bin/env python3
"""
Whisper Server - GPU-accelerated speech-to-text using faster-whisper.

WebSocket server that receives audio and returns transcriptions.
"""
|
|
|
|
import asyncio
import base64
import io
import json
import os
import sys
import tempfile
import wave
from pathlib import Path

try:
    import websockets
    from faster_whisper import WhisperModel
except ImportError as e:
    print(f"Missing dependency: {e}")
    print("Run: pip install faster-whisper websockets")
    sys.exit(1)
|
|
|
|
# Configuration
#
# Every setting can be overridden with an environment variable so the server
# can be pointed at another port, model size or device without editing this
# file. With no variables set the values are the same as before.
HOST = os.environ.get("WHISPER_HOST", "localhost")
PORT = int(os.environ.get("WHISPER_PORT", "4104"))
# tiny, base, small, medium, large-v2, large-v3
MODEL_SIZE = os.environ.get("WHISPER_MODEL", "medium")
# cuda or cpu
DEVICE = os.environ.get("WHISPER_DEVICE", "cuda")
# float16 for GPU, int8 for CPU
COMPUTE_TYPE = os.environ.get(
    "WHISPER_COMPUTE_TYPE",
    "float16" if DEVICE == "cuda" else "int8",
)

# Global model instance
model = None  # assigned by load_model(); stays None until a load succeeds
model_loading = False  # True only while load_model() is building the model
|
|
|
|
async def load_model():
    """Load the Whisper model once and return the shared instance.

    The first call builds a ``WhisperModel`` for ``MODEL_SIZE`` on ``DEVICE``
    with ``COMPUTE_TYPE`` and stores it in the module-level ``model``; later
    calls return that instance. If building it raises, a second attempt is
    made on the CPU with ``int8``. When that fallback succeeds, ``DEVICE`` and
    ``COMPUTE_TYPE`` are updated so that the ready/status/transcription
    messages report what is actually in use.

    Returns:
        The loaded model. A caller that waited on another in-progress load
        gets whatever that load left in ``model`` (``None`` if it failed).

    Raises:
        Exception: whatever the CPU fallback raises if it fails as well.
    """
    global model, model_loading, DEVICE, COMPUTE_TYPE

    if model is not None:
        return model

    if model_loading:
        # Wait for model to finish loading
        while model_loading:
            await asyncio.sleep(0.1)
        return model

    model_loading = True
    print(f"[Whisper] Loading model '{MODEL_SIZE}' on {DEVICE}...")
    download_root = str(Path.home() / ".cache" / "whisper")

    try:
        try:
            # Load model - this downloads on first run
            model = WhisperModel(
                MODEL_SIZE,
                device=DEVICE,
                compute_type=COMPUTE_TYPE,
                download_root=download_root,
            )
            print("[Whisper] Model loaded successfully!")
        except Exception as e:
            print(f"[Whisper] Error loading model: {e}")
            print("[Whisper] Falling back to CPU...")
            model = WhisperModel(
                MODEL_SIZE,
                device="cpu",
                compute_type="int8",
                download_root=download_root,
            )
            # Only reached when the fallback worked: report the real device.
            DEVICE = "cpu"
            COMPUTE_TYPE = "int8"
    finally:
        # Always clear the flag. If the fallback raised and the flag stayed
        # True, every later call would spin forever in the wait loop above.
        model_loading = False

    return model
|
|
|
|
def transcribe_audio(audio_data: bytes, language: str = "es") -> dict:
    """Transcribe one audio clip with the loaded Whisper model.

    Args:
        audio_data: Raw bytes of an audio file. They are written to a
            temporary file with a ``.wav`` suffix, and that path is passed to
            ``model.transcribe``.
        language: Language code passed to ``model.transcribe``
            (default ``"es"``).

    Returns:
        On success, a dict with the keys ``success``, ``text``, ``language``,
        ``language_probability``, ``duration``, ``segments`` (a list of
        ``start``/``end``/``text`` dicts), ``engine``, ``model`` and
        ``device``. On any failure, ``{"error": <message>}``; errors raised
        while writing the file or transcribing are reported this way and are
        not propagated.
    """
    if not audio_data:
        # Nothing to decode; answer directly instead of surfacing a decoder
        # error for an empty temp file.
        return {"error": "No audio data"}

    if model is None:
        return {"error": "Model not loaded"}

    # Save audio to a temp file and hand its path to the model.
    # delete=False so the file survives being closed; it is removed below.
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    temp_path = tmp.name

    try:
        # Close the file before the model opens the path.
        with tmp:
            tmp.write(audio_data)

        # Transcribe
        segments, info = model.transcribe(
            temp_path,
            language=language,
            beam_size=5,
            vad_filter=True,  # Voice activity detection
            vad_parameters=dict(
                min_silence_duration_ms=500,
                speech_pad_ms=400,
            ),
        )

        # Collect all segments
        segments_list = [
            {"start": segment.start, "end": segment.end, "text": segment.text}
            for segment in segments
        ]
        text = " ".join(item["text"] for item in segments_list).strip()

        return {
            "success": True,
            "text": text,
            "language": info.language,
            "language_probability": info.language_probability,
            "duration": info.duration,
            "segments": segments_list,
            "engine": "whisper-gpu",
            "model": MODEL_SIZE,
            "device": DEVICE,
        }

    except Exception as e:
        return {"error": str(e)}

    finally:
        # Cleanup temp file (best effort: a failed delete must not mask the
        # transcription result).
        try:
            os.unlink(temp_path)
        except OSError:
            pass
|
|
|
|
async def _send_error(websocket, message):
    """Send a ``{"type": "error", "message": ...}`` frame to the client."""
    await websocket.send(json.dumps({"type": "error", "message": message}))


async def _transcribe_and_reply(websocket, audio_data, language):
    """Run ``transcribe_audio`` in the default executor and send its result."""
    # Transcribe in thread pool to not block
    loop = asyncio.get_running_loop()
    result = await loop.run_in_executor(
        None,
        transcribe_audio,
        audio_data,
        language,
    )
    await websocket.send(json.dumps({"type": "transcription", **result}))


async def handle_client(websocket):
    """Handle WebSocket client connection.

    After making sure the model is loaded, sends a ``ready`` message and then
    serves messages until the client disconnects:

    * binary frame: treated as audio and transcribed with language ``"es"``;
    * text frame: parsed as a JSON object whose ``type`` is one of
      ``"transcribe"`` (base64 ``audio`` plus optional ``language``),
      ``"ping"`` (answered with ``pong``) or ``"status"``. Other types are
      ignored.

    Malformed input (invalid JSON, a JSON value that is not an object, audio
    that is not valid base64) is answered with an ``error`` message and the
    connection stays open.
    """
    print("[Whisper] Client connected")

    try:
        # Ensure model is loaded
        await load_model()

        # Send ready message
        await websocket.send(json.dumps({
            "type": "ready",
            "model": MODEL_SIZE,
            "device": DEVICE,
        }))

        async for message in websocket:
            if isinstance(message, bytes):
                # Binary audio data
                print(f"[Whisper] Received {len(message)} bytes of audio")
                # Default to Spanish
                await _transcribe_and_reply(websocket, message, "es")
                continue

            # JSON command
            try:
                cmd = json.loads(message)
            except json.JSONDecodeError:
                await _send_error(websocket, "Invalid JSON")
                continue

            if not isinstance(cmd, dict):
                # e.g. `[]` or `"ping"`: valid JSON, but there is no "type"
                # to dispatch on.
                await _send_error(websocket, "Command must be a JSON object")
                continue

            cmd_type = cmd.get("type")

            if cmd_type == "transcribe":
                # Audio data sent as base64
                try:
                    audio_data = base64.b64decode(cmd.get("audio", ""))
                except (ValueError, TypeError):
                    # binascii.Error (bad padding) is a ValueError; a
                    # non-string "audio" value raises TypeError.
                    await _send_error(websocket, "Invalid base64 audio")
                    continue

                await _transcribe_and_reply(
                    websocket,
                    audio_data,
                    cmd.get("language", "es"),
                )

            elif cmd_type == "ping":
                await websocket.send(json.dumps({"type": "pong"}))

            elif cmd_type == "status":
                await websocket.send(json.dumps({
                    "type": "status",
                    "model": MODEL_SIZE,
                    "device": DEVICE,
                    "ready": model is not None,
                }))

    except websockets.exceptions.ConnectionClosed:
        print("[Whisper] Client disconnected")
    except Exception as e:
        print(f"[Whisper] Error: {e}")
|
|
|
|
async def main():
    """Load the model, then serve WebSocket clients until the process ends.

    Prints the listening address and model settings, loads the model before
    the socket is opened, and then blocks on a future that is never resolved
    so the server stays up.
    """
    startup_lines = (
        f"[Whisper] Starting server on ws://{HOST}:{PORT}",
        f"[Whisper] Model: {MODEL_SIZE}, Device: {DEVICE}",
        # Pre-load model
        "[Whisper] Pre-loading model...",
    )
    for line in startup_lines:
        print(line)

    await load_model()

    server = websockets.serve(handle_client, HOST, PORT)
    async with server:
        print(f"[Whisper] Server ready! Listening on ws://{HOST}:{PORT}")
        never_resolved = asyncio.Future()
        await never_resolved  # Run forever
|
|
|
|
if __name__ == "__main__":
    # websockets is imported at the top of this file, and that import exits
    # with install instructions when the package is missing. A second
    # import-or-pip-install attempt here could never run, so it is omitted.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # Ctrl+C is the normal way to stop the server; exit without a traceback.
        print("[Whisper] Server stopped")
|