feat: Add Whisper GPU speech-to-text with progressive transcription
- Add faster-whisper Python server for GPU-accelerated transcription
- Support dual mode: Web Speech API or Whisper GPU (toggleable)
- Progressive transcription every 3 seconds while recording
- Separate terminal server process (stable during hot-reload)
- Add Ctrl+V paste and Ctrl+C copy support in FloatingTerminal
- Add MCP tools: whisper_start, whisper_stop, whisper_toggle, whisper_status
- Update package.json with separate api/terminal/frontend processes
This commit is contained in:
233
server/whisper_server.py
Normal file
233
server/whisper_server.py
Normal file
@@ -0,0 +1,233 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Whisper Server - GPU-accelerated speech-to-text using faster-whisper
|
||||
WebSocket server that receives audio and returns transcriptions
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import sys
|
||||
import io
|
||||
import wave
|
||||
import tempfile
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
import websockets
|
||||
from faster_whisper import WhisperModel
|
||||
except ImportError as e:
|
||||
print(f"Missing dependency: {e}")
|
||||
print("Run: pip install faster-whisper websockets")
|
||||
sys.exit(1)
|
||||
|
||||
# Configuration
#
# Every setting can be overridden with an environment variable so the server
# can be pointed at another port, a smaller model, or a CPU-only host without
# editing this file. With no overrides the values are the same as before.
HOST = os.environ.get("WHISPER_HOST", "localhost")
PORT = int(os.environ.get("WHISPER_PORT", "4104"))
# Model size: tiny, base, small, medium, large-v2, large-v3
MODEL_SIZE = os.environ.get("WHISPER_MODEL", "medium")
# Inference device: cuda or cpu
DEVICE = os.environ.get("WHISPER_DEVICE", "cuda")
# Compute type: float16 for GPU, int8 for CPU (default follows DEVICE)
COMPUTE_TYPE = os.environ.get(
    "WHISPER_COMPUTE_TYPE",
    "float16" if DEVICE == "cuda" else "int8",
)

# Global model instance, populated by load_model()
model = None
# True while load_model() is in the middle of building the model
model_loading = False
|
||||
|
||||
async def load_model():
    """Return the shared Whisper model, loading it on first use.

    The model is built from the module-level MODEL_SIZE, DEVICE and
    COMPUTE_TYPE settings. If that attempt raises, a second attempt is made
    on the CPU with int8. The result is cached in the global ``model``.

    A caller that arrives while another load is in progress polls every
    100 ms until the ``model_loading`` flag clears.

    Returns:
        The loaded model. A caller that waited on another load gets ``None``
        if that load failed.

    Raises:
        Exception: Whatever the CPU fallback raises when it fails as well.
    """
    global model, model_loading

    if model is not None:
        return model

    if model_loading:
        # Wait for the in-progress load to finish
        while model_loading:
            await asyncio.sleep(0.1)
        return model

    model_loading = True
    print(f"[Whisper] Loading model '{MODEL_SIZE}' on {DEVICE}...")

    download_root = str(Path.home() / ".cache" / "whisper")

    try:
        try:
            # Load model - this downloads on first run
            model = WhisperModel(
                MODEL_SIZE,
                device=DEVICE,
                compute_type=COMPUTE_TYPE,
                download_root=download_root,
            )
            print(f"[Whisper] Model loaded successfully!")
        except Exception as e:
            print(f"[Whisper] Error loading model: {e}")
            print("[Whisper] Falling back to CPU...")
            model = WhisperModel(
                MODEL_SIZE,
                device="cpu",
                compute_type="int8",
                download_root=download_root,
            )
    finally:
        # Clear the flag even when the fallback raises; otherwise every later
        # caller would spin forever in the wait loop above.
        model_loading = False

    return model
|
||||
|
||||
def transcribe_audio(audio_data: bytes, language: str = "es") -> dict:
    """Transcribe an audio clip with the globally loaded Whisper model.

    The bytes are written unchanged to a temporary ``.wav`` file whose path
    is handed to ``model.transcribe``. The file is removed afterwards,
    whether or not transcription succeeded.

    Args:
        audio_data: Raw bytes of the audio clip.
        language: Language code passed to the model (defaults to "es").

    Returns:
        On success, a dict with the keys "success", "text", "language",
        "language_probability", "duration", "segments", "engine", "model"
        and "device". On any failure, ``{"error": <message>}``.
    """
    if model is None:
        return {"error": "Model not loaded"}

    if not audio_data:
        # Nothing to decode; skip the temp file and the model call.
        return {"error": "No audio data received"}

    temp_path = None
    try:
        # Save audio to a temp file and pass its path to the model. The write
        # happens inside the try so a failed write is reported as an error
        # dict and the file is still cleaned up.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            temp_path = f.name
            f.write(audio_data)

        # Transcribe
        segments, info = model.transcribe(
            temp_path,
            language=language,
            beam_size=5,
            vad_filter=True,  # Voice activity detection
            vad_parameters=dict(
                min_silence_duration_ms=500,
                speech_pad_ms=400,
            ),
        )

        # Collect all segments (single pass over the returned iterable)
        segments_list = [
            {"start": segment.start, "end": segment.end, "text": segment.text}
            for segment in segments
        ]
        text = " ".join(item["text"] for item in segments_list).strip()

        return {
            "success": True,
            "text": text,
            "language": info.language,
            "language_probability": info.language_probability,
            "duration": info.duration,
            "segments": segments_list,
            "engine": "whisper-gpu",
            "model": MODEL_SIZE,
            "device": DEVICE,
        }

    except Exception as e:
        return {"error": str(e)}

    finally:
        # Best-effort cleanup of the temp file. Only filesystem errors are
        # ignored, so KeyboardInterrupt/SystemExit are no longer swallowed.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError:
                pass
|
||||
|
||||
async def handle_client(websocket):
    """Serve one WebSocket client until it disconnects.

    After making sure the model is loaded, a "ready" message is sent. Then
    each incoming message is handled as follows:

    * bytes: treated as audio and transcribed with language "es".
    * text: parsed as JSON. ``{"type": "transcribe", "audio": <base64>,
      "language": ...}`` transcribes the decoded audio, ``{"type": "ping"}``
      answers "pong", and ``{"type": "status"}`` reports model, device and
      readiness. Other command types are ignored.

    Malformed JSON, a non-object JSON payload, or invalid base64 audio gets
    an ``{"type": "error", ...}`` reply and the connection stays open.

    Args:
        websocket: The connection object supplied by ``websockets.serve``.
    """
    import base64

    print(f"[Whisper] Client connected")

    # Ensure model is loaded
    await load_model()

    # Send ready message
    await websocket.send(json.dumps({
        "type": "ready",
        "model": MODEL_SIZE,
        "device": DEVICE
    }))

    try:
        async for message in websocket:
            if isinstance(message, bytes):
                # Binary audio data
                print(f"[Whisper] Received {len(message)} bytes of audio")
                # Default to Spanish
                await _transcribe_and_reply(websocket, message, "es")
                continue

            # JSON command
            try:
                cmd = json.loads(message)
            except json.JSONDecodeError:
                await _send_error(websocket, "Invalid JSON")
                continue

            if not isinstance(cmd, dict):
                # Valid JSON such as a number or list has no .get(); reply
                # with an error rather than letting the connection drop.
                await _send_error(websocket, "Command must be a JSON object")
                continue

            cmd_type = cmd.get("type")

            if cmd_type == "transcribe":
                # Audio data sent as base64
                try:
                    audio_data = base64.b64decode(cmd.get("audio", ""))
                except (ValueError, TypeError):
                    # binascii.Error is a ValueError subclass; TypeError
                    # covers a non-string "audio" value.
                    await _send_error(websocket, "Invalid base64 audio")
                    continue

                await _transcribe_and_reply(
                    websocket, audio_data, cmd.get("language", "es")
                )

            elif cmd_type == "ping":
                await websocket.send(json.dumps({"type": "pong"}))

            elif cmd_type == "status":
                await websocket.send(json.dumps({
                    "type": "status",
                    "model": MODEL_SIZE,
                    "device": DEVICE,
                    "ready": model is not None
                }))

    except websockets.exceptions.ConnectionClosed:
        print("[Whisper] Client disconnected")
    except Exception as e:
        print(f"[Whisper] Error: {e}")


async def _transcribe_and_reply(websocket, audio_data, language):
    """Run transcribe_audio in the default executor and send its result."""
    # Transcribe in thread pool to not block the event loop
    loop = asyncio.get_running_loop()
    result = await loop.run_in_executor(
        None, transcribe_audio, audio_data, language
    )
    await websocket.send(json.dumps({
        "type": "transcription",
        **result
    }))


async def _send_error(websocket, message):
    """Send an error message to the client."""
    await websocket.send(json.dumps({
        "type": "error",
        "message": message
    }))
|
||||
|
||||
async def main():
    """Pre-load the model, then run the WebSocket server indefinitely."""
    print(f"[Whisper] Starting server on ws://{HOST}:{PORT}")
    print(f"[Whisper] Model: {MODEL_SIZE}, Device: {DEVICE}")

    # Load the model before the server starts listening
    print("[Whisper] Pre-loading model...")
    await load_model()

    server = websockets.serve(handle_client, HOST, PORT)
    async with server:
        print(f"[Whisper] Server ready! Listening on ws://{HOST}:{PORT}")
        # Nothing ever resolves this future, so awaiting it runs forever
        never_done = asyncio.Future()
        await never_done
|
||||
|
||||
if __name__ == "__main__":
    # No install step here: the dependency check at the top of this file has
    # already imported websockets, or exited the process if it was missing,
    # so a pip fallback at this point could never run.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # Ctrl+C is the normal way to stop the server; exit without a traceback.
        print("[Whisper] Server stopped")
|
||||
Reference in New Issue
Block a user