295 lines
9.6 KiB
Python
295 lines
9.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Whisper Server - GPU-accelerated speech-to-text using faster-whisper
|
|
WebSocket server that receives audio and returns transcriptions
|
|
"""
|
|
|
|
import asyncio
import base64
import io
import json
import os
import subprocess
import sys
import tempfile
import wave
from pathlib import Path
from typing import Optional
|
|
|
|
try:
|
|
import websockets
|
|
from faster_whisper import WhisperModel
|
|
except ImportError as e:
|
|
print(f"Missing dependency: {e}")
|
|
print("Run: pip install faster-whisper websockets")
|
|
sys.exit(1)
|
|
|
|
|
|
def convert_audio_to_wav(input_data: bytes, input_format: str = "webm") -> Optional[bytes]:
    """
    Convert audio data to WAV format using ffmpeg.

    The data is piped through ffmpeg's stdin/stdout, so ffmpeg probes the
    actual data format instead of relying on a file extension. The output is
    requested as 16 kHz, mono, 16-bit little-endian PCM.

    Args:
        input_data: Encoded audio bytes to convert.
        input_format: Nominal source format. Currently unused (ffmpeg
            auto-detects the format); kept so existing callers keep working.

    Returns:
        The WAV bytes written by ffmpeg, or None when the input is empty,
        ffmpeg is not installed, the conversion exceeds the 30 second
        timeout, or ffmpeg exits with a non-zero status.
    """
    # Nothing to convert: skip spawning a subprocess that is bound to fail.
    if not input_data:
        print("[Whisper] No audio data to convert")
        return None

    command = [
        "ffmpeg", "-y",
        "-i", "pipe:0",        # Read from stdin (auto-detect format)
        "-ar", "16000",        # Sample rate 16kHz
        "-ac", "1",            # Mono
        "-c:a", "pcm_s16le",   # PCM 16-bit little-endian
        "-f", "wav",           # Output format
        "pipe:1",              # Write to stdout
    ]

    try:
        result = subprocess.run(
            command, input=input_data, capture_output=True, timeout=30
        )
    except subprocess.TimeoutExpired:
        print("[Whisper] ffmpeg conversion timed out")
        return None
    except FileNotFoundError:
        print("[Whisper] ffmpeg not found - please install ffmpeg")
        return None
    except Exception as e:
        # Best-effort boundary: any other failure is reported, not raised.
        print(f"[Whisper] Conversion error: {e}")
        return None

    if result.returncode != 0:
        stderr_text = result.stderr.decode("utf-8", errors="replace")
        print(f"[Whisper] ffmpeg error: {stderr_text}")
        return None

    return result.stdout
|
|
|
|
# ---------------------------------------------------------------------------
# Server and model configuration
# ---------------------------------------------------------------------------
HOST = "0.0.0.0"          # Bind to every interface (needed for Traefik proxy)
PORT = 4104
MODEL_SIZE = "large-v3"   # Best standard model for Spanish
DEVICE = "cuda"           # "cuda" or "cpu"
COMPUTE_TYPE = "float16"  # float16 for GPU, int8 for CPU

# Display name: the last "/"-separated component of MODEL_SIZE, or the whole
# value when it contains no "/".
MODEL_NAME = MODEL_SIZE.rsplit("/", 1)[-1]

# Spanish context prompt passed to the model to improve accuracy
# (Honduran Spanish + software/tech vocabulary).
INITIAL_PROMPT = (
    "Transcripción en español hondureño de un desarrollador de software.\n"
    "Contexto: programación, TypeScript, Vue, Python, comandos de terminal, código.\n"
    "Vocabulario técnico: servidor, frontend, backend, chunks, WebSocket, transcripción,\n"
    "componente, función, variable, API, modelo, Whisper, Claude, MCP, configuración.\n"
    "Expresiones hondureñas: vos, tenés, podés, mirá, pues, verdad, ajá, entonces.\n"
    "Diminutivos comunes: ahorita, ratito, prontito, despuesito, chiquito, tantito, poquito."
)

# Process-wide model state: the loaded model instance (None until loaded)
# and a flag that is True while a load is in progress.
model = None
model_loading = False
|
|
|
|
async def load_model():
    """
    Load the Whisper model once and return the shared instance.

    The first caller constructs the model with the configured DEVICE and
    COMPUTE_TYPE; if that raises, it retries on CPU with int8. Callers that
    arrive while a load is in progress poll every 0.1 s until it finishes.

    Returns:
        The module-level model instance. A caller that waited on a load
        which ultimately failed gets None.

    Raises:
        Exception: whatever the CPU fallback construction raises, if the
            fallback fails as well.
    """
    global model, model_loading

    if model is not None:
        return model

    if model_loading:
        # Wait for the in-progress load to finish.
        while model_loading:
            await asyncio.sleep(0.1)
        return model

    model_loading = True
    cache_dir = str(Path.home() / ".cache" / "whisper")
    print(f"[Whisper] Loading model '{MODEL_NAME}' on {DEVICE}...")

    try:
        try:
            # Load model - this downloads on first run
            model = WhisperModel(
                MODEL_SIZE,
                device=DEVICE,
                compute_type=COMPUTE_TYPE,
                download_root=cache_dir,
            )
            print("[Whisper] Model loaded successfully!")
        except Exception as e:
            print(f"[Whisper] Error loading model: {e}")
            print("[Whisper] Falling back to CPU...")
            model = WhisperModel(
                MODEL_SIZE,
                device="cpu",
                compute_type="int8",
                download_root=cache_dir,
            )
    finally:
        # Always clear the flag: if the fallback raises too, waiters must not
        # spin forever and a later call must be able to retry.
        model_loading = False

    return model
|
|
|
|
def transcribe_audio(audio_data: bytes, language: str = "es", is_webm: bool = True) -> dict:
    """
    Transcribe audio data using the loaded Whisper model.

    Args:
        audio_data: Raw audio bytes. Converted to WAV via
            convert_audio_to_wav() when is_webm is True, otherwise written
            to the temp file unchanged.
        language: Language code passed to the model.
        is_webm: Whether audio_data must be converted to WAV first.

    Returns:
        On success, a dict with "success", "text", "language",
        "language_probability", "duration", "segments" (each with "start",
        "end" and "text"), "engine", "model" and "device".
        On failure, a dict with a single "error" message (model not loaded,
        conversion failed, or the transcription raised).
    """
    if model is None:
        return {"error": "Model not loaded"}

    # Convert WebM to WAV if needed
    if is_webm:
        wav_data = convert_audio_to_wav(audio_data, "webm")
        if wav_data is None:
            return {"error": "Failed to convert audio format. Ensure ffmpeg is installed."}
    else:
        wav_data = audio_data

    # Write the audio to a temp file so it can be handed to the model by path.
    # delete=False keeps the file after the handle closes; it is removed in
    # the finally clause below.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        f.write(wav_data)
        temp_path = f.name

    try:
        # Transcribe with optimized parameters
        segments, info = model.transcribe(
            temp_path,
            language=language,
            beam_size=5,
            best_of=5,                          # Number of candidates when sampling
            temperature=0.0,
            vad_filter=True,                    # Voice activity detection
            vad_parameters=dict(
                min_silence_duration_ms=300,    # Shorter silence detection
                speech_pad_ms=200,              # Padding around speech
                threshold=0.5,                  # VAD sensitivity (lower = more sensitive)
            ),
            initial_prompt=INITIAL_PROMPT,      # Context for better Spanish transcription
            condition_on_previous_text=True,    # Use context from previous segments
            no_speech_threshold=0.6,
            log_prob_threshold=-1.0,
            compression_ratio_threshold=2.4,
            word_timestamps=False,              # Faster without word-level timestamps
        )

        # Collect every segment while still inside the try block, so errors
        # raised during iteration are reported like any other failure.
        segments_list = [
            {"start": segment.start, "end": segment.end, "text": segment.text}
            for segment in segments
        ]
        text = " ".join(item["text"] for item in segments_list)

        return {
            "success": True,
            "text": text.strip(),
            "language": info.language,
            "language_probability": info.language_probability,
            "duration": info.duration,
            "segments": segments_list,
            "engine": "whisper-gpu",
            "model": MODEL_NAME,
            "device": DEVICE,
        }

    except Exception as e:
        print(f"[Whisper] Transcription error: {e}")
        return {"error": str(e)}

    finally:
        # Best-effort cleanup of the temp file; only filesystem errors are
        # ignored, so interrupts and programming errors are not swallowed.
        try:
            os.unlink(temp_path)
        except OSError:
            pass
|
|
|
|
async def _send_json(websocket, payload: dict) -> None:
    """Serialize payload as JSON and send it over the websocket."""
    await websocket.send(json.dumps(payload))


async def handle_client(websocket):
    """
    Handle one WebSocket client connection.

    After making sure the model is loaded, sends a "ready" message and then
    processes incoming messages until the connection closes:

    * Binary message: treated as audio, transcribed as Spanish, answered
      with a "transcription" message.
    * Text message: parsed as a JSON command object with a "type" of
      "transcribe" (base64 "audio", optional "language" and "partial"),
      "ping" (answered with "pong") or "status". Other types are ignored.

    Invalid JSON, a JSON value that is not an object, or undecodable base64
    audio is answered with an "error" message and the connection stays open.

    Args:
        websocket: The connection object supplied by websockets.serve().
    """
    # Ensure model is loaded
    await load_model()

    # Send ready message
    await _send_json(websocket, {
        "type": "ready",
        "model": MODEL_NAME,
        "device": DEVICE,
    })

    loop = asyncio.get_running_loop()

    try:
        async for message in websocket:
            if isinstance(message, bytes):
                # Binary audio data (likely WebM format from browser).
                # Transcribe in the default executor so the event loop is
                # not blocked while the model runs.
                result = await loop.run_in_executor(
                    None, transcribe_audio, message, "es", True
                )
                await _send_json(websocket, {"type": "transcription", **result})
                continue

            # Text frame: JSON command
            try:
                cmd = json.loads(message)
            except json.JSONDecodeError:
                await _send_json(websocket, {
                    "type": "error",
                    "message": "Invalid JSON",
                })
                continue

            if not isinstance(cmd, dict):
                # Valid JSON but not an object (e.g. a list or number);
                # cmd.get() below would raise and drop the connection.
                await _send_json(websocket, {
                    "type": "error",
                    "message": "Expected a JSON object",
                })
                continue

            cmd_type = cmd.get("type")

            if cmd_type == "transcribe":
                # Audio data sent as base64 (WebM format from browser)
                try:
                    audio_data = base64.b64decode(cmd.get("audio", ""))
                except (ValueError, TypeError):
                    # binascii.Error is a ValueError; TypeError covers a
                    # non-string "audio" value such as null.
                    await _send_json(websocket, {
                        "type": "error",
                        "message": "Invalid base64 audio",
                    })
                    continue

                language = cmd.get("language", "es")
                is_partial = cmd.get("partial", False)

                result = await loop.run_in_executor(
                    None, transcribe_audio, audio_data, language, True
                )

                # Add partial flag to result
                if is_partial:
                    result["partial"] = True

                await _send_json(websocket, {"type": "transcription", **result})

            elif cmd_type == "ping":
                await _send_json(websocket, {"type": "pong"})

            elif cmd_type == "status":
                await _send_json(websocket, {
                    "type": "status",
                    "model": MODEL_NAME,
                    "device": DEVICE,
                    "ready": model is not None,
                })

            # Any other command type is ignored, as before.

    except websockets.exceptions.ConnectionClosed:
        # Normal end of a connection; nothing to report.
        pass
    except Exception as e:
        print(f"[Whisper] Error: {e}")
|
|
|
|
async def main():
    """Load the model up front, then serve WebSocket clients until stopped."""
    banner = f"[Whisper] Model: {MODEL_NAME} | Device: {DEVICE} | Port: {PORT}"
    print(banner)

    # Load the model before accepting connections.
    await load_model()

    server = websockets.serve(handle_client, HOST, PORT)
    async with server:
        print("[Whisper] Ready")
        # A future that is never resolved keeps the server running.
        never_done = asyncio.Future()
        await never_done
|
|
|
|
if __name__ == "__main__":
    # The guarded import at the top of this file has already imported
    # websockets (or exited with an install hint), so the former
    # install-on-demand fallback here could never run and has been removed.
    asyncio.run(main())
|