fix: Improve Whisper transcription with WebM to WAV conversion

- Add ffmpeg conversion from WebM/Opus to WAV (16kHz mono PCM)
- Optimize transcription parameters (VAD, temperature, beam_size)
- Add Honduras Spanish context prompt with local expressions
- Fix chunk accumulation display in voice panel
- Add 1.5s recording buffer after releasing Ctrl+Space
- Skip small audio chunks (<5KB) that cause ffmpeg errors
- Use large-v3 model for better accuracy
This commit is contained in:
2026-02-14 00:16:01 -06:00
parent 638e6ac8e0
commit ac17a9f292
2 changed files with 163 additions and 42 deletions

View File

@@ -11,6 +11,7 @@ import io
import wave
import tempfile
import os
import subprocess
from pathlib import Path
try:
@@ -21,13 +22,75 @@ except ImportError as e:
print("Run: pip install faster-whisper websockets")
sys.exit(1)
def convert_audio_to_wav(input_data: bytes, input_format: str = "webm") -> "bytes | None":
    """
    Convert audio data to 16 kHz mono 16-bit PCM WAV using ffmpeg.

    Browsers typically record WebM/Opus; this writes the payload to a
    temporary file, runs ffmpeg on it, and reads back the WAV result.

    Args:
        input_data: Raw bytes of the encoded audio.
        input_format: Container extension of ``input_data`` (without the
            dot); used as the suffix of the temporary input file.

    Returns:
        The WAV file contents, or ``None`` when the input is empty, ffmpeg
        is missing, ffmpeg exits non-zero, the 30 s timeout expires, or any
        other error occurs. Failures are reported on stdout, never raised.
    """
    if not input_data:
        # Nothing to decode; ffmpeg would only fail on an empty file.
        print("[Whisper] Conversion error: empty audio data")
        return None

    input_path = None
    output_path = None
    try:
        # Create temp files for input and output. The output gets its own
        # temp file instead of a name derived from the input path: string
        # replacement on the path breaks when a directory contains the
        # extension and collides with the input when input_format is "wav".
        with tempfile.NamedTemporaryFile(suffix=f".{input_format}", delete=False) as in_file:
            input_path = in_file.name
            in_file.write(input_data)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_file:
            output_path = out_file.name

        # Use ffmpeg to convert to WAV (16kHz mono, which Whisper prefers)
        result = subprocess.run(
            [
                "ffmpeg", "-y",          # Overwrite the pre-created output file
                "-i", input_path,        # Input file
                "-ar", "16000",          # Sample rate 16kHz
                "-ac", "1",              # Mono
                "-c:a", "pcm_s16le",     # PCM 16-bit little-endian
                output_path,
            ],
            stdin=subprocess.DEVNULL,    # Never let ffmpeg wait on our stdin
            capture_output=True,
            text=True,
            errors="replace",            # Odd bytes in ffmpeg output must not fail the call
            timeout=30,
        )

        if result.returncode != 0:
            print(f"[Whisper] ffmpeg error: {result.stderr}")
            return None

        # Read the converted WAV file
        with open(output_path, "rb") as f:
            return f.read()

    except subprocess.TimeoutExpired:
        print("[Whisper] ffmpeg conversion timed out")
        return None
    except FileNotFoundError:
        print("[Whisper] ffmpeg not found - please install ffmpeg")
        return None
    except Exception as e:
        print(f"[Whisper] Conversion error: {e}")
        return None
    finally:
        # Best-effort cleanup of whichever temp files were actually created.
        for path in (input_path, output_path):
            if path is None:
                continue
            try:
                os.unlink(path)
            except OSError:
                pass
# Configuration
# NOTE(review): HOST and PORT are presumably the WebSocket listen address;
# the code that binds the server is not visible here - confirm.
HOST = "localhost"
PORT = 4104
# NOTE(review): MODEL_SIZE is assigned twice in a row; the later value,
# "large-v3", is the one that remains in effect.
MODEL_SIZE = "medium" # tiny, base, small, medium, large-v2, large-v3
MODEL_SIZE = "large-v3" # tiny, base, small, medium, large-v2, large-v3
DEVICE = "cuda" # cuda or cpu
COMPUTE_TYPE = "float16" # float16 for GPU, int8 for CPU
# Spanish context prompt to improve accuracy (Honduras Spanish + tech context)
# Passed as initial_prompt to model.transcribe in transcribe_audio. The text
# is runtime data fed to the model, so it intentionally stays in Spanish.
INITIAL_PROMPT = """Transcripción en español hondureño de un desarrollador de software.
Contexto: programación, TypeScript, Vue, Python, comandos de terminal, código.
Vocabulario técnico: servidor, frontend, backend, chunks, WebSocket, transcripción,
componente, función, variable, API, modelo, Whisper, Claude, MCP, configuración.
Expresiones hondureñas: vos, tenés, podés, mirá, pues, verdad, ajá, entonces.
Diminutivos comunes: ahorita, ratito, prontito, despuesito, chiquito, tantito, poquito."""
# Global model instance
# Stays None until a model is loaded; transcribe_audio returns an error dict
# while it is None.
model = None
# NOTE(review): presumably True while load_model is running - only the line
# resetting it to False is visible in this view; confirm.
model_loading = False
@@ -70,29 +133,51 @@ async def load_model():
model_loading = False
return model
def transcribe_audio(audio_data: bytes, language: str = "es") -> dict:
def transcribe_audio(audio_data: bytes, language: str = "es", is_webm: bool = True) -> dict:
"""Transcribe audio data using Whisper"""
global model
if model is None:
return {"error": "Model not loaded"}
print(f"[Whisper] Received {len(audio_data)} bytes of audio data")
# Convert WebM to WAV if needed
if is_webm:
print("[Whisper] Converting WebM to WAV...")
wav_data = convert_audio_to_wav(audio_data, "webm")
if wav_data is None:
return {"error": "Failed to convert audio format. Ensure ffmpeg is installed."}
print(f"[Whisper] Converted to {len(wav_data)} bytes WAV")
else:
wav_data = audio_data
# Save audio to temp file (faster-whisper needs a file path)
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
f.write(audio_data)
f.write(wav_data)
temp_path = f.name
try:
# Transcribe
# Transcribe with optimized parameters
print(f"[Whisper] Transcribing {temp_path}...")
segments, info = model.transcribe(
temp_path,
language=language,
beam_size=5,
best_of=5, # Number of candidates when sampling
temperature=0.0, # Use greedy decoding (most accurate)
vad_filter=True, # Voice activity detection
vad_parameters=dict(
min_silence_duration_ms=500,
speech_pad_ms=400
)
min_silence_duration_ms=300, # Shorter silence detection
speech_pad_ms=200, # Padding around speech
threshold=0.5 # VAD sensitivity (lower = more sensitive)
),
initial_prompt=INITIAL_PROMPT, # Context for better Spanish transcription
condition_on_previous_text=True, # Use context from previous segments
no_speech_threshold=0.6,
log_prob_threshold=-1.0,
compression_ratio_threshold=2.4,
word_timestamps=False # Faster without word-level timestamps
)
# Collect all segments
@@ -106,6 +191,8 @@ def transcribe_audio(audio_data: bytes, language: str = "es") -> dict:
"text": segment.text
})
print(f"[Whisper] Transcription result: '{text.strip()}'")
return {
"success": True,
"text": text.strip(),
@@ -119,6 +206,7 @@ def transcribe_audio(audio_data: bytes, language: str = "es") -> dict:
}
except Exception as e:
print(f"[Whisper] Transcription error: {e}")
return {"error": str(e)}
finally:
@@ -145,16 +233,14 @@ async def handle_client(websocket):
try:
async for message in websocket:
if isinstance(message, bytes):
# Binary audio data
print(f"[Whisper] Received {len(message)} bytes of audio")
# Binary audio data (likely WebM format from browser)
print(f"[Whisper] Received {len(message)} bytes of binary audio")
# Transcribe in thread pool to not block
loop = asyncio.get_event_loop()
result = await loop.run_in_executor(
None,
transcribe_audio,
message,
"es" # Default to Spanish
lambda: transcribe_audio(message, "es", is_webm=True)
)
await websocket.send(json.dumps({
@@ -168,19 +254,24 @@ async def handle_client(websocket):
cmd = json.loads(message)
if cmd.get("type") == "transcribe":
# Audio data sent as base64
# Audio data sent as base64 (WebM format from browser)
import base64
audio_data = base64.b64decode(cmd.get("audio", ""))
language = cmd.get("language", "es")
is_partial = cmd.get("partial", False)
print(f"[Whisper] Transcribe request: {len(audio_data)} bytes, lang={language}, partial={is_partial}")
loop = asyncio.get_event_loop()
result = await loop.run_in_executor(
None,
transcribe_audio,
audio_data,
language
lambda: transcribe_audio(audio_data, language, is_webm=True)
)
# Add partial flag to result
if is_partial:
result["partial"] = True
await websocket.send(json.dumps({
"type": "transcription",
**result