fix: Improve Whisper transcription with WebM to WAV conversion
- Add ffmpeg conversion from WebM/Opus to WAV (16kHz mono PCM)
- Optimize transcription parameters (VAD, temperature, beam_size)
- Add Honduras Spanish context prompt with local expressions
- Fix chunk accumulation display in voice panel
- Add 1.5s recording buffer after releasing Ctrl+Space
- Skip small audio chunks (<5KB) that cause ffmpeg errors
- Use large-v3 model for better accuracy
This commit is contained in:
@@ -11,6 +11,7 @@ import io
|
||||
import wave
|
||||
import tempfile
|
||||
import os
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
@@ -21,13 +22,75 @@ except ImportError as e:
|
||||
print("Run: pip install faster-whisper websockets")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def convert_audio_to_wav(input_data: bytes, input_format: str = "webm") -> "bytes | None":
    """Convert encoded audio bytes to WAV (16 kHz, mono, 16-bit PCM) with ffmpeg.

    Whisper requires WAV/PCM input, but browsers typically record WebM/Opus.
    The payload is written to a scratch directory, transcoded by the
    ``ffmpeg`` executable, and the resulting WAV file is read back.

    Args:
        input_data: Raw bytes of the encoded audio.
        input_format: File extension for the scratch input file, with or
            without a leading dot. Defaults to ``"webm"``.

    Returns:
        The WAV file contents, or ``None`` when the input is empty, ffmpeg is
        missing, times out, exits with a non-zero status, or any other error
        occurs. A diagnostic message is printed in each failure case.
    """
    if not input_data:
        print("[Whisper] Conversion error: empty audio data")
        return None

    extension = input_format.lstrip(".")

    try:
        # A private scratch directory gives input and output distinct,
        # predictable names (even when input_format is "wav") and is removed
        # together with its contents when the block exits, on every path.
        with tempfile.TemporaryDirectory(prefix="whisper_") as work_dir:
            input_path = os.path.join(work_dir, f"input.{extension}")
            output_path = os.path.join(work_dir, "output.wav")

            with open(input_path, "wb") as in_file:
                in_file.write(input_data)

            # Use ffmpeg to convert to WAV (16kHz mono, which Whisper prefers)
            result = subprocess.run(
                [
                    "ffmpeg", "-y",        # Overwrite output
                    "-i", input_path,      # Input file
                    "-ar", "16000",        # Sample rate 16kHz
                    "-ac", "1",            # Mono
                    "-c:a", "pcm_s16le",   # PCM 16-bit little-endian
                    output_path,
                ],
                stdin=subprocess.DEVNULL,  # never let ffmpeg read the server's stdin
                capture_output=True,
                text=True,
                errors="replace",          # tolerate non-UTF-8 bytes in ffmpeg's log
                timeout=30,
            )

            if result.returncode != 0:
                print(f"[Whisper] ffmpeg error: {result.stderr}")
                return None

            # Read the converted WAV file before the scratch directory is removed
            with open(output_path, "rb") as out_file:
                return out_file.read()

    except subprocess.TimeoutExpired:
        print("[Whisper] ffmpeg conversion timed out")
        return None
    except FileNotFoundError:
        print("[Whisper] ffmpeg not found - please install ffmpeg")
        return None
    except Exception as e:
        # Boundary handler: callers expect None rather than an exception.
        print(f"[Whisper] Conversion error: {e}")
        return None
|
||||
|
||||
# Configuration
|
||||
HOST = "localhost"
|
||||
PORT = 4104
|
||||
MODEL_SIZE = "medium" # tiny, base, small, medium, large-v2, large-v3
|
||||
MODEL_SIZE = "large-v3" # tiny, base, small, medium, large-v2, large-v3
|
||||
DEVICE = "cuda" # cuda or cpu
|
||||
COMPUTE_TYPE = "float16" # float16 for GPU, int8 for CPU
|
||||
|
||||
# Spanish context prompt to improve accuracy (Honduras Spanish + tech context)
|
||||
INITIAL_PROMPT = """Transcripción en español hondureño de un desarrollador de software.
|
||||
Contexto: programación, TypeScript, Vue, Python, comandos de terminal, código.
|
||||
Vocabulario técnico: servidor, frontend, backend, chunks, WebSocket, transcripción,
|
||||
componente, función, variable, API, modelo, Whisper, Claude, MCP, configuración.
|
||||
Expresiones hondureñas: vos, tenés, podés, mirá, pues, verdad, ajá, entonces.
|
||||
Diminutivos comunes: ahorita, ratito, prontito, despuesito, chiquito, tantito, poquito."""
|
||||
|
||||
# Global model instance
|
||||
model = None
|
||||
model_loading = False
|
||||
@@ -70,29 +133,51 @@ async def load_model():
|
||||
model_loading = False
|
||||
return model
|
||||
|
||||
def transcribe_audio(audio_data: bytes, language: str = "es") -> dict:
|
||||
def transcribe_audio(audio_data: bytes, language: str = "es", is_webm: bool = True) -> dict:
|
||||
"""Transcribe audio data using Whisper"""
|
||||
global model
|
||||
|
||||
if model is None:
|
||||
return {"error": "Model not loaded"}
|
||||
|
||||
print(f"[Whisper] Received {len(audio_data)} bytes of audio data")
|
||||
|
||||
# Convert WebM to WAV if needed
|
||||
if is_webm:
|
||||
print("[Whisper] Converting WebM to WAV...")
|
||||
wav_data = convert_audio_to_wav(audio_data, "webm")
|
||||
if wav_data is None:
|
||||
return {"error": "Failed to convert audio format. Ensure ffmpeg is installed."}
|
||||
print(f"[Whisper] Converted to {len(wav_data)} bytes WAV")
|
||||
else:
|
||||
wav_data = audio_data
|
||||
|
||||
# Save audio to temp file (faster-whisper needs a file path)
|
||||
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
|
||||
f.write(audio_data)
|
||||
f.write(wav_data)
|
||||
temp_path = f.name
|
||||
|
||||
try:
|
||||
# Transcribe
|
||||
# Transcribe with optimized parameters
|
||||
print(f"[Whisper] Transcribing {temp_path}...")
|
||||
segments, info = model.transcribe(
|
||||
temp_path,
|
||||
language=language,
|
||||
beam_size=5,
|
||||
best_of=5, # Number of candidates when sampling
|
||||
temperature=0.0, # Use greedy decoding (most accurate)
|
||||
vad_filter=True, # Voice activity detection
|
||||
vad_parameters=dict(
|
||||
min_silence_duration_ms=500,
|
||||
speech_pad_ms=400
|
||||
)
|
||||
min_silence_duration_ms=300, # Shorter silence detection
|
||||
speech_pad_ms=200, # Padding around speech
|
||||
threshold=0.5 # VAD sensitivity (lower = more sensitive)
|
||||
),
|
||||
initial_prompt=INITIAL_PROMPT, # Context for better Spanish transcription
|
||||
condition_on_previous_text=True, # Use context from previous segments
|
||||
no_speech_threshold=0.6,
|
||||
log_prob_threshold=-1.0,
|
||||
compression_ratio_threshold=2.4,
|
||||
word_timestamps=False # Faster without word-level timestamps
|
||||
)
|
||||
|
||||
# Collect all segments
|
||||
@@ -106,6 +191,8 @@ def transcribe_audio(audio_data: bytes, language: str = "es") -> dict:
|
||||
"text": segment.text
|
||||
})
|
||||
|
||||
print(f"[Whisper] Transcription result: '{text.strip()}'")
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"text": text.strip(),
|
||||
@@ -119,6 +206,7 @@ def transcribe_audio(audio_data: bytes, language: str = "es") -> dict:
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
print(f"[Whisper] Transcription error: {e}")
|
||||
return {"error": str(e)}
|
||||
|
||||
finally:
|
||||
@@ -145,16 +233,14 @@ async def handle_client(websocket):
|
||||
try:
|
||||
async for message in websocket:
|
||||
if isinstance(message, bytes):
|
||||
# Binary audio data
|
||||
print(f"[Whisper] Received {len(message)} bytes of audio")
|
||||
# Binary audio data (likely WebM format from browser)
|
||||
print(f"[Whisper] Received {len(message)} bytes of binary audio")
|
||||
|
||||
# Transcribe in thread pool to not block
|
||||
loop = asyncio.get_event_loop()
|
||||
result = await loop.run_in_executor(
|
||||
None,
|
||||
transcribe_audio,
|
||||
message,
|
||||
"es" # Default to Spanish
|
||||
lambda: transcribe_audio(message, "es", is_webm=True)
|
||||
)
|
||||
|
||||
await websocket.send(json.dumps({
|
||||
@@ -168,19 +254,24 @@ async def handle_client(websocket):
|
||||
cmd = json.loads(message)
|
||||
|
||||
if cmd.get("type") == "transcribe":
|
||||
# Audio data sent as base64
|
||||
# Audio data sent as base64 (WebM format from browser)
|
||||
import base64
|
||||
audio_data = base64.b64decode(cmd.get("audio", ""))
|
||||
language = cmd.get("language", "es")
|
||||
is_partial = cmd.get("partial", False)
|
||||
|
||||
print(f"[Whisper] Transcribe request: {len(audio_data)} bytes, lang={language}, partial={is_partial}")
|
||||
|
||||
loop = asyncio.get_event_loop()
|
||||
result = await loop.run_in_executor(
|
||||
None,
|
||||
transcribe_audio,
|
||||
audio_data,
|
||||
language
|
||||
lambda: transcribe_audio(audio_data, language, is_webm=True)
|
||||
)
|
||||
|
||||
# Add partial flag to result
|
||||
if is_partial:
|
||||
result["partial"] = True
|
||||
|
||||
await websocket.send(json.dumps({
|
||||
"type": "transcription",
|
||||
**result
|
||||
|
||||
Reference in New Issue
Block a user