fix: Improve Whisper transcription with WebM to WAV conversion

- Add ffmpeg conversion from WebM/Opus to WAV (16kHz mono PCM)
- Optimize transcription parameters (VAD, temperature, beam_size)
- Add Honduras Spanish context prompt with local expressions
- Fix chunk accumulation display in voice panel
- Add 1.5s recording buffer after releasing Ctrl+Space
- Skip small audio chunks (<5KB) that cause ffmpeg errors
- Use large-v3 model for better accuracy
This commit is contained in:
2026-02-14 00:16:01 -06:00
parent 638e6ac8e0
commit ac17a9f292
2 changed files with 163 additions and 42 deletions

View File

@@ -42,6 +42,7 @@ const connected = ref(false)
let keyDownTime = 0 let keyDownTime = 0
let holdTimeout: number | null = null let holdTimeout: number | null = null
const isPushToTalk = ref(false) const isPushToTalk = ref(false)
let pendingWhisperSend = false // Flag to send transcript when Whisper responds
// ============ WHISPER MODE ============ // ============ WHISPER MODE ============
const useWhisper = ref(false) const useWhisper = ref(false)
@@ -194,19 +195,27 @@ function connectWhisperSocket() {
const fullText = msg.text.trim() const fullText = msg.text.trim()
if (msg.partial) { if (msg.partial) {
// For partial results, show as interim (will be replaced) // For partial results, show full accumulated transcription
// Only show text that's new since last transcription transcript.value = fullText + ' '
const newText = fullText.substring(lastTranscriptLength).trim() interimTranscript.value = ''
if (newText) { console.log(`[Voice] 🔄 WHISPER partial:`, fullText)
interimTranscript.value = newText } else {
console.log(`[Voice] 🔄 WHISPER partial:`, newText) // Final result
}
} else {
// Final result - replace everything
transcript.value = fullText + ' ' transcript.value = fullText + ' '
interimTranscript.value = '' interimTranscript.value = ''
lastTranscriptLength = 0
console.log(`[Voice] 🎯 WHISPER-GPU (${msg.model}/${msg.device}):`, fullText) console.log(`[Voice] 🎯 WHISPER-GPU (${msg.model}/${msg.device}):`, fullText)
// Auto-send if push-to-talk was waiting for this
if (pendingWhisperSend) {
pendingWhisperSend = false
console.log('[Voice] Whisper response received, sending transcript')
if (transcript.value.trim()) {
sendTranscriptAndClose()
} else {
isPushToTalk.value = false
close()
}
}
} }
// Update last transcript length for next partial // Update last transcript length for next partial
@@ -214,6 +223,11 @@ function connectWhisperSocket() {
} else if (msg.error) { } else if (msg.error) {
error.value = msg.error error.value = msg.error
console.error('[Voice] Whisper error:', msg.error) console.error('[Voice] Whisper error:', msg.error)
// Clear pending send on error
if (pendingWhisperSend) {
pendingWhisperSend = false
isPushToTalk.value = false
}
} }
} }
} catch (e) { } catch (e) {
@@ -263,7 +277,6 @@ async function startWhisperRecording() {
// Start recording // Start recording
mediaRecorder.start(100) // Collect data every 100ms mediaRecorder.start(100) // Collect data every 100ms
isRecording.value = true isRecording.value = true
interimTranscript.value = 'Listening (Whisper GPU)...'
// Send chunks periodically for progressive transcription // Send chunks periodically for progressive transcription
chunkInterval = window.setInterval(() => { chunkInterval = window.setInterval(() => {
@@ -283,6 +296,16 @@ function sendAudioChunk(isFinal: boolean) {
// Always send ALL accumulated audio (webm needs header from first chunk) // Always send ALL accumulated audio (webm needs header from first chunk)
const audioBlob = new Blob(audioChunks, { type: 'audio/webm' }) const audioBlob = new Blob(audioChunks, { type: 'audio/webm' })
const chunkCount = audioChunks.length
// Skip if audio is too small (< 5KB) - WebM header alone is ~1-2KB
if (audioBlob.size < 5000) {
console.log(`[Voice] Skipping small chunk (${audioBlob.size} bytes)`)
if (isFinal) {
audioChunks = []
}
return
}
// Clear chunks only if final // Clear chunks only if final
if (isFinal) { if (isFinal) {
@@ -295,16 +318,13 @@ function sendAudioChunk(isFinal: boolean) {
const base64 = (reader.result as string).split(',')[1] const base64 = (reader.result as string).split(',')[1]
if (whisperSocket?.readyState === WebSocket.OPEN) { if (whisperSocket?.readyState === WebSocket.OPEN) {
if (!isFinal) {
interimTranscript.value = 'Processing...'
}
whisperSocket.send(JSON.stringify({ whisperSocket.send(JSON.stringify({
type: 'transcribe', type: 'transcribe',
audio: base64, audio: base64,
language: 'es', language: 'es',
partial: !isFinal partial: !isFinal
})) }))
console.log(`[Voice] Sent ${isFinal ? 'FINAL' : 'partial'} audio (${audioChunks.length} chunks, ${audioBlob.size} bytes)`) console.log(`[Voice] Sent ${isFinal ? 'FINAL' : 'partial'} audio (${chunkCount} chunks, ${audioBlob.size} bytes)`)
} }
} }
reader.readAsDataURL(audioBlob) reader.readAsDataURL(audioBlob)
@@ -524,21 +544,31 @@ function handleKeyUp(e: KeyboardEvent) {
holdTimeout = null holdTimeout = null
} }
// If was push-to-talk recording, stop and send after 1200ms // If was push-to-talk recording, continue recording for 1.5s buffer then stop
if (isPushToTalk.value && isRecording.value) { if (isPushToTalk.value && isRecording.value) {
console.log('[Voice] Stopping recording, will send in 1200ms') console.log('[Voice] Key released, continuing recording for 1.5s buffer...')
stopRecording()
// Keep recording for 1.5s more (UX buffer for trailing words)
setTimeout(() => {
console.log('[Voice] Buffer complete, stopping recording')
stopRecording()
if (useWhisper.value) {
// For Whisper: wait for server response (handled in onmessage)
console.log('[Voice] Waiting for Whisper transcription...')
pendingWhisperSend = true
} else {
// For Web Speech API: send after short delay for final results
setTimeout(() => { setTimeout(() => {
console.log('[Voice] Sending transcript:', transcript.value.trim())
console.log('[Voice] Socket state:', socket?.readyState)
if (transcript.value.trim()) { if (transcript.value.trim()) {
sendTranscriptAndClose() sendTranscriptAndClose()
} else { } else {
// No transcript, just close
isPushToTalk.value = false isPushToTalk.value = false
close() close()
} }
}, 1200) }, 300)
}
}, 1500)
} }
keyDownTime = 0 keyDownTime = 0

View File

@@ -11,6 +11,7 @@ import io
import wave import wave
import tempfile import tempfile
import os import os
import subprocess
from pathlib import Path from pathlib import Path
try: try:
@@ -21,13 +22,75 @@ except ImportError as e:
print("Run: pip install faster-whisper websockets") print("Run: pip install faster-whisper websockets")
sys.exit(1) sys.exit(1)
def convert_audio_to_wav(input_data: bytes, input_format: str = "webm") -> "bytes | None":
    """
    Convert audio data to WAV format using ffmpeg.

    Whisper requires WAV/PCM format, but browsers typically record in WebM/Opus.
    The data is written to a temporary file, converted by an external ffmpeg
    process to 16 kHz mono 16-bit PCM, and the resulting WAV bytes are read back.

    Args:
        input_data: Raw bytes of the encoded audio.
        input_format: File extension (without the dot) used for the temporary
            input file, so ffmpeg can recognise the container.

    Returns:
        The WAV file contents, or None when the input is empty, ffmpeg is
        missing, ffmpeg fails or times out, or any other error occurs.
        Failures are reported with a "[Whisper]" log line rather than raised.
    """
    # Nothing to convert; avoid spawning ffmpeg just to watch it fail.
    if not input_data:
        print("[Whisper] No audio data to convert")
        return None

    input_path = None
    output_path = None
    try:
        # The temp file is created inside the try block so that a failed write
        # (e.g. disk full) still reaches the cleanup in `finally`.
        with tempfile.NamedTemporaryFile(suffix=f".{input_format}", delete=False) as in_file:
            input_path = in_file.name
            in_file.write(input_data)

        # Append the extension instead of substituting it: the output path is
        # then always different from the input path (even for input_format
        # "wav") and no other part of the temp path can be rewritten by accident.
        output_path = f"{input_path}.wav"

        # Use ffmpeg to convert to WAV (16kHz mono, which Whisper prefers)
        result = subprocess.run(
            [
                "ffmpeg", "-y",        # Overwrite output
                "-i", input_path,      # Input file
                "-ar", "16000",        # Sample rate 16kHz
                "-ac", "1",            # Mono
                "-c:a", "pcm_s16le",   # PCM 16-bit little-endian
                output_path,
            ],
            stdin=subprocess.DEVNULL,  # ffmpeg must not read the server's stdin
            capture_output=True,
            text=True,
            errors="replace",          # undecodable stderr bytes must not abort a good conversion
            timeout=30,
        )

        if result.returncode != 0:
            print(f"[Whisper] ffmpeg error: {result.stderr}")
            return None

        # Read the converted WAV file
        with open(output_path, "rb") as f:
            return f.read()

    except subprocess.TimeoutExpired:
        print("[Whisper] ffmpeg conversion timed out")
        return None
    except FileNotFoundError:
        print("[Whisper] ffmpeg not found - please install ffmpeg")
        return None
    except Exception as e:
        print(f"[Whisper] Conversion error: {e}")
        return None
    finally:
        # Best-effort cleanup of both temp files; either may not exist.
        for path in (input_path, output_path):
            if path is None:
                continue
            try:
                os.unlink(path)
            except OSError:
                pass
# Configuration # Configuration
HOST = "localhost" HOST = "localhost"
PORT = 4104 PORT = 4104
MODEL_SIZE = "medium" # tiny, base, small, medium, large-v2, large-v3 MODEL_SIZE = "large-v3" # tiny, base, small, medium, large-v2, large-v3
DEVICE = "cuda" # cuda or cpu DEVICE = "cuda" # cuda or cpu
COMPUTE_TYPE = "float16" # float16 for GPU, int8 for CPU COMPUTE_TYPE = "float16" # float16 for GPU, int8 for CPU
# Spanish context prompt to improve accuracy (Honduras Spanish + tech context).
# Passed as `initial_prompt` to `model.transcribe` in `transcribe_audio`.
# The text is written in Spanish and lists software-development vocabulary,
# Honduran voseo forms and common diminutives. It is runtime data, so any
# edit to the wording changes what the model is given.
INITIAL_PROMPT = """Transcripción en español hondureño de un desarrollador de software.
Contexto: programación, TypeScript, Vue, Python, comandos de terminal, código.
Vocabulario técnico: servidor, frontend, backend, chunks, WebSocket, transcripción,
componente, función, variable, API, modelo, Whisper, Claude, MCP, configuración.
Expresiones hondureñas: vos, tenés, podés, mirá, pues, verdad, ajá, entonces.
Diminutivos comunes: ahorita, ratito, prontito, despuesito, chiquito, tantito, poquito."""
# Global model instance # Global model instance
model = None model = None
model_loading = False model_loading = False
@@ -70,29 +133,51 @@ async def load_model():
model_loading = False model_loading = False
return model return model
def transcribe_audio(audio_data: bytes, language: str = "es") -> dict: def transcribe_audio(audio_data: bytes, language: str = "es", is_webm: bool = True) -> dict:
"""Transcribe audio data using Whisper""" """Transcribe audio data using Whisper"""
global model global model
if model is None: if model is None:
return {"error": "Model not loaded"} return {"error": "Model not loaded"}
print(f"[Whisper] Received {len(audio_data)} bytes of audio data")
# Convert WebM to WAV if needed
if is_webm:
print("[Whisper] Converting WebM to WAV...")
wav_data = convert_audio_to_wav(audio_data, "webm")
if wav_data is None:
return {"error": "Failed to convert audio format. Ensure ffmpeg is installed."}
print(f"[Whisper] Converted to {len(wav_data)} bytes WAV")
else:
wav_data = audio_data
# Save audio to temp file (faster-whisper needs a file path) # Save audio to temp file (faster-whisper needs a file path)
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
f.write(audio_data) f.write(wav_data)
temp_path = f.name temp_path = f.name
try: try:
# Transcribe # Transcribe with optimized parameters
print(f"[Whisper] Transcribing {temp_path}...")
segments, info = model.transcribe( segments, info = model.transcribe(
temp_path, temp_path,
language=language, language=language,
beam_size=5, beam_size=5,
best_of=5, # Number of candidates when sampling
temperature=0.0, # Use greedy decoding (most accurate)
vad_filter=True, # Voice activity detection vad_filter=True, # Voice activity detection
vad_parameters=dict( vad_parameters=dict(
min_silence_duration_ms=500, min_silence_duration_ms=300, # Shorter silence detection
speech_pad_ms=400 speech_pad_ms=200, # Padding around speech
) threshold=0.5 # VAD sensitivity (lower = more sensitive)
),
initial_prompt=INITIAL_PROMPT, # Context for better Spanish transcription
condition_on_previous_text=True, # Use context from previous segments
no_speech_threshold=0.6,
log_prob_threshold=-1.0,
compression_ratio_threshold=2.4,
word_timestamps=False # Faster without word-level timestamps
) )
# Collect all segments # Collect all segments
@@ -106,6 +191,8 @@ def transcribe_audio(audio_data: bytes, language: str = "es") -> dict:
"text": segment.text "text": segment.text
}) })
print(f"[Whisper] Transcription result: '{text.strip()}'")
return { return {
"success": True, "success": True,
"text": text.strip(), "text": text.strip(),
@@ -119,6 +206,7 @@ def transcribe_audio(audio_data: bytes, language: str = "es") -> dict:
} }
except Exception as e: except Exception as e:
print(f"[Whisper] Transcription error: {e}")
return {"error": str(e)} return {"error": str(e)}
finally: finally:
@@ -145,16 +233,14 @@ async def handle_client(websocket):
try: try:
async for message in websocket: async for message in websocket:
if isinstance(message, bytes): if isinstance(message, bytes):
# Binary audio data # Binary audio data (likely WebM format from browser)
print(f"[Whisper] Received {len(message)} bytes of audio") print(f"[Whisper] Received {len(message)} bytes of binary audio")
# Transcribe in thread pool to not block # Transcribe in thread pool to not block
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()
result = await loop.run_in_executor( result = await loop.run_in_executor(
None, None,
transcribe_audio, lambda: transcribe_audio(message, "es", is_webm=True)
message,
"es" # Default to Spanish
) )
await websocket.send(json.dumps({ await websocket.send(json.dumps({
@@ -168,19 +254,24 @@ async def handle_client(websocket):
cmd = json.loads(message) cmd = json.loads(message)
if cmd.get("type") == "transcribe": if cmd.get("type") == "transcribe":
# Audio data sent as base64 # Audio data sent as base64 (WebM format from browser)
import base64 import base64
audio_data = base64.b64decode(cmd.get("audio", "")) audio_data = base64.b64decode(cmd.get("audio", ""))
language = cmd.get("language", "es") language = cmd.get("language", "es")
is_partial = cmd.get("partial", False)
print(f"[Whisper] Transcribe request: {len(audio_data)} bytes, lang={language}, partial={is_partial}")
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()
result = await loop.run_in_executor( result = await loop.run_in_executor(
None, None,
transcribe_audio, lambda: transcribe_audio(audio_data, language, is_webm=True)
audio_data,
language
) )
# Add partial flag to result
if is_partial:
result["partial"] = True
await websocket.send(json.dumps({ await websocket.send(json.dumps({
"type": "transcription", "type": "transcription",
**result **result