fix: Improve Whisper transcription with WebM to WAV conversion

- Add ffmpeg conversion from WebM/Opus to WAV (16kHz mono PCM)
- Optimize transcription parameters (VAD, temperature, beam_size)
- Add Honduran Spanish context prompt with local expressions
- Fix chunk accumulation display in voice panel
- Add 1.5s recording buffer after releasing Ctrl+Space
- Skip audio chunks smaller than 5 KB (5000 bytes), which cause ffmpeg errors
- Use large-v3 model for better accuracy
This commit is contained in:
2026-02-14 00:16:01 -06:00
parent 638e6ac8e0
commit ac17a9f292
2 changed files with 163 additions and 42 deletions

View File

@@ -42,6 +42,7 @@ const connected = ref(false)
let keyDownTime = 0
let holdTimeout: number | null = null
const isPushToTalk = ref(false)
let pendingWhisperSend = false // Flag to send transcript when Whisper responds
// ============ WHISPER MODE ============
const useWhisper = ref(false)
@@ -194,19 +195,27 @@ function connectWhisperSocket() {
const fullText = msg.text.trim()
if (msg.partial) {
// For partial results, show as interim (will be replaced)
// Only show text that's new since last transcription
const newText = fullText.substring(lastTranscriptLength).trim()
if (newText) {
interimTranscript.value = newText
console.log(`[Voice] 🔄 WHISPER partial:`, newText)
}
} else {
// Final result - replace everything
// For partial results, show full accumulated transcription
transcript.value = fullText + ' '
interimTranscript.value = ''
console.log(`[Voice] 🔄 WHISPER partial:`, fullText)
} else {
// Final result
transcript.value = fullText + ' '
interimTranscript.value = ''
lastTranscriptLength = 0
console.log(`[Voice] 🎯 WHISPER-GPU (${msg.model}/${msg.device}):`, fullText)
// Auto-send if push-to-talk was waiting for this
if (pendingWhisperSend) {
pendingWhisperSend = false
console.log('[Voice] Whisper response received, sending transcript')
if (transcript.value.trim()) {
sendTranscriptAndClose()
} else {
isPushToTalk.value = false
close()
}
}
}
// Update last transcript length for next partial
@@ -214,6 +223,11 @@ function connectWhisperSocket() {
} else if (msg.error) {
error.value = msg.error
console.error('[Voice] Whisper error:', msg.error)
// Clear pending send on error
if (pendingWhisperSend) {
pendingWhisperSend = false
isPushToTalk.value = false
}
}
}
} catch (e) {
@@ -263,7 +277,6 @@ async function startWhisperRecording() {
// Start recording
mediaRecorder.start(100) // Collect data every 100ms
isRecording.value = true
interimTranscript.value = 'Listening (Whisper GPU)...'
// Send chunks periodically for progressive transcription
chunkInterval = window.setInterval(() => {
@@ -283,6 +296,16 @@ function sendAudioChunk(isFinal: boolean) {
// Always send ALL accumulated audio (webm needs header from first chunk)
const audioBlob = new Blob(audioChunks, { type: 'audio/webm' })
const chunkCount = audioChunks.length
// Skip if audio is too small (< 5KB) - WebM header alone is ~1-2KB
if (audioBlob.size < 5000) {
console.log(`[Voice] Skipping small chunk (${audioBlob.size} bytes)`)
if (isFinal) {
audioChunks = []
}
return
}
// Clear chunks only if final
if (isFinal) {
@@ -295,16 +318,13 @@ function sendAudioChunk(isFinal: boolean) {
const base64 = (reader.result as string).split(',')[1]
if (whisperSocket?.readyState === WebSocket.OPEN) {
if (!isFinal) {
interimTranscript.value = 'Processing...'
}
whisperSocket.send(JSON.stringify({
type: 'transcribe',
audio: base64,
language: 'es',
partial: !isFinal
}))
console.log(`[Voice] Sent ${isFinal ? 'FINAL' : 'partial'} audio (${audioChunks.length} chunks, ${audioBlob.size} bytes)`)
console.log(`[Voice] Sent ${isFinal ? 'FINAL' : 'partial'} audio (${chunkCount} chunks, ${audioBlob.size} bytes)`)
}
}
reader.readAsDataURL(audioBlob)
@@ -524,21 +544,31 @@ function handleKeyUp(e: KeyboardEvent) {
holdTimeout = null
}
// If was push-to-talk recording, stop and send after 1200ms
// If was push-to-talk recording, continue recording for 1.5s buffer then stop
if (isPushToTalk.value && isRecording.value) {
console.log('[Voice] Stopping recording, will send in 1200ms')
stopRecording()
console.log('[Voice] Key released, continuing recording for 1.5s buffer...')
// Keep recording for 1.5s more (UX buffer for trailing words)
setTimeout(() => {
console.log('[Voice] Sending transcript:', transcript.value.trim())
console.log('[Voice] Socket state:', socket?.readyState)
if (transcript.value.trim()) {
sendTranscriptAndClose()
console.log('[Voice] Buffer complete, stopping recording')
stopRecording()
if (useWhisper.value) {
// For Whisper: wait for server response (handled in onmessage)
console.log('[Voice] Waiting for Whisper transcription...')
pendingWhisperSend = true
} else {
// No transcript, just close
isPushToTalk.value = false
close()
// For Web Speech API: send after short delay for final results
setTimeout(() => {
if (transcript.value.trim()) {
sendTranscriptAndClose()
} else {
isPushToTalk.value = false
close()
}
}, 300)
}
}, 1200)
}, 1500)
}
keyDownTime = 0