fix: Improve Whisper transcription with WebM to WAV conversion

- Add ffmpeg conversion from WebM/Opus to WAV (16kHz mono PCM)
- Optimize transcription parameters (VAD, temperature, beam_size)
- Add Honduras Spanish context prompt with local expressions
- Fix chunk accumulation display in voice panel
- Add 1.5s recording buffer after releasing Ctrl+Space
- Skip small audio chunks (<5KB) that cause ffmpeg errors
- Use large-v3 model for better accuracy
2026-02-14 00:16:01 -06:00
parent 638e6ac8e0
commit ac17a9f292
2 changed files with 163 additions and 42 deletions
--- a/frontend/src/components/FloatingVoice.vue
+++ b/frontend/src/components/FloatingVoice.vue
@@ -42,6 +42,7 @@ const connected = ref(false)
 let keyDownTime = 0
 let holdTimeout: number | null = null
 const isPushToTalk = ref(false)
+let pendingWhisperSend = false // Flag to send transcript when Whisper responds

 // ============ WHISPER MODE ============
 const useWhisper = ref(false)
@@ -194,19 +195,27 @@ function connectWhisperSocket() {
          const fullText = msg.text.trim()

          if (msg.partial) {
-            // For partial results, show as interim (will be replaced)
-            // Only show text that's new since last transcription
-            const newText = fullText.substring(lastTranscriptLength).trim()
-            if (newText) {
-              interimTranscript.value = newText
-              console.log(`[Voice] 🔄 WHISPER partial:`, newText)
-            }
-          } else {
-            // Final result - replace everything
+            // For partial results, show full accumulated transcription
+            transcript.value = fullText + ' '
+            interimTranscript.value = ''
+            console.log(`[Voice] 🔄 WHISPER partial:`, fullText)
+          } else {
+            // Final result
            transcript.value = fullText + ' '
            interimTranscript.value = ''
-            lastTranscriptLength = 0
            console.log(`[Voice] 🎯 WHISPER-GPU (${msg.model}/${msg.device}):`, fullText)
+
+            // Auto-send if push-to-talk was waiting for this
+            if (pendingWhisperSend) {
+              pendingWhisperSend = false
+              console.log('[Voice] Whisper response received, sending transcript')
+              if (transcript.value.trim()) {
+                sendTranscriptAndClose()
+              } else {
+                isPushToTalk.value = false
+                close()
+              }
+            }
          }

          // Update last transcript length for next partial
@@ -214,6 +223,11 @@ function connectWhisperSocket() {
        } else if (msg.error) {
          error.value = msg.error
          console.error('[Voice] Whisper error:', msg.error)
+          // Clear pending send on error
+          if (pendingWhisperSend) {
+            pendingWhisperSend = false
+            isPushToTalk.value = false
+          }
        }
      }
    } catch (e) {
@@ -263,7 +277,6 @@ async function startWhisperRecording() {
    // Start recording
    mediaRecorder.start(100) // Collect data every 100ms
    isRecording.value = true
-    interimTranscript.value = 'Listening (Whisper GPU)...'

    // Send chunks periodically for progressive transcription
    chunkInterval = window.setInterval(() => {
@@ -283,6 +296,16 @@ function sendAudioChunk(isFinal: boolean) {

  // Always send ALL accumulated audio (webm needs header from first chunk)
  const audioBlob = new Blob(audioChunks, { type: 'audio/webm' })
+  const chunkCount = audioChunks.length
+
+  // Skip if audio is too small (< 5KB) - WebM header alone is ~1-2KB
+  if (audioBlob.size < 5000) {
+    console.log(`[Voice] Skipping small chunk (${audioBlob.size} bytes)`)
+    if (isFinal) {
+      audioChunks = []
+    }
+    return
+  }

  // Clear chunks only if final
  if (isFinal) {
@@ -295,16 +318,13 @@ function sendAudioChunk(isFinal: boolean) {
    const base64 = (reader.result as string).split(',')[1]

    if (whisperSocket?.readyState === WebSocket.OPEN) {
-      if (!isFinal) {
-        interimTranscript.value = 'Processing...'
-      }
      whisperSocket.send(JSON.stringify({
        type: 'transcribe',
        audio: base64,
        language: 'es',
        partial: !isFinal
      }))
-      console.log(`[Voice] Sent ${isFinal ? 'FINAL' : 'partial'} audio (${audioChunks.length} chunks, ${audioBlob.size} bytes)`)
+      console.log(`[Voice] Sent ${isFinal ? 'FINAL' : 'partial'} audio (${chunkCount} chunks, ${audioBlob.size} bytes)`)
    }
  }
  reader.readAsDataURL(audioBlob)
@@ -524,21 +544,31 @@ function handleKeyUp(e: KeyboardEvent) {
      holdTimeout = null
    }

-    // If was push-to-talk recording, stop and send after 1200ms
+    // If was push-to-talk recording, continue recording for 1.5s buffer then stop
    if (isPushToTalk.value && isRecording.value) {
-      console.log('[Voice] Stopping recording, will send in 1200ms')
-      stopRecording()
+      console.log('[Voice] Key released, continuing recording for 1.5s buffer...')
+
+      // Keep recording for 1.5s more (UX buffer for trailing words)
      setTimeout(() => {
-        console.log('[Voice] Sending transcript:', transcript.value.trim())
-        console.log('[Voice] Socket state:', socket?.readyState)
-        if (transcript.value.trim()) {
-          sendTranscriptAndClose()
+        console.log('[Voice] Buffer complete, stopping recording')
+        stopRecording()
+
+        if (useWhisper.value) {
+          // For Whisper: wait for server response (handled in onmessage)
+          console.log('[Voice] Waiting for Whisper transcription...')
+          pendingWhisperSend = true
        } else {
-          // No transcript, just close
-          isPushToTalk.value = false
-          close()
+          // For Web Speech API: send after short delay for final results
+          setTimeout(() => {
+            if (transcript.value.trim()) {
+              sendTranscriptAndClose()
+            } else {
+              isPushToTalk.value = false
+              close()
+            }
+          }, 300)
        }
-      }, 1200)
+      }, 1500)
    }

    keyDownTime = 0