fix: robust whisper recording with stop/restart segment strategy

Replace fragile chunked WebM recording with a stop/restart approach:
- Each segment is a complete, independently-decodable WebM file
- Eliminates audio corruption from concatenating partial WebM clusters
- Streaming partial transcription via periodic stop/restart every 3s
- Transcript text accumulated per segment on the client
- Proper lifecycle: onstop sends segment and restarts recorder
2026-02-20 00:06:18 -06:00
parent 016e92ffe5
commit b7f03a777b
1 changed files with 535 additions and 0 deletions
--- a/frontend/src/composables/useVoiceInput.ts
+++ b/frontend/src/composables/useVoiceInput.ts
@@ -0,0 +1,535 @@
+/**
+ * useVoiceInput - Dual-mode voice input composable (Web Speech API + Whisper GPU)
+ *
+ * Extracts the core voice functionality from FloatingVoice.vue (System A)
+ * but talks to Whisper through the whisperSocket.ts singleton (System B pattern).
+ *
+ * Does NOT include: PTT, terminal sending, panel UI, or persistent audio saving.
+ * Playback is limited to the last recorded Whisper segment (see playLastAudio).
+ */
+
+import { ref, type Ref } from 'vue'
+import {
+  initWhisperSocket,
+  sendAudio,
+  onTranscription,
+  getWhisperStatus,
+  isConnected,
+  type WhisperStatus
+} from '../services/whisperSocket'
+
+// ── Web Speech API types ──
+
+/** Web Speech API result event, limited to the two fields the `onresult` handler reads. */
+interface SpeechRecognitionEvent extends Event {
+  // Index of the first entry in `results` that changed in this event.
+  resultIndex: number
+  results: SpeechRecognitionResultList
+}
+
+/** Web Speech API error event; `error` is the error code string (e.g. 'not-allowed'). */
+interface SpeechRecognitionErrorEvent extends Event {
+  error: string
+  message?: string
+}
+
+/**
+ * Locally declared surface of the browser speech recognizer: exactly the
+ * properties, handlers and methods this composable sets or calls.
+ */
+interface SpeechRecognition extends EventTarget {
+  continuous: boolean
+  interimResults: boolean
+  lang: string
+  onresult: ((event: SpeechRecognitionEvent) => void) | null
+  onerror: ((event: SpeechRecognitionErrorEvent) => void) | null
+  onend: (() => void) | null
+  start(): void
+  stop(): void
+  abort(): void
+}
+
+// ── Types ──
+
+/** Recognition backend: browser Web Speech API ('web') or the Whisper socket ('whisper'). */
+export type VoiceMode = 'web' | 'whisper'
+
+/** Reactive state and controls returned by `useVoiceInput`. */
+export interface VoiceInput {
+  /** True while a capture session is considered active. */
+  isRecording: Ref<boolean>
+  /** Accumulated recognized text; each new piece is appended followed by a space. */
+  transcript: Ref<string>
+  /** Not-yet-final text reported by Web Speech; emptied on stop and on each Whisper result. */
+  interimTranscript: Ref<string>
+  /** Last error message; reset to '' by `startRecording`. */
+  error: Ref<string>
+  voiceMode: Ref<VoiceMode>
+  /** Status ref obtained from `getWhisperStatus()` of the whisperSocket service. */
+  whisperStatus: Ref<WhisperStatus>
+  /** Devices of kind 'audioinput' from the last `loadAudioDevices` call. */
+  audioDevices: Ref<MediaDeviceInfo[]>
+  /** Device id requested for Whisper capture; '' means no explicit device constraint. */
+  selectedDeviceId: Ref<string>
+  /** Whether the user agent string matches /Android/i (evaluated by `init`). */
+  isAndroid: Ref<boolean>
+  /** Object URL of the final Whisper segment of the last session, or ''. */
+  lastAudioUrl: Ref<string>
+  isPlayingAudio: Ref<boolean>
+
+  startRecording: () => void
+  stopRecording: () => void
+  toggleRecording: () => void
+  /** Stops any active recording, then switches the backend. */
+  setMode: (mode: VoiceMode) => void
+  /** Unless `skipPermission` is true, briefly opens the microphone before enumerating devices. */
+  loadAudioDevices: (skipPermission?: boolean) => Promise<void>
+  /** Selects a device; an active recording is stopped and started again shortly after. */
+  selectMicrophone: (deviceId: string) => void
+  clearTranscript: () => void
+  playLastAudio: () => void
+  /** Detects platform and audio format, subscribes to transcriptions and selects the initial mode. */
+  init: () => Promise<void>
+  /** Stops recording and releases timers, the subscription and the microphone stream. */
+  cleanup: () => void
+}
+
+/**
+ * Creates one voice-input instance with its own reactive state.
+ *
+ * Nothing is started here: call `init()` to subscribe to Whisper
+ * transcriptions and `cleanup()` to release resources.
+ *
+ * @param options.language - Language tag assigned to the Web Speech
+ *   recognizer (`rec.lang`); defaults to 'es-419'. Whisper requests do not use
+ *   it — `sendWhisperBlob` passes a fixed 'es'.
+ * @returns The reactive refs and control functions described by `VoiceInput`.
+ */
+export function useVoiceInput(options?: {
+  language?: string
+}): VoiceInput {
+  const language = options?.language ?? 'es-419'
+
+  // ── Reactive state ──
+  const isRecording = ref(false)
+  const transcript = ref('')
+  const interimTranscript = ref('')
+  const error = ref('')
+  // Starts as 'web'; init() switches it to 'whisper'.
+  const voiceMode = ref<VoiceMode>('web')
+  const whisperStatus = getWhisperStatus()
+  const audioDevices = ref<MediaDeviceInfo[]>([])
+  const selectedDeviceId = ref<string>('')
+  const isAndroid = ref(false)
+  const lastAudioUrl = ref<string>('')
+  const isPlayingAudio = ref(false)
+
+  // ── Internal state ──
+  // Web Speech recognizer, created lazily by startRecording().
+  let recognition: SpeechRecognition | null = null
+  // Last final Web Speech result; used to strip repeated prefixes on Android.
+  let lastProcessedResult = ''
+  let audioElement: HTMLAudioElement | null = null
+  // Recorder of the Whisper segment currently being captured.
+  let mediaRecorder: MediaRecorder | null = null
+  // Data chunks of the segment currently being captured.
+  let audioChunks: Blob[] = []
+  // Timer that ends the current segment every CHUNK_INTERVAL_MS.
+  let chunkInterval: number | null = null
+  let mediaStream: MediaStream | null = null
+  // Replaced by the result of detectAudioFormat() in init().
+  let supportedMimeType = 'audio/webm;codecs=opus'
+  // Unsubscribe function returned by onTranscription().
+  let unsubTranscription: (() => void) | null = null
+  const CHUNK_INTERVAL_MS = 3000
+
+  // ── Mobile detection ──
+
+  /** Records in `isAndroid` whether the user agent string mentions Android. */
+  function checkMobile() {
+    const androidPattern = /Android/i
+    isAndroid.value = androidPattern.test(navigator.userAgent)
+  }
+
+  // ── Audio format detection ──
+
+  /**
+   * Picks the first recording MIME type the browser's MediaRecorder supports.
+   *
+   * Candidates are tried in order of preference (WebM/Opus first).
+   *
+   * @returns The chosen MIME type, or '' when none of the candidates is
+   *   supported or the browser has no MediaRecorder at all. Callers treat ''
+   *   as "let the browser choose".
+   */
+  function detectAudioFormat(): string {
+    // Browsers without MediaRecorder would otherwise throw a ReferenceError
+    // here and abort init() before the Web Speech path is ever set up.
+    if (typeof MediaRecorder === 'undefined' || typeof MediaRecorder.isTypeSupported !== 'function') {
+      return ''
+    }
+
+    const candidates = [
+      'audio/webm;codecs=opus',
+      'audio/webm',
+      'audio/mp4',
+      'audio/mp4;codecs=mp4a.40.2',
+      'audio/aac',
+      'audio/ogg;codecs=opus',
+      'audio/wav'
+    ]
+    return candidates.find(format => MediaRecorder.isTypeSupported(format)) ?? ''
+  }
+
+  // ── Device selection ──
+
+  /**
+   * Refreshes `audioDevices` with the available audio inputs and, when no
+   * device is selected yet, selects the first one.
+   *
+   * @param skipPermission - When false, a throwaway microphone stream is
+   *   opened and closed first. Failures are logged, never thrown.
+   */
+  async function loadAudioDevices(skipPermission = false) {
+    try {
+      if (!skipPermission) {
+        const probeStream = await navigator.mediaDevices.getUserMedia({ audio: true })
+        for (const track of probeStream.getTracks()) {
+          track.stop()
+        }
+      }
+
+      const allDevices = await navigator.mediaDevices.enumerateDevices()
+      audioDevices.value = allDevices.filter(device => device.kind === 'audioinput')
+
+      const nothingSelected = !selectedDeviceId.value
+      if (nothingSelected && audioDevices.value.length > 0) {
+        const firstInput = audioDevices.value[0]
+        selectedDeviceId.value = firstInput?.deviceId || ''
+      }
+    } catch (e) {
+      console.error('[VoiceInput] Failed to enumerate devices:', e)
+    }
+  }
+
+  /**
+   * Stores the chosen input device. If a recording is running it is stopped
+   * and a new one is started 100 ms later so the new device takes effect.
+   */
+  function selectMicrophone(deviceId: string) {
+    selectedDeviceId.value = deviceId
+
+    if (!isRecording.value) {
+      return
+    }
+    stopRecording()
+    setTimeout(() => {
+      startRecording()
+    }, 100)
+  }
+
+  // ── Web Speech API ──
+
+  /**
+   * Builds and wires a Web Speech recognizer.
+   *
+   * @returns The configured recognizer, or null (with `error` set) when the
+   *   browser exposes neither `SpeechRecognition` nor `webkitSpeechRecognition`.
+   */
+  function initRecognition(): SpeechRecognition | null {
+    const SR = (window as any).SpeechRecognition || (window as any).webkitSpeechRecognition
+    if (!SR) {
+      error.value = 'Speech recognition not supported in this browser'
+      return null
+    }
+
+    const rec = new SR() as SpeechRecognition
+    // Continuous mode is switched off on Android.
+    rec.continuous = !isAndroid.value
+    rec.interimResults = true
+    rec.lang = language
+
+    rec.onresult = (event: SpeechRecognitionEvent) => {
+      let interim = ''
+      let final = ''
+
+      // Only entries from resultIndex onwards changed in this event.
+      for (let i = event.resultIndex; i < event.results.length; i++) {
+        const result = event.results[i]
+        if (!result || !result[0]) continue
+        if (result.isFinal) {
+          final += result[0].transcript + ' '
+        } else {
+          interim += result[0].transcript
+        }
+      }
+
+      if (final) {
+        const trimmedFinal = final.trim()
+        // On Android a final result may repeat the previous final as its
+        // prefix; append only what follows that prefix.
+        if (isAndroid.value && lastProcessedResult && trimmedFinal.startsWith(lastProcessedResult.trim())) {
+          const newPart = trimmedFinal.slice(lastProcessedResult.trim().length).trim()
+          if (newPart) {
+            transcript.value += newPart + ' '
+            lastProcessedResult = trimmedFinal
+          }
+        } else {
+          transcript.value += final
+          lastProcessedResult = trimmedFinal
+        }
+      }
+      interimTranscript.value = interim
+    }
+
+    rec.onerror = (event: SpeechRecognitionErrorEvent) => {
+      console.error('[VoiceInput] Recognition error:', event.error)
+      if (event.error === 'not-allowed') {
+        error.value = 'Microphone access denied'
+      } else {
+        error.value = `Error: ${event.error}`
+      }
+      isRecording.value = false
+    }
+
+    rec.onend = () => {
+      // The recognizer ended on its own while the user still expects web
+      // recording: give up on Android, restart elsewhere.
+      if (isRecording.value && voiceMode.value === 'web') {
+        if (isAndroid.value) {
+          isRecording.value = false
+        } else {
+          try {
+            rec.start()
+          } catch (e) {
+            // start() can throw (e.g. InvalidStateError). Without this the UI
+            // would keep showing "recording" while nothing is listening.
+            console.error('[VoiceInput] Failed to restart Web Speech:', e)
+            error.value = 'Speech recognition stopped unexpectedly'
+            isRecording.value = false
+          }
+        }
+      }
+    }
+
+    return rec
+  }
+
+  // ── Whisper recording ──
+  // Strategy: stop/restart MediaRecorder every CHUNK_INTERVAL_MS so each
+  // segment sent to the server is a complete, independently-decodable WebM file.
+  // Transcript text is accumulated (appended) on the client per segment.
+
+  // Number of segments handed to sendWhisperBlob whose reply has not been
+  // seen yet. Assumes the server answers every segment with one message —
+  // TODO confirm against the whisper backend.
+  let pendingSegments = 0
+
+  /**
+   * Applies one transcription message from the whisper socket.
+   *
+   * Successful text is appended to `transcript`; an error message is stored
+   * in `error`. Partial messages received while not recording are ignored
+   * unless this instance is still waiting for a reply to a segment it sent.
+   */
+  function handleTranscription(msg: {
+    success?: boolean
+    text?: string
+    error?: string
+    partial?: boolean
+  }) {
+    const wasAwaited = pendingSegments > 0
+    if (wasAwaited) {
+      pendingSegments -= 1
+    }
+
+    // Every segment is an independent recording, so the reply to an
+    // intermediate segment that lands after stop still carries text that no
+    // later message repeats. Only unsolicited partials are dropped.
+    if (!isRecording.value && msg.partial && !wasAwaited) {
+      return
+    }
+
+    if (msg.success && msg.text) {
+      const newText = msg.text.trim()
+      if (newText) {
+        transcript.value += newText + ' '
+      }
+      interimTranscript.value = ''
+    } else if (msg.error) {
+      error.value = msg.error
+    }
+  }
+
+  /**
+   * Base64-encodes a recorded segment and sends it through the whisper socket.
+   *
+   * @param blob - The complete audio segment.
+   * @param partial - True for intermediate segments, false for the last one.
+   */
+  function sendWhisperBlob(blob: Blob, partial: boolean) {
+    // Counted before the asynchronous read so a stop issued right after this
+    // call still waits for the reply.
+    pendingSegments += 1
+    const forgetSegment = () => {
+      pendingSegments = Math.max(0, pendingSegments - 1)
+    }
+
+    const reader = new FileReader()
+    reader.onload = () => {
+      const dataUrl = typeof reader.result === 'string' ? reader.result : ''
+      const comma = dataUrl.indexOf(',')
+      const base64 = comma >= 0 ? dataUrl.slice(comma + 1) : ''
+      if (!base64) {
+        forgetSegment()
+        return
+      }
+      // NOTE(review): the language is fixed to 'es' and ignores the
+      // `language` option — confirm whether that is intended.
+      sendAudio(base64, 'es', partial)
+    }
+    // A failed read leaves reader.result null; report it rather than throw
+    // inside the event handler.
+    reader.onerror = () => {
+      forgetSegment()
+      console.error('[VoiceInput] Failed to read audio segment:', reader.error)
+      error.value = 'Failed to read recorded audio'
+    }
+    reader.readAsDataURL(blob)
+  }
+
+  /**
+   * Creates (but does not start) the MediaRecorder for the next segment on
+   * the current `mediaStream` and stores it in `mediaRecorder`.
+   *
+   * When the recorder stops, its chunks are joined into one blob and sent.
+   * If recording is still active a fresh recorder is created and started on
+   * the same stream; otherwise the stream is released. Does nothing when
+   * there is no stream.
+   */
+  function createRecorderSegment() {
+    if (!mediaStream) return
+
+    // Pin this segment's stream, recorder and buffer locally so that a stop
+    // event arriving late (after a new session replaced the shared
+    // variables) cannot read or release the newer session's objects.
+    const stream = mediaStream
+    const recorderOptions: MediaRecorderOptions = {}
+    if (supportedMimeType) {
+      recorderOptions.mimeType = supportedMimeType
+    }
+    const recorder = new MediaRecorder(stream, recorderOptions)
+    const chunks: Blob[] = []
+    mediaRecorder = recorder
+    audioChunks = chunks
+
+    const releaseStream = () => {
+      stream.getTracks().forEach(track => track.stop())
+      if (mediaStream === stream) {
+        mediaStream = null
+      }
+    }
+
+    recorder.ondataavailable = (event) => {
+      if (event.data.size > 0) {
+        chunks.push(event.data)
+      }
+    }
+
+    recorder.onstop = () => {
+      // A recorder that is no longer the current one belongs to a finished
+      // session: treat its data as final and never restart from it.
+      const superseded = mediaRecorder !== recorder
+      const isFinal = superseded || !isRecording.value
+
+      if (chunks.length > 0) {
+        const mimeType = recorder.mimeType || supportedMimeType || 'audio/webm'
+        const segmentBlob = new Blob(chunks, { type: mimeType })
+        if (audioChunks === chunks) {
+          audioChunks = []
+        }
+
+        if (isFinal) {
+          // Save last segment for playback
+          saveAudioForPlayback(segmentBlob)
+        }
+
+        // Skip tiny segments (silence) for intermediate, always send final
+        if (segmentBlob.size >= 2000 || isFinal) {
+          sendWhisperBlob(segmentBlob, !isFinal)
+        }
+      }
+
+      if (!isFinal && mediaStream === stream) {
+        // Still recording: start the next segment on the same stream.
+        try {
+          createRecorderSegment()
+          mediaRecorder?.start()
+        } catch (e: unknown) {
+          // E.g. the microphone went away. Leave a consistent stopped state
+          // instead of an active flag with nothing recording.
+          console.error('[VoiceInput] Failed to start next segment:', e)
+          error.value = `Microphone error: ${e instanceof Error ? e.message : String(e)}`
+          isRecording.value = false
+          if (chunkInterval !== null) {
+            clearInterval(chunkInterval)
+            chunkInterval = null
+          }
+          releaseStream()
+        }
+      } else {
+        // Final: cleanup stream
+        releaseStream()
+      }
+    }
+  }
+
+  /**
+   * Opens the microphone and starts segmented Whisper recording.
+   *
+   * Sets `error` and returns when the whisper socket is not connected or the
+   * microphone cannot be opened. Does nothing when a recording is already
+   * active.
+   */
+  async function startWhisperRecording() {
+    if (!isConnected()) {
+      error.value = 'Whisper server not connected'
+      return
+    }
+    // A second start would replace mediaStream and chunkInterval and leave
+    // the previous microphone stream and timer running with no owner.
+    if (isRecording.value) {
+      return
+    }
+
+    let stream: MediaStream | null = null
+    try {
+      const audioConstraints: MediaTrackConstraints = {
+        echoCancellation: true,
+        noiseSuppression: true,
+        autoGainControl: true,
+        ...(selectedDeviceId.value ? { deviceId: { exact: selectedDeviceId.value } } : {})
+      }
+
+      stream = await navigator.mediaDevices.getUserMedia({ audio: audioConstraints })
+      mediaStream = stream
+
+      createRecorderSegment()
+      if (!mediaRecorder) {
+        throw new Error('MediaRecorder could not be created')
+      }
+      mediaRecorder.start()
+      isRecording.value = true
+
+      // Never leave an earlier timer behind.
+      if (chunkInterval !== null) {
+        clearInterval(chunkInterval)
+      }
+      // Stop/restart every CHUNK_INTERVAL_MS for streaming partial transcription
+      // Each segment is a complete WebM file (own EBML header)
+      chunkInterval = window.setInterval(() => {
+        if (isRecording.value && mediaRecorder?.state === 'recording' && isConnected()) {
+          mediaRecorder.stop() // onstop sends segment and restarts recorder
+        }
+      }, CHUNK_INTERVAL_MS)
+
+      // Refresh device labels after first recording
+      void loadAudioDevices(true)
+    } catch (e: unknown) {
+      // getUserMedia may have succeeded before the failure (unsupported MIME
+      // type, recorder start error): release the microphone in that case.
+      if (stream) {
+        stream.getTracks().forEach(track => track.stop())
+        if (mediaStream === stream) {
+          mediaStream = null
+        }
+      }
+      const message = e instanceof Error ? e.message : String(e)
+      error.value = `Microphone error: ${message}`
+      console.error('[VoiceInput] Microphone error:', e)
+    }
+  }
+
+  /**
+   * Ends Whisper recording: cancels the segment timer, clears `isRecording`
+   * and stops the recorder. If no recorder is running, the microphone stream
+   * is released here instead of in the recorder's stop handler.
+   */
+  function stopWhisperRecording() {
+    if (chunkInterval) {
+      clearInterval(chunkInterval)
+      chunkInterval = null
+    }
+
+    // Must be false before stop() so the stop handler sees a final segment.
+    isRecording.value = false
+
+    const recorder = mediaRecorder
+    if (recorder && recorder.state !== 'inactive') {
+      recorder.stop() // onstop sends final segment + cleanup
+      return
+    }
+
+    const stream = mediaStream
+    if (!stream) {
+      return
+    }
+    for (const track of stream.getTracks()) {
+      track.stop()
+    }
+    mediaStream = null
+  }
+
+  // ── Public recording controls ──
+
+  /**
+   * Starts recording with the current backend. In Whisper mode without a
+   * socket connection the mode is switched to 'web' and Web Speech is used.
+   */
+  function startRecording() {
+    error.value = ''
+
+    const whisperRequested = voiceMode.value === 'whisper'
+    if (whisperRequested && isConnected()) {
+      startWhisperRecording()
+      return
+    }
+
+    // Fallback to Web Speech API (or explicit web mode)
+    if (whisperRequested && !isConnected()) {
+      voiceMode.value = 'web'
+    }
+
+    if (!recognition) {
+      recognition = initRecognition()
+    }
+    if (!recognition) {
+      return
+    }
+
+    try {
+      recognition.start()
+      isRecording.value = true
+    } catch (e) {
+      console.error('[VoiceInput] Failed to start Web Speech:', e)
+    }
+  }
+
+  /**
+   * Stops whichever backend is capturing and clears the interim text.
+   *
+   * The decision is based on what is actually running, not on `voiceMode`:
+   * the mode can change while a Whisper capture is in progress (the GPU poll
+   * in init() falls back to 'web'), and stopping by mode alone would leave
+   * the recorder, the segment timer and the microphone running.
+   */
+  function stopRecording() {
+    const recorderLive = mediaRecorder !== null && mediaRecorder.state !== 'inactive'
+    const whisperCapturing = recorderLive || chunkInterval !== null || mediaStream !== null
+
+    if (whisperCapturing) {
+      stopWhisperRecording()
+    } else if (recognition) {
+      recognition.stop()
+    }
+    isRecording.value = false
+    interimTranscript.value = ''
+  }
+
+  /** Stops the recording when one is active, otherwise starts one. */
+  function toggleRecording() {
+    const action = isRecording.value ? stopRecording : startRecording
+    action()
+  }
+
+  /** Switches the recognition backend, stopping an active recording first. */
+  function setMode(mode: VoiceMode) {
+    const mustStopFirst = isRecording.value
+    if (mustStopFirst) {
+      stopRecording()
+    }
+
+    voiceMode.value = mode
+  }
+
+  // ── Audio playback ──
+
+  /**
+   * Publishes `blob` through `lastAudioUrl` as an object URL, revoking the
+   * URL it replaces.
+   */
+  function saveAudioForPlayback(blob: Blob) {
+    const previousUrl = lastAudioUrl.value
+    if (previousUrl) {
+      URL.revokeObjectURL(previousUrl)
+    }
+
+    lastAudioUrl.value = URL.createObjectURL(blob)
+  }
+
+  /**
+   * Plays the audio behind `lastAudioUrl`, pausing any earlier playback.
+   * `isPlayingAudio` is true until playback ends, errors, or fails to start.
+   * Does nothing when no audio has been saved.
+   */
+  function playLastAudio() {
+    const url = lastAudioUrl.value
+    if (!url) return
+
+    if (audioElement) {
+      audioElement.pause()
+      audioElement = null
+    }
+
+    const markStopped = () => {
+      isPlayingAudio.value = false
+    }
+
+    isPlayingAudio.value = true
+    audioElement = new Audio(url)
+    audioElement.onended = markStopped
+    audioElement.onerror = markStopped
+    audioElement.play().catch(markStopped)
+  }
+
+  /** Empties both transcripts and forgets the last processed Web Speech result. */
+  function clearTranscript() {
+    lastProcessedResult = ''
+    transcript.value = ''
+    interimTranscript.value = ''
+  }
+
+  // ── Lifecycle ──
+
+  // Timer polling whisperStatus after init(); null when not polling.
+  let gpuPollTimer: number | null = null
+
+  /**
+   * Prepares the composable: detects platform and audio format, lists input
+   * devices (without asking for permission), subscribes to Whisper
+   * transcriptions and selects Whisper as the mode.
+   *
+   * If Whisper is not 'ready' yet, its status is polled every 2 s for up to
+   * 60 s; on timeout the mode falls back to 'web'. Safe to call again: the
+   * previous subscription and poll timer are released first.
+   */
+  async function init() {
+    checkMobile()
+    supportedMimeType = detectAudioFormat()
+    await loadAudioDevices(true)
+
+    // Nothing below awaits, so releasing and re-acquiring cannot interleave
+    // with another init() call. Without this, a repeated init() would leave a
+    // second subscription appending every transcription twice, plus an
+    // orphaned poll timer.
+    if (unsubTranscription) {
+      unsubTranscription()
+      unsubTranscription = null
+    }
+    if (gpuPollTimer !== null) {
+      clearInterval(gpuPollTimer)
+      gpuPollTimer = null
+    }
+
+    // Subscribe to whisper transcriptions via singleton
+    unsubTranscription = onTranscription(handleTranscription)
+
+    // GPU is the default mode — always start as whisper
+    voiceMode.value = 'whisper'
+
+    // If already ready, done
+    if (whisperStatus.value === 'ready') return
+
+    // Poll for up to 60 seconds waiting for GPU to connect
+    const GPU_POLL_INTERVAL = 2000
+    const GPU_POLL_MAX = 60000
+    let elapsed = 0
+
+    gpuPollTimer = window.setInterval(() => {
+      elapsed += GPU_POLL_INTERVAL
+      if (whisperStatus.value === 'ready') {
+        // GPU connected — stay in whisper mode
+        if (gpuPollTimer) clearInterval(gpuPollTimer)
+        gpuPollTimer = null
+        console.log('[VoiceInput] GPU connected after', elapsed, 'ms')
+        return
+      }
+      if (elapsed >= GPU_POLL_MAX) {
+        // Timeout — fallback to web speech
+        if (gpuPollTimer) clearInterval(gpuPollTimer)
+        gpuPollTimer = null
+        voiceMode.value = 'web'
+        console.warn('[VoiceInput] GPU timeout after 60s, falling back to Web Speech')
+      }
+    }, GPU_POLL_INTERVAL)
+  }
+
+  /**
+   * Releases everything the composable holds: poll timer, active recording,
+   * recognizer, transcription subscription, segment timer, microphone
+   * stream, audio playback and the playback object URL.
+   */
+  function cleanup() {
+    if (gpuPollTimer) {
+      clearInterval(gpuPollTimer)
+      gpuPollTimer = null
+    }
+    if (isRecording.value) {
+      stopRecording()
+    }
+    if (mediaRecorder) {
+      // The stop event is delivered asynchronously. Detach the handlers so a
+      // torn-down instance does not create a new object URL or send a
+      // segment whose reply it will never receive.
+      mediaRecorder.ondataavailable = null
+      mediaRecorder.onstop = null
+      mediaRecorder = null
+    }
+    if (recognition) {
+      recognition.abort()
+      recognition = null
+    }
+    if (unsubTranscription) {
+      unsubTranscription()
+      unsubTranscription = null
+    }
+    if (chunkInterval) {
+      clearInterval(chunkInterval)
+      chunkInterval = null
+    }
+    if (mediaStream) {
+      mediaStream.getTracks().forEach(track => track.stop())
+      mediaStream = null
+    }
+    if (audioElement) {
+      audioElement.pause()
+      audioElement = null
+    }
+    isPlayingAudio.value = false
+    // Object URLs keep their blob alive until revoked.
+    if (lastAudioUrl.value) {
+      URL.revokeObjectURL(lastAudioUrl.value)
+      lastAudioUrl.value = ''
+    }
+  }
+
+  // Public surface of the composable, matching the VoiceInput interface.
+  return {
+    // Reactive state
+    isRecording,
+    transcript,
+    interimTranscript,
+    error,
+    voiceMode,
+    whisperStatus,
+    audioDevices,
+    selectedDeviceId,
+    isAndroid,
+    lastAudioUrl,
+    isPlayingAudio,
+
+    // Controls and lifecycle
+    startRecording,
+    stopRecording,
+    toggleRecording,
+    setMode,
+    loadAudioDevices,
+    selectMicrophone,
+    clearTranscript,
+    playLastAudio,
+    init,
+    cleanup
+  }
+}