fix: robust whisper recording with stop/restart segment strategy

Replace the fragile chunked WebM recording with a stop/restart approach:
- Each segment is a complete, independently decodable WebM file
- Eliminates audio corruption from concatenating partial WebM clusters
- Streams partial transcription via a periodic stop/restart every 3s
- Transcript text is accumulated per segment on the client
- Proper lifecycle: onstop sends the segment and restarts the recorder
2026-02-20 00:06:18 -06:00
parent 016e92ffe5
commit b7f03a777b
1 changed files with 535 additions and 0 deletions
--- a/frontend/src/composables/useVoiceInput.ts
+++ b/frontend/src/composables/useVoiceInput.ts
@@ -0,0 +1,535 @@
 /**
 * useVoiceInput - Dual-mode voice input composable (Web Speech API + Whisper GPU)
 *
 * Extracts core voice functionality from FloatingVoice.vue (System A)
 * but uses the whisperSocket.ts singleton (System B pattern).
 *
 * Does NOT include: PTT, terminal sending, panel UI, audio playback/saving.
 */
 import { ref, type Ref } from 'vue'
 import {
  initWhisperSocket,
  sendAudio,
  onTranscription,
  getWhisperStatus,
  isConnected,
  type WhisperStatus
 } from '../services/whisperSocket'
 // ── Web Speech API types ──
 /**
  * Local typing for the result event delivered to `SpeechRecognition.onresult`.
  * Declares only the two fields this composable reads.
  */
 interface SpeechRecognitionEvent extends Event {
  // Index of the first entry in `results` that the handler processes for this event
  resultIndex: number
  // Result list; each entry is checked for `isFinal` and its first alternative's transcript is read
  results: SpeechRecognitionResultList
 }
 /** Local typing for the error event delivered to `SpeechRecognition.onerror`. */
 interface SpeechRecognitionErrorEvent extends Event {
  // Error code string; 'not-allowed' is mapped to a microphone-denied message
  error: string
  // Optional detail text; not read by this composable
  message?: string
 }
 /**
  * Local typing for the recognizer constructed from `window.SpeechRecognition`
  * or `window.webkitSpeechRecognition`; covers only the members used below.
  */
 interface SpeechRecognition extends EventTarget {
  // Set to false on Android, true elsewhere
  continuous: boolean
  // Always enabled so non-final text can be shown while speaking
  interimResults: boolean
  // Language tag taken from the composable's `language` option
  lang: string
  onresult: ((event: SpeechRecognitionEvent) => void) | null
  onerror: ((event: SpeechRecognitionErrorEvent) => void) | null
  onend: (() => void) | null
  start(): void
  stop(): void
  abort(): void
 }
 // ── Types ──
 /**
  * Recognition backend: 'web' uses the browser SpeechRecognition object,
  * 'whisper' records microphone audio and sends it through the whisperSocket service.
  */
 export type VoiceMode = 'web' | 'whisper'
 /** Public surface returned by `useVoiceInput`: reactive state plus control functions. */
 export interface VoiceInput {
  // True while a recording session is considered active
  isRecording: Ref<boolean>
  // Accumulated recognized text; each addition is followed by a trailing space
  transcript: Ref<string>
  // Non-final text from the Web Speech recognizer; cleared when recording stops
  interimTranscript: Ref<string>
  // Last error message; reset to '' at the start of startRecording()
  error: Ref<string>
  // Currently selected recognition backend
  voiceMode: Ref<VoiceMode>
  // Status ref obtained from getWhisperStatus() in the whisperSocket service
  whisperStatus: Ref<WhisperStatus>
  // Audio input devices found by the last loadAudioDevices() call
  audioDevices: Ref<MediaDeviceInfo[]>
  // deviceId requested for whisper-mode capture; '' means no exact-device constraint
  selectedDeviceId: Ref<string>
  // Result of the user-agent Android check performed by init()
  isAndroid: Ref<boolean>
  // Object URL of the last saved final whisper segment, or '' when none was saved
  lastAudioUrl: Ref<string>
  // True while playLastAudio() playback is in progress
  isPlayingAudio: Ref<boolean>
  // Starts recording with the current mode (whisper falls back to web when not connected)
  startRecording: () => void
  // Stops the active recording and clears the interim transcript
  stopRecording: () => void
  // Stops when recording, starts otherwise
  toggleRecording: () => void
  // Stops any active recording, then switches the backend
  setMode: (mode: VoiceMode) => void
  // Enumerates audio inputs; unless skipPermission is true it first opens the
  // microphone briefly via getUserMedia
  loadAudioDevices: (skipPermission?: boolean) => Promise<void>
  // Selects a device; an active recording is stopped and restarted after 100 ms
  selectMicrophone: (deviceId: string) => void
  // Empties transcript and interimTranscript
  clearTranscript: () => void
  // Plays the audio behind lastAudioUrl, if any
  playLastAudio: () => void
  // Detects platform/format, loads devices, subscribes to transcriptions and selects
  // whisper mode, falling back to 'web' if the whisper status is not 'ready' within 60 s
  init: () => Promise<void>
  // Stops recording and releases timers, the transcription subscription and the stream
  cleanup: () => void
 }
 /**
  * Creates a voice-input controller with two backends: the browser Web Speech
  * recognizer ('web') and MediaRecorder audio sent through the whisperSocket
  * service ('whisper').
  *
  * Nothing is started on creation; callers are expected to invoke `init()` and
  * later `cleanup()` on the returned object.
  *
  * @param options.language Language tag assigned to the Web Speech recognizer's
  *   `lang`; defaults to 'es-419'.
  * @returns Reactive state and control functions described by `VoiceInput`.
  */
 export function useVoiceInput(options?: {
  language?: string
 }): VoiceInput {
  const language = options?.language ?? 'es-419'
  // ── Reactive state ──
  const isRecording = ref(false)
  const transcript = ref('')
  const interimTranscript = ref('')
  const error = ref('')
  // Starts as 'web'; init() switches it to 'whisper'
  const voiceMode = ref<VoiceMode>('web')
  // Status ref provided by the whisperSocket service (read via `.value` below)
  const whisperStatus = getWhisperStatus()
  const audioDevices = ref<MediaDeviceInfo[]>([])
  const selectedDeviceId = ref<string>('')
  const isAndroid = ref(false)
  const lastAudioUrl = ref<string>('')
  const isPlayingAudio = ref(false)
  // ── Internal state ──
  // Web Speech recognizer, created lazily by startRecording()
  let recognition: SpeechRecognition | null = null
  // Last final Web Speech text handled; used to strip repeated prefixes on Android
  let lastProcessedResult = ''
  // Element used by playLastAudio()
  let audioElement: HTMLAudioElement | null = null
  // Recorder of the whisper segment currently being captured
  let mediaRecorder: MediaRecorder | null = null
  // Data chunks collected for the current whisper segment
  let audioChunks: Blob[] = []
  // Id of the interval that periodically stops the recorder to rotate segments
  let chunkInterval: number | null = null
  // Microphone stream shared by all segments of one whisper recording
  let mediaStream: MediaStream | null = null
  // Initial guess; replaced by detectAudioFormat() in init()
  let supportedMimeType = 'audio/webm;codecs=opus'
  // Unsubscribe handle returned by onTranscription()
  let unsubTranscription: (() => void) | null = null
  // Length of each whisper segment in milliseconds
  const CHUNK_INTERVAL_MS = 3000
  // ── Mobile detection ──
  /** Records in `isAndroid` whether the user agent string mentions Android (case-insensitive). */
  function checkMobile() {
    const userAgent = navigator.userAgent
    const androidPattern = /Android/i
    isAndroid.value = androidPattern.test(userAgent)
  }
  // ── Audio format detection ──
  /**
   * Picks the first recording MIME type supported by this browser's MediaRecorder.
   *
   * Candidates are probed in preference order (WebM/Opus first, WAV last).
   *
   * @returns The first supported MIME type, or '' when none of the candidates is
   *   supported or the MediaRecorder API is not available at all. An empty string
   *   makes the recorder setup omit the `mimeType` option.
   */
  function detectAudioFormat(): string {
    // Without this guard a browser lacking MediaRecorder throws a ReferenceError
    // here, which would make init() reject instead of degrading gracefully.
    if (typeof MediaRecorder === 'undefined' || typeof MediaRecorder.isTypeSupported !== 'function') {
      return ''
    }
    const candidates = [
      'audio/webm;codecs=opus',
      'audio/webm',
      'audio/mp4',
      'audio/mp4;codecs=mp4a.40.2',
      'audio/aac',
      'audio/ogg;codecs=opus',
      'audio/wav'
    ]
    return candidates.find(format => MediaRecorder.isTypeSupported(format)) ?? ''
  }
  // ── Device selection ──
  /**
   * Refreshes `audioDevices` with the available audio inputs.
   *
   * Unless `skipPermission` is true, the microphone is opened once via
   * getUserMedia and its tracks are stopped immediately before enumerating.
   * When nothing is selected yet, the first input becomes `selectedDeviceId`.
   * Failures are logged and otherwise ignored.
   *
   * @param skipPermission Skip the getUserMedia call and enumerate directly.
   */
  async function loadAudioDevices(skipPermission = false) {
    try {
      if (!skipPermission) {
        const probeStream = await navigator.mediaDevices.getUserMedia({ audio: true })
        for (const track of probeStream.getTracks()) {
          track.stop()
        }
      }
      const allDevices = await navigator.mediaDevices.enumerateDevices()
      const inputs = allDevices.filter(device => device.kind === 'audioinput')
      audioDevices.value = inputs
      const nothingSelected = !selectedDeviceId.value
      if (nothingSelected && inputs.length > 0) {
        selectedDeviceId.value = inputs[0]?.deviceId || ''
      }
    } catch (e) {
      console.error('[VoiceInput] Failed to enumerate devices:', e)
    }
  }
  /**
   * Stores the chosen input device. If a recording is running it is stopped and
   * a new one is started 100 ms later so the new device takes effect.
   *
   * @param deviceId Device id to use for subsequent captures.
   */
  function selectMicrophone(deviceId: string) {
    selectedDeviceId.value = deviceId
    if (!isRecording.value) return
    stopRecording()
    setTimeout(startRecording, 100)
  }
  // ── Web Speech API ──
  /**
   * Builds a Web Speech recognizer and wires its result, error and end handlers.
   *
   * @returns The configured recognizer, or null (with `error` set) when the
   *   browser exposes neither `SpeechRecognition` nor `webkitSpeechRecognition`.
   */
  function initRecognition(): SpeechRecognition | null {
    const SR = (window as any).SpeechRecognition || (window as any).webkitSpeechRecognition
    if (!SR) {
      error.value = 'Speech recognition not supported in this browser'
      return null
    }
    const rec = new SR() as SpeechRecognition
    rec.continuous = !isAndroid.value
    rec.interimResults = true
    rec.lang = language
    rec.onresult = (event: SpeechRecognitionEvent) => {
      let interim = ''
      let final = ''
      for (let i = event.resultIndex; i < event.results.length; i++) {
        const result = event.results[i]
        if (!result || !result[0]) continue
        if (result.isFinal) {
          final += result[0].transcript + ' '
        } else {
          interim += result[0].transcript
        }
      }
      if (final) {
        const trimmedFinal = final.trim()
        // On Android a final result that begins with the previously processed
        // final text is treated as cumulative: only the new suffix is appended.
        if (isAndroid.value && lastProcessedResult && trimmedFinal.startsWith(lastProcessedResult.trim())) {
          const newPart = trimmedFinal.slice(lastProcessedResult.trim().length).trim()
          if (newPart) {
            transcript.value += newPart + ' '
            lastProcessedResult = trimmedFinal
          }
        } else {
          transcript.value += final
          lastProcessedResult = trimmedFinal
        }
      }
      interimTranscript.value = interim
    }
    rec.onerror = (event: SpeechRecognitionErrorEvent) => {
      console.error('[VoiceInput] Recognition error:', event.error)
      if (event.error === 'not-allowed') {
        error.value = 'Microphone access denied'
      } else {
        error.value = `Error: ${event.error}`
      }
      isRecording.value = false
    }
    rec.onend = () => {
      // Only react when the session is still meant to be running in web mode
      if (!isRecording.value || voiceMode.value !== 'web') return
      if (isAndroid.value) {
        // Android sessions are not restarted; end of recognition ends recording
        isRecording.value = false
        return
      }
      try {
        rec.start()
      } catch (e) {
        // start() can throw; without this the exception escapes the event handler
        // and isRecording stays true although nothing is listening.
        console.error('[VoiceInput] Failed to restart Web Speech:', e)
        isRecording.value = false
      }
    }
    return rec
  }
  // ── Whisper recording ──
  // Strategy: stop/restart MediaRecorder every CHUNK_INTERVAL_MS so each
  // segment sent to the server is a complete, independently-decodable WebM file.
  // Transcript text is accumulated (appended) on the client per segment.
  /**
   * Applies one transcription message from the whisper service.
   *
   * Successful text is trimmed and appended to `transcript` (with a trailing
   * space) and the interim text is cleared. Text is accepted even when it
   * belongs to an intermediate segment and arrives after recording stopped:
   * every segment carries distinct audio, so dropping a late result would lose
   * the words spoken just before the stop.
   *
   * Errors are surfaced in `error`, except errors for intermediate segments
   * that arrive after recording has stopped, which stay suppressed.
   *
   * @param msg Message with `success`/`text` on success, `error` on failure and
   *   `partial` marking an intermediate (non-final) segment.
   */
  function handleTranscription(msg: {
    success?: boolean
    text?: string
    error?: string
    partial?: boolean
  }) {
    if (msg.success && msg.text) {
      const segmentText = msg.text.trim()
      if (segmentText) {
        transcript.value += segmentText + ' '
      }
      interimTranscript.value = ''
      return
    }
    const lateIntermediate = !isRecording.value && Boolean(msg.partial)
    if (msg.error && !lateIntermediate) {
      error.value = msg.error
    }
  }
  /**
   * Base64-encodes a recorded segment and sends it to the whisper service.
   *
   * The language sent is the primary subtag of the composable's `language`
   * option (for example 'es-419' becomes 'es'), so both backends follow the
   * same setting; 'es' is used when no subtag can be derived.
   *
   * @param blob Recorded audio segment.
   * @param partial True for an intermediate segment, false for the final one.
   */
  function sendWhisperBlob(blob: Blob, partial: boolean) {
    const whisperLanguage = language.split('-')[0]?.toLowerCase() || 'es'
    const reader = new FileReader()
    // Use `load` rather than `loadend`: `loadend` also fires after a failed read,
    // where `result` is null and splitting it would throw.
    reader.onload = () => {
      const dataUrl = reader.result
      if (typeof dataUrl !== 'string') return
      // The payload follows the last comma of the data URL (base64 contains none)
      const commaIndex = dataUrl.lastIndexOf(',')
      if (commaIndex < 0) return
      const base64 = dataUrl.slice(commaIndex + 1)
      if (!base64) return
      sendAudio(base64, whisperLanguage, partial)
    }
    reader.onerror = () => {
      console.error('[VoiceInput] Failed to read audio segment:', reader.error)
    }
    reader.readAsDataURL(blob)
  }
  /**
   * Creates (but does not start) a MediaRecorder for the next segment on the
   * current `mediaStream` and publishes it as `mediaRecorder`.
   *
   * When the recorder stops, the collected chunks are combined into one blob:
   * the final segment is saved for playback and always sent; intermediate
   * segments smaller than 2000 bytes are skipped. If recording is still active
   * on the same stream, the next segment is created and started; otherwise the
   * stream's tracks are stopped.
   *
   * The stream, recorder and chunk buffer are pinned per segment so that a stop
   * event arriving after a new recording has begun can neither read the newer
   * recorder's state nor tear down or restart on the newer stream.
   */
  function createRecorderSegment() {
    if (!mediaStream) return
    const stream = mediaStream
    const recorderOptions: MediaRecorderOptions = {}
    if (supportedMimeType) {
      recorderOptions.mimeType = supportedMimeType
    }
    const recorder = new MediaRecorder(stream, recorderOptions)
    const segmentChunks: Blob[] = []
    mediaRecorder = recorder
    audioChunks = segmentChunks

    // Stops this segment's stream; clears the shared reference only if it is still ours
    const releaseStream = () => {
      stream.getTracks().forEach(track => track.stop())
      if (mediaStream === stream) {
        mediaStream = null
      }
    }

    recorder.ondataavailable = (event) => {
      if (event.data.size > 0) {
        segmentChunks.push(event.data)
      }
    }
    recorder.onstop = () => {
      const isFinal = !isRecording.value
      if (segmentChunks.length > 0) {
        const mimeType = recorder.mimeType || supportedMimeType || 'audio/webm'
        const segmentBlob = new Blob(segmentChunks, { type: mimeType })
        segmentChunks.length = 0
        if (isFinal) {
          // Save last segment for playback
          saveAudioForPlayback(segmentBlob)
        }
        // Skip tiny segments (silence) for intermediate, always send final
        if (segmentBlob.size >= 2000 || isFinal) {
          sendWhisperBlob(segmentBlob, !isFinal)
        }
      }
      // Final segment, or this stream was replaced/released meanwhile: just clean up
      if (isFinal || mediaStream !== stream) {
        releaseStream()
        return
      }
      // Still recording on the same stream: start the next segment
      try {
        createRecorderSegment()
        mediaRecorder?.start()
      } catch (e) {
        // A failed restart must not leave the UI in a "recording" state
        console.error('[VoiceInput] Failed to restart recorder segment:', e)
        const message = e instanceof Error ? e.message : String(e)
        error.value = `Microphone error: ${message}`
        isRecording.value = false
        if (chunkInterval) {
          clearInterval(chunkInterval)
          chunkInterval = null
        }
        releaseStream()
      }
    }
  }
  /**
   * Starts a whisper recording: opens the microphone (honouring
   * `selectedDeviceId`), starts the first recorder segment and schedules a
   * stop every CHUNK_INTERVAL_MS so that each segment is sent separately.
   *
   * Sets `error` and returns without recording when the whisper service is not
   * connected or the microphone/recorder cannot be started. A stream acquired
   * before such a failure is released again.
   */
  async function startWhisperRecording() {
    if (!isConnected()) {
      error.value = 'Whisper server not connected'
      return
    }
    // Tracks the stream opened by this call so a failure releases only that one
    let acquiredStream: MediaStream | null = null
    try {
      const audioConstraints: MediaTrackConstraints = {
        echoCancellation: true,
        noiseSuppression: true,
        autoGainControl: true,
        ...(selectedDeviceId.value ? { deviceId: { exact: selectedDeviceId.value } } : {})
      }
      acquiredStream = await navigator.mediaDevices.getUserMedia({ audio: audioConstraints })
      mediaStream = acquiredStream
      createRecorderSegment()
      mediaRecorder!.start()
      isRecording.value = true
      // Never leave an earlier rotation timer running alongside the new one
      if (chunkInterval) {
        clearInterval(chunkInterval)
      }
      // Stop/restart every CHUNK_INTERVAL_MS for streaming partial transcription
      // Each segment is a complete WebM file (own EBML header)
      chunkInterval = window.setInterval(() => {
        if (isRecording.value && mediaRecorder?.state === 'recording' && isConnected()) {
          mediaRecorder.stop() // onstop sends segment and restarts recorder
        }
      }, CHUNK_INTERVAL_MS)
      // Refresh device labels after first recording
      void loadAudioDevices(true)
    } catch (e: unknown) {
      // Recorder creation/start failed after the microphone was opened:
      // stop the tracks so the device is not held open indefinitely.
      if (acquiredStream) {
        acquiredStream.getTracks().forEach(track => track.stop())
        if (mediaStream === acquiredStream) {
          mediaStream = null
        }
      }
      const message = e instanceof Error ? e.message : String(e)
      error.value = `Microphone error: ${message}`
      console.error('[VoiceInput] Microphone error:', e)
    }
  }
  /**
   * Ends a whisper recording: cancels the segment timer, clears `isRecording`
   * and stops the recorder so its stop handler can deal with the last segment.
   * When no recorder is active, the microphone stream is released right here.
   */
  function stopWhisperRecording() {
    if (chunkInterval) {
      clearInterval(chunkInterval)
      chunkInterval = null
    }
    isRecording.value = false

    const recorder = mediaRecorder
    if (recorder !== null && recorder.state !== 'inactive') {
      recorder.stop() // onstop sends final segment + cleanup
      return
    }
    // No active recorder: nothing will fire onstop, so free the stream now
    mediaStream?.getTracks().forEach(track => track.stop())
    mediaStream = null
  }
  // ── Public recording controls ──
  /**
   * Starts recording with the active backend and clears any previous error.
   *
   * Whisper mode is used only while the whisper service is connected; otherwise
   * the mode is switched to 'web' and the Web Speech recognizer (created on
   * first use) is started. A failure to start Web Speech is logged.
   */
  function startRecording() {
    error.value = ''
    const wantsWhisper = voiceMode.value === 'whisper'
    if (wantsWhisper && isConnected()) {
      startWhisperRecording()
      return
    }
    // Fallback to Web Speech API (or explicit web mode)
    if (wantsWhisper) {
      voiceMode.value = 'web'
    }
    recognition = recognition ?? initRecognition()
    if (!recognition) return
    try {
      recognition.start()
      isRecording.value = true
    } catch (e) {
      console.error('[VoiceInput] Failed to start Web Speech:', e)
    }
  }
  /**
   * Stops the active recording. In whisper mode with a recorder present the
   * whisper stop path is used; otherwise the Web Speech recognizer (if any) is
   * stopped and `isRecording` is cleared. The interim transcript is always reset.
   */
  function stopRecording() {
    const useWhisperPath = voiceMode.value === 'whisper' && Boolean(mediaRecorder)
    if (useWhisperPath) {
      stopWhisperRecording()
    } else {
      recognition?.stop()
      isRecording.value = false
    }
    interimTranscript.value = ''
  }
  /** Stops the recording when one is active, otherwise starts one. */
  function toggleRecording() {
    const action = isRecording.value ? stopRecording : startRecording
    action()
  }
  /**
   * Switches the recognition backend, stopping any recording in progress first.
   *
   * @param mode Backend to use for subsequent recordings.
   */
  function setMode(mode: VoiceMode) {
    const recordingInProgress = isRecording.value
    if (recordingInProgress) {
      stopRecording()
    }
    voiceMode.value = mode
  }
  // ── Audio playback ──
  /**
   * Publishes `blob` as the playable "last audio", revoking the object URL it
   * replaces.
   *
   * @param blob Audio to expose through `lastAudioUrl`.
   */
  function saveAudioForPlayback(blob: Blob) {
    const previousUrl = lastAudioUrl.value
    if (previousUrl) {
      URL.revokeObjectURL(previousUrl)
    }
    const freshUrl = URL.createObjectURL(blob)
    lastAudioUrl.value = freshUrl
  }
  /**
   * Plays the audio behind `lastAudioUrl`, pausing any playback already running.
   * `isPlayingAudio` is true from the start until playback ends, errors, or
   * `play()` rejects. Does nothing when no audio has been saved.
   */
  function playLastAudio() {
    const url = lastAudioUrl.value
    if (!url) return

    audioElement?.pause()
    audioElement = null

    const markStopped = () => { isPlayingAudio.value = false }
    isPlayingAudio.value = true
    const player = new Audio(url)
    audioElement = player
    player.onended = markStopped
    player.onerror = markStopped
    player.play().catch(markStopped)
  }
  /** Empties the transcript and interim text and forgets the last processed final result. */
  function clearTranscript() {
    lastProcessedResult = ''
    interimTranscript.value = ''
    transcript.value = ''
  }
  // ── Lifecycle ──
  // Id of the whisper-readiness poll started by init(); null when no poll is running
  let gpuPollTimer: number | null = null
  /**
   * Prepares the composable: detects Android and the recording format, loads
   * the device list (without prompting for permission), subscribes to whisper
   * transcriptions and selects whisper mode.
   *
   * If the whisper status is not 'ready' yet, it is polled every 2 s; after
   * 60 s without becoming ready the mode falls back to 'web'.
   *
   * Safe to call more than once: the subscription and poll timer created by an
   * earlier call are released first, so transcriptions are never handled twice.
   */
  async function init() {
    checkMobile()
    supportedMimeType = detectAudioFormat()
    await loadAudioDevices(true)
    // Replace (rather than stack) any subscription from a previous init()
    if (unsubTranscription) {
      unsubTranscription()
    }
    // Subscribe to whisper transcriptions via singleton
    unsubTranscription = onTranscription(handleTranscription)
    // GPU is the default mode — always start as whisper
    voiceMode.value = 'whisper'
    const stopPolling = () => {
      if (gpuPollTimer) clearInterval(gpuPollTimer)
      gpuPollTimer = null
    }
    // A poll left over from a previous init() must not keep running
    stopPolling()
    // If already ready, done
    if (whisperStatus.value === 'ready') return
    // Poll for up to 60 seconds waiting for GPU to connect
    const GPU_POLL_INTERVAL = 2000
    const GPU_POLL_MAX = 60000
    let elapsed = 0
    gpuPollTimer = window.setInterval(() => {
      elapsed += GPU_POLL_INTERVAL
      if (whisperStatus.value === 'ready') {
        // GPU connected — stay in whisper mode
        stopPolling()
        console.log('[VoiceInput] GPU connected after', elapsed, 'ms')
        return
      }
      if (elapsed >= GPU_POLL_MAX) {
        // Timeout — fallback to web speech
        stopPolling()
        voiceMode.value = 'web'
        console.warn('[VoiceInput] GPU timeout after 60s, falling back to Web Speech')
      }
    }, GPU_POLL_INTERVAL)
  }
  /**
   * Tears the composable down: stops the readiness poll and any recording,
   * aborts the Web Speech recognizer, unsubscribes from transcriptions, clears
   * the segment timer, releases the microphone stream, stops playback and
   * revokes the playback object URL.
   */
  function cleanup() {
    if (gpuPollTimer) {
      clearInterval(gpuPollTimer)
      gpuPollTimer = null
    }
    if (isRecording.value) {
      stopRecording()
    }
    if (recognition) {
      recognition.abort()
      recognition = null
    }
    if (unsubTranscription) {
      unsubTranscription()
      unsubTranscription = null
    }
    if (chunkInterval) {
      clearInterval(chunkInterval)
      chunkInterval = null
    }
    if (mediaStream) {
      mediaStream.getTracks().forEach(track => track.stop())
      mediaStream = null
    }
    // Stop playback so audio does not keep playing after teardown
    if (audioElement) {
      audioElement.pause()
      audioElement = null
    }
    isPlayingAudio.value = false
    // Object URLs keep their blob alive until revoked
    // NOTE(review): a recorder stop event that fires after cleanup() may still
    // save one more URL — confirm whether that needs handling.
    if (lastAudioUrl.value) {
      URL.revokeObjectURL(lastAudioUrl.value)
      lastAudioUrl.value = ''
    }
  }
  // Public API: reactive state first, then control functions (see VoiceInput)
  return {
    isRecording,
    transcript,
    interimTranscript,
    error,
    voiceMode,
    whisperStatus,
    audioDevices,
    selectedDeviceId,
    isAndroid,
    lastAudioUrl,
    isPlayingAudio,
    startRecording,
    stopRecording,
    toggleRecording,
    setMode,
    loadAudioDevices,
    selectMicrophone,
    clearTranscript,
    playLastAudio,
    init,
    cleanup
  }
 }