/**
 * useVoiceInput - Dual-mode voice input composable (Web Speech API + Whisper GPU)
 *
 * Extracts core voice functionality from FloatingVoice.vue (System A)
 * but uses the whisperSocket.ts singleton (System B pattern).
 *
 * Does NOT include: PTT, terminal sending, panel UI.
 * Playback is limited to the last (final) Whisper segment of a recording.
 *
 * Whisper recording strategy (introduced in commit b7f03a7, josedario87,
 * "fix: robust whisper recording with stop/restart segment strategy"):
 * - The MediaRecorder is stopped and restarted every CHUNK_INTERVAL_MS, so each
 *   segment is a complete, independently-decodable file with its own header.
 * - This avoids the audio corruption caused by concatenating partial WebM clusters.
 * - Transcript text is accumulated (appended) per segment on the client.
 * - `onstop` sends the finished segment and, while still recording, starts the next one.
 */

import { ref, type Ref } from 'vue'
import {
  initWhisperSocket,
  sendAudio,
  onTranscription,
  getWhisperStatus,
  isConnected,
  type WhisperStatus
} from '../services/whisperSocket'

// ── Web Speech API types ──

interface SpeechRecognitionEvent extends Event {
  resultIndex: number
  results: SpeechRecognitionResultList
}

interface SpeechRecognitionErrorEvent extends Event {
  error: string
  message?: string
}

interface SpeechRecognition extends EventTarget {
  continuous: boolean
  interimResults: boolean
  lang: string
  onresult: ((event: SpeechRecognitionEvent) => void) | null
  onerror: ((event: SpeechRecognitionErrorEvent) => void) | null
  onend: (() => void) | null
  start(): void
  stop(): void
  abort(): void
}

/** Constructor shape shared by `SpeechRecognition` and `webkitSpeechRecognition`. */
type SpeechRecognitionConstructor = new () => SpeechRecognition

// ── Types ──

/** Recognition backend: browser Web Speech API or the Whisper socket service. */
export type VoiceMode = 'web' | 'whisper'

/** Reactive state and controls returned by {@link useVoiceInput}. */
export interface VoiceInput {
  isRecording: Ref<boolean>
  transcript: Ref<string>
  interimTranscript: Ref<string>
  error: Ref<string>
  voiceMode: Ref<VoiceMode>
  // NOTE(review): assumes getWhisperStatus() returns Ref<WhisperStatus> — confirm against whisperSocket.ts
  whisperStatus: Ref<WhisperStatus>
  audioDevices: Ref<MediaDeviceInfo[]>
  selectedDeviceId: Ref<string>
  isAndroid: Ref<boolean>
  lastAudioUrl: Ref<string>
  isPlayingAudio: Ref<boolean>

  startRecording: () => void
  stopRecording: () => void
  toggleRecording: () => void
  setMode: (mode: VoiceMode) => void
  loadAudioDevices: (skipPermission?: boolean) => Promise<void>
  selectMicrophone: (deviceId: string) => void
  clearTranscript: () => void
  playLastAudio: () => void
  init: () => Promise<void>
  cleanup: () => void
}

/**
 * Creates an independent voice-input controller.
 *
 * Call `init()` once before use and `cleanup()` when the owner is torn down.
 *
 * @param options.language BCP-47 tag used for Web Speech (default `'es-419'`).
 *   Its primary subtag (e.g. `'es'`) is the language sent with Whisper audio.
 * @returns Reactive state plus recording, device and playback controls.
 */
export function useVoiceInput(options?: {
  language?: string
}): VoiceInput {
  const language = options?.language ?? 'es-419'
  // Whisper audio is tagged with a bare language code, so reduce the tag to its primary subtag.
  const whisperLanguage = language.split('-')[0]?.toLowerCase() || 'es'

  // ── Reactive state ──
  const isRecording = ref(false)
  const transcript = ref('')
  const interimTranscript = ref('')
  const error = ref('')
  const voiceMode = ref<VoiceMode>('web')
  const whisperStatus = getWhisperStatus()
  const audioDevices = ref<MediaDeviceInfo[]>([])
  const selectedDeviceId = ref('')
  const isAndroid = ref(false)
  const lastAudioUrl = ref('')
  const isPlayingAudio = ref(false)

  // ── Internal state ──
  let recognition: SpeechRecognition | null = null
  let lastProcessedResult = ''
  let audioElement: HTMLAudioElement | null = null
  let mediaRecorder: MediaRecorder | null = null
  let chunkInterval: number | null = null
  let mediaStream: MediaStream | null = null
  let supportedMimeType = 'audio/webm;codecs=opus'
  let unsubTranscription: (() => void) | null = null
  let gpuPollTimer: number | null = null
  // True while getUserMedia is pending, so overlapping starts cannot open a second stream.
  let isStartingWhisper = false
  // Set by cleanup(); stops late recorder callbacks and a pending init() from re-creating resources.
  let disposed = false
  const CHUNK_INTERVAL_MS = 3000
  // Intermediate segments smaller than this are treated as silence and not sent.
  const MIN_SEGMENT_BYTES = 2000

  // ── Small helpers ──

  /** Extracts a printable message from an unknown thrown value. */
  function describeError(cause: unknown): string {
    return cause instanceof Error ? cause.message : String(cause)
  }

  /** Stops every track of `stream` and forgets it if it is the active stream. */
  function releaseStream(stream: MediaStream) {
    stream.getTracks().forEach(track => track.stop())
    if (mediaStream === stream) {
      mediaStream = null
    }
  }

  /** Cancels the periodic stop/restart timer, if any. */
  function clearChunkTimer() {
    if (chunkInterval !== null) {
      clearInterval(chunkInterval)
      chunkInterval = null
    }
  }

  /** Cancels the GPU readiness poll, if any. */
  function clearGpuPoll() {
    if (gpuPollTimer !== null) {
      clearInterval(gpuPollTimer)
      gpuPollTimer = null
    }
  }

  // ── Mobile detection ──

  function checkMobile() {
    isAndroid.value = /Android/i.test(navigator.userAgent)
  }

  // ── Audio format detection ──

  /**
   * Returns the first MIME type in preference order that MediaRecorder supports,
   * or `''` when none is (or MediaRecorder itself is unavailable), in which case
   * the recorder is created without an explicit mimeType.
   */
  function detectAudioFormat(): string {
    if (typeof MediaRecorder === 'undefined') return ''

    const formats = [
      'audio/webm;codecs=opus',
      'audio/webm',
      'audio/mp4',
      'audio/mp4;codecs=mp4a.40.2',
      'audio/aac',
      'audio/ogg;codecs=opus',
      'audio/wav'
    ]
    return formats.find(format => MediaRecorder.isTypeSupported(format)) ?? ''
  }

  // ── Device selection ──

  /**
   * Refreshes `audioDevices` and selects the first input if none is selected.
   *
   * @param skipPermission When false, briefly opens the microphone first to
   *   trigger the permission prompt before enumerating. Failures are logged,
   *   not thrown.
   */
  async function loadAudioDevices(skipPermission = false): Promise<void> {
    try {
      if (!skipPermission) {
        const tempStream = await navigator.mediaDevices.getUserMedia({ audio: true })
        tempStream.getTracks().forEach(track => track.stop())
      }
      const devices = await navigator.mediaDevices.enumerateDevices()
      audioDevices.value = devices.filter(d => d.kind === 'audioinput')
      if (!selectedDeviceId.value && audioDevices.value.length > 0) {
        selectedDeviceId.value = audioDevices.value[0]?.deviceId || ''
      }
    } catch (e) {
      console.error('[VoiceInput] Failed to enumerate devices:', e)
    }
  }

  /** Switches the input device, restarting an in-progress recording on the new one. */
  function selectMicrophone(deviceId: string) {
    selectedDeviceId.value = deviceId
    if (isRecording.value) {
      stopRecording()
      setTimeout(() => startRecording(), 100)
    }
  }

  // ── Web Speech API ──

  /**
   * Builds a configured SpeechRecognition instance, or returns null (and sets
   * `error`) when the browser exposes neither the standard nor the webkit API.
   */
  function initRecognition(): SpeechRecognition | null {
    const speechWindow = window as Window & {
      SpeechRecognition?: SpeechRecognitionConstructor
      webkitSpeechRecognition?: SpeechRecognitionConstructor
    }
    const SR = speechWindow.SpeechRecognition ?? speechWindow.webkitSpeechRecognition
    if (!SR) {
      error.value = 'Speech recognition not supported in this browser'
      return null
    }

    const rec = new SR()
    rec.continuous = !isAndroid.value
    rec.interimResults = true
    rec.lang = language

    rec.onresult = (event: SpeechRecognitionEvent) => {
      let interim = ''
      let final = ''

      for (let i = event.resultIndex; i < event.results.length; i++) {
        const result = event.results[i]
        if (!result || !result[0]) continue
        if (result.isFinal) {
          final += result[0].transcript + ' '
        } else {
          interim += result[0].transcript
        }
      }

      if (final) {
        const trimmedFinal = final.trim()
        // On Android, when a final result starts with the previous final text,
        // append only the new suffix so the repeated prefix is not duplicated.
        if (isAndroid.value && lastProcessedResult && trimmedFinal.startsWith(lastProcessedResult.trim())) {
          const newPart = trimmedFinal.slice(lastProcessedResult.trim().length).trim()
          if (newPart) {
            transcript.value += newPart + ' '
            lastProcessedResult = trimmedFinal
          }
        } else {
          transcript.value += final
          lastProcessedResult = trimmedFinal
        }
      }
      interimTranscript.value = interim
    }

    rec.onerror = (event: SpeechRecognitionErrorEvent) => {
      console.error('[VoiceInput] Recognition error:', event.error)
      if (event.error === 'not-allowed') {
        error.value = 'Microphone access denied'
      } else {
        error.value = `Error: ${event.error}`
      }
      isRecording.value = false
    }

    rec.onend = () => {
      if (!isRecording.value || voiceMode.value !== 'web') return

      if (isAndroid.value) {
        // Non-continuous mode on Android: one utterance ends the recording.
        isRecording.value = false
        return
      }
      // Keep listening until the user stops; start() can throw if the engine is
      // not ready to restart, in which case the recording is ended cleanly.
      try {
        rec.start()
      } catch (e) {
        console.error('[VoiceInput] Failed to restart Web Speech:', e)
        isRecording.value = false
      }
    }

    return rec
  }

  // ── Whisper recording ──
  // Strategy: stop/restart MediaRecorder every CHUNK_INTERVAL_MS so each
  // segment sent to the server is a complete, independently-decodable file.
  // Transcript text is accumulated (appended) on the client per segment.

  /** Applies one transcription message from the whisper socket to the transcript. */
  function handleTranscription(msg: {
    success?: boolean
    text?: string
    error?: string
    partial?: boolean
  }) {
    // Final results are applied even after stop because they arrive asynchronously;
    // partial results that arrive once recording has stopped are ignored.
    // NOTE(review): with per-segment accumulation, a partial segment whose result
    // arrives just after stop is dropped here and its text is lost — confirm this
    // is intended before relaxing the guard (it also limits cross-talk between
    // several consumers of the shared socket).
    if (!isRecording.value && msg.partial) return

    if (msg.success && msg.text) {
      const newText = msg.text.trim()
      if (newText) {
        transcript.value += newText + ' '
      }
      interimTranscript.value = ''
    } else if (msg.error) {
      error.value = msg.error
    }
  }

  /**
   * Base64-encodes `blob` and sends it to the Whisper service.
   *
   * @param partial True for an intermediate segment, false for the final one.
   */
  function sendWhisperBlob(blob: Blob, partial: boolean) {
    const reader = new FileReader()
    reader.onload = () => {
      // readAsDataURL yields "data:<mime>;base64,<payload>"; only the payload is sent.
      const dataUrl = typeof reader.result === 'string' ? reader.result : ''
      const base64 = dataUrl.split(',')[1]
      if (!base64) {
        console.error('[VoiceInput] Audio segment could not be encoded')
        return
      }
      sendAudio(base64, whisperLanguage, partial)
    }
    reader.onerror = () => {
      console.error('[VoiceInput] Failed to read audio segment:', reader.error)
    }
    reader.readAsDataURL(blob)
  }

  /** Ends the Whisper session after an unrecoverable recorder failure. */
  function abortWhisperSession(stream: MediaStream, cause: unknown) {
    clearChunkTimer()
    isRecording.value = false
    releaseStream(stream)
    error.value = `Microphone error: ${describeError(cause)}`
    console.error('[VoiceInput] Microphone error:', cause)
  }

  /**
   * Creates and starts a recorder for one segment on `stream`.
   *
   * Chunks, recorder and stream are captured per segment so that a late `onstop`
   * from a previous session can never touch a newer session's state.
   *
   * @throws Whatever the MediaRecorder constructor or `start()` throws.
   */
  function startRecorderSegment(stream: MediaStream) {
    const recorderOptions: MediaRecorderOptions = {}
    if (supportedMimeType) {
      recorderOptions.mimeType = supportedMimeType
    }

    const recorder = new MediaRecorder(stream, recorderOptions)
    const chunks: Blob[] = []

    recorder.ondataavailable = (event) => {
      if (event.data.size > 0) {
        chunks.push(event.data)
      }
    }

    recorder.onstop = () => {
      // The segment is final when the user stopped, or when this recorder/stream
      // has been superseded by a newer session.
      const sessionActive =
        isRecording.value && mediaStream === stream && mediaRecorder === recorder
      const isFinal = !sessionActive

      if (chunks.length > 0) {
        const mimeType = recorder.mimeType || supportedMimeType || 'audio/webm'
        const segmentBlob = new Blob(chunks, { type: mimeType })

        if (isFinal && !disposed) {
          // Save last segment for playback
          saveAudioForPlayback(segmentBlob)
        }

        // Skip tiny segments (silence) for intermediate, always send final
        if (isFinal || segmentBlob.size >= MIN_SEGMENT_BYTES) {
          sendWhisperBlob(segmentBlob, !isFinal)
        }
      }

      if (isFinal) {
        releaseStream(stream)
        return
      }

      // Still recording: start the next segment on the same stream.
      try {
        startRecorderSegment(stream)
      } catch (e: unknown) {
        abortWhisperSession(stream, e)
      }
    }

    recorder.start()
    mediaRecorder = recorder
  }

  /**
   * Opens the microphone and begins segmented Whisper recording.
   * Sets `error` instead of throwing when the server is not connected or the
   * microphone/recorder cannot be started.
   */
  async function startWhisperRecording(): Promise<void> {
    if (!isConnected()) {
      error.value = 'Whisper server not connected'
      return
    }
    // Ignore overlapping starts; a second stream would otherwise be leaked.
    if (isStartingWhisper || isRecording.value) return
    isStartingWhisper = true

    let stream: MediaStream | null = null
    try {
      const audioConstraints: MediaTrackConstraints = {
        echoCancellation: true,
        noiseSuppression: true,
        autoGainControl: true,
        ...(selectedDeviceId.value ? { deviceId: { exact: selectedDeviceId.value } } : {})
      }

      stream = await navigator.mediaDevices.getUserMedia({ audio: audioConstraints })
      mediaStream = stream

      startRecorderSegment(stream)
      isRecording.value = true

      // Stop/restart every CHUNK_INTERVAL_MS for streaming partial transcription.
      // Each segment is a complete file with its own header.
      clearChunkTimer()
      chunkInterval = window.setInterval(() => {
        if (isRecording.value && mediaRecorder?.state === 'recording' && isConnected()) {
          mediaRecorder.stop() // onstop sends segment and restarts recorder
        }
      }, CHUNK_INTERVAL_MS)

      // Refresh device labels after first recording
      void loadAudioDevices(true)
    } catch (e: unknown) {
      // Release the microphone if the recorder failed after the stream was opened.
      if (stream) {
        releaseStream(stream)
      }
      error.value = `Microphone error: ${describeError(e)}`
      console.error('[VoiceInput] Microphone error:', e)
    } finally {
      isStartingWhisper = false
    }
  }

  /** Stops segmented recording; the recorder's `onstop` sends the final segment. */
  function stopWhisperRecording() {
    clearChunkTimer()
    isRecording.value = false

    if (mediaRecorder && mediaRecorder.state !== 'inactive') {
      mediaRecorder.stop() // onstop sends final segment + releases the stream
    } else if (mediaStream) {
      releaseStream(mediaStream)
    }
  }

  // ── Public recording controls ──

  /**
   * Starts recording in the current mode. In whisper mode without a connection,
   * switches `voiceMode` to `'web'` and falls back to the Web Speech API.
   */
  function startRecording() {
    error.value = ''

    if (voiceMode.value === 'whisper' && isConnected()) {
      void startWhisperRecording()
      return
    }

    // Fallback to Web Speech API (or explicit web mode)
    if (voiceMode.value === 'whisper') {
      voiceMode.value = 'web'
    }
    if (!recognition) {
      recognition = initRecognition()
    }
    if (recognition) {
      try {
        recognition.start()
        isRecording.value = true
      } catch (e) {
        console.error('[VoiceInput] Failed to start Web Speech:', e)
      }
    }
  }

  /** Stops whichever backend is recording and clears the interim transcript. */
  function stopRecording() {
    if (voiceMode.value === 'whisper' && mediaRecorder) {
      stopWhisperRecording()
    } else {
      if (recognition) {
        recognition.stop()
      }
      isRecording.value = false
    }
    interimTranscript.value = ''
  }

  function toggleRecording() {
    if (isRecording.value) {
      stopRecording()
    } else {
      startRecording()
    }
  }

  /** Changes the backend, stopping any recording in progress first. */
  function setMode(mode: VoiceMode) {
    if (isRecording.value) {
      stopRecording()
    }
    voiceMode.value = mode
  }

  // ── Audio playback ──

  /** Replaces `lastAudioUrl` with an object URL for `blob`, revoking the old one. */
  function saveAudioForPlayback(blob: Blob) {
    if (lastAudioUrl.value) {
      URL.revokeObjectURL(lastAudioUrl.value)
    }
    lastAudioUrl.value = URL.createObjectURL(blob)
  }

  /** Plays the last saved segment, interrupting any playback already running. */
  function playLastAudio() {
    if (!lastAudioUrl.value) return
    if (audioElement) {
      audioElement.pause()
      audioElement = null
    }
    isPlayingAudio.value = true
    audioElement = new Audio(lastAudioUrl.value)
    audioElement.onended = () => { isPlayingAudio.value = false }
    audioElement.onerror = () => { isPlayingAudio.value = false }
    audioElement.play().catch(() => { isPlayingAudio.value = false })
  }

  function clearTranscript() {
    transcript.value = ''
    interimTranscript.value = ''
    lastProcessedResult = ''
  }

  // ── Lifecycle ──

  /**
   * Detects the platform and audio format, loads devices, subscribes to
   * transcriptions and selects whisper mode. If the Whisper status is not
   * `'ready'` yet, polls for up to 60 s and then falls back to web mode.
   * Safe to call more than once.
   */
  async function init(): Promise<void> {
    disposed = false
    checkMobile()
    supportedMimeType = detectAudioFormat()
    await loadAudioDevices(true)

    // cleanup() ran while devices were loading: do not subscribe or start polling.
    if (disposed) return

    // Subscribe to whisper transcriptions via singleton (replacing any earlier
    // subscription so repeated init() calls do not duplicate transcript text).
    unsubTranscription?.()
    unsubTranscription = onTranscription(handleTranscription)
    clearGpuPoll()

    // GPU is the default mode — always start as whisper
    voiceMode.value = 'whisper'

    // If already ready, done
    if (whisperStatus.value === 'ready') return

    // Poll for up to 60 seconds waiting for GPU to connect
    const GPU_POLL_INTERVAL = 2000
    const GPU_POLL_MAX = 60000
    let elapsed = 0

    gpuPollTimer = window.setInterval(() => {
      elapsed += GPU_POLL_INTERVAL
      if (whisperStatus.value === 'ready') {
        // GPU connected — stay in whisper mode
        clearGpuPoll()
        console.log('[VoiceInput] GPU connected after', elapsed, 'ms')
        return
      }
      if (elapsed >= GPU_POLL_MAX) {
        // Timeout — fallback to web speech
        clearGpuPoll()
        voiceMode.value = 'web'
        console.warn('[VoiceInput] GPU timeout after 60s, falling back to Web Speech')
      }
    }, GPU_POLL_INTERVAL)
  }

  /** Stops recording and playback and releases timers, subscriptions, the microphone and the playback URL. */
  function cleanup() {
    disposed = true
    clearGpuPoll()
    if (isRecording.value) {
      stopRecording()
    }
    if (recognition) {
      recognition.abort()
      recognition = null
    }
    if (unsubTranscription) {
      unsubTranscription()
      unsubTranscription = null
    }
    clearChunkTimer()
    if (mediaStream) {
      releaseStream(mediaStream)
    }
    if (audioElement) {
      audioElement.pause()
      audioElement = null
    }
    isPlayingAudio.value = false
    // Object URLs keep their blob alive until revoked.
    if (lastAudioUrl.value) {
      URL.revokeObjectURL(lastAudioUrl.value)
      lastAudioUrl.value = ''
    }
  }

  return {
    isRecording,
    transcript,
    interimTranscript,
    error,
    voiceMode,
    whisperStatus,
    audioDevices,
    selectedDeviceId,
    isAndroid,
    lastAudioUrl,
    isPlayingAudio,

    startRecording,
    stopRecording,
    toggleRecording,
    setMode,
    loadAudioDevices,
    selectMicrophone,
    clearTranscript,
    playLastAudio,
    init,
    cleanup
  }
}