fix: robust whisper recording with stop/restart segment strategy

Replace fragile chunked WebM recording with stop/restart approach:
- Each segment is a complete, independently-decodable WebM file
- Eliminates audio corruption from concatenating partial WebM clusters
- Streaming partial transcription via periodic stop/restart every 3s
- Transcript text accumulated per segment on the client
- Proper lifecycle: onstop sends segment and restarts recorder
This commit is contained in:
2026-02-20 00:06:18 -06:00
parent 016e92ffe5
commit b7f03a777b

View File

@@ -0,0 +1,535 @@
/**
* useVoiceInput - Dual-mode voice input composable (Web Speech API + Whisper GPU)
*
* Extracts core voice functionality from FloatingVoice.vue (System A)
* but uses the whisperSocket.ts singleton (System B pattern).
*
* Does NOT include: PTT, terminal sending, panel UI, audio playback/saving.
*/
import { ref, type Ref } from 'vue'
import {
initWhisperSocket,
sendAudio,
onTranscription,
getWhisperStatus,
isConnected,
type WhisperStatus
} from '../services/whisperSocket'
// ── Web Speech API types ──
/**
 * Local typing for the event passed to `SpeechRecognition.onresult`.
 * Only the fields this file reads are declared.
 */
interface SpeechRecognitionEvent extends Event {
// Index from which the onresult handler in this file starts iterating `results`.
resultIndex: number
// Recognition results; each entry is checked for `isFinal` and its first alternative is read.
results: SpeechRecognitionResultList
}
/**
 * Local typing for the event passed to `SpeechRecognition.onerror`.
 */
interface SpeechRecognitionErrorEvent extends Event {
// Error code string; this file special-cases 'not-allowed' (microphone access denied).
error: string
// Optional human-readable detail; not read anywhere in this file.
message?: string
}
/**
 * Local typing for the Web Speech API recognizer instance created from
 * `window.SpeechRecognition` / `window.webkitSpeechRecognition`.
 * Covers only the members this composable configures or calls.
 */
interface SpeechRecognition extends EventTarget {
// Set to false on Android and true elsewhere by initRecognition.
continuous: boolean
// Always enabled here so interim text can be shown while speaking.
interimResults: boolean
// Recognition language tag (taken from the composable's `language` option).
lang: string
onresult: ((event: SpeechRecognitionEvent) => void) | null
onerror: ((event: SpeechRecognitionErrorEvent) => void) | null
// Fired when recognition ends; used here to auto-restart on non-Android browsers.
onend: (() => void) | null
start(): void
stop(): void
// Called from cleanup() to tear the recognizer down.
abort(): void
}
// ── Types ──
/**
 * Transcription backend selector:
 * - 'web': browser Web Speech API
 * - 'whisper': audio segments sent through the whisperSocket service
 */
export type VoiceMode = 'web' | 'whisper'
/**
 * Public surface returned by `useVoiceInput`: reactive state plus the
 * functions that drive recording, device selection and playback.
 */
export interface VoiceInput {
// True while a recording session is considered active.
isRecording: Ref<boolean>
// Accumulated final text; new results/segments are appended with a trailing space.
transcript: Ref<string>
// Non-final text from the Web Speech API; cleared when recording stops.
interimTranscript: Ref<string>
// Last error message; reset to '' at the start of startRecording().
error: Ref<string>
// Currently selected transcription backend.
voiceMode: Ref<VoiceMode>
// Status ref obtained from the whisperSocket service (getWhisperStatus()).
whisperStatus: Ref<WhisperStatus>
// Audio input devices found by loadAudioDevices().
audioDevices: Ref<MediaDeviceInfo[]>
// deviceId used for Whisper recording; '' means no explicit device constraint.
selectedDeviceId: Ref<string>
// True when the user agent string contains "Android".
isAndroid: Ref<boolean>
// Object URL of the last final Whisper segment, or '' when none was saved.
lastAudioUrl: Ref<string>
// True while playLastAudio() playback is in progress.
isPlayingAudio: Ref<boolean>
startRecording: () => void
stopRecording: () => void
toggleRecording: () => void
// Stops any active recording, then switches backend.
setMode: (mode: VoiceMode) => void
// Pass true to enumerate devices without first requesting microphone access.
loadAudioDevices: (skipPermission?: boolean) => Promise<void>
// Selects a device; restarts recording if one is in progress.
selectMicrophone: (deviceId: string) => void
clearTranscript: () => void
playLastAudio: () => void
// Detects platform/format, subscribes to transcriptions and defaults to Whisper mode.
init: () => Promise<void>
// Releases timers, subscriptions, the recognizer and the microphone stream.
cleanup: () => void
}
export function useVoiceInput(options?: {
language?: string
}): VoiceInput {
const language = options?.language ?? 'es-419'
// ── Reactive state ──
// These refs are returned to the caller as the composable's public state.
const isRecording = ref(false)
const transcript = ref('')
const interimTranscript = ref('')
const error = ref('')
// Starts as 'web'; init() switches it to 'whisper'.
const voiceMode = ref<VoiceMode>('web')
// Status ref provided by the whisperSocket service rather than created here.
const whisperStatus = getWhisperStatus()
const audioDevices = ref<MediaDeviceInfo[]>([])
const selectedDeviceId = ref<string>('')
const isAndroid = ref(false)
const lastAudioUrl = ref<string>('')
const isPlayingAudio = ref(false)
// ── Internal state ──
// Web Speech recognizer, created lazily on the first web-mode startRecording().
let recognition: SpeechRecognition | null = null
// Last final Web Speech result; used to strip repeated prefixes on Android.
let lastProcessedResult = ''
// Audio element used by playLastAudio().
let audioElement: HTMLAudioElement | null = null
// Recorder for the Whisper segment currently being captured.
let mediaRecorder: MediaRecorder | null = null
// Data chunks collected for the current segment; reset for every new segment.
let audioChunks: Blob[] = []
// Id of the interval that stops the recorder periodically to emit a segment.
let chunkInterval: number | null = null
// Microphone stream shared by all segments of one Whisper recording session.
let mediaStream: MediaStream | null = null
// Default recording MIME type; init() replaces it with detectAudioFormat()'s result.
let supportedMimeType = 'audio/webm;codecs=opus'
// Unsubscribe function returned by onTranscription(); called in cleanup().
let unsubTranscription: (() => void) | null = null
// Length of one Whisper segment in milliseconds.
const CHUNK_INTERVAL_MS = 3000
// ── Mobile detection ──
/**
 * Record whether the browser identifies itself as Android.
 * The result is stored in `isAndroid` based on the user agent string.
 */
function checkMobile() {
  const androidPattern = /Android/i
  isAndroid.value = androidPattern.test(navigator.userAgent)
}
// ── Audio format detection ──
/**
 * Pick the first recording MIME type supported by the browser's MediaRecorder.
 *
 * Candidates are probed in preference order, WebM/Opus first.
 *
 * @returns The first supported MIME type, or '' when none is supported or the
 *   MediaRecorder API is not available in this environment.
 */
function detectAudioFormat(): string {
  // Without this guard a missing MediaRecorder global throws a ReferenceError.
  if (typeof MediaRecorder === 'undefined' || typeof MediaRecorder.isTypeSupported !== 'function') {
    return ''
  }
  const candidates = [
    'audio/webm;codecs=opus',
    'audio/webm',
    'audio/mp4',
    'audio/mp4;codecs=mp4a.40.2',
    'audio/aac',
    'audio/ogg;codecs=opus',
    'audio/wav'
  ]
  return candidates.find(format => MediaRecorder.isTypeSupported(format)) ?? ''
}
// ── Device selection ──
/**
 * Refresh `audioDevices` with the available audio inputs and pick a default.
 *
 * @param skipPermission When false (default), the microphone is opened and
 *   immediately released before enumerating — presumably so the browser
 *   prompts for permission and exposes device labels (confirm). When true,
 *   devices are enumerated directly.
 *
 * Failures are logged and swallowed; the device list is left unchanged.
 */
async function loadAudioDevices(skipPermission = false) {
  try {
    if (!skipPermission) {
      const probeStream = await navigator.mediaDevices.getUserMedia({ audio: true })
      for (const track of probeStream.getTracks()) {
        track.stop()
      }
    }
    const allDevices = await navigator.mediaDevices.enumerateDevices()
    audioDevices.value = allDevices.filter(device => device.kind === 'audioinput')
    // Keep an existing selection; otherwise default to the first input.
    const needsDefault = !selectedDeviceId.value && audioDevices.value.length > 0
    if (needsDefault) {
      selectedDeviceId.value = audioDevices.value[0]?.deviceId || ''
    }
  } catch (e) {
    console.error('[VoiceInput] Failed to enumerate devices:', e)
  }
}
/**
 * Select the microphone to use.
 * If a recording is in progress it is stopped and started again after 100 ms
 * so the new device takes effect.
 *
 * @param deviceId deviceId of the chosen audio input.
 */
function selectMicrophone(deviceId: string) {
  selectedDeviceId.value = deviceId
  if (!isRecording.value) return
  stopRecording()
  setTimeout(() => {
    startRecording()
  }, 100)
}
// ── Web Speech API ──
/**
 * Create and configure a Web Speech API recognizer.
 *
 * - Final results are appended to `transcript`; interim text goes to
 *   `interimTranscript`.
 * - On Android the recognizer is non-continuous, and final results that start
 *   with the previously processed result only contribute their new suffix.
 * - When recognition ends while still recording in 'web' mode, it is restarted
 *   on non-Android browsers; on Android the recording is marked as stopped.
 *
 * @returns The configured recognizer, or null (with `error` set) when the
 *   browser exposes neither `SpeechRecognition` nor `webkitSpeechRecognition`.
 */
function initRecognition(): SpeechRecognition | null {
  // The constructor is vendor-prefixed in some browsers, hence the untyped lookup.
  const SR = (window as any).SpeechRecognition || (window as any).webkitSpeechRecognition
  if (!SR) {
    error.value = 'Speech recognition not supported in this browser'
    return null
  }
  const rec = new SR() as SpeechRecognition
  rec.continuous = !isAndroid.value
  rec.interimResults = true
  rec.lang = language

  rec.onresult = (event: SpeechRecognitionEvent) => {
    let interim = ''
    let final = ''
    for (let i = event.resultIndex; i < event.results.length; i++) {
      const result = event.results[i]
      if (!result || !result[0]) continue
      if (result.isFinal) {
        final += result[0].transcript + ' '
      } else {
        interim += result[0].transcript
      }
    }
    if (final) {
      const trimmedFinal = final.trim()
      const previous = lastProcessedResult.trim()
      if (isAndroid.value && lastProcessedResult && trimmedFinal.startsWith(previous)) {
        // Android repeats earlier text in later finals: keep only the new tail.
        const newPart = trimmedFinal.slice(previous.length).trim()
        if (newPart) {
          transcript.value += newPart + ' '
          lastProcessedResult = trimmedFinal
        }
      } else {
        transcript.value += final
        lastProcessedResult = trimmedFinal
      }
    }
    interimTranscript.value = interim
  }

  rec.onerror = (event: SpeechRecognitionErrorEvent) => {
    console.error('[VoiceInput] Recognition error:', event.error)
    if (event.error === 'not-allowed') {
      error.value = 'Microphone access denied'
    } else {
      error.value = `Error: ${event.error}`
    }
    isRecording.value = false
  }

  rec.onend = () => {
    if (!isRecording.value || voiceMode.value !== 'web') return
    if (isAndroid.value) {
      isRecording.value = false
      return
    }
    try {
      rec.start()
    } catch (e) {
      // start() throws if the recognizer cannot be (re)started; without this
      // guard isRecording would stay true although nothing is listening.
      console.error('[VoiceInput] Failed to restart Web Speech:', e)
      isRecording.value = false
    }
  }

  return rec
}
// ── Whisper recording ──
// Strategy: stop/restart MediaRecorder every CHUNK_INTERVAL_MS so each
// segment sent to the server is a complete, independently-decodable WebM file.
// Transcript text is accumulated (appended) on the client per segment.
/**
 * Handle a transcription message delivered by the whisperSocket subscription.
 *
 * Partial results that arrive after recording has stopped are dropped;
 * non-partial results are still processed because they arrive asynchronously.
 * Successful text is appended to `transcript`; otherwise a reported error is
 * stored in `error`.
 */
function handleTranscription(msg: {
  success?: boolean
  text?: string
  error?: string
  partial?: boolean
}) {
  const stalePartial = !isRecording.value && msg.partial
  if (stalePartial) return

  if (msg.success && msg.text) {
    const cleaned = msg.text.trim()
    if (cleaned) {
      transcript.value += `${cleaned} `
    }
    interimTranscript.value = ''
    return
  }

  if (msg.error) {
    error.value = msg.error
  }
}
/**
 * Base64-encode an audio segment and send it to the Whisper service.
 *
 * @param blob    Recorded audio segment.
 * @param partial True for intermediate segments, false for the final one.
 *
 * The language sent is the primary subtag of the composable's `language`
 * option (default 'es-419' -> 'es', matching the previously hard-coded value).
 * NOTE(review): assumes sendAudio expects a bare language code — confirm
 * against whisperSocket.
 */
function sendWhisperBlob(blob: Blob, partial: boolean) {
  const whisperLanguage = language.split('-')[0]?.toLowerCase() || 'es'
  const reader = new FileReader()
  reader.onloadend = () => {
    // loadend also fires after a failed read, in which case result is null.
    const dataUrl = reader.result
    if (typeof dataUrl !== 'string') {
      console.error('[VoiceInput] Failed to read audio segment:', reader.error)
      return
    }
    // Data URL shape: "data:<mime>;base64,<payload>"
    const base64 = dataUrl.split(',')[1]
    if (base64 === undefined) {
      console.error('[VoiceInput] Unexpected audio data URL format')
      return
    }
    sendAudio(base64, whisperLanguage, partial)
  }
  reader.readAsDataURL(blob)
}
/**
 * Create (but do not start) a MediaRecorder for the next segment on the
 * current `mediaStream` and store it in `mediaRecorder`.
 *
 * When the recorder stops, its chunks are assembled into one blob and sent
 * for transcription. Intermediate segments below a minimum size are skipped;
 * the final segment is always sent and also kept for playback. If recording is
 * still active the next segment is started on the same stream, otherwise the
 * stream is released.
 *
 * Does nothing when there is no active stream.
 */
function createRecorderSegment() {
  if (!mediaStream) return

  // Intermediate segments smaller than this are treated as silence.
  const MIN_SEGMENT_BYTES = 2000

  const recorderOptions: MediaRecorderOptions = supportedMimeType
    ? { mimeType: supportedMimeType }
    : {}

  // Local handle: the shared `mediaRecorder` is reassigned for every segment,
  // so the handlers below must not read it to learn about *this* recorder.
  const recorder = new MediaRecorder(mediaStream, recorderOptions)
  mediaRecorder = recorder
  audioChunks = []

  recorder.ondataavailable = (event) => {
    if (event.data.size > 0) {
      audioChunks.push(event.data)
    }
  }

  recorder.onstop = () => {
    if (audioChunks.length > 0) {
      const mimeType = recorder.mimeType || supportedMimeType || 'audio/webm'
      const segmentBlob = new Blob(audioChunks, { type: mimeType })
      audioChunks = []
      const isFinal = !isRecording.value
      if (isFinal) {
        // Only the last segment is kept for playback.
        saveAudioForPlayback(segmentBlob)
      }
      if (segmentBlob.size >= MIN_SEGMENT_BYTES || isFinal) {
        sendWhisperBlob(segmentBlob, !isFinal)
      }
    }

    if (isRecording.value && mediaStream) {
      try {
        // Still recording: start the next segment on the same stream.
        createRecorderSegment()
        mediaRecorder?.start()
        return
      } catch (e) {
        // Creating or starting a recorder throws e.g. when the stream is no
        // longer active. End the session cleanly instead of leaving
        // isRecording true with nothing capturing.
        console.error('[VoiceInput] Failed to start next segment:', e)
        error.value = `Recording error: ${e instanceof Error ? e.message : String(e)}`
        isRecording.value = false
        if (chunkInterval !== null) {
          clearInterval(chunkInterval)
          chunkInterval = null
        }
      }
    }

    // Final segment (or failed restart): release the microphone.
    if (mediaStream) {
      mediaStream.getTracks().forEach(track => track.stop())
      mediaStream = null
    }
  }
}
/**
 * Start a Whisper recording session.
 *
 * Opens the microphone (honouring `selectedDeviceId` when set), starts the
 * first segment and installs a timer that stops the recorder every
 * CHUNK_INTERVAL_MS; the recorder's stop handler sends the segment and starts
 * the next one.
 *
 * Sets `error` and returns without recording when the Whisper server is not
 * connected or the microphone/recorder cannot be started. A stream acquired
 * before such a failure is released again.
 */
async function startWhisperRecording() {
  if (!isConnected()) {
    error.value = 'Whisper server not connected'
    return
  }

  let acquiredStream: MediaStream | null = null
  try {
    const audioConstraints: MediaTrackConstraints = {
      echoCancellation: true,
      noiseSuppression: true,
      autoGainControl: true,
      ...(selectedDeviceId.value ? { deviceId: { exact: selectedDeviceId.value } } : {})
    }
    acquiredStream = await navigator.mediaDevices.getUserMedia({ audio: audioConstraints })
    mediaStream = acquiredStream

    createRecorderSegment()
    if (!mediaRecorder) {
      throw new Error('MediaRecorder could not be created')
    }
    mediaRecorder.start()
    isRecording.value = true

    // Never leave an earlier timer running: its id would be lost on reassignment.
    if (chunkInterval !== null) {
      clearInterval(chunkInterval)
    }
    // Stop/restart every CHUNK_INTERVAL_MS for streaming partial transcription.
    chunkInterval = window.setInterval(() => {
      if (isRecording.value && mediaRecorder?.state === 'recording' && isConnected()) {
        mediaRecorder.stop() // onstop sends the segment and restarts the recorder
      }
    }, CHUNK_INTERVAL_MS)

    // Refresh the device list now that the microphone has been opened.
    void loadAudioDevices(true)
  } catch (e: unknown) {
    // If the failure happened after the microphone was opened, release it so
    // the device does not stay captured.
    if (acquiredStream) {
      acquiredStream.getTracks().forEach(track => track.stop())
      if (mediaStream === acquiredStream) {
        mediaStream = null
      }
    }
    const message = e instanceof Error ? e.message : String(e)
    error.value = `Microphone error: ${message}`
    console.error('[VoiceInput] Microphone error:', e)
  }
}
/**
 * Stop the Whisper recording session.
 *
 * Cancels the segment timer and marks recording as stopped. If a recorder is
 * still active it is stopped, and its stop handler sends the final segment and
 * releases the stream; otherwise the stream is released here.
 */
function stopWhisperRecording() {
  if (chunkInterval) {
    clearInterval(chunkInterval)
    chunkInterval = null
  }
  isRecording.value = false

  const recorder = mediaRecorder
  if (recorder && recorder.state !== 'inactive') {
    recorder.stop() // onstop sends final segment + cleanup
    return
  }

  // No active recorder to do the cleanup, so release the microphone directly.
  mediaStream?.getTracks().forEach(track => track.stop())
  mediaStream = null
}
// ── Public recording controls ──
/**
 * Start recording with the current backend.
 *
 * In 'whisper' mode with a connected server, a Whisper session is started.
 * Otherwise the Web Speech API is used; if Whisper was selected but is not
 * connected, `voiceMode` is switched to 'web' first. Clears `error` up front.
 */
function startRecording() {
  error.value = ''
  const wantsWhisper = voiceMode.value === 'whisper'

  if (wantsWhisper && isConnected()) {
    startWhisperRecording()
    return
  }

  // Fallback to Web Speech API (or explicit web mode)
  if (wantsWhisper && !isConnected()) {
    voiceMode.value = 'web'
  }
  recognition = recognition || initRecognition()
  if (!recognition) return

  try {
    recognition.start()
    isRecording.value = true
  } catch (e) {
    console.error('[VoiceInput] Failed to start Web Speech:', e)
  }
}
/**
 * Stop whichever recording backend is currently running and clear interim text.
 *
 * The decision is based on the resources that are actually active rather than
 * on `voiceMode`: the mode can change while a Whisper session is running (the
 * GPU-timeout fallback in init(), or a caller writing the exposed ref), and
 * routing on the mode alone would then leave the recorder and microphone open.
 */
function stopRecording() {
  const whisperActive =
    chunkInterval !== null ||
    mediaStream !== null ||
    (mediaRecorder !== null && mediaRecorder.state !== 'inactive')

  if (whisperActive) {
    stopWhisperRecording()
  } else {
    if (recognition) {
      recognition.stop()
    }
    isRecording.value = false
  }
  interimTranscript.value = ''
}
/**
 * Stop recording if it is active, otherwise start it.
 */
function toggleRecording() {
  const action = isRecording.value ? stopRecording : startRecording
  action()
}
/**
 * Switch the transcription backend.
 * An active recording is stopped first, using the mode it was started with.
 *
 * @param mode Backend to use from now on.
 */
function setMode(mode: VoiceMode) {
  const wasRecording = isRecording.value
  if (wasRecording) stopRecording()
  voiceMode.value = mode
}
// ── Audio playback ──
/**
 * Expose a recorded blob through `lastAudioUrl` for later playback.
 * The previously stored object URL, if any, is revoked first.
 *
 * @param blob Audio to make playable.
 */
function saveAudioForPlayback(blob: Blob) {
  const previousUrl = lastAudioUrl.value
  if (previousUrl) URL.revokeObjectURL(previousUrl)
  lastAudioUrl.value = URL.createObjectURL(blob)
}
/**
 * Play the audio referenced by `lastAudioUrl`, if there is one.
 * Any playback already in progress is paused and replaced.
 * `isPlayingAudio` is true during playback and reset when it ends or fails.
 */
function playLastAudio() {
  const url = lastAudioUrl.value
  if (!url) return

  audioElement?.pause()
  audioElement = null

  const markStopped = () => {
    isPlayingAudio.value = false
  }

  isPlayingAudio.value = true
  const player = new Audio(url)
  audioElement = player
  player.onended = markStopped
  player.onerror = markStopped
  player.play().catch(markStopped)
}
/**
 * Reset the final and interim transcript, and forget the last processed
 * Web Speech result used for Android de-duplication.
 */
function clearTranscript() {
  lastProcessedResult = ''
  for (const target of [transcript, interimTranscript]) {
    target.value = ''
  }
}
// ── Lifecycle ──
// Id of the interval that waits for the Whisper GPU backend to become ready.
let gpuPollTimer: number | null = null

/**
 * Initialise the composable.
 *
 * Detects the platform and recording format, loads the device list without
 * requesting microphone access, subscribes to Whisper transcriptions and
 * selects 'whisper' as the default mode. If the Whisper status is not 'ready'
 * yet, it is polled every 2 s for up to 60 s; on timeout the mode falls back
 * to 'web'.
 *
 * Safe to call more than once: a subscription or poll timer left over from a
 * previous call is released first instead of being stacked.
 *
 * NOTE(review): `initWhisperSocket` is imported by this file but not called
 * here — presumably the socket is initialised elsewhere; confirm.
 */
async function init() {
  checkMobile()
  supportedMimeType = detectAudioFormat()
  await loadAudioDevices(true)

  // Release handles from an earlier init() so they are not overwritten and leaked.
  if (unsubTranscription) {
    unsubTranscription()
  }
  if (gpuPollTimer !== null) {
    clearInterval(gpuPollTimer)
    gpuPollTimer = null
  }

  // Subscribe to whisper transcriptions via singleton
  unsubTranscription = onTranscription(handleTranscription)

  // GPU is the default mode — always start as whisper
  voiceMode.value = 'whisper'
  if (whisperStatus.value === 'ready') return

  // Poll for up to 60 seconds waiting for GPU to connect
  const GPU_POLL_INTERVAL = 2000
  const GPU_POLL_MAX = 60000
  let elapsed = 0

  const stopPolling = () => {
    if (gpuPollTimer !== null) {
      clearInterval(gpuPollTimer)
      gpuPollTimer = null
    }
  }

  gpuPollTimer = window.setInterval(() => {
    elapsed += GPU_POLL_INTERVAL
    if (whisperStatus.value === 'ready') {
      // GPU connected — stay in whisper mode
      stopPolling()
      console.log('[VoiceInput] GPU connected after', elapsed, 'ms')
      return
    }
    if (elapsed >= GPU_POLL_MAX) {
      // Timeout — fallback to web speech
      stopPolling()
      voiceMode.value = 'web'
      console.warn('[VoiceInput] GPU timeout after 60s, falling back to Web Speech')
    }
  }, GPU_POLL_INTERVAL)
}
/**
 * Release everything the composable holds: the GPU poll timer, any active
 * recording, the Web Speech recognizer, the transcription subscription, the
 * segment timer, the microphone stream, and playback resources (audio element
 * and the object URL behind `lastAudioUrl`).
 */
function cleanup() {
  if (gpuPollTimer) {
    clearInterval(gpuPollTimer)
    gpuPollTimer = null
  }
  if (isRecording.value) {
    stopRecording()
  }
  if (recognition) {
    recognition.abort()
    recognition = null
  }
  if (unsubTranscription) {
    unsubTranscription()
    unsubTranscription = null
  }
  if (chunkInterval) {
    clearInterval(chunkInterval)
    chunkInterval = null
  }
  if (mediaStream) {
    mediaStream.getTracks().forEach(track => track.stop())
    mediaStream = null
  }
  // Stop playback so audio does not keep playing after teardown.
  if (audioElement) {
    audioElement.pause()
    audioElement = null
  }
  isPlayingAudio.value = false
  // Object URLs keep their blob alive until revoked.
  if (lastAudioUrl.value) {
    URL.revokeObjectURL(lastAudioUrl.value)
    lastAudioUrl.value = ''
  }
}
// Public API of the composable (matches the VoiceInput interface):
// reactive state first, then the control functions.
return {
isRecording,
transcript,
interimTranscript,
error,
voiceMode,
whisperStatus,
audioDevices,
selectedDeviceId,
isAndroid,
lastAudioUrl,
isPlayingAudio,
startRecording,
stopRecording,
toggleRecording,
setMode,
loadAudioDevices,
selectMicrophone,
clearTranscript,
playLastAudio,
init,
cleanup
}
}