fix: robust whisper recording with stop/restart segment strategy
Replace fragile chunked WebM recording with stop/restart approach: - Each segment is a complete, independently-decodable WebM file - Eliminates audio corruption from concatenating partial WebM clusters - Streaming partial transcription via periodic stop/restart every 3s - Transcript text accumulated per segment on the client - Proper lifecycle: onstop sends segment and restarts recorder
This commit is contained in:
535
frontend/src/composables/useVoiceInput.ts
Normal file
535
frontend/src/composables/useVoiceInput.ts
Normal file
@@ -0,0 +1,535 @@
|
||||
/**
|
||||
* useVoiceInput - Dual-mode voice input composable (Web Speech API + Whisper GPU)
|
||||
*
|
||||
* Extracts core voice functionality from FloatingVoice.vue (System A)
|
||||
* but uses the whisperSocket.ts singleton (System B pattern).
|
||||
*
|
||||
* Does NOT include: PTT, terminal sending, panel UI, audio playback/saving.
|
||||
*/
|
||||
|
||||
import { ref, type Ref } from 'vue'
|
||||
import {
|
||||
initWhisperSocket,
|
||||
sendAudio,
|
||||
onTranscription,
|
||||
getWhisperStatus,
|
||||
isConnected,
|
||||
type WhisperStatus
|
||||
} from '../services/whisperSocket'
|
||||
|
||||
// ── Web Speech API types ──
|
||||
|
||||
/**
 * Minimal shape of the Web Speech API result event as consumed by this
 * composable's `onresult` handler.
 */
interface SpeechRecognitionEvent extends Event {
  /** Position in `results` where the `onresult` handler starts reading. */
  resultIndex: number
  /** Result list; each entry is read for its first alternative and `isFinal`. */
  results: SpeechRecognitionResultList
}
|
||||
|
||||
/**
 * Minimal shape of the Web Speech API error event as consumed by this
 * composable's `onerror` handler.
 */
interface SpeechRecognitionErrorEvent extends Event {
  /** Error code; `'not-allowed'` is mapped to a microphone-permission message. */
  error: string
  /** Optional human-readable detail (not read by this file). */
  message?: string
}
|
||||
|
||||
/**
 * Local declaration of the recognizer object obtained from
 * `window.SpeechRecognition` / `window.webkitSpeechRecognition`.
 * Only the members this composable touches are declared.
 */
interface SpeechRecognition extends EventTarget {
  // ── Configuration ──
  continuous: boolean
  interimResults: boolean
  lang: string

  // ── Event handlers ──
  onresult: ((event: SpeechRecognitionEvent) => void) | null
  onerror: ((event: SpeechRecognitionErrorEvent) => void) | null
  onend: (() => void) | null

  // ── Control ──
  start(): void
  stop(): void
  abort(): void
}
|
||||
|
||||
// ── Types ──
|
||||
|
||||
/** Recognition backend: browser Web Speech API (`'web'`) or the Whisper socket (`'whisper'`). */
export type VoiceMode = 'web' | 'whisper'
|
||||
|
||||
/** Public surface returned by `useVoiceInput()`. */
export interface VoiceInput {
  // ── Reactive state ──

  /** True while a recording session (either backend) is active. */
  isRecording: Ref<boolean>
  /** Accumulated recognized text; each new piece is appended with a trailing space. */
  transcript: Ref<string>
  /** In-progress (non-final) text from the Web Speech backend. */
  interimTranscript: Ref<string>
  /** Last error message, or an empty string. */
  error: Ref<string>
  /** Currently selected recognition backend. */
  voiceMode: Ref<VoiceMode>
  /** Status ref obtained from the whisperSocket service. */
  whisperStatus: Ref<WhisperStatus>
  /** Enumerated `audioinput` devices. */
  audioDevices: Ref<MediaDeviceInfo[]>
  /** Device id used for Whisper capture; empty means no explicit device. */
  selectedDeviceId: Ref<string>
  /** True when the user agent string matches Android. */
  isAndroid: Ref<boolean>
  /** Object URL of the last final Whisper segment, or an empty string. */
  lastAudioUrl: Ref<string>
  /** True while `playLastAudio()` playback is running. */
  isPlayingAudio: Ref<boolean>

  // ── Actions ──

  startRecording: () => void
  stopRecording: () => void
  toggleRecording: () => void
  setMode: (mode: VoiceMode) => void
  loadAudioDevices: (skipPermission?: boolean) => Promise<void>
  selectMicrophone: (deviceId: string) => void
  clearTranscript: () => void
  playLastAudio: () => void
  init: () => Promise<void>
  cleanup: () => void
}
|
||||
|
||||
export function useVoiceInput(options?: {
|
||||
language?: string
|
||||
}): VoiceInput {
|
||||
// Locale assigned to the Web Speech recognizer; 'es-419' unless the caller overrides it.
const language = options?.language ?? 'es-419'

// ── Reactive state ──
const isRecording = ref<boolean>(false)
const transcript = ref<string>('')
const interimTranscript = ref<string>('')
const error = ref<string>('')
const voiceMode = ref<VoiceMode>('web')
// Status ref comes from the whisperSocket service rather than being created here.
const whisperStatus = getWhisperStatus()
const audioDevices = ref<MediaDeviceInfo[]>([])
const selectedDeviceId = ref('')
const isAndroid = ref<boolean>(false)
const lastAudioUrl = ref('')
const isPlayingAudio = ref<boolean>(false)

// ── Internal state ──
// Web Speech recognizer, created on first use by startRecording().
let recognition: SpeechRecognition | null = null
// Last final text seen by onresult; used by the Android prefix de-duplication.
let lastProcessedResult = ''
// Element used by playLastAudio().
let audioElement: HTMLAudioElement | null = null
// Recorder for the Whisper segment currently being captured.
let mediaRecorder: MediaRecorder | null = null
// Data pieces collected for the current segment.
let audioChunks: Blob[] = []
// Id of the interval that stops the recorder every CHUNK_INTERVAL_MS.
let chunkInterval: number | null = null
// Microphone stream shared by all segments of one Whisper session.
let mediaStream: MediaStream | null = null
// Overwritten with the detectAudioFormat() result during init().
let supportedMimeType = 'audio/webm;codecs=opus'
// Disposer returned by onTranscription().
let unsubTranscription: (() => void) | null = null
// Length of one Whisper segment in milliseconds.
const CHUNK_INTERVAL_MS = 3000
|
||||
|
||||
// ── Mobile detection ──
|
||||
|
||||
/** Records in `isAndroid` whether the user agent string mentions Android. */
function checkMobile() {
  const androidPattern = /Android/i
  isAndroid.value = androidPattern.test(navigator.userAgent)
}
|
||||
|
||||
// ── Audio format detection ──
|
||||
|
||||
/**
 * Picks the first MIME type, in preference order, that `MediaRecorder`
 * reports as supported.
 *
 * @returns The chosen MIME type, or an empty string when none is supported
 *          or when `MediaRecorder` is not available in this environment
 *          (callers treat '' as "let the recorder use its own default").
 */
function detectAudioFormat(): string {
  // Without this guard a missing MediaRecorder global throws a ReferenceError,
  // which would make init() reject before the Web Speech fallback is set up.
  if (typeof MediaRecorder === 'undefined' || typeof MediaRecorder.isTypeSupported !== 'function') {
    return ''
  }

  const candidates = [
    'audio/webm;codecs=opus',
    'audio/webm',
    'audio/mp4',
    'audio/mp4;codecs=mp4a.40.2',
    'audio/aac',
    'audio/ogg;codecs=opus',
    'audio/wav'
  ]

  return candidates.find(format => MediaRecorder.isTypeSupported(format)) ?? ''
}
|
||||
|
||||
// ── Device selection ──
|
||||
|
||||
/**
 * Refreshes `audioDevices` with the available audio inputs and, when nothing
 * is selected yet, selects the first one.
 *
 * @param skipPermission When false (default), the microphone is opened and
 *        immediately released before enumerating — presumably so the browser
 *        returns labelled devices (TODO confirm).
 */
async function loadAudioDevices(skipPermission = false) {
  try {
    if (!skipPermission) {
      const probeStream = await navigator.mediaDevices.getUserMedia({ audio: true })
      for (const track of probeStream.getTracks()) {
        track.stop()
      }
    }

    const allDevices = await navigator.mediaDevices.enumerateDevices()
    const inputs = allDevices.filter(device => device.kind === 'audioinput')
    audioDevices.value = inputs

    const nothingSelected = !selectedDeviceId.value
    if (nothingSelected && inputs.length > 0) {
      selectedDeviceId.value = inputs[0]?.deviceId || ''
    }
  } catch (e) {
    // Enumeration failures are logged only; the previous device list is kept.
    console.error('[VoiceInput] Failed to enumerate devices:', e)
  }
}
|
||||
|
||||
/**
 * Stores the chosen input device. If a recording is in progress it is
 * stopped and started again shortly after so the new device takes effect.
 */
function selectMicrophone(deviceId: string) {
  selectedDeviceId.value = deviceId
  if (!isRecording.value) return

  const restartDelayMs = 100
  stopRecording()
  setTimeout(() => startRecording(), restartDelayMs)
}
|
||||
|
||||
// ── Web Speech API ──
|
||||
|
||||
/**
 * Builds and wires a Web Speech recognizer.
 *
 * @returns The configured recognizer, or null (with `error` set) when the
 *          browser exposes neither `SpeechRecognition` nor the webkit variant.
 */
function initRecognition(): SpeechRecognition | null {
  const RecognitionCtor =
    (window as any).SpeechRecognition || (window as any).webkitSpeechRecognition
  if (!RecognitionCtor) {
    error.value = 'Speech recognition not supported in this browser'
    return null
  }

  const rec = new RecognitionCtor() as SpeechRecognition
  // Continuous mode is switched off on Android.
  rec.continuous = !isAndroid.value
  rec.interimResults = true
  rec.lang = language

  rec.onresult = (event: SpeechRecognitionEvent) => {
    let interimText = ''
    let finalText = ''

    for (let idx = event.resultIndex; idx < event.results.length; idx++) {
      const entry = event.results[idx]
      const best = entry?.[0]
      if (!entry || !best) continue

      if (entry.isFinal) {
        finalText += best.transcript + ' '
      } else {
        interimText += best.transcript
      }
    }

    if (finalText) {
      const cleaned = finalText.trim()
      // On Android a final result may repeat the previous final text as a
      // prefix; in that case only the new tail is appended.
      const extendsPrevious =
        isAndroid.value &&
        lastProcessedResult !== '' &&
        cleaned.startsWith(lastProcessedResult.trim())

      if (extendsPrevious) {
        const tail = cleaned.slice(lastProcessedResult.trim().length).trim()
        if (tail) {
          transcript.value += tail + ' '
          lastProcessedResult = cleaned
        }
      } else {
        transcript.value += finalText
        lastProcessedResult = cleaned
      }
    }

    interimTranscript.value = interimText
  }

  rec.onerror = (event: SpeechRecognitionErrorEvent) => {
    console.error('[VoiceInput] Recognition error:', event.error)
    error.value =
      event.error === 'not-allowed' ? 'Microphone access denied' : `Error: ${event.error}`
    isRecording.value = false
  }

  rec.onend = () => {
    // Only react when the session is still meant to be live in web mode.
    const stillLive = isRecording.value && voiceMode.value === 'web'
    if (!stillLive) return

    if (isAndroid.value) {
      // Android: the session simply ends.
      isRecording.value = false
      return
    }
    // Elsewhere: restart so recognition keeps running.
    rec.start()
  }

  return rec
}
|
||||
|
||||
// ── Whisper recording ──
|
||||
// Strategy: stop/restart MediaRecorder every CHUNK_INTERVAL_MS so each
|
||||
// segment sent to the server is a complete, independently-decodable WebM file.
|
||||
// Transcript text is accumulated (appended) on the client per segment.
|
||||
|
||||
/**
 * Applies one transcription message from the Whisper socket.
 *
 * Messages flagged `partial` are ignored once recording has stopped; all
 * other messages are processed even after stop because they arrive
 * asynchronously. Successful text is appended to `transcript`.
 *
 * NOTE(review): segments are sent with `partial = true` while recording, so
 * if the server echoes that flag, a reply landing just after stop is dropped
 * here — confirm whether that can lose text.
 */
function handleTranscription(msg: {
  success?: boolean
  text?: string
  error?: string
  partial?: boolean
}) {
  if (!isRecording.value && msg.partial) return

  if (msg.success && msg.text) {
    const cleaned = msg.text.trim()
    if (cleaned) {
      transcript.value += `${cleaned} `
    }
    interimTranscript.value = ''
    return
  }

  if (msg.error) {
    error.value = msg.error
  }
}
|
||||
|
||||
/**
 * Base64-encodes an audio segment and sends it to the Whisper socket.
 *
 * @param blob    Recorded audio segment.
 * @param partial Forwarded to `sendAudio`; true for intermediate segments.
 *
 * The language sent is the primary subtag of the composable's `language`
 * option ('es-419' -> 'es'), so the default behaves as before while a custom
 * locale is no longer overridden by a hard-coded 'es'.
 */
function sendWhisperBlob(blob: Blob, partial: boolean) {
  const whisperLanguage = language.split('-')[0] || 'es'
  const reader = new FileReader()

  reader.onloadend = () => {
    const dataUrl = reader.result
    // onloadend also fires on failure, in which case result is not a string.
    if (reader.error || typeof dataUrl !== 'string') {
      console.error('[VoiceInput] Failed to read audio segment:', reader.error)
      return
    }

    // A data URL looks like "data:<mime>;base64,<payload>"; keep the payload.
    const commaAt = dataUrl.indexOf(',')
    const base64 = commaAt >= 0 ? dataUrl.slice(commaAt + 1) : ''
    if (!base64) return

    sendAudio(base64, whisperLanguage, partial)
  }

  reader.readAsDataURL(blob)
}
|
||||
|
||||
/**
 * Creates a new MediaRecorder on the current `mediaStream` for one segment
 * and stores it in `mediaRecorder`. The caller is responsible for calling
 * `start()` on it. Does nothing when there is no stream.
 *
 * When the recorder stops, its data is sent as one blob. If recording is
 * still active a new segment is created and started on the same stream;
 * otherwise the stream is released. A failure while starting the next
 * segment ends the session cleanly instead of leaving `isRecording` stuck.
 */
function createRecorderSegment() {
  if (!mediaStream) return

  const recorderOptions: MediaRecorderOptions = {}
  if (supportedMimeType) {
    recorderOptions.mimeType = supportedMimeType
  }

  // Keep a local handle: the handlers below must refer to THIS recorder even
  // after `mediaRecorder` has been reassigned to the next segment.
  const recorder = new MediaRecorder(mediaStream, recorderOptions)
  mediaRecorder = recorder
  audioChunks = []

  // Stops every track of the shared stream and forgets it.
  const releaseStream = () => {
    if (mediaStream) {
      mediaStream.getTracks().forEach(track => track.stop())
      mediaStream = null
    }
  }

  recorder.ondataavailable = (event) => {
    if (event.data.size > 0) {
      audioChunks.push(event.data)
    }
  }

  recorder.onstop = () => {
    if (audioChunks.length > 0) {
      const mimeType = recorder.mimeType || supportedMimeType || 'audio/webm'
      const segmentBlob = new Blob(audioChunks, { type: mimeType })
      audioChunks = []
      const isFinal = !isRecording.value

      if (isFinal) {
        // Save last segment for playback
        saveAudioForPlayback(segmentBlob)
      }

      // Skip tiny segments (silence) for intermediate, always send final
      if (segmentBlob.size >= 2000 || isFinal) {
        sendWhisperBlob(segmentBlob, !isFinal)
      }
    }

    if (!isRecording.value || !mediaStream) {
      // Final: cleanup stream
      releaseStream()
      return
    }

    // Still recording: start the next segment on the same stream.
    try {
      createRecorderSegment()
      mediaRecorder?.start()
    } catch (e: unknown) {
      // E.g. the input device disappeared; end the session instead of leaving
      // the UI in "recording" with nothing being captured.
      console.error('[VoiceInput] Failed to start next segment:', e)
      error.value = `Microphone error: ${e instanceof Error ? e.message : String(e)}`
      isRecording.value = false
      if (chunkInterval) {
        clearInterval(chunkInterval)
        chunkInterval = null
      }
      releaseStream()
    }
  }
}
|
||||
|
||||
/**
 * Opens the microphone and starts a segmented Whisper recording session.
 *
 * Sets `error` and returns without recording when the Whisper socket is not
 * connected or the microphone cannot be opened. `isRecording` becomes true
 * only after the first segment has started.
 */
async function startWhisperRecording() {
  if (!isConnected()) {
    error.value = 'Whisper server not connected'
    return
  }

  let stream: MediaStream | null = null

  try {
    const audioConstraints: MediaTrackConstraints = {
      echoCancellation: true,
      noiseSuppression: true,
      autoGainControl: true,
      ...(selectedDeviceId.value ? { deviceId: { exact: selectedDeviceId.value } } : {})
    }

    stream = await navigator.mediaDevices.getUserMedia({ audio: audioConstraints })

    // Another start may have completed while we were waiting for the
    // microphone; keep that session and release this duplicate stream.
    if (isRecording.value) {
      stream.getTracks().forEach(track => track.stop())
      return
    }

    mediaStream = stream
    createRecorderSegment()
    if (!mediaRecorder) {
      throw new Error('MediaRecorder could not be created')
    }
    mediaRecorder.start()
    isRecording.value = true

    // Never leave an older rotation timer running alongside the new one.
    if (chunkInterval) {
      clearInterval(chunkInterval)
    }

    // Stop/restart every CHUNK_INTERVAL_MS for streaming partial transcription
    // Each segment is a complete WebM file (own EBML header)
    chunkInterval = window.setInterval(() => {
      if (isRecording.value && mediaRecorder?.state === 'recording' && isConnected()) {
        mediaRecorder.stop() // onstop sends segment and restarts recorder
      }
    }, CHUNK_INTERVAL_MS)

    // Refresh device labels after first recording
    void loadAudioDevices(true)
  } catch (e: unknown) {
    // Release the microphone if it was opened before the failure.
    if (stream) {
      stream.getTracks().forEach(track => track.stop())
      if (mediaStream === stream) {
        mediaStream = null
      }
    }
    const message = e instanceof Error ? e.message : String(e)
    error.value = `Microphone error: ${message}`
    console.error('[VoiceInput] Microphone error:', e)
  }
}
|
||||
|
||||
/**
 * Ends the Whisper session: cancels segment rotation, clears `isRecording`
 * and stops the active recorder. If no recorder is active the microphone
 * stream is released here directly.
 */
function stopWhisperRecording() {
  if (chunkInterval) {
    clearInterval(chunkInterval)
    chunkInterval = null
  }

  // Must be false before stop() so the onstop handler treats the segment as final.
  isRecording.value = false

  const recorder = mediaRecorder
  if (recorder && recorder.state !== 'inactive') {
    recorder.stop() // onstop sends final segment + cleanup
    return
  }

  if (mediaStream) {
    for (const track of mediaStream.getTracks()) {
      track.stop()
    }
    mediaStream = null
  }
}
|
||||
|
||||
// ── Public recording controls ──
|
||||
|
||||
/**
 * Starts recording with the active backend. Whisper is used only when it is
 * both selected and connected; otherwise the mode falls back to 'web' and
 * the Web Speech recognizer is started (created on first use).
 */
function startRecording() {
  error.value = ''

  const whisperSelected = voiceMode.value === 'whisper'
  if (whisperSelected && isConnected()) {
    startWhisperRecording()
    return
  }

  // Fallback to Web Speech API (or explicit web mode)
  if (whisperSelected && !isConnected()) {
    voiceMode.value = 'web'
  }

  if (recognition === null) {
    recognition = initRecognition()
  }
  if (!recognition) return

  try {
    recognition.start()
    isRecording.value = true
  } catch (e) {
    console.error('[VoiceInput] Failed to start Web Speech:', e)
  }
}
|
||||
|
||||
/**
 * Stops whichever backend is recording and clears the interim text.
 * The Whisper path is taken only when Whisper mode is selected and a
 * recorder exists; otherwise the Web Speech recognizer (if any) is stopped.
 */
function stopRecording() {
  const useWhisperPath = voiceMode.value === 'whisper' && mediaRecorder
  if (useWhisperPath) {
    stopWhisperRecording()
  } else {
    recognition?.stop()
    isRecording.value = false
  }
  interimTranscript.value = ''
}
|
||||
|
||||
/** Stops the recording when one is active, otherwise starts one. */
function toggleRecording() {
  const action = isRecording.value ? stopRecording : startRecording
  action()
}
|
||||
|
||||
/**
 * Switches the recognition backend. Any active recording is stopped first,
 * while the previous mode is still in effect.
 */
function setMode(mode: VoiceMode) {
  const wasRecording = isRecording.value
  if (wasRecording) stopRecording()
  voiceMode.value = mode
}
|
||||
|
||||
// ── Audio playback ──
|
||||
|
||||
/**
 * Publishes `blob` as the audio available to `playLastAudio()`, revoking the
 * object URL of the previously saved audio first.
 */
function saveAudioForPlayback(blob: Blob) {
  const previousUrl = lastAudioUrl.value
  if (previousUrl) {
    URL.revokeObjectURL(previousUrl)
  }
  lastAudioUrl.value = URL.createObjectURL(blob)
}
|
||||
|
||||
/**
 * Plays the last saved audio, interrupting any playback already in
 * progress. `isPlayingAudio` is true from the call until playback ends,
 * errors, or fails to start. Does nothing when no audio has been saved.
 */
function playLastAudio() {
  const url = lastAudioUrl.value
  if (!url) return

  if (audioElement) {
    audioElement.pause()
    audioElement = null
  }

  isPlayingAudio.value = true

  const markStopped = () => { isPlayingAudio.value = false }
  const player = new Audio(url)
  audioElement = player
  player.onended = markStopped
  player.onerror = markStopped
  player.play().catch(markStopped)
}
|
||||
|
||||
/** Resets the final and interim text and the Android de-duplication marker. */
function clearTranscript() {
  lastProcessedResult = ''
  interimTranscript.value = ''
  transcript.value = ''
}
|
||||
|
||||
// ── Lifecycle ──
|
||||
|
||||
// Id of the interval that waits for the Whisper GPU backend to become ready.
let gpuPollTimer: number | null = null

/**
 * Prepares the composable: detects platform and audio format, loads the
 * device list (without prompting for permission), subscribes to Whisper
 * transcriptions and selects Whisper as the default mode.
 *
 * If Whisper is not ready yet, its status is polled every 2 s for up to
 * 60 s; on timeout the mode falls back to 'web'.
 *
 * Safe to call more than once: a previous subscription and poll timer are
 * released first, so transcriptions are not delivered (and appended) twice.
 */
async function init() {
  checkMobile()
  supportedMimeType = detectAudioFormat()
  await loadAudioDevices(true)

  // Subscribe to whisper transcriptions via singleton, replacing any
  // subscription left over from an earlier init().
  if (unsubTranscription) {
    unsubTranscription()
  }
  unsubTranscription = onTranscription(handleTranscription)

  // GPU is the default mode — always start as whisper
  voiceMode.value = 'whisper'

  // Drop a poll loop left over from an earlier init().
  if (gpuPollTimer !== null) {
    clearInterval(gpuPollTimer)
    gpuPollTimer = null
  }

  // If already ready, done
  if (whisperStatus.value === 'ready') return

  // Poll for up to 60 seconds waiting for GPU to connect
  const GPU_POLL_INTERVAL = 2000
  const GPU_POLL_MAX = 60000
  let elapsed = 0

  const stopPolling = () => {
    if (gpuPollTimer !== null) {
      clearInterval(gpuPollTimer)
      gpuPollTimer = null
    }
  }

  gpuPollTimer = window.setInterval(() => {
    elapsed += GPU_POLL_INTERVAL

    if (whisperStatus.value === 'ready') {
      // GPU connected — stay in whisper mode
      stopPolling()
      console.log('[VoiceInput] GPU connected after', elapsed, 'ms')
      return
    }

    if (elapsed >= GPU_POLL_MAX) {
      // Timeout — fallback to web speech
      stopPolling()
      voiceMode.value = 'web'
      console.warn('[VoiceInput] GPU timeout after 60s, falling back to Web Speech')
    }
  }, GPU_POLL_INTERVAL)
}
|
||||
|
||||
/**
 * Releases everything the composable holds: timers, an active recording,
 * the Web Speech recognizer, the transcription subscription, the microphone
 * stream, and the playback element together with its object URL.
 */
function cleanup() {
  if (gpuPollTimer) {
    clearInterval(gpuPollTimer)
    gpuPollTimer = null
  }

  if (isRecording.value) {
    stopRecording()
  }

  if (recognition) {
    recognition.abort()
    recognition = null
  }

  if (unsubTranscription) {
    unsubTranscription()
    unsubTranscription = null
  }

  if (chunkInterval) {
    clearInterval(chunkInterval)
    chunkInterval = null
  }

  if (mediaStream) {
    mediaStream.getTracks().forEach(track => track.stop())
    mediaStream = null
  }

  // Stop playback; otherwise the Audio element keeps playing after teardown.
  if (audioElement) {
    audioElement.pause()
    audioElement = null
  }
  isPlayingAudio.value = false

  // Object URLs keep their blob alive until revoked.
  if (lastAudioUrl.value) {
    URL.revokeObjectURL(lastAudioUrl.value)
    lastAudioUrl.value = ''
  }
}
|
||||
|
||||
return {
  // ── Reactive state ──
  isRecording,
  transcript,
  interimTranscript,
  error,
  voiceMode,
  whisperStatus,
  audioDevices,
  selectedDeviceId,
  isAndroid,
  lastAudioUrl,
  isPlayingAudio,

  // ── Recording controls ──
  startRecording,
  stopRecording,
  toggleRecording,
  setMode,

  // ── Devices, transcript and playback ──
  loadAudioDevices,
  selectMicrophone,
  clearTranscript,
  playLastAudio,

  // ── Lifecycle ──
  init,
  cleanup
}
|
||||
}
|
||||
Reference in New Issue
Block a user