fix: Improve Whisper transcription with WebM to WAV conversion
- Add ffmpeg conversion from WebM/Opus to WAV (16kHz mono PCM)
- Optimize transcription parameters (VAD, temperature, beam_size)
- Add Honduras Spanish context prompt with local expressions
- Fix chunk accumulation display in voice panel
- Add 1.5s recording buffer after releasing Ctrl+Space
- Skip small audio chunks (<5KB) that cause ffmpeg errors
- Use large-v3 model for better accuracy
This commit is contained in:
@@ -42,6 +42,7 @@ const connected = ref(false)
|
||||
let keyDownTime = 0
|
||||
let holdTimeout: number | null = null
|
||||
const isPushToTalk = ref(false)
|
||||
let pendingWhisperSend = false // Flag to send transcript when Whisper responds
|
||||
|
||||
// ============ WHISPER MODE ============
|
||||
const useWhisper = ref(false)
|
||||
@@ -194,19 +195,27 @@ function connectWhisperSocket() {
|
||||
const fullText = msg.text.trim()
|
||||
|
||||
if (msg.partial) {
|
||||
// For partial results, show as interim (will be replaced)
|
||||
// Only show text that's new since last transcription
|
||||
const newText = fullText.substring(lastTranscriptLength).trim()
|
||||
if (newText) {
|
||||
interimTranscript.value = newText
|
||||
console.log(`[Voice] 🔄 WHISPER partial:`, newText)
|
||||
}
|
||||
} else {
|
||||
// Final result - replace everything
|
||||
// For partial results, show full accumulated transcription
|
||||
transcript.value = fullText + ' '
|
||||
interimTranscript.value = ''
|
||||
console.log(`[Voice] 🔄 WHISPER partial:`, fullText)
|
||||
} else {
|
||||
// Final result
|
||||
transcript.value = fullText + ' '
|
||||
interimTranscript.value = ''
|
||||
lastTranscriptLength = 0
|
||||
console.log(`[Voice] 🎯 WHISPER-GPU (${msg.model}/${msg.device}):`, fullText)
|
||||
|
||||
// Auto-send if push-to-talk was waiting for this
|
||||
if (pendingWhisperSend) {
|
||||
pendingWhisperSend = false
|
||||
console.log('[Voice] Whisper response received, sending transcript')
|
||||
if (transcript.value.trim()) {
|
||||
sendTranscriptAndClose()
|
||||
} else {
|
||||
isPushToTalk.value = false
|
||||
close()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Update last transcript length for next partial
|
||||
@@ -214,6 +223,11 @@ function connectWhisperSocket() {
|
||||
} else if (msg.error) {
|
||||
error.value = msg.error
|
||||
console.error('[Voice] Whisper error:', msg.error)
|
||||
// Clear pending send on error
|
||||
if (pendingWhisperSend) {
|
||||
pendingWhisperSend = false
|
||||
isPushToTalk.value = false
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
@@ -263,7 +277,6 @@ async function startWhisperRecording() {
|
||||
// Start recording
|
||||
mediaRecorder.start(100) // Collect data every 100ms
|
||||
isRecording.value = true
|
||||
interimTranscript.value = 'Listening (Whisper GPU)...'
|
||||
|
||||
// Send chunks periodically for progressive transcription
|
||||
chunkInterval = window.setInterval(() => {
|
||||
@@ -283,6 +296,16 @@ function sendAudioChunk(isFinal: boolean) {
|
||||
|
||||
// Always send ALL accumulated audio (webm needs header from first chunk)
|
||||
const audioBlob = new Blob(audioChunks, { type: 'audio/webm' })
|
||||
const chunkCount = audioChunks.length
|
||||
|
||||
// Skip if audio is too small (< 5KB) - WebM header alone is ~1-2KB
|
||||
if (audioBlob.size < 5000) {
|
||||
console.log(`[Voice] Skipping small chunk (${audioBlob.size} bytes)`)
|
||||
if (isFinal) {
|
||||
audioChunks = []
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Clear chunks only if final
|
||||
if (isFinal) {
|
||||
@@ -295,16 +318,13 @@ function sendAudioChunk(isFinal: boolean) {
|
||||
const base64 = (reader.result as string).split(',')[1]
|
||||
|
||||
if (whisperSocket?.readyState === WebSocket.OPEN) {
|
||||
if (!isFinal) {
|
||||
interimTranscript.value = 'Processing...'
|
||||
}
|
||||
whisperSocket.send(JSON.stringify({
|
||||
type: 'transcribe',
|
||||
audio: base64,
|
||||
language: 'es',
|
||||
partial: !isFinal
|
||||
}))
|
||||
console.log(`[Voice] Sent ${isFinal ? 'FINAL' : 'partial'} audio (${audioChunks.length} chunks, ${audioBlob.size} bytes)`)
|
||||
console.log(`[Voice] Sent ${isFinal ? 'FINAL' : 'partial'} audio (${chunkCount} chunks, ${audioBlob.size} bytes)`)
|
||||
}
|
||||
}
|
||||
reader.readAsDataURL(audioBlob)
|
||||
@@ -524,21 +544,31 @@ function handleKeyUp(e: KeyboardEvent) {
|
||||
holdTimeout = null
|
||||
}
|
||||
|
||||
// If was push-to-talk recording, stop and send after 1200ms
|
||||
// If was push-to-talk recording, continue recording for 1.5s buffer then stop
|
||||
if (isPushToTalk.value && isRecording.value) {
|
||||
console.log('[Voice] Stopping recording, will send in 1200ms')
|
||||
stopRecording()
|
||||
console.log('[Voice] Key released, continuing recording for 1.5s buffer...')
|
||||
|
||||
// Keep recording for 1.5s more (UX buffer for trailing words)
|
||||
setTimeout(() => {
|
||||
console.log('[Voice] Sending transcript:', transcript.value.trim())
|
||||
console.log('[Voice] Socket state:', socket?.readyState)
|
||||
if (transcript.value.trim()) {
|
||||
sendTranscriptAndClose()
|
||||
console.log('[Voice] Buffer complete, stopping recording')
|
||||
stopRecording()
|
||||
|
||||
if (useWhisper.value) {
|
||||
// For Whisper: wait for server response (handled in onmessage)
|
||||
console.log('[Voice] Waiting for Whisper transcription...')
|
||||
pendingWhisperSend = true
|
||||
} else {
|
||||
// No transcript, just close
|
||||
isPushToTalk.value = false
|
||||
close()
|
||||
// For Web Speech API: send after short delay for final results
|
||||
setTimeout(() => {
|
||||
if (transcript.value.trim()) {
|
||||
sendTranscriptAndClose()
|
||||
} else {
|
||||
isPushToTalk.value = false
|
||||
close()
|
||||
}
|
||||
}, 300)
|
||||
}
|
||||
}, 1200)
|
||||
}, 1500)
|
||||
}
|
||||
|
||||
keyDownTime = 0
|
||||
|
||||
Reference in New Issue
Block a user