feat: Add Whisper GPU speech-to-text with progressive transcription

- Add faster-whisper Python server for GPU-accelerated transcription
- Support dual mode: Web Speech API or Whisper GPU (toggleable)
- Progressive transcription every 3 seconds while recording
- Separate terminal server process (stable during hot-reload)
- Add Ctrl+V paste and Ctrl+C copy support in FloatingTerminal
- Add MCP tools: whisper_start, whisper_stop, whisper_toggle, whisper_status
- Update package.json with separate api/terminal/frontend processes
2026-02-13 23:47:52 -06:00
parent e867b7873e
commit 638e6ac8e0
10 changed files with 1009 additions and 31 deletions
--- a/frontend/src/components/FloatingTerminal.vue
+++ b/frontend/src/components/FloatingTerminal.vue
@@ -250,13 +250,38 @@ function initTerminal() {
    }
  })

-  // Capture Ctrl+E even when terminal has focus
+  // Capture Ctrl+E and Ctrl+V when terminal has focus
  terminal.attachCustomKeyEventHandler((e) => {
+    // Ctrl+E: Toggle terminal
    if (e.ctrlKey && e.key === 'e') {
      e.preventDefault()
      toggleTerminal()
-      return false // Prevent terminal from processing
+      return false
    }
+
+    // Ctrl+V: Paste from clipboard
+    if (e.ctrlKey && e.key === 'v' && e.type === 'keydown') {
+      e.preventDefault()
+      navigator.clipboard.readText().then((text) => {
+        if (text && socket && socket.readyState === WebSocket.OPEN) {
+          socket.send(JSON.stringify({ type: 'input', data: text }))
+        }
+      }).catch((err) => {
+        console.error('[Terminal] Clipboard read failed:', err)
+      })
+      return false
+    }
+
+    // Ctrl+C: Copy selection (if any)
+    if (e.ctrlKey && e.key === 'c' && e.type === 'keydown') {
+      const selection = terminal?.getSelection()
+      if (selection) {
+        navigator.clipboard.writeText(selection).catch(console.error)
+        return false
+      }
+      // If no selection, let Ctrl+C pass through as SIGINT
+    }
+
    return true // Let terminal handle other keys
  })
 }
--- a/frontend/src/components/FloatingVoice.vue
+++ b/frontend/src/components/FloatingVoice.vue
@@ -30,19 +30,32 @@ const isDragging = ref(false)
 const dragOffset = ref({ x: 0, y: 0 })
 const containerRef = ref<HTMLElement | null>(null)

-// Speech recognition
+// Speech recognition (Web Speech API)
 let recognition: SpeechRecognition | null = null

-// WebSocket connection (own session)
+// WebSocket connection to terminal
 const WS_URL = `ws://${window.location.hostname}:4103`
 let socket: WebSocket | null = null
 const connected = ref(false)

-// Push-to-talk state (Ctrl+S)
+// Push-to-talk state (Ctrl+Space)
 let keyDownTime = 0
 let holdTimeout: number | null = null
 const isPushToTalk = ref(false)

+// ============ WHISPER MODE ============
+// Mirrors the `enabled` flag reported by the /api/whisper/status and /api/whisper/toggle endpoints
+const useWhisper = ref(false)
+// Set from the endpoints' `running` flag and when the Whisper socket opens or reports 'ready';
+// cleared when that socket closes or errors
+const whisperReady = ref(false)
+// True while a toggle request is in flight (drives the button's loading style)
+const whisperLoading = ref(false)
+const WHISPER_WS_URL = `ws://${window.location.hostname}:4104`
+let whisperSocket: WebSocket | null = null
+let mediaRecorder: MediaRecorder | null = null
+// Every recorded blob of the current recording, in order
+let audioChunks: Blob[] = []
+let lastTranscriptLength = 0 // Track length of last transcription to show only new text
+// Handle of the setInterval timer that sends partial audio while recording
+let chunkInterval: number | null = null
+const CHUNK_INTERVAL_MS = 3000 // Send audio every 3 seconds
+// Microphone stream obtained from getUserMedia for the current recording
+let mediaStream: MediaStream | null = null
+
 const displayText = computed(() => {
  if (interimTranscript.value) {
    return transcript.value + ' ' + interimTranscript.value
@@ -73,7 +86,7 @@ function initRecognition() {
  const rec = new SpeechRecognition()
  rec.continuous = true
  rec.interimResults = true
-  rec.lang = 'es-ES'
+  rec.lang = 'es-419' // Latin American Spanish (better for accents)

  rec.onresult = (event: SpeechRecognitionEvent) => {
    let interim = ''
@@ -105,7 +118,7 @@ function initRecognition() {
  }

  rec.onend = () => {
-    if (isRecording.value) {
+    if (isRecording.value && !useWhisper.value) {
      // Restart if still recording (browser stops after silence)
      rec.start()
    }
@@ -114,6 +127,215 @@ function initRecognition() {
  return rec
 }

+// ============ WHISPER FUNCTIONS ============
+
+// Asks the API server for the current Whisper state and mirrors it into
+// `useWhisper` / `whisperReady`.
+// Returns the parsed response body, or null when the request fails, the server
+// answers with a non-2xx status, or the body cannot be parsed; in all of those
+// cases both flags are reset to false.
+async function checkWhisperStatus() {
+  try {
+    const res = await fetch(`http://${window.location.hostname}:4100/api/whisper/status`)
+    if (!res.ok) {
+      // An error page must not be interpreted as a status payload
+      throw new Error(`Whisper status request failed with HTTP ${res.status}`)
+    }
+    const data = await res.json()
+    // Coerce so a missing field cannot put `undefined` into a boolean ref
+    useWhisper.value = Boolean(data.enabled)
+    whisperReady.value = Boolean(data.running)
+    return data
+  } catch {
+    useWhisper.value = false
+    whisperReady.value = false
+    return null
+  }
+}
+
+// Asks the API server to flip between Whisper GPU and Web Speech mode, then
+// mirrors the reported state locally and opens or closes the Whisper socket
+// to match. `whisperLoading` is true for the duration of the request.
+// Failures (network error, non-2xx status, unparsable body) leave the current
+// mode untouched and surface a message through `error`.
+async function toggleWhisperMode() {
+  whisperLoading.value = true
+  error.value = ''
+
+  try {
+    const res = await fetch(`http://${window.location.hostname}:4100/api/whisper/toggle`, {
+      method: 'POST'
+    })
+    if (!res.ok) {
+      // Without this check an HTTP error with a JSON body would be read as
+      // "disabled" and silently switch the UI to Web Speech
+      throw new Error(`Whisper toggle request failed with HTTP ${res.status}`)
+    }
+    const data = await res.json()
+
+    useWhisper.value = Boolean(data.enabled)
+    whisperReady.value = Boolean(data.running)
+
+    if (useWhisper.value) {
+      canvasStore.showNotification('Whisper GPU enabled', 'success')
+      connectWhisperSocket()
+    } else {
+      canvasStore.showNotification('Using Web Speech API', 'info')
+      disconnectWhisperSocket()
+    }
+  } catch (e: unknown) {
+    error.value = 'Failed to toggle Whisper'
+    console.error('[Voice] Whisper toggle error:', e)
+  } finally {
+    whisperLoading.value = false
+  }
+}
+
+// Opens the WebSocket to the Whisper server (no-op when one is already open or
+// still connecting) and wires its handlers:
+//  - 'ready' messages and the open event mark Whisper as ready
+//  - partial 'transcription' messages show only the text added since the
+//    previous transcription as interim text
+//  - final 'transcription' messages replace the transcript
+//  - close / error mark Whisper as not ready
+function connectWhisperSocket() {
+  // A socket that is still completing its handshake must be reused too,
+  // otherwise two quick calls would open two connections
+  if (
+    whisperSocket?.readyState === WebSocket.OPEN ||
+    whisperSocket?.readyState === WebSocket.CONNECTING
+  ) return
+
+  console.log('[Voice] Connecting to Whisper server...')
+  const ws = new WebSocket(WHISPER_WS_URL)
+  whisperSocket = ws
+
+  // Events of a socket that has since been replaced or disconnected must not
+  // touch the shared state (e.g. a late close event flipping `whisperReady`)
+  const isCurrent = () => whisperSocket === ws
+
+  ws.onopen = () => {
+    if (!isCurrent()) return
+    console.log('[Voice] Whisper WebSocket connected')
+    whisperReady.value = true
+  }
+
+  ws.onmessage = (event) => {
+    if (!isCurrent()) return
+    try {
+      const msg = JSON.parse(event.data)
+
+      if (msg.type === 'ready') {
+        console.log('[Voice] Whisper ready:', msg.model, msg.device)
+        whisperReady.value = true
+      } else if (msg.type === 'transcription') {
+        if (msg.success && msg.text) {
+          const fullText = msg.text.trim()
+
+          if (msg.partial) {
+            // For partial results, show as interim (will be replaced)
+            // Only show text that's new since last transcription
+            const newText = fullText.substring(lastTranscriptLength).trim()
+            if (newText) {
+              interimTranscript.value = newText
+              console.log(`[Voice] 🔄 WHISPER partial:`, newText)
+            }
+            // Remember how much has been shown for the next partial
+            lastTranscriptLength = fullText.length
+          } else {
+            // Final result - replace everything and start the next recording
+            // from offset 0 (previously this reset was immediately overwritten)
+            transcript.value = fullText + ' '
+            interimTranscript.value = ''
+            lastTranscriptLength = 0
+            console.log(`[Voice] 🎯 WHISPER-GPU (${msg.model}/${msg.device}):`, fullText)
+          }
+        } else if (msg.error) {
+          error.value = msg.error
+          console.error('[Voice] Whisper error:', msg.error)
+        }
+      }
+    } catch (e) {
+      console.error('[Voice] Whisper message error:', e)
+    }
+  }
+
+  ws.onclose = () => {
+    if (!isCurrent()) return
+    console.log('[Voice] Whisper WebSocket closed')
+    whisperReady.value = false
+  }
+
+  ws.onerror = (e) => {
+    if (!isCurrent()) return
+    console.error('[Voice] Whisper WebSocket error:', e)
+    whisperReady.value = false
+  }
+}
+
+// Drops the Whisper WebSocket (closing it when one exists) and marks Whisper
+// as not ready.
+function disconnectWhisperSocket() {
+  const current = whisperSocket
+  if (current !== null) {
+    current.close()
+    whisperSocket = null
+  }
+  whisperReady.value = false
+}
+
+// Starts capturing microphone audio for Whisper: requests the microphone,
+// records webm/opus in 100 ms slices into `audioChunks`, and every
+// CHUNK_INTERVAL_MS sends the audio gathered so far for a partial
+// transcription. On failure the microphone is released again and the problem
+// is reported through `error`.
+async function startWhisperRecording() {
+  let stream: MediaStream | null = null
+
+  try {
+    stream = await navigator.mediaDevices.getUserMedia({ audio: true })
+
+    const recorder = new MediaRecorder(stream, {
+      mimeType: 'audio/webm;codecs=opus'
+    })
+
+    // Reset state for new recording
+    audioChunks = []
+    lastTranscriptLength = 0
+
+    recorder.ondataavailable = (event) => {
+      if (event.data.size > 0) {
+        audioChunks.push(event.data)
+      }
+    }
+
+    // Start recording
+    recorder.start(100) // Collect data every 100ms
+
+    // Publish the shared handles only once recording really started, so a
+    // failed start never leaves a half-initialised recorder behind
+    mediaStream = stream
+    mediaRecorder = recorder
+    isRecording.value = true
+    interimTranscript.value = 'Listening (Whisper GPU)...'
+
+    // A timer left over from a previous recording would otherwise keep firing
+    if (chunkInterval) {
+      clearInterval(chunkInterval)
+    }
+
+    // Send chunks periodically for progressive transcription
+    chunkInterval = window.setInterval(() => {
+      if (audioChunks.length > 0 && whisperSocket?.readyState === WebSocket.OPEN) {
+        sendAudioChunk(false) // false = partial, don't clear
+      }
+    }, CHUNK_INTERVAL_MS)
+
+  } catch (e: unknown) {
+    // Release the microphone if it was acquired before the failure
+    // (e.g. the MediaRecorder constructor rejected the mime type)
+    if (stream) {
+      stream.getTracks().forEach(track => track.stop())
+      if (mediaStream === stream) {
+        mediaStream = null
+      }
+    }
+    const message = e instanceof Error ? e.message : String(e)
+    error.value = `Microphone error: ${message}`
+    console.error('[Voice] Microphone error:', e)
+  }
+}
+
+// Sends all audio recorded so far to the Whisper server as one base64 webm
+// payload.
+// isFinal = false: request a partial transcription and keep the chunks.
+// isFinal = true:  request the final transcription and clear the chunks.
+// Does nothing when no audio has been recorded; logs a warning (and sends
+// nothing) when the socket is not open by the time the audio is encoded.
+function sendAudioChunk(isFinal: boolean) {
+  // Captured up front: the array is cleared below for final sends, which made
+  // the log line always report 0 chunks
+  const chunkCount = audioChunks.length
+  if (chunkCount === 0) return
+
+  // Always send ALL accumulated audio (webm needs header from first chunk)
+  const audioBlob = new Blob(audioChunks, { type: 'audio/webm' })
+
+  // Clear chunks only if final
+  if (isFinal) {
+    audioChunks = []
+    lastTranscriptLength = 0
+  }
+
+  const reader = new FileReader()
+  reader.onloadend = () => {
+    // `result` is null when reading failed; onloadend fires in that case too
+    if (typeof reader.result !== 'string') {
+      console.error('[Voice] Failed to encode audio:', reader.error)
+      return
+    }
+    // Strip the "data:...;base64," prefix of the data URL
+    const base64 = reader.result.split(',')[1]
+
+    if (whisperSocket?.readyState !== WebSocket.OPEN) {
+      console.warn(`[Voice] Whisper socket not open, dropped ${isFinal ? 'FINAL' : 'partial'} audio`)
+      return
+    }
+
+    if (!isFinal) {
+      interimTranscript.value = 'Processing...'
+    }
+    whisperSocket.send(JSON.stringify({
+      type: 'transcribe',
+      audio: base64,
+      language: 'es',
+      partial: !isFinal
+    }))
+    console.log(`[Voice] Sent ${isFinal ? 'FINAL' : 'partial'} audio (${chunkCount} chunks, ${audioBlob.size} bytes)`)
+  }
+  reader.readAsDataURL(audioBlob)
+}
+
+// Ends the current Whisper recording: stops the periodic partial sends, stops
+// the recorder, sends everything recorded as the final transcription request,
+// and releases the microphone. Safe to call when nothing is recording.
+function stopWhisperRecording() {
+  // Clear the chunk interval
+  if (chunkInterval) {
+    clearInterval(chunkInterval)
+    chunkInterval = null
+  }
+
+  const sendFinalChunk = () => {
+    if (audioChunks.length > 0) {
+      sendAudioChunk(true) // true = final
+    }
+  }
+
+  if (mediaRecorder && mediaRecorder.state !== 'inactive') {
+    // stop() delivers the audio still buffered in the recorder through one
+    // last `dataavailable` event and then fires `stop`. Sending the final
+    // chunk from that event (instead of before stop()) keeps the tail of the
+    // utterance in the final transcription.
+    mediaRecorder.addEventListener('stop', sendFinalChunk, { once: true })
+    mediaRecorder.stop()
+  } else {
+    // Recorder already stopped (or never started): send whatever is left
+    sendFinalChunk()
+  }
+
+  // Stop media stream
+  if (mediaStream) {
+    mediaStream.getTracks().forEach(track => track.stop())
+    mediaStream = null
+  }
+
+  isRecording.value = false
+}
+
 function toggleRecording() {
  if (isRecording.value) {
    stopRecording()
@@ -124,24 +346,35 @@ function toggleRecording() {

 function startRecording() {
  error.value = ''
-  if (!recognition) {
-    recognition = initRecognition()
-  }
-  if (recognition) {
-    try {
-      recognition.start()
-      isRecording.value = true
-    } catch (e) {
-      console.error('[Voice] Failed to start:', e)
+
+  // Whisper is used only when it is both enabled and ready; when it is enabled
+  // but not ready this silently falls back to the Web Speech API.
+  if (useWhisper.value && whisperReady.value) {
+    // Use Whisper GPU mode (async; it reports its own failures via `error`)
+    startWhisperRecording()
+  } else {
+    // Use Web Speech API, creating the recognizer on first use
+    if (!recognition) {
+      recognition = initRecognition()
+    }
+    if (recognition) {
+      try {
+        recognition.start()
+        isRecording.value = true
+      } catch (e) {
+        console.error('[Voice] Failed to start:', e)
+      }
    }
  }
 }

 function stopRecording() {
-  if (recognition) {
-    recognition.stop()
+  // Stop whichever engine is actually running. The mode flag alone is not
+  // reliable here: startRecording() falls back to Web Speech when Whisper is
+  // enabled but not ready, and the mode can be toggled while recording.
+  if (mediaRecorder !== null && mediaRecorder.state !== 'inactive') {
+    stopWhisperRecording()
+  } else {
+    if (recognition) {
+      recognition.stop()
+    }
+    isRecording.value = false
  }
-  isRecording.value = false
  interimTranscript.value = ''
 }

@@ -209,6 +442,7 @@ function sendTranscript() {

 function close() {
  stopRecording()
+  // presumably discards any leftover transcript text before hiding the panel; confirm against clearTranscript()
+  clearTranscript()
  isOpen.value = false
 }

@@ -349,17 +583,28 @@ function sendTranscriptAndClose() {
  typeChar()
 }

-onMounted(() => {
+onMounted(async () => {
  recognition = initRecognition()
  // Use capture phase to intercept before terminal or other elements
  document.addEventListener('keydown', handleKeyDown, { capture: true })
  document.addEventListener('keyup', handleKeyUp, { capture: true })
+
+  // Check Whisper status on mount; checkWhisperStatus() updates `useWhisper`
+  // from the server, and the socket is opened only when Whisper mode is enabled
+  await checkWhisperStatus()
+  if (useWhisper.value) {
+    connectWhisperSocket()
+  }
 })

 onBeforeUnmount(() => {
  stopRecording()
  recognition = null
  disconnectSocket()
+  disconnectWhisperSocket()
+  if (chunkInterval) clearInterval(chunkInterval)
+  if (mediaStream) {
+    mediaStream.getTracks().forEach(track => track.stop())
+  }
  document.removeEventListener('keydown', handleKeyDown, { capture: true })
  document.removeEventListener('keyup', handleKeyUp, { capture: true })
  document.removeEventListener('mousemove', onDrag)
@@ -408,8 +653,23 @@ defineExpose({
              </svg>
              <span>Voice</span>
              <i class="dot" :class="{ recording: isRecording, ptt: isPushToTalk }"></i>
+              <span class="mode-badge" :class="{ gpu: useWhisper }">
+                {{ useWhisper ? 'GPU' : 'Web' }}
+              </span>
            </div>
            <div class="window-controls">
+              <button
+                class="whisper-toggle"
+                :class="{ active: useWhisper, loading: whisperLoading }"
+                @click.stop="toggleWhisperMode"
+                :title="useWhisper ? 'Using Whisper GPU - Click to use Web Speech' : 'Using Web Speech - Click to use Whisper GPU'"
+              >
+                <svg width="10" height="10" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
+                  <rect x="4" y="4" width="16" height="16" rx="2"/>
+                  <line x1="9" y1="9" x2="9" y2="15"/>
+                  <line x1="15" y1="9" x2="15" y2="15"/>
+                </svg>
+              </button>
              <button class="x" @click="close" title="Close">
                <svg width="8" height="8" viewBox="0 0 10 10">
                  <line x1="0" y1="0" x2="10" y2="10" stroke="currentColor" stroke-width="1.5"/>
@@ -545,6 +805,53 @@ defineExpose({
  box-shadow: 0 0 6px #f90;
 }

+/* Small label in the title bar showing the active speech engine ("Web" or "GPU") */
+.mode-badge {
+  font-size: 8px;
+  padding: 1px 4px;
+  border-radius: 3px;
+  background: rgba(0, 0, 0, 0.2);
+  color: #555;
+  font-weight: 600;
+  text-transform: uppercase;
+}
+
+/* Green variant applied while Whisper mode is enabled */
+.mode-badge.gpu {
+  background: linear-gradient(135deg, #10b981, #059669);
+  color: #fff;
+  box-shadow: 0 0 4px rgba(16, 185, 129, 0.5);
+}
+
+/* Title-bar button that switches between Web Speech and Whisper GPU */
+.whisper-toggle {
+  width: 20px;
+  height: 18px;
+  display: flex;
+  align-items: center;
+  justify-content: center;
+  background: rgba(255, 255, 255, 0.3);
+  border: 1px solid rgba(0, 0, 0, 0.1);
+  border-radius: 3px;
+  color: #666;
+  cursor: pointer;
+  transition: all 0.15s;
+}
+
+.whisper-toggle:hover {
+  background: rgba(255, 255, 255, 0.5);
+}
+
+/* Whisper mode is on */
+.whisper-toggle.active {
+  background: linear-gradient(180deg, #10b981 0%, #059669 100%);
+  border-color: #047857;
+  color: #fff;
+}
+
+/* Toggle request in flight: amber and pulsing; declared after .active so it wins when both apply */
+.whisper-toggle.loading {
+  animation: pulse 0.6s infinite;
+  background: linear-gradient(180deg, #f59e0b 0%, #d97706 100%);
+  border-color: #b45309;
+  color: #fff;
+}
+
@keyframes pulse {
  0%, 100% { opacity: 1; }
  50% { opacity: 0.5; }