feat: Add Whisper GPU speech-to-text with progressive transcription

- Add faster-whisper Python server for GPU-accelerated transcription
- Support dual mode: Web Speech API or Whisper GPU (toggleable)
- Progressive transcription every 3 seconds while recording
- Separate terminal server process (stable during hot-reload)
- Add Ctrl+V paste and Ctrl+C copy support in FloatingTerminal
- Add MCP tools: whisper_start, whisper_stop, whisper_toggle, whisper_status
- Update package.json with separate api/terminal/frontend processes
2026-02-13 23:47:52 -06:00
parent e867b7873e
commit 638e6ac8e0
10 changed files with 1009 additions and 31 deletions
--- a/frontend/src/components/FloatingTerminal.vue
+++ b/frontend/src/components/FloatingTerminal.vue
@@ -250,13 +250,38 @@ function initTerminal() {
    }
  })
-  // Capture Ctrl+E even when terminal has focus
+  // Capture Ctrl+E and Ctrl+V when terminal has focus
  terminal.attachCustomKeyEventHandler((e) => {
    // Ctrl+E: Toggle terminal
    if (e.ctrlKey && e.key === 'e') {
      e.preventDefault()
      toggleTerminal()
-      return false // Prevent terminal from processing
+      return false
    }
    // Ctrl+V: Paste from clipboard
    if (e.ctrlKey && e.key === 'v' && e.type === 'keydown') {
      e.preventDefault()
      navigator.clipboard.readText().then((text) => {
        if (text && socket && socket.readyState === WebSocket.OPEN) {
          socket.send(JSON.stringify({ type: 'input', data: text }))
        }
      }).catch((err) => {
        console.error('[Terminal] Clipboard read failed:', err)
      })
      return false
    }
    // Ctrl+C: Copy selection (if any)
    if (e.ctrlKey && e.key === 'c' && e.type === 'keydown') {
      const selection = terminal?.getSelection()
      if (selection) {
        navigator.clipboard.writeText(selection).catch(console.error)
        return false
      }
      // If no selection, let Ctrl+C pass through as SIGINT
    }
    return true // Let terminal handle other keys
  })
 }
--- a/frontend/src/components/FloatingVoice.vue
+++ b/frontend/src/components/FloatingVoice.vue
@@ -30,19 +30,32 @@ const isDragging = ref(false)
 const dragOffset = ref({ x: 0, y: 0 })
 const containerRef = ref<HTMLElement | null>(null)
-// Speech recognition
+// Speech recognition (Web Speech API)
 let recognition: SpeechRecognition | null = null
-// WebSocket connection (own session)
+// WebSocket connection to terminal
 const WS_URL = `ws://${window.location.hostname}:4103`
 let socket: WebSocket | null = null
 const connected = ref(false)
-// Push-to-talk state (Ctrl+S)
+// Push-to-talk state (Ctrl+Space)
 let keyDownTime = 0
 let holdTimeout: number | null = null
 const isPushToTalk = ref(false)
 // ============ WHISPER MODE ============
 const useWhisper = ref(false)
 const whisperReady = ref(false)
 const whisperLoading = ref(false)
 const WHISPER_WS_URL = `ws://${window.location.hostname}:4104`
 let whisperSocket: WebSocket | null = null
 let mediaRecorder: MediaRecorder | null = null
 let audioChunks: Blob[] = []
 let lastTranscriptLength = 0 // Track length of last transcription to show only new text
 let chunkInterval: number | null = null
 const CHUNK_INTERVAL_MS = 3000 // Send audio every 3 seconds
 let mediaStream: MediaStream | null = null
 const displayText = computed(() => {
  if (interimTranscript.value) {
    return transcript.value + ' ' + interimTranscript.value
@@ -73,7 +86,7 @@ function initRecognition() {
  const rec = new SpeechRecognition()
  rec.continuous = true
  rec.interimResults = true
-  rec.lang = 'es-ES'
+  rec.lang = 'es-419' // Latin American Spanish (better for accents)
  rec.onresult = (event: SpeechRecognitionEvent) => {
    let interim = ''
@@ -105,7 +118,7 @@ function initRecognition() {
  }
  rec.onend = () => {
-    if (isRecording.value) {
+    if (isRecording.value && !useWhisper.value) {
      // Restart if still recording (browser stops after silence)
      rec.start()
    }
@@ -114,6 +127,215 @@ function initRecognition() {
  return rec
 }
 // ============ WHISPER FUNCTIONS ============
 /**
  * Ask the API server for the current Whisper state and mirror it into the
  * `useWhisper` / `whisperReady` refs.
  *
  * @returns The parsed status payload, or `null` when the request fails or
  *          the server answers with a non-2xx status (both refs are then
  *          reset to `false`).
  */
 async function checkWhisperStatus() {
  try {
    const res = await fetch(`http://${window.location.hostname}:4100/api/whisper/status`)
    // A non-2xx body is not a status payload; handle it like a failed request
    if (!res.ok) {
      throw new Error(`HTTP ${res.status}`)
    }
    const data = await res.json()
    // Coerce so a malformed payload can never leave the refs as `undefined`
    useWhisper.value = Boolean(data.enabled)
    whisperReady.value = Boolean(data.running)
    return data
  } catch {
    useWhisper.value = false
    whisperReady.value = false
    return null
  }
 }
 /**
  * Ask the API server to toggle Whisper mode, then bring the local refs and
  * the Whisper WebSocket in line with the state the server reports.
  *
  * `whisperLoading` is true for the duration of the request. Failures
  * (network error, non-2xx status, or a payload with `success: false`) are
  * surfaced through `error` rather than shown as a successful mode change.
  */
 async function toggleWhisperMode() {
  whisperLoading.value = true
  error.value = ''
  try {
    const res = await fetch(`http://${window.location.hostname}:4100/api/whisper/toggle`, {
      method: 'POST'
    })
    if (!res.ok) {
      throw new Error(`HTTP ${res.status}`)
    }
    const data = await res.json()
    useWhisper.value = Boolean(data.enabled)
    whisperReady.value = Boolean(data.running)
    // Keep the socket in sync with whatever state the server ended up in
    if (data.enabled) {
      connectWhisperSocket()
    } else {
      disconnectWhisperSocket()
    }
    if (data.success === false) {
      // The server attempted the switch and reported failure; do not
      // present that as a deliberate change of mode.
      error.value = 'Failed to toggle Whisper'
      console.error('[Voice] Whisper toggle error:', data.message)
    } else if (data.enabled) {
      canvasStore.showNotification('Whisper GPU enabled', 'success')
    } else {
      canvasStore.showNotification('Using Web Speech API', 'info')
    }
  } catch (e: unknown) {
    error.value = 'Failed to toggle Whisper'
    console.error('[Voice] Whisper toggle error:', e)
  } finally {
    whisperLoading.value = false
  }
 }
 /**
  * Open the WebSocket to the Whisper server (if one is not already open or
  * connecting) and wire up its handlers.
  *
  * Incoming `transcription` messages update `transcript` /
  * `interimTranscript`; `ready` messages and socket lifecycle events drive
  * `whisperReady`.
  */
 function connectWhisperSocket() {
  // Reuse a socket that is open OR still connecting; creating a second one
  // while the first is mid-handshake would leave two live connections.
  const currentState = whisperSocket?.readyState
  if (currentState === WebSocket.OPEN || currentState === WebSocket.CONNECTING) return
  console.log('[Voice] Connecting to Whisper server...')
  const ws = new WebSocket(WHISPER_WS_URL)
  whisperSocket = ws
  ws.onopen = () => {
    console.log('[Voice] Whisper WebSocket connected')
    whisperReady.value = true
  }
  ws.onmessage = (event) => {
    try {
      const msg = JSON.parse(event.data)
      if (msg.type === 'ready') {
        console.log('[Voice] Whisper ready:', msg.model, msg.device)
        whisperReady.value = true
      } else if (msg.type === 'transcription') {
        if (msg.success && msg.text) {
          const fullText = msg.text.trim()
          if (msg.partial) {
            // For partial results, show as interim (will be replaced)
            // Only show text that's new since last transcription
            const newText = fullText.substring(lastTranscriptLength).trim()
            if (newText) {
              interimTranscript.value = newText
              console.log(`[Voice] 🔄 WHISPER partial:`, newText)
            }
            // Remember how much text has been seen, for the next partial
            lastTranscriptLength = fullText.length
          } else {
            // Final result - replace everything and start the next
            // recording from a clean offset
            transcript.value = fullText + ' '
            interimTranscript.value = ''
            lastTranscriptLength = 0
            console.log(`[Voice] 🎯 WHISPER-GPU (${msg.model}/${msg.device}):`, fullText)
          }
        } else if (msg.error) {
          error.value = msg.error
          console.error('[Voice] Whisper error:', msg.error)
        }
      }
    } catch (e) {
      console.error('[Voice] Whisper message error:', e)
    }
  }
  ws.onclose = () => {
    console.log('[Voice] Whisper WebSocket closed')
    // A replaced or explicitly disconnected socket must not clobber the
    // readiness of its successor.
    if (whisperSocket === ws) {
      whisperReady.value = false
    }
  }
  ws.onerror = (e) => {
    console.error('[Voice] Whisper WebSocket error:', e)
    if (whisperSocket === ws) {
      whisperReady.value = false
    }
  }
 }
 /** Close the Whisper WebSocket (if any), forget it, and mark Whisper as not ready. */
 function disconnectWhisperSocket() {
  const openSocket = whisperSocket
  if (openSocket !== null) {
    openSocket.close()
    whisperSocket = null
  }
  whisperReady.value = false
 }
 /**
  * Start capturing microphone audio for Whisper transcription.
  *
  * Audio is recorded as WebM/Opus in 100 ms slices into `audioChunks`; every
  * `CHUNK_INTERVAL_MS` the accumulated audio is sent for a partial
  * transcription. On failure the microphone is released again and the
  * problem is reported through `error`.
  */
 async function startWhisperRecording() {
  let stream: MediaStream | null = null
  try {
    stream = await navigator.mediaDevices.getUserMedia({ audio: true })
    const recorder = new MediaRecorder(stream, {
      mimeType: 'audio/webm;codecs=opus'
    })
    recorder.ondataavailable = (event) => {
      if (event.data.size > 0) {
        audioChunks.push(event.data)
      }
    }
    // Reset state for new recording
    audioChunks = []
    lastTranscriptLength = 0
    if (chunkInterval !== null) {
      // Never leave a timer from an earlier recording running alongside the new one
      clearInterval(chunkInterval)
      chunkInterval = null
    }
    mediaStream = stream
    mediaRecorder = recorder
    // Start recording
    recorder.start(100) // Collect data every 100ms
    isRecording.value = true
    interimTranscript.value = 'Listening (Whisper GPU)...'
    // Send chunks periodically for progressive transcription
    chunkInterval = window.setInterval(() => {
      if (audioChunks.length > 0 && whisperSocket?.readyState === WebSocket.OPEN) {
        sendAudioChunk(false) // false = partial, don't clear
      }
    }, CHUNK_INTERVAL_MS)
  } catch (e: unknown) {
    // getUserMedia may have succeeded before a later step threw; release
    // the microphone so it does not stay live with no recorder attached.
    if (stream) {
      stream.getTracks().forEach(track => track.stop())
      if (mediaStream === stream) {
        mediaStream = null
      }
    }
    const message = e instanceof Error ? e.message : String(e)
    error.value = `Microphone error: ${message}`
    console.error('[Voice] Microphone error:', e)
  }
 }
 /**
  * Send everything recorded so far to the Whisper server as one base64
  * `transcribe` request.
  *
  * @param isFinal `true` for the last request of a recording: the chunk
  *                buffer and partial-text offset are reset and the request
  *                is flagged `partial: false`. `false` keeps the buffer so
  *                the next request again contains the full recording.
  */
 function sendAudioChunk(isFinal: boolean) {
  if (audioChunks.length === 0) return
  // Captured before the buffer may be cleared, so the log below is accurate
  const chunkCount = audioChunks.length
  // Always send ALL accumulated audio (webm needs header from first chunk)
  const audioBlob = new Blob(audioChunks, { type: 'audio/webm' })
  // Clear chunks only if final
  if (isFinal) {
    audioChunks = []
    lastTranscriptLength = 0
  }
  const reader = new FileReader()
  reader.onerror = () => {
    console.error('[Voice] Failed to read recorded audio:', reader.error)
  }
  // `onload` fires only on success; `result` is null after a failed read
  reader.onload = () => {
    const dataUrl = reader.result
    if (typeof dataUrl !== 'string') return
    const base64 = dataUrl.split(',')[1]
    if (!base64) return // nothing after the data-URL prefix: no audio to send
    if (whisperSocket?.readyState === WebSocket.OPEN) {
      if (!isFinal) {
        interimTranscript.value = 'Processing...'
      }
      whisperSocket.send(JSON.stringify({
        type: 'transcribe',
        audio: base64,
        language: 'es',
        partial: !isFinal
      }))
      console.log(`[Voice] Sent ${isFinal ? 'FINAL' : 'partial'} audio (${chunkCount} chunks, ${audioBlob.size} bytes)`)
    }
  }
  reader.readAsDataURL(audioBlob)
 }
 /**
  * Stop a Whisper recording: cancel the periodic partial sends, stop the
  * recorder, send the final audio, and release the microphone.
  */
 function stopWhisperRecording() {
  // Clear the chunk interval
  if (chunkInterval !== null) {
    clearInterval(chunkInterval)
    chunkInterval = null
  }
  const recorder = mediaRecorder
  if (recorder && recorder.state !== 'inactive') {
    // stop() delivers the audio buffered since the last 100 ms slice in one
    // more `dataavailable` event before `stop` fires. Sending the final
    // request from `onstop` includes that tail instead of dropping it.
    recorder.onstop = () => {
      sendAudioChunk(true) // true = final
    }
    recorder.stop()
  } else if (audioChunks.length > 0) {
    // Recorder already inactive: nothing more will arrive, send what we have
    sendAudioChunk(true) // true = final
  }
  // Stop media stream
  if (mediaStream) {
    mediaStream.getTracks().forEach(track => track.stop())
    mediaStream = null
  }
  isRecording.value = false
 }
 function toggleRecording() {
  if (isRecording.value) {
    stopRecording()
@@ -124,6 +346,12 @@ function toggleRecording() {
 function startRecording() {
  error.value = ''
  if (useWhisper.value && whisperReady.value) {
    // Use Whisper GPU mode
    startWhisperRecording()
  } else {
    // Use Web Speech API
    if (!recognition) {
      recognition = initRecognition()
    }
@@ -135,13 +363,18 @@ function startRecording() {
        console.error('[Voice] Failed to start:', e)
      }
    }
  }
 }
 /**
  * Stop whichever speech engine is currently capturing and clear the
  * interim text.
  *
  * The choice is based on what is actually running, not on the
  * `useWhisper` preference: `startRecording` falls back to Web Speech when
  * Whisper is enabled but not ready, and the preference can change during
  * a recording. In both cases the flag alone would stop the wrong engine.
  */
 function stopRecording() {
  const whisperCapturing =
    (mediaRecorder !== null && mediaRecorder.state !== 'inactive') || chunkInterval !== null
  if (whisperCapturing) {
    stopWhisperRecording()
  } else {
    if (recognition) {
      recognition.stop()
    }
    isRecording.value = false
  }
  interimTranscript.value = ''
 }
@@ -209,6 +442,7 @@ function sendTranscript() {
 /** Stop any recording, clear the transcript, and mark the voice panel as closed. */
 function close() {
  stopRecording()
  clearTranscript()
  isOpen.value = false
 }
@@ -349,17 +583,28 @@ function sendTranscriptAndClose() {
  typeChar()
 }
-onMounted(() => {
+onMounted(async () => {
  recognition = initRecognition()
  // Use capture phase to intercept before terminal or other elements
  document.addEventListener('keydown', handleKeyDown, { capture: true })
  document.addEventListener('keyup', handleKeyUp, { capture: true })
  // Check Whisper status on mount
  await checkWhisperStatus()
  if (useWhisper.value) {
    connectWhisperSocket()
  }
 })
 onBeforeUnmount(() => {
  stopRecording()
  recognition = null
  disconnectSocket()
  disconnectWhisperSocket()
  if (chunkInterval) clearInterval(chunkInterval)
  if (mediaStream) {
    mediaStream.getTracks().forEach(track => track.stop())
  }
  document.removeEventListener('keydown', handleKeyDown, { capture: true })
  document.removeEventListener('keyup', handleKeyUp, { capture: true })
  document.removeEventListener('mousemove', onDrag)
@@ -408,8 +653,23 @@ defineExpose({
              </svg>
              <span>Voice</span>
              <i class="dot" :class="{ recording: isRecording, ptt: isPushToTalk }"></i>
              <span class="mode-badge" :class="{ gpu: useWhisper }">
                {{ useWhisper ? 'GPU' : 'Web' }}
              </span>
            </div>
            <div class="window-controls">
              <button
                class="whisper-toggle"
                :class="{ active: useWhisper, loading: whisperLoading }"
                @click.stop="toggleWhisperMode"
                :title="useWhisper ? 'Using Whisper GPU - Click to use Web Speech' : 'Using Web Speech - Click to use Whisper GPU'"
              >
                <svg width="10" height="10" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                  <rect x="4" y="4" width="16" height="16" rx="2"/>
                  <line x1="9" y1="9" x2="9" y2="15"/>
                  <line x1="15" y1="9" x2="15" y2="15"/>
                </svg>
              </button>
              <button class="x" @click="close" title="Close">
                <svg width="8" height="8" viewBox="0 0 10 10">
                  <line x1="0" y1="0" x2="10" y2="10" stroke="currentColor" stroke-width="1.5"/>
@@ -545,6 +805,53 @@ defineExpose({
  box-shadow: 0 0 6px #f90;
 }
 .mode-badge {
  font-size: 8px;
  padding: 1px 4px;
  border-radius: 3px;
  background: rgba(0, 0, 0, 0.2);
  color: #555;
  font-weight: 600;
  text-transform: uppercase;
 }
 .mode-badge.gpu {
  background: linear-gradient(135deg, #10b981, #059669);
  color: #fff;
  box-shadow: 0 0 4px rgba(16, 185, 129, 0.5);
 }
 .whisper-toggle {
  width: 20px;
  height: 18px;
  display: flex;
  align-items: center;
  justify-content: center;
  background: rgba(255, 255, 255, 0.3);
  border: 1px solid rgba(0, 0, 0, 0.1);
  border-radius: 3px;
  color: #666;
  cursor: pointer;
  transition: all 0.15s;
 }
 .whisper-toggle:hover {
  background: rgba(255, 255, 255, 0.5);
 }
 .whisper-toggle.active {
  background: linear-gradient(180deg, #10b981 0%, #059669 100%);
  border-color: #047857;
  color: #fff;
 }
 .whisper-toggle.loading {
  animation: pulse 0.6s infinite;
  background: linear-gradient(180deg, #f59e0b 0%, #d97706 100%);
  border-color: #b45309;
  color: #fff;
 }
@keyframes pulse {
  0%, 100% { opacity: 1; }
  50% { opacity: 0.5; }
--- a/frontend/src/services/tools/handlers/globalHandlers.ts
+++ b/frontend/src/services/tools/handlers/globalHandlers.ts
@@ -223,6 +223,112 @@ export function createGlobalHandlers(callbacks: ToolManagementCallbacks): ToolCo
        }, 100)
        return 'Recargando pagina...'
      }
    },
    {
      // Tool: report the Whisper server state as a human-readable summary
      name: 'whisper_status',
      description: 'Obtiene el estado del servidor Whisper GPU para speech-to-text.',
      category: 'global',
      schema: {
        type: 'object',
        properties: {}
      },
      handler: async () => {
        try {
          const response = await fetch(`http://${window.location.hostname}:4100/api/whisper/status`)
          const status = await response.json()
          // One entry per output line, joined with newlines below
          const report = [
            `Whisper GPU Status:`,
            `  Enabled: ${status.enabled ? 'Yes' : 'No'}`,
            `  Running: ${status.running ? 'Yes' : 'No'}`,
            `  Model: ${status.model}`,
            `  Device: ${status.device}`,
            `  Port: ${status.port}`
          ]
          return report.join('\n')
        } catch (e: any) {
          return `Error checking Whisper status: ${e.message}`
        }
      }
    },
    {
      // Tool: toggle Whisper on/off through the API server and describe the
      // resulting state. A toggle the server reports as failed is returned
      // as a failure, not as a successful "DISABLED".
      name: 'whisper_toggle',
      description: 'Activa o desactiva Whisper GPU para speech-to-text. Cuando esta activo usa la GPU para transcribir voz con mejor precision para acentos latinos.',
      category: 'global',
      schema: {
        type: 'object',
        properties: {}
      },
      handler: async () => {
        try {
          const res = await fetch(`http://${window.location.hostname}:4100/api/whisper/toggle`, {
            method: 'POST'
          })
          const data = await res.json()
          if (data.success === false) {
            // The server tried to start or stop Whisper and could not
            return `Failed to toggle Whisper GPU.\n` +
              `  Currently: ${data.enabled ? 'ENABLED' : 'DISABLED'}`
          }
          if (data.enabled) {
            return `Whisper GPU ENABLED\n` +
              `  Model: ${data.model}\n` +
              `  Device: ${data.device}\n` +
              `  Port: ws://localhost:${data.port}\n\n` +
              `Voice input will now use GPU-accelerated transcription.`
          } else {
            return `Whisper GPU DISABLED\n\n` +
              `Voice input will use Web Speech API (browser native).`
          }
        } catch (e: any) {
          return `Error toggling Whisper: ${e.message}`
        }
      }
    },
    {
      // Tool: ask the API server to start the Whisper server
      name: 'whisper_start',
      description: 'Inicia el servidor Whisper GPU si no esta corriendo.',
      category: 'global',
      schema: {
        type: 'object',
        properties: {}
      },
      handler: async () => {
        try {
          const response = await fetch(`http://${window.location.hostname}:4100/api/whisper/start`, {
            method: 'POST'
          })
          const result = await response.json()
          // Failure first, so the success report reads straight through
          if (!result.success) {
            return `Failed to start Whisper server: ${result.message}`
          }
          return [
            `Whisper server started!`,
            `  Model: ${result.model}`,
            `  Device: ${result.device}`,
            `  Ready: ${result.running ? 'Yes' : 'Loading...'}`
          ].join('\n')
        } catch (e: any) {
          return `Error starting Whisper: ${e.message}`
        }
      }
    },
    {
      // Tool: ask the API server to stop the Whisper server
      name: 'whisper_stop',
      description: 'Detiene el servidor Whisper GPU para liberar memoria de la GPU.',
      category: 'global',
      schema: {
        type: 'object',
        properties: {}
      },
      handler: async () => {
        try {
          const response = await fetch(`http://${window.location.hostname}:4100/api/whisper/stop`, {
            method: 'POST'
          })
          const result = await response.json()
          return result.success
            ? `Whisper server stopped. GPU memory released.`
            : `Failed to stop Whisper server: ${result.message}`
        } catch (e: any) {
          return `Error stopping Whisper: ${e.message}`
        }
      }
    }
  ]
 }
--- a/package.json
+++ b/package.json
@@ -3,7 +3,10 @@
  "version": "1.0.0",
  "description": "Dynamic canvas for Claude Code interaction",
  "scripts": {
-    "start": "concurrently -n server,frontend -c blue,green \"cd server && bun --watch run index.ts\" \"cd frontend && bun run dev --host\""
+    "start": "concurrently -n api,terminal,frontend -c blue,yellow,green \"cd server && bun --watch run index.ts\" \"cd server && bun run terminal.ts\" \"cd frontend && bun run dev --host\"",
    "start:api": "cd server && bun --watch run index.ts",
    "start:terminal": "cd server && bun run terminal.ts",
    "start:frontend": "cd frontend && bun run dev --host"
  },
  "devDependencies": {
    "concurrently": "^9.2.1"
--- a/server/index.ts
+++ b/server/index.ts
@@ -1,7 +1,6 @@
 import { PORT_HTTP, WORKING_DIR } from './config'
 import { initDatabase } from './db'
 import { handleRequest } from './routes'
 import { startTerminalServer } from './services/terminal'
 // Initialize database
 initDatabase()
@@ -12,18 +11,10 @@ Bun.serve({
  fetch: handleRequest
 })
 console.log(`[HTTP] API running at http://localhost:${PORT_HTTP}`)
 // Start Terminal WebSocket server
 startTerminalServer()
 // Startup summary
 console.log('')
 console.log('='.repeat(50))
-console.log('Agent UI Server started')
+console.log('Agent UI API Server (hot-reload enabled)')
 console.log(`  API: http://localhost:${PORT_HTTP}`)
 console.log(`  Terminal: ws://localhost:4103`)
 console.log(`  Working Dir: ${WORKING_DIR}`)
 console.log('')
 console.log('WebMCP starts separately with Claude Code MCP')
 console.log('='.repeat(50))
--- a/server/routes/index.ts
+++ b/server/routes/index.ts
@@ -7,6 +7,7 @@ import { handleThemes, handleActiveTheme, handleDesignTokens, handleThemeById, h
 import { handleCanvas, handleCanvasById, handleToolbarCanvas, handleDefaultCanvas, handleCanvasComponents, handleCanvasComponentById } from './canvas'
 import { handleGiteaRepo, handleGiteaTree, handleGiteaFile } from './gitea'
 import { handleTables, handleStats, handleTableSchema, handleTableData, handleQuery } from './database'
 import { handleWhisperRoutes } from './whisper'
 export async function handleRequest(req: Request): Promise<Response> {
  const url = new URL(req.url)
@@ -168,5 +169,11 @@ export async function handleRequest(req: Request): Promise<Response> {
    return handleQuery(req)
  }
  // Whisper (GPU speech-to-text)
  if (path.startsWith('/api/whisper/')) {
    const res = await handleWhisperRoutes(req)
    if (res) return res
  }
  return notFoundResponse()
 }
--- a/server/routes/whisper.ts
+++ b/server/routes/whisper.ts
@@ -0,0 +1,66 @@
 /**
 * Whisper API routes
 * Control the local GPU-accelerated speech-to-text server
 */
 import {
  startWhisperServer,
  stopWhisperServer,
  toggleWhisperServer,
  getWhisperState,
  getWhisperPort
 } from '../services/whisper'
 /**
  * Dispatch a `/api/whisper/*` request.
  *
  * @param req Incoming HTTP request.
  * @returns A JSON response for a recognised method + path pair, or `null`
  *          when nothing matches.
  */
 export async function handleWhisperRoutes(req: Request): Promise<Response | null> {
  const { pathname } = new URL(req.url)
  // Method and path are matched together, e.g. "GET /api/whisper/status"
  switch (`${req.method} ${pathname}`) {
    // Get current state
    case 'GET /api/whisper/status':
      return Response.json(await getWhisperState())

    // Start Whisper server
    case 'POST /api/whisper/start': {
      const started = await startWhisperServer()
      const snapshot = await getWhisperState()
      return Response.json({
        success: started,
        ...snapshot,
        message: started ? 'Whisper server started' : 'Failed to start Whisper server'
      })
    }

    // Stop Whisper server
    case 'POST /api/whisper/stop': {
      const stopped = stopWhisperServer()
      const snapshot = await getWhisperState()
      return Response.json({
        success: stopped,
        ...snapshot,
        message: stopped ? 'Whisper server stopped' : 'Failed to stop Whisper server'
      })
    }

    // Toggle Whisper on/off
    case 'POST /api/whisper/toggle': {
      const outcome = await toggleWhisperServer()
      const snapshot = await getWhisperState()
      // The freshly probed snapshot is spread last, so it wins over
      // `outcome` for any key both objects share.
      return Response.json({
        ...outcome,
        ...snapshot,
        message: snapshot.enabled ? 'Whisper enabled (GPU)' : 'Whisper disabled (using Web Speech API)'
      })
    }

    // Get Whisper WebSocket port
    case 'GET /api/whisper/port':
      return Response.json({
        port: getWhisperPort(),
        url: `ws://localhost:${getWhisperPort()}`
      })

    default:
      return null
  }
 }
--- a/server/services/whisper.ts
+++ b/server/services/whisper.ts
@@ -0,0 +1,218 @@
 /**
 * Whisper Service - Manages the Python Whisper server process
 * Provides GPU-accelerated speech-to-text as an alternative to Web Speech API
 */
 import { join } from 'path'
 import { Subprocess } from 'bun'
 const WHISPER_PORT = 4104
 const WHISPER_SCRIPT = join(import.meta.dir, '..', 'whisper_server.py')
 /** In-memory bookkeeping for the managed Whisper server process. */
 interface WhisperState {
  /** Set to true/false together with `running` by the start, stop and state-sync code in this module. */
  enabled: boolean
  /** Whether the server is believed to be listening on the Whisper port. */
  running: boolean
  /** Handle of the process spawned by this module, or null when none is held. */
  process: Subprocess | null
  /** Model label reported to clients; not updated anywhere in the visible code. */
  model: string
  /** Device label reported to clients; not updated anywhere in the visible code. */
  device: string
 }
 // Module-level singleton; starts disabled with no process attached.
 const state: WhisperState = {
  enabled: false,
  running: false,
  process: null,
  model: 'medium',
  device: 'cuda'
 }
 /**
 * Kill any process using the Whisper port
 */
 /**
  * Best-effort: force-stop whatever process owns a TCP connection on the
  * given local port (via PowerShell), then pause briefly so the port can be
  * released. Errors are deliberately ignored.
  */
 async function killProcessOnPort(port: number): Promise<void> {
  const command = `Get-NetTCPConnection -LocalPort ${port} -ErrorAction SilentlyContinue | ForEach-Object { Stop-Process -Id $_.OwningProcess -Force -ErrorAction SilentlyContinue }`
  try {
    const killer = Bun.spawn(['powershell', '-Command', command], {
      stdout: 'ignore',
      stderr: 'ignore'
    })
    await killer.exited
    // Wait a moment for port to be released
    await new Promise(done => setTimeout(done, 1000))
  } catch {
    // Ignore errors
  }
 }
 /**
 * Start the Whisper Python server
 */
 /**
  * Start the Whisper Python server and wait until it is listening.
  *
  * Clears the port first, spawns `python whisper_server.py` with inherited
  * stdio, then polls every 3 s (up to ~93 s, to cover model loading) for
  * the port to start listening.
  *
  * @returns `true` once the server is listening (or was already running);
  *          `false` if the process exits, the wait times out, or spawning
  *          fails. On timeout the spawned process is killed so it cannot
  *          linger as an untracked process.
  */
 export async function startWhisperServer(): Promise<boolean> {
  if (state.running && state.process) {
    console.log('[Whisper] Server already running')
    return true
  }
  console.log('[Whisper] ====== STARTING (v3) ======')
  console.log('[Whisper] Script:', WHISPER_SCRIPT)
  // Kill any existing process on the port
  console.log('[Whisper] Cleaning up port', WHISPER_PORT)
  await killProcessOnPort(WHISPER_PORT)
  const pollIntervalMs = 3000
  const maxPolls = 31 // 1 initial check + 30 retries, about 93 seconds in total
  try {
    // Use Bun.spawn with inherit to show logs directly in console
    const proc = Bun.spawn(['python', WHISPER_SCRIPT], {
      cwd: join(import.meta.dir, '..'),
      stdout: 'inherit',
      stderr: 'inherit',
      env: { ...process.env }
    })
    state.process = proc
    for (let attempt = 0; attempt < maxPolls; attempt++) {
      await new Promise(resolve => setTimeout(resolve, pollIntervalMs))
      // Check if process is still running
      if (proc.exitCode !== null) {
        console.error('[Whisper] Process exited with code:', proc.exitCode)
        state.process = null
        return false
      }
      // Check if port is listening (simple TCP check)
      if (await checkPort(WHISPER_PORT)) {
        console.log('[Whisper] Server started successfully on port', WHISPER_PORT)
        state.running = true
        state.enabled = true
        return true
      }
      if (attempt === 0) {
        console.log('[Whisper] Waiting for model to load...')
      }
    }
    console.log('[Whisper] Timeout waiting for server')
    // Do not leave a half-started server behind: it would keep its
    // resources while this module reports that nothing is running.
    proc.kill()
    state.process = null
    return false
  } catch (err: unknown) {
    console.error('[Whisper] Failed to start:', err instanceof Error ? err.message : err)
    state.process = null
    return false
  }
 }
 /**
 * Check if a port is listening using PowerShell
 */
 /**
  * Report whether something is listening on the given local TCP port.
  * Runs a PowerShell probe that exits 0 when a listener exists; any failure
  * to run the probe counts as "not listening".
  */
 async function checkPort(port: number): Promise<boolean> {
  const probeScript = `if (Get-NetTCPConnection -LocalPort ${port} -State Listen -ErrorAction SilentlyContinue) { exit 0 } else { exit 1 }`
  try {
    const probe = Bun.spawn(['powershell', '-Command', probeScript], {
      stdout: 'ignore',
      stderr: 'ignore'
    })
    return (await probe.exited) === 0
  } catch {
    return false
  }
 }
 /**
 * Stop the Whisper server
 */
 /**
  * Stop the Whisper server.
  *
  * Normally kills the process handle held in `state`. If no handle is held
  * but `state.running` is true (which happens when `getWhisperState` finds
  * the port already listening), the listener on the Whisper port is stopped
  * by port instead, so the call is not a silent no-op.
  *
  * @returns `true` when there was nothing to stop or the stop was issued,
  *          `false` if stopping threw.
  */
 export function stopWhisperServer(): boolean {
  if (!state.process) {
    if (!state.running) {
      console.log('[Whisper] No server running')
      return true
    }
    console.log('[Whisper] Stopping server on port', WHISPER_PORT, '(no process handle)')
    try {
      // Synchronous on purpose: this function's boolean return type is part
      // of its contract, so it cannot await. Blocks briefly while PowerShell runs.
      Bun.spawnSync(['powershell', '-Command',
        `Get-NetTCPConnection -LocalPort ${WHISPER_PORT} -State Listen -ErrorAction SilentlyContinue | ForEach-Object { Stop-Process -Id $_.OwningProcess -Force -ErrorAction SilentlyContinue }`
      ], { stdout: 'ignore', stderr: 'ignore' })
      state.running = false
      state.enabled = false
      console.log('[Whisper] Server stopped')
      return true
    } catch (err) {
      console.error('[Whisper] Error stopping server:', err)
      return false
    }
  }
  console.log('[Whisper] Stopping server...')
  try {
    state.process.kill()
    state.process = null
    state.running = false
    state.enabled = false
    console.log('[Whisper] Server stopped')
    return true
  } catch (err) {
    console.error('[Whisper] Error stopping server:', err)
    return false
  }
 }
 /**
 * Toggle Whisper server on/off
 */
 /**
  * Flip the Whisper server: stop it when it is enabled and running,
  * otherwise start it.
  *
  * @returns `enabled` - whether Whisper should now be considered on;
  *          `success` - whether the stop/start operation itself succeeded.
  */
 export async function toggleWhisperServer(): Promise<{ enabled: boolean; success: boolean }> {
  const currentlyActive = state.enabled && state.running
  if (!currentlyActive) {
    const started = await startWhisperServer()
    return { enabled: started, success: started }
  }
  const stopped = stopWhisperServer()
  return { enabled: false, success: stopped }
 }
 /**
 * Get current Whisper state (checks real port status)
 */
 /**
  * Return the public Whisper state after reconciling the cached flags with
  * whether the Whisper port is actually listening.
  */
 export async function getWhisperState(): Promise<{
  enabled: boolean
  running: boolean
  port: number
  model: string
  device: string
 }> {
  const listening = await checkPort(WHISPER_PORT)
  // Cached flag disagrees with the probe: adopt what the probe saw
  if (listening !== state.running) {
    state.running = listening
    state.enabled = listening
    if (!listening) {
      // The server went away; drop the stale process handle as well
      state.process = null
    }
  }
  const { enabled, running, model, device } = state
  return { enabled, running, port: WHISPER_PORT, model, device }
 }
 /**
 * Check if Whisper is enabled
 */
 /**
  * @returns `true` only when the cached state is both enabled and running.
  *          Reads the cached flags; it does not probe the port.
  */
 export function isWhisperEnabled(): boolean {
  return state.enabled && state.running
 }
 // WebSocket server for Whisper (proxies to Python server or handles directly)
 let whisperWsServer: any = null
 /** @returns The fixed port constant (`WHISPER_PORT`) used for the Whisper server. */
 export function getWhisperPort(): number {
  return WHISPER_PORT
 }
--- a/server/terminal.ts
+++ b/server/terminal.ts
@@ -0,0 +1,22 @@
 #!/usr/bin/env bun
 /**
 * Terminal Server - Independent process
 * This runs separately from the main server to maintain stable Claude Code sessions
 * even when the main server restarts due to code changes.
 */
 import { startTerminalServer } from './services/terminal'
 import { WORKING_DIR } from './config'
 // Print a startup banner describing this process. The WebSocket URL shown
 // is a literal in the banner text, not read from configuration here.
 console.log('')
 console.log('='.repeat(50))
 console.log('Terminal Server (Independent Process)')
 console.log(`  WebSocket: ws://localhost:4103`)
 console.log(`  Working Dir: ${WORKING_DIR}`)
 console.log('')
 console.log('This process is stable and won\'t restart')
 console.log('when the main server reloads.')
 console.log('='.repeat(50))
 console.log('')
 // Start the terminal server only after the banner has been printed
 startTerminalServer()
--- a/server/whisper_server.py
+++ b/server/whisper_server.py
@@ -0,0 +1,233 @@
 #!/usr/bin/env python3
 """
 Whisper Server - GPU-accelerated speech-to-text using faster-whisper
 WebSocket server that receives audio and returns transcriptions
 """
 import asyncio
 import json
 import sys
 import io
 import wave
 import tempfile
 import os
 from pathlib import Path
 try:
    import websockets
    from faster_whisper import WhisperModel
 except ImportError as e:
    print(f"Missing dependency: {e}")
    print("Run: pip install faster-whisper websockets")
    sys.exit(1)
 # Configuration
 HOST = "localhost"
 PORT = 4104
 MODEL_SIZE = "medium"  # tiny, base, small, medium, large-v2, large-v3
 DEVICE = "cuda"  # cuda or cpu
 COMPUTE_TYPE = "float16"  # float16 for GPU, int8 for CPU
 # Global model instance
 model = None
 model_loading = False
 async def load_model():
    """Load the Whisper model once and return it.

    Tries the configured DEVICE / COMPUTE_TYPE first and falls back to CPU
    with int8 if that raises. After a fallback the module-level DEVICE and
    COMPUTE_TYPE are updated so status and transcription payloads report the
    device actually in use.

    Returns:
        The loaded model, or None for a caller that was waiting on a load
        that failed.

    Raises:
        Exception: whatever the CPU fallback raises if it also fails.
    """
    global model, model_loading, DEVICE, COMPUTE_TYPE
    if model is not None:
        return model
    if model_loading:
        # Wait for model to finish loading
        while model_loading:
            await asyncio.sleep(0.1)
        return model
    model_loading = True
    print(f"[Whisper] Loading model '{MODEL_SIZE}' on {DEVICE}...")
    download_root = str(Path.home() / ".cache" / "whisper")
    try:
        try:
            # Load model - this downloads on first run
            model = WhisperModel(
                MODEL_SIZE,
                device=DEVICE,
                compute_type=COMPUTE_TYPE,
                download_root=download_root
            )
            print(f"[Whisper] Model loaded successfully!")
        except Exception as e:
            print(f"[Whisper] Error loading model: {e}")
            print("[Whisper] Falling back to CPU...")
            model = WhisperModel(
                MODEL_SIZE,
                device="cpu",
                compute_type="int8",
                download_root=download_root
            )
            # Report what is really running from here on
            DEVICE = "cpu"
            COMPUTE_TYPE = "int8"
    finally:
        # Always clear the flag: if the fallback raises too, waiters in the
        # loop above must not spin forever.
        model_loading = False
    return model
 def transcribe_audio(audio_data: bytes, language: str = "es") -> dict:
    """Transcribe an encoded audio payload with the loaded Whisper model.

    Args:
        audio_data: Raw bytes of an audio file, written unchanged to a
            temporary file that is handed to the model.
        language: Language code passed to the model.

    Returns:
        On success, a dict with "success": True, the joined "text",
        per-segment timings and model/device metadata. On failure, a dict
        with "success": False and an "error" message.
    """
    global model
    if model is None:
        return {"success": False, "error": "Model not loaded"}
    if not audio_data:
        # Nothing to decode; skip the temp file and the model call
        return {"success": False, "error": "No audio data received"}
    # Save audio to temp file (faster-whisper needs a file path)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        f.write(audio_data)
        temp_path = f.name
    try:
        # Transcribe
        segments, info = model.transcribe(
            temp_path,
            language=language,
            beam_size=5,
            vad_filter=True,  # Voice activity detection
            vad_parameters=dict(
                min_silence_duration_ms=500,
                speech_pad_ms=400
            )
        )
        # Collect all segments
        segments_list = []
        for segment in segments:
            segments_list.append({
                "start": segment.start,
                "end": segment.end,
                "text": segment.text
            })
        text = " ".join(item["text"] for item in segments_list)
        return {
            "success": True,
            "text": text.strip(),
            "language": info.language,
            "language_probability": info.language_probability,
            "duration": info.duration,
            "segments": segments_list,
            "engine": "whisper-gpu",
            "model": MODEL_SIZE,
            "device": DEVICE
        }
    except Exception as e:
        return {"success": False, "error": str(e)}
    finally:
        # Cleanup temp file; a failed delete must not mask the result
        try:
            os.unlink(temp_path)
        except OSError:
            pass
 async def handle_client(websocket):
    """Serve one WebSocket client until it disconnects.

    Protocol, as implemented here:
      * binary frame -> transcribed as Spanish audio
      * {"type": "transcribe", "audio": <base64>, "language": ...}
      * {"type": "ping"} -> {"type": "pong"}
      * {"type": "status"} -> model/device/ready
    Malformed requests get an {"type": "error"} reply and the connection
    stays open.

    NOTE(review): the browser client sends a "partial" flag with transcribe
    requests and reads "partial" on replies, but replies here never include
    it. Confirm whether the flag should be echoed back.
    """
    import base64
    import binascii

    async def send_json(payload):
        await websocket.send(json.dumps(payload))

    async def transcribe_and_reply(audio_data, language):
        # Transcribe in thread pool to not block the event loop
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(
            None,
            transcribe_audio,
            audio_data,
            language
        )
        await send_json({"type": "transcription", **result})

    print(f"[Whisper] Client connected")
    # Ensure model is loaded
    await load_model()
    # Send ready message
    await send_json({
        "type": "ready",
        "model": MODEL_SIZE,
        "device": DEVICE
    })
    try:
        async for message in websocket:
            if isinstance(message, bytes):
                # Binary audio data
                print(f"[Whisper] Received {len(message)} bytes of audio")
                await transcribe_and_reply(message, "es")  # Default to Spanish
                continue
            # JSON command
            try:
                cmd = json.loads(message)
            except json.JSONDecodeError:
                await send_json({"type": "error", "message": "Invalid JSON"})
                continue
            if not isinstance(cmd, dict):
                # Valid JSON but not an object (e.g. a list): .get() would raise
                await send_json({"type": "error", "message": "Expected a JSON object"})
                continue
            cmd_type = cmd.get("type")
            if cmd_type == "transcribe":
                # Audio data sent as base64
                try:
                    audio_data = base64.b64decode(cmd.get("audio") or "")
                except (binascii.Error, ValueError, TypeError):
                    # Reject the bad payload without ending the connection
                    await send_json({"type": "error", "message": "Invalid base64 audio"})
                    continue
                await transcribe_and_reply(audio_data, cmd.get("language", "es"))
            elif cmd_type == "ping":
                await send_json({"type": "pong"})
            elif cmd_type == "status":
                await send_json({
                    "type": "status",
                    "model": MODEL_SIZE,
                    "device": DEVICE,
                    "ready": model is not None
                })
    except websockets.exceptions.ConnectionClosed:
        print("[Whisper] Client disconnected")
    except Exception as e:
        print(f"[Whisper] Error: {e}")
 async def main():
    """Start WebSocket server.

    Loads the model before the server socket is opened, then serves
    `handle_client` on HOST:PORT until the task is cancelled.
    """
    print(f"[Whisper] Starting server on ws://{HOST}:{PORT}")
    print(f"[Whisper] Model: {MODEL_SIZE}, Device: {DEVICE}")
    # Pre-load model
    print("[Whisper] Pre-loading model...")
    await load_model()
    async with websockets.serve(handle_client, HOST, PORT):
        print(f"[Whisper] Server ready! Listening on ws://{HOST}:{PORT}")
        # A never-completed future keeps the `async with` block open
        await asyncio.Future()  # Run forever
 if __name__ == "__main__":
    # No dependency bootstrap here: the guarded import at the top of this
    # module already exits with install instructions when `websockets` or
    # `faster_whisper` is missing, so an auto-install at this point could
    # never run.
    asyncio.run(main())