feat: Add Whisper GPU speech-to-text with progressive transcription

- Add faster-whisper Python server for GPU-accelerated transcription
- Support dual mode: Web Speech API or Whisper GPU (toggleable)
- Progressive transcription every 3 seconds while recording
- Separate terminal server process (stable during hot-reload)
- Add Ctrl+V paste and Ctrl+C copy support in FloatingTerminal
- Add MCP tools: whisper_start, whisper_stop, whisper_toggle, whisper_status
- Update package.json with separate api/terminal/frontend processes
2026-02-13 23:47:52 -06:00
parent e867b7873e
commit 638e6ac8e0
10 changed files with 1009 additions and 31 deletions
--- a/server/index.ts
+++ b/server/index.ts
@@ -1,7 +1,6 @@
 import { PORT_HTTP, WORKING_DIR } from './config'
 import { initDatabase } from './db'
 import { handleRequest } from './routes'
-import { startTerminalServer } from './services/terminal'

 // Initialize database
 initDatabase()
@@ -12,18 +11,10 @@ Bun.serve({
  fetch: handleRequest
 })

-console.log(`[HTTP] API running at http://localhost:${PORT_HTTP}`)
-
-// Start Terminal WebSocket server
-startTerminalServer()
-
 // Startup summary
 console.log('')
 console.log('='.repeat(50))
-console.log('Agent UI Server started')
+console.log('Agent UI API Server (hot-reload enabled)')
 console.log(`  API: http://localhost:${PORT_HTTP}`)
-console.log(`  Terminal: ws://localhost:4103`)
 console.log(`  Working Dir: ${WORKING_DIR}`)
-console.log('')
-console.log('WebMCP starts separately with Claude Code MCP')
 console.log('='.repeat(50))
--- a/server/routes/index.ts
+++ b/server/routes/index.ts
@@ -7,6 +7,7 @@ import { handleThemes, handleActiveTheme, handleDesignTokens, handleThemeById, h
 import { handleCanvas, handleCanvasById, handleToolbarCanvas, handleDefaultCanvas, handleCanvasComponents, handleCanvasComponentById } from './canvas'
 import { handleGiteaRepo, handleGiteaTree, handleGiteaFile } from './gitea'
 import { handleTables, handleStats, handleTableSchema, handleTableData, handleQuery } from './database'
+import { handleWhisperRoutes } from './whisper'

 export async function handleRequest(req: Request): Promise<Response> {
  const url = new URL(req.url)
@@ -168,5 +169,11 @@ export async function handleRequest(req: Request): Promise<Response> {
    return handleQuery(req)
  }

+  // Whisper (GPU speech-to-text)
+  if (path.startsWith('/api/whisper/')) {
+    const res = await handleWhisperRoutes(req)
+    if (res) return res
+  }
+
  return notFoundResponse()
 }
--- a/server/routes/whisper.ts
+++ b/server/routes/whisper.ts
@@ -0,0 +1,66 @@
+/**
+ * Whisper API routes
+ * Control the local GPU-accelerated speech-to-text server
+ */
+
+import {
+  startWhisperServer,
+  stopWhisperServer,
+  toggleWhisperServer,
+  getWhisperState,
+  getWhisperPort
+} from '../services/whisper'
+
+/**
+ * Route table for the `/api/whisper/*` endpoints.
+ *
+ * @param req Incoming HTTP request.
+ * @returns A JSON response for a recognised method/path pair, or `null` so the
+ *          caller can fall through to its own not-found handling.
+ */
+export async function handleWhisperRoutes(req: Request): Promise<Response | null> {
+  const { pathname } = new URL(req.url)
+
+  // Dispatch on "<METHOD> <path>"; anything not listed yields null.
+  switch (`${req.method} ${pathname}`) {
+    // Current state
+    case 'GET /api/whisper/status':
+      return Response.json(await getWhisperState())
+
+    // Start the Whisper server
+    case 'POST /api/whisper/start': {
+      const success = await startWhisperServer()
+      const snapshot = await getWhisperState()
+      const message = success ? 'Whisper server started' : 'Failed to start Whisper server'
+      return Response.json({ success, ...snapshot, message })
+    }
+
+    // Stop the Whisper server
+    case 'POST /api/whisper/stop': {
+      const success = stopWhisperServer()
+      const snapshot = await getWhisperState()
+      const message = success ? 'Whisper server stopped' : 'Failed to stop Whisper server'
+      return Response.json({ success, ...snapshot, message })
+    }
+
+    // Flip Whisper on/off
+    case 'POST /api/whisper/toggle': {
+      const outcome = await toggleWhisperServer()
+      const snapshot = await getWhisperState()
+      const message = snapshot.enabled
+        ? 'Whisper enabled (GPU)'
+        : 'Whisper disabled (using Web Speech API)'
+      return Response.json({ ...outcome, ...snapshot, message })
+    }
+
+    // WebSocket port of the Whisper server
+    case 'GET /api/whisper/port': {
+      const port = getWhisperPort()
+      return Response.json({ port, url: `ws://localhost:${port}` })
+    }
+
+    default:
+      return null
+  }
+}
--- a/server/services/whisper.ts
+++ b/server/services/whisper.ts
@@ -0,0 +1,218 @@
+/**
+ * Whisper Service - Manages the Python Whisper server process
+ * Provides GPU-accelerated speech-to-text as an alternative to Web Speech API
+ */
+
+import { join } from 'path'
+import { Subprocess } from 'bun'
+
+const WHISPER_PORT = 4104 // same value as PORT in server/whisper_server.py; polled by checkPort, reported by getWhisperPort
+const WHISPER_SCRIPT = join(import.meta.dir, '..', 'whisper_server.py') // Python entry point, one directory above this module
+
+interface WhisperState { // in-memory bookkeeping for the Whisper server managed by this module
+  enabled: boolean // set and cleared together with `running` by start/stop and by getWhisperState
+  running: boolean // true once the port has been seen in Listen state
+  process: Subprocess | null // handle of the child spawned by startWhisperServer, or null
+  model: string // returned as-is by getWhisperState
+  device: string // returned as-is by getWhisperState
+}
+
+const state: WhisperState = { // single shared instance mutated by the functions below
+  enabled: false,
+  running: false,
+  process: null,
+  model: 'medium', // same value as MODEL_SIZE in server/whisper_server.py
+  device: 'cuda' // same value as DEVICE in server/whisper_server.py
+}
+
+/**
+ * Force-stop whatever process owns a TCP connection on the given local port.
+ *
+ * Shells out to PowerShell, waits for it to finish, then pauses one second so
+ * the port can be released. Every failure is deliberately ignored.
+ *
+ * @param port Local TCP port to free.
+ */
+async function killProcessOnPort(port: number): Promise<void> {
+  const script =
+    `Get-NetTCPConnection -LocalPort ${port} -ErrorAction SilentlyContinue | ForEach-Object { Stop-Process -Id $_.OwningProcess -Force -ErrorAction SilentlyContinue }`
+
+  try {
+    const killer = Bun.spawn(['powershell', '-Command', script], {
+      stdout: 'ignore',
+      stderr: 'ignore'
+    })
+    await killer.exited
+
+    // Grace period for the port to be released
+    await new Promise(done => setTimeout(done, 1000))
+  } catch {
+    // Best effort only: errors are intentionally swallowed
+  }
+}
+
+/** Start attempt currently in flight; overlapping callers share it instead of spawning again. */
+let pendingWhisperStart: Promise<boolean> | null = null
+
+/**
+ * Start the Whisper Python server.
+ *
+ * Returns immediately when a tracked server is already running. Otherwise any
+ * process holding the port is killed, `python whisper_server.py` is spawned and
+ * the port is polled every 3 seconds (31 polls, about 93 seconds) until it is
+ * in Listen state.
+ *
+ * Calls arriving while a start is still in progress wait for that attempt
+ * rather than spawning a second Python process.
+ *
+ * @returns true when the server is listening; false when the process exited,
+ *          the wait timed out, or spawning failed.
+ */
+export async function startWhisperServer(): Promise<boolean> {
+  if (state.running && state.process) {
+    console.log('[Whisper] Server already running')
+    return true
+  }
+
+  if (pendingWhisperStart) {
+    console.log('[Whisper] Start already in progress, waiting for it')
+    return pendingWhisperStart
+  }
+
+  const attempt = launchWhisperProcess().finally(() => {
+    pendingWhisperStart = null
+  })
+  pendingWhisperStart = attempt
+  return attempt
+}
+
+/** Spawn the Python server and wait until its port is listening, the process exits, or the wait times out. */
+async function launchWhisperProcess(): Promise<boolean> {
+  const pollIntervalMs = 3000
+  const maxPolls = 31 // one early check plus 30 more while the model loads
+
+  console.log('[Whisper] ====== STARTING (v3) ======')
+  console.log('[Whisper] Script:', WHISPER_SCRIPT)
+
+  // Kill any existing process on the port
+  console.log('[Whisper] Cleaning up port', WHISPER_PORT)
+  await killProcessOnPort(WHISPER_PORT)
+
+  try {
+    // Inherit stdio so the Python logs show up directly in this console
+    const proc = Bun.spawn(['python', WHISPER_SCRIPT], {
+      cwd: join(import.meta.dir, '..'),
+      stdout: 'inherit',
+      stderr: 'inherit',
+      env: { ...process.env }
+    })
+
+    state.process = proc
+
+    for (let poll = 0; poll < maxPolls; poll++) {
+      await new Promise(resolve => setTimeout(resolve, pollIntervalMs))
+
+      if (proc.exitCode !== null) {
+        console.error('[Whisper] Process exited with code:', proc.exitCode)
+        // Only clear the handle if it is still ours
+        if (state.process === proc) state.process = null
+        return false
+      }
+
+      if (await checkPort(WHISPER_PORT)) {
+        console.log('[Whisper] Server started successfully on port', WHISPER_PORT)
+        state.running = true
+        state.enabled = true
+        return true
+      }
+
+      if (poll === 0) console.log('[Whisper] Waiting for model to load...')
+    }
+
+    // Timed out: do not leave a half-started Python process behind, otherwise
+    // the next start would spawn a second one next to it.
+    console.log('[Whisper] Timeout waiting for server')
+    proc.kill()
+    if (state.process === proc) state.process = null
+    return false
+
+  } catch (err: unknown) {
+    const reason = err instanceof Error ? err.message : String(err)
+    console.error('[Whisper] Failed to start:', reason)
+    state.process = null
+    return false
+  }
+}
+
+/**
+ * Report whether something is listening on the given local TCP port.
+ *
+ * Runs a PowerShell probe that exits 0 when a connection in Listen state
+ * exists for the port and 1 otherwise.
+ *
+ * @param port Local TCP port to probe.
+ * @returns true when the probe exits with code 0; false otherwise, including
+ *          when the probe itself cannot be run.
+ */
+async function checkPort(port: number): Promise<boolean> {
+  const probe =
+    `if (Get-NetTCPConnection -LocalPort ${port} -State Listen -ErrorAction SilentlyContinue) { exit 0 } else { exit 1 }`
+
+  try {
+    const shell = Bun.spawn(['powershell', '-Command', probe], {
+      stdout: 'ignore',
+      stderr: 'ignore'
+    })
+    return (await shell.exited) === 0
+  } catch {
+    return false
+  }
+}
+
+/**
+ * Stop the Whisper server.
+ *
+ * Two situations are handled:
+ * - A child handle is held: the child is killed and the state is reset.
+ * - No handle is held but the state says a server is running (getWhisperState
+ *   sets this when it finds the port listening): the owner of the port is
+ *   killed in the background, since there is no handle to kill directly.
+ *
+ * The function stays synchronous, so in the second case the port may still be
+ * listening for a short moment after it returns.
+ *
+ * @returns true when nothing was running or the stop was issued, false when
+ *          killing the tracked child threw.
+ */
+export function stopWhisperServer(): boolean {
+  const child = state.process
+
+  if (!child) {
+    if (!state.running) {
+      console.log('[Whisper] No server running')
+      return true
+    }
+
+    // Server detected on the port without a handle: free the port instead of
+    // reporting success while leaving it running.
+    console.log('[Whisper] Stopping untracked server on port', WHISPER_PORT)
+    state.running = false
+    state.enabled = false
+    // killProcessOnPort swallows its own errors, so this cannot reject.
+    void killProcessOnPort(WHISPER_PORT)
+    return true
+  }
+
+  console.log('[Whisper] Stopping server...')
+
+  try {
+    child.kill()
+    state.process = null
+    state.running = false
+    state.enabled = false
+    console.log('[Whisper] Server stopped')
+    return true
+  } catch (err) {
+    console.error('[Whisper] Error stopping server:', err)
+    return false
+  }
+}
+
+/**
+ * Flip the Whisper server: stop it when it is enabled and running, start it
+ * otherwise.
+ *
+ * @returns `enabled` is the state after the toggle; `success` tells whether
+ *          the stop or start call reported success.
+ */
+export async function toggleWhisperServer(): Promise<{ enabled: boolean; success: boolean }> {
+  const currentlyOn = state.enabled && state.running
+
+  if (!currentlyOn) {
+    const started = await startWhisperServer()
+    return { enabled: started, success: started }
+  }
+
+  return { enabled: false, success: stopWhisperServer() }
+}
+
+/**
+ * Snapshot of the Whisper state, reconciled against the real port status.
+ *
+ * The port is probed first. When the probe disagrees with the stored
+ * `running` flag, `running` and `enabled` are set to the probe result, and the
+ * process handle is dropped if the port turned out not to be listening.
+ *
+ * @returns The public part of the state plus the port number.
+ */
+export async function getWhisperState(): Promise<{
+  enabled: boolean
+  running: boolean
+  port: number
+  model: string
+  device: string
+}> {
+  const portOpen = await checkPort(WHISPER_PORT)
+
+  // Bring the cached flags in line with what the probe saw
+  if (portOpen !== state.running) {
+    state.running = portOpen
+    state.enabled = portOpen
+    if (!portOpen) {
+      state.process = null
+    }
+  }
+
+  const { enabled, running, model, device } = state
+  return { enabled, running, port: WHISPER_PORT, model, device }
+}
+
+/**
+ * Tell whether Whisper is both enabled and running according to the cached
+ * state (the port is not probed here).
+ */
+export function isWhisperEnabled(): boolean {
+  const { enabled, running } = state
+  return enabled && running
+}
+
+/**
+ * Port of the Whisper WebSocket server.
+ *
+ * @returns The fixed port number this module polls and reports to clients.
+ */
+export function getWhisperPort(): number {
+  return WHISPER_PORT
+}
--- a/server/terminal.ts
+++ b/server/terminal.ts
@@ -0,0 +1,22 @@
+#!/usr/bin/env bun
+/**
+ * Terminal Server - Independent process
+ * This runs separately from the main server to maintain stable Claude Code sessions
+ * even when the main server restarts due to code changes.
+ */
+
+import { startTerminalServer } from './services/terminal'
+import { WORKING_DIR } from './config'
+
+// Startup banner, printed line by line before the terminal server starts.
+const divider = '='.repeat(50)
+const banner = [
+  '',
+  divider,
+  'Terminal Server (Independent Process)',
+  `  WebSocket: ws://localhost:4103`,
+  `  Working Dir: ${WORKING_DIR}`,
+  '',
+  'This process is stable and won\'t restart',
+  'when the main server reloads.',
+  divider,
+  ''
+]
+
+for (const line of banner) {
+  console.log(line)
+}
+
+startTerminalServer()
--- a/server/whisper_server.py
+++ b/server/whisper_server.py
@@ -0,0 +1,233 @@
+#!/usr/bin/env python3
+"""
+Whisper Server - GPU-accelerated speech-to-text using faster-whisper
+WebSocket server that receives audio and returns transcriptions
+"""
+
+import asyncio
+import json
+import sys
+import io
+import wave
+import tempfile
+import os
+from pathlib import Path
+
+try:
+    import websockets
+    from faster_whisper import WhisperModel
+except ImportError as e:
+    print(f"Missing dependency: {e}")
+    print("Run: pip install faster-whisper websockets")
+    sys.exit(1)
+
+# Configuration
+HOST = "localhost"  # bind address passed to websockets.serve in main()
+PORT = 4104  # same value as WHISPER_PORT in server/services/whisper.ts
+MODEL_SIZE = "medium"  # tiny, base, small, medium, large-v2, large-v3
+DEVICE = "cuda"  # cuda or cpu; load_model() retries on cpu if loading fails
+COMPUTE_TYPE = "float16"  # float16 for GPU, int8 for CPU
+
+# Global model instance
+model = None  # assigned by load_model(); read by transcribe_audio()
+model_loading = False  # True while load_model() is constructing the model
+
+async def load_model():
+    """Load Whisper model (lazy loading on first request)"""
+    global model, model_loading
+
+    if model is not None:
+        return model
+
+    if model_loading:
+        # Wait for model to finish loading
+        while model_loading:
+            await asyncio.sleep(0.1)
+        return model
+
+    model_loading = True
+    print(f"[Whisper] Loading model '{MODEL_SIZE}' on {DEVICE}...")
+
+    try:
+        # Load model - this downloads on first run
+        model = WhisperModel(
+            MODEL_SIZE,
+            device=DEVICE,
+            compute_type=COMPUTE_TYPE,
+            download_root=str(Path.home() / ".cache" / "whisper")
+        )
+        print(f"[Whisper] Model loaded successfully!")
+    except Exception as e:
+        print(f"[Whisper] Error loading model: {e}")
+        print("[Whisper] Falling back to CPU...")
+        model = WhisperModel(
+            MODEL_SIZE,
+            device="cpu",
+            compute_type="int8",
+            download_root=str(Path.home() / ".cache" / "whisper")
+        )
+
+    model_loading = False
+    return model
+
+def transcribe_audio(audio_data: bytes, language: str = "es") -> dict:
+    """Transcribe audio data using Whisper"""
+    global model
+
+    if model is None:
+        return {"error": "Model not loaded"}
+
+    # Save audio to temp file (faster-whisper needs a file path)
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+        f.write(audio_data)
+        temp_path = f.name
+
+    try:
+        # Transcribe
+        segments, info = model.transcribe(
+            temp_path,
+            language=language,
+            beam_size=5,
+            vad_filter=True,  # Voice activity detection
+            vad_parameters=dict(
+                min_silence_duration_ms=500,
+                speech_pad_ms=400
+            )
+        )
+
+        # Collect all segments
+        text = ""
+        segments_list = []
+        for segment in segments:
+            text += segment.text + " "
+            segments_list.append({
+                "start": segment.start,
+                "end": segment.end,
+                "text": segment.text
+            })
+
+        return {
+            "success": True,
+            "text": text.strip(),
+            "language": info.language,
+            "language_probability": info.language_probability,
+            "duration": info.duration,
+            "segments": segments_list,
+            "engine": "whisper-gpu",
+            "model": MODEL_SIZE,
+            "device": DEVICE
+        }
+
+    except Exception as e:
+        return {"error": str(e)}
+
+    finally:
+        # Cleanup temp file
+        try:
+            os.unlink(temp_path)
+        except:
+            pass
+
+async def handle_client(websocket):
+    """Handle WebSocket client connection"""
+    print(f"[Whisper] Client connected")
+
+    # Ensure model is loaded
+    await load_model()
+
+    # Send ready message
+    await websocket.send(json.dumps({
+        "type": "ready",
+        "model": MODEL_SIZE,
+        "device": DEVICE
+    }))
+
+    try:
+        async for message in websocket:
+            if isinstance(message, bytes):
+                # Binary audio data
+                print(f"[Whisper] Received {len(message)} bytes of audio")
+
+                # Transcribe in thread pool to not block
+                loop = asyncio.get_event_loop()
+                result = await loop.run_in_executor(
+                    None,
+                    transcribe_audio,
+                    message,
+                    "es"  # Default to Spanish
+                )
+
+                await websocket.send(json.dumps({
+                    "type": "transcription",
+                    **result
+                }))
+
+            else:
+                # JSON command
+                try:
+                    cmd = json.loads(message)
+
+                    if cmd.get("type") == "transcribe":
+                        # Audio data sent as base64
+                        import base64
+                        audio_data = base64.b64decode(cmd.get("audio", ""))
+                        language = cmd.get("language", "es")
+
+                        loop = asyncio.get_event_loop()
+                        result = await loop.run_in_executor(
+                            None,
+                            transcribe_audio,
+                            audio_data,
+                            language
+                        )
+
+                        await websocket.send(json.dumps({
+                            "type": "transcription",
+                            **result
+                        }))
+
+                    elif cmd.get("type") == "ping":
+                        await websocket.send(json.dumps({"type": "pong"}))
+
+                    elif cmd.get("type") == "status":
+                        await websocket.send(json.dumps({
+                            "type": "status",
+                            "model": MODEL_SIZE,
+                            "device": DEVICE,
+                            "ready": model is not None
+                        }))
+
+                except json.JSONDecodeError:
+                    await websocket.send(json.dumps({
+                        "type": "error",
+                        "message": "Invalid JSON"
+                    }))
+
+    except websockets.exceptions.ConnectionClosed:
+        print("[Whisper] Client disconnected")
+    except Exception as e:
+        print(f"[Whisper] Error: {e}")
+
+async def main():
+    """Start WebSocket server"""
+    print(f"[Whisper] Starting server on ws://{HOST}:{PORT}")
+    print(f"[Whisper] Model: {MODEL_SIZE}, Device: {DEVICE}")
+
+    # Pre-load model
+    print("[Whisper] Pre-loading model...")
+    await load_model()
+
+    async with websockets.serve(handle_client, HOST, PORT):
+        print(f"[Whisper] Server ready! Listening on ws://{HOST}:{PORT}")
+        await asyncio.Future()  # Run forever
+
+if __name__ == "__main__":
+    # Install websockets if needed
+    try:
+        import websockets
+    except ImportError:
+        import subprocess
+        subprocess.check_call([sys.executable, "-m", "pip", "install", "websockets"])
+        import websockets
+
+    asyncio.run(main())