feat: Add Whisper GPU speech-to-text with progressive transcription

- Add faster-whisper Python server for GPU-accelerated transcription
- Support dual mode: Web Speech API or Whisper GPU (toggleable)
- Progressive transcription every 3 seconds while recording
- Separate terminal server process (stable during hot-reload)
- Add Ctrl+V paste and Ctrl+C copy support in FloatingTerminal
- Add MCP tools: whisper_start, whisper_stop, whisper_toggle, whisper_status
- Update package.json with separate api/terminal/frontend processes
This commit is contained in:
2026-02-13 23:47:52 -06:00
parent e867b7873e
commit 638e6ac8e0
10 changed files with 1009 additions and 31 deletions

View File

@@ -1,7 +1,6 @@
import { PORT_HTTP, WORKING_DIR } from './config'
import { initDatabase } from './db'
import { handleRequest } from './routes'
import { startTerminalServer } from './services/terminal'
// Initialize database
initDatabase()
@@ -12,18 +11,10 @@ Bun.serve({
fetch: handleRequest
})
console.log(`[HTTP] API running at http://localhost:${PORT_HTTP}`)
// Start Terminal WebSocket server
startTerminalServer()
// Startup summary
console.log('')
console.log('='.repeat(50))
console.log('Agent UI Server started')
console.log('Agent UI API Server (hot-reload enabled)')
console.log(` API: http://localhost:${PORT_HTTP}`)
console.log(` Terminal: ws://localhost:4103`)
console.log(` Working Dir: ${WORKING_DIR}`)
console.log('')
console.log('WebMCP starts separately with Claude Code MCP')
console.log('='.repeat(50))

View File

@@ -7,6 +7,7 @@ import { handleThemes, handleActiveTheme, handleDesignTokens, handleThemeById, h
import { handleCanvas, handleCanvasById, handleToolbarCanvas, handleDefaultCanvas, handleCanvasComponents, handleCanvasComponentById } from './canvas'
import { handleGiteaRepo, handleGiteaTree, handleGiteaFile } from './gitea'
import { handleTables, handleStats, handleTableSchema, handleTableData, handleQuery } from './database'
import { handleWhisperRoutes } from './whisper'
export async function handleRequest(req: Request): Promise<Response> {
const url = new URL(req.url)
@@ -168,5 +169,11 @@ export async function handleRequest(req: Request): Promise<Response> {
return handleQuery(req)
}
// Whisper (GPU speech-to-text)
if (path.startsWith('/api/whisper/')) {
const res = await handleWhisperRoutes(req)
if (res) return res
}
return notFoundResponse()
}

66
server/routes/whisper.ts Normal file
View File

@@ -0,0 +1,66 @@
/**
* Whisper API routes
* Control the local GPU-accelerated speech-to-text server
*/
import {
startWhisperServer,
stopWhisperServer,
toggleWhisperServer,
getWhisperState,
getWhisperPort
} from '../services/whisper'
/**
 * Dispatch a request under `/api/whisper/` to the matching control endpoint.
 *
 * Recognised method/path pairs:
 * - `GET  /api/whisper/status` - current service state
 * - `POST /api/whisper/start`  - start the Whisper server
 * - `POST /api/whisper/stop`   - stop the Whisper server
 * - `POST /api/whisper/toggle` - flip between running and stopped
 * - `GET  /api/whisper/port`   - WebSocket port and URL
 *
 * @param req Incoming HTTP request.
 * @returns A JSON response for a recognised pair, or `null` when nothing
 *          matches so the caller can fall through to its own handling.
 */
export async function handleWhisperRoutes(req: Request): Promise<Response | null> {
  const { pathname } = new URL(req.url)

  switch (`${req.method} ${pathname}`) {
    case 'GET /api/whisper/status':
      return Response.json(await getWhisperState())

    case 'POST /api/whisper/start': {
      const started = await startWhisperServer()
      // State is re-read after the action so the reply reflects the outcome.
      const snapshot = await getWhisperState()
      return Response.json({
        success: started,
        ...snapshot,
        message: started ? 'Whisper server started' : 'Failed to start Whisper server'
      })
    }

    case 'POST /api/whisper/stop': {
      const stopped = stopWhisperServer()
      const snapshot = await getWhisperState()
      return Response.json({
        success: stopped,
        ...snapshot,
        message: stopped ? 'Whisper server stopped' : 'Failed to stop Whisper server'
      })
    }

    case 'POST /api/whisper/toggle': {
      const outcome = await toggleWhisperServer()
      const snapshot = await getWhisperState()
      // `snapshot` is spread last, so its `enabled` wins over the toggle result's.
      return Response.json({
        ...outcome,
        ...snapshot,
        message: snapshot.enabled
          ? 'Whisper enabled (GPU)'
          : 'Whisper disabled (using Web Speech API)'
      })
    }

    case 'GET /api/whisper/port': {
      const port = getWhisperPort()
      return Response.json({ port, url: `ws://localhost:${port}` })
    }

    default:
      return null
  }
}

218
server/services/whisper.ts Normal file
View File

@@ -0,0 +1,218 @@
/**
* Whisper Service - Manages the Python Whisper server process
* Provides GPU-accelerated speech-to-text as an alternative to Web Speech API
*/
import { join } from 'path'
import { Subprocess } from 'bun'
// TCP port the Python Whisper server is expected to listen on.
// whisper_server.py hard-codes the same value as PORT; keep the two in sync.
const WHISPER_PORT = 4104
// Path to the Python entry point: one directory above this module's directory.
const WHISPER_SCRIPT = join(import.meta.dir, '..', 'whisper_server.py')
/** In-memory bookkeeping for the managed Whisper child process. */
interface WhisperState {
// Set and cleared together with `running` by every function in this module.
enabled: boolean
// True once the port has been observed listening.
running: boolean
// Handle of the child spawned by this module; null when none is tracked.
process: Subprocess | null
// Reported to API clients as-is; not read back from the Python server.
model: string
// Reported to API clients as-is; not read back from the Python server.
device: string
}
// Single module-level instance. `model` and `device` repeat the defaults
// hard-coded in whisper_server.py (MODEL_SIZE / DEVICE).
const state: WhisperState = {
enabled: false,
running: false,
process: null,
model: 'medium',
device: 'cuda'
}
/**
 * Best-effort cleanup of whatever process holds a TCP connection on `port`.
 *
 * Shells out to PowerShell (`Get-NetTCPConnection` piped into `Stop-Process`),
 * waits for that shell to exit, then pauses one second so the port can be
 * released. Every failure is swallowed: this is a courtesy step before spawning.
 *
 * @param port Local TCP port to free.
 */
async function killProcessOnPort(port: number): Promise<void> {
  const killCommand = `Get-NetTCPConnection -LocalPort ${port} -ErrorAction SilentlyContinue | ForEach-Object { Stop-Process -Id $_.OwningProcess -Force -ErrorAction SilentlyContinue }`

  try {
    const shell = Bun.spawn(['powershell', '-Command', killCommand], {
      stdout: 'ignore',
      stderr: 'ignore'
    })
    await shell.exited

    // Give the OS a moment to release the port.
    await new Promise<void>(done => setTimeout(done, 1000))
  } catch {
    // Deliberately ignored: failing to clean up must not block startup.
  }
}
/**
 * Start the Whisper Python server and wait until its port is listening.
 *
 * Steps: return early if a tracked child is already running; free the port;
 * spawn `python whisper_server.py` with inherited stdout/stderr; then poll the
 * port every 3 seconds (one initial check plus up to 30 more, roughly 93
 * seconds in total) while the model loads.
 *
 * On every failure path the module state is reset. On timeout the spawned
 * child is also killed, so a later retry cannot leave an orphaned Python
 * process behind.
 *
 * @returns `true` when the server is already running or became ready;
 *          `false` when the child exited, the wait timed out, or the spawn failed.
 */
export async function startWhisperServer(): Promise<boolean> {
  if (state.running && state.process) {
    console.log('[Whisper] Server already running')
    return true
  }

  console.log('[Whisper] ====== STARTING (v3) ======')
  console.log('[Whisper] Script:', WHISPER_SCRIPT)

  // Kill any existing process on the port
  console.log('[Whisper] Cleaning up port', WHISPER_PORT)
  await killProcessOnPort(WHISPER_PORT)

  const pollIntervalMs = 3000
  const extraPolls = 30
  const pause = (): Promise<void> =>
    new Promise<void>(resolve => setTimeout(resolve, pollIntervalMs))
  const markReady = (): void => {
    state.running = true
    state.enabled = true
  }
  const markFailed = (): void => {
    state.process = null
    state.running = false
    state.enabled = false
  }

  try {
    // Inherit stdio so the Python server's logs show up in this console.
    const proc = Bun.spawn(['python', WHISPER_SCRIPT], {
      cwd: join(import.meta.dir, '..'),
      stdout: 'inherit',
      stderr: 'inherit',
      env: { ...process.env }
    })
    state.process = proc

    // First check after a short grace period.
    await pause()
    if (proc.exitCode !== null) {
      console.error('[Whisper] Process exited with code:', proc.exitCode)
      markFailed()
      return false
    }
    if (await checkPort(WHISPER_PORT)) {
      console.log('[Whisper] Server started successfully on port', WHISPER_PORT)
      markReady()
      return true
    }

    // The Python server only opens its port after the model has loaded.
    console.log('[Whisper] Waiting for model to load...')
    for (let attempt = 0; attempt < extraPolls; attempt++) {
      await pause()
      if (proc.exitCode !== null) {
        console.error('[Whisper] Process died while loading')
        markFailed()
        return false
      }
      if (await checkPort(WHISPER_PORT)) {
        console.log('[Whisper] Server ready!')
        markReady()
        return true
      }
    }

    console.log('[Whisper] Timeout waiting for server')
    // Do not leave a half-started child behind: it would be orphaned by the
    // next start attempt, which spawns a fresh process.
    try {
      proc.kill()
    } catch {
      // Best effort; the child may have exited in the meantime.
    }
    markFailed()
    return false
  } catch (err: unknown) {
    console.error('[Whisper] Failed to start:', err instanceof Error ? err.message : err)
    markFailed()
    return false
  }
}
/**
 * Report whether something is listening on a local TCP port.
 *
 * Runs a PowerShell one-liner that exits 0 when `Get-NetTCPConnection` finds a
 * connection in the `Listen` state on `port`, and 1 otherwise.
 *
 * @param port Local TCP port to probe.
 * @returns `true` only when the PowerShell probe exits with code 0; `false`
 *          for any other exit code or when the probe cannot be spawned.
 */
async function checkPort(port: number): Promise<boolean> {
  const probe = `if (Get-NetTCPConnection -LocalPort ${port} -State Listen -ErrorAction SilentlyContinue) { exit 0 } else { exit 1 }`

  try {
    const shell = Bun.spawn(['powershell', '-Command', probe], {
      stdout: 'ignore',
      stderr: 'ignore'
    })
    return (await shell.exited) === 0
  } catch {
    return false
  }
}
/**
 * Stop the tracked Whisper child process, if there is one.
 *
 * @returns `true` when no process was tracked or the kill call succeeded;
 *          `false` when killing the process threw.
 */
export function stopWhisperServer(): boolean {
  const child = state.process
  if (!child) {
    console.log('[Whisper] No server running')
    return true
  }

  console.log('[Whisper] Stopping server...')
  try {
    child.kill()
    // State is cleared only after a successful kill call.
    Object.assign(state, { process: null, running: false, enabled: false })
    console.log('[Whisper] Server stopped')
    return true
  } catch (failure) {
    console.error('[Whisper] Error stopping server:', failure)
    return false
  }
}
/**
 * Flip the Whisper server: stop it when it is enabled and running, otherwise
 * start it.
 *
 * @returns `enabled` is the resulting on/off state and `success` tells whether
 *          the underlying start or stop call succeeded.
 */
export async function toggleWhisperServer(): Promise<{ enabled: boolean; success: boolean }> {
  const currentlyOn = state.enabled && state.running

  if (!currentlyOn) {
    const started = await startWhisperServer()
    return { enabled: started, success: started }
  }

  return { enabled: false, success: stopWhisperServer() }
}
/**
 * Return a snapshot of the Whisper service state, reconciled with the port.
 *
 * The port is probed first. If the probe disagrees with the cached `running`
 * flag, `running` and `enabled` are both set to the probed value, and the
 * process handle is dropped when the port turns out not to be listening.
 *
 * @returns The public state fields plus the fixed Whisper port.
 */
export async function getWhisperState(): Promise<{
  enabled: boolean
  running: boolean
  port: number
  model: string
  device: string
}> {
  const listening = await checkPort(WHISPER_PORT)

  // Sync the cached flags with what the probe actually observed.
  if (listening !== state.running) {
    state.running = listening
    state.enabled = listening
    if (!listening) {
      state.process = null
    }
  }

  const { enabled, running, model, device } = state
  return { enabled, running, port: WHISPER_PORT, model, device }
}
/**
 * Tell whether Whisper is currently usable, based on the cached flags only
 * (no port probe is performed).
 *
 * @returns `true` when both `enabled` and `running` are set.
 */
export function isWhisperEnabled(): boolean {
  const { enabled, running } = state
  return enabled && running
}
// WebSocket server for Whisper (proxies to Python server or handles directly)
// NOTE(review): never assigned or read anywhere in the visible module; it looks
// like a placeholder for a future proxy. Confirm before removing.
let whisperWsServer: any = null
/**
 * Port the Whisper WebSocket server is expected to listen on.
 *
 * @returns The module-level `WHISPER_PORT` constant.
 */
export function getWhisperPort(): number {
return WHISPER_PORT
}

22
server/terminal.ts Normal file
View File

@@ -0,0 +1,22 @@
#!/usr/bin/env bun
/**
* Terminal Server - Independent process
* This runs separately from the main server to maintain stable Claude Code sessions
* even when the main server restarts due to code changes.
*/
import { startTerminalServer } from './services/terminal'
import { WORKING_DIR } from './config'
// Startup banner, printed one line at a time before the terminal server starts.
const bannerRule = '='.repeat(50)
const bannerLines = [
  '',
  bannerRule,
  'Terminal Server (Independent Process)',
  ` WebSocket: ws://localhost:4103`,
  ` Working Dir: ${WORKING_DIR}`,
  '',
  "This process is stable and won't restart",
  'when the main server reloads.',
  bannerRule,
  ''
]
for (const line of bannerLines) {
  console.log(line)
}

// Start the terminal WebSocket server in this standalone process.
startTerminalServer()

233
server/whisper_server.py Normal file
View File

@@ -0,0 +1,233 @@
#!/usr/bin/env python3
"""
Whisper Server - GPU-accelerated speech-to-text using faster-whisper
WebSocket server that receives audio and returns transcriptions
"""
import asyncio
import json
import sys
import io
import wave
import tempfile
import os
from pathlib import Path
try:
import websockets
from faster_whisper import WhisperModel
except ImportError as e:
print(f"Missing dependency: {e}")
print("Run: pip install faster-whisper websockets")
sys.exit(1)
# Configuration
# HOST/PORT are the address main() binds the WebSocket server to.
# The Bun-side service (server/services/whisper.ts) hard-codes the same port.
HOST = "localhost"
PORT = 4104
MODEL_SIZE = "medium" # tiny, base, small, medium, large-v2, large-v3
DEVICE = "cuda" # cuda or cpu
COMPUTE_TYPE = "float16" # float16 for GPU, int8 for CPU
# Global model instance
# `model` holds the shared WhisperModel once load_model() has created it.
model = None
# True while load_model() is constructing the model; other callers wait on it.
model_loading = False
async def load_model():
    """Return the shared WhisperModel, creating it on first call.

    main() calls this once at startup to pre-load the model, and every client
    handler calls it again to obtain the already-loaded instance.

    The model is first created with the configured DEVICE / COMPUTE_TYPE. If
    that raises, a CPU / int8 model is created instead and the module-level
    DEVICE and COMPUTE_TYPE are updated so that status and transcription
    replies report the device actually in use.

    Returns:
        The loaded WhisperModel. A caller that was waiting on another
        in-progress load gets None if that load failed.

    Raises:
        Exception: whatever the CPU fallback raises if it also fails. The
        loading flag is still cleared, so waiting callers are released.
    """
    global model, model_loading, DEVICE, COMPUTE_TYPE

    if model is not None:
        return model

    if model_loading:
        # Another coroutine is already loading; poll until it finishes.
        while model_loading:
            await asyncio.sleep(0.1)
        return model

    model_loading = True
    download_root = str(Path.home() / ".cache" / "whisper")
    print(f"[Whisper] Loading model '{MODEL_SIZE}' on {DEVICE}...")

    try:
        try:
            # Load model - this downloads on first run
            model = WhisperModel(
                MODEL_SIZE,
                device=DEVICE,
                compute_type=COMPUTE_TYPE,
                download_root=download_root,
            )
            print(f"[Whisper] Model loaded successfully!")
        except Exception as e:
            print(f"[Whisper] Error loading model: {e}")
            print("[Whisper] Falling back to CPU...")
            model = WhisperModel(
                MODEL_SIZE,
                device="cpu",
                compute_type="int8",
                download_root=download_root,
            )
            # Keep the reported configuration truthful after the fallback.
            DEVICE = "cpu"
            COMPUTE_TYPE = "int8"
    finally:
        # Always clear the flag, even if the fallback raised, so waiters in
        # the polling loop above cannot spin forever.
        model_loading = False

    return model
def transcribe_audio(audio_data: bytes, language: str = "es") -> dict:
    """Transcribe one audio payload with the shared Whisper model.

    The bytes are written to a temporary ``.wav``-suffixed file that is handed
    to ``model.transcribe`` by path and removed afterwards, whether or not the
    transcription succeeded.

    Args:
        audio_data: Raw bytes of the audio to transcribe.
        language: Language code passed through to the model (default ``"es"``).

    Returns:
        On success, a dict with ``success``, ``text``, ``language``,
        ``language_probability``, ``duration``, ``segments``, ``engine``,
        ``model`` and ``device``. On failure, ``{"error": <message>}``; this
        covers a missing model, a failed temp-file write and any exception
        raised during transcription.
    """
    if model is None:
        return {"error": "Model not loaded"}

    temp_path = None
    try:
        # Save audio to a temp file so it can be passed to the model by path.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            temp_path = f.name
            f.write(audio_data)

        # Transcribe
        segments, info = model.transcribe(
            temp_path,
            language=language,
            beam_size=5,
            vad_filter=True,  # Voice activity detection
            vad_parameters=dict(
                min_silence_duration_ms=500,
                speech_pad_ms=400
            )
        )

        # Collect all segments; iterating is what drives the transcription.
        segments_list = [
            {"start": segment.start, "end": segment.end, "text": segment.text}
            for segment in segments
        ]
        text = " ".join(item["text"] for item in segments_list)

        return {
            "success": True,
            "text": text.strip(),
            "language": info.language,
            "language_probability": info.language_probability,
            "duration": info.duration,
            "segments": segments_list,
            "engine": "whisper-gpu",
            "model": MODEL_SIZE,
            "device": DEVICE
        }
    except Exception as e:
        return {"error": str(e)}
    finally:
        # Cleanup temp file. Only filesystem errors are ignored, so that
        # KeyboardInterrupt / SystemExit are never swallowed here.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError:
                pass
async def _send_transcription(websocket, audio_data, language):
    """Transcribe audio in the default executor and send the result to the client."""
    loop = asyncio.get_running_loop()
    # Run in a worker thread so the event loop stays responsive meanwhile.
    result = await loop.run_in_executor(None, transcribe_audio, audio_data, language)
    await websocket.send(json.dumps({
        "type": "transcription",
        **result
    }))


async def _send_error(websocket, message):
    """Send a protocol-level error message to the client."""
    await websocket.send(json.dumps({
        "type": "error",
        "message": message
    }))


async def handle_client(websocket):
    """Serve one WebSocket client until it disconnects.

    After making sure the model is loaded, a ``ready`` message is sent. Each
    incoming message is then handled as follows:

    * binary frame: raw audio, transcribed as Spanish (``"es"``);
    * text frame: a JSON object whose ``type`` is ``transcribe`` (base64
      ``audio`` plus optional ``language``), ``ping`` or ``status``. Other
      types are ignored.

    Malformed input (invalid JSON, a JSON value that is not an object, or
    audio that is not valid base64) is answered with an ``error`` message and
    the connection stays open.

    Args:
        websocket: The client connection supplied by ``websockets.serve``.
    """
    import base64

    print(f"[Whisper] Client connected")

    # Ensure model is loaded
    await load_model()

    # Send ready message
    await websocket.send(json.dumps({
        "type": "ready",
        "model": MODEL_SIZE,
        "device": DEVICE
    }))

    try:
        async for message in websocket:
            if isinstance(message, bytes):
                # Binary audio data
                print(f"[Whisper] Received {len(message)} bytes of audio")
                await _send_transcription(websocket, message, "es")  # Default to Spanish
                continue

            # JSON command
            try:
                cmd = json.loads(message)
            except json.JSONDecodeError:
                await _send_error(websocket, "Invalid JSON")
                continue

            if not isinstance(cmd, dict):
                # Valid JSON, but not an object: there is no "type" to dispatch on.
                await _send_error(websocket, "Expected a JSON object")
                continue

            cmd_type = cmd.get("type")
            if cmd_type == "transcribe":
                # Audio data sent as base64
                try:
                    audio_data = base64.b64decode(cmd.get("audio", ""))
                except (ValueError, TypeError):
                    # binascii.Error is a ValueError; TypeError covers a
                    # non-string payload.
                    await _send_error(websocket, "Invalid base64 audio")
                    continue
                await _send_transcription(websocket, audio_data, cmd.get("language", "es"))
            elif cmd_type == "ping":
                await websocket.send(json.dumps({"type": "pong"}))
            elif cmd_type == "status":
                await websocket.send(json.dumps({
                    "type": "status",
                    "model": MODEL_SIZE,
                    "device": DEVICE,
                    "ready": model is not None
                }))
    except websockets.exceptions.ConnectionClosed:
        print("[Whisper] Client disconnected")
    except Exception as e:
        print(f"[Whisper] Error: {e}")
async def main():
    """Pre-load the model, then serve WebSocket clients until the process ends."""
    endpoint = f"ws://{HOST}:{PORT}"
    print(f"[Whisper] Starting server on {endpoint}")
    print(f"[Whisper] Model: {MODEL_SIZE}, Device: {DEVICE}")

    # Load the model before the port opens, so a listening port means the
    # server can transcribe right away.
    print("[Whisper] Pre-loading model...")
    await load_model()

    server = websockets.serve(handle_client, HOST, PORT)
    async with server:
        print(f"[Whisper] Server ready! Listening on {endpoint}")
        # A future that is never resolved keeps the server running forever.
        run_forever = asyncio.Future()
        await run_forever
if __name__ == "__main__":
    # The dependency check at the top of this file has already imported
    # `websockets` (or exited with install instructions), so the former
    # runtime `pip install` fallback here could never run and was removed.
    asyncio.run(main())