feat: Add Whisper GPU speech-to-text with progressive transcription
- Add faster-whisper Python server for GPU-accelerated transcription - Support dual mode: Web Speech API or Whisper GPU (toggleable) - Progressive transcription every 3 seconds while recording - Separate terminal server process (stable during hot-reload) - Add Ctrl+V paste and Ctrl+C copy support in FloatingTerminal - Add MCP tools: whisper_start, whisper_stop, whisper_toggle, whisper_status - Update package.json with separate api/terminal/frontend processes
This commit is contained in:
@@ -1,7 +1,6 @@
|
||||
import { PORT_HTTP, WORKING_DIR } from './config'
|
||||
import { initDatabase } from './db'
|
||||
import { handleRequest } from './routes'
|
||||
import { startTerminalServer } from './services/terminal'
|
||||
|
||||
// Initialize database
|
||||
initDatabase()
|
||||
@@ -12,18 +11,10 @@ Bun.serve({
|
||||
fetch: handleRequest
|
||||
})
|
||||
|
||||
console.log(`[HTTP] API running at http://localhost:${PORT_HTTP}`)
|
||||
|
||||
// Start Terminal WebSocket server
|
||||
startTerminalServer()
|
||||
|
||||
// Startup summary
|
||||
console.log('')
|
||||
console.log('='.repeat(50))
|
||||
console.log('Agent UI Server started')
|
||||
console.log('Agent UI API Server (hot-reload enabled)')
|
||||
console.log(` API: http://localhost:${PORT_HTTP}`)
|
||||
console.log(` Terminal: ws://localhost:4103`)
|
||||
console.log(` Working Dir: ${WORKING_DIR}`)
|
||||
console.log('')
|
||||
console.log('WebMCP starts separately with Claude Code MCP')
|
||||
console.log('='.repeat(50))
|
||||
|
||||
@@ -7,6 +7,7 @@ import { handleThemes, handleActiveTheme, handleDesignTokens, handleThemeById, h
|
||||
import { handleCanvas, handleCanvasById, handleToolbarCanvas, handleDefaultCanvas, handleCanvasComponents, handleCanvasComponentById } from './canvas'
|
||||
import { handleGiteaRepo, handleGiteaTree, handleGiteaFile } from './gitea'
|
||||
import { handleTables, handleStats, handleTableSchema, handleTableData, handleQuery } from './database'
|
||||
import { handleWhisperRoutes } from './whisper'
|
||||
|
||||
export async function handleRequest(req: Request): Promise<Response> {
|
||||
const url = new URL(req.url)
|
||||
@@ -168,5 +169,11 @@ export async function handleRequest(req: Request): Promise<Response> {
|
||||
return handleQuery(req)
|
||||
}
|
||||
|
||||
// Whisper (GPU speech-to-text)
|
||||
if (path.startsWith('/api/whisper/')) {
|
||||
const res = await handleWhisperRoutes(req)
|
||||
if (res) return res
|
||||
}
|
||||
|
||||
return notFoundResponse()
|
||||
}
|
||||
|
||||
66
server/routes/whisper.ts
Normal file
66
server/routes/whisper.ts
Normal file
@@ -0,0 +1,66 @@
|
||||
/**
|
||||
* Whisper API routes
|
||||
* Control the local GPU-accelerated speech-to-text server
|
||||
*/
|
||||
|
||||
import {
|
||||
startWhisperServer,
|
||||
stopWhisperServer,
|
||||
toggleWhisperServer,
|
||||
getWhisperState,
|
||||
getWhisperPort
|
||||
} from '../services/whisper'
|
||||
|
||||
/**
 * Route a request to the matching Whisper control endpoint.
 *
 * Recognised method/path pairs:
 * - `GET  /api/whisper/status` - current state snapshot
 * - `POST /api/whisper/start`  - start the server, then report state
 * - `POST /api/whisper/stop`   - stop the server, then report state
 * - `POST /api/whisper/toggle` - flip the server on/off, then report state
 * - `GET  /api/whisper/port`   - port number and WebSocket URL
 *
 * @param req Incoming HTTP request.
 * @returns A JSON response for a recognised pair, or `null` when nothing
 *   matched so the caller can fall through to its own handling.
 */
export async function handleWhisperRoutes(req: Request): Promise<Response | null> {
  const { pathname } = new URL(req.url)

  switch (`${req.method} ${pathname}`) {
    case 'GET /api/whisper/status':
      return Response.json(await getWhisperState())

    case 'POST /api/whisper/start': {
      const success = await startWhisperServer()
      const snapshot = await getWhisperState()
      return Response.json({
        success,
        ...snapshot,
        message: success ? 'Whisper server started' : 'Failed to start Whisper server'
      })
    }

    case 'POST /api/whisper/stop': {
      const success = stopWhisperServer()
      const snapshot = await getWhisperState()
      return Response.json({
        success,
        ...snapshot,
        message: success ? 'Whisper server stopped' : 'Failed to stop Whisper server'
      })
    }

    case 'POST /api/whisper/toggle': {
      const outcome = await toggleWhisperServer()
      const snapshot = await getWhisperState()
      // The snapshot is spread last, so its `enabled` wins over the toggle result.
      return Response.json({
        ...outcome,
        ...snapshot,
        message: snapshot.enabled ? 'Whisper enabled (GPU)' : 'Whisper disabled (using Web Speech API)'
      })
    }

    case 'GET /api/whisper/port': {
      const port = getWhisperPort()
      return Response.json({ port, url: `ws://localhost:${port}` })
    }

    default:
      return null
  }
}
|
||||
218
server/services/whisper.ts
Normal file
218
server/services/whisper.ts
Normal file
@@ -0,0 +1,218 @@
|
||||
/**
|
||||
* Whisper Service - Manages the Python Whisper server process
|
||||
* Provides GPU-accelerated speech-to-text as an alternative to Web Speech API
|
||||
*/
|
||||
|
||||
import { join } from 'path'
|
||||
import { Subprocess } from 'bun'
|
||||
|
||||
const WHISPER_PORT = 4104
|
||||
const WHISPER_SCRIPT = join(import.meta.dir, '..', 'whisper_server.py')
|
||||
|
||||
/** In-memory bookkeeping for the Whisper child process managed by this module. */
interface WhisperState {
  /** Set to true together with `running` once the port is found listening. */
  enabled: boolean
  /** Whether the Whisper port was listening the last time it was checked. */
  running: boolean
  /** Handle of the process spawned by this module, or null when none is tracked. */
  process: Subprocess | null
  /** Model label reported to API clients. */
  model: string
  /** Device label reported to API clients. */
  device: string
}

// Module-wide singleton; starts out as "nothing running".
const state: WhisperState = {
  enabled: false,
  running: false,
  process: null,
  model: 'medium',
  device: 'cuda'
}
|
||||
|
||||
/**
 * Force-stop every process that owns a TCP connection on `port`.
 *
 * Runs a PowerShell pipeline (`Get-NetTCPConnection` piped to `Stop-Process`),
 * waits for it to exit, then sleeps one second so the port can be released.
 * This is best-effort: any error while spawning or waiting is swallowed.
 *
 * @param port Local TCP port to free.
 */
async function killProcessOnPort(port: number): Promise<void> {
  const script = `Get-NetTCPConnection -LocalPort ${port} -ErrorAction SilentlyContinue | ForEach-Object { Stop-Process -Id $_.OwningProcess -Force -ErrorAction SilentlyContinue }`

  try {
    const killer = Bun.spawn(['powershell', '-Command', script], {
      stdout: 'ignore',
      stderr: 'ignore'
    })
    await killer.exited

    // Give the OS a moment to release the port
    await new Promise(resolve => setTimeout(resolve, 1000))
  } catch {
    // Deliberately ignored: cleanup is best-effort
  }
}
|
||||
|
||||
/**
 * Start the Whisper Python server and wait until its port is listening.
 *
 * Returns `true` right away when a tracked process is already marked running.
 * Otherwise it frees WHISPER_PORT, spawns `python WHISPER_SCRIPT` with
 * inherited stdout/stderr, and probes the port after an initial 3 second wait
 * and then up to 30 more times at 3 second intervals.
 *
 * If the wait times out or an error is thrown after spawning, the child is
 * killed and the handle cleared, so a later start does not leave an earlier
 * Python process running in the background.
 *
 * @returns `true` once the port is listening; `false` if the child exits
 *   early, the wait times out, or spawning/probing throws.
 */
export async function startWhisperServer(): Promise<boolean> {
  if (state.running && state.process) {
    console.log('[Whisper] Server already running')
    return true
  }

  console.log('[Whisper] ====== STARTING (v3) ======')
  console.log('[Whisper] Script:', WHISPER_SCRIPT)

  // Kill any existing process on the port
  console.log('[Whisper] Cleaning up port', WHISPER_PORT)
  await killProcessOnPort(WHISPER_PORT)

  const pause = (ms: number) => new Promise<void>(resolve => setTimeout(resolve, ms))
  const markReady = (): true => {
    state.running = true
    state.enabled = true
    return true
  }

  // Kept outside the try block so the catch path can still reap the child.
  let child: { kill(): void } | null = null

  try {
    // stdout/stderr are inherited so the Python logs show up in this console
    const proc = Bun.spawn(['python', WHISPER_SCRIPT], {
      cwd: join(import.meta.dir, '..'),
      stdout: 'inherit',
      stderr: 'inherit',
      env: { ...process.env }
    })
    child = proc
    state.process = proc

    // Give the server a moment to come up before the first probe
    await pause(3000)

    if (proc.exitCode !== null) {
      console.error('[Whisper] Process exited with code:', proc.exitCode)
      state.process = null
      return false
    }

    if (await checkPort(WHISPER_PORT)) {
      console.log('[Whisper] Server started successfully on port', WHISPER_PORT)
      return markReady()
    }

    // The port is not open yet, so keep polling: 30 more probes, 3 seconds apart
    console.log('[Whisper] Waiting for model to load...')
    for (let attempt = 0; attempt < 30; attempt++) {
      await pause(3000)

      if (proc.exitCode !== null) {
        console.error('[Whisper] Process died while loading')
        state.process = null
        return false
      }

      if (await checkPort(WHISPER_PORT)) {
        console.log('[Whisper] Server ready!')
        return markReady()
      }
    }

    console.log('[Whisper] Timeout waiting for server')
    // Do not leave a half-started child behind: a later start would spawn a
    // second process while this one is still running.
    proc.kill()
    state.process = null
    return false
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err)
    console.error('[Whisper] Failed to start:', reason)
    try {
      child?.kill()
    } catch {
      // The child may already be gone; nothing more to clean up.
    }
    state.process = null
    return false
  }
}
|
||||
|
||||
/**
 * Report whether something is listening on `port`.
 *
 * Delegates to PowerShell: the script exits 0 when `Get-NetTCPConnection`
 * finds a connection in the Listen state on that port, and 1 otherwise.
 *
 * @param port Local TCP port to probe.
 * @returns `true` only when the PowerShell process exits with code 0;
 *   `false` on any other exit code or if spawning fails.
 */
async function checkPort(port: number): Promise<boolean> {
  const probe = `if (Get-NetTCPConnection -LocalPort ${port} -State Listen -ErrorAction SilentlyContinue) { exit 0 } else { exit 1 }`

  try {
    const { exited } = Bun.spawn(['powershell', '-Command', probe], {
      stdout: 'ignore',
      stderr: 'ignore'
    })
    return (await exited) === 0
  } catch {
    return false
  }
}
|
||||
|
||||
/**
 * Stop the Whisper server.
 *
 * Three situations are handled:
 * - A tracked child process exists: it is killed and all state is reset.
 * - No child is tracked but the state says a server is running (the port was
 *   seen listening without a process handle): the flags are reset and a
 *   best-effort, fire-and-forget port cleanup is started, so "stop" does not
 *   report success while leaving that server up.
 * - Nothing is running: no-op.
 *
 * @returns `true` when the server is stopped, was not running, or a cleanup
 *   was started; `false` only when killing the tracked process threw.
 */
export function stopWhisperServer(): boolean {
  if (!state.process) {
    if (state.running) {
      console.log('[Whisper] No tracked process; cleaning up port', WHISPER_PORT)
      // killProcessOnPort swallows its own errors, so the promise can safely
      // be left unawaited; this function stays synchronous for its callers.
      void killProcessOnPort(WHISPER_PORT)
      state.running = false
      state.enabled = false
      return true
    }

    console.log('[Whisper] No server running')
    return true
  }

  console.log('[Whisper] Stopping server...')

  try {
    state.process.kill()
    state.process = null
    state.running = false
    state.enabled = false
    console.log('[Whisper] Server stopped')
    return true
  } catch (err) {
    console.error('[Whisper] Error stopping server:', err)
    return false
  }
}
|
||||
|
||||
/**
 * Flip the Whisper server: stop it when it is both enabled and running,
 * otherwise try to start it.
 *
 * @returns `enabled` is the intended state after the call (`false` after a
 *   stop attempt, the start result after a start attempt); `success` tells
 *   whether the stop/start operation itself succeeded.
 */
export async function toggleWhisperServer(): Promise<{ enabled: boolean; success: boolean }> {
  const currentlyOn = state.enabled && state.running

  if (!currentlyOn) {
    const started = await startWhisperServer()
    return { enabled: started, success: started }
  }

  return { enabled: false, success: stopWhisperServer() }
}
|
||||
|
||||
/**
 * Return a snapshot of the Whisper state after reconciling it with the port.
 *
 * The port is probed first. If the probe disagrees with `state.running`, the
 * `running` and `enabled` flags are both set to the probe result, and the
 * process handle is dropped when the port turned out not to be listening.
 *
 * @returns The reconciled flags plus the fixed port, model and device labels.
 */
export async function getWhisperState(): Promise<{
  enabled: boolean
  running: boolean
  port: number
  model: string
  device: string
}> {
  const listening = await checkPort(WHISPER_PORT)

  // Only touch the state when it has drifted from what the port says
  if (listening !== state.running) {
    state.running = listening
    state.enabled = listening
    if (!listening) {
      state.process = null
    }
  }

  const { enabled, running, model, device } = state
  return { enabled, running, port: WHISPER_PORT, model, device }
}
|
||||
|
||||
/**
 * Tell whether Whisper is usable according to the cached state.
 *
 * Reads the in-memory flags only; it does not probe the port.
 *
 * @returns `true` when the state is flagged both running and enabled.
 */
export function isWhisperEnabled(): boolean {
  return state.running && state.enabled
}
|
||||
|
||||
// WebSocket server for Whisper (proxies to Python server or handles directly)
|
||||
let whisperWsServer: any = null
|
||||
|
||||
/**
 * Expose the port the Whisper server is expected on.
 *
 * @returns The module's fixed `WHISPER_PORT` value.
 */
export function getWhisperPort(): number {
  return WHISPER_PORT
}
|
||||
22
server/terminal.ts
Normal file
22
server/terminal.ts
Normal file
@@ -0,0 +1,22 @@
|
||||
#!/usr/bin/env bun
|
||||
/**
|
||||
* Terminal Server - Independent process
|
||||
* This runs separately from the main server to maintain stable Claude Code sessions
|
||||
* even when the main server restarts due to code changes.
|
||||
*/
|
||||
|
||||
import { startTerminalServer } from './services/terminal'
|
||||
import { WORKING_DIR } from './config'
|
||||
|
||||
// Startup banner, printed line by line before the terminal server is started.
const bannerLines = [
  '',
  '='.repeat(50),
  'Terminal Server (Independent Process)',
  ` WebSocket: ws://localhost:4103`,
  ` Working Dir: ${WORKING_DIR}`,
  '',
  'This process is stable and won\'t restart',
  'when the main server reloads.',
  '='.repeat(50),
  ''
]

for (const line of bannerLines) {
  console.log(line)
}

startTerminalServer()
|
||||
233
server/whisper_server.py
Normal file
233
server/whisper_server.py
Normal file
@@ -0,0 +1,233 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Whisper Server - GPU-accelerated speech-to-text using faster-whisper
|
||||
WebSocket server that receives audio and returns transcriptions
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import sys
|
||||
import io
|
||||
import wave
|
||||
import tempfile
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
import websockets
|
||||
from faster_whisper import WhisperModel
|
||||
except ImportError as e:
|
||||
print(f"Missing dependency: {e}")
|
||||
print("Run: pip install faster-whisper websockets")
|
||||
sys.exit(1)
|
||||
|
||||
# Configuration
|
||||
HOST = "localhost"
|
||||
PORT = 4104
|
||||
MODEL_SIZE = "medium" # tiny, base, small, medium, large-v2, large-v3
|
||||
DEVICE = "cuda" # cuda or cpu
|
||||
COMPUTE_TYPE = "float16" # float16 for GPU, int8 for CPU
|
||||
|
||||
# Global model instance
|
||||
model = None
|
||||
model_loading = False
|
||||
|
||||
async def load_model():
    """Return the shared WhisperModel, creating it on the first call.

    The model is first built with the configured DEVICE / COMPUTE_TYPE. If
    that raises, a second attempt is made on CPU with int8. A failure of the
    CPU attempt propagates to the caller.

    The `model_loading` flag is always cleared when loading ends, including
    when the CPU fallback raises. Otherwise any later caller would wait
    forever in the polling loop below.

    Returns:
        The loaded model, or None for a caller that waited on a load which
        ultimately failed.
    """
    global model, model_loading

    if model is not None:
        return model

    if model_loading:
        # Another caller is loading; poll until it is done.
        while model_loading:
            await asyncio.sleep(0.1)
        return model

    model_loading = True
    print(f"[Whisper] Loading model '{MODEL_SIZE}' on {DEVICE}...")
    cache_dir = str(Path.home() / ".cache" / "whisper")

    try:
        try:
            # Load model - this downloads on first run
            model = WhisperModel(
                MODEL_SIZE,
                device=DEVICE,
                compute_type=COMPUTE_TYPE,
                download_root=cache_dir,
            )
            print("[Whisper] Model loaded successfully!")
        except Exception as e:
            print(f"[Whisper] Error loading model: {e}")
            print("[Whisper] Falling back to CPU...")
            model = WhisperModel(
                MODEL_SIZE,
                device="cpu",
                compute_type="int8",
                download_root=cache_dir,
            )
    finally:
        # Reset even on failure so waiters are released.
        model_loading = False

    return model
|
||||
|
||||
def transcribe_audio(audio_data: bytes, language: str = "es") -> dict:
    """Transcribe one audio payload with the already-loaded global model.

    The bytes are written to a temporary ``.wav``-suffixed file whose path is
    handed to ``model.transcribe``. The file is removed afterwards on every
    path, including when writing it fails.

    Args:
        audio_data: Raw audio bytes as received from the client.
        language: Language code passed through to the model.

    Returns:
        On success, a dict with ``success``, ``text``, ``language``,
        ``language_probability``, ``duration``, ``segments``, ``engine``,
        ``model`` and ``device``. On any failure (including no model loaded),
        a dict with a single ``error`` key.
    """
    if model is None:
        return {"error": "Model not loaded"}

    # delete=False: the file must outlive the handle so it can be reopened by
    # path; it is removed explicitly in the finally block.
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    temp_path = tmp.name

    try:
        with tmp:
            tmp.write(audio_data)

        segments, info = model.transcribe(
            temp_path,
            language=language,
            beam_size=5,
            vad_filter=True,  # Voice activity detection
            vad_parameters=dict(
                min_silence_duration_ms=500,
                speech_pad_ms=400
            )
        )

        # `segments` is iterated exactly once; everything needed is collected here.
        segments_list = [
            {"start": segment.start, "end": segment.end, "text": segment.text}
            for segment in segments
        ]
        text = " ".join(item["text"] for item in segments_list)

        return {
            "success": True,
            "text": text.strip(),
            "language": info.language,
            "language_probability": info.language_probability,
            "duration": info.duration,
            "segments": segments_list,
            "engine": "whisper-gpu",
            "model": MODEL_SIZE,
            "device": DEVICE
        }

    except Exception as e:
        return {"error": str(e)}

    finally:
        # Best-effort cleanup: only filesystem errors are ignored, so
        # KeyboardInterrupt / SystemExit are no longer swallowed here.
        try:
            os.unlink(temp_path)
        except OSError:
            pass
|
||||
|
||||
async def handle_client(websocket):
    """Serve one WebSocket client until it disconnects.

    Protocol, as implemented below:
      * On connect the model is loaded if needed and a ``ready`` message is sent.
      * A binary frame is treated as audio and transcribed with language "es".
      * A text frame must be a JSON object with a ``type`` field:
          - ``transcribe``: base64 audio in ``audio``, optional ``language``
          - ``ping``: answered with ``pong``
          - ``status``: answered with model/device/ready
        Other ``type`` values are ignored.

    Malformed input (invalid JSON, a JSON value that is not an object, or
    undecodable base64) is answered with an ``error`` message and the
    connection stays open, instead of ending the handler.

    Args:
        websocket: The connection object supplied by ``websockets.serve``.
    """
    import base64  # local import: keeps the block self-contained

    print("[Whisper] Client connected")

    # Ensure model is loaded
    await load_model()

    await websocket.send(json.dumps({
        "type": "ready",
        "model": MODEL_SIZE,
        "device": DEVICE
    }))

    loop = asyncio.get_running_loop()

    async def send_transcription(audio, language):
        # Run the blocking transcription in the default executor so the event
        # loop stays free for other clients.
        result = await loop.run_in_executor(None, transcribe_audio, audio, language)
        await websocket.send(json.dumps({"type": "transcription", **result}))

    async def send_error(text):
        await websocket.send(json.dumps({"type": "error", "message": text}))

    try:
        async for message in websocket:
            if isinstance(message, bytes):
                # Binary audio data
                print(f"[Whisper] Received {len(message)} bytes of audio")
                await send_transcription(message, "es")  # Default to Spanish
                continue

            # Text frame: JSON command
            try:
                cmd = json.loads(message)
            except json.JSONDecodeError:
                await send_error("Invalid JSON")
                continue

            if not isinstance(cmd, dict):
                await send_error("Command must be a JSON object")
                continue

            kind = cmd.get("type")

            if kind == "transcribe":
                try:
                    # binascii.Error (bad padding) is a ValueError subclass;
                    # TypeError covers a non-string "audio" value.
                    audio_data = base64.b64decode(cmd.get("audio", ""))
                except (ValueError, TypeError):
                    await send_error("Invalid base64 audio")
                    continue
                await send_transcription(audio_data, cmd.get("language", "es"))

            elif kind == "ping":
                await websocket.send(json.dumps({"type": "pong"}))

            elif kind == "status":
                await websocket.send(json.dumps({
                    "type": "status",
                    "model": MODEL_SIZE,
                    "device": DEVICE,
                    "ready": model is not None
                }))

    except websockets.exceptions.ConnectionClosed:
        print("[Whisper] Client disconnected")
    except Exception as e:
        print(f"[Whisper] Error: {e}")
|
||||
|
||||
async def main():
    """Load the model, then serve WebSocket clients until the process ends.

    The model is loaded before the listening socket is opened, so the port
    only starts accepting connections after loading has finished.
    """
    endpoint = f"ws://{HOST}:{PORT}"
    print(f"[Whisper] Starting server on {endpoint}")
    print(f"[Whisper] Model: {MODEL_SIZE}, Device: {DEVICE}")

    print("[Whisper] Pre-loading model...")
    await load_model()

    server = websockets.serve(handle_client, HOST, PORT)
    async with server:
        print(f"[Whisper] Server ready! Listening on {endpoint}")
        # A future that is never resolved keeps the server alive indefinitely.
        await asyncio.Future()
|
||||
|
||||
if __name__ == "__main__":
    # No dependency check is needed here: the import guard at the top of this
    # module already exits with status 1 when `websockets` or `faster_whisper`
    # is missing, so the previous pip-install fallback could never run.
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user